kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 31529df9243e91a1de7b32633d38dd8825b3a7db
parent 4bbf87e038f9528f4ce83aeab3f9a34a9a7bdbf5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon,  1 Jun 2026 11:27:54 -0700

Rewrite docs

Diffstat:
MREADME.md | 40++++++++++++++++++++++++++--------------
Adoc/ARCH.md | 394+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdoc/ASM.md | 503++++++++++++++++++++++++++++++++++++++++---------------------------------------
Ddoc/ASM_ROUNDTRIP_TESTING.md | 472-------------------------------------------------------------------------------
Ddoc/BOOTSTRAP_O1.md | 163-------------------------------------------------------------------------------
Adoc/BUILD.md | 266+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdoc/CBACKEND.md | 331++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Ddoc/CGTARGET.md | 764-------------------------------------------------------------------------------
Adoc/CODEGEN.md | 263+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdoc/DBG.md | 792+++++++++++++++++++++++++++++++------------------------------------------------
Ddoc/DBG_TODO.md | 169-------------------------------------------------------------------------------
Mdoc/DESIGN.md | 421+++++++++++++++++++++++++++++++++++++------------------------------------------
Mdoc/DISTRIBUTE.md | 850+++++++++++++++++++++++--------------------------------------------------------
Adoc/DRIVER.md | 276+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdoc/DWARF.md | 1035++++++++++++++++++++++++++++---------------------------------------------------
Mdoc/EMU.md | 1459+++++++++++++++++++++++--------------------------------------------------------
Adoc/FRONTENDS.md | 306+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ddoc/GO_RUNTIME_CG_JIT.md | 279-------------------------------------------------------------------------------
Ddoc/HOT_RELOAD.md | 443-------------------------------------------------------------------------------
Ddoc/IMAGE_INSPECT.md | 382-------------------------------------------------------------------------------
Ddoc/INCREMENTAL_LINK.md | 416-------------------------------------------------------------------------------
Ddoc/INCREMENTAL_OBJLINK.md | 786-------------------------------------------------------------------------------
Mdoc/INTERFACES.md | 414++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mdoc/INTERPRETER.md | 724+++++++++++++++++++++++++++++++++++--------------------------------------------
Mdoc/IR.md | 821+++++++++++++++++++++++++++++++++----------------------------------------------
Mdoc/JIT.md | 432++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
Adoc/LINK.md | 367+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ddoc/LINK_DEBUG.md | 187-------------------------------------------------------------------------------
Ddoc/NATIVE_ARCH_COMPLETENESS.md | 144-------------------------------------------------------------------------------
Ddoc/NATIVE_DIRECT_CACHE.md | 189-------------------------------------------------------------------------------
Ddoc/NATIVE_PORT_RV64.md | 3647-------------------------------------------------------------------------------
Ddoc/NATIVE_PORT_X64.md | 4342-------------------------------------------------------------------------------
Ddoc/O1_INLINE.md | 188-------------------------------------------------------------------------------
Adoc/OBJ.md | 363+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdoc/OPT.md | 814++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Ddoc/OPT_MACHINE_REG_CONSTRAINTS.md | 180-------------------------------------------------------------------------------
Ddoc/OPT_O0_NATIVE_DIRECT_NOTES.md | 186-------------------------------------------------------------------------------
Ddoc/OPT_O0_PERF_NOTES.md | 168-------------------------------------------------------------------------------
Ddoc/OPT_O1_PASSES.md | 412-------------------------------------------------------------------------------
Ddoc/OPT_O1_PERF_TODO.md | 214-------------------------------------------------------------------------------
Ddoc/OPT_PERF.md | 828-------------------------------------------------------------------------------
Ddoc/OPTv2.md | 937-------------------------------------------------------------------------------
Ddoc/PERCALL.md | 138-------------------------------------------------------------------------------
Ddoc/PROF.md | 392-------------------------------------------------------------------------------
Ddoc/REGISTRY.md | 235-------------------------------------------------------------------------------
Adoc/RUNTIME.md | 283+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ddoc/SSA2.md | 932-------------------------------------------------------------------------------
Adoc/TESTING.md | 349+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ddoc/TOY.md | 868-------------------------------------------------------------------------------
Ddoc/TOY_TRANSACTIONAL.md | 368-------------------------------------------------------------------------------
Mdoc/WASM.md | 1722+++++++++++++------------------------------------------------------------------
Ddoc/WASM_PARSE_CHECKLIST.md | 95-------------------------------------------------------------------------------
Ddoc/WINDOWS.md | 408-------------------------------------------------------------------------------
Adoc/plan/ARCH.md | 159+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adoc/plan/BOOTSTRAP.md | 159+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adoc/plan/DEBUG.md | 252+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adoc/plan/IMAGE_INSPECT.md | 174+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adoc/plan/JIT.md | 271+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adoc/plan/LINKER.md | 253+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adoc/plan/OPTIMIZER.md | 278+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adoc/plan/README.md | 18++++++++++++++++++
Adoc/plan/WASM.md | 205+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ddoc/std/ANNEX-A.txt | 603-------------------------------------------------------------------------------
Ddoc/std/ANNEX-J.txt | 618-------------------------------------------------------------------------------
Ddoc/std/CHAPTER-5.txt | 1678-------------------------------------------------------------------------------
Ddoc/std/CHAPTER-6.txt | 9476-------------------------------------------------------------------------------
Msrc/api/asm_emit.c | 2+-
Msrc/asm/asm.c | 2+-
Msrc/cg/native_direct_target.h | 2+-
Msrc/obj/coff/link.c | 6+++---
Msrc/obj/elf/read.c | 4++--
Msrc/obj/obj_secnames.c | 12++++++------
Msrc/opt/pass_lower.c | 4++--
73 files changed, 8642 insertions(+), 37691 deletions(-)

diff --git a/README.md b/README.md @@ -2,31 +2,34 @@ cfree is a compilation multi-tool featuring a C11 frontend, written in C11. -Its inspirations are TCC and LLVM. +Its inspirations are TCC, MIR, and LLVM. It features: - A C11 preprocessor - A single-pass C11 parser and code generator -- A JIT compiler, linker, and executor -- A lightweight optimizer delivering ~70% of GCC/Clang -O2 performance -- A cross-compiling backend, supporting 32-bit and 64-bit RISC-V, x86, ARM, and - WASM. +- A JIT compiler, linker, and in-process executor +- An IR interpreter (`run --no-jit`) +- A user-mode ELF emulator (per-basic-block JIT translation) +- A lightweight optimizer (`-O1`) with SSA construction, register allocation, and + local cleanup +- Cross-compiling backends for aarch64, x86-64, and riscv64, plus WebAssembly and + a portable C-source backend - Support for object files and executables (PE/COFF, ELF, Mach-O) - Primary tested targets: x86_64-linux, aarch64-linux, x86_64-macos, - aarch64-macos, x86_64-windows, aarch64-windows, plus freestanding + aarch64-macos, x86_64-windows, aarch64-windows, riscv64-linux, plus freestanding variants of the same architectures. 
-- An archiver -- An assembler, standalone and inline -- Basic linker script support -- An object file inspection tool and disassembler +- An assembler, standalone and inline, with basic linker-script support +- A disassembler and object/image inspection +- Object and binary utilities: ar, ranlib, nm, size, strip, objcopy, objdump, + addr2line, strings - A single multi-call binary -- Debug info generation (DWARF) -- Debugger +- Debug info generation and consumption (DWARF) +- An interactive debugger - Header dependency generation - Reproducible builds -- A build and packaging system +- Signed, content-addressed code distribution (`.cfpkg`) - Bootstrap from hex0-seed -- A C library providing access to the above +- A C library providing access to all of the above cfree also provides these headers beyond the freestanding set: - stdatomic.h @@ -37,3 +40,12 @@ And cfree-specific extensions: - cfree/syscall.h - cfree/baremetal.h - cfree/coro.h + +## Documentation + +Start with [`doc/DESIGN.md`](doc/DESIGN.md) — the entrypoint that covers cfree's +design principles, layered architecture, and primary data flows, and indexes a +design doc for every major subsystem (frontends, codegen, IR, optimizer, arch +backends, object formats, linker, JIT, emulator, debug info, debugger, runtime, +driver, packaging, …). Forward-looking roadmaps live in [`doc/plan/`](doc/plan/). +Code-interface detail lives in the public headers under `include/cfree/`. diff --git a/doc/ARCH.md b/doc/ARCH.md @@ -0,0 +1,394 @@ +# Architecture Backends + +This document describes cfree's arch backend abstraction: how a target +architecture plugs into the compiler, what its responsibilities are, and how +the three native backends (aa64, x64, rv64) are structured to maximize sharing +while keeping the ISA-specific seams thin. It also covers the ABI / calling +convention layer in `src/abi`, which is the single authority for storage layout +and call classification. 
The semantic codegen surface a backend sits behind is +in [CODEGEN.md](CODEGEN.md); the IR the optimizer feeds it is in [IR.md](IR.md); +the SSA/regalloc machinery driving the optimizing path is in [OPT.md](OPT.md); +the standalone assembler that shares the ISA tables is in [ASM.md](ASM.md). ABI +content is canonical here. + +## 1. Two layers of "backend": CGBackend and ArchImpl + +A target enters the compiler through two related abstractions that are wired by +struct-prefix subtyping (`src/cg/cgtarget.h`, `src/arch/arch.h`). + +``` + CGBackend ArchImpl + --------- -------- + const char* name; +---> CGBackend backend; (first field) + CgTarget* (*make)(...); | CfreeArchKind kind; + | CgTarget* (*cgtarget_new)(...); + | ArchAsm* (*asm_new)(...); + | ArchDisasm* (*disasm_new)(...); + | int (*apply_label_fixup)(...); + | const LinkArchDesc* link; + | const ArchDecodeOps* decode; (emu/objdump) + | const ArchEmuOps* emu; + | const ArchDwarfOps* dwarf; + | const ArchDbgOps* dbg; + | const ArchAsmOps* asm_ops; + | register-file accessors; + | CFI / .eh_frame CIE constants; + | predefined target macros; +``` + +- A **`CGBackend`** answers exactly one question: "build me a `CgTarget` for this + `Compiler` + `ObjBuilder` + `CfreeCodeOptions`." It is the unit the session + pipeline cares about — it knows nothing about machine code, registers, or + object formats. `cg_backend_c_target` (the C source emitter, see + [CBACKEND.md](CBACKEND.md)) and `cg_backend_check` (the no-emit frontend + checker) are standalone `CGBackend`s with **no `ArchImpl`** — they are + `CGBackend` and nothing more. + +- An **`ArchImpl`** is a `CGBackend` *plus* the machine-code metadata a native + target needs. Because `CGBackend backend` is its first field, + `(const CGBackend*)&arch_impl_x` is a valid upcast — every machine-code arch + is a `CGBackend` by composition. 
The extra fields are everything that is + genuinely arch-specific and not about producing a `CgTarget`: the assembler + and disassembler constructors, the label-fixup encoder, the linker/emu/DWARF/ + debugger op tables, the register file, the predefined macros (`__aarch64__`, + `__x86_64__`, ...), and the DWARF CFI defaults that seed the `.eh_frame` CIE. + +`aa64`, `x64`, and `rv64` expose full `ArchImpl`s (`arch_impl_aa64`, +`arch_impl_x64`, `arch_impl_rv64`). `wasm` also exposes an `ArchImpl` +(`arch_impl_wasm`), but it is a *thin* one: its machine-code seams +(`asm_new`, `apply_label_fixup`, `link`, register accessors, CFI) are all NULL, +since wasm32 has no native machine encoding, no stack-frame ABI, and no +assembly form in this toolchain — it produces a `WasmModule` attached to the +`ObjBuilder` and only provides a disassembler that renders WAT for objdump (see +[WASM.md](WASM.md)). So the precise rule is: native machine-code arches have an +`ArchImpl` whose machine seams are populated; `c_target`/`check` have only a +`CGBackend`; `wasm` has an `ArchImpl` shell with the machine seams nulled out. + +This split is deliberate. The pipeline picks a `CGBackend` per emit; metadata +consumers (DWARF producer, debugger, disassembler, register-name lookups) reach +for an `ArchImpl` and get back NULL when the target has no machine-code identity. +Neither layer leaks into the other. + +## 2. The arch registry + +`src/arch/registry.c` is the sole place that gates the *arch vtable roster* on +`CFREE_ARCH_*_ENABLED` — it is the canonical config-gate site for the backend +axis, mirroring `src/api/lang_registry.c` for frontends. (The flags are also +read by the parallel object-format and ABI registries, which gate their own +rosters on the arch × format cross-product, and by `src/core/config_assert.c` +for build-time validity asserts — see [OBJ.md](OBJ.md) and §7.) Everything +downstream of the registry operates on its outputs and never re-checks the build +flags. 
+ +The registry holds a single static `arch_impls[]` array (each entry gated by its +`CFREE_ARCH_*_ENABLED` flag) and exposes two lookups: + +- `arch_lookup(CfreeArchKind)` / `arch_for_compiler(Compiler*)` walk + `arch_impls[]` and return the `ArchImpl*` whose `kind` matches — the path for + *machine-arch metadata*. `c_target` is intentionally absent from this roster: + it has no `ArchImpl`, so a metadata query for it correctly returns NULL. + +- `cg_backend_for_session(Compiler*, CfreeCodeOptions*)` picks the `CGBackend*` + for an emit. It short-circuits to `cg_backend_check` when `check_only` is set + and to `cg_backend_c_target` when `emit_c_source` is set; otherwise it returns + `&arch_for_compiler(c)->backend`. This is the one place the "is this an + `ArchImpl` or a standalone `CGBackend`?" decision is made, and it does not + consult `arch_impls[]` (the source-emit and check backends are not in it). + +The registry also owns the thin dispatchers `arch_reloc_operand`, +`arch_is_local_branch`, and `arch_reloc_call_pair`, which forward to the target's +`ArchAsmOps` (used by `cc -S` symbolization, see §4 and [ASM.md](ASM.md)), plus +`arch_disasm_*` / `arch_decode_*` / formatter helpers. All are NULL-safe: a +target lacking the relevant op table gets the documented "no transformation" +answer rather than a crash. + +## 3. The NativeTarget contract + +`src/arch/native_target.h` defines `NativeTarget`, the physical machine-emission +contract that all three native backends implement. It is the layer where the +generic codegen drivers stop speaking in semantic terms (`CGLocal` ids, +high-level types) and start speaking in physical terms: hard registers, frame +slots, legal immediates, and concrete addressing modes. A `NativeTarget` never +allocates registers and never decides storage layout — callers hand it +caller-selected, target-legal physical operands; the target only encodes. 
+ +It is driven from two directions: + +``` + -O0 path: CG semantic ops ──► NativeDirectTarget ──┐ + (src/cg/native_*) ├──► NativeTarget ──► MCEmitter ──► ObjBuilder + -O1+ path: CG ──► record IR ──► opt passes ──────────┘ (~35 hooks) + (SSA, machinize, regalloc, pass_native_emit) +``` + +- At **-O0**, the shared `NativeDirectTarget` (`src/cg/native_direct_target.c`) + is the `CgTarget`. It owns semantic local homes, a small register cache, and + conservative flushes, and lowers each semantic op directly into + `NativeTarget` hook calls. The arch supplies a `NativeTarget` plus a small + semantic adapter, `NativeOps` (bind_param, plan_call, emit_call/ret, va_*, + asm_block, barriers, legality predicates) — the parts that need a foot in the + semantic world. Everything else (frame slots, `class_for_type`, `addr_legal`) + the direct target calls straight through to `NativeTarget`. + +- At **-O1+**, the optimizer records IR, runs SSA/CFG passes, machinizes, and + allocates registers (see [OPT.md](OPT.md)), then `src/opt/pass_native_emit.c` + replays the allocated program against the *same* `NativeTarget` hooks. By this + point every value already has a physical home, so the emit pass hands the + target hard registers and frame slots and the target just encodes. + +That a single ~35-hook contract serves both paths is what keeps the two code +generators byte-compatible per arch. The hook families: + +- **Frame & prologue.** `func_begin` (single-pass reserve-and-patch, used by the + direct path) and `func_begin_known_frame` (the optimizer path, where regalloc + has finished so the full frame — slots, callee-saves, alloca, scratch spills — + is known before the prologue). `frame_slot`, `bind_param`/`bind_params_end`, + `reserve_callee_saves`, the optional `emit_prologue` (exact-size in-place + prologue), `note_frame_state`, and `frame_slot_debug_loc` for the DWARF + coordinate of a slot. 
+- **Control flow.** `label_new`/`label_place`, `jump`, `cmp_branch`, + `indirect_branch` (with a valid-target set for jump tables), `load_label_addr` + (for `&&label`). +- **Data movement.** `move`, `load_imm`, `load_const`, `load_addr`, `load`, + `store`, `tls_addr_of`, `copy_bytes`/`set_bytes` (aggregate memcpy/memset), + `bitfield_load`/`bitfield_store`, `spill`/`reload`. +- **Arithmetic.** `binop`, `unop`, `cmp`, `convert`, `alloca_`. +- **Calls & returns.** A two-phase split: `plan_call` turns a `NativeCallDesc` + into a `NativeCallPlan` (arg moves, return slots, clobber/return masks, + outgoing stack size) that the optimizer can inspect during frame planning, and + `emit_call` realizes it. Symmetrically `plan_ret`/`ret`. `call_stack_bytes` + and `signature_stack_bytes` are pure pre-pass queries used to size the + outgoing area and to decide tail-call (sibling) realizability. +- **Atomics & fences.** `atomic_load`/`store`/`rmw`/`cas`, `fence`. +- **Variadics.** `va_start_`/`va_arg_`/`va_end_`/`va_copy_`. All `va_list` + layout knowledge (pointer ABI vs register-save-area ABI, field offsets) lives + behind these and is answered by querying the target ABI; the optimizer makes + no layout assumptions. +- **Intrinsics & asm.** `intrinsic`, `asm_block` (inline asm with constraints), + `file_scope_asm`, plus `trap`, `set_loc`, deferred `patch_add`/`patch_apply`, + `finalize`, `destroy`. 
+ +A handful of small capability flags/queries let the generic drivers specialize +without arch branches: `imm_legal`/`addr_legal` (immediate and addressing-mode +legality), `has_store_zero_reg`/`store_zero_reg` (aa64 `xzr`, rv64 `x0` — store +a constant 0 without materializing it), and the optional `machine_op_clobbers`, +which reports the fixed registers an encoding clobbers as a side effect (x86 +`idiv` writes rax/rdx, a variable shift uses cl, atomics use rax/rcx/rdx) so the +allocator keeps values out of them; aa64/rv64 leave it NULL because their +encodings have no such fixed clobbers. + +**aa64 is the reference backend.** `src/arch/aa64/native.c` is the most complete +and most heavily commented implementation; the x64 and rv64 ports are written +against it. Shared scaffolding extracted across all three lives in +`src/cg/native_frame.c` (slot-offset arithmetic, the frame-final gate, the +used-callee-save derivation, ABI-driven va-save sizing) and +`src/cg/native_argmove.c` (the parallel-copy register shuffle for call-arg and +param marshalling). What stays per-arch is everything ISA-specific: the +slot-offset coordinate transform (fp/s0/rbp-relative), prologue/epilogue +encoding, the slim-prologue variants, and instruction selection. + +## 4. The ISA single-source-of-truth table + +Each native arch has an `isa.h` + `isa.c` pair that is the *one* place its +instruction bit-layout lives. `isa.h` holds inline `pack`/`unpack` encoders +(e.g. `aa64_movz`, `aa64_logsr_pack`/`_unpack`) and a descriptor table +(`aa64_insn_table[]`: `{mnemonic, match, mask, format, flags}`). `isa.c` holds +the table data plus the operand print/parse dispatch keyed on `format`. 
+ +The key property is that **three different consumers share the same tables**: + +``` + src/arch/aa64/isa.{h,c} ◄── single source of truth + │ │ │ + encoder │ disasm │ │ standalone assembler + (native.c emit) │ │ (asm.c) + (disasm.c decode) +``` + +- the **encoder** (codegen in `native.c`) calls the inline `pack` helpers to + emit instruction words; +- the **disassembler** (`disasm.c`) does one mask-and-compare against the table + to identify a word, then dispatches on `format` to the same `unpack` helpers + to extract operands; +- the **standalone assembler** (`asm.c`, the `cfree as` tool and inline-`asm()` + handling, see [ASM.md](ASM.md)) parses mnemonics against the table and encodes + through the same inline helpers. + +(For aa64 the same header is also pulled in by `link.c` and `dbg.c`.) The +invariant: when an opcode value or a field position changes, you update one site +and the encoder, decoder, and assembler stay consistent. The table is ordered +first-match-wins, with alias rows (tighter masks, e.g. `mov` ≡ `orr Rd,zr,Rm`, +`cmp` ≡ `subs zr,...`) placed before the canonical rows so the disassembler +renders the alias spelling while the assembler accepts both. `x64/isa.{h,c}` and +`rv64/isa.{h,c}` follow the identical pattern; x64 additionally factors its +byte-level REX/ModR/M/SIB primitives and prologue/epilogue into `emit.c`. + +The `ArchAsmOps` table (`reloc_operand`, `is_local_branch`, `reloc_call_pair`) +is the *textual* complement to this: it tells the `cc -S` symbolizer how a +relocated operand is spelled for the target object format (aarch64 ELF +`:lo12:sym`, Mach-O `sym@PAGEOFF`, x86-64 `sym(%rip)`/`@PLT`, RISC-V +`%pcrel_hi`/`%pcrel_lo` with anchor pairing) so that re-assembling cfree's `-S` +output reproduces byte-identical objects. It is the inverse of the assembler's +reloc-modifier parser. + +## 5. 
MCEmitter — one generic emitter for all native arches + +`src/arch/mc.c` is a single generic machine-code/object emitter (`MCEmitter`, +declared in `src/arch/mc.h`) used by *every* native arch. It sits between the +backend (or the assembler) and the `ObjBuilder`, and it owns only the bytes-and- +bookkeeping concerns that are genuinely arch-independent: + +- the current section and byte position; +- the **machine-label table**: 1-based `MCLabel` ids, each carrying either a + placement `(sec, offset)` or a list of pending forward-reference fixups that + are applied at `label_place`; plus lazily-minted per-label `SB_LOCAL` symbols + (`.Lcfblk.N`) so code-location references (`&&label`, jump-table entries) + relocate against a real symbol and survive a re-encoding assembler; +- relocation forwarding (`emit_reloc`, `emit_reloc_at`, `emit_label_ref`, + `emit_label_data_reloc`); +- the per-function context (`mc_begin_function`/`mc_end_function`) that the + deferred data-section label relocs read; +- **CFI buffering**: `cfi_startproc`/`cfi_def_cfa`/`cfi_offset`/... records are + buffered per-function and flushed into a `.eh_frame` section by + `mc_emit_eh_frame` at TU finalize. CFI directives are byte-position-bound, so + they live on the one object that already tracks `(section, offset)`. + `cfi_set_next_pc_offset` provides a *sticky* prologue-PC override so backends + that patch the prologue in `func_end` (after the live PC has moved past it) can + pin every frame-state rule to the post-prologue PC. + +Encoding itself is *not* MCEmitter's job — it writes whatever bytes it is handed. +**Arch-specific behavior enters through exactly two thin seams:** + +1. `ArchImpl.apply_label_fixup` — given a resolved label displacement, encode it + into the already-emitted bytes (aa64 splits the 26-bit imm26 of B/BL, the + 19-bit CONDBR, the ADR immlo/immhi, and falls back to a literal-pool `LDR` for + out-of-range `&&label`; x64 writes a 4-byte rel32). 
`mc.c` builds an + `ArchLabelFixup` descriptor and calls through `arch_for_compiler`. + +2. The `ArchImpl.cfi_*` constants — the per-psABI CIE defaults + (`cfi_return_addr_reg`, code/data alignment factors, initial CFA reg/offset) + that `mc_emit_eh_frame` reads to encode the CIE. + +This is the single most leverage-dense decision in the backend layer: the entire +`.eh_frame` producer, label resolution, relocation plumbing, and section/byte +management is written once and reused, with only those two pinpoint hooks per +arch. `mc.h` is split out from `arch.h` precisely so the many emission-only +consumers (per-arch emit/ops TUs, the assembler, the Debug producer) do not +transitively pull in the decode/disasm/emu/dbg surfaces. + +## 6. Register files + +Each native backend declares its register file as static `NativeRegInfo` data in +its `native.c` (e.g. `aa_reg_info`, wired into the `NativeTarget` at +construction; the DWARF-index ↔ assembler-name tables that the `ArchImpl` +exposes for objdump/asm live separately in `regs.c`). A `NativeRegInfo` is a set +of `NativeAllocClassInfo` (one per `NativeAllocClass`: INT, FP, VEC), each +carrying: + +- an ordered **allocable** list — registers the allocator may assign, ordered by + preference (aa64 lists caller-saved first so the allocator prefers them and + avoids prologue saves); +- a **scratch** list — registers reserved for the backend's own temporaries + (address materialization, atomic retry loops) and never handed to the + allocator; +- a `NativePhysRegInfo` row per physical register (class, ABI arg/ret index, + caller/callee-saved flags, spill/copy costs); +- precomputed caller/callee/arg/ret/reserved bitmasks. + +This one declaration feeds **both** code paths: + +- the **-O0 direct path** resolves `reg_info` and the three `class_info[]` + pointers once at `NativeDirectTarget` construction, so its register cache + (allocate / evict / scratch-acquire) is an O(1) lookup. 
It uses the allocable + order as a simple "next free register" pool with conservative flushes. +- the **-O1 allocator** (`src/opt`) consumes the same allocable lists, costs, and + masks as its interference-graph inputs, and reports the callee-saves it + actually used back through `func_begin_known_frame` / `reserve_callee_saves` + so the backend can reserve save slots and emit the matching prologue/epilogue. + +Because incoming arg registers are marked non-allocable, register-destination +param binds can never alias a live incoming arg, which is what lets `bind_param` +ordering be unconstrained and lets `bind_params_end` resolve a param permutation +as a single parallel copy. + +## 7. The ABI / calling-convention layer + +`src/abi` is the single authority for target-dependent **storage layout** and +**call classification**. Frontends lower source types to `CfreeCgTypeId` before +entering it; from there the answers are language-agnostic. The public surface is +`TargetABI` (`src/abi/abi.h`), reachable as `c->abi` and consulted by both the +semantic codegen (`src/cg/local.c` for local sizing, `cg/*` for layout) and the +optimizer (`src/opt/cg_ir_lower.c` resolves `abi_cg_func_info` to drive +param-bind and call lowering). It is the canonical owner of: scalar sizes/aligns, +struct/union record layout (including bitfield storage units), function argument +classification (`ABIFuncInfo`: per-arg `DIRECT`/`INDIRECT`/`EXPAND`/`IGNORE`, +sret, byval, sign/zero-ext, vararg routing), and `va_list` shape. + +The layer is split into a shared core and per-ABI vtables: + +- **`abi.c` holds everything C-standard-driven and identical across ABIs**: the + scalar profile (LP64 sizes), record layout computation, and the memoizing + caches for record layouts and function info. This is shared by all targets. +- **`abi_internal.h` defines `ABIVtable`** — the parts that genuinely vary: + `compute_func_info` (the argument/return classifier) and the `va_list` + type/layout facts. 
+- **`registry.c` selects the per-(arch, object-format) vtable.** Like the arch + and obj registries, it gates entries on the combined `CFREE_ARCH_*` + + `CFREE_OBJ_*` flags and maps `(CfreeArchKind, CfreeObjFmt)` to an `ABIVtable` + via `abi_vtable_lookup`. `abi_init` does this lookup once at compiler init. + +ABIs are a *derived* axis, not a user-facing knob: every valid ABI is a 1:1 +function of an (arch, OS-family) pair, where OS family follows from the object +format (ELF → SysV/AAPCS-style, Mach-O → Apple, COFF → Windows). The registry +therefore enumerates the cross-product cells that both sides enable: + +| Arch | ELF (SysV-ish) | Mach-O (Apple) | COFF (Windows) | +|----------|----------------|----------------|-------------------| +| aa64 | `aapcs64` | `apple_arm64` | `aapcs64_windows` | +| x64 | `sysv_x64` | `apple_x64` | `win64_x64` | +| rv64 | `rv64` | — | — | +| wasm | — (`wasm32`, via the wasm object format) | + +Each per-ABI TU (`abi_aapcs64.c`, `abi_sysv_x64.c`, `abi_apple_arm64.c`, +`abi_apple_x64.c`, `abi_win64_x64.c`, `abi_aapcs64_windows.c`, `abi_rv64.c`) +implements its `compute_func_info` and `va_list` facts; the Apple/Windows +variants encode their divergences (e.g. Apple ARM64 routes the variadic tail +exclusively to the stack, recorded as `vararg_on_stack` in `ABIFuncInfo`). The +classification is the *only* authority — the `NativeTarget` plan/bind hooks and +the optimizer both consume `ABIFuncInfo`; they never re-derive argument +placement. Frame-relevant ABI facts (the vararg register-save-area size) are +funneled through `src/cg/native_frame.c` so the per-arch magic numbers all trace +back to one `va_list`-layout query. + +## 8. Per-call cost model (aa64 -O1) + +The fixed per-call overhead a backend pays — prologue, epilogue, and call-site +setup, independent of the function body — dominates call-heavy workloads, so the +aa64 known-frame path is structured to minimize it. 
The backend chooses one of a +small set of frame shapes per function (decided in `aa_func_begin_known_frame`, +encoded in `native.c`): + +| Frame shape | When | Fixed insns (entry+exit, excl. `ret`) | +|-------------------|------------------------------------------------------------|----------------------------------------| +| slim prologue | leaf-ish: no callee-saves, no alloca, no body slots, no outgoing stack | 3 (optimal) | +| `fp_at_bottom` | ≥1 callee-save/body slot, **no outgoing stack args**, frame ≤ 504 | 5 (optimal) | +| `slim_small_frame`| as above but with outgoing stack args | 7 | +| fat | large frame / alloca / big saved-pair offset | 7+ | + +The key structural idea is `fp_at_bottom`: when there are no outgoing stack args, +the frame record moves to the *bottom* of the frame (`fp = sp`), so the sp +adjustment folds into a pre/post-indexed `stp x29,x30,[sp,#-N]!` / `ldp +x29,x30,[sp],#N`, and callee-saves stack *above* the record at positive offsets. +This is the common case for any function that keeps a value live across a call +without itself passing >8 register-class args, and it reaches the same 5-insn +fixed cost as gcc -O0; the DWARF CFA becomes `fp + frame_size`. Functions with +outgoing stack args can't move the record to the bottom (the args live there), +so they keep the top-record `slim_small_frame` layout. This availability +asymmetry — bottom-record only on the known-frame path — exists because the +frame-size-dependent offsets require the frame to be final before the body, which +is only true under the optimizer's `func_begin_known_frame`. + +--- + +Remaining and planned per-arch work (deferred niche encodings, audit +follow-ups) is tracked in [plan/ARCH.md](plan/ARCH.md). diff --git a/doc/ASM.md b/doc/ASM.md @@ -1,278 +1,281 @@ -# ASM — assembler, disassembler, inline asm - -Scope: cfree's asm frontend — standalone `.s`, inline `asm("...")`, and the -matching disassembler. aarch64 only today; x64 / rv64 are stubs that panic -cleanly. 
Companion to `DESIGN.md §10`. - -Asm and disasm are designed together: one description of each instruction -serves both. When an opcode bit moves, encoder and decoder update at one site -and stay in sync by construction. - ---- - -## 1. Status - -| layer | state | -|---|---| -| standalone `.s` (parse → ELF, JIT, round-trip) | aarch64 ✓ | -| disasm (`cfree_disasm_iter_*`, `cfree_obj_disasm`) | aarch64 ✓ | -| inline `asm("...")` C statement | aarch64 ✓ | -| `cfree as` multi-call driver subcommand | aarch64 ✓ | -| `cfree_arch_register_name` / `_index` | aarch64 ✓ | -| x64 / rv64 backends (asm, disasm, inline) | panic with clean diagnostic | - -Coverage of `aa64_asm.c` per-mnemonic table: `nop, ret/br/blr, -mov(reg/imm)/mvn/movz/movn/movk, add(s)/sub(s)/cmp/cmn/neg(s), -and/orr/eor/bic/orn/eon/ands/bics, madd/msub/mul/mneg, -udiv/sdiv/lslv/lsrv/asrv/rorv, b/bl/b.<cc>/cbz/cbnz, svc/brk/hlt, ldr/str -(scaled + simm9 fallback), ldur/stur, ldp/stp (signed-offset + pre-indexed), -adr/adrp, dmb/dsb/isb/clrex`. +# ASM — the assembler(s) + +cfree contains three textual-assembly surfaces that all feed the same object +path: a standalone GNU-`as`-compatible assembler (`cfree as`, `cc` on a `.s` +input), an inline-`asm("...")` statement plumbed through codegen, and a +symbolizing disassembler-to-text printer (`cc -S`). They are deliberately +co-designed: the per-arch encoders are the single source of truth for bit +layout, the disassembler shares those encoders, and `cc -S` renders operands +as the *inverse* of what the assembler parses — so `cc -S | as` round-trips +`cc -c` (see [TESTING.md](TESTING.md)). This document describes the layering +and the invariants; ISA encoding tables live in [ARCH.md](ARCH.md), object +emission in [OBJ.md](OBJ.md). --- -## 2. Encode/decode pairing — the design discipline +## 1. 
Layering ``` - ┌──────────────────┐ - asm text ─lex─► │ per-format │ ─pack(fields)─► u32 bytes - │ parse_operands │ │ - bytes ──────────┤ │ ◄─unpack(word)── │ - │ per-format │ - disasm text ◄────────┤ print_operands │ - └──────────────────┘ - ▲ - │ - AA64InsnDesc { mnemonic, match, mask, format, AsmFlags } + .s text ──► AsmLexer ──► AsmDriver (arch-neutral) + │ directives, labels, section/symbol state, + │ expression evaluator, string decoding + ▼ + ArchAsm vtable .insn(driver, mnemonic) + │ per-arch instruction parser + ▼ (per-arch encoders == ISA source of truth) + MCEmitter ──► ObjBuilder ──► ELF / Mach-O + ▲ + inline asm("...") ──► CgTarget.asm_block ─┘ (same MCEmitter, same encoders) + + ObjBuilder ──► arch_disasm_decode ──► cc -S printer (asm_emit.c) + ▲ symbolizes operands, re-spells dirs + └── shared with objdump / cfree_disasm ``` -Per format (in `aa64_isa.h`): `AA64<Fmt>` struct + `pack` / `unpack` / -`print`. Per instruction (one row in `aa64_insn_table`): mnemonic, match, -mask, format, AsmFlags (alias / sf-required / etc.). - -**Source of truth.** Encoder, decoder, asm parser, asm printer all go through -`aa64_isa.h`. No second copy of the bit layout anywhere. If `S` -(asm-roundtrip) fails on a cg-emitted word, fix the format definition; never -the parser site. +Three seams keep the design factored: -**Aliases** (`MOV` for `ORR Rd, ZR, Rm`; `MUL` for `MADD ..., ZR`; `NEG` for -`SUB Rd, ZR, Rm`) are extra rows with tighter masks placed *before* the -canonical row. First-match-wins picks the alias spelling. +- **driver ↔ per-arch parser**: `src/asm/asm_helpers.h`. `AsmDriver` is opaque + to per-arch code; the helper surface (peek/next, `eat_punct`, `parse_const`, + `parse_sym_expr`, `intern_sym`, `cur_section`, `panic`) is the only contact. 
+- **assembler/codegen ↔ object bytes**: `MCEmitter` (src/arch/mc.c) is the + byte/reloc sink for *both* the standalone assembler and C codegen, so a + hand-written `.s` and a compiled `.c` produce structurally identical objects. +- **printer ↔ per-arch operand syntax**: `ArchAsmOps` (src/arch/arch.h), + reached via `arch_reloc_operand` / `arch_is_local_branch` / + `arch_reloc_call_pair`. This is the inverse of the per-arch reloc-operand + parsers and keeps `cc -S` arch-agnostic but format-aware. --- -## 3. Module layout +## 2. The lexer — `src/asm/asm_lex.c` -``` -src/parse/parse_asm.c arch-agnostic .s driver: directives, labels, - expression evaluator, string decoding. - asm_driver_open_inline constructor for inline - asm template parsing. -src/parse/parse_asm_helpers.h - driver↔arch seam (peek/next/eat_*/parse_const/ - parse_sym_expr/intern_sym/panic). - AsmDriver stays opaque. -src/parse/parse.c parse_asm_stmt: GNU asm("...") statement - grammar (volatile, goto, four colon-separated - lists, [name] symbolic operands). -src/arch/aa64/isa.{h,c} per-format pack/unpack/print + AA64InsnDesc - table + alias flags. Shared between encoder, - decoder, and printer. -src/arch/aa64/asm.{h,c} aa64 instruction parser: per-mnemonic dispatch - over the table → inline encoders. - aa64_inline_bind + aa64_asm_run_template - implement the inline-asm template walker. -src/arch/aa64/disasm.{h,c} aa64 ArchDisasm impl wrapping aa64_disasm_find + - aa64_print_operands; synthesizes b.<cond>. -src/arch/aa64/regs.{h,c} canonical aarch64 register name list. -src/arch/disasm.c arch_disasm_new dispatch on c->target.arch. -src/arch/aa64/arch.c aa_asm_block: CGTarget vtable entry for inline - asm; opens AA64Asm, binds operands, runs - template, closes. -src/cg/cg.c cg_inline_asm: constraint binder (pops inputs, - allocates output regs, handles "memory" - clobber, calls target->asm_block, pushes - outputs). -src/opt/opt.c w_asm_block recorder + IR_ASM_BLOCK replay - (mirrors w_call / IR_CALL). 
-src/api/disasm.c cfree_disasm_iter_* / cfree_obj_disasm + reloc/ - symbol annotation overlay. -src/api/arch_regs.c stateless cfree_arch_register_name / _index - dispatcher. -driver/cmd/as.c cfree as subcommand wired to - cfree_compile_obj_emit(CFREE_LANG_ASM). -``` +`AsmLexer` streams tokens from a borrowed source buffer. It intentionally keeps +C-like number/string spelling rules, because `.s` sources arrive *after* C +preprocessing and GNU `as` accepts those spellings in directives and +expressions. It does line-splice handling (phase-2 `\<newline>`) and treats +comments (`//`, `/* */`) as whitespace, but surfaces physical newlines as +`ASM_TOK_NEWLINE` so the driver can stay line-oriented. -Three seams: (a) `parse_asm` ↔ per-arch instruction parser via -`parse_asm_helpers.h`, (b) `MCEmitter` as the byte sink for both asm and -codegen, (c) `arch_disasm_new` ↔ per-arch decoder. `aa64_isa.h` is the -shared truth crossing all three. +Two classification quirks are load-bearing: ---- +- **`.L`-prefixed names lex as a single identifier**, leading dot included. + This is the universal GNU convention for assembler-local labels + (`.Lfoo`, `.LBB0_1`, `.Lcfblk.3`). It is unambiguous against directives — no + directive starts with `.L` — so `.text` / `.section` still tokenize as + `'.'` + `IDENT` and reach the directive dispatcher. +- **`name.N` (dot-then-digit) continues an identifier** so discriminator-mangled + symbols (`acc.1`) survive, but `.`+letter does not glue — leaving mnemonic + suffixes (`b.eq`) and the location-counter dot (`. - foo`) to be reassembled + or evaluated by the driver. -## 4. Inline asm — constraint binder + template walker - -**Constraints (v1)**: `r`, `=r`, `+r`, `=&r`, `i`, `m`, `0` (matching by -index). `AsmConstraint` carries `{str, name, type, dir}` — `name` is the -optional `[name]` Sym, `type` is the bound expression's C type (drives -`RegClass` + width). Hand-built test constraints with `NULL` type fall back -to 64-bit int. 
`+r` is parser-decomposed into `=r` + a synthesized matching -`"<k>"` input that pushes the lvalue's current value; the binder's -matching-constraint path then copies it into the bound output reg before -the asm runs. - -**Clobbers**: -- `"memory"` — spill all live RES_REG SValues via `target->spill_reg`; - subsequent reads reload through `target->reload_reg`. Same machinery cg - uses across function calls. -- Register names (`"x0"`, …) — resolved via `target->resolve_reg_name`. - `cg_inline_asm` spills any live SValue currently bound to the named - phys reg (when `"memory"` didn't already sweep the stack) and rejects - overlaps with in/out operands. `aa_asm_block` additionally bumps the - callee-save high-water mark so the prologue saves/restores callee-saved - regs the asm body trashes even when no SValue ever bound them. -- `"cc"` — silently ignored on aarch64 (NZCV reserved across the block). - -**Placeholders**: `%N` / `%NN` (1- or 2-digit operand index), `%wN` (force -W form), `%xN` (force X form), `%[name]` and `%w[name]`/`%x[name]`/`%a[name]` -(symbolic operand, optionally with width modifier; resolved against -`AsmConstraint.name`), `%aN` (memory addressing form), `%%`. The walker -pre-substitutes them into asm source text and re-lexes through the -standalone per-mnemonic parsers — no second operand grammar. Line splits -on `;` and `\n` honor bracket depth and quoted-string state, so a literal -`;` inside `[...]` does not split a statement. - -**IR**: `IR_ASM_BLOCK` with `IRAsmAux { tmpl, outs, ins, in_ops, out_ops, -clobbers, nout, nin, nclob }`. The opt recorder arena-copies the payload; -replay xlat_op's each Operand and forwards to the wrapped target. - -**`asm volatile`**: accepted but informational — `IR_ASM_BLOCK` is already -opaque-to-passes, so volatile changes nothing at the IR level. +`#` is a distinct token (`ASM_TOK_HASH`) because it is both the asm immediate +marker and the cpp linemarker introducer; the driver disambiguates by position. 
--- -## 5. Testing - -``` -make test-asm # standalone .s harness -make test-isa # aa64 ISA descriptor table -make test-aa64-inline # aa64 inline-asm walker (hand-built Operands) -make test-cg-binder # cg_inline_asm constraint binder (mock CGTarget) -make test # includes all of the above -``` - -### `test/asm/` — standalone - -Three sub-corpora keyed off filename suffix: - -| dir | input | expected | drives | -|---|---|---|---| -| `test/asm/encode/` | `<name>.s` | `<name>.expected.hex` | `cfree as`, hex-compare | -| `test/asm/decode/` | `<name>.hex` | `<name>.expected.txt` | `cfree_disasm_iter_*`, text-compare | -| `test/asm/listing/` | `<name>.in.bin` (ELF) | `<name>.expected.lst` | `cfree_obj_disasm`, listing-compare | - -`test/asm/regen.sh` regenerates from host `as` / `objdump` (maintainer -aid; not run by CI). - -### Path letters — `test/asm/run.sh` and `test/cg/run.sh` - -| letter | path | check | -|---|---|---| -| `H` | hex encode | `--encode` → diff `.expected.hex` | -| `T` | text decode | `--decode` → diff `.expected.txt` | -| `L` | listing | `--listing` → diff `.expected.lst` | -| `D` | direct JIT | `--jit` → exit code | -| `J` | JIT via file | `--emit` + `jit-runner` | -| `E` | ELF exec | `--emit` + `link-exe-runner` + qemu/podman | -| `S` | asm round-trip (cg corpus) | decode every cg-emitted insn, re-assemble, byte-compare | - -`S` is opt-in on `test/cg/run.sh` (default matrix stays `DREJW`) until the -remaining cg-emitted formats land in `aa64_insn_table`. Run explicitly: - -``` -bash test/cg/run.sh '' DREJWS # full matrix incl. S -bash test/cg/run.sh '' S # just S -``` +## 3. The arch-neutral driver — `src/asm/asm.c` + +`asm_parse` runs the top loop: skip blank lines, skip a `#`-at-BOL cpp +linemarker, dispatch `.directive` lines, treat `IDENT :` as a label +definition, and otherwise hand `IDENT [.suffix...]` to the per-arch instruction +parser. Composite mnemonics (`b.eq`, RISC-V `fcvt.w.s`, `amoadd.d`) arrive as +`IDENT '.' 
IDENT ...` and are reassembled (`maybe_compose_mnemonic`) before +dispatch; dotted directive/section names (`.rodata.foo`) are stitched the same +way. The per-arch parser is created from the target's `ArchImpl.asm_new`; an +arch without that hook is a clean panic. + +**State held by the driver.** Current section, three hashmaps — `Sym→ObjSecId` +(sections), `Sym→ObjSymId` (symbols), `Sym→AsmEqu` (`.set`/`.equ` constants). +The symbol map ensures a forward reference (`b foo` before `foo:`) shares one +`ObjSymId` with its later definition. New symbols are minted SB_LOCAL/SK_NOTYPE +and, post-parse, `promote_undef_externs` turns any still-undefined local into an +undefined *global* — matching GNU `as`, since a local UNDEF can't pull an +archive member at link time. + +**Directives.** Section switches (`.text/.data/.rodata/.bss/.section`), symbol +attributes (`.globl/.local/.weak/.hidden/.type/.size/...`), data emission +(`.byte/.short/.long/.quad`, `.ascii/.asciz/.string`, `.zero/.fill/.align/ +.p2align`, `.uleb128/.sleb128`, `.inst`), `.comm/.lcomm` (SK_COMMON), +`.set/.equ`. `.section` parses both GNU (`,"flags",@type,entsize`) and — when +the target object format is Mach-O — the `segname,sectname,type` dialect; cfree +`as` parses the dialect of *its* target only, no hybrid. CFI, `.loc/.file`, +`.option` (RISC-V), and a handful of other directives are accepted-and-ignored +so a real `.s` from `cc -S` parses to completion; an unknown directive recovers +by skipping to end-of-line. + +**Expression evaluator.** A precedence-climbing evaluator over +`+ - * / % << >> & | ^ ~` and parentheses. Pure-constant subexpressions fold; +symbol-involving expressions are restricted to `sym ± const`. The lone `.` +token is the location counter, valid only as `sym - .`, which the +`.long/.quad` path turns into a PC-relative data relocation (`R_PC32`/`R_PC64`) +rather than an absolute one. 
`asm_driver_parse_const` rejects any symbol; +`asm_driver_parse_sym_expr` returns `(sym, offset)`. + +**Symbolic data relocations.** A `.quad sym+8` in `.data` goes through +`MCEmitter.emit_reloc_at` against the existing `RelocKind` set — no new +mechanism. The addend is pre-written into the data field (not zeroed): Mach-O +REL relocs carry the addend implicitly in the relocated field, and ELF RELA +overwrites it harmlessly, so both converge on `sym + addend`. cfree codegen +emits data relocs the same way. + +**Same-section branch relaxation.** The per-arch parser emits a relocation for +*every* symbolic branch target, because a forward `b .Lfoo` is only known to be +local once `.Lfoo:` appears. After the parse, `relax_local_branches` resolves +intra-section branches in place — compute the displacement, patch the +instruction, drop the relocation — for PC-relative *branch* kinds whose target +is a same-section, locally-bound, non-function symbol. The two guards match +the two systems this must agree with: GNU `as` keeps the relocation on a global +(preemptible) target, and cfree codegen keeps the relocation on an intra-file +call/tail-call to a function symbol while resolving branches to internal +labels. This is what makes a control-flow-bearing `cc -S | as` reproduce +`cc -c`'s `.text` relocation table. --- -## 6. Remaining TODOs +## 4. Per-arch instruction parsers + +`src/arch/{aa64,x64,rv64}/asm.c`. Each implements the `ArchAsm` vtable: a +per-mnemonic dispatch that reads operand tokens through the driver helpers and +emits the encoded bytes via the arch's ISA encoders — the **same encoders the +disassembler decodes through**, so when an opcode bit moves the encoder and +decoder update at one site and stay in sync by construction. Aliases +(`mov`/`neg`/`cmp`/`mul`; AT&T size-suffix folding on x64; RISC-V pseudos like +`call`/`tail`/`la`) are handled inside the parser, branching on operand shape +where one mnemonic admits several forms. 
+ +### Reloc-operator syntax (per-arch, per-object-format) + +A symbolic operand can carry a relocation modifier. The spelling depends on +both the architecture and the object format; each arch's parser accepts exactly +the dialect of its target (`asm_driver_compiler(d)->target.obj`): + +- **aarch64** — ELF spells modifiers as a `:mod:` *prefix* + (`:lo12:sym`, `:got:sym`, `:got_lo12:sym`); Mach-O spells them as an `@MOD` + *suffix* (`sym@PAGE`, `sym@PAGEOFF`, `sym@GOTPAGE`, `sym@GOTPAGEOFF`). Both + map to the same internal `AA64RelMod` so downstream reloc emission is shared; + the load/store `:lo12:` reloc is selected by access size. A bare `adrp sym` + is the implicit page reloc on ELF, but Mach-O requires the explicit `@PAGE`. +- **x86-64** — RIP-relative memory operands use `sym(%rip)`; the GOT form is + `sym@GOTPCREL(%rip)`; a call target may carry `@PLT`. A symbolic memory + displacement that is not `(%rip)` is rejected. +- **rv64** — the `%hi`/`%lo`/`%pcrel_hi`/`%pcrel_lo`/`%got_pcrel_hi` operator + syntax, identical on every object format. `%pcrel_lo(label)` names the AUIPC + *anchor label*, not the target symbol, per the RISC-V ABI; the `la`, `call`, + and `tail` pseudos expand to AUIPC+ADDI / AUIPC+JALR pairs with the + appropriate paired relocations. -End-to-end inline asm: complete (smoke case in `test/cg/harness/cases_asm.c`, -grammar-test `.skip` lifted, multi-digit + bracket-aware template walker, -named-clobber RA spill + prologue hwm bump). `asm goto` remains -unsupported in v1 per §7 below. - -`aa64_insn_table` coverage (gates `S` on the full cg corpus): - -- [ ] bitfield (`ubfm`/`sbfm` families) -- [ ] condsel (`csel`/`csinc`/`csinv`/`csneg`) -- [ ] FP-DP1 / FP-DP2 (`fadd`/`fsub`/`fmul`/`fdiv`/`fneg`/`fabs`/`fsqrt`) -- [ ] FP↔int cvt (`fcvtzs`/`scvtf` families) -- [ ] ldst-exclusive (`ldxr`/`stxr`/`ldaxr`/`stlxr`) -- [x] memory barriers (`dmb`/`dsb`/`isb`/`clrex`) — landed; consolidated - `aa_fence` onto the shared helper. 
-- [ ] system reg access (`mrs`/`msr`) -- [ ] data-processing 1-source (`rbit`/`rev`/`clz`/`cls`) -- [ ] SIMD basic (the cg-emitted subset) - -Once these land, flip `S` into the default cg matrix and drop the -`asm_01_grammar.skip`. - -Multiarch seam (Phase 5): - -- [ ] `arch/disasm.c::arch_disasm_new` switches by `c->target.arch` - (currently aarch64-only; x64 / rv64 panic). -- [ ] `parse_asm` driver dispatches per-arch instruction parser by - `c->target.arch`. `aa64_asm_open` becomes one of N constructors. -- [ ] `cfree_arch_register_name` dispatched the same way. -- [ ] `x64_isa.{h,c}` + `rv64_isa.{h,c}` skeletons (formats + tables, - not populated). x64 brings AT&T, rv64 brings GNU. - -Driver / runtime: +--- -- [ ] CFI directives (`.cfi_*`) are parsed and accepted-but-ignored; - forward to `MCEmitter.cfi_*` once those hooks store records. -- [ ] `.loc` / `.file` likewise accepted-and-ignored; wire to - `mc->set_loc` for inline DWARF line info. -- [ ] `__cfree_setjmp.s` (and any other `.s` in `rt/`) migrate from clang - to `parse_asm` once the assembler proves on the cg corpus. +## 5. Inline asm — `src/cg/asm.c` + per-arch template walkers + +`cfree_cg_inline_asm` (src/cg/asm.c) is the public-API constraint binder. It +maps the GCC operand model onto the CG stack: + +- **Constraints**: `r`/`=r`/`+r`/`=&r`, `i`, `m`, and matching `0`..`9`. `+r` + is decomposed by the frontend into `=r` plus a synthesized matching input; + the binder copies the lvalue's current value into the bound output register + before the asm runs. Inputs are popped off the CG stack; `=r` outputs get + fresh temp locals; early-clobber (`=&r`) outputs are allocated *after* inputs + are bound and checked for collision. `m` operands resolve to an indirect + (address-of) operand. +- **Clobbers**: `"memory"` spills every live RES_LOCAL stack value (the same + machinery cg uses across function calls); named register clobbers are + resolved by the arch and force callee-save preservation. 
`"cc"` is benign. +- The bound operands (combined `outs` then `ins`, GCC indexing) plus the + template string are handed to `CgTarget.asm_block`. Results come back as + fresh SValues pushed onto the stack. + +The **template walker** lives per-arch (`<arch>_asm_run_template`, e.g. +src/arch/aa64/asm.c). Rather than carry `Operand`s inside tokens, it +*pre-substitutes* placeholders into physical asm text and **re-lexes that text +through the same per-mnemonic parsers** used by the standalone driver — one +operand grammar, one lexer. It splits the template on `\n`/`;` honoring bracket +depth and quote state, substitutes `%N`/`%NN`, width forms (`%wN`/`%xN`), +address form (`%aN`), and symbolic `%[name]`/`%w[name]` (resolved against the +constraint's `[name]`), then drives each rendered line through +`asm_driver_open_inline` — an `AsmDriver` built around a memory-backed lexer and +the caller's `MCEmitter` that emits into cg's *current* section and does not +allocate a default `.text`. + +`CgTarget.asm_block` is wired in each backend's `native.c`: the optimizer path +binds the allocator's pre-assigned registers; a direct path self-allocates and +saves/restores callee-clobbered registers around the block. +`CgTarget.file_scope_asm` reuses the full standalone `asm_parse` over a memory +lexer, so a top-level `asm("...")` is just a small `.s` translation unit. `asm` +statements become an opaque `IR_ASM_BLOCK` instruction (`IRAsmAux` payload) that +the optimizer records and replays like `IR_CALL` (see [IR.md](IR.md), +[OPT.md](OPT.md)); `asm volatile` needs no special IR handling because the block +is already opaque to passes. -Deferred / explicitly out of scope: +--- -- Macros, `.if`, `.macro`, `.altmacro` inside templates — same deferral - as standalone. -- Multi-alternative constraints; most GCC letter constraints — tracked - under `DESIGN.md §10`. -- WAT for WASM (separate document). +## 6. 
`cc -S` — symbolized disassembly — `src/api/asm_emit.c` + +`cc -S` does not have a separate textual back end. It **disassembles the +already-emitted object** (`arch_disasm_decode`, the exact decoder used by +`objdump` and `cfree_disasm`; see [OBJ.md](OBJ.md)) and re-spells the result as +re-assemblable text. The work is in making operands and directives faithful +enough that the output feeds back through the assembler to the same object. + +**Directive spelling — the `AsmSyntax` vtable.** A tiny vtable selected by +`c->target.obj` (not arch) supplies the format-specific directives: +`section_header`, `sym_type` (ELF `.type`, Mach-O none), `sym_size`, `align`. +Selecting by format is correct because an x64-ELF and aa64-ELF `.s` share +`.type`/`.size`/`.section`; everything else (`.globl`, `.comm`, labels, data +directives, instruction lines) is format-neutral and stays on the shared path. + +**Operand symbolization — the `ArchRelocOperand` inverse.** For each +relocation covering an instruction, `arch_reloc_operand` returns how that +operand should be spelled: a `prefix`/`suffix` modifier (the inverse of the +per-arch parser of §4), an `addend_bias` (x86-64 rel32 relocs store `addend-4`; +the bias undoes it so the printed offset is the *symbol* offset), and a +*surgery kind* telling the printer where in the operand text to splice the +symref: + +- `SURG_TAIL` — replace the last comma component / whole operand (branch + targets, aarch64 `adrp`). +- `SURG_MEM` — rewrite the offset inside `[...]` (aarch64 load/store). +- `SURG_RIP` — insert `sym` before `disp(%rip)` (x86-64); auto-selected + whenever the operand text contains `(%rip)`, so one reloc kind (`R_PC32`) + serves both a branch and a RIP-relative memory operand. +- `SURG_RV_LO12` — RISC-V low-half: rewrite the displacement of a `disp(base)` + load/store, or append `%lo(...)` as a new operand to a register-immediate + form (the assembler folds it back into the ADDI). 
+ +Three printer-side reconstructions complete the round-trip: + +- **Intra-section branch labels.** A relaxed branch (§3) carries no relocation, + so the disassembler prints a numeric target. `arch_is_local_branch` flags such + mnemonics; the printer pre-scans for these targets, synthesizes a local label + at each, and rewrites the operand to name it. +- **RISC-V hi/lo anchors.** A `%pcrel_hi` (AUIPC) reloc sets `emit_anchor`, so + the printer defines a unique `.Lpcrel_hi_<sec>_<off>:` label at it; the paired + `%pcrel_lo` reloc sets `ref_anchor` and its operand names the nearest + preceding anchor — not its own symbol. (Codegen's shared `.LpcrelHi` symbols + are suppressed and replaced by these synthesized unique anchors.) +- **Call-pair fusion.** `arch_reloc_call_pair` collapses a disassembled + AUIPC+JALR pair (RISC-V `R_RV_CALL`) back into a single `call`/`tail sym` + pseudo, reading the partner JALR's link register to choose call vs tail. + +Data relocations are printed by `emit_data_range` as `.quad sym+addend` (bare +symbol, no modifier), with a trailing ` - .` for the PC-relative kinds so the +assembler re-derives `R_PC{32,64}`. An undecodable instruction byte falls back +to `.byte 0x..`, and an undecodable instruction word to `.inst 0x...` so nothing +is silently dropped. --- -## 7. Decisions - -- **Disasm immediate format**: context-sensitive. Signed decimal for fields - the ISA defines as signed (branch displacements, signed-imm12 add/sub, - load/store offsets). `0x`-prefixed hex everywhere else. `aa64_<fmt>_print` - carries a per-field signedness bit; goldens lock the chosen form. -- **`.s` constant expressions**: arithmetic with parens — `+ - * / % << >> - & | ^ ~` over signed integer constants and `sym + const` terms. - Symbol-involving expressions restricted to `(sym ± const)`. Reloc-modifier - syntax (`:lo12:sym`, `:got:sym`) and macro counters (`\@`) are deferred. -- **Absolute relocs in `.s`** (e.g. 
`.quad some_sym + 8` in `.data`) go - through `MCEmitter.emit_reloc_at` against the existing `RelocKind` set — - no new mechanism. -- **Operand transport for inline asm**: parser pushes only inputs onto the - CG stack; outputs come back as fresh SValues that the parser assigns to - the declared lvalues. Matches the `cg_inline_asm` docstring at - `src/cg/cg.h:178-181`. -- **Template lexing**: pre-substitute placeholders to physical asm text and - re-lex via the standalone parser, instead of carrying `Operand`s in `Tok` - variants. One operand grammar, one lexer; cost is one extra StrBuf pass - per inline block. -- **Memory clobber**: route through `target->spill_reg` / - `target->reload_reg` — same machinery cg already uses across function - calls. No new flush mechanism. -- **`asm goto`**: parsed and rejected in `cg_inline_asm`. Keyword grammar - ships; label-ref machinery does not. -- **Self-hosting**: per `DESIGN.md §12`, anything in `src/` must be - C11-freestanding-writable. `parse_asm.c`, `aa64_asm.c`, `parse_asm_stmt` - in `parse.c` all follow the rule. `rt/` is on its own bootstrap track. +## 7. The encoding–relocation invariant + +A code-location reference that an encoding-divergent assembler must be able to +*recompute* never bakes a fixed `function + byte-offset`. Switch jump-table +entries and `&&label` address-takes relocate against a **per-block local +symbol** (`mc_label_symbol`, `src/arch/mc.c`) whose value *is* the label's +offset — minted lazily as `.Lcfblk.<id>` and defined when the label is placed. +A third-party assembler that re-encodes the function to different instruction +lengths still resolves such an entry to the right address, where a fixed +`fn+offset` would point into the wrong instruction. `cc -S` relies on exactly +this: jump tables print as `.quad .Lcfblk.*` and re-assemble against the same +local symbols. 
Only `.L`-prefixed (and ordinary) names are treated as +re-assemblable operands; other dotted names (section symbols) keep the numeric +form because the expression parser does not accept them. diff --git a/doc/ASM_ROUNDTRIP_TESTING.md b/doc/ASM_ROUNDTRIP_TESTING.md @@ -1,472 +0,0 @@ -# Asm/disasm completeness via codegen round-trip testing - -Goal: measure and lock in **completeness** of the per-arch assembler (`as`), -disassembler (`objdump -d` / `cc -S`), and the link relocation path, by -round-tripping the **compiler's own output** rather than only a hand-written -corpus. The corpus (`test/asm/`) only tests instructions we thought to write -down; codegen output tests every instruction codegen actually emits. - -Status: aa64 slice now covers the **full core op set** (2026-05-29). The corpus -exercises every CG operation family — int/fp arith, bitwise, shifts, compares, -unary, conversions (incl. bitcast), loads/stores of every width, control flow, -switch (compare-chain + jump-table), indirect/recursive/stack-arg calls, -aggregates (struct by-val/by-ref, bitfields, unions), globals/static-locals, and -atomics (RMW / compare-exchange via the exclusive-monitor sequence) — and the -round-trip passes all three lanes at `-O0` and `-O1`: **858 lane-checks pass, -0 skip**. L0+L1 are wired into the default `make test` via -`test-asm-roundtrip`; L2 stays opt-in (`test-asm-roundtrip-exec`, native arch). - -### Implemented so far (aa64) - -- **P2 — same-section branch relaxation (DONE).** At `asm_parse` finalize the - assembler resolves branch relocations (JUMP26/CONDBR19/TSTBR14, never CALL26) - whose target is a defined local non-function symbol in the same section — - patching the displacement via `link_reloc_apply` with section-relative S/P and - dropping the reloc, matching codegen/GNU-as. L1 now covers control-flow-bearing - code (was auto-skipped). `src/asm/asm.c:relax_local_branches`. 
-- **`.L` local symbols + data-section symbolization (DONE).** The assembler lexer - accepts `.L`-prefixed locals (incl. embedded dots, `.Lcfree_ro.0`) and the - `name.N` discriminator mangling (`acc.1`) as identifiers; the `-S` symbolizer - emits `.L` operands instead of numeric fallback. `emit_data_range` renders - relocated data as `.quad/.word sym+addend` (the inverse of the assembler's - `.quad`), so switch jump tables (R_ABS64 against the function) and global - pointer tables round-trip. L1 compares relocs across `.text/.rodata/.data`. -- **FP/SIMD scalar load/store + unscaled ld/st family (DONE).** `p_ldst_core` / - `p_ldur_stur` now encode FP transfer registers (Bt..Qt, V=1) and the full - unscaled family (`sturb`/`ldurb`/`sturh`/`ldurh`/`ldursb`/`ldursh`/`ldursw`); - the disassembler decodes the signed unscaled loads (keying Wt/Xt on opc). This - unblocked every FP spill and conversion case. -- **Assembler `.bss` NOBITS (DONE).** The standalone assembler emitted `.bss` as - `SHT_PROGBITS` (a zero-init global store then faulted in the JIT image). The - assembler now creates `.bss` as `SSEM_NOBITS`, and `obj_write`/`obj_pos` track - the `bss_size` cursor for a NOBITS section, so `.zero`/labels reserve space - rather than writing a byte buffer the emitters drop. (Codegen is unaffected — - it never `obj_write`s/`obj_pos`es a BSS section.) Closed the last corpus skip. -- **`.inst` emits the word (DONE).** `as` silently dropped the `.inst` directive - (emitting no bytes), so an instruction the disassembler can't decode yet - (`.inst 0x<word>` in `cc -S`) was *deleted* on re-assembly — a silent - miscompile, and every following branch offset shifted. `.inst` now emits the - 4-byte word(s) like GNU as / llvm-mc. Found by the Toy round-trip lane below - (a multiply-high the disassembler doesn't decode); it now round-trips - correctly even before the decode lands. 
-- **Exclusive / acquire-release atomic decode (DONE).** The assembler already - encoded `ldxr`/`ldaxr`/`stxr`/`stlxr`/`ldar`/`stlr` (+ b/h), but the - disassembler rendered them `.inst`, so the atomic RMW sequence codegen emits - for `_Atomic` was dropped by `cc -S`. Added `AA64_FMT_LDST_EXCL` + - `print_ldst_excl` and the matching decode rows. Found by an adversarial sweep - (atomics were the one core-op family the corpus fan-out missed); now - `roundtrip/atomic_{rmw,cas,ops}`. - -### asm⊗disasm self-symmetry sweep (`test-asm-symmetry`) - -The codegen round-trip only exercises the disassembler on instructions the -compiler emits. A complementary sweep checks the *tools' own* instruction set -for asm⊗disasm symmetry, independent of codegen (`test/asm/symmetry.sh`): - -- **decode-side** (`test/arch/aa64_sweep_gen.c`): synthesize one representative - encoding per row of `aa64_insn_table`, decode → re-assemble → decode, and - require the disassembly text to be a fixed point. Catches a form the - disassembler decodes but the assembler can't re-encode, or where they - disagree. Now clean (closed `fmax`/`fmin`/`fnmul`, missing from `as`). -- **encode-side**: assemble every aa64 `test/asm/encode/*.s` and disassemble; - any `.inst` is a form the assembler encodes but the disassembler can't decode. - -Known asymmetries live in a checked-in snapshot, `test/asm/symmetry.baseline`; -the sweep passes iff the current set equals it, so it **gates against new -asymmetry** (a regression) while the baseline is the disasm-completeness -backlog. The current 69 entries are all encode-only (the assembler accepts -these for completeness but codegen never emits them, so the disassembler never -had to decode them): LSE atomics (`ldadd`/`swp`/…), CAS, single-register -writeback ld/st (`ldr x,[x,#imm]!`), signed register-offset ld/st, logical- -immediate (`mov #bitmask`/`orr #imm`), the `bfm` bitfield + aliases, 128-bit -`q` ld/st, and a couple of `ldp`/`stp` variants. 
Closing any of these shrinks -the baseline (`bash test/asm/symmetry.sh --update`). - -### Earlier vertical-slice notes (aa64) - -- **L0 decode-completeness** — `cc -S` already emits the distinct, re-assemblable - marker `.inst 0x<word>` for an undecodable word (only `aa64_write_unknown` - produces it), so the gate is "no `.inst` inside .text". No emitter change was - needed for aa64. `test-disasm-complete` runs it at -O0 and -O1. -- **Phase 2 reloc symbolization** — `src/api/asm_emit.c` now consults the - section reloc table and rewrites the covered operand into reloc-operator - syntax (CALL26/JUMP26 → `bl/b sym`; ADR_PREL_PG_HI21 → `adrp Rd, sym`; - ADR_GOT_PAGE → `:got:`; ADD_ABS_LO12_NC → `:lo12:`; LDST*_ABS_LO12_NC → - `[Rn, :lo12:sym]`; LD64_GOT_LO12_NC → `:got_lo12:`). Text-surgery keyed by - reloc kind, so register names the disassembler produced are preserved. -- **Phase 2 branch-label synthesis** — intra-section branches (`b`/`b.cc`/`cbz`/ - `cbnz`, no reloc) get a synthesized local label `Lcf_<sec>_<off>` and the - operand is rewritten to reference it. `-S` is therefore re-assemblable for - control flow. (Non-dot label spelling sidesteps the assembler not yet - accepting `.L`-prefixed identifiers as operands.) -- **Harness** — `test/asm/roundtrip.sh` over a C corpus in `test/asm/roundtrip/`; - targets `test-disasm-complete` (L0), `test-asm-roundtrip` (L0+L1), - `test-asm-roundtrip-exec` (L0+L1+L2 via jit-runner). 28 pass / 8 skip on aa64. -- **Bug found + fixed** — the round-trip immediately caught the aa64 assembler - encoding post-index `ldp/stp [Rn], #imm` as the offset form (`p_ldp_stp` - ignored `post_index`); fixed, with regression case - `test/asm/encode/aa64_ldp_stp_index`. - -### Remaining (tracked here) - -- **FP register-offset + 128-bit `q` decode.** The assembler now *encodes* FP - register-offset (`str d0,[x,x,lsl#3]`) and `q` ldr/str, but the disassembler - decodes neither (renders `.inst`). 
Codegen emits neither for scalar C (FP - array indexing computes the address in a GPR first), so the round-trip never - hits them; add the decode rows if a NEON/vector path later emits them. -- **`.inst` is dropped by `as`** — `cfree as` accepts the `.inst` directive but - emits no bytes for it, so an undecoded word would not round-trip at L1 (L0 - still flags it). `as` should emit the word (or error). -- **Section-relative + TLS reloc symbolization** — `build_symref` accepts `.L` - locals but still skips bare section symbols (`.text`) and TLS kinds, which - fall back to numeric. Extend once `as` accepts those operands. -- **Other arches** — the symbolizer switches on aa64 reloc kinds, and the - branch-relaxation predicate lists only the aa64 branch kinds; x64/rv64 keep - the numeric `-S` output and current `as` behavior. Broaden per the - RelocKind→syntax tables below. The self-symmetry sweep and llvm differential - are aa64-only too. - -### Toy-corpus L2 round-trip (`test-asm-roundtrip-toy`) - -Reuses the ~150-case Toy corpus (`test/toy/cases/`, which exercises the full CG -op set and carries an exit-code oracle) for free L2 coverage far beyond the -hand-written `roundtrip/` set (`test/asm/roundtrip_toy.sh`). For each case, -native: `cfree run case.toy` (direct) vs `cfree cc -S | cfree as | cfree run` -(round-trip); the exit codes must match. `cfree run` propagates `main()`'s -return on the native arch (aarch64 macOS), so the oracle is the exit code. - -310 pass / 0 fail / 1 skip. It found — and drove fixes for — a real miscompile -and four classes of gap the hand corpus never reached: -- the dropped-`.inst` miscompile (see above); -- **computed-`goto` `&&label` materialization** — codegen resolves an `adr` to a - local code label in place (no reloc); the disassembly rendered it numerically - (`adr x, 0x1c`), which `as` rejected. 
Now the branch-label synthesizer also - labels `adr` (and `tbz`/`tbnz`) targets, and P2 relaxes `ADR_PREL_LO21`; -- **tentative/common globals** — `cc -S` now emits `.comm`/`.lcomm` for - `SK_COMMON` symbols (they live in no section, so the section walk missed them); -- **Mach-O implicit addend** — the assembler's `.quad sym+N` wrote a zero data - field and relied on the explicit reloc addend, which Mach-O (REL form, addend - implicit in the data) drops — so every switch jump-table entry resolved to - `sym+0` and `br` dispatched into hyperspace. It now writes the addend into the - data like codegen (harmless on ELF, where RELA overwrites it with S+A). - -Named sections now round-trip (closed `118_decl_extra_attrs`): a global in an -explicit `__attribute__((section(...)))` with merge/strings/retain attributes -lands in a `SEC_OTHER` section, which `cc -S` now emits as -`.section <name>, "<flags>", @<type>[, <entsize>]` (GNU-as) and `as` -reconstructs — the round-tripped section table matches a direct `cc -c` object. -The remaining skip is `141_threadlocal_mutate`, a new case blocked on TLS -symbolization (`cc -S` emits an unsymbolized `adrp x,0x0` for a thread-local -access; tracked separately). Also still open: `smulh`/`umulh` (and the -`*L` long multiplies) DP3 *decode* — correctness is restored by the `.inst` -fix, but `-S` still shows `.inst` for them. - -### llvm differential (`test-diff-llvm`) - -A second-oracle cross-check against llvm (`test/asm/diff_llvm.sh`), byte-level -so it sidesteps disassembly-text normalization (movz-vs-mov, `#16`-vs-`#0x10`): - -- **encode lane**: assemble every aa64 `test/asm/encode/*.s` with both `cfree as` - and `llvm-mc`; the `.text` bytes must match. Validates cfree's assembler. -- **disasm lane**: `cfree cc -c` bytes vs `llvm-mc` of `cfree cc -S`. 
Since the - `-S` text *is* cfree's disassembly, llvm re-encoding it to codegen's bytes - confirms the decode — a *wrong* decode that cfree's own re-encode would repeat - is caught here. The one benign disagreement (cfree codegen keeps a CALL26/ - JUMP26 reloc for a same-section call/branch to a defined local symbol, which - llvm-mc resolves in place — link-equivalent) is recognized by the reloc-table - diff and not flagged. - -Currently 269 agree, 34 reloc-equivalent, 0 differ over the corpus at -O0/-O1. -Opt-in; skips cleanly when `llvm-mc` is absent. The host carries the -aarch64/x86_64/riscv64 llvm tools. - -### Host-assembler execution lane (`test-hostas-toy`) - -The round-trip and llvm lanes either use cfree's *own* assembler or compare -*bytes*. This lane instead proves `cc -S` is **standard assembly a third-party -assembler accepts and that means the same thing**, judged by **execution**, not -bytes (cfree and clang produce different code, so a byte/text match would be -meaningless). For each Toy case (native target, `-O0`+`-O1`), -`test/asm/hostas_toy.sh` emits **one** `cc -S` and feeds it to two assemblers, -each linked with `cfree ld` and run, exit asserted against the case oracle — -so the *assembler* is the only variable: - -- **`cfree-as` lane** (baseline): `cfree cc -S | cfree as | cfree ld | ./a.out`. - Mirrors `roundtrip_toy.sh`'s `/ld` lane; **312 pass / 0 fail**. -- **`clang-as` lane** (the real test): `cfree cc -S | clang -c | cfree ld | - ./b.out`. A standard assembler can't paper over a private-dialect quirk the - way cfree's own `as` can. 
- -This immediately surfaced a real bug the round-trip lane structurally cannot -catch: on the native **Mach-O** target `cc -S` originally emitted a broken -hybrid — Mach-O section/symbol conventions (`_main`, `.section __TEXT,__eh_frame`) -mixed with **ELF-only** `.type`/`.size` directives, an `@progbits` type token -inside the Mach-O `.section`, ELF section names (`.section .rodata`), and ELF -relocation operands (`adrp x,sym` + `:lo12:`/`:got:`). `cfree as` accepted it -(so `roundtrip_toy.sh` was green and blind to it); clang/llvm-mc rejected it. - -**Fixed (clang-as now gates by default, 312 pass / 0 fail):** `cc -S` is -object-format-aware, so it emits the clean dialect of the target format. The -format-divergent directive *spelling* (`.type`/`.size`/`.section`/`.p2align`) -lives behind an `AsmSyntax` vtable selected by `c->target.obj` -(`src/api/asm_emit.c`); the relocation *operand* syntax (ELF `:lo12:` prefix vs -Mach-O `sym@PAGEOFF` suffix, even `adrp sym@PAGE`) lives behind a per-arch -`ArchAsmOps.reloc_operand` hook (`src/arch/aa64/asm.c`, reached via -`arch_reloc_operand`). Selecting by format keeps it arch-independent; the -per-arch hook keeps the printer free of `R_AARCH64_*` knowledge. **No hybrid**: -`cfree as` parses the dialect of *its* target too — it dispatches on -`c->target.obj` for the `@PAGE`-family operand suffixes (aa64 `parse_reloc_mod`/ -`parse_reloc_suffix`) and the Mach-O 2-positional `.section seg,sect` -(`src/asm/asm.c`), so the one `cc -S` output assembles identically under both -`cfree as` and clang. ELF output and ELF parsing are byte-identical to before -(the ELF AsmSyntax methods and the `:`-prefix parse path are unchanged), so the -ELF-triple `roundtrip`/`diff-llvm` lanes are unaffected. The same -`141_threadlocal_mutate` TLS-symbolization skip applies. 
- -`CFREE_HOSTAS_ENFORCE_CLANG=0` demotes the clang-as lane back to **XFAIL** -(useful while bringing up a new arch/format whose Mach-O printer side isn't done -yet — x64/rv64 add their own `ArchAsmOps.reloc_operand`, COFF adds an `AsmSyntax` -impl, then this lane extends to them). - -Opt-in (`make test-hostas-toy`); skips cleanly when `clang` is absent. - -### Cross-compile + cross-exec lane (`test-hostas-cross`) - -`test/asm/hostas_cross.sh` is the cross extension of the host-assembler lane: -the same two-assembler-by-execution test, but for ELF **Linux** targets -(`aarch64`/`x86_64`/`riscv64-linux`) emitted with `cc -S -target <triple>`, -assembled by BOTH cfree-as and clang, linked into a **static, non-PIE** ELF with -`cfree ld -static`, and run under **podman/qemu** via the shared -`test/lib/exec_target.sh` helper (one batched container per target). The -executable is made runnable without a libc/loader by linking the freestanding -crt `test/link/harness/start.c` compiled `-Dtest_main=main`: its `_start` runs -ctors then calls the toy's `main` and exits with its return (the oracle) via a -raw `exit_group` syscall. - -Each target **self-skips** (never fails) unless the host has (1) a clang cross -target, (2) a runner (podman/qemu), (3) a working `cc -S | cfree as` round-trip -for that arch, and (4) a passing **bounded** exec smoke (so a wedged emulator -downgrades to SKIP instead of hanging). All three ELF targets are in the gating -default and pass **both** lanes — **936/936** = 156 cases × {O0,O1} × 3 arches, -cfree-as **and** clang-as, judged purely by execution (matching exit code): - -Code locations that an encoding-divergent assembler must be able to recompute — -switch jump-table entries and `&&label` address-takes — are referenced through a -**per-basic-block local symbol** the MCEmitter mints (`mc_label_symbol`, -`src/arch/mc.c`), uniformly on all three arches. 
The jump table emits -`.quad .Lcfblk.*` (`R_ABS64` against the block symbol) and the address-take emits -the arch's standard PC-relative relocation against the same symbol: x86-64 -`leaq .Lcfblk.*(%rip)` (`R_PC32`), aarch64 `adrp`/`add` -(`ADR_PREL_PG_HI21`+`ADD_ABS_LO12_NC`), riscv64 `auipc`/`addi` -(`%pcrel_hi`/`%pcrel_lo`). The existing reloc-operand symbolizer renders all of -them, so the references are genuinely relocatable everywhere and clang's encoding -choices (movabs vs mov-imm32, `jmp` rel32 vs rel8, RVC compression) can't shift a -baked offset onto the wrong instruction. `cc -c` and `cc -S` emit the same -relocations, so the round-trip byte/reloc lanes stay faithful too. - -(aarch64-linux runs arm64 natively in the podman VM, so it's the fastest lane; -x86_64/riscv64 run under qemu-user in their containers.) - -Both lanes are judged by **execution**, never by bytes: cfree and clang emit -different (execution-equivalent) code, so a byte/text match would be meaningless. -The batched container runner caps each case at `EXEC_CASE_TIMEOUT` seconds -(default 20) so a single hanging binary can't wedge the whole single-container -run, leaving every later case unscored. - -Override the matrix with `CFREE_HOSTAS_CROSS_TARGETS="tag:triple ..."`, the -exec-smoke cap with `CFREE_HOSTAS_EXEC_TIMEOUT=<secs>`, and per-arch images with -`RUN_{AARCH64,X64,RV64}_IMAGE`. Opt-in (`make test-hostas-cross`); skips cleanly -without clang/podman. - -## Background — what cfree can do today (verified) - -- **`cc -S` exists** and is *disassembly-to-text plus module scaffolding*: - `driver/cmd/cc.c` (`emit_asm_source`, ~line 1116) → `cfree_obj_builder_emit_asm` - (`src/api/asm_emit.c:256`). It walks each section, emits labels for **symbols** - (`collect_labels`, `asm_emit.c:68`), and disassembles `.text` via - `emit_disasm_range` (`asm_emit.c:215`) using `arch_disasm_decode`. So `-S` - and `objdump -d` share the **same disassembler** — they are one decode - surface. 
-- **Cross-arch execution needs no qemu/podman**: `cfree run` (in-process JIT, - `driver/cmd/run.c`) and `cfree emu` (user-mode ELF emulator, `driver/cmd/emu.c`) both - run guest code on the host. The asm harness already has JIT/exec paths - (`test/asm/run.sh`, the `D`/`J`/`E` lanes; `link-exe-runner`, `jit-runner`). -- **The assembler accepts the reloc-operator syntax** (this is the key enabler): - aa64 `:lo12:`/`:got:`/`:got_lo12:`, rv64 `%hi`/`%lo`/`%pcrel_hi`/`%pcrel_lo`, - x64 `sym(%rip)`/`@PLT`/`@GOTPCREL` — see `src/arch/{aa64,x64,rv64}/asm.c`. - -### Three gotchas the design must handle - -1. **`-S` is a listing, not re-assemblable assembly (the blocker).** Today it - emits **numeric** branch targets (`b 0x100`) and **de-symbolized** - relocated operands (`bl 0x11c` instead of `bl add`; `adrp x16, 0x0` + `ldr - [x16]` instead of `adrp x16, g` + `ldr [x16, :lo12:g]`). Re-assembling that - branches to the wrong place and loads from address 0. **L1/L2 below are - blocked until `-S` symbolizes (Phase 2).** - -2. **The `.byte` fallback masks decode gaps (the trap).** When the disassembler - can't decode a word, `emit_disasm_range` emits `.byte 0x..` (`asm_emit.c` - ~227). Re-assembling a `.byte` reproduces the exact original bytes — so a - *run-only* round-trip **passes even when disasm is incomplete**. A decode or - byte/reloc check must gate before trusting an exec round-trip. - -3. **Padding/data is indistinguishable from a decode failure today.** Inter- - function alignment fill is emitted as the *same* `.byte 0x0` token as a real - decode failure (observed: x64 zero-pads between functions; aa64/rv64 nop-pad - so show none). The completeness metric must separate "byte the disassembler - failed on, inside a function" from "padding/data outside any function". 
- -## The three layers (build cheapest/sharpest first) - -| Layer | Check | Catches | Cost | Needs Phase 2 | -|-------|-------|---------|------|---------------| -| **L0 decode completeness** | over a program corpus, assert **no in-function decode failure** | disasm can't decode an insn codegen emits | cheap, no exec, pinpoints the word | no | -| **L1 byte round-trip** | `cc -c` vs `cc -S \| as`; diff `.text` bytes **and** reloc tables | asm⊗disasm disagreements (round-trip violations) | cheap, exact | yes | -| **L2 exec equivalence** | `cc` direct vs `cc -S \| as \| ld`, run + compare output/exit | semantic bugs; tolerant of benign encoding diffs | exec via `run`/`emu` | yes | - -L0 measures the disassembler against codegen. L1 measures asm⊗disasm -agreement (the exact "round-trip violation" framing of the completeness doc). -L2 is the end-to-end "it actually runs the same" signal. - -## Phase 1 — L0 decode-completeness gate (unblocked, do first) - -The single cheapest high-value win; implementable with no new features. - -- [ ] Make the decode-failure signal **unambiguous**. Pick one: - - (preferred) In `emit_disasm_range`, emit padding/data outside function - symbol ranges as `.zero N` / `.p2align`, and emit a *genuine in-stream - decode failure* as a distinct token — `.inst 0x<word>` (a real aarch64 - directive) or `.byte … # UNDECODED`. Then "grep for the marker" is exact. - - (alt) Bound the L0 scan to `[sym.value, sym.value+sym.size)` function - ranges using the emitted `.size` directives, counting only in-range - `.byte`/`.inst`. -- [ ] Curate an L0 corpus that stresses instruction families codegen emits: - int/long arith + shifts/bitops, `float`/`double` (FP/SIMD — most likely gap), - `switch` (jump tables), structs/by-ref, loops/memory, `cmp`/`cset`/`cmov`, - calls + global/TLS access, varargs, `asm()` inline. Reuse `test/toy` and a - small C set. Run at `-O0` and `-O1` (different encodings). 
-- [ ] For massive free coverage, also run L0 over the **bootstrap objects** - (cfree compiling cfree — see `doc/`/`bootstrap-*` targets); it exercises a - huge slice of the ISA. -- [ ] New target (e.g. `test-disasm-complete`): for each arch in {aa64,x64,rv64}, - `cfree cc -S -target <triple> <src>` and assert zero in-function decode - markers. Wire into the default suite (no exec needed — host-independent). - -Prototype already run (rich.c with `double`/`float`/`switch`/bitops): aa64 and -rv64 disassembled clean; x64 only showed inter-function zero padding (a false -positive that motivates the "unambiguous signal" item above). A genuine gap — -e.g. the signed sub-word load decode fixed this session — would have surfaced -here. - -### Force multiplier: differential decode vs llvm -"No decode failure" does not catch a *wrong* decode (decodes to the wrong -mnemonic). Add an opt-in lane that diffs `cfree objdump -d` against -`llvm-objdump -d` over the same objects, normalized (whitespace, hex case, -`0x` prefixes, address columns). llvm is the oracle for decode *text*. - -## Phase 2 — Symbolize `-S` (the keystone; unblocks L1/L2) - -Make `cc -S` emit **re-assemblable** assembly. Lives in `src/api/asm_emit.c` -(and possibly a shared symbolizing-disasm layer also used by `objdump -d`). -Note `objdump`'s symbol annotations are *comments* (`bl 0x11c <add>`), which -are not re-assemblable — `-S` needs the symbol to *be* the operand (`bl add`). - -Two new inputs into the emit loop, then a two-pass emit: - -1. **Section relocations** from the `ObjBuilder`: a map `offset → (RelocKind, - target sym, addend)`. When an instruction covers a reloc offset, render its - operand symbolically via the reloc-kind → modifier mapping (the inverse of - what the assembler parses). -2. **Branch-target labels**: collect intra-section branch / PC-rel targets, - synthesize `.L<sec>_<off>` labels at those offsets, render branches as - `b .L...`. 
- -Emit as two passes: pass 1 decodes everything and builds the label set = -{symbols} ∪ {synthesized branch-target labels} ∪ {rv64 `%pcrel_hi` anchors}; -pass 2 emits, inserting labels at offsets and symbolic operands per the table. - -### RelocKind → operand-syntax mapping (inverse of the assembler) - -aarch64: -- `R_AARCH64_CALL26` → `bl sym`; `R_AARCH64_JUMP26` → `b sym`; - `R_AARCH64_CONDBR19`/`TSTBR14` → `b.cc sym`/`tbz sym` -- `R_AARCH64_ADR_PREL_PG_HI21` → `adrp Rd, sym`; - `R_AARCH64_ADR_GOT_PAGE` → `adrp Rd, :got:sym` -- `R_AARCH64_ADD_ABS_LO12_NC` → `add Rd, Rn, :lo12:sym` -- `R_AARCH64_LDST{8,16,32,64}_ABS_LO12_NC` → `ldr/str …, [Rn, :lo12:sym]`; - `R_AARCH64_LD64_GOT_LO12_NC` → `:got_lo12:sym` -- TLS LE (`TLSLE_*`) → `:tprel_hi12:` / `:tprel_lo12_nc:` (later) - -riscv64: -- `R_RV_CALL` → `call sym` (collapses the auipc+jalr pair) -- `R_RV_PCREL_HI20` → `auipc Rd, %pcrel_hi(sym)` **and emit a local anchor - label** at this offset; `R_RV_PCREL_LO12_I/S` → `… %pcrel_lo(.Lanchor)` - referencing that anchor (mirrors codegen's `.LpcrelHi`, `native.c`) -- `R_RV_HI20`/`R_RV_LO12_I/S` → `%hi(sym)`/`%lo(sym)`; - `R_RV_GOT_HI20` → `%got_pcrel_hi(sym)` -- `R_RV_BRANCH`/`R_RV_JAL` → `beq …, sym` / `j sym` - -x86-64: -- `R_X64_PLT32` on a `call`/`jmp` → `call sym@PLT` / `jmp sym@PLT` -- `R_PC32` on a rip-relative mem operand → `sym(%rip)` -- `R_X64_REX_GOTPCRELX`/`GOTPCREL` → `sym@GOTPCREL(%rip)` -- absolute data refs → `sym` / `sym+addend` - -Decisions: -- [ ] Where does symbolization live — `asm_emit.c` only, or a shared - symbolizing-disasm layer reused by `objdump -d`? (Recommend: a small shared - "resolve operand at offset → symbolic string" helper fed by a reloc map + - label map, with `-S` and `objdump` choosing operand-substitution vs comment.) -- [ ] How to recover the *instruction operand position* a reloc patches (the - disassembler's `CfreeInsn` may not expose which operand the reloc field maps - to). 
May need the decoder to report the immediate/branch field offset, or the - symbolizer to re-derive it from the reloc offset within the instruction. - -## Phase 3 — L1 + L2 round-trip lanes - -Once `-S` is re-assemblable: - -- [ ] **L1 (byte round-trip)**: per arch, `cfree cc -c <src> → a.o`; - `cfree cc -S <src> | cfree as → b.o`; assert `.text` bytes **and** the - relocation table (kind, offset, target, addend) of `a.o` == `b.o`. Exact, - host-independent, pinpoints the divergent instruction. Gate on Phase 1 - (no decode failures) first. -- [ ] **L2 (exec equivalence)**: `cfree cc <src> → run` (direct) vs - `cfree cc -S <src> | cfree as | cfree ld → run` (round-trip); compare - stdout + exit. Execute via `cfree run` (host arch) or `cfree emu` (cross - arch), reusing the `test/asm/run.sh` J/E plumbing. Robust to benign encoding - differences; the end-to-end "it works" signal. -- [ ] Run L1/L2 across {aa64, x64, rv64} × {-O0, -O1} over the corpus. Make L1 - default-suite (cheap); L2 opt-in (exec). - -### Force multiplier: llvm-mc as a second assembler -On `-S` output, assemble with **both** `cfree as` and `llvm-mc` and compare -bytes. This cross-checks cfree's assembler against an oracle on real -codegen-shaped input — coverage the hand-written corpus can't reach. (Caveat: -llvm-mc may pick different-but-equivalent encodings; normalize or scope to -forms where they agree.) - -## Harness / file map - -- Emitter to extend: `src/api/asm_emit.c` (`emit_disasm_range`, - `collect_labels`, `cfree_obj_builder_emit_asm`). -- Disassembler: `src/arch/disasm.c` + per-arch `src/arch/<arch>/isa.c` - (`arch_disasm_decode`, `CfreeInsn`). -- Reloc kinds: `src/obj/obj.h` (`RelocKind`); per-arch ELF maps under - `src/obj/elf/reloc_*.c`. -- Tools: `cfree as` (`driver/cmd/as.c`), `objdump` (`driver/cmd/objdump.c`), - `run` (`driver/cmd/run.c`), `emu` (`driver/cmd/emu.c`), `ld` (`driver/cmd/ld.c`). 
-- Test harness: `test/asm/run.sh` (H/T/L/D/J/E), `test/toy/run.sh`, - `test/lib/exec_target.sh`; golden regen `test/asm/regen.sh` / - `test/asm/regen-rv64.sh`. Cross toolchain present on dev host: `clang`, - `llvm-mc`, `llvm-objdump` (triples aarch64/x86_64/riscv64-linux-gnu). -- New targets to add: `test-disasm-complete` (L0), `test-asm-roundtrip` (L1), - `test-asm-roundtrip-exec` (L2, opt-in). - -## Why this is worth it - -It converts "we wrote corpus cases for the instructions we remembered" into -"every instruction the compiler emits is provably decodable (L0), re-encodes -identically (L1), and runs identically (L2)" — a *coverage-driven* guarantee -that tracks codegen automatically as new instructions are added, and that -closes the loop with the relocation-operator syntax already in the assembler. diff --git a/doc/BOOTSTRAP_O1.md b/doc/BOOTSTRAP_O1.md @@ -1,163 +0,0 @@ -# 3-Stage Bootstrap: O0/O1 Fixed Point - -The bootstrap builds cfree with itself three times and requires the last two -stages to be byte-identical: - -- **stage1** = the host-built cfree copied into `build/<mode>/bootstrap/stage1`. -- **stage2** = stage1 compiling the full tree. -- **stage3** = stage2 compiling the full tree. -- The fixed-point check is `cmp stage2/cfree stage3/cfree`. - -On aarch64-macos, both the debug/O0 and release/O1 bootstrap paths now reach -the fixed point, and both bootstrapped compilers run the full Toy corpus. 
- -## Current Status - -Commands verified: - -```sh -gmake -s bootstrap-debug -gmake -s bootstrap-release -gmake -s bootstrap-test-toy -CFREE=$(pwd)/build/release/bootstrap/stage3/cfree test/toy/run.sh -``` - -Stage executable hashes: - -```text -62e17394d27b5f69678abf7a65c74fec2954132341b3a2b01bb539d91d77ea83 build/release/bootstrap/stage2/cfree -62e17394d27b5f69678abf7a65c74fec2954132341b3a2b01bb539d91d77ea83 build/release/bootstrap/stage3/cfree -a90c6c56281856e6963745a14b0c0ac1779583dc1c5199fc80e22fb513f7a72d build/debug/bootstrap/stage2/cfree -a90c6c56281856e6963745a14b0c0ac1779583dc1c5199fc80e22fb513f7a72d build/debug/bootstrap/stage3/cfree -``` - -The per-object fixed-point check also has zero differences in both modes: - -```sh -find build/release/bootstrap/stage2 build/release/bootstrap/stage3 -type f -name '*.o' | - sed 's#build/release/bootstrap/stage[23]/##' | sort -u | - while read f; do - cmp -s "build/release/bootstrap/stage2/$f" \ - "build/release/bootstrap/stage3/$f" || echo "$f" - done - -find build/debug/bootstrap/stage2 build/debug/bootstrap/stage3 -type f -name '*.o' | - sed 's#build/debug/bootstrap/stage[23]/##' | sort -u | - while read f; do - cmp -s "build/debug/bootstrap/stage2/$f" \ - "build/debug/bootstrap/stage3/$f" || echo "$f" - done -``` - -Both commands print nothing. The same check piped to `wc -l` prints `0`. - -Toy results: - -```text -debug stage3: 1034 pass, 0 fail, 8 skip -release stage3: 1034 pass, 0 fail, 8 skip -``` - -The Toy runner used `PATHS=RLCW` and `OPT_LEVELS=0 1`, so this covers run, -link/native, C backend, and Wasm paths where supported, at both Toy opt levels. - -## Final O1 Bugs Closed - -### Native emitter clobbered the RHS location - -`src/opt/pass_native_emit.c` materialized the left-hand operand of `IR_BINOP`, -`IR_CMP`, and `IR_CMP_BRANCH` without first protecting a right-hand operand that -was already in a target scratch register. 
- -The concrete failure was in `src/obj/macho/read.c` while evaluating: - -```c -1u << (m->align_log2 & 31) -``` - -The RHS shift count had been loaded into `w9`, then the LHS constant `1` was -also materialized into `w9`, so the shift used `1 << 1`. Stage2's object reader -therefore reported many Mach-O section alignments as `2**1`; `ld -r` propagated -those bad alignments into `libcfree.o`, and the later stage3 link failed with -Mach-O relocation alignment and entry-address errors. - -The fix is to compute the RHS `NativeLoc` first and pass `loc_avoid_reg(b)` when -materializing the LHS for binops and comparisons. This prevents the LHS -materialization from choosing a scratch register that already holds the RHS. - -### MIR combine propagated target scratch registers into calls - -`src/opt/pass_combine.c` treated backend scratch registers like ordinary hard -registers during copy propagation. That is invalid after native lowering starts -using those registers as transient emit-time temporaries. - -The concrete failure was the 9-argument call to `declare_function` in -`parse_external_decl`. The `out_decl_flags` stack argument was first protected -in an allocable register, but combine rewrote the call aux argument back to -scratch `x9`. Before `aa_plan_call` stored the stack argument, another operand -materialization reused `x9`, so the stack slot received the unrelated `dattrs` -value. - -That bad argument sometimes gave `metrics_sink` the `CFREE_CG_INLINE_ALWAYS` -flag, which made stage2 and stage3 emit different objects even after the link -alignment bug was fixed. - -The fix is to reject copy propagation from any register listed in -`f->opt_scratch_regs[cls]`. Scratch registers may appear in lowered MIR, but -they must not be extended across later instructions by combine. - -## Investigation Harness Notes - -The useful oracle was not just "does stage3 link"; it was whether stage2, when -used as a compiler, reproduced the same objects as the host-built compiler. 
-That allowed the search to separate malformed-object bugs from link-driver -symptoms. - -Effective checks: - -```sh -# Compare a specific stage2-compiled object with the host-built compiler's output. -build/release/cfree cc --support-dir . -O1 -DNDEBUG -ffunction-sections \ - -fdata-sections -std=c11 -Wpedantic -Wall -Wextra -Werror \ - -ffreestanding -nostdinc -Irt/include -fvisibility=hidden \ - -Iinclude -Ilang/cpp -Ilang/c -MMD -MP \ - -c lang/c/parse/parse.c -o /tmp/parse-host.o - -build/release/bootstrap/stage2/cfree cc --support-dir . -O1 -DNDEBUG \ - -ffunction-sections -fdata-sections -std=c11 -Wpedantic -Wall -Wextra \ - -Werror -ffreestanding -nostdinc -Irt/include -fvisibility=hidden \ - -Iinclude -Ilang/cpp -Ilang/c -MMD -MP \ - -c lang/c/parse/parse.c -o /tmp/parse-stage2.o - -cmp /tmp/parse-host.o /tmp/parse-stage2.o -``` - -Hybrid rebuilds were also effective: relink stage2 after replacing one suspect -TU, or one piece of a split TU, with a clang-built object, then use stage2 to -compile the known-differing target object. This found that the malformed-link -failure was not in the linker itself, then narrowed the final object divergence -to codegen around C parsing and call lowering. - -For MIR-level inspection, a temporary filtered dump around the target symbol -was enough. The decisive dump was the call to `_declare_function` after lowering -and combine: before the fix, the call aux stack argument referenced scratch -`x9`; after the fix, it references a non-scratch allocable register and survives -the later materialization that also uses `x9`. - -Avoid `-g` while triaging O1 codegen. It changes object layout and can create or -hide different bugs. - -## Earlier O1 Fixes Still Relevant - -These previous fixes are part of the same O1 bootstrap path and should stay in -mind if this regresses: - -- `src/opt/pass_coalesce.c`: overlap checks must use raw range points, not - compressed points. 
-- `src/opt/pass_lower.c`: the hint fallback must not put live-across-call values - into caller-saved hint registers. -- `src/arch/aa64/native.c`: aa64 native emit needs three integer scratch - registers for all-spilled three-operand operations. -- `src/opt/cg_ir_lower.c`: aggregate copy/set operands that are pointer values - must not force-home the pointer local; real frame-backed pointer locals need - prematerialized indirect bases. diff --git a/doc/BUILD.md b/doc/BUILD.md @@ -0,0 +1,266 @@ +# Build & Configuration + +This document describes how cfree is built and configured: the make targets that +produce the library, binary, and runtime; the compile-time component-gating model +that lets a build include only the architectures, formats, frontends, subsystems, +and tools it needs; the small set of choke points where those gates are honored; +how output is made reproducible; and the staged self-build (bootstrap). It is a +map of the build architecture, not a recipe list — see [DESIGN.md](DESIGN.md) for +the system overview and [RUNTIME.md](RUNTIME.md) for the runtime library it links +against. + +## Three products, one tree + +The source tree compiles into three outputs: + +``` +libcfree.a the engine (public + internal C) +cfree the multi-call driver binary +rt/<variant>/libcfree_rt.a compiler-rt/libc support, per target variant +``` + +The first two are direct make targets (`make lib`, `make bin`); the runtime is +*not* a standalone user target. Its variants are produced as a dependency of the +self-host path — the freshly built `cfree` compiles each `libcfree_rt.a` (`RT_CC = +$(BIN) cc`) — so building the runtime always goes through a working driver binary. + +Layering is enforced by include paths, not convention: + +- **libcfree** (`src/` + `lang/`) is freestanding C11. 
It sees both its public + surface (`-Iinclude`) and its internals (`-Isrc`), and is compiled + `-ffreestanding -nostdinc` against the runtime headers (`-Irt/include`) with + `-fvisibility=hidden`. It links to no host libc. +- **the driver** (`driver/`) is the *first consumer* of the public API. It gets + `-Iinclude` and `-Ilang` but deliberately **not** `-Isrc`, so internal headers + are unreachable — if the driver needs something it must be public. The one + hosted seam, `driver/env/`, is the only code compiled against the host SDK/libc + (see "Host detection" below). +- **the runtime** (`rt/`) is built *by cfree itself* (`RT_CC = $(BIN) cc`), once + per target variant, and is what hosted programs link against. + +`make all` builds `lib` + `bin`. The driver binary links the static +`libcfree.a`; `bin` also drops a `support/rt` symlink so the freshly built +compiler can find runtime sources at run time. + +### libcfree.a is a single relocated object + +The library is not a naive `ar` of every `.o`. The object set is first combined +with `ld -r` into one relocatable object (`build/.../libcfree.o`), and *that* is +archived. This guarantees the archive is rebuilt wholesale when sources are +added or removed (plain `ar rcs` only adds/updates members and would silently +retain a deleted file's object), and it gives the symbol-discipline check +(below) a single object to inspect. + +## Build modes + +`RELEASE=0` (default) is the development build: `-O0 -g3`, frame pointers, and +ASan+UBSan with `halt_on_error`. `RELEASE=1` is `-O2`, `-DNDEBUG`, +function/data sections plus dead-strip at link. Mode flags +(`HOST_OPTFLAGS` / `HOST_MODE_*FLAGS`) live in the root Makefile; they are +recorded into `build/.../.build-config` so that flipping a mode flag forces a +rebuild of objects that were produced under the old flags. Build-mode concerns +are kept strictly separate from host-environment concerns (next section). 
+ +## Host detection: mk/env.mk + +`mk/env.mk` is the **only** place in the build that branches on the host OS or +arch. It normalizes `uname` into `HOST_OS` / `HOST_ARCH`, resolves the Darwin +`-isysroot` (empty elsewhere, so splicing `$(HOST_SYSROOT_*FLAGS)` is always +safe), and — crucially — selects the exact set of `driver/env/*.c` files for the +host: one per OS, one per arch for icache flush, and on POSIX one per +(arch, OS) for `ucontext` register marshalling. Choosing source files in the +build, rather than `#ifdef`-ing one mega-file, is the explicit design: every +hosted adapter TU compiles for exactly one platform. The rest of the Makefile +reads `env.mk`'s outputs and never re-derives anything from `uname`. + +## Component gating + +cfree is a large toolchain, but most builds want only a slice of it. Gating lets +a build drop whole axes — an arch, an object format, a language frontend, the +optimizer, a subsystem, a CLI tool — down to a minimal freestanding library that +still links and presents the full public API (gated-out calls return +`CFREE_UNSUPPORTED`). The axes are: + +``` +arch AA64 X64 RV64 WASM C_TARGET +obj-format ELF MACHO COFF WASM +language ASM CPP C TOY WASM +optimizer OPT (O1+; O0 direct codegen is always present) +subsystems AR DISASM DWARF LINK JIT DBG EMU INTERP +tools CC CHECK CPP AS LD AR RANLIB STRIP OBJCOPY OBJDUMP + DBG RUN EMU NM SIZE ADDR2LINE STRINGS CAS PKG +``` + +### One source of truth: config.h + +All flags are `CFREE_<COMPONENT>_ENABLED` macros defined in +`include/cfree/config.h`. That file is **preprocessor-only**: every flag expands +to a literal `0` or `1` usable from both `#if` and `_Static_assert`. The +`_ENABLED` suffix exists so the macros never collide with the public enum +constants of the same root (`CFREE_ARCH_RV64`, `CFREE_OBJ_ELF`, `CFREE_LANG_C` +are enum *values*, not gates). + +The build mirrors these flags rather than duplicating them. 
`mk/config.mk` parses +`config.h` in a single `awk` pass and `eval`s each `CFREE_*_ENABLED` line into an +identically named make variable. The header is the single source of truth; the +Makefile only reads it. This keeps the `#if` that drops a feature from the +compile and the make rule that drops its source files perfectly in sync. + +Some axes carry dependencies. These are documented constraints, not enforced +invariants — `config.h` records them in comments, but no `$(error)` / +`_Static_assert` rejects a contradictory configuration, so a hand-edited combination +that violates them may simply fail to compile or link rather than being diagnosed. +The constraints are: the C frontend needs the preprocessor (`CFREE_LANG_C_ENABLED` requires +`CFREE_LANG_CPP_ENABLED`); the interpreter consumes the optimizer's PReg-path `Func`, so it +needs the optimizer; and the assembler substrate is always present +(`CFREE_LANG_ASM_ENABLED` gates only *automatic registration* of the asm +frontend, because inline and file-scope asm in the C frontend depend on the same +parser/emitter machinery). + +### Gate choke points + +A central design rule: **`#if CFREE_<axis>_*` appears in exactly one file per +axis.** Everything downstream operates on registry outputs (vtables / impl +pointers) and never re-checks a flag. This keeps the gating coherent and makes it +trivial to audit what a given configuration includes. + +``` + the ONLY sites that test these flags + CFREE_LANG_* -> src/api/lang_registry.c + CFREE_ARCH_* -> src/arch/registry.c + (ABI, derived) -> src/abi/registry.c +``` + +- **`src/api/lang_registry.c`** runs at compiler construction and wires each + compiled-in frontend's vtable into the compiler's frontend table, dispatched + later by `CfreeLanguage`. Third parties can still install or override a slot + via the public `cfree_register_frontend()`. 
+- **`src/arch/registry.c`** owns the roster of `ArchImpl` (machine-code backends: + emit, DWARF, debugger hooks, register file) and resolves the `CGBackend` for a + session. An `ArchImpl`'s first field is a `CGBackend`, so a machine arch is a + superset of a code-emitting backend; `c_target` (C source output) and the + check-only backend are `CGBackend`s with no `ArchImpl`. +- **`src/abi/registry.c`** is *derived*, not user-configurable. An ABI entry + exists only when both its machine arch and its object/OS-format are enabled — + e.g. AAPCS64 needs AA64 (or the C target) plus ELF, Apple-arm64 needs Mach-O, + win64 needs COFF. The arch x format product is computed at preprocessor time + from the same `CFREE_ARCH_*` / `CFREE_OBJ_*` flags. + +### Linkability when gated out: config_stubs.c + +The public API surface must always link, even for components that were compiled +out. `src/api/config_stubs.c` provides **weak no-op definitions** for every +public entry point of a gateable subsystem (ar, disasm, dwarf, link, jit, dbg, +emu, and the internal `debug_*` producer hooks the DWARF axis would normally +supply). Each stub body is itself wrapped in `#if !CFREE_<X>_ENABLED`, so it +compiles only when the real implementation is absent, and returns +`CFREE_UNSUPPORTED` / `NULL`. The result: an embedder linking a stripped +`libcfree.a` still resolves every symbol; calls to disabled features fail +cleanly at run time instead of failing to link. + +The Makefile does the symmetric work on the *source* side. For each disabled +axis it `filter-out`s the matching implementation files (e.g. dropping each +arch's `disasm.c`, `link.c`, `dbg.c`, `emu.c`, the `src/opt/` tree, the +per-format `link.c`, etc.) and, where an *internal* substrate symbol would go +missing, adds a parallel `*_stubs.c` (`src/arch/disasm_stubs.c`, +`src/obj/link_stubs.c`, `src/interp/interp_stubs.c`, ...). 
So there are two stub +layers: `config_stubs.c` keeps the *public* API whole; the `*_stubs.c` files keep +the *internal* link whole. + +### Tools and per-tool source sets + +Each `CFREE_TOOL_*_ENABLED` flag gates both the dispatch/help entry in +`driver/main.c` and the `driver/cmd/<tool>.c` object compiled in. The driver +also pulls in shared helper TUs (`driver/lib/cflags.c`, `lib_resolve.c`, +`hosted.c`, `runtime.c`, `inputs.c`) only when at least one tool that needs them +is enabled, and the distribution tools (`cas`, `pkg`) drag in their own +`driver/dist/` vendor set. The tool roster is centralized; see +[DRIVER.md](DRIVER.md). + +## Reproducible builds + +Output is deterministic by construction — no timestamps, no randomness, no host +paths in artifacts: + +- **Image identity** is a content+layout hash, not a clock read. + `src/link/link_image_id.c` folds each segment's vaddr, file size, and + post-relocation bytes through two FNV-1a streams to produce a stable 128-bit + id. The id is wrapped per format: ELF emits it as a `.note.gnu.build-id` + (`src/obj/elf/link.c`), Mach-O as the `LC_UUID` payload (`src/obj/macho/link.c`) + — the same bytes either way. +- **Object headers** zero their time fields: COFF `TimeDateStamp` and the COFF + archive/import time fields are written as 0 (and PE marks itself deterministic), + Mach-O dylib timestamps are zeroed. +- The **self-build** verifies this end to end: stage2 and stage3 must be + byte-identical (below). + +## Symbol discipline + +`make test-lib-deps` is a build-architecture guard, not a behavioral test. It +asserts two invariants over the release `libcfree.a`: + +1. The set of *external* (undefined) symbols the archive imports matches a + checked-in allowlist (`test/lib_deps.allowlist`) — the library must not grow a + hidden dependency on host libc or anything else. +2. 
After relinking the archive into one relocatable object, every remaining + externally visible *definition* uses a public prefix (`Cfree`, `cfree_`, + `CFREE`). Internal symbols must stay internal. + +Together these keep libcfree genuinely freestanding and its public surface +honest. The check is part of the default `make test` set. + +## Tests + +Tests are a large family of `make test-*` targets defined in `test/test.mk`, +grouped roughly as: + +``` +frontend test-pp test-parse test-lex test-asm test-toy +codegen/opt test-cg-api test-opt test-isa test-*-inline test-abi-classify +object/link test-elf test-macho test-coff test-ar test-link test-driver-ar +debug test-debug test-dwarf test-dbg +exec/interp/emu test-smoke-* test-interp* test-emu* test-rt-* test-libc* +roundtrip/diff test-asm-roundtrip* test-asm-symmetry test-diff-llvm test-hostas-* +driver/tools test-driver test-driver-{cc,ar,strip,objcopy,objdump,pkg,strings} +``` + +`make test` runs a curated `DEFAULT_TEST_TARGETS` subset (which includes +`test-lib-deps`). Many harness binaries are built by the Makefile so they inherit +host flags (sanitizers in debug). See [TESTING.md](TESTING.md) for the harness +design and conventions. + +## Bootstrap (staged self-build) + +The goal is to compile cfree with cfree and prove the result is stable. The +chain, per build mode: + +``` +seed cc (host clang) + | build libcfree.a + cfree + v +stage1 = the host-built cfree, copied aside, with cc/ld/ar/ranlib/as + | symlinks (busybox-style multi-call) + | rebuild the whole tree using stage1 as CC/AR/LD + v +stage2 = cfree built by cfree + | rebuild the whole tree again using stage2 as CC/AR/LD + v +stage3 = cfree built by (cfree built by cfree) + +invariant: cmp stage2/cfree stage3/cfree (must be byte-identical) +``` + +Stage2 vs stage3 is the fixed-point: once the compiler reproduces itself, a +further self-application changes nothing. 
Reaching it exercises essentially the +whole compiler on a real, substantial program — cfree's own source — and the +byte-identity check (leaning +on the deterministic output above) catches any non-reproducible codegen. The +bootstrap drives the *normal* Makefile with `CC`/`AR`/`LD` repointed at the +stage's symlinks, so there is no separate "bootstrap build" — it is the same +build rules run with cfree as the toolchain. `make bootstrap` runs both the debug +and release chains; `make bootstrap-test-toy` additionally runs the Toy corpus +through the bootstrapped compiler. + +The host-clang seed is the current root of trust. A full diverse-double-compilation +/ hex0-style seed chain (a tiny seed binary that needs no pre-existing C compiler) +is a separate concern and is deliberately outside the boundary of this build. diff --git a/doc/CBACKEND.md b/doc/CBACKEND.md @@ -1,112 +1,249 @@ # C Source Backend -## Motivation - -cfree's no-deps posture rules out linking against LLVM or GCC's optimizer. The practical path to "industrial-strength" optimization for cfree users is to emit C from CG and hand the result to `gcc` or `clang`, which exist on every build host. The output is `.c` source, not `.o` bytes; the host C compiler handles ABI lowering, instruction selection, register allocation, and advanced optimizations like SROA. cfree's job is to produce *legal* and *complete* C, not necessarily human-readable C. - -GCC/clang-extension C covers everything cfree CG can express. Concretely: -- inline asm — emits verbatim as GCC `__asm__`. -- overflow/trap — `__builtin_{add,sub,mul}_overflow`, `__builtin_trap`. -- atomics — `__atomic_*` builtins with explicit memory orders. -- TLS — `_Thread_local`. -- `setjmp`/`longjmp` — standard C. -- computed goto — GCC `&&label` and `goto *ptr` extension. -- bitfields — emitted as bit-extract/insert on the underlying storage unit; cfree does not emit C bitfield declarations. 
- -## Scope: Target-Locked, Not Portable - -The emitted C is **target-locked**: it must be compiled for the same triple that `cfree --target=` selected. Compile it for a different triple and it may silently misbehave. - -Cause: CG flattens semantic lvalue chains to `(base_reg, byte_offset)` before any backend sees them. `cfree_cg_field(g, field_index)` becomes `OPK_INDIRECT(reg, ofs=12)` at the vtable; the field identity is gone. The offset `12` was computed using the cfree-selected target's `abi_cg_record_layout`. If the downstream C compiler assumes a different layout, the access is wrong. - -This is the same trade LLVM IR makes (datalayout-locked). It does *not* limit usefulness for the stated goal — "industrial-strength optimization via the host toolchain" — because the user already controls the triple at cfree invocation. - -## Where the C Backend Plugs In - -A C backend is *not* a new arch in the sense `arch_impl_x64` is. The eventual machine code still runs on the host triple. What changes is the *form of CG output*: text instead of object bytes. The seam is `CGTarget`. - -1. Ignores `MCEmitter` and writes C source to a `CfreeWriter` instead. -2. Inherits the host's `ABIVtable` only for `sizeof`/`alignof`/`record_layout`. It does **not** consult ABI classification for arg routing — the host C compiler will re-do that on the emitted C. -3. Sets `virtual_regs = 1` so CG hands out fresh, unbounded `Reg` ids; each id becomes a unique C local variable. - -### Selection - -When `emit_c_source` is set in `CodeOptions`, `cfree_cg_new` constructs `c_cgtarget_new` instead of dispatching through standard machine-code generation. `MCEmitter` and `Debug` are skipped entirely in this mode. - -The downstream driver workflow: `cfree cc --emit=c foo.c -o foo.cfree.c`, then the user runs `cc -O2 foo.cfree.c`. - -## Architecture Sketch +cfree's no-deps posture rules out linking against LLVM or GCC for an +industrial-strength optimizer. 
The C-source backend gives cfree users that +optimizer anyway: it emits C source (`cc -S=c`-style, selected via +`--emit=c`) and hands the result to whatever `gcc`/`clang` exists on the build +host. The host C compiler then performs ABI lowering, instruction selection, +register allocation, and aggressive optimization (SROA, vectorization, etc.). +cfree's job here is to produce *legal* and *complete* C, not human-readable C. +See [CODEGEN.md](CODEGEN.md) for the CG model this backend consumes and +[IR.md](IR.md) for the semantic IR it walks. + +## A CGBackend, not an ArchImpl + +There is no `ArchImpl` for the C target. An `ArchImpl` describes a machine — +registers, encodings, an `MCEmitter` that writes object bytes. The C backend +produces no machine code and no object bytes; the eventual machine code runs on +the host triple after the host `cc` compiles the emitted source. What it +*is* is a `CGBackend` (`cg_backend_c_target` in `src/arch/c_target/target.c`): +the small "give me a CgTarget for this Compiler + ObjBuilder + emit options" +unit the registry hands out. The registry selects it in +`cg_backend_for_session` (`src/arch/registry.c`) whenever +`CodeOptions.emit_c_source` is set; output is written to +`CodeOptions.c_source_writer` instead of to an object file. + +## Two-stage pipeline: record CG into IR, then emit C from IR + +The backend does not translate CG calls to C text directly. 
It splits into two +stages with the semantic IR (see [IR.md](IR.md)) as the seam: ```text -+---------------------+ -| frontend (lang/c/) | source AST → CG calls -+----------+----------+ - | - v -+---------------------+ -| CfreeCg (src/cg/*) | value stack, lvalues, virtual Regs -+----------+----------+ - | CGTarget vtable - v -+---------------------+ -| C-source CGTarget | -| (src/arch/c_target) | -+----------+----------+ - | - v - CfreeWriter→text - ↓.c file + frontend (lang/c, lang/toy) + | cfree_cg_* calls + v + CgIrRecorder (src/cg/ir_recorder.c) <- a CgTarget that records + | semantic CG into CgIrModule + | [opt passes run here if opt_level>0] + v + c_emit_ir_module (src/arch/c_target/ir_emit.c) + | switch over CgIrOp -> c_emit_* calls + v + c_emit_* (src/arch/c_target/c_emit.c) + | string buffers (cbuf) + v + CfreeWriter -> .c text ``` -### Substrate: `virtual_regs` - -`CGTarget.virtual_regs = 1` is used. Each minted `Reg` id maps to one declared C local: `uintN_t v17;` or `double v23;`. Registers are reused across different types by the CG value stack, so all writes cast back to the target type, typically bridging through `uintptr_t` to avoid `-Wint-conversion` warnings. - -### Aggregate-by-Address - -When CG packs a call arg whose type is an aggregate, it materializes an address operand (`OPK_LOCAL` / `OPK_GLOBAL` / `OPK_INDIRECT`) referring to a memory image of the struct. The C target emits a direct pointer dereference (e.g. `(*(T*)((char*)vN + K))`). - -For returns of aggregates, `api_alloc_call_ret_storage` allocates a fresh frame slot. The C target emits direct assignment `slot_R = f(args);` or lifts the call into an indirect write `(*(T*)addr) = f(args);`. No `sret` shim is added — the host C compiler handles the ABI details. - -## Sequencing with Opt - -`opt_cgtarget` runs SSA/DCE/combine/loop passes. For the C backend this is undesirable: the *whole point* is to defer optimization to the host C compiler (gcc/clang). 
- -Decision: when `emit_c_source` is set, `opt_level = 0` is forced regardless of what the caller asked for. The C target sits directly under CG with `virtual_regs = 1`. - -## Type Emission +`c_target_backend_make` constructs the C emitter (`CTarget`, via +`c_emit_target_new`) and then wraps it in a `CgIrRecorder` whose `finalize` +callback (`c_ir_finalize`) replays the recorded `CgIrModule` through +`c_emit_ir_module` and then flushes the C source. The recorder is what the +session and frontend actually drive; the `CTarget` is private behind the +recorder's `user` pointer. + +This design has two consequences worth stating. First, the C target never sees +a live CG call stream — it walks a finished `CgIrModule`, so its emission code +is a straightforward op-dispatch (`ir_emit_inst`) with no value-stack +bookkeeping of its own. Second, because the recorder is itself a normal +`CgTarget`, the IR optimizer can sit between record and emit exactly as it does +for the machine backends: at `opt_level > 0` the session wraps the recorder in +`opt_cgtarget_new`, so the C emitted is the *optimized* IR. Deferring the heavy +optimization to the host `cc` is still the intent, but the cfree-side IR passes +are not bypassed. + +`ir_emit.c` carries one piece of glue beyond pure dispatch: CG scope handles +are recorder-relative, so `CIrEmitter` keeps a `scope_map` that translates each +recorded `CGScope` to the handle the emitter minted at `scope_begin`. + +## Target-locked, not portable + +The emitted C is **target-locked**: it must be compiled for the same triple +that `cfree --target=` selected. Compiled for a different triple it may silently +misbehave. The cause is fundamental to CG: semantic lvalue chains are flattened +to `(base, byte_offset)` before any backend sees them. `cfree_cg_field(g, n)` +arrives as an indirect access with `ofs=12`; the field identity is gone, and +that `12` came from the cfree-selected target's record layout. 
If a downstream +compiler assumes a different layout, the access is wrong. This is the same +trade LLVM IR makes (datalayout-locked), and it does not limit the stated goal, +since the user already fixed the triple at cfree invocation. + +## Semantic temporaries become C locals + +CG mints fresh, unbounded local ids (`CGLocal`); each one becomes a single +declared C automatic variable named `vN` (`c_local_name`). Declaration is lazy: +the first time a local is referenced, `c_ensure_local` appends one typed +declaration to the per-function `decls` buffer. Locals are zero-initialized +(`= 0`, or `= {0}` for aggregates) and marked `__attribute__((unused))` to +silence host-`cc` diagnostics on control flow the host can't reason through; +the host's dead-store elimination removes the init when a real assignment dominates. + +Each local has exactly one declared C type, recorded in `local_type` and +checked for consistency on re-use. Where CG arithmetic crosses +pointer/integer or differing-width boundaries, the emitter bridges through +`uintptr_t` casts so host-`cc` warnings (`-Wint-conversion` and friends) stay +quiet while the bit semantics are exact. Signedness-sensitive operations +(unsigned divide/remainder, logical shift right, unsigned compares) get an +explicit width-sized signed/unsigned cast on their operands. + +## Types: scalars map, aggregates are opaque bytes + +`c_typename` lowers a CG type id to a C type: + +- scalars -> fixed-width `<stdint.h>` types (`int8_t`..`int64_t`, `__int128`), + `float`/`double`/`long double`, or `int32_t` for `bool`; +- pointers -> `void*` (all pointers collapse; access type comes from the cast + at each load/store); +- enums -> their underlying integer base; +- vararg state -> `va_list` (flags a `<stdarg.h>` include); +- records, arrays, function types -> an emitted typedef. 
+ +The key invariant: **composite types are opaque storage.** A record or array of +size `N` and alignment `A` becomes + +```c +typedef struct { _Alignas(A) uint8_t raw[N]; } __ty_<id>; +``` -The C target maintains a type-emission worklist. C source needs each composite type declared before first use. -1. As CG calls into emission methods, the target records every `CfreeCgTypeId` it sees. -2. At `finalize`, walk the recorded types, topologically order by dependency, and emit: - - scalars → use `<stdint.h>` / `float` / `double`. - - pointers → `void*`. - - records / arrays → `typedef struct { _Alignas(A) uint8_t raw[N]; } __ty_N;` - - function types → `typedef R (*__ty_N)(...);` +regardless of its fields. Field and element access is never expressed through C +field syntax; CG already speaks in `(base, byte_offset)`, so every access is an +indirect dereference `(*(T*)((char*)addr + ofs))`. Emitting types as raw bytes +sidesteps all C aggregate-semantics ambiguity (bitfield layout rules, array +decay, packed/aligned attribute interactions) and keeps types orthogonal to +access patterns. Modern hosts see through the offset arithmetic for SROA +anyway. Function types instead become a function-pointer typedef +`R (*__ty_<id>)(...)` for indirect calls. Multi-result returns synthesize a +guarded `__cfree_tuple<N>_...` struct. -**Composite types are opaque.** Records and arrays are emitted as `typedef struct { _Alignas(A) uint8_t raw[N]; } __ty_N;` — the same shape regardless of field layout. CG already speaks in `(base, byte_offset)` for field/element access, so the indirect path `(*(T*)((char*)addr + ofs))` does all the work and the host C compiler never needs to see the original field declarations. Modern compilers (clang) see perfectly through these offsets for SROA and other optimizations. 
+Typedefs are emitted into a TU-wide `typedefs` buffer, keyed by unaliased type +id with a per-id state machine (unseen / inflight / emitted) so each type is +declared once, dependencies first, and recursive types degrade to forward-only +rather than looping. -## Symbol and Data Emission +## TU structure and the deferred prologue -The C target maps cfree CG definitions to standard C: -- Data Definition → Emitted as an initialized packed struct (`struct __attribute__((packed)) __cfree_data_<name> { ... }`). -- Data Relocations (cross-symbol references) → Synthesized directly into the data definition struct by interleaving raw byte array chunks with typed pointer fields (e.g. `void* ptr_0`). The initializer assigns standard C address-of (`&<sym>`) to these pointer fields, allowing the host C compiler's linker to handle cross-symbol references natively without any runtime constructor overhead. -- TLS objects → `_Thread_local` is used on the synthesized struct for TLS data. +The emitter accumulates several string buffers and flushes them in a fixed +order at `c_emit_finalize`: -## Source Locations +```text + prologue #include <stdint.h>, <stdalign.h> (+ stdarg/setjmp if used) + typedefs __ty_* opaque-storage and function-pointer typedefs + forwards one `RetT name(params);` per function seen + data_defs data symbol definitions and extern declarations + function bodies signatures + spliced-in decls + body statements +``` -`cfree_cg_set_loc` emits `#line N "path"` directives. With `-g` set on the downstream host gcc/clang invocation, the resulting object code carries source-mapped debug info back to the original cfree input. cfree `Debug` DWARF generation is unused in this mode. +Header choice beyond the two unconditional includes is deferred to finalize so +the include lines stay deterministic regardless of when a feature was first +referenced. 
The data walk (`c_emit_data`) populates two buffers: the +`data_defs` buffer it owns, and — as a side effect, since data initializers can +take the address of functions — the function forward-declaration buffer. So the +walk runs first, then `forwards` is flushed, then `data_defs`. Forwards precede +data definitions because a data initializer may reference a function by name. + +Per function, declarations and body text are buffered separately: CG needs all +locals declared at the top of the function, but surfaces them interleaved with +body emission. `func_end` records the byte offset just past the opening brace +(`fn_body_start`) and splices the accumulated `decls` in there. A +`last_was_terminator` flag drops dead statements after an unconditional +`return`/`goto` so the output is not littered with unreachable C. + +## Control flow + +CG's structured scopes map to C control flow where possible. `SCOPE_LOOP` +becomes `for (;;) { ... }`; within such a structured scope, jumps to the +scope's break/continue labels are emitted as C `break;`/`continue;` rather than +`goto`. Everything else lowers to labels and `goto`, which the host `cc` +re-structures. Switches, computed/indirect branches (GCC `&&label` / +`goto *p`), and address-of-label all have direct emitters. + +## Tail calls + +CG owns the tail-call policy (see [CODEGEN.md](CODEGEN.md)): before flagging a +call as a sibling call it asks the target whether the call is *realizable*, and +only sets `CG_CALL_TAIL` when the target agrees. The C backend answers through +`c_emit_tail_call_unrealizable_reason_for`, wired into the recorder config as +`tail_call_unrealizable_reason`. A realizable tail call is emitted as +`__attribute__((musttail)) return <call>;`, which clang lowers to a guaranteed +sibling call; the host compiler does the actual stack-reuse. 
+ +The reason hook declines the cases clang's `musttail` cannot honor, returning a +human-readable string instead of `NULL`: a variadic caller, a variadic callee, +or a caller/callee parameter-count mismatch. For those CG leaves the call +unflagged and the backend emits an ordinary call. This keeps the C output +within the subset clang's `musttail` accepts rather than asserting a sibling +call the host would reject. + +## Mapping cfree semantics onto GCC/clang C + +GCC/clang-extension C covers everything CG can express, so each feature maps to +a builtin or extension rather than a runtime shim: + +- inline asm -> verbatim `__asm__ __volatile__ (...)`, constraints and + clobbers passed through (with one fix-up: cfree's synthesized matching input + for a `+`-tied output is dropped, since gcc rejects the redundant operand); +- overflow/trap/builtins -> `__builtin_{add,sub,mul}_overflow`, + `__builtin_trap`, `__builtin_unreachable`, `__builtin_{popcount,ctz,clz, + bswap}*`, `__builtin_prefetch`, `__builtin_expect`, `__builtin_memcpy`/ + `memmove`/`memset`, `__builtin_alloca[_with_align]`; +- atomics -> `__atomic_*` generic builtins with explicit memory orders; +- varargs -> `__builtin_va_*` over `va_list`; +- float-constant loads -> a `static const uint8_t[]` of the ABI byte pattern + copied into the destination via `__builtin_memcpy`, so the host sees the + exact bits; +- bitfields -> bit-extract/insert arithmetic on the underlying storage unit; + cfree never emits a C bitfield declaration. + +## Data symbols and cross-symbol relocations + +Data emission walks the `ObjBuilder`'s symbols at finalize. A defined data +symbol is emitted as a typed file-scope object carrying its initializer bytes; +undefined data becomes an `extern uint8_t name[];` declaration. Linkage, +visibility, weakness, `const` (for rodata), `static` (local binding) and +`_Thread_local` (TLS) are reproduced via attributes and qualifiers so the host +linker reconstructs the same symbol table. 
+ +Cross-symbol references (relocations into a symbol's bytes) are the interesting +case. Rather than a runtime constructor, the symbol's storage struct is split +so each relocated slot is a real typed field: raw `uint8_t chunk_K[]` runs +interleaved with pointer-width fields (`void*` for 8-byte, `uint32_t` for +`R_ABS32`). The initializer assigns standard C address-of expressions +(`(void*)((char*)&target + addend)`) to those fields, so the host C compiler +and linker resolve the references natively. The relocation slots are sorted by +offset, and when any are present the struct is `__attribute__((packed))` so the +field layout matches the original byte image exactly. + +TLS delegates entirely to `_Thread_local`; the host compiler builds its own +descriptor. On Mach-O, where TLS is split into a descriptor symbol plus a +synthesized init symbol (see [OBJ.md](OBJ.md)), the emitter pulls the initial +bytes from the init symbol via the descriptor's `R_ABS64` and emits a single +`_Thread_local`, skipping the object-level descriptor machinery. + +Function-local `static` data uses CG's narrow source-backend hook: those +symbols are emitted inside the owning function and skipped by the TU-wide data +walk. + +## Source locations and debug info + +`set_loc` emits `#line N "path"` directives (deduplicated against the last one +emitted) into the function body. When the user passes `-g` to the downstream +host `gcc`/`clang`, the resulting object carries debug info mapped back to the +original cfree input. cfree's own DWARF producer (see [DWARF.md](DWARF.md)) is +unused in this mode — there is no `Debug` and no `MCEmitter` on this path. ## Testing -The C backend is tested via the `CFREE_TEST_PATHS=C` path in both `test/toy/run.sh` and `test/parse/run.sh`. -For each case, the test harness: -1. Emits C source via `cfree cc --emit=c <src> -o <name>.cfree.c`. -2. Compiles the emitted source with the host C compiler (`cc -Werror -O0`). -3. 
Runs the compiled executable and asserts the output matches the expected behavior. - -## Known Limitations - -- **PCREL and SYMDIFF relocations**: These are link-time concepts with no faithful in-language C-source equivalent. Frontends attempting to emit these will trigger a compiler panic/skip. -- **128-bit Types**: `__int128` and `_Float128` usage may be limited or skipped depending on the host C compiler capabilities (e.g. host long double might not be 128-bit). -- **Floating-point reproducibility**: cfree's FP-flag enum (REASSOC, APPROX) applies per-operation. C does not have a per-operation fast-math syntax, so these flags are currently ignored, resulting in correctly strict but potentially pessimistic FP evaluation. +Exercised end-to-end: emit C with `cfree cc --emit=c`, compile the result with +the host `cc -Werror`, run it, and assert behavior matches the machine-code +path. The `test/toy` and `test/parse` corpora drive this via a dedicated emit +mode. See [TESTING.md](TESTING.md). diff --git a/doc/CGTARGET.md b/doc/CGTARGET.md @@ -1,764 +0,0 @@ -# CGTarget and NativeTarget - -This document describes the intended split between cfree's semantic codegen -target interface and the native backend emission interface. It complements -`doc/IR.md`: that document defines the clean recorded-CG IR, while this one -defines how `CfreeCg`, direct `-O0` emission, and optimized native emission -fit around that IR. - -## Goals - -- Keep a fast direct `-O0` path that does not record or optimize IR. -- Give `CfreeCg` a clean semantic target interface that maps directly to the - planned lowered-CG IR. -- Keep hard registers, spill slots, call plans, prologue sizing, liveness, and - register allocation out of the semantic interface. -- Let native architectures share implementation helpers between direct `-O0` - emission and optimized post-regalloc emission without making them the same - public vtable. 
- -## Current Problem - -The current internal `CGTarget` serves two different levels at once. - -First, `CfreeCg` drives it as a semantic sink. The value-stack layer lowers -public CG operations into target calls such as loads, stores, arithmetic, -labels, branches, calls, returns, atomics, varargs, and inline assembly. - -Second, the optimizer also records through a `CGTarget`, optimizes the recorded -function, performs backend preparation and register allocation, then replays -the lowered result into a native `CGTarget`. That final replay uses hard -registers, frame slots, spill/reload hooks, call plans, and backend register -metadata. - -Those are different contracts. The first is a target-data-layout-specific -semantic interface. The second is a machine-emission interface after -machinization and register allocation. Combining them forces the shared target -API to expose both IR concepts and backend-private lowering state. - -## Proposed Layering - -The intended layering is: - -```text -frontend -> CfreeCg/value stack -> semantic CGTarget - |-> direct O0 NativeDirectTarget - | -> NativeOps -> NativeTarget - |-> C target / WASM / check target - |-> IR recorder -> clean IR -> optimizer - -> NativeTarget -``` - -Native architectures expose a `NativeTarget` for physical emission. Direct -`-O0` native codegen should not require every arch to implement a separate -semantic `CGTarget` vtable. Instead, a shared `NativeDirectTarget` implements -the semantic `CGTarget` interface once and is parameterized by: - -- the arch's `NativeTarget`, which emits physical/native operations; -- a small arch-specific `NativeOps` adapter for direct-mode ABI/frame/legality - questions that the shared direct target cannot answer generically. - -Optimized codegen does not use `NativeOps`; after MIR lowering and register -allocation, the optimizer drives `NativeTarget` directly. 
`NativeOps` exists -only to let `NativeDirectTarget` reuse `NativeTarget` without duplicating the -semantic `CGTarget` surface per native arch. - -## Semantic CGTarget - -The semantic `CGTarget` is the interface driven by `CfreeCg` and implemented by -both direct targets and the IR recorder. It speaks in terms that can be -recorded directly as clean lowered-CG IR: - -- typed semantic locals; -- immediates, globals, locals, and indirect addresses; -- labels and structured scopes; -- target-data-layout-specific memory accesses; -- calls and returns as typed semantic locals, including multi-result returns; -- aggregate, bitfield, atomic, vararg, intrinsic, and inline-asm operations; -- sticky source locations. - -It should not expose optimizer or native emission state such as CFG blocks, -SSA, hard registers, physical register files, liveness, frame slots, spill -slots, call plans, scratch-register policy, or prologue/epilogue patching. - -The semantic value namespace is one mutable local namespace. Parameters, source -locals, compiler temporaries, aggregate homes, call results, and alloca results -are all `CGLocal` ids allocated by the target: - -```c -typedef u32 CGLocal; -#define CG_LOCAL_NONE 0u -``` - -Semantic operands should not contain hard registers: - -```text -OPK_IMM signed immediate bit pattern -OPK_LOCAL typed semantic local -OPK_GLOBAL object symbol plus addend address -OPK_INDIRECT base local plus optional index local, scale, and offset -``` - -The semantic API still includes operations such as: - -- `local` and `param`; -- `load_imm`, `load_const`, `copy`, `load`, `store`, `addr_of`, - `tls_addr_of`, aggregate copies/sets, and bitfield operations; -- `binop`, `unop`, `cmp`, and `convert`; -- labels, branches, switches, label-address materialization, indirect - branches, and structured scopes; -- `call` and `ret` using local-only semantic descriptors; -- `alloca_`, `va_*`, atomics, fences, intrinsics, inline asm, and source - location tracking. 
- -The semantic API should not include: - -- `FrameSlot`; -- `CGKnownFrameDesc`; -- `CGCallPlan`; -- spill/reload hooks; -- hard-register discovery or reservation hooks; -- call-plan emission hooks; -- inline-asm register-name resolution as a target-wide semantic operation. - -Those belong to native lowering or native emission. - -## Direct O0 Native Target - -The direct `-O0` path remains: - -```text -frontend -> CfreeCg/value stack -> semantic CGTarget - -> NativeDirectTarget - -> NativeOps -> NativeTarget -``` - -No IR recording is required. `NativeDirectTarget` is the shared semantic -`CGTarget` implementation for native architectures. It receives semantic -locals and operations, maps them to direct-mode physical storage, and emits -machine code immediately through the injected `NativeTarget`. - -`NativeDirectTarget` owns the direct-mode policy and state that should no -longer live in `CfreeCg`: - -- semantic local allocation and local metadata; -- assigning semantic locals frame homes; -- direct-mode scratch register allocation; -- optional local register caching; -- dirty-local flushing and cache invalidation; -- call/volatile/atomic/inline-asm memory barriers; -- caller-saved invalidation using native register metadata; -- materializing semantic operands into physical values; -- storing physical results back into semantic locals; -- max-outgoing-call-area tracking for frame finalization. - -`NativeOps` should stay small. It is not a second copy of `CGTarget`. 
It -answers arch-specific direct-mode questions and forwards special cases into -`NativeTarget`: - -- static register metadata (`NativeRegInfo`); -- function/frame begin/end glue for direct mode; -- frame slot allocation and slot-address formation; -- incoming parameter binding into a semantic local's home; -- return and tail-call ABI decisions; -- call planning/routing for direct calls; -- operand/addressing legality when the generic direct target has a choice; -- inline-asm, vararg, and other arch-sensitive helpers that cannot be - described as ordinary physical MIR emission. - -The simplest correct direct target can give every local a frame home and use -scratch registers per instruction: - -```text -load_imm dst, 42 - -> move 42 to scratch - -> store scratch to dst's frame home - -binop dst, a, b - -> load a into scratch0 - -> load b into scratch1 - -> emit the operation - -> store the result to dst's frame home -``` - -That baseline is intentionally conservative. It needs no liveness, CFG, SSA, -or global register allocation. - -A faster direct target can add a small per-function local register cache. Each -semantic local has target-private state: - -```c -typedef struct NativeLocal { - CfreeCgTypeId type; - u32 size; - u32 align; - u32 flags; - - FrameSlot home; - Reg reg; - u8 cls; - u8 dirty; - u8 address_taken; - u8 memory_required; -} NativeLocal; -``` - -`NativeDirectTarget` then uses local greedy helpers: - -```c -Reg materialize(NativeDirectTarget *, Operand op, NativeAllocClass cls); -Reg ensure_writable_reg(NativeDirectTarget *, CGLocal dst, NativeAllocClass cls); -void flush_local(NativeDirectTarget *, CGLocal local); -void spill_one(NativeDirectTarget *, NativeAllocClass cls); -``` - -The cache policy can be simple: - -- Keep `reg_owner[reg] = CGLocal` per register class. -- Prefer a free allocable register. -- Otherwise spill a non-pinned register to its frame home. -- Pin source and scratch registers for the duration of one instruction. 
-- Mark destination locals dirty when their cached register has newer contents - than memory. - -The direct target must flush conservatively when memory may observe cached -locals: - -- before calls unless the call lowering can prove a local need not be saved; -- before volatile or atomic memory operations; -- before inline asm with a memory clobber; -- before operations that may observe address-taken locals. - -This remains a local register cache, not a real allocator. The semantic API -does not promise where a local lives, so a target can choose the frame-only -baseline first and add caching later. - -## Optimized O1+ Native Target - -The optimized path records clean semantic IR first: - -```text -CfreeCg/value stack -> recording CGTarget -> clean IR -``` - -Optimization then derives private views from that clean IR: - -```text -clean IR - -> CFG/SSA and semantic optimization - -> native MIR or backend-prep form - -> liveness and register allocation - -> final machine locations -``` - -Once the optimizer has assigned hard registers and spill slots, it should not -replay through the semantic `CGTarget`. At that point the representation is no -longer semantic lowered-CG IR. It is a native backend form, so final emission -should drive `NativeTarget` directly. - -The optimizer-private representation may still use hard registers, spill slots, -frame slots, call plans, block arrays, phis, dominance, liveness, and other -backend-prep metadata. Those are derived views, not part of the semantic -target contract. `NativeOps` is not part of this path; any operation the -optimizer needs should be represented in MIR or exposed by `NativeTarget` -itself. - -## Unified IR Container - -The semantic IR and machine IR can share one representation substrate without -sharing one semantic contract. 
- -The useful unification is at the container and infrastructure level: - -- one function container; -- one linear instruction stream and/or derived block representation; -- one label namespace; -- one source-location model; -- one operand storage shape; -- one aux-payload allocation strategy; -- shared dump, walk, rewrite, and verification infrastructure. - -The phases remain distinct: - -```c -typedef enum IRPhase { - IR_PHASE_SEMANTIC, - IR_PHASE_MIR, - IR_PHASE_ALLOCATED_MIR, -} IRPhase; -``` - -The same `Func`/`Inst` storage can carry different phase-specific op and -operand subsets. Not every operand kind is legal in every phase: - -```text -semantic IR: - OPK_IMM, OPK_LOCAL, OPK_GLOBAL, OPK_INDIRECT - -MIR before register allocation: - OPK_IMM, OPK_VREG, OPK_GLOBAL, OPK_FRAME_SLOT, OPK_MACH_ADDR - -MIR after register allocation: - OPK_IMM, OPK_HARD_REG, OPK_FRAME_SLOT, OPK_STACK_SLOT, OPK_MACH_ADDR -``` - -Similarly, some operations are semantic-only, some are MIR-only, and some are -shared control-flow operations: - -```text -semantic-only examples: - IR_CALL, IR_RET, IR_SCOPE_BEGIN, IR_VA_ARG, IR_AGG_COPY - -MIR-only examples: - IR_MACH_CALL, IR_SPILL, IR_RELOAD, selected two-address ops - -shared examples: - IR_NOP, IR_BR, IR_CMP_BRANCH, IR_SWITCH, IR_LOAD_LABEL_ADDR -``` - -The lowering should not preserve an opcode name merely because the shape looks -similar. A semantic `IR_LOAD` carries source-level memory and type facts. A -machine load carries selected addressing modes, register constraints, and -instruction-emission constraints. If those semantics diverge, the MIR phase -should use a distinct op even though both live in the same `Inst` container. 
- -This model lets optimization lower uniformly: - -```text -semantic Func - -> semantic cleanup - -> MIR Func using the same storage conventions - -> allocated MIR Func - -> NativeTarget emission -``` - -The guardrail is phase-specific verification: - -```c -void ir_verify_semantic(Func *); -void ir_verify_mir(Func *); -void ir_verify_allocated_mir(Func *); -``` - -The verifier enforces legal opcodes, operand kinds, aux payloads, block/label -rules, and phase invariants. This keeps the representation compact and shared -without letting machine-only concepts leak back into the semantic `CGTarget` -contract. - -## NativeTarget Surface - -`NativeTarget` is the physical native emission interface. Optimized code uses -it post-machinize and post-regalloc, where it speaks final machine locations -and selected/native operations. Direct `-O0` uses the same interface only after -`NativeDirectTarget` has chosen scratch registers and materialized semantic -operands. - -```text -NATIVE_LOC_REG hard physical register -NATIVE_LOC_STACK frame, spill, or outgoing stack slot -NATIVE_LOC_IMM immediate -NATIVE_LOC_GLOBAL symbol plus addend -NATIVE_LOC_ADDR final addressing mode -``` - -The central rule is that `NativeTarget` is not a register allocator. Every -instruction hook receives caller-selected, target-legal physical operands: -register destinations are `NATIVE_LOC_REG`, arithmetic sources are registers or -target-legal immediates, and memory addresses have already had base/index -values materialized into legal registers when the architecture requires that. -The native target may validate this contract and panic on invalid input, but it -must not pick replacement registers or spill behind the caller's back. 
- -The surface is a low-level vtable rather than a semantic target: - -```c -typedef struct NativeTarget NativeTarget; - -struct NativeTarget { - Compiler *c; - ObjBuilder *obj; - MCEmitter *mc; - const NativeRegInfo *regs; - - NativeAllocClass (*class_for_type)(NativeTarget *, CfreeCgTypeId); - int (*imm_legal)(NativeTarget *, NativeImmUse, u32 op, - CfreeCgTypeId type, i64 value); - int (*addr_legal)(NativeTarget *, const NativeAddr *, MemAccess); - - void (*func_begin)(NativeTarget *, const CGFuncDesc *); - void (*func_begin_known_frame)(NativeTarget *, const CGFuncDesc *, - const NativeKnownFrameDesc *, - NativeFrameSlot *out_slots); - void (*note_frame_state)(NativeTarget *, const NativeFramePatchState *); - void (*func_end)(NativeTarget *); - - NativeFrameSlot (*frame_slot)(NativeTarget *, - const NativeFrameSlotDesc *); - - void (*move)(NativeTarget *, NativeLoc dst_reg, NativeLoc src_reg); - void (*load_imm)(NativeTarget *, NativeLoc dst_reg, i64 imm); - void (*load_addr)(NativeTarget *, NativeLoc dst_reg, NativeAddr addr); - void (*load)(NativeTarget *, NativeLoc dst_reg, NativeAddr addr, - MemAccess); - void (*store)(NativeTarget *, NativeAddr addr, NativeLoc src_reg, - MemAccess); - void (*binop)(NativeTarget *, BinOp, NativeLoc dst_reg, - NativeLoc a_reg, NativeLoc b_reg_or_imm); - void (*cmp)(NativeTarget *, CmpOp, NativeLoc dst_reg, - NativeLoc a_reg, NativeLoc b_reg_or_imm); - void (*convert)(NativeTarget *, ConvKind, NativeLoc dst_reg, - NativeLoc src_reg); - - void (*spill)(NativeTarget *, NativeLoc src_reg, NativeFrameSlot, - MemAccess); - void (*reload)(NativeTarget *, NativeLoc dst_reg, NativeFrameSlot, - MemAccess); - - void (*plan_call)(NativeTarget *, const NativeCallDesc *, - NativeCallPlan *); - void (*emit_call)(NativeTarget *, const NativeCallPlan *); - void (*plan_ret)(NativeTarget *, const CGFuncDesc *, - const NativeLoc *values, u32 nvalues, - NativeCallPlanRet **out_rets, u32 *out_nrets); - void (*ret)(NativeTarget *); - - void 
(*patch_add)(NativeTarget *, const NativePatch *); - void (*patch_apply)(NativeTarget *); -}; -``` - -The real header has the full operation set, including labels, aggregate and -bitfield operations, atomics, intrinsics, inline assembly, traps, source -locations, finalize, and destroy. The important shape is the same for every -hook: it emits one selected native operation using already-legal operands. - -The shared direct path may also use the same `NativeTarget`, but it does so -through `NativeDirectTarget` and the small `NativeOps` adapter. This keeps the -semantic `CGTarget` surface maximally reused while avoiding a large -arch-specific direct `CGTarget` implementation per native backend. - -This interface owns the machine-level concerns removed from semantic -`CGTarget`: - -- concrete frame, spill, save, alloca, and outgoing slots; -- known-frame layout and max outgoing call area; -- callee-save reservation and prologue/epilogue patching; -- hard-register operands and final addressing modes; -- spill/reload insertion or emission; -- selected two-address and arch-specific instruction forms; -- direct, indirect, and tail call emission after ABI routing; -- CFI and unwind emission; -- inline-asm constraint binding and clobber handling; -- backend register discovery and legality queries. 
- -Static register-file metadata belongs in `NativeRegInfo`: - -```c -typedef struct NativeAllocClassInfo { - NativeAllocClass cls; - - const Reg *allocable; - u32 nallocable; - - const Reg *scratch; - u32 nscratch; - - const CGPhysRegInfo *phys; - u32 nphys; - - u32 caller_saved_mask; - u32 callee_saved_mask; - u32 arg_mask; - u32 ret_mask; - u32 reserved_mask; -} NativeAllocClassInfo; - -typedef struct NativeRegInfo { - const NativeAllocClassInfo *classes; - u32 nclasses; - - int (*resolve_name)(const NativeRegInfo *, Sym name, Reg *out, - NativeAllocClass *cls_out); - const char *(*debug_name)(const NativeRegInfo *, NativeAllocClass cls, - Reg reg); - u32 (*dwarf_reg)(const NativeRegInfo *, NativeAllocClass cls, Reg reg); -} NativeRegInfo; -``` - -`resolve_name` belongs here when it is pure register-file metadata. If inline -assembly dialects later affect name resolution, the callback can take a small -dialect context. - -Call-specific answers should not be static register metadata when they depend -on ABI, calling convention, variadic state, vector ABI, or attributes. Those -belong to native call planning and must be expressed in native locations: - -```text -NativeCallDesc: - callee: native location or address - args/results: semantic values already represented as NativeLoc homes - -NativeCallPlan: - argument moves into hard argument registers or outgoing stack slots - return moves from hard return registers or result memory to result homes - clobber/return masks per allocation class - stack_arg_size for late frame patching -``` - -For direct `-O0`, `NativeOps` may expose call planning as an adapter because -`NativeDirectTarget` starts from semantic `CGCallDesc` values and frame homes. -The adapter must still return legal native destinations, not semantic storage -decisions. For optimized code, call planning belongs on `NativeTarget` or in -MIR lowering; the optimizer does not call `NativeOps`. 
- -Frame size and outgoing-call size are allowed to be unknown when prologue code -is first emitted. The native target exposes explicit patch points: - -```text -func_begin or func_begin_known_frame - -> emit provisional prologue and record NativePatch records -body emission - -> NativeDirectTarget or allocated MIR updates NativeFramePatchState -func_end - -> note_frame_state, patch_apply, then final epilogue/finalization -``` - -This gives the single-pass direct path a clean escape hatch without moving -frame layout or register allocation back into `NativeTarget`. - -## CfreeCg Value Stack - -The value stack remains useful, but its role should be narrowed. It should be a -public API adapter and semantic lowering layer, not a physical allocator. - -It still provides: - -- push/pop API state and validation; -- expression-stack lowering; -- lvalue/rvalue conversion; -- aggregate, bitfield, call, switch, computed-goto, vararg, alloca, and inline - asm lowering; -- construction of local-only `CGCallDesc` records; -- delayed semantic patterns such as delayed compares for branches; -- a single diagnostic point for misuse of the public CG API; -- a convenient frontend interface for simple non-C producers. - -It should stop owning: - -- hard-register allocation; -- frame-slot allocation; -- spill/reload policy; -- caller-saved preservation; -- backend scratch-register selection. - -Stack entries should describe semantic values: - -```c -typedef enum SValueKind { - SV_IMM, - SV_CONST, - SV_LOCAL, - SV_LVALUE, - SV_DELAYED_CMP, -} SValueKind; - -typedef struct SValue { - CfreeCgTypeId type; - u8 kind; - CGLocal local; - Operand addr; -} SValue; -``` - -## Local and Lvalue Model - -`SV_LOCAL` is a computed rvalue stored in a semantic local. `SV_LVALUE` is an -addressable storage location that may be loaded from or stored to. 
- -For a read: - -```text -x - -> push SV_LVALUE(local x) - -lvalue conversion - -> tmp = target->local(i32 temporary) - -> target->load(tmp, local x, mem) - -> push SV_LOCAL(tmp) -``` - -For an assignment: - -```text -x = y + 1 - -> keep x as SV_LVALUE - -> compute y + 1 as SV_LOCAL - -> target->store(address of x, value local, mem) -``` - -For aggregates, the distinction is more important. Aggregate values often stay -in addressable homes and move through `copy_bytes` rather than becoming scalar -register values. - -## Migration Notes - -The migration should treat the semantic interface as the stable boundary and -move machine concepts downward: - -1. Define semantic `CGLocal` and semantic `Operand` without `OPK_REG`. -2. Move frame slots, call plans, hard-register metadata, and spill/reload hooks - out of semantic `CGTarget`. - The new internal contracts start in `src/arch/native_target.h` and - `src/cg/native_direct_target.h`. -3. Convert `CfreeCg` stack entries from physical register/frame ownership to - semantic locals, lvalues, immediates, constants, and delayed compares. -4. Implement shared `NativeDirectTarget` as the native semantic `CGTarget`, - initially with the frame-only baseline. -5. Introduce per-arch `NativeOps` only for direct-mode ABI/frame/legality glue, - forwarding physical emission through each arch's `NativeTarget`. -6. Add a local register cache to `NativeDirectTarget` only after correctness is - stable. -7. Unify semantic IR, MIR, and allocated MIR around one `Func`/`Inst` - container where practical, guarded by phase-specific verification. -8. Keep machine-only concepts phase-local while introducing a `NativeTarget` - emission boundary for post-regalloc output. - -The result is a direct `-O0` path that stays fast and simple, plus an optimized -path whose final emission interface matches the data it actually has after -register allocation. 
- -## Impact Surface - -The interface split touches every layer that currently sees `CGTarget`, -`Operand`, `OPK_REG`, `FrameSlot`, `CGLocalStorage`, or `CGCallPlan`. - -The main surfaces are: - -- `src/arch/arch.h`: current shared definitions for semantic operations, - physical operands, frame slots, call plans, register metadata, `CGTarget`, - `CGBackend`, and `ArchImpl`. -- `src/arch/cgtarget.c`: arch-agnostic constructor/finalize helpers and helper - lowering such as indexed-address folding that currently emits through - `OPK_REG`. -- `src/arch/registry.c`: feature-gated backend registry. It currently returns - a `CGBackend` whose only construction hook is `make -> CGTarget`. -- `src/arch/check_target.c`: check-only backend. It implements the full current - target vtable, including frame slots and register hooks. -- `src/cg/*`: the public CG value-stack implementation. This is the largest - semantic migration because it currently owns value registers, spill slots, - frame lvalues, local storage, caller-saved preservation, and delayed - materialization into `OPK_REG`. -- `src/opt/*`: current recorder, IR container, optimization passes, MIR - lowering, register allocation, and final `opt_emit` replay into a native - `CGTarget`. -- `src/arch/{aa64,x64,rv64}/*`: native physical emitters. The split moves - direct-mode policy into shared `NativeDirectTarget`; per-arch code should - provide `NativeTarget`, `NativeOps`, ABI helpers, frame/prologue/epilogue - code, inline-asm support, and instruction encoders. -- `src/arch/{aa64,x64,rv64}/opt_coord.c`: current hard-register and call-plan - coordination hooks for the optimizer. These should move behind - `NativeRegInfo`, native call planning, and native MIR emission. Direct-mode - consumers should reach equivalent answers through `NativeOps`, not through - the semantic `CGTarget` surface. -- `src/arch/c_target/*`: source backend. It should implement semantic - `CGTarget`, not `NativeTarget`. 
-- `src/arch/wasm/*`: wasm target and structurizer. It is closer to a semantic - structured target than native machine emitters, but it still currently sees - the combined `CGTarget`/`Operand` model. -- `lang/c/parse/cg_adapter.c` and related parser integration: public - `CfreeCg` users. If the public `CfreeCg` API remains stable, these should - not need to change for the target split. -- tests under `test/opt`, `test/arch`, `test/api`, `test/parse`, and smoke - harnesses: they include direct `CGTarget` construction, mock targets, opt IR - dumps, inline asm backend tests, and public CG tests. - -The public `include/cfree/cg.h` API does not need to expose the split. It can -keep the push/pop CG interface while the internal value stack changes from -physical allocation to semantic local lowering. - -## Existing Build Gating - -The repo already has coarse build gates in `include/cfree/config.h`, mirrored -by `mk/config.mk` so `Makefile` drops matching source directories: - -- `CFREE_ARCH_AA64_ENABLED` -- `CFREE_ARCH_X64_ENABLED` -- `CFREE_ARCH_RV64_ENABLED` -- `CFREE_ARCH_WASM_ENABLED` -- `CFREE_ARCH_C_TARGET_ENABLED` -- `CFREE_OBJ_*_ENABLED` -- `CFREE_LANG_*_ENABLED` -- `CFREE_OPT_ENABLED` - -The existing `CFREE_OPT_ENABLED=0` path is the first useful safety valve. It -drops `src/opt/*`, filters arch `opt_coord.c`, and makes `CfreeCg` reject -`opt_level > 0`. During the semantic `CGTarget` cutover, this allows work to -start with direct `-O0` only. - -No new target-migration gates are needed. The migration should rely on the -existing component gates to remove unported code from the build while one -backend is brought forward. - -The public `CfreeCg` API is the boundary that keeps frontends insulated. If -`CfreeCg` remains source-compatible, C, toy, wasm-language, and preprocessor -frontends do not need to be disabled just because `CGTarget` changes. They only -need disabling if their own source files directly include or depend on changed -internal target details. 
- -`src/cg/*` is core codegen infrastructure and is not meaningfully optional for -this migration. It must compile in every codegen-capable slice. The practical -way to shrink the work is to disable unported consumers and implementers of the -internal target interface: - -- disable `CFREE_OPT_ENABLED` to drop the optimizer recorder, MIR passes, - regalloc, final replay, and per-arch `opt_coord.c`; -- disable all but one native `CFREE_ARCH_*_ENABLED` backend while porting the - new direct `-O0` target implementation; -- optionally disable `CFREE_ARCH_C_TARGET_ENABLED` until the semantic source - backend is ported; -- optionally disable `CFREE_ARCH_WASM_ENABLED` until the wasm backend is ported; -- keep object-format gates narrow to the selected backend's required format - when possible. - -The smallest buildable slice should be: - -```text -CFREE_OPT_ENABLED=0 -CFREE_ARCH_C_TARGET_ENABLED=0 or 1 -one native arch enabled, preferably the host arch -only object formats required by that arch enabled -``` - -That slice only requires: - -- semantic `CGTarget`; -- semantic `CfreeCg` value stack; -- check-only target; -- shared `NativeDirectTarget`; -- one arch's `NativeTarget` and direct-mode `NativeOps`; -- object writer and ABI support for that arch/object format. - -After that, re-enable components in this order: - -1. `check_target`: validates the semantic target shape without native emission. -2. `NativeDirectTarget` plus one arch's `NativeTarget`/`NativeOps`, with - frame-only local homes. -3. The C-source target as a semantic-only backend. -4. Local register caching in `NativeDirectTarget`. -5. `CFREE_OPT_ENABLED`: clean IR recorder using the semantic target. -6. Semantic optimizer passes that do not require native MIR/regalloc. -7. MIR lowering and `NativeTarget` for one arch. -8. Register allocation and allocated-MIR emission. -9. Remaining native arches through their existing `CFREE_ARCH_*_ENABLED` - gates. -10. 
Wasm and structurized/source-like targets through existing arch gates. - -Tests should follow the same gating: - -- direct API and parser codegen tests first with `opt_level=0`; -- arch inline-asm tests only after the selected direct native target is ported; -- `test-opt` only after the semantic recorder and verifier compile; -- optimized codegen tests only after `NativeTarget` emission exists for the - selected arch; -- smoke/link/debug tests last, because they exercise the whole backend, - object, linker, debug, and runtime pipeline. diff --git a/doc/CODEGEN.md b/doc/CODEGEN.md @@ -0,0 +1,263 @@ +# Codegen Spine + +This is the codegen spine that sits between cfree's frontends and machine code. +Frontends never speak machine: they drive the public `CfreeCg` stack-machine API +(`cfree_cg_*`), which lowers to a single internal semantic sink — `CgTarget`. +`CgTarget` has several realizations selected by opt-level and output kind: a +direct machine-code emitter for `-O0`, an IR recorder that feeds the optimizer +and interpreter, and source-like targets (the C backend, wasm). All the native +`-O0` backends share one `CgTarget` implementation parameterized by per-arch +hooks. This doc covers that layering, the public-API-to-`CgTarget` lowering, and +the shared native infrastructure. See [IR.md](IR.md) for the recorded IR, +[OPT.md](OPT.md) for optimization and machine lowering, [ARCH.md](ARCH.md) for +per-arch emission, [CBACKEND.md](CBACKEND.md) and [WASM.md](WASM.md) for the +source-like targets, and [INTERPRETER.md](INTERPRETER.md) for the interpreter. 
+ +## The two boundaries + +Codegen has exactly two stable interfaces, stacked: + +``` + frontend (C / toy / wasm-lang / preprocessor) + | public push/pop API + v + CfreeCg value stack (src/cg/*.c, op families) + | semantic CgTarget vtable (src/cg/cgtarget.h) + v + CgTarget realization + |-- NativeDirectTarget -O0 direct emit -> NativeOps + NativeTarget + |-- CgIrRecorder -O1/-O2/interp -> recorded IR -> optimizer -> NativeTarget + |-- C-source target / wasm (semantic, source-like output) +``` + +`CgTarget` (the *semantic* interface) speaks in target-data-layout terms a +frontend can produce: typed semantic locals, immediates/globals/indirect +addresses, labels and structured scopes, typed loads/stores, arithmetic, +calls/returns as local-valued descriptors, aggregates, bitfields, atomics, +varargs, intrinsics, and inline asm. It deliberately exposes **no** machine +state: no hard registers, no spill slots, no call plans, no CFG/SSA, no +prologue patching. That keeps it recordable verbatim as semantic IR and lets one +frontend path serve every backend. + +`NativeTarget` (`src/arch/native_target.h`, the *physical* interface) is the +other boundary — the post-register-allocation machine-emission contract. Every +hook receives caller-selected, target-legal physical operands and emits one +native operation; it is explicitly **not** a register allocator. Both the `-O0` +direct path and the optimizer's machine-emit path bottom out here. + +The split exists because the two contracts are genuinely different levels. The +semantic interface is "what the program means"; the physical interface is "emit +this selected instruction with these registers." Folding them, as an earlier +design did, forced the shared vtable to expose both IR concepts and backend +register state at once. Keeping them apart lets each native arch implement only +physical emission, while the semantic-to-physical bridge for `-O0` is written +once and shared. 
+ +## The public CfreeCg API and value stack + +`include/cfree/cg.h` is a 132-call stack machine. A frontend pushes typed +values (locals, lvalues, immediates, constants), names types, and issues +operations that pop operands and push results — `cfree_cg_func_begin`, +`cfree_cg_load`/`cfree_cg_store`, `cfree_cg_fp_binop`, `cfree_cg_call`, +`cfree_cg_branch_true`, `cfree_cg_block_begin`, and so on. This API is the +insulation layer: it is source-stable across all the internal changes below it, +so the C parser's `cg_adapter`, the toy frontend, and the wasm-language frontend +do not change when a backend does. See [FRONTENDS.md](FRONTENDS.md). + +The implementation lives in `src/cg/`, split by op family rather than one +monolith: `value.c` (stack discipline, lvalue/rvalue conversion, delayed +compares/arith), `memory.c` (loads/stores/addressing/aggregates), `arith.c`, +`control.c` (labels, branches, scopes, switch, computed goto), `call.c`, +`atomic.c`, `asm.c`, `type.c`, `local.c`, `data.c`, `wide.c` (128-bit scalars), +with shared state and helpers in `internal.h` and lifecycle in `session.c`. +(Files are named per family; there is no literal `api_*.c` prefix.) + +The value stack's job is purely semantic lowering. Each entry (`ApiSValue`) is +one of an operand (immediate / constant / semantic local / lvalue address), a +*delayed compare*, or a *delayed arith* — forms held un-emitted so a following +branch can fuse a compare instead of materializing a 0/1, or so a small +immediate can flow straight into a `binop`. The stack does **not** own +registers, frame slots, spill policy, or caller-saved preservation; those moved +down into the target realizations. When an operation needs a value emitted, the +stack calls the corresponding `g->target->op(...)` semantic hook with +local-only operands. + +Switch is a good example of the semantic/structured division. `CgTarget` carries +an optional `switch_` hook and a `supports_label_table` query. 
Native arches +leave `switch_` NULL and the shared `cg_lower_switch_default` lowers the +structured descriptor into `cmp_branch`/`jump`/`indirect_branch` plus a rodata label +table. Source-like targets override `switch_` to emit a native construct (the C +target emits a real `switch`, a wasm target emits `br_table`). Same semantic input, different +realization. + +## CgTarget realizations + +`session.c`'s `cfree_cg_begin_obj` picks the realization. It asks the arch +registry (`cg_backend_for_session`, `src/arch/registry.c`) for a `CGBackend` +whose `make` builds the base `CgTarget` for this target arch and output kind, +then conditionally wraps it: + +- **`-O0`, native arch:** the backend's `make` returns a `NativeDirectTarget` + (see below). No IR is recorded; semantic ops emit machine code immediately. +- **`-O1`/`-O2` or interpreter:** `session.c` wraps the base target with + `opt_cgtarget_new` (`src/opt/opt.c`), which returns a `CgIrRecorder` + (`src/cg/ir_recorder.c`). Recording does not emit; at `finalize` the optimizer + replays optimized IR. The recorder still holds the unwrapped native target so + the optimizer can drive `NativeTarget` directly after lowering. +- **C-source / wasm:** the registry returns a source-like `CgTarget` that + implements the semantic vtable and writes C text or a wasm module. These are + semantic backends, not `NativeTarget` implementers. + +A native arch's `make` always builds the `NativeDirectTarget` as the leaf; the optimizer +wrapper, when present, sits *above* it and reaches the leaf's `NativeTarget` +through `native_direct_target_native`. So a native arch implements its semantic +surface exactly once — there is no separate per-arch semantic `CgTarget`. + +### The IR recorder + +`CgIrRecorder` is a thin `CgTarget` that turns each semantic call into exactly +one IR instruction in a per-function `CgIrFunc`, preserving operands, sticky +source locations, tail-call policy, and global references. 
It is purely a sink: +`finalize` triggers the optimizer's cross-function passes and per-function +lowering. From the recorded clean IR, the optimizer derives its own +CFG/SSA/MIR/allocated-MIR views and finally calls `opt_emit_native` to drive the +arch `NativeTarget`. The same recorded IR feeds the interpreter via a sibling +lowering path (`opt_run_o1_interp`). See [IR.md](IR.md), [OPT.md](OPT.md), and +[INTERPRETER.md](INTERPRETER.md). + +## Shared native `-O0`: NativeDirectTarget + +`src/cg/native_direct_target.c` is the single direct `-O0` semantic `CgTarget` +for *all* native arches (aa64/rv64/x64). It accepts semantic locals and ops and +emits machine code in one pass, with no IR recording, liveness, CFG, or SSA. +That single-pass property is the whole point of the `-O0` path: it is the fast, +low-overhead route for unoptimized builds, JIT, and bootstrapping. + +It is parameterized two ways: + +- a per-arch `NativeTarget` for physical emission (injected at construction); +- a small per-arch `NativeOps` adapter (`native_direct_target.h`) for the few + direct-mode questions the generic target can't answer: parameter binding, + operand/address legality, call planning + emission, return + tail-call + realizability, varargs, inline asm, and the conservative memory barrier. + +`NativeOps` is intentionally tiny — it is *not* a second copy of `CgTarget`. +Pure pass-throughs (register info, `func_begin`/`func_end`, frame-slot +allocation, `class_for_type`, `addr_legal`) are called on the `NativeTarget` +directly; `NativeOps` exists only for the arch-sensitive direct-mode glue. 
+ +What `NativeDirectTarget` owns (and what the value stack therefore does not): +semantic-local allocation and metadata, assigning locals frame homes, direct +scratch-register selection, a local register cache, dirty-flush/invalidation, +materializing operands into physical values, storing results back, conservative +flushes around calls/barriers, and outgoing-call-area tracking for frame +finalization. + +### The semantic-vs-physical split, concretely + +`NativeOps` is the *semantic-side* adapter — it answers questions phrased in +semantic terms (`CGParamDesc`, `Operand`, `CGCallDesc`) and is used only on the +`-O0` direct path. `NativeTarget` is the *physical-side* emitter — it speaks +`NativeLoc` (registers / frame slots / immediates / addresses) and is used by +both `-O0` (after `NativeDirectTarget` has chosen scratch regs and materialized +operands) and the optimizer (after regalloc). The optimizer never touches +`NativeOps`: anything it needs is in MIR or on `NativeTarget`. This is why the +two structs exist side by side — they sit on opposite sides of the +register-selection line. + +### The local register cache + +The correct baseline gives every scalar local a frame home and round-trips it +through memory per use — no liveness needed. On top of that baseline, +`NativeDirectTarget` runs a write-back, **basic-block-scoped** local register +cache that removes most of those round trips while staying single-pass. The +load-bearing invariants are: + +- **What is cached:** only scalar locals that fit a register and are neither + address-taken nor memory-required. Aggregates and escaped locals stay + frame-only. +- **Where:** only caller-saved allocable registers. This sidesteps prologue + bookkeeping (the direct path reports no clobbered callee-saves) and means the + blanket flush before any call already covers ABI clobbering. +- **Block scope:** with no CFG/liveness, a cached value cannot cross a control + edge or join. 
The cache is spilled and emptied at every branch, label, and + return; `func_begin` starts empty. +- **Escape-based aliasing:** because address-taken locals are never cached, a + pointer access can only alias an *escaped* local, which is never in a register + — so loads/stores need no value-cache flush for aliasing. Addressing is made + cache-aware: when a base/index local is live in a register, the address points + at that register instead of reloading the frame home. Direct frame-home + accesses to a cached local (e.g. by-value field extraction) flush just that one + local. +- **Calls and barriers** still flush the whole cache (caller-saved regs die, + and a memory clobber may observe everything). Address-taking a cached local + flushes just that local and marks it uncacheable thereafter. + +A monotonic use-tick drives approximate-LRU eviction; cached locals are tracked +in an intrusive insertion-order list so flush-all is O(cached), not O(locals). +The cache is never worse than the frame-only baseline — each cached local is +stored at most once per boundary instead of once per definition. It is a local +cache, not an allocator: the semantic API promises nothing about where a local +lives, so the target is free to choose. + +Single-pass `-O0` cannot pre-plan the full frame, so the native prologue is +emitted into a reserved region at `func_begin` and the frame-size/outgoing-area +immediates are patched at `func_end` once final sizes are known. The optimizer +path, which knows the whole frame up front, instead uses +`func_begin_known_frame` + `emit_prologue` for an exact-size prologue. Both +mechanisms are on the same `NativeTarget`. + +## Shared native infrastructure + +Two more pieces are shared by every native backend (both `-O0` and the +optimizer's machine emit), so the per-arch code carries only ISA/ABI specifics. 
+ +### NativeFrame — arch-neutral frame bookkeeping + +`src/cg/native_frame.{c,h}` holds the parts of stack-frame layout that are +identical across aa64/rv64/x64: the table of frame slots (locals, spills, +sret/variadic homes, and aa64 callee-save homes) accumulated below the frame +anchor, the cumulative-offset arithmetic, the running max-outgoing-arg size, the +*frame-final* gate that forbids growing the frame after the prologue is emitted, +and deriving the used-callee-save set from the optimizer's per-class register +masks. It also answers the one frame-relevant ABI query — +`native_frame_va_save_bytes` derives the vararg register-save-area size from the +target ABI's `va_list` layout, so the per-arch magic numbers all flow from one +ABI-driven source. + +What stays per-arch is everything ISA/ABI-specific: the transform from a slot's +cumulative offset to an anchor-relative displacement (fp/s0/rbp-relative, plus +aa64's top- vs bottom-record choice), prologue/epilogue encoding, callee-save +placement, slim-prologue variants, deferred-patch application, and variadic +register-save stores. + +### native_argmove — parallel-copy register shuffle + +`src/cg/native_argmove.{c,h}` is the shared scheduler for realizing a set of +register `dst <- src` moves as a *parallel* copy. Marshalling call arguments — +and, on the optimizer path, binding incoming parameters — is a parallel copy: +every source register must be read before any move overwrites it. The allocator +usually hands over a conflict-free order, but not always (variadics, and +tail-call / entry permutations it is free to rotate). When a true cycle remains +(e.g. `rdi<-rdx, rsi<-rdi, rdx<-rsi`), the scheduler breaks it by stashing one +member into a scratch register and redirecting that value's readers. 
+ +The scheduling — topological emission, cycle detection, the scratch break — is +identical across the three arches; only the leaf operations differ (how one move +is emitted, which register is scratch), supplied through a small ops struct. All +three native backends plus the entry param-bind path share this one scheduler. + +## Why this shape + +- One semantic interface, recorded verbatim, means the optimizer and the + `-O0` emitter consume *the same* program description — the frontend writes it + once. +- One shared `NativeDirectTarget` means a new native arch implements physical + emission (`NativeTarget`) plus a tiny `NativeOps` adapter, and gets a correct + single-pass `-O0` backend for free, sharing frame and arg-move logic. +- The strict semantic/physical line keeps register allocation, spilling, and + ABI placement out of the frontend-facing surface entirely, so they can change + (or be skipped, at `-O0`) without touching anything above `NativeTarget`. +- Direct `-O0` emission with a local register cache buys fast unoptimized + builds and JIT without a recording/optimization round trip, while remaining + obviously correct via its block-local and escape-based invariants. diff --git a/doc/DBG.md b/doc/DBG.md @@ -1,497 +1,323 @@ -# cfree dbg design - -Architecture of `cfree dbg`, the interactive JIT debugger. Companion to -`DESIGN.md`. Scope: how the REPL drives the JIT'd code under controlled -execution, and where the OS-specific machinery is isolated. Not a tutorial; -not implementation notes. - -## 1. Goals - -- `dbg` multi-call subcommand: compile C sources with `-g`, JIT-link, and - drop into a REPL that controls one worker running the JIT'd entry. -- Source-level operations: breakpoints by `file:line`, `sym[+off]`, or - `0xADDR`; step-insn / step-line / next-line / finish; backtrace using - DWARF CFI; print and write named variables; raw memory examine. 
-- All OS interaction — threads, signals, ucontext, page protection flips, - icache flushing — funnels through a single env vtable - (`CfreeDbgOs`, §5) populated by `driver/env.c`. `src/dbg/` (the - library-side session) is C11 freestanding like the rest of `src/`. -- v1 target: aarch64 on macOS and Linux. Other arches/hosts follow once the - contracts in §5 and §8 are stable. - -## 2. Non-goals (v1) - -- Multi-threaded guests. Single worker thread; concurrent guest threads are - future work and will require widening `CfreeDbgOs` (thread enumeration, - per-tid stop). -- Out-of-process / remote debugging. The worker runs in the same address - space as the REPL — read/write memory is a guarded `memcpy`, not - `ptrace`/`mach_vm_*`. -- Hardware breakpoints, watchpoints, conditional-breakpoint JIT codegen. - All breakpoints are software (BRK patch); the `condition` callback in - `CfreeBreakpointSpec` is host-side C. -- Stepping through optimized code with reconstructed values. `-g` is - forced on, but `-O0` is the contract for usable source-level stepping. -- Forked-child / `exec`-into-guest. The JIT entry is called directly. -- x86_64 / rv64 single-step. Wiring exists at the vtable layer; per-arch - displaced-step lifters land after aa64 is solid. - -## 3. 
Layout +# Debugger (cfree dbg) -``` -include/ - cfree.h CfreeJitSession + CfreeDbgOs + CfreeStopInfo (public) - -src/ - dbg/ library-side session (NEW) - session.c worker thread, park/unpark, stop dispatch - bp.c breakpoint patch table (addr -> saved bytes, refcount) - step.c resume-mode state machine (insn / line / next / out) - displaced.c arch-neutral plumbing for out-of-line execution - arch/aa64/dbg.c aa64 BRK encoding + PC-relative fixups for displaced - arch_x64.c (later) - arch_rv64.c (later) - mem.c read/write_mem with sigsetjmp bad-address guard - dbg.h internal contracts - -driver/ - dbg.c REPL (already exists; see §4) - env.c CfreeDbgOs POSIX impl (NEW section in this file) -``` +`cfree dbg` is an interactive, in-process source-level debugger that drives a +JIT-linked image under controlled execution. It owns one worker thread running +the JIT'd entry, catches its faults, and lets a REPL inspect and steer the +program: breakpoints by `file:line` / `sym[+off]` / `0xADDR`, instruction and +source-line stepping (into / over / out), backtraces, register and named-variable +read/write, and raw memory examine. The library half is `src/dbg/` (freestanding +C11, like all of `src/`); the host primitives and the REPL live in the driver. +See [JIT.md](JIT.md) for how the image is produced and [DWARF.md](DWARF.md) for +the line/CFI/variable tables the source-level features consume. -## 4. Dataflow +## Layering ``` -stdin → dbg REPL → CfreeJitSession ──► worker thread runs JIT entry - │ │ - │ ▼ - │ SIGTRAP / SIGSEGV / ... 
- │ │ - ▼ ▼ - CfreeStopInfo ◄────── ucontext → CfreeUnwindFrame - │ - ▼ - DWARF (line/CFI/var) → user-visible output +driver/cmd/dbg.c REPL: command parsing, stop rendering, DWARF queries, + driver-local breakpoint table, SIGINT forwarding +driver/env/* CfreeDbgOs host adapter (threads, signals, W^X, fault copy) + │ (public API: cfree_jit_session_*, CfreeDbgOs vtable) + ▼ +src/dbg/ library-side session + session.c worker thread, event handshake, fault classification + step.c resume-mode state machine (insn / line / next / out) + displaced.c arch-neutral out-of-line single-step plumbing + bp.c address-keyed breakpoint patch table + mem.c guarded guest-memory read/write + bp-byte overlay + dbg.h internal session contracts + │ (per-arch hooks: ArchImpl.dbg → ArchDbgOps) + ▼ +src/arch/{aa64,x64,rv64}/dbg.c trap encoding, insn decode, displaced shims ``` -The REPL is already wired (`driver/cmd/dbg.c`). It owns the breakpoint id -namespace presented to the user, the DWARF consumer, and SIGINT -forwarding at the prompt. Everything *behind* `cfree_jit_session_*` — -threading, signals, memory protection — is what this doc covers. - -## 5. CfreeDbgOs vtable - -The session never calls `pthread_*`, `sigaction`, `mprotect`, or -`pthread_jit_write_protect_np` directly. All host primitives go through -a single vtable supplied via `CfreeEnv`. `driver/env.c` is the only TU -in the tree that includes `<pthread.h>`, `<signal.h>`, or `<sys/mman.h>` -for debugger use; `src/dbg/` stays freestanding. - -```c -typedef struct CfreeDbgOs { - /* --- threading ------------------------------------------------- */ - /* Spawn `fn(arg)` on a new thread. *out is an opaque handle the - * session passes back to join/cancel. Returns 0 on success. */ - int (*thread_start)(void* user, void (*fn)(void*), void* arg, - void** thread_out); - /* Block until the worker exits; releases the handle. 
*/ - void (*thread_join)(void* user, void* thread); - /* Async-signal-safe: deliver the debugger's interrupt signal to the - * worker thread. Called from session_interrupt; must use - * pthread_kill / equivalent. */ - int (*thread_signal_self_worker)(void* user); - - /* --- park/unpark ----------------------------------------------- */ - /* One-shot binary handoffs. The session uses two: one for "worker - * has stopped, REPL may inspect" and one for "REPL has issued - * resume, worker may continue". `event_new` allocates; `wait` - * blocks until `signal`; `reset` rearms. */ - void* (*event_new)(void* user); - void (*event_free)(void* user, void* ev); - void (*event_wait)(void* user, void* ev); - void (*event_signal)(void* user, void* ev); - void (*event_reset)(void* user, void* ev); - - /* --- signal plumbing ------------------------------------------- */ - /* Install process-wide handlers for SIGTRAP, SIGSEGV, SIGBUS, - * SIGILL, SIGFPE, and one user-chosen signal for INTERRUPT (the - * implementation reserves SIGUSR2 on POSIX). The handler must: - * 1. confirm the faulting thread is the worker (pthread_self == - * the worker tid captured at thread_start); - * 2. snapshot the ucontext into the CfreeUnwindFrame buffer the - * session pre-registered via `register_stop_slot`; - * 3. classify the cause (which signal, was the PC patched, was an - * interrupt pending) and store the CfreeStopKind; - * 4. event_signal the stop event; - * 5. event_wait the resume event; - * 6. on resume, write any mutated regs back into the ucontext and - * return so the kernel restarts the worker. - * The session supplies the snapshot/classify/wait callbacks; the - * OS impl only owns sigaction + the ucontext <-> CfreeUnwindFrame - * marshalling for the host arch. */ - int (*signals_install)(void* user, const CfreeDbgSignalOps* ops, - void* session); - void (*signals_uninstall)(void* user); - - /* Slot the handler reads/writes. 
The session owns the memory; this - * is just a pointer the OS layer caches so it can be reached from - * async-signal context without indirecting through the session. */ - void (*register_stop_slot)(void* user, CfreeUnwindFrame* regs, - CfreeStopKind* kind, int* signal_out); - - /* --- memory protection (W^X dance for the BRK patch) ---------- */ - /* CfreeExecMem already provides reserve/protect/release. The dbg - * extension is per-page write-window: on Apple silicon the JIT - * pages live in the dual-mapping (write alias is RW), so the patch - * goes through `write` and the BRK is observed at `runtime`. On - * Linux a transient mprotect RW->RX flip is required. The session - * asks the OS layer to "open" and "close" a write window over an - * address range that lies inside an existing JIT reservation. */ - int (*code_write_begin)(void* user, void* runtime_addr, size_t n, - void** write_addr_out); - void (*code_write_end)(void* user, void* runtime_addr, size_t n); - void (*flush_icache)(void* user, void* runtime_addr, size_t n); - - /* --- fault-guarded memory copy -------------------------------- */ - /* Read/write `n` bytes between guest (in-process) memory and a - * caller buffer, returning nonzero on SIGSEGV/SIGBUS. Implemented - * with sigsetjmp + a SIGSEGV handler scoped to a TLS landing slot - * the dbg OS owns; the standard fault handlers above defer to this - * landing slot when set. */ - int (*guarded_copy)(void* user, void* dst, const void* src, size_t n); - - void* user; -} CfreeDbgOs; -``` +The session never calls `pthread_*`, `sigaction`, `mprotect`, or an icache flush +directly. Every host primitive funnels through one vtable, `CfreeDbgOs` +(`include/cfree/dbg.h`), supplied at `cfree_jit_session_new` through a +`CfreeDbgHost`. Everything architecture-specific — the trap byte sequence, +instruction decoding, the displaced-step shim — reaches the session through +`ArchImpl.dbg` (an `ArchDbgOps`, `src/arch/arch.h`). 
The session itself is pure +coordination logic: it knows about events, a stop slot, a breakpoint table, and a +single scratch page, and nothing about the host OS or the target ISA. -`CfreeEnv` gains one new field: +This separation is the central design decision. It keeps the debugger's control +logic testable and portable, isolates all signal/thread/memory-protection hazards +to a handful of driver TUs, and lets a new target gain a debugger by implementing +only the `ArchDbgOps` hooks. -```c -const CfreeDbgOs* dbg_os; /* NULL ok unless dbg paths run */ -``` +## Worker thread and the fault handshake -The session looks at `env->dbg_os` once, in `cfree_jit_session_new`, -and returns NULL if absent — exactly the contract `dbg.c:1862-1868` -already expects. +A session owns exactly one worker thread (`worker_main` in `src/dbg/session.c`). +The worker and the REPL thread share two one-shot events — `ev_resume` and +`ev_stop` — and a single `CfreeStopInfo` slot on the session. The events are the +only synchronization point: the REPL reads the stop slot only while the worker is +parked, and the worker mutates session state only from the fault handler. + +``` +REPL thread worker thread +----------- ------------- +session_new ── thread_start ──────► wait(ev_resume) + │ +session_call(entry) ──────────────────┤ + state = RUNNING │ + signal(ev_resume) ───────────────► run entry (call_with_catch) + wait(ev_stop) │ + │ (a) fault: SIGTRAP/SEGV/BUS/ILL/FPE/interrupt + │ on_fault: snapshot regs, classify, + │ state = STOPPED, + │ ◄──────────────── signal(ev_stop); wait(ev_resume) + inspect stop slot (parked inside the signal handler) + session_resume(MODE) ──────────────► (b) entry returns normally: + │ state = EXITED, stop.kind = EXIT, + │ ◄──────────────── signal(ev_stop) + read CfreeStopInfo +``` -## 6. Session lifecycle +The worker loops over `ev_resume`. 
On the run-from-entry path it invokes +`worker_run_entry`, which dispatches the entry as either an `int(int,char**)` +`main` or a 0–8 argument `uint64_t(...)` thunk (the latter backs the REPL's `expr` +function calls). The entry runs inside `call_with_catch`, an optional host hook +that establishes a `sigsetjmp` landing pad so a `RESUME_ABORT` can longjmp the +worker out of a parked fault back to the loop. When the entry returns the worker +records a synthetic `CFREE_STOP_EXIT` and signals `ev_stop`. + +A fault on the worker enters the host signal handler, which marshals the +`ucontext` into a `CfreeUnwindFrame` and calls back into the session's `on_fault`. +This is the heart of the design: **the worker parks itself inside the signal +handler.** `on_fault` snapshots the registers into the stop slot, classifies the +cause, sets `state = STOPPED`, signals `ev_stop`, then *blocks on `ev_resume`*. +When the REPL later resumes, `on_fault` writes any mutated registers (a PC +override or REPL `set_regs` edits) back into the live `CfreeUnwindFrame` and +returns, so the kernel restarts the worker exactly where the debugger wants it. +There is no separate "continue" mechanism — resume is just unblocking the parked +handler. + +### Fault classification + +`on_fault` decides what kind of stop a fault represents (`CfreeStopKind` + +`CfreeStopReason`): + +- **Interrupt** — the signal equals the host's `interrupt_signo` (delivered by + `cfree_jit_session_interrupt` via `pthread_kill`). Reported as + `CFREE_STOP_INTERRUPT`. +- **Breakpoint** — the faulting PC, normalized by the arch hook + `breakpoint_addr_from_fault_pc` (x86 INT3 reports the PC *after* the trap byte; + aarch64 BRK reports at the byte), hits the breakpoint table. 
The handler then + branches on the entry kind: + - *Displaced-step sentinel* (an `internal` bp at the active + `displaced.return_pc`) → finalize the displaced step, restore the user PC, and + either resume silently (the continue-over-breakpoint fast path) or surface a + step completion. + - *Plain internal bp* (a one-shot dropped by `step.c`) → clear it and report a + step completion. + - *User bp* → apply `skip_count`, the host-side `condition` callback, and + `max_hits`; a skip or rejected condition silently re-steps over the patched + instruction (see displaced step) without notifying the REPL; otherwise report + `CFREE_STOP_BREAKPOINT` with the user's bp id. +- **Signal** — any unpatched fault: SEGV, BUS, ILL, FPE, or a SIGTRAP the program + emitted itself. Reported as `CFREE_STOP_SIGNAL`, distinguishing a genuine trap + (`trap_signo`) from other signals. + +Invariants the handshake relies on: one worker per session; `session_call` is +rejected while RUNNING or STOPPED; the REPL touches the stop slot only after +`wait(ev_stop)` returns; the worker writes the stop slot only from the +async-signal context inside `on_fault`. + +### Teardown while parked + +`cfree_jit_session_free` deliberately leaks the worker when it is torn down in the +STOPPED state. A worker parked inside the signal handler cannot be cleanly +unwound without re-running the program to completion, and the session is only +freed at process exit, so the OS reaps the thread and the event/signal/heap +teardown is skipped. This keeps quitting from a stopped prompt immediate. + +## Resume-mode state machine + +`cfree_jit_session_resume` takes a `CfreeResumeMode` and produces the next stop. +Plain `CONTINUE` simply unblocks the parked handler — *unless* the current PC sits +on a breakpoint patch, in which case the original instruction must execute before +control continues. 
Everything more elaborate is built in `src/dbg/step.c` on two +primitives: the displaced single-step (one instruction out of line) and one-shot +internal breakpoints. + +The state machine has two execution styles. `STEP_INSN` and continue-over-bp set +a pending PC override and let the outer `session_resume` drive the single +signal/wait cycle. The source-level modes (`STEP_LINE`, `NEXT_LINE`, `STEP_OUT`) +drive their *own* signal/wait cycles in a loop inside `step.c` and set +`pending_done` so the outer resume short-circuits. They require an attached DWARF +binding and return an error without one. + +- **STEP_INSN** — prepare a displaced step at the current PC; resume; the sentinel + trap reports the completion. +- **STEP_LINE** (step into) — record the current source line and subprogram from + DWARF, then advance one instruction at a time. At each instruction: if it is a + direct call, drop a one-shot bp at the callee entry and continue into it; if it + is a direct jump, follow it the same way; otherwise displaced-step over it. Stop + when the source line changes or control leaves the original subprogram. +- **NEXT_LINE** (step over) — if the current instruction is a call, set a one-shot + bp at the unwound return address (via DWARF CFI) and continue over it, then fall + into the STEP_LINE loop to keep advancing until the line actually changes. A + non-call falls straight into the STEP_LINE loop. +- **STEP_OUT** (finish) — unwind one frame with `cfree_dwarf_unwind_step` (or an + arch link-register fallback), set a one-shot bp at the caller's return address, + and continue. + +A bounded instruction cap guards the STEP_LINE loop against runaway stepping when +the line table is sparse or absent. All PCs cross a runtime↔image translation +boundary before any DWARF query (see below). + +## Displaced single-step + +User-mode aarch64 has no architectural single-step (the single-step bit lives at +EL1). 
The debugger therefore executes one instruction *out of line*: it copies a +fixed-up version of the target instruction into a scratch page, appends a trap +sentinel, sets the worker PC to the scratch entry, and resumes. This primitive +also underlies "resume past a breakpoint": the patched byte cannot execute in +place, so the original instruction runs from scratch instead. `src/dbg/displaced.c` +is the arch-neutral plumbing; the per-arch lifter is +`ArchDbgOps.build_displaced_shim`. ``` -session_new - ├── allocate state, stop slot, two events - ├── dbg_os->signals_install - └── dbg_os->thread_start(worker_main) - │ - ▼ (worker) ┌── (REPL) - wait resume_event │ - call entry │ session_call(entry, argv) - │ │ install entry args - │ │ event_signal(resume) - │ trap │ event_wait(stop) - handler: snapshot, │ - classify, signal stop, │ - wait resume │ - │ │ session_resume(MODE) - │ │ prepare per-mode trampoline / - │ │ one-shot bps (§7,§8) - │ │ event_reset(stop) - │ │ event_signal(resume) - │ │ event_wait(stop) - entry returns │ - final stop = EXIT ─────┘ session_call returns CfreeStopInfo -session_free - ├── dbg_os->thread_join - └── dbg_os->signals_uninstall +session reserves one RX scratch page (from the JIT's execmem pool) + +displaced_prepare(insn_pc): + decode the original instruction (read via dbg_mem_read, so the saved + byte — not the trap patch — is decoded) + build_displaced_shim → writes a fixed-up copy + a trap sentinel into the + scratch page, returns the sentinel offset and the fallthrough PC + flush icache over the whole slot + arm an internal bp on the sentinel (so the fault classifier recognizes it) + new_pc = scratch entry + + worker resumes at scratch ─► runs the fixed-up insn ─► hits the sentinel + +displaced_finalize: + clear the internal bp + if the worker stopped at the sentinel, set PC = fallthrough_pc; + if a fixed-up branch took, PC is already elsewhere — leave it ``` -Invariants: - -- Exactly one worker per session. 
`session_call` is rejected if the - worker is already running. -- The REPL thread never reads `CfreeStopInfo` while the worker is - unparked. `event_wait(stop)` is the only synchronization point. -- The worker never writes to session state except through the - preregistered stop slot from async-signal context. - -## 7. Software breakpoints - -aa64-specific encoding lives in `src/arch/aa64/dbg.c`; everything else -in `src/dbg/bp.c` is arch-neutral. - -- Patch instruction: `BRK #0` (4 bytes on aa64; `0xCC` on x64 later). -- Per-address entry: `{addr, saved_bytes[8], refcount, enabled}`. The - refcount lets the line/next-line state machine drop temporary - breakpoints without disturbing user breakpoints at the same PC. -- Install: `code_write_begin(addr, 4)` → `memcpy` original out → - write `BRK` → `code_write_end` → `flush_icache(addr, 4)`. -- Clear: reverse, refcount-gated. -- On a SIGTRAP, the handler looks up the faulting PC in the table. - Hit → stop kind BREAKPOINT, bp_id from the table. Miss → stop kind - SIGNAL with `signal=SIGTRAP` (program-emitted BRK passes through). -- The trap byte is *never* visible to a `read_mem` against the patched - range: the bp table is consulted first and the saved byte is - substituted. `info b` and disassembly stay honest. - -## 8. Displaced single-step - -User-mode aarch64 has no architectural single-step (MDSCR.SS is EL1). -`RESUME_STEP_INSN`, and "resume past a breakpoint", both use an -out-of-line copy. - -- Each session reserves a small executable scratch page (one slot per - worker; 64 bytes is enough for one fixed-up insn + a B back). -- `displaced_prepare(insn_addr)`: - 1. Read 4 bytes of original (from the bp table, not the patched - memory). - 2. Fix up PC-relative operands so the insn behaves at the scratch - address: `B`, `BL`, `B.cond`, `CBZ/CBNZ`, `TBZ/TBNZ`, `ADR`, - `ADRP`, `LDR (literal)`. 
Branch targets become absolute via a - post-insn `B`/`MOV+BR`; ADR/ADRP get the original PC - substituted; LDR-literal gets converted to a synthesized load - from a fixed-up immediate. - 3. Append a one-shot internal breakpoint at `scratch + brk_offset` - where `brk_offset` is set by the arch fixup (4 for verbatim-copy - forms, larger for multi-insn trampolines that resolve out-of-range - CBZ/TBZ/B.cond, ADR/ADRP literal-loads, and LDR-literal). The - trampoline shape is `cond-branch +8 ; BRK ; LDR x16,[pc+N] ; BR x16 ; - <8-byte literal pool>`. -- Set worker PC to scratch, resume. On the return-slot BRK the handler - restores PC to `insn_addr + 4` and reports the user-visible stop kind - for the mode (STEP_INSN → MODE_DONE; STEP_LINE → continue to next - line-table entry; etc.). -- For indirect branches (`BR`, `BLR`, `RET`) the original insn is copied - verbatim — the trailing BRK never fires because control leaves the - scratch slot. `displaced_finalize` is idempotent; the next - `displaced_prepare` clears any lingering internal bp before laying - down the new shim. - -`STEP_LINE` / `NEXT_LINE` / `STEP_OUT` are state machines on top of -this primitive (`src/dbg/step.c`): - -- STEP_LINE: after each insn step, check `cfree_dwarf_addr_to_line`; - stop when the line index changes and the PC stays inside the - current subprogram. -- NEXT_LINE: same, but if the next insn is a `BL` to a sub at lower - call-depth than the current frame, drop a one-shot bp at the return - address (from `cfree_dwarf_unwind_step` on the current frame) and - CONTINUE instead of stepping. -- STEP_OUT: one-shot bp at the unwound return address; CONTINUE. - -## 9. Memory and register access - -- `session_read_mem`/`session_write_mem` route through - `dbg_os->guarded_copy`, which sets a TLS sigsetjmp landing slot - before the copy and tears it down after. 
The standard SIGSEGV - handler (installed at session_new) checks the slot first and longjmps - if set; otherwise it falls through to the normal "worker took a - fault" path. This keeps `p` and `x` from killing the worker on a - bad pointer. -- Register access reads/writes the `CfreeUnwindFrame` snapshot the - handler captured. `set_regs` mutates the same slot; the handler - writes it back into the ucontext on resume. -- `cfree_jit_session_get_regs` (currently declared but not even - stubbed in `src/api/`) lands in `src/dbg/session.c` as a simple - copy from the slot. - -## 10. DWARF integration - -The DWARF consumer entries exist and are used by the driver: - -- `cfree_dwarf_line_to_addr` supplies `b file:line` targets. -- `cfree_dwarf_unwind_step` powers `bt` and is reused by `STEP_OUT` / - `NEXT_LINE`. -- `cfree_dwarf_var_at` + `cfree_dwarf_loc_read` drive `p name` and - `set name = ...`; the loc reader calls `session_read_mem`, which is - live. - -The session takes its DWARF handle through -`cfree_jit_session_attach_dwarf(session, debug_info)` — the binding is -optional; source-level resume modes return an error if it's absent. - -**Current gap (Task #4 in §12).** `cfree_jit_view` is still a stub -returning NULL (`src/link/link_jit.c`), so the driver's -`cfree_dwarf_open` call yields NULL and `attach_dwarf` is never called. -Every DWARF-dependent feature in the REPL (`bt`, source-level steps, -typed `p name`, `info locals/args`, `b file:line`) is therefore offline -even though the lib-side wiring is complete. Fix is independent and -self-contained; see §12. - -**Multi-input fallout.** The driver now accepts `.o` / `.a` inputs -alongside pipeline-compiled sources (§11), which introduces three new -DWARF concerns the consumer must handle: - -1. `cfree_jit_view` must span every input in the JIT image — including - debug sections from prebuilt `.o` / `.a` files — not just the - pipeline's compile outputs. 
Without this, PCs inside externally - built code resolve to no source location even when those inputs - were compiled with `-g`. -2. `cfree_dwarf_line_to_addr` keys on a filename string; two inputs - that share a basename (`util.c` in both `a/` and `b/`) need a - disambiguation rule. Options: require a unique path suffix, or - error on collision and surface the candidate list. -3. PC ranges that land inside an input compiled without `-g` (the - "blackbox region" — typical for `.o` / `.a` from a prebuilt SDK) - should degrade gracefully: `bt` shows raw PC + symbol only, `p - NAME` reports "no debug info for this frame", `b file:line` errors - if the file isn't covered. The existing `dbg_pc_rt_to_img` - pass-through fallback at `dbg.c:360` already handles the runtime - side; the DWARF lookups themselves need parallel "no data" - returns rather than treating absence as a hard failure. - -## 11. Driver-side changes - -Three TUs in the driver carry dbg-specific code: `driver/env.c`, -`driver/cmd/dbg.c`, and the shared `driver/lib/inputs.{c,h}` module that backs -both `cfree run` and `cfree dbg`. - -- `driver/env.c` carries the `g_dbg_os_posix` singleton populated by - `driver_env_init` and exposed through `driver_env_to_cfree`. 
Sections: - thread shim (`pthread_create` / `pthread_join` / `pthread_kill`), - event shim (`pthread_mutex_t`+`pthread_cond_t` pair + signaled flag - per event), signal install (one `sigaction` per signo with the cohort - blocked in `sa_mask`), code-write window (a per-process registry - `g_jit_dual_map` records each dual-mapped exec reservation as - `{write_base, runtime_base, size}` so `code_write_begin` can return - the write alias for any runtime address inside it; on hosts without - a dual mapping it falls back to a transient `mprotect` RW↔RX flip - on the page span), icache flush (`sys_icache_invalidate` on Apple, - `__builtin___clear_cache` elsewhere), guarded copy (TLS - `sigjmp_buf` + armed flag; the SEGV/BUS handler checks this slot - before delegating to `on_fault`). -- `driver/cmd/dbg.c` calls `cfree_jit_session_attach_dwarf(session, - debug_info)` right after `cfree_dwarf_open` succeeds, so source-level - resume modes light up the moment `cfree_jit_view` returns non-NULL. - The degraded-mode warning at `dbg.c:1862-1868` is left in place; it - now only fires on non-aarch64 hosts. -- `driver/lib/inputs.{c,h}` owns the `DriverInputs` struct + classify / - load / compile-and-jit pipeline, shared with `cfree run`. Recognized - positionals: `.c` / `.cc` / `.cpp` sources, `.o` / `.obj` objects, - `.a` archives, and `-` for stdin (single source). The dbg-specific - consequence is that DWARF coverage is now uneven: pipeline-compiled - sources get `-g` forced on, but `.o` / `.a` inputs carry only the - debug info they were built with — see §10. - -## 12. Checklist - -Single source of truth for what's done and what's open. Items grouped -by lane; ordered top-to-bottom by priority within each group. Add new -work here as a new `[ ]` line; never delete completed lines, just flip -the box. 
- -### Library — `src/dbg/` - -- [x] `session.c` — worker thread, park/unpark, on_fault classifier -- [x] `bp.c` — refcounted patch table, idempotent set/clear, read overlay -- [x] `mem.c` — guarded read/write via `dbg_os->guarded_copy` -- [x] `displaced.c` — scratch page + per-insn shim primitive -- [x] `arch/aa64/dbg.c` — verbatim copy + B / BL / B.cond / CBZ / CBNZ / - TBZ / TBNZ / ADR / ADRP / LDR-lit (W/X/SW) / BR / BLR / RET -- [x] `step.c` — `STEP_LINE` / `NEXT_LINE` / `STEP_OUT` state machines -- [ ] `arch/aa64/dbg.c`: LDR-literal vector forms (S/D/Q register dest); - currently decline. Common in optimized builds. -- [ ] `arch_x64.c`: INT3 + RIP-relative fixups for the same insn family -- [ ] `arch_rv64.c`: EBREAK + AUIPC/JAL/branch fixups - -### Public API — `include/cfree.h` - -- [x] `CfreeDbgOs` + `CfreeDbgSignalOps` vtables -- [x] `CfreeEnv.dbg_os` field -- [x] `cfree_jit_session_attach_dwarf(session, debug_info)` -- [x] `cfree_jit_session_get_regs` (was declared without a body) -- [x] All `cfree_jit_session_*` stub bodies deleted from `src/api/stubs.c` - -### Linker bridge — `src/link/link_jit.c` - -- [x] `cfree_jit_image_contains(jit, runtime_addr)` -- [x] `cfree_jit_image_arch(jit)` -- [x] `cfree_jit_compiler(jit)` -- [x] `cfree_jit_sym_iter_*` and `cfree_jit_addr_to_sym` — walk - `LinkImage->syms`, surface FUNC / OBJ / COMMON / TLS / IFUNC / - ABS; names go through `obj_format_demangle_c` so Mach-O's - leading `_` is stripped -- [x] `cfree_jit_view(jit)` — multi-input concatenation. Debug emitter - writes R_ABS32 relocs against SK_SECTION symbols for every - cross-section offset; view-builder walks every dbg input, - snapshots per-section prefix sizes, and resolves SK_SECTION - relocs against that snapshot. Externally produced `.o` /`.a` - debug input rides the same reloc-application path. 
- -### Public arch-register API - -- [x] `cfree_arch_register_count(arch)` + `cfree_arch_register_at(arch, - idx, out)` replace the stubbed `cfree_arch_reg_iter_*` surface. - Stateless and allocation-free; dense indices in `[0, count)`, - unrelated to the sparse DWARF numbering. The previous iter API - is removed from `cfree.h` and `src/api/stubs.c`. - -### Host adapter — `driver/env.c` - -- [x] POSIX `g_dbg_os_posix` singleton wired in `driver_env_init` -- [x] aarch64 ucontext marshalling for macOS (Apple silicon) and Linux -- [x] Dual-mapping registry `g_jit_dual_map` for code-write window -- [x] Single-mapping `mprotect` RW↔RX fallback -- [x] TLS sigsetjmp guarded-copy + SEGV/BUS handler check -- [ ] Windows host: vectored exception handlers + `SetThreadContext` - instead of POSIX signals - -### Driver — `driver/cmd/dbg.c` and `driver/lib/inputs.{c,h}` - -- [x] `cfree_jit_session_attach_dwarf` call right after `cfree_dwarf_open` -- [x] Degraded-mode warning still present but only fires on non-aarch64 -- [x] Accept the full `cfree run` input shape (`.c` / `.cc` / `.cpp`, - `.o` / `.obj`, `.a`, stdin via `-`) through the shared - `DriverInputs` module in `driver/lib/inputs.{c,h}` -- [ ] Remove the warning entirely once x64 / rv64 sessions are real - -### DWARF / multi-input (lands once `cfree_jit_view` is real) - -The driver now feeds the linker a mixed input set (§11), but the DWARF -consumer still assumes one debug-info-bearing compile unit per lookup. -The three items below land after `cfree_jit_view` so they can be tested -against real debug info; see §10 for the failure modes each addresses. - -- [x] `cfree_dwarf_line_to_addr`: exact-or-path-suffix match. Returns - 0/1/2/3 (ok / file not covered / no row at line / ambiguous). - `cfree_dwarf_line_to_addr_all` enumerates candidates so the REPL - can prompt the user with a longer suffix on collision. 
-- [x] DWARF lookups return distinct "no data" codes (2) when a PC sits - outside every CU's coverage / a file is uncovered. REPL formats - `bt` as `[no debug info for this frame]`, `p NAME` as `"no debug - info for this frame; '%s' not in global symbols"`, and - `b file:line` as `"file not covered by debug info: %s"`. -- [ ] `driver/cmd/dbg.c` source listing (`list file:line`) reads from disk - via `env.file_io`; when the input is from a `.o` / `.a` debug - section whose source file isn't accessible, show the DWARF line - number alone and omit the source snippet. - -### Tests (none landed yet — all verification to date is by-hand REPL) - -- [ ] `test/smoke/dbg_hello`: scripted REPL against a JIT'd C source, - golden-transcript diff. Exercise `b sym`, `r`, `c`, `s`, `x ADDR`, - `p NAME`, `q`. aarch64 hosts only. -- [ ] `test/dbg/bp_patch_roundtrip`: install/clear at one address, - verify byte restore, refcount, and `dbg_bp_unpatch_read` overlay -- [ ] `test/dbg/displaced_aa64`: one canned encoding from every - PC-relative family; assert shim bytes + literal pool layout -- [ ] `test/dbg/guarded_copy_segv`: `read_mem` from NULL returns - nonzero, worker survives the next resume -- [ ] `test/dbg/source_step` (gated on `cfree_jit_view`): scripted - REPL drives `n` / `step` / `finish`, assert reported source line - at each stop - -### Bigger follow-ons - -- [ ] Watchpoints once `CGTarget` can express them without an - ISA-specific debug-register API -- [ ] Multi-thread guests; widen `CfreeDbgOs` with thread enumeration - and per-tid event slots - -### Design note (not a checklist item) - -`cfree_jit_session_free` deliberately leaks the worker thread when -torn down with `state == DBG_STATE_STOPPED`. There is no async-safe -way to unwind a worker parked inside the signal handler without -re-running the program to completion. 
The session is only freed at -process exit, so the OS reaps the worker; events, signal handlers, -and session memory are left untouched until `_exit`. Keeps -`q`-while-stopped immediate. +Fixup is needed for any PC-relative operand so the instruction behaves correctly +at the scratch address rather than its original one. The aarch64 lifter +(`src/arch/aa64/dbg.c`) handles the full core family: + +- No PC-relative operand → copied verbatim, sentinel immediately after. +- `B` / `BL` / `B.cond` → re-encode the immediate (or, when out of range, a + literal-load + indirect-branch trampoline). +- `CBZ` / `CBNZ` / `TBZ` / `TBNZ` → always a trampoline: the not-taken path falls + through to a sentinel, the taken path loads the absolute target from a literal + pool and branches to it. +- `ADR` / `ADRP` → replaced with a load of the original (absolute) result from a + literal pool. +- `LDR` (literal), integer and `LDRSW` → synthesized as a two-step indirect load + through the fixed-up literal address. +- `BR` / `BLR` / `RET` → copied verbatim; the trailing sentinel never fires + because control has already left the scratch slot, so `prepare` is idempotent + and clears any lingering internal bp before laying down the next shim. + +A single scratch slot per session suffices because exactly one displaced step is +ever in flight. The scratch page is allocated lazily from the JIT image's own +execmem pool (`cfree_jit_image_execmem`); if the JIT was built without one, +single-step paths return `CFREE_UNSUPPORTED`. + +## Breakpoint table + +`src/dbg/bp.c` is an arch-neutral table keyed by runtime address. Each slot holds +the address, the bytes the trap patch overwrote, a refcount, a monotonic +user-visible id, and the per-breakpoint policy (`skip_count`, `condition`, +`max_hits`). Internal (one-shot) ids are drawn from a separate high id space so +they never collide with user ids; this lets the step machinery drop temporaries +without disturbing a user breakpoint at the same PC. 
The refcount makes +`set`/`clear` idempotent: a second `set` at an occupied address bumps the count +and reuses the existing id. + +Patching goes through the host W^X window: `code_write_begin` returns a writable +alias for the runtime address (the write side of a dual mapping on hosts that have +one, otherwise a transient `mprotect` RW↔RX flip), the original bytes are saved +and the arch's trap sequence (`ArchDbgOps.breakpoint_patch`, e.g. `BRK` on +aarch64) is written, then `code_write_end` and an icache flush. User breakpoints +are constrained to the JIT image address range; internal breakpoints are also +allowed in the scratch page, which lies outside it. `dbg_bp_fini` restores every +armed patch so the image is left clean. + +The table is also a *read overlay*: `dbg_bp_unpatch_read` substitutes the saved +bytes back into any memory read that overlaps a patched range, so `x`, +disassembly, and the displaced-step decoder never see the trap byte. + +## Guest memory and registers + +Because the worker shares the REPL's address space, reading and writing guest +memory is a guarded `memcpy`, not a `ptrace`/`mach_vm` round trip. `src/dbg/mem.c` +delegates to the host's `guarded_copy`, which arms a thread-local `sigsetjmp` +landing slot before the copy; the host SEGV/BUS handler checks that slot first and +longjmps back on a bad address *before the session's `on_fault` ever sees it*. So a +`p *badptr` returns an error instead of killing the worker. Read results then pass +through the breakpoint read overlay. + +Register access reads and writes the `CfreeUnwindFrame` snapshot captured in the +stop slot; `set_regs` mutates that slot and the parked `on_fault` writes it back +into the `ucontext` on resume. The valid states differ by operation: memory +read/write are both accepted while STOPPED or EXITED, while register get/set both +require STOPPED (there is no live register snapshot once the program has exited). A +`set_regs` PC must lie inside the JIT image. 
+ +## REPL (driver/cmd/dbg.c) + +The driver TU mirrors `cfree run` for compile flags and argv shape (with `-g` +forced on), turns the input list into a JIT image, opens a DWARF view over it +(`cfree_jit_view` → `cfree_dwarf_open` → `cfree_jit_session_attach_dwarf`), then +reads commands from stdin and dispatches them. Inputs follow the shared +`DriverInputs` shape used by `cfree run` — C sources, objects, archives, and stdin +— so a session can mix pipeline-compiled `-g` sources with prebuilt `.o`/`.a`. With +no inputs the REPL starts empty and code is appended interactively with +`jit`/`expr`. + +Responsibilities that stay in the driver: + +- **Command engine** — a flat token dispatch covering `r`/`c`, the four stepping + commands, `b`/`info b`/`delete`/`ignore`, `bt`, `p`/`set`, `x`/`disasm`/`list`, + `info reg`/`locals`/`args`, `jit`/`edit`/`expr`, and language switching. +- **Driver-local breakpoint table** — the user-facing id namespace, the spec text, + and the enabled flag, keyed off the session-side handles. Location strings + (`file:line`, `sym[+off]`, `0xADDR`) are resolved with `cfree_dwarf_line_to_addr` + and `cfree_jit_lookup`. +- **Stop rendering** — `dbg_render_stop` turns a `CfreeStopInfo` into a + source-level message (breakpoint hit, step completion, signal, interrupt, exit) + and a compact source-context listing. +- **DWARF queries** — `bt` via `cfree_dwarf_unwind_step`; `p name` / `set` via + `cfree_dwarf_var_at` + `cfree_dwarf_loc_read`; locals/args enumeration; `list`. +- **SIGINT forwarding** — while a session call is in flight the driver installs a + SIGINT handler that calls `cfree_jit_session_interrupt`; at the prompt it + restores default behavior so Ctrl-C terminates. + +### Runtime-vs-image PC translation + +DWARF line, CFI, and variable tables are authored in image-relative vaddrs, while +stop PCs, breakpoint addresses, and return addresses pushed by calls live in the +JIT runtime address space. 
Both the session (`step.c`) and the REPL translate at +the boundary: every DWARF call consumes an image vaddr and every DWARF result that +names a code address is image-relative, mapped with `cfree_jit_runtime_to_image` / +`cfree_jit_image_to_runtime`. The fallback is pass-through, so a PC outside the +image (e.g. a future multi-input stop in foreign code) degrades gracefully instead +of resolving to zero. Register and CFA values stay in their runtime form because +the DWARF location evaluator reads them as live host values. + +## Host adapter + +The POSIX adapter (`driver/env/posix_dbg.c`, plus per-OS code in +`driver/env/{macos,linux,freebsd}.c`) is the only place in the tree that includes +`<pthread.h>`, `<signal.h>`, and `<sys/mman.h>` for debugger use. It provides: +pthreads for the worker and for the event objects (a mutex + condvar + signaled +flag per event); `sigaction`-based handlers for the trap/fault cohort plus a +reserved interrupt signal, which confirm the faulting thread is the worker before +marshalling the `ucontext` and calling `on_fault`; the W^X code-write window, +which returns a dual-mapping write alias when the host has one (`mach_vm_remap` on +macOS, `memfd_create` on Linux, both recorded in a shared registry) and otherwise +flips protection transiently; icache flush; the `sigsetjmp`-guarded memory copy; +and the `call_with_catch`/`thread_abort` pair backing `RESUME_ABORT`. + +## Scope and assumptions + +Single worker thread; no concurrent guest threads. In-process only — read/write +memory is a guarded copy, not remote debugging. All breakpoints are software (a +trap-byte patch); the `condition` callback is host-side C, not JIT-compiled. +Source-level stepping assumes `-O0` and `-g` for usable line/variable mapping. diff --git a/doc/DBG_TODO.md b/doc/DBG_TODO.md @@ -1,169 +0,0 @@ -# dbg TODO - -This is the working list for making `cfree dbg` feel like a good systems-code -REPL. 
The first milestone is Toy because its frontend already has persistent -REPL state and expression/block input kinds. Most debugger and UI work below is -language-neutral; C-specific expression support can follow once the shared -experience is solid. - -## Current Shape - -- `cfree dbg` has a real REPL in `driver/cmd/dbg.c`: `run`, `cont`, stepping, - breakpoints, `jit`, `expr`, `bt`, `p`, `set`, `x`, `disasm`, `list`, and - `info` commands. -- `src/dbg/` owns the JIT session, worker thread, signal stop/resume loop, - breakpoint patching, guarded memory access, displaced stepping, and - source-level resume modes. -- Toy supports REPL top-level snippets and expression/block thunks. Persistent - Toy declarations already work across `jit` and `expr` commands. -- CG now owns debug-info emission for named source locals and parameters: - frontends pass normal CG local/param metadata, CG emits the variable DIEs at - function finalization, and targets provide finalized storage locations through - `local_debug_loc` / native frame-slot debug-location hooks. This keeps Toy out - of the debug producer API and leaves architecture-specific frame math behind - target interfaces. -- C can append normal translation-unit snippets, but raw C expressions/blocks - are not implemented as REPL thunks yet. - -## Test Discipline - -- Follow red-green TDD for dbg work where possible: first add or update the - focused `test/dbg` transcript or unit test so it fails for the current - behavior, then implement the smallest change that makes it pass. -- Golden outputs must encode the desired user-visible behavior, not the current - bug or an overly loose approximation. Prefer exact normalized transcripts - over grep-based checks. -- Keep expectations stable and intentional. If a prompt, generation line, - diagnostic, or stop message is noise, normalize it in the harness or change - the product output deliberately rather than baking accidental text into a - golden. 
-- When moving coverage, preserve the behavioral expectation in the new test - before deleting the old one. The old test should only be removed once the new - location proves the same behavior red/green. - -## Command Direction - -- `expr` is the way to call functions from the REPL. Users write - `twice(value)` or `fn(arg0, arg1)` in the language they are using instead of - learning a debugger-specific call command. -- The old `call SYMBOL ...` command has been removed. Do not reintroduce a - parallel function-call command unless expression evaluation cannot cover a - specific workflow and the limitation is documented first. - -## Toy-First Milestone - -- [x] Keep `test/dbg` as the scripted REPL test home. Cases should drive - `cfree dbg` through stdin and compare normalized transcripts. -- [x] Move the existing Toy REPL smoke coverage out of `test/driver/run.sh`: - land equivalent exact transcript coverage in `test/dbg`, run it red/green, - then delete the duplicated `test/driver/run.sh` checks. 
-- [ ] Add Toy transcript coverage for: - - [x] empty-session expression evaluation - - [x] persistent top-level `let` - - [x] persistent functions - - [x] record/type declarations across snippets - - [x] function calls through expressions with integer/address-like - arguments - - [x] expression blocks - - [x] compile error recovery behavior (`toy-error-recovery` green; a - failed snippet rolls back and the next valid snippet runs as `$1`) - - [x] `:language toy` switching from a non-Toy default -- [ ] Add debugger-control Toy coverage for: - - [x] `b sym`, `run`, `cont` - - [x] `b file:line` - - [x] `stepi`, `next` - - [x] `step`, `finish` - - [x] `bt` - - [x] `info reg` - - [x] `p`, `set`, `info locals`, `info args` - - [x] `x ADDR [count]` - - [ ] Ctrl-C/interrupt behavior where the host can test it reliably -- [ ] Improve Toy REPL output: - - [x] quieter successful `jit` output; generation spam is hidden - - [ ] typed result formatting instead of always `u64/i64` style - - [ ] richer expression thunk signatures so expressions can accept and - return non-integer values, including pointers, floats, records, - arrays/slices, and other structured values (`toy-structured-expr` - red transcript is in place) - - [ ] pretty-print structured expression results using type info instead - of only showing integer/hex scalar output - - [ ] readable diagnostics that keep the REPL usable after bad input - - [ ] better handling for multi-line input and unmatched braces -- [ ] Improve Toy source/debug info: - - [ ] stable synthetic file names for REPL snippets - - [x] useful `list` output for synthetic snippets - - [x] local/argument names and values at expected source stops - -## Toy Transactional Frontend State - -Implemented; see `doc/TOY_TRANSACTIONAL.md` for the design. Commit is gated on -JIT-publish success (not merely compile success), so a publish rejection such -as a duplicate global cannot leave the Toy table advertising a symbol the image -lacks. 
- -- [x] Make each Toy REPL compile a transaction. A failed snippet leaves the - persistent Toy state exactly as it was before the snippet started, and the - next valid expression/top-level snippet compiles and runs normally. -- [x] Split durable Toy frontend state from per-input parser state. `ToyModule` - owns functions, globals, named types, type ids, and counters that survive - successful snippets; the per-compile `ToyParser` owns the lexer, current - token, current CG object, locals, scopes, labels, goto targets, current - function return state, diagnostics, and input kind for one compile. -- [x] Stop storing per-object CG symbol handles as durable Toy symbol identity. - Function/global names, types, and attrs are persistent declarations; the - per-compile symbol environment (`fn_syms`/`global_syms`) maps each to its - `CfreeCgSym` in the current object when replaying previous declarations. -- [x] Stage new functions, globals, and named/type-table entries until the full - snippet succeeds. Lookups see both durable and staged entries (single - append-only tables); commit keeps the staged appends, rollback truncates - them to the compile-start watermarks plus an undo journal for the - forward-declaration in-place mutations. -- [x] Make rollback safe for both ordinary diagnostics and compiler panics from - CG/backend code. The session drives `abort` on the soft-error and panic - (longjmp) paths; rollback uses watermarks + a typed undo journal, never a - copy of the parser struct. -- [x] Preserve the existing object/backend boundary: the compile-session layer - discards the failed `ObjBuilder`, and dbg only publishes after a - successful compile. Frontend commit is gated on publish success. -- [x] Change the compile-session error contract so frontend diagnostic failures - return `CFREE_ERR` without an extra fatal `frontend failed` diagnostic. - Internal compiler panics still use the panic path. - -## Shared REPL Work - -- [x] Add line editing and history. 
-- [x] Add completion for commands, symbols, locals, files, and breakpoint ids. -- [ ] Repeat the last stepping command on a blank line. -- [x] Print a compact source context after source-level stops. -- [x] Make stop messages distinguish user breakpoints, internal step - completions, signals, traps, and program exits cleanly. -- [x] Add `disasm` / `x/i` around the current PC. -- [ ] Add memory-format variants for `x` (bytes, words, strings, pointers). -- [ ] Add stable machine-readable transcript mode for tests and tools. -- [ ] Keep command parsing factored enough that future editor/IDE frontends can - reuse the command engine without scraping human output. -- [x] Remove the old `call` command; expression calls are the REPL path. - -## Robustness And Portability - -- [x] Fix Darwin/arm64 sanitizer build breakpointing: `ucontext_t` access in - the signal handler currently trips UBSan on misalignment in debug builds. -- [ ] Add direct tests for breakpoint patch/restore and read overlay. -- [ ] Add direct tests for guarded bad memory reads. -- [ ] Add direct tests for displaced AA64 stepping families. -- [ ] Make session teardown explicit enough for tests that stop while the - worker is parked. -- [ ] Decide whether `test-dbg` should self-skip or fail when the compiled - backend/host cannot support a debug session. -- [ ] Bring x64 and rv64 debug sessions up to the same baseline as AA64. - -## C Later - -- [ ] Teach the C frontend `CFREE_FRONTEND_INPUT_REPL_EXPR` and - `CFREE_FRONTEND_INPUT_REPL_BLOCK`. -- [ ] Preserve C declarations across snippets without leaking frontend internals - into the driver. -- [ ] Support C function calls through normal REPL expressions. -- [ ] Infer C expression result types and print typed values. -- [ ] Allow expression thunks to refer to stopped-frame locals where feasible. -- [ ] Add C transcript tests after the Toy harness is stable. 
diff --git a/doc/DESIGN.md b/doc/DESIGN.md @@ -1,252 +1,223 @@ # cfree Design -This document describes the current implementation structure of cfree. It is -not a roadmap and it does not describe target surfaces that are not wired into -the tree today. - -cfree is organized around a small public `libcfree` API, with CLI tools and -language frontends as API consumers. The library owns compilation, object -construction, linking, JIT mapping, debugging support, and emulation internals. -The driver owns command-line policy and host I/O. - -## Public Boundary - -The public headers are: - -- `include/cfree.h`: compiler lifecycle, targets, compile/link/JIT/debug/emu - APIs, host vtables, object inspection, archive and disassembly helpers. -- `include/cfree/cg.h`: the public code-generation API used by language - frontends. -- `include/cfree/frontend.h`: frontend support APIs that do not expose - `CfreeCompiler` internals, such as arenas, source registration, symbols, and - frontend panic boundaries. -- `include/cfree/hashmap.h`: public helper used by frontends. - -`driver/` is built against the public include tree only. It must not include -private `src/` headers. Its job is to parse tool options, load and release -files, provide host vtables in `CfreeEnv`, register frontends on each compiler, -and call public `cfree_*` entry points. - -`lang/` is also outside `src/` and is an API consumer. `lang/c` and `lang/toy` -use `<cfree.h>`, `<cfree/cg.h>`, and frontend support headers, plus their own -private headers under `lang/...`. They do not reach into `src/` implementation -headers. - -`src/` is `libcfree` implementation. Internal modules may share private -headers, but their public surface is exposed only through `include/`. - -## Layering - -From outside to inside: - -1. **Driver (`driver/`)** - Implements the multi-call `cfree` binary: `cc`, `as`, `ld`, `ar`, - `objdump`, `run`, `dbg`, and `emu`. 
It translates command-line flags into - public API options, supplies heap/diagnostic/file/executable-memory vtables, - and resolves path-shaped inputs into byte buffers and writers. - -2. **Language frontends (`lang/`)** - Registered per `CfreeCompiler` with `cfree_register_frontend`. - `lang/c` preprocesses, parses, type-checks, manages C declarations, and - drives the public CG API. `lang/toy` is a small frontend used to exercise the - same CG API. Frontends produce object contents through `CfreeCg`; they do - not own object formats or linker policy. - -3. **Public API glue (`src/api/`)** - Implements `CfreeCompiler`/`CfreePipeline` lifecycle, compile and link entry - points, writer helpers, object inspection, archive APIs, disassembly APIs, - frontend support, and the public CG API. This layer is the composition point - between public handles and internal subsystems. - -4. **Core services (`src/core/`)** - Provides allocation helpers, arenas, vectors, buffers, string buffers, - interned symbols, source-file tracking, diagnostics, hashing, and common - utilities. Most state is rooted in a `CfreeCompiler` or in explicit - subsystem contexts passed through the call graph. - -5. **Frontend-neutral compilation internals** - `src/abi/` owns target ABI layout and call classification. `src/arch/` - owns target registration, internal `CGTarget` and `MCEmitter` creation, - instruction emission, register files, register allocation support, - disassembly, and per-architecture fixups. `src/asm/` owns the standalone - assembler and shared assembler helpers. - -6. **Optimizer (`src/opt/`)** - Implements an internal `CGTarget` wrapper. At `-O0`, code generation drives - the real target directly. At `-O1`, the wrapper records functions as IR, - runs the implemented lowering/register-allocation/combine/DCE path, and - replays into the wrapped target. `-O2` is reserved in API shape and comments - but is not currently implemented as a full optimization pipeline. - -7. 
**Object, debug, link, JIT, dbg, and emu** - `src/obj/` owns the in-memory object model plus ELF and Mach-O reading and - writing paths. `src/debug/` owns DWARF production and reading. `src/link/` - resolves symbols, lays out images, applies relocations, emits executables, - and builds JIT images. The public shared-library entry point is present, but - shared-library codegen is not yet supported. `src/dbg/` layers breakpoints, - stepping, memory access, and displaced execution over JIT sessions. - `src/emu/` loads guest ELF images, decodes/lifts guest blocks, and runs them - through the JIT-backed runtime. - -8. **Runtime (`rt/`)** - Provides freestanding headers and compiler-rt/libc-style support code used - by generated programs and self-hosting configurations. It is separate from - the compiler implementation library. - -## Compile Data Flow - -### C source to object +cfree is a freestanding C11 compiler multi-tool, written in C11. This document +is the front door to the design docs: it states what cfree is, the principles +that shape it, the layered architecture, the primary data flows, and an index of +every sibling design doc. It is a map, not a manual — API signatures and struct +layouts live in the headers under `include/cfree/`; per-subsystem detail lives in +the docs indexed at the end. + +## What cfree is + +A single multi-call binary (`cfree`) that bundles a complete C toolchain plus the +machinery to JIT, debug, and emulate what it produces. Capabilities: + +- C11 preprocessor, single-pass parser/type checker, and code generator. +- A JIT compiler, an in-process runner, and an interactive debugger. +- A linker (objects/archives/DSO inputs -> executable or shared image), with + basic linker-script support and file-based incremental linking. +- A standalone assembler (GAS subset) and inline assembler sharing one emitter. +- A lightweight optimizer (a recording IR with SSA, register allocation, and + local cleanup behind `-O1`). 
+- Cross-compiling backends for aarch64, x86-64, riscv64, and WebAssembly, plus a + portable C-source backend. +- Object read/write for ELF, Mach-O, and PE/COFF; a Wasm object form. +- DWARF debug-info production and consumption; a disassembler. +- A user-mode guest-ELF emulator (per-basic-block JIT translation). +- A bytecode interpreter over the optimizer IR (`run --no-jit`). +- Signed, content-addressed code distribution (`.cfpkg`). +- Object/archive utilities: `ar`, `ranlib`, `nm`, `size`, `strip`, `objcopy`, + `objdump`, `addr2line`, `strings`. + +## Design principles + +- **Freestanding C11.** The compiler builds and runs without a hosted libc; it + ships its own headers and runtime so it can compile itself and target bare + metal. The implementation obeys the same constraints it imposes on its output. +- **No global state.** There are no mutable globals or hidden singletons. All + state hangs off an explicit context — a `CfreeCompiler` or a subsystem handle + (`CfreeObjBuilder`, `CfreeLinkSession`, `CfreeJit`, `CfreeJitSession`, + `CfreeEmu`, frontend state) — so the library is reentrant and embeddable. +- **The host supplies all side effects.** libcfree itself touches no OS. The + host injects every side effect through vtables: heap, diagnostics, file I/O, + metrics, and a clock via `CfreeContext` (`include/cfree/core.h`); executable + memory and JIT thread-local storage via `CfreeJitHost` (`jit.h`); and debugger + OS hooks — threads, events, signal handling, code patching — via `CfreeDbgHost` + (`dbg.h`). This is what makes "no global state" enforceable and keeps the + library portable across hosts. +- **No VLAs.** Stack growth is bounded and predictable; dynamic sizing goes + through arenas/heaps so freestanding and constrained targets stay safe. +- **Strict modular layering.** Each layer depends only inward, and subsystem + internals are private. 
The public header set is the entire contract; crossing a + boundary the wrong way is a design bug, not a shortcut. +- **Multi-arch & multi-platform via vtables, not `#ifdef`.** Architecture, + ABI, and object-format variation is expressed as runtime dispatch tables — + `ArchImpl` (`src/arch/arch.h`), `TargetABI` (`src/abi/abi.h`), and + `ObjFormatImpl` (`src/obj/format.h`) — selected from the target triple. New + targets are new table instances, not new conditional branches scattered + through the tree. +- **Build-time component gating.** `include/cfree/config.h` defines + `CFREE_*_ENABLED` flags for archs, object formats, languages, subsystems, and + tools. The build drops disabled units entirely, so a minimal embedding pays + only for what it uses, and the tool registry in `driver/main.c` is gated by the + same flags. + +## Layered architecture + +From outside in, each layer depends only on the layer beneath it: ``` -driver cc - -> CfreeEnv + CfreeCompiler - -> cfree_compile_obj*() - -> registered C frontend - -> lexer -> preprocessor -> parser/type/declaration logic - -> public CfreeCg API - -> internal CGTarget - -> MCEmitter - -> ObjBuilder - -> object writer or linker input +driver/ CLI policy + host I/O. Includes ONLY <cfree/*.h>. + lang/ Frontends (c, cpp, toy, wasm). API consumers; ONLY + <cfree/*.h> + their own private headers. + include/cfree/ PUBLIC BOUNDARY. The library's entire stable contract. + src/api/ Composition: public handles <-> internal subsystems. + src/... Internal subsystems. Share private headers among their own + TUs; expose nothing except through include/cfree/. ``` -The driver loads the source bytes and chooses `CfreeCompileOptions`. The -pipeline entry in `src/api/pipeline.c` creates an `ObjBuilder` and dispatches -to the frontend registered for `input->lang`. - -The C frontend registers source files and include edges through frontend -support APIs, then preprocesses and parses the token stream. 
It records C -declaration semantics in its own `lang/c` tables and emits functions and data -through `CfreeCg`. - -`CfreeCg` maps public CG types, symbols, stack operations, calls, branches, -data definitions, and source locations onto internal target operations. The -selected target either receives those operations directly (`-O0`) or through -the optimizer wrapper (`-O1`). Final machine bytes, labels, relocations, and -section contents are written into `ObjBuilder` through `MCEmitter`. - -### Assembly source to object - -``` -driver as or cfree_compile_obj*(CFREE_LANG_ASM) - -> asm lexer/parser - -> MCEmitter - -> ObjBuilder - -> object writer or linker input -``` - -Assembly bypasses `CfreeCg` because it is already target-level syntax. The -assembler uses the same object builder and machine emitter path as compiled C. - -### Toy source to object +- **`driver/`** implements the multi-call binary. `driver/main.c` holds the + central tool table; each tool (`cc`, `as`, `ld`, `ar`, `run`, `dbg`, `emu`, + `cas`, `pkg`, …) translates command-line flags into public API calls and + supplies the host vtables. `driver/dist/` carries the content-addressed store + and `.cfpkg` packaging (tar/deflate/lz4, BLAKE2b/SHA-256, ed25519/minisign). +- **`lang/`** holds the frontends. `lang/c` preprocesses (`lang/cpp`), parses, + type-checks, manages C declarations, and drives the public CG API; `lang/toy` + and `lang/wasm` are smaller frontends exercising the same boundary. Each + registers a `CfreeFrontendVTable` per compiler and emits through `CfreeCg`; no + frontend owns object formats or linker policy. +- **`include/cfree/`** is the public boundary — the only headers `driver/` and + `lang/` may include. +- **`src/api/`** is the composition layer: it implements the public handles + (`src/api/compile.c`, `link.c`, `object_builder.c`, `archive.c`, `disasm.c`, + …) and wires enabled frontends (`src/api/lang_registry.c`). 
It is the single + place where public types meet internal subsystems. +- **`src/`** subsystems do the work: `core` (arenas, vectors, buffers, symbol + interning, diagnostics, hashing), `abi`, `arch`, `asm`, `cg`, `opt`, `obj`, + `link`, `jit`, `dbg`, `emu`, `interp`, `debug` (DWARF), `wasm`, and `os`. + +**The layering invariant:** `driver/` and `lang/` include only `<cfree/*.h>` — +never a `src/` header. Anything a frontend or tool needs is promoted into the +public headers; reaching into `src/` is a layering violation. Subsystem +`*_internal.h` headers stay private to their own translation units. + +## Key abstractions + +- **`CfreeCg` (`include/cfree/cg.h`)** is the frontend-facing code-generation + API: a typed stack-machine IR over which all frontends emit functions, control + flow, data, calls, and inline asm. It is the largest public contract and the + point frontends couple to hardest. +- **Tiered backend.** A `CgTarget` (`src/cg/cgtarget.h`) receives the lowered CG + stream. At `-O0` a shared `NativeDirectTarget` adapts the physical + `NativeTarget` (`src/arch/native_target.h`) directly; at `-O1` the optimizer + wrapper (`src/opt/`) records IR, runs its passes, then replays into the same + `NativeTarget`. Physical machine bytes flow through one arch-neutral + `MCEmitter` (`src/arch/mc.h`). +- **`ObjBuilder` (`src/obj/obj.h`)** is the canonical in-memory object model + during compilation, assembly, linking, JIT, inspection, and DWARF emission — + one section/symbol/relocation store, with format knowledge behind + `ObjFormatImpl`. +- **Symbols.** `CfreeSym` is an interned spelling (an identity, not a + definition). Object builders use object-scoped symbol ids so locals from + different objects never collide; the linker builds a separate resolved-symbol + table across all inputs. + +## Primary data flows + +### 1. 
C source -> object ``` -driver cc input.toy - -> registered toy frontend - -> public CfreeCg API - -> normal backend/object path +driver cc -> CfreeContext + CfreeCompiler -> cfree_compile_* (src/api/compile.c) + -> registered C frontend (lang/c): lex -> preprocess -> parse/type/decl + -> CfreeCg (public CG API) + -> CgTarget ( -O0 NativeDirect | -O1 opt wrapper ) + -> NativeTarget -> MCEmitter + -> ObjBuilder -> object writer ``` -The toy frontend exists to exercise and test the public CG API independently of -C language semantics. +The driver loads source bytes and picks options; `src/api/compile.c` creates an +`ObjBuilder` and dispatches to the frontend registered for the input language. +Assembly (`.s`) takes a shortcut: the asm frontend feeds the `MCEmitter`/ +`ObjBuilder` path directly, bypassing `CfreeCg` because it is already +target-level. -## Link and Run Data Flow - -### File link +### 2. File link -> executable ``` -objects / object bytes / archives / DSO stubs - -> cfree_link_exe() - -> Linker - -> object/archive readers - -> symbol resolution - -> layout - -> relocation - -> executable writer +objects / object bytes / archives / DSO inputs + -> cfree_link_* (src/api/link.c -> src/link/) + -> object/archive readers -> symbol resolution -> layout + -> relocation -> executable (or incremental patch) writer ``` -The linker accepts already-built `CfreeObjBuilder` values, encoded object -bytes, archives, and dynamic library inputs described by public API options. -It owns archive member selection, symbol resolution, section and segment -layout, relocation, build-id/image-id handling, and final image emission. -`cfree_link_shared()` has a public option surface, but currently reports that -shared-library codegen is not supported. +The linker owns archive member selection, symbol resolution, section/segment +layout, relocation (per-arch fixups behind `ArchImpl`), build/image-id handling, +and final emission, for any enabled object format. 
-### JIT run and debug +### 3. Run / JIT in-process ``` -source/object inputs - -> compile/link to LinkImage - -> cfree_link_jit() - -> executable-memory host vtable +source/object inputs -> compile/link to a JIT LinkImage + -> cfree_link_jit (CfreeExecMem from CfreeJitHost maps + protects pages) -> CfreeJit / CfreeJitSession - -> run or dbg + -> run (invoke entry) | dbg (breakpoints, stepping, regs/mem via CfreeDbgHost) ``` -The JIT path shares the same compile, object, symbol, and relocation machinery -as file output. Mapping executable memory is delegated to the host through -`CfreeEnv`; libcfree enforces the image layout and relocation model. - -`driver/cmd/run.c` invokes an entry point in-process. `driver/cmd/dbg.c` builds on JIT -sessions and `src/dbg/` for breakpoints, stepping, register display, and memory -inspection. +The JIT shares the same compile/object/relocation machinery as file output; only +the final sink differs. Mapping executable memory and installing TLS are +delegated to the host through `CfreeJitHost`. `run --no-jit` instead attaches a +bytecode `InterpProgram` (`src/interp/`) and executes the entry through the +interpreter while still using the JIT image for real data/extern addresses. -### Emulation +### 4. Emulate a guest ELF ``` -guest ELF bytes - -> emu ELF loader +guest ELF bytes -> emu ELF loader (src/emu/) -> decode/lift guest basic blocks - -> CGTarget or opt_cgtarget - -> JIT image - -> emu runtime + -> CgTarget -> JIT image + -> emu runtime (syscall + memory model) ``` -The emulator is a user-mode ELF runner. It translates guest basic blocks into -the same backend/JIT infrastructure used by native JIT compilation. The public -`CfreeEmuOptions.optimize` API currently reserves level `2`; implemented use is -through the available direct or optimizer-backed translation paths described in -`include/cfree.h` and `doc/EMU.md`. - -## Object and Symbol Model - -`CfreeSym` is an interned spelling. 
It is suitable for identifiers, section -names, symbol names, and lookup keys, but it is not itself a definition. - -`CfreeCgSym` is the public CG handle for a symbol inside one generated object. -Internally, object builders use object-scoped symbol ids so local symbols from -different objects do not collide. Linker resolution builds a separate -resolved-symbol table over all input objects and matches externally visible -definitions by name, binding, and object-format rules. - -`ObjBuilder` is the canonical in-memory object representation during -compilation and assembly. Object writers, the linker, object inspection, debug -emission, and JIT image construction consume this model rather than duplicating -section/symbol/relocation storage. - -## State and Ownership - -The host supplies storage and side effects through `CfreeEnv`: heap, -diagnostics, file I/O, executable memory, debugger OS hooks, JIT TLS hooks, and -time. Public APIs receive explicit options and handles; internal subsystems -hang state off `CfreeCompiler`, `CfreePipeline`, `CfreeCg`, `ObjBuilder`, -`Linker`, `CfreeJit`, `CfreeJitSession`, `CfreeEmu`, or frontend-owned context -structures. - -Compile inputs are byte buffers owned by the caller and must outlive the call. -Writers are host-owned. Builders returned by `cfree_compile_obj` are owned by -the compiler and must remain alive until consumers finish with them. Encoded -object bytes, archive bytes, and DSO bytes are borrowed by link calls for the -duration of the call unless a specific API says otherwise. - -## Current Optimization Contract - -- `opt_level == 0`: direct code generation into the selected backend. -- `opt_level == 1`: implemented optimizer-backed path. It records CGTarget - operations as IR, performs the implemented backend-prep and local cleanup - pipeline, allocates registers, combines, removes dead code, and emits through - the wrapped real target. -- `opt_level == 2`: not yet implemented as a full optimization level. 
Public - option fields and some internal pass declarations reserve this level, but the - current design should treat `-O2` as future work rather than a dependable - behavior contract. +The emulator is a user-mode ELF runner that translates guest basic blocks into +the same backend/JIT infrastructure used for native JIT, executing them under a +guest memory and syscall model. + +## State and ownership + +The host owns storage and side effects (heap, file I/O, executable memory, TLS, +debugger OS hooks); libcfree owns compilation, object construction, linking, JIT +layout, and relocation policy. Public APIs take explicit options and handles; +internal state hangs off `CfreeCompiler`, the subsystem handles, or frontend +context structs. Compile inputs are caller-owned byte buffers that must outlive +the call; builders returned by compile are owned by the compiler until consumers +finish; object/archive/DSO bytes handed to link calls are borrowed for the call +unless an API states otherwise. + +## Documentation index + +| Doc | Covers | +|-----|--------| +| [DESIGN.md](DESIGN.md) | This map: what cfree is, principles, layering, data flows, index. | +| [INTERFACES.md](INTERFACES.md) | Interface inventory and review checklist across all tiers (public, backend, subsystem, core, frontend). | +| [FRONTENDS.md](FRONTENDS.md) | The `lang/` frontends — C (preprocess/parse/type/decl), cpp, toy, wasm — and the frontend vtable contract. | +| [CODEGEN.md](CODEGEN.md) | The `CfreeCg` public CG API and the tiered `CgTarget` -> `NativeDirect`/opt -> `NativeTarget` lowering path. | +| [IR.md](IR.md) | The recording/optimizer IR: instructions, types, and how CG operations become analyzable functions. | +| [ARCH.md](ARCH.md) | Per-arch backends (aarch64/x86-64/riscv64), `ArchImpl` dispatch, `MCEmitter`, register files, and fixups. | +| [ASM.md](ASM.md) | The standalone + inline assembler, GAS-subset syntax, and the shared emitter. 
| +| [OPT.md](OPT.md) | The `-O1` optimizer: SSA construction, register allocation, combine/DCE, and replay into the backend. | +| [INTERPRETER.md](INTERPRETER.md) | The bytecode interpreter over the optimizer IR used by `run --no-jit`. | +| [OBJ.md](OBJ.md) | The format-neutral object model and ELF/Mach-O/COFF/Wasm read/write behind `ObjFormatImpl`. | +| [LINK.md](LINK.md) | Linking: symbol resolution, layout, relocation, linker scripts, and incremental linking. | +| [JIT.md](JIT.md) | The JIT image model, executable-memory and TLS host hooks, and publish/append/replace. | +| [EMU.md](EMU.md) | The user-mode guest-ELF emulator and its per-block JIT translation. | +| [DWARF.md](DWARF.md) | DWARF debug-info production and the consumer used by the debugger and dumpers. | +| [DBG.md](DBG.md) | The debugger: breakpoints, single-step, displaced execution, register/memory access. | +| [CBACKEND.md](CBACKEND.md) | The portable C-source backend (`src/arch/c_target/`). | +| [WASM.md](WASM.md) | The WebAssembly backend, object form, and host-import binding. | +| [DISTRIBUTE.md](DISTRIBUTE.md) | Signed `.cfpkg` packaging and the content-addressed store (`driver/dist/`, `cas`/`pkg` tools). | +| [DRIVER.md](DRIVER.md) | The multi-call binary, tool registry, and command-line policy. | +| [RUNTIME.md](RUNTIME.md) | The freestanding headers and compiler-rt/libc-style support in `rt/`. | +| [BUILD.md](BUILD.md) | The build system and `CFREE_*_ENABLED` component gating. | +| [TESTING.md](TESTING.md) | The test suites and harnesses under `test/`. | + +Planned work and roadmaps live under `doc/plan/`. diff --git a/doc/DISTRIBUTE.md b/doc/DISTRIBUTE.md @@ -1,206 +1,119 @@ -# Code distribution: CAS, packaging, signing, and verification - -cfree distribution v3 has one shared content model and two package -representations. - -The shared content model is a content-addressed store of: - -- blobs: raw regular-file bytes, -- trees: deterministic manifests for output directories. 
- -The package model is a signed claim over one or more trees. Package files can -embed the tree manifests, chunk index, and file content, or they can reference -those objects externally so a client can fetch only the metadata and chunks it -needs. - -There is no dependency resolution or network transport in the package format. -cfree creates, verifies, inspects, unpacks, and materializes local files. Other -tools may move package bytes or serve CAS objects. - -## Trust and identity - -All cryptographic hashes in distribution v3 are BLAKE2b-256 unless explicitly -stated otherwise. - -The signed package object is always the package manifest's literal byte stream: - -``` -package-id = BLAKE2b-256(package manifest literal bytes) -``` - -The package signature is a detached minisign signature over the package -manifest. Its trusted comment contains: - -``` -pkgid=<hex package-id> -``` - -Verification rejects a package when the trusted comment package id does not -match the recomputed manifest hash. - -Native `.cfpkg` files also carry a signed encoding descriptor. The descriptor -must be signed by the same trusted key as the package manifest. The package -manifest signs the logical release/install claim; the encoding descriptor signs -the physical layout, embedded object regions, chunk index, compression, and -external object locators. - -Trust anchors are public keys in the trusted-keys file: - -- `$CFREE_TRUSTED_KEYS`, if set, -- otherwise `$HOME/.config/cfree/trusted_keys`. - -Each line is: - -``` -<keyid-hex> <pubkey-base64> <label> -``` - -A bundled `.pub` file is never trust by itself. It is only a TOFU candidate. -With `--tofu`, cfree pins the bundled public key after confirming its key id -matches the signature's key id. Without TOFU or `-p PUBKEY`, unknown signers -fail. +# Code Distribution + +cfree ships signed, content-addressed software packages with zero host +library dependencies. 
Everything the package pipeline needs — BLAKE2b and +Ed25519 (via vendored monocypher), the minisign file format, DEFLATE/gzip, +LZ4, base64, and ustar tar — is vendored under `driver/dist/`, so a stock +cfree binary can create, sign, verify, inspect, and unpack packages without +linking OpenSSL, zlib, libsodium, or libarchive. Two driver tools surface the +subsystem: `cfree cas` (the shared content store) and `cfree pkg` (signed +packages). See [DRIVER.md](DRIVER.md) for how these slot into the multitool. + +## Why this shape + +Three design decisions drive the whole subsystem: + +- **No host crypto/compression.** A self-hosting toolchain that depended on + the host's OpenSSL or zlib would not be freestanding. The primitives are + small, audited, and checked into the tree (`driver/dist/vendor/`), wrapped + behind narrow `dist_*` shims so the rest of the code never touches a vendor + API directly. +- **Content identity before trust.** The store layer is self-verifying by + hash and carries no signatures. Trust is layered on top by signing a single + small manifest, never by signing bulk content. This keeps the trusted-byte + surface tiny and lets the same blobs be shared, mirrored, and re-bundled + without re-signing. +- **Determinism.** Manifests are byte-stable canonical text; ids are hashes of + those exact bytes. A package, its trees, and its blobs reproduce identically + regardless of which representation carried them. + +## Layering + +``` + cfree pkg / cfree cas driver/cmd/{pkg,cas}.c + | + package model (signed) driver/dist/{manifest,cfpkg}.c + | + content model (self-verifying) driver/dist/{tree,blob,cas}.c + | + vendored primitives driver/dist/{blake2b,ed25519,minisig, + deflate,lz4,b64,tar}.c + driver/dist/vendor/{monocypher,lz4} +``` + +The content model knows nothing about signatures or package names. The package +model adds a signed claim over content ids. 
The CLI tools wire these to the +host filesystem and CSPRNG through `DriverEnv`/`CfreeFileIO`; the `dist_*` +layers themselves source no entropy and do no I/O beyond writer callbacks. ## Vendored primitives -The driver-side distribution subsystem vendors the primitives needed for the -package pipeline. - -| Primitive | Purpose | -|---|---| -| BLAKE2b-256 | package ids, blob ids, tree ids, chunk hashes, region roots | -| BLAKE2b-512 | minisign prehash compatibility | -| Ed25519 | minisign signature scheme | -| base64 | minisign key/signature text | -| tar | portable archive container | -| gzip/DEFLATE | portable compression | -| LZ4 block | optional native chunk compression | - -Compression APIs are deterministic and do not require host compression -libraries. - -## Paths and modes - -All tree and package paths are slash-separated relative paths. - -Rejected path forms: - -- absolute paths, -- empty paths, -- empty path components, -- `.` or `..` components, -- backslashes, -- drive-style `:`, -- paths containing NUL or newline bytes. - -Tree entries currently describe regular files only. Directories are implicit. -The only file mode values are: - -| Mode | Meaning | -|---|---| -| `-` | regular non-executable file | -| `x` | regular executable file | - -## Blob format - -A blob is the raw byte content of one regular file. Blob identity is +| Shim (`driver/dist/`) | Backed by | Used for | +|---|---|---| +| `blake2b.c` | monocypher | content ids, region/Merkle roots, minisign checksums | +| `ed25519.c` | monocypher | minisign signature scheme | +| `minisig.c` | blake2b + ed25519 + b64 | minisign key/signature file format | +| `b64.c` | self-contained | minisign key/signature text encoding | +| `deflate.c` | miniz (public domain) | gzip container for portable packages | +| `lz4.c` | lz4 reference | optional per-chunk block compression | +| `tar.c` | self-contained | ustar container framing | + +All content hashes are **BLAKE2b-256** (`DIST_BLAKE2B_LEN` = 32).
The shims are +deliberately thin: `dist_blake2b`, `dist_ed25519_*`, `dist_gz_*`, +`dist_lz4_*`, `dist_b64_*`, `dist_tar_*`. The vendored monocypher and lz4 +trees stay pristine and are pulled in by `#include` from the shim so cfree +carries no fork to maintain. + +minisign compatibility is exact: keys and signatures use stock minisign's +on-disk byte layout (base64 of `"Ed" || keyid || pk`, etc.), and signatures +are over minisign's 64-byte BLAKE2b prehash. A passwordless minisign key or +signature can be used interchangeably with `cfree pkg`. Password-encrypted +secret keys (`kdf_alg = "Sc"`) require scrypt, which is not vendored; they are +detected and rejected with a clear error rather than mis-parsed. + +## Content model + +### Blobs + +A blob is the raw byte content of one regular file. Identity is path-independent: ``` blob-id = BLAKE2b-256(raw file bytes) ``` -Blobs are also split into fixed-size chunks for streaming verification. The -default and canonical package chunk size is 64 KiB. +Blobs also carry a **chunk Merkle root** (`dist_blob_root`, in +`driver/dist/blob.c`) computed over fixed-size chunks (default 64 KiB, +`DIST_BLOB_CHUNK_SIZE_DEFAULT`). Leaves are domain-separated hashes of +`("cfree blob leaf v1" || u64le chunk-index || u64le raw-size || bytes)`; +interior nodes hash `("cfree blob node v1" || left || right)`, pairing +adjacent hashes left-to-right with an odd final hash promoted unchanged (no +padding, no duplicated leaf); the root wraps the top hash under +`"cfree blob root v1"`. An empty blob has the fixed root +`BLAKE2b-256("cfree blob empty v1")`. -All integers mixed into hash domains are unsigned little-endian. +The two ids serve different jobs: `blob-id` is the simple CAS key for whole +file bytes; `blob-root` authenticates the chunk stream so a streaming verifier +can accept chunks as they arrive without holding the whole file.
-For non-empty blobs: - -``` -leaf = BLAKE2b-256("cfree blob leaf v1" || - u64le(chunk-index) || - u64le(raw-size) || - raw-bytes) +### Trees -node = BLAKE2b-256("cfree blob node v1" || left-hash || right-hash) +A tree is a deterministic manifest for one output directory +(`driver/dist/tree.c`). It is strict, byte-stable INI-style text beginning with +`cfree-tree 1`, one `[file]` section per regular file, sorted bytewise by path. +Each entry records path, mode (`-` regular or `x` executable; directories are +implicit), size, `blob` id, and `root`. Unknown keys/sections, duplicate +paths, and non-canonical ordering are errors. Paths are slash-separated +relative paths; absolute paths, empty components, `.`/`..`, backslashes, drive +colons, and NUL/newline bytes are all rejected (`dist_tree_path_valid`). -blob-root = BLAKE2b-256("cfree blob root v1" || top-hash) ``` - -At each tree level, adjacent hashes are paired left-to-right. If a level has -an odd final hash, that hash is promoted unchanged to the next level. There is -no virtual padding and no duplicated final leaf. - -For an empty blob: - -``` -blob-root = BLAKE2b-256("cfree blob empty v1") -``` - -The blob id and blob root serve different purposes. The blob id is the simple -CAS key for complete file bytes. The blob root authenticates the chunk stream -and allows a package index to verify chunks as they are fetched. - -## Tree format - -A tree is a deterministic manifest for an output directory. It contains no -package name, version, signature, dependency, or build trace information. - -The logical tree manifest is strict, byte-stable INI-style text. Unknown keys, -unknown sections, unknown versions, duplicate paths, and non-canonical ordering -are errors. Emitters sort file sections by bytewise path order. 
- -Example: - -```ini -cfree-tree 1 -hash = blake2b-256 -blob = cfree-blob-v1 - -[file] -path = bin/hello -mode = x -size = 16384 -blob = <blob-id> -root = <blob-root> - -[file] -path = include/hello.h -mode = - -size = 512 -blob = <blob-id> -root = <blob-root> -``` - -Top-level fields: - -| Key | Required | Meaning | -|---|---|---| -| `hash` | yes | currently `blake2b-256` | -| `blob` | yes | currently `cfree-blob-v1` | - -`[file]` fields: - -| Key | Required | Meaning | -|---|---|---| -| `path` | yes | materialized file path | -| `mode` | yes | `-` or `x` | -| `size` | yes | uncompressed file byte length | -| `blob` | yes | BLAKE2b-256 of full file bytes | -| `root` | yes | cfree blob root for the file bytes | - -The tree id is: - -``` -tree-id = BLAKE2b-256(tree manifest literal bytes) +tree-id = BLAKE2b-256(canonical tree manifest bytes) ``` -Tree manifest bytes are the canonical bytes stored in CAS and embedded or -referenced by packages. +### CAS layout -## CAS layout - -The shared local CAS layout is: +`cfree cas` maintains a shared on-disk store (`driver/dist/cas.c`): ``` <cas>/ @@ -210,458 +123,179 @@ The shared local CAS layout is: chunk/<blob-prefix>/<blob-id>/<chunk-index> ``` -`<prefix>` is the first two lowercase hex characters of the id. For chunk -objects, `<blob-prefix>` is the first two lowercase hex characters of the -blob id. - -Blob objects are raw file bytes. Tree objects are canonical `cfree-tree 1` -manifest bytes. Index objects are native cfpkg chunk index byte streams keyed -by their signed `index-root`. Chunk objects are stored package chunk bytes, -keyed by the blob id and chunk index that the signed index authenticates. - -CAS objects are not signed. They are self-verifying by content identity. Trust -enters through signed package manifests, signed package encoding descriptors, -or build-system trace records outside this package format. - -## Package manifest - -The package manifest is the signed logical package object. 
It names the -package, identifies one or more output trees, and attaches package-level -metadata to files in those trees. - -The logical package manifest is strict, byte-stable INI-style text. Unknown -keys, unknown sections, unknown versions, duplicate output ids, duplicate -default outputs, and duplicate artifact paths within an output are errors. - -Example: - -```ini -cfree-package 3 -name = hello -version = 0.3.1 -description = minimal greeting program -hash = blake2b-256 -tree = cfree-tree-v1 -blob = cfree-blob-v1 - -[output] -id = 0 -name = runtime -tree = <tree-id> -target = x86_64-linux-cfree -default = true - -[artifact] -output = 0 -path = bin/hello -kind = exe -entry = true - -[dependency] -name = libfoo -version = >=1.2.0 -package = <dependency-package-id> -key = <expected-signer-keyid> -``` - -Top-level fields: - -| Key | Required | Meaning | -|---|---|---| -| `name` | yes | package name | -| `version` | yes | package version string | -| `description` | no | one-line free text | -| `hash` | yes | currently `blake2b-256` | -| `tree` | yes | currently `cfree-tree-v1` | -| `blob` | yes | currently `cfree-blob-v1` | - -`[output]` fields: +`<prefix>` is the first two lowercase hex chars of the id. Blob and tree +objects are raw/canonical bytes; index and chunk objects hold native-package +chunk data keyed by the signed index that authenticates them. CAS objects are +never signed — they are self-verifying by content identity. `cfree cas` +supports `add-blob`, `add-tree` (from a directory walk or an explicit +path/mode/source map file), `inspect-tree`, `verify-tree`, and `materialize` +(which recreates the directory, verifying every blob and applying modes before +writing). 
-| Key | Required | Meaning | -|---|---|---| -| `id` | yes | numeric output id, unique in the manifest | -| `name` | yes | output name, such as `runtime`, `dev`, or `debug` | -| `tree` | yes | tree id of this output | -| `target` | no | cfree target triple | -| `default` | no | `true` for the default unpack output | +## Package model -`[artifact]` fields: - -| Key | Required | Meaning | -|---|---|---| -| `output` | yes | output id containing the path | -| `path` | yes | file path inside the referenced output tree | -| `kind` | yes | `exe`, `dso`, `obj`, `wasm`, `lib`, `data`, or `source` | -| `entry` | no | `true` if runnable under jit/emu/wasm | - -Artifact sections are semantic overlays. File size, file hash, blob root, and -mode live in the tree manifest. Package verification rejects an artifact whose -path is not present in the referenced output tree. - -`[dependency]` fields are validated but not resolved: - -| Key | Required | Meaning | -|---|---|---| -| `name` | yes | dependency package name | -| `version` | yes | version constraint | -| `package` | no | expected dependency package id | -| `key` | no | expected signer key id | +A package is a signed claim over one or more output trees. The signed object is +the **package manifest** (`driver/dist/manifest.c`), strict byte-stable text +beginning with `cfree-package 3`: top-level name/version/description plus +`[output]` sections (each naming a numeric id, a human-readable name, a tree +id, an optional target triple, and an optional default flag), `[artifact]` +overlays (semantic labels — `exe`, `dso`, `obj`, `wasm`, `lib`, `data`, +`source` — each keyed to an output id and a path), and optional +`[dependency]` sections (validated for shape but not resolved: there is no +dependency solver or network transport in this format). -## Portable `.tar.gz` +An output's numeric `id` is the join key that `[artifact]` rows reference; its +`name` is a human-facing label (e.g. 
the binary or directory name) that +identifies the output to a user without exposing the tree hash. `id`, `name`, +and `tree` are mandatory on every output; `target` and `default` are optional. -The portable representation is a gzip-compressed tar containing the signed -package manifest and a CAS object bundle: - -``` -hello-0.3.1.tar.gz - cfree/package.manifest - cfree/package.manifest.minisig - cfree/package.pub - cfree/cas/tree/<prefix>/<tree-id> - cfree/cas/blob/<prefix>/<blob-id> -``` - -Verification: - -1. Decompress and parse the tar container. -2. Read `cfree/package.manifest` and `cfree/package.manifest.minisig`. -3. Anchor and verify the package manifest signature. -4. Parse the package manifest and recompute `package-id`. -5. Load every output tree from `cfree/cas/tree`. -6. Verify every tree object by `tree-id`. -7. Verify every blob referenced by each output tree by `blob-id` and - `blob-root`. -8. Verify artifact overlays reference files that exist in their output trees. - -Portable archives are not optimized for seeking. They are intended for ordinary -archive tooling and offline transfer. - -## Native `.cfpkg` - -The native representation is a signed package pack. It supports three -practical shapes with one format: - -- thin: manifest and descriptor only; tree manifests, chunk index, and chunks - are fetched externally, -- metadata-rich: manifest, descriptor, tree manifests, and chunk index are - embedded; chunks are fetched externally, -- fat: manifest, descriptor, tree manifests, chunk index, and chunks are - embedded in one file. - -The fixed header is trust-neutral. 
It only locates the early signed metadata: - -``` -offset size field -0 8 magic = "cfpkg3\0\0" -8 4 version = 3, little-endian -12 4 header-size = 96, little-endian -16 8 manifest-offset, little-endian -24 8 manifest-size, little-endian -32 8 signature-offset, little-endian -40 8 signature-size, little-endian -48 8 descriptor-offset, little-endian -56 8 descriptor-size, little-endian -64 8 descriptor-signature-offset, little-endian -72 8 descriptor-signature-size, little-endian -80 8 pubkey-offset, little-endian -88 8 pubkey-size, little-endian ``` - -Trust starts at the verified package manifest. Layout trust starts at the -verified encoding descriptor. - -### Encoding descriptor - -The encoding descriptor is strict INI-style text signed by the same trusted key -as the package manifest. - -Example: - -```ini -cfree-encoding 3 -package-id = <package-id> -format = cfpkg -hash = blake2b-256 -tree = cfree-tree-v1 -blob = cfree-blob-v1 -chunk-size = 65536 -alignment = 16 -tree-offset = 4096 -tree-size = 2048 -tree-root = <region-root> -index-offset = 6144 -index-size = 1680 -index-bytes = 1680 -index-root = <region-root> -index-url = index/<prefix>/<index-root> -content-offset = 8192 -content-size = 65536 -content-root = <region-root> - -[tree-object] -tree = <tree-id> -offset = 0 -size = 512 -blake2b = <BLAKE2b-256 of tree manifest bytes> -url = tree/<prefix>/<tree-id> - -[chunk-source] -kind = embedded - -[chunk-source] -kind = url-template -template = chunk/{blob-prefix}/{blob}/{chunk} +package-id = BLAKE2b-256(package manifest bytes) ``` -Top-level descriptor fields: - -| Key | Required | Meaning | -|---|---|---| -| `package-id` | yes | package manifest id | -| `format` | yes | `cfpkg` | -| `hash` | yes | `blake2b-256` | -| `tree` | yes | `cfree-tree-v1` | -| `blob` | yes | `cfree-blob-v1` | -| `chunk-size` | yes | raw chunk size, normally `65536` | -| `alignment` | yes | embedded region alignment | -| `tree-offset` | yes | embedded tree region file offset, or `0` 
when absent | -| `tree-size` | yes | embedded tree region byte size, or `0` when absent | -| `tree-root` | yes | authenticated tree region root | -| `index-offset` | yes | embedded index region file offset, or `0` when absent | -| `index-size` | yes | embedded index region byte size, or `0` when absent | -| `index-bytes` | yes | logical chunk index byte size | -| `index-root` | yes | authenticated logical chunk index root | -| `index-url` | no | untrusted fetch locator for external index bytes | -| `content-offset` | yes | embedded content region file offset, or `0` when absent | -| `content-size` | yes | embedded content region byte size, or `0` when absent | -| `content-root` | yes | authenticated content region root | - -Region roots are: - -``` -region-root = BLAKE2b-256("cfree region v1" || - kind-bytes || - BLAKE2b-256(region bytes)) -``` - -The `kind-bytes` value is `tree`, `index`, or `content`. - -When `tree-size` or `content-size` is zero, the region bytes are empty and the -matching region root is the root of the empty byte string in the same domain. - -`index-root` authenticates the logical chunk index bytes, not merely the -embedded region. When `index-size` is non-zero, `index-size` must equal -`index-bytes`, and the embedded index bytes are used. When `index-size` is -zero, the index is fetched externally through `index-url` and accepted only -after its size and root match `index-bytes` and `index-root`. +Artifact overlays carry no hashes of their own; size/hash/root/mode live in the +referenced tree, and verification rejects any artifact whose path is absent +from its output tree. An earlier `cfree-package 2` manifest form (flat, +artifact-indexed) is still parseable for backward compatibility; the tool +emits v3. 
-`[tree-object]` fields: +### Signing and trust -| Key | Required | Meaning | -|---|---|---| -| `tree` | yes | tree id | -| `offset` | no | byte offset relative to embedded tree region | -| `size` | no | tree manifest byte size in embedded tree region | -| `blake2b` | yes | BLAKE2b-256 of the tree manifest bytes | -| `url` | no | untrusted fetch locator for external tree bytes | - -If `offset` and `size` are present, the tree manifest is embedded. If they are -absent, the tree manifest must be fetched externally. The `url` field is never -trusted; fetched bytes are accepted only when their BLAKE2b-256 equals both -`blake2b` and the package output's `tree` id. - -`[chunk-source]` fields: - -| Key | Required | Meaning | -|---|---|---| -| `kind` | yes | `embedded` or `url-template` | -| `template` | when `kind=url-template` | untrusted external chunk locator | +Signing uses a detached minisign signature over the manifest bytes +(`dist_minisig_sign`). The signature's trusted comment — which minisign covers +with a second global signature — carries `pkgid=<hex package-id>`. Verification +recomputes the manifest hash and rejects the package if the trusted comment's +pkgid does not match. This binds the signature to the exact manifest content, +not merely to a name. -The template may contain `{blob-prefix}` for the first two lowercase hex -characters of the blob id, `{blob}` for the lowercase blob id, and `{chunk}` -for the decimal chunk index. Locator strings are not trusted and do not affect -verification. - -### Chunk index - -The binary chunk index is sorted by `(blob-id, chunk-index)`. - -Each index record is 168 bytes and little-endian: +Trust anchors live in a trusted-keys file (`$CFREE_TRUSTED_KEYS`, else +`$HOME/.config/cfree/trusted_keys`; `driver/dist/trust.c`), one +`keyid pubkey label` line each. A `.pub` bundled inside a package is never +trusted on its own. 
The verifier picks a key by the signature's key id: ``` -blob-id BLAKE2b-256 -chunk-index u64 -content-offset u64 # relative to embedded content region when embedded -stored-size u64 -raw-size u64 -compression u32 # 0 = none, 1 = lz4-block-v1 -reserved u32 # must be zero -stored-hash BLAKE2b-256 -raw-hash BLAKE2b-256 -leaf-hash BLAKE2b-256 +-p PUBKEY verify against an explicitly supplied public key +(default) look the signer's key id up in the trusted-keys file +--tofu trust-on-first-use: pin the bundled key after its key id + matches the signature, then record it in the trusted-keys file ``` -Empty blobs have no index records. - -`content-offset` is used only for embedded content. For external chunk sources, -clients locate chunks through a `[chunk-source]` template and verify the -returned bytes against `stored-hash` before decoding. +`cfree pkg trust {path|list|add|remove}` manages the anchor file, and +`cfree pkg keygen` produces a passwordless minisign keypair from the host +CSPRNG. -Chunk verification: +## Representations -1. Fetch or range-read the stored chunk bytes. -2. Verify `BLAKE2b-256(stored bytes) == stored-hash`. -3. Decode according to `compression`. -4. Verify `BLAKE2b-256(raw bytes) == raw-hash`. -5. Recompute the blob leaf hash and compare `leaf-hash`. +One package model, two on-disk shapes. -Full blob verification also recomputes `blob-id` and `blob-root` from the raw -chunks and compares the referenced tree entry. +### Portable `.tar.gz` -### Native verification - -Native package verification: - -1. Read the fixed header. -2. Verify the package manifest signature. -3. Verify the encoding descriptor signature with the same trusted key. -4. Confirm descriptor `package-id` matches the package manifest id. -5. Confirm embedded region offsets, sizes, roots, chunk size, and alignment are - well-formed. -6. Load embedded or external tree manifests and verify every `tree-id`. -7. Verify package output tree references and artifact overlays. -8. 
Load embedded or external chunk index and verify `index-root`. -9. For complete package verification or unpack, read chunks from embedded - content or a caller-supplied external object directory, verify stored bytes, - decode, verify raw bytes, verify leaf hashes, and recompute each referenced - blob id and blob root. -10. Re-materialize the selected tree and apply file modes. - -### External object workflow - -The package tool does not perform network fetches. Descriptor `url`, -`index-url`, and `url-template` values are fetch hints for external tools such -as `curl`, package mirrors, or build-system cache clients. - -Consumers that want to verify a thin or metadata-rich native package fetch the -referenced bytes into the same local CAS layout used by `cfree cas`, then pass -that directory to `pkg verify` or `pkg unpack`: - -``` -cfree pkg inspect --encoding hello.cfpkg -curl -o objects/index/<prefix>/<index-root> <index-url> -curl -o objects/tree/<prefix>/<tree-id> <tree-url> -curl -o objects/chunk/<blob-prefix>/<blob-id>/<chunk-index> <chunk-url> -cfree pkg verify -p key.pub --external objects hello.cfpkg -cfree pkg unpack --verify -p key.pub --external objects hello.cfpkg -C out -``` - -The verifier treats local paths derived from descriptor locators as untrusted. -It accepts only relative paths under `--external DIR`, rejects absolute paths -and `..`, and verifies every byte against signed descriptor fields, tree ids, -blob ids, blob roots, chunk hashes, and leaf hashes. - -The external object directory is a CAS root. 
Its layout is identical to the -shared CAS layout: +A gzip-compressed ustar archive carrying the signed manifest plus a CAS-shaped +object bundle (`pkg_create_targz`): ``` -<external>/ - blob/<prefix>/<blob-id> - tree/<prefix>/<tree-id> - index/<prefix>/<index-root> - chunk/<blob-prefix>/<blob-id>/<chunk-index> +hello-0.3.1.tar.gz + cfree/package.manifest + cfree/package.manifest.minisig + cfree/package.pub + cfree/cas/tree/<prefix>/<tree-id> + cfree/cas/blob/<prefix>/<blob-id> ``` -When descriptor locators are present, the verifier uses them as relative paths -under `--external DIR`. When a tree locator is absent, it falls back to the -default `tree/<prefix>/<tree-id>` path. When an index locator is absent, it -falls back to `index/<prefix>/<index-root>`. External chunk content can be -located with a `[chunk-source] kind = url-template`; the template is rendered -with `{blob-prefix}`, `{blob}`, and `{chunk}` and then constrained to the -external directory. If a descriptor omits the chunk template, the verifier -falls back to the default CAS chunk path. +This is meant for ordinary archive tooling and offline transfer — it is not +seek-optimized. Verification (`pkg_verify_portable`) decompresses, parses the +tar, anchors and verifies the manifest signature, recomputes the package id, +then for every output tree verifies the `tree-id`, verifies every referenced +blob by `blob-id` and `blob-root`, and checks that artifact overlays resolve. + +### Native `.cfpkg` + +A signed pack (`driver/dist/cfpkg.c`, `pkg_create_cfpkg`) with a fixed +trust-neutral header (`cfpkg3\0`, 96 bytes) that only locates the early signed +metadata: manifest, manifest signature, encoding descriptor, descriptor +signature, and bundled pubkey. 
It supports three shapes from one format: + +- **fat** — everything embedded (tree manifests, chunk index, chunk content), +- **metadata-rich** — trees and chunk index embedded, chunk content external, +- **thin** — only signed metadata in the file; trees, index, and chunks + external. + +Non-fat shapes write their external objects into a `--external DIR` laid out +exactly like the shared CAS. + +The native physical layout is itself signed, separately from the logical +package, by an **encoding descriptor**: strict text beginning +`cfree-encoding 3`, signed by the same trusted key as the manifest. The +manifest signs *what the release is*; the descriptor signs *how the bytes are +arranged* — region offsets/sizes, embedded-vs-external decisions, chunk size, +alignment, the chunk index root, and per-region authentication roots. A region +root is domain-separated: + +``` +region-root = BLAKE2b-256("cfree region v1" || kind || BLAKE2b-256(region bytes)) +``` + +for `kind` in `tree`/`index`/`content`. Verification confirms the descriptor's +`package-id` matches the manifest, then recomputes each region root from the +actual bytes and compares — so the descriptor's claims about layout are as +trusted as the manifest's claims about content. (A legacy `cfree-encoding 2` +descriptor and `cfpkg2\0` header are still parseable; the tool emits v3.) + +The **chunk index** is a sorted (by blob id, then chunk index) array of +fixed-size little-endian records (`DIST_CFPKG3_INDEX_RECORD_SIZE` = 168). Each +record names a blob's chunk and carries `stored-size`/`raw-size`, a +`compression` tag (`none` or `lz4-block-v1`), the stored byte offset within the +embedded content region (zero for external chunks), and three hashes: +`stored-hash`, `raw-hash`, and the blob `leaf-hash`. Empty blobs contribute no +records. 
+ +Chunk verification is layered defense-in-depth: fetch stored bytes, check +`stored-hash`, decompress, check `raw-hash`, recompute and check the blob +`leaf-hash`, and finally recompute the whole blob's `blob-id` and `blob-root` +against the tree entry. Native verification (`pkg_verify_native`) runs in this +order: read header, verify manifest signature, verify descriptor signature with +the same key, confirm package-id and that all region ranges sit inside the +file, recompute and match every region root, confirm the index is sorted and +well-formed, then reconstruct, verify, and (for unpack) materialize each blob +of the selected output tree. + +### External objects + +The package tool performs no network fetches. Descriptor `url`, `index-url`, +and `[chunk-source] template` values are untrusted fetch hints (rendered with +`{blob-prefix}`, `{blob}`, `{chunk}`) for external tools — `curl`, mirrors, +build caches. A consumer fetches the referenced bytes into a CAS-shaped +`--external DIR` and runs `pkg verify`/`pkg unpack` against it. Locators are +treated as untrusted relative paths: the verifier rejects absolute paths and +`..`, constrains everything under the external dir, and accepts bytes only +after they match the signed descriptor's hashes, ids, and roots. ## CLI -Shared CAS utilities: - -``` -cfree cas add-blob --cas DIR FILE -cfree cas add-tree --cas DIR --root DIR -cfree cas add-tree --cas DIR --map FILE -cfree cas inspect-tree --cas DIR TREE_ID -cfree cas verify-tree --cas DIR TREE_ID -cfree cas materialize --cas DIR TREE_ID -C DIR -``` - -`cas add-tree --root DIR` walks a directory, stores every regular file under -`blob/`, writes the canonical tree manifest under `tree/`, and prints the -tree id. - -`cas add-blob --cas DIR FILE` stores the raw file bytes under `blob/` and -prints the blob id. - -`cas add-tree --map FILE` builds a tree from explicit path/mode/source triples. 
-The map file grammar is one entry per line: - ``` -<tree-path> <mode> <source-path> -``` - -`cas materialize --cas DIR TREE_ID -C DIR` reads the tree manifest and blobs -from the CAS and writes the tree into the target directory named by `-C`. -Materialization creates missing parent directories, rejects unsafe tree paths, -verifies every blob before writing it, and applies each entry's `mode`. - -`cas inspect-tree --cas DIR TREE_ID` prints the canonical tree manifest bytes. -`cas verify-tree --cas DIR TREE_ID` verifies the tree object id and every blob -referenced by the tree. +cfree cas add-blob/add-tree/inspect-tree/verify-tree/materialize --cas DIR ... -Package utilities: - -``` -cfree pkg keygen -o BASE -cfree pkg create --name N --version V [--desc D] -s SECKEY \ - [--format cfpkg|tar.gz] [--compression none|lz4-block-v1] \ - [--native-shape fat|metadata|thin] [--external DIR] \ - --cas DIR --tree TREE_ID -o OUT -cfree pkg create --name N --version V [--desc D] -s SECKEY \ - [--format cfpkg|tar.gz] [--compression none|lz4-block-v1] \ - [--native-shape fat|metadata|thin] [--external DIR] \ - --root DIR -o OUT -cfree pkg verify [-p PUBKEY | --tofu] [--format cfpkg|tar.gz] \ - [--external DIR] FILE -cfree pkg unpack [--verify] [-p PUBKEY | --tofu] [--format cfpkg|tar.gz] \ - [--external DIR] FILE -C DIR +cfree pkg keygen -o BASE # writes BASE.pub + BASE.key +cfree pkg create --name N --version V [--desc D] -s SECKEY + [--format cfpkg|tar.gz] [--compression none|lz4-block-v1] + [--native-shape fat|metadata|thin] [--external DIR] + (--cas DIR --tree TREE_ID | --root DIR) -o OUT +cfree pkg verify [-p PUBKEY | --tofu] [--external DIR] FILE +cfree pkg unpack [--verify] [-p PUBKEY | --tofu] [--external DIR] FILE -C DIR cfree pkg inspect [--manifest | --encoding] FILE cfree pkg trust {path | list | add PUBKEY [label] | remove KEYID} ``` -`pkg create --cas DIR --tree TREE_ID` packages an existing tree from a CAS. 
-`pkg create --root DIR` is a convenience form that first creates a temporary -tree from `DIR` and then packages it. - -Local `pkg create --format cfpkg` writes a fat native package by default: -manifest, descriptor, tree manifests, chunk index, and chunks are embedded. -`--native-shape metadata --external DIR` embeds tree manifests and the chunk -index but writes chunks under the external directory. `--native-shape thin ---external DIR` writes tree manifests, the chunk index, and chunks under the -external directory and leaves only signed metadata in the `.cfpkg` file. - -`pkg inspect --encoding FILE` prints the native encoding descriptor so a caller -can derive the fetch plan before running external fetch commands. - -`pkg unpack` verifies before writing files. When `--verify` is supplied, it -also prints the normal verification success line before unpacking. - -## Determinism requirements +`create --root DIR` first builds a temporary tree from the directory, then +packages it; `--cas DIR --tree TREE_ID` packages an existing tree. Format is +inferred from the output suffix when `--format` is omitted. `unpack` always +verifies before writing files. The native descriptor for thin/metadata-rich +packages can be dumped with `inspect --encoding` to derive a fetch plan. -Emitters and verifiers must preserve these identities exactly: +## Determinism invariants -- package ids are manifest-byte hashes, -- tree ids are tree-manifest-byte hashes, -- blob ids are raw-byte hashes, -- blob roots are path-independent chunk Merkle roots, -- native package chunk indexes are blob-indexed, not path-indexed, -- portable and native packages verify the same logical package/tree/blob - content. 
+Emitters and verifiers preserve these identities exactly: package ids are +manifest-byte hashes; tree ids are tree-manifest-byte hashes; blob ids are +raw-byte hashes; blob roots are path-independent chunk Merkle roots; native +chunk indexes are blob-indexed (not path-indexed); and portable and native +packages verify the same logical package/tree/blob content. diff --git a/doc/DRIVER.md b/doc/DRIVER.md @@ -0,0 +1,276 @@ +# DRIVER + +The `cfree` multitool is the toolchain's only executable: a single binary that +dispatches to ~20 named tools (compiler, assembler, linker, archive/object +utilities, JIT runner, debugger, emulator, packager). It is also the first and +canonical *consumer* of libcfree — it depends only on the public API under +`include/cfree/`, never on `src/`. Everything that the OS provides (heap, file +I/O, executable memory, threads, signals, time, entropy) enters libcfree through +host vtables that the driver constructs in exactly one place. See +[DESIGN.md](DESIGN.md) for the library it drives, [INTERFACES.md](INTERFACES.md) +for the public API, [RUNTIME.md](RUNTIME.md) for `libcfree_rt.a`, and +[DISTRIBUTE.md](DISTRIBUTE.md) for the `pkg`/`cas` subsystem. + +## Layering + +The driver is three layers plus a vendored subsystem, all under `driver/`: + +``` + main.c dispatch + top-level help (the only entry: int main) + cmd/<tool>.c one CLI shell per tool (cc, ld, run, objdump, ...) + lib/ cross-tool helpers (cflags, triples, inputs, runtime, hosted) + env/ THE host boundary: turns "the OS" into a CfreeContext + dist/ vendored crypto/compression for pkg + cas + \________________ all of the above call only <cfree/...> public headers +``` + +Two compile profiles enforce the boundary (see `Makefile`): + +- `driver/cmd/`, `driver/lib/`, `driver/main.c`, `driver/dist/` are compiled + `-ffreestanding -nostdinc`, with `-Iinclude` (public API) but deliberately + **no `-Isrc`**. Internal libcfree headers are physically unreachable. 
`-Ilang` + is the one concession: it lets `cc`/`run` reach the C frontend's *public* + header `c/c.h`. These TUs do no syscalls of their own; all host effects route + through the `env/` shims. +- `driver/env/` is the *only* part compiled with the hosted toolchain + (`DRIVER_ENV_CFLAGS`, real libc headers, per-OS feature-test macros). It is + the single TU family permitted to `#include <stdio.h>`, call `malloc`, + `mmap`, `pthread_*`, `sigaction`, etc. + +This split is the concrete form of the project's "no global state" rule: the +freestanding tools cannot touch the host except through callbacks the `env/` +layer hands them, so each tool's behavior is fully a function of its arguments +and the vtables it is given. + +## Dispatch (main.c) + +`driver/main.c` holds the single centralized tool table — an array of +`{name, main, help, summary}` rows. Dispatch is multi-call: + +``` + argv[0] basename matches a tool? -> run it (installed as `cc` symlink) + else argv[0] was bare "cfree": + no argv[1] / -h / --help -> top-level help + argv[1] == "help" [<tool>] -> top-level or per-tool help + argv[1] matches a tool -> run it with argv shifted by one + otherwise -> "no such tool" + help, exit 2 +``` + +So `cfree cc -c f.c` and a `cc` symlink to the binary behave identically; the +shift trick rewrites `argv[1]` to `argv[0]` before delegating so the tool sees a +conventional argv. `dispatch` returns `-1` for "no such tool" (distinct from a +tool's own non-`-1` exit code) which is what lets the bare-`cfree` fallback +logic run only when argv[0] itself wasn't a tool name. + +Each table row is wrapped in `#if CFREE_TOOL_<NAME>_ENABLED` (defined in +`include/cfree/config.h`). The same flags gate which `driver/cmd/*.c` objects the +Makefile compiles in, so a disabled tool drops out of both the table and the +build with no `#ifdef` scattered through the tool implementations. 
Adding a tool +is: a `config.h` flag, a row here, the `driver_<tool>` / `driver_help_<tool>` +prototypes in `driver/driver.h`, the `cmd/<tool>.c`, and a Makefile stanza. + +Exit-code convention across all tools: `0` success, `1` tool-reported error, +`2` bad usage. Help requests are detected by `driver_argv_wants_help`, which +stops scanning at a literal `--` so that a `--help` meant for a JITed program +(`run`, `dbg`) or an emulated guest (`emu`) is not hijacked by the driver. + +## The tools (cmd/) + +Each `cmd/<tool>.c` is a thin CLI shell: parse a flag surface, classify inputs, +load bytes via the env file_io, call public libcfree APIs, format output. No +tool reaches into compiler internals. + +| Tool | Role | +|------|------| +| `cc` | C compiler driver: compile, optionally link; preprocess (`-E`), dep-emit (`-M*`), `-shared`. GCC flag subset. Resolves `-l`/`-L` to concrete archive paths. | +| `check` | Run the C frontend checks with no code emission. | +| `cpp` | Standalone preprocessor (alias for `cc -E` without link scaffolding). | +| `as` | Assemble one GAS-subset text source to a relocatable object. | +| `ld` | Link objects/archives into an executable, shared library, or relocatable object; parses `-T` scripts into structured form. | +| `ar` / `ranlib` | Create/modify/list/extract `ar` archives; refresh the symbol index. | +| `strip` / `objcopy` | Drop debug/symbols; rename/remove sections, reformat. | +| `objdump` / `nm` / `size` | Inspect sections, symbols, disassembly, relocations, sizes. | +| `addr2line` / `strings` | Address→`file:line` via DWARF; printable runs. | +| `run` | JIT-compile inputs and call the entry symbol in-process. | +| `dbg` | Interactive JIT debugger (REPL over a `CfreeJitSession`). | +| `emu` | Run a guest user-mode ELF (aarch64/riscv64) via per-block JIT translation. | +| `cas` / `pkg` | Content-addressed store and signed `.cfpkg` distribution. 
| + +`run`, `dbg`, and `emu` share the `--`-terminated argv convention: flags before +`--` configure the tool, tokens after `--` become the JITed program's / guest's +argv. `cc` and `run` overlap heavily on input shape and the preprocessor flag +family — that overlap is exactly what `driver/lib/` factors out. + +## Cross-tool helpers (lib/) + +These hold the logic that more than one tool needs, so the CLI shells stay thin +and consistent. All are freestanding (no host calls except through `env/`). + +- **cflags** (`lib/cflags.c`): parses the `-I`/`-isystem`/`-D`/`-U` + preprocessor family shared by `cc` and `run`, accumulating into a back-store + sized to argc, then fills a `CfreePreprocessOptions` whose arrays borrow that + store. +- **triple parsing** (`lib/target.c`): pure string-walking `<arch>-...-<os>` + parser → `CfreeTarget`, plus the reverse renderer and the PIC/PIE defaulting + policy (`driver_default_pic`, `driver_link_pie`). Hosted targets default to + PIE; freestanding and WASM stay non-PIE. Lives outside `env/` precisely + because it touches no host state. +- **inputs** (`lib/inputs.c`): classifies a mixed positional list (`-` stdin + source, `.c`/`.s`/`.wat`/… sources, `.o` objects, `.a` archives) into + parallel arrays, then loads + compiles + JIT-links them for `run`/`dbg`. Also + the shared object-symbol enumerator used by `ar`/`ranlib`/`strip`. +- **lib_resolve** (`lib/lib_resolve.c`): resolves `-l<name>` against `-L` + directories with the GNU-ld positional `-Bstatic`/`-Bdynamic` rule and a + target-OS-aware suffix list (POSIX `lib<name>.{so,a}` / Apple `.tbd` vs. + Windows `.lib`/`.dll.a`). The OS hint is independent of the host so + cross-compilation works. +- **hosted** (`lib/hosted.c`): builds a `DriverHostedPlan` — the crt objects, + default libraries, system include dirs, interpreter path, and predefined + macros needed to link a *hosted* executable for a target profile. 
This is the + classification of which inputs the link step must inject implicitly. +- **runtime** (`lib/runtime.c`): discovers the cfree support root (next to + `argv[0]`, or `--support-dir`), then locates-or-builds `libcfree_rt.a` for the + selected target. It carries a per-target `RuntimeVariant` table (sources, ABI + include dir, `HAS_INT128`/`LDBL128` defines) and rebuilds the archive into a + cache dir (`$XDG_CACHE_HOME`/`~/.cache/cfree`, falling back) when any source + or the tool binary is newer than the cached archive. This is how `cc` ships a + freestanding runtime without a separate install step; see [RUNTIME.md](RUNTIME.md). + +## The host boundary (env/) + +`driver/env/` is the heart of the driver's design. It is the single place that +constructs a `DriverEnv` and projects it into the vtables libcfree consumes: + +``` + DriverEnv -> driver_env_to_context() -> CfreeContext (heap, file_io, diag, metrics, now) + -> driver_env_to_jit_host() -> CfreeJitHost (execmem, jit_tls) + -> driver_env_to_dbg_host() -> CfreeDbgHost (dbg_os) +``` + +A `CfreeContext` is passed by const-pointer into every libcfree entry; the JIT +and debugger take their extra host vtables per-call rather than on the context, +which keeps the common compile/link path from carrying execmem/signal machinery +it never uses. libcfree itself holds no global state and issues no syscalls — +it only calls back through these function pointers, so the driver alone decides +how the abstract operations map onto the real OS. + +### What the vtables abstract + +- **heap** (`g_heap_libc`, `common.c`): `malloc`/`realloc`/`free` shim. Pure + libc, no OS specifics, compiled on every host. +- **diag sink** (`g_diag_stderr`, `common.c`): formats diagnostics to stderr, + resolving `SrcLoc.file_id` to a path via the *active* `CfreeCompiler`. The + `driver_compiler_{new,free}` wrappers register that compiler so messages get + real filenames instead of `<file:N>`. 
+- **file_io** (`posix.c` / `windows.c`): `read_all` / `release` / `open_writer` + over real paths, with `CfreeWriter` implementations for files (fd-backed) and + stdout (stdio-backed, so it shares libc's buffer with `driver_printf`). +- **execmem** (`CfreeExecMem`): reserve/protect/release/flush of W^X executable + memory for the JIT — the OS-divergent core (below). +- **jit_tls** (`CfreeJitTls`, `jit_tls_posix.c`): per-thread TLS blocks for + JITed code that uses thread-locals. The ctx's first field *must* be the + `get_block` function pointer because the asm TLV thunk + (`src/jit/tlv_thunk.h`) calls through it directly. +- **dbg_os** (`CfreeDbgOs`, `posix_dbg.c`): worker thread, event objects, + signal/exception capture, guarded memory copies, and W^X code-patching for + the debugger. +- **metrics**: optional; `run` supplies a buffered sink so the hot compile/JIT + path does zero I/O and all formatting happens once at exit. + +Beyond the vtables, `env/` also exposes the syscall-shaped helpers the +freestanding tools need but can't make themselves: `driver_printf`/`errf`, +path existence/mtime, `mkdir -p`, directory walks, stdin slurp, an `$EDITOR` +temp-file round-trip, a raw-mode line editor with history/completion for the +`dbg` REPL, SIGINT install/restore, monotonic time, CSPRNG bytes (for `pkg` +key generation), and a `dlsym` resolver so JITed code can call host libc. + +### One TU per concern, zero `#ifdef` + +The env layer's structuring invariant: each TU implements one slice of the +host with **no preprocessor OS/arch conditionals**. 
`mk/env.mk` is the *only* +place in the build that branches on `uname`, and it selects exactly one file +per axis: + +``` + common.c every host (libc-pure floor) + posix.c / windows.c shared POSIX scaffold | whole Win32 surface + posix_dbg.c, jit_tls_posix.c POSIX dbg + TLS | (folded into windows.c) + macos.c | linux.c | freebsd.c per-OS hooks (one) + icache_{arm,x86,riscv}.c per-arch icache flush (one) + uctx_<arch>_<os>.c per-(arch,OS) ucontext<->frame marshalling (one) + linux_exec_hint_{x86_64,default}.c per-arch Linux mmap hint (one) +``` + +`env_internal.h` holds the OS-neutral surface (heap/diag singletons, the +arch-only icache hook). `env_posix.h` adds the POSIX-only surface (the +exec_dual alias registry, the `os_*` per-OS hooks, ucontext marshalling, the dbg +interrupt signo). Windows folds everything into one TU because it shares no +POSIX overlap. + +### W^X executable memory: the genuine divergence + +The interesting per-OS work is producing executable memory under a strict +write-xor-execute regime, where the JIT/debugger needs *both* a writable view +and an executable view of the same physical pages: + +- **macOS** (`macos.c`): `mach_vm_remap` makes a second VA aliasing the same + pages — a RW write alias and an RX runtime alias, never a single W+X mapping. +- **Linux** (`linux.c`): `memfd_create` + two `mmap`s of the fd. On x86_64 the + runtime alias is hinted into the low 2 GiB (`MAP_32BIT`, + `linux_exec_hint_x86_64.c`) so direct call/jmp displacements from text reach + without thunks; other arches let the kernel choose (`..._default.c`). +- **FreeBSD** (`freebsd.c`): `memfd_create` (FreeBSD 13+) dual-mapping, same + shape; older versions use `SHM_ANON` via `shm_open(SHM_ANON)`. +- **Windows** (`windows.c`): a pagefile-backed file-mapping mapped twice (RW + + RX). 
+ +When write and runtime aliases differ, the reservation registers itself in the +`exec_dual` registry (`posix.c`) so the debugger's `code_write_begin` can +translate a runtime address into the writable alias. Single-mapping +reservations (write == runtime, the non-exec path) skip the registry and the +debugger falls back to a transient `mprotect`. The arch-correct icache flush +after a code write lives in the `icache_*` TUs (`__builtin___clear_cache` on +arm/riscv; a no-op on coherent x86). + +### Debugger host (posix_dbg.c) + +The POSIX `CfreeDbgOs` runs the debuggee on a worker thread and installs +`sigaction` handlers for `SIGTRAP`/`SIGSEGV`/`SIGBUS`/`SIGILL`/`SIGFPE` plus a +`SIGUSR2` interrupt. On a fault it marshals the `ucontext_t` into a +`CfreeUnwindFrame` (delegating the register layout to the per-(arch,OS) +`uctx_*` TU), hands it to the session's `on_fault`, and writes back any +session-edited register state. A `sigsetjmp`-guarded `memcpy` lets the session +read possibly-bad target memory without crashing the process. Only the +registered worker thread participates; faults on other threads fall through to +the previous handler. Windows mirrors this with vectored exception handling and +`SuspendThread`/`GetThreadContext`. 
+ +## Data flow: a representative `cc` invocation + +``` + main(argv) -> driver_main -> dispatch("cc") -> driver_cc(argc, argv) + driver_env_init(&env) # build host vtables once + parse flags (lib/cflags, lib/target) + lib/runtime: discover support root, ensure libcfree_rt.a for target + lib/hosted: plan crt/libs/includes if linking a hosted exe + lib/lib_resolve: -l/-L -> archive paths + driver_env_to_context(&env) -> CfreeContext + cfree_compiler_new(target, ctx) ; cfree_compile_* ; cfree_link_* + (libcfree calls back through ctx.heap / ctx.file_io / ctx.diag) + return 0 / 1 / 2 +``` + +`run`/`dbg` differ only in also building a `CfreeJitHost` (and, for `dbg`, a +`CfreeDbgHost`) and pumping inputs through `lib/inputs` instead of emitting a +file. The shape — *parse, classify, build a context, call public APIs* — is the +same for every tool. + +## The dist subsystem (cas, pkg) + +`driver/dist/` is a self-contained, vendored implementation of content-addressed +storage and signed-package distribution: BLAKE2b hashing, ed25519 (monocypher) +signing, minisign-format signatures, deflate/lz4 compression, tar bundling, and +the `.cfpkg` manifest. It is vendored so the package pipeline has no runtime +dependency on host crypto/compression libraries; the only host input it takes is +CSPRNG bytes via `driver_random_bytes`. The `cas` and `pkg` CLI shells in +`cmd/` are thin layers over it. See [DISTRIBUTE.md](DISTRIBUTE.md). diff --git a/doc/DWARF.md b/doc/DWARF.md @@ -1,700 +1,401 @@ -# DWARF — implementation plan - -Scope: what it takes for cfree to produce a DWARF-bearing object file -and to read DWARF back out of one. The producer side is `Debug` -(`src/debug/debug.h`) + the MCEmitter line program; the consumer side -is the `cfree_dwarf_*` family (`include/cfree.h:1224-1450`), implemented -by the `src/debug/dwarf_*.c` reader. 
Both sides share `ObjBuilder` as -the carrier — debug bytes are sections, abbrev codes are interned, and -DIE references are section-relative relocations. - -Today the headers are real, the implementations are stubs, and the -W path in `test/cg/run.sh` is staged and waiting for them. The first -case it asserts (`p01_line_one_inst`) is one `set_loc` + one -instruction — the smallest demand surface that exercises the full -producer→consumer round trip. This plan starts there and builds out. - ---- - -## 1. What working DWARF must look like - -Two artifacts — the same one viewed from each end. - -### 1.1 Producer output (object file shape) - -A non-`-O` aarch64-elf64 object compiled with `-g` should carry, at a -minimum: - -``` -.debug_abbrev DWARF 5 abbreviation table -.debug_info one CU; subprogram + scope + variable DIEs -.debug_line line program; rows for every set_loc transition -.debug_line_str file & dir strings for the line program (DW5) -.debug_str strings for .debug_info -.debug_str_offsets DW5 indirection table for .debug_str -.debug_aranges CU pc-range index (kept for gdb fast path) -.debug_loclists location lists for opt'd code (Phase 5) -.debug_rnglists range lists for noncontiguous scopes (Phase 3+) -.eh_frame CFI for unwind (Phase 4) -``` - -DIE shape we commit to in Phase 1–3: +# DWARF debug info + +cfree's debug-info subsystem turns frontend type/variable/line information into +DWARF 5 inside an object file (the producer), and reads DWARF back out of an +object file to answer source-level queries (the consumer). Both halves live +under `src/debug/`, but they share no state types — the on-disk DWARF wire +format is the only contract between them. The producer is driven *into* by the +code generator; the consumer is a standalone reader over an already-parsed +`CfreeObjFile`. This split is what lets `cfree -g` emit DWARF that `cfree +addr2line`, `cfree objdump --dwarf`, and the `dbg` debugger consume through the +same public API. 
``` -DW_TAG_compile_unit (root) - DW_AT_producer "cfree <semver>" - DW_AT_language DW_LANG_C11 (or C17/C23 from CompileOptions) - DW_AT_name TU path (post path-remap) - DW_AT_comp_dir cwd at invocation (post path-remap) - DW_AT_stmt_list .debug_line offset - DW_AT_low_pc 0 - DW_AT_ranges .debug_rnglists offset - DW_AT_addr_base .debug_addr offset (DW5 split disabled — addrs are inline) - DW_TAG_base_type, DW_TAG_pointer_type, DW_TAG_array_type, - DW_TAG_const_type, DW_TAG_volatile_type, DW_TAG_restrict_type, - DW_TAG_typedef, DW_TAG_subroutine_type, - DW_TAG_structure_type/DW_TAG_union_type with DW_TAG_member, - DW_TAG_enumeration_type with DW_TAG_enumerator - DW_TAG_subprogram - DW_AT_name, DW_AT_type, DW_AT_decl_file, DW_AT_decl_line, - DW_AT_low_pc, DW_AT_high_pc (offset form), DW_AT_frame_base - DW_TAG_formal_parameter (per param: name, type, decl_loc, location) - DW_TAG_lexical_block (per scope_begin/end pair) - DW_TAG_variable (per local) + frontend ──► CG API ──► Debug (producer) ─emit─► .debug_* sections + (lang/c) (session) src/debug/debug*.c in CfreeObjFile + │ + ▼ + cfree_dwarf_* (consumer) ◄─open─ src/debug/dwarf_*.c + addr2line / objdump / dbg / emu ``` -What we do not emit (out of scope, at least until called for): +DWARF version is **5 only**, **32-bit DWARF (DWARF32)** length form. The +consumer tolerates and skips DWARF64 and pre-5 units rather than decoding them. +The CFI/`.eh_frame` half of unwinding is produced *elsewhere* — by the +MCEmitter, not by Debug — but consumed here; see §3. -- `DW_TAG_inlined_subroutine` — opt won't synthesize inlines yet. -- `DW_TAG_namespace`, anything C++. -- Split DWARF (`.dwo`, `DW_AT_GNU_dwo_*`). -- `.debug_pubnames` / `.debug_pubtypes` — DW5 deprecated, gdb-index - builds its own. -- `.debug_macro` — pp doesn't feed it yet. 
+See [OBJ.md](OBJ.md) for the section/symbol/relocation substrate, +[CODEGEN.md](CODEGEN.md) for the CG API that drives the producer, +[LINK.md](LINK.md) for how debug sections survive linking and JIT view-merging, +and [DBG.md](DBG.md)/[EMU.md](EMU.md) for the debugger and emulator that consume +the reader. -### 1.2 Consumer surface +--- -`cfree_dwarf_open(CfreeCompiler*, const CfreeObjFile*) → CfreeDebugInfo*` -must answer the queries declared in `include/cfree.h:1252-1450`. The -test/cg W path exercises a strict subset to begin with: +## 1. The SourceManager: the file-id authority -- `cfree_dwarf_addr_to_line` / `cfree_dwarf_line_to_addr` -- `cfree_dwarf_subprogram_at` (and its thin wrapper `_func_at`) -- `cfree_dwarf_var_at` + `cfree_dwarf_loc_read` -- `cfree_dwarf_type_info` + field/enum iters -- `cfree_dwarf_unwind_step` (Phase 4) +`src/core/source.c` owns the mapping from a small integer `file_id` to a source +file's name/path/kind. It is the single authority shared across diagnostics, +dependency (`-M`) output, and DWARF. A `SrcLoc` is `(file_id, line, col)`; +`file_id == 0` is reserved as the null/invalid slot (so `source_new` seeds slot +0 empty and real files start at 1), and `source_file()` returns NULL for it. -`emu/dbg` (`doc/EMU.md` §8) consumes the same API; nothing in the -debugger should reach past `cfree_dwarf_*`. +Files enter via `source_add_file` (a real on-disk path), `source_add_memory` +(an in-memory unit, used heavily by tests so paths are stable across runs), or +`source_add_builtin`. The manager also records `#include` edges +(`source_add_include`) which feed dependency generation, and macro-expansion +pseudo-files. The DWARF producer never invents its own file numbering: it asks +the SourceManager for the path behind a `SrcLoc.file_id` and assigns its own +dense DWARF file index on top (see §2.4). 
This keeps every `file:line` the +compiler reports — in an error message, in a `.d` file, and in `.debug_line` — +referring to the same underlying file identity. --- -## 2. Current state inventory - -### 2.1 Headers — real - -- `src/debug/debug.h` — full producer API (Debug, type DIE builders, - func/scope/var lifecycle, line program, loclist, `debug_emit`). -- `src/debug/c_debug.h` — `c_debug_type(Debug*, TargetABI*, const Type*)` - adapter, with the documented "intern by Type* pointer" contract. -- `include/cfree.h:1224-1450` — full consumer API. - -### 2.2 Implementations — stubs - -- `src/api/stubs.c:93-98` — `debug_new` panics, `debug_emit` and - `debug_free` no-op. No `Debug` ever exists today. -- `src/api/pipeline.c:230-238` — pipeline already calls - `debug_new(c, ob)` when `opts->debug_info` is set, and - `debug_emit(debug)` after codegen. The driver is wired; only the - module behind it is missing. -- `src/api/stubs.c:319-440` — every `cfree_dwarf_*` returns "no DWARF" - / NULL. - -### 2.3 Producer-side wiring — partial - -- `cg_set_loc(CG*, SrcLoc)` — declared in `src/cg/cg.h:159` ("propagates - to CGTarget and Debug") but `cg.c` propagates only to `CGTarget` - today; the Debug fanout is dead until `Debug` exists. -- `CGTarget::set_loc` — every backend stamps the loc on the impl, but - no backend calls into `Debug` yet (correct: that's `cg_set_loc`'s - responsibility, not the backend's). -- MCEmitter — has no notion of `(text_section, offset, SrcLoc)` rows. - Needs a per-emitter `LineProgram` accumulator that flushes into - `Debug` on demand (or that `Debug` polls on `debug_func_pc_range`). -- Aarch64 backend — emits sized text sections (`func_end` calls - `obj_symbol_define` with the function size), so `debug_func_pc_range` - has the bounds it needs. -- `c_debug_type` — declared, not implemented. Needs to walk the - `Type*` chain producing `debug_type_*` calls, with a per-Debug - `Type* → DebugTypeId` cache for interning. 
- -### 2.4 Test surface - -- `test/cg/run.sh:336-359` — W path runs `cg-runner --dwarf-checks NAME - | cg_check_dwarf OBJ`; cases with no directives are silently - skipped, cases with directives are graded by the `line` and - `subprogram` directives implemented in - `test/cg/harness/cg_check_dwarf.c:124-167`. -- `test/cg/harness/cases_p.c:26-38` — `build_p01_line_one_inst` is the - only registered Group P case. Comment block calls out the dependency - chain: Debug, MCEmitter line program, `cfree_dwarf_open`. -- `cg_check_dwarf` uses **only** the public consumer API. It is the - consumer's first real client. +## 2. Producer architecture + +The producer is the `Debug` object. Its public surface is `src/debug/debug.h`; +its state and the wire-format serializer are private to `src/debug/`. + +### 2.1 Who creates and drives Debug + +`Debug` is **driven into, never out of**. `src/debug/` includes `core` and +`obj` but not `src/cg/` or `src/arch/`; the reverse direction (CG using Debug) +is fine. The flow: + +- The backend's `make()` creates the producer when `opts->debug_info` is set, + via `cg_mc_debug_new` (`src/arch/cgtarget.c`), gated by `CFREE_DWARF_ENABLED`. + The same `Debug*` is handed to the `MCEmitter` (for line rows) and captured by + the CG session before any optimizer wrapper. +- The CG session (`src/cg/session.c`) drives function/scope/variable lifecycle + from inside the public CG API entry points, and calls `debug_emit` then + `debug_free` at `cfree_cg_end_obj`. +- The C-type → DWARF-type adapter lives at `src/cg/debug.c` (`api_debug_type`), + not in the language frontend: it lowers a CG type id (`CfreeCgTypeId`) into a + chain of `debug_type_*` calls. Debug itself is language-neutral — it knows + DWARF type *kinds* (base, ptr, array, qualified, typedef, func, record, enum) + but nothing about C. 
+ +Events split by who owns the information: + +| Event | Driver | Producer call | +|---|---|---| +| function begin/end, return type | CG session | `debug_func_begin`/`debug_func_end` | +| params, locals, their storage | CG session at `func_end` | `debug_param`/`debug_local` | +| current source location | CG session on `set_loc` | `debug_set_pending_loc` | +| line rows (offset ↔ loc) | backend per instruction | `debug_emit_row` | +| function PC bounds | backend at finalize | `debug_func_pc_range` | +| types | `api_debug_type` adapter | `debug_type_*` | + +The two-sided event is the **line program**: only the parser/CG side knows the +`SrcLoc`, and only the backend knows the byte offset of an emitted instruction. +So the CG session stashes the latest loc with `debug_set_pending_loc`, and each +backend instruction emitter calls `debug_emit_row(debug, section, offset, loc)` +*after* writing bytes (see the dense `if (mc->debug) debug_emit_row(...)` calls +in `src/arch/*/emit.c`). `debug_line` dedupes a row whose `(section, offset, +loc)` equals the previous one, so a multi-instruction CG op that re-reports the +same loc costs nothing. Rows are accumulated per-function in emit order, so no +sort pass is needed. + +`debug_func_pc_range` records `(text_section, begin_ofs, end_ofs)` against the +currently-open function; function size is `end - begin`. `debug_prune_removed_funcs` +drops functions whose symbol the object layer later marked `removed` (e.g. an +inlined-away or dead function), so stale DIEs and line rows don't ship. + +### 2.2 Producer state shape + +`Debug` (`src/debug/debug_internal.h`) holds: + +- a **file table** — dense `DebugFile[]` plus a `src_file_id → dwarf_idx` map; + each entry caches the path split into interned `dir`/`base` symbols; +- a **type DIE pool** — `DebugType[]` indexed by `DebugTypeId` (1-based; + `DEBUG_TYPE_NONE == 0`). Construction is one-shot per call: building the same + shape twice yields two ids. 
`void` is the one interned singleton. Records and + enums are built incrementally through opaque builder handles + (`debug_type_record_begin`/`_field`/`_end`) so recursive shapes resolve via + the adapter's own cache; +- a **function table** — `DebugFunc[]`, each carrying its symbol, function-type + id, decl loc, PC range, a flat `DebugVarDIE[]` of params+locals, a + `DebugScope[]` tree built from `scope_begin`/`scope_end` pairs (with an + open-scope stack while building), and its function-local `LineRow[]`; +- a **loclist** table — storage for time-varying variable locations + (registered via `debug_loclist_new`/`_add`), wired but not yet serialized. + +Variable location is a tagged `DebugVarLoc`: `DVL_FRAME` (frame offset), +`DVL_REG` (DWARF register number), `DVL_GLOBAL` (symbol), or `DVL_LOCLIST`. + +### 2.3 Serialization: debug_emit + +`debug_emit` (`src/debug/debug_emit.c`) linearizes everything into `.debug_*` +sections in one pass over an `EmitCtx`. The helper layers are +`src/debug/debug_abbrev.c` (abbreviation pool, dedup by +`(tag, has_children, attr-list)`, 1-based codes assigned in first-use order) and +`src/debug/debug_form.c` (LEB128 and fixed-width form byte encoders that write +into a `Buf`, independent of any live ObjBuilder section). + +Sections emitted: `.debug_abbrev`, `.debug_info`, `.debug_line`, +`.debug_line_str`, `.debug_str`, `.debug_str_offsets`, `.debug_aranges`, +`.debug_rnglists`. All eight section ids — plus a paired `SK_SECTION` symbol per +section — are created up front, before any payload, because cross-section +references are emitted as relocations that must name their target symbol. + +Key wire-format choices, all centralized in `resolve_abbrevs` and the emit +helpers: + +- **Strings** are interned into `.debug_str`, referenced from `.debug_info` + uniformly as `DW_FORM_strx4` indices through the `.debug_str_offsets` table + (the CU root carries `DW_AT_str_offsets_base`). 
Line-program file/dir paths + live in the separate `.debug_line_str` and are referenced by + `DW_FORM_line_strp`. +- **Intra-CU DIE references** (e.g. `DW_AT_type`) are `DW_FORM_ref4`, + unit-relative. They are forward-resolved: the type's body offset is unknown at + reference time, so a fixup list is recorded and patched after all DIEs are + laid out (`cu_relative = cu_header_size + target_body_offset`). A reference to + `void` (which has no DIE) is written as 0; the consumer reads 0 as void. +- **Addresses** use relocations, never literal values. `DW_AT_low_pc` + (`DW_FORM_addr`) and the line program's `DW_LNE_set_address` emit an + `R_ABS64` against the function symbol. `DW_AT_high_pc` is `DW_FORM_data4` + holding the *function size* (offset form), so it needs no reloc. +- **Address size** is a single value, the target's pointer width + (`c->target.ptr_size`), threaded everywhere an address-sized field appears: + the CU header `address_size` byte, the line-program header, `DW_FORM_addr` + slots, the `DW_OP_addr` operand width, and the `.debug_aranges`/`.debug_rnglists` + payload widths (aranges also aligns each tuple to `2 * address_size` from the + section start). The producer never hardcodes 8; cross-targeting a 32-bit + pointer target would narrow these fields uniformly. +- **Cross-section offsets** (`DW_AT_stmt_list`, `DW_AT_ranges`, + `DW_AT_str_offsets_base`, `debug_abbrev_offset`, `.debug_str_offsets` + entries, `.debug_line` `line_strp` slots, the aranges `debug_info_offset`) + are written with the correct literal value *and* paired with an `R_ABS32` + reloc against the target section's symbol. In a plain `.o` the debug sections + are not laid out (section vaddr 0), so the reloc is a no-op and the literal + stands; under the JIT view-builder (`src/link/link_jit.c`), where several + inputs' debug sections are concatenated, the reloc rebases each offset to its + slot in the merged view. 
This dual-write is the single trick that makes + multi-input DWARF (one CU per input) resolve correctly in-process. + +The CU root DIE carries `DW_AT_producer`, `DW_AT_language` (`DW_LANG_C11`), +`DW_AT_name`/`DW_AT_comp_dir` (the primary file's base/dir, seeded from the +first function's decl site if no file has been referenced yet), `DW_AT_stmt_list`, +`DW_AT_ranges`, and `DW_AT_str_offsets_base`. The subprogram DIE uses a single +abbrev with `DW_AT_type` always present (a void return writes ref4=0). When a +function has no source-level params (e.g. unoptimized prototype-only info), the +emitter falls back to synthesizing `DW_TAG_formal_parameter` children from the +function type's parameter types so the signature is still recoverable. + +`DW_AT_frame_base` on every subprogram is the one-byte exprloc +`{ DW_OP_call_frame_cfa }`. Variable locations become exprlocs per +`DebugVarLoc`: `DW_OP_regN`/`DW_OP_regx` for registers, `DW_OP_fbreg <sleb>` for +frame offsets, `DW_OP_addr` for globals. + +### 2.4 Line program + +The line program (`emit_section_line`) is built program-first (so its byte +length is known before the header). DWARF 5 conventions: **file 0 is the CU +primary file**; `directory_entry_format`/`file_name_entry_format` are fixed +(`DW_LNCT_path` as `line_strp`, plus `DW_LNCT_directory_index` for files); +directories are deduped. `minimum_instruction_length` and +`maximum_operations_per_instruction` come from the arch's `ArchDwarfOps` +(`src/arch/arch.h`) — fixed-width ISAs use their instruction width, x86-64 uses +1 because PC advances are byte-granular. + +Per function with a PC range: emit `DW_LNE_set_address` (relocated against the +function symbol), then for each row advance file/column/PC/line with standard +opcodes (`DW_LNS_set_file`, `set_column`, `advance_pc`/`fixed_advance_pc`, +`advance_line`, `copy`), then advance to the function end and emit +`DW_LNE_end_sequence`. 
No special opcodes are produced, and the only extended opcodes are `DW_LNE_set_address` and `DW_LNE_end_sequence`; +the encoding stays simple and re-decodable. + +`.debug_aranges` (a `(low_pc, length)` per function, kept for fast attach) and +`.debug_rnglists` (one `DW_RLE_start_length` per function) round out the +address-coverage indexes. --- -## 3. Producer pipeline - -### 3.1 Who drives Debug - -Producer events split into three classes by who has the information. -Each class has a different driver — there is no single "the producer" -that calls Debug. - -| Event class | Driver | Source-level info | Storage info | Text-offset info | -|---|---|---|---|---| -| `func_begin/end`, decl loc, types | parser | parser (`Type*`, decl `SrcLoc`) | — | — | -| `param`, `local`, `scope_begin/end` | parser | parser (name, type, scope) | CG (Reg / frame_ofs, returned to parser) | — | -| `set_loc` line rows | parser + backend | parser (`SrcLoc`) | — | backend (`obj_pos`) | -| `func_pc_range` | CG | — | — | backend (`obj_pos` at `func_end`) | - -The parser holds `(cg, debug)` as peers. Anything the parser knows on -its own — declarations, scopes, types — goes from parser straight to -Debug; it does not transit CG. CG and the backend drive Debug only -for events whose information they own. - -**Class 1 — declarations, types, scopes.** The parser is the only -thing that has lexical scopes, decl `SrcLoc`s, and the C `Type*` -chain. CG sees individual ops, not `for`-loop bodies; the backend -sees instructions, not declarations. The parser calls Debug directly: - -```c -parser_decl_local(p, name, type, init) { - Reg r = cg_alloc_local(p->cg, type); /* CG returns storage */ - if (p->debug) { - DebugTypeId tid = c_debug_type(p->debug, p->abi, type); - debug_local(p->debug, name, tid, loc, dvl_from_reg(r)); - } -} -``` - -`debug_func_begin` is the same shape: parser resolves the function's -`Type*` through `c_debug_type` and calls `debug_func_begin` itself -(in addition to whatever it tells CG to do). 
- -**Class 2 — line rows.** The parser knows `SrcLoc`; the backend knows -`obj_pos`. Neither alone can produce a row, so this is the one place -Debug receives events from two sides: - -1. Parser calls `cg_set_loc(cg, loc)` before each statement-level IR - op. CG forwards to `target->set_loc` (already wired) and stashes - the loc on `debug` via `debug_set_pending_loc(debug, loc)`. -2. Each backend instruction emit calls - `debug_emit_row(debug, section_id, offset, pending_loc)` after - writing bytes. Debug appends; rows arrive in text order, no sort - pass needed. - -Granularity is per-instruction, not per-CG-op. A multi-instruction CG -op (e.g. a 64-bit immediate via `MOVZ; MOVK; MOVK; MOVK`) produces -four rows pointing at the same loc. This is correct DWARF: only the -first row sets `is_stmt`, the rest are continuation rows. Debug -deduplicates a row whose `(section, offset, loc)` matches the -previous row, so back-to-back identical events from the parser cost -nothing. The backend doesn't grow a Debug dependency; it grows a -single one-line call against an already-needed `obj_pos`. - -The harness in `test/cg/harness/cases_p.c:33` calls -`target->set_loc` directly. That stays — it's the parser-side half of -Class 2, and the harness is the parser stand-in for these tests. - -**Class 3 — `func_pc_range`.** Function bounds are only known at -`func_end`, after the backend has finalized the function size. CG -holds those bounds and calls `debug_func_pc_range` from inside -`cg_func_end`: - -```c -void cg_func_end(CG* cg) { - u32 end_ofs = obj_pos(cg->ob, cg->cur_text_sec); - cg->target->func_end(cg->target); - if (cg->debug) - debug_func_pc_range(cg->debug, cg->cur_text_sec, - cg->func_begin_ofs, end_ofs); -} -``` - -This is the only class where CG drives Debug. The parser doesn't have -the bounds; the backend doesn't have the Debug handle. 
- -**Emu lifter.** `src/emu/` (see `doc/EMU.md` §8) is a parser-shaped -client for guest code: it calls Debug directly with synthetic -`file_id`s encoding guest PC. Same Class-1 / Class-2 split — the -lifter is the parser, the host backend is the backend. +## 3. CFI / `.eh_frame` — produced outside Debug -### 3.2 What this means for module dependencies +Unwind info is **not** emitted by the Debug producer. The `.eh_frame` section is +synthesized by the MCEmitter (`mc_emit_eh_frame` in `src/arch/mc.c`), driven by +per-arch CFI directives (`cfi_startproc`/`cfi_def_cfa`/`cfi_offset`/`cfi_endproc`) +that the native backends call around their prologues (e.g. +`src/arch/aa64/native.c`). The MCEmitter buffers a per-function FDE of CFI +directives, each tagged with a post-prologue PC offset, and assembles one CIE + +one FDE-per-function at TU finalize. This lives in the codegen path because only +the backend knows the exact prologue shape and PC offsets. -- `src/debug/` does not include `src/cg/` or `src/arch/`. Debug is - driven *into*, never *out of*. -- The backend (`src/arch/*`) gets one new dependency: a single Debug - forward declaration plus `debug_emit_row`. No type DIE, no - declaration API, nothing else. -- CG (`src/cg/cg.c`) calls Debug only for `set_pending_loc` (in - `cg_set_loc`) and `func_pc_range` (in `cg_func_end`). -- Everything else — type construction, params, locals, scopes, - func_begin — is parser → Debug, with CG out of the path. 
- -### 3.3 Module shape - -``` -src/debug/ - debug.h (existing) - c_debug.h (existing) - dwarf_defs.h shared DWARF wire-format constants, no state - debug.c NEW: state, type DIEs, func/scope/var, line program - debug_emit.c NEW: linearize to .debug_* sections in ObjBuilder - debug_abbrev.c NEW: abbrev pool, dedup, encode - debug_form.c NEW: form encoders (LEB128, strx, addrx, sec_offset) - debug_eh.c NEW: .eh_frame CIE+FDE assembler (Phase 4) - c_debug.c NEW: c_debug_type adapter + Type* → DebugTypeId cache - dwarf_internal.h reader-private state - dwarf_open.c cfree_dwarf_open/close, sections, forms, abbrevs - dwarf_die.c DIE walking for subprograms, locals, globals - dwarf_line.c .debug_line decoder and line queries - dwarf_query.c public variable/subprogram query entry points - dwarf_type.c type DIE resolution - dwarf_loc.c location expression and loclist evaluator - dwarf_cfi.c CFI machine and unwind step -``` - -State held by `Debug`: - -``` - Compiler* c; - ObjBuilder* ob; - - /* file table — DWARF file index ←→ SourceManager file_id */ - Vec<u32> file_to_src; /* dwarf_idx → src file_id */ - Map<u32,u32> src_to_file; /* src file_id → dwarf_idx */ - - /* type DIE pool */ - Vec<DebugType> types; /* indexed by DebugTypeId-1 */ - - /* function lifecycle stack (one entry per open func_begin) */ - Vec<DebugFunc> funcs; - - /* line program rows, in (section, offset) order */ - Vec<LineRow> lines; - SrcLoc pending_loc; /* set by debug_set_pending_loc */ - - /* loclists keyed by debug_loclist_new id */ - Vec<LocList> loclists; -``` - -`DebugType` is a tagged record carrying everything the abbrev encoder -needs: kind, name (interned `Sym`), inner ids, byte size, member -list, encoding, etc. - -### 3.4 Line program - -DWARF 5 line program, header-only complications: - -- File 0 is the CU's primary file (the `DW_AT_name` value). - Subsequent file numbers are dense, allocated as `debug_file` is - called. 
-- `directory_entry_format` and `file_name_entry_format` use a fixed - shape: `DW_LNCT_path, DW_FORM_line_strp` and `DW_LNCT_directory_index, - DW_FORM_udata`. Strings live in `.debug_line_str` (separate from - `.debug_str`). -- `minimum_instruction_length = 4`, `maximum_operations_per_instruction - = 1` for aarch64. -- Standard opcodes only — no extension opcodes. `DW_LNS_set_file`, - `set_column`, `negate_stmt`, `advance_pc`, `advance_line`, - `const_add_pc`, special opcodes for compact (advance, line) deltas. -- Address advances are fixed-form `DW_LNE_set_address` followed by - `DW_RELOC_ABS64` against the function symbol — the linker patches - PCs at link time. We do **not** emit `.debug_addr` indirection in - Phase 1; switch to it in Phase 5 if the size cost matters. - -Emit order, per CU: - -``` -header → file_names + dir_names assembled from `file_to_src` -opcodes → emit one DW_LNE_set_address per function entry - (sym = function ObjSymId, addend = 0) - → walk `lines` for that function, advancing PC and line - → DW_LNE_end_sequence at function end -``` - -A canonical output (one func, one line, one inst): - -``` -[header ...] -DW_LNE_set_address &test_main -DW_LNS_advance_line +9 ; from default 1 → 10 -DW_LNS_copy ; emits row (file=0, line=10, addr=&test_main) -DW_LNS_advance_pc 4 -DW_LNE_end_sequence -``` - -This is exactly what `p01_line_one_inst` should produce. - -### 3.5 .debug_info / .debug_abbrev - -Two-pass: - -1. Walk the in-memory DIE tree to assign abbrev codes (dedup by - `(tag, has_children, attr_list)` tuple). Build the abbrev section - in the order codes were assigned. -2. Encode the DIE tree against the abbrev table. Forward references - (e.g. `DW_AT_type` to a type DIE that hasn't been emitted yet) are - resolved by recording an offset table during pass 1. - -Forms we commit to: - -- `DW_FORM_strx1` for short string indices, `DW_FORM_strx4` for the - rest. 
Strings are interned in `.debug_str` via a hash table; offsets - are written into `.debug_str_offsets`, indexed from `DW_AT_str_offsets_base`. -- `DW_FORM_sec_offset` for everything pointing at another debug - section (line, loclists, rnglists). -- `DW_FORM_addr` for `DW_AT_low_pc`, written as a `R_*_ABS64` reloc - against the function symbol, addend = 0. `DW_AT_high_pc` uses - `DW_FORM_data4` and stores `func_size` (i.e. function-relative). -- `DW_FORM_exprloc` for `DW_AT_location` and `DW_AT_frame_base`. -- `DW_FORM_data1/2/4/udata/sdata` per attribute; pick the smallest - fixed form that holds the value. - -### 3.6 Variable locations - -`DebugVarLoc → DW_AT_location` mapping: - -| `DebugVarLocKind` | exprloc bytes | -|---|---| -| `DVL_REG` (reg `n`) | `DW_OP_reg<n>` for n<32, else `DW_OP_regx <n>` | -| `DVL_FRAME` (`ofs`) | `DW_OP_fbreg <sleb128 ofs>` | -| `DVL_GLOBAL` (sym) | `DW_OP_addr <reloc against sym>` | -| `DVL_LOCLIST` (id) | `DW_AT_location DW_FORM_loclistx <idx>` | - -`DW_AT_frame_base` for every subprogram is `DW_OP_call_frame_cfa` — -the CFI machine then defines what the CFA is. This is the cleanest -encoding and matches what gcc/clang emit. - -`Reg` numbering must be the architecture's DWARF register number. Use -`cfree_arch_register_*` (`include/cfree.h:196-206`) as the canonical -mapping; the Debug module asks the `Compiler`'s arch for its kind -once and caches the table. - -### 3.7 .eh_frame (Phase 4) - -One CIE per CU, fixed augmentation: - -``` -CIE - version 1 - augmentation "zR" ; FDE pointer encoding present - code_align 4 ; aarch64 - data_align -8 ; aarch64 - return_register r30 ; LR - augmentation_data [DW_EH_PE_pcrel|DW_EH_PE_sdata4] - initial instructions: DW_CFA_def_cfa r31, 0 ; sp-based, 0 offset -``` - -One FDE per function. Backend-emitted CFI directives (we'll need a -`CGTarget.cfi_*` surface, or piggyback on existing prologue/epilogue -hooks) drive `DW_CFA_advance_loc`, `DW_CFA_def_cfa_offset`, -`DW_CFA_offset`. 
The aarch64 backend's prologue is small and uniform; -the FDE bytes can be templated for "stp x29,x30,[sp,-N]; mov x29,sp" -forms initially and only generalized when the prologue diversifies. +The DWARF *consumer* (`dwarf_cfi.c`) reads this `.eh_frame` for unwinding, so the +two ends still meet at the wire format — just on the codegen side, not the Debug +side. See [CODEGEN.md](CODEGEN.md)/[ARCH.md](ARCH.md) for the producer. --- -## 4. Consumer pipeline +## 4. Consumer architecture -### 4.1 Open - -`cfree_dwarf_open` reads sections by name from the `CfreeObjBuilder` -(`cfree_obj_builder()` from the file). Mandatory: `.debug_abbrev`, -`.debug_info`, `.debug_line`, `.debug_str`, `.debug_line_str`. If any -of these are absent, return NULL. - -The DWARF reader does *not* re-decode object format. It treats the -already-parsed `ObjBuilder` (which holds the raw section bytes via -`obj_section_get`) as its substrate. Cross-section references resolve -by section name + offset. - -State: +The consumer is `CfreeDebugInfo`, opened from a `CfreeObjFile` and answering the +`cfree_dwarf_*` queries declared in `include/cfree/dwarf.h`. It is split by +concern into one file per stage, sharing the private `dwarf_internal.h`. The +reader **never re-decodes the object format**: it asks the obj layer for section +bytes by name and treats them as its substrate. Most state is built lazily on +the first query that needs it. 
``` -struct CfreeDebugInfo { - CfreeCompiler* c; - CfreeObjFile* obj; /* not owned */ - - /* abbrev cache: per-CU, abbrev_code → AbbrevDecl */ - Vec<AbbrevTable> abbrevs; - - /* lazy: built on first query that needs it */ - LineTable* lines; /* (section_idx, offset) → row */ - Vec<Subprogram> subs; /* sorted by low_pc */ - TypeCache types; /* DIE offset → CfreeDwarfType* */ - EhFrame* eh; /* CIE list + FDE index */ -}; + open/abbrev dwarf_open.c sections, byte primitives, abbrev cache, + CU headers, form decoding, DIE iteration + │ + DIE walk dwarf_die.c subprograms, lexical blocks, params/locals, + globals — attribute packing + │ + line prog dwarf_line.c decode .debug_line → row matrix; addr↔line + │ + CFI dwarf_cfi.c .eh_frame machine; cfree_dwarf_unwind_step + │ + loc/type dwarf_loc.c DWARF stack machine; loclist resolution + dwarf_type.c type DIE → CfreeDwarfType (cached) + │ + query dwarf_query.c subprogram_at/named, var_at, vars/param iters, + loc_read + dump dwarf_dump.c structural iterators for objdump --dwarf ``` -### 4.2 Line program decoder - -Walks `.debug_line`, materializing the row matrix once and indexing it -two ways: - -- by PC range (sorted `(low_pc, high_pc, file, line, col)` tuples) for - `addr_to_line`. -- by `(file_norm, line)` → first-matching-PC for `line_to_addr`. - -`file_norm` is the post-path-remap absolute path. Comparison is -byte-equal; the producer is responsible for emitting a single -canonical form. - -### 4.3 DIE walker - -A streaming walker over `.debug_info` keyed off the abbrev table. The -public surface only needs: - -- root CU traversal, -- `DW_TAG_subprogram` collection (for `subprogram_at` / `func_at`), -- `DW_TAG_lexical_block` traversal (for `var_at` scope resolution), -- `DW_TAG_variable` / `DW_TAG_formal_parameter` resolution, -- type DIE following. - -We don't need a general "iterate every DIE" API outside the module. 
- -### 4.4 Loc-expr evaluator - -A small DWARF stack machine, supporting just the ops the producer -emits in §3.6: `DW_OP_reg0..31`, `DW_OP_regx`, `DW_OP_fbreg`, -`DW_OP_addr`, `DW_OP_call_frame_cfa`, plus the arithmetic ops needed -for any future composite locations (`DW_OP_plus_uconst`, -`DW_OP_breg*`, `DW_OP_consts`, `DW_OP_and`, `DW_OP_shr`). - -Composite locations (`DW_OP_piece`) are not in scope until opt -generates them. - -### 4.5 CFI machine (Phase 4) - -`cfree_dwarf_unwind_step` walks `.eh_frame` from the highest-address -end (CIEs first) and runs the FDE program for the FDE whose -`(initial_location, address_range)` covers `frame->pc`. State is -the standard CFI table; output mutates `frame->pc`, `frame->cfa`, and -caller-saved register slots. Returns 0 on a step, 1 at stack bottom -(no caller information, return address register is `0`), nonzero on -decode error. - ---- - -## 5. Test plan - -### 5.1 Group P / W path - -The CORPUS sketch from `test/cg/CORPUS.md` is the producer→consumer -end-to-end test. Each case registers directives that -`cg_check_dwarf` runs through the public consumer API. Failure of any -directive fails the W run for that case. - -Existing today: -- `p01_line_one_inst` — `line p01.c 10` + `subprogram test_main`. - -To register as Phase 1+2 land: -- `p02_line_monotone` — three lines, three directives. -- `p03_line_repeat` — same line on two PCs, one directive (the round - trip is enough). -- `p05_func_pc_range` — `subprogram` directive carries - no inclusive bounds today; add a `pc_range FILE LINE LOW HIGH` - directive once `subprogram_at` returns ranges that we can predict. -- `p07_local_loc` — needs a new directive `var PC NAME EXPECT_KIND - EXPECT_VALUE` that drives `cfree_dwarf_var_at` + `loc_read`. - -The directive grammar in `cg_check_dwarf.c:169-205` is intentionally -small. Extend it as cases require — each new directive is one switch -arm calling one consumer entry. 
- -### 5.2 Self-roundtrip unit - -A small unit test under `test/debug/` that: - -1. spins up an in-memory `ObjBuilder`, -2. drives `Debug` directly (no CG), -3. calls `debug_emit`, -4. opens the result with `cfree_dwarf_open`, -5. asserts every input row, type, and subprogram round-trips. - -This is what catches abbrev/encoding bugs that the W path would -attribute to "the backend emitted nothing for set_loc". - -### 5.3 External validators - -Run two third-party DWARF readers against the same Phase-1 obj as -sanity for the wire format itself: - -- `llvm-dwarfdump --verify` — fails on malformed sections, ambiguous - abbrevs, dangling DIE refs. -- `readelf --debug-dump=info,line,abbrev,aranges` — reference - rendering; hand-diff once per phase. - -Both run under `test/libc/` style: optional, gated by tool -availability (`command -v llvm-dwarfdump`), skipped otherwise. They -are **not** the oracle for any case; the W path is. They exist to -catch wire-format errors that our own consumer would also miss. - ---- - -## 6. Phasing - -Order is chosen so each phase produces a green test the next can rely -on. Each phase ends with a runnable W-path case green. - -### Phase 0 — wiring (≈300 LOC, no DWARF bytes) - -- Remove `unimplemented` from `debug_new`, return a real `Debug`. -- `cg_set_loc` fanout to Debug (`debug_set_pending_loc`). -- Backend op-end fanout: when `cg->debug != NULL`, after each emitted - instruction, call `debug_emit_row(debug, text_section, offset, - pending_loc)`. -- `debug_emit` writes nothing; it just frees state. - -End state: `-g` builds run without panicking; no `.debug_*` sections -yet; W-path stays red. 
- -### Phase 1 — minimal producer - -- `.debug_abbrev`, `.debug_info` with one `DW_TAG_compile_unit` and - one `DW_TAG_subprogram` per `debug_func_begin`, -- `.debug_line` with the line program assembled in §3.4, -- `.debug_str`, `.debug_line_str`, `.debug_str_offsets`, -- `.debug_aranges` (one entry per subprogram), -- relocations against function symbols for low_pc. - -End state: `readelf --debug-dump=line` and `--debug-dump=info` show -sane output; `llvm-dwarfdump --verify` clean. - -### Phase 2 — minimal consumer - -- `cfree_dwarf_open` (real), -- `cfree_dwarf_addr_to_line`, `cfree_dwarf_line_to_addr`, -- `cfree_dwarf_subprogram_at`, `cfree_dwarf_func_at`. - -End state: `p01_line_one_inst/W` green. Add `p02`, `p03`, `p05`. - -### Phase 3 — types, locals, params - -Producer: -- `c_debug_type` adapter (full Type chain → DIE tree), -- `debug_param`, `debug_local`, `debug_scope_begin/end` write - `DW_TAG_formal_parameter`, `DW_TAG_variable`, `DW_TAG_lexical_block`. - -Consumer: -- `cfree_dwarf_var_at`, `cfree_dwarf_vars_at_*`, - `cfree_dwarf_param_iter_*`, -- `cfree_dwarf_type_info`, field/enum iters, -- `cfree_dwarf_loc_read` against a `CfreeJitSession` (regs from - `CfreeUnwindFrame`, frame memory through the JIT session's read). - -End state: `p06`/`p07` directives extended; opt-off dbg can render -`info locals` for an aarch64 binary. - -### Phase 4 — CFI / unwind - -- `.eh_frame` producer (templated FDEs from the aarch64 prologue). -- Consumer CFI machine + `cfree_dwarf_unwind_step`. -- `dbg` backtrace works on a self-built obj. - -### Phase 5 — opt path (loclists) - -- Producer `debug_loclist_new/add` realize as `.debug_loclists`. -- `DW_FORM_loclistx` references on `DW_AT_location`. -- Consumer loc-expr evaluator already handles single-location - exprlocs; loclists are an outer wrapper. - -End state: `-O2` builds keep variables debuggable. - -### Deferred (no phase) - -- `.debug_macro` (preprocessor macros). 
Cheap to add once pp records - edges; nothing depends on it. -- Inlined subprograms. Wait until opt synthesizes inlines. -- Split DWARF, `.debug_pubnames`, GNU index. No client. -- `LSDA` / `.gcc_except_table`. C has no exceptions. +### 4.1 Open, sections, primitives (dwarf_open.c) + +`cfree_dwarf_open` looks up debug sections by name. It is format-aware in only +one place: `dw_find_section` also tries the Mach-O spelling +(`__DWARF,__debug_*`, 16-char truncated, and `__TEXT,__eh_frame`) so one lookup +spans ELF and Mach-O. The mandatory five are `.debug_abbrev`, `.debug_info`, +`.debug_line`, `.debug_str`, `.debug_line_str`; if any is missing, open fails +with `CFREE_NOT_FOUND`. `.debug_str_offsets`, `.debug_addr`, `.debug_loclists`, +`.debug_rnglists`, `.debug_aranges`, and `.eh_frame` are optional. + +This file also holds the bounds-checked byte-stream primitives (`dw_u8`/`u16`/ +`u24`/`u32`/`u64`/`uleb`/`sleb`/`cstr`), the abbrev-table parser and cache +(keyed by abbrev-section offset, shared across CUs that point at the same +table), the CU-header parser (which records each CU's `address_size`), the form +decoder (`dw_read_form`, which resolves `strx`/`strp`/`line_strp` to strings +inline and sizes `DW_FORM_addr` by the CU's `address_size` rather than assuming +8), and the generic DIE reader/skipper. +On truncated input the primitives clamp and return zero rather than crash. All +CUs are parsed eagerly into `d->cus` (`dw_parse_all_cus`, idempotent); each CU's +root DIE is scanned for the base attributes (`str_offsets_base`, `addr_base`, +`stmt_list`, `name`, `comp_dir`) in two passes so that `strx` resolution has its +base before any string attribute is read. + +### 4.2 DIE walk (dwarf_die.c) + +A recursive walker keyed off the abbrev table, run lazily and cached. 
It does +not expose a general "iterate every DIE" surface to queries (that's +`dwarf_dump.c`); instead it collects exactly what the query layer needs: + +- `dw_build_subs` — every `DW_TAG_subprogram`/`DW_TAG_inlined_subroutine`, + indexed by PC range. `decl_file` is resolved through the CU's line-program + file table. +- `dw_build_locals` — for one subprogram, walks its subtree for + `DW_TAG_formal_parameter`/`DW_TAG_variable`, threading lexical-block PC ranges + down so each local carries the `[scope_lo, scope_hi)` it is live in. +- `dw_build_globals` — top-level `DW_TAG_variable` DIEs under each CU root. + +Attribute reads funnel through `read_pack`/`DieAttrPack`, a flat struct that +captures the attributes any consumer cares about (name, low/high pc, type +offset, decl file/line, location block or loclist index, frame base, member +offset, byte/bit size, encoding, array count). `DW_AT_type` is normalized to an +absolute `.debug_info` offset (`ref*` forms are CU-relative; `ref_addr` is +absolute). + +### 4.3 Line program decoder (dwarf_line.c) + +`dw_build_line` runs the DWARF 5 line-number state machine for one CU's +`stmt_list`, materializing a `DwLineRow[]` row matrix. It parses the v5 +directory/file entry formats, then composes a normalized absolute path per file +index (`file_norm`: dir + '/' + path, or the path as-is if already absolute) for +byte-equal matching. DWARF64 and non-5 versions are skipped. + +`cfree_dwarf_addr_to_line` finds the row covering a PC. The subtlety, encoded in +the loop, is sequence boundaries: a row covers `[row.addr, next_row.addr)`, and +an `end_sequence` row closes a sequence rather than covering anything. Without +honoring that, in a multi-CU image (one CU per linked input, abutting in a +single `.text`) an earlier CU would swallow addresses belonging to a later one. 
+`cfree_dwarf_line_to_addr` does the reverse, matching the user's file either +exactly or as a `/`-anchored suffix (so `util.c:42` resolves against an absolute +`file_norm`); it returns `CFREE_AMBIGUOUS` when distinct file paths match, with +`cfree_dwarf_line_to_addr_all` to enumerate candidates. + +### 4.4 Location evaluator and loclists (dwarf_loc.c) + +`dw_eval_expr` is a small DWARF stack machine over a fixed 64-slot stack. It +supports the ops the producer emits plus enough arithmetic for composite forms: +`DW_OP_litN`/`regN`/`bregN`, `addr`, the `constNu/s` family, `dup`/`drop`, +`and`/`or`/`xor`/`plus`/`minus`/`mul`/`shl`/`shr`/`shra`/`plus_uconst`, `regx`, +`bregx`, `fbreg`, `call_frame_cfa`, and `stack_value`. `DW_OP_fbreg` recursively +evaluates the subprogram's `DW_AT_frame_base` (which is `DW_OP_call_frame_cfa`, +so the caller's `frame->cfa` supplies the base). The result is tagged as a +memory address, a register number, or an immediate ("stack value"). + +`dw_loclist_resolve` walks a `.debug_loclists` entry for a `DW_FORM_loclistx` +index and returns the location expression active at a PC. It handles +`offset_pair`, `start_end`, `start_length`, `default_location`, and +`base_address`; the `.debug_addr`-indirected variants are recognized and skipped. + +### 4.5 Type resolution (dwarf_type.c) + +`dw_type_from_die` builds a `CfreeDwarfType` on demand from a DIE offset, cached +by offset. It interns the node *before* recursing into inner/field/element +types, which breaks cycles (a struct containing a pointer to itself). Qualifier +types (`const`/`volatile`/`restrict`) are modeled as transparent wrappers; the +public `cfree_dwarf_type_info` and the field/enum iterators look through typedef +and qualifier layers to the underlying aggregate. Base-type encoding is mapped +to a small public kind enum (bool/sint/uint/float/char). 
+ +### 4.6 CFI unwinder (dwarf_cfi.c) + +`cfree_dwarf_unwind_step` sweeps `.eh_frame`, finds the FDE whose +`(initial_location, range)` covers `frame->pc`, runs the CIE initial +instructions then the FDE program up to `pc`, and computes the caller frame. It +handles the common CFA opcodes (`def_cfa`/`def_cfa_register`/`def_cfa_offset` +and their `_sf` forms, `advance_loc*`, `offset*`, `register`, `undefined`, +`same_value`) and the `zR`-augmentation FDE pointer encodings. It mutates +`frame->cfa` and `frame->pc`; the return address comes from a `register` rule or +is treated as stack-bottom (`CFREE_NOT_FOUND`) when undefined. Recovering +arbitrary callee-saved registers would require CFA-relative memory loads, which +this step does not perform — the debugger supplies a memory provider for variable +reads (`loc_read`), but the unwinder leaves register slots as-is. + +### 4.7 Query and dump surfaces + +`dwarf_query.c` is the PC/name-keyed public API: `cfree_dwarf_subprogram_at` +/`_named`, the thin `cfree_dwarf_func_at`, `cfree_dwarf_var_at` (deepest-scope +first, then params, then globals), the `vars_at` / `param_iter` iterators, and +`cfree_dwarf_loc_read`. `loc_read` is where the consumer reaches outside DWARF: +it takes a `CfreeUnwindFrame` (registers + CFA) and a caller-supplied +`CfreeDwarfReadMemFn`, so register locations resolve from the frame and +frame/global/expr locations resolve through the memory callback. The debugger +backs that callback with the JIT session's memory reader (`driver/cmd/dbg.c`). + +`dwarf_dump.c` is the structural-enumeration API for dumpers (`objdump +--dwarf`): CU, DIE (depth-first across all CUs), DIE-attribute, abbrev, +abbrev-attribute, line-row, and `.debug_str` iterators. These hand back raw +numeric DWARF codes and form classes; symbolic rendering is the dumper's job. +They are thin cursors over the same lazily-built state the query layer uses. --- -## 7. Risks and decisions - -### DWARF 4 vs 5 - -Pick **5**. 
gdb ≥ 10 and lldb ≥ 9 read it; clang ≥ 11 emits it by -default; the format is cleaner (`.debug_line_str`, `loclistx`, -`rnglistx`). The only consumer-side cost is implementing the indirect -form encodings, which we'd want anyway for any non-trivial CU. Don't -support 4; if we ever need to, it's an emit option, not a different -codepath in the consumer. - -### Path remap - -The producer must apply `CfreeCompileOptions.path_map` -(`include/cfree.h:589-592, 604-605`) before any path enters -`.debug_line_str` / `.debug_str`. The remap is "first match wins". The -consumer does *not* apply remaps — paths come back exactly as the -producer wrote them. The W path checks paths byte-equal, so test -cases must register synthetic file ids whose names are stable across -runs (already the case with `source_add_memory`). - -### Reproducibility - -`CfreeCompileOptions.epoch` already gates timestamps elsewhere -(`include/cfree.h:599-603`). DWARF has no required timestamp; the -producer's `DW_AT_producer` should not embed a build time. With -`epoch == 0` (the default) we additionally avoid file-mtime metadata -in any future `.debug_macro` emission. - -### File 0 vs file 1 - -DW5 makes file 0 valid (the CU's primary file). DW4 reserved 0. We -emit 5, so we use file 0 for the CU primary and start -`debug_file`-allocated indices at 1. The line program header's -file/dir entry counts include file 0. - -### .debug_aranges duplication - -Rangelists in `.debug_info` (`DW_AT_ranges`) supersede aranges, but -gdb's fast attach path still uses `.debug_aranges`. Cost is one -section with `(low_pc, length)` per subprogram (Phase 1). Keep it. - -### Backend coupling - -The Debug module must not include any backend headers -(`src/cg/cg.h`, `src/arch/*.h`). It depends on `core` + `obj` + the -arch register-name mapping from `include/cfree.h`. The reverse -direction — CG using Debug — is fine and already in `cg.h`. +## 5. 
Clients -### Consumer / producer separation +- **`cfree addr2line`** (`driver/cmd/addr2line.c`) is the smallest consumer + client: open the object, `cfree_dwarf_open`, then per address call + `cfree_dwarf_addr_to_line` (and `cfree_dwarf_func_at` for `-f`). It supports + `-e/-a/-f/-p/--basenames` and reads addresses from argv or stdin. +- **`cfree objdump --dwarf`** drives the `dwarf_dump.c` structural iterators. +- **`dbg`** (`driver/cmd/dbg.c`, `src/dbg/step.c`) uses + `cfree_dwarf_subprogram_at` for frame naming, `cfree_dwarf_unwind_step` for + backtraces, and `cfree_dwarf_var_at` + `cfree_dwarf_loc_read` for `p name`. + See [DBG.md](DBG.md). +- **`emu`** lifts guest code as a parser-shaped client of the same reader; see + [EMU.md](EMU.md). -The producer and consumer are colocated under `src/debug/`, but they -must not share state types behind the public API. The consumer reads -bytes; the producer writes bytes; the public DWARF wire format is the -only contract between them. `debug/dwarf_defs.h` is the allowed shared -header because it contains only numeric wire-format constants. -Concretely: no `#include "debug/debug.h"` from `dwarf_*.c`, and no -`#include "debug/dwarf_internal.h"` from the producer files. This is -what lets `test/debug/` self-roundtrip catch encoder bugs. +Nothing past these entry points reaches into the reader internals — the +`cfree_dwarf_*` API is the whole contract, which is what lets the producer and +consumer be tested against each other purely through emitted bytes. --- -## 8. 
Pointers - -- W path runner: `test/cg/run.sh:336-359` -- Consumer probe: `test/cg/harness/cg_check_dwarf.c` -- Directive registry: `test/cg/harness/cases.c:504-510` -- First case body: `test/cg/harness/cases_p.c` -- Producer header: `src/debug/debug.h` -- C-type adapter header: `src/debug/c_debug.h` -- Consumer header: `include/cfree.h:1224-1450` -- Stubs to delete: `src/api/stubs.c:93-98` (producer), - `src/api/stubs.c:319-440` (consumer) -- Pipeline integration: `src/api/pipeline.c:230-238` -- Group P CORPUS entries: `test/cg/CORPUS.md` Group P table -- Emu-side use: `doc/EMU.md` §8 (guest-DWARF reader extends `src/debug/`) +Planned work: see doc/plan/DEBUG.md. diff --git a/doc/EMU.md b/doc/EMU.md @@ -1,1048 +1,429 @@ -# cfree emu design +# Emulator + +`cfree emu` is a user-mode emulator for guest ELF executables. It loads a +guest program image into a host-managed address space, then runs it by +JIT-translating one guest basic block at a time into host machine code +through the *same* CG -> MC -> link pipeline the native JIT uses (see +[JIT.md](JIT.md)), caching each translation keyed by guest PC and +dispatching between cached blocks until the guest exits. There is no +interpreter loop over guest opcodes and no separate guest codegen path: +a guest ISA is treated as just another frontend that emits CG. + +The emulator is feature-gated (`CFREE_EMU_ENABLED`). When disabled, the +arch and object-format emu vtables compile to empty stubs +(`src/arch/emu_stubs.c`, `src/obj/emu_stubs.c`) and the public +`cfree_emu_*` calls return `CFREE_UNSUPPORTED`. + +## Why this shape + +The guiding decision is that the emulator owns *process orchestration and +nothing else*. It must not embed ELF parsing, ISA decode/lift, or Linux +ABI semantics inline. Each of those is the domain of an existing registry +(object format, arch, OS), reached only through vtables. 
This keeps +libcfree policy-free — the library describes requests (a syscall, an +unresolved import, a needed shared object) and an embedder or the driver +decides what they mean — and it lets the bulk of the backend (opt, +register allocation, MC emission, linking, JIT execmem) be *reused +unchanged*: a lifted guest block is an ordinary CG function. + +A second decision is that **execution starts from a binary image, not +source**. Loading maps a guest process image; it never builds an +`ObjBuilder`. An `ObjBuilder` appears only *after* the lifter emits CG for +a translated block. The type split is deliberate: + +- object readers / `CfreeObjFile`: inspect binary formats (read-only) +- `EmuLoadedImage` / `EmuProcess`: the live guest process state +- `ObjBuilder` -> `LinkImage` / `CfreeJit`: host code generated per block + +## Footprint: three directories by design + +The emulator deliberately spans three source trees, each behind the +boundary it owns: -This document describes the target design for `cfree emu`: a user-mode -guest executable runner built out of the same registries and pipeline -boundaries used by the rest of cfree. - -This is both the target design and the status note for the current -implementation. Sections marked as target shape describe where the code is -going; sections marked as landed describe behavior already present in the -tree. - -## Current Status - -Landed: - -- `CfreeEmu` lifecycle, block lookup, and dispatch through lifted CG and the - existing JIT/link path. -- Object-format executable loading through `ObjFormatImpl.emu`; ELF maps one - object at a time, records `PT_INTERP`, parses `PT_DYNAMIC` metadata, and - records `PT_TLS`. -- Linux user-mode ABI code lives under `src/os/linux/`, selected by - `src/os/registry.c`. -- RV64 uses shared `ArchDecodeOps` and `ArchEmuOps`; the current emu subset - covers the smoke-test instructions (`addi`, `add`, `auipc`, `ld`, `sd`, - `jalr`, `ecall`). 
-- Runtime helpers use `EmuThread *` as their context. `EmuCPUState` remains - the arch-owned register/trap payload below the thread. -- `EmuAddrSpace` is now a sparse VM model with ordered mappings, unmapped - holes, guard pages, anonymous/file-backed maps, per-page dirty/translated - state, structured fault reporting, and VM operations for map/unmap/protect, - gap search, brk, copy, and destroy. -- Linux `brk`, anonymous/private `mmap`, `munmap`, and `mprotect` route - through the VM API with Linux-style result/errno behavior for the supported - subset. -- The emulator dynamic-loader layer owns dependency loading, ordered - link-map traversal, dynamic symbol lookup, runtime relocation scheduling, - import binding records, TLS module-list construction, and dynamic-loader - policy state. -- Runtime relocation byte application is factored into a neutral helper used - by both the linker and emulator dynamic loader. -- Import bindings can now produce either guest addresses or typed host-native - bridges through generated guest thunk records. The current RV64 bridge - supports the tested integer signatures and stores the declared signature in - the public binding result. -- TLS state is process/thread-owned: loaded `PT_TLS` modules are collected in - `EmuProcess.tls_state`, and each `EmuThread` owns its per-module TLS blocks. -- Signal delivery is layered: runtime helpers emit fault/signal events, and - the Linux/RV64 OS layer builds/restores the guest frame for the current - signal fixtures. - -Still target work: - -- Behind-interface dynamic-loader breadth: multi-level search paths, symbol - versioning, weak/interposition edge cases, RELRO enforcement, lazy binding, - `COPY`, IFUNC/IRELATIVE, init/fini execution, and additional relocation - families. -- Behind-interface TLS breadth: DTV/dynamic lookup helpers, descriptor - variants, additional TLS relocation models, and thread creation. 
-- Behind-interface signal breadth: exact production frame layouts, - blocked/pending behavior, restart behavior, alternate signal stacks, - default dispositions, and nested signals. -- Broader RV64 coverage and additional guest ISAs. - -## Goals - -- Run guest user-mode executables in-process. -- Keep all format, architecture, language, and OS knowledge behind the - registry/vtable system described in `doc/REGISTRY.md`. -- Treat the guest ISA lifter as a frontend: decode guest bytes, emit CG, - then reuse the existing opt, backend, object, link, and JIT pipeline. -- Keep executable loading separate from object building. Loading maps a - guest process image; lifting/codegen produces new host objects. - -## Non-goals - -- Full-system emulation: no privileged ISA, devices, kernel, or MMU. -- Built-in host policy for outside effects. Syscalls, dynamic imports, - filesystem access, clocks, signals, and other outside interactions are - delegated to emulator/embedder-provided bindings. -- Self-modifying code in v1. The first implementation may trap or refuse - writes to translated pages. -- Source-language frontend involvement. `emu` starts from a binary image; - language registries are relevant only because they share the same - registry discipline. - -## Top-level Shape - -`emu` owns process-level orchestration. It should not own ELF, Mach-O, -PE/COFF, RISC-V, AArch64, or Linux semantics directly. - -The expected flow is: - -```text -guest executable bytes - -> obj-format executable loader - -> EmuProcess { EmuImage, EmuAddrSpace, initial CPU state } - -> arch decoder: bytes at guest PC -> CfreeDecodedInsn[] - -> arch lifter: CfreeDecodedInsn[] -> CG function - -> optional opt - -> backend/JIT - -> dispatch loop and runtime helpers -``` - -The important type split is: - -- `ObjFile` / object readers: inspect binary formats. -- `EmuImage`: loaded executable image and metadata. -- `EmuProcess`: address space, threads, CPU state, OS ABI state. 
-- `ObjBuilder`: generated host object from lifted guest code. -- `LinkImage` / `CfreeJit`: resolved/generated host code. - -Executable loading does not produce an `ObjBuilder`. `ObjBuilder` appears -only after the lifter emits CG for a translated guest block. - -## Registry Integration - -The registry direction in `doc/REGISTRY.md` should extend naturally to -`emu`. - -### Object Format Vtable - -Object/image format implementations own executable file parsing and load -commands/program headers. `emu` asks the selected object format to load a -process image. - -Implementation location: - -- ELF loader lives with the ELF object/image code. -- Mach-O loader lives with Mach-O object/image code. -- PE/COFF loader lives with PE/COFF object/image code. - -Sketch: - -```c -typedef struct EmuLoadOptions { - CfreeSlice name; - CfreeSlice bytes; - const char *const *argv; - const char *const *envp; - const CfreeOsImpl *os; -} EmuLoadOptions; - -typedef struct EmuLoadedImage { - CfreeTarget guest_target; - uint64_t entry_pc; - uint64_t phdr_vaddr; - uint64_t base_vaddr; - EmuAddrSpace *addr_space; - EmuImageSymbols symbols; - EmuImageDebug debug; -} EmuLoadedImage; - -typedef struct ObjFormatEmuOps { - CfreeStatus (*detect_executable)(CfreeCompiler *, CfreeSlice bytes, - CfreeTarget *target_out); - CfreeStatus (*load_executable)(CfreeCompiler *, const EmuLoadOptions *, - EmuLoadedImage *out); -} ObjFormatEmuOps; - -typedef struct ObjFormatImpl { - /* Existing object-format operations from doc/REGISTRY.md. */ - CfreeObjFmt kind; - const char *name; - /* ... object read/emit, DSO read, link-image emit ... */ - - const ObjFormatEmuOps *emu; -} ObjFormatImpl; -``` - -The loader maps segments into an `EmuAddrSpace` and records metadata. -It does not construct source-like objects and it does not lower code. - -For ELF, loading is program-header driven. 
Sections may be used for -symbols and debug information, but PT_LOAD, PT_INTERP, TLS, dynamic -tables, and auxv data are the executable-loading contract. - -### OS Vtable - -`emu` needs OS-specific user ABI behavior that does not belong in an -architecture or object format module. - -The core C calling-convention ABI is already derived by -`src/abi/registry.c` from `(CfreeArchKind, CfreeObjFmt)`. The OS vtable -does not select `ABIVtable`s. It owns user-mode process conventions: -initial stack shape, syscall register ABI, errno/restart behavior, -signal frames, TLS process setup, and dynamic-loader policy. - -Implementation location: - -- Linux behavior lives with the Linux OS implementation. -- Darwin behavior lives with the Darwin OS implementation. -- Windows behavior lives with the Windows OS implementation. - -Sketch: - -```c -typedef struct CfreeOsImpl { - CfreeOSKind kind; - const char *name; - - CfreeStatus (*emu_init_process)(CfreeCompiler *, EmuProcess *, - const EmuLoadOptions *, - const EmuLoadedImage *); - CfreeStatus (*emu_init_thread)(CfreeCompiler *, EmuProcess *, - EmuThread *); - CfreeStatus (*emu_decode_syscall)(EmuProcess *, EmuThread *, - EmuSyscallRequest *out); - CfreeStatus (*emu_encode_syscall_result)(EmuProcess *, EmuThread *, - const EmuSyscallResult *); - CfreeStatus (*emu_deliver_signal)(EmuProcess *, EmuThread *, - const EmuSignalEvent *); -} CfreeOsImpl; - -const CfreeOsImpl *os_lookup(CfreeOSKind); -``` - -The OS vtable owns: - -- initial stack shape: argv, envp, auxv, alignment; -- syscall number table and argument/result ABI decoding; -- errno, signal-frame, and signal-delivery ABI conventions; -- TLS and dynamic-loader process conventions; -- OS-specific pages such as vdso-like regions, if supported. - -It does not issue host syscalls, open files, read clocks, resolve host -symbols, or deliver host signals directly. It translates between guest OS -ABI state and emulator-level requests. 
The emulator/embedder bindings -decide what those requests mean. - -This preserves the `doc/REGISTRY.md` split: C ABI selection remains a -derived ABI-registry lookup, while guest OS behavior has its own explicit -registry home. - -### Outside Interaction Bindings - -All interaction outside the emulated process is delegated to bindings -provided by the embedding application or by the `cfree emu` driver. This -keeps libcfree policy-free: the library describes requests, the embedder -chooses whether to service, virtualize, deny, record, or replay them. - -Sketch: - -```c -typedef struct EmuSyscallRequest { - uint64_t number; - uint64_t args[6]; -} EmuSyscallRequest; - -typedef struct EmuSyscallResult { - int64_t result; - int32_t guest_errno; - uint32_t flags; -} EmuSyscallResult; - -typedef struct EmuImportRequest { - CfreeSlice object_name; - CfreeSlice symbol_name; - uint32_t bind_flags; -} EmuImportRequest; - -typedef struct EmuExternalBindings { - CfreeStatus (*syscall)(void *user, EmuProcess *, EmuThread *, - const EmuSyscallRequest *, - EmuSyscallResult *out); - CfreeStatus (*resolve_import)(void *user, EmuProcess *, - const EmuImportRequest *, - EmuResolvedImport *out); - CfreeStatus (*signal)(void *user, EmuProcess *, EmuThread *, - const EmuSignalEvent *); - CfreeStatus (*trace)(void *user, EmuProcess *, const EmuTraceEvent *); - void *user; -} EmuExternalBindings; -``` - -The OS vtable is still required for syscalls because it knows how to read -and write guest registers, errno, restart behavior, and signal frames. -But after decoding a syscall into `EmuSyscallRequest`, it calls the -binding rather than a host syscall API. - -Dynamic linking follows the same rule. The object-format loader and OS -dynamic-linker support parse imports, relocations, PLT/GOT conventions, -and loader metadata; binding `resolve_import` decides what a requested -external symbol resolves to. 
A binding may point to a native host -function, an emulated thunk, a synthetic guest address, or a denial -stub. The core emulator should not hard-code libc, filesystem, network, -or host process behavior. - -### Architecture Vtable - -Architecture implementations own guest instruction knowledge. `emu` -should not have a private decoder per ISA. - -Implementation location: - -- `src/arch/rv64/` owns RV64 decode, formatting, lifter hooks, and - CPU-state schema. -- `src/arch/aa64/` owns AArch64 equivalents. -- `src/arch/x64/` can add guest support later without changing `emu`. - -Sketch: - -```c -typedef enum CfreeDecodeFlag { - CFREE_DECODE_TERMINATOR = 1u << 0, - CFREE_DECODE_BRANCH = 1u << 1, - CFREE_DECODE_CALL = 1u << 2, - CFREE_DECODE_RET = 1u << 3, - CFREE_DECODE_MEMORY = 1u << 4, - CFREE_DECODE_TRAP = 1u << 5, -} CfreeDecodeFlag; - -typedef enum CfreeDecodedOperandKind { - CFREE_DECOP_NONE, - CFREE_DECOP_REG, - CFREE_DECOP_IMM, - CFREE_DECOP_MEM, - CFREE_DECOP_PCREL, - CFREE_DECOP_SYSREG, -} CfreeDecodedOperandKind; - -typedef struct CfreeDecodedOperand { - uint8_t kind; - uint8_t width_bits; - uint16_t flags; - uint32_t reg; - uint32_t index_reg; - int64_t imm; - uint8_t scale; -} CfreeDecodedOperand; - -typedef struct CfreeDecodedInsn { - uint64_t pc; - const uint8_t *bytes; - uint8_t nbytes; - uint8_t noperands; - uint16_t flags; - uint32_t opcode; /* Arch-owned stable opcode enum. */ - uint32_t encoding_id; /* Optional row/table id for formatting. */ - CfreeDecodedOperand operands[CFREE_DECODE_MAX_OPERANDS]; - uint64_t arch[2]; /* Small arch-private payload. 
*/ -} CfreeDecodedInsn; - -typedef struct ArchInsnFormatter ArchInsnFormatter; - -typedef struct ArchDecodeOps { - uint8_t min_insn_len; - uint8_t max_insn_len; - - CfreeStatus (*decode_one)(CfreeCompiler *, const uint8_t *bytes, - size_t len, uint64_t pc, - CfreeDecodedInsn *out); - CfreeStatus (*decode_block)(CfreeCompiler *, const uint8_t *bytes, - size_t len, uint64_t pc, - CfreeDecodedInsn *out, uint32_t cap, - uint32_t *n_out); - - ArchInsnFormatter *(*formatter_new)(CfreeCompiler *); - CfreeStatus (*format)(ArchInsnFormatter *, const CfreeDecodedInsn *, - CfreeInsn *out); - void (*formatter_free)(ArchInsnFormatter *); -} ArchDecodeOps; - -typedef struct ArchEmuOps { - CfreeStatus (*cpu_layout)(CfreeCompiler *, EmuCpuLayout *out); - CfreeStatus (*init_cpu)(CfreeCompiler *, EmuThread *, - const EmuLoadedImage *); - CfreeStatus (*lift_block)(CfreeCompiler *, CfreeCg *, - const CfreeDecodedInsn *, uint32_t n, - const EmuLiftContext *); -} ArchEmuOps; - -typedef struct ArchImpl { - /* Existing arch registry fields. */ - /* ... */ - - const ArchDecodeOps *decode; - const ArchEmuOps *emu; -} ArchImpl; -``` - -`ArchDecodeOps` is a general arch service. It is useful to `objdump`, -`dbg`, and `emu`. `ArchEmuOps` is optional; an architecture can support -disassembly and native codegen without supporting guest emulation. - -### Language Registry - -Language frontends are not on the `emu` hot path. The only design rule is -symmetry: `emu` should follow the same registry discipline as languages, -not add parallel hard-coded switches. 
- -## Decoder And Disassembler Split - -The current public disassembler shape is text-oriented: - -```c -typedef struct CfreeInsn { - uint64_t vaddr; - const uint8_t *bytes; - uint32_t nbytes; - CfreeSlice mnemonic; - CfreeSlice operands; - CfreeSlice annotation; -} CfreeInsn; -``` - -The current internal arch hook also decodes directly into that text -record: - -```c -struct ArchDisasm { - u32 (*decode)(ArchDisasm *, const u8 *bytes, size_t len, u64 vaddr, - CfreeInsn *out); - void (*destroy)(ArchDisasm *); -}; -``` - -That is sufficient for `objdump`, but it is the wrong interface for -`emu`. A lifter needs operand identity, widths, immediates, memory -addressing, flags behavior, and terminator classification. It should -never parse disassembly text. - -Target shape: - -```text -bytes -> ArchDecodeOps.decode_one -> CfreeDecodedInsn - -> ArchDecodeOps.format -> CfreeInsn text - -> ArchEmuOps.lift_block -> CG - -> ArchDbgOps -> breakpoint/displaced stepping helpers -``` - -The disassembler iterator remains public and can continue returning -`CfreeInsn`. Internally, it becomes: - -```text -decode_one(..., &decoded) -format(formatter, &decoded, &public_text_insn) -apply object annotations -``` - -This keeps the public disassembler API stable while moving the reusable -decode contract into `arch/`. - -The formatter object exists because `CfreeInsn` contains string slices. -As today, those slices are valid until the next formatter call or until -the formatter is freed. - -### Operand Policy - -The decoded instruction type should be format-neutral but not force all -architectures into an artificial common ISA. The contract is: - -- `opcode` is arch-owned. RV64, AArch64, and x64 each define their own - stable opcode enums. -- `operands[]` uses common storage for common shapes: registers, - immediates, PC-relative targets, memory references, and system - registers. 
-- `arch[]` carries small arch-private fields such as condition codes, - rounding modes, CSR IDs, or x86 prefix state. -- Larger arch-private decode payloads can live behind an arena-backed - side table if needed; the common fast path should remain inline and - fixed-size. - -The lifter switches on the arch-owned opcode. `objdump` formats using -the same `CfreeDecodedInsn` plus the arch's formatter. `dbg` can either -reuse `CfreeDecodedInsn` directly or keep a narrow `ArchDbgInsn` wrapper -implemented on top of the shared decoder. - -### Block Decode - -`decode_block` is a convenience over `decode_one`, not a separate source -of truth. It repeatedly decodes instructions until one of these happens: - -- decoded instruction has `CFREE_DECODE_TERMINATOR`; -- output capacity is reached; -- decode fails; -- a page boundary or mapped-range boundary blocks a complete instruction. - -The dispatcher uses this to form translation units. The debugger and -disassembler can continue to decode one instruction at a time. - -## Process Model - -`EmuProcess` is the root context for emulation. All mutable state hangs -off it or below it. - -```c -typedef struct EmuProcess { - CfreeCompiler *compiler; - const ObjFormatImpl *obj_format; - const ArchImpl *guest_arch; - const CfreeOsImpl *guest_os; - - EmuAddrSpace *addr_space; - EmuLoadedImage image; - EmuRuntime runtime; - - EmuThread *threads; - uint32_t nthreads; -} EmuProcess; ``` - -`EmuThread` owns one guest CPU state: - -```c -typedef struct EmuThread { - EmuProcess *process; - void *cpu_state; - uint64_t pc; - uint32_t trap; - int exit_code; -} EmuThread; -``` - -No global process state is required. Registries are static immutable -tables; process state is explicit. - -## Address Space - -`EmuAddrSpace` is the only module that translates guest virtual -addresses to host pointers. 
- -Responsibilities: - -- map executable segments with guest permissions; -- provide checked read/write/fetch helpers; -- maintain page permissions and dirty/translated-page metadata; -- expose a bounded host pointer for decode and runtime memory helpers; -- report faults in a target-independent form. - -The lifter should not inline host pointer arithmetic initially. It emits -calls to runtime helpers: - -```c -uint8_t emu_load8 (EmuThread *, uint64_t guest_addr); -uint16_t emu_load16(EmuThread *, uint64_t guest_addr); -uint32_t emu_load32(EmuThread *, uint64_t guest_addr); -uint64_t emu_load64(EmuThread *, uint64_t guest_addr); -void emu_store8 (...); +src/emu/ process orchestration, lifecycle, dispatch, address + space, code cache, runtime helpers, dynamic loader, + CPUState, TLS, fault routing +src/os/ guest-OS personality registry + per-OS impls + (Linux is the only one today) +src/obj/elf/emu_load.c the guest ELF image loader (ObjFormatImpl.emu) ``` -An inline memory fast path can be added later by making the helper ABI -and address-space invariants explicit first. - -## Lifting +Plus per-ISA decode/lift under `src/arch/<arch>/` (only `rv64` ships a +real `ArchEmuOps`). The boundary, not the file count, is the invariant: +format code maps files, arch code decodes/lifts instructions, OS code +models the user ABI, and `src/emu` coordinates execution. -The lifter emits one CG function per guest basic block. +## Top-level data flow -```c -uint64_t emu_block_<guest_pc>(EmuThread *thread); ``` - -The return value is normally the next guest PC. Traps and exits are -recorded in the thread/process state so the dispatcher can stop without -requiring every block to encode a large result struct. - -The lifter may use: - -- function begin/end and symbols; -- integer and floating-point arithmetic; -- loads/stores of CPU-state fields; -- branches and labels; -- calls to runtime helpers; -- source locations that encode guest PCs for debug views. 
- -The lifter should not use: - -- object format APIs; -- linker APIs; -- executable memory APIs; -- disassembly strings; -- host OS syscall APIs directly. - -Architecture-specific lowering belongs in `ArchEmuOps.lift_block`. -Shared helpers for CPU field addressing, helper calls, and block -function construction belong in `src/emu/lift/`. - -## JIT Engine - -`emu` uses the normal CG/backend/JIT pipeline: - -```text -decoded block -> CG -> optional opt -> ObjBuilder -> link -> CfreeJit -``` - -The existing link/JIT APIs should do most of the work once the lifter -emits valid CG. `emu` still owns: - -- guest PC to host entry cache; -- dispatcher; -- runtime helper resolver; -- trap/exit handling; -- code invalidation policy; -- block publication strategy. - -Two publication strategies are acceptable: - -- per-block object publication into a long-lived JIT session; -- a single growing `LinkImage`, if the linker keeps stable host - addresses across extension. - -The first implementation should choose the simpler strategy that fits -the current JIT API. The design requirement is stable lookup and -eventual invalidation, not a specific linker-internal shape. - -## Runtime Helpers - -Runtime helpers live in `src/emu/runtime/` and are linked by symbol or -function table into JIT execution. - -They own: - -- memory helpers; -- dispatcher callbacks; -- syscall trampoline that asks the OS vtable to decode/encode guest ABI - state and asks `EmuExternalBindings` to service the request; -- dynamic import trampolines that call `EmuExternalBindings`; -- trap and exit helpers; -- optional tracing hooks. - -Runtime helpers are process-aware: they receive `EmuThread *` or -`EmuProcess *`, not global state. - -## Debugging And Object Views - -Guest debug information belongs to the loaded image. Host debug -information belongs to generated blocks. 
- -The design should support: - -- guest PC source mapping from the guest executable's DWARF; -- generated host block symbols named by guest PC; -- `cfree_jit_view` / objdump of generated host code; -- debugger stepping that maps host PCs back to guest PCs. - -The lifter should stamp CG locations with guest PC information. The JIT -debug view can then relate generated host instructions back to guest -addresses even before source-level guest debugging is complete. - -## Module Ownership - -Proposed ownership: - -```text -src/emu/ - emu.c public API glue and lifecycle - cpu.c arch-owned CPU payload allocation/accessors - image.c loaded-image cleanup and address translation helpers - runtime.c code cache, memory helpers, syscall/import helpers - and tracing - process.c target home for broader EmuProcess / EmuThread ownership - mem.c optional split-out home for sparse guest address space - bindings.c target home for embedder outside-interaction binding glue - lift/common.c target home for arch-independent CG lifter helpers - -src/arch/<arch>/ - decode.c ArchDecodeOps implementation - disasm.c formatter over CfreeDecodedInsn - emu.c ArchEmuOps: CPU layout, init, lift - isa.* shared encoding tables and operand extraction - -src/obj/<format>/ - *_read.c relocatable/shared object reader - *_emit.c relocatable object emitter - *_load.c executable image loader for ObjFormatEmuOps - registry.c ObjFormatImpl entries - -src/os/<os>/ - <os>.c CfreeOsImpl entry and current small ABI slice - syscall.c target home for guest syscall ABI decode/encode - process.c target home for stack, auxv, TLS, signal conventions -``` - -Exact filenames can vary. The boundary should not: format code loads -files, arch code decodes/lifts instructions, OS code models user ABI, -and `emu` coordinates process execution. 
- -## First Slice: Landed Shape - -The first implementation target was a deliberately small vertical slice -that exercised every intended boundary without requiring broad ISA, -loader, OS, or execution-engine coverage. That slice has landed and has -been extended just enough to cover dynamic ELF metadata, imports, TLS, page -permissions, sparse VM behavior, Linux VM syscalls, and minimal signal -delivery. - -Current tested slice: - -- guest arch: RV64; -- guest OS: Linux; -- object format: ELF; -- executable shape: ELF64 little-endian `ET_EXEC`, static plus the small - dynamic fixture shape (`PT_INTERP`, `PT_DYNAMIC`, `DT_NEEDED`, - `R_RISCV_JUMP_SLOT`, `PT_TLS`); -- execution engine: lifted CG through the existing JIT path; -- outside interaction: `EmuExternalBindings.syscall` and the public - `resolve_import` surface; the default import path currently handles a - synthetic guest-callable no-op thunk for the test import; -- guest behavior: `_start` exits with a code through `ecall`, can read the - initial TLS word through `tp`, and can redirect an RX-page write fault to - a registered SIGSEGV handler. -- guest memory: executable segments, import thunks, stack, stack guard, brk, - anonymous mmap, mprotect, and munmap all flow through `EmuAddrSpace`. - -The original acceptance program was: - -```asm -addi a0, zero, 42 # exit code -addi a7, zero, 93 # Linux rv64 SYS_exit -ecall -``` - -`SYS_exit_group` (`a7 = 94`) is accepted too. - -Landed implementation surface: - -- `ObjFormatImpl(ELF).emu` - - Detect ELF64 little-endian `EM_RISCV`. - - Accept `ET_EXEC` with `PT_LOAD` segments. - - Map loadable segments into `EmuAddrSpace` and apply page permissions. - - Set entry, initial stack, program-header metadata, interpreter path, - dynamic import metadata, and TLS metadata. - - Create a main executable `EmuLoadedObject` / `EmuLinkMap` record and - parse the dynamic table into `EmuDynInfo` for later DSO relocation work. 
- - Patch the current RV64 `JUMP_SLOT` fixture through a reserved guest - import-thunk range. -- `CfreeOsImpl(Linux)` - - Initialize one process and one thread. - - Provide a valid aligned stack pointer with argv/envp/auxv. - - Initialize `tp` from `PT_TLS` for the current local-exec-style fixture. - - Decode RV64 Linux syscall ABI: number in `a7`, arguments in `a0-a5`. - - Encode syscall results in `a0`. - - Implement the supported VM syscall subset through `EmuAddrSpace`: - `brk`, anonymous/private `mmap`, `munmap`, and `mprotect`. - - Record `rt_sigaction` handlers for the current SIGSEGV delivery path. -- `EmuAddrSpace` - - Own sparse ordered maps with explicit page size. - - Represent anonymous, file-backed, and guard mappings. - - Track permissions, dirty pages, translated pages, and structured faults. - - Provide map, unmap, protect, gap search, brk, copy, checked pointer, and - invalidation APIs. -- `EmuExternalBindings` - - Implement `syscall`. - - Recognize syscall `93` and set the emulated exit state from `a0`. - - Recognize syscall `94`. - - Return a deterministic unsupported result for every other syscall. - - Expose `resolve_import` for dynamic import policy, though the current - default test path does not require host-native import calls. -- `ArchDecodeOps(RV64)` - - Decode the current smoke subset: `ADDI`, `ADD`, `AUIPC`, `LD`, `SD`, - `JALR`, `ECALL`. - - Mark control/trap instructions as terminators. - - Provide a formatter over the same `CfreeDecodedInsn` records so the - disassembler path uses the shared decoder shape. -- `ArchEmuOps(RV64)` - - Define a minimal CPU state with `x[32]`, `pc`, trap state, and exit - code. - - Initialize `pc` from `EmuLoadedImage.entry_pc` and `x2` from the - initial stack pointer. - - Lift the current smoke subset to CG. -- JIT engine - - Compile the lifted block through the existing CG, opt, object, link, - and JIT APIs. - - Execute the lifted block function with an `EmuThread *`. 
- - Route runtime helper calls through the same helper/binding table that - later execution engines can reuse. - -Acceptance criteria: - -- The in-memory ELF loads through the ELF object-format vtable. -- Process/thread initialization goes through the Linux OS vtable. -- Instruction decode goes through `ArchDecodeOps(RV64)`. -- The disassembler formatter can format the same decoded instructions. -- The lifter goes through `ArchEmuOps(RV64)` and emits CG. -- The existing CG/JIT path materializes and runs the lifted block. -- The exit syscall goes through `EmuExternalBindings.syscall`, not a - built-in host syscall. -- The static, dynamic-import/TLS, host-import, DSO-import/reloc, - distinct-TLS, signal/perms, and signal/`sigreturn` fixtures exit cleanly. - -This slice still intentionally excludes libc startup, broad dynamic-loader -policy, broad relocation and TLS families, exact production signal semantics, -broad file I/O, clocks, host-backed syscall coverage, and broad ISA coverage. -Those features should extend the same boundaries rather than introduce new -ones. - -## Remaining Architecture Work - -The core architectural split is now in place: executable loading is behind -object-format hooks, guest OS behavior is behind `CfreeOsImpl`, guest ISA -decode/lift is behind arch hooks, outside effects route through bindings, and -guest virtual memory is owned by `EmuAddrSpace`. Remaining work is primarily -coverage and deeper semantics within those boundaries, not a new top-level -architecture. - -The main remaining coverage and semantic-completeness areas are: - -1. General dynamic loading breadth: deeper dependency search/order, symbol - versioning, weak/interposition policy, RELRO, init/fini execution, and - interpreter modeling. -2. Broader dynamic relocation coverage and factoring shared byte patching out - of linker-internal ownership. -3. Broader import bridges: declared signatures, more ABI adapters, lazy PLT - resolver state, and data imports. -4. 
Full TLS relocation models on top of the loader's module list. -5. Production signal semantics: exact frame layouts, masks/pending sets, - restart behavior, alternate stacks, and nested delivery. -6. Broader Linux syscall semantics, RV64 instruction coverage, and additional - guest ISAs. - -Each remaining slice should continue to land red-green with narrow `test-emu` -fixtures first, then one hosted smoke binary only after the architecture for -that behavior is in place. - -### Dynamic Loading And Relocations - -`EmuLoadedImage` now has a process `EmuLinkMap` with the main executable and -binding-supplied DSOs for the minimal tested path. `EmuDynInfo` records parsed -dynamic-table metadata, and the loader applies the RV64 `RELATIVE`, 64-bit, -and `JUMP_SLOT` relocations needed by the current fixtures. The next step is -to broaden that model into production dynamic-loader policy and relocation -coverage. - -The ELF image loader owns program-header parsing and segment mapping for one -ELF object. The OS dynamic-loader layer owns dependency order, object search -policy, initial link-map construction, relocation scheduling, TLS allocation, -init/fini ordering, and auxv values such as `AT_BASE` when an interpreter is -modeled. - -The broader dynamic-loader pass should support: - -- loading the main executable and named DSOs supplied by a binding; -- `ET_DYN` load-bias assignment into sparse VM gaps; -- `PT_LOAD`, `PT_DYNAMIC`, `PT_TLS`, `PT_GNU_RELRO`, and `PT_INTERP` - metadata across all loaded objects; -- `DT_NEEDED`, `DT_STRTAB`, `DT_SYMTAB`, `DT_HASH`/`DT_GNU_HASH`, - `DT_RELA*`, `DT_JMPREL`, `DT_PLTGOT`, init/fini arrays, and symbol version - metadata parsed enough to reject unsupported cases cleanly; -- breadth-first dependency loading in link-map order; -- explicit symbol lookup policy over ordered scopes. - -Relocation handling should be data-driven by object format plus architecture, -with a sharp split between loader work and byte-patching work. 
The normal -linker and JIT linker already know how to patch relocation encodings once the -final values are known; the emulator should reuse that relocation-apply -primitive, not the `LinkImage` layout/resolution machinery. - -The emulator dynamic loader owns the runtime-only work: - -- parse `DT_RELA*` / `DT_JMPREL` records from already linked objects; -- map ELF relocation type numbers to existing `RelocKind` values through the - object-format/arch relocation tables; -- compute the mapped patch address `P` from object load bias plus relocation - offset; -- resolve `S` through the emulated link map, import bindings, weak rules, TLS - module state, or loader-specific relocation semantics; -- obtain writable patch bytes through `EmuAddrSpace`, respecting permissions, - RELRO, dirty tracking, and code-cache invalidation; -- call the shared relocation byte patcher with final `RelocKind`, `P_bytes`, - `S`, `A`, and `P`. - -The shared relocation apply function should therefore be factored out of the -linker-specific internal surface into a neutral internal module. The linker -continues to feed it `LinkRelocApply` records after laying out a new output -image; the emulator feeds it runtime relocation records after mapping an input -image. Both paths share encoding semantics for relocations such as PC-relative -branches, ADRP/LO12 pairs, absolute writes, `GLOB_DAT`, `JUMP_SLOT`, and TLS -offset encodings. 
- -Some relocation kinds remain loader semantics around the shared patcher rather -than plain byte encodings: - -- `RELATIVE`: compute `S` from the object's load bias, then patch normally; -- `GLOB_DAT` / `JUMP_SLOT`: perform symbol/import binding first, then patch - the GOT/PLT slot normally; -- `COPY`: copy data from a DSO definition into the main executable, or reject - with a specific unsupported diagnostic until data interposition is modeled; -- TLS families: compute module IDs, thread-pointer offsets, DTV entries, or - descriptors in the TLS layer, then use the shared patcher for the final - encoding when applicable; -- IFUNC/IRELATIVE: execute or deny resolver policy in the loader before - patching the resolved target. - -Minimum relocation set for the first full ELF/RV64 pass: - -- `RELATIVE`; -- `GLOB_DAT`; -- `JUMP_SLOT`; -- `COPY` rejected with a specific unsupported diagnostic unless main-exe data - interposition is implemented in the same slice; -- TLS relocation families deferred until the TLS slice below, but parsed and - rejected by name rather than falling through. - -Symbol lookup policy should be explicit and testable: - -- lookup starts in the requesting object's scope list; -- global/default visibility participates, local/hidden does not; -- weak undefined resolves to zero when absent; -- strong undefined without a binding result is a load failure; -- main executable interposition and `LD_PRELOAD`-like bindings are represented - as ordered scope entries, even if the driver does not expose preload flags - immediately. - -### Import Binding - -The current implementation has import records that describe what was bound and -how calls cross the guest/host boundary for the minimal eager-binding path. -Unresolved legacy fixture imports can still fall back to a no-op guest thunk so -old smoke coverage remains deterministic. 
- -There are two valid binding results: - -- a guest address inside an emulated object or generated guest-callable thunk; -- a native host function plus an ABI adapter selected by guest architecture, - guest OS ABI, and the binding's declared signature. - -The public `resolve_import` callback should remain policy-only: it decides -whether an object/symbol may resolve and what it resolves to. It should not -patch GOT slots directly. Loader/runtime code owns GOT/PLT writes, -lazy/eager binding state, and generated thunks. - -Target shape: - -```c -typedef struct EmuImportBinding { - uint32_t object_id; - uint32_t symbol_index; - uint64_t got_vaddr; - uint64_t plt_vaddr; - uint64_t resolved_guest_addr; - void *resolved_host_fn; - uint32_t flags; -} EmuImportBinding; +guest bytes + -> cfree_detect_fmt + ObjFormatImpl.emu->detect_executable (target) + -> ObjFormatImpl.emu->load_executable -> EmuLoadedImage (image) + -> CfreeOsImpl.emu_init_process / _thread (stack, ABI) + -> ArchEmuOps.cpu_new (+ attach addr space, set PC/SP, set tp) + | + v dispatch loop (cfree_emu_step): + read guest PC + -> code cache hit? -- yes --> call cached host block + -- no --> translate_block: + decode_block (one BB) + -> lift_block -> CG function + -> opt -> ObjBuilder + -> link session (JIT output) -> CfreeJit + -> cache (guest_pc -> host entry) + call host block -> returns next guest PC + inspect CPUState trap: EXIT stops the loop, FAULT panics ``` -For eager binding, `GLOB_DAT` and `JUMP_SLOT` write the resolved guest -address immediately. For lazy binding, PLT entries route to an arch-owned -resolver trampoline that calls a runtime helper, resolves once, patches the -GOT slot, and returns the final target. - -Native host calls require bindings that provide or imply a signature the -adapter can marshal. The current RV64 adapter supports the tested integer -signatures; broader host calls should extend that adapter behind the same -descriptor. Do not guess libc signatures inside libcfree. 
The default driver -can provide a small allowlist for smoke tests later; the library API stays -explicit. - -Test order: - -- guest-to-guest DSO function import through `JUMP_SLOT`; -- data import through `GLOB_DAT`; -- missing weak import resolves to zero; -- missing strong import fails load; -- host-native import through a declared integer adapter. - -### TLS Model - -TLS setup has moved from "set RV64 `tp` to the PT_TLS vaddr" to a per-thread -TLS allocation for the initial `PT_TLS` image. The remaining work is to apply -the same model across loaded TLS modules and the broader TLS relocation -families. - -The model needs: - -- one TLS module ID per loaded object with `PT_TLS`; -- static TLS image allocation for the initial executable and startup DSOs; -- per-thread copy of `.tdata` plus zeroed `.tbss`; -- target-specific thread-pointer layout, selected by OS/arch ABI; -- dynamic TLS blocks for modules that are loaded after thread creation; -- DTV-like bookkeeping where the target ABI requires it; -- relocation support for local-exec, initial-exec, local-dynamic, - general-dynamic, and TLS descriptor variants as they become reachable. - -Do this in layers: - -1. Static TLS for initial objects, with ABI-correct `tp` and local-exec / - initial-exec relocations. -2. Dynamic TLS lookup helper used by general-dynamic/local-dynamic models. -3. TLS descriptors and lazy descriptor patching where the target ABI uses - them. -4. Thread creation support only after the single-thread model has correct - per-thread data structures. - -The architecture backend already emits several TLS relocation forms for link -outputs. The emulator loader should share relocation constants and ABI layout -helpers where possible, but it should patch mapped guest memory through -`EmuAddrSpace`, not through linker section buffers. 
- -### Signal Frames And `sigreturn` - -The current write-fault-to-handler shortcut has been replaced for the tested -path with signal delivery that builds a guest frame on the guest stack and -lets `rt_sigreturn` restore the interrupted register state. The remaining work -is exact Linux/RV64 frame compatibility and broader signal semantics. - -Target responsibilities: - -- `rt_sigaction`: store handler, flags, restorer, mask, and ABI-specific - action layout; -- `rt_sigprocmask`: maintain per-thread blocked/pending signal sets; -- faults from memory/fetch/decode carry `EmuSignalEvent` with signal number, - code, fault address, and fault PC; -- `emu_deliver_signal` chooses action/default behavior and writes a real guest - frame; -- the thread register state is saved in the frame and replaced with handler - entry state; -- `rt_sigreturn` validates/restores the frame and resumes the interrupted PC; -- `SA_ONSTACK`, `SA_RESTORER`, `SA_SIGINFO`, default dispositions, and nested - signals are represented, even if some return `CFREE_UNSUPPORTED` initially. - -The signal frame layout is OS/arch-owned. The emulator core should only route -fault events and stop/continue decisions. - -Test order: - -- SIGSEGV handler with restorer returns to the faulting program path through - `rt_sigreturn`; -- handler sees `siginfo.si_addr` for a protection fault; -- blocked SIGSEGV causes default termination/fault instead of handler entry; -- alternate signal stack is used when `SA_ONSTACK` is set. - -## Implementation Order - -The architectural split through sparse VM, Linux VM syscalls, dynamic-loader -ownership, neutral relocation byte patching, typed import bridge descriptors, -process/thread TLS ownership, and OS-owned signal frame construction has -landed. - -Remaining work is behind-interface broadening: - -1. Extend dynamic-loader policy coverage: multi-level dependency search, - version/interposition rules, RELRO, lazy binding, init/fini, and - interpreter behavior. -2. 
Add broader relocation families, including `COPY`, IFUNC/IRELATIVE, data - imports, and architecture-specific TLS relocations. -3. Broaden generated guest/native bridges beyond the tested integer - signatures. -4. Implement DTV/dynamic TLS lookup, descriptors, additional TLS models, and - thread creation. -5. Complete production signal semantics: exact frame layouts, masks/pending - sets, restart behavior, alternate signal stacks, default dispositions, and - nested signals. -6. Expand ISA coverage and binding-backed syscall/import coverage with - differential tests against known guest binaries. - -Each step should be testable in isolation. Loader work can be tested by -inspecting `EmuLoadedImage` / `EmuLinkMap`; relocation work can be tested by -mapping small synthetic DSOs; lifter work can continue to use tiny -in-memory RV64 ELF fixtures through the existing JIT path. +Public surface is `cfree_emu_run` (load + run to completion) and the +finer-grained `cfree_emu_new` / `cfree_emu_step` / `cfree_emu_lookup` / +`cfree_emu_free` in `include/cfree/emu.h`. The driver entry is +`driver/cmd/emu.c`, which turns a path into bytes, marshals argv/envp, +wires a `CfreeJitHost` (execmem + TLS), and reports the guest exit code. + +## CfreeEmu lifecycle + +`src/emu/emu.c` owns `CfreeEmu` and the translate/dispatch loop. At +construction (`cfree_emu_new`) it: + +1. **Resolves config** (`emu_resolve_config`): detect the binary format, + look up `ObjFormatImpl` and require an `emu` vtable; determine the + guest `CfreeTarget` (caller-supplied or `detect_executable`, which + accepts a main `ET_EXEC` image); look up the `ArchImpl` and require its + decode + emu hooks; look up the `CfreeOsImpl`. Any missing piece is + `CFREE_UNSUPPORTED`. +2. **Wires bindings**: public `CfreeEmuExternalBindings` (syscall / + resolve_import / resolve_object) are adapted into the internal + `EmuExternalBindings` shape via small thunks. 
When no syscall binding + is supplied, the OS's `emu_default_syscall` is used directly, so the + driver gets working Linux semantics out of the box. +3. **Initializes OS process/thread private state**, then calls the + object format's `load_executable`, then `emu_init_process` (stack, + auxv, brk, dynamic loading) and per-thread `emu_init_thread` (TLS, + thread pointer). +4. **Allocates CPUState** via `ArchEmuOps.cpu_new`, seeds PC/SP, and + attaches the address space. + +### Error and fault discipline + +The emulator distinguishes two failure axes and routes each through a +single boundary. *Host-side build failures* (out of memory, an +unsupported guest, a lift or link error) use the compiler's +panic/longjmp mechanism: `cfree_emu_new` and `cfree_emu_step` each wrap +their body in a `compiler_panic_save` / `setjmp` frame, so any +`compiler_panic` inside unwinds to the boundary, runs the registered +cleanups (tearing down a partially built emu or a half-translated block), +restores the prior panic frame, and returns a status. A code-cache *hit* +short-circuits this boundary entirely, so the hot dispatch path pays no +setjmp cost. + +*Guest-side faults* are data, not control flow: a block records an +`EmuTrap*` reason in CPUState and returns normally. The dispatcher reads +the trap after the call — `EMU_TRAP_EXIT` stops the loop with an exit +code, and an `EMU_TRAP_FAULT` that no OS personality converted into a +signal frame is escalated into a host panic at the boundary. A guest +decode failure surfaces as a translate miss and the same panic. The +invariant is that no guest condition ever longjmps out of guest code; +only the host build/escalation paths use the unwind boundary. + +The whole emulator is allocated off the borrowed `Compiler`'s heap and +hangs off `CfreeEmu`; there is no global state. 
The `CfreeJitHost` +(execmem allocator + TLS support) is *borrowed* and must outlive the emu — +without one, runs surface `CFREE_UNSUPPORTED`, since cold blocks need +executable memory. + +`CfreeEmu` carries two execution strategies (see below): the default JIT +path stores host code entries in the cache; the optional INTERP path +stores `CfreeInterpFunc*` and runs blocks through the IR interpreter. + +## Translation and dispatch + +`cfree_emu_step(e, nblocks)` runs up to `nblocks` guest basic blocks. For +each iteration it reads the guest PC, looks it up (`cfree_emu_lookup`), +calls the resulting host block, sets the next PC from the block's return +value, and inspects the CPUState trap reason: + +- `EMU_TRAP_EXIT` -> mark done, capture exit code, stop +- `EMU_TRAP_FAULT` -> panic (a fault not converted into a signal frame) +- otherwise continue to the next block + +`cfree_emu_lookup` is the cold-miss path. A cache hit short-circuits even +the panic boundary. On a miss `translate_block` runs: + +- `decode_block` decodes a single basic block: it walks instructions until + it hits a terminator, the per-block cap (`EMU_MAX_INSTS_PER_BLOCK`), a + decode failure, or a mapped-range boundary, reading through a + bounds-checked `EMU_MEM_EXEC` host pointer. +- A fresh `ObjBuilder` and CG function are created. The block is one host + function of the block signature above, with a guest-PC-derived symbol + name (`emu_block_<16-hex-pc>` — fixed width, the full 64-bit guest PC in + hex). The encoding is a bijection on guest PCs: two translations of the + same PC hash to the same symbol, and distinct PCs hash to distinct + symbols within the linker's global pool. +- `ArchEmuOps.lift_block` emits the body. The block returns the next guest + PC; traps/exits are recorded in CPUState, not the return value, so the + dispatcher observes them after the call. 
+- CG is finalized, the object built, and a one-shot link session (output + kind JIT) links it with `emu_runtime_extern_resolver` wired in. The + block entry is looked up by symbol in the resulting `CfreeJit`. + +**Each cold block is published as its own standalone one-block JIT image.** +The image is retained in the emu's `jits` vector (so its executable memory +stays mapped for the emu's lifetime) and the block entry is inserted into +the code cache. There is no cross-image relocation between blocks; control +flows from block to block only through the dispatcher and the next-PC +return value. + +### The code cache and invalidation + +`src/emu/runtime.c` holds the cache: an open-addressed, linear-probe hash +from guest PC to host entry, grown by doubling, never evicted, created +lazily on first lookup (and requiring a wired JIT host). + +Self-modifying / dynamically-patched guest code is handled through a +**generation counter** on the address space. Writes to translated pages, +dynamic relocations, and explicit invalidation all bump the generation +and clear the per-page "translated" bit. `cfree_emu_lookup` compares the +cache's recorded generation against the address space's current +generation; on mismatch it drops the entire cache and re-translates on +demand. This is coarse but correct: stale host code is never executed. + +### Block chaining (design intent) + +The shipping strategy is deliberately the simple one — every cold block is +its own one-block JIT image, and all inter-block control flows through the +dispatcher's next-PC return. This keeps invalidation trivial (drop the cache +entries; the images stay mapped harmlessly) and reuses the JIT path +unchanged, at the cost of a dispatcher round-trip per block edge. 
+ +A faster strategy is anticipated but not wired into the live loop: +`runtime.c` defines `EmuCodeRegion` (an up-front `PROT_NONE` reservation +with write/runtime dual-aliasing and a monotonic RX high-water mark) and a +`__emu_dispatch` cross-block helper symbol. The intended shape is to +bump-allocate translated blocks into one growing RX image and patch +direct jumps between them in place (block chaining), falling back to +`__emu_dispatch` for edges whose target is not yet translated. That would +require incremental relocation into a shared image; the present design +keeps blocks relocation-isolated so that the generation-counter +invalidation can remain whole-cache. + +## Address-space mapping (image.c) + +`EmuAddrSpace` (`src/emu/image.c`) is the *only* module that translates +guest virtual addresses to host pointers. It is a sparse VM model: an +ordered array of `EmuMap` regions with unmapped holes between them. There +is no flat guest-base + offset; the host owns the storage for each map +separately (heap-allocated `bytes`), so guest VAs need not be host-mapped +at the same address. + +Each map records `[start,end)`, permissions (`EMU_MEM_READ/WRITE/EXEC`), +a kind (anonymous, file-backed, or guard), and per-page `dirty` and +`translated` bitmaps. 
Responsibilities: + +- map / unmap / protect with page-granular splitting and merging + (unmap and protect carve a region out of the middle of a map by + re-appending the unaffected head/tail pieces) +- a checked pointer accessor (`emu_addr_space_ptr`) that validates the + range lies in one map and has the needed permissions, records a + structured `EmuMemFault` (unmapped vs protection) on failure, and marks + written pages dirty — flipping any previously-translated page back to + untranslated and bumping the generation +- gap search for placing new maps, `brk` growth/shrink, copy-in for + loader/stack setup, and explicit invalidation + +`emu_cpu_attach_addr_space` lets the CPUState borrow the address space so +runtime helpers can translate without threading the process pointer. + +## Runtime, helpers, and the extern resolver (runtime.c) + +The runtime is **in-process**: there is no separate runtime object file to +link. Lifted blocks call helper functions by referencing undefined extern +symbols (`EMU_SYM_*`, e.g. `__emu_load64`, `__emu_store32`, +`__emu_syscall`, `__emu_cpu_state`). At link time the linker calls +`emu_runtime_extern_resolver`, which maps each name to the host address of +the matching C function (or, for `__emu_cpu_state`, to the running emu's +CPUState pointer). Unrecognized shared names fall through to the arch's +`resolve_runtime_helper` hook, so a backend can register its own +arch-private helpers (RV64 registers its register-file and `jalr` helpers this way). +Anything still unresolved becomes the linker's ordinary +undefined-symbol diagnostic. + +Memory helpers come in two flavors. The plain `emu_mem_loadN` set +bounds-checks against the CPUState's window and, on miss, writes +`EMU_TRAP_FAULT` and returns zero — the dispatcher then stops. 
The +*checked* variants (`emu_mem_loadN_checked`, all stores) take the faulting +PC and the fall-through next PC and, on a fault, route through +`emu_fault_deliver` so an OS personality can convert the fault into a +guest signal frame and hand back a resume PC — this is how a guest SIGSEGV +handler runs instead of the process dying. + +The syscall trampoline (`emu_syscall` / `emu_syscall_next`) is purely a +marshaller: it asks the OS vtable to **decode** guest registers into an +`EmuSyscallRequest`, calls the wired `bindings.syscall` to service it, and +asks the OS to **encode** the result back into the guest return register. +The runtime never issues a host syscall itself. + +## Guest CPUState (cpu.c) + +`EmuCPUState` (`src/emu/cpu.c`) is the per-thread guest register/trap +record. The core keeps it deliberately thin: PC, trap reason, exit code, a +borrowed address-space pointer, and an *opaque arch-private blob* +(`arch_state`) sized and owned by the backend +(`emu_cpu_new_with_arch_state`). The core never interprets the register +file; the arch's helpers (`get_gpr`/`set_gpr`, `get_sp`/`set_sp`, +`get_tp`/`set_tp`, syscall arg/result accessors) do. Trap state +(`EMU_TRAP_EXIT` / `EMU_TRAP_FAULT`) is how a block tells the dispatcher +to stop. + +`cpu.c` also defines the CG types the lifter needs: `EmuThread*` (modeled +as `void*`) and the block signature, which uses `i64` as the return type in +the CG type system (`emu_block_fn_type`). The direct-call typedef in +`emu.c` (`u64 (*)(EmuThread*)`) is the unsigned spelling of the same thing; +the dispatch loop treats the return as a 64-bit machine word (the next +guest PC) and never depends on its sign. Lifted blocks take the thread +pointer as their one parameter and reach guest registers and memory only +through helper calls — they hold no inline state. 
+ +## Dynamic loader and relocation (dl.c) + +`src/emu/dl.c` owns the runtime-only dynamic-linking work, sitting above +the object-format `emu` vtable which supplies all format-specific parsing. +After the main object is mapped, `emu_dl_load_dependencies_and_relocate`: + +1. **Loads needed objects**: for each object, iterates `DT_NEEDED` + entries; an entry not already in the link map is fetched via the + `resolve_object` binding (so the embedder controls the search) and + mapped with the format's `map_object` (an `ET_DYN` gets a load bias + assigned into a VM gap). +2. **Rebuilds TLS modules** across the new link map (see TLS). +3. **Applies relocations** for every object's main and PLT tables. Each + relocation is classified by the format/arch into a neutral class + (relative, symbolic, or import-slot) plus a `RelocKind`; the symbol + value `S` is resolved through the link map first, then via the + `resolve_import` binding. The final bytes are patched in mapped guest + memory through a checked writable pointer, and the patched range is + invalidated so any cached translation of that page is dropped. + +Symbol value resolution and relocation **byte patching are shared with the +linker**: `emu_apply_reloc_bytes` is a thin wrapper over the neutral +`link_reloc_apply` (`src/obj/reloc_apply.c`), so PC-relative, absolute, +GOT-slot, etc. encodings are computed identically whether the linker is +laying out a new image or the emulator is patching an input image at +runtime. See [LINK.md](LINK.md) and [OBJ.md](OBJ.md). + +**Import binding / thunks.** An import resolving to a guest address binds +directly. One resolving to a *native host function* gets a guest import +thunk: `emu_dl_init_process` reserves a small executable guest VA range; +each host-backed import is assigned a slot, the arch emits a thunk there +(`emit_import_thunk`), and an `EmuImportBinding` records the host function +plus a typed signature. 
When the guest calls into the thunk range, the +arch's call helper detects it (`emu_dl_resolve_import_thunk`, a +base+size range check that returns the matching `EmuImportBinding`). +`emu_call_host_import` then marshals the call: it reads the guest argument +registers per the arch's syscall/call ABI into a small `u64` array, casts +the recorded host pointer to a function type chosen by the binding's +`EmuValue` signature, invokes it, and writes the result back into the +guest return register. Arguments are passed as raw 64-bit words; the +signature is the contract that fixes arity and which words are live. +Marshalling is intentionally narrow — host imports are limited to a small +fixed argument count (integer/pointer words today), and a call that +exceeds it is `CFREE_UNSUPPORTED` rather than a guessed ABI. A binding +with no declared signature defaults to a single-`u64` shape. The +`resolve_import` binding is policy-only — it decides *whether* and *to +what* a symbol resolves; the loader owns GOT/thunk writes and the +marshalling contract. + +## Guest TLS (tls.c) + +`src/emu/tls.c` models thread-local storage as a process-owned module list +plus per-thread blocks. `emu_tls_rebuild_modules` assigns a module ID to +each loaded object that has a `PT_TLS` segment and accumulates the static +TLS size/alignment. Per-thread allocation lives in the OS layer +(`linux_init_thread`): for each module it maps an anonymous block, copies +the module's `.tdata` image in (`.tbss` stays zero), records an +`EmuTlsBlock`, and for the initial module sets the guest thread pointer +via the arch's `set_tp`. The TLS image bytes are read and written through +`EmuAddrSpace` like any other guest memory, never through linker buffers. + +## Guest-OS personality (src/os) + +The guest OS is a pluggable registry. `src/os/registry.c` maps a +`CfreeOSKind` to a `CfreeOsImpl` vtable; Linux (`src/os/linux/linux.c`) is +the only implementation. 
The OS owns everything about the *user-mode +process convention* that is neither arch nor object-format specific: + +- **process/thread private state** (the Linux impl stores the mmap hint, + the per-signal action table, the per-thread signal mask and TLS blocks) +- **initial stack layout**: argv/envp copy-in, the `AT_RANDOM` block, the + aux vector, 16-byte alignment, and the `argc`/argv/envp/auxv table on + the guest stack; it also sizes stack, guard page, and brk reserve and + kicks off dynamic loading +- **syscall ABI and default table**: decode the number and six args from + guest registers, service them, and encode the result back, plus a + per-syscall `next_pc` hook (so `rt_sigreturn` resumes at the restored PC + rather than past the trap). The bundled table covers process exit, the + core file/io and memory (`brk`, anonymous `mmap`/`munmap`/`mprotect` + through the VM API with Linux errno semantics), clock, identity, and + signal calls — enough to run a static Linux program. +- **signal delivery**: `rt_sigaction` records handler/flags/restorer, + `rt_sigprocmask` maintains the blocked mask; faults arrive via + `emu_deliver_fault`, which builds a guest signal frame on the stack + (saving the interrupted context through arch hooks), redirects to the + handler, and `rt_sigreturn` restores the interrupted state. Blocked or + unhandled signals fall back to a process fault. +- **map-region placement** (`emu_find_map_region` / `emu_note_map_region`): + where anonymous maps, DL thunks, and TLS blocks land in the VA space. + +The OS layer translates between guest ABI state and emulator-level +requests; it does not perform host I/O or resolve host symbols. The C +*calling-convention* ABI is separate, derived from `(arch, obj-format)` by +`src/abi/registry.c`; the OS vtable does not pick ABI vtables. 
+ +## Guest ELF loader (src/obj/elf/emu_load.c) + +The ELF loader implements `ObjFormatImpl.emu` (`ObjFormatEmuOps`, +`elf_emu_ops`), handling ELF64 little-endian executable loading via program +headers. It parses the header directly rather than going through +`elf_read.c` — the reader builds an `ObjBuilder` for the linker, which is +the wrong output here. It maps `ET_EXEC` for the main object and `ET_DYN` +for dependencies (assigned a load bias into a VM gap), extracting the +dynamic, TLS, and interpreter metadata that the rest of the pipeline needs. +The interface backs the dynamic loader's needed-entry iteration, dynamic +symbol lookup (by name and index), relocation iteration, and relocation +classification (ELF type numbers -> neutral classes + `RelocKind`). The +program-header types and dynamic tables are the loading contract; the +per-object format-private blob is `EmuElfDynInfo`. + +## Architecture vtable + +A guest ISA plugs in via `ArchEmuOps` (`src/arch/arch.h`) on top of the +shared `ArchDecodeOps` decoder (also used by `objdump` and `dbg`, see +[ARCH.md](ARCH.md)). The emu ops provide CPUState construction, the CG +types for the block function, the lifter (`lift_block`), register/SP/TP +accessors, the syscall-register ABI, signal-context save/restore, and the +import-thunk emit/size. `rv64` is the implemented backend; its lifter +emits one CG function per block that returns the next guest PC and calls +the shared `__emu_*` helpers plus its own register-file helpers, with +control-transfer and `ecall` instructions acting as block terminators. + +## Optional INTERP execution + +The same lifted-block path can run through the cfree IR interpreter +instead of host code (`CFREE_EMU_MODE_INTERP`, requires +`CFREE_INTERP_ENABLED`; forces at least `-O1` so the optimizer's PReg-path +IR is available for capture). The mode is chosen once at construction, not +per block. + +The two strategies differ only in what the code cache stores. 
In JIT mode +the payload is a host code entry, invoked directly through the +`EmuBlockFn` typedef. In INTERP mode the block is *still* linked as a JIT +image — that link is what resolves and validates the helper externs and +proves the block is well-formed — but the cache payload is the captured +`CfreeInterpFunc*`, and dispatch runs it on a per-emu interpreter program +and stack, seeding the thread pointer as the single argument and shuttling +back the next-PC return. Either way guest registers and memory are reached +only through the same `__emu_*` helpers, so the interpreter holds no guest +state and the lifter is identical across modes. A block the interpreter +cannot capture or run is a hard failure with a reason, never a silent +fallback to the JIT payload. See [INTERPRETER.md](INTERPRETER.md). diff --git a/doc/FRONTENDS.md b/doc/FRONTENDS.md @@ -0,0 +1,306 @@ +# Frontends + +cfree turns several source languages into machine code through one narrow +contract. A *frontend* is the front half of the compiler: it reads bytes of a +single translation unit and drives the public code-generation API +(see [CODEGEN.md](CODEGEN.md)) to populate a relocatable object. Everything +behind codegen — IR, optimization, register allocation, object emission — is +language-agnostic. This document covers the frontend model, the C frontend +pipeline, and the smaller toy and wasm frontends. For testing, see +[TESTING.md](TESTING.md). + +cfree ships four frontends: C (the real one), asm (lives inside the codegen +substrate), toy (a CG-API exercise vehicle), and wasm (WAT/wasm lowering). + + +## The frontend contract + +Every frontend implements one vtable, `CfreeFrontendVTable` +(`include/cfree/compile.h`). 
The contract is deliberately tiny: + +``` + new_frontend(CfreeCompiler*) -> opaque CfreeFrontendState* + compile(state, opts, input, out) -> CfreeStatus, populates an ObjBuilder + free_frontend(state) + extensions / nextensions -> file extensions this lang claims + commit / abort -> optional, for durable REPL state +``` + +The state object is opaque to libcfree: the frontend allocates it from the +compiler's heap in `new_frontend`, threads its own context through it, and +frees it in `free_frontend`. `compile` is handed a `CfreeSourceInput` (name, +bytes, language, and an input-kind that distinguishes a whole translation unit +from a REPL top-level / expression / block) and an output `CfreeObjBuilder` +to fill. The frontend opens a `CfreeCg`, emits into it, and the object builder +captures the result. This keeps each language a thin client of the public CG +and object APIs ([INTERFACES.md](INTERFACES.md)) with no privileged access to +internals. + +Two design choices fall out of this boundary: + +- **Errors unwind, they don't return codes mid-stream.** Frontends report + fatal problems by panicking (`compiler_panic` / `cfree_frontend_fatal`), + which longjmps to a `setjmp` landing pad that runs registered cleanups. + `cfree_frontend_run` (`src/api/frontend.c`) is the public shim that + establishes that boundary for standalone frontend helpers (preprocess, + token-dump) without exposing libcfree's panic machinery. Ordinary + diagnostics are non-fatal; only invariant breaks unwind. + +- **Durable cross-compile state is transactional.** Most frontends are + one-shot: one `compile` call, one object, no state carried forward + (`commit`/`abort` are NULL for C, asm, and wasm). The toy frontend is the + exception — its REPL accumulates declarations across snippets — so the + vtable carries optional `commit`/`abort` hooks. 
A compile *stages* new + declarations; the compile session then commits them on full success or + aborts (also fired automatically on a panic via the cleanup stack) to + restore the pre-compile state. The session API + (`cfree_compile_session_compile` = stage + auto-commit; + `cfree_compile_session_stage` + explicit commit/abort) lets a REPL gate the + commit on the *whole* compile -> link -> publish chain, not just compile, + since publish can reject a clean compile (duplicate global). Both hooks must + be idempotent. + + +## The language registry + +`src/api/lang_registry.c` is the *only* place in libcfree that consults the +`CFREE_LANG_*_ENABLED` build flags. During compiler construction +`lang_registry_init` walks the enabled set and calls +`cfree_register_frontend` to wire each compiled-in vtable into +`c->frontends[CfreeLanguage]`. After that, the public compile and pipeline +paths dispatch purely by `CfreeLanguage` index — no host bootstrap, no +`#ifdef` outside the registry. Third parties can still call +`cfree_register_frontend` to install or override any slot at runtime; the +registry just provides the default wiring. + +`cfree_language_for_path` (`src/api/compile.c`) maps a file path to a language +by extracting the trailing extension and walking every registered frontend's +`extensions` list (case-insensitively, so `.S` and `.s` both hit asm's `"s"`). +The C frontend claims *no* extensions on purpose: an unrecognized extension +falls through to `CFREE_LANG_C` as the default, so listing `c`/`h` would only +duplicate the fallback. asm claims `s`, toy claims `toy`, wasm claims `wat` +and `wasm`. + +``` + path --cfree_language_for_path--> CfreeLanguage + CfreeLanguage --c->frontends[]--> CfreeFrontendVTable + vtable.compile(...) --CfreeCg--> CfreeObjBuilder +``` + + +## The C frontend pipeline + +The C frontend (`lang/c/c.c`, vtable `cfree_c_frontend_vtable`) is a classic +C11 compiler arranged as a streaming pipeline. 
There is no AST: tokens flow +through the preprocessor into a single-pass recursive-descent parser that +drives codegen directly as it goes. The stages: + +``` + bytes + | lexer lang/cpp/lex (C11 6.4 token streaming) + v + tokens + | preprocessor lang/cpp/pp (translation phase 4) + v + preprocessed tokens + | parser lang/c/parse (single-pass recursive descent) + v + type / decl / sem / abi semantic layers (lang/c) + CfreeCg ops -> CfreeObjBuilder +``` + +The lexer and preprocessor live under `lang/cpp/` rather than `lang/c/` +because they are shared C-preprocessor infrastructure: the same code backs the +C frontend, the standalone `cpp` tool, and `cc -E`. The C-specific stages — +parser, type system, declarations, semantics, and ABI lowering — live under +`lang/c/`. + +`c_frontend_compile` is the conductor. It builds the per-compile objects +(token pool, lexer over the input bytes, preprocessor, decl table, `CfreeCg`), +applies preprocessor options (`-I`, `-D`, `-U`) recovered from the +`CfreeCCompileOptions` planted in `language_options`, pushes the input lexer +onto the preprocessor's include stack, then calls `parse_c`. The whole run is +bracketed with metrics scopes so the build can profile each stage. + +### Lexer — lang/cpp/lex + +`lex.c` streams tokens out of a borrowed source buffer per the C11 lexical +grammar (6.4): identifiers, pp-numbers (classified into integer/float), +string and character constants with their L/u/u8/U encoding prefixes, +longest-match punctuators including digraphs, and the `#`/`##` tokens that the +preprocessor needs for directives and pasting. It handles the earlier +translation phases inline: line splicing (phase 2) and comments as whitespace, +with physical newlines surfaced as `TOK_NEWLINE` so the preprocessor can +implement directive-line semantics, and a small directive-context latch so a +header-name after `#include` lexes correctly (6.4.7). 
+ +A deliberate layering decision: **keyword bucketing is deferred to the +parser.** The lexer emits every word as `TOK_IDENT`; it does not know what a +keyword is. This is correct because keyword-ness is a translation-phase-7 +concern — a name that is a macro must expand *before* it can be a keyword, and +the preprocessor traffics in identifiers, not keywords. `parse_c` interns the +C11 keyword spellings into symbols once at startup (`kw_names[]` in +`parse.c`) and recognizes keywords by symbol identity as it consumes tokens. + +### Preprocessor — lang/cpp/pp + +`pp.c` (+ `pp_directive.c`, `pp_expand.c`) implements translation phase 4 and +exposes a pull interface: `pp_next` yields one fully preprocessed token at a +time, with directives consumed and macros expanded. It backs both the C +frontend (the parser pulls tokens from it) and the standalone preprocessor +(`pp_emit_text` reconstructs source text from the token stream for `cpp` / +`cc -E`). + +Internally the preprocessor runs a **token-source stack**. Each source is +either a `Lexer` (the main file or an `#include`d file) or a pre-built token +buffer (a macro expansion in progress). Includes push a new lexer; macro +invocations push an expansion buffer; EOF pops. This is what makes the +include stack and macro rescanning fall out of one mechanism. + +Macro expansion uses the **Prosser hideset** algorithm (the standard's +"nested replacement" / blue-paint rule). Every token in an expansion buffer +carries a hideset: the set of macro names it must not be re-expanded by during +rescan. Function-like expansions compute the result hideset as the invocation +hideset unioned with the just-expanded macro name, which is exactly what stops +infinite recursion through self-referential and mutually-referential macros. +Hidesets are interned into a small table (`pp_expand.c`) and kept sorted for +canonical identity, so identical hidesets share one id. 
+ +`pp_directive.c` owns the rest of phase 4: the `#if` nesting stack and the +preprocessor constant-expression evaluator, `#include` search and file open +(through the compiler's file-IO service), `#line`, `#pragma`, `#error`, +`#embed`, and `#define`/`#undef`. Include edges are recorded +(`pp_add_include_edge` -> the source registry, `include/cfree/source.h`) so +dependency reporting (`cfree_dep_iter_*`) and diagnostics can attribute tokens +to the right file. + +### Parser — lang/c/parse + +`parse_c` (`parse.c`) is a single-pass recursive-descent parser. It pulls +tokens from the preprocessor with one or two tokens of lookahead and emits CG +ops as it recognizes constructs — there is no separate AST or +semantic-analysis pass over a tree. The parser is split by syntactic area: +declarations and the TU driver in `parse.c`, expressions in `parse_expr.c`, +types/declarators in `parse_type.c`, initializers in `parse_init.c`, and +statements in `parse_stmt.c`. + +Because lowering happens inline, the parser maintains a **typed value stack** +that shadows the CG operand stack. Each entry records the C-language type of a +value, value flags (lvalue / modifiable-lvalue / bit-field / register / +null-pointer-constant), and an lvalue auxiliary record. The fields are +`cg_type_stack`, `cg_value_flags`, and `cg_lv_aux` in `struct Parser` +(`parse_priv.h`). This shadow stack is the parser's entire notion of "the +expression evaluated so far": the CG side holds the runtime values, the parser +side holds the static types and lvalue bookkeeping needed to drive the next op +correctly (signedness for division/compare/shift, pointer-element size for +arithmetic, bit-field metadata for loads/stores, and so on). + +### The cg_public_compat.h coupling point + +`lang/c/parse/cg_adapter.c` and its header `cg_public_compat.h` are the real +parser <-> codegen seam, and the most load-bearing design decision in the C +frontend. 
The header defines a family of `pcg_*` helpers and a set of +`cg_*` macros that look like thin wrappers over the public `cfree_cg_*` API but +do two extra jobs on every call: + +- **They keep the parser's typed value stack in lockstep with the CG stack.** + `cg_dup`/`cg_swap`/`cg_drop` mirror the structural op onto the type stack; + `pcg_push_int` / `pcg_load` / `pcg_binop` / `pcg_convert` push, retag, or pop + type-stack entries alongside emitting the CG op. The parser never calls + `cfree_cg_*` for stack-affecting ops directly — it goes through `pcg_*` so + the two stacks can never drift. + +- **They fold C lvalue chains into a single CG memop.** There is no CG-level + `field` / `index` / `addr_offset` op. Instead a chain like `a[i].g` + accumulates a byte offset, an index scale, and bit-field metadata onto the + TOS lvalue's `PcgLvAux` (`pcg_lv_member`, `pcg_lv_subscript`, + `pcg_decay_array`), and the *next* `pcg_load` / `pcg_store` / `pcg_addr` + consumes that aux and emits one effective-address memop. The aux also records + whether the base is a frame local or an already-computed pointer rvalue, so + address-taking knows whether to emit a `cfree_cg_addr` or treat the base as + a pointer. + +The implication: **the C frontend's lowering strategy is encoded in this +shim, not in the public CG contract.** The compatibility-header name reflects +its history — it preserves the call shapes of an older richer CG surface on +top of today's leaner one — but in practice it is where C-specific decisions +(lvalue-as-effective-address, lazy load/store/addr, the typed shadow stack) +are concentrated. A change to how C lowers indirection lands here, not in the +backends. (`pcg_emit_enabled` also lets the parser run constant-folding / +sizeof contexts with codegen suppressed while still tracking types.) 
+ +### Semantic layers — type / decl / sem / abi + +The parser leans on four C-specific semantic layers under `lang/c`: + +- **type** (`lang/c/type`) owns the C type representation and lowers each C + type to a `CfreeCgTypeId` (`type_cg_id_in_pool`), interning records, enums, + pointers, arrays, and function signatures into the CG type universe. C types + are the currency of the parser's value stack. + +- **decl** (`lang/c/decl`) is the `DeclTable`, deliberately layered *above* + the object builder: the object layer stores object-format facts, while the + decl table owns the C-level rules the object layer does not know about — + linkage, storage duration, tentative definitions, function-local statics, + and initializer/redeclaration bookkeeping. `decl_attrs.c` handles GNU/C23 + attribute parsing. + +- **sem** (`lang/c/sem`) is a small library of pure C semantic checks — + assignment compatibility, compound-assignment, and redeclaration/composite + type rules — that the parser calls at the relevant points and that produce + a diagnostic message on failure rather than emitting anything. + +- **abi** (`lang/c/abi`) computes C scalar/aggregate layout facts and feeds the + target calling-convention lowering for function entry, parameters, and + returns. See [ARCH.md](ARCH.md) for the per-target ABI implementations the + C ABI layer drives. + + +## The toy frontend + +The toy frontend (`lang/toy`) is a small, statically-typed, +single-pass-friendly language whose purpose is to exercise the public +code-generation API broadly and readably — it is the coverage vehicle for +`make test-toy`, not a language anyone ships. It is C-like where that keeps +tests legible and prefix-oriented (`@builtin`, `@[...]` attributes, +dot-constants) where C syntax would make parsing or lowering ambiguous. 
Toy is +fully self-contained: it has its own lexer (`lexer.c`), parser +(`parser.c` + `expr.c`/`decls.c`/`data.c`/`stmt` logic), type system +(`types.c`), and builtin/intrinsic/inline-asm handling (`builtins.c`, +`asm.c`) — it does **not** reuse the C lexer or preprocessor. + +Toy deliberately surfaces low-level CG features that have no clean C spelling: +explicit linkage and symbol/ABI attributes, address-space pointers, tail calls, +computed goto with target sets, relocatable data expressions (`@pcrel`, +`@symdiff`, `@labeladdr`), atomics with explicit ordering/access, the full +conversion-builtin set with rounding modes, target-capability queries, and +typed inline assembly. The broad executable demo +`test/toy/cases/123_spec_demo.toy` is a normal corpus case (not a doc-only +sample) so the implementation must keep accepting and running the syntax it +demonstrates. + +Toy is also the one frontend with **durable cross-compile state**, because +`cfree dbg` runs it as a REPL. Its frontend object splits into a durable +`ToyModule` (append-only declaration tables — functions, globals, named types +— carrying metadata and compiler-durable type ids, but *no* per-object symbol +handles) and a per-compile `ToyParser` that borrows the module and holds the +per-object symbol environment plus the transaction state. The transaction is a +journaled-in-place model: each compile records watermarks into the durable +tables, mutates them in place during the parse, and on `abort` truncates back +to the watermark and replays a small undo journal for the rare in-place +mutation of an already-committed entry (completing a forward-declared record). +`commit` is then a cheap disarm. This is what lets a failed or panicking REPL +snippet leave the persistent declaration tables exactly as they were. The +hooks are wired through the vtable's `commit`/`abort`; the session and `dbg` +driver gate the commit on the full compile -> link -> publish chain. 
See +[DBG.md](DBG.md) for the debugger REPL. + + +## The wasm frontend + +The wasm frontend (`lang/wasm`, vtable `cfree_wasm_frontend_vtable`) accepts +either WebAssembly text (`.wat`) or binary (`.wasm`) — `wasm_parse_any` sniffs +the input and routes to the WAT parser or the binary decoder, validates the +module, then lowers it into the CG API (`cg.c`) and synthesizes the host-import +ABI shims (`host_imports.c`). Like C and asm it carries no durable +cross-compile state (`commit`/`abort` are NULL). The structure and the +WAT/binary/lowering details live in [WASM.md](WASM.md). diff --git a/doc/GO_RUNTIME_CG_JIT.md b/doc/GO_RUNTIME_CG_JIT.md @@ -1,279 +0,0 @@ -# Go-like runtime support in CG and JIT - -This note sketches the interface changes cfree would likely want before -supporting a language with Go-like runtime semantics. - -Go is statically typed, so the main gap is not dynamic typing in the codegen -interface. The gap is the managed runtime model: precise GC, goroutines, -managed stacks, interfaces/reflection metadata, panic/defer/recover, and a -long-lived JIT image whose code and metadata can evolve safely. - -## Current fit - -`include/cfree/cg.h` is a typed storage and ABI interface. That is still the -right boundary for a Go-like frontend. Source constructs such as slices, -strings, interfaces, closures, maps, and channels can lower to records, -pointers, indirect calls, and runtime helper calls. 
- -The parts that need new public or semi-public contracts are the places where a -managed runtime must cooperate with generated code and the JIT: - -- where GC roots live at safepoints -- which pointer stores need barriers -- how managed stack growth and goroutine context are represented -- how panic paths and implicit checks map back to language semantics -- how runtime metadata is attached to JIT code -- how live JIT code is appended, replaced, and retired - -The incremental and replacement side should build on `doc/INCREMENTAL_LINK.md` -and `doc/HOT_RELOAD.md`. - -## Design principles - -- Keep CG typed. Do not turn `CfreeCg` into a dynamically typed language IR. -- Keep source language concepts out of the backend where possible. Lower them - to storage types plus runtime metadata. -- Make managed runtime behavior explicit. GC roots, safepoints, stack growth, - and barriers should not be inferred from raw loads and stores late in the - backend. -- Keep AOT and normal C/Toy/Wasm JIT behavior unchanged by default. Managed - runtime features should be opt-in through code options, function attributes, - or language-specific frontend options. -- Keep all runtime services on context/session structs. Do not add global - runtime state. - -## CG interface extensions - -### Precise GC metadata - -Add a way for frontends to describe managed roots and safepoints. A precise GC -needs to know, for each safepoint PC, which frame slots, registers, params, -spills, and globals contain managed pointers. 
- -Possible public shape: - -```c -typedef enum CfreeCgGcRootKind { - CFREE_CG_GC_ROOT_LOCAL, - CFREE_CG_GC_ROOT_PARAM, - CFREE_CG_GC_ROOT_GLOBAL, -} CfreeCgGcRootKind; - -typedef struct CfreeCgGcRoot { - uint8_t kind; - uint8_t pointer_kind; - uint16_t flags; - CfreeCgLocal local; - CfreeCgSym global; - uint32_t offset; - CfreeCgTypeId type; -} CfreeCgGcRoot; - -CFREE_API void cfree_cg_safepoint(CfreeCg*, const CfreeCgGcRoot* roots, - uint32_t nroots); -``` - -The exact encoding should probably become compact backend metadata rather than -DWARF-only data. DWARF can describe variables for humans, but the runtime needs -fast PC-to-stackmap lookup. - -### Managed pointer identity - -Raw pointers and managed heap pointers need to be distinguishable. Options: - -- address spaces for managed heap pointers -- pointer type attributes -- explicit managed load/store/allocation intrinsics - -Address spaces are attractive because `cfree_cg_type_ptr` already accepts an -address space. The missing piece is policy: which address spaces are scanned, -movable, non-moving, interior, or raw. - -### Write barriers and allocation - -A Go-like runtime needs write barriers for pointer stores into managed heap -objects. Do not require every frontend to open-code barrier sequences. - -Add either explicit operations: - -```c -CFREE_API void cfree_cg_managed_store(CfreeCg*, CfreeCgMemAccess access, - CfreeCgEffAddr ea); -``` - -or runtime intrinsics: - -```c -CFREE_CG_INTRIN_GC_ALLOC -CFREE_CG_INTRIN_GC_WRITE_BARRIER -CFREE_CG_INTRIN_GC_READ_BARRIER -``` - -The barrier operation should carry enough metadata for the runtime to identify -the object base, field offset, and pointer kind. Ordinary C stores should remain -ordinary stores. - -### Runtime calling convention - -The current call convention enum is mostly C ABI oriented. 
A managed runtime may -need: - -- a hidden runtime/thread/goroutine context parameter -- reserved registers -- stack-bound checks in prologues -- a runtime-specific helper-call ABI -- better support for source-level multiple returns - -This could be expressed with a new call convention plus function attributes: - -```c -CFREE_CG_CC_MANAGED - -typedef enum CfreeCgFuncFlag { - ... - CFREE_CG_FUNC_MANAGED_STACK = 1u << N, - CFREE_CG_FUNC_GC_SAFEPOINTS = 1u << M, -} CfreeCgFuncFlag; -``` - -Multiple returns can be lowered through structs or sret today, but a first-class -multi-result function model would better match Go and the internal IR's -semantic multi-result direction. - -### Stack growth and goroutines - -Stackful coroutines already exist in the runtime, but Go-style goroutines need a -different contract: - -- prologue stack checks -- runtime calls to grow or switch stacks -- maps describing live pointers before a growth call -- frame relocation metadata if stacks can move -- unwind/debug compatibility after stack movement - -This should be a managed-stack function attribute, not a generic default for all -CG functions. - -### Panic and implicit checks - -Go-like languages turn many checks into language panics: - -- nil dereference -- bounds check -- divide by zero -- failed type assertion -- explicit `panic` - -CG should expose either explicit check operations or metadata attached to trap -sites. The JIT/debug layer should be able to translate a trap PC into the -language panic path instead of treating it as only a process signal. - -Useful metadata: - -- check kind -- source location -- recovery/cleanup target if any -- runtime helper to call - -### Defer/recover and cleanup edges - -`defer` and `recover` require non-local control behavior that is more structured -than plain `longjmp`. The minimal path is to lower defer management into runtime -calls and make panic edges explicit enough for stack maps and debugger stepping. 
- -A more complete path would add cleanup/landing-pad metadata to CG, but that -should be deferred until the runtime semantics are clearer. - -## JIT interface extensions - -### Transactional publish - -The existing `cfree_jit_publish` shape is the right direction. A managed runtime -needs stronger guarantees around transactions: - -- compile/link failure leaves the live image unchanged -- metadata is published atomically with code -- old code remains executable while any frame may return into it -- replacement increments the JIT generation -- readers can detect generation changes - -This aligns with `doc/INCREMENTAL_LINK.md` and `doc/HOT_RELOAD.md`. - -### Runtime metadata registry - -Add a JIT metadata channel separate from object/debug inspection: - -```c -typedef enum CfreeJitMetadataKind { - CFREE_JIT_META_STACK_MAP, - CFREE_JIT_META_FUNC_TABLE, - CFREE_JIT_META_TYPE_DESC, - CFREE_JIT_META_TRAP_TABLE, - CFREE_JIT_META_INLINE_TABLE, -} CfreeJitMetadataKind; -``` - -The runtime needs fast queries: - -- PC to function -- PC to stack map -- PC to trap/check record -- PC to inline/source frame -- symbol to active generation - -`cfree_jit_view` can remain object/DWARF oriented. Runtime metadata should be -compact, stable, and queryable without parsing DWARF. - -### Code lifetime and reclamation - -Hot reload for a managed runtime cannot immediately unmap old code. Add -lifetime hooks or state transitions: - -- active -- replaced but still callable by existing frames -- retired -- reclaimable - -The runtime or debugger should be able to veto reclamation until stack scanning -proves no frame or return address is inside an old generation. - -### Managed entry invocation - -The current debugger entry call helpers are narrow: argv-style or `u64` values. -Managed language entry points need a descriptor-based call path or explicit -language trampolines so the host can pass runtime context, managed arguments, -and return slots safely. 
- -The conservative design is to keep the low-level JIT call ABI simple and require -the frontend/runtime to emit C-callable trampolines. - -### Thread and stop-the-world coordination - -`CfreeJitSession` currently models controlled in-process execution for a single -JIT image. A managed runtime eventually needs hooks for: - -- safepoint polling -- cooperative stop requests -- thread/goroutine enumeration -- stack scanning while stopped -- metadata refresh while workers are paused - -This does not need to be part of the first CG change, but the JIT metadata and -publish APIs should avoid assuming a single worker forever. - -## Suggested sequence - -1. Add managed pointer/address-space policy and explicit safepoint records. -2. Emit and query compact stack maps from the JIT image. -3. Add managed allocation and write-barrier intrinsics. -4. Add managed-stack function attrs and stack-check lowering. -5. Publish runtime metadata transactionally with JIT appends. -6. Add trap/check tables for panic lowering. -7. Add function replacement/lifetime support on top of hot reload. -8. Only then broaden debugger/session APIs for multi-threaded managed runtime - coordination. - -The first useful milestone is smaller than "compile Go": compile a tiny -managed-language frontend that can allocate traced objects, hit safepoints, let -the host enumerate roots from a JIT stack map, and call through a C-callable -runtime trampoline. diff --git a/doc/HOT_RELOAD.md b/doc/HOT_RELOAD.md @@ -1,443 +0,0 @@ -# Function-only hot reload - -This document extends `doc/INCREMENTAL_LINK.md`. It assumes cfree already -has append-only incremental JIT linking: new code can be compiled and -placed in a live image without moving old code. - -Hot reload adds replacement. In v1, only functions can be replaced. Data -symbols, TLS, type layouts, initializers, destructors, and object lifetime -changes are out of scope. - -## 1. 
Goals - -- Replace the implementation of an existing function in a live JIT image. -- Keep the public address of the function stable when possible. -- Avoid patching every caller for the baseline implementation. -- Allow new code to call old code and replaced code through the same symbol - name. -- Keep old function bodies alive while any stack frame may still return - into them. -- Integrate with `dbg` so a stopped debuggee can reload a function and - continue execution. - -## 2. Non-goals - -- Data replacement or migration. -- Changing function ABI: parameter types, return type, calling convention, - variadic-ness, or visibility. -- Replacing inline copies already compiled into other functions. -- Fully concurrent multi-threaded reload. -- Unloading old generations immediately. -- Replacing functions in external DSOs. -- A production dynamic linker ABI. - -## 3. Core idea - -Append-only linking gives us a way to add a new function body. Hot reload -adds a stable function entry that indirects to the current body. - -For a reloadable function `foo`, callers see: - -```text -foo entry/trampoline -> foo.slot -> current foo body -``` - -Reloading `foo` compiles and appends a new body, relocates it, then updates -`foo.slot` to point at the new body. Existing pointers to `foo` remain -valid because they point at the stable entry/trampoline, not a specific -body generation. - -## 4. Reloadable function representation - -Add a per-function record in the link session: - -```c -typedef struct LinkReloadFunc { - Sym name; - LinkSymId public_sym; - LinkSymId current_body_sym; - uint64_t entry_vaddr; - uint64_t slot_vaddr; - uint64_t current_body_vaddr; - uint32_t generation; - uint8_t abi_hash[16]; -} LinkReloadFunc; -``` - -The exact hash shape can be internal. 
It must identify the C ABI contract: - -- return type ABI class and size -- parameter ABI classes and sizes -- variadic flag -- target ABI -- calling convention once cfree has more than one - -The hash is not a C type-system identity. It is a runtime-callability -identity. - -## 5. Entry/trampoline shape - -Each supported architecture needs one stable entry sequence. - -AArch64 example: - -```asm -foo: - adrp x16, foo.slot - ldr x16, [x16, #:lo12:foo.slot] - br x16 -foo.slot: - .quad foo.body.0 -``` - -x86-64 example: - -```asm -foo: - jmp *foo.slot(%rip) -foo.slot: - .quad foo.body.0 -``` - -RISC-V example: - -```asm -foo: - auipc t0, %pcrel_hi(foo.slot) - ld t0, %pcrel_lo(foo)(t0) - jr t0 -foo.slot: - .quad foo.body.0 -``` - -The entry lives in RX memory. The slot lives in writable data or in a -JIT-managed patchable cell with the same W^X discipline used elsewhere. - -Slot updates must be pointer-width atomic when the target ABI can observe -the function concurrently. `dbg` v1 can require the worker to be stopped, -but the representation should not rule out atomic publication later. - -## 6. Symbol semantics - -The public symbol name resolves to the stable entry: - -```text -cfree_jit_lookup("foo") == runtime address of foo entry -``` - -The body symbol is internal and generationed: - -```text -foo$body$0 -foo$body$1 -foo$body$2 -``` - -Debug and inspector surfaces should present the public function as `foo`, -with the active body generation as implementation detail. Low-level -symbol iteration may expose generationed body names only under an -internal/debug flag. - -Relocations against `foo` use the public entry by default. Direct body -references are allowed only for linker-synthesized records. - -## 7. Baseline call policy - -The baseline policy is simple: - -- Calls to reloadable functions target the stable entry/trampoline. -- Address-taking of reloadable functions returns the stable entry. -- The slot points at the active body. 
- -This means reloading a function usually patches one pointer-sized cell, -not every call site. - -Later optimization can patch selected call sites directly to the current -body. That requires a patch-site index and invalidation. It should be a -separate performance phase, not the correctness baseline. - -## 8. Selecting reloadable functions - -Do not make every function reloadable by default in all JIT modes. The -trampoline cost is real and unnecessary for normal `cfree run`. - -Enable it through a JIT/debug option: - -- `dbg` hot-reload mode: externally visible functions are reloadable. -- Optional attribute later: only marked functions are reloadable. -- Internal static functions are not reloadable in v1 unless the reload - command names a containing translation unit and the compiler preserves a - stable synthetic identity for them. - -For v1, restrict reload to global functions with C linkage names visible -to `cfree_jit_lookup`. - -## 9. Reload flow - -Debugger-side: - -```text -reload foo from replacement source - -> compile replacement source to ObjBuilder - -> identify exactly one replacement body for foo - -> verify ABI compatibility - -> append new body and dependencies - -> update foo.slot - -> refresh JIT view / DWARF -``` - -Link-side: - -```text -link_session_reload_function(session, "foo", new_obj) - -> resolve new object against current globals - -> reject data definitions and unsupported side effects - -> place new function sections append-only - -> assign vaddrs - -> apply relocations - -> publish body symbol as generation N+1 - -> atomically store body runtime address into foo.slot -``` - -The public symbol table entry for `foo` does not move. - -## 10. 
Input restrictions for v1 - -The replacement object may contain: - -- the replacement function body -- private helper functions used only by the replacement -- constants and read-only literals needed by the replacement -- debug sections -- undefined references to already-linked symbols or external resolver - symbols - -The replacement object may not contain: - -- new writable global data -- TLS -- constructors or destructors -- public definitions other than the function being replaced -- strong definitions that collide with existing non-target symbols -- COMDAT/group semantics that require replacing prior selected members - -This keeps reload function-only in practice, not just in name. - -## 11. ABI compatibility - -Before publishing a replacement, verify that the replacement can be called -through the old entry. - -For C frontend replacements, record a compact ABI signature at compile time -for each function definition. The linker should not need to understand full -C types. - -Suggested record: - -```c -typedef struct CfreeFuncAbiSig { - uint8_t target_arch; - uint8_t target_os; - uint8_t call_conv; - uint8_t variadic; - uint8_t ret_class; - uint8_t ret_size_log2; - uint8_t nargs; - uint8_t arg_class[CFREE_ABI_MAX_ARGS]; - uint8_t arg_size_log2[CFREE_ABI_MAX_ARGS]; -} CfreeFuncAbiSig; -``` - -No VLA. If the signature exceeds a fixed bound, mark the function -non-reloadable until a heap-backed encoding is added. - -For non-C objects or missing signatures, v1 should reject reload unless -the user explicitly opts into unchecked replacement. - -## 12. Old generation lifetime - -After a slot update, old bodies remain mapped. - -In `dbg` v1, reload occurs only while the worker is stopped. Even then, the -current stack may contain frames inside the old function. Continuing after -reload must be valid: - -- Existing frames finish in the old body. -- New calls enter the new body. 
-- Breakpoints in old body addresses remain attached to old code unless - the driver chooses to move source breakpoints. - -Retirement policy: - -- v1: never reclaim old generations until `cfree_jit_free`. -- later: retire when the debugger can prove no stopped/running frame has a - PC or return address inside the old generation. - -Never unmap old code immediately after publishing a replacement. - -## 13. Debugger behavior - -The debugger must distinguish symbol breakpoints from address breakpoints. - -Address breakpoint: - -```text -b *0x1234 -``` - -Stays at that exact address, even if it belongs to an old generation. - -Symbol/source breakpoint: - -```text -b foo -b file.c:42 -``` - -Should be rebound after reload if the source location exists in the new -generation. The old breakpoint should be cleared or marked stale depending -on user policy. - -For v1, a pragmatic rule: - -- Breakpoints set by exact address stay exact. -- Breakpoints set by symbol are moved to the active generation. -- Breakpoints set by file/line are re-resolved after DWARF refresh. -- If re-resolution fails, keep the old breakpoint but mark it stale in - `info breakpoints`. - -The session should be stopped while reload changes breakpoint bindings. - -## 14. DWARF and JIT view - -Every reload increments the JIT generation, same as append-only extension. -`cfree_jit_view` rebuilds on generation mismatch. - -DWARF consumers need enough information to answer two questions: - -- What is the active source location for `foo`? -- If the PC is in an old generation, can we still render its source line? - -v1 can keep old debug info in the rebuilt view. That lets backtraces from -old frames still resolve. Active symbol lookup should prefer the latest -generation for name-to-address queries. - -## 15. Patch-site index - -The baseline does not need caller patching, but a patch-site index is still -useful for future fast mode and for non-function slot fixups later. 
- -Build the index from durable relocation records: - -```c -target LinkSymId -> LinkRelocApply ids -owner input id -> LinkRelocApply ids -write section id -> LinkRelocApply ids -``` - -Do not scan every relocation on reload. When direct-call optimization lands, -the linker can patch only relocation sites that target the reloaded symbol. - -For v1, it is acceptable to create the data structures but use them only in -assertions/tests. - -## 16. Concurrency and publication - -`dbg` v1 reload is single-threaded: - -1. Worker is stopped. -2. REPL compiles and links replacement. -3. Slot is updated. -4. Breakpoints and DWARF are refreshed. -5. Worker resumes. - -The slot update still should be implemented as an atomic aligned pointer -store. That makes the representation compatible with future multi-threaded -sessions. - -If compilation or relocation fails, the slot is not updated and the old -generation remains active. - -## 17. API sketch - -Debugger-facing experimental surface: - -```c -typedef struct CfreeJitReloadOptions { - const char* symbol; - CfreeObjBuilder* obj; - uint32_t flags; -} CfreeJitReloadOptions; - -int cfree_jit_reload_function(CfreeJit*, const CfreeJitReloadOptions*); -``` - -Internal link session surface: - -```c -int link_session_mark_reloadable(LinkSession*, Sym name); -int link_session_reload_function(LinkSession*, LinkImage*, Sym name, - ObjBuilder* replacement); -``` - -Initial JIT link needs an option to create reloadable entries: - -```c -typedef enum CfreeJitIndirectionMode { - CFREE_JIT_INDIRECT_NONE, - CFREE_JIT_INDIRECT_EXPORTED_FUNCS, -} CfreeJitIndirectionMode; -``` - -This should not affect AOT executable links. - -## 18. Failure behavior - -Reload must be transactional: - -- ABI mismatch: reject, old body remains active. -- Replacement contains data/TLS/init arrays: reject, old body remains - active. -- Unresolved symbol: reject, old body remains active. -- Out of append capacity: reject, old body remains active. 
-- Relocation failure: reject, old body remains active. - -If new pages were committed before failure, they may remain reserved as -dead space, but no public symbol or slot may point at them. - -## 19. Test plan - -Targeted tests: - -- JIT unit: `cfree_jit_lookup("foo")` returns the same address before and - after reload. -- JIT unit: calling `foo` before reload returns old result; after reload - returns new result. -- JIT unit: a saved function pointer from before reload calls the new body. -- JIT unit: old body remains mapped and `addr_to_sym` can describe an old - PC. -- Negative: ABI mismatch rejects. -- Negative: replacement defining writable global data rejects. -- Negative: duplicate public non-target definition rejects. -- Debug smoke: stop in `foo`, reload `foo`, finish old frame, call `foo` - again and observe new behavior. -- Debug smoke: symbol breakpoint on `foo` moves to the active generation. - -Run these on one JIT target first. Cross-arch trampoline encoding gets its -own arch-specific tests. - -## 20. Implementation sequence - -1. Land append-only incremental link and debugger snippet append. -2. Add reloadable function entries and slots for selected exported - functions. -3. Make `cfree_jit_lookup` return stable entries for reloadable functions. -4. Add function ABI signature emission from the C frontend. -5. Implement replacement object validation. -6. Append replacement body and publish through slot update. -7. Refresh JIT view/DWARF and rebind symbol breakpoints. -8. Add optional direct-call patching only after the baseline is correct. - -The first usable milestone is: in `dbg`, reload a global function while the -worker is stopped; existing function pointers keep working; new calls hit -the new body; old frames can return safely. 
diff --git a/doc/IMAGE_INSPECT.md b/doc/IMAGE_INSPECT.md @@ -1,382 +0,0 @@ -# Executable / Image Inspection - -Status: design proposal (2026-05-27) - -## Goal - -Extend the library and `objdump` driver to inspect *linked images* — -executables and shared objects — not just relocatable objects. Cover ELF, -Mach-O, and PE. ELF lands first; Mach-O and PE follow on the same API. - -## Why this is a real extension, not a flag - -The reader today is relocatable-object-shaped. `cfree_obj_open` → -`cfree_detect_target` → `impl->read()`, and the ELF backend hard-rejects -anything but `ET_REL`: - - /* src/obj/elf/read.c:218 */ - if (e_type != ET_REL) - compiler_panic(c, ..., "read_elf: only ET_REL inputs are accepted ..."); - -There are DSO readers (`read_elf_dso`, `read_coff_dso`, a Mach-O dylib stub), -but they're wired only into the *linker's* input path, not into the public -`impl->read` / `cfree_obj_open` surface. `ET_EXEC` has no reader at all. - -The canonical in-memory model (`ObjBuilder`) is section / symbol / reloc -oriented. It has no notion of a **segment** (PT_LOAD), the **dynamic table** -(DT_NEEDED / SONAME / RPATH), an **entry point**, **image base**, **imports**, -or **data directories** — which is exactly what image inspection is about. - -Consequence today: `objdump` carries a hand-rolled `pe_parse_image` raw-byte -walker (`driver/cmd/objdump.c:1597`) that bypasses the library to serve `-f`/`-h`/ -`-p` for PE, and emits a soft error for `-t`/`-d`/`-r`/`-s`. nm, size, strip, -objcopy, addr2line all share `cfree_obj_open` and inherit the relocatable-only -limit. - -## Decisions - -- **API:** extend `CfreeObjFile` / `cfree_obj_open` (not a parallel - `CfreeImage` type). One open call; sections/symbols keep working where the - format still carries them; other tools inherit it for free. -- **Rollout:** ELF exec + DSO first (program headers, `.dynamic`, dynsym, - dynamic relocs), de-risking the neutral API on the best-understood format, - then Mach-O and PE. 
-- **Driver:** extend `objdump` with binutils-style `-p` / `-T` / `-R`, - format-neutral. Delete the PE special-case. - -## Design - -### The image dimension - -Add an internal `ObjImage` populated by the format backends and hung off -`CfreeObjFile` (NULL for pure relocatables). It holds the format-neutral -*common denominator* of a linked image: - - struct ObjImage { - ObjKind kind; /* REL / EXEC / DYN / CORE */ - u64 entry; /* entry point vaddr, 0 if none */ - u64 image_base; /* preferred load address / __TEXT base */ - Slice interp; /* PT_INTERP / LC_LOAD_DYLINKER, empty if none */ - Slice soname; /* DT_SONAME / LC_ID_DYLIB / PE export name */ - Vec segments; /* ImageSegment */ - Vec deps; /* ImageDep: needed lib + (PE/Mach-O) imports */ - Vec rpaths; /* DT_RPATH/RUNPATH, LC_RPATH, @rpath */ - Vec dynsyms; /* dynamic symbol table */ - Vec dynrelocs; /* dynamic relocations */ - const void* raw; /* per-format escape hatch (see below) */ - }; - -`ImageSegment` is the load-layout unit absent from `ObjBuilder`: - - { u64 vaddr; u64 vsize; u64 file_off; u64 file_size; - u32 perms; /* R/W/X bits */ - u32 align; - Slice name; /* PT_* spelling / segname / "" */ } - -Sections continue to map through the existing `ObjBuilder` view where present -(ELF execs usually retain section headers; PE always; Mach-O sections live -inside segments). When section headers are absent (stripped ELF), the section -view is simply empty and the segment view carries the load picture. - -### Reader wiring - -`ObjFormatImpl` gains the ability to populate an image. Cleanest is for each -backend's existing entry to branch on kind and fill both views: - -- `cfree_obj_open` detects kind (reuse `cfree_detect_target` + `e_type` / - `filetype` / PE characteristics) and routes to the backend, which fills - `ObjBuilder` (sections/symbols where present) and, for EXEC/DYN, `ObjImage`. 
-- **ELF** reuses `read_elf_dso`'s machinery: walk program headers for - segments + PT_INTERP, parse `.dynamic` for DT_NEEDED/SONAME/RPATH/RUNPATH - and the dynsym/dynstr/relocation pointers, read `.dynsym`, and - `.rela.dyn` / `.rela.plt` for dynamic relocs. `ET_EXEC` and `ET_DYN` share - one path; the `e_type != ET_REL` guard is replaced by a kind switch. -- **PE** gets a real library reader (DOS/NT headers, optional header → entry + - image base + subsystem, data directories, section table, import + - export directories, base relocations), replacing the driver's - `pe_parse_image` walker, which is then deleted. -- **Mach-O** parses the load commands currently skipped: LC_SEGMENT_64 vmaddr - (already read for sections — promote to segments), LC_LOAD_DYLIB / - LC_ID_DYLIB, LC_LOAD_DYLINKER, LC_MAIN / LC_UNIXTHREAD (entry), LC_RPATH, - and exports/binds (LC_DYLD_INFO / chained fixups / exports-trie). - -### Public API additions (`include/cfree/object.h`) - -Mirrors the existing reader iterators (`CfreeObjSymInfo`, `CfreeObjReloc`, -`CfreeObjGroupInfo` patterns): - - typedef enum CfreeObjKind { - CFREE_OBJ_KIND_REL, CFREE_OBJ_KIND_EXEC, - CFREE_OBJ_KIND_DYN, CFREE_OBJ_KIND_CORE, - } CfreeObjKind; - CfreeObjKind cfree_obj_kind(const CfreeObjFile*); - - typedef struct CfreeObjImageInfo { /* whole-image scalars */ - uint64_t entry; uint64_t image_base; - CfreeSlice interp; CfreeSlice soname; - } CfreeObjImageInfo; - CfreeStatus cfree_obj_image_info(const CfreeObjFile*, CfreeObjImageInfo*); - - /* segment iterator */ - typedef struct CfreeObjSegInfo { - CfreeSlice name; uint64_t vaddr, vsize, file_off, file_size; - uint32_t perms; /* CFREE_SEG_R | _W | _X */ - uint32_t align; - } CfreeObjSegInfo; - CfreeStatus cfree_obj_segiter_new(CfreeObjFile*, CfreeObjSegIter**); - CfreeIterResult cfree_obj_segiter_next(CfreeObjSegIter*, CfreeObjSegInfo*); - void cfree_obj_segiter_free(CfreeObjSegIter*); - - /* dependency / rpath iterators (deps carry imported names for PE/Mach-O) */ 
- CfreeStatus cfree_obj_depiter_new(CfreeObjFile*, CfreeObjDepIter**); - CfreeIterResult cfree_obj_depiter_next(CfreeObjDepIter*, CfreeObjDepInfo*); - void cfree_obj_depiter_free(CfreeObjDepIter*); - - /* dynamic symbols + dynamic relocations: reuse CfreeObjSymInfo / - * CfreeObjReloc shapes via parallel iterators */ - CfreeStatus cfree_obj_dynsymiter_new(CfreeObjFile*, CfreeObjSymIter**); - CfreeStatus cfree_obj_dynreliter_new(CfreeObjFile*, CfreeObjRelocIter**); - -Format-specific raw fields (DT_* values, raw load commands, PE data -directories) are surfaced through an escape hatch in the spirit of the -existing `cfree_obj_section_format_flags` — keep the neutral API clean. - -### Driver (`objdump`), binutils-aligned - -- `-f` file header: add entry point + image base for EXEC/DYN. -- `-h` section headers: works once exec sections parse. -- `-p` / `--private-headers`: ELF program headers + dynamic section; Mach-O - load commands; PE optional header + data dirs (matches binutils `objdump -p` - semantics; folds in today's PE path). -- `-T` / `--dynamic-syms`: dynamic symbol table. -- `-R` / `--dynamic-reloc`: dynamic relocations. -- `-d`: already disassembles by vaddr — point it at PT_LOAD/text for execs. - -The `CFREE_BIN_PE` special-case in `driver_objdump` collapses into the normal -`dump_obj` path once PE images open via `cfree_obj_open`. - -## Landed API surface (phase 1 contract) - -The neutral API + internal model are in place and build green (relocatable -inputs report `CFREE_OBJ_KIND_REL` with empty image iterators; the section/ -symbol path is unchanged). 
Impl and driver work parallelize against this: - -- **Public** (`include/cfree/object.h`): `CfreeObjKind` + `cfree_obj_kind`; - `CfreeObjImageInfo` + `cfree_obj_image_info`; `cfree_obj_segiter_*` over - `CfreeObjSegInfo` (`CFREE_SEG_R/W/X`); `cfree_obj_depiter_*` over - `CfreeObjDepInfo`; `cfree_obj_dynsymiter_new` (reuses `CfreeObjSymIter` / - `cfree_obj_symiter_next`); `cfree_obj_dynreliter_new` (reuses - `CfreeObjRelocIter` / `cfree_obj_reliter_next`). -- **Internal model** (`src/obj/obj.h`, impl in `src/obj/obj.c`): `ObjImage` - hung off `ObjBuilder`. Readers call `obj_image_ensure(ob, OBJ_KIND_*)` then - the setters (`obj_image_set_entry/base/interp/soname`) and appenders - (`obj_image_add_segment/dep/rpath/dynsym/dynreloc`). `obj_image(ob)` is NULL - on relocatables. `obj_free` releases the image. -- **Glue** (`src/api/object_file.c`): maps `ObjImage` → the public iterators. - -**Driver (landed):** objdump grew `-p` (program/dynamic headers, format- -neutral via the image API), `-T` (dynamic symbols), and `-R` (dynamic -relocations); `-f` now reports EXEC_P/DYNAMIC/D_PAGED and the real entry -point. Section/symbol dumps now work on executables too (previously -`cfree_obj_open` rejected ET_EXEC). Verified on a real `elf64-arm64` -executable: `-p` renders the LOAD segments (perms, vaddr, align) + entry + -image base; static execs correctly show empty dynamic tables. Populated -NEEDED/SONAME/dynsym/dynrel can't be exercised from cfree yet — it only emits -static executables (no `-shared`); the empty-path rendering is verified and -the populated paths will light up when dynamic linking lands. - -**Impl (ELF, landed):** `read_elf` accepts `ET_EXEC`/`ET_DYN` and populates -`ObjImage` from program headers (+ dynamic info where present). 
- -**Impl (Mach-O, landed):** `read_macho` accepts `MH_EXECUTE`/`MH_DYLIB` -alongside `MH_OBJECT`; its existing section/symbol passes run unchanged and -`read_macho_image` (`src/obj/macho/read.c`) adds the image dimension by -re-walking the load commands: `LC_SEGMENT_64` → segments (+ `__TEXT` base, -`VM_PROT_*`→`OBJ_SEG_*` perms), `LC_LOAD_DYLINKER` → interp, `LC_ID_DYLIB` → -soname, `LC_LOAD_DYLIB`/`WEAK`/`REEXPORT` → deps, `LC_RPATH` → rpaths, -`LC_MAIN` (`+__TEXT` base) / `LC_UNIXTHREAD` → entry, the `LC_SYMTAB` external -nlist entries → dynamic symbols, and `LC_DYLD_CHAINED_FIXUPS` binds/rebases -(`DYLD_CHAINED_PTR_64`) → dynamic relocations (bind names resolved through the -chained-imports table). Verified end-to-end: cfree-linked static execs/dylibs -(segments, entry, base, interp, soname, `_main`/exports) and host clang arm64 -binaries (`NEEDED libSystem.B.dylib`, `RPATH`, and `ABS64 _printf` binds). -Classic `LC_DYLD_INFO` and the exports trie are not read (the symbol table is -the authoritative dynamic-symbol source); chained pointer formats other than -the 64-bit family are skipped leniently. - -## Phasing - -1. **Neutral API + ELF.** Add `ObjImage`, the public iterators, and ELF - EXEC/DYN reading (segments, dynamic, dynsym, dynrel). objdump `-p`/`-T`/`-R` - for ELF. Red-green tests on cfree-linked ELF executables and `.so`s. -2. **PE.** Library reader replacing `pe_parse_image`; delete the driver - walker. objdump `-p`/`-T`/`-R` for PE imports/exports/data dirs. -3. **Mach-O (landed).** Load-command parsing for execs/dylibs; chained-fixup - binds + rebases; dynamic symbols from `LC_SYMTAB`. objdump `-f`/`-p`/`-T`/ - `-R` for Mach-O. Committed golden on a cfree-linked aarch64 exec under - `test/objdump/aarch64-darwin/`. -4. **Inherited tools (landed).** nm, size, addr2line, strings open images via - `cfree_obj_open` now that it accepts `ET_EXEC`/`ET_DYN`. 
nm grew `-D` - (`.dynsym` via `cfree_obj_dynsymiter_new`); `CfreeObjSecInfo` grew a neutral - `addr` (load vaddr, 0 for relocatables, from `sh_addr`) so SysV `size -A` - reports an image's real layout. strings is format-agnostic and was already - fine. As of this phase addr2line opened images correctly but reported "no - debug info", because the linker still dropped `.debug_*`; phase 5 fixes - that. Golden: - `test/objdump/aarch64/cases/02-size-sysv-image`. -5. **Debug-info retention in the linker (landed).** `.debug_*` sections are - carried through to linked images as file-only sections with their - relocations resolved in place, so `cfree addr2line` / `cfree dbg` resolve - file:line on cfree-linked executables, single- and multi-input. ELF - (static / PIE / DSO) places them in non-alloc sections; Mach-O in a - dedicated `__DWARF` segment (before `__LINKEDIT`, with `__debug_*` - section names matching Apple's spelling). See `doc/LINK_DEBUG.md` and the - dedicated section below. Goldens: - `test/objdump/aarch64/cases/05-addr2line-linked`, driver - `addr2line-macho` / `addr2line-macho-linked`. The Mach-O reader reports - absolute symbol vaddrs for linked images (MH_EXECUTE/MH_DYLIB) and - section-local offsets for relocatable MH_OBJECT inputs — matching the ELF - reader — so the `nm | addr2line` flow works the same across formats. - -## Test strategy - -The compiler links its own ELF/Mach-O/PE images, so tests round-trip: link a -small program, open it via `cfree_obj_open`, assert kind/entry/segments/deps/ -dynsyms against what the linker emitted. Cross-check against the host -`readelf`/`objdump` in smoke tests where available. New corpora under -`test/elf/` (image cases), later `test/macho/` and `test/{coff,pe}/`. - -## Resolved decisions - -- **Stripped ELF (no section header table):** do not synthesize pseudo-sections - from segments. This matches GNU `objdump` and `llvm-objdump`, which are - section-header-driven and report "no sections" when the table is absent - (`readelf -l` still shows segments). 
Note the common `strip` keeps the - section table and `.text`, so this only bites genuinely table-stripped - images. The section view is empty in that case; as a value-add beyond GNU - objdump, `-d` may disassemble executable `PT_LOAD` segments directly by - vaddr, so a fully-stripped image is still disassemblable. *(Implemented: - `read_elf` now accepts a zeroed section-header table and surfaces an empty - section view; `dump_disasm` falls back to iterating X-perm segments via - `cfree_obj_segiter_*` when the section walk yields nothing on an image, - using each segment's vaddr as the base. Format-agnostic — no ELF - special-casing in the driver. The section-driven path also now uses - `sec.addr` as the base, so `-d` on a non-stripped image resolves real - vaddrs and symbol labels too. Golden: `test/objdump/aarch64/cases/ - 04-disasm-stripped-segment`.)* - -- **Core files (`ET_CORE` / Mach-O `MH_CORE`):** out of scope. Keep - `CFREE_OBJ_KIND_CORE` defined but unimplemented; detect and reject cleanly. - Note/register-state parsing is a separate feature. - -- **Mach-O exports/binds (phase 3):** support only the modern - `LC_DYLD_CHAINED_FIXUPS` path for binds/rebases. As landed, dynamic symbols - come from the `LC_SYMTAB` external entries, so `LC_DYLD_EXPORTS_TRIE` is not - read. Classic `LC_DYLD_INFO` opcode/trie reading is not supported; reading - older dylibs is out of scope. (Design-time action item: confirm cfree's own - linker emits chained fixups.) - -## Debug-info retention in the linker (phase 5, landed) - -Status: landed (see phase 5 above). What follows is the original design -proposal (2026-05-27, ELF / aarch64 first), kept as rationale; "today" below -describes the linker before this work. - -### Goal - -Make `addr2line -e <cfree-linked-exe>` resolve `file:line`. Today it opens the -image fine (phase 4) but finds nothing: the linker drops every `.debug_*` -section, so the linked image carries no DWARF at all. 
- -### Why it's dropped - -`link_section_kept` keeps only allocatable progbits/nobits/array sections; -`.debug_*` lack `SHF_ALLOC` and are cut at the first line: - - /* src/link/link_layout.c:49 */ - int link_section_kept(const Section* s) { - if (!(s->flags & SF_ALLOC)) return 0; /* .debug_* dies here */ - if (s->sem == SSEM_PROGBITS || s->sem == SSEM_NOBITS) return 1; - if (s->sem == SSEM_INIT_ARRAY || s->sem == SSEM_FINI_ARRAY) return 1; - return 0; - } - -That predicate gates three things: layout (`link_layout.c:196,238`), -section-header planning (`obj/elf/link.c:854`), and **relocation emission** — -`link_emit_relocations` skips any reloc whose section isn't kept -(`link_reloc_layout.c:1221`). So even the relocation records inside `.debug_*` -are discarded along with the bytes. - -### What the debug sections need - -The DWARF producer (`src/debug/debug_emit.c`) writes zero placeholders and -records relocations against them. Two classes, with different resolution: - -| Reloc | Sections | Target | Resolves to | -|----------|-------------------------------------------------------------|------------------|-------------| -| `R_ABS64`| `.debug_info` low_pc, `.debug_line` set_address, `.debug_aranges`, `.debug_rnglists` | function symbols | final code **vaddr** | -| `R_ABS32`| `.debug_info`, `.debug_line`, `.debug_str_offsets` | **section symbols** (`.debug_abbrev`/`.debug_str`/…) | **offset within the output debug section** (DWARF sec-relative) | - -Self-contained, no relocs: `.debug_abbrev`, `.debug_str`, `.debug_line_str`. - -The reader applies no relocations — `dwarf_open.c` reads section bytes verbatim -and trusts `DW_FORM_addr` to already hold the final address -(`dwarf_open.c:453`, `dwarf_die.c:35`). So the linker must fully resolve the -placeholders in place. 
- -### The relocation engine is already reusable - -Symbol resolution is generic: - - /* src/link/link_reloc_layout.c:66 */ - s->vaddr = ls->vaddr + (s->value - ls->obj_offset); - -Set a retained debug section's `LinkSection.vaddr` to its **section-relative -base** (0 for a single input; cumulative across inputs) and the rest falls out: -a section symbol pointing at it resolves to that base, so `R_ABS32 = base + -addend` yields the correct DWARF sec-offset, while function symbols already -resolve to real code vaddrs so `R_ABS64` is correct for free. No new -relocation math — `link_reloc_apply` handles `R_ABS64`/`R_ABS32` as-is once the -sections participate. - -### What has to be built - -The `.symtab`/`.strtab` survival path is **not** the model: those are -*synthesized* file-only blobs (`obj/elf/link.c:956`), whereas `.debug_*` must be -*carried and relocated*. The work is a new "retained, file-only input section" -class: - -1. **Retention.** Split `link_section_kept` into alloc-kept vs a new file-only - predicate (debug sections, gated by a `--strip-debug` linker flag; default - keep when inputs carry debug). Route retained sections into `img->sections` - flagged `file_only`, with no segment. -2. **Layout.** A file-only pass after the segments: concatenate same-named - debug sections across inputs, assign each output section a file offset and - `sh_addr = 0`, and record each input's contribution offset. Set - `LinkSection.vaddr` = contribution base so the resolver above does the right - thing. Sits beside the existing `.symtab` file placement - (`obj/elf/link.c:1032`). -3. **Relocation routing.** Widen the gate at `link_reloc_layout.c:1221` to - include retained sections, and give `apply_all_relocs` - (`obj/elf/link.c:306`) a path that writes into the file-only section's own - byte buffer instead of `img->segment_bytes[...]` (keyed on a segment debug - sections won't have). -4. 
**Output.** Emit `OutShdr` entries for the retained sections (`SHT_PROGBITS`, - `sh_addr=0`, no `SHF` bits) and write their relocated bytes at their file - offsets, next to `.symtab`/`.strtab`/`.shstrtab`. -5. **Section symbols.** Verify local `SK_SECTION` symbols for debug sections - survive resolution as reloc targets (`link_reloc_layout.c:51-66` handles - defined section symbols generically; local-symbol retention needs a check). - -### Decisions / scope - -- **Single-input static exec first.** The common cfree case: merging is trivial - (one contribution at offset 0), collapsing steps 2/5. This is what unblocks - `addr2line` on a cfree-linked exe and is directly testable on aa64. -- **Multi-input merge is the larger half.** Per-input contribution offsets and - section-symbol rebasing are the fiddly part; `.debug_line`/`.debug_aranges` - are per-CU so concatenation is naturally correct, but offsets must be exact. -- **`--strip-debug` / `-S`** controls retention; default keeps debug when - present, matching the toolchain expectation that `-g` survives linking. -- **Biggest risk:** silent corruption — a missed reloc class produces wrong - line numbers with no error, because the reader trusts the bytes. Mitigate - with a round-trip golden: link `-g`, `addr2line` a known function address, - assert `file:line`, cross-checked against host `addr2line` on the same bytes. - -This is a real linker feature (a debug-retention pass), not a flag flip — -comparable in size to the image-inspection reader work. diff --git a/doc/INCREMENTAL_LINK.md b/doc/INCREMENTAL_LINK.md @@ -1,416 +0,0 @@ -# Append-only incremental link - -This document describes the first incremental-linking step for cfree: -append-only growth of a live JIT image. It is sequenced before -`doc/HOT_RELOAD.md`. 
- -The first concrete use case is `cfree dbg`: while stopped at the debugger -REPL, a user should be able to enter C code, JIT it into the existing -debuggee image, and call or inspect the new symbols as if they had been -present in the original link. - -`cfree emu` also wants append-only linking, but it is a separate -workstream. This document intentionally keeps the motivating path in -`dbg`. - -## 1. Goals - -- Grow one live `CfreeJit` image with additional object inputs. -- Keep all previously published runtime addresses stable. -- Let new code reference old symbols from the original debuggee. -- Let old debugger surfaces see new symbols: `cfree_jit_lookup`, - `cfree_jit_addr_to_sym`, symbol iteration, breakpoints, PC translation, - and the JIT debug view. -- Preserve relocation records as durable data so later work can index and - selectively reapply them. -- Keep the public surface small until the implementation has one real - consumer. The first API can be private to `src/link/`, `src/dbg/`, and - `driver/cmd/dbg.c`. - -## 2. Non-goals - -- Replacing or removing existing code. That is hot reload and is covered - by `doc/HOT_RELOAD.md`. -- Reclaiming appended code. Debug sessions are short-lived; v1 may leak - appended code until `cfree_jit_free`. -- Data-symbol migration. New code may define new data, but existing data - addresses do not move and are not replaced. -- Cross-thread debuggee mutation. `dbg` remains a single-worker session. -- A general dynamic loader ABI. This is an in-process cfree JIT facility, - not `dlopen`. -- Expression parsing as a linker feature. C expression evaluation is a - frontend/driver problem that can be implemented by wrapping an - expression in a generated function. - -## 3. User model in `dbg` - -The REPL grows a new command family around JIT extension. 
Exact spelling is -driver policy; the link-side contract should support these shapes: - -```text -(cfree) jit { -int twice(int x) { return x * 2; } -} -(cfree) p twice -(cfree) call twice(21) -``` - -and: - -```text -(cfree) jit { -extern int existing_func(int); -int probe(int x) { return existing_func(x) + 1; } -} -(cfree) b probe -(cfree) call probe(41) -``` - -The minimal v1 can require full C declarations and function definitions in -the snippet. A later REPL expression command can synthesize: - -```c -static <T> __cfree_dbg_expr_N(void) { return <user-expression>; } -``` - -The important linker property is that the synthesized object is just -another input appended to the same live JIT. - -## 4. Current shape - -Today the JIT path is single-shot: - -```text -objects / archives - -> Linker - -> link_resolve() - -> LinkImage - -> cfree_jit_from_image() - -> CfreeJit -``` - -`CfreeJit` owns the mapped pages and the resolved `LinkImage`. The debugger -owns a `CfreeJitSession` over that `CfreeJit`. - -The linker already has the right discipline for incremental work: - -- `LinkInputId` values are stable for the lifetime of a `Linker`. -- `ObjBuilder` inputs are not consumed by `link_resolve`. -- `LinkRelocApply` records survive as data. -- Resolution is structured as a function from linker inputs to an image. - -The missing piece is a live link session that survives after initial JIT -mapping and can append newly compiled objects. - -## 5. Proposed internal model - -Add an internal session owned by `CfreeJit`: - -```c -typedef struct LinkSession LinkSession; - -struct CfreeJit { - Compiler* c; - LinkSession* link; - LinkImage* image; - CfreeExecMemRegion master; - ... 
-}; -``` - -`LinkSession` owns state that must outlive one `link_resolve` call: - -- the `Linker`-style input list -- a watermark for inputs already placed into the live image -- the global symbol hash -- per-input `InputMap` entries -- append cursors for each segment class -- executable-memory capacity and committed ranges -- durable relocation records -- optional relocation indexes, introduced in later phases - -`LinkImage` remains the read-side view consumed by JIT inspection, -debugging, and DWARF. In v1 it can still hold the arrays directly, but they -must become growable or session-backed: - -- symbols: appendable -- sections: appendable -- segments: fixed count where possible -- relocations: appendable -- debug input list: appendable - -The simplest v1 should keep the same segment classes as normal JIT layout: - -- RX -- R -- RW -- TLS - -Instead of creating new segments for every append, the JIT reserves one -larger contiguous master region up front, then commits pages as appended -sections land. - -## 6. Address stability - -Append-only incremental link has one hard invariant: - -> Once a runtime address is observable, it never changes. - -Observable addresses include: - -- `cfree_jit_lookup` results -- breakpoint locations -- return addresses on the worker stack -- addresses shown by `info functions` -- addresses captured by host code that called into the JIT -- DWARF PC ranges already handed to debugger consumers - -Therefore the extend step (`link_session_extend`, §13) never compacts, -reorders, or re-lays out old sections. It only appends new sections at -segment append cursors. 
- -Add a JIT option internally: - -```c -typedef struct LinkJitReserveOptions { - uint64_t reserve_rx; - uint64_t reserve_r; - uint64_t reserve_rw; - uint64_t reserve_tls; -} LinkJitReserveOptions; -``` - -For `dbg`, default to a conservative fixed budget, for example 64 MiB RX -and smaller R/RW budgets. The exact number should be target/host tunable, -but the first implementation can choose a simple constant. - -The reservation model: - -1. Reserve one contiguous master VA range large enough for the initial - image plus append budgets. -2. Lay out initial segments at the front of each class range. -3. Copy and relocate initial bytes. -4. Protect committed pages. -5. Keep uncommitted slack inaccessible. -6. On append, commit the pages covering newly used ranges, write bytes, - apply relocations, flush icache, then protect RX pages. - -This keeps AArch64 branch and ADRP proximity behavior predictable because -old and new code live in one planned range. - -## 8. Append flow - -Debugger-side flow: - -```text -REPL snippet - -> compile as C input through the existing frontend - -> ObjBuilder - -> cfree_jit_append_obj(jit, obj) - -> update dbg's DWARF/JIT view bindings -``` - -Link-side flow: - -```text -link_session_add_obj(session, obj) - -> read new input summaries - -> resolve new definitions and undefs - -> place new sections at append cursors - -> assign vaddrs for new symbols - -> synthesize any needed GOT/stub/helper sections - -> emit relocation-apply records for new sections - -> write new section bytes into the live mapping - -> apply new relocations - -> publish new symbols -``` - -Only new sections and new relocation records are processed during append. -Old relocation records remain valid but are not revisited. - -## 9. Symbol resolution - -New inputs resolve against: - -1. Existing global definitions in the live image. -2. New definitions from this append batch. -3. The registered external resolver. -4. 
Archive members, if archive support is enabled for incremental sessions. - -v1 should probably skip archive reselection for REPL snippets. The initial -debuggee link can include archives as normal, but appending a snippet should -resolve against the already-linked result plus the external resolver. That -keeps the first cut smaller. - -Duplicate definitions: - -- A new strong definition of an existing strong global is an error. -- A new weak definition of an existing global is ignored for global - resolution but remains present as an object-local symbol. -- A new strong definition can satisfy prior unresolved weak references only - if those references belong to new code. Old code is not repatched in - append-only mode. - -The last point is deliberate. If old code needs to start calling a new -definition, that is replacement/patching territory and belongs in hot -reload. - -## 10. Relocations - -The append pass emits `LinkRelocApply` records for new sections only. -Each record must include enough information to reapply the relocation later: - -- write location as image vaddr -- write width -- relocation kind -- target `LinkSymId` -- addend -- owning input and section - -For the live JIT mapping, relocation application translates image vaddr to -the write alias, computes target runtime address or image address according -to the relocation kind, writes the bytes, and flushes icache for executable -patches. - -Old relocations are not re-run. New code can refer to old code. Old code -does not learn about new code unless it already had an indirect call through -some user-controlled data slot. - -## 11. Debug info and JIT view - -`cfree_jit_view` currently builds a borrowed object view lazily from debug -inputs. Append-only linking needs invalidation: - -- Every append increments `jit->generation`. -- The cached view records the generation it was built for. -- `cfree_jit_view` rebuilds when generations differ. 
-- `CfreeDebugInfo` attached to a `CfreeJitSession` must be refreshed after - append. - -For `dbg`, the REPL can handle this directly: - -1. Append object. -2. Drop the old `CfreeDebugInfo`. -3. Call `cfree_jit_view`. -4. Open a new `CfreeDebugInfo`. -5. Attach it to the existing session. - -The worker should be stopped while this happens. That keeps debugger state -single-threaded and avoids racing line-table replacement with a running -thread. - -## 12. Breakpoints - -Existing breakpoints remain valid because old addresses remain valid. - -New breakpoints can be set against appended symbols after the append -publishes the symbol table. Breakpoint specs by name should resolve through -the normal `cfree_jit_lookup` path. - -If a source-level breakpoint was pending by file/line and the file was not -covered before the append, `dbg` can either: - -- leave it unresolved until the user retries, or -- maintain pending source breakpoint specs and arm them after each append. - -The second behavior is nicer but not required for the first linker cut. - -## 13. API sketch - -Keep the first surface private or experimental: - -```c -int cfree_jit_append_obj(CfreeJit*, CfreeObjBuilder*); -uint64_t cfree_jit_generation(CfreeJit*); -``` - -Internally this maps to: - -```c -LinkSession* link_session_from_initial(Linker*, LinkImage*); -void link_session_set_reserve(LinkSession*, const LinkJitReserveOptions*); -void link_session_add_obj(LinkSession*, ObjBuilder*); -void link_session_extend(LinkSession*, LinkImage*); -``` - -Once the REPL path is proven, the public API can be made more general: - -```c -typedef struct CfreeJitAppendOptions { - CfreeObjBuilder* const* objs; - uint32_t nobjs; -} CfreeJitAppendOptions; - -int cfree_jit_append(CfreeJit*, const CfreeJitAppendOptions*); -``` - -## 14. Failure behavior - -Append should be transactional from the user's point of view: - -- If compile fails, the JIT is unchanged. -- If symbol resolution fails, the JIT is unchanged. 
-- If reservation capacity is exhausted, the JIT is unchanged. -- If relocation application fails, the new allocation is not published. - -Implementation detail: pages may have been committed before a late failure. -That memory can stay reserved and unused, but symbols must not become -visible and append cursors must roll back. - -Use a small append transaction: - -```c -typedef struct LinkAppendTxn { - uint32_t old_nsyms; - uint32_t old_nsections; - uint32_t old_nrelocs; - uint64_t old_rx_cursor; - uint64_t old_r_cursor; - uint64_t old_rw_cursor; - uint64_t old_tls_cursor; -} LinkAppendTxn; -``` - -No VLA. The transaction hangs off the link session or the stack as a fixed -struct. - -## 15. Test plan - -Targeted tests: - -- Link unit: initial object plus appended object where appended code calls - an initial function. -- Link unit: appended duplicate strong definition fails without changing - existing lookup results. -- Link unit: appended object with unresolved symbol fails transactionally. -- JIT unit: `cfree_jit_lookup` sees appended function and old function - addresses are unchanged. -- JIT unit: `cfree_jit_addr_to_sym` maps PCs in both initial and appended - code. -- Debug smoke: scripted `cfree dbg`, append `twice`, set breakpoint on it, - call it, observe stop. -- Debug smoke: append code with `-g`, refresh DWARF, `b file:line` inside - appended code. - -Prefer narrow tests by target/arch. AArch64 JIT on the host is enough for -the first debugger path; ELF/Mach-O file emission should not be in scope. - -## 16. Implementation sequence - -1. Convert `CfreeJit` to retain a link session or enough linker state to - append. -2. Reserve JIT slack and track append cursors. -3. Implement append placement for RX/R/RW sections without archives. -4. Apply new relocations into live mappings. -5. Grow JIT symbol lookup, addr-to-symbol, and iteration. -6. Invalidate/rebuild `cfree_jit_view` by generation. -7. Add `dbg` REPL command for full C snippets. -8. 
Add a small `call` command or equivalent helper for invoking appended - functions. - -The first usable milestone is: append a function in `dbg`, call it, and set -a breakpoint in it without changing any original address. diff --git a/doc/INCREMENTAL_OBJLINK.md b/doc/INCREMENTAL_OBJLINK.md @@ -1,786 +0,0 @@ -# File-based incremental linking — obj + link internals - -Status: design draft. Scope: the `src/obj` and `src/link` substrate that lets a -file-based rebuild patch a prior image instead of relinking from scratch, plus -the public interface a build system consumes. This is **distinct from** the two -existing incremental docs and is sequenced under neither: - -- `doc/INCREMENTAL_LINK.md` — append-only growth of a *live JIT image* for - `cfree dbg`. In-process, never patches existing code. We reuse its machinery - (append cursors, durable reloc records) but target on-disk rebuilds. -- `doc/HOT_RELOAD.md` — replacing a *running* function body in a live process. - Shares the "indirection cell" idea (see §8) but is a different consumer. - -The build graph, compile cache, dependency scanning, and daemon/watch modes are -**out of scope** here — they live in the separate build-system plan and consume -the interface in §16. This document is deliberately about the linker and object -internals those layers stand on. - ---- - -## 1. Scope & non-goals - -**In scope.** Make the obj/link layers able to: -- give every object and every function/data *atom* a stable **content identity**; -- persist a prior link's placement + relocation state as side-band data; -- on a changed input, **re-resolve and patch only the changed atoms**, keeping - every unchanged address byte-stable; -- detect when a change is *not* provably local and **fall back to a full link**; -- expose all of the above through a small public API. - -**Non-goals (here).** -- Build graph / DAG, compile cache, header-dependency scanning, `cfree serve` - daemon, watch mode — separate build-system plan (consumes §16). 
-- Cross-TU / whole-program optimization (ThinLTO-style). Incremental link is a - `-O0`/`-O1` *dev* feature; release builds always full-link, clean. -- Reclaiming dead patch space within a session (we recycle via a free-list but - do not compact; a clean link reclaims). -- Mach-O / COFF in-place patching in v1 (ELF first — see §14). - ---- - -## 2. Target & cost model - -**"Instant" defined.** After editing one translation unit in a project of *N* -TUs, the *link* cost should be `O(changed atoms + their relocations)`, not -`O(whole program)`. Compile cost is the build system's problem (cache); this doc -makes the *link* incremental. - -**Where link time goes today.** `link_resolve` (`src/link/link_layout.c:1212`) -is six whole-program phases: - -| Phase | Function | Cost | -|---|---|---| -| 1 Symbol resolve | `link_resolve_symbols` (`link_resolve.c:228`) | `O(Σ symbols)` — one global `SymHash` | -| 1b GC | `link_gc_compute` | `O(sections + relocs)` BFS, no delta-marking | -| 2 Layout | `link_layout_sections` (`link_layout.c:209`) | `O(total_kept)`; **any size change shifts all downstream vaddrs** (`link_layout.c:340-348`, no slack) | -| 2c Bytes | `link_emit_segment_bytes` (`link_layout.c:1050`) | `O(Σ bytes)` into one monolithic per-segment buffer | -| 3 Vaddr | `link_assign_symbol_vaddrs` (`link_reloc_layout.c:40`) | `O(Σ symbols)` | -| 4 Relocs | `link_emit_relocations` (`link_reloc_layout.c:1227`) | `O(Σ relocs)` | -| 6 Emit + id | format emitter + `link_image_id_compute` (`link_image_id.c:31`) | `O(output)`; FNV-1a over **all** segment bytes *and* vaddrs | - -Plus: `link_resolve_at`/`link_resolve_extend` are panic stubs -(`link.c:629,638`); the GOT is one exactly-sized segment placed after everything -(`link_reloc_layout.c:710-748`); relocations are applied **destructively** into -`segment_bytes` at emit (`src/obj/elf/link.c:318-470`) — but the -`LinkRelocApply` records that *produce* those writes are preserved as data first -(invariant, internal 
`src/link/link.h:234-246`). - -**Benchmark shape.** The substrate must model (a) archive members as link inputs -and (b) one edited TU fanning out to multiple final images. The acceptance suite -uses a small **synthetic** multi-TU fixture for exactly this (§19.2) — a handful -of core TUs archived into a static library, linked into two executables that -share it — rather than vendoring a third-party project as a test dependency. -Real codebases (amalgamations like sqlite, multi-TU libraries like lua) remain -useful as *later* wall-clock perf targets, but are deliberately not test deps. - ---- - -## 3. Design principles - -1. **Provable locality, else fall back.** Reuse is correct only when the change - cannot alter symbol resolution (mold's cascading-effects argument). The full - link is always available and always correct; the incremental path is an - accelerator gated on a soundness check (§7.3). A correct-but-slow result - always beats a fast-but-wrong one. -2. **Address stability is the bedrock.** Once a vaddr is published it never - moves. Unchanged atoms keep their bytes *and* addresses, so their relocations - never reapply — this is what makes a patch `O(change)`. Enforced by - overwrite-in-slack / append-to-free-slot, **never compact**. -3. **Content-hash keying, not transient IDs.** `LinkInputId`/`LinkSymId` are - stable only *in-process* (`link.h:240-241`); a file rebuild allocates a fresh - `Linker`. So persisted state is keyed by **content hashes** and **symbol - names**, never by re-derived IDs (§10). This makes determinism a dedup - *nicety*, not a correctness *requirement*. -4. **Relocation location is relative, target is symbolic.** Persist a reloc as - `(atom, offset-within-atom, kind, target-name, addend)`. Derive the absolute - write address and target address from *current* placements at apply time. - Moving an atom then needs **zero reloc rewriting** — placements change, the - reloc re-derives. 
(Closes the "rewrite `write_vaddr` on move" hazard.) -5. **The move-on-grow primitive is swappable.** Everything else (atoms, slack, - free-list, persisted session, soundness gate) is independent of *how a caller - reaches a moved callee*. Ship **thunk-on-grow** first (no codegen change), - converge on **GOT-cell** later to share one mechanism with hot reload (§8). -6. **Frontend-agnostic.** All work attaches at the shared `ObjBuilder` boundary - (`obj_finalize`, `src/api/compile.c:356`). C, Toy, asm, and WASM all benefit - with no per-frontend code beyond a tiny capability (§15). -7. **Project rules.** No VLAs; no global state (everything hangs off - `Compiler`/`LinkSession`); multi-arch/multi-format behind the existing - `ArchImpl`/`ObjFormatImpl` vtables; determinism preserved on the full-link - path. - ---- - -## 4. What exists vs what is new - -| Need | Status | Where | -|---|---|---| -| Durable, non-destructive reloc records | **exists** | `LinkRelocApply`, internal `link.h:234-246`, `link_internal.h:129` | -| Stable IDs *within a link* | **exists** (in-process only) | `link.h:240-241` | -| Per-input id translation | **exists** | `InputMap`, `link_internal.h:21` | -| Atom-level GC granularity | **exists** | `InputMap.section_atom_*`; atoms placed individually `link_layout.c:282-284` | -| Append cursors + reserved slack | **exists (JIT only)** | `link_jit.c:111-114`; **panics** on exhaustion `link_jit.c:1080` | -| Apply one reloc to a live mapping | **exists (JIT)** | `cfree_jit_append_obj` path, `reloc_apply.c` | -| AArch64 call-stub template | **exists (JIT only, off for static exe)** | `link_layout_jit_stubs` `link_reloc_layout.c:429` | -| GOT slot machinery | **exists, but only for GOT-reloc kinds** | `link_layout_got` `link_reloc_layout.c:654`; `reloc_uses_got` `:376` | -| BLAKE2b CAS blob/tree store | **exists** | `driver/dist`: `cas.c`/`blob.c`/`tree.c`, `DIST_BLAKE2B_LEN=32` | -| Dependency iteration (C includes) | **exists** | 
`cfree_dep_iter_new/next` `src/api/compile.c:417-462`; `cc_dep_finish` `cc.c:2121` | -| `LinkSession` type | **new** (only sketched in docs; today bare fields on `CfreeJit`) | — | -| `link_resolve_extend` | **new** (panic stub) | `link.c:638` | -| Per-atom slack / free-list / overwrite / grow-relocate | **new** | — | -| Fall-back-instead-of-panic discipline | **new** (JIT preflight panics) | — | -| Object/atom content identity | **new** | — | -| Per-atom reloc & symbol indices | **new** (flat scans today: `obj_reloc_count` `obj.c:831`, `obj_symbol_find` `obj.c:528`) | — | -| Incremental (per-segment) build-id | **new** (FNV-1a is whole-image) | `link_image_id.c:31` | -| Move-on-grow primitive (thunk / GOT) | **new** (direct `R_AARCH64_CALL26` today) | — | - -The honest summary: durable relocs, stable in-process IDs, atom GC, and the JIT -append *placement* are reusable; **everything that makes a relink incremental — -content identity, slack/free-list, overwrite/grow, persistence, the soundness -gate, graceful fallback, and the move primitive — is net-new code.** - ---- - -## 5. The atom model (obj side) - -The minimal relocatable unit is an **atom**: one function or one data object. -cfree already carries atoms for GC; incremental link promotes them to the -patch unit. - -- Under `--incremental` (dev mode), frontends emit one section per - function/global (a `-ffunction-sections`/`-fdata-sections` equivalent) so each - atom is independently placeable. cfree already lays out kept atoms as - individual `LinkSection`s (`link_layout.c:282-284`). -- Each atom gets a **content id**: BLAKE2b over its canonical form — - `bytes || align || flags || canonical(relocs)`, where `canonical(relocs)` - encodes each reloc as `(offset-within-atom, kind, target-name, addend)`. - Target is the *name*, never a transient id (principle 3). -- The **object content id** is BLAKE2b over the atom-id list plus object-level - metadata (format, arch, ext flags). 
Two byte-identical compiles → identical - object id (modulo the determinism audit; see §12). - -### 5.1 obj internals additions - -```c -/* src/obj/obj.h — new */ -typedef struct ObjAtomId { u32 v; } ObjAtomId; /* 0 = none */ - -/* Deterministic content identity over the canonical form above. */ -void obj_atom_content_id(ObjBuilder*, ObjAtomId, u8 out[DIST_BLAKE2B_LEN]); -void obj_content_id(ObjBuilder*, u8 out[DIST_BLAKE2B_LEN]); - -/* O(1) per-atom lookups (today both are linear scans). */ -const Reloc* obj_atom_reloc_first(ObjBuilder*, ObjAtomId, ObjRelocCursor*); -ObjSymId obj_symbol_by_name(ObjBuilder*, Sym name); /* hash, not O(nsyms) */ -``` - -Required obj changes: -1. **Per-atom reloc index.** Today `obj_reloc_count` scans the flat reloc table - (`obj.c:831`). Add a per-atom reloc list so "relocs touching atom A" is `O(1)`. -2. **Symbol-by-name hash.** `obj_symbol_find` is `O(nsyms)` (`obj.c:528`). Add a - name→`ObjSymId` hash on the builder. -3. **Deterministic, lossless serialize/deserialize.** A cached/persisted object - must rehydrate identically: atoms, relocs, COMDAT groups, debug sections, and - format `ext_type/ext_flags` (round-trip-safe today per survey) all preserved. - This is the cache-value contract the build system relies on (§16). -4. `obj_finalize` (`obj.c`, currently reserved/empty) is the natural place to - compute and memoize the content ids once a TU is built. - ---- - -## 6. The `LinkSession` (link side) - -A new type that owns the state that must outlive one `link_resolve` and can be -persisted. It generalizes the per-segment cursor/slack fields currently inlined -in `struct CfreeJit` (`link_jit.c:92-114`) and adds overwrite, free-list, grow, -and graceful fallback. - -```c -/* src/link/link_session.h — new. Hangs off Compiler; no global state. 
*/ -typedef struct LinkFreeList LinkFreeList; /* gold-style two-level free list */ - -typedef struct LinkAtomPlace { /* one per placed atom */ - u8 content_id[DIST_BLAKE2B_LEN]; /* §5 atom content id (the key) */ - u64 vaddr; /* published address — STABLE */ - u64 file_offset; - u32 size; - u32 capacity; /* size + reserved slack */ - u32 seg_bucket; /* SEG_RX/R/RW/TLS */ - /* relocs stored relative: (offset-within-atom, kind, target_name, addend) */ -} LinkAtomPlace; - -typedef struct LinkSession { - Compiler* c; - Linker* l; /* stable LinkInputId -> ObjBuilder* */ - LinkImage* img; /* now MUTABLE-by-patch */ - u64 cursor[SEG_NBUCKETS]; /* append cursor per class (from JIT) */ - u64 limit[SEG_NBUCKETS]; /* reserved ceiling per class */ - LinkFreeList free[SEG_NBUCKETS]; /* vacated slots, first-fit reuse */ - u32 code_slack_pct; /* per-code-atom reserve; modest (code can relocate) */ - u32 data_slack_pct; /* per-data-atom reserve; generous (data-grow forces fallback) */ - /* atom placement table, keyed by content_id; the persisted core (§10) */ - LinkAtomPlace* atoms; u32 natoms; -} LinkSession; - -/* Fixed-size transaction watermark (no VLA). */ -typedef struct LinkPatchTxn { - u32 old_natoms, old_nsyms, old_nsections, old_nrelocs; - u64 old_cursor[SEG_NBUCKETS]; - /* free-list undo log handle */ -} LinkPatchTxn; -``` - -`LinkImage` stays the read-side view for inspection/DWARF/emit, but its symbol, -section, and reloc vectors become append/overwrite-capable (they already grow on -the JIT path). - ---- - -## 7. Incremental resolve & the soundness gate - -Implement `link_resolve_extend` (`link.c:638`, panic stub today) in two stages. - -### 7.1 Stage A — append-only (sound subset, first milestone) -New inputs that only *add* definitions, resolving against the existing image + -external resolver. 
This is exactly the JIT append model and reuses its cursor + -slack *placement* (`link_jit.c` append path) — but writing to a file image and, -critically, **falling back instead of panicking** on exhaustion. - -### 7.2 Stage B — patch changed atoms -For a changed input, diff its atoms (by content id) against the persisted -placement table: -- **Unchanged atom** (id matches): keep placement, keep bytes, **do nothing** — - its relocations are never revisited. -- **Changed atom, fits capacity**: overwrite bytes in place; reapply only *its* - relocations (re-derived from current placements, §9). -- **Changed atom, grows past capacity**: allocate a new slot (free-list, else - bump `cursor[seg]`), return the old slot to the free-list, write bytes, and - make callers reach it via the move primitive (§8). Reapply its own relocations. -- **New atom**: place at cursor/free-list; resolve & apply its relocations. -- **Removed atom**: return its slot to the free-list; drop its symbols. - -### 7.3 The soundness gate — fall back to full link when -Compute the changed object's **interface** = { defined global names + bindings, -COMMON sizes/aligns, set of undefs }. The edit is *local* only if the interface -is identical to the persisted one **and** no archive pull-in changes. Otherwise -fall back. Triggers (grounded in `link_resolve.c`): -1. **Symbol-set / binding change** — added/removed global, weak↔strong flip: - changes global resolution (`bind_strength`) and which archive members pull. -2. **Archive pull-in change** — a new undef now selects a `.a` member that was - not in the prior link (`link_ingest_archives` is greedy single-pass). -3. **COMDAT ownership** — COFF SELECTANY keeps the *earlier* winner - (`link_resolve.c:308-323`). If the edited TU is the winner and its group body - changed, patch the shared body; if it is a loser, no-op; if ownership would - flip, fall back. COMMON size/align merge (`:288-303`) changing → fall back. -4. 
**TLS size change** — boundary syms `__tdata_start/end`, `__tbss_size` - (`link_layout.c`) shift if any TLS section resizes → fall back. -5. **Import-set change** (PLT/`.got.plt`/dynamic) → re-synthesize via - `fmt->layout_dyn` → fall back. -6. **Slack/free-list exhaustion** in any segment → fall back. -7. **Layout-affecting flags / linker script / `--gc-sections` / `-r` / LTO** → - full link (GC liveness is whole-graph; incompatible, as in gold). - -On fallback, **discard the half-mutated session** (the `LinkPatchTxn` watermark -rolls back `cursor[]`/free-list/append counts) and run a normal full link, which -because objects are resident is already far cheaper than a cold `cfree ld`. - -The JIT's duplicate-global *preflight* is the precedent for the gate — but it -**panics**; converting "detect non-local" into "roll back + full link" is new -control flow. - ---- - -## 8. Placement, slack, and the move-on-grow primitive - -**Slack.** Today sections are contiguous with only alignment padding -(`link_layout.c:340-348`). Under `--incremental`, reserve per-atom slack so -overwrite-in-place is the common case. **Code and data get separate, tunable -budgets** (decision §20.2): `code_slack_pct` is modest because code atoms can -relocate cheaply (§8.1), while `data_slack_pct` is more generous because a data -atom that outgrows its slot forces a full-link fallback (data can't be thunked). -Both default sensibly and are overridable via a link option (gold's -`--incremental-patch=n` style). A two-level free-list (one of free file blocks, -one per segment bucket) recycles vacated slots, first-fit. - -**The move primitive — swappable.** When an atom moves, callers must still reach -it without their bytes changing. Abstract this as one hook with two -implementations; the rest of the design is identical either way. 
- -```c -/* src/link/link_move.h — the only A/B difference */ -typedef struct LinkMoveOps { - /* make all references to `atom` reach its NEW vaddr, without touching callers */ - void (*atom_moved)(LinkSession*, LinkAtomPlace* atom, u64 new_vaddr); -} LinkMoveOps; -``` - -### 8.1 Thunk-on-grow (ship first) — `LinkMoveOps` = thunk -Calls stay **direct** (`R_AARCH64_CALL26`, x64 `PLT32`, RV `CALL` — what codegen -emits today; cross-TU is direct per `src/obj/macho/link.c:537`, -`src/obj/elf/link.c:251`). On a move, leave a **jump island** at the atom's *old* -slot pointing to the new location. Callers branch to the old address as before → -hit the island → jump on. Properties: -- **No codegen change.** Pure linker. Reuses the `link_layout_jit_stubs` - (`link_reloc_layout.c:429`) island shape as a template (per arch: aa64 jit/iplt - stub, x64 iplt stub `src/obj/x64/link.c:40`, rv64 trampoline). -- **Reachability is free**: callers already branched directly to the old slot, - so the island there is in range by construction. -- **Tax**: an extra jump *only for functions that moved* (one island per - function that ever grew, re-pointed on subsequent grows). Unmoved functions - pay nothing. -- **Data caveat**: a thunk redirects code only. A grown *global* that must move - cannot be thunked. v1 rule: give data atoms generous slack and **fall back to - full link if a data atom outgrows its capacity** (never move data). This keeps - the thunk path entirely codegen-free. - -### 8.2 GOT-cell (convergence target) — `LinkMoveOps` = got -Under `--incremental`, codegen emits cross-unit calls (and movable-data loads) -through a GOT cell (aa64 `ADRP+LDR+BLR`, x64 `call *cell(%rip)`, rv64 -`auipc+ld+jalr`). A move updates **one** cell. Properties: -- **Per-arch codegen change** (instruction selection + reloc kinds) for calls - *and* data — `reloc_uses_got` (`link_reloc_layout.c:376`) currently lists only - GOT-relative kinds, and `link_layout_got` only allocates slots for those. 
-- **Tax**: one extra indirect load on *every* cross-unit reference, uniformly. -- **GOT growth**: a new cross-unit target adds a slot, but the GOT is a single - exactly-sized segment at the image end (`link_reloc_layout.c:710-748`). Needs - **reserved GOT slack + a GOT free-list**, with fall-back on exhaustion — - otherwise adding a slot moves the GOT and breaks stability for existing slots. -- **Strategic upside**: it is the *same* primitive `doc/HOT_RELOAD.md §7` assumes - ("one slot per function changes; call sites not patched"). One GOT-cell-update - mechanism would then serve JIT hot reload *and* file incremental link. - -**Why thunk-first.** Thunk taxes only what moves and needs zero codegen, so it -proves the slack/free-list/persistence/soundness machinery end-to-end fastest. -The free-list, slack, session, and gate are reused verbatim when we later swap in -GOT cells; only `LinkMoveOps` changes. Converge on GOT when hot reload needs it. - ---- - -## 9. Relocation reuse & application - -`LinkRelocApply` records are durable data, never burned into bytes before emit -(invariant `link.h:234-246`). Incremental link leans on this hard. - -- **Relative + symbolic form.** Persist each reloc as `(atom_content_id, - offset_within_atom, kind, target_name, addend)`. At apply time the absolute - write address is `atom.vaddr + offset_within_atom` and the target address is - the *current* placement of `target_name`. **An atom that moves needs no reloc - rewriting** — both addresses re-derive from current placements (principle 4). -- **Reapply only the changed atom's relocs**, found via the new per-atom index - (§5.1). Unchanged atoms' relocs are never touched. -- **Apply path.** File emit currently writes relocations destructively into - `segment_bytes` at emit (`src/obj/elf/link.c:318-470`). 
For patching we apply a - single atom's relocs into its (possibly newly placed) bytes using the same - `reloc_apply.c` kind dispatch the JIT uses, then re-emit only the changed - segment ranges. Per-arch reloc kinds already flow through `reloc_apply.c`. - ---- - -## 10. Persisted incremental state - -Side-band, content-addressed — **not** gold's ELF-embedded `.gnu_incremental_*` -sections (those are ELF-only; we are multi-format). Store one blob in the -existing `driver/dist` CAS (`dist_cas_put_blob`, BLAKE2b) keyed by the link -action id (a build-system concern; §16). The blob records, per input and per -atom: -- object content id + atom content ids (the diff keys); -- `LinkAtomPlace` table: vaddr / file_offset / size / capacity / bucket; -- symbol → vaddr bindings, keyed by **name**; -- relocations in the relative+symbolic form of §9; -- free-list state and per-segment cursors/limits. - -Because everything is content/name-keyed, reloading does **not** depend on a -fresh `Linker` re-deriving identical `LinkSymId`s. The determinism audit (§12) -becomes a dedup optimization, not a correctness gate. We still add a cheap guard: -on reload, verify each referenced object blob's BLAKE2b matches its recorded id -before trusting it (defends against a torn/garbage cache entry). - ---- - -## 11. Image identity / build-id - -`link_image_id_compute` (`link_image_id.c:31`) is FNV-1a streamed over **every** -segment's vaddr + file_size + bytes — `O(image)` and not incrementally -updatable. For patching, compute a **per-segment subhash** and combine them -(Merkle-style) into the image id, so a patch re-hashes only changed segments. -Note this hash is FNV-1a and independent of the BLAKE2b used for content/CAS; -keep them distinct. - -Consequence (acceptable, document loudly): an incremental output is **not -byte-reproducible against a from-scratch full link** of the same sources — slack -padding and (under GOT mode) indirection differ. 
Release builds (`--incremental` -off) are canonical and reproducible. - ---- - -## 12. Address-stability & determinism invariants - -- **Stability (falsifiable):** after a patch, `nm`/`addr2line` on an *unchanged* - symbol must return the identical vaddr as before. Enforced by - overwrite-in-slack / append-to-free-slot, never compact. -- **Determinism (decision §20.4 — lock with a test, keep content-keying):** obj - emission is *already* byte-deterministic — sections/symbols/relocs emit in - insertion order, `.strtab` dedups by linear search, and there are no - timestamps, embedded addresses, hash-map iteration, or threading in the emit - path (`src/obj/elf/emit.c:298,386,505`). Lock this with a regression test (two - compiles ⇒ identical bytes), which enables cross-machine / shared-cache dedup. - Content/name keying (§10) remains the *correctness* backbone: if a future - change ever reintroduces nondeterminism, it degrades dedup, never correctness. -- **Reloc re-derivation:** never store an absolute `write_vaddr`; always - `atom.vaddr + offset_within_atom` (principle 4). - ---- - -## 13. Debug info (DWARF) consistency - -**Decision §20.1: regenerate the changed TU's debug on any body change.** -Investigation (`src/debug/debug_emit.c:823-905`, `:650-703`) established that -debug update is **`O(changed TU)`, not `O(atom)`**, and that skipping it is -*incoherent*: - -- cfree emits **one monolithic `.debug_line` program per CU** and a single - `.debug_info` CU whose DIEs reference each other by intra-CU `DW_FORM_ref4` - offsets. You cannot splice one function's line rows or patch one subprogram DIE - in isolation — a change shifts subsequent offsets across the CU. -- A body change rewrites the instruction→line mapping **regardless of whether - the atom moved** (an in-slack overwrite has the same vaddr but different - instruction offsets). So "keep `.debug_line` because the address didn't change" - is wrong — there is no correct skip. 
-- Therefore: on any changed atom, **re-emit that TU's full `.debug_*`**. This is - `O(changed TU)`, which is fine — unchanged TUs' debug is byte-stable because - their atoms keep their addresses, and one TU's debug regen is cheap relative to - the rest of the patch. Address-bearing fields (`DW_AT_low_pc`, `.debug_aranges`, - `.debug_rnglists`) re-derive from current placements (§9); `DW_AT_high_pc` is a - size, not an address. -- `addr2line` and `cfree dbg` re-read debug from the patched image; the build-id - change (§11) is the staleness signal for file consumers. -- *Future option (not pursued now):* per-function CUs / split line programs would - make debug `O(atom)`, but the gain is marginal versus the per-TU regen cost. - ---- - -## 14. Multi-format & multi-arch - -- **No format is fundamentally unsuitable (decision §20.6); difference is - machinery, so: ELF first, then COFF, then Mach-O.** The atom + slack + - move-primitive core is format-agnostic; the persisted state is side-band CAS - for all three (§10). Per-format cost: - - **ELF — least machinery (first).** Relocations are side data (`.rela.*`) - applied at emit, the symtab is flat fixed-offset records (patch the changed - entry in place), section headers are file-only, signing is optional. An atom - move touches only its own symbol's `st_value` and the relocs targeting it. - - **COFF/PE — incremental-friendly (second).** PE is the canonical incremental - target (MSVC `/INCREMENTAL` + `.ilk`): imported calls indirect through the - IAT, base relocs are simple per-page RVA lists, debug lives in a separate - PDB, and Authenticode is optional. The practical gate is cfree's COFF - maturity, not the format. (Reasoned from the PE/MSVC precedent, not a code - dive of cfree's COFF link.) 
- - **Mach-O — heaviest but feasible (last).** `__LINKEDIT` folds loader fixup - metadata into compact whole-image structures co-designed with a *mandatory* - dyld: chained fixups (`LC_DYLD_CHAINED_FIXUPS`), the export trie, the - indirect symtab. Each needs its own incremental updater — but they are - **bounded, not `O(image)`**: a code-atom move updates only the chained-fixup - *slots pointing at it* (chain `next`-links don't move unless pointer slots - move), and the symtab patch is `O(changed symbols)` like ELF (an earlier - survey overstated these as whole-image). The one real floor is **mandatory - code signing** on Apple Silicon: every patch must re-sign, and the - CodeDirectory is a per-4KiB-page hash array — naively `O(image)`, but - cacheable to `O(changed pages)` by retaining unchanged pages' hashes. - - Until a format's updater lands, that format falls back to the fast in-process - full link. -- **Per-arch surface is small:** only (a) the move primitive's island/cell shape - and (b) the branch-into-island/cell reloc kind. aa64 has the jit-stub shape to - reuse; x64 (`src/obj/x64/link.c:40`) and rv64 each have a trampoline shape to - adapt. All reloc kinds already dispatch through `reloc_apply.c`. -- CI exercises the patch path on **ELF/aa64 + ELF/x64** first (per project - "narrow test runs"); rv64 and Mach-O/COFF follow. - ---- - -## 15. The frontend contract (shared across all frontends) - -All frontends converge to `ObjBuilder` and join the shared path at -`obj_finalize` (`src/api/compile.c:356`), so the incremental machinery attaches -once, frontend-agnostically. **Toy, asm, and WASM get incremental link with no -frontend-specific code.** The "clear expectations" are a small optional -capability plus four guarantees. 
- -```c -/* include/cfree/compile.h — optional addition to CfreeFrontendVTable */ -typedef struct CfreeFrontendCaps { - const char* frontend_id; /* "c" / "toy" / "asm" / "wasm" */ - u32 schema_version; /* bump on any codegen/output change */ - /* report external inputs read this compile (for the build system's key). */ - CfreeStatus (*report_deps)(CfreeFrontendState*, const CfreeFrontendCompileOptions*, - const CfreeSourceInput*, CfreeDepSink*); -} CfreeFrontendCaps; -``` - -The contract each frontend must honor to be incrementally safe: -1. **Deterministic output** — identical `(source, flags, target, deps)` ⇒ - byte-identical `ObjBuilder` (§12). -2. **Declared dependency set** — report every external input read. C reuses the - existing `CfreeDepIter` (`src/api/compile.c:417-462`); asm/Toy/WASM report - "none" (single-source TUs). -3. **Stable, source-derived symbol naming** — no run-varying temp names; atom - content ids depend on it (§5). -4. **Identity + version** — `frontend_id` + `schema_version` salt the - build-system key so any frontend change invalidates correctly. - -**Per-frontend cost:** C — low (wire `CfreeDepIter` + a version constant). asm, -Toy, WASM — trivial (no deps; version constant). - -**Toy's REPL wrinkle.** Toy's durable module (the existing `commit`/`abort` -hooks, `lang/toy/compile.c:215-223`) means the REPL path is *not* a pure function -of source. That path either folds the module snapshot into the input key or opts -out of caching; Toy's **batch/file** compile conforms like any other frontend. - ---- - -## 16. The interface boundary the build system consumes - -**Incrementality is not a parallel API — it is the existing `CfreeLinkSession` -made fully mutable.** A full link is the degenerate cold case (no prior state, -nothing replaced); an incremental relink seeds prior state and replaces the -changed inputs. 
The build system always drives *the same* session, and -`resolve` internally decides patch-vs-full and reports which — there is no -separate "incremental" entry point to keep in sync with the full-link path. This -matches the internal direction (`link_resolve` is "inputs → image"; -`link_resolve_at`/`extend`, `link.c:629,638`, make that `resolve` extend-capable). - -Changes from today's surface (`new`/`add_obj…`/`resolve`/`emit`/`jit`/`free`, -`link.h:189-207`) — all **additive**: - -```c -/* include/cfree/object.h — object identity */ -CfreeStatus cfree_obj_content_id(CfreeObjBuilder*, uint8_t out[CFREE_BLAKE2B_LEN]); - -/* include/cfree/link.h */ -typedef enum { CFREE_LINK_FULL, /* cold: no prior state */ - CFREE_LINK_PATCHED, /* incremental fast path applied */ - CFREE_LINK_FELL_BACK_FULL /* was incremental, gate forced full */ -} CfreeLinkOutcome; - -/* add_* gain a stable input handle so a live session can mutate one slot; - the handle is optional (NULL) for the cold/file path. */ -CfreeStatus cfree_link_session_add_obj(CfreeLinkSession*, CfreeObjBuilder*, - CfreeLinkInputId* out /*nullable*/); -CfreeStatus cfree_link_session_replace_input(CfreeLinkSession*, CfreeLinkInputId, - CfreeObjBuilder* changed); -CfreeStatus cfree_link_session_remove_input(CfreeLinkSession*, CfreeLinkInputId); - -/* seed prior incremental state (opaque bytes); unset/empty => cold full link. */ -CfreeStatus cfree_link_session_set_prior_state(CfreeLinkSession*, const CfreeSlice*); - -/* resolve() and emit() are the SAME calls, now incremental-aware: - resolve reconciles the CURRENT input set against prior state BY CONTENT (§10) - — unchanged atoms reuse placement, changed atoms patch, non-local edits fall - back to a full re-resolve. Idempotent: safe to re-call after mutations. 
*/ -/* CfreeStatus cfree_link_session_resolve(CfreeLinkSession*); (existing) */ -/* CfreeStatus cfree_link_session_emit(CfreeLinkSession*, CfreeWriter*); (existing) */ - -/* emit the new persisted incremental state (opaque); query the last outcome. */ -CfreeStatus cfree_link_session_serialize_state(CfreeLinkSession*, CfreeWriter*); -CfreeStatus cfree_link_session_outcome(CfreeLinkSession*, CfreeLinkOutcome* out); -``` - -- **Cold full link (today, unchanged):** `new → add_obj… → resolve → emit`. -- **Incremental relink:** `new → set_prior_state(blob) → add_obj…/replace_input → - resolve → emit + serialize_state`, then read `outcome`. - -Because reconciliation is **by content hash** (§10, decision §20.4), the -cross-process path needs no stable-id continuity: a fresh session seeded with -prior state matches re-added inputs to prior placements by content. -`replace_input`/`remove_input` are a live-session (daemon) convenience for -slot-precise mutation; they still match by content underneath. - -**Ownership (decision §20.5):** the build system owns the persisted blob's key, -CAS storage, and lifetime; the session only reads it via `set_prior_state` and -writes it via `serialize_state` as opaque bytes through `CfreeWriter`. libcfree -does no file IO/CAS (both are driver-only — `driver/dist`; libcfree reads bytes -via `Compiler.env->file_io`, `src/link/link.h:150`). - ---- - -## 17. Failure behavior (transactional) - -A patch is all-or-nothing from the consumer's view: -- compile/resolve failure, gate fallback, slack exhaustion, or reloc-apply - failure ⇒ the image is unchanged (or a clean full link is produced). -- Pages/bytes may have been written before a late failure; the `LinkPatchTxn` - watermark rolls back `cursor[]`, the free-list undo log, and the - atom/symbol/section/reloc counts so no partial result is published. - ---- - -## 18. 
Implementation sequence (acceptance-test-first, red → green) - -The **first phase builds the acceptance suite** (§19), which encodes "done for -ELF" as an executable spec. It starts fully red; each milestone below drives a -named set of its scenarios green. Each milestone also has its own narrow -red-green unit cycles (listed inline); the acceptance scenarios are the -integration capstones. - -**Phase 0 (first) — author the ELF acceptance suite, RED.** -Land the additive public surface (§16) as not-implemented stubs — -`_set_prior_state` / `_replace_input` / `_remove_input` / `_serialize_state` / -`_outcome`, the input-id out-param on `add_obj`, the `CfreeLinkOutcome` enum, and -`cfree_obj_content_id` — each returning a "not implemented" status, with -`resolve`/`emit` initially doing only the cold full link (mirroring the existing -`link_resolve_at`/`extend` panic stubs, `link.c:629,638`), so the suite compiles -and links. Then write `test/link-incremental/` with the §19 scenarios A–F, the -synthetic fixture build, and the `link_resolve` whole-program instrumentation. Every -scenario is red. This nails the spec before any implementation and is the red -baseline. (No parallel "incremental" surface — §16: the one mutable session.) - -**M0 — atom identity & obj indices (no behavior change).** -`obj_content_id` / `obj_atom_content_id`, per-atom reloc index, symbol-by-name -hash, deterministic round-trip, `CfreeFrontendCaps` (C deps via `CfreeDepIter`; -trivial for others). → **turns Scenario E (determinism) green** and provides the -`obj_content_id` the harness keys on. Narrow: one-byte body edit ⇒ exactly that -atom's content id changes, others stable. - -**M1 — `LinkSession` + append-only extend (Stage A).** -Introduce `LinkSession`; implement the append-only subset of -`link_resolve_extend` against a file image, reusing JIT cursor/slack *placement* -but **falling back, not panicking**; persisted-blob round-trip (§10). → **turns -Scenario F (no-op relink) green**. 
Narrow: appended object whose code calls an -initial function links; appended duplicate-strong-def falls back (not panic); -unresolved ref is transactional (image unchanged). - -**M2 — patch changed atoms in slack (Stage B, no move yet).** -Per-atom diff, overwrite-in-slack, reapply the changed atom's relocs (§9), -per-segment build-id (§11), regenerate the changed TU's debug (§13), the -soundness gate + transactional rollback (§7.3, §17). Atoms that would grow past -capacity fall back here (no move primitive yet). → **turns Scenarios A (in-slack -edit), C (fallback), and D (multi-output) green.** - -**M3 — move-on-grow via thunk (`LinkMoveOps` = thunk).** -Free-list, relocate grown code atoms, jump islands (reuse the -`link_layout_jit_stubs` shape, `link_reloc_layout.c:429`), separate code/data -slack with data-grow → fallback. → **turns Scenario B (grow past slack) green.** -At the end of M3 the full §19 suite is green on **ELF/aa64 + ELF/x64 — this is -"done for ELF."** - -**M4 (deferred) — converge on GOT-cell (`LinkMoveOps` = got).** -Built when hot reload is scheduled (decision §20.3), designed to serve both -paths: a `--incremental` codegen mode for cross-unit calls + movable data, -reserved GOT slack + free-list. Not required for the §19 suite. - -Then COFF, then Mach-O updaters (§14); the rv64 patch path follows aa64/x64. - ---- - -## 19. ELF definition of done: outcome & acceptance suite - -This is the executable specification authored **first** (Phase 0, §18) and the -north-star M0–M3 drive to green. ELF/aa64 + ELF/x64 only (per the "narrow runs" -rule). It is `test/link-incremental/`. 
- -### 19.1 Outcome — what a fully-built ELF implementation produces - -Under `--incremental` (`-O0`/`-O1`), the build system seeds the session with prior -state, replaces the changed input(s) (`cfree_link_session_replace_input`, or -re-adds inputs on the cold path), then calls the same `resolve` → `emit` + -`serialize_state` (§16): - -- **In-slack body edit:** the changed atom's bytes overwrite in place, only its - relocs re-derive/reapply, the changed TU's `.debug_*` regenerates, only the - changed segment's build-id subhash recomputes. Outcome `CFREE_LINK_PATCHED`; - every other atom and every unchanged vaddr is byte-identical. -- **Grow past slack:** the atom relocates to a free-list slot, a jump island is - left at its old address, **callers' bytes do not change**; only the moved - atom's relocs re-derive. Still `PATCHED`, still `O(change)`. -- **Non-local edit:** added/removed global, weak↔strong flip, new archive - pull-in, COMDAT-ownership flip, TLS/import-size change, or slack exhaustion - (incl. any data-atom grow) ⇒ `CFREE_LINK_FELL_BACK_FULL`: a correct full - in-process link. **Never a wrong binary.** - -Guarantees: address stability (unchanged symbols keep their vaddr), debug -correctness after a patch (`addr2line`/`cfree dbg`), byte-deterministic objects -(release builds with `--incremental` off are the reproducible artifact), and -consistent multi-output (a core-TU edit patches/falls-back for both apps). Cost: -a one-function in-slack edit takes the link from `O(all objects)` to `O(one atom -+ its relocs + one TU's debug + one segment rehash)` — for the synthetic fixture -(§19.2), from "relink both executables over the whole archive" to "patch one -atom." - -### 19.2 Harness & instrumentation - -- Run the patched binary via `test/lib` `exec_target`/`exec_kernel`. -- Instrument the whole-program `link_resolve` entry with a counter dumped to a - file (read it back; don't re-run). A `PATCHED` outcome must **not** increment - it; a fallback must. 
-- **Fixture — a synthetic multi-TU codebase** under - `test/link-incremental/fixture/` (hand-written, deterministic, no third-party - deps; freestanding-style apps that return a *computed* status, like the smoke - harness, so the only archive in play is the fixture's own). Five core TUs - archived into `libcore.a`, linked into two executables that share it: - - `arith.c` — leaf callees `arith_add` / `arith_mul`. - - `table.c` — `tbl_get` (the **in-slack** edit target) and `tbl_sum` (the - **grow** target); both call `arith_*` cross-TU (so callers live in other TUs). - - `state.c` — data globals `g_config[8]`, `g_name[]` (the **data** atoms for - data-slack / data-grow). - - `weakdef.c` — `feature_level()` defined **weak** (the **weak→strong** target). - - `optional.c` — `opt_helper()`, an archive member **referenced by no one - initially** (so it isn't pulled — the **archive-pull-in** target). - - `app_a.c`, `app_b.c` — two `main`s with overlapping core use, each linking - `app_*.o` + `libcore.a` → executables `app_a`, `app_b` (the multi-output - shape; `table.c` is shared by both). -- Build the fixture once with `--incremental`; capture each app's baseline `nm` - vaddr map, the persisted-state blob, and each app's expected computed status - (run via `test/lib` `exec_target`). -- Prefer targeted runs; redirect output to a file (project rules). 
- -### 19.3 Acceptance scenarios (each assertion falsifies one claim) - -| Scenario | Action | Assertions | Green at | -|---|---|---|---| -| **A — in-slack edit** | edit `tbl_get`'s body within its slack | `outcome==PATCHED`; whole-program `link_resolve` counter **did not** increment; `nm` diff: **every** symbol keeps its vaddr; `app_b` (a cross-TU caller) runs and its status reflects the edit; `addr2line` correct for an unchanged *and* the edited function; only the changed segment's bytes differ; build-id changed | M2 | -| **B — grow past slack** | grow `tbl_sum` ~10 KB past its slot | `PATCHED`; `tbl_sum` moved to a new vaddr; jump island at its **old** vaddr; **`app_a`'s caller bytes byte-identical** to baseline; both apps run | M3 | -| **C — soundness gate** | c1 add `int g_extra;` to `state.c` · c2 flip `feature_level` weak→strong · c3 edit `table.c` to call `opt_helper` (pulls `optional.c`) · c4 grow `g_config` past `data_slack` | each ⇒ `FELL_BACK_FULL`; binary matches a from-scratch full link | M2 | -| **D — multi-output** | edit `table.c` (shared by both `app_a` and `app_b`) | both images patch or fall back **consistently**; both run | M2 | -| **E — determinism** | compile `table.c` twice | identical `obj_content_id` **and** identical bytes; one-byte edit ⇒ only that atom's id changes | M0 | -| **F — no-op relink** | `replace_input` with a byte-identical object | no atoms diff ⇒ image unchanged, near-zero link work | M1 | - -### 19.4 The two gates that define correctness - -**Scenario A's vaddr-stability assertion** (a patch must move nothing it -shouldn't) and **Scenario C** (non-local edits must fall back, never silently -mislink) are the two ways this feature can be wrong. Both must be green before -"done for ELF" is claimed. - ---- - -## 20. Decisions (resolved) - -All six original open questions were investigated against the code and resolved -with the user. 
Recorded here as the binding design choices; the relevant sections -above were updated to match. - -1. **DWARF (§13) — regenerate the changed TU's debug on any body change.** The - "keep stale `.debug_line`" option is *incoherent* (a body change rewrites the - line mapping regardless of move) and debug is per-CU monolithic, so update is - `O(changed TU)`, not `O(atom)`. Per-function CUs for `O(atom)` debug are a - noted future option, not pursued now. -2. **Slack (§6, §8) — separate, tunable code/data budgets.** `code_slack_pct` - modest (code relocates cheaply via the move primitive); `data_slack_pct` - generous (a data-grow forces a full-link fallback since data can't be - thunked). Both default sensibly, overridable via a link option. -3. **GOT timing (§3, §8.2, §18) — defer; thunk-first.** Ship thunk-on-grow for - file-incremental now (zero codegen change). Build the shared indirection-cell - primitive when hot reload is scheduled, designing it then to serve both. - Rationale: hot reload needs its *own* per-function slots regardless - (`doc/HOT_RELOAD.md:34-48,144-157`), thunk-on-grow doesn't advance it, and - neither is implemented — so unifying now is speculative. -4. **Determinism (§12) — lock with a test, keep content-keying.** Objects are - already byte-deterministic (`src/obj/elf/emit.c:298,386,505`); a regression - test locks it (enabling cross-machine dedup), while content/name keying stays - the correctness backbone so any future drift degrades dedup, not correctness. -5. **Persisted-state keying (§16) — build system owns key/storage/lifetime; the - session emits opaque bytes** (+ exposes the blob's content id). Keeps libcfree - IO/CAS-free, matching the driver-only CAS boundary. -6. **Format scope (§14) — ELF first; COFF then Mach-O as follow-on milestones.** - None of the three is fundamentally unsuitable; the difference is how much - format-specific machinery each needs. Formats without an updater yet fall back - to the fast in-process full link. 
diff --git a/doc/INTERFACES.md b/doc/INTERFACES.md @@ -1,278 +1,310 @@ # cfree Interfaces Modularity and clean interfaces are a top project priority. This document is the -**interface inventory** and the **interface-review checklist** for cfree. +operational companion to [DESIGN.md](DESIGN.md): where DESIGN.md tells the +layering narrative, this doc is the **interface inventory** and the +**interface-review checklist**. It catalogues every boundary worth reviewing — +the public API, the backend/codegen contracts, the internal subsystem seams, the +core utilities, and the frontend-to-library edge — names the responsibility each +one carries, and gives a checklist to apply when adding to or changing one. -It complements `doc/DESIGN.md` (which describes the layering narrative). This doc -is operational: it lists every interface worth reviewing, the contract each one -carries, and a checklist to apply when adding to or changing one of them. - -- **Scope**: public API (`include/cfree/`), the backend/codegen contracts, - shared internal subsystem boundaries, the core utilities, and the - frontend↔library boundary. -- **How to use**: when touching an interface, run the [review checklist](#interface-review-checklist) - against it and update the [status table](#review-status). When adding a new - cross-module header, add it here first. +The aim is to make the boundaries legible so they stay clean: an interface that +nobody can locate is an interface nobody will defend. --- ## Boundary map -From outside to inside (see `DESIGN.md` for the full narrative): +From the outside in (see [DESIGN.md](DESIGN.md) for the full narrative): ``` -driver/ CLI policy + host I/O. Includes ONLY <cfree/*.h>. - └─ lang/ Frontends (c, cpp, toy, wasm). API consumers; ONLY <cfree/*.h> + own private headers. - └─ include/cfree/ PUBLIC BOUNDARY. The library's entire contract. - └─ src/api/ Composition layer: public handles ↔ internal subsystems. - └─ src/ Internal subsystems. 
Share private headers; expose nothing except through include/. +driver/ CLI policy + host I/O. Built with -Iinclude only: + │ the public surface is all it can reach. No -Isrc. + └─ lang/ Frontends (c, cpp, toy, wasm). Compiled INTO libcfree. + │ API consumers in spirit; built with -Isrc, so they may + │ reach a few internal headers where the public API has + │ no equivalent (C ABI, wasm module model). + └─ include/cfree/ PUBLIC BOUNDARY. The library's entire external contract. + └─ src/api/ Composition layer: public handles <-> internal subsystems. + └─ src/ Internal subsystems. Share private headers; expose + nothing outward except through include/cfree/. ``` -**Invariants (verified by grep; keep them true):** -- `driver/` and `lang/` include only `<cfree/...>` headers — never `src/`. -- There is **no** umbrella `include/cfree.h`; consumers include the specific - headers they use. (`DESIGN.md` still references `include/cfree.h` and - `include/cfree/hashmap.h` — stale; real paths are `include/cfree/*.h` and - `include/cfree/support/hashmap.h`.) -- `*_internal.h` headers are private to their subsystem and must not be included +**The boundary that the build enforces.** The hard layering line is the +**driver**, not `lang/`. The driver is compiled with `-Iinclude` (plus `-Ilang` +solely to reach a frontend's public `c/c.h` for the JIT REPL) and deliberately +*without* `-Isrc`, so internal headers (`core/...`, `link/...`, `cg/...`) are +physically unreachable from it. That makes the driver the first true consumer of +libcfree's public API and the thing that proves the public surface is +sufficient. + +Frontends in `lang/` are a softer boundary: they are built as part of +`libcfree.a` (with `-Isrc`), so the compiler does not stop them from including an +internal header. 
They are expected to live on `include/cfree/` and almost +entirely do; the deliberate exceptions are the C frontend reaching +`abi/c_abi.h` for C-specific ABI lowering and the wasm frontend reaching +`wasm/wasm.h` for the shared module model. New frontend code that reaches into +`src/` for anything else is a signal the public API is missing something — add it +to `include/cfree/`, don't widen the exception list. + +**Invariants (keep them true):** +- There is **no** umbrella `include/cfree.h`. Consumers include the specific + headers they use: `include/cfree/*.h` and `include/cfree/support/*.h`. +- The driver includes only `<cfree/...>` (the `-Ilang` exception is a single + frontend public header), never `src/`. +- `*_internal.h` headers are private to one subsystem and must not be included across subsystem boundaries. +- Format / arch / OS specifics stay behind their dispatch vtable + (`ObjFormatImpl`, `ArchImpl`, the codegen `*Target` structs) — never leaked + above the dispatch line. --- ## Tier 1 — Public API (`include/cfree/`) -The library's entire stable contract. 19 headers + 2 support headers. No umbrella. +The library's entire stable contract: nineteen headers in `include/cfree/` plus +two in `include/cfree/support/`. No umbrella header — each consumer includes what +it uses. | Header | Purpose | Key opaque type(s) | Primary consumer | |--------|---------|--------------------|------------------| | `core.h` | Foundational substrate: compiler lifecycle, target triple, slices, status codes, host vtables (`CfreeHeap`/`CfreeWriter`/`CfreeDiagSink`/`CfreeContext`), symbol interning. | `CfreeCompiler` | everyone | | `config.h` | Build-time component enable flags (arch / obj-format / language / subsystem / tool). Preprocessor-only. | — | build | -| `compile.h` | High-level source→object compilation; frontend registration vtable; dep iteration. 
| `CfreeCompileSession`, `CfreeDepIter` | driver, frontends | -| `cg.h` | Code-generation API (~53 KB): stack-machine typed IR over `CfreeCg`. Types/ABI, functions, control flow, memory, arithmetic, calls, intrinsics, inline+file asm, static data. | `CfreeCg` | frontends | +| `compile.h` | High-level source->object compilation; frontend registration vtable; dep iteration. | `CfreeCompileSession`, `CfreeDepIter` | driver, frontends | +| `cg.h` | Code-generation API (the largest contract): a stack-machine typed IR over `CfreeCg`. Types/ABI, functions, control flow, memory, arithmetic, calls, intrinsics, inline+file asm, static data. | `CfreeCg` | frontends | | `frontend.h` | Frontend convenience bridge: panic boundary (`cfree_frontend_run`), metrics scopes, fatal helpers. | — | frontends | | `source.h` | Source registry: stable file IDs + include-edge recording. | — | frontends | | `preprocess.h` | Standalone C preprocessor entry. | — | driver | | `object.h` | Format-neutral object model: builder + read-only inspection; section/symbol/reloc enums. | `CfreeObjBuilder`, `CfreeObjFile` | cg, link, jit, disasm, dwarf | | `link.h` | Linker: byte/object/archive/DSO inputs, linker-script model, emit or JIT. | `CfreeLinkSession`, `CfreeLinkScript` | driver, jit | | `jit.h` | JIT image: mapped pages, symbol resolution, publish/append/replace, object view. | `CfreeJit` | runtime, dbg | +| `interp.h` | Threaded-bytecode interpreter over the optimizer IR; host-identity and emu/guest configurations. | `CfreeInterpProgram` | `run --no-jit`, emu | | `dbg.h` | In-process JIT execution control: breakpoints, stepping, regs/mem, signal host. | `CfreeJitSession` | debuggers | -| `dwarf.h` | DWARF5 consumer: PC↔line, type/var/subprogram queries, structural iterators. | `CfreeDebugInfo`, `CfreeDwarfType` | debuggers, dumpers | +| `dwarf.h` | DWARF5 consumer: PC<->line, type/var/subprogram queries, structural iterators. 
| `CfreeDebugInfo`, `CfreeDwarfType` | debuggers, dumpers | | `disasm.h` | Disassembly of byte ranges and objects, with symbol/reloc annotation. | `CfreeDisasmIter` | objdump, dbg | | `emu.h` | User-mode guest-ELF emulator (per-block JIT). | `CfreeEmu` | emu tool | -| `arch.h` | Arch-agnostic register/unwind-frame metadata helpers. | — | dbg, dwarf, disasm | +| `arch.h` | Arch-agnostic register / unwind-frame metadata helpers. | — | dbg, dwarf, disasm | | `archive.h` | POSIX `ar` reader/writer + symbol index. | `CfreeArIter` | ar/ranlib | | `asm_emit.h` | Emit assembled object bytes as GAS text. | — | objdump | | `wasm.h` | WebAssembly host-import resolver/binder. | `CfreeWasmInstance` | wasm runners | | `support/arena.h` | Public bump allocator (narrowed mirror of `src/core/arena.h`). | `CfreeArena` | frontends | | `support/hashmap.h` | Header-only `CFREE_HASHMAP_DEFINE` template + hash fns. | — (macro) | frontends | -**Public-tier review notes:** +**Public-tier notes:** - `cg.h` is by far the largest contract and the one frontends couple to hardest. Changes here ripple to every frontend — treat it as the highest-risk public interface. - `core.h` defines the host vtables (`CfreeHeap`, `CfreeWriter`, `CfreeDiagSink`, - `CfreeContext`) — these are the project's "no global state" enforcement point; - every subsystem threads context through them. + `CfreeContext`). These are the project's "no global state" enforcement point; + every subsystem threads context through them rather than reaching for a static. +- The inspection family (`dwarf.h`, `disasm.h`, `arch.h`, plus `object.h`'s read + side) is consumed by the dumper/debugger tools and shares the format-neutral + object model — keep arch/format detail behind the dispatch vtables it already + uses. --- ## Tier 2 — Backend / codegen contract (internal) -The codegen path is **tiered**; each tier is a distinct vtable a backend or layer -fills in. 
This is the most actively-changing area (x64/rv64 are being ported onto -`NativeTarget`; **aa64 is the done reference**). +The codegen path is **tiered**: each tier is a distinct contract struct (a +vtable) that a backend or layer fills in. A frontend records into the highest +tier; the bytes come out of the lowest. The native backends — aarch64, x64, and +rv64 — are all live and all satisfy the same contracts. aarch64 is the reference +implementation; x64 and rv64 are full peers built on the shared +`NativeTarget` / `NativeOps` / `NativeFrame` substrate rather than bespoke +per-arch frame and lowering code. | Tier | Header | Contract type | What implements it | Role | |------|--------|---------------|--------------------|------| -| ABI | `src/abi/abi.h` (+ `abi_internal.h`) | `TargetABI` / `ABIVtable` | per-ABI TUs (`aapcs64`, `sysv_x64`, `rv64`, `wasm32`, `apple_*`, `win64_x64`) | calling-convention + layout queries; `abi_new(Compiler)` selects vtable | -| Arch registry | `src/arch/arch.h` | `ArchImpl` | one singleton per arch, via `arch_lookup(kind)` | discovery + dispatch to decode/emu/link/dbg/dwarf surfaces; CFI defaults | -| Semantic CG | `src/cg/cgtarget.h` | `CgTarget` | `native_direct_target` (-O0) or `opt_cgtarget` (-O≥1) | frontend-facing lowering, pre-regalloc | -| -O0 adapter | `src/cg/native_direct_target.h` | `NativeDirectTarget` + `NativeOps` | shared, parameterized by arch `NativeOps` | adapts `NativeTarget` to `CgTarget` for -O0 | -| Physical emit | `src/arch/native_target.h` | `NativeTarget` | `aa64`/`x64`/`rv64` `*_native_target_new()` | hard-register, machine-code emission + frame/CFI | -| Frame model (shared) | `src/cg/native_frame.h` | `NativeFrame` | shared impl (`native_frame.c`); embedded by aa64 + rv64 (x64 shortly) | arch-neutral frame-slot bookkeeping the `NativeTarget` impls delegate to | -| Machine code | `src/arch/mc.h` | `MCEmitter` | one generic impl, `mc_new(Compiler, ObjBuilder)` | section/label/reloc/CFI byte emission for all 
MC archs | - -**Per-arch entry points** (the surface each backend exposes to the rest of the -compiler): +| ABI | `src/abi/abi.h` (+ `abi_internal.h`) | `TargetABI` | per-ABI TUs (`aapcs64`, `aapcs64_windows`, `sysv_x64`, `rv64`, `wasm32`, `apple_arm64`, `apple_x64`, `win64_x64`) | calling-convention + storage-layout queries; `abi_new(Compiler)` selects the implementation by `(arch, obj-format)` | +| Arch registry | `src/arch/arch.h` | `ArchImpl` | one singleton per arch, via `arch_lookup(kind)` | discovery + dispatch to backend/decode/emu/link/dbg/dwarf surfaces; CFI defaults | +| Semantic CG | `src/cg/cgtarget.h` | `CgTarget` | `native_direct_target` (-O0) or `opt_cgtarget` (-O>=1), wrapping a per-arch `CgTarget` | frontend-facing typed lowering, pre-regalloc | +| -O0 adapter | `src/cg/native_direct_target.h` | `NativeDirectTarget` + `NativeOps` | shared adapter, parameterized by each arch's `NativeOps` | adapts a `NativeTarget` to `CgTarget` for the direct -O0 path | +| Physical emit | `src/arch/native_target.h` | `NativeTarget` | aa64/x64/rv64 `*_native_target_new()` | hard-register, machine-code emission + frame/CFI | +| Frame model (shared) | `src/cg/native_frame.h` | `NativeFrame` | shared `native_frame.c`, embedded by all three native backends | arch-neutral frame-slot bookkeeping the `NativeTarget` impls delegate to | +| Machine code | `src/arch/mc.h` | `MCEmitter` | one generic impl, `mc_new(Compiler, ObjBuilder)` | section/label/reloc/CFI byte emission for all machine-code archs | + +**Per-arch entry points** — the surface each backend exposes to the rest of the +compiler. 
The native archs each expose the same pair; rv64 additionally exports +its raw word/halfword emit helpers for the assembler path: | Arch | Header | Entry points | |------|--------|--------------| | aa64 (reference) | `src/arch/aa64/aa64.h` | `aa64_native_target_new`, `aa64_native_direct_ops` | -| x64 (porting) | `src/arch/x64/x64.h` | `x64_native_target_new`, `x64_native_direct_ops` | -| rv64 (porting) | `src/arch/rv64/rv64.h` | `rv64_native_target_new`, `rv64_native_direct_ops`, `rv64_emit32/16` | -| c_target | `src/arch/c_target/{c_emit,ir_emit}.h` | C-source emission backend | +| x64 | `src/arch/x64/x64.h` | `x64_native_target_new`, `x64_native_direct_ops` | +| rv64 | `src/arch/rv64/rv64.h` | `rv64_native_target_new`, `rv64_native_direct_ops`, `rv64_emit32/16` | +| c_target | `src/arch/c_target/{c_emit,ir_emit}.h` | C-source emission backend (standalone `CGBackend`, no `ArchImpl`) | | wasm | `src/arch/wasm/*` | wasm emission backend | -**Backend-tier review notes:** -- `NativeTarget` (~35 hooks: frame, control flow, data movement, arithmetic, - calls, atomics, variadics, intrinsics, asm) is the contract the port must - satisfy. Reviewing a port = checking every hook against the aa64 reference for - semantics, not just compilation. -- The same arch fills both `NativeTarget` (physical) and `NativeOps` (semantic - shims used by the -O0 adapter). Keep the split clean: semantic decisions in - `NativeOps`, pure emission in `NativeTarget`. -- `mc.h` is arch-neutral; per-arch reloc encoding lives behind - `ArchImpl.apply_label_fixup` + CFI constants. Don't leak arch knowledge into - the generic emitter. - -### Native frame model (`src/cg/native_frame.h`) — in use by aa64 + rv64 - -A shared frame-bookkeeping module extracted because aa64/rv64/x64 all lay out a -stack frame the same way at the bookkeeping level. `NativeFrame` owns the -arch-neutral parts; each backend embeds one and keeps the ISA/ABI-specific parts. 
+**Backend-tier notes:** +- `NativeTarget` is the physical-emission contract: frame setup and prologue + policy, control flow, data movement, arithmetic/compare/convert, calls + (`plan_call`/`emit_call`/`plan_ret`/`ret`), atomics, variadics, intrinsics, and + inline/file-scope asm — roughly three dozen hooks. The caller (NDT or the + optimizer) has already selected legal physical operands and run register + allocation; `NativeTarget` validates and emits but must never allocate + registers itself. All three native backends implement the full contract. +- The same arch fills both `NativeTarget` (physical) and `NativeOps` (the + semantic shims the -O0 adapter calls). Keep the split clean: semantic decisions + (operand legality, call planning policy, `va_*`, asm binding, barriers) live in + `NativeOps`; pure emission lives in `NativeTarget`. +- A handful of `NativeTarget` hooks are explicitly optional and exist for archs + whose ISA needs them — `machine_op_clobbers` (x86 idiv/shift fixed-register + clobbers; NULL on aa64/rv64), `emit_prologue` / `emit_minimal_prologue`, + `bind_params_end` (for backends that resolve param binds as a parallel copy), + and the zero-register store fast path (`has_store_zero_reg`). NULL is the + documented "this arch doesn't need it" answer, not an unimplemented gap. +- `CgTarget` likewise carries capability queries so the semantic layer can stay + arch-neutral: `supports_label_table` (false on Wasm, which has no code + addresses in linear memory), `switch_` (overridden only by backends with a + native multiway branch), and `tail_call_unrealizable_reason` (CG asks before + setting `CG_CALL_TAIL`). Native archs take the shared label-table / cmp-chain + defaults. +- `mc.h` is arch-neutral. Per-arch relocation encoding lives behind + `ArchImpl.apply_label_fixup` plus the per-arch CFI constants on `ArchImpl`; + don't leak arch knowledge into the generic emitter. 
+ +### Shared native frame model (`src/cg/native_frame.h`) + +Every native backend lays out a stack frame the same way at the *bookkeeping* +level, so that bookkeeping was lifted into one shared module. `NativeFrame` owns +the arch-neutral parts; each `NativeTarget` embeds one and keeps the ISA/ABI +specifics. All three native backends embed a `NativeFrame`. **The split — what `NativeFrame` owns vs. what stays in the backend:** | Owned by `NativeFrame` (arch-neutral) | Stays in the backend (ISA/ABI-specific) | |----------------------------------------|------------------------------------------| | Slot table + cumulative-offset arithmetic (`native_frame_slot_alloc`/`_at`) | Coordinate transform from `off` to anchor-relative disp (fp/s0/rbp; aa64 top- vs bottom-record) | -| `frame_final` gate (no slots after prologue) | Prologue/epilogue + slim-variant instruction encoding | -| Used-callee-save set from optimizer per-class masks (`native_frame_set_callee_saves`/`_collect_saves`) | Callee-save *placement* (aa64 reserves slots here; rv64/x64 compute offsets) | +| `frame_final` gate (no new slots after the prologue) | Prologue/epilogue + slim-variant instruction encoding | +| Used-callee-save set derived from optimizer per-class masks (`native_frame_set_callee_saves`/`_collect_saves`) | Callee-save *placement* (aa64 reserves slots here; rv64/x64 compute offsets below the locals) | | `max_outgoing` tracking (`native_frame_note_outgoing`) | Deferred-patch application, variadic register-save stores | -| Vararg save-area size from ABI va_list layout (`native_frame_va_save_bytes`) | — | - -**Review notes:** -- **Status: in use by aa64 and rv64** (`src/arch/{aa64,rv64}/native.c` embed a - `NativeFrame`); x64 adoption is next, landing with its `NativeTarget` port. - Until x64 is on it, the contract is proven against two of three backends — the - x64 port is the remaining validation that the split generalizes. 
-- It consolidates the per-arch vararg-save magic numbers (rv64 64, x64 176, - aa64 64+128) into the single ABI-driven `native_frame_va_save_bytes` query — - aligned with the no-magic-numbers rule. When adopting per arch, verify the old - literal is *deleted*, not duplicated. -- `NativeFrameSlotEntry` layout is documented as matching the per-arch slot - structs it replaces — when porting, confirm the backend's local slot struct is - actually retired rather than kept in parallel. -- Handles are **1-indexed**; `NATIVE_FRAME_SLOT_NONE` is the sentinel. Both the - shared `native_frame_slot_alloc` and any backend that still mints raw - `NativeFrameSlot` values must agree on this. +| Vararg save-area size from the ABI's va_list layout (`native_frame_va_save_bytes`) | — | + +**Why it is shared:** the slot arithmetic, the no-grow-after-prologue gate, and +the derivation of the used-callee-save set from the optimizer's per-class masks +are identical across the three archs. Consolidating them also folds the three +per-arch vararg-save magic numbers (rv64 64, x64 176, aa64 64+128) into a single +ABI-driven `native_frame_va_save_bytes` query, in line with the no-magic-numbers +rule. Frame-slot handles are **1-indexed** with `NATIVE_FRAME_SLOT_NONE` as the +sentinel; callers and the shared allocator must agree on that. --- ## Tier 3 — Internal subsystem boundaries Each subsystem exposes a single shared header (its boundary) and may keep an -`*_internal.h` private to its own TUs. 
- -| Subsystem | Boundary header | Internal header | Key exported types | Main entry points | -|-----------|-----------------|-----------------|--------------------|-------------------| -| obj | `src/obj/obj.h` | — (format headers private) | `ObjBuilder`, `Section`, `ObjSym`, `Reloc`, `RelocKind`, `ObjImage` | `obj_new/free`, `obj_section/symbol/reloc`, `obj_finalize`, `obj_sweep_dead`; format emit/read via `ObjFormatImpl` | -| ↳ formats | `src/obj/{elf,macho,coff,wasm}/*.h`, `format.h`, `reloc_apply.h` | — | `ObjFormatImpl`, per-format arch ops | `emit_*`/`read_*` per format; `link_reloc_apply` | -| link | `src/link/link.h` (+ `link_arch.h`) | `link_internal.h` | `LinkInput`, `LinkSymbol`, `LinkImage`, `LinkArchDesc` | `link_new`, `link_add_*`, `link_resolve[_extend]`, `link_emit_*_writer`, `cfree_jit_from_image` | -| opt | `src/opt/opt.h` (+ `ir.h`) | `opt_internal.h` | `OptOperand`, `OptFrameSlot`, optimizer `Func` | `opt_cgtarget_new`, `opt_func_from_cg_ir`, pass entries (`opt_build_ssa`, `opt_regalloc`, `opt_lower_to_mir`, …) | -| cg | `src/cg/{ir,ir_recorder,type}.h` | `internal.h` | `CgIrFunc`, `CgIrInst`, `CgType`, `CgTypeField` | IR recording, `cg_type_*` queries | -| debug | `src/debug/debug.h` (+ `dwarf_defs.h`) | `debug_internal.h`, `dwarf_internal.h` | `Debug`, `DebugTypeId`, `DebugVarLoc` | `debug_new`, `debug_type_*`, `debug_func_*`, `debug_line`, `debug_emit` | -| emu | `<cfree/emu.h>` (public face) | `src/emu/emu.h` | `EmuProcess`, `EmuThread`, `ObjFormatEmuOps` | runs via public API; format hooks in `ObjFormatEmuOps` | -| dbg | `<cfree/dbg.h>` (public face) | `src/dbg/dbg.h` | session internals | public `cfree_jit_session_*` | -| asm | `src/asm/asm.h` (+ `asm_lex.h`) | — (`asm_helpers.h` shared) | `AsmLexer`, `AsmTok` | `asm_parse(Compiler, AsmLexer, MCEmitter)`; `asm_driver_*` helpers | -| jit | `src/jit/tlv_thunk.h` | — | — | `cfree_jit_tlv_thunk` (Mach-O TLV); rest via `LinkImage` | -| wasm | `src/wasm/wasm.h` | — | `WasmValType`, 
`WasmFeatureSet`, `WasmInsnKind` | module model / codec / WAT / validate (public Cfree types only) | -| api | `src/api/lang_registry.h` | — | — | `lang_registry_init(Compiler)` wires enabled frontends | - -**Subsystem-tier review notes:** +`*_internal.h` private to its own TUs. The boundary header is what other +subsystems are allowed to include; the internal header is not. + +| Subsystem | Boundary header | Internal header | Role | +|-----------|-----------------|-----------------|------| +| obj | `src/obj/obj.h` | format headers private | Format-neutral object model (`ObjBuilder`, sections, symbols, relocs) plus read side; the hub cg/link/jit/disasm/dwarf depend on. | +| ↳ formats | `src/obj/{elf,macho,coff}/*.h`, `format.h`, `reloc_apply.h` | — | Per-format emit/read behind `ObjFormatImpl`; `link_reloc_apply` for relocation. | +| link | `src/link/link.h` (+ `link_arch.h`) | `link_internal.h` | Byte/object/archive/DSO inputs, symbol resolution (single-shot and incremental), ELF/JIT output; `cfree_jit_from_image`. | +| opt | `src/opt/opt.h` (+ `ir.h`) | `opt_internal.h` | SSA construction, CFG passes, register allocation, MIR lowering; `opt_cgtarget_new(Compiler, CgTarget, level)` wraps a backend target. | +| cg | `src/cg/{ir,ir_recorder,type}.h` | `internal.h` | IR recording and the codegen type system (`cg_type_*`). | +| debug | `src/debug/debug.h` (+ `dwarf_defs.h`) | `debug_internal.h`, `dwarf_internal.h` | DWARF producer: types, subprograms, line program, emit. | +| emu | `<cfree/emu.h>` (public face) | `src/emu/emu.h` | Guest-ELF emulator; format hooks via `ObjFormatEmuOps`. | +| dbg | `<cfree/dbg.h>` (public face) | `src/dbg/dbg.h` | JIT execution control; the real contract is the public header. | +| asm | `src/asm/asm.h` (+ `asm_lex.h`) | `asm_helpers.h` shared | `asm_parse(Compiler, AsmLexer, MCEmitter)`; driver helpers. | +| jit | `src/jit/tlv_thunk.h` | — | Mach-O TLV thunk; the rest of JIT runs through `LinkImage`. 
| +| wasm | `src/wasm/wasm.h` | — | Module model / codec / WAT / validate, in terms of public Cfree types. | +| api | `src/api/lang_registry.h` | — | `lang_registry_init(Compiler)` wires the enabled frontends. | + +**Subsystem-tier notes:** - `obj.h` is the hub: cg, link, jit, disasm, and dwarf all depend on it. Format - knowledge (ELF/Mach-O/COFF/Wasm) stays behind `ObjFormatImpl` — verify new code - doesn't hard-code one format above that line. -- `link` exposes both single-shot (`link_resolve`) and incremental - (`link_resolve_extend`) surfaces; keep them consistent. -- `emu` and `dbg` present their real contract through the **public** headers; - the `src/` headers are implementation. Don't grow a second public-ish surface - in `src/`. + knowledge (ELF/Mach-O/COFF) stays behind `ObjFormatImpl`; verify new code doesn't + hard-code one format above that line. +- `link` exposes both single-shot and incremental resolve surfaces; keep them + consistent — the incremental path patches a prior on-disk image rather than + relinking from scratch. +- `emu` and `dbg` present their real contract through the **public** headers; the + `src/` headers are implementation detail. Don't grow a second public-ish + surface in `src/`. --- ## Tier 4 — Core utilities (`src/core/`) -Foundational data structures. Enforce the project rules: **no global state** -(everything takes an explicit `Heap*`/`Arena*`/`Compiler*`), **no VLAs**. +Foundational data structures. They enforce two project rules at the type level: +**no global state** (everything takes an explicit `Heap*` / `Arena*` / +`Compiler*`) and **no VLAs**. | Header | Purpose | Takes explicit allocator? 
| Public mirror | |--------|---------|---------------------------|---------------| -| `core.h` | Type aliases, `Compiler` struct, panic/defer machinery | `Compiler` holds context | partially via `include/cfree/core.h` | +| `core.h` | Type aliases, `Compiler` struct, panic/defer machinery | `Compiler` holds context | partial, via `include/cfree/core.h` | | `arena.h` | Bump allocator; reset frees all | `Heap*` | `include/cfree/support/arena.h` (narrowed) | | `pool.h` | Symbol interning (`Sym` canonical IDs) | `Heap*` | — | | `buf.h` | Chunked byte buffer with patch/seek | `Heap*` | — | -| `vec.h` | Doubling-growth vector (`VEC_GROW` macro) | `Heap*` | — | +| `vec.h` | Doubling-growth vector (`VEC_GROW`) | `Heap*` | — | | `segvec.h` | Segmented append-only array, stable pointers (`SEGVEC_DEFINE`) | `Heap*` | — | -| `hashmap.h` | Alias to public template | n/a | `include/cfree/support/hashmap.h` | +| `hashmap.h` | Alias to the public template | n/a | `include/cfree/support/hashmap.h` | | `heap.h` | Heap abstraction + JIT exec-mmap helper | wraps `CfreeHeap` | `CfreeHeap` in `core.h` | | `strbuf.h` | Bounded text builder, caller-owned buffer | none (caller buffer) | — | | `slice.h` | Fat-pointer byte view (alias of `CfreeSlice`) | `Arena*` for dup | `CfreeSlice` in `core.h` | -| `bytes.h` | LE/BE int serialize helpers | none | — | +| `bytes.h` | LE/BE integer serialize helpers | none | — | | `diag.h` | Diagnostic-sink convenience wrappers | none (delegates) | — | | `metrics.h` | Telemetry dispatch to optional callbacks | none (reads Compiler) | — | | `sha256.h` | Streaming SHA-256 | none | — | | `util.h` | `MIN`/`MAX`/`ALIGN_*`/`CONTAINER_OF` macros | none | — | -**Core-tier review notes:** -- Public mirrors (`arena`, `hashmap`, parts of `core`) deliberately expose a - *narrowed* surface, not the full internal one. When changing an internal - utility that has a mirror, decide explicitly whether the public mirror moves - too — they are allowed to diverge. 
-- These are the foundation for the "no global state" rule; any new core utility - that reaches for a static/global is a red flag. +**Core-tier notes:** +- The public mirrors (`arena`, `hashmap`, parts of `core`) deliberately expose a + *narrowed* surface, not the full internal one, and they are allowed to diverge. + When changing an internal utility that has a mirror, decide explicitly whether + the public mirror moves too. +- These are the foundation for the no-global-state rule: any new core utility + that reaches for a static or a global is a red flag. --- -## Tier 5 — Frontend ↔ library boundary +## Tier 5 — Frontend <-> library boundary Frontends live in `lang/`, are API consumers, and register per-`CfreeCompiler`. +Each implements `CfreeFrontendVTable` (`include/cfree/compile.h`): a constructor, +a `compile` function that turns a source slice into a `CfreeObjBuilder`, a +destructor, and the list of file extensions it claims. A frontend is registered +with `cfree_register_frontend(compiler, language, vtable)`; +`src/api/lang_registry.h::lang_registry_init` auto-wires the enabled +`CFREE_LANG_*` frontends at compiler construction. -**The contract a frontend must implement** — `CfreeFrontendVTable` -(`include/cfree/compile.h`): - -```c -typedef struct CfreeFrontendVTable { - CfreeFrontendNewFn new_frontend; // CfreeFrontendState* (*)(CfreeCompiler*) - CfreeFrontendCompileFn compile; // source -> CfreeObjBuilder - CfreeFrontendFreeFn free_frontend; - const CfreeSlice* extensions; // file extensions claimed (no leading dot) - uint32_t nextensions; -} CfreeFrontendVTable; -``` - -Registered via `cfree_register_frontend(compiler, language, vtable)`; -`src/api/lang_registry.h::lang_registry_init` auto-wires the -`CFREE_LANG_*_ENABLED` frontends at construction. - -**What frontends consume** (public only): `cg.h`, `frontend.h`, `source.h`, -`object.h`, `support/arena.h`, `support/hashmap.h`, `core.h`. 
+**What frontends consume** is overwhelmingly the public API — `cg.h`, +`frontend.h`, `source.h`, `object.h`, `support/arena.h`, `support/hashmap.h`, +`core.h` — with the two documented internal exceptions noted in the boundary map. | Frontend | Public entry | Notable internal headers | |----------|--------------|--------------------------| -| C (`lang/c/`) | `cfree_c_frontend_vtable` (`c.h`) | `type/`, `decl/`, `sem/`, `abi/c_abi.h`, `parse/parse.h` | +| C (`lang/c/`) | `cfree_c_frontend_vtable` (`c.h`) | `type/`, `decl/`, `sem/`, `parse/parse.h`; reaches `abi/c_abi.h` for C ABI lowering | | cpp (`lang/cpp/`) | shared by C; `pp/pp.h`, `lex/lex.h` | `cpp_support.h`, `pp/pp_priv.h` | | toy (`lang/toy/`) | `cfree_toy_frontend_vtable` (`toy.h`) | `internal.h`, `lexer.h` | -| wasm (`lang/wasm/`) | `cfree_wasm_frontend_vtable` (`wasm.h`) | `runtime_abi.h` | - -**Frontend-tier review notes & flags:** -- ⚠️ **`lang/c/parse/cg_public_compat.h`** is a compatibility shim wrapping - `cg.h` with C-semantic sugar (lvalue aux, type stack, `pcg_*` helpers, >100 - functions). It is the real coupling point between the C parser and codegen. - Its existence suggests `cg.h` doesn't yet serve the C frontend's needs - directly — worth tracking as interface debt: does the shim hide gaps that - should be promoted into `cg.h`, or is it legitimately C-specific policy? -- ⚠️ `lang/wasm/wasm.h` exposes `cfree_wasm_wat_to_wasm()` — a test/dev helper - living in a public-facing header. Confirm it belongs in the public surface vs. - a test-only header. -- Frontends must not reach into `src/` (verified clean today). New frontend code - that needs something from `src/` is a signal the public API is missing - something — add it to `include/cfree/`, don't cross the boundary. 
+| wasm (`lang/wasm/`) | `cfree_wasm_frontend_vtable` (`wasm.h`) | reaches `wasm/wasm.h` (shared module model) | + +**Frontend-tier notes:** +- `lang/c/parse/cg_public_compat.h` is a compatibility shim that wraps `cg.h` + with C-semantic sugar (lvalue auxiliaries, the type stack, `pcg_*` helpers). It + is the real coupling point between the C parser and codegen, and it is worth + understanding when working in either: it carries the C frontend's own policy on + top of the generic `cg.h`, so a change to one frequently belongs in the other. +- `lang/wasm/wasm.h` exposes `cfree_wasm_wat_to_wasm()` — a WAT-to-wasm helper + living in a frontend-public header, used by wasm tooling/tests. --- ## Interface review checklist -Apply to any interface (header / vtable) you add or change. Tier-1 (public) and -Tier-2 (backend contract) changes warrant the full list; lower tiers can be +Apply this to any interface (header / vtable) you add or change. Tier-1 (public) +and Tier-2 (backend contract) changes warrant the full list; lower tiers can be lighter. ### Boundary & layering - [ ] Header lives at the right tier; consumers at the correct layer can reach it. -- [ ] No layering violation: `driver/`+`lang/` use only `<cfree/*.h>`; subsystems - don't include each other's `*_internal.h`. +- [ ] No layering violation: the driver uses only `<cfree/*.h>`; subsystems don't + include each other's `*_internal.h`; a frontend reaches into `src/` only + via the documented C-ABI / wasm-module exceptions. - [ ] Format/arch/OS specifics stay behind their dispatch vtable - (`ObjFormatImpl`, `ArchImpl`, `*Vtable`) — not leaked above it. + (`ObjFormatImpl`, `ArchImpl`, the codegen `*Target` structs) — not leaked + above it. - [ ] If a public mirror exists (arena/hashmap/core), the divergence from the internal version is intentional and documented. @@ -280,7 +312,8 @@ lighter. - [ ] Opaque handles where the consumer shouldn't see layout; concrete structs only where the layout *is* the contract. 
- [ ] Minimal surface — no entry points added "just in case"; each has a caller. -- [ ] Naming consistent with tier (`cfree_*` public; subsystem-prefixed internal). +- [ ] Naming consistent with the tier (`cfree_*` public; subsystem-prefixed + internal). - [ ] Enums/flags are explicitly valued where they cross a format/wire boundary. ### State & ownership @@ -291,53 +324,24 @@ lighter. - [ ] Borrowed vs. owned bytes (`CfreeSlice` etc.) documented at the boundary. ### Errors & contracts -- [ ] Errors reported via `CfreeStatus` / diag sink, not ad-hoc returns; failure - modes documented. +- [ ] Errors reported via `CfreeStatus` / the diag sink, not ad-hoc returns; + failure modes documented. - [ ] Pre/postconditions and ordering constraints stated (e.g. `obj_finalize` before read-side queries; `func_begin`/`func_end` pairing). - [ ] No magic numbers — shared constants promoted to a header (project rule). ### Vtable / backend contracts (Tier 2) -- [ ] Every hook present in the reference (aa64) is implemented and matches its - semantics, not just its signature. +- [ ] Every required hook is implemented and matches its semantics, not just its + signature; optional hooks left NULL only where the arch genuinely doesn't + need them. - [ ] Semantic vs. physical responsibilities kept on the right side (`NativeOps` vs. `NativeTarget`). -- [ ] New hook added to the contract is implemented by *all* live backends (or - has a documented capability-query fallback, e.g. `supports_label_table`). +- [ ] A new hook added to the contract is implemented by *all* live native + backends, or has a documented capability-query fallback (e.g. + `supports_label_table`, `machine_op_clobbers` returning NULL). ### Stability & docs -- [ ] Public (Tier-1) change: is it source-compatible? If not, callers updated in - the same change. -- [ ] This document's inventory + status table updated. -- [ ] `DESIGN.md` updated if the layering narrative changed. 
- ---- - -## Review status - -Track interface-review passes here. Status: ⬜ not reviewed · 🔶 in progress · ✅ reviewed. - -| Interface | Tier | Status | Notes | -|-----------|------|--------|-------| -| `core.h` | 1 | ⬜ | host vtables = no-global-state enforcement point | -| `cg.h` | 1 | ⬜ | largest/highest-risk; frontends couple hard | -| `object.h` | 1 | ⬜ | hub for cg/link/jit/disasm/dwarf | -| `link.h` | 1 | ⬜ | single-shot vs. incremental surfaces | -| `jit.h` / `dbg.h` | 1 | ⬜ | — | -| `dwarf.h` / `disasm.h` / `arch.h` | 1 | ⬜ | inspection family | -| `compile.h` / `frontend.h` / `source.h` | 1 | ⬜ | frontend-facing | -| other Tier-1 (`archive`, `asm_emit`, `emu`, `preprocess`, `wasm`, `config`, support) | 1 | ⬜ | smaller surfaces | -| `NativeTarget` (`native_target.h`) | 2 | 🔶 | aa64 ✅ reference; x64/rv64 porting | -| `NativeFrame` (`native_frame.h`) | 2 | 🔶 | in use by aa64 + rv64; x64 adoption pending | -| `CgTarget` (`cgtarget.h`) | 2 | ⬜ | — | -| `NativeDirectTarget`/`NativeOps` | 2 | ⬜ | -O0 adapter; semantic/physical split | -| `MCEmitter` (`mc.h`) | 2 | ⬜ | arch-neutral; keep it that way | -| `TargetABI` (`abi.h`) | 2 | ⬜ | — | -| `ArchImpl` (`arch.h`) | 2 | ⬜ | dispatch hub | -| obj boundary (`obj.h` + formats) | 3 | ⬜ | format dispatch via `ObjFormatImpl` | -| link / opt / cg / debug boundaries | 3 | ⬜ | — | -| asm / emu / dbg / jit / wasm / api | 3 | ⬜ | — | -| core utilities (`src/core/`) | 4 | ⬜ | no-global-state foundation | -| `CfreeFrontendVTable` | 5 | ⬜ | the frontend contract | -| `cg_public_compat.h` shim | 5 | ⬜ | ⚠️ interface debt — gap in `cg.h`? | -| `cfree_wasm_wat_to_wasm` placement | 5 | ⬜ | ⚠️ test helper in public header? | +- [ ] Public (Tier-1) change: is it source-compatible? If not, callers are + updated in the same change. +- [ ] This document's inventory is updated. +- [ ] [DESIGN.md](DESIGN.md) is updated if the layering narrative changed. 
diff --git a/doc/INTERPRETER.md b/doc/INTERPRETER.md @@ -1,414 +1,326 @@ -# Threaded Bytecode Interpreter for the cfree IR - -## Context - -cfree currently has two ways to *run* IR: the native backend (lowers opt IR → machine -code) and the emulator, which dynamic-binary-translates a guest ISA by decoding → -lifting to CG IR → optimizing → **JIT**ing to host code (`src/emu/emu.c:380-505`). -Both require allocating executable memory and emitting native instructions. - -We want a third execution path: a **threaded interpreter** that runs cfree IR directly, -for environments that cannot JIT (no W^X-exempt mmap, no codegen for the host arch, etc.). -It must serve two configurations behind one engine: - -1. **Host-identity** — run compiled C directly; abstract addresses *are* real host - pointers; external calls go to real linked symbols. -2. **Emu/guest** — replace the emu's JIT step; pointers are guest VAs translated - through the existing `EmuAddrSpace`. - -Design decisions (confirmed with the user): -- Consume the **optimizer IR** (`src/opt/ir.h` `Func`/`Block`/`Inst`), tapped on the - **O1 PReg path** — *after* target-independent passes, *before* register allocation / - MIR lowering. At this point `f->opt_reg_ssa == 0`, `OPK_REG` operands carry virtual - **PReg** ids, and there are no `IR_PHI` nodes. This gives an unbounded virtual-register - machine and lets us reuse opt's optimizations before interpreting. (O2/SSA is - deprecated/disabled — ignored.) -- New **fixed-width, cache-friendly bytecode** with **direct threading** (labels-as-values - / computed goto): each record's first word is a pre-resolved `&&handler`. -- **Explicit, first-class call stack.** The interpreter must NOT use host C recursion for - IR-level calls. The whole runtime is parameterized by an `InterpStack` (an explicit - heap/region-backed frame stack) so execution can be suspended and swapped between stacks - — the substrate for "virtual threads"/fibers. 
No scheduling *policy* is specified yet; - the requirement is that the *mechanism* be explicit and swap-ready from day one. - -## Tap point: `opt_run_o1_interp` - -`opt_run_o1_native` (`src/opt/opt.c:84-224`) is the model. We add a sibling -`opt_run_o1_interp(Compiler*, const CgIrFunc*) -> Func*` (declared in `src/opt/opt.h`) -that runs the **maximal target-independent subset** and stops before regalloc: - -Run: `opt_func_from_cg_ir` → `opt_build_cfg` → `opt_jump_cleanup(CFG)` → `opt_build_cfg` -→ `opt_simplify_local` → `opt_try_tiny_inline` (+ rebuild idiom from `opt.c:119-121`) -→ `opt_addr_xform_pregs` → `opt_promote_scalar_locals` → `opt_addr_of_global_cse` -→ `opt_build_loop_tree` → `opt_live_blocks` + `opt_dead_def_elim_with_live` → **STOP**. - -Skip: `opt_machinize_native` (only captures phys-reg pools + resolves inline-asm -constraints — irrelevant to a virtual-register interpreter), `opt_lower_loop_imm_operands` -+ `opt_hoist_loop_consts` (pure pessimization for an interpreter, which takes immediates -directly; the first early-returns on `target==NULL` anyway), `opt_regalloc_locations`, -`opt_lower_to_mir`, and all MIR passes. Leave `opt_reg_ssa==0`, `opt_rewritten==0`. - -**Phase-0 verification risk:** `opt_addr_xform_pregs`/`opt_promote_scalar_locals`/ -`opt_addr_of_global_cse` run *after* machinize in the native pipeline; confirm they have -no dependency on machinize side-effects (expected: they are local/escape analysis only). -Also confirm the `opt_verify` debug calls don't assert on `opt_has_target`; if they do, -omit them from the interp entry (debug-only). - -## Control flow & operand model at the tap point (verified) - -- **Scopes are CFG no-ops.** `IR_SCOPE_BEGIN/ELSE/END` are treated as `return` by - `pass_native_emit.c:751`; control flow is driven entirely by `Block.succ[]` + - terminators. `IR_BREAK_TO`/`IR_CONTINUE_TO` carry their destination in `succ[0]` - (`pass_cfg.c:10-14`) → lower as `OP_BR succ[0]`. 
Every reachable block has explicit - succ edges (fallthrough materialized in `cg_ir_lower.c:1062-1095`), so the loader can - resolve all branches to bytecode pcs. -- Terminators: `IR_BR`(succ0), `IR_CONDBR`(succ0=true/succ1=false), `IR_CMP_BRANCH` - (`extra.imm`=CmpOp, succ0=taken/succ1=fall), `IR_SWITCH` (`IRSwitchAux` cases+default — - interpret directly), `IR_INDIRECT_BRANCH` (`IRIndirectAux.targets[]`), `IR_RET`. -- `OptOperand` (`ir.h:65-86`): `OPK_REG`→PReg id (`v.reg`); `OPK_IMM`→`v.imm`; - `OPK_LOCAL`→`v.frame_slot`; `OPK_GLOBAL`→`v.global.{sym,addend}`; - `OPK_INDIRECT`→`v.ind.{base,index,log2_scale,ofs}`. PRegs are `1..f->npregs`, typed by - `f->preg_type[]`/`f->preg_cls[]` (`RC_INT`/`RC_FP`/`RC_VEC`). -- `IR_CALL` carries `IRCallAux` with the **semantic, un-ABI-lowered** `CGCallDesc` - (`desc.callee` Operand, `desc.args[]`/`desc.ret` as `CGABIValue`, `desc.abi`=`ABIFuncInfo*`). - -## New files & build wiring - -New dir `src/interp/`: -- `interp.h` — `InterpProgram`/`InterpFunc`/`InterpFrame`/`InterpMem`, entry functions. -- `bytecode.h` — `InterpInsn` record layout, opcode enum, side-table structs. -- `lower.c` — `Func` (post `opt_run_o1_interp`) → `InterpFunc` (loader + threader). -- `engine.c` — computed-goto dispatch loop + handlers; exports the dispatch table. -- `mem.c` — `InterpMem` implementations (host-identity, emu-addr-space). -- `ffi.c` — external-call marshaller + bounded thunk family. -- `interp_program.c` — program lifecycle, sym→`InterpFunc` table, extern resolver. - -Public header `include/cfree/interp.h` (parallel to `include/cfree/jit.h`): -`cfree_interp_program_new/_free`, `_add_func`, `_lookup`, `CfreeInterpMemVtable`, and the -explicit-stack API: `cfree_interp_stack_new/_free`, `cfree_interp_call_on` -(seed a stack with an entry frame + args) and `cfree_interp_resume` (run/resume a stack -until it returns/traps/blocks). `_call` is a convenience wrapper that allocates a stack, -seeds it, and resumes to completion. 
- -`include/cfree/config.h`: add `CFREE_INTERP_ENABLED` (near `CFREE_JIT_ENABLED`); requires -`CFREE_OPT_ENABLED`. `Makefile`: mirror `LIB_SRCS_OPT` (`Makefile:164,188-190`) with -`LIB_SRCS_INTERP = $(shell find src/interp -name '*.c')` gated on the flag; add an -`interp_stubs.c` (mirror `disasm_stubs.c`, `Makefile:126`) for the disabled build. - -## Bytecode format - -Fixed-width record (target ~32 bytes), direct-threaded: +# Interpreter + +cfree's IR interpreter (`cfree run --no-jit`, and the emulator's interpret mode) +executes the compiler's own semantic IR directly, with no native code emission +and no executable memory. It is the third way to *run* cfree IR alongside the +native backend ([CODEGEN.md](CODEGEN.md)) and the JIT ([JIT.md](JIT.md)): a +threaded-bytecode engine that lowers a post-optimization function to fixed-width +records and runs them over an explicit, suspendable call stack. The design point +is fidelity to codegen semantics — it interprets the pre-machinize IR, the same +view the native backend lowers from, so an interpreted program and a compiled +program agree by construction. It lives in `src/interp/`. + +## Why interpret the semantic IR + +The interpreter taps the optimizer pipeline (see [OPT.md](OPT.md)) at +`opt_run_o1_interp` (`src/opt/opt.c`): it runs the target-independent O1 subset +and stops *before* machinization and register allocation. 
At this tap point the +function is still a virtual-register machine: + ``` -struct InterpInsn { - void* handler; // pre-resolved &&label (set at load); word 0 for computed goto - u16 opcode; // for dump/debug + non-GNU fallback - u16 flags; // BinOp/CmpOp/ConvKind tag, width, signedness fast bits - u32 dst; // dest PReg id - u32 a, b; // src PReg ids / pcs / side-table indices (opcode-specific) - i64 aux; // inline immediate or aux-table index -}; + CG IR --opt_func_from_cg_ir--> Func --[ti passes]--> Func (PReg view) + | + STOP before machinize/regalloc + | + interp_lower --> InterpFunc bytecode ``` -Opcode set: one family per IROp, width/signedness-specialized where it speeds dispatch -(`OP_BINOP_I32/I64/F32/F64` with BinOp in `flags` initially; specialize hot add/sub/mul -later). `OP_LOAD_{8,16,32,64,F32,F64}` / `OP_STORE_*`, `OP_LOAD_IMM`, `OP_LOAD_CONST`, -`OP_COPY`, `OP_ADDR_LOCAL`, `OP_ADDR_GLOBAL`, `OP_TLS_ADDR`, `OP_UNOP_*`, `OP_CMP_*`, -`OP_CONVERT_*`, `OP_BR`, `OP_CONDBR`, `OP_CMP_BRANCH_*`, `OP_SWITCH`, `OP_INDIRECT_BR`, -`OP_CALL_INTERNAL`, `OP_CALL_EXTERNAL`, `OP_RET[_VOID]`, `OP_ALLOCA`, `OP_AGG_COPY/SET`, -`OP_BITFIELD_*`, `OP_VA_*`, `OP_ATOMIC_*`, `OP_FENCE`, `OP_INTRINSIC`, `OP_TRAP`, `OP_NOP`. -## Explicit stack & frame model +Concretely `opt_run_o1_interp` runs CFG build, jump cleanup, local +simplification, the address/escape transforms (`opt_addr_xform_pregs`, +`opt_promote_scalar_locals`, `opt_addr_of_global_cse`), the loop tree, and +live-block dead-def elimination. It deliberately skips `opt_machinize_native`, +loop-immediate lowering / const hoisting, register allocation, and all MIR +passes. Those passes either bind to a physical register file the interpreter does +not have, or are pessimizations for a machine that takes immediates directly. + +Consequences that shape the rest of the engine: + +- **Unbounded virtual registers.** `OPK_REG` operands carry PReg ids + (`1..f->npregs`); the register file is a flat slab indexed by PReg id. 
There is + no spilling, no phys-reg pool. +- **No SSA / no PHIs.** The interp tap leaves `opt_reg_ssa == 0`; the lowerer + asserts no `IR_PHI` survives. +- **Explicit CFG with materialized edges.** Control flow is driven entirely by + block successors and terminators; every reachable block has explicit succ + edges, so the loader can resolve every branch to a bytecode pc. +- **Semantic, un-ABI-lowered calls.** `IR_CALL` still carries the high-level + `CGCallDesc` (callee operand, args/ret as ABI values, the `ABIFuncInfo`). The + interpreter applies value semantics for internal calls and only consults the + ABI descriptor when it must cross into host code. + +Because this is exactly the IR the native backend consumes, the interpreter and +the compiled program share the same semantics — the differential test of "run +the same IR both ways and compare" is the central correctness lever. + +## Layering and files + +``` + src/opt/opt.c opt_run_o1_interp + interp sink hook (one-way dep) + src/interp/lower.c Func -> InterpFunc bytecode (loader / threader) + src/interp/engine.c suspendable dispatch loop + handlers + intrinsics + FFI marshal + src/interp/ffi.c host-ABI external-call cast-thunk family + src/interp/interp_program.c program lifecycle, sym/name tables, memory + symbol resolution + src/interp/interp.h internal types (InterpInsn, InterpFunc, frame/stack) + include/cfree/interp.h public API (program, host vtable, explicit-stack calls) +``` -The runtime is parameterized by an explicit call stack so it can be suspended and swapped -(virtual threads). Execution state lives in data structures, never in the host C stack. +The dependency is one-way: the optimizer calls `interp_capture_func` (declared +locally in `opt.c`, not by including the interp header) when a compiler has an +interp sink attached. Each compiled function is then lowered into the program in +addition to whatever native emission is happening. 
+ +## Bytecode: `InterpFunc` + +`lower.c` turns a `Func` into an `InterpFunc`: a flat array of fixed-width +`InterpInsn` records plus side tables. The record is cache-friendly and caches +the hot fields a handler needs — destination PReg, resolved branch pcs, an inline +immediate, operand widths, fp flags, a tail-call flag, and a direct-threading +`handler` slot — while retaining a pointer to the source `Inst` so handlers can +read full operand detail (operand kinds, types, `MemAccess`, aux structs, the +call descriptor) generically rather than re-encoding every field. + +Lowering is two passes over `f->emit_order`: + +1. **Pass A** places each reachable block at its starting pc, counting records + (one per emitting `Inst`; `NOP`/`PHI`/`PARAM_DECL`/`SCOPE_*` and the constant + markers emit nothing), operands, and switches. It bump-allocates frame-slot + byte offsets honoring alignment (FS_ALLOCA slots are dynamic, sized at run + time), and fixes the static frame size. +2. **Pass B** emits each record, mapping `IROp -> InterpOp` (with + aggregate-specialized variants such as `IOP_COPY_AGG`/`IOP_LOAD_AGG`), caching + widths and sub-op tags (BinOp/CmpOp/ConvKind/AtomicOp), and resolving every + branch/switch/indirect/label target from block id to bytecode pc via the + `block_pc[]` table built in Pass A. + +The opcode set is one family per `IROp`, specialized only where the width or the +scalar/aggregate distinction changes the handler. There is no width-per-opcode +explosion; arithmetic carries its width and fp-ness in record fields and a sub-op +tag, and the handler masks/sign-extends accordingly. + +**Unsupported ops are not silently dropped.** An op the interpreter cannot run +(notably `IR_ASM_BLOCK`) lowers to `IOP_TRAP` and flags the whole function +`!ok` with a reject reason. The engine reports a clean +`interp: <reason> not supported` diagnostic; it never miscompiles or falls back +to native code. 
This "diagnose, don't miscompile" rule is the contract for the +no-JIT path. + +### Static data and jump tables + +Function-scope static blobs — ordinary static locals, dense-switch jump tables, +and computed-goto label arrays — are materialized at lower time +(`lower_static_blobs`) into an interp-private, program-lifetime buffer, and the +blob's symbol is bound to that buffer. `WRITE` markers contribute literal bytes; +`LABEL_ADDR` markers contribute the target block's **bytecode pc**, not a native +code address. This is essential: the interpreter addresses code by `InterpInsn` +index, so a jump table that the program later walks with `IR_LOAD` + +`IR_INDIRECT_BRANCH` must hold interp pcs. The stream marker ops themselves lower +to `IOP_NOP`; they are fully consumed by the materialization pass. This is what +lets the dense `-O1` switch lowering and labels-as-values work under the +interpreter. + +## Engine: explicit-stack dispatch + +The engine (`interp_run_stack` in `engine.c`) runs the **top** frame of an +explicit `InterpStack`. Execution state lives in data structures, never on the +host C stack: an IR-level call pushes an `InterpFrame`, a return pops one. The +host C stack stays O(1) regardless of IR call depth — deep IR recursion grows the +stack's frame array, not the host stack. 
``` -typedef struct InterpFrame { // one IR-level activation - InterpFunc* fn; - InterpInsn* ip; // resume point (saved across calls/yields) - u64* regs; // register file slab [fn->npregs], owned by the stack - u32 mem_off; // offset of this frame's addressable bytes in the stack - u32 frame_bytes, alloca_top; - u8* sret_ptr; // aggregate-return destination, if any - u32 ret_dst; // caller PReg/slot to receive scalar return (resolved on push) -} InterpFrame; - -typedef struct InterpStack { // a swappable execution context / fiber - InterpFrame* frames; u32 nframes, frames_cap; // explicit call stack (grows on CALL) - u8* regs_arena; u32 regs_top, regs_cap; // bump region for per-frame register files - u8* mem_arena; u32 mem_top, mem_cap; // addressable frame bytes (host-identity mode) - u64 guest_sp_base; // emu mode: frames carve guest stack instead - u64 scalar_ret; u8 ret_is_fp; // return shuttle between frames - u8 status; // RUNNING/DONE/TRAP/BLOCKED -} InterpStack; + InterpStack (a swappable execution context / fiber) + frames[] explicit call stack; interp_run_stack runs frames[top] + regs_arena bump region: each frame's PReg file (npregs u64s) + mem_arena bump region: each frame's addressable bytes (locals, allocas, varargs) + scalar_ret return shuttle between frames + status / trap_reason ``` -- **CALL pushes** a new `InterpFrame` onto `stack->frames` (bump-allocating its register - file from `regs_arena` and its addressable bytes from `mem_arena`/guest stack); **RET - pops** and writes the return into the caller frame's `ret_dst`. The dispatch loop runs - the *top* frame; it never calls itself for IR-level calls. -- The engine entry is `interp_resume(InterpProgram*, InterpStack*) -> InterpRunStatus`. - It runs until the stack empties (DONE), traps, or blocks. 
**Swapping virtual threads = - calling `interp_resume` with a different `InterpStack`.** A suspension/yield point just - saves `ip` into the top frame and returns a status; resuming re-enters the loop on the - same stack. (External/native calls are the one place host C stack is unavoidably used, - for the duration of the call — noted as a non-suspendable region.) -- Memory-model split stays clean: `InterpStack` holds **control state** (frames, ips, - register files); **addressable bytes** come from `mem_arena` (host-identity) or the guest - address space (emu mode, frames carve `guest_sp_base`). The register file is always - host-side, so it never needs translation. -- The host C stack stays O(1) regardless of IR call depth; deep IR recursion grows - `stack->frames`, not the host stack. - -Per-`InterpFunc` side tables: constant pool (wide imms + `ConstBytes`), call-target table -`{internal InterpFunc* | external host_fp, CGCallDesc*, ABIFuncInfo*}`, switch tables -(case→pc, default→pc resolved), `slot_off[nframe_slots]`+`frame_bytes`, and a transient -`block_pc[nblocks]` used during branch fixup. - -## Lowering algorithm (`Func` → `InterpFunc`) - -1. Walk blocks in `f->emit_order`, skipping unreachable; record `block_pc[b]`; emit one - record per non-no-op `Inst` (SCOPE_*/NOP/PARAM_DECL emit nothing; assert no `IR_PHI`). -2. PReg id *is* its register-file index; register file size = `f->npregs`. -3. Bump-allocate `slot_off[]` over `f->frame_slots[]` honoring `align`/`size`; - `FS_ALLOCA` slots allocate dynamically at `OP_ALLOCA`. -4. Encode operands per `OptOperand` union; choose width/signedness from - `MemAccess.size` / `abi_cg_type_info` (`abi.h:141`). `OPK_INDIRECT` needs an aux record. -5. Second pass: rewrite branch/switch/indirect targets from block id → `block_pc`. -6. Thread: set `record.handler = dispatch_table[opcode]` (table fetched once via an - engine "publish table" call). 
- -**Arena lifetime:** `InterpFunc` references small descriptors (`CGCallDesc`, switch aux) -from the opt `Func` arena. Deep-copy them in `lower.c` (recommended, given the emu's -per-block churn) rather than pinning the opt arena. - -## Engine (`engine.c`) - -`interp_resume(P, stack)` runs the top frame with a direct-threaded loop. `ip`/`regs`/ -`fr` are cached locals reloaded from `stack->frames[top]` after any push/pop; -`#define NEXT() goto *(ip->handler)`. Representative handlers: -- **BINOP int/fp:** read `regs[a]`,`regs[b]`, apply by `flags`, write `regs[dst]`; guard - div/rem-by-zero and FTOI overflow → `goto trap` (sets `stack->status=TRAP`, returns). -- **LOAD/STORE:** compute abstract addr (`regs[base] + ofs [+ regs[index]<<scale]`), call - `P->mem.translate(ctx, addr, n, perms)`; never raw-deref — this is what makes the two - memory models swap cleanly. -- **CMP_BRANCH:** compare, set `ip = code + (taken ? taken_pc : fall_pc)`. -- **CALL_INTERNAL:** save `ip` into current frame; **push** a new `InterpFrame` (bump regs - + mem from the stack); copy `desc.args[i]` by value into callee param homes; record - `ret_dst`; reload cached locals from the new top frame; `NEXT()`. No host recursion. -- **CALL_EXTERNAL:** `ffi_call_external` (below) — the only handler that uses the host C - stack, for the call's duration (non-suspendable region). -- **RET:** shuttle scalar/aggregate into `stack->scalar_ret`/`sret_ptr`; **pop** the frame; - if the stack is now empty set `status=DONE` and return; else write the result into the - caller's `ret_dst`, reload cached locals, `NEXT()`. -- **ALLOCA:** bump `fr->alloca_top`, return `frame_base + off`. -- **CONVERT:** specialized by src/dst width → the corresponding C cast. - -## Pluggable memory (`mem.c`) - -`InterpMem { u8* (*translate)(void* ctx, u64 addr, u64 n, u8 perms); void* ctx; }`. -- Host-identity: `return (u8*)(uintptr_t)addr`. 
Locals/allocas return real `&frame.mem[off]`; - globals resolve via `cfree_jit_lookup`/extern resolver. -- Emu: `return emu_addr_space_ptr((EmuAddrSpace*)ctx, addr, n, perms)` (`emu.h:402`, - bounds+perm checked). Interpreter frames live in *guest* stack memory: carve - `frame_bytes` off the guest SP so `&local` stays a valid guest VA. - -## Call marshalling (`ffi.c`) - -- **Internal** (callee resolves to an `InterpFunc`): handled by the engine's push/pop on - the explicit `InterpStack` (above) — pure value semantics, no host ABI, no host recursion; - aggregates copied by `abi_cg_sizeof` bytes. -- **External** (host function pointer): hand-rolled marshaller driven by `desc.abi` - (`ABIFuncInfo`, `abi.h:116`). Classify ret + args into int/fp/by-ref buckets; dispatch - through a finite family of typed function-pointer cast thunks keyed on - `(ret_class, n_int, n_fp)` up to a fixed cap (~8 int + 8 fp). Handle `has_sret` - (alloc aggregate, pass hidden pointer, copy back) and byval struct args (copy to host - buffer, pass pointer). Variadic supported when varargs fit the register thunk family; - `vararg_on_stack` (Apple ARM64) → diagnose, defer. Anything beyond the family → - diagnose `"interp: unsupported external call signature"` with the precise reason. - -## Integration with existing paths - -- **Standalone:** front end records CG IR as today → `opt_run_o1_interp` per func → - `lower.c` → register in `InterpProgram`; globals via extern resolver / `cfree_jit_lookup`; - `cfree_interp_lookup` + `cfree_interp_call` to run. -- **Emu interpret-mode:** in `emu_translate_block` (`emu.c:380-505`) keep decode + - `arch->emu->lift_block` (`emu.c:464`) unchanged; instead of `cfree_link_session_jit`, - run the lifted CG IR through `opt_run_o1_interp` + `lower.c` into an `InterpFunc`, cache - by `guest_pc` (mirror `emu_cache_insert`, `emu.c:500`). 
Dispatch calls `interp_run` with - the emu-addr-space `InterpMem`; external/runtime symbols route via - `emu_runtime_extern_resolver` (`emu.c:480`). Gate on a new JIT-vs-INTERP `CfreeEmuMode`. - -## Phased implementation order - -- **Phase 0 — scaffolding:** `src/interp/` skeletons, config flag, Makefile + stubs, - `opt_run_o1_interp` (verify the pass-subset risks above). Build green. -- **Phase 1 — minimal leaf int fn (milestone):** build the explicit `InterpStack`/ - `InterpFrame` model and `interp_resume` loop up front (single-frame is enough here, but - the push/pop machinery and arenas land now — not retrofitted). Support `LOAD_IMM`, - `COPY`, int `BINOP`, scalar int `RET`, `BR`, params-in-PRegs. Run `int add(int,int)` / - `return 2+3;` host-identity by seeding a stack and resuming; assert results. -- **Phase 2 — control flow & memory:** `CONDBR`, `CMP_BRANCH`, `SWITCH`, `LOAD`/`STORE`/ - `ADDR_OF` (local+global) via vtable, frame-slot offsets, `ALLOCA`, `CONVERT`, `UNOP`, - `CMP`. Loops/switches/pointers work. -- **Phase 3 — calls & FP:** internal call/return via stack push/pop (incl. aggregates; - exercises multi-frame depth on the explicit stack), external FFI thunks - (sret/byval/basic variadic), F32/F64 handlers. -- **Phase 4 — emu integration:** emu-addr-space `InterpMem`, interpret-mode branch, - guest-stack frames, runtime-symbol routing; run a guest ISA smoke test interpreted. -- **Phase 5 — long tail:** atomics (host atomics / single-thread serialize), bitfields, - `AGG_SET`, `TLS_ADDR`, `VA_*`, intrinsics. Diagnose-and-reject the rest. - -## Verification - -- **Unit harness:** new `test/interp/interp_smoke_test.c` mirroring - `test/opt/cg_ir_lower_test.c` (self-contained heap/diag sink, `EXPECT`): build tiny CG - IR, run `opt_run_o1_interp` + lower + `interp_run`, assert return values. Register in - `test/test.mk`; add `test/interp/run.sh` paralleling `test/opt/run.sh`. 
-- **Differential testing (highest value):** run the same CG IR through native JIT - (`cfree_link_session_jit`) and the interpreter over the existing opt corpus; assert - identical results/side effects. Reuses the optimizer front half; catches semantic drift. -- **Emu mode:** mirror `test/emu/rv64_smoke_test.c`; compare interpret-mode vs JIT output. -- **Tooling:** an `interp_dump` bytecode disassembler gated like `CFREE_DUMP` - (`opt.c:69-82`) for debugging lowering. - -## Unsupported / risks (diagnose, don't miscompile) - -- **Inline asm** (`IR_ASM_BLOCK`): needs machinize's constraint resolution we skip; no - portable interpretation → reject at lower time. -- **Vectors** (`RC_VEC`): `u64` register file → reject vector pregs/types initially; widen - to 128-bit lanes later if needed. -- **setjmp/longjmp:** host `setjmp` over `InterpFrame` chains feasible in identity mode; - emu mode needs interpreter-stack unwinding → defer, diagnose. -- **FFI cap:** signatures beyond the thunk family / `vararg_on_stack` / exotic - `ABI_ARG_EXPAND` → diagnose with the reason. -- **`opt_verify` w/o machinize:** if it asserts on `opt_has_target`, drop from interp entry. - -## Implementation status (initial landing) - -Implemented and validated differentially against the JIT: - -- **`opt_run_o1_interp`** (`src/opt/opt.c`): runs `opt_func_from_cg_ir` → cfg/jump-cleanup - → `opt_simplify_local` → `opt_addr_xform_pregs` → `opt_promote_scalar_locals` - → `opt_addr_of_global_cse` → loop tree → `opt_live_blocks` + `opt_dead_def_elim_with_live`, - then STOP. `opt_verify` is omitted (debug-only; some checks assume the machinized shape). -- **`src/interp/`**: `interp.h`/`interp_program.c`/`lower.c`/`engine.c`/`ffi.c` (+ `interp_stubs.c`). - Public API in `include/cfree/interp.h`. Gated by `CFREE_INTERP_ENABLED`. 
-- **Direct-threaded dispatch.** Each `InterpInsn` caches the `&&handler` of its opcode and - every handler tail-dispatches with `goto *in->handler`, so the branch predictor sees a - distinct indirect branch per opcode site. The same handler bodies compile as a portable - `switch` fallback via the `OP()`/`NEXT()`/`GO()` macros: the threaded path is gated on - `(defined(__GNUC__) || defined(__clang__)) && !defined(__cfree__)` (cfree's own C front end - has no labels-as-values, so the self-host build uses the switch), with the GNU label-as-value - warnings suppressed locally. The dispatch table is published from the in-function labels once, - then each function's records are threaded lazily on first entry. A build can force either path - via `-DCFREE_INTERP_THREADED=0|1`. -- **Explicit `InterpStack`/`InterpFrame`** with offset-based register/mem arenas (so a deep - call can `realloc` an arena without invalidating cached frame pointers). CALL pushes / RET - pops; **tail calls** (terminator `IR_CALL` / `CG_CALL_TAIL`) relocate the freshly-built callee - frame down onto the dead caller's register/memory region and rewind the arenas, so a tail - loop runs in **true O(1)** interp + host stack space (validated to 5M deep). -- **Ops**: imm/copy/load/store/addr-of (incl. memory-destination results via `write_dst`), - binop/unop/cmp/convert (width+sign from the op tag), br/condbr/cmp_branch/switch/ - indirect-br/load-label-addr, internal+external/indirect call, ret (scalar + aggregate - sret), alloca, agg-copy/set, **bitfield load/store** (shift+mask extract / read-modify-write - insert, signed-field sign-extension), **`IR_TLS_ADDR_OF`** (thread-local address; see TLS - below), **variadics** (`va_start`/`va_arg`/`va_end`/`va_copy`), **atomics** (single-thread - serialized), and the common **intrinsics** (memcpy/move/set, popcount/ctz/clz/bswap, - checked-overflow, expect, trap). 
Float→int conversion **saturates** (NaN→0, clamp) — matches - Wasm `trunc_sat` and avoids the UB of a raw out-of-range cast. -- **Static local data + switch jump tables** (`IR_LOCAL_STATIC_DATA_*`, `lower.c`): each - function-scope static blob (regular static locals, dense-switch jump tables, computed-goto - label arrays) is materialized into an interp-private, program-lifetime buffer at lower time - and its symbol bound to that buffer. `WRITE` records contribute literal bytes; `LABEL_ADDR` - records contribute the target block's **bytecode pc** (so an `IR_LOAD` + `IR_INDIRECT_BRANCH` - through a jump table lands on the right record, not a native code address baked in by the - parallel object/JIT path). This is what unblocks the dense `-O1` switch lowering. -- **Variadics** (interpreter-private va_list): on an internal call to a variadic callee the - anonymous arguments are laid out into a contiguous buffer in the callee frame; `va_start` - seeds the va_list with a cursor over it, `va_arg` reads the typed slot and advances, `va_copy` - duplicates the cursor. The layout is self-consistent because the interpreter owns both the - call-site build and the va ops — independent of the target ABI's real va_list. -- **Thread-local storage**: a thread-local's symbol does not denote its storage on every target - (a Mach-O symbol resolves to a TLV descriptor), so `IR_TLS_ADDR_OF` routes through the host - `resolve_tls` hook. In `--no-jit` the driver implements it via `cfree_jit_tlv_resolve`, which - unwraps **our own** Mach-O descriptor (it verifies `desc[+0]==&cfree_jit_tlv_thunk` and - `desc[+8]==jit->tls_ctx` first, so a foreign/dyld descriptor never becomes a wild call) and - calls the JIT image's per-thread block accessor to get the variable's real address. The - interpreter shares the same per-thread storage the JIT would use, so reads/writes are - consistent and persist across calls. Anything it can't resolve safely — a non-Mach-O image or - a foreign descriptor (e.g. 
an extern thread-local) — returns NULL and is **diagnosed**, never - treated as storage. -- **`cfree run --no-jit`** (`driver/cmd/run.c`): forces `-O1` minimum, attaches an `InterpProgram` - so each function is captured while the normal object/JIT-link still runs (it lays out data - globals and resolves externs / function pointers). The entry executes **only** through the - interpreter — there is **no JIT execution fallback**; a non-interpretable entry is an error. - Globals/externs resolve by iterating the JIT image's symbol table (locals included, - tolerating the target's leading-underscore C mangling), then host `dlsym`; thread-locals - additionally route through `cfree_jit_tlv_resolve`. Wasm entries get their instance/linear- - memory set up and run `__cfree_wasm_init` + entry via the interpreter. -- **FFI** (`ffi.c`): a maximal host-ABI prototype family — int args in `u64×8`, fp args in - `double×8` (or `float×8` when every fp arg is a 4-byte single; a float/double mix in one - signature is diagnosed). Int/fp args land in their register pools regardless of interleaving - on the supported ABIs. Handles sret, byval-by-pointer, and **multi-register struct returns** - (up to two registers, any int/fp class combination — struct-returning thunks steer the return - registers and the caller scatters each part into the aggregate). `vararg_on_stack` (Apple - ARM64 variadics) and 32-bit-fp struct-return fields are diagnosed/deferred. -- **Tests**: `test/interp/interp_smoke_test.c` (unit, `make test-interp`); toy `I` path and - wasm `N` path (`make test-interp-toy`, default paths now include them). Differential result: - toy `I` 298/312 match the JIT exactly with 0 diffs (only the 7 inline-asm cases SKIP, by - design); wasm `N` `.wat`/`.wasm` match; C (incl. libc via FFI — f32 args, multi-register - struct returns) matches; `141_threadlocal_mutate` exercises TLS define+mutate. 
- -## Phase 4: emu interpret-mode (implemented, rv64) - -The emulator can now run each lifted guest block through the interpreter instead of JITing it, -behind `CfreeEmuMode {JIT,INTERP}` on `CfreeEmuOptions` (driver flag `cfree emu -interp`, which -forces `-O1`). The key simplification — verified against the rv64 lifter — is that **the interp -frame stays host-identity**: the rv64 lifter (`src/arch/rv64/emu.c`) lowers guest loads/stores to -FFI calls into `__emu_*_checked` host helpers that bounds/perm-check the guest VA internally, and -the lifted block's frame locals are pure host scratch passed as host out-pointers. So no guest-VA -`translate` hook, no guest-stack frame carving, and no frame-model changes are needed (the doc's -original "carve `frame_bytes` off the guest SP" plan would have corrupted host memory and was -dropped). Concretely: - -- `cfree_emu_new` attaches an `InterpProgram` as the compiler's interp sink (so each translated - block is also captured as an `InterpFunc`) and stands up one long-lived `InterpStack`; it binds - only `resolve_sym = emu_runtime_extern_resolver` (translate hook stays NULL). The per-block JIT - image is still built to resolve helper externs and validate the lifted IR. -- `translate_block` caches the captured `InterpFunc*` (payload disambiguated by `e->mode`). -- The single JIT/INTERP fork is at `cfree_emu_step`: INTERP runs `emu_interp_run_block`, which - resets the stack, seeds param0 with `(u64)EmuThread*`, resumes, and returns the scalar result as - `next_pc`. Guest faults/exits are delivered in-band by the helpers (CPU trap + `next_pc`) and - observed by the existing post-dispatch check, exactly as in JIT mode. -- Chosen UX: a block the interpreter can't run (lower-time reject or runtime trap) **hard-fails** - via `compiler_panic` with the reason — no silent JIT fallback. 
-- New additive engine API: `cfree_interp_stack_reset`, `cfree_interp_call_args_on`, - `cfree_interp_stack_trap_reason` (so one stack is reused across blocks without realloc). -- **Self-modifying code:** when a guest writes a translated code page the addr-space generation - bumps and the code cache is flushed, but `interp_prog` is append-only, so re-translation appends - a new same-named `InterpFunc`. `cfree_interp_lookup` therefore returns the **last** (freshest) - name match, giving INTERP the same fresh-code semantics the JIT gets from its per-block object - lookup. (`--no-jit` captures each name once, so last == first.) -- Tests: `test/emu/rv64_interp_smoke_test.c` (`make test-interp-emu`) runs two guests under both - modes at `-O1` and asserts identical exit codes — an `SD`/`LD`/`ecall` guest (exercises the - helper memory path) and a self-modifying guest (exercises code-cache + interp-capture - invalidation; verified by mutation to catch a stale-lookup regression). Scope is rv64: it is the - only arch with an emu lifter, and that lifter currently implements only - `ADDI/ADD/AUIPC/LD/SD/JALR/ECALL` (others hit a re-dispatch-this-pc default). - -Not yet implemented (diagnosed → SKIP, not miscompiled): inline asm (by design; needs -machinize's constraint resolution), FFI signatures beyond the register-thunk family -(`vararg_on_stack` external variadics, 3+-register struct returns, 32-bit-fp struct-return -fields, and an aggregate/>8-byte scalar in an external variadic-argument position — these have -no per-call ABI classification, so they are rejected rather than marshalled), and thread-locals on -non-Mach-O images or via foreign/dyld descriptors (extern thread-locals). Emu interpret-mode is -rv64-only (no other arch has an emu lifter) and does not reclaim stale `InterpFunc` captures under -churny self-modifying code (bounded leak until emu teardown, like the JIT's `e->jits[]`). 
- -Known limitations (correct results, not bugs): the threaded-dispatch per-site branch benefit -only materializes at optimization — the `-O0`+sanitizer test build merges the dispatch sites -(still computed-goto through the handler field, never a switch). The `g_mem_fault` latch is -re-checked on straight-line ops and on branch selectors; in emu mode this latch is not exercised -by guest memory (the interp frame is host-identity and guest loads/stores go through the bounds- -checked `__emu_*` helpers, not the interp `translate` hook), so guest faults surface in-band as a -block return value rather than through `g_mem_fault`. +**The two arenas are fixed, non-relocating reservations.** An `OP_ADDR_OF` +materializes a local's address as an absolute host pointer into `mem_arena`, and +that pointer can escape into a register or into another local; reallocating +(moving) the arena would dangle it. Frames follow strict stack discipline (CALL +bumps the top, RET rewinds it), so a generous fixed reservation suffices, and +overflow traps cleanly as a stack overflow rather than corrupting memory. Frames +themselves reference the arenas by **offset**, not pointer, so the frame array +*can* be `realloc`'d on growth without invalidating anything. + +### Dispatch: direct-threaded with a switch fallback + +Where the host compiler supports labels-as-values (GCC, clang, and cfree itself), +the engine is **direct-threaded**: on first entry to a function each opcode's +`&&handler` is copied into its records, and every handler tail-dispatches with +`goto *in->handler`, giving the branch predictor a distinct indirect branch per +opcode site. The identical handler bodies compile as a portable `switch` for any +other compiler, sharing one source through the `OP()`/`NEXT()`/`GO()` macros. The +choice is governed by `CFREE_INTERP_THREADED` (default on, in +`include/cfree/config.h`) AND the compiler's capability; it can be forced with +`-DCFREE_INTERP_THREADED=0|1`. 
Keeping a switch fallback is what lets any +compiler lacking labels-as-values run the same engine through one portable code +path, with no behavioral difference from the threaded build. + +### Handler shape and key behaviors + +- **Arithmetic / compare / convert** read operand values, apply the operation by + width and fp flag, and write the result. Width masking and sign-extension are + explicit. Integer divide/rem guard divide-by-zero and the `INT_MIN / -1` + overflow (wrap, not UB). Float-to-int conversion **saturates** (NaN -> 0, + out-of-range -> clamped): this matches Wasm `trunc_sat` and, crucially, avoids + the UB (and UBSan trap) of casting an out-of-range double to an integer, while + staying identical to a plain cast for well-defined inputs. +- **Loads / stores / addressing** never raw-dereference. Every memory access goes + through `interp_translate` (below), which is what makes the two memory models + swap cleanly. A destination operand may itself be memory — the optimizer leaves + address-taken locals un-promoted — so `write_dst` handles register *and* memory + destinations. +- **Branches** retarget `ip` to a resolved pc and re-dispatch. Because branch + handlers skip the straight-line memory-fault recheck, they test the fault latch + before consuming a possibly-garbage selector. `IOP_SWITCH` reads its + pre-resolved case/default pcs from a side table; `IOP_INDIRECT_BR` and + `IOP_LOAD_LABEL_ADDR` traffic in bytecode pcs (see static data above). +- **Faults vs. unsupported.** A runtime fault (bad memory, divide-by-zero, + `__builtin_trap`, unreachable) sets `TRAP`; an op/signature the engine can't + handle sets `ERROR`. Both record a borrowed reason string retrievable via + `cfree_interp_stack_trap_reason`. Memory faults use a latch rechecked on + straight-line ops and at branch selectors so a faulting access stops the loop + rather than propagating zero. + +### Calls and O(1) tail calls + +`IOP_CALL` resolves its callee from the call descriptor. 
A `GLOBAL` callee that +names a TU-internal function (or a function-pointer in a register whose host +address reverse-maps to one) is **interpreted**, not run as native — even through +a function pointer — so the no-JIT contract holds: the interpreter never executes +JITed code. Only a genuinely external target reaches the FFI path. + +- **Internal call:** push a frame (bump regs + mem), bind arguments by value into + the callee's parameter homes (register params into the register file, aggregate + / large params copied into frame slots), lay out any anonymous (variadic) args + into a contiguous buffer in the callee frame, record where the caller wants the + result (a register, or an sret pointer into the caller's slot), then re-dispatch + on the new top frame. No host recursion. +- **Return:** shuttle the scalar (or copy the aggregate into the sret pointer), + rewind the arenas to the frame's bases, pop, and deliver the result into the + caller's recorded destination. An empty stack means `DONE`. +- **Tail calls** (terminator `IR_CALL` / `CG_CALL_TAIL`, or the last emitting inst + of a successor-less block) are true O(1): the freshly-built callee frame is + relocated *down* onto the dead caller's register/memory region and the arenas + are rewound, so a tail loop runs in constant interp- and host-stack space. This + is safe precisely because the callee has not executed yet — no absolute pointers + into its own frame exist, and the argument-binding step has already copied every + argument value out of the caller. External tail calls similarly forward the + call's result as this frame's result without growing the stack. 
+ +### Variadics, bitfields, atomics, intrinsics, TLS + +The interpreter owns both ends of variadics, so its **interpreter-private +va_list** is self-consistent regardless of the target ABI's real layout: the +anonymous args are packed into the callee frame at aligned slots, `va_start` +seeds a cursor over that buffer, `va_arg` reads the typed slot and advances, +`va_copy` duplicates the cursor. + +**Bitfields** are interpreted by shift/mask extract (sign-extending signed +fields) and read-modify-write insert over the storage unit, using the field's +layout descriptor. **Atomics** run on the single-threaded engine: the operation +is serialized and the memory order is treated as sequentially consistent; a fence +is a no-op. **Intrinsics** cover mem{cpy,move,set}, popcount/ctz/clz/bswap, +checked-overflow builtins (exact-width detection), expect/assume/prefetch, and +trap/unreachable. + +**Thread-local storage** routes through the host's `resolve_tls` hook rather than +treating the symbol as storage, because a thread-local symbol does not denote its +storage on every target (a Mach-O symbol resolves to a TLV descriptor). The hook +returns the calling thread's address of the variable. When no hook is bound, the interpreter +falls back to plain-global resolution, correct only where the symbol *is* the +storage; anything it cannot resolve safely returns NULL and is diagnosed. + +## Pluggable memory and symbols (host vs. emu) + +The engine never assumes how an abstract address maps to a host pointer. A +`CfreeInterpHost` vtable (resolved through `interp_translate` / +`interp_resolve_sym` in `interp_program.c`) provides `translate`, `resolve_sym`, +and `resolve_tls`; any may be NULL. Two configurations share the engine: + +- **Host-identity** (`cfree run --no-jit`): abstract addresses *are* real host +  pointers.
`interp_translate` returns the address unchanged; locals/allocas live + in `mem_arena` and their addresses are real host pointers; globals/externs + resolve through the bound resolver. +- **Emu/guest**: addresses are guest VAs translated through an `EmuAddrSpace`, + bounds- and permission-checked (see [EMU.md](EMU.md)). + +Global symbols a function references are noted at lower time (their names +captured from the obj while it is alive), and their host addresses resolved +lazily and cached on first use, after the JIT image has been linked. + +## Integration + +**`cfree run --no-jit`** (`driver/cmd/run.c`) forces at least `-O1`, attaches an +`InterpProgram` so each function is captured to bytecode while the normal +object/JIT-link still runs (the link lays out data globals and resolves externs / +function pointers), then executes the entry **only** through the interpreter — +there is no JIT execution fallback; a non-interpretable entry is a hard error. +Globals/externs resolve by walking the JIT image's symbol table (tolerating a +target's leading-underscore C mangling), then host `dlsym`; thread-locals +additionally route through `cfree_jit_tlv_resolve`, which unwraps cfree's own +Mach-O TLV descriptor (verifying it before any indirect call so a foreign/dyld +descriptor never becomes a wild call). Wasm entries get their instance and linear +memory set up and run `__cfree_wasm_init` plus the entry through the interpreter. + +**Emu interpret mode** (`CFREE_EMU_MODE_INTERP`, `src/emu/emu.c`) runs each lifted +guest block through the interpreter instead of JITing it, also forcing `-O1`. The +key simplification, verified against the rv64 lifter, is that the interp frame +stays **host-identity**: the lifter lowers guest loads/stores to FFI calls into +bounds-checked `__emu_*` host helpers, so there is no guest-VA translate hook and +no guest-stack frame carving — only `resolve_sym` is bound. 
A long-lived stack is +reset and reseeded per block (the additive `cfree_interp_stack_reset` / +`cfree_interp_call_args_on` API). Because capture is append-only, +`cfree_interp_lookup` returns the **newest** same-named function, which gives +interpret mode the same fresh-code semantics the JIT gets when self-modifying code +invalidates a translation. A block the interpreter cannot run hard-fails with the +reason; there is no silent JIT fallback. + +## Public API surface + +The public header (`include/cfree/interp.h`) exposes the program lifecycle, the +host vtable, and an **explicit-stack** API designed as a swap-ready substrate for +fibers / virtual threads: create a stack, seed it with an entry frame and +arguments, and run/resume it to a `DONE`/`TRAP`/`ERROR`/`BLOCKED` status. +Swapping execution contexts is just resuming a different stack. `cfree_interp_call` +is the convenience wrapper that allocates a stack, seeds it, resumes to +completion, and frees it. The external-call path through host code is the one +region that necessarily uses the host C stack for the call's duration, and is +therefore non-suspendable. Exact signatures live in the header. + +## FFI: external calls (`ffi.c`) + +External (host) calls are marshalled by a hand-rolled cast-thunk family — the +classic libffi-lite trick. The engine classifies the call's `ABIFuncInfo` into +integer-register and fp-register slots (sret pointer first, byval aggregates +passed by pointer, register-split aggregates chunked), then `interp_ffi_invoke` +calls the host function pointer through a prototype cast that matches the +classified shape. This is correct on the supported ABIs (SysV x64, AAPCS64, RV64 +LP64D) because integer and fp arguments come from independent register sequences, +so a maximal `T(u64 x8, fp x8)` prototype places the first N integers and M fp +values in the right registers regardless of interleaving; unused trailing slots +are ignored. 
Two fp shapes exist because a 4-byte single and an 8-byte double +occupy the fp register differently; a signature mixing the two is rejected. +Returns mirror this: a value comes back in one or two registers, dispatched +through scalar or struct-returning thunks whose field types steer the return +registers, and the caller scatters each part into the aggregate destination. + +Signatures outside this family are diagnosed, not guessed: too many register +args, stack-routed variadics (Apple ARM64 `vararg_on_stack`), 3+-register struct +returns, 32-bit-fp struct-return fields, and aggregate / oversized scalars in a +variadic-tail position (which have no per-call ABI classification). The thunk +casts deliberately mismatch the real prototype, which trips clang's +`-fsanitize=function`, so the dispatcher opts out of that one check (clang only; +the cfree self-host build never enables it). + +## What it does not do, by design + +Inline asm (`IR_ASM_BLOCK`) is rejected: it needs machinize's constraint +resolution, which the interp tap skips, and has no portable interpretation. The +FFI signatures listed above are diagnosed rather than marshalled. These are clean +rejections with a reason, never miscompilations. + +See also: [IR.md](IR.md), [OPT.md](OPT.md), [JIT.md](JIT.md), [EMU.md](EMU.md), +[CODEGEN.md](CODEGEN.md). diff --git a/doc/IR.md b/doc/IR.md @@ -1,490 +1,363 @@ # IR -This document defines the target shape for cfree's lowered-CG IR: the -function-level representation recorded from the internal `CGTarget` interface -and consumed by optimization, backend replay, and future non-JIT interpreted -execution. - -The intended boundary is a shared recorded-CG IR layer. Frontends keep emitting -public `CfreeCg` calls, `CfreeCg` lowers stack/lvalue source operations into -`CGTarget` calls, and this IR records the resulting typed locals, labels, -control-flow operations, memory operations, and ABI-shaped call operations. 
-Optimizers and backends may attach private side tables or lowered views, but -those are not part of the shared IR contract. - -## Pipeline Position - -The IR sits below the public CG API and above target machinization: +This document defines the semantics of cfree's semantic CG IR: the +function-level, recorded form of the internal `CgTarget` interface, captured as +a `CgIrModule` of `CgIrFunc` bodies (see src/cg/ir.h). It is the stable hinge +between the frontend's typed CG-API calls and everything downstream that wants a +durable program form rather than immediate emission: the optimizer, the +threaded interpreter, and source-like replay backends. This is the +authoritative semantics-of-the-IR reference; for how the IR is produced and +replayed see [CODEGEN.md](CODEGEN.md), for the optimizer's own derived form see +[OPT.md](OPT.md), and for interpreted execution see +[INTERPRETER.md](INTERPRETER.md). + +## What the IR is, and what it is not + +The IR is a faithful tape of `CgTarget` calls. The `CgTarget` interface +(src/cg/cgtarget.h) is the semantic codegen API: typed locals, labels, +structured scopes, memory ops, ABI-shaped calls, atomics, intrinsics, inline +asm. A backend can implement that interface to emit code immediately (the O0 +native path, the C-source path). The IR recorder (src/cg/ir_recorder.c) is a +second implementation of the *same* interface that, instead of emitting, records +each call into a `CgIrInst` on the current `CgIrFunc`. Replaying the recorded +tape through a direct target reproduces exactly what immediate emission would +have done. + +Because of that, the IR carries no optimizer state and no machine state. There +are no basic blocks, no SSA values, no phis, no dominance, no liveness, no +virtual or hard registers, no spill slots, and no call plans in the CG IR. Those +are all derived, consumer-private views. 
In particular the optimizer's `Func` +IR (src/opt/ir.h) is a *separate* representation with its own op set +(`IR_PHI`, `IR_PARAM_DECL`, `IR_CONST_I`, ...); it is built from the CG IR, not +a superset of it. Do not conflate the two: the CG IR enum is `CgIrOp` in +src/cg/ir.h; the optimizer enum is `IROp` in src/opt/ir.h. + +The IR is target-data-layout-specific but not target-instruction-specific. Type +sizes, alignments, record field offsets, bitfield bit ranges, ABI +classifications, and pointer widths are already resolved for the compile target +by the time the recorder sees a call. The IR does not know about machine +instructions, addressing-mode legality, or register files. + +## Pipeline position ```text -frontend -> CfreeCg -> recording CGTarget -> lowered-CG IR - |-> optimizer-derived CFG/SSA/MIR views - |-> interpreter - |-> replay into native/C/wasm target +frontend + -> CfreeCg (public CG API: stack/lvalue model) + -> CgTarget (semantic codegen interface) + |-> direct native target (O0 emit) + |-> direct C-source target (--emit=c) + \-> IR recorder -> CgIrModule (O1/O2, interpreter) + |-> opt: derive Func (CFG/SSA/MIR) -> native emit + \-> opt: derive Func (reduced) -> interpreter ``` -The IR is target-data-layout-specific: type sizes, alignments, record field -offsets, bitfield positions, ABI classifications, and pointer widths are -already selected for the compiler target. It is not target-instruction-specific -until `opt_machinize` or an equivalent backend-prep pass runs. - -An interpreter should execute this pre-machinize IR. Post-machinize and -post-register-allocation forms contain hard registers, spill slots, scratch -policies, call plans, and backend emission constraints; those are not semantic -execution concepts. +`CfreeCg` lowers the frontend's stack/lvalue source operations into flat +`CgTarget` calls. At O0 those calls hit a direct target and become code right +away. 
At O1/O2 and under the interpreter they hit the recorder and become a +`CgIrModule`. The recorder is created by the optimizer (src/opt/opt.c calls +`cg_ir_recorder_new`); it notifies the optimizer per completed function and at +finalize through callbacks so cross-function work (inlining, reachability, +alias resolution) can run before the buffered IR is lowered into the wrapped +direct target. + +## Module and function structure + +A `CgIrModule` owns the translation unit's recorded functions, symbol aliases, +and file-scope `__asm__` blocks. File-scope asm is retained on the module rather +than emitted during recording because the optimizer path has no live emit +target at recording time; it is replayed at finalize. + +A `CgIrFunc` is one function body and owns everything needed to replay, +optimize, or interpret it: the preserved `CGFuncDesc` (symbol, function type, +result/param descriptors, source location, attributes, inline policy); a linear +instruction stream (`CgIrInst` tape); and side tables for locals, params, +labels, and scopes. It also caches two `ObjSymSet`s — the set of symbols it +calls and the set of globals it references — populated as operands are +recorded, so reachability and alias passes need not rescan the tape. + +There is one local namespace per function. A `CgIrLocal` is a mutable typed +location identified by a `CGLocal` id (1-based; `CG_LOCAL_NONE` is the sentinel). +A local records its `CGLocalDesc` (type, size, align, source name/loc), whether +it is a parameter (with parameter index), and whether its address has been +taken. Parameters are *declarations*, not executable ops: the recorder adds the +parameter local and a `CgIrParam` entry; there is no parameter instruction in +the tape. 
Taking the address of a local (`CG_IR_ADDR_OF`, or the dedicated +`local_addr` recording) sets the local's `address_taken` flag, which downstream +consumers use to decide it needs a concrete memory home; non-address-taken +scalar locals may live in registers, SSA values, or interpreter slots as the +consumer sees fit. + +### Labels, scopes, and derived blocks + +Labels are first-class because CG control-flow ops name them: branch targets, +switch case/default targets, label-address materialization, and the closed +target set of a computed goto. A `CgIrLabel` records its id and the source +location of its first placement. Placement appears in the tape as a +`CG_IR_LABEL` instruction. + +Structured scopes (`CgIrScope`) capture CG's structured control model — block, +loop, and if scopes with their associated break/continue/else/end semantics — +so that backends able to express structure (the C-source target, a future Wasm +target; see [WASM.md](WASM.md)) can replay it directly. Native CFG consumers +flatten scopes to ordinary labels and branches. + +Basic blocks are *not* part of the IR. A consumer that needs CFG form derives it +by splitting the linear tape at labels, scope boundaries, and terminators. That +derived CFG, with its predecessor/successor edges, layout order, and dominance, +belongs to the consumer (the optimizer builds exactly this in +`opt_func_from_cg_ir`). + +## Instructions and operands + +A `CgIrInst` has an op (`CgIrOp`), a sticky source location captured from the +last `set_loc`, an operand array, and an `extra` union holding op-specific +auxiliary data: a raw immediate, constant bytes, a `MemAccess`, or an arena +pointer to an op-specific aux struct. There is no separate result-type field; +each operand carries its own `CfreeCgTypeId` and destinations name typed locals, +so the instruction's types are recoverable from its operands and aux. 
+ +Most ops map one-to-one to a `CgTarget` method, and the operand order in the +tape follows the method's argument order — destination first where there is one. +Multi-result ops (calls, compare-and-swap, checked-arithmetic intrinsics) name +several destination locals. + +Operands use the shared `Operand` shape (src/cg/cgtarget.h), every variant typed +by a `CfreeCgTypeId`: + +- `OPK_IMM`: a signed immediate bit pattern. +- `OPK_LOCAL`: a typed function local. +- `OPK_GLOBAL`: an object symbol plus signed addend — an *address*, not a load. +- `OPK_INDIRECT`: base local, optional index local with a power-of-two scale (1/2/4/8, given as its log2), +  and a signed displacement; an addressing expression, not a load. + +There is deliberately no register operand kind. Register-like temporaries are +just locals; physical registers are a backend concern that never appears in the +CG IR. ## Types -Every IR local has a `CfreeCgTypeId`. IR value types are target-selected CG -storage types, not frontend AST types: - -- `void`: absence of a value. -- `bool`: i1 condition and compare result. -- integers: width-only i8, i16, i32, i64, and i128. Signedness is carried by -  operations, comparisons, conversions, and ABI attributes. -- floats: f32, f64, f80, and f128. `f80` is needed for x87-style extended -  precision targets; targets that do not support it reject or lower it through -  runtime helpers. -- pointers: pointer-sized values with an address space. -- function pointers: pointer values whose pointee type is a function type. -- aggregates: opaque object types with size and alignment. Source arrays and -  records lower to aggregate storage at this level; field identity is already -  gone and record layout has become byte offsets, bit ranges, and aggregate -  sizes. -- `vararg_state`: target ABI vararg state object, accessed through addressable -  storage. - -Enums and aliases are not distinct IR value types. They lower to their storage -type before reaching this layer.
Frontends and debug metadata may retain source -identity separately. - -ABI values may decompose one source type into multiple storage parts for -argument passing and returns. The IR records that ABI shape at call and return -sites, while ordinary value ops remain typed by their CG value type. - -## Representation - -### Functions - -A `Func` is one function body. It owns the semantic IR needed to execute or -replay that body: - -- The preserved `CGFuncDesc`, including the function symbol, type, ABI - classification, source location, and function attributes. -- A linear instruction stream with labels and explicit control-transfer ops. -- Typed IR locals for parameters, source locals, compiler temporaries, aggregate - homes, call results, and dynamic-allocation handles. -- Optional source/debug metadata attached to function, local, label, and - instruction records. - -The IR contract should not expose optimizer state such as SSA construction -tables, block arrays, dominance, liveness, register allocation, hard-register -metadata, or pass scratch. Optimizers may derive and cache those views -privately. - -There is one local namespace at this level. A local is a mutable typed location. -Operations define destination locals and read source locals. Taking the address -of a local with `IR_ADDR_OF` makes it addressable; backends may then home it in -a frame slot, static storage, interpreter activation memory, or another -target-specific location. An aggregate local is the way to model fixed-size -local storage independent of scalar registers. - -### Labels and Derived Blocks - -Labels are part of the shared IR contract because they are exposed by CG-level -control-flow operations: branch targets, switch targets, label addresses, and -computed-goto valid target sets all name labels. - -Basic blocks are a derived view, not the base IR API. 
An optimizer or -interpreter may split the instruction stream at labels and terminators to build -a CFG with predecessor/successor lists. That CFG may cache layout order, -fallthrough edges, dominance, and block-local analysis, but those belong to the -consumer's view. - -### Instructions - -An `Inst` has: - -- `op`: an `IROp`. -- `loc`: the sticky source location active when the instruction was recorded. -- Destination locals, if the op produces values. -- Source operands. -- `extra`: immediate, constant bytes, memory access, or op-specific auxiliary - data. - -Destination arity is op-specific. Each destination names a typed IR local, so -the instruction does not need an independent result type. Multi-result ops such -as calls, compare-and-swap, and checked-arithmetic intrinsics list multiple -destination locals. - -Most IR ops correspond one-to-one with a `CGTarget` method. SSA-only helpers -such as phi nodes are optimizer-internal extensions, not base IR ops. - -### Locals and Addresses - -The IR has one mutable local namespace. - -Scalar locals hold scalar values. Aggregate locals hold opaque bytes with target -size and alignment. A local may be used as an ordinary value, assigned by value -operations, or addressed by `IR_ADDR_OF`. Address-taken locals are lowered by -consumers to concrete storage. Non-address-taken scalar locals may remain in -registers, SSA values, interpreter slots, or other consumer-owned storage. - -Function pointers and ordinary object pointers are produced by address -materialization. A direct function declaration is an object symbol with function -type; materializing `&fn + addend` is `IR_ADDR_OF` over a global symbol operand, -and direct calls may also carry that global symbol directly as the callee. -Indirect calls use a local containing a function-pointer value. - -Function-local goto labels are a different pointer-like value. `IR_LOAD_LABEL_ADDR` -materializes an opaque label token into a local. 
The token may be stored, -loaded, compared, selected, and consumed by `IR_INDIRECT_BRANCH` inside the -same function activation. It is not a function pointer, not callable, and not -dereferenceable as data. Static dispatch tables use the data equivalent of the -same operation: a label-address data relocation tied to the containing function. - -### Operands - -IR uses the internal `Operand` shape: - -- `OPK_IMM`: signed immediate bit pattern. -- `OPK_LOCAL`: typed IR local. -- `OPK_GLOBAL`: object symbol plus addend address. -- `OPK_INDIRECT`: base pointer local plus optional index local, scale, and - offset. - -There is no distinct `OPK_REG` in the base IR. Register-like temporaries are IR -locals. Optimizers may derive SSA values or machine virtual registers as private -views, but those are not the API-level operand model. - -### Memory - -Memory accesses carry `MemAccess`: - -- Codegen type and access size. -- Known alignment. -- Volatile, atomic, restrict, readonly, writeonly, and unaligned flags. -- Address space. -- Alias root when known. - -Non-volatile scalar loads are ordinary pure value producers. Volatile loads, -stores, aggregate memory operations, bitfield stores, atomics, fences, calls, -inline asm, and relevant intrinsics are observable. Optimizations may remove or -reorder memory operations only when these flags and alias facts make that legal. - -### Control Flow - -The base IR keeps CG's control-flow model: labels, explicit unstructured -branches, structured scopes, returns, tail transfers, switches, computed gotos, -and terminating intrinsics. Unstructured control-flow ops name labels directly. -Structured control-flow ops name scope handles whose metadata records the -associated break, continue, else, and end labels. - -Consumers that need CFG form derive blocks and successor edges from labels, -structured-scope metadata, terminators, and lexical instruction order. If -control can continue at the next instruction, the op is not a terminator. 
- -## Operation Semantics - -### Administrative Ops - -- `IR_NOP`: no effect. Used as a deletion marker. - -Parameters are function/local declarations, not executable base-IR operations. -The function descriptor and local table identify each parameter local, its -source/debug metadata, type, index, and ABI incoming shape. - -### Data Movement - -- `IR_LOAD_IMM`: assign destination local from an integer-like immediate bit - pattern in `extra.imm`. This covers null pointers, integer constants, bools, - and small immediates that fit the immediate field. -- `IR_LOAD_CONST`: assign destination local from target ABI bytes in - `extra.cbytes`. This covers constants whose representation is byte-oriented - rather than integer-immediate-oriented, such as floating constants, i128, - f128/f80, and other fixed-size constants that should be preserved exactly. -- `IR_COPY`: assign destination local from source local. -- `IR_LOAD`: load a scalar value from a local/global/indirect address using - `extra.mem` into destination local. -- `IR_STORE`: store a scalar local or immediate to a local/global/indirect - address using `extra.mem`. -- `IR_ADDR_OF`: materialize the address of a local/global/indirect lvalue. -- `IR_TLS_ADDR_OF`: materialize the address of a thread-local object for the - current thread, using target TLS semantics. This remains a separate op rather - than a flag on `IR_ADDR_OF` because TLS address materialization can require - target-selected model logic, relocations, helper calls, or thread-pointer - arithmetic; it is not just an address-space property of an ordinary lvalue. -- `IR_AGG_COPY`: copy a fixed-size aggregate byte range from `src` to `dst`. -- `IR_AGG_SET`: set a fixed-size aggregate byte range at `dst` to a byte value. -- `IR_BITFIELD_LOAD`: load and extract a bitfield from a record storage unit. -- `IR_BITFIELD_STORE`: insert a bitfield into a record storage unit. 
- -`IR_LOAD`, `IR_STORE`, aggregate ops, and bitfield ops use target layout facts -already encoded in their operands and auxiliary records. - -### Arithmetic and Conversions - -- `IR_BINOP`: integer or floating binary operation. The operation tag is a - `BinOp` in `extra.imm`; operands are `dst, a, b`. -- `IR_UNOP`: unary operation. The operation tag is a `UnOp` in `extra.imm`; - operands are `dst, a`. -- `IR_CMP`: compare operation. The comparison tag is a `CmpOp` in `extra.imm`; - operands are `dst, a, b`; result is an i1/bool value. -- `IR_CONVERT`: conversion operation. The conversion tag is a `ConvKind` in - `extra.imm`; operands are `dst, src`. - -Integer types are width-only. Signedness is carried by op variants such as -signed divide, signed compare, sign-extension, and signed integer/float -conversion. - -### Calls and Returns - -- `IR_CALL`: call a direct or indirect callee described by `IRCallAux`. -- `IR_RET`: return the optional ABI value described by `IRRetAux`. - -Calls preserve the full `CGCallDesc`: function type, ABI classification, -callee operand, argument ABI values, result ABI value, tail-call flag, and -inline policy. Direct calls use an `OPK_GLOBAL` callee. Other callee operands -are indirect calls. - -Tail calls are represented by `IR_CALL`, not by `IR_RET`. A normal call has a -local continuation and may be followed by `IR_RET` if the caller returns the -call result. A required or selected tail transfer is a terminating `IR_CALL` -with no local successor and no following `IR_RET`. - -`CGABIValue` may describe a scalar value, an indirect/byval/sret address, or -multiple ABI-decomposed parts. The IR records enough information for replay, -optimization, or interpretation without re-running frontend type checking. - -### Branching - -- `IR_BR`: unconditional branch to one target label. -- `IR_CONDBR`: branch on a bool local to true and false target labels. 
-- `IR_CMP_BRANCH`: fused compare-and-branch; operands are `a, b`, comparison - tag is `CmpOp`, and targets are taken and fallthrough labels. -- `IR_SWITCH`: branch on selector to matching case label, else default label. -- `IR_LOAD_LABEL_ADDR`: assign a local the opaque address of a - function-local label. -- `IR_INDIRECT_BRANCH`: branch to a label address, constrained to the closed - target set recorded in `IRIndirectAux`. - -Label addresses are function-local opaque values. They may be compared, stored, -loaded, selected, and consumed by `IR_INDIRECT_BRANCH` in the same function -activation; they are not callable function pointers and are not dereferenceable -data pointers. - -### Structured Control - -- `IR_SCOPE_BEGIN`: begin a structured block, loop, or if scope. `IRScopeAux` - records target scope id and associated labels. -- `IR_SCOPE_ELSE`: transition to the else arm for an if scope. -- `IR_SCOPE_END`: close a structured scope. -- `IR_BREAK_TO`: transfer to a scope's break target. -- `IR_CONTINUE_TO`: transfer to a scope's continue target. - -Structured ops exist so backends that can express structure, such as a C-source -or future wasm target, can replay it. Native CFG consumers may lower them to -ordinary labels and branches. - -### Stack Allocation and Variadics - -- `IR_ALLOCA`: dynamic stack allocation. Operands are `dst, size`; - `extra.imm` is required alignment. -- `IR_VA_START`: initialize a target ABI vararg state at `ap`. -- `IR_VA_ARG`: read the next vararg value of type `extra.aux` into `dst`. -- `IR_VA_END`: end a vararg state. -- `IR_VA_COPY`: copy one vararg state to another. - -These model target calling-convention variadics, not language-level rest -parameters. +Every local and operand carries a `CfreeCgTypeId` — a CG storage type already +selected for the target, not a frontend AST type. 
Enums and typedef aliases have +already collapsed to their storage type; record/array field identity is gone, +replaced by byte offsets, bit ranges, and aggregate sizes. The CG type system +covers void, a boolean/i1 condition type, width-only integers, the float +widths, pointers (with address space), function-pointer values, opaque +fixed-size aggregates, and the per-arch vararg-state object. Signedness is not a +property of an integer type; it is carried by the operation that consumes the +value (signed vs unsigned divide, compare, shift, extend, and int/float +conversion). ABI decomposition — splitting one source value into several +storage parts for argument passing or returns — is recorded in the call and +return descriptors, not by re-typing ordinary value ops. + +## Operation families + +The complete op set is `CgIrOp` in src/cg/ir.h; the categories below describe +its semantics. The textual dumper (src/cg/ir_dump.c, reachable as +`cg_ir_func_dump`) is the canonical rendering and a good cross-check for the +spelling and operand order of any op. + +### Administrative + +- `CG_IR_NOP`: no effect; also used as a deletion marker. +- `CG_IR_LABEL`: marks the placement of a label (id in `extra.imm`). + +### Data movement + +- `CG_IR_LOAD_IMM`: set a destination local from an integer-like immediate bit + pattern (`extra.imm`) — null pointers, integer/bool constants, small literals. +- `CG_IR_LOAD_CONST`: set a destination local from exact target ABI bytes + (`extra.cbytes`) — floating constants, i128, f128, and other byte-oriented + constants whose representation must be preserved exactly. +- `CG_IR_COPY`: assign one local from another. +- `CG_IR_LOAD` / `CG_IR_STORE`: scalar load/store through a + local/global/indirect address, carrying a `MemAccess` in `extra.mem`. +- `CG_IR_ADDR_OF`: materialize the address of a local/global/indirect lvalue; + marks an addressed local `address_taken`. 
+- `CG_IR_TLS_ADDR_OF`: materialize a thread-local object's address for the + current thread. Separate from `ADDR_OF` because TLS materialization may need a + target-selected access model, relocations, helper calls, or thread-pointer + arithmetic — it is not merely an address-space attribute of an ordinary lvalue. +- `CG_IR_AGG_COPY` / `CG_IR_AGG_SET`: fixed-size aggregate byte-range copy and + fill, carrying an `AggregateAccess`. +- `CG_IR_BITFIELD_LOAD` / `CG_IR_BITFIELD_STORE`: extract/insert a bitfield in a + record storage unit, carrying a `BitFieldAccess` (storage offset, bit offset, + bit width, signedness). + +All memory and aggregate/bitfield ops rely on target layout facts already +encoded in their operands and aux records; consumers must not reinterpret layout +for a different target. + +### Arithmetic, compare, convert + +- `CG_IR_BINOP`: integer/float binary op; the `BinOp` tag is in `extra.imm`, + operands `dst, a, b`. +- `CG_IR_UNOP`: unary op; `UnOp` tag in `extra.imm`, operands `dst, a`. +- `CG_IR_CMP`: compare producing an i1/bool local; `CmpOp` tag in `extra.imm`, + operands `dst, a, b`. +- `CG_IR_CONVERT`: width/representation conversion; `ConvKind` tag in + `extra.imm`, operands `dst, src`. + +Source operands of binop/unop/cmp may be `OPK_IMM` as well as `OPK_LOCAL`; the +backend or interpreter decides whether to fold a small immediate into an +instruction form or materialize it. The operation tag families (BinOp, UnOp, +CmpOp, ConvKind, AtomicOp, MemOrder, IntrinKind) are defined in +src/cg/cgtarget.h and are open to vector/SIMD extension — consumers must switch +with a `default` arm rather than assume exhaustiveness. + +### Calls and returns + +- `CG_IR_CALL`: a direct or indirect call. The full `CGCallDesc` is preserved in + the call aux: function type, callee operand, argument locals, result locals, + flags, and inline/tail policy. 
A direct call has an `OPK_GLOBAL` callee; any + other callee operand is an indirect call through a function-pointer local. +- `CG_IR_RET`: return zero or more result locals (recorded in the return aux). + +Tail calls are modeled as a `CG_IR_CALL` carrying the `CG_CALL_TAIL` flag, not +as a property of `CG_IR_RET`. CG verifies realizability before setting the flag +(through the target's `tail_call_unrealizable_reason` query, which the recorder +forwards to its configured callback); the recorder preserves the tail policy so +replay can emit a sibling call, fall back to call-plus-return, or diagnose. + +### Branching and computed goto + +- `CG_IR_BR`: unconditional branch to a label (id in `extra.imm`). +- `CG_IR_CMP_BRANCH`: fused compare-and-branch; operands `a, b`, with the + `CmpOp` and taken-target label in the cmp-branch aux. This is CG's preferred + conditional-branch form; an arbitrary i1 in a local branches via + `cmp_branch(CMP_NE, val, 0, label)`. +- `CG_IR_SWITCH`: structured multi-way branch; the switch aux holds the selector + type, case/value pairs, default label, and density hints. Backends that can + express it natively (C `switch`, a future Wasm `br_table`) override the target + hook; otherwise CG's shared lowering reduces it to compare-branch chains or a + label-address jump table. +- `CG_IR_LOAD_LABEL_ADDR`: materialize a function-local label's address into a + local (label id in `extra.imm`). +- `CG_IR_INDIRECT_BRANCH`: branch to a label address, constrained to the closed + target set in the indirect aux. The closed set drives CFG reconstruction and + branch-target hardening (BTI/PAC/IBT). + +Label addresses are opaque, function-local tokens. They may be stored, loaded, +compared, selected, and consumed by `CG_IR_INDIRECT_BRANCH` within the same +function activation; they are not callable function pointers and not +dereferenceable data. 
+ +### Function-local static data + +- `CG_IR_LOCAL_STATIC_DATA_BEGIN` / `..._WRITE` / `..._LABEL_ADDR` / `..._END`: + define a function-scoped static-data object that needs function-label scope. + The motivating case is C `&&label` dispatch-table initializers, where a static + array is filled with code-label addresses: `_WRITE` appends bytes (or zeros), + and `_LABEL_ADDR` records a relocation to a function-local label with an + addend, width, and address space. A target that cannot resolve code-label + addresses in static data (e.g. Wasm) declines `_BEGIN`, and the recorder + reports that it likewise cannot build a label-address jump table so `switch_` + takes a different lowering. + +### Structured scopes + +These ops preserve CG's C-like structured control model — `block`, `loop`, and +`if` scopes — so backends that express structure directly (the C-source target, +a future Wasm target) can replay it without rebuilding a CFG. CFG-based +consumers ignore the structure and reconstruct control flow from the underlying +labels and branches instead. + +- `CG_IR_SCOPE_BEGIN`: open a scope. The scope id and full `CGScopeDesc` (its + `kind` — block/loop/if — and associated descriptor fields) ride in a + `CgIrScopeAux` on `extra.aux`. Recording also adds a `CgIrScope` to the + function's scope side table. +- `CG_IR_SCOPE_ELSE`: begin the else arm of an `if` scope; scope id in + `extra.imm`. +- `CG_IR_SCOPE_END`: close the most recently opened matching scope; scope id in + `extra.imm`. +- `CG_IR_BREAK_TO`: exit the named enclosing scope (loop/block/switch break); + scope id in `extra.imm`. +- `CG_IR_CONTINUE_TO`: continue the named enclosing loop scope; scope id in + `extra.imm`. + +Scope ids are 1-based with `CG_SCOPE_NONE` as the zero sentinel. The structured +form is advisory metadata layered over the same primitive control flow: a +consumer that flattens scopes to labels and branches produces the same observable +behavior as one that replays the structure natively. 
+ +### Stack allocation and variadics + +- `CG_IR_ALLOCA`: dynamic stack allocation; operands `dst, size`, required + alignment in `extra.imm`. Models target-ABI dynamic allocation (reached via + `__builtin_alloca`), not language VLAs. +- `CG_IR_VA_START` / `CG_IR_VA_ARG` / `CG_IR_VA_END` / `CG_IR_VA_COPY`: the four + C vararg operations over a target-ABI vararg-state object, always addressed by + pointer. `VA_ARG` carries the next argument's type in `extra.imm`. ### Atomics -- `IR_ATOMIC_LOAD`: atomic load from `addr` into `dst`. -- `IR_ATOMIC_STORE`: atomic store from `src` to `addr`. -- `IR_ATOMIC_RMW`: atomic read-modify-write; defines the prior value. -- `IR_ATOMIC_CAS`: compare-and-swap; defines prior value and success bool. -- `IR_FENCE`: memory fence with `MemOrder` in `extra.imm`. - -Atomic accesses carry both `MemAccess` and memory-order metadata. They are -observable and must preserve the ordering required by the memory model. - -### Inline Assembly - -- `IR_ASM_BLOCK`: one inline assembly block with template, constraints, - clobbers, input operands, and output operands. - -Constraint strings remain target-specific. Optimization may inspect clobbers -and operands but must preserve the asm block unless it can prove the source -contract permits removal. Current consumers treat inline asm conservatively. - -### Intrinsics - -- `IR_INTRINSIC`: compiler intrinsic identified by `IntrinKind`, with explicit - destination and argument operands. - -Intrinsic semantics depend on the kind: - -- Bit operations: popcount, ctz, clz, bswap. -- Memory helpers: memcpy, memmove, memset, prefetch, assume-aligned. -- Hints/control: expect, unreachable, trap. -- Non-local control: setjmp, longjmp. -- Checked arithmetic: signed/unsigned add, subtract, multiply with overflow. - -Some intrinsics are pure value producers with destinations, some are observable -side effects, and some are terminators. Consumers must classify by -`IntrinKind`, not by `IR_INTRINSIC` alone. 
- -### Optimizer Extensions - -Optimizer-owned views may add ops that are not part of the base IR API: - -- `IR_PARAM_DECL`: implementation artifact used by the current opt recorder to - place a definition for register-backed parameter locals in the entry stream. - Base IR consumers should get parameter information from the function's - parameter/local declarations instead. -- `IR_CONST_I`: SSA integer constant. Recording uses `IR_LOAD_IMM`. -- `IR_CONST_BYTES`: SSA byte constant. Recording uses `IR_LOAD_CONST`. -- `IR_PHI`: SSA merge for a derived CFG block. - -These ops should not leak into replay or interpretation unless that consumer -explicitly opts into the optimizer's SSA view. +- `CG_IR_ATOMIC_LOAD` / `CG_IR_ATOMIC_STORE`: ordered scalar load/store. +- `CG_IR_ATOMIC_RMW`: read-modify-write defining the prior value; `AtomicOp` in + the atomic aux. +- `CG_IR_ATOMIC_CAS`: compare-and-swap defining the prior value and a success + bool; carries both success and failure orderings. +- `CG_IR_FENCE`: standalone fence; `MemOrder` in `extra.imm`. + +Atomic ops carry both a `MemAccess` and memory-order metadata in their aux. They +are observable and must preserve the ordering the memory model requires. + +### Intrinsics and inline asm + +- `CG_IR_INTRINSIC`: a compiler intrinsic identified by `IntrinKind`, with + explicit destination and argument operand arrays in the intrinsic aux. + Semantics depend entirely on the kind: bit ops (popcount, ctz, clz, bswap), + memory helpers (memcpy/memmove/memset/prefetch/assume-aligned), hints + (expect/unreachable/trap), non-local control (setjmp/longjmp), and checked + arithmetic (add/sub/mul-with-overflow). Some are pure value producers, some are + observable side effects, and some are terminators or return twice — consumers + must classify by `IntrinKind`, not by `CG_IR_INTRINSIC` alone. 
+- `CG_IR_ASM_BLOCK`: one GCC-style inline-asm block — template string, input and + output constraint/operand pairs, and clobbers — captured verbatim in the asm + aux. Constraint strings are target-specific; optimization may inspect operands + and clobbers but must treat the block conservatively. ## Invariants -- Local id zero, label zero, symbol zero, and related `*_NONE` constants are - sentinels. +- Sentinels are zero-valued: `CG_LOCAL_NONE`, `LABEL_NONE`, `CG_SCOPE_NONE`, + `OBJ_SYM_NONE`. Local, label, and scope ids are 1-based. - A local has exactly one declared type for the whole function. -- Every destination and source local must be declared before use. -- A control-transfer op's target labels must name labels in the same function, - except for ordinary call targets represented as symbols or pointer values. -- A terminating op ends the current linear control path. Any following - reachable instruction must be made reachable through a label. -- Source locations are sticky at CG recording time and stored per instruction. -- Data layout facts are already target-selected; consumers must not reinterpret - record or bitfield layout for another target. - -## Consumer Guidance - -Optimization may transform the IR as long as it preserves target-data-layout -semantics, memory observability, ABI-shaped calls/returns, and CFG validity. - -Backend replay may either emit each op directly to a `CGTarget` or run a -target-prep pipeline first. Native targets generally need machinization, -liveness, register allocation, and final replay. Source-like targets may prefer -direct replay. - -Interpreted execution should use the pre-machinize IR: - -- Maintain activation storage for typed IR locals. -- Represent address-taken locals, aggregate locals, and globals as - byte-addressable target-layout memory. -- Execute control transfers by label, or by a derived CFG block id in an - interpreter-owned view. -- Treat label addresses as opaque function-local label tokens. 
-- Implement interpreted-to-interpreted calls from retained `Func` bodies. -- Route external calls through an explicit host-helper/FFI layer rather than - lowering the interpreter to machine ABI internals. - -This keeps interpretation aligned with CG semantics while avoiding native-code -emission details. - -## Implementation Plan - -The migration should be a clean cutover of the semantic `CgTarget` interface, -not an incremental layering of another IR beside the current target API. - -The O0 path must remain direct: - -```text -CfreeCg -> semantic CgTarget - |-> native direct target (O0) - |-> C source direct target (--emit=c) - |-> IR recorder (O1/O2/interpreter) -``` - -Direct targets implement the semantic interface and emit immediately. The IR -recorder implements the same interface and stores the clean IR. This preserves -O0 compile-time behavior while giving optimized and interpreted paths a stable -recorded form. - -### Phase 1: Cut Over `CgTarget` - -Update the internal target interface to match the clean IR model: - -- Use one typed mutable local namespace for scalar temporaries, source locals, - parameters, aggregate homes, and call results. -- Make parameter information part of function/local declarations, not an - executable target op. -- Remove `OPK_REG` from the semantic target surface. Register allocation is a - native-lowering concern. -- Keep labels in the base interface. Blocks remain derived consumer views. -- Keep `IR_ADDR_OF`, `IR_TLS_ADDR_OF`, and `IR_LOAD_LABEL_ADDR` distinct. -- Keep ABI-shaped call descriptors, including decomposed arguments and returns. -- Keep aggregate, bitfield, atomic, inline asm, intrinsic, and structured - control operations at the semantic level. - -Native O0 targets may still map locals immediately to registers, frame homes, -or target-private storage. 
Taking the address of a local must force or require -a concrete home; frontends/CG should continue marking known address-taken and -memory-required locals so O0 does not need avoidable late repair. - -### Phase 2: Update Direct Backends - -Port the native and C-source targets to the new semantic interface. - -Native targets should keep their current emission strategy: - -- Map non-address-taken scalar locals to backend temporaries/registers. -- Map aggregate and address-taken locals to frame or equivalent storage. -- Materialize global function/object addresses with ordinary `addr_of`. -- Materialize TLS addresses through the dedicated TLS op. -- Materialize function-local label addresses through the label-address op. -- Lower calls directly from ABI-shaped descriptors. - -The C-source target should become simpler: typed locals become C temporaries or -opaque aggregate storage, records/arrays remain raw-byte aggregate typedefs, and -enums/aliases stay lowered to storage types except in source/debug metadata. - -### Phase 3: Update O1 Optimizer - -Make the O1 path record clean IR first, then derive the current backend-oriented -view needed by the existing O1 machinery: - -```text -clean IR - -> derive CFG from labels and terminators - -> classify locals: scalar, address-taken, aggregate, ABI home - -> lower locals to virtual registers or frame/storage objects - -> lower calls to backend call plans where useful - -> run existing O1 passes - -> replay into native target -``` - -This front conversion lets the existing O1 pipeline survive the cutover: -CFG cleanup, local simplify, machinize, liveness, dead-def elimination, -register allocation, combine, and emit can continue operating on an internal -MIR-like view. That view may still use virtual registers, frame slots, phis, and -block arrays because it is optimizer-private. - -O2 can be ignored during the first cut. 
Once O1 is stable, O2 can either start -from clean IR or reuse the same O1-derived view and then add SSA/inlining back -on top. - -### Phase 4: Add Interpreter - -Add interpreted execution after the clean recorder and O1 conversion are stable. - -The interpreter should execute the clean pre-machinize IR: - -- Allocate activation storage for typed locals. -- Use byte-addressable memory for aggregates, address-taken locals, globals, - TLS instances, and dynamic allocas. -- Execute labels directly or through an interpreter-owned CFG view. -- Treat label addresses as opaque function-local tokens. -- Dispatch interpreted-to-interpreted calls through retained clean IR function - bodies. -- Route external calls through an explicit host-helper or FFI layer. - -The interpreter should not consume the native MIR/regalloc view. That keeps it -aligned with language/CG semantics rather than backend emission details. +- Every destination and source local is declared before use. +- A control-transfer op's label operands name labels in the same function; the + exception is a call, whose callee is a symbol or a function-pointer value. +- A terminating op ends the current linear control path; any following reachable + instruction must be reached through a label. +- Source locations are sticky at recording time and stamped on each instruction. +- Data-layout facts are already target-selected; consumers must not reinterpret + record or bitfield layout for a different target. + +## Consumer guidance + +Anything that reads the IR is reading a layout-resolved, ABI-shaped, but +machine-neutral program. The contract a consumer must respect: preserve +target-data-layout semantics, memory observability (the `MemFlag` set and alias +roots on each access), the ABI shape of calls and returns, and CFG validity. + +Two consumers exist today, and they take different paths: + +- The optimizer (see [OPT.md](OPT.md)) does not run passes on the CG IR in + place. 
It converts each `CgIrFunc` into its own `Func` IR + (`opt_func_from_cg_ir` in src/opt/cg_ir_lower.c), which materializes basic + blocks, SSA, virtual registers, and frame objects, then runs CFG cleanup, + simplification, machinization, liveness, register allocation, and emission, + and finally replays into the wrapped direct backend. This conversion is why + SSA/phi/const ops live in the optimizer's enum and never in the CG IR. + +- The interpreter (see [INTERPRETER.md](INTERPRETER.md)) also goes through the + optimizer's `Func` form, but via a reduced pipeline (`opt_run_o1_interp` in + src/opt/opt.c) that stops before machinization: it builds the CFG, runs + target-independent cleanups, promotes scalar locals, and hands a Func with + virtual registers to the interpreter loader (src/interp/lower.c), which emits + fixed-width bytecode. The interpreter never consumes the native MIR/regalloc + view, keeping execution aligned with CG semantics rather than backend + emission. Address-taken locals, aggregates, globals, TLS instances, and + allocas become byte-addressable interpreter memory; label addresses stay + opaque tokens; interpreted-to-interpreted calls dispatch through retained + function bodies and external calls go through an FFI layer. + +Source-like backends (the C-source target, a future Wasm target) can instead +replay the tape op-by-op into a direct `CgTarget`, taking advantage of the +retained structured scopes and switch descriptors. diff --git a/doc/JIT.md b/doc/JIT.md @@ -1,134 +1,298 @@ -# JIT — known limitations and TODOs - -Status of the in-process JIT path: `cfree_link_jit` (`src/link/link_jit.c`), -its driver front-end `cfree run` (`driver/cmd/run.c`), and the debugger session -that rides on top (`cfree dbg`, `driver/cmd/dbg.c`). Adjacent docs that cover -specific slices stay authoritative; entries here cross-reference instead of -duplicating. - -Companion docs: -- `doc/DESIGN.md` §5.5 — `LinkImage` / `CfreeJit` ownership and lifetime. 
-- `doc/MACHO.md` §3 — Mach-O Path-J reloc-apply gaps (the longest list). -- `doc/DBG.md` §12 — JIT debugger checklist (session, view, REPL). -- `doc/INCREMENTAL_LINK.md` — append-only incremental JIT linking, first - for `dbg` REPL snippets. -- `doc/HOT_RELOAD.md` — function-only hot reload built on append-only - incremental linking. -- `doc/EMU.md` §6 — per-block JIT on a growing `LinkImage` (separate scheme). - -## Driver — `cfree run` - -- [x] Pipeline outlives the JIT — `cfree_jit_lookup` dereferences - `jit->c`, so freeing the pipeline (and its Compiler) before lookup - segfaults. Ownership lifted to `driver_run` to mirror - `driver/cmd/dbg.c`. -- [x] Zero-source input case (`.o`/`.a` only) no longer spuriously - reports "out of memory" from a 0-byte `driver_alloc_zeroed`. -- [x] `driver_dlsym_resolver` retries with the leading `_` stripped so - Mach-O-mangled C names (`_strlen`) resolve through - `dlsym(RTLD_DEFAULT)`. (Call-site reachability is a separate - issue — see §"Reloc-apply" below.) -- [x] Synthetic `argv[0]` — JIT'd `main` now receives `argv[0]` set to - the first input's display name (source path, `<stdin>`, `.o`, or - `.a`); user args from `--` start at `argv[1]`. Without `--` the - program sees `argc==1`. -- [ ] `-O2` crashes on the multi-file inline-asm demo with `Bus error`. - Likely an optimizer bug surfaced through `IR_ASM_BLOCK` replay — - reduce and file against `src/opt/opt.c` (the recorder/replay seam - from `INLINEASM.md` track B), not the JIT. -- [ ] Regression test: scripted `test/run/` harness diffing exit codes - and stdout for `.c`, stdin, `.o`, `.a`, multi-file, and `-e` - entry cases. No coverage today. - -## Reloc-apply gaps - -The JIT shares resolver/layout with the file linker but has its own -in-process apply path. The Mach-O J-path issues are listed in -`doc/MACHO.md` §3; the corresponding ELF JIT path is green on the same -inputs. - -- [x] **Cross-TU data via ADRP/ADD/LDR.** (`doc/MACHO.md` §3.1.) 
- Resolved by running `layout_got` on the Mach-O JIT path - (`src/link/link_layout.c`, gated on `!l->emit_static_exe`). - The ELF-shaped synthesis materializes one `.got` slot per - GOT-referenced symbol with a per-slot `R_ABS64` reloc, and - rewrites `R_AARCH64_ADR_GOT_PAGE` / `LD64_GOT_LO12_NC` to - target the slot. The exe path keeps using - `link_macho.c::collect_imports`. -- [x] **Weak-undef / proximity.** (`doc/MACHO.md` §3.2.) Resolved - by reserving the JIT image as a single contiguous mapping - (`src/link/link_jit.c::cfree_jit_from_image`): one - `mem->reserve` call covers the full image span and segments - are subdivisions of it, so inter-segment displacements stay - within ±4 GiB (ADRP) and ±128 MiB (CALL26) regardless of - where the OS placed the mapping. Weak-undef now naturally - routes through a GOT slot whose `R_ABS64` writes 0. -- [x] **IFUNC under Mach-O JIT.** (`doc/MACHO.md` §3.3.) Excluded via - `j_targets` on `32_ifunc`, `33_ifunc_in_init`, `34_ifunc_addr_taken` - — Mach-O has no `__mod_init_func` analogue for iplt synthesis and - IFUNC is an ELF/glibc extension. Revisit only if a Mach-O-shaped - iplt scheme inside the JIT mapping becomes a requirement. -- [x] **Extern resolver / far-call.** (`doc/MACHO.md` §3.4.) - Resolved by a new layout pass `layout_jit_call_stubs` - (`src/link/link_layout.c`) that, for the AArch64 JIT path, - synthesizes a 12-byte PLT-style stub - (`ADRP x16, slot ; LDR x16,[x16] ; BR x16`) and an 8-byte - slot per resolver-supplied / weak-undef `SK_ABS` target hit - by `CALL26`/`JUMP26`. The slot is filled by a per-slot - `R_ABS64` against a synthetic resolver-pointer LinkSymbol - preserving the original (host) vaddr, and - `emit_reloc_records` redirects the CALL26/JUMP26 to the - stub. Stubs live in their own RX subsegment of the - contiguous JIT reservation so the call-site branch - displacement stays in range. `cfree run` can now call - libc directly (verified end-to-end with `write` and - `printf`). 
- -## Inspector / debugger surface - -`cfree_jit_view` and the symbol-walk inspector entries -(`cfree_jit_sym_iter_*`, `cfree_jit_addr_to_sym`, -`cfree_jit_image_contains`, `cfree_jit_image_arch`, -`cfree_jit_compiler`) landed with commit `1b5a596` and PC translation is -wired up. Remaining items are listed in `doc/DBG.md` §12; the JIT-facing -ones to keep an eye on: - -- [x] `cfree_jit_view` — multi-input handling. The debug emitter now - emits R_ABS32 relocs against SK_SECTION symbols for every - cross-section offset (CU header `debug_abbrev_offset`, root-DIE - `stmt_list` / `ranges` / `str_offsets_base`, `.debug_str_offsets` - entries, `.debug_line` `line_strp` slots). The view-builder in - `link_jit.c` walks every dbg input, snapshots per-section prefix - sizes, and resolves SK_SECTION relocs against the snapshot — - concatenated CUs land their cross-section offsets in the right - slot. Externally produced `.o` debug info routes through the - same path. -- [ ] Windows host adapter for the JIT debugger (vectored exception - handlers + `SetThreadContext` instead of POSIX signals). - `doc/DBG.md` §host-adapter. -- [ ] x64 / rv64 displaced-step (`arch_x64.c` INT3 + RIP-relative fixups, - `arch_rv64.c` EBREAK + AUIPC/JAL/branch fixups). aarch64 only - today. - -## Memory mapping / executable allocator - -- [ ] Cross-host `CfreeExecMem` audit. Today Apple silicon goes through - dual-mapping (`g_jit_dual_map`) and other POSIX hosts fall back to - `mprotect` RW↔RX. Document the contract and the failure mode when - `env->execmem` is unset (currently `compiler_panic`). -- [ ] Page-size: JIT defaults to `0x4000` when the host adapter reports - `page_size = 0`. Either require the adapter to fill it, or query - `sysconf(_SC_PAGESIZE)` in `driver/env.c`. - -## Tests - -Coverage today is `make test-link CFREE_TEST_OBJ=macho` (Path J), which -prints raw `Segmentation fault: 11` lines from the harness wrapper with -no J-specific markers. 
Items below would catch the failure modes -explicitly. - -- [ ] J-path markers in the link-test reporter so the four MACHO §3 - groups are distinguishable from generic SIGSEGV. -- [ ] `test/run/` smoke suite for `cfree run` (see Driver above). -- [ ] `test/smoke/dbg_hello` — scripted REPL diff against a JIT'd - source. `doc/DBG.md` §tests. +# JIT + +cfree's in-process JIT maps a fully linked program into the running +process's address space and hands back callable function pointers. There +is no separate "JIT compiler": the same linker that writes ELF/Mach-O/PE +files produces a resolved `LinkImage`, and the JIT mapper copies that +image into executable memory, applies relocations against the live +runtime addresses, and exposes a symbol/inspector surface. The mapper is +`cfree_jit_from_image` in `src/link/link_jit.c` — *not* `src/jit/`, which +holds only the Mach-O TLV thunk. The `cfree run` driver +(`driver/cmd/run.c`) is the headline consumer; the JIT debugger (see +[DBG.md](DBG.md)) and the emulator's block translator (see [EMU.md](EMU.md)) +ride on the same mapping primitives. + +## Where the JIT sits + +``` + inputs (.c/.o/.a/.wat) + | frontend + codegen (see FRONTENDS.md, CODEGEN.md) + v + ObjBuilders --link_add_obj--> Linker (jit_mode=1, jit_host set) + | resolve + layout (see LINK.md) + v + LinkImage (segments, relocs, resolved symbols; bytes NOT serialized) + | cfree_jit_from_image (src/link/link_jit.c) + v + CfreeJit (mapped exec memory + symbol inspector) + | cfree_jit_lookup / addr_to_sym / sym_iter / view + v + host code calls the JITed entry in-process +``` + +The frontend-to-image half is shared verbatim with the file linker. 
A +`CfreeLinkSession` opened with output kind `CFREE_LINK_OUTPUT_JIT` (see +`src/api/link.c`) sets two pieces of state on the `Linker`: `jit_mode`, +which tells layout to skip file serialization and synthesize JIT-only +stubs/GOT, and the *JIT host* (`CfreeJitHost`), the vtable through which +libcfree reaches the executable-memory allocator and the TLS runtime +without itself depending on any OS. `cfree_link_session_jit` then calls +`cfree_jit_from_image`, transferring ownership of the image (and the +linker that backs it) into the returned `CfreeJit`. + +Because the code runs in *this* process, the JIT only produces runnable +code when the target arch and object format match the host. The driver +defaults the target to the host, but `-target` overrides it without a +guard: libcfree lowers and lays out for whatever target the compiler was +created with (`cfree_jit_image_arch` simply reports `target.arch`), so a +cross-target `cfree run` emits native code the host CPU cannot execute +and fails at runtime rather than with a diagnostic. Enforcing +target==host is the caller's responsibility. The JIT always lowers PIC; +`-fPIC`/`-fPIE`/`-mcmodel` are accepted by the driver but have no +observable effect. + +## The single contiguous reservation + +The defining invariant of `cfree_jit_from_image` is that the *entire* +image lives in one `execmem->reserve` mapping. Layout assigns every +segment a page-aligned image vaddr inside a single span `[image_base, +image_end)`; the mapper reserves that whole span (plus append slack, see +below) in one call and then treats each segment as a sub-range at offset +`vaddr - image_base`. No segment gets its own independent `mmap`. + +This exists to keep inter-segment displacements in branch/addressing +range. AArch64 `ADRP` reaches ±4 GiB and `CALL26` reaches ±128 MiB; +RISC-V `AUIPC`+branch and x86-64 RIP-relative loads have their own +windows. 
If code, rodata, and data were three independent mappings, the +OS could scatter them gigabytes apart and a perfectly legal cross-segment +reference would overflow the relocation's range check. One reservation +makes every intra-image displacement a function of the layout, not of +where the kernel happened to place three separate regions. The same +property is what lets a weak-undef symbol resolve through a zero-valued +slot, and what lets the far-call stubs (below) sit close enough to their +call sites. + +`vaddr_to_runtime` / `vaddr_to_write` translate an image vaddr to the +two aliases of the master mapping (see W^X below). They scan the handful +of segments linearly, with a second pass that resolves a vaddr landing +exactly on a segment's one-past-end boundary (e.g. `__fini_array_end`). + +## Reloc-apply runs in-process on the shared path + +The JIT does not have its own relocation engine. It iterates the image's +`LinkRelocApply` records and calls the same `link_reloc_apply` used by +the file writers (see [LINK.md](LINK.md)). The only JIT-specific twist is +the address arithmetic: the *patch bytes* are written through the write +alias, while the symbol value `S` and the patch-site address `P` are the +*runtime* alias addresses, because that is where the CPU will execute and +fetch from. A handful of relocation kinds get special in-process +handling before reaching `link_reloc_apply`: + +- TLS-LE / TPREL (AArch64, RISC-V, x86-64): both the target and the TLS + base are image-relative, so the runtime alias cancels out and the + offset is computed in image space. AArch64/RISC-V use a 16-byte TCB + bias (`JIT_TLS_TCB_SIZE`) matching `start.c` and the ELF writer; + x86-64 SysV variant II addresses a TLS symbol as `offset - tls_memsz`. +- RISC-V `PCREL_LO12_I/S`: the low-12 half of an `AUIPC` pair targets a + local anchor at the paired `HI20` site. 
The mapper finds that paired + reloc, recomputes the displacement against runtime addresses, and feeds + it to the `LO12` encoder so the two halves agree. +- Weak-undef (`SK_ABS`, bind weak, vaddr 0): address-of must yield NULL. + An AArch64 `ADRP`/`ADD` pair against such a target would compute a + displacement far outside ±4 GiB once the image is placed away from + address 0, tripping the range check, so the `ADRP` is rewritten to + `MOVZ Xd,#0` and the paired `ADD #0` left as-is. +- Mach-O `TLVP_LOAD_PAGEOFF12`: Apple's mandatory "LDR→ADD" TLV + relaxation. Every TLV descriptor is in-image, so the extra indirection + through a `__thread_ptrs` slot is unnecessary; the load is rewritten to + an add so the descriptor address lands directly in the register. + +After relocations are applied, IFUNC resolvers (ELF only) are run +in-process and their results stored into the iplt slots, `.init_array` +constructors are run in forward order, and `cfree_jit_run_dtors` runs +`.fini_array` in reverse on teardown. + +## Call stubs and GOT slots for host calls + +JITed code routinely needs to call into the host process — libc, or any +symbol resolvable via the link session's extern resolver. Those targets +resolve to real host addresses (`SK_ABS`), which can be arbitrarily far +from the JIT mapping. A direct `CALL26`/`JUMP26` to libc would overflow +the branch range, and a far data reference needs an indirection slot. + +Layout solves both with JIT-only passes (gated on `jit_mode`, skipped for +static-exe output) in `src/link/link_layout.c` and +`src/link/link_reloc_layout.c`: + +- `link_layout_jit_stubs` synthesizes, for each `SK_ABS` target hit by a + call relocation, a PLT-style stub plus an 8-byte pointer slot. The stub + loads the slot and branches indirectly (e.g. AArch64 `ADRP`/`LDR`/`BR`); + the slot is filled by a per-slot `R_ABS64` against a synthetic + resolver-pointer symbol that preserves the original host vaddr. 
+ `link_emit_relocations` then redirects the original `CALL26`/`JUMP26` + to the *stub*. The arch backend supplies the stub shape via + `needs_jit_call_stub` / `emit_iplt_stub` / `iplt_stub_size` — the same + machinery the ELF iplt uses (see [ARCH.md](ARCH.md)). +- `link_layout_got` materializes one GOT slot per GOT-referenced symbol + with an `R_ABS64`, and rewrites `ADR_GOT_PAGE`/`LD64_GOT_LO12` to point + at the slot. Weak-undef GOT slots simply hold 0. + +Both the stub section and the slot section are ordinary subsegments of +the single contiguous reservation, so the redirected call's branch stays +in range. The net effect: `cfree run hello.c` can call `printf` even +though `printf` lives in a libc loaded megabytes away. + +## Executable memory: the host vtable and W^X + +libcfree never calls `mmap`, `VirtualAlloc`, or `mach_vm_remap` itself. +All executable-memory operations go through `CfreeExecMem` in the JIT +host (`include/cfree/jit.h`): `reserve`, `protect`, `release`, +`flush_icache`, and a `page_size`. The driver supplies the concrete +adapter via `driver_env_to_jit_host` (`driver/env/`). The contract has +two distinct shapes: + +- **Dual-mapping** (Apple silicon via `mach_vm_remap` in + `driver/env/macos.c`; Linux via `memfd_create` in `linux.c`; FreeBSD + via `memfd`/`shm_open` in `freebsd.c`; Windows via `CreateFileMappingW` + in `windows.c`). `reserve` with `CFREE_PROT_EXEC` returns *two* virtual + addresses backing the same physical pages: a `write` alias (RW, never + X) and a `runtime` alias (X after `protect`, never W). The mapper + populates through the write alias and the CPU fetches from the runtime + alias, so no page is ever simultaneously writable and executable. A + process-wide registry (`exec_dual_*` in `driver/env/posix.c`) lets the + debugger recover the write alias for a given runtime address. +- **Single-mapping** (`execmem_reserve_single`, non-exec reservations or + hosts without a dual-map primitive). 
`write` and `runtime` are the same + address and `protect` flips RW↔RX via `mprotect`. + +The mapper requests `CFREE_PROT_EXEC` on the master reservation if any +segment is executable (triggering the dual-mapping path), populates and +relocates through the write alias, then `protect`s each segment's runtime +sub-range to its final perms. EXEC segments get `flush_icache` against +the runtime alias. On x86 this is a no-op: instruction fetches are +coherent with stores on the same core, and dispatch into freshly written +code always crosses a serializing return/call, so no explicit flush is +needed (the rationale is spelled out in `driver/env/icache_x86.c`). ARM +and RISC-V have separate I/D caches and do need an explicit flush +(`__builtin___clear_cache` / `sys_icache_invalidate`; see +`driver/env/icache_*.c`). The append-slack tail is protected `PROT_NONE` +until needed. + +An absent or incomplete `execmem` vtable is a hard error +(`compiler_panic`): the JIT cannot run without one. `page_size` is taken +from the adapter, falling back to `0x4000` if the adapter reports 0; the +POSIX adapter fills it from `sysconf(_SC_PAGESIZE)`. + +## Mach-O thread-local storage: the TLV thunk + +`src/jit/` contains exactly one thing: the Mach-O TLV thunk +(`tlv_thunk_aarch64.S`, with a trapping stub for non-aarch64 hosts in +`tlv_thunk_stub.c`; contract in `tlv_thunk.h`). + +On Mach-O, codegen emits Apple's TLV access sequence: load the variable's +24-byte descriptor, load `descriptor[+0]` as a thunk pointer, and `blr` +it with the descriptor in `x0`, expecting the per-thread variable address +back in `x0`. In an AOT image dyld rewrites that slot to a libdyld thunk +and allocates a pthread key per descriptor. 
A JIT image is never seen by +dyld, so the mapper does dyld's job itself in `jit_patch_tlv_descriptors`: +it finds every in-image descriptor (reloc-driven, keyed on the +`__tlv_bootstrap` undef each TU emits), allocates one per-image TLS +context through the host's `CfreeJitTls` vtable, and overwrites the three +descriptor slots with `(&cfree_jit_tlv_thunk, ctx, per-thread-offset)`. + +The thunk's calling convention is custom — `x0` in/out, every other +GPR/SIMD register preserved — because it is invoked mid-expression in +JITed code that has no idea a call is happening. That is why it is +hand-written assembly: it saves and restores all caller-saved registers around +a normal C call to the context's `get_block` (the ctx's first field, by +contract), which lazily allocates and seeds the calling thread's TLS +block. The driver's `CfreeJitTls` implementation +(`driver/env/jit_tls_posix.c`) backs `get_block` with a pthread key whose +destructor frees the per-thread block on thread exit. `cfree_jit_tlv_resolve` +lets host/interpreter code resolve a thread-local without going through +the asm path, validating descriptor ownership first so a foreign (dyld) +descriptor is never blindly called. + +## Symbol and inspector surface + +`CfreeJit` exposes a read-only view of the mapped program (declared in +`include/cfree/jit.h`): + +- `cfree_jit_lookup` — name (C-mangled per target) to runtime address, + GLOBAL-bind defined symbols only. +- `cfree_jit_sym_iter_*` — walk every defined, user-visible symbol + (functions, objects, common, TLS, ifunc, abs; mapping/section/file + symbols filtered out), yielding demangled names and runtime addresses. +- `cfree_jit_addr_to_sym` — runtime PC to nearest containing symbol + + offset, for backtraces and disassembly annotation. 
+- `cfree_jit_runtime_to_image` / `cfree_jit_image_to_runtime` / + `cfree_jit_image_contains` — translate and bounds-check between the + runtime alias and the image vaddr space DWARF was emitted in; the + debugger crosses this boundary at every stop. +- `cfree_jit_view` — a lazily built, in-memory `CfreeObjFile` that + concatenates the debug sections of every input and resolves their + cross-section relocations, so a DWARF consumer sees one coherent object + even for a multi-input JIT. `SK_SECTION` relocs are resolved against a + per-input prefix-size snapshot so merged CUs land their offsets + correctly; code/data relocs resolve to final image vaddrs. See + [DWARF.md](DWARF.md) and [DBG.md](DBG.md). + +## Incremental append + +A `CfreeJit` reserves append slack (RX/R/RW/TLS buckets, protected +`PROT_NONE` initially) past the image end so additional objects can be +linked into the *live* mapping without a full relink. `cfree_jit_publish` +with `CFREE_JIT_PUBLISH_APPEND_OBJECTS` runs `jit_append_obj_inner`: it +preflights for duplicate strong definitions and unresolved references, +carves new segments out of the slack, appends symbols/relocations to the +image, applies the new relocations in place, and flips the new pages to +their final perms — bumping a generation counter. This is what lets the +`dbg` REPL compile and run snippets against an already-mapped program +(see [DBG.md](DBG.md)). + +## The `cfree run` driver + +`driver/cmd/run.c` is the user-facing front end. It classifies inputs by +suffix (`.c`/`-` source, `.wat`/`.wasm` modules, `.o`, `.a`), compiles +sources through a caller-owned compiler, JIT-links everything, looks up +the entry (default `main`, overridable with `-e`), and calls it as +`int(*)(int, char**)`. Notable design points: + +- **Lifetime.** The compiler backs `jit->c`, which `cfree_jit_lookup` + dereferences, so `driver_run` keeps the compiler alive across lookup + and the entry call and frees it only after `cfree_jit_free`. 
+- **Host-symbol fallback.** Unresolved externs route through + `driver_dlsym_resolver` (`driver/env/`), which retries with a leading + `_` stripped so Mach-O-mangled C names resolve through + `dlsym(RTLD_DEFAULT)`. This is how JITed code reaches host libc. +- **Synthetic argv[0].** A JITed `main` expects a program name in + `argv[0]`, but there is no executable path. The driver fills `argv[0]` + with the first input's display name; user args after `--` start at + `argv[1]`. Without `--`, the program sees `argc==1`. +- **`--no-jit`.** Routes entry *execution* through the IR interpreter + (see [INTERPRETER.md](INTERPRETER.md)) instead of native code. The + native JIT image is still built — it lays out data globals and resolves + externs/function pointers — but only the entry's code runs interpreted. + Symbol resolution for the interpreter walks the JIT image's full symbol + table (locals included) before falling back to host dlsym. +- **Wasm.** `.wat`/`.wasm` modules get a linear-memory instance wired up + and run via a two-call `__cfree_wasm_init` + entry sequence (see + [WASM.md](WASM.md)), on either the JIT or interpreter path. +- Optional `--metrics`/`--time`/`--bench-time` surface the scoped + compile/link/JIT timings libcfree emits through `CfreeMetrics`. + +## Cross-references + +- [LINK.md](LINK.md) — shared resolve/layout/reloc machinery and the + `LinkImage`/`LinkSession` model the JIT consumes. +- [DBG.md](DBG.md) — the JIT debugger session, breakpoints, and PC↔source + mapping built on the inspector surface. +- [EMU.md](EMU.md) — per-basic-block JIT translation on a growing image, a + separate scheme that reuses the mapping primitives. +- [DWARF.md](DWARF.md) — the debug-info producer/consumer behind + `cfree_jit_view`. +- [DRIVER.md](DRIVER.md) — the multitool and host-env adapter layering. +- [INTERPRETER.md](INTERPRETER.md) — the `--no-jit` execution path that + shares the JIT image for data layout and symbol resolution. 
+- [WASM.md](WASM.md) — the linear-memory instancing and two-call entry + sequence the driver wires up for `.wat`/`.wasm` inputs. diff --git a/doc/LINK.md b/doc/LINK.md @@ -0,0 +1,367 @@ +# LINK.md — the cfree linker + +The linker turns a set of relocatable inputs (objects, archives, shared +objects, raw byte buffers) into a single resolved image: a static ET_EXEC, +a position-independent ET_DYN, a partial ET_REL, or an in-process JIT +mapping. It is a multi-format, multi-arch component built as a strict +pipeline of passes over an immutable input set. This document describes +that architecture — the layering, the data flow, and the invariants that +hold the whole thing together. For how a resolved image becomes runnable +in process see [JIT.md](JIT.md); for the object-file read/write substrate +underneath it see [OBJ.md](OBJ.md); for per-target relocation kinds and +register/ABI detail see [ARCH.md](ARCH.md); for debug-section retention +see [DWARF.md](DWARF.md). + +## Where the linker sits + +The public surface is `CfreeLinkSession` (include/cfree/link.h); the +driver tools `ld`, `cc`, `run`, and `dbg` drive it. Inside libcfree the +real work is the internal `Linker` / `LinkImage` pair declared in +src/link/link.h and src/link/link_internal.h. The session is a thin +wrapper: it owns a `Linker`, accumulates inputs, and on resolve produces +a `LinkImage`. Path handling (reading bytes off disk, `-l` search paths, +sysroots) lives entirely in the driver — the library boundary is +byte-buffer-shaped. Every bytes input is read through +`Compiler.env->file_io` by the driver before it reaches the linker. + +The two central abstractions: + +- **`Linker`** — the mutable accumulator. Holds the registered inputs + (objects, archives, DSOs), the entry name, link-mode flags (PIE, + static-exe, JIT, gc-sections, strip-debug), an optional linker script, + and an optional external resolver. It is built up by the `link_add_*` + / `link_set_*` calls and then *read* by resolution. 
It is never + rewritten by resolution. +- **`LinkImage`** — the resolved output. A fresh image is produced by + every `link_resolve`. It owns the symbol table, the section and + segment tables, the per-segment byte buffers, the durable relocation + records, and any synthesized dynamic-link / IFUNC / debug state. It is + the read-side view consumed by the format emitters, the JIT mapper, + and the debug-info view. + +## The load-bearing invariants + +Three rules are stated in the link.h header comment and enforced +throughout. They exist so that incremental re-resolution can be added +without reworking the core; they also make the single-shot path easier +to reason about. + +1. **Inputs are never mutated; resolve is a function from inputs to a + fresh image.** `link_resolve` reads the `Linker` and allocates a new + `LinkImage`; it does not edit input `ObjBuilder`s or rewrite the + `Linker` in place. Re-resolving the same `Linker` would yield another + independent image. +2. **`LinkInputId` / `ObjBuilder*` mappings are stable for the Linker's + lifetime.** Adding an input never invalidates an existing handle. + `ObjSymId` / `ObjSecId` are per-`ObjBuilder` id spaces, so each input + carries an `InputMap` (link_internal.h) translating its local ids + into the global `LinkSymId` / `LinkSectionId` space. +3. **Relocation records stay as data; they are never burned destructively + into segment bytes during resolve.** `link_emit_relocations` produces + `LinkRelocApply` records — `(write site, kind, target symbol, addend)` + — and stores them on the image. The actual patching of bytes happens + later, at emit time (format emitter) or map time (JIT mapper). The + segment byte buffers produced during layout hold raw, *unrelocated* + input bytes. + +A fourth invariant governs addresses: + +4. **Image-relative vaddr discipline.** Every vaddr and file_offset on a + resolved `LinkImage` is computed as if the image were based at 0. 
+ Layout, symbol vaddrs, GOT/PLT placement, and reloc write-sites are + all in this coordinate system. Consumers add their own runtime base + exactly once: the ELF emitter bumps everything by `img_base` + (`shift_image_addresses` in src/obj/elf/link.c — 0x400000 for static + ET_EXEC, 0 for PIE/DSO so the loader picks the base), and the JIT + mapper bumps by the chosen reservation address. Because relocations + are re-derived from (post-shift) placements, an image can be shifted + wholesale by adding one delta to every coordinate. + +## The pass pipeline + +`link_resolve` (src/link/link_layout.c) orchestrates the whole pipeline. +The phases, with the file that owns each: + +``` + inputs (Linker) + | + | link_synth_coff_ctor_dtor_list (PE/COFF CRT boundary synth) + | link_ingest_archives ── archive member selection + v + link_resolve_symbols ── build global symbol table [resolve.c] + link_gc_compute ── --gc-sections liveness (BFS) + | + v + link_layout_sections ── bucket + place sections [layout.c] + link_layout_commons ── COMMON -> .bss.common + link_emit_segment_bytes ── copy raw input bytes + link_layout_debug ── carry .debug_* (file-only) + | + v + link_assign_symbol_vaddrs ── symbol -> vaddr [reloc_layout.c] + link_emit_*_boundaries ── __init_array_start, __tdata_*, __start_X ... + link_resolve_undefs ── globals / DSO imports / resolver + link_gc_drop_dead_globals + link_layout_iplt ── STT_GNU_IFUNC trampolines + link_layout_jit_stubs ── AArch64 JIT call islands + link_layout_got ── static-PIC .got + link_emit_relocations ── LinkRelocApply records + fmt->layout_dyn ── PIE/DSO synthetic dyn sections [obj/elf/link_dyn.c] + link_resolve_entry ── entry symbol lookup + link_capture_debug_inputs ── retain ObjBuilders for JIT view + | + v + LinkImage ──> link_emit_image_writer (format emit) | cfree_jit_from_image +``` + +### Phase 1 — input registration and archive selection (link.c, link_resolve.c) + +`link_add_obj` borrows a caller-owned `ObjBuilder`. 
`link_add_obj_bytes` +detects the binary format, reads bytes into a linker-owned `ObjBuilder`, +and — via the format's `classify_obj_input` hook — reclassifies the input +as a DSO if the bytes are a shared object. `link_add_dso_bytes` parses an +ET_DYN explicitly, materializing only its exported (dynsym) symbols. +`link_add_archive_bytes` eagerly parses every member into an `ObjBuilder` +at registration time but defers the include/exclude decision to resolve. + +A DSO input contributes **nothing** to layout. Its presence only +influences resolution (an undef matched by name against its exports +becomes an *imported* symbol) and DT_NEEDED bookkeeping (its SONAME, or +filename fallback, is recorded as a runtime dependency). + +Archive member selection (`link_ingest_archives`) is the demand-driven +pull familiar from GNU ld. `--whole-archive` members are pulled +unconditionally first. The rest are scanned in input order: for each +archive a presence scan (`scan_presence_before`) computes the set of +defined and still-wanted undefined globals from all inputs that come +before that archive in link order; any member that *defines* a wanted, +not-yet-defined global is pulled, and the scan repeats until a fixed +point so a freshly pulled member can drag in its own dependencies. +Spurious header-artifact undefs (unreferenced extern prototypes) are +excluded from the want set so an unused declaration never pulls a member. +Archives in the same nonzero `group_id` form a `--start-group` cycle. +Pulled members move into `Linker.inputs` and get stable ids like any +other input. PE/COFF has two special cases handled here: short-import +shim members route through the DSO path (their symbols are DLL exports), +and a synthetic `ObjBuilder` supplies the mingw CRT ctor/dtor boundary +symbols and an AArch64 `__chkstk`. 
+ +### Phase 1 — symbol resolution (link_resolve.c) + +`link_resolve_symbols` walks every (non-DSO) input's symbols, allocating +its `InputMap` and appending a `LinkSymbol` per local symbol while +building `img->globals` — an open-addressed name→`LinkSymId` hash for +global/weak definitions. Locals never enter that hash. When two inputs +define the same global, a binding-strength policy decides the winner: +GLOBAL beats WEAK beats LOCAL; two COMMON symbols merge to the larger +size with the stricter alignment; a real definition overrides a COMMON; +two strong definitions are an error — except COFF/PE SELECTANY, where two +COMDAT (`SF_GROUP`) definitions keep the earlier and mark the later +section discarded (recorded in `InputMap.comdat_discarded`, honored by GC +and layout). + +`link_resolve_undefs` runs *after* layout has assigned vaddrs (it needs +them) and settles every still-undefined symbol: against a defined global +of the same name; else against a DSO export (becomes imported); else +against the external resolver (becomes an absolute address — this is the +JIT/host-symbol path); else a COFF mingw alias-by-naming-convention +fallback; else, for a weak undef, resolves to absolute zero; else it is a +hard "undefined reference" error. A JIT-mode escape hatch tolerates +Mach-O `__tlv_bootstrap`. + +The atom model underlies GC and layout: an `ObjBuilder` section may be +subdivided into *atoms* (one function / one data object), and the +`InputMap` records, per section, which atoms are live and which +`LinkSection` each atom/section maps to. This lets `--gc-sections` +operate at function granularity. + +### Phase 1b — garbage collection (link_resolve.c) + +With `--gc-sections` off, `link_gc_compute` simply marks every kept +allocatable section (or its atoms) live. 
With it on, GC is a BFS: roots +are the entry symbol, retained (`SF_RETAIN`) and init/fini-array +sections, and script-`KEEP` sections; the worklist follows relocations +from live sections/atoms to the symbols they reference, marking each +target's defining section/atom. `__start_X` / `__stop_X` references +promote every section named `X`. After layout, +`link_gc_drop_dead_globals` clears `defined` on symbols whose section was collected. + +### Phase 2 — section and segment layout (link_layout.c) + +`link_layout_sections` (the default, non-scripted path) partitions kept +sections into four permission buckets — `SEG_RX`, `SEG_R`, `SEG_RW`, +`SEG_TLS` — and lays them out grouped by name within each bucket, in +first-occurrence order. Same-name contributions are placed adjacently so +the format emitter can merge them into one output section. NOBITS +(`.bss`, `.tbss`) sections are tracked as trailing zero-fill so a +segment's `mem_size` exceeds its `file_size`. One `LinkSegment` is +materialized per non-empty bucket; segments are assigned image-relative, +page-aligned vaddrs back-to-back from 0, and every section's vaddr / +file_offset is fixed up into its segment. A PIE quirk lives here: +read-only data carrying an absolute reloc is promoted from `SEG_R` to +`SEG_RW`, because the dynamic loader must write the relocated pointer +into the slot and a never-writable segment would fault. + +`link_layout_commons` allocates all surviving COMMON symbols into a +synthetic `.bss.common` section appended to the writable segment, +assigning each its offset and rewriting it to a normal `SK_OBJ` +definition. `link_emit_segment_bytes` then copies each section's *raw* +input bytes into its segment buffer (skipping NOBITS) — no relocations +are applied, per invariant 3. On the JIT lane this byte copy is skipped: +the mapper copies input bytes straight into execmem. 
+ +`link_layout_debug` carries `.debug_*` sections through to AOT ELF/Mach-O +output as **file-only** `LinkSection`s: they live in `img->sections` (so +their `SK_SECTION` symbols resolve and the reloc engine applies to them) +but carry `segment_id == LINK_SEG_NONE` and their own byte buffers in the +image's debug registry, getting no PT_LOAD. Same-name contributions are +assigned a per-name cumulative base (a DWARF-section-relative offset) so +the emitter merges them into one output section with correct +cross-section offsets. The JIT lane serves debug differently (via +`cfree_jit_view`, see [DWARF.md](DWARF.md)), so it skips this pass; strip +mode drops it. See [DWARF.md](DWARF.md) for the producer/reader side. + +### Scripted layout (link_layout.c, link_script.c) + +When a linker script is set, `link_layout_sections_scripted` replaces the +bucket path: it walks the script's output sections in declaration order, +placing matched input sections at a "dot" location counter, materializing +one segment per non-DISCARD output section and turning script symbol +assignments into defined global symbols. `/DISCARD/` matches leave the +input section's `InputMap` slot as `LINK_SEC_NONE`, which downstream +passes already treat as "dropped". A scripted image is flagged so the +emitter keeps script-assigned absolute vaddrs and omits the +self-describing header PT_LOAD / build-id note. + +The script itself is parsed by `cfree_link_script_parse` (link_script.c), +a hand-written recursive-descent parser for a deliberately small GNU-ld +subset: `ENTRY(sym)`, top-level and in-section symbol assignments with a +small arithmetic-expression grammar, `. = expr` dot moves and alignment, +`SECTIONS { output : { input-matchers } }`, and `/DISCARD/`. Unsupported +directives (`MEMORY`, `PHDRS`, `PROVIDE`, `OVERLAY`, `OUTPUT_FORMAT`, +`GROUP`, ...) are rejected with a diagnostic rather than silently +ignored. 
The linker accepts only the structured `CfreeLinkScript` form — +there is no text setter on the `Linker`; hosts that have GNU-ld text run +the parser first. Input matchers use a `*`-only glob. + +### Phase 3 — post-placement vaddr / boundary / GOT / PLT / IPLT (link_reloc_layout.c) + +With sections placed, `link_assign_symbol_vaddrs` binds every defined +symbol to `section.vaddr + (symbol.value - section.obj_offset)`. Then a +family of boundary passes synthesize the linker-defined globals that C +runtimes expect: `__init_array_start/end`, `__fini_array_start/end`, +preinit equivalents; the TLS boundaries `__tdata_start/end` and +`__tbss_size`; the encoding-section `__start_X` / `__stop_X` pairs; and +target/format globals such as `_GLOBAL_OFFSET_TABLE_`, `_DYNAMIC`, +`__dso_handle`, the RISC-V global pointer, and PE `__ImageBase`. + +Three synthesis passes append new segments/sections to the image after +the user payload (each using the `link_iplt_alloc_*` growth helpers, +which keep image-owned tables resizable): + +- **`link_layout_iplt`** — for every defined `STT_GNU_IFUNC` symbol, + builds a per-arch resolver trampoline plus its `.igot.plt` slot and + records the `(resolver_vaddr, slot_vaddr)` pairs on the image. On the + static-exe path it also wires a `.init_array` entry calling + `__cfree_ifunc_init` so slots are filled at startup; the JIT path + resolves them in process instead. +- **`link_layout_jit_stubs`** — AArch64-only, JIT-lane: synthesizes + call/jump islands for `CALL26` / `JUMP26` relocs whose targets may sit + outside ±128 MB of the call site once mapped, and returns a per-symbol + stub map. +- **`link_layout_got`** — a static-PIC `.got`: scans relocations for + GOT-relative kinds (`reloc_uses_got`), allocates one 8-byte slot per + referenced symbol in a single exactly-sized `.got` segment placed after + everything, defines a local symbol per slot, and emits an `R_ABS64` + record to fill each slot. Returns a per-symbol GOT map. 
+ +### Phase 4 — relocation emission (link_reloc_layout.c) and dynamic synthesis + +`link_emit_relocations` walks every input relocation, skips ones whose +source section was dropped, redirects GOT-using relocs to their GOT slot +(via the GOT map) and AArch64 JIT-call relocs to their stub (via the stub +map), and emits a `LinkRelocApply` record with the write site in +image-relative coordinates, the resolved target `LinkSymId`, the kind, +and the addend. These records are the durable, non-destructive output of +resolve (invariant 3); nothing is patched into bytes yet. + +For PIE/DSO output the format's `layout_dyn` hook (src/obj/elf/link_dyn.c) +then synthesizes the dynamic sections — `.interp`, `.dynsym`, `.dynstr`, +`.gnu.hash`, `.plt`, `.got.plt`, `.rela.plt`, `.rela.dyn`, `.dynamic` — +recording one JUMP_SLOT per imported function and a PLT entry per import. +Its layout invariants (dynsym slot 0 reserved, imports ordered +PLT-functions-then-GOT-data, the three reserved `.got.plt` slots) live in +`LinkDynState` (link_internal.h). The `.rela.dyn` RELATIVE tail is filled +during emit, when internal absolute relocs are seen. + +`link_resolve_entry` looks up the entry symbol (the per-format default +from `obj_format_default_entry_name`: `_start` for ELF, `_main` for +Mach-O) and stamps it on the image. + +### Emit / consume + +`link_emit_image_writer` dispatches by target object format to the ELF / +Mach-O / COFF `link_emit` function. That emitter is where invariant 4's +shift happens (`shift_image_addresses`) and where the `LinkRelocApply` +records are finally applied into the output bytes (`apply_all_relocs`), +with imported targets routed through PLT/GLOB_DAT and internal absolutes +turned into RELATIVE records under PIE. Image identity +(`link_image_id_compute`) is a format-agnostic 16-byte hash over +post-shift segment bytes and vaddrs, wrapped per-format (ELF build-id +note, Mach-O LC_UUID, PE debug directory). See [OBJ.md](OBJ.md) for the +format writers. 
Alternatively `cfree_jit_from_image` maps the image into +executable memory — that is [JIT.md](JIT.md)'s territory. + +## Partial / relocatable linking (link_relocatable.c) + +`ld -r` is a deliberately separate path: `link_emit_relocatable_writer` +builds a *fresh `ObjBuilder`* rather than a `LinkImage`. A relocatable +output must preserve object-file structure — keep non-alloc sections, +leave unresolved externals as relocatable references, assign no final +vaddrs, synthesize no GOT/PLT/IFUNC/entry state. So it merges input +sections into compatible output sections, merges globals (with the same +binding-strength policy, including COMMON merging), copies symbols, +COMDAT groups, and relocations with their symbol/section references +rewritten into the output id space, then emits through the object-format +writer. Archive ingest still runs (so `-r` over an archive pulls members), +but linker scripts are rejected on this path. + +## Incremental linking + +Incremental relink avoids paying `O(whole program)` for a one-line edit. +The four invariants exist precisely to keep this addable without +reworking the core. There are two tiers, and they are at different +levels of realization. + +### Append-only, in-process (JIT) — the realized mechanism + +This is the one incremental path that exists today. A live JIT image +grows by appending new objects without ever moving a previously +published address. It lives on the JIT side (`cfree_jit_append_obj`, +append cursors and reserved per-bucket slack in src/link/link_jit.c) and +serves `cfree dbg`. Its hard invariant is that any observable runtime +address — a lookup result, a breakpoint, a return address, a DWARF PC +range — never changes: new code may reference old code, old code is +never repatched, and an append that would exhaust a bucket's reserved +slack fails rather than relocating. See [JIT.md](JIT.md). 
+ +### Forward-compat surface for file-based patch (AOT) — designed only + +Two internal entry points — `link_resolve_at` (base-pinned resolve) and +`link_resolve_extend` (append new inputs to an existing image) — are +declared and reserved but not yet implemented; they are panic stubs. They +exist so the invariants above have a concrete shape to satisfy, not as a +working feature. The *intended* design they anchor: patch a prior on-disk +image instead of relinking from scratch — diff a changed input's atoms by +content hash against a persisted placement table, overwrite unchanged-size +atoms in per-atom slack, relocate grown atoms via a move primitive (a jump +island, later a GOT cell), and re-derive only the touched relocations from +current placements. The design is gated by a *soundness check*: apply +incrementally only when an edit provably cannot change symbol resolution +(no added/removed/rebound global, no new archive pull-in, no +COMDAT-ownership flip, no TLS/import size change, no slack exhaustion); +otherwise fall back to a full — but in-memory, so cheap — relink, because +a correct-but-slow result always beats a fast-but-wrong one. The substrate +that design leans on — the durable `LinkRelocApply` records, the stable +input-id mapping, and atom granularity — is the same substrate the +realized JIT path already uses. diff --git a/doc/LINK_DEBUG.md b/doc/LINK_DEBUG.md @@ -1,187 +0,0 @@ -# Debug-info retention in the linker (IMAGE_INSPECT phase 5) - -## Context - -`addr2line -e <cfree-linked-exe>` opens cfree-linked images fine (phase 4) but -finds no DWARF: the linker drops every `.debug_*` section because -`link_section_kept` keeps only `SF_ALLOC` progbits/nobits/array sections -(`src/link/link_layout.c:49`). That predicate gates three things — layout, -section-header planning, and relocation emission — so the debug *bytes* and the -*relocation records* inside them are both discarded. 
- -Goal: carry `.debug_*` through to the linked image and fully resolve their -relocations in place, so `addr2line` resolves `file:line` on a cfree-linked -executable. The DWARF reader applies no relocations (`dwarf_open.c:453`, -`dwarf_die.c:35` read `DW_FORM_addr` verbatim), so the linker must write final -values into the placeholders. - -Scope (confirmed): **multi-input** concatenation supported and tested, across -**all linked image types** — static ET_EXEC, PIE, and DSO (ET_DYN). The -mechanism is image-type-agnostic: debug sections are **not loaded** (no -PT_LOAD), so they carry link-time vaddrs and need **no dynamic relocations** — -which is exactly what `addr2line -e` reads statically. For PIE/DSO `img_base` -is 0, so the same `img_base + vaddr` formula yields link-time vaddrs that match -the symbol table. Default keeps debug when present; existing `-S`/`--strip-debug` -(already plumbed to `l->strip_debug`, `driver/cmd/ld.c:112,839`; -`src/link/link.c:365`) drops it. - -## Design - -Debug sections are non-allocatable, so they get **no PT_LOAD segment**. But -they must participate in symbol resolution and the relocation engine. The -cleanest fit (and what the doc's "what has to be built" describes) is a new -**file-only LinkSection** class that lives in `img->sections` (so section -symbols resolve and the existing reloc engine applies) but carries its own byte -buffer and `segment_id == LINK_SEG_NONE`. - -The relocation math is already reusable: symbol resolution does -`s->vaddr = ls->vaddr + (s->value - ls->obj_offset)` (`link_reloc_layout.c:66`). -Set a debug output section's `LinkSection.vaddr` to its **DWARF-section-relative -base** (cumulative contribution offset, 0 for the first input). 
Then: -- a `SK_SECTION` symbol pointing at a debug section resolves to that base, so - `R_ABS32 = base + addend` yields the right DWARF sec-offset; -- function symbols already resolve to real code vaddrs, so `R_ABS64` low_pc / - set_address values come out correct for free. - -### Multi-input concatenation - -Each input's `.debug_X` becomes its own file-only `LinkSection`, laid out at a -**contiguous, name-grouped** file region with `vaddr` = the running per-name -cumulative size (0, size0, size0+size1, …). Because contributions of the same -name are placed adjacently, the existing OutShdr planner -(`src/obj/elf/link.c:896`, which merges adjacent same-`(segment_id,name)` runs) -collapses them into **one** output `.debug_X` section spanning the whole run. -Each input's `SK_SECTION` symbol carries its own per-input base via its own -LinkSection's `vaddr`, so cross-section `R_ABS32` offsets land in the merged -section correctly with no per-input bookkeeping beyond the vaddr. This reuses -the same prefix-snapshot idea already proven in the JIT debug view -(`src/link/link_jit.c:1487-1707`). - -## Changes - -### 1. Internal model (`src/link/link_internal.h`, `link.h`) -- `LinkSection`: add a `u8 file_only` flag (reuse the existing `pad`/`u16 pad` - field) marking a non-segment, file-resident section. -- `LinkImage`: add a small file-only-section registry so apply/emit can find a - debug section's byte buffer by id — `LinkSectionId dbg_first_lsid; u32 - dbg_count; u8** dbg_bytes; u64* dbg_size;` plus a helper - `u8* link_fileonly_bytes(LinkImage*, LinkSectionId)` (returns NULL for - non-file-only ids). Debug LinkSections are appended as a contiguous id range, - so lookup is `dbg_bytes[lsid - dbg_first_lsid]`. - -### 2. Retention predicate (`src/link/link_layout.c`) -- Keep `link_section_kept` (alloc) unchanged. -- Add `int link_section_kept_fileonly(const Section* s)` → true for - `s->kind == SEC_DEBUG` (and not `removed`). - -### 3. 
New layout pass `link_layout_debug` (`src/link/link_layout.c`) -Run after segment placement, before reloc emission (in the `link_resolve` -orchestration, `src/link/link_layout.c:1040+`; gate on `!l->strip_debug`): -1. Walk inputs; for each `SEC_DEBUG` section not discarded, record a - contribution `{input, obj_sec_id, name, size, align, bytes}`. -2. Group contributions by name; assign per-name cumulative `vaddr` (the - DWARF-relative base) and a per-output-section running size. -3. Append one `LinkSection` per contribution to `img->sections`: - `file_only=1`, `segment_id=LINK_SEG_NONE`, `vaddr`=per-name cumulative base, - `obj_offset=0`, `size`, `align`, `name`, `sem=SSEM_PROGBITS`, - input/obj ids set. Record the contiguous id range in the registry. -4. Allocate a per-output-name byte buffer (`dbg_bytes`) and copy each - contribution's bytes at its cumulative offset (concatenation). File offsets - are *not* assigned here — that's ELF-specific (step 6). -5. Populate the InputMap via `map_placed_unit(m, sid, OBJ_ATOM_NONE, lsid)` so - `link_input_symbol_section` / `link_input_reloc_section` - (`link_internal.h:87`) resolve debug section symbols and relocs to the new - LinkSections (today they return `LINK_SEC_NONE`). - -Result: the existing symbol-section assignment loop -(`link_reloc_layout.c:44-54`) stamps `section_id` onto the local `SK_SECTION` -debug symbols, and the vaddr resolver at `:66` gives them their sec-relative -base — no new symbol code needed. (Section symbols stay out of `.symtab`: the -`name==0` skip at `elf/link.c:991` already excludes them; they only need to -exist as reloc targets.) - -### 4. 
Relocation emission gate (`src/link/link_reloc_layout.c:1221`) -Widen the gate to also admit file-only debug sections: -`if (!s || (!link_section_kept(s) && !link_section_kept_fileonly(s))) continue;` -The existing record-building (`:1244-1262`) then produces `LinkRelocApply`s -with `link_section_id` = the debug LinkSection and `offset = r->offset` (since -`obj_offset==0`); `write_vaddr/file_offset` are computed but unused for -file-only apply (see step 5). - -### 5. Relocation apply (`src/obj/elf/link.c:306` `apply_all_relocs`) -Add a file-only branch keyed on `sec->segment_id == LINK_SEG_NONE`, placed -**before** the `tgt->imported` / GLOB_DAT dynamic-reloc handling so debug -sections never get dynamic relocs (they aren't loaded): -- `P_bytes = link_fileonly_bytes(img, r->link_section_id) + r->offset` - (instead of `segment_bytes[...]`). -- `S`: if `tgt->kind == SK_SECTION` → `S = tgt->vaddr` (DWARF sec-relative - base, **no `img_base`**), matching `R_ABS32` cross-section offsets; else - `S = img_base + tgt->vaddr` (link-time code vaddr) for `R_ABS64` low_pc / - set_address. `img_base` is the fixed base for ET_EXEC and **0** for PIE/DSO, - so debug bytes hold link-time vaddrs in every image type — matching the - symtab and what `addr2line -e` expects (the loader never relocates these - unloaded bytes). Then `link_reloc_apply(c, r->kind, P_bytes, S, addend, 0)`. - -This is the only place the function-vs-section reloc-class distinction lives, -and it mirrors the JIT view's logic (`link_jit.c:1588-1615`). - -### 6. ELF output (`src/obj/elf/link.c`) -- **OutShdr planning (`:854-920`):** debug LinkSections already flow through the - `img->sections` walk and merge by name. Add `int is_fileonly` to `OutShdr` - (set from `ls->file_only`). Adjust the sort comparator so file-only sections - sort **after** all real segments (treat `segment_id==0` as +inf) — keeps - debug shdrs at the end, matching `objdump` convention and leaving alloc shdr - indices stable. 
-- **Trailing file offsets (`:1032-1050`):** after `end_of_segs`, assign debug - output sections a `file_offset` (ALIGN_UP) in the trailing non-alloc region, - *before* `.symtab`; bump `symtab_off` past them. Stash each merged OutShdr's - `file_offset` and propagate to its LinkSections (for the byte-write step). -- **Shdr emit (`:1325-1372`):** for `is_fileonly` OutShdrs write - `sh_type=SHT_PROGBITS`, `sh_addr=0`, `sh_flags=0`, real `sh_offset`/`sh_size`. -- **Byte write (`:1261-1278`):** before `.symtab`, write each debug output - section's buffer at its assigned `file_offset` (pad as needed), iterating the - registry in shdr order. -- Bump the shdr count: debug shdrs are part of `noutshdr`, so `nshdr = - 1 + noutshdr + 4` already accounts for them; verify `shndx_*` arithmetic - still places build-id/symtab/strtab/shstrtab last. - -## Files touched -- `src/link/link_internal.h`, `src/link/link.h` — `file_only` flag, registry, - helper decl. -- `src/link/link_layout.c` — `link_section_kept_fileonly`, `link_layout_debug`, - registry alloc, orchestration call. -- `src/link/link_reloc_layout.c` — widen reloc gate (`:1221`). -- `src/obj/elf/link.c` — `OutShdr.is_fileonly`, sort tweak, apply branch, - trailing-offset + shdr-emit + byte-write for debug sections. - -No change to the DWARF producer/reader; relocation classes (`R_ABS32`/`R_ABS64` -in `src/obj/reloc_apply.c:31-51`) are applied as-is. - -## Verification - -Red-green, aarch64-linux ELF (DWARF path that addr2line already exercises): - -1. **Round-trip golden — single input.** New objdump case - `test/objdump/aarch64/cases/05-addr2line-linked` (mirroring existing cases): - `cfree cc -g -target aarch64-linux ... -o prog.elf` (static exec), then - `cfree nm prog.elf` to get a function vaddr, then - `cfree addr2line -e prog.elf <vaddr>` asserting `file:line`. Compare against - the `.expected` golden. Cross-check with host `addr2line` on the same bytes - where available. -2. 
**Multi-input.** Same flow linking two `-g` `.o`s into one static exec; - assert addr2line resolves a function from *each* input (exercises per-input - `SK_SECTION` rebasing and OutShdr concatenation). Inspect with - `cfree objdump -h prog.elf` to confirm one merged `.debug_info` etc. -3. **PIE.** Same single-input round-trip linked `-pie` (ET_DYN): assert - addr2line resolves `file:line`, confirming the `img_base==0` path holds - link-time vaddrs and no spurious dynamic relocs were emitted for `.debug_*` - (`cfree objdump -R prog.elf` shows none against debug offsets). -4. **Strip path.** `cfree ld -S` (or `cc ... -Wl,-S`) drops debug — assert no - `.debug_*` shdrs and addr2line reports no debug info. -5. **No regressions.** `make test-link test-elf test-debug test-dwarf - test-driver` and the objdump goldens (`CFREE=build/cfree sh - test/objdump/run.sh`). Watch smoke tests that link `-g`. - -Risk (per doc): a missed reloc class silently produces wrong line numbers -because the reader trusts the bytes — the addr2line round-trip cross-checked -against host `addr2line` is the guard. diff --git a/doc/NATIVE_ARCH_COMPLETENESS.md b/doc/NATIVE_ARCH_COMPLETENESS.md @@ -1,144 +0,0 @@ -# Native-arch completeness: asm / disasm / link-reloc / dwarf - -Goal: aa64, x64, rv64 each have complete **asm**, **disasm**, **link/reloc**, -and **dwarf** support across the OS support matrix: - -| arch | OSes | object formats | -|------|----------------------------|--------------------| -| aa64 | Linux, Windows, Mac | ELF, COFF/PE, Mach-O | -| x64 | Linux, Windows | ELF, COFF/PE | -| rv64 | Linux | ELF | - -Status from the 2026-05-29 audit (baseline tests + a 40-agent static audit with -adversarial verification of every gap). Only the items below are *verified real* -and *in scope*; seven other candidate gaps were checked and rejected -(e.g. 
rv64 omitting the four Mach-O `LinkArchDesc` classification hooks is N/A — -no rv64 Mach-O; aa64 ELF TLS being Local-Exec-only is correct for the static -whole-module link model). - -## Tier 0 — correctness blockers (broken today, in scope) - -- [x] **aa64 Windows/COFF TLS codegen** — DONE (commit: aa64 Windows TLS LE - sequence; `test-coff` green). `aa_tls_addr_of_win` mirrors x64. -- [x] **rv64 TLS-IE reloc unhandled → hard link failure** — DONE (rv64 now emits - Local-Exec TPREL like aa64/x64; `test-rv64-tls-link` regression added). -- [x] **rv64 assembler emits no relocations** — DONE. Symbolic branches/jumps - (`beq a0,a1,label`, `j label`, `jal ra,func` → `R_RV_BRANCH`/`R_RV_JAL` via - `rv_reloc_target`; `rv64_branch_sym`); `call`/`tail` (→ `R_RV_CALL`), `la`/`lla` - (→ `R_RV_PCREL_HI20`+`R_RV_PCREL_LO12_I` via a `.LpcrelHi` anchor) and multi-word - `li` (`rv64_call_tail`/`rv64_la_lla`/`rv64_li_multi`); and `%hi/%lo/%pcrel_hi/ - %pcrel_lo/%got_pcrel_hi` modifier syntax (`rv64_reloc_modifiers`). All byte- and - reloc-identical to llvm-mc (verified via `llvm-objdump -r` on the emitted object). - -## Tier 1 — correctness bugs (wrong output, in scope) - -- [x] **x64 `.eh_frame` wrong DWARF reg for RBP** — DONE (`x64_dwarf_from_hw_gpr` - maps HW→DWARF before CFI emit; cfi_unit.c x64 case + reg-map asserts added; - llvm-dwarfdump confirms RBP, not RDI). - -## Tier 2 — codegen emits but disasm/asm can't handle (round-trip violations) - -- [x] **aa64 disasm: FP/SIMD data-processing family undecodable** — DONE - (FP-DP/CMP/CVT/INT-CVT rows + integer DP1/bitfield + register-offset `ldr/str - [xn, xm, lsl#s]` decode added to `src/arch/aa64/isa.c`; byte-match vs - llvm-objdump; `aa64_fp_bitfield_dp1` decode corpus case). -- [x] **x64 disasm: SSE `movd/movq` (66 0F 6E/7E), `xorps/xorpd` (0F 57)** — DONE - (table rows + `print_xmm_rr` 7E reversed-order handling; matches llvm-objdump; - decode corpus case added). 
-- [x] **aa64 asm: FP-scalar instrs** — DONE (see "Standalone `as` encode"). -- [x] **aa64 asm: byte/half loads/stores** — DONE incl. pre/post-index - `[Xn,#i]!` / `[Xn],#i` and register-offset `[Xn,Xm{,LSL#s}]` / `[Xn,Wm,SXTW]` - (`aa64_ldst_regoff`, `aa64_ldst_pre_post_index`; new `AA64LdStRegOff`/`WBack` - encoders in `isa.h`). -- [x] **aa64 asm: atomics / exclusive / bitfield / clz / rev** — DONE. bitfield - (`sbfm/ubfm/bfm`) + `clz/rbit/rev/rev16`; and exclusive (`ldxr/stxr/ldaxr/ - stlxr`), acquire/release (`ldar/stlr`), `cas{,a,l,al}`, and LSE (`swp/ldadd/ - ldclr/ldeor/ldset` + a/l/al + b/h) atomics (`aa64_exclusive_load_store`, - `aa64_load_acquire_store_release`, `aa64_compare_and_swap`, `aa64_lse_atomics`; - new `AA64LdStEx`/`Cas`/`LseAtomic` encoders). REMAINING (niche, deferred): - `CASP`, LSE min/max (`ldsmax/ldsmin/ldumax/ldumin`), `LDAPR`/`STLLR`, and - disasm rows for these encode-only forms (they render as `.inst`). -- [x] **x64 asm: memory operands only `disp(%base)`** — DONE. Full AT&T - `disp(%base,%index,scale)`, index-only, and (numeric + symbolic) `(%rip)` - (`x64_memop_sib_load`, `x64_memop_rip`; reuses `x64_pack_mem_sib`). -- [x] **x64 asm: ALU reg→mem / imm→mem store forms** — DONE (`/r` reg-to-rm and - group-1 `imm`-to-rm stores; `x64_memop_alu_store`, `x64_memop_mov_store`). -- [x] **x64 dwarf: no named params/locals** — DONE (`x64_frame_slot_debug_loc`). -- [x] **rv64 dwarf: no named params/locals** — DONE (`rv_frame_slot_debug_loc`); - both now byte-identical to aa64's DWARF for the same source. -- [ ] **x64 dwarf: step-out can't recover RA** — `cfree_dwarf_unwind_step` has no - memory provider, and x64 has no link-register fallback. Needs a mem-reading - unwind variant. (Compounded by the JIT debugger not populating eh_frame for - in-process images; deferred — debugging-UX robustness, has test-infra deps.) 
- -## Tier 3 — minor correctness / robustness - -- [x] rv64 `needs_jit_call_stub` — DONE (`rv64_is_branch_reloc`; reuses IPLT stub). -- [x] CFI offset rules pinned to function-end PC — DONE (sticky prologue-PC - override in `src/arch/mc.c`; all archs now pin rules to the post-prologue PC). -- [x] shared asm: `.comm/.lcomm`/`.uleb128/.sleb128` — DONE (were silent - miscompiles; LEB128 now matches llvm-mc byte-for-byte). -- [x] aa64 asm: `mov Rd,#bitmask-imm` via ORR alias — DONE (`mov` falls back to - `orr Rd,zr,#bitmask` via `aa64_logimm_encode` when not movz/movn-encodable; - `aa64_mov_orr_bitmask`). -- [x] rv64 asm: `call/tail/la/lla` pseudos + multi-word `li` — DONE (see Tier 0). -- [x] asm: relocation-operator operand syntax — DONE on all three arches: - aa64 `:lo12:`/`:got:`/`:got_lo12:` (`aa64_reloc_modifiers`), rv64 `%hi`/`%lo`/ - `%pcrel_hi`/`%pcrel_lo`/`%got_pcrel_hi` (`rv64_reloc_modifiers`), x64 `sym(%rip)`/ - `@PLT`/`@GOTPCREL` (`x64_reloc_modifiers`). All byte- and reloc-identical to - llvm-mc incl. addends. REMAINING (niche): TLS variants (`:tprel_*:`, - `%tls_*`), and `.L`-prefixed local-label spellings in operand references (a - shared-lexer concern; plain labels work, e.g. as the `%pcrel_lo` anchor). -- [x] rv64 link: `R_RV_SET_ULEB128/SUB_ULEB128` — DONE (apply path re-encodes - the ULEB128 field in place to its original byte width; `test-link-reloc-uleb128` - drives `link_reloc_apply` with values from a real `clang -g` rv64 object). - -## Tier 4 — test coverage (no behavior change; lock in the above) - -- [x] x64 decode coverage — DONE (`test-asm-x64` HTL lane in default suite; - runs the SSE-decode corpus case). -- [x] x64 encode corpus in default `make test` — DONE (`test-asm-x64`). -- [x] rv64 asm corpus no-exec default lane — DONE (`test-asm-rv64` HT). -- [x] x64 ELF linker reloc application — DONE (opt-in `test-link-x64`, R+E). -- [x] `test/debug/cfi_unit.c` wired — DONE (`test-debug` builds + runs it). 
-- [x] x64 case in `cfi_unit.c` + hw→dwarf reg-map asserts — DONE. -- [x] FP decode tests — DONE (aa64 FP decode corpus runs by default; x64 SSE - decode corpus runs in the x64 lane). - -## Standalone `as` encode (GNU-as parity for hand-written asm) - -These are all in the **text assembler's encode path** (the `cfree as` tool and -inline-`asm()` template handling). The compiler's codegen emits machine code -directly and never routes through the text assembler, and the shipped runtime -`.s`/`.S` files don't use these forms, so none of this breaks any build. - -Done (all byte-identical to clang/llvm-mc, with default-suite corpus cases): -- [x] aa64 FP-scalar (`fadd/fsub/fmul/fdiv/fneg/fabs/fsqrt/fmov[reg,gpr<->fp]/ - fcmp/fcvt/scvtf/ucvtf/fcvtzs/fcvtzu`). -- [x] aa64 bitfield (`sbfm/ubfm/bfm`) + DP1 (`clz/rbit/rev/rev16`). -- [x] aa64 byte/half + signed sub-word ldst (`ldrb/strb/ldrh/strh/ldrsb/ldrsh/ - ldrsw`). -- [x] aa64 ldst addressing modes: register-offset `[Xn, Xm, LSL #s]` / - `[Xn, Wm, SXTW]`, pre/post-index `[Xn, #i]!` / `[Xn], #i`. -- [x] aa64 atomics/exclusive (`ldxr/stxr/ldaxr/stlxr/ldar/stlr/cas{,a,l,al}`) - and LSE (`swp/ldadd/ldclr/ldeor/ldset` + a/l/al + b/h). -- [x] aa64 `mov Rd,#bitmask` via ORR alias. -- [x] x64 SIB index/scale + numeric/symbolic `(%rip)` memory operands, ALU - reg→mem / imm→mem stores, MOV imm→mem. -- [x] Relocation-operator operand syntax on all three archs: aa64 - `:lo12:`/`:got:`/`:got_lo12:`, rv64 `%hi`/`%lo`/`%pcrel_hi`/`%pcrel_lo`/ - `%got_pcrel_hi`, x64 `sym(%rip)`/`@PLT`/`@GOTPCREL` (byte- and reloc-identical - to llvm-mc incl. addends). -- [x] rv64 `call/tail/la/lla` pseudos + multi-word `li`. -- [x] ldrsb/ldrsh/ldrsw DECODE rows (`aa64_ldrs_subword`). 
- -Remaining (niche / deferred, none block any build): -- [ ] aa64 atomics: `CASP`, LSE min/max (`ldsmax/ldsmin/ldumax/ldumin`), - `LDAPR`/`STLLR`; and disasm rows for the new encode-only atomics / register- - offset / writeback forms (round-trip would currently print `.inst`). -- [ ] TLS relocation modifiers (`:tprel_*:` / `%tls_*`) and `.L`-prefixed - local-label spellings in operand references (shared-lexer change). -- [ ] x64 debugger step-out RA recovery (needs a memory-reading unwind variant; - also blocked on the JIT debugger not populating `.eh_frame` for in-process - images). Tracks the same root cause as the Tier-2 "x64 dwarf: step-out" item; - both remain deferred on test-infra dependencies. diff --git a/doc/NATIVE_DIRECT_CACHE.md b/doc/NATIVE_DIRECT_CACHE.md @@ -1,189 +0,0 @@ -# NativeDirectTarget local register cache - -`NativeDirectTarget` is the single-pass `-O0` lowering shared by the native -backends (currently aarch64). The baseline lowers every semantic op to -load-operands / compute / store-result against frame homes, so a local round -trips through memory on every use. This document describes the local register -cache that avoids those round trips. **Design A** and **Design B** are both -implemented; B builds incrementally on A. - -See also `doc/CGTARGET.md` ("local register cache") for the original sketch. - -## Invariants shared by both designs - -- **What is cached.** Only scalar locals (`size <= ptr_size`) that are neither - address-taken nor `memory_required`. Aggregates and escaped locals stay - frame-only. A cache access is keyed on the local's storage type; a foreign - width bypasses (and flushes) the entry. -- **Where it is cached.** Only **caller-saved allocable** registers - (`caller_saved_mask & allocable`). Consequences: - - No prologue/epilogue work — the direct path never reports clobbered - callee-saved registers (`reserve_callee_saves` is unused on this path), so - using callee-saved regs would be unsound. 
Caller-saved sidesteps it. - - The conservative flush before every call/barrier (which spills the whole - cache) fully covers ABI clobbering across calls. -- **Basic-block scope.** Without CFG/liveness, the cache cannot survive a - control-flow edge or a join. Both designs spill+empty the cache at every - branch, label placement, and `ret`. `func_begin` starts with an empty cache. -- **State.** `reg_owner[cls][reg]` names the local cached in a physical - register (or `CG_LOCAL_NONE`); `scratch_used[cls]` doubles as the - "pinned for the current instruction" mask; per-local `reg`/`cls`/`dirty` live - on `NativeDirectLocal`. - -## Design A — write-back, compute-only cache (implemented) - -The simplest correct form. Cache entries are created **only** by pure-compute -destinations (`binop`, `unop`, `cmp`, `convert`, `load_imm`, `load_const`, -scalar `copy`) and are therefore always dirty. A read returns a live entry -(hit) or loads into a scratch temporary **without** creating an entry. The -cache thus only survives across a straight-line run of pure-compute ops. - -`nd_flush_all` (spill all dirty entries to their homes, then empty) runs at the -**top of every non-compute op**: control-flow/labels (block boundaries), calls -and barriers (clobbering / observable memory), and every memory or address op -(so frame homes are authoritative before an address base is read from a frame). - -This uniform flush is what makes Design A obviously correct: at the start of any -op that could observe memory, alias a local, read an address base from a frame, -or merge control flow, the cache is already empty and memory is authoritative. -No escape analysis, alias reasoning, or address-base tracking is required. 
- -Key helpers (`src/cg/native_direct_target.c`): - -``` -nd_local_cacheable(d, l) scalar, !address_taken, !memory_required, fits -nd_cache_alloc(d, cls) free caller-saved allocable reg, else evict one -nd_flush_local(d, local) store if dirty; drop entry -nd_invalidate_local(d, local) drop entry without storing (store supersedes) -nd_flush_all(d) flush every live entry -nd_dst_reg / nd_dst_writeback compute-op result reg; mark dirty without storing -``` - -`nd_scratch_acquire` skips owned registers and, under pressure, evicts a -non-pinned cached local (spilling it) to reuse its register as a temporary. - -**Wins:** expression/temp chains (`t1 = a+b; t2 = t1*c; t3 = t2+d`) never touch -memory between definition and use. **Limits:** the cache collapses on every -load/store/call, so memory-dense code degrades to the baseline. It is never -worse than baseline: each cached local is stored at most once per boundary, -versus once per definition in the baseline. - -## Design B — escape-aware cache with cache-aware addressing (implemented) - -Design B keeps the same block-local invariant but stops tearing the cache down -on memory operations, by reasoning about escape and by making address -construction consult the cache. It is strictly incremental over A: A with (a) -the memory ops' `nd_flush_all` removed on escape grounds, (b) addressing taught -to use live cache registers, and (c) a real spill victim selector. - -> **Refinement found during implementation — direct frame addressing.** The -> escape argument below assumes a memory op reaches a local only *through a -> pointer*. But a `load`/`store` can also address a non-escaped local's frame -> home *directly* via an `OPK_LOCAL` storage operand (`nd_addr_storage` → -> `NATIVE_ADDR_BASE_FRAME`) — the frontend does this for by-value aggregate -> field extraction on a scalar-sized struct temp. That is not pointer aliasing, -> but it does read/write the home, so a cached value would be stale. 
The fix is -> targeted, not a `nd_flush_all`: when `nd_addr_storage`/`nd_addr_pointer` build -> a `BASE_FRAME` address for an `OPK_LOCAL`, they `nd_flush_local` that one local -> first (spill if dirty, drop the entry). Pointer-based accesses (`BASE_REG` / -> `BASE_FRAME_VALUE`) are unaffected, so the common case stays fully cached. - -### 1. Escape-based aliasing replaces flush-on-memory-op - -Address-taken / `memory_required` locals are never cached. Therefore a pointer -`load`/`store`/`copy_bytes`/`set_bytes`/`bitfield_*`/non-clobbering atomic can -only alias an **escaped** local — which is never in a register. So these ops -need **no value-cache flush at all** for aliasing correctness; they keep the -cache live across the access. - -This is the central change: A flushes before a memory op to make the frame home -current; B instead never lets a non-escaped local's home be aliased, so there is -nothing to make current. - -### 2. Cache-aware addressing - -Today `nd_addr_storage` / `nd_addr_pointer` emit `NATIVE_ADDR_BASE_FRAME_VALUE` -and `nd_addr_materialize` *loads* the pointer/index from the frame. When the -base or index local is currently cached, B instead points the address at the -live register: - -``` -building an INDIRECT/pointer address whose base/index local L is cached: - base_kind = NATIVE_ADDR_BASE_REG, base.reg = L->reg (pin for the op) - else: - NATIVE_ADDR_BASE_FRAME_VALUE (load from home, as today) -...after the memory op: unpin the base/index regs; do NOT invalidate and do NOT - store even if dirty — the value was read from the live register, so the stale - home is irrelevant. -``` - -This simultaneously removes the reload and fixes the staleness hazard that A -sidesteps by flushing. A dirty pointer local can serve as an address base -directly from its register. - -### 3. 
Calls / barriers still flush + invalidate - -Caller-saved registers die across a call, and a memory-clobber barrier -(volatile, atomic with clobber, inline asm with `memory`) may observe -everything, so `nd_flush_all` still runs before `call`/`atomic`/volatile/`asm`. -A future refinement could use a per-call save-set to shrink this; flush is the -simple correct version. - -### 4. Branches / labels / ret still flush - -The block-local invariant is unchanged from A and is unavoidable without -CFG/liveness. - -### 5. Address-taking is targeted, not global - -`addr_of` / `local_addr` on a cached local does `nd_flush_local(L)` and marks it -`address_taken` (uncacheable thereafter) — a targeted flush, not `nd_flush_all`. - -### Extra bookkeeping B requires over A - -- **Read entries.** B benefits from caching read-only locals (loop variables, - reused operands), not just compute results. Read-created entries are clean - (not dirty). This reintroduces the hazard A avoids: an entry created during a - non-compute op that emits clobbering code (intrinsic/call) must not survive. - B must therefore either (i) confine read-entry creation to compute ops, or - (ii) invalidate (clean entries need no store) after any clobbering op. Option - (i) — only compute ops may create cache entries — is simplest and preserves - the "cache only grows across compute runs" property. **Implemented: option - (i).** A `load`/`bitfield_load`/`addr_of` result is written to the dst local's - frame home, bypassing the cache, and must drop any stale entry for that local - (the home write supersedes it). This is centralized in - `nd_store_operand_from_reg` — the single choke point that writes a - freshly-computed scratch value to a local's home — which `nd_invalidate_local`s - the dst entry before the store. Because it runs *after* the value reg is - produced, a dst that was its own address base has already been consumed. 
-- **Real eviction/spill (implemented as approximate-LRU).** Pressure is higher - because values survive across more ops, so "don't cache when full" is - insufficient. Each cache touch (def in `nd_dst_reg`/`nd_dst_writeback`, read - hit in `nd_materialize_operand`, addressing use in `nd_cache_reg_for`) stamps - `NativeDirectLocal.last_use` from a monotonic `NativeDirectTarget.use_tick`. - `nd_pick_cache_victim` returns the least-recently-used non-pinned owned - register; `nd_cache_alloc` and `nd_scratch_acquire` route their eviction - through it and `nd_flush_local` the victim. Pinned source/dst/address regs are - never victims. -- **Pin discipline through addressing.** Base/index cache regs must be pinned - for the duration of a memory op and unpinned (never invalidated) afterward, - alongside the existing source/dst/temp pins. -- **INDIRECT compute operands.** Keep A's `nd_flush_operand_addr_locals` guard - for the rare compute op that receives an `OPK_INDIRECT` operand directly, - unless addressing in such operands is also made cache-aware. - -### Correctness rests on - -- The escape argument: non-escaped locals are never aliasable through a pointer, - so memory ops need not flush them. -- The addressing intercept being exact: every frame-value base/index read of a - cached local must instead use the live register. -- Clean read entries never surviving a clobbering op. -- The block-local invariant (flush at every edge/label/ret) being complete. - -### Why B is worth it - -A wins only on arithmetic-dense straight-line code. B keeps the cache alive -across loads, stores, and pointer-base reuse, so it wins on realistic mixed code -— `p->a + p->b`, pointer-walking loops, struct-field math — while remaining -strictly single-pass with no lookahead and no liveness. 
diff --git a/doc/NATIVE_PORT_RV64.md b/doc/NATIVE_PORT_RV64.md @@ -1,3647 +0,0 @@ -# RV64 NativeTarget Porting Reference - -(generated guide — cross-references aa64 native.c, the -O0 driver, rv64 legacy ISA/ABI) - - - ---- - -# RV64 NativeTarget API Porting Guide — GROUP 1: Skeleton, Frame Model, and Function Lifecycle - -## Overview - -This guide specifies the rv64 NativeTarget implementation (src/arch/rv64/native.c) for the single-pass -O0 path (NativeDirectTarget). The reference implementation is AA64 (src/arch/aa64/native.c ~4557 lines); the rv64 legacy code (src/arch/rv64/*.c) provides correct ISA/ABI logic but does not compile to the NativeTarget API. Focus initially on the single-pass path; the known-frame (-O1) path is a separate optimization discussed at the end. - ---- - -## (a) Includes and RV64-Specific Subclass Struct - -### Header Includes (model aa64/native.c lines 30–45) - -```c -#include <string.h> - -#include "abi/abi.h" -#include "arch/rv64/isa.h" // ISA instruction encoders (rv_add, rv_addi, etc.) -#include "arch/rv64/regs.h" // Register name/index lookup -#include "arch/rv64/rv64.h" // Public rv64_native_target_new() declaration -#include "asm/asm.h" -#include "asm/asm_lex.h" -#include "cg/native_direct_target.h" -#include "cg/type.h" -#include "core/arena.h" -#include "core/bytes.h" -#include "core/pool.h" -#include "core/slice.h" -#include "obj/obj.h" -``` - -The rv64/isa.h file contains instruction format helpers (rv_r, rv_i, rv_s, etc.) 
and named-register constants: -- RV_S0 = 8 (frame pointer, s0/fp in psABI) -- RV_RA = 1 (return address) -- RV_SP = 2 (stack pointer) -- RV_A0..A7 = 10..17 (argument registers) -- RV_T0..T6 = 5, 6, 7, 28–31 (temporaries; use T0=5 as primary scratch) -- RV_S2..S11 = 18..27 (callee-saved; allocatable by register allocator) -- RV_FS0..FS11 = 40, 41, 50–59 (FP callee-saved) -- RV_FA0..FA7 = 42–49 (FP argument registers; 32-based DWARF numbering) - ---- - -### RvNativeTarget Subclass Struct (model AANativeTarget at aa64/native.c lines 181–261) - -```c -#define RV_PROLOGUE_WORDS 128u // Worst-case placeholder size (single-pass) -#define RV_TMP0 5u // Scratch register (RV_T0 = x5) -#define RV_TMP1 6u // Secondary scratch (RV_T1 = x6) -#define RV_FRAME_SAVE_SIZE 16u // saved s0 (8B) + saved ra (8B) - -typedef struct RvNativeSlot { - u32 off; // Bytes below s0 (positive); address = s0 - off - u32 size; - u32 align; - u8 kind; // NativeFrameSlotKind - u8 pad[3]; -} RvNativeSlot; - -typedef struct RvCalleeSave { - NativeFrameSlot slot; - CfreeCgTypeId type; - u8 cls; // NativeAllocClass (NATIVE_REG_INT or NATIVE_REG_FP) - Reg reg; -} RvCalleeSave; - -#define RV_MAX_CALLEE_SAVES 18u // s2–s11 (10) + fs2–fs11 (8) - -typedef struct RvNativeTarget { - NativeTarget base; - SrcLoc loc; - const CGFuncDesc* func; - - RvNativeSlot* slots; - u32 nslots; - u32 slots_cap; - u32 cum_off; // Cumulative frame-slot bytes below s0 (not including saved pair) - u32 max_outgoing; // Max outgoing-arg bytes across all calls - - u32 incoming_stack_size; // Callee's incoming stack args (for tail-call validation) - u32 next_param_int; // 0–8: next a-register index for INT parts; 8+ = stack - u32 next_param_fp; // 0–8: next fa-register index for FP parts; 8+ = stack - u32 next_param_stack; // 0-based byte offset for stack-passed params - - NativeFrameSlot sret_ptr_slot; // Hidden slot for sret pointer (a0 on entry) - NativeFrameSlot va_gp_slot; // Variadic GP save area slot (if needed) - - // Deferred 
patches (single-pass path only) - struct RvPatch { - u32 pos; // Code offset in text section - u32 dst_reg; // Destination register (for alloca patches) - }* patches; - u32 npatches; - u32 npatches_cap; - - u32 func_start; // func_start offset in text section - u32 prologue_pos; // prologue_pos within func_start (start of NOP region) - MCLabel epilogue_label; - - RvCalleeSave callee_saves[RV_MAX_CALLEE_SAVES]; - u32 ncallee_saves; - - // Frame layout flags (single-pass: only known_frame and has_alloca used) - u8 known_frame; // 0 on single-pass (NativeDirectTarget); 1 on known-frame - u8 has_alloca; // Set if body contains dynamic alloca -} RvNativeTarget; - -static inline RvNativeTarget* rv_of(NativeTarget* t) { return (RvNativeTarget*)t; } -``` - -**Key differences from AA64:** -1. **Frame anchor:** RV64 uses s0 (x8) as the frame pointer, which anchors the saved s0/ra pair at [s0+0]/[s0+8]. Offsets are **bytes below s0** (positive values; address = s0 - off). -2. **No bottom-record layout:** AA64 has fp_at_bottom for small frames; RV64's simpler ABI (16-byte-aligned stack, no outgoing-area subtlety) uses a single **top-record** layout where s0 is set at the prologue to sp + fp_pair_off (= frame_size - 16 - variadic_save_sz). -3. **Variadic:** If the function is variadic, a 64-byte GP save area sits immediately above the saved pair (at [s0 + 16]) to form a contiguous va_list walk. This is implicit in the frame layout and reserved in prologue (not via a frame_slot). -4. **Scratch:** RV_TMP0 (x5 = t0) and RV_TMP1 (x6 = t1) are the primary temporaries for immediate materialization; they are not allocable. 
- ---- - -## (b) Frame Layout and Helper Functions - -### Frame Layout Math (model aa64's aa_build_layout at aa64/native.c lines 121–128 + frame offset helpers lines 275–293) - -RV64 uses a single, simpler layout: - -``` - high addr caller's stack frame - +------------------------------+ - | incoming stack args | s0-relative: s0 + 16 + byte_off - +------------------------------+ - s0 --> | saved s0 | s0-relative: 0 - | saved ra | s0-relative: 8 - +------------------------------+ - | frame slots | s0-relative: -(off) where off = cum_off - | (callee-saves + locals | - | + spills + sret/va) | - +------------------------------+ - | outgoing args | sp-relative: byte_off - sp --> +------------------------------+ - low addr CFA = sp + frame_size = s0 + (frame_size - fp_pair_off) - frame_size = align16(16 + cum_off + max_outgoing + va_save_sz) - where va_save_sz = is_variadic ? 64 : 0 - fp_pair_off = frame_size - 16 - va_save_sz (where saved pair sits in sp frame) -``` - -**Inline helpers:** - -```c -#define RV_FRAME_SAVE_SIZE 16u - -// s0-relative offset of saved s0 (or ra at +8) -static inline i32 rv_s0_off_saved_s0(void) { return 0; } -static inline i32 rv_s0_off_saved_ra(void) { return 8; } - -// s0-relative offset of incoming stack arg at byte_off (0-based caller offset) -// Incoming stack args sit at s0 + 16 [+ 64 for variadic] + byte_off -static inline i32 rv_s0_off_in_arg(const RvNativeTarget* a, u32 byte_off) { - u32 base = a->is_variadic ? 
16u + 64u : 16u; - return (i32)(base + byte_off); -} - -// s0-relative offset of a frame slot (off = cum_off value from its RvNativeSlot) -// Slots stack downward from the saved pair: address = s0 - off -static inline i32 rv_s0_off_slot(u32 slot_off) { - return -(i32)slot_off; -} - -// CFA = s0 + (frame_size - fp_pair_off) = s0 + 16 + va_save_sz (absolute offset) -static inline i32 rv_cfa_off(u32 frame_size, u32 fp_pair_off) { - return (i32)(frame_size - fp_pair_off); -} - -// Frame size calculation (called once per function at prologue patch time) -static inline u32 rv_frame_size(u32 cum_off, u32 max_outgoing, u8 is_variadic) { - u32 va_sz = is_variadic ? 64u : 0u; - u32 raw = RV_FRAME_SAVE_SIZE + cum_off + max_outgoing + va_sz; - return (raw + 15u) & ~15u; // align to 16 bytes -} - -// fp_pair_off: where the saved s0/ra pair sits within the frame (sp-relative) -static inline u32 rv_fp_pair_off(u32 frame_size, u8 is_variadic) { - return frame_size - RV_FRAME_SAVE_SIZE - (is_variadic ? 64u : 0u); -} -``` - ---- - -## (c) rv64_native_target_new() Entry (model aa64_native_target_new at aa64/native.c lines 3540–3609) - -```c -NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj, MCEmitter* mc) { - RvNativeTarget* a = arena_znew(c->tu, RvNativeTarget); - NativeTarget* t; - if (!a) return NULL; - t = &a->base; - t->c = c; - t->obj = obj; - t->mc = mc; - t->regs = &rv_reg_info; // Defined in rv64/regs.c (NativeRegInfo with allocable/scratch/phys) - - // Semantic-decision hooks (class, immediates, addressing) - t->class_for_type = rv_class_for_type; - t->imm_legal = rv_imm_legal; - t->addr_legal = rv_addr_legal; - - // Function lifecycle - t->func_begin = rv_func_begin; - t->func_begin_known_frame = rv_func_begin_known_frame; // For -O1 path - t->note_frame_state = rv_note_frame_state; // Optional, for patching - t->reserve_callee_saves = NULL; // OR rv_reserve_callee_saves if needed - t->signature_stack_bytes = rv_signature_stack_bytes; - t->call_stack_bytes 
= rv_call_stack_bytes; - t->has_store_zero_reg = 0; // RV64 has x0 but most targets don't use it for store (test this) - t->store_zero_reg = 0; - t->func_end = rv_func_end; - - // Frame slot and parameter binding - t->frame_slot = rv_frame_slot; - t->frame_slot_debug_loc = NULL; // Optional; use if debugger needs per-slot dwarf locs - t->bind_param = rv_bind_native_param; - - // Control flow - t->label_new = rv_label_new; - t->label_place = rv_label_place; - t->jump = rv_jump; - t->cmp_branch = rv_cmp_branch; - t->indirect_branch = rv_indirect_branch; - t->load_label_addr = rv_load_label_addr; - - // Instruction emission (scalars, memory, shifts, calls, etc.) - t->move = rv_move; - t->load_imm = rv_load_imm; - t->load_const = rv_load_const; - t->load_addr = rv_load_addr; - t->load = rv_load; - t->store = rv_store; - t->tls_addr_of = rv_tls_addr_of; - t->copy_bytes = rv_copy_bytes; - t->set_bytes = rv_set_bytes; - t->bitfield_load = rv_bitfield_load; - t->bitfield_store = rv_bitfield_store; - t->binop = rv_binop; - t->unop = rv_unop; - t->cmp = rv_cmp; - t->convert = rv_convert; - t->alloca_ = rv_alloca; - - // Spill/reload - t->spill = rv_spill; - t->reload = rv_reload; - - // Calls and returns - t->plan_call = rv_plan_call; - t->emit_call = rv_emit_call; - t->plan_ret = rv_plan_ret; - t->ret = rv_ret; - - // Atomics - t->atomic_load = rv_atomic_load; - t->atomic_store = rv_atomic_store; - t->atomic_rmw = rv_atomic_rmw; - t->atomic_cas = rv_atomic_cas; - t->fence = rv_fence; - - // Variadic - t->va_start_ = rv_va_start_; - t->va_arg_ = rv_va_arg_; - t->va_end_ = rv_va_end_; - t->va_copy_ = rv_va_copy_; - - // Inline/file-scope asm and misc - t->intrinsic = rv_intrinsic; - t->asm_block = rv_asm_block; - t->file_scope_asm = rv_file_scope_asm; - t->trap = rv_trap; - t->set_loc = rv_set_loc; - t->finalize = rv_finalize; - - return t; -} -``` - ---- - -## (d) func_begin, func_begin_known_frame, func_end, and Prologue/Epilogue Emission - -### func_begin_common 
(Single-Pass Setup) - -```c -static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { - RvNativeTarget* a = rv_of(t); - MCEmitter* mc = t->mc; - - a->func = fd; - a->nslots = 0; - a->cum_off = 0; - a->max_outgoing = 0; - a->incoming_stack_size = 0; - a->next_param_int = 0; - a->next_param_fp = 0; - a->next_param_stack = 0; - a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE; - a->va_gp_slot = NATIVE_FRAME_SLOT_NONE; - a->npatches = 0; - a->ncallee_saves = 0; - a->known_frame = 0; - a->has_alloca = 0; - - mc->set_section(mc, fd->text_section_id); - mc->emit_align(mc, 4, 0); - a->func_start = mc->pos(mc); - mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start); - if (mc->cfi_startproc) mc->cfi_startproc(mc); - - a->prologue_pos = mc->pos(mc); - a->epilogue_label = mc->label_new(mc); -} -``` - -### func_begin (Single-Pass, NativeDirectTarget Path) - -```c -// Model: aa64_func_begin (aa64/native.c lines 1089–1095) -static void rv_func_begin(NativeTarget* t, const CGFuncDesc* fd) { - RvNativeTarget* a = rv_of(t); - MCEmitter* mc = t->mc; - - rv_func_begin_common(t, fd); - - // Reserve a worst-case prologue region (RV_PROLOGUE_WORDS NOPs). - // The exact size is unknown until all frame-slot and call-site max_outgoing - // data is gathered; func_end patches it once the frame is final. - for (u32 i = 0; i < RV_PROLOGUE_WORDS; ++i) { - rv64_emit32(mc, RV_NOP); - } - - // Emit entry saves (sret pointer, variadic GP register save area). - // Slots are reserved so spill/reload addresses are known immediately. 
- rv_reserve_entry_saves(a); -} -``` - -### rv_reserve_entry_saves and rv_emit_entry_save_stores - -```c -// Model: aa64's aa_reserve_entry_saves / aa_emit_entry_save_stores -// (aa64/native.c lines 1101–1142) - -static void rv_reserve_entry_saves(RvNativeTarget* a) { - NativeTarget* t = &a->base; - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); - - // sret: hidden slot for incoming a0 (destination pointer for struct return) - if (abi && abi->has_sret) { - NativeFrameSlotDesc sd; - memset(&sd, 0, sizeof sd); - sd.type = builtin_id(CFREE_CG_BUILTIN_I64); - sd.size = 8; - sd.align = 8; - sd.kind = NATIVE_FRAME_SLOT_SAVE; - a->sret_ptr_slot = t->frame_slot(t, &sd); - } - - // Variadic: GP save area (64 bytes) is implicit at [s0 + 16] but no explicit slot - // (The prologue will spill unconsumed a-registers there automatically.) -} - -static void rv_emit_entry_save_stores(RvNativeTarget* a) { - NativeTarget* t = &a->base; - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); - - // Spill a0 (sret pointer) into the hidden slot - if (abi && abi->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) { - NativeAddr addr; - NativeLoc src; - MemAccess mem; - CfreeCgTypeId i64 = builtin_id(CFREE_CG_BUILTIN_I64); - - memset(&addr, 0, sizeof addr); - addr.base_kind = NATIVE_ADDR_BASE_FRAME; - addr.base.frame = a->sret_ptr_slot; - addr.base_type = i64; - - memset(&src, 0, sizeof src); - src.kind = NATIVE_LOC_REG; - src.cls = NATIVE_REG_INT; - src.type = i64; - src.v.reg = RV_A0; // Incoming a0 - - memset(&mem, 0, sizeof mem); - mem.type = i64; - mem.size = 8; - mem.align = 8; - - rv_emit_mem(a, 0, src, addr, mem); // Store (0 = write) - } - // Variadic save spills happen in the prologue itself (auto via rv_build_prologue). -} -``` - -### func_end (Single-Pass Prologue Patching) - -```c -// Model: aa64_func_end (aa64/native.c lines 1493–1543) -// For single-pass (known_frame=0): patch the reserved prologue region. 
- -static void rv_func_end(NativeTarget* t) { - RvNativeTarget* a = rv_of(t); - MCEmitter* mc = t->mc; - ObjBuilder* obj = t->obj; - - // Compute final frame size now that max_outgoing and callee-saves are known - u32 n_int_saves = 0, n_fp_saves = 0; - u32 int_regs[10], fp_regs[10]; // Caller provides these - - if (!a->known_frame) { - // Single-pass: collect the actual callee-saves from some allocator state - // (this would be filled in by reserve_callee_saves or tracked during body emission) - n_int_saves = rv_collect_callee_saves(a, int_regs, fp_regs, &n_fp_saves); - } - - u32 frame_size = rv_frame_size(a->cum_off, a->max_outgoing, a->func->abi && a->func->abi->variadic); - u32 fp_pair_off = rv_fp_pair_off(frame_size, a->func->abi && a->func->abi->variadic); - - // Place epilogue label - mc->label_place(mc, a->epilogue_label); - - // Emit epilogue: restore callee-saves and frame, then ret - rv_emit_callee_restores(a, int_regs, n_int_saves, fp_regs, n_fp_saves); - rv_emit_restore_frame(a, frame_size, fp_pair_off); - rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0)); // ret (jalr x0, x1, 0) - - if (!a->known_frame) { - // Single-pass: patch the prologue region with actual instructions - u32 words[RV_PROLOGUE_WORDS]; - u32 nwords = rv_build_prologue(t, words, RV_PROLOGUE_WORDS, - frame_size, fp_pair_off, - a->cum_off, a->max_outgoing, - int_regs, n_int_saves, fp_regs, n_fp_saves, - a->func->abi && a->func->abi->has_sret, - a->func->abi && a->func->abi->variadic); - rv64_patch_region(obj, a->func->text_section_id, a->prologue_pos, words, nwords); - } - - // CFI frame information - if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { - u32 post_prologue = a->prologue_pos + (a->known_frame ? 
a->nwords_emitted * 4 : RV_PROLOGUE_WORDS * 4); - i32 cfa = rv_cfa_off(frame_size, fp_pair_off); - mc->cfi_set_next_pc_offset(mc, post_prologue); - mc->cfi_def_cfa(mc, RV_S0, cfa); // CFA = s0 + cfa_dist - mc->cfi_offset(mc, RV_S0, -cfa); // saved s0 at CFA - cfa - mc->cfi_offset(mc, RV_RA, -cfa + 8); // saved ra at CFA - cfa + 8 - for (u32 i = 0; i < n_int_saves; ++i) { - i32 slot_off = -(i32)(a->cum_off + 8u + i * 8u); // s0-relative - i32 cfa_off = slot_off - cfa; - mc->cfi_offset(mc, int_regs[i], cfa_off); - } - for (u32 i = 0; i < n_fp_saves; ++i) { - i32 slot_off = -(i32)(a->cum_off + 8u + n_int_saves * 8u + i * 8u); - i32 cfa_off = slot_off - cfa; - mc->cfi_offset(mc, 32u + fp_regs[i], cfa_off); // DWARF: fp regs 32–63 - } - } - - obj_symbol_define(obj, a->func->sym, a->func->text_section_id, - a->func_start, mc->pos(mc) - a->func_start); - if (a->func->atomize) { - obj_atom_define(obj, a->func->text_section_id, a->func_start, - mc->pos(mc) - a->func_start, a->func->sym, 0); - } - if (mc->debug) - debug_func_pc_range(mc->debug, a->func->text_section_id, a->func_start, mc->pos(mc)); - if (mc->cfi_endproc) mc->cfi_endproc(mc); - - mc_end_function(mc); - a->func = NULL; -} -``` - -### rv_build_prologue (Prologue Word Array) - -**Pseudo-C sketch** (full ISA details in rv64/isa.h): - -```c -// Model: rv64_emit.c rv_build_prologue (lines 338–416) -static u32 rv_build_prologue(NativeTarget* t, u32* words, u32 cap, - u32 frame_size, u32 fp_pair_off, - u32 cum_off, u32 max_outgoing, - const u32* int_regs, u32 n_int_saves, - const u32* fp_regs, u32 n_fp_saves, - u8 has_sret, u8 is_variadic) { - u32 wi = 0; - - // 1. 
Adjust sp: sp -= frame_size - // Encoding: ADDI sp, sp, -frame_size (or multi-instruction if imm > 12 bits) - if (fits_i12(-(i32)frame_size)) { - if (wi >= cap) goto overflow; - words[wi++] = rv_addi(RV_SP, RV_SP, -(i32)frame_size); - } else { - // Use t0 as scratch for large immediates - if (wi >= cap) goto overflow; - i32 hi = (i32)(((i64)(-(i32)frame_size) + 0x800) >> 12); - i32 lo = -(i32)frame_size - (hi << 12); - words[wi++] = rv_lui(RV_TMP0, (u32)hi & 0xffffu); - if (lo) { - if (wi >= cap) goto overflow; - words[wi++] = rv_addiw(RV_TMP0, RV_TMP0, lo); - } - if (wi >= cap) goto overflow; - words[wi++] = rv_add(RV_SP, RV_SP, RV_TMP0); - } - - // 2. Save s0 and ra at [sp + fp_pair_off] - if (fits_i12((i32)fp_pair_off)) { - if (wi + 2 > cap) goto overflow; - words[wi++] = rv_sd(RV_S0, RV_SP, (i32)fp_pair_off); - words[wi++] = rv_sd(RV_RA, RV_SP, (i32)fp_pair_off + 8); - } else { - // Use t0 to compute address - if (wi >= cap) goto overflow; - i32 hi = (i32)(((i64)fp_pair_off + 0x800) >> 12); - i32 lo = (i32)fp_pair_off - (hi << 12); - words[wi++] = rv_lui(RV_TMP0, (u32)hi & 0xffffu); - if (lo) { - if (wi >= cap) goto overflow; - words[wi++] = rv_addiw(RV_TMP0, RV_TMP0, lo); - } - if (wi >= cap) goto overflow; - words[wi++] = rv_add(RV_TMP0, RV_SP, RV_TMP0); - if (wi + 2 > cap) goto overflow; - words[wi++] = rv_sd(RV_S0, RV_TMP0, 0); - words[wi++] = rv_sd(RV_RA, RV_TMP0, 8); - } - - // 3. Set s0 = sp + fp_pair_off - if (fits_i12((i32)fp_pair_off)) { - if (wi >= cap) goto overflow; - words[wi++] = rv_addi(RV_S0, RV_SP, (i32)fp_pair_off); - } else { - // Already in t0 from step 2 - if (wi >= cap) goto overflow; - words[wi++] = rv_addi(RV_S0, RV_TMP0, 0); - } - - // 4. If sret: spill a0 into hidden slot - if (has_sret) { - // (Assume sret_ptr_slot.off is known from frame_slot calls) - // For now, emit stores via rv_store_int_s0 helper - // words[wi++] = rv_sd(RV_A0, RV_S0, -(i32)sret_slot_off); - } - - // 5. 
If variadic: spill unconsumed a-regs into save area at [s0 + 16] - if (is_variadic) { - u32 first_var = /* computed from fixed param count */; - for (u32 i = first_var; i < 8; ++i) { - if (wi >= cap) goto overflow; - words[wi++] = rv_sd(RV_A0 + i, RV_S0, 16 + (i32)(i * 8)); - } - } - - // 6. Save callee-saved integer registers (s2–s11) - for (u32 i = 0; i < n_int_saves; ++i) { - u32 r = int_regs[i]; - i32 off = -(i32)(cum_off + 8u * (i + 1u)); // s0-relative - if (fits_i12(off)) { - if (wi >= cap) goto overflow; - words[wi++] = rv_sd(r, RV_S0, off); - } else { - // Use t0 for far offset - if (wi >= cap) goto overflow; - i32 hi = (i32)(((i64)off + 0x800) >> 12); - i32 lo = off - (hi << 12); - words[wi++] = rv_lui(RV_TMP0, (u32)hi & 0xffffu); - if (lo) { - if (wi >= cap) goto overflow; - words[wi++] = rv_addiw(RV_TMP0, RV_TMP0, lo); - } - if (wi >= cap) goto overflow; - words[wi++] = rv_add(RV_TMP0, RV_S0, RV_TMP0); - if (wi >= cap) goto overflow; - words[wi++] = rv_sd(r, RV_TMP0, 0); - } - } - - // 7. 
Save callee-saved FP registers (fs2–fs11) - for (u32 i = 0; i < n_fp_saves; ++i) { - u32 r = fp_regs[i]; - i32 off = -(i32)(cum_off + 8u * (n_int_saves + i + 1u)); - // Similar store logic as int saves - if (wi >= cap) goto overflow; - words[wi++] = rv_fsd(r, RV_S0, off); - } - - return wi; - -overflow: - compiler_panic(t->c, rv_of(t)->loc, "rv64: prologue overflow (cap %u)", cap); - return 0; -} -``` - -**Helper functions for prologue emission:** - -```c -static inline int fits_i12(i32 imm) { - return imm >= -2048 && imm <= 2047; -} - -static void rv64_patch_region(ObjBuilder* obj, u32 sec_id, u32 ofs, - const u32* words, u32 nwords) { - for (u32 i = 0; i < nwords; ++i) { - u8 b[4]; - u32 word = words[i]; - b[0] = (u8)(word & 0xff); - b[1] = (u8)((word >> 8) & 0xff); - b[2] = (u8)((word >> 16) & 0xff); - b[3] = (u8)((word >> 24) & 0xff); - obj_patch(obj, sec_id, ofs + i * 4, b, 4); - } -} - -static void rv_emit_callee_restores(RvNativeTarget* a, - const u32* int_regs, u32 n_int_saves, - const u32* fp_regs, u32 n_fp_saves) { - NativeTarget* t = &a->base; - MCEmitter* mc = t->mc; - // Restore in reverse order of saves - for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) { - i32 off = -(i32)(a->cum_off + 8u * (i + 1u)); - rv64_emit32(mc, rv_ld(int_regs[i], RV_S0, off)); - } - for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) { - i32 off = -(i32)(a->cum_off + 8u * (n_int_saves + i + 1u)); - rv64_emit32(mc, rv_fld(fp_regs[i], RV_S0, off)); - } -} - -static void rv_emit_restore_frame(RvNativeTarget* a, u32 frame_size, u32 fp_pair_off) { - NativeTarget* t = &a->base; - MCEmitter* mc = t->mc; - // Load s0, ra from [sp + fp_pair_off] - if (fits_i12((i32)fp_pair_off)) { - rv64_emit32(mc, rv_ld(RV_S0, RV_SP, (i32)fp_pair_off)); - rv64_emit32(mc, rv_ld(RV_RA, RV_SP, (i32)fp_pair_off + 8)); - } else { - rv64_emit32(mc, rv_lui(RV_TMP0, (u32)(((i64)fp_pair_off + 0x800) >> 12) & 0xffffu)); - if ((i32)fp_pair_off - ((i32)(((i64)fp_pair_off + 0x800) >> 12) << 12)) { - rv64_emit32(mc, 
rv_addiw(RV_TMP0, RV_TMP0, - (i32)fp_pair_off - ((i32)(((i64)fp_pair_off + 0x800) >> 12) << 12))); - } - rv64_emit32(mc, rv_add(RV_TMP0, RV_SP, RV_TMP0)); - rv64_emit32(mc, rv_ld(RV_S0, RV_TMP0, 0)); - rv64_emit32(mc, rv_ld(RV_RA, RV_TMP0, 8)); - } - // Adjust sp: sp += frame_size (inverse of prologue) - if (fits_i12((i32)frame_size)) { - rv64_emit32(mc, rv_addi(RV_SP, RV_SP, (i32)frame_size)); - } else { - rv64_emit32(mc, rv_lui(RV_TMP0, (u32)(((i64)frame_size + 0x800) >> 12) & 0xffffu)); - if ((i32)frame_size - ((i32)(((i64)frame_size + 0x800) >> 12) << 12)) { - rv64_emit32(mc, rv_addiw(RV_TMP0, RV_TMP0, - (i32)frame_size - ((i32)(((i64)frame_size + 0x800) >> 12) << 12))); - } - rv64_emit32(mc, rv_add(RV_SP, RV_SP, RV_TMP0)); - } -} -``` - ---- - -## (e) frame_slot, reserve_callee_saves, note_frame_state, signature_stack_bytes, call_stack_bytes - -### frame_slot (model aa64/native.c lines 1545–1567) - -```c -static NativeFrameSlot rv_frame_slot(NativeTarget* t, - const NativeFrameSlotDesc* d) { - RvNativeTarget* a = rv_of(t); - RvNativeSlot* s; - u32 size = d->size ? d->size : 8u; - u32 align = d->align ? d->align : 1u; - - // Panic on known-frame path if frame is already finalized - if (a->frame_final) - compiler_panic(a->base.c, a->loc, "rv64: frame slot requested after prologue"); - - // Grow slots array if needed - if (a->nslots == a->slots_cap) { - u32 cap = a->slots_cap ? 
a->slots_cap * 2u : 16u; - RvNativeSlot* nb = arena_zarray(t->c->tu, RvNativeSlot, cap); - if (a->slots) memcpy(nb, a->slots, sizeof(*nb) * a->nslots); - a->slots = nb; - a->slots_cap = cap; - } - - // Allocate: align cum_off, then reserve [cum_off, cum_off+size) - a->cum_off = align_up_u32(a->cum_off + size, align); - s = &a->slots[a->nslots++]; - s->off = a->cum_off; // This is the address: s0 - cum_off - s->size = size; - s->align = align; - s->kind = d->kind; - - return (NativeFrameSlot)a->nslots; // 1-based slot ID -} -``` - -### reserve_callee_saves (Optional, model aa64/native.c lines 1230–1286) - -If implemented (not required on single-pass path): - -```c -static void rv_reserve_callee_saves(NativeTarget* t, const u32* used_by_class, - u32 nclasses) { - RvNativeTarget* a = rv_of(t); - // For each (class, mask) pair, walk the callee-saved registers (s2–s11 for INT, - // fs2–fs11 for FP) and reserve frame slots for those the allocator used. - for (u32 cls = 0; cls < nclasses; ++cls) { - u32 mask = used_by_class[cls]; - if (mask == 0) continue; - u32 first = (cls == NATIVE_REG_INT) ? RV_S2 : (32 + 18); // fs2 = DWARF 50 - u32 last = (cls == NATIVE_REG_INT) ? RV_S11 : (32 + 27); - for (u32 reg = first; reg <= last; ++reg) { - if (!(mask & (1u << reg))) continue; - NativeFrameSlotDesc sd; - memset(&sd, 0, sizeof sd); - sd.type = (cls == NATIVE_REG_INT) ? 
builtin_id(CFREE_CG_BUILTIN_I64) - : builtin_id(CFREE_CG_BUILTIN_F64); - sd.size = 8; - sd.align = 8; - sd.kind = NATIVE_FRAME_SLOT_SAVE; - NativeFrameSlot slot = t->frame_slot(t, &sd); - a->callee_saves[a->ncallee_saves].slot = slot; - a->callee_saves[a->ncallee_saves].reg = reg; - a->callee_saves[a->ncallee_saves].cls = cls; - a->ncallee_saves++; - } - } -} -``` - -### note_frame_state (Optional, for deferred patching) - -```c -static void rv_note_frame_state(NativeTarget* t, - const NativeFramePatchState* state) { - RvNativeTarget* a = rv_of(t); - if (state->max_outgoing > a->max_outgoing) - a->max_outgoing = state->max_outgoing; -} -``` - -### signature_stack_bytes (model aa64/native.c lines 1350–1370) - -Query the incoming stack-argument bytes for a function signature. Used for tail-call validation. - -```c -static u32 rv_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type, - int* variadic, u32* nparams) { - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type); - if (!abi) { - if (variadic) *variadic = 0; - if (nparams) *nparams = 0; - return 0; - } - if (variadic) *variadic = abi->variadic; - if (nparams) *nparams = abi->nparams; - - u32 stack_bytes = 0; - for (u32 i = 0; i < abi->nparams; ++i) { - const ABIArgInfo* ai = &abi->params[i]; - if (ai->kind == ABI_ARG_IGNORE) continue; - if (ai->kind == ABI_ARG_INDIRECT) { - // Indirect arg: takes one a-register or stack slot (8 bytes) - if (i >= 8) stack_bytes += 8; - } else { - // Direct parts: walk each part, count stack occupants - for (u16 j = 0; j < ai->nparts; ++j) { - const ABIArgPart* pt = &ai->parts[j]; - u32 part_reg_idx = (pt->cls == ABI_CLASS_FP) ? 
RV_NEXT_FP : RV_NEXT_INT; - if (RV_NEXT_INT >= 8 || (pt->cls == ABI_CLASS_FP && RV_NEXT_FP >= 8)) { - stack_bytes += 8; // Simplified; real logic increments per-class cursor - } - } - } - } - return align_up_u32(stack_bytes, 16u); -} -``` - -### call_stack_bytes (model aa64/native.c lines 1371–1400) - -Pure query: given a NativeCallDesc (already marshalled with locations), return the outgoing stack-argument bytes. - -```c -static u32 rv_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) { - if (!desc || desc->nargs == 0) return 0; - // Walk desc->args, which are already assigned to physical locations. - // Count those on the stack (NATIVE_LOC_STACK). - u32 max_off = 0; - for (u32 i = 0; i < desc->nargs; ++i) { - if (desc->args[i].kind == NATIVE_LOC_STACK) { - u32 end = desc->args[i].v.stack.offset + cg_type_size(t->c, desc->args[i].type); - if (end > max_off) max_off = end; - } - } - return align_up_u32(max_off, 16u); -} -``` - ---- - -## (f) bind_param (NativeTarget Hook) — Parameter Binding - -### High-Level Contract - -`bind_param` is called once per parameter after register allocation and frame slots are final. It reads the parameter from its ABI-mandated incoming location (a-register or stack) and places it in the allocator-chosen destination: -- **NATIVE_LOC_REG:** The allocator assigned the param to a hard register. -- **NATIVE_LOC_FRAME:** The allocator assigned the param to a frame slot (address-taken, large aggregate, or spilled). -- **NATIVE_LOC_NONE:** The param is unused; only the ABI cursor advances. - -### bind_param Pseudo-C (model aa64/native.c lines 3616–3696) - -```c -static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p, - NativeLoc dst) { - RvNativeTarget* a = rv_of(t); - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); - const ABIArgInfo* ai = (p->index < abi->nparams) ? 
&abi->params[p->index] : NULL; - int to_reg = (dst.kind == NATIVE_LOC_REG); - - if (!ai || ai->kind == ABI_ARG_IGNORE) return; - - // INDIRECT argument: sret or byval aggregate. - // The caller passes a pointer in an a-register (or stack if beyond a7). - if (ai->kind == ABI_ARG_INDIRECT) { - NativeAddr d_addr, from; - AggregateAccess access; - NativeLoc src; - - // Fetch the pointer from the next available a-register or stack - if (a->next_param_int < 8u) { - src = rv_reg_loc(p->type, NATIVE_REG_INT, a->next_param_int++); - } else { - // Stack-passed pointer: load into t0 - src = rv_reg_loc(p->type, NATIVE_REG_INT, RV_TMP0); - NativeAddr saddr; - memset(&saddr, 0, sizeof saddr); - saddr.base_kind = NATIVE_ADDR_BASE_REG; - saddr.base.reg = RV_S0; // Frame pointer (could also use sp with offset calc) - saddr.offset = rv_s0_off_in_arg(a, a->next_param_stack); - a->next_param_stack += 8u; - rv_emit_mem(a, 1, src, saddr, rv_mem_for_type(t, p->type, 8)); // Load - } - - // Destination must be a frame slot (indirect params can't go to registers) - if (dst.kind != NATIVE_LOC_FRAME) - compiler_panic(t->c, a->loc, "rv64: indirect param requires frame dest"); - - // Copy aggregate from [pointer] to [dst slot] - memset(&d_addr, 0, sizeof d_addr); - d_addr.base_kind = NATIVE_ADDR_BASE_FRAME; - d_addr.base.frame = dst.v.frame; - d_addr.base_type = p->type; - - memset(&from, 0, sizeof from); - from.base_kind = NATIVE_ADDR_BASE_REG; - from.base.reg = src.v.reg; - from.base_type = p->type; - - memset(&access, 0, sizeof access); - access.type = p->type; - access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type); - access.align = p->align ? p->align : type_align32(t, p->type); - - rv_copy_bytes(t, d_addr, from, access); - return; - } - - // DIRECT argument: one or more parts (INT / FP scalars or small aggregates). - for (u32 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* part = &ai->parts[i]; - NativeAllocClass cls = (part->cls == ABI_CLASS_FP) ? 
NATIVE_REG_FP : NATIVE_REG_INT; - int reg_dst = to_reg && (NativeAllocClass)dst.cls == cls; - NativeLoc src; - - // Fetch the part from the next available a-reg/fa-reg or stack - if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) { - src = rv_reg_loc(p->type, cls, a->next_param_fp++); - } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) { - src = rv_reg_loc(p->type, cls, a->next_param_int++); - } else { - // Stack-passed part: load into a scratch (t0 for int, ft0 for fp) or directly into dst reg - Reg tmp = reg_dst ? (Reg)dst.v.reg : (cls == NATIVE_REG_FP ? 8u : RV_TMP0); // ft0=DWARF 32 - src = rv_reg_loc(p->type, cls, tmp); - - // Align and load from stack - a->next_param_stack = align_up_u32(a->next_param_stack, rv_part_stack_align(part)); - NativeAddr saddr; - memset(&saddr, 0, sizeof saddr); - saddr.base_kind = NATIVE_ADDR_BASE_REG; - saddr.base.reg = RV_S0; - saddr.base_type = p->type; - saddr.offset = rv_s0_off_in_arg(a, a->next_param_stack); - rv_emit_mem(a, 1, src, saddr, rv_mem_for_type(t, p->type, part->size)); - a->next_param_stack += 8u; - } - - // Place src into dst - if (dst.kind == NATIVE_LOC_NONE) { - // Unused parameter: only the ABI cursor advances. - } else if (to_reg) { - NativeLoc d = rv_reg_loc(dst.type ? 
dst.type : p->type, - (NativeAllocClass)dst.cls, (Reg)dst.v.reg); - if (!(src.kind == NATIVE_LOC_REG && src.v.reg == d.v.reg && - (NativeAllocClass)src.cls == (NativeAllocClass)d.cls)) { - rv_move(t, d, src); - } - } else { - // Store part into frame slot at offset part->src_offset - rv_store_part(t, rv_stack_loc(p->type, dst.v.frame, (i32)part->src_offset), - src, 0, part->size); - } - } - - a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); -} -``` - -### Helpers for bind_param - -```c -static inline NativeLoc rv_reg_loc(CfreeCgTypeId type, NativeAllocClass cls, Reg reg) { - NativeLoc loc; - memset(&loc, 0, sizeof loc); - loc.kind = NATIVE_LOC_REG; - loc.cls = cls; - loc.type = type; - loc.v.reg = reg; - return loc; -} - -static inline NativeLoc rv_stack_loc(CfreeCgTypeId type, NativeFrameSlot slot, i32 offset) { - NativeLoc loc; - memset(&loc, 0, sizeof loc); - loc.kind = NATIVE_LOC_FRAME; - loc.type = type; - loc.v.frame = slot; - return loc; -} - -static inline u32 rv_part_stack_align(const ABIArgPart* part) { - return part->align ? part->align : 8u; -} - -static inline MemAccess rv_mem_for_type(NativeTarget* t, CfreeCgTypeId type, u32 size) { - MemAccess mem; - memset(&mem, 0, sizeof mem); - mem.type = type; - mem.size = size; - mem.align = size; // Simplified; real code queries type alignment - return mem; -} - -static void rv_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src, - u32 dst_offset, u32 size) { - // Generalized store of a part into a frame location - // (Pseudo-implementation; real code uses rv_emit_mem with computed addresses) - rv_move(t, dst, src); -} - -static void rv_emit_mem(RvNativeTarget* a, int is_load, NativeLoc dst, - NativeAddr addr, MemAccess mem) { - // Emit a memory load or store instruction with the given operands. - // dst is the register location; addr is the address (base_kind + base + offset). - // This is a facade that dispatches to rv_load / rv_store. 
-} -``` - ---- - -## (g) Summary: Single-Pass vs Known-Frame Flow - -### Single-Pass (-O0, NativeDirectTarget) - -1. **func_begin:** Reserve prologue region (RV_PROLOGUE_WORDS NOPs), reserve entry-save slots. -2. **Body:** Frame slots grow (via frame_slot) as needed; max_outgoing grows (via note_frame_state) as calls are encountered. -3. **func_end:** - - Compute final frame size = align16(16 + cum_off + max_outgoing + va_sz). - - Patch prologue region with rv_build_prologue (exact instructions). - - Emit epilogue (restore + ret). - - Post CFI metadata. - -### Known-Frame (-O1, Optimizer Emit Path) - -1. **func_begin_known_frame:** Receives NativeKnownFrameDesc (slots, max_outgoing, callee_saved_used pre-computed). -2. **Frame is final immediately:** Call aa_reserve_callee_saves, aa_frame_slot for all planned slots, set frame_final. -3. **Emit prologue inline:** Call aa_build_prologue_words once, emit the exact word count (no patching). -4. **Body:** No frame growth; allocas / tail-epilogues can be emitted final with no back-patching. -5. **func_end:** Just place epilogue label and emit epilogue (no patching). - -For this porting guide (GROUP 1), the focus is **single-pass only**. Known-frame is a future -O1 optimization; the entry point is rv_func_begin_known_frame (stub or minimal), and the frame_final flag prevents post-prologue frame changes. - ---- - -## Key Takeaways for the RV64 Implementation - -1. **Register model:** s0 (x8) is the frame pointer. Offsets are **bytes below s0** (positive = downward). Stack grows downward (sp decreases on entry). -2. **Frame layout:** Single top-record: saved pair at [sp + fp_pair_off], then locals/slots below. -3. **ABI alignment:** 16-byte stack alignment; variadic save area (64B) sits at [s0 + 16]. -4. **Prologue:** Multi-phase: sp adjust → save pair → set s0 → save callee-saves → spill sret/variadic. -5. **Epilogue:** Reverse: restore callee-saves → restore pair → sp adjust → ret. -6. 
**Immediate materialization:** Use LUI+ADDIW for large immediates (> 12 bits); store absolute offsets in t0/t1. -7. **Parameter binding:** Walk ABI parts, read from a0–a7 (or fa0–fa7), move to destination (register or frame). -8. **Tail calls:** Compare outgoing stack bytes vs incoming (via signature_stack_bytes); if smaller or equal, sibling call is feasible. - - - ---- - -# RV64 NativeTarget: Register Tables and Operand Legality (GROUP 2) - -## Overview - -This group implements the register and operand legality infrastructure for the rv64 NativeTarget. The reference is `/Users/ryan/code/cfree/src/arch/aa64/native.c` (lines ~3370-3526 for register tables), modeled exactly on the aa64 structure but parameterized by RISC-V ISA and LP64D ABI specifics. - ---- - -## 1. Register Constants and Reserved Registers - -Define at the top of `src/arch/rv64/native.c` (before the allocable/scratch/phys tables): - -```c -enum { - RV_PROLOGUE_WORDS = 128u, /* reserved NOP region for -O0 prologue */ - RV_TMP0 = 29, /* t4: backend scratch (caller-saved, temp) */ - RV_TMP1 = 30, /* t5: backend scratch (caller-saved, temp) */ - RV_FP = 8, /* s0: frame pointer (callee-saved) */ - RV_RA = 1, /* return address (callee-saved) */ - RV_SP = 2, /* stack pointer (reserved) */ - RV_GP = 3, /* global pointer (reserved) */ - RV_TP = 4, /* thread pointer (reserved) */ - RV_ZERO = 0, /* x0: hardware zero register (reserved) */ - /* Argument registers (a0-a7 / fa0-fa7, caller-saved) */ - RV_A0 = 10, RV_A1 = 11, RV_A2 = 12, RV_A3 = 13, - RV_A4 = 14, RV_A5 = 15, RV_A6 = 16, RV_A7 = 17, - /* Callee-saved integer registers (s2-s11) */ - RV_S2 = 18, RV_S3 = 19, RV_S4 = 20, RV_S5 = 21, - RV_S6 = 22, RV_S7 = 23, RV_S8 = 24, RV_S9 = 25, - RV_S10 = 26, RV_S11 = 27, - /* Temporary registers (t0-t2 before register region; t3-t6 after allocable) */ - RV_T0 = 5, RV_T1 = 6, RV_T2 = 7, RV_T3 = 28, RV_T4 = 29, RV_T5 = 30, RV_T6 = 31, - /* FP temporaries */ - RV_FT0 = 0, RV_FT1 = 1, RV_FT2 = 2, RV_FT3 = 3, RV_FT4 
= 4, - RV_FT5 = 5, RV_FT6 = 6, RV_FT7 = 7, - RV_FS0 = 8, RV_FS1 = 9, /* callee-saved */ - RV_FA0 = 10, RV_FA1 = 11, RV_FA2 = 12, RV_FA3 = 13, /* argument regs */ - RV_FA4 = 14, RV_FA5 = 15, RV_FA6 = 16, RV_FA7 = 17, - RV_FS2 = 18, RV_FS3 = 19, RV_FS4 = 20, RV_FS5 = 21, /* callee-saved */ - RV_FS6 = 22, RV_FS7 = 23, RV_FS8 = 24, RV_FS9 = 25, - RV_FS10 = 26, RV_FS11 = 27, - RV_FT8 = 28, RV_FT9 = 29, RV_FT10 = 30, RV_FT11 = 31, -}; -``` - -**Source of truth for RISC-V register names:** `/Users/ryan/code/cfree/src/arch/rv64/isa.h` lines 17-67 (RV_X0 through RV_T6 enum definitions). - ---- - -## 2. Integer Register Allocable and Scratch Tables - -### rv_int_allocable[] - -Allocable integer registers come from the RISC-V psABI callee-saved pool (s2-s11 / x18-x27). The allocator prefers caller-saved temporaries when not under pressure, so they're listed separately and pulled via distinct scratch array. - -**Source:** `/Users/ryan/code/cfree/src/arch/rv64/opt_coord.c` lines 8, 11 (legacy tables; note these enumerate both the allocable and reserved scratch regs). - -```c -/* Allocable integers: s2-s11 (callee-saved, x18-x27). - * These are the only registers available for general allocation after - * reserving tmp0/tmp1 (t4/t5, x29/x30), ra/sp/gp/tp/zero, and the - * FP (s0). */ -static const Reg rv_int_allocable[] = { - 20, 21, 22, 23, 24, 25, 26, 27, /* s4-s11 */ - 18, 19, /* s2-s3: allocated only under register pressure */ -}; -``` - -**Rationale:** s2 and s3 are marked as "reserved by opt_emit" in the legacy code (line 11), indicating the original backend used them for special purposes. However, under the NativeTarget contract they can be allocable since emit hooks (not alloc) control their usage. - -### rv_int_scratch[] - -Scratch registers available for temporary materialization without forcing the allocator into the callee-saved pool. Drawn from caller-saved temporaries (t0-t2, t4-t6). - -```c -/* Scratch integers available to emit without spilling. 
- * t4/t5 (x29/x30) are reserved for backend internal use (e.g., atomic RMW - * helper, address computation). t0-t2/t3/t6 are available to the emitter. - * For simplicity, we expose t4/t5 here; emit hooks may use them if free. */ -static const Reg rv_int_scratch[] = {29, 30}; /* t4, t5 */ -``` - ---- - -## 3. FP Register Allocable and Scratch Tables - -### rv_fp_allocable[] - -Allocable FP registers from the RISC-V psABI callee-saved pool (fs2-fs11 / f18-f27) plus fs0-fs1 (f8-f9). Note: fa0-fa7 are argument registers (not allocable; reserved). ft0-ft7 and ft8-ft11 are caller-saved temporaries. - -```c -/* Allocable FP: fs0-fs1 (callee-saved, f8-f9), fs2-fs11 (f18-f27). - * fa0-fa7 are argument registers (reserved for ABI). */ -static const Reg rv_fp_allocable[] = { - 8, 9, /* fs0-fs1 (callee-saved, prefer first) */ - 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, /* fs2-fs11 */ -}; -``` - -### rv_fp_scratch[] - -FP scratch registers for temporary use without spilling. - -```c -/* Scratch FP: ft8-ft11 (x28-x31 of f-register numbering, caller-saved). */ -static const Reg rv_fp_scratch[] = {28, 29, 30, 31}; /* ft8-ft11 */ -``` - ---- - -## 4. NativePhysRegInfo Arrays (phys[]) - -Each register in the physical register file gets a descriptor. Order the integer array as: argument/return registers first (a0-a7), then allocables (s4-s11, s2-s3), then reserved (sp, gp, tp, zero, ra). - -### Integer Physical Registers - -**Source:** `/Users/ryan/code/cfree/src/arch/aa64/native.c` lines 3424-3441 (aa_int_phys[] macro pattern). - -```c -#define RV_PHYS_INT_ARG(r, idx) \ - {.reg = (r), \ - .cls = NATIVE_REG_INT, \ - .abi_index = (idx), \ - .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ - ((idx) < 2u ? 
NATIVE_REG_RET : 0), \ - .spill_cost = 1u, \ - .copy_cost = 1u} - -#define RV_PHYS_INT_ALLOC(r) \ - {.reg = (r), \ - .cls = NATIVE_REG_INT, \ - .abi_index = 0xffu, \ - .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ - .spill_cost = 4u, \ - .copy_cost = 1u} - -#define RV_PHYS_INT_RESERVED(r) \ - {.reg = (r), \ - .cls = NATIVE_REG_INT, \ - .abi_index = 0xffu, \ - .flags = NATIVE_REG_RESERVED, \ - .spill_cost = 0u, \ - .copy_cost = 0u} - -static const NativePhysRegInfo rv_int_phys[] = { - /* Argument/return registers (a0-a7, x10-x17) */ - RV_PHYS_INT_ARG(10, 0), /* a0 / x10 — return arg 0 + arg 0 */ - RV_PHYS_INT_ARG(11, 1), /* a1 / x11 — return arg 1 + arg 1 */ - RV_PHYS_INT_ARG(12, 2), /* a2 / x12 — arg 2 only */ - RV_PHYS_INT_ARG(13, 3), /* a3 / x13 — arg 3 only */ - RV_PHYS_INT_ARG(14, 4), /* a4 / x14 — arg 4 only */ - RV_PHYS_INT_ARG(15, 5), /* a5 / x15 — arg 5 only */ - RV_PHYS_INT_ARG(16, 6), /* a6 / x16 — arg 6 only */ - RV_PHYS_INT_ARG(17, 7), /* a7 / x17 — arg 7 only */ - /* Allocable callee-saved (s2-s11, x18-x27) */ - RV_PHYS_INT_ALLOC(18), /* s2 / x18 */ - RV_PHYS_INT_ALLOC(19), /* s3 / x19 */ - RV_PHYS_INT_ALLOC(20), /* s4 / x20 */ - RV_PHYS_INT_ALLOC(21), /* s5 / x21 */ - RV_PHYS_INT_ALLOC(22), /* s6 / x22 */ - RV_PHYS_INT_ALLOC(23), /* s7 / x23 */ - RV_PHYS_INT_ALLOC(24), /* s8 / x24 */ - RV_PHYS_INT_ALLOC(25), /* s9 / x25 */ - RV_PHYS_INT_ALLOC(26), /* s10 / x26 */ - RV_PHYS_INT_ALLOC(27), /* s11 / x27 */ - /* Reserved: temporaries, frame pointer, return address, zero, etc. 
*/ - RV_PHYS_INT_RESERVED(0), /* zero / x0 */ - RV_PHYS_INT_RESERVED(1), /* ra / x1 */ - RV_PHYS_INT_RESERVED(2), /* sp / x2 */ - RV_PHYS_INT_RESERVED(3), /* gp / x3 */ - RV_PHYS_INT_RESERVED(4), /* tp / x4 */ - RV_PHYS_INT_RESERVED(5), /* t0 / x5 */ - RV_PHYS_INT_RESERVED(6), /* t1 / x6 */ - RV_PHYS_INT_RESERVED(7), /* t2 / x7 */ - RV_PHYS_INT_RESERVED(8), /* s0 / x8 (frame pointer) */ - RV_PHYS_INT_RESERVED(9), /* s1 / x9 */ - RV_PHYS_INT_RESERVED(28), /* t3 / x28 */ - RV_PHYS_INT_RESERVED(29), /* t4 / x29 (backend tmp0) */ - RV_PHYS_INT_RESERVED(30), /* t5 / x30 (backend tmp1) */ - RV_PHYS_INT_RESERVED(31), /* t6 / x31 */ -}; -``` - -**Key points:** -- `abi_index`: Position in ABI register order (a0-a7 map to indices 0-7; non-arg registers get 0xff). -- `flags`: Argument registers have NATIVE_REG_ARG; return arg registers (a0-a1) additionally have NATIVE_REG_RET. -- `spill_cost`: Callee-saved set to 4u (higher cost discourages allocation under light pressure); caller-saved and arg regs 1u. -- **Return register mask:** a0 and a1 both participate in 64-bit integer returns. - -### FP Physical Registers - -```c -#define RV_PHYS_FP_ARG(r, idx) \ - {.reg = (r), \ - .cls = NATIVE_REG_FP, \ - .abi_index = (idx), \ - .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ - ((idx) < 2u ? 
NATIVE_REG_RET : 0), \ - .spill_cost = 1u, \ - .copy_cost = 1u} - -#define RV_PHYS_FP_ALLOC(r) \ - {.reg = (r), \ - .cls = NATIVE_REG_FP, \ - .abi_index = 0xffu, \ - .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ - .spill_cost = 4u, \ - .copy_cost = 1u} - -#define RV_PHYS_FP_RESERVED(r) \ - {.reg = (r), \ - .cls = NATIVE_REG_FP, \ - .abi_index = 0xffu, \ - .flags = NATIVE_REG_RESERVED, \ - .spill_cost = 0u, \ - .copy_cost = 0u} - -static const NativePhysRegInfo rv_fp_phys[] = { - /* Argument/return registers (fa0-fa7, f10-f17) */ - RV_PHYS_FP_ARG(10, 0), /* fa0 / f10 */ - RV_PHYS_FP_ARG(11, 1), /* fa1 / f11 */ - RV_PHYS_FP_ARG(12, 2), /* fa2 / f12 */ - RV_PHYS_FP_ARG(13, 3), /* fa3 / f13 */ - RV_PHYS_FP_ARG(14, 4), /* fa4 / f14 */ - RV_PHYS_FP_ARG(15, 5), /* fa5 / f15 */ - RV_PHYS_FP_ARG(16, 6), /* fa6 / f16 */ - RV_PHYS_FP_ARG(17, 7), /* fa7 / f17 */ - /* Allocable callee-saved (fs0-fs1, fs2-fs11) */ - RV_PHYS_FP_ALLOC(8), /* fs0 / f8 */ - RV_PHYS_FP_ALLOC(9), /* fs1 / f9 */ - RV_PHYS_FP_ALLOC(18), /* fs2 / f18 */ - RV_PHYS_FP_ALLOC(19), /* fs3 / f19 */ - RV_PHYS_FP_ALLOC(20), /* fs4 / f20 */ - RV_PHYS_FP_ALLOC(21), /* fs5 / f21 */ - RV_PHYS_FP_ALLOC(22), /* fs6 / f22 */ - RV_PHYS_FP_ALLOC(23), /* fs7 / f23 */ - RV_PHYS_FP_ALLOC(24), /* fs8 / f24 */ - RV_PHYS_FP_ALLOC(25), /* fs9 / f25 */ - RV_PHYS_FP_ALLOC(26), /* fs10 / f26 */ - RV_PHYS_FP_ALLOC(27), /* fs11 / f27 */ - /* Reserved: caller-saved temps, thread-local fp. */ - RV_PHYS_FP_RESERVED(0), /* ft0 / f0 */ - RV_PHYS_FP_RESERVED(1), /* ft1 / f1 */ - RV_PHYS_FP_RESERVED(2), /* ft2 / f2 */ - RV_PHYS_FP_RESERVED(3), /* ft3 / f3 */ - RV_PHYS_FP_RESERVED(4), /* ft4 / f4 */ - RV_PHYS_FP_RESERVED(5), /* ft5 / f5 */ - RV_PHYS_FP_RESERVED(6), /* ft6 / f6 */ - RV_PHYS_FP_RESERVED(7), /* ft7 / f7 */ - RV_PHYS_FP_RESERVED(28), /* ft8 / f28 */ - RV_PHYS_FP_RESERVED(29), /* ft9 / f29 */ - RV_PHYS_FP_RESERVED(30), /* ft10 / f30 */ - RV_PHYS_FP_RESERVED(31), /* ft11 / f31 */ -}; -``` - ---- - -## 5. 
NativeAllocClassInfo Arrays - -Define two class infos: one for NATIVE_REG_INT, one for NATIVE_REG_FP. Include the four register-state masks. - -```c -static const NativeAllocClassInfo rv_classes[] = { - /* INTEGER CLASS */ - {.cls = NATIVE_REG_INT, - .allocable = rv_int_allocable, - .nallocable = sizeof rv_int_allocable / sizeof rv_int_allocable[0], - .scratch = rv_int_scratch, - .nscratch = sizeof rv_int_scratch / sizeof rv_int_scratch[0], - .phys = rv_int_phys, - .nphys = sizeof rv_int_phys / sizeof rv_int_phys[0], - /* Caller-saved mask: a0-a7 (x10-x17) + t0-t2, t3-t6 (x5-x7, x28-x31). - * RISC-V psABI: x5-x7, x10-x17, x28-x31 are caller-saved. */ - .caller_saved_mask = - ((1u << 5) | (1u << 6) | (1u << 7) | /* t0-t2 */ - (1u << 10) | (1u << 11) | (1u << 12) | (1u << 13) | /* a0-a3 */ - (1u << 14) | (1u << 15) | (1u << 16) | (1u << 17) | /* a4-a7 */ - (1u << 28) | (1u << 29) | (1u << 30) | (1u << 31)), /* t3-t6 */ - /* Callee-saved mask: s0-s11 (x8-x9, x18-x27). */ - .callee_saved_mask = - ((1u << 8) | (1u << 9) | /* s0-s1 */ - (1u << 18) | (1u << 19) | (1u << 20) | (1u << 21) | /* s2-s5 */ - (1u << 22) | (1u << 23) | (1u << 24) | (1u << 25) | /* s6-s9 */ - (1u << 26) | (1u << 27)), /* s10-s11 */ - /* Argument mask: a0-a7 (x10-x17). */ - .arg_mask = - ((1u << 10) | (1u << 11) | (1u << 12) | (1u << 13) | - (1u << 14) | (1u << 15) | (1u << 16) | (1u << 17)), - /* Return mask: a0-a1 (x10-x11) for 64-bit integers. */ - .ret_mask = ((1u << 10) | (1u << 11)), - /* Reserved: zero, ra, sp, gp, tp, s0 (fp), s1, t4/t5 (tmp0/tmp1). 
*/ - .reserved_mask = - ((1u << 0) | /* zero */ - (1u << 1) | /* ra */ - (1u << 2) | /* sp */ - (1u << 3) | /* gp */ - (1u << 4) | /* tp */ - (1u << 8) | /* s0 / fp */ - (1u << 9) | /* s1 */ - (1u << 29) | (1u << 30))}, /* t4/t5 tmp0/tmp1 */ - - /* FLOATING-POINT CLASS */ - {.cls = NATIVE_REG_FP, - .allocable = rv_fp_allocable, - .nallocable = sizeof rv_fp_allocable / sizeof rv_fp_allocable[0], - .scratch = rv_fp_scratch, - .nscratch = sizeof rv_fp_scratch / sizeof rv_fp_scratch[0], - .phys = rv_fp_phys, - .nphys = sizeof rv_fp_phys / sizeof rv_fp_phys[0], - /* Caller-saved FP: ft0-ft7 (f0-f7) + fa0-fa7 (f10-f17) + ft8-ft11 (f28-f31). */ - .caller_saved_mask = - ((1u << 0) | (1u << 1) | (1u << 2) | (1u << 3) | /* ft0-ft3 */ - (1u << 4) | (1u << 5) | (1u << 6) | (1u << 7) | /* ft4-ft7 */ - (1u << 10) | (1u << 11) | (1u << 12) | (1u << 13) | /* fa0-fa3 */ - (1u << 14) | (1u << 15) | (1u << 16) | (1u << 17) | /* fa4-fa7 */ - (1u << 28) | (1u << 29) | (1u << 30) | (1u << 31)), /* ft8-ft11 */ - /* Callee-saved FP: fs0-fs11 (f8-f9, f18-f27). */ - .callee_saved_mask = - ((1u << 8) | (1u << 9) | /* fs0-fs1 */ - (1u << 18) | (1u << 19) | (1u << 20) | (1u << 21) | /* fs2-fs5 */ - (1u << 22) | (1u << 23) | (1u << 24) | (1u << 25) | /* fs6-fs9 */ - (1u << 26) | (1u << 27)), /* fs10-fs11 */ - /* Argument mask: fa0-fa7 (f10-f17). */ - .arg_mask = - ((1u << 10) | (1u << 11) | (1u << 12) | (1u << 13) | - (1u << 14) | (1u << 15) | (1u << 16) | (1u << 17)), - /* Return mask: fa0-fa1 (f10-f11) for 64-bit floats. */ - .ret_mask = ((1u << 10) | (1u << 11)), - /* Reserved: all temp registers. */ - .reserved_mask = - ((1u << 0) | (1u << 1) | (1u << 2) | (1u << 3) | /* ft0-ft3 */ - (1u << 4) | (1u << 5) | (1u << 6) | (1u << 7) | /* ft4-ft7 */ - (1u << 28) | (1u << 29) | (1u << 30) | (1u << 31))}, /* ft8-ft11 */ -}; -``` - ---- - -## 6. 
NativeRegInfo Global - -```c -static const NativeRegInfo rv_reg_info = { - .classes = rv_classes, - .nclasses = sizeof rv_classes / sizeof rv_classes[0], - /* Function pointers are set to NULL here; no resolve_name / debug_name / dwarf_reg - * implementations are exposed via NativeTarget (they are used internally via - * rv64_register_index / rv64_register_name from src/arch/rv64/regs.c if needed). */ -}; -``` - ---- - -## 7. Operand Legality: class_for_type - -Query which register class a type occupies. RISC-V uses GPR (INT) for integers/pointers and FPR (FP) for floats/doubles. Inline or placed before imm_legal. - -```c -static NativeAllocClass rv_class_for_type(NativeTarget* t, CfreeCgTypeId type) { - /* FP types use the FP register class. All others (including aggregates, - * which get passed by reference) use INT. */ - if (type && cg_type_is_float(t->c, type) && cg_type_size(t->c, type) <= 8u) - return NATIVE_REG_FP; - return NATIVE_REG_INT; -} -``` - ---- - -## 8. Operand Legality: addr_legal - -Check if a memory address mode is legal. RISC-V supports base+imm12 only (no indexed addressing). Zba extension allows folding index into base via sh{1,2,3}add, but these are codegen decisions, not legality checks here. - -```c -/* RISC-V memory addressing: base + imm12 (signed 12-bit) only. - * No indexed addressing without Zba transforms (handled by emit). */ -static int rv_addr_legal(NativeTarget* t, const NativeAddr* addr, - MemAccess mem) { - (void)t; - (void)mem; - if (!addr) return 0; - /* Index must be absent. */ - if (addr->index_kind != NATIVE_ADDR_INDEX_NONE) return 0; - /* Base must be present (NATIVE_ADDR_BASE_REG or NATIVE_ADDR_BASE_FRAME). */ - return addr->base_kind == NATIVE_ADDR_BASE_REG || - addr->base_kind == NATIVE_ADDR_BASE_FRAME; -} -``` - ---- - -## 9. Operand Legality: imm_legal - -Check if an immediate can be folded directly into an instruction without materialization. - -```c -/* RISC-V immediate legality. 
- * - ALU/load immediates: 12-bit signed [-2048, 2047] via I-type. - * - Shifts: 6-bit (shamt) in [0, 63] for 64-bit, [0, 31] for 32-bit. - * - Moves: any value can be materialized via LUI+ADDI or LI pseudo. - * - Comparisons: 12-bit signed for CMP (subtracting immediate). - * - * This is a simplified query used by the optimizer to avoid materializing - * large constants. The emitter has full responsibility for folding or - * rejecting each case. */ -static int rv_imm_legal(NativeTarget* t, NativeImmUse use, u32 op, - CfreeCgTypeId type, i64 imm) { - (void)t; - (void)type; - - switch (use) { - case NATIVE_IMM_MOVE: - /* Any constant can be materialized. */ - return 1; - - case NATIVE_IMM_BINOP: - /* For ALU binops (IADD, ISUB, etc.), check if imm fits I-type (12-bit). */ - if ((BinOp)op == BO_IADD || (BinOp)op == BO_ISUB) { - return imm >= -2048 && imm <= 2047; - } - /* Shifts: 6-bit shamt for RV64, 5-bit for RV32. */ - if ((BinOp)op == BO_SHL || (BinOp)op == BO_LSHR || (BinOp)op == BO_ASHR) { - return imm >= 0 && imm <= 63; - } - /* AND, OR, XOR: 12-bit immediate. */ - if ((BinOp)op == BO_AND || (BinOp)op == BO_OR || (BinOp)op == BO_XOR) { - return imm >= -2048 && imm <= 2047; - } - return 0; - - case NATIVE_IMM_CMP: - /* CMP uses subtraction, so 12-bit signed immediate. */ - return imm >= -2048 && imm <= 2047; - - case NATIVE_IMM_ADDR_OFFSET: - /* Address computations and load/store offsets: 12-bit signed. */ - return imm >= -2048 && imm <= 2047; - } - return 0; -} -``` - ---- - -## 10. NativeTarget Initialization - -In `rv64_native_target_new()` (or wherever the NativeTarget is created), set: - -```c - t->regs = &rv_reg_info; - t->class_for_type = rv_class_for_type; - t->imm_legal = rv_imm_legal; - t->addr_legal = rv_addr_legal; - t->has_store_zero_reg = 1; /* x0 is hardware zero */ - t->store_zero_reg = RV_ZERO; /* x0 */ -``` - ---- - -## 11. 
Summary: Mask Computation - -For reference, the five masks per class are computed as follows: - -**Integer:** -- `caller_saved_mask`: All registers in (a0-a7, t0-t2, t3-t6) = bits [5-7] | [10-17] | [28-31] -- `callee_saved_mask`: All registers in (s0-s11) = bits [8-9] | [18-27] -- `arg_mask`: All registers in (a0-a7) = bits [10-17] -- `ret_mask`: Registers that receive return values (a0-a1) = bits [10-11] -- `reserved_mask`: zero, ra, sp, gp, tp, s0 (fp), s1, tmp0 (t4), tmp1 (t5) = bits [0-4] | [8-9] | [29-30] - -**FP:** -- `caller_saved_mask`: All registers in (fa0-fa7, ft0-ft7, ft8-ft11) = bits [0-7] | [10-17] | [28-31] -- `callee_saved_mask`: All registers in (fs0-fs11) = bits [8-9] | [18-27] -- `arg_mask`: All registers in (fa0-fa7) = bits [10-17] -- `ret_mask`: Registers that receive FP returns (fa0-fa1) = bits [10-11] -- `reserved_mask`: All temporaries = bits [0-7] | [28-31] - ---- - -## 12. Key Differences from AA64 - -1. **Immediate range:** AA64 uses 12-bit with optional shift-left-12; RV64 uses plain 12-bit [-2048, 2047]. -2. **Shift operands:** AA64 shift has a 6-bit immediate field for both 32 and 64-bit ops; RV64 uses 6-bit shamt for RV64I, 5-bit for RV32I. -3. **Indexed addressing:** AA64 supports optional shift-and-add on the index (log2_scale checked against memory size); RV64 has no indexed addressing—the emit layer folds index via Zba if needed. -4. **Register names:** DWARF indices match hardware register numbers (0-31 for x-regs, 32-63 for f-regs). Use `rv64_register_index()` from `src/arch/rv64/regs.c` for name → index mapping if needed. -5. **Return registers:** Both a0 and a1 (for 128-bit integer returns) and fa0-fa1 (for FP pair returns) are marked NATIVE_REG_RET in the phys[] descriptor but only a0 and fa0 are typically returned as single values. - ---- - -## 13. 
Integration Checklist - -- [ ] Define RV_* register constants at the top of native.c -- [ ] Implement rv_int_allocable[], rv_int_scratch[], rv_fp_allocable[], rv_fp_scratch[] -- [ ] Populate rv_int_phys[] (8 argument/return + 10 callee-saved + 14 reserved = 32 entries) -- [ ] Populate rv_fp_phys[] (8 argument/return + 12 callee-saved + 12 reserved = 32 entries) -- [ ] Define rv_classes[] with correct caller_saved, callee_saved, arg, ret, reserved masks -- [ ] Create rv_reg_info pointing to rv_classes[] -- [ ] Implement rv_class_for_type(), rv_addr_legal(), rv_imm_legal() -- [ ] Wire up .regs, .class_for_type, .imm_legal, .addr_legal, .has_store_zero_reg, .store_zero_reg in rv64_native_target_new() - - - - ---- - -# RV64 NativeTarget Porting Guide — GROUP 3: Data Movement, ALU, Control Flow, Addressing - -## Overview -This guide details the implementation of rv64 NativeTarget hooks for GROUP 3 operations, mirroring the aa64 reference implementation (/Users/ryan/code/cfree/src/arch/aa64/native.c) and mining the correct rv64 legacy code (/Users/ryan/code/cfree/src/arch/rv64/ops.c, emit.c, alloc.c, isa.h). RV64 has no condition flags (unlike aa64's NZCV), so all condition-based operations must materialize the result into a register via SLT/SLTU or FLT/FLE for FP. - -## ISA Encoder Helpers (src/arch/rv64/isa.h) -- **rv_r(funct7, rs2, rs1, funct3, rd, op)** — R-type: ADD/SUB/SLL/SRL/SRA/MUL/DIV etc. 
-- **rv_i(imm12, rs1, funct3, rd, op)** — I-type: ADDI/ANDI/ORI/XORI/SLTI/SLTIU/loads/JALR -- **rv_s(imm12, rs2, rs1, funct3, op)** — S-type: SB/SH/SW/SD/FSW/FSD -- **rv_b(imm13, rs2, rs1, funct3, op)** — B-type: BEQ/BNE/BLT/BGE/BLTU/BGEU -- **rv_u(imm32_hi20, rd, op)** — U-type: LUI/AUIPC (imm32_hi20 = upper 20 bits, shifted left 12) -- **rv_j(imm21, rd, op)** — J-type: JAL -- **rv_sh1add/rv_sh2add/rv_sh3add(rd, rs1, rs2)** — Zba: (rs1 << {1,2,3}) + rs2 - -Integer register mnemonics: RV_ZERO/RV_X0 (x0), RV_RA/RV_X1 (x1, return address), RV_SP/RV_X2 (x2, stack pointer), RV_GP/RV_X3, RV_TP/RV_X4, RV_T0..RV_T6 (x5, x6, x7, x28..x31 temp), RV_S0..RV_S11 (x8, x9, x18..x27 callee-saved), RV_A0..RV_A7 (x10..x17 args). - -FP register constants: RV_FMT_S (0) for float, RV_FMT_D (1) for double. - -## Key Implementation Patterns - -### Load Immediate (rv64_emit_load_imm — src/arch/rv64/emit.c:117) -For **sf=1 (64-bit)** and large immediates: -1. Recursively decompose via hi20/lo12 split -2. Use LUI rd, hi20 (upper 20 bits) -3. Add ADDIW rd, rd, lo12 if lo12 ≠ 0 - -For **sf=0 (32-bit)**: -- Fits in 12-bit signed: ADDI rd, x0, imm12 -- Otherwise: LUI rd, hi; ADDIW rd, rd, lo (as above) - -Address-computation path: In load_imm_native and during addr materialization, use emit_li_32 (emit.c:76) or rv64_emit_load_imm. The latter auto-detects sign-extend range and chooses the shortest encoding. - -### Address Folding (Zba Index — src/arch/rv64/ops.c:273) -rv_fold_indexed materializes `base + (index << log2_scale)` into a scratch register using: -- log2_scale=0: ADD scratch, base, index -- log2_scale=1: SH1ADD scratch, index, base (= (index<<1) + base) -- log2_scale=2: SH2ADD scratch, index, base (= (index<<2) + base) -- log2_scale=3: SH3ADD scratch, index, base (= (index<<3) + base) - -Then update the addr tuple: base ← scratch, index ← REG_NONE, log2_scale ← 0. 
- -### Integer Sign/Zero Extension (src/arch/rv64/ops.c:322, 329, src/arch/rv64/alloc.c:322) -For **CV_SEXT** on 32-bit source (src/arch/rv64/ops.c:901): -- To 64-bit: ADDIW rd, rs, 0 (sign-extends low 32) - -For **CV_ZEXT** on 32-bit source (src/arch/rv64/ops.c:914): -- To 64-bit: SLLI rd, rs, 32 ; SRLI rd, rd, 32 - -For **CV_SEXT** on <32-bit (e.g., 16-bit): -- sh = 64 - src_bits -- SLLI rd, rs, sh ; SRAI rd, rd, sh - -For **CV_ZEXT** on <32-bit: -- sh = 64 - src_bits -- SLLI rd, rs, sh ; SRLI rd, rd, sh - -Canonical i32 CMP operands (src/arch/rv64/alloc.c:337): Before signed order comparisons (CMP_LT_S etc.) or EQ/NE on 32-bit types, sign-extend both operands. For unsigned order comparisons, zero-extend both. - -## Hook Implementation Sketches - -### move(NativeTarget* t, NativeLoc dst, NativeLoc src) -**Location**: /Users/ryan/code/cfree/src/arch/aa64/native.c:1707 - -Integer register-to-register: Move via ADDI rd, rs, 0 (rv_addi). -FP-to-FP: Use FSGNJ.fmt rd, rs, rs (rv_fsgnj(fmt, rd, rs, rs)) for same register copy. -Int-to-FP: FMV.D.X rd, rs (rv_fmv_d_x) for 64-bit, FMV.W.X for 32-bit. -FP-to-Int: FMV.X.D rd, rs (rv_fmv_x_d) for 64-bit, FMV.X.W for 32-bit. - -Elision: Skip if same register, same class (rv64 unlike aa64 has no disjoint register files). - -### load_imm(NativeTarget* t, NativeLoc dst_reg, i64 imm) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:1740; rv64 /Users/ryan/code/cfree/src/arch/rv64/emit.c:117 - -```c -void rv64_native_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) { - u32 rd = loc_reg(dst); - int is_64 = (type_size(t, dst.type) == 8); - rv64_emit_load_imm(t->mc, is_64, rd, imm); -} -``` - -Call rv64_emit_load_imm (which handles the LUI/ADDIW/SLLI/ADDI sequence internally). 
- -### load_const(NativeTarget* t, NativeLoc dst_reg, ConstBytes cbytes) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:1744; rv64 /Users/ryan/code/cfree/src/arch/rv64/ops.c:43 - -Pack the bytes into a u64 (little-endian): -```c -u64 v = 0; -for (u32 i = 0; i < cbytes.size; ++i) - v |= (u64)cbytes.bytes[i] << (i * 8); -``` - -If FP dst: materialize into a temp (t0), then move to the FP register (move(t, dst, tmp_loc)). -If Int dst: call load_imm with the packed value. - -### load_addr(NativeTarget* t, NativeLoc dst_reg, NativeAddr addr) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:1759 - -**NATIVE_ADDR_BASE_FRAME**: Frame slot address. -- Compute FP offset (frame-relative address) -- If offset fits ±2047: ADDI rd, fp, off -- Else: LUI/ADDIW sequence to materialize offset, then ADD rd, fp, t0 -- Apply index if present (via Zba sh{1,2,3}add) - -**NATIVE_ADDR_BASE_FRAME_VALUE**: Load pointer from frame, add offset. -- Recursively load base address from frame slot (use enc_int_load for LD) -- Add offset via ADDI if fits, else materialize then ADD -- Apply index - -**NATIVE_ADDR_BASE_REG**: Register + offset. -- ADDI rd, base_reg, offset (or LUI/ADDIW + ADD for large offsets) -- Apply index - -**NATIVE_ADDR_BASE_GLOBAL**: Global symbol. -- If extern-via-GOT: AUIPC rd, %got_pcrel_hi(sym) + LD rd, %pcrel_lo(.)(rd), then ADDI for addend -- Else: AUIPC rd, %pcrel_hi(sym) + ADDI rd, %pcrel_lo(.)(rd) with relocations -- Emit relocs at each site (R_RV_PCREL_HI20 on AUIPC, R_RV_PCREL_LO12_I on the load/add) -- Apply index - -Index application (Zba): If index_kind != NONE, fold via rv_sh{1,2,3}add before returning. - -### load(NativeTarget* t, NativeLoc dst_reg, NativeAddr addr, MemAccess mem) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:1825 - -1. Fold any indexed address component (rv_fold_indexed into a scratch) -2. Materialize base + offset via addr_mode (lines 160–204 in rv64/ops.c) -3. 
Emit the appropriate load instruction: - - FP: FLD (8-byte, funct3=0x3) or FLW (4-byte, funct3=0x2) - - Int: enc_int_load(mem.size, sign_extend, rd, base, offset) → LB/LH/LW/LD/LBU/LHU/LWU - -```c -void rv64_native_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, MemAccess mem) { - // Fold index - NativeAddr a = addr; - if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) { - // Materialize index fold into scratch - // (Zba sh1add/sh2add/sh3add as per rv_fold_indexed pattern) - } - // Materialize base + offset - RvAddrMode am = addr_mode(t, a, RV_T0); - // Emit load - u32 sz = mem.size; - if (/* FP */) { - rv64_emit32(t->mc, (sz == 8) ? rv_fld(loc_reg(dst), am.base, am.ofs) - : rv_flw(loc_reg(dst), am.base, am.ofs)); - } else { - rv64_emit32(t->mc, enc_int_load(sz, /* sign_extend */, loc_reg(dst), am.base, am.ofs)); - } -} -``` - -### store(NativeTarget* t, NativeAddr addr, NativeLoc src_reg, MemAccess mem) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:1830 - -Parallel to load: -1. Fold indexed address -2. Materialize base + offset -3. Emit store (enc_int_store or FSW/FSD) - -```c -void rv64_native_store(NativeTarget* t, NativeAddr addr, NativeLoc src, MemAccess mem) { - // Fold + materialize as above - RvAddrMode am = /* ... */; - u32 sz = mem.size; - if (/* FP */) { - rv64_emit32(t->mc, (sz == 8) ? rv_fsd(loc_reg(src), am.base, am.ofs) - : rv_fsw(loc_reg(src), am.base, am.ofs)); - } else { - rv64_emit32(t->mc, enc_int_store(sz, loc_reg(src), am.base, am.ofs)); - } -} -``` - -### tls_addr_of(NativeTarget* t, NativeLoc dst_reg, ObjSymId sym, i64 addend) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:1835; rv64 /Users/ryan/code/cfree/src/arch/rv64/ops.c:491 - -**ELF LE (local-exec model)**: -1. Materialize TLS offset via LUI/ADDIW -2. ADD rd, RV_TP (thread pointer), offset -3. 
Emit R_RV_TPREL_HI20 / R_RV_TPREL_LO12_I relocations - -### copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src, AggregateAccess access) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:1922 - -Forward copy (non-overlapping) or backward (overlapping). For each granule (8/4/2/1 bytes): -1. Load from src + offset -2. Store to dst + offset - -```c -void rv64_native_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src, AggregateAccess access) { - CfreeCgTypeId i64 = /* i64 type id */, i32, i16, i8; - NativeLoc tmp = /* tmp_loc(i64, RV_T0) */; - for (u32 off = 0; off < access.size; ) { - u32 rem = access.size - off; - u32 sz = (rem >= 8) ? 8 : (rem >= 4) ? 4 : (rem >= 2) ? 2 : 1; - MemAccess mem = /* set size, align */; - load(t, tmp, /* src + off */, mem); - store(t, /* dst + off */, tmp, mem); - off += sz; - } -} -``` - -### set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value, AggregateAccess access) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:1927 - -Loop, storing the byte_value repetitively: -```c -void rv64_native_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value, AggregateAccess access) { - CfreeCgTypeId i8 = /* i8 type id */; - MemAccess mem = /* i8, size=1, align=1 */; - NativeLoc byte = byte_value; - byte.type = i8; - for (u32 off = 0; off < access.size; ++off) - store(t, /* dst + off */, byte, mem); -} -``` - -### bitfield_load / bitfield_store -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:2328, 2350 - -Extract/insert a bitfield. 
Use SLLI/SRLI/SRAI to mask and position: -- Load: SLLI to align, SRLI/SRAI to extract (sign-extend if signed) -- Store: Mask old value, shift new value, OR together - -### binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc a_reg, NativeLoc b_reg_or_imm) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:2019; rv64 /Users/ryan/code/cfree/src/arch/rv64/ops.c:697 - -**FP operations** (BO_FADD/FSUB/FMUL/FDIV): -```c -u32 fmt = (type_size(t, dst.type) == 8) ? RV_FMT_D : RV_FMT_S; -switch (op) { - case BO_FADD: rv64_emit32(mc, rv_fadd(fmt, rd, ra, rb)); break; - case BO_FSUB: rv64_emit32(mc, rv_fsub(fmt, rd, ra, rb)); break; - case BO_FMUL: rv64_emit32(mc, rv_fmul(fmt, rd, ra, rb)); break; - case BO_FDIV: rv64_emit32(mc, rv_fdiv(fmt, rd, ra, rb)); break; -} -``` - -**Integer immediate fast paths** (sf = type_size == 8): -- BO_IADD with imm fitting ±2047: ADDI rd, ra, imm (or ADDIW for 32-bit) -- BO_ISUB with -imm fitting ±2047: ADDI rd, ra, -imm (negation) -- BO_AND/OR/XOR with imm: ANDI/ORI/XORI rd, ra, imm -- BO_SHL/SHR_U/SHR_S with imm: SLLI/SRLI/SRAI rd, ra, shamt (shamt masked to 5 bits for 32-bit, 6 for 64-bit) - -**Register-register** (both operands in registers or IMM out of range): -- BO_IADD: ADD rd, ra, rb (or ADDW for 32-bit) -- BO_ISUB: SUB rd, ra, rb (or SUBW) -- BO_IMUL: MUL rd, ra, rb (or MULW) -- BO_SDIV/UDIV: DIV/DIVU rd, ra, rb (or DIVW/DIVUW) -- BO_SREM/UREM: REM/REMU rd, ra, rb (or REMW/REMUW) -- BO_AND/OR/XOR: AND/OR/XOR rd, ra, rb -- BO_SHL: SLL rd, ra, rb (or SLLW) -- BO_SHR_U: SRL rd, ra, rb (or SRLW) -- BO_SHR_S: SRA rd, ra, rb (or SRAW) - -Commutative canonicalization (ops.c:728): Swap a_op ↔ b_op for IADD/AND/OR/XOR if a is IMM and b is not, so the imm-form check handles both orders. 
- -### unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:2109; rv64 /Users/ryan/code/cfree/src/arch/rv64/ops.c:860 - -**FP negation** (UO_FNEG): -- FSGNJN.fmt rd, rs, rs (fsgnj with negation, rv_fsgnjn) - -**Integer negation** (UO_NEG): -- SUB rd, x0, rs (or SUBW for 32-bit); rv_sub(rd, RV_ZERO, rs) - -**Bitwise NOT** (UO_BNOT): -- XORI rd, rs, -1; rv_xori(rd, rs, -1) - -**Logical NOT** (UO_NOT): -- SLTIU rd, rs, 1 (set if rs < 1, i.e., rs == 0); rv_sltiu(rd, rs, 1) - -### cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc a_reg, NativeLoc b_reg_or_imm) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:2158; rv64 /Users/ryan/code/cfree/src/arch/rv64/alloc.c:431 - -**FP comparisons** (CMP_EQ, CMP_NE, CMP_LT_F, CMP_LE_F, CMP_GT_F, CMP_GE_F): -- FEQ.fmt rd, fa, fb; for CMP_EQ (rv_feq_s/rv_feq_d) -- FLT.fmt rd, fa, fb; for CMP_LT_F (rv_flt_s/rv_flt_d) -- FLE.fmt rd, fa, fb; for CMP_LE_F (rv_fle_s/rv_fle_d) -- Invert GT/GE by swapping operands: CMP_GT → FLT(fb, fa), CMP_GE → FLE(fb, fa) -- For CMP_NE: FEQ, then XORI rd, rd, 1 - -**Integer comparisons**: -1. Canonicalize i32 operands (sign/zero extend as per context) -2. 
Use SLT/SLTU/BEQ-based sequences: - - CMP_EQ: SUB rd, ra, rb; SLTIU rd, rd, 1 (set if diff == 0) - - CMP_NE: SUB rd, ra, rb; SLTU rd, x0, rd (set if diff != 0) - - CMP_LT_S: SLT rd, ra, rb - - CMP_LT_U: SLTU rd, ra, rb - - CMP_GT_S: SLT rd, rb, ra (swapped operands) - - CMP_GT_U: SLTU rd, rb, ra - - CMP_GE_S: SLT rd, ra, rb; XORI rd, rd, 1 (NOT of <) - - CMP_GE_U: SLTU rd, ra, rb; XORI rd, rd, 1 - - CMP_LE_S: SLT rd, rb, ra; XORI rd, rd, 1 - - CMP_LE_U: SLTU rd, rb, ra; XORI rd, rd, 1 - -### convert(NativeTarget* t, ConvKind op, NativeLoc dst, NativeLoc src) -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:2164; rv64 /Users/ryan/code/cfree/src/arch/rv64/ops.c:894 - -**CV_SEXT** (sign-extend): -- 32 bits: ADDIW rd, rs, 0 -- < 32 bits: SLLI rd, rs, (64 - src_bits); SRAI rd, rd, (64 - src_bits) - -**CV_ZEXT** (zero-extend): -- 32 bits: SLLI rd, rs, 32; SRLI rd, rd, 32 -- < 32 bits: SLLI rd, rs, (64 - src_bits); SRLI rd, rd, (64 - src_bits) - -**CV_TRUNC**: ADDIW rd, rs, 0 (truncates to 32 bits, sign-extends; narrower widths handled by store) - -**CV_ITOF_S** (int → float, signed): -- FCVT.D.L / FCVT.D.W (64-bit / 32-bit src to double) -- FCVT.S.L / FCVT.S.W (to single) - -**CV_ITOF_U** (unsigned): -- FCVT.D.LU / FCVT.D.WU -- FCVT.S.LU / FCVT.S.WU - -**CV_FTOI_S** (float → int, signed): -- FCVT.L.D / FCVT.W.D (double to 64-bit / 32-bit) -- FCVT.L.S / FCVT.W.S (from single) - -**CV_FTOI_U** (unsigned): -- FCVT.LU.D / FCVT.WU.D -- FCVT.LU.S / FCVT.WU.S - -**CV_FEXT** (float extend, single → double): -- FCVT.D.S rd, rs; rv_fcvt_d_s(rd, rs) - -**CV_FTRUNC** (float truncate, double → single): -- FCVT.S.D rd, rs; rv_fcvt_s_d(rd, rs) - -**CV_BITCAST** (bitcast between int/float registers): -- Int → FP: FMV.D.X / FMV.W.X (rv_fmv_d_x / rv_fmv_w_x) -- FP → Int: FMV.X.D / FMV.X.W (rv_fmv_x_d / rv_fmv_x_w) -- Same-class: Move (ADDI for int, FSGNJ for FP) or elide if same register - -### alloca_(NativeTarget* t, NativeLoc dst, NativeLoc size, u32 align) 
-**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c:2234 - -Round size up: ADDI t0, size, (align - 1); AND t0, t0, -align -Decrement sp: SUB sp, sp, t0 -Return address: ADDI dst, sp, max_outgoing (or record a patch if max_outgoing not final) - -### spill / reload -**Reference**: aa64 /Users/ryan/code/cfree/src/arch/aa64/native.c (via aa_emit_mem) - -Spill src_reg to a frame slot: Materialize the slot's address, then call store. -Reload from slot to dst_reg: Load from the materialized address. - -### label_new / label_place / jump -**Reference**: rv64 /Users/ryan/code/cfree/src/arch/rv64/alloc.c:261 - -```c -MCLabel rv64_native_label_new(NativeTarget* t) { - return t->mc->label_new(t->mc); -} - -void rv64_native_label_place(NativeTarget* t, MCLabel label) { - t->mc->label_place(t->mc, label); -} - -void rv64_native_jump(NativeTarget* t, MCLabel label) { - rv64_emit32(t->mc, rv_jal(RV_ZERO, 0)); // JAL x0, offset (discards return address) - t->mc->emit_label_ref(t->mc, label, R_RV_JAL, 4, 0); -} -``` - -### cmp_branch(NativeTarget* t, CmpOp op, NativeLoc a, NativeLoc b, MCLabel label) -**Reference**: rv64 /Users/ryan/code/cfree/src/arch/rv64/alloc.c:355 - -**FP branch** (CMP_LT_F, CMP_LE_F, CMP_GT_F, CMP_GE_F): -1. Materialize comparison into a register via FLT/FLE -2. Branch: BNE rd, x0, label - -**Integer branch**: -1. Canonicalize i32 operands if needed -2. Emit appropriate branch: - - CMP_EQ: BEQ ra, rb, 0 - - CMP_NE: BNE ra, rb, 0 - - CMP_LT_S: BLT ra, rb, 0 - - CMP_GE_S: BGE ra, rb, 0 - - CMP_LT_U: BLTU ra, rb, 0 - - CMP_GE_U: BGEU ra, rb, 0 - - CMP_GT_S: BLT rb, ra, 0 (swapped) - - CMP_LE_S: BGE rb, ra, 0 - - CMP_GT_U: BLTU rb, ra, 0 - - CMP_LE_U: BGEU rb, ra, 0 -3. 
Emit relocation: emit_label_ref(mc, label, R_RV_BRANCH, 4, 0) - -### indirect_branch(NativeTarget* t, NativeLoc addr, const MCLabel* valid_targets, u32 ntargets) -**Reference**: rv64 /Users/ryan/code/cfree/src/arch/rv64/alloc.c:287 - -```c -void rv64_native_indirect_branch(NativeTarget* t, NativeLoc addr, const MCLabel* valid_targets, u32 ntargets) { - (void)valid_targets; - (void)ntargets; // Not used; CFI is optional - u32 rs1 = loc_reg(addr); - rv64_emit32(t->mc, rv_jalr(RV_ZERO, rs1, 0)); // JALR x0, rs1, 0 (indirect jump) -} -``` - -### load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel label) -**Reference**: rv64 /Users/ryan/code/cfree/src/arch/rv64/alloc.c:271 - -PC-relative pair: AUIPC + ADDI with R_RV_INTRA_AUIPC_ADDI relocation (width=8, addend=0 anchors to AUIPC): -```c -void rv64_native_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel label) { - u32 rd = loc_reg(dst); - rv64_emit32(t->mc, rv_auipc(rd, 0)); - rv64_emit32(t->mc, rv_addi(rd, rd, 0)); - t->mc->emit_label_ref(t->mc, label, R_RV_INTRA_AUIPC_ADDI, 8, 0); -} -``` - -## ABI & Frame Considerations - -- **RV64 LP64D ABI**: Integer args in a0..a7 (x10..x17), FP args in fa0..fa7 (f10..f17). -- **Callee-saved**: s0..s11 (x8, x9, x18..x27). -- **Frame layout** (known-frame path, /Users/ryan/code/cfree/src/arch/rv64/emit.c:227): - - Outgoing area (aligned 16) - - Saved GP registers (callee-saves, FP saves) - - Local slots - - Saved s0/ra (16 bytes) -- **FP (frame pointer)**: s0 (x8); points to saved-pair; CFA = sp + frame_size. 
- -## Register References - -- x0: RV_ZERO (hardwired 0; write discards, read returns 0) -- x1: RV_RA (return address) -- x2: RV_SP (stack pointer) -- x5..x7: RV_T0..RV_T2 (temporaries, caller-saved) -- x8: RV_S0 / RV_FP (frame pointer, callee-saved) -- x10..x17: RV_A0..RV_A7 (argument/return registers) -- x18..x27: RV_S2..RV_S11 (callee-saved) -- x28..x31: RV_T3..RV_T6 (temporaries, caller-saved) - -Floating-point temporaries: f5..f7, f28..f31 (caller-saved); f8..f9, f18..f27 (callee-saved). - - - ---- - -# GROUP 4: Calls, Returns, and the ABI Interface — Porting Guide for rv64 NativeTarget - -## Overview - -This guide covers porting the call, return, and ABI-binding mechanisms to rv64's NativeTarget implementation. The contract is in `/Users/ryan/code/cfree/src/arch/native_target.h` (NativeCallDesc, NativeCallPlan, plan_call, emit_call, plan_ret, ret). The reference implementation is aa64 (`src/arch/aa64/native.c`, lines ~2614–2891), which queries the ABI via `src/abi/abi.h` (ABIFuncInfo, ABIArgInfo, ABIArgPart). The rv64 legacy code provides ISA and ABI logic in `src/arch/rv64/ops.c` (rv_call, rv_ret, rv_call_stack_size) and `src/arch/rv64/opt_coord.c` (rv_plan_call), plus the RISC-V LP64D ABI rules in `src/abi/abi_rv64.c`. - -## ABI Architecture - -Both aa64 and rv64 keep ABI decisions **behind the abi/ interface** — hardcoding is forbidden. The flow is: - -1. **ABIFuncInfo** (`abi_cg_func_info(c->abi, fn_type)`) gives you the signature's calling convention: parameter ABIArgInfo array, return ABIArgInfo, sret flag, variadic flag, vararg-stack behavior. - -2. **ABIArgInfo** classifies one parameter or return value: - - `kind`: ABI_ARG_IGNORE, ABI_ARG_DIRECT (parts), ABI_ARG_INDIRECT (caller passes address) - - `nparts` + `parts[]` (ABIArgPart): for DIRECT, each part is one register-passed or stack-passed chunk - - Each ABIArgPart holds: `cls` (ABI_CLASS_INT/FP), `size`, `align`, `src_offset` (offset in the original value) - -3. 
**Variadic handling**: ABIFuncInfo.variadic flag + vararg_on_stack control whether varargs bypass register pools and go straight to stack (Apple ARM64 sets this; RISC-V does not). - -4. **Return values**: ABIFuncInfo.ret + has_sret. If ret.kind == ABI_ARG_INDIRECT, the caller passes a pointer in a0, and plan_ret handles the aggregate copy-back. - -## RV64 ABI Specifics (from abi_rv64.c) - -- **8 integer argument registers** (a0–a7, aka x10–x17) and **8 FP argument registers** (fa0–fa7, aka f10–f17) -- **Scalar rules**: integers ≤8B → DIRECT + one INT part; float/double → DIRECT + one FP part; void → IGNORE -- **Small aggregate rules** (≤16B): homogeneous FP aggregates → FP parts; one FP + one INT → mixed parts; otherwise INT parts (up to 2 GPRs) -- **Large aggregates** (>16B): INDIRECT (sret for return, byval for args) -- **Stack arguments**: 8-byte aligned slots at sp+0, sp+8, sp+16, … -- **Variadic args**: passed in integer registers first, then on the stack (FP varargs use integer registers, never FP registers) — handled at call/return sites -- **Return value registers**: a0/a1 for integers, fa0/fa1 for FP -- **sret**: a0 holds the destination pointer; the callee copies the return value to [a0] - -## Step 1: Implement plan_call (analog: aa64 lines 2614–2762) - -### Signature -```c -static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, - NativeCallPlan* plan) { - // plan->callee = desc->callee; - // plan->flags = desc->flags; - // plan->stack_arg_size = <computed>; - // plan->has_sret = abi && abi->has_sret; - // plan->is_variadic = abi && abi->variadic; - // plan->args[0..nargs-1] with src/dst moves - // plan->rets[0..nrets-1] with src/dst return moves -} -``` - -### Body Sketch - -1. **Query ABI**: `const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);` -2. **Initialize plan**: zero the struct, copy callee, flags, has_sret, is_variadic. -3. 
**Compute stack_arg_size**: walk arguments with lookahead cursors (next_int=0, next_fp=0, stack=0) to find which go on stack: - - For each arg, get ABIArgInfo from abi→params[i] (or synthesize one if no ABI) - - IGNORE: skip - - INDIRECT: takes a register if next_int < 8, else stack (8 bytes) - - DIRECT with parts: for each part, check if it fits in registers (next_int/next_fp counters) or stack - - Variadic args with vararg_on_stack: force to stack - - Accumulate stack offset, align to part's alignment, round final size to 16 bytes (rv64 stack is 16-byte aligned) -4. **Sret handling**: if has_sret and not a tail call, first arg move writes the destination address to a0 -5. **Prepare arg moves**: for each argument: - - Create NativeCallPlanMove entries (src is desc→args[i] location, dst is the target register/stack) - - src_kind: NATIVE_CALL_MOVE_VALUE (load the value) or NATIVE_CALL_MOVE_ADDR (write addr-of) - - dst_kind: NATIVE_LOC_REG (a0–a7, fa0–fa7), NATIVE_LOC_STACK (sp+offset) - - For indirects on stack, compute sp+offset; use a scratch register (t0) to emit the address -6. **Return value setup**: if not a tail call and nresults > 0: - - Query abi→ret - - If ret.kind == ABI_ARG_INDIRECT: sret case, no return moves (copy happens in plan_ret) - - If ret.kind == ABI_ARG_DIRECT: create NativeCallPlanRet entries, one per part (a0, a1, fa0, fa1, etc.) 
- - Each rets[i]: src is the return register, dst is desc→results[0] (adjusted by part→src_offset) - -### RV64-specific Details - -- **Scratch registers**: t0 (x5) is available for address calculations -- **Parallel-copy cycle-breaking**: aa64 handles this with cycle detection; rv64 may not need it if the allocator cooperates, but emit conservatively (process register moves in topological order) -- **Frame anchoring**: sp is the stack pointer; stack args are at sp+0..sp+N (unlike aa64, which uses fp for incoming args) -- **Outgoing area growth**: track max_outgoing across all calls; update t→mc state or record patches if needed - -## Step 2: Implement emit_call (analog: aa64 lines 2805–2826) - -### Signature -```c -static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) { - // Emit the actual call instruction(s) -} -``` - -### Body Sketch - -1. **Tail call path** (plan→flags & CG_CALL_TAIL): - - Restore callee-saved registers (if needed) - - Restore frame (sp/fp adjustment) - - Emit indirect branch (jr) or PC-relative jump (jal with R_RV_CALL reloc) - - Return (no fallthrough to regular return path) -2. 
**Non-tail call path**: - - Emit argument moves (plan→args array): parallel copy of registers / stack stores - - If plan→has_sret and not tail: set a0 to destination pointer (already handled in plan_call) - - If plan→callee.kind == NATIVE_LOC_GLOBAL: emit auipc ra, 0; jalr ra, ra, 0 with R_RV_CALL reloc on auipc - - If plan→callee.kind == NATIVE_LOC_REG: emit jalr ra, <reg>, 0 - - Emit return-value collection (plan→rets): load each return register into its destination - -### RV64 ISA Notes - -- **jalr ra, ra, 0**: 2-instruction CALL sequence (auipc + jalr); R_RV_CALL relocation on auipc -- **jalr ra, <reg>, 0**: indirect call via register -- **jr <reg>** (pseudo → jalr x0, <reg>, 0): for tail calls -- **Scratch t0 (x5)** for intermediate calculations without clobbering live registers - -## Step 3: Implement plan_ret (analog: aa64 lines 2828–2889) - -### Signature -```c -static void rv_plan_ret(NativeTarget* t, const CGFuncDesc* fd, - const NativeLoc* values, u32 nvalues, - NativeCallPlanRet** out_rets, u32* out_nrets) { - // Plan the moves of return values into a0/a1 (int), fa0/fa1 (FP), or sret -} -``` - -### Body Sketch - -1. **Query ABI**: `const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd→fn_type);` -2. **No return case**: nvalues == 0 → set *out_rets = NULL, *out_nrets = 0, return -3. **Indirect return (sret)** — abi→ret.kind == ABI_ARG_INDIRECT: - - Load the sret pointer (spilled at entry in a reserved frame slot, or already in a0 for tail calls) - - Emit aggregate copy from values[0] to [a0] using memcpy - - No return moves needed (copy is in-place) - - Set *out_rets = NULL, *out_nrets = 0 -4. **Direct return** — abi→ret.kind == ABI_ARG_DIRECT: - - Allocate NativeCallPlanRet array - - For each part in abi→ret.parts: - - src: values[0] (adjusted by part→src_offset if aggregate) - - dst: a0/a1 for INT parts, fa0/fa1 for FP parts - - mem: MemAccess with part→size - - Set *out_rets and *out_nrets -5. 
**No ABI or void return**: *out_rets = NULL, *out_nrets = 0 - -### RV64 Details - -- **sret pointer location**: stored at function entry in a reserved frame slot (e.g., rv_sret_ptr_slot). Retrieve with sp/fp offset math. -- **Return register mapping**: part→cls == ABI_CLASS_FP → fa0 (f10), fa1 (f11); else a0 (x10), a1 (x11) -- **Part ordering**: emit parts in source order (part→src_offset) for aggregates - -## Step 4: Implement ret (analog: aa64 lines 2891–2894) - -### Signature -```c -static void rv_ret(NativeTarget* t) { - // Emit return (jump to epilogue or direct ret instruction) -} -``` - -### Body Sketch - -1. **Single-pass mode** (no known_frame): emit jal x0, <epilogue_label> (NOP-placeholder until patched) -2. **Known-frame mode** (optimized): epilogue is already emitted inline; just emit jr ra - -## Step 5: Helper Functions for ABI Queries - -### rv_param_abi (analog: aa64 lines 2385–2401) -```c -static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi, - const NativeCallDesc* desc, u32 i, - ABIArgInfo* scratch) { - if (abi && i < abi->nparams) return &abi->params[i]; - // Synthesize a default DIRECT + INT part for untyped/extern calls - memset(scratch, 0, sizeof *scratch); - scratch->kind = ABI_ARG_DIRECT; - scratch->nparts = 1; - scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1); - scratch->parts[0].cls = cg_type_is_float(t->c, desc->args[i].type) ? 
ABI_CLASS_FP : ABI_CLASS_INT; - scratch->parts[0].loc = ABI_LOC_REG; - scratch->parts[0].size = type_size32(t, desc->args[i].type); - scratch->parts[0].align = type_align32(t, desc->args[i].type); - scratch->parts[0].src_offset = 0; - return scratch; -} -``` - -### rv_part_scalar_type (analog: aa64 lines 2417–2436) -```c -static CfreeCgTypeId rv_part_scalar_type(const ABIArgPart* part) { - if (part->cls == ABI_CLASS_FP) { - if (part->size <= 4) return builtin_id(CFREE_CG_BUILTIN_F32); - return builtin_id(CFREE_CG_BUILTIN_F64); // rv64 locks F128 to F64 - } - switch (part->size) { - case 1: return builtin_id(CFREE_CG_BUILTIN_I8); - case 2: return builtin_id(CFREE_CG_BUILTIN_I16); - case 4: return builtin_id(CFREE_CG_BUILTIN_I32); - default: return builtin_id(CFREE_CG_BUILTIN_I64); - } -} -``` - -### rv_call_stack_size (analog: aa64 lines 2449–2489) -```c -static u32 rv_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); - u32 next_int = 0, next_fp = 0, stack = 0; - for (u32 i = 0; i < desc->nargs; ++i) { - ABIArgInfo tmp; - const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp); - int force_stack = abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams; - - if (ai->kind == ABI_ARG_IGNORE) continue; - if (force_stack) { - // Variadic args: round to 8 bytes, advance stack - stack += 8; - continue; - } - if (ai->kind == ABI_ARG_INDIRECT) { - if (next_int < 8) next_int++; - else stack += 8; - continue; - } - for (u32 p = 0; p < ai->nparts; ++p) { - const ABIArgPart* part = &ai->parts[p]; - if (part->cls == ABI_CLASS_FP) { - if (next_fp < 8) next_fp++; - else stack += 8; // FP part on stack - } else { - if (next_int < 8) next_int++; - else stack += 8; // INT part on stack - } - } - } - return (stack + 15) & ~15; // 16-byte align -} -``` - -### rv_signature_stack_bytes (analog: aa64 lines 2495–2506) -```c -static u32 rv_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId 
fn_type, - int* variadic, u32* nparams) { - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type); - NativeCallDesc d; - if (variadic) *variadic = abi ? (int)abi->variadic : 0; - if (nparams) *nparams = abi ? abi->nparams : 0; - memset(&d, 0, sizeof d); - d.fn_type = fn_type; - d.nargs = abi ? abi->nparams : 0; - if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs); - return rv_call_stack_size(t, &d); -} -``` - -### rv_call_stack_bytes (analog: aa64 lines 2508–2514) -```c -static u32 rv_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) { - return rv_call_stack_size(t, desc); -} -``` - -## Step 6: Register Parameter Binding (NativeDirectTarget path) - -The NativeDirectTarget adapter is in `/Users/ryan/code/cfree/src/cg/native_direct_target.h` (NativeOps struct, lines 66–94). For the -O0 semantic path, implement: - -### rv_bind_param (analog: aa64 lines 3616–3695 aa_bind_native_param) - -**Purpose**: Accept a CGParamDesc (semantic parameter) and route its incoming value from the ABI location (arg register or stack) to the user's allocated home (register or frame slot). - -**Signature**: -```c -static void rv_bind_param(NativeTarget* t, const CGParamDesc* p, - NativeLoc dst) { - // Route p from its ABI location to dst -} -``` - -**Body Sketch**: -1. Query ABI: `const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, rv_of(t)->func->fn_type);` -2. Get param ABIArgInfo: `const ABIArgInfo* ai = (p->index < abi->nparams) ? &abi->params[p->index] : NULL;` -3. If IGNORE or NULL: no action (only ABI cursor advances) -4. If INDIRECT: - - Load the pointer from the current int register (a0+next_param_int) or stack - - Copy aggregate from [pointer] to dst - - Advance next_param_int or next_param_stack -5. 
If DIRECT with parts: - - For each part: - - Determine source: a0+next_int, fa0+next_fp, or stack+offset - - Load from source into intermediate register (if needed) - - Move/store into dst (register or frame slot) - - Advance int/fp cursors and stack offset - -**Key fields to track** (arch-private state): -- `next_param_int`: cursor for a0..a7 -- `next_param_fp`: cursor for fa0..fa7 -- `next_param_stack`: sp-relative offset for stack params (round up to part alignment) - -## Step 7: Tail Call Unrealizability (NativeDirectTarget path) - -### rv_tail_call_unrealizable_reason - -**Purpose**: Return a blocker string if the tail call cannot be emitted, else NULL. - -**Signature**: -```c -static const char* rv_tail_call_unrealizable_reason(NativeDirectTarget* d, - const CGCallDesc* call) { - // Check if tail call is realizable given the current function's incoming stack args -} -``` - -**Body Sketch**: -1. Compute the outgoing stack-arg size for the call descriptor (use rv_call_stack_size) -2. Compare against incoming_stack_size (set at function entry) -3. If outgoing > incoming: return "rv64 tail call: stack argument area too small" -4. Else return NULL (realizable) - -## Integration Points - -### 1. NativeTarget Hook Registration (in rv_native_target_new or similar) - -```c -t->plan_call = rv_plan_call; -t->emit_call = rv_emit_call; -t->plan_ret = rv_plan_ret; -t->ret = rv_ret; -t->signature_stack_bytes = rv_signature_stack_bytes; -t->call_stack_bytes = rv_call_stack_bytes; -``` - -### 2. 
NativeOps Adapter Registration (for NativeDirectTarget) - -```c -static const NativeOps rv_direct_ops = { - .bind_param = rv_bind_native_param, // the NativeTarget version - .tail_call_unrealizable_reason = rv_no_tail, // or version that checks stack - .va_start_ = rv_va_start_, - .va_arg_ = rv_va_arg_, - .va_end_ = rv_va_end_, - .va_copy_ = rv_va_copy_, - .asm_block = rv_direct_asm_block, - .barrier = rv_direct_barrier, -}; -const NativeOps* rv64_native_direct_ops(void) { return &rv_direct_ops; } -``` - -But note: the NativeOps bind_param receives a **semantic CGParamDesc** and routes to a **semantic home** (CGLocal), which is wrapped in a NativeDirectLocal. For the NativeTarget emission path, a separate bind_param hook on the NativeTarget is NOT called directly by the optimizer; instead, the backend's own prologue setup handles parameter binding. The NativeDirectTarget adapter is only for -O0. - -## Key Distinctions: aa64 vs rv64 - -| Aspect | aa64 | rv64 | -|--------|------|------| -| Arg registers | a0–a7 (x0–x7) + fa0–fa7 (v0–v7 FP) | a0–a7 (x10–x17) + fa0–fa7 (f10–f17) | -| Incoming frame anchor | fp (x29) | sp (x2) | -| Outgoing frame anchor | sp (x31) | sp (x2) | -| Variadic stack behavior | vararg_on_stack (Apple) | always uses int regs then stack | -| Return registers | a0/a1 (x0–x1), v0–v1 (FP) | a0/a1 (x10–x11), fa0–fa1 (f10–f11) | -| sret pointer reg | x8 (always indirect) | a0 (first arg) | -| Stack alignment | 16 bytes | 16 bytes | -| Longest scalar type | F128 | double (F128 deferred) | - -## Notes - -- **Parallel-copy semantics**: aa64 handles cycles explicitly (aa_emit_reg_arg_moves). rv64 may rely on the allocator to avoid cycles; emit conservatively. -- **Scratch registers**: rv64 uses t0 (x5), t1 (x6), t2 (x7) for temporaries; don't clobber live argument registers. -- **Frame pointer**: rv64 uses s0 (x8) as fp; sp is x2. Adjust offset calculations accordingly. 
-- **ABI queries must go through abi_cg_func_info and friends**; no hardcoding of register numbers. -- **Tail call forwarding**: if has_sret and (flags & CG_CALL_TAIL), a0 already holds the incoming sret pointer (forwarded); no need to load it again. - - - - ---- - -# RV64 NativeTarget Porting Guide — GROUP 5: Atomics, Variadics, Inline ASM, Intrinsics, Finalize - -## Overview -This guide covers the five functional groups at the tail of the `src/arch/rv64/native.c` implementation: atomics (load/store/RMW/CAS/fence), variadic support (va_start_/va_arg_/va_end_/va_copy_), inline assembly (asm_block + direct path), compiler intrinsics, and finalization hooks (trap, set_loc, finalize, destroy). - -The reference implementation is **aa64 native.c** (4557 lines total). The legacy rv64 code exists but does not compile; it provides correct ISA/ABI logic that must be ported. The -O0 path uses the **NativeOps** adapter (native_direct_target.h); the optimizer path uses **NativeTarget** hooks directly. - ---- - -## File Lifecycle: Keep vs. 
Delete - -**DELETE** (old legacy single-pass code; not ported): -- `src/arch/rv64/ops.c` — semantic CG lowering (replaced by NativeTarget hooks) -- `src/arch/rv64/emit.c` — emission driver (replaced by MCEmitter in NativeTarget) -- `src/arch/rv64/alloc.c` — register allocation (replaced by NativeAllocClass + optimizer) -- `src/arch/rv64/opt_coord.c` — optimizer coordination (not part of native backend) -- `src/arch/rv64/internal.h` — old RvImpl state struct (replaced by RvNativeTarget private state) - -**KEEP** (ISA encoding, disasm, link, debug, register info): -- `src/arch/rv64/isa.h` — instruction encoders (RV_A, RV_I extensions) -- `src/arch/rv64/isa.c` — disassembler / utility decoders -- `src/arch/rv64/regs.c` — register file info (migrate to NativeRegInfo hooks) -- `src/arch/rv64/regs.h` — register enum definitions -- `src/arch/rv64/link.c` — object linker integration -- `src/arch/rv64/dbg.c` — DWARF debug info emission -- `src/arch/rv64/disasm.c/disasm.h` — disassembler (if needed for diagnostics) -- `src/arch/rv64/emu.c` — emulator / JIT (independent of CG) -- `src/arch/rv64/arch.c` — arch initialization hooks -- `src/arch/rv64/asm.c/asm.h` — assembler (inline asm binding) -- `src/arch/rv64/rv64.h` — public header - ---- - -## GROUP 5A: Atomics (MemOrder → RISC-V A-Extension) - -### Design Overview -- **Memory ordering mapping**: MemOrder enum (relaxed, acquire, release, acq_rel, seq_cst) → RISC-V .aq/.rl bits on LR/SC and fence instructions -- **Loop strategy**: LR.W/D (load-reserved) + SC.W/D (store-conditional) retry loop for all ops (preferred over per-op AMO for simplicity) -- **Alternative**: AMO instructions (AMOADD, AMOAND, AMOOR, AMOXOR, AMOSWAP with .aq/.rl) for hot paths (deferred) -- **Spill slot for CAS**: unlike aa64 which uses a single saved-tmp-reg, rv64 can allocate temp regs freely, so no backend spill slot needed unless aggressive optimization desired - -### Source Files for ISA Encoding -- `src/arch/rv64/isa.h` lines 520–529: 
`rv_lr_w/d`, `rv_sc_w/d` function signatures -- `src/arch/rv64/ops.c` lines 1917–2109: complete atomic_load/store/rmw/cas/fence legacy implementation - -### NativeTarget Hook Signatures (src/arch/native_target.h, lines 396–405) -```c -void (*atomic_load)(NativeTarget*, NativeLoc dst, NativeAddr addr, MemAccess, MemOrder); -void (*atomic_store)(NativeTarget*, NativeAddr addr, NativeLoc src, MemAccess, MemOrder); -void (*atomic_rmw)(NativeTarget*, AtomicOp, NativeLoc dst, NativeAddr addr, - NativeLoc val, MemAccess, MemOrder); -void (*atomic_cas)(NativeTarget*, NativeLoc prior, NativeLoc ok, - NativeAddr addr, NativeLoc expected, NativeLoc desired, - MemAccess, MemOrder success, MemOrder failure); -void (*fence)(NativeTarget*, MemOrder); -``` - -### Implementation Body Sketches - -#### 1. `rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, MemAccess mem, MemOrder order)` -**Location**: `src/arch/rv64/native.c` (new file) - -**Pseudo-C**: -```c -static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, MemAccess mem, MemOrder order) { - u32 base_reg = RV_T0; /* or use first available scratch */ - u32 dst_reg = dst.v.reg & 0x1f; - u32 size = mem.size ? mem.size : type_size32(t, dst.type); - - /* Materialize address base into base_reg */ - rv_atomic_addr_reg(t, addr, base_reg); - - int aq = mem_order_is_acquire(order); - int rl = 0; /* LR ignores rl bit, but API expects it */ - - if (aq) { - /* lr.w/d dst, (base) with aq=1 */ - u32 enc = (size == 8) - ? 
rv_lr_d(dst_reg, base_reg, aq, rl) - : rv_lr_w(dst_reg, base_reg, aq, rl); - rv64_emit32(t->mc, enc); - } else { - /* Plain load (relaxed read) */ - u32 enc = enc_int_load(size, 0, dst_reg, base_reg, 0); - rv64_emit32(t->mc, enc); - } - - if (order == MO_SEQ_CST) { - /* fence rw,rw after acquire-load for seq_cst */ - rv64_emit32(t->mc, rv_fence_rw_rw()); - } -} -``` - -**Key points**: -- LR.W/D with aq=1 serves as acquire-load; plain load for relaxed/release (rl is ignored on loads) -- SEQ_CST requires a FENCE after the load to satisfy full ordering -- Address materialization delegates to the `rv_atomic_addr_reg(t, addr, base_reg)` helper (similar to aa64's aa_atomic_addr_reg) - ---- - -#### 2. `rv_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src, MemAccess mem, MemOrder order)` - -**Pseudo-C**: -```c -static void rv_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src, MemAccess mem, MemOrder order) { - u32 base_reg = RV_T0; - u32 src_reg = src.v.reg & 0x1f; - u32 size = mem.size ? mem.size : type_size32(t, src.type); - - if (order == MO_SEQ_CST) { - /* fence rw,rw before seq_cst write */ - rv64_emit32(t->mc, rv_fence_rw_rw()); - } - - rv_atomic_addr_reg(t, addr, base_reg); - - int rl = mem_order_is_release(order); - - if (rl) { - /* fence rw,w before release store (conservative) */ - rv64_emit32(t->mc, rv_fence_rw_w()); - u32 enc = enc_int_store(size, src_reg, base_reg, 0); - rv64_emit32(t->mc, enc); - } else { - /* Plain store (relaxed) */ - u32 enc = enc_int_store(size, src_reg, base_reg, 0); - rv64_emit32(t->mc, enc); - } - - if (order == MO_SEQ_CST) { - /* fence rw,rw after seq_cst write */ - rv64_emit32(t->mc, rv_fence_rw_rw()); - } -} -``` - -**Key points**: -- Plain RISC-V stores (SW/SD) carry no ordering annotation (naturally aligned stores are still single-copy atomic, but unordered); use FENCE to provide release/seq_cst semantics -- Release store uses fence-rw-w (release) + plain store; SEQ_CST wraps in full fences - ---- - -#### 3. 
`rv_atomic_rmw(NativeTarget* t, AtomicOp op, NativeLoc dst, NativeAddr addr, NativeLoc val, MemAccess mem, MemOrder order)` - -**Pseudo-C**: -```c -static void rv_atomic_rmw(NativeTarget* t, AtomicOp op, NativeLoc dst, NativeAddr addr, - NativeLoc val, MemAccess mem, MemOrder order) { - MCEmitter* mc = t->mc; - u32 base_reg = RV_T0; - u32 dst_reg = dst.v.reg & 0x1f; - u32 val_reg = RV_T1; - u32 new_reg = RV_T2; /* computed result in loop */ - u32 status = RV_T3; /* SC.W/D's failure flag */ - u32 size = mem.size ? mem.size : type_size32(t, dst.type); - int sf = (size == 8) ? 1 : 0; - - if (order == MO_SEQ_CST) { - rv64_emit32(mc, rv_fence_rw_rw()); - } - - /* Materialize val into val_reg if not already there */ - if (val.kind == NATIVE_LOC_IMM) { - rv64_emit_load_imm(mc, sf, val_reg, val.v.imm); - } else { - rv64_emit32(mc, rv_addi(val_reg, val.v.reg, 0)); - } - - rv_atomic_addr_reg(t, addr, base_reg); - - int aq = mem_order_is_acquire(order); - int rl = mem_order_is_release(order); - - MCLabel retry = mc->label_new(mc); - mc->label_place(mc, retry); - - /* LR.W/D: load-reserve current value into dst */ - u32 enc = sf ? rv_lr_d(dst_reg, base_reg, aq, 0) - : rv_lr_w(dst_reg, base_reg, aq, 0); - rv64_emit32(mc, enc); - - /* Compute: new = f(dst, val) based on op */ - switch (op) { - case AO_XCHG: - rv64_emit32(mc, rv_addi(new_reg, val_reg, 0)); - break; - case AO_ADD: - rv64_emit32(mc, sf ? rv_add(new_reg, dst_reg, val_reg) - : rv_addw(new_reg, dst_reg, val_reg)); - break; - case AO_SUB: - rv64_emit32(mc, sf ? 
rv_sub(new_reg, dst_reg, val_reg) - : rv_subw(new_reg, dst_reg, val_reg)); - break; - case AO_AND: - rv64_emit32(mc, rv_and(new_reg, dst_reg, val_reg)); - break; - case AO_OR: - rv64_emit32(mc, rv_or(new_reg, dst_reg, val_reg)); - break; - case AO_XOR: - rv64_emit32(mc, rv_xor(new_reg, dst_reg, val_reg)); - break; - case AO_NAND: - rv64_emit32(mc, rv_and(new_reg, dst_reg, val_reg)); - rv64_emit32(mc, rv_xori(new_reg, new_reg, -1)); - break; - default: - /* Unsupported op */ - compiler_panic(t->c, (SrcLoc){0, 0, 0}, "rv64: unsupported atomic rmw op"); - break; - } - - /* SC.W/D: try to store new value; status != 0 on failure */ - enc = sf ? rv_sc_d(status, base_reg, new_reg, 0, rl) - : rv_sc_w(status, base_reg, new_reg, 0, rl); - rv64_emit32(mc, enc); - - /* If status != 0 (SC failed), retry */ - rv64_emit32(mc, rv_bne(status, RV_ZERO, 0)); - mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0); - - if (order == MO_SEQ_CST) { - rv64_emit32(mc, rv_fence_rw_rw()); - } -} -``` - -**Key points**: -- LR.W/D / SC.W/D pair forms the core atomic RMW loop -- status register (T3) holds SC result (0 = success, != 0 = retry needed) -- Ordering: aq on LR, rl on SC; SEQ_CST wraps in full fences -- Each op (ADD, SUB, AND, OR, XOR, NAND) is computed in a temp reg between LR and SC - ---- - -#### 4. `rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, NativeAddr addr, NativeLoc expected, NativeLoc desired, MemAccess mem, MemOrder success, MemOrder failure)` - -**Pseudo-C**: -```c -static void rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, - NativeAddr addr, NativeLoc expected, NativeLoc desired, - MemAccess mem, MemOrder success, MemOrder failure) { - MCEmitter* mc = t->mc; - u32 base_reg = RV_T0; - u32 prior_reg = prior.v.reg & 0x1f; - u32 exp_reg = RV_T1; - u32 des_reg = RV_T2; - u32 status = RV_T3; - u32 ok_reg = ok.v.reg & 0x1f; - u32 size = mem.size ? mem.size : type_size32(t, prior.type); - int sf = (size == 8) ? 
1 : 0; - - if (success == MO_SEQ_CST || failure == MO_SEQ_CST) { - rv64_emit32(mc, rv_fence_rw_rw()); - } - - /* Materialize expected and desired into temp regs */ - if (expected.kind == NATIVE_LOC_IMM) { - rv64_emit_load_imm(mc, sf, exp_reg, expected.v.imm); - } else { - rv64_emit32(mc, rv_addi(exp_reg, expected.v.reg, 0)); - } - - if (desired.kind == NATIVE_LOC_IMM) { - rv64_emit_load_imm(mc, sf, des_reg, desired.v.imm); - } else { - rv64_emit32(mc, rv_addi(des_reg, desired.v.reg, 0)); - } - - rv_atomic_addr_reg(t, addr, base_reg); - - int aq = mem_order_is_acquire(success) || mem_order_is_acquire(failure); - int rl = mem_order_is_release(success); - - MCLabel retry = mc->label_new(mc); - MCLabel fail = mc->label_new(mc); - MCLabel done = mc->label_new(mc); - - mc->label_place(mc, retry); - - /* LR.W/D: load-reserve prior value */ - u32 enc = sf ? rv_lr_d(prior_reg, base_reg, aq, 0) - : rv_lr_w(prior_reg, base_reg, aq, 0); - rv64_emit32(mc, enc); - - /* if (prior != expected) goto fail */ - rv64_emit32(mc, rv_bne(prior_reg, exp_reg, 0)); - mc->emit_label_ref(mc, fail, R_RV_BRANCH, 4, 0); - - /* SC.W/D: try to store desired */ - enc = sf ? 
rv_sc_d(status, base_reg, des_reg, 0, rl) - : rv_sc_w(status, base_reg, des_reg, 0, rl); - rv64_emit32(mc, enc); - - /* if (status != 0) goto retry */ - rv64_emit32(mc, rv_bne(status, RV_ZERO, 0)); - mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0); - - /* ok = 1; goto done */ - rv64_emit_load_imm(mc, 0, ok_reg, 1); - rv64_emit32(mc, rv_jal(RV_ZERO, 0)); - mc->emit_label_ref(mc, done, R_RV_JAL, 4, 0); - - /* fail label: ok = 0 */ - mc->label_place(mc, fail); - rv64_emit_load_imm(mc, 0, ok_reg, 0); - - /* done label */ - mc->label_place(mc, done); - - if (success == MO_SEQ_CST || failure == MO_SEQ_CST) { - rv64_emit32(mc, rv_fence_rw_rw()); - } -} -``` - -**Key points**: -- Three labels: retry (LR loop), fail (mismatch exit), done (all-done exit) -- ok output is 1 on success, 0 on failure -- Failure ordering gets no separate code path in the simple LR/SC model: success and failure share one LR (aq set if either order is acquire) and one SC (rl taken from the success order) -- Fence placement respects both success and failure orders (conservative: if either is SEQ_CST, full fence) - ---- - -#### 5. 
`rv_fence(NativeTarget* t, MemOrder order)` - -**Pseudo-C**: -```c -static void rv_fence(NativeTarget* t, MemOrder order) { - if (order == MO_RELAXED) return; - /* All other orders use fence rw,rw (full barrier) */ - rv64_emit32(t->mc, rv_fence_rw_rw()); -} -``` - -**Key points**: -- RISC-V FENCE instruction with pred=rw, succ=rw is the full memory barrier -- Relaxed needs no fence -- Acquire/release/seq_cst all use rw,rw; fine-grained pred/succ bits are a refinement (not yet needed) - ---- - -## GROUP 5B: Variadics (LP64D Save-Area Spill) - -### Design Overview -- **va_list layout**: single 8-byte pointer (ABI_VA_LIST_POINTER) pointing to the next argument slot -- **Prologue**: variadic functions spill a_{nparams_int}..a7 (unused GP regs) into a save area at [s0 + 16] (top of callee frame, above saved s0/ra pair) -- **Calling convention**: va_arg advances the pointer by 8 bytes per call; all variadic args sit in the same save area regardless of type (integer regs are bit-cast to FP when needed) -- **Setup**: next_param_int cursor in RvNativeTarget tracks how many GP regs have been bound as fixed params; variadic spill begins after that - -### Source Files for ABI & Lowering -- `src/abi/abi_rv64.c` lines 223–251: ABIVaListInfo initialization (.kind = ABI_VA_LIST_POINTER) -- `src/abi/abi.h` lines 30–50: ABIVaListKind enum and ABIVaListInfo struct -- `src/arch/rv64/ops.c` lines 1846–1905: legacy va_start_/va_arg_/va_end_/va_copy_ implementation - -### NativeTarget Hook Signatures (src/arch/native_target.h, lines 406–417) -```c -void (*va_start_)(NativeTarget*, NativeLoc ap_ptr); -void (*va_arg_)(NativeTarget*, NativeLoc dst, NativeLoc ap_ptr, CfreeCgTypeId type); -void (*va_end_)(NativeTarget*, NativeLoc ap_ptr); -void (*va_copy_)(NativeTarget*, NativeLoc dst_ap_ptr, NativeLoc src_ap_ptr); -``` - -### NativeOps Adapter (for -O0 path, src/cg/native_direct_target.h lines 81–86) -```c -void (*va_start_)(NativeDirectTarget*, Operand ap_addr); -void 
(*va_arg_)(NativeDirectTarget*, Operand dst, Operand ap_addr, CfreeCgTypeId type); -void (*va_end_)(NativeDirectTarget*, Operand ap_addr); -void (*va_copy_)(NativeDirectTarget*, Operand dst_ap_addr, Operand src_ap_addr); -``` - -### Frame Layout State (RvNativeTarget private struct fields) -```c -u32 next_param_int; /* count of integer register params bound so far (0–8) */ -u32 next_param_stack; /* byte offset of next stack arg past fixed params */ -u8 is_variadic; /* function is variadic */ -``` - -### Implementation Body Sketches - -#### 1. `rv_va_start_(NativeTarget* t, NativeLoc ap_ptr)` - -**Pseudo-C**: -```c -static void rv_va_start_(NativeTarget* t, NativeLoc ap_ptr) { - RvNativeTarget* a = rv_of(t); - MCEmitter* mc = t->mc; - - /* ap_ptr is a register or frame slot holding &va_list (i.e., the address - * where va_start writes the initial ap pointer). */ - - /* Compute first-variadic-slot address: s0 + 16 + next_param_int*8 */ - u32 ap_base_reg = RV_T0; - u32 ap_value_reg = RV_T1; - i32 offset = 16 + (i32)(a->next_param_int * 8u); - - /* t0 = s0 + offset */ - rv_emit_add_imm(mc, ap_base_reg, RV_S0, offset); - - /* Store t0 into the va_list location (*ap_ptr = t0) */ - rv_emit_store(mc, ap_base_reg, ap_ptr, 0, 8); /* Store t0 @ ap_ptr */ -} -``` - -**For -O0 path (NativeOps: rv_va_start_ in native_direct_target.h)**: - -```c -static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) { - RvNativeTarget* a = rv_of(d->native); - MCEmitter* mc = d->native->mc; - - /* ap_addr is a semantic Operand (local, immediate offset, etc.). - * Materialize its address into a register. 
*/ - u32 ap_ptr_reg = RV_T0; - rv_direct_materialize_addr(d, ap_addr, ap_ptr_reg); - - /* Compute ap value: s0 + 16 + next_param_int*8 */ - u32 ap_val_reg = RV_T1; - i32 offset = 16 + (i32)(a->next_param_int * 8u); - rv_emit_add_imm(mc, ap_val_reg, RV_S0, offset); - - /* *ap_ptr = ap_val */ - rv_emit_store(mc, ap_val_reg, ap_ptr_reg, 0, 8); -} -``` - -**Key points**: -- RV64 LP64D: all variadic arguments (int and FP) are spilled to the integer save area -- Offset 16 is above the saved s0/ra pair at [s0+0] and [s0+8] -- next_param_int tracks how many of {a0..a7} are already consumed by fixed params - ---- - -#### 2. `rv_va_arg_(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, CfreeCgTypeId type)` - -**Pseudo-C**: -```c -static void rv_va_arg_(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, - CfreeCgTypeId type) { - MCEmitter* mc = t->mc; - u32 size = type_size32(t, type); - int is_fp = cg_type_is_float(t->c, type); - - /* Load ap_ptr's current value into t0 (the current ap) */ - u32 ap_reg = RV_T0; - u32 val_reg = RV_T1; - u32 next_ap_reg = RV_T2; - - /* t0 = *ap_ptr (load current va_list pointer) */ - rv_emit_load(mc, ap_reg, ap_ptr, 0, 8); - - /* Load value from [t0] into val_reg or dst (depending on type) */ - if (is_fp && size == 8) { - /* FP8: load double from [t0], bit-cast to FP register */ - rv64_emit32(mc, rv_ld(RV_T1, ap_reg, 0)); /* t1 = *ap (int64) */ - rv64_emit32(mc, rv_fmv_d_x(dst.v.reg, RV_T1)); /* dst.fp = bit_cast(t1) */ - } else if (is_fp && size == 4) { - /* FP4: load word, bit-cast to FP */ - rv64_emit32(mc, rv_lw(RV_T1, ap_reg, 0)); - rv64_emit32(mc, rv_fmv_w_x(dst.v.reg, RV_T1)); - } else { - /* Integer: load with sign extension based on type signedness */ - int sx = type_is_signed(type); - u32 enc = enc_int_load(size, sx, dst.v.reg, ap_reg, 0); - rv64_emit32(mc, enc); - } - - /* Advance ap_ptr by 8 bytes: t2 = t0 + 8 */ - rv64_emit32(mc, rv_addi(next_ap_reg, ap_reg, 8)); - - /* Store back: *ap_ptr = t2 */ - rv64_emit32(mc, 
rv_sd(next_ap_reg, ap_ptr.v.reg, 0)); -} -``` - -**For -O0 path (NativeOps)**: - -```c -static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, - CfreeCgTypeId type) { - MCEmitter* mc = d->native->mc; - u32 size = type_size32(d->native, type); - int is_fp = cg_type_is_float(d->base.c, type); - - /* Materialize ap_addr into a register holding &va_list */ - u32 ap_ptr_reg = RV_T0; - rv_direct_materialize_addr(d, ap_addr, ap_ptr_reg); - - /* Load current va_list value */ - u32 ap_reg = RV_T1; - rv64_emit32(mc, rv_ld(ap_reg, ap_ptr_reg, 0)); - - /* Load value from [ap] */ - u32 dst_reg = rv_dst_reg(d, dst); - if (is_fp && size == 8) { - rv64_emit32(mc, rv_ld(RV_T2, ap_reg, 0)); - rv64_emit32(mc, rv_fmv_d_x(dst_reg, RV_T2)); - } else if (is_fp && size == 4) { - rv64_emit32(mc, rv_lw(RV_T2, ap_reg, 0)); - rv64_emit32(mc, rv_fmv_w_x(dst_reg, RV_T2)); - } else { - int sx = type_is_signed(type); - u32 enc = enc_int_load(size, sx, dst_reg, ap_reg, 0); - rv64_emit32(mc, enc); - } - - /* Advance ap: t2 = t1 + 8; *ap_ptr = t2 */ - rv64_emit32(mc, rv_addi(RV_T2, ap_reg, 8)); - rv64_emit32(mc, rv_sd(RV_T2, ap_ptr_reg, 0)); - - /* If dst is a memory location, store dst_reg into it */ - if (dst.kind != OPK_REG) { - rv_direct_store_reg_to_operand(d, dst, dst_reg); - } -} -``` - -**Key points**: -- All variadic args occupy 8-byte slots (even int32 and float32) -- FP args are stored in the integer save area; va_arg bit-casts them back via fmv_d_x / fmv_w_x -- Sign extension applies to integer variadic args per type signedness -- ap advances by 8 unconditionally - ---- - -#### 3. 
`rv_va_end_(NativeTarget* t, NativeLoc ap_ptr)` and `rv_va_copy_(NativeTarget* t, NativeLoc dst_ap_ptr, NativeLoc src_ap_ptr)` - -**va_end** is a no-op for pointer-based va_list: -```c -static void rv_va_end_(NativeTarget* t, NativeLoc ap_ptr) { - (void)t; - (void)ap_ptr; -} -``` - -**va_copy** copies the 8-byte pointer: -```c -static void rv_va_copy_(NativeTarget* t, NativeLoc dst_ap_ptr, NativeLoc src_ap_ptr) { - MCEmitter* mc = t->mc; - - /* t0 = *src_ap_ptr; *dst_ap_ptr = t0 */ - u32 tmp_reg = RV_T0; - rv64_emit32(mc, rv_ld(tmp_reg, src_ap_ptr.v.reg, 0)); - rv64_emit32(mc, rv_sd(tmp_reg, dst_ap_ptr.v.reg, 0)); -} -``` - -**Key points**: -- LP64D pointer va_list is 8 bytes; simple load/store copy -- No complex save-area state like AAPCS64 - ---- - -## GROUP 5C: Inline Assembly (NativeTarget + NativeOps) - -### Design Overview -- Two paths: **NativeOps direct** (aa64 aa_direct_asm_block, lines 4293–4395) and **NativeTarget** (aa64 aa_asm_block_native, lines 4480–4545) -- Direct path: -O0, self-allocates registers for operands, loads/stores operands, binds to template -- NativeTarget path: optimizer has pre-allocated all registers; backend only materializes memory bases and saves/restores clobbered callee-saves -- Template binding uses aa64_inline_bind (constraint parsing, named operands, tied operands) -- RV64 must adapt the same pattern with RV64-specific constraints: r (int), f (FP), i (imm), m (mem) -- **asm.h pseudo-kinds**: aa64 adds AA64_INLINE_OPK_REG (0xf0) and AA64_INLINE_OPCLS_{INT,FP}; rv64 must add equivalent RV_INLINE_OPK_REG and RV_INLINE_OPCLS_{INT,FP} - -### Source Files -- `src/arch/aa64/native.c` lines 4150–4395: direct-path constraint/register handling (aa_asm_alloc_reg, aa_asm_constraint_class, etc.) 
-- `src/arch/aa64/native.c` lines 4409–4545: memory-operand materialization and native-path asm_block -- `src/arch/aa64/asm.h` lines 27–32: AA64_INLINE_OPK_REG, AA64_INLINE_OPCLS_* pseudo-kinds -- `src/arch/rv64/asm.c` / `src/arch/rv64/asm.h`: existing assembler (to be adapted for inline asm) - -### NativeTarget Hook Signature (native_target.h, lines 420–423) -```c -void (*asm_block)(NativeTarget*, const char* tmpl, const AsmConstraint* outs, - u32 nout, NativeLoc* out_locs, const AsmConstraint* ins, - u32 nin, const NativeLoc* in_locs, const Sym* clobbers, - u32 nclob); -``` - -### NativeOps Hook Signature (native_direct_target.h, lines 88–91) -```c -void (*asm_block)(NativeDirectTarget*, const char* tmpl, - const AsmConstraint* outs, u32 nout, Operand* out_ops, - const AsmConstraint* ins, u32 nin, const Operand* in_ops, - const Sym* clobbers, u32 nclob); -``` - -### Constraint Parsing & Register Allocation (Direct Path Only) -Both paths need these helpers in asm.c or native.c: - -```c -/* Constraint string introspection */ -static const char* rv_asm_constraint_body(const char* s) { - if (!s) return ""; - if (s[0] == '=' && s[1] == '&') return s + 2; /* =& early-clobber output */ - if (s[0] == '=' || s[0] == '+' || s[0] == '&') return s + 1; - return s; -} - -static int rv_asm_constraint_early(const char* s) { - if (!s) return 0; - return (s[0] == '=' && s[1] == '&') || s[0] == '&'; -} - -static int rv_asm_match_index(const char* s) { - int n = 0; - if (!s || s[0] < '0' || s[0] > '9') return -1; - for (const char* p = s; *p >= '0' && *p <= '9'; ++p) { - n = n * 10 + (*p - '0'); - } - return n; -} - -static NativeAllocClass rv_asm_constraint_class(NativeDirectTarget* d, - const char* body) { - if (body[0] == 'r') return NATIVE_REG_INT; - if (body[0] == 'f') return NATIVE_REG_FP; - /* Panic for unsupported constraint */ - compiler_panic(d->base.c, d->loc, "rv64 asm: unsupported constraint '%s'", body); - return NATIVE_REG_INT; -} - -/* Register allocation for the 
direct path (scratch pools) */ -static Reg rv_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls, - u32* used_int, u32* used_fp) { - /* Allocable scratch: t0-t6 for int (x5–x7, x28–x31), ft0-ft7 for FP (f0–f7) */ - static const Reg int_pool[] = {5, 6, 7, 28, 29, 30, 31}; - static const Reg fp_pool[] = {0, 1, 2, 3, 4, 5, 6, 7}; - const Reg* pool = (cls == NATIVE_REG_FP) ? fp_pool : int_pool; - u32 n = (cls == NATIVE_REG_FP) ? 8u : 7u; - u32* used = (cls == NATIVE_REG_FP) ? used_fp : used_int; - - for (u32 i = 0; i < n; ++i) { - Reg r = pool[i]; - if ((*used & (1u << r)) != 0) continue; - *used |= 1u << r; - return r; - } - compiler_panic(d->base.c, d->loc, "rv64 asm: out of registers for operands"); - return REG_NONE; -} -``` - -### asm.h Pseudo-Kinds for RV64 - -Add to `src/arch/rv64/asm.h`: -```c -enum RvAsmPseudoOperandKind { - RV_INLINE_OPK_REG = 0xf0u, -}; - -enum RvAsmOperandClass { - RV_INLINE_OPCLS_INT = 0u, - RV_INLINE_OPCLS_FP = 1u, -}; -``` - -### Implementation Body Sketches - -#### Direct Path: `rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl, const AsmConstraint* outs, u32 nout, Operand* out_ops, const AsmConstraint* ins, u32 nin, const Operand* in_ops, const Sym* clobbers, u32 nclob)` - -**Pseudo-C** (lines ~4000–4300 in native.c): -```c -static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl, - const AsmConstraint* outs, u32 nout, - Operand* out_ops, const AsmConstraint* ins, - u32 nin, const Operand* in_ops, - const Sym* clobbers, u32 nclob) { - Operand* bound_outs = nout ? arena_zarray(d->base.c->tu, Operand, nout) : NULL; - Operand* bound_ins = nin ? 
arena_zarray(d->base.c->tu, Operand, nin) : NULL; - u32 clob_int, clob_fp, used_int, used_fp; - RvAsmSavedClobber* saved; - u32 nsaved; - - /* Parse clobber list into bitmasks */ - rv_asm_clobber_masks(d->base.c, d->loc, clobbers, nclob, &clob_int, &clob_fp); - - /* Reserve scratch: t0,t1 + call-saved regs + sp/gp/tp */ - used_int = clob_int | (1u << RV_T0) | (1u << RV_T1) | (1u << RV_S0) | - (1u << RV_SP) | (1u << RV_GP) | (1u << RV_TP); - used_fp = clob_fp; - - /* Bind outputs: allocate registers, load initial values if inout */ - for (u32 i = 0; i < nout; ++i) { - const char* body = rv_asm_constraint_body(outs[i].str); - if (body[0] == 'r' || body[0] == 'f') { - NativeAllocClass cls = rv_asm_constraint_class(d, body); - Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp); - CfreeCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; - rv_asm_bound_reg(&bound_outs[i], type, cls, reg); - if (outs[i].dir == ASM_INOUT) { - /* Inout: load initial value from out_ops[i] into allocated reg */ - NativeLoc loc = rv_reg_loc(type, cls, reg); - rv_direct_load_operand_to_reg(d, out_ops[i], loc); - } - } else if (body[0] == 'm') { - /* Memory output: allocate base register */ - Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); - NativeLoc loc = rv_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, reg); - CfreeCgTypeId type = outs[i].type ? 
outs[i].type : out_ops[i].type; - rv_direct_load_address_to_reg(d, out_ops[i], loc); - rv_asm_bound_mem(&bound_outs[i], type, reg); - } else { - compiler_panic(d->base.c, d->loc, "rv64 asm: unsupported output constraint"); - } - } - - /* Bind inputs: match to outputs if tied, else allocate / load */ - for (u32 i = 0; i < nin; ++i) { - const char* body = rv_asm_constraint_body(ins[i].str); - int matched = rv_asm_match_index(body); - if (matched >= 0) { - if ((u32)matched >= nout) - compiler_panic(d->base.c, d->loc, "rv64 asm: matching constraint out of range"); - if (rv_asm_constraint_early(outs[matched].str)) - compiler_panic(d->base.c, d->loc, "rv64 asm: matching input ties early-clobber output"); - if (bound_outs[matched].kind != RV_INLINE_OPK_REG) - compiler_panic(d->base.c, d->loc, "rv64 asm: matching constraint requires register output"); - bound_ins[i] = bound_outs[matched]; - /* Load input value into the matched register */ - rv_direct_load_operand_to_reg(d, in_ops[i], - rv_reg_loc(bound_ins[i].type, - bound_ins[i].pad[0] == RV_INLINE_OPCLS_FP - ? NATIVE_REG_FP : NATIVE_REG_INT, - (Reg)bound_ins[i].v.local)); - continue; - } - - if (body[0] == 'r' || body[0] == 'f') { - NativeAllocClass cls = rv_asm_constraint_class(d, body); - Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp); - CfreeCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; - rv_asm_bound_reg(&bound_ins[i], type, cls, reg); - rv_direct_load_operand_to_reg(d, in_ops[i], rv_reg_loc(type, cls, reg)); - } else if (body[0] == 'i') { - if (in_ops[i].kind != OPK_IMM) - compiler_panic(d->base.c, d->loc, "rv64 asm: immediate constraint requires immediate"); - bound_ins[i] = in_ops[i]; - } else if (body[0] == 'm') { - Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); - NativeLoc loc = rv_reg_loc(builtin_id(CFREE_CG_BUILTIN_I64), NATIVE_REG_INT, reg); - CfreeCgTypeId type = ins[i].type ? 
ins[i].type : in_ops[i].type; - rv_direct_load_address_to_reg(d, in_ops[i], loc); - rv_asm_bound_mem(&bound_ins[i], type, reg); - } else { - compiler_panic(d->base.c, d->loc, "rv64 asm: unsupported input constraint"); - } - } - - /* Save clobbered callee-saved regs */ - saved = rv_asm_save_callee_clobbers(rv_of(d->native), clob_int, clob_fp, &nsaved); - - /* Open assembler, bind operands, run template */ - RvAsm* a = rv_asm_open(d->base.c); - rv_inline_bind(a, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); - rv_asm_run_template(a, d->native->mc, tmpl); - rv_asm_close(a); - - /* Store output results back to operands */ - for (u32 i = 0; i < nout; ++i) { - NativeAllocClass cls; - NativeLoc src; - if (bound_outs[i].kind != RV_INLINE_OPK_REG) continue; - cls = bound_outs[i].pad[0] == RV_INLINE_OPCLS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; - src = rv_reg_loc(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); - rv_direct_store_reg_to_operand(d, out_ops[i], src); - } - - /* Restore clobbered callee-saves */ - for (u32 i = nsaved; i > 0; --i) { - rv_asm_restore_one(rv_of(d->native), &saved[i - 1u]); - } -} -``` - -**Helper functions for the direct path**: - -```c -static void rv_asm_bound_reg(Operand* out, CfreeCgTypeId type, - NativeAllocClass cls, Reg reg) { - memset(out, 0, sizeof *out); - out->kind = RV_INLINE_OPK_REG; - out->pad[0] = (cls == NATIVE_REG_FP) ? 
RV_INLINE_OPCLS_FP : RV_INLINE_OPCLS_INT; - out->type = type; - out->v.local = (CGLocal)reg; -} - -static void rv_asm_bound_mem(Operand* out, CfreeCgTypeId type, Reg base) { - memset(out, 0, sizeof *out); - out->kind = OPK_INDIRECT; - out->type = type; - out->v.ind.base = (CGLocal)base; - out->v.ind.index = CG_LOCAL_NONE; -} - -static void rv_direct_load_operand_to_reg(NativeDirectTarget* d, - Operand op, NativeLoc dst) { - NativeAddr addr; - memset(&addr, 0, sizeof addr); - switch ((OpKind)op.kind) { - case OPK_IMM: - if ((NativeAllocClass)dst.cls != NATIVE_REG_INT) - compiler_panic(d->base.c, d->loc, "rv64 asm: FP immediate unsupported"); - d->native->load_imm(d->native, dst, op.v.imm); - return; - case OPK_LOCAL: - addr.base_kind = NATIVE_ADDR_BASE_FRAME; - addr.base.frame = d->locals[op.v.local - 1u].home; - addr.base_type = op.type; - rv_emit_mem(rv_of(d->native), 1, dst, addr, - rv_mem_for_type(d->native, op.type, 0)); - return; - /* Global, indirect cases: similar to aa64 */ - default: - compiler_panic(d->base.c, d->loc, "rv64 asm: unsupported input operand kind"); - } -} - -static void rv_direct_store_reg_to_operand(NativeDirectTarget* d, - Operand op, NativeLoc src) { - NativeAddr addr; - memset(&addr, 0, sizeof addr); - if (op.kind == OPK_LOCAL) { - addr.base_kind = NATIVE_ADDR_BASE_FRAME; - addr.base.frame = d->locals[op.v.local - 1u].home; - addr.base_type = op.type; - } else { - addr = rv_direct_materialize_addr(d, op); - } - rv_emit_mem(rv_of(d->native), 0, src, addr, - rv_mem_for_type(d->native, op.type, 0)); -} - -static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, - u32 nclob, u32* clob_int, u32* clob_fp) { - *clob_int = 0; - *clob_fp = 0; - for (u32 i = 0; i < nclob; ++i) { - Reg r; - NativeAllocClass cls; - if (rv_reg_resolve(c, clobbers[i], &r, &cls) != 0) { - compiler_panic(c, loc, "rv64 asm: unknown clobbered register"); - } - if (cls == NATIVE_REG_FP) - *clob_fp |= 1u << r; - else - *clob_int |= 1u << r; - } -} - 
-typedef struct RvAsmSavedClobber { - NativeFrameSlot slot; - NativeAllocClass cls; - Reg reg; - CfreeCgTypeId type; -} RvAsmSavedClobber; - -static RvAsmSavedClobber* rv_asm_save_callee_clobbers(RvNativeTarget* a, - u32 clob_int, - u32 clob_fp, - u32* nsaved) { - /* Identify which clobbered registers are callee-saved, allocate frame slots */ - static const u32 callee_saved_int[] = {8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}; - static const u32 callee_saved_fp[] = {8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}; - - /* Count and allocate slots; return array for later restore */ - /* Similar to aa64's aa_asm_save_callee_clobbers */ -} - -static void rv_asm_save_one(RvNativeTarget* a, RvAsmSavedClobber* s) { - /* Allocate frame slot and emit save instruction */ -} - -static void rv_asm_restore_one(RvNativeTarget* a, RvAsmSavedClobber* s) { - /* Emit restore instruction */ -} -``` - ---- - -#### NativeTarget Path: `rv_asm_block_native(NativeTarget* t, const char* tmpl, const AsmConstraint* outs, u32 nout, NativeLoc* out_locs, const AsmConstraint* ins, u32 nin, const NativeLoc* in_locs, const Sym* clobbers, u32 nclob)` - -**Pseudo-C** (lines ~4480–4600 in native.c): -```c -static void rv_asm_block_native(NativeTarget* t, const char* tmpl, - const AsmConstraint* outs, u32 nout, - NativeLoc* out_locs, const AsmConstraint* ins, - u32 nin, const NativeLoc* in_locs, - const Sym* clobbers, u32 nclob) { - RvNativeTarget* a = rv_of(t); - Compiler* c = t->c; - SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; - Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; - Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; - u32 clob_int, clob_fp, ntmp = 0; - RvAsmSavedClobber* saved; - u32 nsaved; - - rv_asm_clobber_masks(c, loc, clobbers, nclob, &clob_int, &clob_fp); - - /* Bind outputs using pre-allocated physical registers */ - for (u32 i = 0; i < nout; ++i) { - CfreeCgTypeId type = outs[i].type ? 
outs[i].type : out_locs[i].type; - rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, - out_locs[i], &ntmp); - } - - /* Bind inputs: match to outputs or allocate from reserved scratch */ - for (u32 i = 0; i < nin; ++i) { - const char* body = rv_asm_constraint_body(ins[i].str); - int matched = rv_asm_match_index(body); - CfreeCgTypeId type; - - if (matched >= 0) { - if ((u32)matched >= nout) - compiler_panic(c, loc, "rv64 asm: matching constraint out of range"); - bound_ins[i] = bound_outs[matched]; - continue; - } - - type = ins[i].type ? ins[i].type : in_locs[i].type; - /* Address-taken locals may arrive in frame slots; load into scratch if needed */ - NativeLoc inloc = in_locs[i]; - if (body[0] == 'r' && inloc.kind != NATIVE_LOC_REG) { - Reg r; - if (ntmp >= 2u) - compiler_panic(c, loc, "rv64 asm: too many memory operands"); - r = (ntmp == 0u) ? RV_T0 : RV_T1; - ntmp++; - inloc = rv_reg_loc(type, NATIVE_REG_INT, r); - rv_emit_mem(a, 1, inloc, rv_asm_loc_to_addr(a, loc, in_locs[i]), - rv_mem_for_type(t, type, type_size32(t, type))); - } - rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp); - } - - /* Save clobbered callee-saves */ - saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); - - /* Open assembler, bind, run template, close */ - RvAsm* asmh = rv_asm_open(c); - rv_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); - rv_asm_run_template(asmh, t->mc, tmpl); - rv_asm_close(asmh); - - /* Restore clobbered callee-saves */ - for (u32 i = nsaved; i > 0; --i) { - rv_asm_restore_one(a, &saved[i - 1u]); - } -} - -/* Helper: convert NativeLoc to NativeAddr (for memory operand materialization) */ -static NativeAddr rv_asm_loc_to_addr(RvNativeTarget* a, SrcLoc loc, - NativeLoc src) { - NativeAddr addr; - memset(&addr, 0, sizeof addr); - addr.base_type = src.type; - switch ((NativeLocKind)src.kind) { - case NATIVE_LOC_FRAME: - addr.base_kind = NATIVE_ADDR_BASE_FRAME; - addr.base.frame = 
src.v.frame; - return addr; - case NATIVE_LOC_ADDR: - return src.v.addr; - case NATIVE_LOC_GLOBAL: - addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; - addr.base.global.sym = src.v.global.sym; - addr.base.global.addend = src.v.global.addend; - return addr; - case NATIVE_LOC_REG: - addr.base_kind = NATIVE_ADDR_BASE_REG; - addr.cls = NATIVE_REG_INT; - addr.base.reg = src.v.reg; - return addr; - default: - compiler_panic(a->base.c, loc, "rv64 asm: unsupported memory operand"); - } -} - -/* Helper: materialize memory-operand base into a single register */ -static Reg rv_asm_native_mem_base(RvNativeTarget* a, SrcLoc loc, NativeLoc src, - u32* ntmp) { - NativeAddr addr = rv_asm_loc_to_addr(a, loc, src); - u32 base; - i32 off; - Reg dst; - - if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) - compiler_panic(a->base.c, loc, "rv64 asm: indexed memory operand unsupported"); - - rv_addr_base(a, addr, &base, &off); - if (off == 0) return (Reg)base; - - if (*ntmp >= 2u) - compiler_panic(a->base.c, loc, "rv64 asm: too many memory operands"); - - dst = (*ntmp == 0u) ? RV_T0 : RV_T1; - (*ntmp)++; - rv_emit_add_imm(a, dst, base, off); - return dst; -} - -/* Helper: bind a single operand (output or input) to Operand struct for template */ -static void rv_asm_bind_native(RvNativeTarget* a, SrcLoc loc, Operand* out, - const char* constraint, CfreeCgTypeId type, - NativeLoc src, u32* ntmp) { - const char* body = rv_asm_constraint_body(constraint); - - if (body[0] == 'r' || body[0] == 'f') { - NativeAllocClass cls = (body[0] == 'f') ? 
NATIVE_REG_FP : NATIVE_REG_INT; - if (src.kind != NATIVE_LOC_REG) - compiler_panic(a->base.c, loc, "rv64 asm: register operand not in a register"); - rv_asm_bound_reg(out, type, cls, (Reg)src.v.reg); - } else if (body[0] == 'i') { - if (src.kind != NATIVE_LOC_IMM) - compiler_panic(a->base.c, loc, "rv64 asm: immediate operand is not immediate"); - memset(out, 0, sizeof *out); - out->kind = OPK_IMM; - out->type = type; - out->v.imm = src.v.imm; - } else if (body[0] == 'm') { - rv_asm_bound_mem(out, type, rv_asm_native_mem_base(a, loc, src, ntmp)); - } else { - compiler_panic(a->base.c, loc, "rv64 asm: unsupported constraint '%s'", constraint); - } -} -``` - ---- - -## GROUP 5D: Compiler Intrinsics & Finalization - -### Intrinsics: `rv_intrinsic(NativeTarget* t, IntrinKind kind, const NativeLoc* dsts, u32 ndst, const NativeLoc* args, u32 narg)` - -**Subset of intrinsics to implement** (from rv64/ops.c lines 2112–2250): -- INTRIN_EXPECT / INTRIN_ASSUME_ALIGNED: identity on value -- INTRIN_PREFETCH: no-op -- INTRIN_CLZ / INTRIN_CTZ / INTRIN_POPCOUNT: via software loops or native instructions if available -- INTRIN_BSWAP16 / BSWAP32 / BSWAP64: byte-swap sequences -- INTRIN_SADD/UADD/SSUB/USUB/SMUL/UMUL_OVERFLOW: compute result + overflow bit (via intermediate size or bit introspection) -- INTRIN_MEMCPY / MEMMOVE / MEMSET: bulk memory operations (delegates to copy_bytes / set_bytes) -- INTRIN_TRAP / UNREACHABLE: EBREAK instruction - -**Body sketch** (pseudo-C, ~200 lines): -```c -static void rv_intrinsic(NativeTarget* t, IntrinKind kind, - const NativeLoc* dsts, u32 ndst, - const NativeLoc* args, u32 narg) { - RvNativeTarget* a = rv_of(t); - MCEmitter* mc = t->mc; - - switch (kind) { - case INTRIN_EXPECT: - case INTRIN_ASSUME_ALIGNED: - if (ndst == 1u && narg >= 1u) { - if (args[0].kind == NATIVE_LOC_IMM) { - t->load_imm(t, dsts[0], args[0].v.imm); - } else { - t->move(t, dsts[0], args[0]); - } - } - return; - - case INTRIN_PREFETCH: - return; /* No-op; PREFETCH hint 
(HINTs) not yet emitted */ - - case INTRIN_TRAP: - case INTRIN_UNREACHABLE: - rv64_emit32(mc, rv_ebreak()); - return; - - case INTRIN_CLZ: - if (ndst == 1u && narg == 1u) { - /* Count leading zeros (software loop if no Zbb extension) */ - /* For now: loop via set-bit-on-msb and bit-scan-reverse emulation */ - /* Deferred: check for Zbb (CLZ native instruction) support */ - rv_intrinsic_clz(t, dsts[0], args[0]); - } - return; - - case INTRIN_CTZ: - if (ndst == 1u && narg == 1u) { - /* Count trailing zeros (software; Zbb has CTZ) */ - rv_intrinsic_ctz(t, dsts[0], args[0]); - } - return; - - case INTRIN_POPCOUNT: - if (ndst == 1u && narg == 1u) { - /* Population count (software; Zbb has CPOP) */ - rv_intrinsic_popcount(t, dsts[0], args[0]); - } - return; - - case INTRIN_BSWAP16: - case INTRIN_BSWAP32: - case INTRIN_BSWAP64: { - u32 result_size = (kind == INTRIN_BSWAP16) ? 2 : (kind == INTRIN_BSWAP32) ? 4 : 8; - rv_intrinsic_bswap(t, dsts[0], args[0], result_size); - return; - } - - case INTRIN_SADD_OVERFLOW: - case INTRIN_UADD_OVERFLOW: - case INTRIN_SSUB_OVERFLOW: - case INTRIN_USUB_OVERFLOW: - if (ndst == 2u && narg == 2u) { - rv_intrinsic_binop_overflow(t, kind, dsts[0], dsts[1], args[0], args[1]); - } - return; - - case INTRIN_SMUL_OVERFLOW: - case INTRIN_UMUL_OVERFLOW: - if (ndst == 2u && narg == 2u) { - rv_intrinsic_mul_overflow(t, kind, dsts[0], dsts[1], args[0], args[1]); - } - return; - - case INTRIN_MEMCPY: - if (narg == 3u && args[0].kind == NATIVE_LOC_REG && - args[1].kind == NATIVE_LOC_REG && args[2].kind == NATIVE_LOC_IMM) { - NativeAddr dst_addr, src_addr; - AggregateAccess access; - memset(&dst_addr, 0, sizeof dst_addr); - memset(&src_addr, 0, sizeof src_addr); - memset(&access, 0, sizeof access); - access.size = (u32)args[2].v.imm; - access.align = 1u; - dst_addr.base_kind = NATIVE_ADDR_BASE_REG; - dst_addr.base.reg = args[0].v.reg; - src_addr.base_kind = NATIVE_ADDR_BASE_REG; - src_addr.base.reg = args[1].v.reg; - t->copy_bytes(t, dst_addr, src_addr, 
access); - } - return; - - case INTRIN_MEMMOVE: { - MCLabel forward = mc->label_new(mc); - MCLabel done = mc->label_new(mc); - if (narg == 3u && args[0].kind == NATIVE_LOC_REG && - args[1].kind == NATIVE_LOC_REG && args[2].kind == NATIVE_LOC_IMM) { - NativeAddr dst_addr, src_addr; - AggregateAccess access; - memset(&dst_addr, 0, sizeof dst_addr); - memset(&src_addr, 0, sizeof src_addr); - memset(&access, 0, sizeof access); - access.size = (u32)args[2].v.imm; - access.align = 1u; - dst_addr.base_kind = NATIVE_ADDR_BASE_REG; - dst_addr.base.reg = args[0].v.reg; - src_addr.base_kind = NATIVE_ADDR_BASE_REG; - src_addr.base.reg = args[1].v.reg; - /* Check direction: if src >= dst, copy forward, else backward */ - rv64_emit32(mc, rv_blt(args[0].v.reg, args[1].v.reg, 0)); - mc->emit_label_ref(mc, forward, R_RV_BRANCH, 4, 0); - /* Copy backward (from end) */ - t->copy_bytes(t, dst_addr, src_addr, access); - rv64_emit32(mc, rv_jal(RV_ZERO, 0)); - mc->emit_label_ref(mc, done, R_RV_JAL, 4, 0); - /* Copy forward (from start) */ - mc->label_place(mc, forward); - t->copy_bytes(t, dst_addr, src_addr, access); - mc->label_place(mc, done); - } - return; - } - - case INTRIN_MEMSET: - if (narg == 3u && args[0].kind == NATIVE_LOC_REG && - args[2].kind == NATIVE_LOC_IMM) { - NativeAddr dst_addr; - AggregateAccess access; - memset(&dst_addr, 0, sizeof dst_addr); - memset(&access, 0, sizeof access); - access.size = (u32)args[2].v.imm; - access.align = 1u; - dst_addr.base_kind = NATIVE_ADDR_BASE_REG; - dst_addr.base.reg = args[0].v.reg; - if (args[1].kind == NATIVE_LOC_IMM) { - NativeLoc byte = rv_reg_loc(builtin_id(CFREE_CG_BUILTIN_I8), NATIVE_REG_INT, RV_T0); - t->load_imm(t, byte, args[1].v.imm & 0xffu); - t->set_bytes(t, dst_addr, byte, access); - } else { - t->set_bytes(t, dst_addr, args[1], access); - } - } - return; - - default: - compiler_panic(t->c, (SrcLoc){0, 0, 0}, "rv64: unsupported intrinsic %d", (int)kind); - } -} -``` - ---- - -### Finalization Hooks - -#### 
`rv_trap(NativeTarget* t)` -```c -static void rv_trap(NativeTarget* t) { - rv64_emit32(t->mc, rv_ebreak()); -} -``` - -#### `rv_set_loc(NativeTarget* t, SrcLoc loc)` -```c -static void rv_set_loc(NativeTarget* t, SrcLoc loc) { - RvNativeTarget* a = rv_of(t); - a->loc = loc; - if (t->mc && t->mc->set_loc) t->mc->set_loc(t->mc, loc); -} -``` - -#### `rv_file_scope_asm(NativeTarget* t, const char* src, size_t len)` -```c -static void rv_file_scope_asm(NativeTarget* t, const char* src, size_t len) { - RvAsm* lex = rv_asm_open_mem(t->c, "<file-scope-asm>", src, len); - rv_asm_parse(t->c, lex, t->mc); - rv_asm_close(lex); -} -``` - -#### `rv_finalize(NativeTarget* t)` -```c -static void rv_finalize(NativeTarget* t) { - if (t->mc) mc_emit_eh_frame(t->mc); /* DWARF unwind info */ -} -``` - -#### `rv_destroy(NativeTarget* t)` (if needed) -```c -static void rv_destroy(NativeTarget* t) { - /* Cleanup: free RvNativeTarget private state if dynamically allocated */ - if (t) free((RvNativeTarget*)t); -} -``` - ---- - -## Initialization Hook Registration (near end of native.c) - -Insert before the final `return t;` in the main creation function (e.g., `rv64_backend_make`): - -```c - t->atomic_load = rv_atomic_load; - t->atomic_store = rv_atomic_store; - t->atomic_rmw = rv_atomic_rmw; - t->atomic_cas = rv_atomic_cas; - t->fence = rv_fence; - - t->va_start_ = rv_va_start_native; - t->va_arg_ = rv_va_arg_native; - t->va_end_ = rv_va_end_native; - t->va_copy_ = rv_va_copy_native; - - t->intrinsic = rv_intrinsic; - t->asm_block = rv_asm_block_native; - t->file_scope_asm = rv_file_scope_asm; - - t->trap = rv_trap; - t->set_loc = rv_set_loc; - t->finalize = rv_finalize; - /* t->destroy optional; only if custom cleanup needed */ -``` - -And for the **NativeOps** direct-path adapter (to be registered in the NativeOps vtable): - -```c -static const NativeOps rv_direct_ops = { - .bind_param = rv_bind_param, - .tail_call_unrealizable_reason = rv_no_tail, - .va_start_ = rv_va_start_, - 
.va_arg_ = rv_va_arg_, - .va_end_ = rv_va_end_, - .va_copy_ = rv_va_copy_, - .asm_block = rv_direct_asm_block, -}; - -const NativeOps* rv64_native_direct_ops(void) { return &rv_direct_ops; } -``` - ---- - -## Summary: Function Counts & Organization - -**Group 5 hook count**: 15 core hooks (atomic×5, va×4, asm×2, intrinsic, trap, set_loc, finalize; destroy optional) - -**Subdirectory organization**: -- `native.c`: ~4500 lines (mirror aa64 volume) - - Atomics: ~250 lines - - Variadics: ~200 lines (core + helpers) - - Inline ASM: ~900 lines (direct + native paths + constraint/register helpers) - - Intrinsics: ~200 lines - - Finalization: ~100 lines - - Helper utilities (addr materialization, scratch alloc, etc.): ~300 lines - - Setup/registration: ~50 lines - -**Key helper modules** (leverage from existing rv64 files): -- `isa.h` — atomic instruction encoders (rv_lr_w/d, rv_sc_w/d, rv_fence_*) -- `asm.c/asm.h` — inline ASM binding (rv_inline_bind, rv_asm_run_template) -- `regs.c` — register metadata (NativeRegInfo, constraint resolution) - -**Do NOT duplicate**: regs.c, isa.h, arch.c, link.c, dbg.c, disasm.c — all are reused as-is. - diff --git a/doc/NATIVE_PORT_X64.md b/doc/NATIVE_PORT_X64.md @@ -1,4342 +0,0 @@ -# x64 NativeTarget Porting Reference - - - ---- - -# X64 NativeTarget Porting Guide (GROUP 1: Skeleton, Encoders, Frame Model, Lifecycle) - -## Overview - -This guide covers porting the x64 backend from the legacy CGTarget API (disabled, non-compiling as of commit 429defa) to the NativeTarget API. GROUP 1 establishes the infrastructure: the X64NativeTarget subclass, encoder header restructuring, frame model, and the func_begin/func_end/bind_param lifecycle. 
- -References: -- **Contract**: `/Users/ryan/code/cfree/src/arch/native_target.h` (NativeTarget vtable & types) -- **Driver**: `/Users/ryan/code/cfree/src/cg/native_direct_target.h` (NativeDirectTarget & NativeOps adapter) -- **RV64 Template**: `/Users/ryan/code/cfree/src/arch/rv64/native.c` (working reference, 1500+ lines) -- **AA64 Template**: `/Users/ryan/code/cfree/src/arch/aa64/native.c` (aarch64 reference, 2000+ lines) -- **Legacy x64 (non-compiling)**: - - `git show 429defa:src/arch/x64/ops.c` (old vtable, data-movement hooks) - - `git show 429defa:src/arch/x64/emit.c` (byte-level encoders, prologue/epilogue) - - `git show 429defa:src/arch/x64/alloc.c` (frame slots, parameter binding) - - `git show 429defa:src/arch/x64/internal.h` (XImpl state, X64ABIRegs, type helpers) -- **ISA Constants**: `/Users/ryan/code/cfree/src/arch/x64/isa.h` (X64_* opcodes, REX, ModRM, condition codes) -- **ABI**: `/Users/ryan/code/cfree/src/abi/abi_sysv_x64.c` & `/Users/ryan/code/cfree/src/abi/abi_win64_x64.c` (parameter/return lowering) - ---- - -## (a) X64NativeTarget Subclass Struct - -The x64 backend maintains per-function state in a subclass of NativeTarget. Model your struct after RvNativeTarget (rv64/native.c:198–234) but adapted for x64's SysV vs. Win64 ABI duality. - -### Struct Definition (Pseudo-C) - -```c -typedef struct X64NativeSlot { - u32 off; /* bytes below rbp (positive); address = rbp - off */ - u32 size; - u32 align; - u8 kind; /* NativeFrameSlotKind */ - u8 pad[3]; -} X64NativeSlot; - -typedef struct X64Patch { - u8 kind; /* X64PatchKind (e.g., X64_PATCH_ALLOCA) */ - u32 pos; /* byte offset in text section */ - u32 dst_reg; /* for X64_PATCH_ALLOCA: destination register */ -} X64Patch; - -typedef struct X64NativeTarget { - NativeTarget base; /* parent vtable & MCEmitter */ - SrcLoc loc; - const CGFuncDesc* func; - - /* Frame slots (locals, spills, sret, variadic save area). 
*/ - X64NativeSlot* slots; - u32 nslots; - u32 slots_cap; - u32 cum_off; /* sum of slot reservations below rbp */ - u32 max_outgoing; /* max stack-arg bytes across all calls */ - u32 frame_size_final; /* final frame size (patched at func_end) */ - - /* Parameter tracking (for bind_param). */ - u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */ - u32 next_param_int; /* index into ABI int-arg register list */ - u32 next_param_fp; /* index into ABI FP-arg register list */ - u32 next_param_stack; /* cumulative stack-arg offset */ - u8 has_sret; - u8 is_variadic; - NativeFrameSlot sret_ptr_slot; - - /* Patches deferred to func_end (alloca disp32, etc). */ - X64Patch* patches; - u32 npatches; - u32 patches_cap; - u32 nalloca; /* count of allocas (gates slim epilogue) */ - - /* Prologue/epilogue state. */ - u32 func_start; /* text-section offset at func_begin */ - u32 prologue_pos; /* offset of prologue placeholder */ - MCLabel epilogue_label; - - /* Callee-saved registers assigned by the allocator. */ - struct X64CalleeSave { - NativeFrameSlot slot; - CfreeCgTypeId type; - u8 cls; /* NATIVE_REG_INT or NATIVE_REG_FP */ - Reg reg; - } callee_saves[X64_MAX_CS_REGS]; /* X64_MAX_CS_REGS = 15u (5 int + 10 xmm) */ - u32 ncallee_saves; - - /* Flags. */ - u8 known_frame; /* set by func_begin_known_frame */ - u8 has_alloca; /* dynamic alloca in body */ - u8 frame_final; /* prologue emitted (not patched at func_end) */ - u8 pad[1]; - - /* ABI selection: points to either SysV or Win64 config. */ - const X64ABIRegs* abi; -} X64NativeTarget; - -static inline X64NativeTarget* x64_of(NativeTarget* t) { - return (X64NativeTarget*)t; -} -``` - -### Key Fields - -- **`cum_off`**: Running total of frame-slot reservations. Each new slot allocates `(cum_off + size + align-1) & ~(align-1)` bytes and bumps `cum_off`. -- **`next_param_int`, `next_param_fp`, `next_param_stack`**: Cursors into the ABI's argument lists. 
On SysV, int args live in rdi/rsi/rdx/rcx/r8/r9 (6 total); on Win64, rcx/rdx/r8/r9 (4 total). FP args differ similarly. Parameters beyond the register pool land on the stack; the cursors track both. -- **`abi`**: Resolved once at func_begin from `c->target.os` via `x64_abi_for_os()`. Points to either `&g_x64_abi_sysv` or `&g_x64_abi_win64`. -- **`frame_size_final`**: Calculated at func_end after alloca/call site patches are known. On single-pass (-O0), the prologue is emitted with a NOP placeholder and patched with the real size; on known-frame (optimizer), the prologue is emitted once with the final size. - ---- - -## (b) Emit.c Restructuring & emit.h - -### Current State - -Legacy x64/emit.c (`git show 429defa:src/arch/x64/emit.c`) contains: -1. **Byte-level encoders** (emit1, emit_rex, emit_mem_operand, emit_rm_reg, emit_alu_rr, emit_imul_rr, etc.) -2. **Constant tables** (g_int_order, g_fp_order, X64ABIRegs SysV/Win64) -3. **Prologue/epilogue builders** (x_build_prologue, x_compute_frame_size, x_func_begin, x_func_end) - -### Keep (Byte-Encoders → emit.h) - -These are low-level helpers used by both native.c (new NativeTarget) and asm.c (standalone assembler): - -- `emit1(mc, b)` – emit one byte -- `emit_u32le(mc, v)` – emit 32-bit little-endian -- `make_rex(w, reg, index, rm)` → `u8`; `emit_rex(mc, ...)`; `emit_rex_force(mc, ...)` -- `modrm(mod, reg, rm)` → `u8`; `sib(scale, index, base)` → `u8` -- `emit_mem_operand(mc, reg, base, disp)` – ModR/M + SIB + displacement -- `emit_rm_reg(mc, reg, rm)` – ModR/M for reg-to-reg (mod=3) -- `emit_mov_rr(mc, w, dst, src)` – MOV reg64/reg32 -- `emit_mov_load(mc, size, signed, dst, base, disp)` – MOV from [base+disp] -- `emit_mov_store(mc, size, dst_reg, base, disp)` – MOV to [base+disp] -- `emit_lea(mc, dst, base, disp)` – LEA dst, [base+disp] -- `emit_sse_rr(mc, prefix, opcode, dst, src)` – SSE reg-to-reg (MOVSD, ADDPD, etc.) 
-- `emit_sse_load(mc, prefix, opcode, dst, base, disp)` – SSE load from memory -- `emit_sse_store(mc, prefix, opcode, src, base, disp)` – SSE store to memory -- `emit_alu_rr(mc, w, op, dst, src)` – ALU two-operand (ADD, SUB, AND, OR, XOR, CMP, MOV, TEST) -- `emit_alu_imm8(mc, w, op, dst, imm)` – ALU with imm8 -- `emit_alu_imm32(mc, w, op, dst, imm)` – ALU with imm32 -- `emit_imul_rr(mc, w, dst, src)` – IMUL (reg * reg form) -- `emit_imul_imm8(mc, w, dst, src, imm)` – IMUL with imm8 -- `emit_imul_imm32(mc, w, dst, src, imm)` – IMUL with imm32 -- `emit_f7_rm(mc, w, sub, reg)` – F7 opcode (DIV, MUL, NEG, NOT) -- `emit_shift_cl(mc, w, sub, reg)` – Shift by CL (SHL reg, cl; etc.) -- `emit_shift_imm(mc, w, sub, reg, imm)` – Shift by imm8 (SHL reg, imm; etc.) -- `emit_cqo_or_cdq(mc, w)` – CQO/CDQ (sign-extend rax→rdx:rax / eax→edx:eax) -- `emit_setcc(mc, cc, dst)` – SETCC (SETcc reg8) -- `emit_movzx(mc, w_src, dst, src)` – MOVZX (zero-extend) -- `emit_extend_rr(mc, w_src, w_dst, dst, src)` – Sign-extend or zero-extend based on type -- `emit_ret(mc)` – RET -- `emit_leave(mc)` – LEAVE -- `x64_emit_load_imm(mc, is64, dst, imm)` – Load immediate (handles splits for >32-bit on 32-bit form) -- `x64_abi_for_os(os)` → `const X64ABIRegs*` – Resolve ABI from OS kind - -### New Header: x64/emit.h - -Create `/Users/ryan/code/cfree/src/arch/x64/emit.h` with: -- Declarations of all byte-encoder functions above -- Inline helpers: `make_rex()`, `modrm()`, `sib()` -- Inline immediates-legality checks: `imm_fits_i8(v)`, `imm_fits_i32(v)` -- The three byte-encoder constants already in isa.h (X64_REX_*, X64_NOP1, etc.) -- No internal.h dependency (no XImpl, XSlot, etc.) -- Include `"arch/mc.h"`, `"arch/x64/isa.h"`, `"core/bytes.h"` - -### Delete - -- All `x_*` semantic wrappers (x_load_imm, x_copy, x_load, x_call, etc.) 
→ moved to native.c as NativeTarget hooks -- The legacy XImpl-only internal state helpers -- internal.h (subsumed by native.c's X64NativeTarget struct) - -### Update asm.c - -The standalone assembler (`src/arch/x64/asm.c`) currently includes internal.h. Swap that for: -```c -#include "arch/x64/emit.h" /* instead of internal.h */ -``` -No functional change to asm.c's own encoder-wrapping logic; just link against the new emit.h symbols instead of inline copies. - ---- - -## (c) Frame Model: SysV vs. Win64 - -x64 uses an **RBP-anchored frame** on both ABIs. The prologue saves the caller's RBP and chains frames; slots live below RBP at negative offsets. The two ABIs differ in: -1. Caller-saved and callee-saved registers -2. Argument registers and register-to-stack mapping -3. Win64 requires 32 bytes of "shadow space" (home space) above the return address for the first 4 register arguments -4. Win64 requires stack probing (via `__chkstk`) for frames > 4096 bytes -5. Win64 XMM registers 6–15 are callee-saved - -### Frame Layout (SysV / Win64 with RBP frame) - -``` - high addr (caller's frame) - +---------------------+ - | incoming stack args | [rbp + 16 + shadow_space + ...] (Win64 shadow=32) - +---------------------+ - | return address | [rbp + 8] - +---------------------+ -rbp →| saved rbp | [rbp + 0] - +---------------------+ - | sret pointer (if) | [rbp - 8] (if has_sret) - | local vars / spills | [rbp - 8*k] - | callee-saved GPRs | [rbp - N_gpr*8] - | callee-saved XMMs | [rbp - N_xmm*16] (Win64 only, xmm6–15) - +---------------------+ -rsp →| outgoing args | [rsp + 0 .. rsp + max_outgoing) - +---------------------+ - low addr -``` - -**Frame-size formula:** -``` -xmm_base = (sret_slot_exists ? 
8 : 0) + cum_off -frame_size = align_up_16(xmm_base + cs_xmm_count * 16 + cs_gpr_count * 8 + max_outgoing) -``` - -**RBP-relative offsets:** -- Saved RBP: rbp + 0 -- Return address: rbp + 8 -- Incoming stack arg at stack offset K: rbp + 16 + shadow_space + K -- Local/spill slot with `off` bytes: rbp - off -- Saved GPR i (i=0 for first callee-saved): rbp - xmm_base - xmm_count*16 - (i+1)*8 -- Saved XMM i (Win64): rbp - xmm_base - (i+1)*16 - -**Red zone (SysV only):** The 128 bytes below RSP are guaranteed not to be clobbered by signal or interrupt handlers, so a function (typically a leaf) may use them for temporaries without adjusting RSP. Win64 has no red zone. - -**Stack alignment (both):** RSP must be 16-byte aligned *before* a call. After `push rbp`, RSP ≡ 0 mod 16. After `sub rsp, frame_size`, RSP must be 16-byte aligned again so the next call's implicit `push return_address` leaves RSP ≡ 8 mod 16 (the ABI invariant on function entry). Thus `frame_size ≡ 0 mod 16`. - -### Win64-Specific: Shadow Space & Stack Probing - -Win64 reserves 32 bytes of caller-provided "home space" immediately above the return address. The first 4 register arguments (int or FP) correspond to home slots [rbp+16], [rbp+24], [rbp+32], [rbp+40] respectively. Stack-passed arguments sit at [rbp+48] onward. - -**Stack probing** (large frames > 4096B): -```asm -mov eax, frame_size -call __chkstk ; __chkstk probes page-by-page; does NOT adjust rsp -sub rsp, rax ; explicit adjustment -``` -This prevents stack-overflow corruption by touching each page before allocation. - ---- - -## (d) Lifecycle: func_begin, func_end, bind_param - -### x_func_begin_init (helper, called by both func_begin paths) - -**Purpose:** Initialize frame/ABI state once per function. - -**Pseudo-code:** -```c -static void x_func_begin_init(NativeTarget* t, const CGFuncDesc* fd) { - X64NativeTarget* a = x64_of(t); - a->func = fd; - a->loc = fd->loc; - - /* Resolve ABI (SysV or Win64) from compiler's OS target. 
*/ - a->abi = x64_abi_for_os(t->c->target.os); - - /* Initialize cursors & counters. */ - a->cum_off = 0; - a->max_outgoing = 0; - a->next_param_int = 0; - a->next_param_fp = 0; - a->next_param_stack = 0; - a->has_sret = 0; - a->is_variadic = 0; - a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE; - a->nslots = 0; - a->ncallee_saves = 0; - a->nalloca = 0; - a->known_frame = 0; - a->has_alloca = 0; - a->frame_final = 0; - - /* Mark function start in text section. */ - a->func_start = t->mc->pos(t->mc); - a->epilogue_label = t->mc->label_new(t->mc); -} -``` - -### x_func_begin (NativeTarget hook, -O0 single-pass path) - -**Purpose:** Reserve prologue placeholder (filled with NOPs), then call x_add_entry_frame_slots and variadic saves. - -**Pseudo-code:** -```c -static void x_func_begin(NativeTarget* t, const CGFuncDesc* fd) { - X64NativeTarget* a = x64_of(t); - MCEmitter* mc = t->mc; - - x_func_begin_init(t, fd); - - /* Query ABI to populate variadic / sret flags. */ - const ABIFuncInfo* abi_info = abi_cg_func_info(t->c->abi, fd->fn_type); - a->has_sret = abi_info->has_sret ? 1 : 0; - a->is_variadic = abi_info->variadic ? 1 : 0; - - /* Determine prologue size budget based on OS. */ - u32 prologue_nbytes = (a->abi == &g_x64_abi_win64) - ? X64_PROLOGUE_BYTES_WIN64 // 192 - : X64_PROLOGUE_BYTES; // 96 - - /* Reserve prologue region filled with NOPs. */ - a->prologue_pos = mc->pos(mc); - for (u32 i = 0; i < prologue_nbytes; ++i) { - emit1(mc, X64_NOP1); // 0x90 - } - - /* Allocate frame slots for incoming sret pointer, variadic GP save area. */ - if (a->has_sret) { - NativeFrameSlotDesc sret_desc = { - .type = /* ptr type */, - .size = 8, - .align = 8, - .kind = NATIVE_FRAME_SLOT_SAVE, - .flags = 0, - }; - a->sret_ptr_slot = t->frame_slot(t, &sret_desc); - } - - if (a->is_variadic) { - /* SysV variadic: reserve 176-byte reg-save area (__va_list_tag). - Win64 variadic: no special reg-save area (args already in home space). 
*/ - if (a->abi == &g_x64_abi_sysv) { - NativeFrameSlotDesc va_desc = { - .type = 0, // untyped - .size = 176, - .align = 8, - .kind = NATIVE_FRAME_SLOT_SAVE, - .flags = 0, - }; - /* Store for later access by va_start_ hook. */ - a->va_reg_save_slot = t->frame_slot(t, &va_desc); - } - } -} -``` - -### x_func_begin_known_frame (NativeTarget hook, optimizer path) - -**Purpose:** Called with a pre-computed frame layout; emit the final prologue immediately (no placeholder). - -**Pseudo-code:** -```c -static void x_func_begin_known_frame( - NativeTarget* t, - const CGFuncDesc* fd, - const NativeKnownFrameDesc* frame, - NativeFrameSlot* out_slots) { - X64NativeTarget* a = x64_of(t); - MCEmitter* mc = t->mc; - - x_func_begin_init(t, fd); - a->known_frame = 1; - - /* ABI info. */ - const ABIFuncInfo* abi_info = abi_cg_func_info(t->c->abi, fd->fn_type); - a->has_sret = abi_info->has_sret ? 1 : 0; - a->is_variadic = abi_info->variadic ? 1 : 0; - - /* Allocate frame slots from the known-frame descriptor. */ - for (u32 i = 0; i < frame->nslots; ++i) { - NativeFrameSlot fs = t->frame_slot(t, &frame->slots[i]); - if (out_slots) out_slots[i] = fs; - } - - /* Populate frame dimensions. */ - a->cum_off = 0; // already accounted in frame->slots - a->max_outgoing = frame->max_outgoing; - a->has_alloca = frame->has_alloca ? 1 : 0; - - /* Compute frame size from callee-saves and slot sum. */ - /* Collect the allocator's assigned callee-saved masks. */ - u32 cs_int_mask = frame->callee_saved_used[NATIVE_REG_INT]; - u32 cs_fp_mask = frame->callee_saved_used[NATIVE_REG_FP]; - - u32 frame_size = x_compute_frame_size(a, cs_int_mask, cs_fp_mask); - a->frame_size_final = frame_size; - - /* Check if prologue can be omitted entirely (leaf function, no frame needed). */ - if (x_can_omit_frame(a, cs_int_mask, cs_fp_mask)) { - a->frame_final = 1; - return; - } - - /* Emit final prologue (no placeholder, no patching). 
*/ - u8 buf[X64_PROLOGUE_BYTES_WIN64]; - a->prologue_pos = mc->pos(mc); - u32 chkstk_disp_pos = (u32)-1; - u32 nbytes = x_build_prologue(t, buf, sizeof buf, frame_size, - cs_int_mask, cs_fp_mask, &chkstk_disp_pos); - mc->emit_bytes(mc, buf, nbytes); - if (chkstk_disp_pos != (u32)-1) { - ObjSymId chk = x_chkstk_sym(t); - mc->emit_reloc_at(mc, mc->section_id, a->prologue_pos + chkstk_disp_pos, - R_X64_PLT32, chk, -4, 1, 0); - } - - a->frame_final = 1; -} -``` - -### x_build_prologue (helper) - -**Purpose:** Generate prologue bytes: `push rbp; mov rbp, rsp; [chkstk]; sub rsp, frame_size; [sret spill]; [callee-save spills]`. - -**Structure (from legacy emit.c):** - -```c -static u32 x_build_prologue( - NativeTarget* t, - u8* buf, - u32 cap, - u32 frame_size, - u32 cs_int_mask, // bitmask of used callee-saved GPRs - u32 cs_fp_mask, // bitmask of used callee-saved XMMs (Win64) - u32* chkstk_disp_pos_out) { - X64NativeTarget* a = x64_of(t); - u32 wi = 0; - - if (chkstk_disp_pos_out) *chkstk_disp_pos_out = (u32)-1; - - // 1. push rbp (1 byte) - buf[wi++] = 0x55; - - // 2. mov rbp, rsp (3 bytes: REX.W 89 E5) - buf[wi++] = X64_REX_BASE | X64_REX_W; - buf[wi++] = 0x89; - buf[wi++] = modrm(3, X64_RSP, X64_RBP); // 0xE5 - - // 3. (Win64 only) chkstk if frame_size > 4096 - if (a->abi->shadow_space && frame_size > X64_WIN64_CHKSTK_THRESHOLD) { - // mov eax, frame_size (5 bytes: B8 imm32) - buf[wi++] = 0xB8; - wr_u32_le(buf + wi, frame_size); - wi += 4; - - // call __chkstk (5 bytes: E8 disp32) - buf[wi++] = 0xE8; - if (chkstk_disp_pos_out) *chkstk_disp_pos_out = wi; - wi += 4; // disp32 patched by caller - - // sub rsp, rax (3 bytes: REX.W 29 C4) - buf[wi++] = X64_REX_BASE | X64_REX_W; - buf[wi++] = 0x29; - buf[wi++] = modrm(3, X64_RAX, X64_RSP); - } else { - // sub rsp, frame_size (7 bytes: REX.W 81 EC imm32) - buf[wi++] = X64_REX_BASE | X64_REX_W; - buf[wi++] = 0x81; - buf[wi++] = modrm(3, 5, X64_RSP); // /5 for SUB - wr_u32_le(buf + wi, frame_size); - wi += 4; - } - - // 4. 
Spill sret pointer (if present) - if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) { - X64NativeSlot* s = x64_slot_get(a, a->sret_ptr_slot); - u32 sret_reg = a->abi->int_args[0]; // RDI (SysV) or RCX (Win64) - i32 off = -(i32)s->off; - // mov [rbp + disp32], sret_reg (7 bytes) - buf[wi++] = X64_REX_BASE | X64_REX_W | ((sret_reg & 8) ? X64_REX_R : 0); - buf[wi++] = 0x89; - buf[wi++] = modrm(2, sret_reg & 7, X64_RBP); - wr_u32_le(buf + wi, (u32)off); - wi += 4; - } - - // 5. Spill callee-saved GPRs - u32 xmm_base = (a->has_sret ? 8 : 0) + a->cum_off; - u32 cs_fp_count = popcount(cs_fp_mask); - for (u32 reg = 0; reg < 16; ++reg) { - if (!(cs_int_mask & (1 << reg))) continue; - u32 idx = __builtin_popcount(cs_int_mask & ((1 << reg) - 1)); // position in order - i32 off = -(i32)(xmm_base + cs_fp_count * 16 + (idx + 1) * 8); - // mov [rbp + disp32], reg (7 bytes) - buf[wi++] = X64_REX_BASE | X64_REX_W | ((reg & 8) ? X64_REX_R : 0); - buf[wi++] = 0x89; - buf[wi++] = modrm(2, reg & 7, X64_RBP); - wr_u32_le(buf + wi, (u32)off); - wi += 4; - } - - // 6. Spill callee-saved XMMs (Win64 only) - if (a->abi == &g_x64_abi_win64) { - for (u32 xmm = 0; xmm < 16; ++xmm) { - if (!(cs_fp_mask & (1 << xmm))) continue; - u32 idx = __builtin_popcount(cs_fp_mask & ((1 << xmm) - 1)); - i32 off = -(i32)(xmm_base + (idx + 1) * 16); - // movaps [rbp + disp32], xmmN (7 bytes, or 8 with a REX prefix) - u8 rex = (xmm & 8) ? (X64_REX_BASE | X64_REX_R) : 0; - if (rex) buf[wi++] = rex; - buf[wi++] = 0x0F; - buf[wi++] = 0x29; - buf[wi++] = modrm(2, xmm & 7, X64_RBP); - wr_u32_le(buf + wi, (u32)off); - wi += 4; - } - } - - return wi; -} -``` - -### x_func_end (NativeTarget hook) - -**Purpose:** Patch prologue (if single-pass), emit epilogue, patch alloca sites, define function symbol. - -**Pseudo-code:** -```c -static void x_func_end(NativeTarget* t) { - X64NativeTarget* a = x64_of(t); - MCEmitter* mc = t->mc; - - if (a->frame_final) { - // Known-frame path: prologue already emitted final, skip patching. 
- goto emit_epilogue; - } - - // Single-pass path: collect actual callee-saves used & patch prologue. - u32 cs_int_mask = 0, cs_fp_mask = 0; - for (u32 i = 0; i < a->ncallee_saves; ++i) { - if (a->callee_saves[i].cls == NATIVE_REG_INT) { - cs_int_mask |= (1 << a->callee_saves[i].reg); - } else { - cs_fp_mask |= (1 << a->callee_saves[i].reg); - } - } - - u32 frame_size = x_compute_frame_size(a, cs_int_mask, cs_fp_mask); - a->frame_size_final = frame_size; - - if (!x_can_omit_frame(a, cs_int_mask, cs_fp_mask)) { - // Patch prologue placeholder. - u8 buf[X64_PROLOGUE_BYTES_WIN64]; - for (u32 i = 0; i < sizeof buf; ++i) buf[i] = 0x90; - u32 chkstk_disp_pos = (u32)-1; - u32 nbytes = x_build_prologue(t, buf, sizeof buf, frame_size, - cs_int_mask, cs_fp_mask, &chkstk_disp_pos); - obj_patch(t->obj, a->func->text_section_id, a->prologue_pos, buf, nbytes); - if (chkstk_disp_pos != (u32)-1) { - ObjSymId chk = x_chkstk_sym(t); - mc->emit_reloc_at(mc, a->func->text_section_id, - a->prologue_pos + chkstk_disp_pos, R_X64_PLT32, chk, -4, 1, 0); - } - } - -emit_epilogue: - // Place epilogue label (target of tail-call or exception-unwind). - mc->label_place(mc, a->epilogue_label); - - // Restore callee-saved XMMs (Win64). - u32 xmm_base = (a->has_sret ? 8 : 0) + a->cum_off; - for (i32 i = (i32)a->ncallee_saves - 1; i >= 0; --i) { - if (a->callee_saves[i].cls != NATIVE_REG_FP) continue; - u32 xmm = a->callee_saves[i].reg; - u32 idx = /* position in order */; - i32 off = -(i32)(xmm_base + (idx + 1) * 16); - emit_sse_load(mc, 0, 0x28, xmm, X64_RBP, off); // movaps xmm, [rbp+off] - } - - // Restore callee-saved GPRs. 
- u32 cs_fp_count = /* count of XMMs saved */; - for (i32 i = (i32)a->ncallee_saves - 1; i >= 0; --i) { - if (a->callee_saves[i].cls != NATIVE_REG_INT) continue; - u32 reg = a->callee_saves[i].reg; - u32 idx = /* position in order */; - i32 off = -(i32)(xmm_base + cs_fp_count * 16 + (idx + 1) * 8); - emit_mov_load(mc, 8, 0, reg, X64_RBP, off); - } - - // leave; ret (2 bytes) - emit_leave(mc); // 0xC9 - emit_ret(mc); // 0xC3 - - // Patch alloca sites with final max_outgoing. - for (u32 i = 0; i < a->npatches; ++i) { - if (a->patches[i].kind != X64_PATCH_ALLOCA) continue; - u8 dbuf[4]; - wr_u32_le(dbuf, a->max_outgoing); - obj_patch(t->obj, a->func->text_section_id, a->patches[i].pos, dbuf, 4); - } - - // Define function symbol. - u32 end = mc->pos(mc); - obj_symbol_define(t->obj, a->func->sym, a->func->text_section_id, - (u64)a->func_start, (u64)(end - a->func_start)); - if (a->func->atomize) { - obj_atom_define(t->obj, a->func->text_section_id, a->func_start, - end - a->func_start, a->func->sym, 0); - } - if (t->debug) { - debug_func_pc_range(t->debug, a->func->text_section_id, a->func_start, end); - } - - mc->cfi_endproc(mc); - mc_end_function(mc); - a->func = NULL; -} -``` - -### x_compute_frame_size (helper) - -```c -static u32 x_compute_frame_size(const X64NativeTarget* a, - u32 cs_int_mask, - u32 cs_fp_mask) { - u32 xmm_base = (a->has_sret ? 8 : 0) + a->cum_off; - u32 cs_gpr_count = __builtin_popcount(cs_int_mask); - u32 cs_xmm_count = __builtin_popcount(cs_fp_mask); - u32 raw = a->max_outgoing + cs_gpr_count * 8 + cs_xmm_count * 16 + xmm_base; - u32 frame_size = align_up_u32(raw, 16); - return frame_size ? frame_size : 16; // never 0 -} -``` - -### bind_param (NativeTarget hook) - -**Purpose:** Move incoming parameter from ABI register/stack location into the caller-selected destination (hard reg or frame slot). 
- -**Pseudo-code (SysV example):** - -```c -static void x_bind_param(NativeTarget* t, const CGParamDesc* p, NativeLoc dst) { - X64NativeTarget* a = x64_of(t); - MCEmitter* mc = t->mc; - - if (dst.kind == NATIVE_LOC_NONE) { - // Parameter unused; just advance the ABI cursor. - x_consume_param_location(a, p->abi); - return; - } - - const ABIArgInfo* ai = p->abi; - if (!ai || ai->kind == ABI_ARG_IGNORE) return; - - // Incoming stack bias: the offset from rbp to the first stack-passed arg. - // On entry, the return address is at [rsp], and rbp = rsp + 8 after `push rbp`. - // So [rbp + 16] is the first incoming stack arg (SysV); Win64 adds shadow_space (32). - i32 incoming_stack_bias = 16 + a->abi->shadow_space; - - // Handle INDIRECT (byval): incoming is a pointer to the actual data. - if (ai->kind == ABI_ARG_INDIRECT) { - u32 ptr_reg; - if (a->next_param_int < a->abi->n_int_args) { - ptr_reg = a->abi->int_args[a->next_param_int++]; - } else { - ptr_reg = X64_R11; // scratch - emit_mov_load(mc, 8, 0, ptr_reg, X64_RBP, incoming_stack_bias + (i32)a->next_param_stack); - a->next_param_stack += 8; - } - - // Copy byval data from [ptr_reg] into dst. - if (dst.kind == NATIVE_LOC_FRAME) { - X64NativeSlot* s = x64_slot_get(a, dst.v.frame); - // memcpy [rbp - s->off], [ptr_reg], p->size - u32 nbytes = p->size; - for (u32 off = 0; off < nbytes; off += 8) { - emit_mov_load(mc, 8, 0, X64_RAX, ptr_reg, (i32)off); - emit_mov_store(mc, 8, X64_RAX, X64_RBP, -(i32)s->off + (i32)off); - } - } - return; - } - - // Handle DIRECT: one or more ABI parts (int/FP scalars, or aggregate pieces). - if (ai->kind == ABI_ARG_DIRECT || ai->kind == ABI_ARG_EXPAND) { - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* pt = &ai->parts[i]; - u32 part_size = pt->size; - NativeLoc part_dst = dst; // same destination for all parts (or split across?) 
- - if (pt->cls == ABI_CLASS_INT) { - u32 src_reg; - if (a->next_param_int < a->abi->n_int_args) { - src_reg = a->abi->int_args[a->next_param_int++]; - } else { - src_reg = X64_RAX; // load from stack - emit_mov_load(mc, part_size, 0, src_reg, X64_RBP, - incoming_stack_bias + (i32)a->next_param_stack); - a->next_param_stack += 8; - } - - // Move src_reg to dst. - if (dst.kind == NATIVE_LOC_REG) { - if (dst.v.reg != src_reg) { - emit_mov_rr(mc, part_size == 8, dst.v.reg & 15, src_reg & 15); - } - } else if (dst.kind == NATIVE_LOC_FRAME) { - X64NativeSlot* s = x64_slot_get(a, dst.v.frame); - emit_mov_store(mc, part_size, src_reg, X64_RBP, -(i32)s->off); - } - } else if (pt->cls == ABI_CLASS_FP) { - u32 src_xmm; - u8 prefix = (part_size == 8) ? 0xF2 : 0xF3; // MOVSD vs. MOVSS - if (a->next_param_fp < a->abi->n_fp_args) { - src_xmm = a->next_param_fp++; - } else { - src_xmm = 0; // XMM0 - emit_sse_load(mc, prefix, 0x10, src_xmm, X64_RBP, - incoming_stack_bias + (i32)a->next_param_stack); - a->next_param_stack += 8; - } - - // Move src_xmm to dst. 
- if (dst.kind == NATIVE_LOC_REG) { - if (dst.v.reg != src_xmm) { - emit_sse_rr(mc, prefix, 0x10, dst.v.reg & 15, src_xmm & 15); - } - } else if (dst.kind == NATIVE_LOC_FRAME) { - X64NativeSlot* s = x64_slot_get(a, dst.v.frame); - emit_sse_store(mc, prefix, 0x11, src_xmm, X64_RBP, -(i32)s->off); - } - } - } - return; - } -} -``` - ---- - -## Summary: Files to Create/Modify - -### Create -- **`src/arch/x64/native.c`** (~2500 lines, modeled after rv64/native.c) - - X64NativeTarget struct, lifecycle hooks (func_begin, func_end, bind_param) - - Byte-encoder inline wrappers (rv_* → x_* pattern) - - All NativeTarget vtable hooks - - x64_native_target_new constructor - -- **`src/arch/x64/emit.h`** (~300 lines) - - Declarations of all byte-encoder functions - - Inline helpers (modrm, sib, REX builder) - - Immediates-legality checks - - No struct/state definitions; pure functions - -### Modify -- **`src/arch/x64/emit.c`** (legacy, commit 429defa) - - Extract all byte-encoder bodies into `.h` inline / in native.c - - Keep only the byte-emission function definitions (for asm.c to link) - - Delete XImpl/XSlot struct definitions, XImpl-only helpers - - Delete x_* semantic wrappers (they move to native.c) - -- **`src/arch/x64/asm.c`** - - Replace `#include "arch/x64/internal.h"` with `#include "arch/x64/emit.h"` - - No other functional changes - -### Delete -- **`src/arch/x64/internal.h`** (subsumed by native.c's X64NativeTarget) -- **`src/arch/x64/ops.c`** (legacy vtable; body moves to native.c) -- **`src/arch/x64/alloc.c`** (legacy frame/param; body moves to native.c) -- **`src/arch/x64/opt_coord.c`** (legacy reg tables; move to native.c) - -### Keep (Already Compiling) -- `src/arch/x64/isa.h` (opcode constants) -- `src/arch/x64/isa.c` (disasm) -- `src/arch/x64/regs.c` (DWARF names) -- `src/arch/x64/link.c` (linker integration) -- `src/arch/x64/dbg.c` (debugging) -- `src/arch/x64/disasm.c` (disassembly) -- `src/arch/x64/x64.h` (public header if any) - ---- - -## Key References 
& Constants - -From `/Users/ryan/code/cfree/src/arch/x64/isa.h`: -```c -enum { - X64_RAX = 0, ..., X64_R15 = 15, /* GPR encoding (DWARF numbering) */ - X64_XMM0 = 0, ..., X64_XMM15 = 15, - X64_CC_* = 0x0..0xF, /* condition codes for Jcc/SETcc/CMOVcc */ -}; -#define X64_REX_BASE 0x40u -#define X64_REX_W 0x08u -#define X64_REX_R 0x04u -#define X64_REX_X 0x02u -#define X64_REX_B 0x01u -``` - -From legacy emit.c (`git show 429defa:src/arch/x64/emit.c`): -```c -#define X64_PROLOGUE_BYTES 96u /* SysV budget */ -#define X64_PROLOGUE_BYTES_WIN64 192u /* Win64 budget */ -#define X64_WIN64_SHADOW_SPACE 32u /* home space */ -#define X64_WIN64_CHKSTK_THRESHOLD 4096u /* stack probe threshold */ -#define X64_MAX_CS_INT_REGS 7u /* SysV 5 + Win64 +2 for RDI/RSI */ -#define X64_MAX_CS_FP_REGS 10u /* Win64 XMM6..15 */ -``` - ---- - -## Notes for Implementation - -1. **ABI Dispatch:** Use `x64_abi_for_os(t->c->target.os)` once at func_begin to resolve the ABI struct. Store in `a->abi` so all parameter/call logic reads from one place. - -2. **Frame Slot Offsets:** RBP-relative offsets are always negative. A slot with `off` bytes is at address `rbp - off`. Incoming stack args are at positive offsets from RBP (e.g., `[rbp + 16 + shadow + byte_off]`). - -3. **Emit Patterns:** Follow rv64/native.c's pattern: small inline wrappers (rv_reg_loc, rv_stack_loc, rv_mem_for_type) that construct NativeLoc/MemAccess, then call the byte-encoder (rv_emit_li64, rv_emit_addr_adjust, etc.). This keeps semantic logic tight and encoders reusable. - -4. **Placeholder & Patch:** Single-pass -O0 reserves X64_PROLOGUE_BYTES of NOPs at prologue_pos, then patches at func_end once max_outgoing/callee-saves are known. Known-frame emits immediately with the final size. - -5. **Callee-Save Tracking:** The allocator provides a bitmask per reg class (int/fp) in reserve_callee_saves or known_frame. 
Collect from the allocator, then lay them out in x_build_prologue and epilogue in reverse order (highest reg first in the stack layout). - -6. **Win64 Specifics:** - - Shadow space: first 4 args get 32 bytes of home slots, so stack args start at rbp+48. - - Stack probing: frame > 4096 → `mov eax, N; call __chkstk; sub rsp, rax`. - - Callee-saved XMMs: Win64 must save xmm6–15; SysV never saves XMMs (all caller-saved). - -7. **Relocation:** Emit R_X64_PLT32 for the __chkstk call site with addend -4 (PC-relative at end of insn). - - - ---- - -# x64 NativeTarget Backend Port: Register Tables & Legality (GROUP 2) - -## Overview - -This guide produces the exact x64 register metadata tables (NativePhysRegInfo, NativeAllocClassInfo, NativeRegInfo) required by the NativeTarget contract, along with ABI-abstract routing for SysV and Win64. The source is legacy commit 429defa (src/arch/x64/{ops,emit,alloc,opt_coord}.c and src/arch/x64/internal.h), which encodes the x64 register landscape and calling conventions. - ---- - -## Register Enumeration (Hardware Model) - -From `src/arch/x64/isa.h` (lines 36–68): - -**Integer Registers (16 total, hardware ModRM/REX encoding 0–15; DWARF numbering differs for RCX/RDX/RSI/RDI/RBP/RSP):** -``` -X64_RAX = 0, X64_RCX = 1, X64_RDX = 2, X64_RBX = 3, -X64_RSP = 4, X64_RBP = 5, X64_RSI = 6, X64_RDI = 7, -X64_R8 = 8, X64_R9 = 9, X64_R10 = 10, X64_R11 = 11, -X64_R12 = 12, X64_R13 = 13, X64_R14 = 14, X64_R15 = 15, -``` - -**FP/SSE Registers (16 total, XMM0–XMM15, encoding 0–15):** -``` -X64_XMM0 = 0 through X64_XMM15 = 15 -``` - ---- - -## ABI Constants & SysV vs Win64 Differences - -From `git show 429defa:src/arch/x64/emit.c` (lines 15–65): - -### SysV x86-64 (Linux, BSD, most Unix): -- **Int arg regs (6):** RDI, RSI, RDX, RCX, R8, R9 -- **FP arg regs (8):** XMM0–XMM7 -- **Callee-saved int regs:** RBX, RBP, R12, R13, R14, R15 (5 + RBP = 6 total) - - cs_int_mask = `(1ull << RBX) | (1ull << RBP) | (1ull << R12) | (1ull << R13) | (1ull << R14) | (1ull << R15)` -- **Callee-saved FP regs:** none - - cs_fp_mask = 0 -- **Return 
regs:** RAX, RDX (int); XMM0, XMM1 (FP) -- **Shadow space:** 0 (stack-pass args directly after return addr) -- **Variadic save:** 176-byte __va_list_tag register-save area emitted by prologue - -### Win64 x86-64 (Windows): -- **Int arg regs (4):** RCX, RDX, R8, R9 -- **FP arg regs (4):** XMM0–XMM3 -- **Callee-saved int regs:** RBX, RBP, R12–R15, **RDI, RSI** (7 + RBP = 8 total, 2 more than SysV) - - cs_int_mask = `(1ull << RBX) | (1ull << RBP) | (1ull << R12) | (1ull << R13) | (1ull << R14) | (1ull << R15) | (1ull << RDI) | (1ull << RSI)` -- **Callee-saved FP regs:** XMM6–XMM15 (10 regs) - - cs_fp_mask = `(1ull << XMM6) | (1ull << XMM7) | (1ull << XMM8) | ... | (1ull << XMM15)` (all 10 bits set) -- **Return regs:** RAX, RDX (int); XMM0, XMM1 (FP) -- **Shadow space:** 32 bytes (4 × 8B home slots for first 4 args, even if passed in regs) -- **Variadic:** no register-save area; variadic FP args must be duplicated into matching GPR for stack-offset tracking -- **Stack probe:** Win64 requires a `__chkstk` probe call for frame allocations > 4096 bytes - ---- - -## Legacy Register Pool Extraction - -From `git show 429defa:src/arch/x64/opt_coord.c` (lines 4–49): - -### Allocable Register Pools (for optimizer spill/reload): -```c -// INT allocable (4 regs for opt, excludes callee-saves used by -O0 single-pass) -static const Reg x_int_allocable[] = {X64_R13, X64_R14, X64_R15, X64_R10}; - -// FP allocable (8 regs for opt) -static const Reg x_fp_allocable[] = { - X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, - X64_XMM0 + 10, X64_XMM0 + 11, X64_XMM0 + 12, X64_XMM0 + 13 -}; -``` - -**Key insight:** The optimizer's allocable set is curated to avoid the backend's internal scratch registers (RAX, R11 for int; XMM14, XMM15 for FP), leaving those free for lowering paths. 
- -### Scratch Registers (for emit-internal temporary use): -```c -// INT scratch: must NOT overlap allocable or ABI-protected regs -static const Reg x_int_scratch[] = {X64_RBX, X64_R12}; - -// FP scratch -static const Reg x_fp_scratch[] = {X64_XMM0 + 14, X64_XMM15}; // XMM14, XMM15 -``` - -**Reserve regs (never allocable/scratch):** RAX, RBP (frame ptr), RSP (stack ptr), R11 (internal scratch). - ---- - -## NativePhysRegInfo Tables (Legacy to NativeTarget Mapping) - -From `git show 429defa:src/arch/x64/opt_coord.c` (lines 52–95), we reconstruct the full register inventory: - -### x64 Integer Register Set (16 regs): - -```c -// Full NativePhysRegInfo[] for INT class (all 16 GPRs) -// Placeholder frame: all are (spill_cost=?, copy_cost=?) -// Legacy opt_coord.c does not expose these costs; use default 0. - -static const NativePhysRegInfo x64_int_phys[] = { - // ABI arg regs (SysV or Win64, resolved via ABIFuncInfo) - // SysV: RDI=0, RSI=1, RDX=2, RCX=3, R8=4, R9=5; legacy used abi_index for ordering - // Win64: RCX=0, RDX=1, R8=2, R9=3; RDI/RSI not args - - {X64_RAX, NATIVE_REG_INT, 0xff, NATIVE_REG_RESERVED | NATIVE_REG_RET, 0, 0}, - // RAX: return value, reserved scratch (not allocable) - - {X64_RCX, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLER_SAVED, 0, 0}, - // RCX: arg reg (Win64 only, or general caller-saved on SysV); 0xff = no fixed ABI position - - {X64_RDX, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLER_SAVED | NATIVE_REG_RET, 0, 0}, - // RDX: return value, caller-saved (arg on SysV) - - {X64_RBX, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLEE_SAVED, 0, 0}, - // RBX: callee-saved (both SysV/Win64) - - {X64_RSP, NATIVE_REG_INT, 0xff, NATIVE_REG_RESERVED, 0, 0}, - // RSP: stack pointer, reserved - - {X64_RBP, NATIVE_REG_INT, 0xff, NATIVE_REG_RESERVED, 0, 0}, - // RBP: frame pointer, reserved (saved/restored by prologue) - - {X64_RSI, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLER_SAVED, 0, 0}, - // RSI: arg reg (SysV, or callee-saved on Win64) - - {X64_RDI, NATIVE_REG_INT, 0xff, 
NATIVE_REG_CALLER_SAVED, 0, 0}, - // RDI: arg reg (SysV, or callee-saved on Win64) - - {X64_R8, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLER_SAVED, 0, 0}, - // R8: arg reg (both ABIs) - - {X64_R9, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLER_SAVED, 0, 0}, - // R9: arg reg (both ABIs) - - {X64_R10, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLER_SAVED | NATIVE_REG_TEMP_PREFERRED, 0, 0}, - // R10: caller-saved, scratch-preferred (used for internal ops) - - {X64_R11, NATIVE_REG_INT, 0xff, NATIVE_REG_RESERVED | NATIVE_REG_CALLER_SAVED, 0, 0}, - // R11: reserved scratch (used by emit paths for immediates, not allocable) - - {X64_R12, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLEE_SAVED, 50, 4}, - // R12: callee-saved (both ABIs); spill_cost/copy_cost from legacy table - - {X64_R13, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLEE_SAVED, 50, 4}, - // R13: callee-saved (both ABIs) - - {X64_R14, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLEE_SAVED, 50, 4}, - // R14: callee-saved (both ABIs) - - {X64_R15, NATIVE_REG_INT, 0xff, NATIVE_REG_CALLEE_SAVED, 50, 4}, - // R15: callee-saved (both ABIs) -}; -``` - -### x64 FP/SSE Register Set (16 regs): - -```c -static const NativePhysRegInfo x64_fp_phys[] = { - // Arg/return regs (SysV XMM0–7 are args, all caller-saved; Win64 XMM0–3 args, XMM6–15 callee-saved) - - {X64_XMM0, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RET, 0, 0}, - {X64_XMM1, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RET, 0, 0}, - {X64_XMM2, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, 0, 0}, - {X64_XMM3, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, 0, 0}, - {X64_XMM4, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, 0, 0}, - // XMM4–5 are arg-passable on SysV (arg 5–6 of 8); not args on Win64 but still caller-saved - {X64_XMM5, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | 
NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, 0, 0}, - - // XMM6–7: args on SysV (7–8), but on Win64 both are callee-saved (not args) - {X64_XMM6, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, 0, 0}, - {X64_XMM7, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, 0, 0}, - - // XMM8–15: caller-saved on SysV (not args); callee-saved on Win64 - {X64_XMM8, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, 0, 0}, - {X64_XMM9, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, 0, 0}, - {X64_XMM10, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, 0, 0}, - {X64_XMM11, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, 0, 0}, - {X64_XMM12, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, 0, 0}, - {X64_XMM13, NATIVE_REG_FP, 0xff, NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, 0, 0}, - - // XMM14: caller-saved, reserved for emit scratch (like R11 for int) - {X64_XMM14, NATIVE_REG_FP, 0xff, NATIVE_REG_RESERVED | NATIVE_REG_CALLER_SAVED, 0, 0}, - - // XMM15: caller-saved on SysV, callee-saved on Win64; reserved for scratch on both - {X64_XMM15, NATIVE_REG_FP, 0xff, NATIVE_REG_RESERVED | NATIVE_REG_CALLER_SAVED, 0, 0}, -}; -``` - -**Key abi_index note:** The legacy code sets abi_index = 0–5 for SysV arg regs (RDI–R9), 0xff for non-args. The NativeTarget contract uses abi_index for **ordered ABI sequencing** in call/return marshalling. Since x64 routing differs by OS, this must be resolved **per-OS** at initialization: either the per-OS NativeRegInfo is built at native_new time, or the NativeTarget legality hooks query the ABI directly. 
- ---- - -## NativeAllocClassInfo Structure Definition - -From `src/arch/native_target.h` (lines 95–113), each class needs: - -```c -typedef struct NativeAllocClassInfo { - u8 cls; // NATIVE_REG_INT or NATIVE_REG_FP - u8 pad[3]; - - const Reg* allocable; // array of allocable register enums - u32 nallocable; // count - - const Reg* scratch; // array of emit-internal scratch regs - u32 nscratch; // count - - const NativePhysRegInfo* phys; // full register inventory for this class - u32 nphys; // count (16 for INT, 16 for FP) - - u32 caller_saved_mask; // bitmask of caller-saved regs in this class - u32 callee_saved_mask; // bitmask of callee-saved regs in this class - u32 arg_mask; // bitmask of argument-passing regs - u32 ret_mask; // bitmask of return-value regs - u32 reserved_mask; // bitmask of reserved (non-allocable) regs -} NativeAllocClassInfo; -``` - -### Concrete x64 INT class: - -```c -// Allocable: {R13, R14, R15, R10} -static const Reg x64_int_allocable[] = {X64_R13, X64_R14, X64_R15, X64_R10}; - -// Scratch: {RBX, R12} (not allocable; used internally by emit) -static const Reg x64_int_scratch[] = {X64_RBX, X64_R12}; - -static const NativeAllocClassInfo x64_int_class_sysv = { - .cls = NATIVE_REG_INT, - .allocable = x64_int_allocable, - .nallocable = 4, - .scratch = x64_int_scratch, - .nscratch = 2, - .phys = x64_int_phys, - .nphys = 16, - - // SysV: all GPRs except RBP/RSP are either caller or callee-saved - .caller_saved_mask = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX) | - (1u << X64_RSI) | (1u << X64_RDI) | (1u << X64_R8) | - (1u << X64_R9) | (1u << X64_R10) | (1u << X64_R11), - - .callee_saved_mask = (1u << X64_RBX) | (1u << X64_RBP) | (1u << X64_R12) | - (1u << X64_R13) | (1u << X64_R14) | (1u << X64_R15), - - .arg_mask = (1u << X64_RDI) | (1u << X64_RSI) | (1u << X64_RDX) | - (1u << X64_RCX) | (1u << X64_R8) | (1u << X64_R9), // 6 args - - .ret_mask = (1u << X64_RAX) | (1u << X64_RDX), // RAX, RDX - - .reserved_mask = (1u << X64_RSP) | 
(1u << X64_RBP) | (1u << X64_RAX) | - (1u << X64_R11), // RAX and R11 are emit-internal scratch -}; - -static const NativeAllocClassInfo x64_int_class_win64 = { - .cls = NATIVE_REG_INT, - .allocable = x64_int_allocable, // {R13, R14, R15, R10} same - .nallocable = 4, - .scratch = x64_int_scratch, - .nscratch = 2, - .phys = x64_int_phys, - .nphys = 16, - - // Win64: RDI/RSI are callee-saved (unlike SysV) - .caller_saved_mask = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX) | - (1u << X64_R8) | (1u << X64_R9) | (1u << X64_R10) | - (1u << X64_R11), - - .callee_saved_mask = (1u << X64_RBX) | (1u << X64_RBP) | (1u << X64_R12) | - (1u << X64_R13) | (1u << X64_R14) | (1u << X64_R15) | - (1u << X64_RDI) | (1u << X64_RSI), // +2 vs SysV - - .arg_mask = (1u << X64_RCX) | (1u << X64_RDX) | (1u << X64_R8) | - (1u << X64_R9), // 4 args - - .ret_mask = (1u << X64_RAX) | (1u << X64_RDX), - - .reserved_mask = (1u << X64_RSP) | (1u << X64_RBP) | (1u << X64_RAX) | - (1u << X64_R11), -}; -``` - -### Concrete x64 FP class: - -```c -static const Reg x64_fp_allocable[] = { - X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM9, - X64_XMM10, X64_XMM11, X64_XMM12, X64_XMM13 -}; - -static const Reg x64_fp_scratch[] = {X64_XMM14, X64_XMM15}; - -// SysV: all XMMs are caller-saved; none are callee-saved -static const NativeAllocClassInfo x64_fp_class_sysv = { - .cls = NATIVE_REG_FP, - .allocable = x64_fp_allocable, - .nallocable = 8, - .scratch = x64_fp_scratch, - .nscratch = 2, - .phys = x64_fp_phys, - .nphys = 16, - - .caller_saved_mask = 0xFFFF, // all 16 XMMs - .callee_saved_mask = 0, // none - - .arg_mask = 0xFF, // XMM0–7 - - .ret_mask = (1u << X64_XMM0) | (1u << X64_XMM1), - - .reserved_mask = (1u << X64_XMM14) | (1u << X64_XMM15), -}; - -// Win64: XMM0–5 caller-saved (4 args + 2 more), XMM6–15 callee-saved -static const NativeAllocClassInfo x64_fp_class_win64 = { - .cls = NATIVE_REG_FP, - .allocable = x64_fp_allocable, // XMM6–13 (Win64 allocable: callee-saved + non-arg caller-saved) - 
.nallocable = 8, - .scratch = x64_fp_scratch, - .nscratch = 2, - .phys = x64_fp_phys, - .nphys = 16, - - .caller_saved_mask = 0x3F, // XMM0–5 (args 0–3, overflow args 4–5) - .callee_saved_mask = 0xFFC0, // XMM6–15 (10 regs) - - .arg_mask = 0x0F, // XMM0–3 (4 args only) - - .ret_mask = (1u << X64_XMM0) | (1u << X64_XMM1), - - .reserved_mask = (1u << X64_XMM14) | (1u << X64_XMM15), -}; -``` - ---- - -## NativeRegInfo & ABI-Abstract Initialization - -From `src/arch/native_target.h` (lines 115–124): - -```c -typedef struct NativeRegInfo { - const NativeAllocClassInfo* classes; // array of per-class info - u32 nclasses; // 2: INT + FP (VEC optional) - - int (*resolve_name)(const NativeRegInfo*, Sym name, Reg* out, NativeAllocClass* cls_out); - const char* (*debug_name)(const NativeRegInfo*, NativeAllocClass, Reg); - u32 (*dwarf_reg)(const NativeRegInfo*, NativeAllocClass, Reg); -} NativeRegInfo; -``` - -### x64 Implementation Sketch (per-OS): - -```c -// Helper: dispatch to SysV or Win64 based on compiler OS -static const NativeAllocClassInfo* x64_get_classes(Compiler* c, u32* out_nclasses) { - static const NativeAllocClassInfo x64_classes_sysv[2] = {x64_int_class_sysv, x64_fp_class_sysv}; - static const NativeAllocClassInfo x64_classes_win64[2] = {x64_int_class_win64, x64_fp_class_win64}; - - if (c->target.os == CFREE_OS_WINDOWS) { - *out_nclasses = 2; - return x64_classes_win64; - } - *out_nclasses = 2; - return x64_classes_sysv; -} - -static int x64_resolve_name(const NativeRegInfo* ri, Sym name, Reg* out, NativeAllocClass* cls_out) { - // Use regs.c x64_register_hw_index or x64_register_index (which returns DWARF index) - // to resolve name to hardware register number, then classify as INT or FP - Slice ns = pool_slice(/* compiler pool */, name); - // ... (see alloc.c x_resolve_reg_name for full mapping) -} - -static const char* x64_debug_name(const NativeRegInfo* ri, NativeAllocClass cls, Reg reg) { - // Return assembler name: "rax", "xmm0", etc. 
Use regs.c x64_register_name(dwarf_idx) - if (cls == NATIVE_REG_INT) { - return x64_register_name(reg); // reg is hardware index 0–15 - } else if (cls == NATIVE_REG_FP) { - // XMM reg: return x64_register_name(17 + reg) for DWARF mapping - return x64_register_name(17 + reg); - } - return NULL; -} - -static u32 x64_dwarf_reg(const NativeRegInfo* ri, NativeAllocClass cls, Reg reg) { - // DWARF numbering: GPR 0–15 = DWARF 0–15; XMM0–15 = DWARF 17–32 - if (cls == NATIVE_REG_INT) return (u32)reg; - if (cls == NATIVE_REG_FP) return 17u + (u32)reg; - return 0xffffffffu; -} - -static NativeRegInfo x64_reg_info_sysv = { - .classes = x64_classes_sysv, - .nclasses = 2, - .resolve_name = x64_resolve_name, - .debug_name = x64_debug_name, - .dwarf_reg = x64_dwarf_reg, -}; - -static NativeRegInfo x64_reg_info_win64 = { - .classes = x64_classes_win64, - .nclasses = 2, - .resolve_name = x64_resolve_name, - .debug_name = x64_debug_name, - .dwarf_reg = x64_dwarf_reg, -}; - -// At x64_native_target_new(Compiler* c, ...): -const NativeRegInfo* x64_get_reg_info(Compiler* c) { - return (c->target.os == CFREE_OS_WINDOWS) ? 
&x64_reg_info_win64 : &x64_reg_info_sysv; -} -``` - ---- - -## Operand & Addressing Legality (class_for_type, imm_legal, addr_legal) - -### class_for_type: - -```c -static NativeAllocClass x64_class_for_type(NativeTarget* nt, CfreeCgTypeId type) { - // Dispatch on type_is_fp_scalar / type_is_fp_double - CfreeCgTypeInfo ti = cg_type_info(nt->c, type); - if (ti.scalar_kind == ABI_SC_FLOAT || ti.scalar_kind == ABI_SC_DOUBLE) { - // But this NativeTarget is -O0 direct emission, so no intrinsic SIMD yet - // For now: all FP → NATIVE_REG_FP - return NATIVE_REG_FP; - } - return NATIVE_REG_INT; -} -``` - -### imm_legal: - -From `git show 429defa:src/arch/x64/ops.c` (immediate-legality check patterns): - -```c -static int x64_imm_legal(NativeTarget* nt, NativeImmUse use, u32 op, CfreeCgTypeId type, i64 imm) { - // x64 immediates: - // - MOV: imm32 sign-extended to 64, or movabs imm64 (10 bytes, expensive) - // - ALU (ADD/SUB/AND/OR/XOR/CMP): imm8 (1 byte) or imm32 (sign-extended to 64) - // - SHIFT: imm8 (1 byte) or CL register - // - Large imms: must use movabs or two-step (mov imm32, shift+or imm32) - - switch (use) { - case NATIVE_IMM_MOVE: - // MOV is always legal; will use movabs if imm doesn't fit i32 - return 1; - case NATIVE_IMM_BINOP: - // ALU/CMP operand: must fit i8 or i32 - return (imm >= -128 && imm <= 127) // i8 - || (imm >= -2147483648LL && imm <= 2147483647LL); // i32 sign-extended - case NATIVE_IMM_CMP: - // CMP is an ALU op, same rules - return imm >= -128 && imm <= 127 || - (imm >= -2147483648LL && imm <= 2147483647LL); - case NATIVE_IMM_ADDR_OFFSET: - // disp32 in addressing mode: [base + disp32] - return imm >= -2147483648LL && imm <= 2147483647LL; - default: - return 0; - } -} -``` - -### addr_legal: - -From legacy ops.c (addr_mode, emit_global_lea patterns): - -```c -static int x64_addr_legal(NativeTarget* nt, const NativeAddr* addr, MemAccess mem) { - // x64 addressing: [base + index<<scale + disp32] - // - base: any GPR (or RIP for RIP-relative 
literals) - // - index: any GPR except RSP (base register cannot be index) - // - scale: 1, 2, 4, 8 (log2_scale 0–3) - // - disp32: signed i32 (-2^31 .. 2^31-1) - - // The NativeDirect path materializes index into a register before the load, - // so we only check validity of the mode itself (no index register aliasing). - - if (addr->base_kind == NATIVE_ADDR_BASE_NONE) return 0; - if (addr->offset < -2147483648LL || addr->offset > 2147483647LL) return 0; - - // x64 permits index scaling; log2_scale is 0–3 (scales 1, 2, 4, 8) - if (addr->log2_scale > 3) return 0; - - // All base kinds (REG, FRAME, FRAME_VALUE, GLOBAL) are valid; - // the backend resolves them to physical addresses before emission. - return 1; -} -``` - ---- - -## Reserved Registers & Scratch Strategy - -From `git show 429defa:src/arch/x64/internal.h` (prologue budgets, register counts): - -**Reserved (not allocable by register allocator):** -- RAX: return value, emit-internal scratch for immediates (movabs, load-const relocs) -- R11: emit-internal scratch for immediates and address calculations -- RBP: frame pointer (saved/restored by prologue; restored by epilogue) -- RSP: stack pointer - -**Callee-saved GPRs (both ABIs, saved/restored automatically by prologue/epilogue):** -- RBX, R12, R13, R14, R15 (5 regs on SysV) -- **Win64 adds:** RDI, RSI (2 extra = 7 total) - -**Callee-saved XMMs (Win64 only; SysV none):** -- XMM6–XMM15 (10 regs on Win64; must be saved if used) - ---- - -## Practical Pseudo-Code: Materialization in NativeTarget Hooks - -### bind_param (incoming argument binding): - -```c -void x64_bind_param(NativeTarget* nt, const CGParamDesc* pd, NativeLoc dst) { - // Query ABIFuncInfo for the current function's parameter layout - const ABIFuncInfo* abi = abi_cg_func_info(nt->c->abi, /* fn_type */); - const ABIArgInfo* ai = &abi->params[param_index]; // resolved by allocator - - // Materialize the source location from the ABI (register or stack) - NativeLoc src = {0}; - if (ai->kind == 
ABI_ARG_INDIRECT) { - // Sret or byval: caller passes address in first int arg register - u32 arg_idx = x64_next_param_int++; // 0 = RDI/RCX, etc. - src.kind = NATIVE_LOC_REG; - src.cls = NATIVE_REG_INT; - src.v.reg = get_int_arg_reg(abi, arg_idx); - } else if (ai->kind == ABI_ARG_DIRECT) { - // Direct args: split across registers and stack per ABIArgPart - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* part = &ai->parts[i]; - if (part->cls == ABI_CLASS_INT && x64_next_param_int < abi->n_int_args) { - src.kind = NATIVE_LOC_REG; - src.cls = NATIVE_REG_INT; - src.v.reg = get_int_arg_reg(abi, x64_next_param_int++); - } else if (part->cls == ABI_CLASS_FP && x64_next_param_fp < abi->n_fp_args) { - src.kind = NATIVE_LOC_REG; - src.cls = NATIVE_REG_FP; - src.v.reg = get_fp_arg_reg(abi, x64_next_param_fp++); - } else { - // Stack-passed: incoming args live above the saved pair - src.kind = NATIVE_LOC_FRAME; - src.v.frame = ...; // compute incoming-stack frame slot - } - } - } - - // Emit move from src (ABI location) to dst (allocator location) - if (dst.kind == NATIVE_LOC_REG) { - nt->move(nt, dst, src); - } else if (dst.kind == NATIVE_LOC_FRAME) { - nt->store(nt, native_addr_of_frame(dst.v.frame), src, mem_access_for_param_type(pd->type)); - } -} -``` - -### plan_call (outgoing argument marshalling): - -```c -void x64_plan_call(NativeTarget* nt, const NativeCallDesc* desc, NativeCallPlan* plan) { - // Query the callee's ABI to decide which registers each argument goes in - const ABIFuncInfo* callee_abi = abi_cg_func_info(nt->c->abi, desc->fn_type); - - u32 stack_arg_size = 0; - u32 arg_idx_int = 0, arg_idx_fp = 0; - - for (u32 i = 0; i < desc->nargs; ++i) { - const ABIArgInfo* ai = &callee_abi->params[i]; - NativeLoc* arg_loc = &desc->args[i]; - - // Decide where this argument goes (register or stack) - for (u16 j = 0; j < ai->nparts; ++j) { - const ABIArgPart* part = &ai->parts[j]; - NativeCallPlanMove* move = &plan->args[plan->nargs++]; - - if (part->cls == 
ABI_CLASS_INT && arg_idx_int < callee_abi->n_int_args) { - move->dst.kind = NATIVE_LOC_REG; - move->dst.cls = NATIVE_REG_INT; - move->dst.v.reg = get_int_arg_reg(callee_abi, arg_idx_int++); - } else if (part->cls == ABI_CLASS_FP && arg_idx_fp < callee_abi->n_fp_args) { - move->dst.kind = NATIVE_LOC_REG; - move->dst.cls = NATIVE_REG_FP; - move->dst.v.reg = get_fp_arg_reg(callee_abi, arg_idx_fp++); - } else { - // Stack-passed argument - move->dst.kind = NATIVE_LOC_STACK; - move->dst.v.stack.slot = 0; // stack offset computed from stack_arg_size - move->dst.v.stack.offset = stack_arg_size; - stack_arg_size += 8; // aligned to 8 on x64 (or ABI-specified stack_align) - } - - move->src = *arg_loc; // pre-materialized by caller - move->src_kind = NATIVE_CALL_MOVE_VALUE; - } - } - - plan->stack_arg_size = align_up(stack_arg_size + abi->shadow_space, 16); - // Win64: add shadow space; SysV: shadow_space = 0 -} -``` - ---- - -## Summary: Data Flow for x64 NativeTarget - -1. **At init (x64_native_target_new):** - - Resolve `const NativeRegInfo* regs = x64_get_reg_info(compiler);` - - Select SysV or Win64 class tables based on compiler->target.os - - Wire up class_for_type, imm_legal, addr_legal hooks - -2. **Per-function (func_begin):** - - Read abi_cg_func_info(callee type) to learn parameter/return layout - - Track next_param_int / next_param_fp indices for incoming args - - Allocate frame slots for sret pointer, variadic save area, etc. - -3. **Per-parameter (bind_param):** - - Consult ABIArgInfo for the parameter's class (INT/FP), location (reg/stack), and part layout - - Materialize the source from the ABI location (e.g., RDI for SysV param 0) - - Emit a move to the allocator-chosen destination - -4. **Per-call (plan_call):** - - Iterate callee ABIArgInfo to decide arg destination (register or outgoing stack) - - Emit setup moves (caller is responsible; NativeTarget validates legality) - -5. 
**Legality (class_for_type, imm_legal, addr_legal):** - - class_for_type: FP type → NATIVE_REG_FP, else NATIVE_REG_INT - - imm_legal: permit i8 or i32 sign-extended immediates for ALU, disp32 for addresses - - addr_legal: validate [base + index<<scale + disp32] format - -This design keeps x64 emission **independent of the allocator** and **agnostic to direct vs. optimized lowering**, while delegating ABI decisions to the abi/ layer (which is already OS-aware). - - - - ---- - -# x64 NativeTarget Porting Guide — GROUP 3: Data Movement, ALU, Flags, Convert, Control Flow - -## Overview - -This guide ports GROUP 3 hooks (move, load_imm, load_const, load_addr, load/store, tls_addr_of, copy_bytes, set_bytes, bitfield_load/store, binop, unop, cmp, convert, alloca_, spill/reload, label/jump, cmp_branch, indirect_branch, load_label_addr) from the disabled x64 legacy backend to the NativeTarget API. The contract is in `src/arch/native_target.h`; working templates are in `src/arch/rv64/native.c` (just finished) and `src/arch/aa64/native.c` (aarch64, both -O0 and -O1+ via optimizer). - -x64 is **two-address** (destination is also a source) and uses the **flags register** instead of materializing condition bits. Division, shifts, and multiplication have special implicit-register requirements (rax/rdx, cl, etc.). The byte-level emit helpers in the legacy `emit.c` (kept and still compiling) are reusable; this guide quotes line ranges and function signatures to call. - ---- - -## Key Differences: x64 vs. Templates - -1. **Flags-based comparisons**: x64 cmp/jcc/setcc set the RFLAGS register; rv64/aa64 materialize 0/1 directly. -2. **Two-address ALU**: Intel alu ops read-modify-write a single operand; ARM64 has three-register forms. -3. **Implicit registers**: div/idiv clobber rax/rdx, mul clobbers rdx, shifts use cl, conversion opcodes hardcode registers. -4. 
**Variable-width encoding**: sizes 1/2/4/8 bytes map to distinct opcodes (movsxd for 32→64, movzx for byte/word, movabs for imm64). -5. **RIP-relative addressing**: PC-relative immediates are -4 addend (end-of-insn) for relocations (R_PC32, R_X64_PLT32, R_X64_REX_GOTPCRELX). -6. **ABI dual-path**: SysV vs. Win64 differ in arg regs, shadow space, callee-save masks. Use `abi_cg_func_info()` to abstract. - ---- - -## Kept Emit.c Encoders (Reusable Byte-Level Helpers) - -File: `git show 429defa:src/arch/x64/emit.c` - -### Low-Level Primitives - -- **`x64_make_rex(w, reg, index, rm)`** (isa.h:374): Build REX byte or return 0. -- **`x64_pack_rex(out, w, reg, index, rm)`** (isa.h:474): Emit optional REX. -- **`x64_pack_mem(out, reg, base, disp)`** (isa.h:418): ModR/M + disp for `[base+disp]`. -- **`x64_pack_mem_sib(out, reg, base, index, log2_scale, disp)`** (isa.h:442): ModR/M + SIB for `[base+index*scale+disp]`. -- **`x64_pack_rm_reg(out, reg, rm)`** (isa.h:468): ModR/M for reg-reg (mod=3). - -### Instruction Encoders (Called from emit.c, wrapped by native.c) - -Each returns byte count; caller reserves ≥16 bytes in buffer. - -**Movement & Loads:** -- **`x64_mov_ri_pack()`** (isa.h:552): MOV r, imm32/imm64 (B8+rd). -- **`x64_mov_rm_load_pack()`** (isa.h:572): MOV r, [base+disp] or LEA. -- **`x64_movzx_rr_pack()`** (isa.h:594): MOVZX/MOVSX r,r (0F B6/B7/BE/BF). -- **`x64_movsxd_pack()`** (isa.h:611): MOVSXD r64, r32 (REX.W 63). - -**SSE (Scalar FP):** -- **`x64_sse_rr_pack()`** (isa.h:743): SSE reg-reg with optional prefix (0x66/0xF2/0xF3). -- **`x64_sse_mem_pack()`** (isa.h:760): SSE load/store via [base+disp]. - -**ALU:** -- **`x64_alu_rr_pack()`** (isa.h:515): op r/m, r (MOV/ADD/SUB/AND/OR/XOR/CMP/TEST). -- **`x64_alu_rm_pack()`** (isa.h:534): op [base+disp], r (memory form). -- **`x64_alu_imm8_pack()`** (isa.h:625): op r/m, imm8 (83 /sub). -- **`x64_alu_imm32_pack()`** (isa.h:640): op r/m, imm32 (81 /sub). -- **`x64_imul_rr_pack()`** (isa.h:654): IMUL r, r (0F AF). 
-- **`x64_imul_rri_pack()`** (isa.h:670): IMUL r, r, imm (69/6B). -- **`x64_f7_rm_pack()`** (isa.h:687): F7 /sub (NOT/NEG/MUL/IMUL/DIV/IDIV). -- **`x64_shift_imm_pack()`** (isa.h:701): SHL/SHR/SAR r, imm8 (C1). -- **`x64_shift_cl_pack()`** (isa.h:715): SHL/SHR/SAR r, cl (D3). - -**Branches & Setcc:** -- **`x64_setcc_pack()`** (isa.h:727): SETcc r8 (0F 9x /0). -- **`x64_nullary_pack()`** (isa.h:496): RET, CQO/CDQ, etc. - -### Called Emit Functions (emit.c impl side, called from native.c) - -Each emits debug row via `debug_emit_row()` when `mc->debug` is set. - -```c -void emit_mov_rr(MCEmitter* mc, int w, u32 dst, u32 src); // MOV w=1→r64, w=0→r32 -void emit_mov_load(MCEmitter* mc, u32 size, int signed_ext, u32 dst, u32 base, i32 disp); // size 1/2/4/8 -void emit_mov_store(MCEmitter* mc, u32 size, u32 src, u32 base, i32 disp); -void emit_lea(MCEmitter* mc, u32 dst, u32 base, i32 disp); // LEA (always 64-bit in our ISA) -void emit_mov_load_idx(MCEmitter* mc, u32 size, int signed_ext, u32 dst, u32 base, u32 index, u32 log2_scale, i32 disp); -void emit_mov_store_idx(MCEmitter* mc, u32 size, u32 src, u32 base, u32 index, u32 log2_scale, i32 disp); -void x64_emit_load_imm(MCEmitter* mc, int is64, u32 dst, i64 imm); // MOV/MOVABS -void emit_alu_rr(MCEmitter* mc, int w, u8 op, u32 dst, u32 src); -void emit_imul_rr(MCEmitter* mc, int w, u32 dst, u32 src); -void emit_f7_rm(MCEmitter* mc, int w, u32 sub, u32 reg); // NOT/NEG/MUL/IMUL/DIV/IDIV -void emit_shift_cl(MCEmitter* mc, int w, u32 sub, u32 reg); -void emit_shift_imm(MCEmitter* mc, int w, u32 sub, u32 reg, u8 imm); -void emit_alu_imm8(MCEmitter* mc, int w, u32 sub, u32 reg, i8 imm); -void emit_alu_imm32(MCEmitter* mc, int w, u32 sub, u32 reg, i32 imm); -void emit_imul_imm8(MCEmitter* mc, int w, u32 dst, u32 src, i8 imm); -void emit_imul_imm32(MCEmitter* mc, int w, u32 dst, u32 src, i32 imm); -void emit_cmp_imm8(MCEmitter* mc, int w, u32 reg, i8 imm); -void emit_test_self(MCEmitter* mc, int w, u32 reg); // TEST r, r 
-void emit_setcc(MCEmitter* mc, u32 cc, u32 reg); // SETcc (cc = X64_CC_*) -void emit_extend_rr(MCEmitter* mc, int w, int signed_ext, u32 src_size, u32 dst, u32 src); -void emit_cqo_or_cdq(MCEmitter* mc, int w); // CQO/CDQ -void emit_xor_self(MCEmitter* mc, int w, u32 r); // XOR r, r (zero) -void emit_movzx_r32_r8(MCEmitter* mc, u32 dst, u32 src); // MOVZX r32, r8 -void emit_ret(MCEmitter* mc); -void emit_sse_rr(MCEmitter* mc, u8 prefix, u8 opcode, u32 dst, u32 src); -void emit_sse_load(MCEmitter* mc, u8 prefix, u8 opcode, u32 dst, u32 base, i32 disp); -void emit_sse_store(MCEmitter* mc, u8 prefix, u8 opcode, u32 src, u32 base, i32 disp); -void emit_sse_load_idx(MCEmitter* mc, u8 prefix, u8 opcode, u32 dst, u32 base, u32 index, u32 log2_scale, i32 disp); -void emit_sse_store_idx(MCEmitter* mc, u8 prefix, u8 opcode, u32 src, u32 base, u32 index, u32 log2_scale, i32 disp); -void emit_sse_rr_w(MCEmitter* mc, u8 prefix, u8 opcode, int w, u32 dst, u32 src); -int imm_fits_i8(i64 imm); -int imm_fits_i32(i64 imm); -``` - -### Width Flags - -- **`w=1`**: 64-bit (q suffix), REX.W set. -- **`w=0`**: 32-bit (l suffix), REX.W clear (32-bit write zero-extends to 64). -- **`size` (load/store)**: 1, 2, 4, or 8 bytes; calls use MOVZX/MOVSX for 1/2-byte loads. 
- -### Opcode Constants (from isa.h) - -Key ones for GROUP 3: - -```c -#define X64_OPC_MOV_RM_R 0x89u // MOV r/m, r (store) -#define X64_OPC_MOV_R_RM 0x8Bu // MOV r, r/m (load) -#define X64_OPC_MOV_RI 0xB8u // MOV r, imm (with +rd in low bits) -#define X64_OPC_LEA 0x8Du -#define X64_OPC_MOVSXD 0x63u // MOVSXD r64, r32 (REX.W 63) -#define X64_OPC_ALU_ADD 0x01u // ADD r/m, r -#define X64_OPC_ALU_SUB 0x29u // SUB r/m, r -#define X64_OPC_ALU_AND 0x21u // AND r/m, r -#define X64_OPC_ALU_OR 0x09u // OR r/m, r -#define X64_OPC_ALU_XOR 0x31u // XOR r/m, r -#define X64_OPC_ALU_CMP 0x39u // CMP r/m, r -#define X64_OPC_ALU_TEST 0x85u // TEST r/m, r -#define X64_OPC_IMUL_2B 0xAFu // IMUL r, r/m (0F AF) -#define X64_OPC_IMUL_IMM8 0x6Bu // IMUL r, r, imm8 -#define X64_OPC_IMUL_IMM32 0x69u // IMUL r, r, imm32 -#define X64_OPC_F7 0xF7u // NOT/NEG/MUL/IMUL/DIV/IDIV (sub picks op) -#define X64_F7_SUB_NOT 2u -#define X64_F7_SUB_NEG 3u -#define X64_F7_SUB_DIV 6u -#define X64_F7_SUB_IDIV 7u -#define X64_OPC_SHIFT_IMM 0xC1u // SHL/SHR/SAR r, imm8 -#define X64_OPC_SHIFT_CL 0xD3u // SHL/SHR/SAR r, cl -#define X64_SHIFT_SUB_SHL 4u -#define X64_SHIFT_SUB_SHR 5u -#define X64_SHIFT_SUB_SAR 7u -#define X64_OPC_SETCC_BASE 0x90u // SETcc (cc in low nibble, 0F 9x) -#define X64_OPC_CDQ_CQO 0x99u // CQO (REX.W) / CDQ -#define X64_ALU_SUB_ADD 0u // For 83/81 encoding -#define X64_ALU_SUB_CMP 7u -#define X64_ALU_SUB_SUB 5u -#define X64_ALU_SUB_AND 4u -#define X64_ALU_SUB_OR 1u -#define X64_ALU_SUB_XOR 6u - -// Condition codes (for jcc, setcc, cmovcc) -#define X64_CC_E 0x4u // equal / ZF=1 -#define X64_CC_NE 0x5u -#define X64_CC_B 0x2u // below (unsigned) / CF=1 -#define X64_CC_AE 0x3u // above-or-equal (unsigned) / CF=0 -#define X64_CC_BE 0x6u // below-or-equal (unsigned) -#define X64_CC_A 0x7u // above (unsigned) -#define X64_CC_L 0xCu // less (signed) / SF!=OF -#define X64_CC_GE 0xDu -#define X64_CC_LE 0xEu -#define X64_CC_G 0xFu // greater (signed) -#define X64_CC_S 0x8u // sign set -#define 
X64_CC_NS 0x9u -#define X64_CC_P 0xAu // parity (FP unordered) -#define X64_CC_NP 0xBu // no parity (FP ordered) -``` - -### SSE Prefixes and Opcodes - -```c -#define X64_PFX_66 0x66u // Operand-size override (16-bit); also picks double precision for SSE -#define X64_PFX_F2 0xF2u // Scalar double (ADDSD, etc.) -#define X64_PFX_F3 0xF3u // Scalar single (ADDSS, etc.) -#define X64_OPC_TWOBYTE 0x0Fu // Prefix for two-byte opcodes (SSE, shift 0Fxx, etc.) - -// SSE scalar FP opcodes (second byte after 0x0F): -#define 0x10 // movs{s,d} (load); as emit_sse_rr, it's always sse_rr form -#define 0x58 // adds{s,d} -#define 0x5C // subs{s,d} -#define 0x59 // muls{s,d} -#define 0x5E // divs{s,d} -#define 0x2E // ucomis{s,d} (compare, sets flags) -#define 0x2A // cvtsi2s{s,d} (int→fp) -#define 0x2C // cvtts{s,d}2si (fp→int, truncate) -#define 0x6E // MOVD/MOVQ (GPR→XMM, used for bitcast) -#define 0x7E // MOVD/MOVQ (XMM→GPR, used for bitcast) -``` - ---- - -## ABI Query Interface - -Call via `abi.h`: - -```c -const ABIFuncInfo* abi_cg_func_info(Compiler*, CfreeCgTypeId fn_type); -``` - -Returns ABIFuncInfo with: -- `nparams`: Number of fixed parameters. -- `is_variadic`: Boolean. -- `params[]`: Array of ABIArgInfo per parameter. -- `ret_*`: Return value layout(s). - -For x64, also check `os_kind`: -```c -if (c->config->os_kind == CFREE_OS_WINDOWS) { /* Win64 path */ } -else { /* SysV path */ } -``` - ---- - -## NativeOps Adapter Structure - -File: `src/cg/native_direct_target.h:66` - -For -O0 (direct lowering), the NativeOps adapter bridges semantic operands (Operand type with OPK_REG/OPK_IMM/OPK_LOCAL/OPK_INDIRECT) to NativeLoc/NativeAddr. At -O1+, the optimizer emits NativeInst directly, so NativeOps is not called. - -Key callbacks used in the emit path (from ops.c, now native.c): -- **`operand_legal()`**: Check if semantic Operand is legal for the arch. -- **`semantic_addr_legal()`**: Check if an Operand address is legal and reachable. 
-- **`plan_call()`, `emit_call()`, `emit_ret()`**: Lowering calls and returns. -- **`va_start_()`, `va_arg_()`, `va_end_()`, `va_copy_()`**: Variadic setup. - -For GROUP 3, focus is on NativeTarget emission; NativeOps is used only in the -O0 path to map semantic operands to NativeLoc. See `src/arch/aa64/native_direct.c` for the adapter implementation. - ---- - -## Group 3 Hook Bodies - -### move(dst_reg, src_reg) - -**Input:** Two NATIVE_LOC_REG locations, same class (int→int or fp→fp). -**Emit:** Register move or elide (same reg). - -**Pseudo-C:** -```c -static void x_move(NativeTarget* t, NativeLoc dst, NativeLoc src) { - // Elide if same reg and class. - if (dst.kind == NATIVE_LOC_REG && src.kind == NATIVE_LOC_REG && - (NativeAllocClass)dst.cls == (NativeAllocClass)src.cls && - dst.v.reg == src.v.reg) - return; - - u32 rd = dst.v.reg & 0xFu; - u32 rs = src.v.reg & 0xFu; - - if ((NativeAllocClass)dst.cls == NATIVE_REG_FP) { - // FP reg move: prefix selects width (0xF2=double, 0xF3=single). - u8 prefix = type_size32(t, dst.type) == 8u ? 0xF2u : 0xF3u; - emit_sse_rr(t->mc, prefix, 0x10, rd, rs); // movs{d,s} - } else { - // Integer: width from type size or pointer size. - int w = (dst.type && cg_type_size(t->c, dst.type) >= 8u) ? 1 : 0; - emit_mov_rr(t->mc, w, rd, rs); - } -} -``` - -**Emit.c calls:** `emit_mov_rr()`, `emit_sse_rr()`. - ---- - -### load_imm(dst_reg, imm) - -**Input:** NativeLoc dst (REG), immediate value. -**Emit:** MOV r, imm32 or MOVABS r, imm64. - -**Pseudo-C:** -```c -static void x_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) { - int is64 = (dst.type && cg_type_size(t->c, dst.type) >= 8u) ? 1 : 0; - u32 rd = dst.v.reg & 0xFu; - x64_emit_load_imm(t->mc, is64, rd, imm); -} -``` - -**Emit.c calls:** `x64_emit_load_imm()` (which emits MOV or MOVABS based on is64). - ---- - -### load_const(dst_reg, const_bytes) - -**Input:** FP constant (size 4 or 8), dst is FP register. -**Emit:** .rodata symbol, RIP-relative movss/movsd with R_PC32 reloc. 
- -**Pseudo-C:** -```c -static void x_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) { - // Must route through .rodata to emit RIP-relative load. - // 1. Allocate .rodata section if needed. - // 2. Align and emit const bytes at rodata_offset. - // 3. Create local symbol for the constant. - // 4. Emit: movs{s,d} xmm_dst, [rip + disp32] - // with R_PC32 reloc (addend=-4). - - // Pseudo: allocate rodata offset, emit symbol, store current section, switch to .rodata. - u32 ro_off = /* rodata-aligned offset */; - ObjSymId sym = /* create symbol at ro_off */; - - u8 prefix = (cb.size == 8) ? 0xF2u : 0xF3u; // F2=double, F3=single - u32 dst_x = dst.v.reg & 0xFu; - - // Emit: prefix 0F 10 /r [RIP + disp32] - // Use emit_sse_load with base=X64_RBP (signals rip-relative in our ISA). - // OR manually build with x64_sse_mem_pack and emit_reloc_at. - u32 pos = t->mc->pos(t->mc); - emit_sse_load(t->mc, prefix, 0x10, dst_x, X64_RBP, 0); // [RIP] form - t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, R_PC32, sym, -4, 1, 0); -} -``` - -**Note:** Implementation deferred to platform-specific rodata handling. For now, treat like rv64: load via register + immediate offset. - ---- - -### load_addr(dst_reg, addr) - -**Input:** NativeLoc dst (REG), NativeAddr with base (reg/frame/global), index, scale, offset. -**Emit:** LEA [base+disp], or for globals: LEA [RIP+disp] with R_PC32/R_X64_PLT32 for function symbols / R_X64_REX_GOTPCRELX for GOT-accessed externs. - -**Pseudo-C:** -```c -static void x_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) { - XNativeTarget* x = (XNativeTarget*)t; - u32 rd = dst.v.reg & 0xFu; - - if ((NativeAddrBaseKind)addr.base_kind == NATIVE_ADDR_BASE_GLOBAL) { - ObjSymId sym = addr.base.global.sym; - i64 addend = addr.base.global.addend + (i64)addr.offset; - - // Route through GOT for extern undef symbols in PIC/PIE. 
- if (obj_symbol_extern_via_got(t->c, t->obj, sym)) { - // mov rd, [rip + disp32] (R_X64_REX_GOTPCRELX) - // addend applied post-load if nonzero. - u32 pos = t->mc->pos(t->mc); - emit_mov_load(t->mc, 8, /*signed=*/0, rd, X64_RBP, 0); // [RIP] form - t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, R_X64_REX_GOTPCRELX, sym, -4, 1, 0); - if (addend) { - if (addend >= -2048 && addend <= 2047) { - emit_alu_imm32(t->mc, 1, X64_ALU_SUB_ADD, rd, (i32)addend); - } else { - x64_emit_load_imm(t->mc, 1, X64_R11, addend); - emit_alu_rr(t->mc, 1, X64_OPC_ALU_ADD, rd, X64_R11); - } - } - return; - } - - // lea rd, [rip + disp32] (R_PC32 for data, R_X64_PLT32 for funcs) - u32 reloc_kind = (obj_symbol_get(t->obj, sym)->kind == SK_FUNC) ? - R_X64_PLT32 : R_PC32; - u32 pos = t->mc->pos(t->mc); - emit_lea(t->mc, rd, X64_RBP, 0); // [RIP] form - t->mc->emit_reloc_at(t->mc, t->mc->section_id, pos, reloc_kind, sym, addend - 4, 1, 0); - return; - } - - if ((NativeAddrBaseKind)addr.base_kind == NATIVE_ADDR_BASE_FRAME) { - // lea rd, [rbp - slot_offset + addr.offset] - XNativeSlot* s = x_slot_get(x, addr.base.frame); - i32 disp = -(i32)s->off + addr.offset; - emit_lea(t->mc, rd, X64_RBP, disp); - // apply index if present (scaled, rare for frame addresses) - if ((NativeAddrIndexKind)addr.index_kind == NATIVE_ADDR_INDEX_REG) { - u32 ri = addr.index.reg & 0xFu; - u32 scale = 1u << addr.log2_scale; - // lea rd, [rd + ri*scale] (SIB form) - emit_lea_sib(t->mc, rd, rd, ri, addr.log2_scale, 0); - } - return; - } - - if ((NativeAddrBaseKind)addr.base_kind == NATIVE_ADDR_BASE_REG) { - // lea rd, [base_reg + offset] - u32 rb = addr.base.reg & 0xFu; - emit_lea(t->mc, rd, rb, addr.offset); - if ((NativeAddrIndexKind)addr.index_kind == NATIVE_ADDR_INDEX_REG) { - // apply scaled index - u32 ri = addr.index.reg & 0xFu; - emit_lea_sib(t->mc, rd, rd, ri, addr.log2_scale, 0); - } - return; - } - - compiler_panic(t->c, x->loc, "x64 load_addr: unsupported base kind"); -} -``` - -**Emit.c calls:** 
`emit_lea()`, `emit_mov_load()` (for GOT), `emit_alu_imm32()`, `emit_alu_rr()`, `x64_emit_load_imm()`. - ---- - -### load(dst_reg, addr, mem_access) - -**Input:** NativeLoc dst (REG), NativeAddr, MemAccess (type, size, align). -**Emit:** MOV with size 1/2/4/8 bytes; MOVSX/MOVZX for sub-word; MOVSD/MOVSS for FP. - -**Pseudo-C:** -```c -static void x_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, MemAccess mem) { - u32 rd = dst.v.reg & 0xFu; - u32 size = mem.size; - u32 base = addr.base.reg & 0xFu; // assume simple base (no SIB for now) - - if ((NativeAllocClass)dst.cls == NATIVE_REG_FP) { - // FP load: movs{s,d} xmm_dst, [base + disp] - u8 prefix = (size == 8) ? 0xF2u : 0xF3u; - emit_sse_load(t->mc, prefix, 0x10, rd, base, addr.offset); - } else { - // Integer load: size determines extension. - int signed_ext = cg_type_is_signed(t->c, mem.type) ? 1 : 0; - emit_mov_load(t->mc, size, signed_ext, rd, base, addr.offset); - } -} -``` - -**Emit.c calls:** `emit_mov_load()`, `emit_sse_load()`. - ---- - -### store(addr, src_reg, mem_access) - -**Input:** NativeAddr, NativeLoc src (REG), MemAccess. -**Emit:** MOV [base+disp], src with size 1/2/4/8. - -**Pseudo-C:** -```c -static void x_store(NativeTarget* t, NativeAddr addr, NativeLoc src, MemAccess mem) { - u32 rs = src.v.reg & 0xFu; - u32 size = mem.size; - u32 base = addr.base.reg & 0xFu; - - if ((NativeAllocClass)src.cls == NATIVE_REG_FP) { - // FP store: movs{s,d} [base + disp], xmm_src - u8 prefix = (size == 8) ? 0xF2u : 0xF3u; - emit_sse_store(t->mc, prefix, 0x11, rs, base, addr.offset); // 0x11 = MOVS*D/S store - } else { - // Integer store. - emit_mov_store(t->mc, size, rs, base, addr.offset); - } -} -``` - -**Emit.c calls:** `emit_mov_store()`, `emit_sse_store()`. - ---- - -### tls_addr_of(dst_reg, sym, addend) - -**Input:** TLS symbol, NativeLoc dst (REG), addend. -**Emit:** SysV: `mov rd, %fs:0` then `add rd, [rip+disp]` with R_X64_TLSGD / R_X64_TLSLD reloc. Win64: reserved (OS-specific path). 
- -**Pseudo-C:** -```c -static void x_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym, i64 addend) { - XNativeTarget* x = (XNativeTarget*)t; - u32 rd = dst.v.reg & 0xFu; - - // x86-64 TLS is complex; for now, emit panic to mark as not yet implemented. - // SysV model: emit movabs rd, sym@gottpoff(rip) + addend. - // Win64: no TLS support in -O0 direct lowering. - - compiler_panic(t->c, x->loc, "x64 tls_addr_of: not yet implemented"); -} -``` - ---- - -### copy_bytes(dst_addr, src_addr, agg_access) - -**Input:** Two NativeAddr, AggregateAccess (size, align). -**Emit:** REP MOVS or unrolled MOV loop. - -**Pseudo-C:** -```c -static void x_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src, AggregateAccess agg) { - u32 size = agg.size; - - // Unrolled loop for small sizes or misaligned; REP MOVS for larger aligned blocks. - if (size <= 32u) { - // Unroll: emit mov for each chunk (8, 4, 2, 1 as needed). - u32 off = 0; - while (off < size) { - u32 chunk = (size - off >= 8) ? 8 : (size - off >= 4) ? 4 : (size - off >= 2) ? 2 : 1; - u32 rs = X64_R10; // scratch for load - emit_mov_load(t->mc, chunk, /*signed=*/0, rs, src.base.reg, src.offset + (i32)off); - emit_mov_store(t->mc, chunk, rs, dst.base.reg, dst.offset + (i32)off); - off += chunk; - } - } else { - // REP MOVS: set rcx=size, rsi=src, rdi=dst, then: rep movsq (8-byte chunks). - x64_emit_load_imm(t->mc, 1, X64_RCX, (i64)(size / 8)); - emit_mov_rr(t->mc, 1, X64_RSI, src.base.reg); - emit_alu_imm32(t->mc, 1, X64_ALU_SUB_ADD, X64_RSI, src.offset); - emit_mov_rr(t->mc, 1, X64_RDI, dst.base.reg); - emit_alu_imm32(t->mc, 1, X64_ALU_SUB_ADD, X64_RDI, dst.offset); - emit_rep_movsq(t->mc); // not shown; emit 0xF3 0x48 0xA5 - } -} -``` - ---- - -### set_bytes(dst_addr, byte_value, agg_access) - -**Input:** NativeAddr dst, NativeLoc byte_value (REG with 0..255), size. -**Emit:** Unrolled or REP STOS. 
- -**Pseudo-C:** -```c -static void x_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value, AggregateAccess agg) { - u32 size = agg.size; - - // For small sizes, unroll MOV stores. - if (size <= 32u) { - u32 rs = byte_value.v.reg & 0xFu; - u32 off = 0; - while (off < size) { - u32 chunk = (size - off >= 8) ? 8 : (size - off >= 4) ? 4 : (size - off >= 2) ? 2 : 1; - emit_mov_store(t->mc, chunk, rs, dst.base.reg, dst.offset + (i32)off); - off += chunk; - } - } else { - // REP STOS: al=byte, rcx=size, rdi=dst, then rep stosq. - u32 rs = byte_value.v.reg & 0xFu; - x64_emit_load_imm(t->mc, 1, X64_RCX, (i64)(size / 8)); - emit_mov_rr(t->mc, 0, X64_RAX, rs); // mov al, rs (or al if byte already) - emit_mov_rr(t->mc, 1, X64_RDI, dst.base.reg); - emit_alu_imm32(t->mc, 1, X64_ALU_SUB_ADD, X64_RDI, dst.offset); - emit_rep_stosq(t->mc); // emit 0xF3 0x48 0xAB - } -} -``` - ---- - -### bitfield_load(dst_reg, record_addr, bf_access) - -**Input:** NativeLoc dst (REG), NativeAddr (record), BitFieldAccess (width, offset, sign-extend). -**Emit:** Load, then mask/shift/extend. - -**Pseudo-C:** -```c -static void x_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr record_addr, BitFieldAccess bf) { - u32 rd = dst.v.reg & 0xFu; - u32 byte_off = bf.byte_offset; - u32 bit_off = bf.bit_offset; - u32 width = bf.width; - - // Load the container (size rounded up to 1/2/4/8 bytes). - u32 container_size = 1u << ((31 - __builtin_clz(width + bit_off + 7)) >> 1); // power of 2 - emit_mov_load(t->mc, container_size, /*signed=*/0, rd, record_addr.base.reg, - record_addr.offset + (i32)byte_off); - - // Shift right to align LSB to bit 0. - if (bit_off > 0) { - emit_shift_imm(t->mc, 1, X64_SHIFT_SUB_SHR, rd, (u8)bit_off); - } - - // Mask to width bits. - u64 mask = (1ull << width) - 1; - if (mask < 0xffffffff) { - x64_emit_load_imm(t->mc, 0, X64_R11, (i64)mask); - emit_alu_rr(t->mc, 0, X64_OPC_ALU_AND, rd, X64_R11); - } - - // Sign-extend if needed. 
- if (bf.sign_extend && width < 64) { - emit_extend_rr(t->mc, 1, /*signed=*/1, width / 8, rd, rd); - } -} -``` - ---- - -### binop(op, dst_reg, a_reg, b_reg_or_imm) - -**Input:** BinOp, NativeLoc dst (REG), NativeLoc a (REG), NativeLoc b (REG or IMM). -**Emit:** Two-address: copy a→dst, then dst op= b. - -**Key branches by op:** - -**FP binops (BO_FADD/FSUB/FMUL/FDIV):** -- FADD and FMUL are commutative; FSUB and FDIV are not. Use SSE opcodes (0x58/0x5C/0x59/0x5E). -- If `rd==rb && rd!=ra`: commutative ops can emit `op rd, ra`; for non-commutative, spill rb to temp. - -**Integer division/remainder (BO_SDIV/UDIV/SREM/UREM):** -- Signed: emit CQO/CDQ, then IDIV. Result in RAX (quotient) or RDX (remainder). -- Unsigned: XOR RDX, RDX, then DIV. Same result regs. -- Route divisor through R11 if it's RAX/RDX. - -**Shifts (BO_SHL/SHR_U/SHR_S):** -- Count must be in CL or encoded as imm8. Route RHS through CL if not immediate. -- Sub-opcodes: SHL=4, SHR_U=5, SHR_S=7. - -**ALU (BO_IADD/ISUB/AND/OR/XOR/IMUL):** -- Commutative ops: swap to move immediate to RHS for fast-path immediate forms. -- Immediate fast-paths: 0x83 (imm8 sext) or 0x81 (imm32 sext). -- Fallback: mov ra→rd, then op with register form (0x01/0x29/0x21/0x09/0x31/0xAF). - -**Pseudo-C:** -```c -static void x_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc a, NativeLoc b) { - MCEmitter* mc = t->mc; - u32 rd = dst.v.reg & 0xFu; - - // FP binops. - if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) { - u32 ra = a.v.reg & 0xFu; - u32 rb = b.v.reg & 0xFu; - u8 prefix = (dst.type && cg_type_size(t->c, dst.type) == 8u) ? 0xF2u : 0xF3u; - u8 opcode = (op == BO_FADD) ? 0x58 : (op == BO_FSUB) ? 0x5C : - (op == BO_FMUL) ? 0x59 : 0x5E; // FDIV - - if (rd == rb && rd != ra) { - // Can use commutative source swap. - if (op == BO_FADD || op == BO_FMUL) { - emit_sse_rr(mc, prefix, opcode, rd, ra); - return; - } - // For FSUB/FDIV, must preserve order: use temp. 
- emit_sse_rr(mc, prefix, 0x10, X64_XMM15, rb); // spill rb - emit_sse_rr(mc, prefix, 0x10, rd, ra); // rd = ra - emit_sse_rr(mc, prefix, opcode, rd, X64_XMM15); // rd -= temp - return; - } - if (rd != ra) emit_sse_rr(mc, prefix, 0x10, rd, ra); // movs{s,d} - emit_sse_rr(mc, prefix, opcode, rd, rb); - return; - } - - int w = (dst.type && cg_type_size(t->c, dst.type) >= 8u) ? 1 : 0; - - // Integer division. - if (op == BO_SDIV || op == BO_UDIV || op == BO_SREM || op == BO_UREM) { - u32 ra = (a.kind == NATIVE_LOC_REG) ? (a.v.reg & 0xFu) : X64_R11; - if (ra != (a.v.reg & 0xFu) && a.kind == NATIVE_LOC_REG) { - emit_mov_rr(mc, w, ra, a.v.reg & 0xFu); - } else if (a.kind == NATIVE_LOC_IMM) { - x64_emit_load_imm(mc, w, ra, a.v.imm); - } - if (ra != X64_RAX) emit_mov_rr(mc, w, X64_RAX, ra); - - u32 rb = (b.kind == NATIVE_LOC_REG) ? (b.v.reg & 0xFu) : X64_R11; - if (b.kind == NATIVE_LOC_REG) { - if (rb == X64_RAX || rb == X64_RDX) { - emit_mov_rr(mc, w, X64_R11, rb); - rb = X64_R11; - } - } else if (b.kind == NATIVE_LOC_IMM) { - x64_emit_load_imm(mc, w, X64_R11, b.v.imm); - rb = X64_R11; - } - - if (op == BO_SDIV || op == BO_SREM) { - emit_cqo_or_cdq(mc, w); // sign-extend rax→rdx:rax - emit_f7_rm(mc, w, X64_F7_SUB_IDIV, rb); - } else { - emit_xor_self(mc, w, X64_RDX); // zero rdx - emit_f7_rm(mc, w, X64_F7_SUB_DIV, rb); - } - - u32 result_reg = (op == BO_SREM || op == BO_UREM) ? X64_RDX : X64_RAX; - if (rd != result_reg) emit_mov_rr(mc, w, rd, result_reg); - return; - } - - // Shifts. - if (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S) { - u32 ra = (a.kind == NATIVE_LOC_REG) ? (a.v.reg & 0xFu) : X64_R11; - if (a.kind == NATIVE_LOC_IMM) { - x64_emit_load_imm(mc, w, ra, a.v.imm); - } else { - ra = a.v.reg & 0xFu; - } - - u32 sub = (op == BO_SHL) ? X64_SHIFT_SUB_SHL : - (op == BO_SHR_U) ? X64_SHIFT_SUB_SHR : X64_SHIFT_SUB_SAR; - - if (b.kind == NATIVE_LOC_IMM) { - // Immediate shift: encode in C1 /sub ib. - if (rd != ra) emit_mov_rr(mc, w, rd, ra); - u32 width = w ? 
64u : 32u; - emit_shift_imm(mc, w, sub, rd, (u8)(b.v.imm & (width - 1u))); - return; - } - - // Register shift: count in CL. - u32 rc = b.v.reg & 0xFu; - if (rc != X64_RCX) emit_mov_rr(mc, 0, X64_RCX, rc); - if (rd != ra) emit_mov_rr(mc, w, rd, ra); - emit_shift_cl(mc, w, sub, rd); - return; - } - - // Canonicalize commutative ops: IMM to RHS. - if ((op == BO_IADD || op == BO_AND || op == BO_OR || op == BO_XOR || op == BO_IMUL) && - a.kind == NATIVE_LOC_IMM && b.kind != NATIVE_LOC_IMM) { - NativeLoc tmp = a; - a = b; - b = tmp; - } - - // Immediate fast-paths (ALU/IMUL). - if (b.kind == NATIVE_LOC_IMM && a.kind == NATIVE_LOC_REG && - (op == BO_IADD || op == BO_ISUB || op == BO_AND || op == BO_OR || op == BO_XOR || op == BO_IMUL)) { - i64 imm = b.v.imm; - u32 ra = a.v.reg & 0xFu; - - if (op == BO_IMUL) { - if (imm >= -128 && imm <= 127) { - emit_imul_imm8(mc, w, rd, ra, (i8)imm); - return; - } else if (imm >= -(1LL<<31) && imm <= (1LL<<31)-1) { - emit_imul_imm32(mc, w, rd, ra, (i32)imm); - return; - } - } else { - u32 sub = (op == BO_IADD) ? X64_ALU_SUB_ADD : (op == BO_OR) ? X64_ALU_SUB_OR : - (op == BO_AND) ? X64_ALU_SUB_AND : (op == BO_ISUB) ? X64_ALU_SUB_SUB : X64_ALU_SUB_XOR; - if (imm >= -128 && imm <= 127) { - if (rd != ra) emit_mov_rr(mc, w, rd, ra); - emit_alu_imm8(mc, w, sub, rd, (i8)imm); - return; - } else if (imm >= -(1LL<<31) && imm <= (1LL<<31)-1) { - if (rd != ra) emit_mov_rr(mc, w, rd, ra); - emit_alu_imm32(mc, w, sub, rd, (i32)imm); - return; - } - } - // Fall through to full materialization. - } - - // Generic two-operand: copy ra→dst, then dst op= rb. - u32 ra = (a.kind == NATIVE_LOC_REG) ? (a.v.reg & 0xFu) : X64_R11; - u32 rb = (b.kind == NATIVE_LOC_REG) ? 
(b.v.reg & 0xFu) : X64_R11; - - if (a.kind == NATIVE_LOC_IMM) x64_emit_load_imm(mc, w, ra, a.v.imm); - else ra = a.v.reg & 0xFu; - - if (b.kind == NATIVE_LOC_IMM && ra != X64_R11) { - x64_emit_load_imm(mc, w, X64_R11, b.v.imm); - rb = X64_R11; - } else if (b.kind == NATIVE_LOC_IMM) { - // ra already in R11; need another scratch. - x64_emit_load_imm(mc, w, X64_R10, b.v.imm); - rb = X64_R10; - } else { - rb = b.v.reg & 0xFu; - } - - // Preserve rb if dst == rb && dst != ra. - if (rd == rb && rd != ra) { - if (op == BO_IADD || op == BO_AND || op == BO_OR || op == BO_XOR || op == BO_IMUL) { - emit_mov_rr(mc, w, X64_R10, rb); - rb = X64_R10; - } else { - // Non-commutative (ISUB): must still preserve rb. - emit_mov_rr(mc, w, X64_R10, rb); - rb = X64_R10; - } - } - - if (rd != ra) emit_mov_rr(mc, w, rd, ra); - - switch (op) { - case BO_IADD: emit_alu_rr(mc, w, X64_OPC_ALU_ADD, rd, rb); break; - case BO_ISUB: emit_alu_rr(mc, w, X64_OPC_ALU_SUB, rd, rb); break; - case BO_AND: emit_alu_rr(mc, w, X64_OPC_ALU_AND, rd, rb); break; - case BO_OR: emit_alu_rr(mc, w, X64_OPC_ALU_OR, rd, rb); break; - case BO_XOR: emit_alu_rr(mc, w, X64_OPC_ALU_XOR, rd, rb); break; - case BO_IMUL: emit_imul_rr(mc, w, rd, rb); break; - default: compiler_panic(t->c, ((XNativeTarget*)t)->loc, "x64 binop: unsupported op"); - } -} -``` - -**Emit.c calls:** `emit_sse_rr()`, `x64_emit_load_imm()`, `emit_mov_rr()`, `emit_f7_rm()`, `emit_xor_self()`, `emit_cqo_or_cdq()`, `emit_shift_imm()`, `emit_shift_cl()`, `emit_alu_imm8()`, `emit_alu_imm32()`, `emit_imul_imm8()`, `emit_imul_imm32()`, `emit_alu_rr()`, `emit_imul_rr()`. - ---- - -### unop(op, dst_reg, src_reg) - -**Input:** UnOp, NativeLoc dst (REG), NativeLoc src (REG). -**Emit:** F7 /sub for NEG/BNOT, TEST+SETCC for NOT, SSE xor-sign for FNEG. 
- -**Pseudo-C:** -```c -static void x_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) { - MCEmitter* mc = t->mc; - u32 rd = dst.v.reg & 0xFu; - u32 rs = src.v.reg & 0xFu; - - if (op == UO_FNEG) { - // FP negate: flip sign bit via xor with sign-mask constant. - if (rd != rs) { - u8 prefix = (dst.type && cg_type_size(t->c, dst.type) == 8u) ? 0xF2u : 0xF3u; - emit_sse_rr(mc, prefix, 0x10, rd, rs); // movs{s,d} - } - - // Load sign mask from .rodata and xor. - u8 mask_bytes[8]; - memset(mask_bytes, 0, sizeof mask_bytes); - if (cg_type_size(t->c, dst.type) == 8u) { - mask_bytes[7] = 0x80u; // double: bit 63 sign - ConstBytes cb = {mask_bytes, 8, 8, dst.type}; - } else { - mask_bytes[3] = 0x80u; // single: bit 31 sign - ConstBytes cb = {mask_bytes, 4, 4, dst.type}; - } - // Load mask into temp FP reg and xor. - u8 prefix = (cg_type_size(t->c, dst.type) == 8u) ? 0x66u : 0u; - // (Not shown: emit load of cb into X64_XMM15, then xor) - emit_sse_rr(mc, prefix, 0x57, rd, X64_XMM15); // xorpd/xorps - return; - } - - int w = (dst.type && cg_type_size(t->c, dst.type) >= 8u) ? 1 : 0; - - if (op == UO_NEG) { - if (rd != rs) emit_mov_rr(mc, w, rd, rs); - emit_f7_rm(mc, w, X64_F7_SUB_NEG, rd); - return; - } - - if (op == UO_BNOT) { // Bitwise NOT. - if (rd != rs) emit_mov_rr(mc, w, rd, rs); - emit_f7_rm(mc, w, X64_F7_SUB_NOT, rd); - return; - } - - if (op == UO_NOT) { // Logical NOT (x ? 0 : 1). - emit_test_self(mc, w, rs); - emit_setcc(mc, X64_CC_E, rd); // ZF set if rs == 0 - emit_movzx_r32_r8(mc, rd, rd); // zero-extend al→r32 - return; - } - - compiler_panic(t->c, ((XNativeTarget*)t)->loc, "x64 unop: unsupported op"); -} -``` - -**Emit.c calls:** `emit_sse_rr()`, `emit_mov_rr()`, `emit_f7_rm()`, `emit_test_self()`, `emit_setcc()`, `emit_movzx_r32_r8()`. - ---- - -### cmp(op, dst_reg, a_reg, b_reg_or_imm) - -**Input:** CmpOp, NativeLoc dst (REG), two operands. -**Emit:** Compare (CMP or UCOMISD), then SETCC to materialize 0/1. 
- -**Pseudo-C:** -```c -static void x_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc a, NativeLoc b) { - MCEmitter* mc = t->mc; - u32 rd = dst.v.reg & 0xFu; - u32 ra = a.v.reg & 0xFu; - - if ((NativeAllocClass)a.cls == NATIVE_REG_FP) { - // FP comparison: UCOMISD/UCOMISS, then handle unordered/ordered cases. - u8 prefix = (cg_type_size(t->c, a.type) == 8u) ? 0x66u : 0u; - u32 rb = b.v.reg & 0xFu; - emit_sse_rr(mc, prefix, 0x2E, ra, rb); // ucomisd/ucomiss sets EFLAGS - - switch (op) { - case CMP_NE: - // Unordered OR not-equal: set if P (unordered) OR NE. - emit_setcc(mc, X64_CC_P, rd); // P (parity, i.e., unordered) - emit_movzx_r32_r8(mc, rd, rd); - emit_setcc(mc, X64_CC_NE, X64_R11); - emit_movzx_r32_r8(mc, X64_R11, X64_R11); - emit_alu_rr(mc, 0, X64_OPC_ALU_OR, rd, X64_R11); - return; - case CMP_EQ: - case CMP_LT_F: - case CMP_LE_F: - // Ordered comparisons: must check NP (not unordered). - // Set if (cond AND ordered). - u32 cc = (op == CMP_EQ) ? X64_CC_E : (op == CMP_LT_F) ? X64_CC_B : X64_CC_BE; - emit_setcc(mc, cc, rd); - emit_movzx_r32_r8(mc, rd, rd); - emit_setcc(mc, X64_CC_NP, X64_R11); - emit_movzx_r32_r8(mc, X64_R11, X64_R11); - emit_alu_rr(mc, 0, X64_OPC_ALU_AND, rd, X64_R11); - return; - case CMP_GT_F: - emit_setcc(mc, X64_CC_A, rd); - emit_movzx_r32_r8(mc, rd, rd); - return; - case CMP_GE_F: - emit_setcc(mc, X64_CC_AE, rd); - emit_movzx_r32_r8(mc, rd, rd); - return; - default: - emit_setcc(mc, cmp_to_cc(op), rd); - emit_movzx_r32_r8(mc, rd, rd); - return; - } - } - - // Integer comparison. - int w = (a.type && cg_type_size(t->c, a.type) >= 8u) ? 1 : 0; - - if (b.kind == NATIVE_LOC_IMM && imm_fits_i8(b.v.imm)) { - emit_cmp_imm8(mc, w, ra, (i8)b.v.imm); - } else if (b.kind == NATIVE_LOC_IMM && imm_fits_i32(b.v.imm)) { - emit_alu_imm32(mc, w, X64_ALU_SUB_CMP, ra, (i32)b.v.imm); - } else { - u32 rb = (b.kind == NATIVE_LOC_REG) ? 
(b.v.reg & 0xFu) : X64_R11; - if (b.kind == NATIVE_LOC_IMM) x64_emit_load_imm(mc, w, rb, b.v.imm); - else rb = b.v.reg & 0xFu; - emit_alu_rr(mc, w, X64_OPC_ALU_CMP, ra, rb); - } - - emit_setcc(mc, cmp_to_cc(op), rd); - emit_movzx_r32_r8(mc, rd, rd); -} - -static u32 cmp_to_cc(CmpOp op) { - switch (op) { - case CMP_EQ: return X64_CC_E; - case CMP_NE: return X64_CC_NE; - case CMP_LT_U: return X64_CC_B; - case CMP_LE_U: return X64_CC_BE; - case CMP_GT_U: return X64_CC_A; - case CMP_GE_U: return X64_CC_AE; - case CMP_LT_S: return X64_CC_L; - case CMP_LE_S: return X64_CC_LE; - case CMP_GT_S: return X64_CC_G; - case CMP_GE_S: return X64_CC_GE; - default: return X64_CC_E; - } -} -``` - -**Emit.c calls:** `emit_sse_rr()`, `emit_setcc()`, `emit_movzx_r32_r8()`, `emit_alu_rr()`, `emit_cmp_imm8()`, `emit_alu_imm32()`, `x64_emit_load_imm()`, `imm_fits_i8()`, `imm_fits_i32()`. - ---- - -### convert(kind, dst_reg, src_reg) - -**Input:** ConvKind, NativeLoc dst, NativeLoc src. -**Emit:** MOVZX/MOVSX for extension, MOVSXD for 32→64 signed, MOV r32,r32 for 32→64 unsigned (zero-extends high 32), CVTSI2/CVTTS/CVTF for int↔fp with special paths for unsigned 64-bit. - -**Pseudo-C:** -```c -static void x_convert(NativeTarget* t, ConvKind kind, NativeLoc dst, NativeLoc src) { - MCEmitter* mc = t->mc; - u32 rd = dst.v.reg & 0xFu; - u32 rs = src.v.reg & 0xFu; - - switch (kind) { - case CV_SEXT: { - u32 src_bytes = cg_type_size(t->c, src.type); - int w = (cg_type_size(t->c, dst.type) >= 8u) ? 1 : 0; - emit_extend_rr(mc, w, /*signed=*/1, src_bytes, rd, rs); - return; - } - case CV_ZEXT: { - u32 src_bytes = cg_type_size(t->c, src.type); - int w = (cg_type_size(t->c, dst.type) >= 8u) ? 1 : 0; - emit_extend_rr(mc, w, /*signed=*/0, src_bytes, rd, rs); - return; - } - case CV_TRUNC: - // In-register truncation: mov r32, r32 clears high 32. - emit_mov_rr(mc, 0, rd, rs); - return; - case CV_ITOF_S: { - int w_src = (cg_type_size(t->c, src.type) >= 8u) ? 
1 : 0; - u8 prefix = (cg_type_size(t->c, dst.type) == 8u) ? 0xF2u : 0xF3u; - emit_sse_rr_w(mc, prefix, 0x2A, w_src, rd, rs); // cvtsi2sd/cvtsi2ss - return; - } - case CV_ITOF_U: { - int w_src = (cg_type_size(t->c, src.type) >= 8u) ? 1 : 0; - u8 prefix = (cg_type_size(t->c, dst.type) == 8u) ? 0xF2u : 0xF3u; - - if (w_src == 1) { - // Unsigned 64→FP: special path (test sign, branch, two paths). - MCLabel L_high = mc->label_new(mc), L_done = mc->label_new(mc); - emit_test_self(mc, 1, rs); - emit_jcc_label(mc, X64_CC_S, L_high); - emit_sse_rr_w(mc, prefix, 0x2A, 1, rd, rs); - emit_jmp_label(mc, L_done); - - mc->label_place(mc, L_high); - emit_mov_rr(mc, 1, X64_R11, rs); - emit_mov_rr(mc, 1, X64_RAX, rs); - emit_alu_imm8(mc, 1, X64_ALU_SUB_AND, X64_RAX, 1); // and rax, 1 - emit_shift_imm(mc, 1, X64_SHIFT_SUB_SHR, X64_R11, 1); // shr r11, 1 - emit_alu_rr(mc, 1, X64_OPC_ALU_OR, X64_R11, X64_RAX); // or r11, rax - emit_sse_rr_w(mc, prefix, 0x2A, 1, rd, X64_R11); - emit_sse_rr(mc, prefix, 0x58, rd, rd); // adds{s,d} dst, dst - - mc->label_place(mc, L_done); - return; - } else { - // u32→fp: zero-extend to 64-bit, then signed convert works. - emit_extend_rr(mc, 1, /*signed=*/0, 4, X64_R11, rs); - emit_sse_rr_w(mc, prefix, 0x2A, 1, rd, X64_R11); - return; - } - } - case CV_FTOI_S: { - int w_dst = (cg_type_size(t->c, dst.type) >= 8u) ? 1 : 0; - u8 prefix = (cg_type_size(t->c, src.type) == 8u) ? 0xF2u : 0xF3u; - emit_sse_rr_w(mc, prefix, 0x2C, w_dst, rd, rs); // cvtts{d,s}2si - return; - } - case CV_FTOI_U: { - int w_dst = (cg_type_size(t->c, dst.type) >= 8u) ? 1 : 0; - u8 prefix = (cg_type_size(t->c, src.type) == 8u) ? 0xF2u : 0xF3u; - - if (w_dst == 1) { - // FP→u64: special path (compare against 2^63, two branches). - // (Shortened for space; see legacy ops.c:1050-1075.) - MCLabel L_small = mc->label_new(mc), L_done = mc->label_new(mc); - - // Load 2^63 constant and compare. 
- ConstBytes cb = {/*2^63 bytes*/}; - x_load_const(t, (NativeLoc){.kind=NATIVE_LOC_REG, .cls=NATIVE_REG_FP, .v.reg=X64_XMM15}, cb); - - emit_sse_rr(mc, prefix == 0xF2u ? 0x66u : 0u, 0x2E, rs, X64_XMM15); - emit_jcc_label(mc, X64_CC_B, L_small); - - // src >= 2^63: subtract 2^63, convert, add sign bit back. - emit_sse_rr(mc, prefix, 0x10, X64_XMM0 + 14, rs); - emit_sse_rr(mc, prefix, 0x5C, X64_XMM0 + 14, X64_XMM15); // sub - emit_sse_rr_w(mc, prefix, 0x2C, 1, rd, X64_XMM0 + 14); - x64_emit_load_imm(mc, 1, X64_R11, -9223372036854775807LL - 1LL); - emit_alu_rr(mc, 1, X64_OPC_ALU_XOR, rd, X64_R11); - emit_jmp_label(mc, L_done); - - mc->label_place(mc, L_small); - emit_sse_rr_w(mc, prefix, 0x2C, 1, rd, rs); - mc->label_place(mc, L_done); - return; - } else { - // FP→u32: convert as signed, result fits in u32 range. - emit_sse_rr_w(mc, prefix, 0x2C, 0, rd, rs); - return; - } - } - case CV_BITCAST: - if ((NativeAllocClass)src.cls == NATIVE_REG_FP && (NativeAllocClass)dst.cls == NATIVE_REG_INT) { - // FP→int: MOVQ xmm, gpr (0F 7E with REX.W). - emit_sse_rr_w(mc, 0x66u, 0x7E, 1, rs, rd); - } else if ((NativeAllocClass)src.cls == NATIVE_REG_INT && (NativeAllocClass)dst.cls == NATIVE_REG_FP) { - // int→FP: MOVQ gpr, xmm (0F 6E with REX.W). - emit_sse_rr_w(mc, 0x66u, 0x6E, 1, rd, rs); - } else { - // same class, same reg: already correct. - if (rd != rs) emit_mov_rr(mc, 1, rd, rs); - } - return; - default: - compiler_panic(t->c, ((XNativeTarget*)t)->loc, "x64 convert: unsupported kind"); - } -} -``` - -**Emit.c calls:** `emit_extend_rr()`, `emit_mov_rr()`, `emit_sse_rr_w()`, `emit_sse_rr()`, `emit_test_self()`, `emit_jcc_label()`, `emit_jmp_label()`, `mc->label_place()`, `emit_alu_imm8()`, `emit_shift_imm()`, `emit_alu_rr()`, `x64_emit_load_imm()`. - ---- - -### cmp_branch(op, a_reg, b_reg, target_label) - -**Input:** CmpOp, two operands, MCLabel target. -**Emit:** Compare (CMP or UCOMISD), then conditional branch (Jcc rel32) with label reloc. 
- -**Pseudo-C:** -```c -static void x_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc a, NativeLoc b, MCLabel label) { - MCEmitter* mc = t->mc; - u32 ra = a.v.reg & 0xFu; - - if ((NativeAllocClass)a.cls == NATIVE_REG_FP) { - // FP: UCOMISD/UCOMISS, then Jcc for ordered/unordered case. - u8 prefix = (cg_type_size(t->c, a.type) == 8u) ? 0x66u : 0u; - u32 rb = b.v.reg & 0xFu; - emit_sse_rr(mc, prefix, 0x2E, ra, rb); - - // Pick CC based on op and ordering. - u32 cc = cmp_to_cc(op); - if (op == CMP_NE) { - // Branch if unordered OR not-equal: JP label; JNE label (can chain) - emit_jcc_label(mc, X64_CC_P, label); - emit_jcc_label(mc, X64_CC_NE, label); - } else if (op == CMP_EQ || op == CMP_LT_F || op == CMP_LE_F) { - // Ordered: emit JP to skip, then Jcc. - MCLabel skip = mc->label_new(mc); - emit_jcc_label(mc, X64_CC_P, skip); - emit_jcc_label(mc, cc, label); - mc->label_place(mc, skip); - } else { - // Other FP ops: direct Jcc (already ordered by fused cmp+branch). - emit_jcc_label(mc, cc, label); - } - return; - } - - // Integer comparison. - int w = (a.type && cg_type_size(t->c, a.type) >= 8u) ? 1 : 0; - - if (b.kind == NATIVE_LOC_IMM && imm_fits_i8(b.v.imm)) { - emit_cmp_imm8(mc, w, ra, (i8)b.v.imm); - } else if (b.kind == NATIVE_LOC_IMM && imm_fits_i32(b.v.imm)) { - emit_alu_imm32(mc, w, X64_ALU_SUB_CMP, ra, (i32)b.v.imm); - } else { - u32 rb = (b.kind == NATIVE_LOC_REG) ? 
(b.v.reg & 0xFu) : X64_R11; - if (b.kind == NATIVE_LOC_IMM) x64_emit_load_imm(mc, w, rb, b.v.imm); - emit_alu_rr(mc, w, X64_OPC_ALU_CMP, ra, rb); - } - - emit_jcc_label(mc, cmp_to_cc(op), label); -} - -void emit_jcc_label(MCEmitter* mc, u32 cc, MCLabel label) { - u32 pos = mc->pos(mc); - u8 buf[6]; - buf[0] = X64_OPC_TWOBYTE; - buf[1] = (u8)(X64_OPC_JCC_BASE | (cc & 0xFu)); - mc->emit_bytes(mc, buf, 2); - mc->emit_u32le(mc, 0); // placeholder - mc->emit_label_ref(mc, label, R_PC32, 4, -4); // or R_X64_PC32_RELOC if x64-specific -} -``` - -**Emit.c calls:** `emit_sse_rr()`, `emit_cmp_imm8()`, `emit_alu_imm32()`, `emit_alu_rr()`, `x64_emit_load_imm()`, and custom `emit_jcc_label()`. - ---- - -### Additional Hooks (Brief Outlines) - -**jump(label):** Emit JMP rel32 with label reloc. -**indirect_branch(addr_reg, valid_targets, ntargets):** JMP r/m64. -**load_label_addr(dst_reg, label):** LEA [RIP+offset] with label fixup or jump-over technique. -**label_new() / label_place():** Delegate to mc. -**alloca_(dst_reg, size_reg, align):** SUB RSP, size; LEA dst, [RSP + max_outgoing]. -**spill(src_reg, slot, mem_access):** MOV [slot_addr], src. -**reload(dst_reg, slot, mem_access):** MOV dst, [slot_addr]. - ---- - -## Summary - -The x64 NativeTarget port reuses the legacy byte-level emit.c helpers for instruction encoding, abstracts frame layout and ABI details via NativeFrameSlot and abi.h queries, and maps semantic operations (two-address ALU, flags comparisons, implicit-register division/shifts, FP SSE) to NativeLoc-based NativeTarget hooks. Key differences from rv64/aa64 templates: - -1. **Two-address ALU**: Always copy `a → dst` before `dst op= b`. -2. **Flags-based branches**: Emit CMP+JCC instead of materializing condition bits. -3. **Implicit registers**: Route division through RAX/RDX, shifts through CL, multiplies via special implicit-register opcodes. -4. **Width encoding**: Use w=0/1 (32/64) and size= (1/2/4/8) parameters to emit correct opcodes. -5. 
**Relocations**: RIP-relative addressing with -4 addend for end-of-insn. - -See `src/arch/aa64/native.c` for the complete working template; apply this guide to write `src/arch/x64/native.c` with identical hook signatures and the x64-specific emission logic outlined above. - - - - ---- - -# x64 NativeTarget Port: GROUP 4 — Calls, Returns, and ABI - -## Executive Summary - -GROUP 4 handles **calling conventions, return-value marshalling, and ABI routing** for both **SysV x86-64** (Unix/Linux) and **Win64** (Windows x64). The x64 ABI is dramatically different from the RV64/AA64 references: **two separate register passing windows** (SysV: rdi/rsi/rdx/rcx/r8/r9 GPR + xmm0-7 FPR; Win64: rcx/rdx/r8/r9 GPR + xmm0-3 FPR with 32-byte shadow space), **sret hidden pointers that consume an integer argument slot**, and **stack-passing mismatches between the two ABIs**. Return values span rax/rdx (GPR) or xmm0/xmm1 (FPR). Tail calls must fit the caller's incoming parameter area. The implementation must dispatch at runtime via `c->target.os` to select SysV or Win64 mode **once**, store the ABI dispatch table in the backend state, and route all argument/return logic through it. - ---- - -## ABI Dispatch & Register Tables - -### X64ABIRegs Structure -The legacy implementation stores a pointer to one of two dispatch tables, **`X64ABIRegs`**, selected at `func_begin` based on `c->target.os`. On the NativeTarget port, this table must be created once per function and held in state so that `plan_call`, `bind_param`, and return logic reuse it. 
- -**Source references:** -- Legacy internal.h ~lines 60–72: X64ABIRegs definition -- Legacy abi_sysv_x64.c & abi_win64_x64.c: ABI initialization functions (not part of this port; the abi/ interface already computes ABIFuncInfo) - -**Table structure (pseudo-code):** -```c -typedef struct X64ABIRegs { - const u32* int_args; // rdi/rsi/rdx/rcx/r8/r9 (SysV, 6) - // rcx/rdx/r8/r9 (Win64, 4) - u32 n_int_args; // 6 or 4 - u32 n_fp_args; // 8 (SysV) or 4 (Win64) - int slot_shared_int_fp; // 0 (SysV) or 1 (Win64): slots shared between int/fp - u32 shadow_space; // 0 (SysV) or 32 (Win64) - int emit_sysv_vararg_save; // 1 (SysV only): emit 176-B GP/FP save area - int vararg_fp_dup_to_gpr; // 1 (Win64): variadic FPs duplicated to GPRs - u64 cs_int_mask; // callee-saved GPRs eligible for save: - // SysV 0xxxxxxxE0 | 0xF000 (rbx/r12-15 + rdi/rsi tail) - // Win64: same + rdi/rsi (home-arg regs) - u64 cs_fp_mask; // callee-saved XMMs: xmm6-15 (10 regs, ~0xFFC0) -} X64ABIRegs; -``` - -**Register encodings in src/arch/x64/isa.h** (hardware register numbers, as used in ModRM/REX fields): -- `X64_RDI` = 7u, `X64_RSI` = 6u, `X64_RDX` = 2u, `X64_RCX` = 1u, `X64_R8` = 8u, `X64_R9` = 9u -- `X64_RAX` = 0u, `X64_RBP` = 5u (frame pointer, reserved) -- `X64_RSP` = 4u (stack pointer, reserved) -- `X64_RBX`, `X64_R12`..`X64_R15` are callee-saved -- `X64_XMM0`..`X64_XMM15` are SSE/FP registers - ---- - -## Call Site Marshalling: `plan_call` & `emit_call` - -### Design Pattern - -The legacy code separates **planning** (compute argument locations, stack usage) from **emission** (generate actual code). The NativeTarget port mirrors this with `plan_call` (populates `NativeCallPlan*`) and `emit_call` (reads the plan, emits moves and the call instruction). - -**Key invariants:** -1. **sret pointer reserves the first integer argument slot** (rdi in SysV, rcx in Win64). The callee receives it as an implicit first parameter in the integer argument register AND as a hidden implicit return-value pointer. On return, it is passed back in rax. -2. 
**Win64 shadow space** (32 bytes = 4 home slots) is caller-reserved at [rsp+0..31] and is counted as part of `stack_arg_size`. SysV has no shadow space. -3. **Variadic calls**: SysV places variadic args into the same register pools as fixed args; Win64 duplicates variadic FP values into the matching GPR and bypasses the FP register pool for variadics. -4. **Stack arguments** must 16-byte-align the rsp *before* the call (i.e., the call instruction itself misaligns rsp by 8 bytes, so outgoing stack args + shadow space must be 16-aligned). -5. **Tail calls** are realized if the callee's outgoing stack args fit in the caller's incoming parameter area (checked via `signature_stack_bytes` against `call_stack_bytes`). - -### `plan_call` Body Sketch - -**Inputs:** `NativeCallDesc* desc` (fn_type via abi_cg_func_info, callee, args[], nargs, results[], nresults, flags, tail_policy, inline_policy) - -**Outputs:** `NativeCallPlan* out` (callee, args[], nargs; rets[], nrets; stack_arg_size; clobber_mask[], return_mask[]; has_sret, is_variadic) - -**Pseudo-code:** -```c -void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, - NativeCallPlan* out) { - // 1. Fetch ABI function signature via the abi/ interface - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); - const X64ABIRegs* x_abi = x64_abi_for_os(t->c->target.os); // [selected once] - - // 2. Initialize output - memset(out, 0, sizeof *out); - out->callee = desc->callee; - out->flags = desc->flags; - out->has_sret = abi && abi->has_sret; - out->is_variadic = abi && abi->variadic; - - // 3. Allocate argument move array (desc->nargs typically * 2 for splits) - out->args = arena_array(t->c->tu, NativeCallPlanMove, - desc->nargs * 2 + 2); - out->nargs = 0; - - // 4. Start with shadow space (Win64 only) - u32 stack = x_abi->shadow_space; - u32 next_int = 0, next_fp = 0; - - // 5. 
sret argument (if present): occupies first int arg slot and stack reservation - if (abi && abi->has_sret) { - if ((desc->flags & CG_CALL_TAIL) == 0) { - // Ordinary call: pass destination address in first int arg reg - NativeCallPlanMove* m = &out->args[out->nargs++]; - m->src = desc->ret.storage; // caller's destination (frame/reg/indirect) - m->src_kind = NATIVE_CALL_MOVE_ADDR; // compute address, not value - m->dst = native_loc_reg(builtin_i64(), NATIVE_REG_INT, - x_abi->int_args[0]); - m->dst_kind = NATIVE_LOC_REG; - } - next_int = 1; // sret consumed first slot; any FP args skip if slot_shared - if (x_abi->slot_shared_int_fp) next_fp = 1; - } - - // 6. Iterate through each argument descriptor - for (u32 i = 0; i < desc->nargs; ++i) { - const NativeLoc* arg = &desc->args[i]; - const ABIArgInfo* ai = abi ? &abi->params[i] : NULL; - - // Handle ABI_ARG_IGNORE, ABI_ARG_INDIRECT (pass address), ABI_ARG_DIRECT (parts) - // For each ABIArgPart: - // - If next_int < n_int_args and cls == INT: assign to int_args[next_int++] - // - Else if next_fp < n_fp_args and cls == FP: assign to xmm_args[next_fp++] - // - Else: stack-pass at offset stack; stack += 8 (aligned per part->align) - // Win64: if slot_shared_int_fp, next_fp mirrors next_int - } - - // 7. Finalize stack size and alignment - out->stack_arg_size = (stack + 15) & ~15; // 16-byte align - - // 8. Allocate return-value array and clobber/return masks - out->rets = arena_array(t->c->tu, NativeCallPlanRet, 4); - out->nrets = 0; // populated by caller after plan_call via abi->ret - - // Clobber mask: all caller-saved registers - for (u32 c = 0; c < NATIVE_CALL_PLAN_CLASSES; ++c) - out->clobber_mask[c] = ~x_abi->cs_*_mask; - - // Return mask: ret registers (rax, rdx for int; xmm0, xmm1 for fp) - // Populated from abi->ret parts -} -``` - -**Key points:** -- **sret as implicit first arg**: SysV passes it in rdi, Win64 in rcx. It occupies a slot and is not separately stack-passed. 
-- **slot_shared_int_fp (Win64)**: When true, next_int and next_fp advance in lockstep so that int_args[1] and xmm_args[1] both point to slot 1, etc. -- **Stack alignment**: must be 16-byte-aligned *before* the call (the call instruction pushes an 8-byte return address, misaligning to 8 mod 16, which is correct for the first instruction of the callee). -- **Variadic handling**: SysV uses the same pools; Win64 duplicates FP args to GPRs (handled in emit_call by checking is_variadic and writing xmm→gpr moves). - -### `emit_call` Body Sketch - -**Inputs:** `NativeCallPlan* plan` (pre-computed by plan_call) - -**Pseudo-code:** -```c -void x64_emit_call(NativeTarget* t, const NativeCallPlan* plan) { - MCEmitter* mc = t->mc; - const X64ABIRegs* x_abi = x64_abi_for_os(t->c->target.os); - - // Tail-call path (covered in tail_call_unrealizable_reason) - if (plan->flags & CG_CALL_TAIL) { - if (plan->has_sret) { - // Load incoming sret pointer from frame into rdi (SysV) or rcx (Win64) - // Spilled at func_begin to preserve it across the body - } - // Restore frame (callee-saved regs, rbp), then jmp/call to callee - return; - } - - // Ordinary call: emit argument moves - for (u32 i = 0; i < plan->nargs; ++i) { - const NativeCallPlanMove* m = &plan->args[i]; - // Resolve m->src (NativeLoc: reg, frame, stack, imm, global, addr) - // Move into m->dst (always a physical register or stack slot) - // For ADDR moves: compute address and store pointer; for VALUE: load/move - } - - // Set AL = number of XMM regs for variadic (SysV convention) - if (plan->is_variadic) { - // Count XMM regs used and emit: mov al, (count) - } - - // Emit the call instruction - if (plan->callee.kind == NATIVE_LOC_GLOBAL) { - // call rel32 + R_X64_PLT32 reloc for function symbols - // call rel32 + R_PC32 reloc for data symbols (rare) - emit_call_rel32(mc, plan->callee.v.global.sym, - plan->callee.v.global.addend); - } else if (plan->callee.kind == NATIVE_LOC_REG) { - // call r/m (opcode FF /2) - u32 r = 
plan->callee.v.reg & 0xFu; - emit_rex(mc, 0, 0, 0, r); - u8 buf[2] = {0xFF, modrm(3u, 2u, r)}; - mc->emit_bytes(mc, buf, 2); - } - - // Return-value harvest (via rets[] populated elsewhere) - // Caller moves from rax/rdx/xmm0/xmm1 into destination -} -``` - -**Emit helpers from legacy emit.c:** -- `emit_rex(mc, w, reg, index, rm)`: emit REX prefix; x64_emit_load_imm calls this -- `emit_mov_rr(mc, w, dst, src)`: register-to-register move (opcode 89/8B) -- `emit_mov_load(mc, sz, signed_ext, dst, base, disp)`: load from [base+disp] into reg -- `emit_mov_store(mc, sz, src, base, disp)`: store reg into [base+disp] -- `emit_sse_rr(mc, prefix2, opcode, dst, src)`: generic SSE register-to-register instruction selected by prefix and opcode (e.g. 0x10 movss/movsd, 0x2E ucomiss/ucomisd, 0x57 xorps/xorpd) -- `x64_emit_load_imm(mc, is64, dst, imm)`: materialize immediate into register (x64 mov-immediate encodings such as `mov r32, imm32` or `movabs r64, imm64`; `movz`/`movk` are AArch64 instructions and do not exist on x64) - ---- - -## Return-Value Marshalling: `plan_ret` & Return Instruction - -### `plan_ret` Body Sketch - -The return path is simpler than calls: the callee takes the return value(s) from the locations supplied in `values[]` (frame slot, register, indirect address) and moves them into the standard return registers. - -**Inputs:** `CGFuncDesc* func`, `NativeLoc* values[]`, `u32 nvalues` (semantic return locations from the optimizer) - -**Outputs:** `NativeCallPlanRet** out_rets`, `u32* out_nrets` (array of moves from return values to rax/rdx/xmm0/xmm1) - -**Pseudo-code:** -```c -void x64_plan_ret(NativeTarget* t, const CGFuncDesc* func, - const NativeLoc* values, u32 nvalues, - NativeCallPlanRet** out_rets, u32* out_nrets) { - // 1. Query function signature - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, func->type); - - // 2. Handle no-return case (void or unreachable) - if (!abi || abi->ret.kind == ABI_ARG_IGNORE) { - *out_nrets = 0; - return; - } - - // 3. Handle sret (structure return): copy struct to caller-supplied address - if (abi->ret.kind == ABI_ARG_INDIRECT) { - // The sret pointer arrived in rdi (SysV) or rcx (Win64). 
- // It was spilled to a frame slot at func_begin (sret_ptr_slot). - // - // Two patterns: - // (a) struct value is in a frame slot: memcpy each granule into [rdi] - // (b) struct already at an address (indirect): memcpy from there - // - // Emit inline memcpy-like moves (8B / 4B / 1B chunks) - // Then move rdi into rax (return the sret pointer) - - *out_nrets = 1; // one "move" that represents the memcpy + rax setup - // *out_rets[0] encodes the source struct and destination (rdi) - } - - // 4. Handle direct return (scalar or small aggregate) - // abi->ret.parts[] lists the parts and their classes (INT or FP) - u32 next_int_ret = 0, next_fp_ret = 0; // indices into rax/rdx, xmm0/xmm1 - for (u16 i = 0; i < abi->ret.nparts; ++i) { - const ABIArgPart* p = &abi->ret.parts[i]; - NativeCallPlanRet* r = &(*out_rets)[(*out_nrets)++]; - - // Determine source location for this part - // (from values[], or constructed from the aggregate) - - // Determine destination register - if (p->cls == ABI_CLASS_INT) { - r->dst = native_loc_reg(..., NATIVE_REG_INT, - next_int_ret == 0 ? X64_RAX : X64_RDX); - next_int_ret++; - } else if (p->cls == ABI_CLASS_FP) { - r->dst = native_loc_reg(..., NATIVE_REG_FP, - X64_XMM0 + next_fp_ret); - next_fp_ret++; - } - } -} -``` - -### Return Instruction Emission - -The legacy `x_ret` emits a plain `ret` (opcode C3) after setting up the return-value registers. The NativeTarget `ret` hook is simpler: it just emits `ret`. 
- -**Pseudo-code:** -```c -void x64_ret(NativeTarget* t) { - u8 op = 0xC3; // ret - t->mc->emit_bytes(t->mc, &op, 1); -} -``` - -**Return register handling:** -- **Int return**: value in rax (64-bit) or eax (32-bit, zero-extended to 64) -- **FP return**: value in xmm0 (both float and double) -- **Multi-register return** (e.g., __int128 or 16-byte struct): rax (first 8B), rdx (next 8B); or xmm0/xmm1 -- **sret return**: address pointer in rax, struct copied to the caller-supplied sret address (the pointer that arrived in rdi on SysV, rcx on Win64) - ---- - -## Parameter Binding: `bind_param` - -Parameters arrive at the callee in one of three places: -1. **Integer registers**: rdi/rsi/rdx/rcx/r8/r9 (SysV) or rcx/rdx/r8/r9 (Win64) -2. **FP registers**: xmm0–xmm7 (SysV) or xmm0–xmm3 (Win64) -3. **Stack**: [rsp+8], [rsp+16], ... (offsets at callee entry, where [rsp] holds the return address; on Win64 the 32-byte shadow space occupies [rsp+8..39] at entry, so stack-passed arguments start at [rsp+40]) - -### `bind_param` Body Sketch - -**Inputs:** `CGParamDesc* param`, `NativeLoc dst` (destination chosen by allocator: REG or FRAME) - -**Pseudo-code:** -```c -void x64_bind_param(NativeTarget* t, const CGParamDesc* param, NativeLoc dst) { - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, param->fn_type); - const ABIArgInfo* ai = &abi->params[param->index]; // param's ABI classification - const X64ABIRegs* x_abi = x64_abi_for_os(t->c->target.os); - - // sret handling (implicit first parameter): - if (param->index == 0 && abi->has_sret) { - // sret pointer arrives in rdi (SysV) or rcx (Win64) - // If dst is NATIVE_LOC_REG: direct move into dst->v.reg - // If dst is NATIVE_LOC_FRAME: store into frame slot - // Special: spill sret_ptr_slot = frame_slot where pointer is saved for tail calls - u32 src_reg = x_abi->int_args[0]; // rdi or rcx - if (dst.kind == NATIVE_LOC_REG) { - emit_mov_rr(t->mc, 1, dst.v.reg, src_reg); - } else if (dst.kind == NATIVE_LOC_FRAME) { - // Store into frame slot; also record sret_ptr_slot for tail calls - emit_mov_store(t->mc, 8, src_reg, X64_RBP, -frame_slot_offset(dst)); - } - return; - } - - // Ordinary parameter: query its ABI part (Int or FP, Reg or Stack) - 
const ABIArgPart* part = &ai->parts[0]; // assume single-part for now - - if (part->loc == ABI_LOC_REG) { - u32 src_reg; - if (part->cls == ABI_CLASS_INT) { - src_reg = x_abi->int_args[next_int]; - } else { - src_reg = X64_XMM0 + next_fp; - } - - if (dst.kind == NATIVE_LOC_REG) { - if (part->cls == ABI_CLASS_INT) { - emit_mov_rr(t->mc, w, dst.v.reg, src_reg); - } else { - emit_sse_rr(t->mc, prefix, 0x10, dst.v.reg, src_reg); - } - } else if (dst.kind == NATIVE_LOC_FRAME) { - if (part->cls == ABI_CLASS_INT) { - emit_mov_store(t->mc, part->size, src_reg, X64_RBP, -frame_offset); - } else { - emit_sse_store(t->mc, prefix, 0x11, src_reg, X64_RBP, -frame_offset); - } - } else if (dst.kind == NATIVE_LOC_NONE) { - // Parameter is unused; ABI register advances but nothing is emitted - } - } else if (part->loc == ABI_LOC_STACK) { - // Parameter on stack: [rsp+stack_offset] (Win64: after shadow space) - u32 in_arg_offset = X64_WIN64_SHADOW_SPACE + (param->index * 8); - - if (dst.kind == NATIVE_LOC_REG) { - // Load from stack into register - emit_mov_load(t->mc, part->size, 0, dst.v.reg, X64_RSP, in_arg_offset); - } else if (dst.kind == NATIVE_LOC_FRAME) { - // Copy from incoming stack to frame slot (via RAX scratch) - emit_mov_load(t->mc, part->size, 0, X64_RAX, X64_RSP, in_arg_offset); - emit_mov_store(t->mc, part->size, X64_RAX, X64_RBP, -frame_offset); - } - } -} -``` - -**Calling context:** -- Called once per parameter during `func_begin` (NativeDirectTarget path) or `func_begin_known_frame` (optimizer path) -- Incoming ABI registers are **never allocable** (they are not in the allocable register pool), so no collision checking is needed -- The caller's allocator pre-computes `dst` (register vs. frame), and this hook simply moves the incoming value to that destination - ---- - -## Signature Stack Bytes & Call Stack Bytes - -### `signature_stack_bytes` - -Computes the stack-parameter bytes a function's *fixed parameters* use (beyond the register pools). 
Used to gate tail-call realizability. - -**Pseudo-code:** -```c -u32 x64_signature_stack_bytes(NativeTarget* t, CfreeCgTypeId fn_type, - int* out_variadic, u32* out_nparams) { - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type); - if (!abi) { - if (out_variadic) *out_variadic = 0; - if (out_nparams) *out_nparams = 0; - return 0; - } - - if (out_variadic) *out_variadic = abi->variadic; - if (out_nparams) *out_nparams = abi->nparams; - - const X64ABIRegs* x_abi = x64_abi_for_os(t->c->target.os); - u32 next_int = abi->has_sret ? 1 : 0, next_fp = 0; - u32 stack = 0; // no shadow space here; shadow is caller's responsibility - - for (u32 i = 0; i < abi->nparams; ++i) { - const ABIArgInfo* ai = &abi->params[i]; - for (u16 j = 0; j < ai->nparts; ++j) { - const ABIArgPart* p = &ai->parts[j]; - if (p->cls == ABI_CLASS_INT && next_int < x_abi->n_int_args) { - next_int++; - } else if (p->cls == ABI_CLASS_FP && next_fp < x_abi->n_fp_args) { - next_fp++; - } else { - stack += 8; // Stack-passed argument - } - if (x_abi->slot_shared_int_fp) next_fp = next_int; - } - } - - return (stack + 15) & ~15; // 16-byte align -} -``` - -### `call_stack_bytes` - -Computes the outgoing stack-argument bytes for a specific *call*, including shadow space. Used in a pre-pass frame-planning phase. - -**Pseudo-code:** -```c -u32 x64_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) { - const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); - const X64ABIRegs* x_abi = x64_abi_for_os(t->c->target.os); - - u32 stack = x_abi->shadow_space; // Win64: 32; SysV: 0 - u32 next_int = abi && abi->has_sret ? 1 : 0, next_fp = 0; - - for (u32 i = 0; i < desc->nargs; ++i) { - const NativeLoc* arg = &desc->args[i]; - const ABIArgInfo* ai = abi ? 
&abi->params[i] : NULL; - if (!ai) continue; - - for (u16 j = 0; j < ai->nparts; ++j) { - const ABIArgPart* p = &ai->parts[j]; - if (p->cls == ABI_CLASS_INT && next_int < x_abi->n_int_args) { - next_int++; - } else if (p->cls == ABI_CLASS_FP && next_fp < x_abi->n_fp_args) { - next_fp++; - } else { - stack += 8; - } - if (x_abi->slot_shared_int_fp) next_fp = next_int; - } - } - - return (stack + 15) & ~15; -} -``` - ---- - -## Tail-Call Realizability & Emission - -### `tail_call_unrealizable_reason` - -A tail call here is a **sibling call**: the callee reuses the caller's stack frame and incoming parameter area, so the callee's outgoing stack args must fit in the caller's incoming parameter window. - -**Pseudo-code:** -```c -const char* x64_tail_call_unrealizable_reason(NativeTarget* t, - const NativeCallDesc* desc) { - // Compute the callee's outgoing stack-arg bytes - u32 callee_stack = x64_call_stack_bytes(t, desc); - - // Compute the caller's incoming stack-arg bytes - // (This requires querying the caller's signature; held in backend state) - u32 caller_incoming = x_impl->incoming_stack_size; // set at func_begin - - if (callee_stack > caller_incoming) { - return "tail call stack arguments exceed the caller's parameter area"; - } - - // Win64: additional check for FP arguments (no true tail calls if fp args conflict) - // Deferred: handle as "not yet implemented" - - return NULL; // realizable -} -``` - -### Tail Call Emission - -On the NativeTarget path, tail-call emission is **deferred (not yet implemented)**; a blocker string is returned instead. A full implementation would: - -1. **Restore callee-saved registers** in reverse order (mirrors `func_begin` prologue) -2. **Emit variadic AL setup** (count of XMM args) -3. **Restore the frame pointer** (mov rsp, rbp; pop rbp / leave instruction) -4. 
**Emit a jump** to the callee (jmp rel32 for global, jmp r/m for indirect) - -**Key constraint:** sret pointer must be forwarded from the caller's incoming sret (spilled to `sret_ptr_slot` at entry). - ---- - -## NativeOps Adapter Block - -The `-O0` (NativeDirectTarget) path requires a `NativeOps` struct with semantic-level call marshalling. The key members for GROUP 4: - -```c -struct NativeOps { - // ... other hooks ... - - void (*plan_call)(NativeDirectTarget*, const NativeCallDesc*, - NativeCallPlan*); - const char* (*tail_call_unrealizable_reason)(NativeDirectTarget*, - const CGCallDesc*); - void (*emit_call)(NativeDirectTarget*, const NativeCallPlan*); - void (*emit_ret)(NativeDirectTarget*, const CGLocal* values, u32 nvalues); - - // ... variadics, asm ... -}; -``` - -**Implementation notes:** -- `plan_call`: map the semantic `NativeCallDesc` (with CGLocal homes) to a `NativeCallPlan` (with physical locations) -- `emit_call`: emit the code to move arguments into registers/stack, call the function, and harvest return values -- `emit_ret`: emit code to place return values in rax/rdx/xmm0/xmm1 and emit `ret` - ---- - -## Integration Checklist - -### Per-Function State (XImpl analog) - -Store in the NativeTarget backend state: -- **X64ABIRegs*** x_abi: selected once at func_begin from c->target.os -- **u32** incoming_stack_size: fixed-param stack bytes (for tail-call checks) -- **NativeFrameSlot** sret_ptr_slot: where the incoming sret pointer is spilled (for tail calls) -- **u32** max_outgoing: largest call's stack arg size (updated by plan_call / emit_call) - -### Relocation Constants (from src/arch/x64/isa.h) - -- **R_X64_PLT32** (42): PC-relative function-call relocation (collapsible to local) -- **R_PC32** (2): PC-relative data-symbol relocation -- **R_X64_REX_GOTPCRELX** (41): GOT-indirect relocation for extern symbols in PIC/PIE - -### Byte Encoders (from legacy emit.c) - -Reuse / migrate to emit.h: -- `emit_rex(mc, w, reg, index, rm)`: REX prefix -- 
`emit_mov_rr(mc, w, dst, src)`: move register to register (opcode 89 / 8B) -- `emit_mov_load(mc, sz, signed_ext, dst, base, disp)`: load from [base+disp] -- `emit_mov_store(mc, sz, src, base, disp)`: store to [base+disp] -- `emit_sse_rr(mc, prefix, opcode, dst, src)`: SSE move -- `x64_emit_load_imm(mc, is64, dst, imm)`: load immediate into register -- `modrm(mod, reg, rm)`: encode ModR/M byte -- `emit_u32le(mc, v)`: emit 32-bit little-endian value - ---- - -## Summary of Data Flow - -1. **func_begin**: Select X64ABIRegs from c->target.os; query caller's signature for incoming_stack_size; compute prologue. -2. **bind_param**: Route each parameter (rdi/rsi/rdx/rcx/r8/r9/xmm0-7 or stack) to its destination (register or frame slot). -3. **plan_call** (when encountering a call): Query callee signature; lay out arg slots (int/fp registers, then stack); return NativeCallPlan with stack_arg_size. -4. **emit_call**: Emit argument moves, set AL (variadic), emit call instruction, harvest return values. -5. **plan_ret**: Route return values (rax/rdx/xmm0/xmm1) to caller-specified destinations. -6. **ret**: Emit the ret instruction. -7. **func_end**: Finalize frame size, patch prologue/epilogue. 
- ---- - -## Critical Divergences from RV64/AA64 - -| Aspect | x64 | RV64/AA64 | -|--------|-----|-----------| -| **Arg regs (int)** | rdi/rsi/rdx/rcx/r8/r9 (SysV); rcx/rdx/r8/r9 (Win64) | a0–a7 (uniform) | -| **Arg regs (fp)** | xmm0–7 (8, SysV); xmm0–3 (4, Win64) | fa0–fa7 (8, uniform) | -| **Shadow space** | 32 bytes (Win64 only); SysV none | None | -| **Slot sharing** | Win64: int & fp slots shared (slot_shared_int_fp=1); SysV: separate | Separate pools | -| **sret** | Reserves first int arg slot; callee also returns pointer in rax | Passed as pointer in register; no special return | -| **Tail calls** | Stack-arg fit check; forward sret if present | Similar, no sret complication | -| **Variadic** | SysV: same pools; Win64: FP dup to GPR | Same pools (RV64); register-save-area (AA64) | - -**Key bug risk (noted in rv64 comments):** sret reserves the first int argument slot even though it is not an ordinary (user-declared) parameter, so the callee must not assign that slot to any user parameter. The ABI query ensures the caller knows this; bind_param must likewise skip the first int slot when sret is present. - - - ---- - -# X64 NativeTarget Porting Guide: GROUP 5 -## Atomics, Variadics, Inline Asm, Intrinsics, File-Scope Asm, Finalize & Cleanup - ---- - -## Overview - -This guide covers the final set of NativeTarget hooks for x64 porting, plus the file management strategy. The x64 backend must transition from the legacy ops.c/alloc.c/opt_coord.c/internal.h architecture to the NativeTarget abstraction (src/arch/native_target.h), driven at -O0 by the NativeDirectTarget + NativeOps adapter and at -O1+ by the optimizer. - -Key constraint: **x64 must support both SysV (Linux/BSD/Unix) and Win64 ABIs** without Apple variants. All ABI queries route through `src/abi/{abi_sysv_x64.c, abi_win64_x64.c}` via `abi_cg_func_info()` and `abi_va_list_layout()`. - -Emit-level byte encoders (emit_rex, emit_mem_operand, emit_mov_rr, etc.) 
live in a new emit.h header shared by native.c and asm.c; function prologue/epilogue and lifecycle hooks remain in native.c. - ---- - -## Part 1: Atomic Operations - -### Context: x86-64 TSO Model - -x86-64 is **strongly ordered (Total Store Order, TSO)**: loads are not reordered with other loads, stores are not reordered with other stores, and the only hardware reordering is that a store may become visible after a later load. A plain `mov` (load or store) therefore satisfies relaxed/acquire/release ordering; only seq_cst needs explicit fencing. The resulting mapping: - -- **atomic_load(seq_cst)**: plain load (TSO guarantees it sees all prior stores) + **mfence** after -- **atomic_store(release)**: plain store (no fence before; TSO makes it visible to future loads) -- **atomic_store(seq_cst)**: **mfence** before, plain store, **mfence** after -- **atomic_rmw/cas**: lock-prefixed instruction (implicit full barrier) -- **fence(seq_cst)**: **mfence** - -### NativeTarget Hooks - -Located in src/arch/native_target.h lines 396–405: - -```c -void (*atomic_load)(NativeTarget*, NativeLoc dst, NativeAddr addr, MemAccess, MemOrder); -void (*atomic_store)(NativeTarget*, NativeAddr addr, NativeLoc src, MemAccess, MemOrder); -void (*atomic_rmw)(NativeTarget*, AtomicOp, NativeLoc dst, NativeAddr addr, - NativeLoc val, MemAccess, MemOrder); -void (*atomic_cas)(NativeTarget*, NativeLoc prior, NativeLoc ok, - NativeAddr addr, NativeLoc expected, NativeLoc desired, - MemAccess, MemOrder success, MemOrder failure); -void (*fence)(NativeTarget*, MemOrder); -``` - -### Body Sketch: x_atomic_load - -**Caller contract**: `dst` = NATIVE_LOC_REG (allocable register); `addr` resolved (base_reg + imm offset, no index). - -```c -static void x_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, - MemAccess mem, MemOrder order) { - MCEmitter* mc = t->mc; - u32 sz = mem.size ? mem.size : x_type_byte_size(t, dst.type); - - /* Resolve addr to (base_reg, imm_offset) — no index. 
*/ - u32 base = x_atomic_addr_base(t, addr); /* Helper: extracts addr.base.reg */ - i32 offset = addr.offset; - - /* x86: plain MOV satisfies all acquire/release/seq_cst for load due to TSO. - Only seq_cst needs post-load mfence. */ - - int signext = type_is_signed(mem.type ? mem.type : dst.type); - x_emit_mov_load(mc, sz, signext, dst.v.reg & 0xfu, base, offset); - - if (order == MO_SEQ_CST) - emit_mfence(mc); -} -``` - -**Emit helpers** (from git 429defa:src/arch/x64/emit.c): -- `emit_mfence()`: 0x0f 0xae 0xf0 (3 bytes) -- `x_emit_mov_load(sz, signext, dst_reg, base, offset)`: mov reg, [base+offset] with size/sign handling (reuse emit.h encoders) - -### Body Sketch: x_atomic_store - -```c -static void x_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src, - MemAccess mem, MemOrder order) { - MCEmitter* mc = t->mc; - u32 sz = mem.size ? mem.size : x_type_byte_size(t, src.type); - u32 base = x_atomic_addr_base(t, addr); - i32 offset = addr.offset; - - if (order == MO_SEQ_CST) - emit_mfence(mc); - - x_emit_mov_store(mc, sz, src.v.reg & 0xfu, base, offset); - - if (order == MO_SEQ_CST) - emit_mfence(mc); -} -``` - -### Body Sketch: x_atomic_rmw - -**Lock-prefixed read-modify-write** for add/sub/xchg (single instruction); **cmpxchg retry loop** for and/or/xor/nand. - -```c -static void x_atomic_rmw(NativeTarget* t, AtomicOp op, NativeLoc dst, - NativeAddr addr, NativeLoc val, MemAccess mem, - MemOrder order) { - MCEmitter* mc = t->mc; - u32 sz = mem.size ? mem.size : x_type_byte_size(t, dst.type); - int w = (sz == 8) ? 1 : 0; - u32 base = x_atomic_addr_base(t, addr); - i32 offset = addr.offset; - u32 dr = dst.v.reg & 0xfu; - u32 tmp_reg = X64_R11; /* Working register */ - - /* (void)order; LOCK prefixed ops are unconditionally full barriers. */ - - /* Materialize val into tmp_reg. For SUB, negate it. 
*/ - if (val.kind == NATIVE_LOC_IMM) { - i64 v = val.v.imm; - if (op == AO_SUB) v = -v; - x_emit_load_imm(mc, w, tmp_reg, v); - } else if (val.kind == NATIVE_LOC_REG) { - u32 vr = val.v.reg & 0xfu; - if (vr != tmp_reg) - emit_mov_rr(mc, w, tmp_reg, vr); - if (op == AO_SUB) - emit_f7_rm(mc, w, 3u, tmp_reg); /* NEG tmp_reg */ - } else { - compiler_panic(t->c, /* loc */, "x64 atomic_rmw: val kind unsupported"); - } - - if (op == AO_ADD || op == AO_SUB) { - /* LOCK XADD [base+offset], tmp_reg - Opcode: 0xf0 (lock) 0x0f 0xc1 /r - Afterwards tmp_reg = prior value. */ - emit_lock_xadd(mc, w, tmp_reg, base, offset); - if (dr != tmp_reg) - emit_mov_rr(mc, w, dr, tmp_reg); - return; - } - - if (op == AO_XCHG) { - /* LOCK XCHG [base+offset], tmp_reg (lock is implicit for XCHG mem) - Opcode: 0xf0 (explicit) 0x87 /r */ - emit_lock_xchg_mem(mc, w, tmp_reg, base, offset); - if (dr != tmp_reg) - emit_mov_rr(mc, w, dr, tmp_reg); - return; - } - - /* AND/OR/XOR/NAND: CMPXCHG retry loop - rax = prior, rcx = new, r11 = val - .retry: lr.w/d rd, [mem] (load-reserve, implicit aq=1) - <new op val> - [NAND: not new] - lock cmpxchg [mem], new - jne .retry */ - - x_emit_mov_load(mc, sz, 0, X64_RAX, base, offset); - MCLabel L_retry = mc->label_new(mc); - mc->label_place(mc, L_retry); - emit_mov_rr(mc, w, X64_RCX, X64_RAX); - - switch (op) { - case AO_AND: - emit_alu_rr(mc, w, 0x21, X64_RCX, tmp_reg); /* AND */ - break; - case AO_OR: - emit_alu_rr(mc, w, 0x09, X64_RCX, tmp_reg); /* OR */ - break; - case AO_XOR: - emit_alu_rr(mc, w, 0x31, X64_RCX, tmp_reg); /* XOR */ - break; - case AO_NAND: - emit_alu_rr(mc, w, 0x21, X64_RCX, tmp_reg); /* AND */ - emit_f7_rm(mc, w, 2u, X64_RCX); /* NOT rcx */ - break; - default: - compiler_panic(t->c, /* loc */, "unsupported atomic rmw op"); - } - - emit_lock_cmpxchg(mc, w, X64_RCX, base, offset); - /* jne .retry (ZF = 0 if failed) */ - emit_jcc_label(mc, X64_CC_NE, L_retry); - - if (dr != X64_RAX) - emit_mov_rr(mc, w, dr, X64_RAX); -} -``` - -**Emit 
helpers**: -- `emit_lock_xadd(w, src, base, offset)`: lock.prefix + rex + 0x0f 0xc1 + modrm+sib+disp -- `emit_lock_xchg_mem(w, src, base, offset)`: lock.prefix + rex + 0x87 + modrm+sib+disp -- `emit_lock_cmpxchg(w, src, base, offset)`: lock.prefix + rex + 0x0f 0xb1 + modrm+sib+disp - -### Body Sketch: x_atomic_cas - -**Caller contract**: Compare-and-swap with two memory orders (success/failure). Returns: -- `prior`: the loaded value (old or new depending on success) -- `ok`: condition flag (0/1 indicating success) - -```c -static void x_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, - NativeAddr addr, NativeLoc expected, NativeLoc desired, - MemAccess mem, MemOrder success, MemOrder failure) { - MCEmitter* mc = t->mc; - u32 sz = mem.size ? mem.size : x_type_byte_size(t, prior.type); - int w = (sz == 8) ? 1 : 0; - u32 base = x_atomic_addr_base(t, addr); - i32 offset = addr.offset; - - /* CMPXCHG uses RAX as implicit operand (compare value). - On success, ZF=1; on failure, ZF=0 (and RAX=actual). */ - - /* Materialize expected into RAX. */ - if (expected.kind == NATIVE_LOC_IMM) - x_emit_load_imm(mc, w, X64_RAX, expected.v.imm); - else - emit_mov_rr(mc, w, X64_RAX, expected.v.reg & 0xfu); - - /* Materialize desired into RCX (working reg). */ - if (desired.kind == NATIVE_LOC_IMM) - x_emit_load_imm(mc, w, X64_RCX, desired.v.imm); - else - emit_mov_rr(mc, w, X64_RCX, desired.v.reg & 0xfu); - - /* LOCK CMPXCHG [base+offset], rcx - Implicit: rax = expected, mem = current. ZF = (mem == expected). */ - emit_lock_cmpxchg(mc, w, X64_RCX, base, offset); - - /* prior = rax (actual value, either old or fetched). */ - if (prior.v.reg != X64_RAX) - emit_mov_rr(mc, w, prior.v.reg & 0xfu, X64_RAX); - - /* ok = (ZF ? 1 : 0) via SETO or SETNE. SETO sets byte to 1 if OF=1 (never happens - for cmpxchg); better: SETE (ZF=1). */ - u32 ok_reg = ok.v.reg & 0xfu; - emit_setcc(mc, X64_CC_E, ok_reg); /* sete ok_reg */ - /* Zero-extend the byte result. 
*/ - emit_movzx(mc, 0, ok_reg, ok_reg, 0); /* movzx ok_reg, ok_reg_b */ -} -``` - -**Emit helpers**: -- `emit_setcc(cond, reg)`: 0x0f 0x9<cond> modrm — sets byte in reg based on condition (ZF, OF, etc.) -- `emit_movzx(w, dst, src, sign)`: zero/sign extends src into dst - -### Body Sketch: x_fence - -```c -static void x_fence(NativeTarget* t, MemOrder order) { - if (order == MO_SEQ_CST) - emit_mfence(t->mc); - /* Other orders (acquire, release, relaxed) are implicit in TSO. */ -} -``` - ---- - -## Part 2: Variadic Argument Handling - -### Context: SysV vs Win64 va_list Layout - -Two fundamentally different designs: - -**SysV x64** (Linux/BSD): -- `va_list` is a 24-byte struct holding gp_offset, fp_offset, overflow_arg_area, reg_save_area -- Prologue saves 6 GPR (rdi, rsi, rdx, rcx, r8, r9) + 8 XMM (xmm0..7) to a 176-byte register save area (6*8 + 8*16) -- va_arg scans offsets and fetches from either the save area (if offset < max) or overflow area - -**Win64** (Windows): -- `va_list` is a single pointer to the next variadic stack slot -- No register save area; variadic args are on the stack -- Caller-side **vararg_fp_dup_to_gpr** flag: FP varargs are duplicated into the matching GPR slot by the call site - -### NativeTarget Hooks - -Lines 413–417 in src/arch/native_target.h: - -```c -void (*va_start_)(NativeTarget*, NativeLoc ap_ptr); -void (*va_arg_)(NativeTarget*, NativeLoc dst, NativeLoc ap_ptr, CfreeCgTypeId type); -void (*va_end_)(NativeTarget*, NativeLoc ap_ptr); -void (*va_copy_)(NativeTarget*, NativeLoc dst_ap_ptr, NativeLoc src_ap_ptr); -``` - -**Caller contract**: -- `ap_ptr`: NATIVE_LOC_REG or NATIVE_LOC_FRAME (the va_list object's address) -- `dst`: for va_arg, NATIVE_LOC_REG (destination register for the fetched value) -- The backend queries the ABI via `abi_va_list_layout(t->c->abi)` to determine structure layout - -### Body Sketch: x_va_start_ - -**Must query the ABI** to determine SysV vs Win64. Use `abi_cg_func_info()` + `abi_va_list_layout()`. 
- -```c -static void x_va_start_(NativeTarget* t, NativeLoc ap_ptr) { - X64NativeTarget* a = x64_of(t); - MCEmitter* mc = t->mc; - - if (!a->is_variadic) - compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0}, - "x64 va_start: function not variadic"); - - ABIVaListInfo vai = abi_va_list_layout(t->c->abi); - u32 ap_reg = ap_ptr.v.reg & 0xfu; /* Address of va_list object */ - - if (vai.kind == ABI_VA_LIST_POINTER) { - /* Win64: va_list = pointer to next stack slot. - Incoming variadics start at [rbp + 16 + (n_named_int * 8) + n_named_stack]. - (The prologue's shadow space at [rbp+0..16] holds a copy of RCX/RDX/R8/R9.) */ - u32 first_var_off = 16u + (a->next_param_int * 8u) + a->next_param_stack; - x_emit_lea(mc, X64_RAX, X64_RBP, (i32)first_var_off); - x_emit_mov_store(mc, 8, X64_RAX, ap_reg, 0); /* *ap = lea result */ - return; - } - - if (vai.kind == ABI_VA_LIST_SYSV_STRUCT) { - /* SysV: 24-byte va_list struct with 4 fields, plus 176-byte register save area. - *ap = { gp_offset=0, fp_offset=48, overflow_arg_area, reg_save_area } */ - - /* Get the register save slot (allocated during func_begin). */ - X64NativeSlot* rs = x64_slot_get(a, a->reg_save_slot); - if (!rs) - compiler_panic(t->c, a->func ? 
a->func->loc : (SrcLoc){0, 0, 0}, - "x64 va_start: no reg_save_slot"); - - /* gp_offset = next_param_int * 8 (bytes into save area for next GP reg) */ - x_emit_load_imm(mc, 0, X64_RAX, (i64)(a->next_param_int * 8u)); - x_emit_mov_store(mc, 4, X64_RAX, ap_reg, 0); - - /* fp_offset = 48 + next_param_fp * 16 (XMM area starts at byte 48) */ - x_emit_load_imm(mc, 0, X64_RAX, (i64)(48u + a->next_param_fp * 16u)); - x_emit_mov_store(mc, 4, X64_RAX, ap_reg, 4); - - /* overflow_arg_area = rbp + 16 + next_param_stack (stack args start above saved pair) */ - x_emit_lea(mc, X64_RAX, X64_RBP, (i32)(16u + a->next_param_stack)); - x_emit_mov_store(mc, 8, X64_RAX, ap_reg, 8); - - /* reg_save_area = rbp - rs->off (address of register save area) */ - x_emit_lea(mc, X64_RAX, X64_RBP, -(i32)rs->off); - x_emit_mov_store(mc, 8, X64_RAX, ap_reg, 16); - return; - } - - compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0}, - "x64 va_start: unsupported va_list layout"); -} -``` - -**Key state tracked** (set during func_begin): -- `a->next_param_int`: count of GPR params consumed by the signature -- `a->next_param_fp`: count of FP params -- `a->next_param_stack`: byte offset of first stack param relative to rbp -- `a->reg_save_slot`: frame slot holding the register save area (SysV only) -- `a->is_variadic`: whether the function signature is variadic - -### Body Sketch: x_va_arg_ - -Complex: SysV has two paths (register or overflow area), Win64 is simpler. - -```c -static void x_va_arg_(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, - CfreeCgTypeId type) { - X64NativeTarget* a = x64_of(t); - MCEmitter* mc = t->mc; - - ABIVaListInfo vai = abi_va_list_layout(t->c->abi); - u32 ap_reg = ap_ptr.v.reg & 0xfu; - u32 sz = x_type_byte_size(t, type); - int is_fp = (dst.cls == NATIVE_REG_FP); - u32 dr = dst.v.reg & 0xfu; - - if (vai.kind == ABI_VA_LIST_POINTER) { - /* Win64: va_list is a plain pointer. All variadics are 8-byte slots - on the stack. 
FP varargs are duplicated into GPR by call site. - - r11 = *ap (current slot address) - if (is_fp) xmm_dst = [r11] else dst = [r11] - r11 += 8 - *ap = r11 */ - - x_emit_mov_load(mc, 8, 0, X64_R11, ap_reg, 0); - if (is_fp) { - u8 prefix = (sz == 8) ? 0xf2 : 0xf3; - x_emit_sse_load(mc, prefix, 0x10, dr, X64_R11, 0); /* movs[sd] */ - } else { - int sx = type_is_signed(type); - x_emit_mov_load(mc, sz, sx, dr, X64_R11, 0); - } - - /* add r11, 8 (advance to next slot) */ - x_emit_alu_imm(mc, 1, 0, X64_R11, 8); /* ADD */ - x_emit_mov_store(mc, 8, X64_R11, ap_reg, 0); - return; - } - - if (vai.kind == ABI_VA_LIST_SYSV_STRUCT) { - /* SysV: check if arg came via register or overflow. - offs_field = (is_fp ? 4 : 0) (gp_offset or fp_offset field offset) - max_offs = (is_fp ? 176 : 48) (end of register save area) - stride = (is_fp ? 16 : 8) (bytes per register slot) - - eax = va_list[offs_field] - if (eax >= max_offs) goto L_stack - dst = va_list[reg_save_area] + eax - eax += stride - va_list[offs_field] = eax - goto L_done - L_stack: - dst = va_list[overflow_arg_area] - va_list[overflow_arg_area] += 8 - L_done: */ - - u32 offs_field = is_fp ? 4u : 0u; - u32 max_offs = is_fp ? 176u : 48u; - u32 stride = is_fp ? 16u : 8u; - - MCLabel L_stack = mc->label_new(mc); - MCLabel L_done = mc->label_new(mc); - - /* Load offset field. */ - x_emit_mov_load(mc, 4, 0, X64_RAX, ap_reg, (i32)offs_field); - - /* Compare with max_offs. */ - if (max_offs <= 127u) { - x_emit_cmp_imm8(mc, 0, X64_RAX, (i8)max_offs); - } else { - /* cmp eax, imm32 via 0x3d (EAX-specific form). */ - u32 ofs = obj_pos(mc->obj, mc->section_id); - u8 op = 0x3d; - mc->emit_bytes(mc, &op, 1); - emit_u32le(mc, max_offs); - /* Debug row if needed */ - } - - /* jae L_stack (jump if >=, i.e., not in register range). */ - emit_jcc_label(mc, X64_CC_AE, L_stack); - - /* In-register path: dst = reg_save_area + eax. 
*/ - x_emit_mov_load(mc, 8, 0, X64_RCX, ap_reg, 16); /* rcx = reg_save_area */ - x_emit_lea(mc, X64_RCX, X64_RCX, 0); /* lea rcx, [rcx + eax * scale] — need indexed addr */ - /* Actually: add rcx, rax; then load from rcx. */ - x_emit_alu_rr(mc, 0, 0x01, X64_RCX, X64_RAX); /* add eax, ecx */ - - if (is_fp) { - u8 prefix = (sz == 8) ? 0xf2 : 0xf3; - x_emit_sse_load(mc, prefix, 0x10, dr, X64_RCX, 0); - } else { - int sx = type_is_signed(type); - x_emit_mov_load(mc, sz, sx, dr, X64_RCX, 0); - } - - /* Update offset: offset += stride. */ - if (stride <= 127u) { - x_emit_alu_imm8(mc, 0, 0, X64_RAX, (i8)stride); /* add eax, stride */ - } else { - x_emit_alu_imm32(mc, 0, 0, X64_RAX, stride); - } - x_emit_mov_store(mc, 4, X64_RAX, ap_reg, (i32)offs_field); - - emit_jmp_label(mc, L_done); - - /* Overflow area path: dst = overflow_arg_area; overflow_arg_area += 8. */ - mc->label_place(mc, L_stack); - x_emit_mov_load(mc, 8, 0, X64_RCX, ap_reg, 8); /* rcx = overflow_arg_area */ - - if (is_fp) { - u8 prefix = (sz == 8) ? 0xf2 : 0xf3; - x_emit_sse_load(mc, prefix, 0x10, dr, X64_RCX, 0); - } else { - int sx = type_is_signed(type); - x_emit_mov_load(mc, sz, sx, dr, X64_RCX, 0); - } - - x_emit_alu_imm(mc, 1, 0, X64_RCX, 8); /* add rcx, 8 */ - x_emit_mov_store(mc, 8, X64_RCX, ap_reg, 8); - - mc->label_place(mc, L_done); - return; - } - - compiler_panic(t->c, a->func ? a->func->loc : (SrcLoc){0, 0, 0}, - "x64 va_arg: unsupported va_list layout"); -} -``` - -**Key point**: For SysV, the register-save area is **prologue-emitted** in x_emit_variadic_reg_saves() (part of x_func_begin). The caller must ensure 176 bytes are reserved on the stack and the 6 GPR + 8 XMM are saved at a fixed offset. - -### Body Sketch: x_va_end_ & x_va_copy_ - -```c -static void x_va_end_(NativeTarget* t, NativeLoc ap_ptr) { - (void)t; - (void)ap_ptr; - /* x64 va_end is a no-op (no resources to clean up). 
*/ -} - -static void x_va_copy_(NativeTarget* t, NativeLoc dst_ap_ptr, NativeLoc src_ap_ptr) { - MCEmitter* mc = t->mc; - /* Copy 24 bytes (SysV) or 8 bytes (Win64) from src to dst. - For simplicity, memcpy the whole va_list struct. */ - - ABIVaListInfo vai = abi_va_list_layout(t->c->abi); - u32 copy_sz = (vai.kind == ABI_VA_LIST_POINTER) ? 8u : 24u; - - u32 src_ptr = src_ap_ptr.v.reg & 0xfu; - u32 dst_ptr = dst_ap_ptr.v.reg & 0xfu; - - /* r10 = src; r11 = dst; copy copy_sz bytes. */ - x_emit_mov_load(mc, copy_sz, 0, X64_R10, src_ptr, 0); - x_emit_mov_store(mc, copy_sz, X64_R10, dst_ptr, 0); - - /* For 24 bytes, split into 8+8+8 or use a loop. Simpler: two 8-byte moves. */ - if (copy_sz > 8u) { - x_emit_mov_load(mc, 8, 0, X64_R10, src_ptr, 8); - x_emit_mov_store(mc, 8, X64_R10, dst_ptr, 8); - if (copy_sz > 16u) { - x_emit_mov_load(mc, 8, 0, X64_R10, src_ptr, 16); - x_emit_mov_store(mc, 8, X64_R10, dst_ptr, 16); - } - } -} -``` - -### NativeOps Adapter (NativeDirectTarget path) - -The -O0 direct path uses semantic operands (OPK_REG, OPK_LOCAL), not NativeLoc. The NativeOps vtable (src/cg/native_direct_target.h lines 81–86) bridges: - -```c -struct NativeOps { - ... - void (*va_start_)(NativeDirectTarget*, Operand ap_addr); - void (*va_arg_)(NativeDirectTarget*, Operand dst, Operand ap_addr, CfreeCgTypeId); - void (*va_end_)(NativeDirectTarget*, Operand ap_addr); - void (*va_copy_)(NativeDirectTarget*, Operand dst_ap_addr, Operand src_ap_addr); - ... -}; -``` - -Pattern (mirror rv64's rv_direct_va_base + rv_va_*_core): - -```c -/* Convert semantic Operand (OPK_LOCAL holding va_list struct) to NativeAddr. - Key issue: OPK_LOCAL is the address of the frame slot; we need to pass that - pointer as a register location to the native hooks. 
*/ -static NativeAddr x_direct_va_base(NativeDirectTarget* d, Operand ap_addr, - u32 scratch_reg) { - NativeAddr addr; - memset(&addr, 0, sizeof addr); - - if (ap_addr.kind == OPK_LOCAL) { - /* Load the address into scratch_reg, then use that register. */ - NativeTarget* nt = d->native; - OperandLoc floc = /* resolve ap_addr to frame slot */; - /* Load frame address into scratch_reg. */ - addr.base_kind = NATIVE_ADDR_BASE_REG; - addr.base.reg = scratch_reg; - } else if (ap_addr.kind == OPK_REG) { - addr.base_kind = NATIVE_ADDR_BASE_REG; - addr.base.reg = ap_addr.v.reg; - } - return addr; -} - -static void x_va_start_direct(NativeDirectTarget* d, Operand ap_addr) { - NativeTarget* nt = d->native; - /* Resolve ap_addr to a register (or load it into a scratch). */ - NativeLoc ap_ptr = nd_materialize_operand(d, ap_addr); /* Helper: load if needed */ - nt->va_start_(nt, ap_ptr); -} -``` - ---- - -## Part 3: Inline Assembly (Inline Asm) - -### Context: x64 Inline Assembly Constraints - -x64 constraints in the legacy asm.c (git 429defa:src/arch/x64/asm.c): -- **r**: GPR (rax, rbx, ..., r15) -- **m**: Memory operand (direct reg, indirect [reg], indexed [reg+scale*idx], RIP-relative) -- **i**: Immediate (any constant) -- **a,b,c,d,S,D**: Specific registers (rax, rbx, rcx, rdx, rsi, rdi) -- **x**: XMM registers (xmm0..xmm15) -- Width qualifiers: b (byte), w (word), l (dword), q (qword) - -### NativeTarget Hooks - -Line 420–423 in src/arch/native_target.h: - -```c -void (*asm_block)(NativeTarget*, const char* tmpl, const AsmConstraint* outs, - u32 nout, NativeLoc* out_locs, const AsmConstraint* ins, - u32 nin, const NativeLoc* in_locs, const Sym* clobbers, - u32 nclob); -``` - -**Caller contract**: All operand locations are already physically bound (NATIVE_LOC_REG, NATIVE_LOC_FRAME, NATIVE_LOC_IMM, NATIVE_LOC_ADDR). No further allocation needed. 
- -### Body Sketch: x_asm_block_native - -Mirrors aa64/rv64 pattern: open an Asm context, bind operands (semantic → physical), run template expansion, close. - -```c -static void x_asm_block_native(NativeTarget* t, const char* tmpl, - const AsmConstraint* outs, u32 nout, - NativeLoc* out_locs, - const AsmConstraint* ins, u32 nin, - const NativeLoc* in_locs, - const Sym* clobbers, u32 nclob) { - X64NativeTarget* a = x64_of(t); - Compiler* c = t->c; - SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; - - /* Allocate bound operand arrays. */ - Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; - Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; - - X64Asm* asmh; - u32 i; - - /* Track clobbered registers for prologue/epilogue purposes. */ - for (i = 0; i < nclob; ++i) { - Reg phys; - RegClass cls; - if (!c->resolve_reg_name || c->resolve_reg_name(c, clobbers[i], &phys, &cls) != 0) - continue; - if (cls == RC_INT) { - /* Mark callee-saved regs that are clobbered. */ - if (phys == X64_RBX || phys == X64_R12 || phys == X64_R13 || - phys == X64_R14 || phys == X64_R15) - a->used_cs_int_mask |= (1u << phys); - } - } - - /* Bind outputs: constraint + out_locs[i] → Operand. */ - for (i = 0; i < nout; ++i) { - CfreeCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type; - x_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i]); - } - - /* Bind inputs. */ - for (i = 0; i < nin; ++i) { - const char* constraint = ins[i].str; - /* Check for matching constraint (e.g., "0" = matches output 0). */ - int matched = x_asm_match_index(constraint); - CfreeCgTypeId type = ins[i].type ? ins[i].type : in_locs[i].type; - - if (matched >= 0) { - if ((u32)matched >= nout) - compiler_panic(c, loc, "x64 asm: matching constraint out of range"); - bound_ins[i] = bound_outs[matched]; - continue; - } - - /* Regular constraint. 
*/ - x_asm_bind_native(a, loc, &bound_ins[i], constraint, type, in_locs[i]); - } - - /* Open asm template processor, run, close. */ - asmh = x64_asm_open(c); - x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, - clobbers, nclob); - x64_asm_run_template(asmh, t->mc, tmpl); - x64_asm_close(asmh); -} - -/* Helper: bind a single constraint to a native location. - Handles "r" → allocable GPR, "m" → [reg+offset], "i" → immediate, etc. */ -static void x_asm_bind_native(X64NativeTarget* a, SrcLoc loc, - Operand* out, const char* constraint, - CfreeCgTypeId type, NativeLoc nloc) { - /* Parse constraint (e.g., "=r", "+m", "i", "0", etc.). */ - const char* body = x_asm_constraint_body(constraint); - - memset(out, 0, sizeof *out); - - if (body[0] == 'r') { - /* Register constraint: expect nloc.kind == NATIVE_LOC_REG. */ - if (nloc.kind != NATIVE_LOC_REG) - compiler_panic(a->base.c, loc, "x64 asm: 'r' constraint needs register"); - out->kind = OPK_REG; - out->cls = x_class_from_type(type); /* RC_INT or RC_FP */ - out->v.reg = nloc.v.reg; - return; - } - - if (body[0] == 'm') { - /* Memory constraint: nloc is NATIVE_LOC_FRAME or NATIVE_LOC_STACK. */ - if (nloc.kind == NATIVE_LOC_FRAME) { - X64NativeSlot* s = x64_slot_get(a, nloc.v.frame); - out->kind = OPK_LOCAL; - out->v.frame_slot = nloc.v.frame; - } else if (nloc.kind == NATIVE_LOC_ADDR) { - /* Materialized address: [base + disp]. */ - out->kind = OPK_INDIRECT; - out->v.ind.base = nloc.v.addr.base.reg; - out->v.ind.index = REG_NONE; - out->v.ind.ofs = nloc.v.addr.offset; - } else { - compiler_panic(a->base.c, loc, "x64 asm: 'm' constraint needs memory"); - } - return; - } - - if (body[0] == 'i') { - /* Immediate constraint: nloc.kind == NATIVE_LOC_IMM. 
*/ - if (nloc.kind != NATIVE_LOC_IMM) - compiler_panic(a->base.c, loc, "x64 asm: 'i' constraint needs immediate"); - out->kind = OPK_IMM; - out->v.imm = nloc.v.imm; - return; - } - - if (body[0] == 'a' || body[0] == 'b' || body[0] == 'c' || body[0] == 'd' || - body[0] == 'S' || body[0] == 'D') { - /* Specific register constraint (a=rax, b=rbx, c=rcx, d=rdx, S=rsi, D=rdi). */ - static const Reg map[] = {X64_RAX, X64_RBX, X64_RCX, X64_RDX, X64_RSI, X64_RDI}; - const char* names = "abcdSD"; - for (int j = 0; j < 6; ++j) { - if (body[0] == names[j]) { - if (nloc.kind != NATIVE_LOC_REG || (nloc.v.reg & 0xfu) != map[j]) - compiler_panic(a->base.c, loc, "x64 asm: constraint '%c' requires %s", - body[0], /* reg name */); - out->kind = OPK_REG; - out->cls = RC_INT; - out->v.reg = nloc.v.reg; - return; - } - } - } - - if (body[0] == 'x') { - /* XMM constraint: expect NATIVE_LOC_REG with FP class. */ - if (nloc.kind != NATIVE_LOC_REG || nloc.cls != NATIVE_REG_FP) - compiler_panic(a->base.c, loc, "x64 asm: 'x' constraint needs XMM"); - out->kind = OPK_REG; - out->cls = RC_FP; - out->v.reg = nloc.v.reg; - return; - } - - compiler_panic(a->base.c, loc, "x64 asm: unsupported constraint '%s'", constraint); -} -``` - -### Legacy asm.c Integration - -The standalone assembler (asm.c) currently includes internal.h to access legacy types and must be updated to include a new emit.h header instead. See file management section. - ---- - -## Part 4: Intrinsics - -### NativeTarget Hook - -Line 418–419 in src/arch/native_target.h: - -```c -void (*intrinsic)(NativeTarget*, IntrinKind, const NativeLoc* dsts, u32 ndst, - const NativeLoc* args, u32 narg); -``` - -IntrinKind enum (from cg/cgtarget.h): INTRIN_POPCOUNT, INTRIN_CTZ, INTRIN_CLZ, INTRIN_BSWAP*, INTRIN_EXPECT, INTRIN_ASSUME_ALIGNED, INTRIN_PREFETCH, INTRIN_TRAP, INTRIN_UNREACHABLE, INTRIN_*_OVERFLOW, and memory intrinsics (MEMCPY, MEMSET). 
- -### Body Sketch: x_intrinsic - -```c -static void x_intrinsic(NativeTarget* t, IntrinKind kind, const NativeLoc* dsts, - u32 ndst, const NativeLoc* args, u32 narg) { - MCEmitter* mc = t->mc; - (void)ndst; /* Caller guarantees valid indices. */ - (void)narg; - - switch (kind) { - case INTRIN_POPCOUNT: { - /* POPCNT rd, rs: F3 0F B8 /r. Requires SSE4.2. */ - if (ndst >= 1 && narg >= 1) { - u32 sz = x_type_byte_size(t, args[0].type); - int w = (sz == 8) ? 1 : 0; - emit_popcnt(mc, w, dsts[0].v.reg & 0xfu, args[0].v.reg & 0xfu); - } - return; - } - - case INTRIN_CTZ: { - /* BSF gives index of lowest set bit; ZF if input is 0 (undefined). */ - if (ndst >= 1 && narg >= 1) { - u32 sz = x_type_byte_size(t, args[0].type); - int w = (sz == 8) ? 1 : 0; - emit_bs(mc, w, 0xbc, dsts[0].v.reg & 0xfu, args[0].v.reg & 0xfu); /* BSF */ - } - return; - } - - case INTRIN_CLZ: { - /* BSR gives index of highest set bit; XOR with (bits-1) for CLZ. */ - if (ndst >= 1 && narg >= 1) { - u32 sz = x_type_byte_size(t, args[0].type); - int w = (sz == 8) ? 1 : 0; - u32 dr = dsts[0].v.reg & 0xfu; - emit_bs(mc, w, 0xbd, dr, args[0].v.reg & 0xfu); /* BSR */ - emit_xor_imm32(mc, w, dr, w ? 63 : 31); - } - return; - } - - case INTRIN_BSWAP16: - case INTRIN_BSWAP32: - case INTRIN_BSWAP64: { - if (ndst >= 1 && narg >= 1) { - u32 dr = dsts[0].v.reg & 0xfu; - u32 sr = args[0].v.reg & 0xfu; - int w = (kind == INTRIN_BSWAP64) ? 1 : 0; - if (dr != sr) emit_mov_rr(mc, w, dr, sr); - if (kind == INTRIN_BSWAP16) { - emit_rol16_imm8(mc, dr, 8); /* ROR dx, 8 (16-bit) */ - } else { - emit_bswap(mc, w, dr); /* BSWAP rax/eax */ - } - } - return; - } - - case INTRIN_SADD_OVERFLOW: - case INTRIN_UADD_OVERFLOW: - case INTRIN_SSUB_OVERFLOW: - case INTRIN_USUB_OVERFLOW: - case INTRIN_SMUL_OVERFLOW: - case INTRIN_UMUL_OVERFLOW: { - /* Result in dsts[0], overflow flag in dsts[1]. */ - if (ndst >= 2 && narg >= 2) { - u32 sz = x_type_byte_size(t, dsts[0].type); - int w = (sz == 8) ? 
1 : 0; - u32 rd = dsts[0].v.reg & 0xfu; - u32 ro = dsts[1].v.reg & 0xfu; /* overflow flag */ - - u32 ra = args[0].v.reg & 0xfu; - u32 rb = args[1].v.reg & 0xfu; - - switch (kind) { - case INTRIN_SADD_OVERFLOW: - case INTRIN_UADD_OVERFLOW: - emit_mov_rr(mc, w, rd, ra); - emit_alu_rr(mc, w, 0x01, rd, rb); /* ADD */ - break; - case INTRIN_SSUB_OVERFLOW: - case INTRIN_USUB_OVERFLOW: - emit_mov_rr(mc, w, rd, ra); - emit_alu_rr(mc, w, 0x29, rd, rb); /* SUB */ - break; - case INTRIN_SMUL_OVERFLOW: - case INTRIN_UMUL_OVERFLOW: - emit_imul_rr(mc, w, rd, rb); /* IMUL */ - break; - default: - break; - } - - /* SETO ro (set if overflow: 0x0F 0x90 /0) */ - emit_setcc(mc, X64_CC_O, ro); - emit_movzx(mc, 0, ro, ro, 0); - } - return; - } - - case INTRIN_TRAP: - case INTRIN_UNREACHABLE: { - /* UD2: 0x0F 0x0B (undefined instruction) */ - mc->emit_bytes(mc, (u8[]){0x0f, 0x0b}, 2); - return; - } - - case INTRIN_EXPECT: - case INTRIN_ASSUME_ALIGNED: { - /* Hints dropped; just copy value. */ - if (ndst >= 1 && narg >= 1) { - if (args[0].kind == NATIVE_LOC_IMM) - x_emit_load_imm(mc, x_is_64(t, dsts[0].type) ? 1 : 0, - dsts[0].v.reg & 0xfu, args[0].v.imm); - else - emit_mov_rr(mc, x_is_64(t, dsts[0].type) ? 1 : 0, - dsts[0].v.reg & 0xfu, args[0].v.reg & 0xfu); - } - return; - } - - case INTRIN_PREFETCH: - /* No-op on x64 (or PREFETCHT0 if we want to emit one). */ - return; - - case INTRIN_MEMCPY: - case INTRIN_MEMSET: { - /* Inline byte-at-a-time copy/set (from legacy x_copy_bytes / x_set_bytes). */ - if (kind == INTRIN_MEMCPY && ndst == 0 && narg == 3) { - /* dst_addr, src_addr, count: use REP MOVSB or byte loop. */ - x_intrinsic_memcpy(t, args[0], args[1], args[2]); - } else if (kind == INTRIN_MEMSET && ndst == 0 && narg == 3) { - /* dst_addr, byte_val, count. */ - x_intrinsic_memset(t, args[0], args[1], args[2]); - } - return; - } - - default: - /* Unimplemented intrinsic. */ - break; - } -} - -/* Helper: inline memcpy via REP MOVSB or byte loop. 
*/ -static void x_intrinsic_memcpy(NativeTarget* t, NativeLoc dst, NativeLoc src, - NativeLoc count) { - MCEmitter* mc = t->mc; - /* dst = rdi, src = rsi, count = rcx; REP MOVSB */ - /* (Simplified: assumes operands are already in the right regs or we materialize them.) */ - u32 rdi = dst.v.reg & 0xfu; - u32 rsi = src.v.reg & 0xfu; - u32 rcx = count.v.reg & 0xfu; - - if (rdi != X64_RDI) emit_mov_rr(mc, 1, X64_RDI, rdi); - if (rsi != X64_RSI) emit_mov_rr(mc, 1, X64_RSI, rsi); - if (rcx != X64_RCX) emit_mov_rr(mc, 1, X64_RCX, rcx); - - /* REP MOVSB: 0xf3 0xa4 */ - u8 rep_movsb[] = {0xf3, 0xa4}; - mc->emit_bytes(mc, rep_movsb, 2); -} -``` - -**Emit helpers** (from git 429defa:src/arch/x64/emit.c): -- `emit_popcnt(w, dst, src)`: F3 0F B8 /r (SSE4.2) -- `emit_bs(w, opcode, dst, src)`: 0x0f (0xbc=BSF or 0xbd=BSR) /r -- `emit_bswap(w, reg)`: 0x0f 0xc8+reg (BSWAP; 32-bit only, 64-bit is 0xc8+reg in REX.W) -- `emit_setcc(cond, reg)`: 0x0f 0x90+cond modrm (SETCC byte) -- `emit_movzx(w, dst, src, sign)`: 0x0f (0xb6/0xb7/0xbe/0xbf) modrm - ---- - -## Part 5: File-Scope Asm & Finalize - -### file_scope_asm Hook - -Line 424 in src/arch/native_target.h: - -```c -void (*file_scope_asm)(NativeTarget*, const char* src, size_t len); -``` - -Used to emit raw assembly at the file scope (e.g., from asm_parse pseudo-statements in the source). - -### Body Sketch: x_file_scope_asm - -```c -static void x_file_scope_asm(NativeTarget* t, const char* src, size_t len) { - /* Write source verbatim to the current section. Most backers emit as a blob - and let the assembler parse; for native emission, run the asm parser here. */ - X64Asm* a = x64_asm_open(t->c); - x64_asm_run_template(a, t->mc, src); - x64_asm_close(a); -} -``` - -Alternatively, if full template processing is unneeded, pass through to the standalone asm.c parser. 
- -### finalize Hook - -Line 429 in src/arch/native_target.h: - -```c -void (*finalize)(NativeTarget*); -``` - -Called at the end of code generation to finalize any pending state (e.g., unwind info, extra frame state patches, register allocation reporting for debuggers). - -### Body Sketch: x_finalize - -```c -static void x_finalize(NativeTarget* t) { - /* Finalize any pending state (unwind info, profile tables, etc). */ - (void)t; /* No-op on x64 v1. */ -} -``` - -### trap Hook - -Line 427 in src/arch/native_target.h: - -```c -void (*trap)(NativeTarget*); -``` - -Emit an unconditional trap/breakpoint (used for unreachable code, panic sites). - -```c -static void x_trap(NativeTarget* t) { - /* UD2: undefined instruction. */ - u8 ud2[] = {0x0f, 0x0b}; - t->mc->emit_bytes(t->mc, ud2, 2); -} -``` - ---- - -## Part 6: File Management Plan - -### DELETE (Replaced by native.c) - -1. **src/arch/x64/ops.c** (104,924 bytes): All semantic-level operations (loads, stores, arithmetic, atomics, calls, intrinsics, variadics). Fully subsumed by native.c hooks. - -2. **src/arch/x64/alloc.c** (19,461 bytes): Register allocation state machine (XImpl frame slots, param binding, spill/reload). Replaced by NativeTarget frame_slot/bind_param/spill/reload hooks. - -3. **src/arch/x64/opt_coord.c** (14,220 bytes): Register tables and ABI coordination (x_int_allocable, x_fp_allocable, x_plan_call, X64ABIRegs dispatch). Replaced by native.c register tables and NativeOps vtable. - -4. **src/arch/x64/internal.h** (12,299 bytes): XImpl state struct and shared helpers. Absorbed into native.c and a new x64.h header (if needed for ISA helpers). - -### STRIP (Extract byte encoders; keep ISA/relocs) - -1. 
**src/arch/x64/emit.c** (41,172 bytes): - - **KEEP**: emit_rex(), emit_mem_operand(), emit_rm_reg(), emit_mov_rr(), emit_mov_load(), emit_mov_store(), emit_lea(), specific instruction emitters (emit_alu_rr, emit_imul_rr, emit_f7_rm, emit_shift_*, emit_cqo_or_cdq, emit_alu_imm*, emit_setcc, emit_movzx, emit_extend_rr, emit_ret, emit_sse_*), plus the shared ABI constant tables (g_int_order, g_fp_order, g_x64_abi_sysv, g_x64_abi_win64, x64_abi_for_os). - - **DELETE**: x_func_begin, x_func_end, x_func_begin_known_frame, x_func_begin_init, x_build_prologue, x_compute_frame_size, x_collect_cs_regs, x_emit_variadic_reg_saves, x_add_entry_frame_slots, x_chkstk_sym, x_planned_prologue_bytes, x_prologue_placeholder. (These move to native.c.) - - **OUTPUT**: Create src/arch/x64/emit.h (header exporting only the byte-level encoders and ABI tables). - -### KEEP-AS-IS (No changes required) - -1. **src/arch/x64/isa.h** (26,086 bytes): X64_* opcode constants, REX/ModRM helpers, register enums. No changes. - -2. **src/arch/x64/isa.c** (40,662 bytes): Disassembler. No changes. - -3. **src/arch/x64/regs.c** (3,251 bytes): DWARF register names. No changes. - -4. **src/arch/x64/link.c** (3,689 bytes): Object file relocation helpers. No changes. - -5. **src/arch/x64/dbg.c** (12,299 bytes): Debug info emission. No changes (or minimal updates for the new frame slot API). - -6. **src/arch/x64/disasm.c** (4,282 bytes): Disassembler helpers. No changes. - -### ADAPT (Update includes; minor rewiring) - -1. **src/arch/x64/asm.c** (53,046 bytes): - - **CHANGE**: Replace `#include "arch/x64/internal.h"` with `#include "arch/x64/emit.h"` (for emit helpers). - - **CHANGE**: Adapt x64_inline_bind() if it references deleted functions from ops.c/alloc.c (e.g., constraint resolution should use NativeOps adapter or standalone resolution). - - **KEEP**: Inline assembly template parsing and operand binding (x64_asm_open, x64_asm_run_template, x64_asm_close, inline_bind helpers). - -2. 
**src/arch/x64/arch.c** (2,936 bytes): - - **CHANGE**: Replace `x64_cgtarget_new()` call with `x64_native_target_new()` and `native_direct_target_new()` wiring. - - **DELETE**: Any old CGTarget construction code. - - **ADD**: NativeTarget → NativeDirectTarget → CgTarget adapter chain. - -3. **src/arch/x64/x64.h** (145 bytes): - - **KEEP**: public API declarations. - - **ADD**: `X64NativeTarget* x64_native_target_new(Compiler*, ObjBuilder*, MCEmitter*);` - -### NEW FILE - -**src/arch/x64/emit.h** (exported byte-level encoder stubs): - -Extract the following from emit.c into a header: - -```c -#ifndef CFREE_ARCH_X64_EMIT_H -#define CFREE_ARCH_X64_EMIT_H - -#include "arch/mc.h" -#include "arch/x64/isa.h" - -/* Byte-level emit helpers. */ -void emit_rex(MCEmitter*, int w, u32 reg, u32 index, u32 rm); -void emit_rex_force(MCEmitter*, int w, u32 reg, u32 index, u32 rm); -void emit_mem_operand(MCEmitter*, u32 reg, u32 base, i32 disp); -void emit_rm_reg(MCEmitter*, u32 reg, u32 rm); -void emit_mov_rr(MCEmitter*, int w, u32 dst, u32 src); -void emit_mov_load(MCEmitter*, u32 size, int signed_ext, u32 dst, u32 base, i32 disp); -void emit_mov_store(MCEmitter*, u32 size, int is_store, u32 reg, u32 base, i32 disp); -void emit_lea(MCEmitter*, u32 dst, u32 base, i32 disp); -/* ... etc for all instruction emitters ... */ - -/* Shared ABI tables. */ -extern const Reg g_int_order[]; -extern const Reg g_fp_order[]; -typedef struct { /* X64ABIRegs ... */ } X64ABIRegs; -const X64ABIRegs* x64_abi_for_os(CfreeOSKind os); - -#endif -``` - ---- - -## Integration Checklist - -1. **native.c creation**: - - Copy the structure from rv64/native.c and aa64/native.c. - - Implement all NativeTarget hooks (func_begin, bind_param, load_imm, move, load, store, binop, ..., atomic_load, atomic_rmw, atomic_cas, va_start_, va_arg_, intrinsic, asm_block, file_scope_asm, finalize). - - Use src/abi/abi_{sysv,win64}_x64.c for ABI queries (abi_cg_func_info, abi_va_list_layout). 
- - Register tables: x_int_allocable, x_fp_allocable (from opt_coord.c); x_int_phys, x_fp_phys; NativeAllocClassInfo x_classes. - - Frame layout: X64NativeSlot, X64NativeTarget state struct. - - Prologue/epilogue: x_func_begin, x_func_end (adapting from emit.c). - -2. **emit.h extraction**: - - Create src/arch/x64/emit.h exporting byte-level encoders and ABI constants. - - Update emit.c to include it (if not self-contained). - -3. **asm.c rewiring**: - - Replace `#include "arch/x64/internal.h"` with `#include "arch/x64/emit.h"`. - - Keep inline template parsing intact. - -4. **arch.c rewiring**: - - Replace x64_cgtarget_new() vtable construction with native + NativeOps adapter. - - Wire native.c hooks into the CgTarget path. - -5. **Delete obsolete files**: - - ops.c, alloc.c, opt_coord.c, internal.h. - -6. **Update build system**: - - Remove ops.o, alloc.o, opt_coord.o from x64 link. - - Add native.o. - ---- - -## Summary of NativeTarget Hooks (x64) - -| Hook | Implemented | Notes | -|------|-------------|-------| -| func_begin | native.c | Prologue placeholder, ABI setup | -| func_begin_known_frame | native.c | Optimizer path: exact prologue | -| note_frame_state | native.c | Prologue patch state | -| reserve_callee_saves | native.c | Mark save slots for allocator-assigned regs | -| emit_prologue | native.c | Emit minimal prologue (opt path) | -| frame_slot | native.c | Allocate frame slot | -| bind_param | native.c | Move incoming param to dst (reg or frame) | -| label_new, label_place, jump, cmp_branch, indirect_branch, load_label_addr | native.c | Control flow | -| move, load_imm, load_const, load_addr, load, store, tls_addr_of | native.c | Data movement | -| copy_bytes, set_bytes, bitfield_load, bitfield_store | native.c | Aggregate ops | -| binop, unop, cmp, convert, alloca_ | native.c | Arithmetic/conversion | -| spill, reload | native.c | Register save/restore | -| plan_call, emit_call, plan_ret, ret | native.c | Call ABI | -| **atomic_load, atomic_store, 
atomic_rmw, atomic_cas, fence** | **native.c GROUP 5** | **x86 TSO semantics** | -| **va_start_, va_arg_, va_end_, va_copy_** | **native.c GROUP 5** | **SysV & Win64 layouts** | -| **intrinsic** | **native.c GROUP 5** | **popcount, bswap, overflow, trap, etc.** | -| **asm_block** | **native.c GROUP 5** | **Inline asm with constraints** | -| **file_scope_asm** | **native.c GROUP 5** | **File-scope assembly** | -| trap, set_loc, finalize, destroy | native.c | Utilities | - ---- - -## Legend: Code Locations (git 429defa) - -- **ops.c atomics** (lines ~1600–1800): x_atomic_load/store/rmw/cas, emit_lock_*, emit_mfence -- **ops.c variadics** (lines ~1200–1400): x_va_start_, x_va_arg_, x_va_end_, x_va_copy_ -- **ops.c intrinsics** (lines ~1900–2100): x_intrinsic, emit_popcnt, emit_bs, emit_bswap, emit_*_overflow -- **ops.c asm** (lines ~2100–2250): x_asm_block, x_set_loc, x_finalize -- **emit.c prologues** (lines ~1000–1400): x_func_begin, x_func_end, x_build_prologue, x_emit_variadic_reg_saves -- **emit.c encoders** (lines ~200–900): emit_rex, emit_mem_operand, emit_mov_*, emit_alu_*, emit_sse_*, etc. -- **isa.h opcodes** (lines 1–300): X64_* constants, modrm/rex helpers -- **internal.h structs** (lines 80–150): XImpl, XSlot, X64ABIRegs -- **opt_coord.c tables** (lines 1–200): register tables, x_plan_call, X64ABIRegs dispatch -- **alloc.c frame** (lines 1–300): x_frame_slot, x_param, frame layout computation - diff --git a/doc/O1_INLINE.md b/doc/O1_INLINE.md @@ -1,188 +0,0 @@ -# Plan: Inline tiny functions at O1 - -## Context - -The cfree optimizer currently does **no function inlining in any live compile**. Both -`-O1` and `-O2` are normalized to the single O1 native path (`src/opt/opt.h:11`, -"O2 cutover window"); the SSA-based O2 pipeline (`opt_cleanup`) and the whole-program -inliner (`opt_inline`) exist in the tree but have **no callers** outside the stale, -no-longer-compiling `test/opt/opt_test.c`. 
Verified empirically: at `-O1`/`-O2` a call to a -trivial helper like `add1(x) = x+1` still emits a `bl` — nothing is inlined. - -We want O1 to inline *tiny* functions — simple `static inline`-style helpers — so callers -don't pay call/frame overhead for one- or two-instruction bodies. This is the single -biggest easy win for the kind of small leaf helpers that dominate idiomatic C. - -The good news: a complete, correct inlining toolkit already exists in -`src/opt/pass_inline.c` and operates on the **pre-machinize PReg form**, which is exactly -the IR shape present in the O1 pipeline before `opt_machinize_native`. This change is -therefore mostly *wiring* that machinery into the live O1 path, gated to tiny callees — -not writing an inliner from scratch. - -### Decisions -- **Policy:** inline `DEFAULT`/`HINT` callees only when cost ≤ `INLINE_TINY_COST_LIMIT` - (8 straightline ops); always refuse `NEVER`; always inline `ALWAYS` - (`always_inline`) regardless of the tiny cap. -- **Tests:** behavioral disasm fixture (reuse toy-135 at `-O1`) **plus** a fresh small - unit test in the `cg_ir_lower_test.c` style. Do **not** revive `opt_test.c` (it no - longer compiles against the current API — references the removed `CGTarget` type). - -## Key facts (verified) - -- `opt_on_func` (`src/opt/opt.c:171`) is the per-function sink callback: lowers each - `CgIrFunc` to a pre-machinize `Func` via `opt_func_from_cg_ir`, then runs - `opt_run_o1_native`. `OptImpl` struct at `opt.c:20`. -- `opt_run_o1_native` (`opt.c:43`): cfg build → jump-cleanup → cfg → `opt_simplify_local` - (line 66) → `opt_verify` (69) → `opt_machinize_native` (73). The window **after line 67 - and before line 68** is pre-machinize PReg form, CFG valid — the inliner's required shape. -- The recorder (`src/cg/ir_recorder.c`) calls `cg_ir_module_add_func` at `func_begin` and - fires `func_recorded` at `func_end`. 
So when a **caller** is processed, every function - defined **earlier** in the TU is already recorded and `complete`. Static inline helpers - are defined before use ⇒ callee body is available. Forward-defined callees are not - (acceptable limitation). -- Lifetimes: `Func` is allocated on `c->tu` (`ir_func_new`, `src/opt/ir.c:269`) and - `CgIrFunc`s live on the module — nothing is freed per-function. `opt_func_from_cg_ir` is - cheap and re-runnable, producing a fresh pre-machinize `Func` from a persistent `CgIrFunc`. -- Reusable machinery in `pass_inline.c` (all operate on pre-machinize PReg form): - `inline_call_site` (line 500), gates `callee_inline_shape` (164, already enforces a - policy-dependent cost cap and a straightline-op whitelist that **excludes `IR_CALL`**), - `inline_rewrite_supported` (473), `recursive_or_scc` (135), `direct_callee` (85), - `effective_inline_policy` (100). Cost limits at lines 34-37. - -## Implementation - -### 1. `src/opt/pass_inline.c` — add the tiny-inline driver (reuse existing gates) - -Keep all existing static gates as-is; add in the same TU so nothing else needs exposing. - -- Add constant near lines 34-37: - ```c - #define INLINE_TINY_COST_LIMIT 8u - ``` -- Add `try_tiny_inline_call(FuncSet* fs, Func* caller, u32 b, u32 i)` — mirrors - `try_inline_call` (line 607) but: no `base_cost`/`caller_growth_ok` (the streaming path - has no whole-program cost array, and tiny callees can't blow up the caller); add the tiny - gate after `callee_inline_shape`: - ```c - if (policy != CFREE_CG_INLINE_ALWAYS && cost > INLINE_TINY_COST_LIMIT) return 0; - ``` - Emit `opt.tiny_inline.candidates` / `opt.tiny_inline.inlined` metrics. 
-- Add public driver (callback-based so `opt.c` owns the registry/cache): - ```c - typedef Func* (*OptInlineCalleeLookup)(void* ctx, ObjSymId callee_sym); - int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx); - ``` - It walks caller blocks/insts; for each `IR_CALL` resolves the callee `ObjSymId` from - `aux->desc.callee.v.global.sym`, calls `lookup(ctx, sym)` to get a fresh/cached - pre-machinize callee `Func`, builds a 2-element ad-hoc `FuncSet {caller, callee}` on - `caller->arena` (so `direct_callee`/`recursive_or_scc` work unchanged), and calls - `try_tiny_inline_call`. Returns the number of inlines performed. - - **Iteration:** fixed-point loop with cap `#define TINY_INLINE_MAX_PASSES 4`. After any - successful `inline_call_site` (which splits the block, invalidating indices), set - `changed` and restart the block scan. Note inlined bodies are straightline-only - (`op_supported_in_straightline_inline` excludes `IR_CALL`) ⇒ inlining never introduces - new calls, so this terminates quickly; the cap is a guard. - - **Recursion safety:** the only real check needed is `caller == callee` (kept via - `recursive_or_scc`). Any callee containing a call is already rejected by - `callee_inline_shape`'s whitelist, so transitive cycles cannot arise. - -### 2. `src/opt/opt_internal.h` — declare the new surface - -Add near the other pass prototypes: -```c -typedef Func* (*OptInlineCalleeLookup)(void* ctx, ObjSymId callee_sym); -int opt_try_tiny_inline(Func* caller, OptInlineCalleeLookup lookup, void* ctx); -void opt_inline(FuncSet*, int max_iters); /* formalize existing symbol */ -``` - -### 3. 
`src/opt/opt.c` — registry + wiring - -- Extend `OptImpl` (line 20) with a callee registry/cache, all on `c->tu`: - ```c - CgIrFunc** cg_by_sym; /* recorded functions, for callee lookup */ - Func** lowered_cache; /* parallel: lazily re-lowered callee Func */ - u32 ncg; - u32 cg_cap; - ``` - (zero-initialized by the existing `arena_znew` in `opt_cgtarget_new`). -- At the **top of `opt_on_func`** (before lowering the caller), append `cg_func` to - `cg_by_sym` (it is `complete`; earlier-defined funcs already present). Growing array on - `c->tu`. -- Add a lookup function `static Func* opt_tiny_callee_lookup(void* ctx, ObjSymId sym)`: - linear-scan `cg_by_sym` for `desc.sym == sym`; if found and `lowered_cache[idx] == NULL`, - `opt_func_from_cg_ir(o->c, cg_by_sym[idx])` and cache it (one re-lower per helper, reused - across all call sites — `inline_call_site` does not mutate the callee). Return NULL if - absent (forward-defined) so the call is left as-is. -- In `opt_run_o1_native`, between line 67 (`metrics_scope_end "opt.cfg.simplify_local"`) - and line 68: - ```c - metrics_scope_begin(o->c, "opt.o1.tiny_inline"); - int inlined = opt_try_tiny_inline(f, opt_tiny_callee_lookup, o); - metrics_scope_end(o->c, "opt.o1.tiny_inline"); - if (inlined) { - /* REQUIRED for correctness: inline_call_site invalidated CFG and left - * preds stale (it maintains succ + emit_order only). Verify + machinize - * + regalloc all consume preds/dominance. */ - opt_build_cfg(f); - /* QUALITY (optional): merge the single-pred/single-succ BR-glue chain the - * inliner introduced (pre -> callee body -> cont) back into straight-line - * blocks so regalloc doesn't see artificial block boundaries. Mirrors the - * prologue's build_cfg -> jump_cleanup -> build_cfg idiom (opt.c:57-63). - * NOTE: use opt_jump_cleanup, NOT opt_simplify_local — the latter is a - * per-inst peephole and does not merge blocks. 
Without this, output is - * still correct (post-RA opt_mir_jump_cleanup removes the dead jumps) but - * allocation is worse. */ - opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG); - opt_build_cfg(f); - opt_verify(f, "o1-tiny-inline"); - } - ``` - The pass needs `OptImpl* o` for the lookup — `opt_run_o1_native` already has `o`. - -### 4. Tests - -**(a) New unit test** — `test/opt/tiny_inline_test.c`, modeled on -`test/opt/cg_ir_lower_test.c` (same `EXPECT` harness, builds `Func`s via `opt_func_from_cg_ir` -from hand-built `CgIrFunc` tapes, links `$(LIB_OBJS)` with `-Isrc`). Cases: - - Tiny callee (`add1(x)=x+1`, cost 1): drive `opt_try_tiny_inline` with a trivial lookup - closure returning the callee `Func`; assert caller has no `IR_CALL` and contains the - cloned `BINOP`. - - Threshold: a `DEFAULT` callee with cost > 8 is refused (call survives); the same body - marked `CFREE_CG_INLINE_ALWAYS` is inlined. - Add a `test-opt-tiny-inline` target + binary in `test/test.mk` next to `test-opt` - (~line 509), and add it to the aggregate list near line 66. - -**(b) Behavioral disasm** — reuse `test/toy/cases/135_inline_cleanup_quality.toy` -(`add1(x)=x+1`; `__user_main` calls `add1(41)`; `main` calls `__user_main`). Compile at -`-O1`, `cfree objdump -d`, and assert the inlined helper's `bl` is **gone** from its caller. -Realistic O1 assertions (O1 has no constant folding, so do **not** assert "main == constant -42"): `__user_main` contains no `bl ... # add1`, and `main` contains no `bl ... # __user_main`. -`test/opt/run.sh` already contains a (too-strong, orphaned) version of this check — repurpose -it to the no-`bl` assertions and wire it into a `test-opt-inline` make target (it is -currently not invoked by any target). - -## Verification - -1. `make bin && make lib` -2. 
Behavioral, aarch64 cross (host-arch-independent): - ``` - build/cfree cc -target aarch64-linux-gnu -O1 -c \ - test/toy/cases/135_inline_cleanup_quality.toy -o /tmp/ti.o - build/cfree objdump -d /tmp/ti.o - ``` - Confirm `<__user_main>` has no `bl ... add1` and `<main>` has no `bl ... __user_main` - (compare against the pre-change disasm, which shows both `bl`s). -3. `make test-opt-tiny-inline` (new unit test) and `make test-opt-inline` (disasm) pass. -4. Regression sweep: `make test-opt test-cg-api test-isa test-aa64-inline test-smoke-x64 - test-smoke-rv64` — confirm no behavioral changes elsewhere and that `opt_verify` - ("o1-tiny-inline") never trips. -5. Sanity that NEVER is honored and forward-defined callees are left alone (a quick - hand-written C file with `__attribute__((noinline))` and a forward-declared helper). - -## Out of scope -- O2 / SSA pipeline (entirely disabled). -- Cross-TU / forward-defined-callee inlining. -- Inlining callees with control flow richer than the existing straightline whitelist, or - with multi-part/aggregate ABI args/returns (rejected by `inline_rewrite_supported`). diff --git a/doc/OBJ.md b/doc/OBJ.md @@ -0,0 +1,363 @@ +# Object Model + +cfree's `src/obj/` is the format-neutral object layer: one in-memory +representation of "a relocatable object or linked image" that every other +subsystem reads and writes, plus a registry that hides ELF/Mach-O/COFF/Wasm +specifics behind a single dispatch seam. Codegen (`cg`), the static linker, the +JIT linker, the disassembler, the DWARF producer, the emulator loader, and the +inspection tools (`objdump`/`nm`/`size`/`strip`/`objcopy`/`addr2line`) all meet +here. The design goal is that the rest of the compiler reasons about sections, +symbols, relocations, groups, and atoms — never about ELF section header tables +or Mach-O load commands — and that adding a format is a matter of filling one +table, not threading new branches through every caller. 
+ +See [LINK.md](LINK.md) for how the linker consumes this model, [ARCH.md](ARCH.md) +for the backends that produce it, and [DWARF.md](DWARF.md) for debug sections. + + +## The ObjBuilder + +The single concrete in-memory object is the `ObjBuilder` (`src/obj/obj.c`, +declared in `src/obj/obj.h`; the public handle is `CfreeObjBuilder`). It is the +*only* object representation in the system — there is no separate "parsed +object" type. A builder is produced by exactly two kinds of writer: + + - a backend during compilation (`cg` via the MCEmitter / CGTarget path), or + - an `.o` reader during linking/inspection (`read_elf`, `read_macho`, + `read_coff`, `read_wasm`). + +The central invariant: **post-finalize, a backend-produced builder is identical +in shape to what a reader would produce from the same object written to disk.** +Consumers therefore never care which path created the builder — the linker reads +a freshly compiled TU and a `.o` off disk through one API. + +### Storage: segmented arrays + chunked byte buffers + +Sections, symbols, relocations, groups, and atoms each live in their own +segmented array (`core/segvec.h`). Segmentation is load-bearing: callers hold +`const Section*` / `const ObjSym*` pointers returned by `obj_*_get` across +further appends, so storage must never relocate existing elements the way a +flat realloc would. Section payloads use the chunked `Buf` type so large +`.text`/`.data` bodies grow without copying. + +Handles are small integer ids (`ObjSecId`, `ObjSymId`, `ObjGroupId`, +`ObjAtomId`), each scoped to one builder, with **index 0 reserved as the "none" +sentinel** in every id space. Ids are stable for the builder's lifetime; +relocations carry a section + symbol id, supporting forward references (mint an +undefined `ObjSymId` for a reloc, define it later with `obj_symbol_define`). 
+ +The five tables and what they model: + + - **Sections** — name + `SecKind` (TEXT/RODATA/DATA/BSS/DEBUG/OTHER) + + `SecSem` (PROGBITS/NOBITS/SYMTAB/RELA/GROUP/...) + neutral `SecFlag` bits + (EXEC/WRITE/ALLOC/TLS/MERGE/GROUP/RETAIN/...). `obj_section` find-or-creates + by `(name, kind, PROGBITS)` so repeated literal/initializer emissions + coalesce into one section with merged align/flags rather than fanning out. + - **Symbols** — `SymBind` x `SymVis` x `SymKind`, a defining section id (or + NONE for undefined externs/commons), value, size. The object owns its whole + symbol namespace: locals, section symbols, file symbols, commons, and + external references are all `ObjSym`s. + - **Relocs** — flat across all sections (filtered by `section_id` on read), + each a `(section, offset, RelocKind, sym, addend)` tuple. `RelocKind` is a + *canonical* enum spanning every arch (see "Relocation model" below). + - **Groups** — COMDAT / section groups: a signature symbol plus a member + section list (dedup keyed on the signature at link time). + - **Atoms** — sub-section ranges (`section`, `offset`, `size`, signature) + that let a format split one section into independently-linkable pieces. + Mach-O sets `split_sections_as_atoms`; the linker uses atoms for dead-strip + and `-r` granularity where the format has no section-per-symbol convention. + +### Format pass-through without leaking format knowledge + +Generic tables stay neutral, but `.o` round-tripping needs to preserve bits the +canonical model doesn't name. Four escape hatches handle this without +polluting the core: + + - Per-section `ext_type` / `ext_flags` (raw `sh_type`/`sh_flags`) re-emit + format-specific section types that collapse to a generic `SecSem` + (`SHT_LLVM_ADDRSIG`, `SHT_ARM_ATTRIBUTES`, `SHF_EXCLUDE`, ...). + - Per-symbol `flags` carry format attribute bits (today Mach-O `n_desc`). + - Builder-level fields: ELF `e_flags`, the COFF short-import DLL annotation. 
+ - `obj_ext_set`/`obj_ext_get` attach one opaque payload per `ObjExtKind` + (today the Wasm module model and Wasm import descriptors); the builder owns + the payload's lifetime via a registered free function. + +### Lifecycle and the finalize discipline + +``` + obj_new + │ + ├─ write side: obj_section / obj_symbol / obj_reloc / obj_atom / obj_group + │ (MCEmitter / CGTarget, or an .o reader) + │ + ├─ cgtarget_finalize (flush lowered code into sections; -O2 path) + ├─ debug_emit (if -g: writes .debug_* sections) + │ + ├─ obj_finalize ─────── freezes the read-side view + │ + └─ read side: obj_section_get / obj_symiter / obj_reloc_at ... + (file emitters, linker, objdump) +``` + +`obj_finalize` is the read-side gate. The contract is "build mutably, then +finalize before any read-side query." Today it is a deliberate near-no-op — the +build path already keeps the index spaces consistent and section bytes are +flattened on demand by emitters — but it is the designated home for any future +intra-section fixup pass (label-to-offset resolution after a full section is +written), and keeping every consumer routed through it preserves that option. + +### Mutators and the tombstone sweep + +`strip`/`objcopy` mutate a finalized builder. Rather than compact storage +(which would invalidate the stable-id contract), mutators flip per-entry +`removed` *tombstones* and individual fields. `obj_sweep_dead` then runs the +cascading cleanup — drop symbols defined in removed sections, prune +non-referenced undefined externs (the historical "spurious extern from a +header" filter, now folded in), kill relocs that became dangling, compact group +member lists, clear stale `Section.link`. Every file emitter calls +`obj_sweep_dead` at the top of emit, and raw id-based iteration must consult +`removed` itself — tombstones are a per-entry field, not hidden behind the +iterators, so the model stays cheap and idempotent. 
+ + +## Relocation model and the shared byte-patcher + +`RelocKind` (`src/obj/obj.h`) is a single canonical enum covering every target: +arch-neutral forms (`R_ABS32/64`, `R_REL32/64`, `R_PC32/64`), then per-arch +families (AArch64 ADRP/ADD/LDST/branch/TLS, x86-64 GOT/PLT/TLS, RISC-V +HI20/LO12/branch/`ADD`/`SUB`/`SET`/ULEB128, COFF SECREL/SECTION, Wasm idx +relocs). Backends emit canonical kinds; the per-format reloc translators +(`reloc_*` in each format dir) map between canonical kinds and on-disk wire +types in both directions. + +### reloc_apply.c — one byte-patcher, three loaders + +`src/obj/reloc_apply.c` exposes `link_reloc_apply(c, kind, P_bytes, S, A, P)`: a +**pure S/P/A byte patcher**. It computes nothing about loader or linker policy — +it receives the already-resolved symbol address `S`, the in-memory patch site +`P_bytes`, the addend `A`, and the site's runtime/virtual address `P`, then +encodes the bits for that `RelocKind` (with range checks). It owns the fiddly +encoding details: AArch64 imm19/imm26/imm12 field placement and ADRP page math, +RISC-V U/I/S/B/J immediate scatter, the `0x800` HI20 bias, and the +fixed-width-ULEB128 re-encode that lets `SET/SUB_ULEB128` relocs rewrite a +DWARF symbol-difference field without shifting section layout. + +This routine is a **key shared boundary** — it is reused verbatim by every +consumer that has to put resolved bytes down: + +``` + link_reloc_apply(c, kind, P_bytes, S, A, P) + ▲ ▲ ▲ + static linker ─────┘ │ └───── emu guest loader + (src/link, src/obj/*/link.c) JIT linker (src/emu/dl.c — dynamic + assembler (src/asm) (src/link/link_jit.c) reloc at guest load) +``` + +Each caller computes the *policy* (where `S` lives — link-time vaddr, +JIT-mapped runtime address, or guest virtual address; whether a reference is +redirected through a GOT/PLT/IAT slot) and then defers the *encoding* to this +one function. 
That separation is why the JIT, the static linker, and the +emulator can never disagree on how an `R_AARCH64_CALL26` is encoded: there is +exactly one encoder. The few relocs that are intrinsically loader-only +(`R_X64_COPY`) panic here, since they have no static-byte meaning. + + +## The format registry + +`src/obj/registry.c` is the dispatch seam. Each format is one `ObjFormatImpl` +(`src/obj/format.h`) — a vtable of function pointers and small per-format +constants: + + - `read` / `read_dso` — parse bytes into an `ObjBuilder` (relocatable view, + plus DSO export-only view for the linker's `-l` inputs); + - `emit` — write a relocatable `.o` from a builder; + - `link_emit` / `layout_dyn` / `free_dyn` — final-image writer + dynamic-table + synthesis hooks, owned by `src/link` but registered here; + - `emu` — `ObjFormatEmuOps` for the user-mode emulator's guest loader + (detect/load executable, map object, dynamic-needed/symbol/reloc iterators); + - per-arch sub-tables (`elf_arch`/`elf_machine`, `macho_arch`/`macho_cputype`, + `coff_arch`/`coff_machine`) that pair a `CfreeArchKind` with its machine + code, dynamic reloc type numbers, stub emitters, and reloc translators; + - optional archive-ingestion policy (`classify_obj_input`, `archive_hint`, + `archive_member`) — only COFF needs it, to reclassify short-import members + as DSOs. + +`obj_format_lookup` resolves by `ObjFmt`; `obj_format_lookup_bin` by detected +`CfreeBinFmt`. Each format is independently gated by a `CFREE_OBJ_*_ENABLED` +build flag, and the whole link/archive/emu machinery is gated too: when those +subsystems are compiled out, the registry binds the hooks to disabled stubs +rather than carrying `#ifdef`s at the call sites. A backend or tool that wants +format behavior calls through this table; it never names a format directly. 
+ +### Detection + +`src/api/object_detect.c` sniffs the leading bytes: ar magic, `\x7fELF`, +`\0asm`, the five Mach-O magics, `MZ`/COFF machine words, and the Microsoft +short-import `00 00 FF FF` prefix. `cfree_detect_fmt` returns the binary family; +`cfree_detect_target` decodes the arch/OS/pointer-size into a `CfreeTarget`. +`cfree_obj_open` (`src/api/object_file.c`) chains detect → registry lookup → +`impl->read`, so every inspection tool opens any supported format through one +call. + +### Format-aware policy helpers + +The OS/format knowledge that backends would otherwise hardcode is concentrated +in two policy TUs so it lands as one case when a format is added, not as +fan-out across every CGTarget: + + - `src/obj/obj_secnames.c` — canonical *synthetic* section names. Most + sections keep ELF-style dotted names end-to-end (the writer translates at + emit), but linker-synthesized sections diverge: `obj_secname_init_array` + returns `.init_array` (ELF) / `__DATA,__mod_init_func` (Mach-O) / + `.CRT$XCU` (COFF); likewise fini/preinit arrays and TLS template sections. + This TU also owns the format codegen predicates: `obj_format_extern_via_got` + (Mach-O always, ELF only under PIC/PIE), `obj_format_c_mangle` (Mach-O + leading `_`), `obj_format_default_entry_name` (`_main` vs `_start` vs + `mainCRTStartup`), and the Mach-O DWARF/section-name spellings shared by the + writer and the DWARF reader. + - `src/obj/obj_tls.c` — format-aware TLS emission (below). + + +## Format-aware TLS emission + +`_Thread_local` storage has one source-level shape but two radically different +on-disk forms, and `src/obj/obj_tls.c` owns the split so the frontend and +backends stay format-agnostic. The frontend collects a TLS definition's bytes +(or BSS marker), alignment, and any pointer-init relocs, then calls +`obj_define_tls`; backends consult `obj_format_tls_via_descriptor` when choosing +an access sequence. 
+ + - **ELF / COFF**: the symbol is defined directly in `.tdata`/`.tbss` (ELF) or + `.tls$` (COFF); access is a direct TP-relative offset (TLSLE relocs on ELF; + the COFF SECREL relocs on Windows). + - **Mach-O**: storage and access split. Bytes live under a private + `<name>$tlv$init` symbol in `__DATA,__thread_data`/`__thread_bss`; the + user-visible symbol is defined onto a 24-byte TLV *descriptor* in + `__DATA,__thread_vars` — `[_tlv_bootstrap, 0, &init]` — that dyld rewrites + at load time. Compiled code reaches the variable through an indirect call + via the descriptor's slot 0 (`TLVP_LOAD_PAGE21`/`PAGEOFF12` reloc pair). The + `_tlv_bootstrap` undef extern is cached on the builder so multiple TLV vars + in one TU share one symbol entry. + + +## The read-only inspection surface + +`include/cfree/object.h` + `src/api/object_file.c` expose the read side as +`CfreeObjFile`: open a blob, then iterate sections, symbols, relocations, +groups, and section data — the format-neutral view the `objdump`/`nm`/`size`/ +`strip`/`objcopy`/`addr2line`/`strings` tools share. `src/api/object_builder.c` +is the peer write-side adapter (`CfreeObjBuilder`), with static asserts pinning +the public `CFREE_RELOC_*` enum to the internal `R_*` values so the two never +drift. + +### The linked-image dimension + +Relocatable objects (`ET_REL`/`MH_OBJECT`/COFF `.obj`) carry no image: +`obj_image(ob)` is NULL. Executables and shared objects carry an extra +dimension the section/symbol tables can't model — load segments, an entry +point, image base, interpreter, soname, dependencies, rpaths, dynamic symbols, +and dynamic relocations. The `ObjImage` (defined in `obj.c`, hung off the +builder, released by `obj_free`) holds this *common denominator* across formats. 
+Readers call `obj_image_ensure(ob, OBJ_KIND_EXEC|DYN)` and the appenders; the +section/symbol view stays populated where the format still carries it, so a +non-stripped ELF exec presents both views and a table-stripped image presents +only segments. The public API mirrors the relocatable iterators: +`cfree_obj_kind`, image-info scalars, and segment/dep/rpath/dynsym/dynreloc +iterators. `OBJ_KIND_CORE` is reserved — detected and rejected cleanly, not +parsed. + + +## Per-format notes + +All four formats implement the same `read`/`emit` contract over the neutral +model; the differences are in what the wire format carries. + +### ELF64 (`src/obj/elf/`) + +The most complete path. `read.c` parses `ET_REL` into the section/symbol/reloc +view and `ET_EXEC`/`ET_DYN` additionally into the `ObjImage` (program headers → +segments + `PT_INTERP`; `.dynamic` → needed/soname/rpath; `.dynsym`/`.rela.*` → +dynamic symbols and relocs). `emit.c` writes relocatable objects; +`reloc_aarch64.c`/`reloc_riscv64.c`/`reloc_x86_64.c` translate canonical kinds +to/from per-arch wire types, paired in the registry with each arch's dynamic +reloc type numbers (`RELATIVE`/`GLOB_DAT`/`JUMP_SLOT`) and default musl interp +string. `read_elf_dso` produces an export-only builder for `-l` inputs. +`link.c` and `link_dyn.c` (registered as ELF's `link_emit`/`layout_dyn`) write +the final image and synthesize the dynamic-link tables — enumerate imports and +`DT_NEEDED`, reserve and fill `.rela.dyn`/`.rela.plt`, lay out `.dynamic`, +`.got`/`.got.plt`, and the dynamic symbol/string tables for PIE/shared output. +`emu_load.c` provides the guest-ELF loader for the emulator (`elf_emu_ops`): +detect/load an executable, map dependent objects, and walk dynamic relocs, +applying them through the shared `link_reloc_apply`. 
+ +### Mach-O (`src/obj/macho/`) + +`read.c`/`emit.c` handle `MH_OBJECT` plus the `MH_EXECUTE`/`MH_DYLIB` image view +(re-walking load commands for segments, dylinker, install-name, dylib deps, +rpaths, entry, `LC_SYMTAB` dynamic symbols, and `LC_DYLD_CHAINED_FIXUPS` +binds/rebases). `link.c` is the final-image writer and link glue; +`reloc_aarch64.c`/`reloc_x86_64.c` translate relocs and supply pcrel/length +metadata. Mach-O sets `split_sections_as_atoms`. Two extra readers feed the +linker's `-l` path: `read_macho_dso` (MH_DYLIB exports) and `tbd_read.c` (Apple +`.tbd` text stubs from the SDK). Format quirks — the leading-`_` C mangle, the +`__DATA,__got` non-lazy-pointer indirection for externs, the TLV descriptor +model, the `__DWARF` segment section-name spellings — are concentrated in +`obj_secnames.c`/`obj_tls.c` and the writer, not the backends. + +### COFF / PE (Windows, `src/obj/coff/`) + +64-bit only (`x86_64-windows`, `aarch64-windows`); the hosted profile is +mingw/llvm-mingw UCRT, not MSVC. `read.c`/`emit.c` round-trip relocatable +PE/COFF: sections with `Characteristics`, symbols with auxiliary records, +COMDAT groups and SELECTANY dedup, weak externals + mingw alias fallback, +commons, long section names via the string table, and per-arch relocations. +`read_dso.c` walks raw PE DLL export directories (and forwarder ENT entries, +surfaced as defined symbols so the OS loader chases the chain at runtime). +`archive.c` implements the registry's archive-ingestion hooks: it classifies +import-library members, routing Microsoft short-import records +(`Sig1=0, Sig2=0xFFFF`) through `read_coff` to synthesize the imported symbols +and tagging the builder with the providing DLL name (so the link layer +reclassifies the input as a DSO), while long-form members fall through as +regular objects — handling mixed-member archives in one pass. 
+ +`link.c` is the PE32+ writer (registered as COFF's `link_emit`): DOS stub + PE +headers, PE32+ optional header, Windows-aligned sections, `.idata` import +descriptors with per-DLL ILT/IAT/hint-name tables, per-arch IAT call stubs (the +registry's `emit_iat_stub`), `.reloc` base-relocation blocks, the TLS directory ++ `_tls_used`, and subsystem/entry-point selection (`mainCRTStartup` console / +`WinMainCRTStartup` GUI). COFF has no ELF GOT/PLT model: the object emits direct +references and the linker binds imports through IAT slots and stubs. Windows TLS +access is materialized into `.tls$` sections with the platform's TEB-relative +sequence (x64 `gs:[0x58]`; aarch64 `x18` TEB slot), using the +`R_COFF_*SECREL*` relocs. ABI selection (Win64 calling convention, `__chkstk` +probes, `long double == double`, the mingw `__int128` split) is keyed on +`(arch, os)` in the ABI dispatch — see [ARCH.md](ARCH.md). + +### Wasm object (`src/obj/wasm/`) + +Minimal and inspection-oriented. `read.c` parses a core module's container +sections into neutral `ObjBuilder` sections carrying their raw payload (so +`objdump -h/-s` show the real container), marks the code section `SF_EXEC` for +`-d`, and adds one function symbol per defined function. It is *not* a +linkable-object reader: the WebAssembly tool-conventions `linking`/`reloc.*` +sections aren't recovered, so relocations don't round-trip. `emit.c` is the +peer writer and the module model hangs off the builder via `OBJ_EXT_WASM` / +`OBJ_EXT_WASM_IMPORTS`. See [WASM.md](WASM.md). + + +## Why this shape + + - **One object type, two writers, many readers.** Backends and `.o` readers + converge on the same post-finalize shape, so the linker, emitters, and + objdump have a single contract regardless of provenance. + - **One byte-patcher.** `link_reloc_apply` is the only place relocation + encoding lives, so the static linker, JIT, and emulator loader cannot + disagree on a fixup — they differ only in how they compute `S`/`P`. 
+ - **One registry seam.** Format knowledge sits behind `ObjFormatImpl` and the + `obj_secnames`/`obj_tls` policy helpers, so a new format is a table entry, + not a sweep through callers. + - **Stable ids + tombstones.** Segmented storage and `removed` flags let + `strip`/`objcopy` mutate freely without invalidating outstanding handles. + +Planned work (image-inspection extensions, fuller Wasm object support): see +doc/plan/. diff --git a/doc/OPT.md b/doc/OPT.md @@ -1,336 +1,506 @@ -# OPT - Optimizer Design - -This document describes cfree's optimized backend pipeline. It is intended to -be the stable design reference for `-O1` and `-O2`: the IR shape, pass order, -major analyses, and ownership boundaries. Performance measurements and current -benchmark gaps live in `doc/OPT_PERF.md`. - -The optimizer is implemented as a `CGTarget` wrapper. The C frontend and other -frontends still emit through the public CG interface; the wrapper records that -stream as cfree IR, runs the requested optimization schedule, then replays the -lowered IR into the wrapped target. - -## Goals - -- Keep `-O0` as the direct backend path. -- Keep `-O1` as a fast optimized tier: shared lowering, liveness, allocation, - post-RA cleanup, and emission, but no SSA value optimization or inlining. -- Make `-O2` the full quality tier: interprocedural retention for inlining, - SSA value/memory passes, loop cleanup, pressure relief, coalescing, live-range - splitting, and the same final lowering path as `-O1`. -- Preserve module boundaries. Frontends own language semantics, ABI lowering - owns call/return shape, targets own encoding legality, and optimizer passes - operate on recorded CG-level IR. - -## Level Shape - -`-O0`: +# Optimizer (OPT) + +This document is the design reference for cfree's optimizer: the module that +sits between the recording code-generation API and the per-architecture native +backends. 
The optimizer owns a private, mutable IR; it lowers each recorded +function into that IR, runs analyses and transforms over it, performs register +allocation, builds a physical post-allocation IR (MIR), and finally replays the +MIR into a `NativeTarget` backend. It is also the source of the function shape +the bytecode interpreter consumes. The focus here is layering, ownership, +representation invariants, and the reasoning behind the boundaries — not API +signatures, which live in the headers. Cross-references: [DESIGN.md](DESIGN.md), +[CODEGEN.md](CODEGEN.md), [IR.md](IR.md), [ARCH.md](ARCH.md), and +[INTERPRETER.md](INTERPRETER.md). + +## 1. Where the optimizer sits + +cfree's codegen has two surfaces. The semantic surface is the recording +code-generation API (`cg/ir.h`, the `CgTarget` interface): frontends call it to +describe a function. The physical surface is the per-architecture +`NativeTarget` (`arch/native_target.h`): it encodes machine code. The optimizer +is the bridge. ```text -frontend -> CG -> target +frontend --CgTarget calls--> CgIrRecorder --records--> CgIrFunc tape + | + opt_func_from_cg_ir (cg_ir_lower.c) + v + optimizer IR: Func / Block / Inst / Val / PReg + | + passes (analysis + transform) + | + regalloc -> MFunc (physical MIR) + v + opt_emit_native --NativeTarget calls--> machine code ``` -`-O1`: +At `-O0` the optimizer is not installed at all: the driver wires the frontend's +`CgTarget` straight to the backend's `NativeDirectTarget`, which emits in a +single pass with a small register cache (see [CODEGEN.md](CODEGEN.md)). At +`opt_level >= 1`, `opt_cgtarget_new` (`src/opt/opt.c`) installs a `CgIrRecorder` +(`cg/ir_recorder.c`) as the sink. The recorder captures each function as a +`CgIrFunc` tape, and on completion fires the optimizer's per-function callback. 
+ +`OptImpl` (in `src/opt/opt.c`) is the wrapper state: the wrapped real target, +the resolved `NativeTarget*`, an optional dump writer, and a per-translation- +unit registry of recorded `CgIrFunc`s (with a parallel lazily-lowered-`Func` +cache) used for streaming tiny-callee inline lookup. + +### When each function is processed: streaming vs. finalization + +Two scheduling regimes exist, chosen by target architecture: + +- **Per-function streaming (x64, rv64).** As the recorder completes each + function it fires the optimizer's per-function callback, which lowers and fully + processes that one function immediately. Functions flow through the pipeline in + recording order, one at a time, before the module is finalized. +- **Finalization-time, reachability-driven (ARM_64).** The per-function callback + registers the recorded `CgIrFunc` but does no lowering. All processing is + deferred to module finalization, where a reachability sweep over the + call/data-reloc graph computes the set of functions actually referenced from a + root, prunes the rest, and only then lowers and processes the survivors. + +Both regimes converge on the same lowering path and backend tail; they differ +only in *when* a function is lowered and whether dead local functions are dropped +before lowering or left for the linker (Section 3.1). + +### The recording/optimizing boundary + +The split between *recording* (`cg/ir`) and *optimizing* (`opt/ir`) is +deliberate and is the central design decision of this module: + +- The recorded `CgIrFunc` is a faithful, immutable transcript of the frontend's + semantic intent. It speaks in `CGLocal`/`Label`/`CGCallDesc` terms and knows + nothing about CFGs, dominators, or physical registers. Frontends and ABI + lowering own that layer; the optimizer never mutates it. Keeping it immutable + is what makes streaming tiny-inline re-lowering cheap and repeatable, and what + lets the same recorded tape feed both the native pipeline and the interpreter. 
+- `opt_func_from_cg_ir` (`src/opt/cg_ir_lower.c`) translates one `CgIrFunc` into + the optimizer's own mutable `Func` — a real CFG of `Block`s, each holding a + linear list of `Inst`s, plus frame slots, a pseudo-register table, a value + table, and the params/locals tables. From here on the optimizer works only on + `Func`; the recorded tape is a read-only source. + +Lowering also performs the first storage-classification decision. In +`lower_locals`, each semantic `CGLocal` becomes either register storage +(`CG_LOCAL_STORAGE_REG`, a fresh `PReg`, operands of kind `OPK_REG`) or frame +storage (`CG_LOCAL_STORAGE_FRAME`, a `FrameSlot`, operands of kind `OPK_LOCAL`). +A local is forced to a frame home when it is address-taken / memory-required +(`local_needs_home`), an aggregate, or larger than a machine word. Everything +else starts in a pseudo-register. Address-taken locals begin in frame storage; +later HIR address-folding and promotion passes recover register storage for +those whose address does not actually escape (Section 4). `va_list` operands +are lowered as opaque pointer values, never address-taken, so that all +va-layout knowledge stays behind the `NativeTarget` va hooks. + +## 2. The optimizer IR and its operand model + +The optimizer IR lives in `src/opt/ir.h` / `src/opt/ir.c`. Its shape: + +- `Func` owns one function: its CFG (`blocks`, `entry`, `emit_order`), frame + slots, params, locals, the pseudo-register table, the SSA value table, scope + bookkeeping, allocation results, and per-pass scratch. +- `Block` is a basic block: a growable `Inst[]`, explicit `preds`/`succ` edges, + and a pre-allocated `MCLabel` for blocks born from `cg_label_new`. +- `Inst` is one recorded operation. The `IROp` enum mirrors the `CgTarget` + surface essentially 1:1 (each recorded `CgTarget` call becomes exactly one + `Inst`), plus a few SSA-only ops (`IR_PHI`, `IR_CONST_I`, `IR_CONST_BYTES`). 
+ Rich operations (calls, returns, switches, inline asm, atomics, aggregate + memory ops, intrinsics, scopes, phis) carry a structured `aux` record so the + full semantic descriptor round-trips to emission. + +### Virtual vs physical operands; the mode-on-`Func` invariant + +`Operand` (kind `OPK_REG`/`OPK_IMM`/`OPK_LOCAL`/`OPK_GLOBAL`/`OPK_INDIRECT`) is +intentionally not a bare value id. Register operands change *meaning* across the +pipeline, but the *field* never changes — the mode is a flag on `Func`, never +encoded in the numeric id: + +- During lowering and the whole O1 path, `OPK_REG` carries a **`PReg`**: a + mutable pseudo-register id, the persistent storage location of a value. +- After `opt_build_reg_ssa` (O2 only), `OPK_REG` carries a **`Val`**: an + SSA single-definition value id. `Func.opt_reg_ssa` records which namespace is + live; shared helpers (`opt_reg_count`, `opt_reg_type`, `opt_reg_cls` in + `opt_internal.h`) consult it rather than guessing from context. +- Physical registers never appear in `OPK_REG` HIR operands. Allocation results + go to a separate location table, and physical operands appear only in the MIR + (Section 6). + +`IR_PARAM_DECL` is a def-only marker carrying no operands — the param's storage +lives in the `IRParam` table, not in a synthetic self-operand. These invariants +(virtual-only HIR operands, single-namespace-at-a-time, def-only param decls) +are what the debug verifier (`opt_verify`, Section 7) checks at phase +boundaries, so that a stale physical operand or a wrong-namespace use fails at +the nearest checkpoint rather than in the backend encoder. + +`FrameSlot` is the frame-storage currency: locals forced to memory, spill slots, +ABI parameter slots, alloca regions, and outgoing-argument areas. + +### Token aliasing: optimizer-local names onto NativeTarget types + +`src/opt/ir.h` deliberately reuses the physical backend's data types as the +optimizer's own, via a layer of preprocessor `#define`s. 
After including +`arch/native_target.h` it remaps a set of optimizer-local tokens onto the +`Native*` types: + +- `FrameSlot` → `NativeFrameSlot`, `FrameSlotKind`/`FS_*` → the `NativeFrameSlot*` + enum, `RegClass`/`RC_*` → `NativeAllocClass`, `CGPhysRegInfo` → + `NativePhysRegInfo`, the known-frame descriptor, and the `CG_REG_*` register + role flags. +- It also re-`#define`s the now-removed semantic CG spellings — `Operand`, + `CGCallDesc`, `CGFuncDesc`, `CGParamDesc`, `CGScopeDesc`, `CGLocalStorage`, + `FrameSlotDesc` — onto the optimizer's own `Opt*` structs, so optimizer code + can keep using the short historical names. + +The reason is that the optimizer's frame-slot, register-class, and physical- +register vocabulary *is* the backend's; sharing the structs avoids a translation +layer at the emit boundary, where `NativeFrame*` is exactly what +`opt_emit_native` hands the `NativeTarget`. The cost is a namespace hazard: a +`.c` file that needs the real semantic `cg/ir.h` `Operand`/`CG*Desc` types (for +example because it reads the recorded tape, or it talks to the live `NativeTarget` +in `Native*` terms) must first `#undef` the aliased tokens. `cg_ir_lower.c`, +`opt.c`, and `pass_native_emit.c` each do exactly this at the top of the file — +they straddle the boundary and must escape the optimizer-local remapping to name +the other side's types. Files that live entirely inside the optimizer IR (the +analysis and transform passes) keep the aliases and never `#undef`. + +## 3. One lowering path, three consumers + +There is a single lowering path through the optimizer IR. The opt level and the +consumer choose how far down it the function travels. 
```text -frontend -> CG -> opt_cgtarget records Func IR - -func_end: - build/lower CFG - machinize - loop tree - liveness - dead-def elimination - register allocation without live-range splitting - post-RA combine - DCE - jump cleanup - emit to wrapped target + opt_func_from_cg_ir + | + +-----------------+------------------+ + | | | + O1 native O2 mid-end interpreter tap + opt_run_o1_native opt_cleanup + opt_run_o1_interp + shared lowering (stops before machinize) + | | | + machinize SSA build, | + regalloc value/mem passes, | + MIR + emit conventional SSA, | + undo-SSA, then | + shared lowering | + v v v + NativeTarget NativeTarget interp bytecode ``` -`-O2`: +### O1 native (`opt_run_o1_native`) + +This is the live optimized path for compiled output. `opt_run_o1_native` +(`src/opt/opt.c`) is the per-function driver; how a function reaches it depends +on the scheduling regime of Section 1. On x64/rv64 the per-function callback +lowers the recorded function and calls `opt_run_o1_native` directly as each +function is recorded. On ARM_64 the callback only registers the function; +lowering and the call to `opt_run_o1_native` happen at finalization, once the +reachability sweep has selected the function. Either way the function travels the +same pipeline, entirely in the PReg namespace (`opt_reg_ssa == 0`) — no SSA +construction, no value numbering. 
In source order: ```text -frontend -> CG -> opt_cgtarget records all Func IR - -finalize: - inline retained functions - for each retained function: - SSA cleanup and value/memory optimization - lower through the shared backend pipeline - register allocation with coalescing and live-range splitting - emit to wrapped target +build_cfg -> jump_cleanup(CFG) -> build_cfg -> simplify_local +try_tiny_inline (+ cfg/jump_cleanup/cfg if anything inlined) +verify "lowering-cfg" +machinize_native ABI/call/ret/param constraints + machine clobbers +verify "lowering-machinize" +addr_xform_pregs fold ADDR_OF(local) into OPK_LOCAL loads/stores +promote_scalar_locals non-escaped scalar frame slot -> PReg +addr_of_global_cse hoist duplicate ADDR_OF(global) to entry +build_loop_tree +lower_loop_imm_operands / hoist_loop_consts loop-invariant imm materialization +live_blocks per-block PReg liveness (backward dataflow) +dead_def_elim_with_live pre-RA dead-definition elimination +regalloc_locations PReg -> hard reg / spill slot (no live-range splitting) +verify "post-regalloc" +lower_to_mir build physical MFunc; insert spill/reload +mir_verify "lower-mir" +mir_combine post-RA peephole / addressing-mode synthesis +mir_dce post-RA dead-code elimination +mir_jump_cleanup(CFG) -> mir_build_cfg -> mir_jump_cleanup(LAYOUT) +emit_native replay MIR into the NativeTarget ``` -Functions that contain constructs whose cloning or SSA semantics are not yet -explicit, such as inline asm, label-address loads, or indirect branches, are -conservatively routed through the non-SSA lowering path. - -## Core Data Structures - -### `OptImpl` - -`OptImpl` is the `CGTarget` wrapper state in `src/opt/opt.c`. - -- `target` is the wrapped real backend target. -- `level` selects the optimization schedule. -- `f` and `cur` track the function and current block during recording. -- `pending_loc` stores the latest source location and stamps new instructions. 
-- `FuncSet funcs` retains all `-O2` functions until `finalize`, so inlining can - see both callees and callers before any function is emitted. - -### Function IR - -The optimizer IR is defined in `src/opt/ir.h`. - -- `Func` owns the arena-backed IR for one function: blocks, frame slots, - pseudo-register metadata, value metadata, local declarations, emit order, and - pass scratch fields. -- `Block` owns a linear instruction list plus CFG predecessor/successor edges. - `emit_order` records layout order separately from block id order. -- `Inst` is one recorded operation. Most `IROp` values map directly to a single - CGTarget method. Source locations are stored on instructions. -- `PReg` is the mutable pseudo-register id used by recorded CG IR and by the - lowering pipeline before register SSA. -- `Val` is the SSA value id used after `opt_build_reg_ssa` and mem2reg SSA. - `VAL_NONE` and id zero are sentinels. -- Per-op auxiliary structs preserve structured data for calls, returns, phis, - switches, inline asm, atomics, aggregate memory operations, intrinsics, and - debug/scope operations. - -The representation deliberately keeps mode explicit. Before pseudo-register -SSA, `OPK_REG` operands carry `PReg` ids. After pseudo-register SSA, they carry -`Val` ids, and `Func.opt_reg_ssa` tells shared helpers which namespace is -active. - -### Memory Shape - -Memory operations carry `MemAccess`, including type, size, alignment, volatile -or atomic properties, and alias root. Important alias roots include locals, -globals, TLS, parameters, and unknown memory. Memory-aware passes must preserve -observable memory: volatile, atomic, aggregate operations with side effects, -calls that may clobber, fences, and inline asm. 
- -### Analysis State - -`OptAnalysis` holds per-pass order and dominance data: - -- postorder/reverse-postorder -- reachability -- immediate dominators and dominator children -- dominance frontiers - -`Func.opt_valid_analyses` tracks coarse invalidation for CFG, def-use, -dominators, and loop info. Passes that mutate control flow, operands, or -instructions either rebuild the relevant analysis or invalidate it. - -The shared def-use representation records every use of every `Val`, including -ordinary operands, indirect bases, phi inputs, call arguments/results, -aggregate operands, inline asm operands, and intrinsic operands. - -## Shared Lowering Pipeline - -`opt_run_lowering_pipeline` is used by both `-O1` and `-O2`. - -1. `opt_build_cfg` - Builds explicit CFG edges from branches, returns, switches, fallthroughs, - scopes, indirect branches, and synthetic blocks. It also removes unreachable - connected blocks after cleanup. - -2. `opt_jump_cleanup` - Performs CFG cleanup before and after lowering. The early form repairs and - simplifies CFG shape; the late form also optimizes layout fallthroughs. - -3. `opt_simplify_local` - Performs local algebraic/address simplifications that do not require SSA. - -4. `opt_machinize` - Lowers generic IR toward target-legal machine IR. This includes target - operand constraints, hard-register constraints, clobber information, and - target-specific rewrite decisions needed before allocation. - -5. `opt_build_loop_tree` - Builds loop metadata from dominators for use by later analysis and metrics. - -6. `opt_live_blocks` - Computes block liveness over pseudo-registers or SSA values, depending on - the active mode. - -7. `opt_dead_def_elim_with_live` - Removes dead register definitions using live-out information while - preserving side-effecting instructions. - -8. `opt_regalloc` - Allocates pseudo-registers to hard registers or spill slots. `-O1` uses the - simple allocation mode. `-O2` enables coalescing and live-range splitting. 
- -9. `opt_combine` - Runs post-allocation peephole combines over lowered IR. - -10. `opt_dce` - Deletes now-dead non-side-effecting instructions after allocation and - combine. - -11. `opt_emit` - Replays the final IR into the wrapped backend target. +Once a function enters this pipeline it runs every stage — there is no per-op +bypass within the pipeline itself. Varargs, inline asm, aggregates/sret/byval are +all handled here. Most stages are bracketed by an `opt_verify` / +`opt_mir_verify` checkpoint with a stage tag, and `CFREE_DUMP=<tag>` dumps the +IR at the matching stage (`entry` before any pass, `pre-emit` just before emit). + +The reachability decision lives *outside* this pipeline and is per-architecture +(Section 1). At module finalization (`opt_on_finalize`) file-scope asm blocks +captured during recording are replayed on every target. On ARM_64, finalization +additionally runs the reachability sweep that selects which functions are lowered +at all, so dead local functions/data are never lowered or emitted; the survivors +then each run the full pipeline above. On x64/rv64 every recorded function was +already lowered and emitted during streaming, so dead-static elimination is left +to the linker rather than performed here. + +### O2 mid-end (`opt_cleanup` + shared lowering) + +The O2 mid-end is the SSA-based optimization schedule defined in `opt_cleanup` +(`src/opt/pass_o2.c`). It is the intended mid-end architecture and is fully +implemented, but it is not on the shipped code path: `opt_cgtarget_new` normalizes +every requested `opt_level` to `1` (the line `o->level = 1` in `src/opt/opt.c`), +so no compilation ever selects O2 and every `opt_level >= 1` request runs the O1 +native path. + +The rationale for this normalization is isolation. 
Keeping the O2 schedule +defined and its passes maintained means the SSA representation and its +incremental def-use can stabilize against targeted optimizer tests independently, +without an SSA-construction or value-numbering bug affecting shipped output. The +schedule is documented here because it is the designed mid-end shape that the O1 +path is a deliberately reduced subset of; the section describes the intended +architecture, not a live code path. The schedule is: -The shared path is the bisection floor. Bugs in CFG construction, target -machinization, liveness, allocation, rewrite, combine, or emission should -usually reproduce at `-O1`. - -## `-O1` Pipeline - -`-O1` runs only the shared lowering pipeline. It intentionally skips: - -- interprocedural retention -- inlining -- SSA construction -- GVN -- copy propagation beyond local cleanup -- DSE -- LICM -- pressure relief -- live-range splitting - -`-O1` should compile much closer to `-O0` than to `-O2`, while removing obvious -backend artifacts and exercising the optimized allocation/emission path. - -## `-O2` Pipeline - -`-O2` retains every function until target finalization. This enables whole-TU -inlining before any function is emitted. - -### Inlining - -`opt_inline` operates on `FuncSet`. - -- The call graph is built from direct `IR_CALL` sites whose callee is an - `OPK_GLOBAL` with addend zero and a retained `Func`. -- Strongly connected components are used to avoid recursive inlining. Self and - mutually recursive SCCs are refused for v1. -- The current iteration cap is small. The default is one inline iteration. -- Candidate callees must be retained, target-compatible, non-variadic, not - recursive, and within a conservative growth budget. -- V1 refuses callees containing alloca, varargs, setjmp/longjmp, inline asm, - label-address loads, indirect branches, coroutine switching, or unknown - cloning semantics. -- Void and scalar register returns are supported. 
Aggregate/sret/byval returns - are left for future work. -- The caller block is split into pre-call, cloned body, and continuation blocks. - Arguments are materialized in the clone entry, returns branch to the - continuation, and modified callers are marked dirty. - -After inlining, dirty functions run the normal `-O2` cleanup schedule. - -### SSA Cleanup Schedule - -`opt_cleanup` is the `-O2` pre-lowering schedule. - -1. `opt_build_cfg`, `opt_jump_cleanup`, `opt_build_cfg` - Canonicalize control flow before SSA. - -2. `opt_build_reg_ssa` - Converts mutable pseudo-registers to SSA `Val` ids. - -3. `opt_block_cloning` - Clones small cold/return blocks into hot predecessors when bounded growth - exposes better local optimization. - -4. `opt_build_ssa` - Promotes eligible frame-backed locals/params to mem2reg SSA, inserts phis, - and rewrites loads/stores to values. - -5. `opt_ssa_dce` and `opt_copy_cleanup` - Remove unused SSA definitions and redundant copies before heavier passes. - -6. `opt_addr_xform` - Folds address-producing pseudos into direct memory operands where legal and - preserves address-taken storage in memory. - -7. `opt_simplify` - Applies SSA-aware identity simplification, constant simplification, and - local algebraic cleanup. - -8. `opt_gvn` - Performs scalar value numbering, constant propagation, branch folding, - redundant expression elimination, and memory-aware redundant load/store-load - reuse where alias/version rules permit it. - -9. `opt_copy_prop` - Propagates SSA copies and removes redundant extension/copy chains. - -10. `opt_dse` - Removes stores proven overwritten or dead along all paths, while preserving - observable memory and call/escape boundaries. - -11. `opt_build_loop_tree`, `opt_licm` - Rebuild loop information and hoist safe invariant computations. - -12. `opt_pressure_relief` - Sinks legal same-block computations to reduce peak pressure before - allocation. - -13. `opt_make_conventional_ssa` - Lowers SSA phis into edge copies. 
These copies are marked - `IRF_NO_COALESCE`; coalescing them can collapse current and next - loop-carried values and miscompile loops. - -14. `opt_ssa_combine` - Combines patterns while conventional SSA edge copies are explicit. - -15. `opt_undo_ssa`, `opt_copy_cleanup` - Rewrites SSA values back into allocation-ready pseudo-register form. - -16. `opt_jump_opt` - Runs full O2 jump optimization before shared lowering. - -Verifier checkpoints bracket the important transitions: CFG, reg SSA, block -cloning, mem2reg SSA, address transform, simplify, GVN, copy prop, DSE, LICM, -pressure relief, conventional SSA, SSA combine, undo SSA, and jump opt. - -After cleanup, `-O2` enters the shared lowering pipeline with live-range -splitting enabled. - -## Register Allocation - -Allocation uses target-provided register classes and hard-register sets. - -- Pseudo-register metadata records type, class, fixed/tied hard-register - constraints, and forbidden hard-register masks. -- Liveness is represented as block live-in/live-out sets and live ranges over - instruction points. -- Coalescing collects move-related candidates, builds a bounded conflict matrix - for those candidates, and merges only same-class, same-type registers with - compatible hard-register constraints and no live-range conflict. -- Copies marked `IRF_NO_COALESCE` are not coalescing candidates. -- `-O2` allocation may split live ranges and insert boundary reload/store - instructions. Critical edges are split when needed for placement. -- Spill slots are represented as frame slots and rewritten as ordinary loads - and stores before emission. - -## Verification, Dumps, and Metrics - -`opt_verify(Func*, stage)` checks CFG reciprocity, reachable block shape, -emit-order validity, instruction ids, operand namespaces, phi consistency, and -def-use freshness. Passes should keep verifier failures local by using stage -names that describe the last completed transformation. 
- -The optimizer also emits metrics through the compiler metrics interface. The -driver's `cfree run --time` mode is the easiest way to inspect scoped timings -such as frontend, pass totals, link, JIT, and execution. - -IR dumps and pass-local tests should prefer stable ids and pass-specific -fixtures. End-to-end parse/toy tests should be focused on user-visible behavior -and disassembly sanity checks, not broad churn. - -## Current Limits +```text +build_cfg / jump_cleanup(CFG) / build_cfg canonicalize control flow +build_reg_ssa PReg -> Val (register SSA) +block_cloning bounded clone of small blocks +build_ssa mem2reg: promote frame locals, insert phis +ssa_dce / copy_cleanup +addr_xform fold address pseudos into mem operands +simplify SSA-aware identity/algebraic cleanup +gvn value numbering, constprop, branch fold, + redundant-load reuse +copy_prop copy + redundant-extension elimination +dse dead store elimination +build_loop_tree / licm hoist loop invariants +pressure_relief sink same-block computes +make_conventional_ssa phis -> edge copies (IRF_NO_COALESCE) +ssa_combine +undo_ssa / copy_cleanup Val -> PReg, allocation-ready +jump_opt +``` -- `_Float16` is parsed for Darwin header compatibility but is not a real - half-precision ABI implementation. -- Inlining v1 does not handle aggregate returns, sret/byval, recursive SCCs, - varargs, alloca, setjmp/longjmp, inline asm, or indirect-control constructs. -- `cfree-run --bench-time` reports compile+JIT as one slice. Finer-grained - timing is available through `--time`, but benchmark CSVs do not yet split - cfree frontend, optimizer, link, and JIT into separate columns. -- Performance goals, measurements, and tuning priorities live in - `doc/OPT_PERF.md`. +By design an O2 function then re-enters the same backend tail as O1 (machinize +through emit), with the allocator's live-range splitting and move-related +coalescing enabled — the variants that the O1 path leaves off. 
The SSA +value/memory passes (`opt_gvn`, `opt_dse`, `opt_licm`, +`opt_pressure_relief`, `opt_ssa_combine`) live in `src/opt/pass_o2.c`; SSA +construction and phi destruction in `src/opt/pass_ssa.c`. + +### Interpreter tap (`opt_run_o1_interp`) + +The interpreter consumes the optimizer IR directly rather than machine code. The +tap runs the maximal target-independent subset of the O1 pipeline and stops +before machinization: build CFG, jump cleanup, `simplify_local`, the PReg-level +address folds and scalar-local promotion, `addr_of_global_cse`, loop tree, and +liveness-driven dead-def elimination. It deliberately stops before +`opt_machinize_native`, register allocation, MIR lowering, and native emit. The +result is a `Func` still in the PReg namespace (`opt_reg_ssa == 0`, no +`IR_PHI` phis) that `src/interp/lower.c` lowers into threaded bytecode. The tap +runs the folds even though in the native pipeline they sit after machinize, +because they depend only on the PReg/frame-slot view, not on physical-register +pools — so they are safe and they shrink the interpreter's work. See +[INTERPRETER.md](INTERPRETER.md). + +## 4. Pass catalog by role + +The passes are grouped here by responsibility. Each is one `Func`-in-place +transform or analysis; the file paths orient the reader. + +### SSA mid-end (O2) + +- **Register SSA + mem2reg** (`src/opt/pass_ssa.c`): `opt_build_reg_ssa` renames + multiply-assigned PRegs into SSA `Val`s; `opt_build_ssa` promotes eligible + frame-backed locals/params to SSA via dominance-frontier phi insertion and + rewrites their loads/stores to values. `opt_make_conventional_ssa` lowers phis + to edge copies (marked `IRF_NO_COALESCE`, because coalescing a phi edge copy + can collapse a loop-carried value with its successor and miscompile the loop) + and `opt_undo_ssa` returns to the PReg namespace for allocation. 
+- **GVN + DSE orchestration** (`src/opt/pass_o2.c`): `opt_gvn` does scalar value + numbering, constant propagation, branch folding, and memory-aware redundant + load / store-to-load reuse gated by alias-root and version rules; `opt_dse` + removes stores proven dead or overwritten while preserving observable memory + (volatile, atomic, calls that may clobber, escapes). `opt_licm` and + `opt_pressure_relief` round out the loop/pressure work, also here. +- **Peephole combine + addressing-mode synthesis** (`src/opt/pass_combine.c`): + `opt_combine` is a per-block forward-pass-with-fixpoint that propagates copies, + folds address-producing computations into a load/store's `OPK_INDIRECT` + base/index/scale/offset where the backend accepts the shape, sinks defs toward + their sole use, and folds extension chains. It is used in two roles: directly + in the O2 SSA combine (`opt_ssa_combine` wraps it) and as the post-RA MIR + combine (Section 6). When run over physical MIR it gates each rewrite on a + live-range safety check (Section 5). +- **Simplify** (`src/opt/pass_simplify.c`): `opt_simplify_local` is the + no-SSA-required local algebraic/addressing canonicalizer used on every path; + `opt_simplify` is the SSA-aware identity/constant cleanup used in O2. +- **DCE** (`src/opt/pass_dce.c`): `opt_ssa_dce` removes unused SSA defs; + `opt_mir_dce` removes post-RA dead physical defs; both preserve side effects, + including the subtle case of a value-producing op whose destination is an + `OPK_LOCAL` (a write to an escaped frame-homed local is a memory side effect + even when the op is otherwise pure). +- **Copy cleanup / copy prop** (`src/opt/pass_copy.c`): redundant-copy removal + and copy propagation, including redundant extension/convert-chain elimination. +- **Inlining** (`src/opt/pass_inline.c`): `opt_try_tiny_inline` is the streaming + O1 entry. 
On the pre-machinize PReg form it resolves each direct `IR_CALL` to a + recorded callee via a lookup callback (`OptImpl` owns the registry and the + lazily re-lowered callee cache), gates on a tiny straightline-cost cap and a + whitelist that excludes calls/control-rich constructs, refuses self/recursive + callees, and splices the cloned body in. The whole-program inliner machinery + (`inline_call_site` and its gates) also lives here. +- **Address folding** (`src/opt/pass_addr_fold.c`): the always-on O1 HIR folds — + `opt_addr_xform_pregs` (fold `ADDR_OF(local)` into direct `OPK_LOCAL` + load/store operands and clear `FSF_ADDR_TAKEN` when all such defs retire), + `opt_promote_scalar_locals` (promote a non-escaped scalar frame slot to a + PReg, turning its stores/loads into copies), `opt_addr_of_global_cse` (hoist + one `ADDR_OF(global)` compute to the entry block and reuse it), and the + loop-invariant constant materialization (`opt_hoist_loop_consts` / + `opt_lower_loop_imm_operands`). `opt_addr_xform` is the SSA-namespace + counterpart used in O2. + +### Shared analyses + +- **CFG** (`src/opt/pass_cfg.c`): `opt_build_cfg` derives `preds`/`succ` from + each block's terminator (branches, conditional/fused branches, returns, + switches, indirect branches, scope break/continue edges) and validates + reciprocity; `opt_mir_build_cfg` recomputes them over the physical MIR. +- **Order + dominators + verify** (`src/opt/pass_analysis.c`): postorder / + reverse-postorder, reachability, immediate dominators, dominator children, + dominance frontiers (`OptAnalysis`), the coarse analysis-validity bits + (`OPT_ANALYSIS_CFG/DEF_USE/DOM/LOOP`), and the debug verifier `opt_verify`. 
+- **Liveness** (`src/opt/pass_live.c`): `opt_live_blocks` solves per-block PReg + liveness by backward dataflow into elastic 64-bit-word bitsets (`OptBitset`, + grown on demand, trailing-zero-trimmed); `opt_live_ranges_build` produces the + compressed point-indexed live ranges and per-PReg frequency/spill-cost metrics + the allocator consumes. +- **Hard-register liveness** (`src/opt/pass_hard_live.c`): physical-register + live-in/out over the post-RA MIR, plus the per-call clobber mask + (`opt_call_clobber_mask_for`). This is what makes post-RA combine/DCE safe: a + value in a callee-saved register survives a call, while caller-saved registers + are killed by it. +- **Loop detection** (`src/opt/pass_loop.c`): `opt_build_loop_tree` computes loop + nesting depth from dominators; depth feeds the allocator's spill-cost weighting + and LICM. + +### Backend tail + +- **Type-size lowering** (`src/opt/pass_lower.c`): the type/size machinery and + the allocator that the PReg form needs before MIR (also hosts the allocation + and constraint application described below). +- **Machinize** (`src/opt/pass_machinize.c`): `opt_machinize_native` is ABI + lowering against the `NativeTarget`. It annotates calls/returns/params with + calling-convention constraints (argument/result registers, the call clobber + and return masks, callee-save markers), collects the target's register classes + (allocable set, reserved scratch set, caller/callee-saved masks) and checks + allocable and scratch sets do not overlap, resolves inline-asm + named-register constraint strings into masks, and records per-instruction + fixed-register clobbers (Section 5). +- **MIR view** (`src/opt/pass_mir.c`): the post-allocation physical IR. Rather + than duplicate the CFG passes, `pass_mir.c` builds a transient `Func` *view* + whose block arrays point at `Func.mir`, runs the shared `opt_combine`, + `opt_dce`, `opt_build_cfg`, and `opt_jump_cleanup` over that view, and commits + it back. 
The `opt_mir_*` wrappers are thin shims over this view; the shared + passes are written once and reused for both HIR and MIR. +- **Coalescing / allocation** (`src/opt/pass_coalesce.c`, `src/opt/pass_lower.c`): + `opt_regalloc_locations` is a point-bitmap linear-scan allocator producing the + canonical `Func.preg_locs` location table (hard reg or spill slot per PReg) + without mutating HIR operands. The non-splitting form is the O1 path; when + live-range splitting is enabled (the O2 quality path) it invokes + move-related coalescing (`opt_coalesce_ranges`), which builds a bounded + conflict matrix and merges only same-class, same-type values with compatible + constraints and no range conflict — never an `IRF_NO_COALESCE` copy. +- **Jump / layout cleanup** (`src/opt/pass_jump.c`): `opt_jump_cleanup` in CFG + mode drops unreachable blocks and collapses unconditional-jump chains; in + LAYOUT mode it reorders blocks for fallthrough, rotates simple single-latch + loops, and inverts mis-aligned conditional branches so the per-iteration + back-jump disappears. +- **Native emit** (`src/opt/pass_native_emit.c`): `opt_emit_native` replays the + physical MIR into a `NativeTarget`, using `NativeLoc` (register / frame / imm / + address) as the operand currency. It reserves exactly the callee-saved + registers the allocator used, pre-maps frame slots, drives the backend's + minimal-prologue hook when available, routes scalar call results straight to + their destination, uses a hardware zero register for stored zeros where the + backend advertises one, and legalizes addresses the backend rejects into a + reserved scratch register. See [ARCH.md](ARCH.md) for the backend contract. + +## 5. 
Machine register-constraint model + +Some target instructions pin operands or results to specific physical registers +and clobber others as a side effect of their encoding — hardware constraints, +not allocator choices (x86-64 `idiv`/`div` pinning the dividend to `rax` and +clobbering `rdx`, variable shifts requiring the count in `cl`, one-operand +`mul`, `cmpxchg`, and the `va_arg` offset scratch). aarch64 and riscv64 have +no such instructions — their div/shift/mul are ordinary three-operand forms — +so on those targets the constraint hooks are inert. + +The optimizer models all fixed-register requirements through two allocator +primitives, and the allocator (`pass_lower.c`) speaks only in physical register +numbers here: + +- **Tied hard register** (`OptPRegInfo.tied_hard_reg`): pin a value to a + specific physical register. Set for inline-asm operands with a `{reg}` + constraint and for fixed-input/fixed-output machine operands. +- **Forbidden / clobbered hard registers** + (`OptPRegInfo.forbidden_hard_regs` / `clobbered_hard_regs`): for each register + an instruction clobbers, every value live *across* the instruction (live-after, + not a use or def of it) is forbidden from that register. The clobbered subset + is recorded separately so the soft return-register placement hint cannot later + clear a forbid that came from a real hardware clobber. + +Three sources feed these, all unified at allocation time: + +1. **Calls** — the call plan's `clobber_mask` (caller-saved by default, or the + call-specific mask) drives the live-across-forbid loop; argument/result + registers come from the plan. +2. **Inline asm** — `pass_machinize.c` resolves named-register constraint + strings and clobber lists into masks and fixed-register indices on the + `IRAsmAux`; `pass_lower.c`'s `apply_asm_register_constraints` ties the fixed + operands and runs the live-across-forbid loop. +3. 
**Generic machine instructions** — a binop or convert has no `aux` to hang + constraints on, so machinization queries the target's machine-clobber hook + per instruction and stores the result in a per-function side table keyed by + `InstId` (`Func.inst_clobbers`, built in `machinize_inst_clobbers`). At + allocation, `apply_machine_reg_clobbers` looks up the instruction's clobber + mask and applies the same live-across-forbid loop. A NULL hook (aa64/rv64) + means no entries and zero behavior change. + +This is the single place where target ISA register rules enter the allocator, +and all three sources reuse one mechanism — tie + forbid — rather than patching +assignments after the fact. A value that merely dies at the instruction needs no +constraint (the backend stages it into/out of the fixed register itself); only +values that survive past the instruction are forbidden. + +## 6. Allocation, MIR, and the physical boundary + +Allocation does not rewrite HIR. `opt_regalloc_locations` consumes block +liveness and the compressed live ranges and writes one canonical location per +PReg into `Func.preg_locs` (`OptLoc`: hard register or spill slot). HIR operands +stay virtual after allocation — the verifier checks this. + +`opt_lower_to_mir` then builds the physical IR `Func.mir` (an `MFunc`): each +virtual `OPK_REG` is translated through its `OptLoc` into a physical register or +a frame access, spilled values get a reload before each use and a store after +each def, and call plans are lowered into physical argument/return moves. From +this point the IR is physical and non-SSA (registers may be multiply defined). +All PReg-to-physical knowledge lives in this one step; after it, the HIR is +untouched and the MIR is fully physical. The downstream MIR passes (combine, +DCE, jump/layout cleanup) run over the MIR view and rely on physical-register +liveness for their safety checks, then `opt_emit_native` replays the MIR. 
+ +The reason allocation results are a separate table rather than rewritten +operands is the same mode-clarity principle from Section 2: a post-allocation +pass can never accidentally treat a physical register as a PReg, replay can +never see a stale virtual operand, and the MIR verifier can assert "no PRegs or +Vals here" at a single boundary. + +## 7. Verification and observability + +The optimizer is checkpoint-verified in debug builds. `opt_verify(Func*, stage)` +checks CFG reciprocity, reachable-block shape, emit-order validity, instruction +ids, operand namespaces (no physical registers in HIR; correct PReg-vs-Val +namespace for the current mode), phi consistency, and def-use freshness; +`opt_mir_verify` checks the physical boundary (no virtual operands, valid frame +slots, fully physical call plans). Each pass tags its checkpoint with the name +of the transformation just completed, so a failure localizes to the nearest +boundary. `Func.opt_valid_analyses` tracks coarse invalidation; passes that +mutate control flow, operands, or instructions rebuild or invalidate the +relevant analysis. + +Observability hooks: `CFREE_DUMP=<tag>` dumps the optimizer IR at a named stage, +`CFREE_DUMPCG=1` dumps the recorded semantic tape before lowering, +`CFREE_DUMP_INTERP` dumps the interpreter-tap `Func`, and the optimizer emits +scoped timing/count metrics (visible through `cfree run --time`) for the +frontend, each pass scope, allocation, and emit. diff --git a/doc/OPT_MACHINE_REG_CONSTRAINTS.md b/doc/OPT_MACHINE_REG_CONSTRAINTS.md @@ -1,180 +0,0 @@ -# Teaching the optimizer about fixed-register machine instructions - -## Problem - -Some target instructions pin operands or results to specific physical registers -and clobber others as a side effect of the encoding — *hardware* constraints, -not allocator choices. 
On x86-64: - -| operation | constraint | -|---|---| -| `idiv` / `div` (DIV) | dividend in `rax`; quotient → `rax`; `rdx` clobbered (`cqo`/`xor`) | -| `idiv` / `div` (MOD) | dividend in `rax`; remainder → `rdx`; `rax` clobbered | -| variable shift `shl/shr/sar` | shift count in `cl` (`rcx`) | -| `mul` / `imul` one-operand (unsigned-mul-high, `*_overflow`) | `rax`*`r/m` → `rdx:rax`; `rdx` clobbered | -| `cmpxchg` (atomic CAS) | expected/prior in `rax`; `rax` clobbered | -| `xadd`/`cmpxchg` loop (atomic RMW) | `rax`/`rcx`/`rdx` used | -| `va_arg` (SysV) | `rax` used internally for the gp/fp offset | - -aarch64 and riscv64 have **no** such instructions — their div, shift, and mul -are ordinary three-operand forms with no fixed registers — which is why this -never came up porting them. - -Today the optimizer (`-O1`) models none of this. It treats `idiv` as a generic -binop `def = a / b`, so the register allocator is free to keep an unrelated live -value in `rdx` across the divide. The x64 backend then emits `xor rdx,rdx; idiv` -and silently destroys that value. Concretely (`6_5_04_div_mod`, the VLA cases, -`continue` cases): a live pointer/sum allocated to `rdx` is gone after the first -`idiv`, producing wrong results or a SIGSEGV. This is the single root cause -behind the x64 parse `-O1` tail (div/mod, VLA size math, modulo in loops, the -atomic and overflow intrinsics, and the variadic accumulators). - -The `-O0` path is immune: NativeDirectTarget keeps values in memory / flushes its -small register cache around such ops, so nothing is live in a clobbered register. - -## The machinery that already exists - -The allocator (`src/opt/pass_lower.c`) already supports exactly the two -primitives we need; today only inline-asm and calls feed them: - -- **`preg_info[v].tied_hard_reg`** — pin value `v` to a physical register. - Set by `apply_fixed_asm_operand` from `aux->in_fixed_regs` / `out_fixed_regs`. 
-- **`clobber_mask[cls]` → `forbidden_hard_regs`** — for each register an - instruction clobbers, every value live *across* the instruction (live-after, - not a use/def of it) is forbidden from that register. Applied in - `apply_asm_register_constraints` (asm) and `opt_call_clobber_mask_for` (calls). - -`pass_machinize.c` populates the asm constraints in `machinize_prepare_insts` -(`asm_prepare_constraints`), resolving named-register constraint strings into -masks. Calls carry their clobber/return masks in the call plan. - -So the gap is narrow: **a generic machine instruction (div, shift, …) has no way -to declare fixed registers and clobbers.** It has no `aux` to hang them on — a -binop's `extra` union already holds the `BinOp` tag (`extra.imm`). - -## Design - -### 1. A target hook describing an instruction's register constraints - -```c -/* arch/native_target.h */ -#define NATIVE_MAX_FIXED_OPNDS 4u - -typedef struct NativeInstRegConstraints { - /* Use-operand index -> physical register it must occupy, or -1. Indices match - * the instruction's OPK_REG use operands in order. */ - i8 in_fixed[NATIVE_MAX_FIXED_OPNDS]; - /* Result (def) index -> physical register, or -1. */ - i8 out_fixed[NATIVE_MAX_FIXED_OPNDS]; - /* Extra physical registers the instruction clobbers (defines), per class. - * The fixed in/out registers need not be repeated here. */ - u32 clobber_mask[NATIVE_CALL_PLAN_CLASSES]; -} NativeInstRegConstraints; - -/* Fill *out with the register constraints the target's encoding of `op` imposes, - * given the instruction (for the BinOp/Conv sub-tag in extra.imm and operand - * kinds — e.g. a variable vs constant shift count). Return 1 if constrained, 0 - * if the instruction is unconstrained (the common case). May be NULL. - * - * The optimizer speaks only in physical register numbers here; the hook is the - * one place target ISA register rules enter the allocator. 
*/ -int (*inst_reg_constraints)(NativeTarget*, const struct Inst* in, - NativeInstRegConstraints* out); -``` - -aarch64/riscv64 leave the hook NULL (or `return 0`) — zero behaviour change. - -### 2. Where it is computed and stored - -The allocator (`pass_lower`) does not hold a `NativeTarget*`, and a binop has no -free `aux` slot, so constraints are **computed in machinization and stored in a -per-function side table keyed by `Inst.id`**: - -```c -/* Func, built in opt_machinize_native */ -typedef struct OptInstConstraints { - InstId inst; - NativeInstRegConstraints c; -} OptInstConstraints; -/* f->inst_constraints : sorted/dense by InstId; lookup is O(1) by id or a - * small bsearch. Only constrained instructions are stored. */ -``` - -`machinize_prepare_insts` already walks every instruction; extend it: for each -non-asm, non-call instruction, call `inst_reg_constraints`; on a `1` return, -append an entry. (Asm and calls keep their existing paths.) - -### 3. Where it is applied - -In `pass_lower`'s per-instruction allocation walk, beside the existing -`apply_asm_register_constraints(f, in, use, def, live)` call, add -`apply_machine_reg_constraints(f, in, use, def, live)`: - -- Look up `in->id` in `f->inst_constraints`; if none, return. -- For each `in_fixed[i] >= 0`: tie the i-th OPK_REG **use** operand's value via - `preg_info[v].tied_hard_reg` (reuse `apply_fixed_asm_operand`). -- For each `out_fixed[i] >= 0`: tie the i-th **def** value. -- For `clobber_mask`: run the same live-across-forbid loop the asm path uses. - -This reuses 100% of the existing allocator mechanism; the only new code is the -lookup + dispatch. No change to the allocation algorithm itself. - -### 4. 
The x64 hook - -```c -int x64_inst_reg_constraints(NativeTarget*, const Inst* in, - NativeInstRegConstraints* out) { - zero(out); set all in_fixed/out_fixed = -1; - switch ((IROp)in->op) { - case IR_BINOP: - switch ((BinOp)in->extra.imm) { - SDIV/UDIV: in_fixed[dividend]=RAX; out_fixed[0]=RAX; clobber{RDX}; return 1; - SMOD/UMOD: in_fixed[dividend]=RAX; out_fixed[0]=RDX; clobber{RAX}; return 1; - SHL/SHR/SAR with non-const count: in_fixed[count]=RCX; return 1; - default: return 0; - } - case IR_VA_ARG: clobber{RAX}; return 1; /* internal offset scratch */ - case IR_ATOMIC_CAS: in_fixed[expected]=RAX; out_fixed[prior]=RAX; - clobber{RAX,RCX}; return 1; - case IR_ATOMIC_RMW: clobber{RAX,RCX,RDX}; return 1; - case IR_INTRINSIC (umul_overflow): clobber{RDX}; return 1; - default: return 0; - } -} -``` - -(Exact operand indices follow the IR's use/def layout; `mul`/`imul` two-operand -forms and signed-mul-overflow via `imul r,r` need no constraint.) - -### 5. Backend simplification (follow-on, optional) - -Once the allocator guarantees the dividend is already in `rax` and nothing live -sits in `rdx`/`rcx`, several defensive moves in the x64 backend become -no-ops/removable: the `mov rax, dividend` in `x64_binop`, the address-staging in -`x64_atomic_cas`/`_rmw` (the optimizer ties expected→rax etc.), and the shift -`mov rcx, count`. These can be simplified after the constraints land, but are -left in initially (harmless — they become register-to-itself moves). - -## Validation - -- x64 parse `-O1` div/mod, VLA, continue, atomic, overflow, varargs cases go - green; toy `-O1` stays 156/0; `-O0` unchanged (hook only consulted on the - optimizer path). -- aa64/rv64 unaffected (hook returns 0 / NULL): re-run toy + parse to confirm - byte-identical output. -- Add a focused regression: a function that divides and keeps an unrelated value - live across the divide, compiled at `-O1`. 
- -## Risks / corner cases - -- **Tied input that is also the clobber/def register** (div: dividend tied to - `rax`, `rax` redefined as quotient). The allocator must accept a use whose - tied register is also written — same shape as a two-address op. Verify - `tied_hard_reg` + the def coexist; if the live-across-forbid loop also forbids - the *result* value from `rax`, exclude defs of this instruction (the asm loop - already skips use/def values — reuse that exactly). -- **Two operands tied to the same register** must be rejected at the hook level - (never emitted for the x64 set above). -- **`InstId` density** for the side table — confirm ids are assigned per - function and stable through `pass_lower`; otherwise key by instruction pointer - in a small open-addressed map. diff --git a/doc/OPT_O0_NATIVE_DIRECT_NOTES.md b/doc/OPT_O0_NATIVE_DIRECT_NOTES.md @@ -1,186 +0,0 @@ -# NativeDirectTarget O0 investigation - -Notes from a 2026-05-29 read of the direct native O0 path, focused on reducing -the aarch64 prologue padding and reducing stack traffic for compiler -temporaries while keeping semantic codegen single-pass. - -Relevant code: - -- `src/cg/native_direct_target.c`: shared direct O0 semantic target. -- `src/arch/aa64/native.c`: physical aarch64 frame/prologue/call lowering. -- `src/cg/local.c`, `src/cg/memory.c`: value-stack temporary allocation and - local/value stack behavior. -- `src/opt/pass_native_emit.c`: known-frame O1 path for comparison. - -## Current direct O0 shape - -`NativeDirectTarget` is genuinely single-pass: frontend/value-stack operations -are lowered immediately to `NativeTarget` calls. It does not record the function -body, liveness, or a complete frame plan. - -On aa64, `aa_func_begin` reserves `AA_PROLOGUE_WORDS` (24 words) at entry, -emits entry-save stores after that reserved region, and `aa_func_end` patches -the reserved words once final `cum_off` and `max_outgoing` are known. 
For common -small top-record frames the real prologue is only four instructions: - -```asm -sub sp, sp, #N -add x17, sp, #(N - 16) -stp x29, x30, [x17] -add x29, sp, #(N - 16) -``` - -Because the reserved region is 24 words, the patcher emits those four -instructions followed by `b body` and 19 nops. That is the branch/nop blob seen -in O0 disassembly. - -The O1 path avoids this by planning the whole frame in `plan_frame`, then using -`func_begin_known_frame`. That is not directly available to O0 without either -recording/pre-scanning the function or buffering the body. - -## Prologue options that keep single-pass semantics - -### 1. Small inline region plus out-of-line slow prologue - -Reserve a small direct-prologue region, probably four words, and record the -first body offset. At `func_end`: - -- If the final frame fits the four-word top-record prologue, patch those four - words directly. No entry branch and no padding in the common case. -- If it does not fit, patch entry to branch to an out-of-line prologue stub - emitted after the function body/epilogue. The stub emits the full prologue - and branches back to the recorded body-start label. - -This keeps the frontend and direct target single-pass: only the rare large-frame -machine-code prologue is moved out of line. It needs careful unwind/debug work: -the first executed prologue instructions are no longer laid out before the body -in the slow case, so CFI/line handling cannot keep assuming a contiguous -entry-prologue region. - -This is the best shape if we want the common O0 case to have zero padding and -still support arbitrarily large frames without pre-scanning. - -### 2. Smaller fixed reserve - -Use a smaller reserved region for direct O0, e.g. 8 or 12 words instead of 24. -This is simple and low risk, and immediately reduces the current branch target -distance and static padding. - -It does not remove the branch unless the real prologue exactly fills the -reserved region. 
For the observed small frames, a reserve of 8 still gives: - -```asm -4 real prologue insns -b body -3 nops -``` - -This is a conservative stepping stone but not the end state. - -### 3. Four-word reserve with hard fallback restriction - -Reserve four words and require direct O0 frames to fit the small top-record -prologue. This would remove the common padding with minimal code churn, but it -is not acceptable as a general solution unless paired with a fallback. Large -frames, odd entry-save cases, or future callee-save use could exceed four words. - -### 4. Buffer body emission until function end - -Keep semantic generation single-pass, but make the native emitter buffer each -function body in memory. At `func_end`, compute the final frame, emit the exact -prologue, then append the buffered body and epilogue. - -This gives the cleanest machine code and keeps frontend parsing single-pass, but -it is no longer single-pass object emission. It also needs relocation/label/CFI -plumbing in the emitter buffer. This is effectively a mini recorded native body, -so it is less attractive than option 1. - -### What is not possible - -The `fp_at_bottom` O1 fold needs final `frame_size` before any body memory -operand is emitted, because slot offsets become `frame_size - slot_off`. -Without a pre-scan or body buffering, direct O0 cannot use that layout safely. -For true single-pass direct emission, O0 should keep the top-record layout and -focus on removing reserved padding. - -## Stack traffic and compiler temporaries - -`NativeDirectTarget` already has a write-back local register cache: - -- cacheable locals are scalar, non-address-taken, non-memory-required; -- cache regs are caller-saved allocables only, so calls just flush the cache; -- pure compute destinations use `nd_dst_reg` and stay dirty in a cache reg until - a flush or eviction. - -The cache is underused. 
Several ops still force their scalar result through a -scratch register and immediately store to the frame home: - -- `nd_load` -- `nd_bitfield_load` -- `nd_addr_of` -- `nd_load_label_addr` -- `nd_tls_addr_of` -- `nd_alloca` -- `nd_atomic_load` / `nd_atomic_rmw` / `nd_atomic_cas` -- `nd_intrinsic` -- direct call returns -- parameter binding - -For scalar, non-escaped destination locals, most of these can use -`nd_dst_reg` + `nd_dst_writeback` instead of `nd_dst_scratch` + -`nd_store_operand_from_reg`. That keeps the result in the direct local cache and -avoids a store/reload pair when the value feeds later compute or address ops. - -The safest first wave: - -1. Change scalar `nd_load` to write through `nd_dst_reg`. -2. Change `nd_addr_of`, `nd_load_label_addr`, `nd_tls_addr_of`, and `nd_alloca` - similarly for pointer results. -3. Change `nd_bitfield_load` for scalar results. -4. For direct call returns, allocate cache regs for scalar result locals before - planning the call, let the call plan move return registers into those regs, - then mark the locals dirty after the call. Keep aggregate/sret returns on the - frame path. -5. Bind scalar parameters into cache regs when the local is cacheable; flush if - their address is later taken. - -These are still not a real allocator. They only extend the existing local cache -to more producer kinds. - -## Compiler-temp identity - -The public API has `CFREE_CG_LOCAL_COMPILER_TEMP`, and the Wasm frontend uses it -heavily, but the current semantic `CGLocalDesc` only carries -`CG_LOCAL_ADDR_TAKEN` and `CG_LOCAL_MEMORY_REQUIRED`. The C parser also creates -many temporary locals through `pcg_local` / `cg_local` without setting a -compiler-temp flag. - -Short-term improvement: - -- Add an internal `CG_LOCAL_COMPILER_TEMP` bit to `CGLocalFlag`. -- In `cfree_cg_local` / `cfree_cg_param`, propagate - `CFREE_CG_LOCAL_COMPILER_TEMP` from `CfreeCgLocalAttrs` into `CGLocalDesc`. 
-- Add C-parser helpers for compiler temporaries and use them in - `builtin_tmp_slot`, `ll_tmp_slot`, conditional/compound-assignment stash - temps, EA normalization temps, and value-stack temps from - `api_alloc_temp_local`. - -This flag should not by itself remove the frame home in direct O0. Without -liveness, a temp may still need to survive a branch, call, or cache eviction. -But it can drive policy: - -- do not require debug homes for compiler temps; -- prefer caching compiler temps over anonymous source locals; -- make diagnostics/debug omit them; -- later, allow an explicitly "ephemeral" temp API for values whose lifetime is - stack-top-only and cannot cross barriers. - -## Practical priority - -1. Prologue: implement the small inline region + out-of-line slow prologue, or - first reduce the fixed direct reserve as a low-risk intermediate. -2. Stack traffic: extend the existing cache to scalar load/address/alloca and - bitfield-load producers. -3. Call/param traffic: cache scalar params and scalar direct returns. -4. Metadata: propagate compiler-temp identity through `CGLocalDesc`, then use it - for cache preference and debug/home policy. diff --git a/doc/OPT_O0_PERF_NOTES.md b/doc/OPT_O0_PERF_NOTES.md @@ -1,168 +0,0 @@ -# cfree -O0 runtime gaps vs clang -O0 - -Focused notes for the unoptimized codegen/runtime gap. `doc/OPT_O1_PERF_TODO.md` -tracks O1 work; this file catalogs the O0 shape seen in a small aarch64/Apple -sample against clang O0. - -## Snapshot - -Measured 2026-05-29 on Apple/aarch64 using the MIR `c-benchmarks` sources from -`$HOME/tmp/mir/c-benchmarks`. Each runtime is best of two runs with the -benchmark's checked-in `.arg` and `.expect` files. 
Artifacts: - -- raw timings: `build/bench/o0gap/manual_results.csv` -- object disassembly: `build/bench/o0gap/disasm/{clang,cfree}.O0.*.txt` -- executable disassembly for binary-trees: - `build/bench/o0gap/disasm/{clang,cfree}.O0.binary-trees.exe.txt` - -| bench | clang O0 runtime ms | cfree O0 runtime ms | cfree / clang | object text: clang -> cfree | -| --- | ---: | ---: | ---: | ---: | -| binary-trees | 2720 | 3754 | 1.38x slower | 975 B -> 10459 B | -| hash2 | 8754 | 15138 | 1.73x slower | 3127 B -> 18258 B | -| sieve | 13668 | 12607 | 0.92x faster | 411 B -> 2571 B | -| mandelbrot | 10544 | 13215 | 1.25x slower | 774 B -> 3870 B | - -Geomean over these four: cfree O0 is 1.29x slower than clang O0. - -## Cross-cutting gaps - -### O0 still emits patched-prologue padding - -Every cfree O0 function in this sample starts with a real prologue, then an -unconditional branch over roughly 19 nops: - -```asm -sub sp, sp, #0x60 -add x17, sp, #0x50 -stp x29, x30, [x17] -add x29, sp, #0x50 -b body -nop -... -body: -``` - -This is the old fat-prologue reservation shape that O1's known-frame path fixed. -At runtime it costs one extra unconditional branch per function entry, and -statically it bloats hot functions enough to hurt I-cache locality. In -binary-trees, the recursive hot functions all have it: - -| function | clang O0 insns | cfree O0 insns | -| --- | ---: | ---: | -| NewTreeNode | 18 | 58 | -| ItemCheck | 27 | 73 | -| BottomUpTree | 29 | 82 | -| DeleteTree | 20 | 59 | - -### Compiler-generated temporaries spill as if they were user variables - -cfree O0 preserves a very literal expression lowering: many values are copied -through temps, stored to stack, then immediately reloaded. Clang O0 still uses -stack slots heavily, but it keeps simple expression values in registers across a -single expression. - -Concrete examples: - -- `NewTreeNode`: cfree stores the constant `16` to the frame and reloads it into - `x0` before `malloc`; clang uses `mov x0, #0x10`. 
-- `BottomUpTree(NULL, NULL)`: cfree creates several zero/copy temporaries before - loading `x0`/`x1`; clang does `mov x1, #0; mov x0, x1`. -- `ht_find_new`: cfree is 171 instructions with a 0x130-byte frame; clang is 61 - instructions with a 0x40-byte frame. -- `mandelbrot`: cfree main is 562 instructions with a large temporary area; - clang main is 191 instructions. - -This is an O0-specific quality issue, not necessarily an optimizer issue: keep -debuggable stack locations for source variables, but avoid materializing -compiler-only intermediates as mandatory stack homes. - -### Header inline/helper emission bloats O0 objects - -cfree emits many SDK/header helper functions that clang does not emit into these -objects: - -- `binary-trees` includes `<math.h>` and cfree emits `___inline_isfinite*`, - `___inline_isinf*`, `___inline_isnan*`, `___sincos*`, `___sputc`, - `__OSSwapInt*`, etc. -- `hash2` includes `simple_hash.h`, which includes `<ctype.h>`; cfree emits a - large set of ctype helpers such as `_isalnum`, `_isalpha`, `_isdigit`, - `_tolower`, `_toupper`, etc. -- even `sieve` and `mandelbrot` get `___sputc` / `__OSSwapInt*` helpers. - -Most of this is not on the hot runtime path, but it inflates object text and the -linked image. It also obscures disassembly and compile/link-time measurements. -O0 should not eagerly emit unused static inline/header helpers unless they are -referenced. - -### FP O0 misses cheap instruction selection wins - -The `mandelbrot` escape loop shows a runtime-visible gap: - -- clang uses `fmov` immediates for `1.0` / `2.0` and uses `fmadd` for - `2.0 * Zr * Zi + Ci` even at O0. -- cfree repeatedly materializes FP constants with `mov`/`movk`/`fmov`, spills - intermediate FP values, and emits separate `fmul` + `fadd`. - -These are instruction-selection/local lowering issues rather than global -optimization. They should be safe to improve at O0 without changing source-level -debuggability. 
- -## Per-benchmark notes - -### binary-trees - -Runtime: cfree O0 is 1.38x slower than clang O0. - -The main gap is hot recursive function shape: every call pays the O0 padded -prologue branch and 2-3x the instruction count. The final executable uses stubs -for allocator/libc calls on both compilers, so unlike the O1 note this sample -does not look like a pure external-call-path issue; cfree's function bodies are -substantially larger at O0. - -Priority fixes: - -1. Reuse the known-frame/no-padding prologue path for O0 when the frame is known. -2. Stop stack-homing compiler-only temporaries in the expression-lowering path. -3. Extend the existing call-argument immediate/coalescing work to O0 lowering. - -### hash2 - -Runtime: cfree O0 is 1.73x slower than clang O0, the worst case in this sample. - -The hot helpers are much larger: - -| function | clang O0 insns | cfree O0 insns | -| --- | ---: | ---: | -| ht_hashcode | 30 | 87 | -| ht_find | 37 | 105 | -| ht_find_new | 61 | 171 | -| ht_next | 55 | 142 | - -Both compilers use `udiv` for `val % ht->size` at O0, so the O1 modulo/reciprocal -hypothesis is not the O0 gap here. The O0 gap is mostly stack/copy expansion, -plus eager emission of unused ctype helpers from headers. - -### sieve - -Runtime: cfree O0 is 0.92x of clang O0 in this run, so this is not currently a -runtime gap despite cfree's object being larger. - -The disassembly still shows the cross-cutting O0 problems: padded prologue, -larger frame, repeated constant/base materialization, and extra stack temporaries. -However, the hot loop is simple enough that the extra code did not lose in this -two-run sample. Treat sieve as a regression guard rather than a first target. - -### mandelbrot - -Runtime: cfree O0 is 1.25x slower than clang O0. - -This is the clearest local instruction-selection case. The inner FP loop has -substantial temporary spills, repeated FP constant construction, and misses -`fmadd`. 
Clang O0 is still unoptimized, but its local FP lowering is compact -enough to matter here. - -Priority fixes: - -1. Use encodable FP immediates directly where available. -2. Select `fmadd` for multiply-add expression trees during lowering. -3. Avoid spilling FP expression intermediates that have no source-level storage. diff --git a/doc/OPT_O1_PASSES.md b/doc/OPT_O1_PASSES.md @@ -1,412 +0,0 @@ -# O1 Optimizer Pass Pipeline - -A reference for cfree's `-O1` lowering pipeline as it currently exists: the -entry path, what each pass does, the key data structures, and the -`NativeTarget` backend contract the emit step relies on. Intended as a map for -performance work (see `doc/OPT_O1_PERF_TODO.md` for the active runtime targets). - -Scope: **aa64 only**. The aarch64 backend is the only `NativeTarget` -implementation; x64/rv64 `NativeTarget` ports do not exist. The O0 path uses a -separate backend (`NativeDirectTarget`, §15) and is not covered here beyond the -boundary. - -History note: this pipeline replaced an older `CGTarget`-replay design (commit -`f60a16d`). The transitional "direct-replay bypass" — which sent functions with -alloca/varargs/asm/aggregates straight to unoptimized emission — **has been -deleted**. Every function now goes through the full pipeline below -unconditionally; varargs, inline asm, and aggregate/sret/byval are all handled -on the optimizer path. - ---- - -## Entry path - -`opt_cgtarget_new(level >= 1)` installs the optimizer as the CG sink. The C -frontend drives `CgIrRecorder`, which records each function as a `CgIrFunc` -tape. On function completion, `opt_on_func` (src/opt/opt.c) runs: - -1. `opt_func_from_cg_ir` (src/opt/cg_ir_lower.c) — convert the recorded - `CgIrFunc` into the optimizer's `Func` CFG/IR, classifying each local as - register (`PReg`) or frame storage (§1). -2. `opt_run_o1_native` (src/opt/opt.c) — run the pass sequence on `Func`. - -There is no per-function or per-op bypass. 
- ---- - -## Pipeline overview - -`opt_run_o1_native`, in order (verify/dump steps interleaved): - -``` -opt_build_cfg ┐ CFG build + cleanup -opt_jump_cleanup(CFG) │ -opt_build_cfg │ -opt_simplify_local ┘ -opt_try_tiny_inline streaming inline of tiny callees recorded earlier -opt_machinize_native ABI lowering (call/ret/param constraints) -opt_addr_xform_pregs fold ADDR_OF(local) → OPK_LOCAL in load/store -opt_promote_scalar_locals non-escaped frame slot → PReg -opt_addr_of_global_cse hoist duplicate ADDR_OF(global) to entry -opt_build_loop_tree loop nesting (informs spill cost) -opt_hoist_loop_consts hoist single-def LOAD_IMM (imm!=0) in loops to entry -opt_live_blocks per-block PReg liveness -opt_dead_def_elim_with_live pre-RA DCE -opt_regalloc_locations PReg → hard reg / spill slot -opt_lower_to_mir materialize hard regs; insert spill/reload -opt_mir_combine MIR peephole (substitute/addr-synth/sink/ext) -opt_mir_dce post-RA DCE -opt_mir_jump_cleanup(CFG) + build_cfg unreachable/jump-chain cleanup -opt_mir_jump_cleanup(LAYOUT) reorder for fallthrough + rotate loops + invert -opt_emit_native emit to NativeTarget (minimal prologue on opt path) -``` - -An `opt_verify` / `opt_mir_verify` runs after most stages (tags: `lowering-cfg`, -`lowering-machinize`, `o1-addr-xform`, `o1-promote-scalar`, -`o1-addr-global-cse`, `post-regalloc`, `lower-mir`, `post-mir-combine`, -`post-mir-dce`, `post-mir-jump-cfg`). `CFREE_DUMP=<tag>` dumps the IR at the -matching stage (`entry` before any pass, `pre-emit` just before emit); see -Debugging aids. - ---- - -## §1 — opt_func_from_cg_ir (CgIrFunc → Func) - -**Source**: src/opt/cg_ir_lower.c - -Converts the recorded tape into the optimizer `Func` and classifies locals in -`lower_locals`: - -- **Address-taken or memory-required** locals (`local_needs_home`: - `in->address_taken`, `CG_LOCAL_ADDR_TAKEN`, `CG_LOCAL_MEMORY_REQUIRED`), and - **aggregate / >8-byte** locals, get `CG_LOCAL_STORAGE_FRAME` → `OPK_LOCAL`. 
-- All other locals get `CG_LOCAL_STORAGE_REG` → a fresh `PReg` → `OPK_REG`. - -Address-taken locals start in frame storage; the addr-fold + promotion passes -(§4, §5) recover register storage for those whose address does not truly escape. - -va_list operands are lowered as opaque pointer *values* (a `NativeLoc`), not -flagged address-taken — all va-layout knowledge lives behind the `NativeTarget` -va hooks. Aggregate locals are forced to frame. - ---- - -## §2 — opt_build_cfg / opt_jump_cleanup(CFG) / opt_simplify_local - -**Sources**: src/opt/pass_cfg.c, src/opt/pass_jump.c, src/opt/pass_simplify.c - -Build the CFG, drop unreachable blocks and trivially-foldable branches, and fold -single-successor chains. Runs before `opt_machinize_native` so ABI lowering sees -an accurate live/critical-edge picture. `opt_build_cfg` runs twice with a jump -cleanup between to settle the shape. - ---- - -## §3 — opt_machinize_native - -**Source**: src/opt/pass_machinize.c - -ABI lowering against the `NativeTarget`. Annotates call/ret/param instructions -with calling-convention constraints (argument/result registers, the call -clobber mask, callee-save markers) so the register allocator knows which hard -registers are killed across each call. Collects the target's register classes -(`machinize_collect_regs`): the allocable set, the reserved scratch set, and the -caller/callee-saved masks (`collect_class`), and checks allocable and scratch -sets do not overlap (`machinize_check_overlap`). - ---- - -## §4 — opt_addr_xform_pregs - -**Source**: src/opt/pass_addr_fold.c - -Folds `IR_ADDR_OF(local)` whose result pointer is used only as the base of -non-observable zero-offset, no-index `IR_LOAD`/`IR_STORE`. When all uses -qualify, the `IR_ADDR_OF` is removed and each use is rewritten to -`OPK_LOCAL(slot)`. Partial folding leaves the `IR_ADDR_OF` alive for any -effective-address-shaped uses. When a slot's `IR_ADDR_OF` defs are all retired, -`FSF_ADDR_TAKEN` is cleared so §5 can promote it. 
- ---- - -## §5 — opt_promote_scalar_locals - -**Source**: src/opt/pass_addr_fold.c - -For each `FS_LOCAL` slot without `FSF_ADDR_TAKEN`/`FSF_VOLATILE` whose every -`OPK_LOCAL(slot)` appearance is a matching-type non-observable `IR_LOAD.opnds[1]` -or `IR_STORE.opnds[0]`, replaces the slot with a fresh `PReg`: stores become -`IR_COPY preg, src`, loads become `IR_COPY dst, preg`. This pulls scalar locals -out of memory into the register namespace before allocation. - -**Limitation (perf):** `promote_inst_classify` rejects a slot if `OPK_LOCAL` -appears anywhere else (call aux, `IR_ADDR_OF` operand, any other instruction). -An address-taken local whose address is stored into a pointer and later -dereferenced is not promoted (would need ADDR_OF copy-propagation through -PRegs); genuinely aliased locals correctly stay in memory. - ---- - -## §6 — opt_addr_of_global_cse - -**Source**: src/opt/pass_addr_fold.c - -CSE for `IR_ADDR_OF(global{sym, addend})`: when the same (sym, addend) appears -in multiple defs, materialize one in the entry block (after the -`IR_PARAM_DECL` prologue) and reuse it, collapsing per-iteration `adrp`/`add` -pairs in loops. - ---- - -## §7 — opt_build_loop_tree - -**Source**: src/opt/pass_loop.c - -Computes loop nesting depth, used as a spill-cost weight by the register -allocator (values live across deep loops are preferred for hard registers). - ---- - -## §7.5 — opt_hoist_loop_consts - -**Source**: src/opt/pass_addr_fold.c - -LICM-lite for constant materialization, mirroring `opt_addr_of_global_cse`'s -hoist-to-entry pattern for `LOAD_IMM`. For each distinct `(imm, type, cls)` -that appears inside a loop block (`loop_depth > 0`), allocates a canonical -PReg, emits one `LOAD_IMM` in the entry block (after `IR_PARAM_DECL`s), NOPs -the in-loop and any other defs of that key, and remaps every use to the -canonical PReg. Without this the backend re-materializes the constant -(`movz`/`movk`) on every iteration. 
- -Safety: only hoists `LOAD_IMM` whose def PReg is **single-defined** (the -LOAD_IMM is its only def). The PReg form here is non-SSA, so a mutable promoted -local initialized to a constant (e.g. a loop counter `i = 1`) is multiply -defined; remapping it would discard the increment defs and break the loop. -Skips `imm == 0` so the backend's store-zero-register path (see §14) keeps the -cheaper `strb wzr, …` form for stored zeros. - ---- - -## §8 — opt_live_blocks + opt_dead_def_elim_with_live - -**Sources**: src/opt/pass_live.c, src/opt/pass_lower.c - -`opt_live_blocks` computes per-block PReg liveness bitmaps via backward -dataflow. `opt_dead_def_elim_with_live` sweeps each block backward and removes -any side-effect-free instruction whose defined PRegs are all dead at its output. -Running before allocation shrinks live ranges and the allocable set. - ---- - -## §9 — opt_regalloc_locations - -**Source**: src/opt/pass_lower.c - -Point-bitmap linear-scan allocator. A `used_locs[p * loc_words + w]` bitmap -tracks (PReg, location) conflicts; for each PReg in program order it picks the -first non-conflicting hard register, else a spill slot. The allocable set is the -target's `allocable` list (aa64: caller-saved first so they're preferred, then -callee-saved under pressure). It never assigns a reserved scratch register. - ---- - -## §10 — opt_lower_to_mir - -**Source**: src/opt/pass_lower.c - -Rewrites the PReg `Func` into a hard-register MIR `MFunc` (`f->mir`): every PReg -operand becomes its assigned hard register or spill slot. Spilled PRegs get an -`IR_LOAD` (reload) before each use and an `IR_STORE` (spill) after each def. -`f->blocks` is updated to the rewritten arrays. From here the IR is non-SSA -(physical registers, multiply defined). - ---- - -## §11 — opt_mir_combine - -**Source**: src/opt/pass_combine.c - -MIR peephole, per-block fixpoint, four rewrites: - -1. 
**Substitute** — propagate a copy `r1 ← r2` (or `r1 ← imm` / convert) into - later uses of `r1`, removing redundant moves. The reg-source form is full - local copy propagation: it rewrites every safe use of `r1` to `r2` (gated - only on `r2` being unchanged between the copy and the consumer, which - `ctx_def_changed_since` already verifies with call clobbers folded in); the - stricter single-use + live-out gate is kept only for the immediate/convert - forms, which would otherwise duplicate a multi-instruction materialization. -2. **Addr-mode synthesis** — fold `r ← addr_of`/`r ← base+index*scale` into a - load/store's `OPK_INDIRECT` base/index/scale/offset where the backend - accepts it. -3. **Sink** — move a def adjacent to its sole use to shrink its live range. -4. **Ext-chain** — fold ext-of-ext and ext feeding an instruction that already - extends. - -`opt_combine_compact_block` additionally merges adjacent spill/reload pairs. - -Because the MIR is non-SSA, each rewrite is gated on a **live-range safety -check**: `count_uses_in_live_range` counts uses of the producer's register up to -its next redefinition in the block, and a producer that is **live-out of the -block** (per `opt_block_live_out_has_phys_reg`, backed by the `hard_live` -analysis) is not folded away. Register kills are computed from each -instruction's precise def/clobber set (`inst_kills_phys_reg` → -`opt_hard_inst_use_def`), so a call kills only its caller-saved clobbers — a -value in a callee-saved register survives it. - ---- - -## §12 — opt_mir_dce - -**Source**: src/opt/pass_dce.c - -Post-RA dead-def elimination over hard-register IR: removes side-effect-free -instructions whose defined registers are never used, cleaning up copy chains -substitution left behind. Uses the same `hard_live` machinery as §11. 
- ---- - -## §13 — opt_mir_jump_cleanup(CFG) + build_cfg + jump_cleanup(LAYOUT) - -**Source**: src/opt/pass_jump.c, src/opt/pass_cfg.c - -- `OPT_JUMP_CLEANUP_CFG`: drop unreachable blocks, collapse unconditional-jump - chains. -- `opt_mir_build_cfg`: recompute successor lists. -- `OPT_JUMP_CLEANUP_LAYOUT`: `cleanup_reorder_for_fallthrough` greedily orders - blocks so the hot taken edge falls through, then `cleanup_rotate_loops` - detects simple single-latch loops (header with `IR_CMP_BRANCH`, fallthrough - into the body chain, one back-edge predecessor later in emit order) and - moves the header to just after its latch, inverting the test in place. The - back-edge becomes the conditional branch and the latch falls through to the - header — the per-iteration unconditional back-jump is gone. `cleanup_invert_ - taken_fallthrough` + `cleanup_layout_fallthrough_branches` then strip - trailing `b`-to-next-block and invert any remaining mis-aligned conditional. - ---- - -## §14 — opt_emit_native - -**Source**: src/opt/pass_native_emit.c - -Replays the MIR into a `NativeTarget` using `NativeLoc` as the operand currency -(not raw `Reg`). Key helpers: - -- **`loc_from_operand`** — `OptOperand` → `NativeLoc`: `OPT_OPK_REG` → - `NATIVE_LOC_REG`, `OPT_OPK_LOCAL` → `NATIVE_LOC_FRAME`, `OPT_OPK_IMM` → - `NATIVE_LOC_IMM`, `OPT_OPK_INDIRECT` → `NATIVE_LOC_ADDR`, etc. -- **`reserve_callee_saves`** — scans the lowered MIR for the callee-saved - registers the allocator actually used and hands them to the backend's - `reserve_callee_saves` hook *before* frame-slot mapping, so the prologue saves - exactly that set. -- **`map_frame_slots`** — pre-allocates every `IRFrameSlot` via - `target->frame_slot` in one scan. -- **`emit_prologue` (minimal prologue)** — once the callee-save set and frame - slots are known the opt path calls this hook on backends that support it - (aa64) and sets `target->emit_minimal_prologue = 1` so `func_begin` skips its - worst-case NOP reservation. 
The backend emits an exact-size contiguous - prologue in place (no reserved region); the frame-size immediates are still - patched in `func_end` because the final frame isn't known until body emission - allocates its temporaries. `NativeDirectTarget` (O0) leaves `emit_prologue` - unset and keeps the reserve-and-patch path, which now branches over the - unused tail rather than NOP-filling it. -- **call result routing** — for scalar return values `emit_call` now passes the - MIR's destination location (a register or its spill slot) straight to - `plan_call`, so the backend moves the ABI result register directly to its - destination. The old code routed every scalar result through a fresh temp - spill slot (`bl; stur [slot]; ldur`), a round trip on every call; aggregates / - oversized results still use the temp slot path. -- **store-zero register** — when the value operand of an `IR_STORE` is the - integer immediate 0 and the backend advertises a hardware zero register - (`has_store_zero_reg` + `store_zero_reg`), the store goes straight from that - register instead of materializing `movz w,0` into a scratch first - (`strb wzr, …` on aa64). -- **tail-call realizability** — `opt_on_tail_call_unrealizable_reason` returns - a reason when the callee's outgoing stack-arg bytes exceed the area this - function received for its own incoming stack args (via the backend's - `signature_stack_bytes` hook). CG then falls back to an ordinary call + - return for an ALLOWED tail, or panics for `musttail`. -- **`materialize`** — loads a non-register `NativeLoc` into a reserved scratch - register when an instruction needs a register operand. -- **`legalize_addr` / `collapse_addr_to_reg`** — if `target->addr_legal` - rejects an address shape, materialize it into a **reserved scratch register** - via `load_addr` (never the base register — the base may hold a value the - allocator keeps live past this op). 
- ---- - -## Backend contract (`NativeTarget`, aa64) - -The emit step depends on these invariants; performance work that touches the -allocator or emit must preserve them. - -- **2-scratch discipline.** An emit hook may use at most the **2 reserved - scratch registers** as private temporaries (aa64 int `x9`/`x10`; the hand- - written-PLT temps `x16`/`x17` = TMP0/TMP1 are reserved separately and used by - address-materialization helpers). Every other register an op needs — operand - bases, results — is **provided by the caller**. The optimizer owns all - allocation; ops clobber nothing the allocator tracks live. -- **Allocable set (aa64).** Int: caller-saved `x8, x11..x15` first (preferred, - low spill cost), then callee-saved `x19..x28` under pressure. FP: caller-saved - `v18, v19` then callee-saved `v8..v15`. Scratch: int `{x9, x10}`, FP - `{v20, v21}` — disjoint from allocable. **Argument registers `x0..x7`/`v0..v7` - are *not* allocable** (see limitations). -- **No `plan_hard_regs`/`reserve_hard_regs`.** Because only caller-saved (plus - optionally-reserved callee-saved) registers are allocable and every emit hook - receives fully-resolved hard regs, the backend never needs an up-front - hard-reg plan. Callee-save planning happens through `reserve_callee_saves`. -- **Shared cores.** aa64 ops are factored so the semantic-operand path - (`NativeDirectTarget`, O0) and the `NativeLoc` path (this pipeline, O1) call - one shared core; the wrappers only convert operands. - ---- - -## §15 — O0 path (boundary) - -At `opt_level == 0` the optimizer is not installed; the backend's -`NativeDirectTarget` (src/cg/native_direct_target.c) emits directly in a single -pass. It is a separate, simpler lowering with its own (documented) limitations -and is not part of this pipeline. - ---- - -## Debugging aids - -- `CFREE_DUMP=<tag>` — dump the optimizer IR at a named stage and panic - (`entry`, `pre-emit`, or any `opt_verify` tag above). `CFREE_DUMP=1` dumps at - `entry`. 
Add an `opt_dbg_dump(o, f, "<tag>")` call next to an existing - `opt_verify` to expose another stage. -- `CFREE_DUMPCG=1` — dump the semantic CG IR tape (pre-lowering). Note the CG-IR - dumper does not print `OPK_INDIRECT` offsets. -- Both panic on the first recorded function; to dump all functions, temporarily - swap the `compiler_panic` in `opt_dbg_dump` / `opt_dbg_dump_cg` for - `cfree_debug_printf`. -- Disassembly: `cfree objdump -d <obj>` or `lldb -b -o "disassemble -n <fn>"`. - ---- - -## Known limitations affecting performance - -Feeds `doc/OPT_O1_PERF_TODO.md`. Recovery of the new pipeline is functionally -complete (every function optimized; varargs/asm/aggregates handled; callee-saved -allocable; params no longer round-trip through a frame home). Remaining -quality gaps: - -- **Argument registers not allocable.** `x0..x7`/`v0..v7` cannot hold - allocator-managed values, shrinking the allocable set and forcing extra spills - in register-pressured code. An "entry-only" subset was attempted and reverted - as unsound: making arg regs allocable simultaneously exposes a parallel-move / - shuffle hazard across four ABI paths (call-arg setup, tail-call arg setup, - function entry / param materialization, multi-result return). The correct fix - is one general parallel-move-with-memory sequencer (cycle-break via reserved - scratch) applied uniformly to all four, gated by permutation/swap stress - tests. -- **Address-taken-local promotion gap.** A local whose address is stored into a - pointer variable and later dereferenced stays in frame storage even when it - does not truly escape (§5). Needs ADDR_OF copy-propagation through PRegs. -- **`BREAK_TO` / `CONTINUE_TO` + SCOPE cond** are unwired in emit (no current - frontend produces them — toy/C lower break/continue to `BR` + labels). Wire - emit or lower to CFG edges once a producer exists. 
-- **Unit-test coverage** for the `CgIrFunc`→`NativeTarget` path (local - promotion, addr-fold, regalloc, lowered bypass ops) is thin; the old - `test/opt/opt_test.c` uses the pre-refactor API and is disabled. diff --git a/doc/OPT_O1_PERF_TODO.md b/doc/OPT_O1_PERF_TODO.md @@ -1,214 +0,0 @@ -# cfree -O1 performance TODO - -Working list of benchmarks where cfree's `-O1` codegen leaves clear runtime on -the table. These are the high-confidence targets pulled from the full -15-benchmark `scripts/opt_bench.sh` sweep (aarch64, Apple). - -## The bar - -cfree `-O1` runtime should be **at least 10% faster** than *both* reference -points — i.e. cfree runtime ≤ 0.90× theirs (speedup ≥ 1.10×): - -- **gcc-15 -O0** — an unoptimized native baseline; losing here means we trail - even a compiler doing no real optimization. -- **mir-c2m -O1** — a fast lightweight JIT-class optimizer; a peer we should - match or beat. - -A benchmark stays on this list until it clears 1.10× against both (where the -reference exists). - -## Current standings - -Numbers below are a single 3-run sweep on aarch64/Apple (`COMPILE_REPEATS=3`, -`RUN_REPEATS=3`, best-of), **refreshed 2026-05-28** after the `fp_at_bottom` -prologue fold; runtime in ms; speedup = reference_time / cfree_time (>1 means -cfree is faster). gcc-O0 and mir-O1 columns come from the cached baseline in -`scripts/opt_bench_baseline.csv`; regenerate with -`CFREE_OPT_BENCH_MODE=baseline scripts/opt_bench.sh`. 
- -| bench | cfree -O1 | gcc -O0 | vs gcc-O0 | mir -O1 | vs mir-O1 | behind | -| --- | ---: | ---: | ---: | ---: | ---: | --- | -| binary-trees | 2939² | 2647 | **0.90×** (slower) | n/a¹ | — | gcc (cc path) | -| lists | 4015 | 8842 | 2.20× ✓ | 4988 | 1.24× ✓ | — (clears bar) | -| hash2 | 4915 | 7399 | 1.51× ✓ | 3857 | **0.78×** | mir | -| sieve | 4880 | 5032 | 1.03× (~tied) | 4170 | **0.85×** | gcc (~tied), mir | -| mandelbrot | 3655 | 10319 | 2.82× ✓ | 3332 | 0.91× | mir | -| strcat | 5906 | 5971 | 1.01× (~tied) | 5772 | 0.98× | both (~tied) | - -Geomean over the six (cfree -O1 cc): **1.44× faster than gcc -O0**, **0.86× vs -mir -O1** (mir-c2m geomean excludes binary-trees, which it can't compile). - -¹ mir-c2m fails to compile `binary-trees`, so only the gcc comparison applies. - -² The `fp_at_bottom` fold landed (−2 fixed insns/call; codegen now byte-for-byte -the same prologue/epilogue shape as gcc -O0, and the four hot functions are each -smaller than gcc -O0) but the **cc-linked runtime barely moved** (2973 → 2939): -binary-trees is malloc/free-bound, so per-call ALU savings are noise. The **JIT -path is at parity** — `cfree-run -O1` = 2642 ms (~tied with gcc 2647), vs the cc -binary's 2939. That cc-only deficit (identical codegen) points at PLT/GOT -indirection for `malloc`/`free` — ~15M dynamic-symbol calls routed through -stubs — as the real remaining cost on the cc path, not codegen. - -## Per-benchmark notes - -### binary-trees — cc path slower than unoptimized gcc; JIT at parity -cfree `-O1` (cc) is 0.89× gcc -O0 — but this is **no longer a codegen gap**. -After the `fp_at_bottom` prologue fold (item 2, DONE) the four hot functions -(`NewTreeNode` 12, `ItemCheck` 18, `BottomUpTree` 21, `DeleteTree` 16 insns) are -each *smaller* than gcc -O0 (16/21/24/18) and share gcc's exact -prologue/epilogue shape, yet cc runtime didn't move (2973 → 2969). 
The workload -is malloc/free-bound: four tiny functions called ~7.6M times at depth=19 with a -`malloc`/`free` per node, so the ~15M instructions removed are dwarfed by -allocator cost. Evidence it's *not* codegen: the JIT path (`cfree-run -O1`, -identical codegen, no PLT) is ~tied with gcc (2617 vs 2647). The remaining cc -deficit is the dynamic-call path to `malloc`/`free` through PLT/GOT stubs — the -next thing to chase here, if anything (e.g. `-fno-plt`-style direct calls, or -the linker resolving intra-image calls). The **body** quality was never the -problem: cfree keeps the recurring pointer in a callee-save (x19) where gcc -O0 -spills and reloads it three times per call. - -Open items, in priority order (most recent disasm in -`/tmp/mc/binary-trees.cfree.o`): - -1. **Useless leading `b PC+4` at every function entry. [DONE]** Fixed by the - known-frame prologue rework (frame planning in `opt_emit_native` → - `func_begin_known_frame`). The branch was *not* the empty entry block — it - was filler left by the old two-phase prologue: `aa_emit_prologue` sized a - *fat* prologue (slim eligibility wasn't decided until `func_end`), then the - `func_end` patch rebuilt it as the shorter slim form and back-filled the - leftover words with `b PC+N; nop`. The known-frame path decides slim - eligibility *before* emitting, so it emits the slim/slim_small_frame prologue - directly — no filler, no patch. Functions now fall straight from the prologue - into the body. **-1 to -2 insns at entry, every function.** - -2. **Prologue compaction: 4-insn → 2-insn pre-indexed. [DONE]** Implemented as - the `fp_at_bottom` tier in `src/arch/aa64/native.c`: a known-frame (-O1) small - frame with callee-saves and no outgoing stack args moves the frame record to - the bottom (fp = sp), folding the sp adjustment into the pre-indexed - `stp x29,x30,[sp,#-N]!` entry and post-indexed `ldp x29,x30,[sp],#N` exit; - callee-saves stack above the record at positive offsets (scaled `str`/`stp`). 
- Entry 4→3, exit 3→2 insns (−2/call), matching gcc -O0's shape. CFA becomes - `fp + frame_size`. See `doc/PERCALL.md`. Removed ~15M instructions at - depth=19 — but binary-trees runtime is malloc-bound, so the cc-path runtime - was unchanged (see the section intro). The win is real for code size and for - call-heavy, *non*-allocation-bound workloads. - -3. **Zero materialized through a temp in `BottomUpTree` leaf path.** - `NewTreeNode(NULL, NULL)` still emits: - ``` - c44: mov x8, #0x0 - c48: mov x0, x8 - c4c: mov x8, #0x0 - c50: mov x1, x8 - c54: bl _NewTreeNode - ``` - Should be `mov x0, #0; mov x1, #0`. The sibling fix in `9ac2416` got the - `ldr` → call-arg case, but `IR_LOAD_IMM` sources don't seem to participate - in the ABI aliasing-hint propagation in `pass_lower.c`. Likely a small - extension to `set_preg_pref_for_call_args` / `propagate_hint_through_copies` - to also fire when the source op is `IR_LOAD_IMM`. **+2 insns × 524k leaf - calls.** - -4. **Trailing `b A; A: b B` pair in `DeleteTree`'s if/else merge. [GONE]** No - longer present as of the 2026-05-28 disasm: `DeleteTree`'s `b.eq` now targets - the epilogue directly, one clean branch, 16 insns total (vs gcc's 18). Either - a `pass_jump.c` change or the layout shift resolved it; re-add to this list - only if it reappears. - -These items are all resolved or no-impact for binary-trees: the four functions -are now smaller than gcc -O0 (12/18/21/16 vs 16/21/24/18). The remaining -cc-path runtime gap is allocator/PLT cost, not codegen — see the section intro. - -### mandelbrot — 0.91× vs mir (close to the bar) -Inner loop is FP-heavy (`Tr*Tr + Ti*Ti < 4.0` Mandelbrot escape test + -4 fmuls + 2 fadds per iter). Hasn't been deeply investigated since the -recent codegen batch. Worth disassembling the hot loop and comparing -against mir to see what specifically is still on the table — likely some -combination of FP register allocation, vectorization (which we don't do), -and constant-pool material. 
- -### strcat — 0.98× vs mir, ~tied with gcc -Small gap; not worth standalone investigation yet. Should naturally -absorb any remaining cross-cutting wins. - -### hash2 — 0.78× vs mir (still the worst against mir) -The previously-noted hoisting and strength-reduction wins landed (and -moved hash2 from 0.72× to 0.78×), but mir is still ~1.27× faster. Remaining -gap is in the parts of the loop the prior items didn't touch — most -likely the modulo `val % ht->size` (mir probably emits a Barrett/reciprocal -multiply for the small-divisor case where we still emit `udiv`) and the -`strcmp` probe shape. Worth a fresh disassembly read of `ht_hashcode` and -the probe loop in `ht_find_new` against mir's output. - -### sieve — 0.85× vs mir, ~tied with gcc -Improved (5148 → 4880 ms; 0.78× → 0.85× vs mir). Loop-invariant `movz` and IV -copies are gone; remaining gap is structural. mir is ~1.18× faster on the same -loop shape (`flags[k] = 0` strided store + `flags[i] = 1` init). Candidate gaps: -address-mode folding into the store (using `[x19, x8]` vs `add` + bare addr), -and whether mir is auto-vectorizing the init loop. - -### lists — CLEARED (1.24× vs mir, 2.20× vs gcc) -Was 1.03× vs mir; now 4015 ms (down from 4843), clearing the 1.10× bar against -both references. Doubly-linked list traversal + splice. No longer a tracked -target — kept here for the record; drop on the next cleanup if it holds. - -## Cross-cutting fixes (open) - -These help several benches. Not all are complete; the binary-trees items above -are the most concrete tests for whether each is complete. - -1. **Drop the leading `b PC+4` at function entry. [DONE]** See binary-trees - item 1. Resolved by the known-frame prologue (the optimizer plans the whole - frame up front, so the prologue is emitted final in its slim form rather than - fat-then-patched). Affected every cfree-compiled function. - -2. **Compact FP-frame prologue/epilogue. [DONE]** See binary-trees item 2. 
- The fp-at-bottom layout (`fp_at_bottom` tier) now folds the sp decrement and - fp/lr save into one pre-indexed `stp x29,x30,[sp,#-N]!` (and post-indexed - `ldp` on exit) for known-frame small frames with callee-saves and no outgoing - stack args. −2 fixed insns/call. Helps code-heavy benches; runtime-neutral on - allocation-bound ones like binary-trees. - -3. **Hard-register copy coalescing for `IR_LOAD_IMM` sources.** See - binary-trees item 3. The hint-propagation path covers `ldr` → call-arg - but skips immediates. - -4. **Jump-thread the `b A; A: b B` shape.** See binary-trees item 4. - General `pass_jump.c` cleanup, not bench-specific. - -## Reproducing - -```sh -# Build the optimized compiler first (clean release): -rm -rf build/release && make RELEASE=1 bin - -# Run the still-open or stale-number benches with 3 repeats (best-of). The -# default mode measures only cfree (fast iteration) and the trailing compare -# step automatically pulls gcc/mir numbers from the cached -# scripts/opt_bench_baseline.csv — no need to re-run the fixed compilers: -CFREE="$PWD/build/release/cfree" \ -CFREE_OPT_BENCHES="binary-trees lists hash2 sieve mandelbrot strcat" \ -CFREE_OPT_BENCH_LEVELS="0 1" \ -CFREE_OPT_BENCH_COMPILE_REPEATS=3 CFREE_OPT_BENCH_RUN_REPEATS=3 \ -bash scripts/opt_bench.sh -``` - -The comparison against the cached baseline prints at the end of the run; re-run -it standalone any time with `python3 scripts/opt_bench_compare.py`. Only -regenerate the baseline cache (the `CFREE_OPT_BENCH_MODE=baseline` command -above) when the host, the reference compilers, or the benchmark sources change. - -Per-iteration codegen for a single function is easiest to inspect via the -optimizer's staged IR dump (`CFREE_DUMP=pre-emit cfree cc -O1 -c bench.c ...`, -which panics after printing the pre-emit MIR) plus `objdump`/`lldb` -disassembly of the hot function. 
- -## Notes / caveats - -- The cfree -O1 column is a single 2026-05-28 sweep (consistent revision); the - gcc/mir baseline columns are the cached `opt_bench_baseline.csv`. Re-run with - the same `COMPILE_REPEATS=3 RUN_REPEATS=3` after a change to confirm movement. -- `cfree-run` (JIT) shares this codegen, so its runtimes track `cfree cc -O1` — - except where the cc binary pays link-time overhead the JIT doesn't (e.g. - binary-trees: JIT is ~10% faster than the cc binary on the same code, the - PLT/GOT `malloc`/`free` indirection noted above). diff --git a/doc/OPT_PERF.md b/doc/OPT_PERF.md @@ -1,828 +0,0 @@ -# OPT Performance Tracking - -This document tracks optimizer performance, benchmark coverage, and the current -gap to MIR and GCC. The optimizer design and pass order live in `doc/OPT.md`. - -## How To Run - -Primary harness: - -```sh -make bench-opt -``` - -Useful focused runs: - -```sh -CFREE_OPT_BENCHES="sieve spectral-norm" make bench-opt -CFREE_OPT_BENCH_LEVELS="1 2" CFREE_OPT_BENCH_RUN_REPEATS=5 make bench-opt -GCC=/opt/homebrew/bin/gcc-15 MIR_DIR=~/tmp/mir make bench-opt -``` - -The harness compares: - -- `gcc-15` -- `clang` -- `cfree cc` -- `cfree run` -- MIR `c2m` - -It writes `results.csv`, per-case logs, binaries, and `summary.md` under -`build/bench/opt/` or `CFREE_OPT_BENCH_OUT`. - -## Reporting Model - -The summary reports compile time in two complementary ways. - -1. Unified geomean ratios - - Compile-speed ratios use `compile_ms + codegen_ms` where both are - available. For MIR, this combines C-to-binary-MIR time and JIT link/generate - time so the headline number is comparable to `cfree-run` as a whole. - -2. Split compile timings - - For MIR, `compile_ms` is C-to-binary-MIR time and `codegen_ms` is the - `c2m -v` link/JIT generation slice. This table explains whether a - compile-time difference comes from C parsing/lowering or from the optimized - generation pipeline. 
- -For `cfree-run`, `--bench-time` currently reports compile+JIT as one slice. -Use `cfree run --time` for finer-grained local investigations. The benchmark -CSV does not yet split cfree frontend, optimizer, link, and JIT into separate -columns. - -MIR's benchmark binary is an optimized host executable. On the current Darwin -machine, `/Users/ryan/tmp/mir/c2m` is built by MIR's makefile with -`-O3 -DNDEBUG`. - -## Current Representative Run - -Date: 2026-05-22. - -Scope: - -- `array` -- `binary-trees` -- `hash` -- `hash2` -- `matrix` -- `nbody` -- `sieve` -- `spectral-norm` - -Levels: `0 1 2`. - -Repeats: one compile repeat and one run repeat. - -Output: - -- `build/bench/opt/results.csv` -- `build/bench/opt/summary.md` - -Coverage: - -| status | rows | -| --- | ---: | -| OK | 111 | -| COMPILE_FAIL | 9 | -| RUN_FAIL | 0 | -| OUTPUT_FAIL | 0 | - -All `cfree` and `cfree-run` rows completed successfully, including the -previously broken `matrix -O2`, `binary-trees`, `nbody`, and `spectral-norm` -rows. The remaining `COMPILE_FAIL` rows are all MIR `c2m` on the Darwin -hosted/math benchmarks. - -## Latest Focused O0/O1 Refresh - -Date: 2026-05-25. - -Scope: - -- `array` -- `hash` -- `hash2` -- `matrix` -- `sieve` - -Levels: `0 1`. - -Repeats: one compile repeat and three run repeats. - -Command: - -```sh -CFREE_OPT_BENCH_LEVELS="0 1" \ -CFREE_OPT_BENCHES="array matrix hash hash2 sieve" \ -make bench-opt -``` - -Output: - -- `build/bench/opt/results.csv` -- `build/bench/opt/summary.md` - -Coverage: - -| status | rows | -| --- | ---: | -| OK | 50 | - -Because this was a focused O0/O1 run, the generated summary does not include -gcc `-O2` base rows and therefore reports base-relative speed ratios as `NA`. 
-The generated summary averages are: - -| tool | opt | cases | avg compile+codegen ms | avg runtime ms | -| --- | ---: | ---: | ---: | ---: | -| `gcc-15` | `O0` | 5 | `413.780` | `8371.356` | -| `gcc-15` | `O1` | 5 | `408.000` | `3142.126` | -| `clang` | `O0` | 5 | `188.181` | `8858.075` | -| `clang` | `O1` | 5 | `182.725` | `3168.854` | -| `mir-c2m` | `O0` | 5 | `139.286` | `5608.600` | -| `mir-c2m` | `O1` | 5 | `138.051` | `5044.000` | -| `cfree` | `O0` | 5 | `68.995` | `8488.662` | -| `cfree` | `O1` | 5 | `67.520` | `4276.252` | -| `cfree-run` | `O0` | 5 | `27.630` | `8766.237` | -| `cfree-run` | `O1` | 5 | `28.935` | `4156.487` | - -Direct ratios from this run: - -| comparison | compile-time ratio | runtime ratio | -| --- | ---: | ---: | -| `cfree O1` vs `MIR O1` | `0.489x` time, `2.04x` faster | `0.848x` time, `1.18x` faster | -| `cfree O1` vs `gcc-15 O0` | `0.163x` time, `6.13x` faster | `0.511x` time, `1.96x` faster | -| `cfree-run O1` vs `MIR O1` | `0.210x` time, `4.77x` faster | `0.824x` time, `1.21x` faster | -| `cfree-run O1` vs `gcc-15 O0` | `0.070x` time, `14.30x` faster | `0.497x` time, `2.01x` faster | - -Compared with the previous O1-only focused refresh, current O1 rows are within -single-run noise: `cfree` compile time moved from `63.851` to `67.520` ms and -runtime from `4275.508` to `4276.252` ms; `cfree-run` compile+JIT moved from -`28.682` to `28.935` ms and runtime from `4150.248` to `4156.487` ms. 
- -Per-bench `cfree-run O1` runtime vs peers: - -| bench | cfree-run ms | MIR ms | gcc-15 ms | clang ms | cfree vs MIR | -| --- | ---: | ---: | ---: | ---: | ---: | -| `array` | `2828.6` | `4576.0` | `2007.1` | `2839.8` | `0.62x` | -| `hash` | `4048.2` | `3978.0` | `3957.6` | `3966.2` | `1.02x` | -| `hash2` | `4501.2` | `3687.0` | `4155.9` | `3684.7` | `1.22x` | -| `matrix` | `4812.5` | `9033.0` | `2949.7` | `2990.8` | `0.53x` | -| `sieve` | `4592.0` | `3946.0` | `2640.3` | `2362.8` | `1.16x` | - -Interpretation: the O1 correctness/performance acceptance scope remains clean -after moving post-allocation consumers to read allocation placement through -`Func.preg_locs` and fixing replayed call-plan outgoing stack reservation. -Runtime shape is materially unchanged: cfree is strongly ahead of MIR on -`array` and `matrix`, close on `hash`, and still trails on `hash2` and `sieve`. - -Correctness follow-up: clean O0 and O1 toy bootstraps were reconfirmed on -2026-05-25 after fixing replayed call-plan outgoing stack reservation. - -## Headline Geomeans - -Refreshed 2026-05-23 after the combiner perf+correctness fixes, MIR-shaped -allocator, and block-layout fallthrough pass. Scope: 8-case -(`array`, `binary-trees`, `hash`, `hash2`, `matrix`, `nbody`, `sieve`, -`spectral-norm`). MIR fails `binary-trees`, `nbody`, `spectral-norm` on -Darwin so its rows reflect 5 cases. - -Base: each benchmark's `gcc-15 -O2` row is `1.0x`. Higher compile-speed -ratio = faster compile; higher runtime-speed ratio = faster runtime. - -The compile column is the **whole pipeline** (C frontend + optimizer + -codegen + link + JIT). For MIR this sums `compile_ms` (c2m source→bmir) -and `codegen_ms` (`MIR link finish` from the JIT invocation). For -cfree-run it is `cfree-run compile_and_jit`. **Most of cfree's compile -lead over MIR comes from cfree's C frontend** — for the backend-only -slice see *Optimizer/Link/JIT Split* below (cfree and MIR are roughly -tied there). 
- -| tool | opt | cases | compile speed | runtime speed | avg comp+gen (ms) | avg runtime (ms) | -| --- | ---: | ---: | ---: | ---: | ---: | ---: | -| `gcc-15` | `O0` | 8 | `1.172x` | `0.409x` | `149.5` | `6905.2` | -| `gcc-15` | `O1` | 8 | `1.110x` | `0.955x` | `157.7` | `2960.3` | -| `gcc-15` | `O2` | 8 | `1.000x` | `1.000x` | `175.1` | `2827.2` | -| `clang` | `O0` | 8 | `1.282x` | `0.367x` | `136.6` | `7707.3` | -| `clang` | `O1` | 8 | `1.209x` | `0.958x` | `144.8` | `2952.6` | -| `clang` | `O2` | 8 | `1.141x` | `1.062x` | `153.5` | `2661.4` | -| `mir-c2m` | `O0` | 5 | `1.224x` | `0.549x` | `141.0` | `5103.6` | -| `mir-c2m` | `O1` | 5 | `1.247x` | `0.607x` | `138.4` | `4612.5` | -| `mir-c2m` | `O2` | 5 | `1.272x` | `0.786x` | `135.6` | `3564.5` | -| `cfree` | `O0` | 8 | `3.468x` | `0.337x` | `50.5` | `8394.7` | -| `cfree` | `O1` | 8 | `3.370x` | `0.777x` | `52.0` | `3637.0` | -| `cfree` | `O2` | 7 | `3.336x` | `0.681x` | `52.7` | `4152.8` | -| `cfree-run` | `O0` | 8 | `5.594x` | `0.341x` | `31.3` | `8285.0` | -| `cfree-run` | `O1` | 8 | `6.688x` | `0.807x` | `26.2` | `3505.4` | -| `cfree-run` | `O2` | 7 | `5.865x` | `0.703x` | `30.0` | `4022.1` | - -Pairwise whole-pipeline geomeans at the same opt level (cfree-run vs -peer; ratio `<1.0` = cfree-run is faster, `>1.0` = peer is faster): - -| peer | opt | cases | whole-pipeline compile (cfree/peer) | runtime (cfree/peer) | -| --- | ---: | ---: | ---: | ---: | -| `gcc-15` | `O0` | 8 | `0.21x` | `1.20x` | -| `gcc-15` | `O1` | 8 | `0.17x` | `1.18x` | -| `gcc-15` | `O2` | 7 | `0.17x` | `1.42x` | -| `clang` | `O0` | 8 | `0.23x` | `1.08x` | -| `clang` | `O1` | 8 | `0.18x` | `1.19x` | -| `clang` | `O2` | 7 | `0.20x` | `1.51x` | -| `mir-c2m` | `O0` | 5 | `0.24x` | `1.55x` | -| `mir-c2m` | `O1` | 5 | `0.20x` | `0.82x` | -| `mir-c2m` | `O2` | 4 | `0.23x` | `1.16x` | - -Backend-only (no frontend) `O1` comparison vs MIR (5-case scope, from -the *Optimizer/Link/JIT Split* table below): - -| metric | cfree | MIR | cfree/MIR | -| 
--- | ---: | ---: | ---: | -| opt+link+JIT geomean (ms) | `0.701` | `0.726` | `0.97x` (3% faster) | - -Interpretation: - -- The 4-6x whole-pipeline compile lead over every peer is **almost - entirely the C frontend**. The cfree optimizer+backend slice at `O1` - is only `~3%` faster than MIR's equivalent JIT slice on the 5-case - scope. The frontend dominates the absolute time at every opt level on - these benches. -- **cfree `O1` runtime now leads MIR `O1` by `1.22x` geomean** on the - 5-case MIR-comparable scope. This is the cleanest peer comparison - since both run equivalent O1 pipelines (no SSA, no coalescing). -- cfree `O1` runtime **trails** gcc/clang `O1` by `1.18-1.19x` geomean — - most of the remaining O1 quality gap is against the production - compilers, not MIR. -- cfree `O2` regressed from `O1` (`0.777 → 0.681x` vs gcc base; matrix - `O2` fails to compile). `O2` LICM-without-pressure-model and the - deferred live-range splitter are the suspects. - -### cfree `O1` vs gcc/clang `-O0` - -Useful framing for the typical developer workflow: users normally -iterate against `gcc -O0` / `clang -O0` debug builds. cfree `O1` is the -relevant comparison point because its compile cost is the same order of -magnitude. - -Geomean over the 8-case scope: - -| metric | cfree `O1` vs gcc-15 `O0` | cfree `O1` vs clang `O0` | -| --- | ---: | ---: | -| whole-pipeline compile (cfree/peer) | `0.18x` (`5.5x` faster) | `0.19x` (`5.3x` faster) | -| runtime (cfree/peer) | `0.51x` (`1.97x` faster) | `0.45x` (`2.20x` faster) | - -cfree `O1` is **strictly better than `-O0`**: faster to compile *and* -roughly `2x` faster at runtime than gcc/clang debug builds. 
- -Per-bench cfree-run `O1` runtime vs `-O0` peers (ms; lower is faster): - -| bench | cfree `O1` | gcc `O0` | clang `O0` | vs gcc `O0` | vs clang `O0` | -| --- | ---: | ---: | ---: | ---: | ---: | -| `array` | `2649.7` | `6435.5` | `5399.4` | `0.41x` (`2.43x`) | `0.49x` (`2.04x`) | -| `binary-trees` | `2520.3` | `2494.5` | `2616.5` | `1.01x` (tied) | `0.96x` (tied) | -| `hash` | `3905.0` | `4313.0` | `4576.3` | `0.91x` (`1.10x`) | `0.85x` (`1.18x`) | -| `hash2` | `3928.9` | `6910.9` | `8227.1` | `0.57x` (`1.76x`) | `0.48x` (`2.09x`) | -| `matrix` | `4713.5` | `19344.0` | `13043.2` | `0.24x` (`4.10x`) | `0.36x` (`2.77x`) | -| `nbody` | `2978.7` | `8460.6` | `10045.3` | `0.35x` (`2.84x`) | `0.30x` (`3.37x`) | -| `sieve` | `4117.6` | `4749.5` | `12864.9` | `0.87x` (`1.15x`) | `0.32x` (`3.12x`) | -| `spectral-norm` | `3849.0` | `13897.9` | `13887.6` | `0.28x` (`3.61x`) | `0.28x` (`3.61x`) | - -cfree `O1` ties or wins every bench against both `-O0` peers. The -largest wins are on compute-heavy floating-point and matrix cases -(`matrix`, `spectral-norm`, `nbody`) where `-O0`'s lack of register -allocation hurts most. 
- -Per-bench cfree-run `O1` runtime vs peers (ms; lower is faster): - -| bench | cfree | mir-c2m | gcc-15 | clang | vs MIR | vs gcc | -| --- | ---: | ---: | ---: | ---: | ---: | ---: | -| `array` | `2649.7` | `4460.0` | `2000.5` | `2793.8` | **`0.59x`** | `1.32x` | -| `binary-trees` | `2520.3` | — | `2451.3` | `2249.9` | — | `1.03x` | -| `hash` | `3905.0` | `3885.0` | `3832.2` | `3850.7` | `1.01x` | `1.02x` | -| `hash2` | `3928.9` | `3609.0` | `4075.6` | `3604.7` | `1.09x` | **`0.96x`** | -| `matrix` | `4713.5` | `8927.0` | `2864.4` | `2910.9` | **`0.53x`** | `1.65x` | -| `nbody` | `2978.7` | — | `2669.9` | `2534.6` | — | `1.12x` | -| `sieve` | `4117.6` | `3740.0` | `2628.5` | `2311.2` | `1.10x` | `1.57x` | -| `spectral-norm` | `3849.0` | — | `3831.2` | `3882.8` | — | `1.00x` | - -At `O1` cfree-run beats MIR on `array` (1.68x) and `matrix` (1.89x), is -tied on `hash`, and trails on `hash2` (9%) and `sieve` (10%). Against -gcc/clang `O1` the largest gaps are `matrix` (1.65x), `sieve` (1.57x), -`array` (1.32x), `nbody` (1.12x) — these are the cases worth studying -for next-step O1 improvements. - -## Formerly Broken cfree Rows - -Focused rerun: - -- Scope: `matrix`, `binary-trees`, `nbody`, `spectral-norm` -- Levels: `0 1 2` -- Output: `build/bench/opt/rerun-broken-cfree/results.csv` - -All `cfree` and `cfree-run` rows were `OK`. - -Base: each benchmark's `gcc-15 -O2` row is `1.0x`. 
- -| tool | opt | compile speed | runtime speed | -| --- | ---: | ---: | ---: | -| `cfree` | `O0` | `7.050x` | `0.319x` | -| `cfree` | `O1` | `7.062x` | `0.534x` | -| `cfree` | `O2` | `6.831x` | `0.483x` | -| `cfree-run` | `O0` | `13.709x` | `0.323x` | -| `cfree-run` | `O1` | `14.286x` | `0.550x` | -| `cfree-run` | `O2` | `11.791x` | `0.512x` | - -## Optimizer/Link/JIT Split - -Focused timing pass: - -- Scope: `array`, `hash`, `hash2`, `matrix`, `sieve` -- cfree measurement: sum of `opt.o*.total`, `link.resolve.total`, and JIT - scoped timings from `cfree run --bench-time` (best-of-3 per bench) -- MIR measurement: `MIR link finish` from `c2m -v -O<n> file.bmir -eg` - -Latest refresh (2026-05-23, `CFREE_OPT_BENCH_FAST=1`, single Darwin host), -post-jump-layout pass (see Iteration Notes below): - -| opt | cfree opt+link+JIT ms | MIR link/JIT ms | result | -| ---: | ---: | ---: | --- | -| `O1` (HEAD before regalloc rewrite) | `0.968` | `0.722` | MIR `1.34x` faster | -| `O1` (MIR-shaped allocator, 2026-05-22) | `0.621` | `0.720` | cfree `1.16x` faster | -| `O1` (slot-scan combiner, regression) | `0.824` | `0.711` | MIR `1.16x` faster | -| `O1` (operand-driven combiner) | `0.719` | `0.711` | tied | -| `O1` (live-range-scoped use counting) | `0.710` | `0.711` | tied | -| `O1` (block-layout for fallthrough) | `0.701` | `0.726` | cfree `1.04x` faster | - -Per-bench at `O1` after the block-layout pass: - -| bench | cfree opt+l+j ms | MIR link/JIT ms | cfree runtime ms | MIR runtime ms | -| --- | ---: | ---: | ---: | ---: | -| `array` | `0.384` | `0.316` | `2686.9` | `4529.0` | -| `hash` | `1.327` | `1.668` | `3965.8` | `3941.0` | -| `hash2` | `1.380` | `1.762` | `3978.2` | `3640.0` | -| `matrix` | `0.734` | `0.830` | `4734.2` | `8966.0` | -| `sieve` | `0.329` | `0.262` | `4146.8` | `3885.0` | -| **geomean** | **`0.701`** | **`0.726`** | **`3837.5`** | **`4687.5`** | - -cfree `O1` now leads on both axes of this 5-case scope: opt+link+JIT is -`3.4%` faster than MIR's `MIR 
link finish` slice, and runtime is `18.1%` -faster on geomean. Two benches are dramatically ahead (matrix `1.89x`, -array `1.69x`); two are within noise of MIR (hash, sieve); hash2 trails by -`9.3%`. Sequence of wins that got here: MIR-shaped allocator (compile), -operand-driven combiner dispatch (compile), live-range-scoped use -counting (runtime, mostly matrix), block-layout for fallthrough (runtime, -mostly sieve/array/matrix loop bodies). - -`O0` and `O2` rows have not yet been re-measured under the new allocator, -combiner, or layout pass; the full-scope refresh is pending. cfree's -final link/JIT mechanics stay fast across both data-structure choices. - -## Goals - -Compile-time goals: - -- Preserve cfree's frontend and whole-pipeline compile/JIT advantage. -- Add benchmark CSV columns for cfree frontend, optimizer, link, and JIT slices - instead of relying on ad hoc `--time` logs. -- Reduce optimized-pass cost. The first target is closing the MIR link/JIT gap - on the common-case split measurement. -- Keep `-O1` materially faster to compile than `-O2`. - -Runtime-quality goals: - -- Make `O2` reliably faster than `O1` on the representative benchmark mix. -- Close toward MIR `O2` on MIR-common cases. -- First external target: reach at least QBE-class performance on the MIR - `c-benchmarks` set, then close toward MIR's published `c2m -eg` geomean. - -Coverage goals: - -- Keep all cfree rows passing on the representative scope. -- Expand the benchmark set back toward all MIR `c-benchmarks` that require only - supported hosted libc/runtime features. -- Track MIR failures separately from cfree failures so cfree coverage is not - obscured by external tool limitations. - -## Current Tuning Priorities - -1. Port MIR's live-range splitting (`get_hard_reg_with_split`, `lr_gap_tab`, - `split()`) on top of the new point-bitmap core, restoring the O2 splitting - layer that was deferred during the regalloc rewrite. 
Until then O2 may - regress on benches where splitting matters (`array`, `hash`, `matrix`). -2. Investigate why `O2` trails `O1` on the current cfree runtime geomean. -3. Profile expensive O2 passes on `hash`, `hash2`, and `matrix`, where the - opt/link split shows cfree optimized generation much slower than MIR. -4. Improve generated-code quality before adding broad new passes. Prefer - targeted fixes backed by benchmark deltas and pass-local tests. -5. Add finer cfree timing columns to `scripts/opt_bench.sh` once the driver can - expose parse, optimize, link, and JIT slices in parseable form. - -## O1 Gap Analysis vs MIR - -cfree `O1` runs only the shared lowering pipeline (`doc/OPT.md`): build_cfg, -jump_cleanup, simplify_local, machinize, build_loop_tree, live_blocks, -dead_def_elim, regalloc (simple mode), combine, dce, emit. The local/global -address folds (`opt_addr_xform_pregs`, scalar local promotion, global address -CSE) remain optional and are not currently called by O1. -No SSA-era passes and no coalescing run at `O1`, matching MIR -(`mir-gen.c:9431`: coalesce is gated on `optimize_level >= 2`). - -Headline at `O1` (focused O0/O1 refresh, 2026-05-25): - -| metric | cfree | MIR | cfree vs MIR | -| --- | ---: | ---: | ---: | -| full compile (5-case avg) | `67.520 ms` | `138.051 ms` | `0.489x` time (`2.04x` faster) | -| runtime (5-case avg) | `4276.3 ms` | `5044.0 ms` | `0.848x` time (`1.18x` faster) | - -cfree `O1` leads MIR on both axes of this 5-case scope. Runtime is driven by -matrix (`1.87x` faster than MIR) and array (`1.62x` faster), where the combiner -and block-layout work eliminate redundant address-mode moves and trivial -inter-block branches. `hash` is near parity, while `hash2` and `sieve` remain -the current O1 runtime outliers. 
- -Compile-time gaps closed by recent work: - -- `opt_regalloc` uses point-indexed bitsets ORed per program point (matching - MIR's `assign()` in `mir-gen.c:7551-7728`, simplified branch), replacing - the previous sorted interval-vector overlap checks per hard register. - ~`55%` per-bench reduction in `opt.regalloc` time across the 5-case scope. -- `opt_combine` runs a per-BB fixpoint with the four MIR-shaped rewrites - (substitute / sink / addr-mode synth / combine_exts), uses live-range- - scoped use counting (so reuse of a scratch physreg later in the block - no longer makes every earlier producer look multi-use), and dispatches - by walking the consumer's operands rather than scanning 96 producer - slots per inst. -- `opt_jump_cleanup` LAYOUT stage now runs `cleanup_reorder_for_fallthrough` - ahead of the invert/strip passes, which reorders `f->emit_order` via - greedy chain extension. This lets the existing - `cleanup_layout_fallthrough_branches` collapse trivial `b <next>` - branches that the original block-id ordering left in place. - -Compile-time gaps that remain: - -- `opt_live_blocks` tracks every pseudo-register uniformly. MIR's - `calculate_func_cfg_live_info` also tracks all live vars at the - pre-allocation step (`consider_all_live_vars`), but its bitsets reuse a - shared sparse vector representation; cfree's `OptBitset` is dense. -- `opt_dead_def_elim_with_live` runs before allocation and `opt_dce` runs - after; some of this work overlaps and may be redundant. - -Runtime gaps unique to `O1`: - -- No coalescing at `O1` (MIR matches this). Every `IR_COPY` survives to - emission unless `opt_combine` catches it post-RA. With sink+substitute - now firing reliably this is much less load-bearing than before, but - cross-block copies still survive. -- Invariant loads inside loops (`mov w9, #1` per iteration in the sieve - flag-init loop). LICM is an O2 pass; even simple block-local hoisting - for single-BB invariants is not yet wired in at O1. 
-- hash2 specifically trails its sibling hash by ~24% despite identical - code; likely register pressure as the table grows. Coalescing at O2 is - the standard mitigation. - -## O2 Gap Analysis vs MIR - -cfree `O2` adds `opt_cleanup` (the SSA-era schedule) ahead of the shared -lowering pipeline: ssa construction, block cloning, mem2reg, ssa_dce, -copy_cleanup, addr_xform, simplify, gvn, copy_prop, dse, licm, -pressure_relief, conventional ssa, ssa_combine, undo ssa, jump_opt. Then the -shared lowering pipeline runs with coalescing and live-range splitting -enabled. - -Headline at `O2`: - -| metric | cfree | MIR | -| --- | ---: | ---: | -| compile speed (vs gcc-15 `O2`) | `5.73x` | `2.44x` | -| runtime speed (vs gcc-15 `O2`) | `0.47x` | `0.79x` | -| opt+link+JIT (5-case scope) | `4.728 ms` | `1.100 ms` | - -cfree `O2` is roughly the same compile speed as `O1` but slower at runtime -than `O1` on the current mix - the optimization investment is not paying off -on this benchmark set. - -Compile-time gaps: - -- `opt_rebuild_def_use` is called wholesale after most mutating passes - (`opt_addr_xform`, `opt_make_conventional_ssa`, every iteration of - `opt_copy_cleanup`). MIR maintains `ssa_edge_t` lists incrementally. -- The pipeline interleaves `opt_copy_cleanup` and `opt_ssa_dce` five or more - times. MIR folds equivalent work into single passes. -- Coalescing runs after liveness on the full program. MIR coalesces before - liveness, deleting moves first and computing liveness on the reduced - program. -- `opt_build_reg_ssa` and `opt_build_ssa` insert phis at every IDF site - without live-in filtering. MIR's demand-driven `get_def` only materializes - phis where reaching defs differ, then runs `minimize_ssa` to fixpoint. -- `opt_dse` runs an inline memory-availability fixpoint per invocation. MIR - computes memory liveness once before the pass. -- LICM rescans dominance to detect loops instead of reusing the loop tree - already built by `opt_build_loop_tree`. 
- -Runtime gaps: - -- `opt_licm` has no register-pressure cost model. MIR's LICM has a third - backward filter pass that skips hoists which would raise pressure unless - the operation is expensive (multiply, divide). This is the most likely - cause of cfree `O2` trailing cfree `O1` on the current benchmark mix: - invariants get hoisted into preheaders, increasing live ranges across the - loop body and forcing the allocator to spill what fit in registers at - `O1`. -- LICM also iterates loops in raster order instead of inner-to-outer. -- `opt_ssa_combine` does not synthesize complex address modes. MIR - iteratively decomposes base+index*scale+disp chains via `update_addr_p` - validated by `target_memory_ok_p`. Largest single missing codegen pattern - for memory-heavy benchmarks (`hash`, `matrix`). -- `opt_combine` has no per-BB fixpoint (same as `O1`). -- No ext-of-ext semantic fold (same as `O1`). -- Live-range splitting is boundary-driven from precomputed ranges and limited - to singletons that are not call-crossing. MIR's `get_hard_reg_with_split` - searches gaps in already-allocated ranges and picks based on a - spiller-vs-spillee frequency profit comparison. -- GVN branch folding does not re-enqueue dependent blocks within the same - pass. New constants only propagate further on the next `opt_cleanup` - iteration. -- `opt_pressure_relief` only sinks `IR_LOAD_IMM`/`IR_CONST_I` candidates. - MIR sinks any single-cross-block-use move. -- `opt_addr_xform` is all-or-nothing: a single non-foldable use prevents - elimination. MIR can fold most uses and rewrite the rest. 
- -## Iteration Notes - -### Block-layout pass for fallthrough (2026-05-23) - -`pass_jump.c` already had a `cleanup_layout_fallthrough_branches` that -strips trailing `IR_BR` when the next emit-order block is the branch -target — but it was layout-passive: cfree's frontend lowers C `for` loops -as `[head, inc, body, exit]` (with `inc` between head and body because -`continue` jumps to it), so `body`'s `b inc` was a backward jump in -emit-order and could never be stripped. Disassembly inspection of sieve's -hottest loop showed 2 redundant `b` per iteration — `head → b body → inc -→ b head; body → b inc` — that no per-block cleanup could fix. - -Change: added `cleanup_reorder_for_fallthrough` that rewrites -`f->emit_order` via greedy chain extension before the existing -invert/strip cleanups run. Starting at `f->entry`, repeatedly extend the -chain to the current block's preferred unvisited successor: for -`CONDBR`/`CMP_BRANCH` that's `succ[1]` (fallthrough); for `IR_BR` or any -1-succ block it's `succ[0]`. When the chain stalls, fall back to the -taken arm (the subsequent `cleanup_invert_taken_fallthrough` will invert -the branch), then to the lowest unvisited block in original order. After -this pass every chained `(p, n)` pair is adjacent in emit_order, so the -trailing-branch strip can collapse the jump. 
- -Effect on sieve inner loops (3 nested loops collapsed from -`9 / 8 / 8 insts/iter` to `7 / 6 / 6`): - -``` - mov x0, #2 -mov x0, #2 cmp x0, #8192 -cmp x0, #8192 b.gt exit -b.gt exit mov w9, #1 ; LICM-eligible -b body ← redundant strb w9, [x19, x0] -add x0, x0, #1 add x0, x0, #1 -b loop b loop -body: exit: - mov w9, #1 - strb w9, [x19, x0] - b inc ← redundant -``` - -Measurement (`CFREE_OPT_BENCH_FAST=1`, best of 3 compile/run repeats, -single Darwin host), 5-case scope at O1: - -| bench | runtime before (ms) | runtime after (ms) | delta | vs MIR | -| --- | ---: | ---: | ---: | ---: | -| `array` | `4988.9` | `2686.9` | `-46.1%` | cfree `1.69x` ahead | -| `hash` | `4079.2` | `3965.8` | `-2.8%` | tied (`1.006x`) | -| `hash2` | `5246.3` | `3978.2` | `-24.2%` | `9.3%` slower | -| `matrix` | `8279.9` | `4734.2` | `-42.8%` | cfree `1.89x` ahead | -| `sieve` | `8351.3` | `4146.8` | `-50.3%` | `6.7%` slower | -| **geomean** | `5938.0` | `3837.5` | `-35.4%` | cfree `1.22x` ahead | - -Compile time barely moved (`0.710 → 0.701 ms` geomean): the reorder is -O(nblocks) and the strip phase saves work in emit. cfree opt+link+JIT is -now `3.4%` faster than MIR's `MIR link finish` slice on this scope. - -All tests pass: `test-opt` 691, `test-toy` 955, `test-cg-api`, `test-isa`, -`test-aa64-inline`, `test-smoke-x64`, `test-smoke-rv64`. All 15 bench -rows green. - -### Combiner perf+correctness fixes (2026-05-23) - -Three issues landed together in `src/opt/pass_combine.c`: - -1. **Operand-driven dispatch.** The MIR-shaped COMBINE rewrite (commit - `ea26470`) was algorithmically O(96) per instruction: `try_substitute` - scanned all `OPT_REG_CLASSES * OPT_MAX_HARD_REGS = 96` producer slots - per consumer, and `ctx_record` probed all 96 to find each instruction's - defines. 
Replaced both with operand walks: `try_substitute` iterates - `in->opnds`, looks up only the producers of registers actually - referenced (typically 2–3); `ctx_record` switches on `IROp` and records - the destination operand directly. Combiner cost: `0.197 → 0.050 ms` - geomean (`3.9x` faster). - -2. **Live-range-scoped use counting.** `count_uses_between` summed uses of - a physreg across the entire rest of the block, ignoring intervening - redefs. With scratch physregs reused multiple times per BB (the - matrix-mmult `sxtw x12; mov x13, x12; mov x12, ...` pattern), every - fold looked multi-use and was rejected. Replaced with - `count_uses_in_live_range` that walks until the next def of the same - physreg or a CALL/ASM/INTRINSIC clobber barrier. When the live range - terminates inside the block, the cross-block live-out check becomes - moot and is skipped. - -3. **`try_combine_exts` no-op guard.** When the inner producer was an - identity convert (`convert rB, rB`), rewriting `in->opnds[1]` to - `prod->opnds[1]` was a no-op but still reported a change, spinning the - per-BB fixpoint forever. Added an early-return when the rewrite would - set the operand to the same register it already holds. Defensive - 64-iteration cap added on the fixpoint as future-proofing. - -Effect on matrix `mmult` inner loop: 17 → 10 instructions per iteration. -Every indexed access is now `sxtw + ldr [base, idx, lsl #s]` instead of -three leading copies + the load. 
- -Measurement (`CFREE_OPT_BENCH_FAST=1`, best of 3 compile/run repeats, -single Darwin host), 5-case scope at O1: - -| stage | combine ms | opt+l+j ms | runtime ms | matrix runtime | -| --- | ---: | ---: | ---: | ---: | -| Pre-COMBINE-rewrite baseline | `0.032` | `0.682` | `6314.0` | `10833.6` | -| After COMBINE rewrite (regressed) | `0.197` | `0.824` | `6213.9` | `10721.6` | -| After operand-driven dispatch | `0.050` | `0.719` | `6254.5` | `10811.4` | -| After live-range-scoping | `0.046` | `0.710` | `5938.0` | `8279.9` | - -Runtime: `-1.6%` overall before the live-range fix (basically noise), then -an additional `-5.1%` after — matrix `-23%` carries most of it. cfree's -matrix runtime now beats MIR's by `6.3%`. - -Per-bench runtime vs MIR at O1 after the fix: - -| bench | cfree ms | MIR ms | ratio | cfree vs MIR | -| --- | ---: | ---: | ---: | ---: | -| `array` | `4988.9` | `4510.0` | `1.106x` | `10.6%` slower | -| `hash` | `4079.2` | `3907.0` | `1.044x` | `4.4%` slower | -| `hash2` | `5246.3` | `3600.0` | `1.457x` | `45.7%` slower | -| `matrix` | `8279.9` | `8832.0` | `0.937x` | `6.3%` **faster** | -| `sieve` | `8351.3` | `3757.0` | `2.223x` | `122.3%` slower | -| **geomean** | | | `1.285x` | `28.5%` slower | - -### MIR-shaped regalloc (2026-05-22) - -Change: replace `OptAllocator`'s sorted interval-vector conflict structure -(`AllocIntervalVec hard_used_locs[hard_loc_bit]` plus per-stack-slot -intervals) with MIR's point-indexed bitmap (`used_locs[p * loc_words + w]`, -one row per compressed program point). Sort coalesce-root PRegs by the same -heuristic MIR uses (`mir-gen.c:7320-7337`: tied-reg first, then descending -freq, then descending live_length). For each candidate, OR `used_locs[j]` -across the candidate's live-range points into a scratch `conflict_locs` -bitmap, pick the cheapest free hard reg, or fall back to a stack slot -(probing existing stack-slot bits in `conflict_locs` for automatic reuse). 
-Re-gate `opt_coalesce_ranges` on `allow_live_range_split` (O2 only), -matching `mir-gen.c:9431` — the earlier "coalescing at O1" experiment -(below) is rolled back as part of this change. Live-range splitting -(`get_hard_reg_with_split`, `lr_gap_tab`, `split()`) is deferred; the new -core is the foundation on which splitting will be re-added. - -Measurement (`CFREE_OPT_BENCH_FAST=1`, best of 3 compile/run repeats, -single Darwin host), 5-case scope at O1: - -| bench | HEAD opt+link+jit ms | MIR-shaped opt+link+jit ms | delta | -| --- | ---: | ---: | ---: | -| `array` | `0.535` | `0.361` | `-32.5%` | -| `hash` | `1.916` | `1.156` | `-39.7%` | -| `hash2` | `1.929` | `1.200` | `-37.8%` | -| `matrix` | `0.899` | `0.573` | `-36.3%` | -| `sieve` | `0.477` | `0.321` | `-32.7%` | -| **geomean** | `0.968` | `0.621` | `-35.9%` | - -Per-bench `opt.regalloc` (the bucket the rewrite targets) drops by `~55%` -across all five benches: `array -54.1%`, `hash -58.3%`, `hash2 -55.5%`, -`matrix -53.7%`, `sieve -52.2%`. Runtime is unchanged within noise -(geomean delta `-0.2%`), consistent with the rewrite changing only the -conflict data structure, not allocator decisions. - -`compile_and_jit` total shifts modestly per-bench (`+0.85%` to `-21.75%`) -because the C frontend is 99%+ of that total on these benches; the -allocator gain is only visible after isolating the optimized stages from -the frontend. - -After this rewrite, cfree's `opt+link+JIT` geomean (`0.621 ms`) is -~14% **ahead** of MIR's `MIR link finish` geomean (`0.720 ms`) on the -same scope — see the Optimizer/Link/JIT Split table. - -### Coalescing at O1 (2026-05-22) - -**Rolled back** as part of the MIR-shaped regalloc rewrite above. MIR -gates coalescing on `optimize_level >= 2` (`mir-gen.c:9431`), so matching -MIR's O1 pipeline means coalesce does not run at O1. The original entry -is preserved below for context. 
- -Change: remove the `allow_live_range_split` gate around -`opt_coalesce_ranges` in `opt_regalloc` so coalesce runs at both O1 and -O2. Initially this produced wrong code on the address-taken branch-join -pattern (`test/toy/cases/128_o2_branch_join_addr_mem.toy`). - -Root cause: in `ranges_overlap_kind` (`src/opt/pass_coalesce.c`), any -number of unit-length overlaps between two pregs collapsed into a -single "unit conflict" bit. `group_conflicts` then allowed that single -unit conflict, treating it as the natural conflict from the move -itself. At O1 a pseudo with multiple non-SSA defs (e.g. the -`var x = 0; x = v` pattern) can have unit overlaps with src at every -def point. Only the move's own def point should be allowed; the others -are real conflicts. The coalescer was merging the local's pseudo with -the param `v` (tied to `x1`), then the `LOAD_IMM 0` def of the local -emitted `movz x1, 0`, clobbering the incoming `v`. - -Fix: `ranges_overlap_kind` now counts unit overlaps. Two or more -distinct unit overlaps are reported as a real conflict (returns 2), -which blocks the merge. 
- -Measurement (CFREE_OPT_BENCH_FAST=1, 3 compile/run repeats, single -machine, current load — single-run noise still ~5%): - -cfree-run runtime_ms at O1: - -| bench | baseline | coalesce | delta | -| --- | ---: | ---: | ---: | -| array | `5497.8` | `5344.5` | `-2.8%` | -| hash | `4497.4` | `4362.4` | `-3.0%` | -| hash2 | `5688.3` | `6312.0` | `+11.0%` | -| matrix | `11222.6` | `11089.0` | `-1.2%` | -| sieve | `9515.9` | `8775.9` | `-7.8%` | - -cfree-run compile_ms at O1: - -| bench | baseline | coalesce | delta | -| --- | ---: | ---: | ---: | -| array | `34.3` | `24.4` | `-29%` | -| hash | `49.4` | `47.8` | `-3%` | -| hash2 | `56.2` | `59.0` | `+5%` | -| matrix | `43.8` | `43.0` | `-2%` | -| sieve | `26.8` | `32.2` | `+20%` | - -The hash2 +11% runtime regression and sieve +20% compile-time -regression are worth investigating; both runs are single-shot at this -repeat count and could be partly noise. Average across the five -benches is roughly neutral on compile time and slightly positive on -runtime. The change unlocks downstream wins (fewer moves to emit, -fewer post-RA copies to clean up) and is a precondition for further -O1 work. - -### opt_addr_xform at O1 (2026-05-22) - -Change: new pass `opt_addr_xform_pregs` (`src/opt/pass_o2.c`). The -existing `opt_addr_xform` operates on the SSA Val namespace and uses -Val-keyed def-use chains, neither of which exists at O1. The new pass -is a PReg-namespace equivalent that scans each `IR_ADDR_OF` def, -checks whether every use of the result is the base of a -non-observable `IR_LOAD`/`IR_STORE` with offset 0 at the right -operand index, and if so rewrites those uses from -`OPK_INDIRECT(base=p, ofs=0)` to `OPK_LOCAL(local)` and converts the -`IR_ADDR_OF` to `IR_NOP`. Aux operands (call args, asm ops, -intrinsic ops, etc.) block folding unconditionally. It runs once -right after `opt_machinize` in the shared lowering pipeline. 
- -Effect on `same()` from -`test/toy/cases/128_o2_branch_join_addr_mem.toy`: the read of `x` was -`sub x1, x29, #8; ldur x0, [x1]` and is now a single -`ldur x0, [x29, #-8]`. - -Measurement (CFREE_OPT_BENCH_FAST=1, 3 compile/run repeats): - -cfree-run runtime_ms at O1 (cumulative effect with coalesce): - -| bench | baseline | +coalesce+addrxform | delta | -| --- | ---: | ---: | ---: | -| array | `5497.8` | `5283.9` | `-3.9%` | -| hash | `4497.4` | `4475.2` | `-0.5%` | -| hash2 | `5688.3` | `5818.8` | `+2.3%` | -| matrix | `11222.6` | `11205.8` | `-0.1%` | -| sieve | `9515.9` | `8662.2` | `-9.0%` | - -Incremental delta of addr_xform on top of coalesce: - -| bench | +coalesce only | +coalesce+addrxform | delta | -| --- | ---: | ---: | ---: | -| array | `5344.5` | `5283.9` | `-1.1%` | -| hash | `4362.4` | `4475.2` | `+2.6%` | -| hash2 | `6312.0` | `5818.8` | `-7.8%` | -| matrix | `11089.0` | `11205.8` | `+1.1%` | -| sieve | `8775.9` | `8662.2` | `-1.3%` | - -cfree-run compile_ms at O1 (cumulative effect with coalesce): - -| bench | baseline | +coalesce+addrxform | delta | -| --- | ---: | ---: | ---: | -| array | `34.3` | `24.3` | `-29%` | -| hash | `49.4` | `62.9` | `+27%` | -| hash2 | `56.2` | `59.9` | `+6.6%` | -| matrix | `43.8` | `39.1` | `-10.7%` | -| sieve | `26.8` | `28.1` | `+4.8%` | - -Direction is small positive on runtime overall. sieve and array show -sustained wins, hash2 partly recovers the +11% coalesce regression -when addr_xform also runs. Single-shot noise still around 5% at this -repeat count, so finer measurements (more repeats, controlled load) -are needed to draw strong conclusions per-benchmark. The change is -correct (full test suite passes including the previously failing -`128_o2_branch_join_addr_mem` toy case under both new transforms). diff --git a/doc/OPTv2.md b/doc/OPTv2.md @@ -1,937 +0,0 @@ -# OPTv2 - Typed Lowering and Allocation Design - -This document proposes the next optimizer representation for cfree. 
The -current optimizer design and pass order are documented in `doc/OPT.md`. -Performance measurements, current O1/MIR comparisons, and optimization targets -are tracked in `doc/OPT_PERF.md`; this design treats those numbers as -constraints, not as background information. - -The short version: keep the fast O1 algorithms, but stop changing the meaning -of the same operand field as the function moves through lowering. The recorded -IR should stay virtual, allocation should be a separate map, and emitted code -should come from an explicitly physical MIR. - -## Status - 2026-05-25 - -Current implementation status: - -- Slice 1 is partially implemented: `OptBitset` grows on demand, trims active - words, and `opt_live_blocks` now uses a predecessor worklist. High-numbered - PReg coverage was added to `test-opt`. -- Slice 3 is complete: `Func.preg_locs` is the canonical allocation location - table. Transitional mirroring into `OptPRegInfo` remains only for allocator - metadata and legacy tests. -- HIR verification is stricter: `IR_PARAM_DECL` is def-only, storage and aux - operands are checked at phase boundaries, and allocation locations are - cross-checked. -- Slice 4 is implemented for the O1/non-splitting path: register allocation - writes locations without mutating HIR, `opt_lower_to_mir` builds a separate - physical `Func.mir` block tape, MIR verification checks the lowered boundary, - post-RA combine/DCE/jump cleanup run through the MIR view, and `opt_emit` - emits from MIR when present. -- Known-frame replay is re-enabled through the explicit known-frame descriptor. - The C-source target leaves `func_begin_known_frame` NULL because the hook is - optional and backend-owned. -- O1 HIR local/global address folds are re-enabled before liveness: - `opt_addr_xform_pregs`, `opt_promote_scalar_locals`, and - `opt_addr_of_global_cse`. -- Split allocation remains deferred for OPTv2 v1. 
The shared O1/O2 lowering - pipeline currently uses the non-splitting allocation path while the MIR - boundary is hardened. - -Validation status for this slice: - -- Passing (reconfirmed 2026-05-25, 698 checks): `make test-opt - HOST_OPTFLAGS=-O0` -- Passing (reconfirmed 2026-05-25, 698 checks): `make test-opt - HOST_OPTFLAGS=-O1` -- Passing (reconfirmed 2026-05-25, 6/6): `make test-driver-ar - HOST_OPTFLAGS=-O1` -- Passing (reconfirmed 2026-05-25, 955/955): `make test-toy - HOST_OPTFLAGS=-O1` -- Passing (reconfirmed 2026-05-25, 3712/3712): `make test-parse-ok - HOST_OPTFLAGS=-O1` -- Passing (reconfirmed 2026-05-25, clean `build/bootstrap`, bootstrapped toy - 955/955): `make bootstrap-test-toy HOST_OPTFLAGS=-O0` -- Passing (reconfirmed 2026-05-25, clean `build/bootstrap`, bootstrapped toy - 955/955): `make bootstrap-test-toy HOST_OPTFLAGS=-O1` -- Passing (reconfirmed 2026-05-25, 50/50 OK): `CFREE_OPT_BENCH_LEVELS="0 1" - CFREE_OPT_BENCHES="array matrix hash hash2 sieve" make bench-opt` - -Current O1 pipeline status: - -- Enabled: `build_cfg`, `jump_cleanup`, `simplify_local`, `machinize`, - O1 HIR address folds, `build_loop_tree`, `live_blocks`, - `dead_def_elim`, simple regalloc to `Func.preg_locs`, HIR-to-MIR lowering, - MIR verification, MIR post-RA combine, MIR DCE, MIR CFG/layout cleanup, MIR - emission, and known-frame replay. -- Enabled for this slice: post-allocation consumers operate on the MIR view. - HIR remains virtual after allocation; the legacy rewritten-HIR path is kept - only for existing pass tests and `opt_regalloc` callers. -- Deferred: split allocation and O2-specific allocation quality work. - -Recent O1 bootstrap fixes: - -- Fixed post-RA `try_sink` retargeting across call barriers. A call barrier - does not prove a callee-saved physical register is dead, so combine now - rejects retargeting when the source hard register is used after a call before - a real redefinition. 
This fixed the O1 miscompile of - `link_macho.c:build_codesig_skeleton`, where `sb = out->data` was lost after - the first `wr_u32_be` call and later code-signature writes corrupted the - Mach-O link state. -- Fixed post-RA combine last-def repair when `try_sink` retargets a producer - away from a physical register that had an earlier reaching definition. This - removed the `coff_read_dso.c` stage3 crash in `init_struct_fields`. -- Blocked copy substitution into store-address operands. Load address - substitution remains enabled, but memory-writing operands now keep their - original address base/index through this rewrite. -- Split and simplified `opt_machinize` register collection so the self-hosted - O1 compiler no longer corrupts `opt_callee_saved` after the - `callee_save_mask` callback. -- Avoided direct struct-return and returned-pointer array stores in the ar and - runtime driver paths that exposed the same store-address corruption class. -- Fixed replayed call-plan outgoing stack reservation on aarch64, x64, and - rv64. Call plans now update the backend `max_outgoing` area and account for - `max(mem.size, 8)` per stack move, which prevents outgoing argument stores - from overwriting callee-saved slots. The O1 bootstrap `env.c` failure was a - symptom of this class: a replayed call to `obj_symbol_ex` clobbered the saved - `x19` slot in `cfree_cg_decl`, and the same overlap pattern could corrupt - parser stack frames. -- Replay hard-register collection now maps virtual PRegs through - `Func.preg_locs` and keeps a raw-register fallback for operands that are - already physical. -- Known-frame replay now uses the descriptor path after MIR lowering. Native - backends receive frame slots, outgoing stack area size, alloca/call flags, - and callee-save reservations before body emission. - -Acceptance criteria before marking OPTv2 O1 complete: - -- Done: `make bootstrap-test-toy HOST_OPTFLAGS=-O1` passes on a clean - `build/bootstrap`. 
-- Done: O1 still emits toy tests correctly with the bootstrapped compiler. -- Done: `CFREE_OPT_BENCH_LEVELS="0 1" CFREE_OPT_BENCHES="array matrix hash - hash2 sieve" make bench-opt` shows 50/50 OK rows. In the current summary - (`build/bench/opt/summary.md`), `cfree O1` averages `68.021 ms` compile time - and `4198.782 ms` runtime on the 5-case full-pipeline scope. That is `2.08x` - faster to compile than `MIR O1` and `1.19x` faster at runtime; versus - `gcc-15 O0`, it is `5.72x` faster to compile and `2.00x` faster at runtime. -- Done for the O1 structural slice: allocation placement is separate from HIR, - MIR lowering/emission is active, known-frame replay is active, and the - optional O1 HIR folds are re-enabled. - -## Goals - -- Preserve O1 compile speed. `doc/OPT_PERF.md` shows cfree O1 is already in - the same backend-time class as MIR and materially faster than gcc/clang - whole-pipeline builds. OPTv2 must keep dense arrays, arena allocation, and - linear scans where they are currently winning. -- Make representation invariants local and enforceable. A pass should not - need to know from context whether `Operand.v.reg` names a PReg, Val, or - physical register. -- Keep frontends and targets behind their existing ownership boundaries. - Frontends still emit CG. Backends still own instruction encoding and ABI - prologue/epilogue details. The optimizer owns only the recorded IR, - allocation, and lowering handoff. -- Support O1 first. O2 SSA and inlining are out of scope for OPTv2 v1. If the - boundary changes make O2 temporarily awkward, disable O2 or route it through - the O1 lowering path until it can be ported deliberately. - -## Current Problem - -The current O1 pipeline records CG operations into `Func`/`Block`/`Inst`, runs -liveness and allocation on PRegs, then rewrites operands in place so replay can -emit target code. - -That is efficient, but it gives one field multiple meanings: - -- Before register SSA, `OPK_REG` carries a PReg. 
-- In SSA mode, `OPK_REG` carries a Val. -- After O1 rewrite, the same `OPK_REG` usually carries a physical register. - -This phase-dependent convention is brittle. A post-RA pass can accidentally -treat a physical register as a PReg. Replay can see stale virtual operands. -`IR_PARAM_DECL` has also existed in two shapes: as a def-only marker and as an -instruction with a synthetic self operand. Both are easy mistakes because the -data model permits them. - -OPTv2 fixes this by giving each phase its own representation. - -## Representations - -### HIR: Recorded Optimizer IR - -HIR is the function shape recorded by `opt_cgtarget`. - -- `Func`, `Block`, and `Inst` remain arena-backed. -- HIR operands are virtual. `OPK_REG` means PReg before SSA and Val during SSA. -- HIR never stores physical registers in ordinary operands. -- `IR_PARAM_DECL` is def-only: - -```c -in->op = IR_PARAM_DECL; -in->def = param_preg; -in->type = param_type; -in->nopnds = 0; -in->opnds = NULL; -``` - -- Parameter and local storage remain in `IRParam` and `IRLocal`. -- ABI call/return data remains structured in call/return aux records. - -HIR is the only representation consumed by pre-allocation passes such as CFG -cleanup, local simplification, machinization, liveness, and allocation. - -### Allocation Map - -Allocation results should be stored in one canonical location table, separate -from HIR operands. - -```c -typedef enum OptLocKind { - OPT_LOC_NONE, - OPT_LOC_HARD, - OPT_LOC_SPILL, - OPT_LOC_SPLIT, -} OptLocKind; - -typedef struct OptLocation { - u8 kind; - u8 cls; - union { - Reg hard; - FrameSlot spill; - u32 first_segment; - } v; -} OptLocation; -``` - -`Func` owns `OptLocation* preg_locs`, indexed by PReg. `OptPRegInfo` can keep -frequency, cost, live-range, and constraint metadata, but final placement -should have one source of truth. - -Current code uses `OptLoc`, `OPT_LOC_STACK`, and `Func.preg_info` placement -fields. 
The migration can either keep those names initially or rename them to -the spelling above, but it should not do both semantic and naming churn in the -same patch. The important boundary is one canonical PReg-to-location table that -lowering reads and HIR operands do not encode. - -When split allocation is reintroduced, keep it segment-based: - -```c -typedef struct OptSplitSegment { - PReg preg; - u32 block; - u32 start_point; - u32 end_point; - u8 loc_kind; - u8 cls; - Reg hard; - FrameSlot spill; - u8 reload_at_start; - u8 store_at_end; - u32 next; -} OptSplitSegment; -``` - -This keeps today's dense, point-indexed allocator shape from `doc/OPT_PERF.md` -while making the allocation result explicit. - -### MIR: Physical Lowered IR - -MIR is the output of O1 lowering after allocation. Replay should consume MIR, -not rewritten HIR. - -```c -typedef enum MOperandKind { - MOP_IMM, - MOP_PHYS_REG, - MOP_FRAME, - MOP_GLOBAL, - MOP_INDIRECT, -} MOperandKind; - -typedef struct MOperand { - u8 kind; - u8 cls; - CfreeCgTypeId type; - union { - i64 imm; - Reg phys; - FrameSlot frame; - CGGlobalRef global; - struct { - Reg base; - Reg index; /* REG_NONE when absent */ - i32 ofs; - u8 log2_scale; - } ind; - } v; -} MOperand; -``` - -MIR should use inline operands for the common case: - -```c -typedef struct MInst { - u16 op; - u16 nops; - SrcLoc loc; - MOperand small[3]; - MOperand* extra_ops; - union { - i64 imm; - MemAccess mem; - void* aux; - } extra; -} MInst; -``` - -Most instructions have at most three operands, so this is at least as cache -friendly as today's per-instruction operand arrays and often better. Large -operand lists still spill to arena storage. - -`MFunc` owns MIR blocks in emit order. CFG metadata can be omitted unless a -future MIR pass needs it; replay only needs layout order and labels. 
- -## Pipeline Shape - -O1 becomes: - -```text -frontend -> CG -> opt_cgtarget records HIR - -func_end: - frame-home address-taken reg locals - build CFG on HIR - jump cleanup (CFG mode) - rebuild CFG - simplify local HIR - verify HIR - machinize HIR - verify machinized HIR - loop tree - liveness over PRegs - dead-def elimination over HIR - register allocation -> OptLocation table - verify allocation/lowered state - lower HIR + OptLocation -> MIR - post-RA MIR combine - verify MIR/combine state - MIR DCE - verify MIR/DCE state - jump cleanup (CFG mode) - rebuild CFG - verify post-RA CFG - MIR jump/layout cleanup - emit MIR to wrapped target -``` - -O2 is deferred: - -```text -frontend -> CG -> opt_cgtarget records HIR - -finalize: - for OPTv2 v1, either disable O2 or lower retained functions through the O1 - path without running the SSA cleanup schedule -``` - -The important change is that O1 rewrite no longer mutates HIR. It builds MIR. -O2 can later reuse the HIR allocation and MIR lowering boundary after the O1 -bootstrap path is correct and measured. - -## Current O1 Source Order - -The source of truth is `src/opt/opt.c`, not `doc/OPT.md`. As of this design, -`w_func_end` first calls `opt_frame_home_addr_taken_locals`, then -`opt_run_o1_pipeline`, which is `opt_run_lowering_pipeline(..., -allow_live_range_split = 0)`. - -The current O1 order in code is: - -1. `opt_frame_home_addr_taken_locals` -2. `opt_build_cfg` -3. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_CFG)` -4. `opt_build_cfg` -5. `opt_simplify_local` -6. `opt_verify("lowering-cfg")` -7. `opt_machinize` -8. `opt_verify("lowering-machinize")` -9. `opt_build_loop_tree` -10. `opt_live_blocks` -11. `opt_dead_def_elim_with_live` -12. `opt_regalloc(..., allow_live_range_split = 0)` -13. `opt_verify("post-regalloc-rewrite")` -14. `opt_combine` -15. `opt_verify("post-ra-combine")` -16. `opt_dce` -17. `opt_verify("post-ra-dce")` -18. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_CFG)` -19. `opt_build_cfg` -20. 
`opt_verify("post-ra-jump-cfg")` -21. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_LAYOUT)` -22. `opt_emit` - -The following PReg-level canonicalization passes exist in `src/opt/pass_o2.c` -but are not currently called by O1: - -- `opt_addr_xform_pregs` -- `opt_promote_scalar_locals` -- `opt_addr_of_global_cse` - -They are treated below as optional future O1 HIR folds, not as active O1 -passes. - -## Invariants - -### HIR Invariants - -- `IR_PARAM_DECL` is def-only and has no operands. -- HIR `OPK_REG` operands are never physical registers. -- Before register SSA, HIR `OPK_REG` ids are PRegs in `[1, f->npregs)`. -- During register SSA, HIR `OPK_REG` ids are Vals in `[1, f->nvals)`. -- `IRParam.storage` and `IRLocal.storage` reference valid PRegs or valid frame - slots. -- `OPK_LOCAL` frame slots are valid and refer to the expected slot class when a - pass requires that distinction. -- Aux operands in calls, returns, inline asm, aggregates, atomics, and - intrinsics obey the same namespace rules as normal operands. - -### Allocation Invariants - -- Every live PReg has exactly one `OptLocation`. -- `OPT_LOC_HARD` has a valid physical register of the matching class. -- `OPT_LOC_SPILL` references an `FS_SPILL` frame slot. -- `OPT_LOC_SPLIT` has at least one valid segment. -- Split segments are ordered, non-overlapping per PReg, and cover every point - where the PReg is used or defined. -- Two simultaneously live PRegs cannot share a hard register or spill slot - unless an explicit coalescing proof says they are the same value. -- Fixed-register and inline-asm constraints are represented in allocation - inputs, not patched in after allocation. - -### MIR Invariants - -- MIR contains no PRegs or Vals. -- Every register operand is `MOP_PHYS_REG` or part of `MOP_INDIRECT` and is a - valid physical register. -- Every frame operand references a valid frame slot. 
-- Call plans are fully physical: argument sources, return destinations, and - ABI register destinations are valid MIR operands. -- MIR replay never calls PReg allocation helpers. - -## O1 Pass Porting Plan - -This section describes each active O1 pass from `src/opt/opt.c` and how it -ports to OPTv2. - -### `opt_frame_home_addr_taken_locals` - -Keep as a HIR preparation pass. - -This pass runs before the O1 lowering pipeline. It materializes frame homes for -address-taken locals that otherwise live in PRegs, then inserts loads/stores -around uses/defs so taking the address observes the frame slot. In OPTv2 it -should remain pre-allocation HIR because it changes local storage and virtual -load/store structure. - -Verifier additions: - -- address-taken register locals with a home slot have valid frame homes -- inserted loads/stores reference the local's home slot -- the pass does not introduce physical registers - -### `opt_build_cfg` - -Keep as a HIR pass. - -CFG construction should continue to derive edges from HIR terminators and -scope/control instructions. It should not inspect allocation state. The output -is HIR block predecessor/successor lists plus `emit_order`. - -Verifier additions: - -- terminator successor count -- reciprocal predecessor/successor links -- no connected unreachable blocks -- every `emit_order` block is valid and unique - -### `opt_jump_cleanup` - -Split into three uses matching the code. - -The early CFG cleanup remains a HIR pass. It canonicalizes branches and removes -empty or unreachable control-flow artifacts before liveness. The post-RA CFG -cleanup should move to MIR once post-RA code is represented as MIR. The final -layout cleanup should also become a MIR-layout pass. - -Once MIR exists, fallthrough and branch deletion should operate on physical -emitted blocks so it cannot accidentally rewrite virtual operands after -allocation. 
- -Performance note: `doc/OPT_PERF.md` attributes part of O1's current runtime win -to block-layout fallthrough cleanup. OPTv2 must preserve the greedy -emit-order chain behavior and measure it with `make bench-opt`. - -### `opt_simplify_local` - -Keep as a HIR pass. - -This pass performs algebraic and addressing simplifications that are valid -before allocation. It should continue to rewrite virtual operands and HIR -instructions. Any rewrite that changes use/def structure must maintain or -invalidate def-use in the current model, and later should use the incremental -SSA/use-edge model described in `doc/SSA2.md`. - -### `opt_machinize` - -Keep as a HIR pass, but make its contract sharper. - -Machinization should express target constraints without converting operands to -physical registers. Examples: - -- required register class -- forbidden physical register masks -- fixed-register constraints for inline asm or ABI-specific operations -- target legality flags for addressing forms -- call clobber and hard-live metadata - -It should not write physical registers into HIR operands. If a target decision -requires a physical register, it becomes an allocation constraint. - -### Optional Future O1 Address Folds - -`opt_addr_xform_pregs`, scalar local promotion, and global address CSE are HIR -canonicalization passes present in the codebase, but they are not active in the -current O1 source order. If re-enabled, they should run before liveness. - -Porting rule: - -- address-of-local/global folds rewrite HIR operands only -- scalar promotion updates `IRLocal`/frame-slot state and HIR loads/stores only -- neither pass may depend on post-allocation physical registers - -These passes are optional for correctness. Re-enable them one at a time under -HIR verification and benchmark with the `O1 Gap Analysis` workflow in -`doc/OPT_PERF.md`. - -### `opt_build_loop_tree` - -Keep as a HIR analysis. - -The loop tree is built from HIR CFG after cleanup. 
It feeds liveness metrics and -future O1 pressure models. No MIR dependency is needed for O1. - -### `opt_live_blocks` - -Keep as a HIR analysis over PRegs, but make it MIR-shaped. - -MIR does not use a true sparse bitmap here. Its `bitmap_t` is a growable vector -of 64-bit words that expands on demand and truncates trailing zero words after -set operations. cfree already has the most important local behavior through -`OptBitset.active_words`; OPTv2 should lean into that and match MIR's model -more directly: - -- keep 64-bit word bitmaps -- keep active/trailing-zero trimming on every mutating bitmap operation -- allow bitmaps to grow to the highest touched PReg instead of eagerly sizing - every block bitmap to `opt_reg_count` -- use a worklist dataflow solver ordered by reverse postorder/postorder, - matching MIR's `solve_dataflow` -- keep optional scan-var remapping available for special analyses, but use - direct PReg ids for O1 RA - -The dataflow equation stays the same: - -```text -live_out = union(successor live_in) -live_in = live_use | (live_out & ~live_def) -``` - -This preserves the current O1 semantics while stealing MIR's allocation and -iteration shape. Do not switch to chunked sparse sets unless benchmark data -shows the elastic dense bitmap is a real bottleneck. - -### `opt_dead_def_elim_with_live` - -Keep as a HIR pass. - -It removes dead virtual definitions before allocation. This should stay before -MIR lowering because it reduces allocation pressure. The pass must treat -`IR_PARAM_DECL` as a def-only value source and preserve all side-effecting -operations. - -### `opt_regalloc` - -Keep the allocator algorithm shape, change the output. In current O1, -`allow_live_range_split` is `0`. - -The allocator should consume HIR liveness and produce: - -- `OptLocation[preg]` -- spill frame slots -- split segments when enabled -- scratch register lists -- call-save/call-restore requirements - -It should not rewrite HIR operands. 
- -The point-indexed occupancy bitset design should be preserved and kept close to -MIR. `doc/OPT_PERF.md` records a large O1 compile-time win from moving to -MIR-shaped point occupancy; OPTv2 should keep that structure and only move the -final assignment out of `OptPRegInfo` into a canonical location table. - -The live-range builder should also keep matching MIR: - -- build ranges by walking each block backward from `live_out` -- model whole-block spans separately from instruction-local ranges -- advance raw points only when live ranges are born or die -- compress raw points to a dense point id space before allocation -- feed `used_locs[point]` occupancy bitmaps from compressed points - -### O1 Lowering: HIR + Allocation -> MIR - -This replaces the current in-place rewrite performed during/after -`opt_regalloc`. - -For each HIR instruction: - -- translate every virtual use through `OptLocation` -- insert reloads before instructions that use spilled values -- insert stores after instructions that define spilled values -- materialize split reload/store edges -- materialize call-save and call-restore code -- lower call plans into physical argument and return moves -- emit a physical `MInst` sequence - -This is where all PReg-to-physical knowledge belongs. After this pass, HIR is -unchanged and MIR is fully physical. - -### `opt_combine` - -Move post-RA combine to MIR. - -The current post-allocation combiner is valuable and should not be discarded. -`doc/OPT_PERF.md` credits O1 gains to operand-driven combine, live-range-scoped -use counting, address-mode synthesis, and extension folding. Those algorithms -should port directly, but their input should be MIR physical operands. 
- -MIR combine can be simpler because: - -- register operands are always physical -- indirect operands never hide PRegs -- use counts can be local to MIR blocks -- no pass can accidentally consult PReg allocation state through a physical - operand - -### `opt_dce` - -Split into HIR DDE and MIR DCE. - -The pre-allocation dead-def elimination remains HIR-based. The post-combine DCE -should become MIR-based and delete physical instructions whose definitions are -unused and side-effect-free. It should not reason about PRegs. - -`doc/OPT_PERF.md` notes possible overlap between pre-allocation DDE and -post-allocation DCE. OPTv2 should keep both initially for behavior parity, then -benchmark whether MIR DCE can be narrowed. - -### Post-RA CFG Rebuild and Layout Cleanup - -Move these to MIR once post-RA code is physical MIR. - -The current code runs a CFG-mode jump cleanup, rebuilds CFG, verifies, then runs -layout-mode jump cleanup. OPTv2 should preserve that ordering, but apply it to -MIR after combine and DCE. The CFG-mode pass removes/control-simplifies blocks; -the layout-mode pass optimizes fallthroughs in emit order. - -### `opt_emit` - -Replace replay of rewritten HIR with MIR emission. - -`opt_emit_mir` should: - -- open the function on the wrapped target -- replay params and known frame slots from HIR function metadata -- emit MIR blocks in layout order -- reserve physical hard registers from MIR use collection -- close the function - -It should not inspect `OptLocation` except for parameter storage translation -where the backend prologue needs the final home location. That translation -should be explicit and verifier-covered. 
- -## Verification - -OPTv2 should keep debug-only verification cheap and frequent: - -- HIR verifier after CFG cleanup and machinization -- allocation verifier immediately after `opt_regalloc` -- MIR verifier immediately after HIR-to-MIR lowering -- MIR verifier after combine, DCE, and layout cleanup - -The verifier should fail at the phase boundary closest to the bug. For example, -an operand-backed `IR_PARAM_DECL` should fail during HIR verification, not -during replay. A stale PReg in MIR should fail during MIR verification, not in -the backend encoder. - -## Performance Rules - -`doc/OPT_PERF.md` is the performance contract for this design. - -Preserve: - -- arena-backed function-local allocation -- dense PReg-indexed arrays for hot per-value data -- point-indexed allocator occupancy bitsets -- operand-driven combiner dispatch -- block-layout fallthrough cleanup -- no SSA construction at O1 -- no coalescing or live-range splitting at O1 unless benchmarks justify it - -Avoid: - -- per-operand heap allocation in MIR -- whole-function rescans in inner loops -- rebuilding def-use or liveness after every small post-RA rewrite -- adding abstraction layers in hot loops where a dense indexed array is enough - -Benchmark gates: - -- `make bench-opt` -- focused O1 comparison: - -```sh -CFREE_OPT_BENCH_LEVELS=1 make bench-opt -``` - -- targeted regressions from `doc/OPT_PERF.md`: - -```sh -CFREE_OPT_BENCHES="array matrix hash hash2 sieve" \ -CFREE_OPT_BENCH_LEVELS=1 make bench-opt -``` - -Acceptance for the initial OPTv2 O1 port should be no material regression in -the backend-only O1 geomean and no bootstrap regression. - -## Migration Plan - -1. Lock down HIR invariants. - - Keep the current debug verifier strict: `IR_PARAM_DECL` is def-only, HIR - operands are virtual, params/locals reference valid storage, and aux operands - use the correct namespace. - -2. Make liveness and ranges MIR-shaped. 
- - Keep `OptBitset` as a 64-bit-word bitmap, but make it elastic like MIR: - grow to the highest touched PReg, trim trailing zero words after operations, - and run liveness through a postorder/reverse-postorder worklist solver. Keep - the existing compressed point range builder and tighten it to MIR's - born/dead-boundary model. - -3. Introduce `OptLocation`. - - Add the location table while still mirroring existing `OptPRegInfo` - allocation fields. Convert users one at a time, then remove duplicate final - placement fields. - -4. Add MIR data structures. - - Add `MOperand`, `MInst`, `MBlock`, and `MFunc` with inline small operands. - Add a MIR verifier before any pass consumes MIR. - -5. Port rewrite into `opt_lower_to_mir`. - - Keep the current reload/store/call-save algorithms, but write MIR instead of - mutating HIR. HIR should remain valid after the pass. - -6. Port emission. - - Add `opt_emit_mir` and make O1 emit from MIR. Keep the old replay path - temporarily for comparison, then delete it once bootstrap and test-toy are - clean. - -7. Port post-RA combine and DCE. - - Move the profitable O1 combines to MIR. Preserve the measured behaviors from - `doc/OPT_PERF.md` before adding new combines. - -8. Re-enable optional HIR O1 folds. - - Re-enable address folds and scalar promotion only after HIR/MIR boundaries - are stable. Each pass should have focused tests and an O1 benchmark check. - -9. Defer O2. - - For OPTv2 v1, keep the O2 implementation out of the critical path. Either - disable it or make it lower through the O1 path until O1 bootstrap and - `test-toy` are clean with the new HIR/MIR boundary. - -## V1 Decisions - -OPTv2 v1 is an O1 redesign. These choices favor bootstrap correctness and a -small migration surface over solving O2 quality problems at the same time. - -### MIR CFG Shape - -Store layout order and successor labels in MIR v1, but do not build full -predecessor lists by default. 
- -O1 needs: - -- physical block order for emission -- terminator targets for jump cleanup -- enough successor information for CFG-mode cleanup after combine/DCE - -It does not need dominance, loop analysis, SSA repair, or predecessor-driven -phi updates after lowering. If a MIR pass needs predecessors later, build them -as a derived scratch analysis from terminators, mirroring today's -`opt_build_cfg` rather than storing and maintaining them eagerly. - -Tradeoff: a derived CFG costs one linear pass when needed, but avoids making -every MIR rewrite responsible for keeping predecessor lists coherent. - -### Parameter Prologue Materialization - -Keep parameter prologue materialization backend-owned through `target->param` -for v1. - -The native backends already encode ABI-specific incoming-register and -incoming-stack handling in `x_param`, `aa_param`, and `rv_param`. Moving that -into MIR immediately would duplicate ABI lowering and frame-layout logic at the -same time as the HIR/MIR split. That is too much churn for the bootstrap path. - -MIR v1 should instead verify the boundary explicitly: - -- each parameter has a valid final storage location -- register-backed parameters map to physical registers or are marked unused -- frame-backed parameters have known-frame slot mappings when using - `func_begin_known_frame` -- `target->param` is called before MIR body emission - -Later, if we want prologue instructions visible to MIR combine/DCE, add an -explicit `MIR_PROLOGUE_PARAM` lowering step after O1 is stable. - -### Split Allocation - -Defer split allocation for O1 v1. - -Current O1 calls `opt_regalloc(..., allow_live_range_split = 0)`, and -`doc/OPT_PERF.md` shows O1 is already close to MIR compile time while the -bootstrap bug pressure is correctness, not spill quality. OPTv2 should first -move today's non-splitting allocator result into `OptLocation[preg]` and lower -that to MIR. 
- -Keep `OPT_LOC_SPLIT` and segment terminology in the design only as a -compatibility slot for later O2 or O1 quality work. Do not require split -segments, edge materialization, or split verification for the first O1 port. - -Tradeoff: this preserves today's O1 quality profile and avoids porting the -most complex allocator path while the representation boundary is still moving. - -### Liveness Bitsets - -Match MIR's elastic dense bitmap model for v1. - -MIR's liveness bitmap is not a sparse set. It is a growable `uint64_t` vector -that stores only words up to the current highest nonzero word. cfree should use -the same shape: dense inside the active prefix, trimmed at the end, and iterated -by scanning nonzero words. - -The current `OptBitset.active_words` design is already close. The remaining v1 -changes should be: - -- stop allocating every block bitmap to full `opt_reg_count` size up front -- grow bitmaps on demand -- keep bitmap operations trimming trailing zero words -- switch liveness convergence to a MIR-like worklist solver -- keep compressed live-range points as allocator input - -Tradeoff: this is still dense within the active prefix, so pathological high -PReg ids can cost dense scans. That is acceptable because it matches MIR's -successful O1 shape and avoids a separate sparse-set design. - -## Implementation Handoff - -The implementation should proceed in small behavior-preserving slices. Do not -start by introducing MIR. First make the O1 analyses match MIR closely while -the existing rewritten-HIR backend path still runs. 
- -### Slice 1: Elastic Bitmaps and Worklist Liveness - -Update `OptBitset`/`opt_live_blocks` before touching allocation output: - -- add an internal grow helper for `OptBitset` -- make `opt_bitset_set` grow on demand -- keep all mutating operations trimming trailing zero words -- keep the public `opt_bitset_has` behavior unchanged -- replace full reverse block sweeps with a MIR-like worklist solver -- preserve the current `live_use | (live_out & ~live_def)` equation - -Targeted tests: - -```sh -make test-opt HOST_OPTFLAGS=-O0 -make test-opt HOST_OPTFLAGS=-O1 -``` - -The existing live tests around `opt_live_blocks` and -`opt_live_ranges_phase2` should stay green. Add focused tests for high-numbered -PRegs so on-demand growth and trailing-zero trimming are covered. - -### Slice 2: Tighten MIR-Style Live Ranges - -Keep `opt_live_ranges_build` compatible with current callers, but align the -details with MIR: - -- start from each block's `live_out` -- walk instructions backward -- open ranges on uses and close ranges on defs -- represent whole-block spans separately with `whole_block` -- advance raw points only when a range is born or dies -- compress raw points before allocation - -Targeted checks: - -```sh -make test-opt HOST_OPTFLAGS=-O1 -CFREE_OPT_BENCH_LEVELS=1 CFREE_OPT_BENCHES="array matrix hash hash2 sieve" make bench-opt -``` - -### Slice 3: Canonical Allocation Locations - -Only after liveness/ranges are stable, introduce the single allocation table. -Keep existing allocator heuristics and O1 `allow_live_range_split = 0`. - -Rules: - -- allocation writes one canonical PReg location table -- HIR operands remain virtual after allocation -- any temporary mirroring into `OptPRegInfo` is transitional and should have a - deletion point -- verifier checks that post-allocation HIR still has virtual operands - -### Slice 4: MIR Lowering and Emission - -Then add MIR and move the in-place rewrite into `opt_lower_to_mir`. 
- -Keep v1 conservative: - -- backend-owned parameter prologue through `target->param` -- no O1 live-range splitting -- no O2 dependency -- old rewritten-HIR emit path may remain temporarily for comparison - -Acceptance: - -```sh -make bootstrap HOST_OPTFLAGS=-O0 -make bootstrap-test-toy HOST_OPTFLAGS=-O0 -make bootstrap HOST_OPTFLAGS=-O1 -make bootstrap-test-toy HOST_OPTFLAGS=-O1 -``` - -If O2 blocks progress, disable it or route it through the O1 lowering path as -described above. diff --git a/doc/PERCALL.md b/doc/PERCALL.md @@ -1,138 +0,0 @@ -# cfree -O1 per-call overhead (aarch64) - -The fixed cost cfree `-O1` pays around *every* function call — prologue, -epilogue, and call-site argument setup — independent of the function body. On -call-heavy workloads (binary-trees: ~7.6M calls at depth 19) this dominates. The -prologue/epilogue fold described here (`fp_at_bottom`) is now implemented, taking -the common per-call overhead from 7 to the optimal 5 fixed insns. The function -*bodies* are already at or ahead of gcc -O0 (cfree keeps recurring values in -callee-saves where gcc -O0 spills and reloads); see -[OPT_O1_PERF_TODO.md](OPT_O1_PERF_TODO.md) for the benchmark standings. - -Numbers below are from `/tmp/mc/binary-trees.cfree.o` vs `binary-trees.gcc.o` -(gcc-15 -O0), native arm64/Apple. - -## Prologue / epilogue tiers - -cfree picks one of four frame shapes per function. "Fixed insns" excludes the -final `ret` (always required) and counts entry + exit. - -| tier | when | fixed insns | folded? 
| -| --- | --- | ---: | --- | -| Tier A (`slim_prologue`) | no callee-saves, no alloca, no body slots, no outgoing stack | 3 (2 entry + 1 exit) | yes — optimal | -| `fp_at_bottom` | ≥1 callee-save (or body slots), **no outgoing stack args**, frame_size ≤ 504 | **5 (3 entry + 2 exit)** | **yes — optimal** | -| `slim_small_frame` | as above but with outgoing stack args (out_stack > 0), saved-pair offset ≤ 504 | 7 (4 entry + 3 exit) | no | -| fat | large frame, alloca, big saved-pair offset | 7+ | no | - -Tier A (leaf-ish, no callee-saves) and `fp_at_bottom` are both optimal. -`fp_at_bottom` is the common case — any function that keeps a value live across a -call (but doesn't itself pass >8 register-class args to a callee) lands here. The -remaining gap is only `slim_small_frame` (out_stack > 0, comparatively rare). - -### `fp_at_bottom` (the four binary-trees functions) — implemented - -The frame record moves to the bottom of the frame (`fp = sp`), so the sp -adjustment folds into the pre/post-indexed `stp`/`ldp` and callee-saves stack -*above* the record at positive offsets. This matches what gcc -O0 emits. - -Entry (3 insns) — e.g. `NewTreeNode`, two callee-saves: -``` -stp x29, x30, [sp, #-32]! ; one insn: decrement sp AND save fp/lr at the bottom -mov x29, sp ; frame record at the bottom of the frame -stp x20, x19, [x29, #16] ; callee-saves above the record (str for an odd one) -``` -Exit (2 insns + ret): -``` -ldp x20, x19, [x29, #16] -ldp x29, x30, [sp], #32 ; one insn: restore fp/lr AND release sp -ret -``` - -**−2 fixed insns per call** (−1 entry, −1 exit) vs the old top-record layout. -Implemented in `src/arch/aa64/native.c`: the `fp_at_bottom` flag (set in -`aa_func_begin_known_frame`), the layout-aware `aa_fp_off_*` / `aa_cfa_off` -helpers, the prologue/epilogue branches in `aa_build_prologue_words` / -`aa_words_restore_frame`, and the positive-offset callee-save stores in -`aa_words_callee_saves`. 
Only the known-frame (-O1) path takes it — the -frame-size-dependent offsets require the frame to be final before the body. -The DWARF CFA becomes `fp + frame_size` (vs `fp + 16` for the top-record forms). - -`out_stack > 0` functions can't move the record to the absolute bottom (outgoing -args live there), so they keep the top-record `slim_small_frame` layout below. - -#### Old top-record `slim_small_frame` (still used for out_stack > 0) - -Entry (4 insns) / exit (3 insns + ret): `sub sp` + `stp [sp,#N-16]` + -`add x29,sp,#N-16` + callee-saves; reverse on exit. fp points mid-frame above the -callee-saves, which is what blocks the fold. - -(An aside: omitting the frame pointer entirely — packing a callee-save with lr, -e.g. `stp x19, x30, [sp, #-N]!`, no `x29` at all — would save one more insn, but -neither cfree nor gcc -O0 does this; it costs unwind/debug fidelity and is not -planned.) - -## Body-level per-call warts - -Two small constant adders beyond the prologue, both call-shaped: - -1. **Zero through a temp** (item 3) — **FIXED**. `BottomUpTree`'s leaf - `NewTreeNode(NULL, NULL)` used to materialize each null arg via a scratch: - ``` - movz x8, 0x0 ; mov x0, x8 ; movz x8, 0x0 ; mov x1, x8 ; 4 insns - ``` - It now emits the optimal `movz x0, 0x0 ; movz x1, 0x0` (2 insns), **−2** on - that path. The trigger was a *pointer-typed* null (`(void*)0` / `NULL` from - headers, not an integer `0`): the cast inserts an int→ptr `IR_CONVERT` - between the `load_imm` and the call arg. `apply_abi_aliasing_hints` hints the - arg PReg to x0/x1 and `propagate_hint_through_copies` carries it backward — - but only across `IR_COPY`, so the convert broke the chain and its `load_imm` - source landed in the x8 scratch. Fixed by `try_fold_const_convert` in - `pass_combine.c`, which constant-folds `load_imm rT,k ; convert rD,rT` into a - single `load_imm rD,k'` (the now-dead `load_imm` is removed by DCE); the - folded `load_imm` inherits the convert's hinted dst directly. 
Covered by - `test/opt/zero_arg.sh`. - -2. **Redundant branch chain** (item 4). `DeleteTree`'s if/else merge emits - `b A; A: b B` — a branch to a label that just unconditionally - branches on to the real target. **−1** per `DeleteTree` call. - `cleanup_layout_fallthrough_branches` in `pass_jump.c` doesn't thread this - shape. - -## Per-function tally (fixed + warts vs gcc -O0) - -cfree insns are the `fp_at_bottom` numbers (incl. `ret`); the "was" column is the -old top-record layout, each −2 from the prologue fold. - -| function | cfree insns | was | gcc -O0 insns | remaining overhead | -| --- | ---: | ---: | ---: | --- | -| NewTreeNode | 12 | 14 | 16 | — (ahead of gcc) | -| ItemCheck | 18 | 20 | 21 | — (ahead of gcc) | -| BottomUpTree | 21 | 23 | 24 | — (ahead of gcc) | -| DeleteTree | 16 | 18 | 18 | +1 branch (item 4) | - -With the prologue fold all four are now at or ahead of gcc -O0. `DeleteTree`'s -only remaining wart is the item-4 redundant branch chain. - -## Optimal target — reached for the common case - -Per-call fixed overhead: - -- **`fp_at_bottom` (common case):** 5 insns (3 entry + 2 exit) + ret — optimal, - matching gcc -O0. Done. -- **`slim_small_frame` (out_stack > 0 only):** 7 insns (4 entry + 3 exit) + ret — - unchanged; the record can't move to the bottom when outgoing args occupy it. - -Across binary-trees (~7.6M calls) the prologue fold removes ~15M instructions; -combined with the zero-temp fix (item 3) and the pending item-4 branch fix the -total reaches ~30M, flipping `BottomUpTree`/`DeleteTree` ahead of gcc -O0. 
- -## Reproducing - -```sh -SDK="$(xcrun --show-sdk-path)" -SRC=~/tmp/mir/c-benchmarks/binary-trees.c -build/release/cfree cc -O1 -std=c99 --sysroot "$SDK" -c "$SRC" -o /tmp/bt.cfree.o -gcc-15 -O0 -c "$SRC" -o /tmp/bt.gcc.o -build/release/cfree objdump -d /tmp/bt.cfree.o -llvm-objdump -d /tmp/bt.gcc.o -``` diff --git a/doc/PROF.md b/doc/PROF.md @@ -1,392 +0,0 @@ -# cfree prof design - -Architecture of `cfree prof`, a sampling CPU profiler for JIT'd code. -Companion to `DBG.md` and `DESIGN.md`. Scope: how the profiler intercepts -the worker thread without disturbing it, how samples are recorded cheaply, -and how PCs are turned into human-readable output after the run. Not a -tutorial; not implementation notes. - -## 1. Goals - -- `prof` multi-call subcommand: compile C sources, JIT-link, run under a - sampling profiler, and emit a profile report. -- Statistical CPU profiling: deliver SIGPROF to the worker at a - configurable rate, capture the current PC and a frame-pointer-based call - stack in the signal handler, and resume the worker without a park/unpark - cycle. -- Post-run symbolication: translate raw PCs to `symbol+offset` and - `file:line` using the JIT image and DWARF, after the guest exits. -- Output in folded-stacks format (Brendan Gregg's `flamegraph.pl` input), - and a flat function-by-self-time text report. -- Reuse `CfreeDbgOs` signal infrastructure from `dbg`; `src/dbg/prof.c` - stays freestanding C11. -- v1 target: aarch64 on macOS and Linux. - -## 2. Non-goals (v1) - -- Heap / allocation profiling. Separate concern; different instrumentation. -- Wall-clock and kernel-stack profiling (`ITIMER_REAL` vs `ITIMER_PROF`). Wall-clock - sampling can be added later; v1 samples on process CPU time and records user-space PCs only. -- Multi-threaded guests. One worker thread per session; per-thread timer - targeting is future work. -- Hardware performance counters (PMU). Pure-software SIGPROF for now. -- Continuous / streaming profile output. One profile file per run. 
-- Instrumented (non-sampling) profiling. Breakpoint-based entry/exit - counting is higher overhead and a different tool. - -## 3. Layout - -``` -include/ - cfree.h on_sample added to CfreeDbgSignalOps; CfreeProfSpec; - cfree_jit_session_prof_attach / prof_collect - -src/ - dbg/ - prof.c sample ring buffer; frame-pointer walk; symbolication - dbg.h CfreeProfSample, CfreeProfBuf (internal) - -driver/ - prof.c new driver entry: parse flags, arm timer, run session, - emit folded-stacks and flat report - env.c SIGPROF added to g_dbg_signos[]; on_sample dispatch - path that returns without parking -``` - -## 4. Dataflow - -``` -setitimer(ITIMER_PROF, rate) - │ - │ SIGPROF → worker thread - ▼ -dbg_signal_handler - ├── is worker thread? no → re-raise (default) - ├── on_sample set? no → park (normal dbg path) - └── yes → dbg_fp_walk(ucontext, buf) → append to ring buffer → return - (worker continues) - │ - │ after session exits - ▼ -cfree_jit_session_prof_collect(session, buf, nsamples) - │ - ▼ -symbolicate: cfree_jit_addr_to_sym + cfree_dwarf_addr_to_line - │ - ├── emit folded stacks → output.folded (flamegraph.pl input) - └── emit flat report → stdout -``` - -The critical property: **the worker is never parked**. SIGPROF fires, -the handler reads the frame pointer chain and appends to a pre-allocated -buffer, and returns. No event signals, no mutex acquires, no park/unpark -handshake. The guest's elapsed time and memory state are unaffected except -for the signal-delivery overhead. - -## 5. Signal handler path - -`dbg_signal_handler` (in `driver/env.c`) currently dispatches every -delivered signal through the park/stop handshake. For profiling a second -path is needed that returns without parking. - -The dispatch condition is: `signo == SIGPROF && on_sample != NULL`. 
- -```c -if (signo == SIGPROF && g_dbg_ops.on_sample) { - g_dbg_ops.on_sample(g_dbg_session, uc); - return; /* worker resumes immediately */ -} -``` - -`on_sample` is a new field in `CfreeDbgSignalOps` (alongside the existing -`on_fault`): - -```c -typedef struct CfreeDbgSignalOps { - /* existing */ - int (*on_fault)(void* session, int signo, CfreeUnwindFrame* frame); - /* new */ - void (*on_sample)(void* session, void* ucontext); -} CfreeDbgSignalOps; -``` - -`on_sample` receives the raw `ucontext_t*` rather than a marshalled -`CfreeUnwindFrame` because it will extract only FP and PC directly — -marshalling all 32 registers would be unnecessary work in the hot path. - -SIGPROF is added to `g_dbg_signos[]` in `driver/env.c` alongside the -existing fault signals. The `sa_mask` for the handler already blocks the -cohort of debugger signals during delivery; SIGPROF joins that mask so it -doesn't recurse. - -## 6. Frame-pointer walk - -The signal handler calls `dbg_fp_walk(ucontext, sample)` in -`src/dbg/prof.c`. This is the only code that runs on the signal-delivery -path outside of the existing dispatcher boilerplate. - -Contract: -- No heap allocation. Writes into a caller-supplied `CfreeProfSample`. -- No DWARF, no symbol lookup. Raw PCs only. -- Max depth is a compile-time constant `PROF_MAX_DEPTH` (default 64). -- Terminates when: FP is NULL, FP is misaligned, FP is outside the JIT - image address range, or depth limit is reached. -- Uses `dbg_os->guarded_copy` for each dereference to tolerate a corrupt - or partially-initialised frame chain. - -AArch64 frame layout (standard AAPCS64 with frame pointer enabled): - -``` -[FP + 0] → saved FP of caller -[FP + 8] → saved LR (return address = PC of call site in caller) -``` - -PC at the point of the sample is taken directly from `ucontext.__ss.__pc` -(macOS) or `mcontext.pc` (Linux). FP comes from register x29. 
- -Walk pseudocode: - -```c -void dbg_fp_walk(void* uc, CfreeProfSample* s) { - uint64_t pc = uc_get_pc(uc); - uint64_t fp = uc_get_fp(uc); - s->pcs[s->nframes++] = pc; - while (fp && s->nframes < PROF_MAX_DEPTH) { - uint64_t saved_fp, saved_lr; - if (dbg_os->guarded_copy(&saved_fp, (void*)fp, 8)) break; - if (dbg_os->guarded_copy(&saved_lr, (void*)(fp+8), 8)) break; - s->pcs[s->nframes++] = saved_lr; - if (!saved_fp || saved_fp <= fp) break; /* no backward progress */ - fp = saved_fp; - } -} -``` - -cfree controls codegen and ensures `-fno-omit-frame-pointer` is the -default (or forced for prof mode). Frames from the host's C runtime above -the JIT entry are recorded as-is; symbolication will simply leave them as -hex if they fall outside the JIT image. - -## 7. Sample buffer - -```c -/* CfreeProfSample — one entry per timer tick */ -typedef struct CfreeProfSample { - uint64_t pcs[PROF_MAX_DEPTH]; - uint32_t nframes; -} CfreeProfSample; - -/* CfreeProfBuf — pre-allocated, fixed-size ring */ -typedef struct CfreeProfBuf { - CfreeProfSample* samples; - uint32_t cap; /* allocated slots */ - uint32_t count; /* samples recorded */ - uint32_t dropped; /* samples dropped when full */ -} CfreeProfBuf; -``` - -`CfreeProfBuf` is allocated by the driver before calling -`cfree_jit_session_prof_attach(session, buf)`. The session stores a -pointer; `on_sample` appends to it. When `count == cap`, the sample is -discarded and `dropped` is incremented (both writes are non-atomic; single -worker thread makes that safe). - -Default capacity: 1 million samples. At 64 PCs × 8 bytes each that is -~512 MB — reduce `PROF_MAX_DEPTH` or the cap for constrained environments. -The driver flags `--depth` and `--cap` control both. - -## 8. Post-run symbolication - -After `cfree_jit_session_call()` returns, the buffer holds raw PCs. 
-Symbolication is a post-processing pass in `src/dbg/prof.c`: - -```c -int cfree_jit_session_prof_collect( - CfreeJitSession* session, - CfreeProfBuf* buf, - CfreeProfWriter* writer); -``` - -`CfreeProfWriter` is a vtable the driver supplies: - -```c -typedef struct CfreeProfWriter { - void (*write_sample)(void* user, const char** syms, uint32_t nframes); - void* user; -} CfreeProfWriter; -``` - -For each sample, `prof_collect`: -1. For each PC in `sample.pcs[]`: - a. `cfree_jit_addr_to_sym(jit, pc, &sym, &off)` — symbol name or - NULL if outside the JIT image. - b. `cfree_dwarf_addr_to_line(dwarf, img_pc, &file, &line)` if DWARF - is attached and the symbol was found. - c. Format: `"sym+0xOFF"` when no line info; `"sym (file:line)"` when - available; `"0xADDR"` when outside the image entirely. -2. Calls `writer->write_sample(user, syms, nframes)` with the - null-terminated string array. - -Symbolication touches no async-signal machinery and may allocate freely. - -## 9. Output format - -### Folded stacks (primary) - -One line per sample. Frames outermost-first (root to leaf), separated by `;`, followed by -a space and the count weight (`1` per sample). This is the canonical input -for `flamegraph.pl`: - -``` -main;compute;inner_loop 1 -main;compute;inner_loop 1 -main;compute 1 -main 1 -``` - -Written to `--output FILE` (default: `prof.folded`). The driver emits -this via a `CfreeProfWriter` that builds strings into a `CfrBuf` and -flushes lines. - -Aggregate identical stacks before writing: sort samples lexicographically -by their frame sequence, then run-length-encode. This reduces output size -significantly for programs with tight hot loops. - -### Flat report (secondary) - -Printed to stdout after the run: - -``` -Samples: 10243 (0 dropped) rate: 1ms - - SELF% CUMUL% FUNCTION - 42.1% 42.1% inner_loop (compute.c:17) - 31.0% 73.1% compute (compute.c:45) - ... -``` - -Self% counts samples where the function is the top (leaf) frame. 
-Cumul% counts samples where it appears anywhere in the stack. - -## 10. Driver interface - -``` -cfree prof [options] [sources/objects] [-- args...] -``` - -Options: - -``` ---rate=MICROSECONDS SIGPROF interval, default 1000 (1 ms) ---depth=N max frames per sample, default 64 ---cap=N sample buffer capacity, default 1000000 ---output=FILE folded-stacks output path, default prof.folded ---no-folded suppress folded-stacks file ---no-flat suppress flat report to stdout -``` - -Input handling mirrors `cfree run`: `.c` / `.o` / `.a` / stdin, compiled -with `-g` forced on so DWARF is always present for symbolication. - -The driver: -1. Allocates `CfreeProfBuf`. -2. Arms `setitimer(ITIMER_PROF, rate)`. -3. Creates session with `on_sample` wired. -4. Calls `cfree_jit_session_call()`. -5. Disarms timer after return. -6. Calls `cfree_jit_session_prof_collect()` to symbolicate and write output. -7. Prints flat report and dropped-sample count. - -## 11. CfreeDbgOs and public API changes - -### `include/cfree.h` - -`CfreeDbgSignalOps` gains `on_sample`: - -```c -typedef struct CfreeDbgSignalOps { - int (*on_fault)(void* session, int signo, CfreeUnwindFrame* frame); - void (*on_sample)(void* session, void* ucontext); /* NEW; NULL = ignore SIGPROF */ -} CfreeDbgSignalOps; -``` - -New public entry points: - -```c -/* Attach a pre-allocated sample buffer; must be called before session_call. */ -void cfree_jit_session_prof_attach(CfreeJitSession*, CfreeProfBuf*); - -/* Symbolicate buffer and call writer once per sample. Safe after session_call. */ -int cfree_jit_session_prof_collect(CfreeJitSession*, CfreeProfBuf*, - CfreeProfWriter*); -``` - -`CfreeProfBuf` and `CfreeProfWriter` are declared in `cfree.h`; their -internals (frame storage, PROF_MAX_DEPTH) live in `src/dbg/prof.c`. - -### `driver/env.c` - -- Add `SIGPROF` to `g_dbg_signos[]`. 
-- In `dbg_signal_handler`: early-out path for `SIGPROF` + non-null - `on_sample` that calls the callback and returns without touching the - park/unpark events. - -No other OS layer changes. `CfreeDbgOs` itself does not need new fields: -timer arming (`setitimer`) and thread targeting (`pthread_kill`) are both -driver-side and do not belong behind the vtable. - -## 12. Checklist - -### Public API — `include/cfree.h` - -- [ ] `on_sample` field added to `CfreeDbgSignalOps` -- [ ] `CfreeProfBuf` and `CfreeProfWriter` structs declared -- [ ] `cfree_jit_session_prof_attach` -- [ ] `cfree_jit_session_prof_collect` - -### Library — `src/dbg/prof.c` - -- [ ] `CfreeProfBuf` alloc/free helpers -- [ ] `dbg_fp_walk(ucontext, sample)` — aarch64 frame-pointer walk using - `guarded_copy`; terminates on NULL/misaligned/non-advancing FP -- [ ] `on_sample` implementation: check capacity, call `dbg_fp_walk`, - append or increment `dropped` -- [ ] `cfree_jit_session_prof_attach` body -- [ ] `cfree_jit_session_prof_collect` body: symbolication loop + - `CfreeProfWriter` dispatch -- [ ] `dbg_fp_walk` x64 variant (frame layout identical; FP = rbp) -- [ ] `dbg_fp_walk` rv64 variant (frame layout identical; FP = s0/x8) - -### Host adapter — `driver/env.c` - -- [ ] `SIGPROF` added to `g_dbg_signos[]` -- [ ] `on_sample` early-return path in `dbg_signal_handler` - -### Driver — `driver/prof.c` - -- [ ] Flag parsing (`--rate`, `--depth`, `--cap`, `--output`, - `--no-folded`, `--no-flat`) -- [ ] `setitimer(ITIMER_PROF, ...)` arm before `session_call`, disarm after -- [ ] `CfreeProfWriter` for folded-stacks output (sort + run-length encode) -- [ ] Flat report: self% / cumul% table printed to stdout -- [ ] Dropped-sample warning when `buf.dropped > 0` -- [ ] Wire into multi-call dispatch in `driver/main.c` - -### Tests - -- [ ] `test/smoke/prof_hello`: run a simple C program under `cfree prof`, - assert `prof.folded` is non-empty, `main` appears in output -- [ ] `test/dbg/fp_walk_aa64`: canned 
aarch64 frame chain (stack buffer - with crafted FP links); assert `dbg_fp_walk` produces expected PC - sequence and terminates correctly on a NULL sentinel -- [ ] `test/dbg/prof_buf_overflow`: fill buffer to capacity, assert - `dropped` increments and count stays at cap - -### Bigger follow-ons - -- [ ] Per-thread timer via `timer_create(CLOCK_THREAD_CPUTIME_ID)` + - `SIGEV_THREAD_ID` (Linux) for reliable delivery in multi-thread guests -- [ ] `ITIMER_REAL` mode (`--wall`) for profiling I/O-bound programs -- [ ] Allocation profiling via breakpoint on the allocator entry with - `CfreeBreakpointSpec.condition` recording stack traces -- [ ] SpeedScope / pprof output formats diff --git a/doc/REGISTRY.md b/doc/REGISTRY.md @@ -1,235 +0,0 @@ -# Build-time configuration and component registries - -This document specifies how cfree's optional components — backend -architectures, object/image formats, and language frontends — are gated -at build time and registered at runtime. The goal is a single -`config.h` that selects which components are compiled into `libcfree.a` -(and the `cfree` driver), with each axis flowing through a vtable + -registry pair so dispatch sites collapse to a single lookup. - -## Configuration axes - -Three independent axes, plus one derived axis: - -1. **Backend architectures** — `aa64`, `rv64`, `x64`, `c_target` -2. **Object/image formats** — `ELF`, `Mach-O`, `PE/COFF` -3. **Language frontends** — `C`, `Toy`, `WASM` (`asm` is unconditional) -4. **ABIs (derived)** — the (arch × format/OS) cross-product. Not user - configurable; pulled in automatically when both sides are enabled. - -The ABI table is the reason we don't expose ABIs as a fourth knob: every -valid ABI maps 1:1 to an (arch, OS-family) pair, where OS family is -implied by the object format (ELF→SysV-style, Mach-O→Apple, -PE/COFF→Windows). Exposing ABIs separately would create mostly-invalid -combinations and offer no real flexibility. 
- -## `include/cfree/config.h` - -A single header consumed by both `libcfree.a` sources and the driver. -Hand-edited today; a future `configure` script can generate it. - -```c -#ifndef CFREE_CONFIG_H -#define CFREE_CONFIG_H - -/* Backend architectures. */ -#define CFREE_ARCH_AA64_ENABLED 1 -#define CFREE_ARCH_X64_ENABLED 1 -#define CFREE_ARCH_RV64_ENABLED 1 -#define CFREE_ARCH_C_TARGET_ENABLED 1 - -/* Object/image formats. */ -#define CFREE_OBJ_ELF_ENABLED 1 -#define CFREE_OBJ_MACHO_ENABLED 1 -#define CFREE_OBJ_COFF_ENABLED 1 - -/* Language frontends. CFREE_LANG_ASM is unconditional: the assembler - * lives inside libcfree as part of the codegen substrate. */ -#define CFREE_LANG_CPP_ENABLED 1 -#define CFREE_LANG_C_ENABLED 1 -#define CFREE_LANG_TOY_ENABLED 1 -#define CFREE_LANG_WASM_ENABLED 1 - -#endif -``` - -`src/core/config_assert.c` adds `_Static_assert` checks that at least -one arch and at least one obj format are enabled. - -## Axis 1: Backend architectures - -**Status: vtable, registry, and source gating are done.** - -- **Vtable**: `ArchImpl` (`src/arch/arch.h`). Carries `dwarf`/`dbg` - hooks, the `link` arch descriptor, and the codegen/assembler/ - disassembler factories. ABI selection lives under `src/abi/`. -- **Registry**: `src/arch/registry.c` already holds a static - `arch_impls[]` array and exposes `arch_lookup`. -- **Build**: each `src/arch/<name>/` source group is gated by the - matching `CFREE_ARCH_*_ENABLED` flag. - -## Axis 2: Object/image formats - -**Status: vtable, registry, format-arch ops, and directory-based source -gating are done.** - -`emit_elf` / `emit_macho` / `emit_coff` and their read/link paths live -under `src/obj/{elf,macho,coff}/`. Generic call sites reach them through -`ObjFormatImpl` in `src/obj/format.h` and `src/obj/registry.c`. 
- -**Vtable** (`src/obj/format.h`): - -```c -typedef struct ObjFormatImpl { - ObjFmt kind; - CfreeBinFmt bin_fmt; - const char* name; - const char* read_name; - const char* read_dso_name; - - /* Relocatable object emit + read. */ - void (*emit)(Compiler*, ObjBuilder*, Writer*); - ObjBuilder* (*read)(Compiler*, const char* name, const u8* data, size_t len); - - /* DSO header reader for `-lfoo` resolution against .so/.dylib/.dll - * plus Mach-O .tbd handling via obj_format_dso_reader_for_bytes(). */ - ObjBuilder* (*read_dso)(Compiler*, const char* name, const u8*, size_t, - Sym* soname_out); - - /* Link-image emit (executable / shared object). */ - void (*link_emit)(LinkImage*, Writer*); - void (*layout_dyn)(Linker*, LinkImage*); - void (*free_dyn)(LinkImage*); - - /* Format-owned arch mappings and relocation wire encoders. */ - const ObjElfArchOps* (*elf_arch)(CfreeArchKind); - const ObjElfArchOps* (*elf_machine)(u32 e_machine); - const ObjMachoArchOps* (*macho_arch)(CfreeArchKind); - const ObjMachoArchOps* (*macho_cputype)(u32 cputype); - const ObjCoffArchOps* (*coff_arch)(CfreeArchKind); - const ObjCoffArchOps* (*coff_machine)(u16 machine); - - /* Optional format-specific linker input policy. 
*/ - int (*classify_obj_input)(Compiler*, ObjBuilder*, Sym* soname_out); - Sym (*archive_hint)(Compiler*, const char* archive_name); - ObjFormatArchiveAction (*archive_member)(Compiler*, - const ObjFormatArchiveMember*, - ObjBuilder** out); -} ObjFormatImpl; - -const ObjFormatImpl* obj_format_lookup(ObjFmt); -const ObjFormatImpl* obj_format_lookup_bin(CfreeBinFmt); -``` - -**Registry** (`src/obj/registry.c`), gated the same way as the arch -registry: - -```c -static const ObjFormatImpl* const obj_format_impls[] = { -#if CFREE_OBJ_ELF_ENABLED - &obj_format_impl_elf, -#endif -#if CFREE_OBJ_COFF_ENABLED - &obj_format_impl_coff, -#endif -#if CFREE_OBJ_MACHO_ENABLED - &obj_format_impl_macho, -#endif - &obj_format_impl_wasm, -}; -``` - -**Call-site changes**: the switch in `src/link/link.c` and the obj -emit/read entry points have collapsed to -`obj_format_lookup(target.obj_format)->fn(...)`. COFF short-import and -long-form import-archive handling, ELF dynamic layout, and -(arch × format) relocation wire metadata now live behind object-format -hooks, so the linker and object entry points stay generic over the -object format implementation. - -**Build**: `Makefile` excludes `src/obj/elf/`, `src/obj/macho/`, and -`src/obj/coff/` from the shared source glob, then adds each directory -back only when the matching `CFREE_OBJ_*_ENABLED` flag is enabled. - -## Axis 3: ABIs (derived) - -**Status: vtable, registry, and source gating are done.** - -- **Vtable**: `ABIVtable` (`src/abi/abi_internal.h`). It carries the - function classifier and ABI-owned `va_list` layout facts. -- **Registry**: `src/abi/registry.c` maps `(CfreeArchKind, - CfreeObjFmt)` to the active `ABIVtable`. `abi_init()` now performs a - single `abi_vtable_lookup(target.arch, target.obj)` instead of asking - the arch implementation to dispatch by OS. -- **Gating**: registry entries are gated by the combined arch + object - format flags. 
ELF selects SysV/AAPCS-style ABIs, Mach-O selects Apple - variants, and COFF selects Windows variants. -- **Build**: per-ABI `.c` files in `src/abi/` are excluded from the - broad lib source glob and re-added only for enabled arch/format cells. - A few ABI variants share a base classifier implementation, so the - Makefile also keeps those base classifier TUs when a derived variant - delegates to them. - -| Arch | ELF (SysV-ish) | Mach-O (Apple) | PE/COFF (Windows) | -|----------|-------------------|-----------------|----------------------| -| aa64 | aapcs64 | apple_arm64 | aapcs64_windows | -| x64 | sysv_x64 | apple_x64 | win64_x64 | -| rv64 | rv64 | — | — | -| c_target | n/a (source-level emission, no machine ABI) | - -## Axis 4: Language frontends - -**Status: registry-driven, done.** - -- **Vtable**: `CfreeFrontendVTable` (`include/cfree/compile.h`), public - API. Per-frontend instances are exposed as externs: - `cfree_c_frontend_vtable`, `cfree_toy_frontend_vtable`, - `cfree_wasm_frontend_vtable`, and `cfree_asm_frontend_vtable` (the - asm vtable's declaration lives in `src/api/lang_registry.c` since - asm has no `lang/asm/` directory). -- **Extensions on the vtable**: each vtable carries a NULL-terminated - `extensions` list (lowercase, no leading dot). `cfree_language_for_path` - now takes a `CfreeCompiler*` and walks `c->frontends[]`, matching - case-insensitively so `.S` resolves to asm's `"s"` entry. C has no - extensions and serves as the fallback when nothing else matches. -- **Registry**: `src/api/lang_registry.c` is the sole place that checks - `CFREE_LANG_*_ENABLED`. `lang_registry_init()` is called from - `compiler_init` and populates `c->frontends[]` with each compiled-in - vtable plus the always-on asm frontend. -- **Build**: `lang/c`, `lang/wasm`, and `lang/toy` sources are folded - into `libcfree.a` and gated by the matching `_ENABLED` flag in the - Makefile. 
The standalone `libcfree_toy.a` archive is gone — the toy - frontend is now first-class alongside C and WASM. -- **No fallback**: `frontend_for_language()` returns whatever is in - `c->frontends[lang]` and nothing more. The asm frontend is registered - by the registry like any other; an embedder that doesn't want asm can - clear the slot with `cfree_register_frontend(c, CFREE_LANG_ASM, NULL)` - after construction. -- **Public override**: `cfree_register_frontend()` remains public, so - embedders can swap in a custom vtable for any `CfreeLanguage` slot - (or clear it) after `cfree_compiler_new`. - -## Summary - -| Axis | Vtable | Registry | Remaining work | -|------------|-----------------------|---------------------------|-------------------------------------------------| -| Arch | `ArchImpl` | `src/arch/registry.c` | none for registry/source gating | -| Obj format | `ObjFormatImpl` | `src/obj/registry.c` | move small policy checks into vtable as needed | -| ABI | `ABIVtable` | `src/abi/registry.c` | none for registry/source gating | -| Frontend | `CfreeFrontendVTable` | `src/api/lang_registry.c` | none for registry/source gating | - -## Implementation order - -1. Done: land `include/cfree/config.h` and `mk/config.mk` with all flags - enabled by default. -2. Done: gate arch registry entries and arch Makefile sources. -3. Done: extract `ObjFormatImpl`, move format code under - `src/obj/{elf,macho,coff}/`, and gate those directories. -4. Done: add `src/abi/registry.c`, move ABI selection out of `ArchImpl`, - and gate per-ABI sources. -5. Done: add `src/api/lang_registry.c`, expose - `cfree_<lang>_frontend_vtable` externs, and fold frontends into - `libcfree.a`. - -Each step is independently testable and leaves the build green with the -default all-on configuration. 
diff --git a/doc/RUNTIME.md b/doc/RUNTIME.md @@ -0,0 +1,283 @@ +# Runtime (`libcfree_rt.a`) + +`libcfree_rt.a` is the *target* runtime: the small body of code and headers that +cfree-compiled programs link against, entirely separate from the *compiler* +library `libcfree.a`. The compiler emits calls to ABI-mandated helper symbols +(`__divti3`, `__addtf3`, `__atomic_load_8`, …), references freestanding standard +headers (`<stdint.h>`, `<stdatomic.h>`, …), and may emit startup hooks +(`.init_array` IFUNC resolution). The runtime supplies the implementations. It +is freestanding — no OS, no hosted libc — so the same archive backs a Linux +binary, a Darwin binary, and a bare-metal image alike. It is built per target by +cfree itself (see [BUILD.md](BUILD.md)) and ships with the toolchain; the driver +links it automatically (see [DRIVER.md](DRIVER.md)). + +``` + user .c ──cfree──► emits __divti3, va_arg, _Atomic, coro_resume, ... + │ + <stdint.h> etc. ◄─┘ (shipped headers, rt/include/) + │ + libcfree_rt.a (per-target archive) + ┌──────────────┬───────────┬──────────┬──────────┬───────────┐ + int/fp soft mem/string atomic coro startup freestanding + helpers /stdio shim switch (IFUNC) libc subset +``` + +## Design principles + +**No target-dispatch ifdefs in source.** The integer/float helpers derive from +compiler-rt (`lib/builtins/`, Apache-2.0 WITH LLVM-exception, see +`rt/lib/LICENSE-compiler-rt.txt`), but upstream's `#ifdef __ARM_EABI__ / __MINGW32__ / __SOFTFP__` +target cascades were stripped out. Per-target variation is expressed by the +*build* — which directories and flags are selected — not by preprocessor +branches inside the C. What remains of the preprocessor is parameterization +(precision, src/dst pair) and genuinely-orthogonal concerns (assembler syntax in +`assembly.h`, `HAS_INT128`), never target dispatch. 
+ +**One master `.c`/`.S` per feature, one object in the archive.** Rather than +globbing compiler-rt's many per-op files, each feature group is a single master +translation unit (`rt/lib/int/int.c`, `rt/lib/fp/fp.c`, …) with the per-op +snippets inlined as commented blocks (`// ---- udivmoddi4.c ----`). Templates in +`rt/lib/impl/` (`fp_add_impl.inc`, `int_div_impl.inc`, …) and the re-includable +`rt/lib/include/common/fp_lib.h` are pulled into the master multiple times per +TU, once per precision or per (src,dst) pair, with suffix-renamed statics so the +single object carries every needed instance. This keeps the archive small and +the member list explicit. + +**Weak portable fallbacks.** Everything a hosted libc would normally own — +`memcpy`/`memmove`/`memset`/`memcmp`, the string and stdlib functions, +`__clear_cache`, `__cfree_assert_fail` — is defined `__attribute__((weak))` in +portable C, so a real libc or a tuned arch-specific routine wins at link time +without a conflict. The freestanding definitions only matter when nothing else +provides them. + +## Build-time target selection (multilib) + +`rt/Makefile` (included by the root Makefile) enumerates `RT_VARIANTS` and, for +each, a small feature vector that drives source and flag selection. The +dimensions are: clang target triple, data model (`lp64` / `ilp32` / `llp64`), +`HAS_INT128` (0/1), the coro arch token, binary128 long-double support +(`LDBL128`), RISC-V save/restore, ARM AEABI mode, and a `HOSTED` flag. A single +GNU-make `define` template expands each variant into its object list, compile +flags, and an `ar`-built `libcfree_rt.a` under `$(BUILD_DIR)/rt/<variant>/`. +cfree compiles and archives its own runtime: `RT_CC`/`RT_AS`/`RT_AR` default to +`cfree cc`/`cfree as`/`cfree ar`, so a codegen change in the compiler rebuilds +the runtime. + +The data-model dimension is the multilib axis. 
It selects: + +| Data model | `long`/ptr | 128-bit | int master | include dir | targets | +| ----------------- | ---------- | ------- | ---------------- | --------------------------------- | ------------------------------ | +| `lp64_le` | 64 / 64 | yes | `int64/int64.c` | `rt/lib/include/lp64_le` | x86_64, aarch64, rv64 (LE) | +| `llp64_le` | 32 / 64 | yes | `int64/int64.c` | `rt/lib/include/llp64_le` | Win64 x86_64 / aarch64 | +| `ilp32_le` | 32 / 32 | no | `int32/int32.c` | `rt/lib/include/ilp32_le` | i386, arm32, rv32, wasm32 | +| `lp64_le_ldbl128` | (lp64 +) | yes | (lp64 + `fp_tf`) | `-include .../tf_supplement.h` | aarch64/rv64 binary128 `long double` | + +The per-model dir holds one file, `int_lib.h`, the compiler-rt support header +folded together with upstream's `int_endianness.h`/`int_types.h`. They differ +only where the data model forces it: LP64/LLP64 declare the `ti_int`/`tu_int` +`__int128` machinery and `twords`/`utwords` unions; ILP32 omits all of it (no +128-bit type) and instead defines `AEABI_RTABI` (the AAPCS `__pcs__` attribute +the ARM sources need). `lp64_le_ldbl128` is not a separate header set but an +*extra* `-include tf_supplement.h` layered onto an LP64 build, defining +`tf_float` / `CRT_HAS_TF_MODE` before `fp_lib.h` processes them — keeping the +base header free of feature gates. All headers assume little-endian; a +big-endian port would need a parallel `*_be/` set. + +The `HOSTED` flag (Windows variants) ships only the compiler-support subset +(`RT_COMPILER_SRCS`: int/fp/atomic/cache/ifunc) and lets the platform libc +supply mem/string/stdio/stdlib. Everything else ships the full `RT_BASE_SRCS`. + +## Compiler-support helpers + +These are the ABI-mandated symbols the backends emit when an operation has no +native instruction. + +- **Integer** (`rt/lib/int/int.c`, always built). 
64-bit divide/modulo + (`__udivdi3`/`__divdi3`/`__umoddi3`/…) built on `udivmoddi4`, plus the + bit-twiddling family every target may need: `bswap`, `clz`/`ctz`/`ffs`, + `parity`, `popcount`, `cmp`/`ucmp`, `abs`, `neg` — at 32- and 64-bit widths. + `rt/lib/int/si_div.c` adds bit-serial `__udivsi3`/`__divsi3` etc. (referenced + by other helpers even where C int division is one instruction). +- **64-bit-on-32-bit** (`rt/lib/int32/int32.c`, ILP32 only): 64-bit shifts and + `__muldi3` synthesized from 32-bit lanes. +- **128-bit-on-64-bit** (`rt/lib/int64/int64.c`, LP64/LLP64 only): `__int128` + shifts, `clz`/`ctz`, `__multi3`, `__negti2`, and div/mod via `udivmodti4`. +- **Soft float — binary32/binary64** (`rt/lib/fp/fp.c`, always built): `sf`/`df` + add/sub/mul/div/neg, the full compare set (`__eqsf2`…`__gtdf2`, each a real + function rather than an object-format-conditional alias), `sf`↔`df` + extend/truncate, every int↔float conversion (`floatsisf`, `fixdfdi`, …), and + `fp_mode` (rounding-mode query). Native-FPU targets still use the soft + conversions cfree's contract requires; FPU-less targets (rv32/64 without F/D, + ARM softfp, wasm) use the whole set. +- **Soft float — binary128** (`rt/lib/fp_tf/fp_tf.c`) and **`__int128`↔float** + (`rt/lib/fp_ti/fp_ti.c`): built where `long double` is IEEE binary128 + (aarch64/rv64). Adds `tf` arithmetic, `sf`/`df`↔`tf` conversions, and the + `i128`↔`tf` / `i128`↔`sf`/`df` fixes. +- **Atomics** (`rt/lib/atomic/atomic_freestanding.c`, always built): the + `__atomic_*_N` fallbacks for objects the backend cannot lower to a native + atomic instruction. A pointer-sized `_Atomic(uintptr_t)` spinlock pool + (`atomic_common.inc`) provides the lock, hashed by address — no OS dependency. + Implemented over the GCC-style `__atomic_*` builtin family that cfree itself + documents (`doc/builtins.md`), with upstream's Clang-only `__c11_atomic_*` + calls translated. 16-byte cases are keyed off `HAS_INT128`. 
+- **Misc** (`rt/lib/cache/clear_cache.c`): a weak `__clear_cache` (target for + `__builtin___clear_cache`) plus weak bare-metal cache stubs. ARM and RISC-V + variants add the AEABI / save-restore assembly described below. + +What this runtime deliberately does *not* provide: 80-bit x86 `xf` soft float +(x86 always has the FPU for long double), half-precision conversions, big-endian +targets, and the `__riscv_32e`/`64e` embedded ABIs — none are in cfree's +runtime contract. + +### Per-arch assembly helpers + +- **ARM AEABI** (`rt/lib/arm/aeabi_thumb2.S`, `aeabi_thumb1.S`, `aeabi.c`): the + AEABI div/mod dual-result helpers, soft-float compares, and size-specialized + `__aeabi_mem*` wrappers (which forward to the weak `memcpy`/`memset`). Two ISA + variants — Thumb-2 (ARMv7+, tail-calls and `subs`/`muls` folding) and Thumb-1 + (ARMv6-M Cortex-M0, no tail-calls, restricted forms). `aeabi.c` carries the + ISA-agnostic `__aeabi_drsub`/`__aeabi_frsub`. +- **RISC-V save/restore** (`rt/lib/riscv/rv32.S`, `rv64.S`): the + `__riscv_save_*`/`__riscv_restore_*` millicode for `-msave-restore`, split per + XLEN (upstream gated one file on `__riscv_xlen`). + +A Win64 stack-probe helper (`rt/lib/stack/chkstk_x86_64_win.c`, `__chkstk` / +`___chkstk_ms` page-touch probes for large frames) lives in the tree but is +*not* wired into any variant's source list and so ships in no archive today. It +is noted here only so the orphan is not mistaken for a present provider; when +Win64 large-frame probing lands it would join the hosted Windows variant's +source set. + +## cfree-specific startup: IFUNC resolution + +`rt/lib/cfree/ifunc_init.c` provides `__cfree_ifunc_init`, the startup hook for +statically-linked ELF images that use `STT_GNU_IFUNC` symbols. 
The linker +(`src/link/link_layout.c`) materializes one IPLT stub and `.igot.plt` slot per +IFUNC, emits a parallel `.iplt.pairs` section of `(resolver, slot)` pointer +pairs, and synthesizes a `.init_array` entry pointing at this function. Before +`main`, the CRT walks `.init_array`; `__cfree_ifunc_init` iterates the pairs, +calls each resolver, and stores the chosen implementation pointer into its slot, +so the IPLT load-and-branch tail-calls the right target. The `.iplt.pairs` span +symbols (`__start_iplt_pairs`/`__stop_iplt_pairs`) are weak, so the object is a +harmless no-op when linked into images with no IFUNCs or by a non-cfree linker. +The JIT path resolves slots in-process at load time and skips the `.init_array` +synthesis, so this symbol is never an unresolved reference there (see +[JIT.md](JIT.md), [LINK.md](LINK.md)). + +`rt/lib/assert/assert.c` supplies the weak `__cfree_assert_fail` (the target of +`assert()` failure), which `__builtin_trap`s and spins. + +## Coroutines: stackful asymmetric context switch (`rt/lib/coro/`) + +cfree ships `<cfree/coro.h>` as a native extension — C11 has no stackful +coroutine facility — built as a deliberate counterpart to `<setjmp.h>`. The two +share one per-target register-context payload (256 bytes, 16-aligned): the same +save/restore instruction sequences back `setjmp`/`longjmp` and the coroutine +switch. The module is two layers: + +``` + <cfree/coro.h> coro_init / coro_resume / coro_yield / coro_self + │ + coro/coro.c arch-agnostic asymmetric layer ── resume chain, thunk + │ (one TU, built for every coro variant) + ▼ + coro/<arch>.c per-arch primitives: setjmp / longjmp, + (+ aarch64*.s) __cfree_coro_switch, __cfree_coro_ctx_init, trampoline +``` + +**Per-arch primitive (`rt/lib/coro/<arch>.c`).** One per ABI: +`aarch64`, `x86_64`, `x86_64_win`, `i386`, `arm32`, `arm32_thumb1`, `riscv32`, +`riscv64`. Each defines the per-target context struct (callee-saved GPRs + +callee-saved FPRs + sp + return address — e.g. 
x86_64 SysV is 8 words/64 bytes; +aarch64 is x19–x28/fp/lr/sp + d8–d15/176 bytes), and verifies via +`_Static_assert` that it fits both `jmp_buf` and `coro_ctx`. The three primitives +that save/restore registers — `setjmp`, `longjmp`, and `__cfree_coro_switch` — +share one pair of `SAVE_INTO`/`RESTORE_FROM` macros so identical instruction +bytes are emitted in all three. Symbol decoration uses `__USER_LABEL_PREFIX__`, +so one source compiles for ELF / Mach-O / COFF. Most arches keep the asm +file-scope inside the `.c`; aarch64 splits it into `aarch64_elf.s` / +`aarch64_macho.s` (selected per variant via `RT_EXTRA_SRCS`) so the C TU needs +no file-scope-asm support. The thumb1 variant is a separate file because its +ARMv6-M sequences (no IT blocks, no VFP, no `str sp`) can't share with `arm32.c`. +wasm32 ships no coro (would need an Asyncify fiber port). + +`__cfree_coro_switch` is the symmetric register shuffle: save callee state into +`*from`, restore from `*to`, deliver a value. It is exposed in the public header +for advanced (M:N, work-stealing) schedulers, and is the building block under +the asymmetric layer. `__cfree_coro_ctx_init` lays down a fresh context: zero +the saved registers, point the entry-fn register and return address at the +trampoline, and set sp to the (16-aligned, downward-growing) stack top. + +**Asymmetric layer (`rt/lib/coro/coro.c`).** Implements `coro_init` / +`coro_resume` / `coro_yield` / `coro_self`. A `coro_t`'s private blob holds the +`coro_ctx`, a `resumer` back-pointer, and the user entry fn (a `_Static_assert` +pins the fit inside the header's 288-byte reservation). `coro_resume` records the +caller as the resumer (`NULL` meaning the main flow), switches in, and on return +restores the previous current-coroutine pointer; `coro_yield` reads its own +resumer slot and switches back — so resumes nest like calls. 
The trampoline +enters a static thunk (`__cfree_coro_thunk`) that runs the user fn, marks the +coroutine `CORO_DEAD`, and switches back to the resumer with the return value, so +the symmetric primitive never needs to know about `coro_t` lifecycle. The +"current coroutine" pointer and the "main" save slot are `_Thread_local`, so each +thread gets an independent resume chain; cfree's contract defines +`__STDC_NO_THREADS__`, but `_Thread_local` is an independent C11 language +feature, and bare-metal images with no TLS runtime collapse to single-thread +semantics. + +## Shipped headers (`rt/include/`) + +cfree ships its own header set so freestanding compilation needs no system +headers. Two groups: + +**Freestanding C standard headers.** The C11 freestanding-mandated set +(`<stddef.h>`, `<stdint.h>`, `<stdarg.h>`, `<stdalign.h>`, `<stdbool.h>`, +`<stdnoreturn.h>`, `<float.h>`, `<limits.h>`, `<iso646.h>`, `<stdatomic.h>`), +plus headers cfree provides as extensions beyond the freestanding subset +(`<setjmp.h>`, `<assert.h>`, and small `<string.h>`/`<stdlib.h>`/`<stdio.h>`/ +`<math.h>` surfaces matching the runtime's weak functions). They lean on +compiler builtins where the data model varies: `<stdarg.h>` is `__builtin_va_*`; +`<stdint.h>` hardcodes the exact-/min-width *limits* (cfree fixes +`CHAR_BIT == 8`, `int == 32`, `long long == 64`) but delegates the *type aliases* +(`__INT32_TYPE__`, `intptr_t`, the FAST family) to the compiler since those vary +by model. A handful of x86 intrinsic shims (`<emmintrin.h>`, `<x86intrin.h>`, +`<mm_malloc.h>`) round out source compatibility. + +`<setjmp.h>` is ABI-coupled to the coro payload: `jmp_buf` is a 256-byte, +16-aligned struct-wrapped array (struct wrapper guarantees alignment for x86_64 +xmm saves; `[1]` keeps it an array type so it decays to a pointer), sized to the +largest per-target context (x86_64 Windows). No signal-mask slot — C11 7.13 +excludes FP status and open-file state. 
+ +**cfree extension headers (`rt/include/cfree/`).** Non-standard primitives cfree +exposes so low-level code stays pure C: +- `<cfree/coro.h>` — the coroutine API above. `coro_ctx` is the raw 256-byte + register buffer; `coro_t` embeds it plus private scheduler storage. +- `<cfree/syscall.h>` — `__cfree_syscall0..6`, the bare kernel-trap primitive. + These are *compiler*-lowered (the backend emits `syscall`/`int 0x80`/`svc`/ + `ecall` inline as an opaque, full-memory-clobber operation) — there is no + library implementation in this archive. The result is normalized to the + Linux "non-negative success / -errno failure" convention on every target, + with the BSD/Darwin carry-flag form rewritten by the lowering. WASM is a + compile-time error (use WASI imports). +- `<cfree/baremetal.h>` — IRQ mask save/restore, CPU memory barriers + (`__cfree_dmb`/`dsb`/`isb`, distinct from C11 fences and meant for DMA / MMU / + self-modifying code), range-based cache maintenance, and CPU hints + (`__cfree_yield`/`wfi`/`wfe`/`sev`). Like syscalls, these are compiler-lowered + opaque operations, not library calls; targets with no meaningful lowering + raise a compile-time error rather than silently no-op. + +## Freestanding libc subset + +For non-hosted targets the runtime also carries a minimal libc so freestanding +programs can do basic work without a platform libc. All weak: +`rt/lib/mem/mem.c` (the four mem* primitives, hand-written portable C, 0BSD — +not from compiler-rt), `rt/lib/string/string.c` (`strlen`, `strcpy`, `memchr`, +…), `rt/lib/stdlib/stdlib.c` + `qsort.c` (string-to-number conversions, `qsort`, +the `div_t` family), and `rt/lib/stdio/printf.c` (a callback-driven formatter +derived from mpaland/printf, integer formats only — floating-point conversions +omitted). Hosted (Windows) variants drop all of these in favor of the platform +libc. + +--- + +Planned/roadmap work, if any, lives under doc/plan/ — not here. 
diff --git a/doc/SSA2.md b/doc/SSA2.md @@ -1,932 +0,0 @@ -# SSA2 — Incremental SSA Def-Use, MIR-shape - -This document is the design sketch for replacing cfree's wholesale rebuilt -def-use representation (`opt_rebuild_def_use` in `src/opt/pass_analysis.c:327`) -with MIR-style incremental SSA edges. The structural move lands first; every -SSA-era downstream pass is disabled in-tree and then re-introduced one at a -time, each matching or improving on the corresponding MIR pass. - -Background: see `doc/OPT.md` for the O2 pipeline shape, `doc/OPT_PERF.md` "O2 -Gap Analysis vs MIR" for the measured impact of wholesale rebuilds (38 call -sites, 11 SSA-era DCE/copy invocations vs MIR's 2). - -## Status quo: what we are replacing - -cfree today carries a flat side table of uses, rebuilt from scratch by every -pass that mutates instructions. - -`OptUse` (`src/opt/ir.h:374-385`) records one use: - -```c -typedef struct OptUse { - Val val; - u32 block; - u32 inst; - InstId inst_id; - u32 next_for_val; /* singly-linked through opt_uses[] */ - u32 operand_index; - u32 phi_pred_index; - Operand* operand; /* pointer into Inst.opnds; valid only between rebuilds */ - u8 kind; /* OPT_USE_{OPERAND,INDIRECT_BASE,INDIRECT_INDEX,PHI_INPUT} */ - u8 pad[3]; -} OptUse; -``` - -`Func` owns `opt_uses[]` plus the per-Val head array -`opt_first_use_by_val[v]` (`ir.h:471-473`). The contract: - -- After `opt_rebuild_def_use(f)`, every use of every live Val is enumerable by - walking the `next_for_val` chain starting at `opt_first_use_by_val[v]`. -- Defs are addressed separately through `Func.val_def_block[v]` and - `Func.val_def_inst[v]` (`ir.h:405-406`) — one def per Val (SSA single-def - invariant). -- Any pass that adds, deletes, or rewrites an instruction must re-run - `opt_rebuild_def_use` before the next query, or the `operand` pointers in - `OptUse` go stale and the chains misrepresent the IR. 
- -That contract is what produces the 38 rebuild call sites and the 11 SSA-era -DCE/copy interleaving documented in `doc/OPT_PERF.md`. - -## MIR's representation - -MIR encodes def-use as a per-operand linked-list system (mir-gen.c:2134-2164): - -```c -typedef struct ssa_edge *ssa_edge_t; -struct ssa_edge { - bb_insn_t use, def; - char flag; /* scratch bit used by renaming/worklist */ - uint16_t def_op_num; - uint32_t use_op_num; - ssa_edge_t prev_use, next_use; /* doubly-linked list of uses of the same def */ -}; -``` - -The clever part: every operand carries a single pointer `op->data`. Its -meaning depends on whether the operand is a def or a use: - -- **Def operand** (`bb_insn->insn->ops[def_op_num].data`): head of the - doubly-linked list of `ssa_edge_t` records, one per use of this def. -- **Use operand** (`bb_insn->insn->ops[use_op_num].data`): the single - `ssa_edge_t` representing this use, embedded in the def's list via - `prev_use`/`next_use`. - -So traversing all uses of a def is `for (se = def->ops[i].data; se; se = se->next_use)`. Removing one use is O(1) via the doubly-linked pointers -(`mir-gen.c:2422-2434`). The shared `flag` byte is reused by renaming -(`push_to_rename`/`pop_to_rename`, `mir-gen.c:2550-2616`) and other worklist -algorithms — handy because it gives any pass a per-edge scratch bit without a -separate map. - -The companion structures (also `mir-gen.c:2166-2186`): - -- `def_tab` — hash `(bb, reg) → bb_insn_t` used by demand-driven `get_def` - (mir-gen.c:2284-2304). Only live during SSA construction; cleared after. -- `phis` / `deleted_phis` — running lists used by `minimize_ssa` - (mir-gen.c:2323-2372). -- `ssa_edges_to_process` — VARR worklist used by renaming. - -The whole machinery is built once by `build_ssa` (mir-gen.c:2674-2709), -maintained incrementally for the lifetime of SSA, and torn down by -`undo_build_ssa` (mir-gen.c:2789-2820, walking ops and freeing edges). 
- -## Why this works for MIR and what changes for cfree - -MIR can stash a pointer directly on each operand because: - -1. `MIR_op_t` already has a `void* data` field for back-end use. -2. `MIR_insn_t` is heap-allocated and reached via a doubly-linked list — instruction objects never move. -3. `MIR_OP_VAR_MEM` packs base+index into one operand; in practice MIR - treats each operand position as a single use site (the back-end iterates - vars within an op separately). - -cfree differs on every count and needs a slightly different layout: - -1. `Operand` (`src/opt/ir.h`) has no scratch field. We have to add one or - keep edges in a side table. -2. `Block.insts` is an `Inst*` array that can be reallocated on growth. - `Inst*` pointers are not stable — we must address instructions by - `(block_id, inst_index)`. `Inst.opnds` is arena-backed and stable for the - life of the instruction, so `Operand*` is stable as long as the instruction - exists, but it is not unique-id-able across arena resets. -3. `OPK_INDIRECT` carries two register uses (`v.ind.base`, `v.ind.index`) - within one `Operand`. We need two edge slots per indirect operand. -4. Many uses live in `extra.aux` (phi inputs in `IRPhiAux.pred_vals[]`, call - args in `IRCallAux`, asm operands in `IRAsmAux`, intrinsic args, return - value, scope cond) — not in `opnds[]`. Each of those needs an edge slot - too. cfree already enumerates them in `opt_collect_inst_uses` - (`pass_analysis.c:269-325`); the same enumeration is the basis for - maintenance. - -## Design constraints - -Speed is paramount, second only to correctness. The hot operations are walk -uses of a Val (GVN, DSE, LICM, DCE), relink an operand (copy prop, -substitute, addr_xform), remove a use (delete inst, rewrite operand), and -redirect a def (GVN replacement, copy_cleanup). Each must be O(1) per -operation aside from `redirect_def` which is O(uses_of_val). - -Three structural decisions that follow from this: - -1. 
**`u32` ids throughout — no pointers stored across operations.** Edge - ids are u32 indices into a pool. Storing `OptSsaEdge*` across an `add` - or `remove` requires pointer-stable backing storage; ids don't. -2. **`SegVec` (`src/core/segvec.h`) for the edge pool.** Push never moves - existing elements. A free list through `next_use` reuses freed slots, so - the segment count tracks peak live edges, not lifetime allocations. MIR - allocates each `ssa_edge_t` separately with `gen_malloc`; we get the - same pointer stability with better cache density and no per-edge malloc - overhead. -3. **Use-side edge id stored on the use site itself.** Operands gain a - `u32 edge` field. Aux structs gain a parallel `u32* edges` array sized - at aux creation. This makes "rewrite this operand's Val" a single edge - lookup, never a hash hit or a scan of the def's use list. - -We do **not** use MIR's `op->data` double-duty (head-at-def, edge-at-use). -cfree already has `opt_first_use_by_val[v]` keyed directly by Val id, which -is faster than walking from a def-side head — one array load vs an -operand load plus an Inst lookup. Use sites store their edge id, defs -store nothing in the operand. - -## Data structures - -All names use the `opt_ssa_` / `OptSsa` prefix to avoid colliding with the -legacy `OptUse` types during the migration. - -### Edge record - -```c -#define OPT_SSA_EDGE_NONE 0xffffffffu - -typedef enum OptSsaUseKind { - OPT_SSA_USE_OPERAND, /* Inst.opnds[op_idx] is an OPK_REG */ - OPT_SSA_USE_INDIRECT_BASE, /* Inst.opnds[op_idx].v.ind.base */ - OPT_SSA_USE_INDIRECT_INDEX, /* Inst.opnds[op_idx].v.ind.index */ - OPT_SSA_USE_PHI_INPUT, /* IRPhiAux.pred_vals[sub_idx] */ - OPT_SSA_USE_AUX_OPERAND, /* call/asm/intrinsic/ret/scope aux operand */ -} OptSsaUseKind; - -typedef struct OptSsaEdge { - /* Val whose def this edge refers to. Cached on the edge so the per-Val - * list head update on remove doesn't need to chase the use operand. */ - Val val; - - /* Use site. 
(use_block, use_inst) identify the consuming instruction; - * use_op_idx and sub_idx locate the specific use within it. */ - u32 use_block; - u32 use_inst; - u16 use_op_idx; - u16 sub_idx; /* phi pred index or aux sub-index; 0 otherwise */ - - /* Def location. Single-def SSA invariant means there's exactly one - * producing inst per Val; def_op_idx is which output position of that - * inst produces val (0 for primary def; >0 for multi-result via - * Inst.defs[]). Cached to avoid scanning the def inst on each touch. */ - u32 def_block; - u32 def_inst; - u8 def_op_idx; - - u8 kind; /* OptSsaUseKind */ - u8 flag; /* scratch bit; reused by renaming/GVN worklist */ - u8 pad; - - /* Doubly-linked use list per Val. Head at f->opt_first_use_by_val[val]. */ - u32 prev_use; /* OPT_SSA_EDGE_NONE if first */ - u32 next_use; /* OPT_SSA_EDGE_NONE if last; also free-list link */ -} OptSsaEdge; -``` - -The struct is 32 bytes on a 64-bit host. Two edges per cache line. - -### Edge pool - -```c -SEGVEC_DEFINE(OptSsaEdgeVec, OptSsaEdge, 7); /* 128 edges per segment ≈ 4 KB */ - -typedef struct OptSsaState { - OptSsaEdgeVec edges; /* pool; index = edge id */ - u32 free_head; /* head of free list through next_use */ - /* Per-Val use list heads. Indexed by Val id. */ - u32* first_use_by_val; - u32 first_use_by_val_cap; -} OptSsaState; -``` - -`OptSsaState` lives on `Func` (`Func.opt_ssa`) and is initialized at -`opt_build_reg_ssa` / finalized at `opt_undo_ssa`. The SegVec uses the -Compiler's `Heap`; segments are freed at `_fini`. - -Per-Val head array can be arena-grown (`Func.arena`): it holds only u32 -indices, never pointers, so reallocation on Val mint is safe. - -`first_use_by_val_cap` tracks current capacity; on Val mint, if `nvals > -cap`, double and copy. The copy is cheap (u32s, count = active Vals). - -### Use-site edge storage - -The use site stores its edge id so that `relink` and `remove` are one -lookup, never a list walk. 
- -**`Operand`** (in `src/opt/ir.h`) — add to the `OPK_REG` and `OPK_INDIRECT` -variants: - -```c -union { - struct { Reg r; u32 edge; } reg; - struct { Reg base, index; i32 ofs; u32 edge_base; - u32 edge_index; } ind; - /* other variants unchanged */ -} v; -``` - -`Operand` grows by 4 bytes for `reg` and 8 bytes for `ind`. Non-reg, -non-indirect variants are untouched. - -**Aux structs** (in `src/opt/ir.h`) — each gains a parallel edge-id array, -sized at aux construction: - -```c -typedef struct IRPhiAux { - /* existing fields */ - Val* pred_vals; - u32* pred_edges; /* edge id per pred; OPT_SSA_EDGE_NONE if none */ - u32 npreds; - /* ... */ -} IRPhiAux; -``` - -Same shape for `IRCallAux`, `IRAsmAux`, `IRIntrinAux`, `IRRetAux`, -`IRScopeAux`. Sized exactly to the use count enumerated by -`opt_collect_inst_uses` today (`pass_analysis.c:269-325`), so no growth. - -Memory accounting: per use site we spend 4 bytes (edge id) at the use plus -32 bytes (edge record) in the pool, vs today's 24 bytes per `OptUse` entry -plus an entire `opt_uses[]` rebuild per pass. The win is in not paying the -rebuild cost across ~38 sites per O2 pipeline. - -### Defs - -Existing `Func.val_def_block[v]` / `val_def_inst[v]` continue to identify -the single def for each Val (`ir.h:405-406`). The edge's `def_block`, -`def_inst`, `def_op_idx` are caches of the same information — written at -edge creation, updated by `opt_ssa_redirect_def`. They exist so that walks -of a Val's use list don't have to re-lookup the def through `val_def_*` -just to know "what produced this". - -## Core operations - -The API splits into two layers: - -- **Low-level pool/list primitives** (this section). Hot path. Inline in - `src/opt/ssa_edges.h`. These touch the edge pool and the per-Val use - list only; they do not know about Operand or aux layouts. Suitable for - use inside tight walks where the caller already has the use-site handle. - -- **Use-site-aware public helpers** (next section). What passes call. 
- Out-of-line in `src/opt/ssa_edges.c`. These wrap the primitives and also - resolve and update the use-site slot (Operand field, aux edges array). - -Splitting this way lets the inner-most loops — walking a Val's use list -and rewriting edges as you go — call only the primitives without paying -for a use-site lookup that the caller already has implicitly. - -### Lookup and iteration - -```c -/* O(1) — two dependent loads inside SegVec_at. */ -static inline OptSsaEdge* opt_ssa_edge(Func* f, u32 edge_id) { - return OptSsaEdgeVec_at(&f->opt_ssa.edges, edge_id); -} - -/* Walk every use of val. Cache next_use before the body if the body may - * remove the current edge — see the "iterator invalidation" note below. */ -#define OPT_SSA_FOR_EACH_USE(F, V, E) \ - for (u32 E = (F)->opt_ssa.first_use_by_val[V]; \ - E != OPT_SSA_EDGE_NONE; \ - E = opt_ssa_edge((F), E)->next_use) -``` - -### Pool allocate / free - -```c -/* O(1). Pop from free list if non-empty, else SegVec_push. Returned slot - * is uninitialized — the caller writes every field. */ -static inline u32 opt_ssa_pool_alloc(Func* f) { - u32 id = f->opt_ssa.free_head; - if (id != OPT_SSA_EDGE_NONE) { - f->opt_ssa.free_head = opt_ssa_edge(f, id)->next_use; - return id; - } - (void)OptSsaEdgeVec_push(&f->opt_ssa.edges, &id); - return id; -} - -/* O(1). Push onto free list via next_use. flag = 0xff marks the slot as - * free for the debug verifier. */ -static inline void opt_ssa_pool_free(Func* f, u32 id) { - OptSsaEdge* e = opt_ssa_edge(f, id); -#ifndef NDEBUG - e->flag = 0xff; -#endif - e->next_use = f->opt_ssa.free_head; - f->opt_ssa.free_head = id; -} -``` - -### List link / unlink - -These are the inner loops of `add`/`remove`/`relink`. Inlined. - -```c -/* O(1). Insert edge at the head of val's use list. Caller has already - * written e->val. 
*/ -static inline void opt_ssa_list_push(Func* f, u32 edge_id) { - OptSsaEdge* e = opt_ssa_edge(f, edge_id); - u32 head = f->opt_ssa.first_use_by_val[e->val]; - e->prev_use = OPT_SSA_EDGE_NONE; - e->next_use = head; - if (head != OPT_SSA_EDGE_NONE) - opt_ssa_edge(f, head)->prev_use = edge_id; - f->opt_ssa.first_use_by_val[e->val] = edge_id; -} - -/* O(1). Unlink edge from val's use list. The edge fields are left as-is; - * the caller decides whether to free or relink. */ -static inline void opt_ssa_list_unlink(Func* f, u32 edge_id) { - OptSsaEdge* e = opt_ssa_edge(f, edge_id); - if (e->prev_use != OPT_SSA_EDGE_NONE) - opt_ssa_edge(f, e->prev_use)->next_use = e->next_use; - else - f->opt_ssa.first_use_by_val[e->val] = e->next_use; - if (e->next_use != OPT_SSA_EDGE_NONE) - opt_ssa_edge(f, e->next_use)->prev_use = e->prev_use; -} -``` - -### Primitive add / remove / relink - -```c -/* O(1). Allocate, initialize, and link an edge. Caller writes the new - * edge id into the use-site handle. */ -static inline u32 opt_ssa_edge_add_raw( - Func* f, Val val, - u32 def_block, u32 def_inst, u8 def_op_idx, - u32 use_block, u32 use_inst, - OptSsaUseKind kind, u16 use_op_idx, u16 sub_idx) { - u32 id = opt_ssa_pool_alloc(f); - OptSsaEdge* e = opt_ssa_edge(f, id); - e->val = val; - e->def_block = def_block; - e->def_inst = def_inst; - e->def_op_idx = def_op_idx; - e->use_block = use_block; - e->use_inst = use_inst; - e->use_op_idx = use_op_idx; - e->sub_idx = sub_idx; - e->kind = (u8)kind; - e->flag = 0; - opt_ssa_list_push(f, id); - return id; -} - -/* O(1). Unlink and free. Caller clears the use-site handle. */ -static inline void opt_ssa_edge_remove_raw(Func* f, u32 edge_id) { - opt_ssa_list_unlink(f, edge_id); - opt_ssa_pool_free(f, edge_id); -} - -/* O(1). Repoint an existing edge to a new Val. Caller is responsible for - * rewriting the use-site Reg/Val to match. 
The edge id and its position - * in the use site stay the same — only the val and the use-list it - * belongs to change. */ -static inline void opt_ssa_edge_relink_raw(Func* f, u32 edge_id, Val new_val, - u32 new_def_block, - u32 new_def_inst, - u8 new_def_op_idx) { - opt_ssa_list_unlink(f, edge_id); - OptSsaEdge* e = opt_ssa_edge(f, edge_id); - e->val = new_val; - e->def_block = new_def_block; - e->def_inst = new_def_inst; - e->def_op_idx = new_def_op_idx; - opt_ssa_list_push(f, edge_id); -} -``` - -### Iterator invalidation - -Removing the current edge during a `OPT_SSA_FOR_EACH_USE` walk corrupts -the iteration because the macro fetches `next_use` *after* the body. The -safe pattern: - -```c -u32 e = f->opt_ssa.first_use_by_val[v]; -while (e != OPT_SSA_EDGE_NONE) { - u32 next = opt_ssa_edge(f, e)->next_use; - if (should_remove_p(e)) opt_ssa_edge_remove_raw(f, e); - e = next; -} -``` - -This is the same shape as MIR's removal loops (mir-gen.c:2810-2811). The -macro is for read-only walks; mutation walks use the explicit form above. - -## Public helpers - -Out-of-line in `src/opt/ssa_edges.c`. These are what passes actually call; -each resolves the use-site slot and then calls the primitive layer. - -### Use-site slot resolver - -A single switch from `(use_block, use_inst, kind, op_idx, sub_idx)` to a -pointer to the u32 edge slot. Used by `_drop_inst`, `_add_inst`, the high- -level relink helpers, and the verifier. 
- -```c -static u32* opt_ssa_use_slot(Func* f, u32 use_block, u32 use_inst, - OptSsaUseKind kind, u16 op_idx, u16 sub_idx) { - Inst* in = &f->blocks[use_block].insts[use_inst]; - switch (kind) { - case OPT_SSA_USE_OPERAND: - return &in->opnds[op_idx].v.reg.edge; - case OPT_SSA_USE_INDIRECT_BASE: - return &in->opnds[op_idx].v.ind.edge_base; - case OPT_SSA_USE_INDIRECT_INDEX: - return &in->opnds[op_idx].v.ind.edge_index; - case OPT_SSA_USE_PHI_INPUT: - return &((IRPhiAux*)in->extra.aux)->pred_edges[sub_idx]; - case OPT_SSA_USE_AUX_OPERAND: - /* Dispatch on IROp; each aux struct carries its own *_edges[] sized - * to the use count enumerated by opt_collect_inst_uses. */ - switch ((IROp)in->op) { - case IR_CALL: return &((IRCallAux*)in->extra.aux)->arg_edges[sub_idx]; - case IR_ASM_BLOCK: return &((IRAsmAux*)in->extra.aux)->in_edges[sub_idx]; - case IR_INTRINSIC: return &((IRIntrinAux*)in->extra.aux)->arg_edges[sub_idx]; - case IR_RET: return &((IRRetAux*)in->extra.aux)->val_edges[sub_idx]; - case IR_SCOPE_BEGIN: return &((IRScopeAux*)in->extra.aux)->cond_edge; - default: cfree_unreachable(); - } - } - cfree_unreachable(); -} -``` - -The dispatch is one switch and one struct field load per resolution. -Hot-loop callers (relink during copy_prop, GVN substitute) avoid this by -holding the slot pointer or the Operand directly — see the relink-operand -helpers below. - -### Inst-level add / drop - -```c -/* Walk every use site on the instruction and add an edge. Called once - * per instruction from the SSA-construction walk in opt_build_reg_ssa / - * opt_build_ssa, plus per new instruction emitted mid-pipeline. */ -void opt_ssa_edges_add_inst(Func* f, u32 block, u32 inst_idx); - -/* Walk every use site and remove its edge. Asserts (debug) that none of - * the instruction's defined Vals still has uses — caller has done its - * rewriting. 
*/ -void opt_ssa_edges_drop_inst(Func* f, u32 block, u32 inst_idx); -``` - -The structure of both mirrors `opt_collect_inst_uses` (pass_analysis.c: -269-325) exactly — same operand walk, same aux-struct dispatch. The -difference is that today's path appends to `opt_uses[]` and writes -`opt_first_use_by_val[v]` once at the end; the new path calls -`opt_ssa_edge_add_raw` for each site and writes the returned edge id back -through `opt_ssa_use_slot`. - -### Operand relink (the most common public helper) - -This is what copy_prop, GVN substitute, addr_xform, and ssa_combine call -to rewrite an operand's Val: - -```c -/* Rewrite an OPK_REG operand from old_val to new_val, maintaining edges. - * Reads the edge id from op->v.reg.edge; relinks it to new_val's list; - * writes op->v.reg.r = (Reg)new_val. */ -static inline void opt_ssa_relink_operand(Func* f, Operand* op, Val new_val) { - u32 edge_id = op->v.reg.edge; - opt_ssa_edge_relink_raw(f, edge_id, new_val, - f->val_def_block[new_val], - f->val_def_inst[new_val], - opt_ssa_val_def_op_idx(f, new_val)); - op->v.reg.r = (Reg)new_val; -} - -/* OPK_INDIRECT.base variant. Same shape against op->v.ind.edge_base. */ -static inline void opt_ssa_relink_indirect_base(Func* f, Operand* op, Val new_val); -static inline void opt_ssa_relink_indirect_index(Func* f, Operand* op, Val new_val); - -/* Phi input: aux->pred_edges[p], aux->pred_vals[p] = new_val. */ -static inline void opt_ssa_relink_phi_input(Func* f, IRPhiAux* aux, - u32 pred_idx, Val new_val); -``` - -All of these helpers are inline because the body is short and they're called inside -inner loops. The cost is one edge lookup, one list unlink + push, two -field writes — no use-site resolver call because the caller already has -the Operand/aux pointer. - -### Redirect def (whole-Val substitution) - -O(uses_of(from)). One pass over the list, rewriting each edge's -`val`/`def_*` and the operand it refers to, then splicing the whole list -onto `to`'s head. 
Single-pass equivalent of N individual relinks. - -```c -void opt_ssa_redirect_def(Func* f, Val from, Val to) { - u32 head = f->opt_ssa.first_use_by_val[from]; - if (head == OPT_SSA_EDGE_NONE) return; - - u32 to_def_block = f->val_def_block[to]; - u32 to_def_inst = f->val_def_inst[to]; - u8 to_def_op_idx = opt_ssa_val_def_op_idx(f, to); - Reg to_reg = (Reg)to; - - /* Walk from-val's list, rewriting val + operand at each edge. */ - u32 tail = head; - for (u32 e = head; e != OPT_SSA_EDGE_NONE; ) { - OptSsaEdge* ed = opt_ssa_edge(f, e); - ed->val = to; - ed->def_block = to_def_block; - ed->def_inst = to_def_inst; - ed->def_op_idx = to_def_op_idx; - /* Rewrite the use-site operand to match. */ - u32* slot = opt_ssa_use_slot(f, ed->use_block, ed->use_inst, - (OptSsaUseKind)ed->kind, - ed->use_op_idx, ed->sub_idx); - (void)slot; /* slot already holds e; what we update is the Reg/Val it points to */ - opt_ssa_write_use_reg(f, ed, to_reg); - tail = e; - e = ed->next_use; - } - - /* Splice the whole list onto to's head. */ - u32 to_head = f->opt_ssa.first_use_by_val[to]; - opt_ssa_edge(f, tail)->next_use = to_head; - if (to_head != OPT_SSA_EDGE_NONE) - opt_ssa_edge(f, to_head)->prev_use = tail; - f->opt_ssa.first_use_by_val[to] = head; - f->opt_ssa.first_use_by_val[from] = OPT_SSA_EDGE_NONE; -} -``` - -`opt_ssa_write_use_reg` is a small helper that switches on the edge's -`kind` and writes the appropriate Reg field of the use's Operand or aux -entry. It's the inverse of `opt_ssa_use_slot` but for the Val/Reg value -rather than the edge id. - -This mirrors MIR's `change_ssa_edge_list_def` (mir-gen.c:2445-2462) one -to one, with cfree's per-kind dispatch for the operand-side write. - -### Inst-level delete - -```c -void opt_ssa_delete_inst(Func* f, u32 block, u32 inst_idx) { - opt_ssa_edges_drop_inst(f, block, inst_idx); - /* existing inst-removal path: mark NOP / shift block.insts / etc. 
*/ - ir_block_remove_inst(f, block, inst_idx); -} -``` - -Wraps the existing inst-removal path with the edge drop step. Passes -that delete an inst always go through this rather than touching the -block insts array directly. - -### State lifecycle - -```c -void opt_ssa_state_init(Func* f); /* opt_build_reg_ssa */ -void opt_ssa_state_fini(Func* f); /* opt_undo_ssa */ - -/* On Val mint: ensure first_use_by_val[] is large enough for the new - * highest Val id. Caller passes the new minimum capacity. */ -void opt_ssa_state_ensure_val_cap(Func* f, u32 nvals_needed); -``` - -`_init` zero-inits `OptSsaState`, sets `free_head = OPT_SSA_EDGE_NONE`, -initializes the SegVec on the Compiler heap, allocates -`first_use_by_val` with capacity `nvals` and fills with -`OPT_SSA_EDGE_NONE`. - -`_fini` calls `OptSsaEdgeVec_fini` and clears the pointers. The arena- -backed `first_use_by_val` is reclaimed by the function-end arena reset. - -### Debug verifier - -```c -#ifndef NDEBUG -/* Walk all blocks/insts; re-derive expected use set; compare against - * live edges. Called at OPT_VERIFY checkpoints in pass_o2.c. */ -void opt_ssa_edges_check(Func* f, const char* stage); -#endif -``` - -For each instruction: - -1. For every use site, read the edge id from its slot. If non-NONE, - look up the edge and assert `e.val == operand_val`, - `e.use_block/use_inst/op_idx/sub_idx/kind` match, and the edge is on - `first_use_by_val[e.val]`'s doubly-linked list. -2. For every Val the inst defines, walk `first_use_by_val[v]` and - assert each edge's `def_block`/`def_inst`/`def_op_idx` agrees with - the producing inst. -3. Verify the free list — no live edge id appears on it, every free - edge has `flag == 0xff`. - -This is O(insts + uses) per call. Gated under `CFREE_OPT_SSA_VERIFY` so -that day-to-day debug builds stay fast and only the full verify runs -under explicit opt-in. - -## Data flow - -The lifecycle of an edge, from creation to teardown. 
- -**Creation (SSA construction).** `opt_build_reg_ssa` and `opt_build_ssa` -walk the function once after IDF-based phi insertion. For each -instruction, they call `opt_ssa_edges_add_inst`, which enumerates use -sites the same way `opt_collect_inst_uses` does today. Each use site: - -1. Look up `val_def_block[v]`, `val_def_inst[v]` for the producing inst. -2. Allocate edge (free list or pool push). -3. Initialize all fields. -4. Write `next_use = first_use_by_val[v]; prev_use = NONE`. If the - existing head exists, set its `prev_use = new_edge_id`. -5. Set `first_use_by_val[v] = new_edge_id`. -6. Write the edge id into the use-site handle (Operand or aux array). - -**Use walk (the most common operation).** Passes consult uses of a Val v. -The walk reads `first_use_by_val[v]`, then chases `next_use`. Each step is -one SegVec `_at` (two dependent loads). Within a walk, edges are stable — -the caller can hold `OptSsaEdge*` for the duration of one iteration safely -across edge removal at the current cursor (cache the `next_use` before -removing). - -**Relink (operand rewrite).** A pass discovers that an operand currently -referring to Val `old_v` should now refer to `new_v`. It reads the edge id -from the use-site handle, calls `opt_ssa_edge_relink`. The edge's -`prev_use`/`next_use` are unlinked from `old_v`'s list, the edge's `val` -and the operand's Reg are updated, and the edge is linked at the head of -`new_v`'s list. Six pointer rewrites total, no list walk. - -**Redirect def (whole-Val substitution).** A pass discovers Val `a` is -equivalent to Val `b` (GVN substitute, copy_cleanup). `opt_ssa_redirect_def` -walks `a`'s use list once. For each edge: rewrite its `val` and the -operand it indexes. 
After the walk, splice the tail of `a`'s list onto -`b`'s list head: - -```c -u32 head_a = first_use_by_val[a]; -if (head_a == NONE) { return; } -u32 tail = walk(head_a); /* find tail, rewriting val + operand along the way */ -opt_ssa_edge(tail)->next_use = first_use_by_val[b]; -if (first_use_by_val[b] != NONE) - opt_ssa_edge(first_use_by_val[b])->prev_use = tail; -first_use_by_val[b] = head_a; -first_use_by_val[a] = NONE; -``` - -O(uses_of(a)) with no per-element list-juggling. The producing inst of `a` -is left in place (DCE will remove it once unused). - -**Inst deletion.** Three steps: - -1. `opt_ssa_edges_drop_inst(f, block, inst_idx)`. For each use site on the - inst, remove the edge (incoming). For each Val the inst defines, walk - its use list — if non-empty, the caller is wrong to be deleting this - inst; in debug builds, assert. The walk is fine because deletion implies - no remaining uses (the caller has already done its rewriting). -2. Remove the inst from `Block.insts[]` (or mark NOP — existing - convention). -3. The arena memory for `Inst.opnds` and aux structs is not freed - individually; it's reclaimed at function-end arena reset. - -**Inst creation mid-pipeline.** A pass emits a new inst (e.g., a copy -inserted by copy_cleanup, a hoisted invariant by LICM): - -1. `ir_emit` allocates the inst with Operand fields initialized to - `OPT_SSA_EDGE_NONE`. -2. Pass fills in operands and aux. -3. Pass calls `opt_ssa_edges_add_inst` on the new inst. - -The order matters: edges are added *after* operands are populated, so the -add path can read the Reg values to find their producing Val and edge to. - -**Teardown (opt_undo_ssa).** Walks all blocks/insts, calls -`opt_ssa_edges_drop_inst` on each. After the walk, `OptSsaState` is reset: -edges SegVec emptied (or kept for the next function), free_head reset, -first_use_by_val cleared. - -## Lifetimes - -Everything is per-function. 
SSA edges are not retained across functions in -the optimizer's `FuncSet`; each function is built, lowered, and emitted in -isolation. The lifetimes: - -| Object | Born | Dies | Storage | -| --- | --- | --- | --- | -| `OptSsaEdge` (record) | `opt_ssa_edge_add` | `opt_ssa_edge_remove` | `OptSsaState.edges` (SegVec on Heap) | -| Edge slot in SegVec | first segment push | `OptSsaState_fini` | Heap; freed at function teardown | -| `OptSsaState.free_head` | `opt_build_reg_ssa` | `opt_undo_ssa` | `Func.opt_ssa` | -| `first_use_by_val[]` | `opt_build_reg_ssa` / on Val mint grow | `opt_undo_ssa` | `Func.arena` | -| `Operand.v.reg.edge` | `ir_emit` (init to NONE) → `opt_ssa_edge_add` | `opt_ssa_edge_remove` / inst deletion | inside `Inst.opnds` (arena) | -| `IRPhiAux.pred_edges[]` | aux construction | function arena reset | `Func.arena` | -| Other aux `*_edges[]` | aux construction | function arena reset | `Func.arena` | - -Three lifetime invariants for callers: - -1. **Edge ids are valid only between `opt_build_*_ssa` and `opt_undo_ssa`.** - At O1, no edges exist. Any code path that runs at both O1 and O2 must - gate edge maintenance on `Func.opt_reg_ssa`. -2. **`OptSsaEdge*` is stable across `add`/`remove`/`relink`** thanks to - SegVec. It is NOT stable across `OptSsaState_fini`. Within a pass it's - safe to hold; across passes use the edge id. -3. **An edge's use-site handle must always agree with the edge.** If a - pass rewrites an operand's Reg directly (not through `relink`), the - edge's `val` becomes stale and the use list is wrong. The debug - verifier (`opt_ssa_edges_check`) walks all insts, re-derives the - expected use set, and compares; this is the catch-all. 
- -## Operand mutation: the contract - -Every place that mutates an instruction operand or aux use moves to a -helper: - -| Today | New | -| --- | --- | -| `op->v.reg = new_val;` | `opt_ssa_relink_operand(f, op, new_val);` | -| `aux->pred_vals[p] = new_val;` | `opt_ssa_relink_phi_input(f, aux, p, new_val);` | -| `block.insts[i].op = IR_NOP;` (delete) | `opt_ssa_delete_inst(f, b, i);` | -| emit new inst, then read its operands | emit new inst, then `opt_ssa_edges_add_inst(f, b, i);` | -| `opt_rebuild_def_use(f);` | deleted; the invariant is maintained | - -The migration tax is one-time: every pass that mutates touches these -helpers exactly once per mutation kind, then is done. - -## Construction and teardown - -`opt_build_reg_ssa` / `opt_build_ssa` (currently in `src/opt/pass_ssa.c`) are -rewritten in two phases: - -1. **Today's shape, edge-aware**: replace the post-construction - `opt_rebuild_def_use(f)` call (`pass_ssa.c:548`, `:807`) with a single - forward walk that calls `opt_ssa_edges_add_inst` once per instruction. - IDF-based phi insertion stays unchanged. Net structural move with no - algorithm change. - -2. **Later (post-roadmap re-enable)**: switch to MIR-shape demand-driven - construction with `get_def` and `minimize_ssa` - (mir-gen.c:2284-2372). Add live-in filtering to mem2reg - (`compute_phi_sites`, `pass_ssa.c:623-654`). - -Teardown is symmetric: `opt_undo_ssa` calls `opt_ssa_edges_drop_inst` for -each instruction as it converts back to PReg form, returning all edges to -the free list. - -## Invalidation and verification - -`Func.opt_valid_analyses` (`ir.h:475`) currently has an -`OPT_ANALYSIS_DEF_USE` bit. Under the new representation, def-use is -**always valid in SSA mode** — there is no rebuild. The bit becomes a -verification flag rather than a state flag: - -- Set after `opt_build_*_ssa` (matches today). -- Cleared by `opt_undo_ssa`. -- Never cleared by individual passes. 
If a pass needs to mutate instructions, - it goes through `opt_ssa_edges_*` and the invariant survives. - -In debug builds, `opt_verify` (`pass_o2.c` checkpoints) gains an -`opt_ssa_edges_check` step that re-derives uses from instructions and -compares against the live edge lists. This is the safety net for the -migration: any pass that mutates instructions without maintaining edges -trips the verifier at the next checkpoint. - -## Migration plan - -The migration treats every SSA-era pass as opt-in. The structural move lands -green with downstream passes disabled, then each pass is re-enabled in turn -with its MIR-parity work. - -### Phase 0 — quarantine SSA-era passes - -In `src/opt/pass_o2.c:opt_cleanup`, gate every pass between -`opt_build_reg_ssa` and `opt_undo_ssa` behind a per-pass `#ifdef -CFREE_OPT_SSA_PASS_<NAME>`. By default all are undefined. The schedule then -runs: - -```text -opt_build_cfg, opt_jump_cleanup, opt_build_cfg -opt_build_reg_ssa -opt_build_ssa -opt_undo_ssa -opt_jump_opt -``` - -That is the smallest legal O2 schedule: build SSA, immediately undo, run -the shared lowering pipeline. It is functionally equivalent to O1 plus the -SSA round-trip — expensive but correct, and the bisection floor for the -structural change. - -Tests that currently exercise SSA-era transformations -(`128_o2_branch_join_addr_mem.toy` and friends) move to an "expected -mediocre" tier in `test-opt` and `test-toy`: they must continue to compile -and produce correct output, but quality is not asserted until the -corresponding pass is re-enabled. This avoids losing coverage during the -migration. `make bench-opt` at O2 will regress against today's numbers -during the quarantine window — that is expected and recorded in -`OPT_PERF.md` as a known trough. - -### Phase 1 — land the data structures - -1. Add `OptSsaEdge`, the pool fields on `Func`, and the `u32 edge` / - `u32 edge_index` slots on `Operand`. -2. 
Add `u32* pred_edges` to `IRPhiAux`; `u32* operand_edges` to the call / - asm / intrinsic / ret / scope aux structs (sizes track the existing - use enumeration in `opt_collect_inst_uses`). -3. Implement the core operations: `opt_ssa_edge_add`, - `opt_ssa_edge_remove`, `opt_ssa_edge_relink`, `opt_ssa_redirect_def`, - `opt_ssa_edges_add_inst`, `opt_ssa_edges_drop_inst`. -4. Implement `opt_ssa_edges_check` (debug-only verifier). -5. Wire `opt_build_*_ssa` to build edges in one forward pass; wire - `opt_undo_ssa` to drop them. -6. Delete `opt_uses[]`, `OptUse`, `opt_rebuild_def_use`, and every call to - it. Each call site converts to a direct edge walk. - -Acceptance: O2 quarantine schedule (Phase 0) passes `make test-opt -test-toy`, with the verifier enabled in debug builds. - -### Phase 2 — re-enable passes, one per landing - -Each re-enabled pass lands as its own commit (or small series), shaped as follows: - -1. Implement the MIR-equivalent algorithm using incremental edges from the - start — never reintroduce a wholesale rebuild. -2. Define the corresponding `CFREE_OPT_SSA_PASS_<NAME>` and enable it in - the O2 schedule. -3. Restore the pass's quality assertions in `test-opt` / `test-toy`. -4. Refresh the corresponding row in `doc/OPT_PERF.md` and add an iteration - note documenting the MIR parity (or improvement) achieved. - -Suggested order, chosen so each pass exercises only edges already proven by -its predecessors: - -1. **SSA DCE** (`opt_ssa_dce`). The simplest consumer: walks edges, deletes - instructions with no uses, calls `opt_ssa_edges_drop_inst`. Validates the - delete path. -2. **Copy cleanup / copy prop** (`opt_copy_cleanup`, `opt_copy_prop`). Heavy - use of `opt_ssa_redirect_def`. Validates the redirect path. Folds into a - single pass (vs MIR's `copy_prop` — see `OPT_PERF.md` gap analysis). -3. **Address transform** (`opt_addr_xform`). Per-use rewrite using - `opt_ssa_edge_relink`. Implements MIR's per-use folding (vs cfree's - all-or-nothing today). -4. 
**Simplify** (`opt_simplify`). Pure local rewrites — minimal new edge - maintenance, mostly relink. -5. **GVN** (`opt_gvn`). Worklist re-enqueue after branch fold (MIR parity - item from gap analysis). Use the per-edge `flag` for the worklist. -6. **DSE** (`opt_dse`). Liveness computed once outside; pass walks - incrementally. Drops the inline fixpoint. -7. **LICM** (`opt_licm`) — with pressure filter and loop-tree reuse from - the start. This is the priority runtime fix from `OPT_PERF.md`; it - waits until edges are stable because the filter needs use-set queries. -8. **Pressure relief**, **conventional SSA**, **ssa_combine**, **undo SSA** - round out the schedule. - -The structural move itself is Phases 0–1. Phase 2 is the pass-by-pass -re-introduction and overlaps the O2 runtime roadmap in `OPT_PERF.md`. - -## Files touched - -Structural change (Phase 0–1): - -- `src/opt/ir.h` — `OptSsaEdge`, pool fields on `Func`, `Operand.edge` and - `Operand.edge_index`, aux struct edge arrays. -- `src/opt/pass_analysis.c` — replace `opt_rebuild_def_use` and `OptUse` - collection with edge-aware enumeration helpers. -- `src/opt/pass_ssa.c` — `opt_build_reg_ssa`, `opt_build_ssa`, `opt_undo_ssa` - switch to edge maintenance. -- `src/opt/pass_o2.c` — add `CFREE_OPT_SSA_PASS_*` gates around the SSA-era - schedule. -- New `src/opt/ssa_edges.c` (or extend `pass_analysis.c`) — pool, free - list, core operations, debug verifier. - -Pass-by-pass (Phase 2): one file per pass, edited in place. - -## Open questions - -These are deferred to implementation: - -- **`SEG_SHIFT` choice.** 7 (128 edges/segment ≈ 4 KB) is the starting - point. Measure peak edge count on `matrix`/`hash` and tune up to 8 if - segments are routinely full. Don't tune down — segment header overhead - outweighs waste at small sizes. -- **`OPK_INDIRECT.edge_base` / `edge_index` packing.** Two adjacent u32s - is the obvious layout. Alternative: pack as `u32 edges[2]` with kind - implicit by slot. 
Pick whichever reads cleanest during the pass_analysis - rewrite. No performance difference. -- **Per-edge `flag` reuse.** MIR uses it for renaming and for GVN/copy - worklists. cfree passes may want more than one bit — start with one, - add a second `u8 flag2` only when a pass concretely needs it. -- **Verifier cost.** `opt_ssa_edges_check` is debug-only. If it becomes - too slow even in debug, gate it behind a separate `CFREE_OPT_SSA_VERIFY` - define so day-to-day debug builds still complete in reasonable time. diff --git a/doc/TESTING.md b/doc/TESTING.md @@ -0,0 +1,349 @@ +# Testing + +cfree's test architecture is built around one core idea: **drive testing from +what codegen actually emits**, not from a hand-written corpus of cases someone +remembered to write. A hand corpus only ever tests the instructions we thought +to write down; the compiler's own output exercises every instruction codegen +emits, and tracks codegen automatically as new instructions appear. This doc +describes the harness design — the layering, the invariants each lane locks in, +and why the lanes are shaped the way they are. For the surfaces these tests +exercise see [ASM.md](ASM.md) (assembler/disassembler/`cc -S`), +[ARCH.md](ARCH.md) (per-arch ISA tables), and [FRONTENDS.md](FRONTENDS.md) +(the Toy and C frontends). + +The test tree lives under `test/`, with per-area subdirectories +(`test/asm/`, `test/toy/`, `test/smoke/`, `test/libc/`, plus unit-test areas +like `test/arch/`, `test/elf/`, `test/opt/`). Build/run wiring lives in +`test/test.mk`; shared helpers (the C unit-test harness `test/lib/cfree_unit.h` +and its build rules `test/lib/unit.mk`; the cross-exec helper +`test/lib/exec_target.sh`) are factored out for reuse. + +## Why codegen-driven round-trip testing + +`cc -S` is not a separate pretty-printer: it is the **disassembler** plus module +scaffolding (`src/api/asm_emit.c`). 
The same `arch_disasm_decode` surface backs +both `cc -S` and `objdump -d`, so `-S` text *is* cfree's disassembly rendered as +re-assemblable assembly. That makes a single feedback loop possible: + +``` + source ──cc -c──▶ object (codegen's bytes + relocs) + │ + └────cc -S──▶ assembly ──as──▶ object (re-encoded bytes + relocs) +``` + +If the disassembler and assembler are mutually faithful, the two objects are +byte-and-reloc identical. Any instruction codegen emits is, by construction, in +the loop — so the coverage tracks the compiler, not a static list. + +## The three completeness layers (L0/L1/L2) + +`test/asm/roundtrip.sh` runs a C corpus (`test/asm/roundtrip/`, each case a +`test_main`) through three lanes, cheapest and sharpest first. They build on +each other: a higher lane is only trustworthy once the lower one passes. + +``` + L0 decode-completeness cc -S, assert no in-function decode failure + L1 byte + reloc round cc -c vs cc -S | as (.text bytes AND reloc table) + L2 exec equivalence run direct.o vs run rt.o (exit code / stdout) +``` + +**L0 — decode completeness.** Disassemble the program and assert no instruction +codegen emitted failed to decode. This is host-independent (no execution), +cheap, and pinpoints the exact undecodable word. The signal must be +*unambiguous*: padding and data are not decode failures. The harness keys on the +`.inst 0x<word>` marker — a real assembler directive emitted *only* by the +disassembler's unknown-word path (`aa64_write_unknown`) — and counts it only +inside `.text`. Inter-function padding (`.byte 0x0` fill) and data-section +`.byte` are explicitly distinct from the in-stream `.inst` marker, so "grep for +the marker inside `.text`" is an exact completeness test. + +**L1 — byte + reloc round-trip.** Compile the source two ways: `cc -c` directly, +and `cc -S | as`. Diff both the `.text` bytes **and** the relocation table. 
The +reloc-table comparison is essential — a same-section branch resolved in place +versus kept as a relocation produces identical bytes but a different relocatable +object, and only the reloc diff catches it. L1 covers the sections `cc -S` +reproduces (`.text`, `.rodata`, `.data`, and `.bss`; switch jump tables live in +`.rodata`). `.bss` is NOBITS — it carries a size but no bytes, so byte +round-tripping does not apply to it; only its presence and symbols are checked. +L1 excludes sections `-S` does not emit (e.g. `.eh_frame`) so their absence in +the round-tripped object is not misread as a divergence. L1 is gated on L0: a +byte match is meaningless if the disassembler punted to a `.byte`/`.inst` +fallback (see the gotcha below). + +**L2 — exec equivalence.** Run `direct.o` and `rt.o` and compare exit codes (and +stdout, and an optional `<name>.expected` oracle). This is the end-to-end "it +runs the same" signal, tolerant of benign encoding differences L1 would flag. +Crucially, **no qemu is needed for the host arch**: execution goes through the +in-process JIT (`cfree run` / the `jit-runner`), and cross-arch execution is +available via the emulator (`cfree emu`) — see [JIT.md](JIT.md) and +[EMU.md](EMU.md). L2 runs only when the target arch matches the host +(native JIT); otherwise it self-skips. + +Opt levels matter: `-O0` and `-O1` emit different encodings, so each lane runs +at every level in `CFREE_TEST_OPTS`. + +### The symbolization invariant + +L1/L2 only work because `cc -S` is **re-assemblable**, not a listing. A listing +emits numeric branch targets (`b 0x100`) and de-symbolized relocated operands +(`bl 0x11c` instead of `bl add`); re-assembling that branches to the wrong place +and loads from address 0. 
The invariant the symbolizer maintains is: every +operand a relocation patches is rendered in the assembler's own reloc-operator +syntax (aa64 `:lo12:`/`:got:`, rv64 `%pcrel_hi`/`%pcrel_lo`, x64 `sym(%rip)`/ +`@PLT`), and every intra-section branch/PC-relative target gets a synthesized +local label. The symbolizer is the inverse of what the assembler parses; see +[ASM.md](ASM.md) for the RelocKind→syntax mapping. + +A second structural decision makes this robust across *different* assemblers: +code locations an encoding-divergent assembler must be able to recompute — +switch jump-table entries and `&&label` address-takes — are referenced through a +**per-basic-block local symbol** the emitter mints (`mc_label_symbol`, +`src/arch/mc.c`), uniformly on all three arches. The jump table emits +`.quad .Lcfblk.*` and the address-take a standard PC-relative relocation against +that block symbol, rather than a baked numeric offset. Because the reference is +genuinely relocatable, a third-party assembler's encoding choices (movabs vs +mov-imm32, `jmp` rel32 vs rel8, RVC compression) cannot shift a baked offset onto +the wrong instruction. `cc -c` and `cc -S` emit the same relocations, so the L1 +byte/reloc lanes stay faithful too. + +### The `.byte`/`.inst` fallback gotcha + +The disassembler's fallback for an undecodable word reproduces the exact original +bytes (`.byte 0x..`, or `.inst 0x<word>`). Re-assembling that fallback yields the +same bytes — so a *run-only* round-trip **passes even when the disassembler is +incomplete**. The decode/byte check (L0, and the reloc half of L1) must gate +before an exec round-trip is trusted, otherwise L2 green hides a real +disassembler gap. This is why the lanes are ordered, and why L0 keys on the +in-stream `.inst` marker specifically — it is the unambiguous "the disassembler +could not decode this word" signal. 
+ +## Self-symmetry sweep + checked-in baseline + +The codegen round-trip exercises the disassembler only on what the compiler +emits. `test/asm/symmetry.sh` complements it by sweeping the *tools' own* +instruction set for asm⊗disasm symmetry, independent of codegen: + +- **decode-side**: `test/arch/aa64_sweep_gen.c` synthesizes one representative + encoding per row of the disassembler's instruction table; each is decoded, + the disassembly re-assembled, and decoded again — the text must be a fixed + point. Catches a form the disassembler decodes but the assembler can't + re-encode (decode-only), or where they disagree. +- **encode-side**: assemble every aa64 `test/asm/encode/*.s` and disassemble; + any `.inst` means the assembler encodes a form the disassembler can't decode + (encode-only). + +The two tools cover slightly different ISA subsets on purpose (forms the +assembler accepts for completeness that codegen never emits, so the +disassembler never had to decode them). Known asymmetries live in a checked-in +snapshot, `test/asm/symmetry.baseline`; the sweep **passes iff the current set +equals the baseline**. So it gates against *new* asymmetry (a regression) while +the baseline documents the disasm-completeness backlog. Closing a gap shrinks +the baseline (`symmetry.sh --update`). This is the standard cfree pattern: an +honest, checked-in "what is currently known-incomplete" snapshot that turns a +regression into a diff. + +## The diff-vs-llvm second oracle + +"No decode failure" does not catch a *wrong* decode, and a self-round-trip +can't either — cfree's own re-encode would repeat the mistake. +`test/asm/diff_llvm.sh` adds an independent oracle (llvm-mc), **byte-level** so it +sidesteps disassembly-text normalization (movz-vs-mov, `#16`-vs-`#0x10`), which +would founder on alias/format differences: + +- **encode lane**: assemble every aa64 `encode/*.s` with both `cfree as` and + `llvm-mc`; the `.text` bytes must match. Validates cfree's assembler. 
+- **disasm lane**: `cc -c` gives codegen's bytes; `cc -S` gives cfree's + disassembly as re-assemblable text; assemble that with llvm-mc and require the + bytes to match codegen's. If llvm agrees the `-S` text means the original + bytes, the decode is correct. + +The one benign disagreement is recognized structurally: cfree codegen keeps a +same-section CALL26/JUMP26/CONDBR relocation that llvm-mc (like GNU as) resolves +in place and drops. The bytes are link-equivalent, only the relocatable form +differs, so the reloc-table diff distinguishes it and it is not flagged. +Opt-in; skips cleanly when `llvm-mc` is absent. + +## Host-assembler execution lanes (cc -S is standard assembly) + +The round-trip and diff-llvm lanes use either cfree's own assembler or compare +*bytes*. A separate question is whether `cc -S` is **standard assembly a +third-party assembler accepts and that means the same thing** — judged by +**execution**, not bytes (cfree and clang emit different but execution-equivalent +code, so a byte/text match would be meaningless). + +`test/asm/hostas_toy.sh` answers it on the native target. Per Toy case (both +`-O0`/`-O1`) it emits **one** `cc -S` and feeds it to two assemblers, then links +each with `cfree ld` and runs it, asserting the exit-code oracle: + +``` + cc -S ──┬── cfree as ──cfree ld──▶ ./a.out exit == oracle (lane A, baseline) + └── clang -c ──cfree ld──▶ ./b.out exit == oracle (lane B, the test) +``` + +The assembler is the only variable. Lane A is a baseline (cfree both writes and +reads its own dialect, so a private-dialect quirk could hide); lane B is the real +test — a standard assembler can't paper over such a quirk. 
This is what proves +`cc -S` emits the clean dialect of the target object format: the format-divergent +directive *spelling* (`.type`/`.size`/`.section`/`.p2align`) lives behind an +`AsmSyntax` vtable selected by object format in `src/api/asm_emit.c`, and the +relocation *operand* syntax (ELF `:lo12:` vs Mach-O `@PAGEOFF`) behind a per-arch +`ArchAsmOps.reloc_operand` hook — keeping the printer free of arch-specific reloc +knowledge while staying format-correct. cfree's own `as` parses the dialect of +*its* target too, so the single `cc -S` output assembles identically under both. +The clang-as lane gates by default; `CFREE_HOSTAS_ENFORCE_CLANG=0` demotes it to +XFAIL (useful while bringing up a new arch/format whose printer side isn't done). + +`test/asm/hostas_cross.sh` is the cross extension: the same +two-assembler-by-execution test, but for ELF **Linux** targets (`aarch64`/`x86_64`/`riscv64`) +emitted with `cc -S -target <triple>`, assembled by both cfree-as and clang, +linked into a **static non-PIE** ELF with `cfree ld -static`, and run under +**podman/qemu** via `test/lib/exec_target.sh`. The executable is made runnable +with no libc/loader by linking the freestanding crt `test/link/harness/start.c` +(`-Dtest_main=main`): `_start` runs ctors, calls `main`, and exits with its +return (the oracle) via a raw syscall. Each target **self-skips** (never fails) +unless the host has (1) a clang cross target, (2) a runner (podman/qemu), (3) a +working `cc -S | cfree as` round-trip for that arch, and (4) a passing **bounded** +exec smoke — so a wedged emulator downgrades to SKIP instead of hanging. Both +lanes are judged purely by matching exit codes. Opt-in; both lanes skip cleanly +without clang/podman. + +### Shared cross-exec helper + +`test/lib/exec_target.sh` is the one place that knows how to *run* a guest ELF +for a `<arch>-<os>` target tag. It offers synchronous (`exec_target_run`) and +batched-queue (`exec_target_queue` + `exec_target_flush`) modes. 
The batched mode +groups queued cases by target and runs **one `podman run` per group**, amortizing +podman's per-launch client round-trip across the whole suite; the in-container +loop caps each case (`EXEC_CASE_TIMEOUT`) so one hanging binary can't wedge the +batch and silently fail every later case. The helper picks native exec, then +qemu-user, then a batched podman container; when none is available it reports +"no runner" so callers SKIP cleanly. It is shared by the Toy cross lane, the hostas-cross lane, and the +link/smoke/libc harnesses, so cross-exec policy lives in exactly one file. + +## The Toy corpus as CG-API coverage + +The Toy frontend (`lang/toy/`, see [FRONTENDS.md](FRONTENDS.md)) is a small +language that exists to exercise the full CG API op set, and every case carries +an exit-code oracle (`test/toy/cases/<name>.expected`). `test/toy/run.sh` runs +each case through several **paths**, each a distinct backend/seam, all judged +against the same oracle: + +``` + R cfree run in-process JIT, native + I cfree run --no-jit the IR interpreter (see INTERPRETER.md) + L cfree cc -c | cfree ld native object → linked executable + X cfree cc -target | ld cross ELF (aa64/x64/rv64) run via exec_target + C cfree cc --emit=c | host cc the C-source backend (see CBACKEND.md) + W cfree cc -target wasm32 the Wasm backend → re-lower → run (see WASM.md) +``` + +One corpus thus validates the JIT, the interpreter, the linker, cross targets, +and the C and Wasm `CGTarget`s — proving the CGTarget seam is frontend-agnostic. +Paths the interpreter/C/Wasm targets don't yet implement emit a greppable +"not supported"/"not yet implemented" diagnostic and report SKIP, so the suite +signal stays "real regressions". `test/toy/err/` holds compile-failure cases +checked against an expected diagnostic substring. 
+ +Because the Toy corpus is broad and oracle-carrying, it is reused for free +coverage elsewhere: `test/asm/roundtrip_toy.sh` runs it through the L2 exec +round-trip (`cc -S | as | run` and `| ld | exec`, exit must equal the oracle), +and both host-assembler lanes use it. This reuse found a real miscompile (a +multiply-high the disassembler couldn't decode, silently dropped by `as` until +the `.inst`-emits-the-word fix) that the hand corpus never reached. + +## Unit tests + +Lower-level invariants are covered by C unit-test binaries built from +`test/lib/unit.mk` and linking `test/lib/cfree_unit.h`. There are two link +flavors that differ only in what they can reach: `UNIT_TESTS_AR` link the public +archive (internal symbols hidden — exercises the public surface), `UNIT_TESTS_OBJS` +link the raw objects (internal hidden symbols reachable). These cover ISA +encode/decode (`test/arch/`), the CG API and ABI classification (`test/api/`), +optimizer passes (`test/opt/`), DWARF roundtrip (`test/dwarf/`, `test/debug/`), +object/link plumbing, the emulator, and the interpreter. Registering one is two +lines (a stem + its `_SRC`), and both shared test headers are dependencies of +every binary so editing them rebuilds dependents. + +## Smoke tests per arch + +`test/smoke/x64.sh` and `test/smoke/rv64.sh` are end-to-end sanity checks for the +multi-arch exec pipeline itself. They build a tiny freestanding static ELF (a +direct `exit` syscall in `_start`, no libc/relocations/PIE) with a cross clang, +push it through `exec_target_run` and `exec_target_queue`+`flush`, and assert the +expected exit code on both paths. The point is to validate the harness plumbing +(cross-compile → podman/qemu → recorded rc) before relying on it for the heavier +lanes, and to give a clear per-tool ok/MISSING diagnosis when a host lacks a +runner. 
There is also a header smoke test (`test/smoke.c`): every freestanding +header must parse and expose its required macros/typedefs under a strict +freestanding compile. + +## libc conformance vs glibc/musl + +`test/libc/` proves `cfree ld` and the cfree runtime interoperate with real, +unmodified system libcs. Each case in `test/libc/cases/*.c` is compiled against an +extracted sysroot and linked by `cfree ld` against the real libc, then run and +checked against an expected exit code and optional stdout substring: + +- `test/libc/musl/` — Alpine + musl, where the loader is the libc itself. +- `test/libc/glibc/` — Debian + glibc, dynamic-link only (static glibc relies on + dlopen-loaded NSS and isn't a real deployment shape). glibc's loader is a + separate ELF, so the run passes `-dynamic-linker` and hands `libc.so.6` plus + `libc_nonshared.a` directly (cfree ld doesn't parse the GROUP linker script). + +This exercises the dynamic-link path of `cfree ld` (PT_INTERP, PT_DYNAMIC / +DT_NEEDED, `.dynsym`/`.gnu.hash`, `.rela.plt`/`.got.plt`) against a real loader — +see [LINK.md](LINK.md). Arch selection is `CFREE_LIBC_ARCHES` (aa64/x64/rv64); +a missing sysroot or runtime for an enabled arch is SKIP, not failure. + +A related guard, `test-lib-deps`, asserts `libcfree.a`'s set of external +(undefined) symbols matches a checked-in allowlist (`test/lib_deps.allowlist`) +and that a relocatable link of the library exposes no non-public symbols. This +keeps the freestanding library's dependency surface from drifting silently. + +## Bootstrap reproducibility (stage2 == stage3) + +The strongest end-to-end correctness signal is that cfree can compile itself to a +**fixed point**. 
The bootstrap (driven from the top-level `Makefile`, in both +debug/-O0 and release/-O1 modes) builds: + +``` + stage1 = host-built cfree, copied into the bootstrap tree + stage2 = stage1 compiling the full source tree + stage3 = stage2 compiling the full source tree +``` + +The check is `cmp stage2/cfree stage3/cfree` — byte-identical binaries (and the +recipe also compares every `.o`). If stage2 mis-compiles any part of cfree, the +two stages diverge, so this exercises a very large slice of the language, the +ISA, the optimizer, the assembler, the linker, and the object writers at once, on +real code rather than test fixtures. `bootstrap-test-toy` additionally runs the +full Toy corpus through a bootstrapped compiler to confirm it not only reproduces +but works. The diagnostic discipline that makes a bootstrap divergence tractable +is to compare a single stage2-compiled object against the host compiler's output +for the same TU — separating a malformed-object bug from a link-driver symptom — +and to triage O1 codegen without `-g`, which perturbs object layout. + +## Aggregation and conventions + +`test/test.mk` defines the targets. A default `test` aggregate runs the +host-independent lanes (frontend corpora, unit tests, L0/L1 round-trip, libc-dep +guard); the exec-dependent and second-oracle lanes (L2 exec, symmetry, diff-llvm, +hostas-toy/cross, smoke, libc conformance) are opt-in so the default run stays +host-independent and fast. Bootstrap is *not* part of this test system: it is a +separate top-level target (`make bootstrap`, `make bootstrap-debug/release`) +driven from the top-level `Makefile`, not from `test/test.mk` — see the +bootstrap section above. Conventions shared across harnesses: + +- **SKIP, never silently pass.** A lane that can't run (no runner, no cross + toolchain, an unimplemented backend path) reports SKIP with a reason rather + than vanishing. 
Several harnesses honor `CFREE_TEST_ALLOW_SKIP` to decide + whether a SKIP is a soft pass or a hard fail in a given context. +- **Checked-in baselines turn incompleteness into a regression diff.** The + symmetry baseline and the per-case `.skip` sidecars document what is + known-incomplete; the gate fires only on a *change* to that set. +- **Per-case sidecars over conditionals in the harness.** Applicability and + expectations ride alongside the case (`<name>.expected`, `<name>.targets`, + `<name>.skip`, `<name>.objdump`, the `err/` cases), so adding or quarantining a + case is a data change, not a script edit. + +Planned work: see doc/plan/. diff --git a/doc/TOY.md b/doc/TOY.md @@ -1,868 +0,0 @@ -# Toy Language Specification - -Toy is a small, explicit language for exercising the public code generation API -in `include/cfree/cg.h`. It is not C. It uses C-like expressions where that -keeps tests readable, and prefix-oriented syntax where C syntax would make -parsing or lowering ambiguous. - -Toy source is statically typed, single-pass friendly, and LL(1)-oriented. -Definitions are local/private by default. Public linkage, external -declarations, target details, ABI details, and low-level code generation -features are spelled explicitly. - -## Program Structure - -A program is a sequence of declarations: - -```toy -type Word = u64; - -record Pair { - a: i32, - b: i32, -} - -pub fn main(): i32 { - let p: Pair = Pair { a: 10 }; - return p.a + p.b; -} -``` - -Declarations must appear before use unless a declaration form explicitly -defines an external symbol. Source names use C linkage spelling for object-file -symbols. - -## Runnable Demo - -A broad executable demo lives in `test/toy/cases/123_spec_demo.toy`. It is part -of `make test-toy` and is intentionally written as a normal corpus case rather -than a documentation-only sample, so the implementation must keep accepting and -executing the specified syntax it demonstrates. 
- -The demo covers aliases, records, packed/aligned fields, tuple records, enums, -extern declarations, public symbols, aliases, global and static data, recursive -record pointers, function pointers, tail calls, varargs, ABI attributes, -aggregate literals, field and index lvalues, expression `if`/`switch`/`while`, -labeled loops, computed goto, memory builtins, atomics, scalar intrinsics, -conversion builtins, target queries, inline assembly, address-space pointers, -and nullable pointer casts. - -## Lexical Conventions - -Identifiers are ASCII names beginning with a letter or `_`, followed by letters, -digits, or `_`. Integer literals are decimal. Floating literals contain a -decimal point. Byte string literals use double quotes. - -Byte string literals support `\0`, `\n`, `\t`, `\"`, `\\`, and `\xNN` escapes. -The type of a byte string literal is `[N]u8`, where `N` is the number of -emitted bytes. There is no separate text string type and no implicit trailing -zero byte. - -Language builtins use an `@` prefix. Builtin constants, attribute names, and -attribute arguments use dot names, for example `.seq_cst`, `.sysv`, -`.strict_alignment`, `.bind`, and `.section`. This keeps names such as `bind`, -`visibility`, and `section` available as ordinary identifiers. - -Attribute lists use `@[...]`. - -## Types - -Scalar types: - -- `void` -- `bool` -- `i8`, `u8`, `i16`, `u16`, `i32`, `u32`, `i64`, `u64`, `i128`, `u128` -- `isize`, `usize` -- `f32`, `f64` -- `va_list` - -Compound and user types: - -- Pointer: `*T` -- Address-space pointer: `*addrspace(N) T` -- Array: `[N]T` -- Slice: `[]T` -- Function: `fn(T, U): R` -- Variadic function: `fn(T, ...): R` -- Transparent alias: `type Name = T;` -- Record: `record Name { field: T, ... }` -- Tuple record: `tuple Name { T, U }` -- Anonymous record: `record { field: T, ... }` -- Enum: `enum Name: BaseInt { .value = N, ... }` - -`type Name = T;` creates a transparent source alias. 
Values of `T` type-check -where `Name` is expected and values of `Name` type-check where `T` is expected. -The alias preserves source spelling for diagnostics and debug info but does not -create a distinct nominal type. - -Named records, tuple records, and enums are nominal. To get a distinct type with -the same representation as another type, wrap it in a record or tuple record: - -```toy -tuple Word { u64 } -``` - -Recursive types are expressed with nominal records and pointers. A record body -may refer to its own name through a pointer, and mutually recursive records use -forward declarations: - -```toy -record Node { - value: i32, - next: *Node, -} - -record A; -record B; - -record A { b: *B } -record B { a: *A } -``` - -Direct by-value recursion is rejected because it has no finite size. - -For code generation, recursive pointer fields may be lowered with type-erased -pointer storage. The frontend preserves the source pointee type and recovers it -when checking field access, loads, stores, calls, and casts. The erased storage -representation must have the same size and alignment as the source pointer type. - -Function types are not first-class values by themselves. Source values use -pointer-to-function types such as `*fn(i32): i32` or `*AliasToFunction`. - -Slices are fat values with the source spelling `[]T`. They lower as a record -with readonly fields `ptr: *T` and `len: usize`, which are accessible to user -code as `slice.ptr` and `slice.len`. A slice does not own its elements and does -not extend the lifetime of the array or slice it was derived from. Slice values -may be copied, passed, returned, indexed, and sliced. - -Type qualifiers are prefix forms: - -```toy -const T -volatile T -restrict T -``` - -`restrict` is valid only on pointer types after qualifier folding. Qualifiers -inform memory access and aliasing rules while lowering to the corresponding code -generation storage type. 
- -## Declarations - -Top-level declarations: - -```toy -fn local_helper(x: i32): i32 { ... } -pub fn exported(x: i32): i32 { ... } -extern fn puts(s: *i8): i32; - -var counter: i64; -let answer: i32 = 42; -pub var exported_counter: i64; -extern var errno: i32; -extern let ro_table: *u8; -var @[.threadlocal] tls_counter: i32; -extern var @[.threadlocal] errno_tls: i32; - -alias public_name = private_name; -pub alias exported_alias = target_name; -``` - -`fn`, `let`, `var`, and `alias` definitions are local binding by default. -`pub` changes the default binding to global. `extern` declares without defining -and defaults to global binding. The `.threadlocal` object attribute marks object -storage as thread-local and may appear with `pub` or `extern`. - -`let` defines readonly object storage unless it is `extern`. `var` defines -mutable object storage. `alias` targets must already be declared. - -Attribute lists are placed after the syntactic keyword that introduces the item -they decorate. Declaration attributes come after `fn`, `let`, `var`, or -`alias`. Record attributes come after `record`, `tuple`, or `enum`. Field and -parameter attributes come after the field or parameter name. Return ABI -attributes come after the return type. - -```toy -pub fn @[.bind(.weak), .visibility(.hidden), .section(".text.hot"), .hot] -fast_path(x: i32): i32 { ... 
} - -pub let @[.section(".rodata.tests"), .align(16), .used] -table: [4]i32 = [1, 2, 3, 4]; - -extern var @[.threadlocal, .tls_model(.local_exec)] tls_counter: i32; -``` - -Symbol attributes: - -- `.bind(.local|.global|.weak)` -- `.visibility(.default|.hidden|.protected)` -- `.used`, `.dllimport`, `.dllexport` - -Function attributes: - -- `.section("name")`, `.noreturn`, `.ifunc`, `.cold`, `.hot` -- `.naked`, `.interrupt`, `.no_red_zone` -- `.stack_align(N)`, `.target_features("features")` -- `.callconv(.target_c|.sysv|.win64|.aapcs|.wasm|.interrupt)` - -Object attributes: - -- `.section("name")`, `.align(N)`, `.readonly`, `.threadlocal`, `.static` -- `.tls_model(.auto|.local_exec|.initial_exec|.local_dynamic|.general_dynamic)` -- `.common` - -Data-definition attributes for object definitions: - -- `.retain`, `.merge`, `.strings`, `.entsize(N)` - -Parameter and return ABI attributes are written inline: - -```toy -extern fn sx(x @[.signext]: i8): u8 @[.zeroext]; -extern fn byval(p @[.byval, .align(16)]: *Pair): void; -extern fn borrowed(p @[.byref, .readonly, .noalias, .nonnull, - .dereferenceable(32)]: *Pair): void; -``` - -ABI attributes are `.signext`, `.zeroext`, `.sret`, `.byval`, `.byref`, -`.inreg`, `.noalias`, `.readonly`, `.writeonly`, `.nonnull`, `.nest`, -`.align(N)`, and `.dereferenceable(N)`. - -## Records, Enums, And Aggregates - -Named records, tuple records, and enums are nominal. Record fields are named. -Tuple record fields are anonymous and are accessed by numeric field names in -declaration order. - -```toy -record @[.packed] Header { - tag: u8, - len: u32, -} - -record Padded { - x @[.align(16)]: i32, - y: i32, -} - -tuple Tuple { - i32, - i32, -} - -enum Color: i32 { - .red = 1, - .green = 2, - .blue = 3, -} -``` - -Record fields, tuple fields, and enum values are comma-separated. A trailing -comma is allowed. - -Enum values are dot constants. 
They require an expected enum type from context, -such as a typed initializer, switch selector, parameter type, or explicit cast. - -Record literals use named fields. Tuple record literals use positional fields. -Array literals use the expected array type for count and element type. Omitted -fields and trailing array elements are zero-filled. - -```toy -let p: Pair = Pair { a: 10, b: 32 }; -let pz: Pair = Pair {}; -let t: Tuple = Tuple { 10, 32 }; -let xs: [4]i32 = [1, 2]; -let zeros: [4]i32 = []; -let z: Pair = {}; -let c: Color = .green; -``` - -Aggregate assignment is allowed for identical array, record, or alias-expanded -storage types. - -Anonymous records are structural product types. They are valid as builtin result -types, inline assembly result types, local inference results, and explicit -`record { ... }` type literals. Two anonymous record types are identical when -their field names, field order, and field types are identical. They cannot be -recursive and do not introduce source declarations. - -When a code generation intrinsic produces multiple stack values, Toy treats -those values as an anonymous record at the source level. Field projection may -consume the component directly. If the value must be stored, passed, returned, -or addressed, the frontend materializes a private code generation record type -with the same field layout. - -## Expressions - -Toy values are statically typed. There is limited local inference: an -initialized block-local `let` or `var` may omit `: T` when the initializer has a -complete, unambiguous type. Function parameters, function returns, top-level -objects, external declarations, empty aggregate literals, and numeric literals -that need a storage width require an expected or explicit type. - -There is no implicit numeric conversion except function-name decay to a -function pointer. Explicit conversion uses `expr as T`. Conditions accept -`bool` and integer-like values; zero is false and non-zero is true. 
- -`NULL` is the null pointer literal. It requires an expected pointer type or an -explicit cast, for example `NULL as *i32`. - -Operators are parsed in small precedence islands. Operators within the same -island associate left-to-right. Operators from different binary islands must be -parenthesized explicitly, except that additive and multiplicative operators may -be mixed with the usual precedence rules. For example, `a + b * c` is accepted -as `a + (b * c)`, but `a + b << c` is rejected and must be parenthesized. - -- Postfix chain: call, index, field, pointer dereference `expr.*` -- Prefix unary: `+`, `-`, `!`, `~`, `&` -- Cast: `expr as T` -- Multiplicative: `*`, `/`, `%` -- Additive: `+`, `-` -- Shift: `<<`, `>>` -- Less-than: `<` -- Less-than-or-equal: `<=` -- Greater-than: `>` -- Greater-than-or-equal: `>=` -- Equal: `==` -- Not-equal: `!=` -- Bitwise and: `&` -- Bitwise xor: `^` -- Bitwise or: `|` -- Logical and: `and` -- Logical or: `or` - -Assignment is statement-only and does not produce a value. - -### Lvalues - -Lvalue forms: - -- Variable: `x` -- Dereference: `expr.*` -- Index: `expr[index]` -- Named field: `expr.field` -- Tuple field: `expr.N` -- Address-of: `&lvalue` - -Indexing works for array lvalues, pointer-to-element values, and -pointer-to-array values. Indexing a `*[N]T` implicitly dereferences the pointer -to the array, so `p[i]` is equivalent to `p.*[i]` in that case. Field access on -a `*Record` implicitly dereferences the pointer; Toy has no `->` operator. - -`&arr` produces `*[N]T`, a pointer to the whole array. `&arr[0]` produces `*T`, -a pointer to the first element. - -Slicing uses `expr[start:end]`, where `start` and `end` are `isize` -expressions. Arrays and slices can be sliced. Slicing `[N]T` or `[]T` produces -`[]T` with `.ptr` pointing at element `start` and `.len` equal to `end - start`. -Indexing arrays, pointers, and slices requires an `isize` index. Indexing a -slice with `slice[index]` indexes through `.ptr`. 
Toy does not -insert runtime bounds checks for indexing or slicing. - -## Statements - -Blocks introduce lexical local scopes: - -```toy -{ - let x: i32 = 1; - var y: i32 = 2; -} -``` - -Statement forms: - -```toy -let name: T = expr; -let name = expr; -var name: T = expr; -var name = expr; -lvalue = expr; -expr; - -if cond { ... } else { ... } -while cond { ... } -label: while cond { ... } -switch expr { ... } -label: switch expr { ... } -break; -break expr; -break label; -break label expr; -continue; -continue label; -return; -return expr; -return tail callee(args); -``` - -`return tail callee(args);` supports direct functions and function-pointer -callees. Tail calls to variadic functions are rejected. - -Block-local `let` and `var` declarations may use `@[.static]` to allocate -function-local static storage with internal linkage and lexical visibility: - -```toy -fn next_id(): i32 { - var @[.static] id: i32 = 0; - id = id + 1; - return id; -} -``` - -## Expression Control Flow - -`if` can be an expression when both arms produce the same type. Each arm is a -block: it may contain intervening statements, and the final expression without a -semicolon is the block value. - -```toy -let x: i32 = if cond { - let base = 4; - base + 6 -} else { - let base = 10; - base + 10 -}; -``` - -A final expression with a semicolon is an expression statement and does not -provide the block value. - -Result-typed loops use `while<T>` and an `else` expression for fallthrough: - -```toy -let found: i32 = while<i32> i < n { - if xs[i] == needle { - break i; - } - i = i + 1; - continue; -} else { - -1 -}; -``` - -Structured loops and switches may have labels. `break label;`, -`break label expr;`, and `continue label;` target the named enclosing control -scope rather than the innermost one. `continue label;` is valid only when the -target is a loop. A value-bearing break must match the target scope's result -type. 
- -```toy -let found: i32 = outer: while<i32> row < rows { - while col < cols { - if grid[row][col] == needle { - break outer row; - } - col = col + 1; - } - row = row + 1; - continue outer; -} else { - -1 -}; -``` - -Unlabeled `break` and `continue` still target the innermost valid scope. - -`break expr;` is valid only when the target is a result-typed loop or expression -switch and must match the target result type. - -## Switch And Labels - -`switch` can be a statement or an expression. There is no `case` keyword. -Statement arms use blocks: - -```toy -switch @[.jump_table] tag { - 0 { - return 10; - } - 1, 2 { - return 20; - } - default { - return 30; - } -} -``` - -In expression context, switch arms use the same block shape and must all produce -the same type. As with `if` expression blocks, statements may precede the final -expression. - -```toy -let value: i32 = switch tag { - 0 { - let x = 4; - x + 6 - } - 1, 2 { - 20 - } - default { - 30 - } -}; -``` - -The selector must be integer-like, `bool`, or an enum. Arm labels are integer -or enum constants. `default` is optional for statement switches and required -for expression switches unless the selector is an enum and all values are -covered. Switch strategy hints are `.branch_chain` and `.jump_table`. - -Labels must be declared before placement: - -```toy -label again; -label done; - -let target: *void = @labeladdr(again); -goto *target within (again, done); - -again: -... -done: -``` - -Label address values have type `*void`. `goto *target;` is a computed goto. -`within (...)` is optional; when present, it gives the valid target set for -diagnostics and target branch-protection lowering. Omitting it is legal but may -be rejected by targets that require an explicit branch-protection target set. 
- -## Calls And Varargs - -Direct and indirect calls use the same syntax: - -```toy -fn add1(x: i32): i32 { - return x + 1; -} - -let fp: *fn(i32): i32 = add1; -let fp2: *fn(i32): i32 = &add1; -return fp(41); -``` - -A bare function name in rvalue position decays to a function pointer. Taking -the address of a function name produces the same value. - -Varargs builtins: - -- `@va_start(ap)` -- `@va_arg<T>(ap)` -- `@va_copy(dst, src)` -- `@va_end(ap)` - -## Type Queries - -Type query builtins: - -- `@sizeof<T>()` -- `@alignof<T>()` -- `@offsetof<T>(field)` - -`@offsetof` accepts named fields and tuple field indexes. - -## Memory - -Stack allocation and memory operations: - -```toy -let p: *i32 = @alloca<i32>(count, 16); - -@memmove(dst, src, size, align); -@memcpy(dst, src, size, align, .volatile); -@memset(dst, 0, size, align, .nontemporal); -``` - -When `size` and `align` are compile-time constants, `@memcpy`, `@memmove`, and -`@memset` lower to fixed-size code generation memory operations. Otherwise they -lower to the target/runtime dynamic memory intrinsic selected by the frontend. - -Memory flags are `.volatile`, `.nontemporal`, and `.invariant`. Address space -comes from pointer types. Alias and noalias scopes come from language semantics, -such as `restrict` and ABI attributes. - -Ordinary lvalues, pointer qualifiers, and assignment are the primary spelling -for loads and stores. 
- -## Data Definitions - -Ordinary `let` and `var` definitions use typed initializers: - -```toy -pub let answer: i32 = 42; -pub let pi: f64 = 3.0; -pub let msg: [6]u8 = "hello\0"; -pub let table: [4]i32 = [1, 2]; -pub let pair: Pair = Pair { a: 1 }; -pub let panswer: *i32 = &answer; - -pub var @[.common] tentative: i64; -``` - -Low-level relocatable data expressions may appear in typed object initializers: - -```toy -pub let rels: [2]i32 = [ - @pcrel(target, 4), - @symdiff(end, start, 0), -]; -``` - -Function-local static object initializers may use label addresses while the -containing function is open: - -```toy -fn run(op: i32): void { - label case0; - label case1; - - let @[.static] dispatch: [2]*void = [ - @labeladdr(case0), - @labeladdr(case1), - ]; - - goto *dispatch[op] within (case0, case1); - -case0: - return; -case1: - return; -} -``` - -Data initializer builtins: - -- `@pad(N, value)` -- `@align(N)` -- `@pcrel(symbol, addend)` -- `@symdiff(lhs, rhs, addend)` -- `@labeladdr(label)` - -`@pcrel` and `@symdiff` are valid only inside object initializers. `@labeladdr` -in an object initializer is valid only for function-local static objects in the -function that owns the label. - -`@pcrel` and `@symdiff` require an expected integer type from the object -initializer slot. That type determines the encoded relocation field width. There -is no default width: use an explicitly typed object, array element, record -field, or cast when context is ambiguous. Signed integer types are preferred for -range diagnostics, but unsigned integer storage is allowed when the bit pattern -is intentional. 
- -## Atomics - -Atomic builtins: - -- `@atomic_load<T>(ptr, order, access(...))` -- `@atomic_store<T>(ptr, value, order, access(...))` -- `@atomic_rmw<T>(op, ptr, value, order, access(...))` -- `@atomic_cmpxchg<T>(ptr, expected, desired, success_order, failure_order, - strength, access(...))` -- `@atomic_fence(order)` -- `@atomic_is_legal<T>(order, access(...))` -- `@atomic_is_lock_free<T>(access(...))` - -The `access(...)` group is optional. Empty `access()` means natural alignment, -address space from the pointer operand or zero for query-only builtins, and no -memory flags. Access entries are `.align(N)`, `.addrspace(N)`, `.volatile`, -`.nontemporal`, `.invariant`, `.alias_scope(N)`, and `.noalias_scope(N)`. -Operation builtins normally derive address space from `ptr`; `.addrspace(N)` is -required only for query-only builtins targeting a nonzero address space. - -Memory orders are `.relaxed`, `.consume`, `.acquire`, `.release`, `.acq_rel`, -and `.seq_cst`. - -RMW operations are `.xchg`, `.add`, `.sub`, `.and`, `.or`, `.xor`, and `.nand`. -Compare-exchange strengths are `.strong` and `.weak`. 
- -`@atomic_cmpxchg<T>` returns an anonymous record: - -```toy -{ prior: T, ok: bool } -``` - -## Intrinsics - -Scalar and arithmetic intrinsics: - -- `@trap()` -- `@unreachable()` -- `@compile_error("message")` -- `@clz(x)`, `@ctz(x)`, `@popcount(x)`, `@bswap(x)` -- `@expect(value, expected)` -- `@bitget(value, lo, width)` -- `@bitset(dst, src, lo, width)` -- `@fma(a, b, c)` -- `@prefetch(ptr)` -- `@assume_aligned<T>(ptr, align)` - -Low-level conversion builtins are available when tests need to select the exact -code generation conversion rather than the source-level `as` conversion: - -- `@sext<T>(x)`, `@zext<T>(x)`, `@trunc<T>(x)` -- `@ptr_to_int<T>(x)`, `@int_to_ptr<T>(x)`, `@bitcast<T>(x)` -- `@fpext<T>(x)`, `@fptrunc<T>(x)` -- `@sint_to_float<T>(x, rounding)`, `@uint_to_float<T>(x, rounding)` -- `@float_to_sint<T>(x, rounding)`, `@float_to_uint<T>(x, rounding)` - -Rounding modes are `.default`, `.nearest_even`, `.toward_zero`, `.down`, and -`.up`. - -Overflow intrinsics: - -- `@add_overflow<T>(a, b)` -- `@sub_overflow<T>(a, b)` -- `@mul_overflow<T>(a, b)` - -Overflow builtins return an anonymous record: - -```toy -{ value: T, overflow: bool } -``` - -Syscall: - -- `@syscall(nr, arg0, ..., arg5)` returns `isize` - -Non-local control transfer: - -- `@setjmp(buf)` returns `i32` -- `@longjmp(buf, value)` does not return - -`buf` is an lvalue or pointer to target-defined setjmp buffer storage. - -Bare-metal, cache, barrier, and coroutine intrinsics: - -- `@irq_save()`, `@irq_restore(prev)`, `@irq_disable()`, `@irq_enable()` -- `@dmb(scope)`, `@dsb(scope)`, `@isb()` -- `@dcache_clean(ptr, size)` -- `@dcache_invalidate(ptr, size)` -- `@dcache_clean_invalidate(ptr, size)` -- `@icache_invalidate(ptr, size)` -- `@cpu_nop()`, `@cpu_yield()`, `@wfi()`, `@wfe()`, `@sev()` -- `@coro_switch<T>(from, to, value)` returns `T` - -Barrier scopes are `.full`, `.inner`, `.inner_store`, `.outer`, -`.outer_store`, and `.non_share`. 
- -Target-specific intrinsics that cannot be legally lowered for the selected -target are compile-time errors. - -`@compile_error` emits a compile-time diagnostic and can appear in any expected -expression type. - -## Inline Assembly - -Inline assembly uses one typed builtin: - -```toy -let v: i32 = switch @target_arch() { - .arm64 { - @asm<i32>( - "add %w0, %w1, %w2", - outputs(out("=r", value: i32)), - inputs(in("r", a), in("r", b)), - clobbers("cc"), - flags(.volatile) - ) - } - .x64 { - @asm<i32>( - "leal (%1,%2), %0", - outputs(out("=r", value: i32)), - inputs(in("r", a), in("r", b)), - clobbers("cc"), - flags(.volatile) - ) - } - .rv64 { - @asm<i32>( - "addw %0, %1, %2", - outputs(out("=r", value: i32)), - inputs(in("r", a), in("r", b)), - clobbers(), - flags(.volatile) - ) - } - default { - @compile_error("unsupported asm target") - } -}; - -@asm<void>("nop", outputs(), inputs(), clobbers(), flags(.volatile)); -``` - -The template is a byte string literal. Target-specific assembly is selected with -ordinary fold-only control flow over `@target_arch()` rather than a special asm -selector form. - -Operand wrappers: - -- `in("constraint", expr)` -- `in("m", lvalue)` -- `in("i", const_expr)` -- `name = in("constraint", expr)` -- `out("constraint", name: T)` -- `inout("constraint", expr)` - -Groups: - -- `outputs(...)` -- `inputs(...)` -- `clobbers("memory", "cc", ...)` -- `clobber_abi(.caller_saved)` -- `flags(.volatile, .pure, .nomem, .readonly, .preserves_flags, .nostack, - .noreturn)` - -`outputs(...)` is required. Empty trailing groups may be omitted. - -`@asm<void>` produces no value. `@asm<T>` with one output returns that value. -`@asm<Record>` with multiple outputs maps output names to record fields. Inout -operands count as outputs for result-shape purposes. 
Anonymous record type -literals are valid as asm result types: - -```toy -let pair = @asm<record { lo: i32, hi: i32 }>( - "...", - outputs(out("=r", lo: i32), out("=r", hi: i32)), - inputs() -); -``` - -## Target Capability Queries - -Capability queries are fold-only constants for the selected target: - -- `@target_arch()` -- `@supports_callconv(kind)` -- `@supports_symbol_feature(feature)` -- `@has_backend_feature(feature)` - -Arch constants: - -- `.x86`, `.x64`, `.arm32`, `.arm64`, `.rv32`, `.rv64`, `.wasm` - -`@target_arch()` returns the selected target's arch enum value. - -Call convention constants: - -- `.target_c`, `.sysv`, `.win64`, `.aapcs`, `.wasm`, `.interrupt` - -Symbol feature constants: - -- `.weak`, `.protected_visibility`, `.dllimport`, `.dllexport`, `.comdat` -- `.common`, `.merge_sections`, `.constructor_priority` -- `.tls_local_exec`, `.tls_initial_exec` -- `.tls_local_dynamic`, `.tls_general_dynamic` - -Backend feature constants: - -- `.unaligned_memory`, `.strict_alignment`, `.red_zone`, `.simd` -- `.pointer_auth`, `.branch_protection` - -Unknown constants are compile-time errors. Known unsupported features evaluate -to `false`. diff --git a/doc/TOY_TRANSACTIONAL.md b/doc/TOY_TRANSACTIONAL.md @@ -1,368 +0,0 @@ -# Transactional Toy frontend & incremental compile - -This document is the design + implementation plan for making the Toy -frontend and the REPL compile/link/publish chain **transactional**, so that -`cfree dbg` compilation and linking are incremental and a failed snippet -leaves persistent state exactly as it was. It implements the -"Toy Transactional Frontend State" section of `doc/DBG_TODO.md`. - -Status: IMPLEMENTED. Breaking API changes were made deliberately; the goal was -a clean codebase that supports this mode well, not a minimal patch. - -Decisions taken (see §8): commit is **publish-gated** (the -`cfree_compile_session_stage`/`commit`/`abort` API), and the durable tables use -the **journaled in-place** rollback model (§3.2). 
- -Implementation notes: - -- A pre-existing latent double-free in `cfree_jit_publish` was fixed as part of - enabling the publish-failure path: on a panic (e.g. duplicate global) its - handler ran `compiler_run_cleanups`, freeing the borrowed link session's - `Linker`, but did not null the session's `linker`/`image`, so the caller's - `cfree_link_session_free` double-freed. It now nulls them, mirroring - `link_session_guard` (`src/link/link_jit.c`). This is exercised by - `test/dbg/cases/toy-redefine-function`. -- `test/dbg/cases/toy-error-recovery` was flipped from xfail to green. Its - stderr golden column was corrected from `2:15` to the parser's actual `2:14` - (verified with a plain `cc` compile of the identical wrapper); the old value - was an unvalidated xfail guess. -- The panic-path rollback shares `toy_txn_abort` with the soft-error path and - is reasoned correct; the duplicate-global publish panic exercises the session - `abort` path. A dedicated transcript for a CG-internal panic *during parse* is - not included because there is no deterministic toy construct that triggers one - while remaining a recoverable REPL session. - -## 1. Problem - -`cfree dbg` runs one long-lived `CfreeCompileSession` per language -(`driver/cmd/dbg.c:1821-1845`, cached in `s->compile_sessions[lang]`). For Toy, -that session owns a single heap-resident `ToyParser` that accumulates all -REPL declarations across snippets (`lang/toy/compile.c:6-11`). Two things -are wrong: - -1. **No rollback.** A snippet mutates the durable parser arrays *in place* - during the parse. If the snippet then fails — a diagnostic *or* a - `compiler_panic` longjmp out of the CG/backend — the durable state is left - half-mutated. The frontend papers over this with a `poisoned` latch - (`compile.c:152,188,195`) that kills the frontend permanently after the - first error. That is exactly why `test/dbg/cases/toy-error-recovery` is - xfail. - -2. 
**Per-object handles stored as durable identity.** `ToyFn.sym` / - `ToyGlobal.sym` hold a `CfreeCgSym` that is only valid for one `CfreeCg` - object (`cfree_cg_decl(CfreeCg*, …)`, `cg.h:400`). `toy_seed_repl_symbols` - rewrites every one of them on every compile (`compile.c:88-122`). Identity - and per-object handle are conflated in the durable struct. - -The object/link/JIT layers are **already transactional**: a failed compile -discards the per-compile `ObjBuilder` (`src/api/compile.c:270-279`) and the -JIT image is only touched by `cfree_jit_publish` after success, which -preflights duplicate/undefined symbols before mutating -(`src/link/link_jit.c:949-1014`). The missing transaction is Toy frontend -state plus one driver counter. - -## 2. Key facts (from investigation) - -Lifetimes (verified in `include/cfree/cg.h`): - -| Handle | Scope | Notes | -| --- | --- | --- | -| `CfreeCgSym` (`cfree_cg_decl(CfreeCg*)`) | **per-object** | invalid after `cfree_cg_free`; must not be durable identity | -| `CfreeCgTypeId` (`cfree_cg_type_*(CfreeCompiler*)`) | **compiler-durable** | safe to store across compiles | -| `ToyTypeId` (index into `type_table.types`) | **durable identity** | table is append-only + dedup'd, never compacted → indices stable | -| `CfreeCgLabel` (`cfree_cg_label_new(CfreeCg*)`) | per-object | per-compile | - -Durability boundary (`parser_core.c:112-114`, `toy_parser_reinit`): resets -`nvars/nscopes/nlabels` only. The durable cross-snippet surface is exactly -four append-only arrays: - -- `fns` / `nfns` — appended by `toy_add_fn_typed` (`decls.c:461`) -- `globals` / `nglobals` — appended by `toy_add_global_typed` (`data.c:78`) -- `type_table.types` — appended by `toy_type_add` via - `toy_type_register_*` and **via `toy_type_from_cg` on a miss** (`types.c:444`) -- `type_table.named` — appended by `toy_add_named_type` (`types.c:761`) - -`vars`/`scopes`/`labels`/`goto_targets` are per-compile scratch and need no -transaction. 
A top-level REPL `let` is **not** a `ToyVar`: `jit { … }` and -bare `{ … }` strip braces and feed the body as `REPL_TOPLEVEL` -(`driver/cmd/dbg.c:1981-2034,3034`), parsed by `toy_parse_program` → -`toy_parse_global_var` → a **durable global** (`parser.c:1830`, -`data.c:78`). That is why `value` persists in `toy-empty-repl`. - -In-place mutations of (possibly committed) durable entries — these break a -naive truncate-only rollback: - -- `toy_add_named_type` completing a forward-declared named type - (`types.c:773-779`): overwrites `existing->type/toy_type/kind/base_type`. -- `toy_type_register_named_record` upgrading an incomplete record - (`types.c:520-528`): writes `type->cg` in place. -- `toy_set_named_type_fields` / `toy_set_named_type_enum_values` allocate the - entry's `fields`/`enum_values` arrays (`types.c:796-819`). - -Within one snippet the mutated entry is itself newly-added (safe to drop on -rollback). The only hard case is completing in snippet B a record that was -forward-declared (committed) in snippet A, then B fails. - -Rollback primitive (`src/core/core.c:100-129`): `compiler_defer(c, fn, arg)` -pushes a LIFO cleanup (allocated in `c->scratch`, never reset mid-compile); -`compiler_undefer` unlinks it; `compiler_run_cleanups` fires all of them and -is called by every `setjmp` landing pad on panic (`src/api/compile.c:204-208`) -and at `compiler_fini`. `toy_error` only sets `has_error` and emits a -non-fatal diag — it does **not** longjmp (`parser_core.c:254-261`); only -`compiler_panic`/`cfree_frontend_fatal` longjmp. The established pattern is -`src/link/link.c:87` (defer on construct) / `:98` (undefer on clean free). - -Publish can reject a clean compile: `cfree_jit_publish` with -`APPEND_OBJECTS` fails on a duplicate strong global -(`src/link/link_jit.c:968`) or undefined reference — checks that live in the -JIT layer, not the frontend. 
The appended `ObjBuilder` is **borrowed** by the -image (`dbg_objs_owned=0`, `link_jit.c:1386-1387`) and intentionally leaks -until `cfree_jit_free`; it must not be freed on the success path. - -Driver `$N`: `id = ++s->expr_counter` runs **before** the compile -(`driver/cmd/dbg.c:2195`) and is not rolled back on the `goto out` failure path, -so a failed expression still burns a `$N`. The golden requires the failed -`1 +` to leave numbering at `$1` for the next valid input. - -## 3. Design - -### 3.1 Split durable state from per-compile state - -Introduce two structs in place of the monolithic `ToyParser`: - -- **`ToyModule`** (durable, owned by `ToyFrontend`, lives for the session): - the four tables (`fns`, `globals`, `type_table.types`, `type_table.named`) - holding **declaration metadata only** — names, `CfreeCgTypeId`, - `ToyTypeId`, attrs, params, mutability, variadic, kind. **No `CfreeCgSym`.** - Plus durable counters (`static_counter`) and the builtin type - registrations (registered once). - -- **`ToyParser`** (per-compile): lexer, `cur`, `CfreeCg* cg`, builtin-type - cache, `vars`/`scopes`/`labels`/`goto_targets`, `cur_fn_ret*`, `diag`, - `input_name`, `file_id`, `has_error`, island/tail-call scalars, - `input_kind`, plus the **transaction state** (§3.2) and the **per-compile - symbol environment** (§3.3), and a borrowed pointer to the `ToyModule`. - -The ~200 helper signatures in `internal.h` keep taking `ToyParser*`; inside, -durable lookups/appends go through `p->module`. This is wide but mechanical. - -### 3.2 Transaction: journaled append + targeted undo - -Each durable table mutates **in place** during the parse (so lookups stay a -single reverse scan — no two-level lookup), guarded by a per-compile undo -journal owned by the `ToyParser`: - -- At compile start, capture a **watermark** (`nfns`, `nglobals`, `ntypes`, - `named.count`). 
-- **Appends** past the watermark need no per-record journaling; rollback - truncates each table back to its watermark and frees the per-element - allocations of the dropped tail (`ToyFn.params/toy_params`, - `ToyType.params`, `ToyNamedType.fields/enum_values`). -- **In-place mutation of a committed entry** (index < watermark) — only the - forward-decl-completion sites above — pushes a typed **undo record** that - snapshots the entry by value *before* the mutation. Rollback restores the - saved bytes and frees any sub-array the mutation allocated. -- `toy_type_from_cg`'s append-on-miss is covered by the `ntypes` watermark. - -Operations: - -- `toy_txn_begin(p)`: record watermarks, `compiler_defer(toy_txn_abort, p)`, - mark `txn_open`. -- `toy_txn_abort(p)` (idempotent on `!txn_open`): replay undo journal in - reverse, truncate tables to watermarks freeing dropped tails, free the - per-compile sym env, clear `txn_open`. Used both as the deferred - panic cleanup and the explicit abort. -- `toy_txn_commit(p)` (idempotent): the in-place appends are already durable, - so commit just `compiler_undefer`s the cleanup, discards the journal, and - clears `txn_open`. - -Because mutations are applied in place and the *rollback* is the deferred -cleanup, a `compiler_panic` longjmp out of the CG layer fires -`toy_txn_abort` automatically (`run_cleanups`), leaving `ToyModule` -pristine. Commit is a pointer-free disarm; abort frees only per-compile -memory. This avoids the copy-by-value parser hazard documented at -`compile.c:173-178`. - -> Alternative considered: isolated staging arrays with commit-appends (the -> literal phrasing in DBG_TODO). Rejected as the primary design because it -> forces two-level lookups and complicates `ToyTypeId` index stability for -> no behavioral gain; the journaled-in-place model gives identical observable -> semantics with simpler lookups. - -### 3.3 Per-compile symbol environment - -The `ToyModule` holds no `CfreeCgSym`. 
The `ToyParser` holds per-compile -parallel arrays `fn_syms[fn_index]` / `global_syms[global_index]` sized to the -module table length and grown in lockstep as new fns/globals are appended. - -- **Seed**: at compile start, for each committed fn/global, `cfree_cg_decl` - it as an external `SB_GLOBAL` into the current `CfreeCg` and store the - returned `CfreeCgSym` in the env (this replaces `toy_seed_repl_symbols` - writing into durable structs). -- **New decls**: `toy_parse_fn`/`toy_parse_global_var` append metadata to the - module *and* push the defining `CfreeCgSym` into the env. -- **Lookups**: `toy_find_fn`/`toy_find_global`/`toy_find_decl_sym` return a - small ref `{ const ToyFnDecl* decl; CfreeCgSym sym; }` (sym resolved from - the env by index). Emission sites (`expr.c`, `builtins.c`, `decls.c`, - `symbols.c` push/addr/call) use `ref.sym`. - -On rollback the env is per-compile and freed; the module never carried syms, -so there is nothing to undo there. - -### 3.4 Commit gating: publish-success (recommended) - -A snippet's durable commit is gated on the **whole** compile→link→publish -chain succeeding, not just compile, because publish can reject a clean -compile (duplicate global / undefined ref, `link_jit.c:968`). Otherwise Toy -would advertise a symbol the JIT does not have — breaking the next snippet's -seed/lookup. - -This requires the session/driver to drive the transaction explicitly. New -compile-session surface (breaking): - -```c -/* Batch one-shot (cc/as): compile and auto-commit on success. */ -CfreeStatus cfree_compile_session_compile(CfreeCompileSession*, - const CfreeSourceInput*, - CfreeObjBuilder** out); - -/* REPL: compile and leave the frontend transaction OPEN on success. 
*/ -CfreeStatus cfree_compile_session_stage(CfreeCompileSession*, - const CfreeSourceInput*, - CfreeObjBuilder** out); -void cfree_compile_session_commit(CfreeCompileSession*); /* idempotent */ -void cfree_compile_session_abort(CfreeCompileSession*); /* idempotent */ -``` - -`CfreeFrontendVTable` gains optional `commit`/`abort` hooks (NULL ⇒ no-op for -asm/c/wasm). The session routes `compile` = `stage` + auto -`commit`/`abort`; `stage` leaves the txn open on `CFREE_OK`. The driver -(`dbg_jit_compile_append_ex`) calls `stage`, then link+publish, then -`commit` on full success or `abort` on any failure. `commit`/`abort` are -idempotent (guarded by `txn_open`) so the panic-fired abort and an explicit -abort never double-run. - -> Simpler alternative: keep `cfree_compile_session_compile` committing at -> compile-success and add only `abort` for the driver to call on publish -> failure (undo of an already-committed txn). Rejected: undoing a committed -> txn means the journal must outlive commit, which is strictly more complex -> and reintroduces a partial-commit window. Publish-gating makes commit the -> trivial disarm and abort the only state-changer. - -### 3.5 Compile-session error contract - -`compile_frontend_state_into` currently `compiler_panic`s -("frontend failed for input", `src/api/compile.c:304`) whenever the frontend -returns non-OK — a synthetic panic for what is an ordinary diagnostic -failure (the only occurrence of that string in the repo; nothing asserts on -it). Change it so a frontend `CFREE_ERR` propagates as `CFREE_ERR` with **no** -synthetic panic and **no** `obj_finalize` on the failed builder. Genuine -internal failures still `compiler_panic` from inside `vtable->compile` (CG -layer) and are caught by `cfree_frontend_compile`'s `setjmp`, which runs -cleanups (firing `toy_txn_abort`) and returns `CFREE_ERR`. This cleanly -separates "diagnostics emitted, fail softly" from "invariant broken, unwind". 
- -### 3.6 Driver `$N` counter - -Decouple the internal thunk symbol name from the user-visible result number: - -- A monotonic **attempt** counter names the thunk (`__cfree_dbg_expr_<attempt>`) - — must be unique per attempt for lookup; never rolled back. -- A separate **result** counter (`$N`) advances only after `stage` + publish - + call all succeed. On any failure the result counter is untouched. - -This makes the failed `1 +` leave `$1` for the next valid input, matching the -golden. - -## 4. Public API changes (breaking) - -- `include/cfree/compile.h`: add `cfree_compile_session_stage`, - `cfree_compile_session_commit`, `cfree_compile_session_abort`; document - `cfree_compile_session_compile` as auto-commit. -- `CfreeFrontendVTable`: add `commit`/`abort` hooks (optional). -- `lang/toy/`: `ToyParser` split into `ToyModule` + per-compile `ToyParser`; - `ToyFn`/`ToyGlobal` lose their `sym` field (→ `ToyFnDecl`/`ToyGlobalDecl`); - `toy_find_fn`/`toy_find_global`/`toy_find_decl_sym` return refs carrying the - per-compile sym; `toy_seed_repl_symbols` becomes the env seeder; remove - `ToyFrontend.poisoned`. -- All in-tree callers of `cfree_compile_session_compile` (cc/as drivers) keep - working unchanged (auto-commit), so churn is limited to the dbg driver. - -## 5. Implementation phases (TDD, red→green) - -Each phase keeps the tree building and the full suite green except the one -target xfail being flipped. - -1. **Harness/contract first.** Confirm `toy-error-recovery` is red for the - right reason; assert the desired stderr (no "frontend failed" line). Keep - it xfail until phase 6. -2. **Session error contract** (§3.5). Drop the synthetic "frontend failed" - panic; `CFREE_ERR` propagates cleanly; skip `obj_finalize` on failure. - No behavior change yet for poisoned Toy (still latches) — verify suite - stays green. -3. **State split** (§3.1). Mechanical extraction of `ToyModule`; durable - helpers read `p->module`. No transaction yet; behavior identical. Green. 
-4. **Per-compile sym env** (§3.3). Remove `sym` from durable structs; seed - into the env; refs at lookups/emission. Behavior identical. Green. -5. **Transaction** (§3.2) + **vtable commit/abort + session stage/commit/abort** - (§3.4). Arm `toy_txn_begin`/`abort`/`commit`; remove `poisoned`. -6. **Driver wiring** (§3.4, §3.6). `dbg_jit_compile_append_ex` uses - `stage` + commit/abort; fix the `$N` counters. **Flip - `toy-error-recovery` to green.** -7. **New coverage** (§6). Add the transactional cases below; run red→green. -8. **Cleanup pass.** Remove dead code (`toy_add_fn`/`toy_add_global` - non-typed are never called), update `doc/DBG_TODO.md` checkboxes. - -## 6. Test plan - -Preserve (must stay green): `toy-persistent-repl`, `toy-empty-repl`, -`toy-expr-scalar`, `toy-expr-call`, `toy-expr-block`, `toy-repl-source-list`, -all debugger-control cases. - -Flip to green: `toy-error-recovery` (remove xfail). - -Remain xfail (out of scope — separate feature): `toy-structured-expr`. - -Add (new `test/dbg/cases`, each red first): - -- `toy-rollback-toplevel` — failed `jit { fn bad( }` then a good fn compiles - and is callable; the bad fn is absent. -- `toy-rollback-type` — failed snippet that declares a record then errors - leaves the type table clean; re-declaring the record later works. -- `toy-rollback-after-define` — define `f`, a failing snippet that references - `f` plus a syntax error still leaves `f` callable. -- `toy-redefine-function` — defining `twice` twice: publish rejects the - duplicate, and the Toy table is **not** left advertising the rejected - second definition (exercises publish-gated commit). Encodes the chosen - redefine semantics explicitly. -- A direct/unit check (where feasible) that a `compiler_panic` mid-parse - rolls back durable state (panic-path abort), per DBG_TODO robustness items. - -## 7. 
Risks & mitigations - -- **longjmp double-free** (`compile.c:173-178`): mutate the heap-resident - parser in place; never copy `ToyParser`/its arrays by value; rollback uses - watermarks + typed undo records, not pointer copies. -- **Pointer-into-array invalidation**: `toy_parser_reserve` reallocs; never - cache a `ToyFnDecl*`/`ToyNamedType*` across an append to the same table — - re-resolve by index, or pre-reserve before handing out pointers. -- **Forgot-to-undefer / double-abort**: `txn_open` flag makes commit/abort - idempotent; ensure every return path through `vtable->compile` leaves the - txn in a defined state (open on OK, the deferred cleanup covers panic). -- **`compiler_defer` OOM** returns NULL: treat as "rollback not armed" and - fail the compile rather than proceed unguarded. -- **Borrowed ObjBuilder**: do not free the appended `ObjBuilder` on publish - success; the image and debug view borrow it. -- **Seed resurrecting rolled-back globals**: rollback truncates the module - before the next compile's seed runs, so seed never sees dropped decls. - -## 8. Open decisions (for confirmation) - -1. **Commit gating / API shape** — publish-gated `stage`+`commit`/`abort` - (recommended, atomic end-to-end, fixes redefine inconsistency) vs - compile-gated + `abort`-only (smaller API, leaves a window). -2. **Redefine semantics** — with publish-gated commit, redefining a function - currently fails at publish (duplicate strong global). Options for a - follow-up: (a) keep failing with a clear message; (b) teach the REPL to - supersede a prior definition (hot-reload territory, `doc/HOT_RELOAD.md`). - This plan only guarantees the table stays consistent on the failure. diff --git a/doc/WASM.md b/doc/WASM.md @@ -1,1454 +1,288 @@ -# WebAssembly Input and Target Plan +# WebAssembly -This plan covers two related but different features: +WebAssembly support in cfree spans the whole tree but is organized around one +shared in-memory model, the `WasmModule`. 
A single binary/text/validation layer +under `src/wasm/` owns all format mechanics; three independent consumers sit on +top of it — a frontend that lowers Wasm into native code, a backend that emits +Wasm from cfree's codegen API, and an object backend that reads/writes `.wasm` +containers. This document describes that layering, the data flow through it, and +why the seams sit where they do. See [FRONTENDS.md](FRONTENDS.md), +[ARCH.md](ARCH.md), and [OBJ.md](OBJ.md) for the surfaces this hangs off. -- **Wasm as input**: accept WebAssembly modules as a frontend language and - lower them into cfree's normal codegen path for native or Wasm output. -- **Wasm as target**: compile C, toy, and later other frontends to - WebAssembly modules or linkable WebAssembly object files. +## The two directions -The implementation should share one Wasm binary/module layer. Do not build a -reader for the frontend and a second writer for the target. +cfree treats Wasm as both an input language and an output target, and the same +module model serves both: -Priority: implement **Wasm frontend to native object/JIT first**, then the -Wasm backend. Both directions remain part of the plan, but JITing a `.wasm` -module means validating Wasm input and lowering it through `CfreeCg` into the -existing native targets and native `LinkImage`/JIT path. The Wasm backend and -`WasmLinkImage` path are only needed when the selected target object format is -`CFREE_OBJ_WASM`, i.e. for producing final `.wasm` modules or relocatable Wasm -objects. +- **Wasm as input.** A `.wasm` or `.wat` file is a frontend source language + (`CFREE_LANG_WASM`). `lang/wasm` decodes/validates it and lowers it through the + public `CfreeCg` API onto a *native* target (aa64/x64/rv64), where it compiles + and JITs/links like any other frontend. The module's runtime state — memory, + globals, tables, imports — is reified as an explicit instance struct. 
-## Current Anchors +- **Wasm as target.** C or toy is compiled *to* a Wasm module via + `CFREE_ARCH_WASM` / `CFREE_OBJ_WASM`. The backend in `src/arch/wasm` is a + codegen target that records into IR and replays into a private Wasm emitter, + producing a tool-conventions-shaped `.wasm` file. -The public API already names the target and format: +Neither direction builds a private reader or writer: both go through the one +`src/wasm` layer. That non-duplication is the central design constraint. -- `CFREE_ARCH_WASM`, `CFREE_OBJ_WASM`, and `CFREE_OS_WASI` exist in - `include/cfree.h`. -- `driver/lib/target.c` parses `wasm32` and `wasm64` triples and selects - `CFREE_OBJ_WASM` for freestanding/WASI. -- `CFREE_CG_CC_WASM` exists in `include/cfree/cg.h`, and - `src/api/cg.c` reports it supported only for `CFREE_ARCH_WASM`. -- `emit_wasm` and `read_wasm` are declared but stubbed in - `src/api/stubs.c`. -- `src/obj/obj.h` already has `OBJ_EXT_WASM`, `SSEM_WASM_CUSTOM`, and a small - set of Wasm relocation kinds. -- `src/arch/arch.h` already expects a future Wasm backend to consume - structured CG scopes directly. - -The missing pieces are an `arch_impl_wasm`, a real Wasm object/module -reader/writer, a Wasm static linker, and a frontend that can validate/lower a -Wasm module. - -## Codebase Fit Findings - -The plan fits the current architecture, but a few seams must be made explicit -before implementation: - -- [done] `CFREE_ARCH_WASM` is a public enum value, but there is no registered - `ArchImpl` in `src/arch/registry.c`. `compiler_init` constructs - `TargetABI` immediately through `abi_new`, so a usable Wasm compiler target - needs `arch_impl_wasm.abi_vtable` before C/toy-to-Wasm compile tests can - safely create a `CfreeCompiler`. - → `arch_impl_wasm` + `wasm32_vtable` live in `src/arch/wasm/{arch.c,abi.c}` - and are registered in `src/arch/registry.c`. -- [done] `ObjBuilder` has `OBJ_EXT_WASM` as a tag, but no payload storage API. 
- → `obj_ext_set/get/clear` (`src/obj/obj.{h,c}`); `ObjBuilder` owns the - payload's lifetime and releases via the registered free fn at `obj_free`. -- [done, single-TU] The public compile path is already relocatable-object-first: - `cfree_compile_obj*` builds an `ObjBuilder`, finalizes it, then dispatches - `emit_wasm` from `emit_object_bytes` when `target.obj == CFREE_OBJ_WASM`. - → `src/obj/wasm_emit.c` reads the `WasmModule` attached under - `OBJ_EXT_WASM` and flushes via `wasm_encode`. v1 emits a final module - shape (no relocations needed for a single TU); the relocatable-object - shape with `linking`/`reloc.*` custom sections is still pending. -- [pending] The public link path currently always builds a `Linker`, calls - `link_resolve`, then emits a native `LinkImage`. A Wasm final module should - not use `LinkImage` segment layout. The least invasive fit is a - `target.obj == CFREE_OBJ_WASM` branch in `cfree_link_exe` after - `build_linker` and before `link_resolve`, calling a Wasm-specific resolver - and emitter. This branch is not used for Wasm input JIT: Wasm input lowered - to a native target should continue through the existing native `LinkImage` - path. -- [done] Driver source classification only recognizes `.c`, `.toy`, and `.s` - today, and `cfree_language_for_path` only infers those languages. Adding - `CFREE_LANG_WASM` also requires driver classification for `.wasm`/`.wat`, - frontend registration in `driver_compiler_new`/`driver_pipeline_new`, and - updates to dbg language switches that enumerate `CFREE_LANG_*`. - → Resolved by the earlier Wasm-input frontend work. -- [pending] `CfreeTarget` has no feature field. Wasm feature policy should - therefore live in internal Wasm decode/validation/link options for v1, not - in the public target struct. A later public feature API can be added once - the target has real users. 
- -## References - -Primary references checked on 2026-05-18: - -- WebAssembly Core Specification 3.0: - <https://www.w3.org/TR/wasm-core/>. Binary modules are sectioned, most - sections are optional, custom sections are semantically ignored, and code - uses a structured stack-machine instruction model. -- WebAssembly tool-conventions `Linking.md`: - <https://github.com/WebAssembly/tool-conventions/blob/main/Linking.md>. - Relocatable Wasm object files are valid Wasm binaries distinguished by a - required `linking` custom section, plus `reloc.*` custom sections and - target-feature metadata. -- WebAssembly tool-conventions `BasicCABI.md`: - <https://github.com/WebAssembly/tool-conventions/blob/main/BasicCABI.md>. - The current basic C ABI uses wasm32 ILP32, maps C scalar types to Wasm value - types, uses a linear memory stack through a mutable stack-pointer global, - and passes most aggregates indirectly. - -Treat the core spec as normative. Treat tool-conventions as an interop target, -not as the core Wasm standard. - -## Design Principles - -- Keep Wasm state in context structs. No process-global decode tables, - current-module pointers, or singleton runtime state. -- Keep Wasm core parsing independent from cfree object and frontend code. - A module parser should not know whether the caller is `read_wasm`, the Wasm - frontend, or a validation test. -- Make unsupported features explicit. The frontend, object reader, and target - backend should diagnose unsupported Wasm features rather than silently - dropping sections, imports, memories, tables, or target features. -- Start with wasm32. `wasm64` triples can continue to parse, but memory64 and - 64-bit tool-convention relocations should stay behind explicit capability - checks until the object/linker story is stable. 
-- Preserve boundaries: `lang/wasm` uses public frontend/CG APIs where possible; - `src/wasm` owns binary/core Wasm mechanics; `src/obj` owns object-format - adaptation; `src/arch/wasm` owns target lowering. - -## Proposed Layout - -``` -src/wasm/ - wasm.h shared core structs, feature flags, error codes - decode.c binary reader: LEB128, sections, indices, expressions - wat.c text reader for the accepted feature subset - validate.c type stack, control stack, module validation - encode.c module/object section writer helpers - names.c name/import/export helpers - object.c linking custom section and reloc custom section helpers - -src/obj/ - wasm_read.c read_wasm: Wasm object file -> ObjBuilder plus metadata - wasm_emit.c emit_wasm: ObjBuilder plus metadata -> Wasm object/module - -src/arch/wasm/ - arch.c ArchImpl registration, target ABI selection - target.c CGTarget implementation emitting Wasm function bodies - abi.c BasicCABI classification - -lang/wasm/ - wasm_frontend.c CfreeCompileFn for CFREE_LANG_WASM - lower.c validated Wasm function -> CfreeCg - runtime_abi.h generated-module instance/runtime contract - -test/wasm/ - format/ parser/encoder/validation fixtures - front/ Wasm input -> native object/JIT behavior fixtures - obj/ object roundtrip and relocation fixtures - target/ C/toy -> Wasm module fixtures -``` - -Add `CFREE_LANG_WASM` and teach suffix inference about `.wasm` and `.wat`. -Binary Wasm remains the canonical object/input format, but v1 should include a -native `.wat` parser for the accepted feature subset. The text parser is a -developer and testability feature: it should lower into the same `WasmModule` -model as the binary reader and share validation, feature gating, and -diagnostics. Do not implement text-only semantics or a separate frontend path. 
- -## API and Data Structure Sketch - -### Public Surface - -Keep the first public API change small: - -```c -typedef enum CfreeLanguage { - CFREE_LANG_C = 0, - CFREE_LANG_ASM = 1, - CFREE_LANG_TOY = 2, - CFREE_LANG_WASM = 3, - CFREE_LANG_COUNT = 4, -} CfreeLanguage; -``` - -Update `cfree_language_for_path` and driver classification so `.wasm` and -`.wat` are source inputs only when compiling through the Wasm frontend. Object -inputs should remain classified by `.o`/`.obj`/archive suffixes and by -`cfree_detect_fmt` in linker/object-reader paths. - -Do not add public Wasm feature knobs in phase 0/1. Internally, construct the -accepted feature set from the target and the caller: - -```c -typedef enum WasmFeatureSet { - WASM_FEATURE_THREADS = 1u << 0, - WASM_FEATURE_TYPED_FUNC_REFS = 1u << 1, - WASM_FEATURE_TAIL_CALLS = 1u << 2, - WASM_FEATURE_MULTI_MEMORY = 1u << 3, - WASM_FEATURE_MEMORY64 = 1u << 4, - WASM_FEATURE_BULK_MEMORY = 1u << 5, - WASM_FEATURE_NONTRAPPING_FTOI = 1u << 6, -} WasmFeatureSet; -``` - -`BULK_MEMORY` and `NONTRAPPING_FTOI` share the `0xfc` opcode prefix; they -are decoded together but feature-gated independently so a module may opt -into one without the other. - -If external embedders need to instantiate Wasm-input modules later, add that -public runtime API with the frontend milestone. Until then, keep -`CfreeWasmInstance`, `CfreeWasmMemory`, and `CfreeWasmTable` as runtime ABI -types under `lang/wasm` or `rt/`, not as prematurely stable libcfree API. 
- -### Internal Wasm Core - -The `src/wasm` layer should be usable by `read_wasm`, `emit_wasm`, validation -tests, and `lang/wasm` without depending on any one caller's pipeline shape: - -```c -typedef struct WasmBytes { - const uint8_t *data; - size_t len; -} WasmBytes; - -typedef enum WasmValType { - WASM_VAL_I32, - WASM_VAL_I64, - WASM_VAL_F32, - WASM_VAL_F64, - WASM_VAL_FUNCREF, - WASM_VAL_EXTERNREF, -} WasmValType; - -typedef struct WasmFuncType { - WasmValType *params; - uint32_t nparams; - WasmValType *results; - uint32_t nresults; -} WasmFuncType; - -typedef struct WasmFunc { - uint32_t typeidx; - WasmBytes raw_body; - uint32_t code_section_offset; - /* Filled only when validation/lowering asks for decoded instructions. */ - struct WasmInsn *insns; - uint32_t ninsns; -} WasmFunc; - -typedef struct WasmModule { - WasmFuncType *types; - uint32_t ntypes; - struct WasmImport *imports; - uint32_t nimports; - WasmFunc *funcs; - uint32_t nfuncs; - struct WasmTable *tables; - uint32_t ntables; - struct WasmMemory *memories; - uint32_t nmemories; - struct WasmGlobal *globals; - uint32_t nglobals; - struct WasmExport *exports; - uint32_t nexports; - struct WasmElemSegment *elems; - uint32_t nelems; - struct WasmDataSegment *data; - uint32_t ndata; - struct WasmCustomSection *customs; - uint32_t ncustoms; -} WasmModule; -``` - -Decode, validation, and encode contexts own all scratch state. 
The module -storage should come from a caller-owned arena so no decode result depends on -hidden globals: - -```c -typedef struct WasmDecodeOptions { - WasmFeatureSet features; - uint8_t preserve_custom_sections; - uint8_t allow_object_metadata; -} WasmDecodeOptions; - -typedef struct WasmDiag { - void (*error)(void *user, uint32_t offset, const char *fmt, va_list ap); - void *user; -} WasmDiag; - -typedef struct WasmDecodeCtx { - Heap *heap; - Arena *arena; /* module lifetime */ - Arena *scratch; /* transient decode stacks */ - WasmDiag diag; /* adapter-owned; may bridge to Compiler diagnostics */ - WasmDecodeOptions opts; -} WasmDecodeCtx; - -typedef struct WasmEncodeCtx { - Heap *heap; - Arena *scratch; - WasmDiag diag; - WasmFeatureSet features; -} WasmEncodeCtx; - -int wasm_decode_module(WasmDecodeCtx *, const uint8_t *data, size_t len, - WasmModule *out); -int wasm_validate_module(WasmDecodeCtx *, const WasmModule *); -int wasm_encode_module(WasmEncodeCtx *, const WasmModule *, Writer *); -``` - -The core parser should return structured failures to direct tests. The -libcfree adapters may translate those failures into `compiler_panic` or -diagnostics at the API boundary. - -### ObjBuilder Extension Payload - -Add a tiny internal extension API to `src/obj/obj.h` before implementing -`WasmObjMeta`: - -```c -typedef void (*ObjExtFreeFn)(Compiler *, void *); - -void obj_ext_set(ObjBuilder *, ObjExtKind, void *payload, ObjExtFreeFn); -void *obj_ext_get(const ObjBuilder *, ObjExtKind); -void obj_ext_clear(ObjBuilder *, ObjExtKind); -``` - -`ObjBuilder` should own extension payload lifetime and release it from -`obj_free`. One payload per `ObjExtKind` is enough for the current formats. -This keeps the generic object tables format-neutral while giving Wasm -read/write/link code a real place to hang module-level metadata. 
- -Wasm object metadata should be compact and typed: - -```c -typedef struct WasmObjReloc { - uint32_t reloc_section_index; /* reloc.CODE, reloc.DATA, etc. */ - uint32_t target_section_index; - uint32_t offset; - RelocKind kind; - ObjSymId sym; - int64_t addend; -} WasmObjReloc; - -typedef struct WasmObjSymbol { - ObjSymId obj_sym; - uint32_t flags; - uint32_t kind; /* tool-conventions symbol kind */ - uint32_t index; /* function/global/table/data/event index */ -} WasmObjSymbol; - -typedef struct WasmObjMeta { - WasmModule *module; - WasmObjSymbol *symbols; - uint32_t nsymbols; - WasmObjReloc *relocs; - uint32_t nrelocs; - struct WasmObjDataSegmentInfo *data_segments; - uint32_t ndata_segments; - struct WasmTargetFeature *target_features; - uint32_t ntarget_features; - struct WasmInitFunc *init_funcs; - uint32_t ninit_funcs; -} WasmObjMeta; ``` - -`read_wasm` should fill normal `ObjBuilder` symbols so archives and generic -symbol inspection still work. The Wasm linker and `emit_wasm` should use -`WasmObjMeta` for index spaces, custom sections, function bodies, data segment -metadata, and padded-LEB relocation details that do not fit a native section -layout model. - -### Wasm Target API - -Register `arch_impl_wasm` in `src/arch/registry.c` with: - -- `abi_vtable`: wasm32 BasicCABI first; reject wasm64 until memory64 and - object relocation support are real. -- `cgtarget_new`: returns the Wasm `CGTarget`. -- `asm_new`, `disasm_new`, `elf`, `macho`, and native register hooks: NULL for - v1. -- `link`: NULL; Wasm static linking is format-level module merging, not a - native `LinkArchDesc`. - -The Wasm `CGTarget` still implements the full vtable in `src/arch/arch.h`, -but its implementation maps operands to Wasm value-stack entries, locals, and -linear-memory addresses instead of hard registers. Unsupported hooks must -diagnose deliberately: inline asm, native TLS models, SIMD, and -irreducible control flow should fail before emitting malformed Wasm. 
- -### Wasm Link API - -Add a Wasm-specific link product beside `LinkImage`: - -```c -typedef struct WasmLinkImage WasmLinkImage; - -WasmLinkImage *wasm_link_resolve(Linker *); -void wasm_link_emit(WasmLinkImage *, Writer *); -void wasm_link_image_free(WasmLinkImage *); + src/wasm/ (format mechanics + WasmModule model) + decode.c encode.c wat.c validate.c insn.c module.c + | + boundary header: src/wasm/wasm.h + / | \ + lang/wasm/cg.c src/arch/wasm/* src/obj/wasm/{emit,read}.c + (Wasm -> native CG) (CG -> Wasm) (.wasm container <-> ObjBuilder) ``` -`cfree_link_exe` should dispatch to this path when -`compiler->target.obj == CFREE_OBJ_WASM` after `build_linker` and before -`link_resolve`. That lets existing public inputs, archive loading, entry -selection, and diagnostics stay shared while avoiding native segment layout. - -## Shared Wasm Module Model - -Introduce a compact in-memory model that can represent both executable modules -and relocatable object files: - -- Types: function signatures over Wasm value types. -- Imports and exports: module/name pairs, external kind, type index or limits. -- Functions: type index, locals, raw body bytes initially, decoded instruction - stream after validation when needed. -- Tables, memories, globals, tags, elements, and data segments. -- Custom sections: preserve unknown custom sections by name and bytes. -- Object metadata: symbol table, init functions, data segment names, - `linking` custom section payload, `reloc.*` entries, and target features. - -The reader should decode raw sections first, then validate into typed arrays. -This keeps malformed-input diagnostics precise and lets object roundtrip tests -preserve unknown custom sections. - -cfree owns validation for every Wasm feature it accepts. External validators -may be used in optional comparison tests, but they are not the semantic gate for -input safety or output correctness. 
The internal validator should cover section -ordering, index spaces, type stack, control stack, block result types, -memory/table/global references, limits, data/element segment consistency, and -object metadata invariants needed by the linker. - -Feature policy should be centralized in a `WasmFeatureSet` from the beginning. -Decode may preserve unknown custom sections, but validation, lowering, and -linking must reject instructions or section semantics that are not enabled by -the active feature set. Initialize the v1 feature set conservatively and add -features by extending the table rather than scattering ad hoc checks. - -Represent object-level Wasm metadata with a hybrid model: - -- Generic sections, symbols, groups, and relocations stay in `ObjBuilder`. -- Wasm-only object metadata lives in a typed extension payload, for example - `WasmObjMeta`, associated with the builder under `OBJ_EXT_WASM`. -- Raw module parse/validation structures stay typed by `src/wasm`. When the - writer or linker still needs the module graph, the extension payload may own - a `WasmModule*`; the generic object core should not inline or interpret it. -- Generic object consumers may ignore the extension payload. `read_wasm`, - `emit_wasm`, and the Wasm linker are responsible for understanding it. - -This avoids turning the format-neutral object core into a Wasm-specific module -model while still preserving the metadata needed for archives, object -roundtrip, and linking. - -Do not use VLAs for section-local temporary arrays. All variable-size decode -storage should hang off `WasmDecodeCtx` or an arena owned by the caller. - -## End-to-End Data Flow - -### Wasm Input to Native Object/JIT - -This is the first implementation priority. - -1. `.wasm`/`.wat` source input is tagged `CFREE_LANG_WASM`. -2. The compiler target is a normal native target such as aa64, x64, or rv64, - not `CFREE_ARCH_WASM`. -3. `lang/wasm` decodes, validates, and rejects relocatable-object metadata. -4. 
Lowering creates explicit instance-state types and emits through `CfreeCg`. -5. Native backends compile the generated functions like any other frontend, - with imported functions and memories represented through the instance - context rather than hidden globals. -6. `cfree_compile_obj` produces a native `ObjBuilder`, and `cfree_link_jit` - uses the existing native `LinkImage` path. - -### C/Toy to Wasm Object - -This is the Wasm backend path and comes after the frontend-to-native path. - -1. Driver or host constructs `CfreeCompiler` with - `{CFREE_ARCH_WASM, CFREE_OS_FREESTANDING, CFREE_OBJ_WASM, ptr_size=4}`. -2. `compiler_init` selects the Wasm ABI vtable through `arch_impl_wasm`. -3. C or toy frontend emits through the existing public `CfreeCg` API. -4. `cgtarget_new` returns the Wasm target. It records function types, locals, - code bodies, data segments, exports, and relocations into `WasmObjMeta` - while maintaining enough generic `ObjBuilder` symbols/sections for - inspection and archives. -5. `obj_finalize` closes the generic builder. -6. `emit_wasm` reads `WasmObjMeta` and writes a tool-conventions relocatable - Wasm object. - -### Wasm Object to Final Module - -1. Linker byte inputs use `cfree_detect_fmt`; Wasm bytes dispatch to - `read_wasm`. -2. `read_wasm` decodes and validates the module/object metadata, fills generic - `ObjBuilder` symbols for archive demand loading, and attaches `WasmObjMeta`. -3. `cfree_link_exe` sees `CFREE_OBJ_WASM` and calls `wasm_link_resolve` - instead of native `link_resolve`. -4. The Wasm linker merges module index spaces, resolves symbols, applies - padded-LEB relocations, synthesizes memory/table/stack/ctor/export policy, - and returns `WasmLinkImage`. -5. `wasm_link_emit` writes the final `.wasm` module. - -## Wasm as Target - -### Target Contract - -Add `arch_impl_wasm` and a Wasm `CGTarget`. - -The Wasm target is not register-allocated in the native sense. 
It should map -CG values to Wasm stack values and Wasm locals, and it should use structured -scope callbacks for `block`, `loop`, `if`, `else`, and branch depths. The -backend still needs a complete `CGTarget` surface because the public CG API -and existing frontends call through that contract. - -Use a hybrid control-flow strategy: - -- Prefer structured CG scopes when frontends emit them. -- Include a limited CFG-to-structured-Wasm pass for reducible label/jump - patterns so C code does not have to be perfectly structured before the Wasm - target can work. -- Diagnose irreducible control flow, computed goto, and unsupported switch - lowering explicitly in v1 rather than emitting incorrect Wasm. - -This keeps toy and Wasm-input lowering on the clean structured path while -leaving room for ordinary C control flow that reaches the backend as labels. - -Initial target tuple: - -- `wasm32-unknown-unknown` or `wasm32-none` for freestanding modules. -- `wasm32-wasi` is recognized by triple parsing but unsupported in v1. It - should diagnose clearly until WASI imports, startup, argv/env, and libc - policy are specified. -- no threads, SIMD, exceptions, GC/reference types beyond `funcref`, or - multi-memory in v1. - -### ABI - -Implement BasicCABI for wasm32 first: - -- ILP32: `int`, `long`, and pointers are 32-bit. -- Scalar params/results map to `i32`, `i64`, `f32`, and `f64`. -- Aggregates lower according to BasicCABI: empty ignored, singleton scalar - direct, other structs/unions indirect. Arrays are indirect. -- Use a mutable `__stack_pointer` global and a downward-growing linear stack - for address-taken locals, spills that need memory, and aggregate temporaries. -- Defer `long double`, `i128`, broad compiler-rt helper coverage, and full - libc/runtime conventions. Diagnose those cases explicitly in the first - target milestone. - -This makes v1 larger than scalar-only smoke tests but smaller than a full C -runtime target. 
The initial ABI boundary is scalar plus indirect aggregates. -Varargs are caller-packed into a linear-memory buffer of uniform 8-byte -slots; the callee receives a hidden trailing `i32` pointer to the buffer -and walks it through the existing `va_*` CG hooks. Atomics are lowered via -the wasm-threads opcodes (load/store/RMW/cmpxchg/fence; see Wasm Target -Backend below). - -The ABI classifier should consume neutral CG types, not C `Type*`. - -### Module/Object Emission - -There are two output modes: - -1. **Relocatable Wasm object**: emit a tool-conventions object with a - `linking` custom section, `reloc.*` custom sections, symbol metadata, data - segment metadata, and target-feature metadata. This is required for `cfree - cc -c`, archives, and cfree's linker. -2. **Final module**: the linker merges relocatable Wasm objects into a valid - `.wasm` module with type/import/function/table/memory/global/export/code/ - data sections. - -Do not add a public single-TU final-module compile shortcut before the object -path exists. cfree's compile contract is already relocatable-object-first; Wasm -should fit that contract instead of growing a parallel target path. Low-level -encoder tests may still build tiny final modules directly as fixtures. - -### Linker - -Add a Wasm static link path separate from ELF/Mach-O image layout: - -- Merge type, import, function, table, global, element, data, and custom - sections. -- Renumber functions, globals, tables, memories, data segments, and types. -- Resolve undefined function/data/global/table symbols. -- Apply Wasm relocations without disassembling the code section. Relocatable - objects must use padded LEB encodings where relocations rewrite immediates. -- Merge compatible target-feature sections and diagnose incompatible feature - sets. -- Synthesize `__wasm_call_ctors`, stack/memory symbols, and exports according - to the selected output mode. - -Do not route Wasm final output through `LinkImage` segment layout. 
A Wasm -module is not a virtual-addressed native image. - -### External runtime compatibility - -cfree-produced Wasm modules are intended to run on any standard Wasm -runtime (wasmtime, wasmer, the browser engines, Node) with no host-side -adaptation beyond providing whatever imports the module declares. The -backend follows a small set of conventions to make that work: - -- **Linear memory is exported under the conventional name `"memory"`.** - Browsers, wasmtime, wasmer, and Node all expect this name when they - reach into module memory from the host. The export is emitted next to - the function exports in `src/arch/wasm/emit.c`. -- **Functions are exported by user name.** No synthesized `_start`, - `_main`, or wrapper. A C `int main(void)` becomes a wasm function - named `main`; a toy program's `test_main` becomes a wasm function - named `test_main`. Hosts pick the entry by name (e.g. - `wasmtime run --invoke main module.wasm`). -- **`extern` C declarations become `(import "env" "<sym>" ...)`** - declarations in the produced module. Default policy matches - wasm-clang: module `"env"`, field `"<C symbol name>"`. The C frontend - accepts `__attribute__((import_module("M"), import_name("F")))` on - the `extern` declaration to override either name; the override flows - through to the produced import entry without further runtime - involvement. -- **No cfree-internal symbols in the produced module.** The internal - cfree-instance ABI (`CfreeWasmInstance*`, `__cfree_wasm_init`, trap - helpers) is a *frontend lowering* implementation detail and does not - leak into modules produced by the `wasm32-none` target. A host - loading a cfree-produced module sees only standard imports, exported - functions, and exported memory. - -Together with the bulk-memory changes above, this closes the gap to a -loadable-on-standard-hosts story. 
Round-trip the cfree-produced module -through cfree's own host-import binder (see "Host import resolver" -under "Wasm as Frontend") and the same module also runs under cfree's -runtime. - -## Wasm as Frontend - -### Frontend Input Semantics - -Add `CFREE_LANG_WASM` for `.wasm` modules. This is a binary frontend, not an -object reader: - -- `read_wasm` is for link-time object inputs. -- `lang/wasm` is for compiling a Wasm module's semantics to the selected - target. - -The frontend should accept executable/core modules first, not relocatable Wasm -objects. If an input contains a `linking` custom section, diagnose that it is a -linkable object and should be supplied as an object input once `read_wasm` -exists. - -### Native Lowering Model - -Compiling Wasm input to a native target is a module-to-native translation, not -a C ABI translation of individual functions. A Wasm module instance has state: -memories, tables, mutable globals, imports, and traps. Represent that state -with generated or runtime-provided context structs. - -Always use the whole-module instance model at the semantic boundary. Even a -module whose exports look like standalone scalar functions is lowered with an -explicit instance context; if the context is unused, later optimization may -remove it. Do not add a standalone-export fast path with a second ABI. - -Suggested v1 instance contract: - -```c -typedef struct CfreeWasmInstance CfreeWasmInstance; -typedef struct CfreeWasmMemory CfreeWasmMemory; -typedef struct CfreeWasmTable CfreeWasmTable; -``` - -Generated internal functions receive a hidden `CfreeWasmInstance*` parameter. -Imported functions are indirect calls through import slots in the instance. -Memory accesses load the active memory base/size from the instance and perform -bounds checks before native load/store. Traps call runtime helpers that do not -return. - -Exported Wasm functions should be exposed in two layers: - -- internal ABI: `export_name(CfreeWasmInstance*, wasm params...)`. 
This is the - semantic ABI for every lowered Wasm-defined function and keeps module state - explicit. -- embedder C wrapper ABI: host-callable thunks that preserve the explicit - instance parameter while using C-friendly scalar types and symbol names for - exports that can be represented directly. - -Start with the internal ABI, but treat embedders as a first-class use case: -design and implement the C wrapper layer as an explicit follow-up task, not as -an optional later convenience. Do not generate wrappers that hide or globalize -the instance; the instance parameter remains part of the C-facing contract. - -### Instruction Coverage - -Frontend v1 should target the MVP numeric/control/memory subset first: - -- numeric constants, integer and FP arithmetic/comparison/conversion, - `select`, `drop`, `unreachable`. -- `local.get/set/tee`, `global.get/set`. -- `block`, `loop`, `if`, `else`, `br`, `br_if`, `br_table`, `return`. -- direct `call`; `call_indirect` after table representation exists. -- memory load/store, `memory.size`, `memory.grow`, and active data segments - for memory index 0. - -Then extend the same frontend path for these proposals, behind explicit -`WasmFeatureSet` bits and targeted fixtures: - -- threads: shared memory limits, atomic loads/stores/RMW/cmpxchg, - `memory.atomic.wait*`, `memory.atomic.notify`, and `atomic.fence`. -- typed function references: typed `funcref` tables, `ref.null`, `ref.func`, - `ref.is_null`, `call_ref`, and validation of reference subtyping needed by - calls and table operations. -- tail calls: `return_call`, `return_call_indirect`, and typed-reference tail - calls when typed function refs are enabled. -- multi-memory: memory imports/definitions beyond index 0, memory-indexed - memory ops, and per-memory data initialization. -- memory64: 64-bit memory limits, i64 memory indices, i64 `memory.size` and - `memory.grow`, and 64-bit bounds checking in the instance runtime. 
- -Still defer or reject: - -- reference types beyond the typed-function-reference subset. -- exceptions/tags, GC, SIMD, and component model. - -Validation should run before lowering. A malformed module should never reach -CG emission. - -### Frontend Proposal Support - -Proposal support belongs in the Wasm frontend, validator, native lowering, and -runtime instance model before it belongs in the Wasm target backend. A module -that uses a proposal must be decoded into the shared `WasmModule`, validated -under an enabled `WasmFeatureSet`, and either lowered semantically to native CG -or rejected with a diagnostic naming the missing feature. - -Do not add proposal-specific global state. Threads and shared memories still -hang off `CfreeWasmInstance`; wait/notify and blocking behavior go through -runtime hooks owned by the instance or embedder. Multi-memory and memory64 -should replace single-memory helper assumptions with indexed memory helpers -rather than adding parallel special cases. - -Lower tail-call instructions through the existing CG tail-call surface: -`return_call` and `return_call_indirect` are required-tail operations and should -use `CFREE_CG_TAIL_MUST`. If the selected native target cannot guarantee the -tail-call shape, diagnose before emission instead of silently compiling them as -ordinary calls. - -Typed function references should share the existing table and `call_indirect` -runtime checks where possible, but the type identity must come from Wasm -function types, not C wrapper signatures. `call_ref` must trap on null refs and -must validate the referenced function type before lowering the call. - -Memory64 is a frontend/runtime property, not permission to use native pointer -width as the Wasm address type. The lowered address calculation remains i64, -then the runtime checks that the current host allocation can represent the -effective range before converting to a host pointer. 
- -### Host import resolver - -When the frontend consumes a Wasm module that declares imports, the -embedder configures host-side bindings through a small public API on -`include/cfree.h`: - -```c -typedef struct CfreeWasmInstance CfreeWasmInstance; - -typedef struct CfreeWasmImportType { - const CfreeWasmValType *params; - uint32_t nparams; - const CfreeWasmValType *results; - uint32_t nresults; -} CfreeWasmImportType; - -/* Host function pointer. Called with the cfree-instance ABI: - * void (*)(CfreeWasmInstance*, [wasm params...]) -> result - * The explicit instance parameter is a feature, not a quirk: imports - * almost always want to read or write the module's linear memory, and - * that lives in the instance. */ -typedef struct CfreeWasmHostImport { - const char *module; - const char *field; - void *func; -} CfreeWasmHostImport; - -/* Optional dynamic resolver. Returns a function pointer with the - * cfree-instance ABI for the requested import, or NULL to leave it - * unbound (the call site traps on invocation). */ -typedef void *(*CfreeWasmResolveFn)(void *user, const char *module, - const char *field, - const CfreeWasmImportType *type); - -/* Configure host-side imports for the next `cfree_link_jit` / - * `cfree run` invocation. The static table is tried first; the - * resolver fills any misses. Either may be NULL. */ -void cfree_wasm_set_host_imports(CfreeCompiler *, - const CfreeWasmHostImport *imports, - size_t nimports, - CfreeWasmResolveFn resolve, void *user); -``` - -The wiring path on the runtime side is straightforward: - -- Codegen emits two side metadata symbols per lowered module: - `__cfree_wasm_imports` (a const array of `{const char* module; const - char* field; uint32_t typeidx; uint32_t slot_offset;}`) and - `__cfree_wasm_nimports` (a const u32). AOT and JIT outputs get the - same metadata. 
-- A runtime helper `cfree_wasm_bind_host_imports(instance, imports, - nimports, resolve, user)` walks `__cfree_wasm_imports`, looks up - each entry by `(module, field)` in the static table or via the - resolver, **validates the (params, results) shape against the - module's `WasmFuncType` table** (also emitted as static data), and - writes the function pointer into - `*(void**)((char*)instance + slot_offset)`. -- `driver/cmd/run.c`, `test/link/harness/jit_runner.c`, and - `test/wasm/harness/start_wasm.c` call the binder **before** - `__cfree_wasm_init`. With no imports configured the binder is a - no-op, so existing modules with empty import lists continue to work - unchanged. - -Signature validation produces a clean diagnostic naming the -`(module, field)` pair when the bound host function's declared type -disagrees with the module's import type. Unbound imports do not fail -at bind time — they fail with a runtime trap on the first call, -exactly like the existing trap-on-null-slot path. - -## Phasing - -### Phase 0: Format Spine — done - -Add `src/wasm` with binary decode helpers, section scanning, LEB128 encode/ -decode, minimal module structs, the `ObjBuilder` extension-payload hook, and -tests for tiny hand-authored modules. - -Acceptance: - -- [x] `cfree_detect_fmt` remains able to identify Wasm. -- [x] parser rejects malformed section lengths and bad ordering. -- [x] encoder can roundtrip a minimal module with type/function/export/code. -- [x] `obj_ext_set/get/clear` can attach and release a dummy `OBJ_EXT_WASM` - payload without affecting existing ELF/Mach-O object tests. - -Note: the shared core lives under `lang/wasm/` today (`wasm_internal.h`, -`module.c`, `decode.c`, `encode.c`, `validate.c`, `insn.c`). The -`src/arch/wasm` and `src/obj/wasm_emit.c` TUs reach it through `-Ilang/wasm` -on the build line. Moving it under `src/wasm/` per the layout in the -"Proposed Layout" section remains a planned cleanup. - -Targeted test: `make test-wasm-format`. 
- -### Phase 1: Wasm Frontend to Native Object/JIT — done - -Add `CFREE_LANG_WASM`, `.wasm`/`.wat` driver classification as source input, -module validation, and native lowering through `CfreeCg`. - -Acceptance: - -- [x] simple Wasm modules compile to native objects for aa64/x64/rv64 targets. -- [x] simple Wasm modules JIT and execute through the existing native - `cfree_link_jit`/`LinkImage` path. -- [x] bounds checks and traps are deterministic. -- [x] imported scalar functions can be resolved through an explicit instance - import table. -- [x] relocatable Wasm objects with a `linking` custom section are rejected as - frontend input with a clear diagnostic. -- [partial] Host imports are bound by name through the public - `cfree_wasm_set_host_imports` API (subagent C). The binder reads the - per-module `__cfree_wasm_imports` / `__cfree_wasm_nimports` metadata - symbols, looks up each `(module, field)` pair in the configured - static table or via the optional resolver, validates the signature - against the module's `WasmFuncType` table, and writes the function - pointer into the instance import slot. Unbound imports trap on first - call. -- [partial] Bulk memory and non-trapping float-to-int conversion are - parsed, validated, and lowered through the existing - `copy_bytes`/`set_bytes` CG ops with bounds-check prologues. - Passive-data and passive-elem tables hang off `CfreeWasmInstance`; - `data.drop`/`elem.drop` zero the length field so subsequent - `memory.init`/`table.init` traps cleanly. - -Targeted test: `make test-wasm-front` (291 fixtures, all green). - -### Phase 2: Minimal Wasm Final-Module Target — partial - -Add `src/arch/wasm` and enough `emit_wasm` support for `cfree cc -target -wasm32-none -c` to produce a valid Wasm module. Use toy first where practical -because it already exercises the public CG API without C parser complexity. 
- -This phase must include `arch_impl_wasm` registration and a wasm32 ABI vtable; -otherwise creating a Wasm-target `CfreeCompiler` reaches `abi_init` before the -backend exists. - -What's landed (v1 slice, single-TU final modules): - -- [x] `arch_impl_wasm` + wasm32 BasicCABI vtable for scalar params/results - and aggregate-by-indirect classification (`src/arch/wasm/{arch,abi}.c`). -- [x] Wasm `CGTarget` (`src/arch/wasm/{target,emit}.c`): records a per-function - WIR list, then linearizes to a `WasmFunc` body — locals, type entries, - exports, direct calls, scalar arithmetic, structured `block`/`loop`/`if` - via `SCOPE_BLOCK`/`SCOPE_LOOP`, `br`/`br_if` to scope-registered labels, - `return`, basic conversions. -- [x] `emit_wasm` reads the `WasmModule` attached under `OBJ_EXT_WASM` and - flushes via the existing `wasm_encode`. -- [x] Structured `if`/`if-else` lowering via `cfree_cg_block_begin` plus - rewritten `cfree_cg_if_begin/else/end` (nested SCOPE_BLOCKs with `break`), - removing the need for a CFG stackifier for the common case. Toy and C - parsers updated to use it. -- [x] CFG structurer for direct gotos (`src/arch/wasm/structure.c`'s - `wasm_structurize`). Runs in `linearize()` before WIR emission and - rewrites the WIR list so every reachable free label becomes the - break/continue of a synthetic SCOPE_BLOCK/SCOPE_LOOP. Unblocks C - `goto`, Toy's AND/OR short-circuit lowering, and dynamic-size memcpy - loops. Switch islands (frontend's - `JUMP dispatch / case bodies / LABEL dispatch / selector / SWITCH` - shape) are unrolled by the structurer's `unroll_switch_islands` pass - — it reorders the WIR so selector + SWITCH appear before the case - bodies, making case labels forward refs the structurer handles - uniformly. The old `try_linearize_switch_island` linear-time matcher - is removed. -- [x] End-to-end Toy → wasm → run roundtrip via `cfree run` (lang/wasm - frontend → native CG → JIT). See `make test-wasm-toy`. 
-Remaining-work checklist for this phase (items marked `[x]` have since landed; `[partial]` and `[ ]` items are still open): - -- [x] Linear-memory + `__stack_pointer` global; address-taken locals; - `addr_of`; data/rodata sections lifted into a linear-memory + active - data segment. -- [x] Compact data layout: ObjBuilder SF_ALLOC sections are assigned - aligned bases from a 16-byte null guard upward (no more `sid * 1 MiB` - stride). Symbol-address resolution in function bodies is queued via - `WSymFixup` and patched once `wasm_materialize_data` knows every - section's final size. -- [x] Data-section relocations applied to the linear-memory image: - `R_ABS32` cross-symbol references (e.g. `*p = &x`) resolve to - `section_base[target_sec] + sym->value + addend`. `R_ABS64` and other - reloc kinds diagnose explicitly. -- [x] `&&label` addresses in static-data initializers diagnose early as - `wasm target: &&label addresses in static-data initializers are not - yet supported` (via the new `data_label_addr_unsupported_msg` - CGTarget hook), so the runner's SKIP regex catches them. -- [x] Aggregate (sret/byval) lowering for the wasm32 BasicCABI shape: - hidden i32 sret pointer prepended to the wasm function signature, - byval params received as i32 pointers and copied into a callee-isolated - stack-frame buffer at prologue time, `ret` of an INDIRECT value memcpys - the source bytes to the sret buffer, callers push the destination buffer - and indirect-arg addresses through `emit_addr_operand`. Targeted toy - fixtures: `129_record_by_value` (byval), `130_record_sret_return` (sret). -- [partial] Atomics: lowered via wasm-threads opcodes through the WIR ops - `WIR_ATOMIC_{LOAD,STORE,RMW,CAS}` and `WIR_FENCE` (`src/arch/wasm/emit.c`). - First atomic emission promotes the module's single linear memory to - `shared` with a `max_pages` cap via `ensure_shared_memory`. Full-width - i32/i64 load, store, RMW (xchg/add/sub/and/or/xor), cmpxchg, and fence - are covered. 
Sub-word atomic RMW/cmpxchg and atomic NAND diagnose-and-fail - because cfree's wasm core does not yet define the 8/16-bit RMW opcodes. - Memory order is ignored (wasm only models seq_cst). -- [x] Varargs: caller-packed linear-memory buffer per call site. - Variadic callees take a hidden trailing `i32` parameter holding the - buffer's address; the type-section signature, indirect-call - signature interning (`intern_indirect_signature` / - `abi_to_wasm_func_type`), and `wasm_func_begin` agree on the shape. - Each variadic arg occupies an 8-byte slot (`i32`/`f32` fill low - bytes; `i64`/`f64` fill the slot); the caller saves/restores - `__stack_pointer` around the call so varargs in a loop don't grow - the linear stack. `va_list` is a single `i32` pointer into the - buffer (`wasm32_vtable.va_list_info = {4, 4, ABI_SC_PTR, …}`). - `WIR_VA_START` / `WIR_VA_ARG` / `WIR_VA_COPY` lower the four CG - hooks; `va_arg<T>` loads at natural width then advances `*ap` by 8. - Aggregate variadic args and `va_arg` of an aggregate type - diagnose-and-fail. Targeted toy fixtures: - `23_cg_api_typed_varargs` (single i64 vararg) and - `133_varargs_mixed_types` (multi-type, `va_copy`, zero varargs). -- [ ] TLS, bitfields, indirect calls — all - diagnose-and-fail. Inline asm no longer belongs in this group: it now - lowers through WAT-instruction templates (see the inline-asm item under - "Wasm Target Backend"). -- [x] Compiler intrinsics: `CGTarget.intrinsic` now records a - `WIR_INTRINSIC` for bit ops and overflow arith and emits inline - expansions at linearize time (`src/arch/wasm/emit.c`, - `wasm_intrinsic` recorder + `emit_intrinsic_*` helpers): - - bit ops: `CLZ`/`CTZ`/`POPCOUNT` lower to `i32/i64.{clz,ctz,popcnt}`. - - byte-reverse: `BSWAP16`/`BSWAP32` via i32 shift/and/or expansion, - `BSWAP64` via i64 expansion. - - hints: `PREFETCH` drops, `EXPECT`/`ASSUME_ALIGNED` pass the input - through to the result reg. - - memory: `MEMMOVE` (and `MEMCPY`/`MEMSET` if reached via the - intrinsic path) route through `WIR_COPY_BYTES`/`WIR_SET_BYTES`, - which now lower to `memory.copy`/`memory.fill`. 
`emit_addr_operand` - accepts `OPK_REG` so address-already-in-register operands work. - - checked overflow: `S/UADD_OVERFLOW`, `S/USUB_OVERFLOW` lower to - add/sub + (un)signed compare; `S/UMUL_OVERFLOW` for i32 widens - to i64 multiply + range check. `S/UMUL_OVERFLOW` for i64 - diagnose-and-fail (needs partial-product synthesis). - - `setjmp`/`longjmp` and any unrecognized intrinsic kind reach - `compiler_panic` with a per-name diagnostic. -- [partial] `copy_bytes`/`set_bytes` lower to `memory.copy`/`memory.fill` - when the active feature set includes `BULK_MEMORY` (subagent B). -- [partial] Standard runtime targeting: linear memory exported as - `"memory"`, no synthesized `_start`, and undefined function symbols - emitted as `(import "env" "<sym>" ...)` declarations honoring - `__attribute__((import_module, import_name))` overrides (subagent B). -- [ ] Relocatable-object emission (`linking`/`reloc.*` custom sections, - `WasmObjMeta`). v1 single-TU output is a final module with no relocations; - multi-TU needs this. - -Targeted tests: `make test-wasm-toy` drives the existing toy corpus through -`-target wasm32-none -c` + roundtrip. Current snapshot: **99 pass / 0 fail / -34 skip** out of 133 fixtures, after the CFG structurer for direct gotos -(`src/arch/wasm/structure.c`) landed and absorbed the switch-island -shape. The structurer unblocked Toy AND/OR short-circuit lowering (free -`branch_false` / `jump` to forward labels), dynamic-size `@memcpy` / -`@memmove` / `@memset` byte-copy loops, and direct `goto` in C. Skips -cover pre-existing wasm-backend gaps: -`@atomic_is_lock_free<i64>` returns 0 on wasm32, and dynamic i64 -pointer-index arithmetic emits a type-mismatched `i32.add`. A -dedicated `test-wasm-target` fixture directory is still planned. - -### Phase 3: Wasm Object Read/Write - -Broaden `emit_wasm` and `read_wasm` for relocatable objects using -tool-conventions metadata. 
- -Acceptance: - -- Wasm objects roundtrip through `cfree objdump`/reader/writer without losing - sections, symbols, relocations, and unknown custom sections. -- Existing internal relocation kinds `R_WASM_FUNCIDX`, `R_WASM_TABLEIDX`, - `R_WASM_MEMOFS`, and `R_WASM_TYPEIDX` are encoded/decoded against the - corresponding tool-conventions wire relocation numbers. Add new - `RelocKind` values only when the wire format requires a distinction the - current internal names cannot represent. -- unsupported relocation kinds fail with a diagnostic that names the kind. - -Targeted test: `make test-wasm-obj`. - -### Phase 4: Wasm Static Linker - -Implement a Wasm-specific link path reached from `cfree_link_exe` before the -native `link_resolve`/`LinkImage` path. Reuse the existing `Linker` input and -archive machinery where practical, but produce a `WasmLinkImage`, not a -native `LinkImage`. - -Acceptance: - -- multiple Wasm object files link into one valid module. -- archives participate in demand loading. -- undefined imports, exported symbols, constructors, memory layout, and table - layout are deterministic. - -Targeted test: `make test-wasm-link`. - -### Phase 5: Wasm to Wasm - -Use the frontend and target together for Wasm-to-Wasm normalization and later -optimization. This should not be a byte-preserving copy; object roundtrip tests -cover preservation. The frontend path validates and re-emits semantically. - -Acceptance: - -- Wasm input lowered to Wasm output validates. -- simple numeric/control modules preserve behavior. -- unsupported features still fail before emission. - -## Remaining Feature Checklist - -This checklist tracks the path from the initial Wasm/WAT frontend subset to a -complete implementation. Keep each item tied to a small named fixture or -targeted test target. - -Current checked reader/encoder and validation items describe the -`lang/wasm` frontend implementation: it can parse, validate, preserve, and -re-encode the listed module metadata. 
Native lowering now covers mutable -numeric globals, imported function declarations, active tables/elements for -`call_indirect`, start functions, and growable single-memory state for the -accepted frontend subset. The native runner path now uses an explicit -`CfreeWasmInstance*` ABI: lowered functions, direct calls, `call_indirect` -arms, start dispatch, and `__cfree_wasm_init` all receive the instance; -single-memory metadata and numeric globals live under that instance. Import -slots and runtime table storage have since moved behind the instance as well -(see the checked items under "Native Lowering" and "Runtime and Instance -Model"); the C-facing exported wrapper ABI remains open work. - -### Frontend Source and Driver - -- [x] Add `CFREE_LANG_WASM` and suffix inference for `.wat`/`.wasm`. -- [x] Register the Wasm frontend in driver-created compilers and pipelines. -- [x] Add `make test-wasm-front` and a small WAT-to-Wasm test helper. -- [x] Add explicit negative frontend tests for malformed WAT, malformed Wasm, - bad indices, stack underflow, unsupported sections, and unsupported opcodes. -- [x] Decide stdin language selection for WAT input instead of treating `-` as - C-only. -- [x] Add dbg smoke coverage for `:language wasm` / `:language wat`. - -### WAT Reader - -- [x] Parse modules, functions, exports, params, results, locals, folded - expressions, numeric indices, and `$name` function/local references. -- [x] Parse line comments. -- [x] Parse block comments. -- [x] Parse standard WAT string escapes and byte escapes. -- [x] Parse integer literals with signs, underscores, hex notation, and - boundary diagnostics. -- [x] Parse float literals for `f32.const` and `f64.const`. -- [x] Parse module-level type definitions and `(func (type N) ...)`. -- [x] Parse memories and active data segments for the frontend subset, - including indexed memories and memory64 declarations. -- [x] Parse imports, tables, globals, elements, start, and custom/name - sections. 
-- [x] Parse staged proposal syntax for memory-indexed memory ops, - `memory.size`/`memory.grow`, `return_call`, `return_call_indirect`, shared - memories, atomic load/store aliases, and `atomic.fence`. -- [x] Preserve source locations through validation and lowering diagnostics. - -### Binary Reader and Encoder - -- [x] Decode and encode the current executable-module subset: type, function, - export, code, locals, constants, calls, local ops, and integer ops. -- [x] Reject `linking` custom sections as frontend input. -- [x] Move shared binary mechanics into `src/wasm` with decode/encode contexts. -- [x] Validate section length, ordering, count, and index-space edge cases with - direct format tests. -- [x] Decode and encode memories and active data segments for the frontend - subset, including multi-memory memargs/data segments and memory64 limits. -- [x] Decode and encode imports, tables, globals, elements, start, and - custom/name sections; target-feature custom sections are preserved as raw - metadata. -- [x] Decode and encode tail-call opcodes for frontend input and WAT-to-Wasm - roundtrips. -- [x] Preserve unknown custom sections in the frontend WAT-to-Wasm/binary - module path. -- [x] Add deterministic fixtures for malformed LEB128 and truncated bodies. - -### Validation - -- [x] Validate basic stack depth, direct call indices, local indices, and - integer-only locals/params for the current subset. -- [x] Replace depth-only validation with typed operand and control stacks for - the accepted frontend subset. -- [x] Validate exact function result stack shape, unreachable polymorphism, and - fallthrough after `return`/`unreachable`. -- [x] Validate blocks, loops, if/else, and branch depths for no-result - control blocks. -- [x] Validate branch result arity and `br_table`. -- [x] Validate memory/table/global/data/element indices, limits, active/passive - segment rules, and start function signature. 
-- [x] Add internal `WasmFeatureSet` bits for staged frontend proposal support. -- [x] Centralize WAT/binary feature gates and diagnostics around - `WasmFeatureSet`. -- [ ] Add clear diagnostics for proposals still outside the frontend support - plan: SIMD, exceptions, GC, and component model. - -### Frontend Proposals - -- [x] Add `WasmFeatureSet` bits for threads, typed function refs, tail calls, - multi-memory, memory64, bulk memory, and non-trapping float-to-int - conversions. -- [x] Add WAT parser gates, binary opcode/section gates, and negative fixtures - for threads, typed function refs, tail calls, multi-memory, and memory64. -- [partial] Add WAT parser gates, binary opcode gates, and negative fixtures - for bulk memory and non-trapping float-to-int conversions (the `0xfc` - opcode prefix block: `memory.copy`, `memory.fill`, `memory.init`, - `data.drop`, `table.copy`, `table.init`, `table.fill`, `table.grow`, - `table.size`, `elem.drop`, and the eight saturating-truncate - variants). Subagent A is landing the decoder/encoder/validator gates; - fixtures live under `test/wasm/cases/bulk_*` and - `test/wasm/trap/bulk_*`. -- [partial] Lower bulk-memory operations in the frontend through the - existing `copy_bytes`/`set_bytes` CG ops with bounds-check prologues, - passive-segment tables hung off `CfreeWasmInstance`, and `data.drop`/ - `elem.drop` zeroing of the length field. Subagent C is wiring this. -- [x] Implement staged threads parsing for shared memories, atomic load/store - aliases, and `atomic.fence`. -- [x] Implement full threads parsing and validation: atomic RMW/cmpxchg, - wait/notify, legal alignment/type checks, and shared-memory-only rejection. -- [x] Lower threads proposal operations through CG atomics and instance/runtime - wait/notify hooks without process-global synchronization state. 
-- [x] Implement typed function references parsing and validation: typed refs, - `ref.null`, `ref.func`, `ref.is_null`, `call_ref`, and reference-aware table - checks. -- [x] Lower typed function references through nullable runtime function - reference values and Wasm type-id checks, preserving trap behavior. -- [x] Implement tail-call parsing and validation for `return_call` and - `return_call_indirect`. -- [x] Implement typed-reference tail calls where enabled. -- [x] Lower tail calls through `CFREE_CG_TAIL_MUST` and diagnose unsupported - native target tail-call shapes before emission. -- [x] Implement multi-memory parsing, validation, encoding, and runtime - instance layout for more than one imported or defined memory. -- [x] Lower memory-indexed loads/stores, `memory.size`, `memory.grow`, and data - initialization against the selected memory. -- [x] Implement memory64 parsing, validation, encoding, and runtime metadata - for 64-bit limits and i64 memory indices. -- [x] Lower memory64 addressing and bounds checks with i64 arithmetic before - converting checked addresses to host pointers. - -### Native Lowering - -- [x] Lower straight-line exported functions to native object/JIT through - `CfreeCg`. -- [x] Lower i32/i64 constants, local get/set/tee, direct calls, returns, drops, - integer arithmetic, shifts, bitwise ops, and integer comparisons. -- [x] Lower `unreachable` and deterministic traps for memory bounds checks. -- [x] Lower deterministic traps for integer divide/remainder edge cases. -- [x] Lower `select`. -- [x] Lower `local.set`/`local.tee` for address-taken or spilled locals without - depending on backend-local optimization behavior. -- [x] Lower structured control flow: `block`, `loop`, `if`, `else`, `br`, - `br_if`, and `return`. -- [x] Lower `br_table`. -- [x] Lower full i32/i64 integer ops, including count/rotate/popcount/clz/ctz - and MVP conversions. -- [x] Lower f32/f64 arithmetic, comparisons, constants, conversions, and - reinterpret ops. 
-- [x] Add checked traps for `i32/i64.trunc_f32/f64_{s,u}` NaN/out-of-range - cases. -- [x] Lower single linear memory load/store operations, `memory.size`, and - active data initialization. -- [x] Implement checked single-memory loads/stores and growable storage. -- [x] Lower indexed multi-memory load/store operations, `memory.size`, - `memory.grow`, and active data initialization. -- [x] Lower memory64 address operands and bounds checks using i64 arithmetic. -- [x] Lower `return_call` and `return_call_indirect` with validated result - shape through the required-tail CG path. -- [x] Lower atomic load/store, RMW, cmpxchg, wait/notify, and `atomic.fence` - for the current single-threaded runner semantics. -- [x] Implement numeric globals for native lowering. -- [x] Implement imported function declarations for native lowering. -- [x] Implement tables, active elements, and `call_indirect`. -- [x] Pass an explicit `CfreeWasmInstance*` through lowered functions, direct - calls, `call_indirect` arms, start dispatch, and module initialization. -- [x] Move lowered single-memory metadata and numeric global state behind the - explicit `CfreeWasmInstance` ABI instead of generated module-local storage. -- [x] Move imported functions, imported globals, imported memories, and tables - behind instance import/table slots instead of direct external declarations - and compile-time active-element dispatch. -- [ ] Define and implement the C-facing exported wrapper ABI that keeps the - instance parameter explicit. -- [x] Trim the unconditional trailing `cfree_cg_ret` after a function body - whose last instruction is `return`, `return_call`, - `return_call_indirect`, `return_call_ref`, or `unreachable`. The body - already terminated the SValue stack; an extra ret underflowed when the - Wasm target emits an explicit `return` at end-of-body (which the new - toy-frontend roundtrip exercises). 
- -### Runtime and Instance Model - -- [x] Add an internal runtime ABI header under `lang/wasm` with - `CfreeWasmMemory`, opaque `CfreeWasmInstance`, and `CfreeWasmInitFn`. -- [x] Allocate runner-owned instances and linear memories in `cfree run`, - `jit-runner`, and the Wasm executable start harness without process-global - Wasm instance or memory storage. -- [x] Define initialization for single memories, active data, numeric globals, - and start functions in `__cfree_wasm_init(instance)`. -- [x] Add internal `CfreeWasmTable` and import-slot structs under `lang/wasm` - or `rt`, and use them for imported functions/globals/memories and active - element initialization. -- [x] Add trap helpers for unreachable, division traps, invalid conversion, - bounds checks, table checks, and indirect-call signature checks. -- [x] Define ownership and initialization for runtime tables, active elements, - and imported state. -- [x] Add frontend runner tests that instantiate memory/global/start modules - without process-global Wasm state. -- [x] Add frontend runner tests for instance-owned import slots and runtime - table state once those slots exist. -- [x] Extend `CfreeWasmMemory` and runner/start harness prefixes for 64-bit - page counts, memory flags, and multiple instance-owned memories. - -### Wasm Target Backend - -- [x] Register `arch_impl_wasm` and a wasm32 ABI vtable - (`src/arch/wasm/{arch,abi}.c`; `src/arch/registry.c`). -- [x] Implement scalar wasm32 BasicCABI classification: void, scalar - (i32/i64/f32/f64 plus i32 pointer), empty struct → IGNORE, singleton - scalar record → DIRECT, other aggregates and arrays → INDIRECT - (sret/byval). i128 and long-double scalars panic. -- [x] Implement a Wasm `CGTarget` that emits function bodies, locals, type - entries, and exports. Data segments and static-data relocations are - tracked by separate items in this list (linear memory, compact data - layout, ObjBuilder relocations); only relocatable-object emission for - multi-TU output is still pending. 
-- [x] Diagnose unsupported target features: wasm64 (ABI panics in - `compute_func_info`), TLS, bitfields, indirect calls, - `setjmp`/`longjmp`, i64 checked-mul overflow — all reach - `compiler_panic` with a `wasm:` / `wasm target:` / `wasm32 ABI:` - prefix that `test/toy/run.sh`'s W path recognizes as SKIP. WASI - startup, SIMD, and irreducible control flow still produce raw - diagnostics rather than wasm-specific ones; tighten later. - Varargs and the `va_*` CG hooks now have real implementations - (caller-packed linear-memory buffer; see "Varargs" bullet below). -- [x] Implement inline asm with WAT-instruction templates. The - `asm_block` vtable entry parses the template via the - `wasm_parse_wat_body` helper in `src/wasm/wat.c`, validates against a - synthetic function via the new `wasm_validate_func` (factored out of - `wasm_validate` in `src/wasm/validate.c`), then emits a new - `WIR_ASM_BLOCK` op (`src/arch/wasm/internal.h`, `emit.c`) whose - linearizer allocates wasm locals for each input/output operand and - splices the parsed body verbatim. The snippet addresses operands via - `local.get/set/tee N`: `N < nin` reads/writes input i, `N in [nin, - nin+nout)` reads/writes output (N-nin). `"+r"` inout and numeric - tieback constraints ("0".."9") share the input and output's wasm - local, so an empty template paired with `+r` behaves as identity. - Constraints map: `"r"` → wasm local, `"i"` → const-folded - `i32.const`/`i64.const`, `"m"` → i32 base address. `memory` clobber - is an accepted no-op (cg/asm.c already spilled live SSA across the - block); named-register clobbers are rejected. Disallowed in v1: - escaping `br`/`br_if`/`br_table`, `return`/`return_call*`, - `call_indirect`, snippet-internal locals (the only declared locals - are the implicit operand slots), more than one output, register - clobbers. File-scope `__asm__("…")` is rejected via the new - `CGTarget.file_scope_asm` vtable hook with `wasm target: file-scope - asm not yet supported`. 
Tests: `test/wasm-target/check_asm.sh` - exercises a `i32.popcnt` template positive case and an escaping `br` - negative case; the existing `20_cg_api_inline_asm_full` / - `102_typed_asm_operands` toy fixtures and their kin now run end-to-end - through `test-wasm-toy` (formerly SKIP'd). -- [partial] Lower `copy_bytes` and `set_bytes` to the bulk-memory opcodes - `memory.copy` (0xfc 0x0a) and `memory.fill` (0xfc 0x0b) when the - active feature set includes `BULK_MEMORY` (subagent B). Replaces the - inline byte-load/store loops in `src/arch/wasm/emit.c` for - `WIR_COPY_BYTES`/`WIR_SET_BYTES`; verified structurally by - `test/wasm-target/check_memory_copy.sh`. -- [partial] Export the linear memory under the conventional name - `"memory"` so cfree-produced modules load directly in wasmtime, - wasmer, Node, and browser hosts (subagent B). Verified by - `test/wasm-target/check_memory_export.sh`. -- [partial] Emit `(import "env" "<sym>" ...)` declarations for undefined - function symbols (subagent B). `__attribute__((import_module("M"), - import_name("F")))` in the C frontend overrides the default - module/field pair. Verified by `test/wasm-target/check_imports.sh` - against `import_decl.c` and `import_decl_attribute.c`. -- [x] Refactor structured `if`/`if-else` lowering through - `cfree_cg_block_begin` and nested `SCOPE_BLOCK` scopes - (`include/cfree/cg.h`; `src/cg/control.c`). Drives the Toy parser and - C `parse_if_stmt` so both reach the Wasm backend as - `block`/`break`/`end` rather than label-and-jump primitives — no CFG - stackifier needed for the common case. -- [x] `&&label` addresses inside static-data initializers panic with an - early `wasm target: &&label addresses in static-data initializers are - not yet supported` diagnostic before the MCEmitter sees them. Wired - via the new `CGTarget.data_label_addr_unsupported_msg` hook - (`src/arch/arch.h`; `src/cg/data.c`; `src/arch/wasm/target.c`). 
-- [x] Lower Wasm-target `switch_` through real `br_table` for dense ranges - and stackify the frontend switch-island shape into structured blocks so - case/default labels become legal Wasm branch depths. Sparse ranges fall - back to a structured compare chain instead of the shared label-based - lowering. -- [x] Linear-memory + `__stack_pointer` global; address-taken locals; - `addr_of`, stack-backed frame slots, `alloca`, and byte copy/set lowering. -- [x] Compact global/static data layout. `wasm_materialize_data` - (`src/arch/wasm/emit.c`) walks SF_ALLOC ObjBuilder sections in id order, - assigns each one an aligned `section_base[sid]` starting from a 16-byte - null guard, and flattens section bytes into the linear-memory image. - Replaces the previous `sid * 1MiB` per-section stride that produced - ~1MB modules for any global. Symbol-address resolution in function - bodies is queued via `WSymFixup` (an i32.const placeholder + patch at - finalize) so absolute addresses are only computed once every section's - final size is settled. -- [x] Apply ObjBuilder relocations into the linear-memory image. After - flatten, `apply_data_relocs` walks `obj_reloc_at` and patches `R_ABS32` - references with `section_base[sym->sec] + sym->value + addend` so - cross-symbol initializers (e.g. `*p = &x`) work for the single-TU - final-module case. `R_ABS64` and other reloc kinds diagnose explicitly. -- [x] Add `make test-wasm-toy` (Toy → wasm32 -c → `cfree run` of the - `.wasm`). Drives the existing toy corpus through the backend and the - Wasm-input frontend's native JIT. Snapshot: 74 pass / 4 fail / 55 skip - after the compact data layout / static-data relocation / &&label-data - reject landings; remaining failures are switch/enum lowering issues - unrelated to global data. -- [partial] Add a dedicated `make test-wasm-target` corpus that - structurally validates produced Wasm modules (independent of the JIT - roundtrip). 
Initial scripts under `test/wasm-target/` cover bulk - memory opcode lowering, exported `"memory"`, and `(import ...)` - declarations. - -### Object Read/Write and Link - -- [x] Add `ObjBuilder` extension payload hooks (`obj_ext_set/get/clear` in - `src/obj/obj.{h,c}`); `OBJ_EXT_WASM` slot used by the Wasm target. -- [x] Implement `emit_wasm` for the single-TU final-module case - (`src/obj/wasm_emit.c`): flushes the `WasmModule` attached under - `OBJ_EXT_WASM` via `wasm_encode`, or an empty magic+version header if no - module was attached. -- [ ] Add `WasmObjMeta` for module graph, symbols, relocations, data segment - metadata, target features, and init functions. v1 stores a `WasmModule*` - directly under `OBJ_EXT_WASM`; the typed `WasmObjMeta` is still pending - and is needed before the tool-conventions relocatable-object shape. -- [ ] Extend `emit_wasm` for tool-conventions relocatable objects - (`linking` custom section, `reloc.*` custom sections, target features). -- [ ] Implement `read_wasm` for relocatable objects and symbol inspection - (still stubbed in `src/api/stubs.c`). -- [ ] Implement reloc custom sections for the current `R_WASM_*` relocation - kinds, including padded LEB rewrites. -- [ ] Add `make test-wasm-obj`. -- [ ] Add a Wasm-specific linker path that emits `WasmLinkImage`, not native - `LinkImage`. `cfree_link_exe` still always builds a `Linker` + native - `LinkImage`. -- [ ] Merge type/import/function/table/global/element/data/custom sections and - apply relocations. -- [ ] Support archives, demand loading, undefined imports, export policy, - constructors, memory layout, and table layout. -- [ ] Add `make test-wasm-link`. - -### Wasm-to-Wasm - -- [partial] Use the frontend plus Wasm target to normalize simple modules. - The Toy → Wasm → run roundtrip via `make test-wasm-toy` exercises the - backend's output through the Wasm-input frontend's native JIT (an - indirect Wasm-to-native rather than Wasm-to-Wasm path). 
A direct - Wasm-input → Wasm-output mode requires hooking the lang/wasm frontend - to the Wasm target instead of native CG. -- [partial] Validate emitted modules with cfree's own validator and optional - external validators when available. The frontend's validator runs on the - Toy-produced `.wasm` during `cfree run`; an explicit - validate-only pass over Wasm-backend output is not yet wired. -- [x] Preserve behavior for numeric/control fixtures while allowing the - byte representation to change. 32 of 131 toy fixtures roundtrip - correctly today; memory-using fixtures don't yet (linear memory in the - backend is pending). - -## Testing Strategy - -Prefer small, named fixtures over broad corpus runs: - -- hard-coded binary fixtures for malformed section and LEB cases. -- tiny `.wasm` modules checked into `test/wasm/format`. -- generated fixtures only when the generator is checked in and deterministic. -- external validators/runtimes are optional skips, not hard requirements for - all developer machines. - -Suggested new targets: - -- `make test-wasm-format` -- `make test-wasm-obj` -- `make test-wasm-target` -- `make test-wasm-link` -- `make test-wasm-front` - -External tools such as `wasm-tools validate` or WABT `wasm-validate` may be -used as optional comparison oracles, but the normal tests should exercise -cfree's own validator directly. The suite must remain useful without external -Wasm tooling installed. - -## Open Questions - -- Should the first Wasm final module target `unknown-unknown` only, or should - we define enough WASI startup/import policy for `wasm32-wasi` early? - Decision: freestanding only first. Keep `wasm32-wasi` as a recognized but - explicitly unsupported platform until the freestanding object/link path is - working. -- Should C-to-Wasm initially require structured CG scopes from frontends, or - should the Wasm backend include a general unstructured-CFG-to-structured-Wasm - pass from day one? Decision: hybrid. 
Prefer structured scopes, add a limited - reducible-CFG structurer, and diagnose irreducible/computed-goto cases in v1. - Implemented as `wasm_structurize` in `src/arch/wasm/structure.c`: wraps - free WIR_LABELs in synthetic `SCOPE_BLOCK` (forward goto) / `SCOPE_LOOP` - (backward goto) before linearization, lifting open positions across CG - scope boundaries when a predecessor sits inside an extra scope. Hybrid - labels (both forward and backward predecessors), forward jumps that - cross into a CG scope's interior, and computed-goto patterns diagnose - with `wfail`. Switch-island regions are unrolled in the same pass by - `unroll_switch_islands`: a pre-pass that detects the frontend's - `JUMP dispatch / case bodies / LABEL dispatch / SWITCH` shape and - rewrites the WIR to put selector + SWITCH first and case bodies - after, dropping the unconditional jump and the dispatch label. Case - labels then become forward refs and flow through the same direct-goto - wrapping as any other label. -- How much of BasicCABI should be implemented before any public Wasm target - flag is advertised: scalar-only, scalar plus indirect aggregates, or full - varargs and long-double helper coverage? Decision: scalar plus indirect - aggregates first; defer varargs and runtime-heavy cases with diagnostics. -- What embedding API should own `CfreeWasmInstance` for Wasm-input modules? - Keeping it explicit avoids global state, but the public shape should be - designed before exported C wrappers are promised. +## The shared model and the boundary header + +`src/wasm/wasm.h` is the contract that decouples format mechanics from the three +consumers. It declares the in-memory `WasmModule` (types, funcs, memories, +tables, globals, element/data segments, exports, custom sections, start +function, plus a `WasmFeatureSet` bitfield) and the small leaf types it is built +from (`WasmFuncType`, `WasmFunc`, `WasmInsn`, `WasmValType`, the +`WasmInsnKind` enum of decoded opcodes). 
It also declares the entry points the +consumers call: `wasm_decode_binary`, `wasm_parse_wat`, `wasm_validate`, +`wasm_encode`, `wasm_emit_cg`, plus the per-section builder helpers +(`wasm_add_func`, `wasm_intern_func_type`, `wasm_func_add_insn`, ...) and the +instruction-classification helpers (`wasm_insn_is_load`, `wasm_atomic_rmw_op`, +`wasm_conversion_kind`, ...). + +Three design choices keep this boundary clean: + +- **One model represents every shape of module.** The same `WasmModule` holds a + fully-decoded executable module, a WAT-parsed module, and a + backend-synthesized module. Imports are modelled inline on each kind + (`WasmFunc.is_import`, `WasmMemory.is_import`, ...) rather than in a separate + import vector, so a synthesized module and a decoded one look identical to a + consumer. Decoded instruction bodies live in the per-function `insns` vector; + decode/validate populate it, the backend builds it, and encode flushes it. + +- **The header uses public `Cfree*` aliases on purpose.** It includes only the + public `cfree/cg.h`, `cfree/compile.h`, `cfree/core.h`, `cfree/frontend.h` + headers, not libcfree internals. That lets every Wasm caller in the tree — + including `lang/wasm`, which is a frontend and must stay on public APIs — + share the model without pulling in compiler internals. All allocation hangs + off the module's `CfreeHeap*` (`wasm_realloc`/`wasm_strdup`); there is no + global decode state, per the project's no-global-state rule. + +- **Diagnostics route through the compiler.** `wasm_error` forwards to + `cfree_frontend_vfatal`, so a malformed module or an unsupported feature is a + clean front-end diagnostic with a source location rather than a crash. Every + unsupported feature is diagnosed explicitly rather than silently dropped. + +## Format mechanics (`src/wasm/`) + +This layer knows the wire format and the type system; it knows nothing about who +is calling it. 
+ +**Binary decode (`decode.c`).** A bounds-checked `BinReader` cursor walks the +sectioned binary: magic/version, then size-prefixed sections in order. LEB128 is +decoded with explicit overflow guards — `bin_uleb`/`bin_uleb64` reject +over-long encodings, and the signed reader accumulates in `uint64_t` and casts +at the end to avoid shifting into the sign bit. `wasm_is_binary` sniffs the four +magic bytes `\0asm`; this is what `cfree_detect_fmt` (`src/api/object_detect.c`) +keys on to classify link inputs as `CFREE_BIN_WASM`. A second entry point, +`wasm_decode_one_insn`, decodes a single instruction into a caller-owned scratch +module — the disassembler uses it so the opcode mapping has exactly one source +of truth. + +**Text parse (`wat.c`).** A from-scratch S-expression tokenizer + parser for the +accepted WAT subset: modules, funcs, params/results/locals, folded and flat +expressions, `$name` and numeric index references, type definitions, memories, +data/element segments, globals, exports, imports, comments, and the full literal +grammar (signed/hex/underscored integers with boundary diagnostics, f32/f64 +floats, string and byte escapes). It lowers into the *same* `WasmModule` as the +binary decoder and then runs the same validation, so text is a developer/test +convenience, not a parallel semantic path. `wasm_parse_wat_body` parses a bare +instruction sequence into a caller-supplied function — the backend's inline-asm +path reuses it. + +**Validation (`validate.c`).** A typed operand-stack + control-stack validator. +`wasm_validate` checks module-level invariants (index spaces, start-function +signature, section consistency) then validates each function body via +`wasm_validate_func`, which tracks the value stack and a control-frame stack +with block result types and unreachable-after-branch handling. Validation runs +before any lowering: a malformed module never reaches CG emission or encoding. 
+`wasm_validate_func` is exposed separately so callers that synthesize scratch +functions (the Wasm-target inline-asm path) can validate them in isolation. + +**Encode (`encode.c`).** The inverse of decode: writes magic/version and the +section sequence with LEB128 immediates, through the public `CfreeWriter`. This +is the single writer used by both `cfree_wasm_wat_to_wasm` and the +object-backend `emit_wasm`. + +**Model + helpers (`module.c`, `insn.c`).** `module.c` owns construction and +teardown of `WasmModule` and its heap-grown sub-vectors. `insn.c` is the +shared classification table: predicates over `WasmInsnKind` (is-load, +is-store, the atomic families), the mnemonic table, feature gating +(`wasm_feature_enabled`, `wasm_require_feature`), and the maps from opcode kind +to CG-level operations that the frontend lowering consumes. + +## Frontend: `lang/wasm` (Wasm → native CG) + +`lang/wasm` is a normal cfree frontend (`cfree_wasm_frontend_vtable`, registered +for `.wat`/`.wasm`; the driver also accepts `-x wasm`/`-x wat` and the dbg +`:language` switch). It compiles a Wasm module's *semantics* onto a native +target. This is distinct from `read_wasm`, which treats a `.wasm` file as a +link-time object; the frontend rejects relocatable objects (a `linking` custom +section) as input, telling the user to supply them as objects instead. + +`lang/wasm/wasm.c` is thin: decode-or-parse (binary vs WAT by magic), validate, +then hand the module to `wasm_emit_cg` in `cg.c`. The interesting work is the +whole-module-to-native lowering in `lang/wasm/cg.c`. + +**Instance model.** A Wasm module instance has state — linear memories, mutable +globals, tables, imported functions, and the ability to trap. 
cfree reifies that +state as a generated per-module `CfreeWasmInstance` record (built field-by-field +in `wasm_cg_build_runtime`): a `CfreeWasmMemory` per memory, a function-pointer +slot per import, a func-ref entry per defined function, a slot per global, table +storage, and passive data/elem segment slots. The runtime ABI types +(`CfreeWasmMemory`, `CfreeWasmTable`, the passive-segment slots) live in +`lang/wasm/runtime_abi.h`, not in the public API. + +Every lowered function — including direct calls, `call_indirect` arms, the start +dispatcher, and the generated `__cfree_wasm_init` — receives a hidden +`CfreeWasmInstance*`. There is no standalone-export fast path: module state is +always explicit, and an unused instance pointer is left for later optimization +to remove. Imported functions are indirect calls through the instance's import +slots. Memory accesses read the active memory base/size from the instance and +bounds-check before the native load/store. Traps call non-returning runtime +helpers (`__cfree_wasm_trap_*`, one per `WasmTrapKind`: unreachable, division, +invalid-conversion, bounds, table, signature). + +**Coverage.** The MVP numeric/control/memory core, plus mutable globals, imported +function declarations, active tables/elements for `call_indirect`, start +functions, growable single-memory state, bulk-memory ops +(`memory.copy`/`fill`/`init`, `data.drop`, the table equivalents) with +bounds-check prologues, and non-trapping float-to-int conversion. Bulk ops and +non-trapping conversions are gated behind their `WasmFeatureSet` bits. + +**Host imports.** When a frontend-lowered module declares imports, an embedder +binds them by name through the public API in `include/cfree/wasm.h` +(`cfree_wasm_set_host_imports` with a static table and/or a dynamic resolver). +The lowered image carries three readonly metadata symbols — +`__cfree_wasm_imports`, `__cfree_wasm_nimports`, `__cfree_wasm_types` (wire +format in `runtime_abi.h`). 
The runtime binder in `lang/wasm/host_imports.c` +walks that metadata, looks up each `(module, field)` pair, validates the bound +function's signature against the module's recorded `WasmFuncType`, and writes the +pointer into the instance's import slot. Unbound imports trap on first call. The +binder deliberately mirrors the raw `WasmValType` byte encoding rather than +including the internal `src/wasm` header, keeping the public runtime free of +compiler internals. + +## Backend: `src/arch/wasm` (CG → Wasm) + +The Wasm backend is a codegen target with an unusual shape: it is a `CGBackend` +**without** the native parts of `ArchImpl`. `arch_impl_wasm` (in `arch.c`) +registers `cgtarget_new` and a disassembler (`wasm_disasm_new`, which renders the +code section as WAT for objdump) but leaves `asm_new`, `link`, the label-fixup +hook, and all register-file hooks NULL — there is no machine code, no native +assembler, and no native image layout for wasm32. It also installs +clang-compatible predefined macros (`__wasm__`, `__wasm32__`, `__ILP32__`, ...). + +**Record-then-replay.** Rather than lower CG operations to Wasm directly, the +backend uses the shared CG IR recorder ([CODEGEN.md](CODEGEN.md), [IR.md](IR.md)): +`wasm_cgtarget_new` (in `target.c`) wraps a private `WTarget` emitter in a +`cg_ir_recorder` (`src/cg/ir_recorder`). The frontend's CG calls are first +recorded into a `CgIrModule`; at finalize, `wasm_emit_ir_module` (`ir_emit.c`) +replays that IR into the `WTarget`. The replay drives a second, private IR — the +**WIR** list defined in `internal.h` — one record per emitted operation, kept +separate from the final `WasmFunc` body. This two-stage buffering is what makes +deferred, whole-function structuring possible: the WIR can be reordered and +rewritten before it is linearized into the structured Wasm a function body +requires. 
+ +The recorder config also carries the backend's diagnose-before-emit policy: it +opts out of local-static-data emission, supplies the `&&label`-in-static-data +diagnostic, and reports why a tail call is unrealizable (e.g. a variadic callee +whose vararg buffer lives in a torn-down frame). + +**WIR and the structurer.** Each `WTarget` SSA `Reg` becomes a Wasm local, +materialized by `local.get`/`local.set`. Control flow is the hard problem: Wasm +has no arbitrary jumps, only structured `block`/`loop`/`if` with relative branch +depths. CG scopes (`SCOPE_LOOP`/`SCOPE_BLOCK`/`SCOPE_IF`) map directly, but +frontends also emit free labels and `goto`. `structure.c`'s `wasm_structurize` +runs over the recorded WIR before linearization and rewrites it so every +reachable free label becomes the break target of a synthetic forward +`SCOPE_BLOCK` or the continue target of a backward `SCOPE_LOOP`; jumps then +resolve through the ordinary scope-bound branch machinery. An +`unroll_switch_islands` pass reorders the frontend's `switch` shape +(jump-to-dispatch / case bodies / dispatch / selector / `SWITCH`) so the selector +and switch precede the case bodies, turning case labels into uniform forward +references. Irreducible control flow and unliftable cross-scope labels are +diagnosed, not miscompiled. + +**ABI (`abi.c`).** `wasm32_vtable` implements the tool-conventions BasicCABI for +wasm32 (ILP32): void ignored; scalars ≤ 8 bytes direct as i32/i64/f32/f64; +pointers as a 4-byte int part; empty structs ignored; a singleton-scalar struct +passed as that scalar; all other aggregates and arrays passed indirectly +(sret/byval). `__int128` and binary128 `long double` have no Wasm representation +and panic at classification with a specific message rather than routing silently +through memory. 
Aggregates use a downward-growing linear-memory frame through a +`__stack_pointer` global; byval params arrive as i32 pointers and are copied into +a callee-isolated frame buffer; varargs are caller-packed into a uniform-slot +linear-memory buffer with a hidden trailing i32 pointer. + +**Emit (`emit.c`).** The largest piece, and the WIR→`WasmFunc` linearizer. It +allocates Wasm locals, interns function types (`wasm_intern_func_type`), lays out +the linear-memory data image compactly (SF_ALLOC sections assigned aligned bases +from a low null guard upward, with symbol-address fixups patched once every +section size is known), builds the func-ref table for address-taken functions, +and emits the structured body. It also lowers atomics through the wasm-threads +opcodes (promoting the single memory to `shared` on first use), compiler +intrinsics (clz/ctz/popcount/bswap, checked-overflow arithmetic, memcpy/memset +via `memory.copy`/`fill`), and the standard-runtime conventions: linear memory +exported as `"memory"`, functions exported by user name with no synthesized +`_start`, and undefined function symbols promoted to `(import "env" "<sym>" ...)` +honoring `__attribute__((import_module/import_name))` overrides (the C frontend +records those into a side table under `OBJ_EXT_WASM_IMPORTS`, read back here). +The accumulating `WasmModule` is attached to the `ObjBuilder` under +`OBJ_EXT_WASM` for the object backend to flush. + +## Object backend: `src/obj/wasm` (minimal) + +The object backend is intentionally small and one-directional today. `emit.c`'s +`emit_wasm` reads the `WasmModule` the codegen backend attached under +`OBJ_EXT_WASM` and flushes it through the shared `wasm_encode` (or, when no module +is attached, writes a bare magic+version header). It does **not** synthesize a +relocatable object: `emit_wasm` produces a single-TU *final module* with no +`linking`/`reloc.*` custom sections and no separate object-metadata structure. 
+ +`read.c`'s `read_wasm` is the reverse glue used by the linker/objdump path +([LINK.md](LINK.md)): it mirrors each binary section into a format-neutral +`ObjBuilder` section carrying the original payload bytes (so `objdump -h/-s` +show the real container, the code section marked `SF_EXEC` so `-d` disassembles +it as WAT), and adds one function symbol per defined function (named from the +name section, an export, or a synthesized placeholder, with the symbol value the +byte offset of the body's locals vector so disassembly lines up). It decodes the +full module only to recover names; container metadata beyond raw bytes is not +otherwise interpreted. These hooks plug into `ObjFormatImpl` in +`src/obj/registry.c` under `CFREE_OBJ_WASM` / `CFREE_BIN_WASM`. The reserved +Wasm relocation kinds in `obj/obj.h` (`R_WASM_FUNCIDX`, `R_WASM_TABLEIDX`, +`R_WASM_MEMOFS`, `R_WASM_TYPEIDX`) and the custom-section semantic tag +`SSEM_WASM_CUSTOM` are the format-neutral vocabulary the generic object/link +registry holds for Wasm, so relocatable-object support can grow on this layer +without disturbing the format-neutral core. + +## Cross-tree boundary, restated + +The whole arrangement holds together because each tier depends only downward, +through `src/wasm/wasm.h`: + +- `src/wasm` knows the format and the type system, nothing about callers. +- `lang/wasm` consumes the model to *read* Wasm into native code; it stays on + public APIs and owns the instance/runtime contract. +- `src/arch/wasm` produces the model from CG; it owns the WIR, the structurer, + the wasm32 ABI, and the linear-memory/import conventions. +- `src/obj/wasm` adapts the model to/from the format-neutral `ObjBuilder` and the + generic object/link registry. 
+ +Module metadata that does not belong in any one tier rides on the shared +`ObjBuilder` via typed extension payloads (`OBJ_EXT_WASM` for the module itself, +`OBJ_EXT_WASM_IMPORTS` for import-attribute overrides), keeping the +format-neutral object core free of Wasm specifics. diff --git a/doc/WASM_PARSE_CHECKLIST.md b/doc/WASM_PARSE_CHECKLIST.md @@ -1,95 +0,0 @@ -# Wasm backend — `test/parse` W-path checklist - -Status of the Wasm CGTarget against the `test/parse` C suite, path **W** -(`cfree cc -O0 -target wasm32-none -c case.c` → `cfree run -e test_main case.wasm`). - -- Host: arm64 (native JIT for the re-lowering). Opt level 0. -- 465 cases: **433 pass · 0 fail · 32 skip**. -- The skips below match run.sh's phased-rollout regex (reported SKIP). - The fails fall outside it and report as **FAIL** in the harness. -- Reproduce / re-probe: `CFREE_TEST_ALLOW_SKIP=1 ./test/parse/run.sh "" W`. - -## ✅ Fixed — wrong exit code (was 7, now 0) - -### Decoder UB (LEB128 sign-extend shifts into sign bit) -- [x] `6_5_58_large_integer_immediates` — `bin_sleb` accumulated in `int64_t`, - so a 7-bit group landing in bit 63 was UB. Now accumulates in `uint64_t` - and casts at the end (`src/wasm/decode.c`). -- [x] `rv64_large_imm_li` — same `bin_sleb` fix. - -### Misc lowering mismatches -- [x] `attr_p2_10_alias` — function aliases are now installed in `sym_to_func` - *before* function bodies are linearized (`wasm_emit_ir_module`), so a call - through the alias resolves to the target instead of allocating an empty - func with a result-type-mismatched body. -- [x] `builtin_22_ctz_long_widths` — clz/ctz/popcount now run at the operand - width (i64) and wrap to the i32 result (`emit_intrinsic_bit_op`). -- [x] `builtin_24_atomic_lock_free` — lock-free ceiling is the native atomic - width (`CG_MAX_ATOMIC_SIZE` = 8), not the pointer width; wasm32 has 4-byte - pointers but 8-byte atomics (`src/cg/atomic.c`). 
-- [x] `builtin_clear_cache_01` — `__builtin___clear_cache` is a no-op on wasm - and x86 (no I-cache to flush), matching GCC/Clang, instead of calling an - undefined `__clear_cache` (`lang/c/parse/parse_expr.c`). -- [x] `6_8_31_switch_char_extremes` — added the standard sign-extension - operators (`i32.extend8_s` … `i64.extend32_s`, opcodes 0xc0–0xc4) across - the wasm insn set; `emit_convert` uses them so a narrow signed promotion - (e.g. `signed char` → `int`) actually sign-extends in-register instead of - being treated as a same-valtype no-op. - -## ⏭️ Skip — phased-rollout (32, reported SKIP) - -### `long double` — `wasm: long double not supported` -Wasm now advertises binary128 `long double` (clang/LLVM convention); the backend -fatals when a value is materialized, so these report SKIP instead of silently -returning 0. `ldbl128_01_layout_macros` still PASSES (compile-time layout checks only). -- [ ] `6_7_2_12_long_double` -- [ ] `ldbl128_02_literal_to_int` -- [ ] `ldbl128_03_arith` -- [ ] `ldbl128_04_conversions` -- [ ] `ldbl128_05_compare` -- [ ] `ldbl128_06_call_return` -- [ ] `ldbl128_07_struct_storage` -- [ ] `ldbl128_08_literal_bits` -- [ ] `ldbl128_09_global_init` -- [ ] `ldbl128_10_unary_neg` -- [ ] `ldbl128_11_array_copy` -- [ ] `ldbl128_12_stack_args` -- [ ] `ldbl128_13_mixed_arith` -- [ ] `ldbl128_14_struct_return` -- [ ] `ldbl128_15_arbitrary_mul` - - -### `__int128` ABI — `wasm32 ABI: scalar 16-byte values are not supported` -- [ ] `i128_02_literal_storage` -- [ ] `i128_03_add_sub_carry` -- [ ] `i128_04_mul_high_half` -- [ ] `i128_05_div_mod` -- [ ] `i128_06_shifts_bitwise` -- [ ] `i128_07_compare` -- [ ] `i128_08_signed_shift_convert` -- [ ] `i128_09_call_return` -- [ ] `i128_10_struct_storage` -- [ ] `i128_11_union_lanes` -- [ ] `i128_12_global_init` -- [ ] `i128_13_signed_div_mod` -- [ ] `i128_14_arbitrary_mul` - -### Other -- [ ] `asm_02_file_scope` — `wasm target: address of undefined symbol not yet implemented`. 
- Implementable via the wasm object linking section + relocations: - `R_WASM_MEMORY_ADDR_{LEB,SLEB,I32,I64}` for data symbols, - `R_WASM_TABLE_INDEX_{SLEB,I32}` for address-taken functions (which go in - the indirect-call table). The undefined symbol gets a `SYMBOL_INFO` with - `WASM_SYM_UNDEFINED`; `wasm-ld` resolves at link time. Same machinery - needed for any cross-TU function pointer — shared with `attr_p2_08`. -- [ ] `attr_p2_08_weak_undef` — `wasm target: address of undefined symbol not yet implemented`. - Implementable, same mechanism as `asm_02_file_scope` plus - `WASM_SYM_BINDING_WEAK` in the symbol flags. `wasm-ld` resolves a weak - undef to 0 (data) or to a trapping stub (code) if nothing defines it. -- [ ] `builtin_26_sadd_overflow` — `wasm target: 64-bit checked-overflow multiply is not yet supported`. - Implementable as a software lowering: wasm has `i64.mul` but no widening - or flag-producing form, so synthesize a 64×64→128 multiply by splitting - each i64 into two i32 halves, doing four i32×i32→i64 partial products, - and checking the high 64 bits against sign-extension of the low 64 - (signed) or non-zero (unsigned). ~dozen wasm ops inlined, or a runtime - helper — same shape as the 32-bit software-mul path other backends use. diff --git a/doc/WINDOWS.md b/doc/WINDOWS.md @@ -1,408 +0,0 @@ -# Windows / PE-COFF support - -This document describes the Windows target support in `cfree` as it -exists now. It is no longer a bring-up plan: x64 and aarch64 PE/COFF -object emission, PE executable linking, mingw import library ingestion, -Windows ABI selection, and llvm-mingw UCRT hosted links are implemented. - -## Scope - -Supported targets: - -- `x86_64-windows` -- `aarch64-windows` - -The Windows path is 64-bit only. i386 Win32 is out of scope because -`cfree` has no 32-bit x86 backend. - -The intended hosted profile is mingw/llvm-mingw UCRT, not MSVC. 
cfree -links against llvm-mingw's CRT and import archives and emits PE32+ -executables that import UCRT API-set DLLs and system DLLs such as -`KERNEL32.dll`. - -Non-goals for the current Windows path: - -- SEH unwind metadata and C++ exception interop through cfree frames -- `.pdata` / `.xdata` emission for cfree-generated functions -- PDB, CodeView, windbg integration, and MSVC object/debug parity -- ARM64EC ABI support -- legacy MSVCRT as a separately selectable hosted profile - -## Current Status - -The implemented path can: - -- compile C to relocatable PE/COFF objects for x64 and aarch64 -- read and write COFF objects, including COMDAT, weak externals, - common symbols, section aux records, and per-arch relocations -- link PE32+ executables directly with `cfree ld` / `cfree cc` -- ingest mingw import archives and synthesize PE import tables -- link llvm-mingw UCRT startup objects, CRT archives, and system import - libraries -- run trivial x64 and aarch64 Windows executables under Wine through - Debian podman containers -- select the Win64 x64 and Windows AArch64 ABI through the normal - `(arch, os)` ABI dispatch -- emit Windows driver defaults such as `.obj`, `.exe`, Windows - predefined macros, subsystem selection, and sysroot library search - -Validated smoke coverage includes: - -- COFF round-trip: 22 hand-built ObjBuilder cases, byte-stable -- PE import unit smoke: synthetic short import to linked `.exe` -- PE import mingw smoke: real `libkernel32.a` to linked `.exe` -- llvm-mingw UCRT hosted x64 and aarch64 console executables -- x64 and aarch64 `windows.h` console and `-mwindows` GUI links -- Debian podman + Wine execution for x64 and aarch64 return-code - propagation -- x64 and aarch64 `Sleep` smoke execution through `KERNEL32.dll` -- x64 and aarch64 `windows.h` coverage for handles, callback typedefs, - wide APIs, `winbase`, `processthreadsapi`, `synchapi`, `fileapi`, - `errhandlingapi`, `winuser`, inline helpers, and macro-heavy declarations -- x64 
and aarch64 Wine runtime coverage for `argc` / `argv` / `envp`, - stdout/stderr handles, heap allocation, file I/O, error codes, and - callback execution through `qsort` -- x64 and aarch64 Wine runtime coverage for UCRT stdio entry points and - imported data reads through `__dcrt_initial_narrow_environment` -- cfree-emitted TLS variables on x64 and aarch64, including PE TLS - directory presence and Wine runtime execution when matching Wine - containers are available -- system-DLL coverage for `user32` + `gdi32` GUI links, `gdi32` drawing - via memory DC + stock objects, `advapi32` registry open/query, - `ws2_32` Winsock startup/socket/closesocket/cleanup, `ole32` - CoInitializeEx / CoUninitialize, `shell32` `CommandLineToArgvW`, - `comctl32` `InitCommonControls(Ex)`, and a mixed-member `libucrt.a` - case that pulls in both an `api-ms-win-crt-*` short-import and a - real `lib64_libmingwex_a-*.o` stdio helper — x64 and aarch64, - link-level imports verified via `cfree objdump -p` and exit code - checked under Debian podman Wine when the matching container is - available -- ABI classifier tests for x64/aa64 Windows alongside Linux/macOS - -The remaining work is coverage and polish, not first-link bring-up. See -the checklist at the end of this file. - -## Compile, Link, And Run Under Wine - -Use llvm-mingw UCRT for run-on-Wine validation on both architectures. -The Homebrew `mingw-w64` x64 sysroot is still useful for object and -import-library tests, but its legacy MSVCRT profile can import CRT entry -points that Debian bookworm Wine does not implement -(`msvcrt.dll.__acrt_iob_func`). Prefer llvm-mingw UCRT for executable -runtime checks. - -On this host, `podman --arch ...` is the reliable way to select the -container architecture. `--platform linux/amd64` has not consistently -selected amd64 under the qemu-backed podman setup. The minimal Debian -Wine package exposes the launcher as `/usr/lib/wine/wine64`. 
- -Build one trivial return-code executable per Windows target: - -```sh -UCRT_ROOT=/private/tmp/llvm-mingw/llvm-mingw-20260519-ucrt-macos-universal - -cat >/tmp/ret7.c <<'SRC' -int main(void) { return 7; } -SRC - -build/cfree cc -target x86_64-windows \ - --sysroot "$UCRT_ROOT/x86_64-w64-mingw32" \ - /tmp/ret7.c -o build/test/ret7-x64-ucrt-windows.exe - -build/cfree cc -target aarch64-windows \ - --sysroot "$UCRT_ROOT/aarch64-w64-mingw32" \ - /tmp/ret7.c -o build/test/ret7-arm64-windows.exe -``` - -Build the Debian Wine containers once: - -```sh -podman build --arch amd64 -t localhost/cfree-wine-amd64 - <<'EOF' -FROM docker.io/library/debian:bookworm -RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends wine64 \ - && rm -rf /var/lib/apt/lists/* -EOF - -podman build --arch arm64 -t localhost/cfree-wine-arm64 - <<'EOF' -FROM docker.io/library/debian:bookworm -RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends wine64 \ - && rm -rf /var/lib/apt/lists/* -EOF -``` - -Run the executables through Wine and assert the process exit code: - -```sh -podman run --rm --arch amd64 -v "$PWD:/work:ro" \ - localhost/cfree-wine-amd64 \ - bash -lc ' - export WINEDEBUG=-all WINEPREFIX=/tmp/wineprefix - /usr/lib/wine/wine64 /work/build/test/ret7-x64-ucrt-windows.exe - rc=$? - echo "x64 exit=$rc" - test "$rc" -eq 7 - ' - -podman run --rm --arch arm64 -v "$PWD:/work:ro" \ - localhost/cfree-wine-arm64 \ - bash -lc ' - export WINEDEBUG=-all WINEPREFIX=/tmp/wineprefix - /usr/lib/wine/wine64 /work/build/test/ret7-arm64-windows.exe - rc=$? - echo "arm64 exit=$rc" - test "$rc" -eq 7 - ' -``` - -## Design - -### Target And Driver - -`driver/lib/target.c` recognizes `x86_64-windows` and -`aarch64-windows`, sets `CFREE_OS_WINDOWS`, and selects `CFREE_OBJ_COFF`. -Windows targets use `.obj` for relocatable output and `.exe` for linked -programs. 
- -The driver defines the mingw-style Windows macros: - -- `_WIN32` -- `_WIN64` -- `__MINGW32__` -- `__MINGW64__` -- `_M_X64` / `_M_AMD64` for x64 -- `_M_ARM64` for aarch64 - -For `cfree cc --sysroot <mingw-target-sysroot>`, the driver adds the -target sysroot library directory and links the mingw CRT startup and -runtime archives around user objects. The hosted UCRT profile uses: - -- `crt2.o` -- `crtbegin.o` -- `libmingw32.a` -- `libmoldname.a` -- `libmingwex.a` -- `libmsvcrt.a` -- system import libraries such as `libkernel32.a` -- `crtend.o` - -In llvm-mingw UCRT, `libmsvcrt.a` is a compatibility/import archive. -The final PE should import `api-ms-win-crt-*.dll` API-set DLLs, not -literal `msvcrt.dll` or `ucrt.dll`. - -The linker supports console and GUI subsystem selection through -`ld --subsystem=windows`, `ld --ms-link-driver /SUBSYSTEM:WINDOWS`, -`cc -mwindows`, and `cc -Wl,/SUBSYSTEM:WINDOWS`. Console is the default. - -### ABI And Code Generation - -The ABI vtable is selected by `(arch, os)`. - -x64 Windows uses the Win64 calling convention: - -- integer/pointer args in `RCX`, `RDX`, `R8`, `R9` -- floating args in `XMM0`-`XMM3` -- 32-byte caller shadow space -- `RBX`, `RBP`, `RDI`, `RSI`, `RSP`, and `R12`-`R15` callee-saved -- `XMM6`-`XMM15` callee-saved when used -- varargs duplicate floating-point arguments into the paired integer - argument registers -- `va_list` is pointer-shaped - -Large Win64 stack frames emit `__chkstk` probes. The prologue loads the -frame size into `EAX`, calls `__chkstk`, then subtracts the probed size -from `RSP`. mingw's `libmingwex.a` supplies the hosted symbol. - -AArch64 Windows mostly follows AAPCS64, with the Windows `va_list` -layout handled through the target ABI path. Variadic functions use a -pointer-shaped `va_list`; floating-point arguments to variadic functions -are carried in integer argument slots, matching llvm-mingw/Clang and the -UCRT stdio wrappers. 
cfree also accepts ARM64EC COFF machine values as -AArch64 where the object encoding is identical; ARM64EC ABI interop is -still out of scope. - -`long double` is 64-bit `double` on Windows. `__int128` follows the -mingw/GCC split into two GPR slots on Win64 rather than MSVC's -pass-by-reference rule. - -### PE/COFF Objects - -`emit_coff` and `read_coff` implement relocatable COFF object support -through the normal `ObjBuilder` API. - -The object path handles: - -- file, section, symbol, auxiliary symbol, relocation, and string-table - wire records -- AMD64 and ARM64 machine types -- COMDAT section groups and SELECTANY-style deduplication -- weak externals and mingw alias fallback -- common symbols -- COFF section characteristics and alignment -- section-relative and architecture-specific relocations -- long section names through the COFF string table -- short import records and long-form mingw import members - -The reader preserves enough COFF-specific metadata in object extension -fields for round-trip stability while normalizing the information the -linker needs. - -### PE Linker - -`link_emit_image_writer` dispatches COFF targets to the PE writer. The -PE writer emits: - -- DOS stub and PE/COFF headers -- PE32+ optional header -- loadable sections with Windows alignment -- `.idata` import descriptors, ILT/IAT blocks, hint/name tables, and - per-DLL grouping -- per-architecture import call stubs -- `.reloc` base relocation blocks for absolute addresses -- TLS directory records and relocations for the directory fields -- subsystem and entry-point selection -- image identifiers through the shared `link_image_id_compute` path - -The default console entry point resolves to `mainCRTStartup`. GUI links -default to `WinMainCRTStartup` when the subsystem is Windows GUI. - -### Imports, DLLs, And IAT - -COFF has no ELF-style GOT/PLT model. cfree emits direct references in -the object and the linker resolves imported functions through IAT -slots and import stubs. 
- -The import reader handles both: - -- short import records (`Sig1=0`, `Sig2=0xffff`) -- long-form mingw import archive members such as those in - `libkernel32.a` - -Archive ingestion classifies import members as DSO shims, preserves -per-member DLL names, and skips head/trailer members. The PE linker then -builds one import descriptor per DLL. - -Imported data aliases such as `__imp_<name>` are object-like IAT data -slots, not callable function imports. The PE hint/name table strips the -`__imp_` prefix only for the exported symbol name. - -`read_coff_dso` can walk raw PE DLL export directories for named -exports. Forwarder ENT entries (EAT RVA inside the export directory's -own range, contents `OTHERDLL.OtherSym`) are surfaced as defined -symbols so the linker can satisfy imports against them; cfree does -not chase the chain at link time — the OS loader follows it at -runtime, which is how `api-ms-win-crt-*.dll` resolves to -`ucrtbase.dll`. The contract is pinned by -`test/coff/pe-dso-forwarder.c`. Ordinal-only exports (entries present -in the EAT but absent from the ENT) and ordinal-only short imports -(`NameType=IMPORT_OBJECT_ORDINAL` in a short-import archive member) -are not yet implemented: the latter fails with a clean diagnostic -naming the offending archive member and ordinal rather than an -internal panic. No mingw / llvm-mingw sysroot archive on the -supported targets uses either shape. - -Mixed-member archives — where one `.a` file contains both short-import -members and full long-form COFF object members — are ingested in a -single pass: each member is classified independently, short-import -records route through `read_coff_short_import` and become DSO inputs -keyed by the embedded DLL name, while long-form members fall through -to `read_coff` as regular objects. `libucrt.a` uses exactly this shape -(`api-ms-win-crt-*.dll` short imports alongside -`lib64_libucrt_extra_a-*.o` helpers). The composition is pinned by -`test/coff/pe-mixed-archive.c`. 
- -### TLS - -COFF TLS data is materialized into `.tls$` sections. Code generation -uses Windows TLS access: - -- x64: `gs:[0x58] + _tls_index * 8 + SECREL(sym)` -- aarch64: `x18` (TEB), then the Windows TLS slot at `+0x58`, then - `_tls_index * 8 + SECREL(sym)` - -The PE writer emits a TLS directory in `.rdata` and base relocations for -the directory's absolute fields. The optional-header TLS data directory -and the mingw-visible `_tls_used` symbol both name that same record. The -hosted UCRT smoke compiles and runs cfree-emitted TLS variables on both -x64 and aarch64 under Wine when the matching podman images are -installed. - -### Tooling - -Windows support is wired into the existing tools: - -- `objdump -p` prints PE image headers, data directories, and imports -- `objdump -h` decodes raw `IMAGE_SECTION_HEADER.Characteristics` into - GNU-objdump-style tags (`LINK_ONCE`, `DISCARDABLE`, `LINK_REMOVE`, - `SHARED`, `GPREL`, ...) for both COFF .obj inputs and PE images -- `objdump -f` summarizes architecture, format, section/symbol counts, - and (for PE images) image base / entry point / subsystem -- `objdump -h` also prints COMDAT group membership immediately after - the section table -- `objcopy` and `strip` accept COFF inputs -- object detection distinguishes COFF objects from PE images -- `ld --ms-link-driver` accepts common MS-link spellings such as - `/OUT`, `/ENTRY`, `/LIBPATH`, `/DEFAULTLIB`, and `/SUBSYSTEM` - -## Test Expectations - -Current test layers: - -- **COFF unit**: hand-built ObjBuilder to emit/read round-trip, - byte-stable -- **mingw fixtures**: mingw-built `.obj` inputs read and re-emitted -- **cfree codegen**: `cfree -target windows -c` objects linked by - external mingw tools where useful -- **cfree linker**: `cfree cc` / `cfree ld` emits PE executables -- **Wine execution**: produced `.exe` files run under Debian podman - Wine containers for amd64 and arm64 -- **bad inputs**: malformed PE/COFF inputs should diagnose cleanly -- **header 
ingestion**: `cfree cc` against llvm-mingw headers -- **DLL/import reader**: raw PE DLL and import-library absorption - -The harness should skip Windows-target tests with a clear -`SKIP: no mingw` message when the required toolchain is unavailable. -Set `CFREE_TEST_HAS_MINGW=1` to require mingw/llvm-mingw test inputs. -Wine execution should remain gated so normal local test runs do not -require podman or Wine. - -## Remaining Checklist - -- [x] Broaden `windows.h` coverage beyond the current trivial smoke. - Add focused cases for handles, structs, callback typedefs, wide - APIs, selected `winbase`, `processthreadsapi`, `synchapi`, - `fileapi`, `errhandlingapi`, `winuser`, inline helpers, and - macro-heavy declarations. -- [x] Broaden runtime execution under Wine on both x64 and aarch64. - Covered `argc` / `argv` / `envp`, stdout/stderr handles, heap - allocation, file I/O, error codes, callbacks, GUI `WinMain` links, - and cfree-emitted TLS variables. -- [x] Continue broadening runtime execution under Wine for UCRT stdio - on aarch64 and imported data reads. -- [x] Fix aarch64 Windows variadic UCRT stdio calls such as - `printf("x:%d\n", 42)` and floating-point printf arguments. - cfree now matches llvm-mingw/Clang argument lowering; Debian Wine - 8.0's aarch64 UCRT still crashes on formatted `printf` arguments - for clang-built binaries too, so Wine runtime coverage remains on - non-formatted UCRT stdio calls there. -- [x] Add runtime tests for cfree-emitted TLS variables on both - architectures. -- [x] Finish `_tls_used` symbol synthesis for the PE TLS contract. -- [x] Expand DLL/import-library coverage for forwarded exports, - ordinal-only exports, mixed import-library members, and larger - system libraries. 
Forwarders pinned by - `test/coff/pe-dso-forwarder.c`; ordinal-only short imports - currently diagnose cleanly rather than panic; mixed-member - archives covered by `test/coff/pe-mixed-archive.c`; broader - system-DLL link + Wine coverage in - `test/coff/windows-system-dlls-smoke.sh` (user32/gdi32, - advapi32, ws2_32, ole32, shell32, comctl32, mixed UCRT). -- [ ] Implement ordinal-only short imports if a real consumer surfaces - (today the path is a clean diagnostic, not a panic). -- [ ] Optionally walk forwarder chains at link time (today the OS - loader chases them at runtime). -- [x] Expand x64 hosted `windows.h` execution coverage to match the - aarch64 `Sleep` and GUI subsystem smokes. -- [x] Optionally expose richer COFF section characteristics in - `objdump` output. diff --git a/doc/plan/ARCH.md b/doc/plan/ARCH.md @@ -0,0 +1,159 @@ +# Arch-Backend Completeness (planned work) + +This roadmap consolidates the *remaining* native-backend work across the three +machine-code targets (aa64, x64, rv64). The bulk of the NativeTarget port -- the +single-pass (-O0) path and the known-frame (-O1) path for all three arches, plus +the asm/disasm/link-reloc/dwarf matrix -- is already in tree and is treated here +as the **baseline**, not as planned work. What follows is the genuinely-open +follow-up: per-arch hooks where x64/rv64 still trail the aa64 reference, +prologue/epilogue and tail-call cost-model parity, and a small set of niche +asm/disasm and debugger gaps. The backend abstraction and ABI layer this work +sits behind are documented in the design set: +[../CODEGEN.md](../CODEGEN.md), [../OPT.md](../OPT.md), [../ASM.md](../ASM.md), +[../DWARF.md](../DWARF.md). 
+ +## Baseline (done -- context, not planned work) + +- All three backends implement the full NativeTarget vtable on both the + single-pass (`NativeDirectTarget`) and known-frame (`func_begin_known_frame`) + paths: prologue/epilogue, `bind_param`, frame slots, calls/returns, atomics, + variadics, inline/file-scope asm, TLS (Local-Exec), and intrinsics. +- x64 carries both ABIs (SysV + Win64), including shadow space, callee-saved + XMMs, the `__chkstk` large-frame probe, and the SysV 176-byte variadic + register-save area. rv64 covers LP64D with the s0-anchored frame and Zba/Zbb + use where available. +- asm/disasm/link-reloc/dwarf parity across the OS matrix (ELF/COFF/Mach-O as + applicable) is in place: rv64 relocation emission, the x64 `.eh_frame` RBP + DWARF-reg fix, the aa64 FP/SIMD and x64 SSE disasm rows, named params/locals in + x64 and rv64 DWARF, and the shared LEB128 / `.comm` assembler directives. + +The items below are what is *not* yet at aa64 parity. + +## 1. Tail-call realization on x64 and rv64 (blocker-check removal) + +aa64 realizes sibling (tail) calls whenever the outgoing stack-argument area fits +the caller's incoming parameter window -- `aa_no_tail` returns a blocker *only* on +that size check. x64 and rv64 apply the same size check (x64 even +accounts for the shadow-space prefix), and the restore-before-jump *machinery* +already exists on both: `x64_emit_tail_site` / `rv_emit_tail_site` emit the +callee-save restore and frame teardown ahead of the tail jump exactly the way +aa64 does. What remains is conservatism in the realizability gate -- `x64_no_tail` +and `rv_no_tail` still bail out with `"callee-saved registers in use"` whenever +the function has any callee-save live (`frame.ncallee_saves != 0`), so those +functions fall back to a normal call + return even though the tail site could +handle them. 
+ +This is the single largest aa64-vs-rest divergence and matters most for the +recursion-heavy / interpreter-dispatch workloads that the O(1)-tail-call work +targets (see the interpreter and toy `musttail` tracks). + +- Remove the `ncallee_saves` guard from `x64_no_tail` / `rv_no_tail` so the + size check alone gates realizability, letting the existing tail-site restore + sequences run for callee-saves-live functions. +- x64: confirm the existing restore ordering interacts correctly with the SysV + vs Win64 callee-save sets (Win64 also saves XMM6-15) and with a forwarded sret + pointer before lifting the guard. +- rv64: confirm the s2-s11 / fs2-fs11 and s0/ra restore-then-`jr` sequence holds + for the previously-blocked frames once the guard is gone. +- Extend the tail-call test corpus to cover the callee-saves-live case on x64 and + rv64, since those paths were unexercised while the guard masked them. +- Win64 FP-arg tail-call interaction is noted as a deferred sub-case in the port + notes; validate or document the restriction explicitly. + +## 2. Prologue / epilogue cost-model parity (per-call overhead) + +The fixed per-call overhead -- prologue + epilogue + arg setup, independent of the +body -- is the dominant cost on call-heavy code. aa64 picks one of four frame +shapes per function to minimize it; x64 and rv64 currently emit a single +RBP-anchored / s0-anchored shape and do **not** have the fold tiers. The design +rationale lives in [../ARCH.md](../ARCH.md); the aa64 measurements and the +remaining body-level warts are tracked alongside [../OPT.md](../OPT.md) and +[OPTIMIZER.md](OPTIMIZER.md). 
+ +aa64 tiers (baseline, for reference): + +| tier | when | fixed insns | +| --- | --- | ---: | +| `slim_prologue` (Tier A) | no callee-saves, no alloca, no body slots, no outgoing stack | 3 (optimal) | +| `fp_at_bottom` | >=1 callee-save/body slot, **no outgoing stack args**, frame <= 504 | 5 (optimal) | +| `slim_small_frame` | as above but with outgoing stack args | 7 | +| fat | large frame / alloca / big saved-pair offset | 7+ | + +The known-frame asymmetry (bottom-record only on the -O1 path) is intentional: +the frame-size-dependent offsets require the frame to be final before the body, +which only the optimizer's frame planner guarantees. + +Planned: + +- **rv64 frame fold.** Port the `fp_at_bottom` idea: for known-frame functions + with no outgoing stack args and a small frame, place the saved s0/ra pair at the + bottom (s0 = sp) so the sp adjustment folds into the save/restore and + callee-saves stack above the record at positive offsets. RISC-V has no + pre/post-indexed store, so the fold is the address-arithmetic saving, not an + addressing-mode one -- quantify the win before committing. Add a leaf/no-frame + (`slim_prologue`-equivalent) tier for leaf functions with no callee-saves. +- **x64 frame fold / leaf omission.** Add the equivalent tiers for SysV and + Win64. x64 already emits the exact known-frame prologue (no placeholder/patch), + so this is shape selection in `func_begin_known_frame`, not a re-architecture. + SysV leaf functions can also exploit the 128-byte red zone to skip the + `sub rsp` entirely -- design and gate this carefully against alloca and any + outgoing-arg use. +- **Cost-model alignment.** `signature_stack_bytes` / `call_stack_bytes` are the + shared hooks the optimizer uses to size the outgoing area and gate tail-call + realizability; they exist on all three. 
As the fold tiers and tail-call paths + land, verify the optimizer's per-call cost estimates reflect the cheaper + shapes so frame/spill decisions stay consistent across arches. + +Body-level per-call warts from the aa64 study that are arch-shared and still +open: + +- **Redundant branch chain.** An if/else merge can emit `b A; A: b B` -- a + branch to a label that just unconditionally branches onward. + `cleanup_layout_fallthrough_branches` in the jump pass does not yet thread this + shape; this is an optimizer pass fix, surfaced per-arch at the call site. + +## 3. x64 debugger step-out / unwind + +`cfree_dwarf_unwind_step` has no memory provider, and x64 (unlike aa64/rv64, which +have a link register) has no link-register fallback, so step-out can't recover the +return address from the stack. Compounding it, the JIT debugger doesn't populate +`.eh_frame` for in-process images. + +- Add a memory-reading unwind variant so the unwinder can read the saved RA / + RBP from the stack on x64. +- Populate `.eh_frame` (or an equivalent CFI source) for JIT in-process images so + the debugger has unwind data to consume. + +This is a debugging-UX robustness item with test-infra dependencies; see +[../DBG.md](../DBG.md) and [../DWARF.md](../DWARF.md). Sibling debugger roadmap: +[DEBUG.md](DEBUG.md). + +## 4. Niche assembler / disassembler gaps + +These are in the standalone `as` / inline-`asm()` encode-decode paths only. The +compiler's codegen emits machine code directly and never routes through the text +assembler, and the shipped runtime `.s`/`.S` files don't use these forms, so +none of this blocks any build. They are GNU-as / llvm-mc parity gaps for +hand-written assembly. Design context: [../ASM.md](../ASM.md). + +- **aa64 atomics, remaining encode forms.** `CASP`, the LSE min/max family + (`ldsmax`/`ldsmin`/`ldumax`/`ldumin`), and `LDAPR`/`STLLR` are not yet encoded. 
+- **aa64 disasm rows for the new encode-only forms.** The recently-added + exclusive/LSE atomics, register-offset, and writeback load/store forms encode + correctly but have no decode rows, so a round-trip currently renders them as + `.inst`. Add the matching disasm rows. +- **TLS relocation modifiers in operands.** `:tprel_*:` (aa64) and `%tls_*` + (rv64) operand syntax is not yet accepted; the non-TLS modifiers + (`:lo12:`/`:got:`, `%hi`/`%lo`/`%pcrel_*`, x64 `@PLT`/`@GOTPCREL`) are done. +- **`.L`-prefixed local-label spellings in operand references.** Plain labels + work (including as the `%pcrel_lo` anchor); the `.L`-prefixed spelling in an + operand position is a shared-lexer change. + +## 5. Cross-cutting hygiene + +- Keep the three backends converging on the shared `NativeFrame` / `native_argmove` + (parallel-copy shuffle) scaffolding rather than re-implementing per-arch; new + fold tiers and tail-call paths should reuse it. +- As each gap above closes, prefer locking it in with a targeted corpus case + (per-arch, per-form) over broad sweeps, per the testing guidance in + [../TESTING.md](../TESTING.md). diff --git a/doc/plan/BOOTSTRAP.md b/doc/plan/BOOTSTRAP.md @@ -0,0 +1,159 @@ +# Self-Build Bootstrap (current state and roadmap) + +This roadmap covers the staged self-build of cfree: building the compiler with +itself until it reproduces its own output byte-for-byte. The mechanics and +products of the build are described in [../BUILD.md](../BUILD.md); this document +tracks the reproducibility goal, the current baseline, the open problems that +remain, and the next steps for widening coverage. The bootstrap is the strongest +end-to-end correctness oracle in the project, because it exercises the C +frontend, every optimizer pass, the native backends, the object writers, the +linker, and the archive tools, all on the compiler's own source. 
+ +## Goal: a self-reproducing fixed point + +The bootstrap builds cfree three times and requires the last two stages to be +identical: + +- **stage1** = the host-built cfree, copied aside, exposing `cc`/`ld`/`ar`/`ranlib`/`as`. +- **stage2** = the whole tree rebuilt with stage1 as the toolchain (`CC`/`AR`/`LD`). +- **stage3** = the whole tree rebuilt again with stage2 as the toolchain. +- The invariant is `cmp stage2/cfree stage3/cfree` — they must be byte-identical. + +Stage2 vs stage3 is the fixed point: once the compiler reproduces itself, a third +pass cannot change anything. The bootstrap drives the *normal* Makefile with +`CC`/`AR`/`LD` repointed at each stage's symlinks, so there is no separate build +system to maintain — it is the same rules run with cfree as the toolchain. This +depends on the reproducible-build guarantees in [../BUILD.md](../BUILD.md) +(deterministic ordering, no embedded timestamps/paths); any nondeterminism in +codegen or object layout surfaces here as a stage2/stage3 mismatch. + +Driving targets (see [../BUILD.md](../BUILD.md)): + +- `make bootstrap` runs both the debug (`-O0`) and release (`-O1`) chains. +- `make bootstrap-debug` / `make bootstrap-release` run one chain. +- `make bootstrap-test-toy` additionally runs the Toy corpus through the + bootstrapped compiler as a behavioral check on top of the byte-identity check. + +## Current baseline + +Done (baseline) on aarch64-macos: + +- Both the `-O0` (debug) and `-O1` (release) chains reach the fixed point: + `cmp stage2/cfree stage3/cfree` is clean, and the per-object check across all + `*.o` in stage2 vs stage3 reports zero differences in both modes. +- Both bootstrapped compilers run the full Toy corpus clean (1034 pass, 0 fail, + 8 skip) across the run, link/native, C-backend, and Wasm paths at Toy opt + levels 0 and 1. + +This gives one fully self-hosting configuration. The remaining work is breadth: +other targets, other host platforms, and guarding the property over time. 
+ +## Open problems and next steps + +### Widen target and platform coverage + +The fixed point is currently demonstrated only for aarch64-macos. The bootstrap +should hold for every supported native target and object format. Until each is +green it is an open question whether its backend + object writer are fully +deterministic and self-consistent. + +- [ ] Reach the fixed point on x86-64 (ELF and Mach-O) for both `-O0` and `-O1`. +- [ ] Reach the fixed point on rv64 (ELF) for both `-O0` and `-O1`. +- [ ] Reach the fixed point on aarch64-linux (ELF), distinct from the macOS + Mach-O path already covered. +- [ ] For each new configuration, run the per-object diff and the Toy corpus + through the bootstrapped compiler, not just the final `cmp`. + +These connect to the per-arch backend state tracked in [../CODEGEN.md](../CODEGEN.md) +and [../ARCH.md](../ARCH.md), and to the object/format paths in +[../OBJ.md](../OBJ.md) and [../LINK.md](../LINK.md) / [LINKER.md](LINKER.md). A +new arch's first bootstrap is also the most thorough regression test those +components get. + +### Guard the property over time + +The fixed point is easy to break with a single nondeterministic or +miscompiling change, and a regression is expensive to bisect after the fact. + +- [ ] Run `make bootstrap` (or at least one chain) in CI on the reference host so + breakage is caught at the offending change, not after multiple commits + have been made. +- [ ] Keep the per-object diff available as a first-line triage signal: it points + directly at the diverging translation unit, which is far cheaper than + diffing whole linked binaries. + +### Cross-bootstrap (stretch) + +The current chains are native (host arch building host arch). A cross-bootstrap — +host cfree building a stage2 for a *different* target, then validating that +stage2 reproduces a stage3 when run under [../EMU.md](../EMU.md) or on hardware — +would prove the backends independent of the host. 
This is a stretch goal that +depends on the emulator being able to host the full compiler. + +## Triage playbook for fixed-point regressions + +When a stage2/stage3 mismatch (or a stage3 link failure) appears, the following +approach has proven effective and should be the default starting point. + +**Use object reproduction, not just "does it link", as the oracle.** A stage3 +link failure is usually a *symptom* of a malformed object emitted earlier, not a +linker bug. The decisive question is whether stage2, used as a compiler, +reproduces the same `.o` that the host-built compiler produces. Compile one +suspect TU with both the host `cfree` and the stage2 `cfree` using identical +flags, then `cmp` the two objects. This separates malformed-object bugs from +link-driver symptoms and points straight at the diverging codegen. + +**Narrow with hybrid relinks.** Relink stage2 after replacing one suspect TU (or +one piece of a split TU) with a clang-built object, then use that stage2 to +compile the known-differing target object. This isolates whether a failure is in +the linker itself or in codegen for a specific source file. + +**Inspect MIR around the suspect symbol.** A temporary filtered MIR dump around +the target function, taken after lowering and the combine pass, is usually enough +to see the divergence (e.g. a call argument that should reference an allocable +register but instead references a backend scratch register). + +**Avoid `-g` while triaging `-O1` codegen.** Debug info changes object layout and +can create or hide layout-sensitive bugs; one historical "regalloc" diagnosis was +actually a `-g` artifact. Triage on the non-`-g` object first. + +## Root-cause classes seen at the fixed point + +These bug classes were responsible for past `-O1` fixed-point and stage3-link +failures. 
They are fixed in the baseline, but they map the parts of the pipeline +most likely to break the property again, so they are worth keeping in mind when a +new arch or platform is brought up. See [../OPT.md](../OPT.md) for the passes. + +- **Operand clobber in native emit.** Materializing the left operand of a binop, + compare, or compare-branch into a scratch register that already holds the right + operand. The general rule: compute the RHS location first and exclude its + register when materializing the LHS. A real instance produced `1 << 1` for + `1u << (n & 31)`, which corrupted Mach-O section alignments and only manifested + as a downstream `ld -r` / stage3 link failure. + +- **Copy propagation across backend scratch registers.** Treating backend scratch + registers as ordinary hard registers during the combine pass: scratch registers + may appear in lowered MIR, but they must not be extended across later + instructions, because native lowering reuses them as transient temporaries. + A real instance rewrote a stack-argument call operand back to scratch `x9`, + which was then clobbered before the store, sending an unrelated value into a + stack slot and flipping an inline-always flag. + +- **Coalesce overlap checks** must use raw range points, not compressed points. + +- **Lower-pass hint fallback** must not place values that are live across a call + into caller-saved hint registers. + +- **Native scratch budget.** A backend needs enough integer scratch registers for + all-spilled three-operand operations (aa64 needs three). + +- **Aggregate copy/set with pointer operands.** Pointer-valued operands of + aggregate copy/set must not force-home the pointer local; genuinely + frame-backed pointer locals need prematerialized indirect bases. 
+ +The throughline: the fragile interactions are between the optimizer's +register-level reasoning ([../OPT.md](../OPT.md)) and the backend's scratch-register +discipline ([../ARCH.md](../ARCH.md)), with the object/link layer +([../OBJ.md](../OBJ.md), [../LINK.md](../LINK.md)) as where the symptom finally +surfaces. New backends should expect to re-litigate these before reaching their +own fixed point. diff --git a/doc/plan/DEBUG.md b/doc/plan/DEBUG.md @@ -0,0 +1,252 @@ +# Debugger, Debug Info, and Profiling (planned work) + +This roadmap consolidates the remaining work across the interactive JIT +debugger (`cfree dbg`), the DWARF producer/consumer, and the not-yet-built +sampling profiler (`cfree prof`). Designs live one level up: +[../DBG.md](../DBG.md) covers the `CfreeJitSession` architecture, the +`CfreeDbgOs` host vtable, software breakpoints, and displaced single-step; +[../DWARF.md](../DWARF.md) covers the producer pipeline and the +`cfree_dwarf_*` consumer surface. This document is forward-looking: it states +the baseline only as a starting point, then enumerates the open gaps, their +rationale, and the next steps. Shipped items are noted as "done (baseline)". + +## Baseline + +What already works and is not re-planned here: + +- The JIT debugger session (`src/dbg/session.c`, `bp.c`, `mem.c`, `step.c`, + `displaced.c`) is real: worker thread, park/unpark, fault classification, + refcounted software breakpoints with a read overlay, guarded memory access, + displaced single-step, and the `STEP_LINE` / `NEXT_LINE` / `STEP_OUT` state + machines. Done (baseline). +- Displaced-step lifter implementations exist for all three backends. The + lifter is arch-neutral (`dbg_displaced_prepare` drives an `ArchDbgOps` + vtable); each backend ships `build_displaced_shim` + `decode_insn`: + `src/arch/aa64/dbg.c`, `src/arch/x64/dbg.c` (INT3 + RIP-relative/rel8/rel32 + fixups), `src/arch/rv64/dbg.c` (EBREAK + AUIPC/JAL/branch fixups). Done + (baseline). 
+- Session-level integration of displaced-step is complete only on aarch64 + hosts. The x64 and rv64 backends decode and fix up instructions correctly in + isolation, but their end-to-end REPL session loop (fault classification, trap + PC normalization, register marshalling) has not been validated; closing that + gap is §1 below. Done (baseline) for aarch64 only. +- The POSIX host adapter (`driver/env/posix_dbg.c`, macOS/Linux/FreeBSD + ucontext marshalling) and the Windows host adapter (`driver/env/windows.c`: + `g_dbg_os_win` with `AddVectoredExceptionHandler`, `Set`/`GetThreadContext` + interrupt path, `__try`/`__except` guarded copy) are both wired. Done + (baseline). +- The DWARF producer (`src/debug/debug*.c`: abbrev/form/emit, line program, + type DIEs, `.eh_frame` CFI) and consumer (`src/debug/dwarf_*.c`: + open/line/die/type/loc/query/cfi) are implemented and tested via `test-dwarf` + / `test-debug`, including multi-input `cfree_jit_view`, + `cfree_dwarf_line_to_addr` suffix matching, and graceful "no debug info for + this frame" degradation. Done (baseline). + +The sections below are the work that remains. + +## 1. Bring x64 / rv64 debug sessions to full parity + +The non-aarch64 lifters exist (per baseline) but `test/dbg/run.sh` self-skips +on any host that is not aarch64. The remaining work is proving the non-aa64 +sessions are real, not writing new lifters. + +- Run the full `test/dbg` transcript suite against x64 and rv64 hosts and fix + whatever the session-level integration surfaces (fault classification, trap + PC normalization, register marshalling). The backends decode and fix up + instructions correctly in isolation (`test-x64-dbg`); the gap is the + session/host loop around them. 
+- Decide and implement the `test-dbg` host policy: extend the + `Darwin/Linux + arm64/aarch64` allow-list in `test/dbg/run.sh` to admit x64 + and rv64 once green, and decide whether `test-dbg` self-skips or hard-fails + when the compiled backend/host cannot support a session at all. +- Remove the non-aarch64 degraded-mode warning in `driver/cmd/dbg.c` once x64 + and rv64 sessions are real, not just present. + +## 2. Displaced single-step: remaining instruction coverage + +The lifters cover the common PC-relative families per arch. One known decline +remains, plus general hardening: + +- aarch64 `LDR (literal)` **vector forms** (S/D/Q destinations) are declined + today (`src/arch/aa64/dbg.c` returns unsupported when `V==1`). These are + common in optimized and FP-heavy builds; synthesize the same indirect-load + shim used for the integer/`LDRSW` forms but targeting a vector register. WHY: + stepping otherwise fails at any FP literal load. +- Audit the x64 and rv64 lifters for analogous declined forms surfaced by §1's + parity testing, and either lift them or document the decline with a clear + `CFREE_UNSUPPORTED` path so the session degrades to "cannot step here" rather + than misbehaving. + +## 3. Direct dbg unit + smoke tests + +Much verification has gone through transcript tests; the low-level primitives +still lack focused unit coverage. Following red-green TDD (see +[../TESTING.md](../TESTING.md)): + +- `test/dbg/bp_patch_roundtrip`: install/clear at one address; assert byte + restore, refcount behavior, and the `dbg_bp_unpatch_read` overlay. +- `test/dbg/displaced_*`: one canned encoding from every PC-relative family, + per arch; assert shim bytes + literal-pool layout. (x64 has `test-x64-dbg`; + give aa64 and rv64 the same.) +- `test/dbg/guarded_copy_segv`: `read_mem` from NULL returns nonzero and the + worker survives the next resume. 
+- `test/smoke/dbg_hello`: scripted REPL against a JIT'd C source with a + golden-transcript diff (`b sym`, `r`, `c`, `s`, `x ADDR`, `p NAME`, `q`). +- `test/dbg/source_step`: scripted `n` / `step` / `finish`, asserting the + reported source line at each stop. +- Make session teardown explicit enough to test stopping while the worker is + parked. Note: `cfree_jit_session_free` deliberately leaks a worker parked + inside the signal handler (no async-safe unwind), so tests must account for + this rather than expect a clean join. + +## 4. REPL polish and machine-readable mode + +Shared REPL work that improves usability and unblocks tooling/IDE frontends: + +- Repeat the last stepping command on a blank line. +- Add memory-format variants for `x` (bytes, words, strings, pointers). +- Add a stable machine-readable transcript mode for tests and external tools, + keeping command parsing factored so an editor/IDE frontend can reuse the + command engine without scraping human output. WHY: the REPL is the only + programmatic entry to the session; scraping is brittle. +- Add Ctrl-C / interrupt transcript coverage where the host can test it + reliably (the interrupt path itself is wired on both POSIX and Windows). + +## 5. Toy and C REPL frontends + +The Toy frontend drives the debugger as the first REPL language; C support is +the larger follow-on. Design detail lives in [../FRONTENDS.md](../FRONTENDS.md). + +Toy result formatting and structured values: + +- Typed result formatting (and pretty-printing via type info) instead of always + rendering scalar `u64`/`i64` hex. +- Richer expression-thunk signatures so expressions can accept and return + non-integer values — pointers, floats, records, arrays/slices (the + `toy-structured-expr` red transcript is the spec). 
+- More readable diagnostics that keep the REPL usable after bad input, plus + better multi-line / unmatched-brace handling, and stable synthetic file names + so `list` and `file:line` breakpoints are predictable across runs. + +C as a REPL language (after the Toy experience is solid): + +- Teach the C frontend `CFREE_FRONTEND_INPUT_REPL_EXPR` / + `CFREE_FRONTEND_INPUT_REPL_BLOCK`, and preserve C declarations across snippets + without leaking frontend internals into the driver. +- Support C function calls through normal REPL expressions (no separate `call` + command), infer result types and print typed values, allow thunks to refer to + stopped-frame locals where feasible, and add transcript tests once Toy is + stable. + +## 6. DWARF producer/consumer gaps + +Producer and consumer are colocated under `src/debug/` but share only the wire +format (`dwarf_defs.h`); that boundary must hold for any new work. The +remaining gaps: + +- **Loclists for optimized code** (producer-only). The consumer already + resolves `DW_FORM_loclistx` against `.debug_loclists` (`dw_loclist_resolve` + in `src/debug/dwarf_loc.c`), and the producer models time-varying locations + as `DVL_LOCLIST` via `debug_loclist_new`/`_add` (`src/debug/debug.c`) — but + those producer entry points are placeholders that do not serialize a + `.debug_loclists` section. Realize the serialization so a variable's location + can vary by PC range. WHY: without it, `-O1`/`-O2` builds lose variable + locations the moment a value moves between register and frame slot — exactly + where a debugger is most needed. +- **Richer CFI register recovery.** The unwinder (`src/debug/dwarf_cfi.c`) + computes the caller CFA/PC and the return address but does not do CFA-relative + loads to recover arbitrary callee-saved registers — needed to show correct + register values in outer frames. +- **Composite locations.** Once opt generates split values, the loc-expr + evaluator needs `DW_OP_piece`; defer until opt synthesizes them. 
+- **`list file:line` for prebuilt inputs.** When the JIT image includes `.o` / + `.a` debug sections whose source file is not on disk, `driver/cmd/dbg.c` + should show the DWARF line number alone and omit the source snippet rather + than failing the listing. The line/symbol lookups already degrade; the + on-disk source read (via `env.file_io`) is missing. + +Explicitly deferred until a client needs them (carried forward, not planned): + +- **`.debug_macro`** — emit macro definition/expansion records so a debugger + can report `#define`d values. The SourceManager already tracks the + macro-expansion pseudo-files that can seed the records; cheap once wired. +- **Inlined-subroutine DIEs** — emit `DW_TAG_inlined_subroutine` with + abstract-origin links once the optimizer reports inline decisions to the CG + session (consumer-side `dw_build_subs` already indexes the tag). +- Split DWARF / `.dwo`, `.debug_pubnames`, and any `LSDA` / exception tables + (C has none). + +## 7. Sampling profiler — `cfree prof` (not yet built) + +A statistical CPU profiler that reuses the debugger's host signal +infrastructure. Nothing exists yet: no `prof` subcommand in `driver/main.c`, +no `src/dbg/prof.c`, no `on_sample` field on `CfreeDbgSignalOps`. Design +intent: SIGPROF fires on the worker, the handler walks the frame-pointer chain +into a pre-allocated ring buffer and returns **without parking** — the one +property that keeps sampling cheap and guest timing undisturbed — and PCs are +symbolicated after the guest exits. + +Public API (`include/cfree.h`): + +- Add `on_sample(void* session, void* ucontext)` to `CfreeDbgSignalOps` (NULL = + ignore SIGPROF); it receives the raw `ucontext_t*`, not a marshalled frame, + because it extracts only PC and FP on the hot path. 
+- Declare `CfreeProfBuf` (fixed-capacity sample ring: `pcs[PROF_MAX_DEPTH]` per + sample, `count`/`cap`/`dropped`) and `CfreeProfWriter` (post-run symbolication + callback vtable), plus `cfree_jit_session_prof_attach(session, buf)` (before + `session_call`) and `cfree_jit_session_prof_collect(session, buf, writer)`. + +Library (`src/dbg/prof.c`, freestanding C11): + +- `dbg_fp_walk(ucontext, sample)`: frame-pointer walk via + `dbg_os->guarded_copy` for every dereference; terminate on NULL / misaligned + / non-advancing FP or `PROF_MAX_DEPTH`. The three frame layouts are identical + (`[FP]` = saved FP, `[FP+8]` = saved LR/return addr); FP is x29 / rbp / + s0(x8). WHY: no DWARF or symbol lookup on the signal path — raw PCs only. +- `on_sample` body: capacity check, walk, append or bump `dropped` (non-atomic; + the single worker makes that safe). `prof_attach` / `prof_collect` bodies; + `prof_collect` symbolicates each PC via `cfree_jit_addr_to_sym` + + `cfree_dwarf_addr_to_line`, dispatching to the writer (may allocate freely). + +Host adapter (`driver/env/posix_dbg.c` and `windows.c`): + +- Add SIGPROF to the POSIX handler's signal set with an early-return path: + `if (signo == SIGPROF && on_sample) { on_sample(...); return; }` — no + park/unpark. SIGPROF joins the blocked cohort so it does not recurse. Timer + arming (`setitimer(ITIMER_PROF)`) and thread targeting stay driver-side and + do **not** belong behind `CfreeDbgOs`. +- Decide the Windows sampling mechanism (no SIGPROF): a periodic + `SuspendThread` + `GetThreadContext` sampler thread is the natural analog of + the VEH interrupt path. WHY: the SIGPROF design has no direct Windows + equivalent, so this is a genuine open design question, not a port. + +Driver (`driver/cmd/prof.c`, wired into the multi-call dispatch in +`driver/main.c`): + +- Flags `--rate` (default 1ms), `--depth` (64), `--cap` (1M), `--output` + (`prof.folded`), `--no-folded`, `--no-flat`. 
Input handling mirrors + `cfree run`, with `-g` forced on so symbolication always has DWARF. +- Arm the timer before `session_call`, disarm after, then `prof_collect`. + Emit folded stacks (sorted + RLE for `flamegraph.pl`) plus a flat + self%/cumul% report to stdout and a dropped-sample warning. + +Tests: `test/smoke/prof_hello` (assert `prof.folded` is non-empty and `main` +appears), `test/dbg/fp_walk_*` (canned frame chains per arch, assert the PC +sequence and termination), and `test/dbg/prof_buf_overflow` (fill to capacity, +assert `dropped` increments and `count` caps). + +Profiler follow-ons (deferred): per-thread timers via +`timer_create(CLOCK_THREAD_CPUTIME_ID)` + `SIGEV_THREAD_ID` for multi-thread +guests; an `ITIMER_REAL` wall-clock mode for I/O-bound programs; allocation +profiling via a conditional breakpoint on the allocator; SpeedScope / pprof +output. + +## 8. Bigger follow-ons (cross-cutting) + +- **Watchpoints**, once `CGTarget` can express them without an ISA-specific + debug-register API. All breakpoints are software today; watchpoints need + hardware debug registers, hence the abstraction requirement. +- **Multi-threaded guests.** The session assumes one worker. Concurrent guest + threads require widening `CfreeDbgOs` with thread enumeration and per-tid + stop/event slots; this is also the prerequisite for reliable per-thread + profiler timer delivery in §7. Out of scope until a concrete need lands. diff --git a/doc/plan/IMAGE_INSPECT.md b/doc/plan/IMAGE_INSPECT.md @@ -0,0 +1,174 @@ +# Image Inspection (planned work) + +cfree can read relocatable objects through `cfree_obj_open`, and it has been +extended to also inspect *linked images* -- executables and shared objects -- +across the same neutral API. ELF and Mach-O reading have landed; the +remaining work is the COFF/PE image reader plus a handful of follow-ups. 
This +doc captures the goal, what is already baseline, and what is left, so the PE +work and any later refinements parallelize against a settled contract. The +matching design lives in [../OBJ.md](../OBJ.md); see also +[LINKER.md](LINKER.md) for the linker side and [../DBG.md](../DBG.md) / +[../DWARF.md](../DWARF.md) for the debug-info flow that rides on it. + +## Goal + +One `cfree_obj_open` call inspects any of: a relocatable object, an +executable, or a shared object, for ELF, Mach-O, and COFF/PE. Sections and +symbols keep working where the format still carries them; linked images +additionally expose segments, an entry point and image base, an interpreter / +SONAME, library dependencies and rpaths, and a dynamic symbol/relocation +table. `objdump` and the inherited tools (nm, size, addr2line) operate on +images the same way they operate on objects, with no per-format +special-casing in the driver. + +## Why this is a real extension, not a flag + +The original reader was relocatable-object-shaped. `cfree_obj_open` -> +`cfree_detect_target` -> `impl->read`, and the ELF backend rejected anything +but `ET_REL`. There were DSO readers (`read_elf_dso`, `read_coff_dso`, a +Mach-O dylib stub) but they were wired only into the *linker's* input path, +not the public `impl->read` / `cfree_obj_open` surface; `ET_EXEC` had no +reader at all. The in-memory model (`ObjBuilder`) was section / symbol / +reloc oriented with no notion of a **segment** (PT_LOAD), the **dynamic +table** (DT_NEEDED / SONAME / RPATH), an **entry point**, **image base**, +**imports**, or **data directories** -- which is exactly what image +inspection is about. The fix is a new image dimension on the model plus +neutral iterators, not a flag. 
+ +## Baseline (done) + +The neutral API and internal model are in place, and ELF + Mach-O image +reading work end to end: + +- **Neutral API** (`include/cfree/object.h`): `CfreeObjKind` + + `cfree_obj_kind`; `CfreeObjImageInfo` + `cfree_obj_image_info` (entry, image + base, interp, soname); segment iterator (`cfree_obj_segiter_*` over + `CfreeObjSegInfo` with `CFREE_SEG_R/W/X`); dependency iterator + (`cfree_obj_depiter_*`, carrying imported names for PE/Mach-O); rpath + iterator (`cfree_obj_rpathiter_*`); dynamic symbols and relocations reusing + the `CfreeObjSymIter` / `CfreeObjRelocIter` shapes via + `cfree_obj_dynsymiter_new` / `cfree_obj_dynreliter_new`. +- **Internal model** (`src/obj/obj.h`, `src/obj/obj.c`): an `ObjImage` hung + off `ObjBuilder`, NULL on pure relocatables. Readers call + `obj_image_ensure(ob, OBJ_KIND_*)` then setters/appenders for entry, base, + interp, soname, segments, deps, rpaths, dynsyms, and dynrelocs; + `obj_free` releases it. +- **Glue** (`src/api/object_file.c`): maps `ObjImage` to the public + iterators; relocatable inputs report `CFREE_OBJ_KIND_REL` with empty image + iterators and the section/symbol path unchanged. +- **ELF reader** (`src/obj/elf/read.c`): `read_elf` accepts `ET_EXEC` / + `ET_DYN`, sharing one path -- the old `e_type != ET_REL` guard is now a kind + switch. `read_elf_image` walks program headers for segments + PT_INTERP + + image base, and parses `.dynamic` for DT_NEEDED / DT_SONAME / DT_RPATH / + DT_RUNPATH plus the dynsym/dynstr/reloc pointers. A zeroed section-header + table is accepted for images (empty section view, segment view carries the + load picture). 
+- **Mach-O reader** (`src/obj/macho/read.c`): accepts `MH_EXECUTE` / + `MH_DYLIB`; `read_macho_image` re-walks load commands for segments + (`LC_SEGMENT_64`, `__TEXT` base, VM_PROT->OBJ_SEG perms), interp + (`LC_LOAD_DYLINKER`), soname (`LC_ID_DYLIB`), deps (`LC_LOAD_DYLIB` and + weak/reexport variants), rpaths (`LC_RPATH`), entry (`LC_MAIN` / + `LC_UNIXTHREAD`), dynamic symbols from the external `LC_SYMTAB` nlist + entries, and `LC_DYLD_CHAINED_FIXUPS` binds/rebases + (`DYLD_CHAINED_PTR_64`). Classic `LC_DYLD_INFO` and the exports trie are + intentionally not read; non-64-bit chained pointer formats are skipped + leniently. +- **objdump** (`driver/cmd/objdump.c`): grew `-p` / `--private-headers` + (program/dynamic headers, format-neutral via the image API), `-T` / + `--dynamic-syms`, and `-R` / `--dynamic-reloc`; `-f` reports the image type + flags and real entry point; `-h` / `-t` / `-d` work on executables. `-d` + falls back to disassembling X-perm `PT_LOAD` segments by vaddr when the + section walk is empty (stripped images), with no ELF special-casing. +- **Inherited tools**: nm, size, addr2line open images via `cfree_obj_open`. + nm grew `-D` (`.dynsym`); `CfreeObjSecInfo.addr` carries the load vaddr (0 + for relocatables) so SysV `size -A` reports real layout. (strings is + intentionally format-agnostic -- it scans raw bytes and does not call + `cfree_obj_open`.) +- **Debug-info retention in the linker**: `.debug_*` sections are carried + through to linked images as file-only sections with relocations resolved in + place, so `addr2line` / `dbg` resolve `file:line` on cfree-linked + executables (single- and multi-input, ELF and Mach-O). See + [../LINK.md](../LINK.md) and [../DWARF.md](../DWARF.md). + +## Remaining work + +### COFF/PE image reader (primary gap) + +PE is the one format whose linked images do not yet open through +`cfree_obj_open`. The COFF backend's `read` does not populate `ObjImage`, and +`read_coff_dso` is still wired only into the linker. 
As a result `objdump` +keeps a hand-rolled `pe_parse_image` raw-byte walker +(`driver/cmd/objdump.c:392`) behind a `CFREE_BIN_PE` special-case +(`driver/cmd/objdump.c:1831`) that serves `-f` / `-h` / `-p` and soft-errors +`-t` / `-d` / `-r` / `-s`. The plan: + +- Give the COFF backend a real image reader: DOS / NT headers, optional + header (entry point + image base + subsystem), data directories, the + section table, the import and export directories, and the base-relocation + table. Reuse / fold in `read_coff_dso`'s machinery so EXEC and DLL share one + path the way ELF EXEC/DYN do. +- Populate `ObjImage`: segments from sections + image base, deps from the + import directory (each DLL's imported names go into the per-dep imports + list), exports from the export directory into the dynamic symbol table, and + base relocations into the dynamic relocation view. +- Extend objdump `-p` to render the PE optional header + data directories, + `-T` for exports, `-R` for base relocations, all through the neutral image + API. +- **Delete** `pe_parse_image` and collapse the `CFREE_BIN_PE` branch in + `driver_objdump` into the normal `dump_obj` path once PE images open via + `cfree_obj_open`. + +### Escape hatch for format-specific raw fields + +Some inspection needs format-specific values that do not fit the neutral +model: raw DT_* tag values, raw Mach-O load commands, PE data-directory +entries. Surface these through a per-format escape hatch in the spirit of the +existing `cfree_obj_section_format_flags`, keeping the neutral API clean +rather than widening it per format. + +### Mach-O classic-format breadth (deferred) + +The Mach-O reader deliberately supports only the modern fixup path +(`LC_DYLD_CHAINED_FIXUPS`, with the symbol table as the authoritative +dynamic-symbol source). Classic `LC_DYLD_INFO` opcode/trie reading and the +exports trie (`LC_DYLD_EXPORTS_TRIE`) remain out of scope; reading older +dylibs is a separate, lower-priority effort.
Revisit only if a real input +demands it. + +## Out of scope + +- **Core files** (`ET_CORE`, Mach-O `MH_CORE`): `CFREE_OBJ_KIND_CORE` stays + defined but unimplemented; detect and reject cleanly. Note / register-state + parsing is a separate feature. +- **Synthesizing pseudo-sections from segments** on stripped ELF: not done, + which matches GNU `objdump` / `llvm-objdump`; both are section-header-driven + and report "no sections" when the table is absent. The segment view (and + `-d` over X-perm segments) covers the disassembly case. + +## Design notes carried forward + +- **One open call, two views.** `cfree_obj_open` detects kind (reusing + `cfree_detect_target` + `e_type` / `filetype` / PE characteristics) and + routes to the backend, which fills `ObjBuilder` (sections/symbols where + present) and, for EXEC/DYN, `ObjImage`. Tools that already use + `cfree_obj_open` inherit image support for free. +- **Segment is the load-layout unit.** `{ vaddr, vsize, file_off, file_size, + perms, align, name }`, populated from PT_LOAD / LC_SEGMENT_64 / PE sections. + Sections continue to map through the existing `ObjBuilder` view where the + format retains them; the segment view carries the load picture when section + headers are absent. +- **Dynamic syms/relocs reuse object shapes.** The dynamic symbol and + relocation iterators reuse `CfreeObjSymInfo` / `CfreeObjReloc` rather than + introducing parallel types, so consumers written for objects work on + images. + +## Test strategy + +The compiler links its own ELF / Mach-O / PE images, so tests round-trip: +link a small program, open it via `cfree_obj_open`, and assert +kind/entry/segments/deps/dynsyms against what the linker emitted, cross-checked +against host `readelf` / `objdump` in smoke tests where available. ELF and +Mach-O goldens live under `test/objdump/`; PE corpora will land under +`test/{coff,pe}/` alongside the PE image reader.
Dynamic NEEDED/SONAME/dynsym paths fully +exercise once `-shared` / dynamic linking emit populated tables; the +empty-table rendering is already covered. See [../TESTING.md](../TESTING.md). diff --git a/doc/plan/JIT.md b/doc/plan/JIT.md @@ -0,0 +1,271 @@ +# JIT (planned work) + +This roadmap covers the future of cfree's in-process JIT: the +append-only incremental linker, function-level hot reload built on top of +it, the managed-runtime (Go-style) codegen/JIT contracts, and the +remaining cross-host and parity gaps in `cfree run` / `cfree dbg`. The +implemented design lives in [../JIT.md](../JIT.md); the linker and +incremental-link mechanics are in [../LINK.md](../LINK.md) and +[LINKER.md](LINKER.md). This document records only what is still planned, +the open problems, and the order we intend to build them in. + +Baseline (the starting point): the JIT mapper (`src/link/link_jit.c`) +reserves one contiguous region, copies segments, applies relocations +against the runtime base, and flips to W^X final perms; ELF and Mach-O +reloc-apply (cross-TU GOT, weak-undef proximity, far-call stubs, ELF +IFUNC pre-resolution) are green, and the inspector surface +(`cfree_jit_view`, `cfree_jit_addr_to_sym`, `cfree_jit_sym_iter_*`) is +wired. Append-only extension already increments a JIT generation +(`cfree_jit_generation`) and stages relocations via +`link_append_reloc_slot`. Reload, managed-runtime hooks, and the +regression harness are not yet built. + +## 1. Function-level hot reload + +Hot reload adds *replacement* on top of append-only incremental linking. +In v1 only functions can be replaced; data symbols, TLS, type layouts, +initializers, destructors, and object lifetime are out of scope. The +first usable milestone: in `dbg`, reload a global function while the +worker is stopped, existing function pointers keep working, new calls hit +the new body, and old frames return safely. 
+ +### 1.1 Core idea — stable entry, indirected body + +Append-only linking can already add a new function body. Reload adds a +stable function entry that indirects through a slot to the current body: + +``` +foo entry/trampoline -> foo.slot -> current foo body +``` + +Reloading compiles and appends a new body, relocates it, then atomically +updates `foo.slot`. Existing pointers to `foo` stay valid because they +point at the stable entry, not a body generation. The baseline patches +one pointer-sized cell, not every call site. + +- Per-arch stable entry sequence (aarch64 `adrp/ldr/br`, x64 + `jmp *foo.slot(%rip)`, rv64 `auipc/ld/jr`). Entry lives in RX, slot in + writable data under the same W^X discipline as the rest of the image. +- Slot update is a pointer-width atomic aligned store so the + representation stays compatible with future multi-threaded sessions, + even though `dbg` v1 reloads only while the worker is stopped. +- Public symbol resolves to the stable entry; body symbols are internal + and generationed (`foo$body$0`, `foo$body$1`, ...). Inspector and + debugger present the public name; generationed bodies show only under + an internal/debug flag. + +### 1.2 Indirection is opt-in + +The trampoline cost is real and pointless for normal `cfree run`. Gate it +behind a JIT mode (`CFREE_JIT_INDIRECT_EXPORTED_FUNCS` vs +`CFREE_JIT_INDIRECT_NONE`) so AOT executable links are unaffected. v1 +restricts reload to global C-linkage functions visible to +`cfree_jit_lookup`; static functions need a stable synthetic identity +keyed on the containing TU before they qualify. + +### 1.3 ABI compatibility gate + +Reload must not change the ABI a caller already compiled against. The +C frontend should emit a compact per-definition ABI signature (target +arch/os, call conv, variadic flag, return class+size, arg classes+sizes; +fixed `CFREE_ABI_MAX_ARGS` bound, no VLA) so the linker can verify +runtime-callability without re-deriving C types. 
Functions whose +signature exceeds the bound are marked non-reloadable until a heap-backed +encoding exists. Missing/non-C signatures reject reload unless the user +explicitly opts into unchecked replacement. + +### 1.4 Replacement-object restrictions (v1) + +A replacement object may contain the new body, private helpers used only +by it, read-only literals it needs, debug sections, and undefined refs to +already-linked or resolver symbols. It may **not** contain new writable +globals, TLS, ctors/dtors, public definitions other than the target, or +colliding strong definitions. This keeps reload function-only in +practice, not just in name. + +### 1.5 Old-generation lifetime + +After a slot update, old bodies stay mapped. Even with the worker +stopped, a live frame may be inside the old function, so continuing must +be valid: existing frames finish in the old body, new calls enter the new +one. v1 never reclaims old generations until `cfree_jit_free`; later, +retire a generation only when the debugger/runtime can prove no stopped or +running frame has a PC or return address inside it. Never unmap old code +immediately after publishing. + +### 1.6 Debugger integration + +`dbg` must distinguish address breakpoints from symbol/source ones: + +- `b *0x1234` stays at that exact address, even in an old generation. +- `b foo` is rebound to the active generation after reload. +- `b file.c:42` is re-resolved after DWARF refresh; if re-resolution + fails, keep the old breakpoint but mark it stale in `info breakpoints`. + +Every reload bumps the JIT generation; `cfree_jit_view` rebuilds on +mismatch. The rebuilt view keeps old debug info so backtraces from old +frames still resolve, while name-to-address lookup prefers the latest +generation. + +### 1.7 Transactional publish and failure behavior + +Reload is all-or-nothing. ABI mismatch, disallowed data/TLS/init arrays, +unresolved symbols, out-of-capacity, or relocation failure all reject and +leave the old body active. 
Pages committed before a failure may remain as +dead space, but no public symbol or slot may ever point at them. + +### 1.8 Patch-site index (later, performance only) + +The correctness baseline needs no caller patching. A patch-site index +built from durable relocation records (target sym -> apply ids, owner +input -> apply ids, write section -> apply ids) enables a later fast mode +that patches only call sites targeting the reloaded symbol, plus future +non-function slot fixups. For v1, build the structures but use them only +in assertions/tests. + +### 1.9 Reload work items + +- [ ] Reloadable-function records + per-arch stable entry/slot emission, + gated by an indirection-mode JIT option. +- [ ] `cfree_jit_lookup` returns stable entries for reloadable functions. +- [ ] C-frontend ABI-signature emission per definition. +- [ ] Replacement-object validation (reject data/TLS/init/collisions). +- [ ] Append + relocate body, publish via atomic slot store, increment + generation. +- [ ] JIT-view/DWARF refresh + symbol/line breakpoint rebinding. +- [ ] `link_session_mark_reloadable` / `link_session_reload_function` + internal surface and `cfree_jit_reload_function` experimental API. +- [ ] Direct-call patching as a separate phase, after the baseline is + correct. + +## 2. Go-runtime-style codegen / JIT support + +Go is statically typed, so the gap is not dynamic typing in +[../CODEGEN.md](../CODEGEN.md)'s `CfreeCg` — it is the *managed runtime* +model: precise GC, goroutines, managed stacks, panic/defer/recover, and a +long-lived JIT image whose code and metadata evolve safely. Principles: +keep CG typed; lower source concepts to storage types plus runtime +metadata; make managed behavior explicit rather than inferred late; +keep AOT and ordinary C/Toy/Wasm JIT unchanged by default (managed +features are opt-in via code options or function attributes); keep all +runtime services on context/session structs, never global state. 
The +first useful milestone is far smaller than "compile Go": a tiny managed +frontend that allocates traced objects, hits a safepoint, lets the host +enumerate roots from a JIT stack map, and calls in through a C-callable +trampoline. + +### 2.1 CG interface extensions + +- **Precise GC metadata.** A `cfree_cg_safepoint` that records, per + safepoint PC, which frame slots / params / globals hold managed + pointers. The encoding should become compact backend stackmap data + (fast PC-to-stackmap lookup), not DWARF-only. +- **Managed pointer identity.** Distinguish raw from managed heap + pointers. `cfree_cg_type_ptr` already takes an address space; the + missing piece is policy (which spaces are scanned / movable / + non-moving / interior / raw). +- **Write barriers and allocation.** Don't make every frontend open-code + barriers. Add managed store ops or `GC_ALLOC` / `GC_WRITE_BARRIER` / + `GC_READ_BARRIER` intrinsics carrying object base, field offset, and + pointer kind; ordinary C stores stay ordinary. +- **Runtime calling convention.** A `CFREE_CG_CC_MANAGED` plus function + flags (`MANAGED_STACK`, `GC_SAFEPOINTS`) for a hidden goroutine/thread + context param, reserved registers, prologue stack checks, and a + helper-call ABI. Pairs with first-class multi-result functions to match + Go better than sret today. +- **Managed stacks / goroutines.** Prologue stack checks, grow/switch + runtime calls, live-pointer maps before a growth call, and frame + relocation metadata if stacks move — all as a managed-stack attribute, + not a default. +- **Panic and implicit checks.** Explicit check ops or trap-site metadata + (check kind, source location, recovery target, runtime helper) so the + JIT/debug layer can map a trap PC to a language panic path instead of + just a process signal. +- **Defer/recover.** Minimal path: lower defer management to runtime + calls and make panic edges explicit enough for stackmaps and stepping. 
+ Full cleanup/landing-pad metadata is deferred until semantics settle. + +### 2.2 JIT interface extensions + +- **Transactional publish.** Strengthen the publish contract: link + failure leaves the image unchanged, metadata publishes atomically with + code, old code stays executable while frames may return into it, and + readers detect generation changes. Aligns with §1.7. +- **Runtime metadata registry.** A JIT metadata channel separate from + object/DWARF inspection (stack map, func table, type desc, trap table, + inline table) supporting fast PC-to-function / PC-to-stackmap / + PC-to-trap / PC-to-inline-frame / symbol-to-generation queries. + `cfree_jit_view` stays DWARF-oriented; runtime metadata must be compact + and queryable without parsing DWARF. +- **Code lifetime / reclamation.** Explicit states (active, replaced but + callable, retired, reclaimable) with a runtime/debugger veto on + reclamation until stack scanning proves no frame references the old + generation. Extends §1.5. +- **Managed entry invocation.** Today's entry-call helpers are + argv/`u64`-narrow. Keep the low-level JIT call ABI simple and require + the frontend/runtime to emit C-callable trampolines for managed entry + points. +- **Thread / stop-the-world coordination.** Eventually: safepoint + polling, cooperative stop requests, goroutine enumeration, stack + scanning while stopped, metadata refresh while paused. Not needed for + the first CG change, but the publish/metadata APIs must not assume a + single worker forever. + +### 2.3 Managed-runtime sequence + +1. Managed pointer / address-space policy + explicit safepoint records. +2. Emit and query compact stack maps from the JIT image. +3. Managed allocation and write-barrier intrinsics. +4. Managed-stack function attributes + stack-check lowering. +5. Publish runtime metadata transactionally with JIT appends. +6. Trap/check tables for panic lowering. +7. Function replacement/lifetime on top of hot reload (§1). +8. 
Broaden debugger/session APIs for multi-threaded coordination. + +## 3. Remaining JIT TODOs + +### 3.1 Driver — `cfree run` + +- [ ] `-O2` crashes on the multi-file inline-asm demo with `Bus error`. + Likely an optimizer bug surfaced through `IR_ASM_BLOCK` replay; + reduce and file against `src/opt/opt.c` (the recorder/replay seam), + not the JIT. See [../OPT.md](../OPT.md). +- [ ] Regression harness: a scripted `test/run/` suite diffing exit codes + and stdout across `.c`, stdin, `.o`, `.a`, multi-file, and `-e` + entry cases, plus a `--no-jit` interpreter-vs-JIT cross-check. No + coverage today; wire a `test-run` target into `test/test.mk`. + +### 3.2 Inspector / debugger surface + +- [ ] Windows host adapter for the JIT debugger: vectored exception + handlers + `SetThreadContext` instead of POSIX signals. See + [../DBG.md](../DBG.md). +- [ ] x64 / rv64 displaced single-step (x64 `INT3` + RIP-relative fixups, + rv64 `EBREAK` + AUIPC/JAL/branch fixups). aarch64 only today — a + JIT-specific parity gap that blocks `dbg` step on those targets. + +### 3.3 Memory mapping / executable allocator + +- [ ] Cross-host `CfreeExecMem` audit. Apple silicon uses dual-mapping; + other POSIX hosts fall back to `mprotect` RW<->RX. Document the + contract and the failure mode when `host->execmem` is unset + (currently `compiler_panic`), and define the Windows + `VirtualAlloc` / `VirtualProtect` story alongside §3.2. +- [ ] Page size: the JIT defaults to `0x4000` when the host adapter + reports `page_size = 0`. Either require the adapter to fill it or + query `sysconf(_SC_PAGESIZE)` in `driver/env.c`. + +### 3.4 Tests + +- [ ] Mach-O J-path markers in the link-test reporter so the reloc-apply + groups are distinguishable from a generic SIGSEGV (today + `make test-link CFREE_TEST_OBJ=macho` prints raw `Segmentation + fault` with no J-specific markers). +- [ ] `test/smoke/dbg_hello`: a scripted REPL diff against a JIT'd source + (see [../DBG.md](../DBG.md)). 
+- [ ] Hot-reload unit + smoke tests once §1 lands: lookup-address + stability across reload, old vs new return value, saved function + pointer hits the new body, old PC still describable via + `addr_to_sym`, and the negative cases (ABI mismatch, writable-data + replacement, duplicate public definition reject). Run on one JIT + target first; cross-arch trampoline encoding gets its own tests. diff --git a/doc/plan/LINKER.md b/doc/plan/LINKER.md @@ -0,0 +1,253 @@ +# Linker (planned work) + +This roadmap covers where the cfree linker is headed beyond the static +and JIT linking it does today. It is dominated by **incremental linking**: +two related but distinct workstreams — append-only growth of a live JIT +image (the `cfree dbg` / `cfree emu` consumer) and file-based incremental +object linking (the build-system consumer, the "m2" redesign). Both rest +on the same linker invariants — address stability, durable non-destructive +relocation records, content-keyed reuse — and both fall back to a correct +full link whenever a change cannot be proven local. For the linker's +current architecture, passes, and invariants see [../LINK.md](../LINK.md); +for how a resolved image runs in process see [../JIT.md](../JIT.md); for +the object substrate see [../OBJ.md](../OBJ.md); for the build-system layer +that consumes the file-incremental interface see [../BUILD.md](../BUILD.md) +and the distribution CAS in [../DISTRIBUTE.md](../DISTRIBUTE.md). + +## Why incremental, and the shared invariants + +The full link is always available and always correct. Incremental linking +is an *accelerator* gated on a soundness check: a correct-but-slow result +always beats a fast-but-wrong one. Three invariants hold across both +workstreams and must never be violated by any incremental path: + +- **Address stability.** Once a runtime/file vaddr is observable it never + moves. 
Unchanged atoms keep their bytes *and* their addresses, so their + relocations are never reapplied — this is what makes a patch cost + `O(change)`. Enforced by overwrite-in-slack / append-to-free-slot, + **never compact**. +- **Relocations are durable, relative, and symbolic.** `LinkRelocApply` + records survive as data and are not burned into bytes before emit. + Persist each as `(atom, offset-within-atom, kind, target-name, addend)`; + derive the absolute write address and target address from *current* + placements at apply time. An atom that moves then needs zero reloc + rewriting. +- **Content-hash keying, not transient IDs.** `LinkInputId`/`LinkSymId` + are stable only in-process. Persisted state is keyed by content hashes + and symbol names, never by re-derived IDs, so determinism is a dedup + nicety, not a correctness requirement. + +## Workstream 1 — append-only incremental JIT link + +Grow one live `CfreeJit` image with additional compiled objects while +keeping every previously published runtime address stable. New code may +reference old symbols; old debugger surfaces (`cfree_jit_lookup`, +`cfree_jit_addr_to_sym`, symbol iteration, breakpoints, PC translation, +the JIT debug view) must see new symbols. This is explicitly *not* hot +reload: existing code is never replaced or repatched (see +[../DBG.md](../DBG.md) for the debugger and the separate hot-reload +design). 
+ +### Done (baseline) + +The in-process append path is implemented and is the foundation the rest +builds on: the `cfree_jit_publish` surface (an append/replace batch driven +by a `CfreeLinkSession`, reporting a bumped generation); append cursors with +reserved RX/R/RW/TLS slack over one contiguous master +VA reservation committed page-by-page; transactional rollback of cursors +and symbol/section/reloc counts on failure; generation-bumped invalidation +of the cached `cfree_jit_view`; symbol resolution against the existing +image, the append batch, and the external resolver with duplicate-strong +detection; and a `dbg` REPL that drives compile → append → DWARF refresh +with the worker stopped so line-table replacement never races a running +thread. + +### Remaining + +- **Pending source-level breakpoints across appends.** Today a `b + file:line` for a file not yet covered stays unresolved until retried. + Maintain pending source breakpoint specs and arm them automatically + after each append. +- **Archive reselection on append.** v1 resolves a snippet against the + already-linked image plus the external resolver only. A later cut can + let appended inputs pull fresh archive members, reusing the logic of + the file-incremental soundness gate (below). +- **`cfree emu`'s append consumer.** Per-basic-block JIT translation wants + to grow a single `LinkImage` as cold blocks land (see + [../EMU.md](../EMU.md) §6). This is a separate consumer of the same + append machinery and lands alongside the emu lifter cut. It is the + motivation for the `link_resolve_at(Linker*, base_va)` / + `link_resolve_extend(Linker*, LinkImage*)` entries in `src/link/link.c`, + which are **panic stubs today** — see "Resolve the panic stubs" under + Workstream 2 below. +- **Promote the API.** `cfree_jit_publish` stays experimental until a + second consumer (emu) exercises its append/replace batch, then settles + as the stable extend surface. 
+ +## Workstream 2 — file-based incremental object link (the "m2" redesign) + +The goal is "instant" relinks for dev builds: after editing one +translation unit in a project of *N* TUs, the *link* cost should be +`O(changed atoms + their relocations)`, not `O(whole program)`. Compile +cost (caching, dependency scanning, the build graph and watch/daemon +modes) is the build system's problem and is out of scope here — this +workstream is the obj/link substrate that layer stands on. Incremental +link is a `-O0`/`-O1` *dev* feature; release builds (`--incremental` off) +always do a clean full link and remain the canonical reproducible artifact. + +### Done (baseline) — "Done for ELF" + +The first cut landed on ELF as the reference format, with the acceptance +suite (`test/link-incremental/`) green on **ELF/aa64 + ELF/x64**: atom +content identity, per-atom reloc/symbol indices, the `LinkSession` with +per-segment cursors/slack/free-list, append-only extend, patch-in-slack, +the soundness gate with transactional rollback, per-segment build-id, +per-changed-TU debug regen, and move-on-grow via thunk. This is the +starting point; the rest of this section is what is *not* yet built. + +### The m2 redesign — design intent + +The redesign's central decision: **incrementality is not a parallel API — +it is the existing link session made fully mutable.** A full link is the +degenerate cold case (no prior state, nothing replaced); an incremental +relink seeds prior state and replaces the changed inputs. The build system +always drives *the same* session, and `resolve` internally decides +patch-vs-full and reports which via an outcome enum +(`FULL` / `PATCHED` / `FELL_BACK_FULL`). There is no separate "incremental" +entry point that could drift from the full-link path. This directly +matches the internal direction where `link_resolve` is "inputs → image" +and the `link_resolve_at` / `link_resolve_extend` surface makes that +resolve extend-capable. 
+ +The atom is the patch unit — one function or one data object. Under +`--incremental`, frontends emit one section per function/global (a +`-ffunction-sections`/`-fdata-sections` equivalent) so each atom is +independently placeable; cfree already lays out kept atoms as individual +`LinkSection`s. Each atom gets a BLAKE2b content id over its canonical +form (`bytes || align || flags || canonical(relocs)`), the diff key. + +### The soundness gate + +Reuse is correct only when the change cannot alter symbol resolution. The +edit is local only when the changed object's *interface* (defined global +names + bindings, COMMON sizes/aligns, set of undefs) is unchanged and no +archive pull-in changes; anything that can shift layout or resolution — +symbol-set/binding flips, new archive members, COMDAT/COMMON merge changes, +TLS-size shifts, import-set changes, slack/free-list exhaustion (data is +never thunked), or layout-affecting flags — forces a fall-back. On +fall-back the half-mutated session is discarded via the `LinkPatchTxn` +watermark and a correct full link runs; the JIT append path's +duplicate-global preflight is the precedent, but it **panics**, so +converting "detect non-local" into "roll back + full link" is the new +control flow at the heart of the redesign. See [../LINK.md](../LINK.md) for +the full trigger set and rollback mechanics. + +### The move-on-grow primitive (swappable) + +When an atom outgrows its slot it must move, and callers must still reach +it without their bytes changing. This is abstracted behind a single +`LinkMoveOps.atom_moved` hook with two implementations; the rest of the +design (atoms, slack, free-list, persisted session, the gate) is identical +either way. + +- **Thunk-on-grow — ship first.** Calls stay direct (what codegen emits + today). On a move, leave a jump island at the atom's *old* slot pointing + to the new location; callers branch to the old address and hit the + island. 
No codegen change, reachability is free by construction, and the + tax is one extra jump only for functions that actually moved. Reuses the + existing JIT call-stub island shape per arch. Data cannot be thunked, so + a grown data atom that outgrows its slack falls back to a full link. +- **GOT-cell — convergence target.** Under `--incremental`, codegen emits + cross-unit calls and movable-data loads through a GOT cell; a move + updates one cell. Costs a per-arch codegen change and a uniform extra + indirect load, and needs reserved GOT slack + a GOT free-list (the GOT + is one exactly-sized end segment today). Its strategic value is that it + is the *same* primitive hot reload assumes, so one mechanism would serve + both JIT hot reload and file incremental link. Build it when hot reload + is scheduled, designed then to serve both — unifying earlier is + speculative. + +### Persisted incremental state + +Side-band and content-addressed — **not** ELF-embedded incremental +sections, because cfree is multi-format. Store one blob in the existing +`driver/dist` BLAKE2b CAS, recording per input and per atom: object + atom +content ids, the `LinkAtomPlace` table (vaddr / file_offset / size / +capacity / bucket), symbol→vaddr bindings keyed by *name*, relocations in +relative+symbolic form, and free-list + per-segment cursor state. The +session reads/writes it as opaque bytes through `CfreeWriter`; the build +system owns the key, CAS storage, and lifetime — libcfree stays IO/CAS-free. + +### Remaining work + +- **Resolve the panic stubs.** `link_resolve_at` and `link_resolve_extend` + in `src/link/link.c` are still `compiler_panic` stubs on the main path. + They are the public extend-capable surface for both the file-incremental + consumer and the emu append consumer; wiring them to the `LinkSession` + patch/extend logic (and to graceful fallback rather than panic on + exhaustion) is the remaining integration step to land the redesign on + the main resolve path. 
+- **Non-ELF formats.** The atom + slack + move-primitive core is + format-agnostic; the difference is per-format machinery, so the order is + ELF (done) → COFF/PE → Mach-O. COFF/PE is the incremental-friendly case + (IAT-indirected imports, per-page base relocs, side-band PDB debug) and + is gated mainly on cfree's COFF maturity — see [../OBJ.md](../OBJ.md). + Mach-O is heaviest but feasible last: each of `__LINKEDIT` fixups, the + export trie, the indirect symtab, and the per-page code-signing + CodeDirectory needs a bounded (not `O(image)`) incremental updater. Until + a format's updater lands, that format falls back to the fast in-process + full link. +- **GOT-cell move primitive.** Deferred until hot reload is scheduled + (above); the free-list, slack, session, and gate are reused verbatim + when it lands — only `LinkMoveOps` changes. +- **rv64 patch path.** The per-arch surface is small — the island/cell + shape and the branch-into-island reloc kind. CI exercises ELF/aa64 + + ELF/x64 first; rv64 follows by adapting its trampoline shape. +- **Incremental build-id.** Per-segment FNV-1a subhashes combined + Merkle-style so a patch re-hashes only changed segments, replacing the + current whole-image `O(image)` build-id. Keep this FNV-1a distinct from + the BLAKE2b used for content/CAS keying. +- **Determinism regression lock.** Object emission is already + byte-deterministic; lock it with a two-compiles-equal regression test to + enable cross-machine / shared-cache dedup. Content/name keying stays the + correctness backbone so any future drift degrades dedup, never + correctness. + +### Frontend contract and debug-info consistency + +All frontends converge to `ObjBuilder` and join the shared path at +`obj_finalize`, so the machinery attaches once, frontend-agnostically — +Toy, asm, and WASM get incremental link with no frontend-specific code. 
To +be incrementally safe a frontend must produce deterministic output for +identical `(source, flags, target, deps)`, declare its external dependency +set (C reuses `CfreeDepIter`; single-source frontends report none), use +stable source-derived symbol names, and expose a `frontend_id` + +`schema_version` that salts the build-system key. Toy's durable-module REPL +path is not a pure function of source, so it folds the module snapshot into +the input key or opts out of caching; Toy's batch/file compile conforms +like any other frontend. + +On debug info: on any changed atom, **re-emit that TU's full `.debug_*`**. +cfree emits one monolithic `.debug_line` program and one `.debug_info` CU +with intra-CU `DW_FORM_ref4` offsets, so a function's rows cannot be spliced +in isolation; and a body change rewrites the instruction→line mapping even +when the atom did not move, so "keep stale `.debug_line`" is incoherent. +Per-TU regen is `O(changed TU)`, cheap relative to the rest of the patch, +and unchanged TUs' debug stays byte-stable because their atoms keep their +addresses. Per-function CUs for `O(atom)` debug are a future option, not +pursued now. See [../DWARF.md](../DWARF.md). + +## Acceptance: definition of done per format + +The executable spec lives in `test/link-incremental/`, authored test-first +(red → green). Its synthetic multi-TU fixture (core TUs archived into a +static library linked into two executables that share it; no third-party +deps) covers an in-slack body edit (`PATCHED`, every vaddr stable, +whole-program `link_resolve` counter does not increment), a grow-past-slack +edit (`PATCHED`, atom moves, jump island at the old address, caller bytes +byte-identical), the soundness gate (each non-local edit ⇒ +`FELL_BACK_FULL` matching a from-scratch link), multi-output consistency, +determinism, and a no-op relink. 
The two gates that define correctness are +vaddr-stability on a patch and fall-back on a non-local edit; both must be +green before a format is "done." ELF/aa64 + ELF/x64 are done; COFF, Mach-O, +and the rv64 patch path each repeat this bar. See +[../TESTING.md](../TESTING.md). diff --git a/doc/plan/OPTIMIZER.md b/doc/plan/OPTIMIZER.md @@ -0,0 +1,278 @@ +# Optimizer Roadmap (planned work) + +This is the forward-looking plan for cfree's optimizer: the work still ahead to +turn on the O2 SSA mid-end, broaden inlining, close the remaining O0/O1 +generated-code quality gaps, and finish the machine register-constraint model. +The current design — IR layering, the recording/optimizing boundary, the pass +catalog, allocation, and the MIR physical boundary — is documented in +[../OPT.md](../OPT.md), which this roadmap treats as the starting baseline. Pass +order, SSA representation, and the verification model described there are built +and shipping at O1; the items below are about extending and enabling them, not +re-describing them. Performance is tracked against gcc/clang and the MIR `c2m` +JIT on the `scripts/opt_bench.sh` benchmark set; the long-term external target +is QBE-class quality on that set, then closing toward MIR `c2m -eg`. + +Related plans: [INTERPRETER.md](../INTERPRETER.md), [CODEGEN.md](../CODEGEN.md), +[ARCH.md](../ARCH.md). + +## Baseline (where the remaining work starts) + +A few facts about the current code path frame everything below: + +- **O1 is the only live optimized path.** `opt_cgtarget_new` normalizes every + `opt_level >= 1` request to `1` (`src/opt/opt.c`), so no compilation selects + O2 today. The O1 pipeline runs entirely in the PReg namespace with the + non-splitting allocator (`opt_regalloc_locations(..., allow_live_range_split + = 0)`). 
+- **The O2 SSA mid-end (`opt_cleanup`) is fully implemented but unreached.** The + schedule — register SSA, mem2reg, GVN, copy-prop, DSE, LICM, pressure relief, + conventional SSA, undo-SSA — is defined and maintained against targeted opt + tests; it just has no caller in shipped compilation. Turning it on is a + correctness-and-quality project, not a from-scratch build. +- **Tiny-function inlining at O1 is done (baseline).** `opt_try_tiny_inline` + runs in the streaming O1 path; it inlines straightline `DEFAULT`/`HINT` + callees under an 8-op cost cap, always inlines `always_inline`, and refuses + `noinline`/recursive/control-rich callees. The whole-program inliner + (`opt_inline`) also exists but is not wired into a live path. +- **The generic machine register-constraint mechanism is done (baseline).** + The tie/forbid/clobber primitives, the per-instruction clobber side table + (`Func.inst_clobbers`), `machinize_inst_clobbers`, and + `apply_machine_reg_clobbers` are in tree, fed by the x64 machine-clobber hook + for div/mod/shift/mul/CAS/RMW/`va_arg`; aarch64/riscv64 leave the hook inert. + Remaining work is follow-on cleanup, not the core model. + +## 1. Complete and turn on the O2 SSA mid-end + +**Goal.** Make `-O2` a real, selectable optimization level whose generated code +is reliably faster than `-O1` on the representative benchmark mix, and route the +shared backend tail through it with coalescing and live-range splitting enabled. + +**Rationale.** O1 already leads MIR `O1` on the MIR-comparable benchmark scope +and beats gcc/clang `-O0` on both compile time and runtime. The remaining +runtime quality gap is against the production compilers at `-O1`/`-O2`, and the +SSA value/memory passes (GVN, DSE, LICM, copy-prop) are where that gap closes. +The mid-end is written; the blocker is that enabling it as-is regressed runtime +versus O1 on the bench mix and broke `matrix -O2`, so it stays isolated until it +is a net win. 
+ +The path to flipping the switch: + +- **Replace wholesale def-use rebuilds with incremental SSA edges.** The biggest + O2 compile-time cost is re-running `opt_rebuild_def_use` after most mutating + passes (dozens of call sites). Move to MIR-style per-operand doubly-linked + `ssa_edge` lists maintained incrementally, so walk/relink/remove/redirect are + O(1) and the def-use bit becomes a verification flag, not a rebuild trigger. + Land the structures first with SSA-era passes quarantined behind per-pass build + gates, then re-enable each pass one at a time with its MIR-parity work and + quality assertions, refreshing its benchmark row as it lands. +- **Fix the LICM register-pressure regression.** `opt_licm` has no pressure cost + model and iterates loops in raster order; it hoists invariants into preheaders + that lengthen live ranges and force spills the O1 allocator avoided — the most + likely cause of O2 trailing O1 today. Add a backward pressure filter that skips + hoists raising pressure unless the op is expensive (multiply/divide), reuse the + already-built loop tree, and walk loops inner-to-outer. +- **Add address-mode synthesis to the SSA combine.** `opt_ssa_combine` does not + fold `base + index*scale + disp` chains into memory operands the way the + post-RA MIR combine does; this is the largest missing codegen pattern for + memory-heavy benchmarks (`hash`, `matrix`). Decompose address chains in SSA, + validated by a backend memory-legality check. +- **Make GVN converge in one pass.** GVN branch folding does not re-enqueue + dependent blocks within a pass, so new constants only propagate on the next + `opt_cleanup` iteration. Add intra-pass worklist re-enqueue (the per-edge + scratch bit from the incremental-edge work is the natural worklist marker). +- **Demand-driven phi insertion.** `opt_build_reg_ssa`/`opt_build_ssa` insert + phis at every iterated-dominance-frontier site without live-in filtering. 
Move + to demand-driven materialization (phis only where reaching defs differ) plus a + fixpoint minimization pass. +- **Broaden the SSA passes that are currently all-or-nothing.** `opt_addr_xform` + bails on a single non-foldable use; teach it to fold the foldable uses and + rewrite the rest. `opt_pressure_relief` only sinks immediate/const candidates; + extend it to any single-cross-block-use move. Compute DSE memory liveness once + before the pass instead of an inline fixpoint per invocation. + +## 2. Live-range splitting and coalescing (the O2 allocator layer) + +**Goal.** Restore the live-range splitting layer on top of the point-indexed +allocator core, and keep move-related coalescing as the O2-only allocation +quality step. + +**Rationale.** Splitting was deliberately deferred during the allocator rewrite +to the MIR-shaped point-bitmap core (`opt_regalloc_locations` takes an +`allow_live_range_split` flag that is currently a no-op at the splitting site). +Until it returns, O2 may regress on benchmarks where splitting matters +(`array`, `hash`, `matrix`), and `hash2`-style cases that trail their siblings +as the table grows — where register pressure is the suspect — have no mitigation +beyond coalescing. + +- Port MIR's `get_hard_reg_with_split` / `lr_gap_tab` / `split()` on top of the + existing point-bitmap occupancy structure, keeping the dense + `OptLocation`/segment representation already designed for split results. +- Drive splitting from a spiller-vs-spillee frequency profit comparison (search + gaps in already-allocated ranges), rather than the boundary-driven, + singleton-only, non-call-crossing shape sketched earlier. +- Keep coalescing gated to O2 (matching MIR), preserving the unit-overlap + conflict counting that prevents merging a multiply-defined local with a + tied-register param. + +## 3. 
O1 generated-code quality + +**Goal.** Reach at least 1.10x faster than *both* gcc `-O0` and MIR `c2m -O1` on +every tracked benchmark, without giving up O1's compile-speed lead. + +**Rationale.** O1 is the developer-workflow optimization level: its compile cost +is the same order of magnitude as a debug build but it produces materially +faster code. A handful of benchmarks (`hash2`, `sieve`, `mandelbrot`, `strcat`) +still trail MIR `O1`, and the gaps are specific and individually addressable. + +- **Extend loop-invariant hoisting at O1 beyond constants.** The block-local + hoist at O1 (`opt_hoist_loop_consts`) only consolidates duplicate + `IR_LOAD_IMM` defs that recur inside a loop into one canonical preg — enough + for the sieve flag-init constant `mov`, but it does not move loop-invariant + *computations* (address arithmetic, pure binops on loop-invariant operands) + out of the loop. A conservative, pressure-aware single-BB invariant hoist + would help `sieve` and similar loops without paying for the full SSA mid-end + LICM. +- **Hard-register copy coalescing for `IR_LOAD_IMM` sources.** The + call-argument hint-propagation path covers `ldr`-to-call-arg but skips + immediates, so `f(NULL, NULL)`-style calls still materialize each zero through + a temp. Extend `set_preg_pref_for_call_args` / hint propagation to fire when + the source op is `IR_LOAD_IMM`. +- **Strength-reduce small-divisor modulo.** `hash2`'s `val % size` still emits + `udiv`; MIR emits a reciprocal/Barrett multiply for the small-constant-divisor + case. This is the largest remaining `hash2` gap against MIR. +- **FP instruction selection.** `mandelbrot`'s inner escape loop is FP-bound; + inspect it against MIR for FP register allocation and constant-pool material + (cfree does not vectorize, which is a separate, larger question). +- **General jump-threading.** Collapse `b A; A: b B` chains in `pass_jump.c` as + a general cleanup (recurs intermittently across benches and layout shifts). 
+- **Resolve the `binary-trees` cc-path deficit.** The JIT path with identical + codegen is at parity, so the loss is PLT/GOT indirection for `malloc`/`free`; + a `-fno-plt`-style direct-call or intra-image link resolution is a link-path + question more than a codegen one — see [LINKER.md](LINKER.md). + +## 4. O0 generated-code quality + +**Goal.** Shrink the O0 runtime and code-size gap versus clang `-O0` (currently +~1.3x slower on a small aarch64 sample) without sacrificing debuggability of +source-level variables. + +**Rationale.** O0 is the no-optimizer path (the frontend `CgTarget` wires +straight to `NativeDirectTarget`), so these are instruction-selection and +local-lowering issues, not optimizer-pass issues. They are safe to fix because +they do not change which source variables get stable stack homes. + +- **Reuse the known-frame/no-padding prologue at O0.** O0 functions still emit + the old fat-prologue reservation shape: a real prologue, then an unconditional + branch over ~19 nops. O1's known-frame path already eliminated this; apply the + same frame-known-up-front prologue to O0 when the frame is known. The old + fat-prologue shape costs one extra branch per call entry and bloats hot + functions (hurting I-cache). +- **Stop stack-homing compiler-only temporaries.** O0 materializes many + intermediate values to the stack and immediately reloads them + (`NewTreeNode` stores the constant `16` to a slot before `malloc`; clang uses + `mov x0, #0x10`). Keep stable stack homes for *source* variables but keep + simple expression intermediates in registers across a single expression. +- **Local FP selection wins.** Use encodable FP immediates directly, select + `fmadd` for multiply-add expression trees, and avoid spilling FP intermediates + that have no source-level storage. Visible on `mandelbrot` at O0. +- **Do not eagerly emit unused header inline/helper functions.** O0 emits SDK + header helpers (`___inline_isfinite`, ctype helpers, `__OSSwapInt*`, etc.) 
+ that clang does not, inflating object text and the linked image and obscuring + measurements. Emit static-inline/header helpers only when referenced. + +## 5. Machine register-constraint model: remaining work + +**Goal.** Finish the follow-on cleanup now that the generic fixed-register/ +clobber mechanism is in place, and keep the model the single entry point for +target ISA register rules into the allocator. + +**Rationale.** The core mechanism (tie + forbid + per-instruction clobber side +table, with the x64 hook covering div/mod/shift/mul/CAS/RMW/`va_arg`) is shipped +and is what makes the x64 `-O1` div/mod/atomic/overflow/varargs cases correct. +What remains is removing the now-redundant defensive code the mechanism made +unnecessary, and keeping the contract clean as new constrained ops appear. + +- **Remove defensive backend moves the allocator now guarantees.** With the + dividend tied to `rax` and nothing live in `rdx`/`rcx`, the `mov rax, + dividend` in `x64_binop`, the address-staging in `x64_atomic_cas`/`_rmw`, and + the shift `mov rcx, count` are register-to-itself no-ops. Simplify them away. +- **Confirm `InstId` density and stability** through `pass_lower` so the + side-table lookup stays O(1); fall back to a small open-addressed map keyed by + instruction only if ids ever become sparse. +- **Add focused regressions** for the live-across-the-constraint shape (e.g. a + function that divides while keeping an unrelated value live across the divide) + so the live-across-forbid loop stays covered as new ops are added. + +## 6. Inlining beyond tiny streaming callees + +**Goal.** Grow inlining past the streaming tiny-callee policy toward a real +cost-model-driven inliner, reusing the whole-program machinery that already +exists in `pass_inline.c`. 
+ +**Rationale.** The tiny O1 inliner captures the biggest easy win (one- and +two-instruction leaf helpers) but deliberately refuses anything with control +flow, calls, or aggregate/sret/byval ABI shapes, and cannot see forward-defined +or cross-TU callees. `opt_inline` (the whole-program inliner with growth gates) +is in tree but unreached; richer inlining is mostly a policy-and-wiring problem, +not new transform code. + +- **Inline callees with control flow** beyond the straightline whitelist, and + callees with multi-part/aggregate ABI args/returns (today rejected by + `inline_rewrite_supported`). +- **Forward-defined-callee inlining.** The streaming registry only sees callees + recorded earlier in the TU; a finalization-time pass (matching the ARM_64 + reachability regime) could inline forward-defined callees. +- **A whole-program cost model.** Wire `opt_inline`'s growth-bounded gates into + a live path (most naturally at O2) so larger-than-tiny callees inline when the + cost model says it pays. + +## 7. Open questions and non-goals + +Cross-cutting questions to resolve as the work above lands: + +- **Cheapest O2-positive subset.** What minimal set of SSA passes already makes + O2 > O1, so O2 can be enabled incrementally rather than waiting for the whole + schedule? +- **Allocation path.** Should O2 keep O1's point-indexed bitmap core plus + splitting/coalescing, or does its quality justify a distinct path — and does + coalescing-before-liveness (delete moves first, then compute liveness on the + reduced program, as MIR does) buy enough compile time to reorder the pipeline? +- **DCE overlap.** Can the pre-allocation dead-def elimination and the post-RA + DCE be unified (at O1, and narrowed once SSA DCE runs at O2)? They overlap. +- **Constraint timing.** When O2 turns on, do any SSA-era passes need to learn + about fixed-register constraints before allocation, or does applying all + constraints at allocation time (as today) remain sufficient? 
+- **Inlining scope.** Is there an O1-affordable inlining subset (forward-defined + tiny callees) worth enabling on the streaming path, or is the whole-program + cost model O2-only? Cross-TU inlining is likely a separate, later effort. +- **Temporary-spilling boundary.** How much of the O0 "compiler-only + temporaries" fix belongs in the C frontend's expression lowering versus + `NativeDirectTarget`'s register cache? + +A shared register-pressure model is the common precursor under several items +(LICM, pressure relief, O1 invariant hoisting): once it exists those passes can +make pressure-aware rather than structural decisions. Auto-vectorization is +explicitly *not* planned near-term — noted only because several FP benchmarks +(`mandelbrot`, sieve init loops) lose to vectorizing peers; any future work +there is a large standalone project. + +## 8. Measurement and acceptance + +**Goal.** Keep every change benchmark-gated and keep the benchmark harness able +to attribute time to the right pipeline stage. + +- Add benchmark CSV columns that split cfree frontend, optimizer, link, and JIT + time, instead of relying on ad hoc `--time` logs; the frontend currently + dominates absolute time on these benches and masks backend movement. +- Keep `-O1` materially faster to compile than `-O2`, and treat any O2 enablement + as gated on "no material regression in the backend-only O1 geomean and no + bootstrap regression." +- Expand the benchmark set back toward all MIR `c-benchmarks` that need only + supported hosted libc/runtime features, and track MIR's own failures + separately so cfree coverage is not obscured by external-tool limits. + +The performance contract for all of the above is preservation of what already +wins: arena-backed function-local allocation, dense PReg-indexed arrays for hot +per-value data, point-indexed allocator occupancy bitsets, operand-driven +combiner dispatch, and block-layout fallthrough cleanup. 
New work must not add +abstraction layers in hot loops, per-operand heap allocation in MIR, or +whole-function rescans where a dense indexed array suffices. diff --git a/doc/plan/README.md b/doc/plan/README.md @@ -0,0 +1,18 @@ +# Planned work + +Forward-looking roadmaps: what is intended, why, the open problems, and the +design of features not yet built. These are distinct from the design docs one +level up (`../`), which describe the system *as it is* — when a feature here +ships, its durable design moves up to the matching design doc and the entry here +shrinks to whatever remains open. + +| Roadmap | Scope | Design doc | +|---------|-------|------------| +| [OPTIMIZER.md](OPTIMIZER.md) | Completing the O2 SSA mid-end, expanded inlining, -O0/-O1 performance work, machine register-constraint improvements. | [../OPT.md](../OPT.md) | +| [LINKER.md](LINKER.md) | Incremental linking: the file-based object-link redesign and remaining non-ELF format coverage. | [../LINK.md](../LINK.md) | +| [JIT.md](JIT.md) | Function-level hot reload, Go-runtime-style codegen support, and remaining JIT host-portability work. | [../JIT.md](../JIT.md) | +| [DEBUG.md](DEBUG.md) | The Windows debugger host adapter, x64/rv64 displaced single-step, profiling, and DWARF gaps. | [../DBG.md](../DBG.md), [../DWARF.md](../DWARF.md) | +| [WASM.md](WASM.md) | Completing the Wasm object backend and remaining parser/validator coverage. | [../WASM.md](../WASM.md) | +| [ARCH.md](ARCH.md) | Remaining native-backend completeness for x64/rv64 relative to the aa64 reference, and per-call cost follow-ups. | [../ARCH.md](../ARCH.md) | +| [BOOTSTRAP.md](BOOTSTRAP.md) | The 3-stage self-build reproducibility goal and the open `-O1` issues blocking it. | [../BUILD.md](../BUILD.md) | +| [IMAGE_INSPECT.md](IMAGE_INSPECT.md) | Extending object inspection to executables and shared libraries. 
| [../OBJ.md](../OBJ.md) | diff --git a/doc/plan/WASM.md b/doc/plan/WASM.md @@ -0,0 +1,205 @@ +# WebAssembly (planned work) + +cfree treats WebAssembly as both an input language and an output target, +sharing one binary/module layer between the two directions. The frontend path +(Wasm in, native object/JIT out) and the minimal final-module backend (C/toy +in, single-TU `.wasm` out) are working baselines. The remaining work is +concentrated in three areas: completing the `obj/wasm` object backend so it can +read and write tool-conventions *relocatable* objects (today it only mirrors +raw section bytes); building the Wasm static linker; and closing the last +frontend/backend feature gaps (relocations for cross-TU references, atomics +sub-word coverage, the C-facing exported wrapper ABI, and the unsupported-proposal +diagnostics). This doc is the forward-looking plan. Design rationale, the shared +module model, and the API sketch live in [../WASM.md](../WASM.md); related +design docs are [../OBJ.md](../OBJ.md) and [../LINK.md](../LINK.md), and the +sibling plan is [LINKER.md](LINKER.md). + +Baseline already in place (do not re-plan): `src/wasm` core decode/validate/ +encode/wat; `lang/wasm` frontend with native lowering through `CfreeCg` and an +explicit `CfreeWasmInstance*` ABI; `src/arch/wasm` with `arch_impl_wasm`, a +wasm32 BasicCABI vtable, a structured-CG + CFG-structurer backend, and a +single-TU `emit_wasm`; host-import binding via `cfree_wasm_set_host_imports`. +`read_wasm`/`emit_wasm` are real (no longer stubbed) but partial. + +## Object backend (largest gap) + +`src/obj/wasm` is far smaller than the ELF/Mach-O/COFF backends. `read.c` +mirrors each Wasm section into an `ObjBuilder` section carrying raw payload +bytes and synthesizes one function symbol per defined function (enough for +`objdump -h/-s/-d/-t`). `emit.c` flushes a `WasmModule` attached under +`OBJ_EXT_WASM` via `wasm_encode`, or an empty magic+version header. 
Neither +understands tool-conventions object metadata: there is no symbol-table decode, +no relocation decode/encode, no `linking`/`reloc.*` custom-section handling, +and no `WasmObjMeta`. + +Work items: + +- Add the typed `WasmObjMeta` extension payload (module graph, symbol table, + relocations, data-segment metadata, target features, init funcs) and hang it + off the builder under `OBJ_EXT_WASM`. Today only a bare `WasmModule*` is + stored there; the linker and relocatable emitter need the richer struct. +- Extend `emit_wasm` to produce tool-conventions relocatable objects: a + required `linking` custom section, `reloc.CODE`/`reloc.DATA` custom sections, + symbol-info subsections, data-segment-info subsections, and target-feature + metadata. Relocatable objects must use padded-LEB immediate encodings so + relocations can rewrite immediates without re-disassembling the code section. +- Extend `read_wasm` from raw-byte mirroring to a real relocatable-object + reader: decode the symbol table into generic `ObjBuilder` symbols (so + archives and generic symbol inspection keep working), decode `reloc.*` into + generic relocations plus `WasmObjMeta`, and preserve unknown custom sections + by name and bytes for lossless roundtrip. +- Map the existing internal `RelocKind` values (`R_WASM_FUNCIDX`, + `R_WASM_TABLEIDX`, `R_WASM_MEMOFS`, `R_WASM_TYPEIDX`) onto the + tool-conventions wire relocation numbers, plus the data-symbol and + table-index variants the cross-TU work needs + (`R_WASM_MEMORY_ADDR_{LEB,SLEB,I32,I64}`, `R_WASM_TABLE_INDEX_{SLEB,I32}`). + Add new `RelocKind` values only where the wire format needs a distinction the + current names cannot express; unsupported kinds fail with a diagnostic naming + the kind. +- Add `make test-wasm-obj`: objects roundtrip through reader/writer/`objdump` + without losing sections, symbols, relocations, or unknown custom sections. + +## Static linker + +There is no Wasm linker yet. 
`cfree_link_exe` always builds a native `Linker` +and emits a native `LinkImage`; a Wasm final module is not a virtual-addressed +native image and must not go through that segment layout. + +Work items: + +- Add `WasmLinkImage`, `wasm_link_resolve(Linker*)`, `wasm_link_emit`, and + `wasm_link_image_free`. Dispatch to this path from `cfree_link_exe` when + `target.obj == CFREE_OBJ_WASM`, after `build_linker` and before + `link_resolve`, so existing input handling, archive loading, entry selection, + and diagnostics stay shared. +- Merge index spaces across objects: renumber functions, globals, tables, + memories, data segments, and types; merge type/import/function/table/global/ + element/data/custom sections. +- Resolve undefined function/data/global/table symbols and apply Wasm + relocations against the merged module without disassembling the code section. +- Merge compatible target-feature sections; diagnose incompatible feature sets. +- Synthesize `__wasm_call_ctors`, stack/memory symbols, and exports per the + selected output mode; participate in archive demand loading. +- Add `make test-wasm-link`: multiple objects link into one valid module; + archives demand-load; imports, exports, ctors, and memory/table layout are + deterministic. + +This is the path that unblocks multi-TU Wasm output. The single-TU final +module the backend produces today needs no relocations; everything cross-TU +depends on the object backend and linker above. + +## Frontend validator coverage + +The shared validator (`src/wasm/validate.c`) and WAT/binary readers already +cover the accepted feature subset with typed operand/control stacks, section +ordering and index-space checks, branch arity, `br_table`, limits, segment +rules, and start-function signatures, plus the staged proposal gates (threads, +typed function refs, tail calls, multi-memory, memory64, bulk memory, +non-trapping float-to-int) behind `WasmFeatureSet`. 
+ +Remaining work item: + +- Add explicit "outside the support plan" diagnostics for SIMD, exceptions/ + tags, GC, and the component model, so a module using them fails with a + feature-naming message rather than a generic stack/opcode error. These + proposals stay rejected, not lowered. + +## Frontend lowering gaps + +Native lowering covers the MVP numeric/control/memory subset plus the staged +proposals (threads, typed refs, tail calls, multi-memory, memory64, bulk +memory), all routed through the explicit `CfreeWasmInstance*` ABI with import +slots and runtime table storage. + +Remaining work items: + +- Define and implement the C-facing exported wrapper ABI: host-callable thunks + that keep the instance parameter explicit but use C-friendly scalar types and + symbol names. Today only the internal `export_name(CfreeWasmInstance*, ...)` + ABI exists. Embedders are a first-class use case; do not generate wrappers + that hide or globalize the instance. +- Once `read_wasm` handles relocatable objects, consider relaxing the frontend + rule that rejects modules with a `linking` custom section (currently directed + to be supplied as an object input instead). + +## Backend feature gaps + +The `wasm32-none` backend emits valid single-TU final modules: scalar + +indirect-aggregate BasicCABI, structured control flow with a reducible-CFG +structurer and `br_table` switches, linear memory + `__stack_pointer`, +compact data layout with intra-module `R_ABS32` relocations, varargs via a +caller-packed linear-memory buffer, the bit/overflow intrinsics, atomics via +wasm-threads opcodes, `memory.copy`/`memory.fill` lowering, inline asm with +WAT templates, conventional `"memory"` export, and `(import "env" ...)` +declarations with `import_module`/`import_name` attribute overrides. + +Remaining work items, mostly blocked on the object/linker layer or on wider +ABI support: + +- Cross-TU references. 
Address-of an undefined symbol and address-of a + cross-TU function currently diagnose ("address of undefined symbol not yet + implemented"). The fix is the object linking section plus relocations: + data symbols via `R_WASM_MEMORY_ADDR_*`, address-taken functions placed in + the indirect-call table via `R_WASM_TABLE_INDEX_*`, with the undefined symbol + carrying `WASM_SYM_UNDEFINED` (and `WASM_SYM_BINDING_WEAK` for weak undefs). + Both resolve at link time; the same machinery serves any cross-TU function pointer. +- `R_ABS64` and non-`R_ABS32` data relocation kinds in the linear-memory image + (currently diagnosed explicitly). +- `&&label` addresses in static-data initializers (diagnosed early today). +- Wider scalar ABI: `__int128` (wasm32 ABI rejects 16-byte scalars) and + `long double`/binary128 (advertised but fatals on materialization). These are + phased-rollout SKIPs in the C corpus W path. +- 64-bit checked-overflow multiply (`__builtin_*mul_overflow` on i64), lowerable + as a software 64x64->128 product splitting each i64 into i32 halves and + checking the high word; i32 already widens through i64. +- Atomics: sub-word (8/16-bit) RMW and cmpxchg, and atomic NAND — blocked on + the 8/16-bit atomic RMW opcodes not yet defined in cfree's Wasm core. Full + i32/i64 width is covered. Memory order is ignored (Wasm models seq_cst only). +- TLS and bitfields diagnose-and-fail. WASI startup and irreducible control + flow / computed goto still produce raw rather than wasm-specific diagnostics + — tighten the messages later. + +## Wasm-to-Wasm + +A Toy/C -> Wasm -> run roundtrip exists via `make test-wasm-toy`/`test-wasm-c`, +but it routes the backend's output back through the lang/wasm frontend's native +JIT (Wasm-to-native), not a direct Wasm-to-Wasm path. + +Work items: + +- Wire the lang/wasm frontend to the Wasm target backend instead of native CG + for a direct Wasm-input -> Wasm-output normalization mode. 
Re-emit + semantically (not byte-preserving; object roundtrip tests cover preservation); + unsupported features still fail before emission. +- Add an explicit validate-only pass over Wasm-backend output (the frontend + validator currently only runs on Toy-produced `.wasm` during `cfree run`). + +## wasm64 and WASI + +Both stay recognized-but-unsupported until the freestanding wasm32 object/link +path is solid: + +- `wasm64`/memory64 parses and the frontend lowers it, but the wasm64 *target* + ABI panics in `compute_func_info`. Keep 64-bit tool-convention relocations and + the wasm64 backend behind explicit capability checks until the object/linker + story is stable. +- `wasm32-wasi` parses but is unsupported as a target: it must diagnose clearly + until WASI imports, startup, argv/env, and libc policy are specified. Decision + stands: freestanding `wasm32-unknown-unknown`/`wasm32-none` first. + +## Cleanup + +- Finish moving the shared Wasm core from `lang/wasm/` to `src/wasm/` per the + layout in [../WASM.md](../WASM.md). The core files + (decode/encode/validate/insn/module/wat) already live under `src/wasm/` and + the obj-layer glue is under `src/obj/wasm/`; what remains is to confirm no TU + still reaches the core through `-Ilang/wasm` and retire that include path. + +## Test targets + +Present and green: `test-wasm-front`, `test-wasm-target`, `test-wasm-toy`, +`test-wasm-c` (aggregated under `test-wasm`). Still to add as the object and +link layers land: `make test-wasm-obj` and `make test-wasm-link`. Prefer small +named fixtures over broad corpus runs; keep external validators (wasm-tools, +WABT) as optional comparison oracles, never a hard dependency — cfree's own +validator is the semantic gate. See [TESTING.md](../TESTING.md). diff --git a/doc/std/ANNEX-A.txt b/doc/std/ANNEX-A.txt @@ -1,603 +0,0 @@ -Annex A - - (informative) - Language syntax summary -1 NOTE The notation is described in 6.1. 
- -Contents - -A.1 Lexical grammar - -Contents - -A.1.1 Lexical elements - -(6.4) token: - keyword - identifier - constant - string-literal - punctuator -(6.4) preprocessing-token: - header-name - identifier - pp-number - character-constant - string-literal - punctuator - each non-white-space character that cannot be one of the above -Contents - -A.1.2 Keywords - -(6.4.1) keyword: one of - auto if unsigned - break inline void - case int volatile - char long while - const register _Alignas - continue restrict _Alignof - default return _Atomic - do short _Bool - double signed _Complex - else sizeof _Generic - enum static _Imaginary - extern struct _Noreturn - float switch _Static_assert - for typedef _Thread_local - goto union -Contents - -A.1.3 Identifiers - -(6.4.2.1) identifier: - identifier-nondigit - identifier identifier-nondigit - identifier digit -(6.4.2.1) identifier-nondigit: - nondigit - universal-character-name - other implementation-defined characters -(6.4.2.1) nondigit: one of - _ a b c d e f g h i j k l m - n o p q r s t u v w x y z - A B C D E F G H I J K L M - N O P Q R S T U V W X Y Z -(6.4.2.1) digit: one of - 0 1 2 3 4 5 6 7 8 9 -Contents - -A.1.4 Universal character names - -(6.4.3) universal-character-name: - \u hex-quad - \U hex-quad hex-quad -(6.4.3) hex-quad: - hexadecimal-digit hexadecimal-digit - hexadecimal-digit hexadecimal-digit -Contents - -A.1.5 Constants - -(6.4.4) constant: - integer-constant - floating-constant - enumeration-constant - character-constant -(6.4.4.1) integer-constant: - decimal-constant integer-suffixopt - octal-constant integer-suffixopt - hexadecimal-constant integer-suffixopt -(6.4.4.1) decimal-constant: - nonzero-digit - decimal-constant digit -(6.4.4.1) octal-constant: - 0 - octal-constant octal-digit -(6.4.4.1) hexadecimal-constant: - hexadecimal-prefix hexadecimal-digit - hexadecimal-constant hexadecimal-digit -(6.4.4.1) hexadecimal-prefix: one of - 0x 0X -(6.4.4.1) nonzero-digit: one of - 1 2 3 4 5 6 7 8 9 
-(6.4.4.1) octal-digit: one of - 0 1 2 3 4 5 6 7 -(6.4.4.1) hexadecimal-digit: one of - 0 1 2 3 4 5 6 7 8 9 - a b c d e f - A B C D E F -(6.4.4.1) integer-suffix: - unsigned-suffix long-suffixopt - unsigned-suffix long-long-suffix - long-suffix unsigned-suffixopt - long-long-suffix unsigned-suffixopt -(6.4.4.1) unsigned-suffix: one of - u U -(6.4.4.1) long-suffix: one of - l L -(6.4.4.1) long-long-suffix: one of - ll LL -(6.4.4.2) floating-constant: - decimal-floating-constant - hexadecimal-floating-constant -(6.4.4.2) decimal-floating-constant: - fractional-constant exponent-partopt floating-suffixopt - digit-sequence exponent-part floating-suffixopt -(6.4.4.2) hexadecimal-floating-constant: - hexadecimal-prefix hexadecimal-fractional-constant - binary-exponent-part floating-suffixopt - hexadecimal-prefix hexadecimal-digit-sequence - binary-exponent-part floating-suffixopt -(6.4.4.2) fractional-constant: - digit-sequenceopt . digit-sequence - digit-sequence . -(6.4.4.2) exponent-part: - e signopt digit-sequence - E signopt digit-sequence -(6.4.4.2) sign: one of - + - -(6.4.4.2) digit-sequence: - digit - digit-sequence digit -(6.4.4.2) hexadecimal-fractional-constant: - hexadecimal-digit-sequenceopt . - hexadecimal-digit-sequence - hexadecimal-digit-sequence . 
-(6.4.4.2) binary-exponent-part: - p signopt digit-sequence - P signopt digit-sequence -(6.4.4.2) hexadecimal-digit-sequence: - hexadecimal-digit - hexadecimal-digit-sequence hexadecimal-digit -(6.4.4.2) floating-suffix: one of - f l F L -(6.4.4.3) enumeration-constant: - identifier -(6.4.4.4) character-constant: - ' c-char-sequence ' - L' c-char-sequence ' - u' c-char-sequence ' - U' c-char-sequence ' -(6.4.4.4) c-char-sequence: - c-char - c-char-sequence c-char -(6.4.4.4) c-char: - any member of the source character set except - the single-quote ', backslash \, or new-line character - escape-sequence -(6.4.4.4) escape-sequence: - simple-escape-sequence - octal-escape-sequence - hexadecimal-escape-sequence - universal-character-name -(6.4.4.4) simple-escape-sequence: one of - \' \" \? \\ - \a \b \f \n \r \t \v -(6.4.4.4) octal-escape-sequence: - \ octal-digit - \ octal-digit octal-digit - \ octal-digit octal-digit octal-digit -(6.4.4.4) hexadecimal-escape-sequence: - \x hexadecimal-digit - hexadecimal-escape-sequence hexadecimal-digit -Contents - -A.1.6 String literals - -(6.4.5) string-literal: - encoding-prefixopt " s-char-sequenceopt " -(6.4.5) encoding-prefix: - u8 - u - U - L -(6.4.5) s-char-sequence: - s-char - s-char-sequence s-char -(6.4.5) s-char: - any member of the source character set except - the double-quote ", backslash \, or new-line character - escape-sequence -Contents - -A.1.7 Punctuators - -(6.4.6) punctuator: one of - [ ] ( ) { } . -> - ++ -- & * + - ~ ! - / % << >> < > <= >= == != ^ | && || - ? : ; ... 
- = *= /= %= += -= <<= >>= &= ^= |= - , # ## - <: :> <% %> %: %:%: -Contents - -A.1.8 Header names - -(6.4.7) header-name: - < h-char-sequence > - " q-char-sequence " -(6.4.7) h-char-sequence: - h-char - h-char-sequence h-char -(6.4.7) h-char: - any member of the source character set except - the new-line character and > -(6.4.7) q-char-sequence: - q-char - q-char-sequence q-char -(6.4.7) q-char: - any member of the source character set except - the new-line character and " -Contents - -A.1.9 Preprocessing numbers - -(6.4.8) pp-number: - digit - . digit - pp-number digit - pp-number identifier-nondigit - pp-number e sign - pp-number E sign - pp-number p sign - pp-number P sign - pp-number . -Contents - -A.2 Phrase structure grammar - -Contents - -A.2.1 Expressions - -(6.5.1) primary-expression: - identifier - constant - string-literal - ( expression ) - generic-selection -(6.5.1.1) generic-selection: - _Generic ( assignment-expression , generic-assoc-list ) -(6.5.1.1) generic-assoc-list: - generic-association - generic-assoc-list , generic-association -(6.5.1.1) generic-association: - type-name : assignment-expression - default : assignment-expression -(6.5.2) postfix-expression: - primary-expression - postfix-expression [ expression ] - postfix-expression ( argument-expression-listopt ) - postfix-expression . identifier - postfix-expression -> identifier - postfix-expression ++ - postfix-expression -- - ( type-name ) { initializer-list } - ( type-name ) { initializer-list , } -(6.5.2) argument-expression-list: - assignment-expression - argument-expression-list , assignment-expression -(6.5.3) unary-expression: - postfix-expression - ++ unary-expression - -- unary-expression - unary-operator cast-expression - sizeof unary-expression - sizeof ( type-name ) - _Alignof ( type-name ) -(6.5.3) unary-operator: one of - & * + - ~ ! 
-(6.5.4) cast-expression: - unary-expression - ( type-name ) cast-expression -(6.5.5) multiplicative-expression: - cast-expression - multiplicative-expression * cast-expression - multiplicative-expression / cast-expression - multiplicative-expression % cast-expression -(6.5.6) additive-expression: - multiplicative-expression - additive-expression + multiplicative-expression - additive-expression - multiplicative-expression -(6.5.7) shift-expression: - additive-expression - shift-expression << additive-expression - shift-expression >> additive-expression -(6.5.8) relational-expression: - shift-expression - relational-expression < shift-expression - relational-expression > shift-expression - relational-expression <= shift-expression - relational-expression >= shift-expression -(6.5.9) equality-expression: - relational-expression - equality-expression == relational-expression - equality-expression != relational-expression -(6.5.10) AND-expression: - equality-expression - AND-expression & equality-expression -(6.5.11) exclusive-OR-expression: - AND-expression - exclusive-OR-expression ^ AND-expression -(6.5.12) inclusive-OR-expression: - exclusive-OR-expression - inclusive-OR-expression | exclusive-OR-expression -(6.5.13) logical-AND-expression: - inclusive-OR-expression - logical-AND-expression && inclusive-OR-expression -(6.5.14) logical-OR-expression: - logical-AND-expression - logical-OR-expression || logical-AND-expression -(6.5.15) conditional-expression: - logical-OR-expression - logical-OR-expression ? 
expression : conditional-expression -(6.5.16) assignment-expression: - conditional-expression - unary-expression assignment-operator assignment-expression -(6.5.16) assignment-operator: one of - = *= /= %= += -= <<= >>= &= ^= |= -(6.5.17) expression: - assignment-expression - expression , assignment-expression -(6.6) constant-expression: - conditional-expression -Contents - -A.2.2 Declarations - -(6.7) declaration: - declaration-specifiers init-declarator-listopt ; - static_assert-declaration -(6.7) declaration-specifiers: - storage-class-specifier declaration-specifiersopt - type-specifier declaration-specifiersopt - type-qualifier declaration-specifiersopt - function-specifier declaration-specifiersopt - alignment-specifier declaration-specifiersopt -(6.7) init-declarator-list: - init-declarator - init-declarator-list , init-declarator -(6.7) init-declarator: - declarator - declarator = initializer -(6.7.1) storage-class-specifier: - typedef - extern - static - _Thread_local - auto - register -(6.7.2) type-specifier: - void - char - short - int - long - float - double - signed - unsigned - _Bool - _Complex - atomic-type-specifier - struct-or-union-specifier - enum-specifier - typedef-name -(6.7.2.1) struct-or-union-specifier: - struct-or-union identifieropt { struct-declaration-list } - struct-or-union identifier -(6.7.2.1) struct-or-union: - struct - union -(6.7.2.1) struct-declaration-list: - struct-declaration - struct-declaration-list struct-declaration -(6.7.2.1) struct-declaration: - specifier-qualifier-list struct-declarator-listopt ; - static_assert-declaration -(6.7.2.1) specifier-qualifier-list: - type-specifier specifier-qualifier-listopt - type-qualifier specifier-qualifier-listopt -(6.7.2.1) struct-declarator-list: - struct-declarator - struct-declarator-list , struct-declarator -(6.7.2.1) struct-declarator: - declarator - declaratoropt : constant-expression -(6.7.2.2) enum-specifier: - enum identifieropt { enumerator-list } - enum identifieropt { 
enumerator-list , } - enum identifier -(6.7.2.2) enumerator-list: - enumerator - enumerator-list , enumerator -(6.7.2.2) enumerator: - enumeration-constant - enumeration-constant = constant-expression -(6.7.2.4) atomic-type-specifier: - _Atomic ( type-name ) -(6.7.3) type-qualifier: - const - restrict - volatile - _Atomic -(6.7.4) function-specifier: - inline - _Noreturn -(6.7.5) alignment-specifier: - _Alignas ( type-name ) - _Alignas ( constant-expression ) -(6.7.6) declarator: - pointeropt direct-declarator -(6.7.6) direct-declarator: - identifier - ( declarator ) - direct-declarator [ type-qualifier-listopt assignment-expressionopt ] - direct-declarator [ static type-qualifier-listopt assignment-expression ] - direct-declarator [ type-qualifier-list static assignment-expression ] - direct-declarator [ type-qualifier-listopt * ] - direct-declarator ( parameter-type-list ) - direct-declarator ( identifier-listopt ) -(6.7.6) pointer: - * type-qualifier-listopt - * type-qualifier-listopt pointer -(6.7.6) type-qualifier-list: - type-qualifier - type-qualifier-list type-qualifier -(6.7.6) parameter-type-list: - parameter-list - parameter-list , ... 
-(6.7.6) parameter-list: - parameter-declaration - parameter-list , parameter-declaration -(6.7.6) parameter-declaration: - declaration-specifiers declarator - declaration-specifiers abstract-declaratoropt -(6.7.6) identifier-list: - identifier - identifier-list , identifier -(6.7.7) type-name: - specifier-qualifier-list abstract-declaratoropt -(6.7.7) abstract-declarator: - pointer - pointeropt direct-abstract-declarator -(6.7.7) direct-abstract-declarator: - ( abstract-declarator ) - direct-abstract-declaratoropt [ type-qualifier-listopt - assignment-expressionopt ] - direct-abstract-declaratoropt [ static type-qualifier-listopt - assignment-expression ] - direct-abstract-declaratoropt [ type-qualifier-list static - assignment-expression ] - direct-abstract-declaratoropt [ * ] - direct-abstract-declaratoropt ( parameter-type-listopt ) -(6.7.8) typedef-name: - identifier -(6.7.9) initializer: - assignment-expression - { initializer-list } - { initializer-list , } -(6.7.9) initializer-list: - designationopt initializer - initializer-list , designationopt initializer -(6.7.9) designation: - designator-list = -(6.7.9) designator-list: - designator - designator-list designator -(6.7.9) designator: - [ constant-expression ] - . 
identifier -(6.7.10) static_assert-declaration: - _Static_assert ( constant-expression , string-literal ) ; -Contents - -A.2.3 Statements - -(6.8) statement: - labeled-statement - compound-statement - expression-statement - selection-statement - iteration-statement - jump-statement -(6.8.1) labeled-statement: - identifier : statement - case constant-expression : statement - default : statement -(6.8.2) compound-statement: - { block-item-listopt } -(6.8.2) block-item-list: - block-item - block-item-list block-item -(6.8.2) block-item: - declaration - statement -(6.8.3) expression-statement: - expressionopt ; -(6.8.4) selection-statement: - if ( expression ) statement - if ( expression ) statement else statement - switch ( expression ) statement -(6.8.5) iteration-statement: - while ( expression ) statement - do statement while ( expression ) ; - for ( expressionopt ; expressionopt ; expressionopt ) statement - for ( declaration expressionopt ; expressionopt ) statement -(6.8.6) jump-statement: - goto identifier ; - continue ; - break ; - return expressionopt ; -Contents - -A.2.4 External definitions - -(6.9) translation-unit: - external-declaration - translation-unit external-declaration -(6.9) external-declaration: - function-definition - declaration -(6.9.1) function-definition: - declaration-specifiers declarator declaration-listopt compound-statement -(6.9.1) declaration-list: - declaration - declaration-list declaration -Contents - -A.3 Preprocessing directives - -(6.10) preprocessing-file: - groupopt -(6.10) group: - group-part - group group-part -(6.10) group-part: - if-section - control-line - text-line - # non-directive -(6.10) if-section: - if-group elif-groupsopt else-groupopt endif-line -(6.10) if-group: - # if constant-expression new-line groupopt - # ifdef identifier new-line groupopt - # ifndef identifier new-line groupopt -(6.10) elif-groups: - elif-group - elif-groups elif-group -(6.10) elif-group: - # elif constant-expression new-line groupopt 
-(6.10) else-group: - # else new-line groupopt -(6.10) endif-line: - # endif new-line -(6.10) control-line: - # include pp-tokens new-line - # define identifier replacement-list new-line - # define identifier lparen identifier-listopt ) - replacement-list new-line - # define identifier lparen ... ) replacement-list new-line - # define identifier lparen identifier-list , ... ) - replacement-list new-line - # undef identifier new-line - # line pp-tokens new-line - # error pp-tokensopt new-line - # pragma pp-tokensopt new-line - # new-line -(6.10) text-line: - pp-tokensopt new-line -(6.10) non-directive: - pp-tokens new-line -(6.10) lparen: - a ( character not immediately preceded by white-space -(6.10) replacement-list: - pp-tokensopt -(6.10) pp-tokens: - preprocessing-token - pp-tokens preprocessing-token -(6.10) new-line: - the new-line character diff --git a/doc/std/ANNEX-J.txt b/doc/std/ANNEX-J.txt @@ -1,618 +0,0 @@ -Annex J - - (informative) - Portability issues -1 This annex collects some information about portability that appears in this International Standard. - -Contents - -J.1 Unspecified behavior - -1 The following are unspecified: - -The manner and timing of static initialization (5.1.2). -The termination status returned to the hosted environment if the return type of main is not compatible with int (5.1.2.2.3). -The values of objects that are neither lock-free atomic objects nor of type volatile sig_atomic_t and the state of the floating-point environment, when the processing of the abstract machine is interrupted by receipt of a signal (5.1.2.3). -The behavior of the display device if a printing character is written when the active position is at the final position of a line (5.2.2). -The behavior of the display device if a backspace character is written when the active position is at the initial position of a line (5.2.2). 
-The behavior of the display device if a horizontal tab character is written when the active position is at or past the last defined horizontal tabulation position (5.2.2). -The behavior of the display device if a vertical tab character is written when the active position is at or past the last defined vertical tabulation position (5.2.2). -How an extended source character that does not correspond to a universal character name counts toward the significant initial characters in an external identifier (5.2.4.1). -Many aspects of the representations of types (6.2.6). -The value of padding bytes when storing values in structures or unions (6.2.6.1). -The values of bytes that correspond to union members other than the one last stored into (6.2.6.1). -The representation used when storing a value in an object that has more than one object representation for that value (6.2.6.1). -The values of any padding bits in integer representations (6.2.6.2). -Whether certain operators can generate negative zeros and whether a negative zero becomes a normal zero when stored in an object (6.2.6.2). -Whether two string literals result in distinct arrays (6.4.5). -The order in which subexpressions are evaluated and the order in which side effects take place, except as specified for the function-call (), &&, ||, ? :, and comma operators (6.5). -The order in which the function designator, arguments, and subexpressions within the arguments are evaluated in a function call (6.5.2.2). -The order of side effects among compound literal initialization list expressions (6.5.2.5). -The order in which the operands of an assignment operator are evaluated (6.5.16). -The alignment of the addressable storage unit allocated to hold a bit-field (6.7.2.1). -Whether a call to an inline function uses the inline definition or the external definition of the function (6.7.4). 
-Whether or not a size expression is evaluated when it is part of the operand of a sizeof operator and changing the value of the size expression would not affect the result of the operator (6.7.6.2). -The order in which any side effects occur among the initialization list expressions in an initializer (6.7.9). -The layout of storage for function parameters (6.9.1). -When a fully expanded macro replacement list contains a function-like macro name as its last preprocessing token and the next preprocessing token from the source file is a (, and the fully expanded replacement of that macro ends with the name of the first macro and the next preprocessing token from the source file is again a (, whether that is considered a nested replacement (6.10.3). -The order in which # and ## operations are evaluated during macro substitution (6.10.3.2, 6.10.3.3). -The state of the floating-point status flags when execution passes from a part of the program translated with FENV_ACCESS ''off'' to a part translated with FENV_ACCESS ''on'' (7.6.1). -The order in which feraiseexcept raises floating-point exceptions, except as stated in F.8.6 (7.6.2.3). -Whether math_errhandling is a macro or an identifier with external linkage (7.12). -The results of the frexp functions when the specified value is not a floating-point number (7.12.6.4). -The numeric result of the ilogb functions when the correct value is outside the range of the return type (7.12.6.5, F.10.3.5). -The result of rounding when the value is out of range (7.12.9.5, 7.12.9.7, F.10.6.5). -The value stored by the remquo functions in the object pointed to by quo when y is zero (7.12.10.3). -Whether a comparison macro argument that is represented in a format wider than its semantic type is converted to the semantic type (7.12.14). -Whether setjmp is a macro or an identifier with external linkage (7.13). -Whether va_copy and va_end are macros or identifiers with external linkage (7.16.1). 
-The hexadecimal digit before the decimal point when a non-normalized floating-point number is printed with an a or A conversion specifier (7.21.6.1, 7.29.2.1). -The value of the file position indicator after a successful call to the ungetc function for a text stream, or the ungetwc function for any stream, until all pushed-back characters are read or discarded (7.21.7.10, 7.29.3.10). -The details of the value stored by the fgetpos function (7.21.9.1). -The details of the value returned by the ftell function for a text stream (7.21.9.4). -Whether the strtod, strtof, strtold, wcstod, wcstof, and wcstold functions convert a minus-signed sequence to a negative number directly or by negating the value resulting from converting the corresponding unsigned sequence (7.22.1.3, 7.29.4.1.1). -The order and contiguity of storage allocated by successive calls to the calloc, malloc, and realloc functions (7.22.3). -The amount of storage allocated by a successful call to the calloc, malloc, or realloc function when 0 bytes was requested (7.22.3). -Whether a call to the atexit function that does not happen before the exit function is called will succeed (7.22.4.2). -Whether a call to the at_quick_exit function that does not happen before the quick_exit function is called will succeed (7.22.4.3). -Which of two elements that compare as equal is matched by the bsearch function (7.22.5.1). -The order of two elements that compare as equal in an array sorted by the qsort function (7.22.5.2). -The encoding of the calendar time returned by the time function (7.27.2.4). -The characters stored by the strftime or wcsftime function if any of the time values being converted is outside the normal range (7.27.3.5, 7.29.5.1). -Whether an encoding error occurs if a wchar_t value that does not correspond to a member of the extended character set appears in the format string for a function in 7.29.2 or 7.29.5 and the specified semantics do not require that value to be processed by wcrtomb (7.29.1). 
-The conversion state after an encoding error occurs (7.29.6.3.2, 7.29.6.3.3, 7.29.6.4.1, 7.29.6.4.2). -The resulting value when the ''invalid'' floating-point exception is raised during IEC 60559 floating to integer conversion (F.4). -Whether conversion of non-integer IEC 60559 floating values to integer raises the ''inexact'' floating-point exception (F.4). -Whether or when library functions in <math.h> raise the ''inexact'' floating-point exception in an IEC 60559 conformant implementation (F.10). -Whether or when library functions in <math.h> raise an undeserved ''underflow'' floating-point exception in an IEC 60559 conformant implementation (F.10). -The exponent value stored by frexp for a NaN or infinity (F.10.3.4). -The numeric result returned by the lrint, llrint, lround, and llround functions if the rounded value is outside the range of the return type (F.10.6.5, F.10.6.7). -The sign of one part of the complex result of several math functions for certain special cases in IEC 60559 compatible implementations (G.6.1.1, G.6.2.2, G.6.2.3, G.6.2.4, G.6.2.5, G.6.2.6, G.6.3.1, G.6.4.2). -Contents - -J.2 Undefined behavior - -1 The behavior is undefined in the following circumstances: - -A ''shall'' or ''shall not'' requirement that appears outside of a constraint is violated (clause 4). -A nonempty source file does not end in a new-line character which is not immediately preceded by a backslash character or ends in a partial preprocessing token or comment (5.1.1.2). -Token concatenation produces a character sequence matching the syntax of a universal character name (5.1.1.2). -A program in a hosted environment does not define a function named main using one of the specified forms (5.1.2.2.1). -The execution of a program contains a data race (5.1.2.4).
-A character not in the basic source character set is encountered in a source file, except in an identifier, a character constant, a string literal, a header name, a comment, or a preprocessing token that is never converted to a token (5.2.1). -An identifier, comment, string literal, character constant, or header name contains an invalid multibyte character or does not begin and end in the initial shift state (5.2.1.2). -The same identifier has both internal and external linkage in the same translation unit (6.2.2). -An object is referred to outside of its lifetime (6.2.4). -The value of a pointer to an object whose lifetime has ended is used (6.2.4). -The value of an object with automatic storage duration is used while it is indeterminate (6.2.4, 6.7.9, 6.8). -A trap representation is read by an lvalue expression that does not have character type (6.2.6.1). -A trap representation is produced by a side effect that modifies any part of the object using an lvalue expression that does not have character type (6.2.6.1). -The operands to certain operators are such that they could produce a negative zero result, but the implementation does not support negative zeros (6.2.6.2). -Two declarations of the same object or function specify types that are not compatible (6.2.7). -A program requires the formation of a composite type from a variable length array type whose size is specified by an expression that is not evaluated (6.2.7). -Conversion to or from an integer type produces a value outside the range that can be represented (6.3.1.4). -Demotion of one real floating type to another produces a value outside the range that can be represented (6.3.1.5). -An lvalue does not designate an object when evaluated (6.3.2.1). -A non-array lvalue with an incomplete type is used in a context that requires the value of the designated object (6.3.2.1). 
-An lvalue designating an object of automatic storage duration that could have been declared with the register storage class is used in a context that requires the value of the designated object, but the object is uninitialized (6.3.2.1). -An lvalue having array type is converted to a pointer to the initial element of the array, and the array object has register storage class (6.3.2.1). -An attempt is made to use the value of a void expression, or an implicit or explicit conversion (except to void) is applied to a void expression (6.3.2.2). -Conversion of a pointer to an integer type produces a value outside the range that can be represented (6.3.2.3). -Conversion between two pointer types produces a result that is incorrectly aligned (6.3.2.3). -A pointer is used to call a function whose type is not compatible with the referenced type (6.3.2.3). -An unmatched ' or " character is encountered on a logical source line during tokenization (6.4). -A reserved keyword token is used in translation phase 7 or 8 for some purpose other than as a keyword (6.4.1). -A universal character name in an identifier does not designate a character whose encoding falls into one of the specified ranges (6.4.2.1). -The initial character of an identifier is a universal character name designating a digit (6.4.2.1). -Two identifiers differ only in nonsignificant characters (6.4.2.1). -The identifier __func__ is explicitly declared (6.4.2.2). -The program attempts to modify a string literal (6.4.5). -The characters ', \, ", //, or /* occur in the sequence between the < and > delimiters, or the characters ', \, //, or /* occur in the sequence between the " delimiters, in a header name preprocessing token (6.4.7). -A side effect on a scalar object is unsequenced relative to either a different side effect on the same scalar object or a value computation using the value of the same scalar object (6.5). -An exceptional condition occurs during the evaluation of an expression (6.5).
-An object has its stored value accessed other than by an lvalue of an allowable type (6.5). -For a call to a function without a function prototype in scope, the number of arguments does not equal the number of parameters (6.5.2.2). -For a call to a function without a function prototype in scope where the function is defined with a function prototype, either the prototype ends with an ellipsis or the types of the arguments after promotion are not compatible with the types of the parameters (6.5.2.2). -For a call to a function without a function prototype in scope where the function is not defined with a function prototype, the types of the arguments after promotion are not compatible with those of the parameters after promotion (with certain exceptions) (6.5.2.2). -A function is defined with a type that is not compatible with the type (of the expression) pointed to by the expression that denotes the called function (6.5.2.2). -A member of an atomic structure or union is accessed (6.5.2.3). -The operand of the unary * operator has an invalid value (6.5.3.2). -A pointer is converted to other than an integer or pointer type (6.5.4). -The value of the second operand of the / or % operator is zero (6.5.5). -Addition or subtraction of a pointer into, or just beyond, an array object and an integer type produces a result that does not point into, or just beyond, the same array object (6.5.6). -Addition or subtraction of a pointer into, or just beyond, an array object and an integer type produces a result that points just beyond the array object and is used as the operand of a unary * operator that is evaluated (6.5.6). -Pointers that do not point into, or just beyond, the same array object are subtracted (6.5.6). -An array subscript is out of range, even if an object is apparently accessible with the given subscript (as in the lvalue expression a[1][7] given the declaration int a[4][5]) (6.5.6).
-The result of subtracting two pointers is not representable in an object of type ptrdiff_t (6.5.6). -An expression is shifted by a negative number or by an amount greater than or equal to the width of the promoted expression (6.5.7). -An expression having signed promoted type is left-shifted and either the value of the expression is negative or the result of shifting would not be representable in the promoted type (6.5.7). -Pointers that do not point to the same aggregate or union (nor just beyond the same array object) are compared using relational operators (6.5.8). -An object is assigned to an inexactly overlapping object or to an exactly overlapping object with incompatible type (6.5.16.1). -An expression that is required to be an integer constant expression does not have an integer type; has operands that are not integer constants, enumeration constants, character constants, sizeof expressions whose results are integer constants, _Alignof expressions, or immediately-cast floating constants; or contains casts (outside operands to sizeof and _Alignof operators) other than conversions of arithmetic types to integer types (6.6). -A constant expression in an initializer is not, or does not evaluate to, one of the following: an arithmetic constant expression, a null pointer constant, an address constant, or an address constant for a complete object type plus or minus an integer constant expression (6.6). -An arithmetic constant expression does not have arithmetic type; has operands that are not integer constants, floating constants, enumeration constants, character constants, sizeof expressions whose results are integer constants, or _Alignof expressions; or contains casts (outside operands to sizeof or _Alignof operators) other than conversions of arithmetic types to arithmetic types (6.6). -The value of an object is accessed by an array-subscript [], member-access . 
or ->, address &, or indirection * operator or a pointer cast in creating an address constant (6.6). -An identifier for an object is declared with no linkage and the type of the object is incomplete after its declarator, or after its init-declarator if it has an initializer (6.7). -A function is declared at block scope with an explicit storage-class specifier other than extern (6.7.1). -A structure or union is defined without any named members (including those specified indirectly via anonymous structures and unions) (6.7.2.1). -An attempt is made to access, or generate a pointer to just past, a flexible array member of a structure when the referenced object provides no elements for that array (6.7.2.1). -When the complete type is needed, an incomplete structure or union type is not completed in the same scope by another declaration of the tag that defines the content (6.7.2.3). -An attempt is made to modify an object defined with a const-qualified type through use of an lvalue with non-const-qualified type (6.7.3). -An attempt is made to refer to an object defined with a volatile-qualified type through use of an lvalue with non-volatile-qualified type (6.7.3). -The specification of a function type includes any type qualifiers (6.7.3). -Two qualified types that are required to be compatible do not have the identically qualified version of a compatible type (6.7.3). -An object which has been modified is accessed through a restrict-qualified pointer to a const-qualified type, or through a restrict-qualified pointer and another pointer that are not both based on the same object (6.7.3.1). -A restrict-qualified pointer is assigned a value based on another restricted pointer whose associated block neither began execution before the block associated with this pointer, nor ended before the assignment (6.7.3.1). -A function with external linkage is declared with an inline function specifier, but is not also defined in the same translation unit (6.7.4). 
-A function declared with a _Noreturn function specifier returns to its caller (6.7.4). -The definition of an object has an alignment specifier and another declaration of that object has a different alignment specifier (6.7.5). -Declarations of an object in different translation units have different alignment specifiers (6.7.5). -Two pointer types that are required to be compatible are not identically qualified, or are not pointers to compatible types (6.7.6.1). -The size expression in an array declaration is not a constant expression and evaluates at program execution time to a nonpositive value (6.7.6.2). -In a context requiring two array types to be compatible, they do not have compatible element types, or their size specifiers evaluate to unequal values (6.7.6.2). -A declaration of an array parameter includes the keyword static within the [ and ] and the corresponding argument does not provide access to the first element of an array with at least the specified number of elements (6.7.6.3). -A storage-class specifier or type qualifier modifies the keyword void as a function parameter type list (6.7.6.3). -In a context requiring two function types to be compatible, they do not have compatible return types, or their parameters disagree in use of the ellipsis terminator or the number and type of parameters (after default argument promotion, when there is no parameter type list or when one type is specified by a function definition with an identifier list) (6.7.6.3). -The value of an unnamed member of a structure or union is used (6.7.9). -The initializer for a scalar is neither a single expression nor a single expression enclosed in braces (6.7.9). -The initializer for a structure or union object that has automatic storage duration is neither an initializer list nor a single expression that has compatible structure or union type (6.7.9). 
-The initializer for an aggregate or union, other than an array initialized by a string literal, is not a brace-enclosed list of initializers for its elements or members (6.7.9). -An identifier with external linkage is used, but in the program there does not exist exactly one external definition for the identifier, or the identifier is not used and there exist multiple external definitions for the identifier (6.9). -A function definition includes an identifier list, but the types of the parameters are not declared in a following declaration list (6.9.1). -An adjusted parameter type in a function definition is not a complete object type (6.9.1). -A function that accepts a variable number of arguments is defined without a parameter type list that ends with the ellipsis notation (6.9.1). -The } that terminates a function is reached, and the value of the function call is used by the caller (6.9.1). -An identifier for an object with internal linkage and an incomplete type is declared with a tentative definition (6.9.2). -The token defined is generated during the expansion of a #if or #elif preprocessing directive, or the use of the defined unary operator does not match one of the two specified forms prior to macro replacement (6.10.1). -The #include preprocessing directive that results after expansion does not match one of the two header name forms (6.10.2). -The character sequence in an #include preprocessing directive does not start with a letter (6.10.2). -There are sequences of preprocessing tokens within the list of macro arguments that would otherwise act as preprocessing directives (6.10.3). -The result of the preprocessing operator # is not a valid character string literal (6.10.3.2). -The result of the preprocessing operator ## is not a valid preprocessing token (6.10.3.3). 
-The #line preprocessing directive that results after expansion does not match one of the two well-defined forms, or its digit sequence specifies zero or a number greater than 2147483647 (6.10.4). -A non-STDC #pragma preprocessing directive that is documented as causing translation failure or some other form of undefined behavior is encountered (6.10.6). -A #pragma STDC preprocessing directive does not match one of the well-defined forms (6.10.6). -The name of a predefined macro, or the identifier defined, is the subject of a #define or #undef preprocessing directive (6.10.8). -An attempt is made to copy an object to an overlapping object by use of a library function, other than as explicitly allowed (e.g., memmove) (clause 7). -A file with the same name as one of the standard headers, not provided as part of the implementation, is placed in any of the standard places that are searched for included source files (7.1.2). -A header is included within an external declaration or definition (7.1.2). -A function, object, type, or macro that is specified as being declared or defined by some standard header is used before any header that declares or defines it is included (7.1.2). -A standard header is included while a macro is defined with the same name as a keyword (7.1.2). -The program attempts to declare a library function itself, rather than via a standard header, but the declaration does not have external linkage (7.1.2). -The program declares or defines a reserved identifier, other than as allowed by 7.1.4 (7.1.3). -The program removes the definition of a macro whose name begins with an underscore and either an uppercase letter or another underscore (7.1.3). -An argument to a library function has an invalid value or a type not expected by a function with variable number of arguments (7.1.4). -The pointer passed to a library function array parameter does not have a value such that all address computations and object accesses are valid (7.1.4). 
-The macro definition of assert is suppressed in order to access an actual function (7.2). -The argument to the assert macro does not have a scalar type (7.2). -The CX_LIMITED_RANGE, FENV_ACCESS, or FP_CONTRACT pragma is used in any context other than outside all external declarations or preceding all explicit declarations and statements inside a compound statement (7.3.4, 7.6.1, 7.12.2). -The value of an argument to a character handling function is neither equal to the value of EOF nor representable as an unsigned char (7.4). -A macro definition of errno is suppressed in order to access an actual object, or the program defines an identifier with the name errno (7.5). -Part of the program tests floating-point status flags, sets floating-point control modes, or runs under non-default mode settings, but was translated with the state for the FENV_ACCESS pragma ''off'' (7.6.1). -The exception-mask argument for one of the functions that provide access to the floating-point status flags has a nonzero value not obtained by bitwise OR of the floating-point exception macros (7.6.2). -The fesetexceptflag function is used to set floating-point status flags that were not specified in the call to the fegetexceptflag function that provided the value of the corresponding fexcept_t object (7.6.2.4). -The argument to fesetenv or feupdateenv is neither an object set by a call to fegetenv or feholdexcept, nor is it an environment macro (7.6.4.3, 7.6.4.4). -The value of the result of an integer arithmetic or conversion function cannot be represented (7.8.2.1, 7.8.2.2, 7.8.2.3, 7.8.2.4, 7.22.6.1, 7.22.6.2, 7.22.1). -The program modifies the string pointed to by the value returned by the setlocale function (7.11.1.1). -The program modifies the structure pointed to by the value returned by the localeconv function (7.11.2.1). -A macro definition of math_errhandling is suppressed or the program defines an identifier with the name math_errhandling (7.12). 
-An argument to a floating-point classification or comparison macro is not of real floating type (7.12.3, 7.12.14). -A macro definition of setjmp is suppressed in order to access an actual function, or the program defines an external identifier with the name setjmp (7.13). -An invocation of the setjmp macro occurs other than in an allowed context (7.13.2.1). -The longjmp function is invoked to restore a nonexistent environment (7.13.2.1). -After a longjmp, there is an attempt to access the value of an object of automatic storage duration that does not have volatile-qualified type, local to the function containing the invocation of the corresponding setjmp macro, that was changed between the setjmp invocation and longjmp call (7.13.2.1). -The program specifies an invalid pointer to a signal handler function (7.14.1.1). -A signal handler returns when the signal corresponded to a computational exception (7.14.1.1). -A signal handler called in response to SIGFPE, SIGILL, SIGSEGV, or any other implementation-defined value corresponding to a computational exception returns (7.14.1.1). -A signal occurs as the result of calling the abort or raise function, and the signal handler calls the raise function (7.14.1.1). -A signal occurs other than as the result of calling the abort or raise function, and the signal handler refers to an object with static or thread storage duration that is not a lock-free atomic object other than by assigning a value to an object declared as volatile sig_atomic_t, or calls any function in the standard library other than the abort function, the _Exit function, the quick_exit function, or the signal function (for the same signal number) (7.14.1.1). -The value of errno is referred to after a signal occurred other than as the result of calling the abort or raise function and the corresponding signal handler obtained a SIG_ERR return from a call to the signal function (7.14.1.1). -A signal is generated by an asynchronous signal handler (7.14.1.1). 
-The signal function is used in a multi-threaded program (7.14.1.1). -A function with a variable number of arguments attempts to access its varying arguments other than through a properly declared and initialized va_list object, or before the va_start macro is invoked (7.16, 7.16.1.1, 7.16.1.4). -The macro va_arg is invoked using the parameter ap that was passed to a function that invoked the macro va_arg with the same parameter (7.16). -A macro definition of va_start, va_arg, va_copy, or va_end is suppressed in order to access an actual function, or the program defines an external identifier with the name va_copy or va_end (7.16.1). -The va_start or va_copy macro is invoked without a corresponding invocation of the va_end macro in the same function, or vice versa (7.16.1, 7.16.1.2, 7.16.1.3, 7.16.1.4). -The type parameter to the va_arg macro is not such that a pointer to an object of that type can be obtained simply by postfixing a * (7.16.1.1). -The va_arg macro is invoked when there is no actual next argument, or with a specified type that is not compatible with the promoted type of the actual next argument, with certain exceptions (7.16.1.1). -The va_copy or va_start macro is called to initialize a va_list that was previously initialized by either macro without an intervening invocation of the va_end macro for the same va_list (7.16.1.2, 7.16.1.4). -The parameter parmN of a va_start macro is declared with the register storage class, with a function or array type, or with a type that is not compatible with the type that results after application of the default argument promotions (7.16.1.4). -The member designator parameter of an offsetof macro is an invalid right operand of the . operator for the type parameter, or designates a bit-field (7.19). -The argument in an instance of one of the integer-constant macros is not a decimal, octal, or hexadecimal constant, or it has a value that exceeds the limits for the corresponding type (7.20.4). 
-A byte input/output function is applied to a wide-oriented stream, or a wide character input/output function is applied to a byte-oriented stream (7.21.2). -Use is made of any portion of a file beyond the most recent wide character written to a wide-oriented stream (7.21.2). -The value of a pointer to a FILE object is used after the associated file is closed (7.21.3). -The stream for the fflush function points to an input stream or to an update stream in which the most recent operation was input (7.21.5.2). -The string pointed to by the mode argument in a call to the fopen function does not exactly match one of the specified character sequences (7.21.5.3). -An output operation on an update stream is followed by an input operation without an intervening call to the fflush function or a file positioning function, or an input operation on an update stream is followed by an output operation without an intervening call to a file positioning function (7.21.5.3). -An attempt is made to use the contents of the array that was supplied in a call to the setvbuf function (7.21.5.6). -There are insufficient arguments for the format in a call to one of the formatted input/output functions, or an argument does not have an appropriate type (7.21.6.1, 7.21.6.2, 7.29.2.1, 7.29.2.2). -The format in a call to one of the formatted input/output functions or to the strftime or wcsftime function is not a valid multibyte character sequence that begins and ends in its initial shift state (7.21.6.1, 7.21.6.2, 7.27.3.5, 7.29.2.1, 7.29.2.2, 7.29.5.1). -In a call to one of the formatted output functions, a precision appears with a conversion specifier other than those described (7.21.6.1, 7.29.2.1). -A conversion specification for a formatted output function uses an asterisk to denote an argument-supplied field width or precision, but the corresponding argument is not provided (7.21.6.1, 7.29.2.1).
-A conversion specification for a formatted output function uses a # or 0 flag with a conversion specifier other than those described (7.21.6.1, 7.29.2.1). -A conversion specification for one of the formatted input/output functions uses a length modifier with a conversion specifier other than those described (7.21.6.1, 7.21.6.2, 7.29.2.1, 7.29.2.2). -An s conversion specifier is encountered by one of the formatted output functions, and the argument is missing the null terminator (unless a precision is specified that does not require null termination) (7.21.6.1, 7.29.2.1). -An n conversion specification for one of the formatted input/output functions includes any flags, an assignment-suppressing character, a field width, or a precision (7.21.6.1, 7.21.6.2, 7.29.2.1, 7.29.2.2). -A % conversion specifier is encountered by one of the formatted input/output functions, but the complete conversion specification is not exactly %% (7.21.6.1, 7.21.6.2, 7.29.2.1, 7.29.2.2). -An invalid conversion specification is found in the format for one of the formatted input/output functions, or the strftime or wcsftime function (7.21.6.1, 7.21.6.2, 7.27.3.5, 7.29.2.1, 7.29.2.2, 7.29.5.1). -The number of characters or wide characters transmitted by a formatted output function (or written to an array, or that would have been written to an array) is greater than INT_MAX (7.21.6.1, 7.29.2.1). -The number of input items assigned by a formatted input function is greater than INT_MAX (7.21.6.2, 7.29.2.2). -The result of a conversion by one of the formatted input functions cannot be represented in the corresponding object, or the receiving object does not have an appropriate type (7.21.6.2, 7.29.2.2). -A c, s, or [ conversion specifier is encountered by one of the formatted input functions, and the array pointed to by the corresponding argument is not large enough to accept the input sequence (and a null terminator if the conversion specifier is s or [) (7.21.6.2, 7.29.2.2). 
-A c, s, or [ conversion specifier with an l qualifier is encountered by one of the formatted input functions, but the input is not a valid multibyte character sequence that begins in the initial shift state (7.21.6.2, 7.29.2.2). -The input item for a %p conversion by one of the formatted input functions is not a value converted earlier during the same program execution (7.21.6.2, 7.29.2.2). -The vfprintf, vfscanf, vprintf, vscanf, vsnprintf, vsprintf, vsscanf, vfwprintf, vfwscanf, vswprintf, vswscanf, vwprintf, or vwscanf function is called with an improperly initialized va_list argument, or the argument is used (other than in an invocation of va_end) after the function returns (7.21.6.8, 7.21.6.9, 7.21.6.10, 7.21.6.11, 7.21.6.12, 7.21.6.13, 7.21.6.14, 7.29.2.5, 7.29.2.6, 7.29.2.7, 7.29.2.8, 7.29.2.9, 7.29.2.10). -The contents of the array supplied in a call to the fgets or fgetws function are used after a read error occurred (7.21.7.2, 7.29.3.2). -The file position indicator for a binary stream is used after a call to the ungetc function where its value was zero before the call (7.21.7.10). -The file position indicator for a stream is used after an error occurred during a call to the fread or fwrite function (7.21.8.1, 7.21.8.2). -A partial element read by a call to the fread function is used (7.21.8.1). -The fseek function is called for a text stream with a nonzero offset and either the offset was not returned by a previous successful call to the ftell function on a stream associated with the same file or whence is not SEEK_SET (7.21.9.2). -The fsetpos function is called to set a position that was not returned by a previous successful call to the fgetpos function on a stream associated with the same file (7.21.9.3). -A non-null pointer returned by a call to the calloc, malloc, or realloc function with a zero requested size is used to access an object (7.22.3). 
-The value of a pointer that refers to space deallocated by a call to the free or realloc function is used (7.22.3). -The alignment requested of the aligned_alloc function is not valid or not supported by the implementation, or the size requested is not an integral multiple of the alignment (7.22.3.1). -The pointer argument to the free or realloc function does not match a pointer earlier returned by a memory management function, or the space has been deallocated by a call to free or realloc (7.22.3.3, 7.22.3.5). -The value of the object allocated by the malloc function is used (7.22.3.4). -The value of any bytes in a new object allocated by the realloc function beyond the size of the old object are used (7.22.3.5). -The program calls the exit or quick_exit function more than once, or calls both functions (7.22.4.4, 7.22.4.7). -During the call to a function registered with the atexit or at_quick_exit function, a call is made to the longjmp function that would terminate the call to the registered function (7.22.4.4, 7.22.4.7). -The string set up by the getenv or strerror function is modified by the program (7.22.4.6, 7.24.6.2). -A signal is raised while the quick_exit function is executing (7.22.4.7). -A command is executed through the system function in a way that is documented as causing termination or some other form of undefined behavior (7.22.4.8). -A searching or sorting utility function is called with an invalid pointer argument, even if the number of elements is zero (7.22.5). -The comparison function called by a searching or sorting utility function alters the contents of the array being searched or sorted, or returns ordering values inconsistently (7.22.5). -The array being searched by the bsearch function does not have its elements in proper order (7.22.5.1). -The current conversion state is used by a multibyte/wide character conversion function after changing the LC_CTYPE category (7.22.7). 
-A string or wide string utility function is instructed to access an array beyond the end of an object (7.24.1, 7.29.4). -A string or wide string utility function is called with an invalid pointer argument, even if the length is zero (7.24.1, 7.29.4). -The contents of the destination array are used after a call to the strxfrm, strftime, wcsxfrm, or wcsftime function in which the specified length was too small to hold the entire null-terminated result (7.24.4.5, 7.27.3.5, 7.29.4.4.4, 7.29.5.1). -The first argument in the very first call to the strtok or wcstok is a null pointer (7.24.5.8, 7.29.4.5.7). -The type of an argument to a type-generic macro is not compatible with the type of the corresponding parameter of the selected function (7.25). -A complex argument is supplied for a generic parameter of a type-generic macro that has no corresponding complex function (7.25). -At least one member of the broken-down time passed to asctime contains a value outside its normal range, or the calculated year exceeds four digits or is less than the year 1000 (7.27.3.1). -The argument corresponding to an s specifier without an l qualifier in a call to the fwprintf function does not point to a valid multibyte character sequence that begins in the initial shift state (7.29.2.11). -In a call to the wcstok function, the object pointed to by ptr does not have the value stored by the previous call for the same wide string (7.29.4.5.7). -An mbstate_t object is used inappropriately (7.29.6). -The value of an argument of type wint_t to a wide character classification or case mapping function is neither equal to the value of WEOF nor representable as a wchar_t (7.30.1). -The iswctype function is called using a different LC_CTYPE category from the one in effect for the call to the wctype function that returned the description (7.30.2.2.1). 
-The towctrans function is called using a different LC_CTYPE category from the one in effect for the call to the wctrans function that returned the description (7.30.3.2.1). -Contents - -J.3 Implementation-defined behavior - -1 A conforming implementation is required to document its choice of behavior in each of the areas listed in this subclause. The following are implementation-defined: - -Contents - -J.3.1 Translation - -1 - -How a diagnostic is identified (3.10, 5.1.1.3). -Whether each nonempty sequence of white-space characters other than new-line is retained or replaced by one space character in translation phase 3 (5.1.1.2). -Contents - -J.3.2 Environment - -1 - -The mapping between physical source file multibyte characters and the source character set in translation phase 1 (5.1.1.2). -The name and type of the function called at program startup in a freestanding environment (5.1.2.1). -The effect of program termination in a freestanding environment (5.1.2.1). -An alternative manner in which the main function may be defined (5.1.2.2.1). -The values given to the strings pointed to by the argv argument to main (5.1.2.2.1). -What constitutes an interactive device (5.1.2.3). -Whether a program can have more than one thread of execution in a freestanding environment (5.1.2.4). -The set of signals, their semantics, and their default handling (7.14). -Signal values other than SIGFPE, SIGILL, and SIGSEGV that correspond to a computational exception (7.14.1.1). -Signals for which the equivalent of signal(sig, SIG_IGN); is executed at program startup (7.14.1.1). -The set of environment names and the method for altering the environment list used by the getenv function (7.22.4.6). -The manner of execution of the string by the system function (7.22.4.8). -Contents - -J.3.3 Identifiers - -1 - -Which additional multibyte characters may appear in identifiers and their correspondence to universal character names (6.4.2). 
-The number of significant initial characters in an identifier (5.2.4.1, 6.4.2). -Contents - -J.3.4 Characters - -1 - -The number of bits in a byte (3.6). -The values of the members of the execution character set (5.2.1). -The unique value of the member of the execution character set produced for each of the standard alphabetic escape sequences (5.2.2). -The value of a char object into which has been stored any character other than a member of the basic execution character set (6.2.5). -Which of signed char or unsigned char has the same range, representation, and behavior as ''plain'' char (6.2.5, 6.3.1.1). -The mapping of members of the source character set (in character constants and string literals) to members of the execution character set (6.4.4.4, 5.1.1.2). -The value of an integer character constant containing more than one character or containing a character or escape sequence that does not map to a single-byte execution character (6.4.4.4). -The value of a wide character constant containing more than one multibyte character or a single multibyte character that maps to multiple members of the extended execution character set, or containing a multibyte character or escape sequence not represented in the extended execution character set (6.4.4.4). -The current locale used to convert a wide character constant consisting of a single multibyte character that maps to a member of the extended execution character set into a corresponding wide character code (6.4.4.4). -Whether differently-prefixed wide string literal tokens can be concatenated and, if so, the treatment of the resulting multibyte character sequence (6.4.5). -The current locale used to convert a wide string literal into corresponding wide character codes (6.4.5). -The value of a string literal containing a multibyte character or escape sequence not represented in the execution character set (6.4.5). 
-The encoding of any of wchar_t, char16_t, and char32_t where the corresponding standard encoding macro (__STDC_ISO_10646__, __STDC_UTF_16__, or __STDC_UTF_32__) is not defined (6.10.8.2). -Contents - -J.3.5 Integers - -1 - -Any extended integer types that exist in the implementation (6.2.5). -Whether signed integer types are represented using sign and magnitude, two's complement, or ones' complement, and whether the extraordinary value is a trap representation or an ordinary value (6.2.6.2). -The rank of any extended integer type relative to another extended integer type with the same precision (6.3.1.1). -The result of, or the signal raised by, converting an integer to a signed integer type when the value cannot be represented in an object of that type (6.3.1.3). -The results of some bitwise operations on signed integers (6.5). -Contents - -J.3.6 Floating point - -1 - -The accuracy of the floating-point operations and of the library functions in <math.h> and <complex.h> that return floating-point results (5.2.4.2.2). -The accuracy of the conversions between floating-point internal representations and string representations performed by the library functions in <stdio.h>, <stdlib.h>, and <wchar.h> (5.2.4.2.2). -The rounding behaviors characterized by non-standard values of FLT_ROUNDS (5.2.4.2.2). -The evaluation methods characterized by non-standard negative values of FLT_EVAL_METHOD (5.2.4.2.2). -The direction of rounding when an integer is converted to a floating-point number that cannot exactly represent the original value (6.3.1.4). -The direction of rounding when a floating-point number is converted to a narrower floating-point number (6.3.1.5). -How the nearest representable value or the larger or smaller representable value immediately adjacent to the nearest representable value is chosen for certain floating constants (6.4.4.2). -Whether and how floating expressions are contracted when not disallowed by the FP_CONTRACT pragma (6.5). 
-The default state for the FENV_ACCESS pragma (7.6.1). -Additional floating-point exceptions, rounding modes, environments, and classifications, and their macro names (7.6, 7.12). -The default state for the FP_CONTRACT pragma (7.12.2). -Contents - -J.3.7 Arrays and pointers - -1 - -The result of converting a pointer to an integer or vice versa (6.3.2.3). -The size of the result of subtracting two pointers to elements of the same array (6.5.6). -Contents - -J.3.8 Hints - -1 - -The extent to which suggestions made by using the register storage-class specifier are effective (6.7.1). -The extent to which suggestions made by using the inline function specifier are effective (6.7.4). -Contents - -J.3.9 Structures, unions, enumerations, and bit-fields - -1 - -Whether a ''plain'' int bit-field is treated as a signed int bit-field or as an unsigned int bit-field (6.7.2, 6.7.2.1). -Allowable bit-field types other than _Bool, signed int, and unsigned int (6.7.2.1). -Whether atomic types are permitted for bit-fields (6.7.2.1). -Whether a bit-field can straddle a storage-unit boundary (6.7.2.1). -The order of allocation of bit-fields within a unit (6.7.2.1). -The alignment of non-bit-field members of structures (6.7.2.1). This should present no problem unless binary data written by one implementation is read by another. -The integer type compatible with each enumerated type (6.7.2.2). -Contents - -J.3.10 Qualifiers - -1 - -What constitutes an access to an object that has volatile-qualified type (6.7.3). -Contents - -J.3.11 Preprocessing directives - -1 - -The locations within #pragma directives where header name preprocessing tokens are recognized (6.4, 6.4.7). -How sequences in both forms of header names are mapped to headers or external source file names (6.4.7). -Whether the value of a character constant in a constant expression that controls conditional inclusion matches the value of the same character constant in the execution character set (6.10.1). 
-Whether the value of a single-character character constant in a constant expression that controls conditional inclusion may have a negative value (6.10.1). -The places that are searched for an included < > delimited header, and how the places are specified or the header is identified (6.10.2). -How the named source file is searched for in an included " " delimited header (6.10.2). -The method by which preprocessing tokens (possibly resulting from macro expansion) in a #include directive are combined into a header name (6.10.2). -The nesting limit for #include processing (6.10.2). -Whether the # operator inserts a \ character before the \ character that begins a universal character name in a character constant or string literal (6.10.3.2). -The behavior on each recognized non-STDC #pragma directive (6.10.6). -The definitions for __DATE__ and __TIME__ when respectively, the date and time of translation are not available (6.10.8.1). -Contents - -J.3.12 Library functions - -1 - -Any library facilities available to a freestanding program, other than the minimal set required by clause 4 (5.1.2.1). -The format of the diagnostic printed by the assert macro (7.2.1.1). -The representation of the floating-point status flags stored by the fegetexceptflag function (7.6.2.2). -Whether the feraiseexcept function raises the ''inexact'' floating-point exception in addition to the ''overflow'' or ''underflow'' floating-point exception (7.6.2.3). -Strings other than "C" and "" that may be passed as the second argument to the setlocale function (7.11.1.1). -The types defined for float_t and double_t when the value of the FLT_EVAL_METHOD macro is less than 0 (7.12). -Domain errors for the mathematics functions, other than those required by this International Standard (7.12.1). -The values returned by the mathematics functions on domain errors or pole errors (7.12.1). 
-The values returned by the mathematics functions on underflow range errors, whether errno is set to the value of the macro ERANGE when the integer expression math_errhandling & MATH_ERRNO is nonzero, and whether the ''underflow'' floating-point exception is raised when the integer expression math_errhandling & MATH_ERREXCEPT is nonzero. (7.12.1). -Whether a domain error occurs or zero is returned when an fmod function has a second argument of zero (7.12.10.1). -Whether a domain error occurs or zero is returned when a remainder function has a second argument of zero (7.12.10.2). -The base-2 logarithm of the modulus used by the remquo functions in reducing the quotient (7.12.10.3). -Whether a domain error occurs or zero is returned when a remquo function has a second argument of zero (7.12.10.3). -Whether the equivalent of signal(sig, SIG_DFL); is executed prior to the call of a signal handler, and, if not, the blocking of signals that is performed (7.14.1.1). -The null pointer constant to which the macro NULL expands (7.19). -Whether the last line of a text stream requires a terminating new-line character (7.21.2). -Whether space characters that are written out to a text stream immediately before a new-line character appear when read in (7.21.2). -The number of null characters that may be appended to data written to a binary stream (7.21.2). -Whether the file position indicator of an append-mode stream is initially positioned at the beginning or end of the file (7.21.3). -Whether a write on a text stream causes the associated file to be truncated beyond that point (7.21.3). -The characteristics of file buffering (7.21.3). -Whether a zero-length file actually exists (7.21.3). -The rules for composing valid file names (7.21.3). -Whether the same file can be simultaneously open multiple times (7.21.3). -The nature and choice of encodings used for multibyte characters in files (7.21.3). -The effect of the remove function on an open file (7.21.4.1). 
-The effect if a file with the new name exists prior to a call to the rename function (7.21.4.2). -Whether an open temporary file is removed upon abnormal program termination (7.21.4.3). -Which changes of mode are permitted (if any), and under what circumstances (7.21.5.4). -The style used to print an infinity or NaN, and the meaning of any n-char or n-wchar sequence printed for a NaN (7.21.6.1, 7.29.2.1). -The output for %p conversion in the fprintf or fwprintf function (7.21.6.1, 7.29.2.1). -The interpretation of a - character that is neither the first nor the last character, nor the second where a ^ character is the first, in the scanlist for %[ conversion in the fscanf or fwscanf function (7.21.6.2, 7.29.2.2). -The set of sequences matched by a %p conversion and the interpretation of the corresponding input item in the fscanf or fwscanf function (7.21.6.2, 7.29.2.2). -The value to which the macro errno is set by the fgetpos, fsetpos, or ftell functions on failure (7.21.9.1, 7.21.9.3, 7.21.9.4). -The meaning of any n-char or n-wchar sequence in a string representing a NaN that is converted by the strtod, strtof, strtold, wcstod, wcstof, or wcstold function (7.22.1.3, 7.29.4.1.1). -Whether or not the strtod, strtof, strtold, wcstod, wcstof, or wcstold function sets errno to ERANGE when underflow occurs (7.22.1.3, 7.29.4.1.1). -Whether the calloc, malloc, and realloc functions return a null pointer or a pointer to an allocated object when the size requested is zero (7.22.3). -Whether open streams with unwritten buffered data are flushed, open streams are closed, or temporary files are removed when the abort or _Exit function is called (7.22.4.1, 7.22.4.5). -The termination status returned to the host environment by the abort, exit, _Exit, or quick_exit function (7.22.4.1, 7.22.4.4, 7.22.4.5, 7.22.4.7). -The value returned by the system function when its argument is not a null pointer (7.22.4.8). 
-The range and precision of times representable in clock_t and time_t (7.27). -The local time zone and Daylight Saving Time (7.27.1). -The era for the clock function (7.27.2.1). -The TIME_UTC epoch (7.27.2.5). -The replacement string for the %Z specifier to the strftime and wcsftime functions in the "C" locale (7.27.3.5, 7.29.5.1). -Whether the functions in <math.h> honor the rounding direction mode in an IEC 60559 conformant implementation, unless explicitly specified otherwise (F.10). -Contents - -J.3.13 Architecture - -1 - -The values or expressions assigned to the macros specified in the headers <float.h>, <limits.h>, and <stdint.h> (5.2.4.2, 7.20.2, 7.20.3). -The result of attempting to indirectly access an object with automatic or thread storage duration from a thread other than the one with which it is associated (6.2.4). -The number, order, and encoding of bytes in any object (when not explicitly specified in this International Standard) (6.2.6.1). -Whether any extended alignments are supported and the contexts in which they are supported (6.2.8). -Valid alignment values other than those returned by an _Alignof expression for fundamental types, if any (6.2.8). -The value of the result of the sizeof and _Alignof operators (6.5.3.4). -Contents - -J.4 Locale-specific behavior - -1 The following characteristics of a hosted environment are locale-specific and are required to be documented by the implementation: - -Additional members of the source and execution character sets beyond the basic character set (5.2.1). -The presence, meaning, and representation of additional multibyte characters in the execution character set beyond the basic character set (5.2.1.2). -The shift states used for the encoding of multibyte characters (5.2.1.2). -The direction of writing of successive printing characters (5.2.2). -The decimal-point character (7.1.1). -The set of printing characters (7.4, 7.30.2). -The set of control characters (7.4, 7.30.2). 
-The sets of characters tested for by the isalpha, isblank, islower, ispunct, isspace, isupper, iswalpha, iswblank, iswlower, iswpunct, iswspace, or iswupper functions (7.4.1.2, 7.4.1.3, 7.4.1.7, 7.4.1.9, 7.4.1.10, 7.4.1.11, 7.30.2.1.2, 7.30.2.1.3, 7.30.2.1.7, 7.30.2.1.9, 7.30.2.1.10, 7.30.2.1.11). -The native environment (7.11.1.1). -Additional subject sequences accepted by the numeric conversion functions (7.22.1, 7.29.4.1). -The collation sequence of the execution character set (7.24.4.3, 7.29.4.4.2). -The contents of the error message strings set up by the strerror function (7.24.6.2). -The formats for time and date (7.27.3.5, 7.29.5.1). -Character mappings that are supported by the towctrans function (7.30.1). -Character classifications that are supported by the iswctype function (7.30.1). -Contents - -J.5 Common extensions - -1 The following extensions are widely used in many systems, but are not portable to all implementations. The inclusion of any extension that may cause a strictly conforming program to become invalid renders an implementation nonconforming. Examples of such extensions are new keywords, extra library functions declared in standard headers, or predefined macros with names that do not begin with an underscore. - -Contents - -J.5.1 Environment arguments - -1 In a hosted environment, the main function receives a third argument, char *envp[], that points to a null-terminated array of pointers to char, each of which points to a string that provides information about the environment for this execution of the program (5.1.2.2.1). - -Contents - -J.5.2 Specialized identifiers - -1 Characters other than the underscore _, letters, and digits, that are not part of the basic source character set (such as the dollar sign $, or characters in national character sets) may appear in an identifier (6.4.2). - -Contents - -J.5.3 Lengths and cases of identifiers - -1 All characters in identifiers (with or without external linkage) are significant (6.4.2). 
- -Contents - -J.5.4 Scopes of identifiers - -1 A function identifier, or the identifier of an object the declaration of which contains the keyword extern, has file scope (6.2.1). - -Contents - -J.5.5 Writable string literals - -1 String literals are modifiable (in which case, identical string literals should denote distinct objects) (6.4.5). - -Contents - -J.5.6 Other arithmetic types - -1 Additional arithmetic types, such as __int128 or double double, and their appropriate conversions are defined (6.2.5, 6.3.1). Additional floating types may have more range or precision than long double, may be used for evaluating expressions of other floating types, and may be used to define float_t or double_t. Additional floating types may also have less range or precision than float. - -Contents - -J.5.7 Function pointer casts - -1 A pointer to an object or to void may be cast to a pointer to a function, allowing data to be invoked as a function (6.5.4). - -2 A pointer to a function may be cast to a pointer to an object or to void, allowing a function to be inspected or modified (for example, by a debugger) (6.5.4). - -Contents - -J.5.8 Extended bit-field types - -1 A bit-field may be declared with a type other than _Bool, unsigned int, or signed int, with an appropriate maximum width (6.7.2.1). - -Contents - -J.5.9 The fortran keyword - -1 The fortran function specifier may be used in a function declaration to indicate that calls suitable for FORTRAN should be generated, or that a different representation for the external name is to be generated (6.7.4). - -Contents - -J.5.10 The asm keyword - -1 The asm keyword may be used to insert assembly language directly into the translator output (6.8). 
The most common implementation is via a statement of the form: - - asm ( character-string-literal ); -Contents - -J.5.11 Multiple external definitions - -1 There may be more than one external definition for the identifier of an object, with or without the explicit use of the keyword extern; if the definitions disagree, or more than one is initialized, the behavior is undefined (6.9.2). - -Contents - -J.5.12 Predefined macro names - -1 Macro names that do not begin with an underscore, describing the translation and execution environments, are defined by the implementation before translation begins (6.10.8). - -Contents - -J.5.13 Floating-point status flags - -1 If any floating-point status flags are set on normal termination after all calls to functions registered by the atexit function have been made (see 7.22.4.4), the implementation writes some diagnostics indicating the fact to the stderr stream, if it is still open. - -Contents - -J.5.14 Extra arguments for signal handlers - -1 Handlers for specific signals are called with extra arguments in addition to the signal number (7.14.1.1). - -Contents - -J.5.15 Additional stream types and file-opening modes - -1 Additional mappings from files to streams are supported (7.21.2). - -2 Additional file-opening modes may be specified by characters appended to the mode argument of the fopen function (7.21.5.3). - -Contents - -J.5.16 Defined file position indicator - -1 The file position indicator is decremented by each successful call to the ungetc or ungetwc function for a text stream, except if its value was zero before a call (7.21.7.10, 7.29.3.10). - -Contents - -J.5.17 Math error reporting - -1 Functions declared in <complex.h> and <math.h> raise SIGFPE to report errors instead of, or in addition to, setting errno or raising floating-point exceptions (7.3, 7.12). - - diff --git a/doc/std/CHAPTER-5.txt b/doc/std/CHAPTER-5.txt @@ -1,1678 +0,0 @@ -5. 
Environment - - -1 - An implementation translates C source files and executes C programs in two data- - processing-system environments, which will be called the translation environment and - the execution environment in this International Standard. Their characteristics define and - constrain the results of executing conforming C programs constructed according to the - syntactic and semantic rules for conforming implementations. - - Forward references: In this clause, only a few of many possible forward references - have been noted. - - -Contents - -5.1 Conceptual models - - -Contents - -5.1.1 Translation environment - - -Contents - -5.1.1.1 Program structure - - -1 - A C program need not all be translated at the same time. The text of the program is kept - in units called source files, (or preprocessing files) in this International Standard. A - source file together with all the headers and source files included via the preprocessing - directive #include is known as a preprocessing translation unit. After preprocessing, a - preprocessing translation unit is called a translation unit. Previously translated translation - units may be preserved individually or in libraries. The separate translation units of a - program communicate by (for example) calls to functions whose identifiers have external - linkage, manipulation of objects whose identifiers have external linkage, or manipulation - of data files. Translation units may be separately translated and then later linked to - produce an executable program. - - Forward references: linkages of identifiers (6.2.2), external definitions (6.9), - preprocessing directives (6.10). 
- - -Contents - -5.1.1.2 Translation phases - - -1 - The precedence among the syntax rules of translation is specified by the following - phases.6) - - -- Physical source file multibyte characters are mapped, in an implementation- - defined manner, to the source character set (introducing new-line characters for - end-of-line indicators) if necessary. Trigraph sequences are replaced by - corresponding single-character internal representations. - - - - - -- Each instance of a backslash character (\) immediately followed by a new-line - character is deleted, splicing physical source lines to form logical source lines. - Only the last backslash on any physical source line shall be eligible for being part - of such a splice. A source file that is not empty shall end in a new-line character, - which shall not be immediately preceded by a backslash character before any such - splicing takes place. - -- The source file is decomposed into preprocessing tokens7) and sequences of - white-space characters (including comments). A source file shall not end in a - partial preprocessing token or in a partial comment. Each comment is replaced by - one space character. New-line characters are retained. Whether each nonempty - sequence of white-space characters other than new-line is retained or replaced by - one space character is implementation-defined. - -- Preprocessing directives are executed, macro invocations are expanded, and - _Pragma unary operator expressions are executed. If a character sequence that - matches the syntax of a universal character name is produced by token - concatenation (6.10.3.3), the behavior is undefined. A #include preprocessing - directive causes the named header or source file to be processed from phase 1 - through phase 4, recursively. All preprocessing directives are then deleted. 
- -- Each source character set member and escape sequence in character constants and - string literals is converted to the corresponding member of the execution character - set; if there is no corresponding member, it is converted to an implementation- - defined member other than the null (wide) character.8) - -- Adjacent string literal tokens are concatenated. - -- White-space characters separating tokens are no longer significant. Each - preprocessing token is converted into a token. The resulting tokens are - syntactically and semantically analyzed and translated as a translation unit. - -- All external object and function references are resolved. Library components are - linked to satisfy external references to functions and objects not defined in the - current translation. All such translator output is collected into a program image - which contains information needed for execution in its execution environment. - - - Forward references: universal character names (6.4.3), lexical elements (6.4), - preprocessing directives (6.10), trigraph sequences (5.2.1.1), external definitions (6.9). - - - - - -Footnotes - -6) Implementations shall behave as if these separate phases occur, even though many are typically folded - together in practice. Source files, translation units, and translated translation units need not - necessarily be stored as files, nor need there be any one-to-one correspondence between these entities - and any external representation. The description is conceptual only, and does not specify any - particular implementation. - - -7) As described in 6.4, the process of dividing a source file's characters into preprocessing tokens is - context-dependent. For example, see the handling of < within a #include preprocessing directive. - - -8) An implementation need not convert all non-corresponding source characters to the same execution - character. 
- - -Contents - -5.1.1.3 Diagnostics - - -1 - A conforming implementation shall produce at least one diagnostic message (identified in - an implementation-defined manner) if a preprocessing translation unit or translation unit - contains a violation of any syntax rule or constraint, even if the behavior is also explicitly - specified as undefined or implementation-defined. Diagnostic messages need not be - produced in other circumstances.9) - -2 - EXAMPLE An implementation shall issue a diagnostic for the translation unit: - - - char i; - int i; - - - because in those cases where wording in this International Standard describes the behavior for a construct - as being both a constraint error and resulting in undefined behavior, the constraint error shall be diagnosed. - - - -Footnotes - -9) The intent is that an implementation should identify the nature of, and where possible localize, each - violation. Of course, an implementation is free to produce any number of diagnostics as long as a - valid program is still correctly translated. It may also successfully translate an invalid program. - - -Contents - -5.1.2 Execution environments - - -1 - Two execution environments are defined: freestanding and hosted. In both cases, - program startup occurs when a designated C function is called by the execution - environment. All objects with static storage duration shall be initialized (set to their - initial values) before program startup. The manner and timing of such initialization are - otherwise unspecified. Program termination returns control to the execution - environment. - - Forward references: storage durations of objects (6.2.4), initialization (6.7.9). - - -Contents - -5.1.2.1 Freestanding environment - - -1 - In a freestanding environment (in which C program execution may take place without any - benefit of an operating system), the name and type of the function called at program - startup are implementation-defined. 
Any library facilities available to a freestanding - program, other than the minimal set required by clause 4, are implementation-defined. - -2 - The effect of program termination in a freestanding environment is implementation- - defined. - - -Contents - -5.1.2.2 Hosted environment - - -1 - A hosted environment need not be provided, but shall conform to the following - specifications if present. - - - - - - -Contents - -5.1.2.2.1 Program startup - - -1 - The function called at program startup is named main. The implementation declares no - prototype for this function. It shall be defined with a return type of int and with no - parameters: - - - int main(void) { /* ... */ } - - - or with two parameters (referred to here as argc and argv, though any names may be - used, as they are local to the function in which they are declared): - - - int main(int argc, char *argv[]) { /* ... */ } - - - or equivalent;10) or in some other implementation-defined manner. - -2 - If they are declared, the parameters to the main function shall obey the following - constraints: - - -- The value of argc shall be nonnegative. - -- argv[argc] shall be a null pointer. - -- If the value of argc is greater than zero, the array members argv[0] through - argv[argc-1] inclusive shall contain pointers to strings, which are given - implementation-defined values by the host environment prior to program startup. The - intent is to supply to the program information determined prior to program startup - from elsewhere in the hosted environment. If the host environment is not capable of - supplying strings with letters in both uppercase and lowercase, the implementation - shall ensure that the strings are received in lowercase. - -- If the value of argc is greater than zero, the string pointed to by argv[0] - represents the program name; argv[0][0] shall be the null character if the - program name is not available from the host environment. 
If the value of argc is - greater than one, the strings pointed to by argv[1] through argv[argc-1] - represent the program parameters. - -- The parameters argc and argv and the strings pointed to by the argv array shall - be modifiable by the program, and retain their last-stored values between program - startup and program termination. - - -Footnotes - -10) Thus, int can be replaced by a typedef name defined as int, or the type of argv can be written as - char ** argv, and so on. - - -Contents - -5.1.2.2.2 Program execution - - -1 - In a hosted environment, a program may use all the functions, macros, type definitions, - and objects described in the library clause (clause 7). - - - - - - -Contents - -5.1.2.2.3 Program termination - - -1 - If the return type of the main function is a type compatible with int, a return from the - initial call to the main function is equivalent to calling the exit function with the value - returned by the main function as its argument;11) reaching the } that terminates the - main function returns a value of 0. If the return type is not compatible with int, the - termination status returned to the host environment is unspecified. - - Forward references: definition of terms (7.1.1), the exit function (7.22.4.4). - - -Footnotes - -11) In accordance with 6.2.4, the lifetimes of objects with automatic storage duration declared in main - will have ended in the former case, even where they would not have in the latter. - - -Contents - -5.1.2.3 Program execution - - -1 - The semantic descriptions in this International Standard describe the behavior of an - abstract machine in which issues of optimization are irrelevant. - -2 - Accessing a volatile object, modifying an object, modifying a file, or calling a function - that does any of those operations are all side effects,12) which are changes in the state of - the execution environment. Evaluation of an expression in general includes both value - computations and initiation of side effects. 
Value computation for an lvalue expression - includes determining the identity of the designated object. - -3 - Sequenced before is an asymmetric, transitive, pair-wise relation between evaluations - executed by a single thread, which induces a partial order among those evaluations. - Given any two evaluations A and B, if A is sequenced before B, then the execution of A - shall precede the execution of B. (Conversely, if A is sequenced before B, then B is - sequenced after A.) If A is not sequenced before or after B, then A and B are - unsequenced. Evaluations A and B are indeterminately sequenced when A is sequenced - either before or after B, but it is unspecified which.13) The presence of a sequence point - between the evaluation of expressions A and B implies that every value computation and - side effect associated with A is sequenced before every value computation and side effect - associated with B. (A summary of the sequence points is given in annex C.) - -4 - In the abstract machine, all expressions are evaluated as specified by the semantics. An - actual implementation need not evaluate part of an expression if it can deduce that its - value is not used and that no needed side effects are produced (including any caused by - - - calling a function or accessing a volatile object). - -5 - When the processing of the abstract machine is interrupted by receipt of a signal, the - values of objects that are neither lock-free atomic objects nor of type volatile - sig_atomic_t are unspecified, as is the state of the floating-point environment. The - value of any object modified by the handler that is neither a lock-free atomic object nor of - type volatile sig_atomic_t becomes indeterminate when the handler exits, as - does the state of the floating-point environment if it is modified by the handler and not - restored to its original state. 
- -6 - The least requirements on a conforming implementation are: - - -- Accesses to volatile objects are evaluated strictly according to the rules of the abstract - machine. - -- At program termination, all data written into files shall be identical to the result that - execution of the program according to the abstract semantics would have produced. - -- The input and output dynamics of interactive devices shall take place as specified in - 7.21.3. The intent of these requirements is that unbuffered or line-buffered output - appear as soon as possible, to ensure that prompting messages actually appear prior to - a program waiting for input. - - This is the observable behavior of the program. - -7 - What constitutes an interactive device is implementation-defined. - -8 - More stringent correspondences between abstract and actual semantics may be defined by - each implementation. - -9 - EXAMPLE 1 An implementation might define a one-to-one correspondence between abstract and actual - semantics: at every sequence point, the values of the actual objects would agree with those specified by the - abstract semantics. The keyword volatile would then be redundant. - -10 - Alternatively, an implementation might perform various optimizations within each translation unit, such - that the actual semantics would agree with the abstract semantics only when making function calls across - translation unit boundaries. In such an implementation, at the time of each function entry and function - return where the calling function and the called function are in different translation units, the values of all - externally linked objects and of all objects accessible via pointers therein would agree with the abstract - semantics. Furthermore, at the time of each such function entry the values of the parameters of the called - function and of all objects accessible via pointers therein would agree with the abstract semantics. 
In this - type of implementation, objects referred to by interrupt service routines activated by the signal function - would require explicit specification of volatile storage, as well as other implementation-defined - restrictions. - - -11 - EXAMPLE 2 In executing the fragment - - - char c1, c2; - /* ... */ - c1 = c1 + c2; - - - the ''integer promotions'' require that the abstract machine promote the value of each variable to int size - and then add the two ints and truncate the sum. Provided the addition of two chars can be done without - - overflow, or with overflow wrapping silently to produce the correct result, the actual execution need only - produce the same result, possibly omitting the promotions. - - -12 - EXAMPLE 3 Similarly, in the fragment - - - float f1, f2; - double d; - /* ... */ - f1 = f2 * d; - - - the multiplication may be executed using single-precision arithmetic if the implementation can ascertain - that the result would be the same as if it were executed using double-precision arithmetic (for example, if d - were replaced by the constant 2.0, which has type double). - - -13 - EXAMPLE 4 Implementations employing wide registers have to take care to honor appropriate - semantics. Values are independent of whether they are represented in a register or in memory. For - example, an implicit spilling of a register is not permitted to alter the value. Also, an explicit store and load - is required to round to the precision of the storage type. In particular, casts and assignments are required to - perform their specified conversion. For the fragment - - - double d1, d2; - float f; - d1 = f = expression; - d2 = (float) expression; - - - the values assigned to d1 and d2 are required to have been converted to float. - - -14 - EXAMPLE 5 Rearrangement for floating-point expressions is often restricted because of limitations in - precision as well as range. 
The implementation cannot generally apply the mathematical associative rules - for addition or multiplication, nor the distributive rule, because of roundoff error, even in the absence of - overflow and underflow. Likewise, implementations cannot generally replace decimal constants in order to - rearrange expressions. In the following fragment, rearrangements suggested by mathematical rules for real - numbers are often not valid (see F.9). - - - double x, y, z; - /* ... */ - x = (x * y) * z; // not equivalent to x *= y * z; - z = (x - y) + y ; // not equivalent to z = x; - z = x + x * y; // not equivalent to z = x * (1.0 + y); - y = x / 5.0; // not equivalent to y = x * 0.2; - - - - -15 - EXAMPLE 6 To illustrate the grouping behavior of expressions, in the following fragment - - - int a, b; - /* ... */ - a = a + 32760 + b + 5; - - - the expression statement behaves exactly the same as - - - a = (((a + 32760) + b) + 5); - - - due to the associativity and precedence of these operators. Thus, the result of the sum (a + 32760) is - next added to b, and that result is then added to 5 which results in the value assigned to a. On a machine in - which overflows produce an explicit trap and in which the range of values representable by an int is - [-32768, +32767], the implementation cannot rewrite this expression as - - - a = ((a + b) + 32765); - - - since if the values for a and b were, respectively, -32754 and -15, the sum a + b would produce a trap - - while the original expression would not; nor can the expression be rewritten either as - - - a = ((a + 32765) + b); - - - or - - - a = (a + (b + 32765)); - - - since the values for a and b might have been, respectively, 4 and -8 or -17 and 12. However, on a machine - in which overflow silently generates some value and where positive and negative overflows cancel, the - above expression statement can be rewritten by the implementation in any of the above ways because the - same result will occur. 
- - -16 - EXAMPLE 7 The grouping of an expression does not completely determine its evaluation. In the - following fragment - - - #include <stdio.h> - int sum; - char *p; - /* ... */ - sum = sum * 10 - '0' + (*p++ = getchar()); - - - the expression statement is grouped as if it were written as - - - sum = (((sum * 10) - '0') + ((*(p++)) = (getchar()))); - - - but the actual increment of p can occur at any time between the previous sequence point and the next - sequence point (the ;), and the call to getchar can occur at any point prior to the need of its returned - value. - - - Forward references: expressions (6.5), type qualifiers (6.7.3), statements (6.8), floating- - point environment <fenv.h> (7.6), the signal function (7.14), files (7.21.3). - - -Footnotes - -12) The IEC 60559 standard for binary floating-point arithmetic requires certain user-accessible status - flags and control modes. Floating-point operations implicitly set the status flags; modes affect result - values of floating-point operations. Implementations that support such floating-point state are - required to regard changes to it as side effects -- see annex F for details. The floating-point - environment library <fenv.h> provides a programming facility for indicating when these side - effects matter, freeing the implementations in other cases. - - -13) The executions of unsequenced evaluations can interleave. Indeterminately sequenced evaluations - cannot interleave, but can be executed in any order. - - -Contents - -5.1.2.4 Multi-threaded executions and data races - - -1 - Under a hosted implementation, a program can have more than one thread of execution - (or thread) running concurrently. The execution of each thread proceeds as defined by - the remainder of this standard. The execution of the entire program consists of an - execution of all of its threads.14) Under a freestanding implementation, it is - implementation-defined whether a program can have more than one thread of execution. 
- -2 - The value of an object visible to a thread T at a particular point is the initial value of the - object, a value stored in the object by T, or a value stored in the object by another thread, - according to the rules below. - -3 - NOTE 1 In some cases, there may instead be undefined behavior. Much of this section is motivated by - the desire to support atomic operations with explicit and detailed visibility constraints. However, it also - implicitly supports a simpler view for more restricted programs. - - -4 - Two expression evaluations conflict if one of them modifies a memory location and the - other one reads or modifies the same memory location. - - - - -5 - The library defines a number of atomic operations (7.17) and operations on mutexes - (7.26.4) that are specially identified as synchronization operations. These operations play - a special role in making assignments in one thread visible to another. A synchronization - operation on one or more memory locations is either an acquire operation, a release - operation, both an acquire and release operation, or a consume operation. A - synchronization operation without an associated memory location is a fence and can be - either an acquire fence, a release fence, or both an acquire and release fence. In addition, - there are relaxed atomic operations, which are not synchronization operations, and - atomic read-modify-write operations, which have special characteristics. - -6 - NOTE 2 For example, a call that acquires a mutex will perform an acquire operation on the locations - composing the mutex. Correspondingly, a call that releases the same mutex will perform a release - operation on those same locations. Informally, performing a release operation on A forces prior side effects - on other memory locations to become visible to other threads that later perform an acquire or consume - operation on A. 
We do not include relaxed atomic operations as synchronization operations although, like - synchronization operations, they cannot contribute to data races. - - -7 - All modifications to a particular atomic object M occur in some particular total order, - called the modification order of M. If A and B are modifications of an atomic object M, - and A happens before B, then A shall precede B in the modification order of M, which is - defined below. - -8 - NOTE 3 This states that the modification orders must respect the ''happens before'' relation. - - -9 - NOTE 4 There is a separate order for each atomic object. There is no requirement that these can be - combined into a single total order for all objects. In general this will be impossible since different threads - may observe modifications to different variables in inconsistent orders. - - -10 - A release sequence headed by a release operation A on an atomic object M is a maximal - contiguous sub-sequence of side effects in the modification order of M, where the first - operation is A and every subsequent operation either is performed by the same thread that - performed the release or is an atomic read-modify-write operation. - -11 - Certain library calls synchronize with other library calls performed by another thread. In - particular, an atomic operation A that performs a release operation on an object M - synchronizes with an atomic operation B that performs an acquire operation on M and - reads a value written by any side effect in the release sequence headed by A. - -12 - NOTE 5 Except in the specified cases, reading a later value does not necessarily ensure visibility as - described below. Such a requirement would sometimes interfere with efficient implementation. - - -13 - NOTE 6 The specifications of the synchronization operations define when one reads the value written by - another. For atomic variables, the definition is clear. All operations on a given mutex occur in a single total - order. 
Each mutex acquisition ''reads the value written'' by the last mutex release. - - -14 - An evaluation A carries a dependency 15) to an evaluation B if: - - - - -- the value of A is used as an operand of B, unless: - - -- B is an invocation of the kill_dependency macro, - - -- A is the left operand of a && or || operator, - - -- A is the left operand of a ? : operator, or - - -- A is the left operand of a , operator; - - or - -- A writes a scalar object or bit-field M, B reads from M the value written by A, and A - is sequenced before B, or - -- for some evaluation X, A carries a dependency to X and X carries a dependency to B. - - -15 - An evaluation A is dependency-ordered before16) an evaluation B if: - - -- A performs a release operation on an atomic object M, and, in another thread, B - performs a consume operation on M and reads a value written by any side effect in - the release sequence headed by A, or - -- for some evaluation X, A is dependency-ordered before X and X carries a - dependency to B. - - -16 - An evaluation A inter-thread happens before an evaluation B if A synchronizes with B, A - is dependency-ordered before B, or, for some evaluation X: - - -- A synchronizes with X and X is sequenced before B, - -- A is sequenced before X and X inter-thread happens before B, or - -- A inter-thread happens before X and X inter-thread happens before B. - - -17 - NOTE 7 The ''inter-thread happens before'' relation describes arbitrary concatenations of ''sequenced - before'', ''synchronizes with'', and ''dependency-ordered before'' relationships, with two exceptions. The - first exception is that a concatenation is not permitted to end with ''dependency-ordered before'' followed - by ''sequenced before''. The reason for this limitation is that a consume operation participating in a - ''dependency-ordered before'' relationship provides ordering only with respect to operations to which this - consume operation actually carries a dependency. 
The reason that this limitation applies only to the end of - such a concatenation is that any subsequent release operation will provide the required ordering for a prior - consume operation. The second exception is that a concatenation is not permitted to consist entirely of - ''sequenced before''. The reasons for this limitation are (1) to permit ''inter-thread happens before'' to be - transitively closed and (2) the ''happens before'' relation, defined below, provides for relationships - consisting entirely of ''sequenced before''. - - -18 - An evaluation A happens before an evaluation B if A is sequenced before B or A inter- - thread happens before B. - - - - - -19 - A visible side effect A on an object M with respect to a value computation B of M - satisfies the conditions: - - -- A happens before B, and - -- there is no other side effect X to M such that A happens before X and X happens - before B. - - The value of a non-atomic scalar object M, as determined by evaluation B, shall be the - value stored by the visible side effect A. - -20 - NOTE 8 If there is ambiguity about which side effect to a non-atomic object is visible, then there is a data - race and the behavior is undefined. - - -21 - NOTE 9 This states that operations on ordinary variables are not visibly reordered. This is not actually - detectable without data races, but it is necessary to ensure that data races, as defined here, and with suitable - restrictions on the use of atomics, correspond to data races in a simple interleaved (sequentially consistent) - execution. - - -22 - The visible sequence of side effects on an atomic object M, with respect to a value - computation B of M, is a maximal contiguous sub-sequence of side effects in the - modification order of M, where the first side effect is visible with respect to B, and for - every subsequent side effect, it is not the case that B happens before it. 
The value of an - atomic object M, as determined by evaluation B, shall be the value stored by some - operation in the visible sequence of M with respect to B. Furthermore, if a value - computation A of an atomic object M happens before a value computation B of M, and - the value computed by A corresponds to the value stored by side effect X, then the value - computed by B shall either equal the value computed by A, or be the value stored by side - effect Y, where Y follows X in the modification order of M. - -23 - NOTE 10 This effectively disallows compiler reordering of atomic operations to a single object, even if - both operations are ''relaxed'' loads. By doing so, we effectively make the ''cache coherence'' guarantee - provided by most hardware available to C atomic operations. - - -24 - NOTE 11 The visible sequence depends on the ''happens before'' relation, which in turn depends on the - values observed by loads of atomics, which we are restricting here. The intended reading is that there must - exist an association of atomic loads with modifications they observe that, together with suitably chosen - modification orders and the ''happens before'' relation derived as described above, satisfy the resulting - constraints as imposed here. - - -25 - The execution of a program contains a data race if it contains two conflicting actions in - different threads, at least one of which is not atomic, and neither happens before the - other. Any such data race results in undefined behavior. - -26 - NOTE 12 It can be shown that programs that correctly use simple mutexes and - memory_order_seq_cst operations to prevent all data races, and use no other synchronization - operations, behave as though the operations executed by their constituent threads were simply interleaved, - with each value computation of an object being the last value stored in that interleaving. This is normally - referred to as ''sequential consistency''. 
However, this applies only to data-race-free programs, and data- - race-free programs cannot observe most program transformations that do not change single-threaded - program semantics. In fact, most single-threaded program transformations continue to be allowed, since - any program that behaves differently as a result must contain undefined behavior. - - -27 - NOTE 13 Compiler transformations that introduce assignments to a potentially shared memory location - that would not be modified by the abstract machine are generally precluded by this standard, since such an - assignment might overwrite another assignment by a different thread in cases in which an abstract machine - execution would not have encountered a data race. This includes implementations of data member - assignment that overwrite adjacent members in separate memory locations. We also generally preclude - reordering of atomic loads in cases in which the atomics in question may alias, since this may violate the - "visible sequence" rules. - - -28 - NOTE 14 Transformations that introduce a speculative read of a potentially shared memory location may - not preserve the semantics of the program as defined in this standard, since they potentially introduce a data - race. However, they are typically valid in the context of an optimizing compiler that targets a specific - machine with well-defined semantics for data races. They would be invalid for a hypothetical machine that - is not tolerant of races or provides hardware race detection. - - -Footnotes - -14) The execution can usually be viewed as an interleaving of all of the threads. However, some kinds of - atomic operations, for example, allow executions inconsistent with a simple interleaving as described - below. - - -15) The ''carries a dependency'' relation is a subset of the ''sequenced before'' relation, and is similarly - strictly intra-thread. 
- - -16) The ''dependency-ordered before'' relation is analogous to the ''synchronizes with'' relation, but uses - release/consume in place of release/acquire. - - -Contents - -5.2 Environmental considerations - - -Contents - -5.2.1 Character sets - - -1 - Two sets of characters and their associated collating sequences shall be defined: the set in - which source files are written (the source character set), and the set interpreted in the - execution environment (the execution character set). Each set is further divided into a - basic character set, whose contents are given by this subclause, and a set of zero or more - locale-specific members (which are not members of the basic character set) called - extended characters. The combined set is also called the extended character set. The - values of the members of the execution character set are implementation-defined. - -2 - In a character constant or string literal, members of the execution character set shall be - represented by corresponding members of the source character set or by escape - sequences consisting of the backslash \ followed by one or more characters. A byte with - all bits set to 0, called the null character, shall exist in the basic execution character set; it - is used to terminate a character string. - -3 - Both the basic source and basic execution character sets shall have the following - members: the 26 uppercase letters of the Latin alphabet - - - A B C D E F G H I J K L M - N O P Q R S T U V W X Y Z - - - the 26 lowercase letters of the Latin alphabet - - - a b c d e f g h i j k l m - n o p q r s t u v w x y z - - - the 10 decimal digits - - - 0 1 2 3 4 5 6 7 8 9 - - - the following 29 graphic characters - - - ! " # % & ' ( ) * + , - . / : - ; < = > ? [ \ ] ^ _ { | } ~ - - - the space character, and control characters representing horizontal tab, vertical tab, and - form feed. The representation of each member of the source and execution basic - character sets shall fit in a byte. 
In both the source and execution basic character sets, the - value of each character after 0 in the above list of decimal digits shall be one greater than - the value of the previous. In source files, there shall be some way of indicating the end of - each line of text; this International Standard treats such an end-of-line indicator as if it - were a single new-line character. In the basic execution character set, there shall be - control characters representing alert, backspace, carriage return, and new line. If any - other characters are encountered in a source file (except in an identifier, a character - constant, a string literal, a header name, a comment, or a preprocessing token that is never - - converted to a token), the behavior is undefined. - -4 - A letter is an uppercase letter or a lowercase letter as defined above; in this International - Standard the term does not include other characters that are letters in other alphabets. - -5 - The universal character name construct provides a way to name other characters. - - Forward references: universal character names (6.4.3), character constants (6.4.4.4), - preprocessing directives (6.10), string literals (6.4.5), comments (6.4.9), string (7.1.1). - - -Contents - -5.2.1.1 Trigraph sequences - - -1 - Before any other processing takes place, each occurrence of one of the following - sequences of three characters (called trigraph sequences17)) is replaced with the - corresponding single character. - - - ??= # ??) ] ??! | - ??( [ ??' ^ ??> } - ??/ \ ??< { ??- ~ - - - No other trigraph sequences exist. Each ? that does not begin one of the trigraphs listed - above is not changed. - -2 - EXAMPLE 1 - - - ??=define arraycheck(a, b) a??(b??) ??!??! b??(a??) 
- - - becomes - - - #define arraycheck(a, b) a[b] || b[a] - - - - -3 - EXAMPLE 2 The following source line - - - printf("Eh???/n"); - - - becomes (after replacement of the trigraph sequence ??/) - - - printf("Eh?\n"); - - - - - -Footnotes - -17) The trigraph sequences enable the input of characters that are not defined in the Invariant Code Set as - described in ISO/IEC 646, which is a subset of the seven-bit US ASCII code set. - - -Contents - -5.2.1.2 Multibyte characters - - -1 - The source character set may contain multibyte characters, used to represent members of - the extended character set. The execution character set may also contain multibyte - characters, which need not have the same encoding as for the source character set. For - both character sets, the following shall hold: - - -- The basic character set shall be present and each character shall be encoded as a - single byte. - -- The presence, meaning, and representation of any additional members is locale- - specific. - - - -- A multibyte character set may have a state-dependent encoding, wherein each - sequence of multibyte characters begins in an initial shift state and enters other - locale-specific shift states when specific multibyte characters are encountered in the - sequence. While in the initial shift state, all single-byte characters retain their usual - interpretation and do not alter the shift state. The interpretation for subsequent bytes - in the sequence is a function of the current shift state. - -- A byte with all bits zero shall be interpreted as a null character independent of shift - state. Such a byte shall not occur as part of any other multibyte character. - - -2 - For source files, the following shall hold: - - -- An identifier, comment, string literal, character constant, or header name shall begin - and end in the initial shift state. - -- An identifier, comment, string literal, character constant, or header name shall consist - of a sequence of valid multibyte characters. 
- - -Contents - -5.2.2 Character display semantics - - -1 - The active position is that location on a display device where the next character output by - the fputc function would appear. The intent of writing a printing character (as defined - by the isprint function) to a display device is to display a graphic representation of - that character at the active position and then advance the active position to the next - position on the current line. The direction of writing is locale-specific. If the active - position is at the final position of a line (if there is one), the behavior of the display device - is unspecified. - -2 - Alphabetic escape sequences representing nongraphic characters in the execution - character set are intended to produce actions on display devices as follows: - - \a (alert) Produces an audible or visible alert without changing the active position. - \b (backspace) Moves the active position to the previous position on the current line. If - the active position is at the initial position of a line, the behavior of the display - device is unspecified. - \f (form feed) Moves the active position to the initial position at the start of the next - logical page. - \n (new line) Moves the active position to the initial position of the next line. - \r (carriage return) Moves the active position to the initial position of the current line. - \t (horizontal tab) Moves the active position to the next horizontal tabulation position - on the current line. If the active position is at or past the last defined horizontal - tabulation position, the behavior of the display device is unspecified. - \v (vertical tab) Moves the active position to the initial position of the next vertical - - tabulation position. If the active position is at or past the last defined vertical - tabulation position, the behavior of the display device is unspecified. 
- - -3 - Each of these escape sequences shall produce a unique implementation-defined value - which can be stored in a single char object. The external representations in a text file - need not be identical to the internal representations, and are outside the scope of this - International Standard. - - Forward references: the isprint function (7.4.1.8), the fputc function (7.21.7.3). - - -Contents - -5.2.3 Signals and interrupts - - -1 - Functions shall be implemented such that they may be interrupted at any time by a signal, - or may be called by a signal handler, or both, with no alteration to earlier, but still active, - invocations' control flow (after the interruption), function return values, or objects with - automatic storage duration. All such objects shall be maintained outside the function - image (the instructions that compose the executable representation of a function) on a - per-invocation basis. - - -Contents - -5.2.4 Environmental limits - - -1 - Both the translation and execution environments constrain the implementation of - language translators and libraries. The following summarizes the language-related - environmental limits on a conforming implementation; the library-related limits are - discussed in clause 7. 
- - -Contents - -5.2.4.1 Translation limits - - -1 - The implementation shall be able to translate and execute at least one program that - contains at least one instance of every one of the following limits:18) - - -- 127 nesting levels of blocks - -- 63 nesting levels of conditional inclusion - -- 12 pointer, array, and function declarators (in any combinations) modifying an - arithmetic, structure, union, or void type in a declaration - -- 63 nesting levels of parenthesized declarators within a full declarator - -- 63 nesting levels of parenthesized expressions within a full expression - -- 63 significant initial characters in an internal identifier or a macro name (each - universal character name or extended source character is considered a single - character) - -- 31 significant initial characters in an external identifier (each universal character name - specifying a short identifier of 0000FFFF or less is considered 6 characters, each - - universal character name specifying a short identifier of 00010000 or more is - considered 10 characters, and each extended source character is considered the same - number of characters as the corresponding universal character name, if any)19) - -- 4095 external identifiers in one translation unit - -- 511 identifiers with block scope declared in one block - -- 4095 macro identifiers simultaneously defined in one preprocessing translation unit - -- 127 parameters in one function definition - -- 127 arguments in one function call - -- 127 parameters in one macro definition - -- 127 arguments in one macro invocation - -- 4095 characters in a logical source line - -- 4095 characters in a string literal (after concatenation) - -- 65535 bytes in an object (in a hosted environment only) - -- 15 nesting levels for #included files - -- 1023 case labels for a switch statement (excluding those for any nested switch - statements) - -- 1023 members in a single structure or union - -- 1023 enumeration constants in a single enumeration - 
-- 63 levels of nested structure or union definitions in a single struct-declaration-list - - -Footnotes - -18) Implementations should avoid imposing fixed translation limits whenever possible. - - -19) See ''future language directions'' (6.11.3). - - -Contents - -5.2.4.2 Numerical limits - - -1 - An implementation is required to document all the limits specified in this subclause, - which are specified in the headers <limits.h> and <float.h>. Additional limits are - specified in <stdint.h>. - - Forward references: integer types <stdint.h> (7.20). - - -Contents - -5.2.4.2.1 Sizes of integer types <limits.h> - - -1 - The values given below shall be replaced by constant expressions suitable for use in #if - preprocessing directives. Moreover, except for CHAR_BIT and MB_LEN_MAX, the - following shall be replaced by expressions that have the same type as would an - expression that is an object of the corresponding type converted according to the integer - promotions. Their implementation-defined values shall be equal or greater in magnitude - - - - (absolute value) to those shown, with the same sign. 
- - -- number of bits for smallest object that is not a bit-field (byte) - - - CHAR_BIT 8 - - -- minimum value for an object of type signed char - - - SCHAR_MIN -127 // -(2^{7} - 1) - - -- maximum value for an object of type signed char - - - SCHAR_MAX +127 // 2^{7} - 1 - - -- maximum value for an object of type unsigned char - - - UCHAR_MAX 255 // 2^{8} - 1 - - -- minimum value for an object of type char - - - CHAR_MIN see below - - -- maximum value for an object of type char - - - CHAR_MAX see below - - -- maximum number of bytes in a multibyte character, for any supported locale - - - MB_LEN_MAX 1 - - -- minimum value for an object of type short int - - - SHRT_MIN -32767 // -(2^{15} - 1) - - -- maximum value for an object of type short int - - - SHRT_MAX +32767 // 2^{15} - 1 - - -- maximum value for an object of type unsigned short int - - - USHRT_MAX 65535 // 2^{16} - 1 - - -- minimum value for an object of type int - - - INT_MIN -32767 // -(2^{15} - 1) - - -- maximum value for an object of type int - - - INT_MAX +32767 // 2^{15} - 1 - - -- maximum value for an object of type unsigned int - - - UINT_MAX 65535 // 2^{16} - 1 - - -- minimum value for an object of type long int - - - LONG_MIN -2147483647 // -(2^{31} - 1) - - -- maximum value for an object of type long int - - - LONG_MAX +2147483647 // 2^{31} - 1 - - -- maximum value for an object of type unsigned long int - - - ULONG_MAX 4294967295 // 2^{32} - 1 - - -- minimum value for an object of type long long int - - - LLONG_MIN -9223372036854775807 // -(2^{63} - 1) - - -- maximum value for an object of type long long int - - - LLONG_MAX +9223372036854775807 // 2^{63} - 1 - - -- maximum value for an object of type unsigned long long int - - - ULLONG_MAX 18446744073709551615 // 2^{64} - 1 - - -2 - If the value of an object of type char is treated as a signed integer when used in an - expression, the value of CHAR_MIN shall be the same as that of SCHAR_MIN and the - value of CHAR_MAX shall be the same as that of SCHAR_MAX. 
Otherwise, the value of - CHAR_MIN shall be 0 and the value of CHAR_MAX shall be the same as that of - UCHAR_MAX.20) The value UCHAR_MAX shall equal $2^{\text{CHAR\_BIT}} - 1$. - - Forward references: representations of types (6.2.6), conditional inclusion (6.10.1). - - -Footnotes - -20) See 6.2.5. - - -Contents - -5.2.4.2.2 Characteristics of floating types <float.h> - - -1 - The characteristics of floating types are defined in terms of a model that describes a - representation of floating-point numbers and values that provide information about an - implementation's floating-point arithmetic.21) The following parameters are used to - define the model for each floating-point type: - - - $s$ sign ($\pm 1$) - $b$ base or radix of exponent representation (an integer > 1) - $e$ exponent (an integer between a minimum $e_{\min}$ and a maximum $e_{\max}$) - $p$ precision (the number of base-$b$ digits in the significand) - $f_k$ nonnegative integers less than $b$ (the significand digits) - - -2 - A floating-point number ($x$) is defined by the following model: - - - $x = s\,b^{e} \sum_{k=1}^{p} f_k\,b^{-k}, \quad e_{\min} \le e \le e_{\max}$ - - - - -3 - In addition to normalized floating-point numbers ($f_1 > 0$ if $x \neq 0$), floating types may be - able to contain other kinds of floating-point numbers, such as subnormal floating-point - numbers ($x \neq 0$, $e = e_{\min}$, $f_1 = 0$) and unnormalized floating-point numbers ($x \neq 0$, - $e > e_{\min}$, $f_1 = 0$), and values that are not floating-point numbers, such as infinities and - NaNs. A NaN is an encoding signifying Not-a-Number. A quiet NaN propagates - through almost every arithmetic operation without raising a floating-point exception; a - signaling NaN generally raises a floating-point exception when occurring as an - - - - arithmetic operand.22) - -4 - An implementation may give zero and values that are not floating-point numbers (such as - infinities and NaNs) a sign or may leave them unsigned. 
Wherever such values are - unsigned, any requirement in this International Standard to retrieve the sign shall produce - an unspecified sign, and any requirement to set the sign shall be ignored. - -5 - The minimum range of representable values for a floating type is the most negative finite - floating-point number representable in that type through the most positive finite floating- - point number representable in that type. In addition, if negative infinity is representable - in a type, the range of that type is extended to all negative real numbers; likewise, if - positive infinity is representable in a type, the range of that type is extended to all positive - real numbers. - -6 - The accuracy of the floating-point operations (+, -, *, /) and of the library functions in - <math.h> and <complex.h> that return floating-point results is implementation- - defined, as is the accuracy of the conversion between floating-point internal - representations and string representations performed by the library functions in - <stdio.h>, <stdlib.h>, and <wchar.h>. The implementation may state that the - accuracy is unknown. - -7 - All integer values in the <float.h> header, except FLT_ROUNDS, shall be constant - expressions suitable for use in #if preprocessing directives; all floating values shall be - constant expressions. All except DECIMAL_DIG, FLT_EVAL_METHOD, FLT_RADIX, - and FLT_ROUNDS have separate names for all three floating-point types. The floating-point - model representation is provided for all values except FLT_EVAL_METHOD and - FLT_ROUNDS. - -8 - The rounding mode for floating-point addition is characterized by the implementation- - defined value of FLT_ROUNDS:23) - - - -1 indeterminable - 0 toward zero - 1 to nearest - 2 toward positive infinity - 3 toward negative infinity - - - All other values for FLT_ROUNDS characterize implementation-defined rounding - behavior. 
- - - - -9 - Except for assignment and cast (which remove all extra range and precision), the values - yielded by operators with floating operands and values subject to the usual arithmetic - conversions and of floating constants are evaluated to a format whose range and precision - may be greater than required by the type. The use of evaluation formats is characterized - by the implementation-defined value of FLT_EVAL_METHOD:24) - - - -1 indeterminable; - 0 evaluate all operations and constants just to the range and precision of the - type; - 1 evaluate operations and constants of type float and double to the - range and precision of the double type, evaluate long double - operations and constants to the range and precision of the long double - type; - 2 evaluate all operations and constants to the range and precision of the - long double type. - - - All other negative values for FLT_EVAL_METHOD characterize implementation-defined - behavior. - -10 - The presence or absence of subnormal numbers is characterized by the implementation- - defined values of FLT_HAS_SUBNORM, DBL_HAS_SUBNORM, and - LDBL_HAS_SUBNORM: - - - -1 indeterminable25) - 0 absent26) (type does not support subnormal numbers) - 1 present (type does support subnormal numbers) - - -11 - The values given in the following list shall be replaced by constant expressions with - implementation-defined values that are greater or equal in magnitude (absolute value) to - those shown, with the same sign: - - -- radix of exponent representation, $b$ - - - FLT_RADIX 2 - - -- number of base-FLT_RADIX digits in the floating-point significand, $p$ - - - FLT_MANT_DIG - DBL_MANT_DIG - LDBL_MANT_DIG - - -- number of decimal digits, $n$, such that any floating-point number with $p$ radix $b$ digits - can be rounded to a floating-point number with $n$ decimal digits and back again - without change to the value, - - - $\begin{cases} p \log_{10} b & \text{if } b \text{ is a power of 10} \\ \lceil 1 + p \log_{10} b \rceil & \text{otherwise} \end{cases}$ - - - FLT_DECIMAL_DIG 6 - DBL_DECIMAL_DIG 10 - 
LDBL_DECIMAL_DIG 10 - - -- number of decimal digits, $n$, such that any floating-point number in the widest - supported floating type with $p_{\max}$ radix $b$ digits can be rounded to a floating-point - number with $n$ decimal digits and back again without change to the value, - - - $\begin{cases} p_{\max} \log_{10} b & \text{if } b \text{ is a power of 10} \\ \lceil 1 + p_{\max} \log_{10} b \rceil & \text{otherwise} \end{cases}$ - - - DECIMAL_DIG 10 - - -- number of decimal digits, $q$, such that any floating-point number with $q$ decimal digits - can be rounded into a floating-point number with $p$ radix $b$ digits and back again - without change to the $q$ decimal digits, - - - $\begin{cases} p \log_{10} b & \text{if } b \text{ is a power of 10} \\ \lfloor (p - 1) \log_{10} b \rfloor & \text{otherwise} \end{cases}$ - - FLT_DIG 6 - DBL_DIG 10 - LDBL_DIG 10 - - -- minimum negative integer such that FLT_RADIX raised to one less than that power is - a normalized floating-point number, $e_{\min}$ - - - FLT_MIN_EXP - DBL_MIN_EXP - LDBL_MIN_EXP - - -- minimum negative integer such that 10 raised to that power is in the range of - normalized floating-point numbers, $\lceil \log_{10} b^{e_{\min}-1} \rceil$ - - - FLT_MIN_10_EXP -37 - DBL_MIN_10_EXP -37 - LDBL_MIN_10_EXP -37 - - -- maximum integer such that FLT_RADIX raised to one less than that power is a - representable finite floating-point number, $e_{\max}$ - - - FLT_MAX_EXP - DBL_MAX_EXP - LDBL_MAX_EXP - - -- maximum integer such that 10 raised to that power is in the range of representable - finite floating-point numbers, $\lfloor \log_{10}((1 - b^{-p})\,b^{e_{\max}}) \rfloor$ - - - FLT_MAX_10_EXP +37 - DBL_MAX_10_EXP +37 - LDBL_MAX_10_EXP +37 - - -12 - The values given in the following list shall be replaced by constant expressions with - implementation-defined values that are greater than or equal to those shown: - - -- maximum representable finite floating-point number, $(1 - b^{-p})\,b^{e_{\max}}$ - - - FLT_MAX 1E+37 - DBL_MAX 1E+37 - LDBL_MAX 1E+37 - - -13 - The values given in the following list shall be replaced by constant expressions with - implementation-defined (positive) values that are less than or equal to those shown: - - -- the 
difference between 1 and the least value greater than 1 that is representable in the - given floating point type, $b^{1-p}$ - - - FLT_EPSILON 1E-5 - DBL_EPSILON 1E-9 - LDBL_EPSILON 1E-9 - - -- minimum normalized positive floating-point number, $b^{e_{\min}-1}$ - - - FLT_MIN 1E-37 - DBL_MIN 1E-37 - LDBL_MIN 1E-37 - - -- minimum positive floating-point number27) - FLT_TRUE_MIN 1E-37 - DBL_TRUE_MIN 1E-37 - LDBL_TRUE_MIN 1E-37 - - -Recommended practice - -14 - Conversion from (at least) double to decimal with DECIMAL_DIG digits and back - should be the identity function. - -15 - EXAMPLE 1 The following describes an artificial floating-point representation that meets the minimum - requirements of this International Standard, and the appropriate values in a <float.h> header for type - float: - - - $x = s\,16^{e} \sum_{k=1}^{6} f_k\,16^{-k}, \quad -31 \le e \le +32$ - - - FLT_RADIX 16 - FLT_MANT_DIG 6 - FLT_EPSILON 9.53674316E-07F - FLT_DECIMAL_DIG 9 - FLT_DIG 6 - FLT_MIN_EXP -31 - FLT_MIN 2.93873588E-39F - FLT_MIN_10_EXP -38 - FLT_MAX_EXP +32 - FLT_MAX 3.40282347E+38F - FLT_MAX_10_EXP +38 - - - - -16 - EXAMPLE 2 The following describes floating-point representations that also meet the requirements for - single-precision and double-precision numbers in IEC 60559,28) and the appropriate values in a - <float.h> header for types float and double: - - - $x_f = s\,2^{e} \sum_{k=1}^{24} f_k\,2^{-k}, \quad -125 \le e \le +128$ - - - $x_d = s\,2^{e} \sum_{k=1}^{53} f_k\,2^{-k}, \quad -1021 \le e \le +1024$ - - - FLT_RADIX 2 - DECIMAL_DIG 17 - FLT_MANT_DIG 24 - FLT_EPSILON 1.19209290E-07F // decimal constant - FLT_EPSILON 0X1P-23F // hex constant - FLT_DECIMAL_DIG 9 - - - - - - - FLT_DIG 6 - FLT_MIN_EXP -125 - FLT_MIN 1.17549435E-38F // decimal constant - FLT_MIN 0X1P-126F // hex constant - FLT_TRUE_MIN 1.40129846E-45F // decimal constant - FLT_TRUE_MIN 0X1P-149F // hex constant - FLT_HAS_SUBNORM 1 - FLT_MIN_10_EXP -37 - FLT_MAX_EXP +128 - FLT_MAX 3.40282347E+38F // decimal constant - FLT_MAX 0X1.fffffeP127F // hex constant - FLT_MAX_10_EXP +38 - 
DBL_MANT_DIG 53 - DBL_EPSILON 2.2204460492503131E-16 // decimal constant - DBL_EPSILON 0X1P-52 // hex constant - DBL_DECIMAL_DIG 17 - DBL_DIG 15 - DBL_MIN_EXP -1021 - DBL_MIN 2.2250738585072014E-308 // decimal constant - DBL_MIN 0X1P-1022 // hex constant - DBL_TRUE_MIN 4.9406564584124654E-324 // decimal constant - DBL_TRUE_MIN 0X1P-1074 // hex constant - DBL_HAS_SUBNORM 1 - DBL_MIN_10_EXP -307 - DBL_MAX_EXP +1024 - DBL_MAX 1.7976931348623157E+308 // decimal constant - DBL_MAX 0X1.fffffffffffffP1023 // hex constant - DBL_MAX_10_EXP +308 - - - If a type wider than double were supported, then DECIMAL_DIG would be greater than 17. For - example, if the widest type were to use the minimal-width IEC 60559 double-extended format (64 bits of - precision), then DECIMAL_DIG would be 21. - - - Forward references: conditional inclusion (6.10.1), complex arithmetic - <complex.h> (7.3), extended multibyte and wide character utilities <wchar.h> - (7.29), floating-point environment <fenv.h> (7.6), general utilities <stdlib.h> - (7.22), input/output <stdio.h> (7.21), mathematics <math.h> (7.12). - - -Footnotes - -21) The floating-point model is intended to clarify the description of each floating-point characteristic and - does not require the floating-point arithmetic of the implementation to be identical. - - -22) IEC 60559:1989 specifies quiet and signaling NaNs. For implementations that do not support - IEC 60559:1989, the terms quiet NaN and signaling NaN are intended to apply to encodings with - similar behavior. - - -23) Evaluation of FLT_ROUNDS correctly reflects any execution-time change of rounding mode through - the function fesetround in <fenv.h>. - - -24) The evaluation method determines evaluation formats of expressions involving all floating types, not - just real types. For example, if FLT_EVAL_METHOD is 1, then the product of two float - _Complex operands is represented in the double _Complex format, and its parts are evaluated to - double. 
- - -25) Characterization as indeterminable is intended if floating-point operations do not consistently interpret - subnormal representations as zero, nor as nonzero. - - -26) Characterization as absent is intended if no floating-point operations produce subnormal results from - non-subnormal inputs, even if the type format includes representations of subnormal numbers. - - -27) If the presence or absence of subnormal numbers is indeterminable, then the value is intended to be a - positive number no greater than the minimum normalized positive number for the type. - - -28) The floating-point model in that standard sums powers of b from zero, so the values of the exponent - limits are one less than shown here. - - -Contents diff --git a/doc/std/CHAPTER-6.txt b/doc/std/CHAPTER-6.txt @@ -1,9476 +0,0 @@ -6. Language - - -Contents - -6.1 Notation - - -1 - In the syntax notation used in this clause, syntactic categories (nonterminals) are - indicated by italic type, and literal words and character set members (terminals) by bold - type. A colon (:) following a nonterminal introduces its definition. Alternative - definitions are listed on separate lines, except when prefaced by the words ''one of''. An - optional symbol is indicated by the subscript ''opt'', so that - - - { expressionopt } - - - indicates an optional expression enclosed in braces. - -2 - When syntactic categories are referred to in the main text, they are not italicized and - words are separated by spaces instead of hyphens. - -3 - A summary of the language syntax is given in annex A. - - -Contents - -6.2 Concepts - - -Contents - -6.2.1 Scopes of identifiers - - -1 - An identifier can denote an object; a function; a tag or a member of a structure, union, or - enumeration; a typedef name; a label name; a macro name; or a macro parameter. The - same identifier can denote different entities at different points in the program. A member - of an enumeration is called an enumeration constant. 
Macro names and macro - parameters are not considered further here, because prior to the semantic phase of - program translation any occurrences of macro names in the source file are replaced by the - preprocessing token sequences that constitute their macro definitions. - -2 - For each different entity that an identifier designates, the identifier is visible (i.e., can be - used) only within a region of program text called its scope. Different entities designated - by the same identifier either have different scopes, or are in different name spaces. There - are four kinds of scopes: function, file, block, and function prototype. (A function - prototype is a declaration of a function that declares the types of its parameters.) - -3 - A label name is the only kind of identifier that has function scope. It can be used (in a - goto statement) anywhere in the function in which it appears, and is declared implicitly - by its syntactic appearance (followed by a : and a statement). - -4 - Every other identifier has scope determined by the placement of its declaration (in a - declarator or type specifier). If the declarator or type specifier that declares the identifier - appears outside of any block or list of parameters, the identifier has file scope, which - terminates at the end of the translation unit. If the declarator or type specifier that - declares the identifier appears inside a block or within the list of parameter declarations in - a function definition, the identifier has block scope, which terminates at the end of the - associated block. If the declarator or type specifier that declares the identifier appears - - within the list of parameter declarations in a function prototype (not part of a function - definition), the identifier has function prototype scope, which terminates at the end of the - function declarator. If an identifier designates two different entities in the same name - space, the scopes might overlap. 
If so, the scope of one entity (the inner scope) will end - strictly before the scope of the other entity (the outer scope). Within the inner scope, the - identifier designates the entity declared in the inner scope; the entity declared in the outer - scope is hidden (and not visible) within the inner scope. - -5 - Unless explicitly stated otherwise, where this International Standard uses the term - ''identifier'' to refer to some entity (as opposed to the syntactic construct), it refers to the - entity in the relevant name space whose declaration is visible at the point the identifier - occurs. - -6 - Two identifiers have the same scope if and only if their scopes terminate at the same - point. - -7 - Structure, union, and enumeration tags have scope that begins just after the appearance of - the tag in a type specifier that declares the tag. Each enumeration constant has scope that - begins just after the appearance of its defining enumerator in an enumerator list. Any - other identifier has scope that begins just after the completion of its declarator. - -8 - As a special case, a type name (which is not a declaration of an identifier) is considered to - have a scope that begins just after the place within the type name where the omitted - identifier would appear were it not omitted. - - Forward references: declarations (6.7), function calls (6.5.2.2), function definitions - (6.9.1), identifiers (6.4.2), macro replacement (6.10.3), name spaces of identifiers (6.2.3), - source file inclusion (6.10.2), statements (6.8). - - -Contents - -6.2.2 Linkages of identifiers - - -1 - An identifier declared in different scopes or in the same scope more than once can be - made to refer to the same object or function by a process called linkage.29) There are - three kinds of linkage: external, internal, and none. 
- -2 - In the set of translation units and libraries that constitutes an entire program, each - declaration of a particular identifier with external linkage denotes the same object or - function. Within one translation unit, each declaration of an identifier with internal - linkage denotes the same object or function. Each declaration of an identifier with no - linkage denotes a unique entity. - -3 - If the declaration of a file scope identifier for an object or a function contains the storage- - class specifier static, the identifier has internal linkage.30) - - - - - -4 - For an identifier declared with the storage-class specifier extern in a scope in which a - prior declaration of that identifier is visible,31) if the prior declaration specifies internal or - external linkage, the linkage of the identifier at the later declaration is the same as the - linkage specified at the prior declaration. If no prior declaration is visible, or if the prior - declaration specifies no linkage, then the identifier has external linkage. - -5 - If the declaration of an identifier for a function has no storage-class specifier, its linkage - is determined exactly as if it were declared with the storage-class specifier extern. If - the declaration of an identifier for an object has file scope and no storage-class specifier, - its linkage is external. - -6 - The following identifiers have no linkage: an identifier declared to be anything other than - an object or a function; an identifier declared to be a function parameter; a block scope - identifier for an object declared without the storage-class specifier extern. - -7 - If, within a translation unit, the same identifier appears with both internal and external - linkage, the behavior is undefined. - - Forward references: declarations (6.7), expressions (6.5), external definitions (6.9), - statements (6.8). - - -Footnotes - -29) There is no linkage between different identifiers. 
- - -30) A function declaration can contain the storage-class specifier static only if it is at file scope; see - 6.7.1. - - -31) As specified in 6.2.1, the later declaration might hide the prior declaration. - - -Contents - -6.2.3 Name spaces of identifiers - - -1 - If more than one declaration of a particular identifier is visible at any point in a - translation unit, the syntactic context disambiguates uses that refer to different entities. - Thus, there are separate name spaces for various categories of identifiers, as follows: - - -- label names (disambiguated by the syntax of the label declaration and use); - -- the tags of structures, unions, and enumerations (disambiguated by following any32) - of the keywords struct, union, or enum); - -- the members of structures or unions; each structure or union has a separate name - space for its members (disambiguated by the type of the expression used to access the - member via the . or -> operator); - -- all other identifiers, called ordinary identifiers (declared in ordinary declarators or as - enumeration constants). - - - Forward references: enumeration specifiers (6.7.2.2), labeled statements (6.8.1), - structure and union specifiers (6.7.2.1), structure and union members (6.5.2.3), tags - (6.7.2.3), the goto statement (6.8.6.1). - - - -Footnotes - -32) There is only one name space for tags even though three are possible. - - -Contents - -6.2.4 Storage durations of objects - - -1 - An object has a storage duration that determines its lifetime. There are four storage - durations: static, thread, automatic, and allocated. Allocated storage is described in - 7.22.3. - -2 - The lifetime of an object is the portion of program execution during which storage is - guaranteed to be reserved for it. An object exists, has a constant address,33) and retains - its last-stored value throughout its lifetime.34) If an object is referred to outside of its - lifetime, the behavior is undefined. 
The value of a pointer becomes indeterminate when - the object it points to (or just past) reaches the end of its lifetime. - -3 - An object whose identifier is declared without the storage-class specifier - _Thread_local, and either with external or internal linkage or with the storage-class - specifier static, has static storage duration. Its lifetime is the entire execution of the - program and its stored value is initialized only once, prior to program startup. - -4 - An object whose identifier is declared with the storage-class specifier _Thread_local - has thread storage duration. Its lifetime is the entire execution of the thread for which it - is created, and its stored value is initialized when the thread is started. There is a distinct - object per thread, and use of the declared name in an expression refers to the object - associated with the thread evaluating the expression. The result of attempting to - indirectly access an object with thread storage duration from a thread other than the one - with which the object is associated is implementation-defined. - -5 - An object whose identifier is declared with no linkage and without the storage-class - specifier static has automatic storage duration, as do some compound literals. The - result of attempting to indirectly access an object with automatic storage duration from a - thread other than the one with which the object is associated is implementation-defined. - -6 - For such an object that does not have a variable length array type, its lifetime extends - from entry into the block with which it is associated until execution of that block ends in - any way. (Entering an enclosed block or calling a function suspends, but does not end, - execution of the current block.) If the block is entered recursively, a new instance of the - object is created each time. The initial value of the object is indeterminate. 
If an - initialization is specified for the object, it is performed each time the declaration or - compound literal is reached in the execution of the block; otherwise, the value becomes - indeterminate each time the declaration is reached. - - - - - -7 - For such an object that does have a variable length array type, its lifetime extends from - the declaration of the object until execution of the program leaves the scope of the - declaration.35) If the scope is entered recursively, a new instance of the object is created - each time. The initial value of the object is indeterminate. - -8 - A non-lvalue expression with structure or union type, where the structure or union - contains a member with array type (including, recursively, members of all contained - structures and unions) refers to an object with automatic storage duration and temporary - lifetime.36) Its lifetime begins when the expression is evaluated and its initial value is the - value of the expression. Its lifetime ends when the evaluation of the containing full - expression or full declarator ends. Any attempt to modify an object with temporary - lifetime results in undefined behavior. - - Forward references: array declarators (6.7.6.2), compound literals (6.5.2.5), declarators - (6.7.6), function calls (6.5.2.2), initialization (6.7.9), statements (6.8). - - -Footnotes - -33) The term ''constant address'' means that two pointers to the object constructed at possibly different - times will compare equal. The address may be different during two different executions of the same - program. - - -34) In the case of a volatile object, the last store need not be explicit in the program. - - -35) Leaving the innermost block containing the declaration, or jumping to a point in that block or an - embedded block prior to the declaration, leaves the scope of the declaration. - - -36) The address of such an object is taken implicitly when an array member is accessed. 
- - -Contents - -6.2.5 Types - - -1 - The meaning of a value stored in an object or returned by a function is determined by the - type of the expression used to access it. (An identifier declared to be an object is the - simplest such expression; the type is specified in the declaration of the identifier.) Types - are partitioned into object types (types that describe objects) and function types (types - that describe functions). At various points within a translation unit an object type may be - incomplete (lacking sufficient information to determine the size of objects of that type) or - complete (having sufficient information).37) - -2 - An object declared as type _Bool is large enough to store the values 0 and 1. - -3 - An object declared as type char is large enough to store any member of the basic - execution character set. If a member of the basic execution character set is stored in a - char object, its value is guaranteed to be nonnegative. If any other character is stored in - a char object, the resulting value is implementation-defined but shall be within the range - of values that can be represented in that type. - -4 - There are five standard signed integer types, designated as signed char, short - int, int, long int, and long long int. (These and other types may be - designated in several additional ways, as described in 6.7.2.) There may also be - implementation-defined extended signed integer types.38) The standard and extended - signed integer types are collectively called signed integer types.39) - - - -5 - An object declared as type signed char occupies the same amount of storage as a - ''plain'' char object. A ''plain'' int object has the natural size suggested by the - architecture of the execution environment (large enough to contain any value in the range - INT_MIN to INT_MAX as defined in the header <limits.h>). 
- -6 - For each of the signed integer types, there is a corresponding (but different) unsigned - integer type (designated with the keyword unsigned) that uses the same amount of - storage (including sign information) and has the same alignment requirements. The type - _Bool and the unsigned integer types that correspond to the standard signed integer - types are the standard unsigned integer types. The unsigned integer types that - correspond to the extended signed integer types are the extended unsigned integer types. - The standard and extended unsigned integer types are collectively called unsigned integer - types.40) - -7 - The standard signed integer types and standard unsigned integer types are collectively - called the standard integer types, the extended signed integer types and extended - unsigned integer types are collectively called the extended integer types. - -8 - For any two integer types with the same signedness and different integer conversion rank - (see 6.3.1.1), the range of values of the type with smaller integer conversion rank is a - subrange of the values of the other type. - -9 - The range of nonnegative values of a signed integer type is a subrange of the - corresponding unsigned integer type, and the representation of the same value in each - type is the same.41) A computation involving unsigned operands can never overflow, - because a result that cannot be represented by the resulting unsigned integer type is - reduced modulo the number that is one greater than the largest value that can be - represented by the resulting type. - -10 - There are three real floating types, designated as float, double, and long - double.42) The set of values of the type float is a subset of the set of values of the - type double; the set of values of the type double is a subset of the set of values of the - type long double. 
- - - - -11 - There are three complex types, designated as float _Complex, double - _Complex, and long double _Complex.43) (Complex types are a conditional - feature that implementations need not support; see 6.10.8.3.) The real floating and - complex types are collectively called the floating types. - -12 - For each floating type there is a corresponding real type, which is always a real floating - type. For real floating types, it is the same type. For complex types, it is the type given - by deleting the keyword _Complex from the type name. - -13 - Each complex type has the same representation and alignment requirements as an array - type containing exactly two elements of the corresponding real type; the first element is - equal to the real part, and the second element to the imaginary part, of the complex - number. - -14 - The type char, the signed and unsigned integer types, and the floating types are - collectively called the basic types. The basic types are complete object types. Even if the - implementation defines two or more basic types to have the same representation, they are - nevertheless different types.44) - -15 - The three types char, signed char, and unsigned char are collectively called - the character types. The implementation shall define char to have the same range, - representation, and behavior as either signed char or unsigned char.45) - -16 - An enumeration comprises a set of named integer constant values. Each distinct - enumeration constitutes a different enumerated type. - -17 - The type char, the signed and unsigned integer types, and the enumerated types are - collectively called integer types. The integer and real floating types are collectively called - real types. - -18 - Integer and floating types are collectively called arithmetic types. Each arithmetic type - belongs to one type domain: the real type domain comprises the real types, the complex - type domain comprises the complex types. 
- -19 - The void type comprises an empty set of values; it is an incomplete object type that - cannot be completed. - - - - - -20 - Any number of derived types can be constructed from the object and function types, as - follows: - - -- An array type describes a contiguously allocated nonempty set of objects with a - particular member object type, called the element type. The element type shall be - complete whenever the array type is specified. Array types are characterized by their - element type and by the number of elements in the array. An array type is said to be - derived from its element type, and if its element type is T , the array type is sometimes - called ''array of T ''. The construction of an array type from an element type is called - ''array type derivation''. - -- A structure type describes a sequentially allocated nonempty set of member objects - (and, in certain circumstances, an incomplete array), each of which has an optionally - specified name and possibly distinct type. - -- A union type describes an overlapping nonempty set of member objects, each of - which has an optionally specified name and possibly distinct type. - -- A function type describes a function with specified return type. A function type is - characterized by its return type and the number and types of its parameters. A - function type is said to be derived from its return type, and if its return type is T , the - function type is sometimes called ''function returning T ''. The construction of a - function type from a return type is called ''function type derivation''. - -- A pointer type may be derived from a function type or an object type, called the - referenced type. A pointer type describes an object whose value provides a reference - to an entity of the referenced type. A pointer type derived from the referenced type T - is sometimes called ''pointer to T ''. The construction of a pointer type from a - referenced type is called ''pointer type derivation''. 
A pointer type is a complete - object type. - -- An atomic type describes the type designated by the construct _Atomic ( type- - name ). (Atomic types are a conditional feature that implementations need not - support; see 6.10.8.3.) - - These methods of constructing derived types can be applied recursively. - -21 - Arithmetic types and pointer types are collectively called scalar types. Array and - structure types are collectively called aggregate types.46) - -22 - An array type of unknown size is an incomplete type. It is completed, for an identifier of - that type, by specifying the size in a later declaration (with internal or external linkage). - A structure or union type of unknown content (as described in 6.7.2.3) is an incomplete - - - - type. It is completed, for all declarations of that type, by declaring the same structure or - union tag with its defining content later in the same scope. - -23 - A type has known constant size if the type is not incomplete and is not a variable length - array type. - -24 - Array, function, and pointer types are collectively called derived declarator types. A - declarator type derivation from a type T is the construction of a derived declarator type - from T by the application of an array-type, a function-type, or a pointer-type derivation to - T. - -25 - A type is characterized by its type category, which is either the outermost derivation of a - derived type (as noted above in the construction of derived types), or the type itself if the - type consists of no derived types. - -26 - Any type so far mentioned is an unqualified type. Each unqualified type has several - qualified versions of its type,47) corresponding to the combinations of one, two, or all - three of the const, volatile, and restrict qualifiers. 
The qualified or unqualified - versions of a type are distinct types that belong to the same type category and have the - same representation and alignment requirements.48) A derived type is not qualified by the - qualifiers (if any) of the type from which it is derived. - -27 - Further, there is the _Atomic qualifier. The presence of the _Atomic qualifier - designates an atomic type. The size, representation, and alignment of an atomic type - need not be the same as those of the corresponding unqualified type. Therefore, this - Standard explicitly uses the phrase ''atomic, qualified or unqualified type'' whenever the - atomic version of a type is permitted along with the other qualified versions of a type. - The phrase ''qualified or unqualified type'', without specific mention of atomic, does not - include the atomic types. - -28 - A pointer to void shall have the same representation and alignment requirements as a - pointer to a character type.48) Similarly, pointers to qualified or unqualified versions of - compatible types shall have the same representation and alignment requirements. All - pointers to structure types shall have the same representation and alignment requirements - as each other. All pointers to union types shall have the same representation and - alignment requirements as each other. Pointers to other types need not have the same - representation or alignment requirements. - -29 - EXAMPLE 1 The type designated as ''float *'' has type ''pointer to float''. Its type category is - pointer, not a floating type. The const-qualified version of this type is designated as ''float * const'' - whereas the type designated as ''const float *'' is not a qualified type -- its type is ''pointer to const- - - - - qualified float'' and is a pointer to a qualified type. - - -30 - EXAMPLE 2 The type designated as ''struct tag (*[5])(float)'' has type ''array of pointer to - function returning struct tag''. 
The array has length five and the function has a single parameter of type - float. Its type category is array. - - - Forward references: compatible type and composite type (6.2.7), declarations (6.7). - - -Footnotes - -37) A type may be incomplete or complete throughout an entire translation unit, or it may change states at - different points within a translation unit. - - -38) Implementation-defined keywords shall have the form of an identifier reserved for any use as - described in 7.1.3. - - -39) Therefore, any statement in this Standard about signed integer types also applies to the extended - signed integer types. - - -40) Therefore, any statement in this Standard about unsigned integer types also applies to the extended - unsigned integer types. - - -41) The same representation and alignment requirements are meant to imply interchangeability as - arguments to functions, return values from functions, and members of unions. - - -42) See ''future language directions'' (6.11.1). - - -43) A specification for imaginary types is in annex G. - - -44) An implementation may define new keywords that provide alternative ways to designate a basic (or - any other) type; this does not violate the requirement that all basic types be different. - Implementation-defined keywords shall have the form of an identifier reserved for any use as - described in 7.1.3. - - -45) CHAR_MIN, defined in <limits.h>, will have one of the values 0 or SCHAR_MIN, and this can be - used to distinguish the two options. Irrespective of the choice made, char is a separate type from the - other two and is not compatible with either. - - -46) Note that aggregate type does not include union type because an object with union type can only - contain one member at a time. - - -47) See 6.7.3 regarding qualified array and function types. 
- - -48) The same representation and alignment requirements are meant to imply interchangeability as - arguments to functions, return values from functions, and members of unions. - - -Contents - -6.2.6 Representations of types - - -Contents - -6.2.6.1 General - - -1 - The representations of all types are unspecified except as stated in this subclause. - -2 - Except for bit-fields, objects are composed of contiguous sequences of one or more bytes, - the number, order, and encoding of which are either explicitly specified or - implementation-defined. - -3 - Values stored in unsigned bit-fields and objects of type unsigned char shall be - represented using a pure binary notation.49) - -4 - Values stored in non-bit-field objects of any other object type consist of n x CHAR_BIT - bits, where n is the size of an object of that type, in bytes. The value may be copied into - an object of type unsigned char [n] (e.g., by memcpy); the resulting set of bytes is - called the object representation of the value. Values stored in bit-fields consist of m bits, - where m is the size specified for the bit-field. The object representation is the set of m - bits the bit-field comprises in the addressable storage unit holding it. Two values (other - than NaNs) with the same object representation compare equal, but values that compare - equal may have different object representations. - -5 - Certain object representations need not represent a value of the object type. If the stored - value of an object has such a representation and is read by an lvalue expression that does - not have character type, the behavior is undefined. If such a representation is produced - by a side effect that modifies all or any part of the object by an lvalue expression that - does not have character type, the behavior is undefined.50) Such a representation is called - a trap representation. 
- -6 - When a value is stored in an object of structure or union type, including in a member - object, the bytes of the object representation that correspond to any padding bytes take - unspecified values.51) The value of a structure or union object is never a trap - - - - representation, even though the value of a member of the structure or union object may be - a trap representation. - -7 - When a value is stored in a member of an object of union type, the bytes of the object - representation that do not correspond to that member but do correspond to other members - take unspecified values. - -8 - Where an operator is applied to a value that has more than one object representation, - which object representation is used shall not affect the value of the result.52) Where a - value is stored in an object using a type that has more than one object representation for - that value, it is unspecified which representation is used, but a trap representation shall - not be generated. - -9 - Loads and stores of objects with atomic types are done with - memory_order_seq_cst semantics. - - Forward references: declarations (6.7), expressions (6.5), lvalues, arrays, and function - designators (6.3.2.1), order and consistency (7.17.3). - - -Footnotes - -49) A positional representation for integers that uses the binary digits 0 and 1, in which the values - represented by successive bits are additive, begin with 1, and are multiplied by successive integral - powers of 2, except perhaps the bit with the highest position. (Adapted from the American National - Dictionary for Information Processing Systems.) A byte contains CHAR_BIT bits, and the values of - type unsigned char range from 0 to 2^CHAR_BIT - 1. - - -50) Thus, an automatic variable can be initialized to a trap representation without causing undefined - behavior, but the value of the variable cannot be used until a proper value is stored in it. - - -51) Thus, for example, structure assignment need not copy any padding bits. 
- - -52) It is possible for objects x and y with the same effective type T to have the same value when they are - accessed as objects of type T, but to have different values in other contexts. In particular, if == is - defined for type T, then x == y does not imply that memcmp(&x, &y, sizeof (T)) == 0. - Furthermore, x == y does not necessarily imply that x and y have the same value; other operations - on values of type T may distinguish between them. - - -Contents - -6.2.6.2 Integer types - - -1 - For unsigned integer types other than unsigned char, the bits of the object - representation shall be divided into two groups: value bits and padding bits (there need - not be any of the latter). If there are N value bits, each bit shall represent a different - power of 2 between 1 and 2^(N-1), so that objects of that type shall be capable of - representing values from 0 to 2^N - 1 using a pure binary representation; this shall be - known as the value representation. The values of any padding bits are unspecified.53) - -2 - For signed integer types, the bits of the object representation shall be divided into three - groups: value bits, padding bits, and the sign bit. There need not be any padding bits; - signed char shall not have any padding bits. There shall be exactly one sign bit. - Each bit that is a value bit shall have the same value as the same bit in the object - representation of the corresponding unsigned type (if there are M value bits in the signed - type and N in the unsigned type, then M <= N ). If the sign bit is zero, it shall not affect - - - the resulting value. If the sign bit is one, the value shall be modified in one of the - following ways: - - -- the corresponding value with sign bit 0 is negated (sign and magnitude); - -- the sign bit has the value -(2^M) (two's complement); - -- the sign bit has the value -(2^M - 1) (ones' complement). 
- - Which of these applies is implementation-defined, as is whether the value with sign bit 1 - and all value bits zero (for the first two), or with sign bit and all value bits 1 (for ones' - complement), is a trap representation or a normal value. In the case of sign and - magnitude and ones' complement, if this representation is a normal value it is called a - negative zero. - -3 - If the implementation supports negative zeros, they shall be generated only by: - - -- the &, |, ^, ~, <<, and >> operators with operands that produce such a value; - -- the +, -, *, /, and % operators where one operand is a negative zero and the result is - zero; - -- compound assignment operators based on the above cases. - - It is unspecified whether these cases actually generate a negative zero or a normal zero, - and whether a negative zero becomes a normal zero when stored in an object. - -4 - If the implementation does not support negative zeros, the behavior of the &, |, ^, ~, <<, - and >> operators with operands that would produce such a value is undefined. - -5 - The values of any padding bits are unspecified.54) A valid (non-trap) object representation - of a signed integer type where the sign bit is zero is a valid object representation of the - corresponding unsigned type, and shall represent the same value. For any integer type, - the object representation where all the bits are zero shall be a representation of the value - zero in that type. - -6 - The precision of an integer type is the number of bits it uses to represent values, - excluding any sign and padding bits. The width of an integer type is the same but - including any sign bit; thus for unsigned integer types the two values are the same, while - for signed integer types the width is one greater than the precision. - - - - - - -Footnotes - -53) Some combinations of padding bits might generate trap representations, for example, if one padding - bit is a parity bit. 
Regardless, no arithmetic operation on valid values can generate a trap - representation other than as part of an exceptional condition such as an overflow, and this cannot occur - with unsigned types. All other combinations of padding bits are alternative object representations of - the value specified by the value bits. - - -54) Some combinations of padding bits might generate trap representations, for example, if one padding - bit is a parity bit. Regardless, no arithmetic operation on valid values can generate a trap - representation other than as part of an exceptional condition such as an overflow. All other - combinations of padding bits are alternative object representations of the value specified by the value - bits. - - -Contents - -6.2.7 Compatible type and composite type - - -1 - Two types have compatible type if their types are the same. Additional rules for - determining whether two types are compatible are described in 6.7.2 for type specifiers, - in 6.7.3 for type qualifiers, and in 6.7.6 for declarators.55) Moreover, two structure, - union, or enumerated types declared in separate translation units are compatible if their - tags and members satisfy the following requirements: If one is declared with a tag, the - other shall be declared with the same tag. If both are completed anywhere within their - respective translation units, then the following additional requirements apply: there shall - be a one-to-one correspondence between their members such that each pair of - corresponding members are declared with compatible types; if one member of the pair is - declared with an alignment specifier, the other is declared with an equivalent alignment - specifier; and if one member of the pair is declared with a name, the other is declared - with the same name. For two structures, corresponding members shall be declared in the - same order. For two structures or unions, corresponding bit-fields shall have the same - widths. 
For two enumerations, corresponding members shall have the same values. - -2 - All declarations that refer to the same object or function shall have compatible type; - otherwise, the behavior is undefined. - -3 - A composite type can be constructed from two types that are compatible; it is a type that - is compatible with both of the two types and satisfies the following conditions: - - -- If both types are array types, the following rules are applied: - - -- If one type is an array of known constant size, the composite type is an array of - that size. - -- Otherwise, if one type is a variable length array whose size is specified by an - expression that is not evaluated, the behavior is undefined. - -- Otherwise, if one type is a variable length array whose size is specified, the - composite type is a variable length array of that size. - -- Otherwise, if one type is a variable length array of unspecified size, the composite - type is a variable length array of unspecified size. - -- Otherwise, both types are arrays of unknown size and the composite type is an - array of unknown size. - - The element type of the composite type is the composite type of the two element - types. - -- If only one type is a function type with a parameter type list (a function prototype), - the composite type is a function prototype with the parameter type list. - - - - -- If both types are function types with parameter type lists, the type of each parameter - in the composite parameter type list is the composite type of the corresponding - parameters. - - These rules apply recursively to the types from which the two types are derived. - -4 - For an identifier with internal or external linkage declared in a scope in which a prior - declaration of that identifier is visible,56) if the prior declaration specifies internal or - external linkage, the type of the identifier at the later declaration becomes the composite - type. - - Forward references: array declarators (6.7.6.2). 
- -5 - EXAMPLE Given the following two file scope declarations: - - - int f(int (*)(), double (*)[3]); - int f(int (*)(char *), double (*)[]); - - - The resulting composite type for the function is: - - - int f(int (*)(char *), double (*)[3]); - - - - - -Footnotes - -55) Two types need not be identical to be compatible. - - -56) As specified in 6.2.1, the later declaration might hide the prior declaration. - - -Contents - -6.2.8 Alignment of objects - - -1 - Complete object types have alignment requirements which place restrictions on the - addresses at which objects of that type may be allocated. An alignment is an - implementation-defined integer value representing the number of bytes between - successive addresses at which a given object can be allocated. An object type imposes an - alignment requirement on every object of that type: stricter alignment can be requested - using the _Alignas keyword. - -2 - A fundamental alignment is represented by an alignment less than or equal to the greatest - alignment supported by the implementation in all contexts, which is equal to - _Alignof (max_align_t). - -3 - An extended alignment is represented by an alignment greater than - _Alignof (max_align_t). It is implementation-defined whether any extended - alignments are supported and the contexts in which they are supported. A type having an - extended alignment requirement is an over-aligned type.57) - -4 - Alignments are represented as values of the type size_t. Valid alignments include only - those values returned by an _Alignof expression for fundamental types, plus an - additional implementation-defined set of values, which may be empty. Every valid - alignment value shall be a nonnegative integral power of two. - - - - -5 - Alignments have an order from weaker to stronger or stricter alignments. Stricter - alignments have larger alignment values. An address that satisfies an alignment - requirement also satisfies any weaker valid alignment requirement. 
- -6 - The alignment requirement of a complete type can be queried using an _Alignof - expression. The types char, signed char, and unsigned char shall have the - weakest alignment requirement. - -7 - Comparing alignments is meaningful and provides the obvious results: - - -- Two alignments are equal when their numeric values are equal. - -- Two alignments are different when their numeric values are not equal. - -- When an alignment is larger than another it represents a stricter alignment. - - -Footnotes - -57) Every over-aligned type is, or contains, a structure or union type with a member to which an extended - alignment has been applied. - - -Contents - -6.3 Conversions - - -1 - Several operators convert operand values from one type to another automatically. This - subclause specifies the result required from such an implicit conversion, as well as those - that result from a cast operation (an explicit conversion). The list in 6.3.1.8 summarizes - the conversions performed by most ordinary operators; it is supplemented as required by - the discussion of each operator in 6.5. - -2 - Conversion of an operand value to a compatible type causes no change to the value or the - representation. - - Forward references: cast operators (6.5.4). - - -Contents - -6.3.1 Arithmetic operands - - -Contents - -6.3.1.1 Boolean, characters, and integers - - -1 - Every integer type has an integer conversion rank defined as follows: - - -- No two signed integer types shall have the same rank, even if they have the same - representation. - -- The rank of a signed integer type shall be greater than the rank of any signed integer - type with less precision. - -- The rank of long long int shall be greater than the rank of long int, which - shall be greater than the rank of int, which shall be greater than the rank of short - int, which shall be greater than the rank of signed char. 
- -- The rank of any unsigned integer type shall equal the rank of the corresponding - signed integer type, if any. - -- The rank of any standard integer type shall be greater than the rank of any extended - integer type with the same width. - -- The rank of char shall equal the rank of signed char and unsigned char. - -- The rank of _Bool shall be less than the rank of all other standard integer types. - -- The rank of any enumerated type shall equal the rank of the compatible integer type - (see 6.7.2.2). - -- The rank of any extended signed integer type relative to another extended signed - integer type with the same precision is implementation-defined, but still subject to the - other rules for determining the integer conversion rank. - -- For all integer types T1, T2, and T3, if T1 has greater rank than T2 and T2 has - greater rank than T3, then T1 has greater rank than T3. - - -2 - The following may be used in an expression wherever an int or unsigned int may - be used: - - -- An object or expression with an integer type (other than int or unsigned int) - whose integer conversion rank is less than or equal to the rank of int and - unsigned int. - -- A bit-field of type _Bool, int, signed int, or unsigned int. - - If an int can represent all values of the original type (as restricted by the width, for a - bit-field), the value is converted to an int; otherwise, it is converted to an unsigned - int. These are called the integer promotions.58) All other types are unchanged by the - integer promotions. - -3 - The integer promotions preserve value including sign. As discussed earlier, whether a - ''plain'' char is treated as signed is implementation-defined. - - Forward references: enumeration specifiers (6.7.2.2), structure and union specifiers - (6.7.2.1). 
- - -Footnotes - -58) The integer promotions are applied only: as part of the usual arithmetic conversions, to certain - argument expressions, to the operands of the unary +, -, and ~ operators, and to both operands of the - shift operators, as specified by their respective subclauses. - - -Contents - -6.3.1.2 Boolean type - - -1 - When any scalar value is converted to _Bool, the result is 0 if the value compares equal - to 0; otherwise, the result is 1.59) - - -Footnotes - -59) NaNs do not compare equal to 0 and thus convert to 1. - - -Contents - -6.3.1.3 Signed and unsigned integers - - -1 - When a value with integer type is converted to another integer type other than _Bool, if - the value can be represented by the new type, it is unchanged. - -2 - Otherwise, if the new type is unsigned, the value is converted by repeatedly adding or - subtracting one more than the maximum value that can be represented in the new type - until the value is in the range of the new type.60) - -3 - Otherwise, the new type is signed and the value cannot be represented in it; either the - result is implementation-defined or an implementation-defined signal is raised. - - -Footnotes - -60) The rules describe arithmetic on the mathematical value, not the value of a given type of expression. - - -Contents - -6.3.1.4 Real floating and integer - - -1 - When a finite value of real floating type is converted to an integer type other than _Bool, - the fractional part is discarded (i.e., the value is truncated toward zero). If the value of - the integral part cannot be represented by the integer type, the behavior is undefined.61) - - - - -2 - When a value of integer type is converted to a real floating type, if the value being - converted can be represented exactly in the new type, it is unchanged. 
If the value being - converted is in the range of values that can be represented but cannot be represented - exactly, the result is either the nearest higher or nearest lower representable value, chosen - in an implementation-defined manner. If the value being converted is outside the range of - values that can be represented, the behavior is undefined. Results of some implicit - conversions may be represented in greater range and precision than that required by the - new type (see 6.3.1.8 and 6.8.6.4). - - -Footnotes - -61) The remaindering operation performed when a value of integer type is converted to unsigned type - need not be performed when a value of real floating type is converted to unsigned type. Thus, the - range of portable real floating values is (-1, Utype_MAX+1). - - -Contents - -6.3.1.5 Real floating types - - -1 - When a value of real floating type is converted to a real floating type, if the value being - converted can be represented exactly in the new type, it is unchanged. If the value being - converted is in the range of values that can be represented but cannot be represented - exactly, the result is either the nearest higher or nearest lower representable value, chosen - in an implementation-defined manner. If the value being converted is outside the range of - values that can be represented, the behavior is undefined. Results of some implicit - conversions may be represented in greater range and precision than that required by the - new type (see 6.3.1.8 and 6.8.6.4). - - -Contents - -6.3.1.6 Complex types - - -1 - When a value of complex type is converted to another complex type, both the real and - imaginary parts follow the conversion rules for the corresponding real types. 
- - -Contents - -6.3.1.7 Real and complex - - -1 - When a value of real type is converted to a complex type, the real part of the complex - result value is determined by the rules of conversion to the corresponding real type and - the imaginary part of the complex result value is a positive zero or an unsigned zero. - -2 - When a value of complex type is converted to a real type, the imaginary part of the - complex value is discarded and the value of the real part is converted according to the - conversion rules for the corresponding real type. - - -Contents - -6.3.1.8 Usual arithmetic conversions - - -1 - Many operators that expect operands of arithmetic type cause conversions and yield result - types in a similar way. The purpose is to determine a common real type for the operands - and result. For the specified operands, each operand is converted, without change of type - domain, to a type whose corresponding real type is the common real type. Unless - explicitly stated otherwise, the common real type is also the corresponding real type of - the result, whose type domain is the type domain of the operands if they are the same, - and complex otherwise. This pattern is called the usual arithmetic conversions: - - -- First, if the corresponding real type of either operand is long double, the other - operand is converted, without change of type domain, to a type whose - corresponding real type is long double. - -- Otherwise, if the corresponding real type of either operand is double, the other - operand is converted, without change of type domain, to a type whose - corresponding real type is double. - -- Otherwise, if the corresponding real type of either operand is float, the other - operand is converted, without change of type domain, to a type whose - corresponding real type is float.62) - -- Otherwise, the integer promotions are performed on both operands. 
Then the - following rules are applied to the promoted operands: - - -- If both operands have the same type, then no further conversion is needed. - -- Otherwise, if both operands have signed integer types or both have unsigned - integer types, the operand with the type of lesser integer conversion rank is - converted to the type of the operand with greater rank. - -- Otherwise, if the operand that has unsigned integer type has rank greater or - equal to the rank of the type of the other operand, then the operand with - signed integer type is converted to the type of the operand with unsigned - integer type. - -- Otherwise, if the type of the operand with signed integer type can represent - all of the values of the type of the operand with unsigned integer type, then - the operand with unsigned integer type is converted to the type of the - operand with signed integer type. - -- Otherwise, both operands are converted to the unsigned integer type - corresponding to the type of the operand with signed integer type. - - -2 - The values of floating operands and of the results of floating expressions may be - represented in greater range and precision than that required by the type; the types are not - changed thereby.63) - - - - - - -Footnotes - -62) For example, addition of a double _Complex and a float entails just the conversion of the - float operand to double (and yields a double _Complex result). - - -63) The cast and assignment operators are still required to remove extra range and precision. - - -Contents - -6.3.2 Other operands - - -Contents - -6.3.2.1 Lvalues, arrays, and function designators - - -1 - An lvalue is an expression (with an object type other than void) that potentially - designates an object;64) if an lvalue does not designate an object when it is evaluated, the - behavior is undefined. When an object is said to have a particular type, the type is - specified by the lvalue used to designate the object. 
A modifiable lvalue is an lvalue that - does not have array type, does not have an incomplete type, does not have a const- - qualified type, and if it is a structure or union, does not have any member (including, - recursively, any member or element of all contained aggregates or unions) with a const- - qualified type. - -2 - Except when it is the operand of the sizeof operator, the _Alignof operator, the - unary & operator, the ++ operator, the -- operator, or the left operand of the . operator - or an assignment operator, an lvalue that does not have array type is converted to the - value stored in the designated object (and is no longer an lvalue); this is called lvalue - conversion. If the lvalue has qualified type, the value has the unqualified version of the - type of the lvalue; additionally, if the lvalue has atomic type, the value has the non-atomic - version of the type of the lvalue; otherwise, the value has the type of the lvalue. If the - lvalue has an incomplete type and does not have array type, the behavior is undefined. If - the lvalue designates an object of automatic storage duration that could have been - declared with the register storage class (never had its address taken), and that object - is uninitialized (not declared with an initializer and no assignment to it has been - performed prior to use), the behavior is undefined. - -3 - Except when it is the operand of the sizeof operator, the _Alignof operator, or the - unary & operator, or is a string literal used to initialize an array, an expression that has - type ''array of type'' is converted to an expression with type ''pointer to type'' that points - to the initial element of the array object and is not an lvalue. If the array object has - register storage class, the behavior is undefined. - -4 - A function designator is an expression that has function type. 
Except when it is the - operand of the sizeof operator, the _Alignof operator,65) or the unary & operator, a - function designator with type ''function returning type'' is converted to an expression that - - - - has type ''pointer to function returning type''. - - Forward references: address and indirection operators (6.5.3.2), assignment operators - (6.5.16), common definitions <stddef.h> (7.19), initialization (6.7.9), postfix - increment and decrement operators (6.5.2.4), prefix increment and decrement operators - (6.5.3.1), the sizeof and _Alignof operators (6.5.3.4), structure and union members - (6.5.2.3). - - -Footnotes - -64) The name ''lvalue'' comes originally from the assignment expression E1 = E2, in which the left - operand E1 is required to be a (modifiable) lvalue. It is perhaps better considered as representing an - object ''locator value''. What is sometimes called ''rvalue'' is in this International Standard described - as the ''value of an expression''. - An obvious example of an lvalue is an identifier of an object. As a further example, if E is a unary - expression that is a pointer to an object, *E is an lvalue that designates the object to which E points. - - -65) Because this conversion does not occur, the operand of the sizeof or _Alignof operator remains - a function designator and violates the constraints in 6.5.3.4. - - -Contents - -6.3.2.2 void - - -1 - The (nonexistent) value of a void expression (an expression that has type void) shall not - be used in any way, and implicit or explicit conversions (except to void) shall not be - applied to such an expression. If an expression of any other type is evaluated as a void - expression, its value or designator is discarded. (A void expression is evaluated for its - side effects.) - - -Contents - -6.3.2.3 Pointers - - -1 - A pointer to void may be converted to or from a pointer to any object type. 
A pointer to - any object type may be converted to a pointer to void and back again; the result shall - compare equal to the original pointer. - -2 - For any qualifier q, a pointer to a non-q-qualified type may be converted to a pointer to - the q-qualified version of the type; the values stored in the original and converted pointers - shall compare equal. - -3 - An integer constant expression with the value 0, or such an expression cast to type - void *, is called a null pointer constant.66) If a null pointer constant is converted to a - pointer type, the resulting pointer, called a null pointer, is guaranteed to compare unequal - to a pointer to any object or function. - -4 - Conversion of a null pointer to another pointer type yields a null pointer of that type. - Any two null pointers shall compare equal. - -5 - An integer may be converted to any pointer type. Except as previously specified, the - result is implementation-defined, might not be correctly aligned, might not point to an - entity of the referenced type, and might be a trap representation.67) - -6 - Any pointer type may be converted to an integer type. Except as previously specified, the - result is implementation-defined. If the result cannot be represented in the integer type, - the behavior is undefined. The result need not be in the range of values of any integer - type. - - - - -7 - A pointer to an object type may be converted to a pointer to a different object type. If the - resulting pointer is not correctly aligned68) for the referenced type, the behavior is - undefined. Otherwise, when converted back again, the result shall compare equal to the - original pointer. When a pointer to an object is converted to a pointer to a character type, - the result points to the lowest addressed byte of the object. Successive increments of the - result, up to the size of the object, yield pointers to the remaining bytes of the object. 
- -8 - A pointer to a function of one type may be converted to a pointer to a function of another - type and back again; the result shall compare equal to the original pointer. If a converted - pointer is used to call a function whose type is not compatible with the referenced type, - the behavior is undefined. - - Forward references: cast operators (6.5.4), equality operators (6.5.9), integer types - capable of holding object pointers (7.20.1.4), simple assignment (6.5.16.1). - - - - - - -Footnotes - -66) The macro NULL is defined in <stddef.h> (and other headers) as a null pointer constant; see 7.19. - - -67) The mapping functions for converting a pointer to an integer or an integer to a pointer are intended to - be consistent with the addressing structure of the execution environment. - - -68) In general, the concept ''correctly aligned'' is transitive: if a pointer to type A is correctly aligned for a - pointer to type B, which in turn is correctly aligned for a pointer to type C, then a pointer to type A is - correctly aligned for a pointer to type C. - - -Contents - -6.4 Lexical elements - - -Syntax - -1 - - - token: - keyword - identifier - constant - string-literal - punctuator - preprocessing-token: - header-name - identifier - pp-number - character-constant - string-literal - punctuator - each non-white-space character that cannot be one of the above - - -Constraints - -2 - Each preprocessing token that is converted to a token shall have the lexical form of a - keyword, an identifier, a constant, a string literal, or a punctuator. - -Semantics - -3 - A token is the minimal lexical element of the language in translation phases 7 and 8. The - categories of tokens are: keywords, identifiers, constants, string literals, and punctuators. - A preprocessing token is the minimal lexical element of the language in translation - phases 3 through 6. 
The categories of preprocessing tokens are: header names, - identifiers, preprocessing numbers, character constants, string literals, punctuators, and - single non-white-space characters that do not lexically match the other preprocessing - token categories.69) If a ' or a " character matches the last category, the behavior is - undefined. Preprocessing tokens can be separated by white space; this consists of - comments (described later), or white-space characters (space, horizontal tab, new-line, - vertical tab, and form-feed), or both. As described in 6.10, in certain circumstances - during translation phase 4, white space (or the absence thereof) serves as more than - preprocessing token separation. White space may appear within a preprocessing token - only as part of a header name or between the quotation characters in a character constant - or string literal. - - - - - -4 - If the input stream has been parsed into preprocessing tokens up to a given character, the - next preprocessing token is the longest sequence of characters that could constitute a - preprocessing token. There is one exception to this rule: header name preprocessing - tokens are recognized only within #include preprocessing directives and in - implementation-defined locations within #pragma directives. In such contexts, a - sequence of characters that could be either a header name or a string literal is recognized - as the former. - -5 - EXAMPLE 1 The program fragment 1Ex is parsed as a preprocessing number token (one that is not a - valid floating or integer constant token), even though a parse as the pair of preprocessing tokens 1 and Ex - might produce a valid expression (for example, if Ex were a macro defined as +1). Similarly, the program - fragment 1E1 is parsed as a preprocessing number (one that is a valid floating constant token), whether or - not E is a macro name. 
- - -6 - EXAMPLE 2 The program fragment x+++++y is parsed as x ++ ++ + y, which violates a constraint on - increment operators, even though the parse x ++ + ++ y might yield a correct expression. - - - Forward references: character constants (6.4.4.4), comments (6.4.9), expressions (6.5), - floating constants (6.4.4.2), header names (6.4.7), macro replacement (6.10.3), postfix - increment and decrement operators (6.5.2.4), prefix increment and decrement operators - (6.5.3.1), preprocessing directives (6.10), preprocessing numbers (6.4.8), string literals - (6.4.5). - - -Footnotes - -69) An additional category, placemarkers, is used internally in translation phase 4 (see 6.10.3.3); it cannot - occur in source files. - - -Contents - -6.4.1 Keywords - - -Syntax - -1 - - - keyword: one of - auto if unsigned - break inline void - case int volatile - char long while - const register _Alignas - continue restrict _Alignof - default return _Atomic - do short _Bool - double signed _Complex - else sizeof _Generic - enum static _Imaginary - extern struct _Noreturn - float switch _Static_assert - for typedef _Thread_local - goto union - - -Semantics - -2 - The above tokens (case sensitive) are reserved (in translation phases 7 and 8) for use as - keywords, and shall not be used otherwise. The keyword _Imaginary is reserved for - - specifying imaginary types.70) - - -Footnotes - -70) One possible specification for imaginary types appears in annex G. 
- - -Contents - -6.4.2 Identifiers - - -Contents - -6.4.2.1 General - - -Syntax - -1 - - - identifier: - identifier-nondigit - identifier identifier-nondigit - identifier digit - identifier-nondigit: - nondigit - universal-character-name - other implementation-defined characters - nondigit: one of - _ a b c d e f g h i j k l m - n o p q r s t u v w x y z - A B C D E F G H I J K L M - N O P Q R S T U V W X Y Z - digit: one of - 0 1 2 3 4 5 6 7 8 9 - - -Semantics - -2 - An identifier is a sequence of nondigit characters (including the underscore _, the - lowercase and uppercase Latin letters, and other characters) and digits, which designates - one or more entities as described in 6.2.1. Lowercase and uppercase letters are distinct. - There is no specific limit on the maximum length of an identifier. - -3 - Each universal character name in an identifier shall designate a character whose encoding - in ISO/IEC 10646 falls into one of the ranges specified in D.1.71) The initial character - shall not be a universal character name designating a character whose encoding falls into - one of the ranges specified in D.2. An implementation may allow multibyte characters - that are not part of the basic source character set to appear in identifiers; which characters - and their correspondence to universal character names is implementation-defined. - - - - - -4 - When preprocessing tokens are converted to tokens during translation phase 7, if a - preprocessing token could be converted to either a keyword or an identifier, it is converted - to a keyword. - -Implementation limits - -5 - As discussed in 5.2.4.1, an implementation may limit the number of significant initial - characters in an identifier; the limit for an external name (an identifier that has external - linkage) may be more restrictive than that for an internal name (a macro name or an - identifier that does not have external linkage). The number of significant characters in an - identifier is implementation-defined. 
- -6 - Any identifiers that differ in a significant character are different identifiers. If two - identifiers differ only in nonsignificant characters, the behavior is undefined. - - Forward references: universal character names (6.4.3), macro replacement (6.10.3). - - -Footnotes - -71) On systems in which linkers cannot accept extended characters, an encoding of the universal character - name may be used in forming valid external identifiers. For example, some otherwise unused - character or sequence of characters may be used to encode the \u in a universal character name. - Extended characters may produce a long external identifier. - - -Contents - -6.4.2.2 Predefined identifiers - - -Semantics - -1 - The identifier __func__ shall be implicitly declared by the translator as if, - immediately following the opening brace of each function definition, the declaration - - - static const char __func__[] = "function-name"; - - - appeared, where function-name is the name of the lexically-enclosing function.72) - -2 - This name is encoded as if the implicit declaration had been written in the source - character set and then translated into the execution character set as indicated in translation - phase 5. - -3 - EXAMPLE Consider the code fragment: - - - #include <stdio.h> - void myfunc(void) - { - printf("%s\n", __func__); - /* ... */ - } - - - Each time the function is called, it will print to the standard output stream: - - - myfunc - - - - - Forward references: function definitions (6.9.1). - - - - - - -Footnotes - -72) Since the name __func__ is reserved for any use by the implementation (7.1.3), if any other - identifier is explicitly declared using the name __func__, the behavior is undefined. 
- - -Contents - -6.4.3 Universal character names - - -Syntax - -1 - - - universal-character-name: - \u hex-quad - \U hex-quad hex-quad - hex-quad: - hexadecimal-digit hexadecimal-digit - hexadecimal-digit hexadecimal-digit - - -Constraints - -2 - A universal character name shall not specify a character whose short identifier is less than - 00A0 other than 0024 ($), 0040 (@), or 0060 (`), nor one in the range D800 through - DFFF inclusive.73) - -Description - -3 - Universal character names may be used in identifiers, character constants, and string - literals to designate characters that are not in the basic character set. - -Semantics - -4 - The universal character name \Unnnnnnnn designates the character whose eight-digit - short identifier (as specified by ISO/IEC 10646) is nnnnnnnn.74) Similarly, the universal - character name \unnnn designates the character whose four-digit short identifier is nnnn - (and whose eight-digit short identifier is 0000nnnn). - - - - - - -Footnotes - -73) The disallowed characters are the characters in the basic character set and the code positions reserved - by ISO/IEC 10646 for control characters, the character DELETE, and the S-zone (reserved for use by - UTF-16). - - - -74) Short identifiers for characters were first specified in ISO/IEC 10646-1/AMD9:1997. - - -Contents - -6.4.4 Constants - - -Syntax - -1 - - - constant: - integer-constant - floating-constant - enumeration-constant - character-constant - - -Constraints - -2 - Each constant shall have a type and the value of a constant shall be in the range of - representable values for its type. - -Semantics - -3 - Each constant has a type, determined by its form and value, as detailed later.
- - -Contents - -6.4.4.1 Integer constants - - -Syntax - -1 - - - integer-constant: - decimal-constant integer-suffix_opt - octal-constant integer-suffix_opt - hexadecimal-constant integer-suffix_opt - decimal-constant: - nonzero-digit - decimal-constant digit - octal-constant: - 0 - octal-constant octal-digit - hexadecimal-constant: - hexadecimal-prefix hexadecimal-digit - hexadecimal-constant hexadecimal-digit - hexadecimal-prefix: one of - 0x 0X - nonzero-digit: one of - 1 2 3 4 5 6 7 8 9 - octal-digit: one of - 0 1 2 3 4 5 6 7 - hexadecimal-digit: one of - 0 1 2 3 4 5 6 7 8 9 - a b c d e f - A B C D E F - integer-suffix: - unsigned-suffix long-suffix_opt - unsigned-suffix long-long-suffix - long-suffix unsigned-suffix_opt - long-long-suffix unsigned-suffix_opt - unsigned-suffix: one of - u U - long-suffix: one of - l L - long-long-suffix: one of - ll LL - - -Description - -2 - An integer constant begins with a digit, but has no period or exponent part. It may have a - prefix that specifies its base and a suffix that specifies its type. - -3 - A decimal constant begins with a nonzero digit and consists of a sequence of decimal - digits. An octal constant consists of the prefix 0 optionally followed by a sequence of the - digits 0 through 7 only. A hexadecimal constant consists of the prefix 0x or 0X followed - by a sequence of the decimal digits and the letters a (or A) through f (or F) with values - 10 through 15 respectively. - -Semantics - -4 - The value of a decimal constant is computed base 10; that of an octal constant, base 8; - that of a hexadecimal constant, base 16. The lexically first digit is the most significant. - -5 - The type of an integer constant is the first of the corresponding list in which its value can - be represented.
- - - Suffix | Decimal Constant | Octal or Hexadecimal Constant - - none | int, long int, long long int | int, unsigned int, long int, unsigned long int, long long int, unsigned long long int - - u or U | unsigned int, unsigned long int, unsigned long long int | unsigned int, unsigned long int, unsigned long long int - - l or L | long int, long long int | long int, unsigned long int, long long int, unsigned long long int - - Both u or U and l or L | unsigned long int, unsigned long long int | unsigned long int, unsigned long long int - - ll or LL | long long int | long long int, unsigned long long int - - Both u or U and ll or LL | unsigned long long int | unsigned long long int - - -6 - If an integer constant cannot be represented by any type in its list, it may have an - extended integer type, if the extended integer type can represent its value. If all of the - types in the list for the constant are signed, the extended integer type shall be signed. If - all of the types in the list for the constant are unsigned, the extended integer type shall be - unsigned. If the list contains both signed and unsigned types, the extended integer type - may be signed or unsigned. If an integer constant cannot be represented by any type in - its list and has no extended integer type, then the integer constant has no type. - - -Contents - -6.4.4.2 Floating constants - - -Syntax - -1 - - - floating-constant: - decimal-floating-constant - hexadecimal-floating-constant - decimal-floating-constant: - fractional-constant exponent-part_opt floating-suffix_opt - digit-sequence exponent-part floating-suffix_opt - hexadecimal-floating-constant: - hexadecimal-prefix hexadecimal-fractional-constant - binary-exponent-part floating-suffix_opt - hexadecimal-prefix hexadecimal-digit-sequence - binary-exponent-part floating-suffix_opt - fractional-constant: - digit-sequence_opt . digit-sequence - digit-sequence .
- exponent-part: - e sign_opt digit-sequence - E sign_opt digit-sequence - sign: one of - + - - digit-sequence: - digit - digit-sequence digit - hexadecimal-fractional-constant: - hexadecimal-digit-sequence_opt . - hexadecimal-digit-sequence - hexadecimal-digit-sequence . - binary-exponent-part: - p sign_opt digit-sequence - P sign_opt digit-sequence - hexadecimal-digit-sequence: - hexadecimal-digit - hexadecimal-digit-sequence hexadecimal-digit - floating-suffix: one of - f l F L - - -Description - -2 - A floating constant has a significand part that may be followed by an exponent part and a - suffix that specifies its type. The components of the significand part may include a digit - sequence representing the whole-number part, followed by a period (.), followed by a - digit sequence representing the fraction part. The components of the exponent part are an - e, E, p, or P followed by an exponent consisting of an optionally signed digit sequence. - Either the whole-number part or the fraction part has to be present; for decimal floating - constants, either the period or the exponent part has to be present. - -Semantics - -3 - The significand part is interpreted as a (decimal or hexadecimal) rational number; the - digit sequence in the exponent part is interpreted as a decimal integer. For decimal - floating constants, the exponent indicates the power of 10 by which the significand part is - to be scaled. For hexadecimal floating constants, the exponent indicates the power of 2 - by which the significand part is to be scaled. For decimal floating constants, and also for - hexadecimal floating constants when FLT_RADIX is not a power of 2, the result is either - the nearest representable value, or the larger or smaller representable value immediately - adjacent to the nearest representable value, chosen in an implementation-defined manner. - For hexadecimal floating constants when FLT_RADIX is a power of 2, the result is - correctly rounded.
- -4 - An unsuffixed floating constant has type double. If suffixed by the letter f or F, it has - type float. If suffixed by the letter l or L, it has type long double. - -5 - Floating constants are converted to internal format as if at translation-time. The - conversion of a floating constant shall not raise an exceptional condition or a floating- - point exception at execution time. All floating constants of the same source form75) shall - convert to the same internal format with the same value. - -Recommended practice - -6 - The implementation should produce a diagnostic message if a hexadecimal constant - cannot be represented exactly in its evaluation format; the implementation should then - proceed with the translation of the program. - -7 - The translation-time conversion of floating constants should match the execution-time - conversion of character strings by library functions, such as strtod, given matching - inputs suitable for both conversions, the same result format, and default execution-time - rounding.76) - - - -Footnotes - -75) 1.23, 1.230, 123e-2, 123e-02, and 1.23L are all different source forms and thus need not - convert to the same internal format and value. - - -76) The specification for the library functions recommends more accurate conversion than required for - floating constants (see 7.22.1.3). - - -Contents - -6.4.4.3 Enumeration constants - - -Syntax - -1 - - - enumeration-constant: - identifier - - -Semantics - -2 - An identifier declared as an enumeration constant has type int. - - Forward references: enumeration specifiers (6.7.2.2). 
- - -Contents - -6.4.4.4 Character constants - - -Syntax - -1 - - - character-constant: - ' c-char-sequence ' - L' c-char-sequence ' - u' c-char-sequence ' - U' c-char-sequence ' - c-char-sequence: - c-char - c-char-sequence c-char - c-char: - any member of the source character set except - the single-quote ', backslash \, or new-line character - escape-sequence - escape-sequence: - simple-escape-sequence - octal-escape-sequence - hexadecimal-escape-sequence - universal-character-name - simple-escape-sequence: one of - \' \" \? \\ - \a \b \f \n \r \t \v - octal-escape-sequence: - \ octal-digit - \ octal-digit octal-digit - \ octal-digit octal-digit octal-digit - hexadecimal-escape-sequence: - \x hexadecimal-digit - hexadecimal-escape-sequence hexadecimal-digit - - -Description - -2 - An integer character constant is a sequence of one or more multibyte characters enclosed - in single-quotes, as in 'x'. A wide character constant is the same, except prefixed by the - letter L, u, or U. With a few exceptions detailed later, the elements of the sequence are - any members of the source character set; they are mapped in an implementation-defined - manner to members of the execution character set. - -3 - The single-quote ', the double-quote ", the question-mark ?, the backslash \, and - arbitrary integer values are representable according to the following table of escape - sequences: - - - single quote ' \' - double quote " \" - question mark ? \? - backslash \ \\ - octal character \octal digits - hexadecimal character \x hexadecimal digits - - -4 - The double-quote " and question-mark ? are representable either by themselves or by the - escape sequences \" and \?, respectively, but the single-quote ' and the backslash \ - shall be represented, respectively, by the escape sequences \' and \\. 
- -5 - The octal digits that follow the backslash in an octal escape sequence are taken to be part - of the construction of a single character for an integer character constant or of a single - wide character for a wide character constant. The numerical value of the octal integer so - formed specifies the value of the desired character or wide character. - -6 - The hexadecimal digits that follow the backslash and the letter x in a hexadecimal escape - sequence are taken to be part of the construction of a single character for an integer - character constant or of a single wide character for a wide character constant. The - numerical value of the hexadecimal integer so formed specifies the value of the desired - character or wide character. - -7 - Each octal or hexadecimal escape sequence is the longest sequence of characters that can - constitute the escape sequence. - -8 - In addition, characters not in the basic character set are representable by universal - character names and certain nongraphic characters are representable by escape sequences - consisting of the backslash \ followed by a lowercase letter: \a, \b, \f, \n, \r, \t, - and \v.77) - - -Constraints - -9 - The value of an octal or hexadecimal escape sequence shall be in the range of - representable values for the corresponding type: - - - Prefix Corresponding Type - - none unsigned char - - L the unsigned type corresponding to wchar_t - - u char16_t - - U char32_t - - -Semantics - -10 - An integer character constant has type int. The value of an integer character constant - containing a single character that maps to a single-byte execution character is the - numerical value of the representation of the mapped character interpreted as an integer. - The value of an integer character constant containing more than one character (e.g., - 'ab'), or containing a character or escape sequence that does not map to a single-byte - execution character, is implementation-defined. 
If an integer character constant contains - a single character or escape sequence, its value is the one that results when an object with - type char whose value is that of the single character or escape sequence is converted to - type int. - -11 - A wide character constant prefixed by the letter L has type wchar_t, an integer type - defined in the <stddef.h> header; a wide character constant prefixed by the letter u or - U has type char16_t or char32_t, respectively, unsigned integer types defined in the - <uchar.h> header. The value of a wide character constant containing a single - multibyte character that maps to a single member of the extended execution character set - is the wide character corresponding to that multibyte character, as defined by the - mbtowc, mbrtoc16, or mbrtoc32 function as appropriate for its type, with an - implementation-defined current locale. The value of a wide character constant containing - more than one multibyte character or a single multibyte character that maps to multiple - members of the extended execution character set, or containing a multibyte character or - escape sequence not represented in the extended execution character set, is - implementation-defined. - -12 - EXAMPLE 1 The construction '\0' is commonly used to represent the null character. - - -13 - EXAMPLE 2 Consider implementations that use two's complement representation for integers and eight - bits for objects that have type char. In an implementation in which type char has the same range of - values as signed char, the integer character constant '\xFF' has the value -1; if type char has the - same range of values as unsigned char, the character constant '\xFF' has the value +255. - - - - - - -14 - EXAMPLE 3 Even if eight bits are used for objects that have type char, the construction '\x123' - specifies an integer character constant containing only one character, since a hexadecimal escape sequence - is terminated only by a non-hexadecimal character. 
To specify an integer character constant containing the - two characters whose values are '\x12' and '3', the construction '\0223' may be used, since an octal - escape sequence is terminated after three octal digits. (The value of this two-character integer character - constant is implementation-defined.) - - -15 - EXAMPLE 4 Even if 12 or more bits are used for objects that have type wchar_t, the construction - L'\1234' specifies the implementation-defined value that results from the combination of the values - 0123 and '4'. - - - Forward references: common definitions <stddef.h> (7.19), the mbtowc function - (7.22.7.2), Unicode utilities <uchar.h> (7.28). - - -Footnotes - -77) The semantics of these characters were discussed in 5.2.2. If any other character follows a backslash, - the result is not a token and a diagnostic is required. See ''future language directions'' (6.11.4). - - -Contents - -6.4.5 String literals - - -Syntax - -1 - - - string-literal: - encoding-prefixopt " s-char-sequenceopt " - encoding-prefix: - u8 - u - U - L - s-char-sequence: - s-char - s-char-sequence s-char - s-char: - any member of the source character set except - the double-quote ", backslash \, or new-line character - escape-sequence - - -Constraints - -2 - A sequence of adjacent string literal tokens shall not include both a wide string literal and - a UTF-8 string literal. - -Description - -3 - A character string literal is a sequence of zero or more multibyte characters enclosed in - double-quotes, as in "xyz". A UTF-8 string literal is the same, except prefixed by u8. - A wide string literal is the same, except prefixed by the letter L, u, or U. 
- -4 - The same considerations apply to each element of the sequence in a string literal as if it - were in an integer character constant (for a character or UTF-8 string literal) or a wide - character constant (for a wide string literal), except that the single-quote ' is - representable either by itself or by the escape sequence \', but the double-quote " shall - - be represented by the escape sequence \". - -Semantics - -5 - In translation phase 6, the multibyte character sequences specified by any sequence of - adjacent character and identically-prefixed string literal tokens are concatenated into a - single multibyte character sequence. If any of the tokens has an encoding prefix, the - resulting multibyte character sequence is treated as having the same prefix; otherwise, it - is treated as a character string literal. Whether differently-prefixed wide string literal - tokens can be concatenated and, if so, the treatment of the resulting multibyte character - sequence are implementation-defined. - -6 - In translation phase 7, a byte or code of value zero is appended to each multibyte - character sequence that results from a string literal or literals.78) The multibyte character - sequence is then used to initialize an array of static storage duration and length just - sufficient to contain the sequence. For character string literals, the array elements have - type char, and are initialized with the individual bytes of the multibyte character - sequence. For UTF-8 string literals, the array elements have type char, and are - initialized with the characters of the multibyte character sequence, as encoded in UTF-8. - For wide string literals prefixed by the letter L, the array elements have type wchar_t - and are initialized with the sequence of wide characters corresponding to the multibyte - character sequence, as defined by the mbstowcs function with an implementation- - defined current locale. 
For wide string literals prefixed by the letter u or U, the array - elements have type char16_t or char32_t, respectively, and are initialized with the - sequence of wide characters corresponding to the multibyte character sequence, as - defined by successive calls to the mbrtoc16, or mbrtoc32 function as appropriate for - its type, with an implementation-defined current locale. The value of a string literal - containing a multibyte character or escape sequence not represented in the execution - character set is implementation-defined. - -7 - It is unspecified whether these arrays are distinct provided their elements have the - appropriate values. If the program attempts to modify such an array, the behavior is - undefined. - -8 - EXAMPLE 1 This pair of adjacent character string literals - - - "\x12" "3" - - - produces a single character string literal containing the two characters whose values are '\x12' and '3', - because escape sequences are converted into single members of the execution character set just prior to - adjacent string literal concatenation. - - -9 - EXAMPLE 2 Each of the sequences of adjacent string literal tokens - - - - - - "a" "b" L"c" - "a" L"b" "c" - L"a" "b" L"c" - L"a" L"b" L"c" - - - is equivalent to the string literal - - - L"abc" - - - Likewise, each of the sequences - - - "a" "b" u"c" - "a" u"b" "c" - u"a" "b" u"c" - u"a" u"b" u"c" - - - is equivalent to - - - u"abc" - - - - - Forward references: common definitions <stddef.h> (7.19), the mbstowcs - function (7.22.8.1), Unicode utilities <uchar.h> (7.28). - - -Footnotes - -78) A string literal need not be a string (see 7.1.1), because a null character may be embedded in it by a - \0 escape sequence. - - -Contents - -6.4.6 Punctuators - - -Syntax - -1 - - - punctuator: one of - [ ] ( ) { } . -> - ++ -- & * + - ~ ! - / % << >> < > <= >= == != ^ | && || - ? : ; ... 
- = *= /= %= += -= <<= >>= &= ^= |= - , # ## - <: :> <% %> %: %:%: - - -Semantics - -2 - A punctuator is a symbol that has independent syntactic and semantic significance. - Depending on context, it may specify an operation to be performed (which in turn may - yield a value or a function designator, produce a side effect, or some combination thereof) - in which case it is known as an operator (other forms of operator also exist in some - contexts). An operand is an entity on which an operator acts. - - -3 - In all aspects of the language, the six tokens79) - - - <: :> <% %> %: %:%: - - - behave, respectively, the same as the six tokens - - - [ ] { } # ## - - - except for their spelling.80) - - Forward references: expressions (6.5), declarations (6.7), preprocessing directives - (6.10), statements (6.8). - - -Footnotes - -79) These tokens are sometimes called ''digraphs''. - - -80) Thus [ and <: behave differently when ''stringized'' (see 6.10.3.2), but can otherwise be freely - interchanged. - - -Contents - -6.4.7 Header names - - -Syntax - -1 - - - header-name: - < h-char-sequence > - " q-char-sequence " - h-char-sequence: - h-char - h-char-sequence h-char - h-char: - any member of the source character set except - the new-line character and > - q-char-sequence: - q-char - q-char-sequence q-char - q-char: - any member of the source character set except - the new-line character and " - - -Semantics - -2 - The sequences in both forms of header names are mapped in an implementation-defined - manner to headers or external source file names as specified in 6.10.2. - -3 - If the characters ', \, ", //, or /* occur in the sequence between the < and > delimiters, - the behavior is undefined. 
Similarly, if the characters ', \, //, or /* occur in the - - - - - - sequence between the " delimiters, the behavior is undefined.81) Header name - preprocessing tokens are recognized only within #include preprocessing directives and - in implementation-defined locations within #pragma directives.82) - -4 - EXAMPLE The following sequence of characters: - - - 0x3<1/a.h>1e2 - #include <1/a.h> - #define const.member@$ - - - forms the following sequence of preprocessing tokens (with each individual preprocessing token delimited - by a { on the left and a } on the right). - - - {0x3}{<}{1}{/}{a}{.}{h}{>}{1e2} - {#}{include} {<1/a.h>} - {#}{define} {const}{.}{member}{@}{$} - - - - - Forward references: source file inclusion (6.10.2). - - -Footnotes - -81) Thus, sequences of characters that resemble escape sequences cause undefined behavior. - - -82) For an example of a header name preprocessing token used in a #pragma directive, see 6.10.9. - - -Contents - -6.4.8 Preprocessing numbers - - -Syntax - -1 - - - pp-number: - digit - . digit - pp-number digit - pp-number identifier-nondigit - pp-number e sign - pp-number E sign - pp-number p sign - pp-number P sign - pp-number . - - -Description - -2 - A preprocessing number begins with a digit optionally preceded by a period (.) and may - be followed by valid identifier characters and the character sequences e+, e-, E+, E-, - p+, p-, P+, or P-. - -3 - Preprocessing number tokens lexically include all floating and integer constant tokens. - -Semantics - -4 - A preprocessing number does not have type or a value; it acquires both after a successful - conversion (as part of translation phase 7) to a floating constant token or an integer - constant token. - - - - -Contents - -6.4.9 Comments - - -1 - Except within a character constant, a string literal, or a comment, the characters /* - introduce a comment. 
The contents of such a comment are examined only to identify - multibyte characters and to find the characters */ that terminate it.83) - -2 - Except within a character constant, a string literal, or a comment, the characters // - introduce a comment that includes all multibyte characters up to, but not including, the - next new-line character. The contents of such a comment are examined only to identify - multibyte characters and to find the terminating new-line character. - -3 - EXAMPLE - - - "a//b" // four-character string literal - #include "//e" // undefined behavior - // */ // comment, not syntax error - f = g/**//h; // equivalent to f = g / h; - //\ - i(); // part of a two-line comment - /\ - / j(); // part of a two-line comment - #define glue(x,y) x##y - glue(/,/) k(); // syntax error, not comment - /*//*/ l(); // equivalent to l(); - m = n//**/o - + p; // equivalent to m = n + p; - - - - - - - - -Footnotes - -83) Thus, /* ... */ comments do not nest. - - -Contents - -6.5 Expressions - - -1 - An expression is a sequence of operators and operands that specifies computation of a - value, or that designates an object or a function, or that generates side effects, or that - performs a combination thereof. The value computations of the operands of an operator - are sequenced before the value computation of the result of the operator. - -2 - If a side effect on a scalar object is unsequenced relative to either a different side effect - on the same scalar object or a value computation using the value of the same scalar - object, the behavior is undefined. 
If there are multiple allowable orderings of the - subexpressions of an expression, the behavior is undefined if such an unsequenced side - effect occurs in any of the orderings.84) - -3 - The grouping of operators and operands is indicated by the syntax.85) Except as specified - later, side effects and value computations of subexpressions are unsequenced.86) - -4 - Some operators (the unary operator ~, and the binary operators <<, >>, &, ^, and |, - collectively described as bitwise operators) are required to have operands that have - integer type. These operators yield values that depend on the internal representations of - integers, and have implementation-defined and undefined aspects for signed types. - -5 - If an exceptional condition occurs during the evaluation of an expression (that is, if the - result is not mathematically defined or not in the range of representable values for its - type), the behavior is undefined. - - - - - -6 - The effective type of an object for an access to its stored value is the declared type of the - object, if any.87) If a value is stored into an object having no declared type through an - lvalue having a type that is not a character type, then the type of the lvalue becomes the - effective type of the object for that access and for subsequent accesses that do not modify - the stored value. If a value is copied into an object having no declared type using - memcpy or memmove, or is copied as an array of character type, then the effective type - of the modified object for that access and for subsequent accesses that do not modify the - value is the effective type of the object from which the value is copied, if it has one. For - all other accesses to an object having no declared type, the effective type of the object is - simply the type of the lvalue used for the access. 
- -7 - An object shall have its stored value accessed only by an lvalue expression that has one of - the following types:88) - - -- a type compatible with the effective type of the object, - -- a qualified version of a type compatible with the effective type of the object, - -- a type that is the signed or unsigned type corresponding to the effective type of the - object, - -- a type that is the signed or unsigned type corresponding to a qualified version of the - effective type of the object, - -- an aggregate or union type that includes one of the aforementioned types among its - members (including, recursively, a member of a subaggregate or contained union), or - -- a character type. - - -8 - A floating expression may be contracted, that is, evaluated as though it were a single - operation, thereby omitting rounding errors implied by the source code and the - expression evaluation method.89) The FP_CONTRACT pragma in <math.h> provides a - way to disallow contracted expressions. Otherwise, whether and how expressions are - contracted is implementation-defined.90) - - Forward references: the FP_CONTRACT pragma (7.12.2), copying functions (7.24.2). - - - - -Footnotes - -84) This paragraph renders undefined statement expressions such as - - - i = ++i + 1; - a[i++] = i; - - - while allowing - - - i = i + 1; - a[i] = i; - - - - - -85) The syntax specifies the precedence of operators in the evaluation of an expression, which is the same - as the order of the major subclauses of this subclause, highest precedence first. Thus, for example, the - expressions allowed as the operands of the binary + operator (6.5.6) are those expressions defined in - 6.5.1 through 6.5.6. The exceptions are cast expressions (6.5.4) as operands of unary operators - (6.5.3), and an operand contained between any of the following pairs of operators: grouping - parentheses () (6.5.1), subscripting brackets [] (6.5.2.1), function-call parentheses () (6.5.2.2), and - the conditional operator ? 
: (6.5.15). - Within each major subclause, the operators have the same precedence. Left- or right-associativity is - indicated in each subclause by the syntax for the expressions discussed therein. - - -86) In an expression that is evaluated more than once during the execution of a program, unsequenced and - indeterminately sequenced evaluations of its subexpressions need not be performed consistently in - different evaluations. - - -87) Allocated objects have no declared type. - - -88) The intent of this list is to specify those circumstances in which an object may or may not be aliased. - - -89) The intermediate operations in the contracted expression are evaluated as if to infinite range and - precision, while the final operation is rounded to the format determined by the expression evaluation - method. A contracted expression might also omit the raising of floating-point exceptions. - - -90) This license is specifically intended to allow implementations to exploit fast machine instructions that - combine multiple C operators. As contractions potentially undermine predictability, and can even - decrease accuracy for containing expressions, their use needs to be well-defined and clearly - documented. - - -Contents - -6.5.1 Primary expressions - - -Syntax - -1 - - - primary-expression: - identifier - constant - string-literal - ( expression ) - generic-selection - - -Semantics - -2 - An identifier is a primary expression, provided it has been declared as designating an - object (in which case it is an lvalue) or a function (in which case it is a function - designator).91) - -3 - A constant is a primary expression. Its type depends on its form and value, as detailed in - 6.4.4. - -4 - A string literal is a primary expression. It is an lvalue with type as detailed in 6.4.5. - -5 - A parenthesized expression is a primary expression. Its type and value are identical to - those of the unparenthesized expression. 
It is an lvalue, a function designator, or a void - expression if the unparenthesized expression is, respectively, an lvalue, a function - designator, or a void expression. - -6 - A generic selection is a primary expression. Its type and value depend on the selected - generic association, as detailed in the following subclause. - - Forward references: declarations (6.7). - - -Footnotes - -91) Thus, an undeclared identifier is a violation of the syntax. - - -Contents - -6.5.1.1 Generic selection - - -Syntax - -1 - - - generic-selection: - _Generic ( assignment-expression , generic-assoc-list ) - generic-assoc-list: - generic-association - generic-assoc-list , generic-association - generic-association: - type-name : assignment-expression - default : assignment-expression - - - - - - - -Constraints - -2 - A generic selection shall have no more than one default generic association. The type - name in a generic association shall specify a complete object type other than a variably - modified type. No two generic associations in the same generic selection shall specify - compatible types. The controlling expression of a generic selection shall have type - compatible with at most one of the types named in its generic association list. If a - generic selection has no default generic association, its controlling expression shall - have type compatible with exactly one of the types named in its generic association list. - -Semantics - -3 - The controlling expression of a generic selection is not evaluated. If a generic selection - has a generic association with a type name that is compatible with the type of the - controlling expression, then the result expression of the generic selection is the - expression in that generic association. Otherwise, the result expression of the generic - selection is the expression in the default generic association. None of the expressions - from any other generic association of the generic selection is evaluated. 
- -4 - The type and value of a generic selection are identical to those of its result expression. It - is an lvalue, a function designator, or a void expression if its result expression is, - respectively, an lvalue, a function designator, or a void expression. - -5 - EXAMPLE The cbrt type-generic macro could be implemented as follows: - - - #define cbrt(X) _Generic((X), \ - long double: cbrtl, \ - default: cbrt, \ - float: cbrtf \ - )(X) - - - - - -Contents - -6.5.2 Postfix operators - - -Syntax - -1 - - - postfix-expression: - primary-expression - postfix-expression [ expression ] - postfix-expression ( argument-expression-listopt ) - postfix-expression . identifier - postfix-expression -> identifier - postfix-expression ++ - postfix-expression -- - ( type-name ) { initializer-list } - ( type-name ) { initializer-list , } - argument-expression-list: - assignment-expression - argument-expression-list , assignment-expression - - -Contents - -6.5.2.1 Array subscripting - - -Constraints - -1 - One of the expressions shall have type ''pointer to complete object type'', the other - expression shall have integer type, and the result has type ''type''. - -Semantics - -2 - A postfix expression followed by an expression in square brackets [] is a subscripted - designation of an element of an array object. The definition of the subscript operator [] - is that E1[E2] is identical to (*((E1)+(E2))). Because of the conversion rules that - apply to the binary + operator, if E1 is an array object (equivalently, a pointer to the - initial element of an array object) and E2 is an integer, E1[E2] designates the E2-th - element of E1 (counting from zero). - -3 - Successive subscript operators designate an element of a multidimensional array object. - If E is an n-dimensional array (n >= 2) with dimensions i x j x . . . x k, then E (used as - other than an lvalue) is converted to a pointer to an (n - 1)-dimensional array with - dimensions j x . . . x k. 
If the unary * operator is applied to this pointer explicitly, or - implicitly as a result of subscripting, the result is the referenced (n - 1)-dimensional - array, which itself is converted into a pointer if used as other than an lvalue. It follows - from this that arrays are stored in row-major order (last subscript varies fastest). - -4 - EXAMPLE Consider the array object defined by the declaration - - - int x[3][5]; - - - Here x is a 3 x 5 array of ints; more precisely, x is an array of three element objects, each of which is an - array of five ints. In the expression x[i], which is equivalent to (*((x)+(i))), x is first converted to - a pointer to the initial array of five ints. Then i is adjusted according to the type of x, which conceptually - entails multiplying i by the size of the object to which the pointer points, namely an array of five int - objects. The results are added and indirection is applied to yield an array of five ints. When used in the - expression x[i][j], that array is in turn converted to a pointer to the first of the ints, so x[i][j] - yields an int. - - - Forward references: additive operators (6.5.6), address and indirection operators - (6.5.3.2), array declarators (6.7.6.2). - - -Contents - -6.5.2.2 Function calls - - -Constraints - -1 - The expression that denotes the called function92) shall have type pointer to function - returning void or returning a complete object type other than an array type. - -2 - If the expression that denotes the called function has a type that includes a prototype, the - number of arguments shall agree with the number of parameters. Each argument shall - have a type such that its value may be assigned to an object with the unqualified version - of the type of its corresponding parameter. - -Semantics - -3 - A postfix expression followed by parentheses () containing a possibly empty, comma- - separated list of expressions is a function call. The postfix expression denotes the called - function. 
The list of expressions specifies the arguments to the function. - -4 - An argument may be an expression of any complete object type. In preparing for the call - to a function, the arguments are evaluated, and each parameter is assigned the value of the - corresponding argument.93) - -5 - If the expression that denotes the called function has type pointer to function returning an - object type, the function call expression has the same type as that object type, and has the - value determined as specified in 6.8.6.4. Otherwise, the function call has type void. - -6 - If the expression that denotes the called function has a type that does not include a - prototype, the integer promotions are performed on each argument, and arguments that - have type float are promoted to double. These are called the default argument - promotions. If the number of arguments does not equal the number of parameters, the - behavior is undefined. If the function is defined with a type that includes a prototype, and - either the prototype ends with an ellipsis (, ...) or the types of the arguments after - promotion are not compatible with the types of the parameters, the behavior is undefined. - If the function is defined with a type that does not include a prototype, and the types of - the arguments after promotion are not compatible with those of the parameters after - promotion, the behavior is undefined, except for the following cases: - - -- one promoted type is a signed integer type, the other promoted type is the - corresponding unsigned integer type, and the value is representable in both types; - - - - - -- both types are pointers to qualified or unqualified versions of a character type or - void. 
- - -7 - If the expression that denotes the called function has a type that does include a prototype, - the arguments are implicitly converted, as if by assignment, to the types of the - corresponding parameters, taking the type of each parameter to be the unqualified version - of its declared type. The ellipsis notation in a function prototype declarator causes - argument type conversion to stop after the last declared parameter. The default argument - promotions are performed on trailing arguments. - -8 - No other conversions are performed implicitly; in particular, the number and types of - arguments are not compared with those of the parameters in a function definition that - does not include a function prototype declarator. - -9 - If the function is defined with a type that is not compatible with the type (of the - expression) pointed to by the expression that denotes the called function, the behavior is - undefined. - -10 - There is a sequence point after the evaluations of the function designator and the actual - arguments but before the actual call. Every evaluation in the calling function (including - other function calls) that is not otherwise specifically sequenced before or after the - execution of the body of the called function is indeterminately sequenced with respect to - the execution of the called function.94) - -11 - Recursive function calls shall be permitted, both directly and indirectly through any chain - of other functions. - -12 - EXAMPLE In the function call - - - (*pf[f1()]) (f2(), f3() + f4()) - - - the functions f1, f2, f3, and f4 may be called in any order. All side effects have to be completed before - the function pointed to by pf[f1()] is called. - - - Forward references: function declarators (including prototypes) (6.7.6.3), function - definitions (6.9.1), the return statement (6.8.6.4), simple assignment (6.5.16.1). - - -Footnotes - -92) Most often, this is the result of converting an identifier that is a function designator. 
- - -93) A function may change the values of its parameters, but these changes cannot affect the values of the - arguments. On the other hand, it is possible to pass a pointer to an object, and the function may - change the value of the object pointed to. A parameter declared to have array or function type is - adjusted to have a pointer type as described in 6.9.1. - - -94) In other words, function executions do not ''interleave'' with each other. - - -Contents - -6.5.2.3 Structure and union members - - -Constraints - -1 - The first operand of the . operator shall have an atomic, qualified, or unqualified - structure or union type, and the second operand shall name a member of that type. - -2 - The first operand of the -> operator shall have type ''pointer to atomic, qualified, or - unqualified structure'' or ''pointer to atomic, qualified, or unqualified union'', and the - second operand shall name a member of the type pointed to. - - - - -Semantics - -3 - A postfix expression followed by the . operator and an identifier designates a member of - a structure or union object. The value is that of the named member,95) and is an lvalue if - the first expression is an lvalue. If the first expression has qualified type, the result has - the so-qualified version of the type of the designated member. - -4 - A postfix expression followed by the -> operator and an identifier designates a member - of a structure or union object. The value is that of the named member of the object to - which the first expression points, and is an lvalue.96) If the first expression is a pointer to - a qualified type, the result has the so-qualified version of the type of the designated - member. 
- -5 - Accessing a member of an atomic structure or union object results in undefined - behavior.97) - -6 - One special guarantee is made in order to simplify the use of unions: if a union contains - several structures that share a common initial sequence (see below), and if the union - object currently contains one of these structures, it is permitted to inspect the common - initial part of any of them anywhere that a declaration of the completed type of the union - is visible. Two structures share a common initial sequence if corresponding members - have compatible types (and, for bit-fields, the same widths) for a sequence of one or more - initial members. - -7 - EXAMPLE 1 If f is a function returning a structure or union, and x is a member of that structure or - union, f().x is a valid postfix expression but is not an lvalue. - - -8 - EXAMPLE 2 In: - - - struct s { int i; const int ci; }; - struct s s; - const struct s cs; - volatile struct s vs; - - - the various members have the types: - - - - - - - s.i int - s.ci const int - cs.i const int - cs.ci const int - vs.i volatile int - vs.ci volatile const int - - - - -9 - EXAMPLE 3 The following is a valid fragment: - - - union { - struct { - int alltypes; - } n; - struct { - int type; - int intnode; - } ni; - struct { - int type; - double doublenode; - } nf; - } u; - u.nf.type = 1; - u.nf.doublenode = 3.14; - /* ... */ - if (u.n.alltypes == 1) - if (sin(u.nf.doublenode) == 0.0) - /* ... */ - - - The following is not a valid fragment (because the union type is not visible within function f): - - - struct t1 { int m; }; - struct t2 { int m; }; - int f(struct t1 *p1, struct t2 *p2) - { - if (p1->m < 0) - p2->m = -p2->m; - return p1->m; - } - int g() - { - union { - struct t1 s1; - struct t2 s2; - } u; - /* ... */ - return f(&u.s1, &u.s2); - } - - - - - Forward references: address and indirection operators (6.5.3.2), structure and union - specifiers (6.7.2.1). 
- - -Footnotes - -95) If the member used to read the contents of a union object is not the same as the member last used to - store a value in the object, the appropriate part of the object representation of the value is reinterpreted - as an object representation in the new type as described in 6.2.6 (a process sometimes called ''type - punning''). This might be a trap representation. - - -96) If &E is a valid pointer expression (where & is the ''address-of'' operator, which generates a pointer to - its operand), the expression (&E)->MOS is the same as E.MOS. - - -97) For example, a data race would occur if access to the entire structure or union in one thread conflicts - with access to a member from another thread, where at least one access is a modification. Members - can be safely accessed using a non-atomic object which is assigned to or from the atomic object. - - -Contents - -6.5.2.4 Postfix increment and decrement operators - - -Constraints - -1 - The operand of the postfix increment or decrement operator shall have atomic, qualified, - or unqualified real or pointer type, and shall be a modifiable lvalue. - -Semantics - -2 - The result of the postfix ++ operator is the value of the operand. As a side effect, the - value of the operand object is incremented (that is, the value 1 of the appropriate type is - added to it). See the discussions of additive operators and compound assignment for - information on constraints, types, and conversions and the effects of operations on - pointers. The value computation of the result is sequenced before the side effect of - updating the stored value of the operand. With respect to an indeterminately-sequenced - function call, the operation of postfix ++ is a single evaluation. 
Postfix ++ on an object - with atomic type is a read-modify-write operation with memory_order_seq_cst - memory order semantics.98) - -3 - The postfix -- operator is analogous to the postfix ++ operator, except that the value of - the operand is decremented (that is, the value 1 of the appropriate type is subtracted from - it). - - Forward references: additive operators (6.5.6), compound assignment (6.5.16.2). - - -Footnotes - -98) Where a pointer to an atomic object can be formed and E has integer type, E++ is equivalent to the - following code sequence where T is the type of E: - - - T *addr = &E; - T old = *addr; - T new; - do { - new = old + 1; - } while (!atomic_compare_exchange_strong(addr, &old, new)); - - - with old being the result of the operation. - Special care must be taken if E has floating type; see 6.5.16.2. - - -Contents - -6.5.2.5 Compound literals - - -Constraints - -1 - The type name shall specify a complete object type or an array of unknown size, but not a - variable length array type. - -2 - All the constraints for initializer lists in 6.7.9 also apply to compound literals. - -Semantics - -3 - A postfix expression that consists of a parenthesized type name followed by a brace- - enclosed list of initializers is a compound literal. It provides an unnamed object whose - - - value is given by the initializer list.99) - -4 - If the type name specifies an array of unknown size, the size is determined by the - initializer list as specified in 6.7.9, and the type of the compound literal is that of the - completed array type. Otherwise (when the type name specifies an object type), the type - of the compound literal is that specified by the type name. In either case, the result is an - lvalue. - -5 - The value of the compound literal is that of an unnamed object initialized by the - initializer list. 
If the compound literal occurs outside the body of a function, the object - has static storage duration; otherwise, it has automatic storage duration associated with - the enclosing block. - -6 - All the semantic rules for initializer lists in 6.7.9 also apply to compound literals.100) - -7 - String literals, and compound literals with const-qualified types, need not designate - distinct objects.101) - -8 - EXAMPLE 1 The file scope definition - - - int *p = (int []){2, 4}; - - - initializes p to point to the first element of an array of two ints, the first having the value two and the - second, four. The expressions in this compound literal are required to be constant. The unnamed object - has static storage duration. - - -9 - EXAMPLE 2 In contrast, in - - - void f(void) - { - int *p; - /*...*/ - p = (int [2]){*p}; - /*...*/ - } - - - p is assigned the address of the first element of an array of two ints, the first having the value previously - pointed to by p and the second, zero. The expressions in this compound literal need not be constant. The - unnamed object has automatic storage duration. - - -10 - EXAMPLE 3 Initializers with designations can be combined with compound literals. 
Structure objects - created using compound literals can be passed to functions without depending on member order: - - - drawline((struct point){.x=1, .y=1}, - (struct point){.x=3, .y=4}); - - - - - - - Or, if drawline instead expected pointers to struct point: - - - drawline(&(struct point){.x=1, .y=1}, - &(struct point){.x=3, .y=4}); - - - - -11 - EXAMPLE 4 A read-only compound literal can be specified through constructions like: - - - (const float []){1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6} - - - - -12 - EXAMPLE 5 The following three expressions have different meanings: - - - "/tmp/fileXXXXXX" - (char []){"/tmp/fileXXXXXX"} - (const char []){"/tmp/fileXXXXXX"} - - - The first always has static storage duration and has type array of char, but need not be modifiable; the last - two have automatic storage duration when they occur within the body of a function, and the first of these - two is modifiable. - - -13 - EXAMPLE 6 Like string literals, const-qualified compound literals can be placed into read-only memory - and can even be shared. For example, - - - (const char []){"abc"} == "abc" - - - might yield 1 if the literals' storage is shared. - - -14 - EXAMPLE 7 Since compound literals are unnamed, a single compound literal cannot specify a circularly - linked object. For example, there is no way to write a self-referential compound literal that could be used - as the function argument in place of the named object endless_zeros below: - - - struct int_list { int car; struct int_list *cdr; }; - struct int_list endless_zeros = {0, &endless_zeros}; - eval(endless_zeros); - - - - -15 - EXAMPLE 8 Each compound literal creates only a single object in a given scope: - - - struct s { int i; }; - int f (void) - { - struct s *p = 0, *q; - int j = 0; - again: - q = p, p = &((struct s){ j++ }); - if (j < 2) goto again; - return p == q && q->i == 1; - } - - - The function f() always returns the value 1. 
- -16 - Note that if an iteration statement were used instead of an explicit goto and a labeled statement, the - lifetime of the unnamed object would be the body of the loop only, and on entry next time around p would - have an indeterminate value, which would result in undefined behavior. - - - Forward references: type names (6.7.7), initialization (6.7.9). - - -Footnotes - -99) Note that this differs from a cast expression. For example, a cast specifies a conversion to scalar types - or void only, and the result of a cast expression is not an lvalue. - - -100) For example, subobjects without explicit initializers are initialized to zero. - - -101) This allows implementations to share storage for string literals and constant compound literals with - the same or overlapping representations. - - -Contents - -6.5.3 Unary operators - - -Syntax - -1 - - - unary-expression: - postfix-expression - ++ unary-expression - -- unary-expression - unary-operator cast-expression - sizeof unary-expression - sizeof ( type-name ) - _Alignof ( type-name ) - unary-operator: one of - & * + - ~ ! - - -Contents - -6.5.3.1 Prefix increment and decrement operators - - -Constraints - -1 - The operand of the prefix increment or decrement operator shall have atomic, qualified, - or unqualified real or pointer type, and shall be a modifiable lvalue. - -Semantics - -2 - The value of the operand of the prefix ++ operator is incremented. The result is the new - value of the operand after incrementation. The expression ++E is equivalent to (E+=1). - See the discussions of additive operators and compound assignment for information on - constraints, types, side effects, and conversions and the effects of operations on pointers. - -3 - The prefix -- operator is analogous to the prefix ++ operator, except that the value of the - operand is decremented. - - Forward references: additive operators (6.5.6), compound assignment (6.5.16.2). 
- - -Contents - -6.5.3.2 Address and indirection operators - - -Constraints - -1 - The operand of the unary & operator shall be either a function designator, the result of a - [] or unary * operator, or an lvalue that designates an object that is not a bit-field and is - not declared with the register storage-class specifier. - -2 - The operand of the unary * operator shall have pointer type. - -Semantics - -3 - The unary & operator yields the address of its operand. If the operand has type ''type'', - the result has type ''pointer to type''. If the operand is the result of a unary * operator, - neither that operator nor the & operator is evaluated and the result is as if both were - omitted, except that the constraints on the operators still apply and the result is not an - - lvalue. Similarly, if the operand is the result of a [] operator, neither the & operator nor - the unary * that is implied by the [] is evaluated and the result is as if the & operator - were removed and the [] operator were changed to a + operator. Otherwise, the result is - a pointer to the object or function designated by its operand. - -4 - The unary * operator denotes indirection. If the operand points to a function, the result is - a function designator; if it points to an object, the result is an lvalue designating the - object. If the operand has type ''pointer to type'', the result has type ''type''. If an - invalid value has been assigned to the pointer, the behavior of the unary * operator is - undefined.102) - - Forward references: storage-class specifiers (6.7.1), structure and union specifiers - (6.7.2.1). - - -Footnotes - -102) Thus, &*E is equivalent to E (even if E is a null pointer), and &(E1[E2]) to ((E1)+(E2)). It is - always true that if E is a function designator or an lvalue that is a valid operand of the unary & - operator, *&E is a function designator or an lvalue equal to E. 
If *P is an lvalue and T is the name of - an object pointer type, *(T)P is an lvalue that has a type compatible with that to which T points. - Among the invalid values for dereferencing a pointer by the unary * operator are a null pointer, an - address inappropriately aligned for the type of object pointed to, and the address of an object after the - end of its lifetime. - - -Contents - -6.5.3.3 Unary arithmetic operators - - -Constraints - -1 - The operand of the unary + or - operator shall have arithmetic type; of the ~ operator, - integer type; of the ! operator, scalar type. - -Semantics - -2 - The result of the unary + operator is the value of its (promoted) operand. The integer - promotions are performed on the operand, and the result has the promoted type. - -3 - The result of the unary - operator is the negative of its (promoted) operand. The integer - promotions are performed on the operand, and the result has the promoted type. - -4 - The result of the ~ operator is the bitwise complement of its (promoted) operand (that is, - each bit in the result is set if and only if the corresponding bit in the converted operand is - not set). The integer promotions are performed on the operand, and the result has the - promoted type. If the promoted type is an unsigned type, the expression ~E is equivalent - to the maximum value representable in that type minus E. - -5 - The result of the logical negation operator ! is 0 if the value of its operand compares - unequal to 0, 1 if the value of its operand compares equal to 0. The result has type int. - The expression !E is equivalent to (0==E). - - - - - -Contents - -6.5.3.4 The sizeof and _Alignof operators - - -Constraints - -1 - The sizeof operator shall not be applied to an expression that has function type or an - incomplete type, to the parenthesized name of such a type, or to an expression that - designates a bit-field member. The _Alignof operator shall not be applied to a - function type or an incomplete type. 
- -Semantics - -2 - The sizeof operator yields the size (in bytes) of its operand, which may be an - expression or the parenthesized name of a type. The size is determined from the type of - the operand. The result is an integer. If the type of the operand is a variable length array - type, the operand is evaluated; otherwise, the operand is not evaluated and the result is an - integer constant. - -3 - The _Alignof operator yields the alignment requirement of its operand type. The - operand is not evaluated and the result is an integer constant. When applied to an array - type, the result is the alignment requirement of the element type. - -4 - When sizeof is applied to an operand that has type char, unsigned char, or - signed char, (or a qualified version thereof) the result is 1. When applied to an - operand that has array type, the result is the total number of bytes in the array.103) When - applied to an operand that has structure or union type, the result is the total number of - bytes in such an object, including internal and trailing padding. - -5 - The value of the result of both operators is implementation-defined, and its type (an - unsigned integer type) is size_t, defined in <stddef.h> (and other headers). - -6 - EXAMPLE 1 A principal use of the sizeof operator is in communication with routines such as storage - allocators and I/O systems. A storage-allocation function might accept a size (in bytes) of an object to - allocate and return a pointer to void. For example: - - - extern void *alloc(size_t); - double *dp = alloc(sizeof *dp); - - - The implementation of the alloc function should ensure that its return value is aligned suitably for - conversion to a pointer to double. 
- - -7 - EXAMPLE 2 Another use of the sizeof operator is to compute the number of elements in an array: - - - sizeof array / sizeof array[0] - - - - -8 - EXAMPLE 3 In this example, the size of a variable length array is computed and returned from a - function: - - - #include <stddef.h> - - - - - - - - size_t fsize3(int n) - { - char b[n+3]; // variable length array - return sizeof b; // execution time sizeof - } - int main() - { - size_t size; - size = fsize3(10); // fsize3 returns 13 - return 0; - } - - - - - Forward references: common definitions <stddef.h> (7.19), declarations (6.7), - structure and union specifiers (6.7.2.1), type names (6.7.7), array declarators (6.7.6.2). - - -Footnotes - -103) When applied to a parameter declared to have array or function type, the sizeof operator yields the - size of the adjusted (pointer) type (see 6.9.1). - - -Contents - -6.5.4 Cast operators - - -Syntax - -1 - - - cast-expression: - unary-expression - ( type-name ) cast-expression - - -Constraints - -2 - Unless the type name specifies a void type, the type name shall specify atomic, qualified, - or unqualified scalar type, and the operand shall have scalar type. - -3 - Conversions that involve pointers, other than where permitted by the constraints of - 6.5.16.1, shall be specified by means of an explicit cast. - -4 - A pointer type shall not be converted to any floating type. A floating type shall not be - converted to any pointer type. - -Semantics - -5 - Preceding an expression by a parenthesized type name converts the value of the - expression to the named type. This construction is called a cast.104) A cast that specifies - no conversion has no effect on the type or value of an expression. 
- -6 - If the value of the expression is represented with greater range or precision than required - by the type named by the cast (6.3.1.8), then the cast specifies a conversion even if the - type of the expression is the same as the named type and removes any extra range and - precision. - - Forward references: equality operators (6.5.9), function declarators (including - prototypes) (6.7.6.3), simple assignment (6.5.16.1), type names (6.7.7). - - - -Footnotes - -104) A cast does not yield an lvalue. Thus, a cast to a qualified type has the same effect as a cast to the - unqualified version of the type. - - -Contents - -6.5.5 Multiplicative operators - - -Syntax - -1 - - - multiplicative-expression: - cast-expression - multiplicative-expression * cast-expression - multiplicative-expression / cast-expression - multiplicative-expression % cast-expression - - -Constraints - -2 - Each of the operands shall have arithmetic type. The operands of the % operator shall - have integer type. - -Semantics - -3 - The usual arithmetic conversions are performed on the operands. - -4 - The result of the binary * operator is the product of the operands. - -5 - The result of the / operator is the quotient from the division of the first operand by the - second; the result of the % operator is the remainder. In both operations, if the value of - the second operand is zero, the behavior is undefined. - -6 - When integers are divided, the result of the / operator is the algebraic quotient with any - fractional part discarded.105) If the quotient a/b is representable, the expression - (a/b)*b + a%b shall equal a; otherwise, the behavior of both a/b and a%b is - undefined. - - -Footnotes - -105) This is often called ''truncation toward zero''. 
- - -Contents - -6.5.6 Additive operators - - -Syntax - -1 - - - additive-expression: - multiplicative-expression - additive-expression + multiplicative-expression - additive-expression - multiplicative-expression - - -Constraints - -2 - For addition, either both operands shall have arithmetic type, or one operand shall be a - pointer to a complete object type and the other shall have integer type. (Incrementing is - equivalent to adding 1.) - -3 - For subtraction, one of the following shall hold: - - - - - - -- both operands have arithmetic type; - -- both operands are pointers to qualified or unqualified versions of compatible complete - object types; or - -- the left operand is a pointer to a complete object type and the right operand has - integer type. - - (Decrementing is equivalent to subtracting 1.) - -Semantics - -4 - If both operands have arithmetic type, the usual arithmetic conversions are performed on - them. - -5 - The result of the binary + operator is the sum of the operands. - -6 - The result of the binary - operator is the difference resulting from the subtraction of the - second operand from the first. - -7 - For the purposes of these operators, a pointer to an object that is not an element of an - array behaves the same as a pointer to the first element of an array of length one with the - type of the object as its element type. - -8 - When an expression that has integer type is added to or subtracted from a pointer, the - result has the type of the pointer operand. If the pointer operand points to an element of - an array object, and the array is large enough, the result points to an element offset from - the original element such that the difference of the subscripts of the resulting and original - array elements equals the integer expression. 
In other words, if the expression P points to - the i-th element of an array object, the expressions (P)+N (equivalently, N+(P)) and - (P)-N (where N has the value n) point to, respectively, the i+n-th and i-n-th elements of - the array object, provided they exist. Moreover, if the expression P points to the last - element of an array object, the expression (P)+1 points one past the last element of the - array object, and if the expression Q points one past the last element of an array object, - the expression (Q)-1 points to the last element of the array object. If both the pointer - operand and the result point to elements of the same array object, or one past the last - element of the array object, the evaluation shall not produce an overflow; otherwise, the - behavior is undefined. If the result points one past the last element of the array object, it - shall not be used as the operand of a unary * operator that is evaluated. - -9 - When two pointers are subtracted, both shall point to elements of the same array object, - or one past the last element of the array object; the result is the difference of the - subscripts of the two array elements. The size of the result is implementation-defined, - and its type (a signed integer type) is ptrdiff_t defined in the <stddef.h> header. - If the result is not representable in an object of that type, the behavior is undefined. In - other words, if the expressions P and Q point to, respectively, the i-th and j-th elements of - an array object, the expression (P)-(Q) has the value i-j provided the value fits in an - - object of type ptrdiff_t. 
Moreover, if the expression P points either to an element of - an array object or one past the last element of an array object, and the expression Q points - to the last element of the same array object, the expression ((Q)+1)-(P) has the same - value as ((Q)-(P))+1 and as -((P)-((Q)+1)), and has the value zero if the - expression P points one past the last element of the array object, even though the - expression (Q)+1 does not point to an element of the array object.106) - -10 - EXAMPLE Pointer arithmetic is well defined with pointers to variable length array types. - - - { - int n = 4, m = 3; - int a[n][m]; - int (*p)[m] = a; // p == &a[0] - p += 1; // p == &a[1] - (*p)[2] = 99; // a[1][2] == 99 - n = p - a; // n == 1 - } - - -11 - If array a in the above example were declared to be an array of known constant size, and pointer p were - declared to be a pointer to an array of the same known constant size (pointing to a), the results would be - the same. - - - Forward references: array declarators (6.7.6.2), common definitions <stddef.h> - (7.19). - - -Footnotes - -106) Another way to approach pointer arithmetic is first to convert the pointer(s) to character pointer(s): In - this scheme the integer expression added to or subtracted from the converted pointer is first multiplied - by the size of the object originally pointed to, and the resulting pointer is converted back to the - original type. For pointer subtraction, the result of the difference between the character pointers is - similarly divided by the size of the object originally pointed to. - When viewed in this way, an implementation need only provide one extra byte (which may overlap - another object in the program) just after the end of the object in order to satisfy the ''one past the last - element'' requirements. 
- - -Contents - -6.5.7 Bitwise shift operators - - -Syntax - -1 - - - shift-expression: - additive-expression - shift-expression << additive-expression - shift-expression >> additive-expression - - -Constraints - -2 - Each of the operands shall have integer type. - -Semantics - -3 - The integer promotions are performed on each of the operands. The type of the result is - that of the promoted left operand. If the value of the right operand is negative or is - - - greater than or equal to the width of the promoted left operand, the behavior is undefined. - -4 - The result of E1 << E2 is E1 left-shifted E2 bit positions; vacated bits are filled with - zeros. If E1 has an unsigned type, the value of the result is $E1 \times 2^{E2}$, reduced modulo - one more than the maximum value representable in the result type. If E1 has a signed - type and nonnegative value, and $E1 \times 2^{E2}$ is representable in the result type, then that is - the resulting value; otherwise, the behavior is undefined. - -5 - The result of E1 >> E2 is E1 right-shifted E2 bit positions. If E1 has an unsigned type - or if E1 has a signed type and a nonnegative value, the value of the result is the integral - part of the quotient of $E1 / 2^{E2}$. If E1 has a signed type and a negative value, the - resulting value is implementation-defined. - - -Contents - -6.5.8 Relational operators - - -Syntax - -1 - - - relational-expression: - shift-expression - relational-expression < shift-expression - relational-expression > shift-expression - relational-expression <= shift-expression - relational-expression >= shift-expression - - -Constraints - -2 - One of the following shall hold: - - -- both operands have real type; or - -- both operands are pointers to qualified or unqualified versions of compatible object - types. - - -Semantics - -3 - If both of the operands have arithmetic type, the usual arithmetic conversions are - performed. 
- -4 - For the purposes of these operators, a pointer to an object that is not an element of an - array behaves the same as a pointer to the first element of an array of length one with the - type of the object as its element type. - -5 - When two pointers are compared, the result depends on the relative locations in the - address space of the objects pointed to. If two pointers to object types both point to the - same object, or both point one past the last element of the same array object, they - compare equal. If the objects pointed to are members of the same aggregate object, - pointers to structure members declared later compare greater than pointers to members - declared earlier in the structure, and pointers to array elements with larger subscript - values compare greater than pointers to elements of the same array with lower subscript - - values. All pointers to members of the same union object compare equal. If the - expression P points to an element of an array object and the expression Q points to the - last element of the same array object, the pointer expression Q+1 compares greater than - P. In all other cases, the behavior is undefined. - -6 - Each of the operators < (less than), > (greater than), <= (less than or equal to), and >= - (greater than or equal to) shall yield 1 if the specified relation is true and 0 if it is - false.107) The result has type int. - - -Footnotes - -107) The expression a<b<c is not interpreted as in ordinary mathematics. As the syntax indicates, it - means (a<b)<c; in other words, ''if a is less than b, compare 1 to c; otherwise, compare 0 to c''. 
- - -Contents - -6.5.9 Equality operators - - -Syntax - -1 - - - equality-expression: - relational-expression - equality-expression == relational-expression - equality-expression != relational-expression - - -Constraints - -2 - One of the following shall hold: - - -- both operands have arithmetic type; - -- both operands are pointers to qualified or unqualified versions of compatible types; - -- one operand is a pointer to an object type and the other is a pointer to a qualified or - unqualified version of void; or - -- one operand is a pointer and the other is a null pointer constant. - - -Semantics - -3 - The == (equal to) and != (not equal to) operators are analogous to the relational - operators except for their lower precedence.108) Each of the operators yields 1 if the - specified relation is true and 0 if it is false. The result has type int. For any pair of - operands, exactly one of the relations is true. - -4 - If both of the operands have arithmetic type, the usual arithmetic conversions are - performed. Values of complex types are equal if and only if both their real parts are equal - and also their imaginary parts are equal. Any two values of arithmetic types from - different type domains are equal if and only if the results of their conversions to the - (complex) result type determined by the usual arithmetic conversions are equal. - - - - - -5 - Otherwise, at least one operand is a pointer. If one operand is a pointer and the other is a - null pointer constant, the null pointer constant is converted to the type of the pointer. If - one operand is a pointer to an object type and the other is a pointer to a qualified or - unqualified version of void, the former is converted to the type of the latter. 
- -6 - Two pointers compare equal if and only if both are null pointers, both are pointers to the - same object (including a pointer to an object and a subobject at its beginning) or function, - both are pointers to one past the last element of the same array object, or one is a pointer - to one past the end of one array object and the other is a pointer to the start of a different - array object that happens to immediately follow the first array object in the address - space.109) - -7 - For the purposes of these operators, a pointer to an object that is not an element of an - array behaves the same as a pointer to the first element of an array of length one with the - type of the object as its element type. - - -Footnotes - -108) Because of the precedences, a<b == c<d is 1 whenever a<b and c<d have the same truth-value. - - -109) Two objects may be adjacent in memory because they are adjacent elements of a larger array or - adjacent members of a structure with no padding between them, or because the implementation chose - to place them so, even though they are unrelated. If prior invalid pointer operations (such as accesses - outside array bounds) produced undefined behavior, subsequent comparisons also produce undefined - behavior. - - -Contents - -6.5.10 Bitwise AND operator - - -Syntax - -1 - - - AND-expression: - equality-expression - AND-expression & equality-expression - - -Constraints - -2 - Each of the operands shall have integer type. - -Semantics - -3 - The usual arithmetic conversions are performed on the operands. - -4 - The result of the binary & operator is the bitwise AND of the operands (that is, each bit in - the result is set if and only if each of the corresponding bits in the converted operands is - set). - - - - - - -Contents - -6.5.11 Bitwise exclusive OR operator - - -Syntax - -1 - - - exclusive-OR-expression: - AND-expression - exclusive-OR-expression ^ AND-expression - - -Constraints - -2 - Each of the operands shall have integer type. 
- -Semantics - -3 - The usual arithmetic conversions are performed on the operands. - -4 - The result of the ^ operator is the bitwise exclusive OR of the operands (that is, each bit - in the result is set if and only if exactly one of the corresponding bits in the converted - operands is set). - - -Contents - -6.5.12 Bitwise inclusive OR operator - - -Syntax - -1 - - - inclusive-OR-expression: - exclusive-OR-expression - inclusive-OR-expression | exclusive-OR-expression - - -Constraints - -2 - Each of the operands shall have integer type. - -Semantics - -3 - The usual arithmetic conversions are performed on the operands. - -4 - The result of the | operator is the bitwise inclusive OR of the operands (that is, each bit in - the result is set if and only if at least one of the corresponding bits in the converted - operands is set). - - -Contents - -6.5.13 Logical AND operator - - -Syntax - -1 - - - logical-AND-expression: - inclusive-OR-expression - logical-AND-expression && inclusive-OR-expression - - -Constraints - -2 - Each of the operands shall have scalar type. - -Semantics - -3 - The && operator shall yield 1 if both of its operands compare unequal to 0; otherwise, it - yields 0. The result has type int. - -4 - Unlike the bitwise binary & operator, the && operator guarantees left-to-right evaluation; - if the second operand is evaluated, there is a sequence point between the evaluations of - the first and second operands. If the first operand compares equal to 0, the second - operand is not evaluated. - - -Contents - -6.5.14 Logical OR operator - - -Syntax - -1 - - - logical-OR-expression: - logical-AND-expression - logical-OR-expression || logical-AND-expression - - -Constraints - -2 - Each of the operands shall have scalar type. - -Semantics - -3 - The || operator shall yield 1 if either of its operands compare unequal to 0; otherwise, it - yields 0. The result has type int. 
- -4 - Unlike the bitwise | operator, the || operator guarantees left-to-right evaluation; if the - second operand is evaluated, there is a sequence point between the evaluations of the first - and second operands. If the first operand compares unequal to 0, the second operand is - not evaluated. - - -Contents - -6.5.15 Conditional operator - - -Syntax - -1 - - - conditional-expression: - logical-OR-expression - logical-OR-expression ? expression : conditional-expression - - -Constraints - -2 - The first operand shall have scalar type. - -3 - One of the following shall hold for the second and third operands: - - -- both operands have arithmetic type; - -- both operands have the same structure or union type; - -- both operands have void type; - -- both operands are pointers to qualified or unqualified versions of compatible types; - -- one operand is a pointer and the other is a null pointer constant; or - -- one operand is a pointer to an object type and the other is a pointer to a qualified or - unqualified version of void. - - -Semantics - -4 - The first operand is evaluated; there is a sequence point between its evaluation and the - evaluation of the second or third operand (whichever is evaluated). The second operand - is evaluated only if the first compares unequal to 0; the third operand is evaluated only if - the first compares equal to 0; the result is the value of the second or third operand - (whichever is evaluated), converted to the type described below.110) - -5 - If both the second and third operands have arithmetic type, the result type that would be - determined by the usual arithmetic conversions, were they applied to those two operands, - is the type of the result. If both the operands have structure or union type, the result has - that type. If both operands have void type, the result has void type. 
- -6 - If both the second and third operands are pointers or one is a null pointer constant and the - other is a pointer, the result type is a pointer to a type qualified with all the type qualifiers - of the types referenced by both operands. Furthermore, if both operands are pointers to - compatible types or to differently qualified versions of compatible types, the result type is - a pointer to an appropriately qualified version of the composite type; if one operand is a - null pointer constant, the result has the type of the other operand; otherwise, one operand - is a pointer to void or a qualified version of void, in which case the result type is a - pointer to an appropriately qualified version of void. - - - -7 - EXAMPLE The common type that results when the second and third operands are pointers is determined - in two independent stages. The appropriate qualifiers, for example, do not depend on whether the two - pointers have compatible types. - -8 - Given the declarations - - - const void *c_vp; - void *vp; - const int *c_ip; - volatile int *v_ip; - int *ip; - const char *c_cp; - - - the third column in the following table is the common type that is the result of a conditional expression in - which the first two columns are the second and third operands (in either order): - - - c_vp c_ip const void * - v_ip 0 volatile int * - c_ip v_ip const volatile int * - vp c_cp const void * - ip c_ip const int * - vp ip void * - - - - - -Footnotes - -110) A conditional expression does not yield an lvalue. - - -Contents - -6.5.16 Assignment operators - - -Syntax - -1 - - - assignment-expression: - conditional-expression - unary-expression assignment-operator assignment-expression - assignment-operator: one of - = *= /= %= += -= <<= >>= &= ^= |= - - -Constraints - -2 - An assignment operator shall have a modifiable lvalue as its left operand. - -Semantics - -3 - An assignment operator stores a value in the object designated by the left operand. 
An - assignment expression has the value of the left operand after the assignment,111) but is not - an lvalue. The type of an assignment expression is the type the left operand would have - after lvalue conversion. The side effect of updating the stored value of the left operand is - sequenced after the value computations of the left and right operands. The evaluations of - the operands are unsequenced. - - - - - - -Footnotes - -111) The implementation is permitted to read the object to determine the value but is not required to, even - when the object has volatile-qualified type. - - -Contents - -6.5.16.1 Simple assignment - - -Constraints - -1 - One of the following shall hold:112) - - -- the left operand has atomic, qualified, or unqualified arithmetic type, and the right has - arithmetic type; - -- the left operand has an atomic, qualified, or unqualified version of a structure or union - type compatible with the type of the right; - -- the left operand has atomic, qualified, or unqualified pointer type, and (considering - the type the left operand would have after lvalue conversion) both operands are - pointers to qualified or unqualified versions of compatible types, and the type pointed - to by the left has all the qualifiers of the type pointed to by the right; - -- the left operand has atomic, qualified, or unqualified pointer type, and (considering - the type the left operand would have after lvalue conversion) one operand is a pointer - to an object type, and the other is a pointer to a qualified or unqualified version of - void, and the type pointed to by the left has all the qualifiers of the type pointed to - by the right; - -- the left operand is an atomic, qualified, or unqualified pointer, and the right is a null - pointer constant; or - -- the left operand has type atomic, qualified, or unqualified _Bool, and the right is a - pointer. 
- - -Semantics - -2 - In simple assignment (=), the value of the right operand is converted to the type of the - assignment expression and replaces the value stored in the object designated by the left - operand. - -3 - If the value being stored in an object is read from another object that overlaps in any way - the storage of the first object, then the overlap shall be exact and the two objects shall - have qualified or unqualified versions of a compatible type; otherwise, the behavior is - undefined. - -4 - EXAMPLE 1 In the program fragment - - - - - - - int f(void); - char c; - /* ... */ - if ((c = f()) == -1) - /* ... */ - - - the int value returned by the function may be truncated when stored in the char, and then converted back - to int width prior to the comparison. In an implementation in which ''plain'' char has the same range of - values as unsigned char (and char is narrower than int), the result of the conversion cannot be - negative, so the operands of the comparison can never compare equal. Therefore, for full portability, the - variable c should be declared as int. - - -5 - EXAMPLE 2 In the fragment: - - - char c; - int i; - long l; - l = (c = i); - - - the value of i is converted to the type of the assignment expression c = i, that is, char type. The value - of the expression enclosed in parentheses is then converted to the type of the outer assignment expression, - that is, long int type. - - -6 - EXAMPLE 3 Consider the fragment: - - - const char **cpp; - char *p; - const char c = 'A'; - cpp = &p; // constraint violation - *cpp = &c; // valid - *p = 0; // valid - - - The first assignment is unsafe because it would allow the following valid code to attempt to change the - value of the const object c. 
- - - -Footnotes - -112) The asymmetric appearance of these constraints with respect to type qualifiers is due to the conversion - (specified in 6.3.2.1) that changes lvalues to ''the value of the expression'' and thus removes any type - qualifiers that were applied to the type category of the expression (for example, it removes const but - not volatile from the type int volatile * const). - - -Contents - -6.5.16.2 Compound assignment - - -Constraints - -1 - For the operators += and -= only, either the left operand shall be an atomic, qualified, or - unqualified pointer to a complete object type, and the right shall have integer type; or the - left operand shall have atomic, qualified, or unqualified arithmetic type, and the right - shall have arithmetic type. - -2 - For the other operators, the left operand shall have atomic, qualified, or unqualified - arithmetic type, and (considering the type the left operand would have after lvalue - conversion) each operand shall have arithmetic type consistent with those allowed by the - corresponding binary operator. - -Semantics - -3 - A compound assignment of the form E1 op= E2 is equivalent to the simple assignment - expression E1 = E1 op (E2), except that the lvalue E1 is evaluated only once, and with - respect to an indeterminately-sequenced function call, the operation of a compound - - assignment is a single evaluation. If E1 has an atomic type, compound assignment is a - read-modify-write operation with memory_order_seq_cst memory order - semantics.113) - - - - - - -Footnotes - -113) Where a pointer to an atomic object can be formed and E1 and E2 have integer type, this is equivalent - to the following code sequence where T1 is the type of E1 and T2 is the type of E2: - - - T1 *addr = &E1; - T2 val = (E2); - T1 old = *addr; - T1 new; - do { - new = old op val; - } while (!atomic_compare_exchange_strong(addr, &old, new)); - - - with new being the result of the operation. 
- If E1 or E2 has floating type, then exceptional conditions or floating-point exceptions encountered - during discarded evaluations of new should also be discarded in order to satisfy the equivalence of E1 - op= E2 and E1 = E1 op (E2). For example, if annex F is in effect, the floating types involved have - IEC 60559 formats, and FLT_EVAL_METHOD is 0, the equivalent code would be: - - - #include <fenv.h> - #pragma STDC FENV_ACCESS ON - /* ... */ - fenv_t fenv; - T1 *addr = &E1; - T2 val = E2; - T1 old = *addr; - T1 new; - feholdexcept(&fenv); - for (;;) { - new = old op val; - if (atomic_compare_exchange_strong(addr, &old, new)) - break; - feclearexcept(FE_ALL_EXCEPT); - } - feupdateenv(&fenv); - - - If FLT_EVAL_METHOD is not 0, then T2 must be a type with the range and precision to which E2 is - evaluated in order to satisfy the equivalence. - - -Contents - -6.5.17 Comma operator - - -Syntax - -1 - - - expression: - assignment-expression - expression , assignment-expression - - -Semantics - -2 - The left operand of a comma operator is evaluated as a void expression; there is a - sequence point between its evaluation and that of the right operand. Then the right - operand is evaluated; the result has its type and value.114) - -3 - EXAMPLE As indicated by the syntax, the comma operator (as described in this subclause) cannot - appear in contexts where a comma is used to separate items in a list (such as arguments to functions or lists - of initializers). On the other hand, it can be used within a parenthesized expression or within the second - expression of a conditional operator in such contexts. In the function call - - - f(a, (t=3, t+2), c) - - - the function has three arguments, the second of which has the value 5. - - - Forward references: initialization (6.7.9). - - - - - - -Footnotes - -114) A comma operator does not yield an lvalue. 
- - -Contents - -6.6 Constant expressions - - -Syntax - -1 - - - constant-expression: - conditional-expression - - -Description - -2 - A constant expression can be evaluated during translation rather than runtime, and - accordingly may be used in any place that a constant may be. - -Constraints - -3 - Constant expressions shall not contain assignment, increment, decrement, function-call, - or comma operators, except when they are contained within a subexpression that is not - evaluated.115) - -4 - Each constant expression shall evaluate to a constant that is in the range of representable - values for its type. - -Semantics - -5 - An expression that evaluates to a constant is required in several contexts. If a floating - expression is evaluated in the translation environment, the arithmetic range and precision - shall be at least as great as if the expression were being evaluated in the execution - environment.116) - -6 - An integer constant expression117) shall have integer type and shall only have operands - that are integer constants, enumeration constants, character constants, sizeof - expressions whose results are integer constants, _Alignof expressions, and floating - constants that are the immediate operands of casts. Cast operators in an integer constant - expression shall only convert arithmetic types to integer types, except as part of an - operand to the sizeof or _Alignof operator. - -7 - More latitude is permitted for constant expressions in initializers. Such a constant - expression shall be, or evaluate to, one of the following: - - -- an arithmetic constant expression, - - - - - -- a null pointer constant, - -- an address constant, or - -- an address constant for a complete object type plus or minus an integer constant - expression. 
- - -8 - An arithmetic constant expression shall have arithmetic type and shall only have - operands that are integer constants, floating constants, enumeration constants, character - constants, sizeof expressions whose results are integer constants, and _Alignof - expressions. Cast operators in an arithmetic constant expression shall only convert - arithmetic types to arithmetic types, except as part of an operand to a sizeof or - _Alignof operator. - -9 - An address constant is a null pointer, a pointer to an lvalue designating an object of static - storage duration, or a pointer to a function designator; it shall be created explicitly using - the unary & operator or an integer constant cast to pointer type, or implicitly by the use of - an expression of array or function type. The array-subscript [] and member-access . - and -> operators, the address & and indirection * unary operators, and pointer casts may - be used in the creation of an address constant, but the value of an object shall not be - accessed by use of these operators. - -10 - An implementation may accept other forms of constant expressions. - -11 - The semantic rules for the evaluation of a constant expression are the same as for - nonconstant expressions.118) - - Forward references: array declarators (6.7.6.2), initialization (6.7.9). - - - - - - -Footnotes - -115) The operand of a sizeof or _Alignof operator is usually not evaluated (6.5.3.4). - - -116) The use of evaluation formats as characterized by FLT_EVAL_METHOD also applies to evaluation in - the translation environment. - - -117) An integer constant expression is required in a number of contexts such as the size of a bit-field - member of a structure, the value of an enumeration constant, and the size of a non-variable length - array. Further constraints that apply to the integer constant expressions used in conditional-inclusion - preprocessing directives are discussed in 6.10.1. 
- - -118) Thus, in the following initialization, - - - static int i = 2 || 1 / 0; - - - the expression is a valid integer constant expression with value one. - - -Contents - -6.7 Declarations - - -Syntax - -1 - - - declaration: - declaration-specifiers init-declarator-listopt ; - static_assert-declaration - declaration-specifiers: - storage-class-specifier declaration-specifiersopt - type-specifier declaration-specifiersopt - type-qualifier declaration-specifiersopt - function-specifier declaration-specifiersopt - alignment-specifier declaration-specifiersopt - init-declarator-list: - init-declarator - init-declarator-list , init-declarator - init-declarator: - declarator - declarator = initializer - - -Constraints - -2 - A declaration other than a static_assert declaration shall declare at least a declarator - (other than the parameters of a function or the members of a structure or union), a tag, or - the members of an enumeration. - -3 - If an identifier has no linkage, there shall be no more than one declaration of the identifier - (in a declarator or type specifier) with the same scope and in the same name space, except - that: - - -- a typedef name may be redefined to denote the same type as it currently does, - provided that type is not a variably modified type; - -- tags may be redeclared as specified in 6.7.2.3. - - -4 - All declarations in the same scope that refer to the same object or function shall specify - compatible types. - -Semantics - -5 - A declaration specifies the interpretation and attributes of a set of identifiers. A definition - of an identifier is a declaration for that identifier that: - - -- for an object, causes storage to be reserved for that object; - -- for a function, includes the function body;119) - - -- for an enumeration constant, is the (only) declaration of the identifier; - -- for a typedef name, is the first (or only) declaration of the identifier. 
- - -6 - The declaration specifiers consist of a sequence of specifiers that indicate the linkage, - storage duration, and part of the type of the entities that the declarators denote. The init- - declarator-list is a comma-separated sequence of declarators, each of which may have - additional type information, or an initializer, or both. The declarators contain the - identifiers (if any) being declared. - -7 - If an identifier for an object is declared with no linkage, the type for the object shall be - complete by the end of its declarator, or by the end of its init-declarator if it has an - initializer; in the case of function parameters (including in prototypes), it is the adjusted - type (see 6.7.6.3) that is required to be complete. - - Forward references: declarators (6.7.6), enumeration specifiers (6.7.2.2), initialization - (6.7.9), type names (6.7.7), type qualifiers (6.7.3). - - -Footnotes - -119) Function definitions have a different syntax, described in 6.9.1. - - -Contents - -6.7.1 Storage-class specifiers - - -Syntax - -1 - - - storage-class-specifier: - typedef - extern - static - _Thread_local - auto - register - - -Constraints - -2 - At most, one storage-class specifier may be given in the declaration specifiers in a - declaration, except that _Thread_local may appear with static or extern.120) - -3 - In the declaration of an object with block scope, if the declaration specifiers include - _Thread_local, they shall also include either static or extern. If - _Thread_local appears in any declaration of an object, it shall be present in every - declaration of that object. - -4 - _Thread_local shall not appear in the declaration specifiers of a function declaration. - - - - - - -Semantics - -5 - The typedef specifier is called a ''storage-class specifier'' for syntactic convenience - only; it is discussed in 6.7.8. The meanings of the various linkages and storage durations - were discussed in 6.2.2 and 6.2.4. 
- -6 - A declaration of an identifier for an object with storage-class specifier register - suggests that access to the object be as fast as possible. The extent to which such - suggestions are effective is implementation-defined.121) - -7 - The declaration of an identifier for a function that has block scope shall have no explicit - storage-class specifier other than extern. - -8 - If an aggregate or union object is declared with a storage-class specifier other than - typedef, the properties resulting from the storage-class specifier, except with respect to - linkage, also apply to the members of the object, and so on recursively for any aggregate - or union member objects. - - Forward references: type definitions (6.7.8). - - - - - - -Footnotes - -120) See ''future language directions'' (6.11.5). - - -121) The implementation may treat any register declaration simply as an auto declaration. However, - whether or not addressable storage is actually used, the address of any part of an object declared with - storage-class specifier register cannot be computed, either explicitly (by use of the unary & - operator as discussed in 6.5.3.2) or implicitly (by converting an array name to a pointer as discussed in - 6.3.2.1). Thus, the only operators that can be applied to an array declared with storage-class specifier - register are sizeof and _Alignof. - - -Contents - -6.7.2 Type specifiers - - -Syntax - -1 - - - type-specifier: - void - char - short - int - long - float - double - signed - unsigned - _Bool - _Complex - atomic-type-specifier - struct-or-union-specifier - enum-specifier - typedef-name - - -Constraints - -2 - At least one type specifier shall be given in the declaration specifiers in each declaration, - and in the specifier-qualifier list in each struct declaration and type name. 
Each list of - type specifiers shall be one of the following multisets (delimited by commas, when there - is more than one multiset per item); the type specifiers may occur in any order, possibly - intermixed with the other declaration specifiers. - - -- void - -- char - -- signed char - -- unsigned char - -- short, signed short, short int, or signed short int - -- unsigned short, or unsigned short int - -- int, signed, or signed int - -- unsigned, or unsigned int - -- long, signed long, long int, or signed long int - -- unsigned long, or unsigned long int - - -- long long, signed long long, long long int, or - signed long long int - -- unsigned long long, or unsigned long long int - -- float - -- double - -- long double - -- _Bool - -- float _Complex - -- double _Complex - -- long double _Complex - -- atomic type specifier - -- struct or union specifier - -- enum specifier - -- typedef name - - -3 - The type specifier _Complex shall not be used if the implementation does not support - complex types (see 6.10.8.3). - -Semantics - -4 - Specifiers for structures, unions, enumerations, and atomic types are discussed in 6.7.2.1 - through 6.7.2.4. Declarations of typedef names are discussed in 6.7.8. The - characteristics of the other types are discussed in 6.2.5. - -5 - Each of the comma-separated multisets designates the same type, except that for bit- - fields, it is implementation-defined whether the specifier int designates the same type as - signed int or the same type as unsigned int. - - Forward references: atomic type specifiers (6.7.2.4), enumeration specifiers (6.7.2.2), - structure and union specifiers (6.7.2.1), tags (6.7.2.3), type definitions (6.7.8). 
- - -Contents - -6.7.2.1 Structure and union specifiers - - -Syntax - -1 - - - struct-or-union-specifier: - struct-or-union identifieropt { struct-declaration-list } - struct-or-union identifier - struct-or-union: - struct - union - struct-declaration-list: - struct-declaration - struct-declaration-list struct-declaration - struct-declaration: - specifier-qualifier-list struct-declarator-listopt ; - static_assert-declaration - specifier-qualifier-list: - type-specifier specifier-qualifier-listopt - type-qualifier specifier-qualifier-listopt - struct-declarator-list: - struct-declarator - struct-declarator-list , struct-declarator - struct-declarator: - declarator - declaratoropt : constant-expression - - -Constraints - -2 - A struct-declaration that does not declare an anonymous structure or anonymous union - shall contain a struct-declarator-list. - -3 - A structure or union shall not contain a member with incomplete or function type (hence, - a structure shall not contain an instance of itself, but may contain a pointer to an instance - of itself), except that the last member of a structure with more than one named member - may have incomplete array type; such a structure (and any union containing, possibly - recursively, a member that is such a structure) shall not be a member of a structure or an - element of an array. - -4 - The expression that specifies the width of a bit-field shall be an integer constant - expression with a nonnegative value that does not exceed the width of an object of the - type that would be specified were the colon and expression omitted.122) If the value is - zero, the declaration shall have no declarator. - -5 - A bit-field shall have a type that is a qualified or unqualified version of _Bool, signed - int, unsigned int, or some other implementation-defined type. It is - implementation-defined whether atomic types are permitted. 
- - - -Semantics - -6 - As discussed in 6.2.5, a structure is a type consisting of a sequence of members, whose - storage is allocated in an ordered sequence, and a union is a type consisting of a sequence - of members whose storage overlap. - -7 - Structure and union specifiers have the same form. The keywords struct and union - indicate that the type being specified is, respectively, a structure type or a union type. - -8 - The presence of a struct-declaration-list in a struct-or-union-specifier declares a new type, - within a translation unit. The struct-declaration-list is a sequence of declarations for the - members of the structure or union. If the struct-declaration-list does not contain any - named members, either directly or via an anonymous structure or anonymous union, the - behavior is undefined. The type is incomplete until immediately after the } that - terminates the list, and complete thereafter. - -9 - A member of a structure or union may have any complete object type other than a - variably modified type.123) In addition, a member may be declared to consist of a - specified number of bits (including a sign bit, if any). Such a member is called a - bit-field;124) its width is preceded by a colon. - -10 - A bit-field is interpreted as having a signed or unsigned integer type consisting of the - specified number of bits.125) If the value 0 or 1 is stored into a nonzero-width bit-field of - type _Bool, the value of the bit-field shall compare equal to the value stored; a _Bool - bit-field has the semantics of a _Bool. - -11 - An implementation may allocate any addressable storage unit large enough to hold a bit- - field. If enough space remains, a bit-field that immediately follows another bit-field in a - structure shall be packed into adjacent bits of the same unit. If insufficient space remains, - whether a bit-field that does not fit is put into the next unit or overlaps adjacent units is - implementation-defined. 
The order of allocation of bit-fields within a unit (high-order to - low-order or low-order to high-order) is implementation-defined. The alignment of the - addressable storage unit is unspecified. - -12 - A bit-field declaration with no declarator, but only a colon and a width, indicates an - unnamed bit-field.126) As a special case, a bit-field structure member with a width of 0 - - - - indicates that no further bit-field is to be packed into the unit in which the previous bit- - field, if any, was placed. - -13 - An unnamed member whose type specifier is a structure specifier with no tag is called an - anonymous structure; an unnamed member whose type specifier is a union specifier with - no tag is called an anonymous union. The members of an anonymous structure or union - are considered to be members of the containing structure or union. This applies - recursively if the containing structure or union is also anonymous. - -14 - Each non-bit-field member of a structure or union object is aligned in an implementation- - defined manner appropriate to its type. - -15 - Within a structure object, the non-bit-field members and the units in which bit-fields - reside have addresses that increase in the order in which they are declared. A pointer to a - structure object, suitably converted, points to its initial member (or if that member is a - bit-field, then to the unit in which it resides), and vice versa. There may be unnamed - padding within a structure object, but not at its beginning. - -16 - The size of a union is sufficient to contain the largest of its members. The value of at - most one of the members can be stored in a union object at any time. A pointer to a - union object, suitably converted, points to each of its members (or if a member is a bit- - field, then to the unit in which it resides), and vice versa. - -17 - There may be unnamed padding at the end of a structure or union. 
- -18 - As a special case, the last element of a structure with more than one named member may - have an incomplete array type; this is called a flexible array member. In most situations, - the flexible array member is ignored. In particular, the size of the structure is as if the - flexible array member were omitted except that it may have more trailing padding than - the omission would imply. However, when a . (or ->) operator has a left operand that is - (a pointer to) a structure with a flexible array member and the right operand names that - member, it behaves as if that member were replaced with the longest array (with the same - element type) that would not make the structure larger than the object being accessed; the - offset of the array shall remain that of the flexible array member, even if this would differ - from that of the replacement array. If this array would have no elements, it behaves as if - it had one element but the behavior is undefined if any attempt is made to access that - element or to generate a pointer one past it. - -19 - EXAMPLE 1 The following illustrates anonymous structures and unions: - - - struct v { - union { // anonymous union - struct { int i, j; }; // anonymous structure - struct { long k, l; } w; - }; - int m; - } v1; - v1.i = 2; // valid - v1.k = 3; // invalid: inner structure is not anonymous - v1.w.k = 5; // valid - - - - -20 - EXAMPLE 2 After the declaration: - - - struct s { int n; double d[]; }; - - - the structure struct s has a flexible array member d. A typical way to use this is: - - - int m = /* some value */; - struct s *p = malloc(sizeof (struct s) + sizeof (double [m])); - - - and assuming that the call to malloc succeeds, the object pointed to by p behaves, for most purposes, as if - p had been declared as: - - - struct { int n; double d[m]; } *p; - - - (there are circumstances in which this equivalence is broken; in particular, the offsets of member d might - not be the same). 
- -21 - Following the above declaration: - - - struct s t1 = { 0 }; // valid - struct s t2 = { 1, { 4.2 }}; // invalid - t1.n = 4; // valid - t1.d[0] = 4.2; // might be undefined behavior - - - The initialization of t2 is invalid (and violates a constraint) because struct s is treated as if it did not - contain member d. The assignment to t1.d[0] is probably undefined behavior, but it is possible that - - - sizeof (struct s) >= offsetof(struct s, d) + sizeof (double) - - - in which case the assignment would be legitimate. Nevertheless, it cannot appear in strictly conforming - code. - -22 - After the further declaration: - - - struct ss { int n; }; - - - the expressions: - - - sizeof (struct s) >= sizeof (struct ss) - sizeof (struct s) >= offsetof(struct s, d) - - - are always equal to 1. - -23 - If sizeof (double) is 8, then after the following code is executed: - - - struct s *s1; - struct s *s2; - s1 = malloc(sizeof (struct s) + 64); - s2 = malloc(sizeof (struct s) + 46); - - - and assuming that the calls to malloc succeed, the objects pointed to by s1 and s2 behave, for most - purposes, as if the identifiers had been declared as: - - - struct { int n; double d[8]; } *s1; - struct { int n; double d[5]; } *s2; - - -24 - Following the further successful assignments: - - - s1 = malloc(sizeof (struct s) + 10); - s2 = malloc(sizeof (struct s) + 6); - - - they then behave as if the declarations were: - - - struct { int n; double d[1]; } *s1, *s2; - - - and: - - - double *dp; - dp = &(s1->d[0]); // valid - *dp = 42; // valid - dp = &(s2->d[0]); // valid - *dp = 42; // undefined behavior - - -25 - The assignment: - - - *s1 = *s2; - - - only copies the member n; if any of the array elements are within the first sizeof (struct s) bytes - of the structure, they might be copied or simply overwritten with indeterminate values. 
- - -26 - EXAMPLE 3 Because members of anonymous structures and unions are considered to be members of the - containing structure or union, struct s in the following example has more than one named member and - thus the use of a flexible array member is valid: - - - struct s { - struct { int i; }; - int a[]; - }; - - - - - Forward references: declarators (6.7.6), tags (6.7.2.3). - - -Footnotes - -122) While the number of bits in a _Bool object is at least CHAR_BIT, the width (number of sign and - value bits) of a _Bool may be just 1 bit. - - -123) A structure or union cannot contain a member with a variably modified type because member names - are not ordinary identifiers as defined in 6.2.3. - - -124) The unary & (address-of) operator cannot be applied to a bit-field object; thus, there are no pointers to - or arrays of bit-field objects. - - -125) As specified in 6.7.2 above, if the actual type specifier used is int or a typedef-name defined as int, - then it is implementation-defined whether the bit-field is signed or unsigned. - - -126) An unnamed bit-field structure member is useful for padding to conform to externally imposed - layouts. - - -Contents - -6.7.2.2 Enumeration specifiers - - -Syntax - -1 - - - enum-specifier: - enum identifieropt { enumerator-list } - enum identifieropt { enumerator-list , } - enum identifier - enumerator-list: - enumerator - enumerator-list , enumerator - enumerator: - enumeration-constant - enumeration-constant = constant-expression - - -Constraints - -2 - The expression that defines the value of an enumeration constant shall be an integer - constant expression that has a value representable as an int. - - -Semantics - -3 - The identifiers in an enumerator list are declared as constants that have type int and - may appear wherever such are permitted.127) An enumerator with = defines its - enumeration constant as the value of the constant expression. If the first enumerator has - no =, the value of its enumeration constant is 0. 
Each subsequent enumerator with no = - defines its enumeration constant as the value of the constant expression obtained by - adding 1 to the value of the previous enumeration constant. (The use of enumerators with - = may produce enumeration constants with values that duplicate other values in the same - enumeration.) The enumerators of an enumeration are also known as its members. - -4 - Each enumerated type shall be compatible with char, a signed integer type, or an - unsigned integer type. The choice of type is implementation-defined,128) but shall be - capable of representing the values of all the members of the enumeration. The - enumerated type is incomplete until immediately after the } that terminates the list of - enumerator declarations, and complete thereafter. - -5 - EXAMPLE The following fragment: - - - enum hue { chartreuse, burgundy, claret=20, winedark }; - enum hue col, *cp; - col = claret; - cp = &col; - if (*cp != burgundy) - /* ... */ - - - makes hue the tag of an enumeration, and then declares col as an object that has that type and cp as a - pointer to an object that has that type. The enumerated values are in the set { 0, 1, 20, 21 }. - - - Forward references: tags (6.7.2.3). - - -Footnotes - -127) Thus, the identifiers of enumeration constants declared in the same scope shall all be distinct from - each other and from other identifiers declared in ordinary declarators. - - -128) An implementation may delay the choice of which integer type until all enumeration constants have - been seen. - - -Contents - -6.7.2.3 Tags - - -Constraints - -1 - A specific type shall have its content defined at most once. - -2 - Where two declarations that use the same tag declare the same type, they shall both use - the same choice of struct, union, or enum. - -3 - A type specifier of the form - - - enum identifier - - - without an enumerator list shall only appear after the type it specifies is complete. 
- - - - -Semantics - -4 - All declarations of structure, union, or enumerated types that have the same scope and - use the same tag declare the same type. Irrespective of whether there is a tag or what - other declarations of the type are in the same translation unit, the type is incomplete129) - until immediately after the closing brace of the list defining the content, and complete - thereafter. - -5 - Two declarations of structure, union, or enumerated types which are in different scopes or - use different tags declare distinct types. Each declaration of a structure, union, or - enumerated type which does not include a tag declares a distinct type. - -6 - A type specifier of the form - - - struct-or-union identifieropt { struct-declaration-list } - - - or - - - enum identifieropt { enumerator-list } - - - or - - - enum identifieropt { enumerator-list , } - - - declares a structure, union, or enumerated type. The list defines the structure content, - union content, or enumeration content. If an identifier is provided,130) the type specifier - also declares the identifier to be the tag of that type. - -7 - A declaration of the form - - - struct-or-union identifier ; - - - specifies a structure or union type and declares the identifier as a tag of that type.131) - -8 - If a type specifier of the form - - - struct-or-union identifier - - - occurs other than as part of one of the above forms, and no other declaration of the - identifier as a tag is visible, then it declares an incomplete structure or union type, and - declares the identifier as the tag of that type.131) - - - - - -9 - If a type specifier of the form - - - struct-or-union identifier - - - or - - - enum identifier - - - occurs other than as part of one of the above forms, and a declaration of the identifier as a - tag is visible, then it specifies the same type as that other declaration, and does not - redeclare the tag. 
- -10 - EXAMPLE 1 This mechanism allows declaration of a self-referential structure. - - - struct tnode { - int count; - struct tnode *left, *right; - }; - - - specifies a structure that contains an integer and two pointers to objects of the same type. Once this - declaration has been given, the declaration - - - struct tnode s, *sp; - - - declares s to be an object of the given type and sp to be a pointer to an object of the given type. With - these declarations, the expression sp->left refers to the left struct tnode pointer of the object to - which sp points; the expression s.right->count designates the count member of the right struct - tnode pointed to from s. - -11 - The following alternative formulation uses the typedef mechanism: - - - typedef struct tnode TNODE; - struct tnode { - int count; - TNODE *left, *right; - }; - TNODE s, *sp; - - - - -12 - EXAMPLE 2 To illustrate the use of prior declaration of a tag to specify a pair of mutually referential - structures, the declarations - - - struct s1 { struct s2 *s2p; /* ... */ }; // D1 - struct s2 { struct s1 *s1p; /* ... */ }; // D2 - - - specify a pair of structures that contain pointers to each other. Note, however, that if s2 were already - declared as a tag in an enclosing scope, the declaration D1 would refer to it, not to the tag s2 declared in - D2. To eliminate this context sensitivity, the declaration - - - struct s2; - - - may be inserted ahead of D1. This declares a new tag s2 in the inner scope; the declaration D2 then - completes the specification of the new type. - - - Forward references: declarators (6.7.6), type definitions (6.7.8). - - -Footnotes - -129) An incomplete type may only be used when the size of an object of that type is not needed. It is not - needed, for example, when a typedef name is declared to be a specifier for a structure or union, or - when a pointer to or a function returning a structure or union is being declared. (See incomplete types - in 6.2.5.) 
The specification has to be complete before such a function is called or defined. - - -130) If there is no identifier, the type can, within the translation unit, only be referred to by the declaration - of which it is a part. Of course, when the declaration is of a typedef name, subsequent declarations - can make use of that typedef name to declare objects having the specified structure, union, or - enumerated type. - - -131) A similar construction with enum does not exist. - - -Contents - -6.7.2.4 Atomic type specifiers - - -Syntax - -1 - - - atomic-type-specifier: - _Atomic ( type-name ) - - -Constraints - -2 - Atomic type specifiers shall not be used if the implementation does not support atomic - types (see 6.10.8.3). - -3 - The type name in an atomic type specifier shall not refer to an array type, a function type, - an atomic type, or a qualified type. - -Semantics - -4 - The properties associated with atomic types are meaningful only for expressions that are - lvalues. If the _Atomic keyword is immediately followed by a left parenthesis, it is - interpreted as a type specifier (with a type name), not as a type qualifier. - - -Contents - -6.7.3 Type qualifiers - - -Syntax - -1 - - - type-qualifier: - const - restrict - volatile - _Atomic - - -Constraints - -2 - Types other than pointer types whose referenced type is an object type shall not be - restrict-qualified. - -3 - The type modified by the _Atomic qualifier shall not be an array type or a function - type. - -Semantics - -4 - The properties associated with qualified types are meaningful only for expressions that - are lvalues.132) - -5 - If the same qualifier appears more than once in the same specifier-qualifier-list, either - directly or via one or more typedefs, the behavior is the same as if it appeared only - once. If other qualifiers appear along with the _Atomic qualifier in a specifier-qualifier- - - - list, the resulting type is the so-qualified atomic type. 
- -6 - If an attempt is made to modify an object defined with a const-qualified type through use - of an lvalue with non-const-qualified type, the behavior is undefined. If an attempt is - made to refer to an object defined with a volatile-qualified type through use of an lvalue - with non-volatile-qualified type, the behavior is undefined.133) - -7 - An object that has volatile-qualified type may be modified in ways unknown to the - implementation or have other unknown side effects. Therefore any expression referring - to such an object shall be evaluated strictly according to the rules of the abstract machine, - as described in 5.1.2.3. Furthermore, at every sequence point the value last stored in the - object shall agree with that prescribed by the abstract machine, except as modified by the - unknown factors mentioned previously.134) What constitutes an access to an object that - has volatile-qualified type is implementation-defined. - -8 - An object that is accessed through a restrict-qualified pointer has a special association - with that pointer. This association, defined in 6.7.3.1 below, requires that all accesses to - that object use, directly or indirectly, the value of that particular pointer.135) The intended - use of the restrict qualifier (like the register storage class) is to promote - optimization, and deleting all instances of the qualifier from all preprocessing translation - units composing a conforming program does not change its meaning (i.e., observable - behavior). - -9 - If the specification of an array type includes any type qualifiers, the element type is so- - qualified, not the array type. If the specification of a function type includes any type - qualifiers, the behavior is undefined.136) - -10 - For two qualified types to be compatible, both shall have the identically qualified version - of a compatible type; the order of type qualifiers within a list of specifiers or qualifiers - does not affect the specified type. 
- -11 - EXAMPLE 1 An object declared - - - extern const volatile int real_time_clock; - - - - - - - may be modifiable by hardware, but cannot be assigned to, incremented, or decremented. - - -12 - EXAMPLE 2 The following declarations and expressions illustrate the behavior when type qualifiers - modify an aggregate type: - - - const struct s { int mem; } cs = { 1 }; - struct s ncs; // the object ncs is modifiable - typedef int A[2][3]; - const A a = {{4, 5, 6}, {7, 8, 9}}; // array of array of const int - int *pi; - const int *pci; - ncs = cs; // valid - cs = ncs; // violates modifiable lvalue constraint for = - pi = &ncs.mem; // valid - pi = &cs.mem; // violates type constraints for = - pci = &cs.mem; // valid - pi = a[0]; // invalid: a[0] has type ''const int *'' - - - - -13 - EXAMPLE 3 The declaration - - - _Atomic volatile int *p; - - - specifies that p has the type ''pointer to volatile atomic int'', a pointer to a volatile-qualified atomic type. - - - -Footnotes - -132) The implementation may place a const object that is not volatile in a read-only region of - storage. Moreover, the implementation need not allocate storage for such an object if its address is - never used. - - -133) This applies to those objects that behave as if they were defined with qualified types, even if they are - never actually defined as objects in the program (such as an object at a memory-mapped input/output - address). - - -134) A volatile declaration may be used to describe an object corresponding to a memory-mapped - input/output port or an object accessed by an asynchronously interrupting function. Actions on - objects so declared shall not be ''optimized out'' by an implementation or reordered except as - permitted by the rules for evaluating expressions. - - -135) For example, a statement that assigns a value returned by malloc to a single pointer establishes this - association between the allocated object and the pointer. 
- - -136) Both of these can occur through the use of typedefs. - - -Contents - -6.7.3.1 Formal definition of restrict - - -1 - Let D be a declaration of an ordinary identifier that provides a means of designating an - object P as a restrict-qualified pointer to type T. - -2 - If D appears inside a block and does not have storage class extern, let B denote the - block. If D appears in the list of parameter declarations of a function definition, let B - denote the associated block. Otherwise, let B denote the block of main (or the block of - whatever function is called at program startup in a freestanding environment). - -3 - In what follows, a pointer expression E is said to be based on object P if (at some - sequence point in the execution of B prior to the evaluation of E) modifying P to point to - a copy of the array object into which it formerly pointed would change the value of E.137) - Note that ''based'' is defined only for expressions with pointer types. - -4 - During each execution of B, let L be any lvalue that has &L based on P. If L is used to - access the value of the object X that it designates, and X is also modified (by any means), - then the following requirements apply: T shall not be const-qualified. Every other lvalue - used to access the value of X shall also have its address based on P. Every access that - modifies X shall be considered also to modify P, for the purposes of this subclause. If P - is assigned the value of a pointer expression E that is based on another restricted pointer - - - - object P2, associated with block B2, then either the execution of B2 shall begin before - the execution of B, or the execution of B2 shall end prior to the assignment. If these - requirements are not met, then the behavior is undefined. - -5 - Here an execution of B means that portion of the execution of the program that would - correspond to the lifetime of an object with scalar type and automatic storage duration - associated with B. 
- -6 - A translator is free to ignore any or all aliasing implications of uses of restrict. - -7 - EXAMPLE 1 The file scope declarations - - - int * restrict a; - int * restrict b; - extern int c[]; - - - assert that if an object is accessed using one of a, b, or c, and that object is modified anywhere in the - program, then it is never accessed using either of the other two. - - -8 - EXAMPLE 2 The function parameter declarations in the following example - - - void f(int n, int * restrict p, int * restrict q) - { - while (n-- > 0) - *p++ = *q++; - } - - - assert that, during each execution of the function, if an object is accessed through one of the pointer - parameters, then it is not also accessed through the other. - -9 - The benefit of the restrict qualifiers is that they enable a translator to make an effective dependence - analysis of function f without examining any of the calls of f in the program. The cost is that the - programmer has to examine all of those calls to ensure that none give undefined behavior. For example, the - second call of f in g has undefined behavior because each of d[1] through d[49] is accessed through - both p and q. - - - void g(void) - { - extern int d[100]; - f(50, d + 50, d); // valid - f(50, d + 1, d); // undefined behavior - } - - - - -10 - EXAMPLE 3 The function parameter declarations - - - void h(int n, int * restrict p, int * restrict q, int * restrict r) - { - int i; - for (i = 0; i < n; i++) - p[i] = q[i] + r[i]; - } - - - illustrate how an unmodified object can be aliased through two restricted pointers. In particular, if a and b - are disjoint arrays, a call of the form h(100, a, b, b) has defined behavior, because array b is not - modified within function h. - - -11 - EXAMPLE 4 The rule limiting assignments between restricted pointers does not distinguish between a - function call and an equivalent nested block. 
With one exception, only ''outer-to-inner'' assignments - between restricted pointers declared in nested blocks have defined behavior. - - - { - int * restrict p1; - int * restrict q1; - p1 = q1; // undefined behavior - { - int * restrict p2 = p1; // valid - int * restrict q2 = q1; // valid - p1 = q2; // undefined behavior - p2 = q2; // undefined behavior - } - } - - -12 - The one exception allows the value of a restricted pointer to be carried out of the block in which it (or, more - precisely, the ordinary identifier used to designate it) is declared when that block finishes execution. For - example, this permits new_vector to return a vector. - - - typedef struct { int n; float * restrict v; } vector; - vector new_vector(int n) - { - vector t; - t.n = n; - t.v = malloc(n * sizeof (float)); - return t; - } - - - - - -Footnotes - -137) In other words, E depends on the value of P itself rather than on the value of an object referenced - indirectly through P. For example, if identifier p has type (int **restrict), then the pointer - expressions p and p+1 are based on the restricted pointer object designated by p, but the pointer - expressions *p and p[1] are not. - - -Contents - -6.7.4 Function specifiers - - -Syntax - -1 - - - function-specifier: - inline - _Noreturn - - -Constraints - -2 - Function specifiers shall be used only in the declaration of an identifier for a function. - -3 - An inline definition of a function with external linkage shall not contain a definition of a - modifiable object with static or thread storage duration, and shall not contain a reference - to an identifier with internal linkage. - -4 - In a hosted environment, no function specifier(s) shall appear in a declaration of main. - -Semantics - -5 - A function specifier may appear more than once; the behavior is the same as if it - appeared only once. - -6 - A function declared with an inline function specifier is an inline function. 
Making a - function an inline function suggests that calls to the function be as fast as possible.138) - - The extent to which such suggestions are effective is implementation-defined.139) - -7 - Any function with internal linkage can be an inline function. For a function with external - linkage, the following restrictions apply: If a function is declared with an inline - function specifier, then it shall also be defined in the same translation unit. If all of the - file scope declarations for a function in a translation unit include the inline function - specifier without extern, then the definition in that translation unit is an inline - definition. An inline definition does not provide an external definition for the function, - and does not forbid an external definition in another translation unit. An inline definition - provides an alternative to an external definition, which a translator may use to implement - any call to the function in the same translation unit. It is unspecified whether a call to the - function uses the inline definition or the external definition.140) - -8 - A function declared with a _Noreturn function specifier shall not return to its caller. - -Recommended practice - -9 - The implementation should produce a diagnostic message for a function declared with a - _Noreturn function specifier that appears to be capable of returning to its caller. - -10 - EXAMPLE 1 The declaration of an inline function with external linkage can result in either an external - definition, or a definition available for use only within the translation unit. A file scope declaration with - extern creates an external definition. The following example shows an entire translation unit. 
- - - inline double fahr(double t) - { - return (9.0 * t) / 5.0 + 32.0; - } - inline double cels(double t) - { - return (5.0 * (t - 32.0)) / 9.0; - } - extern double fahr(double); // creates an external definition - - - - - - - - - double convert(int is_fahr, double temp) - { - /* A translator may perform inline substitutions */ - return is_fahr ? cels(temp) : fahr(temp); - } - - -11 - Note that the definition of fahr is an external definition because fahr is also declared with extern, but - the definition of cels is an inline definition. Because cels has external linkage and is referenced, an - external definition has to appear in another translation unit (see 6.9); the inline definition and the external - definition are distinct and either may be used for the call. - - -12 - EXAMPLE 2 - - - _Noreturn void f () { - abort(); // ok - } - _Noreturn void g (int i) { // causes undefined behavior if i <= 0 - if (i > 0) abort(); - } - - - - - Forward references: function definitions (6.9.1). - - -Footnotes - -138) By using, for example, an alternative to the usual function call mechanism, such as ''inline - substitution''. Inline substitution is not textual substitution, nor does it create a new function. - Therefore, for example, the expansion of a macro used within the body of the function uses the - definition it had at the point the function body appears, and not where the function is called; and - identifiers refer to the declarations in scope where the body occurs. Likewise, the function has a - single address, regardless of the number of inline definitions that occur in addition to the external - definition. - - -139) For example, an implementation might never perform inline substitution, or might only perform inline - substitutions to calls in the scope of an inline declaration. 
- - -140) Since an inline definition is distinct from the corresponding external definition and from any other - corresponding inline definitions in other translation units, all corresponding objects with static storage - duration are also distinct in each of the definitions. - - -Contents - -6.7.5 Alignment specifier - - -Syntax - -1 - - - alignment-specifier: - _Alignas ( type-name ) - _Alignas ( constant-expression ) - - -Constraints - -2 - An alignment attribute shall not be specified in a declaration of a typedef, or a bit-field, or - a function, or a parameter, or an object declared with the register storage-class - specifier. - -3 - The constant expression shall be an integer constant expression. It shall evaluate to a - valid fundamental alignment, or to a valid extended alignment supported by the - implementation in the context in which it appears, or to zero. - -4 - The combined effect of all alignment attributes in a declaration shall not specify an - alignment that is less strict than the alignment that would otherwise be required for the - type of the object or member being declared. - -Semantics - -5 - The first form is equivalent to _Alignas (_Alignof (type-name)). - -6 - The alignment requirement of the declared object or member is taken to be the specified - alignment. An alignment specification of zero has no effect.141) When multiple - alignment specifiers occur in a declaration, the effective alignment requirement is the - strictest specified alignment. - - -7 - If the definition of an object has an alignment specifier, any other declaration of that - object shall either specify equivalent alignment or have no alignment specifier. If the - definition of an object does not have an alignment specifier, any other declaration of that - object shall also have no alignment specifier. If declarations of an object in different - translation units have different alignment specifiers, the behavior is undefined. 
- - -Footnotes - -141) An alignment specification of zero also does not affect other alignment specifications in the same - declaration. - - -Contents - -6.7.6 Declarators - - -Syntax - -1 - - - declarator: - pointeropt direct-declarator - direct-declarator: - identifier - ( declarator ) - direct-declarator [ type-qualifier-listopt assignment-expressionopt ] - direct-declarator [ static type-qualifier-listopt assignment-expression ] - direct-declarator [ type-qualifier-list static assignment-expression ] - direct-declarator [ type-qualifier-listopt * ] - direct-declarator ( parameter-type-list ) - direct-declarator ( identifier-listopt ) - pointer: - * type-qualifier-listopt - * type-qualifier-listopt pointer - type-qualifier-list: - type-qualifier - type-qualifier-list type-qualifier - parameter-type-list: - parameter-list - parameter-list , ... - parameter-list: - parameter-declaration - parameter-list , parameter-declaration - parameter-declaration: - declaration-specifiers declarator - declaration-specifiers abstract-declaratoropt - - - - - - - - identifier-list: - identifier - identifier-list , identifier - - -Semantics - -2 - Each declarator declares one identifier, and asserts that when an operand of the same - form as the declarator appears in an expression, it designates a function or object with the - scope, storage duration, and type indicated by the declaration specifiers. - -3 - A full declarator is a declarator that is not part of another declarator. The end of a full - declarator is a sequence point. If, in the nested sequence of declarators in a full - declarator, there is a declarator specifying a variable length array type, the type specified - by the full declarator is said to be variably modified. Furthermore, any type derived by - declarator type derivation from a variably modified type is itself variably modified. 
- -4 - In the following subclauses, consider a declaration - - - T D1 - - - where T contains the declaration specifiers that specify a type T (such as int) and D1 is - a declarator that contains an identifier ident. The type specified for the identifier ident in - the various forms of declarator is described inductively using this notation. - -5 - If, in the declaration ''T D1'', D1 has the form - - - identifier - - - then the type specified for ident is T . - -6 - If, in the declaration ''T D1'', D1 has the form - - - ( D ) - - - then ident has the type specified by the declaration ''T D''. Thus, a declarator in - parentheses is identical to the unparenthesized declarator, but the binding of complicated - declarators may be altered by parentheses. - -Implementation limits - -7 - As discussed in 5.2.4.1, an implementation may limit the number of pointer, array, and - function declarators that modify an arithmetic, structure, union, or void type, either - directly or via one or more typedefs. - - Forward references: array declarators (6.7.6.2), type definitions (6.7.8). - - -Contents - -6.7.6.1 Pointer declarators - - -Semantics - -1 - If, in the declaration ''T D1'', D1 has the form - - - * type-qualifier-listopt D - - - and the type specified for ident in the declaration ''T D'' is ''derived-declarator-type-list - T '', then the type specified for ident is ''derived-declarator-type-list type-qualifier-list - pointer to T ''. For each type qualifier in the list, ident is a so-qualified pointer. - -2 - For two pointer types to be compatible, both shall be identically qualified and both shall - be pointers to compatible types. - -3 - EXAMPLE The following pair of declarations demonstrates the difference between a ''variable pointer - to a constant value'' and a ''constant pointer to a variable value''. 
- - - const int *ptr_to_constant; - int *const constant_ptr; - - - The contents of any object pointed to by ptr_to_constant shall not be modified through that pointer, - but ptr_to_constant itself may be changed to point to another object. Similarly, the contents of the - int pointed to by constant_ptr may be modified, but constant_ptr itself shall always point to the - same location. - -4 - The declaration of the constant pointer constant_ptr may be clarified by including a definition for the - type ''pointer to int''. - - - typedef int *int_ptr; - const int_ptr constant_ptr; - - - declares constant_ptr as an object that has type ''const-qualified pointer to int''. - - - -Contents - -6.7.6.2 Array declarators - - -Constraints - -1 - In addition to optional type qualifiers and the keyword static, the [ and ] may delimit - an expression or *. If they delimit an expression (which specifies the size of an array), the - expression shall have an integer type. If the expression is a constant expression, it shall - have a value greater than zero. The element type shall not be an incomplete or function - type. The optional type qualifiers and the keyword static shall appear only in a - declaration of a function parameter with an array type, and then only in the outermost - array type derivation. - -2 - If an identifier is declared as having a variably modified type, it shall be an ordinary - identifier (as defined in 6.2.3), have no linkage, and have either block scope or function - prototype scope. If an identifier is declared to be an object with static or thread storage - duration, it shall not have a variable length array type. 
- - -Semantics - -3 - If, in the declaration ''T D1'', D1 has one of the forms: - - - D[ type-qualifier-listopt assignment-expressionopt ] - D[ static type-qualifier-listopt assignment-expression ] - D[ type-qualifier-list static assignment-expression ] - D[ type-qualifier-listopt * ] - - - and the type specified for ident in the declaration ''T D'' is ''derived-declarator-type-list - T '', then the type specified for ident is ''derived-declarator-type-list array of T ''.142) - (See 6.7.6.3 for the meaning of the optional type qualifiers and the keyword static.) - -4 - If the size is not present, the array type is an incomplete type. If the size is * instead of - being an expression, the array type is a variable length array type of unspecified size, - which can only be used in declarations or type names with function prototype scope;143) - such arrays are nonetheless complete types. If the size is an integer constant expression - and the element type has a known constant size, the array type is not a variable length - array type; otherwise, the array type is a variable length array type. (Variable length - arrays are a conditional feature that implementations need not support; see 6.10.8.3.) - -5 - If the size is an expression that is not an integer constant expression: if it occurs in a - declaration at function prototype scope, it is treated as if it were replaced by *; otherwise, - each time it is evaluated it shall have a value greater than zero. The size of each instance - of a variable length array type does not change during its lifetime. Where a size - expression is part of the operand of a sizeof operator and changing the value of the - size expression would not affect the result of the operator, it is unspecified whether or not - the size expression is evaluated. 
- -6 - For two array types to be compatible, both shall have compatible element types, and if - both size specifiers are present, and are integer constant expressions, then both size - specifiers shall have the same constant value. If the two array types are used in a context - which requires them to be compatible, it is undefined behavior if the two size specifiers - evaluate to unequal values. - -7 - EXAMPLE 1 - - - float fa[11], *afp[17]; - - - declares an array of float numbers and an array of pointers to float numbers. - - -8 - EXAMPLE 2 Note the distinction between the declarations - - - - - - - extern int *x; - extern int y[]; - - - The first declares x to be a pointer to int; the second declares y to be an array of int of unspecified size - (an incomplete type), the storage for which is defined elsewhere. - - -9 - EXAMPLE 3 The following declarations demonstrate the compatibility rules for variably modified types. - - - extern int n; - extern int m; - void fcompat(void) - { - int a[n][6][m]; - int (*p)[4][n+1]; - int c[n][n][6][m]; - int (*r)[n][n][n+1]; - p = a; // invalid: not compatible because 4 != 6 - r = c; // compatible, but defined behavior only if - // n == 6 and m == n+1 - } - - - - -10 - EXAMPLE 4 All declarations of variably modified (VM) types have to be at either block scope or - function prototype scope. Array objects declared with the _Thread_local, static, or extern - storage-class specifier cannot have a variable length array (VLA) type. However, an object declared with - the static storage-class specifier can have a VM type (that is, a pointer to a VLA type). Finally, all - identifiers declared with a VM type have to be ordinary identifiers and cannot, therefore, be members of - structures or unions. 
- - - extern int n; - int A[n]; // invalid: file scope VLA - extern int (*p2)[n]; // invalid: file scope VM - int B[100]; // valid: file scope but not VM - void fvla(int m, int C[m][m]); // valid: VLA with prototype scope - void fvla(int m, int C[m][m]) // valid: adjusted to auto pointer to VLA - { - typedef int VLA[m][m]; // valid: block scope typedef VLA - struct tag { - int (*y)[n]; // invalid: y not ordinary identifier - int z[n]; // invalid: z not ordinary identifier - }; - int D[m]; // valid: auto VLA - static int E[m]; // invalid: static block scope VLA - extern int F[m]; // invalid: F has linkage and is VLA - int (*s)[m]; // valid: auto pointer to VLA - extern int (*r)[m]; // invalid: r has linkage and points to VLA - static int (*q)[m] = &B; // valid: q is a static block pointer to VLA - } - - - - - Forward references: function declarators (6.7.6.3), function definitions (6.9.1), - initialization (6.7.9). - - -Footnotes - -142) When several ''array of'' specifications are adjacent, a multidimensional array is declared. - - -143) Thus, * can be used only in function declarations that are not definitions (see 6.7.6.3). - - -Contents - -6.7.6.3 Function declarators (including prototypes) - - -Constraints - -1 - A function declarator shall not specify a return type that is a function type or an array - type. - -2 - The only storage-class specifier that shall occur in a parameter declaration is register. - -3 - An identifier list in a function declarator that is not part of a definition of that function - shall be empty. - -4 - After adjustment, the parameters in a parameter type list in a function declarator that is - part of a definition of that function shall not have incomplete type. 
- -Semantics - -5 - If, in the declaration ''T D1'', D1 has the form - - - D( parameter-type-list ) - - - or - - - D( identifier-listopt ) - - - and the type specified for ident in the declaration ''T D'' is ''derived-declarator-type-list - T '', then the type specified for ident is ''derived-declarator-type-list function returning - T ''. - -6 - A parameter type list specifies the types of, and may declare identifiers for, the - parameters of the function. - -7 - A declaration of a parameter as ''array of type'' shall be adjusted to ''qualified pointer to - type'', where the type qualifiers (if any) are those specified within the [ and ] of the - array type derivation. If the keyword static also appears within the [ and ] of the - array type derivation, then for each call to the function, the value of the corresponding - actual argument shall provide access to the first element of an array with at least as many - elements as specified by the size expression. - -8 - A declaration of a parameter as ''function returning type'' shall be adjusted to ''pointer to - function returning type'', as in 6.3.2.1. - -9 - If the list terminates with an ellipsis (, ...), no information about the number or types - of the parameters after the comma is supplied.144) - -10 - The special case of an unnamed parameter of type void as the only item in the list - specifies that the function has no parameters. - - - - - -11 - If, in a parameter declaration, an identifier can be treated either as a typedef name or as a - parameter name, it shall be taken as a typedef name. - -12 - If the function declarator is not part of a definition of that function, parameters may have - incomplete type and may use the [*] notation in their sequences of declarator specifiers - to specify variable length array types. 
- -13 - The storage-class specifier in the declaration specifiers for a parameter declaration, if - present, is ignored unless the declared parameter is one of the members of the parameter - type list for a function definition. - -14 - An identifier list declares only the identifiers of the parameters of the function. An empty - list in a function declarator that is part of a definition of that function specifies that the - function has no parameters. The empty list in a function declarator that is not part of a - definition of that function specifies that no information about the number or types of the - parameters is supplied.145) - -15 - For two function types to be compatible, both shall specify compatible return types.146) - Moreover, the parameter type lists, if both are present, shall agree in the number of - parameters and in use of the ellipsis terminator; corresponding parameters shall have - compatible types. If one type has a parameter type list and the other type is specified by a - function declarator that is not part of a function definition and that contains an empty - identifier list, the parameter list shall not have an ellipsis terminator and the type of each - parameter shall be compatible with the type that results from the application of the - default argument promotions. If one type has a parameter type list and the other type is - specified by a function definition that contains a (possibly empty) identifier list, both shall - agree in the number of parameters, and the type of each prototype parameter shall be - compatible with the type that results from the application of the default argument - promotions to the type of the corresponding identifier. (In the determination of type - compatibility and of a composite type, each parameter declared with function or array - type is taken as having the adjusted type and each parameter declared with qualified type - is taken as having the unqualified version of its declared type.) 
- -16 - EXAMPLE 1 The declaration - - - int f(void), *fip(), (*pfi)(); - - - declares a function f with no parameters returning an int, a function fip with no parameter specification - returning a pointer to an int, and a pointer pfi to a function with no parameter specification returning an - int. It is especially useful to compare the last two. The binding of *fip() is *(fip()), so that the - declaration suggests, and the same construction in an expression requires, the calling of a function fip, - and then using indirection through the pointer result to yield an int. In the declarator (*pfi)(), the - extra parentheses are necessary to indicate that indirection through a pointer to a function yields a function - - - - designator, which is then used to call the function; it returns an int. - -17 - If the declaration occurs outside of any function, the identifiers have file scope and external linkage. If the - declaration occurs inside a function, the identifiers of the functions f and fip have block scope and either - internal or external linkage (depending on what file scope declarations for these identifiers are visible), and - the identifier of the pointer pfi has block scope and no linkage. - - -18 - EXAMPLE 2 The declaration - - - int (*apfi[3])(int *x, int *y); - - - declares an array apfi of three pointers to functions returning int. Each of these functions has two - parameters that are pointers to int. The identifiers x and y are declared for descriptive purposes only and - go out of scope at the end of the declaration of apfi. - - -19 - EXAMPLE 3 The declaration - - - int (*fpfi(int (*)(long), int))(int, ...); - - - declares a function fpfi that returns a pointer to a function returning an int. The function fpfi has two - parameters: a pointer to a function returning an int (with one parameter of type long int), and an int. 
- The pointer returned by fpfi points to a function that has one int parameter and accepts zero or more - additional arguments of any type. - - -20 - EXAMPLE 4 The following prototype has a variably modified parameter. - - - void addscalar(int n, int m, - double a[n][n*m+300], double x); - int main() - { - double b[4][308]; - addscalar(4, 2, b, 2.17); - return 0; - } - void addscalar(int n, int m, - double a[n][n*m+300], double x) - { - for (int i = 0; i < n; i++) - for (int j = 0, k = n*m+300; j < k; j++) - // a is a pointer to a VLA with n*m+300 elements - a[i][j] += x; - } - - - - -21 - EXAMPLE 5 The following are all compatible function prototype declarators. - - - double maximum(int n, int m, double a[n][m]); - double maximum(int n, int m, double a[*][*]); - double maximum(int n, int m, double a[ ][*]); - double maximum(int n, int m, double a[ ][m]); - - - as are: - - - void f(double (* restrict a)[5]); - void f(double a[restrict][5]); - void f(double a[restrict 3][5]); - void f(double a[restrict static 3][5]); - - - (Note that the last declaration also specifies that the argument corresponding to a in any call to f must be a - non-null pointer to the first of at least three arrays of 5 doubles, which the others do not.) - - - Forward references: function definitions (6.9.1), type names (6.7.7). - - -Footnotes - -144) The macros defined in the <stdarg.h> header (7.16) may be used to access arguments that - correspond to the ellipsis. - - -145) See ''future language directions'' (6.11.6). - - -146) If both function types are ''old style'', parameter types are not compared. 
- - -Contents - -6.7.7 Type names - - -Syntax - -1 - - - type-name: - specifier-qualifier-list abstract-declarator_opt - abstract-declarator: - pointer - pointer_opt direct-abstract-declarator - direct-abstract-declarator: - ( abstract-declarator ) - direct-abstract-declarator_opt [ type-qualifier-list_opt - assignment-expression_opt ] - direct-abstract-declarator_opt [ static type-qualifier-list_opt - assignment-expression ] - direct-abstract-declarator_opt [ type-qualifier-list static - assignment-expression ] - direct-abstract-declarator_opt [ * ] - direct-abstract-declarator_opt ( parameter-type-list_opt ) - - -Semantics - -2 - In several contexts, it is necessary to specify a type. This is accomplished using a type - name, which is syntactically a declaration for a function or an object of that type that - omits the identifier.147) - -3 - EXAMPLE The constructions - - - (a) int - (b) int * - (c) int *[3] - (d) int (*)[3] - (e) int (*)[*] - (f) int *() - (g) int (*)(void) - (h) int (*const [])(unsigned int, ...) - - - name respectively the types (a) int, (b) pointer to int, (c) array of three pointers to int, (d) pointer to an - array of three ints, (e) pointer to a variable length array of an unspecified number of ints, (f) function - with no parameter specification returning a pointer to int, (g) pointer to function with no parameters - returning an int, and (h) array of an unspecified number of constant pointers to functions, each with one - parameter that has type unsigned int and an unspecified number of other parameters, returning an - int. - - - -Footnotes - -147) As indicated by the syntax, empty parentheses in a type name are interpreted as ''function with no - parameter specification'', rather than redundant parentheses around the omitted identifier. - - -Contents - -6.7.8 Type definitions - - -Syntax - -1 - - - typedef-name: - identifier - - -Constraints - -2 - If a typedef name specifies a variably modified type then it shall have block scope. 
- -Semantics - -3 - In a declaration whose storage-class specifier is typedef, each declarator defines an - identifier to be a typedef name that denotes the type specified for the identifier in the way - described in 6.7.6. Any array size expressions associated with variable length array - declarators are evaluated each time the declaration of the typedef name is reached in the - order of execution. A typedef declaration does not introduce a new type, only a - synonym for the type so specified. That is, in the following declarations: - - - typedef T type_ident; - type_ident D; - - - type_ident is defined as a typedef name with the type specified by the declaration - specifiers in T (known as T ), and the identifier in D has the type - ''derived-declarator-type-list T '' where the derived-declarator-type-list is specified by the declarators of D. A - typedef name shares the same name space as other identifiers declared in ordinary - declarators. - -4 - EXAMPLE 1 After - - - typedef int MILES, KLICKSP(); - typedef struct { double hi, lo; } range; - - - the constructions - - - MILES distance; - extern KLICKSP *metricp; - range x; - range z, *zp; - - - are all valid declarations. The type of distance is int, that of metricp is ''pointer to function with no - parameter specification returning int'', and that of x and z is the specified structure; zp is a pointer to - such a structure. The object distance has a type compatible with any other int object. - - -5 - EXAMPLE 2 After the declarations - - - typedef struct s1 { int x; } t1, *tp1; - typedef struct s2 { int x; } t2, *tp2; - - - type t1 and the type pointed to by tp1 are compatible. Type t1 is also compatible with type struct - s1, but not compatible with the types struct s2, t2, the type pointed to by tp2, or int. 
- - -6 - EXAMPLE 3 The following obscure constructions - - - typedef signed int t; - typedef int plain; - struct tag { - unsigned t:4; - const t:5; - plain r:5; - }; - - - declare a typedef name t with type signed int, a typedef name plain with type int, and a structure - with three bit-field members, one named t that contains values in the range [0, 15], an unnamed - const-qualified bit-field which (if it could be accessed) would contain values in either the range [-15, +15] or - [-16, +15], and one named r that contains values in one of the ranges [0, 31], [-15, +15], or [-16, +15]. - (The choice of range is implementation-defined.) The first two bit-field declarations differ in that - unsigned is a type specifier (which forces t to be the name of a structure member), while const is a - type qualifier (which modifies t which is still visible as a typedef name). If these declarations are followed - in an inner scope by - - - t f(t (t)); - long t; - - - then a function f is declared with type ''function returning signed int with one unnamed parameter - with type pointer to function returning signed int with one unnamed parameter with type signed - int'', and an identifier t with type long int. - - -7 - EXAMPLE 4 On the other hand, typedef names can be used to improve code readability. All three of the - following declarations of the signal function specify exactly the same type, the first without making use - of any typedef names. 
- - - typedef void fv(int), (*pfv)(int); - void (*signal(int, void (*)(int)))(int); - fv *signal(int, fv *); - pfv signal(int, pfv); - - - - -8 - EXAMPLE 5 If a typedef name denotes a variable length array type, the length of the array is fixed at the - time the typedef name is defined, not each time it is used: - - - void copyt(int n) - { - typedef int B[n]; // B is n ints, n evaluated now - n += 1; - B a; // a is n ints, n without += 1 - int b[n]; // a and b are different sizes - for (int i = 1; i < n; i++) - a[i-1] = b[i]; - } - - -Contents - -6.7.9 Initialization - - -Syntax - -1 - - - initializer: - assignment-expression - { initializer-list } - { initializer-list , } - initializer-list: - designationopt initializer - initializer-list , designationopt initializer - designation: - designator-list = - designator-list: - designator - designator-list designator - designator: - [ constant-expression ] - . identifier - - -Constraints - -2 - No initializer shall attempt to provide a value for an object not contained within the entity - being initialized. - -3 - The type of the entity to be initialized shall be an array of unknown size or a complete - object type that is not a variable length array type. - -4 - All the expressions in an initializer for an object that has static or thread storage duration - shall be constant expressions or string literals. - -5 - If the declaration of an identifier has block scope, and the identifier has external or - internal linkage, the declaration shall have no initializer for the identifier. - -6 - If a designator has the form - - - [ constant-expression ] - - - then the current object (defined below) shall have array type and the expression shall be - an integer constant expression. If the array is of unknown size, any nonnegative value is - valid. - -7 - If a designator has the form - - - . 
identifier - - - then the current object (defined below) shall have structure or union type and the - identifier shall be the name of a member of that type. - - -Semantics - -8 - An initializer specifies the initial value stored in an object. - -9 - Except where explicitly stated otherwise, for the purposes of this subclause unnamed - members of objects of structure and union type do not participate in initialization. - Unnamed members of structure objects have indeterminate value even after initialization. - -10 - If an object that has automatic storage duration is not initialized explicitly, its value is - indeterminate. If an object that has static or thread storage duration is not initialized - explicitly, then: - - -- if it has pointer type, it is initialized to a null pointer; - -- if it has arithmetic type, it is initialized to (positive or unsigned) zero; - -- if it is an aggregate, every member is initialized (recursively) according to these rules, - and any padding is initialized to zero bits; - -- if it is a union, the first named member is initialized (recursively) according to these - rules, and any padding is initialized to zero bits; - - -11 - The initializer for a scalar shall be a single expression, optionally enclosed in braces. The - initial value of the object is that of the expression (after conversion); the same type - constraints and conversions as for simple assignment apply, taking the type of the scalar - to be the unqualified version of its declared type. - -12 - The rest of this subclause deals with initializers for objects that have aggregate or union - type. - -13 - The initializer for a structure or union object that has automatic storage duration shall be - either an initializer list as described below, or a single expression that has compatible - structure or union type. In the latter case, the initial value of the object, including - unnamed members, is that of the expression. 
- -14 - An array of character type may be initialized by a character string literal or UTF-8 string - literal, optionally enclosed in braces. Successive bytes of the string literal (including the - terminating null character if there is room or if the array is of unknown size) initialize the - elements of the array. - -15 - An array with element type compatible with a qualified or unqualified version of - wchar_t, char16_t, or char32_t may be initialized by a wide string literal with - the corresponding encoding prefix (L, u, or U, respectively), optionally enclosed in - braces. Successive wide characters of the wide string literal (including the terminating - null wide character if there is room or if the array is of unknown size) initialize the - elements of the array. - -16 - Otherwise, the initializer for an object that has aggregate or union type shall be a - brace-enclosed list of initializers for the elements or named members. - - -17 - Each brace-enclosed initializer list has an associated current object. When no - designations are present, subobjects of the current object are initialized in order according - to the type of the current object: array elements in increasing subscript order, structure - members in declaration order, and the first named member of a union.148) In contrast, a - designation causes the following initializer to begin initialization of the subobject - described by the designator. Initialization then continues forward in order, beginning - with the next subobject after that described by the designator.149) - -18 - Each designator list begins its description with the current object associated with the - closest surrounding brace pair. 
Each item in the designator list (in order) specifies a - particular member of its current object and changes the current object for the next - designator (if any) to be that member.150) The current object that results at the end of the - designator list is the subobject to be initialized by the following initializer. - -19 - The initialization shall occur in initializer list order, each initializer provided for a - particular subobject overriding any previously listed initializer for the same subobject;151) - all subobjects that are not initialized explicitly shall be initialized implicitly the same as - objects that have static storage duration. - -20 - If the aggregate or union contains elements or members that are aggregates or unions, - these rules apply recursively to the subaggregates or contained unions. If the initializer of - a subaggregate or contained union begins with a left brace, the initializers enclosed by - that brace and its matching right brace initialize the elements or members of the - subaggregate or the contained union. Otherwise, only enough initializers from the list are - taken to account for the elements or members of the subaggregate or the first member of - the contained union; any remaining initializers are left to initialize the next element or - member of the aggregate of which the current subaggregate or contained union is a part. - -21 - If there are fewer initializers in a brace-enclosed list than there are elements or members - of an aggregate, or fewer characters in a string literal used to initialize an array of known - size than there are elements in the array, the remainder of the aggregate shall be - initialized implicitly the same as objects that have static storage duration. - - - - - -22 - If an array of unknown size is initialized, its size is determined by the largest indexed - element with an explicit initializer. The array type is completed at the end of its - initializer list. 
- -23 - The evaluations of the initialization list expressions are indeterminately sequenced with - respect to one another and thus the order in which any side effects occur is - unspecified.152) - -24 - EXAMPLE 1 Provided that <complex.h> has been #included, the declarations - - - int i = 3.5; - double complex c = 5 + 3 * I; - - - define and initialize i with the value 3 and c with the value 5.0 + i3.0. - - -25 - EXAMPLE 2 The declaration - - - int x[] = { 1, 3, 5 }; - - - defines and initializes x as a one-dimensional array object that has three elements, as no size was specified - and there are three initializers. - - -26 - EXAMPLE 3 The declaration - - - int y[4][3] = { - { 1, 3, 5 }, - { 2, 4, 6 }, - { 3, 5, 7 }, - }; - - - is a definition with a fully bracketed initialization: 1, 3, and 5 initialize the first row of y (the array object - y[0]), namely y[0][0], y[0][1], and y[0][2]. Likewise the next two lines initialize y[1] and - y[2]. The initializer ends early, so y[3] is initialized with zeros. Precisely the same effect could have - been achieved by - - - int y[4][3] = { - 1, 3, 5, 2, 4, 6, 3, 5, 7 - }; - - - The initializer for y[0] does not begin with a left brace, so three items from the list are used. Likewise the - next three are taken successively for y[1] and y[2]. - - -27 - EXAMPLE 4 The declaration - - - int z[4][3] = { - { 1 }, { 2 }, { 3 }, { 4 } - }; - - - initializes the first column of z as specified and initializes the rest with zeros. - - -28 - EXAMPLE 5 The declaration - - - struct { int a[3], b; } w[] = { { 1 }, 2 }; - - - is a definition with an inconsistently bracketed initialization. It defines an array with two element - - - - - structures: w[0].a[0] is 1 and w[1].a[0] is 2; all the other elements are zero. - - -29 - EXAMPLE 6 The declaration - - - short q[4][3][2] = { - { 1 }, - { 2, 3 }, - { 4, 5, 6 } - }; - - - contains an incompletely but consistently bracketed initialization. 
It defines a three-dimensional array - object: q[0][0][0] is 1, q[1][0][0] is 2, q[1][0][1] is 3, and 4, 5, and 6 initialize - q[2][0][0], q[2][0][1], and q[2][1][0], respectively; all the rest are zero. The initializer for - q[0][0] does not begin with a left brace, so up to six items from the current list may be used. There is - only one, so the values for the remaining five elements are initialized with zero. Likewise, the initializers - for q[1][0] and q[2][0] do not begin with a left brace, so each uses up to six items, initializing their - respective two-dimensional subaggregates. If there had been more than six items in any of the lists, a - diagnostic message would have been issued. The same initialization result could have been achieved by: - - - short q[4][3][2] = { - 1, 0, 0, 0, 0, 0, - 2, 3, 0, 0, 0, 0, - 4, 5, 6 - }; - - - or by: - - - short q[4][3][2] = { - { - { 1 }, - }, - { - { 2, 3 }, - }, - { - { 4, 5 }, - { 6 }, - } - }; - - - in a fully bracketed form. - -30 - Note that the fully bracketed and minimally bracketed forms of initialization are, in general, less likely to - cause confusion. - - -31 - EXAMPLE 7 One form of initialization that completes array types involves typedef names. Given the - declaration - - - typedef int A[]; // OK - declared with block scope - - - the declaration - - - A a = { 1, 2 }, b = { 3, 4, 5 }; - - - is identical to - - - int a[] = { 1, 2 }, b[] = { 3, 4, 5 }; - - - due to the rules for incomplete types. - - -32 - EXAMPLE 8 The declaration - - - char s[] = "abc", t[3] = "abc"; - - - defines ''plain'' char array objects s and t whose elements are initialized with character string literals. - This declaration is identical to - - - char s[] = { 'a', 'b', 'c', '\0' }, - t[] = { 'a', 'b', 'c' }; - - - The contents of the arrays are modifiable. 
On the other hand, the declaration - - - char *p = "abc"; - - - defines p with type ''pointer to char'' and initializes it to point to an object with type ''array of char'' - with length 4 whose elements are initialized with a character string literal. If an attempt is made to use p to - modify the contents of the array, the behavior is undefined. - - -33 - EXAMPLE 9 Arrays can be initialized to correspond to the elements of an enumeration by using - designators: - - - enum { member_one, member_two }; - const char *nm[] = { - [member_two] = "member two", - [member_one] = "member one", - }; - - - - -34 - EXAMPLE 10 Structure members can be initialized to nonzero values without depending on their order: - - - div_t answer = { .quot = 2, .rem = -1 }; - - - - -35 - EXAMPLE 11 Designators can be used to provide explicit initialization when unadorned initializer lists - might be misunderstood: - - - struct { int a[3], b; } w[] = - { [0].a = {1}, [1].a[0] = 2 }; - - - - -36 - EXAMPLE 12 Space can be ''allocated'' from both ends of an array by using a single designator: - - - int a[MAX] = { - 1, 3, 5, 7, 9, [MAX-5] = 8, 6, 4, 2, 0 - }; - - -37 - In the above, if MAX is greater than ten, there will be some zero-valued elements in the middle; if it is less - than ten, some of the values provided by the first five initializers will be overridden by the second five. - - -38 - EXAMPLE 13 Any member of a union can be initialized: - - - union { /* ... */ } u = { .any_member = 42 }; - - - - - Forward references: common definitions <stddef.h> (7.19). - - -Footnotes - -148) If the initializer list for a subaggregate or contained union does not begin with a left brace, its - subobjects are initialized as usual, but the subaggregate or contained union does not become the - current object: current objects are associated only with brace-enclosed initializer lists. 
- - -149) After a union member is initialized, the next object is not the next member of the union; instead, it is - the next subobject of an object containing the union. - - -150) Thus, a designator can only specify a strict subobject of the aggregate or union that is associated with - the surrounding brace pair. Note, too, that each separate designator list is independent. - - -151) Any initializer for the subobject which is overridden and so not used to initialize that subobject might - not be evaluated at all. - - -152) In particular, the evaluation order need not be the same as the order of subobject initialization. - - -Contents - -6.7.10 Static assertions - - -Syntax - -1 - - - static_assert-declaration: - _Static_assert ( constant-expression , string-literal ) ; - - -Constraints - -2 - The constant expression shall compare unequal to 0. - -Semantics - -3 - The constant expression shall be an integer constant expression. If the value of the - constant expression compares unequal to 0, the declaration has no effect. Otherwise, the - constraint is violated and the implementation shall produce a diagnostic message that - includes the text of the string literal, except that characters not in the basic source - character set are not required to appear in the message. - - Forward references: diagnostics (7.2). - - -Contents - -6.8 Statements and blocks - - -Syntax - -1 - - - statement: - labeled-statement - compound-statement - expression-statement - selection-statement - iteration-statement - jump-statement - - -Semantics - -2 - A statement specifies an action to be performed. Except as indicated, statements are - executed in sequence. - -3 - A block allows a set of declarations and statements to be grouped into one syntactic unit. 
- The initializers of objects that have automatic storage duration, and the variable length - array declarators of ordinary identifiers with block scope, are evaluated and the values are - stored in the objects (including storing an indeterminate value in objects without an - initializer) each time the declaration is reached in the order of execution, as if it were a - statement, and within each declaration in the order that declarators appear. - -4 - A full expression is an expression that is not part of another expression or of a declarator. - Each of the following is a full expression: an initializer that is not part of a compound - literal; the expression in an expression statement; the controlling expression of a selection - statement (if or switch); the controlling expression of a while or do statement; each - of the (optional) expressions of a for statement; the (optional) expression in a return - statement. There is a sequence point between the evaluation of a full expression and the - evaluation of the next full expression to be evaluated. - - Forward references: expression and null statements (6.8.3), selection statements - (6.8.4), iteration statements (6.8.5), the return statement (6.8.6.4). - - -Contents - -6.8.1 Labeled statements - - -Syntax - -1 - - - labeled-statement: - identifier : statement - case constant-expression : statement - default : statement - - -Constraints - -2 - A case or default label shall appear only in a switch statement. Further - constraints on such labels are discussed under the switch statement. - - -3 - Label names shall be unique within a function. - -Semantics - -4 - Any statement may be preceded by a prefix that declares an identifier as a label name. - Labels in themselves do not alter the flow of control, which continues unimpeded across - them. - - Forward references: the goto statement (6.8.6.1), the switch statement (6.8.4.2). 
- - -Contents - -6.8.2 Compound statement - - -Syntax - -1 - - - compound-statement: - { block-item-list_opt } - block-item-list: - block-item - block-item-list block-item - block-item: - declaration - statement - - -Semantics - -2 - A compound statement is a block. - - -Contents - -6.8.3 Expression and null statements - - -Syntax - -1 - - - expression-statement: - expression_opt ; - - -Semantics - -2 - The expression in an expression statement is evaluated as a void expression for its side - effects.153) - -3 - A null statement (consisting of just a semicolon) performs no operations. - -4 - EXAMPLE 1 If a function call is evaluated as an expression statement for its side effects only, the - discarding of its value may be made explicit by converting the expression to a void expression by means of - a cast: - - - int p(int); - /* ... */ - (void)p(0); - - - - - - - -5 - EXAMPLE 2 In the program fragment - - - char *s; - /* ... */ - while (*s++ != '\0') - ; - - - a null statement is used to supply an empty loop body to the iteration statement. - - -6 - EXAMPLE 3 A null statement may also be used to carry a label just before the closing } of a compound - statement. - - - while (loop1) { - /* ... */ - while (loop2) { - /* ... */ - if (want_out) - goto end_loop1; - /* ... */ - } - /* ... */ - end_loop1: ; - } - - - - - Forward references: iteration statements (6.8.5). - - -Footnotes - -153) Such as assignments, and function calls which have side effects. - - -Contents - -6.8.4 Selection statements - - -Syntax - -1 - - - selection-statement: - if ( expression ) statement - if ( expression ) statement else statement - switch ( expression ) statement - - -Semantics - -2 - A selection statement selects among a set of statements depending on the value of a - controlling expression. - -3 - A selection statement is a block whose scope is a strict subset of the scope of its - enclosing block. 
Each associated substatement is also a block whose scope is a strict - subset of the scope of the selection statement. - - -Contents - -6.8.4.1 The if statement - - -Constraints - -1 - The controlling expression of an if statement shall have scalar type. - -Semantics - -2 - In both forms, the first substatement is executed if the expression compares unequal to 0. - In the else form, the second substatement is executed if the expression compares equal - - to 0. If the first substatement is reached via a label, the second substatement is not - executed. - -3 - An else is associated with the lexically nearest preceding if that is allowed by the - syntax. - - -Contents - -6.8.4.2 The switch statement - - -Constraints - -1 - The controlling expression of a switch statement shall have integer type. - -2 - If a switch statement has an associated case or default label within the scope of an - identifier with a variably modified type, the entire switch statement shall be within the - scope of that identifier.154) - -3 - The expression of each case label shall be an integer constant expression and no two of - the case constant expressions in the same switch statement shall have the same value - after conversion. There may be at most one default label in a switch statement. - (Any enclosed switch statement may have a default label or case constant - expressions with values that duplicate case constant expressions in the enclosing - switch statement.) - -Semantics - -4 - A switch statement causes control to jump to, into, or past the statement that is the - switch body, depending on the value of a controlling expression, and on the presence of a - default label and the values of any case labels on or in the switch body. A case or - default label is accessible only within the closest enclosing switch statement. - -5 - The integer promotions are performed on the controlling expression. 
The constant - expression in each case label is converted to the promoted type of the controlling - expression. If a converted value matches that of the promoted controlling expression, - control jumps to the statement following the matched case label. Otherwise, if there is - a default label, control jumps to the labeled statement. If no converted case constant - expression matches and there is no default label, no part of the switch body is - executed. - -Implementation limits - -6 - As discussed in 5.2.4.1, the implementation may limit the number of case values in a - switch statement. - - - - - - -7 - EXAMPLE In the artificial program fragment - - - switch (expr) - { - int i = 4; - f(i); - case 0: - i = 17; - /* falls through into default code */ - default: - printf("%d\n", i); - } - - - the object whose identifier is i exists with automatic storage duration (within the block) but is never - initialized, and thus if the controlling expression has a nonzero value, the call to the printf function will - access an indeterminate value. Similarly, the call to the function f cannot be reached. - - - -Footnotes - -154) That is, the declaration either precedes the switch statement, or it follows the last case or - default label associated with the switch that is in the block containing the declaration. - - -Contents - -6.8.5 Iteration statements - - -Syntax - -1 - - - iteration-statement: - while ( expression ) statement - do statement while ( expression ) ; - for ( expression_opt ; expression_opt ; expression_opt ) statement - for ( declaration expression_opt ; expression_opt ) statement - - -Constraints - -2 - The controlling expression of an iteration statement shall have scalar type. - -3 - The declaration part of a for statement shall only declare identifiers for objects having - storage class auto or register. 
- -Semantics - -4 - An iteration statement causes a statement called the loop body to be executed repeatedly - until the controlling expression compares equal to 0. The repetition occurs regardless of - whether the loop body is entered from the iteration statement or by a jump.155) - -5 - An iteration statement is a block whose scope is a strict subset of the scope of its - enclosing block. The loop body is also a block whose scope is a strict subset of the scope - of the iteration statement. - -6 - An iteration statement whose controlling expression is not a constant expression,156) that - performs no input/output operations, does not access volatile objects, and performs no - synchronization or atomic operations in its body, controlling expression, or (in the case of - - - a for statement) its expression-3, may be assumed by the implementation to - terminate.157) - - -Footnotes - -155) Code jumped over is not executed. In particular, the controlling expression of a for or while - statement is not evaluated before entering the loop body, nor is clause-1 of a for statement. - - -156) An omitted controlling expression is replaced by a nonzero constant, which is a constant expression. - - -157) This is intended to allow compiler transformations such as removal of empty loops even when - termination cannot be proven. - - -Contents - -6.8.5.1 The while statement - - -1 - The evaluation of the controlling expression takes place before each execution of the loop - body. - - -Contents - -6.8.5.2 The do statement - - -1 - The evaluation of the controlling expression takes place after each execution of the loop - body. - - -Contents - -6.8.5.3 The for statement - - -1 - The statement - - - for ( clause-1 ; expression-2 ; expression-3 ) statement - - - behaves as follows: The expression expression-2 is the controlling expression that is - evaluated before each execution of the loop body. 
The expression expression-3 is - evaluated as a void expression after each execution of the loop body. If clause-1 is a - declaration, the scope of any identifiers it declares is the remainder of the declaration and - the entire loop, including the other two expressions; it is reached in the order of execution - before the first evaluation of the controlling expression. If clause-1 is an expression, it is - evaluated as a void expression before the first evaluation of the controlling expression.158) - -2 - Both clause-1 and expression-3 can be omitted. An omitted expression-2 is replaced by a - nonzero constant. - - -Footnotes - -158) Thus, clause-1 specifies initialization for the loop, possibly declaring one or more variables for use in - the loop; the controlling expression, expression-2, specifies an evaluation made before each iteration, - such that execution of the loop continues until the expression compares equal to 0; and expression-3 - specifies an operation (such as incrementing) that is performed after each iteration. - - -Contents - -6.8.6 Jump statements - - -Syntax - -1 - - - jump-statement: - goto identifier ; - continue ; - break ; - return expression_opt ; - - - - - - - - -Semantics - -2 - A jump statement causes an unconditional jump to another place. - - -Contents - -6.8.6.1 The goto statement - - -Constraints - -1 - The identifier in a goto statement shall name a label located somewhere in the enclosing - function. A goto statement shall not jump from outside the scope of an identifier having - a variably modified type to inside the scope of that identifier. - -Semantics - -2 - A goto statement causes an unconditional jump to the statement prefixed by the named - label in the enclosing function. - -3 - EXAMPLE 1 It is sometimes convenient to jump into the middle of a complicated set of statements. 
The - following outline presents one possible approach to a problem based on these three assumptions: - - -- The general initialization code accesses objects only visible to the current function. - -- The general initialization code is too large to warrant duplication. - -- The code to determine the next operation is at the head of the loop. (To allow it to be reached by - continue statements, for example.) - - - /* ... */ - goto first_time; - for (;;) { - // determine next operation - /* ... */ - if (need to reinitialize) { - // reinitialize-only code - /* ... */ - first_time: - // general initialization code - /* ... */ - continue; - } - // handle other operations - /* ... */ - } - - -4 - EXAMPLE 2 A goto statement is not allowed to jump past any declarations of objects with variably - modified types. A jump within the scope, however, is permitted. - - - goto lab3; // invalid: going INTO scope of VLA. - { - double a[n]; - a[j] = 4.4; - lab3: - a[j] = 3.3; - goto lab4; // valid: going WITHIN scope of VLA. - a[j] = 5.5; - lab4: - a[j] = 6.6; - } - goto lab4; // invalid: going INTO scope of VLA. - - - - - -Contents - -6.8.6.2 The continue statement - - -Constraints - -1 - A continue statement shall appear only in or as a loop body. - -Semantics - -2 - A continue statement causes a jump to the loop-continuation portion of the smallest - enclosing iteration statement; that is, to the end of the loop body. More precisely, in each - of the statements - - - while (/* ... */) { do { for (/* ... */) { - /* ... */ /* ... */ /* ... */ - continue; continue; continue; - /* ... */ /* ... */ /* ... */ - contin: ; contin: ; contin: ; - } } while (/* ... */); } - - - unless the continue statement shown is in an enclosed iteration statement (in which - case it is interpreted within that statement), it is equivalent to goto contin;.159) - - -Footnotes - -159) Following the contin: label is a null statement. 
- - -Contents - -6.8.6.3 The break statement - - -Constraints - -1 - A break statement shall appear only in or as a switch body or loop body. - -Semantics - -2 - A break statement terminates execution of the smallest enclosing switch or iteration - statement. - - - - - -Contents - -6.8.6.4 The return statement - - -Constraints - -1 - A return statement with an expression shall not appear in a function whose return type - is void. A return statement without an expression shall only appear in a function - whose return type is void. - -Semantics - -2 - A return statement terminates execution of the current function and returns control to - its caller. A function may have any number of return statements. - -3 - If a return statement with an expression is executed, the value of the expression is - returned to the caller as the value of the function call expression. If the expression has a - type different from the return type of the function in which it appears, the value is - converted as if by assignment to an object having the return type of the function.160) - -4 - EXAMPLE In: - - - struct s { double i; } f(void); - union { - struct { - int f1; - struct s f2; - } u1; - struct { - struct s f3; - int f4; - } u2; - } g; - struct s f(void) - { - return g.u1.f2; - } - /* ... */ - g.u2.f3 = f(); - - - there is no undefined behavior, although there would be if the assignment were done directly (without using - a function call to fetch the value). - - - - - - -Footnotes - -160) The return statement is not an assignment. The overlap restriction of subclause 6.5.16.1 does not - apply to the case of function return. The representation of floating-point values may have wider range - or precision than implied by the type; a cast may be used to remove this extra range and precision. 
- - -Contents - -6.9 External definitions - - -Syntax - -1 - - - translation-unit: - external-declaration - translation-unit external-declaration - external-declaration: - function-definition - declaration - - -Constraints - -2 - The storage-class specifiers auto and register shall not appear in the declaration - specifiers in an external declaration. - -3 - There shall be no more than one external definition for each identifier declared with - internal linkage in a translation unit. Moreover, if an identifier declared with internal - linkage is used in an expression (other than as a part of the operand of a sizeof or - _Alignof operator whose result is an integer constant), there shall be exactly one - external definition for the identifier in the translation unit. - -Semantics - -4 - As discussed in 5.1.1.1, the unit of program text after preprocessing is a translation unit, - which consists of a sequence of external declarations. These are described as "external" - because they appear outside any function (and hence have file scope). As discussed in - 6.7, a declaration that also causes storage to be reserved for an object or a function named - by the identifier is a definition. - -5 - An external definition is an external declaration that is also a definition of a function - (other than an inline definition) or an object. If an identifier declared with external - linkage is used in an expression (other than as part of the operand of a sizeof or - _Alignof operator whose result is an integer constant), somewhere in the entire - program there shall be exactly one external definition for the identifier; otherwise, there - shall be no more than one.161) - - - - - - -Footnotes - -161) Thus, if an identifier declared with external linkage is not used in an expression, there need be no - external definition for it. 
- - -Contents - -6.9.1 Function definitions - - -Syntax - -1 - - - function-definition: - declaration-specifiers declarator declaration-list_opt compound-statement - declaration-list: - declaration - declaration-list declaration - - -Constraints - -2 - The identifier declared in a function definition (which is the name of the function) shall - have a function type, as specified by the declarator portion of the function definition.162) - -3 - The return type of a function shall be void or a complete object type other than array - type. - -4 - The storage-class specifier, if any, in the declaration specifiers shall be either extern or - static. - -5 - If the declarator includes a parameter type list, the declaration of each parameter shall - include an identifier, except for the special case of a parameter list consisting of a single - parameter of type void, in which case there shall not be an identifier. No declaration list - shall follow. - -6 - If the declarator includes an identifier list, each declaration in the declaration list shall - have at least one declarator, those declarators shall declare only identifiers from the - identifier list, and every identifier in the identifier list shall be declared. An identifier - declared as a typedef name shall not be redeclared as a parameter. The declarations in the - declaration list shall contain no storage-class specifier other than register and no - initializations. - - - - - -Semantics - -7 - The declarator in a function definition specifies the name of the function being defined - and the identifiers of its parameters. If the declarator includes a parameter type list, the - list also specifies the types of all the parameters; such a declarator also serves as a - function prototype for later calls to the same function in the same translation unit. If the - declarator includes an identifier list,163) the types of the parameters shall be declared in a - following declaration list. 
In either case, the type of each parameter is adjusted as - described in 6.7.6.3 for a parameter type list; the resulting type shall be a complete object - type. - -8 - If a function that accepts a variable number of arguments is defined without a parameter - type list that ends with the ellipsis notation, the behavior is undefined. - -9 - Each parameter has automatic storage duration; its identifier is an lvalue.164) The layout - of the storage for parameters is unspecified. - -10 - On entry to the function, the size expressions of each variably modified parameter are - evaluated and the value of each argument expression is converted to the type of the - corresponding parameter as if by assignment. (Array expressions and function - designators as arguments were converted to pointers before the call.) - -11 - After all parameters have been assigned, the compound statement that constitutes the - body of the function definition is executed. - -12 - If the } that terminates a function is reached, and the value of the function call is used by - the caller, the behavior is undefined. - -13 - EXAMPLE 1 In the following: - - - extern int max(int a, int b) - { - return a > b ? a : b; - } - - - extern is the storage-class specifier and int is the type specifier; max(int a, int b) is the - function declarator; and - - - { return a > b ? a : b; } - - - is the function body. The following similar definition uses the identifier-list form for the parameter - declarations: - - - - - - - extern int max(a, b) - int a, b; - { - return a > b ? a : b; - } - - - Here int a, b; is the declaration list for the parameters. The difference between these two definitions is - that the first form acts as a prototype declaration that forces conversion of the arguments of subsequent calls - to the function, whereas the second form does not. - - -14 - EXAMPLE 2 To pass one function to another, one might say - - - int f(void); - /* ... 
*/ - g(f); - - - Then the definition of g might read - - - void g(int (*funcp)(void)) - { - /* ... */ - (*funcp)(); /* or funcp(); ... */ - } - - - or, equivalently, - - - void g(int func(void)) - { - /* ... */ - func(); /* or (*func)(); ... */ - } - - - - - -Footnotes - -162) The intent is that the type category in a function definition cannot be inherited from a typedef: - - - typedef int F(void); // type F is ''function with no parameters - // returning int'' - F f, g; // f and g both have type compatible with F - F f { /* ... */ } // WRONG: syntax/constraint error - F g() { /* ... */ } // WRONG: declares that g returns a function - int f(void) { /* ... */ } // RIGHT: f has type compatible with F - int g() { /* ... */ } // RIGHT: g has type compatible with F - F *e(void) { /* ... */ } // e returns a pointer to a function - F *((e))(void) { /* ... */ } // same: parentheses irrelevant - int (*fp)(void); // fp points to a function that has type F - F *Fp; // Fp points to a function that has type F - - -163) See ''future language directions'' (6.11.7). - - -164) A parameter identifier cannot be redeclared in the function body except in an enclosed block. - - -Contents - -6.9.2 External object definitions - - -Semantics - -1 - If the declaration of an identifier for an object has file scope and an initializer, the - declaration is an external definition for the identifier. - -2 - A declaration of an identifier for an object that has file scope without an initializer, and - without a storage-class specifier or with the storage-class specifier static, constitutes a - tentative definition. If a translation unit contains one or more tentative definitions for an - identifier, and the translation unit contains no external definition for that identifier, then - the behavior is exactly as if the translation unit contains a file scope declaration of that - identifier, with the composite type as of the end of the translation unit, with an initializer - equal to 0. 
- -3 - If the declaration of an identifier for an object is a tentative definition and has internal - linkage, the declared type shall not be an incomplete type. - - -4 - EXAMPLE 1 - - - int i1 = 1; // definition, external linkage - static int i2 = 2; // definition, internal linkage - extern int i3 = 3; // definition, external linkage - int i4; // tentative definition, external linkage - static int i5; // tentative definition, internal linkage - int i1; // valid tentative definition, refers to previous - int i2; // 6.2.2 renders undefined, linkage disagreement - int i3; // valid tentative definition, refers to previous - int i4; // valid tentative definition, refers to previous - int i5; // 6.2.2 renders undefined, linkage disagreement - extern int i1; // refers to previous, whose linkage is external - extern int i2; // refers to previous, whose linkage is internal - extern int i3; // refers to previous, whose linkage is external - extern int i4; // refers to previous, whose linkage is external - extern int i5; // refers to previous, whose linkage is internal - - - - -5 - EXAMPLE 2 If at the end of the translation unit containing - - - int i[]; - - - the array i still has incomplete type, the implicit initializer causes it to have one element, which is set to - zero on program startup. 
- - -Contents - -6.10 Preprocessing directives - - -Syntax - -1 - - - preprocessing-file: - group_opt - group: - group-part - group group-part - group-part: - if-section - control-line - text-line - # non-directive - if-section: - if-group elif-groups_opt else-group_opt endif-line - if-group: - # if constant-expression new-line group_opt - # ifdef identifier new-line group_opt - # ifndef identifier new-line group_opt - elif-groups: - elif-group - elif-groups elif-group - elif-group: - # elif constant-expression new-line group_opt - else-group: - # else new-line group_opt - endif-line: - # endif new-line - control-line: - # include pp-tokens new-line - # define identifier replacement-list new-line - # define identifier lparen identifier-list_opt ) - replacement-list new-line - # define identifier lparen ... ) replacement-list new-line - # define identifier lparen identifier-list , ... ) - replacement-list new-line - # undef identifier new-line - # line pp-tokens new-line - # error pp-tokens_opt new-line - # pragma pp-tokens_opt new-line - # new-line - text-line: - pp-tokens_opt new-line - non-directive: - pp-tokens new-line - lparen: - a ( character not immediately preceded by white-space - replacement-list: - pp-tokens_opt - pp-tokens: - preprocessing-token - pp-tokens preprocessing-token - new-line: - the new-line character - - -Description - -2 - A preprocessing directive consists of a sequence of preprocessing tokens that satisfies the - following constraints: The first token in the sequence is a # preprocessing token that (at - the start of translation phase 4) is either the first character in the source file (optionally - after white space containing no new-line characters) or that follows white space - containing at least one new-line character. 
The last token in the sequence is the first new- - line character that follows the first token in the sequence.165) A new-line character ends - the preprocessing directive even if it occurs within what would otherwise be an - - - invocation of a function-like macro. - -3 - A text line shall not begin with a # preprocessing token. A non-directive shall not begin - with any of the directive names appearing in the syntax. - -4 - When in a group that is skipped (6.10.1), the directive syntax is relaxed to allow any - sequence of preprocessing tokens to occur between the directive name and the following - new-line character. - -Constraints - -5 - The only white-space characters that shall appear between preprocessing tokens within a - preprocessing directive (from just after the introducing # preprocessing token through - just before the terminating new-line character) are space and horizontal-tab (including - spaces that have replaced comments or possibly other white-space characters in - translation phase 3). - -Semantics - -6 - The implementation can process and skip sections of source files conditionally, include - other source files, and replace macros. These capabilities are called preprocessing, - because conceptually they occur before translation of the resulting translation unit. - -7 - The preprocessing tokens within a preprocessing directive are not subject to macro - expansion unless otherwise stated. - -8 - EXAMPLE In: - - - #define EMPTY - EMPTY # include <file.h> - - - the sequence of preprocessing tokens on the second line is not a preprocessing directive, because it does not - begin with a # at the start of translation phase 4, even though it will do so after the macro EMPTY has been - replaced. - - - -Footnotes - -165) Thus, preprocessing directives are commonly called ''lines''. 
These ''lines'' have no other syntactic - significance, as all white space is equivalent except in certain situations during preprocessing (see the - # character string literal creation operator in 6.10.3.2, for example). - - -Contents - -6.10.1 Conditional inclusion - - -Constraints - -1 - The expression that controls conditional inclusion shall be an integer constant expression - except that: identifiers (including those lexically identical to keywords) are interpreted as - described below;166) and it may contain unary operator expressions of the form - - - defined identifier - - - or - - - defined ( identifier ) - - - which evaluate to 1 if the identifier is currently defined as a macro name (that is, if it is - - - - predefined or if it has been the subject of a #define preprocessing directive without an - intervening #undef directive with the same subject identifier), 0 if it is not. - -2 - Each preprocessing token that remains (in the list of preprocessing tokens that will - become the controlling expression) after all macro replacements have occurred shall be in - the lexical form of a token (6.4). - -Semantics - -3 - Preprocessing directives of the forms - - - # if constant-expression new-line group_opt - # elif constant-expression new-line group_opt - - - check whether the controlling constant expression evaluates to nonzero. - -4 - Prior to evaluation, macro invocations in the list of preprocessing tokens that will become - the controlling constant expression are replaced (except for those macro names modified - by the defined unary operator), just as in normal text. If the token defined is - generated as a result of this replacement process or use of the defined unary operator - does not match one of the two specified forms prior to macro replacement, the behavior is - undefined. 
After all replacements due to macro expansion and the defined unary - operator have been performed, all remaining identifiers (including those lexically - identical to keywords) are replaced with the pp-number 0, and then each preprocessing - token is converted into a token. The resulting tokens compose the controlling constant - expression which is evaluated according to the rules of 6.6. For the purposes of this - token conversion and evaluation, all signed integer types and all unsigned integer types - act as if they have the same representation as, respectively, the types intmax_t and - uintmax_t defined in the header <stdint.h>.167) This includes interpreting - character constants, which may involve converting escape sequences into execution - character set members. Whether the numeric value for these character constants matches - the value obtained when an identical character constant occurs in an expression (other - than within a #if or #elif directive) is implementation-defined.168) Also, whether a - single-character character constant may have a negative value is implementation-defined. - - - - - - -5 - Preprocessing directives of the forms - - - # ifdef identifier new-line group_opt - # ifndef identifier new-line group_opt - - - check whether the identifier is or is not currently defined as a macro name. Their - conditions are equivalent to #if defined identifier and #if !defined identifier - respectively. - -6 - Each directive's condition is checked in order. If it evaluates to false (zero), the group - that it controls is skipped: directives are processed only through the name that determines - the directive in order to keep track of the level of nested conditionals; the rest of the - directives' preprocessing tokens are ignored, as are the other preprocessing tokens in the - group. Only the first group whose control condition evaluates to true (nonzero) is - processed. 
If none of the conditions evaluates to true, and there is a #else directive, the - group controlled by the #else is processed; lacking a #else directive, all the groups - until the #endif are skipped.169) - - Forward references: macro replacement (6.10.3), source file inclusion (6.10.2), largest - integer types (7.20.1.5). - - -Footnotes - -166) Because the controlling constant expression is evaluated during translation phase 4, all identifiers - either are or are not macro names -- there simply are no keywords, enumeration constants, etc. - - -167) Thus, on an implementation where INT_MAX is 0x7FFF and UINT_MAX is 0xFFFF, the constant - 0x8000 is signed and positive within a #if expression even though it would be unsigned in - translation phase 7. - - -168) Thus, the constant expression in the following #if directive and if statement is not guaranteed to - evaluate to the same value in these two contexts. - - - #if 'z' - 'a' == 25 - if ('z' - 'a' == 25) - - -169) As indicated by the syntax, a preprocessing token shall not follow a #else or #endif directive - before the terminating new-line character. However, comments may appear anywhere in a source file, - including within a preprocessing directive. - - -Contents - -6.10.2 Source file inclusion - - -Constraints - -1 - A #include directive shall identify a header or source file that can be processed by the - implementation. - -Semantics - -2 - A preprocessing directive of the form - - - # include <h-char-sequence> new-line - - - searches a sequence of implementation-defined places for a header identified uniquely by - the specified sequence between the < and > delimiters, and causes the replacement of that - directive by the entire contents of the header. How the places are specified or the header - identified is implementation-defined. 
- -3 - A preprocessing directive of the form - - - # include "q-char-sequence" new-line - - - causes the replacement of that directive by the entire contents of the source file identified - by the specified sequence between the " delimiters. The named source file is searched - - - - for in an implementation-defined manner. If this search is not supported, or if the search - fails, the directive is reprocessed as if it read - - - # include <h-char-sequence> new-line - - - with the identical contained sequence (including > characters, if any) from the original - directive. - -4 - A preprocessing directive of the form - - - # include pp-tokens new-line - - - (that does not match one of the two previous forms) is permitted. The preprocessing - tokens after include in the directive are processed just as in normal text. (Each - identifier currently defined as a macro name is replaced by its replacement list of - preprocessing tokens.) The directive resulting after all replacements shall match one of - the two previous forms.170) The method by which a sequence of preprocessing tokens - between a < and a > preprocessing token pair or a pair of " characters is combined into a - single header name preprocessing token is implementation-defined. - -5 - The implementation shall provide unique mappings for sequences consisting of one or - more nondigits or digits (6.4.2.1) followed by a period (.) and a single nondigit. The - first character shall not be a digit. The implementation may ignore distinctions of - alphabetical case and restrict the mapping to eight significant characters before the - period. - -6 - A #include preprocessing directive may appear in a source file that has been read - because of a #include directive in another file, up to an implementation-defined - nesting limit (see 5.2.4.1). 
- -7 - EXAMPLE 1 The most common uses of #include preprocessing directives are as in the following: - - - #include <stdio.h> - #include "myprog.h" - - - - - - - - -8 - EXAMPLE 2 This illustrates macro-replaced #include directives: - - - #if VERSION == 1 - #define INCFILE "vers1.h" - #elif VERSION == 2 - #define INCFILE "vers2.h" // and so on - #else - #define INCFILE "versN.h" - #endif - #include INCFILE - - - - - Forward references: macro replacement (6.10.3). - - -Footnotes - -170) Note that adjacent string literals are not concatenated into a single string literal (see the translation - phases in 5.1.1.2); thus, an expansion that results in two string literals is an invalid directive. - - -Contents - -6.10.3 Macro replacement - - -Constraints - -1 - Two replacement lists are identical if and only if the preprocessing tokens in both have - the same number, ordering, spelling, and white-space separation, where all white-space - separations are considered identical. - -2 - An identifier currently defined as an object-like macro shall not be redefined by another - #define preprocessing directive unless the second definition is an object-like macro - definition and the two replacement lists are identical. Likewise, an identifier currently - defined as a function-like macro shall not be redefined by another #define - preprocessing directive unless the second definition is a function-like macro definition - that has the same number and spelling of parameters, and the two replacement lists are - identical. - -3 - There shall be white-space between the identifier and the replacement list in the definition - of an object-like macro. - -4 - If the identifier-list in the macro definition does not end with an ellipsis, the number of - arguments (including those arguments consisting of no preprocessing tokens) in an - invocation of a function-like macro shall equal the number of parameters in the macro - definition. 
Otherwise, there shall be more arguments in the invocation than there are - parameters in the macro definition (excluding the ...). There shall exist a ) - preprocessing token that terminates the invocation. - -5 - The identifier __VA_ARGS__ shall occur only in the replacement-list of a function-like - macro that uses the ellipsis notation in the parameters. - -6 - A parameter identifier in a function-like macro shall be uniquely declared within its - scope. - -Semantics - -7 - The identifier immediately following the define is called the macro name. There is one - name space for macro names. Any white-space characters preceding or following the - replacement list of preprocessing tokens are not considered part of the replacement list - - for either form of macro. - -8 - If a # preprocessing token, followed by an identifier, occurs lexically at the point at which - a preprocessing directive could begin, the identifier is not subject to macro replacement. - -9 - A preprocessing directive of the form - - - # define identifier replacement-list new-line - - - defines an object-like macro that causes each subsequent instance of the macro name171) - to be replaced by the replacement list of preprocessing tokens that constitute the - remainder of the directive. The replacement list is then rescanned for more macro names - as specified below. - -10 - A preprocessing directive of the form - - - # define identifier lparen identifier-list_opt ) replacement-list new-line - # define identifier lparen ... ) replacement-list new-line - # define identifier lparen identifier-list , ... ) replacement-list new-line - - - defines a function-like macro with parameters, whose use is similar syntactically to a - function call. The parameters are specified by the optional list of identifiers, whose scope - extends from their declaration in the identifier list until the new-line character that - terminates the #define preprocessing directive. 
Each subsequent instance of the - function-like macro name followed by a ( as the next preprocessing token introduces the - sequence of preprocessing tokens that is replaced by the replacement list in the definition - (an invocation of the macro). The replaced sequence of preprocessing tokens is - terminated by the matching ) preprocessing token, skipping intervening matched pairs of - left and right parenthesis preprocessing tokens. Within the sequence of preprocessing - tokens making up an invocation of a function-like macro, new-line is considered a normal - white-space character. - -11 - The sequence of preprocessing tokens bounded by the outside-most matching parentheses - forms the list of arguments for the function-like macro. The individual arguments within - the list are separated by comma preprocessing tokens, but comma preprocessing tokens - between matching inner parentheses do not separate arguments. If there are sequences of - preprocessing tokens within the list of arguments that would otherwise act as - preprocessing directives,172) the behavior is undefined. - -12 - If there is a ... in the identifier-list in the macro definition, then the trailing arguments, - including any separating comma preprocessing tokens, are merged to form a single item: - - - - the variable arguments. The number of arguments so combined is such that, following - merger, the number of arguments is one more than the number of parameters in the macro - definition (excluding the ...). - - -Footnotes - -171) Since, by macro-replacement time, all character constants and string literals are preprocessing tokens, - not sequences possibly containing identifier-like subsequences (see 5.1.1.2, translation phases), they - are never scanned for macro names or parameters. - - -172) Despite the name, a non-directive is a preprocessing directive. 
- - -Contents - -6.10.3.1 Argument substitution - - -1 - After the arguments for the invocation of a function-like macro have been identified, - argument substitution takes place. A parameter in the replacement list, unless preceded - by a # or ## preprocessing token or followed by a ## preprocessing token (see below), is - replaced by the corresponding argument after all macros contained therein have been - expanded. Before being substituted, each argument's preprocessing tokens are - completely macro replaced as if they formed the rest of the preprocessing file; no other - preprocessing tokens are available. - -2 - An identifier __VA_ARGS__ that occurs in the replacement list shall be treated as if it - were a parameter, and the variable arguments shall form the preprocessing tokens used to - replace it. - - -Contents - -6.10.3.2 The # operator - - -Constraints - -1 - Each # preprocessing token in the replacement list for a function-like macro shall be - followed by a parameter as the next preprocessing token in the replacement list. - -Semantics - -2 - If, in the replacement list, a parameter is immediately preceded by a # preprocessing - token, both are replaced by a single character string literal preprocessing token that - contains the spelling of the preprocessing token sequence for the corresponding - argument. Each occurrence of white space between the argument's preprocessing tokens - becomes a single space character in the character string literal. White space before the - first preprocessing token and after the last preprocessing token composing the argument - is deleted. 
Otherwise, the original spelling of each preprocessing token in the argument - is retained in the character string literal, except for special handling for producing the - spelling of string literals and character constants: a \ character is inserted before each " - and \ character of a character constant or string literal (including the delimiting " - characters), except that it is implementation-defined whether a \ character is inserted - before the \ character beginning a universal character name. If the replacement that - results is not a valid character string literal, the behavior is undefined. The character - string literal corresponding to an empty argument is "". The order of evaluation of # and - ## operators is unspecified. - - -Contents - -6.10.3.3 The ## operator - - -Constraints - -1 - A ## preprocessing token shall not occur at the beginning or at the end of a replacement - list for either form of macro definition. - -Semantics - -2 - If, in the replacement list of a function-like macro, a parameter is immediately preceded - or followed by a ## preprocessing token, the parameter is replaced by the corresponding - argument's preprocessing token sequence; however, if an argument consists of no - preprocessing tokens, the parameter is replaced by a placemarker preprocessing token - instead.173) - -3 - For both object-like and function-like macro invocations, before the replacement list is - reexamined for more macro names to replace, each instance of a ## preprocessing token - in the replacement list (not from an argument) is deleted and the preceding preprocessing - token is concatenated with the following preprocessing token. Placemarker - preprocessing tokens are handled specially: concatenation of two placemarkers results in - a single placemarker preprocessing token, and concatenation of a placemarker with a - non-placemarker preprocessing token results in the non-placemarker preprocessing token. 
- If the result is not a valid preprocessing token, the behavior is undefined. The resulting - token is available for further macro replacement. The order of evaluation of ## operators - is unspecified. - -4 - EXAMPLE In the following fragment: - - - #define hash_hash # ## # - #define mkstr(a) # a - #define in_between(a) mkstr(a) - #define join(c, d) in_between(c hash_hash d) - char p[] = join(x, y); // equivalent to - // char p[] = "x ## y"; - - - The expansion produces, at various stages: - - - join(x, y) - in_between(x hash_hash y) - in_between(x ## y) - mkstr(x ## y) - "x ## y" - - - In other words, expanding hash_hash produces a new token, consisting of two adjacent sharp signs, but - this new token is not the ## operator. - - - - -Footnotes - -173) Placemarker preprocessing tokens do not appear in the syntax because they are temporary entities that - exist only within translation phase 4. - - -Contents - -6.10.3.4 Rescanning and further replacement - - -1 - After all parameters in the replacement list have been substituted and # and ## - processing has taken place, all placemarker preprocessing tokens are removed. The - resulting preprocessing token sequence is then rescanned, along with all subsequent - preprocessing tokens of the source file, for more macro names to replace. - -2 - If the name of the macro being replaced is found during this scan of the replacement list - (not including the rest of the source file's preprocessing tokens), it is not replaced. - Furthermore, if any nested replacements encounter the name of the macro being replaced, - it is not replaced. These nonreplaced macro name preprocessing tokens are no longer - available for further replacement even if they are later (re)examined in contexts in which - that macro name preprocessing token would otherwise have been replaced. 
- -3 - The resulting completely macro-replaced preprocessing token sequence is not processed - as a preprocessing directive even if it resembles one, but all pragma unary operator - expressions within it are then processed as specified in 6.10.9 below. - -4 - EXAMPLE There are cases where it is not clear whether a replacement is nested or not. For example, - given the following macro definitions: - - - #define f(a) a*g - #define g(a) f(a) - - - the invocation - - - f(2)(9) - - - may expand to either - - - 2*f(9) - - - or - - - 2*9*g - - - Strictly conforming programs are not permitted to depend on such unspecified behavior. - - - -Contents - -6.10.3.5 Scope of macro definitions - - -1 - A macro definition lasts (independent of block structure) until a corresponding #undef - directive is encountered or (if none is encountered) until the end of the preprocessing - translation unit. Macro definitions have no significance after translation phase 4. - -2 - A preprocessing directive of the form - - - # undef identifier new-line - - - causes the specified identifier no longer to be defined as a macro name. It is ignored if - the specified identifier is not currently defined as a macro name. - -3 - EXAMPLE 1 The simplest use of this facility is to define a ''manifest constant'', as in - - - #define TABSIZE 100 - int table[TABSIZE]; - - - - -4 - EXAMPLE 2 The following defines a function-like macro whose value is the maximum of its arguments. - It has the advantages of working for any compatible types of the arguments and of generating in-line code - without the overhead of function calling. It has the disadvantages of evaluating one or the other of its - arguments a second time (including side effects) and generating more code than a function if invoked - several times. It also cannot have its address taken, as it has none. - - - #define max(a, b) ((a) > (b) ? (a) : (b)) - - - The parentheses ensure that the arguments and the resulting expression are bound properly. 
- - -5 - EXAMPLE 3 To illustrate the rules for redefinition and reexamination, the sequence - - - #define x 3 - #define f(a) f(x * (a)) - #undef x - #define x 2 - #define g f - #define z z[0] - #define h g(~ - #define m(a) a(w) - #define w 0,1 - #define t(a) a - #define p() int - #define q(x) x - #define r(x,y) x ## y - #define str(x) # x - f(y+1) + f(f(z)) % t(t(g)(0) + t)(1); - g(x+(3,4)-w) | h 5) & m - (f)^m(m); - p() i[q()] = { q(1), r(2,3), r(4,), r(,5), r(,) }; - char c[2][6] = { str(hello), str() }; - - - results in - - - f(2 * (y+1)) + f(2 * (f(2 * (z[0])))) % f(2 * (0)) + t(1); - f(2 * (2+(3,4)-0,1)) | f(2 * (~ 5)) & f(2 * (0,1))^m(0,1); - int i[] = { 1, 23, 4, 5, }; - char c[2][6] = { "hello", "" }; - - - - -6 - EXAMPLE 4 To illustrate the rules for creating character string literals and concatenating tokens, the - sequence - - - #define str(s) # s - #define xstr(s) str(s) - #define debug(s, t) printf("x" # s "= %d, x" # t "= %s", \ - x ## s, x ## t) - #define INCFILE(n) vers ## n - #define glue(a, b) a ## b - #define xglue(a, b) glue(a, b) - #define HIGHLOW "hello" - #define LOW LOW ", world" - debug(1, 2); - fputs(str(strncmp("abc\0d", "abc", '\4') // this goes away - == 0) str(: @\n), s); - #include xstr(INCFILE(2).h) - glue(HIGH, LOW); - xglue(HIGH, LOW) - - - results in - - - printf("x" "1" "= %d, x" "2" "= %s", x1, x2); - fputs( - "strncmp(\"abc\\0d\", \"abc\", '\\4') == 0" ": @\n", - s); - #include "vers2.h" (after macro replacement, before file access) - "hello"; - "hello" ", world" - - - or, after concatenation of the character string literals, - - - printf("x1= %d, x2= %s", x1, x2); - fputs( - "strncmp(\"abc\\0d\", \"abc\", '\\4') == 0: @\n", - s); - #include "vers2.h" (after macro replacement, before file access) - "hello"; - "hello, world" - - - Space around the # and ## tokens in the macro definition is optional. 
- - -7 - EXAMPLE 5 To illustrate the rules for placemarker preprocessing tokens, the sequence - - - #define t(x,y,z) x ## y ## z - int j[] = { t(1,2,3), t(,4,5), t(6,,7), t(8,9,), - t(10,,), t(,11,), t(,,12), t(,,) }; - - - results in - - - int j[] = { 123, 45, 67, 89, - 10, 11, 12, }; - - - - -8 - EXAMPLE 6 To demonstrate the redefinition rules, the following sequence is valid. - - - #define OBJ_LIKE (1-1) - #define OBJ_LIKE /* white space */ (1-1) /* other */ - #define FUNC_LIKE(a) ( a ) - #define FUNC_LIKE( a )( /* note the white space */ \ - a /* other stuff on this line - */ ) - - - But the following redefinitions are invalid: - - - #define OBJ_LIKE (0) // different token sequence - #define OBJ_LIKE (1 - 1) // different white space - #define FUNC_LIKE(b) ( a ) // different parameter usage - #define FUNC_LIKE(b) ( b ) // different parameter spelling - - - - -9 - EXAMPLE 7 Finally, to show the variable argument list macro facilities: - - - #define debug(...) fprintf(stderr, __VA_ARGS__) - #define showlist(...) puts(#__VA_ARGS__) - #define report(test, ...) ((test)?puts(#test):\ - printf(__VA_ARGS__)) - debug("Flag"); - debug("X = %d\n", x); - showlist(The first, second, and third items.); - report(x>y, "x is %d but y is %d", x, y); - - - results in - - - fprintf(stderr, "Flag" ); - fprintf(stderr, "X = %d\n", x ); - puts( "The first, second, and third items." ); - ((x>y)?puts("x>y"): - printf("x is %d but y is %d", x, y)); - - - - - -Contents - -6.10.4 Line control - - -Constraints - -1 - The string literal of a #line directive, if present, shall be a character string literal. - -Semantics - -2 - The line number of the current source line is one greater than the number of new-line - characters read or introduced in translation phase 1 (5.1.1.2) while processing the source - file to the current token. 
- -3 - A preprocessing directive of the form - - - # line digit-sequence new-line - - - causes the implementation to behave as if the following sequence of source lines begins - with a source line that has a line number as specified by the digit sequence (interpreted as - a decimal integer). The digit sequence shall not specify zero, nor a number greater than - 2147483647. - -4 - A preprocessing directive of the form - - - # line digit-sequence "s-char-sequence_opt" new-line - - - sets the presumed line number similarly and changes the presumed name of the source - file to be the contents of the character string literal. - -5 - A preprocessing directive of the form - - - # line pp-tokens new-line - - - (that does not match one of the two previous forms) is permitted. The preprocessing - tokens after line on the directive are processed just as in normal text (each identifier - currently defined as a macro name is replaced by its replacement list of preprocessing - tokens). The directive resulting after all replacements shall match one of the two - previous forms and is then processed as appropriate. - - -Contents - -6.10.5 Error directive - - -Semantics - -1 - A preprocessing directive of the form - - - # error pp-tokens_opt new-line - - - causes the implementation to produce a diagnostic message that includes the specified - sequence of preprocessing tokens. - - -Contents - -6.10.6 Pragma directive - - -Semantics - -1 - A preprocessing directive of the form - - - # pragma pp-tokens_opt new-line - - - where the preprocessing token STDC does not immediately follow pragma in the - directive (prior to any macro replacement)174) causes the implementation to behave in an - implementation-defined manner. The behavior might cause translation to fail or cause the - translator or the resulting program to behave in a non-conforming manner. Any such - pragma that is not recognized by the implementation is ignored. 
- -2 - If the preprocessing token STDC does immediately follow pragma in the directive (prior - to any macro replacement), then no macro replacement is performed on the directive, and - the directive shall have one of the following forms175) whose meanings are described - elsewhere: - - - #pragma STDC FP_CONTRACT on-off-switch - #pragma STDC FENV_ACCESS on-off-switch - #pragma STDC CX_LIMITED_RANGE on-off-switch - on-off-switch: one of - ON OFF DEFAULT - - - Forward references: the FP_CONTRACT pragma (7.12.2), the FENV_ACCESS pragma - (7.6.1), the CX_LIMITED_RANGE pragma (7.3.4). - - - - - - -Footnotes - -174) An implementation is not required to perform macro replacement in pragmas, but it is permitted - except for in standard pragmas (where STDC immediately follows pragma). If the result of macro - replacement in a non-standard pragma has the same form as a standard pragma, the behavior is still - implementation-defined; an implementation is permitted to behave as if it were the standard pragma, - but is not required to. - - -175) See ''future language directions'' (6.11.8). - - -Contents - -6.10.7 Null directive - - -Semantics - -1 - A preprocessing directive of the form - - - # new-line - - - has no effect. - - -Contents - -6.10.8 Predefined macro names - - -1 - The values of the predefined macros listed in the following subclauses176) (except for - __FILE__ and __LINE__) remain constant throughout the translation unit. - -2 - None of these macro names, nor the identifier defined, shall be the subject of a - #define or a #undef preprocessing directive. Any other predefined macro names - shall begin with a leading underscore followed by an uppercase letter or a second - underscore. - -3 - The implementation shall not predefine the macro __cplusplus, nor shall it define it - in any standard header. - - Forward references: standard headers (7.1.2). - - -Footnotes - -176) See ''future language directions'' (6.11.9). 
- - -Contents - -6.10.8.1 Mandatory macros - - -1 - The following macro names shall be defined by the implementation: - - __DATE__ The date of translation of the preprocessing translation unit: a character - string literal of the form "Mmm dd yyyy", where the names of the - months are the same as those generated by the asctime function, and the - first character of dd is a space character if the value is less than 10. If the - date of translation is not available, an implementation-defined valid date - shall be supplied. - __FILE__ The presumed name of the current source file (a character string literal).177) - __LINE__ The presumed line number (within the current source file) of the current - source line (an integer constant).177) - __STDC__ The integer constant 1, intended to indicate a conforming implementation. - __STDC_HOSTED__ The integer constant 1 if the implementation is a hosted - implementation or the integer constant 0 if it is not. - - __STDC_VERSION__ The integer constant 201ymmL.178) - __TIME__ The time of translation of the preprocessing translation unit: a character - string literal of the form "hh:mm:ss" as in the time generated by the - asctime function. If the time of translation is not available, an - implementation-defined valid time shall be supplied. - - - Forward references: the asctime function (7.27.3.1). - - -Footnotes - -177) The presumed source file name and line number can be changed by the #line directive. - - -178) This macro was not specified in ISO/IEC 9899:1990 and was specified as 199409L in - ISO/IEC 9899/AMD1:1995 and as 199901L in ISO/IEC 9899:1999. The intention is that this will - remain an integer constant of type long int that is increased with each revision of this International - Standard. - - -Contents - -6.10.8.2 Environment macros - - -1 - The following macro names are conditionally defined by the implementation: - - __STDC_ISO_10646__ An integer constant of the form yyyymmL (for example, - 199712L). 
If this symbol is defined, then every character in the Unicode - required set, when stored in an object of type wchar_t, has the same - value as the short identifier of that character. The Unicode required set - consists of all the characters that are defined by ISO/IEC 10646, along with - all amendments and technical corrigenda, as of the specified year and - month. If some other encoding is used, the macro shall not be defined and - the actual encoding used is implementation-defined. - __STDC_MB_MIGHT_NEQ_WC__ The integer constant 1, intended to indicate that, in - the encoding for wchar_t, a member of the basic character set need not - have a code value equal to its value when used as the lone character in an - integer character constant. - __STDC_UTF_16__ The integer constant 1, intended to indicate that values of type - char16_t are UTF-16 encoded. If some other encoding is used, the - macro shall not be defined and the actual encoding used is implementation- - defined. - __STDC_UTF_32__ The integer constant 1, intended to indicate that values of type - char32_t are UTF-32 encoded. If some other encoding is used, the - macro shall not be defined and the actual encoding used is implementation- - defined. - - - Forward references: common definitions (7.19), unicode utilities (7.28). - - - - - - -Contents - -6.10.8.3 Conditional feature macros - - -1 - The following macro names are conditionally defined by the implementation: - - __STDC_ANALYZABLE__ The integer constant 1, intended to indicate conformance to - the specifications in annex L (Analyzability). - __STDC_IEC_559__ The integer constant 1, intended to indicate conformance to the - specifications in annex F (IEC 60559 floating-point arithmetic). - __STDC_IEC_559_COMPLEX__ The integer constant 1, intended to indicate - adherence to the specifications in annex G (IEC 60559 compatible complex - arithmetic). 
- __STDC_LIB_EXT1__ The integer constant 201ymmL, intended to indicate support - for the extensions defined in annex K (Bounds-checking interfaces).179) - __STDC_NO_ATOMICS__ The integer constant 1, intended to indicate that the - implementation does not support atomic types (including the _Atomic - type qualifier) and the <stdatomic.h> header. - __STDC_NO_COMPLEX__ The integer constant 1, intended to indicate that the - implementation does not support complex types or the <complex.h> - header. - __STDC_NO_THREADS__ The integer constant 1, intended to indicate that the - implementation does not support the <threads.h> header. - __STDC_NO_VLA__ The integer constant 1, intended to indicate that the - implementation does not support variable length arrays or variably - modified types. - - -2 - An implementation that defines __STDC_NO_COMPLEX__ shall not define - __STDC_IEC_559_COMPLEX__. - - - - - - -Footnotes - -179) The intention is that this will remain an integer constant of type long int that is increased with - each revision of this International Standard. - - -Contents - -6.10.9 Pragma operator - - -Semantics - -1 - A unary operator expression of the form: - - - _Pragma ( string-literal ) - - - is processed as follows: The string literal is destringized by deleting any encoding prefix, - deleting the leading and trailing double-quotes, replacing each escape sequence \" by a - double-quote, and replacing each escape sequence \\ by a single backslash. The - resulting sequence of characters is processed through translation phase 3 to produce - preprocessing tokens that are executed as if they were the pp-tokens in a pragma - directive. The original four preprocessing tokens in the unary operator expression are - removed. 
- -2 - EXAMPLE A directive of the form: - - - #pragma listing on "..\listing.dir" - - - can also be expressed as: - - - _Pragma ( "listing on \"..\\listing.dir\"" ) - - - The latter form is processed in the same way whether it appears literally as shown, or results from macro - replacement, as in: - - - #define LISTING(x) PRAGMA(listing on #x) - #define PRAGMA(x) _Pragma(#x) - LISTING ( ..\listing.dir ) - - -Contents - -6.11 Future language directions - - -Contents - -6.11.1 Floating types - - -1 - Future standardization may include additional floating-point types, including those with - greater range, precision, or both than long double. - - -Contents - -6.11.2 Linkages of identifiers - - -1 - Declaring an identifier with internal linkage at file scope without the static storage- - class specifier is an obsolescent feature. - - -Contents - -6.11.3 External names - - -1 - Restriction of the significance of an external name to fewer than 255 characters - (considering each universal character name or extended source character as a single - character) is an obsolescent feature that is a concession to existing implementations. - - -Contents - -6.11.4 Character escape sequences - - -1 - Lowercase letters as escape sequences are reserved for future standardization. Other - characters may be used in extensions. - - -Contents - -6.11.5 Storage-class specifiers - - -1 - The placement of a storage-class specifier other than at the beginning of the declaration - specifiers in a declaration is an obsolescent feature. - - -Contents - -6.11.6 Function declarators - - -1 - The use of function declarators with empty parentheses (not prototype-format parameter - type declarators) is an obsolescent feature. - - -Contents - -6.11.7 Function definitions - - -1 - The use of function definitions with separate parameter identifier and declaration lists - (not prototype-format parameter type and identifier declarators) is an obsolescent feature. 
- - -Contents - -6.11.8 Pragma directives - - -1 - Pragmas whose first preprocessing token is STDC are reserved for future standardization. - - -Contents - -6.11.9 Predefined macro names - - -1 - Macro names beginning with __STDC_ are reserved for future standardization. - - -Contents diff --git a/src/api/asm_emit.c b/src/api/asm_emit.c @@ -517,7 +517,7 @@ static CfreeStatus emit_zero_range(Writer* w, u32 size) { * branch to the wrong place or load from address 0 on re-assembly. Here we * consult the section's relocation table and rewrite the covered operand into * the relocation-operator syntax the assembler parses (the inverse of - * src/arch/aa64/asm.c's parse_reloc_mod). See doc/ASM_ROUNDTRIP_TESTING.md. + * src/arch/aa64/asm.c's parse_reloc_mod). See doc/TESTING.md. * * Operand text is rewritten in place rather than re-rendered from decoded * fields, so the register names the disassembler produced are preserved and diff --git a/src/asm/asm.c b/src/asm/asm.c @@ -1204,7 +1204,7 @@ static void do_directive(AsmDriver* d, Sym name) { * assembly time: compute the displacement, patch the instruction, and emit no * relocation. Matching that is what makes `cc -S | as` reproduce `cc -c`'s * .text relocation table for control-flow-bearing code (the L1 round-trip - * lane; see doc/ASM_ROUNDTRIP_TESTING.md). + * lane; see doc/TESTING.md). * * We relax only PC-relative *branch* relocations (never CALL26 — a call keeps * its relocation on both sides) whose target is a symbol defined in the same diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h @@ -140,7 +140,7 @@ struct NativeDirectTarget { * names the semantic local currently cached in that physical register, or * CG_LOCAL_NONE. scratch_used doubles as the per-class "pinned for the current * instruction" mask. Per-local cache state (reg/cls/dirty) lives on - * NativeDirectLocal. See doc/CGTARGET.md "local register cache". */ + * NativeDirectLocal. See doc/CODEGEN.md "local register cache". 
*/ CGLocal reg_owner[3][32]; u32 use_tick; /* monotonic counter stamped onto NativeDirectLocal.last_use */ /* Head/tail of the intrusive cached-locals list (in caching order), -1 when diff --git a/src/obj/coff/link.c b/src/obj/coff/link.c @@ -1,7 +1,7 @@ /* link_emit_coff: write a PE32+ MH_EXECUTABLE-style image to the * caller-provided Writer. * - * Phase 3.1 deliverable per doc/WINDOWS.md: skeleton + base-reloc + * Phase 3.1 deliverable per doc/OBJ.md: skeleton + base-reloc * handling for the four standard PE sections. Import-table synthesis * (.idata / IAT) lands in Phase 3.2; per-arch IAT stub bytes in 3.3; * TLS directory in 3.5; debug directory in 3.6 — those code paths @@ -55,7 +55,7 @@ /* ---- .idata layout constants ---- * - * Per doc/WINDOWS.md §3.2: the .idata content is a concatenation of an + * Per doc/OBJ.md: the .idata content is a concatenation of an * IMAGE_IMPORT_DESCRIPTOR table (NULL-terminated), one ILT per DLL * (each NULL-terminated u64 array), one IAT per DLL (same shape), * a hint/name table, and a DLL-name string pool. Each block is @@ -381,7 +381,7 @@ static void coff_define_tls_used(LinkImage* img, /* ---- import-table synthesis (Phase 3.2) --------------------------- * - * Per doc/WINDOWS.md §3.2: every LinkSymbol with `imported = 1` gets + * Per doc/OBJ.md: every LinkSymbol with `imported = 1` gets * routed through an IAT slot synthesized in `.idata`. Function * imports additionally receive a small per-arch stub in `.text` * (`ff 25 disp32` on x64 / `adrp;ldr;br` on aa64) so a direct CALL26 diff --git a/src/obj/elf/read.c b/src/obj/elf/read.c @@ -8,7 +8,7 @@ * ET_EXEC / ET_DYN additionally attach the linked-image view via * read_elf_image (program-header segments, .dynamic dependencies, * .dynsym dynamic symbols, and allocatable dynamic relocations) — see - * doc/IMAGE_INSPECT.md. Their section tables still parse through the same + * doc/OBJ.md. Their section tables still parse through the same * passes. 
The standalone read_elf_dso (below) remains the linker's * exports-only DSO-input path. * @@ -453,7 +453,7 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data, /* ET_REL parses to the section/symbol/reloc view only. ET_EXEC/ET_DYN * additionally get the linked-image view (read_elf_image, below); their * section tables still parse through the same passes. ET_CORE and other - * types are out of scope (see doc/IMAGE_INSPECT.md). */ + * types are out of scope (see doc/plan/IMAGE_INSPECT.md). */ if (e_type != ET_REL && e_type != ET_EXEC && e_type != ET_DYN) compiler_panic(c, no_loc(), "read_elf: unsupported e_type=%u (expected ET_REL, " diff --git a/src/obj/obj_secnames.c b/src/obj/obj_secnames.c @@ -79,7 +79,7 @@ Sym obj_secname_init_array(Compiler* c) { return pool_intern_slice(c->global, SLICE_LIT("__DATA,__mod_init_func")); case CFREE_OBJ_COFF: /* CRT runtime scans `.CRT$X[A-Z]` for ctor/dtor tables; XCU is - * the user-constructor bucket. See doc/WINDOWS.md §1.6. */ + * the user-constructor bucket. See doc/OBJ.md. */ return pool_intern_slice(c->global, SLICE_LIT(".CRT$XCU")); default: return secname_panic_unimpl(c, ".init_array"); @@ -94,7 +94,7 @@ Sym obj_secname_fini_array(Compiler* c) { return pool_intern_slice(c->global, SLICE_LIT("__DATA,__mod_term_func")); case CFREE_OBJ_COFF: /* `.CRT$XPA`/`XPZ` are markers; XPU is the user-destructor - * bucket. See doc/WINDOWS.md §1.6. */ + * bucket. See doc/OBJ.md. */ return pool_intern_slice(c->global, SLICE_LIT(".CRT$XPU")); default: return secname_panic_unimpl(c, ".fini_array"); @@ -114,7 +114,7 @@ Sym obj_secname_preinit_array(Compiler* c) { return secname_panic_unimpl(c, ".preinit_array"); case CFREE_OBJ_COFF: /* CRT's own setup runs in `.CRT$XI*`; user pre-init lives at - * XIA just after the CRT. See doc/WINDOWS.md §1.6. */ + * XIA just after the CRT. See doc/OBJ.md. 
*/ return pool_intern_slice(c->global, SLICE_LIT(".CRT$XIA")); default: return secname_panic_unimpl(c, ".preinit_array"); @@ -129,7 +129,7 @@ Sym obj_secname_tdata(Compiler* c) { return pool_intern_slice(c->global, SLICE_LIT("__DATA,__thread_data")); case CFREE_OBJ_COFF: /* MSVC `.tls$` convention; linker concatenates `.tls$*` sorted - * by suffix. See doc/WINDOWS.md §1.6. */ + * by suffix. See doc/OBJ.md. */ return pool_intern_slice(c->global, SLICE_LIT(".tls$")); case CFREE_OBJ_WASM: /* Wasm has no thread-local storage model: a module instance owns a @@ -150,7 +150,7 @@ Sym obj_secname_tbss(Compiler* c) { return pool_intern_slice(c->global, SLICE_LIT("__DATA,__thread_bss")); case CFREE_OBJ_COFF: /* sorted-alphabetically-last so it falls at the tail of the TLS - * image's zero-fill region. See doc/WINDOWS.md §1.6. */ + * image's zero-fill region. See doc/OBJ.md. */ return pool_intern_slice(c->global, SLICE_LIT(".tls$ZZZ")); case CFREE_OBJ_WASM: /* See obj_secname_tdata: wasm thread-locals are ordinary @@ -245,7 +245,7 @@ const char* obj_format_default_entry_name(const Compiler* c) { if (c && c->target.obj == CFREE_OBJ_MACHO) return "_main"; /* COFF: PE/Windows CRT entry sets up argc/argv and calls main. * Resolved against the user-supplied CRT archive (mingw's - * libmingwex.a). See doc/WINDOWS.md §1.6. */ + * libmingwex.a). See doc/OBJ.md. */ if (c && c->target.obj == CFREE_OBJ_COFF) return "mainCRTStartup"; return "_start"; } diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -519,7 +519,7 @@ static void apply_abi_aliasing_hints(Func* f) { * hard_loc_bits + k -> stack slot index k (k < stack_slot_count) * * Live-range splitting (`get_hard_reg_with_split`, `lr_gap_t`, `split()`) - * is deferred per doc/OPT_PERF.md plan. + * is deferred per doc/plan/OPTIMIZER.md. 
* ------------------------------------------------------------------------- */ typedef struct OptAllocator OptAllocator; @@ -989,7 +989,7 @@ static int alloc_group_conflicts_bit(const OptAllocator* a, u32 bit) { static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges, OptAllocator* a, int allow_live_range_split) { (void)allow_live_range_split; /* live-range splitting deferred per - doc/OPT_PERF.md plan; the parameter is + doc/plan/OPTIMIZER.md; the parameter is passed through for ABI compatibility. */ memset(a, 0, sizeof *a); a->point_count = ranges->point_count ? ranges->point_count : 1u;