commit 2f677dcf2322c6276eb432fbadbfe5d3a8fdbad1
parent d37d6b3f8c61a24a3b2f259c22817b69525bf27d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sun, 24 May 2026 23:12:56 -0700
opt pipeline redesign
Diffstat:
27 files changed, 3090 insertions(+), 629 deletions(-)
diff --git a/Makefile b/Makefile
@@ -81,7 +81,7 @@ LIB_OBJS = $(patsubst src/%.c,$(BUILD_DIR)/lib/%.o,$(LIB_SRCS)) \
$(LANG_OBJS) \
$(patsubst src/%.S,$(BUILD_DIR)/lib/%.o,$(LIB_ASMS))
LIB_DEPS = $(LIB_OBJS:.o=.d)
-LIB_AR_STAGING = $(BUILD_DIR)/ar/libcfree
+LIB_AR_STAGING = $(BUILD_DIR)/.ar-staging/libcfree
DRIVER_SRCS = $(wildcard driver/*.c)
ifneq ($(CFREE_LANG_CPP_ENABLED),1)
diff --git a/doc/EMU.md b/doc/EMU.md
@@ -1,355 +1,758 @@
# cfree emu design
-Architecture of `cfree emu`, the guest-ISA emulator. Companion to
-`DESIGN.md`. Scope: how the emulator slots into the existing pipeline and
-what its contracts are. Not a tutorial; not implementation notes.
-
-## 1. Goals
-
-- `emu` multi-call subcommand: load and execute a guest ELF on the host,
- user-mode only (Linux/macOS userland; no full-system emulation).
-- Targets v1: aarch64, riscv64. 32-bit variants follow each 64-bit lift.
- x86_64 deferred (flag-heavy ISA — exercise lazy flags on simpler arches
- first).
-- Per-basic-block JIT translation: decode guest bytes → lift to `CG` → opt
- (optional) → MCEmitter → ObjBuilder → incremental link into a single
- growing `LinkImage` → execute.
-- Block chaining for hot paths; cold blocks may run direct-from-CG (no opt
- wrapper) for translation throughput.
-- Source-level stepping through `dbg` when the guest ELF carries DWARF.
-- Self-hosting: `src/emu/` is C11 freestanding like the rest of `src/`.
-
-The lifter is a sibling to `parse_c`: both are frontends that consume input
-bytes and drive `CG`. Everything below `CG` (`opt`, `arch`, `obj`) is
-unchanged. `link/` requires one extension — incremental resolve
-(`link_resolve_extend`, §6) — landed *before* emu work begins, so emu has a
-single lifecycle model end-to-end and never carries a "per-block fresh
-`LinkImage`" interim shape.
-
-## 2. Non-goals (v1)
-
-- Full-system emulation (privileged ISA, MMU, devices).
-- SIMD/vector ISA extensions (SSE/AVX/NEON/RVV) — blocked on `CGTarget`
- lacking vector ops (§DESIGN 5.7). Programs using them either trap to a
- scalarizer or fail to lift.
-- x86 in v1.
-- Self-modifying code (refuse to lift on observed write to a translated
- page; full support is future work).
-- Per-instruction precise exceptions / signal redirection.
-- Foreign-OS syscalls — only the host OS's syscalls are forwarded.
-
-## 3. Layout
+This document describes the target design for `cfree emu`: a user-mode
+guest executable runner built out of the same registries and pipeline
+boundaries used by the rest of cfree.
+
+This is a design document, not a description of the current prototype.
+The prototype can be discarded or migrated behind these interfaces.
+
+## Goals
+
+- Run guest user-mode executables in-process.
+- Keep all format, architecture, language, and OS knowledge behind the
+ registry/vtable system described in `doc/REGISTRY.md`.
+- Treat the guest ISA lifter as a frontend: decode guest bytes, emit CG,
+ then reuse the existing opt, backend, object, link, and JIT pipeline.
+- Support a non-executable-memory execution mode by interpreting lifted IR.
+- Keep executable loading separate from object building. Loading maps a
+ guest process image; lifting/codegen produces new host objects.
+
+## Non-goals
+
+- Full-system emulation: no privileged ISA, devices, kernel, or MMU.
+- Built-in host policy for outside effects. Syscalls, dynamic imports,
+ filesystem access, clocks, signals, and other outside interactions are
+ delegated to emulator/embedder-provided bindings.
+- Self-modifying code in v1. The first implementation may trap or refuse
+ writes to translated pages.
+- Source-language frontend involvement. `emu` starts from a binary image;
+ language registries are relevant only because they share the same
+ registry discipline.
+
+## Top-level Shape
+
+`emu` owns process-level orchestration. It should not own ELF, Mach-O,
+PE/COFF, RISC-V, AArch64, or Linux semantics directly.
+
+The expected flow is:
+
+```text
+guest executable bytes
+ -> obj-format executable loader
+ -> EmuProcess { EmuImage, EmuAddrSpace, initial CPU state }
+ -> arch decoder: bytes at guest PC -> CfreeDecodedInsn[]
+ -> arch lifter: CfreeDecodedInsn[] -> CG function
+ -> optional opt
+ -> either IR interpreter or backend/JIT
+ -> dispatch loop and runtime helpers
+```
+
+The important type split is:
+
+- `ObjFile` / object readers: inspect binary formats.
+- `EmuImage`: loaded executable image and metadata.
+- `EmuProcess`: address space, threads, CPU state, OS ABI state.
+- `ObjBuilder`: generated host object from lifted guest code.
+- `LinkImage` / `CfreeJit`: resolved/generated host code.
+
+Executable loading does not produce an `ObjBuilder`. `ObjBuilder` appears
+only after the lifter emits CG for a translated guest block.
+
+## Registry Integration
+
+The registry direction in `doc/REGISTRY.md` should extend naturally to
+`emu`.
+
+### Object Format Vtable
+
+Object/image format implementations own executable file parsing and load
+commands/program headers. `emu` asks the selected object format to load a
+process image.
+Implementation location:
+
+- ELF loader lives with the ELF object/image code.
+- Mach-O loader lives with Mach-O object/image code.
+- PE/COFF loader lives with PE/COFF object/image code.
+
+Sketch:
+
+```c
+typedef struct EmuLoadOptions {
+ CfreeSlice name;
+ CfreeSlice bytes;
+ const char *const *argv;
+ const char *const *envp;
+ const CfreeOsImpl *os;
+} EmuLoadOptions;
+
+typedef struct EmuLoadedImage {
+ CfreeTarget guest_target;
+ uint64_t entry_pc;
+ uint64_t phdr_vaddr;
+ uint64_t base_vaddr;
+ EmuAddrSpace *addr_space;
+ EmuImageSymbols symbols;
+ EmuImageDebug debug;
+} EmuLoadedImage;
+
+typedef struct ObjFormatEmuOps {
+ CfreeStatus (*detect_executable)(CfreeCompiler *, CfreeSlice bytes,
+ CfreeTarget *target_out);
+ CfreeStatus (*load_executable)(CfreeCompiler *, const EmuLoadOptions *,
+ EmuLoadedImage *out);
+} ObjFormatEmuOps;
+
+typedef struct ObjFormatImpl {
+ /* Existing object-format operations from doc/REGISTRY.md. */
+ CfreeObjFmt kind;
+ const char *name;
+ /* ... object read/emit, DSO read, link-image emit ... */
+
+ const ObjFormatEmuOps *emu;
+} ObjFormatImpl;
```
-src/
- emu/
- emu.h driver-facing API
- decode/ per-ISA structured decoder (shared tables with objdump)
- aarch64.c
- riscv64.c
- lift/ per-ISA lifter; drives CG
- aarch64.c
- riscv64.c
- cpu.h per-arch CPUState struct synthesis
- runtime.c dispatcher, code cache, block chaining
- syscall/ per-host-OS syscall forwarders (linux.c, darwin.c)
-test/
- emu/ guest binary corpus + behavioral oracles
+
+The loader maps segments into an `EmuAddrSpace` and records metadata.
+It does not construct source-like objects and it does not lower code.
+
+For ELF, loading is program-header driven. Sections may be used for
+symbols and debug information, but PT_LOAD, PT_INTERP, TLS, dynamic
+tables, and auxv data are the executable-loading contract.
+
+### OS Vtable
+
+`emu` needs OS-specific user ABI behavior that does not belong in an
+architecture or object format module.
+
+Implementation location:
+
+- Linux behavior lives with the Linux OS implementation.
+- Darwin behavior lives with the Darwin OS implementation.
+- Windows behavior lives with the Windows OS implementation.
+
+Sketch:
+
+```c
+typedef struct CfreeOsImpl {
+ CfreeOSKind kind;
+ const char *name;
+
+ const ABIVtable *(*abi_for_arch)(CfreeCompiler *, CfreeArchKind);
+
+ CfreeStatus (*emu_init_process)(CfreeCompiler *, EmuProcess *,
+ const EmuLoadOptions *,
+ const EmuLoadedImage *);
+ CfreeStatus (*emu_init_thread)(CfreeCompiler *, EmuProcess *,
+ EmuThread *);
+ CfreeStatus (*emu_decode_syscall)(EmuProcess *, EmuThread *,
+ EmuSyscallRequest *out);
+ CfreeStatus (*emu_encode_syscall_result)(EmuProcess *, EmuThread *,
+ const EmuSyscallResult *);
+ CfreeStatus (*emu_deliver_signal)(EmuProcess *, EmuThread *,
+ const EmuSignalEvent *);
+} CfreeOsImpl;
+
+const CfreeOsImpl *os_lookup(CfreeOSKind);
+```
+
+The OS vtable owns:
+
+- initial stack shape: argv, envp, auxv, alignment;
+- syscall number table and argument/result ABI decoding;
+- errno, signal-frame, and signal-delivery ABI conventions;
+- TLS and dynamic-loader process conventions;
+- OS-specific pages such as vdso-like regions, if supported.
+
+It does not issue host syscalls, open files, read clocks, resolve host
+symbols, or deliver host signals directly. It translates between guest OS
+ABI state and emulator-level requests. The emulator/embedder bindings
+decide what those requests mean.
+
+This also makes the ABI axis in `doc/REGISTRY.md` cleaner: ABI selection
+can remain derived, but OS behavior has an explicit registry home.
+
+### Outside Interaction Bindings
+
+All interaction outside the emulated process is delegated to bindings
+provided by the embedding application or by the `cfree emu` driver. This
+keeps libcfree policy-free: the library describes requests, the embedder
+chooses whether to service, virtualize, deny, record, or replay them.
+
+Sketch:
+
+```c
+typedef struct EmuSyscallRequest {
+ uint64_t number;
+ uint64_t args[6];
+} EmuSyscallRequest;
+
+typedef struct EmuSyscallResult {
+ int64_t result;
+ int32_t guest_errno;
+ uint32_t flags;
+} EmuSyscallResult;
+
+typedef struct EmuImportRequest {
+ CfreeSlice object_name;
+ CfreeSlice symbol_name;
+ uint32_t bind_flags;
+} EmuImportRequest;
+
+typedef struct EmuExternalBindings {
+ CfreeStatus (*syscall)(void *user, EmuProcess *, EmuThread *,
+ const EmuSyscallRequest *,
+ EmuSyscallResult *out);
+ CfreeStatus (*resolve_import)(void *user, EmuProcess *,
+ const EmuImportRequest *,
+ EmuResolvedImport *out);
+ CfreeStatus (*signal)(void *user, EmuProcess *, EmuThread *,
+ const EmuSignalEvent *);
+ CfreeStatus (*trace)(void *user, EmuProcess *, const EmuTraceEvent *);
+ void *user;
+} EmuExternalBindings;
```
-`src/emu/` is a sibling to `src/parse/`. The runtime helpers live in the
-cfree tool itself — the JIT's `LinkExternResolver` (§DESIGN 5.5.1) returns
-the host address of `emu_load64`, `emu_syscall`, etc. directly; no
-separate runtime object.
+The OS vtable is still required for syscalls because it knows how to read
+and write guest registers, errno, restart behavior, and signal frames.
+But after decoding a syscall into `EmuSyscallRequest`, it calls the
+binding rather than a host syscall API.
-## 4. Dataflow
+Dynamic linking follows the same rule. The object-format loader and OS
+dynamic-linker support parse imports, relocations, PLT/GOT conventions,
+and loader metadata; binding `resolve_import` decides what a requested
+external symbol resolves to. A binding may point to a native host
+function, an emulated thunk, a synthetic guest address, or a denial
+stub. The core emulator should not hard-code libc, filesystem, network,
+or host process behavior.
+### Architecture Vtable
+
+Architecture implementations own guest instruction knowledge. `emu`
+should not have a private decoder per ISA.
+
+Implementation location:
+
+- `src/arch/rv64/` owns RV64 decode, formatting, lifter hooks, and
+ CPU-state schema.
+- `src/arch/aa64/` owns AArch64 equivalents.
+- `src/arch/x64/` can add guest support later without changing `emu`.
+
+Sketch:
+
+```c
+typedef enum CfreeDecodeFlag {
+ CFREE_DECODE_TERMINATOR = 1u << 0,
+ CFREE_DECODE_BRANCH = 1u << 1,
+ CFREE_DECODE_CALL = 1u << 2,
+ CFREE_DECODE_RET = 1u << 3,
+ CFREE_DECODE_MEMORY = 1u << 4,
+ CFREE_DECODE_TRAP = 1u << 5,
+} CfreeDecodeFlag;
+
+typedef enum CfreeDecodedOperandKind {
+ CFREE_DECOP_NONE,
+ CFREE_DECOP_REG,
+ CFREE_DECOP_IMM,
+ CFREE_DECOP_MEM,
+ CFREE_DECOP_PCREL,
+ CFREE_DECOP_SYSREG,
+} CfreeDecodedOperandKind;
+
+typedef struct CfreeDecodedOperand {
+ uint8_t kind;
+ uint8_t width_bits;
+ uint16_t flags;
+ uint32_t reg;
+ uint32_t index_reg;
+ int64_t imm;
+ uint8_t scale;
+} CfreeDecodedOperand;
+
+typedef struct CfreeDecodedInsn {
+ uint64_t pc;
+ const uint8_t *bytes;
+ uint8_t nbytes;
+ uint8_t noperands;
+ uint16_t flags;
+ uint32_t opcode; /* Arch-owned stable opcode enum. */
+ uint32_t encoding_id; /* Optional row/table id for formatting. */
+ CfreeDecodedOperand operands[CFREE_DECODE_MAX_OPERANDS];
+ uint64_t arch[2]; /* Small arch-private payload. */
+} CfreeDecodedInsn;
+
+typedef struct ArchInsnFormatter ArchInsnFormatter;
+
+typedef struct ArchDecodeOps {
+ uint8_t min_insn_len;
+ uint8_t max_insn_len;
+
+ CfreeStatus (*decode_one)(CfreeCompiler *, const uint8_t *bytes,
+ size_t len, uint64_t pc,
+ CfreeDecodedInsn *out);
+ CfreeStatus (*decode_block)(CfreeCompiler *, const uint8_t *bytes,
+ size_t len, uint64_t pc,
+ CfreeDecodedInsn *out, uint32_t cap,
+ uint32_t *n_out);
+
+ ArchInsnFormatter *(*formatter_new)(CfreeCompiler *);
+ CfreeStatus (*format)(ArchInsnFormatter *, const CfreeDecodedInsn *,
+ CfreeInsn *out);
+ void (*formatter_free)(ArchInsnFormatter *);
+} ArchDecodeOps;
+
+typedef struct ArchEmuOps {
+ CfreeStatus (*cpu_layout)(CfreeCompiler *, EmuCpuLayout *out);
+ CfreeStatus (*init_cpu)(CfreeCompiler *, EmuThread *,
+ const EmuLoadedImage *);
+ CfreeStatus (*lift_block)(CfreeCompiler *, CfreeCg *,
+ const CfreeDecodedInsn *, uint32_t n,
+ const EmuLiftContext *);
+} ArchEmuOps;
+
+typedef struct ArchImpl {
+ /* Existing arch registry fields. */
+ /* ... */
+
+ const ArchDecodeOps *decode;
+ const ArchEmuOps *emu;
+} ArchImpl;
```
-guest.elf (bytes) ─► obj reader ─► LinkImage* (mapped into guest AS)
- │
- guest_pc ─► Decoder ─► EmuInst* ─► Lifter ─► CG ─┐
- │
- CGTarget ◄─────┘
- │
- (opt?)
- │
- MCEmitter ─► ObjBuilder
- │
- link_jit ─► host code
- │
- dispatcher(guest_pc) ──┘
+
+`ArchDecodeOps` is a general arch service. It is useful to `objdump`,
+`dbg`, and `emu`. `ArchEmuOps` is optional; an architecture can support
+disassembly and native codegen without supporting guest emulation.
+
+### Language Registry
+
+Language frontends are not on the `emu` hot path. The only design rule is
+symmetry: `emu` should follow the same registry discipline as languages,
+not add parallel hard-coded switches.
+
+## Decoder And Disassembler Split
+
+The current public disassembler shape is text-oriented:
+
+```c
+typedef struct CfreeInsn {
+ uint64_t vaddr;
+ const uint8_t *bytes;
+ uint32_t nbytes;
+ CfreeSlice mnemonic;
+ CfreeSlice operands;
+ CfreeSlice annotation;
+} CfreeInsn;
```
-1. **Load.** `obj/` readers parse the guest ELF. The runtime maps loadable
- segments into a *guest address space* (an mmap'd region inside the
- host process); guest virtual addresses are the addresses inside that
- region.
-2. **Decode.** When the dispatcher hits an untranslated `guest_pc`, the
- per-ISA decoder reads guest bytes and produces `EmuInst`s up to the
- next basic-block terminator. Decode tables are shared with the
- disassembler (objdump): same bit patterns, two output shapes.
-3. **Lift.** The per-ISA lifter walks the `EmuInst` stream and emits one
- synthesized C function per guest basic block: signature
- `next_pc_t block(CPUState*)`. Lifter calls `cg_*` ops only.
-4. **Codegen + JIT.** Standard cfree pipeline. At -O0 the emu drives a
- target `CGTarget` directly (fast translation, slow execution); at -O2
- it wraps with `opt_cgtarget` (slow translation, fast execution). Both
- end at `link_jit` mapping executable pages.
-5. **Execute.** `runtime.c` calls the host code with the current
- `CPUState*`. The block returns the next guest PC. The dispatcher looks
- up the next block, translating on miss.
-
-## 5. Key interfaces
-
-### 5.1 `Decoder` (`src/emu/decode/decode.h`)
-
-Per-ISA structured decoder. Output is `EmuInst` — a tagged union of
-ISA-specific shapes with operand fields, *not* text.
+The current internal arch hook also decodes directly into that text
+record:
```c
-typedef struct EmuInst {
- EmuOp op; /* per-ISA enum */
- u64 guest_pc;
- u32 guest_bytes; /* instruction width */
- EmuOperand operands[EMU_MAX_OPERANDS];
- u32 nop;
- u32 flags; /* TERMINATOR | MEM | SETS_FLAGS | ... */
-} EmuInst;
-
-u32 emu_decode_block(EmuArch, const u8* bytes, u64 guest_pc,
- EmuInst* out, u32 max);
+struct ArchDisasm {
+ u32 (*decode)(ArchDisasm *, const u8 *bytes, size_t len, u64 vaddr,
+ CfreeInsn *out);
+ void (*destroy)(ArchDisasm *);
+};
```
-The same decode tables back the disassembler (textual format) and the
-lifter (structured). One source of truth per ISA.
+That is sufficient for `objdump`, but it is the wrong interface for
+`emu`. A lifter needs operand identity, widths, immediates, memory
+addressing, flags behavior, and terminator classification. It should
+never parse disassembly text.
+
+Target shape:
-### 5.2 `Lifter` (`src/emu/lift/lift.h`)
+```text
+bytes -> ArchDecodeOps.decode_one -> CfreeDecodedInsn
+ -> ArchDecodeOps.format -> CfreeInsn text
+ -> ArchEmuOps.lift_block -> CG
+ -> ArchDbgOps -> breakpoint/displaced stepping helpers
+```
-Per-ISA lifter. Consumes `EmuInst*`, drives `CG*`, produces one CG function
-per guest basic block.
+The disassembler iterator remains public and can continue returning
+`CfreeInsn`. Internally, it becomes:
+
+```text
+decode_one(..., &decoded)
+format(formatter, &decoded, &public_text_insn)
+apply object annotations
+```
+
+This keeps the public disassembler API stable while moving the reusable
+decode contract into `arch/`.
+
+The formatter object exists because `CfreeInsn` contains string slices.
+As today, those slices are valid until the next formatter call or until
+the formatter is freed.
+
+### Operand Policy
+
+The decoded instruction type should be format-neutral but not force all
+architectures into an artificial common ISA. The contract is:
+
+- `opcode` is arch-owned. RV64, AArch64, and x64 each define their own
+ stable opcode enums.
+- `operands[]` uses common storage for common shapes: registers,
+ immediates, PC-relative targets, memory references, and system
+ registers.
+- `arch[]` carries small arch-private fields such as condition codes,
+ rounding modes, CSR IDs, or x86 prefix state.
+- Larger arch-private decode payloads can live behind an arena-backed
+ side table if needed; the common fast path should remain inline and
+ fixed-size.
+
+The lifter switches on the arch-owned opcode. `objdump` formats using
+the same `CfreeDecodedInsn` plus the arch's formatter. `dbg` can either
+reuse `CfreeDecodedInsn` directly or keep a narrow `ArchDbgInsn` wrapper
+implemented on top of the shared decoder.
+
+### Block Decode
+
+`decode_block` is a convenience over `decode_one`, not a separate source
+of truth. It repeatedly decodes instructions until one of these happens:
+
+- decoded instruction has `CFREE_DECODE_TERMINATOR`;
+- output capacity is reached;
+- decode fails;
+- a page boundary or mapped-range boundary blocks a complete instruction.
+
+The dispatcher uses this to form translation units. The debugger and
+disassembler can continue to decode one instruction at a time.
+
+## Process Model
+
+`EmuProcess` is the root context for emulation. All mutable state hangs
+off it or below it.
```c
-void emu_lift_block(EmuArch, CG* cg,
- const EmuInst* insts, u32 n,
- EmuLiftCtx* ctx);
+typedef struct EmuProcess {
+ CfreeCompiler *compiler;
+ const ObjFormatImpl *obj_format;
+ const ArchImpl *guest_arch;
+ const CfreeOsImpl *guest_os;
+
+ EmuAddrSpace *addr_space;
+ EmuLoadedImage image;
+ EmuRuntime runtime;
+
+ EmuThread *threads;
+ uint32_t nthreads;
+} EmuProcess;
```
-`EmuLiftCtx` carries: the CPUState `Type*`, the synthesized block function
-type (`next_pc_t (*)(CPUState*)`), the `ObjSymId` for the block, runtime
-helper symbols (memory load/store, syscall trampoline, dispatcher
-tail-call), and per-block lazy-flag state (§5.5).
-
-The lifter targets `CG` exclusively (`src/cg/cg.h`) — never `CGTarget`
-directly. It uses roughly this subset:
-
-- `cg_push_global(cpu_state_sym)`, `cg_push_int`, `cg_push_const`
-- `cg_load`, `cg_store`, plus a small `lift_field(ctx, off, T)` helper
- layered on `cg_addr` + offset arithmetic
-- `cg_binop`, `cg_unop`, `cg_cmp`, `cg_convert`
-- `cg_atomic_*` for guest atomics
-- `cg_label_new` / `cg_label_place` / `cg_jump` / `cg_branch_true`
-- `cg_call` for runtime helpers (memory access, syscalls, dispatcher
- tail-calls — `cg_call` materializes ABI parts from `fn_type`, which is
- the main reason CG and not CGTarget)
-- `cg_set_loc` carrying the *guest* PC encoded as a `SrcLoc` against a
- synthetic `SourceManager` file id (§DESIGN 5.0)
-
-It does not use: aggregates, bitfields, variadics, setjmp, structured
-scopes, inline asm. Those C-shaped surfaces remain available to the C
-front-end at zero cost.
-
-### 5.3 `CPUState` (`src/emu/cpu.h`)
-
-Per-arch C struct synthesized once per emu invocation as an interned
-`Type*`. Fields:
-
-- General-purpose register file (`u64 x[N]`).
-- Lazy-flag fields (§5.5): last op kind, last operands, materialized
- flags cache.
-- Pointer to the guest memory base (host pointer).
-- Pointer to the dispatcher entry / code-cache lookup function.
-- Trap reason / exit code slots written before returning to the
- dispatcher.
-
-Lifters reference fields by stable offset constants generated alongside
-the `Type*`. The runtime allocates one `CPUState` per guest thread and
-exposes its address as an `ObjSymId` resolved externally by the JIT
-linker.
-
-### 5.4 `Runtime` (`src/emu/runtime.h`)
-
-In-process runtime, linked into the cfree binary, callable from JITted
-guest blocks via the JIT's external resolver. Responsibilities:
-
-- **Dispatcher.** `emu_run(CPUState*)`: loop { lookup guest_pc →
- translate-if-cold → call block }, exits on trap.
-- **Code cache.** `guest_pc → host entry` map; translation happens on
- miss. Eviction deferred (cache grows unbounded in v1).
-- **Reserved code region.** One up-front `PROT_NONE` mmap (~128 MB)
- whose base address feeds `link_resolve_at` (§6). Per-block
- `link_resolve_extend` bump-allocates within it; the runtime commits
- pages and `mprotect`s RX as new blocks land.
-- **Block chaining.** When a block's terminator targets an
- already-translated block, patch the tail to jump directly, bypassing
- the dispatcher. Patching is a runtime concern — CG/opt see only the
- pre-patch tail-call.
-- **Memory helpers.** `emu_load{8,16,32,64}` / `emu_store_*`: bounds-check
- the guest address against the mapped guest AS, trap on miss. Lifter
- emits a `cg_call` to these for every guest memory op in v1; an inline
- fastpath is a follow-up (§9).
-- **Syscall trampoline.** `emu_syscall(CPUState*)` reads the guest
- syscall number/args from CPUState, forwards via the per-OS table in
- `src/emu/syscall/`, writes the return into the guest return register.
-
-### 5.5 Flag policy — lazy flags
-
-Most ISAs (aarch64 NZCV, x86 EFLAGS) compute condition flags as a side
-effect of arithmetic. CG/CGTarget have no flag primitives. The lifter
-implements lazy flags entirely above CG:
-
-- Each flag-setting guest op writes (op_kind, lhs, rhs) into CPUState
- fields. Flags are *not* recomputed eagerly.
-- Each flag-reading guest op (conditional branch, `cset`, …) recomputes
- the specific flag bit it needs from the recorded inputs.
-- opt's GVN/DCE eliminates redundant flag computations within a block;
- cross-block redundancy is recovered after inlining via block chaining.
-
-No CGTarget extension. Adding an ISA's flag set is a per-arch lifter
-table, not a pipeline change.
-
-### 5.6 Memory model — guest address space
-
-Guest loads/stores all go through a base pointer of unknown C provenance,
-so CG's default `MemAccess.alias` derivation (§DESIGN 5.6) collapses to
-`ALIAS_UNKNOWN`. To recover useful aliasing, the lifter sets
-`MemAccess.addr_space = EMU_GUEST_AS` on every guest memory op. opt's
-alias rules treat guest-AS accesses as:
-
-- May alias each other.
-- Do not alias C memory (CPUState fields, runtime state).
-
-This is a one-field convention on the existing `MemAccess` shape; no API
-change. CPUState field accesses use the default `addr_space = 0` and
-remain promotable / GVN'able as ordinary C memory.
-
-## 6. Lifecycle — per-block JIT on a single growing LinkImage
-
-The compiler pipeline is TU-shaped (§DESIGN 5.5.1, 9.1). The emu wants
-per-block translate-on-demand. The model is a *single* `LinkImage` that
-grows as cold blocks land — never a fresh image per block — so chaining,
-the code cache, and the host VA region all reference one stable artifact
-across the emu session.
-
-This requires `link/` to expose incremental resolution before any emu code
-lands:
+`EmuThread` owns one guest CPU state:
```c
-LinkImage* link_resolve_at(Linker*, uintptr_t base_va); /* first call */
-void link_resolve_extend(Linker*, LinkImage*); /* later calls */
+typedef struct EmuThread {
+ EmuProcess *process;
+ void *cpu_state;
+ uint64_t pc;
+ uint32_t trap;
+ int exit_code;
+} EmuThread;
```
-`link_resolve_at` reserves layout starting at a caller-specified base VA
-(the runtime hands out from a pre-reserved `PROT_NONE` region, typically
-~128 MB). `link_resolve_extend` appends new inputs: it places new sections
-at the next free offset within the reserved region, resolves new symbols
-against the existing image's symbol table plus the `LinkExternResolver`,
-and applies new relocations into the live image. **It must not change
-host addresses of previously placed sections** — chaining has already
-patched live host code with those addresses. §DESIGN 5.5.1's existing
-discipline (stable `LinkInputId`s, separable `LinkRelocApply`,
-non-destructive resolution) is exactly what makes this safe.
-
-The per-block flow is then:
-
-1. Decode guest bytes for the block.
-2. Lift into a fresh `ObjBuilder` containing one function.
-3. `link_add_obj` against the session's `Linker`.
-4. `link_resolve_extend` to place the new section in the reserved VA
- region, resolve symbols (helpers via the resolver; cross-block
- references always resolve to the dispatcher — see below), apply
- relocations.
-5. Commit the newly used pages and `mprotect` to RX.
-6. Insert `guest_pc → host_entry` into the code cache.
-
-Cross-block calls during step 4 always resolve to the dispatcher, even if
-the target block is already translated. The linker never learns about
-guest PCs or sibling blocks. Cross-block direct jumps are installed
-*later* by chaining, which is a runtime mprotect-and-patch operation
-outside the linker entirely. Keeping these two mechanisms separate avoids
-the duplication that "let the linker resolve sibling blocks" would
-introduce.
-
-What this model deliberately does *not* support in v1:
-
-- **Symbol removal / re-resolution.** Code-cache eviction would need it;
- v1 lets the cache grow unbounded (§5.4) so the question is moot. When
- eviction lands it pairs with a runtime-side "invalidate chain patches
- pointing into the evicted block" side-table, not a linker mutation.
-- **Address relocation of previously placed sections.** The reserved-VA
- bump-allocator never compacts; the chaining invariant depends on this.
-
-## 7. Driver API
-
-Mirrors §DESIGN 13.
+No global process state is required. Registries are static immutable
+tables; process state is explicit.
+
+## Address Space
+
+`EmuAddrSpace` is the only module that translates guest virtual
+addresses to host pointers.
+
+Responsibilities:
+
+- map executable segments with guest permissions;
+- provide checked read/write/fetch helpers;
+- maintain page permissions and dirty/translated-page metadata;
+- expose a bounded host pointer for decode and runtime memory helpers;
+- report faults in a target-independent form.
+
+The lifter should not inline host pointer arithmetic initially. It emits
+calls to runtime helpers:
+
+```c
+uint8_t emu_load8 (EmuThread *, uint64_t guest_addr);
+uint16_t emu_load16(EmuThread *, uint64_t guest_addr);
+uint32_t emu_load32(EmuThread *, uint64_t guest_addr);
+uint64_t emu_load64(EmuThread *, uint64_t guest_addr);
+void emu_store8 (...);
+```
+
+An inline memory fast path can be added later by making the helper ABI
+and address-space invariants explicit first.
+
+## Lifting
+
+The lifter emits one CG function per guest basic block.
```c
-typedef struct CfreeEmuOptions {
- EmuArch guest_arch;
- const u8* guest_elf_bytes;
- size_t guest_elf_len;
- u32 optimize; /* 0 = direct CGTarget, 2 = opt_cgtarget */
- EmuTraceFlags trace; /* PC trace, decoded-inst trace */
- /* argv, envp, fd map come through CfreeEnv */
-} CfreeEmuOptions;
-
-int cfree_emu_run (Compiler*, CfreeEmuOptions, int* out_exit_code);
-
-/* Lower-level surface for dbg integration. */
-typedef struct CfreeEmu CfreeEmu;
-CfreeEmu* cfree_emu_new (Compiler*, CfreeEmuOptions);
-int cfree_emu_step (CfreeEmu*, u32 nblocks);
-void* cfree_emu_lookup(CfreeEmu*, u64 guest_pc); /* translate-if-cold */
-void cfree_emu_free (CfreeEmu*);
+uint64_t emu_block_<guest_pc>(EmuThread *thread);
+```
+
+The return value is normally the next guest PC. Traps and exits are
+recorded in the thread/process state so the dispatcher can stop without
+requiring every block to encode a large result struct.
+
+The lifter may use:
+
+- function begin/end and symbols;
+- integer and floating-point arithmetic;
+- loads/stores of CPU-state fields;
+- branches and labels;
+- calls to runtime helpers;
+- source locations that encode guest PCs for debug views.
+
+The lifter should not use:
+
+- object format APIs;
+- linker APIs;
+- executable memory APIs;
+- disassembly strings;
+- host OS syscall APIs directly.
+
+Architecture-specific lowering belongs in `ArchEmuOps.lift_block`.
+Shared helpers for CPU field addressing, helper calls, and block
+function construction belong in `src/emu/lift/`.
+
+## Execution Engines
+
+`emu` should support multiple execution engines over the same process and
+decode/lift infrastructure.
+
+### JIT Engine
+
+The JIT engine uses the normal pipeline:
+
+```text
+decoded block -> CG -> optional opt -> ObjBuilder -> link -> CfreeJit
+```
+
+The existing link/JIT APIs should do most of the work once the lifter
+emits valid CG. `emu` still owns:
+
+- guest PC to host entry cache;
+- dispatcher;
+- runtime helper resolver;
+- trap/exit handling;
+- code invalidation policy;
+- block publication strategy.
+
+Two publication strategies are acceptable:
+
+- per-block object publication into a long-lived JIT session;
+- a single growing `LinkImage`, if the linker keeps stable host
+ addresses across extension.
+
+The first implementation should choose the simpler strategy that fits
+the current JIT API. The design requirement is stable lookup and
+eventual invalidation, not a specific linker-internal shape.
+
+### IR Interpreter
+
+An IR interpreter is a first-class execution engine, not just a test
+helper. It allows `emu` to run where executable memory is unavailable and
+validates lifted semantics independently from machine-code emission.
+
+The intended flow is:
+
+```text
+decoded block -> CG -> canonical IR -> optional opt -> IR interpreter
+```
+
+The interpreter should run the canonical pre-machinized IR. Interpreting
+post-lowering machine IR would mix backend details into the emulation
+contract and make cross-architecture behavior harder to test.
+
+This engine requires an explicit IR execution ABI:
+
+- call a lifted block function with an `EmuThread *`;
+- model helper calls through a host function table;
+- read/write CPU-state memory through normal IR memory operations;
+- return the next guest PC or propagate a trap.
+
+### Instruction Interpreter
+
+A direct instruction interpreter is useful for bring-up and differential
+testing, but it should not become the primary semantic source. The
+primary semantics should live in the lifter. If an instruction
+interpreter exists, it should consume `CfreeDecodedInsn`, not private
+decoder records.
+
+## Runtime Helpers
+
+Runtime helpers live in `src/emu/runtime/` and are linked by symbol or
+function table into JIT/IR execution.
+
+They own:
+
+- memory helpers;
+- dispatcher callbacks;
+- syscall trampoline that asks the OS vtable to decode/encode guest ABI
+ state and asks `EmuExternalBindings` to service the request;
+- dynamic import trampolines that call `EmuExternalBindings`;
+- trap and exit helpers;
+- optional tracing hooks.
+
+Runtime helpers are process-aware: they receive `EmuThread *` or
+`EmuProcess *`, not global state.
+
+## Debugging And Object Views
+
+Guest debug information belongs to the loaded image. Host debug
+information belongs to generated blocks.
+
+The design should support:
+
+- guest PC source mapping from the guest executable's DWARF;
+- generated host block symbols named by guest PC;
+- `cfree_jit_view` / objdump of generated host code;
+- debugger stepping that maps host PCs back to guest PCs.
+
+The lifter should stamp CG locations with guest PC information. The JIT
+debug view can then relate generated host instructions back to guest
+addresses even before source-level guest debugging is complete.
+
+## Module Ownership
+
+Proposed ownership:
+
+```text
+src/emu/
+ emu.c public API glue and lifecycle
+ process.c EmuProcess / EmuThread ownership
+ mem.c guest address space
+ dispatch.c block cache and run loop
+ bindings.c embedder outside-interaction binding glue
+ lift/common.c arch-independent CG helpers for lifters
+ runtime/*.c memory, traps, helper table, tracing
+ interp/ir.c lifted IR interpreter
+ interp/inst.c optional decoded-instruction interpreter
+
+src/arch/<arch>/
+ decode.c ArchDecodeOps implementation
+ disasm.c formatter over CfreeDecodedInsn
+ emu.c ArchEmuOps: CPU layout, init, lift
+ isa.* shared encoding tables and operand extraction
+
+src/obj/<format>/
+ *_read.c relocatable/shared object reader
+ *_emit.c relocatable object emitter
+ *_image.c executable image loader for ObjFormatEmuOps
+ registry.c ObjFormatImpl entries
+
+src/os/<os>/
+ os.c CfreeOsImpl entry
+ syscall.c guest syscall ABI decode/encode
+ process.c stack, auxv, TLS, signal conventions
+```
+
+Exact filenames can vary. The boundary should not: format code loads
+files, arch code decodes/lifts instructions, OS code models user ABI,
+and `emu` coordinates process execution.
+
+## First Target
+
+The first implementation target is a deliberately small vertical slice
+that exercises every intended boundary without requiring broad ISA,
+loader, OS, or execution-engine coverage.
+
+Target:
+
+- guest arch: RV64;
+- guest OS: Linux;
+- object format: ELF;
+- executable shape: static ELF64 little-endian `ET_EXEC`;
+- execution engine: lifted IR interpreter;
+- outside interaction: `EmuExternalBindings.syscall`;
+- guest behavior: `_start` exits with a code through `ecall`.
+
+The test program can be:
+
+```asm
+addi a0, zero, 42 # exit code
+addi a7, zero, 93 # Linux rv64 SYS_exit
+ecall
```
-Path-shaped helpers (`cfree emu prog.elf`) live in the driver layer and
-read bytes via `c->env->file_io->read_all`. The freestanding core never
-takes paths.
-
-## 8. Debug info
-
-When the guest ELF carries DWARF, source-level stepping comes for free:
-
-- A guest-DWARF reader (extension of `src/debug/`) maps guest PC →
- guest source line.
-- `dbg` interrogates `cfree_emu_lookup` plus the guest DWARF for
- per-line breakpoints and stepping.
-- Lifter feeds `cg_set_loc` a `SrcLoc` whose `file_id` is a synthetic
- `SourceManager` file representing the guest binary, with line numbers
- encoding guest PC. opt's `Inst.loc` and host-side DWARF then point
- back at guest PCs, useful for `objdump` of JITted host code.
-
-No new debug-info pipeline.
-
-## 9. Open questions
-
-- **CPUState promotion.** CPUState is passed by pointer, so its fields
- are address-taken from CG's view and `build_ssa` won't promote them
- (§DESIGN 12). Hot guest registers will spill/reload across every
- arithmetic op, which kills perf. Likely fix: a per-block convention
- that imports CPUState fields into virtuals at entry and exports at
- exit, treating mid-block accesses as the SSA values. This is the
- single largest perf question and worth prototyping before the lifter
- interface freezes.
-- **Inline memory fastpath.** A `cg_call` per guest memory op is a real
- function call. Inlining a bounds check + direct host load is a known
- emu speedup; needs either a cross-runtime/JIT inliner or a CG-level
- helper. Defer until measured.
-- **Vector ISA support.** Blocked by CGTarget lacking vector ops
- (§DESIGN 5.7). Lands the same day vectors land for the C front-end.
-- **x86 flag policy.** EFLAGS has more bits and more cross-instruction
- dependencies than aarch64 NZCV. Lazy flags work but the recorded
- payload is larger; verify on aarch64 before committing to x86.
-- **SMC detection.** v1 refuses to lift on observed write to a
- translated page (write-protect translated guest pages, trap writes).
- Full SMC support — invalidate-and-retranslate — is future work.
+`SYS_exit_group` (`a7 = 94`) may be accepted too, but `SYS_exit` is
+enough for the first acceptance test.
+
+Minimum implementation requirements:
+
+- `ObjFormatImpl(ELF).emu`
+ - Detect ELF64 little-endian `EM_RISCV`.
+ - Accept static `ET_EXEC` with `PT_LOAD` segments.
+ - Map loadable segments into `EmuAddrSpace`.
+ - Set `EmuLoadedImage.entry_pc`.
+ - Return `CFREE_UNSUPPORTED` for dynamic executables, PIE, unsupported
+ relocations, TLS, and malformed inputs.
+- `CfreeOsImpl(Linux)`
+ - Initialize one process and one thread.
+ - Provide a valid aligned stack pointer, even if argv/envp/auxv are
+ minimal.
+ - Decode RV64 Linux syscall ABI: number in `a7`, arguments in `a0-a5`.
+ - Encode syscall results for non-exit calls, although the first test
+ should not need them.
+- `EmuExternalBindings`
+ - Implement `syscall`.
+ - Recognize syscall `93` and set the emulated exit state from `a0`.
+ - Optionally recognize syscall `94`.
+ - Return a deterministic unsupported result for every other syscall.
+- `ArchDecodeOps(RV64)`
+ - Decode `ADDI` and `ECALL`.
+ - Mark `ECALL` as a terminator.
+ - Provide a formatter over the same `CfreeDecodedInsn` records so the
+ disassembler path uses the shared decoder shape.
+- `ArchEmuOps(RV64)`
+ - Define a minimal CPU state with `x[32]`, `pc`, trap state, and exit
+ code.
+ - Initialize `pc` from `EmuLoadedImage.entry_pc` and `x2` from the
+ initial stack pointer.
+ - Lift `ADDI` and `ECALL` to CG.
+- IR interpreter
+ - Execute the lifted block function with an `EmuThread *`.
+ - Route runtime helper calls through the same helper/binding table that
+ the JIT engine will use later.
+
+Acceptance criteria:
+
+- The in-memory ELF loads through the ELF object-format vtable.
+- Process/thread initialization goes through the Linux OS vtable.
+- Instruction decode goes through `ArchDecodeOps(RV64)`.
+- The disassembler formatter can format the same decoded instructions.
+- The lifter goes through `ArchEmuOps(RV64)` and emits CG.
+- The lifted IR interpreter runs the block without executable memory.
+- The exit syscall goes through `EmuExternalBindings.syscall`, not a
+ built-in host syscall.
+- The emulated process exits cleanly with code `42`.
+
+This slice intentionally excludes dynamic linking, libc startup, memory
+loads/stores, branches, JIT execution, signals, file I/O, clocks, and
+host-backed syscalls. Those features should extend the same boundaries
+rather than introduce new ones.
+
+## Implementation Order
+
+1. Add `CfreeDecodedInsn` and `ArchDecodeOps` behind `ArchImpl`.
+ Convert one architecture's disassembler to `decode_one + format`
+ without changing public `cfree_disasm_*`.
+2. Convert `objdump` and debugger instruction decode paths to the shared
+ decoder where practical.
+3. Add object-format executable loader hooks and return `EmuLoadedImage`
+ instead of an object-builder-like shape.
+4. Add an OS registry, outside-interaction bindings, and a minimal OS
+ vtable with process init and syscall ABI decode/encode.
+5. Add `ArchEmuOps` for one guest architecture with CPU layout and a
+ small lifted block subset.
+6. Add the IR interpreter execution engine.
+7. Add the JIT execution engine using existing CG, opt, object, link, and
+ JIT APIs.
+8. Expand ISA coverage and binding-backed syscall/import coverage with
+ differential tests against known guest binaries.
+
+Each step should be testable in isolation. The decoder split can land
+before any executable runs; the loader can be tested by inspecting
+`EmuLoadedImage`; the lifter can be tested through the IR interpreter
+before executable memory is involved.
diff --git a/doc/OPT_PERF.md b/doc/OPT_PERF.md
@@ -94,6 +94,91 @@ previously broken `matrix -O2`, `binary-trees`, `nbody`, and `spectral-norm`
rows. The remaining `COMPILE_FAIL` rows are all MIR `c2m` on the Darwin
hosted/math benchmarks.
+## Latest Focused O0/O1 Refresh
+
+Date: 2026-05-25.
+
+Scope:
+
+- `array`
+- `hash`
+- `hash2`
+- `matrix`
+- `sieve`
+
+Levels: `0 1`.
+
+Repeats: one compile repeat and three run repeats.
+
+Command:
+
+```sh
+CFREE_OPT_BENCH_LEVELS="0 1" \
+CFREE_OPT_BENCHES="array matrix hash hash2 sieve" \
+make bench-opt
+```
+
+Output:
+
+- `build/bench/opt/results.csv`
+- `build/bench/opt/summary.md`
+
+Coverage:
+
+| status | rows |
+| --- | ---: |
+| OK | 50 |
+
+Because this was a focused O0/O1 run, the generated summary does not include
+gcc `-O2` base rows and therefore reports base-relative speed ratios as `NA`.
+The generated summary averages are:
+
+| tool | opt | cases | avg compile+codegen ms | avg runtime ms |
+| --- | ---: | ---: | ---: | ---: |
+| `gcc-15` | `O0` | 5 | `413.780` | `8371.356` |
+| `gcc-15` | `O1` | 5 | `408.000` | `3142.126` |
+| `clang` | `O0` | 5 | `188.181` | `8858.075` |
+| `clang` | `O1` | 5 | `182.725` | `3168.854` |
+| `mir-c2m` | `O0` | 5 | `139.286` | `5608.600` |
+| `mir-c2m` | `O1` | 5 | `138.051` | `5044.000` |
+| `cfree` | `O0` | 5 | `68.995` | `8488.662` |
+| `cfree` | `O1` | 5 | `67.520` | `4276.252` |
+| `cfree-run` | `O0` | 5 | `27.630` | `8766.237` |
+| `cfree-run` | `O1` | 5 | `28.935` | `4156.487` |
+
+Direct ratios from this run:
+
+| comparison | compile-time ratio | runtime ratio |
+| --- | ---: | ---: |
+| `cfree O1` vs `MIR O1` | `0.489x` time, `2.04x` faster | `0.848x` time, `1.18x` faster |
+| `cfree O1` vs `gcc-15 O0` | `0.163x` time, `6.13x` faster | `0.511x` time, `1.96x` faster |
+| `cfree-run O1` vs `MIR O1` | `0.210x` time, `4.77x` faster | `0.824x` time, `1.21x` faster |
+| `cfree-run O1` vs `gcc-15 O0` | `0.070x` time, `14.30x` faster | `0.497x` time, `2.01x` faster |
+
+Compared with the previous O1-only focused refresh, current O1 rows are within
+single-run noise: `cfree` compile time moved from `63.851` to `67.520` ms and
+runtime from `4275.508` to `4276.252` ms; `cfree-run` compile+JIT moved from
+`28.682` to `28.935` ms and runtime from `4150.248` to `4156.487` ms.
+
+Per-bench `cfree-run O1` runtime vs peers:
+
+| bench | cfree-run ms | MIR ms | gcc-15 ms | clang ms | cfree vs MIR |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| `array` | `2828.6` | `4576.0` | `2007.1` | `2839.8` | `0.62x` |
+| `hash` | `4048.2` | `3978.0` | `3957.6` | `3966.2` | `1.02x` |
+| `hash2` | `4501.2` | `3687.0` | `4155.9` | `3684.7` | `1.22x` |
+| `matrix` | `4812.5` | `9033.0` | `2949.7` | `2990.8` | `0.53x` |
+| `sieve` | `4592.0` | `3946.0` | `2640.3` | `2362.8` | `1.16x` |
+
+Interpretation: the O1 correctness/performance acceptance scope remains clean
+after moving post-allocation consumers to read allocation placement through
+`Func.preg_locs` and fixing replayed call-plan outgoing stack reservation.
+Runtime shape is materially unchanged: cfree is strongly ahead of MIR on
+`array` and `matrix`, close on `hash`, and still trails on `hash2` and `sieve`.
+
+Correctness follow-up: clean O0 and O1 toy bootstraps were reconfirmed on
+2026-05-25 after fixing replayed call-plan outgoing stack reservation.
+
## Headline Geomeans
Refreshed 2026-05-23 after the combiner perf+correctness fixes, MIR-shaped
@@ -333,26 +418,25 @@ Coverage goals:
## O1 Gap Analysis vs MIR
cfree `O1` runs only the shared lowering pipeline (`doc/OPT.md`): build_cfg,
-jump_cleanup, simplify_local, machinize, addr_xform_pregs, build_loop_tree,
-live_blocks, dead_def_elim, regalloc (simple mode), combine, dce, emit.
+jump_cleanup, simplify_local, machinize, build_loop_tree, live_blocks,
+dead_def_elim, regalloc (simple mode), combine, dce, emit. The local/global
+address folds (`opt_addr_xform_pregs`, scalar local promotion, global address
+CSE) remain optional and are not currently called by O1.
No SSA-era passes and no coalescing run at `O1`, matching MIR
(`mir-gen.c:9431`: coalesce is gated on `optimize_level >= 2`).
-Headline at `O1` (post-jump-layout pass, 2026-05-23 refresh):
+Headline at `O1` (focused O0/O1 refresh, 2026-05-25):
| metric | cfree | MIR | cfree vs MIR |
| --- | ---: | ---: | ---: |
-| opt+link+JIT (5-case scope, geomean) | `0.701 ms` | `0.726 ms` | `0.966x` (3.4% faster) |
-| runtime (5-case scope, geomean) | `3837.5 ms` | `4687.5 ms` | `0.819x` (18.1% faster) |
-
-cfree `O1` now leads MIR on both axes of this 5-case scope. Compile-time
-slice is within noise of MIR. Runtime is `1.22x` ahead, driven by matrix
-(`1.89x` ahead) and array (`1.69x` ahead) where the combiner +
-block-layout pair eliminates the redundant address-mode moves and the
-trivial inter-block branches. hash and sieve are within `7%` of MIR.
-hash2 remains the worst outlier at `9.3%` slower — same hash-table code
-as hash with larger working set, likely a register-pressure or cache
-effect that O1 can't address without coalescing/splitting.
+| full compile (5-case avg) | `67.520 ms` | `138.051 ms` | `0.489x` time (`2.04x` faster) |
+| runtime (5-case avg) | `4276.3 ms` | `5044.0 ms` | `0.848x` time (`1.18x` faster) |
+
+cfree `O1` leads MIR on both axes of this 5-case scope. Runtime is driven by
+matrix (`1.87x` faster than MIR) and array (`1.62x` faster), where the combiner
+and block-layout work eliminate redundant address-mode moves and trivial
+inter-block branches. `hash` is near parity, while `hash2` and `sieve` remain
+the current O1 runtime outliers.
Compile-time gaps closed by recent work:
diff --git a/doc/OPTv2.md b/doc/OPTv2.md
@@ -0,0 +1,937 @@
+# OPTv2 - Typed Lowering and Allocation Design
+
+This document proposes the next optimizer representation for cfree. The
+current optimizer design and pass order are documented in `doc/OPT.md`.
+Performance measurements, current O1/MIR comparisons, and optimization targets
+are tracked in `doc/OPT_PERF.md`; this design treats those numbers as
+constraints, not as background information.
+
+The short version: keep the fast O1 algorithms, but stop changing the meaning
+of the same operand field as the function moves through lowering. The recorded
+IR should stay virtual, allocation should be a separate map, and emitted code
+should come from an explicitly physical MIR.
+
+## Status - 2026-05-25
+
+Current implementation status:
+
+- Slice 1 is partially implemented: `OptBitset` grows on demand, trims active
+ words, and `opt_live_blocks` now uses a predecessor worklist. High-numbered
+ PReg coverage was added to `test-opt`.
+- Slice 3 is complete: `Func.preg_locs` is the canonical allocation location
+ table. Transitional mirroring into `OptPRegInfo` remains only for allocator
+ metadata and legacy tests.
+- HIR verification is stricter: `IR_PARAM_DECL` is def-only, storage and aux
+ operands are checked at phase boundaries, and allocation locations are
+ cross-checked.
+- Slice 4 is implemented for the O1/non-splitting path: register allocation
+ writes locations without mutating HIR, `opt_lower_to_mir` builds a separate
+ physical `Func.mir` block tape, MIR verification checks the lowered boundary,
+ post-RA combine/DCE/jump cleanup run through the MIR view, and `opt_emit`
+ emits from MIR when present.
+- Known-frame replay is re-enabled through the explicit known-frame descriptor.
+ The C-source target leaves `func_begin_known_frame` NULL because the hook is
+ optional and backend-owned.
+- O1 HIR local/global address folds are re-enabled before liveness:
+ `opt_addr_xform_pregs`, `opt_promote_scalar_locals`, and
+ `opt_addr_of_global_cse`.
+- Split allocation remains deferred for OPTv2 v1. The shared O1/O2 lowering
+ pipeline currently uses the non-splitting allocation path while the MIR
+ boundary is hardened.
+
+Validation status for this slice:
+
+- Passing (reconfirmed 2026-05-25, 698 checks): `make test-opt
+ HOST_OPTFLAGS=-O0`
+- Passing (reconfirmed 2026-05-25, 698 checks): `make test-opt
+ HOST_OPTFLAGS=-O1`
+- Passing (reconfirmed 2026-05-25, 6/6): `make test-ar-driver
+ HOST_OPTFLAGS=-O1`
+- Passing (reconfirmed 2026-05-25, 955/955): `make test-toy
+ HOST_OPTFLAGS=-O1`
+- Passing (reconfirmed 2026-05-25, 3712/3712): `make test-parse
+ HOST_OPTFLAGS=-O1`
+- Passing (reconfirmed 2026-05-25, clean `build/bootstrap`, bootstrapped toy
+ 955/955): `make bootstrap-test-toy HOST_OPTFLAGS=-O0`
+- Passing (reconfirmed 2026-05-25, clean `build/bootstrap`, bootstrapped toy
+ 955/955): `make bootstrap-test-toy HOST_OPTFLAGS=-O1`
+- Passing (reconfirmed 2026-05-25, 50/50 OK): `CFREE_OPT_BENCH_LEVELS="0 1"
+ CFREE_OPT_BENCHES="array matrix hash hash2 sieve" make bench-opt`
+
+Current O1 pipeline status:
+
+- Enabled: `build_cfg`, `jump_cleanup`, `simplify_local`, `machinize`,
+ O1 HIR address folds, `build_loop_tree`, `live_blocks`,
+ `dead_def_elim`, simple regalloc to `Func.preg_locs`, HIR-to-MIR lowering,
+ MIR verification, MIR post-RA combine, MIR DCE, MIR CFG/layout cleanup, MIR
+ emission, and known-frame replay.
+- Enabled for this slice: post-allocation consumers operate on the MIR view.
+ HIR remains virtual after allocation; the legacy rewritten-HIR path is kept
+ only for existing pass tests and `opt_regalloc` callers.
+- Deferred: split allocation and O2-specific allocation quality work.
+
+Recent O1 bootstrap fixes:
+
+- Fixed post-RA `try_sink` retargeting across call barriers. A call barrier
+ does not prove a callee-saved physical register is dead, so combine now
+ rejects retargeting when the source hard register is used after a call before
+ a real redefinition. This fixed the O1 miscompile of
+ `link_macho.c:build_codesig_skeleton`, where `sb = out->data` was lost after
+ the first `wr_u32_be` call and later code-signature writes corrupted the
+ Mach-O link state.
+- Fixed post-RA combine last-def repair when `try_sink` retargets a producer
+ away from a physical register that had an earlier reaching definition. This
+ removed the `coff_read_dso.c` stage3 crash in `init_struct_fields`.
+- Blocked copy substitution into store-address operands. Load address
+ substitution remains enabled, but memory-writing operands now keep their
+ original address base/index through this rewrite.
+- Split and simplified `opt_machinize` register collection so the self-hosted
+ O1 compiler no longer corrupts `opt_callee_saved` after the
+ `callee_save_mask` callback.
+- Avoided direct struct-return and returned-pointer array stores in the ar and
+ runtime driver paths that exposed the same store-address corruption class.
+- Fixed replayed call-plan outgoing stack reservation on aarch64, x64, and
+ rv64. Call plans now update the backend `max_outgoing` area and account for
+ `max(mem.size, 8)` per stack move, which prevents outgoing argument stores
+ from overwriting callee-saved slots. The O1 bootstrap `env.c` failure was a
+ symptom of this class: a replayed call to `obj_symbol_ex` clobbered the saved
+ `x19` slot in `cfree_cg_decl`, and the same overlap pattern could corrupt
+ parser stack frames.
+- Replay hard-register collection now maps virtual PRegs through
+ `Func.preg_locs` and keeps a raw-register fallback for operands that are
+ already physical.
+- Known-frame replay now uses the descriptor path after MIR lowering. Native
+ backends receive frame slots, outgoing stack area size, alloca/call flags,
+ and callee-save reservations before body emission.
+
+Acceptance criteria before marking OPTv2 O1 complete:
+
+- Done: `make bootstrap-test-toy HOST_OPTFLAGS=-O1` passes on a clean
+ `build/bootstrap`.
+- Done: O1 still emits toy tests correctly with the bootstrapped compiler.
+- Done: `CFREE_OPT_BENCH_LEVELS="0 1" CFREE_OPT_BENCHES="array matrix hash
+ hash2 sieve" make bench-opt` shows 50/50 OK rows. In the current summary
+ (`build/bench/opt/summary.md`), `cfree O1` averages `68.021 ms` compile time
+ and `4198.782 ms` runtime on the 5-case full-pipeline scope. That is `2.08x`
+ faster to compile than `MIR O1` and `1.19x` faster at runtime; versus
+ `gcc-15 O0`, it is `5.72x` faster to compile and `2.00x` faster at runtime.
+- Done for the O1 structural slice: allocation placement is separate from HIR,
+ MIR lowering/emission is active, known-frame replay is active, and the
+ optional O1 HIR folds are re-enabled.
+
+## Goals
+
+- Preserve O1 compile speed. `doc/OPT_PERF.md` shows cfree O1 is already in
+ the same backend-time class as MIR and materially faster than gcc/clang
+ whole-pipeline builds. OPTv2 must keep dense arrays, arena allocation, and
+ linear scans where they are currently winning.
+- Make representation invariants local and enforceable. A pass should not
+ need to know from context whether `Operand.v.reg` names a PReg, Val, or
+ physical register.
+- Keep frontends and targets behind their existing ownership boundaries.
+ Frontends still emit CG. Backends still own instruction encoding and ABI
+ prologue/epilogue details. The optimizer owns only the recorded IR,
+ allocation, and lowering handoff.
+- Support O1 first. O2 SSA and inlining are out of scope for OPTv2 v1. If the
+ boundary changes make O2 temporarily awkward, disable O2 or route it through
+ the O1 lowering path until it can be ported deliberately.
+
+## Current Problem
+
+The current O1 pipeline records CG operations into `Func`/`Block`/`Inst`, runs
+liveness and allocation on PRegs, then rewrites operands in place so replay can
+emit target code.
+
+That is efficient, but it gives one field multiple meanings:
+
+- Before register SSA, `OPK_REG` carries a PReg.
+- In SSA mode, `OPK_REG` carries a Val.
+- After O1 rewrite, the same `OPK_REG` usually carries a physical register.
+
+This phase-dependent convention is brittle. A post-RA pass can accidentally
+treat a physical register as a PReg. Replay can see stale virtual operands.
+`IR_PARAM_DECL` has also existed in two shapes: as a def-only marker and as an
+instruction with a synthetic self operand. Both are easy mistakes because the
+data model permits them.
+
+OPTv2 fixes this by giving each phase its own representation.
+
+## Representations
+
+### HIR: Recorded Optimizer IR
+
+HIR is the function shape recorded by `opt_cgtarget`.
+
+- `Func`, `Block`, and `Inst` remain arena-backed.
+- HIR operands are virtual. `OPK_REG` means PReg before SSA and Val during SSA.
+- HIR never stores physical registers in ordinary operands.
+- `IR_PARAM_DECL` is def-only:
+
+```c
+in->op = IR_PARAM_DECL;
+in->def = param_preg;
+in->type = param_type;
+in->nopnds = 0;
+in->opnds = NULL;
+```
+
+- Parameter and local storage remain in `IRParam` and `IRLocal`.
+- ABI call/return data remains structured in call/return aux records.
+
+HIR is the only representation consumed by pre-allocation passes such as CFG
+cleanup, local simplification, machinization, liveness, and allocation.
+
+### Allocation Map
+
+Allocation results should be stored in one canonical location table, separate
+from HIR operands.
+
+```c
+typedef enum OptLocKind {
+ OPT_LOC_NONE,
+ OPT_LOC_HARD,
+ OPT_LOC_SPILL,
+ OPT_LOC_SPLIT,
+} OptLocKind;
+
+typedef struct OptLocation {
+ u8 kind;
+ u8 cls;
+ union {
+ Reg hard;
+ FrameSlot spill;
+ u32 first_segment;
+ } v;
+} OptLocation;
+```
+
+`Func` owns `OptLocation* preg_locs`, indexed by PReg. `OptPRegInfo` can keep
+frequency, cost, live-range, and constraint metadata, but final placement
+should have one source of truth.
+
+Current code uses `OptLoc`, `OPT_LOC_STACK`, and `Func.preg_info` placement
+fields. The migration can either keep those names initially or rename them to
+the spelling above, but it should not do both semantic and naming churn in the
+same patch. The important boundary is one canonical PReg-to-location table that
+lowering reads and HIR operands do not encode.
+
+When split allocation is reintroduced, keep it segment-based:
+
+```c
+typedef struct OptSplitSegment {
+ PReg preg;
+ u32 block;
+ u32 start_point;
+ u32 end_point;
+ u8 loc_kind;
+ u8 cls;
+ Reg hard;
+ FrameSlot spill;
+ u8 reload_at_start;
+ u8 store_at_end;
+ u32 next;
+} OptSplitSegment;
+```
+
+This keeps today's dense, point-indexed allocator shape from `doc/OPT_PERF.md`
+while making the allocation result explicit.
+
+### MIR: Physical Lowered IR
+
+MIR is the output of O1 lowering after allocation. Replay should consume MIR,
+not rewritten HIR.
+
+```c
+typedef enum MOperandKind {
+ MOP_IMM,
+ MOP_PHYS_REG,
+ MOP_FRAME,
+ MOP_GLOBAL,
+ MOP_INDIRECT,
+} MOperandKind;
+
+typedef struct MOperand {
+ u8 kind;
+ u8 cls;
+ CfreeCgTypeId type;
+ union {
+ i64 imm;
+ Reg phys;
+ FrameSlot frame;
+ CGGlobalRef global;
+ struct {
+ Reg base;
+ Reg index; /* REG_NONE when absent */
+ i32 ofs;
+ u8 log2_scale;
+ } ind;
+ } v;
+} MOperand;
+```
+
+MIR should use inline operands for the common case:
+
+```c
+typedef struct MInst {
+ u16 op;
+ u16 nops;
+ SrcLoc loc;
+ MOperand small[3];
+ MOperand* extra_ops;
+ union {
+ i64 imm;
+ MemAccess mem;
+ void* aux;
+ } extra;
+} MInst;
+```
+
+Most instructions have at most three operands, so this is at least as cache
+friendly as today's per-instruction operand arrays and often better. Large
+operand lists still spill to arena storage.
+
+`MFunc` owns MIR blocks in emit order. CFG metadata can be omitted unless a
+future MIR pass needs it; replay only needs layout order and labels.
+
+## Pipeline Shape
+
+O1 becomes:
+
+```text
+frontend -> CG -> opt_cgtarget records HIR
+
+func_end:
+ frame-home address-taken reg locals
+ build CFG on HIR
+ jump cleanup (CFG mode)
+ rebuild CFG
+ simplify local HIR
+ verify HIR
+ machinize HIR
+ verify machinized HIR
+ loop tree
+ liveness over PRegs
+ dead-def elimination over HIR
+ register allocation -> OptLocation table
+ verify allocation/lowered state
+ lower HIR + OptLocation -> MIR
+ post-RA MIR combine
+ verify MIR/combine state
+ MIR DCE
+ verify MIR/DCE state
+ jump cleanup (CFG mode)
+ rebuild CFG
+ verify post-RA CFG
+ MIR jump/layout cleanup
+ emit MIR to wrapped target
+```
+
+O2 is deferred:
+
+```text
+frontend -> CG -> opt_cgtarget records HIR
+
+finalize:
+ for OPTv2 v1, either disable O2 or lower retained functions through the O1
+ path without running the SSA cleanup schedule
+```
+
+The important change is that O1 rewrite no longer mutates HIR. It builds MIR.
+O2 can later reuse the HIR allocation and MIR lowering boundary after the O1
+bootstrap path is correct and measured.
+
+## Current O1 Source Order
+
+The source of truth is `src/opt/opt.c`, not `doc/OPT.md`. As of this design,
+`w_func_end` first calls `opt_frame_home_addr_taken_locals`, then
+`opt_run_o1_pipeline`, which is `opt_run_lowering_pipeline(...,
+allow_live_range_split = 0)`.
+
+The current O1 order in code is:
+
+1. `opt_frame_home_addr_taken_locals`
+2. `opt_build_cfg`
+3. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_CFG)`
+4. `opt_build_cfg`
+5. `opt_simplify_local`
+6. `opt_verify("lowering-cfg")`
+7. `opt_machinize`
+8. `opt_verify("lowering-machinize")`
+9. `opt_build_loop_tree`
+10. `opt_live_blocks`
+11. `opt_dead_def_elim_with_live`
+12. `opt_regalloc(..., allow_live_range_split = 0)`
+13. `opt_verify("post-regalloc-rewrite")`
+14. `opt_combine`
+15. `opt_verify("post-ra-combine")`
+16. `opt_dce`
+17. `opt_verify("post-ra-dce")`
+18. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_CFG)`
+19. `opt_build_cfg`
+20. `opt_verify("post-ra-jump-cfg")`
+21. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_LAYOUT)`
+22. `opt_emit`
+
+The following PReg-level canonicalization passes exist in `src/opt/pass_o2.c`
+but are not currently called by O1:
+
+- `opt_addr_xform_pregs`
+- `opt_promote_scalar_locals`
+- `opt_addr_of_global_cse`
+
+They are treated below as optional future O1 HIR folds, not as active O1
+passes.
+
+## Invariants
+
+### HIR Invariants
+
+- `IR_PARAM_DECL` is def-only and has no operands.
+- HIR `OPK_REG` operands are never physical registers.
+- Before register SSA, HIR `OPK_REG` ids are PRegs in `[1, f->npregs)`.
+- During register SSA, HIR `OPK_REG` ids are Vals in `[1, f->nvals)`.
+- `IRParam.storage` and `IRLocal.storage` reference valid PRegs or valid frame
+ slots.
+- `OPK_LOCAL` frame slots are valid and refer to the expected slot class when a
+ pass requires that distinction.
+- Aux operands in calls, returns, inline asm, aggregates, atomics, and
+ intrinsics obey the same namespace rules as normal operands.
+
+### Allocation Invariants
+
+- Every live PReg has exactly one `OptLocation`.
+- `OPT_LOC_HARD` has a valid physical register of the matching class.
+- `OPT_LOC_SPILL` references an `FS_SPILL` frame slot.
+- `OPT_LOC_SPLIT` has at least one valid segment.
+- Split segments are ordered, non-overlapping per PReg, and cover every point
+ where the PReg is used or defined.
+- Two simultaneously live PRegs cannot share a hard register or spill slot
+ unless an explicit coalescing proof says they are the same value.
+- Fixed-register and inline-asm constraints are represented in allocation
+ inputs, not patched in after allocation.
+
+### MIR Invariants
+
+- MIR contains no PRegs or Vals.
+- Every register operand is `MOP_PHYS_REG` or part of `MOP_INDIRECT` and is a
+ valid physical register.
+- Every frame operand references a valid frame slot.
+- Call plans are fully physical: argument sources, return destinations, and
+ ABI register destinations are valid MIR operands.
+- MIR replay never calls PReg allocation helpers.
+
+## O1 Pass Porting Plan
+
+This section describes each active O1 pass from `src/opt/opt.c` and how it
+ports to OPTv2.
+
+### `opt_frame_home_addr_taken_locals`
+
+Keep as a HIR preparation pass.
+
+This pass runs before the O1 lowering pipeline. It materializes frame homes for
+address-taken locals that otherwise live in PRegs, then inserts loads/stores
+around uses/defs so taking the address observes the frame slot. In OPTv2 it
+should remain pre-allocation HIR because it changes local storage and virtual
+load/store structure.
+
+Verifier additions:
+
+- address-taken register locals with a home slot have valid frame homes
+- inserted loads/stores reference the local's home slot
+- the pass does not introduce physical registers
+
+### `opt_build_cfg`
+
+Keep as a HIR pass.
+
+CFG construction should continue to derive edges from HIR terminators and
+scope/control instructions. It should not inspect allocation state. The output
+is HIR block predecessor/successor lists plus `emit_order`.
+
+Verifier additions:
+
+- terminator successor count
+- reciprocal predecessor/successor links
+- no connected unreachable blocks
+- every `emit_order` block is valid and unique
+
+### `opt_jump_cleanup`
+
+Split into three uses matching the code.
+
+The early CFG cleanup remains a HIR pass. It canonicalizes branches and removes
+empty or unreachable control-flow artifacts before liveness. The post-RA CFG
+cleanup should move to MIR once post-RA code is represented as MIR. The final
+layout cleanup should also become a MIR-layout pass.
+
+Once MIR exists, fallthrough and branch deletion should operate on physical
+emitted blocks so it cannot accidentally rewrite virtual operands after
+allocation.
+
+Performance note: `doc/OPT_PERF.md` attributes part of O1's current runtime win
+to block-layout fallthrough cleanup. OPTv2 must preserve the greedy
+emit-order chain behavior and measure it with `make bench-opt`.
+
+### `opt_simplify_local`
+
+Keep as a HIR pass.
+
+This pass performs algebraic and addressing simplifications that are valid
+before allocation. It should continue to rewrite virtual operands and HIR
+instructions. Any rewrite that changes use/def structure must maintain or
+invalidate def-use in the current model, and later should use the incremental
+SSA/use-edge model described in `doc/SSA2.md`.
+
+### `opt_machinize`
+
+Keep as a HIR pass, but make its contract sharper.
+
+Machinization should express target constraints without converting operands to
+physical registers. Examples:
+
+- required register class
+- forbidden physical register masks
+- fixed-register constraints for inline asm or ABI-specific operations
+- target legality flags for addressing forms
+- call clobber and hard-live metadata
+
+It should not write physical registers into HIR operands. If a target decision
+requires a physical register, it becomes an allocation constraint.
+
+### Optional Future O1 Address Folds
+
+`opt_addr_xform_pregs`, scalar local promotion, and global address CSE are HIR
+canonicalization passes present in the codebase, but they are not active in the
+current O1 source order. If re-enabled, they should run before liveness.
+
+Porting rule:
+
+- address-of-local/global folds rewrite HIR operands only
+- scalar promotion updates `IRLocal`/frame-slot state and HIR loads/stores only
+- neither pass may depend on post-allocation physical registers
+
+These passes are optional for correctness. Re-enable them one at a time under
+HIR verification and benchmark with the `O1 Gap Analysis` workflow in
+`doc/OPT_PERF.md`.
+
+### `opt_build_loop_tree`
+
+Keep as a HIR analysis.
+
+The loop tree is built from HIR CFG after cleanup. It feeds liveness metrics and
+future O1 pressure models. No MIR dependency is needed for O1.
+
+### `opt_live_blocks`
+
+Keep as a HIR analysis over PRegs, but make it MIR-shaped.
+
+MIR does not use a true sparse bitmap here. Its `bitmap_t` is a growable vector
+of 64-bit words that expands on demand and truncates trailing zero words after
+set operations. cfree already has the most important local behavior through
+`OptBitset.active_words`; OPTv2 should lean into that and match MIR's model
+more directly:
+
+- keep 64-bit word bitmaps
+- keep active/trailing-zero trimming on every mutating bitmap operation
+- allow bitmaps to grow to the highest touched PReg instead of eagerly sizing
+ every block bitmap to `opt_reg_count`
+- use a worklist dataflow solver ordered by reverse postorder/postorder,
+ matching MIR's `solve_dataflow`
+- keep optional scan-var remapping available for special analyses, but use
+ direct PReg ids for O1 RA
+
+The dataflow equation stays the same:
+
+```text
+live_out = union(successor live_in)
+live_in = live_use | (live_out & ~live_def)
+```
+
+This preserves the current O1 semantics while stealing MIR's allocation and
+iteration shape. Do not switch to chunked sparse sets unless benchmark data
+shows the elastic dense bitmap is a real bottleneck.
+
+### `opt_dead_def_elim_with_live`
+
+Keep as a HIR pass.
+
+It removes dead virtual definitions before allocation. This should stay before
+MIR lowering because it reduces allocation pressure. The pass must treat
+`IR_PARAM_DECL` as a def-only value source and preserve all side-effecting
+operations.
+
+### `opt_regalloc`
+
+Keep the allocator algorithm shape, change the output. In current O1,
+`allow_live_range_split` is `0`.
+
+The allocator should consume HIR liveness and produce:
+
+- `OptLocation[preg]`
+- spill frame slots
+- split segments when enabled
+- scratch register lists
+- call-save/call-restore requirements
+
+It should not rewrite HIR operands.
+
+The point-indexed occupancy bitset design should be preserved and kept close to
+MIR. `doc/OPT_PERF.md` records a large O1 compile-time win from moving to
+MIR-shaped point occupancy; OPTv2 should keep that structure and only move the
+final assignment out of `OptPRegInfo` into a canonical location table.
+
+The live-range builder should also keep matching MIR:
+
+- build ranges by walking each block backward from `live_out`
+- model whole-block spans separately from instruction-local ranges
+- advance raw points only when live ranges are born or die
+- compress raw points to a dense point id space before allocation
+- feed `used_locs[point]` occupancy bitmaps from compressed points
+
+### O1 Lowering: HIR + Allocation -> MIR
+
+This replaces the current in-place rewrite performed during/after
+`opt_regalloc`.
+
+For each HIR instruction:
+
+- translate every virtual use through `OptLocation`
+- insert reloads before instructions that use spilled values
+- insert stores after instructions that define spilled values
+- materialize split reload/store edges
+- materialize call-save and call-restore code
+- lower call plans into physical argument and return moves
+- emit a physical `MInst` sequence
+
+This is where all PReg-to-physical knowledge belongs. After this pass, HIR is
+unchanged and MIR is fully physical.
+
+### `opt_combine`
+
+Move post-RA combine to MIR.
+
+The current post-allocation combiner is valuable and should not be discarded.
+`doc/OPT_PERF.md` credits O1 gains to operand-driven combine, live-range-scoped
+use counting, address-mode synthesis, and extension folding. Those algorithms
+should port directly, but their input should be MIR physical operands.
+
+MIR combine can be simpler because:
+
+- register operands are always physical
+- indirect operands never hide PRegs
+- use counts can be local to MIR blocks
+- no pass can accidentally consult PReg allocation state through a physical
+ operand
+
+### `opt_dce`
+
+Split into HIR DDE and MIR DCE.
+
+The pre-allocation dead-def elimination remains HIR-based. The post-combine DCE
+should become MIR-based and delete physical instructions whose definitions are
+unused and side-effect-free. It should not reason about PRegs.
+
+`doc/OPT_PERF.md` notes possible overlap between pre-allocation DDE and
+post-allocation DCE. OPTv2 should keep both initially for behavior parity, then
+benchmark whether MIR DCE can be narrowed.
+
+### Post-RA CFG Rebuild and Layout Cleanup
+
+Move these to MIR once post-RA code is physical MIR.
+
+The current code runs a CFG-mode jump cleanup, rebuilds CFG, verifies, then runs
+layout-mode jump cleanup. OPTv2 should preserve that ordering, but apply it to
+MIR after combine and DCE. The CFG-mode pass removes/control-simplifies blocks;
+the layout-mode pass optimizes fallthroughs in emit order.
+
+### `opt_emit`
+
+Replace replay of rewritten HIR with MIR emission.
+
+`opt_emit_mir` should:
+
+- open the function on the wrapped target
+- replay params and known frame slots from HIR function metadata
+- emit MIR blocks in layout order
+- reserve physical hard registers from MIR use collection
+- close the function
+
+It should not inspect `OptLocation` except for parameter storage translation
+where the backend prologue needs the final home location. That translation
+should be explicit and verifier-covered.
+
+## Verification
+
+OPTv2 should keep debug-only verification cheap and frequent:
+
+- HIR verifier after CFG cleanup and machinization
+- allocation verifier immediately after `opt_regalloc`
+- MIR verifier immediately after HIR-to-MIR lowering
+- MIR verifier after combine, DCE, and layout cleanup
+
+The verifier should fail at the phase boundary closest to the bug. For example,
+an operand-backed `IR_PARAM_DECL` should fail during HIR verification, not
+during replay. A stale PReg in MIR should fail during MIR verification, not in
+the backend encoder.
+
+## Performance Rules
+
+`doc/OPT_PERF.md` is the performance contract for this design.
+
+Preserve:
+
+- arena-backed function-local allocation
+- dense PReg-indexed arrays for hot per-value data
+- point-indexed allocator occupancy bitsets
+- operand-driven combiner dispatch
+- block-layout fallthrough cleanup
+- no SSA construction at O1
+- no coalescing or live-range splitting at O1 unless benchmarks justify it
+
+Avoid:
+
+- per-operand heap allocation in MIR
+- whole-function rescans in inner loops
+- rebuilding def-use or liveness after every small post-RA rewrite
+- adding abstraction layers in hot loops where a dense indexed array is enough
+
+Benchmark gates:
+
+- `make bench-opt`
+- focused O1 comparison:
+
+```sh
+CFREE_OPT_BENCH_LEVELS=1 make bench-opt
+```
+
+- targeted regressions from `doc/OPT_PERF.md`:
+
+```sh
+CFREE_OPT_BENCHES="array matrix hash hash2 sieve" \
+CFREE_OPT_BENCH_LEVELS=1 make bench-opt
+```
+
+Acceptance for the initial OPTv2 O1 port should be no material regression in
+the backend-only O1 geomean and no bootstrap regression.
+
+## Migration Plan
+
+1. Lock down HIR invariants.
+
+ Keep the current debug verifier strict: `IR_PARAM_DECL` is def-only, HIR
+ operands are virtual, params/locals reference valid storage, and aux operands
+ use the correct namespace.
+
+2. Make liveness and ranges MIR-shaped.
+
+ Keep `OptBitset` as a 64-bit-word bitmap, but make it elastic like MIR:
+ grow to the highest touched PReg, trim trailing zero words after operations,
+ and run liveness through a postorder/reverse-postorder worklist solver. Keep
+ the existing compressed point range builder and tighten it to MIR's
+ born/dead-boundary model.
+
+3. Introduce `OptLocation`.
+
+ Add the location table while still mirroring existing `OptPRegInfo`
+ allocation fields. Convert users one at a time, then remove duplicate final
+ placement fields.
+
+4. Add MIR data structures.
+
+ Add `MOperand`, `MInst`, `MBlock`, and `MFunc` with inline small operands.
+ Add a MIR verifier before any pass consumes MIR.
+
+5. Port rewrite into `opt_lower_to_mir`.
+
+ Keep the current reload/store/call-save algorithms, but write MIR instead of
+ mutating HIR. HIR should remain valid after the pass.
+
+6. Port emission.
+
+ Add `opt_emit_mir` and make O1 emit from MIR. Keep the old replay path
+ temporarily for comparison, then delete it once bootstrap and test-toy are
+ clean.
+
+7. Port post-RA combine and DCE.
+
+ Move the profitable O1 combines to MIR. Preserve the measured behaviors from
+ `doc/OPT_PERF.md` before adding new combines.
+
+8. Re-enable optional HIR O1 folds.
+
+ Re-enable address folds and scalar promotion only after HIR/MIR boundaries
+ are stable. Each pass should have focused tests and an O1 benchmark check.
+
+9. Defer O2.
+
+ For OPTv2 v1, keep the O2 implementation out of the critical path. Either
+ disable it or make it lower through the O1 path until O1 bootstrap and
+ `test-toy` are clean with the new HIR/MIR boundary.
+
+## V1 Decisions
+
+OPTv2 v1 is an O1 redesign. These choices favor bootstrap correctness and a
+small migration surface over solving O2 quality problems at the same time.
+
+### MIR CFG Shape
+
+Store layout order and successor labels in MIR v1, but do not build full
+predecessor lists by default.
+
+O1 needs:
+
+- physical block order for emission
+- terminator targets for jump cleanup
+- enough successor information for CFG-mode cleanup after combine/DCE
+
+It does not need dominance, loop analysis, SSA repair, or predecessor-driven
+phi updates after lowering. If a MIR pass needs predecessors later, build them
+as a derived scratch analysis from terminators, mirroring today's
+`opt_build_cfg` rather than storing and maintaining them eagerly.
+
+Tradeoff: a derived CFG costs one linear pass when needed, but avoids making
+every MIR rewrite responsible for keeping predecessor lists coherent.
+
+### Parameter Prologue Materialization
+
+Keep parameter prologue materialization backend-owned through `target->param`
+for v1.
+
+The native backends already encode ABI-specific incoming-register and
+incoming-stack handling in `x_param`, `aa_param`, and `rv_param`. Moving that
+into MIR immediately would duplicate ABI lowering and frame-layout logic at the
+same time as the HIR/MIR split. That is too much churn for the bootstrap path.
+
+MIR v1 should instead verify the boundary explicitly:
+
+- each parameter has a valid final storage location
+- register-backed parameters map to physical registers or are marked unused
+- frame-backed parameters have known-frame slot mappings when using
+ `func_begin_known_frame`
+- `target->param` is called before MIR body emission
+
+Later, if we want prologue instructions visible to MIR combine/DCE, add an
+explicit `MIR_PROLOGUE_PARAM` lowering step after O1 is stable.
+
+### Split Allocation
+
+Defer split allocation for O1 v1.
+
+Current O1 calls `opt_regalloc(..., allow_live_range_split = 0)`, and
+`doc/OPT_PERF.md` shows O1 is already close to MIR compile time while the
+bootstrap bug pressure is correctness, not spill quality. OPTv2 should first
+move today's non-splitting allocator result into `OptLocation[preg]` and lower
+that to MIR.
+
+Keep `OPT_LOC_SPLIT` and segment terminology in the design only as a
+compatibility slot for later O2 or O1 quality work. Do not require split
+segments, edge materialization, or split verification for the first O1 port.
+
+Tradeoff: this preserves today's O1 quality profile and avoids porting the
+most complex allocator path while the representation boundary is still moving.
+
+### Liveness Bitsets
+
+Match MIR's elastic dense bitmap model for v1.
+
+MIR's liveness bitmap is not a sparse set. It is a growable `uint64_t` vector
+that stores only words up to the current highest nonzero word. cfree should use
+the same shape: dense inside the active prefix, trimmed at the end, and iterated
+by scanning nonzero words.
+
+The current `OptBitset.active_words` design is already close. The remaining v1
+changes should be:
+
+- stop allocating every block bitmap to full `opt_reg_count` size up front
+- grow bitmaps on demand
+- keep bitmap operations trimming trailing zero words
+- switch liveness convergence to a MIR-like worklist solver
+- keep compressed live-range points as allocator input
+
+Tradeoff: this is still dense within the active prefix, so pathological high
+PReg ids can cost dense scans. That is acceptable because it matches MIR's
+successful O1 shape and avoids a separate sparse-set design.
+
+## Implementation Handoff
+
+The implementation should proceed in small behavior-preserving slices. Do not
+start by introducing MIR. First make the O1 analyses match MIR closely while
+the existing rewritten-HIR backend path still runs.
+
+### Slice 1: Elastic Bitmaps and Worklist Liveness
+
+Update `OptBitset`/`opt_live_blocks` before touching allocation output:
+
+- add an internal grow helper for `OptBitset`
+- make `opt_bitset_set` grow on demand
+- keep all mutating operations trimming trailing zero words
+- keep the public `opt_bitset_has` behavior unchanged
+- replace full reverse block sweeps with a MIR-like worklist solver
+- preserve the current `live_use | (live_out & ~live_def)` equation
+
+Targeted tests:
+
+```sh
+make test-opt HOST_OPTFLAGS=-O0
+make test-opt HOST_OPTFLAGS=-O1
+```
+
+The existing live tests around `opt_live_blocks` and
+`opt_live_ranges_phase2` should stay green. Add focused tests for high-numbered
+PRegs so on-demand growth and trailing-zero trimming are covered.
+
+### Slice 2: Tighten MIR-Style Live Ranges
+
+Keep `opt_live_ranges_build` compatible with current callers, but align the
+details with MIR:
+
+- start from each block's `live_out`
+- walk instructions backward
+- open ranges on uses and close ranges on defs
+- represent whole-block spans separately with `whole_block`
+- advance raw points only when a range is born or dies
+- compress raw points before allocation
+
+Targeted checks:
+
+```sh
+make test-opt HOST_OPTFLAGS=-O1
+CFREE_OPT_BENCH_LEVELS=1 CFREE_OPT_BENCHES="array matrix hash hash2 sieve" make bench-opt
+```
+
+### Slice 3: Canonical Allocation Locations
+
+Only after liveness/ranges are stable, introduce the single allocation table.
+Keep existing allocator heuristics and O1 `allow_live_range_split = 0`.
+
+Rules:
+
+- allocation writes one canonical PReg location table
+- HIR operands remain virtual after allocation
+- any temporary mirroring into `OptPRegInfo` is transitional and should have a
+ deletion point
+- verifier checks that post-allocation HIR still has virtual operands
+
+### Slice 4: MIR Lowering and Emission
+
+Then add MIR and move the in-place rewrite into `opt_lower_to_mir`.
+
+Keep v1 conservative:
+
+- backend-owned parameter prologue through `target->param`
+- no O1 live-range splitting
+- no O2 dependency
+- old rewritten-HIR emit path may remain temporarily for comparison
+
+Acceptance:
+
+```sh
+make bootstrap HOST_OPTFLAGS=-O0
+make bootstrap-test-toy HOST_OPTFLAGS=-O0
+make bootstrap HOST_OPTFLAGS=-O1
+make bootstrap-test-toy HOST_OPTFLAGS=-O1
+```
+
+If O2 blocks progress, disable it or route it through the O1 lowering path as
+described above.
diff --git a/driver/ar.c b/driver/ar.c
@@ -139,6 +139,14 @@ static int ar_name_selected(CfreeSlice name, int argc, char** argv,
return 0;
}
+static void ar_set_input_member(CfreeArInput* out, const char* name,
+ const CfreeFileData* fd) {
+ out->name.s = name;
+ out->name.len = driver_strlen(name);
+ out->bytes.data = fd->data;
+ out->bytes.len = fd->size;
+}
+
/* Open the archive bytes via file_io. Caller releases fd via ctx. */
static int ar_open_for_read(DriverEnv* env, const char* path,
CfreeContext* ctx_out, CfreeFileData* fd_out,
@@ -450,26 +458,20 @@ static int ar_do_write(DriverEnv* env, const char* archive_path, int nmembers,
int replaced = 0;
for (j = 0; j < nm; ++j) {
if (driver_streq(members[j].name.s, base)) {
- members[j].name = cfree_slice_cstr(base);
- members[j].bytes.data = new_fds[i].data;
- members[j].bytes.len = new_fds[i].size;
+ ar_set_input_member(&members[j], base, &new_fds[i]);
replaced = 1;
break;
}
}
if (!replaced) {
- members[nm].name = cfree_slice_cstr(base);
- members[nm].bytes.data = new_fds[i].data;
- members[nm].bytes.len = new_fds[i].size;
+ ar_set_input_member(&members[nm], base, &new_fds[i]);
nm++;
}
if (has_v)
driver_printf("%c - %.*s\n", replaced ? 'r' : 'a',
CFREE_SLICE_ARG(cfree_slice_cstr(base)));
} else {
- members[nm].name = cfree_slice_cstr(base);
- members[nm].bytes.data = new_fds[i].data;
- members[nm].bytes.len = new_fds[i].size;
+ ar_set_input_member(&members[nm], base, &new_fds[i]);
nm++;
if (has_v)
driver_printf("a - %.*s\n", CFREE_SLICE_ARG(cfree_slice_cstr(base)));
diff --git a/driver/runtime.c b/driver/runtime.c
@@ -152,6 +152,17 @@ static char* rt_join(DriverEnv* env, const char* a, const char* b,
return p;
}
+static char* rt_join_slot(DriverEnv* env, char** slot, size_t* size_slot,
+ const char* a, const char* b) {
+ char* p = rt_join(env, a, b, size_slot);
+ *slot = p;
+ return p;
+}
+
+static void rt_store_const_ptr(const char** slot, const char* value) {
+ *slot = value;
+}
+
static int rt_basename_is(const char* path, const char* name) {
const char* base;
const char* end;
@@ -352,8 +363,11 @@ int driver_runtime_add_freestanding_headers(const DriverRuntimeSupport* s,
int driver_runtime_append_freestanding_headers(const DriverRuntimeSupport* s,
DriverCflags* cf) {
+ uint32_t i;
if (!s || !s->include_dir || !cf) return 1;
- cf->system_include_dirs[cf->nsystem_include_dirs++] = s->include_dir;
+ i = cf->nsystem_include_dirs;
+ cf->system_include_dirs[i] = s->include_dir;
+ cf->nsystem_include_dirs = i + 1u;
return 0;
}
@@ -383,7 +397,6 @@ static int rt_prepare_pp(DriverEnv* env, const DriverRuntimeSupport* support,
uint32_t ninc = variant->ldbl128 ? 4u : 3u;
uint32_t nsys = 2u;
uint32_t ndefs = 1u + (variant->ldbl128 ? 1u : 0u) + (assembler ? 1u : 0u);
- uint32_t i = 0;
uint32_t d = 0;
CfreePreprocessOptions z = {0};
@@ -413,22 +426,31 @@ static int rt_prepare_pp(DriverEnv* env, const DriverRuntimeSupport* support,
return 1;
}
- dirs[d] = rt_join(env, support->rt_root, "lib/include/common", &sizes[d]);
- include_dirs[i++] = dirs[d++];
- dirs[d] = rt_join(env, support->rt_root, "lib/impl", &sizes[d]);
- include_dirs[i++] = dirs[d++];
- dirs[d] = rt_join(env, support->rt_root, variant->abi_include, &sizes[d]);
- include_dirs[i++] = dirs[d++];
+ rt_join_slot(env, &dirs[0], &sizes[0], support->rt_root,
+ "lib/include/common");
+ rt_store_const_ptr(&include_dirs[0], dirs[0]);
+ rt_join_slot(env, &dirs[1], &sizes[1], support->rt_root, "lib/impl");
+ rt_store_const_ptr(&include_dirs[1], dirs[1]);
+ rt_join_slot(env, &dirs[2], &sizes[2], support->rt_root,
+ variant->abi_include);
+ rt_store_const_ptr(&include_dirs[2], dirs[2]);
if (variant->ldbl128) {
- dirs[d] = rt_join(env, support->rt_root, "lib/include/lp64_le_ldbl128",
- &sizes[d]);
- include_dirs[i++] = dirs[d++];
+ rt_join_slot(env, &dirs[3], &sizes[3], support->rt_root,
+ "lib/include/lp64_le_ldbl128");
+ rt_store_const_ptr(&include_dirs[3], dirs[3]);
+ rt_join_slot(env, &dirs[4], &sizes[4], support->rt_root, "include");
+ rt_store_const_ptr(&system_dirs[0], dirs[4]);
+ rt_join_slot(env, &dirs[5], &sizes[5], support->rt_root, "include/libc");
+ rt_store_const_ptr(&system_dirs[1], dirs[5]);
+ d = 6;
+ } else {
+ rt_join_slot(env, &dirs[3], &sizes[3], support->rt_root, "include");
+ rt_store_const_ptr(&system_dirs[0], dirs[3]);
+ rt_join_slot(env, &dirs[4], &sizes[4], support->rt_root, "include/libc");
+ rt_store_const_ptr(&system_dirs[1], dirs[4]);
+ d = 5;
}
- dirs[d] = rt_join(env, support->rt_root, "include", &sizes[d]);
- system_dirs[0] = dirs[d++];
- dirs[d] = rt_join(env, support->rt_root, "include/libc", &sizes[d]);
- system_dirs[1] = dirs[d++];
- for (i = 0; i < d; ++i) {
+ for (uint32_t i = 0; i < d; ++i) {
if (!dirs[i]) {
uint32_t j;
for (j = 0; j < d; ++j)
@@ -445,7 +467,7 @@ static int rt_prepare_pp(DriverEnv* env, const DriverRuntimeSupport* support,
defs[0].name = CFREE_SLICE_LIT("HAS_INT128");
defs[0].body = variant->has_int128 ? def_int128_1 : def_int128_0;
- i = 1;
+ uint32_t i = 1;
if (variant->ldbl128) {
defs[i].name = CFREE_SLICE_LIT("CFREERT_LDBL128");
defs[i].body = CFREE_SLICE_LIT("1");
diff --git a/lang/c/parse/parse_init.c b/lang/c/parse/parse_init.c
@@ -22,6 +22,15 @@ static CKw ident_kw_init(const Parser* p, Sym name) {
return ident_kw_inline(p, name);
}
+static const Type* init_field_type_at(const Type* ty, u16 i) {
+ const Field* f = &ty->rec.fields[i];
+ return f->type;
+}
+
+static u32 init_field_offset_at(const ABIRecordLayout* L, u16 i) {
+ return L->fields[i].offset;
+}
+
/* True if `ty` is char/signed char/unsigned char. */
int is_char_kind(const Type* ty) {
if (!ty) return 0;
@@ -224,9 +233,10 @@ static void emit_walk_copy(Parser* p, FrameSlot dst_slot,
for (u16 i = 0; i < ty->rec.nfields; ++i) {
const Field* f = &ty->rec.fields[i];
if (f->flags & FIELD_BITFIELD) continue;
- u32 foff = L->fields[i].offset;
+ const Type* fty = init_field_type_at(ty, i);
+ u32 foff = init_field_offset_at(L, i);
emit_walk_copy(p, dst_slot, dst_arr_ty, dst_off + foff, src_ptr_slot,
- src_ptr_ty, src_off + foff, f->type);
+ src_ptr_ty, src_off + foff, fty);
}
return;
}
@@ -297,7 +307,11 @@ static void zero_init_at(Parser* p, FrameSlot slot, const Type* arr_ty,
cg_drop(p->cg);
continue;
}
- zero_init_at(p, slot, arr_ty, offset + L->fields[i].offset, f->type);
+ {
+ const Type* fty = init_field_type_at(ty, i);
+ u32 foff = init_field_offset_at(L, i);
+ zero_init_at(p, slot, arr_ty, offset + foff, fty);
+ }
}
return;
}
@@ -622,7 +636,9 @@ static void init_aggregate_remainder(Parser* p, FrameSlot slot,
cg_drop(p->cg);
}
} else {
- zero_init_at(p, slot, arr_ty, offset + L->fields[i].offset, f->type);
+ const Type* fty = init_field_type_at(ty, i);
+ u32 foff = init_field_offset_at(L, i);
+ zero_init_at(p, slot, arr_ty, offset + foff, fty);
}
}
return;
@@ -1359,8 +1375,9 @@ static void parse_static_aggregate_remainder(Parser* p, u8* buf, u32 buflen,
f->type);
}
} else {
- parse_static_init_at(p, buf, buflen, offset + L->fields[i].offset,
- f->type);
+ const Type* fty = init_field_type_at(ty, i);
+ u32 foff = init_field_offset_at(L, i);
+ parse_static_init_at(p, buf, buflen, offset + foff, fty);
}
if (i + 1u >= ty->rec.nfields) return;
if (!accept_punct(p, ',')) break;
@@ -1493,8 +1510,9 @@ void parse_static_init_at(Parser* p, u8* buf, u32 buflen, u32 offset,
f->type);
}
} else {
- parse_static_init_at(p, buf, buflen, offset + L->fields[i].offset,
- f->type);
+ const Type* fty = init_field_type_at(ty, i);
+ u32 foff = init_field_offset_at(L, i);
+ parse_static_init_at(p, buf, buflen, offset + foff, fty);
}
++i;
if (!accept_punct(p, ',')) break;
diff --git a/lang/c/parse/parse_type.c b/lang/c/parse/parse_type.c
@@ -1044,6 +1044,11 @@ const Type* parse_struct_or_union(Parser* p, TypeKind kind,
te = tag_define(p, tag_name, tdk, target, /*complete=*/0);
}
}
+ if (te) {
+ attr_list_append(&te->attrs, rec_attrs);
+ } else if (anon_attrs_out) {
+ attr_list_append(anon_attrs_out, rec_attrs);
+ }
expect_punct(p, '{', "'{' to start aggregate body");
TypeRecordOpts begin_opts;
memset(&begin_opts, 0, sizeof begin_opts);
@@ -1058,11 +1063,17 @@ const Type* parse_struct_or_union(Parser* p, TypeKind kind,
begin_opts);
parse_member_decls(p, b);
expect_punct(p, '}', "'}' after aggregate body");
- parse_attrs_into(p, &rec_attrs);
- if (te) {
- attr_list_append(&te->attrs, rec_attrs);
- } else if (anon_attrs_out) {
- attr_list_append(anon_attrs_out, rec_attrs);
+ TypeRecordOpts trailing_opts;
+ memset(&trailing_opts, 0, sizeof trailing_opts);
+ {
+ Attr* trailing_attrs = NULL;
+ parse_attrs_into(p, &trailing_attrs);
+ attrs_to_record_opts(trailing_attrs, &trailing_opts);
+ if (te) {
+ attr_list_append(&te->attrs, trailing_attrs);
+ } else if (anon_attrs_out) {
+ attr_list_append(anon_attrs_out, trailing_attrs);
+ }
}
{
const Type* fresh = type_record_end(p->pool, b);
@@ -1071,14 +1082,9 @@ const Type* parse_struct_or_union(Parser* p, TypeKind kind,
target->rec.align_override = fresh->rec.align_override;
type_record_install(target, (Field*)fresh->rec.fields, fresh->rec.nfields);
}
- {
- TypeRecordOpts opts;
- memset(&opts, 0, sizeof opts);
- attrs_to_record_opts(rec_attrs, &opts);
- if (opts.packed) target->rec.packed = 1;
- if (opts.align_override > target->rec.align_override)
- target->rec.align_override = opts.align_override;
- }
+ if (trailing_opts.packed) target->rec.packed = 1;
+ if (trailing_opts.align_override > target->rec.align_override)
+ target->rec.align_override = trailing_opts.align_override;
if (te) te->complete = 1;
return target;
}
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -1128,7 +1128,7 @@ static u32 aa_call_plan_stack_raw_size(const CGCallPlan* p) {
const CGCallPlanMove* m = &p->args[i];
if (m->dst_kind == CG_CALL_PLAN_STACK ||
m->dst_kind == CG_CALL_PLAN_TAIL_STACK) {
- u32 end = m->stack_offset + 8u;
+ u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u);
if (end > size) size = end;
}
}
@@ -1616,6 +1616,16 @@ static void aa_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
return;
}
+ {
+ u32 needed = (aa_call_plan_stack_raw_size(p) + 15u) & ~15u;
+ if (needed > a->max_outgoing) {
+ if (a->known_frame)
+ compiler_panic(t->c, a->loc,
+ "aarch64 call plan: known frame outgoing area too small");
+ a->max_outgoing = needed;
+ }
+ }
+
if (p->callee.kind == OPK_GLOBAL) {
u32 bl_pos = mc->pos(mc);
aa64_emit32(mc, aa64_bl_base());
diff --git a/src/arch/c_target/target.c b/src/arch/c_target/target.c
@@ -81,13 +81,6 @@ void c_fence(CGTarget*, MemOrder);
((CTarget*)t)->cur_fn->loc : (SrcLoc){0,0,0}, \
"C target: method " name " not implemented")
-static void c_unimpl_func_begin_known_frame(CGTarget* t, const CGFuncDesc* f,
- const CGKnownFrameDesc* k,
- FrameSlot* out) {
- (void)f; (void)k; (void)out;
- C_UNIMPL("func_begin_known_frame");
-}
-
static void c_unimpl_spill_reg(CGTarget* t, Operand a, FrameSlot s,
MemAccess m) {
(void)a; (void)s; (void)m;
@@ -190,7 +183,7 @@ CGTarget* c_cgtarget_new(Compiler* c, ObjBuilder* o, CfreeWriter* w) {
/* ---- function lifecycle ---- */
t->func_begin = c_func_begin;
- t->func_begin_known_frame = c_unimpl_func_begin_known_frame;
+ t->func_begin_known_frame = NULL;
t->func_end = c_func_end;
t->alias = c_alias;
diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c
@@ -918,7 +918,7 @@ static u32 rv_call_plan_stack_raw_size(const CGCallPlan* p) {
const CGCallPlanMove* m = &p->args[i];
if (m->dst_kind == CG_CALL_PLAN_STACK ||
m->dst_kind == CG_CALL_PLAN_TAIL_STACK) {
- u32 end = m->stack_offset + 8u;
+ u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u);
if (end > size) size = end;
}
}
@@ -1433,6 +1433,16 @@ static void rv_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
return;
}
+ {
+ u32 needed = (rv_call_plan_stack_raw_size(p) + 15u) & ~15u;
+ if (needed > a->max_outgoing) {
+ if (a->known_frame)
+ compiler_panic(t->c, a->loc,
+ "rv64 call plan: known frame outgoing area too small");
+ a->max_outgoing = needed;
+ }
+ }
+
if (p->callee.kind == OPK_GLOBAL) {
u32 sec = mc->section_id;
u32 pos = mc->pos(mc);
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -1156,7 +1156,7 @@ static u32 x_call_plan_stack_raw_size(const CGCallPlan* p) {
const CGCallPlanMove* m = &p->args[i];
if (m->dst_kind == CG_CALL_PLAN_STACK ||
m->dst_kind == CG_CALL_PLAN_TAIL_STACK) {
- u32 end = m->stack_offset + 8u;
+ u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u);
if (end > size) size = end;
}
}
@@ -1695,6 +1695,7 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
}
static void x_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
+ XImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
if (p->is_variadic)
@@ -1708,6 +1709,16 @@ static void x_emit_call_plan(CGTarget* t, const CGCallPlan* p) {
return;
}
+ {
+ u32 needed = (x_call_plan_stack_raw_size(p) + 15u) & ~15u;
+ if (needed > a->max_outgoing) {
+ if (a->known_frame)
+ compiler_panic(t->c, a->loc,
+ "x64 call plan: known frame outgoing area too small");
+ a->max_outgoing = needed;
+ }
+ }
+
if (p->callee.kind == OPK_GLOBAL) {
u8 op = 0xE8;
mc->emit_bytes(mc, &op, 1);
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -323,6 +323,15 @@ typedef struct Block {
MCLabel mc_label;
} Block;
+typedef struct MFunc {
+ Block* blocks;
+ u32 nblocks;
+ u32 entry;
+ u32* emit_order;
+ u32 emit_order_n;
+ u32 emit_order_cap;
+} MFunc;
+
typedef enum OptAllocKind {
OPT_ALLOC_NONE,
OPT_ALLOC_HARD,
@@ -330,6 +339,19 @@ typedef enum OptAllocKind {
OPT_ALLOC_SPLIT,
} OptAllocKind;
+typedef enum OptLocKind {
+ OPT_LOC_NONE,
+ OPT_LOC_HARD,
+ OPT_LOC_STACK,
+} OptLocKind;
+
+typedef struct OptLoc {
+ u8 kind;
+ u8 cls;
+ Reg hard_reg;
+ FrameSlot spill_slot;
+} OptLoc;
+
typedef struct OptAllocSegment {
u32 start;
u32 end;
@@ -461,6 +483,8 @@ typedef struct Func {
u64 opt_coalesce_merges;
InstId next_inst_id;
OptPRegInfo* preg_info; /* indexed by the current allocation reg namespace */
+ OptLoc* preg_locs; /* canonical final allocation locations by PReg */
+ MFunc* mir; /* physical post-allocation IR; HIR stays virtual */
u32* opt_coalesce_parent;
u32* opt_coalesce_size;
OptAllocSegment* opt_alloc_segments;
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -442,12 +442,6 @@ static CGLocalStorage w_param(CGTarget* t, const CGParamDesc* d) {
Inst* in = rec(o, IR_PARAM_DECL);
in->def = (Val)st.v.reg;
in->type = d->type;
- in->opnds = arena_array(o->f->arena, Operand, 1);
- in->opnds[0].kind = OPK_REG;
- in->opnds[0].cls = opt_local_reg_class(o, d->type);
- in->opnds[0].type = d->type;
- in->opnds[0].v.reg = st.v.reg;
- in->nopnds = 1;
}
return st;
}
@@ -1294,17 +1288,17 @@ static int inst_spill_local(Func* f, const Inst* in, u32 op_idx) {
static u64 func_spill_alloc_count(Func* f) {
u64 n = 0;
- if (!f || !f->preg_info) return 0;
+ if (!f || (!f->preg_locs && !f->preg_info)) return 0;
for (PReg r = 1; r < opt_reg_count(f); ++r)
- if (f->preg_info[r].alloc_kind == OPT_ALLOC_SPILL) ++n;
+ if (opt_preg_alloc_kind(f, r) == OPT_ALLOC_SPILL) ++n;
return n;
}
-static u64 func_spill_load_count(Func* f) {
+static u64 blocks_spill_load_count(Func* f, Block* blocks, u32 nblocks) {
u64 n = 0;
- if (!f) return 0;
- for (u32 b = 0; b < f->nblocks; ++b) {
- Block* bl = &f->blocks[b];
+ if (!f || !blocks) return 0;
+ for (u32 b = 0; b < nblocks; ++b) {
+ Block* bl = &blocks[b];
for (u32 i = 0; i < bl->ninsts; ++i) {
Inst* in = &bl->insts[i];
if ((IROp)in->op == IR_LOAD && inst_spill_local(f, in, 1)) ++n;
@@ -1313,11 +1307,18 @@ static u64 func_spill_load_count(Func* f) {
return n;
}
-static u64 func_spill_store_count(Func* f) {
- u64 n = 0;
+static u64 func_spill_load_count(Func* f) {
if (!f) return 0;
- for (u32 b = 0; b < f->nblocks; ++b) {
- Block* bl = &f->blocks[b];
+ if (f->mir)
+ return blocks_spill_load_count(f, f->mir->blocks, f->mir->nblocks);
+ return blocks_spill_load_count(f, f->blocks, f->nblocks);
+}
+
+static u64 blocks_spill_store_count(Func* f, Block* blocks, u32 nblocks) {
+ u64 n = 0;
+ if (!f || !blocks) return 0;
+ for (u32 b = 0; b < nblocks; ++b) {
+ Block* bl = &blocks[b];
for (u32 i = 0; i < bl->ninsts; ++i) {
Inst* in = &bl->insts[i];
if ((IROp)in->op == IR_STORE && inst_spill_local(f, in, 0)) ++n;
@@ -1326,8 +1327,16 @@ static u64 func_spill_store_count(Func* f) {
return n;
}
+static u64 func_spill_store_count(Func* f) {
+ if (!f) return 0;
+ if (f->mir)
+ return blocks_spill_store_count(f, f->mir->blocks, f->mir->nblocks);
+ return blocks_spill_store_count(f, f->blocks, f->nblocks);
+}
+
static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope,
int allow_live_range_split) {
+ (void)allow_live_range_split;
metrics_scope_begin(o->c, total_scope);
metrics_count(o->c, "opt.funcs", 1);
metrics_count(o->c, "opt.blocks", o->f->nblocks);
@@ -1351,18 +1360,27 @@ static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope,
metrics_scope_begin(o->c, "opt.machinize");
opt_machinize(o->f, o->target);
metrics_scope_end(o->c, "opt.machinize");
- /* Fold address-of-local pseudos into direct memory operands. O2 runs the
- * SSA variant in opt_cleanup; here is the equivalent for the O1 pipeline
- * which operates on the PReg namespace directly. */
- metrics_scope_begin(o->c, "opt.addr_xform_pregs");
+ metrics_scope_begin(o->c, "opt.machinize.verify");
+ opt_verify(o->f, "lowering-machinize");
+ metrics_scope_end(o->c, "opt.machinize.verify");
+ metrics_scope_begin(o->c, "opt.o1.addr_xform_pregs");
opt_addr_xform_pregs(o->f);
- metrics_scope_end(o->c, "opt.addr_xform_pregs");
- metrics_scope_begin(o->c, "opt.promote_scalar_locals");
+ metrics_scope_end(o->c, "opt.o1.addr_xform_pregs");
+ metrics_scope_begin(o->c, "opt.o1.addr_xform.verify");
+ opt_verify(o->f, "o1-addr-xform");
+ metrics_scope_end(o->c, "opt.o1.addr_xform.verify");
+ metrics_scope_begin(o->c, "opt.o1.promote_scalar_locals");
opt_promote_scalar_locals(o->f);
- metrics_scope_end(o->c, "opt.promote_scalar_locals");
- metrics_scope_begin(o->c, "opt.addr_of_global_cse");
+ metrics_scope_end(o->c, "opt.o1.promote_scalar_locals");
+ metrics_scope_begin(o->c, "opt.o1.promote_scalar.verify");
+ opt_verify(o->f, "o1-promote-scalar");
+ metrics_scope_end(o->c, "opt.o1.promote_scalar.verify");
+ metrics_scope_begin(o->c, "opt.o1.addr_of_global_cse");
opt_addr_of_global_cse(o->f);
- metrics_scope_end(o->c, "opt.addr_of_global_cse");
+ metrics_scope_end(o->c, "opt.o1.addr_of_global_cse");
+ metrics_scope_begin(o->c, "opt.o1.addr_of_global.verify");
+ opt_verify(o->f, "o1-addr-global-cse");
+ metrics_scope_end(o->c, "opt.o1.addr_of_global.verify");
metrics_scope_begin(o->c, "opt.build_loop_tree");
opt_build_loop_tree(o->f);
metrics_scope_end(o->c, "opt.build_loop_tree");
@@ -1387,7 +1405,8 @@ static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope,
o->f->opt_dde_live_words_touched);
metrics_scope_end(o->c, "opt.dead_def_elim");
metrics_scope_begin(o->c, "opt.regalloc");
- opt_regalloc(o->f, allow_live_range_split);
+ OptLiveInfo regalloc_live;
+ opt_regalloc_locations(o->f, 0, ®alloc_live);
metrics_count(o->c, "opt.alloc.used_loc_words", o->f->opt_used_loc_words);
metrics_count(o->c, "opt.alloc.hard_loc_words",
o->f->opt_alloc_hard_loc_words);
@@ -1406,30 +1425,46 @@ static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope,
metrics_count(o->c, "opt.alloc.stack_mark_points",
o->f->opt_alloc_stack_mark_points);
metrics_count(o->c, "opt.alloc.spills", func_spill_alloc_count(o->f));
+ metrics_scope_end(o->c, "opt.regalloc");
+ metrics_scope_begin(o->c, "opt.regalloc.verify");
+ opt_analysis_invalidate(o->f, OPT_ANALYSIS_DEF_USE);
+ opt_verify(o->f, "post-regalloc");
+ metrics_scope_end(o->c, "opt.regalloc.verify");
+ metrics_scope_begin(o->c, "opt.lower_mir");
+ opt_lower_to_mir(o->f, ®alloc_live);
metrics_count(o->c, "opt.rewrite.reloads", func_spill_load_count(o->f));
metrics_count(o->c, "opt.rewrite.stores", func_spill_store_count(o->f));
metrics_count(o->c, "opt.rewrite.inserted_insts",
o->f->opt_rewrite_inserted_insts);
metrics_count(o->c, "opt.rewrite.live_words_touched",
o->f->opt_rewrite_live_words_touched);
- metrics_scope_end(o->c, "opt.regalloc");
+ metrics_scope_end(o->c, "opt.lower_mir");
+ metrics_scope_begin(o->c, "opt.lower_mir.verify");
+ opt_mir_verify(o->f, "lower-mir");
+ metrics_scope_end(o->c, "opt.lower_mir.verify");
metrics_scope_begin(o->c, "opt.combine");
- opt_combine(o->f);
+ opt_mir_combine(o->f);
metrics_scope_end(o->c, "opt.combine");
+ metrics_scope_begin(o->c, "opt.combine.verify");
+ opt_mir_verify(o->f, "post-mir-combine");
+ metrics_scope_end(o->c, "opt.combine.verify");
metrics_scope_begin(o->c, "opt.dce");
- opt_dce(o->f);
+ opt_mir_dce(o->f);
metrics_scope_end(o->c, "opt.dce");
+ metrics_scope_begin(o->c, "opt.dce.verify");
+ opt_mir_verify(o->f, "post-mir-dce");
+ metrics_scope_end(o->c, "opt.dce.verify");
metrics_scope_begin(o->c, "opt.post_ra.jump_cleanup_cfg");
- opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG);
+ opt_mir_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG);
metrics_scope_end(o->c, "opt.post_ra.jump_cleanup_cfg");
metrics_scope_begin(o->c, "opt.post_ra.build_cfg");
- opt_build_cfg(o->f);
+ opt_mir_build_cfg(o->f);
metrics_scope_end(o->c, "opt.post_ra.build_cfg");
metrics_scope_begin(o->c, "opt.post_ra.verify");
- opt_verify(o->f, "post-ra-jump-cfg");
+ opt_mir_verify(o->f, "post-mir-jump-cfg");
metrics_scope_end(o->c, "opt.post_ra.verify");
metrics_scope_begin(o->c, "opt.post_ra.jump_cleanup_layout");
- opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_LAYOUT);
+ opt_mir_jump_cleanup(o->f, OPT_JUMP_CLEANUP_LAYOUT);
metrics_scope_end(o->c, "opt.post_ra.jump_cleanup_layout");
metrics_scope_begin(o->c, "opt.emit");
opt_emit(o->c, o->f, o->target);
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -83,6 +83,7 @@ void opt_machinize(Func*, CGTarget* target);
void opt_build_loop_tree(Func*);
typedef struct OptBitset {
+ Arena* arena;
u64* words;
u32 nwords;
u32 active_words;
@@ -145,19 +146,6 @@ typedef struct OptLiveRangeSet {
u64 live_words_touched;
} OptLiveRangeSet;
-typedef enum OptLocKind {
- OPT_LOC_NONE,
- OPT_LOC_HARD,
- OPT_LOC_STACK,
-} OptLocKind;
-
-typedef struct OptLoc {
- u8 kind;
- u8 cls;
- Reg hard_reg;
- FrameSlot spill_slot;
-} OptLoc;
-
typedef void (*OptBitsetIterFn)(PReg, void*);
void opt_bitset_clear(OptBitset*);
@@ -177,7 +165,15 @@ void opt_ir_dump(Func*, Writer*);
void opt_ssa_dump(Func*, Writer*);
void opt_rewrite_dump(Func*, Writer*);
void opt_coalesce(Func*);
+void opt_regalloc_locations(Func*, int allow_live_range_split,
+ OptLiveInfo* live_out);
void opt_regalloc(Func*, int allow_live_range_split);
+void opt_lower_to_mir(Func*, const OptLiveInfo*);
+void opt_mir_combine(Func*);
+void opt_mir_dce(Func*);
+void opt_mir_jump_cleanup(Func*, OptJumpCleanupStage);
+void opt_mir_build_cfg(Func*);
+void opt_mir_verify(Func*, const char* stage);
void opt_combine(Func*); /* code selection: merge dependent insns */
void opt_dce(Func*); /* post-RA DCE */
void opt_dead_def_elim(Func*); /* pre-RA dead-definition elimination */
diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h
@@ -87,6 +87,51 @@ static inline u8 opt_reg_cls(const Func* f, PReg r) {
return f->preg_cls[r];
}
+static inline const OptLoc* opt_preg_loc(const Func* f, PReg r) {
+ if (!f || !f->preg_locs || !opt_reg_valid(f, r)) return NULL;
+ return &f->preg_locs[r];
+}
+
+static inline u8 opt_preg_alloc_kind(const Func* f, PReg r) {
+ const OptLoc* loc = opt_preg_loc(f, r);
+ if (loc) {
+ switch ((OptLocKind)loc->kind) {
+ case OPT_LOC_HARD:
+ return OPT_ALLOC_HARD;
+ case OPT_LOC_STACK:
+ return OPT_ALLOC_SPILL;
+ case OPT_LOC_NONE:
+ default:
+ break;
+ }
+ }
+ if (!f || !f->preg_info || !opt_reg_valid(f, r)) return OPT_ALLOC_NONE;
+ return f->preg_info[r].alloc_kind;
+}
+
+static inline u8 opt_preg_loc_cls(const Func* f, PReg r) {
+ const OptLoc* loc = opt_preg_loc(f, r);
+ if (loc && loc->kind != OPT_LOC_NONE) return loc->cls;
+ if (f && f->preg_info && opt_reg_valid(f, r)) return f->preg_info[r].cls;
+ return opt_reg_cls(f, r);
+}
+
+static inline Reg opt_preg_hard_reg(const Func* f, PReg r) {
+ const OptLoc* loc = opt_preg_loc(f, r);
+ if (loc && loc->kind == OPT_LOC_HARD) return loc->hard_reg;
+ if (f && f->preg_info && opt_reg_valid(f, r))
+ return f->preg_info[r].hard_reg;
+ return REG_NONE;
+}
+
+static inline FrameSlot opt_preg_spill_slot(const Func* f, PReg r) {
+ const OptLoc* loc = opt_preg_loc(f, r);
+ if (loc && loc->kind == OPT_LOC_STACK) return loc->spill_slot;
+ if (f && f->preg_info && opt_reg_valid(f, r))
+ return f->preg_info[r].spill_slot;
+ return FRAME_SLOT_NONE;
+}
+
void opt_analysis_mark_valid(Func*, u32 flags);
void opt_analysis_invalidate(Func*, u32 flags);
int opt_analysis_has(Func*, u32 flags);
diff --git a/src/opt/opt_util.c b/src/opt/opt_util.c
@@ -1,5 +1,7 @@
#include "opt/opt_internal.h"
+#include <string.h>
+
int opt_val_in_inst_defs(const Inst* in, Val v) {
if (!in || v == VAL_NONE) return 0;
if (in->def == v) return 1;
@@ -66,6 +68,19 @@ void opt_walk_inst_operands(Func* f, Inst* in, OptOperandWalkFn fn, void* ctx) {
}
switch ((IROp)in->op) {
+ case IR_PARAM_DECL: {
+ Operand def;
+ if (in->def == VAL_NONE) break;
+ memset(&def, 0, sizeof def);
+ def.kind = OPK_REG;
+ def.cls =
+ (f->preg_info && in->def < opt_reg_count(f)) ? f->preg_info[in->def].cls
+ : 0;
+ def.type = in->type;
+ def.v.reg = (Reg)in->def;
+ opt_walk_operand(f, in, &def, 1, fn, ctx);
+ break;
+ }
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (!aux) break;
diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c
@@ -38,6 +38,158 @@ static int verify_stage_is_ssa(const char* stage) {
return stage && slice_contains_cstr(s, "ssa") &&
!slice_contains_cstr(s, "pre-ssa");
}
+
+static u8 verify_type_reg_class(Func* f, CfreeCgTypeId ty) {
+ CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty);
+ return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT;
+}
+
+static void verify_frame_slot(Func* f, const char* stage, FrameSlot slot,
+ const char* msg) {
+ if (slot == FRAME_SLOT_NONE || slot > f->nframe_slots)
+ opt_fail(f, stage, msg, slot, f->nframe_slots);
+}
+
+static void verify_storage(Func* f, const char* stage, CGLocalStorage st,
+ CfreeCgTypeId type, u8 expect_cls,
+ const char* msg) {
+ switch ((CGLocalStorageKind)st.kind) {
+ case CG_LOCAL_STORAGE_FRAME:
+ verify_frame_slot(f, stage, st.v.frame_slot, msg);
+ break;
+ case CG_LOCAL_STORAGE_REG: {
+ PReg r = (PReg)st.v.reg;
+ if (r == PREG_NONE || r == 0 || r >= opt_reg_count(f))
+ opt_fail(f, stage, msg, r, opt_reg_count(f));
+ if (!f->opt_reg_ssa && f->preg_cls && f->preg_cls[r] != expect_cls)
+ opt_fail(f, stage, "storage class mismatch", r, f->preg_cls[r]);
+ if (!f->opt_reg_ssa && type && f->preg_type && f->preg_type[r] &&
+ f->preg_type[r] != type)
+ opt_fail(f, stage, "storage type mismatch", r, type);
+ break;
+ }
+ default:
+ opt_fail(f, stage, "bad storage kind", st.kind, 0);
+ break;
+ }
+}
+
+static void verify_operand_shape(Func* f, const char* stage, const Operand* op,
+ int physical_regs) {
+ if (!op) return;
+ switch ((OpKind)op->kind) {
+ case OPK_IMM:
+ case OPK_GLOBAL:
+ break;
+ case OPK_LOCAL:
+ verify_frame_slot(f, stage, op->v.frame_slot, "bad local frame slot");
+ break;
+ case OPK_REG:
+ if (op->cls >= OPT_REG_CLASSES)
+ opt_fail(f, stage, "bad operand class", op->cls, OPT_REG_CLASSES);
+ if (physical_regs) {
+ if (op->v.reg == (Reg)REG_NONE || op->v.reg >= OPT_MAX_HARD_REGS)
+ opt_fail(f, stage, "bad physical operand reg", op->v.reg,
+ OPT_MAX_HARD_REGS);
+ }
+ break;
+ case OPK_INDIRECT:
+ if (op->cls >= OPT_REG_CLASSES)
+ opt_fail(f, stage, "bad indirect class", op->cls, OPT_REG_CLASSES);
+ if (physical_regs) {
+ if (op->v.ind.base == (Reg)REG_NONE ||
+ op->v.ind.base >= OPT_MAX_HARD_REGS)
+ opt_fail(f, stage, "bad physical indirect base", op->v.ind.base,
+ OPT_MAX_HARD_REGS);
+ if (op->v.ind.index != (Reg)REG_NONE &&
+ op->v.ind.index >= OPT_MAX_HARD_REGS)
+ opt_fail(f, stage, "bad physical indirect index", op->v.ind.index,
+ OPT_MAX_HARD_REGS);
+ }
+ break;
+ default:
+ opt_fail(f, stage, "bad operand kind", op->kind, 0);
+ break;
+ }
+}
+
+static void verify_abivalue_shape(Func* f, const char* stage, CGABIValue* v,
+ int physical_regs) {
+ if (!v) return;
+ verify_operand_shape(f, stage, &v->storage, physical_regs);
+ for (u32 i = 0; i < v->nparts; ++i)
+ verify_operand_shape(f, stage, &v->parts[i].op, physical_regs);
+}
+
+static void verify_call_plan_shape(Func* f, const char* stage,
+ CGCallPlan* plan, int physical_regs) {
+ if (!plan) return;
+ verify_operand_shape(f, stage, &plan->callee, physical_regs);
+ for (u32 i = 0; i < plan->nargs; ++i) {
+ verify_operand_shape(f, stage, &plan->args[i].src, physical_regs);
+ if (plan->args[i].dst_kind == CG_CALL_PLAN_REG &&
+ plan->args[i].dst_reg >= OPT_MAX_HARD_REGS)
+ opt_fail(f, stage, "bad call-plan dst reg", plan->args[i].dst_reg,
+ OPT_MAX_HARD_REGS);
+ }
+ for (u32 i = 0; i < plan->nrets; ++i) {
+ verify_operand_shape(f, stage, &plan->rets[i].dst, physical_regs);
+ if (plan->rets[i].src_reg >= OPT_MAX_HARD_REGS)
+ opt_fail(f, stage, "bad call-plan ret reg", plan->rets[i].src_reg,
+ OPT_MAX_HARD_REGS);
+ }
+}
+
+static void verify_aux_shapes(Func* f, const char* stage, Inst* in,
+ int physical_regs) {
+ switch ((IROp)in->op) {
+ case IR_CALL: {
+ IRCallAux* aux = (IRCallAux*)in->extra.aux;
+ if (!aux) break;
+ if (aux->use_plan_replay) {
+ verify_call_plan_shape(f, stage, &aux->plan, physical_regs);
+ } else {
+ verify_operand_shape(f, stage, &aux->desc.callee, physical_regs);
+ for (u32 i = 0; i < aux->desc.nargs; ++i)
+ verify_abivalue_shape(f, stage, (CGABIValue*)&aux->desc.args[i],
+ physical_regs);
+ verify_abivalue_shape(f, stage, &aux->desc.ret, physical_regs);
+ }
+ break;
+ }
+ case IR_RET: {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ if (aux && aux->present)
+ verify_abivalue_shape(f, stage, &aux->val, physical_regs);
+ break;
+ }
+ case IR_SCOPE_BEGIN: {
+ IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
+ if (aux) verify_operand_shape(f, stage, &aux->desc.cond, physical_regs);
+ break;
+ }
+ case IR_ASM_BLOCK: {
+ IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 i = 0; i < aux->nin; ++i)
+ verify_operand_shape(f, stage, &aux->in_ops[i], physical_regs);
+ for (u32 i = 0; i < aux->nout; ++i)
+ verify_operand_shape(f, stage, &aux->out_ops[i], physical_regs);
+ break;
+ }
+ case IR_INTRINSIC: {
+ IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
+ if (!aux) break;
+ for (u32 i = 0; i < aux->narg; ++i)
+ verify_operand_shape(f, stage, &aux->args[i], physical_regs);
+ for (u32 i = 0; i < aux->ndst; ++i)
+ verify_operand_shape(f, stage, &aux->dsts[i], physical_regs);
+ break;
+ }
+ default:
+ break;
+ }
+}
#endif
void opt_analysis_mark_valid(Func* f, u32 flags) {
@@ -400,6 +552,9 @@ static void verify_values(Func* f, const char* stage) {
if (v == VAL_NONE || v >= ndefs)
opt_fail(f, stage, "bad inst multi-def", b, d);
}
+ for (u32 o = 0; o < in->nopnds; ++o)
+ verify_operand_shape(f, stage, &in->opnds[o], 0);
+ verify_aux_shapes(f, stage, in, 0);
opt_walk_inst_operands(f, in, verify_operand, (void*)stage);
if ((IROp)in->op == IR_PHI) {
IRPhiAux* aux = (IRPhiAux*)in->extra.aux;
@@ -424,7 +579,130 @@ static void verify_values(Func* f, const char* stage) {
f->val_type[aux->pred_vals[p]] != in->type)
opt_fail(f, stage, "phi input type mismatch", b, p);
}
+ } else if ((IROp)in->op == IR_PARAM_DECL) {
+ if (in->nopnds || in->opnds)
+ opt_fail(f, stage, "param_decl should not carry operands", b, i);
+ if (in->def == VAL_NONE)
+ opt_fail(f, stage, "param_decl missing def", b, i);
+ }
+ }
+ }
+}
+
+static void verify_function_storage(Func* f, const char* stage) {
+ for (u32 i = 0; i < f->nframe_slots; ++i) {
+ IRFrameSlot* s = &f->frame_slots[i];
+ if (s->id != i + 1u) opt_fail(f, stage, "frame slot id mismatch", s->id, i);
+ if (s->kind > FS_SPILL) opt_fail(f, stage, "bad frame slot kind", s->kind, i);
+ }
+ for (u32 i = 0; i < f->nparams; ++i) {
+ IRParam* p = &f->params[i];
+ u8 cls = verify_type_reg_class(f, p->type);
+ if (p->index != i) opt_fail(f, stage, "param index mismatch", p->index, i);
+ verify_storage(f, stage, p->storage, p->type, cls, "bad param storage");
+ }
+ for (u32 i = 0; i < f->nlocals; ++i) {
+ IRLocal* l = &f->locals[i];
+ u8 cls = verify_type_reg_class(f, l->desc.type);
+ verify_storage(f, stage, l->storage, l->desc.type, cls,
+ "bad local storage");
+ if (l->home_slot != FRAME_SLOT_NONE)
+ verify_frame_slot(f, stage, l->home_slot, "bad local home slot");
+ }
+}
+
+static void verify_allocations(Func* f, const char* stage) {
+ if (!f->preg_info && !f->preg_locs) return;
+ for (PReg r = 1; r < opt_reg_count(f); ++r) {
+ OptPRegInfo* pi = f->preg_info ? &f->preg_info[r] : NULL;
+ OptLoc* loc = f->preg_locs ? &f->preg_locs[r] : NULL;
+ u8 alloc_kind = opt_preg_alloc_kind(f, r);
+ u8 cls = opt_preg_loc_cls(f, r);
+ if (cls >= OPT_REG_CLASSES)
+ opt_fail(f, stage, "bad preg alloc class", r, cls);
+ if (pi && pi->alloc_kind != alloc_kind &&
+ !(pi->alloc_kind == OPT_ALLOC_SPLIT && alloc_kind == OPT_ALLOC_SPLIT))
+ opt_fail(f, stage, "alloc kind mirror mismatch", r, pi->alloc_kind);
+ switch ((OptAllocKind)alloc_kind) {
+ case OPT_ALLOC_NONE:
+ if (loc && loc->kind != OPT_LOC_NONE)
+ opt_fail(f, stage, "alloc location mismatch", r, loc->kind);
+ break;
+ case OPT_ALLOC_HARD:
+ if (opt_preg_hard_reg(f, r) == (Reg)REG_NONE ||
+ opt_preg_hard_reg(f, r) >= OPT_MAX_HARD_REGS)
+ opt_fail(f, stage, "bad hard allocation", r,
+ opt_preg_hard_reg(f, r));
+ if (pi && (pi->cls != cls || pi->hard_reg != opt_preg_hard_reg(f, r)))
+ opt_fail(f, stage, "hard alloc mirror mismatch", r, pi->hard_reg);
+ if (loc && (loc->kind != OPT_LOC_HARD || loc->cls != cls))
+ opt_fail(f, stage, "hard alloc location mismatch", r,
+ opt_preg_hard_reg(f, r));
+ break;
+ case OPT_ALLOC_SPILL:
+ verify_frame_slot(f, stage, opt_preg_spill_slot(f, r),
+ "bad spill slot");
+ if (f->frame_slots[opt_preg_spill_slot(f, r) - 1u].kind != FS_SPILL)
+ opt_fail(f, stage, "spill slot is not FS_SPILL", r,
+ opt_preg_spill_slot(f, r));
+ if (pi &&
+ (pi->cls != cls || pi->spill_slot != opt_preg_spill_slot(f, r)))
+ opt_fail(f, stage, "spill alloc mirror mismatch", r, pi->spill_slot);
+ if (loc && (loc->kind != OPT_LOC_STACK || loc->cls != cls))
+ opt_fail(f, stage, "spill alloc location mismatch", r,
+ opt_preg_spill_slot(f, r));
+ break;
+ case OPT_ALLOC_SPLIT:
+ if (!f->opt_first_segment_by_preg)
+ opt_fail(f, stage, "split allocation missing segments", r, 0);
+ if (loc && loc->kind != OPT_LOC_NONE)
+ opt_fail(f, stage, "split alloc location mismatch", r, loc->kind);
+ break;
+ default:
+ opt_fail(f, stage, "bad allocation kind", r, alloc_kind);
+ break;
+ }
+ }
+ if (f->opt_first_segment_by_preg && f->opt_alloc_segments) {
+ for (PReg r = 1; r < opt_reg_count(f); ++r) {
+ for (u32 si = f->opt_first_segment_by_preg[r]; si != OPT_RANGE_NONE;
+ si = f->opt_alloc_segments[si].next) {
+ OptAllocSegment* s = &f->opt_alloc_segments[si];
+ if (s->block >= f->nblocks) opt_fail(f, stage, "bad split block", r, si);
+ if (s->start > s->end) opt_fail(f, stage, "bad split range", r, si);
+ if (s->cls >= OPT_REG_CLASSES)
+ opt_fail(f, stage, "bad split class", r, s->cls);
+ if (s->loc_kind == OPT_LOC_HARD) {
+ if (s->hard_reg == (Reg)REG_NONE || s->hard_reg >= OPT_MAX_HARD_REGS)
+ opt_fail(f, stage, "bad split hard reg", r, s->hard_reg);
+ } else if (s->loc_kind == OPT_LOC_STACK) {
+ verify_frame_slot(f, stage, s->spill_slot, "bad split spill slot");
+ } else if (s->loc_kind != OPT_LOC_NONE) {
+ opt_fail(f, stage, "bad split loc kind", r, s->loc_kind);
+ }
+ }
+ }
+ }
+}
+
+static void verify_rewritten(Func* f, const char* stage) {
+ if (!f->opt_rewritten) return;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op == IR_PHI)
+ opt_fail(f, stage, "phi survived rewrite", b, i);
+ if ((IROp)in->op == IR_PARAM_DECL) {
+ if (in->nopnds || in->opnds)
+ opt_fail(f, stage, "param_decl carries operands after rewrite", b, i);
+ if (in->def == VAL_NONE || in->def >= opt_reg_count(f))
+ opt_fail(f, stage, "bad param_decl def after rewrite", b, i);
+ continue;
}
+ for (u32 o = 0; o < in->nopnds; ++o)
+ verify_operand_shape(f, stage, &in->opnds[o], 1);
+ verify_aux_shapes(f, stage, in, 1);
}
}
}
@@ -557,7 +835,10 @@ void opt_verify(Func* f, const char* stage) {
if (seen_emit[b]) opt_fail(f, stage, "duplicate emit block", b, i);
seen_emit[b] = 1;
}
+ verify_function_storage(f, stage);
+ verify_allocations(f, stage);
verify_values(f, stage);
+ verify_rewritten(f, stage);
verify_def_use(f, stage);
#endif
}
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -475,6 +475,26 @@ static int ctx_def_changed_since(const CombineCtx* ctx, u8 cls, Reg reg,
return ctx->last_def[cls][reg] > since_idx;
}
+static i32 ctx_prev_def_before(const CombineCtx* ctx, const Operand* reg,
+ i32 before_idx) {
+ if (!reg || reg->kind != OPK_REG) return -1;
+ for (i32 j = before_idx - 1; j >= 0; --j) {
+ const Inst* prev = &ctx->bl->insts[j];
+ if (inst_is_clobber_barrier(prev) || inst_defines_phys_reg(prev, reg))
+ return j;
+ }
+ return -1;
+}
+
+static void ctx_restore_removed_def(CombineCtx* ctx, const Operand* reg,
+ i32 removed_idx) {
+ if (!reg || reg->kind != OPK_REG) return;
+ if (reg->cls >= OPT_REG_CLASSES || reg->v.reg >= OPT_MAX_HARD_REGS) return;
+ if (ctx->last_def[reg->cls][reg->v.reg] == removed_idx)
+ ctx->last_def[reg->cls][reg->v.reg] =
+ ctx_prev_def_before(ctx, reg, removed_idx);
+}
+
/* ---- forward-scan use accounting (used for single-use checks) ---- */
/* Count phys uses of `def` within the live range starting just after
@@ -506,6 +526,18 @@ static int count_uses_in_live_range(const Block* bl, i32 prod_idx,
return n;
}
+static int use_after_clobber_before_redef(const Block* bl, i32 prod_idx,
+ const Operand* def) {
+ int saw_clobber = 0;
+ for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) {
+ const Inst* in = &bl->insts[i];
+ if (saw_clobber && inst_uses_phys_reg(in, def)) return 1;
+ if (inst_defines_phys_reg(in, def)) return 0;
+ if (inst_is_clobber_barrier(in)) saw_clobber = 1;
+ }
+ return 0;
+}
+
/* ---- ConvKind helpers (for combine_exts) ---- */
static u32 builtin_int_bytes(CfreeCgTypeId t) {
@@ -672,6 +704,10 @@ static int subst_consumer_operands(Inst* in, const Operand* def,
(op->v.ind.base == def->v.reg ||
(op->v.ind.index != (Reg)REG_NONE &&
op->v.ind.index == def->v.reg))) {
+ if ((IROp)in->op == IR_STORE || (IROp)in->op == IR_ATOMIC_STORE ||
+ (IROp)in->op == IR_BITFIELD_STORE || (IROp)in->op == IR_AGG_COPY ||
+ (IROp)in->op == IR_AGG_SET)
+ continue;
set_indirect_field(op, def->v.reg, src->v.reg);
++n;
}
@@ -947,6 +983,8 @@ static int try_sink(CombineCtx* ctx, Inst* in, i32 i) {
int killed = 0;
int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &src, &killed);
if (uses_total != 1) return 0;
+ if (killed && use_after_clobber_before_redef(ctx->bl, prod_idx, &src))
+ return 0;
if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
ctx->bl->id, &src))
return 0;
@@ -962,8 +1000,10 @@ static int try_sink(CombineCtx* ctx, Inst* in, i32 i) {
prod->opnds[0] = dst;
if (prod->def != VAL_NONE) prod->def = (Val)dst.v.reg;
- /* Update last-def: producer no longer defines src, now defines dst. */
- ctx->last_def[src.cls][src.v.reg] = -1;
+ /* Update last-def: producer no longer defines src, now defines dst. If src
+ * had an earlier reaching definition in the block, keep it visible for
+ * later source-availability checks. */
+ ctx_restore_removed_def(ctx, &src, prod_idx);
ctx->last_def[dst.cls][dst.v.reg] = prod_idx;
/* Mark the copy NOP'd so compact removes it. */
@@ -1082,6 +1122,7 @@ static int try_ret_retarget(Func* f, Block* bl, i32 i) {
static int opt_combine_fold_block(Func* f, Block* bl,
const OptHardBlockLive* hard_live) {
enum { enable_o1_combine_rewrites = 1 };
+ enum { enable_o1_sink_rewrites = 1 };
CombineCtx ctx;
ctx.f = f;
ctx.bl = bl;
@@ -1103,7 +1144,7 @@ static int opt_combine_fold_block(Func* f, Block* bl,
for (Reg r = 0; r < OPT_MAX_HARD_REGS; ++r) {
probe.v.reg = r;
if (ctx.last_def[c][r] == i - 1 && !inst_defines_phys_reg(prev, &probe))
- ctx.last_def[c][r] = -1;
+ ctx_restore_removed_def(&ctx, &probe, i - 1);
}
}
ctx_record(&ctx, prev, i - 1);
@@ -1112,7 +1153,7 @@ static int opt_combine_fold_block(Func* f, Block* bl,
/* Skip NOPs left by prior sink rewrites. */
if ((IROp)in->op == IR_NOP) continue;
- if (enable_o1_combine_rewrites && try_sink(&ctx, in, i)) {
+ if (enable_o1_sink_rewrites && try_sink(&ctx, in, i)) {
/* sink NOP'd the copy and updated ctx. */
continue;
}
diff --git a/src/opt/pass_emit.c b/src/opt/pass_emit.c
@@ -74,14 +74,13 @@ static CGLocalStorage xlat_storage(ReplayCtx* r, CGLocalStorage st,
(void)ty;
if (st.kind == CG_LOCAL_STORAGE_REG) {
PReg pr = (PReg)st.v.reg;
- if (r->identity_regs && r->f->opt_rewritten && pr != 0 &&
- pr < opt_reg_count(r->f) && r->f->preg_info) {
- OptPRegInfo* vi = &r->f->preg_info[pr];
- if (vi->alloc_kind == OPT_ALLOC_HARD) {
- st.v.reg = vi->hard_reg;
- } else if (vi->alloc_kind == OPT_ALLOC_SPILL) {
+ if (r->identity_regs && r->f->opt_rewritten && opt_reg_valid(r->f, pr)) {
+ u8 alloc_kind = opt_preg_alloc_kind(r->f, pr);
+ if (alloc_kind == OPT_ALLOC_HARD) {
+ st.v.reg = opt_preg_hard_reg(r->f, pr);
+ } else if (alloc_kind == OPT_ALLOC_SPILL) {
st.kind = CG_LOCAL_STORAGE_FRAME;
- st.v.frame_slot = slot_to_target(r, vi->spill_slot);
+ st.v.frame_slot = slot_to_target(r, opt_preg_spill_slot(r->f, pr));
} else {
st.v.reg = val_to_target_reg(r, (Val)pr);
}
@@ -96,11 +95,11 @@ static CGLocalStorage xlat_storage(ReplayCtx* r, CGLocalStorage st,
static int replay_reg_storage_unused(ReplayCtx* r, CGLocalStorage st) {
if (!r || st.kind != CG_LOCAL_STORAGE_REG) return 0;
- if (!(r->identity_regs && r->f->opt_rewritten && r->f->preg_info)) return 0;
+ if (!(r->identity_regs && r->f->opt_rewritten)) return 0;
PReg pr = (PReg)st.v.reg;
if (pr == 0 || pr >= opt_reg_count(r->f)) return 0;
- return r->f->preg_info[pr].alloc_kind == OPT_ALLOC_NONE ||
- r->f->preg_info[pr].use_freq == 0;
+ if (opt_preg_alloc_kind(r->f, pr) == OPT_ALLOC_NONE) return 1;
+ return r->f->preg_info && r->f->preg_info[pr].use_freq == 0;
}
static Operand xlat_op(ReplayCtx* r, Operand op) {
@@ -855,39 +854,54 @@ static void add_unique_reg(Reg* used, u32* nused, u32 cap, Reg r) {
if (*nused < cap) used[(*nused)++] = r;
}
-static void collect_replayed_operand_reg(const Operand* op, RegClass cls,
+static void collect_replayed_reg(Func* f, Reg raw, RegClass cls, Reg* used,
+ u32* nused, u32 cap) {
+ if (raw == (Reg)REG_NONE) return;
+ if (f && f->opt_rewritten && opt_reg_valid(f, (PReg)raw)) {
+ PReg pr = (PReg)raw;
+ if (opt_preg_alloc_kind(f, pr) == OPT_ALLOC_HARD &&
+ opt_preg_loc_cls(f, pr) == (u8)cls)
+ add_unique_reg(used, nused, cap, opt_preg_hard_reg(f, pr));
+ }
+ add_unique_reg(used, nused, cap, raw);
+}
+
+static void collect_replayed_operand_reg(Func* f, const Operand* op, RegClass cls,
Reg* used, u32* nused, u32 cap) {
if (!op) return;
if (op->kind == OPK_REG) {
- if (op->cls == cls) add_unique_reg(used, nused, cap, op->v.reg);
+ if (op->cls == cls)
+ collect_replayed_reg(f, op->v.reg, cls, used, nused, cap);
} else if (op->kind == OPK_INDIRECT) {
if (cls == RC_INT) {
- add_unique_reg(used, nused, cap, op->v.ind.base);
+ collect_replayed_reg(f, op->v.ind.base, cls, used, nused, cap);
if (op->v.ind.index != (Reg)REG_NONE)
- add_unique_reg(used, nused, cap, op->v.ind.index);
+ collect_replayed_reg(f, op->v.ind.index, cls, used, nused, cap);
}
}
}
-static void collect_replayed_abivalue_regs(const CGABIValue* v, RegClass cls,
+static void collect_replayed_abivalue_regs(Func* f, const CGABIValue* v,
+ RegClass cls,
Reg* used, u32* nused, u32 cap) {
if (!v) return;
- collect_replayed_operand_reg(&v->storage, cls, used, nused, cap);
+ collect_replayed_operand_reg(f, &v->storage, cls, used, nused, cap);
for (u32 i = 0; i < v->nparts; ++i)
- collect_replayed_operand_reg(&v->parts[i].op, cls, used, nused, cap);
+ collect_replayed_operand_reg(f, &v->parts[i].op, cls, used, nused, cap);
}
static void collect_replayed_param_regs(Func* f, RegClass cls, Reg* used,
u32* nused, u32 cap) {
- if (!f->opt_rewritten || !f->preg_info) return;
+ if (!f->opt_rewritten) return;
for (u32 i = 0; i < f->nparams; ++i) {
IRParam* p = &f->params[i];
if (p->storage.kind != CG_LOCAL_STORAGE_REG) continue;
PReg pr = (PReg)p->storage.v.reg;
if (pr == 0 || pr >= opt_reg_count(f)) continue;
- OptPRegInfo* vi = &f->preg_info[pr];
- if (vi->alloc_kind != OPT_ALLOC_HARD || vi->cls != cls) continue;
- add_unique_reg(used, nused, cap, vi->hard_reg);
+ if (opt_preg_alloc_kind(f, pr) != OPT_ALLOC_HARD ||
+ opt_preg_loc_cls(f, pr) != (u8)cls)
+ continue;
+ add_unique_reg(used, nused, cap, opt_preg_hard_reg(f, pr));
}
}
@@ -901,35 +915,35 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls,
Inst* in = &bl->insts[i];
if ((IROp)in->op == IR_PARAM_DECL) continue;
for (u32 j = 0; j < in->nopnds; ++j)
- collect_replayed_operand_reg(&in->opnds[j], cls, used, &nused, cap);
+ collect_replayed_operand_reg(f, &in->opnds[j], cls, used, &nused, cap);
switch ((IROp)in->op) {
case IR_CALL: {
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (!aux) break;
if (aux->use_plan_replay) {
- collect_replayed_operand_reg(&aux->plan.callee, cls, used, &nused,
+ collect_replayed_operand_reg(f, &aux->plan.callee, cls, used, &nused,
cap);
for (u32 j = 0; j < aux->plan.nargs; ++j) {
- collect_replayed_operand_reg(&aux->plan.args[j].src, cls, used,
+ collect_replayed_operand_reg(f, &aux->plan.args[j].src, cls, used,
&nused, cap);
if (aux->plan.args[j].dst_kind == CG_CALL_PLAN_REG &&
aux->plan.args[j].cls == (u8)cls)
add_unique_reg(used, &nused, cap, aux->plan.args[j].dst_reg);
}
for (u32 j = 0; j < aux->plan.nrets; ++j) {
- collect_replayed_operand_reg(&aux->plan.rets[j].dst, cls, used,
+ collect_replayed_operand_reg(f, &aux->plan.rets[j].dst, cls, used,
&nused, cap);
if (aux->plan.rets[j].cls == (u8)cls)
add_unique_reg(used, &nused, cap, aux->plan.rets[j].src_reg);
}
} else {
- collect_replayed_operand_reg(&aux->desc.callee, cls, used, &nused,
+ collect_replayed_operand_reg(f, &aux->desc.callee, cls, used, &nused,
cap);
for (u32 j = 0; j < aux->desc.nargs; ++j)
- collect_replayed_abivalue_regs(&aux->desc.args[j], cls, used,
+ collect_replayed_abivalue_regs(f, &aux->desc.args[j], cls, used,
&nused, cap);
- collect_replayed_abivalue_regs(&aux->desc.ret, cls, used, &nused,
+ collect_replayed_abivalue_regs(f, &aux->desc.ret, cls, used, &nused,
cap);
}
break;
@@ -937,13 +951,13 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls,
case IR_RET: {
IRRetAux* aux = (IRRetAux*)in->extra.aux;
if (aux && aux->present)
- collect_replayed_abivalue_regs(&aux->val, cls, used, &nused, cap);
+ collect_replayed_abivalue_regs(f, &aux->val, cls, used, &nused, cap);
break;
}
case IR_SCOPE_BEGIN: {
IRScopeAux* aux = (IRScopeAux*)in->extra.aux;
if (aux)
- collect_replayed_operand_reg(&aux->desc.cond, cls, used, &nused,
+ collect_replayed_operand_reg(f, &aux->desc.cond, cls, used, &nused,
cap);
break;
}
@@ -951,10 +965,10 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls,
IRAsmAux* aux = (IRAsmAux*)in->extra.aux;
if (!aux) break;
for (u32 j = 0; j < aux->nin; ++j)
- collect_replayed_operand_reg(&aux->in_ops[j], cls, used, &nused,
+ collect_replayed_operand_reg(f, &aux->in_ops[j], cls, used, &nused,
cap);
for (u32 j = 0; j < aux->nout; ++j)
- collect_replayed_operand_reg(&aux->out_ops[j], cls, used, &nused,
+ collect_replayed_operand_reg(f, &aux->out_ops[j], cls, used, &nused,
cap);
break;
}
@@ -962,9 +976,9 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls,
IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux;
if (!aux) break;
for (u32 j = 0; j < aux->narg; ++j)
- collect_replayed_operand_reg(&aux->args[j], cls, used, &nused, cap);
+ collect_replayed_operand_reg(f, &aux->args[j], cls, used, &nused, cap);
for (u32 j = 0; j < aux->ndst; ++j)
- collect_replayed_operand_reg(&aux->dsts[j], cls, used, &nused, cap);
+ collect_replayed_operand_reg(f, &aux->dsts[j], cls, used, &nused, cap);
break;
}
default:
@@ -997,6 +1011,19 @@ static int replay_operand_uses_frame_slot(const Operand* op) {
return op && op->kind == OPK_LOCAL && op->v.frame_slot != FRAME_SLOT_NONE;
}
+static int replay_storage_uses_frame_slot(CGLocalStorage st) {
+ return st.kind == CG_LOCAL_STORAGE_FRAME &&
+ st.v.frame_slot != FRAME_SLOT_NONE;
+}
+
+static int replay_param_storage_uses_frame_slot(Func* f, CGLocalStorage st) {
+ if (replay_storage_uses_frame_slot(st)) return 1;
+ if (st.kind != CG_LOCAL_STORAGE_REG || !f || !f->opt_rewritten) return 0;
+ PReg pr = (PReg)st.v.reg;
+ return opt_reg_valid(f, pr) && opt_preg_alloc_kind(f, pr) == OPT_ALLOC_SPILL &&
+ opt_preg_spill_slot(f, pr) != FRAME_SLOT_NONE;
+}
+
static int replay_abivalue_uses_frame_slot(const CGABIValue* v) {
if (!v) return 0;
if (replay_operand_uses_frame_slot(&v->storage)) return 1;
@@ -1063,6 +1090,8 @@ static int replay_func_uses_frame_slot(Func* f) {
for (u32 i = 0; i < bl->ninsts; ++i)
if (replay_inst_uses_frame_slot(&bl->insts[i])) return 1;
}
+ for (u32 i = 0; i < f->nparams; ++i)
+ if (replay_param_storage_uses_frame_slot(f, f->params[i].storage)) return 1;
for (u32 i = 0; i < f->nframe_slots; ++i)
if (f->frame_slots[i].flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) return 1;
return 0;
@@ -1101,8 +1130,21 @@ static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) {
continue;
}
if ((aux->desc.flags & CG_CALL_TAIL) == 0) out->has_call = 1;
- if (!w->call_stack_size) continue;
- u32 need = w->call_stack_size(w, &aux->desc);
+ u32 need = 0;
+ if (aux->use_plan_replay) {
+ need = aux->plan.stack_arg_size;
+ for (u32 j = 0; j < aux->plan.nargs; ++j) {
+ CGCallPlanMove* m = &aux->plan.args[j];
+ if (m->dst_kind == CG_CALL_PLAN_STACK ||
+ m->dst_kind == CG_CALL_PLAN_TAIL_STACK) {
+ u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u);
+ if (end > need) need = end;
+ }
+ }
+ need = (need + 15u) & ~15u;
+ } else if (w->call_stack_size) {
+ need = w->call_stack_size(w, &aux->desc);
+ }
if (need > out->max_outgoing) out->max_outgoing = need;
}
}
@@ -1161,7 +1203,7 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
metrics_scope_end(c, "opt.emit.plan_hard_regs");
metrics_scope_begin(c, "opt.emit.func_begin");
- int known_frame = identity && w->func_begin_known_frame && w->call_stack_size;
+ int known_frame = w->func_begin_known_frame != NULL;
if (known_frame) {
CGKnownFrameDesc frame;
FrameSlot* target_slots =
@@ -1171,7 +1213,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
w->func_begin_known_frame(w, &f->desc, &frame, target_slots);
for (u32 i = 0; i < f->nframe_slots; ++i)
r.slot_map[f->frame_slots[i].id] = target_slots[i];
- } else {
+ }
+ if (!known_frame) {
/* func_begin with the recorded descriptor. Parameter storage is replayed
* through target->param below after frame slots are mapped. */
w->func_begin(w, &f->desc);
@@ -1219,6 +1262,14 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
} else {
d.storage = xlat_storage(&r, p->storage, p->type);
}
+ if (known_frame && d.storage.kind == CG_LOCAL_STORAGE_FRAME &&
+ d.storage.v.frame_slot == FRAME_SLOT_NONE) {
+ SrcLoc loc = p->loc;
+ compiler_panic(c, loc,
+ "opt replay: frame-backed param %u missing known-frame "
+ "slot mapping",
+ (unsigned)i);
+ }
d.abi = p->abi;
d.loc = p->loc;
(void)w->param(w, &d);
@@ -1269,5 +1320,18 @@ void opt_replay(Compiler* c, Func* f, CGTarget* target) {
}
void opt_emit(Compiler* c, Func* f, CGTarget* target) {
+ if (f && f->mir) {
+ Func view = *f;
+ view.blocks = f->mir->blocks;
+ view.nblocks = f->mir->nblocks;
+ view.entry = f->mir->entry;
+ view.emit_order = f->mir->emit_order;
+ view.emit_order_n = f->mir->emit_order_n;
+ view.emit_order_cap = f->mir->emit_order_cap;
+ view.opt_rewritten = 1;
+ view.mir = NULL;
+ replay_func_to(c, &view, target, 1);
+ return;
+ }
replay_func_to(c, f, target, 1);
}
diff --git a/src/opt/pass_hard_live.c b/src/opt/pass_hard_live.c
@@ -141,8 +141,12 @@ void opt_hard_inst_use_def(Func* f, const Inst* in, OptHardRegSet* use,
hard_use_abivalue(use, &aux->desc.args[i]);
hard_def_abivalue(def, &aux->desc.ret);
}
- for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
- def->cls[c] |= opt_call_clobber_mask_for(f, in, (u8)c);
+ for (u32 c = 0; c < OPT_REG_CLASSES; ++c) {
+ u32 idx = c;
+ u32 mask = opt_call_clobber_mask_for(f, in, (u8)idx);
+ u32 old = def->cls[idx];
+ def->cls[idx] = old | mask;
+ }
break;
}
case IR_CMP_BRANCH:
diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c
@@ -11,10 +11,24 @@ static u32 opt_bit_words(u32 nregs) { return (nregs + 63u) / 64u; }
static void opt_bitset_init(Arena* arena, OptBitset* bs, u32 nwords) {
memset(bs, 0, sizeof *bs);
+ bs->arena = arena;
bs->nwords = nwords;
bs->words = arena_zarray(arena, u64, nwords ? nwords : 1u);
}
+static int opt_bitset_grow(OptBitset* bs, u32 need_words) {
+ if (!bs || need_words <= bs->nwords) return 1;
+ if (!bs->arena) return 0;
+ u32 nwords = bs->nwords ? bs->nwords : 1u;
+ while (nwords < need_words) nwords *= 2u;
+ u64* words = arena_zarray(bs->arena, u64, nwords);
+ if (bs->words && bs->nwords)
+ memcpy(words, bs->words, sizeof(words[0]) * bs->nwords);
+ bs->words = words;
+ bs->nwords = nwords;
+ return 1;
+}
+
static void opt_bitset_trim(OptBitset* bs) {
u32 n = bs->nwords;
while (n && bs->words[n - 1u] == 0) --n;
@@ -28,9 +42,9 @@ void opt_bitset_clear(OptBitset* bs) {
}
void opt_bitset_set(OptBitset* bs, PReg r) {
- if (!bs || !bs->words) return;
+ if (!bs) return;
u32 w = r / 64u;
- if (w >= bs->nwords) return;
+ if (w >= bs->nwords && !opt_bitset_grow(bs, w + 1u)) return;
bs->words[w] |= 1ull << (r % 64u);
if (bs->active_words <= w) bs->active_words = w + 1u;
}
@@ -54,9 +68,12 @@ int opt_bitset_copy(OptBitset* dst, const OptBitset* src) {
int changed = 0;
u32 n;
u32 old_active;
- if (!dst || !src || !dst->words || !src->words) return 0;
+ if (!dst || !src || !src->words) return 0;
+ if (src->active_words > dst->nwords &&
+ !opt_bitset_grow(dst, src->active_words))
+ return 0;
old_active = dst->active_words;
- n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ n = src->active_words;
for (u32 i = 0; i < n; ++i) {
changed |= dst->words[i] != src->words[i];
dst->words[i] = src->words[i];
@@ -73,8 +90,11 @@ int opt_bitset_copy(OptBitset* dst, const OptBitset* src) {
int opt_bitset_union(OptBitset* dst, const OptBitset* src) {
int changed = 0;
u32 n;
- if (!dst || !src || !dst->words || !src->words) return 0;
- n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ if (!dst || !src || !src->words) return 0;
+ if (src->active_words > dst->nwords &&
+ !opt_bitset_grow(dst, src->active_words))
+ return 0;
+ n = src->active_words;
for (u32 i = 0; i < n; ++i) {
u64 old = dst->words[i];
dst->words[i] |= src->words[i];
@@ -88,8 +108,11 @@ int opt_bitset_union_and_not(OptBitset* dst, const OptBitset* src,
const OptBitset* not_bits) {
int changed = 0;
u32 n;
- if (!dst || !src || !dst->words || !src->words) return 0;
- n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ if (!dst || !src || !src->words) return 0;
+ if (src->active_words > dst->nwords &&
+ !opt_bitset_grow(dst, src->active_words))
+ return 0;
+ n = src->active_words;
for (u32 i = 0; i < n; ++i) {
u64 mask =
(not_bits && i < not_bits->active_words) ? not_bits->words[i] : 0;
@@ -144,7 +167,11 @@ static u64 live_count_set_bits(const OptBitset* bs) {
}
static void live_metric_clear(OptLiveInfo* live, OptBitset* bs) {
- if (live && bs) live->bitset_words_touched += bs->active_words;
+ u32 touched = bs ? bs->active_words : 0;
+ if (live) {
+ u64 old = live->bitset_words_touched;
+ live->bitset_words_touched = old + touched;
+ }
opt_bitset_clear(bs);
}
@@ -152,7 +179,7 @@ static int live_metric_copy(OptLiveInfo* live, OptBitset* dst,
const OptBitset* src) {
u32 n = 0;
if (dst && src) {
- n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ n = src->active_words;
if (dst->active_words > n) n += dst->active_words - n;
}
if (live) live->bitset_words_touched += n;
@@ -162,8 +189,7 @@ static int live_metric_copy(OptLiveInfo* live, OptBitset* dst,
static int live_metric_union(OptLiveInfo* live, OptBitset* dst,
const OptBitset* src) {
u32 n = 0;
- if (dst && src)
- n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ if (dst && src) n = src->active_words;
if (live) live->bitset_words_touched += n;
return opt_bitset_union(dst, src);
}
@@ -173,8 +199,7 @@ static int live_metric_union_and_not(OptLiveInfo* live, OptBitset* dst,
const OptBitset* not_bits) {
u32 n = 0;
(void)not_bits;
- if (dst && src)
- n = dst->nwords < src->active_words ? dst->nwords : src->active_words;
+ if (dst && src) n = src->active_words;
if (live) live->bitset_words_touched += n;
return opt_bitset_union_and_not(dst, src, not_bits);
}
@@ -195,7 +220,13 @@ static void live_recompute_metrics(OptLiveInfo* live) {
live->set_bit_scans += live_count_set_bits(&bl->live_def);
}
live->block_bytes =
- (u64)live->f->nblocks * 4u * (u64)live->words * (u64)sizeof(u64);
+ live->active_words * (u64)sizeof(u64);
+}
+
+static void live_worklist_push(u32* worklist, u8* queued, u32* n, u32 b) {
+ if (queued[b]) return;
+ queued[b] = 1;
+ worklist[(*n)++] = b;
}
void opt_live_blocks(Func* f, OptLiveInfo* live) {
@@ -210,10 +241,10 @@ void opt_live_blocks(Func* f, OptLiveInfo* live) {
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
OptBlockLive* lb = &live->blocks[b];
- opt_bitset_init(f->arena, &lb->live_in, live->words);
- opt_bitset_init(f->arena, &lb->live_out, live->words);
- opt_bitset_init(f->arena, &lb->live_use, live->words);
- opt_bitset_init(f->arena, &lb->live_def, live->words);
+ opt_bitset_init(f->arena, &lb->live_in, 0);
+ opt_bitset_init(f->arena, &lb->live_out, 0);
+ opt_bitset_init(f->arena, &lb->live_use, 0);
+ opt_bitset_init(f->arena, &lb->live_def, 0);
LiveUseDefCtx ctx;
ctx.use = &lb->live_use;
@@ -224,31 +255,39 @@ void opt_live_blocks(Func* f, OptLiveInfo* live) {
OptBitset new_out;
OptBitset tmp;
- opt_bitset_init(f->arena, &new_out, live->words);
- opt_bitset_init(f->arena, &tmp, live->words);
-
- int changed;
- do {
- changed = 0;
+ opt_bitset_init(f->arena, &new_out, 0);
+ opt_bitset_init(f->arena, &tmp, 0);
+
+ u32* worklist = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
+ u8* queued = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u);
+ u32 nwork = 0;
+ for (u32 bi = f->nblocks; bi > 0; --bi)
+ live_worklist_push(worklist, queued, &nwork, bi - 1u);
+
+ while (nwork) {
+ u32 b = worklist[--nwork];
+ queued[b] = 0;
+ Block* bl = &f->blocks[b];
+ OptBlockLive* lb = &live->blocks[b];
++live->dataflow_iterations;
- for (u32 bi = f->nblocks; bi > 0; --bi) {
- u32 b = bi - 1u;
- Block* bl = &f->blocks[b];
- OptBlockLive* lb = &live->blocks[b];
- ++live->dataflow_block_visits;
- live_metric_clear(live, &new_out);
- live_metric_clear(live, &tmp);
- for (u32 s = 0; s < bl->nsucc; ++s) {
- u32 t = bl->succ[s];
- if (t < f->nblocks)
- live_metric_union(live, &new_out, &live->blocks[t].live_in);
+ ++live->dataflow_block_visits;
+ live_metric_clear(live, &new_out);
+ live_metric_clear(live, &tmp);
+ for (u32 s = 0; s < bl->nsucc; ++s) {
+ u32 t = bl->succ[s];
+ if (t < f->nblocks)
+ live_metric_union(live, &new_out, &live->blocks[t].live_in);
+ }
+ live_metric_copy(live, &tmp, &lb->live_use);
+ live_metric_union_and_not(live, &tmp, &new_out, &lb->live_def);
+ (void)live_metric_copy(live, &lb->live_out, &new_out);
+ if (live_metric_copy(live, &lb->live_in, &tmp)) {
+ for (u32 p = 0; p < bl->npreds; ++p) {
+ u32 pred = bl->preds[p];
+ if (pred < f->nblocks) live_worklist_push(worklist, queued, &nwork, pred);
}
- live_metric_copy(live, &tmp, &lb->live_use);
- live_metric_union_and_not(live, &tmp, &new_out, &lb->live_def);
- changed |= live_metric_copy(live, &lb->live_out, &new_out);
- changed |= live_metric_copy(live, &lb->live_in, &tmp);
}
- } while (changed);
+ }
live_recompute_metrics(live);
}
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -47,12 +47,23 @@ static u32 type_size_fallback(const Func* f, CfreeCgTypeId t) {
static u32 bit_words(u32 npregs) { return (npregs + 63u) / 64u; }
-static void bit_set(u64* bits, PReg v) { bits[v / 64u] |= 1ull << (v % 64u); }
+static void bit_set(u64* bits, PReg v) {
+ u32 w = v / 64u;
+ u64 mask = 1ull << (v % 64u);
+ u64 old = bits[w];
+ bits[w] = old | mask;
+}
static void bit_clear(u64* bits, PReg v) {
- bits[v / 64u] &= ~(1ull << (v % 64u));
+ u32 w = v / 64u;
+ u64 mask = 1ull << (v % 64u);
+ u64 old = bits[w];
+ bits[w] = old & ~mask;
}
static int bit_has(const u64* bits, PReg v) {
- return (bits[v / 64u] & (1ull << (v % 64u))) != 0;
+ u32 w = v / 64u;
+ u64 mask = 1ull << (v % 64u);
+ u64 old = bits[w];
+ return (old & mask) != 0;
}
typedef struct BitsCtx {
@@ -207,74 +218,101 @@ static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) {
return 0;
}
-static int hard_available(Func* f, u8 cls, Reg r);
+typedef struct ParamIncomingRegs {
+ Reg regs[64];
+ u8 cls[64];
+ u8 has[64];
+ u32 nparams;
+ Reg all_regs[128];
+ u8 all_cls[128];
+ u32 nall;
+} ParamIncomingRegs;
+
+static void param_incoming_add(ParamIncomingRegs* out, u8 cls, Reg r) {
+ if (r >= 32) return;
+ for (u32 i = 0; i < out->nall; ++i)
+ if (out->all_cls[i] == cls && out->all_regs[i] == r) return;
+ if (out->nall < 128u) {
+ out->all_cls[out->nall] = cls;
+ out->all_regs[out->nall] = r;
+ ++out->nall;
+ }
+}
-static void apply_param_incoming_register_hazards(Func* f) {
- if (!f || !f->preg_info || !f->desc.abi || !f->nparams) return;
- Reg incoming_regs[64];
- u8 incoming_cls[64];
- u8 has_incoming[64];
- memset(incoming_regs, 0, sizeof incoming_regs);
- memset(incoming_cls, 0, sizeof incoming_cls);
- memset(has_incoming, 0, sizeof has_incoming);
+static void collect_param_incoming_regs(Func* f, ParamIncomingRegs* out) {
+ memset(out, 0, sizeof *out);
+ if (!f || !f->desc.abi || !f->nparams) return;
u32 next_int = 0;
u32 next_fp = 0;
if (f->desc.abi->has_sret && f->opt_target.arch != CFREE_ARCH_ARM_64)
next_int = 1;
- u32 nparams = f->nparams < 64u ? f->nparams : 64u;
- for (u32 i = 0; i < nparams; ++i) {
+ out->nparams = f->nparams < 64u ? f->nparams : 64u;
+ for (u32 i = 0; i < out->nparams; ++i) {
IRParam* p = &f->params[i];
const ABIArgInfo* ai = p->abi;
if (!ai || ai->kind == ABI_ARG_IGNORE) continue;
if (ai->kind == ABI_ARG_INDIRECT) {
Reg r = REG_NONE;
if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) {
- incoming_regs[i] = r;
- incoming_cls[i] = RC_INT;
- has_incoming[i] = 1;
+ out->regs[i] = r;
+ out->cls[i] = RC_INT;
+ out->has[i] = 1;
+ param_incoming_add(out, RC_INT, r);
}
++next_int;
continue;
}
- if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) continue;
- const ABIArgPart* part = &ai->parts[0];
- if (part->cls == ABI_CLASS_FP) {
- Reg r = REG_NONE;
- if (phys_arg_reg_for_index(f, RC_FP, next_fp, &r)) {
- incoming_regs[i] = r;
- incoming_cls[i] = RC_FP;
- has_incoming[i] = 1;
- }
- ++next_fp;
- } else if (part->cls == ABI_CLASS_INT) {
- Reg r = REG_NONE;
- if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) {
- incoming_regs[i] = r;
- incoming_cls[i] = RC_INT;
- has_incoming[i] = 1;
+ if (ai->kind != ABI_ARG_DIRECT) continue;
+ for (u16 j = 0; j < ai->nparts; ++j) {
+ const ABIArgPart* part = &ai->parts[j];
+ if (part->cls == ABI_CLASS_FP) {
+ Reg r = REG_NONE;
+ if (phys_arg_reg_for_index(f, RC_FP, next_fp, &r)) {
+ param_incoming_add(out, RC_FP, r);
+ if (ai->nparts == 1) {
+ out->regs[i] = r;
+ out->cls[i] = RC_FP;
+ out->has[i] = 1;
+ }
+ }
+ ++next_fp;
+ } else if (part->cls == ABI_CLASS_INT) {
+ Reg r = REG_NONE;
+ if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) {
+ param_incoming_add(out, RC_INT, r);
+ if (ai->nparts == 1) {
+ out->regs[i] = r;
+ out->cls[i] = RC_INT;
+ out->has[i] = 1;
+ }
+ }
+ ++next_int;
}
- ++next_int;
}
}
+}
- for (u32 i = 0; i < nparams; ++i) {
- IRParam* p = &f->params[i];
- if (p->storage.kind != CG_LOCAL_STORAGE_REG) continue;
- PReg v = (PReg)p->storage.v.reg;
- if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) continue;
+static int hard_available(Func* f, u8 cls, Reg r);
+
+static void apply_param_incoming_register_hazards(Func* f) {
+ if (!f || !f->preg_info || !f->desc.abi || !f->nparams) return;
+ ParamIncomingRegs incoming;
+ collect_param_incoming_regs(f, &incoming);
+
+ /* O1 replays parameter materialization before the body, but values left in
+ * their ABI incoming registers are not represented as live from function
+ * entry to first use. Keep those incoming registers out of virtual
+ * allocation so the backend emits explicit entry moves/stores before body
+ * code can reuse them. Fixed asm constraints still use tied_hard_reg. */
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
u8 cls = f->preg_info[v].cls;
- if (has_incoming[i] && incoming_cls[i] == cls &&
- f->preg_info[v].tied_hard_reg < 0 &&
- f->preg_info[v].live_across_call_freq == 0 &&
- hard_available(f, cls, incoming_regs[i]) && incoming_regs[i] < 32 &&
- (f->preg_info[v].forbidden_hard_regs & (1u << incoming_regs[i])) == 0) {
- f->preg_info[v].tied_hard_reg = (i32)incoming_regs[i];
- }
- for (u32 j = i + 1u; j < nparams; ++j) {
- if (!has_incoming[j] || incoming_cls[j] != cls) continue;
- forbid_preg_reg(f, v, cls, incoming_regs[j]);
+ for (u32 j = 0; j < incoming.nall; ++j) {
+ if (incoming.all_cls[j] != cls) continue;
+ if (f->preg_info[v].tied_hard_reg == (i32)incoming.all_regs[j])
+ continue;
+ forbid_preg_reg(f, v, cls, incoming.all_regs[j]);
}
}
}
@@ -321,8 +359,8 @@ static const CGPhysRegInfo* phys_info_for(Func* f, u8 cls, Reg r) {
}
static FrameSlot spill_slot_for(Func* f, PReg v) {
- if (f->preg_info[v].spill_slot != FRAME_SLOT_NONE)
- return f->preg_info[v].spill_slot;
+ FrameSlot existing = opt_preg_spill_slot(f, v);
+ if (existing != FRAME_SLOT_NONE) return existing;
FrameSlotDesc d;
memset(&d, 0, sizeof d);
d.type = opt_reg_type(f, v);
@@ -865,6 +903,7 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
f->opt_alloc_stack_word_ors = a->stack_word_ors;
f->opt_alloc_hard_mark_points = a->hard_mark_points;
f->opt_alloc_stack_mark_points = a->stack_mark_points;
+ f->preg_locs = a->locs;
}
typedef struct RewriteList {
@@ -981,9 +1020,9 @@ static Operand hard_operand(Func* f, PReg v) {
Operand o;
memset(&o, 0, sizeof o);
o.kind = OPK_REG;
- o.cls = f->preg_info[v].cls;
+ o.cls = opt_preg_loc_cls(f, v);
o.type = opt_reg_type(f, v);
- o.v.reg = f->preg_info[v].hard_reg;
+ o.v.reg = opt_preg_hard_reg(f, v);
return o;
}
@@ -991,7 +1030,7 @@ static Operand hard_operand_reg(Func* f, PReg v, Reg hard_reg) {
Operand o;
memset(&o, 0, sizeof o);
o.kind = OPK_REG;
- o.cls = f->preg_info[v].cls;
+ o.cls = opt_preg_loc_cls(f, v);
o.type = opt_reg_type(f, v);
o.v.reg = hard_reg;
return o;
@@ -1066,21 +1105,21 @@ static void rewrite_one_operand(Func* f, Inst* owner, Operand* op, int is_def,
if (op->kind != OPK_REG) return;
PReg v = (PReg)op->v.reg;
if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
- OptPRegInfo* vi = &f->preg_info[v];
- if (vi->alloc_kind == OPT_ALLOC_HARD) {
- op->v.reg = vi->hard_reg;
+ u8 alloc_kind = opt_preg_alloc_kind(f, v);
+ if (alloc_kind == OPT_ALLOC_HARD) {
+ op->v.reg = opt_preg_hard_reg(f, v);
return;
}
- if (vi->alloc_kind == OPT_ALLOC_SPLIT) {
+ if (alloc_kind == OPT_ALLOC_SPLIT) {
const OptAllocSegment* seg = split_segment_at(f, v, c->raw_point);
if (seg && seg->loc_kind == OPT_LOC_HARD) {
op->v.reg = seg->hard_reg;
return;
}
- } else if (vi->alloc_kind != OPT_ALLOC_SPILL) {
+ } else if (alloc_kind != OPT_ALLOC_SPILL) {
return;
}
- u8 cls = vi->cls;
+ u8 cls = opt_preg_loc_cls(f, v);
Reg scratch = scratch_for(f, cls, &c->next_scratch[cls]);
if (scratch == (Reg)REG_NONE) {
SrcLoc loc = owner ? owner->loc : (SrcLoc){0, 0, 0};
@@ -1110,11 +1149,10 @@ static void rewrite_call_arg_operand(Func* f, Operand* op) {
if (!op || op->kind != OPK_REG) return;
PReg v = (PReg)op->v.reg;
if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
- OptPRegInfo* vi = &f->preg_info[v];
- if (vi->alloc_kind == OPT_ALLOC_HARD) {
- op->v.reg = vi->hard_reg;
- } else if (vi->alloc_kind == OPT_ALLOC_SPILL ||
- vi->alloc_kind == OPT_ALLOC_SPLIT) {
+ u8 alloc_kind = opt_preg_alloc_kind(f, v);
+ if (alloc_kind == OPT_ALLOC_HARD) {
+ op->v.reg = opt_preg_hard_reg(f, v);
+ } else if (alloc_kind == OPT_ALLOC_SPILL || alloc_kind == OPT_ALLOC_SPLIT) {
*op = spill_addr(f, v);
}
}
@@ -1155,9 +1193,9 @@ static void rewrite_call_save_one(PReg v, void* arg) {
Func* f = c->f;
if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return;
if (c->refs && refs_has_def(c->refs, v)) return;
- if (f->preg_info[v].alloc_kind != OPT_ALLOC_HARD) return;
- u8 cls = f->preg_info[v].cls;
- Reg hr = f->preg_info[v].hard_reg;
+ if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) return;
+ u8 cls = opt_preg_loc_cls(f, v);
+ Reg hr = opt_preg_hard_reg(f, v);
if (cls >= OPT_REG_CLASSES || hr >= 32u) return;
if ((opt_call_clobber_mask_for(f, c->call, cls) & (1u << hr)) == 0) return;
if (c->emit_restore)
@@ -1186,7 +1224,7 @@ static PReg* rewrite_collect_call_save_pregs(Func* f, u32* count_out) {
u32 n = 0;
for (PReg v = 1; v < opt_reg_count(f); ++v) {
OptPRegInfo* vi = &f->preg_info[v];
- if (vi->alloc_kind != OPT_ALLOC_HARD) continue;
+ if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) continue;
if (!vi->live_across_call_freq) continue;
++n;
}
@@ -1194,7 +1232,7 @@ static PReg* rewrite_collect_call_save_pregs(Func* f, u32* count_out) {
u32 w = 0;
for (PReg v = 1; v < opt_reg_count(f); ++v) {
OptPRegInfo* vi = &f->preg_info[v];
- if (vi->alloc_kind != OPT_ALLOC_HARD) continue;
+ if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) continue;
if (!vi->live_across_call_freq) continue;
pregs[w++] = v;
}
@@ -1206,7 +1244,7 @@ static void append_split_reloads_at(Func* f, RewriteList* out, u32 block,
u32 raw_point) {
if (!f->opt_first_segment_by_preg) return;
for (PReg v = 1; v < opt_reg_count(f); ++v) {
- if (f->preg_info[v].alloc_kind != OPT_ALLOC_SPLIT) continue;
+ if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue;
for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
si = f->opt_alloc_segments[si].next) {
const OptAllocSegment* s = &f->opt_alloc_segments[si];
@@ -1222,7 +1260,7 @@ static void append_split_stores_at(Func* f, RewriteList* out, u32 block,
u32 raw_point) {
if (!f->opt_first_segment_by_preg) return;
for (PReg v = 1; v < opt_reg_count(f); ++v) {
- if (f->preg_info[v].alloc_kind != OPT_ALLOC_SPLIT) continue;
+ if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue;
for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
si = f->opt_alloc_segments[si].next) {
const OptAllocSegment* s = &f->opt_alloc_segments[si];
@@ -1264,7 +1302,7 @@ static void prepare_split_edge_materialization(Func* f, EdgeMatPlan* plan) {
}
for (PReg v = 1; v < opt_reg_count(f); ++v) {
- if (f->preg_info[v].alloc_kind != OPT_ALLOC_SPLIT) continue;
+ if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue;
for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
si = f->opt_alloc_segments[si].next) {
OptAllocSegment* s = &f->opt_alloc_segments[si];
@@ -1481,6 +1519,59 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
f->opt_rewritten = 1;
}
+static Block* lower_copy_blocks(Func* f) {
+ Block* blocks = arena_zarray(f->arena, Block, f->nblocks ? f->nblocks : 1u);
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* dst = &blocks[b];
+ Block* src = &f->blocks[b];
+ *dst = *src;
+ if (src->ninsts) {
+ dst->insts = arena_array(f->arena, Inst, src->ninsts);
+ memcpy(dst->insts, src->insts, sizeof(Inst) * src->ninsts);
+ dst->cap = src->ninsts;
+ }
+ if (src->nsucc) {
+ dst->succ = arena_array(f->arena, u32, src->nsucc);
+ memcpy(dst->succ, src->succ, sizeof(u32) * src->nsucc);
+ dst->succ_cap = src->nsucc;
+ }
+ if (src->npreds) {
+ dst->preds = arena_array(f->arena, u32, src->npreds);
+ memcpy(dst->preds, src->preds, sizeof(u32) * src->npreds);
+ }
+ }
+ return blocks;
+}
+
+void opt_lower_to_mir(Func* f, const OptLiveInfo* live_info) {
+ if (!f) return;
+ Func phys = *f;
+ phys.blocks = lower_copy_blocks(f);
+ phys.opt_rewritten = 0;
+ phys.mir = NULL;
+
+ rewrite_func(&phys, live_info);
+
+ MFunc* m = arena_zarray(f->arena, MFunc, 1);
+ m->blocks = phys.blocks;
+ m->nblocks = phys.nblocks;
+ m->entry = phys.entry;
+ m->emit_order_n = phys.emit_order_n;
+ m->emit_order_cap = phys.emit_order_n;
+ m->emit_order =
+ arena_array(f->arena, u32, phys.emit_order_n ? phys.emit_order_n : 1u);
+ if (phys.emit_order_n)
+ memcpy(m->emit_order, phys.emit_order, sizeof(u32) * phys.emit_order_n);
+
+ f->mir = m;
+ f->frame_slots = phys.frame_slots;
+ f->nframe_slots = phys.nframe_slots;
+ f->frame_slots_cap = phys.frame_slots_cap;
+ f->next_inst_id = phys.next_inst_id;
+ f->opt_rewrite_inserted_insts = phys.opt_rewrite_inserted_insts;
+ f->opt_rewrite_live_words_touched = phys.opt_rewrite_live_words_touched;
+}
+
static void rewrite_dump_sb(Writer* w, const StrBuf* sb) {
cfree_writer_write(w, strbuf_cstr(sb), strbuf_len(sb));
}
@@ -1581,7 +1672,112 @@ void opt_dead_def_elim(Func* f) {
opt_dead_def_elim_with_live(f, &live);
}
-void opt_regalloc(Func* f, int allow_live_range_split) {
+/* Collect the def and use PRegs of one instruction (operands + aux fan-out). */
+typedef struct AllocVerifyRefs {
+ PReg defs[256];
+ u32 ndefs;
+ PReg uses[256];
+ u32 nuses;
+} AllocVerifyRefs;
+
+static void alloc_verify_collect(Func* f, Inst* in, Operand* op, int is_def,
+ void* ctx) {
+ (void)in;
+ AllocVerifyRefs* r = (AllocVerifyRefs*)ctx;
+ PReg p;
+ if (!op || op->kind != OPK_REG) return;
+ p = (PReg)op->v.reg;
+ if (p == 0 || p >= opt_reg_count(f)) return;
+ if (is_def) {
+ if (r->ndefs < 256u) r->defs[r->ndefs++] = p;
+ } else {
+ if (r->nuses < 256u) r->uses[r->nuses++] = p;
+ }
+}
+
+/* Post-allocation interference verifier for the O1 (no-coalesce) path. Since
+ * O1 never coalesces, two *distinct* PRegs that are live simultaneously must
+ * never share a hard register. Re-derive per-instruction liveness from the
+ * block live-out sets and panic if any definition collides with a different
+ * live value in the same hard reg — turning a silent miscompile into a hard
+ * error. */
+static void opt_verify_alloc(Func* f, const OptLiveInfo* live) {
+ u32 nregs = opt_reg_count(f);
+ u8* cur;
+ if (nregs <= 1u || !live) return;
+ ParamIncomingRegs incoming;
+ collect_param_incoming_regs(f, &incoming);
+ for (PReg v = 1; v < nregs; ++v) {
+ OptPRegInfo* vi = &f->preg_info[v];
+ if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD || vi->use_freq == 0)
+ continue;
+ u8 cls = opt_preg_loc_cls(f, v);
+ Reg hard = opt_preg_hard_reg(f, v);
+ for (u32 i = 0; i < incoming.nall; ++i) {
+ if (cls == incoming.all_cls[i] && hard == incoming.all_regs[i]) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(f->c, loc,
+ "opt regalloc: O1 preg %u left in incoming cls%u reg%u",
+ (unsigned)v, (unsigned)cls, (unsigned)hard);
+ }
+ }
+ }
+ cur = arena_array(f->arena, u8, nregs);
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ const OptBlockLive* lb = &live->blocks[b];
+ memset(cur, 0, nregs);
+ for (PReg p = 1; p < nregs; ++p)
+ if (opt_bitset_has(&lb->live_out, p)) cur[p] = 1u;
+ for (u32 ri = bl->ninsts; ri > 0; --ri) {
+ Inst* in = &bl->insts[ri - 1u];
+ AllocVerifyRefs refs;
+ refs.ndefs = 0;
+ refs.nuses = 0;
+ opt_walk_inst_operands(f, in, alloc_verify_collect, &refs);
+ for (u32 k = 0; k < refs.ndefs; ++k) {
+ PReg d = refs.defs[k];
+ u8 d_kind = opt_preg_alloc_kind(f, d);
+ if (d_kind != OPT_ALLOC_HARD && d_kind != OPT_ALLOC_SPILL)
+ continue;
+ for (PReg p = 1; p < nregs; ++p) {
+ u8 p_kind;
+ if (!cur[p] || p == d) continue;
+ p_kind = opt_preg_alloc_kind(f, p);
+ if (p_kind == OPT_ALLOC_HARD && d_kind == OPT_ALLOC_HARD &&
+ opt_preg_loc_cls(f, p) == opt_preg_loc_cls(f, d) &&
+ opt_preg_hard_reg(f, p) == opt_preg_hard_reg(f, d)) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(f->c, loc,
+ "opt regalloc: O1 interference — pregs %u and %u "
+ "share cls%u reg%u, both live at block %u inst %u "
+ "(op %u)",
+ (unsigned)d, (unsigned)p,
+ (unsigned)opt_preg_loc_cls(f, d),
+ (unsigned)opt_preg_hard_reg(f, d), b, ri - 1u,
+ (unsigned)in->op);
+ }
+ if (p_kind == OPT_ALLOC_SPILL && d_kind == OPT_ALLOC_SPILL &&
+ opt_preg_spill_slot(f, p) == opt_preg_spill_slot(f, d)) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(f->c, loc,
+ "opt regalloc: O1 interference — pregs %u and %u "
+ "share spill slot %u, both live at block %u inst %u "
+ "(op %u)",
+ (unsigned)d, (unsigned)p,
+ (unsigned)opt_preg_spill_slot(f, d), b, ri - 1u,
+ (unsigned)in->op);
+ }
+ }
+ }
+ for (u32 k = 0; k < refs.ndefs; ++k) cur[refs.defs[k]] = 0u;
+ for (u32 k = 0; k < refs.nuses; ++k) cur[refs.uses[k]] = 1u;
+ }
+ }
+}
+
+static void opt_regalloc_place(Func* f, int allow_live_range_split,
+ OptLiveInfo* live_out) {
metrics_scope_begin(f->c, "opt.live_ranges.regalloc");
OptLiveInfo live;
opt_live_blocks(f, &live);
@@ -1615,6 +1811,18 @@ void opt_regalloc(Func* f, int allow_live_range_split) {
OptAllocator alloc;
opt_assign_ranges(f, &ranges, &alloc, allow_live_range_split);
+ if (!allow_live_range_split) opt_verify_alloc(f, &live);
+ if (live_out) *live_out = live;
+}
+
+void opt_regalloc_locations(Func* f, int allow_live_range_split,
+ OptLiveInfo* live_out) {
+ opt_regalloc_place(f, allow_live_range_split, live_out);
+}
+
+void opt_regalloc(Func* f, int allow_live_range_split) {
+ OptLiveInfo live;
+ opt_regalloc_place(f, allow_live_range_split, &live);
EdgeMatPlan edge_mats;
memset(&edge_mats, 0, sizeof edge_mats);
if (allow_live_range_split)
diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c
@@ -75,7 +75,7 @@ static void asm_prepare_constraints(Func* f, CGTarget* target, IRAsmAux* aux) {
static int call_plan_replay_supported(const IRCallAux* aux,
const CGTarget* target);
-void opt_machinize(Func* f, CGTarget* target) {
+static void machinize_reset(Func* f, CGTarget* target) {
f->opt_target = target->c->target;
f->opt_has_target = 1;
for (u32 c = 0; c < OPT_REG_CLASSES; ++c) {
@@ -88,7 +88,9 @@ void opt_machinize(Func* f, CGTarget* target) {
f->opt_arg_regs[c] = 0;
f->opt_ret_regs[c] = 0;
}
+}
+static void machinize_prepare_insts(Func* f, CGTarget* target) {
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
for (u32 i = 0; i < bl->ninsts; ++i) {
@@ -105,27 +107,30 @@ void opt_machinize(Func* f, CGTarget* target) {
}
}
}
+}
+static void machinize_collect_regs(Func* f, CGTarget* target) {
for (u32 c = 0; c < OPT_REG_CLASSES; ++c) {
const CGPhysRegInfo* phys = NULL;
u32 nphys = 0;
if (target->get_phys_regs)
target->get_phys_regs(target, (RegClass)c, &phys, &nphys);
if (phys) {
- for (u32 i = 0; i < nphys && i < OPT_MAX_HARD_REGS; ++i) {
- CGPhysRegInfo pi = phys[i];
- Reg hr = pi.reg;
+ u32 phys_limit = nphys < OPT_MAX_HARD_REGS ? nphys : OPT_MAX_HARD_REGS;
+ for (u32 i = 0; i < phys_limit; ++i) {
+ Reg hr = phys[i].reg;
+ u16 flags = phys[i].flags;
if (hr < 32u) {
- if (pi.flags & CG_REG_CALLER_SAVED)
+ if (flags & CG_REG_CALLER_SAVED)
f->opt_caller_saved[c] |= 1u << hr;
- if (pi.flags & CG_REG_CALLEE_SAVED)
+ if (flags & CG_REG_CALLEE_SAVED)
f->opt_callee_saved[c] |= 1u << hr;
- if (pi.flags & CG_REG_RESERVED) f->opt_reserved_regs[c] |= 1u << hr;
- if (pi.flags & CG_REG_ARG) f->opt_arg_regs[c] |= 1u << hr;
- if (pi.flags & CG_REG_RET) f->opt_ret_regs[c] |= 1u << hr;
+ if (flags & CG_REG_RESERVED) f->opt_reserved_regs[c] |= 1u << hr;
+ if (flags & CG_REG_ARG) f->opt_arg_regs[c] |= 1u << hr;
+ if (flags & CG_REG_RET) f->opt_ret_regs[c] |= 1u << hr;
}
- f->opt_phys_regs[c][f->opt_phys_reg_count[c]++] = pi;
- if ((pi.flags & CG_REG_ALLOCABLE) && !(pi.flags & CG_REG_RESERVED)) {
+ f->opt_phys_regs[c][f->opt_phys_reg_count[c]++] = phys[i];
+ if ((flags & CG_REG_ALLOCABLE) && !(flags & CG_REG_RESERVED)) {
f->opt_hard_regs[c][f->opt_hard_reg_count[c]++] = hr;
}
}
@@ -134,7 +139,8 @@ void opt_machinize(Func* f, CGTarget* target) {
u32 nhard = 0;
if (target->get_allocable_regs)
target->get_allocable_regs(target, (RegClass)c, &hard, &nhard);
- for (u32 i = 0; i < nhard && i < OPT_MAX_HARD_REGS; ++i)
+ u32 hard_limit = nhard < OPT_MAX_HARD_REGS ? nhard : OPT_MAX_HARD_REGS;
+ for (u32 i = 0; i < hard_limit; ++i)
f->opt_hard_regs[c][f->opt_hard_reg_count[c]++] = hard[i];
}
@@ -142,7 +148,9 @@ void opt_machinize(Func* f, CGTarget* target) {
u32 nscratch = 0;
if (target->get_scratch_regs)
target->get_scratch_regs(target, (RegClass)c, &scratch, &nscratch);
- for (u32 i = 0; i < nscratch && i < OPT_MAX_SCRATCH_REGS; ++i)
+ u32 scratch_limit =
+ nscratch < OPT_MAX_SCRATCH_REGS ? nscratch : OPT_MAX_SCRATCH_REGS;
+ for (u32 i = 0; i < scratch_limit; ++i)
f->opt_scratch_regs[c][f->opt_scratch_reg_count[c]++] = scratch[i];
if (!phys && target->is_caller_saved) {
@@ -152,10 +160,15 @@ void opt_machinize(Func* f, CGTarget* target) {
f->opt_caller_saved[c] |= (1u << hr);
}
}
- if (target->callee_save_mask)
- f->opt_callee_saved[c] |= target->callee_save_mask(target, (RegClass)c);
+ u32* callee_saved = &f->opt_callee_saved[c];
+ if (target->callee_save_mask) {
+ u32 mask = target->callee_save_mask(target, (RegClass)c);
+ *callee_saved |= mask;
+ }
}
+}
+static void machinize_check_overlap(Func* f) {
for (u32 c = 0; c < OPT_REG_CLASSES; ++c) {
for (u32 i = 0; i < f->opt_hard_reg_count[c]; ++i) {
Reg hr = f->opt_hard_regs[c][i];
@@ -172,6 +185,13 @@ void opt_machinize(Func* f, CGTarget* target) {
}
}
+void opt_machinize(Func* f, CGTarget* target) {
+ machinize_reset(f, target);
+ machinize_prepare_insts(f, target);
+ machinize_collect_regs(f, target);
+ machinize_check_overlap(f);
+}
+
static int call_plan_replay_supported(const IRCallAux* aux,
const CGTarget* target) {
if (!aux || !aux->plan_valid || !target || !target->emit_call_plan) return 0;
diff --git a/src/opt/pass_mir.c b/src/opt/pass_mir.c
@@ -0,0 +1,103 @@
+#include "opt/opt_internal.h"
+
+static int mir_view(Func* f, Func* out) {
+ if (!f || !f->mir) return 0;
+ *out = *f;
+ out->blocks = f->mir->blocks;
+ out->nblocks = f->mir->nblocks;
+ out->entry = f->mir->entry;
+ out->emit_order = f->mir->emit_order;
+ out->emit_order_n = f->mir->emit_order_n;
+ out->emit_order_cap = f->mir->emit_order_cap;
+ out->opt_rewritten = 1;
+ out->mir = NULL;
+ return 1;
+}
+
+static void mir_commit(Func* f, const Func* view) {
+ if (!f || !f->mir || !view) return;
+ f->mir->blocks = view->blocks;
+ f->mir->nblocks = view->nblocks;
+ f->mir->entry = view->entry;
+ f->mir->emit_order = view->emit_order;
+ f->mir->emit_order_n = view->emit_order_n;
+ f->mir->emit_order_cap = view->emit_order_cap;
+}
+
+static void mir_verify_reg(Func* f, const char* stage, Reg r, u8 cls,
+ const char* what) {
+ if (r == (Reg)REG_NONE || cls >= OPT_REG_CLASSES ||
+ r >= OPT_MAX_HARD_REGS) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(f->c, loc, "opt MIR verify %s: bad %s cls%u reg%u",
+ stage ? stage : "?", what, (unsigned)cls, (unsigned)r);
+ }
+}
+
+static void mir_verify_operand(Func* f, Inst* in, Operand* op, int is_def,
+ void* arg) {
+ (void)in;
+ (void)is_def;
+ const char* stage = (const char*)arg;
+ if (!op) return;
+ if (op->kind == OPK_REG) {
+ mir_verify_reg(f, stage, op->v.reg, op->cls, "register operand");
+ } else if (op->kind == OPK_INDIRECT) {
+ mir_verify_reg(f, stage, op->v.ind.base, RC_INT, "indirect base");
+ if (op->v.ind.index != (Reg)REG_NONE)
+ mir_verify_reg(f, stage, op->v.ind.index, RC_INT, "indirect index");
+ } else if (op->kind == OPK_LOCAL) {
+ if (op->v.frame_slot == FRAME_SLOT_NONE ||
+ op->v.frame_slot > f->nframe_slots) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(f->c, loc, "opt MIR verify %s: bad frame slot %u",
+ stage ? stage : "?", (unsigned)op->v.frame_slot);
+ }
+ }
+}
+
+void opt_mir_verify(Func* f, const char* stage) {
+ Func v;
+ if (!mir_view(f, &v)) return;
+ for (u32 b = 0; b < v.nblocks; ++b) {
+ Block* bl = &v.blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ Inst* in = &bl->insts[i];
+ if ((IROp)in->op == IR_PHI) {
+ SrcLoc loc = in->loc;
+ compiler_panic(f->c, loc, "opt MIR verify %s: phi survived lowering",
+ stage ? stage : "?");
+ }
+ if ((IROp)in->op == IR_PARAM_DECL) continue;
+ opt_walk_inst_operands(&v, in, mir_verify_operand, (void*)stage);
+ }
+ }
+}
+
+void opt_mir_combine(Func* f) {
+ Func v;
+ if (!mir_view(f, &v)) return;
+ opt_combine(&v);
+ mir_commit(f, &v);
+}
+
+void opt_mir_dce(Func* f) {
+ Func v;
+ if (!mir_view(f, &v)) return;
+ opt_dce(&v);
+ mir_commit(f, &v);
+}
+
+void opt_mir_build_cfg(Func* f) {
+ Func v;
+ if (!mir_view(f, &v)) return;
+ opt_build_cfg(&v);
+ mir_commit(f, &v);
+}
+
+void opt_mir_jump_cleanup(Func* f, OptJumpCleanupStage stage) {
+ Func v;
+ if (!mir_view(f, &v)) return;
+ opt_jump_cleanup(&v, stage);
+ mir_commit(f, &v);
+}
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -250,9 +250,6 @@ static Inst* emit_load_imm(Func* f, u32 b, Val dst, CfreeCgTypeId ty, i64 imm) {
static Inst* emit_scalar_input(Func* f, u32 b, Val dst, CfreeCgTypeId ty) {
Inst* in = ir_emit(f, b, IR_PARAM_DECL);
- in->opnds = arena_array(f->arena, Operand, 1);
- in->opnds[0] = op_reg_(dst, ty);
- in->nopnds = 1;
in->def = dst;
in->type = ty;
f->val_def_block[dst] = b;
@@ -1245,9 +1242,6 @@ static void add_reg_param(Func* f, PReg r, CfreeCgTypeId ty) {
ir_param_add(f, &d);
Inst* in = ir_emit(f, f->entry, IR_PARAM_DECL);
- in->opnds = arena_array(f->arena, Operand, 1);
- in->opnds[0] = op_reg_(r, ty);
- in->nopnds = 1;
in->def = r;
in->type = ty;
}
@@ -1824,6 +1818,36 @@ static void opt_block_liveness_phase1(void) {
tc_fini(&tc);
}
+static void opt_liveness_grows_high_preg_bitsets(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ Func* f = new_func(&tc);
+ u32 entry = f->entry;
+ u32 use_block = ir_block_new(f);
+ ir_note_emit(f, use_block);
+
+ Val high = 193;
+ ir_ensure_val(f, high, tc.i32, RC_INT);
+ emit_load_imm(f, entry, high, tc.i32, 9);
+ emit_br_to(f, entry, use_block);
+ emit_ret_val(f, use_block, high, tc.i32);
+
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
+
+ EXPECT(opt_bitset_has(&live.blocks[entry].live_out, high),
+ "high preg should be live-out from defining block");
+ EXPECT(opt_bitset_has(&live.blocks[use_block].live_in, high),
+ "high preg should be live-in to use block");
+ EXPECT(live.blocks[entry].live_out.active_words >= high / 64u + 1u,
+ "live-out bitset should grow to high preg");
+ EXPECT(live.blocks[use_block].live_out.active_words == 0,
+ "live-out bitset should trim after high preg dies");
+ tc_fini(&tc);
+}
+
static void opt_live_ranges_phase2(void) {
TestCtx tc;
tc_init(&tc);
@@ -2594,7 +2618,7 @@ static void opt_copy_prop_collapses_redundant_extension_chain(void) {
expect_ir_dump_eq(f,
"ir blocks=1 vals=5\n"
"block 0 preds=[] succs=[] insts=5\n"
- " 0 param_decl def=v1 opnds=[v1]\n"
+ " 0 param_decl def=v1\n"
" 1 convert def=v2 opnds=[v2,v1]\n"
" 2 convert def=v3 opnds=[v3,v1]\n"
" 3 convert def=v4 opnds=[v4,v1]\n"
@@ -3197,7 +3221,7 @@ static void opt_gvn_reuses_store_to_local_load(void) {
expect_ir_dump_eq(f,
"ir blocks=1 vals=3\n"
"block 0 preds=[] succs=[] insts=4\n"
- " 0 param_decl def=v1 opnds=[v1]\n"
+ " 0 param_decl def=v1\n"
" 1 store opnds=[local#1,v1] mem=size4 align4 "
"flags=0x0 alias=local#1\n"
" 2 load def=v2 opnds=[v2,local#1] mem=size4 align4 "
@@ -3378,8 +3402,8 @@ static void opt_gvn_preserves_load_across_unknown_store(void) {
expect_ir_dump_eq(f,
"ir blocks=1 vals=5\n"
"block 0 preds=[] succs=[] insts=6\n"
- " 0 param_decl def=v1 opnds=[v1]\n"
- " 1 param_decl def=v2 opnds=[v2]\n"
+ " 0 param_decl def=v1\n"
+ " 1 param_decl def=v2\n"
" 2 load def=v3 opnds=[v3,local#1] mem=size4 align4 "
"flags=0x0 alias=local#1\n"
" 3 store opnds=[[v1+0],v2] mem=size4 align4 flags=0x0 "
@@ -5293,6 +5317,19 @@ static void opt_combine_sinks_or_preserves_producer_copy_after_rewrite(void) {
EXPECT(count_op(call_f, IR_COPY) == 1,
"producer-copy retargeting should not cross a call");
+ Func* call_live = new_func(&tc);
+ call_live->opt_rewritten = 1;
+ emit_phys_binop(call_live, call_live->entry, 19, 20, 21, tc.i64, BO_IADD);
+ emit_phys_copy(call_live, call_live->entry, 0, 19, tc.i64);
+ emit_call_void(call_live, call_live->entry);
+ emit_phys_binop(call_live, call_live->entry, 22, 19, 18, tc.i64, BO_IADD);
+ emit_ret_val(call_live, call_live->entry, 22, tc.i64);
+
+ opt_combine(call_live);
+ add = &call_live->blocks[call_live->entry].insts[0];
+ EXPECT(count_op(call_live, IR_COPY) == 1 && add->opnds[0].v.reg == 19,
+ "producer used after a call barrier should not sink into call arg");
+
Func* clobber = new_func(&tc);
clobber->opt_rewritten = 1;
emit_phys_binop(clobber, clobber->entry, 21, 20, 19, tc.i32, BO_IADD);
@@ -5536,6 +5573,54 @@ static void opt_combine_substitutes_into_indirect_base_and_index(void) {
"single-use copy should substitute into indirect index");
EXPECT(count_op(g, IR_COPY) == 0,
"copy whose only use is an indirect index should be dead-DCE'd");
+
+ Func* store_f = new_func(&tc);
+ store_f->opt_rewritten = 1;
+ emit_phys_copy(store_f, store_f->entry, 4, 2, tc.i32);
+ Inst* st = emit_store_indirect(store_f, store_f->entry, 4, 1, tc.i32, 0);
+ st->opnds[0].v.ind.ofs = 8;
+ ir_emit(store_f, store_f->entry, IR_RET);
+
+ opt_combine(store_f);
+ st = NULL;
+ for (u32 i = 0; i < store_f->blocks[store_f->entry].ninsts; ++i)
+ if ((IROp)store_f->blocks[store_f->entry].insts[i].op == IR_STORE)
+ st = &store_f->blocks[store_f->entry].insts[i];
+ EXPECT(st && st->opnds[0].kind == OPK_INDIRECT &&
+ st->opnds[0].v.ind.base == 4,
+ "copy substitution should not rewrite store address bases");
+ tc_fini(&tc);
+}
+
+static void opt_combine_copy_chain_source_reuse_blocks_indirect_subst(void) {
+ TestCtx tc;
+ tc_init(&tc);
+
+ Func* f = new_func(&tc);
+ f->opt_rewritten = 1;
+ emit_phys_copy(f, f->entry, 7, 8, tc.i64);
+ emit_phys_copy(f, f->entry, 14, 7, tc.i64);
+ Operand off_addr = op_indexed_indirect_(13, 12, 0, 0, tc.i32);
+ tt_emit_load_phys_indirect(f, f->entry, 8, off_addr, tc.i32);
+ emit_phys_binop(f, f->entry, 7, 21, 8, tc.i32, BO_IADD);
+ emit_phys_copy(f, f->entry, 8, 7, tc.i32);
+ emit_phys_copy(f, f->entry, 13, 8, tc.i32);
+ Operand field_addr = op_indirect_(14, tc.i64);
+ field_addr.v.ind.ofs = 8;
+ tt_emit_load_phys_indirect(f, f->entry, 12, field_addr, tc.i64);
+ emit_ret_val(f, f->entry, 12, tc.i64);
+
+ opt_combine(f);
+ Inst* load = NULL;
+ for (u32 i = 0; i < f->blocks[f->entry].ninsts; ++i)
+ if ((IROp)f->blocks[f->entry].insts[i].op == IR_LOAD &&
+ f->blocks[f->entry].insts[i].opnds[0].v.reg == 12)
+ load = &f->blocks[f->entry].insts[i];
+
+ EXPECT(load && load->opnds[1].kind == OPK_INDIRECT &&
+ load->opnds[1].v.ind.base == 14,
+ "copy-chain indirect substitution must not cross reuse of source reg");
+
tc_fini(&tc);
}
@@ -6959,6 +7044,7 @@ int main(void) {
opt_pressure_relief_sinks_many_immediates_in_one_pass();
opt_liveness_branch();
opt_block_liveness_phase1();
+ opt_liveness_grows_high_preg_bitsets();
opt_live_ranges_phase2();
opt_range_liveness_linear();
opt_interference_branch_disjoint();
@@ -6991,6 +7077,7 @@ int main(void) {
opt_combine_keeps_unsafe_and_multiuse_defs();
opt_combine_copy_chains_and_convert_pairs();
opt_combine_substitutes_into_indirect_base_and_index();
+ opt_combine_copy_chain_source_reuse_blocks_indirect_subst();
opt_combine_synthesizes_address_modes();
opt_combine_full_ext_of_ext_rules();
opt_dce_physical_dead_defs();
diff --git a/test/toy/run.sh b/test/toy/run.sh
@@ -355,6 +355,9 @@ if [ ! -x "$CFREE" ]; then
exit 2
fi
+printf 'toy: CFREE=%s\n' "$CFREE"
+printf 'toy: PATHS=%s OPT_LEVELS=%s\n' "$PATHS" "$TOY_OPT_LEVELS"
+
if [ $RUN_X -eq 1 ]; then
have_podman=0
command -v podman >/dev/null 2>&1 && have_podman=1