kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 2f677dcf2322c6276eb432fbadbfe5d3a8fdbad1
parent d37d6b3f8c61a24a3b2f259c22817b69525bf27d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun, 24 May 2026 23:12:56 -0700

opt pipeline redesign

Diffstat:
MMakefile | 2+-
Mdoc/EMU.md | 1055++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
Mdoc/OPT_PERF.md | 112+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
Adoc/OPTv2.md | 937+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdriver/ar.c | 20+++++++++++---------
Mdriver/runtime.c | 56+++++++++++++++++++++++++++++++++++++++-----------------
Mlang/c/parse/parse_init.c | 34++++++++++++++++++++++++++--------
Mlang/c/parse/parse_type.c | 32+++++++++++++++++++-------------
Msrc/arch/aa64/ops.c | 12+++++++++++-
Msrc/arch/c_target/target.c | 9+--------
Msrc/arch/rv64/ops.c | 12+++++++++++-
Msrc/arch/x64/ops.c | 13++++++++++++-
Msrc/opt/ir.h | 24++++++++++++++++++++++++
Msrc/opt/opt.c | 101+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
Msrc/opt/opt.h | 22+++++++++-------------
Msrc/opt/opt_internal.h | 45+++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/opt_util.c | 15+++++++++++++++
Msrc/opt/pass_analysis.c | 281+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/pass_combine.c | 49+++++++++++++++++++++++++++++++++++++++++++++----
Msrc/opt/pass_emit.c | 140+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Msrc/opt/pass_hard_live.c | 8++++++--
Msrc/opt/pass_live.c | 121++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Msrc/opt/pass_lower.c | 356++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Msrc/opt/pass_machinize.c | 50+++++++++++++++++++++++++++++++++++---------------
Asrc/opt/pass_mir.c | 103+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/opt/opt_test.c | 107+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
Mtest/toy/run.sh | 3+++
27 files changed, 3090 insertions(+), 629 deletions(-)

diff --git a/Makefile b/Makefile @@ -81,7 +81,7 @@ LIB_OBJS = $(patsubst src/%.c,$(BUILD_DIR)/lib/%.o,$(LIB_SRCS)) \ $(LANG_OBJS) \ $(patsubst src/%.S,$(BUILD_DIR)/lib/%.o,$(LIB_ASMS)) LIB_DEPS = $(LIB_OBJS:.o=.d) -LIB_AR_STAGING = $(BUILD_DIR)/ar/libcfree +LIB_AR_STAGING = $(BUILD_DIR)/.ar-staging/libcfree DRIVER_SRCS = $(wildcard driver/*.c) ifneq ($(CFREE_LANG_CPP_ENABLED),1) diff --git a/doc/EMU.md b/doc/EMU.md @@ -1,355 +1,758 @@ # cfree emu design -Architecture of `cfree emu`, the guest-ISA emulator. Companion to -`DESIGN.md`. Scope: how the emulator slots into the existing pipeline and -what its contracts are. Not a tutorial; not implementation notes. - -## 1. Goals - -- `emu` multi-call subcommand: load and execute a guest ELF on the host, - user-mode only (Linux/macOS userland; no full-system emulation). -- Targets v1: aarch64, riscv64. 32-bit variants follow each 64-bit lift. - x86_64 deferred (flag-heavy ISA — exercise lazy flags on simpler arches - first). -- Per-basic-block JIT translation: decode guest bytes → lift to `CG` → opt - (optional) → MCEmitter → ObjBuilder → incremental link into a single - growing `LinkImage` → execute. -- Block chaining for hot paths; cold blocks may run direct-from-CG (no opt - wrapper) for translation throughput. -- Source-level stepping through `dbg` when the guest ELF carries DWARF. -- Self-hosting: `src/emu/` is C11 freestanding like the rest of `src/`. - -The lifter is a sibling to `parse_c`: both are frontends that consume input -bytes and drive `CG`. Everything below `CG` (`opt`, `arch`, `obj`) is -unchanged. `link/` requires one extension — incremental resolve -(`link_resolve_extend`, §6) — landed *before* emu work begins, so emu has a -single lifecycle model end-to-end and never carries a "per-block fresh -`LinkImage`" interim shape. - -## 2. Non-goals (v1) - -- Full-system emulation (privileged ISA, MMU, devices). -- SIMD/vector ISA extensions (SSE/AVX/NEON/RVV) — blocked on `CGTarget` - lacking vector ops (§DESIGN 5.7). Programs using them either trap to a - scalarizer or fail to lift. -- x86 in v1. -- Self-modifying code (refuse to lift on observed write to a translated - page; full support is future work). -- Per-instruction precise exceptions / signal redirection. -- Foreign-OS syscalls — only the host OS's syscalls are forwarded. - -## 3. Layout +This document describes the target design for `cfree emu`: a user-mode +guest executable runner built out of the same registries and pipeline +boundaries used by the rest of cfree. + +This is a design document, not a description of the current prototype. +The prototype can be discarded or migrated behind these interfaces. + +## Goals + +- Run guest user-mode executables in-process. +- Keep all format, architecture, language, and OS knowledge behind the + registry/vtable system described in `doc/REGISTRY.md`. +- Treat the guest ISA lifter as a frontend: decode guest bytes, emit CG, + then reuse the existing opt, backend, object, link, and JIT pipeline. +- Support a non-executable-memory execution mode by interpreting lifted IR. +- Keep executable loading separate from object building. Loading maps a + guest process image; lifting/codegen produces new host objects. + +## Non-goals + +- Full-system emulation: no privileged ISA, devices, kernel, or MMU. +- Built-in host policy for outside effects. Syscalls, dynamic imports, + filesystem access, clocks, signals, and other outside interactions are + delegated to emulator/embedder-provided bindings. +- Self-modifying code in v1. The first implementation may trap or refuse + writes to translated pages. +- Source-language frontend involvement. `emu` starts from a binary image; + language registries are relevant only because they share the same + registry discipline. + +## Top-level Shape + +`emu` owns process-level orchestration. It should not own ELF, Mach-O, +PE/COFF, RISC-V, AArch64, or Linux semantics directly. + +The expected flow is: + +```text +guest executable bytes + -> obj-format executable loader + -> EmuProcess { EmuImage, EmuAddrSpace, initial CPU state } + -> arch decoder: bytes at guest PC -> CfreeDecodedInsn[] + -> arch lifter: CfreeDecodedInsn[] -> CG function + -> optional opt + -> either IR interpreter or backend/JIT + -> dispatch loop and runtime helpers +``` + +The important type split is: + +- `ObjFile` / object readers: inspect binary formats. +- `EmuImage`: loaded executable image and metadata. +- `EmuProcess`: address space, threads, CPU state, OS ABI state. +- `ObjBuilder`: generated host object from lifted guest code. +- `LinkImage` / `CfreeJit`: resolved/generated host code. + +Executable loading does not produce an `ObjBuilder`. `ObjBuilder` appears +only after the lifter emits CG for a translated guest block. + +## Registry Integration + +The registry direction in `doc/REGISTRY.md` should extend naturally to +`emu`. + +### Object Format Vtable + +Object/image format implementations own executable file parsing and load +commands/program headers. `emu` asks the selected object format to load a +process image. +Implementation location: + +- ELF loader lives with the ELF object/image code. +- Mach-O loader lives with Mach-O object/image code. +- PE/COFF loader lives with PE/COFF object/image code. + +Sketch: + +```c +typedef struct EmuLoadOptions { + CfreeSlice name; + CfreeSlice bytes; + const char *const *argv; + const char *const *envp; + const CfreeOsImpl *os; +} EmuLoadOptions; + +typedef struct EmuLoadedImage { + CfreeTarget guest_target; + uint64_t entry_pc; + uint64_t phdr_vaddr; + uint64_t base_vaddr; + EmuAddrSpace *addr_space; + EmuImageSymbols symbols; + EmuImageDebug debug; +} EmuLoadedImage; + +typedef struct ObjFormatEmuOps { + CfreeStatus (*detect_executable)(CfreeCompiler *, CfreeSlice bytes, + CfreeTarget *target_out); + CfreeStatus (*load_executable)(CfreeCompiler *, const EmuLoadOptions *, + EmuLoadedImage *out); +} ObjFormatEmuOps; + +typedef struct ObjFormatImpl { + /* Existing object-format operations from doc/REGISTRY.md. */ + CfreeObjFmt kind; + const char *name; + /* ... object read/emit, DSO read, link-image emit ... */ + + const ObjFormatEmuOps *emu; +} ObjFormatImpl; ``` -src/ - emu/ - emu.h driver-facing API - decode/ per-ISA structured decoder (shared tables with objdump) - aarch64.c - riscv64.c - lift/ per-ISA lifter; drives CG - aarch64.c - riscv64.c - cpu.h per-arch CPUState struct synthesis - runtime.c dispatcher, code cache, block chaining - syscall/ per-host-OS syscall forwarders (linux.c, darwin.c) -test/ - emu/ guest binary corpus + behavioral oracles + +The loader maps segments into an `EmuAddrSpace` and records metadata. +It does not construct source-like objects and it does not lower code. + +For ELF, loading is program-header driven. Sections may be used for +symbols and debug information, but PT_LOAD, PT_INTERP, TLS, dynamic +tables, and auxv data are the executable-loading contract. + +### OS Vtable + +`emu` needs OS-specific user ABI behavior that does not belong in an +architecture or object format module. + +Implementation location: + +- Linux behavior lives with the Linux OS implementation. +- Darwin behavior lives with the Darwin OS implementation. +- Windows behavior lives with the Windows OS implementation. + +Sketch: + +```c +typedef struct CfreeOsImpl { + CfreeOSKind kind; + const char *name; + + const ABIVtable *(*abi_for_arch)(CfreeCompiler *, CfreeArchKind); + + CfreeStatus (*emu_init_process)(CfreeCompiler *, EmuProcess *, + const EmuLoadOptions *, + const EmuLoadedImage *); + CfreeStatus (*emu_init_thread)(CfreeCompiler *, EmuProcess *, + EmuThread *); + CfreeStatus (*emu_decode_syscall)(EmuProcess *, EmuThread *, + EmuSyscallRequest *out); + CfreeStatus (*emu_encode_syscall_result)(EmuProcess *, EmuThread *, + const EmuSyscallResult *); + CfreeStatus (*emu_deliver_signal)(EmuProcess *, EmuThread *, + const EmuSignalEvent *); +} CfreeOsImpl; + +const CfreeOsImpl *os_lookup(CfreeOSKind); +``` + +The OS vtable owns: + +- initial stack shape: argv, envp, auxv, alignment; +- syscall number table and argument/result ABI decoding; +- errno, signal-frame, and signal-delivery ABI conventions; +- TLS and dynamic-loader process conventions; +- OS-specific pages such as vdso-like regions, if supported. + +It does not issue host syscalls, open files, read clocks, resolve host +symbols, or deliver host signals directly. It translates between guest OS +ABI state and emulator-level requests. The emulator/embedder bindings +decide what those requests mean. + +This also makes the ABI axis in `doc/REGISTRY.md` cleaner: ABI selection +can remain derived, but OS behavior has an explicit registry home. + +### Outside Interaction Bindings + +All interaction outside the emulated process is delegated to bindings +provided by the embedding application or by the `cfree emu` driver. This +keeps libcfree policy-free: the library describes requests, the embedder +chooses whether to service, virtualize, deny, record, or replay them. + +Sketch: + +```c +typedef struct EmuSyscallRequest { + uint64_t number; + uint64_t args[6]; +} EmuSyscallRequest; + +typedef struct EmuSyscallResult { + int64_t result; + int32_t guest_errno; + uint32_t flags; +} EmuSyscallResult; + +typedef struct EmuImportRequest { + CfreeSlice object_name; + CfreeSlice symbol_name; + uint32_t bind_flags; +} EmuImportRequest; + +typedef struct EmuExternalBindings { + CfreeStatus (*syscall)(void *user, EmuProcess *, EmuThread *, + const EmuSyscallRequest *, + EmuSyscallResult *out); + CfreeStatus (*resolve_import)(void *user, EmuProcess *, + const EmuImportRequest *, + EmuResolvedImport *out); + CfreeStatus (*signal)(void *user, EmuProcess *, EmuThread *, + const EmuSignalEvent *); + CfreeStatus (*trace)(void *user, EmuProcess *, const EmuTraceEvent *); + void *user; +} EmuExternalBindings; ``` -`src/emu/` is a sibling to `src/parse/`. The runtime helpers live in the -cfree tool itself — the JIT's `LinkExternResolver` (§DESIGN 5.5.1) returns -the host address of `emu_load64`, `emu_syscall`, etc. directly; no -separate runtime object. +The OS vtable is still required for syscalls because it knows how to read +and write guest registers, errno, restart behavior, and signal frames. +But after decoding a syscall into `EmuSyscallRequest`, it calls the +binding rather than a host syscall API. -## 4. Dataflow +Dynamic linking follows the same rule. The object-format loader and OS +dynamic-linker support parse imports, relocations, PLT/GOT conventions, +and loader metadata; binding `resolve_import` decides what a requested +external symbol resolves to. A binding may point to a native host +function, an emulated thunk, a synthetic guest address, or a denial +stub. The core emulator should not hard-code libc, filesystem, network, +or host process behavior. +### Architecture Vtable + +Architecture implementations own guest instruction knowledge. `emu` +should not have a private decoder per ISA. + +Implementation location: + +- `src/arch/rv64/` owns RV64 decode, formatting, lifter hooks, and + CPU-state schema. +- `src/arch/aa64/` owns AArch64 equivalents. +- `src/arch/x64/` can add guest support later without changing `emu`. + +Sketch: + +```c +typedef enum CfreeDecodeFlag { + CFREE_DECODE_TERMINATOR = 1u << 0, + CFREE_DECODE_BRANCH = 1u << 1, + CFREE_DECODE_CALL = 1u << 2, + CFREE_DECODE_RET = 1u << 3, + CFREE_DECODE_MEMORY = 1u << 4, + CFREE_DECODE_TRAP = 1u << 5, +} CfreeDecodeFlag; + +typedef enum CfreeDecodedOperandKind { + CFREE_DECOP_NONE, + CFREE_DECOP_REG, + CFREE_DECOP_IMM, + CFREE_DECOP_MEM, + CFREE_DECOP_PCREL, + CFREE_DECOP_SYSREG, +} CfreeDecodedOperandKind; + +typedef struct CfreeDecodedOperand { + uint8_t kind; + uint8_t width_bits; + uint16_t flags; + uint32_t reg; + uint32_t index_reg; + int64_t imm; + uint8_t scale; +} CfreeDecodedOperand; + +typedef struct CfreeDecodedInsn { + uint64_t pc; + const uint8_t *bytes; + uint8_t nbytes; + uint8_t noperands; + uint16_t flags; + uint32_t opcode; /* Arch-owned stable opcode enum. */ + uint32_t encoding_id; /* Optional row/table id for formatting. */ + CfreeDecodedOperand operands[CFREE_DECODE_MAX_OPERANDS]; + uint64_t arch[2]; /* Small arch-private payload. */ +} CfreeDecodedInsn; + +typedef struct ArchInsnFormatter ArchInsnFormatter; + +typedef struct ArchDecodeOps { + uint8_t min_insn_len; + uint8_t max_insn_len; + + CfreeStatus (*decode_one)(CfreeCompiler *, const uint8_t *bytes, + size_t len, uint64_t pc, + CfreeDecodedInsn *out); + CfreeStatus (*decode_block)(CfreeCompiler *, const uint8_t *bytes, + size_t len, uint64_t pc, + CfreeDecodedInsn *out, uint32_t cap, + uint32_t *n_out); + + ArchInsnFormatter *(*formatter_new)(CfreeCompiler *); + CfreeStatus (*format)(ArchInsnFormatter *, const CfreeDecodedInsn *, + CfreeInsn *out); + void (*formatter_free)(ArchInsnFormatter *); +} ArchDecodeOps; + +typedef struct ArchEmuOps { + CfreeStatus (*cpu_layout)(CfreeCompiler *, EmuCpuLayout *out); + CfreeStatus (*init_cpu)(CfreeCompiler *, EmuThread *, + const EmuLoadedImage *); + CfreeStatus (*lift_block)(CfreeCompiler *, CfreeCg *, + const CfreeDecodedInsn *, uint32_t n, + const EmuLiftContext *); +} ArchEmuOps; + +typedef struct ArchImpl { + /* Existing arch registry fields. */ + /* ... */ + + const ArchDecodeOps *decode; + const ArchEmuOps *emu; +} ArchImpl; ``` -guest.elf (bytes) ─► obj reader ─► LinkImage* (mapped into guest AS) - │ - guest_pc ─► Decoder ─► EmuInst* ─► Lifter ─► CG ─┐ - │ - CGTarget ◄─────┘ - │ - (opt?) - │ - MCEmitter ─► ObjBuilder - │ - link_jit ─► host code - │ - dispatcher(guest_pc) ──┘ + +`ArchDecodeOps` is a general arch service. It is useful to `objdump`, +`dbg`, and `emu`. `ArchEmuOps` is optional; an architecture can support +disassembly and native codegen without supporting guest emulation. + +### Language Registry + +Language frontends are not on the `emu` hot path. The only design rule is +symmetry: `emu` should follow the same registry discipline as languages, +not add parallel hard-coded switches. + +## Decoder And Disassembler Split + +The current public disassembler shape is text-oriented: + +```c +typedef struct CfreeInsn { + uint64_t vaddr; + const uint8_t *bytes; + uint32_t nbytes; + CfreeSlice mnemonic; + CfreeSlice operands; + CfreeSlice annotation; +} CfreeInsn; ``` -1. **Load.** `obj/` readers parse the guest ELF. The runtime maps loadable - segments into a *guest address space* (an mmap'd region inside the - host process); guest virtual addresses are the addresses inside that - region. -2. **Decode.** When the dispatcher hits an untranslated `guest_pc`, the - per-ISA decoder reads guest bytes and produces `EmuInst`s up to the - next basic-block terminator. Decode tables are shared with the - disassembler (objdump): same bit patterns, two output shapes. -3. **Lift.** The per-ISA lifter walks the `EmuInst` stream and emits one - synthesized C function per guest basic block: signature - `next_pc_t block(CPUState*)`. Lifter calls `cg_*` ops only. -4. **Codegen + JIT.** Standard cfree pipeline. At -O0 the emu drives a - target `CGTarget` directly (fast translation, slow execution); at -O2 - it wraps with `opt_cgtarget` (slow translation, fast execution). Both - end at `link_jit` mapping executable pages. -5. **Execute.** `runtime.c` calls the host code with the current - `CPUState*`. The block returns the next guest PC. The dispatcher looks - up the next block, translating on miss. - -## 5. Key interfaces - -### 5.1 `Decoder` (`src/emu/decode/decode.h`) - -Per-ISA structured decoder. Output is `EmuInst` — a tagged union of -ISA-specific shapes with operand fields, *not* text. +The current internal arch hook also decodes directly into that text +record: ```c -typedef struct EmuInst { - EmuOp op; /* per-ISA enum */ - u64 guest_pc; - u32 guest_bytes; /* instruction width */ - EmuOperand operands[EMU_MAX_OPERANDS]; - u32 nop; - u32 flags; /* TERMINATOR | MEM | SETS_FLAGS | ... */ -} EmuInst; - -u32 emu_decode_block(EmuArch, const u8* bytes, u64 guest_pc, - EmuInst* out, u32 max); +struct ArchDisasm { + u32 (*decode)(ArchDisasm *, const u8 *bytes, size_t len, u64 vaddr, + CfreeInsn *out); + void (*destroy)(ArchDisasm *); +}; ``` -The same decode tables back the disassembler (textual format) and the -lifter (structured). One source of truth per ISA. +That is sufficient for `objdump`, but it is the wrong interface for +`emu`. A lifter needs operand identity, widths, immediates, memory +addressing, flags behavior, and terminator classification. It should +never parse disassembly text. + +Target shape: -### 5.2 `Lifter` (`src/emu/lift/lift.h`) +```text +bytes -> ArchDecodeOps.decode_one -> CfreeDecodedInsn + -> ArchDecodeOps.format -> CfreeInsn text + -> ArchEmuOps.lift_block -> CG + -> ArchDbgOps -> breakpoint/displaced stepping helpers +``` -Per-ISA lifter. Consumes `EmuInst*`, drives `CG*`, produces one CG function -per guest basic block. +The disassembler iterator remains public and can continue returning +`CfreeInsn`. Internally, it becomes: + +```text +decode_one(..., &decoded) +format(formatter, &decoded, &public_text_insn) +apply object annotations +``` + +This keeps the public disassembler API stable while moving the reusable +decode contract into `arch/`. + +The formatter object exists because `CfreeInsn` contains string slices. +As today, those slices are valid until the next formatter call or until +the formatter is freed. + +### Operand Policy + +The decoded instruction type should be format-neutral but not force all +architectures into an artificial common ISA. The contract is: + +- `opcode` is arch-owned. RV64, AArch64, and x64 each define their own + stable opcode enums. +- `operands[]` uses common storage for common shapes: registers, + immediates, PC-relative targets, memory references, and system + registers. +- `arch[]` carries small arch-private fields such as condition codes, + rounding modes, CSR IDs, or x86 prefix state. +- Larger arch-private decode payloads can live behind an arena-backed + side table if needed; the common fast path should remain inline and + fixed-size. + +The lifter switches on the arch-owned opcode. `objdump` formats using +the same `CfreeDecodedInsn` plus the arch's formatter. `dbg` can either +reuse `CfreeDecodedInsn` directly or keep a narrow `ArchDbgInsn` wrapper +implemented on top of the shared decoder. + +### Block Decode + +`decode_block` is a convenience over `decode_one`, not a separate source +of truth. It repeatedly decodes instructions until one of these happens: + +- decoded instruction has `CFREE_DECODE_TERMINATOR`; +- output capacity is reached; +- decode fails; +- a page boundary or mapped-range boundary blocks a complete instruction. + +The dispatcher uses this to form translation units. The debugger and +disassembler can continue to decode one instruction at a time. + +## Process Model + +`EmuProcess` is the root context for emulation. All mutable state hangs +off it or below it. ```c -void emu_lift_block(EmuArch, CG* cg, - const EmuInst* insts, u32 n, - EmuLiftCtx* ctx); +typedef struct EmuProcess { + CfreeCompiler *compiler; + const ObjFormatImpl *obj_format; + const ArchImpl *guest_arch; + const CfreeOsImpl *guest_os; + + EmuAddrSpace *addr_space; + EmuLoadedImage image; + EmuRuntime runtime; + + EmuThread *threads; + uint32_t nthreads; +} EmuProcess; ``` -`EmuLiftCtx` carries: the CPUState `Type*`, the synthesized block function -type (`next_pc_t (*)(CPUState*)`), the `ObjSymId` for the block, runtime -helper symbols (memory load/store, syscall trampoline, dispatcher -tail-call), and per-block lazy-flag state (§5.5). - -The lifter targets `CG` exclusively (`src/cg/cg.h`) — never `CGTarget` -directly. It uses roughly this subset: - -- `cg_push_global(cpu_state_sym)`, `cg_push_int`, `cg_push_const` -- `cg_load`, `cg_store`, plus a small `lift_field(ctx, off, T)` helper - layered on `cg_addr` + offset arithmetic -- `cg_binop`, `cg_unop`, `cg_cmp`, `cg_convert` -- `cg_atomic_*` for guest atomics -- `cg_label_new` / `cg_label_place` / `cg_jump` / `cg_branch_true` -- `cg_call` for runtime helpers (memory access, syscalls, dispatcher - tail-calls — `cg_call` materializes ABI parts from `fn_type`, which is - the main reason CG and not CGTarget) -- `cg_set_loc` carrying the *guest* PC encoded as a `SrcLoc` against a - synthetic `SourceManager` file id (§DESIGN 5.0) - -It does not use: aggregates, bitfields, variadics, setjmp, structured -scopes, inline asm. Those C-shaped surfaces remain available to the C -front-end at zero cost. - -### 5.3 `CPUState` (`src/emu/cpu.h`) - -Per-arch C struct synthesized once per emu invocation as an interned -`Type*`. Fields: - -- General-purpose register file (`u64 x[N]`). -- Lazy-flag fields (§5.5): last op kind, last operands, materialized - flags cache. -- Pointer to the guest memory base (host pointer). -- Pointer to the dispatcher entry / code-cache lookup function. -- Trap reason / exit code slots written before returning to the - dispatcher. - -Lifters reference fields by stable offset constants generated alongside -the `Type*`. The runtime allocates one `CPUState` per guest thread and -exposes its address as an `ObjSymId` resolved externally by the JIT -linker. - -### 5.4 `Runtime` (`src/emu/runtime.h`) - -In-process runtime, linked into the cfree binary, callable from JITted -guest blocks via the JIT's external resolver. Responsibilities: - -- **Dispatcher.** `emu_run(CPUState*)`: loop { lookup guest_pc → - translate-if-cold → call block }, exits on trap. -- **Code cache.** `guest_pc → host entry` map; translation happens on - miss. Eviction deferred (cache grows unbounded in v1). -- **Reserved code region.** One up-front `PROT_NONE` mmap (~128 MB) - whose base address feeds `link_resolve_at` (§6). Per-block - `link_resolve_extend` bump-allocates within it; the runtime commits - pages and `mprotect`s RX as new blocks land. -- **Block chaining.** When a block's terminator targets an - already-translated block, patch the tail to jump directly, bypassing - the dispatcher. Patching is a runtime concern — CG/opt see only the - pre-patch tail-call. -- **Memory helpers.** `emu_load{8,16,32,64}` / `emu_store_*`: bounds-check - the guest address against the mapped guest AS, trap on miss. Lifter - emits a `cg_call` to these for every guest memory op in v1; an inline - fastpath is a follow-up (§9). -- **Syscall trampoline.** `emu_syscall(CPUState*)` reads the guest - syscall number/args from CPUState, forwards via the per-OS table in - `src/emu/syscall/`, writes the return into the guest return register. - -### 5.5 Flag policy — lazy flags - -Most ISAs (aarch64 NZCV, x86 EFLAGS) compute condition flags as a side -effect of arithmetic. CG/CGTarget have no flag primitives. The lifter -implements lazy flags entirely above CG: - -- Each flag-setting guest op writes (op_kind, lhs, rhs) into CPUState - fields. Flags are *not* recomputed eagerly. -- Each flag-reading guest op (conditional branch, `cset`, …) recomputes - the specific flag bit it needs from the recorded inputs. -- opt's GVN/DCE eliminates redundant flag computations within a block; - cross-block redundancy is recovered after inlining via block chaining. - -No CGTarget extension. Adding an ISA's flag set is a per-arch lifter -table, not a pipeline change. - -### 5.6 Memory model — guest address space - -Guest loads/stores all go through a base pointer of unknown C provenance, -so CG's default `MemAccess.alias` derivation (§DESIGN 5.6) collapses to -`ALIAS_UNKNOWN`. To recover useful aliasing, the lifter sets -`MemAccess.addr_space = EMU_GUEST_AS` on every guest memory op. opt's -alias rules treat guest-AS accesses as: - -- May alias each other. -- Do not alias C memory (CPUState fields, runtime state). - -This is a one-field convention on the existing `MemAccess` shape; no API -change. CPUState field accesses use the default `addr_space = 0` and -remain promotable / GVN'able as ordinary C memory. - -## 6. Lifecycle — per-block JIT on a single growing LinkImage - -The compiler pipeline is TU-shaped (§DESIGN 5.5.1, 9.1). The emu wants -per-block translate-on-demand. The model is a *single* `LinkImage` that -grows as cold blocks land — never a fresh image per block — so chaining, -the code cache, and the host VA region all reference one stable artifact -across the emu session. - -This requires `link/` to expose incremental resolution before any emu code -lands: +`EmuThread` owns one guest CPU state: ```c -LinkImage* link_resolve_at(Linker*, uintptr_t base_va); /* first call */ -void link_resolve_extend(Linker*, LinkImage*); /* later calls */ +typedef struct EmuThread { + EmuProcess *process; + void *cpu_state; + uint64_t pc; + uint32_t trap; + int exit_code; +} EmuThread; ``` -`link_resolve_at` reserves layout starting at a caller-specified base VA -(the runtime hands out from a pre-reserved `PROT_NONE` region, typically -~128 MB). `link_resolve_extend` appends new inputs: it places new sections -at the next free offset within the reserved region, resolves new symbols -against the existing image's symbol table plus the `LinkExternResolver`, -and applies new relocations into the live image. **It must not change -host addresses of previously placed sections** — chaining has already -patched live host code with those addresses. §DESIGN 5.5.1's existing -discipline (stable `LinkInputId`s, separable `LinkRelocApply`, -non-destructive resolution) is exactly what makes this safe. - -The per-block flow is then: - -1. Decode guest bytes for the block. -2. Lift into a fresh `ObjBuilder` containing one function. -3. `link_add_obj` against the session's `Linker`. -4. `link_resolve_extend` to place the new section in the reserved VA - region, resolve symbols (helpers via the resolver; cross-block - references always resolve to the dispatcher — see below), apply - relocations. -5. Commit the newly used pages and `mprotect` to RX. -6. Insert `guest_pc → host_entry` into the code cache. - -Cross-block calls during step 4 always resolve to the dispatcher, even if -the target block is already translated. The linker never learns about -guest PCs or sibling blocks. Cross-block direct jumps are installed -*later* by chaining, which is a runtime mprotect-and-patch operation -outside the linker entirely. Keeping these two mechanisms separate avoids -the duplication that "let the linker resolve sibling blocks" would -introduce. - -What this model deliberately does *not* support in v1: - -- **Symbol removal / re-resolution.** Code-cache eviction would need it; - v1 lets the cache grow unbounded (§5.4) so the question is moot. When - eviction lands it pairs with a runtime-side "invalidate chain patches - pointing into the evicted block" side-table, not a linker mutation. -- **Address relocation of previously placed sections.** The reserved-VA - bump-allocator never compacts; the chaining invariant depends on this. - -## 7. Driver API - -Mirrors §DESIGN 13. +No global process state is required. Registries are static immutable +tables; process state is explicit. + +## Address Space + +`EmuAddrSpace` is the only module that translates guest virtual +addresses to host pointers. + +Responsibilities: + +- map executable segments with guest permissions; +- provide checked read/write/fetch helpers; +- maintain page permissions and dirty/translated-page metadata; +- expose a bounded host pointer for decode and runtime memory helpers; +- report faults in a target-independent form. + +The lifter should not inline host pointer arithmetic initially. It emits +calls to runtime helpers: + +```c +uint8_t emu_load8 (EmuThread *, uint64_t guest_addr); +uint16_t emu_load16(EmuThread *, uint64_t guest_addr); +uint32_t emu_load32(EmuThread *, uint64_t guest_addr); +uint64_t emu_load64(EmuThread *, uint64_t guest_addr); +void emu_store8 (...); +``` + +An inline memory fast path can be added later by making the helper ABI +and address-space invariants explicit first. + +## Lifting + +The lifter emits one CG function per guest basic block. ```c -typedef struct CfreeEmuOptions { - EmuArch guest_arch; - const u8* guest_elf_bytes; - size_t guest_elf_len; - u32 optimize; /* 0 = direct CGTarget, 2 = opt_cgtarget */ - EmuTraceFlags trace; /* PC trace, decoded-inst trace */ - /* argv, envp, fd map come through CfreeEnv */ -} CfreeEmuOptions; - -int cfree_emu_run (Compiler*, CfreeEmuOptions, int* out_exit_code); - -/* Lower-level surface for dbg integration. */ -typedef struct CfreeEmu CfreeEmu; -CfreeEmu* cfree_emu_new (Compiler*, CfreeEmuOptions); -int cfree_emu_step (CfreeEmu*, u32 nblocks); -void* cfree_emu_lookup(CfreeEmu*, u64 guest_pc); /* translate-if-cold */ -void cfree_emu_free (CfreeEmu*); +uint64_t emu_block_<guest_pc>(EmuThread *thread); +``` + +The return value is normally the next guest PC. Traps and exits are +recorded in the thread/process state so the dispatcher can stop without +requiring every block to encode a large result struct. + +The lifter may use: + +- function begin/end and symbols; +- integer and floating-point arithmetic; +- loads/stores of CPU-state fields; +- branches and labels; +- calls to runtime helpers; +- source locations that encode guest PCs for debug views. + +The lifter should not use: + +- object format APIs; +- linker APIs; +- executable memory APIs; +- disassembly strings; +- host OS syscall APIs directly. + +Architecture-specific lowering belongs in `ArchEmuOps.lift_block`. +Shared helpers for CPU field addressing, helper calls, and block +function construction belong in `src/emu/lift/`. + +## Execution Engines + +`emu` should support multiple execution engines over the same process and +decode/lift infrastructure. + +### JIT Engine + +The JIT engine uses the normal pipeline: + +```text +decoded block -> CG -> optional opt -> ObjBuilder -> link -> CfreeJit +``` + +The existing link/JIT APIs should do most of the work once the lifter +emits valid CG. `emu` still owns: + +- guest PC to host entry cache; +- dispatcher; +- runtime helper resolver; +- trap/exit handling; +- code invalidation policy; +- block publication strategy. + +Two publication strategies are acceptable: + +- per-block object publication into a long-lived JIT session; +- a single growing `LinkImage`, if the linker keeps stable host + addresses across extension. + +The first implementation should choose the simpler strategy that fits +the current JIT API. The design requirement is stable lookup and +eventual invalidation, not a specific linker-internal shape. + +### IR Interpreter + +An IR interpreter is a first-class execution engine, not just a test +helper. It allows `emu` to run where executable memory is unavailable and +validates lifted semantics independently from machine-code emission. + +The intended flow is: + +```text +decoded block -> CG -> canonical IR -> optional opt -> IR interpreter +``` + +The interpreter should run the canonical pre-machinized IR. Interpreting +post-lowering machine IR would mix backend details into the emulation +contract and make cross-architecture behavior harder to test. + +This engine requires an explicit IR execution ABI: + +- call a lifted block function with an `EmuThread *`; +- model helper calls through a host function table; +- read/write CPU-state memory through normal IR memory operations; +- return the next guest PC or propagate a trap. + +### Instruction Interpreter + +A direct instruction interpreter is useful for bring-up and differential +testing, but it should not become the primary semantic source. The +primary semantics should live in the lifter. If an instruction +interpreter exists, it should consume `CfreeDecodedInsn`, not private +decoder records. + +## Runtime Helpers + +Runtime helpers live in `src/emu/runtime/` and are linked by symbol or +function table into JIT/IR execution. + +They own: + +- memory helpers; +- dispatcher callbacks; +- syscall trampoline that asks the OS vtable to decode/encode guest ABI + state and asks `EmuExternalBindings` to service the request; +- dynamic import trampolines that call `EmuExternalBindings`; +- trap and exit helpers; +- optional tracing hooks. + +Runtime helpers are process-aware: they receive `EmuThread *` or +`EmuProcess *`, not global state. + +## Debugging And Object Views + +Guest debug information belongs to the loaded image. Host debug +information belongs to generated blocks. + +The design should support: + +- guest PC source mapping from the guest executable's DWARF; +- generated host block symbols named by guest PC; +- `cfree_jit_view` / objdump of generated host code; +- debugger stepping that maps host PCs back to guest PCs. + +The lifter should stamp CG locations with guest PC information. The JIT +debug view can then relate generated host instructions back to guest +addresses even before source-level guest debugging is complete. + +## Module Ownership + +Proposed ownership: + +```text +src/emu/ + emu.c public API glue and lifecycle + process.c EmuProcess / EmuThread ownership + mem.c guest address space + dispatch.c block cache and run loop + bindings.c embedder outside-interaction binding glue + lift/common.c arch-independent CG helpers for lifters + runtime/*.c memory, traps, helper table, tracing + interp/ir.c lifted IR interpreter + interp/inst.c optional decoded-instruction interpreter + +src/arch/<arch>/ + decode.c ArchDecodeOps implementation + disasm.c formatter over CfreeDecodedInsn + emu.c ArchEmuOps: CPU layout, init, lift + isa.* shared encoding tables and operand extraction + +src/obj/<format>/ + *_read.c relocatable/shared object reader + *_emit.c relocatable object emitter + *_image.c executable image loader for ObjFormatEmuOps + registry.c ObjFormatImpl entries + +src/os/<os>/ + os.c CfreeOsImpl entry + syscall.c guest syscall ABI decode/encode + process.c stack, auxv, TLS, signal conventions +``` + +Exact filenames can vary. The boundary should not: format code loads +files, arch code decodes/lifts instructions, OS code models user ABI, +and `emu` coordinates process execution. + +## First Target + +The first implementation target is a deliberately small vertical slice +that exercises every intended boundary without requiring broad ISA, +loader, OS, or execution-engine coverage. + +Target: + +- guest arch: RV64; +- guest OS: Linux; +- object format: ELF; +- executable shape: static ELF64 little-endian `ET_EXEC`; +- execution engine: lifted IR interpreter; +- outside interaction: `EmuExternalBindings.syscall`; +- guest behavior: `_start` exits with a code through `ecall`. + +The test program can be: + +```asm +addi a0, zero, 42 # exit code +addi a7, zero, 93 # Linux rv64 SYS_exit +ecall ``` -Path-shaped helpers (`cfree emu prog.elf`) live in the driver layer and -read bytes via `c->env->file_io->read_all`. The freestanding core never -takes paths. - -## 8. Debug info - -When the guest ELF carries DWARF, source-level stepping comes for free: - -- A guest-DWARF reader (extension of `src/debug/`) maps guest PC → - guest source line. -- `dbg` interrogates `cfree_emu_lookup` plus the guest DWARF for - per-line breakpoints and stepping. -- Lifter feeds `cg_set_loc` a `SrcLoc` whose `file_id` is a synthetic - `SourceManager` file representing the guest binary, with line numbers - encoding guest PC. opt's `Inst.loc` and host-side DWARF then point - back at guest PCs, useful for `objdump` of JITted host code. - -No new debug-info pipeline. - -## 9. Open questions - -- **CPUState promotion.** CPUState is passed by pointer, so its fields - are address-taken from CG's view and `build_ssa` won't promote them - (§DESIGN 12). Hot guest registers will spill/reload across every - arithmetic op, which kills perf. Likely fix: a per-block convention - that imports CPUState fields into virtuals at entry and exports at - exit, treating mid-block accesses as the SSA values. This is the - single largest perf question and worth prototyping before the lifter - interface freezes. -- **Inline memory fastpath.** A `cg_call` per guest memory op is a real - function call. Inlining a bounds check + direct host load is a known - emu speedup; needs either a cross-runtime/JIT inliner or a CG-level - helper. Defer until measured. -- **Vector ISA support.** Blocked by CGTarget lacking vector ops - (§DESIGN 5.7). Lands the same day vectors land for the C front-end. -- **x86 flag policy.** EFLAGS has more bits and more cross-instruction - dependencies than aarch64 NZCV. Lazy flags work but the recorded - payload is larger; verify on aarch64 before committing to x86. -- **SMC detection.** v1 refuses to lift on observed write to a - translated page (write-protect translated guest pages, trap writes). - Full SMC support — invalidate-and-retranslate — is future work. +`SYS_exit_group` (`a7 = 94`) may be accepted too, but `SYS_exit` is +enough for the first acceptance test. + +Minimum implementation requirements: + +- `ObjFormatImpl(ELF).emu` + - Detect ELF64 little-endian `EM_RISCV`. + - Accept static `ET_EXEC` with `PT_LOAD` segments. + - Map loadable segments into `EmuAddrSpace`. + - Set `EmuLoadedImage.entry_pc`. + - Return `CFREE_UNSUPPORTED` for dynamic executables, PIE, unsupported + relocations, TLS, and malformed inputs. +- `CfreeOsImpl(Linux)` + - Initialize one process and one thread. + - Provide a valid aligned stack pointer, even if argv/envp/auxv are + minimal. + - Decode RV64 Linux syscall ABI: number in `a7`, arguments in `a0-a5`. + - Encode syscall results for non-exit calls, although the first test + should not need them. +- `EmuExternalBindings` + - Implement `syscall`. + - Recognize syscall `93` and set the emulated exit state from `a0`. + - Optionally recognize syscall `94`. + - Return a deterministic unsupported result for every other syscall. +- `ArchDecodeOps(RV64)` + - Decode `ADDI` and `ECALL`. + - Mark `ECALL` as a terminator. + - Provide a formatter over the same `CfreeDecodedInsn` records so the + disassembler path uses the shared decoder shape. +- `ArchEmuOps(RV64)` + - Define a minimal CPU state with `x[32]`, `pc`, trap state, and exit + code. + - Initialize `pc` from `EmuLoadedImage.entry_pc` and `x2` from the + initial stack pointer. + - Lift `ADDI` and `ECALL` to CG. +- IR interpreter + - Execute the lifted block function with an `EmuThread *`. + - Route runtime helper calls through the same helper/binding table that + the JIT engine will use later. + +Acceptance criteria: + +- The in-memory ELF loads through the ELF object-format vtable. +- Process/thread initialization goes through the Linux OS vtable. +- Instruction decode goes through `ArchDecodeOps(RV64)`. +- The disassembler formatter can format the same decoded instructions. +- The lifter goes through `ArchEmuOps(RV64)` and emits CG. +- The lifted IR interpreter runs the block without executable memory. +- The exit syscall goes through `EmuExternalBindings.syscall`, not a + built-in host syscall. +- The emulated process exits cleanly with code `42`. + +This slice intentionally excludes dynamic linking, libc startup, memory +loads/stores, branches, JIT execution, signals, file I/O, clocks, and +host-backed syscalls. Those features should extend the same boundaries +rather than introduce new ones. + +## Implementation Order + +1. Add `CfreeDecodedInsn` and `ArchDecodeOps` behind `ArchImpl`. + Convert one architecture's disassembler to `decode_one + format` + without changing public `cfree_disasm_*`. +2. Convert `objdump` and debugger instruction decode paths to the shared + decoder where practical. +3. Add object-format executable loader hooks and return `EmuLoadedImage` + instead of an object-builder-like shape. +4. Add an OS registry, outside-interaction bindings, and a minimal OS + vtable with process init and syscall ABI decode/encode. +5. Add `ArchEmuOps` for one guest architecture with CPU layout and a + small lifted block subset. +6. Add the IR interpreter execution engine. +7. Add the JIT execution engine using existing CG, opt, object, link, and + JIT APIs. +8. Expand ISA coverage and binding-backed syscall/import coverage with + differential tests against known guest binaries. + +Each step should be testable in isolation. The decoder split can land +before any executable runs; the loader can be tested by inspecting +`EmuLoadedImage`; the lifter can be tested through the IR interpreter +before executable memory is involved. diff --git a/doc/OPT_PERF.md b/doc/OPT_PERF.md @@ -94,6 +94,91 @@ previously broken `matrix -O2`, `binary-trees`, `nbody`, and `spectral-norm` rows. The remaining `COMPILE_FAIL` rows are all MIR `c2m` on the Darwin hosted/math benchmarks. +## Latest Focused O0/O1 Refresh + +Date: 2026-05-25. + +Scope: + +- `array` +- `hash` +- `hash2` +- `matrix` +- `sieve` + +Levels: `0 1`. + +Repeats: one compile repeat and three run repeats. + +Command: + +```sh +CFREE_OPT_BENCH_LEVELS="0 1" \ +CFREE_OPT_BENCHES="array matrix hash hash2 sieve" \ +make bench-opt +``` + +Output: + +- `build/bench/opt/results.csv` +- `build/bench/opt/summary.md` + +Coverage: + +| status | rows | +| --- | ---: | +| OK | 50 | + +Because this was a focused O0/O1 run, the generated summary does not include +gcc `-O2` base rows and therefore reports base-relative speed ratios as `NA`. +The generated summary averages are: + +| tool | opt | cases | avg compile+codegen ms | avg runtime ms | +| --- | ---: | ---: | ---: | ---: | +| `gcc-15` | `O0` | 5 | `413.780` | `8371.356` | +| `gcc-15` | `O1` | 5 | `408.000` | `3142.126` | +| `clang` | `O0` | 5 | `188.181` | `8858.075` | +| `clang` | `O1` | 5 | `182.725` | `3168.854` | +| `mir-c2m` | `O0` | 5 | `139.286` | `5608.600` | +| `mir-c2m` | `O1` | 5 | `138.051` | `5044.000` | +| `cfree` | `O0` | 5 | `68.995` | `8488.662` | +| `cfree` | `O1` | 5 | `67.520` | `4276.252` | +| `cfree-run` | `O0` | 5 | `27.630` | `8766.237` | +| `cfree-run` | `O1` | 5 | `28.935` | `4156.487` | + +Direct ratios from this run: + +| comparison | compile-time ratio | runtime ratio | +| --- | ---: | ---: | +| `cfree O1` vs `MIR O1` | `0.489x` time, `2.04x` faster | `0.848x` time, `1.18x` faster | +| `cfree O1` vs `gcc-15 O0` | `0.163x` time, `6.13x` faster | `0.511x` time, `1.96x` faster | +| `cfree-run O1` vs `MIR O1` | `0.210x` time, `4.77x` faster | `0.824x` time, `1.21x` faster | +| `cfree-run O1` vs `gcc-15 O0` | `0.070x` time, `14.30x` faster | `0.497x` time, `2.01x` faster | + +Compared with the previous O1-only focused refresh, current O1 rows are within +single-run noise: `cfree` compile time moved from `63.851` to `67.520` ms and +runtime from `4275.508` to `4276.252` ms; `cfree-run` compile+JIT moved from +`28.682` to `28.935` ms and runtime from `4150.248` to `4156.487` ms. + +Per-bench `cfree-run O1` runtime vs peers: + +| bench | cfree-run ms | MIR ms | gcc-15 ms | clang ms | cfree vs MIR | +| --- | ---: | ---: | ---: | ---: | ---: | +| `array` | `2828.6` | `4576.0` | `2007.1` | `2839.8` | `0.62x` | +| `hash` | `4048.2` | `3978.0` | `3957.6` | `3966.2` | `1.02x` | +| `hash2` | `4501.2` | `3687.0` | `4155.9` | `3684.7` | `1.22x` | +| `matrix` | `4812.5` | `9033.0` | `2949.7` | `2990.8` | `0.53x` | +| `sieve` | `4592.0` | `3946.0` | `2640.3` | `2362.8` | `1.16x` | + +Interpretation: the O1 correctness/performance acceptance scope remains clean +after moving post-allocation consumers to read allocation placement through +`Func.preg_locs` and fixing replayed call-plan outgoing stack reservation. +Runtime shape is materially unchanged: cfree is strongly ahead of MIR on +`array` and `matrix`, close on `hash`, and still trails on `hash2` and `sieve`. + +Correctness follow-up: clean O0 and O1 toy bootstraps were reconfirmed on +2026-05-25 after fixing replayed call-plan outgoing stack reservation. + ## Headline Geomeans Refreshed 2026-05-23 after the combiner perf+correctness fixes, MIR-shaped @@ -333,26 +418,25 @@ Coverage goals: ## O1 Gap Analysis vs MIR cfree `O1` runs only the shared lowering pipeline (`doc/OPT.md`): build_cfg, -jump_cleanup, simplify_local, machinize, addr_xform_pregs, build_loop_tree, -live_blocks, dead_def_elim, regalloc (simple mode), combine, dce, emit. +jump_cleanup, simplify_local, machinize, build_loop_tree, live_blocks, +dead_def_elim, regalloc (simple mode), combine, dce, emit. The local/global +address folds (`opt_addr_xform_pregs`, scalar local promotion, global address +CSE) remain optional and are not currently called by O1. No SSA-era passes and no coalescing run at `O1`, matching MIR (`mir-gen.c:9431`: coalesce is gated on `optimize_level >= 2`). -Headline at `O1` (post-jump-layout pass, 2026-05-23 refresh): +Headline at `O1` (focused O0/O1 refresh, 2026-05-25): | metric | cfree | MIR | cfree vs MIR | | --- | ---: | ---: | ---: | -| opt+link+JIT (5-case scope, geomean) | `0.701 ms` | `0.726 ms` | `0.966x` (3.4% faster) | -| runtime (5-case scope, geomean) | `3837.5 ms` | `4687.5 ms` | `0.819x` (18.1% faster) | - -cfree `O1` now leads MIR on both axes of this 5-case scope. Compile-time -slice is within noise of MIR. Runtime is `1.22x` ahead, driven by matrix -(`1.89x` ahead) and array (`1.69x` ahead) where the combiner + -block-layout pair eliminates the redundant address-mode moves and the -trivial inter-block branches. hash and sieve are within `7%` of MIR. -hash2 remains the worst outlier at `9.3%` slower — same hash-table code -as hash with larger working set, likely a register-pressure or cache -effect that O1 can't address without coalescing/splitting. +| full compile (5-case avg) | `67.520 ms` | `138.051 ms` | `0.489x` time (`2.04x` faster) | +| runtime (5-case avg) | `4276.3 ms` | `5044.0 ms` | `0.848x` time (`1.18x` faster) | + +cfree `O1` leads MIR on both axes of this 5-case scope. Runtime is driven by +matrix (`1.87x` faster than MIR) and array (`1.62x` faster), where the combiner +and block-layout work eliminate redundant address-mode moves and trivial +inter-block branches. `hash` is near parity, while `hash2` and `sieve` remain +the current O1 runtime outliers. Compile-time gaps closed by recent work: diff --git a/doc/OPTv2.md b/doc/OPTv2.md @@ -0,0 +1,937 @@ +# OPTv2 - Typed Lowering and Allocation Design + +This document proposes the next optimizer representation for cfree. The +current optimizer design and pass order are documented in `doc/OPT.md`. +Performance measurements, current O1/MIR comparisons, and optimization targets +are tracked in `doc/OPT_PERF.md`; this design treats those numbers as +constraints, not as background information. + +The short version: keep the fast O1 algorithms, but stop changing the meaning +of the same operand field as the function moves through lowering. The recorded +IR should stay virtual, allocation should be a separate map, and emitted code +should come from an explicitly physical MIR. + +## Status - 2026-05-25 + +Current implementation status: + +- Slice 1 is partially implemented: `OptBitset` grows on demand, trims active + words, and `opt_live_blocks` now uses a predecessor worklist. High-numbered + PReg coverage was added to `test-opt`. +- Slice 3 is complete: `Func.preg_locs` is the canonical allocation location + table. Transitional mirroring into `OptPRegInfo` remains only for allocator + metadata and legacy tests. +- HIR verification is stricter: `IR_PARAM_DECL` is def-only, storage and aux + operands are checked at phase boundaries, and allocation locations are + cross-checked. +- Slice 4 is implemented for the O1/non-splitting path: register allocation + writes locations without mutating HIR, `opt_lower_to_mir` builds a separate + physical `Func.mir` block tape, MIR verification checks the lowered boundary, + post-RA combine/DCE/jump cleanup run through the MIR view, and `opt_emit` + emits from MIR when present. +- Known-frame replay is re-enabled through the explicit known-frame descriptor. + The C-source target leaves `func_begin_known_frame` NULL because the hook is + optional and backend-owned. +- O1 HIR local/global address folds are re-enabled before liveness: + `opt_addr_xform_pregs`, `opt_promote_scalar_locals`, and + `opt_addr_of_global_cse`. +- Split allocation remains deferred for OPTv2 v1. The shared O1/O2 lowering + pipeline currently uses the non-splitting allocation path while the MIR + boundary is hardened. + +Validation status for this slice: + +- Passing (reconfirmed 2026-05-25, 698 checks): `make test-opt + HOST_OPTFLAGS=-O0` +- Passing (reconfirmed 2026-05-25, 698 checks): `make test-opt + HOST_OPTFLAGS=-O1` +- Passing (reconfirmed 2026-05-25, 6/6): `make test-ar-driver + HOST_OPTFLAGS=-O1` +- Passing (reconfirmed 2026-05-25, 955/955): `make test-toy + HOST_OPTFLAGS=-O1` +- Passing (reconfirmed 2026-05-25, 3712/3712): `make test-parse + HOST_OPTFLAGS=-O1` +- Passing (reconfirmed 2026-05-25, clean `build/bootstrap`, bootstrapped toy + 955/955): `make bootstrap-test-toy HOST_OPTFLAGS=-O0` +- Passing (reconfirmed 2026-05-25, clean `build/bootstrap`, bootstrapped toy + 955/955): `make bootstrap-test-toy HOST_OPTFLAGS=-O1` +- Passing (reconfirmed 2026-05-25, 50/50 OK): `CFREE_OPT_BENCH_LEVELS="0 1" + CFREE_OPT_BENCHES="array matrix hash hash2 sieve" make bench-opt` + +Current O1 pipeline status: + +- Enabled: `build_cfg`, `jump_cleanup`, `simplify_local`, `machinize`, + O1 HIR address folds, `build_loop_tree`, `live_blocks`, + `dead_def_elim`, simple regalloc to `Func.preg_locs`, HIR-to-MIR lowering, + MIR verification, MIR post-RA combine, MIR DCE, MIR CFG/layout cleanup, MIR + emission, and known-frame replay. +- Enabled for this slice: post-allocation consumers operate on the MIR view. + HIR remains virtual after allocation; the legacy rewritten-HIR path is kept + only for existing pass tests and `opt_regalloc` callers. +- Deferred: split allocation and O2-specific allocation quality work. + +Recent O1 bootstrap fixes: + +- Fixed post-RA `try_sink` retargeting across call barriers. A call barrier + does not prove a callee-saved physical register is dead, so combine now + rejects retargeting when the source hard register is used after a call before + a real redefinition. This fixed the O1 miscompile of + `link_macho.c:build_codesig_skeleton`, where `sb = out->data` was lost after + the first `wr_u32_be` call and later code-signature writes corrupted the + Mach-O link state. +- Fixed post-RA combine last-def repair when `try_sink` retargets a producer + away from a physical register that had an earlier reaching definition. This + removed the `coff_read_dso.c` stage3 crash in `init_struct_fields`. +- Blocked copy substitution into store-address operands. Load address + substitution remains enabled, but memory-writing operands now keep their + original address base/index through this rewrite. +- Split and simplified `opt_machinize` register collection so the self-hosted + O1 compiler no longer corrupts `opt_callee_saved` after the + `callee_save_mask` callback. +- Avoided direct struct-return and returned-pointer array stores in the ar and + runtime driver paths that exposed the same store-address corruption class. +- Fixed replayed call-plan outgoing stack reservation on aarch64, x64, and + rv64. Call plans now update the backend `max_outgoing` area and account for + `max(mem.size, 8)` per stack move, which prevents outgoing argument stores + from overwriting callee-saved slots. The O1 bootstrap `env.c` failure was a + symptom of this class: a replayed call to `obj_symbol_ex` clobbered the saved + `x19` slot in `cfree_cg_decl`, and the same overlap pattern could corrupt + parser stack frames. +- Replay hard-register collection now maps virtual PRegs through + `Func.preg_locs` and keeps a raw-register fallback for operands that are + already physical. +- Known-frame replay now uses the descriptor path after MIR lowering. Native + backends receive frame slots, outgoing stack area size, alloca/call flags, + and callee-save reservations before body emission. + +Acceptance criteria before marking OPTv2 O1 complete: + +- Done: `make bootstrap-test-toy HOST_OPTFLAGS=-O1` passes on a clean + `build/bootstrap`. +- Done: O1 still emits toy tests correctly with the bootstrapped compiler. +- Done: `CFREE_OPT_BENCH_LEVELS="0 1" CFREE_OPT_BENCHES="array matrix hash + hash2 sieve" make bench-opt` shows 50/50 OK rows. In the current summary + (`build/bench/opt/summary.md`), `cfree O1` averages `68.021 ms` compile time + and `4198.782 ms` runtime on the 5-case full-pipeline scope. That is `2.08x` + faster to compile than `MIR O1` and `1.19x` faster at runtime; versus + `gcc-15 O0`, it is `5.72x` faster to compile and `2.00x` faster at runtime. +- Done for the O1 structural slice: allocation placement is separate from HIR, + MIR lowering/emission is active, known-frame replay is active, and the + optional O1 HIR folds are re-enabled. + +## Goals + +- Preserve O1 compile speed. `doc/OPT_PERF.md` shows cfree O1 is already in + the same backend-time class as MIR and materially faster than gcc/clang + whole-pipeline builds. OPTv2 must keep dense arrays, arena allocation, and + linear scans where they are currently winning. +- Make representation invariants local and enforceable. A pass should not + need to know from context whether `Operand.v.reg` names a PReg, Val, or + physical register. +- Keep frontends and targets behind their existing ownership boundaries. + Frontends still emit CG. Backends still own instruction encoding and ABI + prologue/epilogue details. The optimizer owns only the recorded IR, + allocation, and lowering handoff. +- Support O1 first. O2 SSA and inlining are out of scope for OPTv2 v1. If the + boundary changes make O2 temporarily awkward, disable O2 or route it through + the O1 lowering path until it can be ported deliberately. + +## Current Problem + +The current O1 pipeline records CG operations into `Func`/`Block`/`Inst`, runs +liveness and allocation on PRegs, then rewrites operands in place so replay can +emit target code. + +That is efficient, but it gives one field multiple meanings: + +- Before register SSA, `OPK_REG` carries a PReg. +- In SSA mode, `OPK_REG` carries a Val. +- After O1 rewrite, the same `OPK_REG` usually carries a physical register. + +This phase-dependent convention is brittle. A post-RA pass can accidentally +treat a physical register as a PReg. Replay can see stale virtual operands. +`IR_PARAM_DECL` has also existed in two shapes: as a def-only marker and as an +instruction with a synthetic self operand. Both are easy mistakes because the +data model permits them. + +OPTv2 fixes this by giving each phase its own representation. + +## Representations + +### HIR: Recorded Optimizer IR + +HIR is the function shape recorded by `opt_cgtarget`. + +- `Func`, `Block`, and `Inst` remain arena-backed. +- HIR operands are virtual. `OPK_REG` means PReg before SSA and Val during SSA. +- HIR never stores physical registers in ordinary operands. +- `IR_PARAM_DECL` is def-only: + +```c +in->op = IR_PARAM_DECL; +in->def = param_preg; +in->type = param_type; +in->nopnds = 0; +in->opnds = NULL; +``` + +- Parameter and local storage remain in `IRParam` and `IRLocal`. +- ABI call/return data remains structured in call/return aux records. + +HIR is the only representation consumed by pre-allocation passes such as CFG +cleanup, local simplification, machinization, liveness, and allocation. + +### Allocation Map + +Allocation results should be stored in one canonical location table, separate +from HIR operands. + +```c +typedef enum OptLocKind { + OPT_LOC_NONE, + OPT_LOC_HARD, + OPT_LOC_SPILL, + OPT_LOC_SPLIT, +} OptLocKind; + +typedef struct OptLocation { + u8 kind; + u8 cls; + union { + Reg hard; + FrameSlot spill; + u32 first_segment; + } v; +} OptLocation; +``` + +`Func` owns `OptLocation* preg_locs`, indexed by PReg. `OptPRegInfo` can keep +frequency, cost, live-range, and constraint metadata, but final placement +should have one source of truth. + +Current code uses `OptLoc`, `OPT_LOC_STACK`, and `Func.preg_info` placement +fields. The migration can either keep those names initially or rename them to +the spelling above, but it should not do both semantic and naming churn in the +same patch. The important boundary is one canonical PReg-to-location table that +lowering reads and HIR operands do not encode. + +When split allocation is reintroduced, keep it segment-based: + +```c +typedef struct OptSplitSegment { + PReg preg; + u32 block; + u32 start_point; + u32 end_point; + u8 loc_kind; + u8 cls; + Reg hard; + FrameSlot spill; + u8 reload_at_start; + u8 store_at_end; + u32 next; +} OptSplitSegment; +``` + +This keeps today's dense, point-indexed allocator shape from `doc/OPT_PERF.md` +while making the allocation result explicit. + +### MIR: Physical Lowered IR + +MIR is the output of O1 lowering after allocation. Replay should consume MIR, +not rewritten HIR. + +```c +typedef enum MOperandKind { + MOP_IMM, + MOP_PHYS_REG, + MOP_FRAME, + MOP_GLOBAL, + MOP_INDIRECT, +} MOperandKind; + +typedef struct MOperand { + u8 kind; + u8 cls; + CfreeCgTypeId type; + union { + i64 imm; + Reg phys; + FrameSlot frame; + CGGlobalRef global; + struct { + Reg base; + Reg index; /* REG_NONE when absent */ + i32 ofs; + u8 log2_scale; + } ind; + } v; +} MOperand; +``` + +MIR should use inline operands for the common case: + +```c +typedef struct MInst { + u16 op; + u16 nops; + SrcLoc loc; + MOperand small[3]; + MOperand* extra_ops; + union { + i64 imm; + MemAccess mem; + void* aux; + } extra; +} MInst; +``` + +Most instructions have at most three operands, so this is at least as cache +friendly as today's per-instruction operand arrays and often better. Large +operand lists still spill to arena storage. + +`MFunc` owns MIR blocks in emit order. CFG metadata can be omitted unless a +future MIR pass needs it; replay only needs layout order and labels. + +## Pipeline Shape + +O1 becomes: + +```text +frontend -> CG -> opt_cgtarget records HIR + +func_end: + frame-home address-taken reg locals + build CFG on HIR + jump cleanup (CFG mode) + rebuild CFG + simplify local HIR + verify HIR + machinize HIR + verify machinized HIR + loop tree + liveness over PRegs + dead-def elimination over HIR + register allocation -> OptLocation table + verify allocation/lowered state + lower HIR + OptLocation -> MIR + post-RA MIR combine + verify MIR/combine state + MIR DCE + verify MIR/DCE state + jump cleanup (CFG mode) + rebuild CFG + verify post-RA CFG + MIR jump/layout cleanup + emit MIR to wrapped target +``` + +O2 is deferred: + +```text +frontend -> CG -> opt_cgtarget records HIR + +finalize: + for OPTv2 v1, either disable O2 or lower retained functions through the O1 + path without running the SSA cleanup schedule +``` + +The important change is that O1 rewrite no longer mutates HIR. It builds MIR. +O2 can later reuse the HIR allocation and MIR lowering boundary after the O1 +bootstrap path is correct and measured. + +## Current O1 Source Order + +The source of truth is `src/opt/opt.c`, not `doc/OPT.md`. As of this design, +`w_func_end` first calls `opt_frame_home_addr_taken_locals`, then +`opt_run_o1_pipeline`, which is `opt_run_lowering_pipeline(..., +allow_live_range_split = 0)`. + +The current O1 order in code is: + +1. `opt_frame_home_addr_taken_locals` +2. `opt_build_cfg` +3. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_CFG)` +4. `opt_build_cfg` +5. `opt_simplify_local` +6. `opt_verify("lowering-cfg")` +7. `opt_machinize` +8. `opt_verify("lowering-machinize")` +9. `opt_build_loop_tree` +10. `opt_live_blocks` +11. `opt_dead_def_elim_with_live` +12. `opt_regalloc(..., allow_live_range_split = 0)` +13. `opt_verify("post-regalloc-rewrite")` +14. `opt_combine` +15. `opt_verify("post-ra-combine")` +16. `opt_dce` +17. `opt_verify("post-ra-dce")` +18. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_CFG)` +19. `opt_build_cfg` +20. `opt_verify("post-ra-jump-cfg")` +21. `opt_jump_cleanup(..., OPT_JUMP_CLEANUP_LAYOUT)` +22. `opt_emit` + +The following PReg-level canonicalization passes exist in `src/opt/pass_o2.c` +but are not currently called by O1: + +- `opt_addr_xform_pregs` +- `opt_promote_scalar_locals` +- `opt_addr_of_global_cse` + +They are treated below as optional future O1 HIR folds, not as active O1 +passes. + +## Invariants + +### HIR Invariants + +- `IR_PARAM_DECL` is def-only and has no operands. +- HIR `OPK_REG` operands are never physical registers. +- Before register SSA, HIR `OPK_REG` ids are PRegs in `[1, f->npregs)`. +- During register SSA, HIR `OPK_REG` ids are Vals in `[1, f->nvals)`. +- `IRParam.storage` and `IRLocal.storage` reference valid PRegs or valid frame + slots. +- `OPK_LOCAL` frame slots are valid and refer to the expected slot class when a + pass requires that distinction. +- Aux operands in calls, returns, inline asm, aggregates, atomics, and + intrinsics obey the same namespace rules as normal operands. + +### Allocation Invariants + +- Every live PReg has exactly one `OptLocation`. +- `OPT_LOC_HARD` has a valid physical register of the matching class. +- `OPT_LOC_SPILL` references an `FS_SPILL` frame slot. +- `OPT_LOC_SPLIT` has at least one valid segment. +- Split segments are ordered, non-overlapping per PReg, and cover every point + where the PReg is used or defined. +- Two simultaneously live PRegs cannot share a hard register or spill slot + unless an explicit coalescing proof says they are the same value. +- Fixed-register and inline-asm constraints are represented in allocation + inputs, not patched in after allocation. + +### MIR Invariants + +- MIR contains no PRegs or Vals. +- Every register operand is `MOP_PHYS_REG` or part of `MOP_INDIRECT` and is a + valid physical register. +- Every frame operand references a valid frame slot. +- Call plans are fully physical: argument sources, return destinations, and + ABI register destinations are valid MIR operands. +- MIR replay never calls PReg allocation helpers. + +## O1 Pass Porting Plan + +This section describes each active O1 pass from `src/opt/opt.c` and how it +ports to OPTv2. + +### `opt_frame_home_addr_taken_locals` + +Keep as a HIR preparation pass. + +This pass runs before the O1 lowering pipeline. It materializes frame homes for +address-taken locals that otherwise live in PRegs, then inserts loads/stores +around uses/defs so taking the address observes the frame slot. In OPTv2 it +should remain pre-allocation HIR because it changes local storage and virtual +load/store structure. + +Verifier additions: + +- address-taken register locals with a home slot have valid frame homes +- inserted loads/stores reference the local's home slot +- the pass does not introduce physical registers + +### `opt_build_cfg` + +Keep as a HIR pass. + +CFG construction should continue to derive edges from HIR terminators and +scope/control instructions. It should not inspect allocation state. The output +is HIR block predecessor/successor lists plus `emit_order`. + +Verifier additions: + +- terminator successor count +- reciprocal predecessor/successor links +- no connected unreachable blocks +- every `emit_order` block is valid and unique + +### `opt_jump_cleanup` + +Split into three uses matching the code. + +The early CFG cleanup remains a HIR pass. It canonicalizes branches and removes +empty or unreachable control-flow artifacts before liveness. The post-RA CFG +cleanup should move to MIR once post-RA code is represented as MIR. The final +layout cleanup should also become a MIR-layout pass. + +Once MIR exists, fallthrough and branch deletion should operate on physical +emitted blocks so it cannot accidentally rewrite virtual operands after +allocation. + +Performance note: `doc/OPT_PERF.md` attributes part of O1's current runtime win +to block-layout fallthrough cleanup. OPTv2 must preserve the greedy +emit-order chain behavior and measure it with `make bench-opt`. + +### `opt_simplify_local` + +Keep as a HIR pass. + +This pass performs algebraic and addressing simplifications that are valid +before allocation. It should continue to rewrite virtual operands and HIR +instructions. Any rewrite that changes use/def structure must maintain or +invalidate def-use in the current model, and later should use the incremental +SSA/use-edge model described in `doc/SSA2.md`. + +### `opt_machinize` + +Keep as a HIR pass, but make its contract sharper. + +Machinization should express target constraints without converting operands to +physical registers. Examples: + +- required register class +- forbidden physical register masks +- fixed-register constraints for inline asm or ABI-specific operations +- target legality flags for addressing forms +- call clobber and hard-live metadata + +It should not write physical registers into HIR operands. If a target decision +requires a physical register, it becomes an allocation constraint. + +### Optional Future O1 Address Folds + +`opt_addr_xform_pregs`, scalar local promotion, and global address CSE are HIR +canonicalization passes present in the codebase, but they are not active in the +current O1 source order. If re-enabled, they should run before liveness. + +Porting rule: + +- address-of-local/global folds rewrite HIR operands only +- scalar promotion updates `IRLocal`/frame-slot state and HIR loads/stores only +- neither pass may depend on post-allocation physical registers + +These passes are optional for correctness. Re-enable them one at a time under +HIR verification and benchmark with the `O1 Gap Analysis` workflow in +`doc/OPT_PERF.md`. + +### `opt_build_loop_tree` + +Keep as a HIR analysis. + +The loop tree is built from HIR CFG after cleanup. It feeds liveness metrics and +future O1 pressure models. No MIR dependency is needed for O1. + +### `opt_live_blocks` + +Keep as a HIR analysis over PRegs, but make it MIR-shaped. + +MIR does not use a true sparse bitmap here. Its `bitmap_t` is a growable vector +of 64-bit words that expands on demand and truncates trailing zero words after +set operations. cfree already has the most important local behavior through +`OptBitset.active_words`; OPTv2 should lean into that and match MIR's model +more directly: + +- keep 64-bit word bitmaps +- keep active/trailing-zero trimming on every mutating bitmap operation +- allow bitmaps to grow to the highest touched PReg instead of eagerly sizing + every block bitmap to `opt_reg_count` +- use a worklist dataflow solver ordered by reverse postorder/postorder, + matching MIR's `solve_dataflow` +- keep optional scan-var remapping available for special analyses, but use + direct PReg ids for O1 RA + +The dataflow equation stays the same: + +```text +live_out = union(successor live_in) +live_in = live_use | (live_out & ~live_def) +``` + +This preserves the current O1 semantics while stealing MIR's allocation and +iteration shape. Do not switch to chunked sparse sets unless benchmark data +shows the elastic dense bitmap is a real bottleneck. + +### `opt_dead_def_elim_with_live` + +Keep as a HIR pass. + +It removes dead virtual definitions before allocation. This should stay before +MIR lowering because it reduces allocation pressure. The pass must treat +`IR_PARAM_DECL` as a def-only value source and preserve all side-effecting +operations. + +### `opt_regalloc` + +Keep the allocator algorithm shape, change the output. In current O1, +`allow_live_range_split` is `0`. + +The allocator should consume HIR liveness and produce: + +- `OptLocation[preg]` +- spill frame slots +- split segments when enabled +- scratch register lists +- call-save/call-restore requirements + +It should not rewrite HIR operands. + +The point-indexed occupancy bitset design should be preserved and kept close to +MIR. `doc/OPT_PERF.md` records a large O1 compile-time win from moving to +MIR-shaped point occupancy; OPTv2 should keep that structure and only move the +final assignment out of `OptPRegInfo` into a canonical location table. + +The live-range builder should also keep matching MIR: + +- build ranges by walking each block backward from `live_out` +- model whole-block spans separately from instruction-local ranges +- advance raw points only when live ranges are born or die +- compress raw points to a dense point id space before allocation +- feed `used_locs[point]` occupancy bitmaps from compressed points + +### O1 Lowering: HIR + Allocation -> MIR + +This replaces the current in-place rewrite performed during/after +`opt_regalloc`. + +For each HIR instruction: + +- translate every virtual use through `OptLocation` +- insert reloads before instructions that use spilled values +- insert stores after instructions that define spilled values +- materialize split reload/store edges +- materialize call-save and call-restore code +- lower call plans into physical argument and return moves +- emit a physical `MInst` sequence + +This is where all PReg-to-physical knowledge belongs. After this pass, HIR is +unchanged and MIR is fully physical. + +### `opt_combine` + +Move post-RA combine to MIR. + +The current post-allocation combiner is valuable and should not be discarded. +`doc/OPT_PERF.md` credits O1 gains to operand-driven combine, live-range-scoped +use counting, address-mode synthesis, and extension folding. Those algorithms +should port directly, but their input should be MIR physical operands. + +MIR combine can be simpler because: + +- register operands are always physical +- indirect operands never hide PRegs +- use counts can be local to MIR blocks +- no pass can accidentally consult PReg allocation state through a physical + operand + +### `opt_dce` + +Split into HIR DDE and MIR DCE. + +The pre-allocation dead-def elimination remains HIR-based. The post-combine DCE +should become MIR-based and delete physical instructions whose definitions are +unused and side-effect-free. It should not reason about PRegs. + +`doc/OPT_PERF.md` notes possible overlap between pre-allocation DDE and +post-allocation DCE. OPTv2 should keep both initially for behavior parity, then +benchmark whether MIR DCE can be narrowed. + +### Post-RA CFG Rebuild and Layout Cleanup + +Move these to MIR once post-RA code is physical MIR. + +The current code runs a CFG-mode jump cleanup, rebuilds CFG, verifies, then runs +layout-mode jump cleanup. OPTv2 should preserve that ordering, but apply it to +MIR after combine and DCE. The CFG-mode pass removes/control-simplifies blocks; +the layout-mode pass optimizes fallthroughs in emit order. + +### `opt_emit` + +Replace replay of rewritten HIR with MIR emission. + +`opt_emit_mir` should: + +- open the function on the wrapped target +- replay params and known frame slots from HIR function metadata +- emit MIR blocks in layout order +- reserve physical hard registers from MIR use collection +- close the function + +It should not inspect `OptLocation` except for parameter storage translation +where the backend prologue needs the final home location. That translation +should be explicit and verifier-covered. + +## Verification + +OPTv2 should keep debug-only verification cheap and frequent: + +- HIR verifier after CFG cleanup and machinization +- allocation verifier immediately after `opt_regalloc` +- MIR verifier immediately after HIR-to-MIR lowering +- MIR verifier after combine, DCE, and layout cleanup + +The verifier should fail at the phase boundary closest to the bug. For example, +an operand-backed `IR_PARAM_DECL` should fail during HIR verification, not +during replay. A stale PReg in MIR should fail during MIR verification, not in +the backend encoder. + +## Performance Rules + +`doc/OPT_PERF.md` is the performance contract for this design. + +Preserve: + +- arena-backed function-local allocation +- dense PReg-indexed arrays for hot per-value data +- point-indexed allocator occupancy bitsets +- operand-driven combiner dispatch +- block-layout fallthrough cleanup +- no SSA construction at O1 +- no coalescing or live-range splitting at O1 unless benchmarks justify it + +Avoid: + +- per-operand heap allocation in MIR +- whole-function rescans in inner loops +- rebuilding def-use or liveness after every small post-RA rewrite +- adding abstraction layers in hot loops where a dense indexed array is enough + +Benchmark gates: + +- `make bench-opt` +- focused O1 comparison: + +```sh +CFREE_OPT_BENCH_LEVELS=1 make bench-opt +``` + +- targeted regressions from `doc/OPT_PERF.md`: + +```sh +CFREE_OPT_BENCHES="array matrix hash hash2 sieve" \ +CFREE_OPT_BENCH_LEVELS=1 make bench-opt +``` + +Acceptance for the initial OPTv2 O1 port should be no material regression in +the backend-only O1 geomean and no bootstrap regression. + +## Migration Plan + +1. Lock down HIR invariants. + + Keep the current debug verifier strict: `IR_PARAM_DECL` is def-only, HIR + operands are virtual, params/locals reference valid storage, and aux operands + use the correct namespace. + +2. Make liveness and ranges MIR-shaped. + + Keep `OptBitset` as a 64-bit-word bitmap, but make it elastic like MIR: + grow to the highest touched PReg, trim trailing zero words after operations, + and run liveness through a postorder/reverse-postorder worklist solver. Keep + the existing compressed point range builder and tighten it to MIR's + born/dead-boundary model. + +3. Introduce `OptLocation`. + + Add the location table while still mirroring existing `OptPRegInfo` + allocation fields. Convert users one at a time, then remove duplicate final + placement fields. + +4. Add MIR data structures. + + Add `MOperand`, `MInst`, `MBlock`, and `MFunc` with inline small operands. + Add a MIR verifier before any pass consumes MIR. + +5. Port rewrite into `opt_lower_to_mir`. + + Keep the current reload/store/call-save algorithms, but write MIR instead of + mutating HIR. HIR should remain valid after the pass. + +6. Port emission. + + Add `opt_emit_mir` and make O1 emit from MIR. Keep the old replay path + temporarily for comparison, then delete it once bootstrap and test-toy are + clean. + +7. Port post-RA combine and DCE. + + Move the profitable O1 combines to MIR. Preserve the measured behaviors from + `doc/OPT_PERF.md` before adding new combines. + +8. Re-enable optional HIR O1 folds. + + Re-enable address folds and scalar promotion only after HIR/MIR boundaries + are stable. Each pass should have focused tests and an O1 benchmark check. + +9. Defer O2. + + For OPTv2 v1, keep the O2 implementation out of the critical path. Either + disable it or make it lower through the O1 path until O1 bootstrap and + `test-toy` are clean with the new HIR/MIR boundary. + +## V1 Decisions + +OPTv2 v1 is an O1 redesign. These choices favor bootstrap correctness and a +small migration surface over solving O2 quality problems at the same time. + +### MIR CFG Shape + +Store layout order and successor labels in MIR v1, but do not build full +predecessor lists by default. + +O1 needs: + +- physical block order for emission +- terminator targets for jump cleanup +- enough successor information for CFG-mode cleanup after combine/DCE + +It does not need dominance, loop analysis, SSA repair, or predecessor-driven +phi updates after lowering. If a MIR pass needs predecessors later, build them +as a derived scratch analysis from terminators, mirroring today's +`opt_build_cfg` rather than storing and maintaining them eagerly. + +Tradeoff: a derived CFG costs one linear pass when needed, but avoids making +every MIR rewrite responsible for keeping predecessor lists coherent. + +### Parameter Prologue Materialization + +Keep parameter prologue materialization backend-owned through `target->param` +for v1. + +The native backends already encode ABI-specific incoming-register and +incoming-stack handling in `x_param`, `aa_param`, and `rv_param`. Moving that +into MIR immediately would duplicate ABI lowering and frame-layout logic at the +same time as the HIR/MIR split. That is too much churn for the bootstrap path. + +MIR v1 should instead verify the boundary explicitly: + +- each parameter has a valid final storage location +- register-backed parameters map to physical registers or are marked unused +- frame-backed parameters have known-frame slot mappings when using + `func_begin_known_frame` +- `target->param` is called before MIR body emission + +Later, if we want prologue instructions visible to MIR combine/DCE, add an +explicit `MIR_PROLOGUE_PARAM` lowering step after O1 is stable. + +### Split Allocation + +Defer split allocation for O1 v1. + +Current O1 calls `opt_regalloc(..., allow_live_range_split = 0)`, and +`doc/OPT_PERF.md` shows O1 is already close to MIR compile time while the +bootstrap bug pressure is correctness, not spill quality. OPTv2 should first +move today's non-splitting allocator result into `OptLocation[preg]` and lower +that to MIR. + +Keep `OPT_LOC_SPLIT` and segment terminology in the design only as a +compatibility slot for later O2 or O1 quality work. Do not require split +segments, edge materialization, or split verification for the first O1 port. + +Tradeoff: this preserves today's O1 quality profile and avoids porting the +most complex allocator path while the representation boundary is still moving. + +### Liveness Bitsets + +Match MIR's elastic dense bitmap model for v1. + +MIR's liveness bitmap is not a sparse set. It is a growable `uint64_t` vector +that stores only words up to the current highest nonzero word. cfree should use +the same shape: dense inside the active prefix, trimmed at the end, and iterated +by scanning nonzero words. + +The current `OptBitset.active_words` design is already close. The remaining v1 +changes should be: + +- stop allocating every block bitmap to full `opt_reg_count` size up front +- grow bitmaps on demand +- keep bitmap operations trimming trailing zero words +- switch liveness convergence to a MIR-like worklist solver +- keep compressed live-range points as allocator input + +Tradeoff: this is still dense within the active prefix, so pathological high +PReg ids can cost dense scans. That is acceptable because it matches MIR's +successful O1 shape and avoids a separate sparse-set design. + +## Implementation Handoff + +The implementation should proceed in small behavior-preserving slices. Do not +start by introducing MIR. First make the O1 analyses match MIR closely while +the existing rewritten-HIR backend path still runs. + +### Slice 1: Elastic Bitmaps and Worklist Liveness + +Update `OptBitset`/`opt_live_blocks` before touching allocation output: + +- add an internal grow helper for `OptBitset` +- make `opt_bitset_set` grow on demand +- keep all mutating operations trimming trailing zero words +- keep the public `opt_bitset_has` behavior unchanged +- replace full reverse block sweeps with a MIR-like worklist solver +- preserve the current `live_use | (live_out & ~live_def)` equation + +Targeted tests: + +```sh +make test-opt HOST_OPTFLAGS=-O0 +make test-opt HOST_OPTFLAGS=-O1 +``` + +The existing live tests around `opt_live_blocks` and +`opt_live_ranges_phase2` should stay green. Add focused tests for high-numbered +PRegs so on-demand growth and trailing-zero trimming are covered. + +### Slice 2: Tighten MIR-Style Live Ranges + +Keep `opt_live_ranges_build` compatible with current callers, but align the +details with MIR: + +- start from each block's `live_out` +- walk instructions backward +- open ranges on uses and close ranges on defs +- represent whole-block spans separately with `whole_block` +- advance raw points only when a range is born or dies +- compress raw points before allocation + +Targeted checks: + +```sh +make test-opt HOST_OPTFLAGS=-O1 +CFREE_OPT_BENCH_LEVELS=1 CFREE_OPT_BENCHES="array matrix hash hash2 sieve" make bench-opt +``` + +### Slice 3: Canonical Allocation Locations + +Only after liveness/ranges are stable, introduce the single allocation table. +Keep existing allocator heuristics and O1 `allow_live_range_split = 0`. + +Rules: + +- allocation writes one canonical PReg location table +- HIR operands remain virtual after allocation +- any temporary mirroring into `OptPRegInfo` is transitional and should have a + deletion point +- verifier checks that post-allocation HIR still has virtual operands + +### Slice 4: MIR Lowering and Emission + +Then add MIR and move the in-place rewrite into `opt_lower_to_mir`. + +Keep v1 conservative: + +- backend-owned parameter prologue through `target->param` +- no O1 live-range splitting +- no O2 dependency +- old rewritten-HIR emit path may remain temporarily for comparison + +Acceptance: + +```sh +make bootstrap HOST_OPTFLAGS=-O0 +make bootstrap-test-toy HOST_OPTFLAGS=-O0 +make bootstrap HOST_OPTFLAGS=-O1 +make bootstrap-test-toy HOST_OPTFLAGS=-O1 +``` + +If O2 blocks progress, disable it or route it through the O1 lowering path as +described above. diff --git a/driver/ar.c b/driver/ar.c @@ -139,6 +139,14 @@ static int ar_name_selected(CfreeSlice name, int argc, char** argv, return 0; } +static void ar_set_input_member(CfreeArInput* out, const char* name, + const CfreeFileData* fd) { + out->name.s = name; + out->name.len = driver_strlen(name); + out->bytes.data = fd->data; + out->bytes.len = fd->size; +} + /* Open the archive bytes via file_io. Caller releases fd via ctx. */ static int ar_open_for_read(DriverEnv* env, const char* path, CfreeContext* ctx_out, CfreeFileData* fd_out, @@ -450,26 +458,20 @@ static int ar_do_write(DriverEnv* env, const char* archive_path, int nmembers, int replaced = 0; for (j = 0; j < nm; ++j) { if (driver_streq(members[j].name.s, base)) { - members[j].name = cfree_slice_cstr(base); - members[j].bytes.data = new_fds[i].data; - members[j].bytes.len = new_fds[i].size; + ar_set_input_member(&members[j], base, &new_fds[i]); replaced = 1; break; } } if (!replaced) { - members[nm].name = cfree_slice_cstr(base); - members[nm].bytes.data = new_fds[i].data; - members[nm].bytes.len = new_fds[i].size; + ar_set_input_member(&members[nm], base, &new_fds[i]); nm++; } if (has_v) driver_printf("%c - %.*s\n", replaced ? 'r' : 'a', CFREE_SLICE_ARG(cfree_slice_cstr(base))); } else { - members[nm].name = cfree_slice_cstr(base); - members[nm].bytes.data = new_fds[i].data; - members[nm].bytes.len = new_fds[i].size; + ar_set_input_member(&members[nm], base, &new_fds[i]); nm++; if (has_v) driver_printf("a - %.*s\n", CFREE_SLICE_ARG(cfree_slice_cstr(base))); diff --git a/driver/runtime.c b/driver/runtime.c @@ -152,6 +152,17 @@ static char* rt_join(DriverEnv* env, const char* a, const char* b, return p; } +static char* rt_join_slot(DriverEnv* env, char** slot, size_t* size_slot, + const char* a, const char* b) { + char* p = rt_join(env, a, b, size_slot); + *slot = p; + return p; +} + +static void rt_store_const_ptr(const char** slot, const char* value) { + *slot = value; +} + static int rt_basename_is(const char* path, const char* name) { const char* base; const char* end; @@ -352,8 +363,11 @@ int driver_runtime_add_freestanding_headers(const DriverRuntimeSupport* s, int driver_runtime_append_freestanding_headers(const DriverRuntimeSupport* s, DriverCflags* cf) { + uint32_t i; if (!s || !s->include_dir || !cf) return 1; - cf->system_include_dirs[cf->nsystem_include_dirs++] = s->include_dir; + i = cf->nsystem_include_dirs; + cf->system_include_dirs[i] = s->include_dir; + cf->nsystem_include_dirs = i + 1u; return 0; } @@ -383,7 +397,6 @@ static int rt_prepare_pp(DriverEnv* env, const DriverRuntimeSupport* support, uint32_t ninc = variant->ldbl128 ? 4u : 3u; uint32_t nsys = 2u; uint32_t ndefs = 1u + (variant->ldbl128 ? 1u : 0u) + (assembler ? 1u : 0u); - uint32_t i = 0; uint32_t d = 0; CfreePreprocessOptions z = {0}; @@ -413,22 +426,31 @@ static int rt_prepare_pp(DriverEnv* env, const DriverRuntimeSupport* support, return 1; } - dirs[d] = rt_join(env, support->rt_root, "lib/include/common", &sizes[d]); - include_dirs[i++] = dirs[d++]; - dirs[d] = rt_join(env, support->rt_root, "lib/impl", &sizes[d]); - include_dirs[i++] = dirs[d++]; - dirs[d] = rt_join(env, support->rt_root, variant->abi_include, &sizes[d]); - include_dirs[i++] = dirs[d++]; + rt_join_slot(env, &dirs[0], &sizes[0], support->rt_root, + "lib/include/common"); + rt_store_const_ptr(&include_dirs[0], dirs[0]); + rt_join_slot(env, &dirs[1], &sizes[1], support->rt_root, "lib/impl"); + rt_store_const_ptr(&include_dirs[1], dirs[1]); + rt_join_slot(env, &dirs[2], &sizes[2], support->rt_root, + variant->abi_include); + rt_store_const_ptr(&include_dirs[2], dirs[2]); if (variant->ldbl128) { - dirs[d] = rt_join(env, support->rt_root, "lib/include/lp64_le_ldbl128", - &sizes[d]); - include_dirs[i++] = dirs[d++]; + rt_join_slot(env, &dirs[3], &sizes[3], support->rt_root, + "lib/include/lp64_le_ldbl128"); + rt_store_const_ptr(&include_dirs[3], dirs[3]); + rt_join_slot(env, &dirs[4], &sizes[4], support->rt_root, "include"); + rt_store_const_ptr(&system_dirs[0], dirs[4]); + rt_join_slot(env, &dirs[5], &sizes[5], support->rt_root, "include/libc"); + rt_store_const_ptr(&system_dirs[1], dirs[5]); + d = 6; + } else { + rt_join_slot(env, &dirs[3], &sizes[3], support->rt_root, "include"); + rt_store_const_ptr(&system_dirs[0], dirs[3]); + rt_join_slot(env, &dirs[4], &sizes[4], support->rt_root, "include/libc"); + rt_store_const_ptr(&system_dirs[1], dirs[4]); + d = 5; } - dirs[d] = rt_join(env, support->rt_root, "include", &sizes[d]); - system_dirs[0] = dirs[d++]; - dirs[d] = rt_join(env, support->rt_root, "include/libc", &sizes[d]); - system_dirs[1] = dirs[d++]; - for (i = 0; i < d; ++i) { + for (uint32_t i = 0; i < d; ++i) { if (!dirs[i]) { uint32_t j; for (j = 0; j < d; ++j) @@ -445,7 +467,7 @@ static int rt_prepare_pp(DriverEnv* env, const DriverRuntimeSupport* support, defs[0].name = CFREE_SLICE_LIT("HAS_INT128"); defs[0].body = variant->has_int128 ? def_int128_1 : def_int128_0; - i = 1; + uint32_t i = 1; if (variant->ldbl128) { defs[i].name = CFREE_SLICE_LIT("CFREERT_LDBL128"); defs[i].body = CFREE_SLICE_LIT("1"); diff --git a/lang/c/parse/parse_init.c b/lang/c/parse/parse_init.c @@ -22,6 +22,15 @@ static CKw ident_kw_init(const Parser* p, Sym name) { return ident_kw_inline(p, name); } +static const Type* init_field_type_at(const Type* ty, u16 i) { + const Field* f = &ty->rec.fields[i]; + return f->type; +} + +static u32 init_field_offset_at(const ABIRecordLayout* L, u16 i) { + return L->fields[i].offset; +} + /* True if `ty` is char/signed char/unsigned char. */ int is_char_kind(const Type* ty) { if (!ty) return 0; @@ -224,9 +233,10 @@ static void emit_walk_copy(Parser* p, FrameSlot dst_slot, for (u16 i = 0; i < ty->rec.nfields; ++i) { const Field* f = &ty->rec.fields[i]; if (f->flags & FIELD_BITFIELD) continue; - u32 foff = L->fields[i].offset; + const Type* fty = init_field_type_at(ty, i); + u32 foff = init_field_offset_at(L, i); emit_walk_copy(p, dst_slot, dst_arr_ty, dst_off + foff, src_ptr_slot, - src_ptr_ty, src_off + foff, f->type); + src_ptr_ty, src_off + foff, fty); } return; } @@ -297,7 +307,11 @@ static void zero_init_at(Parser* p, FrameSlot slot, const Type* arr_ty, cg_drop(p->cg); continue; } - zero_init_at(p, slot, arr_ty, offset + L->fields[i].offset, f->type); + { + const Type* fty = init_field_type_at(ty, i); + u32 foff = init_field_offset_at(L, i); + zero_init_at(p, slot, arr_ty, offset + foff, fty); + } } return; } @@ -622,7 +636,9 @@ static void init_aggregate_remainder(Parser* p, FrameSlot slot, cg_drop(p->cg); } } else { - zero_init_at(p, slot, arr_ty, offset + L->fields[i].offset, f->type); + const Type* fty = init_field_type_at(ty, i); + u32 foff = init_field_offset_at(L, i); + zero_init_at(p, slot, arr_ty, offset + foff, fty); } } return; @@ -1359,8 +1375,9 @@ static void parse_static_aggregate_remainder(Parser* p, u8* buf, u32 buflen, f->type); } } else { - parse_static_init_at(p, buf, buflen, offset + L->fields[i].offset, - f->type); + const Type* fty = init_field_type_at(ty, i); + u32 foff = init_field_offset_at(L, i); + parse_static_init_at(p, buf, buflen, offset + foff, fty); } if (i + 1u >= ty->rec.nfields) return; if (!accept_punct(p, ',')) break; @@ -1493,8 +1510,9 @@ void parse_static_init_at(Parser* p, u8* buf, u32 buflen, u32 offset, f->type); } } else { - parse_static_init_at(p, buf, buflen, offset + L->fields[i].offset, - f->type); + const Type* fty = init_field_type_at(ty, i); + u32 foff = init_field_offset_at(L, i); + parse_static_init_at(p, buf, buflen, offset + foff, fty); } ++i; if (!accept_punct(p, ',')) break; diff --git a/lang/c/parse/parse_type.c b/lang/c/parse/parse_type.c @@ -1044,6 +1044,11 @@ const Type* parse_struct_or_union(Parser* p, TypeKind kind, te = tag_define(p, tag_name, tdk, target, /*complete=*/0); } } + if (te) { + attr_list_append(&te->attrs, rec_attrs); + } else if (anon_attrs_out) { + attr_list_append(anon_attrs_out, rec_attrs); + } expect_punct(p, '{', "'{' to start aggregate body"); TypeRecordOpts begin_opts; memset(&begin_opts, 0, sizeof begin_opts); @@ -1058,11 +1063,17 @@ const Type* parse_struct_or_union(Parser* p, TypeKind kind, begin_opts); parse_member_decls(p, b); expect_punct(p, '}', "'}' after aggregate body"); - parse_attrs_into(p, &rec_attrs); - if (te) { - attr_list_append(&te->attrs, rec_attrs); - } else if (anon_attrs_out) { - attr_list_append(anon_attrs_out, rec_attrs); + TypeRecordOpts trailing_opts; + memset(&trailing_opts, 0, sizeof trailing_opts); + { + Attr* trailing_attrs = NULL; + parse_attrs_into(p, &trailing_attrs); + attrs_to_record_opts(trailing_attrs, &trailing_opts); + if (te) { + attr_list_append(&te->attrs, trailing_attrs); + } else if (anon_attrs_out) { + attr_list_append(anon_attrs_out, trailing_attrs); + } } { const Type* fresh = type_record_end(p->pool, b); @@ -1071,14 +1082,9 @@ const Type* parse_struct_or_union(Parser* p, TypeKind kind, target->rec.align_override = fresh->rec.align_override; type_record_install(target, (Field*)fresh->rec.fields, fresh->rec.nfields); } - { - TypeRecordOpts opts; - memset(&opts, 0, sizeof opts); - attrs_to_record_opts(rec_attrs, &opts); - if (opts.packed) target->rec.packed = 1; - if (opts.align_override > target->rec.align_override) - target->rec.align_override = opts.align_override; - } + if (trailing_opts.packed) target->rec.packed = 1; + if (trailing_opts.align_override > target->rec.align_override) + target->rec.align_override = trailing_opts.align_override; if (te) te->complete = 1; return target; } diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c @@ -1128,7 +1128,7 @@ static u32 aa_call_plan_stack_raw_size(const CGCallPlan* p) { const CGCallPlanMove* m = &p->args[i]; if (m->dst_kind == CG_CALL_PLAN_STACK || m->dst_kind == CG_CALL_PLAN_TAIL_STACK) { - u32 end = m->stack_offset + 8u; + u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u); if (end > size) size = end; } } @@ -1616,6 +1616,16 @@ static void aa_emit_call_plan(CGTarget* t, const CGCallPlan* p) { return; } + { + u32 needed = (aa_call_plan_stack_raw_size(p) + 15u) & ~15u; + if (needed > a->max_outgoing) { + if (a->known_frame) + compiler_panic(t->c, a->loc, + "aarch64 call plan: known frame outgoing area too small"); + a->max_outgoing = needed; + } + } + if (p->callee.kind == OPK_GLOBAL) { u32 bl_pos = mc->pos(mc); aa64_emit32(mc, aa64_bl_base()); diff --git a/src/arch/c_target/target.c b/src/arch/c_target/target.c @@ -81,13 +81,6 @@ void c_fence(CGTarget*, MemOrder); ((CTarget*)t)->cur_fn->loc : (SrcLoc){0,0,0}, \ "C target: method " name " not implemented") -static void c_unimpl_func_begin_known_frame(CGTarget* t, const CGFuncDesc* f, - const CGKnownFrameDesc* k, - FrameSlot* out) { - (void)f; (void)k; (void)out; - C_UNIMPL("func_begin_known_frame"); -} - static void c_unimpl_spill_reg(CGTarget* t, Operand a, FrameSlot s, MemAccess m) { (void)a; (void)s; (void)m; @@ -190,7 +183,7 @@ CGTarget* c_cgtarget_new(Compiler* c, ObjBuilder* o, CfreeWriter* w) { /* ---- function lifecycle ---- */ t->func_begin = c_func_begin; - t->func_begin_known_frame = c_unimpl_func_begin_known_frame; + t->func_begin_known_frame = NULL; t->func_end = c_func_end; t->alias = c_alias; diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c @@ -918,7 +918,7 @@ static u32 rv_call_plan_stack_raw_size(const CGCallPlan* p) { const CGCallPlanMove* m = &p->args[i]; if (m->dst_kind == CG_CALL_PLAN_STACK || m->dst_kind == CG_CALL_PLAN_TAIL_STACK) { - u32 end = m->stack_offset + 8u; + u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u); if (end > size) size = end; } } @@ -1433,6 +1433,16 @@ static void rv_emit_call_plan(CGTarget* t, const CGCallPlan* p) { return; } + { + u32 needed = (rv_call_plan_stack_raw_size(p) + 15u) & ~15u; + if (needed > a->max_outgoing) { + if (a->known_frame) + compiler_panic(t->c, a->loc, + "rv64 call plan: known frame outgoing area too small"); + a->max_outgoing = needed; + } + } + if (p->callee.kind == OPK_GLOBAL) { u32 sec = mc->section_id; u32 pos = mc->pos(mc); diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c @@ -1156,7 +1156,7 @@ static u32 x_call_plan_stack_raw_size(const CGCallPlan* p) { const CGCallPlanMove* m = &p->args[i]; if (m->dst_kind == CG_CALL_PLAN_STACK || m->dst_kind == CG_CALL_PLAN_TAIL_STACK) { - u32 end = m->stack_offset + 8u; + u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u); if (end > size) size = end; } } @@ -1695,6 +1695,7 @@ static void x_call(CGTarget* t, const CGCallDesc* d) { } static void x_emit_call_plan(CGTarget* t, const CGCallPlan* p) { + XImpl* a = impl_of(t); MCEmitter* mc = t->mc; if (p->is_variadic) @@ -1708,6 +1709,16 @@ static void x_emit_call_plan(CGTarget* t, const CGCallPlan* p) { return; } + { + u32 needed = (x_call_plan_stack_raw_size(p) + 15u) & ~15u; + if (needed > a->max_outgoing) { + if (a->known_frame) + compiler_panic(t->c, a->loc, + "x64 call plan: known frame outgoing area too small"); + a->max_outgoing = needed; + } + } + if (p->callee.kind == OPK_GLOBAL) { u8 op = 0xE8; mc->emit_bytes(mc, &op, 1); diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -323,6 +323,15 @@ typedef struct Block { MCLabel mc_label; } Block; +typedef struct MFunc { + Block* blocks; + u32 nblocks; + u32 entry; + u32* emit_order; + u32 emit_order_n; + u32 emit_order_cap; +} MFunc; + typedef enum OptAllocKind { OPT_ALLOC_NONE, OPT_ALLOC_HARD, @@ -330,6 +339,19 @@ typedef enum OptAllocKind { OPT_ALLOC_SPLIT, } OptAllocKind; +typedef enum OptLocKind { + OPT_LOC_NONE, + OPT_LOC_HARD, + OPT_LOC_STACK, +} OptLocKind; + +typedef struct OptLoc { + u8 kind; + u8 cls; + Reg hard_reg; + FrameSlot spill_slot; +} OptLoc; + typedef struct OptAllocSegment { u32 start; u32 end; @@ -461,6 +483,8 @@ typedef struct Func { u64 opt_coalesce_merges; InstId next_inst_id; OptPRegInfo* preg_info; /* indexed by the current allocation reg namespace */ + OptLoc* preg_locs; /* canonical final allocation locations by PReg */ + MFunc* mir; /* physical post-allocation IR; HIR stays virtual */ u32* opt_coalesce_parent; u32* opt_coalesce_size; OptAllocSegment* opt_alloc_segments; diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -442,12 +442,6 @@ static CGLocalStorage w_param(CGTarget* t, const CGParamDesc* d) { Inst* in = rec(o, IR_PARAM_DECL); in->def = (Val)st.v.reg; in->type = d->type; - in->opnds = arena_array(o->f->arena, Operand, 1); - in->opnds[0].kind = OPK_REG; - in->opnds[0].cls = opt_local_reg_class(o, d->type); - in->opnds[0].type = d->type; - in->opnds[0].v.reg = st.v.reg; - in->nopnds = 1; } return st; } @@ -1294,17 +1288,17 @@ static int inst_spill_local(Func* f, const Inst* in, u32 op_idx) { static u64 func_spill_alloc_count(Func* f) { u64 n = 0; - if (!f || !f->preg_info) return 0; + if (!f || (!f->preg_locs && !f->preg_info)) return 0; for (PReg r = 1; r < opt_reg_count(f); ++r) - if (f->preg_info[r].alloc_kind == OPT_ALLOC_SPILL) ++n; + if (opt_preg_alloc_kind(f, r) == OPT_ALLOC_SPILL) ++n; return n; } -static u64 func_spill_load_count(Func* f) { +static u64 blocks_spill_load_count(Func* f, Block* blocks, u32 nblocks) { u64 n = 0; - if (!f) return 0; - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; + if (!f || !blocks) return 0; + for (u32 b = 0; b < nblocks; ++b) { + Block* bl = &blocks[b]; for (u32 i = 0; i < bl->ninsts; ++i) { Inst* in = &bl->insts[i]; if ((IROp)in->op == IR_LOAD && inst_spill_local(f, in, 1)) ++n; @@ -1313,11 +1307,18 @@ static u64 func_spill_load_count(Func* f) { return n; } -static u64 func_spill_store_count(Func* f) { - u64 n = 0; +static u64 func_spill_load_count(Func* f) { if (!f) return 0; - for (u32 b = 0; b < f->nblocks; ++b) { - Block* bl = &f->blocks[b]; + if (f->mir) + return blocks_spill_load_count(f, f->mir->blocks, f->mir->nblocks); + return blocks_spill_load_count(f, f->blocks, f->nblocks); +} + +static u64 blocks_spill_store_count(Func* f, Block* blocks, u32 nblocks) { + u64 n = 0; + if (!f || !blocks) return 0; + for (u32 b = 0; b < nblocks; ++b) { + Block* bl = &blocks[b]; for (u32 i = 0; i < bl->ninsts; ++i) { Inst* in = &bl->insts[i]; if ((IROp)in->op == IR_STORE && inst_spill_local(f, in, 0)) ++n; @@ -1326,8 +1327,16 @@ static u64 func_spill_store_count(Func* f) { return n; } +static u64 func_spill_store_count(Func* f) { + if (!f) return 0; + if (f->mir) + return blocks_spill_store_count(f, f->mir->blocks, f->mir->nblocks); + return blocks_spill_store_count(f, f->blocks, f->nblocks); +} + static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope, int allow_live_range_split) { + (void)allow_live_range_split; metrics_scope_begin(o->c, total_scope); metrics_count(o->c, "opt.funcs", 1); metrics_count(o->c, "opt.blocks", o->f->nblocks); @@ -1351,18 +1360,27 @@ static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope, metrics_scope_begin(o->c, "opt.machinize"); opt_machinize(o->f, o->target); metrics_scope_end(o->c, "opt.machinize"); - /* Fold address-of-local pseudos into direct memory operands. O2 runs the - * SSA variant in opt_cleanup; here is the equivalent for the O1 pipeline - * which operates on the PReg namespace directly. */ - metrics_scope_begin(o->c, "opt.addr_xform_pregs"); + metrics_scope_begin(o->c, "opt.machinize.verify"); + opt_verify(o->f, "lowering-machinize"); + metrics_scope_end(o->c, "opt.machinize.verify"); + metrics_scope_begin(o->c, "opt.o1.addr_xform_pregs"); opt_addr_xform_pregs(o->f); - metrics_scope_end(o->c, "opt.addr_xform_pregs"); - metrics_scope_begin(o->c, "opt.promote_scalar_locals"); + metrics_scope_end(o->c, "opt.o1.addr_xform_pregs"); + metrics_scope_begin(o->c, "opt.o1.addr_xform.verify"); + opt_verify(o->f, "o1-addr-xform"); + metrics_scope_end(o->c, "opt.o1.addr_xform.verify"); + metrics_scope_begin(o->c, "opt.o1.promote_scalar_locals"); opt_promote_scalar_locals(o->f); - metrics_scope_end(o->c, "opt.promote_scalar_locals"); - metrics_scope_begin(o->c, "opt.addr_of_global_cse"); + metrics_scope_end(o->c, "opt.o1.promote_scalar_locals"); + metrics_scope_begin(o->c, "opt.o1.promote_scalar.verify"); + opt_verify(o->f, "o1-promote-scalar"); + metrics_scope_end(o->c, "opt.o1.promote_scalar.verify"); + metrics_scope_begin(o->c, "opt.o1.addr_of_global_cse"); opt_addr_of_global_cse(o->f); - metrics_scope_end(o->c, "opt.addr_of_global_cse"); + metrics_scope_end(o->c, "opt.o1.addr_of_global_cse"); + metrics_scope_begin(o->c, "opt.o1.addr_of_global.verify"); + opt_verify(o->f, "o1-addr-global-cse"); + metrics_scope_end(o->c, "opt.o1.addr_of_global.verify"); metrics_scope_begin(o->c, "opt.build_loop_tree"); opt_build_loop_tree(o->f); metrics_scope_end(o->c, "opt.build_loop_tree"); @@ -1387,7 +1405,8 @@ static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope, o->f->opt_dde_live_words_touched); metrics_scope_end(o->c, "opt.dead_def_elim"); metrics_scope_begin(o->c, "opt.regalloc"); - opt_regalloc(o->f, allow_live_range_split); + OptLiveInfo regalloc_live; + opt_regalloc_locations(o->f, 0, &regalloc_live); metrics_count(o->c, "opt.alloc.used_loc_words", o->f->opt_used_loc_words); metrics_count(o->c, "opt.alloc.hard_loc_words", o->f->opt_alloc_hard_loc_words); @@ -1406,30 +1425,46 @@ static void opt_run_lowering_pipeline(OptImpl* o, const char* total_scope, metrics_count(o->c, "opt.alloc.stack_mark_points", o->f->opt_alloc_stack_mark_points); metrics_count(o->c, "opt.alloc.spills", func_spill_alloc_count(o->f)); + metrics_scope_end(o->c, "opt.regalloc"); + metrics_scope_begin(o->c, "opt.regalloc.verify"); + opt_analysis_invalidate(o->f, OPT_ANALYSIS_DEF_USE); + opt_verify(o->f, "post-regalloc"); + metrics_scope_end(o->c, "opt.regalloc.verify"); + metrics_scope_begin(o->c, "opt.lower_mir"); + opt_lower_to_mir(o->f, &regalloc_live); metrics_count(o->c, "opt.rewrite.reloads", func_spill_load_count(o->f)); metrics_count(o->c, "opt.rewrite.stores", func_spill_store_count(o->f)); metrics_count(o->c, "opt.rewrite.inserted_insts", o->f->opt_rewrite_inserted_insts); metrics_count(o->c, "opt.rewrite.live_words_touched", o->f->opt_rewrite_live_words_touched); - metrics_scope_end(o->c, "opt.regalloc"); + metrics_scope_end(o->c, "opt.lower_mir"); + metrics_scope_begin(o->c, "opt.lower_mir.verify"); + opt_mir_verify(o->f, "lower-mir"); + metrics_scope_end(o->c, "opt.lower_mir.verify"); metrics_scope_begin(o->c, "opt.combine"); - opt_combine(o->f); + opt_mir_combine(o->f); metrics_scope_end(o->c, "opt.combine"); + metrics_scope_begin(o->c, "opt.combine.verify"); + opt_mir_verify(o->f, "post-mir-combine"); + metrics_scope_end(o->c, "opt.combine.verify"); metrics_scope_begin(o->c, "opt.dce"); - opt_dce(o->f); + opt_mir_dce(o->f); metrics_scope_end(o->c, "opt.dce"); + metrics_scope_begin(o->c, "opt.dce.verify"); + opt_mir_verify(o->f, "post-mir-dce"); + metrics_scope_end(o->c, "opt.dce.verify"); metrics_scope_begin(o->c, "opt.post_ra.jump_cleanup_cfg"); - opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG); + opt_mir_jump_cleanup(o->f, OPT_JUMP_CLEANUP_CFG); metrics_scope_end(o->c, "opt.post_ra.jump_cleanup_cfg"); metrics_scope_begin(o->c, "opt.post_ra.build_cfg"); - opt_build_cfg(o->f); + opt_mir_build_cfg(o->f); metrics_scope_end(o->c, "opt.post_ra.build_cfg"); metrics_scope_begin(o->c, "opt.post_ra.verify"); - opt_verify(o->f, "post-ra-jump-cfg"); + opt_mir_verify(o->f, "post-mir-jump-cfg"); metrics_scope_end(o->c, "opt.post_ra.verify"); metrics_scope_begin(o->c, "opt.post_ra.jump_cleanup_layout"); - opt_jump_cleanup(o->f, OPT_JUMP_CLEANUP_LAYOUT); + opt_mir_jump_cleanup(o->f, OPT_JUMP_CLEANUP_LAYOUT); metrics_scope_end(o->c, "opt.post_ra.jump_cleanup_layout"); metrics_scope_begin(o->c, "opt.emit"); opt_emit(o->c, o->f, o->target); diff --git a/src/opt/opt.h b/src/opt/opt.h @@ -83,6 +83,7 @@ void opt_machinize(Func*, CGTarget* target); void opt_build_loop_tree(Func*); typedef struct OptBitset { + Arena* arena; u64* words; u32 nwords; u32 active_words; @@ -145,19 +146,6 @@ typedef struct OptLiveRangeSet { u64 live_words_touched; } OptLiveRangeSet; -typedef enum OptLocKind { - OPT_LOC_NONE, - OPT_LOC_HARD, - OPT_LOC_STACK, -} OptLocKind; - -typedef struct OptLoc { - u8 kind; - u8 cls; - Reg hard_reg; - FrameSlot spill_slot; -} OptLoc; - typedef void (*OptBitsetIterFn)(PReg, void*); void opt_bitset_clear(OptBitset*); @@ -177,7 +165,15 @@ void opt_ir_dump(Func*, Writer*); void opt_ssa_dump(Func*, Writer*); void opt_rewrite_dump(Func*, Writer*); void opt_coalesce(Func*); +void opt_regalloc_locations(Func*, int allow_live_range_split, + OptLiveInfo* live_out); void opt_regalloc(Func*, int allow_live_range_split); +void opt_lower_to_mir(Func*, const OptLiveInfo*); +void opt_mir_combine(Func*); +void opt_mir_dce(Func*); +void opt_mir_jump_cleanup(Func*, OptJumpCleanupStage); +void opt_mir_build_cfg(Func*); +void opt_mir_verify(Func*, const char* stage); void opt_combine(Func*); /* code selection: merge dependent insns */ void opt_dce(Func*); /* post-RA DCE */ void opt_dead_def_elim(Func*); /* pre-RA dead-definition elimination */ diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h @@ -87,6 +87,51 @@ static inline u8 opt_reg_cls(const Func* f, PReg r) { return f->preg_cls[r]; } +static inline const OptLoc* opt_preg_loc(const Func* f, PReg r) { + if (!f || !f->preg_locs || !opt_reg_valid(f, r)) return NULL; + return &f->preg_locs[r]; +} + +static inline u8 opt_preg_alloc_kind(const Func* f, PReg r) { + const OptLoc* loc = opt_preg_loc(f, r); + if (loc) { + switch ((OptLocKind)loc->kind) { + case OPT_LOC_HARD: + return OPT_ALLOC_HARD; + case OPT_LOC_STACK: + return OPT_ALLOC_SPILL; + case OPT_LOC_NONE: + default: + break; + } + } + if (!f || !f->preg_info || !opt_reg_valid(f, r)) return OPT_ALLOC_NONE; + return f->preg_info[r].alloc_kind; +} + +static inline u8 opt_preg_loc_cls(const Func* f, PReg r) { + const OptLoc* loc = opt_preg_loc(f, r); + if (loc && loc->kind != OPT_LOC_NONE) return loc->cls; + if (f && f->preg_info && opt_reg_valid(f, r)) return f->preg_info[r].cls; + return opt_reg_cls(f, r); +} + +static inline Reg opt_preg_hard_reg(const Func* f, PReg r) { + const OptLoc* loc = opt_preg_loc(f, r); + if (loc && loc->kind == OPT_LOC_HARD) return loc->hard_reg; + if (f && f->preg_info && opt_reg_valid(f, r)) + return f->preg_info[r].hard_reg; + return REG_NONE; +} + +static inline FrameSlot opt_preg_spill_slot(const Func* f, PReg r) { + const OptLoc* loc = opt_preg_loc(f, r); + if (loc && loc->kind == OPT_LOC_STACK) return loc->spill_slot; + if (f && f->preg_info && opt_reg_valid(f, r)) + return f->preg_info[r].spill_slot; + return FRAME_SLOT_NONE; +} + void opt_analysis_mark_valid(Func*, u32 flags); void opt_analysis_invalidate(Func*, u32 flags); int opt_analysis_has(Func*, u32 flags); diff --git a/src/opt/opt_util.c b/src/opt/opt_util.c @@ -1,5 +1,7 @@ #include "opt/opt_internal.h" +#include <string.h> + int opt_val_in_inst_defs(const Inst* in, Val v) { if (!in || v == VAL_NONE) return 0; if (in->def == v) return 1; @@ -66,6 +68,19 @@ void opt_walk_inst_operands(Func* f, Inst* in, OptOperandWalkFn fn, void* ctx) { } switch ((IROp)in->op) { + case IR_PARAM_DECL: { + Operand def; + if (in->def == VAL_NONE) break; + memset(&def, 0, sizeof def); + def.kind = OPK_REG; + def.cls = + (f->preg_info && in->def < opt_reg_count(f)) ? f->preg_info[in->def].cls + : 0; + def.type = in->type; + def.v.reg = (Reg)in->def; + opt_walk_operand(f, in, &def, 1, fn, ctx); + break; + } case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; if (!aux) break; diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c @@ -38,6 +38,158 @@ static int verify_stage_is_ssa(const char* stage) { return stage && slice_contains_cstr(s, "ssa") && !slice_contains_cstr(s, "pre-ssa"); } + +static u8 verify_type_reg_class(Func* f, CfreeCgTypeId ty) { + CfreeCgTypeKind kind = cfree_cg_type_kind((CfreeCompiler*)f->c, ty); + return kind == CFREE_CG_TYPE_FLOAT ? RC_FP : RC_INT; +} + +static void verify_frame_slot(Func* f, const char* stage, FrameSlot slot, + const char* msg) { + if (slot == FRAME_SLOT_NONE || slot > f->nframe_slots) + opt_fail(f, stage, msg, slot, f->nframe_slots); +} + +static void verify_storage(Func* f, const char* stage, CGLocalStorage st, + CfreeCgTypeId type, u8 expect_cls, + const char* msg) { + switch ((CGLocalStorageKind)st.kind) { + case CG_LOCAL_STORAGE_FRAME: + verify_frame_slot(f, stage, st.v.frame_slot, msg); + break; + case CG_LOCAL_STORAGE_REG: { + PReg r = (PReg)st.v.reg; + if (r == PREG_NONE || r == 0 || r >= opt_reg_count(f)) + opt_fail(f, stage, msg, r, opt_reg_count(f)); + if (!f->opt_reg_ssa && f->preg_cls && f->preg_cls[r] != expect_cls) + opt_fail(f, stage, "storage class mismatch", r, f->preg_cls[r]); + if (!f->opt_reg_ssa && type && f->preg_type && f->preg_type[r] && + f->preg_type[r] != type) + opt_fail(f, stage, "storage type mismatch", r, type); + break; + } + default: + opt_fail(f, stage, "bad storage kind", st.kind, 0); + break; + } +} + +static void verify_operand_shape(Func* f, const char* stage, const Operand* op, + int physical_regs) { + if (!op) return; + switch ((OpKind)op->kind) { + case OPK_IMM: + case OPK_GLOBAL: + break; + case OPK_LOCAL: + verify_frame_slot(f, stage, op->v.frame_slot, "bad local frame slot"); + break; + case OPK_REG: + if (op->cls >= OPT_REG_CLASSES) + opt_fail(f, stage, "bad operand class", op->cls, OPT_REG_CLASSES); + if (physical_regs) { + if (op->v.reg == (Reg)REG_NONE || op->v.reg >= OPT_MAX_HARD_REGS) + opt_fail(f, stage, "bad physical operand reg", op->v.reg, + OPT_MAX_HARD_REGS); + } + break; + case OPK_INDIRECT: + if (op->cls >= OPT_REG_CLASSES) + opt_fail(f, stage, "bad indirect class", op->cls, OPT_REG_CLASSES); + if (physical_regs) { + if (op->v.ind.base == (Reg)REG_NONE || + op->v.ind.base >= OPT_MAX_HARD_REGS) + opt_fail(f, stage, "bad physical indirect base", op->v.ind.base, + OPT_MAX_HARD_REGS); + if (op->v.ind.index != (Reg)REG_NONE && + op->v.ind.index >= OPT_MAX_HARD_REGS) + opt_fail(f, stage, "bad physical indirect index", op->v.ind.index, + OPT_MAX_HARD_REGS); + } + break; + default: + opt_fail(f, stage, "bad operand kind", op->kind, 0); + break; + } +} + +static void verify_abivalue_shape(Func* f, const char* stage, CGABIValue* v, + int physical_regs) { + if (!v) return; + verify_operand_shape(f, stage, &v->storage, physical_regs); + for (u32 i = 0; i < v->nparts; ++i) + verify_operand_shape(f, stage, &v->parts[i].op, physical_regs); +} + +static void verify_call_plan_shape(Func* f, const char* stage, + CGCallPlan* plan, int physical_regs) { + if (!plan) return; + verify_operand_shape(f, stage, &plan->callee, physical_regs); + for (u32 i = 0; i < plan->nargs; ++i) { + verify_operand_shape(f, stage, &plan->args[i].src, physical_regs); + if (plan->args[i].dst_kind == CG_CALL_PLAN_REG && + plan->args[i].dst_reg >= OPT_MAX_HARD_REGS) + opt_fail(f, stage, "bad call-plan dst reg", plan->args[i].dst_reg, + OPT_MAX_HARD_REGS); + } + for (u32 i = 0; i < plan->nrets; ++i) { + verify_operand_shape(f, stage, &plan->rets[i].dst, physical_regs); + if (plan->rets[i].src_reg >= OPT_MAX_HARD_REGS) + opt_fail(f, stage, "bad call-plan ret reg", plan->rets[i].src_reg, + OPT_MAX_HARD_REGS); + } +} + +static void verify_aux_shapes(Func* f, const char* stage, Inst* in, + int physical_regs) { + switch ((IROp)in->op) { + case IR_CALL: { + IRCallAux* aux = (IRCallAux*)in->extra.aux; + if (!aux) break; + if (aux->use_plan_replay) { + verify_call_plan_shape(f, stage, &aux->plan, physical_regs); + } else { + verify_operand_shape(f, stage, &aux->desc.callee, physical_regs); + for (u32 i = 0; i < aux->desc.nargs; ++i) + verify_abivalue_shape(f, stage, (CGABIValue*)&aux->desc.args[i], + physical_regs); + verify_abivalue_shape(f, stage, &aux->desc.ret, physical_regs); + } + break; + } + case IR_RET: { + IRRetAux* aux = (IRRetAux*)in->extra.aux; + if (aux && aux->present) + verify_abivalue_shape(f, stage, &aux->val, physical_regs); + break; + } + case IR_SCOPE_BEGIN: { + IRScopeAux* aux = (IRScopeAux*)in->extra.aux; + if (aux) verify_operand_shape(f, stage, &aux->desc.cond, physical_regs); + break; + } + case IR_ASM_BLOCK: { + IRAsmAux* aux = (IRAsmAux*)in->extra.aux; + if (!aux) break; + for (u32 i = 0; i < aux->nin; ++i) + verify_operand_shape(f, stage, &aux->in_ops[i], physical_regs); + for (u32 i = 0; i < aux->nout; ++i) + verify_operand_shape(f, stage, &aux->out_ops[i], physical_regs); + break; + } + case IR_INTRINSIC: { + IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; + if (!aux) break; + for (u32 i = 0; i < aux->narg; ++i) + verify_operand_shape(f, stage, &aux->args[i], physical_regs); + for (u32 i = 0; i < aux->ndst; ++i) + verify_operand_shape(f, stage, &aux->dsts[i], physical_regs); + break; + } + default: + break; + } +} #endif void opt_analysis_mark_valid(Func* f, u32 flags) { @@ -400,6 +552,9 @@ static void verify_values(Func* f, const char* stage) { if (v == VAL_NONE || v >= ndefs) opt_fail(f, stage, "bad inst multi-def", b, d); } + for (u32 o = 0; o < in->nopnds; ++o) + verify_operand_shape(f, stage, &in->opnds[o], 0); + verify_aux_shapes(f, stage, in, 0); opt_walk_inst_operands(f, in, verify_operand, (void*)stage); if ((IROp)in->op == IR_PHI) { IRPhiAux* aux = (IRPhiAux*)in->extra.aux; @@ -424,7 +579,130 @@ static void verify_values(Func* f, const char* stage) { f->val_type[aux->pred_vals[p]] != in->type) opt_fail(f, stage, "phi input type mismatch", b, p); } + } else if ((IROp)in->op == IR_PARAM_DECL) { + if (in->nopnds || in->opnds) + opt_fail(f, stage, "param_decl should not carry operands", b, i); + if (in->def == VAL_NONE) + opt_fail(f, stage, "param_decl missing def", b, i); + } + } + } +} + +static void verify_function_storage(Func* f, const char* stage) { + for (u32 i = 0; i < f->nframe_slots; ++i) { + IRFrameSlot* s = &f->frame_slots[i]; + if (s->id != i + 1u) opt_fail(f, stage, "frame slot id mismatch", s->id, i); + if (s->kind > FS_SPILL) opt_fail(f, stage, "bad frame slot kind", s->kind, i); + } + for (u32 i = 0; i < f->nparams; ++i) { + IRParam* p = &f->params[i]; + u8 cls = verify_type_reg_class(f, p->type); + if (p->index != i) opt_fail(f, stage, "param index mismatch", p->index, i); + verify_storage(f, stage, p->storage, p->type, cls, "bad param storage"); + } + for (u32 i = 0; i < f->nlocals; ++i) { + IRLocal* l = &f->locals[i]; + u8 cls = verify_type_reg_class(f, l->desc.type); + verify_storage(f, stage, l->storage, l->desc.type, cls, + "bad local storage"); + if (l->home_slot != FRAME_SLOT_NONE) + verify_frame_slot(f, stage, l->home_slot, "bad local home slot"); + } +} + +static void verify_allocations(Func* f, const char* stage) { + if (!f->preg_info && !f->preg_locs) return; + for (PReg r = 1; r < opt_reg_count(f); ++r) { + OptPRegInfo* pi = f->preg_info ? &f->preg_info[r] : NULL; + OptLoc* loc = f->preg_locs ? &f->preg_locs[r] : NULL; + u8 alloc_kind = opt_preg_alloc_kind(f, r); + u8 cls = opt_preg_loc_cls(f, r); + if (cls >= OPT_REG_CLASSES) + opt_fail(f, stage, "bad preg alloc class", r, cls); + if (pi && pi->alloc_kind != alloc_kind && + !(pi->alloc_kind == OPT_ALLOC_SPLIT && alloc_kind == OPT_ALLOC_SPLIT)) + opt_fail(f, stage, "alloc kind mirror mismatch", r, pi->alloc_kind); + switch ((OptAllocKind)alloc_kind) { + case OPT_ALLOC_NONE: + if (loc && loc->kind != OPT_LOC_NONE) + opt_fail(f, stage, "alloc location mismatch", r, loc->kind); + break; + case OPT_ALLOC_HARD: + if (opt_preg_hard_reg(f, r) == (Reg)REG_NONE || + opt_preg_hard_reg(f, r) >= OPT_MAX_HARD_REGS) + opt_fail(f, stage, "bad hard allocation", r, + opt_preg_hard_reg(f, r)); + if (pi && (pi->cls != cls || pi->hard_reg != opt_preg_hard_reg(f, r))) + opt_fail(f, stage, "hard alloc mirror mismatch", r, pi->hard_reg); + if (loc && (loc->kind != OPT_LOC_HARD || loc->cls != cls)) + opt_fail(f, stage, "hard alloc location mismatch", r, + opt_preg_hard_reg(f, r)); + break; + case OPT_ALLOC_SPILL: + verify_frame_slot(f, stage, opt_preg_spill_slot(f, r), + "bad spill slot"); + if (f->frame_slots[opt_preg_spill_slot(f, r) - 1u].kind != FS_SPILL) + opt_fail(f, stage, "spill slot is not FS_SPILL", r, + opt_preg_spill_slot(f, r)); + if (pi && + (pi->cls != cls || pi->spill_slot != opt_preg_spill_slot(f, r))) + opt_fail(f, stage, "spill alloc mirror mismatch", r, pi->spill_slot); + if (loc && (loc->kind != OPT_LOC_STACK || loc->cls != cls)) + opt_fail(f, stage, "spill alloc location mismatch", r, + opt_preg_spill_slot(f, r)); + break; + case OPT_ALLOC_SPLIT: + if (!f->opt_first_segment_by_preg) + opt_fail(f, stage, "split allocation missing segments", r, 0); + if (loc && loc->kind != OPT_LOC_NONE) + opt_fail(f, stage, "split alloc location mismatch", r, loc->kind); + break; + default: + opt_fail(f, stage, "bad allocation kind", r, alloc_kind); + break; + } + } + if (f->opt_first_segment_by_preg && f->opt_alloc_segments) { + for (PReg r = 1; r < opt_reg_count(f); ++r) { + for (u32 si = f->opt_first_segment_by_preg[r]; si != OPT_RANGE_NONE; + si = f->opt_alloc_segments[si].next) { + OptAllocSegment* s = &f->opt_alloc_segments[si]; + if (s->block >= f->nblocks) opt_fail(f, stage, "bad split block", r, si); + if (s->start > s->end) opt_fail(f, stage, "bad split range", r, si); + if (s->cls >= OPT_REG_CLASSES) + opt_fail(f, stage, "bad split class", r, s->cls); + if (s->loc_kind == OPT_LOC_HARD) { + if (s->hard_reg == (Reg)REG_NONE || s->hard_reg >= OPT_MAX_HARD_REGS) + opt_fail(f, stage, "bad split hard reg", r, s->hard_reg); + } else if (s->loc_kind == OPT_LOC_STACK) { + verify_frame_slot(f, stage, s->spill_slot, "bad split spill slot"); + } else if (s->loc_kind != OPT_LOC_NONE) { + opt_fail(f, stage, "bad split loc kind", r, s->loc_kind); + } + } + } + } +} + +static void verify_rewritten(Func* f, const char* stage) { + if (!f->opt_rewritten) return; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op == IR_PHI) + opt_fail(f, stage, "phi survived rewrite", b, i); + if ((IROp)in->op == IR_PARAM_DECL) { + if (in->nopnds || in->opnds) + opt_fail(f, stage, "param_decl carries operands after rewrite", b, i); + if (in->def == VAL_NONE || in->def >= opt_reg_count(f)) + opt_fail(f, stage, "bad param_decl def after rewrite", b, i); + continue; } + for (u32 o = 0; o < in->nopnds; ++o) + verify_operand_shape(f, stage, &in->opnds[o], 1); + verify_aux_shapes(f, stage, in, 1); } } } @@ -557,7 +835,10 @@ void opt_verify(Func* f, const char* stage) { if (seen_emit[b]) opt_fail(f, stage, "duplicate emit block", b, i); seen_emit[b] = 1; } + verify_function_storage(f, stage); + verify_allocations(f, stage); verify_values(f, stage); + verify_rewritten(f, stage); verify_def_use(f, stage); #endif } diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c @@ -475,6 +475,26 @@ static int ctx_def_changed_since(const CombineCtx* ctx, u8 cls, Reg reg, return ctx->last_def[cls][reg] > since_idx; } +static i32 ctx_prev_def_before(const CombineCtx* ctx, const Operand* reg, + i32 before_idx) { + if (!reg || reg->kind != OPK_REG) return -1; + for (i32 j = before_idx - 1; j >= 0; --j) { + const Inst* prev = &ctx->bl->insts[j]; + if (inst_is_clobber_barrier(prev) || inst_defines_phys_reg(prev, reg)) + return j; + } + return -1; +} + +static void ctx_restore_removed_def(CombineCtx* ctx, const Operand* reg, + i32 removed_idx) { + if (!reg || reg->kind != OPK_REG) return; + if (reg->cls >= OPT_REG_CLASSES || reg->v.reg >= OPT_MAX_HARD_REGS) return; + if (ctx->last_def[reg->cls][reg->v.reg] == removed_idx) + ctx->last_def[reg->cls][reg->v.reg] = + ctx_prev_def_before(ctx, reg, removed_idx); +} + /* ---- forward-scan use accounting (used for single-use checks) ---- */ /* Count phys uses of `def` within the live range starting just after @@ -506,6 +526,18 @@ static int count_uses_in_live_range(const Block* bl, i32 prod_idx, return n; } +static int use_after_clobber_before_redef(const Block* bl, i32 prod_idx, + const Operand* def) { + int saw_clobber = 0; + for (i32 i = prod_idx + 1; i < (i32)bl->ninsts; ++i) { + const Inst* in = &bl->insts[i]; + if (saw_clobber && inst_uses_phys_reg(in, def)) return 1; + if (inst_defines_phys_reg(in, def)) return 0; + if (inst_is_clobber_barrier(in)) saw_clobber = 1; + } + return 0; +} + /* ---- ConvKind helpers (for combine_exts) ---- */ static u32 builtin_int_bytes(CfreeCgTypeId t) { @@ -672,6 +704,10 @@ static int subst_consumer_operands(Inst* in, const Operand* def, (op->v.ind.base == def->v.reg || (op->v.ind.index != (Reg)REG_NONE && op->v.ind.index == def->v.reg))) { + if ((IROp)in->op == IR_STORE || (IROp)in->op == IR_ATOMIC_STORE || + (IROp)in->op == IR_BITFIELD_STORE || (IROp)in->op == IR_AGG_COPY || + (IROp)in->op == IR_AGG_SET) + continue; set_indirect_field(op, def->v.reg, src->v.reg); ++n; } @@ -947,6 +983,8 @@ static int try_sink(CombineCtx* ctx, Inst* in, i32 i) { int killed = 0; int uses_total = count_uses_in_live_range(ctx->bl, prod_idx, &src, &killed); if (uses_total != 1) return 0; + if (killed && use_after_clobber_before_redef(ctx->bl, prod_idx, &src)) + return 0; if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, ctx->bl->id, &src)) return 0; @@ -962,8 +1000,10 @@ static int try_sink(CombineCtx* ctx, Inst* in, i32 i) { prod->opnds[0] = dst; if (prod->def != VAL_NONE) prod->def = (Val)dst.v.reg; - /* Update last-def: producer no longer defines src, now defines dst. */ - ctx->last_def[src.cls][src.v.reg] = -1; + /* Update last-def: producer no longer defines src, now defines dst. If src + * had an earlier reaching definition in the block, keep it visible for + * later source-availability checks. */ + ctx_restore_removed_def(ctx, &src, prod_idx); ctx->last_def[dst.cls][dst.v.reg] = prod_idx; /* Mark the copy NOP'd so compact removes it. */ @@ -1082,6 +1122,7 @@ static int try_ret_retarget(Func* f, Block* bl, i32 i) { static int opt_combine_fold_block(Func* f, Block* bl, const OptHardBlockLive* hard_live) { enum { enable_o1_combine_rewrites = 1 }; + enum { enable_o1_sink_rewrites = 1 }; CombineCtx ctx; ctx.f = f; ctx.bl = bl; @@ -1103,7 +1144,7 @@ static int opt_combine_fold_block(Func* f, Block* bl, for (Reg r = 0; r < OPT_MAX_HARD_REGS; ++r) { probe.v.reg = r; if (ctx.last_def[c][r] == i - 1 && !inst_defines_phys_reg(prev, &probe)) - ctx.last_def[c][r] = -1; + ctx_restore_removed_def(&ctx, &probe, i - 1); } } ctx_record(&ctx, prev, i - 1); @@ -1112,7 +1153,7 @@ static int opt_combine_fold_block(Func* f, Block* bl, /* Skip NOPs left by prior sink rewrites. */ if ((IROp)in->op == IR_NOP) continue; - if (enable_o1_combine_rewrites && try_sink(&ctx, in, i)) { + if (enable_o1_sink_rewrites && try_sink(&ctx, in, i)) { /* sink NOP'd the copy and updated ctx. */ continue; } diff --git a/src/opt/pass_emit.c b/src/opt/pass_emit.c @@ -74,14 +74,13 @@ static CGLocalStorage xlat_storage(ReplayCtx* r, CGLocalStorage st, (void)ty; if (st.kind == CG_LOCAL_STORAGE_REG) { PReg pr = (PReg)st.v.reg; - if (r->identity_regs && r->f->opt_rewritten && pr != 0 && - pr < opt_reg_count(r->f) && r->f->preg_info) { - OptPRegInfo* vi = &r->f->preg_info[pr]; - if (vi->alloc_kind == OPT_ALLOC_HARD) { - st.v.reg = vi->hard_reg; - } else if (vi->alloc_kind == OPT_ALLOC_SPILL) { + if (r->identity_regs && r->f->opt_rewritten && opt_reg_valid(r->f, pr)) { + u8 alloc_kind = opt_preg_alloc_kind(r->f, pr); + if (alloc_kind == OPT_ALLOC_HARD) { + st.v.reg = opt_preg_hard_reg(r->f, pr); + } else if (alloc_kind == OPT_ALLOC_SPILL) { st.kind = CG_LOCAL_STORAGE_FRAME; - st.v.frame_slot = slot_to_target(r, vi->spill_slot); + st.v.frame_slot = slot_to_target(r, opt_preg_spill_slot(r->f, pr)); } else { st.v.reg = val_to_target_reg(r, (Val)pr); } @@ -96,11 +95,11 @@ static CGLocalStorage xlat_storage(ReplayCtx* r, CGLocalStorage st, static int replay_reg_storage_unused(ReplayCtx* r, CGLocalStorage st) { if (!r || st.kind != CG_LOCAL_STORAGE_REG) return 0; - if (!(r->identity_regs && r->f->opt_rewritten && r->f->preg_info)) return 0; + if (!(r->identity_regs && r->f->opt_rewritten)) return 0; PReg pr = (PReg)st.v.reg; if (pr == 0 || pr >= opt_reg_count(r->f)) return 0; - return r->f->preg_info[pr].alloc_kind == OPT_ALLOC_NONE || - r->f->preg_info[pr].use_freq == 0; + if (opt_preg_alloc_kind(r->f, pr) == OPT_ALLOC_NONE) return 1; + return r->f->preg_info && r->f->preg_info[pr].use_freq == 0; } static Operand xlat_op(ReplayCtx* r, Operand op) { @@ -855,39 +854,54 @@ static void add_unique_reg(Reg* used, u32* nused, u32 cap, Reg r) { if (*nused < cap) used[(*nused)++] = r; } -static void collect_replayed_operand_reg(const Operand* op, RegClass cls, +static void collect_replayed_reg(Func* f, Reg raw, RegClass cls, Reg* used, + u32* nused, u32 cap) { + if (raw == (Reg)REG_NONE) return; + if (f && f->opt_rewritten && opt_reg_valid(f, (PReg)raw)) { + PReg pr = (PReg)raw; + if (opt_preg_alloc_kind(f, pr) == OPT_ALLOC_HARD && + opt_preg_loc_cls(f, pr) == (u8)cls) + add_unique_reg(used, nused, cap, opt_preg_hard_reg(f, pr)); + } + add_unique_reg(used, nused, cap, raw); +} + +static void collect_replayed_operand_reg(Func* f, const Operand* op, RegClass cls, Reg* used, u32* nused, u32 cap) { if (!op) return; if (op->kind == OPK_REG) { - if (op->cls == cls) add_unique_reg(used, nused, cap, op->v.reg); + if (op->cls == cls) + collect_replayed_reg(f, op->v.reg, cls, used, nused, cap); } else if (op->kind == OPK_INDIRECT) { if (cls == RC_INT) { - add_unique_reg(used, nused, cap, op->v.ind.base); + collect_replayed_reg(f, op->v.ind.base, cls, used, nused, cap); if (op->v.ind.index != (Reg)REG_NONE) - add_unique_reg(used, nused, cap, op->v.ind.index); + collect_replayed_reg(f, op->v.ind.index, cls, used, nused, cap); } } } -static void collect_replayed_abivalue_regs(const CGABIValue* v, RegClass cls, +static void collect_replayed_abivalue_regs(Func* f, const CGABIValue* v, + RegClass cls, Reg* used, u32* nused, u32 cap) { if (!v) return; - collect_replayed_operand_reg(&v->storage, cls, used, nused, cap); + collect_replayed_operand_reg(f, &v->storage, cls, used, nused, cap); for (u32 i = 0; i < v->nparts; ++i) - collect_replayed_operand_reg(&v->parts[i].op, cls, used, nused, cap); + collect_replayed_operand_reg(f, &v->parts[i].op, cls, used, nused, cap); } static void collect_replayed_param_regs(Func* f, RegClass cls, Reg* used, u32* nused, u32 cap) { - if (!f->opt_rewritten || !f->preg_info) return; + if (!f->opt_rewritten) return; for (u32 i = 0; i < f->nparams; ++i) { IRParam* p = &f->params[i]; if (p->storage.kind != CG_LOCAL_STORAGE_REG) continue; PReg pr = (PReg)p->storage.v.reg; if (pr == 0 || pr >= opt_reg_count(f)) continue; - OptPRegInfo* vi = &f->preg_info[pr]; - if (vi->alloc_kind != OPT_ALLOC_HARD || vi->cls != cls) continue; - add_unique_reg(used, nused, cap, vi->hard_reg); + if (opt_preg_alloc_kind(f, pr) != OPT_ALLOC_HARD || + opt_preg_loc_cls(f, pr) != (u8)cls) + continue; + add_unique_reg(used, nused, cap, opt_preg_hard_reg(f, pr)); } } @@ -901,35 +915,35 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls, Inst* in = &bl->insts[i]; if ((IROp)in->op == IR_PARAM_DECL) continue; for (u32 j = 0; j < in->nopnds; ++j) - collect_replayed_operand_reg(&in->opnds[j], cls, used, &nused, cap); + collect_replayed_operand_reg(f, &in->opnds[j], cls, used, &nused, cap); switch ((IROp)in->op) { case IR_CALL: { IRCallAux* aux = (IRCallAux*)in->extra.aux; if (!aux) break; if (aux->use_plan_replay) { - collect_replayed_operand_reg(&aux->plan.callee, cls, used, &nused, + collect_replayed_operand_reg(f, &aux->plan.callee, cls, used, &nused, cap); for (u32 j = 0; j < aux->plan.nargs; ++j) { - collect_replayed_operand_reg(&aux->plan.args[j].src, cls, used, + collect_replayed_operand_reg(f, &aux->plan.args[j].src, cls, used, &nused, cap); if (aux->plan.args[j].dst_kind == CG_CALL_PLAN_REG && aux->plan.args[j].cls == (u8)cls) add_unique_reg(used, &nused, cap, aux->plan.args[j].dst_reg); } for (u32 j = 0; j < aux->plan.nrets; ++j) { - collect_replayed_operand_reg(&aux->plan.rets[j].dst, cls, used, + collect_replayed_operand_reg(f, &aux->plan.rets[j].dst, cls, used, &nused, cap); if (aux->plan.rets[j].cls == (u8)cls) add_unique_reg(used, &nused, cap, aux->plan.rets[j].src_reg); } } else { - collect_replayed_operand_reg(&aux->desc.callee, cls, used, &nused, + collect_replayed_operand_reg(f, &aux->desc.callee, cls, used, &nused, cap); for (u32 j = 0; j < aux->desc.nargs; ++j) - collect_replayed_abivalue_regs(&aux->desc.args[j], cls, used, + collect_replayed_abivalue_regs(f, &aux->desc.args[j], cls, used, &nused, cap); - collect_replayed_abivalue_regs(&aux->desc.ret, cls, used, &nused, + collect_replayed_abivalue_regs(f, &aux->desc.ret, cls, used, &nused, cap); } break; @@ -937,13 +951,13 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls, case IR_RET: { IRRetAux* aux = (IRRetAux*)in->extra.aux; if (aux && aux->present) - collect_replayed_abivalue_regs(&aux->val, cls, used, &nused, cap); + collect_replayed_abivalue_regs(f, &aux->val, cls, used, &nused, cap); break; } case IR_SCOPE_BEGIN: { IRScopeAux* aux = (IRScopeAux*)in->extra.aux; if (aux) - collect_replayed_operand_reg(&aux->desc.cond, cls, used, &nused, + collect_replayed_operand_reg(f, &aux->desc.cond, cls, used, &nused, cap); break; } @@ -951,10 +965,10 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls, IRAsmAux* aux = (IRAsmAux*)in->extra.aux; if (!aux) break; for (u32 j = 0; j < aux->nin; ++j) - collect_replayed_operand_reg(&aux->in_ops[j], cls, used, &nused, + collect_replayed_operand_reg(f, &aux->in_ops[j], cls, used, &nused, cap); for (u32 j = 0; j < aux->nout; ++j) - collect_replayed_operand_reg(&aux->out_ops[j], cls, used, &nused, + collect_replayed_operand_reg(f, &aux->out_ops[j], cls, used, &nused, cap); break; } @@ -962,9 +976,9 @@ static u32 collect_replayed_hard_regs(Func* f, CGTarget* w, RegClass cls, IRIntrinAux* aux = (IRIntrinAux*)in->extra.aux; if (!aux) break; for (u32 j = 0; j < aux->narg; ++j) - collect_replayed_operand_reg(&aux->args[j], cls, used, &nused, cap); + collect_replayed_operand_reg(f, &aux->args[j], cls, used, &nused, cap); for (u32 j = 0; j < aux->ndst; ++j) - collect_replayed_operand_reg(&aux->dsts[j], cls, used, &nused, cap); + collect_replayed_operand_reg(f, &aux->dsts[j], cls, used, &nused, cap); break; } default: @@ -997,6 +1011,19 @@ static int replay_operand_uses_frame_slot(const Operand* op) { return op && op->kind == OPK_LOCAL && op->v.frame_slot != FRAME_SLOT_NONE; } +static int replay_storage_uses_frame_slot(CGLocalStorage st) { + return st.kind == CG_LOCAL_STORAGE_FRAME && + st.v.frame_slot != FRAME_SLOT_NONE; +} + +static int replay_param_storage_uses_frame_slot(Func* f, CGLocalStorage st) { + if (replay_storage_uses_frame_slot(st)) return 1; + if (st.kind != CG_LOCAL_STORAGE_REG || !f || !f->opt_rewritten) return 0; + PReg pr = (PReg)st.v.reg; + return opt_reg_valid(f, pr) && opt_preg_alloc_kind(f, pr) == OPT_ALLOC_SPILL && + opt_preg_spill_slot(f, pr) != FRAME_SLOT_NONE; +} + static int replay_abivalue_uses_frame_slot(const CGABIValue* v) { if (!v) return 0; if (replay_operand_uses_frame_slot(&v->storage)) return 1; @@ -1063,6 +1090,8 @@ static int replay_func_uses_frame_slot(Func* f) { for (u32 i = 0; i < bl->ninsts; ++i) if (replay_inst_uses_frame_slot(&bl->insts[i])) return 1; } + for (u32 i = 0; i < f->nparams; ++i) + if (replay_param_storage_uses_frame_slot(f, f->params[i].storage)) return 1; for (u32 i = 0; i < f->nframe_slots; ++i) if (f->frame_slots[i].flags & (FSF_ADDR_TAKEN | FSF_VOLATILE)) return 1; return 0; @@ -1101,8 +1130,21 @@ static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) { continue; } if ((aux->desc.flags & CG_CALL_TAIL) == 0) out->has_call = 1; - if (!w->call_stack_size) continue; - u32 need = w->call_stack_size(w, &aux->desc); + u32 need = 0; + if (aux->use_plan_replay) { + need = aux->plan.stack_arg_size; + for (u32 j = 0; j < aux->plan.nargs; ++j) { + CGCallPlanMove* m = &aux->plan.args[j]; + if (m->dst_kind == CG_CALL_PLAN_STACK || + m->dst_kind == CG_CALL_PLAN_TAIL_STACK) { + u32 end = m->stack_offset + (m->mem.size > 8u ? m->mem.size : 8u); + if (end > need) need = end; + } + } + need = (need + 15u) & ~15u; + } else if (w->call_stack_size) { + need = w->call_stack_size(w, &aux->desc); + } if (need > out->max_outgoing) out->max_outgoing = need; } } @@ -1161,7 +1203,7 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { metrics_scope_end(c, "opt.emit.plan_hard_regs"); metrics_scope_begin(c, "opt.emit.func_begin"); - int known_frame = identity && w->func_begin_known_frame && w->call_stack_size; + int known_frame = w->func_begin_known_frame != NULL; if (known_frame) { CGKnownFrameDesc frame; FrameSlot* target_slots = @@ -1171,7 +1213,8 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { w->func_begin_known_frame(w, &f->desc, &frame, target_slots); for (u32 i = 0; i < f->nframe_slots; ++i) r.slot_map[f->frame_slots[i].id] = target_slots[i]; - } else { + } + if (!known_frame) { /* func_begin with the recorded descriptor. Parameter storage is replayed * through target->param below after frame slots are mapped. */ w->func_begin(w, &f->desc); @@ -1219,6 +1262,14 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) { } else { d.storage = xlat_storage(&r, p->storage, p->type); } + if (known_frame && d.storage.kind == CG_LOCAL_STORAGE_FRAME && + d.storage.v.frame_slot == FRAME_SLOT_NONE) { + SrcLoc loc = p->loc; + compiler_panic(c, loc, + "opt replay: frame-backed param %u missing known-frame " + "slot mapping", + (unsigned)i); + } d.abi = p->abi; d.loc = p->loc; (void)w->param(w, &d); @@ -1269,5 +1320,18 @@ void opt_replay(Compiler* c, Func* f, CGTarget* target) { } void opt_emit(Compiler* c, Func* f, CGTarget* target) { + if (f && f->mir) { + Func view = *f; + view.blocks = f->mir->blocks; + view.nblocks = f->mir->nblocks; + view.entry = f->mir->entry; + view.emit_order = f->mir->emit_order; + view.emit_order_n = f->mir->emit_order_n; + view.emit_order_cap = f->mir->emit_order_cap; + view.opt_rewritten = 1; + view.mir = NULL; + replay_func_to(c, &view, target, 1); + return; + } replay_func_to(c, f, target, 1); } diff --git a/src/opt/pass_hard_live.c b/src/opt/pass_hard_live.c @@ -141,8 +141,12 @@ void opt_hard_inst_use_def(Func* f, const Inst* in, OptHardRegSet* use, hard_use_abivalue(use, &aux->desc.args[i]); hard_def_abivalue(def, &aux->desc.ret); } - for (u32 c = 0; c < OPT_REG_CLASSES; ++c) - def->cls[c] |= opt_call_clobber_mask_for(f, in, (u8)c); + for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { + u32 idx = c; + u32 mask = opt_call_clobber_mask_for(f, in, (u8)idx); + u32 old = def->cls[idx]; + def->cls[idx] = old | mask; + } break; } case IR_CMP_BRANCH: diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c @@ -11,10 +11,24 @@ static u32 opt_bit_words(u32 nregs) { return (nregs + 63u) / 64u; } static void opt_bitset_init(Arena* arena, OptBitset* bs, u32 nwords) { memset(bs, 0, sizeof *bs); + bs->arena = arena; bs->nwords = nwords; bs->words = arena_zarray(arena, u64, nwords ? nwords : 1u); } +static int opt_bitset_grow(OptBitset* bs, u32 need_words) { + if (!bs || need_words <= bs->nwords) return 1; + if (!bs->arena) return 0; + u32 nwords = bs->nwords ? bs->nwords : 1u; + while (nwords < need_words) nwords *= 2u; + u64* words = arena_zarray(bs->arena, u64, nwords); + if (bs->words && bs->nwords) + memcpy(words, bs->words, sizeof(words[0]) * bs->nwords); + bs->words = words; + bs->nwords = nwords; + return 1; +} + static void opt_bitset_trim(OptBitset* bs) { u32 n = bs->nwords; while (n && bs->words[n - 1u] == 0) --n; @@ -28,9 +42,9 @@ void opt_bitset_clear(OptBitset* bs) { } void opt_bitset_set(OptBitset* bs, PReg r) { - if (!bs || !bs->words) return; + if (!bs) return; u32 w = r / 64u; - if (w >= bs->nwords) return; + if (w >= bs->nwords && !opt_bitset_grow(bs, w + 1u)) return; bs->words[w] |= 1ull << (r % 64u); if (bs->active_words <= w) bs->active_words = w + 1u; } @@ -54,9 +68,12 @@ int opt_bitset_copy(OptBitset* dst, const OptBitset* src) { int changed = 0; u32 n; u32 old_active; - if (!dst || !src || !dst->words || !src->words) return 0; + if (!dst || !src || !src->words) return 0; + if (src->active_words > dst->nwords && + !opt_bitset_grow(dst, src->active_words)) + return 0; old_active = dst->active_words; - n = dst->nwords < src->active_words ? dst->nwords : src->active_words; + n = src->active_words; for (u32 i = 0; i < n; ++i) { changed |= dst->words[i] != src->words[i]; dst->words[i] = src->words[i]; @@ -73,8 +90,11 @@ int opt_bitset_copy(OptBitset* dst, const OptBitset* src) { int opt_bitset_union(OptBitset* dst, const OptBitset* src) { int changed = 0; u32 n; - if (!dst || !src || !dst->words || !src->words) return 0; - n = dst->nwords < src->active_words ? dst->nwords : src->active_words; + if (!dst || !src || !src->words) return 0; + if (src->active_words > dst->nwords && + !opt_bitset_grow(dst, src->active_words)) + return 0; + n = src->active_words; for (u32 i = 0; i < n; ++i) { u64 old = dst->words[i]; dst->words[i] |= src->words[i]; @@ -88,8 +108,11 @@ int opt_bitset_union_and_not(OptBitset* dst, const OptBitset* src, const OptBitset* not_bits) { int changed = 0; u32 n; - if (!dst || !src || !dst->words || !src->words) return 0; - n = dst->nwords < src->active_words ? dst->nwords : src->active_words; + if (!dst || !src || !src->words) return 0; + if (src->active_words > dst->nwords && + !opt_bitset_grow(dst, src->active_words)) + return 0; + n = src->active_words; for (u32 i = 0; i < n; ++i) { u64 mask = (not_bits && i < not_bits->active_words) ? not_bits->words[i] : 0; @@ -144,7 +167,11 @@ static u64 live_count_set_bits(const OptBitset* bs) { } static void live_metric_clear(OptLiveInfo* live, OptBitset* bs) { - if (live && bs) live->bitset_words_touched += bs->active_words; + u32 touched = bs ? bs->active_words : 0; + if (live) { + u64 old = live->bitset_words_touched; + live->bitset_words_touched = old + touched; + } opt_bitset_clear(bs); } @@ -152,7 +179,7 @@ static int live_metric_copy(OptLiveInfo* live, OptBitset* dst, const OptBitset* src) { u32 n = 0; if (dst && src) { - n = dst->nwords < src->active_words ? dst->nwords : src->active_words; + n = src->active_words; if (dst->active_words > n) n += dst->active_words - n; } if (live) live->bitset_words_touched += n; @@ -162,8 +189,7 @@ static int live_metric_copy(OptLiveInfo* live, OptBitset* dst, static int live_metric_union(OptLiveInfo* live, OptBitset* dst, const OptBitset* src) { u32 n = 0; - if (dst && src) - n = dst->nwords < src->active_words ? dst->nwords : src->active_words; + if (dst && src) n = src->active_words; if (live) live->bitset_words_touched += n; return opt_bitset_union(dst, src); } @@ -173,8 +199,7 @@ static int live_metric_union_and_not(OptLiveInfo* live, OptBitset* dst, const OptBitset* not_bits) { u32 n = 0; (void)not_bits; - if (dst && src) - n = dst->nwords < src->active_words ? dst->nwords : src->active_words; + if (dst && src) n = src->active_words; if (live) live->bitset_words_touched += n; return opt_bitset_union_and_not(dst, src, not_bits); } @@ -195,7 +220,13 @@ static void live_recompute_metrics(OptLiveInfo* live) { live->set_bit_scans += live_count_set_bits(&bl->live_def); } live->block_bytes = - (u64)live->f->nblocks * 4u * (u64)live->words * (u64)sizeof(u64); + live->active_words * (u64)sizeof(u64); +} + +static void live_worklist_push(u32* worklist, u8* queued, u32* n, u32 b) { + if (queued[b]) return; + queued[b] = 1; + worklist[(*n)++] = b; } void opt_live_blocks(Func* f, OptLiveInfo* live) { @@ -210,10 +241,10 @@ void opt_live_blocks(Func* f, OptLiveInfo* live) { for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; OptBlockLive* lb = &live->blocks[b]; - opt_bitset_init(f->arena, &lb->live_in, live->words); - opt_bitset_init(f->arena, &lb->live_out, live->words); - opt_bitset_init(f->arena, &lb->live_use, live->words); - opt_bitset_init(f->arena, &lb->live_def, live->words); + opt_bitset_init(f->arena, &lb->live_in, 0); + opt_bitset_init(f->arena, &lb->live_out, 0); + opt_bitset_init(f->arena, &lb->live_use, 0); + opt_bitset_init(f->arena, &lb->live_def, 0); LiveUseDefCtx ctx; ctx.use = &lb->live_use; @@ -224,31 +255,39 @@ void opt_live_blocks(Func* f, OptLiveInfo* live) { OptBitset new_out; OptBitset tmp; - opt_bitset_init(f->arena, &new_out, live->words); - opt_bitset_init(f->arena, &tmp, live->words); - - int changed; - do { - changed = 0; + opt_bitset_init(f->arena, &new_out, 0); + opt_bitset_init(f->arena, &tmp, 0); + + u32* worklist = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); + u8* queued = arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u); + u32 nwork = 0; + for (u32 bi = f->nblocks; bi > 0; --bi) + live_worklist_push(worklist, queued, &nwork, bi - 1u); + + while (nwork) { + u32 b = worklist[--nwork]; + queued[b] = 0; + Block* bl = &f->blocks[b]; + OptBlockLive* lb = &live->blocks[b]; ++live->dataflow_iterations; - for (u32 bi = f->nblocks; bi > 0; --bi) { - u32 b = bi - 1u; - Block* bl = &f->blocks[b]; - OptBlockLive* lb = &live->blocks[b]; - ++live->dataflow_block_visits; - live_metric_clear(live, &new_out); - live_metric_clear(live, &tmp); - for (u32 s = 0; s < bl->nsucc; ++s) { - u32 t = bl->succ[s]; - if (t < f->nblocks) - live_metric_union(live, &new_out, &live->blocks[t].live_in); + ++live->dataflow_block_visits; + live_metric_clear(live, &new_out); + live_metric_clear(live, &tmp); + for (u32 s = 0; s < bl->nsucc; ++s) { + u32 t = bl->succ[s]; + if (t < f->nblocks) + live_metric_union(live, &new_out, &live->blocks[t].live_in); + } + live_metric_copy(live, &tmp, &lb->live_use); + live_metric_union_and_not(live, &tmp, &new_out, &lb->live_def); + (void)live_metric_copy(live, &lb->live_out, &new_out); + if (live_metric_copy(live, &lb->live_in, &tmp)) { + for (u32 p = 0; p < bl->npreds; ++p) { + u32 pred = bl->preds[p]; + if (pred < f->nblocks) live_worklist_push(worklist, queued, &nwork, pred); } - live_metric_copy(live, &tmp, &lb->live_use); - live_metric_union_and_not(live, &tmp, &new_out, &lb->live_def); - changed |= live_metric_copy(live, &lb->live_out, &new_out); - changed |= live_metric_copy(live, &lb->live_in, &tmp); } - } while (changed); + } live_recompute_metrics(live); } diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -47,12 +47,23 @@ static u32 type_size_fallback(const Func* f, CfreeCgTypeId t) { static u32 bit_words(u32 npregs) { return (npregs + 63u) / 64u; } -static void bit_set(u64* bits, PReg v) { bits[v / 64u] |= 1ull << (v % 64u); } +static void bit_set(u64* bits, PReg v) { + u32 w = v / 64u; + u64 mask = 1ull << (v % 64u); + u64 old = bits[w]; + bits[w] = old | mask; +} static void bit_clear(u64* bits, PReg v) { - bits[v / 64u] &= ~(1ull << (v % 64u)); + u32 w = v / 64u; + u64 mask = 1ull << (v % 64u); + u64 old = bits[w]; + bits[w] = old & ~mask; } static int bit_has(const u64* bits, PReg v) { - return (bits[v / 64u] & (1ull << (v % 64u))) != 0; + u32 w = v / 64u; + u64 mask = 1ull << (v % 64u); + u64 old = bits[w]; + return (old & mask) != 0; } typedef struct BitsCtx { @@ -207,74 +218,101 @@ static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) { return 0; } -static int hard_available(Func* f, u8 cls, Reg r); +typedef struct ParamIncomingRegs { + Reg regs[64]; + u8 cls[64]; + u8 has[64]; + u32 nparams; + Reg all_regs[128]; + u8 all_cls[128]; + u32 nall; +} ParamIncomingRegs; + +static void param_incoming_add(ParamIncomingRegs* out, u8 cls, Reg r) { + if (r >= 32) return; + for (u32 i = 0; i < out->nall; ++i) + if (out->all_cls[i] == cls && out->all_regs[i] == r) return; + if (out->nall < 128u) { + out->all_cls[out->nall] = cls; + out->all_regs[out->nall] = r; + ++out->nall; + } +} -static void apply_param_incoming_register_hazards(Func* f) { - if (!f || !f->preg_info || !f->desc.abi || !f->nparams) return; - Reg incoming_regs[64]; - u8 incoming_cls[64]; - u8 has_incoming[64]; - memset(incoming_regs, 0, sizeof incoming_regs); - memset(incoming_cls, 0, sizeof incoming_cls); - memset(has_incoming, 0, sizeof has_incoming); +static void collect_param_incoming_regs(Func* f, ParamIncomingRegs* out) { + memset(out, 0, sizeof *out); + if (!f || !f->desc.abi || !f->nparams) return; u32 next_int = 0; u32 next_fp = 0; if (f->desc.abi->has_sret && f->opt_target.arch != CFREE_ARCH_ARM_64) next_int = 1; - u32 nparams = f->nparams < 64u ? f->nparams : 64u; - for (u32 i = 0; i < nparams; ++i) { + out->nparams = f->nparams < 64u ? f->nparams : 64u; + for (u32 i = 0; i < out->nparams; ++i) { IRParam* p = &f->params[i]; const ABIArgInfo* ai = p->abi; if (!ai || ai->kind == ABI_ARG_IGNORE) continue; if (ai->kind == ABI_ARG_INDIRECT) { Reg r = REG_NONE; if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) { - incoming_regs[i] = r; - incoming_cls[i] = RC_INT; - has_incoming[i] = 1; + out->regs[i] = r; + out->cls[i] = RC_INT; + out->has[i] = 1; + param_incoming_add(out, RC_INT, r); } ++next_int; continue; } - if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) continue; - const ABIArgPart* part = &ai->parts[0]; - if (part->cls == ABI_CLASS_FP) { - Reg r = REG_NONE; - if (phys_arg_reg_for_index(f, RC_FP, next_fp, &r)) { - incoming_regs[i] = r; - incoming_cls[i] = RC_FP; - has_incoming[i] = 1; - } - ++next_fp; - } else if (part->cls == ABI_CLASS_INT) { - Reg r = REG_NONE; - if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) { - incoming_regs[i] = r; - incoming_cls[i] = RC_INT; - has_incoming[i] = 1; + if (ai->kind != ABI_ARG_DIRECT) continue; + for (u16 j = 0; j < ai->nparts; ++j) { + const ABIArgPart* part = &ai->parts[j]; + if (part->cls == ABI_CLASS_FP) { + Reg r = REG_NONE; + if (phys_arg_reg_for_index(f, RC_FP, next_fp, &r)) { + param_incoming_add(out, RC_FP, r); + if (ai->nparts == 1) { + out->regs[i] = r; + out->cls[i] = RC_FP; + out->has[i] = 1; + } + } + ++next_fp; + } else if (part->cls == ABI_CLASS_INT) { + Reg r = REG_NONE; + if (phys_arg_reg_for_index(f, RC_INT, next_int, &r)) { + param_incoming_add(out, RC_INT, r); + if (ai->nparts == 1) { + out->regs[i] = r; + out->cls[i] = RC_INT; + out->has[i] = 1; + } + } + ++next_int; } - ++next_int; } } +} - for (u32 i = 0; i < nparams; ++i) { - IRParam* p = &f->params[i]; - if (p->storage.kind != CG_LOCAL_STORAGE_REG) continue; - PReg v = (PReg)p->storage.v.reg; - if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) continue; +static int hard_available(Func* f, u8 cls, Reg r); + +static void apply_param_incoming_register_hazards(Func* f) { + if (!f || !f->preg_info || !f->desc.abi || !f->nparams) return; + ParamIncomingRegs incoming; + collect_param_incoming_regs(f, &incoming); + + /* O1 replays parameter materialization before the body, but values left in + * their ABI incoming registers are not represented as live from function + * entry to first use. Keep those incoming registers out of virtual + * allocation so the backend emits explicit entry moves/stores before body + * code can reuse them. Fixed asm constraints still use tied_hard_reg. */ + for (PReg v = 1; v < opt_reg_count(f); ++v) { u8 cls = f->preg_info[v].cls; - if (has_incoming[i] && incoming_cls[i] == cls && - f->preg_info[v].tied_hard_reg < 0 && - f->preg_info[v].live_across_call_freq == 0 && - hard_available(f, cls, incoming_regs[i]) && incoming_regs[i] < 32 && - (f->preg_info[v].forbidden_hard_regs & (1u << incoming_regs[i])) == 0) { - f->preg_info[v].tied_hard_reg = (i32)incoming_regs[i]; - } - for (u32 j = i + 1u; j < nparams; ++j) { - if (!has_incoming[j] || incoming_cls[j] != cls) continue; - forbid_preg_reg(f, v, cls, incoming_regs[j]); + for (u32 j = 0; j < incoming.nall; ++j) { + if (incoming.all_cls[j] != cls) continue; + if (f->preg_info[v].tied_hard_reg == (i32)incoming.all_regs[j]) + continue; + forbid_preg_reg(f, v, cls, incoming.all_regs[j]); } } } @@ -321,8 +359,8 @@ static const CGPhysRegInfo* phys_info_for(Func* f, u8 cls, Reg r) { } static FrameSlot spill_slot_for(Func* f, PReg v) { - if (f->preg_info[v].spill_slot != FRAME_SLOT_NONE) - return f->preg_info[v].spill_slot; + FrameSlot existing = opt_preg_spill_slot(f, v); + if (existing != FRAME_SLOT_NONE) return existing; FrameSlotDesc d; memset(&d, 0, sizeof d); d.type = opt_reg_type(f, v); @@ -865,6 +903,7 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges, f->opt_alloc_stack_word_ors = a->stack_word_ors; f->opt_alloc_hard_mark_points = a->hard_mark_points; f->opt_alloc_stack_mark_points = a->stack_mark_points; + f->preg_locs = a->locs; } typedef struct RewriteList { @@ -981,9 +1020,9 @@ static Operand hard_operand(Func* f, PReg v) { Operand o; memset(&o, 0, sizeof o); o.kind = OPK_REG; - o.cls = f->preg_info[v].cls; + o.cls = opt_preg_loc_cls(f, v); o.type = opt_reg_type(f, v); - o.v.reg = f->preg_info[v].hard_reg; + o.v.reg = opt_preg_hard_reg(f, v); return o; } @@ -991,7 +1030,7 @@ static Operand hard_operand_reg(Func* f, PReg v, Reg hard_reg) { Operand o; memset(&o, 0, sizeof o); o.kind = OPK_REG; - o.cls = f->preg_info[v].cls; + o.cls = opt_preg_loc_cls(f, v); o.type = opt_reg_type(f, v); o.v.reg = hard_reg; return o; @@ -1066,21 +1105,21 @@ static void rewrite_one_operand(Func* f, Inst* owner, Operand* op, int is_def, if (op->kind != OPK_REG) return; PReg v = (PReg)op->v.reg; if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; - OptPRegInfo* vi = &f->preg_info[v]; - if (vi->alloc_kind == OPT_ALLOC_HARD) { - op->v.reg = vi->hard_reg; + u8 alloc_kind = opt_preg_alloc_kind(f, v); + if (alloc_kind == OPT_ALLOC_HARD) { + op->v.reg = opt_preg_hard_reg(f, v); return; } - if (vi->alloc_kind == OPT_ALLOC_SPLIT) { + if (alloc_kind == OPT_ALLOC_SPLIT) { const OptAllocSegment* seg = split_segment_at(f, v, c->raw_point); if (seg && seg->loc_kind == OPT_LOC_HARD) { op->v.reg = seg->hard_reg; return; } - } else if (vi->alloc_kind != OPT_ALLOC_SPILL) { + } else if (alloc_kind != OPT_ALLOC_SPILL) { return; } - u8 cls = vi->cls; + u8 cls = opt_preg_loc_cls(f, v); Reg scratch = scratch_for(f, cls, &c->next_scratch[cls]); if (scratch == (Reg)REG_NONE) { SrcLoc loc = owner ? owner->loc : (SrcLoc){0, 0, 0}; @@ -1110,11 +1149,10 @@ static void rewrite_call_arg_operand(Func* f, Operand* op) { if (!op || op->kind != OPK_REG) return; PReg v = (PReg)op->v.reg; if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; - OptPRegInfo* vi = &f->preg_info[v]; - if (vi->alloc_kind == OPT_ALLOC_HARD) { - op->v.reg = vi->hard_reg; - } else if (vi->alloc_kind == OPT_ALLOC_SPILL || - vi->alloc_kind == OPT_ALLOC_SPLIT) { + u8 alloc_kind = opt_preg_alloc_kind(f, v); + if (alloc_kind == OPT_ALLOC_HARD) { + op->v.reg = opt_preg_hard_reg(f, v); + } else if (alloc_kind == OPT_ALLOC_SPILL || alloc_kind == OPT_ALLOC_SPLIT) { *op = spill_addr(f, v); } } @@ -1155,9 +1193,9 @@ static void rewrite_call_save_one(PReg v, void* arg) { Func* f = c->f; if (v == PREG_NONE || v == 0 || v >= opt_reg_count(f)) return; if (c->refs && refs_has_def(c->refs, v)) return; - if (f->preg_info[v].alloc_kind != OPT_ALLOC_HARD) return; - u8 cls = f->preg_info[v].cls; - Reg hr = f->preg_info[v].hard_reg; + if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) return; + u8 cls = opt_preg_loc_cls(f, v); + Reg hr = opt_preg_hard_reg(f, v); if (cls >= OPT_REG_CLASSES || hr >= 32u) return; if ((opt_call_clobber_mask_for(f, c->call, cls) & (1u << hr)) == 0) return; if (c->emit_restore) @@ -1186,7 +1224,7 @@ static PReg* rewrite_collect_call_save_pregs(Func* f, u32* count_out) { u32 n = 0; for (PReg v = 1; v < opt_reg_count(f); ++v) { OptPRegInfo* vi = &f->preg_info[v]; - if (vi->alloc_kind != OPT_ALLOC_HARD) continue; + if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) continue; if (!vi->live_across_call_freq) continue; ++n; } @@ -1194,7 +1232,7 @@ static PReg* rewrite_collect_call_save_pregs(Func* f, u32* count_out) { u32 w = 0; for (PReg v = 1; v < opt_reg_count(f); ++v) { OptPRegInfo* vi = &f->preg_info[v]; - if (vi->alloc_kind != OPT_ALLOC_HARD) continue; + if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD) continue; if (!vi->live_across_call_freq) continue; pregs[w++] = v; } @@ -1206,7 +1244,7 @@ static void append_split_reloads_at(Func* f, RewriteList* out, u32 block, u32 raw_point) { if (!f->opt_first_segment_by_preg) return; for (PReg v = 1; v < opt_reg_count(f); ++v) { - if (f->preg_info[v].alloc_kind != OPT_ALLOC_SPLIT) continue; + if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue; for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE; si = f->opt_alloc_segments[si].next) { const OptAllocSegment* s = &f->opt_alloc_segments[si]; @@ -1222,7 +1260,7 @@ static void append_split_stores_at(Func* f, RewriteList* out, u32 block, u32 raw_point) { if (!f->opt_first_segment_by_preg) return; for (PReg v = 1; v < opt_reg_count(f); ++v) { - if (f->preg_info[v].alloc_kind != OPT_ALLOC_SPLIT) continue; + if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue; for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE; si = f->opt_alloc_segments[si].next) { const OptAllocSegment* s = &f->opt_alloc_segments[si]; @@ -1264,7 +1302,7 @@ static void prepare_split_edge_materialization(Func* f, EdgeMatPlan* plan) { } for (PReg v = 1; v < opt_reg_count(f); ++v) { - if (f->preg_info[v].alloc_kind != OPT_ALLOC_SPLIT) continue; + if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_SPLIT) continue; for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE; si = f->opt_alloc_segments[si].next) { OptAllocSegment* s = &f->opt_alloc_segments[si]; @@ -1481,6 +1519,59 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) { f->opt_rewritten = 1; } +static Block* lower_copy_blocks(Func* f) { + Block* blocks = arena_zarray(f->arena, Block, f->nblocks ? f->nblocks : 1u); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* dst = &blocks[b]; + Block* src = &f->blocks[b]; + *dst = *src; + if (src->ninsts) { + dst->insts = arena_array(f->arena, Inst, src->ninsts); + memcpy(dst->insts, src->insts, sizeof(Inst) * src->ninsts); + dst->cap = src->ninsts; + } + if (src->nsucc) { + dst->succ = arena_array(f->arena, u32, src->nsucc); + memcpy(dst->succ, src->succ, sizeof(u32) * src->nsucc); + dst->succ_cap = src->nsucc; + } + if (src->npreds) { + dst->preds = arena_array(f->arena, u32, src->npreds); + memcpy(dst->preds, src->preds, sizeof(u32) * src->npreds); + } + } + return blocks; +} + +void opt_lower_to_mir(Func* f, const OptLiveInfo* live_info) { + if (!f) return; + Func phys = *f; + phys.blocks = lower_copy_blocks(f); + phys.opt_rewritten = 0; + phys.mir = NULL; + + rewrite_func(&phys, live_info); + + MFunc* m = arena_zarray(f->arena, MFunc, 1); + m->blocks = phys.blocks; + m->nblocks = phys.nblocks; + m->entry = phys.entry; + m->emit_order_n = phys.emit_order_n; + m->emit_order_cap = phys.emit_order_n; + m->emit_order = + arena_array(f->arena, u32, phys.emit_order_n ? phys.emit_order_n : 1u); + if (phys.emit_order_n) + memcpy(m->emit_order, phys.emit_order, sizeof(u32) * phys.emit_order_n); + + f->mir = m; + f->frame_slots = phys.frame_slots; + f->nframe_slots = phys.nframe_slots; + f->frame_slots_cap = phys.frame_slots_cap; + f->next_inst_id = phys.next_inst_id; + f->opt_rewrite_inserted_insts = phys.opt_rewrite_inserted_insts; + f->opt_rewrite_live_words_touched = phys.opt_rewrite_live_words_touched; +} + static void rewrite_dump_sb(Writer* w, const StrBuf* sb) { cfree_writer_write(w, strbuf_cstr(sb), strbuf_len(sb)); } @@ -1581,7 +1672,112 @@ void opt_dead_def_elim(Func* f) { opt_dead_def_elim_with_live(f, &live); } -void opt_regalloc(Func* f, int allow_live_range_split) { +/* Collect the def and use PRegs of one instruction (operands + aux fan-out). */ +typedef struct AllocVerifyRefs { + PReg defs[256]; + u32 ndefs; + PReg uses[256]; + u32 nuses; +} AllocVerifyRefs; + +static void alloc_verify_collect(Func* f, Inst* in, Operand* op, int is_def, + void* ctx) { + (void)in; + AllocVerifyRefs* r = (AllocVerifyRefs*)ctx; + PReg p; + if (!op || op->kind != OPK_REG) return; + p = (PReg)op->v.reg; + if (p == 0 || p >= opt_reg_count(f)) return; + if (is_def) { + if (r->ndefs < 256u) r->defs[r->ndefs++] = p; + } else { + if (r->nuses < 256u) r->uses[r->nuses++] = p; + } +} + +/* Post-allocation interference verifier for the O1 (no-coalesce) path. Since + * O1 never coalesces, two *distinct* PRegs that are live simultaneously must + * never share a hard register. Re-derive per-instruction liveness from the + * block live-out sets and panic if any definition collides with a different + * live value in the same hard reg — turning a silent miscompile into a hard + * error. */ +static void opt_verify_alloc(Func* f, const OptLiveInfo* live) { + u32 nregs = opt_reg_count(f); + u8* cur; + if (nregs <= 1u || !live) return; + ParamIncomingRegs incoming; + collect_param_incoming_regs(f, &incoming); + for (PReg v = 1; v < nregs; ++v) { + OptPRegInfo* vi = &f->preg_info[v]; + if (opt_preg_alloc_kind(f, v) != OPT_ALLOC_HARD || vi->use_freq == 0) + continue; + u8 cls = opt_preg_loc_cls(f, v); + Reg hard = opt_preg_hard_reg(f, v); + for (u32 i = 0; i < incoming.nall; ++i) { + if (cls == incoming.all_cls[i] && hard == incoming.all_regs[i]) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(f->c, loc, + "opt regalloc: O1 preg %u left in incoming cls%u reg%u", + (unsigned)v, (unsigned)cls, (unsigned)hard); + } + } + } + cur = arena_array(f->arena, u8, nregs); + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + const OptBlockLive* lb = &live->blocks[b]; + memset(cur, 0, nregs); + for (PReg p = 1; p < nregs; ++p) + if (opt_bitset_has(&lb->live_out, p)) cur[p] = 1u; + for (u32 ri = bl->ninsts; ri > 0; --ri) { + Inst* in = &bl->insts[ri - 1u]; + AllocVerifyRefs refs; + refs.ndefs = 0; + refs.nuses = 0; + opt_walk_inst_operands(f, in, alloc_verify_collect, &refs); + for (u32 k = 0; k < refs.ndefs; ++k) { + PReg d = refs.defs[k]; + u8 d_kind = opt_preg_alloc_kind(f, d); + if (d_kind != OPT_ALLOC_HARD && d_kind != OPT_ALLOC_SPILL) + continue; + for (PReg p = 1; p < nregs; ++p) { + u8 p_kind; + if (!cur[p] || p == d) continue; + p_kind = opt_preg_alloc_kind(f, p); + if (p_kind == OPT_ALLOC_HARD && d_kind == OPT_ALLOC_HARD && + opt_preg_loc_cls(f, p) == opt_preg_loc_cls(f, d) && + opt_preg_hard_reg(f, p) == opt_preg_hard_reg(f, d)) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(f->c, loc, + "opt regalloc: O1 interference — pregs %u and %u " + "share cls%u reg%u, both live at block %u inst %u " + "(op %u)", + (unsigned)d, (unsigned)p, + (unsigned)opt_preg_loc_cls(f, d), + (unsigned)opt_preg_hard_reg(f, d), b, ri - 1u, + (unsigned)in->op); + } + if (p_kind == OPT_ALLOC_SPILL && d_kind == OPT_ALLOC_SPILL && + opt_preg_spill_slot(f, p) == opt_preg_spill_slot(f, d)) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(f->c, loc, + "opt regalloc: O1 interference — pregs %u and %u " + "share spill slot %u, both live at block %u inst %u " + "(op %u)", + (unsigned)d, (unsigned)p, + (unsigned)opt_preg_spill_slot(f, d), b, ri - 1u, + (unsigned)in->op); + } + } + } + for (u32 k = 0; k < refs.ndefs; ++k) cur[refs.defs[k]] = 0u; + for (u32 k = 0; k < refs.nuses; ++k) cur[refs.uses[k]] = 1u; + } + } +} + +static void opt_regalloc_place(Func* f, int allow_live_range_split, + OptLiveInfo* live_out) { metrics_scope_begin(f->c, "opt.live_ranges.regalloc"); OptLiveInfo live; opt_live_blocks(f, &live); @@ -1615,6 +1811,18 @@ void opt_regalloc(Func* f, int allow_live_range_split) { OptAllocator alloc; opt_assign_ranges(f, &ranges, &alloc, allow_live_range_split); + if (!allow_live_range_split) opt_verify_alloc(f, &live); + if (live_out) *live_out = live; +} + +void opt_regalloc_locations(Func* f, int allow_live_range_split, + OptLiveInfo* live_out) { + opt_regalloc_place(f, allow_live_range_split, live_out); +} + +void opt_regalloc(Func* f, int allow_live_range_split) { + OptLiveInfo live; + opt_regalloc_place(f, allow_live_range_split, &live); EdgeMatPlan edge_mats; memset(&edge_mats, 0, sizeof edge_mats); if (allow_live_range_split) diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c @@ -75,7 +75,7 @@ static void asm_prepare_constraints(Func* f, CGTarget* target, IRAsmAux* aux) { static int call_plan_replay_supported(const IRCallAux* aux, const CGTarget* target); -void opt_machinize(Func* f, CGTarget* target) { +static void machinize_reset(Func* f, CGTarget* target) { f->opt_target = target->c->target; f->opt_has_target = 1; for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { @@ -88,7 +88,9 @@ void opt_machinize(Func* f, CGTarget* target) { f->opt_arg_regs[c] = 0; f->opt_ret_regs[c] = 0; } +} +static void machinize_prepare_insts(Func* f, CGTarget* target) { for (u32 b = 0; b < f->nblocks; ++b) { Block* bl = &f->blocks[b]; for (u32 i = 0; i < bl->ninsts; ++i) { @@ -105,27 +107,30 @@ void opt_machinize(Func* f, CGTarget* target) { } } } +} +static void machinize_collect_regs(Func* f, CGTarget* target) { for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { const CGPhysRegInfo* phys = NULL; u32 nphys = 0; if (target->get_phys_regs) target->get_phys_regs(target, (RegClass)c, &phys, &nphys); if (phys) { - for (u32 i = 0; i < nphys && i < OPT_MAX_HARD_REGS; ++i) { - CGPhysRegInfo pi = phys[i]; - Reg hr = pi.reg; + u32 phys_limit = nphys < OPT_MAX_HARD_REGS ? nphys : OPT_MAX_HARD_REGS; + for (u32 i = 0; i < phys_limit; ++i) { + Reg hr = phys[i].reg; + u16 flags = phys[i].flags; if (hr < 32u) { - if (pi.flags & CG_REG_CALLER_SAVED) + if (flags & CG_REG_CALLER_SAVED) f->opt_caller_saved[c] |= 1u << hr; - if (pi.flags & CG_REG_CALLEE_SAVED) + if (flags & CG_REG_CALLEE_SAVED) f->opt_callee_saved[c] |= 1u << hr; - if (pi.flags & CG_REG_RESERVED) f->opt_reserved_regs[c] |= 1u << hr; - if (pi.flags & CG_REG_ARG) f->opt_arg_regs[c] |= 1u << hr; - if (pi.flags & CG_REG_RET) f->opt_ret_regs[c] |= 1u << hr; + if (flags & CG_REG_RESERVED) f->opt_reserved_regs[c] |= 1u << hr; + if (flags & CG_REG_ARG) f->opt_arg_regs[c] |= 1u << hr; + if (flags & CG_REG_RET) f->opt_ret_regs[c] |= 1u << hr; } - f->opt_phys_regs[c][f->opt_phys_reg_count[c]++] = pi; - if ((pi.flags & CG_REG_ALLOCABLE) && !(pi.flags & CG_REG_RESERVED)) { + f->opt_phys_regs[c][f->opt_phys_reg_count[c]++] = phys[i]; + if ((flags & CG_REG_ALLOCABLE) && !(flags & CG_REG_RESERVED)) { f->opt_hard_regs[c][f->opt_hard_reg_count[c]++] = hr; } } @@ -134,7 +139,8 @@ void opt_machinize(Func* f, CGTarget* target) { u32 nhard = 0; if (target->get_allocable_regs) target->get_allocable_regs(target, (RegClass)c, &hard, &nhard); - for (u32 i = 0; i < nhard && i < OPT_MAX_HARD_REGS; ++i) + u32 hard_limit = nhard < OPT_MAX_HARD_REGS ? nhard : OPT_MAX_HARD_REGS; + for (u32 i = 0; i < hard_limit; ++i) f->opt_hard_regs[c][f->opt_hard_reg_count[c]++] = hard[i]; } @@ -142,7 +148,9 @@ void opt_machinize(Func* f, CGTarget* target) { u32 nscratch = 0; if (target->get_scratch_regs) target->get_scratch_regs(target, (RegClass)c, &scratch, &nscratch); - for (u32 i = 0; i < nscratch && i < OPT_MAX_SCRATCH_REGS; ++i) + u32 scratch_limit = + nscratch < OPT_MAX_SCRATCH_REGS ? nscratch : OPT_MAX_SCRATCH_REGS; + for (u32 i = 0; i < scratch_limit; ++i) f->opt_scratch_regs[c][f->opt_scratch_reg_count[c]++] = scratch[i]; if (!phys && target->is_caller_saved) { @@ -152,10 +160,15 @@ void opt_machinize(Func* f, CGTarget* target) { f->opt_caller_saved[c] |= (1u << hr); } } - if (target->callee_save_mask) - f->opt_callee_saved[c] |= target->callee_save_mask(target, (RegClass)c); + u32* callee_saved = &f->opt_callee_saved[c]; + if (target->callee_save_mask) { + u32 mask = target->callee_save_mask(target, (RegClass)c); + *callee_saved |= mask; + } } +} +static void machinize_check_overlap(Func* f) { for (u32 c = 0; c < OPT_REG_CLASSES; ++c) { for (u32 i = 0; i < f->opt_hard_reg_count[c]; ++i) { Reg hr = f->opt_hard_regs[c][i]; @@ -172,6 +185,13 @@ void opt_machinize(Func* f, CGTarget* target) { } } +void opt_machinize(Func* f, CGTarget* target) { + machinize_reset(f, target); + machinize_prepare_insts(f, target); + machinize_collect_regs(f, target); + machinize_check_overlap(f); +} + static int call_plan_replay_supported(const IRCallAux* aux, const CGTarget* target) { if (!aux || !aux->plan_valid || !target || !target->emit_call_plan) return 0; diff --git a/src/opt/pass_mir.c b/src/opt/pass_mir.c @@ -0,0 +1,103 @@ +#include "opt/opt_internal.h" + +static int mir_view(Func* f, Func* out) { + if (!f || !f->mir) return 0; + *out = *f; + out->blocks = f->mir->blocks; + out->nblocks = f->mir->nblocks; + out->entry = f->mir->entry; + out->emit_order = f->mir->emit_order; + out->emit_order_n = f->mir->emit_order_n; + out->emit_order_cap = f->mir->emit_order_cap; + out->opt_rewritten = 1; + out->mir = NULL; + return 1; +} + +static void mir_commit(Func* f, const Func* view) { + if (!f || !f->mir || !view) return; + f->mir->blocks = view->blocks; + f->mir->nblocks = view->nblocks; + f->mir->entry = view->entry; + f->mir->emit_order = view->emit_order; + f->mir->emit_order_n = view->emit_order_n; + f->mir->emit_order_cap = view->emit_order_cap; +} + +static void mir_verify_reg(Func* f, const char* stage, Reg r, u8 cls, + const char* what) { + if (r == (Reg)REG_NONE || cls >= OPT_REG_CLASSES || + r >= OPT_MAX_HARD_REGS) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(f->c, loc, "opt MIR verify %s: bad %s cls%u reg%u", + stage ? stage : "?", what, (unsigned)cls, (unsigned)r); + } +} + +static void mir_verify_operand(Func* f, Inst* in, Operand* op, int is_def, + void* arg) { + (void)in; + (void)is_def; + const char* stage = (const char*)arg; + if (!op) return; + if (op->kind == OPK_REG) { + mir_verify_reg(f, stage, op->v.reg, op->cls, "register operand"); + } else if (op->kind == OPK_INDIRECT) { + mir_verify_reg(f, stage, op->v.ind.base, RC_INT, "indirect base"); + if (op->v.ind.index != (Reg)REG_NONE) + mir_verify_reg(f, stage, op->v.ind.index, RC_INT, "indirect index"); + } else if (op->kind == OPK_LOCAL) { + if (op->v.frame_slot == FRAME_SLOT_NONE || + op->v.frame_slot > f->nframe_slots) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(f->c, loc, "opt MIR verify %s: bad frame slot %u", + stage ? stage : "?", (unsigned)op->v.frame_slot); + } + } +} + +void opt_mir_verify(Func* f, const char* stage) { + Func v; + if (!mir_view(f, &v)) return; + for (u32 b = 0; b < v.nblocks; ++b) { + Block* bl = &v.blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + if ((IROp)in->op == IR_PHI) { + SrcLoc loc = in->loc; + compiler_panic(f->c, loc, "opt MIR verify %s: phi survived lowering", + stage ? stage : "?"); + } + if ((IROp)in->op == IR_PARAM_DECL) continue; + opt_walk_inst_operands(&v, in, mir_verify_operand, (void*)stage); + } + } +} + +void opt_mir_combine(Func* f) { + Func v; + if (!mir_view(f, &v)) return; + opt_combine(&v); + mir_commit(f, &v); +} + +void opt_mir_dce(Func* f) { + Func v; + if (!mir_view(f, &v)) return; + opt_dce(&v); + mir_commit(f, &v); +} + +void opt_mir_build_cfg(Func* f) { + Func v; + if (!mir_view(f, &v)) return; + opt_build_cfg(&v); + mir_commit(f, &v); +} + +void opt_mir_jump_cleanup(Func* f, OptJumpCleanupStage stage) { + Func v; + if (!mir_view(f, &v)) return; + opt_jump_cleanup(&v, stage); + mir_commit(f, &v); +} diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c @@ -250,9 +250,6 @@ static Inst* emit_load_imm(Func* f, u32 b, Val dst, CfreeCgTypeId ty, i64 imm) { static Inst* emit_scalar_input(Func* f, u32 b, Val dst, CfreeCgTypeId ty) { Inst* in = ir_emit(f, b, IR_PARAM_DECL); - in->opnds = arena_array(f->arena, Operand, 1); - in->opnds[0] = op_reg_(dst, ty); - in->nopnds = 1; in->def = dst; in->type = ty; f->val_def_block[dst] = b; @@ -1245,9 +1242,6 @@ static void add_reg_param(Func* f, PReg r, CfreeCgTypeId ty) { ir_param_add(f, &d); Inst* in = ir_emit(f, f->entry, IR_PARAM_DECL); - in->opnds = arena_array(f->arena, Operand, 1); - in->opnds[0] = op_reg_(r, ty); - in->nopnds = 1; in->def = r; in->type = ty; } @@ -1824,6 +1818,36 @@ static void opt_block_liveness_phase1(void) { tc_fini(&tc); } +static void opt_liveness_grows_high_preg_bitsets(void) { + TestCtx tc; + tc_init(&tc); + Func* f = new_func(&tc); + u32 entry = f->entry; + u32 use_block = ir_block_new(f); + ir_note_emit(f, use_block); + + Val high = 193; + ir_ensure_val(f, high, tc.i32, RC_INT); + emit_load_imm(f, entry, high, tc.i32, 9); + emit_br_to(f, entry, use_block); + emit_ret_val(f, use_block, high, tc.i32); + + opt_build_cfg(f); + opt_build_loop_tree(f); + OptLiveInfo live; + opt_live_blocks(f, &live); + + EXPECT(opt_bitset_has(&live.blocks[entry].live_out, high), + "high preg should be live-out from defining block"); + EXPECT(opt_bitset_has(&live.blocks[use_block].live_in, high), + "high preg should be live-in to use block"); + EXPECT(live.blocks[entry].live_out.active_words >= high / 64u + 1u, + "live-out bitset should grow to high preg"); + EXPECT(live.blocks[use_block].live_out.active_words == 0, + "live-out bitset should trim after high preg dies"); + tc_fini(&tc); +} + static void opt_live_ranges_phase2(void) { TestCtx tc; tc_init(&tc); @@ -2594,7 +2618,7 @@ static void opt_copy_prop_collapses_redundant_extension_chain(void) { expect_ir_dump_eq(f, "ir blocks=1 vals=5\n" "block 0 preds=[] succs=[] insts=5\n" - " 0 param_decl def=v1 opnds=[v1]\n" + " 0 param_decl def=v1\n" " 1 convert def=v2 opnds=[v2,v1]\n" " 2 convert def=v3 opnds=[v3,v1]\n" " 3 convert def=v4 opnds=[v4,v1]\n" @@ -3197,7 +3221,7 @@ static void opt_gvn_reuses_store_to_local_load(void) { expect_ir_dump_eq(f, "ir blocks=1 vals=3\n" "block 0 preds=[] succs=[] insts=4\n" - " 0 param_decl def=v1 opnds=[v1]\n" + " 0 param_decl def=v1\n" " 1 store opnds=[local#1,v1] mem=size4 align4 " "flags=0x0 alias=local#1\n" " 2 load def=v2 opnds=[v2,local#1] mem=size4 align4 " @@ -3378,8 +3402,8 @@ static void opt_gvn_preserves_load_across_unknown_store(void) { expect_ir_dump_eq(f, "ir blocks=1 vals=5\n" "block 0 preds=[] succs=[] insts=6\n" - " 0 param_decl def=v1 opnds=[v1]\n" - " 1 param_decl def=v2 opnds=[v2]\n" + " 0 param_decl def=v1\n" + " 1 param_decl def=v2\n" " 2 load def=v3 opnds=[v3,local#1] mem=size4 align4 " "flags=0x0 alias=local#1\n" " 3 store opnds=[[v1+0],v2] mem=size4 align4 flags=0x0 " @@ -5293,6 +5317,19 @@ static void opt_combine_sinks_or_preserves_producer_copy_after_rewrite(void) { EXPECT(count_op(call_f, IR_COPY) == 1, "producer-copy retargeting should not cross a call"); + Func* call_live = new_func(&tc); + call_live->opt_rewritten = 1; + emit_phys_binop(call_live, call_live->entry, 19, 20, 21, tc.i64, BO_IADD); + emit_phys_copy(call_live, call_live->entry, 0, 19, tc.i64); + emit_call_void(call_live, call_live->entry); + emit_phys_binop(call_live, call_live->entry, 22, 19, 18, tc.i64, BO_IADD); + emit_ret_val(call_live, call_live->entry, 22, tc.i64); + + opt_combine(call_live); + add = &call_live->blocks[call_live->entry].insts[0]; + EXPECT(count_op(call_live, IR_COPY) == 1 && add->opnds[0].v.reg == 19, + "producer used after a call barrier should not sink into call arg"); + Func* clobber = new_func(&tc); clobber->opt_rewritten = 1; emit_phys_binop(clobber, clobber->entry, 21, 20, 19, tc.i32, BO_IADD); @@ -5536,6 +5573,54 @@ static void opt_combine_substitutes_into_indirect_base_and_index(void) { "single-use copy should substitute into indirect index"); EXPECT(count_op(g, IR_COPY) == 0, "copy whose only use is an indirect index should be dead-DCE'd"); + + Func* store_f = new_func(&tc); + store_f->opt_rewritten = 1; + emit_phys_copy(store_f, store_f->entry, 4, 2, tc.i32); + Inst* st = emit_store_indirect(store_f, store_f->entry, 4, 1, tc.i32, 0); + st->opnds[0].v.ind.ofs = 8; + ir_emit(store_f, store_f->entry, IR_RET); + + opt_combine(store_f); + st = NULL; + for (u32 i = 0; i < store_f->blocks[store_f->entry].ninsts; ++i) + if ((IROp)store_f->blocks[store_f->entry].insts[i].op == IR_STORE) + st = &store_f->blocks[store_f->entry].insts[i]; + EXPECT(st && st->opnds[0].kind == OPK_INDIRECT && + st->opnds[0].v.ind.base == 4, + "copy substitution should not rewrite store address bases"); + tc_fini(&tc); +} + +static void opt_combine_copy_chain_source_reuse_blocks_indirect_subst(void) { + TestCtx tc; + tc_init(&tc); + + Func* f = new_func(&tc); + f->opt_rewritten = 1; + emit_phys_copy(f, f->entry, 7, 8, tc.i64); + emit_phys_copy(f, f->entry, 14, 7, tc.i64); + Operand off_addr = op_indexed_indirect_(13, 12, 0, 0, tc.i32); + tt_emit_load_phys_indirect(f, f->entry, 8, off_addr, tc.i32); + emit_phys_binop(f, f->entry, 7, 21, 8, tc.i32, BO_IADD); + emit_phys_copy(f, f->entry, 8, 7, tc.i32); + emit_phys_copy(f, f->entry, 13, 8, tc.i32); + Operand field_addr = op_indirect_(14, tc.i64); + field_addr.v.ind.ofs = 8; + tt_emit_load_phys_indirect(f, f->entry, 12, field_addr, tc.i64); + emit_ret_val(f, f->entry, 12, tc.i64); + + opt_combine(f); + Inst* load = NULL; + for (u32 i = 0; i < f->blocks[f->entry].ninsts; ++i) + if ((IROp)f->blocks[f->entry].insts[i].op == IR_LOAD && + f->blocks[f->entry].insts[i].opnds[0].v.reg == 12) + load = &f->blocks[f->entry].insts[i]; + + EXPECT(load && load->opnds[1].kind == OPK_INDIRECT && + load->opnds[1].v.ind.base == 14, + "copy-chain indirect substitution must not cross reuse of source reg"); + tc_fini(&tc); } @@ -6959,6 +7044,7 @@ int main(void) { opt_pressure_relief_sinks_many_immediates_in_one_pass(); opt_liveness_branch(); opt_block_liveness_phase1(); + opt_liveness_grows_high_preg_bitsets(); opt_live_ranges_phase2(); opt_range_liveness_linear(); opt_interference_branch_disjoint(); @@ -6991,6 +7077,7 @@ int main(void) { opt_combine_keeps_unsafe_and_multiuse_defs(); opt_combine_copy_chains_and_convert_pairs(); opt_combine_substitutes_into_indirect_base_and_index(); + opt_combine_copy_chain_source_reuse_blocks_indirect_subst(); opt_combine_synthesizes_address_modes(); opt_combine_full_ext_of_ext_rules(); opt_dce_physical_dead_defs(); diff --git a/test/toy/run.sh b/test/toy/run.sh @@ -355,6 +355,9 @@ if [ ! -x "$CFREE" ]; then exit 2 fi +printf 'toy: CFREE=%s\n' "$CFREE" +printf 'toy: PATHS=%s OPT_LEVELS=%s\n' "$PATHS" "$TOY_OPT_LEVELS" + if [ $RUN_X -eq 1 ]; then have_podman=0 command -v podman >/dev/null 2>&1 && have_podman=1