commit 5bf1e27978b9c3fd2f5900243abaa81bcdbbcf8f
parent dbcdeb29cafba52aaf5256e95e6850b284982da9
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 1 Jun 2026 18:25:13 -0700
cg/ir: specify IR to remove all undefined behavior; lock interpreter to it
Spec (doc/IR.md): make the CG IR completely well-defined. Every op on every
input is now Portably-defined, Target-defined (deterministic per target), or a
Well-formedness precondition — never unconstrained. Adds a "Semantic modes"
section (portable default + per-op opt-in to target-defined edges, carried in
CgIrInst.flags / new CgIrInstFlag) and a "Well-definedness: edge-case semantics"
section pinning down every UB-prone edge: int wrap, div-by-zero and INT_MIN/-1,
shift-count masking, float->int saturation, FP-compare NaN ordering, conversions,
memory/alignment, control flow, atomics order legality, intrinsics. Fixes stale
docs (no CG_IR_SCOPE_ELSE / "if scope"; scopes are block/loop only). Adds
cross-reference comments to BinOp/CmpOp/ConvKind/MemOrder in cgtarget.h.
Interpreter: establish it as the reference implementation of the portable
semantics. Verified the engine's arith/compare/convert/unop/intrinsic handlers
already match the spec, and lock it in with a parameterized conformance suite
(test/interp/interp_smoke_test.c spec_*), which runs each edge with runtime args
so the optimizer cannot fold it away (33 checks, 0 failures). Harden do_binop's
shift-count mask to the u64 storage width so a (never-reached) 16-byte BINOP
cannot invoke host C shift UB; identical to (w*8-1) for every width the engine
actually carries (w <= 8 bytes). Documents the interpreter-as-reference
relationship in doc/INTERPRETER.md.
Diffstat:
6 files changed, 825 insertions(+), 36 deletions(-)
diff --git a/doc/INTERPRETER.md b/doc/INTERPRETER.md
@@ -162,11 +162,20 @@ path, with no behavioral difference from the threaded build.
- **Arithmetic / compare / convert** read operand values, apply the operation by
width and fp flag, and write the result. Width masking and sign-extension are
- explicit. Integer divide/rem guard divide-by-zero and the `INT_MIN / -1`
- overflow (wrap, not UB). Float-to-int conversion **saturates** (NaN -> 0,
- out-of-range -> clamped): this matches Wasm `trunc_sat` and, crucially, avoids
- the UB (and UBSan trap) of casting an out-of-range double to an integer, while
- staying identical to a plain cast for well-defined inputs.
+ explicit. The engine is the **reference implementation of the IR's portable
+ edge-case semantics** ([IR.md](IR.md) "Well-definedness"): integer add/sub/mul
+ wrap modulo the width; shift counts reduce modulo the width; integer divide/rem
+ trap on a zero divisor and wrap `INT_MIN / -1` (no UB); float→int conversion
+ **saturates** (NaN -> 0, out-of-range -> clamped, matching Wasm `trunc_sat`),
+ avoiding the UB of casting an out-of-range double while staying identical to a
+ plain cast for in-range inputs; the floating relationals are ordered (NaN ->
+ false) while `ne` is unordered (NaN -> true). These rules are locked to the
+ spec by the parameterized conformance cases in
+ `test/interp/interp_smoke_test.c` (`spec_*`), which run each edge with runtime
+ arguments so the optimizer cannot fold the operation away. The engine stores
+ every scalar in a `u64`, so it carries scalar widths up to 64 bits exactly;
+ 128-bit scalars are memory/aggregate-lowered (or expanded to 64-bit-half /
+ libcall sequences) before reaching a register handler.
- **Loads / stores / addressing** never raw-dereference. Every memory access goes
through `interp_translate` (below), which is what makes the two memory models
swap cleanly. A destination operand may itself be memory — the optimizer leaves
diff --git a/doc/IR.md b/doc/IR.md
@@ -38,6 +38,34 @@ classifications, and pointer widths are already resolved for the compile target
by the time the recorder sees a call. The IR does not know about machine
instructions, addressing-mode legality, or register files.
+### No undefined behavior
+
+The CG IR has no undefined behavior. Every operation, on every input, has a
+fully determined meaning that falls into exactly one of three categories:
+
+- **Portably defined** — the result is the same on every target. This is the
+ default for the arithmetic edges that C leaves undefined: integer overflow
+ wraps, shift counts wrap modulo the width, float→int conversions saturate,
+ `clz(0)`/`ctz(0)` is the bit width.
+- **Target-defined** — the result is deterministic given the compile target but
+ may differ across targets. Two things are target-defined: (1) inherently
+ machine-tied effects (a fault on an invalid memory access, the bit pattern an
+ inline-asm block produces), and (2) the arithmetic edges above when the
+ frontend opts into native-instruction semantics for performance (see
+ [Semantic modes](#semantic-modes-portable-vs-target-defined)).
+- **Well-formedness preconditions** — structural requirements on the recorded
+ tape (operand kinds and widths agree, each label is placed once, every path
+ ends in a terminator, …). A tape that violates one is *malformed IR*, a
+ compiler bug in the producer — not a program exhibiting undefined behavior.
+ Consumers may assume well-formed input.
+
+There is deliberately no fourth "anything may happen" category. Where C would
+say *undefined behavior*, the CG IR says *portably defined*, *target-defined*,
+or *malformed* — never *unconstrained*. The runtime half of this guarantee (what
+each op computes on every input) is spelled out in
+[Well-definedness: edge-case semantics](#well-definedness-edge-case-semantics);
+the structural half is the [Well-formedness](#well-formedness-invariants) list.
+
## Pipeline position
```text
@@ -95,11 +123,15 @@ target set of a computed goto. A `CgIrLabel` records its id and the source
location of its first placement. Placement appears in the tape as a
`CG_IR_LABEL` instruction.
-Structured scopes (`CgIrScope`) capture CG's structured control model — block,
-loop, and if scopes with their associated break/continue/else/end semantics —
-so that backends able to express structure (the C-source target, a future Wasm
-target; see [WASM.md](WASM.md)) can replay it directly. Native CFG consumers
-flatten scopes to ordinary labels and branches.
+Structured scopes (`CgIrScope`) capture CG's structured control model. There are
+two scope kinds (`ScopeKind` in src/cg/cgtarget.h): `SCOPE_BLOCK`, a forward-only
+region whose break skips to the end, and `SCOPE_LOOP`, whose break exits forward
+and whose continue jumps to an explicit loop-header target. `if`/`if-else` is not
+a distinct scope kind: the frontend lowers it to a pair of nested forward blocks
+(`cfree_cg_if_begin`/`_else`/`_end`), so there is no else op in the IR. Backends
+able to express structure (the C-source target, a future Wasm target; see
+[WASM.md](WASM.md)) replay scopes directly; native CFG consumers flatten them to
+ordinary labels and branches.
Basic blocks are *not* part of the IR. A consumer that needs CFG form derives it
by splitting the linear tape at labels, scope boundaries, and terminators. That
@@ -255,18 +287,17 @@ dereferenceable data.
### Structured scopes
-These ops preserve CG's C-like structured control model — `block`, `loop`, and
-`if` scopes — so backends that express structure directly (the C-source target,
-a future Wasm target) can replay it without rebuilding a CFG. CFG-based
-consumers ignore the structure and reconstruct control flow from the underlying
-labels and branches instead.
+These ops preserve CG's C-like structured control model — `block` and `loop`
+scopes — so backends that express structure directly (the C-source target, a
+future Wasm target) can replay it without rebuilding a CFG. CFG-based consumers
+ignore the structure and reconstruct control flow from the underlying labels and
+branches instead. `if`/`if-else` has no dedicated op or scope kind; the frontend
+builds it from nested forward `block` scopes plus `CG_IR_BREAK_TO`.
- `CG_IR_SCOPE_BEGIN`: open a scope. The scope id and full `CGScopeDesc` (its
- `kind` — block/loop/if — and associated descriptor fields) ride in a
- `CgIrScopeAux` on `extra.aux`. Recording also adds a `CgIrScope` to the
- function's scope side table.
-- `CG_IR_SCOPE_ELSE`: begin the else arm of an `if` scope; scope id in
- `extra.imm`.
+ `kind` — `SCOPE_BLOCK` or `SCOPE_LOOP` — and associated descriptor fields)
+ ride in a `CgIrScopeAux` on `extra.aux`. Recording also adds a `CgIrScope` to
+ the function's scope side table.
- `CG_IR_SCOPE_END`: close the most recently opened matching scope; scope id in
`extra.imm`.
- `CG_IR_BREAK_TO`: exit the named enclosing scope (loop/block/switch break);
@@ -315,26 +346,305 @@ are observable and must preserve the ordering the memory model requires.
aux. Constraint strings are target-specific; optimization may inspect operands
and clobbers but must treat the block conservatively.
-## Invariants
+## Semantic modes: portable vs target-defined
+
+A handful of integer and conversion operations have edge cases whose cheapest
+lowering differs across targets: integer division by zero and `INT_MIN / -1`,
+shift counts at or beyond the operand width, and out-of-range or NaN float→int
+conversions. For these the IR offers two semantics, chosen **per instruction** by
+the frontend:
+
+- **Portable (default).** The edge is defined identically on every target
+ (details under [edge-case semantics](#well-definedness-edge-case-semantics)). A
+ frontend that wants reproducible results across architectures — or whose source
+ language has no C-style undefined behavior — gets them for free by recording
+ the op with no semantic flags.
+- **Target-defined (opt-in).** The edge follows the target's native instruction.
+ A frontend whose source language already declares the edge undefined (C
+ division by zero, oversized shifts, out-of-range `(int)` casts) can opt in to
+ skip the guards portable mode would require, trading portability for the
+ fastest lowering.
+
+The choice rides in `CgIrInst.flags` (`CgIrInstFlag` in src/cg/ir.h):
+
+| Flag | Affects | Cleared (portable default) | Set (target-defined) |
+|------|---------|----------------------------|----------------------|
+| `CG_IR_INST_TARGET_DIV_EDGES` | `BINOP` sdiv/udiv/srem/urem | div-by-zero traps; `INT_MIN/-1` wraps | target divide instruction |
+| `CG_IR_INST_TARGET_SHIFT_EDGES` | `BINOP` shl/shr_s/shr_u | count reduced modulo width | target shift instruction |
+| `CG_IR_INST_TARGET_FPTOINT_EDGES` | `CONVERT` ftoi_s/ftoi_u | saturate; NaN→0 | target convert instruction |
+
+Both modes are fully defined: *target-defined is still deterministic per target*,
+never unconstrained. This flag set is the only place the IR's value semantics
+depend on a producer choice rather than on the op alone; everything else is fixed
+by the op. Memory-safety faults are *always* target-defined and are not governed
+by these flags — there is no portable bounds-checking mode (see
+[Memory](#memory-load-store-aggregate-bitfield)).
+
+Portable is the safe default for a consumer that has not yet been taught a flag:
+implementing portable semantics where the op asked for target-defined is always
+legal, because the opt-in is only ever taken when the source language permits any
+behavior at that edge. Wiring the public CG API and recorder to set these bits,
+and teaching each consumer (optimizer, interpreter, native and C-source backends)
+to honor them, is implementation work tracked separately from this spec; the bits
+are defined here so the IR can carry the choice.
+
+## Well-definedness: edge-case semantics
+
+This section pins down every operation's behavior on the inputs that a structural
+reading of the op set leaves open. It mirrors the operation families above.
+Unless a rule is marked *target-defined*, it is portably defined.
+
+### Integer arithmetic and bitwise
+
+- **Widths.** For `BINOP`/`CMP` the source operands — and, for `binop`, the
+ destination — share one integer width *W* ∈ {8,16,32,64,128}. `CMP` yields the
+ boolean/i1 type. (Width agreement is a well-formedness precondition.)
+- **Wrapping.** `iadd`, `isub`, `imul`, and `neg` compute modulo 2^*W*
+ (two's complement). Signed and unsigned overflow both wrap; neither is
+ undefined. Overflow *detection* is not part of these ops — use the
+ `*_OVERFLOW` intrinsics for a checked result. The public API's `NSW`/`NUW`/
+ `EXACT` assertions and trap/saturate overflow flags are not represented on the
+ base IR op; a frontend that needs them realizes them as explicit checks before
+ recording.
+- **Division and remainder.** `sdiv`/`srem` are truncated (round-toward-zero)
+ division; the remainder takes the sign of the dividend. `udiv`/`urem` are
+ unsigned.
+ - *Portable:* a zero divisor **traps** (a deterministic abort, as
+ `INTRIN_TRAP`). `INT_MIN_W / -1` is defined as `INT_MIN_W` and
+ `INT_MIN_W % -1` as `0`; neither traps.
+ - *Target-defined* (`CG_IR_INST_TARGET_DIV_EDGES`): both edges follow the
+ target divide instruction — e.g. x86-64 raises `#DE` for a zero divisor and
+ for `INT_MIN/-1`; AArch64 `sdiv` yields `0` for a zero divisor and `INT_MIN`
+ for `INT_MIN/-1`.
+- **Shifts.** The shifted value and the result have width *W*; `shr_s` replicates
+ the sign bit, `shl`/`shr_u` shift in zeros. The count is an integer operand
+ interpreted as an unsigned amount.
+ - *Portable:* the count is reduced **modulo *W*** (only its low log2(*W*) bits
+ matter), so every count is defined and a high-bit-set ("negative") count
+ simply reduces mod *W*.
+ - *Target-defined* (`CG_IR_INST_TARGET_SHIFT_EDGES`): an out-of-range count
+ follows the target shift instruction's own masking or zeroing.
+- **`and`/`or`/`xor`** are total bitwise ops with no edge cases.
+
+### Floating point
+
+The IR's floating-point operations are strict IEEE-754 in the target's default
+environment: round-to-nearest-ties-to-even, non-trapping exceptions (status-flag
+only), no denormal flushing. These are portable; the IR does not represent
+alternate rounding modes or fast-math relaxations (the public API's rounding
+argument and FP fast-math flags are dropped at the IR level unless the frontend
+realizes them as explicit operations).
+
+- `fadd`/`fsub`/`fmul`/`fdiv` produce the correctly-rounded IEEE result. A NaN
+ operand yields a quiet NaN. `x/0 → ±∞` for nonzero `x` (sign per operands),
+ `0/0 → NaN`, `∞/∞ → NaN`.
+- There is no FP remainder primitive; the frontend lowers a floating `%` to a
+ runtime call (`fmod`).
+- `fneg` flips the sign bit — it is *not* `0 - x`: it negates zeros and
+ infinities and toggles a NaN's sign without otherwise altering its payload.
+- **Compares.** The relational FP compares `lt_f`, `le_f`, `gt_f`, `ge_f` are
+ **ordered**: if either operand is NaN the result is `false`. On floating
+ operands `eq` is ordered-equal (NaN → `false`) and `ne` is unordered-not-equal
+ (NaN → `true`), matching C `==`/`!=`. A frontend needing an *unordered*
+ relational composes it as the negation of the opposite ordered compare
+ (`a ULT b ≡ !(a OGE b)`), since negating an ordered compare turns the NaN
+ result to `true`.
+ - *Spec note / known gap:* the current public→IR lowering (`api_map_fp_cmp` in
+ src/cg/value.c) maps both the ordered and the unordered relational forms to
+ the same internal op, so the ordered/unordered distinction for `<,<=,>,>=` is
+ presently lost at the IR boundary — correct only under a no-NaN assumption.
+ Resolving it (unordered relational variants, or the explicit NaN composition
+ above emitted by the frontend) is an implementation follow-up; the rule above
+ is the intended contract.
+
+### Conversions
+
+- `sext`/`zext` require dst width > src width and sign-/zero-extend; `trunc`
+ requires dst width < src width and keeps the low dst bits. (Width ordering is a
+ precondition.)
+- `itof_s`/`itof_u` convert integer→float with round-to-nearest-even; magnitudes
+ beyond the float's range round to ±∞ per IEEE.
+- `ftoi_s`/`ftoi_u` convert float→int rounding **toward zero** (truncation);
+ in-range values drop their fraction.
+ - *Portable:* out-of-range and infinite inputs **saturate** — above the
+ destination max → max, below the min → min (`0` for the unsigned floor) — and
+ **NaN → 0**.
+ - *Target-defined* (`CG_IR_INST_TARGET_FPTOINT_EDGES`): the result follows the
+ target convert instruction (e.g. x86-64 `cvttsd2si` yields the "integer
+ indefinite" `INT_MIN` on overflow/NaN; AArch64 `fcvtzs` saturates).
+- `fext` widens exactly (no rounding); `ftrunc` narrows with round-to-nearest-
+ even, overflow → ±∞.
+- `bitcast` requires equal byte size and reinterprets the operand's target ABI
+ bit pattern without changing bits. Pointer↔integer of equal width is a bitcast.
+
+### Memory: load, store, aggregate, bitfield
+
+- **Address validity.** A `load`/`store`/aggregate/bitfield/atomic op requires its
+ effective address to reference a live object of at least `size` bytes in the
+ access's address space. This is *not* portably checked: an invalid or
+ out-of-bounds access (including a null dereference) produces a **target-defined
+ fault** — the deterministic behavior of the target's load/store against that
+ address (a trap on an MMU target; a read or write of whatever occupies the
+ address on a flat-memory target). It is target-defined, never unconstrained,
+ and never governed by a semantic-mode flag.
+- **Alignment.** `MemAccess.align` is a *promise*: the producer asserts the
+ address is at least that aligned (natural alignment for the type when
+ `align == 0`), and a target may use the promise to choose wider instructions.
+ Recording an access whose address is in fact less aligned than stated, without
+ `MF_UNALIGNED`, is a precondition violation; on a strict-alignment target it
+ faults (target-defined). `MF_UNALIGNED` declares the access may be unaligned and
+ obliges the consumer to emit an unaligned-capable sequence; it is then fully
+ defined.
+- **Uninitialized reads.** Reading a local or memory location not yet assigned on
+ the current dynamic path yields an **unspecified value** of the access type — an
+ arbitrary but type-valid bit pattern. It never traps and never corrupts other
+ state; it is *not* poison and *not* undefined behavior. Producers should define
+ every location before reading it for determinism, but doing otherwise stays
+ within defined IR.
+- **Volatile.** `MF_VOLATILE` accesses are observable side effects: they must not
+ be added, removed, duplicated, or reordered with respect to other volatile or
+ atomic accesses.
+- **Aggregates.** `agg_copy` copies `size` bytes and requires source and
+ destination ranges **not to overlap** (memcpy semantics); overlap is a
+ precondition violation — use the `MEMMOVE` intrinsic for overlap. `agg_set`
+ fills `size` bytes with the byte value. `size == 0` is a defined no-op.
+- **Bitfields.** `bitfield_load`/`bitfield_store` access bits
+ `[bit_offset, bit_offset+bit_width)` within the storage unit at
+ `storage_offset`; the range must lie within the unit (precondition). A load
+ sign- or zero-extends per `signed_`. A store uses the low `bit_width` bits of
+ the source and leaves bits outside the field unchanged. A **zero-width** field
+ (`bit_width == 0`) is a layout barrier only and performs no memory access.
+
+### Control flow
+
+- **Labels.** Every label named by a branch, switch, computed-goto target set, or
+ label-address op belongs to the same function and is **placed exactly once**
+ (one `CG_IR_LABEL`); placement may follow use in tape order. (Preconditions.)
+- **Terminators and reachability.** Every dynamic path ends in a terminator
+ (`ret`, a `CG_CALL_TAIL` call, `INTRIN_UNREACHABLE`/`TRAP`/`LONGJMP`,
+ `indirect_branch`, or a `br` that ultimately reaches one). Falling off the end
+ of the instruction stream without a terminator is malformed. Instructions after
+ a terminator are reachable only through a label.
+- **Switch.** The selector is compared against each case `value` using
+ `selector_type`'s width and signedness; a match transfers to that case's label,
+ otherwise to `default_label` (`LABEL_NONE` means fall through past the switch).
+ Case values are **distinct** (a precondition); the IR defines no tie-break.
+- **Computed goto.** `indirect_branch` transfers to the label address in its
+ operand, which must be one of the `ntargets` labels in its closed set
+ (`ntargets > 0`, a precondition). The set is exhaustive: a runtime address
+ outside it is target-defined (branch-protection hardening may fault). Label
+ addresses (`load_label_addr`, `local_static_data_label_addr`) are opaque tokens
+ valid only within the defining function's activation; they may be stored,
+ loaded, compared for equality, and consumed by `indirect_branch`, but never
+ called or dereferenced as data.
+
+### Calls and returns
+
+- A call's argument and result locals match `fn_type` in count and type; for a
+ variadic callee the fixed parameters match and variadic arguments are already
+ promoted by the frontend (preconditions). Calling through an invalid function
+ pointer is a target-defined fault. A direct call uses an `OPK_GLOBAL` callee;
+ any other callee operand is indirect.
+- `ret` returns exactly the function's declared result locals, in order and type
+ (precondition). A tail call carries `CG_CALL_TAIL`, obeys the realizability
+ contract above, is a terminator, and is never followed by a `ret`.
+
+### Stack allocation and variadics
+
+- `alloca` allocates `size` bytes (an unsigned byte count) aligned to `align` (a
+ power of two; precondition), valid for the rest of the function activation.
+ Exhausting the stack is a target-defined trap.
+- `va_start`/`va_arg`/`va_end`/`va_copy` operate on a target-ABI vararg-state
+ object addressed by pointer. `va_arg`'s type must match the promoted type of
+ the corresponding actual argument, and the number of `va_arg` reads must not
+ exceed the variadic arguments actually passed (preconditions); violating either
+ is target-defined (it reads adjacent argument storage). `va_start` precedes
+ `va_arg`/`va_end` on the same state; `va_copy` duplicates state.
+
+### Atomics
+
+- **Order legality** (preconditions, per the C11 memory model; mirrored by
+ `cfree_cg_atomic_is_legal`):
+ - `atomic_load`: `relaxed`, `consume`, `acquire`, or `seq_cst`.
+ - `atomic_store`: `relaxed`, `release`, or `seq_cst`.
+ - `atomic_rmw`: any order.
+ - `atomic_cas`: any `success` order; `failure` ∈ {`relaxed`, `consume`,
+ `acquire`, `seq_cst`} and no stronger than `success`.
+ - `fence`: any order (a `relaxed` fence has no effect).
+- The access must be a supported atomic width and naturally aligned for a
+ lock-free operation; otherwise the consumer may lower to a runtime atomic call
+ (target-defined mechanism, same observable semantics). Atomic ops are
+ observable and must preserve the ordering the memory model requires. `rmw`
+ defines the prior value; `cas` defines the prior value and a success bool and
+ compares using the full access width.
+
+### Intrinsics and inline asm
-- Sentinels are zero-valued: `CG_LOCAL_NONE`, `LABEL_NONE`, `CG_SCOPE_NONE`,
- `OBJ_SYM_NONE`. Local, label, and scope ids are 1-based.
-- A local has exactly one declared type for the whole function.
-- Every destination and source local is declared before use.
-- A control-transfer op's label operands name labels in the same function; the
- exception is a call, whose callee is a symbol or a function-pointer value.
-- A terminating op ends the current linear control path; any following reachable
- instruction must be reached through a label.
+Operand shapes are fixed per `IntrinKind` (src/cg/cgtarget.h). Semantic edges:
+
+- `CLZ(0)` and `CTZ(0)` are defined to equal the operand's bit width (the C
+ builtins are undefined at zero). `POPCOUNT` and `BSWAP16/32/64` are total.
+- `SADD/UADD/SSUB/USUB/SMUL/UMUL_OVERFLOW` define a two's-complement wrapped
+ result and a boolean overflow flag.
+- `MEMCPY` requires non-overlapping ranges; `MEMMOVE` permits overlap; `MEMSET`
+ fills. All are defined no-ops at `size == 0`.
+- `SETJMP` returns `0` on the direct call and the value passed to the matching
+ `LONGJMP` when it returns again (a `LONGJMP` value of `0` surfaces as `1`); it
+ "returns twice." `LONGJMP` does not return. Consumers must preserve both
+ control effects.
+- `ASSUME_ALIGNED` returns its pointer and asserts the stated alignment (a
+ precondition; a wrong assertion is target-defined). `EXPECT` returns its value
+ unchanged (a branch-probability hint). `PREFETCH` has no value effect.
+- `TRAP` is a deterministic abort. `UNREACHABLE` asserts the point is never
+ reached and is itself a terminator; if control does reach it the behavior is a
+ target-defined trap, and consumers may assume it unreachable (e.g. to prune
+ successors). Neither corrupts unrelated state.
+- `asm_block` and file-scope asm are opaque target assembly. The IR fixes the
+ *interface* — operand directions, clobbers, volatility — but the assembly's own
+ behavior is target/external, modeled conservatively (treated as reading and
+ writing its declared operands and clobbers and as an observable side effect
+ unless flagged otherwise). This is external behavior, not undefined behavior.
+
+## Well-formedness (invariants)
+
+A *well-formed* tape satisfies all of the following; consumers may assume them,
+and a violation is a producer bug (malformed IR), not program behavior. These are
+the structural half of "no undefined behavior" — the runtime half is the
+edge-case section above.
+
+- Sentinels are zero-valued (`CG_LOCAL_NONE`, `LABEL_NONE`, `CG_SCOPE_NONE`,
+ `OBJ_SYM_NONE`); local, label, and scope ids are 1-based.
+- Every local has exactly one declared type for the whole function, and every
+ source and destination local is declared before use.
+- Destinations are `OPK_LOCAL`. Operand kinds match each op's contract
+ (src/cg/cgtarget.h): FP arithmetic and `fneg` require `OPK_LOCAL` sources;
+ `binop`/`unop`/`cmp` also accept `OPK_IMM`; addresses are
+ `OPK_LOCAL`/`OPK_GLOBAL`/`OPK_INDIRECT`; an `OPK_INDIRECT` index is an integer
+ local with log2 scale 0..3.
+- Integer `binop`/`cmp` operands (and the binop destination) share one width;
+ conversions obey their width-ordering rules.
+- A control-transfer op names labels in the same function; only a call targets a
+ symbol or function-pointer value. Each label is placed exactly once; every path
+ ends in a terminator. Switch case values are distinct; a computed goto's target
+ set is non-empty and closed.
+- Calls and returns match the function/callee type in arity and operand type;
+ atomic orders are legal for their op (above).
+- Data-layout facts (sizes, alignments, field offsets, bit ranges, ABI shape) are
+ already target-selected; consumers must not reinterpret them for a different
+ target.
- Source locations are sticky at recording time and stamped on each instruction.
-- Data-layout facts are already target-selected; consumers must not reinterpret
- record or bitfield layout for a different target.
## Consumer guidance
Anything that reads the IR is reading a layout-resolved, ABI-shaped, but
machine-neutral program. The contract a consumer must respect: preserve
target-data-layout semantics, memory observability (the `MemFlag` set and alias
-roots on each access), the ABI shape of calls and returns, and CFG validity.
+roots on each access), the ABI shape of calls and returns, and CFG validity. It
+must also implement at least the **portable** edge-case semantics of every op,
+and honor the `CgIrInst.flags` semantic-mode bits where it understands them —
+falling back to portable semantics (a safe refinement) for any bit it does not.
+A consumer may assume a well-formed tape; it must not introduce undefined
+behavior of its own where the IR defines a result.
Two consumers exist today, and they take different paths:
diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h
@@ -18,6 +18,12 @@ typedef u32 CGLocal;
* existing load/store methods with vector-typed Operands and appropriate
* MemAccess. */
+/* Integer/float binary ops. Edge-case semantics are fully defined (no undefined
+ * behavior) in doc/IR.md: iadd/isub/imul (and UO_NEG) wrap modulo 2^width;
+ * sdiv/udiv/srem/urem and the shifts have a portable default plus an opt-in
+ * target-defined mode selected per instruction via CgIrInstFlag (src/cg/ir.h).
+ * FP ops are strict IEEE-754 in the target's default rounding/exception
+ * environment; there is no FP remainder op (the frontend calls fmod). */
typedef enum BinOp {
BO_IADD,
BO_ISUB,
@@ -45,6 +51,12 @@ typedef enum UnOp {
UO_BNOT, /* bitwise ~ */
} UnOp;
+/* Compares producing i1. Integer signed/unsigned variants are total. The
+ * floating relationals (CMP_LT_F/LE_F/GT_F/GE_F) are ordered (NaN -> false); on
+ * floats CMP_EQ is ordered-equal (NaN -> false) and CMP_NE is unordered-not-
+ * equal (NaN -> true), matching C ==/!=. The internal set does not encode the
+ * ordered/unordered distinction for the relationals; see the FP-compare notes
+ * and the known lowering gap in doc/IR.md. */
typedef enum CmpOp {
CMP_EQ,
CMP_NE,
@@ -62,6 +74,11 @@ typedef enum CmpOp {
CMP_GE_F,
} CmpOp;
+/* Conversions. Widths must order correctly (sext/zext widen, trunc narrows,
+ * bitcast preserves byte size). itof, fext, and ftrunc round to nearest-even;
+ * ftoi_s/ftoi_u round toward zero with a portable saturating out-of-range
+ * default (NaN -> 0) and an opt-in target-defined mode
+ * (CG_IR_INST_TARGET_FPTOINT_EDGES in src/cg/ir.h). Full rules in doc/IR.md. */
typedef enum ConvKind {
CV_SEXT,
CV_ZEXT,
@@ -85,6 +102,10 @@ typedef enum AtomicOp {
AO_NAND,
} AtomicOp;
+/* Memory orders. Which orders are legal depends on the atomic op: load excludes
+ * release/acq_rel; store excludes acquire/consume/acq_rel; CAS failure order is
+ * one of relaxed/consume/acquire/seq_cst and no stronger than success. See the
+ * Atomics edge-case rules in doc/IR.md (mirrored by cfree_cg_atomic_is_legal). */
typedef enum MemOrder {
MO_RELAXED,
MO_CONSUME,
diff --git a/src/cg/ir.h b/src/cg/ir.h
@@ -179,10 +179,32 @@ typedef struct CgIrIntrinsicAux {
u32 narg;
} CgIrIntrinsicAux;
+/* Per-instruction semantic-mode flags carried in CgIrInst.flags. They select,
+ * per op, between the IR's portable edge-case semantics (default, bit clear) and
+ * the target's native-instruction semantics (bit set). Both modes are fully
+ * defined — the IR has no undefined behavior in either; the target-defined
+ * choice trades cross-target portability for the cheapest lowering when the
+ * source language already declares the edge undefined. See the "Semantic modes"
+ * and "Well-definedness" sections of doc/IR.md. Each consumer honors these bits
+ * on its own; one that does not understand a bit must implement the portable
+ * semantics, which is always a safe refinement. */
+typedef enum CgIrInstFlag {
+ CG_IR_INST_FLAG_NONE = 0, /* no bits set: every edge uses portable semantics */
+ /* BINOP sdiv/udiv/srem/urem: a zero divisor and (sdiv/srem) INT_MIN/-1 follow
+ * the target divide, not the portable trap (zero) / wrap (INT_MIN/-1). */
+ CG_IR_INST_TARGET_DIV_EDGES = 1u << 0,
+ /* BINOP shl/shr_s/shr_u: an out-of-range shift count follows the target shift
+ * instruction instead of the portable reduce-modulo-width. */
+ CG_IR_INST_TARGET_SHIFT_EDGES = 1u << 1,
+ /* CONVERT ftoi_s/ftoi_u: out-of-range / NaN / inf inputs follow the target
+ * convert instruction instead of the portable saturate (NaN -> 0). */
+ CG_IR_INST_TARGET_FPTOINT_EDGES = 1u << 2,
+} CgIrInstFlag;
+
typedef struct CgIrInst {
u32 id;
u16 op;
- u16 flags;
+ u16 flags; /* CgIrInstFlag: per-op portable-vs-target-defined edge semantics */
SrcLoc loc;
u32 nopnds;
Operand* opnds;
diff --git a/src/interp/engine.c b/src/interp/engine.c
@@ -277,6 +277,14 @@ static void fault(InterpStack* st, const char* what) {
/* ---- integer/fp arithmetic ---- */
+/* Shift-count mask for the spec's portable "reduce modulo width" rule
+ * (doc/IR.md). The engine stores every scalar in a u64, so the meaningful
+ * range is the storage width (<=64 bits); 16-byte scalars are lowered to
+ * memory / 64-bit-half sequences before reaching here, never as a w==16 BINOP.
+ * Returns (w*8-1) for w in 1..8 and 63 for any other w (0 or >8), so the masked
+ * count is always < 64 and the host C shift on a u64 can never be UB. */
+static u32 shift_mask(u32 w) { return (w == 0u || w >= 8u) ? 63u : w * 8u - 1u; }
+
static u64 do_binop(InterpStack* st, u32 binop, u64 a, u64 b, u32 w, u8 fp) {
if (fp) {
double x = rd_f(a, w), y = rd_f(b, w), r = 0;
@@ -319,12 +327,12 @@ static u64 do_binop(InterpStack* st, u32 binop, u64 a, u64 b, u32 w, u8 fp) {
case BO_AND: return mask_w(a & b, w);
case BO_OR: return mask_w(a | b, w);
case BO_XOR: return mask_w(a ^ b, w);
- case BO_SHL: return mask_w(a << (b & (w * 8u - 1u)), w);
+ case BO_SHL: return mask_w(a << (b & shift_mask(w)), w);
case BO_SHR_S: {
i64 x = sext_w(a, w);
- return mask_w((u64)(x >> (b & (w * 8u - 1u))), w);
+ return mask_w((u64)(x >> (b & shift_mask(w))), w);
}
- case BO_SHR_U: return mask_w(mask_w(a, w) >> (b & (w * 8u - 1u)), w);
+ case BO_SHR_U: return mask_w(mask_w(a, w) >> (b & shift_mask(w)), w);
default: unsupported(st, "int binop"); return 0;
}
}
diff --git a/test/interp/interp_smoke_test.c b/test/interp/interp_smoke_test.c
@@ -34,6 +34,7 @@ typedef struct TestCtx {
Compiler* c;
CfreeCgTypeId i32;
CfreeCgTypeId i64;
+ CfreeCgTypeId f64;
} TestCtx;
static void tc_init(TestCtx* tc) {
@@ -50,6 +51,7 @@ static void tc_init(TestCtx* tc) {
b = cfree_cg_builtin_types(tc->c);
tc->i32 = b.id[CFREE_CG_BUILTIN_I32];
tc->i64 = b.id[CFREE_CG_BUILTIN_I64];
+ tc->f64 = b.id[CFREE_CG_BUILTIN_F64];
}
static void tc_fini(TestCtx* tc) {
@@ -194,11 +196,428 @@ static void interp_runs_branch(void) {
tc_fini(&tc);
}
+/* ============================================================================
+ * Spec conformance: the interpreter is the reference implementation of the IR.
+ *
+ * Each case builds a PARAMETERIZED CgIrFunc and runs it through
+ * opt_run_o1_interp + the engine with RUNTIME argument values, so the optimizer
+ * cannot constant-fold the operation away — the engine's own handler computes
+ * the result. We then assert the exact value the spec mandates for that edge
+ * (doc/IR.md "Well-definedness: edge-case semantics", portable mode). These lock
+ * the engine to the spec; a divergence turns a case red.
+ * ========================================================================== */
+
+static u32 ty_size(TestCtx* tc, CfreeCgTypeId t) { /* cfree_cg_type_size of t, narrowed to u32 */
+ return (u32)cfree_cg_type_size((CfreeCompiler*)tc->c, t);
+}
+static u32 ty_align(TestCtx* tc, CfreeCgTypeId t) { /* cfree_cg_type_align of t, narrowed to u32 */
+ return (u32)cfree_cg_type_align((CfreeCompiler*)tc->c, t);
+}
+
+/* Build a CgIrFunc with one result of type `ret` and `np` scalar params typed by
+ * ptypes[]; out_params[0..np) receives the param locals (readable directly as
+ * source operands). The interpreter assigns each param's storage home from the
+ * optimizer's local map (not fn_type's ABI), so fn_type mirrors new_func's leaf type. */
+static CgIrFunc* new_func_p(TestCtx* tc, CfreeCgTypeId ret,
+ const CfreeCgTypeId* ptypes, u32 np,
+ CGLocal* out_params) {
+ CGFuncDesc fd;
+ CfreeCgTypeId* rt;
+ CGParamDesc* pds;
+ CgIrFunc* f;
+ u32 i;
+ memset(&fd, 0, sizeof fd);
+ rt = arena_array(tc->c->tu, CfreeCgTypeId, 1);
+ rt[0] = ret;
+ pds = np ? arena_array(tc->c->tu, CGParamDesc, np) : NULL; /* nothing allocated when np == 0 */
+ for (i = 0; i < np; ++i) { /* describe each param: index, type, size, align */
+ memset(&pds[i], 0, sizeof pds[i]);
+ pds[i].index = i;
+ pds[i].type = ptypes[i];
+ pds[i].size = ty_size(tc, ptypes[i]);
+ pds[i].align = ty_align(tc, ptypes[i]);
+ }
+ fd.fn_type = ret; /* the return type id stands in for a real function type */
+ fd.result_types = rt;
+ fd.nresults = 1;
+ fd.params = pds;
+ fd.nparams = np;
+ f = cg_ir_func_new(tc->c, &fd);
+ for (i = 0; i < np; ++i) { /* give each param a local and register it with f */
+ CGLocalDesc ld;
+ CGLocal loc;
+ memset(&ld, 0, sizeof ld);
+ ld.type = ptypes[i];
+ ld.size = ty_size(tc, ptypes[i]);
+ ld.align = ty_align(tc, ptypes[i]);
+ loc = cg_ir_func_add_local(f, &ld, 1, i); /* NOTE(review): 1, i look like is-param + param index — confirm */
+ cg_ir_func_add_param(f, loc, &pds[i]);
+ out_params[i] = loc;
+ }
+ return f;
+}
+
+static CGLocal add_local_ty(CgIrFunc* f, TestCtx* tc, CfreeCgTypeId t) { /* add a local of type t to f */
+ CGLocalDesc d;
+ memset(&d, 0, sizeof d); /* fields not assigned below stay zero */
+ d.type = t;
+ d.size = ty_size(tc, t);
+ d.align = ty_align(tc, t);
+ return cg_ir_func_add_local(f, &d, 0, 0); /* 0, 0 here; new_func_p passes 1, i for params */
+}
+
+static CfreeInterpStatus run_args(TestCtx* tc, CgIrFunc* cg, const u64* args,
+ u32 nargs, int64_t* out) { /* optimize + lower cg, then call it with args[0..nargs) */
+ CfreeInterpProgram* prog = cfree_interp_program_new(tc->c);
+ Func* f = opt_run_o1_interp(tc->c, cg); /* result is used without a NULL check */
+ InterpFunc* fn =
+ interp_lower((InterpProgram*)prog, f, OBJ_SYM_NONE, SLICE_NULL, NULL);
+ CfreeInterpStatus s =
+ cfree_interp_call_args(prog, (CfreeInterpFunc*)fn, args, nargs, out);
+ cfree_interp_program_free(prog); /* a fresh program is created and freed per call */
+ return s; /* call status; the result value is presumably stored through `out` */
+}
+
+static void emit_binop(CgIrFunc* f, BinOp op, CGLocal d, CfreeCgTypeId ty,
+                       Operand a, Operand b) {
+  /* Emits CG_IR_BINOP with operands {dst, lhs, rhs}; the BinOp selector is
+   * stored in extra.imm. */
+  Operand ops[3];
+  ops[0] = local_op(d, ty);
+  ops[1] = a;
+  ops[2] = b;
+  emit_ops(f, CG_IR_BINOP, ops, 3)->extra.imm = (i64)op;
+}
+static void emit_unop(CgIrFunc* f, UnOp op, CGLocal d, CfreeCgTypeId ty,
+                      Operand a) {
+  /* Emits CG_IR_UNOP with operands {dst, src}; the UnOp selector is stored in
+   * extra.imm. */
+  Operand ops[2];
+  ops[0] = local_op(d, ty);
+  ops[1] = a;
+  emit_ops(f, CG_IR_UNOP, ops, 2)->extra.imm = (i64)op;
+}
+static void emit_cmp(CgIrFunc* f, CmpOp op, CGLocal d, CfreeCgTypeId dty,
+                     Operand a, Operand b) {
+  /* Emits CG_IR_CMP with operands {dst, lhs, rhs}; the CmpOp selector is
+   * stored in extra.imm. */
+  Operand ops[3];
+  ops[0] = local_op(d, dty);
+  ops[1] = a;
+  ops[2] = b;
+  emit_ops(f, CG_IR_CMP, ops, 3)->extra.imm = (i64)op;
+}
+static void emit_convert(CgIrFunc* f, ConvKind k, CGLocal d, CfreeCgTypeId dty,
+                         Operand src) {
+  /* Emits CG_IR_CONVERT with operands {dst, src}; the ConvKind selector is
+   * stored in extra.imm. */
+  Operand ops[2];
+  ops[0] = local_op(d, dty);
+  ops[1] = src;
+  emit_ops(f, CG_IR_CONVERT, ops, 2)->extra.imm = (i64)k;
+}
+static void emit_intrin1(CgIrFunc* f, IntrinKind k, CGLocal d, CfreeCgTypeId dty,
+                         Operand arg) {
+  /* Emits a one-destination, one-argument CG_IR_INTRINSIC. The operand lists
+   * are duplicated with cg_ir_dup_operands into f->arena and attached to the
+   * instruction through a zero-initialized CgIrIntrinsicAux record. */
+  CgIrInst* inst = cg_ir_emit(f, CG_IR_INTRINSIC, (SrcLoc){0, 0, 0});
+  CgIrIntrinsicAux* ia = arena_znew(f->arena, CgIrIntrinsicAux);
+  Operand dst = local_op(d, dty);
+  ia->kind = k;
+  ia->ndst = 1;
+  ia->narg = 1;
+  ia->dsts = cg_ir_dup_operands(f->arena, &dst, 1);
+  ia->args = cg_ir_dup_operands(f->arena, &arg, 1);
+  inst->extra.aux = ia;
+}
+
+/* Run f(x) = uo(x) if use_unop, else f(x) = x bo x, on i32; return the low 32 bits. */
+static u32 run_un_i32(TestCtx* tc, BinOp bo, int use_unop, UnOp uo, u32 x) {
+ CGLocal p[1];
+ CGLocal r;
+ CgIrFunc* f;
+ u64 args[1];
+ int64_t out = 0;
+ CfreeCgTypeId i32 = tc->i32;
+ f = new_func_p(tc, i32, &i32, 1, p);
+ r = add_local_ty(f, tc, i32);
+ if (use_unop)
+ emit_unop(f, uo, r, i32, local_op(p[0], i32)); /* bo is ignored on this path */
+ else
+ emit_binop(f, bo, r, i32, local_op(p[0], i32), local_op(p[0], i32)); /* uo is ignored; both operands are x */
+ ret_local(f, r);
+ args[0] = x;
+ (void)run_args(tc, f, args, 1, &out); /* interpreter status is deliberately discarded */
+ return (u32)(u64)out;
+}
+
+/* Run a binary i32 op f(x,y)=x OP y; return the status and store the low 32 bits in *res. */
+static CfreeInterpStatus run_bin_i32(TestCtx* tc, BinOp bo, u32 x, u32 y,
+ u32* res) {
+ CGLocal p[2];
+ CGLocal r;
+ CgIrFunc* f;
+ u64 args[2];
+ int64_t out = 0;
+ CfreeInterpStatus s;
+ CfreeCgTypeId i32 = tc->i32;
+ CfreeCgTypeId pt[2];
+ pt[0] = i32;
+ pt[1] = i32;
+ f = new_func_p(tc, i32, pt, 2, p);
+ r = add_local_ty(f, tc, i32);
+ emit_binop(f, bo, r, i32, local_op(p[0], i32), local_op(p[1], i32));
+ ret_local(f, r);
+ args[0] = x; /* u32 arguments are zero-extended into the u64 slots */
+ args[1] = y;
+ s = run_args(tc, f, args, 2, &out);
+ *res = (u32)(u64)out; /* written even when s is not CFREE_INTERP_DONE */
+ return s;
+}
+
+/* Portable-mode i32 edges: add/mul/neg wrap mod 2^32; shift counts reduce mod 32. */
+static void spec_int_wrap_shift(void) {
+ TestCtx tc;
+ u32 res = 0;
+ tc_init(&tc);
+ /* imul wraps mod 2^32: 0x10000 * 0x10000 = 2^32 -> 0 */
+ EXPECT(run_bin_i32(&tc, BO_IMUL, 0x10000u, 0x10000u, &res) ==
+ CFREE_INTERP_DONE &&
+ res == 0u,
+ "imul wrap: got 0x%08x", res);
+ /* iadd wraps: 0xffffffff + 1 = 0 */
+ EXPECT(run_bin_i32(&tc, BO_IADD, 0xffffffffu, 1u, &res) == CFREE_INTERP_DONE &&
+ res == 0u,
+ "iadd wrap: got 0x%08x", res);
+ /* shl count reduced mod 32: 1 << 33 == 1 << 1 == 2 */
+ EXPECT(run_bin_i32(&tc, BO_SHL, 1u, 33u, &res) == CFREE_INTERP_DONE &&
+ res == 2u,
+ "shl mask: got 0x%08x", res);
+ /* shr_u count mod 32: 0x80000000 >> 33 == >> 1 == 0x40000000 */
+ EXPECT(run_bin_i32(&tc, BO_SHR_U, 0x80000000u, 33u, &res) ==
+ CFREE_INTERP_DONE &&
+ res == 0x40000000u,
+ "shr_u mask: got 0x%08x", res);
+ /* shr_s arithmetic (sign-replicating): -256 >> 4 == -16; count 4 is in range, so no masking is tested */
+ EXPECT(run_bin_i32(&tc, BO_SHR_S, (u32)(-256), 4u, &res) ==
+ CFREE_INTERP_DONE &&
+ res == (u32)(-16),
+ "shr_s arith: got 0x%08x", res);
+ /* neg INT_MIN wraps to INT_MIN (two's complement, no trap); BO_IADD is ignored since use_unop=1 */
+ EXPECT(run_un_i32(&tc, BO_IADD, 1, UO_NEG, 0x80000000u) == 0x80000000u,
+ "neg INT_MIN wrap");
+ tc_fini(&tc);
+}
+
+/* Division / remainder edges (portable mode): a zero divisor traps and INT_MIN/-1
+ * wraps. sdiv, udiv and srem by zero are checked; urem by zero is not. */
+static void spec_div_edges(void) {
+ TestCtx tc;
+ u32 res = 0;
+ tc_init(&tc);
+ /* sdiv by zero traps */
+ EXPECT(run_bin_i32(&tc, BO_SDIV, 10u, 0u, &res) == CFREE_INTERP_TRAP,
+ "sdiv/0 should trap");
+ /* udiv by zero traps */
+ EXPECT(run_bin_i32(&tc, BO_UDIV, 10u, 0u, &res) == CFREE_INTERP_TRAP,
+ "udiv/0 should trap");
+ /* srem by zero traps */
+ EXPECT(run_bin_i32(&tc, BO_SREM, 10u, 0u, &res) == CFREE_INTERP_TRAP,
+ "srem/0 should trap");
+ /* INT_MIN / -1 wraps to INT_MIN, no trap */
+ EXPECT(run_bin_i32(&tc, BO_SDIV, 0x80000000u, 0xffffffffu, &res) ==
+ CFREE_INTERP_DONE &&
+ res == 0x80000000u,
+ "INT_MIN/-1 wrap: got 0x%08x", res);
+ /* INT_MIN % -1 == 0, no trap */
+ EXPECT(run_bin_i32(&tc, BO_SREM, 0x80000000u, 0xffffffffu, &res) ==
+ CFREE_INTERP_DONE &&
+ res == 0u,
+ "INT_MIN%%-1: got 0x%08x", res);
+ /* ordinary signed divide truncates toward zero: -7 / 2 == -3 */
+ EXPECT(run_bin_i32(&tc, BO_SDIV, (u32)(-7), 2u, &res) == CFREE_INTERP_DONE &&
+ res == (u32)(-3),
+ "sdiv trunc: got 0x%08x", res);
+ tc_fini(&tc);
+}
+
+/* clz/ctz at zero are defined to equal the bit width (stronger than C); checked for i32 only. */
+static void spec_clz_ctz_zero(void) {
+ TestCtx tc;
+ CGLocal p[1];
+ CGLocal r;
+ CgIrFunc* f;
+ u64 args[1];
+ int64_t out;
+ CfreeCgTypeId i32;
+ tc_init(&tc);
+ i32 = tc.i32;
+ /* clz(0) == 32 */
+ f = new_func_p(&tc, i32, &i32, 1, p);
+ r = add_local_ty(f, &tc, i32);
+ emit_intrin1(f, INTRIN_CLZ, r, i32, local_op(p[0], i32));
+ ret_local(f, r);
+ args[0] = 0;
+ out = -1; /* sentinel: differs from the expected 32 if the run never writes out */
+ EXPECT(run_args(&tc, f, args, 1, &out) == CFREE_INTERP_DONE && (u32)out == 32u,
+ "clz(0)==32: got %lld", (long long)out);
+ /* ctz(0) == 32 */
+ f = new_func_p(&tc, i32, &i32, 1, p);
+ r = add_local_ty(f, &tc, i32);
+ emit_intrin1(f, INTRIN_CTZ, r, i32, local_op(p[0], i32));
+ ret_local(f, r);
+ args[0] = 0;
+ out = -1; /* same sentinel as above */
+ EXPECT(run_args(&tc, f, args, 1, &out) == CFREE_INTERP_DONE && (u32)out == 32u,
+ "ctz(0)==32: got %lld", (long long)out);
+ tc_fini(&tc);
+}
+
+static u64 dbits(double d) {
+  u64 bits; /* memcpy (not a pointer cast) keeps the type-pun well-defined */
+  memcpy(&bits, &d, sizeof bits);
+  return bits;
+}
+static double bitsd(u64 u) {
+  double val; /* reinterpret the 64-bit pattern u as a double via memcpy */
+  memcpy(&val, &u, sizeof val);
+  return val;
+}
+
+/* Run f(x: f64) -> i32 = convert<k>(x) on `in`; *sp gets the status, the low 32 result bits are returned. */
+static u32 run_ftoi(TestCtx* tc, ConvKind k, double in, CfreeInterpStatus* sp) {
+ CGLocal p[1];
+ CGLocal r;
+ CgIrFunc* f;
+ u64 args[1];
+ int64_t out = 0;
+ CfreeCgTypeId f64 = tc->f64;
+ CfreeCgTypeId i32 = tc->i32;
+ f = new_func_p(tc, i32, &f64, 1, p);
+ r = add_local_ty(f, tc, i32);
+ emit_convert(f, k, r, i32, local_op(p[0], f64));
+ ret_local(f, r);
+ args[0] = dbits(in); /* the double is passed as its raw 64-bit pattern */
+ *sp = run_args(tc, f, args, 1, &out);
+ return (u32)(u64)out;
+}
+
+static void spec_ftoi_sat(void) { /* portable ftoi f64 -> i32: saturates, NaN -> 0 */
+ TestCtx tc;
+ CfreeInterpStatus s;
+ double nan = bitsd(0x7ff8000000000000ull); /* quiet-NaN bit pattern (IEEE-754 binary64) */
+ tc_init(&tc);
+ EXPECT(run_ftoi(&tc, CV_FTOI_S, 1e30, &s) == 0x7fffffffu && /* && sequences the call before s is read */
+ s == CFREE_INTERP_DONE,
+ "ftoi_s overflow -> INT_MAX");
+ EXPECT(run_ftoi(&tc, CV_FTOI_S, -1e30, &s) == 0x80000000u &&
+ s == CFREE_INTERP_DONE,
+ "ftoi_s underflow -> INT_MIN");
+ EXPECT(run_ftoi(&tc, CV_FTOI_S, nan, &s) == 0u && s == CFREE_INTERP_DONE,
+ "ftoi_s NaN -> 0");
+ EXPECT(run_ftoi(&tc, CV_FTOI_S, -7.9, &s) == (u32)(-7) &&
+ s == CFREE_INTERP_DONE,
+ "ftoi_s trunc toward zero");
+ EXPECT(run_ftoi(&tc, CV_FTOI_U, -1.0, &s) == 0u && s == CFREE_INTERP_DONE,
+ "ftoi_u negative -> 0");
+ EXPECT(run_ftoi(&tc, CV_FTOI_U, 1e30, &s) == 0xffffffffu &&
+ s == CFREE_INTERP_DONE,
+ "ftoi_u overflow -> UINT_MAX");
+ tc_fini(&tc);
+}
+
+/* FP compares: relationals + eq are ordered (NaN -> false); ne is unordered
+ * (NaN -> true). Returns the i32 compare result, or -1 if the run did not finish. */
+static int run_fcmp(TestCtx* tc, CmpOp op, double a, double b) {
+  CGLocal p[2];
+  CGLocal r;
+  CgIrFunc* f;
+  u64 args[2];
+  int64_t out = 0;
+  CfreeCgTypeId f64 = tc->f64;
+  CfreeCgTypeId i32 = tc->i32;
+  CfreeCgTypeId pt[2];
+  pt[0] = f64;
+  pt[1] = f64;
+  f = new_func_p(tc, i32, pt, 2, p);
+  r = add_local_ty(f, tc, i32);
+  emit_cmp(f, op, r, i32, local_op(p[0], f64), local_op(p[1], f64));
+  ret_local(f, r);
+  args[0] = dbits(a); /* doubles are passed as raw 64-bit patterns */
+  args[1] = dbits(b);
+  if (run_args(tc, f, args, 2, &out) != CFREE_INTERP_DONE) return -1; /* else a failed run reads as "false" */
+  return (int)(u32)out;
+}
+
+static void spec_fp_cmp_nan(void) { /* f64 compare results around NaN and signed zero */
+ TestCtx tc;
+ double nan = bitsd(0x7ff8000000000000ull); /* quiet-NaN bit pattern (IEEE-754 binary64) */
+ tc_init(&tc);
+ EXPECT(run_fcmp(&tc, CMP_LT_F, nan, 1.0) == 0, "lt_f NaN ordered -> false");
+ EXPECT(run_fcmp(&tc, CMP_GE_F, 1.0, nan) == 0, "ge_f NaN ordered -> false");
+ EXPECT(run_fcmp(&tc, CMP_EQ, nan, nan) == 0, "eq NaN ordered -> false");
+ EXPECT(run_fcmp(&tc, CMP_NE, nan, nan) == 1, "ne NaN unordered -> true");
+ EXPECT(run_fcmp(&tc, CMP_EQ, -0.0, 0.0) == 1, "eq -0.0 == 0.0 -> true");
+ EXPECT(run_fcmp(&tc, CMP_LT_F, 1.0, 2.0) == 1, "lt_f ordinary -> true"); /* non-NaN control case */
+ tc_fini(&tc);
+}
+
+/* fneg flips the sign bit (not 0 - x); fdiv follows IEEE (1/0 -> +inf, 0/0 -> NaN). */
+static void spec_fneg_fdiv(void) {
+ TestCtx tc;
+ CGLocal p[2];
+ CGLocal r;
+ CgIrFunc* f;
+ u64 args[2];
+ int64_t out;
+ CfreeCgTypeId f64;
+ CfreeCgTypeId pt[2];
+ tc_init(&tc);
+ f64 = tc.f64;
+ /* fneg(+0.0) -> -0.0 (sign bit set), proving it is not 0 - x */
+ f = new_func_p(&tc, f64, &f64, 1, p);
+ r = add_local_ty(f, &tc, f64);
+ emit_unop(f, UO_FNEG, r, f64, local_op(p[0], f64));
+ ret_local(f, r);
+ args[0] = dbits(0.0);
+ out = 0;
+ EXPECT(run_args(&tc, f, args, 1, &out) == CFREE_INTERP_DONE &&
+ (u64)out == 0x8000000000000000ull,
+ "fneg(+0.0) -> -0.0: got 0x%016llx", (unsigned long long)(u64)out);
+ /* fdiv 1.0/0.0 -> +inf */
+ pt[0] = f64;
+ pt[1] = f64;
+ f = new_func_p(&tc, f64, pt, 2, p);
+ r = add_local_ty(f, &tc, f64);
+ emit_binop(f, BO_FDIV, r, f64, local_op(p[0], f64), local_op(p[1], f64));
+ ret_local(f, r);
+ args[0] = dbits(1.0);
+ args[1] = dbits(0.0);
+ out = 0;
+ EXPECT(run_args(&tc, f, args, 2, &out) == CFREE_INTERP_DONE &&
+ (u64)out == 0x7ff0000000000000ull,
+ "fdiv 1/0 -> +inf: got 0x%016llx", (unsigned long long)(u64)out);
+ /* fdiv 0.0/0.0 -> NaN; reuses the fdiv function built above with new arguments */
+ args[0] = dbits(0.0);
+ args[1] = dbits(0.0);
+ out = 0;
+ (void)run_args(&tc, f, args, 2, &out); /* status discarded; NOTE(review): f goes through opt_run_o1_interp a second time — confirm that is safe */
+ EXPECT(bitsd((u64)out) != bitsd((u64)out), "fdiv 0/0 -> NaN"); /* x != x holds only for NaN */
+ tc_fini(&tc);
+}
+
int main(void) {
cfree_unit_init(&g_u);
g_u.ctx.now = -1;
interp_runs_arithmetic();
interp_runs_branch();
+ spec_int_wrap_shift();
+ spec_div_edges();
+ spec_clz_ctz_zero();
+ spec_ftoi_sat();
+ spec_fp_cmp_nan();
+ spec_fneg_fdiv();
if (g_u.fails) {
fprintf(stderr, "interp-smoke: %d/%d failed\n", g_u.fails, g_u.checks);
return 1;