kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 58b3b6295afbdbdef20c1fa9236cd47eda894c7b
parent ffc9d1d619fb773720f51a7033a308752c9feb6f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 11:22:30 -0700

opt: teach the allocator about fixed-register machine-instruction clobbers

Some target instructions clobber specific physical registers as a side effect of
their encoding — x86 idiv writes rax/rdx, a variable shift uses cl, cmpxchg/xadd
use rax/rcx/rdx. The optimizer modelled none of this, so at -O1 the allocator
kept a live value in (say) rdx across an idiv and the backend's cqo/idiv silently
destroyed it (e.g. 23/4 + 23%4: the first quotient parked in rdx, clobbered by
the second divide's cqo). aarch64/riscv64 never hit this — their div/shift/mul
are ordinary three-operand forms.

New mechanism (design in doc/OPT_MACHINE_REG_CONSTRAINTS.md):
- NativeTarget.machine_op_clobbers(op) reports the registers an instruction's
  encoding clobbers, one bitmask per class, from a small NativeMachineOp
  descriptor (keeps the backend off the optimizer IR). NULL for aa64/rv64.
- opt_machinize_native records them per instruction in Func.inst_clobbers
  (side table keyed by InstId; both machinize and regalloc run on HIR so ids
  are stable).
- The regalloc live-walk applies them like inline-asm clobbers: forbid each
  clobbered register for every value live ACROSS the instruction that it does
  not (re)define. Reuses the existing tied/forbidden machinery; the walk now
  also runs for clobber-bearing functions, not only asm ones.
- x64 declares clobbers for idiv/div (rax,rdx), variable shift (rcx), atomic
  cas/rmw (rax,rcx,rdx), va_arg (rax). x64 variable-shift emit also stages the
  count via r11 so the count->rcx and value->dst moves never clobber each other.

x64 parse -O1: fixes div_mod, typedef_vla_size, rv64_atomic_widths (882->885).
No regressions: toy x64/rv64 O0+O1 156/0, aa64 1034/0 (hook NULL => byte-identical).

Diffstat:
M src/arch/native_target.h | 28 ++++++++++++++++++++++++++++
M src/arch/x64/native.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M src/opt/ir.h | 10 ++++++++++
M src/opt/pass_lower.c | 29 ++++++++++++++++++++++++++++-
M src/opt/pass_machinize.c | 42 ++++++++++++++++++++++++++++++++++++++++++
5 files changed, 162 insertions(+), 3 deletions(-)

diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -223,6 +223,25 @@ typedef struct NativeFramePatchState { #define NATIVE_CALL_PLAN_CLASSES 3u +/* A semantic machine operation, enough for the target to report the physical + * registers its encoding clobbers as a side effect (e.g. x86 idiv writes + * rax/rdx, variable shifts use cl). Built by the optimizer from an instruction; + * the descriptor keeps the backend from depending on the optimizer IR. */ +typedef enum NativeMachineOpKind { + NATIVE_MOP_BINOP, + NATIVE_MOP_VA_ARG, + NATIVE_MOP_ATOMIC_CAS, + NATIVE_MOP_ATOMIC_RMW, + NATIVE_MOP_INTRINSIC, +} NativeMachineOpKind; + +typedef struct NativeMachineOp { + u8 kind; /* NativeMachineOpKind */ + u8 binop; /* BinOp, when kind == NATIVE_MOP_BINOP */ + u8 intrin; /* IntrinKind, when kind == NATIVE_MOP_INTRINSIC */ + u8 second_is_reg; /* binop's second operand is a register (not an immediate) */ +} NativeMachineOp; + typedef struct NativeCallDesc { CfreeCgTypeId fn_type; NativeLoc callee; @@ -281,6 +300,15 @@ struct NativeTarget { NativeAllocClass (*class_for_type)(NativeTarget*, CfreeCgTypeId); int (*imm_legal)(NativeTarget*, NativeImmUse, u32 op, CfreeCgTypeId, i64); int (*addr_legal)(NativeTarget*, const NativeAddr*, MemAccess); + /* Optional. Report the physical registers the target's encoding of `op` + * clobbers as a side effect (not its declared operands/results), one bitmask + * per NativeAllocClass. The optimizer keeps values live ACROSS the instruction + * out of these registers, so the backend may use them freely (x86 idiv writes + * rax/rdx; a variable shift uses cl; atomics use rax/rcx/rdx). Return non-zero + * if any register is clobbered, 0 otherwise (the common, unconstrained case). + * NULL means no instruction clobbers fixed registers (aa64/rv64). 
*/ + int (*machine_op_clobbers)(NativeTarget*, const NativeMachineOp* op, + u32 clobber_mask[NATIVE_CALL_PLAN_CLASSES]); void (*func_begin)(NativeTarget*, const CGFuncDesc*); void (*func_begin_known_frame)(NativeTarget*, const CGFuncDesc*, diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -958,9 +958,19 @@ static void x64_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop, } { u32 rb = loc_reg(bop); - if (rb != X64_RCX) emit_mov_rr(mc, 0, X64_RCX, rb); + /* Place the count in cl and the value in dst. Stage the count through + * r11 first so neither move clobbers the other when the value already + * sits in rcx or the count sits in dst. (The optimizer additionally + * keeps values live across the shift out of rcx — see + * x64_machine_op_clobbers.) */ + if (rb != X64_RCX) { + emit_mov_rr(mc, 0, X64_TMP_INT2, rb); + if (rd != ra) emit_mov_rr(mc, w, rd, ra); + emit_mov_rr(mc, 0, X64_RCX, X64_TMP_INT2); + } else if (rd != ra) { + emit_mov_rr(mc, w, rd, ra); + } } - if (rd != ra) emit_mov_rr(mc, w, rd, ra); emit_shift_cl(mc, w, sub, rd); return; } @@ -3646,6 +3656,47 @@ static void x64_finalize(NativeTarget* t) { if (t->mc) mc_emit_eh_frame(t->mc); } +/* Physical registers each x86-64 instruction's encoding clobbers as a side + * effect, so the optimizer keeps values live across them out of those registers + * (the backend is then free to use them). idiv/div write rax (quotient) and rdx + * (remainder/sign); a variable shift uses cl; cmpxchg/xadd loops use rax/rcx/rdx; + * va_arg uses rax for the gp/fp offset. 
*/ +static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op, + u32 mask[NATIVE_CALL_PLAN_CLASSES]) { + (void)t; + mask[0] = mask[1] = mask[2] = 0; + switch ((NativeMachineOpKind)op->kind) { + case NATIVE_MOP_BINOP: + switch ((BinOp)op->binop) { + case BO_SDIV: + case BO_UDIV: + case BO_SREM: + case BO_UREM: + mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RDX); + return 1; + case BO_SHL: + case BO_SHR_S: + case BO_SHR_U: + if (op->second_is_reg) { + mask[NATIVE_REG_INT] = (1u << X64_RCX); + return 1; + } + return 0; + default: + return 0; + } + case NATIVE_MOP_ATOMIC_CAS: + case NATIVE_MOP_ATOMIC_RMW: + mask[NATIVE_REG_INT] = (1u << X64_RAX) | (1u << X64_RCX) | (1u << X64_RDX); + return 1; + case NATIVE_MOP_VA_ARG: + mask[NATIVE_REG_INT] = (1u << X64_RAX); + return 1; + default: + return 0; + } +} + /* ============================ construction ============================ */ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj, @@ -3662,6 +3713,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj, t->class_for_type = x64_class_for_type; t->imm_legal = x64_imm_legal; t->addr_legal = x64_addr_legal; + t->machine_op_clobbers = x64_machine_op_clobbers; t->func_begin = x64_func_begin; t->func_begin_known_frame = x64_func_begin_known_frame; t->bind_params_end = x64_bind_params_end; diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -361,6 +361,9 @@ typedef struct IRBitFieldAux { #define OPT_REG_CLASSES 3u #define OPT_MAX_HARD_REGS 32u + +/* One per-class clobber bitmask set; see Func.inst_clobbers. */ +typedef u32 OptInstClobberMask[OPT_REG_CLASSES]; #define OPT_MAX_SCRATCH_REGS 4u typedef struct IRGepAux { @@ -703,6 +706,13 @@ typedef struct Func { u64 opt_coalesce_merge_attempts; u64 opt_coalesce_merges; InstId next_inst_id; + /* Per-instruction fixed-register clobbers (one bitmask per reg class), indexed + * by InstId, sized [next_inst_id]. 
Built in opt_machinize_native from the + * target's machine_op_clobbers hook; consulted by the allocator (pass_lower) + * to keep values live across an instruction out of the registers its encoding + * destroys (x86 idiv → rax/rdx, etc.). NULL when no instruction clobbers. */ + OptInstClobberMask* inst_clobbers; + u32 inst_clobbers_cap; OptPRegInfo* preg_info; /* indexed by the current allocation reg namespace */ OptLoc* preg_locs; /* canonical final allocation locations by PReg */ MFunc* mir; /* physical post-allocation IR; HIR stays virtual */ diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -206,6 +206,30 @@ static void apply_asm_register_constraints(Func* f, Inst* in, u64* use, } } +/* Apply the per-instruction fixed-register clobbers recorded in machinization + * (Func.inst_clobbers). A register the instruction's encoding destroys must not + * hold any value live AFTER the instruction unless that value is (re)defined + * here — so forbid each clobbered register for every live-after, non-def value. + * Values that merely die at the instruction (dying uses) need no constraint: + * the backend stages them into/out of the fixed registers itself. 
*/ +static void apply_machine_reg_clobbers(Func* f, Inst* in, u64* def, + u64* live_after) { + if (!f->preg_info || !f->inst_clobbers || in->id == INST_ID_NONE || + in->id >= f->inst_clobbers_cap) + return; + for (u32 cls = 0; cls < OPT_REG_CLASSES; ++cls) { + u32 mask = f->inst_clobbers[in->id][cls]; + if (!mask) continue; + for (Reg r = 0; r < 32; ++r) { + if ((mask & (1u << r)) == 0) continue; + for (PReg v = 1; v < opt_reg_count(f); ++v) { + if (!(live_after && bit_has(live_after, v)) || bit_has(def, v)) continue; + forbid_preg_reg(f, v, (u8)cls, r); + } + } + } +} + static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) { if (!f || cls >= OPT_REG_CLASSES) return 0; for (u32 i = 0; i < f->opt_phys_reg_count[cls]; ++i) { @@ -784,7 +808,9 @@ static void opt_apply_asm_constraints_from_live(Func* f, } } } - if (!has_asm) return; + /* The live walk drives both inline-asm operand constraints and per-instruction + * fixed-register clobbers (Func.inst_clobbers); run it if either is present. */ + if (!has_asm && !f->inst_clobbers) return; u32 words = live_info ? live_info->words : f->opt_live_words; if (!words) words = bit_words(opt_reg_count(f)); @@ -805,6 +831,7 @@ static void opt_apply_asm_constraints_from_live(Func* f, opt_walk_inst_operands(f, in, collect_bits, &bc); if ((IROp)in->op == IR_ASM_BLOCK) apply_asm_register_constraints(f, in, use, def, live); + apply_machine_reg_clobbers(f, in, def, live); live_update_before(live, use, def, words); } } diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c @@ -144,9 +144,51 @@ static void machinize_check_overlap(Func* f) { } } +/* Record, per instruction, the physical registers the target's encoding clobbers + * as a side effect (x86 idiv → rax/rdx, variable shift → cl, atomics, va_arg), + * so the allocator keeps values live across them out of those registers. 
The + * target reports this through machine_op_clobbers; a NULL hook (aa64/rv64) means + * no instruction has fixed-register clobbers and the side table stays empty. */ +static void machinize_inst_clobbers(Func* f, NativeTarget* target) { + if (!target->machine_op_clobbers || !f->next_inst_id) return; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + NativeMachineOp mop; + u32 mask[NATIVE_CALL_PLAN_CLASSES]; + memset(&mop, 0, sizeof mop); + switch ((IROp)in->op) { + case IR_BINOP: + mop.kind = NATIVE_MOP_BINOP; + mop.binop = (u8)in->extra.imm; + mop.second_is_reg = + (u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG); + break; + case IR_VA_ARG: mop.kind = NATIVE_MOP_VA_ARG; break; + case IR_ATOMIC_CAS: mop.kind = NATIVE_MOP_ATOMIC_CAS; break; + case IR_ATOMIC_RMW: mop.kind = NATIVE_MOP_ATOMIC_RMW; break; + default: continue; + } + mask[0] = mask[1] = mask[2] = 0; + if (!target->machine_op_clobbers(target, &mop, mask)) continue; + if (in->id == INST_ID_NONE) continue; + if (!f->inst_clobbers) { + f->inst_clobbers_cap = f->next_inst_id; + f->inst_clobbers = + arena_zarray(f->arena, OptInstClobberMask, f->inst_clobbers_cap); + } + if (in->id < f->inst_clobbers_cap) + for (u32 c = 0; c < OPT_REG_CLASSES; ++c) + f->inst_clobbers[in->id][c] = mask[c]; + } + } +} + void opt_machinize_native(Func* f, NativeTarget* target) { machinize_reset(f, target); machinize_prepare_insts(f, target); machinize_collect_regs(f, target); machinize_check_overlap(f); + machinize_inst_clobbers(f, target); }