commit 58b3b6295afbdbdef20c1fa9236cd47eda894c7b
parent ffc9d1d619fb773720f51a7033a308752c9feb6f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 11:22:30 -0700
opt: teach the allocator about fixed-register machine-instruction clobbers
Some target instructions clobber specific physical registers as a side effect of
their encoding — x86 idiv writes rax/rdx, a variable shift uses cl, cmpxchg/xadd
use rax/rcx/rdx. The optimizer modelled none of this, so at -O1 the allocator
kept a live value in (say) rdx across an idiv and the backend's cqo/idiv silently
destroyed it (e.g. 23/4 + 23%4: the first quotient parked in rdx, clobbered by
the second divide's cqo). aarch64/riscv64 never hit this — their div/shift/mul
are ordinary three-operand forms.
New mechanism (design in doc/OPT_MACHINE_REG_CONSTRAINTS.md):
- NativeTarget.machine_op_clobbers(op) reports the registers an instruction's
encoding clobbers, one bitmask per class, from a small NativeMachineOp
descriptor (so the backend never depends on the optimizer IR). The hook is
NULL for aa64/rv64.
- opt_machinize_native records them per instruction in Func.inst_clobbers
(side table keyed by InstId; both machinize and regalloc run on HIR so ids
are stable).
- The regalloc live-walk applies them like inline-asm clobbers: each clobbered
register is forbidden for every value that is live ACROSS the instruction and
not (re)defined by it. Reuses the existing tied/forbidden machinery; the walk
now also runs for clobber-bearing functions, not only asm ones.
- x64 declares clobbers for idiv/div (rax,rdx), variable shift (rcx), atomic
cas/rmw (rax,rcx,rdx), va_arg (rax). x64 variable-shift emit also stages the
count via r11 so the count->rcx and value->dst moves never clobber each other.
x64 parse -O1: fixes div_mod, typedef_vla_size, rv64_atomic_widths (882->885).
No regressions: toy x64/rv64 O0+O1 156/0, aa64 1034/0 (hook NULL => byte-identical).
Diffstat:
5 files changed, 162 insertions(+), 3 deletions(-)
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -223,6 +223,25 @@ typedef struct NativeFramePatchState {
#define NATIVE_CALL_PLAN_CLASSES 3u
+/* A semantic machine operation, enough for the target to report the physical
+ * registers its encoding clobbers as a side effect (e.g. x86 idiv writes
+ * rax/rdx, variable shifts use cl). Built by the optimizer from an instruction;
+ * the descriptor keeps the backend from depending on the optimizer IR. */
+typedef enum NativeMachineOpKind {
+ NATIVE_MOP_BINOP, /* binary operator; NativeMachineOp.binop says which one */
+ NATIVE_MOP_VA_ARG, /* va_arg */
+ NATIVE_MOP_ATOMIC_CAS, /* atomic compare-and-swap */
+ NATIVE_MOP_ATOMIC_RMW, /* atomic read-modify-write */
+ NATIVE_MOP_INTRINSIC, /* intrinsic; NativeMachineOp.intrin says which one */
+} NativeMachineOpKind;
+
+/* Descriptor passed to NativeTarget.machine_op_clobbers. machinize_inst_clobbers
+ * zeroes the whole struct before filling it in, so fields that do not apply to
+ * `kind` read as 0. */
+typedef struct NativeMachineOp {
+ u8 kind; /* NativeMachineOpKind */
+ u8 binop; /* BinOp, when kind == NATIVE_MOP_BINOP */
+ u8 intrin; /* IntrinKind, when kind == NATIVE_MOP_INTRINSIC */
+ u8 second_is_reg; /* binop's second operand is a register (not an immediate) */
+} NativeMachineOp;
+
typedef struct NativeCallDesc {
CfreeCgTypeId fn_type;
NativeLoc callee;
@@ -281,6 +300,15 @@ struct NativeTarget {
NativeAllocClass (*class_for_type)(NativeTarget*, CfreeCgTypeId);
int (*imm_legal)(NativeTarget*, NativeImmUse, u32 op, CfreeCgTypeId, i64);
int (*addr_legal)(NativeTarget*, const NativeAddr*, MemAccess);
+ /* Optional. Report the physical registers the target's encoding of `op`
+ * clobbers as a side effect (not its declared operands/results), one bitmask
+ * per NativeAllocClass. The optimizer keeps values live ACROSS the instruction
+ * out of these registers, so the backend may use them freely (x86 idiv writes
+ * rax/rdx; a variable shift uses cl; atomics use rax/rcx/rdx). Return non-zero
+ * if any register is clobbered, 0 otherwise (the common, unconstrained case).
+ * NULL means no instruction clobbers fixed registers (aa64/rv64). */
+ int (*machine_op_clobbers)(NativeTarget*, const NativeMachineOp* op,
+ u32 clobber_mask[NATIVE_CALL_PLAN_CLASSES]);
void (*func_begin)(NativeTarget*, const CGFuncDesc*);
void (*func_begin_known_frame)(NativeTarget*, const CGFuncDesc*,
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -958,9 +958,19 @@ static void x64_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop,
}
{
u32 rb = loc_reg(bop);
- if (rb != X64_RCX) emit_mov_rr(mc, 0, X64_RCX, rb);
+ /* Place the count in cl and the value in dst. Stage the count through
+ * r11 first so neither move clobbers the other when the value already
+ * sits in rcx or the count sits in dst. (The optimizer additionally
+ * keeps values live across the shift out of rcx — see
+ * x64_machine_op_clobbers.) */
+ if (rb != X64_RCX) {
+ emit_mov_rr(mc, 0, X64_TMP_INT2, rb);
+ if (rd != ra) emit_mov_rr(mc, w, rd, ra);
+ emit_mov_rr(mc, 0, X64_RCX, X64_TMP_INT2);
+ } else if (rd != ra) {
+ emit_mov_rr(mc, w, rd, ra);
+ }
}
- if (rd != ra) emit_mov_rr(mc, w, rd, ra);
emit_shift_cl(mc, w, sub, rd);
return;
}
@@ -3646,6 +3656,47 @@ static void x64_finalize(NativeTarget* t) {
if (t->mc) mc_emit_eh_frame(t->mc);
}
+/* NativeTarget.machine_op_clobbers for x86-64.
+ *
+ * Reports which integer registers the encoding of `op` destroys as a side
+ * effect, so the optimizer can keep values that live across the instruction out
+ * of them and the backend can use them freely:
+ *   - sdiv/udiv/srem/urem: rax (quotient) and rdx (remainder/sign);
+ *   - shl/shr with a register count: rcx (the count goes in cl);
+ *   - atomic cas / rmw (cmpxchg/xadd loops): rax, rcx and rdx;
+ *   - va_arg: rax (used for the gp/fp offset).
+ *
+ * All entries of `mask` are cleared first; only the NATIVE_REG_INT entry is ever
+ * set. Returns 1 when at least one register is clobbered, 0 otherwise. */
+static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
+                                   u32 mask[NATIVE_CALL_PLAN_CLASSES]) {
+  const u32 rax_bit = 1u << X64_RAX;
+  const u32 rcx_bit = 1u << X64_RCX;
+  const u32 rdx_bit = 1u << X64_RDX;
+  u32 int_clobbers = 0;
+
+  (void)t;
+  mask[0] = 0;
+  mask[1] = 0;
+  mask[2] = 0;
+
+  switch ((NativeMachineOpKind)op->kind) {
+    case NATIVE_MOP_BINOP:
+      switch ((BinOp)op->binop) {
+        case BO_SDIV:
+        case BO_UDIV:
+        case BO_SREM:
+        case BO_UREM:
+          int_clobbers = rax_bit | rdx_bit;
+          break;
+        case BO_SHL:
+        case BO_SHR_S:
+        case BO_SHR_U:
+          /* An immediate count is encoded in the instruction; only a register
+           * count has to travel through cl. */
+          if (op->second_is_reg) int_clobbers = rcx_bit;
+          break;
+        default:
+          break;
+      }
+      break;
+    case NATIVE_MOP_ATOMIC_CAS:
+    case NATIVE_MOP_ATOMIC_RMW:
+      int_clobbers = rax_bit | rcx_bit | rdx_bit;
+      break;
+    case NATIVE_MOP_VA_ARG:
+      int_clobbers = rax_bit;
+      break;
+    default:
+      break;
+  }
+
+  if (int_clobbers == 0) return 0;
+  mask[NATIVE_REG_INT] = int_clobbers;
+  return 1;
+}
+
/* ============================ construction ============================ */
NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
@@ -3662,6 +3713,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
t->class_for_type = x64_class_for_type;
t->imm_legal = x64_imm_legal;
t->addr_legal = x64_addr_legal;
+ t->machine_op_clobbers = x64_machine_op_clobbers;
t->func_begin = x64_func_begin;
t->func_begin_known_frame = x64_func_begin_known_frame;
t->bind_params_end = x64_bind_params_end;
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -361,6 +361,9 @@ typedef struct IRBitFieldAux {
#define OPT_REG_CLASSES 3u
#define OPT_MAX_HARD_REGS 32u
+
+/* Fixed-register clobbers of one instruction: element [cls] is a bitmask over
+ * the hard registers of register class cls (bit r set = register r is
+ * clobbered). See Func.inst_clobbers. */
+typedef u32 OptInstClobberMask[OPT_REG_CLASSES];
#define OPT_MAX_SCRATCH_REGS 4u
typedef struct IRGepAux {
@@ -703,6 +706,13 @@ typedef struct Func {
u64 opt_coalesce_merge_attempts;
u64 opt_coalesce_merges;
InstId next_inst_id;
+ /* Per-instruction fixed-register clobbers (one bitmask per reg class), indexed
+ * by InstId, sized [next_inst_id]. Built in opt_machinize_native from the
+ * target's machine_op_clobbers hook; consulted by the allocator (pass_lower)
+ * to keep values live across an instruction out of the registers its encoding
+ * destroys (x86 idiv → rax/rdx, etc.). NULL when no instruction clobbers. */
+ OptInstClobberMask* inst_clobbers;
+ u32 inst_clobbers_cap;
OptPRegInfo* preg_info; /* indexed by the current allocation reg namespace */
OptLoc* preg_locs; /* canonical final allocation locations by PReg */
MFunc* mir; /* physical post-allocation IR; HIR stays virtual */
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -206,6 +206,30 @@ static void apply_asm_register_constraints(Func* f, Inst* in, u64* use,
}
}
+/* Turn the fixed-register clobbers recorded for `in` (Func.inst_clobbers, filled
+ * in during machinization) into allocation constraints.
+ *
+ * A register that the instruction's encoding destroys cannot carry a value past
+ * the instruction, except a value the instruction itself (re)defines. So every
+ * value that is in `live_after` and not in `def` is forbidden from each
+ * clobbered register of each class. Values that only die here are left
+ * unconstrained: the backend moves them into and out of the fixed registers on
+ * its own.
+ *
+ * Does nothing when the function has no preg_info or clobber table, when the
+ * instruction has no id, or when its id lies outside the table. */
+static void apply_machine_reg_clobbers(Func* f, Inst* in, u64* def,
+                                       u64* live_after) {
+  if (!f->preg_info || !f->inst_clobbers) return;
+  if (in->id == INST_ID_NONE || in->id >= f->inst_clobbers_cap) return;
+
+  const u32* per_class = f->inst_clobbers[in->id];
+  for (u32 cls = 0; cls < OPT_REG_CLASSES; ++cls) {
+    u32 pending = per_class[cls];
+    /* Visit the set bits in ascending register order; stop once none remain. */
+    for (Reg r = 0; pending != 0 && r < 32; ++r) {
+      u32 bit = 1u << r;
+      if (!(pending & bit)) continue;
+      pending &= ~bit;
+      for (PReg v = 1; v < opt_reg_count(f); ++v) {
+        int lives_across =
+            live_after && bit_has(live_after, v) && !bit_has(def, v);
+        if (lives_across) forbid_preg_reg(f, v, (u8)cls, r);
+      }
+    }
+  }
+}
+
static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) {
if (!f || cls >= OPT_REG_CLASSES) return 0;
for (u32 i = 0; i < f->opt_phys_reg_count[cls]; ++i) {
@@ -784,7 +808,9 @@ static void opt_apply_asm_constraints_from_live(Func* f,
}
}
}
- if (!has_asm) return;
+ /* The live walk drives both inline-asm operand constraints and per-instruction
+ * fixed-register clobbers (Func.inst_clobbers); run it if either is present. */
+ if (!has_asm && !f->inst_clobbers) return;
u32 words = live_info ? live_info->words : f->opt_live_words;
if (!words) words = bit_words(opt_reg_count(f));
@@ -805,6 +831,7 @@ static void opt_apply_asm_constraints_from_live(Func* f,
opt_walk_inst_operands(f, in, collect_bits, &bc);
if ((IROp)in->op == IR_ASM_BLOCK)
apply_asm_register_constraints(f, in, use, def, live);
+ apply_machine_reg_clobbers(f, in, def, live);
live_update_before(live, use, def, words);
}
}
diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c
@@ -144,9 +144,51 @@ static void machinize_check_overlap(Func* f) {
}
}
+/* Record, per instruction, the physical registers the target's encoding clobbers
+ * as a side effect (x86 idiv → rax/rdx, variable shift → cl, atomics, va_arg),
+ * so the allocator keeps values live across them out of those registers.
+ *
+ * Each IR_BINOP / IR_VA_ARG / IR_ATOMIC_CAS / IR_ATOMIC_RMW instruction is
+ * described to the target as a NativeMachineOp and passed to
+ * target->machine_op_clobbers. When the hook reports clobbers, the per-class
+ * masks are stored in f->inst_clobbers[in->id]. The table is allocated lazily
+ * from f->arena with f->next_inst_id entries, so it stays NULL when nothing
+ * clobbers. A NULL hook (aa64/rv64) means no instruction has fixed-register
+ * clobbers and the table stays empty. */
+static void machinize_inst_clobbers(Func* f, NativeTarget* target) {
+  /* Rebuild from scratch on every run. A table kept from an earlier run would
+   * carry stale entries and a capacity sized for the old next_inst_id, so the
+   * bounds check below would silently drop the clobbers of any instruction
+   * with a larger id. */
+  f->inst_clobbers = NULL;
+  f->inst_clobbers_cap = 0;
+  if (!target->machine_op_clobbers || !f->next_inst_id) return;
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      Inst* in = &bl->insts[i];
+      NativeMachineOp mop;
+      u32 mask[NATIVE_CALL_PLAN_CLASSES];
+      /* An instruction without an id has no slot in the side table, so there is
+       * no point asking the target about it. */
+      if (in->id == INST_ID_NONE) continue;
+      memset(&mop, 0, sizeof mop);
+      switch ((IROp)in->op) {
+        case IR_BINOP:
+          mop.kind = NATIVE_MOP_BINOP;
+          mop.binop = (u8)in->extra.imm;
+          /* Operand 2 is the binop's second source; the target only needs to
+           * know whether it is a register or not. */
+          mop.second_is_reg =
+              (u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG);
+          break;
+        case IR_VA_ARG: mop.kind = NATIVE_MOP_VA_ARG; break;
+        case IR_ATOMIC_CAS: mop.kind = NATIVE_MOP_ATOMIC_CAS; break;
+        case IR_ATOMIC_RMW: mop.kind = NATIVE_MOP_ATOMIC_RMW; break;
+        default: continue;
+      }
+      memset(mask, 0, sizeof mask);
+      if (!target->machine_op_clobbers(target, &mop, mask)) continue;
+      if (!f->inst_clobbers) {
+        /* First clobbering instruction: allocate a zeroed table with one entry
+         * per instruction id. */
+        f->inst_clobbers_cap = f->next_inst_id;
+        f->inst_clobbers =
+            arena_zarray(f->arena, OptInstClobberMask, f->inst_clobbers_cap);
+      }
+      if (in->id >= f->inst_clobbers_cap) continue;
+      for (u32 c = 0; c < OPT_REG_CLASSES; ++c)
+        f->inst_clobbers[in->id][c] = mask[c];
+    }
+  }
+}
+
void opt_machinize_native(Func* f, NativeTarget* target) {
machinize_reset(f, target);
machinize_prepare_insts(f, target);
machinize_collect_regs(f, target);
machinize_check_overlap(f);
+ machinize_inst_clobbers(f, target);
}