opt: teach the allocator about fixed-register machine-instruction clobbers - kit

commit 58b3b6295afbdbdef20c1fa9236cd47eda894c7b
parent ffc9d1d619fb773720f51a7033a308752c9feb6f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 11:22:30 -0700

opt: teach the allocator about fixed-register machine-instruction clobbers

Some target instructions clobber specific physical registers as a side effect of
their encoding — x86 idiv writes rax/rdx, a variable shift uses cl, cmpxchg/xadd
use rax/rcx/rdx. The optimizer modelled none of this, so at -O1 the allocator
kept a live value in (say) rdx across an idiv and the backend's cqo/idiv silently
destroyed it (e.g. 23/4 + 23%4: the first quotient parked in rdx, clobbered by
the second divide's cqo). aarch64/riscv64 never hit this — their div/shift/mul
are ordinary three-operand forms.

New mechanism (design in doc/OPT_MACHINE_REG_CONSTRAINTS.md):
- NativeTarget.machine_op_clobbers(op) reports the registers an instruction's
  encoding clobbers, one bitmask per class, from a small NativeMachineOp
  descriptor (keeps the backend off the optimizer IR). NULL for aa64/rv64.
- opt_machinize_native records them per instruction in Func.inst_clobbers
  (side table keyed by InstId; both machinize and regalloc run on HIR so ids
  are stable).
- The regalloc live-walk applies them like inline-asm clobbers: for every value
  that is live ACROSS the instruction and that the instruction does not
  (re)define, forbid each clobbered register. Reuses the existing
  tied/forbidden machinery; the walk now also runs for clobber-bearing
  functions, not only asm ones.
- x64 declares clobbers for idiv/div (rax,rdx), variable shift (rcx), atomic
  cas/rmw (rax,rcx,rdx), va_arg (rax). x64 variable-shift emit also stages the
  count via r11 so the count->rcx and value->dst moves never clobber each other.

x64 parse -O1: fixes div_mod, typedef_vla_size, rv64_atomic_widths (882->885).
No regressions: toy x64/rv64 O0+O1 156/0, aa64 1034/0 (hook NULL => byte-identical).

Diffstat:
M src/arch/native_target.h  | 28 ++++++++++++++++++++++++++++
M src/arch/x64/native.c  | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M src/opt/ir.h  | 10 ++++++++++
M src/opt/pass_lower.c  | 29 ++++++++++++++++++++++++++++-
M src/opt/pass_machinize.c  | 42 ++++++++++++++++++++++++++++++++++++++++++

5 files changed, 162 insertions(+), 3 deletions(-)
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -223,6 +223,25 @@ typedef struct NativeFramePatchState {
 
 #define NATIVE_CALL_PLAN_CLASSES 3u
 
+/* A semantic machine operation, enough for the target to report the physical
+ * registers its encoding clobbers as a side effect (e.g. x86 idiv writes
+ * rax/rdx, variable shifts use cl). Built by the optimizer from an instruction;
+ * the descriptor keeps the backend from depending on the optimizer IR. */
+typedef enum NativeMachineOpKind {
+  NATIVE_MOP_BINOP,      /* binary op; NativeMachineOp.binop holds the BinOp */
+  NATIVE_MOP_VA_ARG,     /* va_arg */
+  NATIVE_MOP_ATOMIC_CAS, /* atomic compare-and-swap */
+  NATIVE_MOP_ATOMIC_RMW, /* atomic read-modify-write */
+  NATIVE_MOP_INTRINSIC,  /* intrinsic; NativeMachineOp.intrin holds the kind */
+} NativeMachineOpKind;
+
+typedef struct NativeMachineOp {
+  u8 kind;          /* NativeMachineOpKind; selects which fields below apply */
+  u8 binop;         /* BinOp, when kind == NATIVE_MOP_BINOP */
+  u8 intrin;        /* IntrinKind, when kind == NATIVE_MOP_INTRINSIC */
+  u8 second_is_reg; /* nonzero: binop's second operand is a register, not an immediate */
+} NativeMachineOp;
+
 typedef struct NativeCallDesc {
   CfreeCgTypeId fn_type;
   NativeLoc callee;
@@ -281,6 +300,15 @@ struct NativeTarget {
   NativeAllocClass (*class_for_type)(NativeTarget*, CfreeCgTypeId);
   int (*imm_legal)(NativeTarget*, NativeImmUse, u32 op, CfreeCgTypeId, i64);
   int (*addr_legal)(NativeTarget*, const NativeAddr*, MemAccess);
+  /* Optional. Report the physical registers the target's encoding of `op`
+   * clobbers as a side effect (not its declared operands/results), one bitmask
+   * per NativeAllocClass. The optimizer keeps values live ACROSS the instruction
+   * out of these registers, so the backend may use them freely (x86 idiv writes
+   * rax/rdx; a variable shift uses cl; atomics use rax/rcx/rdx). Return non-zero
+   * if any register is clobbered, 0 otherwise (the common, unconstrained case).
+   * NULL means no instruction clobbers fixed registers (aa64/rv64). */
+  int (*machine_op_clobbers)(NativeTarget*, const NativeMachineOp* op,
+                             u32 clobber_mask[NATIVE_CALL_PLAN_CLASSES]);
 
   void (*func_begin)(NativeTarget*, const CGFuncDesc*);
   void (*func_begin_known_frame)(NativeTarget*, const CGFuncDesc*,
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -958,9 +958,19 @@ static void x64_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop,
       }
       {
         u32 rb = loc_reg(bop);
-        if (rb != X64_RCX) emit_mov_rr(mc, 0, X64_RCX, rb);
+        /* Place the count in cl and the value in dst. Stage the count through
+         * r11 first so neither move clobbers the other when the value already
+         * sits in rcx or the count sits in dst. (The optimizer additionally
+         * keeps values live across the shift out of rcx — see
+         * x64_machine_op_clobbers.) */
+        if (rb != X64_RCX) {
+          emit_mov_rr(mc, 0, X64_TMP_INT2, rb);
+          if (rd != ra) emit_mov_rr(mc, w, rd, ra);
+          emit_mov_rr(mc, 0, X64_RCX, X64_TMP_INT2);
+        } else if (rd != ra) {
+          emit_mov_rr(mc, w, rd, ra);
+        }
       }
-      if (rd != ra) emit_mov_rr(mc, w, rd, ra);
       emit_shift_cl(mc, w, sub, rd);
       return;
     }
@@ -3646,6 +3656,47 @@ static void x64_finalize(NativeTarget* t) {
   if (t->mc) mc_emit_eh_frame(t->mc);
 }
 
+/* x86-64 implementation of NativeTarget.machine_op_clobbers: report which
+ * physical registers the encoding of `op` destroys as a side effect, so the
+ * optimizer keeps values live across the instruction out of them (leaving the
+ * backend free to use them):
+ *   sdiv/udiv/srem/urem          -> rax, rdx  (idiv/div quotient + remainder)
+ *   shl/shr with a register count -> rcx      (the count is taken from cl)
+ *   atomic cas / rmw             -> rax, rcx, rdx  (cmpxchg/xadd loops)
+ *   va_arg                       -> rax       (gp/fp offset)
+ * Every entry of `mask` is zeroed first; only the integer class is ever set.
+ * Returns 1 when at least one register is clobbered, 0 otherwise. */
+static int x64_machine_op_clobbers(NativeTarget* t, const NativeMachineOp* op,
+                                   u32 mask[NATIVE_CALL_PLAN_CLASSES]) {
+  const u32 rax = 1u << X64_RAX;
+  const u32 rcx = 1u << X64_RCX;
+  const u32 rdx = 1u << X64_RDX;
+  u32 int_regs = 0;
+  (void)t;
+  for (u32 cls = 0; cls < NATIVE_CALL_PLAN_CLASSES; ++cls) mask[cls] = 0;
+  switch ((NativeMachineOpKind)op->kind) {
+    case NATIVE_MOP_BINOP:
+      switch ((BinOp)op->binop) {
+        case BO_SDIV:
+        case BO_UDIV:
+        case BO_SREM:
+        case BO_UREM:
+          int_regs = rax | rdx;
+          break;
+        case BO_SHL:
+        case BO_SHR_S:
+        case BO_SHR_U:
+          /* Only a register count is reported; otherwise nothing is clobbered. */
+          if (op->second_is_reg) int_regs = rcx;
+          break;
+        default:
+          break;
+      }
+      break;
+    case NATIVE_MOP_ATOMIC_CAS:
+    case NATIVE_MOP_ATOMIC_RMW:
+      int_regs = rax | rcx | rdx;
+      break;
+    case NATIVE_MOP_VA_ARG:
+      int_regs = rax;
+      break;
+    default:
+      break;
+  }
+  if (!int_regs) return 0;
+  mask[NATIVE_REG_INT] = int_regs;
+  return 1;
+}
+
 /* ============================ construction ============================ */
 
 NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
@@ -3662,6 +3713,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
   t->class_for_type = x64_class_for_type;
   t->imm_legal = x64_imm_legal;
   t->addr_legal = x64_addr_legal;
+  t->machine_op_clobbers = x64_machine_op_clobbers;
   t->func_begin = x64_func_begin;
   t->func_begin_known_frame = x64_func_begin_known_frame;
   t->bind_params_end = x64_bind_params_end;
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -361,6 +361,9 @@ typedef struct IRBitFieldAux {
 
 #define OPT_REG_CLASSES 3u
 #define OPT_MAX_HARD_REGS 32u
+
+/* Clobber bitmasks for one instruction, one per reg class; see Func.inst_clobbers. */
+typedef u32 OptInstClobberMask[OPT_REG_CLASSES];
 #define OPT_MAX_SCRATCH_REGS 4u
 
 typedef struct IRGepAux {
@@ -703,6 +706,13 @@ typedef struct Func {
   u64 opt_coalesce_merge_attempts;
   u64 opt_coalesce_merges;
   InstId next_inst_id;
+  /* Per-instruction fixed-register clobbers (one bitmask per reg class), indexed
+   * by InstId, sized [next_inst_id]. Built in opt_machinize_native from the
+   * target's machine_op_clobbers hook; consulted by the allocator (pass_lower)
+   * to keep values live across an instruction out of the registers its encoding
+   * destroys (x86 idiv → rax/rdx, etc.). NULL when no instruction clobbers. */
+  OptInstClobberMask* inst_clobbers;
+  u32 inst_clobbers_cap;
   OptPRegInfo* preg_info; /* indexed by the current allocation reg namespace */
   OptLoc* preg_locs;      /* canonical final allocation locations by PReg */
   MFunc* mir;             /* physical post-allocation IR; HIR stays virtual */
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -206,6 +206,30 @@ static void apply_asm_register_constraints(Func* f, Inst* in, u64* use,
   }
 }
 
+/* Honour the fixed-register clobbers that machinization recorded for `in` in
+ * Func.inst_clobbers. For each register class, every value that is live after
+ * the instruction and is not defined by it has each clobbered register
+ * forbidden. Values that are not live afterwards (dying uses) are left
+ * unconstrained: the backend stages them into/out of the fixed registers
+ * itself. Does nothing when there is no preg_info, no clobber table, no
+ * live-after set, or `in` has no id / an id outside the table. */
+static void apply_machine_reg_clobbers(Func* f, Inst* in, u64* def,
+                                       u64* live_after) {
+  if (!f->preg_info || !f->inst_clobbers) return;
+  if (in->id == INST_ID_NONE || in->id >= f->inst_clobbers_cap) return;
+  if (!live_after) return; /* nothing is live after, so nothing to protect */
+
+  const u32* per_class = f->inst_clobbers[in->id];
+  for (u32 cls = 0; cls < OPT_REG_CLASSES; ++cls) {
+    u32 pending = per_class[cls];
+    /* Walk the set bits from register 0 upwards; stop once none remain. */
+    for (Reg r = 0; pending != 0 && r < 32; ++r, pending >>= 1) {
+      if (!(pending & 1u)) continue;
+      for (PReg v = 1; v < opt_reg_count(f); ++v) {
+        if (bit_has(live_after, v) && !bit_has(def, v))
+          forbid_preg_reg(f, v, (u8)cls, r);
+      }
+    }
+  }
+}
+
 static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) {
   if (!f || cls >= OPT_REG_CLASSES) return 0;
   for (u32 i = 0; i < f->opt_phys_reg_count[cls]; ++i) {
@@ -784,7 +808,9 @@ static void opt_apply_asm_constraints_from_live(Func* f,
       }
     }
   }
-  if (!has_asm) return;
+  /* The live walk drives both inline-asm operand constraints and per-instruction
+   * fixed-register clobbers (Func.inst_clobbers); run it if either is present. */
+  if (!has_asm && !f->inst_clobbers) return;
 
   u32 words = live_info ? live_info->words : f->opt_live_words;
   if (!words) words = bit_words(opt_reg_count(f));
@@ -805,6 +831,7 @@ static void opt_apply_asm_constraints_from_live(Func* f,
       opt_walk_inst_operands(f, in, collect_bits, &bc);
       if ((IROp)in->op == IR_ASM_BLOCK)
         apply_asm_register_constraints(f, in, use, def, live);
+      apply_machine_reg_clobbers(f, in, def, live);
       live_update_before(live, use, def, words);
     }
   }
diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c
@@ -144,9 +144,51 @@ static void machinize_check_overlap(Func* f) {
   }
 }
 
+/* Build Func.inst_clobbers: for each instruction, the physical registers the
+ * target's encoding destroys as a side effect (x86 idiv → rax/rdx, variable
+ * shift → cl, atomics, va_arg), so the allocator keeps values live across it
+ * out of those registers. Each IR_BINOP / IR_VA_ARG / IR_ATOMIC_CAS /
+ * IR_ATOMIC_RMW instruction is described to the target as a NativeMachineOp and
+ * queried through machine_op_clobbers; every other op is skipped. The table is
+ * allocated lazily, so it stays NULL when the hook is NULL (aa64/rv64) or when
+ * no instruction reports a clobber. */
+static void machinize_inst_clobbers(Func* f, NativeTarget* target) {
+  /* Start from an empty table on every run. The table is sized from
+   * next_inst_id at allocation time, so reusing one left by an earlier
+   * machinize would keep stale masks and silently drop the clobbers of any
+   * instruction created since (id >= old cap). The old array is arena-owned,
+   * so dropping the pointer is enough. */
+  f->inst_clobbers = NULL;
+  f->inst_clobbers_cap = 0;
+  if (!target->machine_op_clobbers || !f->next_inst_id) return;
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      Inst* in = &bl->insts[i];
+      NativeMachineOp mop;
+      u32 mask[NATIVE_CALL_PLAN_CLASSES];
+      /* The table is keyed by InstId; an id-less instruction has no slot. */
+      if (in->id == INST_ID_NONE) continue;
+      memset(&mop, 0, sizeof mop);
+      switch ((IROp)in->op) {
+        case IR_BINOP:
+          mop.kind = NATIVE_MOP_BINOP;
+          mop.binop = (u8)in->extra.imm;
+          /* Lets the target tell a register operand from an immediate one. */
+          mop.second_is_reg =
+              (u8)(in->nopnds > 2u && in->opnds[2].kind == OPK_REG);
+          break;
+        case IR_VA_ARG: mop.kind = NATIVE_MOP_VA_ARG; break;
+        case IR_ATOMIC_CAS: mop.kind = NATIVE_MOP_ATOMIC_CAS; break;
+        case IR_ATOMIC_RMW: mop.kind = NATIVE_MOP_ATOMIC_RMW; break;
+        default: continue;
+      }
+      for (u32 c = 0; c < NATIVE_CALL_PLAN_CLASSES; ++c) mask[c] = 0;
+      if (!target->machine_op_clobbers(target, &mop, mask)) continue;
+      if (!f->inst_clobbers) {
+        f->inst_clobbers_cap = f->next_inst_id;
+        f->inst_clobbers =
+            arena_zarray(f->arena, OptInstClobberMask, f->inst_clobbers_cap);
+      }
+      if (in->id >= f->inst_clobbers_cap) continue;
+      /* Bound by both class counts so neither array is overrun if the target
+       * and optimizer class counts ever diverge. */
+      for (u32 c = 0; c < OPT_REG_CLASSES && c < NATIVE_CALL_PLAN_CLASSES; ++c)
+        f->inst_clobbers[in->id][c] = mask[c];
+    }
+  }
+}
+
void opt_machinize_native(Func* f, NativeTarget* target) {
  machinize_reset(f, target);
  machinize_prepare_insts(f, target);
  machinize_collect_regs(f, target);
  machinize_check_overlap(f);
+  machinize_inst_clobbers(f, target); /* fill Func.inst_clobbers for regalloc */
}

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/native_target.h	\|	28	++++++++++++++++++++++++++++
M	src/arch/x64/native.c	\|	56	++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M	src/opt/ir.h	\|	10	++++++++++
M	src/opt/pass_lower.c	\|	29	++++++++++++++++++++++++++++-
M	src/opt/pass_machinize.c	\|	42	++++++++++++++++++++++++++++++++++++++++++