cg: imm-form encodings for x64 and rv64 binop/unop - kit

commit 4934c27ff80eca5aedb33bb589ddc42bdd9973ee
parent edb863ae6bdc7fba48ca788ced88e943c92ee280
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 11 May 2026 12:42:44 -0700

cg: imm-form encodings for x64 and rv64 binop/unop

Mirrors the aarch64 work in edb863a — both backends now honor the
REG|IMM contract on binop/unop and pick the imm-form encoding when the
literal fits.

x64: add emit_alu_imm8/imm32 and emit_imul_imm8/imm32, and give
emit_cmp_ab an imm32 fast path alongside the existing imm8 case.
x_binop swaps commutative ops to put IMM on the RHS, then encodes
ADD/SUB/AND/OR/XOR via 0x83/0x81 /sub and IMUL via 0x6B/0x69 /r when
the literal fits i8 or i32.  Shifts use the existing C1 /sub ib helper
(emit_shift_imm) for OPK_IMM counts and skip the mov-to-cl step.
x_unop materializes non-register operands through force_reg_int
instead of panicking.

rv64: rv_binop encodes IADD/AND/OR/XOR via rv_addi(w)/rv_andi/rv_ori/
rv_xori when the literal fits a 12-bit signed imm.  ISUB has no SUBI in
RV-I; it is encoded as ADDI with the negated literal when -imm fits the
same range.  Shifts use the rv_slli(w)/rv_srli(w)/rv_srai(w) helpers
for OPK_IMM counts.  rv_unop materializes non-register operands through
force_reg_int instead of panicking.

Diffstat:
M src/arch/rv64.c  | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M src/arch/x64.c  | 164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------

2 files changed, 228 insertions(+), 16 deletions(-)
diff --git a/src/arch/rv64.c b/src/arch/rv64.c
@@ -1459,6 +1459,78 @@ static void rv_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
   }
   u32 sf = type_is_64(dst.type) ? 1u : 0u;
   u32 rd = reg_num(dst);
+
+  /* Canonicalize IMM to the RHS for commutative ops so the imm-form
+   * check below handles `3 + a` the same as `a + 3`. ISUB is not
+   * commutative — IMM-on-LHS still materializes. */
+  switch (op) {
+    case BO_IADD:
+    case BO_AND:
+    case BO_OR:
+    case BO_XOR: {
+      if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) {
+        Operand t_op = a_op; a_op = b_op; b_op = t_op;
+      }
+      break;
+    }
+    default: break;
+  }
+
+  /* IMM-form fast paths. RV-I admits a 12-bit signed immediate for
+   * ADDI/ANDI/ORI/XORI/SLTI/SLTIU (range [-2048, 2047]). ISUB has no
+   * SUBI — we encode it as ADDI with the negated literal when -imm
+   * fits the same range, i.e. imm ∈ [-2047, 2048]: -2048 is excluded
+   * because +2048 does not fit, and range-checking before negating
+   * means -imm cannot overflow. Shifts take a shamt masked to 6 bits
+   * (0..63) on the 64-bit forms, 5 bits (0..31) on the W-variants. */
+  if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
+    u32 ra = reg_num(a_op);
+    i64 imm = b_op.v.imm;
+    int fits12 = imm >= -2048 && imm <= 2047;
+    switch (op) {
+      case BO_IADD:
+        if (fits12) {
+          emit32(mc, sf ? rv_addi(rd, ra, (i32)imm) : rv_addiw(rd, ra, (i32)imm));
+          return;
+        }
+        break;
+      case BO_ISUB:
+        if (imm >= -2047 && imm <= 2048) {
+          emit32(mc, sf ? rv_addi(rd, ra, (i32)-imm) : rv_addiw(rd, ra, (i32)-imm));
+          return;
+        }
+        break;
+      case BO_AND:
+        if (fits12) { emit32(mc, rv_andi(rd, ra, (i32)imm)); return; }
+        break;
+      case BO_OR:
+        if (fits12) { emit32(mc, rv_ori(rd, ra, (i32)imm)); return; }
+        break;
+      case BO_XOR:
+        if (fits12) { emit32(mc, rv_xori(rd, ra, (i32)imm)); return; }
+        break;
+      case BO_SHL: {
+        u32 width = sf ? 64u : 32u;
+        u32 sh = (u32)((u64)imm & (width - 1u));
+        emit32(mc, sf ? rv_slli(rd, ra, sh) : rv_slliw(rd, ra, sh));
+        return;
+      }
+      case BO_SHR_U: {
+        u32 width = sf ? 64u : 32u;
+        u32 sh = (u32)((u64)imm & (width - 1u));
+        emit32(mc, sf ? rv_srli(rd, ra, sh) : rv_srliw(rd, ra, sh));
+        return;
+      }
+      case BO_SHR_S: {
+        u32 width = sf ? 64u : 32u;
+        u32 sh = (u32)((u64)imm & (width - 1u));
+        emit32(mc, sf ? rv_srai(rd, ra, sh) : rv_sraiw(rd, ra, sh));
+        return;
+      }
+      default: break;
+    }
+  }
+
   u32 ra = force_reg_int(t, a_op, RV_T0);
   u32 rb = force_reg_int(t, b_op, (ra == RV_T0) ? RV_T1 : RV_T0);
 
@@ -1485,10 +1557,10 @@ static void rv_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) {
   MCEmitter* mc = t->mc;
   u32 sf = type_is_64(dst.type) ? 1u : 0u;
   u32 rd = reg_num(dst);
-  if (a_op.kind != OPK_REG) {
-    compiler_panic(t->c, impl_of(t)->loc, "rv64 unop: non-REG operand NYI");
-  }
-  u32 rn = reg_num(a_op);
+  /* IMM operand is legal per the CGTarget contract (arch.h); materialize
+   * into t0 when not already a register. cg folds literal unops upstream
+   * via cg_fold_unop. */
+  u32 rn = force_reg_int(t, a_op, RV_T0);
   switch (op) {
     case UO_NEG:
       emit32(mc, sf ? rv_sub(rd, RV_ZERO, rn) : rv_subw(rd, RV_ZERO, rn));
diff --git a/src/arch/x64.c b/src/arch/x64.c
@@ -496,6 +496,73 @@ static void emit_cmp_imm8(MCEmitter* mc, int w, u32 reg, i8 imm) {
   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
 }
 
+/* Emit `OP r/m, imm8`: opcode 0x83 /sub ib (imm sign-extended). sub
+ * selects the op: ADD=0, OR=1, ADC=2, SBB=3, AND=4, SUB=5, XOR=6, CMP=7. */
+static void emit_alu_imm8(MCEmitter* mc, int w, u32 sub, u32 reg, i8 imm) {
+  u32 ofs = obj_pos(mc->obj, mc->section_id); /* taken before emitting; handed to debug_emit_row below */
+  emit_rex(mc, w, 0, 0, reg); /* w and reg are forwarded for the REX prefix */
+  u8 buf[3];
+  buf[0] = 0x83; /* opcode */
+  buf[1] = modrm(3u, sub, reg); /* ModRM: mod=3, /sub opcode extension, reg as r/m */
+  buf[2] = (u8)imm; /* ib: the imm8 byte */
+  mc->emit_bytes(mc, buf, 3);
+  if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); /* only when mc->debug is set */
+}
+
+/* Emit `OP r/m, imm32`: opcode 0x81 /sub id, imm written little-endian (sign-extended for w=1). */
+static void emit_alu_imm32(MCEmitter* mc, int w, u32 sub, u32 reg, i32 imm) {
+  u32 ofs = obj_pos(mc->obj, mc->section_id); /* taken before emitting; handed to debug_emit_row below */
+  emit_rex(mc, w, 0, 0, reg); /* w and reg are forwarded for the REX prefix */
+  u8 buf[6];
+  buf[0] = 0x81; /* opcode */
+  buf[1] = modrm(3u, sub, reg); /* ModRM: mod=3, /sub opcode extension, reg as r/m */
+  buf[2] = (u8)(imm & 0xFF); /* id: least-significant byte first */
+  buf[3] = (u8)((imm >> 8) & 0xFF); /* NOTE(review): >> on a negative signed value is implementation-defined in C; */
+  buf[4] = (u8)((imm >> 16) & 0xFF); /* the & 0xFF mask gives the right byte for arithmetic or logical shifts, */
+  buf[5] = (u8)((imm >> 24) & 0xFF); /* but shifting (u32)imm instead would be strictly portable. */
+  mc->emit_bytes(mc, buf, 6);
+  if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); /* only when mc->debug is set */
+}
+
+/* IMUL r, r/m, imm: 0x6B /r ib (imm8 sext) or 0x69 /r id (imm32 sext).
+ * Both forms are three-operand (dst = src * imm): dst is passed as the
+ * ModRM reg field and src as r/m, so the caller needs no src -> dst copy
+ * beforehand, unlike the ALU imm forms, which read-modify-write one reg. */
+static void emit_imul_imm8(MCEmitter* mc, int w, u32 dst, u32 src, i8 imm) {
+  u32 ofs = obj_pos(mc->obj, mc->section_id); /* taken before emitting; handed to debug_emit_row below */
+  emit_rex(mc, w, dst, 0, src); /* both registers are forwarded for the REX prefix */
+  u8 buf[3];
+  buf[0] = 0x6B; /* opcode: imm8 form */
+  buf[1] = modrm(3u, dst, src); /* ModRM: mod=3, reg=dst, r/m=src */
+  buf[2] = (u8)imm; /* ib: the imm8 byte */
+  mc->emit_bytes(mc, buf, 3);
+  if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); /* only when mc->debug is set */
+}
+static void emit_imul_imm32(MCEmitter* mc, int w, u32 dst, u32 src, i32 imm) { /* 0x69 /r id: dst = src * imm32 */
+  u32 ofs = obj_pos(mc->obj, mc->section_id); /* taken before emitting; handed to debug_emit_row below */
+  emit_rex(mc, w, dst, 0, src); /* both registers are forwarded for the REX prefix */
+  u8 buf[6];
+  buf[0] = 0x69; /* opcode: imm32 form */
+  buf[1] = modrm(3u, dst, src); /* ModRM: mod=3, reg=dst, r/m=src */
+  buf[2] = (u8)(imm & 0xFF); /* id: least-significant byte first */
+  buf[3] = (u8)((imm >> 8) & 0xFF); /* NOTE(review): >> on a negative signed value is implementation-defined in C; */
+  buf[4] = (u8)((imm >> 16) & 0xFF); /* the & 0xFF mask gives the right byte for arithmetic or logical shifts, */
+  buf[5] = (u8)((imm >> 24) & 0xFF); /* but shifting (u32)imm instead would be strictly portable. */
+  mc->emit_bytes(mc, buf, 6);
+  if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); /* only when mc->debug is set */
+}
+
+/* Width predicate: nonzero iff `imm` lies in [-128, 127], i.e. fits the
+ * sign-extended imm8 of the 0x83 (ALU) and 0x6B (IMUL) forms. */
+static int imm_fits_i8(i64 imm) { return imm >= -128 && imm <= 127; }
+/* Width predicate: nonzero iff `imm` fits a signed 32-bit value, as the
+ * 0x81 (ALU) and 0x69 (IMUL) imm32 forms require; for w=1 that imm is
+ * sign-extended to 64. Returns 0 for values outside [INT32_MIN, INT32_MAX];
+ * callers then fall through and materialize the literal into a register. */
+static int imm_fits_i32(i64 imm) {
+  return imm >= -2147483648LL && imm <= 2147483647LL; /* bounds spelled as long long literals */
+}
+
 static void emit_test_self(MCEmitter* mc, int w, u32 reg) {
   emit_alu_rr(mc, w, 0x85, reg, reg);
 }
@@ -1012,10 +1079,18 @@ static u32 force_reg_int(CGTarget* t, Operand op, int w, u32 scratch) {
 
 static void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) {
   int w = type_is_64(a_op.type) ? 1 : 0;
-  if (a_op.kind == OPK_REG && b_op.kind == OPK_IMM && b_op.v.imm >= -128 &&
-      b_op.v.imm <= 127) {
-    emit_cmp_imm8(t->mc, w, a_op.v.reg & 0xFu, (i8)b_op.v.imm);
-    return;
+  /* IMM RHS imm8 / imm32 fast paths. CMP is not commutative across the
+   * cond codes, so IMM-on-LHS still has to materialize. */
+  if (b_op.kind == OPK_IMM && a_op.kind == OPK_REG) {
+    if (imm_fits_i8(b_op.v.imm)) {
+      emit_cmp_imm8(t->mc, w, a_op.v.reg & 0xFu, (i8)b_op.v.imm);
+      return;
+    }
+    if (imm_fits_i32(b_op.v.imm)) {
+      emit_alu_imm32(t->mc, w, /*sub=CMP*/ 7u, a_op.v.reg & 0xFu,
+                     (i32)b_op.v.imm);
+      return;
+    }
   }
   u32 ra = force_reg_int(t, a_op, w, X64_RAX);
   u32 rb = force_reg_int(t, b_op, w, (ra == X64_R11) ? X64_RAX : X64_R11);
@@ -1578,24 +1653,88 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
     return;
   }
 
-  /* Shifts: shift count must be in cl. */
+  /* Shifts: shift count must be in cl OR encoded as imm8 directly (C1
+   * /sub ib). Use the imm form when b is OPK_IMM and skip materializing
+   * into cl. */
   if (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S) {
     u32 ra = force_reg_int(t, a_op, w, X64_RAX);
     if (rd != ra) emit_mov_rr(mc, w, rd, ra);
+    u32 sub = (op == BO_SHL) ? 4u : (op == BO_SHR_U ? 5u : 7u);
+    if (b_op.kind == OPK_IMM) {
+      u32 width = w ? 64u : 32u;
+      emit_shift_imm(mc, w, sub, rd, (u8)((u64)b_op.v.imm & (width - 1u)));
+      return;
+    }
     if (b_op.kind == OPK_REG) {
       u32 rb = b_op.v.reg & 0xFu;
       if (rb != X64_RCX) emit_mov_rr(mc, 0, X64_RCX, rb);
-    } else if (b_op.kind == OPK_IMM) {
-      emit_load_imm(mc, 0, X64_RCX, b_op.v.imm & 0x3f);
     } else {
       compiler_panic(t->c, impl_of(t)->loc,
                      "x64 shift: count kind %d unsupported", (int)b_op.kind);
     }
-    u32 sub = (op == BO_SHL) ? 4u : (op == BO_SHR_U ? 5u : 7u);
     emit_shift_cl(mc, w, sub, rd);
     return;
   }
 
+  /* For commutative ops, canonicalize IMM to the RHS so the imm-form
+   * check below fires uniformly. ISUB is non-commutative — IMM-on-LHS
+   * still materializes. */
+  switch (op) {
+    case BO_IADD:
+    case BO_AND:
+    case BO_OR:
+    case BO_XOR:
+    case BO_IMUL: {
+      if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) {
+        Operand t_op = a_op; a_op = b_op; b_op = t_op;
+      }
+      break;
+    }
+    default: break;
+  }
+
+  /* IMM-form fast paths. For ADD/SUB/AND/OR/XOR the ALU imm encoding
+   * reads-and-writes a single reg — copy ra → dst first, then `dst OP=
+   * imm`. For IMUL the imm form is three-operand (`dst = src * imm`)
+   * and reads from `ra` directly without the prep copy. */
+  if (b_op.kind == OPK_IMM && a_op.kind == OPK_REG &&
+      (op == BO_IADD || op == BO_ISUB || op == BO_AND || op == BO_OR ||
+       op == BO_XOR || op == BO_IMUL)) {
+    i64 imm = b_op.v.imm;
+    u32 ra = a_op.v.reg & 0xFu;
+    if (op == BO_IMUL) {
+      if (imm_fits_i8(imm)) {
+        emit_imul_imm8(mc, w, rd, ra, (i8)imm);
+        return;
+      }
+      if (imm_fits_i32(imm)) {
+        emit_imul_imm32(mc, w, rd, ra, (i32)imm);
+        return;
+      }
+    } else {
+      u32 sub;
+      switch (op) {
+        case BO_IADD: sub = 0u; break;
+        case BO_OR:   sub = 1u; break;
+        case BO_AND:  sub = 4u; break;
+        case BO_ISUB: sub = 5u; break;
+        case BO_XOR:  sub = 6u; break;
+        default:      sub = 0u; break; /* unreachable */
+      }
+      if (imm_fits_i8(imm)) {
+        if (rd != ra) emit_mov_rr(mc, w, rd, ra);
+        emit_alu_imm8(mc, w, sub, rd, (i8)imm);
+        return;
+      }
+      if (imm_fits_i32(imm)) {
+        if (rd != ra) emit_mov_rr(mc, w, rd, ra);
+        emit_alu_imm32(mc, w, sub, rd, (i32)imm);
+        return;
+      }
+    }
+    /* Fall through to materialize for >32-bit literals. */
+  }
+
   /* Generic 2-operand ALU: copy ra → dst, then dst op= rb. */
   u32 ra = force_reg_int(t, a_op, w, X64_RAX);
   if (rd != ra) emit_mov_rr(mc, w, rd, ra);
@@ -1617,10 +1756,11 @@ static void x_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) {
   MCEmitter* mc = t->mc;
   int w = type_is_64(dst.type) ? 1 : 0;
   u32 rd = dst.v.reg & 0xFu;
-  u32 ra = a_op.v.reg & 0xFu;
-  if (a_op.kind != OPK_REG)
-    compiler_panic(t->c, impl_of(t)->loc,
-                   "x64 unop: non-REG operand not supported");
+  /* IMM operand is legal per the CGTarget contract (arch.h); materialize
+   * into a scratch register when not already a register. cg folds
+   * literal unops upstream (cg_fold_unop), so this path is reached only
+   * when opt's emit hands us an unfolded literal. */
+  u32 ra = force_reg_int(t, a_op, w, X64_R11);
   switch (op) {
     case UO_NEG:
       if (rd != ra) emit_mov_rr(mc, w, rd, ra);

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/rv64.c	\|	80	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M	src/arch/x64.c	\|	164	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------