cg: pre-SSA peephole — constant fold + aarch64 imm-form encodings - kit

commit edb863ae6bdc7fba48ca788ced88e943c92ee280
parent 91a660f11818a3d87a1e8510360b843f670efa01
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon, 11 May 2026 12:29:17 -0700

cg: pre-SSA peephole — constant fold + aarch64 imm-form encodings

Adds a pure fold helper (Tier 1+2) and lets the value-stack pass
OPK_IMM operands through to backend binop/unop/cmp instead of always
force_reg'ing them.  aarch64 picks the imm-form encoding when the
literal fits: add/sub imm12 (with sh), cmp imm12, AND/ORR/EOR bitmask,
and LSL/LSR/ASR via UBFM/SBFM; misses fall back to the existing
shifted-register path through force_reg_int.

Why: avoids burning a value-stack register on x+const sites and emits
one fewer instruction per matched pattern.  The REG|IMM contract on
binop/unop/cmp is now documented in arch.h so opt's eventual machinize
+ opt_emit reuses the same backend path without new wiring.

Diffstat:
M src/arch/aa64_isa.h  | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/arch/aarch64.c  | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
M src/arch/arch.h  | 19 ++++++++++++++-----
M src/cg/cg.c  | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
A src/cg/fold.c  | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/cg/fold.h  | 47 +++++++++++++++++++++++++++++++++++++++++++++++

6 files changed, 569 insertions(+), 24 deletions(-)
diff --git a/src/arch/aa64_isa.h b/src/arch/aa64_isa.h
@@ -177,6 +177,147 @@ static inline u32 aa64_mvn(u32 sf, u32 Rd, u32 Rm) {
 }
 
 /* ====================================================================
+ * Logical, immediate (AND / ORR / EOR / ANDS, bitmask-imm form)
+ *   sf  opc(2)  100100  N(1)  immr(6)  imms(6)  Rn(5)  Rd(5)
+ *   31  30..29  28..23  22    21..16   15..10   9..5   4..0
+ *
+ * N:immr:imms encodes a repeated-pattern bitmask. The encoder
+ * aa64_logimm_encode below computes those fields from a literal value;
+ * this pack just lays the bits out. For 32-bit ops (sf=0), N must be 0;
+ * for 64-bit ops N can be 0 or 1 and selects whether the pattern
+ * element is 64 bits (N=1) or 2..32 bits (N=0).
+ * ==================================================================== */
+
+#define AA64_LOGIMM_FAMILY_MATCH 0x12000000u
+#define AA64_LOGIMM_FAMILY_MASK 0x1F800000u /* bits 28:23 */
+
+typedef struct AA64LogImm {
+  u32 sf, opc, N, immr, imms, Rn, Rd;
+} AA64LogImm;
+
+static inline u32 aa64_logimm_pack(AA64LogImm f) {
+  u32 reg_bits = (f.Rd & 0x1fu) | ((f.Rn & 0x1fu) << 5);
+  u32 imm_bits = ((f.imms & 0x3fu) << 10) | ((f.immr & 0x3fu) << 16) | ((f.N & 1u) << 22);
+  return ((f.sf & 1u) << 31) | ((f.opc & 3u) << 29) | AA64_LOGIMM_FAMILY_MATCH | imm_bits | reg_bits;
+}
+
+static inline u32 aa64_and_imm(u32 sf, u32 Rd, u32 Rn, u32 N, u32 immr,
+                               u32 imms) {
+  /* AND (immediate): caller supplies pre-encoded N/immr/imms fields. */
+  AA64LogImm fields = {
+      .sf = sf, .opc = AA64_LOG_AND_OPC,
+      .N = N, .immr = immr, .imms = imms,
+      .Rn = Rn, .Rd = Rd,
+  };
+  return aa64_logimm_pack(fields);
+}
+static inline u32 aa64_orr_imm(u32 sf, u32 Rd, u32 Rn, u32 N, u32 immr,
+                               u32 imms) {
+  /* ORR (immediate): caller supplies pre-encoded N/immr/imms fields. */
+  AA64LogImm fields = {
+      .sf = sf, .opc = AA64_LOG_ORR_OPC,
+      .N = N, .immr = immr, .imms = imms,
+      .Rn = Rn, .Rd = Rd,
+  };
+  return aa64_logimm_pack(fields);
+}
+static inline u32 aa64_eor_imm(u32 sf, u32 Rd, u32 Rn, u32 N, u32 immr,
+                               u32 imms) {
+  /* EOR (immediate): caller supplies pre-encoded N/immr/imms fields. */
+  AA64LogImm fields = {
+      .sf = sf, .opc = AA64_LOG_EOR_OPC,
+      .N = N, .immr = immr, .imms = imms,
+      .Rn = Rn, .Rd = Rd,
+  };
+  return aa64_logimm_pack(fields);
+}
+
+/* Bitmask-immediate predicate + encoder. Returns 1 and writes N/immr/imms
+ * if `imm` is encodable as an AArch64 logical immediate of width
+ * (sf ? 64 : 32); returns 0 otherwise (caller materializes into a
+ * scratch and uses the shifted-register form).
+ *
+ * Algorithm (inverse of ARM ARM "DecodeBitMasks"): an encodable value
+ * is a non-zero, non-all-ones bitmask made of a repeated `size`-bit
+ * element (size ∈ {2,4,8,16,32,64}); within one element the pattern is
+ * a rotation of (0…0 1…1). Find size by detecting the smallest
+ * repeating period; find the right-rotation r that brings the 1-run to
+ * the LSB. The decoder computes ROR(ones, immr), so immr is the inverse
+ * rotation (size - r) mod size; imms packs size and ones-count-1. */
+static inline int aa64_logimm_encode(u64 imm, u32 sf, u32 *N_out,
+                                     u32 *immr_out, u32 *imms_out) {
+  if (!sf) {
+    u64 lo = imm & 0xFFFFFFFFu;
+    u64 hi = imm >> 32;
+    if (hi != 0 && hi != lo) return 0;
+    imm = lo | (lo << 32);
+  }
+  if (imm == 0 || imm == ~(u64)0) return 0;
+
+  u32 size = 64;
+  for (u32 s = 32; s >= 2; s >>= 1) {
+    u64 mask = ((u64)1 << s) - 1u;
+    if ((imm & mask) != ((imm >> s) & mask)) break;
+    size = s;
+  }
+  u64 elt_mask = (size == 64) ? ~(u64)0 : (((u64)1 << size) - 1u);
+  u64 elt = imm & elt_mask;
+  if (elt == 0 || elt == elt_mask) return 0;
+
+  u32 ones = 0;
+  for (u64 x = elt; x; x >>= 1) ones += (u32)(x & 1u);
+  if (ones == 0 || ones >= size) return 0;
+
+  u64 aligned = ((u64)1 << ones) - 1u;
+  u32 rotation = 0xFFFFFFFFu;
+  for (u32 r = 0; r < size; r++) {
+    u64 rotated = r == 0 ? elt : (((elt >> r) | (elt << (size - r))) & elt_mask);
+    if (rotated == aligned) { rotation = r; break; }
+  }
+  if (rotation == 0xFFFFFFFFu) return 0;
+
+  if (size == 64) {
+    *N_out = 1u;
+    *imms_out = (ones - 1u) & 0x3Fu;
+  } else {
+    *N_out = 0u;
+    u32 neg_size_shl1 = ((u32)(-(i32)size) << 1) & 0x3Fu;
+    *imms_out = neg_size_shl1 | ((ones - 1u) & 0x3Fu);
+  }
+  *immr_out = (size - rotation) & (size - 1u); /* undo the ROR found above */
+  return 1;
+}
+
+/* (immr, imms) generators for shift-by-immediate LSL/LSR/ASR, which are
+ * UBFM/SBFM aliases. Each returns 0 unless shift < width; callers feed
+ * the fields to the aa64_ubfm / aa64_sbfm encoders (see aarch64.c). */
+static inline int aa64_lsl_imm_fields(u32 shift, u32 sf, u32 *immr_out,
+                                      u32 *imms_out) {
+  /* LSL #s: immr = (-s mod width), imms = width - 1 - s. */
+  u32 top_bit = sf ? 63u : 31u;
+  if (shift > top_bit) return 0;
+  *immr_out = (top_bit + 1u - shift) & top_bit;
+  *imms_out = top_bit - shift;
+  return 1;
+}
+static inline int aa64_lsr_imm_fields(u32 shift, u32 sf, u32 *immr_out,
+                                      u32 *imms_out) {
+  u32 top_bit = sf ? 63u : 31u;
+  if (shift > top_bit) return 0;
+  *imms_out = top_bit; /* LSR #s: immr = s, imms = width - 1 */
+  *immr_out = shift;
+  return 1;
+}
+static inline int aa64_asr_imm_fields(u32 shift, u32 sf, u32 *immr_out,
+                                      u32 *imms_out) {
+  u32 top_bit = sf ? 63u : 31u;
+  if (shift > top_bit) return 0;
+  *imms_out = top_bit; /* ASR #s: immr = s, imms = width - 1 */
+  *immr_out = shift;
+  return 1;
+}
+
+/* ====================================================================
  * Add/Sub, shifted register (ADD / SUB / ADDS / SUBS)
  *   sf  op(1)  S(1)  01011  shift(2)  0  Rm(5)  imm6(6)  Rn(5)  Rd(5)
  *   31  30     29    28..24 23..22    21 20..16 15..10   9..5   4..0
@@ -460,6 +601,33 @@ static inline u32 aa64_sub_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh) {
   return aa64_addsubimm_pack((AA64AddSubImm){
       .sf = sf, .op = 1, .sh = sh, .imm12 = imm12, .Rn = Rn, .Rd = Rd});
 }
+/* SUBS imm — sets flags. Used for CMP imm (Rd=ZR) and for branchless
+ * compares that feed CSET. The 12-bit-shifted form covers 0..0xFFF000
+ * stepped by 0x1000; cg_fold collapses literal-only compares upstream,
+ * so this encoder is reached for `x cmp const` and `if (x)` patterns. */
+static inline u32 aa64_subs_imm12(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh) {
+  return aa64_addsubimm_pack((AA64AddSubImm){
+      .sf = sf, .op = 1, .S = 1, .sh = sh, .imm12 = imm12, .Rn = Rn, .Rd = Rd});
+}
+
+/* Can `imm` ride in the 12-bit immediate of ADD/SUB/CMP, either plain
+ * or left-shifted by 12? Yes: store the field in *imm12_out, the shift
+ * flag in *sh_out, return 1. No: return 0 with both outputs untouched.
+ * Encodable values are 0..4095 (sh=0) and multiples of 4096 up to
+ * 0xFFF000 (sh=1); negatives are refused (caller may flip ADD <-> SUB). */
+static inline int aa64_addsub_imm_fits(i64 imm, u32 *imm12_out, u32 *sh_out) {
+  if (imm < 0) return 0;
+  u64 value = (u64)imm;
+  u64 low12 = value & 0xFFFu;
+  u64 upper = value >> 12;
+  u32 shifted;
+  if (upper == 0) shifted = 0u;
+  else if (low12 == 0 && upper <= 0xFFFu) shifted = 1u;
+  else return 0;
+  *imm12_out = (u32)(shifted ? upper : low12);
+  *sh_out = shifted;
+  return 1;
+}
 
 /* ====================================================================
  * Load/store, unsigned 12-bit immediate offset (LDR / STR, scaled)
diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c
@@ -1068,16 +1068,22 @@ static u32 cmp_to_cond(CmpOp op) {
   }
 }
 
-/* Emit CMP a, b (= SUBS ZR, a, b). Materializes IMM operands through
- * scratch x9/x10. Width comes from `a`; signedness lives in the cond. */
+/* Emit CMP a, b (= SUBS ZR, a, b). Uses the 12-bit-imm form when `b` is
+ * an OPK_IMM that fits; otherwise materializes through scratch x9/x10
+ * and uses the shifted-register form. CMP is not commutative across the
+ * condition codes, so an IMM-on-LHS still materializes (the caller has
+ * to swap the cond if it wants to swap the operands). Width comes from
+ * `a`; signedness lives in the cond. */
 static void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) {
   MCEmitter* mc = t->mc;
   u32 sf = type_is_64(a_op.type) ? 1u : 0u;
-  /* Special-case CMP Rn, #0 so a literal zero compare doesn't need
-   * a scratch register. */
-  if (b_op.kind == OPK_IMM && b_op.v.imm == 0 && a_op.kind == OPK_REG) {
-    emit32(mc, aa64_subs_imm(sf, /*Rd=ZR*/ 31u, reg_num(a_op), 0));
-    return;
+  if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
+    u32 imm12, sh;
+    if (aa64_addsub_imm_fits(b_op.v.imm, &imm12, &sh)) {
+      u32 rn = force_reg_int(t, a_op, sf, 9);
+      emit32(mc, aa64_subs_imm12(sf, /*Rd=ZR*/ 31u, rn, imm12, sh));
+      return;
+    }
   }
   u32 rn = force_reg_int(t, a_op, sf, 9);
   u32 rm = force_reg_int(t, b_op, sf, (rn == 9) ? 10u : 9u);
@@ -1838,9 +1844,100 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
 
   u32 sf = type_is_64(dst.type) ? 1u : 0u;
   u32 rd = reg_num(dst);
+
+  /* Imm-form fast paths. For commutative ops (ADD/AND/OR/XOR), if the
+   * LHS is the IMM swap to canonicalize (REG, IMM) and try to encode.
+   * For SUB we don't swap — `SUB imm, reg` has no encoding without
+   * materializing. Shifts take the imm as the count and require RHS-IMM
+   * by definition. Anything that doesn't fit the encoding falls through
+   * to force_reg_int + the shifted-register form, preserving the old
+   * behavior. */
+  u32 word;
+  switch (op) {
+    case BO_IADD:
+    case BO_AND:
+    case BO_OR:
+    case BO_XOR: {
+      if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) {
+        Operand t_op = a_op; a_op = b_op; b_op = t_op;
+      }
+      break;
+    }
+    default: break;
+  }
+
+  /* Try the imm-form before materializing. Each case sets `word` and
+   * jumps to emit; misses fall through to the reg path below. */
+  if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
+    u32 rn_reg = reg_num(a_op);
+    i64 imm = b_op.v.imm;
+    u32 imm12, sh, N, immr, imms;
+    switch (op) {
+      case BO_IADD:
+        if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
+          emit32(mc, aa64_add_imm(sf, rd, rn_reg, imm12, sh));
+          return;
+        }
+        break;
+      case BO_ISUB:
+        if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
+          emit32(mc, aa64_sub_imm(sf, rd, rn_reg, imm12, sh));
+          return;
+        }
+        break;
+      case BO_AND:
+        if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
+          emit32(mc, aa64_and_imm(sf, rd, rn_reg, N, immr, imms));
+          return;
+        }
+        break;
+      case BO_OR:
+        if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
+          emit32(mc, aa64_orr_imm(sf, rd, rn_reg, N, immr, imms));
+          return;
+        }
+        break;
+      case BO_XOR:
+        if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
+          emit32(mc, aa64_eor_imm(sf, rd, rn_reg, N, immr, imms));
+          return;
+        }
+        break;
+      case BO_SHL: {
+        /* C shifts by ≥ width are UB but we don't exploit it; mask the
+         * count to width-1 to match the variable-shift behavior. */
+        u32 width = sf ? 64u : 32u;
+        u32 sh_amt = (u32)((u64)imm & (width - 1u));
+        if (aa64_lsl_imm_fields(sh_amt, sf, &immr, &imms)) {
+          emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
+          return;
+        }
+        break;
+      }
+      case BO_SHR_U: {
+        u32 width = sf ? 64u : 32u;
+        u32 sh_amt = (u32)((u64)imm & (width - 1u));
+        if (aa64_lsr_imm_fields(sh_amt, sf, &immr, &imms)) {
+          emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
+          return;
+        }
+        break;
+      }
+      case BO_SHR_S: {
+        u32 width = sf ? 64u : 32u;
+        u32 sh_amt = (u32)((u64)imm & (width - 1u));
+        if (aa64_asr_imm_fields(sh_amt, sf, &immr, &imms)) {
+          emit32(mc, aa64_sbfm(sf, rd, rn_reg, immr, imms));
+          return;
+        }
+        break;
+      }
+      default: break;
+    }
+  }
+
   u32 rn = force_reg_int(t, a_op, sf, 9);
   u32 rm = force_reg_int(t, b_op, sf, (rn == 9) ? 10 : 9);
-  u32 word;
 
   switch (op) {
     case BO_IADD:
@@ -1900,14 +1997,14 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) {
   MCEmitter* mc = t->mc;
   u32 sf = type_is_64(dst.type) ? 1u : 0u;
   u32 rd = reg_num(dst);
-  u32 rn = reg_num(a_op);
+  /* OPK_IMM is legal per the CGTarget contract (arch.h); force_reg_int
+   * materializes into x9 when the operand isn't already a register.
+   * cg folds literal unops upstream (cg_fold_unop), so the IMM path
+   * here is only reached from opt's emit when the IR carries an
+   * unfolded literal — still a contract case we must honor. */
+  u32 rn = force_reg_int(t, a_op, sf, 9);
   u32 word;
 
-  if (a_op.kind != OPK_REG) {
-    compiler_panic(t->c, impl_of(t)->loc,
-                   "aarch64 unop: non-REG operand not yet supported");
-  }
-
   switch (op) {
     case UO_NEG:
       word = aa64_neg(sf, rd, rn);
diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -530,11 +530,20 @@ struct CGTarget {
   void (*bitfield_store)(CGTarget*, Operand record_addr,
                          Operand src /*REG|IMM*/, BitFieldAccess);
 
-  /* ---- arithmetic, compare, convert ---- */
-  void (*binop)(CGTarget*, BinOp, Operand dst, Operand a, Operand b);
-  void (*unop)(CGTarget*, UnOp, Operand dst, Operand a);
-  void (*cmp)(CGTarget*, CmpOp, Operand dst, Operand a,
-              Operand b); /* materialize 0/1 */
+  /* ---- arithmetic, compare, convert ----
+   * binop/unop/cmp accept OPK_REG or OPK_IMM in source operand positions
+   * (`a`, `b`); `dst` is always OPK_REG. The backend chooses between an
+   * imm-form encoding and materializing the literal into a scratch
+   * register based on whether the value fits the instruction's imm
+   * field. FP ops require REG sources — FP literals reach the value
+   * stack through load_const into OPK_REG. cg and opt's machinize/emit
+   * both rely on this contract to pass small constants through without
+   * burning a value-stack register on materialization. */
+  void (*binop)(CGTarget*, BinOp, Operand dst /*REG*/,
+                Operand a /*REG|IMM*/, Operand b /*REG|IMM*/);
+  void (*unop)(CGTarget*, UnOp, Operand dst /*REG*/, Operand a /*REG|IMM*/);
+  void (*cmp)(CGTarget*, CmpOp, Operand dst /*REG*/,
+              Operand a /*REG|IMM*/, Operand b /*REG|IMM*/); /* materialize 0/1 */
   void (*convert)(CGTarget*, ConvKind, Operand dst, Operand src);
 
   /* ---- calls / return ----
diff --git a/src/cg/cg.c b/src/cg/cg.c
@@ -41,6 +41,7 @@
 
 #include "abi/abi.h"
 #include "arch/arch.h"
+#include "cg/fold.h"
 #include "core/arena.h"
 #include "core/core.h"
 #include "core/heap.h"
@@ -891,6 +892,12 @@ void cg_bitfield_store(CG* g, BitFieldAccess b) {
  * Arithmetic / compare / convert
  * ============================================================ */
 
+/* Like force_reg, but leaves an OPK_IMM SValue alone — the CGTarget
+ * contract for binop/unop/cmp accepts IMM sources, so we avoid burning
+ * a value-stack register on `x + 3` style sites. The backend decides
+ * imm-form vs. materialize per the literal's width. */
+static Operand force_reg_unless_imm(CG* g, SValue* v, const Type* ty);
+
 /* Force an SValue (already popped, by reference) into a register operand
  * of the given type. Mutates `*v` so that v->op is OPK_REG and v->res is
  * RES_REG; on lvalue inputs this means the original lvalue's base reg is
@@ -920,6 +927,11 @@ static Operand force_reg(CG* g, SValue* v, const Type* ty) {
   return dst;
 }
 
+static Operand force_reg_unless_imm(CG* g, SValue* v, const Type* ty) {
+  /* An IMM operand is handed back as-is; anything else goes to a reg. */
+  return (v->op.kind == OPK_IMM) ? v->op : force_reg(g, v, ty);
+}
+
 void cg_binop(CG* g, BinOp op) {
   /* stack: [a, b] → [a OP b] */
   SValue b = pop(g);
@@ -927,8 +939,37 @@ void cg_binop(CG* g, BinOp op) {
   CGTarget* T = g->target;
   /* Result type is `a`'s type at this slice (parser already coerced). */
   const Type* ty = a.type ? a.type : b.type;
-  Operand ra = force_reg(g, &a, ty);
-  Operand rb = force_reg(g, &b, ty);
+
+  /* Tier 1+2: constant-fold or apply algebraic identities via the
+   * pure fold helper. KEEP_A/KEEP_B re-push the non-constant operand
+   * unchanged after releasing the IMM side (IMM carries no reg/slot
+   * obligation, but the helper is symmetric and a no-op release is
+   * cheap). */
+  {
+    Operand folded;
+    switch (cg_fold_binop(op, a.op, b.op, ty, g->abi, &folded)) {
+      case CG_FOLD_IMM:
+        release(g, &a);
+        release(g, &b);
+        push(g, make_sv(folded, ty));
+        return;
+      case CG_FOLD_KEEP_A:
+        release(g, &b);
+        push(g, a);
+        return;
+      case CG_FOLD_KEEP_B:
+        release(g, &a);
+        push(g, b);
+        return;
+      case CG_FOLD_NONE: break;
+    }
+  }
+
+  /* IMM sources are legal per the binop contract (arch.h) — the backend
+   * picks imm-form vs. materialize. Both operands may still be IMM here:
+   * cg_fold_binop does not fold literal div/rem/shift pairs. */
+  Operand ra = force_reg_unless_imm(g, &a, ty);
+  Operand rb = force_reg_unless_imm(g, &b, ty);
   Reg rr = alloc_reg_or_spill(g, type_class(ty), ty);
   Operand dst = op_reg(rr, ty);
   T->binop(T, op, dst, ra, rb);
@@ -941,7 +982,17 @@ void cg_unop(CG* g, UnOp op) {
   SValue a = pop(g);
   CGTarget* T = g->target;
   const Type* ty = a.type ? a.type : a.op.type;
-  Operand ra = force_reg(g, &a, ty);
+
+  {
+    Operand folded;
+    if (cg_fold_unop(op, a.op, ty, g->abi, &folded) == CG_FOLD_IMM) {
+      release(g, &a);
+      push(g, make_sv(folded, ty));
+      return;
+    }
+  }
+
+  Operand ra = force_reg_unless_imm(g, &a, ty);
   Reg rr = alloc_reg_or_spill(g, type_class(ty), ty);
   Operand dst = op_reg(rr, ty);
   T->unop(T, op, dst, ra);
@@ -956,8 +1007,19 @@ void cg_cmp(CG* g, CmpOp op) {
   CGTarget* T = g->target;
   const Type* opty = a.type ? a.type : b.type;
   const Type* i32 = type_prim(g->pool, TY_INT);
-  Operand ra = force_reg(g, &a, opty);
-  Operand rb = force_reg(g, &b, opty);
+
+  {
+    Operand folded;
+    if (cg_fold_cmp(op, a.op, b.op, i32, g->abi, &folded) == CG_FOLD_IMM) {
+      release(g, &a);
+      release(g, &b);
+      push(g, make_sv(folded, i32));
+      return;
+    }
+  }
+
+  Operand ra = force_reg_unless_imm(g, &a, opty);
+  Operand rb = force_reg_unless_imm(g, &b, opty);
   Reg rr = alloc_reg_or_spill(g, RC_INT, i32);
   Operand dst = op_reg(rr, i32);
   T->cmp(T, op, dst, ra, rb);
@@ -1458,6 +1520,14 @@ void cg_branch_true(CG* g, CGLabel l) {
   SValue v = pop(g);
   CGTarget* T = g->target;
   const Type* ty = v.type ? v.type : type_prim(g->pool, TY_INT);
+  /* Mirror cg_branch_false: a literal condition resolves at compile time. */
+  if (v.op.kind == OPK_IMM) {
+    if (v.op.v.imm != 0) {
+      T->jump(T, (Label)l);
+    }
+    release(g, &v);
+    return;
+  }
   Operand a = force_reg(g, &v, ty);
   Operand zero = op_imm(0, ty);
   T->cmp_branch(T, CMP_NE, a, zero, (Label)l);
diff --git a/src/cg/fold.c b/src/cg/fold.c
@@ -0,0 +1,154 @@
+#include "cg/fold.h"
+
+/* Clamp a folded 64-bit result to the width of `ty`: bits above the
+ * type's size are discarded and, for signed types, refilled with the
+ * sign bit. Values pass through untouched when `ty` or `abi` is NULL
+ * or the type is 8 bytes or wider. */
+static i64 narrow(TargetABI* abi, const Type* ty, i64 v) {
+  if (!ty || !abi) return v;
+  u32 bytes = abi_sizeof(abi, ty);
+  if (bytes >= 8) return v;
+  u32 bits = bytes * 8;
+  u64 keep = ((u64)1 << bits) - 1;
+  u64 low = (u64)v & keep;
+  if (abi_type_info(abi, ty).signed_ && ((low >> (bits - 1)) & 1u)) {
+    low |= ~keep; /* replicate the sign bit upward */
+  }
+  return (i64)low;
+}
+
+static Operand make_imm(i64 v, const Type* ty) { /* integer-class literal */
+  Operand lit;
+  lit.v.imm = v;
+  lit.type = ty;
+  lit.kind = OPK_IMM;
+  lit.cls = RC_INT;
+  lit.pad = 0;
+  return lit;
+}
+
+/* Evaluates `a op b` for two integer literals. Stores the result in
+ * *out and returns 1 for the foldable ops (add, sub, mul, and, or,
+ * xor), computed on u64 so overflow wraps; returns 0 with *out untouched
+ * for everything else. Division/remainder, shifts, and float ops are
+ * left to the backend and never folded here. */
+static int literal_binop(BinOp op, i64 a, i64 b, i64* out) {
+  u64 x = (u64)a, y = (u64)b;
+  if (op == BO_IADD)      *out = (i64)(x + y);
+  else if (op == BO_ISUB) *out = (i64)(x - y);
+  else if (op == BO_IMUL) *out = (i64)(x * y);
+  else if (op == BO_AND)  *out = (i64)(x & y);
+  else if (op == BO_OR)   *out = (i64)(x | y);
+  else if (op == BO_XOR)  *out = (i64)(x ^ y);
+  else return 0;
+  return 1;
+}
+
+/* Algebraic-identity dispatch for an integer binop with one literal
+ * operand. `k` is the constant; `k_on_right` distinguishes lhs vs rhs
+ * for non-commutative ops (ISUB, SHL, SHR_*, SDIV, UDIV).
+ *   FID_NONE  — no identity, caller emits normally.
+ *   FID_KEEP  — drop the IMM, the non-constant operand is the result.
+ *   FID_ZERO  — result is constant 0; drop both operands. */
+typedef enum FoldIdent { FID_NONE, FID_KEEP, FID_ZERO } FoldIdent;
+
+static FoldIdent identity_for(BinOp op, i64 k, int k_on_right) {
+  switch (op) {
+    case BO_IADD: case BO_OR: case BO_XOR:
+      /* x + 0, 0 + x, x | 0, 0 | x, x ^ 0, 0 ^ x */
+      return (k == 0) ? FID_KEEP : FID_NONE;
+    case BO_ISUB:
+      /* x - 0 only; 0 - x needs a UO_NEG and isn't an identity. */
+      return (k == 0 && k_on_right) ? FID_KEEP : FID_NONE;
+    case BO_IMUL:
+      if (k == 1) return FID_KEEP;
+      if (k == 0) return FID_ZERO;
+      return FID_NONE;
+    case BO_AND:
+      if (k == 0) return FID_ZERO;
+      /* All-ones mask of any width sign-extends to -1 as i64. */
+      if (k == -1) return FID_KEEP;
+      return FID_NONE;
+    case BO_SDIV: case BO_UDIV:
+      /* x / 1 only; 1 / x isn't an identity, so a literal on the lhs
+       * gives no useful fold. */
+      return (k == 1 && k_on_right) ? FID_KEEP : FID_NONE;
+    case BO_SHL: case BO_SHR_S: case BO_SHR_U:
+      /* x << 0, x >> 0 only. A literal 0 on the lhs is the value being
+       * shifted, not the count; folding 0 << x to 0 would mean dropping
+       * the rhs operand, which is deferred until a use exists. */
+      return (k == 0 && k_on_right) ? FID_KEEP : FID_NONE;
+    default:
+      return FID_NONE;
+  }
+}
+
+CGFoldKind cg_fold_binop(BinOp op, Operand a, Operand b, const Type* ty,
+                         TargetABI* abi, Operand* out) {
+  const int a_is_lit = (a.kind == OPK_IMM);
+  const int b_is_lit = (b.kind == OPK_IMM);
+
+  /* Tier 1: two literals collapse into one IMM, narrowed to `ty`. Ops
+   * literal_binop declines fall through to the identity checks. */
+  if (a_is_lit && b_is_lit) {
+    i64 value;
+    if (literal_binop(op, a.v.imm, b.v.imm, &value)) {
+      *out = make_imm(narrow(abi, ty, value), ty);
+      return CG_FOLD_IMM;
+    }
+  }
+
+  /* Tier 2: algebraic identities, trying the rhs literal before the
+   * lhs one. Nothing is emitted or released here; the caller drops
+   * whichever operands the returned kind says are dead. */
+  if (b_is_lit) {
+    FoldIdent rhs = identity_for(op, b.v.imm, /*k_on_right=*/1);
+    if (rhs == FID_KEEP) return CG_FOLD_KEEP_A;
+    if (rhs == FID_ZERO) { *out = make_imm(0, ty); return CG_FOLD_IMM; }
+  }
+  if (a_is_lit) {
+    FoldIdent lhs = identity_for(op, a.v.imm, /*k_on_right=*/0);
+    if (lhs == FID_KEEP) return CG_FOLD_KEEP_B;
+    if (lhs == FID_ZERO) { *out = make_imm(0, ty); return CG_FOLD_IMM; }
+  }
+  return CG_FOLD_NONE;
+}
+
+CGFoldKind cg_fold_unop(UnOp op, Operand a, const Type* ty,
+                        TargetABI* abi, Operand* out) {
+  /* Only a literal operand folds. Negation is done on u64 so it wraps;
+   * the result is re-narrowed to `ty` before being returned in *out. */
+  if (a.kind != OPK_IMM) return CG_FOLD_NONE;
+  i64 lit = a.v.imm;
+  i64 result;
+  if (op == UO_NEG) result = (i64)(0u - (u64)lit);
+  else if (op == UO_BNOT) result = ~lit;
+  else if (op == UO_NOT) result = !lit;
+  else return CG_FOLD_NONE;
+  *out = make_imm(narrow(abi, ty, result), ty);
+  return CG_FOLD_IMM;
+}
+
+CGFoldKind cg_fold_cmp(CmpOp op, Operand a, Operand b, const Type* int_ty,
+                       TargetABI* abi, Operand* out) {
+  (void)abi; /* the 0/1 result already fits `int`; nothing to narrow */
+  if (!(a.kind == OPK_IMM && b.kind == OPK_IMM)) return CG_FOLD_NONE;
+  i64 sl = a.v.imm, sr = b.v.imm; /* signed view of both literals */
+  u64 ul = (u64)sl, ur = (u64)sr; /* unsigned view of the same bits */
+  int truth;
+  switch (op) {
+    case CMP_EQ:   truth = (sl == sr); break;
+    case CMP_NE:   truth = (sl != sr); break;
+    case CMP_LT_S: truth = (sl <  sr); break;
+    case CMP_LE_S: truth = (sl <= sr); break;
+    case CMP_GT_S: truth = (sl >  sr); break;
+    case CMP_GE_S: truth = (sl >= sr); break;
+    case CMP_LT_U: truth = (ul <  ur); break;
+    case CMP_LE_U: truth = (ul <= ur); break;
+    case CMP_GT_U: truth = (ul >  ur); break;
+    case CMP_GE_U: truth = (ul >= ur); break;
+    default: return CG_FOLD_NONE;
+  }
+  *out = make_imm(truth, int_ty);
+  return CG_FOLD_IMM;
+}
diff --git a/src/cg/fold.h b/src/cg/fold.h
@@ -0,0 +1,47 @@
+#ifndef CFREE_CG_FOLD_H
+#define CFREE_CG_FOLD_H
+
+/* Pure constant-folding and algebraic-identity helpers for binop/unop/cmp
+ * on Operand inputs. No CG or IR state: callers (cg.c today, opt's
+ * pass_gvn / pass_combine eventually) inspect the result and apply it
+ * to whichever value representation they hold. All folds are restricted
+ * to integer domain — float ops never reach the OPK_IMM path (FP
+ * literals materialize via load_const to OPK_REG before they enter the
+ * value stack). Division, remainder, shifts, and FP arithmetic are
+ * deliberately excluded from the literal-fold paths to preserve trap
+ * semantics and rounding behavior; algebraic identities on those ops
+ * are limited to cases that don't depend on UB-exploiting transforms
+ * (see doc/OPT.md §5.5). */
+
+#include "abi/abi.h"
+#include "arch/arch.h"
+#include "type/type.h"
+
+typedef enum CGFoldKind {
+  CG_FOLD_NONE,   /* no fold; caller emits normally */
+  CG_FOLD_IMM,    /* result is the OPK_IMM Operand in *out; drop both inputs */
+  CG_FOLD_KEEP_A, /* result is `a` unchanged; drop `b` */
+  CG_FOLD_KEEP_B, /* result is `b` unchanged; drop `a` */
+} CGFoldKind;
+
+/* Binop fold + identity. Examines `a` and `b` against `op`:
+ *   - both OPK_IMM        → CG_FOLD_IMM, *out = literal narrowed to `ty`
+ *   - one OPK_IMM identity → CG_FOLD_KEEP_A or _B, or CG_FOLD_IMM (zero)
+ *   - otherwise            → CG_FOLD_NONE
+ * `ty` is the result type (used for width-narrowing on the fold path).
+ * `abi` supplies size/signedness; required when ty is non-NULL. */
+CGFoldKind cg_fold_binop(BinOp op, Operand a, Operand b, const Type* ty,
+                         TargetABI* abi, Operand* out);
+
+/* Unop fold. Returns CG_FOLD_IMM with *out set on success, CG_FOLD_NONE
+ * otherwise. Only integer-domain unops are folded. */
+CGFoldKind cg_fold_unop(UnOp op, Operand a, const Type* ty,
+                        TargetABI* abi, Operand* out);
+
+/* Integer-compare fold. Returns CG_FOLD_IMM with *out set to 0 or 1 of
+ * type `int_ty` on success. FP compares (CMP_*_F) return CG_FOLD_NONE —
+ * NaN/ordering belongs to the backend. */
+CGFoldKind cg_fold_cmp(CmpOp op, Operand a, Operand b, const Type* int_ty,
+                       TargetABI* abi, Operand* out);
+
+#endif

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/aa64_isa.h	\|	168	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/arch/aarch64.c	\|	125	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
M	src/arch/arch.h	\|	19	++++++++++++++-----
M	src/cg/cg.c	\|	80	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
A	src/cg/fold.c	\|	154	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/cg/fold.h	\|	47	+++++++++++++++++++++++++++++++++++++++++++++++