cg: collapse INTRIN_BSWAP16/32/64 into one width-by-type BSWAP (Track 4a) - kit

commit 52897e0145b4e42aa1cfd85e10839ed370938ba1
parent d08e794c51bffbb4075e83df6dcb0d1bb3f2fc64
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue,  2 Jun 2026 05:07:02 -0700

cg: collapse INTRIN_BSWAP16/32/64 into one width-by-type BSWAP (Track 4a)

Diffstat:
M src/arch/aa64/native.c  | 34 ++++++++++++++++++++++++----------
M src/arch/c_target/c_emit.c  | 26 ++++++++++++++++----------
M src/arch/rv64/native.c  | 96 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M src/arch/wasm/emit.c  | 30 ++++++++++++------------------
M src/arch/wasm/internal.h  | 2 +-
M src/arch/x64/native.c  | 40 ++++++++++++++++++++++++----------------
M src/cg/arith.c  | 9 +++++----
M src/cg/cgtarget.h  | 4 +---
M src/interp/engine.c  | 12 ++++--------

9 files changed, 139 insertions(+), 114 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -3178,18 +3178,32 @@ static void aa_intrinsic(NativeTarget* t, IntrinKind kind,
         return;
       }
       break;
-    case INTRIN_BSWAP16:
-    case INTRIN_BSWAP32:
-    case INTRIN_BSWAP64:
+    case INTRIN_BSWAP:
       if (ndst == 1u && narg == 1u) {
-        u32 sf = kind == INTRIN_BSWAP64;
-        aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
-        if (kind == INTRIN_BSWAP16) {
-          aa_emit_load_imm(t->mc, 0, AA_TMP0, 16);
-          aa_emit32(t->mc,
-                    aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]), AA_TMP0));
+        u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
+        switch (width) {
+          case 2: {
+            u32 sf = 0;
+            aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
+            aa_emit_load_imm(t->mc, 0, AA_TMP0, 16);
+            aa_emit32(
+                t->mc,
+                aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]), AA_TMP0));
+            return;
+          }
+          case 4: {
+            u32 sf = 0;
+            aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
+            return;
+          }
+          case 8: {
+            u32 sf = 1;
+            aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
+            return;
+          }
+          default:
+            break;
         }
-        return;
       }
       break;
     case INTRIN_SADD_OVERFLOW:
diff --git a/src/arch/c_target/c_emit.c b/src/arch/c_target/c_emit.c
@@ -2488,12 +2488,11 @@ static const char* c_bitop_builtin(IntrinKind k, u32 width) {
       if (width == 64) return "__builtin_clzll";
       if (width == 16 || width == 8) return "__builtin_clz";
       return NULL;
-    case INTRIN_BSWAP16:
-      return "__builtin_bswap16";
-    case INTRIN_BSWAP32:
-      return "__builtin_bswap32";
-    case INTRIN_BSWAP64:
-      return "__builtin_bswap64";
+    case INTRIN_BSWAP:
+      if (width == 16) return "__builtin_bswap16";
+      if (width == 32) return "__builtin_bswap32";
+      if (width == 64) return "__builtin_bswap64";
+      return NULL;
     default:
       return NULL;
   }
@@ -2574,15 +2573,22 @@ void c_emit_intrinsic(CTarget* t, IntrinKind k, Operand* dsts, u32 ndst,
     case INTRIN_POPCOUNT:
     case INTRIN_CTZ:
     case INTRIN_CLZ:
-    case INTRIN_BSWAP16:
-    case INTRIN_BSWAP32:
-    case INTRIN_BSWAP64: {
+    case INTRIN_BSWAP: {
       if (ndst != 1 || narg != 1) {
         compiler_panic(t->c, loc,
                        "C target: bit-intrin: bad shape (ndst=%u narg=%u)",
                        (unsigned)ndst, (unsigned)narg);
       }
-      u32 w = c_int_width_for_signedness(t, args[0].type);
+      /* bswap width is determined by the result type (in bytes -> bit-width
+       * bucket, matching the old per-width intrinsic split). The other bit
+       * ops keep deriving width from the operand. */
+      u32 w;
+      if (k == INTRIN_BSWAP) {
+        u32 bytes = (u32)cg_type_size(t->c, dsts[0].type);
+        w = bytes <= 2 ? 16u : (bytes <= 4 ? 32u : 64u);
+      } else {
+        w = c_int_width_for_signedness(t, args[0].type);
+      }
       const char* fn = c_bitop_builtin(k, w);
       if (!fn) {
         compiler_panic(t->c, loc, "C target: bit-intrin width %u unsupported",
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -2628,53 +2628,61 @@ static void rv_intrinsic(NativeTarget* t, IntrinKind kind, const NativeLoc* dsts
     case INTRIN_TRAP:
       rv64_emit32(mc, rv_ebreak());
       return;
-    case INTRIN_BSWAP16: {
-      u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
-      /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */
-      rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff));
-      rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */
-      rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8));
-      rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2));
-      rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8));
-      rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff));
-      rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3));
-      return;
-    }
-    case INTRIN_BSWAP32: {
-      u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
-      rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24));
-      rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff));
-      rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16));
-      rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
-      rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8));
-      rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
-      rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8));
-      rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
-      rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16));
-      rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
-      rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
-      rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24));
-      rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2));
-      rv64_emit32(mc, rv_slli(rd, rd, 32));
-      rv64_emit32(mc, rv_srli(rd, rd, 32));
-      return;
-    }
-    case INTRIN_BSWAP64: {
-      u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
-      int i;
-      rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0));
-      for (i = 0; i < 8; ++i) {
-        int sh = 56 - 8 * i;
-        if (i == 0) {
-          rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
-        } else {
-          rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i)));
+    case INTRIN_BSWAP: {
+      u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
+      switch (width) {
+        case 2: {
+          u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+          /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */
+          rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff));
+          rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */
+          rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8));
+          rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2));
+          rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8));
+          rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff));
+          rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3));
+          return;
+        }
+        case 4: {
+          u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+          rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24));
+          rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff));
+          rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16));
           rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+          rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8));
+          rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+          rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8));
+          rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+          rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16));
+          rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+          rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
+          rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24));
+          rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2));
+          rv64_emit32(mc, rv_slli(rd, rd, 32));
+          rv64_emit32(mc, rv_srli(rd, rd, 32));
+          return;
         }
-        if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh));
-        rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+        case 8: {
+          u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+          int i;
+          rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0));
+          for (i = 0; i < 8; ++i) {
+            int sh = 56 - 8 * i;
+            if (i == 0) {
+              rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
+            } else {
+              rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i)));
+              rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+            }
+            if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh));
+            rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+          }
+          rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0));
+          return;
+        }
+        default:
+          break;
       }
-      rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0));
       return;
     }
     case INTRIN_POPCOUNT: {
diff --git a/src/arch/wasm/emit.c b/src/arch/wasm/emit.c
@@ -1574,12 +1574,8 @@ static const char* intrin_name(IntrinKind k) {
       return "__builtin_ctz";
     case INTRIN_CLZ:
       return "__builtin_clz";
-    case INTRIN_BSWAP16:
-      return "__builtin_bswap16";
-    case INTRIN_BSWAP32:
-      return "__builtin_bswap32";
-    case INTRIN_BSWAP64:
-      return "__builtin_bswap64";
+    case INTRIN_BSWAP:
+      return "__builtin_bswap";
     case INTRIN_MEMCPY:
       return "memcpy";
     case INTRIN_MEMMOVE:
@@ -1705,9 +1701,7 @@ void wasm_intrinsic(CGTarget* tg, IntrinKind k, Operand* dst, u32 ndst,
     case INTRIN_CLZ:
     case INTRIN_CTZ:
     case INTRIN_POPCOUNT:
-    case INTRIN_BSWAP16:
-    case INTRIN_BSWAP32:
-    case INTRIN_BSWAP64: {
+    case INTRIN_BSWAP: {
       if (ndst != 1 || nargs != 1 || dst[0].kind != OPK_REG ||
           args[0].kind != OPK_REG) {
         compiler_panic(t->c, cur_loc(t),
@@ -2890,14 +2884,16 @@ static void emit_intrinsic_bit_op(WTarget* t, const WIR* w) {
 }
 
 static void emit_intrinsic_bswap(WTarget* t, const WIR* w) {
-  IntrinKind k = (IntrinKind)w->cgop;
-  if (k == INTRIN_BSWAP16 || k == INTRIN_BSWAP32) {
-    /* Both operate over i32. BSWAP16 only touches the low 16 bits; any
-     * extra high bits in the input are discarded by the AND mask. */
+  /* Width-by-type: the recorded result type fixes the byte width. */
+  u32 width = (u32)abi_cg_sizeof(t->c->abi, w->type);
+  if (width <= 4) {
+    /* Both 16- and 32-bit forms operate over i32. The 16-bit form only
+     * touches the low 16 bits; any extra high bits in the input are
+     * discarded by the AND mask. */
     u32 tmp = add_wasm_local(t, WASM_VAL_I32);
     emit_push_operand_reg(t, w->a);
     emit_insn(t, WASM_INSN_LOCAL_SET, (i64)tmp);
-    if (k == INTRIN_BSWAP16) {
+    if (width <= 2) {
       /* (x & 0xff) << 8 */
       emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
       emit_insn(t, WASM_INSN_I32_CONST, 0xff);
@@ -2941,7 +2937,7 @@ static void emit_intrinsic_bswap(WTarget* t, const WIR* w) {
     emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
     return;
   }
-  /* BSWAP64: byte reverse over i64. */
+  /* 8-byte form: byte reverse over i64. */
   u32 tmp = add_wasm_local(t, WASM_VAL_I64);
   emit_push_operand_reg(t, w->a);
   emit_insn(t, WASM_INSN_LOCAL_SET, (i64)tmp);
@@ -3110,9 +3106,7 @@ static void emit_intrinsic(WTarget* t, const WIR* w) {
     case INTRIN_POPCOUNT:
       emit_intrinsic_bit_op(t, w);
       return;
-    case INTRIN_BSWAP16:
-    case INTRIN_BSWAP32:
-    case INTRIN_BSWAP64:
+    case INTRIN_BSWAP:
       emit_intrinsic_bswap(t, w);
       return;
     case INTRIN_SADD_OVERFLOW:
diff --git a/src/arch/wasm/internal.h b/src/arch/wasm/internal.h
@@ -88,7 +88,7 @@ typedef enum WIROp {
   WIR_VA_ARG,       /* dst = load of `type` from *(*addr); advance *addr by 8 */
   WIR_VA_COPY,      /* addr = dst_ap_addr; call_sret_addr = src_ap_addr */
   WIR_INTRINSIC,    /* cgop = IntrinKind; operand layout per kind:
-                       - bit ops (CLZ/CTZ/POPCOUNT/BSWAP16/32/64): one
+                       - bit ops (CLZ/CTZ/POPCOUNT/BSWAP): one
                          register operand in `a`, single dst in `dst`,
                          `type` carries the operand/result type.
                        - overflow arith (S/U{ADD,SUB,MUL}_OVERFLOW): two
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -3238,22 +3238,30 @@ static void x64_intrinsic(NativeTarget* t, IntrinKind kind,
       emit_alu_imm8(mc, w, X64_ALU_SUB_XOR, dr, w ? 63 : 31);
       return;
     }
-    case INTRIN_BSWAP16: {
-      u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
-      if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
-      emit_rol16_imm8(mc, dr, 8);
-      return;
-    }
-    case INTRIN_BSWAP32: {
-      u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
-      if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
-      emit_bswap(mc, 0, dr);
-      return;
-    }
-    case INTRIN_BSWAP64: {
-      u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
-      if (dr != sr) emit_mov_rr(mc, 1, dr, sr);
-      emit_bswap(mc, 1, dr);
+    case INTRIN_BSWAP: {
+      u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
+      switch (width) {
+        case 2: {
+          u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
+          if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
+          emit_rol16_imm8(mc, dr, 8);
+          return;
+        }
+        case 4: {
+          u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
+          if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
+          emit_bswap(mc, 0, dr);
+          return;
+        }
+        case 8: {
+          u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
+          if (dr != sr) emit_mov_rr(mc, 1, dr, sr);
+          emit_bswap(mc, 1, dr);
+          return;
+        }
+        default:
+          break;
+      }
       return;
     }
     case INTRIN_SADD_OVERFLOW:
diff --git a/src/cg/arith.c b/src/cg/arith.c
@@ -784,7 +784,10 @@ void cfree_cg_float_to_uint(CfreeCg* g, CfreeCgTypeId dst,
 
 IntrinKind api_map_intrinsic(CfreeCg* g, CfreeCgIntrinsic intrin,
                              CfreeCgTypeId result_type) {
-  u32 size = result_type ? abi_cg_sizeof(g->c->abi, result_type) : 0;
+  /* Width-by-type: backends derive operand width from the result type, so the
+   * mapping no longer needs the size here. */
+  (void)g;
+  (void)result_type;
   switch (intrin) {
     case CFREE_CG_INTRIN_TRAP:
       return INTRIN_TRAP;
@@ -795,9 +798,7 @@ IntrinKind api_map_intrinsic(CfreeCg* g, CfreeCgIntrinsic intrin,
     case CFREE_CG_INTRIN_POPCOUNT:
       return INTRIN_POPCOUNT;
     case CFREE_CG_INTRIN_BSWAP:
-      if (size <= 2) return INTRIN_BSWAP16;
-      if (size <= 4) return INTRIN_BSWAP32;
-      return INTRIN_BSWAP64;
+      return INTRIN_BSWAP;
     case CFREE_CG_INTRIN_SETJMP:
       return INTRIN_SETJMP;
     case CFREE_CG_INTRIN_LONGJMP:
diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h
@@ -118,9 +118,7 @@ typedef enum IntrinKind {
   INTRIN_POPCOUNT,
   INTRIN_CTZ,
   INTRIN_CLZ,
-  INTRIN_BSWAP16,
-  INTRIN_BSWAP32,
-  INTRIN_BSWAP64,
+  INTRIN_BSWAP,
 
   /* memory */
   INTRIN_MEMCPY,
diff --git a/src/interp/engine.c b/src/interp/engine.c
@@ -1381,6 +1381,7 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
   if (!aux) { unsupported(st, "intrinsic"); return 0; }
 #define ARGV(i) op_value(st, fn, regs, mem_off, &aux->args[i])
 #define AWID(i) ((u32)abi_cg_sizeof(c->abi, aux->args[i].type))
+#define DWID(i) ((u32)abi_cg_sizeof(c->abi, aux->dsts[i].type))
 #define DST0 (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG ? aux->dsts[0].v.reg : 0u)
   switch (aux->kind) {
     case INTRIN_MEMCPY:
@@ -1407,14 +1408,8 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
     case INTRIN_CLZ:
       regs[DST0] = iclz(ARGV(0), AWID(0));
       return 1;
-    case INTRIN_BSWAP16:
-      regs[DST0] = ibswap(ARGV(0), 2);
-      return 1;
-    case INTRIN_BSWAP32:
-      regs[DST0] = ibswap(ARGV(0), 4);
-      return 1;
-    case INTRIN_BSWAP64:
-      regs[DST0] = ibswap(ARGV(0), 8);
+    case INTRIN_BSWAP:
+      regs[DST0] = ibswap(ARGV(0), DWID(0));
       return 1;
     case INTRIN_EXPECT:
       regs[DST0] = ARGV(0);
@@ -1508,6 +1503,7 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
   }
 #undef ARGV
 #undef AWID
+#undef DWID
 #undef DST0
 }

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	src/arch/aa64/native.c	|	34	++++++++++++++++++++++++----------
M	src/arch/c_target/c_emit.c	|	26	++++++++++++++++----------
M	src/arch/rv64/native.c	|	96	+++++++++++++++++++++++++++++++++++++++++++------------------------------------
M	src/arch/wasm/emit.c	|	30	++++++++++++------------------
M	src/arch/wasm/internal.h	|	2	+-
M	src/arch/x64/native.c	|	40	++++++++++++++++++++++++----------------
M	src/cg/arith.c	|	9	+++++----
M	src/cg/cgtarget.h	|	4	+---
M	src/interp/engine.c	|	12	++++--------