commit 52897e0145b4e42aa1cfd85e10839ed370938ba1
parent d08e794c51bffbb4075e83df6dcb0d1bb3f2fc64
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 2 Jun 2026 05:07:02 -0700
cg: collapse INTRIN_BSWAP16/32/64 into one width-by-type BSWAP (Track 4a)
Diffstat:
9 files changed, 139 insertions(+), 114 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -3178,18 +3178,32 @@ static void aa_intrinsic(NativeTarget* t, IntrinKind kind,
return;
}
break;
- case INTRIN_BSWAP16:
- case INTRIN_BSWAP32:
- case INTRIN_BSWAP64:
+ case INTRIN_BSWAP:
if (ndst == 1u && narg == 1u) {
- u32 sf = kind == INTRIN_BSWAP64;
- aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
- if (kind == INTRIN_BSWAP16) {
- aa_emit_load_imm(t->mc, 0, AA_TMP0, 16);
- aa_emit32(t->mc,
- aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]), AA_TMP0));
+ u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
+ switch (width) {
+ case 2: {
+ u32 sf = 0;
+ aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
+ aa_emit_load_imm(t->mc, 0, AA_TMP0, 16);
+ aa_emit32(
+ t->mc,
+ aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]), AA_TMP0));
+ return;
+ }
+ case 4: {
+ u32 sf = 0;
+ aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
+ return;
+ }
+ case 8: {
+ u32 sf = 1;
+ aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0])));
+ return;
+ }
+ default:
+ break;
}
- return;
}
break;
case INTRIN_SADD_OVERFLOW:
diff --git a/src/arch/c_target/c_emit.c b/src/arch/c_target/c_emit.c
@@ -2488,12 +2488,11 @@ static const char* c_bitop_builtin(IntrinKind k, u32 width) {
if (width == 64) return "__builtin_clzll";
if (width == 16 || width == 8) return "__builtin_clz";
return NULL;
- case INTRIN_BSWAP16:
- return "__builtin_bswap16";
- case INTRIN_BSWAP32:
- return "__builtin_bswap32";
- case INTRIN_BSWAP64:
- return "__builtin_bswap64";
+ case INTRIN_BSWAP:
+ if (width == 16) return "__builtin_bswap16";
+ if (width == 32) return "__builtin_bswap32";
+ if (width == 64) return "__builtin_bswap64";
+ return NULL;
default:
return NULL;
}
@@ -2574,15 +2573,22 @@ void c_emit_intrinsic(CTarget* t, IntrinKind k, Operand* dsts, u32 ndst,
case INTRIN_POPCOUNT:
case INTRIN_CTZ:
case INTRIN_CLZ:
- case INTRIN_BSWAP16:
- case INTRIN_BSWAP32:
- case INTRIN_BSWAP64: {
+ case INTRIN_BSWAP: {
if (ndst != 1 || narg != 1) {
compiler_panic(t->c, loc,
"C target: bit-intrin: bad shape (ndst=%u narg=%u)",
(unsigned)ndst, (unsigned)narg);
}
- u32 w = c_int_width_for_signedness(t, args[0].type);
+ /* bswap takes its width from the result type: the byte size is bucketed
+ * to 16/32/64 bits (<=2 -> 16, <=4 -> 32, otherwise 64), mirroring the old
+ * per-width intrinsic split. The other bit ops still use the operand type. */
+ u32 w;
+ if (k == INTRIN_BSWAP) {
+ u32 bytes = (u32)cg_type_size(t->c, dsts[0].type);
+ w = bytes <= 2 ? 16u : (bytes <= 4 ? 32u : 64u);
+ } else {
+ w = c_int_width_for_signedness(t, args[0].type);
+ }
const char* fn = c_bitop_builtin(k, w);
if (!fn) {
compiler_panic(t->c, loc, "C target: bit-intrin width %u unsupported",
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -2628,53 +2628,61 @@ static void rv_intrinsic(NativeTarget* t, IntrinKind kind, const NativeLoc* dsts
case INTRIN_TRAP:
rv64_emit32(mc, rv_ebreak());
return;
- case INTRIN_BSWAP16: {
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */
- rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */
- rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8));
- rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8));
- rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff));
- rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3));
- return;
- }
- case INTRIN_BSWAP32: {
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24));
- rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff));
- rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16));
- rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8));
- rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8));
- rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16));
- rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24));
- rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_slli(rd, rd, 32));
- rv64_emit32(mc, rv_srli(rd, rd, 32));
- return;
- }
- case INTRIN_BSWAP64: {
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- int i;
- rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0));
- for (i = 0; i < 8; ++i) {
- int sh = 56 - 8 * i;
- if (i == 0) {
- rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
- } else {
- rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i)));
+ case INTRIN_BSWAP: {
+ u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
+ switch (width) {
+ case 2: {
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */
+ rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */
+ rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8));
+ rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8));
+ rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff));
+ rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3));
+ return;
+ }
+ case 4: {
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24));
+ rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff));
+ rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16));
rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8));
+ rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8));
+ rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16));
+ rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24));
+ rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2));
+ rv64_emit32(mc, rv_slli(rd, rd, 32));
+ rv64_emit32(mc, rv_srli(rd, rd, 32));
+ return;
}
- if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh));
- rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+ case 8: {
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ int i;
+ rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0));
+ for (i = 0; i < 8; ++i) {
+ int sh = 56 - 8 * i;
+ if (i == 0) {
+ rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
+ } else {
+ rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i)));
+ rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+ }
+ if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh));
+ rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+ }
+ rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0));
+ return;
+ }
+ default:
+ break;
}
- rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0));
return;
}
case INTRIN_POPCOUNT: {
diff --git a/src/arch/wasm/emit.c b/src/arch/wasm/emit.c
@@ -1574,12 +1574,8 @@ static const char* intrin_name(IntrinKind k) {
return "__builtin_ctz";
case INTRIN_CLZ:
return "__builtin_clz";
- case INTRIN_BSWAP16:
- return "__builtin_bswap16";
- case INTRIN_BSWAP32:
- return "__builtin_bswap32";
- case INTRIN_BSWAP64:
- return "__builtin_bswap64";
+ case INTRIN_BSWAP:
+ return "__builtin_bswap";
case INTRIN_MEMCPY:
return "memcpy";
case INTRIN_MEMMOVE:
@@ -1705,9 +1701,7 @@ void wasm_intrinsic(CGTarget* tg, IntrinKind k, Operand* dst, u32 ndst,
case INTRIN_CLZ:
case INTRIN_CTZ:
case INTRIN_POPCOUNT:
- case INTRIN_BSWAP16:
- case INTRIN_BSWAP32:
- case INTRIN_BSWAP64: {
+ case INTRIN_BSWAP: {
if (ndst != 1 || nargs != 1 || dst[0].kind != OPK_REG ||
args[0].kind != OPK_REG) {
compiler_panic(t->c, cur_loc(t),
@@ -2890,14 +2884,16 @@ static void emit_intrinsic_bit_op(WTarget* t, const WIR* w) {
}
static void emit_intrinsic_bswap(WTarget* t, const WIR* w) {
- IntrinKind k = (IntrinKind)w->cgop;
- if (k == INTRIN_BSWAP16 || k == INTRIN_BSWAP32) {
- /* Both operate over i32. BSWAP16 only touches the low 16 bits; any
- * extra high bits in the input are discarded by the AND mask. */
+ /* Width-by-type: the recorded result type fixes the byte width. */
+ u32 width = (u32)abi_cg_sizeof(t->c->abi, w->type);
+ if (width <= 4) {
+ /* Both 16- and 32-bit forms operate over i32. The 16-bit form only
+ * touches the low 16 bits; any extra high bits in the input are
+ * discarded by the AND mask. */
u32 tmp = add_wasm_local(t, WASM_VAL_I32);
emit_push_operand_reg(t, w->a);
emit_insn(t, WASM_INSN_LOCAL_SET, (i64)tmp);
- if (k == INTRIN_BSWAP16) {
+ if (width <= 2) {
/* (x & 0xff) << 8 */
emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp);
emit_insn(t, WASM_INSN_I32_CONST, 0xff);
@@ -2941,7 +2937,7 @@ static void emit_intrinsic_bswap(WTarget* t, const WIR* w) {
emit_local_set(t, w->dst, w->type, (RegClass)w->cls);
return;
}
- /* BSWAP64: byte reverse over i64. */
+ /* 8-byte form: byte reverse over i64. */
u32 tmp = add_wasm_local(t, WASM_VAL_I64);
emit_push_operand_reg(t, w->a);
emit_insn(t, WASM_INSN_LOCAL_SET, (i64)tmp);
@@ -3110,9 +3106,7 @@ static void emit_intrinsic(WTarget* t, const WIR* w) {
case INTRIN_POPCOUNT:
emit_intrinsic_bit_op(t, w);
return;
- case INTRIN_BSWAP16:
- case INTRIN_BSWAP32:
- case INTRIN_BSWAP64:
+ case INTRIN_BSWAP:
emit_intrinsic_bswap(t, w);
return;
case INTRIN_SADD_OVERFLOW:
diff --git a/src/arch/wasm/internal.h b/src/arch/wasm/internal.h
@@ -88,7 +88,7 @@ typedef enum WIROp {
WIR_VA_ARG, /* dst = load of `type` from *(*addr); advance *addr by 8 */
WIR_VA_COPY, /* addr = dst_ap_addr; call_sret_addr = src_ap_addr */
WIR_INTRINSIC, /* cgop = IntrinKind; operand layout per kind:
- - bit ops (CLZ/CTZ/POPCOUNT/BSWAP16/32/64): one
+ - bit ops (CLZ/CTZ/POPCOUNT/BSWAP): one
register operand in `a`, single dst in `dst`,
`type` carries the operand/result type.
- overflow arith (S/U{ADD,SUB,MUL}_OVERFLOW): two
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -3238,22 +3238,30 @@ static void x64_intrinsic(NativeTarget* t, IntrinKind kind,
emit_alu_imm8(mc, w, X64_ALU_SUB_XOR, dr, w ? 63 : 31);
return;
}
- case INTRIN_BSWAP16: {
- u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
- if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
- emit_rol16_imm8(mc, dr, 8);
- return;
- }
- case INTRIN_BSWAP32: {
- u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
- if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
- emit_bswap(mc, 0, dr);
- return;
- }
- case INTRIN_BSWAP64: {
- u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
- if (dr != sr) emit_mov_rr(mc, 1, dr, sr);
- emit_bswap(mc, 1, dr);
+ case INTRIN_BSWAP: {
+ u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
+ switch (width) {
+ case 2: {
+ u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
+ if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
+ emit_rol16_imm8(mc, dr, 8);
+ return;
+ }
+ case 4: {
+ u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
+ if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
+ emit_bswap(mc, 0, dr);
+ return;
+ }
+ case 8: {
+ u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]);
+ if (dr != sr) emit_mov_rr(mc, 1, dr, sr);
+ emit_bswap(mc, 1, dr);
+ return;
+ }
+ default:
+ break;
+ }
return;
}
case INTRIN_SADD_OVERFLOW:
diff --git a/src/cg/arith.c b/src/cg/arith.c
@@ -784,7 +784,10 @@ void cfree_cg_float_to_uint(CfreeCg* g, CfreeCgTypeId dst,
IntrinKind api_map_intrinsic(CfreeCg* g, CfreeCgIntrinsic intrin,
CfreeCgTypeId result_type) {
- u32 size = result_type ? abi_cg_sizeof(g->c->abi, result_type) : 0;
+ /* Width-by-type: BSWAP is no longer split by size in this mapping; each
+ * backend derives the swap width from the result type, so no size is computed. */
+ (void)g;
+ (void)result_type;
switch (intrin) {
case CFREE_CG_INTRIN_TRAP:
return INTRIN_TRAP;
@@ -795,9 +798,7 @@ IntrinKind api_map_intrinsic(CfreeCg* g, CfreeCgIntrinsic intrin,
case CFREE_CG_INTRIN_POPCOUNT:
return INTRIN_POPCOUNT;
case CFREE_CG_INTRIN_BSWAP:
- if (size <= 2) return INTRIN_BSWAP16;
- if (size <= 4) return INTRIN_BSWAP32;
- return INTRIN_BSWAP64;
+ return INTRIN_BSWAP;
case CFREE_CG_INTRIN_SETJMP:
return INTRIN_SETJMP;
case CFREE_CG_INTRIN_LONGJMP:
diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h
@@ -118,9 +118,7 @@ typedef enum IntrinKind {
INTRIN_POPCOUNT,
INTRIN_CTZ,
INTRIN_CLZ,
- INTRIN_BSWAP16,
- INTRIN_BSWAP32,
- INTRIN_BSWAP64,
+ INTRIN_BSWAP,
/* memory */
INTRIN_MEMCPY,
diff --git a/src/interp/engine.c b/src/interp/engine.c
@@ -1381,6 +1381,7 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
if (!aux) { unsupported(st, "intrinsic"); return 0; }
#define ARGV(i) op_value(st, fn, regs, mem_off, &aux->args[i])
#define AWID(i) ((u32)abi_cg_sizeof(c->abi, aux->args[i].type))
+#define DWID(i) ((u32)abi_cg_sizeof(c->abi, aux->dsts[i].type))
#define DST0 (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG ? aux->dsts[0].v.reg : 0u)
switch (aux->kind) {
case INTRIN_MEMCPY:
@@ -1407,14 +1408,8 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
case INTRIN_CLZ:
regs[DST0] = iclz(ARGV(0), AWID(0));
return 1;
- case INTRIN_BSWAP16:
- regs[DST0] = ibswap(ARGV(0), 2);
- return 1;
- case INTRIN_BSWAP32:
- regs[DST0] = ibswap(ARGV(0), 4);
- return 1;
- case INTRIN_BSWAP64:
- regs[DST0] = ibswap(ARGV(0), 8);
+ case INTRIN_BSWAP:
+ regs[DST0] = ibswap(ARGV(0), DWID(0));
return 1;
case INTRIN_EXPECT:
regs[DST0] = ARGV(0);
@@ -1508,6 +1503,7 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
}
#undef ARGV
#undef AWID
+#undef DWID
#undef DST0
}