kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 52897e0145b4e42aa1cfd85e10839ed370938ba1
parent d08e794c51bffbb4075e83df6dcb0d1bb3f2fc64
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue,  2 Jun 2026 05:07:02 -0700

cg: collapse INTRIN_BSWAP16/32/64 into one width-by-type BSWAP (Track 4a)

Diffstat:
M src/arch/aa64/native.c | 34 ++++++++++++++++++++++++----------
M src/arch/c_target/c_emit.c | 26 ++++++++++++++++----------
M src/arch/rv64/native.c | 96 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M src/arch/wasm/emit.c | 30 ++++++++++++------------------
M src/arch/wasm/internal.h | 2 +-
M src/arch/x64/native.c | 40 ++++++++++++++++++++++++----------------
M src/cg/arith.c | 9 +++++----
M src/cg/cgtarget.h | 4 +---
M src/interp/engine.c | 12 ++++--------
9 files changed, 139 insertions(+), 114 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -3178,18 +3178,32 @@ static void aa_intrinsic(NativeTarget* t, IntrinKind kind, return; } break; - case INTRIN_BSWAP16: - case INTRIN_BSWAP32: - case INTRIN_BSWAP64: + case INTRIN_BSWAP: if (ndst == 1u && narg == 1u) { - u32 sf = kind == INTRIN_BSWAP64; - aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0]))); - if (kind == INTRIN_BSWAP16) { - aa_emit_load_imm(t->mc, 0, AA_TMP0, 16); - aa_emit32(t->mc, - aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]), AA_TMP0)); + u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type); + switch (width) { + case 2: { + u32 sf = 0; + aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0]))); + aa_emit_load_imm(t->mc, 0, AA_TMP0, 16); + aa_emit32( + t->mc, + aa64_lsrv(0, loc_reg(dsts[0]), loc_reg(dsts[0]), AA_TMP0)); + return; + } + case 4: { + u32 sf = 0; + aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0]))); + return; + } + case 8: { + u32 sf = 1; + aa_emit32(t->mc, aa_rev(sf, loc_reg(dsts[0]), loc_reg(args[0]))); + return; + } + default: + break; } - return; } break; case INTRIN_SADD_OVERFLOW: diff --git a/src/arch/c_target/c_emit.c b/src/arch/c_target/c_emit.c @@ -2488,12 +2488,11 @@ static const char* c_bitop_builtin(IntrinKind k, u32 width) { if (width == 64) return "__builtin_clzll"; if (width == 16 || width == 8) return "__builtin_clz"; return NULL; - case INTRIN_BSWAP16: - return "__builtin_bswap16"; - case INTRIN_BSWAP32: - return "__builtin_bswap32"; - case INTRIN_BSWAP64: - return "__builtin_bswap64"; + case INTRIN_BSWAP: + if (width == 16) return "__builtin_bswap16"; + if (width == 32) return "__builtin_bswap32"; + if (width == 64) return "__builtin_bswap64"; + return NULL; default: return NULL; } @@ -2574,15 +2573,22 @@ void c_emit_intrinsic(CTarget* t, IntrinKind k, Operand* dsts, u32 ndst, case INTRIN_POPCOUNT: case INTRIN_CTZ: case INTRIN_CLZ: - case INTRIN_BSWAP16: - case INTRIN_BSWAP32: - case INTRIN_BSWAP64: { + case 
INTRIN_BSWAP: { if (ndst != 1 || narg != 1) { compiler_panic(t->c, loc, "C target: bit-intrin: bad shape (ndst=%u narg=%u)", (unsigned)ndst, (unsigned)narg); } - u32 w = c_int_width_for_signedness(t, args[0].type); + /* bswap width is determined by the result type (in bytes -> bit-width + * bucket, matching the old per-width intrinsic split). The other bit + * ops keep deriving width from the operand. */ + u32 w; + if (k == INTRIN_BSWAP) { + u32 bytes = (u32)cg_type_size(t->c, dsts[0].type); + w = bytes <= 2 ? 16u : (bytes <= 4 ? 32u : 64u); + } else { + w = c_int_width_for_signedness(t, args[0].type); + } const char* fn = c_bitop_builtin(k, w); if (!fn) { compiler_panic(t->c, loc, "C target: bit-intrin width %u unsupported", diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c @@ -2628,53 +2628,61 @@ static void rv_intrinsic(NativeTarget* t, IntrinKind kind, const NativeLoc* dsts case INTRIN_TRAP: rv64_emit32(mc, rv_ebreak()); return; - case INTRIN_BSWAP16: { - u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); - /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). 
*/ - rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff)); - rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */ - rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8)); - rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2)); - rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8)); - rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff)); - rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3)); - return; - } - case INTRIN_BSWAP32: { - u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); - rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24)); - rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff)); - rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16)); - rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); - rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); - rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); - rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8)); - rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); - rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16)); - rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); - rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff)); - rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24)); - rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2)); - rv64_emit32(mc, rv_slli(rd, rd, 32)); - rv64_emit32(mc, rv_srli(rd, rd, 32)); - return; - } - case INTRIN_BSWAP64: { - u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); - int i; - rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0)); - for (i = 0; i < 8; ++i) { - int sh = 56 - 8 * i; - if (i == 0) { - rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff)); - } else { - rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i))); + case INTRIN_BSWAP: { + u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type); + switch (width) { + case 2: { + u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); + /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). 
*/ + rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff)); + rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */ + rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8)); + rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2)); + rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8)); + rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff)); + rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3)); + return; + } + case 4: { + u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); + rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24)); + rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff)); + rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16)); rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); + rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); + rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); + rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8)); + rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); + rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16)); + rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); + rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff)); + rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24)); + rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2)); + rv64_emit32(mc, rv_slli(rd, rd, 32)); + rv64_emit32(mc, rv_srli(rd, rd, 32)); + return; } - if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh)); - rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); + case 8: { + u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); + int i; + rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0)); + for (i = 0; i < 8; ++i) { + int sh = 56 - 8 * i; + if (i == 0) { + rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff)); + } else { + rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i))); + rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); + } + if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh)); + rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); + } + rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0)); + return; + } + default: + break; } - rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0)); return; } case INTRIN_POPCOUNT: { diff --git a/src/arch/wasm/emit.c b/src/arch/wasm/emit.c @@ -1574,12 +1574,8 
@@ static const char* intrin_name(IntrinKind k) { return "__builtin_ctz"; case INTRIN_CLZ: return "__builtin_clz"; - case INTRIN_BSWAP16: - return "__builtin_bswap16"; - case INTRIN_BSWAP32: - return "__builtin_bswap32"; - case INTRIN_BSWAP64: - return "__builtin_bswap64"; + case INTRIN_BSWAP: + return "__builtin_bswap"; case INTRIN_MEMCPY: return "memcpy"; case INTRIN_MEMMOVE: @@ -1705,9 +1701,7 @@ void wasm_intrinsic(CGTarget* tg, IntrinKind k, Operand* dst, u32 ndst, case INTRIN_CLZ: case INTRIN_CTZ: case INTRIN_POPCOUNT: - case INTRIN_BSWAP16: - case INTRIN_BSWAP32: - case INTRIN_BSWAP64: { + case INTRIN_BSWAP: { if (ndst != 1 || nargs != 1 || dst[0].kind != OPK_REG || args[0].kind != OPK_REG) { compiler_panic(t->c, cur_loc(t), @@ -2890,14 +2884,16 @@ static void emit_intrinsic_bit_op(WTarget* t, const WIR* w) { } static void emit_intrinsic_bswap(WTarget* t, const WIR* w) { - IntrinKind k = (IntrinKind)w->cgop; - if (k == INTRIN_BSWAP16 || k == INTRIN_BSWAP32) { - /* Both operate over i32. BSWAP16 only touches the low 16 bits; any - * extra high bits in the input are discarded by the AND mask. */ + /* Width-by-type: the recorded result type fixes the byte width. */ + u32 width = (u32)abi_cg_sizeof(t->c->abi, w->type); + if (width <= 4) { + /* Both 16- and 32-bit forms operate over i32. The 16-bit form only + * touches the low 16 bits; any extra high bits in the input are + * discarded by the AND mask. */ u32 tmp = add_wasm_local(t, WASM_VAL_I32); emit_push_operand_reg(t, w->a); emit_insn(t, WASM_INSN_LOCAL_SET, (i64)tmp); - if (k == INTRIN_BSWAP16) { + if (width <= 2) { /* (x & 0xff) << 8 */ emit_insn(t, WASM_INSN_LOCAL_GET, (i64)tmp); emit_insn(t, WASM_INSN_I32_CONST, 0xff); @@ -2941,7 +2937,7 @@ static void emit_intrinsic_bswap(WTarget* t, const WIR* w) { emit_local_set(t, w->dst, w->type, (RegClass)w->cls); return; } - /* BSWAP64: byte reverse over i64. */ + /* 8-byte form: byte reverse over i64. 
*/ u32 tmp = add_wasm_local(t, WASM_VAL_I64); emit_push_operand_reg(t, w->a); emit_insn(t, WASM_INSN_LOCAL_SET, (i64)tmp); @@ -3110,9 +3106,7 @@ static void emit_intrinsic(WTarget* t, const WIR* w) { case INTRIN_POPCOUNT: emit_intrinsic_bit_op(t, w); return; - case INTRIN_BSWAP16: - case INTRIN_BSWAP32: - case INTRIN_BSWAP64: + case INTRIN_BSWAP: emit_intrinsic_bswap(t, w); return; case INTRIN_SADD_OVERFLOW: diff --git a/src/arch/wasm/internal.h b/src/arch/wasm/internal.h @@ -88,7 +88,7 @@ typedef enum WIROp { WIR_VA_ARG, /* dst = load of `type` from *(*addr); advance *addr by 8 */ WIR_VA_COPY, /* addr = dst_ap_addr; call_sret_addr = src_ap_addr */ WIR_INTRINSIC, /* cgop = IntrinKind; operand layout per kind: - - bit ops (CLZ/CTZ/POPCOUNT/BSWAP16/32/64): one + - bit ops (CLZ/CTZ/POPCOUNT/BSWAP): one register operand in `a`, single dst in `dst`, `type` carries the operand/result type. - overflow arith (S/U{ADD,SUB,MUL}_OVERFLOW): two diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -3238,22 +3238,30 @@ static void x64_intrinsic(NativeTarget* t, IntrinKind kind, emit_alu_imm8(mc, w, X64_ALU_SUB_XOR, dr, w ? 
63 : 31); return; } - case INTRIN_BSWAP16: { - u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); - if (dr != sr) emit_mov_rr(mc, 0, dr, sr); - emit_rol16_imm8(mc, dr, 8); - return; - } - case INTRIN_BSWAP32: { - u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); - if (dr != sr) emit_mov_rr(mc, 0, dr, sr); - emit_bswap(mc, 0, dr); - return; - } - case INTRIN_BSWAP64: { - u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); - if (dr != sr) emit_mov_rr(mc, 1, dr, sr); - emit_bswap(mc, 1, dr); + case INTRIN_BSWAP: { + u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type); + switch (width) { + case 2: { + u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); + if (dr != sr) emit_mov_rr(mc, 0, dr, sr); + emit_rol16_imm8(mc, dr, 8); + return; + } + case 4: { + u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); + if (dr != sr) emit_mov_rr(mc, 0, dr, sr); + emit_bswap(mc, 0, dr); + return; + } + case 8: { + u32 dr = loc_reg(dsts[0]), sr = loc_reg(args[0]); + if (dr != sr) emit_mov_rr(mc, 1, dr, sr); + emit_bswap(mc, 1, dr); + return; + } + default: + break; + } return; } case INTRIN_SADD_OVERFLOW: diff --git a/src/cg/arith.c b/src/cg/arith.c @@ -784,7 +784,10 @@ void cfree_cg_float_to_uint(CfreeCg* g, CfreeCgTypeId dst, IntrinKind api_map_intrinsic(CfreeCg* g, CfreeCgIntrinsic intrin, CfreeCgTypeId result_type) { - u32 size = result_type ? abi_cg_sizeof(g->c->abi, result_type) : 0; + /* Width-by-type: backends derive operand width from the result type, so the + * mapping no longer needs the size here. 
*/ + (void)g; + (void)result_type; switch (intrin) { case CFREE_CG_INTRIN_TRAP: return INTRIN_TRAP; @@ -795,9 +798,7 @@ IntrinKind api_map_intrinsic(CfreeCg* g, CfreeCgIntrinsic intrin, case CFREE_CG_INTRIN_POPCOUNT: return INTRIN_POPCOUNT; case CFREE_CG_INTRIN_BSWAP: - if (size <= 2) return INTRIN_BSWAP16; - if (size <= 4) return INTRIN_BSWAP32; - return INTRIN_BSWAP64; + return INTRIN_BSWAP; case CFREE_CG_INTRIN_SETJMP: return INTRIN_SETJMP; case CFREE_CG_INTRIN_LONGJMP: diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h @@ -118,9 +118,7 @@ typedef enum IntrinKind { INTRIN_POPCOUNT, INTRIN_CTZ, INTRIN_CLZ, - INTRIN_BSWAP16, - INTRIN_BSWAP32, - INTRIN_BSWAP64, + INTRIN_BSWAP, /* memory */ INTRIN_MEMCPY, diff --git a/src/interp/engine.c b/src/interp/engine.c @@ -1381,6 +1381,7 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs, if (!aux) { unsupported(st, "intrinsic"); return 0; } #define ARGV(i) op_value(st, fn, regs, mem_off, &aux->args[i]) #define AWID(i) ((u32)abi_cg_sizeof(c->abi, aux->args[i].type)) +#define DWID(i) ((u32)abi_cg_sizeof(c->abi, aux->dsts[i].type)) #define DST0 (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG ? aux->dsts[0].v.reg : 0u) switch (aux->kind) { case INTRIN_MEMCPY: @@ -1407,14 +1408,8 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs, case INTRIN_CLZ: regs[DST0] = iclz(ARGV(0), AWID(0)); return 1; - case INTRIN_BSWAP16: - regs[DST0] = ibswap(ARGV(0), 2); - return 1; - case INTRIN_BSWAP32: - regs[DST0] = ibswap(ARGV(0), 4); - return 1; - case INTRIN_BSWAP64: - regs[DST0] = ibswap(ARGV(0), 8); + case INTRIN_BSWAP: + regs[DST0] = ibswap(ARGV(0), DWID(0)); return 1; case INTRIN_EXPECT: regs[DST0] = ARGV(0); @@ -1508,6 +1503,7 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs, } #undef ARGV #undef AWID +#undef DWID #undef DST0 }