kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit f5a4f04e3bf6b4fd9bd8929bfc657c3bf7be9e28
parent 80ba21ae01667332bf937b70d209a0e73d690938
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon,  8 Jun 2026 13:03:23 -0700

Track 4: drop dead INTRIN_MEMCPY/INTRIN_MEMSET intrinsics

memcpy/memset are dedicated public ops (kit_cg_memcpy/_memset) that lower to the
copy_bytes/set_bytes CGTarget hooks -> CG_IR_AGG_COPY/AGG_SET -> the backends'
agg-copy/set paths (and IOP_AGG_COPY/SET in the interpreter). Only kit_cg_memmove
flows through the generic intrinsic hook (INTRIN_MEMMOVE). So the internal
IntrinKind values INTRIN_MEMCPY and INTRIN_MEMSET were never emitted anywhere —
"two spellings of one behavior" (CODEGEN.md Principle 7). Remove them and the dead
switch arms across all six backends (x64/aa64/riscv native, wasm, c_target, interp);
the memmove-only shapes that shared an arm keep their overlap-safe path.

(The C frontend's private cg_adapter.h INTRIN_MEM* enum is a separate namespace,
left untouched.)

Green: lib/bin/rt, cg-api (interp memmove), libc 9/0, parse + parse-err 128/0,
smoke x64/rv64.

Diffstat:
M src/arch/aa64/native.c | 32 --------------------------------
M src/arch/c_target/c_emit.c | 22 +++++++---------------
M src/arch/riscv/native.c | 60 +-----------------------------------------------------------
M src/arch/wasm/emit.c | 36 +++---------------------------------
M src/arch/x64/native.c | 111 +++++++++++++++----------------------------------------------------------------
M src/cg/cgtarget.h | 5 ++---
M src/interp/engine.c | 12 ------------
7 files changed, 34 insertions(+), 244 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -3501,20 +3501,6 @@ static void aa_intrinsic(NativeTarget* t, IntrinKind kind, return; } break; - case INTRIN_MEMCPY: - if (narg != 3u || args[0].kind != NATIVE_LOC_REG || - args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM) - aa_panic(aa_of(t), "unsupported memory intrinsic operands"); - if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) - aa_panic(aa_of(t), "unsupported memory intrinsic size"); - access.size = (u32)args[2].v.imm; - access.align = 1u; - dst_addr.base_kind = NATIVE_ADDR_BASE_REG; - dst_addr.base.reg = args[0].v.reg; - src_addr.base_kind = NATIVE_ADDR_BASE_REG; - src_addr.base.reg = args[1].v.reg; - aa_copy_bytes(t, dst_addr, src_addr, access); - return; case INTRIN_MEMMOVE: { MCLabel forward = t->mc->label_new(t->mc); MCLabel done = t->mc->label_new(t->mc); @@ -3540,24 +3526,6 @@ static void aa_intrinsic(NativeTarget* t, IntrinKind kind, t->mc->label_place(t->mc, done); return; } - case INTRIN_MEMSET: - if (narg != 3u || args[0].kind != NATIVE_LOC_REG || - args[2].kind != NATIVE_LOC_IMM) - aa_panic(aa_of(t), "unsupported memset operands"); - if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) - aa_panic(aa_of(t), "unsupported memset size"); - access.size = (u32)args[2].v.imm; - access.align = 1u; - dst_addr.base_kind = NATIVE_ADDR_BASE_REG; - dst_addr.base.reg = args[0].v.reg; - if (args[1].kind == NATIVE_LOC_IMM) { - NativeLoc byte = aa_tmp_loc(builtin_id(KIT_CG_BUILTIN_I8), AA_TMP0); - aa_emit_load_imm(t->mc, 0, AA_TMP0, args[1].v.imm & 0xff); - aa_set_bytes(t, dst_addr, byte, access); - } else { - aa_set_bytes(t, dst_addr, args[1], access); - } - return; case INTRIN_EXPECT: case INTRIN_ASSUME_ALIGNED: if (ndst == 1u && narg >= 1u) { diff --git a/src/arch/c_target/c_emit.c b/src/arch/c_target/c_emit.c @@ -2551,23 +2551,15 @@ void c_emit_intrinsic(CTarget* t, IntrinKind k, Operand* dsts, u32 ndst, c_emit_local_assign_close(t); return; } - case INTRIN_MEMCPY: - 
case INTRIN_MEMMOVE: - case INTRIN_MEMSET: { - const char* fn = (k == INTRIN_MEMCPY) ? "__builtin_memcpy" - : (k == INTRIN_MEMMOVE) ? "__builtin_memmove" - : "__builtin_memset"; - cbuf_puts(&t->body, " "); - cbuf_puts(&t->body, fn); - cbuf_puts(&t->body, "("); + case INTRIN_MEMMOVE: { + cbuf_puts(&t->body, " __builtin_memmove("); for (u32 i = 0; i < narg; ++i) { if (i > 0) cbuf_puts(&t->body, ", "); - /* The pointer operands (dst for all three; src for mem{cpy,move}) - * may be typed as a plain integer local when they come from address - * arithmetic, which the C target declares as int64_t. The builtins - * take void*, so cast explicitly to avoid -Wint-conversion. */ - int is_ptr_arg = - (i == 0) || (i == 1 && (k == INTRIN_MEMCPY || k == INTRIN_MEMMOVE)); + /* The pointer operands (dst and src) may be typed as a plain integer + * local when they come from address arithmetic, which the C target + * declares as int64_t. __builtin_memmove takes void*, so cast + * explicitly to avoid -Wint-conversion. 
*/ + int is_ptr_arg = (i == 0) || (i == 1); if (is_ptr_arg) cbuf_puts(&t->body, "(void*)"); c_emit_operand(t, args[i]); } diff --git a/src/arch/riscv/native.c b/src/arch/riscv/native.c @@ -3268,7 +3268,6 @@ static void rv_intrinsic(NativeTarget* t, IntrinKind kind, } return; } - case INTRIN_MEMCPY: case INTRIN_MEMMOVE: { u32 dr, sr, n; if (narg != 3u || args[0].kind != NATIVE_LOC_REG || @@ -3279,64 +3278,7 @@ static void rv_intrinsic(NativeTarget* t, IntrinKind kind, dr = loc_reg(args[0]); sr = loc_reg(args[1]); n = (u32)args[2].v.imm; - rv_intrin_copy(v, mc, dr, sr, n, kind == INTRIN_MEMMOVE); - return; - } - case INTRIN_MEMSET: { - u32 dr, n, src; - int wide = v->ptr_bytes == 8u; /* 8-byte sd stores only on rv64 */ - if (narg != 3u || args[0].kind != NATIVE_LOC_REG || - args[2].kind != NATIVE_LOC_IMM) - rv_panic(a, "unsupported memset operands"); - if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) - rv_panic(a, "unsupported memset size"); - dr = loc_reg(args[0]); - n = (u32)args[2].v.imm; - if (args[1].kind == NATIVE_LOC_IMM) { - u32 byte = (u32)(args[1].v.imm & 0xffu); - if (byte == 0) { - src = RV_ZERO; - } else { - u64 b = byte; - b |= b << 8; - b |= b << 16; - if (wide) b |= b << 32; - rv_emit_load_imm(v, mc, 1, RV_TMP3, (i64)b); - src = RV_TMP3; - } - } else { - /* Replicate the low byte across the register width (4 or 8 bytes). 
*/ - u32 rb = loc_reg(args[1]); - rv64_emit32(mc, rv_andi(RV_TMP3, rb, 0xff)); - rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 8)); - rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2)); - rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 16)); - rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2)); - if (wide) { - rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 32)); - rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2)); - } - src = RV_TMP3; - } - { - u32 i = 0; - while (wide && i + 8u <= n) { - rv64_emit32(mc, rv_sd(src, dr, (i32)i)); - i += 8u; - } - while (i + 4u <= n) { - rv64_emit32(mc, rv_sw(src, dr, (i32)i)); - i += 4u; - } - while (i + 2u <= n) { - rv64_emit32(mc, rv_sh(src, dr, (i32)i)); - i += 2u; - } - while (i < n) { - rv64_emit32(mc, rv_sb(src, dr, (i32)i)); - i += 1u; - } - } + rv_intrin_copy(v, mc, dr, sr, n, /*reverse (overlap-safe)=*/1); return; } case INTRIN_CPU_NOP: diff --git a/src/arch/wasm/emit.c b/src/arch/wasm/emit.c @@ -1575,12 +1575,8 @@ static const char* intrin_name(IntrinKind k) { return "__builtin_clz"; case INTRIN_BSWAP: return "__builtin_bswap"; - case INTRIN_MEMCPY: - return "memcpy"; case INTRIN_MEMMOVE: return "memmove"; - case INTRIN_MEMSET: - return "memset"; case INTRIN_PREFETCH: return "__builtin_prefetch"; case INTRIN_ASSUME_ALIGNED: @@ -1668,11 +1664,10 @@ void wasm_intrinsic(CGTarget* tg, IntrinKind k, Operand* dst, u32 ndst, } return; - case INTRIN_MEMCPY: case INTRIN_MEMMOVE: { - /* memcpy/memmove both lower to memory.copy, which is spec-defined to - * handle overlap correctly. CG forces (dst, src) to REG and passes - * size as OPK_IMM (kit_cg_memmove). */ + /* memmove lowers to memory.copy, which is spec-defined to handle overlap + * correctly. CG forces (dst, src) to REG and passes size as OPK_IMM + * (kit_cg_memmove). 
*/ if (nargs != 3 || args[0].kind != OPK_REG || args[1].kind != OPK_REG) { compiler_panic(t->c, cur_loc(t), "wasm target: %s requires register pointers", @@ -1699,31 +1694,6 @@ void wasm_intrinsic(CGTarget* tg, IntrinKind k, Operand* dst, u32 ndst, return; } - case INTRIN_MEMSET: { - if (nargs != 3 || args[0].kind != OPK_REG) { - compiler_panic(t->c, cur_loc(t), - "wasm target: memset requires a register dst pointer"); - return; - } - if (args[1].kind != OPK_IMM || args[2].kind != OPK_IMM) { - compiler_panic(t->c, cur_loc(t), - "wasm target: memset with non-constant byte/size is " - "not yet supported"); - return; - } - ensure_linear_memory(t); - AggregateAccess a; - memset(&a, 0, sizeof a); - a.size = (u32)args[2].v.imm; - a.align = 1; - WIR* w = wir_push(t); - w->op = WIR_SET_BYTES; - w->addr = args[0]; - wir_capture_operand(w, 0, args[1]); - w->agg = a; - return; - } - case INTRIN_CLZ: case INTRIN_CTZ: case INTRIN_POPCOUNT: diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -3535,9 +3535,8 @@ static void x64_intrinsic(NativeTarget* t, IntrinKind kind, emit_movzx_r32_r8(mc, rovf, rovf); return; } - case INTRIN_MEMCPY: case INTRIN_MEMMOVE: { - u32 dr, sr, n; + u32 dr, sr, n, i; if (narg != 3u || args[0].kind != NATIVE_LOC_REG || args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM) x64_panic(a, "unsupported memory intrinsic operands"); @@ -3546,50 +3545,26 @@ static void x64_intrinsic(NativeTarget* t, IntrinKind kind, dr = loc_reg(args[0]); sr = loc_reg(args[1]); n = (u32)args[2].v.imm; - if (kind == INTRIN_MEMCPY) { - u32 i = 0; - while (i + 8u <= n) { - emit_mov_load(mc, 8, 0, X64_RAX, sr, (i32)i); - emit_mov_store(mc, 8, X64_RAX, dr, (i32)i); - i += 8u; - } - while (i + 4u <= n) { - emit_mov_load(mc, 4, 0, X64_RAX, sr, (i32)i); - emit_mov_store(mc, 4, X64_RAX, dr, (i32)i); - i += 4u; - } - while (i + 2u <= n) { - emit_mov_load(mc, 2, 0, X64_RAX, sr, (i32)i); - emit_mov_store(mc, 2, X64_RAX, dr, (i32)i); - i += 2u; - } - while (i < n) { - 
emit_mov_load(mc, 1, 0, X64_RAX, sr, (i32)i); - emit_mov_store(mc, 1, X64_RAX, dr, (i32)i); - i += 1u; - } - } else { - u32 i = n; - while (i >= 8u) { - i -= 8u; - emit_mov_load(mc, 8, 0, X64_RAX, sr, (i32)i); - emit_mov_store(mc, 8, X64_RAX, dr, (i32)i); - } - while (i >= 4u) { - i -= 4u; - emit_mov_load(mc, 4, 0, X64_RAX, sr, (i32)i); - emit_mov_store(mc, 4, X64_RAX, dr, (i32)i); - } - while (i >= 2u) { - i -= 2u; - emit_mov_load(mc, 2, 0, X64_RAX, sr, (i32)i); - emit_mov_store(mc, 2, X64_RAX, dr, (i32)i); - } - while (i >= 1u) { - i -= 1u; - emit_mov_load(mc, 1, 0, X64_RAX, sr, (i32)i); - emit_mov_store(mc, 1, X64_RAX, dr, (i32)i); - } + i = n; /* copy high-to-low so an overlapping dst > src is safe */ + while (i >= 8u) { + i -= 8u; + emit_mov_load(mc, 8, 0, X64_RAX, sr, (i32)i); + emit_mov_store(mc, 8, X64_RAX, dr, (i32)i); + } + while (i >= 4u) { + i -= 4u; + emit_mov_load(mc, 4, 0, X64_RAX, sr, (i32)i); + emit_mov_store(mc, 4, X64_RAX, dr, (i32)i); + } + while (i >= 2u) { + i -= 2u; + emit_mov_load(mc, 2, 0, X64_RAX, sr, (i32)i); + emit_mov_store(mc, 2, X64_RAX, dr, (i32)i); + } + while (i >= 1u) { + i -= 1u; + emit_mov_load(mc, 1, 0, X64_RAX, sr, (i32)i); + emit_mov_store(mc, 1, X64_RAX, dr, (i32)i); } return; } @@ -3619,50 +3594,6 @@ static void x64_intrinsic(NativeTarget* t, IntrinKind kind, mc->emit_bytes(mc, &b, 1); return; } - case INTRIN_MEMSET: { - u32 dr, n; - if (narg != 3u || args[0].kind != NATIVE_LOC_REG || - args[2].kind != NATIVE_LOC_IMM) - x64_panic(a, "unsupported memset operands"); - if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) - x64_panic(a, "unsupported memset size"); - dr = loc_reg(args[0]); - n = (u32)args[2].v.imm; - if (args[1].kind == NATIVE_LOC_IMM) { - u8 byte = (u8)(args[1].v.imm & 0xffu); - u64 b64 = byte; - b64 |= b64 << 8; - b64 |= b64 << 16; - b64 |= b64 << 32; - x64_emit_load_imm(mc, 1, X64_RAX, (i64)b64); - } else { - /* Broadcast low byte of a register via multiply by 0x0101010101010101. 
- */ - x64_emit_load_imm(mc, 1, X64_R11, (i64)0x0101010101010101ll); - emit_mov_rr(mc, 1, X64_RAX, loc_reg(args[1])); - emit_imul_rr(mc, 1, X64_RAX, X64_R11); - } - { - u32 i = 0; - while (i + 8u <= n) { - emit_mov_store(mc, 8, X64_RAX, dr, (i32)i); - i += 8u; - } - while (i + 4u <= n) { - emit_mov_store(mc, 4, X64_RAX, dr, (i32)i); - i += 4u; - } - while (i + 2u <= n) { - emit_mov_store(mc, 2, X64_RAX, dr, (i32)i); - i += 2u; - } - while (i < n) { - emit_mov_store(mc, 1, X64_RAX, dr, (i32)i); - i += 1u; - } - } - return; - } case INTRIN_FRAME_ADDRESS: case INTRIN_RETURN_ADDRESS: /* Walk the rbp frame-record chain. Every kit prologue keeps the rbp diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h @@ -136,10 +136,9 @@ typedef enum IntrinKind { INTRIN_CLZ, INTRIN_BSWAP, - /* memory */ - INTRIN_MEMCPY, + /* memory. memcpy/memset are the dedicated copy_bytes/set_bytes hooks + * (kit_cg_memcpy/_memset); only memmove flows through the intrinsic path. */ INTRIN_MEMMOVE, - INTRIN_MEMSET, INTRIN_PREFETCH, INTRIN_ASSUME_ALIGNED, diff --git a/src/interp/engine.c b/src/interp/engine.c @@ -1568,24 +1568,12 @@ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs, #define DST0 \ (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG ? aux->dsts[0].v.reg : 0u) switch (aux->kind) { - case INTRIN_MEMCPY: case INTRIN_MEMMOVE: { u64 d = ARGV(0), s = ARGV(1), n = ARGV(2); mem_copy(st, d, s, (u32)n); if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG) regs[DST0] = d; return 1; } - case INTRIN_MEMSET: { - u64 d = ARGV(0), byte = ARGV(1), n = ARGV(2); - u8* h = interp_translate(p, d, (u32)n, PERM_W); - if (!h) { - fault(st, "memset: invalid memory"); - return 0; - } - memset(h, (int)(byte & 0xffu), (size_t)n); - if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG) regs[DST0] = d; - return 1; - } case INTRIN_POPCOUNT: regs[DST0] = ipopcount(ARGV(0), AWID(0)); return 1;