kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 5054c67a493c11ec0c032c04e722962402406411
parent fa5d5b7d642c01e79875545a68e24a2f03ab4ee1
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 21 May 2026 10:21:42 -0700

x64: fix optimized arithmetic and tail-call coverage

Fix two-address x64 codegen hazards where preparing the destination register could clobber the RHS for integer ALU ops, scalar FP binops, and variable shifts through %cl.

Implement unsigned 64-bit FP conversions and make stack-argument tail calls conservative in direct and optimized call planning.

Update the x64 parity checklist with the current backend status and targeted x64 toy verification.

Diffstat:
Mdoc/X64_PARITY_CHECKLIST.md | 39+++++++++++++++++++++++++++++++++++++++
Msrc/arch/x64/ops.c | 119+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Msrc/arch/x64/opt_coord.c | 11+++++++++--
3 files changed, 158 insertions(+), 11 deletions(-)

diff --git a/doc/X64_PARITY_CHECKLIST.md b/doc/X64_PARITY_CHECKLIST.md @@ -4,6 +4,34 @@ Goal: bring `x86_64` to the same practical coverage as `aarch64` across standalone asm, disasm, C/toy compilation, object/link output, runtime, and debug tooling. +## Status as of 2026-05-21 + +- Fixed an optimized x64 two-address arithmetic hazard where preparing the + destination register could clobber the RHS. This covered integer ALU ops, + scalar FP binops, and variable shifts whose count must be routed through + `%cl`. The motivating failure was `24_tail_arg_permute` computing `b * 2` + into `%r8` and then immediately overwriting `%r8` with `a`. +- Implemented x64 `u64`/FP conversions for `CV_ITOF_U` and `CV_FTOI_U`, closing + two explicit backend panics in scalar conversion coverage. +- Made x64 tail-call handling conservative when stack arguments are involved: + direct emission falls back to a normal call when the current frame cannot + reuse enough incoming stack argument area, and optimized call planning + clears `CG_CALL_TAIL` for stack-argument tail calls. Register-only tail calls + remain eligible. +- Fixed virtual-register materialization so delayed arithmetic/comparisons use + fresh SSA values instead of destructively redefining one of their source + virtual registers. Optimized x64 jump-table lowering is enabled again and the + `123_spec_demo` x64 O1/O2 switch path now uses the table path correctly. +- Fixed x64 cross-test runner overhead by passing `--pull=never` to podman. + `--net=none` blocked container networking, but podman still performed a host + image pull/manifest check before launch. A one-case x64 toy O0/O1/O2 smoke + dropped from roughly 30 seconds per container lookup to roughly 2 seconds. +- Verified after these changes: `make bin`; targeted x64 toy execution for + `24_tail_arg_permute`, `25_tail_many_stack_args`, `26_tail_live_pressure`, + `29_tail_cross_arch_stack`, `09_function_params`, `100_record_data_relocation`, + `120_data_symdiff`, `123_spec_demo`, and `65_rounding_conversions`. A full + x64 toy cross run is still pending after the podman runner fix. + ## Asm / disasm - [ ] Expand `src/arch/x64/asm.c` beyond the current small AT&T subset: @@ -35,11 +63,20 @@ debug tooling. (`u64`/FP conversions, unsupported bitcasts, non-constant memset byte paths, indirect aggregate arg shapes, tail-call/sret gaps, and other `unsupported`/`unimpl` paths). + - 2026-05-21: `u64`/FP conversions are implemented; tail-call stack-arg + cases are handled conservatively rather than panicking or emitting an + invalid sibling tail call. - [ ] Match aa64 coverage for scalar integer, FP, pointer, aggregate, varargs, atomics, intrinsics, labels, computed goto, switch lowering, and alloca. + - 2026-05-21: scalar optimized integer/FP RHS clobbers, variable shift + count clobbers, and optimized x64 jump-table virtual-reg materialization + are fixed. - [ ] Prove x64 optimized and unoptimized C parse corpus paths with targeted `CFREE_TEST_ARCH=x64` runs. - [ ] Prove toy cross-arch path `X` for x64 alongside aa64 cases. + - 2026-05-21: targeted x64 `X` runs pass for the tail-call, conversion, + data-relocation, and switch regression cases listed above. Full x64 `X` + run should be repeated after the podman `--pull=never` runner fix. ## ABI / platform @@ -90,3 +127,5 @@ debug tooling. aa64 defaults. - [ ] Promote targeted x64 runs into default or CI-equivalent coverage once they are stable on available runners. +- [x] Prevent podman-backed cross tests from hitting registries during normal + execution by using `--pull=never`; test images must be prepared explicitly. diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c @@ -556,7 +556,6 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, u32 ra = a_op.v.reg & 0xFu; u32 rb = b_op.v.reg & 0xFu; u8 prefix2 = type_is_fp_double(dst.type) ? 0xF2 : 0xF3; - if (rd != ra) emit_sse_rr(mc, prefix2, 0x10, rd, ra); u8 opcode; switch (op) { case BO_FADD: @@ -575,6 +574,17 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, opcode = 0x58; break; } + if (rd == rb && rd != ra) { + if (op == BO_FADD || op == BO_FMUL) { + emit_sse_rr(mc, prefix2, opcode, rd, ra); + return; + } + emit_sse_rr(mc, prefix2, 0x10, X64_XMM15, rb); + emit_sse_rr(mc, prefix2, 0x10, rd, ra); + emit_sse_rr(mc, prefix2, opcode, rd, X64_XMM15); + return; + } + if (rd != ra) emit_sse_rr(mc, prefix2, 0x10, rd, ra); emit_sse_rr(mc, prefix2, opcode, rd, rb); return; } @@ -618,9 +628,9 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, * into cl. */ if (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S) { u32 ra = x64_force_reg_int(t, a_op, w, X64_RAX); - if (rd != ra) emit_mov_rr(mc, w, rd, ra); u32 sub = (op == BO_SHL) ? 4u : (op == BO_SHR_U ? 5u : 7u); if (b_op.kind == OPK_IMM) { + if (rd != ra) emit_mov_rr(mc, w, rd, ra); u32 width = w ? 64u : 32u; emit_shift_imm(mc, w, sub, rd, (u8)((u64)b_op.v.imm & (width - 1u))); return; @@ -632,6 +642,7 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, compiler_panic(t->c, impl_of(t)->loc, "x64 shift: count kind %d unsupported", (int)b_op.kind); } + if (rd != ra) emit_mov_rr(mc, w, rd, ra); emit_shift_cl(mc, w, sub, rd); return; } @@ -710,10 +721,38 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, /* Fall through to materialize for >32-bit literals. */ } - /* Generic 2-operand ALU: copy ra → dst, then dst op= rb. */ + /* Generic 2-operand ALU: copy ra → dst, then dst op= rb. Preserve rb + * first when the allocator chose dst == rb; otherwise the prep copy + * would clobber the RHS. */ u32 ra = x64_force_reg_int(t, a_op, w, X64_RAX); - if (rd != ra) emit_mov_rr(mc, w, rd, ra); u32 rb = x64_force_reg_int(t, b_op, w, X64_R11); + if (rd == rb && rd != ra) { + if (op == BO_IADD || op == BO_AND || op == BO_OR || op == BO_XOR || + op == BO_IMUL) { + switch (op) { + case BO_IADD: + emit_alu_rr(mc, w, 0x01, rd, ra); + return; + case BO_AND: + emit_alu_rr(mc, w, 0x21, rd, ra); + return; + case BO_OR: + emit_alu_rr(mc, w, 0x09, rd, ra); + return; + case BO_XOR: + emit_alu_rr(mc, w, 0x31, rd, ra); + return; + case BO_IMUL: + emit_imul_rr(mc, w, rd, ra); + return; + default: + break; + } + } + emit_mov_rr(mc, w, X64_R11, rb); + rb = X64_R11; + } + if (rd != ra) emit_mov_rr(mc, w, rd, ra); switch (op) { case BO_IADD: emit_alu_rr(mc, w, 0x01, rd, rb); @@ -831,7 +870,25 @@ static void x_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) { int w_src = type_is_64(src.type) ? 1 : 0; u8 prefix2 = type_is_fp_double(dst.type) ? 0xF2 : 0xF3; if (k == CV_ITOF_U && w_src == 1) { - compiler_panic(t->c, a->loc, "x64 convert: u64→fp not yet implemented"); + MCLabel L_high = mc->label_new(mc); + MCLabel L_done = mc->label_new(mc); + u32 rr = rs; + emit_test_self(mc, 1, rr); + emit_jcc_label(mc, X64_CC_S, L_high); + emit_sse_rr_w(mc, prefix2, 0x2A, 1, rd, rr); + emit_jmp_label(mc, L_done); + + mc->label_place(mc, L_high); + emit_mov_rr(mc, 1, X64_R11, rr); + emit_mov_rr(mc, 1, X64_RAX, rr); + emit_alu_imm8(mc, 1, 4u, X64_RAX, 1); /* and rax, 1 */ + emit_shift_imm(mc, 1, 5u, X64_R11, 1); /* shr r11, 1 */ + emit_alu_rr(mc, 1, 0x09, X64_R11, X64_RAX); /* or r11, rax */ + emit_sse_rr_w(mc, prefix2, 0x2A, 1, rd, X64_R11); + emit_sse_rr(mc, prefix2, 0x58, rd, rd); /* addss/addsd dst, dst */ + + mc->label_place(mc, L_done); + return; } if (k == CV_ITOF_U) { /* u32→fp: zero-extend to 64-bit, then signed cvtsi2sd works. */ @@ -847,7 +904,45 @@ static void x_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) { int w_dst = type_is_64(dst.type) ? 1 : 0; u8 prefix2 = type_is_fp_double(src.type) ? 0xF2 : 0xF3; if (k == CV_FTOI_U && w_dst == 1) { - compiler_panic(t->c, a->loc, "x64 convert: fp→u64 not yet implemented"); + static const u8 two63_f64[8] = {0, 0, 0, 0, 0, 0, 0xE0, 0x43}; + static const u8 two63_f32[4] = {0, 0, 0, 0x5F}; + MCLabel L_small = mc->label_new(mc); + MCLabel L_done = mc->label_new(mc); + ConstBytes cb; + Operand limit; + memset(&cb, 0, sizeof cb); + if (type_is_fp_double(src.type)) { + cb.bytes = two63_f64; + cb.size = 8; + cb.align = 8; + } else { + cb.bytes = two63_f32; + cb.size = 4; + cb.align = 4; + } + cb.type = src.type; + memset(&limit, 0, sizeof limit); + limit.kind = OPK_REG; + limit.cls = RC_FP; + limit.type = src.type; + limit.v.reg = X64_XMM15; + x_load_const(t, limit, cb); + + emit_sse_rr(mc, type_is_fp_double(src.type) ? 0x66 : 0, 0x2E, rs, + X64_XMM15); + emit_jcc_label(mc, X64_CC_B, L_small); + + emit_sse_rr(mc, prefix2, 0x10, X64_XMM0 + 14, rs); + emit_sse_rr(mc, prefix2, 0x5C, X64_XMM0 + 14, X64_XMM15); + emit_sse_rr_w(mc, prefix2, 0x2C, 1, rd, X64_XMM0 + 14); + x64_emit_load_imm(mc, 1, X64_R11, -9223372036854775807LL - 1LL); + emit_alu_rr(mc, 1, 0x31, rd, X64_R11); /* xor sign bit */ + emit_jmp_label(mc, L_done); + + mc->label_place(mc, L_small); + emit_sse_rr_w(mc, prefix2, 0x2C, 1, rd, rs); + mc->label_place(mc, L_done); + return; } emit_sse_rr_w(mc, prefix2, 0x2C, w_dst, rd, rs); return; @@ -1172,6 +1267,12 @@ static void x_call(CGTarget* t, const CGCallDesc* d) { MCEmitter* mc = t->mc; u32 next_int = 0, next_fp = 0, stack_off = 0; + int requested_tail = (d->flags & CG_CALL_TAIL) != 0; + int tail_ok = 1; + if (requested_tail) { + u32 tail_stack = x_call_stack_size(t, d); + tail_ok = tail_stack <= a->next_param_stack; + } /* sret: caller puts destination pointer in rdi. */ if (d->abi && d->abi->has_sret) { @@ -1185,10 +1286,10 @@ static void x_call(CGTarget* t, const CGCallDesc* d) { } for (u32 i = 0; i < d->nargs; ++i) { emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off, - (d->flags & CG_CALL_TAIL) != 0); + requested_tail && tail_ok); } u32 needed = (stack_off + 15u) & ~15u; - if ((d->flags & CG_CALL_TAIL) == 0 && needed > a->max_outgoing) { + if ((!requested_tail || !tail_ok) && needed > a->max_outgoing) { if (a->known_frame) compiler_panic(t->c, a->loc, "x64 call: known frame outgoing area too small"); @@ -1200,7 +1301,7 @@ static void x_call(CGTarget* t, const CGCallDesc* d) { x64_emit_load_imm(mc, 0, X64_RAX, (i64)next_fp); } - if (d->flags & CG_CALL_TAIL) { + if (requested_tail && tail_ok) { if (d->abi && d->abi->has_sret) compiler_panic(t->c, a->loc, "x64 tail call: sret unsupported"); x_check_tail_stack_args(t, stack_off); diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c @@ -174,11 +174,17 @@ static u32 x_return_reg_mask(CGTarget* t, const ABIFuncInfo* abi, return mask; } +static int x_is_void_ret_storage(Operand op) { + return op.kind == OPK_IMM && op.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID); +} + static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { memset(out, 0, sizeof *out); out->callee = d->callee; out->flags = d->flags; out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0; + if ((out->flags & CG_CALL_TAIL) && out->stack_arg_size) + out->flags &= ~CG_CALL_TAIL; out->has_sret = d->abi && d->abi->has_sret; out->is_variadic = d->abi && d->abi->variadic; for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) { @@ -267,9 +273,10 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { } } out->variadic_fp_count = (u8)next_fp; - if ((d->flags & CG_CALL_TAIL) == 0 && + if ((out->flags & CG_CALL_TAIL) == 0 && d->abi && d->abi->ret.kind != ABI_ARG_IGNORE && - d->abi->ret.kind != ABI_ARG_INDIRECT) { + d->abi->ret.kind != ABI_ARG_INDIRECT && + !x_is_void_ret_storage(d->ret.storage)) { u32 ni = 0, nf = 0; static const u32 rregs[2] = {X64_RAX, X64_RDX}; for (u16 i = 0; i < d->abi->ret.nparts; ++i) {