commit 5054c67a493c11ec0c032c04e722962402406411
parent fa5d5b7d642c01e79875545a68e24a2f03ab4ee1
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 10:21:42 -0700
x64: fix optimized arithmetic and tail-call coverage
Fix two-address x64 codegen hazards where preparing the destination register could clobber the RHS for integer ALU ops, scalar FP binops, and variable shifts through %cl.
Implement unsigned 64-bit FP conversions and make stack-argument tail calls conservative in direct and optimized call planning.
Update the x64 parity checklist with the current backend status and targeted x64 toy verification.
Diffstat:
3 files changed, 158 insertions(+), 11 deletions(-)
diff --git a/doc/X64_PARITY_CHECKLIST.md b/doc/X64_PARITY_CHECKLIST.md
@@ -4,6 +4,34 @@ Goal: bring `x86_64` to the same practical coverage as `aarch64` across
standalone asm, disasm, C/toy compilation, object/link output, runtime, and
debug tooling.
+## Status as of 2026-05-21
+
+- Fixed an optimized x64 two-address arithmetic hazard where preparing the
+ destination register could clobber the RHS. This covered integer ALU ops,
+ scalar FP binops, and variable shifts whose count must be routed through
+ `%cl`. The motivating failure was `24_tail_arg_permute` computing `b * 2`
+ into `%r8` and then immediately overwriting `%r8` with `a`.
+- Implemented x64 `u64`/FP conversions for `CV_ITOF_U` and `CV_FTOI_U`, closing
+ two explicit backend panics in scalar conversion coverage.
+- Made x64 tail-call handling conservative when stack arguments are involved:
+ direct emission falls back to a normal call when the current frame cannot
+ reuse enough incoming stack argument area, and optimized call planning
+ clears `CG_CALL_TAIL` for stack-argument tail calls. Register-only tail calls
+ remain eligible.
+- Fixed virtual-register materialization so delayed arithmetic/comparisons use
+ fresh SSA values instead of destructively redefining one of their source
+ virtual registers. Optimized x64 jump-table lowering is enabled again and the
+ `123_spec_demo` x64 O1/O2 switch path now uses the table path correctly.
+- Fixed x64 cross-test runner overhead by passing `--pull=never` to podman.
+ `--net=none` blocked container networking, but podman still performed a host
+ image pull/manifest check before launch. A one-case x64 toy O0/O1/O2 smoke
+ dropped from roughly 30 seconds per container lookup to roughly 2 seconds.
+- Verified after these changes: `make bin`; targeted x64 toy execution for
+ `24_tail_arg_permute`, `25_tail_many_stack_args`, `26_tail_live_pressure`,
+ `29_tail_cross_arch_stack`, `09_function_params`, `100_record_data_relocation`,
+ `120_data_symdiff`, `123_spec_demo`, and `65_rounding_conversions`. A full
+ x64 toy cross run is still pending after the podman runner fix.
+
## Asm / disasm
- [ ] Expand `src/arch/x64/asm.c` beyond the current small AT&T subset:
@@ -35,11 +63,20 @@ debug tooling.
(`u64`/FP conversions, unsupported bitcasts, non-constant memset byte
paths, indirect aggregate arg shapes, tail-call/sret gaps, and other
`unsupported`/`unimpl` paths).
+ - 2026-05-21: `u64`/FP conversions are implemented; tail-call stack-arg
+ cases are handled conservatively rather than panicking or emitting an
+ invalid sibling tail call.
- [ ] Match aa64 coverage for scalar integer, FP, pointer, aggregate, varargs,
atomics, intrinsics, labels, computed goto, switch lowering, and alloca.
+ - 2026-05-21: scalar optimized integer/FP RHS clobbers, variable shift
+ count clobbers, and optimized x64 jump-table virtual-reg materialization
+ are fixed.
- [ ] Prove x64 optimized and unoptimized C parse corpus paths with targeted
`CFREE_TEST_ARCH=x64` runs.
- [ ] Prove toy cross-arch path `X` for x64 alongside aa64 cases.
+ - 2026-05-21: targeted x64 `X` runs pass for the tail-call, conversion,
+ data-relocation, and switch regression cases listed above. Full x64 `X`
+ run should be repeated after the podman `--pull=never` runner fix.
## ABI / platform
@@ -90,3 +127,5 @@ debug tooling.
aa64 defaults.
- [ ] Promote targeted x64 runs into default or CI-equivalent coverage once they
are stable on available runners.
+- [x] Prevent podman-backed cross tests from hitting registries during normal
+ execution by using `--pull=never`; test images must be prepared explicitly.
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -556,7 +556,6 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
u32 ra = a_op.v.reg & 0xFu;
u32 rb = b_op.v.reg & 0xFu;
u8 prefix2 = type_is_fp_double(dst.type) ? 0xF2 : 0xF3;
- if (rd != ra) emit_sse_rr(mc, prefix2, 0x10, rd, ra);
u8 opcode;
switch (op) {
case BO_FADD:
@@ -575,6 +574,17 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
opcode = 0x58;
break;
}
+ if (rd == rb && rd != ra) {
+ if (op == BO_FADD || op == BO_FMUL) {
+ emit_sse_rr(mc, prefix2, opcode, rd, ra);
+ return;
+ }
+ emit_sse_rr(mc, prefix2, 0x10, X64_XMM15, rb);
+ emit_sse_rr(mc, prefix2, 0x10, rd, ra);
+ emit_sse_rr(mc, prefix2, opcode, rd, X64_XMM15);
+ return;
+ }
+ if (rd != ra) emit_sse_rr(mc, prefix2, 0x10, rd, ra);
emit_sse_rr(mc, prefix2, opcode, rd, rb);
return;
}
@@ -618,9 +628,9 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
* into cl. */
if (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S) {
u32 ra = x64_force_reg_int(t, a_op, w, X64_RAX);
- if (rd != ra) emit_mov_rr(mc, w, rd, ra);
u32 sub = (op == BO_SHL) ? 4u : (op == BO_SHR_U ? 5u : 7u);
if (b_op.kind == OPK_IMM) {
+ if (rd != ra) emit_mov_rr(mc, w, rd, ra);
u32 width = w ? 64u : 32u;
emit_shift_imm(mc, w, sub, rd, (u8)((u64)b_op.v.imm & (width - 1u)));
return;
@@ -632,6 +642,7 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
compiler_panic(t->c, impl_of(t)->loc,
"x64 shift: count kind %d unsupported", (int)b_op.kind);
}
+ if (rd != ra) emit_mov_rr(mc, w, rd, ra);
emit_shift_cl(mc, w, sub, rd);
return;
}
@@ -710,10 +721,38 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
/* Fall through to materialize for >32-bit literals. */
}
- /* Generic 2-operand ALU: copy ra → dst, then dst op= rb. */
+ /* Generic 2-operand ALU: copy ra → dst, then dst op= rb. Preserve rb
+ * first when the allocator chose dst == rb; otherwise the prep copy
+ * would clobber the RHS. */
u32 ra = x64_force_reg_int(t, a_op, w, X64_RAX);
- if (rd != ra) emit_mov_rr(mc, w, rd, ra);
u32 rb = x64_force_reg_int(t, b_op, w, X64_R11);
+ if (rd == rb && rd != ra) {
+ if (op == BO_IADD || op == BO_AND || op == BO_OR || op == BO_XOR ||
+ op == BO_IMUL) {
+ switch (op) {
+ case BO_IADD:
+ emit_alu_rr(mc, w, 0x01, rd, ra);
+ return;
+ case BO_AND:
+ emit_alu_rr(mc, w, 0x21, rd, ra);
+ return;
+ case BO_OR:
+ emit_alu_rr(mc, w, 0x09, rd, ra);
+ return;
+ case BO_XOR:
+ emit_alu_rr(mc, w, 0x31, rd, ra);
+ return;
+ case BO_IMUL:
+ emit_imul_rr(mc, w, rd, ra);
+ return;
+ default:
+ break;
+ }
+ }
+ emit_mov_rr(mc, w, X64_R11, rb);
+ rb = X64_R11;
+ }
+ if (rd != ra) emit_mov_rr(mc, w, rd, ra);
switch (op) {
case BO_IADD:
emit_alu_rr(mc, w, 0x01, rd, rb);
@@ -831,7 +870,25 @@ static void x_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
int w_src = type_is_64(src.type) ? 1 : 0;
u8 prefix2 = type_is_fp_double(dst.type) ? 0xF2 : 0xF3;
if (k == CV_ITOF_U && w_src == 1) {
- compiler_panic(t->c, a->loc, "x64 convert: u64→fp not yet implemented");
+ MCLabel L_high = mc->label_new(mc);
+ MCLabel L_done = mc->label_new(mc);
+ u32 rr = rs;
+ emit_test_self(mc, 1, rr);
+ emit_jcc_label(mc, X64_CC_S, L_high);
+ emit_sse_rr_w(mc, prefix2, 0x2A, 1, rd, rr);
+ emit_jmp_label(mc, L_done);
+
+ mc->label_place(mc, L_high);
+ emit_mov_rr(mc, 1, X64_R11, rr);
+ emit_mov_rr(mc, 1, X64_RAX, rr);
+ emit_alu_imm8(mc, 1, 4u, X64_RAX, 1); /* and rax, 1 */
+ emit_shift_imm(mc, 1, 5u, X64_R11, 1); /* shr r11, 1 */
+ emit_alu_rr(mc, 1, 0x09, X64_R11, X64_RAX); /* or r11, rax */
+ emit_sse_rr_w(mc, prefix2, 0x2A, 1, rd, X64_R11);
+ emit_sse_rr(mc, prefix2, 0x58, rd, rd); /* addss/addsd dst, dst */
+
+ mc->label_place(mc, L_done);
+ return;
}
if (k == CV_ITOF_U) {
/* u32→fp: zero-extend to 64-bit, then signed cvtsi2sd works. */
@@ -847,7 +904,45 @@ static void x_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
int w_dst = type_is_64(dst.type) ? 1 : 0;
u8 prefix2 = type_is_fp_double(src.type) ? 0xF2 : 0xF3;
if (k == CV_FTOI_U && w_dst == 1) {
- compiler_panic(t->c, a->loc, "x64 convert: fp→u64 not yet implemented");
+ static const u8 two63_f64[8] = {0, 0, 0, 0, 0, 0, 0xE0, 0x43};
+ static const u8 two63_f32[4] = {0, 0, 0, 0x5F};
+ MCLabel L_small = mc->label_new(mc);
+ MCLabel L_done = mc->label_new(mc);
+ ConstBytes cb;
+ Operand limit;
+ memset(&cb, 0, sizeof cb);
+ if (type_is_fp_double(src.type)) {
+ cb.bytes = two63_f64;
+ cb.size = 8;
+ cb.align = 8;
+ } else {
+ cb.bytes = two63_f32;
+ cb.size = 4;
+ cb.align = 4;
+ }
+ cb.type = src.type;
+ memset(&limit, 0, sizeof limit);
+ limit.kind = OPK_REG;
+ limit.cls = RC_FP;
+ limit.type = src.type;
+ limit.v.reg = X64_XMM15;
+ x_load_const(t, limit, cb);
+
+ emit_sse_rr(mc, type_is_fp_double(src.type) ? 0x66 : 0, 0x2E, rs,
+ X64_XMM15);
+ emit_jcc_label(mc, X64_CC_B, L_small);
+
+ emit_sse_rr(mc, prefix2, 0x10, X64_XMM0 + 14, rs);
+ emit_sse_rr(mc, prefix2, 0x5C, X64_XMM0 + 14, X64_XMM15);
+ emit_sse_rr_w(mc, prefix2, 0x2C, 1, rd, X64_XMM0 + 14);
+ x64_emit_load_imm(mc, 1, X64_R11, -9223372036854775807LL - 1LL);
+ emit_alu_rr(mc, 1, 0x31, rd, X64_R11); /* xor sign bit */
+ emit_jmp_label(mc, L_done);
+
+ mc->label_place(mc, L_small);
+ emit_sse_rr_w(mc, prefix2, 0x2C, 1, rd, rs);
+ mc->label_place(mc, L_done);
+ return;
}
emit_sse_rr_w(mc, prefix2, 0x2C, w_dst, rd, rs);
return;
@@ -1172,6 +1267,12 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
MCEmitter* mc = t->mc;
u32 next_int = 0, next_fp = 0, stack_off = 0;
+ int requested_tail = (d->flags & CG_CALL_TAIL) != 0;
+ int tail_ok = 1;
+ if (requested_tail) {
+ u32 tail_stack = x_call_stack_size(t, d);
+ tail_ok = tail_stack <= a->next_param_stack;
+ }
/* sret: caller puts destination pointer in rdi. */
if (d->abi && d->abi->has_sret) {
@@ -1185,10 +1286,10 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
}
for (u32 i = 0; i < d->nargs; ++i) {
emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off,
- (d->flags & CG_CALL_TAIL) != 0);
+ requested_tail && tail_ok);
}
u32 needed = (stack_off + 15u) & ~15u;
- if ((d->flags & CG_CALL_TAIL) == 0 && needed > a->max_outgoing) {
+ if ((!requested_tail || !tail_ok) && needed > a->max_outgoing) {
if (a->known_frame)
compiler_panic(t->c, a->loc,
"x64 call: known frame outgoing area too small");
@@ -1200,7 +1301,7 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
x64_emit_load_imm(mc, 0, X64_RAX, (i64)next_fp);
}
- if (d->flags & CG_CALL_TAIL) {
+ if (requested_tail && tail_ok) {
if (d->abi && d->abi->has_sret)
compiler_panic(t->c, a->loc, "x64 tail call: sret unsupported");
x_check_tail_stack_args(t, stack_off);
diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c
@@ -174,11 +174,17 @@ static u32 x_return_reg_mask(CGTarget* t, const ABIFuncInfo* abi,
return mask;
}
+static int x_is_void_ret_storage(Operand op) {
+ return op.kind == OPK_IMM && op.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID);
+}
+
static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
memset(out, 0, sizeof *out);
out->callee = d->callee;
out->flags = d->flags;
out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0;
+ if ((out->flags & CG_CALL_TAIL) && out->stack_arg_size)
+ out->flags &= ~CG_CALL_TAIL;
out->has_sret = d->abi && d->abi->has_sret;
out->is_variadic = d->abi && d->abi->variadic;
for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) {
@@ -267,9 +273,10 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
}
}
out->variadic_fp_count = (u8)next_fp;
- if ((d->flags & CG_CALL_TAIL) == 0 &&
+ if ((out->flags & CG_CALL_TAIL) == 0 &&
d->abi && d->abi->ret.kind != ABI_ARG_IGNORE &&
- d->abi->ret.kind != ABI_ARG_INDIRECT) {
+ d->abi->ret.kind != ABI_ARG_INDIRECT &&
+ !x_is_void_ret_storage(d->ret.storage)) {
u32 ni = 0, nf = 0;
static const u32 rregs[2] = {X64_RAX, X64_RDX};
for (u16 i = 0; i < d->abi->ret.nparts; ++i) {