x64: fix optimized arithmetic and tail-call coverage - kit

commit 5054c67a493c11ec0c032c04e722962402406411
parent fa5d5b7d642c01e79875545a68e24a2f03ab4ee1
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 21 May 2026 10:21:42 -0700

x64: fix optimized arithmetic and tail-call coverage

Fix two-address x64 codegen hazards where preparing the destination register could clobber the RHS for integer ALU ops, scalar FP binops, and variable shifts through %cl.

Implement unsigned 64-bit FP conversions and make stack-argument tail calls conservative in direct and optimized call planning.

Update the x64 parity checklist with the current backend status and targeted x64 toy verification.

Diffstat:
M doc/X64_PARITY_CHECKLIST.md  | 39 +++++++++++++++++++++++++++++++++++++++
M src/arch/x64/ops.c  | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M src/arch/x64/opt_coord.c  | 11 +++++++++--

3 files changed, 158 insertions(+), 11 deletions(-)
diff --git a/doc/X64_PARITY_CHECKLIST.md b/doc/X64_PARITY_CHECKLIST.md
@@ -4,6 +4,34 @@ Goal: bring `x86_64` to the same practical coverage as `aarch64` across
 standalone asm, disasm, C/toy compilation, object/link output, runtime, and
 debug tooling.
 
+## Status as of 2026-05-21
+
+- Fixed an optimized x64 two-address arithmetic hazard where preparing the
+  destination register could clobber the RHS. This covered integer ALU ops,
+  scalar FP binops, and variable shifts whose count must be routed through
+  `%cl`. The motivating failure was `24_tail_arg_permute` computing `b * 2`
+  into `%r8` and then immediately overwriting `%r8` with `a`.
+- Implemented x64 `u64`/FP conversions for `CV_ITOF_U` and `CV_FTOI_U`, closing
+  two explicit backend panics in scalar conversion coverage.
+- Made x64 tail-call handling conservative when stack arguments are involved:
+  direct emission falls back to a normal call when the current frame cannot
+  reuse enough incoming stack argument area, and optimized call planning
+  clears `CG_CALL_TAIL` for stack-argument tail calls. Register-only tail calls
+  remain eligible.
+- Fixed virtual-register materialization so delayed arithmetic/comparisons use
+  fresh SSA values instead of destructively redefining one of their source
+  virtual registers. Optimized x64 jump-table lowering is enabled again and the
+  `123_spec_demo` x64 O1/O2 switch path now uses the table path correctly.
+- Fixed x64 cross-test runner overhead by passing `--pull=never` to podman.
+  `--net=none` blocked container networking, but podman still performed a host
+  image pull/manifest check before launch. A one-case x64 toy O0/O1/O2 smoke
+  dropped from roughly 30 seconds per container lookup to roughly 2 seconds.
+- Verified after these changes: `make bin`; targeted x64 toy execution for
+  `24_tail_arg_permute`, `25_tail_many_stack_args`, `26_tail_live_pressure`,
+  `29_tail_cross_arch_stack`, `09_function_params`, `100_record_data_relocation`,
+  `120_data_symdiff`, `123_spec_demo`, and `65_rounding_conversions`. A full
+  x64 toy cross run is still pending after the podman runner fix.
+
 ## Asm / disasm
 
 - [ ] Expand `src/arch/x64/asm.c` beyond the current small AT&T subset:
@@ -35,11 +63,20 @@ debug tooling.
       (`u64`/FP conversions, unsupported bitcasts, non-constant memset byte
       paths, indirect aggregate arg shapes, tail-call/sret gaps, and other
       `unsupported`/`unimpl` paths).
+      - 2026-05-21: `u64`/FP conversions are implemented; tail-call stack-arg
+        cases are handled conservatively rather than panicking or emitting an
+        invalid sibling tail call.
 - [ ] Match aa64 coverage for scalar integer, FP, pointer, aggregate, varargs,
       atomics, intrinsics, labels, computed goto, switch lowering, and alloca.
+      - 2026-05-21: scalar optimized integer/FP RHS clobbers, variable shift
+        count clobbers, and optimized x64 jump-table virtual-reg materialization
+        are fixed.
 - [ ] Prove x64 optimized and unoptimized C parse corpus paths with targeted
       `CFREE_TEST_ARCH=x64` runs.
 - [ ] Prove toy cross-arch path `X` for x64 alongside aa64 cases.
+      - 2026-05-21: targeted x64 `X` runs pass for the tail-call, conversion,
+        data-relocation, and switch regression cases listed above. Full x64 `X`
+        run should be repeated after the podman `--pull=never` runner fix.
 
 ## ABI / platform
 
@@ -90,3 +127,5 @@ debug tooling.
       aa64 defaults.
 - [ ] Promote targeted x64 runs into default or CI-equivalent coverage once they
       are stable on available runners.
+- [x] Prevent podman-backed cross tests from hitting registries during normal
+      execution by using `--pull=never`; test images must be prepared explicitly.
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -556,7 +556,6 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
     u32 ra = a_op.v.reg & 0xFu;
     u32 rb = b_op.v.reg & 0xFu;
     u8 prefix2 = type_is_fp_double(dst.type) ? 0xF2 : 0xF3;
-    if (rd != ra) emit_sse_rr(mc, prefix2, 0x10, rd, ra);
     u8 opcode;
     switch (op) {
       case BO_FADD:
@@ -575,6 +574,17 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
         opcode = 0x58;
         break;
     }
+    if (rd == rb && rd != ra) {
+      if (op == BO_FADD || op == BO_FMUL) {
+        emit_sse_rr(mc, prefix2, opcode, rd, ra);
+        return;
+      }
+      emit_sse_rr(mc, prefix2, 0x10, X64_XMM15, rb);
+      emit_sse_rr(mc, prefix2, 0x10, rd, ra);
+      emit_sse_rr(mc, prefix2, opcode, rd, X64_XMM15);
+      return;
+    }
+    if (rd != ra) emit_sse_rr(mc, prefix2, 0x10, rd, ra);
     emit_sse_rr(mc, prefix2, opcode, rd, rb);
     return;
   }
@@ -618,9 +628,9 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
    * into cl. */
   if (op == BO_SHL || op == BO_SHR_U || op == BO_SHR_S) {
     u32 ra = x64_force_reg_int(t, a_op, w, X64_RAX);
-    if (rd != ra) emit_mov_rr(mc, w, rd, ra);
     u32 sub = (op == BO_SHL) ? 4u : (op == BO_SHR_U ? 5u : 7u);
     if (b_op.kind == OPK_IMM) {
+      if (rd != ra) emit_mov_rr(mc, w, rd, ra);
       u32 width = w ? 64u : 32u;
       emit_shift_imm(mc, w, sub, rd, (u8)((u64)b_op.v.imm & (width - 1u)));
       return;
@@ -632,6 +642,7 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
       compiler_panic(t->c, impl_of(t)->loc,
                      "x64 shift: count kind %d unsupported", (int)b_op.kind);
     }
+    if (rd != ra) emit_mov_rr(mc, w, rd, ra);
     emit_shift_cl(mc, w, sub, rd);
     return;
   }
@@ -710,10 +721,38 @@ static void x_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
     /* Fall through to materialize for >32-bit literals. */
   }
 
-  /* Generic 2-operand ALU: copy ra → dst, then dst op= rb. */
+  /* Generic 2-operand ALU: copy ra → dst, then dst op= rb.  Preserve rb
+   * first when the allocator chose dst == rb; otherwise the prep copy
+   * would clobber the RHS. */
   u32 ra = x64_force_reg_int(t, a_op, w, X64_RAX);
-  if (rd != ra) emit_mov_rr(mc, w, rd, ra);
   u32 rb = x64_force_reg_int(t, b_op, w, X64_R11);
+  if (rd == rb && rd != ra) {
+    if (op == BO_IADD || op == BO_AND || op == BO_OR || op == BO_XOR ||
+        op == BO_IMUL) {
+      switch (op) {
+        case BO_IADD:
+          emit_alu_rr(mc, w, 0x01, rd, ra);
+          return;
+        case BO_AND:
+          emit_alu_rr(mc, w, 0x21, rd, ra);
+          return;
+        case BO_OR:
+          emit_alu_rr(mc, w, 0x09, rd, ra);
+          return;
+        case BO_XOR:
+          emit_alu_rr(mc, w, 0x31, rd, ra);
+          return;
+        case BO_IMUL:
+          emit_imul_rr(mc, w, rd, ra);
+          return;
+        default:
+          break;
+      }
+    }
+    emit_mov_rr(mc, w, X64_R11, rb);
+    rb = X64_R11;
+  }
+  if (rd != ra) emit_mov_rr(mc, w, rd, ra);
   switch (op) {
     case BO_IADD:
       emit_alu_rr(mc, w, 0x01, rd, rb);
@@ -831,7 +870,25 @@ static void x_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
       int w_src = type_is_64(src.type) ? 1 : 0;
       u8 prefix2 = type_is_fp_double(dst.type) ? 0xF2 : 0xF3;
       if (k == CV_ITOF_U && w_src == 1) {
-        compiler_panic(t->c, a->loc, "x64 convert: u64→fp not yet implemented");
+        MCLabel L_high = mc->label_new(mc);
+        MCLabel L_done = mc->label_new(mc);
+        u32 rr = rs;
+        emit_test_self(mc, 1, rr);
+        emit_jcc_label(mc, X64_CC_S, L_high);
+        emit_sse_rr_w(mc, prefix2, 0x2A, 1, rd, rr);
+        emit_jmp_label(mc, L_done);
+
+        mc->label_place(mc, L_high);
+        emit_mov_rr(mc, 1, X64_R11, rr);
+        emit_mov_rr(mc, 1, X64_RAX, rr);
+        emit_alu_imm8(mc, 1, 4u, X64_RAX, 1); /* and rax, 1 */
+        emit_shift_imm(mc, 1, 5u, X64_R11, 1); /* shr r11, 1 */
+        emit_alu_rr(mc, 1, 0x09, X64_R11, X64_RAX); /* or r11, rax */
+        emit_sse_rr_w(mc, prefix2, 0x2A, 1, rd, X64_R11);
+        emit_sse_rr(mc, prefix2, 0x58, rd, rd); /* addss/addsd dst, dst */
+
+        mc->label_place(mc, L_done);
+        return;
       }
       if (k == CV_ITOF_U) {
         /* u32→fp: zero-extend to 64-bit, then signed cvtsi2sd works. */
@@ -847,7 +904,45 @@ static void x_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
       int w_dst = type_is_64(dst.type) ? 1 : 0;
       u8 prefix2 = type_is_fp_double(src.type) ? 0xF2 : 0xF3;
       if (k == CV_FTOI_U && w_dst == 1) {
-        compiler_panic(t->c, a->loc, "x64 convert: fp→u64 not yet implemented");
+        static const u8 two63_f64[8] = {0, 0, 0, 0, 0, 0, 0xE0, 0x43};
+        static const u8 two63_f32[4] = {0, 0, 0, 0x5F};
+        MCLabel L_small = mc->label_new(mc);
+        MCLabel L_done = mc->label_new(mc);
+        ConstBytes cb;
+        Operand limit;
+        memset(&cb, 0, sizeof cb);
+        if (type_is_fp_double(src.type)) {
+          cb.bytes = two63_f64;
+          cb.size = 8;
+          cb.align = 8;
+        } else {
+          cb.bytes = two63_f32;
+          cb.size = 4;
+          cb.align = 4;
+        }
+        cb.type = src.type;
+        memset(&limit, 0, sizeof limit);
+        limit.kind = OPK_REG;
+        limit.cls = RC_FP;
+        limit.type = src.type;
+        limit.v.reg = X64_XMM15;
+        x_load_const(t, limit, cb);
+
+        emit_sse_rr(mc, type_is_fp_double(src.type) ? 0x66 : 0, 0x2E, rs,
+                    X64_XMM15);
+        emit_jcc_label(mc, X64_CC_B, L_small);
+
+        emit_sse_rr(mc, prefix2, 0x10, X64_XMM0 + 14, rs);
+        emit_sse_rr(mc, prefix2, 0x5C, X64_XMM0 + 14, X64_XMM15);
+        emit_sse_rr_w(mc, prefix2, 0x2C, 1, rd, X64_XMM0 + 14);
+        x64_emit_load_imm(mc, 1, X64_R11, -9223372036854775807LL - 1LL);
+        emit_alu_rr(mc, 1, 0x31, rd, X64_R11); /* xor sign bit */
+        emit_jmp_label(mc, L_done);
+
+        mc->label_place(mc, L_small);
+        emit_sse_rr_w(mc, prefix2, 0x2C, 1, rd, rs);
+        mc->label_place(mc, L_done);
+        return;
       }
       emit_sse_rr_w(mc, prefix2, 0x2C, w_dst, rd, rs);
       return;
@@ -1172,6 +1267,12 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
   MCEmitter* mc = t->mc;
 
   u32 next_int = 0, next_fp = 0, stack_off = 0;
+  int requested_tail = (d->flags & CG_CALL_TAIL) != 0;
+  int tail_ok = 1;
+  if (requested_tail) {
+    u32 tail_stack = x_call_stack_size(t, d);
+    tail_ok = tail_stack <= a->next_param_stack;
+  }
 
   /* sret: caller puts destination pointer in rdi. */
   if (d->abi && d->abi->has_sret) {
@@ -1185,10 +1286,10 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
   }
   for (u32 i = 0; i < d->nargs; ++i) {
     emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off,
-                   (d->flags & CG_CALL_TAIL) != 0);
+                   requested_tail && tail_ok);
   }
   u32 needed = (stack_off + 15u) & ~15u;
-  if ((d->flags & CG_CALL_TAIL) == 0 && needed > a->max_outgoing) {
+  if ((!requested_tail || !tail_ok) && needed > a->max_outgoing) {
     if (a->known_frame)
       compiler_panic(t->c, a->loc,
                      "x64 call: known frame outgoing area too small");
@@ -1200,7 +1301,7 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
     x64_emit_load_imm(mc, 0, X64_RAX, (i64)next_fp);
   }
 
-  if (d->flags & CG_CALL_TAIL) {
+  if (requested_tail && tail_ok) {
     if (d->abi && d->abi->has_sret)
       compiler_panic(t->c, a->loc, "x64 tail call: sret unsupported");
     x_check_tail_stack_args(t, stack_off);
diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c
@@ -174,11 +174,17 @@ static u32 x_return_reg_mask(CGTarget* t, const ABIFuncInfo* abi,
   return mask;
 }
 
+static int x_is_void_ret_storage(Operand op) {
+  return op.kind == OPK_IMM && op.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID);
+}
+
 static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
   memset(out, 0, sizeof *out);
   out->callee = d->callee;
   out->flags = d->flags;
   out->stack_arg_size = t->call_stack_size ? t->call_stack_size(t, d) : 0;
+  if ((out->flags & CG_CALL_TAIL) && out->stack_arg_size)
+    out->flags &= ~CG_CALL_TAIL;
   out->has_sret = d->abi && d->abi->has_sret;
   out->is_variadic = d->abi && d->abi->variadic;
   for (u32 c = 0; c < CG_CALL_PLAN_REG_CLASSES; ++c) {
@@ -267,9 +273,10 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) {
     }
   }
   out->variadic_fp_count = (u8)next_fp;
-  if ((d->flags & CG_CALL_TAIL) == 0 &&
+  if ((out->flags & CG_CALL_TAIL) == 0 &&
       d->abi && d->abi->ret.kind != ABI_ARG_IGNORE &&
-      d->abi->ret.kind != ABI_ARG_INDIRECT) {
+      d->abi->ret.kind != ABI_ARG_INDIRECT &&
+      !x_is_void_ret_storage(d->ret.storage)) {
     u32 ni = 0, nf = 0;
     static const u32 rregs[2] = {X64_RAX, X64_RDX};
     for (u16 i = 0; i < d->abi->ret.nparts; ++i) {

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	doc/X64_PARITY_CHECKLIST.md	\|	39	+++++++++++++++++++++++++++++++++++++++
M	src/arch/x64/ops.c	\|	119	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M	src/arch/x64/opt_coord.c	\|	11	+++++++++--