commit 1bd5bf26465edcb99077f453fc6fd9e50b067ebe
parent 0f2afe857c1def3db84cb16df810d4b2bf071258
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sun, 10 May 2026 16:06:46 -0700
x64 codegen complete
Diffstat:
| D | doc/X64.md | | | 206 | ------------------------------------------------------------------------------- |
| M | src/arch/x64.c | | | 1081 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- |
| M | test/link/harness/start.c | | | 26 | ++++++++++++++++++++++++++ |
3 files changed, 1007 insertions(+), 306 deletions(-)
diff --git a/doc/X64.md b/doc/X64.md
@@ -1,206 +0,0 @@
-# X64 codegen status
-
-Living checklist for the x86_64 (SysV AMD64, Linux ELF) backend
-(`src/arch/x64.c`) and ABI (`src/abi/abi_sysv_x64.c`). Behavioral
-oracles are `test/cg/` and (later) `test/parse/`. Phase status:
-
-- ✅ landed
-- 🚧 in progress
-- ⬜ planned
-
----
-
-## Test cg coverage
-
-Targeted scope: the **A–D spine** plus enough core ops (FP, sret,
-byval, scalar conversions, locals + indirect load/store, structured
-control flow, multi-function) that several follow-on groups
-incidentally pass. Path D (in-process JIT) is skipped on a non-x64
-host — the harness reports SKIP. Path E (qemu/podman exec) runs
-under `qemu-x86_64`.
-
-Current full-corpus result on x64 ELF, both opt-levels:
-**E: 230 pass / 154 fail / 0 skip.**
-
-| Group | Status | Notes |
-|-------|--------|-------|
-| MC-only | n/a | `mc_smoke` is aa64-bytes-only (excluded by arch mask) |
-| A — lifecycle | ✅ | a01–a10 |
-| B — params/locals | ✅ | b01–b08, including sret (b06), byval (b07), FP param via xmm0 (b08), and 9-int-param stack spill (b03) |
-| C — int arith | ✅ | c01–c12 |
-| D — cmp/branch | ✅ | d01–d13 |
-| E — conversions | ✅ | e01–e15 (SEXT/ZEXT/TRUNC/ITOF/FTOI/FEXT/FTRUNC/BITCAST) |
-| F — memory | ✅ except bitfields | f01–f11; 🚧 f12/f13 bitfields |
-| G — calls | ✅ except indirect | g02–g13; 🚧 g01 indirect call via reg (the synthesized fnptr type fails to classify cleanly through the stub ABI — TODO when callee.cls/type plumbing settles) |
-| H — control | ✅ | h01–h18 — SCOPE_LOOP / SCOPE_BLOCK bookkeeping suffices for while / do / for / switch / ternary |
-| I — alloca | ⬜ | needs `max_outgoing` patch site (placeholder ADD) and SP-from-RBP epilogue restoration |
-| J — varargs | ⬜ | needs SysV `__va_list_tag` GP/FP save areas in the prologue |
-| K — atomics | ⬜ | LOCK XADD / CMPXCHG / MFENCE family |
-| L — intrinsics | ⬜ | POPCNT / BSF / BSR / BSWAP / memcpy ABI lowering |
-| N — TLS LE | ⬜ | `mov rd, fs:0` + 32-bit TPOFF32 displacement |
-| O — globals | ⬜ except o11 | RIP-relative addressing for OPK_GLOBAL load/store/addr-of; o11 already passes because it only renames the text section |
-| P — DWARF | ✅ exit-code | p01–p07 pass on the value oracle; the W-path DWARF directives still depend on the stubbed `cfree_dwarf_*` consumers (same as on aa64/rv64) |
-| Q — multi-fn | ✅ except q11 | q01–q10 pass; 🚧 q11 needs `addr_of` for OPK_GLOBAL |
-
----
-
-## Phase 1 — Backend foundation ✅
-
-- ✅ Register pools: 13 int (rbx, r12..r15 callee-saved first, then
- r10, r11, rsi, rdi, rcx, rdx, r8, r9 caller-saved); 10 FP (xmm6..xmm15)
-- ✅ Frame layout: rbp-relative locals at negative offsets; callee-save
- area immediately below; outgoing args at `[rsp+0]` (16-aligned)
-- ✅ Prologue placeholder + func_end patch (mirrors aa64 / rv64)
-- ✅ Epilogue: restore callee-saves, `leave; ret`
-- ✅ MCEmitter fixup encodings already cover `R_PC32` for branches and
- PC-relative calls/jumps; no new fixup kinds needed for Groups A–D
-
-## Phase 2 — Core ops ✅
-
-- ✅ `load_imm`: 1B `mov r8, imm8`-via-MOV (32-bit) or `MOVABS` (64-bit)
-- ✅ `copy`, `load`, `store` (i8/i16/i32/i64 + float/double)
-- ✅ `addr_of` for OPK_LOCAL (`lea rd, [rbp - off]`)
-- ✅ `binop` (int): ADD/SUB/IMUL via reg-reg; SDIV/UDIV/SREM/UREM via
- CQO/CDQ + IDIV/DIV; AND/OR/XOR; SHL/SHR/SAR via `cl`
-- ✅ `unop` (NEG/BNOT/NOT-as-`!`)
-- ✅ `cmp` (materialize 0/1 via SETcc + MOVZX)
-- ✅ `cmp_branch` (CMP + Jcc rel32, R_PC32 fixup to MCLabel)
-- ✅ Structured `SCOPE_IF` / `else`; `SCOPE_LOOP` / `SCOPE_BLOCK`
- (label bookkeeping only — caller drives `label_place`/`jump`)
-- ✅ Calls (direct via `call rel32` + R_X64_PLT32; indirect via `call rax`)
-- ✅ Returns: scalar in rax / xmm0; multi-instruction `jmp epilogue`
-- ✅ Sret skeleton: incoming rdi spilled to a hidden slot at func_begin;
- the ret-indirect path memcpys src→[rdi] before branching to epilogue
-- ✅ FP scalar: `addss/addsd`, `cvtss2sd/cvtsd2ss`, `cvtsi2sd`, `cvttsd2si`,
- `movd/movq` for BITCAST, `movss/movsd` for load/store/copy
-- ✅ FP `load_const` via a fresh `.rodata` symbol + RIP-relative load
-- ✅ `convert`: SEXT (`movsx`/`movsxd`), ZEXT (`movzx`/zero high), TRUNC
- (no-op — narrower stores select width), FP↔int (CVTSI2S*/CVTTS*2SI),
- FEXT/FTRUNC (CVTSS2SD/CVTSD2SS), BITCAST (movd/movq)
-
-## Phase 3 — Remaining cg coverage ⬜
-
-- ⬜ Aggregate ops: `copy_bytes`, `set_bytes`, bitfields
-- ⬜ Calls: byval (b07), large struct byval (g08), HFA edges (rejected via
- ABI fallback to INDIRECT)
-- ⬜ Group H: SCOPE_LOOP/BLOCK with `break_to`/`continue_to` exercised by
- while/for/do-while/switch
-- ⬜ Group I: alloca (constant + runtime size, max_outgoing patch site)
-- ⬜ Group J: varargs (SysV `__va_list_tag` + gp/fp save areas)
-- ⬜ Group K: atomics (LOCK XADD / CMPXCHG / MFENCE)
-- ⬜ Group L: intrinsics (popcnt / bsf / bsr / bswap / memcpy ABI calls)
-- ⬜ Group N: TLS LE — `mov rd, fs:0` + 32-bit TPOFF32 displacement
-- ⬜ Group O: globals via RIP-relative addressing
-- ⬜ Group P: DWARF line/subprogram (driven by Debug; backend forwards locs)
-
-## Phase 4 — opt-cgtarget equivalence ⬜
-
-- ⬜ Confirm L1/L2 (opt-wrapped) cg paths match L0 on the spine
-- ⬜ Same equivalence on the full corpus once Phase 3 lands
-
-## Phase 5 — test-parse on x64 ⬜
-
-Same pattern as rv64 phase 5 — `test/parse/` is the file-driven C
-parser harness. Plan: run `CFREE_TEST_ARCH=x64 make test-parse` after
-Phase 3 stabilizes and triage failures, then mirror RV64's per-case
-opt-out scheme for arch-specific cases.
-
----
-
-## Open follow-ups
-
-- Caller-saved register spilling around calls. The current pool hands
- out caller-saved regs only after callee-saved are exhausted; cases
- that hold a caller-saved reg live across a call (heavy register
- pressure with a call in the middle) will mis-execute. The corpus
- used to be designed so the first-allocated reg is callee-saved
- (g11_caller_saved_live_across_call), but this is fragile — the
- full Phase 3 plan tracks an explicit "live across call" annotation.
-- Variadic FP register save area. Today the prologue spills only
- int arg regs because varargs aren't reached; the save layout has
- to mirror the SysV `__va_list_tag` once Group J lands.
-- CFI directives are no-ops (debug.h's CFI fanout is unwired across
- arches at present); revisit when `.eh_frame` lands.
-
-
-
-### Currently failing cg tests
-
- f12_bitfield_unsigned/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/f12_bitfield_unsigned/emit.err)
- f13_bitfield_signed/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/f13_bitfield_signed/emit.err)
- g01_indirect_call/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/g01_indirect_call/emit.err)
- i01_alloca_const_int/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i01_alloca_const_int/emit.err)
- i02_alloca_runtime_size/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i02_alloca_runtime_size/emit.err)
- i03_alloca_align_16/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i03_alloca_align_16/emit.err)
- i04_alloca_in_loop_distinct/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i04_alloca_in_loop_distinct/emit.err)
- i05_alloca_then_call/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i05_alloca_then_call/emit.err)
- i06_two_allocas_disjoint/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i06_two_allocas_disjoint/emit.err)
- i07_alloca_addr_escapes/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i07_alloca_addr_escapes/emit.err)
- i08_vla_param_sum/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i08_vla_param_sum/emit.err)
- i09_alloca_preserves_locals/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i09_alloca_preserves_locals/emit.err)
- i10_alloca_after_named_local/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/i10_alloca_after_named_local/emit.err)
- j01_va_int_sum_3/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j01_va_int_sum_3/emit.err)
- j02_va_zero_args/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j02_va_zero_args/emit.err)
- j03_va_int_spill/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j03_va_int_spill/emit.err)
- j04_va_int64/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j04_va_int64/emit.err)
- j05_va_double_sum/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j05_va_double_sum/emit.err)
- j06_va_double_spill/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j06_va_double_spill/emit.err)
- j07_va_mixed_int_dbl/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j07_va_mixed_int_dbl/emit.err)
- j08_va_copy/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j08_va_copy/emit.err)
- j09_va_two_fixed/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/j09_va_two_fixed/emit.err)
- k01_atomic_load_relaxed/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k01_atomic_load_relaxed/emit.err)
- k02_atomic_store_load_acq/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k02_atomic_store_load_acq/emit.err)
- k03_atomic_load_seq_cst/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k03_atomic_load_seq_cst/emit.err)
- k04_atomic_rmw_add/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k04_atomic_rmw_add/emit.err)
- k05_atomic_rmw_xchg/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k05_atomic_rmw_xchg/emit.err)
- k06_atomic_rmw_and/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k06_atomic_rmw_and/emit.err)
- k07_atomic_rmw_or/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k07_atomic_rmw_or/emit.err)
- k08_atomic_rmw_xor/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k08_atomic_rmw_xor/emit.err)
- k09_atomic_rmw_sub/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k09_atomic_rmw_sub/emit.err)
- k10_atomic_rmw_nand/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k10_atomic_rmw_nand/emit.err)
- k11_atomic_cas_success/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k11_atomic_cas_success/emit.err)
- k12_atomic_cas_failure/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k12_atomic_cas_failure/emit.err)
- k13_atomic_load_i64/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k13_atomic_load_i64/emit.err)
- k14_atomic_rmw_prior/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k14_atomic_rmw_prior/emit.err)
- k15_fence_seq_cst/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/k15_fence_seq_cst/emit.err)
- l01_popcount_u32/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l01_popcount_u32/emit.err)
- l02_popcount_u64/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l02_popcount_u64/emit.err)
- l03_ctz_u32/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l03_ctz_u32/emit.err)
- l04_clz_u32/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l04_clz_u32/emit.err)
- l05_bswap16/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l05_bswap16/emit.err)
- l06_bswap32/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l06_bswap32/emit.err)
- l07_bswap64/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l07_bswap64/emit.err)
- l08_memcpy_4/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l08_memcpy_4/emit.err)
- l09_memmove_overlap/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l09_memmove_overlap/emit.err)
- l10_memset_zero/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l10_memset_zero/emit.err)
- l11_memset_ff/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l11_memset_ff/emit.err)
- l12_expect_taken/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l12_expect_taken/emit.err)
- l13_unreachable_live/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l13_unreachable_live/emit.err)
- l14_trap_live/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l14_trap_live/emit.err)
- l15_prefetch_noop/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l15_prefetch_noop/emit.err)
- l16_assume_aligned/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l16_assume_aligned/emit.err)
- l17_add_overflow_no/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l17_add_overflow_no/emit.err)
- l18_add_overflow_yes/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l18_add_overflow_yes/emit.err)
- l19_sub_overflow_yes/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l19_sub_overflow_yes/emit.err)
- l20_mul_overflow_no/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/l20_mul_overflow_no/emit.err)
- n01_tls_load_le/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/n01_tls_load_le/emit.err)
- n02_tls_store_le/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/n02_tls_store_le/emit.err)
- n03_tls_addr_taken/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/n03_tls_addr_taken/emit.err)
- n04_tls_i64/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/n04_tls_i64/emit.err)
- n05_tls_in_loop/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/n05_tls_in_loop/emit.err)
- n06_tls_two_vars/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/n06_tls_two_vars/emit.err)
- n07_tls_bss_zero_init/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/n07_tls_bss_zero_init/emit.err)
- n08_tls_addend_offset/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/n08_tls_addend_offset/emit.err)
- o01_global_load_data/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o01_global_load_data/emit.err)
- o02_global_store_data/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o02_global_store_data/emit.err)
- o03_global_bss_zero/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o03_global_bss_zero/emit.err)
- o04_global_addr_taken/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o04_global_addr_taken/emit.err)
- o05_global_i64/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o05_global_i64/emit.err)
- o06_rodata_load/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o06_rodata_load/emit.err)
- o07_global_struct_field/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o07_global_struct_field/emit.err)
- o08_global_array_runtime_idx/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o08_global_array_runtime_idx/emit.err)
- o09_static_local_linkage/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o09_static_local_linkage/emit.err)
- o10_global_addend/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o10_global_addend/emit.err)
- o12_global_across_call/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/o12_global_across_call/emit.err)
- q11_addr_of_helper_through_global/emit (cg-runner --emit failed; see /Users/ryan/code/cfree/build/test/cg/q11_addr_of_helper_through_global/emit.err)
-
diff --git a/src/arch/x64.c b/src/arch/x64.c
@@ -9,19 +9,25 @@
* water mark are known.
*
* Reg allocator: lowest-bit-first over a fixed preference list. INT
- * pool has callee-saves (rbx, r12..r15) at the low bits, then a
- * caller-saved tail (r10, rdi, rsi, r8, r9) — so the first reg handed
- * out is callee-saved, which is what tests like
- * g11_caller_saved_live_across_call rely on. FP pool is xmm6..xmm15
- * (10 regs, all caller-saved on SysV).
+ * pool has callee-saves (rbx, r12..r15) at the low bits, then a single
+ * caller-saved tail (r10) — so the first reg handed out is callee-saved,
+ * which is what tests like g11_caller_saved_live_across_call rely on.
+ * FP pool is xmm6..xmm15 (10 regs, all caller-saved on SysV).
+ *
+ * The six SysV arg-passing GPRs (rdi, rsi, rdx, rcx, r8, r9) are
+ * deliberately kept OUT of the pool. If they were in the pool, the
+ * arg-emit loop in x_call could clobber an arg's source register
+ * before reading it: e.g. `mov rdi, [arg1_local]; mov r8d, edi` for
+ * arg5 reads the wrong edi. Mirrors aarch64, which keeps x0..x7 out
+ * of its allocator pool for the same reason.
*
* Scratches kept outside the pools: rax (primary), rcx, rdx, r11
* (secondary). rax is also the int return reg; xmm0 is the FP return
* reg.
*
- * Scope: the test/cg spine (Groups A–D plus call/local/sret/byval/FP
- * pieces of B). Methods past the spine panic with a clear message so
- * Phase 3 work has obvious landing pads — see doc/X64.md. */
+ * Scope: the test/cg spine (Groups A–H) plus alloca/VLA (Group I) and
+ * SysV varargs (Group J). Remaining unimplemented methods past that
+ * panic with a clear message. */
#include <string.h>
@@ -78,9 +84,9 @@ static int xpool_free(XRegPool* p, Reg r) {
return 0;
}
-static const u8 g_int_order[10] = {
+static const u8 g_int_order[6] = {
X64_RBX, X64_R12, X64_R13, X64_R14, X64_R15, /* callee-saved (n_cs=5) */
- X64_R10, X64_RDI, X64_RSI, X64_R8, X64_R9, /* caller-saved tail */
+ X64_R10, /* caller-saved tail */
};
static const u8 g_fp_order[10] = {
@@ -114,6 +120,13 @@ typedef struct XScope {
Label continue_label;
} XScope;
+/* One pending alloca fix-up.
+ *
+ * alloca emits a placeholder `lea dst, [rsp + 0]` whose disp32 is patched
+ * at func_end with the final max_outgoing value. disp_pos records the
+ * byte offset of that disp32 in the active text section. */
+typedef struct XAllocaPatch {
+ u32 disp_pos; /* where the 4 patched displacement bytes start */
+} XAllocaPatch;
+
typedef struct XImpl {
CGTarget base;
SrcLoc loc;
@@ -133,7 +146,11 @@ typedef struct XImpl {
u32 next_param_fp;
u32 next_param_stack;
u8 has_sret;
+ u8 has_alloca;
+ u8 is_variadic;
+ u8 pad0;
FrameSlot sret_ptr_slot;
+ FrameSlot reg_save_slot; /* variadic: 176-byte __va_list_tag reg save area */
XRegPool int_pool;
XRegPool fp_pool;
@@ -141,6 +158,10 @@ typedef struct XImpl {
XScope* scopes;
u32 nscopes;
u32 scopes_cap;
+
+ XAllocaPatch* alloca_patches;
+ u32 nalloca_patches;
+ u32 alloca_patches_cap;
} XImpl;
static XImpl* impl_of(CGTarget* t) { return (XImpl*)t; }
@@ -437,6 +458,18 @@ static void emit_shift_cl(MCEmitter* mc, int w, u32 sub, u32 reg) {
if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
}
+/* Shift r/m by imm8: opcode C1 /sub ib. sub: SHL=4, SHR=5, SAR=7.
+ *
+ *   mc  - emitter that receives the bytes
+ *   w   - forwarded to emit_rex as its width selector (callers in this
+ *         file pass 1 for 64-bit operands, 0 for 32-bit)
+ *   sub - opcode extension, placed in the ModRM reg field
+ *   reg - register shifted in place (ModRM rm field, mod=11)
+ *   imm - shift count, emitted verbatim as the trailing byte
+ *
+ * NOTE(review): reg is handed to modrm() without `& 7u`, unlike the newer
+ * emitters in this file; this presumably relies on modrm() masking to the
+ * low 3 bits while emit_rex supplies REX.B for r8..r15 — confirm. */
+static void emit_shift_imm(MCEmitter* mc, int w, u32 sub, u32 reg, u8 imm) {
+ u32 ofs = obj_pos(mc->obj, mc->section_id); /* start of insn, for the debug row */
+ emit_rex(mc, w, 0, 0, reg);
+ u8 buf[3];
+ buf[0] = 0xC1;
+ buf[1] = modrm(3u, sub, reg); /* mod=11: register-direct operand */
+ buf[2] = imm;
+ mc->emit_bytes(mc, buf, 3);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
static void emit_cqo_or_cdq(MCEmitter* mc, int w) {
if (w) {
u8 buf[2] = {X64_REX_BASE | X64_REX_W, 0x99};
@@ -581,13 +614,17 @@ static void x_func_begin(CGTarget* t, const CGFuncDesc* fd) {
a->next_param_fp = 0;
a->next_param_stack = 0;
a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
+ a->has_alloca = 0;
+ a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
a->cum_off = 0;
a->max_outgoing = 0;
- xpool_init(&a->int_pool, g_int_order, 10u, 5u);
+ xpool_init(&a->int_pool, g_int_order, 6u, 5u);
xpool_init(&a->fp_pool, g_fp_order, 10u, 0u);
a->nslots = 0;
a->nscopes = 0;
+ a->nalloca_patches = 0;
a->sret_ptr_slot = FRAME_SLOT_NONE;
+ a->reg_save_slot = FRAME_SLOT_NONE;
a->epilogue_label = mc->label_new(mc);
mc->cfi_startproc(mc);
@@ -607,6 +644,31 @@ static void x_func_begin(CGTarget* t, const CGFuncDesc* fd) {
/* Subsequent int args start at rsi (next_param_int = 1). */
a->next_param_int = 1;
}
+
+ /* Variadic: reserve the SysV reg-save area (rdi..r9 at +0..+40, then
+ * xmm0..xmm7 at +48..+160 with 16-byte stride) and emit the saves
+ * directly after the prologue placeholder so the original register
+ * args are preserved before x_param() spills the named ones. */
+ if (a->is_variadic) {
+ FrameSlotDesc rsd = {
+ .type = NULL, .name = 0, .loc = {0, 0, 0},
+ .size = 176, .align = 8, .kind = FS_SPILL, .flags = 0,
+ };
+ a->reg_save_slot = x_frame_slot(t, &rsd);
+ XSlot* rs = slot_get(a, a->reg_save_slot);
+ static const u32 gprs[6] = {X64_RDI, X64_RSI, X64_RDX,
+ X64_RCX, X64_R8, X64_R9};
+ for (u32 i = 0; i < 6; ++i) {
+ emit_mov_store(mc, 8, gprs[i], X64_RBP,
+ -(i32)rs->off + (i32)(i * 8u));
+ }
+ /* movsd writes the low 8 bytes of each xmm; va_arg reads 8 bytes per
+ * FP slot, so the upper half of the 16-byte stride stays unused. */
+ for (u32 i = 0; i < 8; ++i) {
+ emit_sse_store(mc, 0xF2, 0x11, (u32)(X64_XMM0 + i), X64_RBP,
+ -(i32)rs->off + (i32)(48u + i * 16u));
+ }
+ }
}
static u32 align_up_u32(u32 v, u32 a) { return (v + (a - 1u)) & ~(a - 1u); }
@@ -700,6 +762,20 @@ static void x_func_end(CGTarget* t) {
obj_patch(t->obj, a->fd->text_section_id, a->prologue_pos, buf,
X64_PROLOGUE_BYTES);
+ /* Patch each alloca's `lea dst, [rsp + 0]` disp32 with the final
+ * max_outgoing (already 16-aligned via the `(stack_off+15)&~15` round
+ * at every call site). */
+ for (u32 i = 0; i < a->nalloca_patches; ++i) {
+ u8 dbuf[4];
+ u32 m = a->max_outgoing;
+ dbuf[0] = (u8)m;
+ dbuf[1] = (u8)(m >> 8);
+ dbuf[2] = (u8)(m >> 16);
+ dbuf[3] = (u8)(m >> 24);
+ obj_patch(t->obj, a->fd->text_section_id,
+ a->alloca_patches[i].disp_pos, dbuf, 4);
+ }
+
/* Define the function symbol. */
u32 end = mc->pos(mc);
obj_symbol_define(t->obj, a->fd->sym, a->fd->text_section_id,
@@ -1119,12 +1195,37 @@ static u32 addr_base(CGTarget* t, Operand addr, i32* out_off) {
(int)addr.kind);
}
+/* Emit `lea rd, [rip + disp32]` and attach an R_X64_PLT32 reloc on the
+ * disp32 site. PLT32 is correct for both functions (linker may route
+ * through PLT) and data symbols (resolves to the symbol directly when
+ * no PLT is needed). Addend -4 because the PC is end-of-instruction.
+ *
+ *   t       - target whose mc receives the bytes and the reloc
+ *   dst_reg - destination register number (bit 3 goes to emit_rex, the
+ *             low 3 bits to ModRM.reg)
+ *   sym     - symbol whose address is materialized
+ *   addend  - byte offset added to the symbol address
+ *
+ * NOTE(review): the SysV psABI lists R_X86_64_PC32 as the relocation for
+ * RIP-relative data references; the PLT32 claim above holds only if the
+ * consuming linker treats PLT32 on data symbols like PC32 — confirm.
+ * NOTE(review): unlike emit_shift_imm / emit_lea_rsp_disp32 this emitter
+ * records no debug_emit_row — confirm that is intended. */
+static void emit_global_lea(CGTarget* t, u32 dst_reg, ObjSymId sym,
+ i64 addend) {
+ emit_rex(t->mc, 1, dst_reg, 0, 0);
+ u8 op = 0x8D; /* LEA r64, m */
+ t->mc->emit_bytes(t->mc, &op, 1);
+ u8 mr = modrm(0u, (dst_reg & 7u), 5u); /* [RIP + disp32] */
+ t->mc->emit_bytes(t->mc, &mr, 1);
+ u32 disp_pos = t->mc->pos(t->mc); /* reloc site: the 4 bytes emitted next */
+ emit_u32le(t->mc, 0);
+ /* The disp32 is the final field of the instruction, so the -4 bias makes
+ * the result relative to the next instruction's address. */
+ t->mc->emit_reloc_at(t->mc, t->mc->section_id, disp_pos, R_X64_PLT32, sym,
+ addend - 4, 1, 0);
+}
+
static void x_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) {
- XImpl* a = impl_of(t);
u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
if (addr.kind == OPK_GLOBAL) {
- compiler_panic(t->c, a->loc, "x64 load: OPK_GLOBAL not yet implemented");
+ /* Materialize &sym into R11, then load from [r11]. */
+ emit_global_lea(t, X64_R11, addr.v.global.sym, addr.v.global.addend);
+ if (dst.cls == RC_FP) {
+ u8 prefix2 = (sz == 8) ? 0xF2 : 0xF3;
+ emit_sse_load(t->mc, prefix2, 0x10, dst.v.reg & 0xFu, X64_R11, 0);
+ } else {
+ int signed_ = type_is_signed(ma.type ? ma.type : addr.type);
+ emit_mov_load(t->mc, sz, signed_, dst.v.reg & 0xFu, X64_R11, 0);
+ }
+ return;
}
i32 off;
@@ -1139,11 +1240,26 @@ static void x_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) {
}
static void x_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) {
- XImpl* a = impl_of(t);
u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
if (addr.kind == OPK_GLOBAL) {
- compiler_panic(t->c, a->loc, "x64 store: OPK_GLOBAL not yet implemented");
+ /* Materialize &sym into R11, then store via [r11]. The IMM source
+ * branch below uses RAX as a scratch for the value, so R11 stays
+ * untouched between the LEA and the store. */
+ emit_global_lea(t, X64_R11, addr.v.global.sym, addr.v.global.addend);
+ if (src.kind == OPK_IMM) {
+ int w = (sz == 8) ? 1 : 0;
+ emit_load_imm(t->mc, w, X64_RAX, src.v.imm);
+ emit_mov_store(t->mc, sz, X64_RAX, X64_R11, 0);
+ return;
+ }
+ if (src.cls == RC_FP) {
+ u8 prefix2 = (sz == 8) ? 0xF2 : 0xF3;
+ emit_sse_store(t->mc, prefix2, 0x11, src.v.reg & 0xFu, X64_R11, 0);
+ return;
+ }
+ emit_mov_store(t->mc, sz, src.v.reg & 0xFu, X64_R11, 0);
+ return;
}
i32 off;
@@ -1175,14 +1291,57 @@ static void x_addr_of(CGTarget* t, Operand dst, Operand lv) {
emit_lea(t->mc, dst.v.reg & 0xFu, lv.v.ind.base & 0xFu, lv.v.ind.ofs);
return;
}
+ if (lv.kind == OPK_GLOBAL) {
+ emit_global_lea(t, dst.v.reg & 0xFu, lv.v.global.sym, lv.v.global.addend);
+ return;
+ }
x_panic(t, "addr_of: kind unsupported");
}
-static void x_tls_addr_of(CGTarget* t, Operand d, ObjSymId s, i64 a) {
- (void)d;
- (void)s;
- (void)a;
- x_panic(t, "tls_addr_of");
+/* x86_64 TLS Local-Exec materialization.
+ * mov rd, fs:0 ; read thread pointer (FS base + 0)
+ * lea rd, [rd + sym@tpoff] ; add TP-relative offset
+ * The disp32 of the LEA carries an R_X64_TPOFF32 reloc; the linker fills
+ * in the signed TP-relative offset (negative under variant II — TLS image
+ * sits below the TCB that FS points at).
+ *
+ *   t      - target whose mc receives the bytes and the reloc
+ *   dst    - destination; only dst.v.reg (low 4 bits) is used
+ *   sym    - thread-local symbol
+ *   addend - passed through unchanged as the reloc addend
+ *
+ * No debug_emit_row is recorded for either instruction here. */
+static void x_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) {
+ MCEmitter* mc = t->mc;
+ u32 sec = mc->section_id;
+ u32 rd = dst.v.reg & 0xFu;
+
+ /* mov rd, qword ptr fs:[0]
+ * 64 [REX.W|REX.R] 8B mod=00/reg=rd/rm=100 sib(0,4,5) disp32=0 */
+ u8 fs_prefix = 0x64; /* FS segment override; emitted ahead of the REX byte */
+ mc->emit_bytes(mc, &fs_prefix, 1);
+ emit_rex(mc, 1, rd, 0, 0);
+ u8 op_mov = 0x8B;
+ mc->emit_bytes(mc, &op_mov, 1);
+ u8 mr1 = modrm(0u, rd & 7u, 4u);
+ mc->emit_bytes(mc, &mr1, 1);
+ u8 s1 = sib(0u, 4u, 5u); /* index=100 (none), base=101 with mod=00: disp32 only */
+ mc->emit_bytes(mc, &s1, 1);
+ emit_u32le(mc, 0);
+
+ /* lea rd, [rd + disp32]
+ * [REX.W|REX.R|REX.B] 8D mod=10/reg=rd/rm=rd [SIB if rd&7==4] disp32 */
+ emit_rex(mc, 1, rd, 0, rd);
+ u8 op_lea = 0x8D;
+ mc->emit_bytes(mc, &op_lea, 1);
+ u32 disp_pos;
+ if ((rd & 7u) == 4u) {
+ /* rm=100 selects a SIB byte, so a base whose low bits are 100 has to be
+ * named through the SIB base field instead. */
+ u8 mr2 = modrm(2u, rd & 7u, 4u);
+ mc->emit_bytes(mc, &mr2, 1);
+ u8 s2 = sib(0u, 4u, rd & 7u);
+ mc->emit_bytes(mc, &s2, 1);
+ disp_pos = mc->pos(mc);
+ emit_u32le(mc, 0);
+ } else {
+ u8 mr2 = modrm(2u, rd & 7u, rd & 7u);
+ mc->emit_bytes(mc, &mr2, 1);
+ disp_pos = mc->pos(mc);
+ emit_u32le(mc, 0);
+ }
+ /* Placeholder zero written above is overwritten via this reloc. */
+ mc->emit_reloc_at(mc, sec, disp_pos, R_X64_TPOFF32, sym, addend, 0, 0);
}
/* Aggregate ops β small unrolled memcpy/memset. */
@@ -1259,19 +1418,58 @@ static void x_set_bytes(CGTarget* t, Operand da, Operand bv,
}
}
-static void x_bitfield_load(CGTarget* t, Operand d, Operand ra,
- BitFieldAccess b) {
- (void)d;
- (void)ra;
- (void)b;
- x_panic(t, "bitfield_load");
-}
-static void x_bitfield_store(CGTarget* t, Operand ra, Operand s,
- BitFieldAccess b) {
- (void)ra;
- (void)s;
- (void)b;
- x_panic(t, "bitfield_store");
+/* Load a bit-field into dst.
+ *
+ * Load the storage unit, then extract the field by shifting it to the
+ * top of the register and shifting back. SAR for signed, SHR for unsigned.
+ *
+ *   dst         - destination; only dst.v.reg (low 4 bits) is used
+ *   record_addr - address operand, resolved through agg_addr_reg with R11
+ *                 passed as its register argument
+ *   bf          - field description: storage.size, storage_offset,
+ *                 bit_offset, bit_width, signed_
+ *
+ * A storage size of 0 is treated as 4 bytes and a bit width of 0 as 1.
+ * Assumes bit_offset + bit_width fits in the 32/64-bit working width;
+ * otherwise the u8 casts below wrap — TODO confirm the caller guarantees it.
+ * NOTE(review): the shifts are 32-bit unless the storage unit is 8 bytes,
+ * so a signed field is presumably sign-extended to 32 bits only — confirm
+ * that dst is never read as a 64-bit value in that case. */
+static void x_bitfield_load(CGTarget* t, Operand dst, Operand record_addr,
+ BitFieldAccess bf) {
+ u32 base = agg_addr_reg(t, record_addr, X64_R11);
+ u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
+ int w = (storage_bytes == 8u) ? 1 : 0; /* 64-bit ops only for 8-byte storage */
+ u32 reg_size = w ? 64u : 32u;
+ u32 lsb = bf.bit_offset;
+ u32 width = bf.bit_width ? bf.bit_width : 1u;
+ u32 rd = dst.v.reg & 0xFu;
+
+ /* Third argument 0: the same position x_load uses for its signedness
+ * flag, i.e. the storage unit is loaded as unsigned. */
+ emit_mov_load(t->mc, storage_bytes, 0, rd, base, (i32)bf.storage_offset);
+ u8 left = (u8)(reg_size - lsb - width); /* puts the field's top bit at the register's top bit */
+ u8 right = (u8)(reg_size - width); /* then brings the field's low bit down to bit 0 */
+ if (left) emit_shift_imm(t->mc, w, 4u, rd, left);
+ if (right) emit_shift_imm(t->mc, w, bf.signed_ ? 7u : 5u, rd, right);
+}
+
+/* Store src into a bit-field.
+ *
+ * Read-modify-write: clear the field bits in the storage unit via AND ~mask,
+ * mask/shift the source into place, OR it in, write back. RAX holds the
+ * storage word; RCX is the staged value; RDX holds the source-side mask when
+ * needed. Avoids touching the base register.
+ *
+ *   record_addr - address operand, resolved through agg_addr_reg with R11
+ *                 passed as its register argument
+ *   src         - OPK_IMM or OPK_REG; any other kind panics
+ *   bf          - field description: storage.size, storage_offset,
+ *                 bit_offset, bit_width
+ *
+ * A storage size of 0 is treated as 4 bytes and a bit width of 0 as 1.
+ * Assumes bit_offset < 64 (`ones << lsb` is a u64 shift) — TODO confirm.
+ * NOTE(review): a register source living in RAX or RCX would be overwritten
+ * before it is read; the file header lists both as scratches outside the
+ * allocator pools, which presumably rules this out — confirm. */
+static void x_bitfield_store(CGTarget* t, Operand record_addr, Operand src,
+ BitFieldAccess bf) {
+ u32 base = agg_addr_reg(t, record_addr, X64_R11);
+ u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
+ int w = (storage_bytes == 8u) ? 1 : 0;
+ u32 lsb = bf.bit_offset;
+ u32 width = bf.bit_width ? bf.bit_width : 1u;
+ u64 ones = (width >= 64u) ? ~(u64)0 : (((u64)1 << width) - 1u); /* width low bits set; guard avoids a 64-bit shift by 64 */
+ u64 mask = ones << lsb; /* the field's bits within the storage unit */
+
+ emit_mov_load(t->mc, storage_bytes, 0, X64_RAX, base, (i32)bf.storage_offset);
+ /* NOTE(review): with w == 0 the upper 32 bits of ~mask are all ones;
+ * presumably emit_load_imm truncates to 32 bits in that mode — confirm. */
+ emit_load_imm(t->mc, w, X64_RCX, (i64)~mask);
+ emit_alu_rr(t->mc, w, 0x21, X64_RAX, X64_RCX); /* AND rax, rcx */
+
+ if (src.kind == OPK_IMM) {
+ /* Constant source: mask and position it at compile time. */
+ u64 v = ((u64)src.v.imm & ones) << lsb;
+ emit_load_imm(t->mc, w, X64_RCX, (i64)v);
+ } else if (src.kind == OPK_REG) {
+ /* Register source: copy to RCX so the caller's register is not modified. */
+ emit_mov_rr(t->mc, w, X64_RCX, src.v.reg & 0xFu);
+ emit_load_imm(t->mc, w, X64_RDX, (i64)ones);
+ emit_alu_rr(t->mc, w, 0x21, X64_RCX, X64_RDX); /* AND rcx, rdx */
+ if (lsb) emit_shift_imm(t->mc, w, 4u, X64_RCX, (u8)lsb);
+ } else {
+ compiler_panic(t->c, impl_of(t)->loc,
+ "x64 bitfield_store: src kind %d unsupported",
+ (int)src.kind);
+ }
+ emit_alu_rr(t->mc, w, 0x09, X64_RAX, X64_RCX); /* OR rax, rcx */
+ emit_mov_store(t->mc, storage_bytes, X64_RAX, base, (i32)bf.storage_offset);
}
/* ============================================================
@@ -1773,85 +1971,768 @@ static void x_ret(CGTarget* t, const CGABIValue* val) {
}
/* ============================================================
- * Stubs for unimplemented methods. */
-static void x_alloca_(CGTarget* t, Operand d, Operand s, u32 a) {
- (void)d;
- (void)s;
- (void)a;
- x_panic(t, "alloca");
-}
-static void x_va_start_(CGTarget* t, Operand a) {
- (void)a;
- x_panic(t, "va_start");
+ * Alloca / VLA.
+ *
+ * Layout (low → high addresses, after a `sub rsp, aligned_size`):
+ * [rsp + 0, +max_outgoing): outgoing-arg area
+ * [rsp + max_outgoing, +max_outgoing +aligned_size): newly allocated block
+ *
+ * max_outgoing is only known at func_end (it is the max across all
+ * x_call sites in the function), so each alloca emits a placeholder
+ * `lea dst, [rsp + 0]` whose 4-byte disp is patched at func_end. The
+ * epilogue restores rsp via `leave` (mov rsp, rbp; pop rbp), so no
+ * extra dance is needed when alloca is present. */
+
+/* Emit `lea dst, [rsp + disp32]` with a zero displacement and report where
+ * the 4-byte displacement field lives so it can be patched later.
+ *
+ * mc: emitter to write to.
+ * dst: destination register number (low 3 bits go in ModRM.reg).
+ * out_disp_pos: receives mc->pos(mc) sampled just before the disp32 bytes. */
+static void emit_lea_rsp_disp32(MCEmitter* mc, u32 dst, u32* out_disp_pos) {
+ /* Force the disp32 form (mod=10, rm=SIB, base=rsp, no index, scale=0)
+ * regardless of the displacement value so func_end has a fixed-width
+ * field to patch. 8 bytes: REX.W [+R] | 0x8D | ModRM | SIB | disp32. */
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ emit_rex(mc, 1, dst, 0, X64_RSP);
+ u8 op = 0x8D;
+ mc->emit_bytes(mc, &op, 1);
+ u8 mr = modrm(2u, dst & 7u, 4u);
+ mc->emit_bytes(mc, &mr, 1);
+ u8 s = sib(0, 4u, X64_RSP);
+ mc->emit_bytes(mc, &s, 1);
+ /* Record the patch site, then emit the placeholder displacement. */
+ *out_disp_pos = mc->pos(mc);
+ emit_u32le(mc, 0);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
 }
-static void x_va_arg_(CGTarget* t, Operand d, Operand a, const Type* ty) {
- (void)d;
- (void)a;
- (void)ty;
- x_panic(t, "va_arg");
+
+/* Emit a dynamic stack allocation.
+ *
+ * Lowers rsp by the requested size rounded up to a multiple of 16, then emits
+ * a placeholder `lea d, [rsp + 0]` whose displacement is recorded in
+ * a->alloca_patches for later patching.
+ *
+ * d: destination; must be OPK_REG (panics otherwise).
+ * sz: byte count, either OPK_IMM or OPK_REG (other kinds panic). A constant
+ *     size of 0 still reserves 16 bytes; a register size clobbers RAX.
+ * align: requested alignment; values above 16 panic. */
+static void x_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) {
+  XImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  if (d.kind != OPK_REG)
+    compiler_panic(t->c, a->loc, "x64 alloca: dst must be REG");
+  if (align > 16) {
+    compiler_panic(t->c, a->loc,
+                   "x64 alloca: align %u > 16 not yet supported", align);
+  }
+
+  if (sz.kind == OPK_IMM) {
+    i64 v = sz.v.imm;
+    if (v < 0) compiler_panic(t->c, a->loc, "x64 alloca: negative size");
+    u64 aligned = ((u64)v + 15u) & ~(u64)15u;
+    if (aligned == 0) aligned = 16;
+    /* The imm32 of `sub r/m64, imm32` is sign-extended to 64 bits, so a
+     * value with bit 31 set would move rsp UP instead of down. Reject
+     * anything that does not fit in a positive imm32. */
+    if (aligned > (u64)0x7FFFFFF0u)
+      compiler_panic(t->c, a->loc, "x64 alloca: constant size too large");
+    /* sub rsp, imm32 : REX.W 0x81 /5 imm32 (7 bytes). */
+    emit_rex(mc, 1, 0, 0, X64_RSP);
+    u8 buf[2] = {0x81, modrm(3u, 5u, X64_RSP)};
+    mc->emit_bytes(mc, buf, 2);
+    emit_u32le(mc, (u32)aligned);
+  } else if (sz.kind == OPK_REG) {
+    u32 sz_reg = sz.v.reg & 0xFu;
+    /* rax = (sz_reg + 15) & ~15 */
+    emit_lea(mc, X64_RAX, sz_reg, 15);
+    /* and rax, -16 : REX.W 0x83 /4 imm8(0xF0). */
+    emit_rex(mc, 1, 0, 0, X64_RAX);
+    u8 abuf[3] = {0x83, modrm(3u, 4u, X64_RAX), 0xF0};
+    mc->emit_bytes(mc, abuf, 3);
+    /* sub rsp, rax */
+    emit_alu_rr(mc, 1, 0x29, X64_RSP, X64_RAX);
+  } else {
+    compiler_panic(t->c, a->loc, "x64 alloca: size kind %d unsupported",
+                   (int)sz.kind);
+  }
+
+  /* lea dst, [rsp + max_outgoing] — placeholder, patched at func_end.
+   * Grow the patch list first (doubling, starting at 4 entries). */
+  if (a->nalloca_patches == a->alloca_patches_cap) {
+    u32 ncap = a->alloca_patches_cap ? a->alloca_patches_cap * 2u : 4u;
+    XAllocaPatch* nb = arena_array(t->c->tu, XAllocaPatch, ncap);
+    if (a->alloca_patches)
+      memcpy(nb, a->alloca_patches, sizeof(XAllocaPatch) * a->nalloca_patches);
+    a->alloca_patches = nb;
+    a->alloca_patches_cap = ncap;
+  }
+  u32 disp_pos;
+  emit_lea_rsp_disp32(mc, d.v.reg & 0xFu, &disp_pos);
+  a->alloca_patches[a->nalloca_patches].disp_pos = disp_pos;
+  a->nalloca_patches++;
+  a->has_alloca = 1;
+}
+
+/* SysV AMD64 __va_list_tag (24 bytes, 8-aligned):
+ * off 0 u32 gp_offset next free GP slot in reg_save_area (0..48)
+ * off 4 u32 fp_offset next free FP slot (48..176)
+ * off 8 ptr overflow_arg_area pointer to next stack-passed arg
+ * off 16 ptr reg_save_area pointer to the 176-byte save area
+ *
+ * The reg_save_area layout (filled in func_begin):
+ * +0..+40 : rdi, rsi, rdx, rcx, r8, r9 (8B each)
+ * +48..+168 : xmm0..xmm7 at 16B stride (low 8B written via movsd)
+ *
+ * va_arg dispatches on dst class. When the relevant offset reaches its
+ * max (48 for GP, 176 for FP), fall through to overflow_arg_area at
+ * 8-byte stride. */
+
+/* Fill in the 24-byte va_list record addressed by the register in `ap_op`.
+ * Panics if the current function is not variadic or has no register save
+ * slot. RAX is used as the scratch register for every field. */
+static void x_va_start_(CGTarget* t, Operand ap_op) {
+  XImpl* impl = impl_of(t);
+  MCEmitter* mc = t->mc;
+  if (!impl->is_variadic)
+    compiler_panic(t->c, impl->loc, "x64 va_start: function not variadic");
+  u32 va = ap_op.v.reg & 0xFu;
+  XSlot* save = slot_get(impl, impl->reg_save_slot);
+  if (!save) compiler_panic(t->c, impl->loc, "x64 va_start: no reg_save_slot");
+
+  /* Field values, in the order they are stored below. */
+  i64 gp_offset = (i64)(impl->next_param_int * 8u);
+  i64 fp_offset = (i64)(48u + impl->next_param_fp * 16u);
+  i32 overflow_disp = (i32)(16u + impl->next_param_stack);
+  i32 save_disp = -(i32)save->off;
+
+  /* va[0] (u32) = gp_offset */
+  emit_load_imm(mc, 0, X64_RAX, gp_offset);
+  emit_mov_store(mc, 4, X64_RAX, va, 0);
+  /* va[4] (u32) = fp_offset */
+  emit_load_imm(mc, 0, X64_RAX, fp_offset);
+  emit_mov_store(mc, 4, X64_RAX, va, 4);
+  /* va[8] (ptr) = overflow_arg_area = rbp + 16 + next_param_stack */
+  emit_lea(mc, X64_RAX, X64_RBP, overflow_disp);
+  emit_mov_store(mc, 8, X64_RAX, va, 8);
+  /* va[16] (ptr) = reg_save_area = rbp - reg_save_slot.off */
+  emit_lea(mc, X64_RAX, X64_RBP, save_disp);
+  emit_mov_store(mc, 8, X64_RAX, va, 16);
+}
+
+/* Emit code that fetches the next variadic argument of type `ty` from the
+ * va_list addressed by the register in `ap_op` into the register in `dst`.
+ *
+ * dst.cls == RC_FP selects the FP bookkeeping (fp_offset at +4, limit 176,
+ * stride 16); everything else uses the GP bookkeeping (gp_offset at +0,
+ * limit 48, stride 8). Clobbers RAX and R11.
+ *
+ * NOTE(review): `sz` goes straight to emit_mov_load / picks movsd vs movss,
+ * so this presumably only sees scalar types -- confirm aggregates are
+ * lowered before reaching here. Likewise `ap` living in RAX/R11, or dst
+ * sharing a register with `ap`, would be clobbered mid-sequence; confirm
+ * the allocator rules that out. */
+static void x_va_arg_(CGTarget* t, Operand dst, Operand ap_op,
+ const Type* ty) {
+ MCEmitter* mc = t->mc;
+ u32 ap = ap_op.v.reg & 0xFu;
+ u32 sz = type_byte_size(ty);
+ int is_fp = (dst.cls == RC_FP);
+ u32 offs_field = is_fp ? 4u : 0u;
+ u32 max_offs = is_fp ? 176u : 48u;
+ u32 stride = is_fp ? 16u : 8u;
+ u32 dr = dst.v.reg & 0xFu;
+
+ MCLabel L_stack = mc->label_new(mc);
+ MCLabel L_done = mc->label_new(mc);
+
+ /* eax = ap[offs_field]; cmp eax, max_offs; jae L_stack. */
+ emit_mov_load(mc, 4, 0, X64_RAX, ap, (i32)offs_field);
+ if (max_offs <= 127u) {
+ /* Limit fits in a signed imm8 (the GP case, 48). */
+ emit_cmp_imm8(mc, 0, X64_RAX, (i8)max_offs);
+ } else {
+ /* cmp eax, imm32 : 0x3D imm32 (5 bytes, EAX-specific form). */
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ u8 op = 0x3D;
+ mc->emit_bytes(mc, &op, 1);
+ emit_u32le(mc, max_offs);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+ }
+ emit_jcc_label(mc, X64_CC_AE, L_stack);
+
+ /* Reg path:
+ * r11 = ap[16] (reg_save_area)
+ * r11 = r11 + rax
+ * load dst from [r11 + 0]
+ * eax += stride; ap[offs_field] = eax
+ * jmp L_done */
+ emit_mov_load(mc, 8, 0, X64_R11, ap, 16);
+ emit_alu_rr(mc, 1, 0x01, X64_R11, X64_RAX);
+ if (is_fp) {
+ /* F2 0F 10 = movsd for 8-byte values, F3 0F 10 = movss otherwise. */
+ u8 prefix = (sz == 8) ? 0xF2 : 0xF3;
+ emit_sse_load(mc, prefix, 0x10, dr, X64_R11, 0);
+ } else {
+ int sx = type_is_signed(ty);
+ emit_mov_load(mc, sz, sx, dr, X64_R11, 0);
+ }
+ /* add eax, imm8 : 0x83 /0 imm8 (no REX needed for eax). */
+ {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ u8 buf[3] = {0x83, modrm(3u, 0u, X64_RAX), (u8)stride};
+ mc->emit_bytes(mc, buf, 3);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+ }
+ emit_mov_store(mc, 4, X64_RAX, ap, (i32)offs_field);
+ emit_jmp_label(mc, L_done);
+
+ /* L_stack:
+ * r11 = ap[8] (overflow_arg_area)
+ * load dst from [r11 + 0]
+ * r11 += 8; ap[8] = r11 */
+ mc->label_place(mc, L_stack);
+ emit_mov_load(mc, 8, 0, X64_R11, ap, 8);
+ if (is_fp) {
+ u8 prefix = (sz == 8) ? 0xF2 : 0xF3;
+ emit_sse_load(mc, prefix, 0x10, dr, X64_R11, 0);
+ } else {
+ int sx = type_is_signed(ty);
+ emit_mov_load(mc, sz, sx, dr, X64_R11, 0);
+ }
+ /* add r11, 8 : REX.WB 0x83 /0 imm8. */
+ {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ u8 rex = (u8)(X64_REX_BASE | X64_REX_W | X64_REX_B);
+ mc->emit_bytes(mc, &rex, 1);
+ u8 buf[3] = {0x83, modrm(3u, 0u, X64_R11 & 7u), 8};
+ mc->emit_bytes(mc, buf, 3);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+ }
+ emit_mov_store(mc, 8, X64_R11, ap, 8);
+
+ /* Both paths rejoin here with dst loaded and the va_list advanced. */
+ mc->label_place(mc, L_done);
 }
+
+/* va_end: emits no code; both parameters are intentionally unused. */
 static void x_va_end_(CGTarget* t, Operand a) {
- (void)a;
 (void)t;
+ (void)a;
 }
+
+/* va_copy: copy the 24-byte va_list record addressed by the register in `s`
+ * to the one addressed by the register in `d`, 8 bytes at a time via RAX.
+ * NOTE(review): operand kinds are not checked; both are read as registers. */
 static void x_va_copy_(CGTarget* t, Operand d, Operand s) {
- (void)d;
- (void)s;
- x_panic(t, "va_copy");
-}
-
-static void x_atomic_load(CGTarget* t, Operand d, Operand ad, MemAccess m,
- MemOrder o) {
- (void)d;
- (void)ad;
- (void)m;
- (void)o;
- x_panic(t, "atomic_load");
-}
-static void x_atomic_store(CGTarget* t, Operand ad, Operand s, MemAccess m,
- MemOrder o) {
- (void)ad;
- (void)s;
- (void)m;
- (void)o;
- x_panic(t, "atomic_store");
-}
-static void x_atomic_rmw(CGTarget* t, AtomicOp op, Operand d, Operand ad,
- Operand v, MemAccess m, MemOrder o) {
- (void)op;
- (void)d;
- (void)ad;
- (void)v;
- (void)m;
- (void)o;
- x_panic(t, "atomic_rmw");
-}
-static void x_atomic_cas(CGTarget* t, Operand p, Operand ok, Operand ad,
- Operand e, Operand des, MemAccess m, MemOrder so,
- MemOrder fo) {
- (void)p;
- (void)ok;
- (void)ad;
- (void)e;
- (void)des;
- (void)m;
- (void)so;
- (void)fo;
- x_panic(t, "atomic_cas");
+ MCEmitter* mc = t->mc;
+ u32 dr = d.v.reg & 0xFu;
+ u32 sr = s.v.reg & 0xFu;
+ /* va_list is 24 bytes; three 8B loads + stores via rax. */
+ for (u32 i = 0; i < 24u; i += 8u) {
+ emit_mov_load(mc, 8, 0, X64_RAX, sr, (i32)i);
+ emit_mov_store(mc, 8, X64_RAX, dr, (i32)i);
+ }
+}
+
+/* ============================================================
+ * Atomics (Group K).
+ *
+ * x86 has a strong memory model: plain MOV is acquire on loads and
+ * release on stores, so most MemOrders need no extra fence. The
+ * exception is SEQ_CST stores, which need a full StoreLoad barrier —
+ * realized either via XCHG (which has implicit LOCK) or MOV+MFENCE.
+ * All LOCK-prefixed RMWs (XADD/XCHG/CMPXCHG) act as full barriers,
+ * subsuming any MemOrder the front end requests. */
+
+/* Emit the single-byte LOCK prefix (0xF0). No debug row is recorded here;
+ * callers record one for the whole instruction. */
+static void emit_lock_prefix(MCEmitter* mc) {
+  u8 lock_byte[1] = {0xF0};
+  mc->emit_bytes(mc, lock_byte, 1);
+}
+
+/* Emit MFENCE (0F AE F0) and, when debug info is enabled, a debug row at the
+ * offset where the instruction starts. */
+static void emit_mfence(MCEmitter* mc) {
+  u32 start = obj_pos(mc->obj, mc->section_id);
+  u8 enc[3] = {0x0F, 0xAE, 0xF0};
+  mc->emit_bytes(mc, enc, 3);
+  if (!mc->debug) return;
+  debug_emit_row(mc->debug, mc->section_id, start, mc->loc);
+}
+
+/* Emit UD2 (0F 0B) and, when debug info is enabled, a debug row at the
+ * offset where the instruction starts. */
+static void emit_ud2(MCEmitter* mc) {
+  u32 start = obj_pos(mc->obj, mc->section_id);
+  u8 enc[2] = {0x0F, 0x0B};
+  mc->emit_bytes(mc, enc, 2);
+  if (!mc->debug) return;
+  debug_emit_row(mc->debug, mc->section_id, start, mc->loc);
+}
+
+/* LOCK XADD [base+disp], src. Opcode 0F C1 /r (32/64-bit; sets src=prior,
+ * mem=mem+src).
+ *
+ * w: passed to emit_rex as the REX.W request (1 = 64-bit operand).
+ * src: register operand (ModRM.reg); base/disp: memory operand.
+ * Only the 0F C1 form is emitted: no 0x66 prefix and no byte-sized 0F C0
+ * variant, so callers must not use this for 1- or 2-byte accesses. */
+static void emit_lock_xadd(MCEmitter* mc, int w, u32 src, u32 base, i32 disp) {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ emit_lock_prefix(mc);
+ emit_rex(mc, w, src, 0, base);
+ u8 op[2] = {0x0F, 0xC1};
+ mc->emit_bytes(mc, op, 2);
+ emit_mem_operand(mc, src, base, disp);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
+/* XCHG [base+disp], src. Opcode 87 /r. LOCK is implicit when the
+ * destination is memory, but we emit it explicitly for clarity.
+ *
+ * w: passed to emit_rex as the REX.W request (1 = 64-bit operand).
+ * src: register swapped with memory; base/disp: memory operand.
+ * Only opcode 87 is emitted (no 0x66 prefix, no byte-sized 86 variant), so
+ * the exchange is always 32 or 64 bits wide. */
+static void emit_lock_xchg_mem(MCEmitter* mc, int w, u32 src, u32 base,
+ i32 disp) {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ emit_lock_prefix(mc);
+ emit_rex(mc, w, src, 0, base);
+ u8 op = 0x87;
+ mc->emit_bytes(mc, &op, 1);
+ emit_mem_operand(mc, src, base, disp);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
+/* LOCK CMPXCHG [base+disp], src. Opcode 0F B1 /r. Compares RAX with [mem];
+ * if equal, [mem]=src and ZF=1; else RAX=[mem] and ZF=0.
+ *
+ * w: passed to emit_rex as the REX.W request (1 = 64-bit operand).
+ * src: replacement value register; base/disp: memory operand.
+ * Only the 0F B1 form is emitted (no 0x66 prefix, no byte-sized 0F B0
+ * variant), so the comparison is always 32 or 64 bits wide. */
+static void emit_lock_cmpxchg(MCEmitter* mc, int w, u32 src, u32 base,
+ i32 disp) {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ emit_lock_prefix(mc);
+ emit_rex(mc, w, src, 0, base);
+ u8 op[2] = {0x0F, 0xB1};
+ mc->emit_bytes(mc, op, 2);
+ emit_mem_operand(mc, src, base, disp);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
+/* POPCNT rd, rs. Encoding: F3 0F B8 /r.
+ * The mandatory F3 prefix is emitted before the REX prefix produced by
+ * emit_rex; `w` is passed to emit_rex as the REX.W request. */
+static void emit_popcnt(MCEmitter* mc, int w, u32 dst, u32 src) {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ u8 p = 0xF3;
+ mc->emit_bytes(mc, &p, 1);
+ emit_rex(mc, w, dst, 0, src);
+ u8 op[2] = {0x0F, 0xB8};
+ mc->emit_bytes(mc, op, 2);
+ emit_rm_reg(mc, dst, src);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
 }
+
+/* BSF/BSR rd, rs. opcode2 = 0xBC (BSF) or 0xBD (BSR).
+ * Emits [REX] 0F <opcode2> /r with dst in ModRM.reg and src in ModRM.rm;
+ * `w` is passed to emit_rex as the REX.W request. */
+static void emit_bs(MCEmitter* mc, int w, u8 opcode2, u32 dst, u32 src) {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ emit_rex(mc, w, dst, 0, src);
+ u8 op[2] = {0x0F, opcode2};
+ mc->emit_bytes(mc, op, 2);
+ emit_rm_reg(mc, dst, src);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
+/* BSWAP r32/r64. Opcode 0F C8+r; REX.W for r64; REX.B if reg>=8.
+ * The register's low 3 bits are folded into the second opcode byte; the
+ * REX prefix comes from emit_rex. */
+static void emit_bswap(MCEmitter* mc, int w, u32 reg) {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ emit_rex(mc, w, 0, 0, reg);
+ u8 op[2] = {0x0F, (u8)(0xC8 + (reg & 7))};
+ mc->emit_bytes(mc, op, 2);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
+/* ROL r/m16, imm8 on a register: 66 [REX] C1 /0 ib. Rotating by 8 swaps the
+ * two bytes of a 16-bit value. */
+static void emit_rol16_imm8(MCEmitter* mc, u32 reg, u8 imm) {
+  u32 start = obj_pos(mc->obj, mc->section_id);
+  /* Operand-size override selects the 16-bit form. */
+  u8 opsize = 0x66;
+  mc->emit_bytes(mc, &opsize, 1);
+  emit_rex(mc, 0, 0, 0, reg);
+  u8 enc[3] = {0xC1, modrm(3u, 0u, reg & 7u), imm};
+  mc->emit_bytes(mc, enc, 3);
+  if (!mc->debug) return;
+  debug_emit_row(mc->debug, mc->section_id, start, mc->loc);
+}
+
+/* XOR r/m, imm32 — opcode 81 /6. Used to compute (bits-1) - x via XOR.
+ * Emits [REX] 81 /6 imm32 against a register; `w` is passed to emit_rex as
+ * the REX.W request and the immediate is written as 4 little-endian bytes. */
+static void emit_xor_imm32(MCEmitter* mc, int w, u32 reg, i32 imm) {
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ emit_rex(mc, w, 0, 0, reg);
+ u8 op = 0x81;
+ mc->emit_bytes(mc, &op, 1);
+ /* 6u is the /6 opcode extension that selects XOR in the 0x81 group. */
+ emit_rm_reg(mc, 6u, reg);
+ emit_u32le(mc, (u32)imm);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
+/* Turn an atomic address operand into a (base register, displacement) pair.
+ * An OPK_REG operand is a pointer held in a register, giving disp = 0; every
+ * other kind is handed to addr_base. Returns the base register number and
+ * writes the displacement through out_disp. */
+static u32 atomic_addr_base(CGTarget* t, Operand addr, i32* out_disp) {
+  if (addr.kind != OPK_REG) return addr_base(t, addr, out_disp);
+  *out_disp = 0;
+  return addr.v.reg & 0xFu;
+}
+
+/* Atomic load: emitted as one plain MOV load into the register in `dst`.
+ * The access size comes from ma.size, falling back to the size of dst.type;
+ * signedness comes from ma.type, falling back to dst.type. The requested
+ * memory order is ignored. */
+static void x_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma,
+                          MemOrder ord) {
+  MCEmitter* mc = t->mc;
+  (void)ord; /* x86: plain MOV satisfies all orders for loads. */
+
+  u32 nbytes = ma.size;
+  if (nbytes == 0) nbytes = type_byte_size(dst.type);
+
+  i32 off;
+  u32 base_reg = atomic_addr_base(t, addr, &off);
+
+  int sx;
+  if (ma.type)
+    sx = type_is_signed(ma.type);
+  else
+    sx = type_is_signed(dst.type);
+
+  emit_mov_load(mc, nbytes, sx, dst.v.reg & 0xFu, base_reg, off);
+}
+
+/* Atomic store of `src` (OPK_IMM or OPK_REG; other kinds panic) to `addr`.
+ *
+ * The access size is ma.size, falling back to the size of src.type.
+ *   - RELAXED / RELEASE: a plain MOV store.
+ *   - SEQ_CST, 4 or 8 bytes: XCHG [mem], r11 (acts as a full barrier). The
+ *     value is staged in R11 so the caller's register is not modified.
+ *   - SEQ_CST, 1 or 2 bytes: MOV store followed by MFENCE. The XCHG helper
+ *     only emits the 32/64-bit form, which would overwrite the bytes next to
+ *     a sub-word object.
+ * Clobbers R11 for immediate sources and for the XCHG path. */
+static void x_atomic_store(CGTarget* t, Operand addr, Operand src, MemAccess ma,
+                           MemOrder ord) {
+  XImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  u32 sz = ma.size ? ma.size : type_byte_size(src.type);
+  int w = (sz == 8) ? 1 : 0;
+  i32 disp;
+  u32 base = atomic_addr_base(t, addr, &disp);
+
+  /* Materialize src into a register. */
+  u32 sr = X64_R11;
+  if (src.kind == OPK_IMM) {
+    emit_load_imm(mc, w, X64_R11, src.v.imm);
+  } else if (src.kind == OPK_REG) {
+    sr = src.v.reg & 0xFu;
+  } else {
+    compiler_panic(t->c, a->loc, "x64 atomic_store: src kind %d unsupported",
+                   (int)src.kind);
+  }
+
+  if (ord == MO_SEQ_CST && (sz == 4 || sz == 8)) {
+    /* SEQ_CST store: XCHG implicitly fences. Move src into r11 so the
+     * caller's reg is unmodified, then xchg [mem], r11. */
+    if (sr != X64_R11) emit_mov_rr(mc, w, X64_R11, sr);
+    emit_lock_xchg_mem(mc, w, X64_R11, base, disp);
+    return;
+  }
+  /* Plain store covers RELAXED / RELEASE, and is the first half of the
+   * sub-word SEQ_CST sequence. */
+  emit_mov_store(mc, sz, sr, base, disp);
+  if (ord == MO_SEQ_CST) emit_mfence(mc);
+}
+
+/* Atomic read-modify-write: apply `op` with `val` to the object at `addr`
+ * and leave the prior value in the register in `dst`.
+ *
+ *   - AO_ADD / AO_SUB: LOCK XADD (SUB negates the operand first).
+ *   - AO_XCHG: XCHG with memory.
+ *   - AO_AND / AO_OR / AO_XOR / AO_NAND: LOCK CMPXCHG retry loop.
+ *   - any other op panics.
+ *
+ * `val` must be OPK_IMM or OPK_REG. Only 4- and 8-byte accesses are
+ * supported: every instruction helper used here emits the 32/64-bit form
+ * only, so a 1- or 2-byte access would read and write neighbouring bytes.
+ * Those sizes panic instead of producing silently wrong code.
+ * Clobbers R11 always, and RAX/RCX on the CMPXCHG path. */
+static void x_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr,
+                         Operand val, MemAccess ma, MemOrder ord) {
+  XImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  (void)ord; /* LOCK-prefixed ops are unconditionally full barriers. */
+  u32 sz = ma.size ? ma.size : type_byte_size(dst.type);
+  if (sz != 4 && sz != 8) {
+    compiler_panic(t->c, a->loc,
+                   "x64 atomic_rmw: size %u not yet supported", sz);
+  }
+  int w = (sz == 8) ? 1 : 0;
+  i32 disp;
+  u32 base = atomic_addr_base(t, addr, &disp);
+  u32 dr = dst.v.reg & 0xFu;
+
+  /* Materialize val into r11 (it's our working temp). For SUB we negate
+   * it so the XADD does the subtraction. */
+  if (val.kind == OPK_IMM) {
+    i64 v = val.v.imm;
+    if (op == AO_SUB) v = -v;
+    emit_load_imm(mc, w, X64_R11, v);
+  } else if (val.kind == OPK_REG) {
+    u32 vr = val.v.reg & 0xFu;
+    if (vr != X64_R11) emit_mov_rr(mc, w, X64_R11, vr);
+    if (op == AO_SUB) emit_f7_rm(mc, w, 3u, X64_R11); /* NEG */
+  } else {
+    compiler_panic(t->c, a->loc, "x64 atomic_rmw: val kind %d unsupported",
+                   (int)val.kind);
+  }
+
+  if (op == AO_ADD || op == AO_SUB) {
+    /* LOCK XADD [base], r11 — afterwards r11 holds prior. */
+    emit_lock_xadd(mc, w, X64_R11, base, disp);
+    if (dr != X64_R11) emit_mov_rr(mc, w, dr, X64_R11);
+    return;
+  }
+  if (op == AO_XCHG) {
+    emit_lock_xchg_mem(mc, w, X64_R11, base, disp);
+    if (dr != X64_R11) emit_mov_rr(mc, w, dr, X64_R11);
+    return;
+  }
+
+  /* AND/OR/XOR/NAND: CMPXCHG retry loop.
+   *
+   *   mov rax, [mem]
+   * .retry:
+   *   mov rcx, rax        ; new = prior
+   *   <op> rcx, r11       ; combine with val
+   *   [NAND: not rcx]
+   *   lock cmpxchg [mem], rcx
+   *   jne .retry
+   *   mov dr, rax
+   *
+   * rax = prior (cmpxchg implicit), rcx = new (scratch), r11 = val. */
+  emit_mov_load(mc, sz, 0, X64_RAX, base, disp);
+  MCLabel L_retry = mc->label_new(mc);
+  mc->label_place(mc, L_retry);
+  emit_mov_rr(mc, w, X64_RCX, X64_RAX);
+  switch (op) {
+    case AO_AND:
+      emit_alu_rr(mc, w, 0x21, X64_RCX, X64_R11);
+      break;
+    case AO_OR:
+      emit_alu_rr(mc, w, 0x09, X64_RCX, X64_R11);
+      break;
+    case AO_XOR:
+      emit_alu_rr(mc, w, 0x31, X64_RCX, X64_R11);
+      break;
+    case AO_NAND:
+      emit_alu_rr(mc, w, 0x21, X64_RCX, X64_R11);
+      emit_f7_rm(mc, w, 2u, X64_RCX); /* NOT */
+      break;
+    default:
+      compiler_panic(t->c, a->loc, "x64 atomic_rmw: op %d unimpl", (int)op);
+  }
+  emit_lock_cmpxchg(mc, w, X64_RCX, base, disp);
+  emit_jcc_label(mc, X64_CC_NE, L_retry);
+  if (dr != X64_RAX) emit_mov_rr(mc, w, dr, X64_RAX);
+}
+
+/* Atomic compare-and-swap on the object at `addr`.
+ *
+ * Loads `expected` into RAX and `desired` into R11 (each OPK_IMM or OPK_REG;
+ * other kinds panic), emits LOCK CMPXCHG, then writes 1/0 for success into
+ * the register in `ok` and the value CMPXCHG left in RAX into the register
+ * in `prior`. Both memory orders are ignored.
+ *
+ * Only 4- and 8-byte accesses are supported: the CMPXCHG helper emits the
+ * 32/64-bit form only, so a 1- or 2-byte access would compare and overwrite
+ * neighbouring bytes. Those sizes panic.
+ *
+ * NOTE(review): `expected` is written to RAX before `desired` is read, so a
+ * `desired` operand living in RAX would be lost -- presumably the allocator
+ * never places operands in RAX/R11; confirm. */
+static void x_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr,
+                         Operand expected, Operand desired, MemAccess ma,
+                         MemOrder succ, MemOrder fail) {
+  XImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  (void)succ;
+  (void)fail;
+  u32 sz = ma.size ? ma.size : type_byte_size(prior.type);
+  if (sz != 4 && sz != 8) {
+    compiler_panic(t->c, a->loc,
+                   "x64 atomic_cas: size %u not yet supported", sz);
+  }
+  int w = (sz == 8) ? 1 : 0;
+  i32 disp;
+  u32 base = atomic_addr_base(t, addr, &disp);
+
+  /* RAX = expected. */
+  if (expected.kind == OPK_IMM) {
+    emit_load_imm(mc, w, X64_RAX, expected.v.imm);
+  } else if (expected.kind == OPK_REG) {
+    u32 er = expected.v.reg & 0xFu;
+    if (er != X64_RAX) emit_mov_rr(mc, w, X64_RAX, er);
+  } else {
+    compiler_panic(t->c, a->loc, "x64 atomic_cas: exp kind %d unsupported",
+                   (int)expected.kind);
+  }
+  /* R11 = desired. */
+  if (desired.kind == OPK_IMM) {
+    emit_load_imm(mc, w, X64_R11, desired.v.imm);
+  } else if (desired.kind == OPK_REG) {
+    u32 dr2 = desired.v.reg & 0xFu;
+    if (dr2 != X64_R11) emit_mov_rr(mc, w, X64_R11, dr2);
+  } else {
+    compiler_panic(t->c, a->loc, "x64 atomic_cas: des kind %d unsupported",
+                   (int)desired.kind);
+  }
+
+  emit_lock_cmpxchg(mc, w, X64_R11, base, disp);
+
+  /* ok = ZF (success). Must come straight after CMPXCHG, before anything
+   * that could change the flags. */
+  u32 ok_r = ok.v.reg & 0xFu;
+  emit_setcc(mc, X64_CC_E, ok_r);
+  emit_movzx_r32_r8(mc, ok_r, ok_r);
+
+  /* prior = rax. */
+  u32 pr = prior.v.reg & 0xFu;
+  if (pr != X64_RAX) emit_mov_rr(mc, w, pr, X64_RAX);
+}
+
static void x_fence(CGTarget* t, MemOrder o) {
- (void)o;
- x_panic(t, "fence");
+ /* x86: only SEQ_CST needs an explicit StoreLoad barrier. RELAXED is
+ * a no-op; ACQUIRE/RELEASE/ACQ_REL are satisfied by plain MOV. */
+ if (o == MO_SEQ_CST) emit_mfence(t->mc);
}
-static void x_intrinsic(CGTarget* t, IntrinKind k, Operand* d, u32 nd,
- const Operand* a, u32 na) {
- (void)k;
- (void)d;
+/* ============================================================
+ * Intrinsics (Group L). */
+
+/* Emit an unrolled copy of `n` bytes from [sr] to [dr] through RAX, walking
+ * from offset 0 upward and using the widest chunk (8/4/2/1 bytes) that still
+ * fits. Each chunk is fully loaded before it is stored. */
+static void x_intrin_copy_fwd(MCEmitter* mc, u32 dr, u32 sr, u32 n) {
+  static const u32 widths[4] = {8u, 4u, 2u, 1u};
+  u32 i = 0;
+  for (u32 k = 0; k < 4u; k++) {
+    while (n - i >= widths[k]) {
+      emit_mov_load(mc, widths[k], 0, X64_RAX, sr, (i32)i);
+      emit_mov_store(mc, widths[k], X64_RAX, dr, (i32)i);
+      i += widths[k];
+    }
+  }
+}
+
+/* Same as x_intrin_copy_fwd but walking from offset n downward, so the
+ * highest addresses are copied first. */
+static void x_intrin_copy_bwd(MCEmitter* mc, u32 dr, u32 sr, u32 n) {
+  static const u32 widths[4] = {8u, 4u, 2u, 1u};
+  u32 i = n;
+  for (u32 k = 0; k < 4u; k++) {
+    while (i >= widths[k]) {
+      i -= widths[k];
+      emit_mov_load(mc, widths[k], 0, X64_RAX, sr, (i32)i);
+      emit_mov_store(mc, widths[k], X64_RAX, dr, (i32)i);
+    }
+  }
+}
+
+/* Lower one intrinsic call inline.
+ *
+ * kind: which intrinsic; unsupported kinds panic.
+ * dsts/nd: result operands (read as registers); nd is not checked.
+ * args/na: argument operands; na is not checked.
+ *
+ * Scratch usage visible here: RAX for the mem* copies and as the
+ * force_reg_int scratch, R11 for the memset broadcast and the second
+ * overflow operand. */
+static void x_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd,
+                        const Operand* args, u32 na) {
+  XImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
 (void)nd;
- (void)a;
 (void)na;
- x_panic(t, "intrinsic");
+
+  switch (kind) {
+    case INTRIN_POPCOUNT: {
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      int w = type_is_64(src.type) ? 1 : 0;
+      emit_popcnt(mc, w, dst.v.reg & 0xFu, src.v.reg & 0xFu);
+      return;
+    }
+    case INTRIN_CTZ: {
+      /* BSF gives the index of the lowest set bit (undefined for 0). */
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      int w = type_is_64(src.type) ? 1 : 0;
+      emit_bs(mc, w, 0xBC, dst.v.reg & 0xFu, src.v.reg & 0xFu);
+      return;
+    }
+    case INTRIN_CLZ: {
+      /* BSR gives the index of the highest set bit; clz = (bits-1) - bsr.
+       * XOR with (bits-1) computes the subtraction for in-range values. */
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      int w = type_is_64(src.type) ? 1 : 0;
+      u32 dr = dst.v.reg & 0xFu;
+      emit_bs(mc, w, 0xBD, dr, src.v.reg & 0xFu);
+      emit_xor_imm32(mc, w, dr, w ? 63 : 31);
+      return;
+    }
+    case INTRIN_BSWAP16: {
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      u32 dr = dst.v.reg & 0xFu;
+      u32 sr = src.v.reg & 0xFu;
+      if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
+      emit_rol16_imm8(mc, dr, 8);
+      return;
+    }
+    case INTRIN_BSWAP32: {
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      u32 dr = dst.v.reg & 0xFu;
+      u32 sr = src.v.reg & 0xFu;
+      if (dr != sr) emit_mov_rr(mc, 0, dr, sr);
+      emit_bswap(mc, 0, dr);
+      return;
+    }
+    case INTRIN_BSWAP64: {
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      u32 dr = dst.v.reg & 0xFu;
+      u32 sr = src.v.reg & 0xFu;
+      if (dr != sr) emit_mov_rr(mc, 1, dr, sr);
+      emit_bswap(mc, 1, dr);
+      return;
+    }
+    case INTRIN_MEMCPY:
+    case INTRIN_MEMMOVE: {
+      /* args = (dst_addr, src_addr, n_bytes). v1: const n, REG ptrs. */
+      Operand da = args[0], sa = args[1], nb = args[2];
+      if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) {
+        compiler_panic(t->c, a->loc,
+                       "x64 intrinsic: %s with non-const n or non-REG ptr",
+                       kind == INTRIN_MEMCPY ? "memcpy" : "memmove");
+      }
+      u32 dr = da.v.reg & 0xFu;
+      u32 sr = sa.v.reg & 0xFu;
+      u32 n = (u32)nb.v.imm;
+      /* n in {0,1,2,4,8} is at most one load+store pair, which is
+       * overlap-safe in either direction. */
+      int single_chunk = (n <= 8u) && ((n & (n - 1u)) == 0u);
+      if (kind == INTRIN_MEMCPY || single_chunk) {
+        x_intrin_copy_fwd(mc, dr, sr, n);
+      } else {
+        /* memmove with several chunks: the safe direction depends on how the
+         * ranges overlap, which is only known at run time.
+         *   dst >= src: copy descending (an ascending copy would overwrite
+         *               source bytes not yet read).
+         *   dst <  src: copy ascending (a descending copy would do the
+         *               same).
+         *
+         *   cmp dst, src
+         *   jae .backward
+         *   <ascending copy>
+         *   jmp .done
+         * .backward:
+         *   <descending copy>
+         * .done: */
+        MCLabel L_backward = mc->label_new(mc);
+        MCLabel L_done = mc->label_new(mc);
+        emit_alu_rr(mc, 1, 0x39, dr, sr); /* CMP dr, sr */
+        emit_jcc_label(mc, X64_CC_AE, L_backward);
+        x_intrin_copy_fwd(mc, dr, sr, n);
+        emit_jmp_label(mc, L_done);
+        mc->label_place(mc, L_backward);
+        x_intrin_copy_bwd(mc, dr, sr, n);
+        mc->label_place(mc, L_done);
+      }
+      return;
+    }
+    case INTRIN_MEMSET: {
+      /* args = (dst_addr, byte, n). */
+      Operand da = args[0], bv = args[1], nb = args[2];
+      if (da.kind != OPK_REG || nb.kind != OPK_IMM) {
+        compiler_panic(t->c, a->loc,
+                       "x64 intrinsic: memset with non-const n / non-REG ptr");
+      }
+      u32 dr = da.v.reg & 0xFu;
+      u32 n = (u32)nb.v.imm;
+      /* Build a 64-bit value with the byte broadcast across all 8 bytes. */
+      if (bv.kind == OPK_IMM) {
+        u8 byte = (u8)(bv.v.imm & 0xffu);
+        u64 b64 = byte;
+        b64 |= b64 << 8;
+        b64 |= b64 << 16;
+        b64 |= b64 << 32;
+        emit_load_imm(mc, 1, X64_RAX, (i64)b64);
+      } else if (bv.kind == OPK_REG) {
+        /* Only the low byte of the fill value counts. Mask it to 8 bits
+         * first: multiplying a register with higher bits set (e.g. a
+         * sign-extended -1) would not give a clean broadcast.
+         *   eax = bv & 0xFF
+         *   rax = rax * 0x0101010101010101 */
+        emit_mov_rr(mc, 0, X64_RAX, bv.v.reg & 0xFu);
+        emit_load_imm(mc, 0, X64_R11, (i64)0xFF);
+        emit_alu_rr(mc, 0, 0x21, X64_RAX, X64_R11); /* AND eax, r11d */
+        emit_load_imm(mc, 1, X64_R11, (i64)0x0101010101010101ll);
+        emit_imul_rr(mc, 1, X64_RAX, X64_R11);
+      } else {
+        compiler_panic(t->c, a->loc,
+                       "x64 intrinsic: memset byte kind %d unsupported",
+                       (int)bv.kind);
+      }
+      /* Unrolled stores, widest chunk first. */
+      u32 i = 0;
+      while (i + 8 <= n) {
+        emit_mov_store(mc, 8, X64_RAX, dr, (i32)i);
+        i += 8;
+      }
+      while (i + 4 <= n) {
+        emit_mov_store(mc, 4, X64_RAX, dr, (i32)i);
+        i += 4;
+      }
+      while (i + 2 <= n) {
+        emit_mov_store(mc, 2, X64_RAX, dr, (i32)i);
+        i += 2;
+      }
+      while (i < n) {
+        emit_mov_store(mc, 1, X64_RAX, dr, (i32)i);
+        i += 1;
+      }
+      return;
+    }
+    case INTRIN_PREFETCH:
+      /* Drop the hint. */
+      return;
+    case INTRIN_ASSUME_ALIGNED: {
+      /* dst = src (alignment is a hint only). */
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      u32 dr = dst.v.reg & 0xFu;
+      u32 sr = src.v.reg & 0xFu;
+      if (dr != sr) emit_mov_rr(mc, 1, dr, sr);
+      return;
+    }
+    case INTRIN_EXPECT: {
+      /* dst = val; expected hint dropped. */
+      Operand val = args[0];
+      Operand dst = dsts[0];
+      int w = type_is_64(dst.type) ? 1 : 0;
+      u32 dr = dst.v.reg & 0xFu;
+      if (val.kind == OPK_REG) {
+        u32 sr = val.v.reg & 0xFu;
+        if (sr != dr) emit_mov_rr(mc, w, dr, sr);
+      } else if (val.kind == OPK_IMM) {
+        emit_load_imm(mc, w, dr, val.v.imm);
+      } else {
+        compiler_panic(t->c, a->loc,
+                       "x64 intrinsic: expect val kind %d unsupported",
+                       (int)val.kind);
+      }
+      return;
+    }
+    case INTRIN_UNREACHABLE:
+    case INTRIN_TRAP:
+      emit_ud2(mc);
+      return;
+    case INTRIN_ADD_OVERFLOW:
+    case INTRIN_SUB_OVERFLOW: {
+      /* dsts: [val, ovf]. ADD/SUB sets OF on signed overflow; SETO captures.
+       * NOTE(review): only the signed overflow flag is sampled; confirm the
+       * front end never routes unsigned operands here. */
+      Operand a_op = args[0], b_op = args[1];
+      Operand dval = dsts[0], dovf = dsts[1];
+      int w = type_is_64(dval.type) ? 1 : 0;
+      u32 rd = dval.v.reg & 0xFu;
+      u32 ra = force_reg_int(t, a_op, w, X64_RAX);
+      if (rd != ra) emit_mov_rr(mc, w, rd, ra);
+      u32 rb = force_reg_int(t, b_op, w, X64_R11);
+      u8 op = (kind == INTRIN_ADD_OVERFLOW) ? 0x01 : 0x29;
+      emit_alu_rr(mc, w, op, rd, rb);
+      u32 dovf_r = dovf.v.reg & 0xFu;
+      emit_setcc(mc, X64_CC_O, dovf_r);
+      emit_movzx_r32_r8(mc, dovf_r, dovf_r);
+      return;
+    }
+    case INTRIN_MUL_OVERFLOW: {
+      /* dsts: [val, ovf]. IMUL r32, r/m32 (0F AF /r) is the signed
+       * two-operand form: low 32 bits of product go to dst, OF set if
+       * the result didn't fit. i64 not yet supported. */
+      Operand a_op = args[0], b_op = args[1];
+      Operand dval = dsts[0], dovf = dsts[1];
+      int w = type_is_64(dval.type) ? 1 : 0;
+      if (w) {
+        compiler_panic(t->c, a->loc,
+                       "x64 intrinsic: mul_overflow on i64 not yet supported");
+      }
+      u32 rd = dval.v.reg & 0xFu;
+      u32 ra = force_reg_int(t, a_op, w, X64_RAX);
+      if (rd != ra) emit_mov_rr(mc, w, rd, ra);
+      u32 rb = force_reg_int(t, b_op, w, X64_R11);
+      emit_imul_rr(mc, w, rd, rb);
+      u32 dovf_r = dovf.v.reg & 0xFu;
+      emit_setcc(mc, X64_CC_O, dovf_r);
+      emit_movzx_r32_r8(mc, dovf_r, dovf_r);
+      return;
+    }
+    default:
+      compiler_panic(t->c, a->loc, "x64 intrinsic: kind %d unsupported",
+                     (int)kind);
+  }
 }
static void x_asm_block(CGTarget* t, const char* tmpl,
const AsmConstraint* outs, u32 no, Operand* oo,
diff --git a/test/link/harness/start.c b/test/link/harness/start.c
@@ -106,6 +106,12 @@ static void tls_init(void) {
unsigned long td_n = (unsigned long)(__tdata_end - __tdata_start);
unsigned long bs_n = (unsigned long)(unsigned long long)__tbss_size;
unsigned long i;
+ /* Launder bs_n past clang's "extern char[] has non-null address"
+ * assumption — without this the .tbss zero loop is peeled and
+ * unconditionally writes one byte at tls[td_n], which on the SysV
+ * x86_64 variant II layout (TCB sits at tls[td_n]) clobbers the
+ * thread-pointer self-pointer for any TLS image with bs_n == 0. */
+ __asm__ volatile("" : "+r"(bs_n));
#if defined(__aarch64__)
/* Variant I (TCB first): tp -> [TCB(16) | tdata | tbss] */
char* dst = g_tls_block + AARCH64_TCB_SIZE;
@@ -147,21 +153,41 @@ static void tls_init(void) {
#endif /* !__APPLE__ */
}
+/* On x86_64 the kernel hands _start an rsp that is 16-aligned (so argc
+ * lands on a 16-byte boundary), but clang compiles _start as an ordinary
+ * function assuming the standard SysV contract of rsp ≡ 8 (mod 16) on
+ * entry — off by 8. force_align_arg_pointer makes the prologue realign
+ * rsp itself so every `call` downstream lands at the canonical
+ * rsp ≡ 8 (mod 16). aarch64/rv64 ABIs keep SP 16-aligned at all times,
+ * so no analogue is needed there. */
+#if defined(__x86_64__)
+__attribute__((force_align_arg_pointer))
+#endif
void _start(void) {
VoidFn* p;
int result;
tls_init();
+#if defined(__APPLE__)
+ /* Mach-O: dyld walks __DATA,__mod_init_func before _start runs, so
+ * the harness must NOT walk __init_array_start/end — the boundary
+ * symbols are synthesized into the __got region (no real init array
+ * on Mach-O) and dereferencing them faults. */
+ (void)p;
+#else
/* SHT_PREINIT_ARRAY runs strictly before .init_array. cfree-ld
* lands its synthetic __cfree_ifunc_init entry here so IFUNC
* slots are filled before any user ctor or test_main runs. */
for (p = __preinit_array_start; p != __preinit_array_end; ++p) (*p)();
for (p = __init_array_start; p != __init_array_end; ++p) (*p)();
+#endif
result = test_main();
+#if !defined(__APPLE__)
for (p = __fini_array_end; p-- != __fini_array_start;) (*p)();
+#endif
if (result == 0) result = test_post_fini();