kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 4c4f1db31be66b1fab039b0a1fa8c4f8ab2bdd6f
parent eff11e543f602e9bccfb86bdd2e446271957de85
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu,  4 Jun 2026 19:00:38 -0700

Fix Windows PE x64 startup and callback ABI

Diffstat:
M src/arch/native_target.h | 7 +++++++
M src/arch/x64/native.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
M src/cg/native_direct_target.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++-------------
M src/cg/native_direct_target.h | 3 +++
M src/link/link_reloc_layout.c | 1 +
M src/obj/coff/link.c | 37 ++++++++++++++++++++++---------------
M src/obj/coff/read.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M src/obj/coff/reloc_aarch64.c | 7 ++++---
M src/obj/coff/reloc_x86_64.c | 4 +++-
M src/obj/elf/emit.c | 66 +++++++++++++++++++++++++++++++++---------------------------------
M src/obj/obj.c | 1 +
M src/obj/obj.h | 3 +++
M test/coff/kit-roundtrip-coff.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
13 files changed, 307 insertions(+), 120 deletions(-)

diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -370,6 +370,13 @@ struct NativeTarget { * up front. */ void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class, u32 nclasses); + /* Optional live-ABI callee-saved register mask for a class. Static + * NativeAllocClassInfo masks describe the target register file, but some + * targets vary preservation rules by OS ABI (x64 SysV vs Win64 XMM regs). + * Direct emission uses this to decide which borrowed scratch/cache registers + * must be reported to reserve_callee_saves(). NULL falls back to + * NativeAllocClassInfo.callee_saved_mask. */ + u32 (*callee_saved_mask)(NativeTarget*, NativeAllocClass); /* Optional. When set, the optimizer emit path calls this once — after * func_begin, reserve_callee_saves, and frame-slot mapping, but before the * body — to emit a minimal, exact-size prologue in place (no reserved NOP diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -19,13 +19,13 @@ * are reserved (stack/frame pointers). RAX is reserved too (return value, the * div/mul implicit operand), but it is NOT an emit temp, so inline asm may pin * an operand to it (the Linux syscall idiom) — see x64_asm_operand_reg_ok. - * Everything else is allocable. The driver scratch pool is RBX/R12 (int) and - * XMM12/XMM13 (fp), disjoint from the emit temps so a hook never clobbers an - * operand parked there. ABI arg/ret - * registers are caller-saved-allocable; callee-saved set is resolved per-OS via - * x64_abi_for_os at runtime (the legality masks below are SysV's, the conserva- - * tive superset that both ABIs' allocators respect — Win64's extra callee-saves - * RDI/RSI/xmm6-15 only shrink the allocable pool, never grow it). */ + * The driver scratch pool is R8/R9 (int) and XMM4/XMM5 (fp), caller-saved on + * both SysV and Win64 and disjoint from the emit temps so a hook never clobbers + * an operand parked there. Scratch registers are reserved from allocation. 
+ * Callee-saved set is resolved per-OS via x64_abi_for_os at runtime (the + * legality masks below are SysV's, the conservative superset that both ABIs' + * allocators respect — Win64's extra callee-saves RDI/RSI/xmm6-15 only shrink + * the allocable pool, never grow it). */ #include <string.h> @@ -179,6 +179,13 @@ static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l); .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, \ .spill_cost = 1u, \ .copy_cost = 1u} +#define X64_PHYS_INT_ARG_RESERVED(r) \ + {.reg = (r), \ + .cls = NATIVE_REG_INT, \ + .abi_index = 0xffu, \ + .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RESERVED, \ + .spill_cost = 0u, \ + .copy_cost = 0u} #define X64_PHYS_INT_RET_ARG(r) \ {.reg = (r), \ .cls = NATIVE_REG_INT, \ @@ -209,27 +216,26 @@ static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l); .spill_cost = 0u, \ .copy_cost = 0u} -/* Allocable int pool, opt's spill/reload set: callee-saves first so the direct - * path's local cache prefers regs that don't grow the prologue. R10/R11 are - * emit scratch (reserved); RBX/R12 are the driver scratch pool; RAX is reserved - * (return / div-mul, asm-pinnable). */ +/* Allocable int pool, opt's spill/reload set. R8/R9 are the driver scratch + * pool; R10/R11 are emit scratch (reserved); RAX is reserved (return / div-mul, + * asm-pinnable). 
*/ static const Reg x64_int_allocable[] = {X64_R13, X64_R14, X64_R15}; -static const Reg x64_int_scratch[] = {X64_RBX, X64_R12}; +static const Reg x64_int_scratch[] = {X64_R8, X64_R9}; static const NativePhysRegInfo x64_int_phys[] = { X64_PHYS_INT_RESERVED(X64_RAX), /* return / div-mul (asm-pinnable) */ X64_PHYS_INT_ARG(X64_RCX), X64_PHYS_INT_RET_ARG(X64_RDX), - X64_PHYS_INT_RESERVED(X64_RBX), /* driver scratch */ + X64_PHYS_INT_RESERVED(X64_RBX), X64_PHYS_INT_RESERVED(X64_RSP), /* stack pointer */ X64_PHYS_INT_RESERVED(X64_RBP), /* frame pointer */ X64_PHYS_INT_ARG(X64_RSI), X64_PHYS_INT_ARG(X64_RDI), - X64_PHYS_INT_ARG(X64_R8), - X64_PHYS_INT_ARG(X64_R9), - X64_PHYS_INT_RESERVED(X64_R10), /* emit scratch */ - X64_PHYS_INT_RESERVED(X64_R11), /* emit scratch */ - X64_PHYS_INT_RESERVED(X64_R12), /* driver scratch */ + X64_PHYS_INT_ARG_RESERVED(X64_R8), /* driver scratch */ + X64_PHYS_INT_ARG_RESERVED(X64_R9), /* driver scratch */ + X64_PHYS_INT_RESERVED(X64_R10), /* emit scratch */ + X64_PHYS_INT_RESERVED(X64_R11), /* emit scratch */ + X64_PHYS_INT_RESERVED(X64_R12), X64_PHYS_INT_CALLEE(X64_R13), X64_PHYS_INT_CALLEE(X64_R14), X64_PHYS_INT_CALLEE(X64_R15), @@ -250,6 +256,13 @@ static const NativePhysRegInfo x64_int_phys[] = { .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, \ .spill_cost = 1u, \ .copy_cost = 1u} +#define X64_PHYS_FP_ARG_RESERVED(r) \ + {.reg = (r), \ + .cls = NATIVE_REG_FP, \ + .abi_index = 0xffu, \ + .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RESERVED, \ + .spill_cost = 0u, \ + .copy_cost = 0u} #define X64_PHYS_FP_CALLER(r) \ {.reg = (r), \ .cls = NATIVE_REG_FP, \ @@ -265,21 +278,20 @@ static const NativePhysRegInfo x64_int_phys[] = { .spill_cost = 0u, \ .copy_cost = 0u} -/* Allocable FP pool: xmm6..xmm13 (keep arg/ret xmm0..5 clear). xmm14/xmm15 are - * emit scratch; xmm12/xmm13 the driver scratch pool. */ +/* Allocable FP pool: xmm6..xmm11 (keep arg/ret xmm0..5 clear). 
xmm4/xmm5 are + * driver scratch; xmm14/xmm15 are emit scratch. */ static const Reg x64_fp_allocable[] = { X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, X64_XMM0 + 10, X64_XMM0 + 11}; -static const Reg x64_fp_scratch[] = {X64_XMM0 + 12, X64_XMM0 + 13}; +static const Reg x64_fp_scratch[] = {X64_XMM4, X64_XMM5}; static const NativePhysRegInfo x64_fp_phys[] = { X64_PHYS_FP_ARG_RET(X64_XMM0), X64_PHYS_FP_ARG_RET(X64_XMM1), X64_PHYS_FP_ARG(X64_XMM2), X64_PHYS_FP_ARG(X64_XMM3), - X64_PHYS_FP_ARG(X64_XMM4), X64_PHYS_FP_ARG(X64_XMM5), + X64_PHYS_FP_ARG_RESERVED(X64_XMM4), X64_PHYS_FP_ARG_RESERVED(X64_XMM5), X64_PHYS_FP_CALLER(X64_XMM6), X64_PHYS_FP_CALLER(X64_XMM7), X64_PHYS_FP_CALLER(X64_XMM8), X64_PHYS_FP_CALLER(X64_XMM0 + 9), X64_PHYS_FP_CALLER(X64_XMM0 + 10), X64_PHYS_FP_CALLER(X64_XMM0 + 11), - X64_PHYS_FP_RESERVED(X64_XMM0 + 12), /* driver scratch */ - X64_PHYS_FP_RESERVED(X64_XMM0 + 13), /* driver scratch */ + X64_PHYS_FP_RESERVED(X64_XMM0 + 12), X64_PHYS_FP_RESERVED(X64_XMM0 + 13), X64_PHYS_FP_RESERVED(X64_XMM0 + 14), /* emit scratch */ X64_PHYS_FP_RESERVED(X64_XMM15), /* emit scratch */ }; @@ -303,10 +315,10 @@ static const NativeAllocClassInfo x64_classes[] = { .arg_mask = (1u << X64_RDI) | (1u << X64_RSI) | (1u << X64_RDX) | (1u << X64_RCX) | (1u << X64_R8) | (1u << X64_R9), .ret_mask = (1u << X64_RAX) | (1u << X64_RDX), - /* rax, rsp, rbp reserved; r10/r11 emit scratch; rbx/r12 driver scratch */ + /* rax, rsp, rbp reserved; r8/r9 driver scratch; r10/r11 emit scratch */ .reserved_mask = (1u << X64_RAX) | (1u << X64_RSP) | (1u << X64_RBP) | - (1u << X64_R10) | (1u << X64_R11) | (1u << X64_RBX) | - (1u << X64_R12)}, + (1u << X64_R8) | (1u << X64_R9) | (1u << X64_R10) | + (1u << X64_R11) | (1u << X64_RBX) | (1u << X64_R12)}, {.cls = NATIVE_REG_FP, .allocable = x64_fp_allocable, .nallocable = sizeof x64_fp_allocable / sizeof x64_fp_allocable[0], @@ -319,8 +331,9 @@ static const NativeAllocClassInfo x64_classes[] = { .callee_saved_mask = 0u, .arg_mask = 0xffu, /* 
xmm0..xmm7 */ .ret_mask = (1u << X64_XMM0) | (1u << X64_XMM1), - /* xmm12..xmm15 reserved (driver scratch + emit scratch) */ - .reserved_mask = (1u << (X64_XMM0 + 12)) | (1u << (X64_XMM0 + 13)) | + /* xmm4/xmm5 driver scratch; xmm14/xmm15 emit scratch. */ + .reserved_mask = (1u << X64_XMM4) | (1u << X64_XMM5) | + (1u << (X64_XMM0 + 12)) | (1u << (X64_XMM0 + 13)) | (1u << (X64_XMM0 + 14)) | (1u << X64_XMM15)}, }; @@ -356,15 +369,15 @@ static int x64_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, if (cls == NATIVE_REG_INT) { switch (reg) { /* RAX is reserved but not an emit temp, so it is a legal asm pin (the - * Linux syscall number/return register). R10/R11 are emit scratch and - * RBX/R12 the driver scratch pool, so those stay excluded. */ + * Linux syscall number/return register). R8/R9 are driver scratch and + * R10/R11 are emit scratch, so those stay excluded. */ case X64_RAX: + case X64_RBX: case X64_RCX: case X64_RDX: case X64_RSI: case X64_RDI: - case X64_R8: - case X64_R9: + case X64_R12: case X64_R13: case X64_R14: case X64_R15: @@ -373,7 +386,8 @@ static int x64_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, return 0; } } - if (cls == NATIVE_REG_FP) return reg <= X64_XMM0 + 11u; + if (cls == NATIVE_REG_FP) + return reg <= X64_XMM0 + 13u && reg != X64_XMM4 && reg != X64_XMM5; return 0; } @@ -1737,6 +1751,20 @@ static void x64_reserve_callee_saves(NativeTarget* t, const u32* used, static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r); static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r); + +static u32 x64_live_callee_saved_mask(NativeTarget* t, + NativeAllocClass cls) { + X64NativeTarget* a = x64_of(t); + u32 mask = 0; + for (Reg r = 0; r < 16u; ++r) { + if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(a->abi, r)) + mask |= 1u << r; + if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(a->abi, r)) + mask |= 1u << r; + } + return mask; +} + static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const 
Sym* clobbers, u32 nclob, u32* int_mask, u32* fp_mask); @@ -4064,6 +4092,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj, /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved * set; x64_func_begin_known_frame derives the records from the masks. */ t->reserve_callee_saves = x64_reserve_callee_saves; + t->callee_saved_mask = x64_live_callee_saved_mask; t->signature_stack_bytes = x64_signature_stack_bytes; t->call_stack_bytes = x64_call_stack_bytes; t->has_store_zero_reg = 0; @@ -4258,12 +4287,12 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl, native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp); clob_int |= abi_int; clob_fp |= abi_fp; - /* Reserve emit scratch (r10,r11), driver scratch (rbx,r12), rax (reserved; + /* Reserve emit scratch (r10,r11), driver scratch (r8,r9), rax (reserved; * only self-allocated here when explicitly pinned), sp/bp, and clobbers. */ used_int = clob_int | (1u << X64_RAX) | (1u << X64_R11) | (1u << X64_RSP) | - (1u << X64_RBP) | (1u << X64_RBX) | (1u << X64_R12) | + (1u << X64_RBP) | (1u << X64_R8) | (1u << X64_R9) | (1u << X64_R10); - used_fp = clob_fp | (1u << (X64_XMM0 + 12)) | (1u << (X64_XMM0 + 13)) | + used_fp = clob_fp | (1u << X64_XMM4) | (1u << X64_XMM5) | (1u << (X64_XMM0 + 14)) | (1u << X64_XMM15); for (i = 0; i < nout; ++i) { diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c @@ -108,6 +108,19 @@ static Reg nd_cache_reg_for(NativeDirectTarget* d, CGLocal local, KitCgTypeId access_type); static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls); +static u32 nd_callee_saved_mask(NativeDirectTarget* d, NativeAllocClass cls) { + if (d->native && d->native->callee_saved_mask) + return d->native->callee_saved_mask(d->native, cls); + return nd_class_info(d, cls)->callee_saved_mask; +} + +static void nd_note_reg_used(NativeDirectTarget* d, NativeAllocClass cls, + Reg reg) { + if ((u32)cls >= 3u || 
reg >= 32u) return; + if (nd_callee_saved_mask(d, cls) & (1u << reg)) + d->callee_saved_used[cls] |= 1u << reg; +} + static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) { const NativeAllocClassInfo* ci = nd_class_info(d, cls); const Reg* regs = ci->scratch; @@ -121,6 +134,7 @@ static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) { if ((d->scratch_used[cls] & (1u << r)) == 0 && d->reg_owner[cls][r] == CG_LOCAL_NONE) { d->scratch_used[cls] |= 1u << r; + nd_note_reg_used(d, cls, r); return r; } } @@ -134,6 +148,7 @@ static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) { if (r != REG_NONE) { nd_flush_local(d, d->reg_owner[cls][r]); d->scratch_used[cls] |= 1u << r; + nd_note_reg_used(d, cls, r); return r; } } @@ -468,9 +483,10 @@ static void nd_store_operand_from_reg(NativeDirectTarget* d, Operand dst, * (nd_dst_reg/nd_dst_writeback) and are always dirty; reads hit a live entry or * fall back to a frame load without creating one. nd_flush_all spills and * empties the cache at the top of every non-pure-compute op, so the cache only - * survives across straight-line runs of compute ops. Caching only caller-saved - * registers means that conservative flush fully covers ABI clobbering across - * calls, and no callee-save prologue/epilogue work is required. */ + * survives across straight-line runs of compute ops. Caching prefers the + * register-file caller-saved mask; if the live OS ABI treats one of those + * registers as callee-saved, nd_note_reg_used reports it to the backend before + * the deferred prologue is patched. 
*/ static int nd_local_cacheable(NativeDirectTarget* d, const NativeDirectLocal* l) { @@ -534,12 +550,15 @@ static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) { Reg r = ci->allocable[i]; if (r >= 32u) continue; if ((caller & (1u << r)) && d->reg_owner[cls][r] == CG_LOCAL_NONE && - (d->scratch_used[cls] & (1u << r)) == 0) + (d->scratch_used[cls] & (1u << r)) == 0) { + nd_note_reg_used(d, cls, r); return r; + } } victim = nd_pick_cache_victim(d, cls); if (victim != REG_NONE && (caller & (1u << victim))) { nd_flush_local(d, d->reg_owner[cls][victim]); + nd_note_reg_used(d, cls, victim); return victim; } return REG_NONE; @@ -910,6 +929,7 @@ static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) { d->cache_tail = -1; d->ncached = 0; memset(d->scratch_used, 0, sizeof d->scratch_used); + memset(d->callee_saved_used, 0, sizeof d->callee_saved_used); memset(d->reg_owner, 0, sizeof d->reg_owner); if (d->native && d->native->func_begin) d->native->func_begin(d->native, fd); } @@ -917,8 +937,18 @@ static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) { static void nd_func_end(CgTarget* t) { NativeDirectTarget* d = nd_of(t); NativeFramePatchState frame; + u32 ncallee_classes = 0; memset(&frame, 0, sizeof frame); frame.max_outgoing = d->max_outgoing; + for (u32 cls = 0; cls < 3u; ++cls) { + if (d->callee_saved_used[cls]) ncallee_classes = cls + 1u; + } + if (ncallee_classes) { + if (!d->native || !d->native->reserve_callee_saves) + nd_panic(d, "target cannot preserve callee-saved scratch registers"); + d->native->reserve_callee_saves(d->native, d->callee_saved_used, + ncallee_classes); + } if (d->native && d->native->note_frame_state) d->native->note_frame_state(d->native, &frame); if (d->native && d->native->patch_apply) d->native->patch_apply(d->native); @@ -1590,11 +1620,13 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) { NativeCallDesc nd; NativeLoc* args; NativeLoc* results; - int release_callee = 0; + NativeLoc callee_tmp; + 
int release_callee_tmp = 0; nd_flush_all(d); nd_barrier(d, NATIVE_DIRECT_BARRIER_CALL | NATIVE_DIRECT_BARRIER_MEMORY); memset(&plan, 0, sizeof plan); memset(&nd, 0, sizeof nd); + memset(&callee_tmp, 0, sizeof callee_tmp); args = nd_loc_buf(d, d->argbuf, ND_ARG_BUF, desc->nargs); results = nd_loc_buf(d, d->retbuf, ND_RET_BUF, desc->nresults); for (u32 i = 0; i < desc->nargs; ++i) @@ -1603,6 +1635,13 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) { results[i] = nd_loc_frame(d, desc->results[i], 0); nd.fn_type = desc->fn_type; nd.callee = nd_loc_operand(d, desc->callee); + if (nd.callee.kind == NATIVE_LOC_FRAME) { + callee_tmp = nd_materialize_loc(d, nd.callee, + (NativeAllocClass)nd.callee.cls, + nd.callee.type); + nd.callee = callee_tmp; + release_callee_tmp = 1; + } nd.args = args; nd.results = results; nd.nargs = desc->nargs; @@ -1621,12 +1660,6 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) { d->max_outgoing = plan.stack_arg_size; for (u32 i = 0; i < plan.nargs; ++i) nd_write_loc(d, plan.args[i].dst, plan.args[i].src, plan.args[i].mem); - if (plan.callee.kind == NATIVE_LOC_FRAME) { - NativeLoc callee = nd_materialize_loc( - d, plan.callee, (NativeAllocClass)plan.callee.cls, plan.callee.type); - plan.callee = callee; - release_callee = 1; - } if (d->ops && d->ops->emit_call) d->ops->emit_call(d, &plan); else { @@ -1635,8 +1668,9 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) { } for (u32 i = 0; i < plan.nrets; ++i) nd_write_loc(d, plan.rets[i].dst, plan.rets[i].src, plan.rets[i].mem); - if (release_callee) - nd_scratch_release(d, (NativeAllocClass)plan.callee.cls, plan.callee.v.reg); + if (release_callee_tmp) + nd_scratch_release(d, (NativeAllocClass)callee_tmp.cls, + callee_tmp.v.reg); } static const char* nd_tail_call_unrealizable_reason(CgTarget* t, diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h @@ -135,6 +135,9 @@ struct NativeDirectTarget { u32 scopes_cap; u32 scratch_used[3]; + /* 
Per-function callee-saved registers borrowed by direct scratch/cache + * allocation. Reported to the native backend before prologue patching. */ + u32 callee_saved_used[3]; /* Local register cache (write-back, basic-block-scoped). reg_owner[cls][reg] * names the semantic local currently cached in that physical register, or * CG_LOCAL_NONE. scratch_used doubles as the per-class "pinned for the diff --git a/src/link/link_reloc_layout.c b/src/link/link_reloc_layout.c @@ -361,6 +361,7 @@ static u8 reloc_width(RelocKind k) { * small DWARF symbol differences these encode. */ return RELOC_RV_ULEB128_NOMINAL_WIDTH; case R_COFF_SECREL: + case R_COFF_ADDR32NB: return 4; case R_COFF_SECTION: return 2; diff --git a/src/obj/coff/link.c b/src/obj/coff/link.c @@ -754,7 +754,8 @@ static void coff_emit_idata(LinkImage* img, const CoffImportTable* it, u8* buf; /* Allocate the bucket buffer (idata_size is already block-aligned). */ buf = (u8*)heap->alloc(heap, it->idata_size, _Alignof(u64)); - if (!buf) compiler_panic(c, SRCLOC_NONE, "link_emit_coff: oom on .idata buffer"); + if (!buf) + compiler_panic(c, SRCLOC_NONE, "link_emit_coff: oom on .idata buffer"); memset(buf, 0, it->idata_size); idata->bytes = buf; idata->size = it->idata_size; @@ -1133,11 +1134,6 @@ static void coff_build_reloc_section(LinkImage* img, u32 cap = 0; u32 i; - if (!img->pie) { - reloc->bytes = NULL; - reloc->size = 0; - return; - } for (i = 0; i < nrel; ++i) { const LinkRelocApply* r = LinkRelocs_at(&img->relocs, i); const LinkSection* ls; @@ -1350,6 +1346,12 @@ static void coff_apply_all_relocs(LinkImage* img, } continue; } + if (r->kind == R_COFF_ADDR32NB) { + u64 inline_addend = rd_u32_le(P_bytes); + u64 v = (S - img_base) + inline_addend + (u64)r->addend; + wr_u32_le(P_bytes, (u32)(v & 0xffffffffu)); + continue; + } link_reloc_apply(c, r->kind, P_bytes, S, r->addend, P); } } @@ -1398,7 +1400,7 @@ typedef struct CoffOutHdr { static void coff_write_optional_header(Writer* w, u32 entry_rva, const CoffSection 
out[COFF_NBUCKETS], u32 headers_size_padded, u32 image_size, - int pie, u16 subsystem, + int dynamic_base, u16 subsystem, const CoffImportTable* it, const CoffTlsLayout* tls) { /* Standard fields. */ @@ -1438,7 +1440,8 @@ static void coff_write_optional_header(Writer* w, u32 entry_rva, coff_wr_u32(w, headers_size_padded); coff_wr_u32(w, 0u); /* CheckSum */ coff_wr_u16(w, subsystem ? subsystem : IMAGE_SUBSYSTEM_WINDOWS_CUI); - coff_wr_u16(w, (u16)(PE_DLL_CHARS_BASE | (pie ? PE_DLL_CHARS_ASLR : 0))); + coff_wr_u16( + w, (u16)(PE_DLL_CHARS_BASE | (dynamic_base ? PE_DLL_CHARS_ASLR : 0))); coff_wr_u64(w, PE_STACK_RESERVE); coff_wr_u64(w, PE_STACK_COMMIT); coff_wr_u64(w, PE_HEAP_RESERVE); @@ -1464,7 +1467,7 @@ static void coff_write_optional_header(Writer* w, u32 entry_rva, } else if (i == IMAGE_DIRECTORY_ENTRY_IAT && has_idata) { coff_wr_u32(w, out[COFF_BUCKET_IDATA].rva + it->iat_base); coff_wr_u32(w, it->iat_total); - } else if (i == IMAGE_DIRECTORY_ENTRY_BASERELOC && pie && + } else if (i == IMAGE_DIRECTORY_ENTRY_BASERELOC && dynamic_base && out[COFF_BUCKET_RELOC].in_image) { coff_wr_u32(w, out[COFF_BUCKET_RELOC].rva); coff_wr_u32(w, out[COFF_BUCKET_RELOC].size); @@ -1560,11 +1563,14 @@ void link_emit_coff(LinkImage* img, Writer* w) { * The headers' file size (and therefore every section's file * offset) depends on the section-table entry count, so we need to * commit to "is .reloc emitted?" before laying out file offsets. - * .reloc lights up iff PIE and at least one absolute reloc points - * into a kept section, OR a TLS directory is emitted (its four u64 - * VA fields all need base-relocs). */ + * .reloc lights up iff at least one absolute VA reloc points into a kept + * section, OR a TLS directory is emitted (its VA fields need base-relocs). 
+ * ARM64 Windows rejects fixed images (/dynamicbase:no), and x64 Windows + * accepts ASLR images by default, so PE images advertise DYNAMIC_BASE when + * this table is present instead of tying the table to the generic ELF/Mach-O + * img->pie flag. */ int emit_reloc = 0; - if (img->pie) { + { u32 i; u32 nrel = LinkRelocs_count(&img->relocs); for (i = 0; i < nrel; ++i) { @@ -1693,7 +1699,8 @@ void link_emit_coff(LinkImage* img, Writer* w) { /* ---- pass 7: write everything ---- */ u16 file_chars = IMAGE_FILE_EXECUTABLE_IMAGE | IMAGE_FILE_LARGE_ADDRESS_AWARE; - if (!img->pie || !out[COFF_BUCKET_RELOC].in_image) { + int dynamic_base = out[COFF_BUCKET_RELOC].in_image; + if (!dynamic_base) { file_chars |= IMAGE_FILE_RELOCS_STRIPPED; } @@ -1703,7 +1710,7 @@ void link_emit_coff(LinkImage* img, Writer* w) { coff_write_file_header(w, machine, (u16)nsec, file_chars); u16 subsystem = img->linker ? img->linker->pe_subsystem : 0; coff_write_optional_header(w, entry_rva, out, headers_size_padded, image_size, - img->pie, subsystem, + dynamic_base, subsystem, have_imports ? &imports : NULL, &tls); /* Section table. 
*/ diff --git a/src/obj/coff/read.c b/src/obj/coff/read.c @@ -144,6 +144,26 @@ static void resolve_sym_name(const u8* rec, const u8* strtab, u32 strtab_size, *len_out = n; } +static int coff_reloc_inline_addend(const u8* data, size_t len, + const CSecRec* s, u32 off, u32 width, + i64* out) { + if (!s || !s->size_of_raw_data) return 0; + if ((u64)off + (u64)width > (u64)s->size_of_raw_data) return 0; + if ((u64)s->pointer_to_raw_data + (u64)off + (u64)width > (u64)len) + return 0; + const u8* p = data + s->pointer_to_raw_data + off; + switch (width) { + case 4: + *out = (i64)(i32)coff_rd_u32(p); + return 1; + case 8: + *out = (i64)coff_rd_u64(p); + return 1; + default: + return 0; + } +} + /* ---- short-import record handler ---- * Microsoft "short import" format: a 20-byte ImportObjectHeader * followed by SizeOfData bytes containing two NUL-terminated strings — @@ -648,12 +668,12 @@ ObjBuilder* read_coff(Compiler* c, const char* name, const u8* data, ObjSymId target = OBJ_SYM_NONE; if (r_sym < nsymbols) target = sym_to_obj[r_sym]; - /* AMD64 REL32 encodings are relative to a PC after the relocated - * field, while kit's R_PC32-style apply formula subtracts the - * relocation field address P. Plain REL32 is relative to P+4; - * REL32_N is relative to P+N. Record that convention as an - * implicit negative addend so link_reloc_apply can stay format - * neutral. */ + /* COFF stores addends inline in the relocated field. Fold those + * bytes into Reloc.addend for the reloc kinds whose apply path + * overwrites the field. AMD64 REL32 also subtracts from a PC after + * the relocated field: plain REL32 is relative to P+4, and REL32_N is + * relative to P+N. Record that convention as an implicit negative + * addend so link_reloc_apply can stay format neutral. */ /* ARM64 PAGEOFFSET_12L is one wire code for LDST{8,16,32,64,128}. 
* The per-arch translator returns R_AARCH64_LDST64_ABS_LO12_NC by * default; recover the actual access width from the patched LDR/ @@ -693,30 +713,53 @@ ObjBuilder* read_coff(Compiler* c, const char* name, const u8* data, i64 addend = 0; int has_explicit = 0; if (machine == IMAGE_FILE_MACHINE_AMD64) { + i64 inline_addend = 0; switch (r_type) { + case IMAGE_REL_AMD64_ADDR64: + if (coff_reloc_inline_addend(data, len, s, r_va, 8, + &inline_addend)) + addend = inline_addend; + break; + case IMAGE_REL_AMD64_ADDR32: + if (coff_reloc_inline_addend(data, len, s, r_va, 4, + &inline_addend)) + addend = inline_addend; + break; case IMAGE_REL_AMD64_REL32: - addend = -4; - has_explicit = 1; + if (coff_reloc_inline_addend(data, len, s, r_va, 4, + &inline_addend)) + addend = inline_addend; + addend -= 4; break; case IMAGE_REL_AMD64_REL32_1: - addend = -1; - has_explicit = 1; + if (coff_reloc_inline_addend(data, len, s, r_va, 4, + &inline_addend)) + addend = inline_addend; + addend -= 1; break; case IMAGE_REL_AMD64_REL32_2: - addend = -2; - has_explicit = 1; + if (coff_reloc_inline_addend(data, len, s, r_va, 4, + &inline_addend)) + addend = inline_addend; + addend -= 2; break; case IMAGE_REL_AMD64_REL32_3: - addend = -3; - has_explicit = 1; + if (coff_reloc_inline_addend(data, len, s, r_va, 4, + &inline_addend)) + addend = inline_addend; + addend -= 3; break; case IMAGE_REL_AMD64_REL32_4: - addend = -4; - has_explicit = 1; + if (coff_reloc_inline_addend(data, len, s, r_va, 4, + &inline_addend)) + addend = inline_addend; + addend -= 4; break; case IMAGE_REL_AMD64_REL32_5: - addend = -5; - has_explicit = 1; + if (coff_reloc_inline_addend(data, len, s, r_va, 4, + &inline_addend)) + addend = inline_addend; + addend -= 5; break; default: break; diff --git a/src/obj/coff/reloc_aarch64.c b/src/obj/coff/reloc_aarch64.c @@ -7,8 +7,7 @@ * PAGEOFFSET_12L collapses all LDST*_ABS_LO12_NC widths into one wire * code; the width is recoverable from the patched LDR/STR instruction * encoding, so 
the reader picks the LDST64 form and the consumer can - * disambiguate later if it cares. ADDR32NB is image-relative; v1 - * collapses it to R_ABS32 and lets layout subtract the image base. */ + * disambiguate later if it cares. */ #include "obj/coff/coff.h" @@ -20,6 +19,8 @@ u32 coff_aarch64_reloc_to(u32 kind /* RelocKind */) { return IMAGE_REL_ARM64_ADDR64; case R_ABS32: return IMAGE_REL_ARM64_ADDR32; + case R_COFF_ADDR32NB: + return IMAGE_REL_ARM64_ADDR32NB; case R_AARCH64_CALL26: case R_AARCH64_JUMP26: return IMAGE_REL_ARM64_BRANCH26; @@ -65,7 +66,7 @@ u32 coff_aarch64_reloc_from(u32 wire_type) { case IMAGE_REL_ARM64_ADDR32: return R_ABS32; case IMAGE_REL_ARM64_ADDR32NB: - return R_ABS32; + return R_COFF_ADDR32NB; case IMAGE_REL_ARM64_BRANCH26: return R_AARCH64_CALL26; case IMAGE_REL_ARM64_BRANCH19: diff --git a/src/obj/coff/reloc_x86_64.c b/src/obj/coff/reloc_x86_64.c @@ -20,6 +20,8 @@ u32 coff_x86_64_reloc_to(u32 kind /* RelocKind */) { return IMAGE_REL_AMD64_ADDR64; case R_ABS32: return IMAGE_REL_AMD64_ADDR32; + case R_COFF_ADDR32NB: + return IMAGE_REL_AMD64_ADDR32NB; case R_X64_32S: return IMAGE_REL_AMD64_ADDR32NB; case R_PC32: @@ -48,7 +50,7 @@ u32 coff_x86_64_reloc_from(u32 wire_type) { case IMAGE_REL_AMD64_ADDR32: return R_ABS32; case IMAGE_REL_AMD64_ADDR32NB: - return R_X64_32S; + return R_COFF_ADDR32NB; case IMAGE_REL_AMD64_REL32: case IMAGE_REL_AMD64_REL32_1: case IMAGE_REL_AMD64_REL32_2: diff --git a/src/obj/elf/emit.c b/src/obj/elf/emit.c @@ -300,8 +300,8 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) { obj_symiter_free(it); } u32 max_syms = 1 + (nobjsec - 1) + nobjsym; - u8* symtab = (u8*)arena_alloc(c->scratch, (size_t)sym_size * max_syms, - _Alignof(u64)); + u8* symtab = + (u8*)arena_alloc(c->scratch, (size_t)sym_size * max_syms, _Alignof(u64)); u32 nsyms = 0; memset(&symtab[nsyms * sym_size], 0, sym_size); nsyms = 1; /* index 0: STN_UNDEF */ @@ -310,37 +310,37 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) { * Elf64_Sym (24B) 
and Elf32_Sym (16B) REORDER fields: ELF32 places * st_value/st_size BEFORE st_info/st_other/st_shndx, so select the byte * layout by `is32` rather than just narrowing widths. */ -#define WRITE_SYM(idx, st_name, st_info, st_other, st_shndx, st_value, \ - st_size) \ - do { \ - u8* slot = &symtab[(idx) * sym_size]; \ - if (is32) { \ - slot[0] = (u8)((st_name)); \ - slot[1] = (u8)((st_name) >> 8); \ - slot[2] = (u8)((st_name) >> 16); \ - slot[3] = (u8)((st_name) >> 24); \ - for (int _b = 0; _b < 4; ++_b) \ - slot[4 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \ - for (int _b = 0; _b < 4; ++_b) \ - slot[8 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \ - slot[12] = (u8)((st_info)); \ - slot[13] = (u8)((st_other)); \ - slot[14] = (u8)((st_shndx)); \ - slot[15] = (u8)((st_shndx) >> 8); \ - } else { \ - slot[0] = (u8)((st_name)); \ - slot[1] = (u8)((st_name) >> 8); \ - slot[2] = (u8)((st_name) >> 16); \ - slot[3] = (u8)((st_name) >> 24); \ - slot[4] = (u8)((st_info)); \ - slot[5] = (u8)((st_other)); \ - slot[6] = (u8)((st_shndx)); \ - slot[7] = (u8)((st_shndx) >> 8); \ - for (int _b = 0; _b < 8; ++_b) \ - slot[8 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \ - for (int _b = 0; _b < 8; ++_b) \ - slot[16 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \ - } \ +#define WRITE_SYM(idx, st_name, st_info, st_other, st_shndx, st_value, \ + st_size) \ + do { \ + u8* slot = &symtab[(idx) * sym_size]; \ + if (is32) { \ + slot[0] = (u8)((st_name)); \ + slot[1] = (u8)((st_name) >> 8); \ + slot[2] = (u8)((st_name) >> 16); \ + slot[3] = (u8)((st_name) >> 24); \ + for (int _b = 0; _b < 4; ++_b) \ + slot[4 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \ + for (int _b = 0; _b < 4; ++_b) \ + slot[8 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \ + slot[12] = (u8)((st_info)); \ + slot[13] = (u8)((st_other)); \ + slot[14] = (u8)((st_shndx)); \ + slot[15] = (u8)((st_shndx) >> 8); \ + } else { \ + slot[0] = (u8)((st_name)); \ + slot[1] = (u8)((st_name) >> 8); \ + slot[2] = (u8)((st_name) >> 16); \ + slot[3] = 
(u8)((st_name) >> 24); \ + slot[4] = (u8)((st_info)); \ + slot[5] = (u8)((st_other)); \ + slot[6] = (u8)((st_shndx)); \ + slot[7] = (u8)((st_shndx) >> 8); \ + for (int _b = 0; _b < 8; ++_b) \ + slot[8 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \ + for (int _b = 0; _b < 8; ++_b) \ + slot[16 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \ + } \ } while (0) /* No automatic STT_SECTION synthesis. Section symbols are emitted diff --git a/src/obj/obj.c b/src/obj/obj.c @@ -1029,6 +1029,7 @@ const char* reloc_kind_name(RelocKind k) { _CASE(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21); _CASE(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC); _CASE(R_AARCH64_TPOFF64); + _CASE(R_COFF_ADDR32NB); _CASE(R_AARCH64_GLOB_DAT); _CASE(R_AARCH64_JUMP_SLOT); _CASE(R_AARCH64_RELATIVE); diff --git a/src/obj/obj.h b/src/obj/obj.h @@ -277,6 +277,9 @@ typedef enum RelocKind { /* Internal-only: a raw 64-bit AArch64/RISC-V local-exec tpoff written into * a TLS GOT slot ((target - tls_vaddr) + TCB). Never appears on the wire. */ R_AARCH64_TPOFF64, + /* COFF ADDR32NB: 32-bit image-relative RVA (S + A - ImageBase), used by + * PE exception tables and other image metadata. 
*/ + R_COFF_ADDR32NB, } RelocKind; typedef struct Section { diff --git a/test/coff/kit-roundtrip-coff.c b/test/coff/kit-roundtrip-coff.c @@ -679,6 +679,61 @@ static void test_data_with_reloc_rel32_x64(void) { free_compiler(c); } +static void verify_rel32_inline_addend(const ObjBuilder* ob, Pool* p) { + ObjSecId text_id = find_section_id(ob, p, ".text"); + EXPECT(text_id != OBJ_SEC_NONE, ".text id"); + u32 total = obj_reloc_total(ob); + const Reloc* found = NULL; + for (u32 i = 0; i < total; ++i) { + const Reloc* r = obj_reloc_at(ob, i); + if (r->removed) continue; + if (r->section_id != text_id) continue; + found = r; + break; + } + EXPECT(found != NULL, "no reloc on .text"); + if (found) { + EXPECT(found->kind == R_PC32, "reloc kind=%u (want R_PC32=%u)", + found->kind, R_PC32); + EXPECT(found->addend == 0x124, "reloc addend=%lld (want 0x124)", + (long long)found->addend); + EXPECT(found->has_explicit_addend == 0, "inline addend marked explicit"); + } +} + +static void test_reloc_rel32_inline_addend_x64(void) { + g_test_name = "reloc_rel32_inline_addend_x64"; + KitTargetSpec t; + target_x64_windows(&t); + Compiler* c = make_compiler(&t); + if (!c) { + EXPECT(0, "compiler_new"); + return; + } + if (setjmp(c->panic)) { + compiler_run_cleanups(c); + free_compiler(c); + EXPECT(0, "panic"); + return; + } + ObjBuilder* ob = obj_new(c); + Pool* p = c->global; + Sym tn = pool_intern_slice(p, SLICE_LIT(".text")); + Sym hn = pool_intern_slice(p, SLICE_LIT("helper")); + ObjSecId sec = obj_section(ob, tn, SEC_TEXT, SF_ALLOC | SF_EXEC, 16); + /* call helper+0x128; ret. COFF stores this addend inline. 
*/ + static const uint8_t bytes[6] = {0xe8, 0x28, 0x01, 0, 0, 0xc3}; + obj_write(ob, sec, bytes, sizeof bytes); + ObjSymId helper = obj_symbol(ob, hn, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0); + obj_reloc_ex(ob, sec, 1, R_PC32, helper, 0, 0, 0); + obj_finalize(ob); + + run_roundtrip(c, ob, verify_rel32_inline_addend); + + obj_free(ob); + free_compiler(c); +} + /* test_aa64_branch26: .text with a BRANCH26 (R_AARCH64_CALL26) * relocation against an external. */ @@ -1403,6 +1458,7 @@ static const struct { {"data_with_reloc_abs64_x64", test_data_with_reloc_abs64_x64}, {"data_with_reloc_abs64_aa64", test_data_with_reloc_abs64_aa64}, {"reloc_rel32_x64", test_data_with_reloc_rel32_x64}, + {"reloc_rel32_inline_addend_x64", test_reloc_rel32_inline_addend_x64}, {"aa64_branch26", test_aa64_branch26}, {"aa64_pagebase_pageoffset", test_aa64_pagebase_pageoffset}, {"long_section_name", test_long_section_name},