commit 4c4f1db31be66b1fab039b0a1fa8c4f8ab2bdd6f
parent eff11e543f602e9bccfb86bdd2e446271957de85
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 4 Jun 2026 19:00:38 -0700
Fix Windows PE x64 startup and callback ABI
Diffstat:
13 files changed, 307 insertions(+), 120 deletions(-)
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -370,6 +370,13 @@ struct NativeTarget {
* up front. */
void (*reserve_callee_saves)(NativeTarget*, const u32* used_by_class,
u32 nclasses);
+ /* Optional live-ABI callee-saved register mask for a class. Static
+ * NativeAllocClassInfo masks describe the target register file, but some
+ * targets vary preservation rules by OS ABI (x64 SysV vs Win64 XMM regs).
+ * Direct emission uses this to decide which borrowed scratch/cache registers
+ * must be reported to reserve_callee_saves(). NULL falls back to
+ * NativeAllocClassInfo.callee_saved_mask. */
+ u32 (*callee_saved_mask)(NativeTarget*, NativeAllocClass);
/* Optional. When set, the optimizer emit path calls this once — after
* func_begin, reserve_callee_saves, and frame-slot mapping, but before the
* body — to emit a minimal, exact-size prologue in place (no reserved NOP
diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c
@@ -19,13 +19,13 @@
* are reserved (stack/frame pointers). RAX is reserved too (return value, the
* div/mul implicit operand), but it is NOT an emit temp, so inline asm may pin
* an operand to it (the Linux syscall idiom) — see x64_asm_operand_reg_ok.
- * Everything else is allocable. The driver scratch pool is RBX/R12 (int) and
- * XMM12/XMM13 (fp), disjoint from the emit temps so a hook never clobbers an
- * operand parked there. ABI arg/ret
- * registers are caller-saved-allocable; callee-saved set is resolved per-OS via
- * x64_abi_for_os at runtime (the legality masks below are SysV's, the conserva-
- * tive superset that both ABIs' allocators respect — Win64's extra callee-saves
- * RDI/RSI/xmm6-15 only shrink the allocable pool, never grow it). */
+ * The driver scratch pool is R8/R9 (int) and XMM4/XMM5 (fp), caller-saved on
+ * both SysV and Win64 and disjoint from the emit temps so a hook never clobbers
+ * an operand parked there. Scratch registers are reserved from allocation.
+ * Callee-saved set is resolved per-OS via x64_abi_for_os at runtime (the
+ * legality masks below are SysV's, the conservative superset that both ABIs'
+ * allocators respect — Win64's extra callee-saves RDI/RSI/xmm6-15 only shrink
+ * the allocable pool, never grow it). */
#include <string.h>
@@ -179,6 +179,13 @@ static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l);
.flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, \
.spill_cost = 1u, \
.copy_cost = 1u}
+#define X64_PHYS_INT_ARG_RESERVED(r) /* int ABI arg reg flagged RESERVED, not ALLOCABLE */ \
+ {.reg = (r), \
+ .cls = NATIVE_REG_INT, \
+ .abi_index = 0xffu, /* presumably "no ABI index" sentinel - confirm */ \
+ .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RESERVED, \
+ .spill_cost = 0u, \
+ .copy_cost = 0u}
#define X64_PHYS_INT_RET_ARG(r) \
{.reg = (r), \
.cls = NATIVE_REG_INT, \
@@ -209,27 +216,26 @@ static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l);
.spill_cost = 0u, \
.copy_cost = 0u}
-/* Allocable int pool, opt's spill/reload set: callee-saves first so the direct
- * path's local cache prefers regs that don't grow the prologue. R10/R11 are
- * emit scratch (reserved); RBX/R12 are the driver scratch pool; RAX is reserved
- * (return / div-mul, asm-pinnable). */
+/* Allocable int pool, opt's spill/reload set. R8/R9 are the driver scratch
+ * pool; R10/R11 are emit scratch (reserved); RAX is reserved (return / div-mul,
+ * asm-pinnable); RBX/R12 stay reserved but are no longer driver scratch. */
static const Reg x64_int_allocable[] = {X64_R13, X64_R14, X64_R15};
-static const Reg x64_int_scratch[] = {X64_RBX, X64_R12};
+static const Reg x64_int_scratch[] = {X64_R8, X64_R9};
static const NativePhysRegInfo x64_int_phys[] = {
X64_PHYS_INT_RESERVED(X64_RAX), /* return / div-mul (asm-pinnable) */
X64_PHYS_INT_ARG(X64_RCX),
X64_PHYS_INT_RET_ARG(X64_RDX),
- X64_PHYS_INT_RESERVED(X64_RBX), /* driver scratch */
+ X64_PHYS_INT_RESERVED(X64_RBX),
X64_PHYS_INT_RESERVED(X64_RSP), /* stack pointer */
X64_PHYS_INT_RESERVED(X64_RBP), /* frame pointer */
X64_PHYS_INT_ARG(X64_RSI),
X64_PHYS_INT_ARG(X64_RDI),
- X64_PHYS_INT_ARG(X64_R8),
- X64_PHYS_INT_ARG(X64_R9),
- X64_PHYS_INT_RESERVED(X64_R10), /* emit scratch */
- X64_PHYS_INT_RESERVED(X64_R11), /* emit scratch */
- X64_PHYS_INT_RESERVED(X64_R12), /* driver scratch */
+ X64_PHYS_INT_ARG_RESERVED(X64_R8), /* driver scratch */
+ X64_PHYS_INT_ARG_RESERVED(X64_R9), /* driver scratch */
+ X64_PHYS_INT_RESERVED(X64_R10), /* emit scratch */
+ X64_PHYS_INT_RESERVED(X64_R11), /* emit scratch */
+ X64_PHYS_INT_RESERVED(X64_R12),
X64_PHYS_INT_CALLEE(X64_R13),
X64_PHYS_INT_CALLEE(X64_R14),
X64_PHYS_INT_CALLEE(X64_R15),
@@ -250,6 +256,13 @@ static const NativePhysRegInfo x64_int_phys[] = {
.flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG, \
.spill_cost = 1u, \
.copy_cost = 1u}
+#define X64_PHYS_FP_ARG_RESERVED(r) /* fp ABI arg reg flagged RESERVED, not ALLOCABLE */ \
+ {.reg = (r), \
+ .cls = NATIVE_REG_FP, \
+ .abi_index = 0xffu, /* presumably "no ABI index" sentinel - confirm */ \
+ .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | NATIVE_REG_RESERVED, \
+ .spill_cost = 0u, \
+ .copy_cost = 0u}
#define X64_PHYS_FP_CALLER(r) \
{.reg = (r), \
.cls = NATIVE_REG_FP, \
@@ -265,21 +278,20 @@ static const NativePhysRegInfo x64_int_phys[] = {
.spill_cost = 0u, \
.copy_cost = 0u}
-/* Allocable FP pool: xmm6..xmm13 (keep arg/ret xmm0..5 clear). xmm14/xmm15 are
- * emit scratch; xmm12/xmm13 the driver scratch pool. */
+/* Allocable FP pool: xmm6..xmm11 (keep arg/ret xmm0..5 clear). xmm4/xmm5 are
+ * driver scratch; xmm14/xmm15 are emit scratch; xmm12/xmm13 stay reserved. */
static const Reg x64_fp_allocable[] = {
X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, X64_XMM0 + 10, X64_XMM0 + 11};
-static const Reg x64_fp_scratch[] = {X64_XMM0 + 12, X64_XMM0 + 13};
+static const Reg x64_fp_scratch[] = {X64_XMM4, X64_XMM5};
static const NativePhysRegInfo x64_fp_phys[] = {
X64_PHYS_FP_ARG_RET(X64_XMM0), X64_PHYS_FP_ARG_RET(X64_XMM1),
X64_PHYS_FP_ARG(X64_XMM2), X64_PHYS_FP_ARG(X64_XMM3),
- X64_PHYS_FP_ARG(X64_XMM4), X64_PHYS_FP_ARG(X64_XMM5),
+ X64_PHYS_FP_ARG_RESERVED(X64_XMM4), X64_PHYS_FP_ARG_RESERVED(X64_XMM5),
X64_PHYS_FP_CALLER(X64_XMM6), X64_PHYS_FP_CALLER(X64_XMM7),
X64_PHYS_FP_CALLER(X64_XMM8), X64_PHYS_FP_CALLER(X64_XMM0 + 9),
X64_PHYS_FP_CALLER(X64_XMM0 + 10), X64_PHYS_FP_CALLER(X64_XMM0 + 11),
- X64_PHYS_FP_RESERVED(X64_XMM0 + 12), /* driver scratch */
- X64_PHYS_FP_RESERVED(X64_XMM0 + 13), /* driver scratch */
+ X64_PHYS_FP_RESERVED(X64_XMM0 + 12), X64_PHYS_FP_RESERVED(X64_XMM0 + 13),
X64_PHYS_FP_RESERVED(X64_XMM0 + 14), /* emit scratch */
X64_PHYS_FP_RESERVED(X64_XMM15), /* emit scratch */
};
@@ -303,10 +315,10 @@ static const NativeAllocClassInfo x64_classes[] = {
.arg_mask = (1u << X64_RDI) | (1u << X64_RSI) | (1u << X64_RDX) |
(1u << X64_RCX) | (1u << X64_R8) | (1u << X64_R9),
.ret_mask = (1u << X64_RAX) | (1u << X64_RDX),
- /* rax, rsp, rbp reserved; r10/r11 emit scratch; rbx/r12 driver scratch */
+ /* rax, rsp, rbp reserved; r8/r9 driver scratch; r10/r11 emit scratch; rbx/r12 kept reserved */
.reserved_mask = (1u << X64_RAX) | (1u << X64_RSP) | (1u << X64_RBP) |
- (1u << X64_R10) | (1u << X64_R11) | (1u << X64_RBX) |
- (1u << X64_R12)},
+ (1u << X64_R8) | (1u << X64_R9) | (1u << X64_R10) |
+ (1u << X64_R11) | (1u << X64_RBX) | (1u << X64_R12)},
{.cls = NATIVE_REG_FP,
.allocable = x64_fp_allocable,
.nallocable = sizeof x64_fp_allocable / sizeof x64_fp_allocable[0],
@@ -319,8 +331,9 @@ static const NativeAllocClassInfo x64_classes[] = {
.callee_saved_mask = 0u,
.arg_mask = 0xffu, /* xmm0..xmm7 */
.ret_mask = (1u << X64_XMM0) | (1u << X64_XMM1),
- /* xmm12..xmm15 reserved (driver scratch + emit scratch) */
- .reserved_mask = (1u << (X64_XMM0 + 12)) | (1u << (X64_XMM0 + 13)) |
+ /* xmm4/xmm5 driver scratch; xmm14/xmm15 emit scratch; xmm12/xmm13 kept reserved. */
+ .reserved_mask = (1u << X64_XMM4) | (1u << X64_XMM5) |
+ (1u << (X64_XMM0 + 12)) | (1u << (X64_XMM0 + 13)) |
(1u << (X64_XMM0 + 14)) | (1u << X64_XMM15)},
};
@@ -356,15 +369,15 @@ static int x64_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls,
if (cls == NATIVE_REG_INT) {
switch (reg) {
/* RAX is reserved but not an emit temp, so it is a legal asm pin (the
- * Linux syscall number/return register). R10/R11 are emit scratch and
- * RBX/R12 the driver scratch pool, so those stay excluded. */
+ * Linux syscall number/return register). R8/R9 are driver scratch and
+ * R10/R11 are emit scratch, so those stay excluded. */
case X64_RAX:
+ case X64_RBX:
case X64_RCX:
case X64_RDX:
case X64_RSI:
case X64_RDI:
- case X64_R8:
- case X64_R9:
+ case X64_R12:
case X64_R13:
case X64_R14:
case X64_R15:
@@ -373,7 +386,8 @@ static int x64_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls,
return 0;
}
}
- if (cls == NATIVE_REG_FP) return reg <= X64_XMM0 + 11u;
+ if (cls == NATIVE_REG_FP)
+ return reg <= X64_XMM0 + 13u && reg != X64_XMM4 && reg != X64_XMM5;
return 0;
}
@@ -1737,6 +1751,20 @@ static void x64_reserve_callee_saves(NativeTarget* t, const u32* used,
static int x64_reg_is_callee_int(const X64ABIRegs* abi, Reg r);
static int x64_reg_is_callee_fp(const X64ABIRegs* abi, Reg r);
+
+static u32 x64_live_callee_saved_mask(NativeTarget* t,
+ NativeAllocClass cls) { /* callee-saved bitmask of `cls` per the target's a->abi */
+ X64NativeTarget* a = x64_of(t);
+ u32 mask = 0;
+ for (Reg r = 0; r < 16u; ++r) { /* probe register numbers 0..15 */
+ if (cls == NATIVE_REG_INT && x64_reg_is_callee_int(a->abi, r))
+ mask |= 1u << r;
+ if (cls == NATIVE_REG_FP && x64_reg_is_callee_fp(a->abi, r))
+ mask |= 1u << r;
+ }
+ return mask; /* stays 0 for any class other than INT/FP */
+}
+
static void x64_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
u32 nclob, u32* int_mask, u32* fp_mask);
@@ -4064,6 +4092,7 @@ NativeTarget* x64_native_target_new(Compiler* c, ObjBuilder* obj,
/* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
* set; x64_func_begin_known_frame derives the records from the masks. */
t->reserve_callee_saves = x64_reserve_callee_saves;
+ t->callee_saved_mask = x64_live_callee_saved_mask;
t->signature_stack_bytes = x64_signature_stack_bytes;
t->call_stack_bytes = x64_call_stack_bytes;
t->has_store_zero_reg = 0;
@@ -4258,12 +4287,12 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp);
clob_int |= abi_int;
clob_fp |= abi_fp;
- /* Reserve emit scratch (r10,r11), driver scratch (rbx,r12), rax (reserved;
+ /* Reserve emit scratch (r10,r11), driver scratch (r8,r9), rax (reserved;
* only self-allocated here when explicitly pinned), sp/bp, and clobbers. */
used_int = clob_int | (1u << X64_RAX) | (1u << X64_R11) | (1u << X64_RSP) |
- (1u << X64_RBP) | (1u << X64_RBX) | (1u << X64_R12) |
+ (1u << X64_RBP) | (1u << X64_R8) | (1u << X64_R9) |
(1u << X64_R10);
- used_fp = clob_fp | (1u << (X64_XMM0 + 12)) | (1u << (X64_XMM0 + 13)) |
+ used_fp = clob_fp | (1u << X64_XMM4) | (1u << X64_XMM5) |
(1u << (X64_XMM0 + 14)) | (1u << X64_XMM15);
for (i = 0; i < nout; ++i) {
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -108,6 +108,19 @@ static Reg nd_cache_reg_for(NativeDirectTarget* d, CGLocal local,
KitCgTypeId access_type);
static Reg nd_pick_cache_victim(NativeDirectTarget* d, NativeAllocClass cls);
+static u32 nd_callee_saved_mask(NativeDirectTarget* d, NativeAllocClass cls) {
+ if (d->native && d->native->callee_saved_mask) /* optional backend hook wins */
+ return d->native->callee_saved_mask(d->native, cls);
+ return nd_class_info(d, cls)->callee_saved_mask; /* fallback: static class-info mask */
+}
+
+static void nd_note_reg_used(NativeDirectTarget* d, NativeAllocClass cls,
+ Reg reg) { /* remember `reg` if nd_callee_saved_mask() marks it callee-saved */
+ if ((u32)cls >= 3u || reg >= 32u) return; /* outside callee_saved_used[3] / a u32 bitmask */
+ if (nd_callee_saved_mask(d, cls) & (1u << reg))
+ d->callee_saved_used[cls] |= 1u << reg; /* accumulated per function; consumed in nd_func_end */
+}
+
static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) {
const NativeAllocClassInfo* ci = nd_class_info(d, cls);
const Reg* regs = ci->scratch;
@@ -121,6 +134,7 @@ static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) {
if ((d->scratch_used[cls] & (1u << r)) == 0 &&
d->reg_owner[cls][r] == CG_LOCAL_NONE) {
d->scratch_used[cls] |= 1u << r;
+ nd_note_reg_used(d, cls, r);
return r;
}
}
@@ -134,6 +148,7 @@ static Reg nd_scratch_acquire(NativeDirectTarget* d, NativeAllocClass cls) {
if (r != REG_NONE) {
nd_flush_local(d, d->reg_owner[cls][r]);
d->scratch_used[cls] |= 1u << r;
+ nd_note_reg_used(d, cls, r);
return r;
}
}
@@ -468,9 +483,10 @@ static void nd_store_operand_from_reg(NativeDirectTarget* d, Operand dst,
* (nd_dst_reg/nd_dst_writeback) and are always dirty; reads hit a live entry or
* fall back to a frame load without creating one. nd_flush_all spills and
* empties the cache at the top of every non-pure-compute op, so the cache only
- * survives across straight-line runs of compute ops. Caching only caller-saved
- * registers means that conservative flush fully covers ABI clobbering across
- * calls, and no callee-save prologue/epilogue work is required. */
+ * survives across straight-line runs of compute ops. Caching prefers the
+ * register-file caller-saved mask; if the live OS ABI treats one of those
+ * registers as callee-saved, nd_note_reg_used records it and nd_func_end hands
+ * the set to reserve_callee_saves() before the deferred prologue is patched. */
static int nd_local_cacheable(NativeDirectTarget* d,
const NativeDirectLocal* l) {
@@ -534,12 +550,15 @@ static Reg nd_cache_alloc(NativeDirectTarget* d, NativeAllocClass cls) {
Reg r = ci->allocable[i];
if (r >= 32u) continue;
if ((caller & (1u << r)) && d->reg_owner[cls][r] == CG_LOCAL_NONE &&
- (d->scratch_used[cls] & (1u << r)) == 0)
+ (d->scratch_used[cls] & (1u << r)) == 0) {
+ nd_note_reg_used(d, cls, r);
return r;
+ }
}
victim = nd_pick_cache_victim(d, cls);
if (victim != REG_NONE && (caller & (1u << victim))) {
nd_flush_local(d, d->reg_owner[cls][victim]);
+ nd_note_reg_used(d, cls, victim);
return victim;
}
return REG_NONE;
@@ -910,6 +929,7 @@ static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) {
d->cache_tail = -1;
d->ncached = 0;
memset(d->scratch_used, 0, sizeof d->scratch_used);
+ memset(d->callee_saved_used, 0, sizeof d->callee_saved_used);
memset(d->reg_owner, 0, sizeof d->reg_owner);
if (d->native && d->native->func_begin) d->native->func_begin(d->native, fd);
}
@@ -917,8 +937,18 @@ static void nd_func_begin(CgTarget* t, const CGFuncDesc* fd) {
static void nd_func_end(CgTarget* t) {
NativeDirectTarget* d = nd_of(t);
NativeFramePatchState frame;
+ u32 ncallee_classes = 0;
memset(&frame, 0, sizeof frame);
frame.max_outgoing = d->max_outgoing;
+ for (u32 cls = 0; cls < 3u; ++cls) {
+ if (d->callee_saved_used[cls]) ncallee_classes = cls + 1u;
+ }
+ if (ncallee_classes) {
+ if (!d->native || !d->native->reserve_callee_saves)
+ nd_panic(d, "target cannot preserve callee-saved scratch registers");
+ d->native->reserve_callee_saves(d->native, d->callee_saved_used,
+ ncallee_classes);
+ }
if (d->native && d->native->note_frame_state)
d->native->note_frame_state(d->native, &frame);
if (d->native && d->native->patch_apply) d->native->patch_apply(d->native);
@@ -1590,11 +1620,13 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) {
NativeCallDesc nd;
NativeLoc* args;
NativeLoc* results;
- int release_callee = 0;
+ NativeLoc callee_tmp;
+ int release_callee_tmp = 0;
nd_flush_all(d);
nd_barrier(d, NATIVE_DIRECT_BARRIER_CALL | NATIVE_DIRECT_BARRIER_MEMORY);
memset(&plan, 0, sizeof plan);
memset(&nd, 0, sizeof nd);
+ memset(&callee_tmp, 0, sizeof callee_tmp);
args = nd_loc_buf(d, d->argbuf, ND_ARG_BUF, desc->nargs);
results = nd_loc_buf(d, d->retbuf, ND_RET_BUF, desc->nresults);
for (u32 i = 0; i < desc->nargs; ++i)
@@ -1603,6 +1635,13 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) {
results[i] = nd_loc_frame(d, desc->results[i], 0);
nd.fn_type = desc->fn_type;
nd.callee = nd_loc_operand(d, desc->callee);
+ if (nd.callee.kind == NATIVE_LOC_FRAME) {
+ callee_tmp = nd_materialize_loc(d, nd.callee,
+ (NativeAllocClass)nd.callee.cls,
+ nd.callee.type);
+ nd.callee = callee_tmp;
+ release_callee_tmp = 1;
+ }
nd.args = args;
nd.results = results;
nd.nargs = desc->nargs;
@@ -1621,12 +1660,6 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) {
d->max_outgoing = plan.stack_arg_size;
for (u32 i = 0; i < plan.nargs; ++i)
nd_write_loc(d, plan.args[i].dst, plan.args[i].src, plan.args[i].mem);
- if (plan.callee.kind == NATIVE_LOC_FRAME) {
- NativeLoc callee = nd_materialize_loc(
- d, plan.callee, (NativeAllocClass)plan.callee.cls, plan.callee.type);
- plan.callee = callee;
- release_callee = 1;
- }
if (d->ops && d->ops->emit_call)
d->ops->emit_call(d, &plan);
else {
@@ -1635,8 +1668,9 @@ static void nd_call(CgTarget* t, const CGCallDesc* desc) {
}
for (u32 i = 0; i < plan.nrets; ++i)
nd_write_loc(d, plan.rets[i].dst, plan.rets[i].src, plan.rets[i].mem);
- if (release_callee)
- nd_scratch_release(d, (NativeAllocClass)plan.callee.cls, plan.callee.v.reg);
+ if (release_callee_tmp)
+ nd_scratch_release(d, (NativeAllocClass)callee_tmp.cls,
+ callee_tmp.v.reg);
}
static const char* nd_tail_call_unrealizable_reason(CgTarget* t,
diff --git a/src/cg/native_direct_target.h b/src/cg/native_direct_target.h
@@ -135,6 +135,9 @@ struct NativeDirectTarget {
u32 scopes_cap;
u32 scratch_used[3];
+ /* Per-function callee-saved registers borrowed by direct scratch/cache
+ * allocation. Reported to the native backend before prologue patching. */
+ u32 callee_saved_used[3];
/* Local register cache (write-back, basic-block-scoped). reg_owner[cls][reg]
* names the semantic local currently cached in that physical register, or
* CG_LOCAL_NONE. scratch_used doubles as the per-class "pinned for the
diff --git a/src/link/link_reloc_layout.c b/src/link/link_reloc_layout.c
@@ -361,6 +361,7 @@ static u8 reloc_width(RelocKind k) {
* small DWARF symbol differences these encode. */
return RELOC_RV_ULEB128_NOMINAL_WIDTH;
case R_COFF_SECREL:
+ case R_COFF_ADDR32NB:
return 4;
case R_COFF_SECTION:
return 2;
diff --git a/src/obj/coff/link.c b/src/obj/coff/link.c
@@ -754,7 +754,8 @@ static void coff_emit_idata(LinkImage* img, const CoffImportTable* it,
u8* buf;
/* Allocate the bucket buffer (idata_size is already block-aligned). */
buf = (u8*)heap->alloc(heap, it->idata_size, _Alignof(u64));
- if (!buf) compiler_panic(c, SRCLOC_NONE, "link_emit_coff: oom on .idata buffer");
+ if (!buf)
+ compiler_panic(c, SRCLOC_NONE, "link_emit_coff: oom on .idata buffer");
memset(buf, 0, it->idata_size);
idata->bytes = buf;
idata->size = it->idata_size;
@@ -1133,11 +1134,6 @@ static void coff_build_reloc_section(LinkImage* img,
u32 cap = 0;
u32 i;
- if (!img->pie) {
- reloc->bytes = NULL;
- reloc->size = 0;
- return;
- }
for (i = 0; i < nrel; ++i) {
const LinkRelocApply* r = LinkRelocs_at(&img->relocs, i);
const LinkSection* ls;
@@ -1350,6 +1346,12 @@ static void coff_apply_all_relocs(LinkImage* img,
}
continue;
}
+ if (r->kind == R_COFF_ADDR32NB) {
+ u64 inline_addend = rd_u32_le(P_bytes);
+ u64 v = (S - img_base) + inline_addend + (u64)r->addend;
+ wr_u32_le(P_bytes, (u32)(v & 0xffffffffu));
+ continue;
+ }
link_reloc_apply(c, r->kind, P_bytes, S, r->addend, P);
}
}
@@ -1398,7 +1400,7 @@ typedef struct CoffOutHdr {
static void coff_write_optional_header(Writer* w, u32 entry_rva,
const CoffSection out[COFF_NBUCKETS],
u32 headers_size_padded, u32 image_size,
- int pie, u16 subsystem,
+ int dynamic_base, u16 subsystem,
const CoffImportTable* it,
const CoffTlsLayout* tls) {
/* Standard fields. */
@@ -1438,7 +1440,8 @@ static void coff_write_optional_header(Writer* w, u32 entry_rva,
coff_wr_u32(w, headers_size_padded);
coff_wr_u32(w, 0u); /* CheckSum */
coff_wr_u16(w, subsystem ? subsystem : IMAGE_SUBSYSTEM_WINDOWS_CUI);
- coff_wr_u16(w, (u16)(PE_DLL_CHARS_BASE | (pie ? PE_DLL_CHARS_ASLR : 0)));
+ coff_wr_u16(
+ w, (u16)(PE_DLL_CHARS_BASE | (dynamic_base ? PE_DLL_CHARS_ASLR : 0)));
coff_wr_u64(w, PE_STACK_RESERVE);
coff_wr_u64(w, PE_STACK_COMMIT);
coff_wr_u64(w, PE_HEAP_RESERVE);
@@ -1464,7 +1467,7 @@ static void coff_write_optional_header(Writer* w, u32 entry_rva,
} else if (i == IMAGE_DIRECTORY_ENTRY_IAT && has_idata) {
coff_wr_u32(w, out[COFF_BUCKET_IDATA].rva + it->iat_base);
coff_wr_u32(w, it->iat_total);
- } else if (i == IMAGE_DIRECTORY_ENTRY_BASERELOC && pie &&
+ } else if (i == IMAGE_DIRECTORY_ENTRY_BASERELOC && dynamic_base &&
out[COFF_BUCKET_RELOC].in_image) {
coff_wr_u32(w, out[COFF_BUCKET_RELOC].rva);
coff_wr_u32(w, out[COFF_BUCKET_RELOC].size);
@@ -1560,11 +1563,14 @@ void link_emit_coff(LinkImage* img, Writer* w) {
* The headers' file size (and therefore every section's file
* offset) depends on the section-table entry count, so we need to
* commit to "is .reloc emitted?" before laying out file offsets.
- * .reloc lights up iff PIE and at least one absolute reloc points
- * into a kept section, OR a TLS directory is emitted (its four u64
- * VA fields all need base-relocs). */
+ * .reloc lights up iff at least one absolute VA reloc points into a kept
+ * section, OR a TLS directory is emitted (its VA fields need base-relocs).
+ * ARM64 Windows rejects fixed images (/dynamicbase:no), and x64 Windows
+ * accepts ASLR images by default, so PE images advertise DYNAMIC_BASE when
+ * this table is present instead of tying the table to the generic ELF/Mach-O
+ * img->pie flag. */
int emit_reloc = 0;
- if (img->pie) {
+ {
u32 i;
u32 nrel = LinkRelocs_count(&img->relocs);
for (i = 0; i < nrel; ++i) {
@@ -1693,7 +1699,8 @@ void link_emit_coff(LinkImage* img, Writer* w) {
/* ---- pass 7: write everything ---- */
u16 file_chars = IMAGE_FILE_EXECUTABLE_IMAGE | IMAGE_FILE_LARGE_ADDRESS_AWARE;
- if (!img->pie || !out[COFF_BUCKET_RELOC].in_image) {
+ int dynamic_base = out[COFF_BUCKET_RELOC].in_image;
+ if (!dynamic_base) {
file_chars |= IMAGE_FILE_RELOCS_STRIPPED;
}
@@ -1703,7 +1710,7 @@ void link_emit_coff(LinkImage* img, Writer* w) {
coff_write_file_header(w, machine, (u16)nsec, file_chars);
u16 subsystem = img->linker ? img->linker->pe_subsystem : 0;
coff_write_optional_header(w, entry_rva, out, headers_size_padded, image_size,
- img->pie, subsystem,
+ dynamic_base, subsystem,
have_imports ? &imports : NULL, &tls);
/* Section table. */
diff --git a/src/obj/coff/read.c b/src/obj/coff/read.c
@@ -144,6 +144,26 @@ static void resolve_sym_name(const u8* rec, const u8* strtab, u32 strtab_size,
*len_out = n;
}
+static int coff_reloc_inline_addend(const u8* data, size_t len,
+ const CSecRec* s, u32 off, u32 width,
+ i64* out) { /* read the `width`-byte field at section offset `off`; 1 + *out on success, else 0 */
+ if (!s || !s->size_of_raw_data) return 0; /* no section record or no raw bytes */
+ if ((u64)off + (u64)width > (u64)s->size_of_raw_data) return 0; /* field runs past the section */
+ if ((u64)s->pointer_to_raw_data + (u64)off + (u64)width > (u64)len)
+ return 0; /* field runs past the end of the input buffer */
+ const u8* p = data + s->pointer_to_raw_data + off;
+ switch (width) {
+ case 4:
+ *out = (i64)(i32)coff_rd_u32(p); /* sign-extend the 32-bit field */
+ return 1;
+ case 8:
+ *out = (i64)coff_rd_u64(p);
+ return 1;
+ default:
+ return 0; /* only 4- and 8-byte fields handled; *out left untouched */
+ }
+}
+
/* ---- short-import record handler ----
* Microsoft "short import" format: a 20-byte ImportObjectHeader
* followed by SizeOfData bytes containing two NUL-terminated strings —
@@ -648,12 +668,12 @@ ObjBuilder* read_coff(Compiler* c, const char* name, const u8* data,
ObjSymId target = OBJ_SYM_NONE;
if (r_sym < nsymbols) target = sym_to_obj[r_sym];
- /* AMD64 REL32 encodings are relative to a PC after the relocated
- * field, while kit's R_PC32-style apply formula subtracts the
- * relocation field address P. Plain REL32 is relative to P+4;
- * REL32_N is relative to P+N. Record that convention as an
- * implicit negative addend so link_reloc_apply can stay format
- * neutral. */
+ /* COFF stores addends inline in the relocated field. Fold those
+ * bytes into Reloc.addend for the reloc kinds whose apply path
+ * overwrites the field. AMD64 REL32 also subtracts from a PC after
+ * the relocated field: plain REL32 is relative to P+4, and REL32_N is
+ * relative to P+N. Record that convention as an implicit negative
+ * addend so link_reloc_apply can stay format neutral. */
/* ARM64 PAGEOFFSET_12L is one wire code for LDST{8,16,32,64,128}.
* The per-arch translator returns R_AARCH64_LDST64_ABS_LO12_NC by
* default; recover the actual access width from the patched LDR/
@@ -693,30 +713,53 @@ ObjBuilder* read_coff(Compiler* c, const char* name, const u8* data,
i64 addend = 0;
int has_explicit = 0;
if (machine == IMAGE_FILE_MACHINE_AMD64) {
+ i64 inline_addend = 0;
switch (r_type) {
+ case IMAGE_REL_AMD64_ADDR64:
+ if (coff_reloc_inline_addend(data, len, s, r_va, 8,
+ &inline_addend))
+ addend = inline_addend;
+ break;
+ case IMAGE_REL_AMD64_ADDR32:
+ if (coff_reloc_inline_addend(data, len, s, r_va, 4,
+ &inline_addend))
+ addend = inline_addend;
+ break;
case IMAGE_REL_AMD64_REL32:
- addend = -4;
- has_explicit = 1;
+ if (coff_reloc_inline_addend(data, len, s, r_va, 4,
+ &inline_addend))
+ addend = inline_addend;
+ addend -= 4;
break;
case IMAGE_REL_AMD64_REL32_1:
- addend = -1;
- has_explicit = 1;
+ if (coff_reloc_inline_addend(data, len, s, r_va, 4,
+ &inline_addend))
+ addend = inline_addend;
+ addend -= 1;
break;
case IMAGE_REL_AMD64_REL32_2:
- addend = -2;
- has_explicit = 1;
+ if (coff_reloc_inline_addend(data, len, s, r_va, 4,
+ &inline_addend))
+ addend = inline_addend;
+ addend -= 2;
break;
case IMAGE_REL_AMD64_REL32_3:
- addend = -3;
- has_explicit = 1;
+ if (coff_reloc_inline_addend(data, len, s, r_va, 4,
+ &inline_addend))
+ addend = inline_addend;
+ addend -= 3;
break;
case IMAGE_REL_AMD64_REL32_4:
- addend = -4;
- has_explicit = 1;
+ if (coff_reloc_inline_addend(data, len, s, r_va, 4,
+ &inline_addend))
+ addend = inline_addend;
+ addend -= 4;
break;
case IMAGE_REL_AMD64_REL32_5:
- addend = -5;
- has_explicit = 1;
+ if (coff_reloc_inline_addend(data, len, s, r_va, 4,
+ &inline_addend))
+ addend = inline_addend;
+ addend -= 5;
break;
default:
break;
diff --git a/src/obj/coff/reloc_aarch64.c b/src/obj/coff/reloc_aarch64.c
@@ -7,8 +7,7 @@
* PAGEOFFSET_12L collapses all LDST*_ABS_LO12_NC widths into one wire
* code; the width is recoverable from the patched LDR/STR instruction
* encoding, so the reader picks the LDST64 form and the consumer can
- * disambiguate later if it cares. ADDR32NB is image-relative; v1
- * collapses it to R_ABS32 and lets layout subtract the image base. */
+ * disambiguate later if it cares. */
#include "obj/coff/coff.h"
@@ -20,6 +19,8 @@ u32 coff_aarch64_reloc_to(u32 kind /* RelocKind */) {
return IMAGE_REL_ARM64_ADDR64;
case R_ABS32:
return IMAGE_REL_ARM64_ADDR32;
+ case R_COFF_ADDR32NB:
+ return IMAGE_REL_ARM64_ADDR32NB;
case R_AARCH64_CALL26:
case R_AARCH64_JUMP26:
return IMAGE_REL_ARM64_BRANCH26;
@@ -65,7 +66,7 @@ u32 coff_aarch64_reloc_from(u32 wire_type) {
case IMAGE_REL_ARM64_ADDR32:
return R_ABS32;
case IMAGE_REL_ARM64_ADDR32NB:
- return R_ABS32;
+ return R_COFF_ADDR32NB;
case IMAGE_REL_ARM64_BRANCH26:
return R_AARCH64_CALL26;
case IMAGE_REL_ARM64_BRANCH19:
diff --git a/src/obj/coff/reloc_x86_64.c b/src/obj/coff/reloc_x86_64.c
@@ -20,6 +20,8 @@ u32 coff_x86_64_reloc_to(u32 kind /* RelocKind */) {
return IMAGE_REL_AMD64_ADDR64;
case R_ABS32:
return IMAGE_REL_AMD64_ADDR32;
+ case R_COFF_ADDR32NB:
+ return IMAGE_REL_AMD64_ADDR32NB;
case R_X64_32S:
return IMAGE_REL_AMD64_ADDR32NB;
case R_PC32:
@@ -48,7 +50,7 @@ u32 coff_x86_64_reloc_from(u32 wire_type) {
case IMAGE_REL_AMD64_ADDR32:
return R_ABS32;
case IMAGE_REL_AMD64_ADDR32NB:
- return R_X64_32S;
+ return R_COFF_ADDR32NB;
case IMAGE_REL_AMD64_REL32:
case IMAGE_REL_AMD64_REL32_1:
case IMAGE_REL_AMD64_REL32_2:
diff --git a/src/obj/elf/emit.c b/src/obj/elf/emit.c
@@ -300,8 +300,8 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
obj_symiter_free(it);
}
u32 max_syms = 1 + (nobjsec - 1) + nobjsym;
- u8* symtab = (u8*)arena_alloc(c->scratch, (size_t)sym_size * max_syms,
- _Alignof(u64));
+ u8* symtab =
+ (u8*)arena_alloc(c->scratch, (size_t)sym_size * max_syms, _Alignof(u64));
u32 nsyms = 0;
memset(&symtab[nsyms * sym_size], 0, sym_size);
nsyms = 1; /* index 0: STN_UNDEF */
@@ -310,37 +310,37 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
* Elf64_Sym (24B) and Elf32_Sym (16B) REORDER fields: ELF32 places
* st_value/st_size BEFORE st_info/st_other/st_shndx, so select the byte
* layout by `is32` rather than just narrowing widths. */
-#define WRITE_SYM(idx, st_name, st_info, st_other, st_shndx, st_value, \
- st_size) \
- do { \
- u8* slot = &symtab[(idx) * sym_size]; \
- if (is32) { \
- slot[0] = (u8)((st_name)); \
- slot[1] = (u8)((st_name) >> 8); \
- slot[2] = (u8)((st_name) >> 16); \
- slot[3] = (u8)((st_name) >> 24); \
- for (int _b = 0; _b < 4; ++_b) \
- slot[4 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \
- for (int _b = 0; _b < 4; ++_b) \
- slot[8 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \
- slot[12] = (u8)((st_info)); \
- slot[13] = (u8)((st_other)); \
- slot[14] = (u8)((st_shndx)); \
- slot[15] = (u8)((st_shndx) >> 8); \
- } else { \
- slot[0] = (u8)((st_name)); \
- slot[1] = (u8)((st_name) >> 8); \
- slot[2] = (u8)((st_name) >> 16); \
- slot[3] = (u8)((st_name) >> 24); \
- slot[4] = (u8)((st_info)); \
- slot[5] = (u8)((st_other)); \
- slot[6] = (u8)((st_shndx)); \
- slot[7] = (u8)((st_shndx) >> 8); \
- for (int _b = 0; _b < 8; ++_b) \
- slot[8 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \
- for (int _b = 0; _b < 8; ++_b) \
- slot[16 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \
- } \
+#define WRITE_SYM(idx, st_name, st_info, st_other, st_shndx, st_value, \
+ st_size) \
+ do { \
+ u8* slot = &symtab[(idx) * sym_size]; \
+ if (is32) { \
+ slot[0] = (u8)((st_name)); \
+ slot[1] = (u8)((st_name) >> 8); \
+ slot[2] = (u8)((st_name) >> 16); \
+ slot[3] = (u8)((st_name) >> 24); \
+ for (int _b = 0; _b < 4; ++_b) \
+ slot[4 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \
+ for (int _b = 0; _b < 4; ++_b) \
+ slot[8 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \
+ slot[12] = (u8)((st_info)); \
+ slot[13] = (u8)((st_other)); \
+ slot[14] = (u8)((st_shndx)); \
+ slot[15] = (u8)((st_shndx) >> 8); \
+ } else { \
+ slot[0] = (u8)((st_name)); \
+ slot[1] = (u8)((st_name) >> 8); \
+ slot[2] = (u8)((st_name) >> 16); \
+ slot[3] = (u8)((st_name) >> 24); \
+ slot[4] = (u8)((st_info)); \
+ slot[5] = (u8)((st_other)); \
+ slot[6] = (u8)((st_shndx)); \
+ slot[7] = (u8)((st_shndx) >> 8); \
+ for (int _b = 0; _b < 8; ++_b) \
+ slot[8 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \
+ for (int _b = 0; _b < 8; ++_b) \
+ slot[16 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \
+ } \
} while (0)
/* No automatic STT_SECTION synthesis. Section symbols are emitted
diff --git a/src/obj/obj.c b/src/obj/obj.c
@@ -1029,6 +1029,7 @@ const char* reloc_kind_name(RelocKind k) {
_CASE(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21);
_CASE(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC);
_CASE(R_AARCH64_TPOFF64);
+ _CASE(R_COFF_ADDR32NB);
_CASE(R_AARCH64_GLOB_DAT);
_CASE(R_AARCH64_JUMP_SLOT);
_CASE(R_AARCH64_RELATIVE);
diff --git a/src/obj/obj.h b/src/obj/obj.h
@@ -277,6 +277,9 @@ typedef enum RelocKind {
/* Internal-only: a raw 64-bit AArch64/RISC-V local-exec tpoff written into
* a TLS GOT slot ((target - tls_vaddr) + TCB). Never appears on the wire. */
R_AARCH64_TPOFF64,
+ /* COFF ADDR32NB: 32-bit image-relative RVA (S + A - ImageBase), used by
+ * PE exception tables and other image metadata. */
+ R_COFF_ADDR32NB,
} RelocKind;
typedef struct Section {
diff --git a/test/coff/kit-roundtrip-coff.c b/test/coff/kit-roundtrip-coff.c
@@ -679,6 +679,61 @@ static void test_data_with_reloc_rel32_x64(void) {
free_compiler(c);
}
+static void verify_rel32_inline_addend(const ObjBuilder* ob, Pool* p) { /* checker passed to run_roundtrip */
+ ObjSecId text_id = find_section_id(ob, p, ".text");
+ EXPECT(text_id != OBJ_SEC_NONE, ".text id");
+ u32 total = obj_reloc_total(ob);
+ const Reloc* found = NULL;
+ for (u32 i = 0; i < total; ++i) { /* pick the first non-removed reloc on .text */
+ const Reloc* r = obj_reloc_at(ob, i);
+ if (r->removed) continue;
+ if (r->section_id != text_id) continue;
+ found = r;
+ break;
+ }
+ EXPECT(found != NULL, "no reloc on .text");
+ if (found) {
+ EXPECT(found->kind == R_PC32, "reloc kind=%u (want R_PC32=%u)",
+ found->kind, R_PC32);
+ EXPECT(found->addend == 0x124, "reloc addend=%lld (want 0x124)", /* presumably 0x128 inline minus REL32's 4 - confirm */
+ (long long)found->addend);
+ EXPECT(found->has_explicit_addend == 0, "inline addend marked explicit");
+ }
+}
+
+static void test_reloc_rel32_inline_addend_x64(void) { /* x64-windows roundtrip of a call with nonzero inline bytes */
+ g_test_name = "reloc_rel32_inline_addend_x64";
+ KitTargetSpec t;
+ target_x64_windows(&t);
+ Compiler* c = make_compiler(&t);
+ if (!c) {
+ EXPECT(0, "compiler_new");
+ return;
+ }
+ if (setjmp(c->panic)) { /* nonzero only when control longjmps back to c->panic */
+ compiler_run_cleanups(c);
+ free_compiler(c);
+ EXPECT(0, "panic");
+ return;
+ }
+ ObjBuilder* ob = obj_new(c);
+ Pool* p = c->global;
+ Sym tn = pool_intern_slice(p, SLICE_LIT(".text"));
+ Sym hn = pool_intern_slice(p, SLICE_LIT("helper"));
+ ObjSecId sec = obj_section(ob, tn, SEC_TEXT, SF_ALLOC | SF_EXEC, 16);
+ /* call helper+0x128; ret. COFF stores this addend inline. */
+ static const uint8_t bytes[6] = {0xe8, 0x28, 0x01, 0, 0, 0xc3}; /* bytes 1..4 = 0x00000128 little-endian */
+ obj_write(ob, sec, bytes, sizeof bytes);
+ ObjSymId helper = obj_symbol(ob, hn, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0);
+ obj_reloc_ex(ob, sec, 1, R_PC32, helper, 0, 0, 0); /* presumably offset 1 = the rel32 field after 0xe8 - confirm */
+ obj_finalize(ob);
+
+ run_roundtrip(c, ob, verify_rel32_inline_addend);
+
+ obj_free(ob);
+ free_compiler(c);
+}
+
/* test_aa64_branch26: .text with a BRANCH26 (R_AARCH64_CALL26)
* relocation against an external. */
@@ -1403,6 +1458,7 @@ static const struct {
{"data_with_reloc_abs64_x64", test_data_with_reloc_abs64_x64},
{"data_with_reloc_abs64_aa64", test_data_with_reloc_abs64_aa64},
{"reloc_rel32_x64", test_data_with_reloc_rel32_x64},
+ {"reloc_rel32_inline_addend_x64", test_reloc_rel32_inline_addend_x64},
{"aa64_branch26", test_aa64_branch26},
{"aa64_pagebase_pageoffset", test_aa64_pagebase_pageoffset},
{"long_section_name", test_long_section_name},