kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 8ab800473391e2774d2c694e51c6bd7675c172d4
parent 79ae72f38b6d035e974765489d3e851a89196050
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 18:00:38 -0700

link: linker script support (kernel.lds subset) + case 35

Implements the GNU-ld subset needed to link a freestanding aarch64
kernel image: ENTRY, SECTIONS{} with absolute `. = expr`, output
sections with `: ALIGN(N)`, `*(.glob …)` input matchers, in-section
and top-level symbol assignments, /DISCARD/, and ALIGN expressions
(MAX and MIN are out of subset and rejected). New components:

- src/link/link_script.c: recursive-descent parser producing the
  public CfreeLinkScript struct in compiler-arena memory; rejects
  out-of-subset constructs (MEMORY, PROVIDE, KEEP, AT>, > REGION,
  …) with a diagnostic.
- src/link/link_layout.c: layout_sections_scripted walks output
  sections in declaration order, claims input sections via a `*`
  glob, advances a `dot` location counter, materializes one
  LinkSegment per non-empty output section, and tracks file offsets
  on a separate cursor (vaddrs are absolute, file offsets sequential).
- src/link/link_elf.c: scripted images skip the headers PT_LOAD,
  build-id PT_NOTE, and the address shift; only file offsets bump.
  PAGE_SIZE promoted to link_internal.h and bumped to 16 KiB to
  match Apple Silicon and the common Linux/AArch64 kernel config.
- src/link/link_reloc.c: implement R_REL64 / R_PC64 (AArch64
  PREL64), needed by `.quad _end - _start` in the Image header.

Harness:

- test/link/harness/{link_exe,jit}_runner.c: --linker-script <path>
  reads the file, calls cfree_link_script_parse, sets
  CfreeLinkInputs.linker_script.
- test/link/cases/35_linker_script_kernel/: kernel.lds-style script
  pinning .text at 0x40080000 with the arm64 Image header in
  entry.S, qemu-system-aarch64 -kernel + semihosting SYS_EXIT_EXTENDED.

All 119 link cases pass (118 prior + new case 35).

Diffstat:
Msrc/api/stubs.c | 14+-------------
Msrc/link/link.c | 5+++--
Msrc/link/link_elf.c | 107++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
Msrc/link/link_internal.h | 16++++++++++++++++
Msrc/link/link_layout.c | 417+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/link/link_reloc.c | 9+++++++++
Asrc/link/link_script.c | 904+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/link/CORPUS.md | 8++++++++
Atest/link/cases/35_linker_script_kernel/a.c | 9+++++++++
Atest/link/cases/35_linker_script_kernel/cflags | 1+
Atest/link/cases/35_linker_script_kernel/entry.S | 79+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/link/cases/35_linker_script_kernel/expected | 1+
Atest/link/cases/35_linker_script_kernel/kernel.lds | 29+++++++++++++++++++++++++++++
Atest/link/cases/35_linker_script_kernel/kernel_image | 0
Atest/link/cases/35_linker_script_kernel/linker_script | 1+
Mtest/link/harness/jit_runner.c | 24++++++++++++++++++++++++
Mtest/link/harness/link_exe_runner.c | 26+++++++++++++++++++++++++-
17 files changed, 1600 insertions(+), 50 deletions(-)

diff --git a/src/api/stubs.c b/src/api/stubs.c @@ -209,19 +209,7 @@ int cfree_arch_reg_iter_next(CfreeArchRegIter* it, CfreeArchReg* o) { } void cfree_arch_reg_iter_free(CfreeArchRegIter* it) { (void)it; } -/* Linker script parsing. */ -int cfree_link_script_parse(CfreeCompiler* c, const char* t, size_t l, - const CfreeLinkScript** o) { - (void)c; - (void)t; - (void)l; - if (o) *o = 0; - return 1; -} -void cfree_link_script_free(CfreeCompiler* c, const CfreeLinkScript* s) { - (void)c; - (void)s; -} +/* Linker script parsing lives in src/link/link_script.c. */ /* JIT lookup, view, addr_to_sym, and the symbol iterator live in * src/link/link_jit.c. */ diff --git a/src/link/link.c b/src/link/link.c @@ -247,8 +247,9 @@ void link_set_entry(Linker* l, const char* name) { void link_set_script(Linker* l, const CfreeLinkScript* script) { if (!l || !script) return; - compiler_panic(l->c, no_loc(), - "link_set_script: linker scripts not yet implemented"); + l->script = script; + if (script->entry) + l->entry_name = pool_intern_cstr(l->c->global, script->entry); } void link_set_extern_resolver(Linker* l, LinkExternResolver fn, void* user) { diff --git a/src/link/link_elf.c b/src/link/link_elf.c @@ -104,7 +104,6 @@ typedef struct __attribute__((packed)) Shdr64 { #define PT_NOTE 4 #define PT_TLS 7 -#define PAGE_SIZE 0x1000u /* Static ET_EXEC base. ET_DYN (PIE) uses 0 — the loader picks the * runtime base. The active value lives in `img_base` below; the macro * stays for the static path's hard-coded vaddrs. */ @@ -138,6 +137,20 @@ static u32 perms_to_pflags(u32 secflags) { return f; } +/* Scripted-layout post-pass: vaddrs are already final (the script + * pinned them via `. = …`), so only file offsets need to bump to + * leave room for ehdr+phdrs. Mirror of shift_image_addresses but + * touches only the file dimension. 
*/ +static void shift_image_file_offsets(LinkImage* img, u64 delta) { + u32 i; + for (i = 0; i < img->nsegments; ++i) + img->segments[i].file_offset += delta; + for (i = 0; i < img->nsections; ++i) + img->sections[i].file_offset += delta; + for (i = 0; i < LinkRelocs_count(&img->relocs); ++i) + LinkRelocs_at(&img->relocs, i)->write_file_offset += delta; +} + static void shift_image_addresses(LinkImage* img, u64 delta) { u32 i; for (i = 0; i < img->nsegments; ++i) { @@ -572,20 +585,34 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) { /* PIE / ET_DYN: img_base is 0 (the loader picks the runtime base; * absolute relocs against internal symbols are emitted as * R_AARCH64_RELATIVE in .rela.dyn). Otherwise classic ET_EXEC at - * IMAGE_BASE_STATIC. */ + * IMAGE_BASE_STATIC. + * + * Scripted: the linker script pinned absolute vaddrs (e.g. + * `. = 0x40080000`); img_base stays 0 and the headers PT_LOAD / + * build-id note are dropped — the script's image is consumed by a + * raw loader (qemu -kernel, a bootloader) that doesn't need a + * self-describing memory image. */ int pie = img->pie; - u64 img_base = pie ? 0ULL : IMAGE_BASE_STATIC; + int scripted = img->scripted; + u64 img_base = (pie || scripted) ? 0ULL : IMAGE_BASE_STATIC; /* ---- plan number of program headers ---- * * 1 headers PT_LOAD + nsegments PT_LOAD + 1 PT_NOTE (build-id) * + 1 PT_TLS when this image carries any TLS sections. - * + 4 dyn phdrs (PT_PHDR / PT_INTERP / PT_DYNAMIC / PT_GNU_STACK) on PIE. */ + * + 4 dyn phdrs (PT_PHDR / PT_INTERP / PT_DYNAMIC / PT_GNU_STACK) on PIE. + * + * Scripted images skip the headers PT_LOAD and PT_NOTE: phdrs are + * just the per-segment PT_LOADs. */ u32 has_tls = img->tls_memsz ? 1u : 0u; u32 nphdr_extra_dyn = pie ? 4u : 0u; - u32 nphdr_total = 1u + img->nsegments + 1u + has_tls + nphdr_extra_dyn; - u64 headers_size = - sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64) + BUILD_ID_NOTE_BYTES; + u32 nphdr_headers = scripted ? 
0u : 1u; + u32 nphdr_buildid = scripted ? 0u : 1u; + u32 nphdr_total = + nphdr_headers + img->nsegments + nphdr_buildid + has_tls + nphdr_extra_dyn; + u64 build_id_note_bytes = scripted ? 0ULL : BUILD_ID_NOTE_BYTES; + u64 headers_size = sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64) + + build_id_note_bytes; u64 headers_load = ALIGN_UP(headers_size, (u64)PAGE_SIZE); /* The build-id note lives inside the headers PT_LOAD at this offset. */ @@ -596,7 +623,10 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) { * * Must happen before segshdrs/symtab construction so they observe * post-shift vaddrs (the values that will land in the file). */ - shift_image_addresses(img, headers_load); + if (scripted) + shift_image_file_offsets(img, headers_load); + else + shift_image_addresses(img, headers_load); apply_all_relocs(img, img_base); /* ---- write .dynamic body + re-serialize .rela.dyn (PIE only) ---- @@ -702,8 +732,9 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) { * coming out of layout_dyn. Trailing capacity stays zero — * readers stop at the first R_AARCH64_NONE record. */ { - u8* rd_bytes = img->segment_bytes[dseg->id - 1] + - (size_t)(sec_reladyn->file_offset - dseg->file_offset); + const LinkSegment* rdseg = &img->segments[sec_reladyn->segment_id - 1]; + u8* rd_bytes = img->segment_bytes[rdseg->id - 1] + + (size_t)(sec_reladyn->file_offset - rdseg->file_offset); u32 i; for (i = 0; i < dyn->nrela_dyn; ++i) { const DynRela* rr = &dyn->rela_dyn[i]; @@ -719,8 +750,9 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) { * dyn->rela_plt[i].r_offset along with the rest, so the post-shift * values match the .got.plt slot vaddrs the loader will patch. 
*/ if (sec_relaplt && dyn->nrela_plt) { - u8* rp_bytes = img->segment_bytes[dseg->id - 1] + - (size_t)(sec_relaplt->file_offset - dseg->file_offset); + const LinkSegment* rpseg = &img->segments[sec_relaplt->segment_id - 1]; + u8* rp_bytes = img->segment_bytes[rpseg->id - 1] + + (size_t)(sec_relaplt->file_offset - rpseg->file_offset); u32 i; for (i = 0; i < dyn->nrela_plt; ++i) { const DynRela* rr = &dyn->rela_plt[i]; @@ -964,16 +996,19 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) { phdrs[pi].p_align = 8; pi++; } - /* Headers PT_LOAD (covers ehdr + phdrs + build-id note). */ - phdrs[pi].p_type = PT_LOAD; - phdrs[pi].p_flags = PF_R; - phdrs[pi].p_offset = 0; - phdrs[pi].p_vaddr = img_base; - phdrs[pi].p_paddr = img_base; - phdrs[pi].p_filesz = headers_size; - phdrs[pi].p_memsz = headers_size; - phdrs[pi].p_align = PAGE_SIZE; - pi++; + /* Headers PT_LOAD (covers ehdr + phdrs + build-id note). + * Scripted images don't emit one — see plan note above. */ + if (!scripted) { + phdrs[pi].p_type = PT_LOAD; + phdrs[pi].p_flags = PF_R; + phdrs[pi].p_offset = 0; + phdrs[pi].p_vaddr = img_base; + phdrs[pi].p_paddr = img_base; + phdrs[pi].p_filesz = headers_size; + phdrs[pi].p_memsz = headers_size; + phdrs[pi].p_align = PAGE_SIZE; + pi++; + } /* Per-segment PT_LOAD. */ u32 i; for (i = 0; i < img->nsegments; ++i) { @@ -988,16 +1023,18 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) { p->p_memsz = seg->mem_size; p->p_align = seg->align ? seg->align : PAGE_SIZE; } - /* PT_NOTE for build-id. */ - phdrs[pi].p_type = PT_NOTE; - phdrs[pi].p_flags = PF_R; - phdrs[pi].p_offset = build_id_off; - phdrs[pi].p_vaddr = build_id_addr; - phdrs[pi].p_paddr = build_id_addr; - phdrs[pi].p_filesz = BUILD_ID_NOTE_BYTES; - phdrs[pi].p_memsz = BUILD_ID_NOTE_BYTES; - phdrs[pi].p_align = 4; - pi++; + /* PT_NOTE for build-id. Scripted images skip the build-id entirely. 
*/ + if (!scripted) { + phdrs[pi].p_type = PT_NOTE; + phdrs[pi].p_flags = PF_R; + phdrs[pi].p_offset = build_id_off; + phdrs[pi].p_vaddr = build_id_addr; + phdrs[pi].p_paddr = build_id_addr; + phdrs[pi].p_filesz = BUILD_ID_NOTE_BYTES; + phdrs[pi].p_memsz = BUILD_ID_NOTE_BYTES; + phdrs[pi].p_align = 4; + pi++; + } /* PT_TLS describing the .tdata template + .tbss zero-fill. * vaddr/file_offset point at the same bytes the matching * PT_LOAD already covers — the loader uses PT_TLS to size @@ -1097,8 +1134,10 @@ void link_emit_elf_aarch64(LinkImage* img, Writer* w) { * u32 type = NT_GNU_BUILD_ID (3) * "GNU\0" * <16 bytes of build-id> - */ - { + * + * Scripted images don't carry build-id; they have no PT_NOTE phdr to + * point at it and the file payload would just be dead bytes. */ + if (!scripted) { u8 nh[12]; u32 v; v = NOTE_NAME_GNU_LEN; diff --git a/src/link/link_internal.h b/src/link/link_internal.h @@ -85,6 +85,11 @@ struct Linker { LinkInputs inputs; /* LinkInputId = slot index + 1 */ LinkArchives archives; Sym entry_name; + /* Set by link_set_script. NULL: layout takes the existing default + * bucket-based path. Non-NULL: layout_sections_scripted walks the + * script's output sections in declaration order. Borrowed; the + * script and every sub-object must outlive link_resolve. */ + const CfreeLinkScript* script; int gc_sections; /* Set by cfree_link_exe before link_resolve. When 1, layout_iplt * synthesizes a .init_array entry pointing at __cfree_ifunc_init so @@ -281,8 +286,19 @@ struct LinkImage { LinkDynState* dyn; /* Mirror of Linker.emit_pie at link_resolve time; consulted by emit. */ int pie; + /* Set when layout was driven by Linker.script. The emitter then keeps + * segment vaddrs at their script-assigned absolute values, drops the + * self-describing headers PT_LOAD / build-id PT_NOTE, and only shifts + * file offsets to make room for ehdr+phdrs. 
*/ + u8 scripted; }; +/* Page granularity used for ELF segment alignment and the file-offset / + * vaddr congruence the runtime loader requires. 16 KiB matches AArch64 + * Apple Silicon and the common Linux/AArch64 kernel config; 4 KiB pages + * are also valid at runtime since 16K is a multiple. */ +#define PAGE_SIZE 0x4000u + /* Apply one relocation in place. P_bytes points at the first byte of the * relocation site within the final memory; S is the resolved final * address of the target symbol; A the addend; P the final address of diff --git a/src/link/link_layout.c b/src/link/link_layout.c @@ -678,7 +678,14 @@ typedef struct PlaceEntry { u8 pad[3]; } PlaceEntry; +static void layout_sections_scripted(Linker* l, LinkImage* img, + const GcLive* g); + static void layout_sections(Linker* l, LinkImage* img, const GcLive* g) { + if (l->script) { + layout_sections_scripted(l, img, g); + return; + } Heap* h = img->heap; u32 ii, j; u32 total_kept = 0; @@ -895,6 +902,416 @@ static void layout_sections(Linker* l, LinkImage* img, const GcLive* g) { } } +/* ---- scripted layout (linker-script driven) ---- + * + * Walks the CfreeLinkScript's output sections in declaration order, + * placing matched input sections at the dot location counter. One + * LinkSegment per non-DISCARD output section maps 1:1 to a PT_LOAD on + * emit. Symbol assignments (top-level and in-section) materialize as + * defined LinkSymbol globals via upsert_global_sym (the same upsert + * pattern emit_boundary_sym uses). + * + * Discard handling: `/DISCARD/` matches input sections by glob and + * leaves their per-input m->section[id] entry as LINK_SEC_NONE — the + * downstream emit_reloc_records / link_symbols_to_sections passes + * already treat that as "section dropped" so they're naturally + * excluded from segments, gc, and reloc apply. */ + +/* `*` is the only metachar. 
Supported forms in the kernel.lds-style + * subset: trailing star (".text*"), leading star ("*COMMON" — not in + * kernel.lds but cheap), and exact literal. */ +static int match_glob(const char* pat, const char* name) { + size_t plen, nlen; + if (!pat || !name) return 0; + plen = strlen(pat); + nlen = strlen(name); + if (plen == 1 && pat[0] == '*') return 1; + if (plen >= 2 && pat[plen - 1] == '*') { + if (nlen + 1 < plen) return 0; + return memcmp(pat, name, plen - 1) == 0; + } + if (plen >= 2 && pat[0] == '*') { + if (nlen + 1 < plen) return 0; + return memcmp(pat + 1, name + (nlen - (plen - 1)), plen - 1) == 0; + } + return plen == nlen && memcmp(pat, name, plen) == 0; +} + +static u64 eval_link_expr(Linker* l, LinkImage* img, u64 dot, + const CfreeLinkExpr* e, int* err) { + if (!e) { + *err = 1; + return 0; + } + switch ((CfreeLinkExprKind)e->kind) { + case CFREE_LE_INT: + return (u64)e->v.int_val; + case CFREE_LE_DOT: + return dot; + case CFREE_LE_SYM: { + Sym name = pool_intern_cstr(l->c->global, e->v.name); + LinkSymId id = symhash_get(&img->globals, name); + if (id == LINK_SYM_NONE) { + compiler_panic(l->c, no_loc(), + "linker script: undefined symbol '%s' in expression", + e->v.name); + } + return LinkSyms_at(&img->syms, id - 1)->vaddr; + } + case CFREE_LE_ADD: + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) + + eval_link_expr(l, img, dot, e->v.bin.rhs, err); + case CFREE_LE_SUB: + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) - + eval_link_expr(l, img, dot, e->v.bin.rhs, err); + case CFREE_LE_MUL: + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) * + eval_link_expr(l, img, dot, e->v.bin.rhs, err); + case CFREE_LE_DIV: { + u64 rhs = eval_link_expr(l, img, dot, e->v.bin.rhs, err); + if (rhs == 0) { + *err = 1; + return 0; + } + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) / rhs; + } + case CFREE_LE_AND: + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) & + eval_link_expr(l, img, dot, e->v.bin.rhs, err); + case 
CFREE_LE_OR: + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) | + eval_link_expr(l, img, dot, e->v.bin.rhs, err); + case CFREE_LE_XOR: + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) ^ + eval_link_expr(l, img, dot, e->v.bin.rhs, err); + case CFREE_LE_SHL: + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) + << eval_link_expr(l, img, dot, e->v.bin.rhs, err); + case CFREE_LE_SHR: + return eval_link_expr(l, img, dot, e->v.bin.lhs, err) >> + eval_link_expr(l, img, dot, e->v.bin.rhs, err); + case CFREE_LE_ALIGN: { + u64 v = eval_link_expr(l, img, dot, e->v.align.val, err); + u64 a = eval_link_expr(l, img, dot, e->v.align.align, err); + if (a == 0) return v; + return ALIGN_UP(v, a); + } + case CFREE_LE_REGION_ORIGIN: + case CFREE_LE_REGION_LENGTH: + case CFREE_LE_MAX: + case CFREE_LE_MIN: + default: + compiler_panic(l->c, no_loc(), + "linker script: expression kind %u not supported", + (unsigned)e->kind); + return 0; + } +} + +static void emit_boundary_sym(Linker* l, LinkImage* img, const char* name, + u64 vaddr); /* defined below */ + +/* Upsert a global symbol with the given absolute vaddr. Mirrors the + * emit_boundary_sym pattern: satisfies any prior undef ref in place; + * fans out to per-input duplicate name slots. */ +static void upsert_global_sym(Linker* l, LinkImage* img, const char* name, + u64 vaddr) { + emit_boundary_sym(l, img, name, vaddr); +} + +/* Apply one CfreeLinkAssignment. CFREE_LAS_DOT updates *dot; SYM / + * PROVIDE upserts a global. PROVIDE only fires when the name isn't + * already strongly defined; for v1 we accept it as equivalent to SYM + * (no input to kernel.lds defines these names). 
*/ +static void apply_asn(Linker* l, LinkImage* img, u64* dot, + const CfreeLinkAssignment* asn) { + int err = 0; + u64 v = eval_link_expr(l, img, *dot, asn->expr, &err); + if (err) return; + switch ((CfreeLinkAsnKind)asn->kind) { + case CFREE_LAS_DOT: + if (v < *dot) + compiler_panic(l->c, no_loc(), + "linker script: dot moved backwards (%llu -> %llu)", + (unsigned long long)*dot, (unsigned long long)v); + *dot = v; + break; + case CFREE_LAS_SYM: + case CFREE_LAS_PROVIDE: + if (asn->sym) upsert_global_sym(l, img, asn->sym, v); + break; + } +} + +static int input_match_section(const CfreeLinkInputMatch* m, const char* nm) { + /* file_pattern is ignored for v1 — kernel.lds uses `*(...)` only. */ + return match_glob(m->section_pattern, nm); +} + +static void layout_sections_scripted(Linker* l, LinkImage* img, + const GcLive* g) { + Heap* h = img->heap; + const CfreeLinkScript* script = l->script; + u64 dot = 0; + /* Scripted layout: vaddrs are absolute (driven by `dot`), but file + * offsets follow a separate cursor packed sequentially after the + * eventual ehdr+phdrs. The writer adds headers_load to file_offsets + * (only) post-layout. */ + u64 file_cursor = 0; + u32 ii, j, k, si; + u32 total_kept = 0; + + img->scripted = 1; + + /* Pass 0: count GC-live, kept, allocatable input sections — the + * upper bound on placeable LinkSections. The actual count placed + * may be lower (DISCARD sinks, unmatched). */ + for (ii = 0; ii < LinkInputs_count(&l->inputs); ++ii) { + ObjBuilder* ob = LinkInputs_at(&l->inputs, ii)->obj; + for (j = 1; j < obj_section_count(ob); ++j) { + const Section* s = obj_section_get(ob, j); + if (s && section_kept(s) && gc_live_get(g, ii, j)) ++total_kept; + } + } + + /* Pre-allocate img->sections at the upper bound; img->nsections + * tracks the actual count placed. */ + img->sections = total_kept ? 
(LinkSection*)h->alloc( + h, sizeof(*img->sections) * total_kept, + _Alignof(LinkSection)) + : NULL; + if (total_kept && !img->sections) + compiler_panic(img->c, no_loc(), "link: oom on sections"); + + /* Per-section "claimed" bitmap to enforce single-claim across the + * whole script. Indexed by [input_idx][obj_sec_id]. */ + u8** claimed = NULL; + if (LinkInputs_count(&l->inputs)) { + u32 ni = LinkInputs_count(&l->inputs); + claimed = (u8**)h->alloc(h, sizeof(*claimed) * ni, _Alignof(u8*)); + if (!claimed) compiler_panic(img->c, no_loc(), "link: oom on claim map"); + for (ii = 0; ii < ni; ++ii) { + ObjBuilder* ob = LinkInputs_at(&l->inputs, ii)->obj; + u32 nsec = obj_section_count(ob); + claimed[ii] = (u8*)h->alloc(h, nsec, 1); + if (!claimed[ii]) + compiler_panic(img->c, no_loc(), "link: oom on claim row"); + memset(claimed[ii], 0, nsec); + } + } + + /* Pass 1: top-level dot assignments establish the base address + * before any placement. SYM/PROVIDE assignments at the top level + * are deferred to pass 3 so they capture the post-placement dot + * (e.g. `_end = .` at the script's tail). */ + for (k = 0; k < script->ntop_asns; ++k) { + const CfreeLinkAssignment* a = &script->top_asns[k]; + if (a->kind == CFREE_LAS_DOT) apply_asn(l, img, &dot, a); + } + + /* Pre-allocate img->segments at the upper bound (one per non-DISCARD + * output section). */ + u32 nseg_max = 0; + for (si = 0; si < script->nsections; ++si) + if (strcmp(script->sections[si].name, "/DISCARD/") != 0) ++nseg_max; + img->segments = + nseg_max ? (LinkSegment*)h->alloc(h, sizeof(*img->segments) * nseg_max, + _Alignof(LinkSegment)) + : NULL; + img->segment_bytes = + nseg_max ? (u8**)h->alloc(h, sizeof(*img->segment_bytes) * nseg_max, + _Alignof(u8*)) + : NULL; + img->segment_bytes_cap = + nseg_max + ? 
(size_t*)h->alloc(h, sizeof(*img->segment_bytes_cap) * nseg_max, + _Alignof(size_t)) + : NULL; + if (nseg_max && + (!img->segments || !img->segment_bytes || !img->segment_bytes_cap)) + compiler_panic(img->c, no_loc(), "link: oom on segments"); + if (nseg_max) { + memset(img->segment_bytes, 0, sizeof(*img->segment_bytes) * nseg_max); + memset(img->segment_bytes_cap, 0, + sizeof(*img->segment_bytes_cap) * nseg_max); + } + + /* Pass 2: walk output sections in declaration order. */ + for (si = 0; si < script->nsections; ++si) { + const CfreeLinkOutputSection* os = &script->sections[si]; + int is_discard = (strcmp(os->name, "/DISCARD/") == 0); + + if (is_discard) { + /* Mark every matched input section as claimed. We don't add + * them to img->sections; their m->section[id] stays + * LINK_SEC_NONE so reloc-apply / link_symbols_to_sections + * naturally skip them. */ + u32 mi; + for (mi = 0; mi < os->ninputs; ++mi) { + const CfreeLinkInputMatch* im = &os->inputs[mi]; + for (ii = 0; ii < LinkInputs_count(&l->inputs); ++ii) { + ObjBuilder* ob = LinkInputs_at(&l->inputs, ii)->obj; + for (j = 1; j < obj_section_count(ob); ++j) { + const Section* s; + const char* nm; + size_t nl; + if (claimed[ii][j]) continue; + s = obj_section_get(ob, j); + if (!s) continue; + nm = pool_str(l->c->global, s->name, &nl); + if (!nm) continue; + if (input_match_section(im, nm)) claimed[ii][j] = 1; + } + } + } + continue; + } + + /* Non-DISCARD output section. Process all in-section asns first + * (header ALIGN encoded as the first dot-asn, plus any + * `__bss_start = .` style early captures), then walk inputs in + * declaration order, claiming matches across all inputs in input + * order. Each placed input section advances dot. 
*/ + u64 sec_start_dot; + u32 perms = 0; + LinkSegmentId seg_id = (LinkSegmentId)(img->nsegments + 1u); + LinkSegment* seg; + u64 file_size_accum = 0; + u64 mem_size_accum = 0; + u32 align_max = 1; + u32 nsec_in_seg = 0; + u32 first_section_idx = img->nsections; + + /* Apply in-section asns (pre-input). */ + for (k = 0; k < os->nasns; ++k) { + apply_asn(l, img, &dot, &os->asns[k]); + } + sec_start_dot = dot; + + /* Walk input matches; for each, scan all inputs in input order. */ + { + u32 mi; + for (mi = 0; mi < os->ninputs; ++mi) { + const CfreeLinkInputMatch* im = &os->inputs[mi]; + for (ii = 0; ii < LinkInputs_count(&l->inputs); ++ii) { + ObjBuilder* ob = LinkInputs_at(&l->inputs, ii)->obj; + InputMap* m = &img->input_maps[ii]; + for (j = 1; j < obj_section_count(ob); ++j) { + const Section* s; + const char* nm; + size_t nl; + u32 align; + u64 ofs; + LinkSection* ls; + LinkSectionId lsid; + if (claimed[ii][j]) continue; + if (!gc_live_get(g, ii, j)) continue; + s = obj_section_get(ob, j); + if (!s || !section_kept(s)) continue; + nm = pool_str(l->c->global, s->name, &nl); + if (!nm) continue; + if (!input_match_section(im, nm)) continue; + + align = s->align ? s->align : 1u; + if (align > align_max) align_max = align; + dot = ALIGN_UP(dot, (u64)align); + ofs = dot; + + lsid = (LinkSectionId)(img->nsections + 1u); + ls = &img->sections[img->nsections++]; + memset(ls, 0, sizeof(*ls)); + ls->id = lsid; + ls->input_id = LinkInputs_at(&l->inputs, ii)->id; + ls->obj_section_id = j; + ls->segment_id = seg_id; + ls->vaddr = ofs; + ls->size = (s->sem == SSEM_NOBITS) ? s->bss_size : s->bytes.total; + ls->flags = s->flags; + ls->align = align; + ls->name = s->name; + ls->sem = s->sem; + /* file_offset within the segment buffer: distance from + * sec_start_dot. NOBITS contributes no file bytes. 
*/ + ls->file_offset = ofs - sec_start_dot; + ls->input_offset = ls->file_offset; + m->section[j] = lsid; + claimed[ii][j] = 1; + + dot += ls->size; + mem_size_accum = dot - sec_start_dot; + if (s->sem != SSEM_NOBITS) file_size_accum = dot - sec_start_dot; + perms |= (s->flags & (SF_EXEC | SF_WRITE | SF_TLS)); + ++nsec_in_seg; + } + } + } + } + + /* Materialize the segment for this output section. Empty output + * sections (no input matched) are dropped — they'd produce an + * empty PT_LOAD which the loader rejects. */ + if (nsec_in_seg == 0) { + /* Roll back nsections (no entries appended in the empty case). */ + continue; + } + + seg = &img->segments[img->nsegments]; + memset(seg, 0, sizeof(*seg)); + seg->id = seg_id; + seg->flags = SF_ALLOC | perms; + seg->vaddr = sec_start_dot; + /* Page-align each segment's file offset so the writer can keep file + * offset and vaddr congruent mod page size for the runtime loader. */ + file_cursor = ALIGN_UP(file_cursor, (u64)PAGE_SIZE); + seg->file_offset = file_cursor; + seg->mem_size = mem_size_accum; + seg->file_size = file_size_accum; + seg->align = align_max; + seg->nsections = nsec_in_seg; + file_cursor += file_size_accum; + if (file_size_accum) { + img->segment_bytes[img->nsegments] = + (u8*)h->alloc(h, (size_t)file_size_accum, 16); + if (!img->segment_bytes[img->nsegments]) + compiler_panic(img->c, no_loc(), "link: oom on scripted segment bytes"); + img->segment_bytes_cap[img->nsegments] = (size_t)file_size_accum; + memset(img->segment_bytes[img->nsegments], 0, (size_t)file_size_accum); + } + + /* Shift each section's vaddr/file_offset onto the segment's + * absolute base. Sections were laid out with vaddr = absolute + * dot already, so vaddr is correct as-is; file_offset needs + * to become absolute (segment-base + relative). 
*/ + { + u32 fi; + for (fi = first_section_idx; fi < img->nsections; ++fi) { + LinkSection* ls = &img->sections[fi]; + ls->file_offset = seg->file_offset + (ls->file_offset); + } + } + + img->nsegments++; + } + + /* Pass 3: top-level SYM / PROVIDE assignments capture the + * post-placement dot (e.g. `_end = .`). */ + for (k = 0; k < script->ntop_asns; ++k) { + const CfreeLinkAssignment* a = &script->top_asns[k]; + if (a->kind == CFREE_LAS_SYM || a->kind == CFREE_LAS_PROVIDE) + apply_asn(l, img, &dot, a); + } + + /* Free claim map. */ + if (claimed) { + u32 ni = LinkInputs_count(&l->inputs); + for (ii = 0; ii < ni; ++ii) { + ObjBuilder* ob = LinkInputs_at(&l->inputs, ii)->obj; + h->free(h, claimed[ii], obj_section_count(ob)); + } + h->free(h, claimed, sizeof(*claimed) * ni); + } +} + /* ---- pass 2b: COMMON symbol BSS allocation ---- */ /* After segments are laid out, extend the RW segment's BSS area to * accommodate any SK_COMMON symbols that have no section yet. */ diff --git a/src/link/link_reloc.c b/src/link/link_reloc.c @@ -41,6 +41,15 @@ void link_reloc_apply(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A, wr_u32_le(P_bytes, (u32)((u64)v & 0xffffffffu)); return; } + case R_REL64: + case R_PC64: { + /* 64-bit PC-relative; AArch64 R_AARCH64_PREL64. Used by + * `.quad sym1 - sym2` style symbol-difference encodings (e.g. + * the arm64 kernel image_size header field). */ + i64 v = (i64)S + A - (i64)P; + wr_u64_le(P_bytes, (u64)v); + return; + } case R_AARCH64_ABS16: { u64 v = S + (u64)A; wr_u16_le(P_bytes, (u16)(v & 0xffffu)); diff --git a/src/link/link_script.c b/src/link/link_script.c @@ -0,0 +1,904 @@ +/* Linker-script parser: a minimal GNU-ld-subset front end that produces + * the structured CfreeLinkScript form documented in <cfree.h>. The + * applicator (link_layout.c) consumes the structured form; this file + * never speaks ELF or layout. + * + * Subset (driven by the kernel.lds at the head of doc/DESIGN.md §13): + * ENTRY(symbol) + * SECTIONS { ... 
} + * . = expr + * name = expr + * name : [ALIGN(N)] { body } + * /DISCARD/ : { body } + * body items: *(p1 p2 ...), name = expr, . = expr + * exprs: int literal (dec / 0x), `.`, ident, parens, + * + - * / & | ^ << >>, ALIGN(expr, align) + * slash-star comments; whitespace insensitive. + * + * Anything else (MEMORY, PROVIDE, KEEP, AT>, > REGION, OVERLAY, INSERT, + * OUTPUT_FORMAT, INPUT, GROUP, MAX, MIN, line comments, quoted strings, + * file patterns other than the implicit `*` of `*(...)`) is a parse + * error: emits a diagnostic and returns 1, leaving *out unchanged. + * + * Encoding contracts the applicator relies on: + * - /DISCARD/ is encoded as a CfreeLinkOutputSection with name + * "/DISCARD/" (a literal sentinel, not a parsed identifier). + * - An output section's `: ALIGN(N)` header is encoded as the first + * entry in its asns[]: a dot-assignment whose expr is ALIGN(., N). + * - `*(p1 p2 ...)` produces one CfreeLinkInputMatch per pattern with + * file_pattern = NULL (implicit `*`) and section_pattern set. + * COMMON is parsed as a literal pattern "COMMON". + * + * Allocation: every node and string is owned by the compiler's tu arena. + * cfree_link_script_free is therefore a no-op — the arena outlives the + * script and is collectively freed with the compiler. During parsing we + * grow temporary arrays on the host heap, then arena-copy at finish. + * + * Diagnostics: SourceManager registration of a script buffer is a future + * cleanup; for now diagnostics carry file_id = 0 and pack the byte + * offset into the SrcLoc.line field (col is computed inline). */ + +#include <cfree.h> +#include <stdarg.h> +#include <string.h> + +#include "core/arena.h" +#include "core/core.h" +#include "core/diag.h" +#include "core/heap.h" + +typedef struct LSP { + Compiler* c; + Heap* heap; + const char* src; + size_t len; + size_t pos; + /* one-bit error sticky: any diagnostic flips this and the parser + * unwinds without producing partial output. 
*/ + int err; +} LSP; + +/* ---- diagnostics ---- */ + +static SrcLoc lsp_loc(const LSP* p, size_t off) { + /* TODO: register the script buffer with SourceManager so diagnostics + * carry a real file_id; until then encode the byte offset as `line` + * and recompute a 1-based line/col on demand. */ + SrcLoc l; + size_t i, line = 1, col = 1; + l.file_id = 0; + for (i = 0; i < off && i < p->len; ++i) { + if (p->src[i] == '\n') { + ++line; + col = 1; + } else { + ++col; + } + } + l.line = (u32)line; + l.col = (u32)col; + return l; +} + +static void lsp_errf(LSP* p, size_t off, const char* fmt, ...) { + va_list ap; + if (!p->c || !p->c->env || !p->c->env->diag) { + p->err = 1; + return; + } + va_start(ap, fmt); + diag_emitv(p->c->env->diag, DIAG_ERROR, lsp_loc(p, off), fmt, ap); + va_end(ap); + p->err = 1; +} + +/* ---- arena helpers ---- */ + +static char* lsp_strdup(LSP* p, const char* s, size_t n) { + return arena_strdup(p->c->tu, s, n); +} + +static CfreeLinkExpr* lsp_new_expr(LSP* p) { + return arena_znew(p->c->tu, CfreeLinkExpr); +} + +/* ---- heap-backed temp vectors (copied to the arena at finish) ---- */ + +typedef struct VecAsn { + CfreeLinkAssignment* p; + u32 n, cap; +} VecAsn; +typedef struct VecMatch { + CfreeLinkInputMatch* p; + u32 n, cap; +} VecMatch; +typedef struct VecSec { + CfreeLinkOutputSection* p; + u32 n, cap; +} VecSec; + +static int vec_reserve_(LSP* p, void** ptr, u32* cap, u32 want, size_t es) { + u32 nc; + void* nb; + if (*cap >= want) return 0; + nc = *cap ? *cap * 2 : 8; + while (nc < want) nc *= 2; + nb = p->heap->realloc(p->heap, *ptr, (size_t)*cap * es, (size_t)nc * es, + sizeof(void*)); + if (!nb) return 1; + *ptr = nb; + *cap = nc; + return 0; +} + +#define VEC_PUSH(p, v, val) \ + (vec_reserve_((p), (void**)&(v).p, &(v).cap, (v).n + 1, sizeof(*(v).p)) \ + ? 
1 \ + : ((v).p[(v).n++] = (val), 0)) + +static void vec_free_(LSP* p, void* ptr, u32 cap, size_t es) { + if (ptr) p->heap->free(p->heap, ptr, (size_t)cap * es); +} + +/* ---- lex primitives ---- */ + +static int is_id_start(int c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || + c == '.'; +} +static int is_id_cont(int c) { + return is_id_start(c) || (c >= '0' && c <= '9') || c == '-'; +} + +static void skip_ws(LSP* p) { + while (p->pos < p->len) { + char ch = p->src[p->pos]; + if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') { + ++p->pos; + continue; + } + if (ch == '/' && p->pos + 1 < p->len && p->src[p->pos + 1] == '*') { + size_t start = p->pos; + p->pos += 2; + while (p->pos + 1 < p->len && + !(p->src[p->pos] == '*' && p->src[p->pos + 1] == '/')) { + ++p->pos; + } + if (p->pos + 1 >= p->len) { + lsp_errf(p, start, "unterminated /* comment"); + return; + } + p->pos += 2; + continue; + } + if (ch == '/' && p->pos + 1 < p->len && p->src[p->pos + 1] == '/') { + lsp_errf(p, p->pos, "// line comments not supported"); + return; + } + break; + } +} + +static int peek_ch(LSP* p) { + skip_ws(p); + if (p->err) return -1; + if (p->pos >= p->len) return -1; + return (unsigned char)p->src[p->pos]; +} + +static int match_ch(LSP* p, char ch) { + skip_ws(p); + if (p->err) return 0; + if (p->pos < p->len && p->src[p->pos] == ch) { + ++p->pos; + return 1; + } + return 0; +} + +static int expect_ch(LSP* p, char ch) { + if (match_ch(p, ch)) return 0; + lsp_errf(p, p->pos, "expected '%c'", ch); + return 1; +} + +/* Lex an identifier-or-section-name token in place: returns a pointer + * into p->src and length via *out_len. Section names like .text.* and + * /DISCARD/ are handled by the section-name-aware variant below. 
*/ +static int lex_ident(LSP* p, const char** out, size_t* out_len) { + size_t start; + skip_ws(p); + if (p->err) return 1; + if (p->pos >= p->len || !is_id_start((unsigned char)p->src[p->pos])) { + lsp_errf(p, p->pos, "expected identifier"); + return 1; + } + start = p->pos; + while (p->pos < p->len && is_id_cont((unsigned char)p->src[p->pos])) ++p->pos; + *out = p->src + start; + *out_len = p->pos - start; + return 0; +} + +/* Match a literal keyword. Caller must have already peeked. */ +static int match_kw(LSP* p, const char* kw) { + size_t klen = strlen(kw); + size_t save; + skip_ws(p); + if (p->err) return 0; + save = p->pos; + if (p->pos + klen > p->len) return 0; + if (memcmp(p->src + p->pos, kw, klen) != 0) return 0; + /* must not glue to a following id-cont character */ + if (p->pos + klen < p->len && + is_id_cont((unsigned char)p->src[p->pos + klen])) + return 0; + p->pos += klen; + (void)save; + return 1; +} + +/* ---- expression parser (precedence climbing) ---- + * + * Levels (low -> high): + * 0: | + * 1: ^ + * 2: & + * 3: << >> + * 4: + - + * 5: * / + * 6: unary (none beyond parenthesized atoms here) + * atom: int | . 
| ALIGN(e,a) | ident | (expr) + */ + +static CfreeLinkExpr* parse_expr(LSP* p); + +static CfreeLinkExpr* parse_int(LSP* p) { + CfreeLinkExpr* e; + size_t start = p->pos; + i64 v = 0; + if (p->pos + 1 < p->len && p->src[p->pos] == '0' && + (p->src[p->pos + 1] == 'x' || p->src[p->pos + 1] == 'X')) { + p->pos += 2; + if (p->pos >= p->len) { + lsp_errf(p, start, "malformed hex literal"); + return NULL; + } + while (p->pos < p->len) { + char ch = p->src[p->pos]; + int d; + if (ch >= '0' && ch <= '9') + d = ch - '0'; + else if (ch >= 'a' && ch <= 'f') + d = 10 + (ch - 'a'); + else if (ch >= 'A' && ch <= 'F') + d = 10 + (ch - 'A'); + else + break; + v = (v << 4) | d; + ++p->pos; + } + if (p->pos == start + 2) { + lsp_errf(p, start, "empty hex literal"); + return NULL; + } + } else { + while (p->pos < p->len && p->src[p->pos] >= '0' && p->src[p->pos] <= '9') { + v = v * 10 + (p->src[p->pos] - '0'); + ++p->pos; + } + if (p->pos == start) { + lsp_errf(p, start, "expected integer"); + return NULL; + } + } + e = lsp_new_expr(p); + if (!e) return NULL; + e->kind = CFREE_LE_INT; + e->v.int_val = v; + return e; +} + +static CfreeLinkExpr* parse_atom(LSP* p) { + int ch; + skip_ws(p); + if (p->err) return NULL; + ch = peek_ch(p); + if (ch < 0) { + lsp_errf(p, p->pos, "unexpected end of expression"); + return NULL; + } + if (ch == '(') { + CfreeLinkExpr* e; + ++p->pos; + e = parse_expr(p); + if (!e) return NULL; + if (expect_ch(p, ')')) return NULL; + return e; + } + if (ch == '.') { + /* `.` only — bare dot, not a dotted ident. We disambiguate by + * looking at the next char: a digit/letter/underscore/dot here is a + * lex error in this subset (no .text in expression position). 
*/ + size_t off = p->pos; + ++p->pos; + if (p->pos < p->len && is_id_cont((unsigned char)p->src[p->pos])) { + lsp_errf(p, off, "dotted identifiers not allowed in expressions"); + return NULL; + } + { + CfreeLinkExpr* e = lsp_new_expr(p); + if (!e) return NULL; + e->kind = CFREE_LE_DOT; + return e; + } + } + if (ch >= '0' && ch <= '9') return parse_int(p); + if (is_id_start(ch)) { + /* either ALIGN(...) or a symbol reference */ + if (match_kw(p, "ALIGN")) { + CfreeLinkExpr *val, *aln, *e; + if (expect_ch(p, '(')) return NULL; + val = parse_expr(p); + if (!val) return NULL; + if (expect_ch(p, ',')) return NULL; + aln = parse_expr(p); + if (!aln) return NULL; + if (expect_ch(p, ')')) return NULL; + e = lsp_new_expr(p); + if (!e) return NULL; + e->kind = CFREE_LE_ALIGN; + e->v.align.val = val; + e->v.align.align = aln; + return e; + } + if (match_kw(p, "MAX") || match_kw(p, "MIN")) { + lsp_errf(p, p->pos, "MAX/MIN not supported in this subset"); + return NULL; + } + { + const char* s; + size_t n; + CfreeLinkExpr* e; + if (lex_ident(p, &s, &n)) return NULL; + e = lsp_new_expr(p); + if (!e) return NULL; + e->kind = CFREE_LE_SYM; + e->v.name = lsp_strdup(p, s, n); + return e; + } + } + lsp_errf(p, p->pos, "unexpected '%c' in expression", (char)ch); + return NULL; +} + +/* Returns >=0 binding power for a binary operator at p->pos and + * advances past it; -1 if no binary operator at the lookahead. 
*/ +static int try_take_binop(LSP* p, CfreeLinkExprKind* out_kind) { + int ch; + skip_ws(p); + if (p->err) return -1; + if (p->pos >= p->len) return -1; + ch = (unsigned char)p->src[p->pos]; + switch (ch) { + case '|': + ++p->pos; + *out_kind = CFREE_LE_OR; + return 0; + case '^': + ++p->pos; + *out_kind = CFREE_LE_XOR; + return 1; + case '&': + ++p->pos; + *out_kind = CFREE_LE_AND; + return 2; + case '<': + if (p->pos + 1 < p->len && p->src[p->pos + 1] == '<') { + p->pos += 2; + *out_kind = CFREE_LE_SHL; + return 3; + } + return -1; + case '>': + if (p->pos + 1 < p->len && p->src[p->pos + 1] == '>') { + p->pos += 2; + *out_kind = CFREE_LE_SHR; + return 3; + } + return -1; + case '+': + ++p->pos; + *out_kind = CFREE_LE_ADD; + return 4; + case '-': + ++p->pos; + *out_kind = CFREE_LE_SUB; + return 4; + case '*': + ++p->pos; + *out_kind = CFREE_LE_MUL; + return 5; + case '/': + /* Division. Block-comment and /DISCARD/ openers are filtered + * elsewhere: skip_ws eats slash-star comments, and /DISCARD/ is + * recognized by the SECTIONS-body loop before expression + * context. 
*/ + ++p->pos; + *out_kind = CFREE_LE_DIV; + return 5; + default: + return -1; + } +} + +static CfreeLinkExpr* parse_binop_rhs(LSP* p, int min_bp, CfreeLinkExpr* lhs) { + while (!p->err) { + size_t save; + CfreeLinkExprKind k; + int bp; + skip_ws(p); + if (p->err) return NULL; + save = p->pos; + bp = try_take_binop(p, &k); + if (bp < 0) return lhs; + if (bp < min_bp) { + p->pos = save; + return lhs; + } + { + CfreeLinkExpr* rhs = parse_atom(p); + CfreeLinkExpr* node; + if (!rhs) return NULL; + rhs = parse_binop_rhs(p, bp + 1, rhs); + if (!rhs) return NULL; + node = lsp_new_expr(p); + if (!node) return NULL; + node->kind = (uint8_t)k; + node->v.bin.lhs = lhs; + node->v.bin.rhs = rhs; + lhs = node; + } + } + return NULL; +} + +static CfreeLinkExpr* parse_expr(LSP* p) { + CfreeLinkExpr* lhs = parse_atom(p); + if (!lhs) return NULL; + return parse_binop_rhs(p, 0, lhs); +} + +/* ---- assignment helpers ---- */ + +static int push_dot_align(LSP* p, VecAsn* asns, CfreeLinkExpr* align_n) { + CfreeLinkExpr* dot; + CfreeLinkExpr* aln; + CfreeLinkAssignment a; + dot = lsp_new_expr(p); + if (!dot) return 1; + dot->kind = CFREE_LE_DOT; + aln = lsp_new_expr(p); + if (!aln) return 1; + aln->kind = CFREE_LE_ALIGN; + aln->v.align.val = dot; + aln->v.align.align = align_n; + a.kind = CFREE_LAS_DOT; + a.sym = NULL; + a.expr = aln; + return VEC_PUSH(p, *asns, a); +} + +/* ---- output section body ---- */ + +static int parse_input_matchers(LSP* p, VecMatch* out) { + /* opening `*` already consumed by caller. expect `(p1 p2 ...)` */ + if (expect_ch(p, '(')) return 1; + for (;;) { + int ch; + skip_ws(p); + if (p->err) return 1; + ch = peek_ch(p); + if (ch == ')') { + ++p->pos; + return 0; + } + if (ch < 0) { + lsp_errf(p, p->pos, "unterminated `*(...)`"); + return 1; + } + /* a pattern is a section-name-like run: id-start chars plus '*'. 
*/ + { + size_t start; + const char* s; + size_t n; + CfreeLinkInputMatch m; + start = p->pos; + while (p->pos < p->len) { + char c = p->src[p->pos]; + if (is_id_cont((unsigned char)c) || c == '*') + ++p->pos; + else + break; + } + n = p->pos - start; + if (n == 0) { + lsp_errf(p, p->pos, "expected section pattern"); + return 1; + } + s = p->src + start; + m.file_pattern = NULL; + m.section_pattern = lsp_strdup(p, s, n); + m.keep = 0; + if (VEC_PUSH(p, *out, m)) return 1; + } + } +} + +static int parse_section_body(LSP* p, VecMatch* inputs, VecAsn* asns) { + if (expect_ch(p, '{')) return 1; + for (;;) { + int ch; + skip_ws(p); + if (p->err) return 1; + ch = peek_ch(p); + if (ch == '}') { + ++p->pos; + return 0; + } + if (ch < 0) { + lsp_errf(p, p->pos, "unterminated section body"); + return 1; + } + if (ch == '*') { + ++p->pos; + if (parse_input_matchers(p, inputs)) return 1; + continue; + } + if (ch == '.') { + /* `. = expr;` */ + size_t off = p->pos; + ++p->pos; + skip_ws(p); + if (p->err) return 1; + if (!match_ch(p, '=')) { + lsp_errf(p, off, "expected `. 
= expr` in section body"); + return 1; + } + { + CfreeLinkExpr* e = parse_expr(p); + CfreeLinkAssignment a; + if (!e) return 1; + if (!match_ch(p, ';')) { /* ; is optional but encouraged */ + } + a.kind = CFREE_LAS_DOT; + a.sym = NULL; + a.expr = e; + if (VEC_PUSH(p, *asns, a)) return 1; + } + continue; + } + if (is_id_start(ch)) { + /* sym = expr; */ + const char* s; + size_t n; + CfreeLinkExpr* e; + CfreeLinkAssignment a; + if (match_kw(p, "PROVIDE") || match_kw(p, "KEEP")) { + lsp_errf(p, p->pos, "PROVIDE/KEEP not supported in this subset"); + return 1; + } + if (lex_ident(p, &s, &n)) return 1; + skip_ws(p); + if (p->err) return 1; + if (!match_ch(p, '=')) { + lsp_errf(p, p->pos, "expected `=` after `%.*s`", (int)n, s); + return 1; + } + e = parse_expr(p); + if (!e) return 1; + (void)match_ch(p, ';'); + a.kind = CFREE_LAS_SYM; + a.sym = lsp_strdup(p, s, n); + a.expr = e; + if (VEC_PUSH(p, *asns, a)) return 1; + continue; + } + lsp_errf(p, p->pos, "unexpected '%c' in section body", (char)ch); + return 1; + } +} + +/* ---- output section header ---- */ + +static int parse_output_section(LSP* p, const char* name_buf, size_t name_len, + VecSec* sections) { + /* The `:` is the next non-ws char on entry. Header may carry + * `: ALIGN(N)` then `{ body }`. */ + CfreeLinkOutputSection sec; + VecMatch inputs = {0}; + VecAsn asns = {0}; + CfreeLinkExpr* align_n = NULL; + + if (expect_ch(p, ':')) return 1; + skip_ws(p); + if (p->err) return 1; + if (match_kw(p, "ALIGN")) { + if (expect_ch(p, '(')) return 1; + align_n = parse_expr(p); + if (!align_n) return 1; + if (expect_ch(p, ')')) return 1; + } + /* Reject AT>, > REGION, >REGION before the body. 
*/ + skip_ws(p); + if (p->err) return 1; + if (p->pos < p->len && + (p->src[p->pos] == '>' || (p->src[p->pos] == 'A' && match_kw(p, "AT")))) { + lsp_errf(p, p->pos, + "memory-region placement (>REGION / AT>) not supported"); + return 1; + } + + /* Section header alignment is encoded as the first asn — applicator + * pulls it before processing inputs. */ + if (align_n) { + if (push_dot_align(p, &asns, align_n)) goto fail; + } + + if (parse_section_body(p, &inputs, &asns)) goto fail; + + /* Optional trailing `> REGION` / `AT> REGION` / `: NOLOAD` etc. — all + * unsupported. We allow an optional trailing `;` and nothing else. */ + (void)match_ch(p, ';'); + + /* Materialize. */ + { + CfreeLinkInputMatch* arr_in = NULL; + CfreeLinkAssignment* arr_as = NULL; + if (inputs.n) { + arr_in = arena_array(p->c->tu, CfreeLinkInputMatch, inputs.n); + if (!arr_in) goto fail; + memcpy(arr_in, inputs.p, sizeof(*arr_in) * inputs.n); + } + if (asns.n) { + arr_as = arena_array(p->c->tu, CfreeLinkAssignment, asns.n); + if (!arr_as) goto fail; + memcpy(arr_as, asns.p, sizeof(*arr_as) * asns.n); + } + memset(&sec, 0, sizeof(sec)); + sec.name = lsp_strdup(p, name_buf, name_len); + sec.inputs = arr_in; + sec.ninputs = inputs.n; + sec.asns = arr_as; + sec.nasns = asns.n; + } + + vec_free_(p, inputs.p, inputs.cap, sizeof(*inputs.p)); + vec_free_(p, asns.p, asns.cap, sizeof(*asns.p)); + + return VEC_PUSH(p, *sections, sec); + +fail: + vec_free_(p, inputs.p, inputs.cap, sizeof(*inputs.p)); + vec_free_(p, asns.p, asns.cap, sizeof(*asns.p)); + return 1; +} + +/* ---- SECTIONS{...} ---- */ + +static int parse_sections_block(LSP* p, VecAsn* top_asns, VecSec* sections) { + if (expect_ch(p, '{')) return 1; + for (;;) { + int ch; + skip_ws(p); + if (p->err) return 1; + ch = peek_ch(p); + if (ch == '}') { + ++p->pos; + return 0; + } + if (ch < 0) { + lsp_errf(p, p->pos, "unterminated SECTIONS block"); + return 1; + } + /* /DISCARD/ : { body } */ + if (ch == '/') { + static const char kDiscard[] = 
"/DISCARD/"; + size_t klen = sizeof(kDiscard) - 1; + if (p->pos + klen <= p->len && + memcmp(p->src + p->pos, kDiscard, klen) == 0) { + p->pos += klen; + if (parse_output_section(p, kDiscard, klen, sections)) return 1; + continue; + } + lsp_errf(p, p->pos, "expected /DISCARD/ or section header"); + return 1; + } + /* `. = expr;` at SECTIONS top level */ + if (ch == '.') { + size_t off = p->pos; + /* Distinguish bare-dot (`. =`) from `.text :` head. Bare dot has + * no id-cont following. */ + if (p->pos + 1 < p->len && + is_id_cont((unsigned char)p->src[p->pos + 1])) { + /* falls through to identifier path */ + } else { + ++p->pos; + skip_ws(p); + if (p->err) return 1; + if (!match_ch(p, '=')) { + lsp_errf(p, off, "expected `. = expr`"); + return 1; + } + { + CfreeLinkExpr* e = parse_expr(p); + CfreeLinkAssignment a; + if (!e) return 1; + (void)match_ch(p, ';'); + a.kind = CFREE_LAS_DOT; + a.sym = NULL; + a.expr = e; + if (VEC_PUSH(p, *top_asns, a)) return 1; + } + continue; + } + } + if (is_id_start(ch)) { + /* either `name :` (output section) or `sym = expr;` */ + const char* s; + size_t n; + size_t name_off; + if (match_kw(p, "PROVIDE")) { + lsp_errf(p, p->pos, "PROVIDE not supported in this subset"); + return 1; + } + name_off = p->pos; + if (lex_ident(p, &s, &n)) return 1; + skip_ws(p); + if (p->err) return 1; + if (p->pos < p->len && p->src[p->pos] == ':') { + char* nm = lsp_strdup(p, s, n); + if (!nm) return 1; + if (parse_output_section(p, nm, n, sections)) return 1; + continue; + } + if (match_ch(p, '=')) { + CfreeLinkExpr* e = parse_expr(p); + CfreeLinkAssignment a; + if (!e) return 1; + (void)match_ch(p, ';'); + a.kind = CFREE_LAS_SYM; + a.sym = lsp_strdup(p, s, n); + a.expr = e; + if (VEC_PUSH(p, *top_asns, a)) return 1; + continue; + } + lsp_errf(p, name_off, + "expected `:` (output section) or `=` (assignment) after " + "`%.*s`", + (int)n, s); + return 1; + } + lsp_errf(p, p->pos, "unexpected '%c' in SECTIONS body", (char)ch); + return 1; + } +} + +/* 
---- top level ---- */ + +static int parse_top(LSP* p, CfreeLinkScript* out) { + VecAsn top_asns = {0}; + VecSec sections = {0}; + const char* entry_name = NULL; + int saw_sections = 0; + int rc = 1; + + for (;;) { + int ch; + skip_ws(p); + if (p->err) goto done; + if (p->pos >= p->len) break; + ch = (unsigned char)p->src[p->pos]; + + if (is_id_start(ch)) { + if (match_kw(p, "ENTRY")) { + const char* s; + size_t n; + if (expect_ch(p, '(')) goto done; + if (lex_ident(p, &s, &n)) goto done; + if (expect_ch(p, ')')) goto done; + (void)match_ch(p, ';'); + entry_name = lsp_strdup(p, s, n); + if (!entry_name) goto done; + continue; + } + if (match_kw(p, "SECTIONS")) { + if (saw_sections) { + lsp_errf(p, p->pos, "duplicate SECTIONS block"); + goto done; + } + if (parse_sections_block(p, &top_asns, &sections)) goto done; + saw_sections = 1; + continue; + } + if (match_kw(p, "MEMORY") || match_kw(p, "OVERLAY") || + match_kw(p, "INSERT") || match_kw(p, "OUTPUT_FORMAT") || + match_kw(p, "OUTPUT_ARCH") || match_kw(p, "INPUT") || + match_kw(p, "GROUP") || match_kw(p, "VERSION") || + match_kw(p, "PROVIDE") || match_kw(p, "STARTUP") || + match_kw(p, "SEARCH_DIR") || match_kw(p, "TARGET")) { + lsp_errf(p, p->pos, + "directive not supported in this linker-script subset"); + goto done; + } + lsp_errf(p, p->pos, "unknown top-level directive"); + goto done; + } + lsp_errf(p, p->pos, "unexpected '%c' at top level", (char)ch); + goto done; + } + + /* Materialize. 
*/ + out->entry = entry_name; + out->regions = NULL; + out->nregions = 0; + out->top_asns = NULL; + out->ntop_asns = 0; + out->sections = NULL; + out->nsections = 0; + + if (top_asns.n) { + CfreeLinkAssignment* a = + arena_array(p->c->tu, CfreeLinkAssignment, top_asns.n); + if (!a) goto done; + memcpy(a, top_asns.p, sizeof(*a) * top_asns.n); + out->top_asns = a; + out->ntop_asns = top_asns.n; + } + if (sections.n) { + CfreeLinkOutputSection* s = + arena_array(p->c->tu, CfreeLinkOutputSection, sections.n); + if (!s) goto done; + memcpy(s, sections.p, sizeof(*s) * sections.n); + out->sections = s; + out->nsections = sections.n; + } + rc = 0; + +done: + vec_free_(p, top_asns.p, top_asns.cap, sizeof(*top_asns.p)); + vec_free_(p, sections.p, sections.cap, sizeof(*sections.p)); + return rc; +} + +/* ---- public API ---- */ + +int cfree_link_script_parse(CfreeCompiler* c, const char* text, size_t len, + const CfreeLinkScript** out) { + LSP p; + CfreeLinkScript* script; + int rc; + + if (!c || !text || !out) return 1; + if (!c->env || !c->env->heap) return 1; + + script = arena_znew(c->tu, CfreeLinkScript); + if (!script) return 1; + + memset(&p, 0, sizeof(p)); + p.c = c; + p.heap = (Heap*)c->env->heap; + p.src = text; + p.len = len; + + rc = parse_top(&p, script); + if (rc != 0 || p.err) return 1; + *out = script; + return 0; +} + +void cfree_link_script_free(CfreeCompiler* c, const CfreeLinkScript* s) { + /* Arena-owned: lifetime tied to the compiler's tu arena. No-op. */ + (void)c; + (void)s; +} diff --git a/test/link/CORPUS.md b/test/link/CORPUS.md @@ -39,6 +39,8 @@ expects from the combined sequence. | `gc_absent` | one symbol per line that must be absent post-link (e.g. 
dropped by `--gc-sections`) | | `gc_present` | one symbol per line that must remain present post-link | | `archive_b` | package b.o as b.a; content `demand` (normal) or `whole` (--whole-archive) | +| `linker_script` | basename of an `.lds` file in the case dir; passed to both runners via `--linker-script` | +| `kernel_image` | empty marker; case is a freestanding kernel image. Skips R and J; on E, runs the linked exe under `qemu-system-aarch64 -kernel` with semihosting | Negative tests live in `test/link/bad/<name>/` instead of `cases/`. Each bad-case directory contains source files (compile cleanly) plus an @@ -118,6 +120,12 @@ Cases 02–09 all pair ADR_PREL_PG_HI21 with their primary LDST reloc. |---|------|-----------| | 31 | `tls_local_exec` | `_Thread_local` w/ initializer; `R_AARCH64_TLSLE_ADD_TPREL_{HI12,LO12_NC}` apply + PT_TLS layout | +### Group G — linker scripts + +| # | Name | Exercises | +|---|------|-----------| +| 35 | `linker_script_kernel` | `ENTRY`; `SECTIONS { ... }` with `. = 0x40080000;`, `.text`/`.rodata`/`.data`/`.bss` output sections using `ALIGN`; `__bss_start`, `_end`; `/DISCARD/` of `.note.*`, `.comment`, `.eh_frame`. Linked image boots under `qemu-system-aarch64 -kernel` and exits via ARM semihosting. | + ### bad/ — negative tests | # | Name | Exercises | diff --git a/test/link/cases/35_linker_script_kernel/a.c b/test/link/cases/35_linker_script_kernel/a.c @@ -0,0 +1,9 @@ +static const char msg[] = "ok"; +static int counter; + +int kmain(void) { + volatile const char* m = msg; + volatile int* c = &counter; + *c = (int)m[0]; + return 0; +} diff --git a/test/link/cases/35_linker_script_kernel/cflags b/test/link/cases/35_linker_script_kernel/cflags @@ -0,0 +1 @@ +-fno-asynchronous-unwind-tables diff --git a/test/link/cases/35_linker_script_kernel/entry.S b/test/link/cases/35_linker_script_kernel/entry.S @@ -0,0 +1,79 @@ +/* arm64 kernel entry: Image header + EL2->EL1 drop + stack + kmain. 
+ * Booted by qemu-system-aarch64 -kernel; exits via ARM semihosting + * SYS_EXIT_EXTENDED with kmain's return value as the host exit code. */ + + .section .text, "ax" + .globl _start +_start: + b stext /* 0x00 */ + .long 0 /* 0x04 */ + .quad 0x80000 /* 0x08 text_offset */ + .quad _end - _start /* 0x10 image_size */ + .quad 0xa /* 0x18 flags: 4K, anywhere, LE */ + .quad 0 /* 0x20 */ + .quad 0 /* 0x28 */ + .quad 0 /* 0x30 */ + .ascii "ARM\x64" /* 0x38 magic */ + .long 0 /* 0x3c PE COFF offset */ + +stext: + msr daifset, #0xf + + mrs x9, CurrentEL + lsr x9, x9, #2 + cmp x9, #2 + b.ne in_el1 + + mov x9, #(1 << 31) + msr hcr_el2, x9 + mov x9, #0x3c5 + msr spsr_el2, x9 + adrp x9, in_el1 + add x9, x9, :lo12:in_el1 + msr elr_el2, x9 + adrp x9, kstack_top + add x9, x9, :lo12:kstack_top + msr sp_el1, x9 + eret + +in_el1: + adrp x9, kstack_top + add x9, x9, :lo12:kstack_top + mov sp, x9 + + bl kmain + + /* QEMU semihosting exit. SYS_EXIT_EXTENDED (op 0x20) takes x1 = + * pointer to [reason, subcode]; ADP_Stopped_ApplicationExit + * (0x20026) returns subcode as the host exit code. */ + cbnz w0, .Lfail + mov w0, #0x20 + adrp x1, .Lexit_ok + add x1, x1, :lo12:.Lexit_ok + hlt #0xf000 +.Lhang_ok: + wfe + b .Lhang_ok +.Lfail: + mov w0, #0x20 + adrp x1, .Lexit_fail + add x1, x1, :lo12:.Lexit_fail + hlt #0xf000 +.Lhang_fail: + wfe + b .Lhang_fail + + .section .rodata, "a" + .balign 8 +.Lexit_ok: + .quad 0x20026 /* ADP_Stopped_ApplicationExit */ + .quad 0 /* subcode = host exit 0 */ +.Lexit_fail: + .quad 0x20026 + .quad 1 /* subcode = host exit 1 */ + + .section .bss, "aw", %nobits + .balign 16 +kstack_bottom: + .skip 4096 +kstack_top: diff --git a/test/link/cases/35_linker_script_kernel/expected b/test/link/cases/35_linker_script_kernel/expected @@ -0,0 +1 @@ +0 diff --git a/test/link/cases/35_linker_script_kernel/kernel.lds b/test/link/cases/35_linker_script_kernel/kernel.lds @@ -0,0 +1,29 @@ +ENTRY(_start) + +SECTIONS { + . 
= 0x40080000; + + .text : ALIGN(8) { + *(.text .text.*) + } + + .rodata : ALIGN(8) { + *(.rodata .rodata.*) + } + + .data : ALIGN(8) { + *(.data .data.*) + } + + .bss : ALIGN(16) { + __bss_start = .; + *(.bss .bss.*) + . = ALIGN(., 16); + } + + _end = .; + + /DISCARD/ : { + *(.note.*) *(.comment) *(.eh_frame) + } +} diff --git a/test/link/cases/35_linker_script_kernel/kernel_image b/test/link/cases/35_linker_script_kernel/kernel_image diff --git a/test/link/cases/35_linker_script_kernel/linker_script b/test/link/cases/35_linker_script_kernel/linker_script @@ -0,0 +1 @@ +kernel.lds diff --git a/test/link/harness/jit_runner.c b/test/link/harness/jit_runner.c @@ -3,6 +3,7 @@ * Usage: * jit_runner [--gc-sections] [--use-resolver] * [--check-absent SYM] [--check-present SYM] + * [--linker-script <path>] * [--archive [--whole-archive] <lib.a>] <in.o> ... * * Reads .o (and optionally .a) inputs, calls cfree_link_jit (which runs @@ -259,6 +260,7 @@ int main(int argc, char** argv) { * --check-present SYM: after link, verify symbol IS in image. 
*/ const char* check_absent = NULL; const char* check_present = NULL; + const char* script_path = NULL; CfreeBytesInput objs[64]; CfreeBytesInputArchive archives[16]; @@ -280,6 +282,8 @@ int main(int argc, char** argv) { check_absent = argv[++i]; } else if (!strcmp(argv[i], "--check-present") && i + 1 < argc) { check_present = argv[++i]; + } else if (!strcmp(argv[i], "--linker-script") && i + 1 < argc) { + script_path = argv[++i]; } else { uint8_t* data; size_t len; @@ -339,6 +343,26 @@ int main(int argc, char** argv) { opts.inputs.extern_resolver_user = NULL; } + if (script_path) { + uint8_t* sbytes; + size_t slen; + if (slurp(script_path, &sbytes, &slen)) { + fprintf(stderr, "jit-runner: cannot read %s\n", script_path); + cfree_compiler_free(c); + return 2; + } + const CfreeLinkScript* script = NULL; + int prc = cfree_link_script_parse(c, (const char*)sbytes, slen, &script); + free(sbytes); + if (prc) { + fprintf(stderr, "jit-runner: linker script parse failed: %s\n", + script_path); + cfree_compiler_free(c); + return 1; + } + opts.inputs.linker_script = script; + } + CfreeJit* jit = NULL; int rc = cfree_link_jit(c, &opts, &jit); for (int i = 0; i < nbufs; i++) free(bufs[i]); diff --git a/test/link/harness/link_exe_runner.c b/test/link/harness/link_exe_runner.c @@ -1,7 +1,8 @@ /* link_exe_runner — Path E harness driver. * * Usage: - * link_exe_runner [--gc-sections] [--entry NAME] -o <out.exe> + * link_exe_runner [--gc-sections] [--entry NAME] [--linker-script <path>] + * -o <out.exe> * [--archive [--whole-archive] <lib.a>] <in.o> ... * * Reads inputs, calls cfree_link_exe, writes the ELF executable. 
@@ -98,6 +99,7 @@ static int write_exe(const char* path, const uint8_t* data, size_t len) { int main(int argc, char** argv) { const char* out_path = NULL; const char* entry_name = "_start"; + const char* script_path = NULL; int gc_sections = 0; int next_archive = 0; int next_whole = 0; @@ -118,6 +120,8 @@ int main(int argc, char** argv) { next_archive = 1; } else if (!strcmp(argv[i], "--entry") && i + 1 < argc) { entry_name = argv[++i]; + } else if (!strcmp(argv[i], "--linker-script") && i + 1 < argc) { + script_path = argv[++i]; } else if (!strcmp(argv[i], "-o") && i + 1 < argc) { out_path = argv[++i]; } else { @@ -177,6 +181,26 @@ int main(int argc, char** argv) { opts.inputs.entry = entry_name; opts.gc_sections = gc_sections; + if (script_path) { + uint8_t* sbytes; + size_t slen; + if (slurp(script_path, &sbytes, &slen)) { + fprintf(stderr, "link-exe-runner: cannot read %s\n", script_path); + cfree_compiler_free(c); + return 2; + } + const CfreeLinkScript* script = NULL; + int prc = cfree_link_script_parse(c, (const char*)sbytes, slen, &script); + free(sbytes); + if (prc) { + fprintf(stderr, "link-exe-runner: linker script parse failed: %s\n", + script_path); + cfree_compiler_free(c); + return 1; + } + opts.inputs.linker_script = script; + } + CfreeWriter* w = cfree_writer_mem(&g_heap); if (!w) { cfree_compiler_free(c);