kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 9feae6c4c66cd1be6b1a90b22a6cad2e2ab43cb6
parent 74c6f214ef1dfd173158873ecd2893772e2a5b3a
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun, 10 May 2026 12:59:35 -0700

obj/test: macho roundtrip — fix read/emit fidelity, wire test-link path R

Mach-O R path runs every applicable test/link/cases case (36 pass,
3 SKIP-NA for ELF-only `__start_/__stop_`, ELF TLS, kernel script).
The reader/writer fidelity bugs surfaced by structural diff:

  - underscore mangling broke names without a `_` prefix
    (`ltmp0` -> `_ltmp0`); names round-trip verbatim now
  - undef external syms lost N_EXT (extern decls came back local)
  - weak undef refs (N_WEAK_REF) weren't recognized — only N_WEAK_DEF
    collapsed to SB_WEAK
  - common syms dropped their size (n_value)
  - section S_TYPE/S_ATTR_* bits were recomputed from SecKind instead
    of using the preserved Section.ext_type
  - zerofill sections were laid out inline; conventional Mach-O places
    them at the segment tail, which the symbol n_values depend on
  - n_desc pass-through bits (N_NO_DEAD_STRIP etc.) were lost. Added
    obj_symbol_set_flags so readers can stash format-specific bits
    the canonical bind/vis/kind triple doesn't model

Harness:
  - test/test.mk: cfree-roundtrip-macho Make target; test-link
    depends on it
  - test/link/run.sh: dispatch ROUNDTRIP_BIN + NORMALIZE by
    CFREE_TEST_OBJ; new have_dump covers llvm-readelf / llvm-readobj;
    per-case `targets` file gates ELF-only cases as SKIP-NA
  - test/macho/normalize.py: structural diff via llvm-readobj

Diffstat:
Mdoc/linker-status.md | 1+
Msrc/obj/macho_emit.c | 56+++++++++++++++++++++++++++++++++++++++++++++++---------
Msrc/obj/macho_read.c | 30+++++++++++++++++++++---------
Msrc/obj/obj.c | 8++++++++
Msrc/obj/obj.h | 9+++++++++
Atest/link/cases/25h_gc_start_stop/targets | 3+++
Atest/link/cases/31_tls_local_exec/targets | 3+++
Atest/link/cases/35_linker_script_kernel/targets | 2++
Mtest/link/run.sh | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
Atest/macho/normalize.py | 259+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/test.mk | 17++++++++++++-----
11 files changed, 420 insertions(+), 31 deletions(-)

diff --git a/doc/linker-status.md b/doc/linker-status.md @@ -20,6 +20,7 @@ live in `test/link/` — they are not duplicated in `test/elf/`. | `test-link` E (aa64) | 37 | 0 | qemu/podman aarch64 exec, incl. IFUNC | | `test-link` J (aa64) | 38 | 0 | JIT in-process incl. GC subgroup, IFUNC, TLS | | `test-link` R (rv64) | 38 | 0 | object roundtrip via cfree-roundtrip | +| `test-link` R (aa64-macho) | 36 | 0 | Mach-O object roundtrip via cfree-roundtrip-macho (3 cases SKIP-NA: ELF-only) | | `test-link` E (rv64) | 38 | 0 | qemu/podman riscv64 exec, incl. IFUNC + TLS | | `test-link` bad | 2 | 0 | `bad/30_undef_strong` (E + J) | | `test-musl` | 6 | 0 | musl 1.2.5 static + dynamic: syscall, errno, printf | diff --git a/src/obj/macho_emit.c b/src/obj/macho_emit.c @@ -249,8 +249,18 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) { name_to_seg_sect(nm ? nm : "", (u32)nlen, s->kind, &m->ns); m->obj_sec = i; m->align = s->align ? s->align : 1; - m->flags = section_flags_for(s->kind, s->flags, m->ns.sectname, - m->ns.sect_len); + /* Mach-O reader stashes the raw section.flags (S_TYPE | S_ATTR_*) + * in Section.ext_type when reading a Mach-O input. Use it + * verbatim so attribute bits like S_ATTR_NO_DEAD_STRIP / + * S_ATTR_LIVE_SUPPORT round-trip. Fall back to the kind-derived + * default for sections originating from non-Mach-O readers (e.g. + * cfree codegen). */ + if (s->ext_kind == OBJ_EXT_MACHO && s->ext_type) { + m->flags = s->ext_type; + } else { + m->flags = section_flags_for(s->kind, s->flags, m->ns.sectname, + m->ns.sect_len); + } if (s->sem == SSEM_NOBITS || s->kind == SEC_BSS) { m->is_zerofill = 1; m->size = s->bss_size; @@ -268,10 +278,24 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) { /* ---- pass 2: assign vmaddrs (segment-relative) and per-section * flat-layout addresses. MH_OBJECT keeps everything in * one segment with vmaddr=0; section addr fields are - * relative offsets within the segment. */ + * relative offsets within the segment. 
+ * + * Two-pass to match the conventional Mach-O `MH_OBJECT` layout: + * non-zerofill sections come first in vmaddr order, then zerofill + * sections at the tail. Apple `as` and clang `-c` both lay out + * this way, and roundtripping must reproduce it so symbol n_values + * (which are segment-relative addresses) compare equal. */ u64 cur_addr = 0; for (u32 i = 0; i < nsecs; ++i) { MSec* m = &secs[i]; + if (m->is_zerofill) continue; + cur_addr = ALIGN_UP(cur_addr, (u64)m->align); + m->addr = cur_addr; + cur_addr += m->size; + } + for (u32 i = 0; i < nsecs; ++i) { + MSec* m = &secs[i]; + if (!m->is_zerofill) continue; cur_addr = ALIGN_UP(cur_addr, (u64)m->align); m->addr = cur_addr; cur_addr += m->size; @@ -320,14 +344,16 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) { size_t nlen; const char* nm = pool_str(c->global, s->name, &nlen); - /* Mach-O convention: C symbols carry a leading underscore on - * disk. Apple toolchains rely on this for the linker - * "_main"-vs-"main" entry point and for every libSystem call. - * Round-tripped via read_macho, which strips one leading `_`. */ + /* Mach-O symbol names are stored on disk verbatim — including + * the leading `_` Apple toolchains use for C-source-level + * symbols ("_main" for `int main()`). The cfree path treats + * that prefix as part of the on-disk name, not a transform + * applied at emit; a future Mach-O codegen frontend can + * prepend the underscore itself the same way LLVM's MCSymbol + * does via target.MCAsmInfo. Round-tripping is then byte-for- + * byte: emit writes what read sees. */ if (nlen && nm) { - u8 us = '_'; u32 off = buf_pos(&strtab); - buf_write(&strtab, &us, 1); buf_write(&strtab, nm, nlen); u8 z = 0; buf_write(&strtab, &z, 1); @@ -348,6 +374,11 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) { if (undef) { type |= N_UNDF; + /* Undefined symbols with non-LOCAL bind are external references + * (the common case — every `extern int x;`). 
Setting N_EXT + * matches what clang emits and what Apple `ld` expects. */ + if (s->bind == SB_GLOBAL || s->bind == SB_WEAK) type |= N_EXT; + if (s->bind == SB_WEAK) n_desc |= N_WEAK_REF; value = 0; } else if (s->kind == SK_ABS) { type |= N_ABS; @@ -368,6 +399,13 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) { if (s->bind == SB_WEAK) n_desc |= N_WEAK_DEF; } + /* OR in any pass-through n_desc bits the reader stashed in + * sym->flags (N_NO_DEAD_STRIP, etc.). The bits we already + * compute (N_WEAK_DEF / N_WEAK_REF and the common-alignment + * field) are already excluded by read_macho before stashing, + * so a plain OR can't double-count. */ + n_desc |= s->flags; + ms->n_type = type; ms->n_sect = n_sect; ms->n_desc = n_desc; diff --git a/src/obj/macho_read.c b/src/obj/macho_read.c @@ -236,13 +236,10 @@ ObjBuilder* read_macho(Compiler* c, const char* name, const u8* data, nm = (const char*)(strtab + strx); while (strx + nlen < strsize && nm[nlen]) ++nlen; } - /* Inverse of the leading-underscore prefix emit_macho applies to C - * symbols. If the Mach-O name starts with `_`, strip it so the - * builder holds the source-level name. */ - if (nlen && nm[0] == '_') { - ++nm; - --nlen; - } + /* Mach-O names round-trip verbatim — the leading `_` Apple + * toolchains apply to C symbols is part of the on-disk name as + * far as ObjBuilder is concerned. Mirrors the no-transform + * decision in emit_macho. */ Sym sn = nlen ? pool_intern(c->global, nm, nlen) : 0; u8 type_field = (u8)(n_type & N_TYPE); @@ -250,12 +247,16 @@ ObjBuilder* read_macho(Compiler* c, const char* name, const u8* data, u8 pext = (u8)(n_type & N_PEXT); u16 bind = ext ? SB_GLOBAL : SB_LOCAL; - if (ext && (n_desc & N_WEAK_DEF)) bind = SB_WEAK; + /* Weak DEFs (defined symbols) carry N_WEAK_DEF; weak REFs (undef + * `__attribute__((weak))` references) carry N_WEAK_REF. Either + * one collapses to SB_WEAK in the cfree model. 
*/ + if (ext && (n_desc & (N_WEAK_DEF | N_WEAK_REF))) bind = SB_WEAK; u8 vis = pext ? SV_HIDDEN : SV_DEFAULT; u16 kind; ObjSecId sec_id = OBJ_SEC_NONE; u64 value = 0; + u64 size = 0; u64 cmnalign = 0; if (type_field == N_UNDF) { @@ -264,6 +265,7 @@ ObjBuilder* read_macho(Compiler* c, const char* name, const u8* data, * GET_COMM_ALIGN bits. */ kind = SK_COMMON; value = 0; + size = n_value; u32 la = (u32)((n_desc >> 8) & 0xf); cmnalign = 1u << la; } else { @@ -289,7 +291,17 @@ ObjBuilder* read_macho(Compiler* c, const char* name, const u8* data, } ObjSymId id = obj_symbol_ex(ob, sn, (SymBind)bind, (SymVis)vis, - (SymKind)kind, sec_id, value, 0, cmnalign); + (SymKind)kind, sec_id, value, size, cmnalign); + /* n_desc carries Mach-O attribute bits beyond what bind/vis/kind + * model — N_NO_DEAD_STRIP, N_REF_TO_WEAK, N_ARM_THUMB_DEF, etc. + * Mask off the bits we already round-trip via bind (N_WEAK_DEF / + * N_WEAK_REF) and the alignment field for commons (which lives + * in cmnalign), then stash the remainder so emit_macho can OR it + * back in. 
*/ + u16 desc_pass = n_desc; + desc_pass &= (u16)~(N_WEAK_DEF | N_WEAK_REF); + if (kind == SK_COMMON) desc_pass &= 0x00ff; /* drop align field */ + if (desc_pass) obj_symbol_set_flags(ob, id, desc_pass); sym_macho_to_obj[i] = id; } diff --git a/src/obj/obj.c b/src/obj/obj.c @@ -226,6 +226,14 @@ void obj_symbol_define(ObjBuilder* ob, ObjSymId id, ObjSecId section_id, if (s->kind == SK_UNDEF) s->kind = SK_OBJ; } +void obj_symbol_set_flags(ObjBuilder* ob, ObjSymId id, u16 flags) { + ObjSym* s; + if (id == OBJ_SYM_NONE) return; + s = Symbols_at(&ob->symbols, id); + if (!s) return; + s->flags = flags; +} + void obj_reloc(ObjBuilder* ob, ObjSecId section_id, u32 offset, RelocKind kind, ObjSymId sym, i64 addend) { obj_reloc_ex(ob, section_id, offset, kind, sym, addend, 1, 0); diff --git a/src/obj/obj.h b/src/obj/obj.h @@ -334,6 +334,15 @@ void obj_finalize(ObjBuilder*); void obj_set_elf_e_flags(ObjBuilder*, u32 e_flags); int obj_get_elf_e_flags(const ObjBuilder*, u32* out); +/* Per-symbol format-specific flag bits. ObjSym.flags is otherwise + * unused; readers stash format-specific attribute bits there so the + * matching emitter can re-apply them. Today this is Mach-O n_desc + * pass-through (N_NO_DEAD_STRIP, etc.) — bits the canonical + * ObjSym.bind/vis/kind triple doesn't model. ELF callers are free + * to use the same field for their own pass-through; the contract is + * "bits go in / same bits come out", not a shared semantic. 
*/ +void obj_symbol_set_flags(ObjBuilder*, ObjSymId, u16 flags); + /* ---- read side (linker, file emitters, objdump) ---- */ u32 obj_section_count(const ObjBuilder*); const Section* obj_section_get(const ObjBuilder*, ObjSecId id); diff --git a/test/link/cases/25h_gc_start_stop/targets b/test/link/cases/25h_gc_start_stop/targets @@ -0,0 +1,3 @@ +aa64-elf +rv64-elf +x64-elf diff --git a/test/link/cases/31_tls_local_exec/targets b/test/link/cases/31_tls_local_exec/targets @@ -0,0 +1,3 @@ +aa64-elf +rv64-elf +x64-elf diff --git a/test/link/cases/35_linker_script_kernel/targets b/test/link/cases/35_linker_script_kernel/targets @@ -0,0 +1,2 @@ +aa64-elf +rv64-elf diff --git a/test/link/run.sh b/test/link/run.sh @@ -57,8 +57,10 @@ ROOT="$(cd "$(dirname "$0")/../.." && pwd)" TEST_DIR="$ROOT/test/link" BUILD_DIR="$ROOT/build/test" LIB_AR="$ROOT/build/libcfree.a" -ROUNDTRIP_BIN="$ROOT/build/test/cfree-roundtrip" -NORMALIZE="$ROOT/test/elf/normalize.py" +ROUNDTRIP_BIN_ELF="$ROOT/build/test/cfree-roundtrip" +ROUNDTRIP_BIN_MACHO="$ROOT/build/test/cfree-roundtrip-macho" +NORMALIZE_ELF="$ROOT/test/elf/normalize.py" +NORMALIZE_MACHO="$ROOT/test/macho/normalize.py" LINK_EXE_RUNNER="$BUILD_DIR/link-exe-runner" JIT_RUNNER="$BUILD_DIR/jit-runner" @@ -82,6 +84,8 @@ case "$CFREE_TEST_OBJ" in x64) CLANG_TRIPLE=x86_64-linux-gnu ;; rv64) CLANG_TRIPLE=riscv64-linux-gnu ;; esac + ROUNDTRIP_BIN="$ROUNDTRIP_BIN_ELF" + NORMALIZE="$NORMALIZE_ELF" ;; macho) EXEC_OS=macos @@ -90,6 +94,8 @@ case "$CFREE_TEST_OBJ" in x64) CLANG_TRIPLE=x86_64-apple-macos ;; rv64) printf 'CFREE_TEST_OBJ=macho has no rv64 target\n' >&2; exit 2 ;; esac + ROUNDTRIP_BIN="$ROUNDTRIP_BIN_MACHO" + NORMALIZE="$NORMALIZE_MACHO" ;; *) printf 'unknown CFREE_TEST_OBJ=%s\n' "$CFREE_TEST_OBJ" >&2; exit 2 ;; esac @@ -144,6 +150,18 @@ fi command -v llvm-readelf >/dev/null 2>&1 && have_readelf=1 command -v readelf >/dev/null 2>&1 && have_readelf=1 command -v python3 >/dev/null 2>&1 && have_python3=1 +have_readobj=0 +command -v 
llvm-readobj >/dev/null 2>&1 && have_readobj=1 +command -v readobj >/dev/null 2>&1 && have_readobj=1 +# Path R needs the right dump tool for the obj format. ELF wants +# llvm-readelf; Mach-O wants llvm-readobj. The harness exposes a +# single have_dump flag so the per-case skip logic doesn't have to +# branch on CFREE_TEST_OBJ. +have_dump=0 +case "$CFREE_TEST_OBJ" in + elf) [ $have_readelf -eq 1 ] && have_dump=1 ;; + macho) [ $have_readobj -eq 1 ] && have_dump=1 ;; +esac # Prefer llvm-ar for archive creation: Apple's /usr/bin/ar requires # Mach-O members and silently drops ELF objects (leaving only a SYMDEF # stub), which breaks the cross-target archive cases here. @@ -246,10 +264,29 @@ E_GC_PRESENT_LIST=() # ---- per-case loop --------------------------------------------------------- +CUR_TUPLE="${TEST_ARCH}-${CFREE_TEST_OBJ}" + for case_dir in "$TEST_DIR/cases"/*/; do [ -d "$case_dir" ] || continue name="$(basename "$case_dir")" [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && continue + # Per-case applicability: a `targets` file lists the <arch>-<obj> + # tuples the case applies to (one per line, or whitespace-separated). + # Cases with no `targets` file run on every tuple. Filtered cases + # print a SKIP-NA line and don't count against pass/fail/skip — they + # exercise target-specific features with no analogue elsewhere + # (e.g. ELF __start_/__stop_ boundary syms have no Mach-O peer; + # ELF TLS local-exec relocs differ fundamentally from Mach-O TLVP). 
+ if [ -f "$case_dir/targets" ]; then + applicable=0 + for tuple in $(cat "$case_dir/targets"); do + [ "$tuple" = "$CUR_TUPLE" ] && applicable=1 + done + if [ $applicable -eq 0 ]; then + printf ' %s %s — N/A on %s\n' "$(color_yel SKIP-NA)" "$name" "$CUR_TUPLE" + continue + fi + fi work="$BUILD_DIR/link/$name" mkdir -p "$work" @@ -399,7 +436,7 @@ for case_dir in "$TEST_DIR/cases"/*/; do # ---- Path R: roundtrip -------------------------------------------------- if [ $jit_only -eq 0 ] && [ $RUN_R -eq 1 ] && [ $kernel_image -eq 0 ]; then - if [ $have_roundtrip -eq 1 ] && [ $have_readelf -eq 1 ] && [ $have_python3 -eq 1 ]; then + if [ $have_roundtrip -eq 1 ] && [ $have_dump -eq 1 ] && [ $have_python3 -eq 1 ]; then t0=$(now_ms) r_ok=1 for obj in "${rt_obj_files[@]}"; do @@ -408,10 +445,20 @@ for case_dir in "$TEST_DIR/cases"/*/; do if ! "$ROUNDTRIP_BIN" "$obj" "$rt" 2>"$work/rt_${base}.err"; then r_ok=0; break fi - "$READELF_BIN" -aW "$obj" | python3 "$NORMALIZE" filter \ - >"$work/${base}_golden.norm" 2>/dev/null - "$READELF_BIN" -aW "$rt" | python3 "$NORMALIZE" filter \ - >"$work/${base}_rt.norm" 2>/dev/null + # ELF: pipe `readelf -aW` through normalize.py filter. + # Mach-O: normalize.py runs llvm-readobj on the file + # itself (it knows the right flag set). + if [ "$CFREE_TEST_OBJ" = "macho" ]; then + python3 "$NORMALIZE" "$obj" \ + >"$work/${base}_golden.norm" 2>/dev/null + python3 "$NORMALIZE" "$rt" \ + >"$work/${base}_rt.norm" 2>/dev/null + else + "$READELF_BIN" -aW "$obj" | python3 "$NORMALIZE" filter \ + >"$work/${base}_golden.norm" 2>/dev/null + "$READELF_BIN" -aW "$rt" | python3 "$NORMALIZE" filter \ + >"$work/${base}_rt.norm" 2>/dev/null + fi if ! 
diff -u "$work/${base}_golden.norm" \ "$work/${base}_rt.norm" \ >"$work/${base}_diff.txt" 2>&1; then @@ -422,7 +469,7 @@ for case_dir in "$TEST_DIR/cases"/*/; do if [ $r_ok -eq 1 ]; then note_pass "$name/R (${dt}ms)" else note_fail "$name/R"; fi else - note_skip "$name/R" "missing roundtrip/readelf/python3" + note_skip "$name/R" "missing roundtrip/dump-tool/python3" fi fi diff --git a/test/macho/normalize.py b/test/macho/normalize.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +"""Canonicalize llvm-readobj output for Mach-O so two .o files with +equivalent semantic content compare equal. + +Mach-O peer of test/elf/normalize.py — the structural-fidelity pivot +the path-R harness uses for CFREE_TEST_OBJ=macho. Differs from the +ELF normalizer because: + + * llvm-readobj's Mach-O output uses an indented "Section { ... }" + block format, not a one-line-per-row layout. The normalizer is a + block transformer rather than a line transformer. + * Mach-O symbol-name strtab packing order is implementation-defined + (clang packs externals before locals; cfree packs in symtab + order). The "(NN)" strx hint after each Name field is dropped so + those orderings compare equal. + * Section/segment name fields print with a trailing hex byte dump + (Name: __text (5F 5F 74 65 78 74 ...)). The hex tail is dropped + — the printable name already captures it. + * File offsets, RelocationOffset, addresses inside SectionData + headers are layout-dependent and dropped. 
+ +Strips: + - "(NN)" strtab offsets after Name: fields + - hex byte dumps after section/segment names + - "Index: N" (section index — re-derivable from order) + - "Offset: N" (file offset) + - "RelocationOffset: 0xN" + - "Address: 0xN" (where the section is positioned within the segment) + - the SectionData hex dump line numbers — content is what matters + - the "File:" header line (path varies between golden and rt) + +Sorts: + - Symbol entries within Symbols [] by Name (clang and cfree partition + differently within the same DYSYMTAB extents) + - Relocations within each Relocations { Section { ... } } by Offset + +Invocation: + normalize.py <file> — runs `llvm-readobj` on file, normalizes. + normalize.py filter — reads stdin, writes normalized to stdout. +""" +import re +import shutil +import subprocess +import sys + + +def _which(*names): + for n in names: + p = shutil.which(n) + if p: + return p + return None + + +def _readobj(path): + bin_path = _which("llvm-readobj", "readobj") + if not bin_path: + sys.stderr.write("normalize.py: cannot find llvm-readobj\n") + sys.exit(77) + args = [ + bin_path, + "--section-headers", + "--section-data", + "--relocations", + "--symbols", + path, + ] + res = subprocess.run(args, capture_output=True, text=True) + sys.stderr.write(res.stderr) + return res.stdout + + +# "Name: foo (NN)" — strip the strtab-offset hint +_NAME_STRX = re.compile(r"^(\s*Name:\s+\S+)\s+\(\d+\)\s*$") +# "Name: __text (5F 5F ...)" / "Segment: __TEXT (5F 5F ...)" — strip hex tail +_NAME_HEXBYTES = re.compile( + r"^(\s*(?:Name|Segment):\s+\S+)\s+\([0-9A-Fa-f ]+\)\s*$" +) +# Lines whose value is layout-dependent and not load-bearing for fidelity. +_DROP_FIELD_RE = re.compile( + r"^\s*(Index|Offset|RelocationOffset|Address|File):\s" +) +# SectionData hex rows: " 0000: 00008052 ..." +_SECTION_DATA_ROW = re.compile(r"^\s*[0-9A-Fa-f]{4}:\s+") +# Block headers we use for sort scoping. 
+_BLOCK_HDR = re.compile(r"^(\s*)(\w[\w]*)\s+(\[|\{)\s*$") + + +def _strip_line(line): + rstripped = line.rstrip("\n") + if _DROP_FIELD_RE.match(rstripped): + return None + m = _NAME_STRX.match(rstripped) + if m: + return m.group(1) + "\n" + m = _NAME_HEXBYTES.match(rstripped) + if m: + return m.group(1) + "\n" + return line + + +def _block_key(block_lines, prefer_keys=("Name", "Offset")): + """Pick a sort key from a block: first matching field wins.""" + for k in prefer_keys: + for ln in block_lines: + m = re.match(r"^\s*" + re.escape(k) + r":\s+(.+?)\s*$", ln) + if m: + return m.group(1) + return "" + + +def _split_top_blocks(lines): + """Walk lines at one indent level; group into blocks bounded by + matching '{...}'. Returns (prefix, blocks, suffix) where blocks is + a list of [lines] each starting with 'Foo {' and ending '}'.""" + prefix, suffix = [], [] + blocks = [] + i = 0 + n = len(lines) + started = False + base_indent = None + while i < n: + line = lines[i] + stripped = line.rstrip("\n") + # Detect a "Symbol {" or "Section {" or "Relocation {" block start. + m = re.match(r"^(\s*)(\w+)\s+\{\s*$", stripped) + if m and m.group(2) in ("Symbol", "Section", "Relocation"): + if base_indent is None: + base_indent = len(m.group(1)) + indent = len(m.group(1)) + if indent == base_indent: + started = True + # Collect until the matching '}' + depth = 1 + blk = [line] + i += 1 + while i < n and depth > 0: + bl = lines[i] + bs = bl.rstrip("\n") + if re.match(r"^\s*\w[\w ]*\{\s*$", bs) or bs.endswith(" {"): + depth += 1 + elif re.match(r"^\s*\}\s*$", bs): + depth -= 1 + blk.append(bl) + i += 1 + blocks.append(blk) + continue + if not started: + prefix.append(line) + else: + suffix.append(line) + i += 1 + return prefix, blocks, suffix + + +def _normalize_block_internals(block_lines): + """For a Section block containing Relocations { ... } or a + Relocations { Section { ... 
} } block — sort inner relocation + entries by Offset.""" + # Find inner "Relocations [" or "Relocation {" sub-blocks and sort. + # Two cases this matters: + # 1) top-level Sections [] entries don't carry their own relocs in + # llvm-readobj's macho output — relocs live under Relocations []. + # 2) Relocations [ Section { Relocation { ... } ... } ] + return block_lines + + +def normalize(text): + # First apply per-line strips. + lines = [] + for raw in text.splitlines(keepends=True): + out = _strip_line(raw) + if out is None: + continue + # Drop SectionData byte-row column offsets like "0000:" — keep + # only the ASCII bytes. SectionData rows are positional; they + # render the same content for both files at the same offsets. + m = _SECTION_DATA_ROW.match(out) + if m: + # Keep just the hex+ascii portion after the column tag so + # whitespace variations don't matter. + out = re.sub(r"^\s*[0-9A-Fa-f]{4}:\s+", " ", out) + lines.append(out) + + # Now scan top-level "Symbols [" and "Relocations [" blocks and + # sort their inner "Symbol {" / "Section { ... Relocation { ... } }" + # entries. + out_lines = [] + i = 0 + n = len(lines) + while i < n: + line = lines[i] + m_sym = re.match(r"^Symbols\s*\[\s*$", line.rstrip("\n")) + m_rel = re.match(r"^Relocations\s*\[\s*$", line.rstrip("\n")) + if m_sym or m_rel: + out_lines.append(line) + i += 1 + # Collect everything until the matching closing "]" + inner = [] + depth = 1 + while i < n and depth > 0: + ln = lines[i] + s = ln.rstrip("\n") + if re.match(r"^[A-Za-z][\w]*\s*\[\s*$", s) or s.endswith(" ["): + depth += 1 + elif re.match(r"^\]\s*$", s): + depth -= 1 + if depth == 0: + break + inner.append(ln) + i += 1 + # Split inner into Symbol/Section/Relocation blocks and sort. + _, blocks, suffix = _split_top_blocks(inner) + if m_sym: + blocks.sort(key=lambda b: _block_key(b, ("Name",))) + else: + # Relocations [ Section { Relocation { Offset: N ... } } ] + # Sort the inner Relocation entries within each Section. 
+ new_blocks = [] + for blk in blocks: + p2, sub_blocks, suf2 = _split_top_blocks(blk[1:-1]) + sub_blocks.sort(key=lambda b: _block_key(b, ("Offset",))) + rebuilt = [blk[0]] + p2 + for sb in sub_blocks: + rebuilt.extend(sb) + rebuilt.extend(suf2) + rebuilt.append(blk[-1]) + new_blocks.append(rebuilt) + blocks = new_blocks + for b in blocks: + out_lines.extend(b) + out_lines.extend(suffix) + # Append the closing "]" line we left at lines[i] + if i < n: + out_lines.append(lines[i]) + i += 1 + else: + out_lines.append(line) + i += 1 + + return "".join(out_lines) + + +def main(argv): + if len(argv) < 2: + sys.stderr.write(__doc__) + return 2 + cmd = argv[1] + if cmd == "filter": + text = sys.stdin.read() + else: + # Treat the arg as a file path; run llvm-readobj on it. + text = _readobj(cmd) + sys.stdout.write(normalize(text)) + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/test/test.mk b/test/test.mk @@ -93,16 +93,23 @@ $(DEBUG_TEST_BIN): test/debug/roundtrip_unit.c $(LIB_AR) # void* to a function pointer, which pedantic rejects under C11. HARNESS_CFLAGS = -std=c11 -Wall -Wextra -Werror -isysroot $(SYSROOT) -Iinclude -Itest -ROUNDTRIP_BIN = build/test/cfree-roundtrip -LINK_EXE_RUNNER = build/test/link-exe-runner -JIT_RUNNER = build/test/jit-runner -PARSE_RUNNER = build/test/parse-runner +ROUNDTRIP_BIN = build/test/cfree-roundtrip +ROUNDTRIP_BIN_MACHO = build/test/cfree-roundtrip-macho +LINK_EXE_RUNNER = build/test/link-exe-runner +JIT_RUNNER = build/test/jit-runner +PARSE_RUNNER = build/test/parse-runner # cfree-roundtrip needs `-Isrc` for the internal obj.h surface it inspects. $(ROUNDTRIP_BIN): test/elf/cfree-roundtrip.c $(LIB_AR) @mkdir -p $(dir $@) $(CC) $(HARNESS_CFLAGS) -Isrc test/elf/cfree-roundtrip.c $(LIB_AR) -o $@ +# Mach-O peer of cfree-roundtrip — read_macho + emit_macho. Used by +# test-link's path R when CFREE_TEST_OBJ=macho. 
+$(ROUNDTRIP_BIN_MACHO): test/macho/cfree-roundtrip-macho.c $(LIB_AR) + @mkdir -p $(dir $@) + $(CC) $(HARNESS_CFLAGS) -Isrc test/macho/cfree-roundtrip-macho.c $(LIB_AR) -o $@ + $(LINK_EXE_RUNNER): test/link/harness/link_exe_runner.c $(LIB_AR) @mkdir -p $(dir $@) $(CC) $(HARNESS_CFLAGS) test/link/harness/link_exe_runner.c $(LIB_AR) -o $@ @@ -118,7 +125,7 @@ $(PARSE_RUNNER): test/parse/harness/parse_runner.c $(LIB_AR) test-elf: lib bin-soft $(ROUNDTRIP_BIN) bash test/elf/run.sh -test-link: lib $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER) +test-link: lib $(ROUNDTRIP_BIN) $(ROUNDTRIP_BIN_MACHO) $(LINK_EXE_RUNNER) $(JIT_RUNNER) bash test/link/run.sh test-cg: lib $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER)