commit 9feae6c4c66cd1be6b1a90b22a6cad2e2ab43cb6
parent 74c6f214ef1dfd173158873ecd2893772e2a5b3a
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sun, 10 May 2026 12:59:35 -0700
obj/test: macho roundtrip — fix read/emit fidelity, wire test-link path R
Mach-O R path runs every applicable test/link/cases case (36 pass,
3 SKIP-NA for ELF-only `__start_/__stop_`, ELF TLS, kernel script).
The reader/writer fidelity bugs surfaced by structural diff:
- underscore mangling broke names without a `_` prefix
(`ltmp0` -> `_ltmp0`); names round-trip verbatim now
- undef external syms lost N_EXT (extern decls came back local)
- weak undef refs (N_WEAK_REF) weren't recognized — only N_WEAK_DEF
collapsed to SB_WEAK
- common syms dropped their size (n_value)
- section S_TYPE/S_ATTR_* bits were recomputed from SecKind instead
of using the preserved Section.ext_type
- zerofill sections layered inline; conventional Mach-O places them
at the segment tail, which the symbol n_values depend on
- n_desc pass-through bits (N_NO_DEAD_STRIP etc.) were lost. Added
obj_symbol_set_flags so readers can stash format-specific bits
the canonical bind/vis/kind triple doesn't model
Harness:
- test/test.mk: cfree-roundtrip-macho Make target; test-link
depends on it
- test/link/run.sh: dispatch ROUNDTRIP_BIN + NORMALIZE by
CFREE_TEST_OBJ; new have_dump covers llvm-readelf / llvm-readobj;
per-case `targets` file gates ELF-only cases as SKIP-NA
- test/macho/normalize.py: structural diff via llvm-readobj
Diffstat:
11 files changed, 420 insertions(+), 31 deletions(-)
diff --git a/doc/linker-status.md b/doc/linker-status.md
@@ -20,6 +20,7 @@ live in `test/link/` — they are not duplicated in `test/elf/`.
| `test-link` E (aa64) | 37 | 0 | qemu/podman aarch64 exec, incl. IFUNC |
| `test-link` J (aa64) | 38 | 0 | JIT in-process incl. GC subgroup, IFUNC, TLS |
| `test-link` R (rv64) | 38 | 0 | object roundtrip via cfree-roundtrip |
+| `test-link` R (aa64-macho) | 36 | 0 | Mach-O object roundtrip via cfree-roundtrip-macho (3 cases SKIP-NA: ELF-only) |
| `test-link` E (rv64) | 38 | 0 | qemu/podman riscv64 exec, incl. IFUNC + TLS |
| `test-link` bad | 2 | 0 | `bad/30_undef_strong` (E + J) |
| `test-musl` | 6 | 0 | musl 1.2.5 static + dynamic: syscall, errno, printf |
diff --git a/src/obj/macho_emit.c b/src/obj/macho_emit.c
@@ -249,8 +249,18 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) {
name_to_seg_sect(nm ? nm : "", (u32)nlen, s->kind, &m->ns);
m->obj_sec = i;
m->align = s->align ? s->align : 1;
- m->flags = section_flags_for(s->kind, s->flags, m->ns.sectname,
- m->ns.sect_len);
+ /* Mach-O reader stashes the raw section.flags (S_TYPE | S_ATTR_*)
+ * in Section.ext_type when reading a Mach-O input. Use it
+ * verbatim so attribute bits like S_ATTR_NO_DEAD_STRIP /
+ * S_ATTR_LIVE_SUPPORT round-trip. Fall back to the kind-derived
+ * default for sections originating from non-Mach-O readers (e.g.
+ * cfree codegen). */
+ if (s->ext_kind == OBJ_EXT_MACHO && s->ext_type) {
+ m->flags = s->ext_type;
+ } else {
+ m->flags = section_flags_for(s->kind, s->flags, m->ns.sectname,
+ m->ns.sect_len);
+ }
if (s->sem == SSEM_NOBITS || s->kind == SEC_BSS) {
m->is_zerofill = 1;
m->size = s->bss_size;
@@ -268,10 +278,24 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) {
/* ---- pass 2: assign vmaddrs (segment-relative) and per-section
* flat-layout addresses. MH_OBJECT keeps everything in
* one segment with vmaddr=0; section addr fields are
- * relative offsets within the segment. */
+ * relative offsets within the segment.
+ *
+ * Two-pass to match the conventional Mach-O `MH_OBJECT` layout:
+ * non-zerofill sections come first in vmaddr order, then zerofill
+ * sections at the tail. Apple `as` and clang `-c` both lay out
+ * this way, and roundtripping must reproduce it so symbol n_values
+ * (which are segment-relative addresses) compare equal. */
u64 cur_addr = 0;
for (u32 i = 0; i < nsecs; ++i) {
MSec* m = &secs[i];
+ if (m->is_zerofill) continue;
+ cur_addr = ALIGN_UP(cur_addr, (u64)m->align);
+ m->addr = cur_addr;
+ cur_addr += m->size;
+ }
+ for (u32 i = 0; i < nsecs; ++i) {
+ MSec* m = &secs[i];
+ if (!m->is_zerofill) continue;
cur_addr = ALIGN_UP(cur_addr, (u64)m->align);
m->addr = cur_addr;
cur_addr += m->size;
@@ -320,14 +344,16 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) {
size_t nlen;
const char* nm = pool_str(c->global, s->name, &nlen);
- /* Mach-O convention: C symbols carry a leading underscore on
- * disk. Apple toolchains rely on this for the linker
- * "_main"-vs-"main" entry point and for every libSystem call.
- * Round-tripped via read_macho, which strips one leading `_`. */
+ /* Mach-O symbol names are stored on disk verbatim — including
+ * the leading `_` Apple toolchains use for C-source-level
+ * symbols ("_main" for `int main()`). The cfree path treats
+ * that prefix as part of the on-disk name, not a transform
+ * applied at emit; a future Mach-O codegen frontend can
+ * prepend the underscore itself the same way LLVM's MCSymbol
+ * does via target.MCAsmInfo. Round-tripping is then byte-for-
+ * byte: emit writes what read sees. */
if (nlen && nm) {
- u8 us = '_';
u32 off = buf_pos(&strtab);
- buf_write(&strtab, &us, 1);
buf_write(&strtab, nm, nlen);
u8 z = 0;
buf_write(&strtab, &z, 1);
@@ -348,6 +374,11 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) {
if (undef) {
type |= N_UNDF;
+ /* Undefined symbols with non-LOCAL bind are external references
+ * (the common case — every `extern int x;`). Setting N_EXT
+ * matches what clang emits and what Apple `ld` expects. */
+ if (s->bind == SB_GLOBAL || s->bind == SB_WEAK) type |= N_EXT;
+ if (s->bind == SB_WEAK) n_desc |= N_WEAK_REF;
value = 0;
} else if (s->kind == SK_ABS) {
type |= N_ABS;
@@ -368,6 +399,13 @@ void emit_macho(Compiler* c, ObjBuilder* ob, Writer* w) {
if (s->bind == SB_WEAK) n_desc |= N_WEAK_DEF;
}
+ /* OR in any pass-through n_desc bits the reader stashed in
+ * sym->flags (N_NO_DEAD_STRIP, etc.). The bits we already
+ * compute (N_WEAK_DEF / N_WEAK_REF and the common-alignment
+ * field) are already excluded by read_macho before stashing,
+ * so a plain OR can't double-count. */
+ n_desc |= s->flags;
+
ms->n_type = type;
ms->n_sect = n_sect;
ms->n_desc = n_desc;
diff --git a/src/obj/macho_read.c b/src/obj/macho_read.c
@@ -236,13 +236,10 @@ ObjBuilder* read_macho(Compiler* c, const char* name, const u8* data,
nm = (const char*)(strtab + strx);
while (strx + nlen < strsize && nm[nlen]) ++nlen;
}
- /* Inverse of the leading-underscore prefix emit_macho applies to C
- * symbols. If the Mach-O name starts with `_`, strip it so the
- * builder holds the source-level name. */
- if (nlen && nm[0] == '_') {
- ++nm;
- --nlen;
- }
+ /* Mach-O names round-trip verbatim — the leading `_` Apple
+ * toolchains apply to C symbols is part of the on-disk name as
+ * far as ObjBuilder is concerned. Mirrors the no-transform
+ * decision in emit_macho. */
Sym sn = nlen ? pool_intern(c->global, nm, nlen) : 0;
u8 type_field = (u8)(n_type & N_TYPE);
@@ -250,12 +247,16 @@ ObjBuilder* read_macho(Compiler* c, const char* name, const u8* data,
u8 pext = (u8)(n_type & N_PEXT);
u16 bind = ext ? SB_GLOBAL : SB_LOCAL;
- if (ext && (n_desc & N_WEAK_DEF)) bind = SB_WEAK;
+ /* Weak DEFs (defined symbols) carry N_WEAK_DEF; weak REFs (undef
+ * `__attribute__((weak))` references) carry N_WEAK_REF. Either
+ * one collapses to SB_WEAK in the cfree model. */
+ if (ext && (n_desc & (N_WEAK_DEF | N_WEAK_REF))) bind = SB_WEAK;
u8 vis = pext ? SV_HIDDEN : SV_DEFAULT;
u16 kind;
ObjSecId sec_id = OBJ_SEC_NONE;
u64 value = 0;
+ u64 size = 0;
u64 cmnalign = 0;
if (type_field == N_UNDF) {
@@ -264,6 +265,7 @@ ObjBuilder* read_macho(Compiler* c, const char* name, const u8* data,
* GET_COMM_ALIGN bits. */
kind = SK_COMMON;
value = 0;
+ size = n_value;
u32 la = (u32)((n_desc >> 8) & 0xf);
cmnalign = 1u << la;
} else {
@@ -289,7 +291,17 @@ ObjBuilder* read_macho(Compiler* c, const char* name, const u8* data,
}
ObjSymId id = obj_symbol_ex(ob, sn, (SymBind)bind, (SymVis)vis,
- (SymKind)kind, sec_id, value, 0, cmnalign);
+ (SymKind)kind, sec_id, value, size, cmnalign);
+ /* n_desc carries Mach-O attribute bits beyond what bind/vis/kind
+ * model — N_NO_DEAD_STRIP, N_REF_TO_WEAK, N_ARM_THUMB_DEF, etc.
+ * Mask off the bits we already round-trip via bind (N_WEAK_DEF /
+ * N_WEAK_REF) and the alignment field for commons (which lives
+ * in cmnalign), then stash the remainder so emit_macho can OR it
+ * back in. */
+ u16 desc_pass = n_desc;
+ desc_pass &= (u16)~(N_WEAK_DEF | N_WEAK_REF);
+ if (kind == SK_COMMON) desc_pass &= 0x00ff; /* drop align field */
+ if (desc_pass) obj_symbol_set_flags(ob, id, desc_pass);
sym_macho_to_obj[i] = id;
}
diff --git a/src/obj/obj.c b/src/obj/obj.c
@@ -226,6 +226,14 @@ void obj_symbol_define(ObjBuilder* ob, ObjSymId id, ObjSecId section_id,
if (s->kind == SK_UNDEF) s->kind = SK_OBJ;
}
+/* Replace the flag bits stored on symbol `id` with `flags`.
+ * The previous value is overwritten, not OR-ed into. Does nothing
+ * when `id` is OBJ_SYM_NONE or when the symbol lookup yields no
+ * entry. */
+void obj_symbol_set_flags(ObjBuilder* ob, ObjSymId id, u16 flags) {
+  if (id != OBJ_SYM_NONE) {
+    ObjSym* sym = Symbols_at(&ob->symbols, id);
+    if (sym) sym->flags = flags;
+  }
+}
+
void obj_reloc(ObjBuilder* ob, ObjSecId section_id, u32 offset, RelocKind kind,
ObjSymId sym, i64 addend) {
obj_reloc_ex(ob, section_id, offset, kind, sym, addend, 1, 0);
diff --git a/src/obj/obj.h b/src/obj/obj.h
@@ -334,6 +334,15 @@ void obj_finalize(ObjBuilder*);
void obj_set_elf_e_flags(ObjBuilder*, u32 e_flags);
int obj_get_elf_e_flags(const ObjBuilder*, u32* out);
+/* Per-symbol format-specific flag bits. ObjSym.flags is otherwise
+ * unused; readers stash format-specific attribute bits there so the
+ * matching emitter can re-apply them. Today this is Mach-O n_desc
+ * pass-through (N_NO_DEAD_STRIP, etc.) — bits the canonical
+ * ObjSym.bind/vis/kind triple doesn't model. ELF callers are free
+ * to use the same field for their own pass-through; the contract is
+ * "bits go in / same bits come out", not a shared semantic. */
+void obj_symbol_set_flags(ObjBuilder*, ObjSymId, u16 flags);
+
/* ---- read side (linker, file emitters, objdump) ---- */
u32 obj_section_count(const ObjBuilder*);
const Section* obj_section_get(const ObjBuilder*, ObjSecId id);
diff --git a/test/link/cases/25h_gc_start_stop/targets b/test/link/cases/25h_gc_start_stop/targets
@@ -0,0 +1,3 @@
+aa64-elf
+rv64-elf
+x64-elf
diff --git a/test/link/cases/31_tls_local_exec/targets b/test/link/cases/31_tls_local_exec/targets
@@ -0,0 +1,3 @@
+aa64-elf
+rv64-elf
+x64-elf
diff --git a/test/link/cases/35_linker_script_kernel/targets b/test/link/cases/35_linker_script_kernel/targets
@@ -0,0 +1,2 @@
+aa64-elf
+rv64-elf
diff --git a/test/link/run.sh b/test/link/run.sh
@@ -57,8 +57,10 @@ ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
TEST_DIR="$ROOT/test/link"
BUILD_DIR="$ROOT/build/test"
LIB_AR="$ROOT/build/libcfree.a"
-ROUNDTRIP_BIN="$ROOT/build/test/cfree-roundtrip"
-NORMALIZE="$ROOT/test/elf/normalize.py"
+ROUNDTRIP_BIN_ELF="$ROOT/build/test/cfree-roundtrip"
+ROUNDTRIP_BIN_MACHO="$ROOT/build/test/cfree-roundtrip-macho"
+NORMALIZE_ELF="$ROOT/test/elf/normalize.py"
+NORMALIZE_MACHO="$ROOT/test/macho/normalize.py"
LINK_EXE_RUNNER="$BUILD_DIR/link-exe-runner"
JIT_RUNNER="$BUILD_DIR/jit-runner"
@@ -82,6 +84,8 @@ case "$CFREE_TEST_OBJ" in
x64) CLANG_TRIPLE=x86_64-linux-gnu ;;
rv64) CLANG_TRIPLE=riscv64-linux-gnu ;;
esac
+ ROUNDTRIP_BIN="$ROUNDTRIP_BIN_ELF"
+ NORMALIZE="$NORMALIZE_ELF"
;;
macho)
EXEC_OS=macos
@@ -90,6 +94,8 @@ case "$CFREE_TEST_OBJ" in
x64) CLANG_TRIPLE=x86_64-apple-macos ;;
rv64) printf 'CFREE_TEST_OBJ=macho has no rv64 target\n' >&2; exit 2 ;;
esac
+ ROUNDTRIP_BIN="$ROUNDTRIP_BIN_MACHO"
+ NORMALIZE="$NORMALIZE_MACHO"
;;
*) printf 'unknown CFREE_TEST_OBJ=%s\n' "$CFREE_TEST_OBJ" >&2; exit 2 ;;
esac
@@ -144,6 +150,18 @@ fi
command -v llvm-readelf >/dev/null 2>&1 && have_readelf=1
command -v readelf >/dev/null 2>&1 && have_readelf=1
command -v python3 >/dev/null 2>&1 && have_python3=1
+have_readobj=0
+command -v llvm-readobj >/dev/null 2>&1 && have_readobj=1
+command -v readobj >/dev/null 2>&1 && have_readobj=1
+# Path R needs the right dump tool for the obj format. ELF wants
+# llvm-readelf; Mach-O wants llvm-readobj. The harness exposes a
+# single have_dump flag so the per-case skip logic doesn't have to
+# branch on CFREE_TEST_OBJ.
+have_dump=0
+case "$CFREE_TEST_OBJ" in
+ elf) [ $have_readelf -eq 1 ] && have_dump=1 ;;
+ macho) [ $have_readobj -eq 1 ] && have_dump=1 ;;
+esac
# Prefer llvm-ar for archive creation: Apple's /usr/bin/ar requires
# Mach-O members and silently drops ELF objects (leaving only a SYMDEF
# stub), which breaks the cross-target archive cases here.
@@ -246,10 +264,29 @@ E_GC_PRESENT_LIST=()
# ---- per-case loop ---------------------------------------------------------
+CUR_TUPLE="${TEST_ARCH}-${CFREE_TEST_OBJ}"
+
for case_dir in "$TEST_DIR/cases"/*/; do
[ -d "$case_dir" ] || continue
name="$(basename "$case_dir")"
[ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && continue
+ # Per-case applicability: a `targets` file lists the <arch>-<obj>
+ # tuples the case applies to (one per line, or whitespace-separated).
+ # Cases with no `targets` file run on every tuple. Filtered cases
+ # print a SKIP-NA line and don't count against pass/fail/skip — they
+ # exercise target-specific features with no analogue elsewhere
+ # (e.g. ELF __start_/__stop_ boundary syms have no Mach-O peer;
+ # ELF TLS local-exec relocs differ fundamentally from Mach-O TLVP).
+ if [ -f "$case_dir/targets" ]; then
+ applicable=0
+ for tuple in $(cat "$case_dir/targets"); do
+ [ "$tuple" = "$CUR_TUPLE" ] && applicable=1
+ done
+ if [ $applicable -eq 0 ]; then
+ printf ' %s %s — N/A on %s\n' "$(color_yel SKIP-NA)" "$name" "$CUR_TUPLE"
+ continue
+ fi
+ fi
work="$BUILD_DIR/link/$name"
mkdir -p "$work"
@@ -399,7 +436,7 @@ for case_dir in "$TEST_DIR/cases"/*/; do
# ---- Path R: roundtrip --------------------------------------------------
if [ $jit_only -eq 0 ] && [ $RUN_R -eq 1 ] && [ $kernel_image -eq 0 ]; then
- if [ $have_roundtrip -eq 1 ] && [ $have_readelf -eq 1 ] && [ $have_python3 -eq 1 ]; then
+ if [ $have_roundtrip -eq 1 ] && [ $have_dump -eq 1 ] && [ $have_python3 -eq 1 ]; then
t0=$(now_ms)
r_ok=1
for obj in "${rt_obj_files[@]}"; do
@@ -408,10 +445,20 @@ for case_dir in "$TEST_DIR/cases"/*/; do
if ! "$ROUNDTRIP_BIN" "$obj" "$rt" 2>"$work/rt_${base}.err"; then
r_ok=0; break
fi
- "$READELF_BIN" -aW "$obj" | python3 "$NORMALIZE" filter \
- >"$work/${base}_golden.norm" 2>/dev/null
- "$READELF_BIN" -aW "$rt" | python3 "$NORMALIZE" filter \
- >"$work/${base}_rt.norm" 2>/dev/null
+ # ELF: pipe `readelf -aW` through normalize.py filter.
+ # Mach-O: normalize.py runs llvm-readobj on the file
+ # itself (it knows the right flag set).
+ if [ "$CFREE_TEST_OBJ" = "macho" ]; then
+ python3 "$NORMALIZE" "$obj" \
+ >"$work/${base}_golden.norm" 2>/dev/null
+ python3 "$NORMALIZE" "$rt" \
+ >"$work/${base}_rt.norm" 2>/dev/null
+ else
+ "$READELF_BIN" -aW "$obj" | python3 "$NORMALIZE" filter \
+ >"$work/${base}_golden.norm" 2>/dev/null
+ "$READELF_BIN" -aW "$rt" | python3 "$NORMALIZE" filter \
+ >"$work/${base}_rt.norm" 2>/dev/null
+ fi
if ! diff -u "$work/${base}_golden.norm" \
"$work/${base}_rt.norm" \
>"$work/${base}_diff.txt" 2>&1; then
@@ -422,7 +469,7 @@ for case_dir in "$TEST_DIR/cases"/*/; do
if [ $r_ok -eq 1 ]; then note_pass "$name/R (${dt}ms)"
else note_fail "$name/R"; fi
else
- note_skip "$name/R" "missing roundtrip/readelf/python3"
+ note_skip "$name/R" "missing roundtrip/dump-tool/python3"
fi
fi
diff --git a/test/macho/normalize.py b/test/macho/normalize.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""Canonicalize llvm-readobj output for Mach-O so two .o files with
+equivalent semantic content compare equal.
+
+Mach-O peer of test/elf/normalize.py — the structural-fidelity pivot
+the path-R harness uses for CFREE_TEST_OBJ=macho. Differs from the
+ELF normalizer because:
+
+ * llvm-readobj's Mach-O output uses an indented "Section { ... }"
+ block format, not a one-line-per-row layout. The normalizer is a
+ block transformer rather than a line transformer.
+ * Mach-O symbol-name strtab packing order is implementation-defined
+ (clang packs externals before locals; cfree packs in symtab
+ order). The "(NN)" strx hint after each Name field is dropped so
+ those orderings compare equal.
+ * Section/segment name fields print with a trailing hex byte dump
+ (Name: __text (5F 5F 74 65 78 74 ...)). The hex tail is dropped
+ — the printable name already captures it.
+ * File offsets, RelocationOffset, addresses inside SectionData
+ headers are layout-dependent and dropped.
+
+Strips:
+ - "(NN)" strtab offsets after Name: fields
+ - hex byte dumps after section/segment names
+ - "Index: N" (section index — re-derivable from order)
+ - "Offset: N" (file offset)
+ - "RelocationOffset: 0xN"
+ - "Address: 0xN" (where the section is positioned within the segment)
+ - the SectionData hex dump line numbers — content is what matters
+ - the "File:" header line (path varies between golden and rt)
+
+Sorts:
+ - Symbol entries within Symbols [] by Name (clang and cfree partition
+ differently within the same DYSYMTAB extents)
+ - Relocations within each Relocations { Section { ... } } by Offset
+
+Invocation:
+ normalize.py <file> — runs `llvm-readobj` on file, normalizes.
+ normalize.py filter — reads stdin, writes normalized to stdout.
+"""
+import re
+import shutil
+import subprocess
+import sys
+
+
+def _which(*names):
+    """Return the PATH location of the first entry of `names` that
+    shutil.which resolves, or None when none of them resolve.
+    Candidates are probed in order and probing stops at the first hit."""
+    hits = (shutil.which(candidate) for candidate in names)
+    return next((hit for hit in hits if hit), None)
+
+
+def _readobj(path):
+    """Run llvm-readobj on `path` and return its stdout as text.
+
+    Requests section headers, section data, relocations and symbols.
+    The tool's stderr is forwarded to this process's stderr.
+
+    Exits with status 77 when neither `llvm-readobj` nor `readobj` is on
+    PATH. Exits with status 1 when the tool itself reports failure: a
+    failed dump has empty or truncated stdout, and two such dumps would
+    otherwise normalize to the same text and look like a matching pair.
+    """
+    bin_path = _which("llvm-readobj", "readobj")
+    if not bin_path:
+        sys.stderr.write("normalize.py: cannot find llvm-readobj\n")
+        sys.exit(77)
+    args = [
+        bin_path,
+        "--section-headers",
+        "--section-data",
+        "--relocations",
+        "--symbols",
+        path,
+    ]
+    res = subprocess.run(args, capture_output=True, text=True)
+    sys.stderr.write(res.stderr)
+    if res.returncode != 0:
+        # Do not hand a partial dump to normalize(); make the failure
+        # visible through the exit status instead.
+        sys.stderr.write(
+            "normalize.py: %s exited with status %d on %s\n"
+            % (bin_path, res.returncode, path)
+        )
+        sys.exit(1)
+    return res.stdout
+
+
+# "Name: foo (NN)" — strip the strtab-offset hint
+# Group 1 is the field plus its value; the value must be a non-empty
+# run of non-whitespace (\S+) for the pattern to match.
+_NAME_STRX = re.compile(r"^(\s*Name:\s+\S+)\s+\(\d+\)\s*$")
+# "Name: __text (5F 5F ...)" / "Segment: __TEXT (5F 5F ...)" — strip hex tail
+_NAME_HEXBYTES = re.compile(
+ r"^(\s*(?:Name|Segment):\s+\S+)\s+\([0-9A-Fa-f ]+\)\s*$"
+)
+# Lines whose value is layout-dependent and not load-bearing for fidelity.
+# _strip_line applies this to every line, so an `Offset:` field is
+# removed wherever it appears — normalize() runs that strip before it
+# asks _block_key for an `Offset` sort key.
+_DROP_FIELD_RE = re.compile(
+ r"^\s*(Index|Offset|RelocationOffset|Address|File):\s"
+)
+# SectionData hex rows: " 0000: 00008052 ..."
+# Matches exactly four hex digits before the colon.
+_SECTION_DATA_ROW = re.compile(r"^\s*[0-9A-Fa-f]{4}:\s+")
+# Block headers we use for sort scoping.
+# NOTE(review): nothing in this file references _BLOCK_HDR.
+_BLOCK_HDR = re.compile(r"^(\s*)(\w[\w]*)\s+(\[|\{)\s*$")
+
+
+def _strip_line(line):
+    """Canonicalize one line of llvm-readobj output.
+
+    Returns None for a field matched by _DROP_FIELD_RE (the caller
+    omits the line). For a `Name:` / `Segment:` line carrying a trailing
+    "(...)" decoration matched by _NAME_STRX or _NAME_HEXBYTES, returns
+    the field and its value followed by a newline. Any other line is
+    returned unchanged.
+    """
+    text = line.rstrip("\n")
+    if _DROP_FIELD_RE.match(text):
+        return None
+    # _NAME_STRX is tried first, matching the original precedence.
+    for decorated in (_NAME_STRX, _NAME_HEXBYTES):
+        hit = decorated.match(text)
+        if hit:
+            return hit.group(1) + "\n"
+    return line
+
+
+def _block_key(block_lines, prefer_keys=("Name", "Offset")):
+    """Return the sort key for a block of lines.
+
+    Fields are tried in `prefer_keys` order; for each one the block is
+    scanned top to bottom for a "<field>: <value>" line, and the first
+    value found (surrounding whitespace trimmed) is returned. Returns
+    the empty string when no listed field occurs in the block.
+    """
+    for field in prefer_keys:
+        field_re = re.compile(r"^\s*" + re.escape(field) + r":\s+(.+?)\s*$")
+        for text in block_lines:
+            found = field_re.match(text)
+            if found:
+                return found.group(1)
+    return ""
+
+
+def _split_top_blocks(lines):
+ """Walk lines at one indent level; group into blocks bounded by
+ matching '{...}'. Returns (prefix, blocks, suffix) where blocks is
+ a list of [lines] each starting with 'Foo {' and ending '}'.
+
+ A block start is a line that is exactly `<Word> {` with Word one of
+ Symbol, Section or Relocation. A header that carries anything
+ between the keyword and the brace (for example `Section __text {`)
+ does not match the start pattern and is treated as an ordinary line.
+
+ Ordinary lines seen before the first block go to `prefix`; every
+ ordinary line seen after a block has started goes to `suffix`, so
+ lines that sat between two blocks come back after all blocks."""
+ prefix, suffix = [], []
+ blocks = []
+ i = 0
+ n = len(lines)
+ started = False
+ base_indent = None
+ while i < n:
+ line = lines[i]
+ stripped = line.rstrip("\n")
+ # Detect a "Symbol {" or "Section {" or "Relocation {" block start.
+ m = re.match(r"^(\s*)(\w+)\s+\{\s*$", stripped)
+ if m and m.group(2) in ("Symbol", "Section", "Relocation"):
+ # The first header seen fixes the indent that counts as "top level".
+ if base_indent is None:
+ base_indent = len(m.group(1))
+ indent = len(m.group(1))
+ # NOTE(review): presumably the whole collection step below is
+ # nested under this indent check — confirm against the real file.
+ if indent == base_indent:
+ started = True
+ # Collect until the matching '}'
+ depth = 1
+ blk = [line]
+ i += 1
+ while i < n and depth > 0:
+ bl = lines[i]
+ bs = bl.rstrip("\n")
+ # Any "<words> {" line, or any line ending in " {", opens a nested
+ # level; a line holding only "}" closes one.
+ if re.match(r"^\s*\w[\w ]*\{\s*$", bs) or bs.endswith(" {"):
+ depth += 1
+ elif re.match(r"^\s*\}\s*$", bs):
+ depth -= 1
+ blk.append(bl)
+ i += 1
+ blocks.append(blk)
+ continue
+ # Not part of a collected block: file under prefix or suffix.
+ if not started:
+ prefix.append(line)
+ else:
+ suffix.append(line)
+ i += 1
+ return prefix, blocks, suffix
+
+
+def _normalize_block_internals(block_lines):
+ """Placeholder: returns `block_lines` unchanged.
+
+ Meant for sorting the relocation entries inside a Section /
+ Relocations block by Offset, but no sorting is implemented here.
+ normalize() handles relocation blocks inline and nothing in this
+ file calls this function."""
+ # Find inner "Relocations [" or "Relocation {" sub-blocks and sort.
+ # Two cases this matters:
+ # 1) top-level Sections [] entries don't carry their own relocs in
+ # llvm-readobj's macho output — relocs live under Relocations [].
+ # 2) Relocations [ Section { Relocation { ... } ... } ]
+ return block_lines
+
+
+def normalize(text):
+    """Return the canonical form of llvm-readobj output `text`.
+
+    Pass 1 runs every line through _strip_line, omitting lines it
+    rejects, and replaces the "NNNN:" column tag on SectionData rows
+    with fixed padding.
+
+    Pass 2 rewrites each top-level "Symbols [" / "Relocations [" list.
+    Symbol blocks are sorted by Name. For each block found under
+    Relocations, its nested blocks are sorted on their `Offset` key.
+    Lines inside a list that _split_top_blocks does not recognize as
+    part of a block are kept: those ahead of the first block are
+    emitted before the blocks, the rest after them. They used to be
+    discarded when they preceded the first block, which removed list
+    content whose headers are not bare `Section {` / `Relocation {`
+    lines (e.g. `Section __text {`) from the comparison entirely.
+    """
+    # Pass 1: per-line strips.
+    lines = []
+    for raw in text.splitlines(keepends=True):
+        out = _strip_line(raw)
+        if out is None:
+            continue
+        # SectionData rows are positional, so the "0000:" column tag
+        # carries no information of its own; swap it for fixed padding
+        # and keep the bytes that follow.
+        if _SECTION_DATA_ROW.match(out):
+            out = re.sub(r"^\s*[0-9A-Fa-f]{4}:\s+", " ", out)
+        lines.append(out)
+
+    # Pass 2: canonicalize the order of entries inside the top-level
+    # "Symbols [" and "Relocations [" lists.
+    out_lines = []
+    i = 0
+    n = len(lines)
+    while i < n:
+        line = lines[i]
+        m_sym = re.match(r"^Symbols\s*\[\s*$", line.rstrip("\n"))
+        m_rel = re.match(r"^Relocations\s*\[\s*$", line.rstrip("\n"))
+        if m_sym or m_rel:
+            out_lines.append(line)
+            i += 1
+            # Collect everything up to, but not including, the
+            # matching column-0 "]".
+            inner = []
+            depth = 1
+            while i < n and depth > 0:
+                ln = lines[i]
+                s = ln.rstrip("\n")
+                if re.match(r"^[A-Za-z][\w]*\s*\[\s*$", s) or s.endswith(" ["):
+                    depth += 1
+                elif re.match(r"^\]\s*$", s):
+                    depth -= 1
+                    if depth == 0:
+                        break
+                inner.append(ln)
+                i += 1
+            # Keep the prefix: it holds every line that came before the
+            # first recognized block, which can be the whole list.
+            prefix, blocks, suffix = _split_top_blocks(inner)
+            if m_sym:
+                blocks.sort(key=lambda b: _block_key(b, ("Name",)))
+            else:
+                # Relocations [ Section { Relocation { Offset: N ... } } ]
+                # Sort the nested entries within each outer block. Pass 1
+                # has already removed `Offset:` lines, so the key is
+                # empty and the stable sort keeps the input order.
+                new_blocks = []
+                for blk in blocks:
+                    p2, sub_blocks, suf2 = _split_top_blocks(blk[1:-1])
+                    sub_blocks.sort(key=lambda b: _block_key(b, ("Offset",)))
+                    rebuilt = [blk[0]] + p2
+                    for sb in sub_blocks:
+                        rebuilt.extend(sb)
+                    rebuilt.extend(suf2)
+                    rebuilt.append(blk[-1])
+                    new_blocks.append(rebuilt)
+                blocks = new_blocks
+            out_lines.extend(prefix)
+            for b in blocks:
+                out_lines.extend(b)
+            out_lines.extend(suffix)
+            # Append the closing "]" line we stopped on, if any.
+            if i < n:
+                out_lines.append(lines[i])
+                i += 1
+        else:
+            out_lines.append(line)
+            i += 1
+
+    return "".join(out_lines)
+
+
+def main(argv):
+    """Command-line entry point; returns the process exit status.
+
+    With no argument, writes the module docstring to stderr and
+    returns 2. With the literal argument "filter", normalizes text read
+    from stdin. Any other argument is taken as a file path and dumped
+    through _readobj first. The normalized text goes to stdout and the
+    function returns 0.
+    """
+    if len(argv) < 2:
+        sys.stderr.write(__doc__)
+        return 2
+    target = argv[1]
+    text = sys.stdin.read() if target == "filter" else _readobj(target)
+    sys.stdout.write(normalize(text))
+    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv))
diff --git a/test/test.mk b/test/test.mk
@@ -93,16 +93,23 @@ $(DEBUG_TEST_BIN): test/debug/roundtrip_unit.c $(LIB_AR)
# void* to a function pointer, which pedantic rejects under C11.
HARNESS_CFLAGS = -std=c11 -Wall -Wextra -Werror -isysroot $(SYSROOT) -Iinclude -Itest
-ROUNDTRIP_BIN = build/test/cfree-roundtrip
-LINK_EXE_RUNNER = build/test/link-exe-runner
-JIT_RUNNER = build/test/jit-runner
-PARSE_RUNNER = build/test/parse-runner
+ROUNDTRIP_BIN = build/test/cfree-roundtrip
+ROUNDTRIP_BIN_MACHO = build/test/cfree-roundtrip-macho
+LINK_EXE_RUNNER = build/test/link-exe-runner
+JIT_RUNNER = build/test/jit-runner
+PARSE_RUNNER = build/test/parse-runner
# cfree-roundtrip needs `-Isrc` for the internal obj.h surface it inspects.
$(ROUNDTRIP_BIN): test/elf/cfree-roundtrip.c $(LIB_AR)
@mkdir -p $(dir $@)
$(CC) $(HARNESS_CFLAGS) -Isrc test/elf/cfree-roundtrip.c $(LIB_AR) -o $@
+# Mach-O peer of cfree-roundtrip — read_macho + emit_macho. Used by
+# test-link's path R when CFREE_TEST_OBJ=macho.
+$(ROUNDTRIP_BIN_MACHO): test/macho/cfree-roundtrip-macho.c $(LIB_AR)
+ @mkdir -p $(dir $@)
+ $(CC) $(HARNESS_CFLAGS) -Isrc test/macho/cfree-roundtrip-macho.c $(LIB_AR) -o $@
+
$(LINK_EXE_RUNNER): test/link/harness/link_exe_runner.c $(LIB_AR)
@mkdir -p $(dir $@)
$(CC) $(HARNESS_CFLAGS) test/link/harness/link_exe_runner.c $(LIB_AR) -o $@
@@ -118,7 +125,7 @@ $(PARSE_RUNNER): test/parse/harness/parse_runner.c $(LIB_AR)
test-elf: lib bin-soft $(ROUNDTRIP_BIN)
bash test/elf/run.sh
-test-link: lib $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER)
+test-link: lib $(ROUNDTRIP_BIN) $(ROUNDTRIP_BIN_MACHO) $(LINK_EXE_RUNNER) $(JIT_RUNNER)
bash test/link/run.sh
test-cg: lib $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER)