commit f4828d2f04a2c393f74e136359bd15952323ac81
parent faac4a4e8df891391b8695fe464799333c1c5754
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 23:17:52 -0700
test: llvm differential cross-check (cfree as/disasm vs llvm-mc)
A second-oracle, byte-level cross-check against llvm (sidesteps disassembly-
text normalization, which founders on alias/format differences):
encode lane: assemble every aa64 encode/*.s with cfree as AND llvm-mc; the
.text bytes must match. Validates cfree's assembler. 17/17 agree.
disasm lane: cc -c bytes vs llvm-mc of cc -S. The -S text is cfree's
disassembly, so llvm re-encoding it to codegen's bytes confirms the decode
(catches a wrong decode cfree's own re-encode would repeat). The one benign
disagreement — cfree keeps a same-section CALL26/JUMP26 reloc that llvm-mc
resolves in place (link-equivalent) — is recognized via the reloc-table
diff and not flagged.
269 agree, 34 reloc-equivalent, 0 differ over the corpus at -O0/-O1. Opt-in
(test-diff-llvm); skips cleanly when llvm-mc is absent.
Diffstat:
3 files changed, 155 insertions(+), 3 deletions(-)
diff --git a/doc/ASM_ROUNDTRIP_TESTING.md b/doc/ASM_ROUNDTRIP_TESTING.md
@@ -116,9 +116,27 @@ the baseline (`bash test/asm/symmetry.sh --update`).
- **Other arches** — the symbolizer switches on aa64 reloc kinds, and the
branch-relaxation predicate lists only the aa64 branch kinds; x64/rv64 keep
the numeric `-S` output and current `as` behavior. Broaden per the
- RelocKind→syntax tables below.
-- **Differential** — add the llvm-mc / llvm-objdump differential lanes over the
- same `-S` output as a second-oracle cross-check.
+ RelocKind→syntax tables below. The self-symmetry sweep and llvm differential
+ are aa64-only too.
+
+### llvm differential (`test-diff-llvm`)
+
+A second-oracle cross-check against llvm (`test/asm/diff_llvm.sh`), byte-level
+so it sidesteps disassembly-text normalization (movz-vs-mov, `#16`-vs-`#0x10`):
+
+- **encode lane**: assemble every aa64 `test/asm/encode/*.s` with both `cfree as`
+ and `llvm-mc`; the `.text` bytes must match. Validates cfree's assembler.
+- **disasm lane**: `cfree cc -c` bytes vs `llvm-mc` of `cfree cc -S`. Since the
+ `-S` text *is* cfree's disassembly, llvm re-encoding it to codegen's bytes
+ confirms the decode — a *wrong* decode that cfree's own re-encode would repeat
+ is caught here. The one benign disagreement (cfree codegen keeps a CALL26/
+ JUMP26 reloc for a same-section call/branch to a defined local symbol, which
+ llvm-mc resolves in place — link-equivalent) is recognized by the reloc-table
+ diff and not flagged.
+
+Currently 269 agree, 34 reloc-equivalent, 0 differ over the corpus at -O0/-O1.
+Opt-in; skips cleanly when `llvm-mc` is absent. The host carries the
+aarch64/x86_64/riscv64 llvm tools.
## Background — what cfree can do today (verified)
diff --git a/test/asm/diff_llvm.sh b/test/asm/diff_llvm.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+# test/asm/diff_llvm.sh — differential cross-check of cfree against llvm (aa64).
+#
+# Two byte-level lanes (robust: no disassembly-text normalization, which would
+# founder on alias/format differences like movz-vs-mov or #16-vs-#0x10):
+#
+# encode lane: assemble every aa64 test/asm/encode/*.s with BOTH `cfree as`
+# and `llvm-mc`; the .text bytes must match. Validates cfree's assembler
+# against llvm-mc as a second oracle.
+#
+# disasm lane: for every test/asm/roundtrip/*.c, `cfree cc -c` gives codegen's
+# bytes and `cfree cc -S` gives cfree's disassembly as re-assemblable text;
+# assemble that text with llvm-mc and require the bytes to match codegen's.
+# If llvm agrees the -S text means the original bytes, cfree's disassembler
+# decoded them correctly — a decode differential that catches a *wrong*
+# decode (one a self-round-trip can't, since cfree's own re-encode would
+# repeat the mistake).
+#
+# Opt-in; requires llvm-mc (+ a cfree-readable object). Skips cleanly if the
+# tools are absent. See doc/ASM_ROUNDTRIP_TESTING.md.
+
+set -u
+
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+CFREE="$ROOT/build/cfree"
+ENCODE_DIR="$ROOT/test/asm/encode"
+RT_DIR="$ROOT/test/asm/roundtrip"
+WORK="$ROOT/build/test/asm/diff_llvm"
+TRIPLE="${CFREE_LLVM_TRIPLE:-aarch64-linux-gnu}"
+MATTR="${CFREE_LLVM_MATTR:-+lse,+v8.1a}"
+OPTS="${CFREE_TEST_OPTS:-O1}"
+
+LLVM_MC="${LLVM_MC:-$(command -v llvm-mc || echo /opt/homebrew/bin/llvm-mc)}"
+
+color_red() { printf '\033[31m%s\033[0m' "$1"; }
+color_grn() { printf '\033[32m%s\033[0m' "$1"; }
+color_yel() { printf '\033[33m%s\033[0m' "$1"; }
+
+if [ ! -x "$LLVM_MC" ]; then
+ printf 'diff-llvm: %s llvm-mc not found (set $LLVM_MC); skipping\n' \
+ "$(color_yel SKIP)"
+ exit 0
+fi
+if [ ! -x "$CFREE" ]; then
+ printf 'diff-llvm: %s cfree missing — run "make bin"\n' "$(color_red FATAL)" >&2
+ exit 2
+fi
+mkdir -p "$WORK"
+
+# Raw .text bytes via cfree objdump (same tool for both objects, so the
+# representation is identical regardless of which assembler produced the .o).
+raw() { "$CFREE" objdump -s -j .text "$1" 2>/dev/null | awk '/^ *[0-9a-f]+ /{print $2$3$4$5}'; }
+# .text relocation kinds+targets (offset omitted — it shifts when a sibling
+# reloc is relaxed). Used to recognize the one benign disagreement: cfree
+# codegen keeps a CALL26/JUMP26/CONDBR reloc for a same-section call/branch to
+# a defined local symbol, where llvm-mc (like GNU as) resolves it in place and
+# drops the reloc. Both link to the same bytes; only the relocatable form
+# differs. When that's the whole story the reloc tables differ, so we skip
+# rather than flag.
+text_relocs() { "$CFREE" objdump -r "$1" 2>/dev/null | awk '
+ /RELOCATION RECORDS FOR \[\.text\]/{f=1;next} /RELOCATION RECORDS FOR/{f=0}
+ f && /^[0-9a-f]/{print $2, $3}'; }
+mc() { "$LLVM_MC" -triple="$TRIPLE" -mattr="$MATTR" -filetype=obj "$1" -o "$2" 2>"$WORK/mc.err"; }
+
+agree=0; differ=0; reject=0; reloc_skip=0
+fails=()
+
+printf 'diff-llvm: encode lane (cfree as vs llvm-mc over the encode corpus)\n'
+shopt -s nullglob
+for s in "$ENCODE_DIR"/aa64_*.s; do
+ name="$(basename "$s" .s)"
+ tg="$ENCODE_DIR/$name.targets"
+ [ -f "$tg" ] && ! grep -qE 'aa64|aarch64|arm64' "$tg" && continue
+ "$CFREE" as -target "$TRIPLE" "$s" -o "$WORK/c.o" 2>/dev/null || continue
+ if ! mc "$s" "$WORK/l.o"; then
+ reject=$((reject+1))
+ printf ' %s %s: llvm-mc rejected (%s)\n' "$(color_yel SKIP)" "$name" \
+ "$(head -1 "$WORK/mc.err" | sed 's|.*error: *||')"
+ continue
+ fi
+ if [ "$(raw "$WORK/c.o")" = "$(raw "$WORK/l.o")" ]; then
+ agree=$((agree+1))
+ else
+ differ=$((differ+1)); fails+=("encode:$name")
+ printf ' %s %s: .text bytes differ\n' "$(color_red DIFF)" "$name"
+ fi
+done
+
+printf 'diff-llvm: disasm lane (cc -c bytes vs llvm-mc of cc -S, opts="%s")\n' "$OPTS"
+for src in "$RT_DIR"/*.c; do
+ name="$(basename "$src" .c)"
+ [ -e "$RT_DIR/$name.skip" ] && continue
+ for opt in $OPTS; do
+ "$CFREE" cc -c "-$opt" -target "$TRIPLE" "$src" -o "$WORK/cc.o" 2>/dev/null || continue
+ "$CFREE" cc -S "-$opt" -target "$TRIPLE" "$src" -o "$WORK/s.s" 2>/dev/null || continue
+ if ! mc "$WORK/s.s" "$WORK/l.o"; then
+ reject=$((reject+1))
+ printf ' %s %s[-%s]: llvm-mc rejected cc -S (%s)\n' "$(color_yel SKIP)" \
+ "$name" "$opt" "$(head -1 "$WORK/mc.err" | sed 's|.*error: *||')"
+ continue
+ fi
+ if [ "$(raw "$WORK/cc.o")" = "$(raw "$WORK/l.o")" ]; then
+ agree=$((agree+1))
+ elif [ "$(text_relocs "$WORK/cc.o")" != "$(text_relocs "$WORK/l.o")" ]; then
+ # Reloc tables differ: cfree kept a same-section call/branch reloc
+ # that llvm-mc resolved in place. Link-equivalent — not a decode bug.
+ reloc_skip=$((reloc_skip+1))
+ else
+ differ=$((differ+1)); fails+=("disasm:$name[-$opt]")
+ printf ' %s %s[-%s]: cc -c vs llvm-mc(cc -S) bytes differ (relocs match)\n' \
+ "$(color_red DIFF)" "$name" "$opt"
+ fi
+ done
+done
+shopt -u nullglob
+
+printf '\n'
+if [ "${#fails[@]}" -gt 0 ]; then
+ printf 'diff-llvm: %s %d agree, %d differ, %d reloc-equiv, %d llvm-skip\n' \
+ "$(color_red 'cfree disagrees with llvm')" "$agree" "$differ" "$reloc_skip" "$reject"
+ exit 1
+fi
+printf 'diff-llvm: %s %d agree, %d reloc-equiv, %d llvm-skip\n' \
+ "$(color_grn 'cfree agrees with llvm')" "$agree" "$reloc_skip" "$reject"
+exit 0
diff --git a/test/test.mk b/test/test.mk
@@ -41,6 +41,7 @@ TEST_TARGETS = \
test-asm-roundtrip \
test-asm-roundtrip-exec \
test-asm-symmetry \
+ test-diff-llvm \
test-bounce \
test-cbackend \
test-cg-api \
@@ -663,6 +664,14 @@ test-asm-roundtrip-exec: bin $(JIT_RUNNER)
test-asm-symmetry: $(ASM_RUNNER) $(AA64_SWEEP_GEN)
@bash test/asm/symmetry.sh
+# test-diff-llvm: differential cross-check of cfree against llvm (aa64), as a
+# second oracle. Encode lane: cfree as vs llvm-mc bytes over the encode corpus.
+# Disasm lane: cc -c bytes vs llvm-mc of cc -S (validates cfree's disassembler;
+# the benign same-section-call reloc-vs-resolve difference is recognized).
+# Opt-in; skips cleanly when llvm-mc is absent.
+test-diff-llvm: bin
+ @CFREE_TEST_OPTS="O0 O1" bash test/asm/diff_llvm.sh
+
test-wasm: test-wasm-front test-wasm-target test-wasm-toy
test-wasm-front: bin $(WASM_TOOL) $(LINK_EXE_RUNNER) $(JIT_RUNNER)