kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit f4828d2f04a2c393f74e136359bd15952323ac81
parent faac4a4e8df891391b8695fe464799333c1c5754
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 29 May 2026 23:17:52 -0700

test: llvm differential cross-check (cfree as/disasm vs llvm-mc)

A second-oracle, byte-level cross-check against llvm (sidesteps disassembly-
text normalization, which founders on alias/format differences):

  encode lane: assemble every aa64 encode/*.s with cfree as AND llvm-mc; the
    .text bytes must match. Validates cfree's assembler. 17/17 agree.
  disasm lane: cc -c bytes vs llvm-mc of cc -S. The -S text is cfree's
    disassembly, so llvm re-encoding it to codegen's bytes confirms the decode
    (catches a wrong decode cfree's own re-encode would repeat). The one benign
    disagreement — cfree keeps a same-section CALL26/JUMP26 reloc that llvm-mc
    resolves in place (link-equivalent) — is recognized via the reloc-table
    diff and not flagged.

269 agree, 34 reloc-equivalent, 0 differ over the corpus at -O0/-O1. Opt-in
(test-diff-llvm); skips cleanly when llvm-mc is absent.

Diffstat:
M doc/ASM_ROUNDTRIP_TESTING.md | 24 +++++++++++++++++++++---
A test/asm/diff_llvm.sh | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/test.mk | 9 +++++++++
3 files changed, 155 insertions(+), 3 deletions(-)

diff --git a/doc/ASM_ROUNDTRIP_TESTING.md b/doc/ASM_ROUNDTRIP_TESTING.md @@ -116,9 +116,27 @@ the baseline (`bash test/asm/symmetry.sh --update`). - **Other arches** — the symbolizer switches on aa64 reloc kinds, and the branch-relaxation predicate lists only the aa64 branch kinds; x64/rv64 keep the numeric `-S` output and current `as` behavior. Broaden per the - RelocKind→syntax tables below. -- **Differential** — add the llvm-mc / llvm-objdump differential lanes over the - same `-S` output as a second-oracle cross-check. + RelocKind→syntax tables below. The self-symmetry sweep and llvm differential + are aa64-only too. + +### llvm differential (`test-diff-llvm`) + +A second-oracle cross-check against llvm (`test/asm/diff_llvm.sh`), byte-level +so it sidesteps disassembly-text normalization (movz-vs-mov, `#16`-vs-`#0x10`): + +- **encode lane**: assemble every aa64 `test/asm/encode/*.s` with both `cfree as` + and `llvm-mc`; the `.text` bytes must match. Validates cfree's assembler. +- **disasm lane**: `cfree cc -c` bytes vs `llvm-mc` of `cfree cc -S`. Since the + `-S` text *is* cfree's disassembly, llvm re-encoding it to codegen's bytes + confirms the decode — a *wrong* decode that cfree's own re-encode would repeat + is caught here. The one benign disagreement (cfree codegen keeps a CALL26/ + JUMP26 reloc for a same-section call/branch to a defined local symbol, which + llvm-mc resolves in place — link-equivalent) is recognized by the reloc-table + diff and not flagged. + +Currently 269 agree, 34 reloc-equivalent, 0 differ over the corpus at -O0/-O1. +Opt-in; skips cleanly when `llvm-mc` is absent. The host carries the +aarch64/x86_64/riscv64 llvm tools. ## Background — what cfree can do today (verified) diff --git a/test/asm/diff_llvm.sh b/test/asm/diff_llvm.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# test/asm/diff_llvm.sh — differential cross-check of cfree against llvm (aa64). 
+# +# Two byte-level lanes (robust: no disassembly-text normalization, which would +# founder on alias/format differences like movz-vs-mov or #16-vs-#0x10): +# +# encode lane: assemble every aa64 test/asm/encode/*.s with BOTH `cfree as` +# and `llvm-mc`; the .text bytes must match. Validates cfree's assembler +# against llvm-mc as a second oracle. +# +# disasm lane: for every test/asm/roundtrip/*.c, `cfree cc -c` gives codegen's +# bytes and `cfree cc -S` gives cfree's disassembly as re-assemblable text; +# assemble that text with llvm-mc and require the bytes to match codegen's. +# If llvm agrees the -S text means the original bytes, cfree's disassembler +# decoded them correctly — a decode differential that catches a *wrong* +# decode (one a self-round-trip can't, since cfree's own re-encode would +# repeat the mistake). +# +# Opt-in; requires llvm-mc (+ a cfree-readable object). Skips cleanly if the +# tools are absent. See doc/ASM_ROUNDTRIP_TESTING.md. + +set -u + +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +CFREE="$ROOT/build/cfree" +ENCODE_DIR="$ROOT/test/asm/encode" +RT_DIR="$ROOT/test/asm/roundtrip" +WORK="$ROOT/build/test/asm/diff_llvm" +TRIPLE="${CFREE_LLVM_TRIPLE:-aarch64-linux-gnu}" +MATTR="${CFREE_LLVM_MATTR:-+lse,+v8.1a}" +OPTS="${CFREE_TEST_OPTS:-O1}" + +LLVM_MC="${LLVM_MC:-$(command -v llvm-mc || echo /opt/homebrew/bin/llvm-mc)}" + +color_red() { printf '\033[31m%s\033[0m' "$1"; } +color_grn() { printf '\033[32m%s\033[0m' "$1"; } +color_yel() { printf '\033[33m%s\033[0m' "$1"; } + +if [ ! -x "$LLVM_MC" ]; then + printf 'diff-llvm: %s llvm-mc not found (set $LLVM_MC); skipping\n' \ + "$(color_yel SKIP)" + exit 0 +fi +if [ ! -x "$CFREE" ]; then + printf 'diff-llvm: %s cfree missing — run "make bin"\n' "$(color_red FATAL)" >&2 + exit 2 +fi +mkdir -p "$WORK" + +# Raw .text bytes via cfree objdump (same tool for both objects, so the +# representation is identical regardless of which assembler produced the .o). 
+raw() { "$CFREE" objdump -s -j .text "$1" 2>/dev/null | awk '/^ *[0-9a-f]+ /{print $2$3$4$5}'; } +# .text relocation kinds+targets (offset omitted — it shifts when a sibling +# reloc is relaxed). Used to recognize the one benign disagreement: cfree +# codegen keeps a CALL26/JUMP26/CONDBR reloc for a same-section call/branch to +# a defined local symbol, where llvm-mc (like GNU as) resolves it in place and +# drops the reloc. Both link to the same bytes; only the relocatable form +# differs. When that's the whole story the reloc tables differ, so we skip +# rather than flag. +text_relocs() { "$CFREE" objdump -r "$1" 2>/dev/null | awk ' + /RELOCATION RECORDS FOR \[\.text\]/{f=1;next} /RELOCATION RECORDS FOR/{f=0} + f && /^[0-9a-f]/{print $2, $3}'; } +mc() { "$LLVM_MC" -triple="$TRIPLE" -mattr="$MATTR" -filetype=obj "$1" -o "$2" 2>"$WORK/mc.err"; } + +agree=0; differ=0; reject=0; reloc_skip=0 +fails=() + +printf 'diff-llvm: encode lane (cfree as vs llvm-mc over the encode corpus)\n' +shopt -s nullglob +for s in "$ENCODE_DIR"/aa64_*.s; do + name="$(basename "$s" .s)" + tg="$ENCODE_DIR/$name.targets" + [ -f "$tg" ] && ! grep -qE 'aa64|aarch64|arm64' "$tg" && continue + "$CFREE" as -target "$TRIPLE" "$s" -o "$WORK/c.o" 2>/dev/null || continue + if ! 
mc "$s" "$WORK/l.o"; then + reject=$((reject+1)) + printf ' %s %s: llvm-mc rejected (%s)\n' "$(color_yel SKIP)" "$name" \ + "$(head -1 "$WORK/mc.err" | sed 's|.*error: *||')" + continue + fi + if [ "$(raw "$WORK/c.o")" = "$(raw "$WORK/l.o")" ]; then + agree=$((agree+1)) + else + differ=$((differ+1)); fails+=("encode:$name") + printf ' %s %s: .text bytes differ\n' "$(color_red DIFF)" "$name" + fi +done + +printf 'diff-llvm: disasm lane (cc -c bytes vs llvm-mc of cc -S, opts="%s")\n' "$OPTS" +for src in "$RT_DIR"/*.c; do + name="$(basename "$src" .c)" + [ -e "$RT_DIR/$name.skip" ] && continue + for opt in $OPTS; do + "$CFREE" cc -c "-$opt" -target "$TRIPLE" "$src" -o "$WORK/cc.o" 2>/dev/null || continue + "$CFREE" cc -S "-$opt" -target "$TRIPLE" "$src" -o "$WORK/s.s" 2>/dev/null || continue + if ! mc "$WORK/s.s" "$WORK/l.o"; then + reject=$((reject+1)) + printf ' %s %s[-%s]: llvm-mc rejected cc -S (%s)\n' "$(color_yel SKIP)" \ + "$name" "$opt" "$(head -1 "$WORK/mc.err" | sed 's|.*error: *||')" + continue + fi + if [ "$(raw "$WORK/cc.o")" = "$(raw "$WORK/l.o")" ]; then + agree=$((agree+1)) + elif [ "$(text_relocs "$WORK/cc.o")" != "$(text_relocs "$WORK/l.o")" ]; then + # Reloc tables differ: cfree kept a same-section call/branch reloc + # that llvm-mc resolved in place. Link-equivalent — not a decode bug. 
+ reloc_skip=$((reloc_skip+1)) + else + differ=$((differ+1)); fails+=("disasm:$name[-$opt]") + printf ' %s %s[-%s]: cc -c vs llvm-mc(cc -S) bytes differ (relocs match)\n' \ + "$(color_red DIFF)" "$name" "$opt" + fi + done +done +shopt -u nullglob + +printf '\n' +if [ "${#fails[@]}" -gt 0 ]; then + printf 'diff-llvm: %s %d agree, %d differ, %d reloc-equiv, %d llvm-skip\n' \ + "$(color_red 'cfree disagrees with llvm')" "$agree" "$differ" "$reloc_skip" "$reject" + exit 1 +fi +printf 'diff-llvm: %s %d agree, %d reloc-equiv, %d llvm-skip\n' \ + "$(color_grn 'cfree agrees with llvm')" "$agree" "$reloc_skip" "$reject" +exit 0 diff --git a/test/test.mk b/test/test.mk @@ -41,6 +41,7 @@ TEST_TARGETS = \ test-asm-roundtrip \ test-asm-roundtrip-exec \ test-asm-symmetry \ + test-diff-llvm \ test-bounce \ test-cbackend \ test-cg-api \ @@ -663,6 +664,14 @@ test-asm-roundtrip-exec: bin $(JIT_RUNNER) test-asm-symmetry: $(ASM_RUNNER) $(AA64_SWEEP_GEN) @bash test/asm/symmetry.sh +# test-diff-llvm: differential cross-check of cfree against llvm (aa64), as a +# second oracle. Encode lane: cfree as vs llvm-mc bytes over the encode corpus. +# Disasm lane: cc -c bytes vs llvm-mc of cc -S (validates cfree's disassembler; +# the benign same-section-call reloc-vs-resolve difference is recognized). +# Opt-in; skips cleanly when llvm-mc is absent. +test-diff-llvm: bin + @CFREE_TEST_OPTS="O0 O1" bash test/asm/diff_llvm.sh + test-wasm: test-wasm-front test-wasm-target test-wasm-toy test-wasm-front: bin $(WASM_TOOL) $(LINK_EXE_RUNNER) $(JIT_RUNNER)