commit bbed6357a8627b47f9fc30f48f60fb5e63d38a88
parent 684090878187d2a4704d4bcac4aadd7f6f8f4a67
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 29 Apr 2026 18:09:11 -0700
m1-symbols.py annotate disassembly, add :_text_end to m1pp+scheme
Diffstat:
3 files changed, 185 insertions(+), 0 deletions(-)
diff --git a/M1pp/M1pp.P1 b/M1pp/M1pp.P1
@@ -6481,6 +6481,11 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
li_a1 %1 %0
syscall
+## Sentinel: marks the boundary between executable text and rodata. Read by
+## scripts/disasm-elf.sh (via scripts/m1-symbols.py) to bound disassembly
+## so trailing strings don't decode as bogus instructions.
+:_text_end
+
## --- Rodata: const tokens (for tok_eq_const) and fatal messages --------------
:const_macro "%macro"
diff --git a/scheme1/scheme1.P1pp b/scheme1/scheme1.P1pp
@@ -6262,6 +6262,11 @@
%tail(&apply)
})
+# Sentinel: marks the boundary between executable text and rodata.
+# Read by scripts/disasm-elf.sh (via scripts/m1-symbols.py) to bound
+# disassembly so trailing strings don't decode as bogus instructions.
+:_text_end
+
# Surface names. Length is hard-coded at the call site; no NUL needed
# because intern takes (ptr, len). Padding bytes are written as
# '00...' quoted-literal form (not bare `00 00`) -- the riscv64 stage0
diff --git a/scripts/m1-symbols.py b/scripts/m1-symbols.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""m1-symbols.py — extract labels from a hex2 file and annotate disassembly.
+
+The hex2 produced by M0 in this project uses a small grammar:
+
+ :LABEL declares LABEL at the current byte offset (zero bytes)
+ &LABEL 4-byte absolute address reference (low 32 bits of LABEL)
+ 'XXXX... raw byte literal; (len(token)-1)/2 bytes (no closing quote)
+ XXXX... bare hex; len(token)/2 bytes
+ # ... comment to end-of-line
+
+Label addresses = BASE + HEADER + cumulative byte offset, where BASE is the
+ELF load address (0x600000) and HEADER is the size of the on-disk ELF header
++ phdr that sits in front of M0's output (0x78 bytes for our seed ELF).
+
+Subcommands:
+
+ map <prog.hex2> [--base 0x600000] [--header 0x78] [--ref-size 4]
+ Print "0xADDR LABEL" lines, sorted by address.
+
+ annotate <map>
+ Read llvm-objdump output on stdin; inject "ADDR <LABEL>:" headers and
+ rewrite "<PT_LOAD#0+0xNNN>" xrefs to "<LABEL>" or "<LABEL+0xN>". Writes
+ annotated output to stdout.
+"""
+
+import argparse
+import re
+import sys
+
+
def parse_hex2(path, ref_size=4):
    """Generate (offset, label) pairs for every `:LABEL` token in *path*.

    Pairs come out in the order the labels are declared. `offset` is the
    number of bytes the preceding tokens contribute:

      &LABEL    ref_size bytes
      'XXXX...  (len(token) - 1) // 2 bytes
      XXXX...   len(token) // 2 bytes (bare hex digits)

    Everything from `#` to end-of-line is ignored. Any other token adds no
    bytes; it is reported on stderr the first time its leading character
    is seen and silently skipped after that.
    """
    bare_hex = re.compile(r'[0-9A-Fa-f]+')
    cursor = 0
    warned_prefixes = set()
    with open(path) as src:
        for lineno, text in enumerate(src, 1):
            code = text.partition('#')[0]
            for tok in code.split():
                # split() never yields empty strings, so tok[0] is safe.
                lead = tok[0]
                if lead == ':':
                    yield cursor, tok[1:]
                    continue
                if lead == '&':
                    cursor += ref_size
                    continue
                if lead == "'":
                    cursor += (len(tok) - 1) // 2
                    continue
                if bare_hex.fullmatch(tok):
                    cursor += len(tok) // 2
                    continue
                # Unrecognized token: skip it rather than abort, so one
                # malformed line does not lose the whole label map. Warn
                # only once per leading character.
                if lead in warned_prefixes:
                    continue
                warned_prefixes.add(lead)
                sys.stderr.write(
                    f"{path}:{lineno}: skipping unrecognized token "
                    f"{tok!r} (subsequent same-prefix tokens silenced)\n"
                )
+
+
def cmd_map(args):
    """Print one "0xADDR LABEL" line per label found in args.hex2.

    Each address is args.base + args.header + the label's byte offset.
    Lines are sorted as (address, label) tuples, so labels sharing an
    address come out in name order.
    """
    origin = args.base + args.header
    rows = sorted(
        (origin + off, label)
        for off, label in parse_hex2(args.hex2, ref_size=args.ref_size)
    )
    for addr, label in rows:
        print(f"0x{addr:08x} {label}")
+
+
def load_map(path):
    """Read an "ADDR LABEL" map file into {addr: [labels]}.

    Blank lines and lines starting with `#` are skipped. ADDR is parsed
    with int(..., 0), so a `0x` prefix is honoured. Labels that share an
    address are kept in file order. A line that does not split into
    exactly two whitespace-separated fields raises ValueError.
    """
    table = {}
    with open(path) as fh:
        for entry in (raw.strip() for raw in fh):
            if entry == '' or entry.startswith('#'):
                continue
            addr_text, label = entry.split()
            key = int(addr_text, 0)
            if key in table:
                table[key].append(label)
            else:
                table[key] = [label]
    return table
+
+
def nearest_label(by_addr, sorted_addrs, target):
    """Find the label whose address is the largest one <= target.

    by_addr maps address -> list of labels; sorted_addrs is the same set
    of addresses in ascending order. When several labels share the chosen
    address, the first one in its list is used.

    Returns (label, offset), where offset is target minus the label's
    address, or None if every address is greater than target (including
    the case of an empty map).
    """
    import bisect

    # bisect_right gives the insertion point after any entry equal to
    # target, so the element just before it is the rightmost addr <= target.
    idx = bisect.bisect_right(sorted_addrs, target)
    if idx == 0:
        return None
    addr = sorted_addrs[idx - 1]
    return by_addr[addr][0], target - addr
+
+
# Matches a disassembly line such as ` 600078: f94003e0 ldr ...`:
# leading whitespace (group 1), a lowercase-hex address (group 2), a colon,
# then at least one whitespace character. cmd_annotate uses group 2 to
# decide where to inject label headers.
LINE_ADDR_RE = re.compile(r'^(\s+)([0-9a-f]+):\s')

# Matches an xref like `0x600088 <PT_LOAD#0+0x88>` in the asm column.
# Group 1 is the absolute target address without the `0x` prefix; the
# offset inside the angle brackets is matched but not captured.
XREF_RE = re.compile(r'0x([0-9a-f]+)\s+<PT_LOAD#0\+0x[0-9a-f]+>')

# Matches the synthetic section header objdump emits for our seed ELF
# (no real section table): `0000000000600000 <PT_LOAD#0>:`. We skip
# these in favor of the labels we inject ourselves.
SECTION_HEADER_RE = re.compile(r'^[0-9a-f]+\s+<PT_LOAD#0>:\s*$')
+
+
def cmd_annotate(args):
    """Copy disassembly from stdin to stdout, adding label information.

    The label map is loaded from args.map. For every input line:

      * a synthetic `<PT_LOAD#0>:` section header is dropped, together
        with one blank line directly after it;
      * if the line starts with an address that has labels, a blank line
        and an "ADDR <LABEL>:" header is printed for each label first;
      * `0xADDR <PT_LOAD#0+0x...>` xrefs are rewritten to name the nearest
        label at or below ADDR, with a `+0xN` suffix when ADDR is not
        exactly on the label. Xrefs below every label are left untouched.
    """
    labels_at = load_map(args.map)
    ordered = sorted(labels_at)

    def relabel(match):
        dest = int(match.group(1), 16)
        found = nearest_label(labels_at, ordered, dest)
        if found is None:
            return match.group(0)
        name, off = found
        if off:
            return f"0x{dest:x} <{name}+0x{off:x}>"
        return f"0x{dest:x} <{name}>"

    after_header = False
    for raw in sys.stdin:
        text = raw.rstrip('\n')
        if SECTION_HEADER_RE.match(text):
            # Our injected labels carry the same information.
            after_header = True
            continue
        # Only the single line right after a dropped header is eligible
        # for blank-line suppression.
        follows_header, after_header = after_header, False
        if follows_header and text == '':
            continue
        hit = LINE_ADDR_RE.match(text)
        if hit is not None:
            here = int(hit.group(2), 16)
            for name in labels_at.get(here, ()):
                print(f"\n{here:016x} <{name}>:")
        print(XREF_RE.sub(relabel, text))
+
+
def main():
    """Parse the command line and run the selected subcommand.

    Two subcommands exist: `map` (hex2 file -> address/label lines) and
    `annotate` (label map + objdump text on stdin -> annotated text). The
    module docstring is shown verbatim as the --help description.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    commands = parser.add_subparsers(dest='cmd', required=True)

    map_cmd = commands.add_parser('map', help='emit address->label map')
    map_cmd.add_argument('hex2')
    # --base and --header accept any int literal form (e.g. 0x600000).
    for flag, fallback in (('--base', 0x600000), ('--header', 0x78)):
        map_cmd.add_argument(flag, type=lambda s: int(s, 0), default=fallback)
    map_cmd.add_argument('--ref-size', type=int, default=4)
    map_cmd.set_defaults(func=cmd_map)

    annotate_cmd = commands.add_parser(
        'annotate', help='inject labels into objdump output')
    annotate_cmd.add_argument('map')
    annotate_cmd.set_defaults(func=cmd_annotate)

    opts = parser.parse_args()
    opts.func(opts)
+
+
if __name__ == '__main__':
    try:
        main()
        # Flush while still inside the try. Output that fits in stdout's
        # buffer is otherwise only written at interpreter shutdown, where
        # a closed pipe is reported as "Exception ignored ...
        # BrokenPipeError" instead of reaching the handler below.
        sys.stdout.flush()
    except BrokenPipeError:
        # Downstream pipe (e.g. head) closed; exit cleanly.
        try:
            # close() retries the flush and may hit the broken pipe again.
            sys.stdout.close()
        except BrokenPipeError:
            pass