m1-symbols.py annotate disassembly, add :_text_end to m1pp+scheme - boot2

commit bbed6357a8627b47f9fc30f48f60fb5e63d38a88
parent 684090878187d2a4704d4bcac4aadd7f6f8f4a67
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 29 Apr 2026 18:09:11 -0700

m1-symbols.py annotate disassembly, add :_text_end to m1pp+scheme

Diffstat:
M M1pp/M1pp.P1  | 5 +++++
M scheme1/scheme1.P1pp  | 5 +++++
A scripts/m1-symbols.py  | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 185 insertions(+), 0 deletions(-)
diff --git a/M1pp/M1pp.P1 b/M1pp/M1pp.P1
@@ -6481,6 +6481,11 @@ DEFINE OFF_local_lookup_scratch 0052850000000000
     li_a1 %1 %0
     syscall
 
+## Sentinel: marks the boundary between executable text and rodata. Read by
+## scripts/disasm-elf.sh (via scripts/m1-symbols.py) to bound disassembly
+## so trailing strings don't decode as bogus instructions.
+:_text_end
+
 ## --- Rodata: const tokens (for tok_eq_const) and fatal messages --------------
 
 :const_macro "%macro"
diff --git a/scheme1/scheme1.P1pp b/scheme1/scheme1.P1pp
@@ -6262,6 +6262,11 @@
     %tail(&apply)
 })
 
+# Sentinel: marks the boundary between executable text and rodata.
+# Read by scripts/disasm-elf.sh (via scripts/m1-symbols.py) to bound
+# disassembly so trailing strings don't decode as bogus instructions.
+:_text_end
+
 # Surface names. Length is hard-coded at the call site; no NUL needed
 # because intern takes (ptr, len). Padding bytes are written as
 # '00...' quoted-literal form (not bare `00 00`) -- the riscv64 stage0
diff --git a/scripts/m1-symbols.py b/scripts/m1-symbols.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""m1-symbols.py — extract labels from a hex2 file and annotate disassembly.
+
+The hex2 produced by M0 in this project uses a small grammar:
+
+  :LABEL          declares LABEL at the current byte offset (zero bytes)
+  &LABEL          4-byte absolute address reference (low 32 bits of LABEL)
+  'XXXX...        raw byte literal; (len(token)-1)/2 bytes (no closing quote)
+  XXXX...         bare hex; len(token)/2 bytes
+  # ...           comment to end-of-line
+
+Label addresses = BASE + HEADER + cumulative byte offset, where BASE is the
+ELF load address (0x600000) and HEADER is the size of the on-disk ELF header
++ phdr that sits in front of M0's output (0x78 bytes for our seed ELF).
+
+Subcommands:
+
+  map <prog.hex2> [--base 0x600000] [--header 0x78] [--ref-size 4]
+      Print "0xADDR LABEL" lines, sorted by address.
+
+  annotate <map>
+      Read llvm-objdump output on stdin; inject "ADDR <LABEL>:" headers and
+      rewrite "<PT_LOAD#0+0xNNN>" xrefs to "<LABEL>" or "<LABEL+N>". Writes
+      annotated output to stdout.
+"""
+
+import argparse
+import re
+import sys
+
+
+def parse_hex2(path, ref_size=4):
+    """Yield (offset, label) pairs in declaration order."""
+    offset = 0
+    seen_unknown = set()
+    with open(path) as f:
+        for lineno, line in enumerate(f, 1):
+            line = line.split('#', 1)[0]
+            for tok in line.split():
+                if tok.startswith(':'):
+                    yield offset, tok[1:]
+                elif tok.startswith('&'):
+                    offset += ref_size
+                elif tok.startswith("'"):
+                    offset += (len(tok) - 1) // 2
+                elif re.fullmatch(r'[0-9A-Fa-f]+', tok):
+                    offset += len(tok) // 2
+                else:
+                    # Unknown token: warn once per kind, skip. Keeps a
+                    # malformed line from dropping the whole label map.
+                    sig = tok[:1]
+                    if sig not in seen_unknown:
+                        seen_unknown.add(sig)
+                        sys.stderr.write(
+                            f"{path}:{lineno}: skipping unrecognized token "
+                            f"{tok!r} (subsequent same-prefix tokens silenced)\n"
+                        )
+
+
+def cmd_map(args):
+    items = []
+    for off, name in parse_hex2(args.hex2, ref_size=args.ref_size):
+        items.append((args.base + args.header + off, name))
+    items.sort()
+    for addr, name in items:
+        print(f"0x{addr:08x} {name}")
+
+
+def load_map(path):
+    """Return {addr: [labels]} (multiple labels can share an address)."""
+    by_addr = {}
+    with open(path) as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            addr_str, name = line.split()
+            addr = int(addr_str, 0)
+            by_addr.setdefault(addr, []).append(name)
+    return by_addr
+
+
+def nearest_label(by_addr, sorted_addrs, target):
+    """Find label whose address is the largest <= target. Returns
+    (label, offset) or None."""
+    # Binary search for rightmost addr <= target.
+    lo, hi = 0, len(sorted_addrs)
+    while lo < hi:
+        mid = (lo + hi) // 2
+        if sorted_addrs[mid] <= target:
+            lo = mid + 1
+        else:
+            hi = mid
+    if lo == 0:
+        return None
+    addr = sorted_addrs[lo - 1]
+    return by_addr[addr][0], target - addr
+
+
+# Matches `  600078: f94003e0     ldr ...` — leading spaces, hex addr, colon.
+LINE_ADDR_RE = re.compile(r'^(\s+)([0-9a-f]+):\s')
+
+# Matches xref like `0x600088 <PT_LOAD#0+0x88>` in the asm column.
+XREF_RE = re.compile(r'0x([0-9a-f]+)\s+<PT_LOAD#0\+0x[0-9a-f]+>')
+
+# Matches the synthetic section header objdump emits for our seed ELF
+# (no real section table): `0000000000600000 <PT_LOAD#0>:`. We skip
+# these in favor of the labels we inject ourselves.
+SECTION_HEADER_RE = re.compile(r'^[0-9a-f]+\s+<PT_LOAD#0>:\s*$')
+
+
+def cmd_annotate(args):
+    by_addr = load_map(args.map)
+    sorted_addrs = sorted(by_addr)
+
+    def rewrite_xref(m):
+        target = int(m.group(1), 16)
+        hit = nearest_label(by_addr, sorted_addrs, target)
+        if hit is None:
+            return m.group(0)
+        label, delta = hit
+        suffix = f"+0x{delta:x}" if delta else ""
+        return f"0x{target:x} <{label}{suffix}>"
+
+    skip_blank = False
+    for raw in sys.stdin:
+        line = raw.rstrip('\n')
+        # Drop the synthetic <PT_LOAD#0>: section header and the blank
+        # line that follows it; our injected labels carry the same info.
+        if SECTION_HEADER_RE.match(line):
+            skip_blank = True
+            continue
+        if skip_blank:
+            skip_blank = False
+            if line == '':
+                continue
+        m = LINE_ADDR_RE.match(line)
+        if m:
+            addr = int(m.group(2), 16)
+            if addr in by_addr:
+                for name in by_addr[addr]:
+                    print(f"\n{addr:016x} <{name}>:")
+        line = XREF_RE.sub(rewrite_xref, line)
+        print(line)
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    sub = ap.add_subparsers(dest='cmd', required=True)
+
+    p_map = sub.add_parser('map', help='emit address->label map')
+    p_map.add_argument('hex2')
+    p_map.add_argument('--base', type=lambda s: int(s, 0), default=0x600000)
+    p_map.add_argument('--header', type=lambda s: int(s, 0), default=0x78)
+    p_map.add_argument('--ref-size', type=int, default=4)
+    p_map.set_defaults(func=cmd_map)
+
+    p_an = sub.add_parser('annotate', help='inject labels into objdump output')
+    p_an.add_argument('map')
+    p_an.set_defaults(func=cmd_annotate)
+
+    args = ap.parse_args()
+    args.func(args)
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except BrokenPipeError:
+        # Downstream pipe (e.g. head) closed; exit cleanly.
+        try:
+            sys.stdout.close()
+        except BrokenPipeError:
+            pass

	boot2 Playing with the bootstrap
	git clone https://git.ryansepassi.com/git/boot2.git
	Log \| Files \| Refs \| README

M	M1pp/M1pp.P1	\|	5	+++++
M	scheme1/scheme1.P1pp	\|	5	+++++
A	scripts/m1-symbols.py	\|	175	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++