boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit bbed6357a8627b47f9fc30f48f60fb5e63d38a88
parent 684090878187d2a4704d4bcac4aadd7f6f8f4a67
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 29 Apr 2026 18:09:11 -0700

m1-symbols.py annotate disassembly, add :_text_end to m1pp+scheme

Diffstat:
M M1pp/M1pp.P1 | 5 +++++
M scheme1/scheme1.P1pp | 5 +++++
A scripts/m1-symbols.py | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 185 insertions(+), 0 deletions(-)

diff --git a/M1pp/M1pp.P1 b/M1pp/M1pp.P1 @@ -6481,6 +6481,11 @@ DEFINE OFF_local_lookup_scratch 0052850000000000 li_a1 %1 %0 syscall +## Sentinel: marks the boundary between executable text and rodata. Read by +## scripts/disasm-elf.sh (via scripts/m1-symbols.py) to bound disassembly +## so trailing strings don't decode as bogus instructions. +:_text_end + ## --- Rodata: const tokens (for tok_eq_const) and fatal messages -------------- :const_macro "%macro" diff --git a/scheme1/scheme1.P1pp b/scheme1/scheme1.P1pp @@ -6262,6 +6262,11 @@ %tail(&apply) }) +# Sentinel: marks the boundary between executable text and rodata. +# Read by scripts/disasm-elf.sh (via scripts/m1-symbols.py) to bound +# disassembly so trailing strings don't decode as bogus instructions. +:_text_end + # Surface names. Length is hard-coded at the call site; no NUL needed # because intern takes (ptr, len). Padding bytes are written as # '00...' quoted-literal form (not bare `00 00`) -- the riscv64 stage0 diff --git a/scripts/m1-symbols.py b/scripts/m1-symbols.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +"""m1-symbols.py — extract labels from a hex2 file and annotate disassembly. + +The hex2 produced by M0 in this project uses a small grammar: + + :LABEL declares LABEL at the current byte offset (zero bytes) + &LABEL 4-byte absolute address reference (low 32 bits of LABEL) + 'XXXX... raw byte literal; (len(token)-1)/2 bytes (no closing quote) + XXXX... bare hex; len(token)/2 bytes + # ... comment to end-of-line + +Label addresses = BASE + HEADER + cumulative byte offset, where BASE is the +ELF load address (0x600000) and HEADER is the size of the on-disk ELF header ++ phdr that sits in front of M0's output (0x78 bytes for our seed ELF). + +Subcommands: + + map <prog.hex2> [--base 0x600000] [--header 0x78] [--ref-size 4] + Print "0xADDR LABEL" lines, sorted by address. 
+ + annotate <map> + Read llvm-objdump output on stdin; inject "ADDR <LABEL>:" headers and + rewrite "<PT_LOAD#0+0xNNN>" xrefs to "<LABEL>" or "<LABEL+N>". Writes + annotated output to stdout. +""" + +import argparse +import re +import sys + + +def parse_hex2(path, ref_size=4): + """Yield (offset, label) pairs in declaration order.""" + offset = 0 + seen_unknown = set() + with open(path) as f: + for lineno, line in enumerate(f, 1): + line = line.split('#', 1)[0] + for tok in line.split(): + if tok.startswith(':'): + yield offset, tok[1:] + elif tok.startswith('&'): + offset += ref_size + elif tok.startswith("'"): + offset += (len(tok) - 1) // 2 + elif re.fullmatch(r'[0-9A-Fa-f]+', tok): + offset += len(tok) // 2 + else: + # Unknown token: warn once per kind, skip. Keeps a + # malformed line from dropping the whole label map. + sig = tok[:1] + if sig not in seen_unknown: + seen_unknown.add(sig) + sys.stderr.write( + f"{path}:{lineno}: skipping unrecognized token " + f"{tok!r} (subsequent same-prefix tokens silenced)\n" + ) + + +def cmd_map(args): + items = [] + for off, name in parse_hex2(args.hex2, ref_size=args.ref_size): + items.append((args.base + args.header + off, name)) + items.sort() + for addr, name in items: + print(f"0x{addr:08x} {name}") + + +def load_map(path): + """Return {addr: [labels]} (multiple labels can share an address).""" + by_addr = {} + with open(path) as f: + for line in f: + line = line.strip() + if not line or line.startswith('#'): + continue + addr_str, name = line.split() + addr = int(addr_str, 0) + by_addr.setdefault(addr, []).append(name) + return by_addr + + +def nearest_label(by_addr, sorted_addrs, target): + """Find label whose address is the largest <= target. Returns + (label, offset) or None.""" + # Binary search for rightmost addr <= target. 
+ lo, hi = 0, len(sorted_addrs) + while lo < hi: + mid = (lo + hi) // 2 + if sorted_addrs[mid] <= target: + lo = mid + 1 + else: + hi = mid + if lo == 0: + return None + addr = sorted_addrs[lo - 1] + return by_addr[addr][0], target - addr + + +# Matches ` 600078: f94003e0 ldr ...` — leading spaces, hex addr, colon. +LINE_ADDR_RE = re.compile(r'^(\s+)([0-9a-f]+):\s') + +# Matches xref like `0x600088 <PT_LOAD#0+0x88>` in the asm column. +XREF_RE = re.compile(r'0x([0-9a-f]+)\s+<PT_LOAD#0\+0x[0-9a-f]+>') + +# Matches the synthetic section header objdump emits for our seed ELF +# (no real section table): `0000000000600000 <PT_LOAD#0>:`. We skip +# these in favor of the labels we inject ourselves. +SECTION_HEADER_RE = re.compile(r'^[0-9a-f]+\s+<PT_LOAD#0>:\s*$') + + +def cmd_annotate(args): + by_addr = load_map(args.map) + sorted_addrs = sorted(by_addr) + + def rewrite_xref(m): + target = int(m.group(1), 16) + hit = nearest_label(by_addr, sorted_addrs, target) + if hit is None: + return m.group(0) + label, delta = hit + suffix = f"+0x{delta:x}" if delta else "" + return f"0x{target:x} <{label}{suffix}>" + + skip_blank = False + for raw in sys.stdin: + line = raw.rstrip('\n') + # Drop the synthetic <PT_LOAD#0>: section header and the blank + # line that follows it; our injected labels carry the same info. 
+ if SECTION_HEADER_RE.match(line): + skip_blank = True + continue + if skip_blank: + skip_blank = False + if line == '': + continue + m = LINE_ADDR_RE.match(line) + if m: + addr = int(m.group(2), 16) + if addr in by_addr: + for name in by_addr[addr]: + print(f"\n{addr:016x} <{name}>:") + line = XREF_RE.sub(rewrite_xref, line) + print(line) + + +def main(): + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + sub = ap.add_subparsers(dest='cmd', required=True) + + p_map = sub.add_parser('map', help='emit address->label map') + p_map.add_argument('hex2') + p_map.add_argument('--base', type=lambda s: int(s, 0), default=0x600000) + p_map.add_argument('--header', type=lambda s: int(s, 0), default=0x78) + p_map.add_argument('--ref-size', type=int, default=4) + p_map.set_defaults(func=cmd_map) + + p_an = sub.add_parser('annotate', help='inject labels into objdump output') + p_an.add_argument('map') + p_an.set_defaults(func=cmd_annotate) + + args = ap.parse_args() + args.func(args) + + +if __name__ == '__main__': + try: + main() + except BrokenPipeError: + # Downstream pipe (e.g. head) closed; exit cleanly. + try: + sys.stdout.close() + except BrokenPipeError: + pass