boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

m1-symbols.py (12562B)


      1 #!/usr/bin/env python3
      2 """m1-symbols.py — extract labels from a hex2/hex2pp file and annotate disassembly.
      3 
      4 The expanded hex2pp produced by M1pp in this project uses a small grammar:
      5 
      6   :LABEL          declares LABEL at the current byte offset (zero bytes)
      7   !LABEL          1-byte relative label reference
      8   @LABEL/$LABEL   2-byte label reference
      9   ~LABEL          3-byte relative label reference
     10   %LABEL/&LABEL   ptrsize-byte label reference (4 by default)
     11   XXXX...         bare hex; len(token)/2 bytes
     12   .align N        pads to N-byte boundary
     13   .fill N B       emits N copies of B
     14   # ... / ; ...   comment to end-of-line
     15 
     16 Legacy prog.hex2 files from the seed M0 path are still accepted; their
     17 'XXXX... raw byte literals are counted as before.
     18 
     19 Label addresses = BASE + HEADER + cumulative byte offset, where BASE is the
     20 ELF load address (0x600000) and HEADER is the size of the on-disk ELF header
     21 + phdr that sits in front of expanded hex2pp/M0 output (0x78 bytes for our
     22 seed ELF).
     23 
     24 Subcommands:
     25 
     26   map <expanded.hex2pp|prog.hex2> [--base 0x600000] [--header 0x78]
     27       Print "0xADDR LABEL" lines, sorted by address.
     28 
     29   annotate <map>
      Read llvm-objdump output on stdin; inject "ADDR <LABEL>:" headers and
      rewrite "<PT_LOAD#0+0xNNN>" xrefs to "<LABEL>" or "<LABEL+0xN>". Writes
      annotated output to stdout.
     33 
     34   lookup [--elf ELF | --hex2 HEX2 | --map MAP] ADDR [ADDR ...]
     35       For each address, print "0xADDR LABEL+0xN" (or just "LABEL" when
     36       delta=0). Pairs with `%trace`'s stderr output: paste a trace
     37       address and get the enclosing function. With --elf, locates the
     38       sibling <ELF>.workdir sidecar (written by boot-build-p1*.sh) to
     39       find expanded.hex2pp or legacy prog.hex2, so you don't have to
     40       know the .work path.
     41       Reads addresses from stdin if none are passed positionally.
     42 """
     43 
import argparse
import bisect
import re
import sys
     47 
     48 
     49 REF_WIDTH = {
     50     '!': 1,
     51     '@': 2,
     52     '$': 2,
     53     '~': 3,
     54 }
     55 
     56 
     57 def _parse_int(tok):
     58     return int(tok, 0)
     59 
     60 
     61 def parse_hex2(path, ref_size=4):
     62     """Yield (offset, label) pairs in declaration order."""
     63     offset = 0
     64     ptrsize = ref_size
     65     seen_unknown = set()
     66     with open(path) as f:
     67         for lineno, line in enumerate(f, 1):
     68             line = line.split('#', 1)[0].split(';', 1)[0]
     69             toks = line.split()
     70             i = 0
     71             while i < len(toks):
     72                 tok = toks[i]
     73                 if tok.startswith(':'):
     74                     yield offset, tok[1:]
     75                 elif tok in ('.scope', '.endscope'):
     76                     pass
     77                 elif tok == '.ptrsize':
     78                     if i + 1 >= len(toks):
     79                         sys.stderr.write(f"{path}:{lineno}: missing .ptrsize value\n")
     80                     else:
     81                         ptrsize = _parse_int(toks[i + 1])
     82                         i += 1
     83                 elif tok == '.align':
     84                     if i + 1 >= len(toks):
     85                         sys.stderr.write(f"{path}:{lineno}: missing .align value\n")
     86                     else:
     87                         n = _parse_int(toks[i + 1])
     88                         offset += (-offset) % n
     89                         # Optional pad pattern is a source token but does not
     90                         # change the number of emitted bytes beyond the pad.
     91                         if i + 2 < len(toks) and re.fullmatch(r'[0-9A-Fa-f]+', toks[i + 2]):
     92                             i += 1
     93                         i += 1
     94                 elif tok == '.fill':
     95                     if i + 2 >= len(toks):
     96                         sys.stderr.write(f"{path}:{lineno}: missing .fill args\n")
     97                     else:
     98                         offset += _parse_int(toks[i + 1])
     99                         i += 2
    100                 elif tok[:1] in ('%', '&'):
    101                     offset += ptrsize
    102                 elif tok[:1] in REF_WIDTH:
    103                     offset += REF_WIDTH[tok[:1]]
    104                 elif tok.startswith("'"):
    105                     offset += (len(tok) - 1) // 2
    106                 elif re.fullmatch(r'[0-9A-Fa-f]+', tok):
    107                     offset += len(tok) // 2
    108                 else:
    109                     # Unknown token: warn once per kind, skip. Keeps a
    110                     # malformed line from dropping the whole label map.
    111                     sig = tok[:1]
    112                     if sig not in seen_unknown:
    113                         seen_unknown.add(sig)
    114                         sys.stderr.write(
    115                             f"{path}:{lineno}: skipping unrecognized token "
    116                             f"{tok!r} (subsequent same-prefix tokens silenced)\n"
    117                         )
    118                 i += 1
    119 
    120 
    121 def cmd_map(args):
    122     items = []
    123     for off, name in parse_hex2(args.hex2, ref_size=args.ref_size):
    124         items.append((args.base + args.header + off, name))
    125     items.sort()
    126     for addr, name in items:
    127         print(f"0x{addr:08x} {name}")
    128 
    129 
    130 def load_map(path):
    131     """Return {addr: [labels]} (multiple labels can share an address)."""
    132     by_addr = {}
    133     with open(path) as f:
    134         for line in f:
    135             line = line.strip()
    136             if not line or line.startswith('#'):
    137                 continue
    138             addr_str, name = line.split()
    139             addr = int(addr_str, 0)
    140             by_addr.setdefault(addr, []).append(name)
    141     return by_addr
    142 
    143 
    144 def nearest_label(by_addr, sorted_addrs, target, skip=None):
    145     """Find label whose address is the largest <= target. Returns
    146     (label, offset) or None. If `skip` is a callable, addresses whose
    147     only labels are all skip(name)==True are walked past — useful for
    148     filtering M1pp's `:@name`→`:name__N` macro-locals so a trace
    149     address resolves to its enclosing function rather than the
    150     macro-internal `:@here`."""
    151     # Binary search for rightmost addr <= target.
    152     lo, hi = 0, len(sorted_addrs)
    153     while lo < hi:
    154         mid = (lo + hi) // 2
    155         if sorted_addrs[mid] <= target:
    156             lo = mid + 1
    157         else:
    158             hi = mid
    159     while lo > 0:
    160         addr = sorted_addrs[lo - 1]
    161         names = by_addr[addr]
    162         keep = [n for n in names if not (skip and skip(n))] if skip else names
    163         if keep:
    164             return keep[0], target - addr
    165         lo -= 1
    166     return None
    167 
    168 
    169 # Matches M1pp's `:@name`→`:name__N` rewrite. The `__\d+` tail is
    170 # unique to that mangling, since user labels can't legally end with
    171 # `__<digits>` (only the M1pp expansion counter produces them).
    172 MACRO_LOCAL_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*__\d+$')
    173 
    174 
    175 def is_macro_local(name):
    176     return bool(MACRO_LOCAL_RE.match(name))
    177 
    178 
# Matches a disassembly line such as `  600078: f94003e0     ldr ...`:
# leading whitespace (group 1), a lowercase-hex address (group 2), a colon,
# then whitespace. Only group 2 is consumed by cmd_annotate.
LINE_ADDR_RE = re.compile(r'^(\s+)([0-9a-f]+):\s')

# Matches an xref like `0x600088 <PT_LOAD#0+0x88>` in the asm column.
# Group 1 is the absolute target address in hex (without the `0x`); the
# offset inside the angle brackets is matched but not captured.
XREF_RE = re.compile(r'0x([0-9a-f]+)\s+<PT_LOAD#0\+0x[0-9a-f]+>')

# Matches the synthetic section header objdump emits for our seed ELF
# (no real section table): `0000000000600000 <PT_LOAD#0>:`. We skip
# these in favor of the labels we inject ourselves.
SECTION_HEADER_RE = re.compile(r'^[0-9a-f]+\s+<PT_LOAD#0>:\s*$')
    189 
    190 
    191 def cmd_annotate(args):
    192     by_addr = load_map(args.map)
    193     sorted_addrs = sorted(by_addr)
    194 
    195     def rewrite_xref(m):
    196         target = int(m.group(1), 16)
    197         hit = nearest_label(by_addr, sorted_addrs, target)
    198         if hit is None:
    199             return m.group(0)
    200         label, delta = hit
    201         suffix = f"+0x{delta:x}" if delta else ""
    202         return f"0x{target:x} <{label}{suffix}>"
    203 
    204     skip_blank = False
    205     for raw in sys.stdin:
    206         line = raw.rstrip('\n')
    207         # Drop the synthetic <PT_LOAD#0>: section header and the blank
    208         # line that follows it; our injected labels carry the same info.
    209         if SECTION_HEADER_RE.match(line):
    210             skip_blank = True
    211             continue
    212         if skip_blank:
    213             skip_blank = False
    214             if line == '':
    215                 continue
    216         m = LINE_ADDR_RE.match(line)
    217         if m:
    218             addr = int(m.group(2), 16)
    219             if addr in by_addr:
    220                 for name in by_addr[addr]:
    221                     print(f"\n{addr:016x} <{name}>:")
    222         line = XREF_RE.sub(rewrite_xref, line)
    223         print(line)
    224 
    225 
    226 def _resolve_hex2_from_elf(elf_path):
    227     """Mirror disasm-elf.sh's <elf>.workdir sidecar lookup."""
    228     import os
    229     sidecar = elf_path + '.workdir'
    230     if not os.path.exists(sidecar):
    231         sys.exit(f"m1-symbols: no {sidecar} sidecar — rebuild the ELF "
    232                  f"with boot-build-p1*.sh to generate it")
    233     with open(sidecar) as f:
    234         workdir = f.read().strip()
    235     if not os.path.isabs(workdir):
    236         repo_root = os.path.abspath(
    237             os.path.join(os.path.dirname(__file__), '..'))
    238         workdir = os.path.join(repo_root, workdir)
    239     for name in ('expanded.hex2pp', 'prog.hex2'):
    240         hex2 = os.path.join(workdir, name)
    241         if os.path.exists(hex2):
    242             return hex2
    243     sys.exit(f"m1-symbols: {sidecar} -> {workdir}, but no expanded.hex2pp or prog.hex2 there")
    244 
    245 
    246 def _build_map_from_args(args):
    247     """Resolve --elf / --hex2 / --map into the {addr: [labels]} dict."""
    248     if args.map:
    249         return load_map(args.map)
    250     hex2 = args.hex2 or _resolve_hex2_from_elf(args.elf)
    251     by_addr = {}
    252     for off, name in parse_hex2(hex2, ref_size=args.ref_size):
    253         by_addr.setdefault(args.base + args.header + off, []).append(name)
    254     return by_addr
    255 
    256 
    257 def cmd_lookup(args):
    258     by_addr = _build_map_from_args(args)
    259     sorted_addrs = sorted(by_addr)
    260     skip = None if args.include_macro_locals else is_macro_local
    261     addrs = args.addrs or [line.strip() for line in sys.stdin if line.strip()]
    262     for raw in addrs:
    263         try:
    264             target = int(raw, 0) if raw.lower().startswith('0x') \
    265                                  else int(raw, 16)
    266         except ValueError:
    267             print(f"0x{raw}\t<bad address>")
    268             continue
    269         hit = nearest_label(by_addr, sorted_addrs, target, skip=skip)
    270         if hit is None:
    271             print(f"0x{target:x}\t<no label <= addr>")
    272             continue
    273         label, delta = hit
    274         suffix = f"+0x{delta:x}" if delta else ""
    275         print(f"0x{target:x}\t{label}{suffix}")
    276 
    277 
    278 def main():
    279     ap = argparse.ArgumentParser(description=__doc__,
    280                                  formatter_class=argparse.RawDescriptionHelpFormatter)
    281     sub = ap.add_subparsers(dest='cmd', required=True)
    282 
    283     p_map = sub.add_parser('map', help='emit address->label map')
    284     p_map.add_argument('hex2')
    285     p_map.add_argument('--base', type=lambda s: int(s, 0), default=0x600000)
    286     p_map.add_argument('--header', type=lambda s: int(s, 0), default=0x78)
    287     p_map.add_argument('--ref-size', type=int, default=4)
    288     p_map.set_defaults(func=cmd_map)
    289 
    290     p_an = sub.add_parser('annotate', help='inject labels into objdump output')
    291     p_an.add_argument('map')
    292     p_an.set_defaults(func=cmd_annotate)
    293 
    294     p_lk = sub.add_parser('lookup',
    295                           help='resolve addrs to nearest preceding label')
    296     src = p_lk.add_mutually_exclusive_group(required=True)
    297     src.add_argument('--elf', help='ELF path; uses <ELF>.workdir sidecar '
    298                                    'to find expanded.hex2pp or prog.hex2')
    299     src.add_argument('--hex2', help='expanded.hex2pp or prog.hex2 path')
    300     src.add_argument('--map', help='pre-built address->label map '
    301                                    '(from `m1-symbols.py map`)')
    302     p_lk.add_argument('--base', type=lambda s: int(s, 0), default=0x600000)
    303     p_lk.add_argument('--header', type=lambda s: int(s, 0), default=0x78)
    304     p_lk.add_argument('--ref-size', type=int, default=4)
    305     p_lk.add_argument('--include-macro-locals', action='store_true',
    306                       help='include M1pp-mangled local labels '
    307                            '(`:@name` → `:name__N`) when picking the '
    308                            'nearest preceding label. Off by default '
    309                            'so trace addresses resolve to the '
    310                            'enclosing function, not the macro internal '
    311                            ':@here.')
    312     p_lk.add_argument('addrs', nargs='*',
    313                       help='addresses (hex, with or without 0x prefix); '
    314                            'reads stdin one per line if omitted')
    315     p_lk.set_defaults(func=cmd_lookup)
    316 
    317     args = ap.parse_args()
    318     args.func(args)
    319 
    320 
# Script entry point: run the CLI, tolerating a reader that goes away early.
if __name__ == '__main__':
    try:
        main()
    except BrokenPipeError:
        # Downstream pipe (e.g. head) closed; exit cleanly.
        # Closing stdout here, inside its own handler, means a second
        # BrokenPipeError raised by the close itself is swallowed too.
        # NOTE(review): presumably this is what keeps the interpreter's
        # shutdown-time flush from reporting the error — confirm.
        try:
            sys.stdout.close()
        except BrokenPipeError:
            pass
    329             pass