#!/usr/bin/env python3
"""m1-symbols.py — extract labels from a hex2/hex2pp file and annotate disassembly.

The expanded hex2pp produced by M1pp in this project uses a small grammar:

    :LABEL            declares LABEL at the current byte offset (zero bytes)
    !LABEL            1-byte relative label reference
    @LABEL/$LABEL     2-byte label reference
    ~LABEL            3-byte relative label reference
    %LABEL/&LABEL     ptrsize-byte label reference (4 by default)
    XXXX...           bare hex; len(token)/2 bytes
    .align N [PAD]    pads to N-byte boundary (optional hex pad pattern)
    .fill N B         emits N copies of B
    .ptrsize N        sets the width of later %LABEL/&LABEL references
    .scope/.endscope  accepted; emit no bytes
    # ... / ; ...     comment to end-of-line

Legacy prog.hex2 files from the seed M0 path are still accepted; their
'XXXX... raw byte literals are counted as before.

Label addresses = BASE + HEADER + cumulative byte offset, where BASE is the
ELF load address (0x600000) and HEADER is the size of the on-disk ELF header
+ phdr that sits in front of expanded hex2pp/M0 output (0x78 bytes for our
seed ELF).

Subcommands:

  map <expanded.hex2pp|prog.hex2> [--base 0x600000] [--header 0x78]
      Print "0xADDR LABEL" lines, sorted by address.  Labels that share an
      address are listed in declaration order.

  annotate <map>
      Read llvm-objdump output on stdin; inject "ADDR <LABEL>:" headers and
      rewrite "<PT_LOAD#0+0xNNN>" xrefs to "<LABEL>" or "<LABEL+0xN>". Writes
      annotated output to stdout.

  lookup [--elf ELF | --hex2 HEX2 | --map MAP] ADDR [ADDR ...]
      For each address, print "0xADDR LABEL+0xN" (or just "LABEL" when
      delta=0). Pairs with `%trace`'s stderr output: paste a trace
      address and get the enclosing function. With --elf, locates the
      sibling <ELF>.workdir sidecar (written by boot-build-p1*.sh) to
      find expanded.hex2pp or legacy prog.hex2, so you don't have to
      know the .work path.
      Reads addresses from stdin if none are passed positionally.
"""

import argparse
import bisect
import os
import re
import sys


# Byte width of each fixed-width label-reference sigil.  '%' and '&' are not
# listed here because their width is the current pointer size.
REF_WIDTH = {
    '!': 1,
    '@': 2,
    '$': 2,
    '~': 3,
}

# A bare hex token; also used to recognise the optional `.align` pad pattern.
_HEX_TOKEN_RE = re.compile(r'[0-9A-Fa-f]+')


def _parse_int(tok):
    """Parse an integer literal, honouring 0x/0o/0b prefixes (base 0)."""
    return int(tok, 0)


def _directive_value(path, lineno, directive, tok, minimum):
    """Parse the numeric argument of a directive.

    Returns the integer, or None (after a warning on stderr) when `tok` is
    not an integer or is smaller than `minimum`.  Returning None instead of
    raising keeps one bad directive from aborting the whole label scan.
    """
    try:
        value = _parse_int(tok)
    except ValueError:
        value = None
    if value is None or value < minimum:
        sys.stderr.write(
            f"{path}:{lineno}: bad {directive} value {tok!r} (ignored)\n")
        return None
    return value


def parse_hex2(path, ref_size=4):
    """Yield (offset, label) pairs in declaration order.

    `offset` is the number of bytes emitted before the `:LABEL` token,
    counted from the start of the file (no base/header applied).
    `ref_size` is the initial width of `%`/`&` references; a `.ptrsize N`
    directive in the file overrides it from that point on.

    Problems in the input are reported on stderr and skipped rather than
    raised: a directive with a missing, non-numeric or out-of-range value
    leaves the offset untouched, and unrecognized tokens are ignored (with
    one warning per leading character).
    """
    offset = 0
    ptrsize = ref_size
    seen_unknown = set()
    with open(path) as f:
        for lineno, line in enumerate(f, 1):
            # Both '#' and ';' start a comment that runs to end-of-line.
            toks = line.split('#', 1)[0].split(';', 1)[0].split()
            i = 0
            while i < len(toks):
                tok = toks[i]
                sigil = tok[:1]
                used = 1  # tokens consumed by this construct
                if sigil == ':':
                    yield offset, tok[1:]
                elif tok in ('.scope', '.endscope'):
                    pass
                elif tok == '.ptrsize':
                    if i + 1 >= len(toks):
                        sys.stderr.write(
                            f"{path}:{lineno}: missing .ptrsize value\n")
                    else:
                        value = _directive_value(
                            path, lineno, '.ptrsize', toks[i + 1], 1)
                        if value is not None:
                            ptrsize = value
                        used = 2
                elif tok == '.align':
                    if i + 1 >= len(toks):
                        sys.stderr.write(
                            f"{path}:{lineno}: missing .align value\n")
                    else:
                        # A boundary below 1 is rejected: 0 would divide by
                        # zero and a negative one would move the offset
                        # backwards.
                        boundary = _directive_value(
                            path, lineno, '.align', toks[i + 1], 1)
                        if boundary is not None:
                            offset += (-offset) % boundary
                        used = 2
                        # Optional pad pattern is a source token but does not
                        # change the number of emitted bytes beyond the pad.
                        if (i + 2 < len(toks)
                                and _HEX_TOKEN_RE.fullmatch(toks[i + 2])):
                            used = 3
                elif tok == '.fill':
                    if i + 2 >= len(toks):
                        sys.stderr.write(
                            f"{path}:{lineno}: missing .fill args\n")
                    else:
                        count = _directive_value(
                            path, lineno, '.fill', toks[i + 1], 0)
                        if count is not None:
                            offset += count
                        used = 3
                elif sigil in ('%', '&'):
                    offset += ptrsize
                elif sigil in REF_WIDTH:
                    offset += REF_WIDTH[sigil]
                elif sigil == "'":
                    # Legacy M0 literal: leading quote, then hex digit pairs.
                    offset += (len(tok) - 1) // 2
                elif _HEX_TOKEN_RE.fullmatch(tok):
                    offset += len(tok) // 2
                elif sigil not in seen_unknown:
                    # Unknown token: warn once per kind, skip. Keeps a
                    # malformed line from dropping the whole label map.
                    seen_unknown.add(sigil)
                    sys.stderr.write(
                        f"{path}:{lineno}: skipping unrecognized token "
                        f"{tok!r} (subsequent same-prefix tokens silenced)\n"
                    )
                i += used


def cmd_map(args):
    """`map` subcommand: print "0xADDR LABEL" lines sorted by address.

    Reads `args.hex2`, `args.ref_size`, `args.base` and `args.header`.
    """
    items = [
        (args.base + args.header + off, name)
        for off, name in parse_hex2(args.hex2, ref_size=args.ref_size)
    ]
    # Stable sort on the address alone, so labels sharing an address keep
    # their declaration order.  A map built this way then lists them in the
    # same order as the dict built directly from the hex2 file, and
    # `lookup --map` agrees with `lookup --hex2`.
    for addr, name in sorted(items, key=lambda item: item[0]):
        print(f"0x{addr:08x} {name}")


def load_map(path):
    """Return {addr: [labels]} (multiple labels can share an address).

    Each non-blank line that does not start with '#' must be
    "ADDR LABEL", with ADDR in a form `int(..., 0)` accepts (e.g. the
    0x-prefixed output of the `map` subcommand).  Lines that do not fit
    are reported on stderr and skipped.
    """
    by_addr = {}
    with open(path) as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split()
            try:
                if len(fields) != 2:
                    raise ValueError(line)
                addr = int(fields[0], 0)
            except ValueError:
                sys.stderr.write(
                    f"{path}:{lineno}: skipping malformed map line "
                    f"{line!r}\n")
                continue
            by_addr.setdefault(addr, []).append(fields[1])
    return by_addr


def nearest_label(by_addr, sorted_addrs, target, skip=None):
    """Find label whose address is the largest <= target. Returns
    (label, offset) or None. If `skip` is a callable, addresses whose
    only labels are all skip(name)==True are walked past — useful for
    filtering M1pp's `:@name`→`:name__N` macro-locals so a trace
    address resolves to its enclosing function rather than the
    macro-internal `:@here`.

    `sorted_addrs` must be the keys of `by_addr` in ascending order.  When
    several labels share the chosen address, the first one in the list
    that is not skipped is returned.
    """
    # Number of addresses <= target; the candidate is the one just before.
    pos = bisect.bisect_right(sorted_addrs, target)
    while pos > 0:
        pos -= 1
        addr = sorted_addrs[pos]
        for name in by_addr[addr]:
            if not skip or not skip(name):
                return name, target - addr
    return None


# Matches M1pp's `:@name`→`:name__N` rewrite. The `__\d+` tail is
# unique to that mangling, since user labels can't legally end with
# `__<digits>` (only the M1pp expansion counter produces them).
MACRO_LOCAL_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*__\d+$')


def is_macro_local(name):
    """Return True if `name` looks like an M1pp-mangled macro-local label."""
    return bool(MACRO_LOCAL_RE.match(name))


# Matches `  600078: f94003e0  ldr ...` — leading spaces, hex addr, colon.
LINE_ADDR_RE = re.compile(r'^(\s+)([0-9a-f]+):\s')

# Matches xref like `0x600088 <PT_LOAD#0+0x88>` in the asm column.
XREF_RE = re.compile(r'0x([0-9a-f]+)\s+<PT_LOAD#0\+0x[0-9a-f]+>')

# Matches the synthetic section header objdump emits for our seed ELF
# (no real section table): `0000000000600000 <PT_LOAD#0>:`. We skip
# these in favor of the labels we inject ourselves.
SECTION_HEADER_RE = re.compile(r'^[0-9a-f]+\s+<PT_LOAD#0>:\s*$')


def cmd_annotate(args):
    """`annotate` subcommand: label a disassembly read from stdin.

    Loads the map at `args.map`, then copies stdin to stdout, printing a
    blank line plus "ADDR <LABEL>:" before every instruction line whose
    address has labels, and rewriting `<PT_LOAD#0+0x...>` xrefs to the
    nearest preceding label.
    """
    by_addr = load_map(args.map)
    sorted_addrs = sorted(by_addr)

    def rewrite_xref(m):
        target = int(m.group(1), 16)
        hit = nearest_label(by_addr, sorted_addrs, target)
        if hit is None:
            # No label at or below the target: leave the xref untouched.
            return m.group(0)
        label, delta = hit
        suffix = f"+0x{delta:x}" if delta else ""
        return f"0x{target:x} <{label}{suffix}>"

    skip_blank = False
    for raw in sys.stdin:
        line = raw.rstrip('\n')
        # Drop the synthetic <PT_LOAD#0>: section header and the blank
        # line that follows it; our injected labels carry the same info.
        if SECTION_HEADER_RE.match(line):
            skip_blank = True
            continue
        if skip_blank:
            skip_blank = False
            if line == '':
                continue
        m = LINE_ADDR_RE.match(line)
        if m:
            addr = int(m.group(2), 16)
            for name in by_addr.get(addr, ()):
                print(f"\n{addr:016x} <{name}>:")
        print(XREF_RE.sub(rewrite_xref, line))


def _resolve_hex2_from_elf(elf_path):
    """Mirror disasm-elf.sh's <elf>.workdir sidecar lookup.

    Reads the work directory from `<elf_path>.workdir` (a relative path is
    resolved against the parent of this script's directory) and returns the
    first of expanded.hex2pp / prog.hex2 found there.  Exits with a message
    when the sidecar or both files are missing.
    """
    sidecar = elf_path + '.workdir'
    if not os.path.exists(sidecar):
        sys.exit(f"m1-symbols: no {sidecar} sidecar — rebuild the ELF "
                 f"with boot-build-p1*.sh to generate it")
    with open(sidecar) as f:
        workdir = f.read().strip()
    if not os.path.isabs(workdir):
        repo_root = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..'))
        workdir = os.path.join(repo_root, workdir)
    for name in ('expanded.hex2pp', 'prog.hex2'):
        hex2 = os.path.join(workdir, name)
        if os.path.exists(hex2):
            return hex2
    sys.exit(f"m1-symbols: {sidecar} -> {workdir}, but no expanded.hex2pp or prog.hex2 there")


def _build_map_from_args(args):
    """Resolve --elf / --hex2 / --map into the {addr: [labels]} dict."""
    if args.map:
        return load_map(args.map)
    hex2 = args.hex2 or _resolve_hex2_from_elf(args.elf)
    by_addr = {}
    for off, name in parse_hex2(hex2, ref_size=args.ref_size):
        by_addr.setdefault(args.base + args.header + off, []).append(name)
    return by_addr


def cmd_lookup(args):
    """`lookup` subcommand: print "0xADDR<TAB>LABEL[+0xN]" per address.

    Addresses come from `args.addrs`, or from stdin (one per line) when
    none were given.  They are read as hex, with or without a 0x prefix.
    Unparseable input and addresses below every label get a placeholder
    line instead of a label.
    """
    by_addr = _build_map_from_args(args)
    sorted_addrs = sorted(by_addr)
    skip = None if args.include_macro_locals else is_macro_local
    addrs = args.addrs or [line.strip() for line in sys.stdin if line.strip()]
    for raw in addrs:
        try:
            # Base 16 accepts an optional 0x prefix, so one call covers
            # both spellings.
            target = int(raw, 16)
        except ValueError:
            # Echo the input with exactly one 0x prefix.
            shown = raw if raw.lower().startswith('0x') else f"0x{raw}"
            print(f"{shown}\t<bad address>")
            continue
        hit = nearest_label(by_addr, sorted_addrs, target, skip=skip)
        if hit is None:
            print(f"0x{target:x}\t<no label <= addr>")
            continue
        label, delta = hit
        suffix = f"+0x{delta:x}" if delta else ""
        print(f"0x{target:x}\t{label}{suffix}")


def _add_layout_args(parser):
    """Add the address-layout options shared by `map` and `lookup`."""
    parser.add_argument('--base', type=_parse_int, default=0x600000)
    parser.add_argument('--header', type=_parse_int, default=0x78)
    parser.add_argument('--ref-size', type=int, default=4)


def main():
    """Parse the command line and dispatch to the chosen subcommand."""
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    sub = ap.add_subparsers(dest='cmd', required=True)

    p_map = sub.add_parser('map', help='emit address->label map')
    p_map.add_argument('hex2')
    _add_layout_args(p_map)
    p_map.set_defaults(func=cmd_map)

    p_an = sub.add_parser('annotate', help='inject labels into objdump output')
    p_an.add_argument('map')
    p_an.set_defaults(func=cmd_annotate)

    p_lk = sub.add_parser('lookup',
                          help='resolve addrs to nearest preceding label')
    src = p_lk.add_mutually_exclusive_group(required=True)
    src.add_argument('--elf', help='ELF path; uses <ELF>.workdir sidecar '
                                   'to find expanded.hex2pp or prog.hex2')
    src.add_argument('--hex2', help='expanded.hex2pp or prog.hex2 path')
    src.add_argument('--map', help='pre-built address->label map '
                                   '(from `m1-symbols.py map`)')
    _add_layout_args(p_lk)
    p_lk.add_argument('--include-macro-locals', action='store_true',
                      help='include M1pp-mangled local labels '
                           '(`:@name` → `:name__N`) when picking the '
                           'nearest preceding label. Off by default '
                           'so trace addresses resolve to the '
                           'enclosing function, not the macro internal '
                           ':@here.')
    p_lk.add_argument('addrs', nargs='*',
                      help='addresses (hex, with or without 0x prefix); '
                           'reads stdin one per line if omitted')
    p_lk.set_defaults(func=cmd_lookup)

    args = ap.parse_args()
    args.func(args)


if __name__ == '__main__':
    try:
        main()
    except BrokenPipeError:
        # Downstream pipe (e.g. head) closed; exit cleanly.
        try:
            sys.stdout.close()
        except BrokenPipeError:
            pass