boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

llvm_disasm_aarch64.py (4332B)


      1 #!/usr/bin/env python3
      2 """Annotate p1_aarch64.M1 DEFINE rows with llvm-mc disassembly.
      3 
      4 Reads generated DEFINE lines from p1_aarch64.M1, disassembles code-bearing rows
      5 with llvm-mc, and prints the DEFINE name beside the native aarch64 mnemonic
      6 sequence. Literal data rows such as syscall-number constants are labeled as data
      7 instead of being treated as instructions.
      8 """
      9 
     10 import argparse
     11 import os
     12 import re
     13 import subprocess
     14 import sys
     15 from pathlib import Path
     16 
     17 
     18 DEFINE_RE = re.compile(r'^DEFINE\s+(\S+)\s+([0-9A-Fa-f]+)\s*$')
     19 
     20 
     21 def repo_root():
     22     return Path(__file__).resolve().parent.parent
     23 
     24 
     25 def default_input_path():
     26     return repo_root() / 'build' / 'p1' / 'aarch64' / 'p1_aarch64.M1'
     27 
     28 
     29 def ensure_generated(path: Path):
     30     if path.exists():
     31         return
     32     gen = repo_root() / 'p1' / 'p1_gen.py'
     33     proc = subprocess.run(
     34         [sys.executable, str(gen), '--arch', 'aarch64', str(path.parent.parent)],
     35         check=True,
     36         cwd=repo_root(),
     37         capture_output=True,
     38         text=True,
     39     )
     40     if proc.stderr:
     41         sys.stderr.write(proc.stderr)
     42 
     43 
     44 def parse_rows(path: Path):
     45     rows = []
     46     for line in path.read_text().splitlines():
     47         match = DEFINE_RE.match(line)
     48         if not match:
     49             continue
     50         name, hex_bytes = match.groups()
     51         rows.append((name, hex_bytes.upper()))
     52     return rows
     53 
     54 
     55 def is_data_row(name: str):
     56     return name.startswith('sys_')
     57 
     58 
     59 def disassemble_code_rows(rows, llvm_mc):
     60     code_rows = [(name, hex_bytes) for name, hex_bytes in rows if not is_data_row(name)]
     61     if not code_rows:
     62         return {}
     63 
     64     payload = '\n'.join(hex_bytes for _, hex_bytes in code_rows) + '\n'
     65     proc = subprocess.run(
     66         [llvm_mc, '--disassemble', '--hex', '--arch=aarch64'],
     67         input=payload,
     68         text=True,
     69         capture_output=True,
     70         check=True,
     71     )
     72     inst_lines = [line.strip() for line in proc.stdout.splitlines() if line.strip()]
     73 
     74     out = {}
     75     index = 0
     76     for name, hex_bytes in code_rows:
     77         words = len(hex_bytes) // 8
     78         out[name] = inst_lines[index:index + words]
     79         index += words
     80 
     81     if index != len(inst_lines):
     82         raise RuntimeError(
     83             f'llvm output row split mismatch: consumed {index}, got {len(inst_lines)}'
     84         )
     85     return out
     86 
     87 
     88 def format_rows(rows, disasm_by_name, show_bytes):
     89     name_width = max(len(name) for name, _ in rows) if rows else 0
     90     out = []
     91     for name, hex_bytes in rows:
     92         if is_data_row(name):
     93             rhs = f'data 0x{hex_bytes}'
     94             out.append(f'{name:<{name_width}}  {rhs}')
     95             continue
     96 
     97         insns = disasm_by_name.get(name, [])
     98         if not insns:
     99             out.append(f'{name:<{name_width}}  <no disassembly>')
    100             continue
    101 
    102         prefix = name.ljust(name_width)
    103         byte_col = f'  {hex_bytes}' if show_bytes else ''
    104         out.append(f'{prefix}{byte_col}  {insns[0]}')
    105         for insn in insns[1:]:
    106             spacer = ' ' * name_width
    107             if show_bytes:
    108                 spacer += '  ' + ' ' * len(hex_bytes)
    109             out.append(f'{spacer}  {insn}')
    110     return '\n'.join(out)
    111 
    112 
    113 def main():
    114     parser = argparse.ArgumentParser()
    115     parser.add_argument(
    116         'input',
    117         nargs='?',
    118         default=str(default_input_path()),
    119         help='path to p1_aarch64.M1',
    120     )
    121     parser.add_argument(
    122         '--llvm-mc',
    123         default=os.environ.get('LLVM_MC', 'llvm-mc'),
    124         help='path to llvm-mc',
    125     )
    126     parser.add_argument(
    127         '--grep',
    128         default='',
    129         help='only include DEFINE names containing this substring',
    130     )
    131     parser.add_argument(
    132         '--limit',
    133         type=int,
    134         default=0,
    135         help='maximum number of DEFINE rows to print (0 = all)',
    136     )
    137     parser.add_argument(
    138         '--show-bytes',
    139         action='store_true',
    140         help='include raw DEFINE bytes next to the name',
    141     )
    142     args = parser.parse_args()
    143 
    144     path = Path(args.input)
    145     ensure_generated(path)
    146     rows = parse_rows(path)
    147     if args.grep:
    148         rows = [(name, hex_bytes) for name, hex_bytes in rows if args.grep in name]
    149     if args.limit:
    150         rows = rows[:args.limit]
    151 
    152     disasm_by_name = disassemble_code_rows(rows, args.llvm_mc)
    153     print(format_rows(rows, disasm_by_name, args.show_bytes))
    154 
    155 
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()