kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

normalize.py (9919B)


      1 #!/usr/bin/env python3
      2 """Canonicalize llvm-readelf / llvm-objdump output so two ELFs with
      3 equivalent semantic content compare equal.
      4 
      5 Strips/normalizes:
      6   - file offsets, virtual addresses (replaced with "<addr>")
      7   - section/symbol indices (replaced with "<idx>")
      8   - string-table offsets that show up after a "name:" tag
      9 
     10 Sorts:
     11   - "Symbol table" entries by (binding, name) within each scope
     12   - "Relocation section" entries by (section, offset)
     13 
     14 Invocation:
     15   normalize.py readelf  <file>     — runs `llvm-readelf -aW`, then normalizes
     16   normalize.py objdump  <file>     — runs `llvm-objdump -drwhW`, then normalizes
     17   normalize.py filter             — reads stdin, writes normalized to stdout
     18 """
     19 import os
     20 import re
     21 import shutil
     22 import subprocess
     23 import sys
     24 
     25 
     26 def _which(*names):
     27     for n in names:
     28         p = shutil.which(n)
     29         if p:
     30             return p
     31     return None
     32 
     33 
     34 def _run(tool_args, file_path):
     35     bin_path = _which(*tool_args[0])
     36     if not bin_path:
     37         sys.stderr.write("normalize.py: cannot find %s\n" % tool_args[0][0])
     38         sys.exit(77)
     39     res = subprocess.run([bin_path] + tool_args[1] + [file_path],
     40                          capture_output=True, text=True)
     41     sys.stderr.write(res.stderr)
     42     return res.stdout
     43 
     44 
# "0x"-prefixed hex number with at least 4 hex digits. The length floor keeps
# short numerics (flags and the like) from being rewritten as addresses.
_HEX_ADDR = re.compile(r"0x[0-9a-fA-F]{4,}")
# A line consisting solely of "Offset: <token>".
# NOTE(review): nothing in this file references _OFFSET_LINE; it appears to be
# left over from planned handling of bare (non-0x) hex columns -- confirm
# before removing.
_OFFSET_LINE = re.compile(r"^\s*Offset:\s+\S+\s*$")
# llvm-readelf section header lines:
#   [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
# Groups 1-6 capture Nr, Name, Type, Address, Off and Size. The trailing
# "ES Flg Lk Inf Al" columns are captured as one group (7) and split on
# whitespace by _normalize_shdr, because Flg can be empty (sections like
# SHT_STRTAB / SHT_NOBITS-but-not-allocatable) and so cannot be matched as a
# fixed column here.
_SHDR_HEADER_RE = re.compile(
    r"^\s*\[\s*(\d+)\]\s+(\S+)\s+(\S+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(.*)$"
)
# llvm-readelf symbol table row:
#   Num:    Value          Size Type    Bind   Vis      Ndx Name
# Groups 1-8 follow the columns in that order; Name takes the rest of the line.
_SYM_LINE = re.compile(
    r"^\s*(\d+):\s+([0-9a-fA-F]+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)$"
)
# llvm-readelf RELA row:
#   Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
# Groups 1-5 follow the columns in that order; the last group takes the rest
# of the line ("name + addend").
_RELA_LINE = re.compile(
    r"^\s*([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(\S+)\s+([0-9a-fA-F]+)\s+(.*)$"
)
     66 
     67 
# Names of sections that kit's data model doesn't fully preserve. Sections
# listed here are dropped from the structural diff entirely (their section
# header rows and their entries in the segment-mapping line) so the rest of
# the comparison stays meaningful. Each entry is meant to be paired with a
# comment in the C implementation noting why; remove a name from this set
# when the model is extended.
#
# The set is currently empty.
_DROP_SHDR_NAMES = set()
# When the kit data model can't preserve a section across roundtrip,
# add its name above with a comment pointing at the underlying gap.
# Sections previously listed (.llvm_addrsig, .ARM.attributes) now
# round-trip via Section.ext_type / Section.ext_flags.
     78 
     79 
     80 def _normalize_shdr(line):
     81     m = _SHDR_HEADER_RE.match(line)
     82     if not m:
     83         return line
     84     nr, name, sh_type, addr, off, size, rest = m.groups()
     85     if name in _DROP_SHDR_NAMES:
     86         return ""
     87     toks = rest.split()
     88     # rest is "ES [Flg] Lk Inf Al" — Flg can be empty; treat as 4 or 5 tokens.
     89     if len(toks) == 5:
     90         es, flg, lk, inf, al = toks
     91     elif len(toks) == 4:
     92         es, flg, lk, inf, al = toks[0], "", toks[1], toks[2], toks[3]
     93     else:
     94         return line
     95     # Drop nr/addr/off and the link/info indices (they reference other
     96     # sections positionally and vary with layout). Section semantics are
     97     # captured by name/type/flg/al; link relationships and exact byte
     98     # sizes re-emerge from the symtab/rela contents downstream.
     99     #
    100     # STRTAB size depends on whether the implementation does tail
    101     # merging (".eh_frame" stored as a suffix of ".rela.eh_frame", etc.),
    102     # which is an optimization, not a contract — strip the size for
    103     # STRTAB so equivalent-content tables compare equal.
    104     if sh_type == "STRTAB":
    105         return ("[<idx>] %s STRTAB es=%s flg=%s al=%s\n" % (name, es, flg, al))
    106     return ("[<idx>] %s %s size=%s es=%s flg=%s al=%s\n"
    107             % (name, sh_type, size, es, flg, al))
    108 
    109 
    110 def _normalize_sym(line):
    111     m = _SYM_LINE.match(line)
    112     if not m:
    113         return line
    114     num, val, size, sym_type, bind, vis, ndx, name = m.groups()
    115     # ndx as a numeric is layout-dependent; collapse all numeric ndx to
    116     # "DEF" (defined-in-some-section) and keep the special markers
    117     # (UND / ABS / COMMON / etc.).
    118     if ndx.isdigit():
    119         ndx = "DEF"
    120     return ("[<idx>] value=<addr> size=%s type=%s bind=%s vis=%s ndx=%s name=%s\n"
    121             % (size, sym_type, bind, vis, ndx, name))
    122 
    123 
    124 def _normalize_rela(line):
    125     m = _RELA_LINE.match(line)
    126     if not m:
    127         return line
    128     off, info, rtype, sym_val, sym_name = m.groups()
    129     return ("offset=%s type=%s sym_value=<addr> sym=%s\n"
    130             % (off, rtype, sym_name.strip()))
    131 
    132 
# llvm-readelf COMDAT group section header:
#   COMDAT group section [  N] `.group' [symbol_name] contains M sections:
# The "COMDAT " prefix is optional and matching is case-insensitive.
# Group 2 is the quoted section name; group 3 is the bracketed symbol name,
# which normalize() uses for the canonical group heading.
_GROUP_HDR_RE = re.compile(
    r"^(COMDAT )?group section\s+\[\s*\d+\]\s+`([^']+)'\s+\[([^\]]+)\]",
    re.IGNORECASE,
)
# Member entry inside a group block:
#   [   N]   .section_name
# Group 1 is the numeric section index; group 2 is the rest of the line
# (the member section name).
_GROUP_ENTRY_RE = re.compile(r"^\s*\[\s*(\d+)\]\s+(.+)$")
    142 
    143 
# Lines whose presence is sensitive to layout choices but says nothing
# semantic: count of headers, where they live, etc. Drop them entirely.
# Matching is by exact line prefix, so the leading spaces are significant.
#
# Note: normalize() checks for lines starting with "Symbol table" and
# "Relocation section" before it consults this tuple, so the last two
# entries never reach the prefix test there; those lines are replaced by
# canonical headings instead.
_DROP_PREFIXES = (
    "  Start of section headers:",
    "  Number of section headers:",
    "  Section header string table index:",
    "There are ",                  # "There are N section headers..."
    "Symbol table '",              # "Symbol table '.symtab' contains N entries"
    "Relocation section '",        # "Relocation section '.rela.X' at offset N..."
)
    154 
    155 def _is_segment_mapping(line):
    156     return line.lstrip().startswith("None ") or "Segment Sections..." in line
    157 
    158 
    159 def normalize(text):
    160     out_blocks = []
    161     cur_block = []
    162     cur_kind = None  # "shdr", "sym", "rela", None
    163 
    164     def flush():
    165         nonlocal cur_block, cur_kind
    166         # Sort all block kinds; section ordering and symbol ordering are not
    167         # semantic. (Relocation sections within a relocation block are
    168         # already named, so sorting is fine.)
    169         if cur_kind in ("shdr", "sym", "rela", "group"):
    170             cur_block.sort()
    171         out_blocks.extend(cur_block)
    172         cur_block = []
    173         cur_kind = None
    174 
    175     for raw in text.splitlines(keepends=True):
    176         line = raw
    177 
    178         # Block-start markers come first — even if their text matches
    179         # _DROP_PREFIXES, they still need to set cur_kind. We replace
    180         # the line itself with a stable canonical heading so the body
    181         # can be diffed without the count/offset suffix.
    182         if line.startswith("Symbol table"):
    183             flush()
    184             cur_kind = "sym"
    185             out_blocks.append("Symbol table:\n")
    186             continue
    187         if line.startswith("Relocation section"):
    188             flush()
    189             cur_kind = "rela"
    190             out_blocks.append("Relocation section:\n")
    191             continue
    192         if "Section Headers:" in line:
    193             flush()
    194             cur_kind = "shdr"
    195             out_blocks.append(line)
    196             continue
    197         gm = _GROUP_HDR_RE.search(line)
    198         if gm:
    199             flush()
    200             cur_kind = "group"
    201             # Canonical heading preserves the group symbol name but drops
    202             # the section index and member count — both are layout-dependent.
    203             out_blocks.append("COMDAT group [%s]:\n" % gm.group(3))
    204             continue
    205 
    206         if any(line.startswith(p) for p in _DROP_PREFIXES):
    207             continue
    208 
    209         # Section-to-segment mapping line: sort section names so file
    210         # ordering doesn't show through; drop any sections in the
    211         # known-not-preserved set.
    212         if cur_kind is None and _is_segment_mapping(line.rstrip("\n")):
    213             stripped = line.rstrip("\n")
    214             if "None" in stripped:
    215                 head, _, tail = stripped.partition("None")
    216                 names = sorted(n for n in tail.split() if n not in _DROP_SHDR_NAMES)
    217                 line = head + "None " + " ".join(names) + "\n"
    218         if not line.strip():
    219             flush()
    220             out_blocks.append(line)
    221             continue
    222 
    223         if cur_kind == "shdr":
    224             cur_block.append(_normalize_shdr(line))
    225             continue
    226         if cur_kind == "sym":
    227             cur_block.append(_normalize_sym(line))
    228             continue
    229         if cur_kind == "rela":
    230             cur_block.append(_normalize_rela(line))
    231             continue
    232         if cur_kind == "group":
    233             # Replace the numeric section index with <idx>; keep the name.
    234             # "[Index]    Name" header and blank lines pass through after
    235             # index scrubbing so the block can be sorted by section name.
    236             em = _GROUP_ENTRY_RE.match(line)
    237             if em:
    238                 cur_block.append("[<idx>]   %s\n" % em.group(2).strip())
    239             else:
    240                 cur_block.append(line)
    241             continue
    242 
    243         # Default: scrub addresses outside section bodies too.
    244         line = _HEX_ADDR.sub("<addr>", line)
    245         out_blocks.append(line)
    246 
    247     flush()
    248     return "".join(out_blocks)
    249 
    250 
    251 def main(argv):
    252     if len(argv) < 2:
    253         sys.stderr.write(__doc__)
    254         return 2
    255     cmd = argv[1]
    256     if cmd == "readelf":
    257         if len(argv) != 3: return 2
    258         text = _run((["llvm-readelf", "readelf"], ["-aW"]), argv[2])
    259     elif cmd == "objdump":
    260         if len(argv) != 3: return 2
    261         text = _run((["llvm-objdump", "objdump"], ["-drwhW"]), argv[2])
    262     elif cmd == "filter":
    263         text = sys.stdin.read()
    264     else:
    265         sys.stderr.write(__doc__)
    266         return 2
    267     sys.stdout.write(normalize(text))
    268     return 0
    269 
    270 
# Script entry point: run main() on the process arguments and use its return
# value as the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))