kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

normalize.py (9082B)


      1 #!/usr/bin/env python3
      2 """Canonicalize llvm-readobj output for Mach-O so two .o files with
      3 equivalent semantic content compare equal.
      4 
      5 Mach-O peer of test/elf/normalize.py — the structural-fidelity pivot
      6 the path-R harness uses for KIT_TEST_OBJ=macho. Differs from the
      7 ELF normalizer because:
      8 
      9   * llvm-readobj's Mach-O output uses an indented "Section { ... }"
     10     block format, not a one-line-per-row layout. The normalizer is a
     11     block transformer rather than a line transformer.
     12   * Mach-O symbol-name strtab packing order is implementation-defined
     13     (clang packs externals before locals; kit packs in symtab
     14     order). The "(NN)" strx hint after each Name field is dropped so
     15     those orderings compare equal.
     16   * Section/segment name fields print with a trailing hex byte dump
     17     (Name: __text (5F 5F 74 65 78 74 ...)). The hex tail is dropped
     18     — the printable name already captures it.
     19   * File offsets, RelocationOffset, addresses inside SectionData
     20     headers are layout-dependent and dropped.
     21 
     22 Strips:
     23   - "(NN)" strtab offsets after Name: fields
     24   - hex byte dumps after section/segment names
     25   - "Index: N" (section index — re-derivable from order)
     26   - "Offset: N" (file offset)
     27   - "RelocationOffset: 0xN"
     28   - "Address: 0xN" (where the section is positioned within the segment)
     29   - the SectionData hex dump line numbers — content is what matters
     30   - the "File:" header line (path varies between golden and rt)
     31 
     32 Sorts:
     33   - Symbol entries within Symbols [] by Name (clang and kit partition
     34     differently within the same DYSYMTAB extents)
     35   - Relocations within each Relocations { Section { ... } } by Offset
     36 
     37 Invocation:
     38   normalize.py <file>      — runs `llvm-readobj` on file, normalizes.
     39   normalize.py filter      — reads stdin, writes normalized to stdout.
     40 """
     41 import re
     42 import shutil
     43 import subprocess
     44 import sys
     45 
     46 
     47 def _which(*names):
     48     for n in names:
     49         p = shutil.which(n)
     50         if p:
     51             return p
     52     return None
     53 
     54 
     55 def _readobj(path):
     56     bin_path = _which("llvm-readobj", "readobj")
     57     if not bin_path:
     58         sys.stderr.write("normalize.py: cannot find llvm-readobj\n")
     59         sys.exit(77)
     60     args = [
     61         bin_path,
     62         "--section-headers",
     63         "--section-data",
     64         "--relocations",
     65         "--symbols",
     66         path,
     67     ]
     68     res = subprocess.run(args, capture_output=True, text=True)
     69     sys.stderr.write(res.stderr)
     70     return res.stdout
     71 
     72 
     73 # "Name: foo (NN)" — strip the strtab-offset hint
     74 _NAME_STRX = re.compile(r"^(\s*Name:\s+\S+)\s+\(\d+\)\s*$")
     75 # "Name: __text (5F 5F ...)" / "Segment: __TEXT (5F 5F ...)" — strip hex tail
     76 _NAME_HEXBYTES = re.compile(
     77     r"^(\s*(?:Name|Segment):\s+\S+)\s+\([0-9A-Fa-f ]+\)\s*$"
     78 )
     79 # Lines whose value is layout-dependent and not load-bearing for fidelity.
     80 _DROP_FIELD_RE = re.compile(
     81     r"^\s*(Index|Offset|RelocationOffset|Address|File):\s"
     82 )
     83 # SectionData hex rows: "    0000: 00008052 ..."
     84 _SECTION_DATA_ROW = re.compile(r"^\s*[0-9A-Fa-f]{4}:\s+")
     85 # Block headers we use for sort scoping.
     86 _BLOCK_HDR = re.compile(r"^(\s*)(\w[\w]*)\s+(\[|\{)\s*$")
     87 
     88 
     89 def _strip_line(line):
     90     rstripped = line.rstrip("\n")
     91     if _DROP_FIELD_RE.match(rstripped):
     92         return None
     93     m = _NAME_STRX.match(rstripped)
     94     if m:
     95         return m.group(1) + "\n"
     96     m = _NAME_HEXBYTES.match(rstripped)
     97     if m:
     98         return m.group(1) + "\n"
     99     return line
    100 
    101 
    102 def _block_key(block_lines, prefer_keys=("Name", "Offset")):
    103     """Pick a sort key from a block: first matching field wins."""
    104     for k in prefer_keys:
    105         for ln in block_lines:
    106             m = re.match(r"^\s*" + re.escape(k) + r":\s+(.+?)\s*$", ln)
    107             if m:
    108                 return m.group(1)
    109     return ""
    110 
    111 
    112 def _split_top_blocks(lines):
    113     """Walk lines at one indent level; group into blocks bounded by
    114     matching '{...}'. Returns (prefix, blocks, suffix) where blocks is
    115     a list of [lines] each starting with 'Foo {' and ending '}'."""
    116     prefix, suffix = [], []
    117     blocks = []
    118     i = 0
    119     n = len(lines)
    120     started = False
    121     base_indent = None
    122     while i < n:
    123         line = lines[i]
    124         stripped = line.rstrip("\n")
    125         # Detect a "Symbol {" or "Section {" or "Relocation {" block start.
    126         m = re.match(r"^(\s*)(\w+)\s+\{\s*$", stripped)
    127         if m and m.group(2) in ("Symbol", "Section", "Relocation"):
    128             if base_indent is None:
    129                 base_indent = len(m.group(1))
    130             indent = len(m.group(1))
    131             if indent == base_indent:
    132                 started = True
    133                 # Collect until the matching '}'
    134                 depth = 1
    135                 blk = [line]
    136                 i += 1
    137                 while i < n and depth > 0:
    138                     bl = lines[i]
    139                     bs = bl.rstrip("\n")
    140                     if re.match(r"^\s*\w[\w ]*\{\s*$", bs) or bs.endswith(" {"):
    141                         depth += 1
    142                     elif re.match(r"^\s*\}\s*$", bs):
    143                         depth -= 1
    144                     blk.append(bl)
    145                     i += 1
    146                 blocks.append(blk)
    147                 continue
    148         if not started:
    149             prefix.append(line)
    150         else:
    151             suffix.append(line)
    152         i += 1
    153     return prefix, blocks, suffix
    154 
    155 
    156 def _normalize_block_internals(block_lines):
    157     """For a Section block containing Relocations { ... } or a
    158     Relocations { Section { ... } } block — sort inner relocation
    159     entries by Offset."""
    160     # Find inner "Relocations [" or "Relocation {" sub-blocks and sort.
    161     # Two cases this matters:
    162     #   1) top-level Sections [] entries don't carry their own relocs in
    163     #      llvm-readobj's macho output — relocs live under Relocations [].
    164     #   2) Relocations [ Section { Relocation { ... } ... } ]
    165     return block_lines
    166 
    167 
    168 def normalize(text):
    169     # First apply per-line strips.
    170     lines = []
    171     for raw in text.splitlines(keepends=True):
    172         out = _strip_line(raw)
    173         if out is None:
    174             continue
    175         # Drop SectionData byte-row column offsets like "0000:" — keep
    176         # only the ASCII bytes. SectionData rows are positional; they
    177         # render the same content for both files at the same offsets.
    178         m = _SECTION_DATA_ROW.match(out)
    179         if m:
    180             # Keep just the hex+ascii portion after the column tag so
    181             # whitespace variations don't matter.
    182             out = re.sub(r"^\s*[0-9A-Fa-f]{4}:\s+", "    ", out)
    183         lines.append(out)
    184 
    185     # Now scan top-level "Symbols [" and "Relocations [" blocks and
    186     # sort their inner "Symbol {" / "Section { ... Relocation { ... } }"
    187     # entries.
    188     out_lines = []
    189     i = 0
    190     n = len(lines)
    191     while i < n:
    192         line = lines[i]
    193         m_sym = re.match(r"^Symbols\s*\[\s*$", line.rstrip("\n"))
    194         m_rel = re.match(r"^Relocations\s*\[\s*$", line.rstrip("\n"))
    195         if m_sym or m_rel:
    196             out_lines.append(line)
    197             i += 1
    198             # Collect everything until the matching closing "]"
    199             inner = []
    200             depth = 1
    201             while i < n and depth > 0:
    202                 ln = lines[i]
    203                 s = ln.rstrip("\n")
    204                 if re.match(r"^[A-Za-z][\w]*\s*\[\s*$", s) or s.endswith(" ["):
    205                     depth += 1
    206                 elif re.match(r"^\]\s*$", s):
    207                     depth -= 1
    208                     if depth == 0:
    209                         break
    210                 inner.append(ln)
    211                 i += 1
    212             # Split inner into Symbol/Section/Relocation blocks and sort.
    213             _, blocks, suffix = _split_top_blocks(inner)
    214             if m_sym:
    215                 blocks.sort(key=lambda b: _block_key(b, ("Name",)))
    216             else:
    217                 # Relocations [ Section { Relocation { Offset: N ... } } ]
    218                 # Sort the inner Relocation entries within each Section.
    219                 new_blocks = []
    220                 for blk in blocks:
    221                     p2, sub_blocks, suf2 = _split_top_blocks(blk[1:-1])
    222                     sub_blocks.sort(key=lambda b: _block_key(b, ("Offset",)))
    223                     rebuilt = [blk[0]] + p2
    224                     for sb in sub_blocks:
    225                         rebuilt.extend(sb)
    226                     rebuilt.extend(suf2)
    227                     rebuilt.append(blk[-1])
    228                     new_blocks.append(rebuilt)
    229                 blocks = new_blocks
    230             for b in blocks:
    231                 out_lines.extend(b)
    232             out_lines.extend(suffix)
    233             # Append the closing "]" line we left at lines[i]
    234             if i < n:
    235                 out_lines.append(lines[i])
    236                 i += 1
    237         else:
    238             out_lines.append(line)
    239             i += 1
    240 
    241     return "".join(out_lines)
    242 
    243 
    244 def main(argv):
    245     if len(argv) < 2:
    246         sys.stderr.write(__doc__)
    247         return 2
    248     cmd = argv[1]
    249     if cmd == "filter":
    250         text = sys.stdin.read()
    251     else:
    252         # Treat the arg as a file path; run llvm-readobj on it.
    253         text = _readobj(cmd)
    254     sys.stdout.write(normalize(text))
    255     return 0
    256 
    257 
    258 if __name__ == "__main__":
    259     sys.exit(main(sys.argv))