normalize.py (9082B)
#!/usr/bin/env python3
"""Canonicalize llvm-readobj output for Mach-O so two .o files with
equivalent semantic content compare equal.

Mach-O peer of test/elf/normalize.py — the structural-fidelity pivot
the path-R harness uses for KIT_TEST_OBJ=macho.  Differs from the
ELF normalizer because:

  * llvm-readobj's Mach-O output uses an indented "Section { ... }"
    block format, not a one-line-per-row layout.  The normalizer is a
    block transformer rather than a line transformer.
  * Mach-O symbol-name strtab packing order is implementation-defined
    (clang packs externals before locals; kit packs in symtab
    order).  The "(NN)" strx hint after each Name field is dropped so
    those orderings compare equal.
  * Section/segment name fields print with a trailing hex byte dump
    (Name: __text (5F 5F 74 65 78 74 ...)).  The hex tail is dropped
    — the printable name already captures it.
  * File offsets, RelocationOffset, addresses inside SectionData
    headers are layout-dependent and dropped.

Strips:
  - "(NN)" strtab offsets after Name: fields
  - hex byte dumps after section/segment names
  - "Index: N" (section index — re-derivable from order)
  - "Offset: N" (file offset; the section-relative Offset inside a
    "Relocation { ... }" entry is kept)
  - "RelocationOffset: 0xN"
  - "Address: 0xN" (where the section is positioned within the segment)
  - the SectionData hex dump line numbers — content is what matters
  - the "File:" header line (path varies between golden and rt)

Sorts:
  - Symbol entries within Symbols [] by Name (clang and kit partition
    differently within the same DYSYMTAB extents)
  - Relocations within each "Section <name> { ... }" group of
    Relocations [] by offset (compact one-line entries and expanded
    "Relocation { ... }" entries alike)

Invocation:
  normalize.py <file>   — runs `llvm-readobj` on file, normalizes.
  normalize.py filter   — reads stdin, writes normalized to stdout.
"""
import re
import shutil
import subprocess
import sys


def _which(*names):
    """Return the path of the first of *names* found on PATH, or None."""
    for n in names:
        p = shutil.which(n)
        if p:
            return p
    return None


def _readobj(path):
    """Run llvm-readobj on *path* and return its stdout as text.

    The tool's stderr is forwarded to our stderr.  Exits with status 77
    when no llvm-readobj binary is found, and with a non-zero status when
    the tool itself fails: a failed run would otherwise normalize to
    (nearly) empty text, and two broken objects would compare equal.
    """
    bin_path = _which("llvm-readobj", "readobj")
    if not bin_path:
        sys.stderr.write("normalize.py: cannot find llvm-readobj\n")
        sys.exit(77)
    args = [
        bin_path,
        "--section-headers",
        "--section-data",
        "--relocations",
        "--symbols",
        path,
    ]
    res = subprocess.run(args, capture_output=True, text=True)
    sys.stderr.write(res.stderr)
    if res.returncode != 0:
        sys.stderr.write(
            f"normalize.py: {bin_path} exited with status {res.returncode}\n"
        )
        # A negative returncode means "killed by signal"; map it to 1.
        sys.exit(res.returncode if res.returncode > 0 else 1)
    return res.stdout


# "Name: foo (NN)" — strip the strtab-offset hint
_NAME_STRX = re.compile(r"^(\s*Name:\s+\S+)\s+\(\d+\)\s*$")
# "Name: __text (5F 5F ...)" / "Segment: __TEXT (5F 5F ...)" — strip hex tail
_NAME_HEXBYTES = re.compile(
    r"^(\s*(?:Name|Segment):\s+\S+)\s+\([0-9A-Fa-f ]+\)\s*$"
)
# Lines whose value is layout-dependent and not load-bearing for fidelity.
_DROP_FIELD_RE = re.compile(
    r"^\s*(Index|Offset|RelocationOffset|Address|File):\s"
)
# SectionData hex rows: "  0000: 00008052 ..."
_SECTION_DATA_ROW = re.compile(r"^\s*[0-9A-Fa-f]{4}:\s+")
# Block headers we use for sort scoping.
_BLOCK_HDR = re.compile(r"^(\s*)(\w[\w]*)\s+(\[|\{)\s*$")
# Start of a sortable block.  A name may sit between the keyword and the
# brace: relocation groups are printed as "Section __text {".
_SORT_BLOCK_START = re.compile(
    r"^(\s*)(Symbol|Section|Relocation)\b[^{}]*\{\s*$"
)
# Generic "{" opener / "}" closer used for brace-depth tracking.
_BRACE_OPEN = re.compile(r"^\s*\w[\w ]*\{\s*$")
_BRACE_CLOSE = re.compile(r"^\s*\}\s*$")
# Expanded relocation entry and its Offset field.
_RELOC_OPEN = re.compile(r"^\s*Relocation\s*\{\s*$")
_RELOC_OFFSET = re.compile(r"^\s*Offset:\s")
# Compact relocation row: "0x18 1 2 1 TYPE 0 _sym".
_COMPACT_RELOC = re.compile(r"^\s*(0x[0-9A-Fa-f]+)\s")
# Column-0 lists whose entries get sorted, and the column-0 list closer.
_TOP_LIST_OPEN = re.compile(r"^(Symbols|Relocations)\s*\[\s*$")
_TOP_LIST_CLOSE = re.compile(r"^\]\s*$")


def _strip_line(line):
    """Apply the context-free per-line strips to *line*.

    Returns None when the line is a dropped field, the line with its
    "(NN)" strx hint or hex-byte tail removed when it is a Name/Segment
    field, and the line unchanged otherwise.
    """
    rstripped = line.rstrip("\n")
    if _DROP_FIELD_RE.match(rstripped):
        return None
    m = _NAME_STRX.match(rstripped)
    if m:
        return m.group(1) + "\n"
    m = _NAME_HEXBYTES.match(rstripped)
    if m:
        return m.group(1) + "\n"
    return line


def _strip_lines(raw_lines):
    """Run the per-line strips over *raw_lines* and return the kept lines.

    Unlike _strip_line alone, this keeps the "Offset:" field inside an
    expanded "Relocation { ... }" entry: there it is the section-relative
    relocation offset (and the relocation sort key), not a file offset.
    """
    kept = []
    in_reloc = False
    for raw in raw_lines:
        body = raw.rstrip("\n")
        if _RELOC_OPEN.match(body):
            in_reloc = True
        elif in_reloc:
            if _RELOC_OFFSET.match(body):
                kept.append(raw)
                continue
            # Any closer or other opener ends the entry, so a truncated
            # entry cannot leak the exemption into later blocks.
            if _BRACE_CLOSE.match(body) or body.rstrip().endswith(
                ("{", "[", "]")
            ):
                in_reloc = False
        out = _strip_line(raw)
        if out is None:
            continue
        # Drop the SectionData column tag ("0000:") and keep the bytes, so
        # whitespace variations in the row prefix don't matter.
        if _SECTION_DATA_ROW.match(out):
            out = _SECTION_DATA_ROW.sub(" ", out, count=1)
        kept.append(out)
    return kept


def _block_key(block_lines, prefer_keys=("Name", "Offset")):
    """Pick a sort key from a block: first matching field wins.

    Returns the field's value text, or "" when no preferred field exists.
    """
    for k in prefer_keys:
        field = re.compile(r"^\s*" + re.escape(k) + r":\s+(.+?)\s*$")
        for ln in block_lines:
            m = field.match(ln)
            if m:
                return m.group(1)
    return ""


def _offset_value(text):
    """Parse an offset such as "0x18" to an int; -1 when it cannot be parsed."""
    parts = text.split()
    if not parts:
        return -1
    try:
        return int(parts[0], 16)
    except ValueError:
        return -1


def _split_top_blocks(lines):
    """Walk lines at one indent level; group into blocks bounded by
    matching '{...}'.

    Returns (prefix, blocks, suffix): *blocks* is a list of line lists,
    each starting with a Symbol/Section/Relocation header and running to
    its matching '}'.  Lines before the first block go to *prefix*; other
    lines outside any block go to *suffix*.  The first block header seen
    fixes the indent level; headers at another indent are ordinary lines.
    """
    prefix, blocks, suffix = [], [], []
    base_indent = None
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i]
        m = _SORT_BLOCK_START.match(line.rstrip("\n"))
        if m:
            indent = len(m.group(1))
            if base_indent is None:
                base_indent = indent
            if indent == base_indent:
                # Collect until the matching '}' (or end of input).
                blk = [line]
                depth = 1
                i += 1
                while i < n and depth > 0:
                    cur = lines[i]
                    text = cur.rstrip("\n")
                    if _BRACE_OPEN.match(text) or text.endswith(" {"):
                        depth += 1
                    elif _BRACE_CLOSE.match(text):
                        depth -= 1
                    blk.append(cur)
                    i += 1
                blocks.append(blk)
                continue
        (suffix if blocks else prefix).append(line)
        i += 1
    return prefix, blocks, suffix


def _normalize_block_internals(block_lines):
    """Sort the relocation entries of one relocation-group Section block.

    *block_lines* is the block's header line, its body, and (unless the
    input was truncated) its closing '}'.  Expanded "Relocation { ... }"
    entries are sorted by their Offset field.  When the body consists
    only of compact one-line rows, those rows are sorted by their leading
    offset.  Both sorts are numeric and stable, so entries sharing an
    offset keep their relative order.  Any other body is left in its
    original order.  Returns the rebuilt list of lines.
    """
    if not block_lines:
        return block_lines
    header = block_lines[0]
    body = list(block_lines[1:])
    closer = []
    if body and _BRACE_CLOSE.match(body[-1].rstrip("\n")):
        closer = [body.pop()]
    lead, entries, trail = _split_top_blocks(body)
    if entries:
        entries.sort(key=lambda b: _offset_value(_block_key(b, ("Offset",))))
    elif lead and all(_COMPACT_RELOC.match(ln) for ln in lead):
        lead.sort(
            key=lambda ln: _offset_value(_COMPACT_RELOC.match(ln).group(1))
        )
    rebuilt = [header] + lead
    for entry in entries:
        rebuilt.extend(entry)
    rebuilt.extend(trail)
    rebuilt.extend(closer)
    return rebuilt


def normalize(text):
    """Return the canonical form of llvm-readobj Mach-O output *text*.

    First applies the per-line strips, then sorts the entries of the
    column-0 "Symbols [" list by Name and the relocation entries inside
    each Section group of the column-0 "Relocations [" list by offset.
    Lines that are not part of a sortable entry are always preserved.
    """
    lines = _strip_lines(text.splitlines(keepends=True))

    out_lines = []
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i]
        head = _TOP_LIST_OPEN.match(line.rstrip("\n"))
        out_lines.append(line)
        i += 1
        if not head:
            continue
        # Nested closers are always indented, so the list ends at the
        # first column-0 "]" (or at end of input if it was truncated).
        end = i
        while end < n and not _TOP_LIST_CLOSE.match(lines[end].rstrip("\n")):
            end += 1
        prefix, blocks, suffix = _split_top_blocks(lines[i:end])
        if head.group(1) == "Symbols":
            blocks.sort(key=lambda b: _block_key(b, ("Name",)))
        else:
            blocks = [_normalize_block_internals(b) for b in blocks]
        out_lines.extend(prefix)
        for b in blocks:
            out_lines.extend(b)
        out_lines.extend(suffix)
        # The closing "]" (if any) is emitted by the next iteration.
        i = end

    return "".join(out_lines)


def main(argv):
    """Command-line entry point; returns the process exit status.

    With "filter" as the first argument, normalizes stdin.  Any other
    first argument is treated as an object-file path to run llvm-readobj
    on.  With no argument, prints the usage text to stderr and returns 2.
    """
    if len(argv) < 2:
        sys.stderr.write(__doc__)
        return 2
    cmd = argv[1]
    if cmd == "filter":
        text = sys.stdin.read()
    else:
        # Treat the arg as a file path; run llvm-readobj on it.
        text = _readobj(cmd)
    sys.stdout.write(normalize(text))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))