normalize.py (9919B)
#!/usr/bin/env python3
"""Canonicalize llvm-readelf / llvm-objdump output so two ELFs with
equivalent semantic content compare equal.

Strips/normalizes:
  - file offsets, virtual addresses (replaced with "<addr>")
  - section/symbol indices (replaced with "<idx>")
  - string-table offsets that show up after a "name:" tag

Sorts:
  - the rows of every "Section Headers", "Symbol table",
    "Relocation section" and COMDAT group block, lexically by their
    normalized text (so file ordering does not show through)

Invocation:
  normalize.py readelf <file>  — runs `llvm-readelf -aW`, then normalizes
  normalize.py objdump <file>  — runs `llvm-objdump -drwhW`, then normalizes
  normalize.py filter          — reads stdin, writes normalized to stdout
"""
import os
import re
import shutil
import subprocess
import sys


# Printed instead of the module docstring when docstrings have been
# stripped (python -OO), in which case __doc__ is None.
_USAGE_FALLBACK = (
    "usage: normalize.py readelf <file>\n"
    "       normalize.py objdump <file>\n"
    "       normalize.py filter\n"
)


def _usage():
    """Write the usage text to stderr and return the usage exit status (2)."""
    sys.stderr.write(__doc__ or _USAGE_FALLBACK)
    return 2


def _which(*names):
    """Return the path of the first of *names* found on PATH, else None."""
    for n in names:
        p = shutil.which(n)
        if p:
            return p
    return None


def _run(tool_args, file_path):
    """Run a dump tool on *file_path* and return its stdout as text.

    tool_args is a pair ``(candidate_names, flags)``: the first candidate
    found on PATH is executed with ``flags + [file_path]``.  The tool's
    stderr is forwarded to our stderr.

    Exits with status 77 when none of the candidates is installed, and
    with status 1 when the tool fails without producing any output.
    """
    bin_path = _which(*tool_args[0])
    if not bin_path:
        sys.stderr.write("normalize.py: cannot find %s\n" % tool_args[0][0])
        sys.exit(77)
    # errors="replace": symbol names are arbitrary bytes, so a strict
    # decode could raise on an otherwise perfectly valid dump.
    res = subprocess.run([bin_path] + tool_args[1] + [file_path],
                         capture_output=True, text=True, errors="replace")
    sys.stderr.write(res.stderr)
    # A tool that failed and printed nothing would normalize to "", and
    # two such failures would compare equal.  Partial output is still
    # normalized (best effort) so the caller's diff can show what differs.
    if res.returncode != 0 and not res.stdout:
        sys.stderr.write("normalize.py: %s exited with status %d\n"
                         % (bin_path, res.returncode))
        sys.exit(1)
    return res.stdout


# Hex address: 0x[0-9a-f]+ (>=4 digits, to avoid clobbering small numerics like flags).
_HEX_ADDR = re.compile(r"0x[0-9a-fA-F]{4,}")
# Bare hex addresses (no 0x) only inside specific column contexts handled below.
_OFFSET_LINE = re.compile(r"^\s*Offset:\s+\S+\s*$")
# llvm-readelf section header lines:
#   [Nr] Name Type Address Off Size ES Flg Lk Inf Al
# Flg can be empty (sections like SHT_STRTAB / SHT_NOBITS-but-not-allocatable),
# so we parse positionally rather than with a single regex.
_SHDR_HEADER_RE = re.compile(
    r"^\s*\[\s*(\d+)\]\s+(\S+)\s+(\S+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(.*)$"
)
# llvm-readelf symbol table row:
#   Num: Value Size Type Bind Vis Ndx Name
_SYM_LINE = re.compile(
    r"^\s*(\d+):\s+([0-9a-fA-F]+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)$"
)
# llvm-readelf RELA row:
#   Offset Info Type Symbol's Value Symbol's Name + Addend
_RELA_LINE = re.compile(
    r"^\s*([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(\S+)\s+([0-9a-fA-F]+)\s+(.*)$"
)


# Sections kit's data model doesn't fully preserve. Drop them from
# the structural diff entirely so the rest of the comparison is
# meaningful. Each entry here is paired with a comment in the C
# implementation noting why; remove from this set when the model is
# extended.
_DROP_SHDR_NAMES = set()
# When the kit data model can't preserve a section across roundtrip,
# add its name here with a comment pointing at the underlying gap.
# Sections previously listed (.llvm_addrsig, .ARM.attributes) now
# round-trip via Section.ext_type / Section.ext_flags.


def _normalize_shdr(line):
    """Canonicalize one "Section Headers" row.

    Returns *line* unchanged when it is not a parsable section row, and
    "" when the section is listed in _DROP_SHDR_NAMES.
    """
    m = _SHDR_HEADER_RE.match(line)
    if not m:
        return line
    _nr, name, sh_type, _addr, _off, size, rest = m.groups()
    if name in _DROP_SHDR_NAMES:
        return ""
    toks = rest.split()
    # rest is "ES [Flg] Lk Inf Al" — Flg can be empty; treat as 4 or 5 tokens.
    if len(toks) == 5:
        es, flg, al = toks[0], toks[1], toks[4]
    elif len(toks) == 4:
        es, flg, al = toks[0], "", toks[3]
    else:
        return line
    # Drop nr/addr/off and the link/info indices (they reference other
    # sections positionally and vary with layout). Section semantics are
    # captured by name/type/flg/al; link relationships and exact byte
    # sizes re-emerge from the symtab/rela contents downstream.
    #
    # STRTAB size depends on whether the implementation does tail
    # merging (".eh_frame" stored as a suffix of ".rela.eh_frame", etc.),
    # which is an optimization, not a contract — strip the size for
    # STRTAB so equivalent-content tables compare equal.
    if sh_type == "STRTAB":
        return ("[<idx>] %s STRTAB es=%s flg=%s al=%s\n" % (name, es, flg, al))
    return ("[<idx>] %s %s size=%s es=%s flg=%s al=%s\n"
            % (name, sh_type, size, es, flg, al))


def _normalize_sym(line):
    """Canonicalize one "Symbol table" row; other lines pass through."""
    m = _SYM_LINE.match(line)
    if not m:
        return line
    _num, _val, size, sym_type, bind, vis, ndx, name = m.groups()
    # ndx as a numeric is layout-dependent; collapse all numeric ndx to
    # "DEF" (defined-in-some-section) and keep the special markers
    # (UND / ABS / COMMON / etc.).
    if ndx.isdigit():
        ndx = "DEF"
    return ("[<idx>] value=<addr> size=%s type=%s bind=%s vis=%s ndx=%s name=%s\n"
            % (size, sym_type, bind, vis, ndx, name))


def _normalize_rela(line):
    """Canonicalize one "Relocation section" row; other lines pass through."""
    m = _RELA_LINE.match(line)
    if not m:
        return line
    off, _info, rtype, _sym_val, sym_name = m.groups()
    return ("offset=%s type=%s sym_value=<addr> sym=%s\n"
            % (off, rtype, sym_name.strip()))


# llvm-readelf COMDAT group section header:
#   COMDAT group section [ N] `.group' [symbol_name] contains M sections:
_GROUP_HDR_RE = re.compile(
    r"^(COMDAT )?group section\s+\[\s*\d+\]\s+`([^']+)'\s+\[([^\]]+)\]",
    re.IGNORECASE,
)
# Member entry inside a group block:
#   [ N] .section_name
_GROUP_ENTRY_RE = re.compile(r"^\s*\[\s*(\d+)\]\s+(.+)$")


def _normalize_group_entry(line):
    """Canonicalize one COMDAT group member row.

    Replaces the numeric section index with <idx> and keeps the name.
    The "[Index] Name" header has no numeric index, so it passes through
    unchanged.
    """
    m = _GROUP_ENTRY_RE.match(line)
    if not m:
        return line
    return "[<idx>] %s\n" % m.group(2).strip()


# Row normalizer for each block kind. Every kind listed here is also
# sorted when its block is flushed.
_ROW_NORMALIZERS = {
    "shdr": _normalize_shdr,
    "sym": _normalize_sym,
    "rela": _normalize_rela,
    "group": _normalize_group_entry,
}


# Lines whose presence is sensitive to layout choices but says nothing
# semantic: count of headers, where they live, etc. Drop them entirely.
_DROP_PREFIXES = (
    " Start of section headers:",
    " Number of section headers:",
    " Section header string table index:",
    "There are ",            # "There are N section headers..."
    "Symbol table '",        # "Symbol table '.symtab' contains N entries"
    "Relocation section '",  # "Relocation section '.rela.X' at offset N..."
)
# readelf indents the ELF-header fields by two spaces, so an exact
# startswith() against the single-space spellings above never matches.
# Compare with leading whitespace removed from both sides instead.
_DROP_PREFIXES_NORMALIZED = tuple(p.lstrip() for p in _DROP_PREFIXES)


def _is_segment_mapping(line):
    """Return True for a line of the "Section to Segment mapping" table
    that this script rewrites or recognizes (the "None" row and the
    "Segment Sections..." heading)."""
    return line.lstrip().startswith("None ") or "Segment Sections..." in line


def normalize(text):
    """Return *text* (readelf/objdump output) in canonical form.

    Lines are grouped into blocks: a block starts at a "Symbol table",
    "Relocation section", "Section Headers:" or COMDAT group heading and
    ends at the next blank line or heading.  Rows inside a block are
    rewritten by the matching row normalizer and the block is sorted;
    lines outside any block only have 0x-prefixed addresses scrubbed.
    """
    out_blocks = []
    cur_block = []
    cur_kind = None  # "shdr", "sym", "rela", "group", or None

    def flush():
        nonlocal cur_block, cur_kind
        # Sort all block kinds; section ordering and symbol ordering are not
        # semantic. (Relocation sections within a relocation block are
        # already named, so sorting is fine.)
        if cur_kind in _ROW_NORMALIZERS:
            cur_block.sort()
        out_blocks.extend(cur_block)
        cur_block = []
        cur_kind = None

    for line in text.splitlines(keepends=True):
        # Block-start markers come first — even if their text matches
        # _DROP_PREFIXES, they still need to set cur_kind. We replace
        # the line itself with a stable canonical heading so the body
        # can be diffed without the count/offset suffix.
        if line.startswith("Symbol table"):
            flush()
            cur_kind = "sym"
            out_blocks.append("Symbol table:\n")
            continue
        if line.startswith("Relocation section"):
            flush()
            cur_kind = "rela"
            out_blocks.append("Relocation section:\n")
            continue
        if "Section Headers:" in line:
            flush()
            cur_kind = "shdr"
            out_blocks.append(line)
            continue
        gm = _GROUP_HDR_RE.search(line)
        if gm:
            flush()
            cur_kind = "group"
            # Canonical heading preserves the group symbol name but drops
            # the section index and member count — both are layout-dependent.
            out_blocks.append("COMDAT group [%s]:\n" % gm.group(3))
            continue

        if line.lstrip().startswith(_DROP_PREFIXES_NORMALIZED):
            continue

        # Section-to-segment mapping line: sort section names so file
        # ordering doesn't show through; drop any sections in the
        # known-not-preserved set.
        if cur_kind is None and _is_segment_mapping(line.rstrip("\n")):
            stripped = line.rstrip("\n")
            if "None" in stripped:
                head, _, tail = stripped.partition("None")
                names = sorted(n for n in tail.split() if n not in _DROP_SHDR_NAMES)
                line = head + "None " + " ".join(names) + "\n"

        # A blank line terminates whatever block is open.
        if not line.strip():
            flush()
            out_blocks.append(line)
            continue

        row_normalizer = _ROW_NORMALIZERS.get(cur_kind)
        if row_normalizer is not None:
            cur_block.append(row_normalizer(line))
            continue

        # Default: scrub addresses outside section bodies too.
        out_blocks.append(_HEX_ADDR.sub("<addr>", line))

    flush()
    return "".join(out_blocks)


# Sub-commands that dump a file with an external tool:
# command -> (candidate executable names, flags).
_TOOLS = {
    "readelf": (["llvm-readelf", "readelf"], ["-aW"]),
    "objdump": (["llvm-objdump", "objdump"], ["-drwhW"]),
}


def main(argv):
    """Command-line entry point; *argv* is the full argument vector.

    Returns 0 on success and 2 (after printing the usage text to stderr)
    on any usage error.
    """
    if len(argv) < 2:
        return _usage()
    cmd = argv[1]
    if cmd in _TOOLS:
        if len(argv) != 3:
            return _usage()
        text = _run(_TOOLS[cmd], argv[2])
    elif cmd == "filter":
        text = sys.stdin.read()
    else:
        return _usage()
    sys.stdout.write(normalize(text))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))