tbd_read.c (6230B)
1 /* Apple `.tbd` (text-based stub) reader. 2 * 3 * `.tbd` files describe a dylib's ABI surface in a YAML-shaped TAPI 4 * format — Apple ships them in the macOS SDK as a substitute for the 5 * actual .dylib bytes. kit's linker treats them as a peer of 6 * read_macho_dso: extract install-name + the exported symbol set, and 7 * surface that as an ObjBuilder full of defined OBJ_SEC_NONE entries. 8 * 9 * The TAPI grammar is intricate (per-target `exports:` blocks, weak 10 * symbols, Obj-C metadata, re-exports, ...). Rather than re-implement 11 * a YAML parser, this reader takes the conservative approach: it 12 * extracts the FIRST document's `install-name:` (the umbrella the 13 * consumer records in LC_LOAD_DYLIB) and then **scans the entire file 14 * for symbol-looking tokens** — sequences starting with `_` followed by 15 * identifier chars. The result is the union of every C, Obj-C class, 16 * weak, and re-exported symbol declared anywhere in the file. 17 * 18 * Why the union is safe: 19 * - kit's linker only consults the DSO's exported set to satisfy 20 * undefs. Including a symbol the consumer never references is 21 * harmless — the symbol simply never appears in our output's 22 * LC_LOAD_DYLIB chain. 23 * - dyld at runtime walks libSystem's full re-export graph to bind 24 * each name; our static-link decision (which dylib provides it) 25 * reduces to "the umbrella" anyway. We only need to convince the 26 * static linker that the name is bindable, then write 27 * LC_LOAD_DYLIB against the umbrella's install-name. 28 * 29 * The scanner skips the top-of-file `install-name:` line so its path 30 * (`/usr/lib/libSystem.B.dylib`) doesn't end up as a fake symbol — but 31 * since paths don't start with `_`, that wasn't actually a risk. 32 * 33 * Identifier alphabet: A-Z, a-z, 0-9, `_`, `$`, `.`. This matches 34 * Apple's C / Obj-C symbol mangling (e.g. `'_OBJC_CLASS_$_NSString'`, 35 * `'_pause$NOCANCEL'`). Tokens may be surrounded by single or double 36 * quotes — the scanner doesn't see those, since they aren't in the 37 * identifier alphabet, so a token like `'_pause$NOCANCEL'` matches as 38 * just `_pause$NOCANCEL`. */ 39 40 #include <string.h> 41 42 #include "core/heap.h" 43 #include "core/pool.h" 44 #include "core/slice.h" 45 #include "obj/obj.h" 46 47 static int is_id_start(u8 c) { return c == '_'; } 48 static int is_id_cont(u8 c) { 49 return (c == '_') || (c == '$') || (c == '.') || (c >= 'A' && c <= 'Z') || 50 (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'); 51 } 52 53 /* Extract the install-name from the first document. We look for a 54 * line beginning with "install-name:" and take the value up to EOL, 55 * then strip whitespace and surrounding quotes. Returns 0 if absent. */ 56 static Sym extract_install_name(Compiler* c, const u8* data, size_t len) { 57 static const char KEY[] = "install-name:"; 58 size_t klen = sizeof(KEY) - 1u; 59 for (size_t i = 0; i + klen <= len; ++i) { 60 /* Match at start of line (i==0 or preceded by '\n'). */ 61 if (i > 0 && data[i - 1] != '\n') continue; 62 if (memcmp(data + i, KEY, klen) != 0) continue; 63 /* Skip past the colon and surrounding whitespace. */ 64 size_t j = i + klen; 65 while (j < len && (data[j] == ' ' || data[j] == '\t')) ++j; 66 /* Take to EOL. */ 67 size_t start = j; 68 while (j < len && data[j] != '\n' && data[j] != '\r') ++j; 69 size_t end = j; 70 /* Strip trailing whitespace. */ 71 while (end > start && (data[end - 1] == ' ' || data[end - 1] == '\t' || 72 data[end - 1] == '\r')) 73 --end; 74 /* Strip surrounding single or double quotes. */ 75 if (end > start + 1u && (data[start] == '\'' || data[start] == '"') && 76 data[end - 1] == data[start]) { 77 ++start; 78 --end; 79 } 80 if (end > start) 81 return pool_intern_slice( 82 c->global, 83 (Slice){.s = (const char*)(data + start), .len = (u32)(end - start)}); 84 return 0; 85 } 86 return 0; 87 } 88 89 ObjBuilder* read_tbd(Compiler* c, const char* name, const u8* data, size_t len, 90 Sym* install_name_out) { 91 (void)name; 92 if (install_name_out) *install_name_out = 0; 93 if (!data || !len) compiler_panic(c, SRCLOC_NONE, "read_tbd: empty input"); 94 95 /* Validate magic: a tbd starts with `--- !tapi-tbd` (or any `---`). */ 96 if (len < 4 || data[0] != '-' || data[1] != '-' || data[2] != '-') 97 compiler_panic(c, SRCLOC_NONE, "read_tbd: not a tbd file (missing '---')"); 98 99 /* Reject obviously-wrong target arches up front so we don't stream a 100 * bunch of irrelevant symbols in. */ 101 switch (c->target.arch) { 102 case KIT_ARCH_ARM_64: 103 case KIT_ARCH_X86_64: 104 break; 105 default: 106 compiler_panic(c, SRCLOC_NONE, 107 "read_tbd: unsupported target arch %u for tbd lookup", 108 (u32)c->target.arch); 109 } 110 111 ObjBuilder* ob = obj_new(c); 112 if (!ob) compiler_panic(c, SRCLOC_NONE, "read_tbd: obj_new failed"); 113 114 if (install_name_out) *install_name_out = extract_install_name(c, data, len); 115 116 /* Token scanner: walk the file, emit every `_id` token as a defined 117 * external ObjSymbol. Tracking already-seen names via a tiny linear 118 * dedup list would be linear-quadratic on a multi-MB tbd; instead we 119 * rely on the pool's intern de-dup downstream — duplicate ObjSymbol 120 * names are tolerated by the linker's hash, with the second insert 121 * resolving to the existing entry on collision. */ 122 size_t i = 0; 123 while (i < len) { 124 /* Skip non-token bytes. */ 125 while (i < len && !is_id_start(data[i])) ++i; 126 if (i >= len) break; 127 size_t start = i; 128 while (i < len && is_id_cont(data[i])) ++i; 129 size_t tlen = i - start; 130 if (tlen == 0) continue; 131 /* Filter out the obvious YAML-key-like collisions: tokens that are 132 * field names ("_macos" doesn't occur, but be defensive). All 133 * Apple symbols start with `_` followed by another id char, so we 134 * keep tokens of length >= 2. Single `_` is the throwaway-name 135 * convention and never an exported symbol. */ 136 if (tlen < 2u) continue; 137 Sym sn = pool_intern_slice( 138 c->global, (Slice){.s = (const char*)(data + start), .len = (u32)tlen}); 139 obj_symbol_ex(ob, sn, SB_GLOBAL, SV_DEFAULT, SK_NOTYPE, OBJ_SEC_NONE, 0, 0, 140 0); 141 } 142 143 obj_finalize(ob); 144 return ob; 145 }