kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

tbd_read.c (6230B)


      1 /* Apple `.tbd` (text-based stub) reader.
      2  *
      3  * `.tbd` files describe a dylib's ABI surface in a YAML-shaped TAPI
      4  * format — Apple ships them in the macOS SDK as a substitute for the
      5  * actual .dylib bytes.  kit's linker treats them as a peer of
      6  * read_macho_dso: extract install-name + the exported symbol set, and
      7  * surface that as an ObjBuilder full of defined OBJ_SEC_NONE entries.
      8  *
      9  * The TAPI grammar is intricate (per-target `exports:` blocks, weak
     10  * symbols, Obj-C metadata, re-exports, ...).  Rather than re-implement
     11  * a YAML parser, this reader takes the conservative approach: it
     12  * extracts the FIRST document's `install-name:` (the umbrella the
     13  * consumer records in LC_LOAD_DYLIB) and then **scans the entire file
     14  * for symbol-looking tokens** — sequences starting with `_` followed by
     15  * identifier chars.  The result is the union of every C, Obj-C class,
     16  * weak, and re-exported symbol declared anywhere in the file.
     17  *
     18  * Why the union is safe:
     19  *   - kit's linker only consults the DSO's exported set to satisfy
     20  *     undefs.  Including a symbol the consumer never references is
     21  *     harmless — the symbol simply never appears in our output's
     22  *     LC_LOAD_DYLIB chain.
     23  *   - dyld at runtime walks libSystem's full re-export graph to bind
     24  *     each name; our static-link decision (which dylib provides it)
     25  *     reduces to "the umbrella" anyway.  We only need to convince the
     26  *     static linker that the name is bindable, then write
     27  *     LC_LOAD_DYLIB against the umbrella's install-name.
     28  *
     29  * The scanner does not special-case the `install-name:` line: its path
     30  * (`/usr/lib/libSystem.B.dylib`) is scanned like any other text and
     31  * yields a token only where it contains a `_`.
     32  *
     33  * Identifier alphabet: A-Z, a-z, 0-9, `_`, `$`, `.`.  This matches
     34  * Apple's C / Obj-C symbol mangling (e.g. `'_OBJC_CLASS_$_NSString'`,
     35  * `'_pause$NOCANCEL'`).  Tokens may be surrounded by single or double
     36  * quotes — the scanner doesn't see those, since they aren't in the
     37  * identifier alphabet, so a token like `'_pause$NOCANCEL'` matches as
     38  * just `_pause$NOCANCEL`. */
     39 
     40 #include <string.h>
     41 
     42 #include "core/heap.h"
     43 #include "core/pool.h"
     44 #include "core/slice.h"
     45 #include "obj/obj.h"
     46 
     47 static int is_id_start(u8 c) { return c == '_'; }
     48 static int is_id_cont(u8 c) {
     49   return (c == '_') || (c == '$') || (c == '.') || (c >= 'A' && c <= 'Z') ||
     50          (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
     51 }
     52 
     53 /* Extract the install-name from the first document.  We look for a
     54  * line beginning with "install-name:" and take the value up to EOL,
     55  * then strip whitespace and surrounding quotes.  Returns 0 if absent. */
     56 static Sym extract_install_name(Compiler* c, const u8* data, size_t len) {
     57   static const char KEY[] = "install-name:";
     58   size_t klen = sizeof(KEY) - 1u;
     59   for (size_t i = 0; i + klen <= len; ++i) {
     60     /* Match at start of line (i==0 or preceded by '\n'). */
     61     if (i > 0 && data[i - 1] != '\n') continue;
     62     if (memcmp(data + i, KEY, klen) != 0) continue;
     63     /* Skip past the colon and surrounding whitespace. */
     64     size_t j = i + klen;
     65     while (j < len && (data[j] == ' ' || data[j] == '\t')) ++j;
     66     /* Take to EOL. */
     67     size_t start = j;
     68     while (j < len && data[j] != '\n' && data[j] != '\r') ++j;
     69     size_t end = j;
     70     /* Strip trailing whitespace. */
     71     while (end > start && (data[end - 1] == ' ' || data[end - 1] == '\t' ||
     72                            data[end - 1] == '\r'))
     73       --end;
     74     /* Strip surrounding single or double quotes. */
     75     if (end > start + 1u && (data[start] == '\'' || data[start] == '"') &&
     76         data[end - 1] == data[start]) {
     77       ++start;
     78       --end;
     79     }
     80     if (end > start)
     81       return pool_intern_slice(
     82           c->global,
     83           (Slice){.s = (const char*)(data + start), .len = (u32)(end - start)});
     84     return 0;
     85   }
     86   return 0;
     87 }
     88 
     89 ObjBuilder* read_tbd(Compiler* c, const char* name, const u8* data, size_t len,
     90                      Sym* install_name_out) {
     91   (void)name;
     92   if (install_name_out) *install_name_out = 0;
     93   if (!data || !len) compiler_panic(c, SRCLOC_NONE, "read_tbd: empty input");
     94 
     95   /* Validate magic: a tbd starts with `--- !tapi-tbd` (or any `---`). */
     96   if (len < 4 || data[0] != '-' || data[1] != '-' || data[2] != '-')
     97     compiler_panic(c, SRCLOC_NONE, "read_tbd: not a tbd file (missing '---')");
     98 
     99   /* Reject obviously-wrong target arches up front so we don't stream a
    100    * bunch of irrelevant symbols in. */
    101   switch (c->target.arch) {
    102     case KIT_ARCH_ARM_64:
    103     case KIT_ARCH_X86_64:
    104       break;
    105     default:
    106       compiler_panic(c, SRCLOC_NONE,
    107                      "read_tbd: unsupported target arch %u for tbd lookup",
    108                      (u32)c->target.arch);
    109   }
    110 
    111   ObjBuilder* ob = obj_new(c);
    112   if (!ob) compiler_panic(c, SRCLOC_NONE, "read_tbd: obj_new failed");
    113 
    114   if (install_name_out) *install_name_out = extract_install_name(c, data, len);
    115 
    116   /* Token scanner: walk the file, emit every `_id` token as a defined
    117    * external ObjSymbol.  Tracking already-seen names via a tiny linear
    118    * dedup list would be linear-quadratic on a multi-MB tbd; instead we
    119    * rely on the pool's intern de-dup downstream — duplicate ObjSymbol
    120    * names are tolerated by the linker's hash, with the second insert
    121    * resolving to the existing entry on collision. */
    122   size_t i = 0;
    123   while (i < len) {
    124     /* Skip non-token bytes. */
    125     while (i < len && !is_id_start(data[i])) ++i;
    126     if (i >= len) break;
    127     size_t start = i;
    128     while (i < len && is_id_cont(data[i])) ++i;
    129     size_t tlen = i - start;
    130     if (tlen == 0) continue;
    131     /* Filter out the obvious YAML-key-like collisions: tokens that are
    132      * field names ("_macos" doesn't occur, but be defensive).  All
    133      * Apple symbols start with `_` followed by another id char, so we
    134      * keep tokens of length >= 2.  Single `_` is the throwaway-name
    135      * convention and never an exported symbol. */
    136     if (tlen < 2u) continue;
    137     Sym sn = pool_intern_slice(
    138         c->global, (Slice){.s = (const char*)(data + start), .len = (u32)tlen});
    139     obj_symbol_ex(ob, sn, SB_GLOBAL, SV_DEFAULT, SK_NOTYPE, OBJ_SEC_NONE, 0, 0,
    140                   0);
    141   }
    142 
    143   obj_finalize(ob);
    144   return ob;
    145 }