kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

object_detect.c (12236B)


      1 /* Binary format and target detection from object header bytes. */
      2 
      3 #include <kit/cg.h>
      4 #include <kit/config.h>
      5 #include <kit/object.h>
      6 
      7 #include "core/core.h"
      8 #include "obj/elf/elf.h"
      9 #include "obj/format.h"
     10 
     11 /* COFF Machine numbers recognized as the COFF *binary* format by
     12  * kit_detect_fmt. This is format membership only — the arch mapping (in
     13  * detect_coff) routes through the registry's coff_machine reverse map.
     14  *
     15  * The set here is intentionally broader than the registry's coff_machine
     16  * ops (which models only the link/codegen-supported AMD64 / ARM64): a COFF
     17  * header for any of these machines should still be classified as COFF so
     18  * the reader/target paths report the precise reason it cannot be consumed.
     19  * ARM64EC (0xA641) is plain AArch64 (same encoding, differing ABI); the
     20  * registry aliases it to ARM64 before lookup (coff.h). */
     21 static const u16 COFF_MACHINES[] = {
     22     0x8664, /* AMD64    */
     23     0x014C, /* I386     */
     24     0xAA64, /* ARM64    */
     25     0xA641, /* ARM64EC  */
     26     0x01C4, /* ARMNT    */
     27     0x5032, /* RISCV32  */
     28     0x5064, /* RISCV64  */
     29 };
     30 
     31 /* Is `machine` a COFF Machine number we classify as the COFF format? */
     32 static int coff_machine_known(u16 machine) {
     33   size_t i;
     34   for (i = 0; i < sizeof COFF_MACHINES / sizeof COFF_MACHINES[0]; i++) {
     35     if (COFF_MACHINES[i] == machine) return 1;
     36   }
     37   return 0;
     38 }
     39 
     40 KitBinFmt kit_detect_fmt(const uint8_t* data, size_t len) {
     41   u32 m;
     42   u16 coff_machine;
     43 
     44   if (!data) return KIT_BIN_UNKNOWN;
     45   if (len >= 8 && data[0] == '!' && data[1] == '<' && data[2] == 'a' &&
     46       data[3] == 'r' && data[4] == 'c' && data[5] == 'h' && data[6] == '>' &&
     47       data[7] == '\n') {
     48     return KIT_BIN_AR;
     49   }
     50   if (len >= 4 && data[0] == 0x7f && data[1] == 'E' && data[2] == 'L' &&
     51       data[3] == 'F') {
     52     return KIT_BIN_ELF;
     53   }
     54   if (len >= 4 && data[0] == 0x00 && data[1] == 'a' && data[2] == 's' &&
     55       data[3] == 'm') {
     56     return KIT_BIN_WASM;
     57   }
     58   if (len >= 4) {
     59     m = (u32)data[0] | ((u32)data[1] << 8) | ((u32)data[2] << 16) |
     60         ((u32)data[3] << 24);
     61     if (m == 0xFEEDFACEu || m == 0xFEEDFACFu || m == 0xCEFAEDFEu ||
     62         m == 0xCFFAEDFEu || m == 0xCAFEBABEu) {
     63       return KIT_BIN_MACHO;
     64     }
     65   }
     66   if (len >= 2 && data[0] == 'M' && data[1] == 'Z') {
     67     return KIT_BIN_PE;
     68   }
     69   if (len >= 2) {
     70     coff_machine = (u16)data[0] | ((u16)data[1] << 8);
     71     if (coff_machine_known(coff_machine)) return KIT_BIN_COFF;
     72   }
     73   /* Microsoft "short import" record: Sig1=0, Sig2=0xFFFF. Routed
     74    * through read_coff (which dispatches to read_coff_short_import).
     75    * The header continues with a Machine word, which we also sanity-
     76    * check so a stray 00 00 FF FF prefix on some other format does
     77    * not mis-route. */
     78   if (len >= 8 && data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFF &&
     79       data[3] == 0xFF) {
     80     u16 mach = (u16)data[6] | ((u16)data[7] << 8);
     81     if (coff_machine_known(mach)) return KIT_BIN_COFF;
     82   }
     83   return KIT_BIN_UNKNOWN;
     84 }
     85 
     86 static void detect_target_defaults(KitTargetSpec* t) {
     87   t->big_endian = 0;
     88   t->pic = KIT_PIC_NONE;
     89   t->code_model = KIT_CM_DEFAULT;
     90   t->float_abi = KIT_FLOAT_ABI_DEFAULT;
     91 }
     92 
     93 static void detect_set_ptr(KitTargetSpec* t, KitArchKind arch) {
     94   /* kit_arch_ptr_size is the single source of truth for an arch's byte
     95    * pointer width (cg.h); ptr_align tracks ptr_size for every arch this
     96    * detector resolves. wasm32 reports 4 here; the wasm path in
     97    * kit_detect_target sets its own spec and does not call detect_set_ptr,
     98    * so wasm64's 8-byte width is unaffected. */
     99   uint8_t w = kit_arch_ptr_size(arch);
    100   t->arch = arch;
    101   t->ptr_size = w;
    102   t->ptr_align = w;
    103 }
    104 
    105 static KitStatus detect_elf(const u8* d, size_t len, KitTargetSpec* out) {
    106   u8 ei_class, ei_data, ei_osabi;
    107   u16 e_machine;
    108   if (len < 20) return KIT_MALFORMED;
    109   ei_class = d[4];
    110   ei_data = d[5];
    111   ei_osabi = d[7];
    112   if (ei_data == 1) {
    113     e_machine = (u16)d[18] | ((u16)d[19] << 8);
    114   } else if (ei_data == 2) {
    115     e_machine = (u16)d[19] | ((u16)d[18] << 8);
    116   } else {
    117     return KIT_MALFORMED;
    118   }
    119 
    120   detect_target_defaults(out);
    121   out->big_endian = (ei_data == 2);
    122   out->obj = KIT_OBJ_ELF;
    123 
    124   /* Resolve the arch through the ELF format's machine reverse map
    125    * (obj_elf_machine_class also splits EM_RISCV into RV32/RV64 by
    126    * EI_CLASS). The registry models only the link/codegen arches, so the
    127    * legacy 32-bit ABI-classifiable machines it does not carry (EM_386,
    128    * EM_ARM) are mapped here explicitly to preserve detection. */
    129   {
    130     const ObjElfArchOps* ops = obj_elf_machine_class(e_machine, ei_class);
    131     if (ops) {
    132       detect_set_ptr(out, ops->arch);
    133     } else if (e_machine == 0x03) { /* EM_386 */
    134       detect_set_ptr(out, KIT_ARCH_X86_32);
    135     } else if (e_machine == 0x28) { /* EM_ARM */
    136       detect_set_ptr(out, KIT_ARCH_ARM_32);
    137     } else {
    138       return KIT_UNSUPPORTED;
    139     }
    140   }
    141   /* EI_CLASS must agree with the arch's pointer width: 32-bit arches are
    142    * ELFCLASS32, 64-bit arches ELFCLASS64. EM_RISCV is already disambiguated
    143    * by class above; this also rejects a class/machine mismatch such as a
    144    * 64-bit arch object whose EI_CLASS byte claims ELFCLASS32. */
    145   if (ei_class != ((out->ptr_size == 4) ? 1u : 2u)) return KIT_MALFORMED;
    146   if (ei_osabi == 0 || ei_osabi == 3)
    147     out->os = KIT_OS_LINUX;
    148   else if (ei_osabi == 9)
    149     out->os = KIT_OS_FREEBSD;
    150   else
    151     out->os = KIT_OS_FREESTANDING;
    152 
    153   /* Recover the float ABI from e_flags via the per-arch decoder so a detected
    154    * target selects the matching runtime variant (rv32 ilp32 soft vs ilp32f
    155    * single share an arch + pointer width and differ only here). e_flags is a
    156    * 4-byte LE field after the three native-width addr fields: offset 36 on
    157    * ELFCLASS32, 48 on ELFCLASS64. Only RISC-V supplies a decoder
    158    * (float_abi_from_e_flags non-NULL); other arches leave the default. The
    159    * decoder is only consulted when the header carries the full e_flags word,
    160    * leaving DEFAULT (the wildcard) when it is truncated. */
    161   {
    162     const ObjElfArchOps* ops = obj_elf_machine_class(e_machine, ei_class);
    163     if (ops && ops->float_abi_from_e_flags) {
    164       size_t flags_off = (ei_class == 1) ? 36u : 48u;
    165       if (len >= flags_off + 4u) {
    166         u32 e_flags;
    167         if (ei_data == 1)
    168           e_flags = (u32)d[flags_off] | ((u32)d[flags_off + 1] << 8) |
    169                     ((u32)d[flags_off + 2] << 16) |
    170                     ((u32)d[flags_off + 3] << 24);
    171         else
    172           e_flags = (u32)d[flags_off + 3] | ((u32)d[flags_off + 2] << 8) |
    173                     ((u32)d[flags_off + 1] << 16) | ((u32)d[flags_off] << 24);
    174         out->float_abi = (u8)ops->float_abi_from_e_flags(e_flags);
    175       }
    176     }
    177   }
    178   return KIT_OK;
    179 }
    180 
    181 /* Resolve a COFF Machine number to a KitArchKind through the registry's
    182  * coff_machine reverse map (which aliases ARM64EC -> ARM64). The registry
    183  * models only the link/codegen arches (AMD64 / ARM64); the legacy
    184  * ABI-classifiable machines it does not carry are mapped explicitly to
    185  * preserve detection. Returns 1 and writes *out on success; 0 if the
    186  * machine is unsupported. Shared by detect_coff (.obj) and detect_pe
    187  * (linked image). */
    188 static int coff_machine_to_arch(u16 machine, KitArchKind* out) {
    189   const ObjFormatImpl* fmt = obj_format_lookup(KIT_OBJ_COFF);
    190   const ObjCoffArchOps* ops =
    191       (fmt && fmt->coff_machine) ? fmt->coff_machine(machine) : NULL;
    192   if (ops) {
    193     *out = ops->arch;
    194   } else if (machine == 0x014Cu) { /* IMAGE_FILE_MACHINE_I386 */
    195     *out = KIT_ARCH_X86_32;
    196   } else if (machine == 0x01C4u) { /* IMAGE_FILE_MACHINE_ARMNT */
    197     *out = KIT_ARCH_ARM_32;
    198   } else if (machine == 0x5032u) { /* IMAGE_FILE_MACHINE_RISCV32 */
    199     *out = KIT_ARCH_RV32;
    200   } else if (machine == 0x5064u) { /* IMAGE_FILE_MACHINE_RISCV64 */
    201     *out = KIT_ARCH_RV64;
    202   } else {
    203     return 0;
    204   }
    205   return 1;
    206 }
    207 
    208 static KitStatus detect_coff(const u8* d, size_t len, KitTargetSpec* out) {
    209   u16 machine;
    210   KitArchKind arch;
    211   if (len < 2) return KIT_MALFORMED;
    212   machine = (u16)d[0] | ((u16)d[1] << 8);
    213   if (!coff_machine_to_arch(machine, &arch)) return KIT_UNSUPPORTED;
    214   detect_target_defaults(out);
    215   out->obj = KIT_OBJ_COFF;
    216   out->os = KIT_OS_WINDOWS;
    217   detect_set_ptr(out, arch);
    218   return KIT_OK;
    219 }
    220 
    221 /* PE image (DOS 'MZ' stub + "PE\0\0" signature). Unlike a bare .obj, the
    222  * COFF Machine word lives in the file header at e_lfanew+4, not at offset 0
    223  * (the DOS stub), so this can't reuse detect_coff's offset-0 read. Routes a
    224  * well-formed image to KIT_OBJ_COFF / KIT_OS_WINDOWS; read_coff then
    225  * dispatches the 'MZ' magic to read_coff_image. A 'MZ' prefix with no valid
    226  * PE signature (a DOS-only stub) is rejected as malformed. */
    227 static KitStatus detect_pe(const u8* d, size_t len, KitTargetSpec* out) {
    228   u32 e_lfanew, pe_sig;
    229   u16 machine;
    230   KitArchKind arch;
    231   if (len < 64) return KIT_MALFORMED; /* DOS header */
    232   if (!(d[0] == 'M' && d[1] == 'Z')) return KIT_MALFORMED;
    233   e_lfanew = (u32)d[60] | ((u32)d[61] << 8) | ((u32)d[62] << 16) |
    234              ((u32)d[63] << 24);
    235   /* Need the 4-byte PE signature + the 20-byte IMAGE_FILE_HEADER. */
    236   if ((u64)e_lfanew + 4u + 20u > (u64)len) return KIT_MALFORMED;
    237   pe_sig = (u32)d[e_lfanew] | ((u32)d[e_lfanew + 1] << 8) |
    238            ((u32)d[e_lfanew + 2] << 16) | ((u32)d[e_lfanew + 3] << 24);
    239   if (pe_sig != 0x00004550u) return KIT_MALFORMED; /* "PE\0\0" */
    240   machine = (u16)d[e_lfanew + 4] | ((u16)d[e_lfanew + 5] << 8);
    241   if (!coff_machine_to_arch(machine, &arch)) return KIT_UNSUPPORTED;
    242   detect_target_defaults(out);
    243   out->obj = KIT_OBJ_COFF;
    244   out->os = KIT_OS_WINDOWS;
    245   detect_set_ptr(out, arch);
    246   return KIT_OK;
    247 }
    248 
    249 static KitStatus detect_macho(const u8* d, size_t len, KitTargetSpec* out) {
    250   u32 magic, cputype;
    251   int swap, is64;
    252   if (len < 8) return KIT_MALFORMED;
    253   magic = (u32)d[0] | ((u32)d[1] << 8) | ((u32)d[2] << 16) | ((u32)d[3] << 24);
    254   switch (magic) {
    255     case 0xFEEDFACEu:
    256       swap = 0;
    257       is64 = 0;
    258       break;
    259     case 0xFEEDFACFu:
    260       swap = 0;
    261       is64 = 1;
    262       break;
    263     case 0xCEFAEDFEu:
    264       swap = 1;
    265       is64 = 0;
    266       break;
    267     case 0xCFFAEDFEu:
    268       swap = 1;
    269       is64 = 1;
    270       break;
    271     default:
    272       return KIT_MALFORMED;
    273   }
    274   if (!swap) {
    275     cputype =
    276         (u32)d[4] | ((u32)d[5] << 8) | ((u32)d[6] << 16) | ((u32)d[7] << 24);
    277   } else {
    278     cputype =
    279         (u32)d[7] | ((u32)d[6] << 8) | ((u32)d[5] << 16) | ((u32)d[4] << 24);
    280   }
    281   detect_target_defaults(out);
    282   out->obj = KIT_OBJ_MACHO;
    283   out->os = KIT_OS_MACOS;
    284 
    285   /* Resolve the arch through the Mach-O format's cputype reverse map. The
    286    * registry models only the link/codegen arches (ARM64 / X86_64); the
    287    * legacy 32-bit ABI-classifiable cputypes it does not carry (CPU_TYPE_X86,
    288    * CPU_TYPE_ARM) are mapped explicitly to preserve detection. */
    289   {
    290     const ObjFormatImpl* fmt = obj_format_lookup(KIT_OBJ_MACHO);
    291     const ObjMachoArchOps* ops =
    292         (fmt && fmt->macho_cputype) ? fmt->macho_cputype(cputype) : NULL;
    293     if (ops) {
    294       detect_set_ptr(out, ops->arch);
    295     } else if (cputype == 0x00000007u) { /* CPU_TYPE_X86 */
    296       detect_set_ptr(out, KIT_ARCH_X86_32);
    297     } else if (cputype == 0x0000000Cu) { /* CPU_TYPE_ARM */
    298       detect_set_ptr(out, KIT_ARCH_ARM_32);
    299     } else {
    300       return KIT_UNSUPPORTED;
    301     }
    302   }
    303   (void)is64;
    304   return KIT_OK;
    305 }
    306 
    307 KitStatus kit_detect_target(const uint8_t* data, size_t len,
    308                             KitTargetSpec* out) {
    309   KitBinFmt bin;
    310   if (!data || !out) return KIT_INVALID;
    311   bin = kit_detect_fmt(data, len);
    312   switch (bin) {
    313 #if KIT_OBJ_ELF_ENABLED
    314     case KIT_BIN_ELF:
    315       return detect_elf(data, len, out);
    316 #endif
    317 #if KIT_OBJ_COFF_ENABLED
    318     case KIT_BIN_PE:
    319       return detect_pe(data, len, out);
    320     case KIT_BIN_COFF:
    321       return detect_coff(data, len, out);
    322 #endif
    323 #if KIT_OBJ_MACHO_ENABLED
    324     case KIT_BIN_MACHO:
    325       return detect_macho(data, len, out);
    326 #endif
    327 #if KIT_OBJ_WASM_ENABLED
    328     case KIT_BIN_WASM: {
    329       KitTargetSpec t;
    330       t.big_endian = 0;
    331       t.pic = KIT_PIC_NONE;
    332       t.code_model = KIT_CM_DEFAULT;
    333       t.arch = KIT_ARCH_WASM;
    334       t.ptr_size = 4;
    335       t.ptr_align = 4;
    336       t.obj = KIT_OBJ_WASM;
    337       t.os = KIT_OS_WASI;
    338       *out = t;
    339       return KIT_OK;
    340     }
    341 #endif
    342     default:
    343       return KIT_UNSUPPORTED;
    344   }
    345 }