kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 106346286e80596ca1f647a9ec69c2120494d34a
parent b212b95462ccf02a5f87c6ee4ee1bd24a8e6cb6e
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri,  8 May 2026 16:59:16 -0700

include/cfree.h: JIT sessions, debug info, target pic/code-model

Diffstat:
M include/cfree.h | 654 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 637 insertions(+), 17 deletions(-)

diff --git a/include/cfree.h b/include/cfree.h @@ -17,6 +17,9 @@ typedef struct CfreeCompiler CfreeCompiler; typedef struct CfreeObjBuilder CfreeObjBuilder; typedef struct CfreeJit CfreeJit; +typedef struct CfreeJitSession CfreeJitSession; +typedef struct CfreeObjFile CfreeObjFile; +typedef struct CfreeDebugInfo CfreeDebugInfo; /* ============================================================ * Source locations (carried in diagnostics) @@ -28,6 +31,20 @@ typedef struct CfreeSrcLoc { } CfreeSrcLoc; /* ============================================================ + * Frame snapshot + * ============================================================ + * Canonical register snapshot. Shared by the unwinder (cfree_dwarf_unwind_step) + * and the JIT session's stop notifications (CfreeStopInfo.regs). `pc` and + * `cfa` are the program counter and canonical frame address; `regs` uses the + * target arch's DWARF register numbering. Registers beyond the arch's defined + * range are zero. */ +typedef struct CfreeUnwindFrame { + uint64_t pc; + uint64_t cfa; + uint64_t regs[32]; +} CfreeUnwindFrame; + +/* ============================================================ * Host-implemented interfaces (vtables) * ============================================================ * Heap, DiagSink, and Writer are implemented *outside* libcfree, by the @@ -100,6 +117,22 @@ typedef enum CfreeObjFmt { CFREE_OBJ_WASM, } CfreeObjFmt; +typedef enum CfreePic { + CFREE_PIC_NONE, + CFREE_PIC_PIC, + CFREE_PIC_PIE, +} CfreePic; + +/* CFREE_CM_DEFAULT is resolved per-arch (small on x86-64/AArch64, medium on + * RISC-V, etc.). PIC and code-model are independent: -fPIC -mcmodel=small and + * -fPIE -mcmodel=medium are both coherent. 
*/ +typedef enum CfreeCodeModel { + CFREE_CM_DEFAULT, + CFREE_CM_SMALL, + CFREE_CM_MEDIUM, + CFREE_CM_LARGE, +} CfreeCodeModel; + typedef struct CfreeTarget { CfreeArchKind arch; CfreeOSKind os; @@ -107,8 +140,15 @@ typedef struct CfreeTarget { uint8_t ptr_size; /* 4 or 8 */ uint8_t ptr_align; uint8_t big_endian; + uint8_t pic; /* CfreePic; default CFREE_PIC_NONE */ + uint8_t code_model; /* CfreeCodeModel; default CFREE_CM_DEFAULT */ } CfreeTarget; +/* JIT note: cfree_link_jit and cfree_jit_from_image force pic = CFREE_PIC_PIC + * regardless of caller input — the mmap'd image's address is unknown until + * map time. The override happens at the linker entry, not silently inside the + * backend. */ + /* ============================================================ * Host environment * ============================================================ @@ -169,6 +209,115 @@ const uint8_t* cfree_writer_mem_bytes(CfreeWriter*, size_t* len_out); void cfree_jit_free (CfreeJit*); void* cfree_jit_lookup(CfreeJit*, const char* name); +/* ----- JIT image inspection ----- + * + * cfree_jit_view borrows a CfreeObjFile over the loaded JIT image. Lets the + * driver feed the JIT to objdump/dwarf consumers without round-tripping the + * image to bytes. The returned pointer is owned by the CfreeJit and is + * invalidated by cfree_jit_free; callers must not call cfree_obj_close on it. + * + * cfree_jit_addr_to_sym is the reverse of cfree_jit_lookup: maps a runtime + * PC to the enclosing global symbol. Returns 0 on success and 1 when no + * symbol contains `addr`. The interned name string is valid until + * cfree_jit_free. */ +const CfreeObjFile* cfree_jit_view (CfreeJit*); +int cfree_jit_addr_to_sym (CfreeJit*, uint64_t addr, + const char** name_out, + uint64_t* off_out); + +/* ----- JIT session: controlled execution ----- + * + * A session wraps the JIT in a worker thread and a per-arch trap engine + * (software breakpoint patch + single-step / displaced-step trampoline). 
+ * The library owns all signal handling, ucontext extraction, and per-arch + * trap-byte / single-step machinery. The driver side uses the session to + * call into the JIT'd code and is notified of stops (breakpoint, fault, + * exit) via the blocking session_call/session_resume entries. + * + * Threading model: a single worker thread runs the JIT entry. session_call + * and session_resume block the caller until the worker stops; the worker + * is parked on stop and resumed from the next session_resume. Only one + * thread may drive the session at a time. + * + * Lifetime: the CfreeJit must outlive the CfreeJitSession. cfree_jit_free + * tears down a session implicitly if one is still attached, but explicit + * cfree_jit_session_free is preferred so the worker thread is joined + * deterministically. + * + * Breakpoints: set/clear via session_breakpoint_set/_clear. The trap byte + * patch and arch-specific single-step trampoline (used to step over the + * patched instruction during resume) are entirely internal. The session + * dedupes addresses; setting a breakpoint at an existing address returns + * the original handle. */ + +typedef enum CfreeStopKind { + CFREE_STOP_BREAKPOINT, /* worker hit a breakpoint we set */ + CFREE_STOP_SIGNAL, /* worker took a fault we did not arm */ + CFREE_STOP_EXIT, /* worker entry returned normally */ + CFREE_STOP_INTERRUPT, /* host requested via session_interrupt */ +} CfreeStopKind; + +typedef struct CfreeStopInfo { + CfreeStopKind kind; + int signal; /* host signo when kind == STOP_SIGNAL */ + int exit_code; /* worker return value when kind == EXIT */ + uint32_t bp_id; /* set bp handle when kind == BREAKPOINT */ + CfreeUnwindFrame regs; /* register snapshot at the stop site */ +} CfreeStopInfo; + +typedef enum CfreeResumeMode { + CFREE_RESUME_CONTINUE, /* run until next stop or exit */ + CFREE_RESUME_STEP_INSN, /* execute one machine instruction */ +} CfreeResumeMode; + +/* Entry-point signature dispatched by session_call. 
The library is + * responsible for marshalling argv/argc into the worker's ABI; the driver + * is shape-agnostic. New entry shapes extend the enum. */ +typedef enum CfreeEntryKind { + CFREE_ENTRY_INT_ARGV, /* int(int, char**) */ +} CfreeEntryKind; + +CfreeJitSession* cfree_jit_session_new (CfreeJit*); +void cfree_jit_session_free(CfreeJitSession*); + +/* Begin executing `entry` with `argv`. Blocks until the worker stops. + * `entry` must be a pointer returned by cfree_jit_lookup (or otherwise + * within the JIT image). Returns 0 on success (including an EXIT stop), + * nonzero on internal failure (worker spawn, OOM). On success *stop_out is + * populated. */ +int cfree_jit_session_call (CfreeJitSession*, void* entry, + CfreeEntryKind, int argc, char** argv, + CfreeStopInfo* stop_out); + +/* Resume the parked worker. Blocks until the next stop. Returns 0 on + * success, nonzero if no worker is parked. */ +int cfree_jit_session_resume(CfreeJitSession*, CfreeResumeMode, + CfreeStopInfo* stop_out); + +/* Asynchronously interrupt a running worker. Async-signal-safe: callable + * from a SIGINT handler in the host. The next stop event delivered to the + * driving thread will be CFREE_STOP_INTERRUPT. Returns 0 on a queued + * interrupt, nonzero if no worker is currently running. */ +int cfree_jit_session_interrupt(CfreeJitSession*); + +/* Read `n` bytes from the worker's address space starting at `addr` into + * `dst`. Used by `p` and `x` in the dbg driver to dereference globals, + * frame-relative locals, and arbitrary user memory. Returns 0 on success + * and nonzero on a bad address or partial read; partial-read attempts do + * not modify `dst`. Safe to call only while the worker is parked at a + * stop. */ +int cfree_jit_session_read_mem(CfreeJitSession*, uint64_t addr, + void* dst, size_t n); + +/* Set / clear a breakpoint at `addr` (which must lie within the JIT image). + * On success, *bp_id_out is the session-local handle that future stop + * events will report. 
Idempotent: setting at an address that already has + * a breakpoint returns its existing handle. cfree_jit_session_breakpoint_clear + * silently succeeds on an unknown handle. */ +int cfree_jit_session_breakpoint_set (CfreeJitSession*, uint64_t addr, + uint32_t* bp_id_out); +int cfree_jit_session_breakpoint_clear(CfreeJitSession*, uint32_t bp_id); + /* Resolver invoked when the linker encounters an undefined symbol. Returning * NULL is an error. */ typedef void* (*CfreeExternResolver)(void* user, const char* name); @@ -222,11 +371,37 @@ typedef struct CfreePpOptions { uint32_t nundefines; } CfreePpOptions; +/* Path prefix remap entry. Applied by SourceManager whenever it produces a + * path for DWARF emission (DW_AT_comp_dir, DW_AT_name, line program). The + * first match wins. Diagnostic output uses original paths. */ +typedef struct CfreePathPrefixMap { + const char* old_prefix; + const char* new_prefix; +} CfreePathPrefixMap; + /* Per-TU compile knobs. */ typedef struct CfreeCompileOptions { int opt_level; /* 0 direct, 1 minimal, 2 full */ int debug_info; CfreePpOptions pp; + /* Reproducible-build knobs. `epoch` (Unix seconds) is consulted by every + * file emitter that would otherwise have written wall-clock time (COFF + * header, Mach-O LC_BUILD_VERSION, ar ar_date, DWARF producer). 0 means + * write no timestamp at all (the default). */ + uint64_t epoch; + const CfreePathPrefixMap* path_map; + uint32_t npath_map; + /* Diagnostic policy. + * + * `warnings_are_errors` (-Werror): warnings emitted to CfreeDiagSink are + * counted as errors for the compile_* return-value test and against + * max_errors. The sink's `warnings` counter is unaffected. + * + * `max_errors`: 0 means unlimited. When >0, the parser stops emitting + * after sink.errors reaches the cap (the Nth error is still emitted; the + * (N+1)th is not), and compile_* returns nonzero. */ + int warnings_are_errors; + uint32_t max_errors; } CfreeCompileOptions; /* Preprocess one C input. 
@@ -249,29 +424,265 @@ int cfree_preprocess (CfreeCompiler*, const CfreePpOptions* pp, * temporary builder before returning. The Writer is not closed. On nonzero * return the Writer may contain partial output and should not be consumed. * - * Returns 0 on success, nonzero on failure. */ + * Diagnostic model: report-all. Every error reachable by the parser's + * recovery rules is emitted to env.diag before return; the parser does not + * abort on routine syntax/semantic errors. These functions return 0 iff + * env.diag->errors == 0 at the end of the call (with warnings counting as + * errors when CfreeCompileOptions.warnings_are_errors is set). Returns + * nonzero on internal failures (OOM, invariant violation), where the + * underlying compiler_panic mechanism unwinds before return. */ int cfree_compile_obj (CfreeCompiler*, const CfreeCompileOptions*, const CfreeBytesInput* input, CfreeObjBuilder** out); int cfree_compile_obj_emit(CfreeCompiler*, const CfreeCompileOptions*, const CfreeBytesInput* input, CfreeWriter* out); +/* ----- Assembly entries (GAS subset) ----- + * + * Assemble one text source into a relocatable object. Diagnostic and panic + * semantics match cfree_compile_obj / cfree_compile_obj_emit: report-all to + * env.diag, return 0 iff env.diag->errors == 0 at end of call (warnings count + * as errors when warnings_are_errors is set), nonzero on internal failures. + * + * cfree_assemble_obj returns a finalized CfreeObjBuilder owned by the + * CfreeCompiler; pass it to cfree_link_exe / cfree_link_jit. The builder must + * be alive until the linker has consumed it; the CfreeCompiler must outlive + * it. + * + * cfree_assemble_obj_emit writes the encoded object to `out` and frees its + * temporary builder before returning. The Writer is not closed. On nonzero + * return the Writer may contain partial output and should not be consumed. + * + * Inline asm in the C parser reuses the same parser internally; no separate + * public entry. 
*/ +typedef struct CfreeAsmOptions { + int debug_info; /* generate .debug_line from .file/.loc */ + /* Reproducible-build knobs; semantics match CfreeCompileOptions. */ + uint64_t epoch; + const CfreePathPrefixMap* path_map; + uint32_t npath_map; + /* Diagnostic policy; semantics match CfreeCompileOptions. */ + int warnings_are_errors; + uint32_t max_errors; +} CfreeAsmOptions; + +int cfree_assemble_obj (CfreeCompiler*, const CfreeAsmOptions*, + const CfreeBytesInput* input, + CfreeObjBuilder** out); +int cfree_assemble_obj_emit(CfreeCompiler*, const CfreeAsmOptions*, + const CfreeBytesInput* input, + CfreeWriter* out); + +/* ----- Header-dependency iteration ----- + * + * Walks the include edges recorded by SourceManager during a preceding + * cfree_preprocess or cfree_compile_obj* call. The library hands out raw + * edges; formatting (Make rules, ninja, JSON) is the driver's job. + * + * cfree_dep_iter_next returns 1 and fills `*out` for each remaining edge, + * 0 when iteration is exhausted. The strings in CfreeDepEdge alias storage + * owned by the CfreeCompiler and are valid until the next preprocess/ + * compile call or compiler_free, whichever comes first. + * + * `includer_name` and `included_name` are the *resolved* paths SourceManager + * actually opened — the same byte sequences passed to CfreeFileIO.read_all. + * They are not the literal include token text; a `#include "x.h"` resolved + * via -I to /abs/inc/x.h reports `/abs/inc/x.h`. This is what build systems + * need: a Make rule emitted from these strings refers to files the build + * tool will stat on rebuild. + * + * `from_system_path` distinguishes headers found through a system include + * path (-isystem, sysroot, builtin) from user headers found via -I or the + * source's own directory. This is the GCC `-MM` filter: drop edges whose + * `from_system_path` is set. 
It is set by the include-path resolver, NOT by + * the include syntax — `#include <myheader.h>` resolved through -I is a + * user header (from_system_path=0); `#include "stdio.h"` resolved through a + * system path is a system header (from_system_path=1). + * + * `bracketed` records the include *syntax* (1 for `<…>`, 0 for `"…"`). Tools + * that round-trip include directives (formatters, IDE indexers) want the + * lexical fact; -MM filtering does not. + * + * Edges are reported across all TUs processed since compiler_new; callers + * that want a single TU's edges filter by `includer_name`. */ +typedef struct CfreeDepIter CfreeDepIter; + +typedef struct CfreeDepEdge { + const char* includer_name; /* resolved path; same string given to read_all */ + const char* included_name; /* resolved path; same string given to read_all */ + CfreeSrcLoc include_loc; + uint8_t from_system_path; /* 1 if resolved via a system include path */ + uint8_t bracketed; /* 1 if syntax was <…>; 0 for "…" */ + uint8_t pad[2]; +} CfreeDepEdge; + +CfreeDepIter* cfree_dep_iter_new (CfreeCompiler*); +int cfree_dep_iter_next(CfreeDepIter*, CfreeDepEdge* out); +void cfree_dep_iter_free(CfreeDepIter*); + +/* Build-ID emission mode (ELF .note.gnu.build-id and friends). */ +typedef enum CfreeBuildIdMode { + CFREE_BUILDID_NONE, /* no build-id note (default) */ + CFREE_BUILDID_SHA256, /* hash all input section bytes in + * stable order — reproducible */ + CFREE_BUILDID_UUID, /* random; opt-in, not reproducible */ + CFREE_BUILDID_USER, /* caller-supplied bytes */ +} CfreeBuildIdMode; + +/* ============================================================ + * Linker script (structured) + * ============================================================ + * The linker accepts only the structured form. Programmatic build systems + * construct a CfreeLinkScript directly; hosts that prefer GNU-ld text feed + * the optional cfree_link_script_parse helper, which yields the same + * structured form. 
The data model makes the supported semantics + * inspectable rather than implicit in a parser. + * + * All pointers in a CfreeLinkScript are borrowed: the script and every + * sub-object (expressions, regions, sections, assignments, name strings) + * must outlive the call to cfree_link_exe / cfree_link_jit that consumes + * it. cfree_link_script_parse arena-owns its result; cfree_link_script_free + * releases everything reachable from a parser-produced script. */ + +typedef struct CfreeLinkExpr CfreeLinkExpr; + +typedef enum CfreeLinkExprKind { + CFREE_LE_INT, /* int_val */ + CFREE_LE_DOT, /* current location counter */ + CFREE_LE_SYM, /* name */ + CFREE_LE_REGION_ORIGIN, /* name = MEMORY region */ + CFREE_LE_REGION_LENGTH, /* name = MEMORY region */ + CFREE_LE_ADD, CFREE_LE_SUB, CFREE_LE_MUL, CFREE_LE_DIV, + CFREE_LE_AND, CFREE_LE_OR, CFREE_LE_XOR, + CFREE_LE_SHL, CFREE_LE_SHR, + CFREE_LE_ALIGN, /* ALIGN(val, align) */ + CFREE_LE_MAX, CFREE_LE_MIN, +} CfreeLinkExprKind; + +struct CfreeLinkExpr { + uint8_t kind; /* CfreeLinkExprKind */ + union { + int64_t int_val; + const char* name; + struct { const CfreeLinkExpr *lhs, *rhs; } bin; + struct { const CfreeLinkExpr *val, *align; } align; + } v; +}; + +typedef enum CfreeLinkRegionFlag { + CFREE_LRF_R = 1u << 0, + CFREE_LRF_W = 1u << 1, + CFREE_LRF_X = 1u << 2, +} CfreeLinkRegionFlag; + +typedef struct CfreeLinkRegion { + const char* name; + uint8_t flags; /* CfreeLinkRegionFlag mask */ + uint64_t origin; + uint64_t length; +} CfreeLinkRegion; + +typedef struct CfreeLinkInputMatch { + const char* file_pattern; /* NULL == "*" */ + const char* section_pattern; + int keep; /* nonzero: exempt from --gc-sections */ +} CfreeLinkInputMatch; + +typedef enum CfreeLinkAsnKind { + CFREE_LAS_DOT, /* . 
= expr; sym ignored */ + CFREE_LAS_SYM, /* sym = expr */ + CFREE_LAS_PROVIDE, /* PROVIDE(sym = expr) */ +} CfreeLinkAsnKind; + +typedef struct CfreeLinkAssignment { + uint8_t kind; /* CfreeLinkAsnKind */ + const char* sym; /* unused for CFREE_LAS_DOT */ + const CfreeLinkExpr* expr; +} CfreeLinkAssignment; + +typedef struct CfreeLinkOutputSection { + const char* name; + const CfreeLinkExpr* vma; /* NULL: from region/dot */ + const CfreeLinkExpr* lma; /* NULL: equal to vma */ + const CfreeLinkInputMatch* inputs; + uint32_t ninputs; + const char* region; /* > REGION; NULL if absent */ + const char* load_region; /* AT> REGION; NULL if absent */ + const CfreeLinkAssignment* asns; + uint32_t nasns; +} CfreeLinkOutputSection; + +typedef struct CfreeLinkScript { + const char* entry; /* NULL: use CfreeLinkOptions.entry */ + const CfreeLinkRegion* regions; + uint32_t nregions; + const CfreeLinkOutputSection* sections; /* in declaration order */ + uint32_t nsections; + const CfreeLinkAssignment* top_asns; /* outside any SECTIONS{} */ + uint32_t ntop_asns; +} CfreeLinkScript; + +/* Parse GNU-ld-subset text into a structured script. The compiler arena + * owns the result; cfree_link_script_free releases it. The supported v1 + * subset is: + * ENTRY(symbol) + * MEMORY { name (rwx) : ORIGIN = expr, LENGTH = expr } + * SECTIONS { ... } with output sections in declaration order + * Input rules `*(.section.glob)` or `file.o(.section)` + * KEEP(...) for --gc-sections opt-out + * PROVIDE(sym = expr), plain `sym = expr`, `. = expr` + * `> REGION` and `AT> REGION` placement + * Operators + - * / & | ^ << >>, ALIGN(expr, align), MAX(a,b), MIN(a,b) + * slash-star block comments + * Anything outside the subset (OVERLAY, VERSION, INSERT BEFORE/AFTER, + * OUTPUT_FORMAT, INPUT, GROUP, elaborate file patterns, other operators) + * is rejected with a diagnostic and the call returns nonzero with *out + * unchanged. Returns 0 on success. 
*/ +int cfree_link_script_parse(CfreeCompiler*, const char* text, size_t len, + const CfreeLinkScript** out); +void cfree_link_script_free (CfreeCompiler*, const CfreeLinkScript*); + +/* Per-archive flags. Object-file inputs (obj_bytes) keep the plain + * CfreeBytesInput[] shape — only archives carry these knobs. */ +typedef enum CfreeLinkArchFlag { + CFREE_LAF_NONE = 0, + /* Pull every member of the archive in regardless of whether its symbols + * satisfy an undef. Equivalent to GNU ld --whole-archive. */ + CFREE_LAF_WHOLE_ARCHIVE = 1u << 0, +} CfreeLinkArchFlag; + +/* Archive input with linker-side flags. `group_id` clusters archives into a + * cyclic resolution group: archives sharing a nonzero id are scanned + * cyclically until no new symbols are pulled in (equivalent to GNU ld + * --start-group ... --end-group). `group_id == 0` (default) means linear + * single-pass. */ +typedef struct CfreeBytesInputArchive { + CfreeBytesInput input; + uint8_t flags; /* bitmask of CfreeLinkArchFlag */ + uint8_t group_id; /* 0 = none; same nonzero = same cyclic group */ +} CfreeBytesInputArchive; + typedef struct CfreeLinkOptions { - CfreeObjBuilder* const* objs; /* fresh-compiled, by reference */ - uint32_t nobjs; - const CfreeBytesInput* obj_bytes; - uint32_t nobj_bytes; - const CfreeBytesInput* archives; - uint32_t narchives; - const char* linker_script_text; /* NULL = no script. - * Non-NULL: linker_script_len - * must match the buffer. */ - size_t linker_script_len; + CfreeObjBuilder* const* objs; /* fresh-compiled, by reference */ + uint32_t nobjs; + const CfreeBytesInput* obj_bytes; + uint32_t nobj_bytes; + const CfreeBytesInputArchive* archives; + uint32_t narchives; + /* Structured linker script. NULL means no script (target/format default + * layout). Borrowed: must outlive the cfree_link_* call. 
*/ + const CfreeLinkScript* linker_script; const char* entry; /* NULL = format/target default */ CfreeExternResolver extern_resolver; void* extern_resolver_user; + /* Build-ID. `build_id_mode` is a CfreeBuildIdMode. `build_id_bytes` / + * `build_id_len` are consulted only when mode == CFREE_BUILDID_USER. */ + uint8_t build_id_mode; + const uint8_t* build_id_bytes; + uint32_t build_id_len; } CfreeLinkOptions; -/* All bytes inputs (obj_bytes, archives) must remain alive until the +/* All bytes inputs (obj_bytes, archives — including the CfreeBytesInput + * nested inside each CfreeBytesInputArchive) must remain alive until the * matching cfree_link_* call returns. */ /* Link to executable. Writer is not closed by the call. On nonzero return @@ -324,6 +735,20 @@ typedef struct CfreeOptions { CfreeExternResolver extern_resolver; void* extern_resolver_user; + /* Reproducibility — forwarded to the underlying compile/link options. */ + uint64_t epoch; + const CfreePathPrefixMap* path_map; + uint32_t npath_map; + uint8_t build_id_mode; + const uint8_t* build_id_bytes; + uint32_t build_id_len; + + /* Diagnostic policy — forwarded to CfreeCompileOptions. cfree_run + * returns nonzero when any per-TU compile reports errors (or when a + * subsequent link/jit step fails). */ + int warnings_are_errors; + uint32_t max_errors; + CfreeJit** out_jit; /* JIT only: caller owns on success */ } CfreeOptions; @@ -428,14 +853,197 @@ CfreeObjSymIter* cfree_obj_symiter_new (CfreeObjFile*); int cfree_obj_symiter_next(CfreeObjSymIter*, CfreeObjSymInfo* out); void cfree_obj_symiter_free(CfreeObjSymIter*); +/* Raw bytes of a section. Returns a pointer aliasing storage owned by the + * CfreeObjFile and valid until cfree_obj_close. For BSS (no in-file bytes), + * returns NULL with `*len_out = 0`; the section's virtual size is on + * CfreeObjSecInfo.size. Out-of-range idx returns NULL with `*len_out = 0`. 
*/ +const uint8_t* cfree_obj_section_data(const CfreeObjFile*, uint32_t idx, + size_t* len_out); + +/* Expose the underlying CfreeObjBuilder for use with cfree_disasm_iter_new + * (so the disassembler can consult sym/reloc tables for annotation). The + * pointer is owned by the CfreeObjFile and is valid until cfree_obj_close. */ +CfreeObjBuilder* cfree_obj_builder(const CfreeObjFile*); + +/* Relocation iterator. Walks every relocation in the object across all + * sections in section-then-offset order. Strings are interned and valid + * until cfree_obj_close. */ +typedef struct CfreeObjReloc { + uint32_t section; /* 0-based section index the reloc applies to */ + uint64_t offset; /* offset within that section */ + uint32_t sym; /* opaque symbol id; CFREE_SECTION_NONE if none */ + const char* sym_name; /* interned; "" when sym is none/anonymous */ + int64_t addend; + uint32_t kind; /* arch-specific reloc type code */ + const char* kind_name; /* interned, e.g. "R_X86_64_PC32" */ +} CfreeObjReloc; + +typedef struct CfreeObjRelocIter CfreeObjRelocIter; + +CfreeObjRelocIter* cfree_obj_reliter_new (CfreeObjFile*); +int cfree_obj_reliter_next(CfreeObjRelocIter*, CfreeObjReloc* out); +void cfree_obj_reliter_free(CfreeObjRelocIter*); + +/* ============================================================ + * DWARF consumer + * ============================================================ + * Read DWARF (.debug_info / .debug_line / .debug_aranges / .eh_frame) out + * of an already-opened CfreeObjFile. The CfreeObjFile must outlive the + * CfreeDebugInfo. Strings handed back through the query functions are + * interned and valid until cfree_dwarf_close. + * + * cfree_dwarf_open returns NULL when the object has no DWARF, when the + * object's format doesn't carry DWARF (PE/COFF can; the consumer accepts + * the standard sections wherever they live), or on internal failure. 
+ * + * cfree_dwarf_addr_to_line maps a runtime / image PC to the source file, + * line, and column that produced it. Returns 0 on success and 1 when the + * PC has no matching .debug_line entry (e.g. compiler scaffolding). + * + * cfree_dwarf_line_to_addr is the inverse: returns 0 on success, 1 when no + * statement-flagged row matches the (file, line) pair. The first matching + * row wins. + * + * cfree_dwarf_func_at returns the enclosing subprogram's name and + * inclusive PC bounds. Returns 0 on success, 1 if no subprogram contains + * `pc`. */ +typedef struct CfreeDebugInfo CfreeDebugInfo; + +CfreeDebugInfo* cfree_dwarf_open (CfreeCompiler*, const CfreeObjFile*); +void cfree_dwarf_close(CfreeDebugInfo*); + +int cfree_dwarf_addr_to_line(CfreeDebugInfo*, uint64_t pc, + const char** file_out, + uint32_t* line_out, + uint32_t* col_out); +int cfree_dwarf_line_to_addr(CfreeDebugInfo*, const char* file, uint32_t line, + uint64_t* pc_out); +int cfree_dwarf_func_at (CfreeDebugInfo*, uint64_t pc, + const char** name_out, + uint64_t* low_pc_out, + uint64_t* high_pc_out); + +/* CFI-driven unwind step. The caller seeds `frame->pc` (and any callee-saved + * registers known at the leaf) and the consumer walks .eh_frame to compute + * the caller frame in place: pc, cfa, and registers are updated. CfreeUnwindFrame + * is declared at the top of this header. Register indices follow the DWARF + * register numbering for the target arch (which matches CfreeArchKind's + * canonical mapping). Returns 0 on a successful step, 1 at the bottom of the + * stack (no caller), any other nonzero value on decode error. */ +int cfree_dwarf_unwind_step(CfreeDebugInfo*, CfreeUnwindFrame*); + +/* ----- Variable locations ----- + * + * Decode where a named variable lives at PC. Resolution order: the deepest + * lexical scope at `pc` whose `name` matches wins; if no local matches, a + * file-scope global with that name is returned; otherwise 1. 
+ * + * `byte_size` is the variable's storage size in bytes, taken from the + * variable's DIE type. Zero means unknown. + * + * cfree_dwarf_loc_read evaluates the location against `frame` (whose `regs` + * supply register values; the leaf frame's regs come from CfreeStopInfo, + * deeper frames from cfree_dwarf_unwind_step) and reads the underlying + * bytes through the supplied JIT session. Up to `cap` bytes are written + * into `dst`; *read_out reports the number actually read (capped to + * the variable's byte_size). Returns 0 on success, nonzero on bad + * arguments or a read fault. + * + * EXPR locations carry a DWARF expression byte string; libcfree owns the + * stack-machine evaluator. Callers should treat the loc as opaque and + * always go through cfree_dwarf_loc_read. */ +typedef enum CfreeDwarfLocKind { + CFREE_DLOC_REG, /* value lives in a register */ + CFREE_DLOC_FRAME_OFS, /* [cfa + frame_ofs] */ + CFREE_DLOC_GLOBAL, /* absolute address */ + CFREE_DLOC_EXPR, /* DWARF expression bytes */ +} CfreeDwarfLocKind; + +typedef struct CfreeDwarfVarLoc { + CfreeDwarfLocKind kind; + uint32_t byte_size; /* 0 = unknown */ + union { + uint32_t reg; + int32_t frame_ofs; + uint64_t global; + struct { const uint8_t* bytes; size_t len; } expr; + } v; +} CfreeDwarfVarLoc; + +int cfree_dwarf_var_at (CfreeDebugInfo*, uint64_t pc, const char* name, + CfreeDwarfVarLoc* out); +int cfree_dwarf_loc_read(CfreeDebugInfo*, const CfreeDwarfVarLoc*, + const CfreeUnwindFrame*, + CfreeJitSession*, /* memory provider */ + void* dst, size_t cap, size_t* read_out); + +/* ============================================================ + * Disassembler + * ============================================================ + * Two layers: a high-level convenience that walks a relocatable object's + * text sections and writes an objdump-style listing, and a low-level + * iterator that decodes instructions from a byte buffer with vaddr context. + * + * Operands are pre-rendered to text on CfreeInsn. 
Structured operands (per- + * arch REG/IMM/MEM/SYM_REL enums) are the principled answer but multiply + * surface per arch (x86 ModR/M, AArch64 vector lanes, RISC-V CSR names) + * without v1 consumers; adding a structured form later is non-breaking + * because the text fields remain accurate. + * + * Strings on CfreeInsn (mnemonic/operands/annotation) and `bytes` are owned + * by the iterator and valid only until the next cfree_disasm_iter_next call + * or cfree_disasm_iter_free, whichever comes first. */ + +typedef struct CfreeInsn { + uint64_t vaddr; + const uint8_t* bytes; + uint32_t nbytes; + const char* mnemonic; + const char* operands; /* pre-rendered; may be "" */ + const char* annotation; /* sym/reloc note; may be "" */ +} CfreeInsn; + +/* Walk a relocatable object's text sections and write an objdump-style + * listing to `out`. Convenience over the iterator. The Writer is not + * closed. Returns 0 on success, nonzero on failure. */ +int cfree_obj_disasm(CfreeCompiler*, + const CfreeBytesInput*, CfreeWriter* out); + +/* Iterate instructions in a byte buffer at virtual address `vaddr`. If + * `obj` is non-NULL, the decoder consults its symbol and relocation tables + * to fill CfreeInsn.annotation; pass NULL for raw decoding. The bytes + * buffer must remain alive until cfree_disasm_iter_free. + * + * cfree_disasm_iter_next returns 1 and fills `*out` for each decoded + * instruction, 0 when the buffer is exhausted. On an undecodable byte the + * iterator advances by the arch's minimum unit and emits a placeholder + * mnemonic so the listing stays in sync. 
*/ +typedef struct CfreeDisasmIter CfreeDisasmIter; + +CfreeDisasmIter* cfree_disasm_iter_new (CfreeCompiler*, + const uint8_t* bytes, size_t len, + uint64_t vaddr, + CfreeObjBuilder* obj /* may be NULL */); +int cfree_disasm_iter_next(CfreeDisasmIter*, CfreeInsn* out); +void cfree_disasm_iter_free(CfreeDisasmIter*); + /* ============================================================ * Archive (ar) file * ============================================================ * Pure format I/O — no compilation context required. * * cfree_ar_write packs member byte payloads into a POSIX ar archive written - * to `out`. The Writer is not closed; I/O errors are detectable via - * out->error(). Returns 0 on success, 1 on bad arguments. + * to `out`. Options control reproducibility and format extensions: + * - `epoch` Unix seconds written to ar_date for every member; 0 + * leaves the field as the literal "0" (the default). + * - `symbol_index` if nonzero, emit a System V `/` symbol-index member. + * Not yet implemented; currently ignored. + * - `long_names` if nonzero, emit a `//` long-name table when any + * member name exceeds 15 characters or contains '/'. + * With long_names == 0, over-long names are truncated. + * `opts` may be NULL to accept all defaults. + * The Writer is not closed; I/O errors are detectable via out->error(). + * Returns 0 on success, 1 on bad arguments. * * cfree_ar_list writes one member name per line to `out` for each non-special * member in the archive. Returns 0 on success, 1 on bad arguments or @@ -447,18 +1055,30 @@ void cfree_obj_symiter_free(CfreeObjSymIter*); * cfree_ar_iter_next advances to the next non-special member and fills *out; * returns 1 if a member was returned, 0 at end or on malformed data. * Member data pointers alias the original archive bytes and are valid as - * long as the archive bytes remain alive. */ + * long as the archive bytes remain alive. 
CfreeArMember.name is interned + * in iterator-owned storage and is valid only until the next iter_next + * call on the same iterator. */ +typedef struct CfreeArWriteOptions { + uint64_t epoch; /* ar_date for every member; 0 = none */ + int symbol_index; /* emit System V '/' index entry (TODO) */ + int long_names; /* emit '//' long-name table when needed */ +} CfreeArWriteOptions; + int cfree_ar_write(CfreeWriter* out, - const CfreeBytesInput* members, uint32_t nmembers); + const CfreeBytesInput* members, uint32_t nmembers, + const CfreeArWriteOptions* opts); int cfree_ar_list (const CfreeBytesInput* archive, CfreeWriter* out); typedef struct CfreeArIter { const uint8_t* _p; const uint8_t* _end; + const uint8_t* _longnames; /* `//` table bytes, NULL until seen */ + size_t _longnames_len; + char _namebuf[256]; /* iterator-owned scratch for member name */ } CfreeArIter; typedef struct CfreeArMember { - char name[17]; /* null-terminated, max 16 chars */ + const char* name; /* iterator-owned; valid until next iter_next */ const uint8_t* data; /* points into archive bytes */ size_t size; } CfreeArMember;