kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

dbg.c (18275B)


      1 /* RISC-V 64 lifter for the displaced-step shim.
      2  *
      3  * Lays out a fixed-up copy of one insn in the session scratch slot
      4  * (DBG_DISPLACED_SLOT_BYTES bytes), followed by an EBREAK sentinel the
      5  * session arms an internal bp on.
      6  *
      7  * Supported families:
 *   - JAL rd, offset           — synthesize:
 *       slot[0]   AUIPC t0, hi20(target - slot)  ; t0 = target; LUI + ADDI on
 *       slot[4]   ADDI  t0, t0, lo12             ;   the absolute address is
 *                                                ;   used instead when target
 *                                                ;   is out of PC-rel range
 *       slot[8]   LUI   rd, hi20(orig_pc + 4)    ; link pair, omitted when
 *       slot[12]  ADDI  rd, rd, lo12             ;   rd == x0
 *       slot[N]   JALR  x0, t0, 0                ; PC = t0, no link write
 *       slot[N+4] EBREAK
 *     Control transfers to the user target, so the EBREAK sentinel is
 *     never executed; the session's stale internal_bp is cleared by the
 *     next prepare and the finalize step gates on PC == return_pc so it
 *     stays a no-op when control left the slot.
 *
 *     Note that rd receives the user-visible link value (orig_pc + 4),
 *     not the runtime (scratch) PC+4: the shim jumps with JALR x0 and
 *     builds the link separately with LUI + ADDI. That pair is derived
 *     from the low 32 bits of orig_pc + 4, so the link is only exact
 *     when it fits the sign-extended 32-bit window; outside that window
 *     the high bits are truncated, which v1 accepts because the JIT
 *     image is expected to live inside it. The shim also overwrites t0
 *     (x5) with the jump target.
     28  *
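 *   - JALR rd, rs1, imm        — synthesize:
 *       slot[0]   ADDI  t0, rs1, imm
 *       slot[4]   ANDI  t0, t0, -2               ; clear bit 0, as JALR does
 *       slot[8]   LUI   rd, hi20(orig_pc + 4)    ; link pair, omitted when
 *       slot[12]  ADDI  rd, rd, lo12             ;   rd == x0
 *       slot[N]   JALR  x0, t0, 0
 *       slot[N+4] EBREAK
 *     The EBREAK after never fires because the indirect branch transfers
 *     control. rd receives orig_pc + 4, built from its low 32 bits only.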
     32  *
 *   - BEQ/BNE/BLT/BGE/BLTU/BGEU rs1, rs2, offset — trampoline form:
 *       slot[0]  Bcc   rs1, rs2, +8            ; taken → slot+8 (target seq)
 *       slot[4]  EBREAK                        ; not-taken sentinel
 *       slot[8]  AUIPC t0, hi20(target)        ; (LUI when out of PC-rel range)
 *       slot[12] ADDI  t0, t0, lo12
 *       slot[16] JALR  x0, t0, 0
 *       slot[20] EBREAK                        ; safety net
 *     Sentinel offset is slot[4] for the not-taken fallthrough; the
 *     taken path branches away so it doesn't matter whether slot[20]
 *     is an EBREAK or not, but we put one there as a safety net.
 *
 *     Branch immediates in RV64I are 13-bit signed, so the in-shim
 *     Bcc +8 always fits.
     48  *
 *   - AUIPC rd, imm20          — replace with an absolute LUI + ADDI pair:
 *       slot[0]  LUI  rd, hi20(value)
 *       slot[4]  ADDI rd, rd, lo12(value)
 *       slot[8]  EBREAK
 *     where value = orig_pc + sign_ext_32(imm20 << 12), i.e. what the
 *     AUIPC would have produced at its original PC. Note that AUIPC
 *     computes pc + (imm << 12) while LUI computes imm << 12, so the
 *     PC-relative result is rebuilt from the low 32 bits of value: hi20
 *     is rounded so that the ADDI's sign-extended lo12 cancels out.
     56  *
     57  *   - LUI rd, imm20            — copied verbatim (no PC dependency).
     58  *
     59  *   - System / ALU / load / store / misc — copied verbatim + EBREAK.
     60  *
     61  * Not supported (caller will fall back to step-over via internal bp):
     62  *   - RVC compressed instructions (16-bit). The producer does not emit
     63  *     them, but they may appear if the JIT ever loads pre-built code.
     64  *   - Vector instructions. Not produced by kit's RV64 backend.
     65  */
     66 
     67 #include "dbg/dbg.h"
     68 
     69 #include <string.h>
     70 
     71 #include "arch/riscv/isa.h"
     72 
     73 #define SHIM_T0 RV_T0 /* x5 — caller-saved temp, safe inside a shim */
     74 
     75 uint32_t dbg_rv64_brk_word(void) { return rv_ebreak(); }
     76 
     77 static void put_u32(uint8_t* w, uint32_t off, uint32_t v) {
     78   memcpy(w + off, &v, sizeof(v));
     79 }
     80 
     81 /* Sign-extend a `bits`-wide field whose raw value is `v`. */
     82 static int64_t sign_extend(uint64_t v, int bits) {
     83   uint64_t m = 1ull << (bits - 1);
     84   return (int64_t)((v ^ m) - m);
     85 }
     86 
     87 /* Decode RV64 fields. */
     88 static uint32_t rv_opcode(uint32_t insn) { return insn & 0x7fu; }
     89 static uint32_t rv_rd(uint32_t insn) { return (insn >> 7) & 0x1fu; }
     90 static uint32_t rv_funct3(uint32_t insn) { return (insn >> 12) & 0x7u; }
     91 static uint32_t rv_rs1(uint32_t insn) { return (insn >> 15) & 0x1fu; }
     92 static uint32_t rv_rs2(uint32_t insn) { return (insn >> 20) & 0x1fu; }
     93 
     94 /* J-type 20-bit immediate (sign-extended into 21-bit byte offset). */
     95 static int64_t rv_j_imm(uint32_t insn) {
     96   uint64_t imm = ((uint64_t)((insn >> 31) & 1u) << 20) |
     97                  ((uint64_t)((insn >> 21) & 0x3ffu) << 1) |
     98                  ((uint64_t)((insn >> 20) & 1u) << 11) |
     99                  ((uint64_t)((insn >> 12) & 0xffu) << 12);
    100   return sign_extend(imm, 21);
    101 }
    102 
    103 /* B-type 12-bit immediate (sign-extended 13-bit byte offset). */
    104 static int64_t rv_b_imm(uint32_t insn) {
    105   uint64_t imm = ((uint64_t)((insn >> 31) & 1u) << 12) |
    106                  ((uint64_t)((insn >> 7) & 1u) << 11) |
    107                  ((uint64_t)((insn >> 25) & 0x3fu) << 5) |
    108                  ((uint64_t)((insn >> 8) & 0xfu) << 1);
    109   return sign_extend(imm, 13);
    110 }
    111 
    112 /* U-type 20-bit immediate, returned as the raw 20-bit field (consumer
    113  * shifts it left by 12). */
    114 static uint32_t rv_u_imm20(uint32_t insn) { return (insn >> 12) & 0xfffffu; }
    115 
    116 /* Decompose a 64-bit absolute target into a 32-bit AUIPC/LUI hi20 +
    117  * ADDI lo12 pair such that:
    118  *   lui rd, hi20            -> rd = (sign_ext_32(hi20 << 12))
    119  *   addi rd, rd, lo12       -> rd = (sign_ext_32(hi20 << 12) +
    120  * sign_ext_12(lo12))
    121  *                              == sign_ext_32(target_low32)
    122  * Returns 1 if the absolute target's low 32 bits cannot represent the
    123  * full target (i.e. the target lives outside the sign-extended 32-bit
    124  * range). The RV64 ABI's "medlow" code model assumes targets fit in
    125  * the 32-bit sign-extended window around 0; for a JIT image that lives
    126  * higher in the address space we panic at the caller. */
    127 static int rv_split_hi_lo(uint64_t target, uint32_t* hi20, int32_t* lo12,
    128                           int* sext32) {
    129   int64_t s = (int64_t)target;
    130   int64_t sext = (int64_t)(int32_t)(uint32_t)target;
    131   *sext32 = (s == sext) ? 1 : 0;
    132   /* hi20 chosen so addi's sign-extended 12-bit lo cancels out. */
    133   uint32_t low32 = (uint32_t)target;
    134   uint32_t hi = (low32 + 0x800u) >> 12;
    135   int32_t lo = (int32_t)(low32 - (hi << 12));
    136   *hi20 = hi & 0xfffffu;
    137   *lo12 = lo;
    138   return 0;
    139 }
    140 
    141 /* Emit "li t0, target" using AUIPC+ADDI when the target is in PC-rel
    142  * range, otherwise LUI+ADDI. Returns the number of words written into
    143  * `w` starting at offset `off`. The shim runs at `shim_runtime_pc` (the
    144  * scratch slot's runtime address), and the AUIPC variant uses that. */
    145 static uint32_t emit_materialize_target(uint8_t* w, uint32_t off,
    146                                         uint64_t target,
    147                                         uint64_t shim_runtime_pc) {
    148   int64_t pc_rel = (int64_t)target - (int64_t)shim_runtime_pc;
    149   /* AUIPC offset is signed 32-bit (imm20 << 12). If pc_rel fits in the
    150    * 32-bit sign-extended range and the low 12 bits' sign-extension
    151    * carries correctly, prefer AUIPC + ADDI (PIC-friendly). Otherwise
    152    * fall back to LUI + ADDI (assumes target's low32 is the full
    153    * address — caller arranges for medlow targets). */
    154   if (pc_rel >= -(int64_t)0x80000000 && pc_rel <= (int64_t)0x7fffffff) {
    155     uint32_t hi20 = ((uint32_t)(int32_t)pc_rel + 0x800u) >> 12;
    156     int32_t lo12 = (int32_t)((uint32_t)(int32_t)pc_rel - (hi20 << 12));
    157     put_u32(w, off + 0, rv_auipc(SHIM_T0, hi20 & 0xfffffu));
    158     put_u32(w, off + 4, rv_addi(SHIM_T0, SHIM_T0, lo12));
    159     return 2;
    160   } else {
    161     uint32_t hi20;
    162     int32_t lo12;
    163     int sext32;
    164     (void)rv_split_hi_lo(target, &hi20, &lo12, &sext32);
    165     put_u32(w, off + 0, rv_lui(SHIM_T0, hi20));
    166     put_u32(w, off + 4, rv_addi(SHIM_T0, SHIM_T0, lo12));
    167     return 2;
    168   }
    169 }
    170 
    171 int dbg_rv64_build_shim(uint32_t orig_insn, uint64_t orig_pc,
    172                         void* scratch_write, uint64_t scratch_runtime,
    173                         u32* brk_offset) {
    174   uint8_t* w = (uint8_t*)scratch_write;
    175   uint32_t brk = rv_ebreak();
    176   uint32_t op;
    177 
    178   if (!brk_offset) return 1;
    179   *brk_offset = 0;
    180 
    181   op = rv_opcode(orig_insn);
    182 
    183   /* ---- JAL rd, offset ----------------------------------------------
    184    * Semantics: rd = orig_pc + 4; pc = orig_pc + imm.  We must reproduce
    185    * the *user-visible* link value (orig_pc + 4), not the runtime
    186    * scratch-relative one. Layout:
    187    *   slot[0..]  materialize_target(t0, orig_pc + imm)
    188    *   slot[m]    materialize rd <- (orig_pc + 4)   (skipped when rd==x0)
    189    *   slot[m+]   JALR x0, t0, 0    (unconditional jump; no link)
    190    *   slot[end]  EBREAK
    191    * For rd==x0 this collapses to the plain "jump to target" form. */
    192   if (op == RV_JAL) {
    193     int64_t imm = rv_j_imm(orig_insn);
    194     uint64_t target = orig_pc + (uint64_t)imm;
    195     uint32_t rd = rv_rd(orig_insn);
    196     uint32_t n_words;
    197     n_words = emit_materialize_target(w, 0, target, scratch_runtime);
    198     if (rd != RV_ZERO) {
    199       /* link = orig_pc + 4. Synthesize via LUI + ADDI using low-32
    200        * decomposition; if the link value doesn't fit a 32-bit sign-
    201        * extended window, we still emit the same two-word sequence and
    202        * the high bits get truncated — acceptable for the JIT case
    203        * where orig_pc is always within the image's 32-bit sign-ext
    204        * range. */
    205       uint64_t link = orig_pc + 4u;
    206       uint32_t hi20;
    207       int32_t lo12;
    208       int sext32;
    209       (void)rv_split_hi_lo(link, &hi20, &lo12, &sext32);
    210       put_u32(w, 4 * n_words, rv_lui(rd, hi20));
    211       ++n_words;
    212       put_u32(w, 4 * n_words, rv_addi(rd, rd, lo12));
    213       ++n_words;
    214     }
    215     put_u32(w, 4 * n_words, rv_jalr(RV_ZERO, SHIM_T0, 0));
    216     ++n_words;
    217     put_u32(w, 4 * n_words, brk);
    218     *brk_offset = 4 * n_words;
    219     return 0;
    220   }
    221 
    222   /* ---- JALR rd, rs1, imm -------------------------------------------
    223    * Semantics: tmp = (regs[rs1] + sign_ext_12(imm)) & ~1; rd = orig_pc + 4;
    224    *            pc = tmp.
    225    * Like JAL, rd must receive the *user-visible* link (orig_pc + 4).
    226    * Layout:
    227    *   slot[0]   JALR x0, rs1, imm     ; jump-only form (no link write)
    228    *                                     -- but JALR is a single insn,
    229    *                                     so we cannot also write rd
    230    *                                     before jumping. We instead:
    231    *   slot[0]   compute t0 = (regs[rs1] + imm) & ~1
    232    *               (ADDI t0, rs1, imm; ANDI t0, t0, -2)
    233    *   slot[8]   materialize rd <- (orig_pc + 4)   (if rd != x0)
    234    *   slot[N]   JALR x0, t0, 0
    235    *   slot[N+4] EBREAK
    236    * Note rs1 might be t0 itself; ADDI computes t0 = rs1 + imm BEFORE
    237    * overwriting t0, which is fine because each insn reads its sources
    238    * before writing rd. */
    239   if (op == RV_JALR) {
    240     uint32_t rd = rv_rd(orig_insn);
    241     uint32_t rs1 = rv_rs1(orig_insn);
    242     int32_t imm = (int32_t)((orig_insn >> 20) & 0xfffu);
    243     if (imm & 0x800) imm -= 0x1000;
    244     put_u32(w, 0, rv_addi(SHIM_T0, rs1, imm));
    245     put_u32(w, 4, rv_andi(SHIM_T0, SHIM_T0, -2));
    246     uint32_t off = 8;
    247     if (rd != RV_ZERO) {
    248       uint64_t link = orig_pc + 4u;
    249       uint32_t hi20;
    250       int32_t lo12;
    251       int sext32;
    252       (void)rv_split_hi_lo(link, &hi20, &lo12, &sext32);
    253       put_u32(w, off, rv_lui(rd, hi20));
    254       off += 4;
    255       put_u32(w, off, rv_addi(rd, rd, lo12));
    256       off += 4;
    257     }
    258     put_u32(w, off, rv_jalr(RV_ZERO, SHIM_T0, 0));
    259     off += 4;
    260     put_u32(w, off, brk);
    261     *brk_offset = off;
    262     return 0;
    263   }
    264 
    265   /* ---- Bcc rs1, rs2, offset ---------------------------------------- */
    266   if (op == RV_BRANCH) {
    267     int64_t imm = rv_b_imm(orig_insn);
    268     uint64_t target = orig_pc + (uint64_t)imm;
    269     uint32_t f3 = rv_funct3(orig_insn);
    270     uint32_t rs1 = rv_rs1(orig_insn);
    271     uint32_t rs2 = rv_rs2(orig_insn);
    272     /* Trampoline layout:
    273      *   slot[0]   Bcc rs1, rs2, +12   (taken -> slot[12])
    274      *   slot[4]   JAL x0, +12         (not-taken fallthrough -> slot[16])
    275      *                                  ... wait — we want non-taken to
    276      *                                  fall through to the EBREAK at
    277      *                                  slot[8]. Simpler: place EBREAK
    278      *                                  at slot[4] for not-taken, and
    279      *                                  the take-target sequence at
    280      *                                  slot[8..]. The Bcc's +12 then
    281      *                                  becomes +8.
    282      *
    283      * Revised:
    284      *   slot[0]   Bcc rs1, rs2, +8     (taken -> slot[8] = target seq)
    285      *   slot[4]   EBREAK               (not-taken sentinel)
    286      *   slot[8]   AUIPC t0, hi20(target)
    287      *   slot[12]  ADDI  t0, t0, lo12
    288      *   slot[16]  JALR  x0, t0, 0
    289      *   slot[20]  EBREAK               (safety; never reached) */
    290     uint32_t new_branch = rv_b(8, rs2, rs1, f3, RV_BRANCH);
    291     uint32_t n_words;
    292     put_u32(w, 0, new_branch);
    293     put_u32(w, 4, brk);
    294     n_words = emit_materialize_target(w, 8, target, scratch_runtime + 8u);
    295     put_u32(w, 8 + 4 * n_words, rv_jalr(RV_ZERO, SHIM_T0, 0));
    296     put_u32(w, 8 + 4 * n_words + 4, brk);
    297     *brk_offset = 4;
    298     return 0;
    299   }
    300 
    301   /* ---- AUIPC rd, imm20 --------------------------------------------- */
    302   if (op == RV_AUIPC) {
    303     uint32_t imm20 = rv_u_imm20(orig_insn);
    304     uint32_t rd = rv_rd(orig_insn);
    305     /* AUIPC computes rd = orig_pc + sign_ext_32(imm20 << 12). We
    306      * synthesize that absolute value into rd using LUI + ADDI. */
    307     uint64_t auipc_val = (uint64_t)((int64_t)orig_pc +
    308                                     (int64_t)(int32_t)((int32_t)(imm20 << 12)));
    309     uint32_t hi20;
    310     int32_t lo12;
    311     int sext32;
    312     (void)rv_split_hi_lo(auipc_val, &hi20, &lo12, &sext32);
    313     put_u32(w, 0, rv_lui(rd, hi20));
    314     put_u32(w, 4, rv_addi(rd, rd, lo12));
    315     put_u32(w, 8, brk);
    316     *brk_offset = 8;
    317     return 0;
    318   }
    319 
    320   /* ---- default: no PC-relative operand — copy verbatim ------------- */
    321   put_u32(w, 0, orig_insn);
    322   put_u32(w, 4, brk);
    323   *brk_offset = 4;
    324   return 0;
    325 }
    326 
    327 static KitStatus rv64_dbg_breakpoint_patch(u8* out, u32 cap, u32* len_out) {
    328   uint32_t brk = dbg_rv64_brk_word();
    329   if (!out || !len_out) return KIT_INVALID;
    330   if (cap < 4u) return KIT_INVALID;
    331   memcpy(out, &brk, sizeof(brk));
    332   *len_out = 4u;
    333   return KIT_OK;
    334 }
    335 
    336 static u64 rv64_dbg_breakpoint_addr_from_fault_pc(u64 fault_pc) {
    337   return fault_pc;
    338 }
    339 
    340 static KitStatus rv64_dbg_decode_insn(const u8* bytes, u32 len, u64 pc,
    341                                       ArchDbgInsn* out) {
    342   if (!bytes || !out) return KIT_INVALID;
    343   if (len < 4u) return KIT_UNSUPPORTED;
    344   memset(out, 0, sizeof(*out));
    345   out->pc = pc;
    346   out->len = 4u;
    347   memcpy(out->bytes, bytes, 4u);
    348   return KIT_OK;
    349 }
    350 
    351 static KitStatus rv64_dbg_build_displaced_shim(
    352     const ArchDbgInsn* insn, void* scratch_write, u64 scratch_runtime,
    353     u32 scratch_cap, u32* sentinel_off, u64* fallthrough_pc) {
    354   uint32_t word = 0;
    355   if (!insn || !scratch_write || !sentinel_off || !fallthrough_pc)
    356     return KIT_INVALID;
    357   if (insn->len != 4u) return KIT_UNSUPPORTED;
    358   if (scratch_cap < 28u) return KIT_INVALID;
    359   memcpy(&word, insn->bytes, sizeof(word));
    360   if (dbg_rv64_build_shim(word, insn->pc, scratch_write, scratch_runtime,
    361                           sentinel_off) != 0) {
    362     return KIT_UNSUPPORTED;
    363   }
    364   *fallthrough_pc = insn->pc + 4u;
    365   return KIT_OK;
    366 }
    367 
    368 static int rv64_dbg_is_call(const ArchDbgInsn* insn) {
    369   uint32_t word = 0;
    370   uint32_t op;
    371   if (!insn || insn->len != 4u) return 0;
    372   memcpy(&word, insn->bytes, sizeof(word));
    373   op = rv_opcode(word);
    374   if (op != RV_JAL && op != RV_JALR) return 0;
    375   return rv_rd(word) != RV_ZERO;
    376 }
    377 
    378 static KitStatus rv64_dbg_direct_call_target(const ArchDbgInsn* insn,
    379                                              u64* target_out) {
    380   uint32_t word = 0;
    381   if (!insn || !target_out) return KIT_INVALID;
    382   if (insn->len != 4u) return KIT_UNSUPPORTED;
    383   memcpy(&word, insn->bytes, sizeof(word));
    384   if (rv_opcode(word) != RV_JAL || rv_rd(word) == RV_ZERO) return KIT_NOT_FOUND;
    385   *target_out = insn->pc + (u64)rv_j_imm(word);
    386   return KIT_OK;
    387 }
    388 
    389 static KitStatus rv64_dbg_direct_jump_target(const ArchDbgInsn* insn,
    390                                              u64* target_out) {
    391   uint32_t word = 0;
    392   if (!insn || !target_out) return KIT_INVALID;
    393   if (insn->len != 4u) return KIT_UNSUPPORTED;
    394   memcpy(&word, insn->bytes, sizeof(word));
    395   if (rv_opcode(word) != RV_JAL || rv_rd(word) != RV_ZERO) return KIT_NOT_FOUND;
    396   *target_out = insn->pc + (u64)rv_j_imm(word);
    397   return KIT_OK;
    398 }
    399 
    400 static KitStatus rv64_dbg_link_register_return_address(
    401     const KitUnwindFrame* frame, u64* target_out) {
    402   if (!frame || !target_out) return KIT_INVALID;
    403   if (frame->regs[RV_RA] == 0) return KIT_NOT_FOUND;
    404   *target_out = frame->regs[RV_RA];
    405   return KIT_OK;
    406 }
    407 
    408 const ArchDbgOps rv64_dbg_ops = {
    409     .min_insn_len = 4u,
    410     .max_insn_len = 4u,
    411     .breakpoint_patch = rv64_dbg_breakpoint_patch,
    412     .breakpoint_addr_from_fault_pc = rv64_dbg_breakpoint_addr_from_fault_pc,
    413     .decode_insn = rv64_dbg_decode_insn,
    414     .build_displaced_shim = rv64_dbg_build_displaced_shim,
    415     .is_call = rv64_dbg_is_call,
    416     .direct_call_target = rv64_dbg_direct_call_target,
    417     .direct_jump_target = rv64_dbg_direct_jump_target,
    418     .link_register_return_address = rv64_dbg_link_register_return_address,
    419 };
    420 
    421 /* RV32 shares every helper with RV64: the 4-byte insn encodings and the
    422  * medlow LUI+ADDI materialization in dbg_rv64_build_shim are XLEN-neutral
    423  * and naturally correct for rv32's 32-bit addresses (the >32-bit LUI
    424  * fallback in rv_split_hi_lo never fires). The only difference is the
    425  * advertised min_insn_len=2 (RVC is 2-byte). Per the file header, RVC
    426  * (2-byte) displaced-step is NOT supported: rv64_dbg_decode_insn returns
    427  * KIT_UNSUPPORTED for len<4 and rv64_dbg_build_displaced_shim requires
    428  * len==4, so RVC insns fall back to step-over via internal bp. That is
    429  * the intended v1 behavior; full RVC shims are deferred. */
    430 const ArchDbgOps rv32_dbg_ops = {
    431     .min_insn_len = 2u,
    432     .max_insn_len = 4u,
    433     .breakpoint_patch = rv64_dbg_breakpoint_patch,
    434     .breakpoint_addr_from_fault_pc = rv64_dbg_breakpoint_addr_from_fault_pc,
    435     .decode_insn = rv64_dbg_decode_insn,
    436     .build_displaced_shim = rv64_dbg_build_displaced_shim,
    437     .is_call = rv64_dbg_is_call,
    438     .direct_call_target = rv64_dbg_direct_call_target,
    439     .direct_jump_target = rv64_dbg_direct_jump_target,
    440     .link_register_return_address = rv64_dbg_link_register_return_address,
    441 };