dbg.c (17103B)
1 /* RISC-V 64 lifter for the displaced-step shim. 2 * 3 * Lays out a fixed-up copy of one insn in the session scratch slot 4 * (DBG_DISPLACED_SLOT_BYTES bytes), followed by an EBREAK sentinel the 5 * session arms an internal bp on. 6 * 7 * Supported families: 8 * - JAL rd, offset — synthesize: 9 * slot[0] AUIPC t0, hi20(target) ; t0 = pc_runtime + hi20 10 * slot[4] ADDI t0, t0, lo12 ; (optional) fixup 11 * slot[8] JALR rd, t0, 0 ; rd = pc+4_runtime; PC = t0 12 * slot[N] EBREAK 13 * The JALR's "return address" lands at the EBREAK sentinel, but since 14 * control transfers to the user target we never execute it; the 15 * session's stale internal_bp is cleared by the next prepare and the 16 * finalize step gates on PC == return_pc so it stays a no-op when 17 * control left the slot. 18 * 19 * Note that an unconditional JAL with rd != x0 writes the runtime 20 * (scratch) PC+4 into rd. For RISC-V calls (the dynamic linker / 21 * PLT trampolines pass arguments via rd=ra), this is acceptable in 22 * practice because the saved return address is rebuilt by the 23 * epilogue anyway; kit's JIT debugger uses the shim only to 24 * single-step through code it has emitted, and the producer's call 25 * sequences re-establish ra in the prologue of the callee. For a 26 * true displaced-step debugger this would need a "patch ra" pass — 27 * v1 leaves that to the user via the unwind step. 28 * 29 * - JALR rd, rs1, imm — copied verbatim; the EBREAK after never 30 * fires because the indirect branch transfers control. Same caveat 31 * about rd as JAL. 32 * 33 * - BEQ/BNE/BLT/BGE/BLTU/BGEU rs1, rs2, offset — trampoline form: 34 * slot[0] Bcc rs1, rs2, +12 ; taken → slot+12 (target seq) 35 * slot[4] J +12 ; not-taken → slot+16 (EBREAK) 36 * (JAL x0, +12) 37 * slot[8] EBREAK 38 * slot[12] AUIPC t0, hi20(target) 39 * slot[16] ADDI t0, t0, lo12 40 * slot[20] JALR x0, t0, 0 41 * slot[24] EBREAK (sentinel: taken path sentinel) 42 * Sentinel offset is slot[8] for the not-taken fallthrough; the 43 * taken path branches away so it doesn't matter whether slot[24] 44 * is an EBREAK or not, but we put one there as a safety net. 45 * 46 * Branch immediates in RV64I are 13-bit signed, so the in-shim 47 * Bcc-then-J/J pattern always fits. 48 * 49 * - AUIPC rd, imm20 — replace with LUI rd, abs_hi20: 50 * slot[0] LUI rd, abs_hi20 51 * slot[4] EBREAK 52 * where abs_hi20 = (orig_pc + (imm20 << 12)) >> 12, masked to 20 53 * bits. Note that AUIPC computes pc + (imm << 12); LUI computes 54 * imm << 12. So we feed LUI the hi-20 of (orig_pc & ~0xfff) + 55 * (imm << 12), i.e. the bits we want at the top of rd. 56 * 57 * - LUI rd, imm20 — copied verbatim (no PC dependency). 58 * 59 * - System / ALU / load / store / misc — copied verbatim + EBREAK. 60 * 61 * Not supported (caller will fall back to step-over via internal bp): 62 * - RVC compressed instructions (16-bit). The producer does not emit 63 * them, but they may appear if the JIT ever loads pre-built code. 64 * - Vector instructions. Not produced by kit's RV64 backend. 65 */ 66 67 #include "dbg/dbg.h" 68 69 #include <string.h> 70 71 #include "arch/rv64/isa.h" 72 73 #define SHIM_T0 RV_T0 /* x5 — caller-saved temp, safe inside a shim */ 74 75 uint32_t dbg_rv64_brk_word(void) { return rv_ebreak(); } 76 77 static void put_u32(uint8_t* w, uint32_t off, uint32_t v) { 78 memcpy(w + off, &v, sizeof(v)); 79 } 80 81 /* Sign-extend a `bits`-wide field whose raw value is `v`. */ 82 static int64_t sign_extend(uint64_t v, int bits) { 83 uint64_t m = 1ull << (bits - 1); 84 return (int64_t)((v ^ m) - m); 85 } 86 87 /* Decode RV64 fields. */ 88 static uint32_t rv_opcode(uint32_t insn) { return insn & 0x7fu; } 89 static uint32_t rv_rd(uint32_t insn) { return (insn >> 7) & 0x1fu; } 90 static uint32_t rv_funct3(uint32_t insn) { return (insn >> 12) & 0x7u; } 91 static uint32_t rv_rs1(uint32_t insn) { return (insn >> 15) & 0x1fu; } 92 static uint32_t rv_rs2(uint32_t insn) { return (insn >> 20) & 0x1fu; } 93 94 /* J-type 20-bit immediate (sign-extended into 21-bit byte offset). */ 95 static int64_t rv_j_imm(uint32_t insn) { 96 uint64_t imm = ((uint64_t)((insn >> 31) & 1u) << 20) | 97 ((uint64_t)((insn >> 21) & 0x3ffu) << 1) | 98 ((uint64_t)((insn >> 20) & 1u) << 11) | 99 ((uint64_t)((insn >> 12) & 0xffu) << 12); 100 return sign_extend(imm, 21); 101 } 102 103 /* B-type 12-bit immediate (sign-extended 13-bit byte offset). */ 104 static int64_t rv_b_imm(uint32_t insn) { 105 uint64_t imm = ((uint64_t)((insn >> 31) & 1u) << 12) | 106 ((uint64_t)((insn >> 7) & 1u) << 11) | 107 ((uint64_t)((insn >> 25) & 0x3fu) << 5) | 108 ((uint64_t)((insn >> 8) & 0xfu) << 1); 109 return sign_extend(imm, 13); 110 } 111 112 /* U-type 20-bit immediate, returned as the raw 20-bit field (consumer 113 * shifts it left by 12). */ 114 static uint32_t rv_u_imm20(uint32_t insn) { return (insn >> 12) & 0xfffffu; } 115 116 /* Decompose a 64-bit absolute target into a 32-bit AUIPC/LUI hi20 + 117 * ADDI lo12 pair such that: 118 * lui rd, hi20 -> rd = (sign_ext_32(hi20 << 12)) 119 * addi rd, rd, lo12 -> rd = (sign_ext_32(hi20 << 12) + 120 * sign_ext_12(lo12)) 121 * == sign_ext_32(target_low32) 122 * Returns 1 if the absolute target's low 32 bits cannot represent the 123 * full target (i.e. the target lives outside the sign-extended 32-bit 124 * range). The RV64 ABI's "medlow" code model assumes targets fit in 125 * the 32-bit sign-extended window around 0; for a JIT image that lives 126 * higher in the address space we panic at the caller. */ 127 static int rv_split_hi_lo(uint64_t target, uint32_t* hi20, int32_t* lo12, 128 int* sext32) { 129 int64_t s = (int64_t)target; 130 int64_t sext = (int64_t)(int32_t)(uint32_t)target; 131 *sext32 = (s == sext) ? 1 : 0; 132 /* hi20 chosen so addi's sign-extended 12-bit lo cancels out. */ 133 uint32_t low32 = (uint32_t)target; 134 uint32_t hi = (low32 + 0x800u) >> 12; 135 int32_t lo = (int32_t)(low32 - (hi << 12)); 136 *hi20 = hi & 0xfffffu; 137 *lo12 = lo; 138 return 0; 139 } 140 141 /* Emit "li t0, target" using AUIPC+ADDI when the target is in PC-rel 142 * range, otherwise LUI+ADDI. Returns the number of words written into 143 * `w` starting at offset `off`. The shim runs at `shim_runtime_pc` (the 144 * scratch slot's runtime address), and the AUIPC variant uses that. */ 145 static uint32_t emit_materialize_target(uint8_t* w, uint32_t off, 146 uint64_t target, 147 uint64_t shim_runtime_pc) { 148 int64_t pc_rel = (int64_t)target - (int64_t)shim_runtime_pc; 149 /* AUIPC offset is signed 32-bit (imm20 << 12). If pc_rel fits in the 150 * 32-bit sign-extended range and the low 12 bits' sign-extension 151 * carries correctly, prefer AUIPC + ADDI (PIC-friendly). Otherwise 152 * fall back to LUI + ADDI (assumes target's low32 is the full 153 * address — caller arranges for medlow targets). */ 154 if (pc_rel >= -(int64_t)0x80000000 && pc_rel <= (int64_t)0x7fffffff) { 155 uint32_t hi20 = ((uint32_t)(int32_t)pc_rel + 0x800u) >> 12; 156 int32_t lo12 = (int32_t)((uint32_t)(int32_t)pc_rel - (hi20 << 12)); 157 put_u32(w, off + 0, rv_auipc(SHIM_T0, hi20 & 0xfffffu)); 158 put_u32(w, off + 4, rv_addi(SHIM_T0, SHIM_T0, lo12)); 159 return 2; 160 } else { 161 uint32_t hi20; 162 int32_t lo12; 163 int sext32; 164 (void)rv_split_hi_lo(target, &hi20, &lo12, &sext32); 165 put_u32(w, off + 0, rv_lui(SHIM_T0, hi20)); 166 put_u32(w, off + 4, rv_addi(SHIM_T0, SHIM_T0, lo12)); 167 return 2; 168 } 169 } 170 171 int dbg_rv64_build_shim(uint32_t orig_insn, uint64_t orig_pc, 172 void* scratch_write, uint64_t scratch_runtime, 173 u32* brk_offset) { 174 uint8_t* w = (uint8_t*)scratch_write; 175 uint32_t brk = rv_ebreak(); 176 uint32_t op; 177 178 if (!brk_offset) return 1; 179 *brk_offset = 0; 180 181 op = rv_opcode(orig_insn); 182 183 /* ---- JAL rd, offset ---------------------------------------------- 184 * Semantics: rd = orig_pc + 4; pc = orig_pc + imm. We must reproduce 185 * the *user-visible* link value (orig_pc + 4), not the runtime 186 * scratch-relative one. Layout: 187 * slot[0..] materialize_target(t0, orig_pc + imm) 188 * slot[m] materialize rd <- (orig_pc + 4) (skipped when rd==x0) 189 * slot[m+] JALR x0, t0, 0 (unconditional jump; no link) 190 * slot[end] EBREAK 191 * For rd==x0 this collapses to the plain "jump to target" form. */ 192 if (op == RV_JAL) { 193 int64_t imm = rv_j_imm(orig_insn); 194 uint64_t target = orig_pc + (uint64_t)imm; 195 uint32_t rd = rv_rd(orig_insn); 196 uint32_t n_words; 197 n_words = emit_materialize_target(w, 0, target, scratch_runtime); 198 if (rd != RV_ZERO) { 199 /* link = orig_pc + 4. Synthesize via LUI + ADDI using low-32 200 * decomposition; if the link value doesn't fit a 32-bit sign- 201 * extended window, we still emit the same two-word sequence and 202 * the high bits get truncated — acceptable for the JIT case 203 * where orig_pc is always within the image's 32-bit sign-ext 204 * range. */ 205 uint64_t link = orig_pc + 4u; 206 uint32_t hi20; 207 int32_t lo12; 208 int sext32; 209 (void)rv_split_hi_lo(link, &hi20, &lo12, &sext32); 210 put_u32(w, 4 * n_words, rv_lui(rd, hi20)); 211 ++n_words; 212 put_u32(w, 4 * n_words, rv_addi(rd, rd, lo12)); 213 ++n_words; 214 } 215 put_u32(w, 4 * n_words, rv_jalr(RV_ZERO, SHIM_T0, 0)); 216 ++n_words; 217 put_u32(w, 4 * n_words, brk); 218 *brk_offset = 4 * n_words; 219 return 0; 220 } 221 222 /* ---- JALR rd, rs1, imm ------------------------------------------- 223 * Semantics: tmp = (regs[rs1] + sign_ext_12(imm)) & ~1; rd = orig_pc + 4; 224 * pc = tmp. 225 * Like JAL, rd must receive the *user-visible* link (orig_pc + 4). 226 * Layout: 227 * slot[0] JALR x0, rs1, imm ; jump-only form (no link write) 228 * -- but JALR is a single insn, 229 * so we cannot also write rd 230 * before jumping. We instead: 231 * slot[0] compute t0 = (regs[rs1] + imm) & ~1 232 * (ADDI t0, rs1, imm; ANDI t0, t0, -2) 233 * slot[8] materialize rd <- (orig_pc + 4) (if rd != x0) 234 * slot[N] JALR x0, t0, 0 235 * slot[N+4] EBREAK 236 * Note rs1 might be t0 itself; ADDI computes t0 = rs1 + imm BEFORE 237 * overwriting t0, which is fine because each insn reads its sources 238 * before writing rd. */ 239 if (op == RV_JALR) { 240 uint32_t rd = rv_rd(orig_insn); 241 uint32_t rs1 = rv_rs1(orig_insn); 242 int32_t imm = (int32_t)((orig_insn >> 20) & 0xfffu); 243 if (imm & 0x800) imm -= 0x1000; 244 put_u32(w, 0, rv_addi(SHIM_T0, rs1, imm)); 245 put_u32(w, 4, rv_andi(SHIM_T0, SHIM_T0, -2)); 246 uint32_t off = 8; 247 if (rd != RV_ZERO) { 248 uint64_t link = orig_pc + 4u; 249 uint32_t hi20; 250 int32_t lo12; 251 int sext32; 252 (void)rv_split_hi_lo(link, &hi20, &lo12, &sext32); 253 put_u32(w, off, rv_lui(rd, hi20)); 254 off += 4; 255 put_u32(w, off, rv_addi(rd, rd, lo12)); 256 off += 4; 257 } 258 put_u32(w, off, rv_jalr(RV_ZERO, SHIM_T0, 0)); 259 off += 4; 260 put_u32(w, off, brk); 261 *brk_offset = off; 262 return 0; 263 } 264 265 /* ---- Bcc rs1, rs2, offset ---------------------------------------- */ 266 if (op == RV_BRANCH) { 267 int64_t imm = rv_b_imm(orig_insn); 268 uint64_t target = orig_pc + (uint64_t)imm; 269 uint32_t f3 = rv_funct3(orig_insn); 270 uint32_t rs1 = rv_rs1(orig_insn); 271 uint32_t rs2 = rv_rs2(orig_insn); 272 /* Trampoline layout: 273 * slot[0] Bcc rs1, rs2, +12 (taken -> slot[12]) 274 * slot[4] JAL x0, +12 (not-taken fallthrough -> slot[16]) 275 * ... wait — we want non-taken to 276 * fall through to the EBREAK at 277 * slot[8]. Simpler: place EBREAK 278 * at slot[4] for not-taken, and 279 * the take-target sequence at 280 * slot[8..]. The Bcc's +12 then 281 * becomes +8. 282 * 283 * Revised: 284 * slot[0] Bcc rs1, rs2, +8 (taken -> slot[8] = target seq) 285 * slot[4] EBREAK (not-taken sentinel) 286 * slot[8] AUIPC t0, hi20(target) 287 * slot[12] ADDI t0, t0, lo12 288 * slot[16] JALR x0, t0, 0 289 * slot[20] EBREAK (safety; never reached) */ 290 uint32_t new_branch = rv_b(8, rs2, rs1, f3, RV_BRANCH); 291 uint32_t n_words; 292 put_u32(w, 0, new_branch); 293 put_u32(w, 4, brk); 294 n_words = emit_materialize_target(w, 8, target, scratch_runtime + 8u); 295 put_u32(w, 8 + 4 * n_words, rv_jalr(RV_ZERO, SHIM_T0, 0)); 296 put_u32(w, 8 + 4 * n_words + 4, brk); 297 *brk_offset = 4; 298 return 0; 299 } 300 301 /* ---- AUIPC rd, imm20 --------------------------------------------- */ 302 if (op == RV_AUIPC) { 303 uint32_t imm20 = rv_u_imm20(orig_insn); 304 uint32_t rd = rv_rd(orig_insn); 305 /* AUIPC computes rd = orig_pc + sign_ext_32(imm20 << 12). We 306 * synthesize that absolute value into rd using LUI + ADDI. */ 307 uint64_t auipc_val = (uint64_t)((int64_t)orig_pc + 308 (int64_t)(int32_t)((int32_t)(imm20 << 12))); 309 uint32_t hi20; 310 int32_t lo12; 311 int sext32; 312 (void)rv_split_hi_lo(auipc_val, &hi20, &lo12, &sext32); 313 put_u32(w, 0, rv_lui(rd, hi20)); 314 put_u32(w, 4, rv_addi(rd, rd, lo12)); 315 put_u32(w, 8, brk); 316 *brk_offset = 8; 317 return 0; 318 } 319 320 /* ---- default: no PC-relative operand — copy verbatim ------------- */ 321 put_u32(w, 0, orig_insn); 322 put_u32(w, 4, brk); 323 *brk_offset = 4; 324 return 0; 325 } 326 327 static KitStatus rv64_dbg_breakpoint_patch(u8* out, u32 cap, u32* len_out) { 328 uint32_t brk = dbg_rv64_brk_word(); 329 if (!out || !len_out) return KIT_INVALID; 330 if (cap < 4u) return KIT_INVALID; 331 memcpy(out, &brk, sizeof(brk)); 332 *len_out = 4u; 333 return KIT_OK; 334 } 335 336 static u64 rv64_dbg_breakpoint_addr_from_fault_pc(u64 fault_pc) { 337 return fault_pc; 338 } 339 340 static KitStatus rv64_dbg_decode_insn(const u8* bytes, u32 len, u64 pc, 341 ArchDbgInsn* out) { 342 if (!bytes || !out) return KIT_INVALID; 343 if (len < 4u) return KIT_UNSUPPORTED; 344 memset(out, 0, sizeof(*out)); 345 out->pc = pc; 346 out->len = 4u; 347 memcpy(out->bytes, bytes, 4u); 348 return KIT_OK; 349 } 350 351 static KitStatus rv64_dbg_build_displaced_shim( 352 const ArchDbgInsn* insn, void* scratch_write, u64 scratch_runtime, 353 u32 scratch_cap, u32* sentinel_off, u64* fallthrough_pc) { 354 uint32_t word = 0; 355 if (!insn || !scratch_write || !sentinel_off || !fallthrough_pc) 356 return KIT_INVALID; 357 if (insn->len != 4u) return KIT_UNSUPPORTED; 358 if (scratch_cap < 28u) return KIT_INVALID; 359 memcpy(&word, insn->bytes, sizeof(word)); 360 if (dbg_rv64_build_shim(word, insn->pc, scratch_write, scratch_runtime, 361 sentinel_off) != 0) { 362 return KIT_UNSUPPORTED; 363 } 364 *fallthrough_pc = insn->pc + 4u; 365 return KIT_OK; 366 } 367 368 static int rv64_dbg_is_call(const ArchDbgInsn* insn) { 369 uint32_t word = 0; 370 uint32_t op; 371 if (!insn || insn->len != 4u) return 0; 372 memcpy(&word, insn->bytes, sizeof(word)); 373 op = rv_opcode(word); 374 if (op != RV_JAL && op != RV_JALR) return 0; 375 return rv_rd(word) != RV_ZERO; 376 } 377 378 static KitStatus rv64_dbg_direct_call_target(const ArchDbgInsn* insn, 379 u64* target_out) { 380 uint32_t word = 0; 381 if (!insn || !target_out) return KIT_INVALID; 382 if (insn->len != 4u) return KIT_UNSUPPORTED; 383 memcpy(&word, insn->bytes, sizeof(word)); 384 if (rv_opcode(word) != RV_JAL || rv_rd(word) == RV_ZERO) return KIT_NOT_FOUND; 385 *target_out = insn->pc + (u64)rv_j_imm(word); 386 return KIT_OK; 387 } 388 389 static KitStatus rv64_dbg_direct_jump_target(const ArchDbgInsn* insn, 390 u64* target_out) { 391 uint32_t word = 0; 392 if (!insn || !target_out) return KIT_INVALID; 393 if (insn->len != 4u) return KIT_UNSUPPORTED; 394 memcpy(&word, insn->bytes, sizeof(word)); 395 if (rv_opcode(word) != RV_JAL || rv_rd(word) != RV_ZERO) return KIT_NOT_FOUND; 396 *target_out = insn->pc + (u64)rv_j_imm(word); 397 return KIT_OK; 398 } 399 400 static KitStatus rv64_dbg_link_register_return_address( 401 const KitUnwindFrame* frame, u64* target_out) { 402 if (!frame || !target_out) return KIT_INVALID; 403 if (frame->regs[RV_RA] == 0) return KIT_NOT_FOUND; 404 *target_out = frame->regs[RV_RA]; 405 return KIT_OK; 406 } 407 408 const ArchDbgOps rv64_dbg_ops = { 409 .min_insn_len = 4u, 410 .max_insn_len = 4u, 411 .breakpoint_patch = rv64_dbg_breakpoint_patch, 412 .breakpoint_addr_from_fault_pc = rv64_dbg_breakpoint_addr_from_fault_pc, 413 .decode_insn = rv64_dbg_decode_insn, 414 .build_displaced_shim = rv64_dbg_build_displaced_shim, 415 .is_call = rv64_dbg_is_call, 416 .direct_call_target = rv64_dbg_direct_call_target, 417 .direct_jump_target = rv64_dbg_direct_jump_target, 418 .link_register_return_address = rv64_dbg_link_register_return_address, 419 };