native.c (168825B)
1 /* src/arch/rv64/native.c — RISC-V (RV64GC, LP64D) NativeTarget implementation. 2 * 3 * Mirrors the aa64 reference (src/arch/aa64/native.c): a physical-emission 4 * NativeTarget driven at -O0 by the shared NativeDirectTarget and at -O1+ by 5 * the optimizer emit path. ABI decisions go through the abi/ interface; this 6 * file owns only ISA emission and the RV64 frame layout. 7 * 8 * Frame model (single, top-record): s0 (x8) is the frame pointer anchored at 9 * the saved s0/ra pair; slots live below s0 at positive byte offsets `off` 10 * (address = s0 - off); outgoing args sit at the bottom of the frame (sp+0..). 11 * frame_size = align16(16 + cum_off + max_outgoing + va_save_sz) 12 * fp_pair_off = frame_size - 16 - va_save_sz (saved pair, sp-relative) 13 * CFA = s0 + (frame_size - fp_pair_off) 14 * RISC-V has no condition flags: comparisons materialize a 0/1 via SLT/SLTU or 15 * FLT/FLE; branches compare two registers directly. x0 is a hardware zero. */ 16 17 #include <string.h> 18 19 #include "abi/abi.h" 20 #include "arch/riscv/asm.h" 21 #include "arch/riscv/isa.h" 22 #include "arch/riscv/regs.h" 23 #include "arch/riscv/rv64.h" 24 #include "arch/riscv/variant.h" 25 #include "asm/asm.h" 26 #include "asm/asm_lex.h" 27 #include "cg/native_argmove.h" 28 #include "cg/native_asm.h" 29 #include "cg/native_direct_target.h" 30 #include "cg/native_frame.h" 31 #include "cg/type.h" 32 #include "core/arena.h" 33 #include "core/bytes.h" 34 #include "core/pool.h" 35 #include "core/slice.h" 36 #include "obj/obj.h" 37 38 enum { 39 RV_TMP0 = 5u, /* t0: emit-internal scratch (reserved, never allocable) */ 40 RV_TMP1 = 6u, /* t1: emit-internal scratch */ 41 RV_TMP2 = 7u, /* t2: emit-internal scratch (reserved in phys table) */ 42 RV_TMP3 = 28u, /* t3: emit-internal scratch (reserved in phys table) */ 43 RV_FTMP0 = 0u, /* ft0: emit-internal FP scratch */ 44 RV_FTMP1 = 1u, /* ft1: emit-internal FP scratch */ 45 RV_FA0 = 10u, /* fa0..fa7 = f10..f17 (FP arg/return registers) */ 46 
RV_FA7 = 17u, 47 /* Single-pass (-O0) worst-case prologue: sp adjust (3) + far save pair (7) 48 * + sret spill (1) + variadic GP spills (8). No callee-saves at -O0. */ 49 RV_PROLOGUE_WORDS = 32u, 50 /* Known-frame (-O1) prologues are emitted directly, not into the fixed -O0 51 * NOP region, and additionally save callee-saved registers (up to 11 int + 12 52 * fp, each up to 4 words for a far s0-relative offset) on top of the header, 53 * sret, and variadic spills. Size the build buffer for the worst case. */ 54 RV_KNOWN_PROLOGUE_WORDS = 192u, 55 RV_FRAME_SAVE_SIZE = 16u, 56 }; 57 58 /* s1..s11 (11) + fs0..fs11 (12); separate int/fp collect arrays use this cap. 59 */ 60 #define RV_MAX_CALLEE_SAVES 16u 61 #define RV_MAX_REG_ARG_MOVES 16u 62 63 extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc); 64 extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs, 65 u32 end_ofs); 66 67 /* ============================ low-level emit ============================ */ 68 69 void rv64_emit32(MCEmitter* mc, u32 word) { 70 u8 b[4]; 71 u32 ofs = obj_pos(mc->obj, mc->section_id); 72 wr_u32_le(b, word); 73 mc->emit_bytes(mc, b, sizeof b); 74 if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); 75 } 76 77 void rv64_emit16(MCEmitter* mc, u32 halfword) { 78 u8 b[2]; 79 u32 ofs = obj_pos(mc->obj, mc->section_id); 80 b[0] = (u8)(halfword & 0xff); 81 b[1] = (u8)((halfword >> 8) & 0xff); 82 mc->emit_bytes(mc, b, sizeof b); 83 if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); 84 } 85 86 static void rv_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) { 87 u8 b[4]; 88 wr_u32_le(b, word); 89 obj_patch(obj, sec, off, b, sizeof b); 90 } 91 92 static int fits_i12(i64 v) { return v >= -2048 && v <= 2047; } 93 static int fits_i32(i64 v) { 94 return v >= (i64)(i32)0x80000000 && v <= (i64)(i32)0x7fffffff; 95 } 96 97 static u32 align_up_u32(u32 v, u32 align) { 98 u32 mask = align ? 
align - 1u : 0u; 99 return (v + mask) & ~mask; 100 } 101 102 static i64 floor_div_4096(i64 v) { 103 if (v >= 0) return v / 4096; 104 return -((-v + 4095) / 4096); 105 } 106 107 static void rv_emit_li32(const RiscvVariant* v, MCEmitter* mc, u32 rd, 108 i32 imm) { 109 if (imm >= -2048 && imm <= 2047) { 110 rv64_emit32(mc, rv_addi(rd, RV_ZERO, imm)); 111 return; 112 } 113 { 114 i64 hi64 = floor_div_4096((i64)imm + 0x800); 115 i32 hi = (i32)hi64; 116 i32 lo = (i32)((i64)imm - hi64 * 4096); 117 rv64_emit32(mc, rv_lui(rd, (u32)hi & 0xfffffu)); 118 /* ADDIW is RV64-only; on RV32 the value fits 32 bits so plain ADDI is 119 * exact (and identical to ADDIW's low result on RV64). */ 120 if (lo) 121 rv64_emit32(mc, v->has_w_forms ? rv_addiw(rd, rd, lo) : rv_addi(rd, rd, lo)); 122 } 123 } 124 125 static i32 sext12(u32 v) { 126 v &= 0xfffu; 127 return (v & 0x800u) ? (i32)v - 4096 : (i32)v; 128 } 129 130 /* Builds a full XLEN-wide value. The recursion / slli-12 chain assembles bits 131 * above 32 and is only ever reached on rv64 (a single rv32 register cannot hold 132 * a value wider than 32 bits — the cg layer legalizes those into pairs). */ 133 static void rv_emit_li64(const RiscvVariant* v, MCEmitter* mc, u32 rd, u64 imm) { 134 if (fits_i32((i64)imm)) { 135 rv_emit_li32(v, mc, rd, (i32)(i64)imm); 136 return; 137 } 138 { 139 i32 lo = sext12((u32)imm); 140 u64 hi = (imm - (u64)(i64)lo) >> 12; 141 rv_emit_li64(v, mc, rd, hi); 142 rv64_emit32(mc, rv_slli(rd, rd, 12)); 143 if (lo) rv64_emit32(mc, rv_addi(rd, rd, lo)); 144 } 145 } 146 147 /* sf!=0 selects a full native-width materialization; sf==0 a 32-bit value. On 148 * rv32 the native width is 32, so the wide branch collapses to the 32-bit 149 * path. 
*/ 150 static void rv_emit_load_imm(const RiscvVariant* v, MCEmitter* mc, u32 sf, 151 u32 rd, i64 imm) { 152 if (!sf || v->xlen == 32u) { 153 rv_emit_li32(v, mc, rd, (i32)imm); 154 return; 155 } 156 if (fits_i32(imm)) 157 rv_emit_li32(v, mc, rd, (i32)imm); 158 else 159 rv_emit_li64(v, mc, rd, (u64)imm); 160 } 161 162 /* rd = base + off, materializing the offset when it exceeds imm12. Uses RV_TMP1 163 * as scratch for the wide path, so callers must keep RV_TMP1 free. */ 164 static void rv_emit_addr_adjust(const RiscvVariant* v, MCEmitter* mc, u32 rd, 165 u32 base, i32 off) { 166 if (off == 0) { 167 if (rd != base) rv64_emit32(mc, rv_addi(rd, base, 0)); 168 return; 169 } 170 if (fits_i12(off)) { 171 rv64_emit32(mc, rv_addi(rd, base, off)); 172 return; 173 } 174 rv_emit_load_imm(v, mc, 1, RV_TMP1, (i64)off); 175 rv64_emit32(mc, rv_add(rd, base, RV_TMP1)); 176 } 177 178 static u32 enc_int_store(const RiscvVariant* v, u32 nbytes, u32 src, u32 base, 179 i32 off) { 180 switch (nbytes) { 181 case 1: 182 return rv_sb(src, base, off); 183 case 2: 184 return rv_sh(src, base, off); 185 case 4: 186 return rv_sw(src, base, off); 187 default: 188 /* The widest GPR store is SD on rv64, SW on rv32. */ 189 return v->ptr_bytes == 8u ? rv_sd(src, base, off) : rv_sw(src, base, off); 190 } 191 } 192 static u32 enc_int_load(const RiscvVariant* v, u32 nbytes, int sign_ext, u32 rd, 193 u32 base, i32 off) { 194 switch (nbytes) { 195 case 1: 196 return sign_ext ? rv_lb(rd, base, off) : rv_lbu(rd, base, off); 197 case 2: 198 return sign_ext ? rv_lh(rd, base, off) : rv_lhu(rd, base, off); 199 case 4: 200 /* LWU (zero-extending 32-bit load) is RV64-only; on rv32 a 4-byte load 201 * is just LW (no wider container to zero-extend into). */ 202 return sign_ext || v->xlen == 32u ? rv_lw(rd, base, off) 203 : rv_lwu(rd, base, off); 204 default: 205 /* The widest GPR load is LD on rv64, LW on rv32. */ 206 return v->ptr_bytes == 8u ? 
rv_ld(rd, base, off) : rv_lw(rd, base, off); 207 } 208 } 209 210 /* Pointer-width GPR load/store (GOT entries, frame-value bases, saved ra/s0, 211 * sret/indirect/va_list pointers): LD/SD on rv64, LW/SW on rv32. */ 212 static u32 rv_ld_ptr(const RiscvVariant* v, u32 rd, u32 base, i32 off) { 213 return v->ptr_bytes == 8u ? rv_ld(rd, base, off) : rv_lw(rd, base, off); 214 } 215 static u32 rv_sd_ptr(const RiscvVariant* v, u32 src, u32 base, i32 off) { 216 return v->ptr_bytes == 8u ? rv_sd(src, base, off) : rv_sw(src, base, off); 217 } 218 219 /* ============================ target state ============================ */ 220 221 /* Frame slots and callee-save records live in the shared NativeFrame 222 * bookkeeping (cg/native_frame.h); these aliases keep the rv64-local spellings. 223 */ 224 typedef NativeFrameSlotEntry RvNativeSlot; 225 typedef NativeFrameCalleeSave RvCalleeSave; 226 227 typedef enum RvPatchKind { RV_PATCH_ALLOCA } RvPatchKind; 228 229 typedef struct RvPatch { 230 u8 kind; /* RvPatchKind */ 231 u32 pos; 232 u32 dst_reg; 233 } RvPatch; 234 235 typedef struct RvNativeTarget { 236 NativeTarget base; 237 /* Immutable per-XLEN descriptor (rv32 / rv64), set once in the constructor 238 * from c->target.arch. Every XLEN-dependent emit site reads it; with the 239 * rv64 variant each site reproduces the historical literal exactly. */ 240 const RiscvVariant* variant; 241 SrcLoc loc; 242 const CGFuncDesc* func; 243 244 /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save 245 * set, and the known_frame / has_alloca / frame_final flags. */ 246 NativeFrame frame; 247 u32 frame_size_final; 248 u32 fp_pair_off; 249 u32 minimal_prologue_words; /* known-frame path: exact prologue length, else 0 250 */ 251 252 /* Known-frame (-O1) leaf no-frame tier (aa64's slim_prologue equivalent), 253 * settled in rv_func_begin_known_frame; always 0 on the single-pass path. 
A 254 * leaf with no callee-saves, no body slots, no outgoing args, no 255 * sret/variadic and register-only params never reads s0 nor clobbers ra, so 256 * it emits NO prologue and a bare `ret` — the whole frame setup/teardown is 257 * elided. RISC-V has no pre/post-indexed store, so aa64's fp_at_bottom fold 258 * would save zero instructions on a kept frame and is intentionally not 259 * ported (see doc/plan/ARCH.md §2); this leaf tier is the rv64 win. */ 260 u8 slim_prologue; 261 262 u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */ 263 u32 next_param_int; 264 u32 next_param_fp; 265 u32 next_param_stack; 266 u8 has_sret; 267 u8 is_variadic; 268 NativeFrameSlot sret_ptr_slot; 269 270 RvPatch* patches; 271 u32 npatches; 272 u32 patches_cap; 273 u32 nalloca; 274 275 u32 func_start; 276 u32 prologue_pos; 277 MCLabel epilogue_label; 278 } RvNativeTarget; 279 280 static RvNativeTarget* rv_of(NativeTarget* t) { return (RvNativeTarget*)t; } 281 282 static _Noreturn void rv_panic(RvNativeTarget* a, const char* msg) { 283 compiler_panic(a->base.c, a->loc, "rv64 native target: %s", msg); 284 } 285 286 static RvNativeSlot* rv_slot_get(RvNativeTarget* a, NativeFrameSlot fs) { 287 return native_frame_slot_at(&a->frame, fs); 288 } 289 290 /* s0-relative byte offset of a frame slot's base (address = s0 + ret). */ 291 static i32 rv_s0_off_slot(const RvNativeSlot* s) { return -(i32)s->off; } 292 293 static u32 rv_va_save_sz(const RvNativeTarget* a) { 294 /* ABI-derived: the variadic register-save area is gp_reg_count*gp_slot_size 295 * (a0..a7 = 64 bytes for LP64D, 32 for ILP32). Only present in variadics. */ 296 return a->is_variadic ? native_frame_va_save_bytes(a->base.c->abi) : 0u; 297 } 298 299 /* s0-relative byte offset of incoming stack arg at byte_off. Stack args sit 300 * just above the saved pair; the variadic GP save area (when present) is 301 * contiguous with them at [s0 + frame_save_size). 
*/ 302 static i32 rv_s0_off_in_arg(const RvNativeTarget* a, u32 byte_off) { 303 u32 base = a->variant->frame_save_size; 304 if (a->is_variadic) base += rv_va_save_sz(a); 305 return (i32)(base + byte_off); 306 } 307 308 /* Callee-saved registers are homed just below the locals at rv_save_off() — 309 * they are NOT frame slots, so the frame size must reserve their bytes 310 * explicitly. Integer saves are ptr_bytes wide (sd on rv64, sw on rv32); FP 311 * saves are always 8 bytes (fsd, even on rv32d). On rv64 both are 8 so the sum 312 * is identical to the historical ncallee_saves*8. Zero at -O0. */ 313 static u32 rv_callee_save_bytes(const RvNativeTarget* a) { 314 u32 ptr = a->variant->ptr_bytes; 315 u32 i, bytes = 0; 316 for (i = 0; i < a->frame.ncallee_saves; ++i) 317 bytes += a->frame.callee_saves[i].cls == NATIVE_REG_FP ? 8u : ptr; 318 return bytes; 319 } 320 321 static u32 rv_frame_size(const RvNativeTarget* a) { 322 u32 raw = a->variant->frame_save_size + a->frame.cum_off + 323 rv_callee_save_bytes(a) + a->frame.max_outgoing + rv_va_save_sz(a); 324 return align_up_u32(raw, 16u); 325 } 326 327 static u32 rv_fp_pair_off(const RvNativeTarget* a, u32 frame_size) { 328 return frame_size - a->variant->frame_save_size - rv_va_save_sz(a); 329 } 330 331 /* ============================ type helpers ============================ */ 332 333 /* Scalar size/align/mem/class/loc constructors are shared in native_target.h 334 * (native_type_size, native_type_align, native_mem_for_type, 335 * native_class_for_type_fp_le8, native_loc_reg, native_loc_stack, 336 * native_loc_is_fp). loc_reg's mask is arch-specific and stays here. */ 337 338 /* True when a scalar value is WIDER than XLEN's natural single-register width, 339 * i.e. it needs the "wide" (rv64 64-bit) ops rather than the base ops. 
On rv64 340 * a pointer is 8 bytes and counts as wide alongside i64/double; on rv32 a 341 * pointer is 4 bytes and fits a single 32-bit register, so it is NOT wide and 342 * the base (non-W) ops apply. (Kept named rv_is_64 to minimize churn; for the 343 * rv64 variant the result is byte-identical to the old predicate.) */ 344 static int rv_is_64(NativeTarget* t, KitCgTypeId type) { 345 const RiscvVariant* v = rv_of(t)->variant; 346 return native_type_size(t, type) >= 8u || 347 (v->xlen == 64u && cg_type_is_ptr(t->c, type)); 348 } 349 350 static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; } 351 352 /* ============================ register tables ============================ */ 353 354 #define RV_PHYS_INT_ARG(r, idx) \ 355 {.reg = (r), \ 356 .cls = NATIVE_REG_INT, \ 357 .abi_index = (idx), \ 358 .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ 359 ((idx) < 2u ? NATIVE_REG_RET : 0), \ 360 .spill_cost = 1u, \ 361 .copy_cost = 1u} 362 #define RV_PHYS_INT_CALLER(r) \ 363 {.reg = (r), \ 364 .cls = NATIVE_REG_INT, \ 365 .abi_index = 0xffu, \ 366 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ 367 .spill_cost = 1u, \ 368 .copy_cost = 1u} 369 #define RV_PHYS_INT_CALLEE(r) \ 370 {.reg = (r), \ 371 .cls = NATIVE_REG_INT, \ 372 .abi_index = 0xffu, \ 373 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ 374 .spill_cost = 4u, \ 375 .copy_cost = 1u} 376 #define RV_PHYS_INT_RESERVED(r) \ 377 {.reg = (r), \ 378 .cls = NATIVE_REG_INT, \ 379 .abi_index = 0xffu, \ 380 .flags = NATIVE_REG_RESERVED, \ 381 .spill_cost = 0u, \ 382 .copy_cost = 0u} 383 384 /* t0..t3 (x5,x6,x7,x28) are emit-internal scratch (RV_TMP0..RV_TMP3), reserved 385 * and never handed to the allocator or driver. t4/t5 are the driver scratch 386 * pool (disjoint from the emit temps so a hook can never clobber an operand the 387 * driver parked there). 
t6 is the lone caller-saved allocable (the -O0 cache's 388 * only caller-saved home); s1..s11 are appended callee-saved, chosen under 389 * pressure (and saved by the optimizer prologue at -O1). */ 390 static const Reg rv_int_allocable[] = {31u, 9u, 18u, 19u, 20u, 21u, 391 22u, 23u, 24u, 25u, 26u, 27u}; 392 static const Reg rv_int_scratch[] = {29u, 30u}; /* t4, t5 */ 393 394 static const NativePhysRegInfo rv_int_phys[] = { 395 RV_PHYS_INT_RESERVED(0u), /* zero */ 396 RV_PHYS_INT_RESERVED(1u), /* ra */ 397 RV_PHYS_INT_RESERVED(2u), /* sp */ 398 RV_PHYS_INT_RESERVED(3u), /* gp */ 399 RV_PHYS_INT_RESERVED(4u), /* tp */ 400 RV_PHYS_INT_RESERVED(5u), /* t0 = TMP0 */ 401 RV_PHYS_INT_RESERVED(6u), /* t1 = TMP1 */ 402 RV_PHYS_INT_RESERVED(7u), /* t2 = TMP2 (emit) */ 403 RV_PHYS_INT_RESERVED(8u), /* s0/fp */ 404 RV_PHYS_INT_CALLEE(9u), /* s1 */ 405 RV_PHYS_INT_ARG(10u, 0u), RV_PHYS_INT_ARG(11u, 1u), 406 RV_PHYS_INT_ARG(12u, 2u), RV_PHYS_INT_ARG(13u, 3u), 407 RV_PHYS_INT_ARG(14u, 4u), RV_PHYS_INT_ARG(15u, 5u), 408 RV_PHYS_INT_ARG(16u, 6u), RV_PHYS_INT_ARG(17u, 7u), 409 RV_PHYS_INT_CALLEE(18u), RV_PHYS_INT_CALLEE(19u), 410 RV_PHYS_INT_CALLEE(20u), RV_PHYS_INT_CALLEE(21u), 411 RV_PHYS_INT_CALLEE(22u), RV_PHYS_INT_CALLEE(23u), 412 RV_PHYS_INT_CALLEE(24u), RV_PHYS_INT_CALLEE(25u), 413 RV_PHYS_INT_CALLEE(26u), RV_PHYS_INT_CALLEE(27u), 414 RV_PHYS_INT_RESERVED(28u), /* t3 = TMP3 (emit) */ 415 RV_PHYS_INT_RESERVED(29u), /* t4 = driver scratch */ 416 RV_PHYS_INT_RESERVED(30u), /* t5 = driver scratch */ 417 RV_PHYS_INT_CALLER(31u), /* t6 = caller-saved allocable */ 418 }; 419 420 #define RV_PHYS_FP_ARG(r, idx) \ 421 {.reg = (r), \ 422 .cls = NATIVE_REG_FP, \ 423 .abi_index = (idx), \ 424 .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ 425 ((idx) < 2u ? 
NATIVE_REG_RET : 0), \ 426 .spill_cost = 1u, \ 427 .copy_cost = 1u} 428 #define RV_PHYS_FP_CALLER(r) \ 429 {.reg = (r), \ 430 .cls = NATIVE_REG_FP, \ 431 .abi_index = 0xffu, \ 432 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ 433 .spill_cost = 1u, \ 434 .copy_cost = 1u} 435 #define RV_PHYS_FP_CALLEE(r) \ 436 {.reg = (r), \ 437 .cls = NATIVE_REG_FP, \ 438 .abi_index = 0xffu, \ 439 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ 440 .spill_cost = 4u, \ 441 .copy_cost = 1u} 442 #define RV_PHYS_FP_RESERVED(r) \ 443 {.reg = (r), \ 444 .cls = NATIVE_REG_FP, \ 445 .abi_index = 0xffu, \ 446 .flags = NATIVE_REG_RESERVED, \ 447 .spill_cost = 0u, \ 448 .copy_cost = 0u} 449 450 /* Caller-saved allocable first (ft4..ft7, ft8..ft11), then callee (fs0..fs11). 451 * ft0/ft1 reserved as emit-internal scratch; ft2/ft3 driver scratch. */ 452 static const Reg rv_fp_allocable[] = {4u, 5u, 6u, 7u, 28u, 29u, 30u, 453 31u, 8u, 9u, 18u, 19u, 20u, 21u, 454 22u, 23u, 24u, 25u, 26u, 27u}; 455 static const Reg rv_fp_scratch[] = {2u, 3u}; /* ft2, ft3 */ 456 457 static const NativePhysRegInfo rv_fp_phys[] = { 458 RV_PHYS_FP_RESERVED(0u), /* ft0 = FTMP0 */ 459 RV_PHYS_FP_RESERVED(1u), /* ft1 = FTMP1 */ 460 RV_PHYS_FP_RESERVED(2u), /* ft2 = scratch */ 461 RV_PHYS_FP_RESERVED(3u), /* ft3 = scratch */ 462 RV_PHYS_FP_CALLER(4u), RV_PHYS_FP_CALLER(5u), RV_PHYS_FP_CALLER(6u), 463 RV_PHYS_FP_CALLER(7u), RV_PHYS_FP_CALLEE(8u), RV_PHYS_FP_CALLEE(9u), 464 RV_PHYS_FP_ARG(10u, 0u), RV_PHYS_FP_ARG(11u, 1u), RV_PHYS_FP_ARG(12u, 2u), 465 RV_PHYS_FP_ARG(13u, 3u), RV_PHYS_FP_ARG(14u, 4u), RV_PHYS_FP_ARG(15u, 5u), 466 RV_PHYS_FP_ARG(16u, 6u), RV_PHYS_FP_ARG(17u, 7u), RV_PHYS_FP_CALLEE(18u), 467 RV_PHYS_FP_CALLEE(19u), RV_PHYS_FP_CALLEE(20u), RV_PHYS_FP_CALLEE(21u), 468 RV_PHYS_FP_CALLEE(22u), RV_PHYS_FP_CALLEE(23u), RV_PHYS_FP_CALLEE(24u), 469 RV_PHYS_FP_CALLEE(25u), RV_PHYS_FP_CALLEE(26u), RV_PHYS_FP_CALLEE(27u), 470 RV_PHYS_FP_CALLER(28u), RV_PHYS_FP_CALLER(29u), RV_PHYS_FP_CALLER(30u), 
471 RV_PHYS_FP_CALLER(31u), 472 }; 473 474 static const NativeAllocClassInfo rv_classes[] = { 475 {.cls = NATIVE_REG_INT, 476 .allocable = rv_int_allocable, 477 .nallocable = sizeof rv_int_allocable / sizeof rv_int_allocable[0], 478 .scratch = rv_int_scratch, 479 .nscratch = sizeof rv_int_scratch / sizeof rv_int_scratch[0], 480 .phys = rv_int_phys, 481 .nphys = sizeof rv_int_phys / sizeof rv_int_phys[0], 482 /* t0-t6 (5-7,28-31) + a0-a7 (10-17) */ 483 .caller_saved_mask = 0xf00400e0u | 0x0001fc00u, 484 /* s0-s11 (8,9,18-27) */ 485 .callee_saved_mask = 0x0ffc0300u, 486 .arg_mask = 0x0001fc00u, 487 .ret_mask = 0x00000c00u, 488 /* zero,ra,sp,gp,tp,t0,t1,t2,s0 (bits 0-8) + t3 (bit 28). t4/t5 are the 489 * driver scratch pool (reserved-from-alloc but listed in scratch[]). */ 490 .reserved_mask = 0x000001ffu | (1u << 28)}, 491 {.cls = NATIVE_REG_FP, 492 .allocable = rv_fp_allocable, 493 .nallocable = sizeof rv_fp_allocable / sizeof rv_fp_allocable[0], 494 .scratch = rv_fp_scratch, 495 .nscratch = sizeof rv_fp_scratch / sizeof rv_fp_scratch[0], 496 .phys = rv_fp_phys, 497 .nphys = sizeof rv_fp_phys / sizeof rv_fp_phys[0], 498 /* ft0-ft7 (0-7), fa0-fa7 (10-17), ft8-ft11 (28-31) */ 499 .caller_saved_mask = 0xf00400ffu | 0x0001fc00u, 500 /* fs0-fs11 (8,9,18-27) */ 501 .callee_saved_mask = 0x0ffc0300u, 502 .arg_mask = 0x0001fc00u, 503 .ret_mask = 0x00000c00u, 504 .reserved_mask = 0x0000000fu /* ft0-ft3 */}, 505 }; 506 507 /* Resolve a register name ("a7", "fa0", ...) to its (class, Reg). Powers the 508 * optimizer's inline-asm clobber masks and explicit hard-register operands 509 * ("{a7}" from a GNU local register variable). x0..x31 are DWARF 0..31; the 510 * FP bank f0..f31 is DWARF 32..63. Returns non-zero for a non-register name 511 * (cc/memory/unknown), which the caller skips. 
*/ 512 static int rv_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out, 513 NativeAllocClass* cls_out) { 514 char buf[16]; 515 uint32_t dwarf; 516 (void)ri; 517 if (!name.s || !name.len || name.len >= sizeof buf) return 1; 518 memcpy(buf, name.s, name.len); 519 buf[name.len] = '\0'; 520 if (rv64_register_index(buf, &dwarf) != 0) return 1; 521 if (dwarf <= 31u) { 522 *cls_out = NATIVE_REG_INT; 523 *out = (Reg)dwarf; 524 return 0; 525 } 526 if (dwarf >= 32u && dwarf <= 63u) { 527 *cls_out = NATIVE_REG_FP; 528 *out = (Reg)(dwarf - 32u); 529 return 0; 530 } 531 return 1; 532 } 533 534 static int rv_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, 535 Reg reg) { 536 (void)ri; 537 if (cls == NATIVE_REG_INT) { 538 if (reg == 9u) return 1; /* s1 */ 539 if (reg >= 10u && reg <= 17u) return 1; /* a0..a7 */ 540 if (reg >= 18u && reg <= 27u) return 1; /* s2..s11 */ 541 if (reg == 31u) return 1; /* t6 */ 542 return 0; 543 } 544 if (cls == NATIVE_REG_FP) return reg >= 4u && reg <= 31u; 545 return 0; 546 } 547 548 static int rv_asm_constraint_reg(const NativeRegInfo* ri, const char* body, 549 NativeAllocClass* cls_out, Reg* fixed_out, 550 u32* allowed_mask_out) { 551 (void)ri; 552 if (!body || !body[0]) return 0; 553 if (fixed_out) *fixed_out = REG_NONE; 554 if (allowed_mask_out) *allowed_mask_out = 0; 555 if (body[0] == 'r' && body[1] == '\0') { 556 if (cls_out) *cls_out = NATIVE_REG_INT; 557 return 1; 558 } 559 if (body[0] == 'f' && body[1] == '\0') { 560 if (cls_out) *cls_out = NATIVE_REG_FP; 561 return 1; 562 } 563 if (body[0] == 'c' && body[1] == 'r' && body[2] == '\0') { 564 if (cls_out) *cls_out = NATIVE_REG_INT; 565 if (allowed_mask_out) *allowed_mask_out = 0x0000ff00u; /* x8..x15 */ 566 return 1; 567 } 568 if (body[0] == 'c' && body[1] == 'f' && body[2] == '\0') { 569 if (cls_out) *cls_out = NATIVE_REG_FP; 570 if (allowed_mask_out) *allowed_mask_out = 0x0000ff00u; /* f8..f15 */ 571 return 1; 572 } 573 return 0; 574 } 575 576 static const 
NativeRegInfo rv_reg_info = { 577 .classes = rv_classes, 578 .nclasses = sizeof rv_classes / sizeof rv_classes[0], 579 .resolve_name = rv_resolve_name, 580 .asm_operand_reg_ok = rv_asm_operand_reg_ok, 581 .asm_constraint_reg = rv_asm_constraint_reg, 582 }; 583 584 /* ============================ legality ============================ */ 585 586 static int rv_imm_legal(NativeTarget* t, NativeImmUse use, u32 op, 587 KitCgTypeId type, i64 imm) { 588 /* SLLI/SRLI/SRAI shamt is shamt_bits wide: 6 bits (max 63) on rv64, 5 bits 589 * (max 31) on rv32. */ 590 i64 shamt_max = (i64)((1u << rv_of(t)->variant->shamt_bits) - 1u); 591 (void)type; 592 switch (use) { 593 case NATIVE_IMM_MOVE: 594 return 1; 595 case NATIVE_IMM_BINOP: 596 switch ((BinOp)op) { 597 case BO_IADD: 598 return fits_i12(imm); 599 case BO_ISUB: 600 return fits_i12(-imm); /* emitted as ADDI with negated imm */ 601 case BO_AND: 602 case BO_OR: 603 case BO_XOR: 604 return fits_i12(imm); 605 case BO_SHL: 606 case BO_SHR_S: 607 case BO_SHR_U: 608 return imm >= 0 && imm <= shamt_max; 609 default: 610 return 0; 611 } 612 case NATIVE_IMM_CMP: 613 return imm == 0; /* compares need both ends in registers (SLT/branch) */ 614 case NATIVE_IMM_ADDR_OFFSET: 615 return fits_i12(imm); 616 } 617 return 0; 618 } 619 620 static int rv_addr_legal(NativeTarget* t, const NativeAddr* addr, 621 MemAccess mem) { 622 (void)t; 623 (void)mem; 624 if (!addr) return 0; 625 if (addr->index_kind != NATIVE_ADDR_INDEX_NONE) return 0; 626 if (addr->base_kind != NATIVE_ADDR_BASE_REG && 627 addr->base_kind != NATIVE_ADDR_BASE_FRAME) 628 return 0; 629 return fits_i12(addr->offset); 630 } 631 632 /* ============================ memory ============================ */ 633 634 /* Materialize the runtime address of a global into `dst`, including addend. 
*/ 635 static void rv_emit_global_addr(RvNativeTarget* a, u32 dst, ObjSymId sym, 636 i64 addend) { 637 NativeTarget* t = &a->base; 638 MCEmitter* mc = t->mc; 639 u32 sec = mc->section_id; 640 if (obj_symbol_extern_via_got(t->c, t->obj, sym)) { 641 u32 ap = mc->pos(mc); 642 rv64_emit32(mc, rv_auipc(dst, 0)); 643 mc->emit_reloc_at(mc, sec, ap, R_RV_GOT_HI20, sym, 0, 0, 0); 644 { 645 Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi")); 646 ObjSymId anchor = 647 obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0); 648 u32 lp = mc->pos(mc); 649 rv64_emit32(mc, rv_ld_ptr(a->variant, dst, dst, 0)); 650 mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0); 651 } 652 } else { 653 u32 ap = mc->pos(mc); 654 rv64_emit32(mc, rv_auipc(dst, 0)); 655 mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0); 656 { 657 Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi")); 658 ObjSymId anchor = 659 obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0); 660 u32 lp = mc->pos(mc); 661 rv64_emit32(mc, rv_addi(dst, dst, 0)); 662 mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0); 663 } 664 } 665 if (addend) rv_emit_addr_adjust(a->variant, mc, dst, dst, (i32)addend); 666 } 667 668 /* Fold (base_reg << 0) + (index << scale) into RV_TMP0 via Zba. */ 669 static u32 rv_fold_index(RvNativeTarget* a, u32 base, u32 idx, u8 log2_scale) { 670 MCEmitter* mc = a->base.mc; 671 switch (log2_scale) { 672 case 0: 673 rv64_emit32(mc, rv_add(RV_TMP0, base, idx)); 674 break; 675 case 1: 676 rv64_emit32(mc, rv_sh1add(RV_TMP0, idx, base)); 677 break; 678 case 2: 679 rv64_emit32(mc, rv_sh2add(RV_TMP0, idx, base)); 680 break; 681 default: 682 rv64_emit32(mc, rv_sh3add(RV_TMP0, idx, base)); 683 break; 684 } 685 return RV_TMP0; 686 } 687 688 /* Resolve any NativeAddr to a base register + imm12 offset. 
RISC-V has no 689 * indexed load/store, so an index is folded into RV_TMP0 via Zba; far offsets 690 * and FRAME/FRAME_VALUE/GLOBAL bases are materialized into RV_TMP0/RV_TMP1. */ 691 static void rv_resolve_mem_addr(RvNativeTarget* a, const NativeAddr* addr, 692 u32* base_out, i32* off_out) { 693 MCEmitter* mc = a->base.mc; 694 u32 base; 695 i32 off; 696 switch (addr->base_kind) { 697 case NATIVE_ADDR_BASE_REG: 698 base = addr->base.reg & 0x1fu; 699 off = addr->offset; 700 break; 701 case NATIVE_ADDR_BASE_FRAME: { 702 RvNativeSlot* s = rv_slot_get(a, addr->base.frame); 703 base = RV_S0; 704 off = rv_s0_off_slot(s) + addr->offset; 705 break; 706 } 707 case NATIVE_ADDR_BASE_FRAME_VALUE: { 708 RvNativeSlot* s = rv_slot_get(a, addr->base.frame); 709 rv64_emit32(mc, rv_ld_ptr(a->variant, RV_TMP0, RV_S0, rv_s0_off_slot(s))); 710 base = RV_TMP0; 711 off = addr->offset; 712 break; 713 } 714 case NATIVE_ADDR_BASE_GLOBAL: 715 rv_emit_global_addr(a, RV_TMP0, addr->base.global.sym, 716 addr->base.global.addend); 717 base = RV_TMP0; 718 off = addr->offset; 719 break; 720 default: 721 rv_panic(a, "unsupported address base"); 722 } 723 if (addr->index_kind == NATIVE_ADDR_INDEX_REG) { 724 base = rv_fold_index(a, base, addr->index.reg & 0x1fu, addr->log2_scale); 725 } else if (addr->index_kind == NATIVE_ADDR_INDEX_FRAME_VALUE) { 726 RvNativeSlot* s = rv_slot_get(a, addr->index.frame); 727 rv64_emit32(mc, rv_ld_ptr(a->variant, RV_TMP1, RV_S0, rv_s0_off_slot(s))); 728 base = rv_fold_index(a, base, RV_TMP1, addr->log2_scale); 729 } 730 if (!fits_i12(off)) { 731 rv_emit_load_imm(a->variant, mc, 1, RV_TMP1, (i64)off); 732 rv64_emit32(mc, rv_add(RV_TMP0, base, RV_TMP1)); 733 base = RV_TMP0; 734 off = 0; 735 } 736 *base_out = base; 737 *off_out = off; 738 } 739 740 /* Central load/store primitive. is_load: 1 load into reg, 0 store reg to mem. 
741 */ 742 static void rv_emit_mem(RvNativeTarget* a, int is_load, NativeLoc reg, 743 NativeAddr addr, MemAccess mem) { 744 NativeTarget* t = &a->base; 745 MCEmitter* mc = t->mc; 746 u32 r = loc_reg(reg); 747 int fp = native_loc_is_fp(reg); 748 u32 sz = mem.size ? mem.size : native_type_size(t, reg.type); 749 u32 base; 750 i32 off; 751 752 rv_resolve_mem_addr(a, &addr, &base, &off); 753 if (fp) { 754 rv64_emit32( 755 mc, is_load ? (sz == 8u ? rv_fld(r, base, off) : rv_flw(r, base, off)) 756 : (sz == 8u ? rv_fsd(r, base, off) : rv_fsw(r, base, off))); 757 } else { 758 rv64_emit32(mc, is_load ? enc_int_load(a->variant, sz, 0, r, base, off) 759 : enc_int_store(a->variant, sz, r, base, off)); 760 } 761 } 762 763 /* ============================ moves / data ============================ */ 764 765 static void rv_move(NativeTarget* t, NativeLoc dst, NativeLoc src) { 766 MCEmitter* mc = t->mc; 767 int dfp = native_loc_is_fp(dst), sfp = native_loc_is_fp(src); 768 u32 rd = loc_reg(dst), rs = loc_reg(src); 769 if (dfp && sfp) { 770 u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S; 771 if (rd == rs) return; 772 rv64_emit32(mc, rv_fsgnj(fmt, rd, rs, rs)); 773 return; 774 } 775 if (!dfp && sfp) { 776 u32 sz = native_type_size(t, src.type); 777 rv64_emit32(mc, sz == 8u ? rv_fmv_x_d(rd, rs) : rv_fmv_x_w(rd, rs)); 778 return; 779 } 780 if (dfp && !sfp) { 781 u32 sz = native_type_size(t, dst.type); 782 rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(rd, rs) : rv_fmv_w_x(rd, rs)); 783 return; 784 } 785 if (rd == rs) return; 786 rv64_emit32(mc, rv_addi(rd, rs, 0)); 787 } 788 789 static void rv_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) { 790 rv_emit_load_imm(rv_of(t)->variant, t->mc, rv_is_64(t, dst.type) ? 
1u : 0u, 791 loc_reg(dst), imm); 792 } 793 794 static void rv_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) { 795 RvNativeTarget* a = rv_of(t); 796 u64 v = 0; 797 u32 i; 798 if (!native_loc_is_fp(dst)) { 799 for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u); 800 rv_load_imm(t, dst, (i64)v); 801 return; 802 } 803 /* FP constant: materialize the bit pattern in TMP0, bitcast into the FPR. */ 804 for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u); 805 rv_emit_load_imm(a->variant, t->mc, 1, RV_TMP0, (i64)v); 806 if (cb.size == 8u) 807 rv64_emit32(t->mc, rv_fmv_d_x(loc_reg(dst), RV_TMP0)); 808 else 809 rv64_emit32(t->mc, rv_fmv_w_x(loc_reg(dst), RV_TMP0)); 810 (void)a; 811 } 812 813 static void rv_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) { 814 RvNativeTarget* a = rv_of(t); 815 MCEmitter* mc = t->mc; 816 u32 rd = loc_reg(dst); 817 u32 base; 818 i32 off; 819 if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL) { 820 rv_emit_global_addr(a, rd, addr.base.global.sym, 821 addr.base.global.addend + addr.offset); 822 base = rd; 823 off = 0; 824 } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 825 /* Load the pointer stored in the frame slot, then add the offset. */ 826 RvNativeSlot* s = rv_slot_get(a, addr.base.frame); 827 rv64_emit32(mc, rv_ld_ptr(a->variant, rd, RV_S0, rv_s0_off_slot(s))); 828 base = rd; 829 off = addr.offset; 830 } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME) { 831 RvNativeSlot* s = rv_slot_get(a, addr.base.frame); 832 base = RV_S0; 833 off = rv_s0_off_slot(s) + addr.offset; 834 } else if (addr.base_kind == NATIVE_ADDR_BASE_REG) { 835 base = addr.base.reg & 0x1fu; 836 off = addr.offset; 837 } else { 838 rv_panic(a, "unsupported address base in load_addr"); 839 } 840 /* Fold any index via Zba sh{1,2,3}add (index << scale) + base. 
*/ 841 if (addr.index_kind == NATIVE_ADDR_INDEX_REG) { 842 u32 idx = addr.index.reg & 0x1fu; 843 if (off != 0 || base != rd) 844 rv_emit_addr_adjust(a->variant, mc, rd, base, off); 845 switch (addr.log2_scale) { 846 case 0: 847 rv64_emit32(mc, rv_add(rd, rd, idx)); 848 break; 849 case 1: 850 rv64_emit32(mc, rv_sh1add(rd, idx, rd)); 851 break; 852 case 2: 853 rv64_emit32(mc, rv_sh2add(rd, idx, rd)); 854 break; 855 default: 856 rv64_emit32(mc, rv_sh3add(rd, idx, rd)); 857 break; 858 } 859 return; 860 } 861 rv_emit_addr_adjust(a->variant, mc, rd, base, off); 862 } 863 864 static void rv_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, 865 MemAccess mem) { 866 rv_emit_mem(rv_of(t), 1, dst, addr, mem); 867 } 868 static void rv_store(NativeTarget* t, NativeAddr addr, NativeLoc src, 869 MemAccess mem) { 870 rv_emit_mem(rv_of(t), 0, src, addr, mem); 871 } 872 873 /* copy_bytes: resolve dst and src to dedicated pointer regs (RV_TMP3 / RV_TMP0) 874 * once, then copy granule-by-granule advancing both pointers. dst is resolved 875 * first because its base may itself live in RV_TMP1 (the transfer reg, e.g. the 876 * sret pointer from plan_ret); capturing it into RV_TMP3 before src resolution 877 * (which may clobber RV_TMP1 for far offsets) keeps it live. Advancing the 878 * pointers keeps every load/store at offset 0, so no offset ever exceeds imm12 879 * and the transfer reg never aliases a base. */ 880 static void rv_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src, 881 AggregateAccess access) { 882 RvNativeTarget* a = rv_of(t); 883 const RiscvVariant* v = a->variant; 884 MCEmitter* mc = t->mc; 885 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 886 u32 rem = access.size; 887 u32 maxg = v->ptr_bytes; /* widest granule: 8 on rv64, 4 on rv32 */ 888 rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst); 889 rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0), src); 890 while (rem) { 891 u32 sz = rem >= 8u && maxg >= 8u ? 
8u 892 : rem >= 4u ? 4u 893 : rem >= 2u ? 2u 894 : 1u; 895 rv64_emit32(mc, enc_int_load(v, sz, 0, RV_TMP1, RV_TMP0, 0)); 896 rv64_emit32(mc, enc_int_store(v, sz, RV_TMP1, RV_TMP3, 0)); 897 rv64_emit32(mc, rv_addi(RV_TMP0, RV_TMP0, (i32)sz)); 898 rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, (i32)sz)); 899 rem -= sz; 900 } 901 } 902 903 static void rv_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value, 904 AggregateAccess access) { 905 MCEmitter* mc = t->mc; 906 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 907 u32 bv = loc_reg(byte_value); 908 u32 rem = access.size; 909 rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst); 910 while (rem) { 911 rv64_emit32(mc, rv_sb(bv, RV_TMP3, 0)); 912 rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, 1)); 913 rem -= 1u; 914 } 915 } 916 917 /* ============================ arithmetic ============================ */ 918 919 static void rv_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop, 920 NativeLoc bop) { 921 const RiscvVariant* v = rv_of(t)->variant; 922 MCEmitter* mc = t->mc; 923 u32 rd = loc_reg(dst); 924 u32 ra = loc_reg(aop); 925 int sf = rv_is_64(t, dst.type); 926 /* The W-form ops (ADDW/SUBW/MULW/SLLW/...) are RV64-only and act on a 32-bit 927 * value held in a 64-bit register. They are emitted only for a narrow value 928 * on rv64; on rv32 the BASE ops ARE the 32-bit ops, so `w` is always 0 and we 929 * fall to the base ops. */ 930 int w = !sf && v->has_w_forms; 931 /* Immediate shamt mask: 5-bit (&31) for a W-form / rv32 op, else shamt_bits 932 * (6-bit &63 on rv64) for the native-width op. */ 933 u32 shmask = w ? 31u : ((1u << v->shamt_bits) - 1u); 934 int b_imm = bop.kind == NATIVE_LOC_IMM; 935 u32 rb = b_imm ? 0u : loc_reg(bop); 936 i64 imm = b_imm ? bop.v.imm : 0; 937 938 switch (op) { 939 case BO_FADD: 940 case BO_FSUB: 941 case BO_FMUL: 942 case BO_FDIV: { 943 u32 fmt = native_type_size(t, dst.type) == 8u ? 
RV_FMT_D : RV_FMT_S; 944 switch (op) { 945 case BO_FADD: 946 rv64_emit32(mc, rv_fadd(fmt, rd, ra, rb)); 947 break; 948 case BO_FSUB: 949 rv64_emit32(mc, rv_fsub(fmt, rd, ra, rb)); 950 break; 951 case BO_FMUL: 952 rv64_emit32(mc, rv_fmul(fmt, rd, ra, rb)); 953 break; 954 default: 955 rv64_emit32(mc, rv_fdiv(fmt, rd, ra, rb)); 956 break; 957 } 958 return; 959 } 960 case BO_IADD: 961 if (b_imm) { 962 rv64_emit32( 963 mc, w ? rv_addiw(rd, ra, (i32)imm) : rv_addi(rd, ra, (i32)imm)); 964 } else { 965 rv64_emit32(mc, w ? rv_addw(rd, ra, rb) : rv_add(rd, ra, rb)); 966 } 967 return; 968 case BO_ISUB: 969 if (b_imm) { 970 rv64_emit32( 971 mc, w ? rv_addiw(rd, ra, (i32)-imm) : rv_addi(rd, ra, (i32)-imm)); 972 } else { 973 rv64_emit32(mc, w ? rv_subw(rd, ra, rb) : rv_sub(rd, ra, rb)); 974 } 975 return; 976 case BO_IMUL: 977 rv64_emit32(mc, w ? rv_mulw(rd, ra, rb) : rv_mul(rd, ra, rb)); 978 return; 979 case BO_SDIV: 980 rv64_emit32(mc, w ? rv_divw(rd, ra, rb) : rv_div(rd, ra, rb)); 981 return; 982 case BO_UDIV: 983 rv64_emit32(mc, w ? rv_divuw(rd, ra, rb) : rv_divu(rd, ra, rb)); 984 return; 985 case BO_SREM: 986 rv64_emit32(mc, w ? rv_remw(rd, ra, rb) : rv_rem(rd, ra, rb)); 987 return; 988 case BO_UREM: 989 rv64_emit32(mc, w ? rv_remuw(rd, ra, rb) : rv_remu(rd, ra, rb)); 990 return; 991 case BO_AND: 992 rv64_emit32(mc, b_imm ? rv_andi(rd, ra, (i32)imm) : rv_and(rd, ra, rb)); 993 return; 994 case BO_OR: 995 rv64_emit32(mc, b_imm ? rv_ori(rd, ra, (i32)imm) : rv_or(rd, ra, rb)); 996 return; 997 case BO_XOR: 998 rv64_emit32(mc, b_imm ? rv_xori(rd, ra, (i32)imm) : rv_xor(rd, ra, rb)); 999 return; 1000 case BO_SHL: 1001 if (b_imm) 1002 rv64_emit32(mc, w ? rv_slliw(rd, ra, (u32)imm & shmask) 1003 : rv_slli(rd, ra, (u32)imm & shmask)); 1004 else 1005 rv64_emit32(mc, w ? rv_sllw(rd, ra, rb) : rv_sll(rd, ra, rb)); 1006 return; 1007 case BO_SHR_U: 1008 if (b_imm) 1009 rv64_emit32(mc, w ? 
rv_srliw(rd, ra, (u32)imm & shmask) 1010 : rv_srli(rd, ra, (u32)imm & shmask)); 1011 else 1012 rv64_emit32(mc, w ? rv_srlw(rd, ra, rb) : rv_srl(rd, ra, rb)); 1013 return; 1014 case BO_SHR_S: 1015 if (b_imm) 1016 rv64_emit32(mc, w ? rv_sraiw(rd, ra, (u32)imm & shmask) 1017 : rv_srai(rd, ra, (u32)imm & shmask)); 1018 else 1019 rv64_emit32(mc, w ? rv_sraw(rd, ra, rb) : rv_sra(rd, ra, rb)); 1020 return; 1021 default: 1022 rv_panic(rv_of(t), "unsupported binop"); 1023 } 1024 } 1025 1026 static void rv_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) { 1027 const RiscvVariant* v = rv_of(t)->variant; 1028 MCEmitter* mc = t->mc; 1029 u32 rd = loc_reg(dst), rs = loc_reg(src); 1030 int sf = rv_is_64(t, dst.type); 1031 int w = !sf && v->has_w_forms; /* SUBW is RV64-only; base SUB on rv32 */ 1032 switch (op) { 1033 case UO_NEG: 1034 rv64_emit32(mc, w ? rv_subw(rd, RV_ZERO, rs) : rv_sub(rd, RV_ZERO, rs)); 1035 return; 1036 case UO_FNEG: { 1037 u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S; 1038 rv64_emit32(mc, rv_fsgnjn(fmt, rd, rs, rs)); 1039 return; 1040 } 1041 case UO_BNOT: 1042 rv64_emit32(mc, rv_xori(rd, rs, -1)); 1043 return; 1044 case UO_NOT: 1045 rv64_emit32(mc, rv_sltiu(rd, rs, 1)); 1046 return; 1047 default: 1048 rv_panic(rv_of(t), "unsupported unop"); 1049 } 1050 } 1051 1052 /* Sign/zero-extend a 32-bit operand into a 64-bit register for comparison. 1053 * Returns the register to compare. */ 1054 static u32 rv_cmp_ext(NativeTarget* t, int is_signed, NativeLoc op, u32 tmp) { 1055 const RiscvVariant* v = rv_of(t)->variant; 1056 MCEmitter* mc = t->mc; 1057 u32 r = loc_reg(op); 1058 /* On rv32 a 32-bit operand already fills the whole register — there is no 1059 * wider container to canonicalize into, so the extension is a no-op. 
*/ 1060 if (v->xlen == 32u) return r; 1061 if (rv_is_64(t, op.type)) return r; 1062 if (is_signed) { 1063 rv64_emit32(mc, rv_addiw(tmp, r, 0)); /* sign-extend low 32 */ 1064 } else { 1065 rv64_emit32(mc, rv_slli(tmp, r, 32)); 1066 rv64_emit32(mc, rv_srli(tmp, tmp, 32)); 1067 } 1068 return tmp; 1069 } 1070 1071 static int cmp_is_signed(CmpOp op) { 1072 switch (op) { 1073 case CMP_LT_U: 1074 case CMP_LE_U: 1075 case CMP_GT_U: 1076 case CMP_GE_U: 1077 return 0; 1078 default: 1079 return 1; 1080 } 1081 } 1082 1083 /* Emit a 0/1 comparison result into rd from two integer registers. */ 1084 static void rv_emit_icmp(NativeTarget* t, CmpOp op, u32 rd, u32 ra, u32 rb) { 1085 MCEmitter* mc = t->mc; 1086 switch (op) { 1087 case CMP_EQ: 1088 rv64_emit32(mc, rv_sub(rd, ra, rb)); 1089 rv64_emit32(mc, rv_sltiu(rd, rd, 1)); 1090 return; 1091 case CMP_NE: 1092 rv64_emit32(mc, rv_sub(rd, ra, rb)); 1093 rv64_emit32(mc, rv_sltu(rd, RV_ZERO, rd)); 1094 return; 1095 case CMP_LT_S: 1096 rv64_emit32(mc, rv_slt(rd, ra, rb)); 1097 return; 1098 case CMP_LT_U: 1099 rv64_emit32(mc, rv_sltu(rd, ra, rb)); 1100 return; 1101 case CMP_GT_S: 1102 rv64_emit32(mc, rv_slt(rd, rb, ra)); 1103 return; 1104 case CMP_GT_U: 1105 rv64_emit32(mc, rv_sltu(rd, rb, ra)); 1106 return; 1107 case CMP_GE_S: 1108 rv64_emit32(mc, rv_slt(rd, ra, rb)); 1109 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1110 return; 1111 case CMP_GE_U: 1112 rv64_emit32(mc, rv_sltu(rd, ra, rb)); 1113 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1114 return; 1115 case CMP_LE_S: 1116 rv64_emit32(mc, rv_slt(rd, rb, ra)); 1117 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1118 return; 1119 case CMP_LE_U: 1120 rv64_emit32(mc, rv_sltu(rd, rb, ra)); 1121 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1122 return; 1123 default: 1124 rv_panic(rv_of(t), "unsupported integer cmp"); 1125 } 1126 } 1127 1128 /* Format-dispatching wrappers over the ordered FP compares (feq/flt/fle are 1129 * ordered: they yield 0 on NaN; flt/fle are signaling, raising NV on NaN — 1130 * pre-existing for 
ordered ops, and the boolean result is still correct). */ 1131 static u32 rv_feq_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) { 1132 return fmt == RV_FMT_D ? rv_feq_d(rd, ra, rb) : rv_feq_s(rd, ra, rb); 1133 } 1134 static u32 rv_flt_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) { 1135 return fmt == RV_FMT_D ? rv_flt_d(rd, ra, rb) : rv_flt_s(rd, ra, rb); 1136 } 1137 static u32 rv_fle_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) { 1138 return fmt == RV_FMT_D ? rv_fle_d(rd, ra, rb) : rv_fle_s(rd, ra, rb); 1139 } 1140 1141 static void rv_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc aop, 1142 NativeLoc bop) { 1143 MCEmitter* mc = t->mc; 1144 u32 rd = loc_reg(dst); 1145 /* FP-ness is self-describing from the opcode (FP block starts at CMP_OEQ_F). 1146 * Unordered predicates use unordered-R == NOT(ordered-not-R): the ordered 1147 * compare into rd, then `xori rd,rd,1`. ONE/UEQ have no single ordered 1148 * primitive and OR the two strict relations (a<b | a>b) via scratch RV_TMP2 1149 * (x7, reserved & never allocable, so it can't alias rd). */ 1150 if (op >= CMP_OEQ_F) { 1151 u32 fmt = native_type_size(t, aop.type) == 8u ? 
RV_FMT_D : RV_FMT_S; 1152 u32 ra = loc_reg(aop), rb = loc_reg(bop); 1153 switch (op) { 1154 case CMP_OEQ_F: 1155 rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb)); 1156 return; 1157 case CMP_UNE_F: /* !(OEQ) */ 1158 rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb)); 1159 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1160 return; 1161 case CMP_OLT_F: 1162 rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb)); 1163 return; 1164 case CMP_OLE_F: 1165 rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb)); 1166 return; 1167 case CMP_OGT_F: 1168 rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra)); 1169 return; 1170 case CMP_OGE_F: 1171 rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra)); 1172 return; 1173 case CMP_UGE_F: /* !(OLT) */ 1174 rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb)); 1175 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1176 return; 1177 case CMP_UGT_F: /* !(OLE) */ 1178 rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb)); 1179 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1180 return; 1181 case CMP_ULE_F: /* !(OGT) */ 1182 rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra)); 1183 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1184 return; 1185 case CMP_ULT_F: /* !(OGE) */ 1186 rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra)); 1187 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1188 return; 1189 case CMP_ONE_F: /* ordered & !=: (a<b) | (a>b) */ 1190 rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb)); 1191 rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra)); 1192 rv64_emit32(mc, rv_or(rd, rd, RV_TMP2)); 1193 return; 1194 case CMP_UEQ_F: /* unordered | ==: !((a<b) | (a>b)) */ 1195 rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb)); 1196 rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra)); 1197 rv64_emit32(mc, rv_or(rd, rd, RV_TMP2)); 1198 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1199 return; 1200 default: 1201 rv_panic(rv_of(t), "unsupported fp cmp"); 1202 } 1203 } 1204 { 1205 int sg = cmp_is_signed(op); 1206 u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0); 1207 u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1); 1208 rv_emit_icmp(t, op, rd, ra, rb); 1209 } 1210 } 1211 1212 static void 
rv_convert(NativeTarget* t, ConvKind op, NativeLoc dst, 1213 NativeLoc src) { 1214 const RiscvVariant* v = rv_of(t)->variant; 1215 MCEmitter* mc = t->mc; 1216 u32 rd = loc_reg(dst), rs = loc_reg(src); 1217 u32 src_sz = native_type_size(t, src.type); 1218 u32 dst_sz = native_type_size(t, dst.type); 1219 /* `il` (int-side wide): the 64-bit-integer fcvt L-forms are RV64-only; on 1220 * rv32 only the w/wu forms exist and a 64-bit int<->fp is legalized to a 1221 * libcall before reaching here. */ 1222 int il = v->has_w_forms; 1223 switch (op) { 1224 case CV_SEXT: 1225 if (src_sz >= 4u) { 1226 /* ADDIW sign-extends bits[31:0] into a 64-bit reg (RV64). On rv32 a 1227 * 4-byte value already spans the whole register, so a plain move (or 1228 * nothing when rd==rs) is the sign extension. */ 1229 if (v->has_w_forms) 1230 rv64_emit32(mc, rv_addiw(rd, rs, 0)); 1231 else if (rd != rs) 1232 rv64_emit32(mc, rv_addi(rd, rs, 0)); 1233 } else { 1234 u32 sh = v->xlen - src_sz * 8u; 1235 rv64_emit32(mc, rv_slli(rd, rs, sh)); 1236 rv64_emit32(mc, rv_srai(rd, rd, sh)); 1237 } 1238 return; 1239 case CV_ZEXT: { 1240 u32 sh = v->xlen - src_sz * 8u; 1241 rv64_emit32(mc, rv_slli(rd, rs, sh)); 1242 rv64_emit32(mc, rv_srli(rd, rd, sh)); 1243 return; 1244 } 1245 case CV_TRUNC: 1246 if (rd != rs || dst_sz <= 4u) 1247 rv64_emit32(mc, rv_addi(rd, rs, 0)); /* low bits; users re-narrow */ 1248 return; 1249 case CV_ITOF_S: 1250 if (native_type_size(t, dst.type) == 8u) 1251 rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_d_l(rd, rs) 1252 : rv_fcvt_d_w(rd, rs)); 1253 else 1254 rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_s_l(rd, rs) 1255 : rv_fcvt_s_w(rd, rs)); 1256 return; 1257 case CV_ITOF_U: 1258 if (native_type_size(t, dst.type) == 8u) 1259 rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_d_lu(rd, rs) 1260 : rv_fcvt_d_wu(rd, rs)); 1261 else 1262 rv64_emit32(mc, il && src_sz == 8u ? 
rv_fcvt_s_lu(rd, rs) 1263 : rv_fcvt_s_wu(rd, rs)); 1264 return; 1265 case CV_FTOI_S: 1266 if (src_sz == 8u) 1267 rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_l_d(rd, rs) 1268 : rv_fcvt_w_d(rd, rs)); 1269 else 1270 rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_l_s(rd, rs) 1271 : rv_fcvt_w_s(rd, rs)); 1272 return; 1273 case CV_FTOI_U: 1274 if (src_sz == 8u) 1275 rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_lu_d(rd, rs) 1276 : rv_fcvt_wu_d(rd, rs)); 1277 else 1278 rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_lu_s(rd, rs) 1279 : rv_fcvt_wu_s(rd, rs)); 1280 return; 1281 case CV_FEXT: 1282 rv64_emit32(mc, rv_fcvt_d_s(rd, rs)); 1283 return; 1284 case CV_FTRUNC: 1285 rv64_emit32(mc, rv_fcvt_s_d(rd, rs)); 1286 return; 1287 case CV_BITCAST: 1288 rv_move(t, dst, src); 1289 return; 1290 default: 1291 rv_panic(rv_of(t), "unsupported convert"); 1292 } 1293 } 1294 1295 /* ============================ spill / reload ============================ */ 1296 1297 static void rv_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot, 1298 MemAccess mem) { 1299 NativeAddr addr; 1300 memset(&addr, 0, sizeof addr); 1301 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1302 addr.base.frame = slot; 1303 addr.base_type = src.type; 1304 rv_emit_mem(rv_of(t), 0, src, addr, mem); 1305 } 1306 static void rv_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot, 1307 MemAccess mem) { 1308 NativeAddr addr; 1309 memset(&addr, 0, sizeof addr); 1310 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1311 addr.base.frame = slot; 1312 addr.base_type = dst.type; 1313 rv_emit_mem(rv_of(t), 1, dst, addr, mem); 1314 } 1315 1316 /* ============================ control flow ============================ */ 1317 1318 static MCLabel rv_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); } 1319 static void rv_label_place(NativeTarget* t, MCLabel l) { 1320 t->mc->label_place(t->mc, l); 1321 } 1322 static void rv_jump(NativeTarget* t, MCLabel l) { 1323 rv64_emit32(t->mc, rv_jal(RV_ZERO, 0)); 1324 
t->mc->emit_label_ref(t->mc, l, R_RV_JAL, 4, 0); 1325 } 1326 1327 static void rv_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc aop, 1328 NativeLoc bop, MCLabel l) { 1329 MCEmitter* mc = t->mc; 1330 /* RISC-V B-type branches reach only ±4 KiB, which a single (especially 1331 * -O0) function can exceed between a branch and its target. Rather than a 1332 * lone conditional branch to the label, emit a short *inverted* branch 1333 * that skips an unconditional `jal` (±1 MiB) to the target. The inverted 1334 * branch's displacement is the constant SKIP_JAL (skip just the jal) and 1335 * so is always in range; the jal carries the long reach. See rv_jump. */ 1336 enum { SKIP_JAL = 8 }; /* branch over the 4-byte jal that follows it */ 1337 /* FP compares have no register-register branch form: materialize the 0/1 1338 * into TMP0 via rv_cmp (handles all 12 predicates), then branch on nonzero. 1339 */ 1340 if (op >= CMP_OEQ_F) { 1341 NativeLoc tmp = 1342 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0); 1343 rv_cmp(t, op, tmp, aop, bop); 1344 /* Skip the jal when the result is 0 (condition false). */ 1345 rv64_emit32(mc, rv_beq(RV_TMP0, RV_ZERO, SKIP_JAL)); 1346 rv_jump(t, l); 1347 return; 1348 } 1349 { 1350 int sg = cmp_is_signed(op); 1351 u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0); 1352 u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1); 1353 u32 word; 1354 /* Encode the *inverse* of `op`, skipping the jal when NOT taken. 
*/ 1355 switch (op) { 1356 case CMP_EQ: 1357 word = rv_bne(ra, rb, SKIP_JAL); 1358 break; 1359 case CMP_NE: 1360 word = rv_beq(ra, rb, SKIP_JAL); 1361 break; 1362 case CMP_LT_S: 1363 word = rv_bge(ra, rb, SKIP_JAL); 1364 break; 1365 case CMP_GE_S: 1366 word = rv_blt(ra, rb, SKIP_JAL); 1367 break; 1368 case CMP_LT_U: 1369 word = rv_bgeu(ra, rb, SKIP_JAL); 1370 break; 1371 case CMP_GE_U: 1372 word = rv_bltu(ra, rb, SKIP_JAL); 1373 break; 1374 case CMP_GT_S: 1375 word = rv_bge(rb, ra, SKIP_JAL); 1376 break; 1377 case CMP_LE_S: 1378 word = rv_blt(rb, ra, SKIP_JAL); 1379 break; 1380 case CMP_GT_U: 1381 word = rv_bgeu(rb, ra, SKIP_JAL); 1382 break; 1383 case CMP_LE_U: 1384 word = rv_bltu(rb, ra, SKIP_JAL); 1385 break; 1386 default: 1387 rv_panic(rv_of(t), "unsupported cmp_branch"); 1388 } 1389 rv64_emit32(mc, word); 1390 rv_jump(t, l); 1391 } 1392 } 1393 1394 static void rv_indirect_branch(NativeTarget* t, NativeLoc addr, 1395 const MCLabel* valid_targets, u32 ntargets) { 1396 (void)valid_targets; 1397 (void)ntargets; 1398 rv64_emit32(t->mc, rv_jalr(RV_ZERO, loc_reg(addr), 0)); 1399 } 1400 1401 static void rv_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) { 1402 /* `&&label` address-take: auipc/addi with a %pcrel_hi/%pcrel_lo relocation 1403 * pair against the label's per-block local symbol — the same form 1404 * rv_emit_global_addr uses for a global — so a compressing/re-encoding 1405 * assembler recomputes the displacement (a baked offset would break under 1406 * the C extension). 
*/ 1407 MCEmitter* mc = t->mc; 1408 u32 rd = loc_reg(dst); 1409 u32 sec = mc->section_id; 1410 ObjSymId sym = mc_label_symbol(mc, l); 1411 u32 ap = mc->pos(mc); 1412 rv64_emit32(mc, rv_auipc(rd, 0)); 1413 mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0); 1414 { 1415 Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi")); 1416 ObjSymId anchor = obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0); 1417 u32 lp = mc->pos(mc); 1418 rv64_emit32(mc, rv_addi(rd, rd, 0)); 1419 mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0); 1420 } 1421 } 1422 1423 /* ============================ frame / lifecycle ============================ 1424 */ 1425 1426 static NativeFrameSlot rv_frame_slot(NativeTarget* t, 1427 const NativeFrameSlotDesc* d) { 1428 return native_frame_slot_alloc(&rv_of(t)->frame, d); 1429 } 1430 1431 static int rv_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot, 1432 CGDebugLoc* out) { 1433 RvNativeTarget* a = rv_of(t); 1434 RvNativeSlot* s; 1435 if (!out) return 0; 1436 memset(out, 0, sizeof *out); 1437 if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0; 1438 s = rv_slot_get(a, slot); 1439 out->kind = CG_DEBUG_LOC_FRAME; 1440 /* rv64 slots are addressed s0/fp-relative (rv_s0_off_slot); the hosted dbg 1441 * snapshot seeds the frame base with s0, matching aa64's FP-relative 1442 * convention. */ 1443 out->v.frame_ofs = rv_s0_off_slot(s); 1444 return 1; 1445 } 1446 1447 static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { 1448 RvNativeTarget* a = rv_of(t); 1449 MCEmitter* mc = t->mc; 1450 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type); 1451 a->func = fd; 1452 a->loc = fd->loc; 1453 /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing, 1454 * callee-save set, and known_frame/has_alloca/frame_final. 
*/ 1455 native_frame_reset(&a->frame); 1456 a->incoming_stack_size = 0; 1457 a->next_param_int = 0; 1458 a->next_param_fp = 0; 1459 a->next_param_stack = 0; 1460 a->has_sret = (abi && abi->has_sret) ? 1u : 0u; 1461 a->is_variadic = (abi && abi->variadic) ? 1u : 0u; 1462 a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE; 1463 a->npatches = 0; 1464 a->nalloca = 0; 1465 a->minimal_prologue_words = 0; 1466 a->slim_prologue = 0; 1467 1468 mc->set_section(mc, fd->text_section_id); 1469 mc->emit_align(mc, 4, 0); 1470 a->func_start = mc->pos(mc); 1471 mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start); 1472 if (mc->cfi_startproc) mc->cfi_startproc(mc); 1473 a->epilogue_label = mc->label_new(mc); 1474 } 1475 1476 /* sret: reserve a hidden slot for the incoming destination pointer (a0). */ 1477 static void rv_reserve_entry_saves(RvNativeTarget* a) { 1478 NativeTarget* t = &a->base; 1479 if (a->has_sret) { 1480 NativeFrameSlotDesc sd; 1481 u32 ptr = a->variant->ptr_bytes; 1482 memset(&sd, 0, sizeof sd); 1483 sd.type = builtin_id(KIT_CG_BUILTIN_I64); 1484 sd.size = ptr; /* a pointer slot: 8 on rv64, 4 on rv32 */ 1485 sd.align = ptr; 1486 sd.kind = NATIVE_FRAME_SLOT_SAVE; 1487 a->sret_ptr_slot = t->frame_slot(t, &sd); 1488 a->next_param_int = 1; /* a0 consumed by the sret pointer */ 1489 } 1490 } 1491 1492 static void rv_emit_entry_save_stores(RvNativeTarget* a) { 1493 NativeTarget* t = &a->base; 1494 if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) { 1495 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 1496 u32 ptr = a->variant->ptr_bytes; 1497 NativeAddr addr; 1498 memset(&addr, 0, sizeof addr); 1499 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1500 addr.base.frame = a->sret_ptr_slot; 1501 addr.base_type = i64t; 1502 rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_A0), addr, 1503 native_mem_for_type(t, i64t, ptr)); 1504 } 1505 } 1506 1507 /* Collect the callee-saves the body used (none at -O0). 
*/ 1508 static u32 rv_collect_int_saves(RvNativeTarget* a, u32* regs) { 1509 u32 n = 0, i; 1510 for (i = 0; i < a->frame.ncallee_saves; ++i) 1511 if (a->frame.callee_saves[i].cls == NATIVE_REG_INT) 1512 regs[n++] = a->frame.callee_saves[i].reg; 1513 return n; 1514 } 1515 static u32 rv_collect_fp_saves(RvNativeTarget* a, u32* regs) { 1516 u32 n = 0, i; 1517 for (i = 0; i < a->frame.ncallee_saves; ++i) 1518 if (a->frame.callee_saves[i].cls == NATIVE_REG_FP) 1519 regs[n++] = a->frame.callee_saves[i].reg; 1520 return n; 1521 } 1522 1523 /* s0-relative offset of a saved register, below the locals. The flat index runs 1524 * 0..n_int-1 over integer saves (each ptr_bytes wide) then n_int..n_int+n_fp-1 1525 * over fp saves (each 8 bytes wide, fsd). On rv64 ptr_bytes==8 so this reduces 1526 * to the historical uniform -cum_off-8-8*idx layout, byte-for-byte. */ 1527 static i32 rv_save_off(RvNativeTarget* a, u32 n_int, u32 idx) { 1528 i32 base = -(i32)(a->frame.cum_off); 1529 u32 ptr = a->variant->ptr_bytes; 1530 if (idx < n_int) return base - (i32)ptr * (i32)(idx + 1u); 1531 return base - (i32)(ptr * n_int) - 8 * (i32)(idx - n_int + 1u); 1532 } 1533 1534 static void rv_load_s0(const RiscvVariant* v, MCEmitter* mc, int fp, u32 reg, 1535 i32 off) { 1536 if (fits_i12(off)) { 1537 rv64_emit32(mc, fp ? rv_fld(reg, RV_S0, off) 1538 : rv_ld_ptr(v, reg, RV_S0, off)); 1539 return; 1540 } 1541 rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)off); 1542 rv64_emit32(mc, rv_add(RV_TMP0, RV_S0, RV_TMP0)); 1543 rv64_emit32(mc, fp ? rv_fld(reg, RV_TMP0, 0) : rv_ld_ptr(v, reg, RV_TMP0, 0)); 1544 } 1545 1546 /* Build the prologue instruction sequence into words[]. Returns count. 
*/ 1547 static u32 rv_build_prologue(RvNativeTarget* a, u32* words, u32 cap, 1548 u32 frame_size, u32 fp_pair_off, 1549 const u32* int_regs, u32 n_int, const u32* fp_regs, 1550 u32 n_fp) { 1551 const RiscvVariant* v = a->variant; 1552 u32 ptr = v->ptr_bytes; /* saved-pair / int-save stride */ 1553 u32 gp_slot = v->gp_slot_bytes; /* vararg GP-slot stride */ 1554 u32 fsz = v->frame_save_size; /* saved ra+s0 pair base offset */ 1555 u32 wi = 0; 1556 /* lui+ADD{I,IW} materializes a 32-bit constant in TMP0; ADDIW is RV64-only so 1557 * use plain ADDI on rv32 (the value already fits 32 bits). */ 1558 #define ADDI_LO(rd, lo) (v->has_w_forms ? rv_addiw((rd), (rd), (lo)) : rv_addi((rd), (rd), (lo))) 1559 #define PUSH(w) \ 1560 do { \ 1561 if (wi >= cap) rv_panic(a, "prologue placeholder overflow"); \ 1562 words[wi++] = (w); \ 1563 } while (0) 1564 /* sp -= frame_size */ 1565 if (fits_i12(-(i32)frame_size)) { 1566 PUSH(rv_addi(RV_SP, RV_SP, -(i32)frame_size)); 1567 } else { 1568 i32 neg = -(i32)frame_size; 1569 i32 hi = (i32)(((i64)neg + 0x800) >> 12); 1570 i32 lo = neg - (i32)((u32)hi << 12); 1571 PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu)); 1572 if (lo) PUSH(ADDI_LO(RV_TMP0, lo)); 1573 PUSH(rv_add(RV_SP, RV_SP, RV_TMP0)); 1574 } 1575 /* save s0/ra at [sp + fp_pair_off], set s0 = sp + fp_pair_off. The saved-pair 1576 * internal stride is ptr_bytes (s0 at +0, ra at +ptr). 
*/ 1577 if (fits_i12((i32)fp_pair_off + (i32)ptr)) { 1578 PUSH(rv_sd_ptr(v, RV_S0, RV_SP, (i32)fp_pair_off)); 1579 PUSH(rv_sd_ptr(v, RV_RA, RV_SP, (i32)fp_pair_off + (i32)ptr)); 1580 PUSH(rv_addi(RV_S0, RV_SP, (i32)fp_pair_off)); 1581 } else { 1582 i32 off = (i32)fp_pair_off; 1583 i32 hi = (i32)(((i64)off + 0x800) >> 12); 1584 i32 lo = off - (i32)((u32)hi << 12); 1585 PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu)); 1586 if (lo) PUSH(ADDI_LO(RV_TMP0, lo)); 1587 PUSH(rv_add(RV_TMP0, RV_SP, RV_TMP0)); 1588 PUSH(rv_sd_ptr(v, RV_S0, RV_TMP0, 0)); 1589 PUSH(rv_sd_ptr(v, RV_RA, RV_TMP0, (i32)ptr)); 1590 PUSH(rv_addi(RV_S0, RV_TMP0, 0)); 1591 } 1592 /* sret a0 spill */ 1593 if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) { 1594 RvNativeSlot* s = rv_slot_get(a, a->sret_ptr_slot); 1595 PUSH(rv_sd_ptr(v, RV_A0, RV_S0, rv_s0_off_slot(s))); 1596 } 1597 /* variadic GP save area: spill unconsumed a-regs at 1598 * [s0 + frame_save_size + i*gp_slot_bytes] */ 1599 if (a->is_variadic) { 1600 u32 i; 1601 for (i = a->next_param_int; i < 8u; ++i) 1602 PUSH(rv_sd_ptr(v, RV_A0 + i, RV_S0, (i32)fsz + (i32)i * (i32)gp_slot)); 1603 } 1604 /* callee saves: integer with the pointer-width store (sw/sd), fp with fsd. */ 1605 { 1606 u32 i; 1607 for (i = 0; i < n_int; ++i) { 1608 i32 off = rv_save_off(a, n_int, i); 1609 if (fits_i12(off)) { 1610 PUSH(rv_sd_ptr(v, int_regs[i], RV_S0, off)); 1611 } else { 1612 /* rare; emitted directly is fine in the known-frame path, but the 1613 * single-pass placeholder must hold these too. Use the wide form. 
*/ 1614 i32 hi = (i32)(((i64)off + 0x800) >> 12); 1615 i32 lo = off - (i32)((u32)hi << 12); 1616 PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu)); 1617 if (lo) PUSH(ADDI_LO(RV_TMP0, lo)); 1618 PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0)); 1619 PUSH(rv_sd_ptr(v, int_regs[i], RV_TMP0, 0)); 1620 } 1621 } 1622 for (i = 0; i < n_fp; ++i) { 1623 i32 off = rv_save_off(a, n_int, n_int + i); 1624 if (fits_i12(off)) { 1625 PUSH(rv_fsd(fp_regs[i], RV_S0, off)); 1626 } else { 1627 i32 hi = (i32)(((i64)off + 0x800) >> 12); 1628 i32 lo = off - (i32)((u32)hi << 12); 1629 PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu)); 1630 if (lo) PUSH(ADDI_LO(RV_TMP0, lo)); 1631 PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0)); 1632 PUSH(rv_fsd(fp_regs[i], RV_TMP0, 0)); 1633 } 1634 } 1635 } 1636 #undef PUSH 1637 #undef ADDI_LO 1638 return wi; 1639 } 1640 1641 static void rv_func_begin(NativeTarget* t, const CGFuncDesc* fd) { 1642 RvNativeTarget* a = rv_of(t); 1643 MCEmitter* mc = t->mc; 1644 u32 i; 1645 rv_func_begin_common(t, fd); 1646 a->prologue_pos = mc->pos(mc); 1647 for (i = 0; i < RV_PROLOGUE_WORDS; ++i) rv64_emit32(mc, RV_NOP); 1648 rv_reserve_entry_saves(a); 1649 rv_emit_entry_save_stores(a); 1650 } 1651 1652 static void rv_func_end(NativeTarget* t) { 1653 RvNativeTarget* a = rv_of(t); 1654 MCEmitter* mc = t->mc; 1655 ObjBuilder* obj = t->obj; 1656 ObjSecId sec = a->func->text_section_id; 1657 u32 int_regs[16], fp_regs[16]; 1658 u32 n_int = rv_collect_int_saves(a, int_regs); 1659 u32 n_fp = rv_collect_fp_saves(a, fp_regs); 1660 u32 frame_size = rv_frame_size(a); 1661 u32 fp_pair_off = rv_fp_pair_off(a, frame_size); 1662 u32 end; 1663 i32 i; 1664 a->frame_size_final = frame_size; 1665 a->fp_pair_off = fp_pair_off; 1666 1667 /* epilogue */ 1668 mc->label_place(mc, a->epilogue_label); 1669 if (a->slim_prologue) { 1670 /* Frameless leaf: no callee-saves, no s0/ra to reload, sp untouched. 
*/ 1671 rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0)); 1672 } else { 1673 const RiscvVariant* v = a->variant; 1674 for (i = (i32)n_int - 1; i >= 0; --i) 1675 rv_load_s0(v, mc, 0, int_regs[i], rv_save_off(a, n_int, (u32)i)); 1676 for (i = (i32)n_fp - 1; i >= 0; --i) 1677 rv_load_s0(v, mc, 1, fp_regs[i], rv_save_off(a, n_int, n_int + (u32)i)); 1678 if (a->frame.has_alloca) 1679 rv_emit_addr_adjust(v, mc, RV_SP, RV_S0, -(i32)fp_pair_off); 1680 /* Reload ra/s0 from the saved pair (s0 at +0, ra at +ptr_bytes), pointer 1681 * width. */ 1682 rv64_emit32(mc, rv_ld_ptr(v, RV_RA, RV_S0, (i32)v->ptr_bytes)); 1683 rv64_emit32(mc, rv_ld_ptr(v, RV_S0, RV_S0, 0)); 1684 /* sp += frame_size */ 1685 if (fits_i12((i32)frame_size)) { 1686 rv64_emit32(mc, rv_addi(RV_SP, RV_SP, (i32)frame_size)); 1687 } else { 1688 rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)frame_size); 1689 rv64_emit32(mc, rv_add(RV_SP, RV_SP, RV_TMP0)); 1690 } 1691 rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0)); 1692 } 1693 1694 /* patch prologue */ 1695 if (!a->frame.known_frame) { 1696 u32 words[RV_PROLOGUE_WORDS]; 1697 u32 nwords, k; 1698 for (k = 0; k < RV_PROLOGUE_WORDS; ++k) words[k] = RV_NOP; 1699 nwords = rv_build_prologue(a, words, RV_PROLOGUE_WORDS, frame_size, 1700 fp_pair_off, int_regs, n_int, fp_regs, n_fp); 1701 (void)nwords; 1702 for (k = 0; k < RV_PROLOGUE_WORDS; ++k) 1703 rv_patch32(obj, sec, a->prologue_pos + k * 4u, words[k]); 1704 } 1705 /* patch alloca sites: addi dst, sp, max_outgoing */ 1706 { 1707 u32 mo = align_up_u32(a->frame.max_outgoing, 16u); 1708 u32 k; 1709 if (mo > 2047u) rv_panic(a, "max_outgoing too large for alloca patch"); 1710 for (k = 0; k < a->npatches; ++k) 1711 rv_patch32(obj, sec, a->patches[k].pos, 1712 rv_addi(a->patches[k].dst_reg, RV_SP, (i32)mo)); 1713 } 1714 1715 /* CFI: CFA = s0 + (frame_size - fp_pair_off) */ 1716 if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { 1717 if (a->slim_prologue) { 1718 /* Frameless leaf: CFA = sp (unchanged from entry) and the 
return address 1719 * stays live in ra (the CIE default), so no saved-register rules. The 1720 * state holds from the first instruction (offset 0). */ 1721 mc->cfi_set_next_pc_offset(mc, 0); 1722 mc->cfi_def_cfa(mc, RV_SP, 0); 1723 } else { 1724 i32 cfa = (i32)frame_size - (i32)fp_pair_off; 1725 u32 post = a->prologue_pos + (a->frame.known_frame 1726 ? a->minimal_prologue_words * 4u 1727 : RV_PROLOGUE_WORDS * 4u); 1728 u32 k; 1729 mc->cfi_set_next_pc_offset(mc, post - a->func_start); 1730 mc->cfi_def_cfa(mc, RV_S0, cfa); 1731 mc->cfi_offset(mc, RV_S0, -cfa); 1732 /* ra is saved at the saved-pair stride above s0 (ptr_bytes). */ 1733 mc->cfi_offset(mc, RV_RA, -cfa + (i32)a->variant->ptr_bytes); 1734 for (k = 0; k < n_int; ++k) 1735 mc->cfi_offset(mc, int_regs[k], rv_save_off(a, n_int, k) - cfa); 1736 for (k = 0; k < n_fp; ++k) 1737 mc->cfi_offset(mc, 32u + fp_regs[k], 1738 rv_save_off(a, n_int, n_int + k) - cfa); 1739 } 1740 } 1741 1742 end = mc->pos(mc); 1743 obj_symbol_define(obj, a->func->sym, sec, (u64)a->func_start, 1744 (u64)(end - a->func_start)); 1745 if (a->func->atomize) 1746 obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->func->sym, 1747 0); 1748 if (mc->debug) debug_func_pc_range(mc->debug, sec, a->func_start, end); 1749 if (mc->cfi_endproc) mc->cfi_endproc(mc); 1750 mc_end_function(mc); 1751 a->func = NULL; 1752 } 1753 1754 /* rv64 homes its callee-saves below the locals at rv_save_off(idx) rather than 1755 * in frame slots, so alloc_slots=0: native_frame just records the {reg,cls} set 1756 * derived from the optimizer's per-class used-masks. 
*/ 1757 static void rv_reserve_callee_saves(NativeTarget* t, const u32* used, 1758 u32 nclasses) { 1759 native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0); 1760 } 1761 1762 static int rv_reg_is_callee_int(Reg r); 1763 static int rv_reg_is_callee_fp(Reg r); 1764 static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, 1765 u32 nclob, u32* int_mask, u32* fp_mask); 1766 1767 /* Expand the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into 1768 * this target's per-class caller/callee-saved register masks. */ 1769 /* abi_clobber_masks is shared as native_asm_abi_clobber_masks 1770 * (cg/native_asm.h); it reads the masks from t->regs->classes. */ 1771 1772 /* Build the callee-saved set the prologue must preserve: the allocator-assigned 1773 * callee-saved registers (frame->callee_saved_used) plus any an inline-asm 1774 * block clobbers. The latter are opaque to the optimizer's operand scan, so it 1775 * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral 1776 * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks 1777 * and keep only the callee-saved ones — rv_reg_is_callee_int excludes s0 (the 1778 * frame pointer, preserved by the prologue head, not as an ordinary 1779 * callee-save). This is the same register selection the per-block spill used, 1780 * hoisted into the prologue. Writes up to `cap` per-class masks into `out` and 1781 * returns the class count to reserve. */ 1782 static u32 rv_known_callee_saves(NativeTarget* t, 1783 const NativeKnownFrameDesc* frame, u32* out, 1784 u32 cap) { 1785 u32 ncls = frame->ncallee_classes; 1786 u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp; 1787 if (ncls > cap) ncls = cap; 1788 for (u32 c = 0; c < ncls; ++c) 1789 out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u; 1790 if (frame->asm_clobbers && frame->nasm_clobbers) { 1791 RvNativeTarget* a = rv_of(t); 1792 SrcLoc loc = a->func ? 
a->func->loc : (SrcLoc){0, 0, 0}; 1793 rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers, 1794 &clob_int, &clob_fp); 1795 } 1796 native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int, 1797 &abi_fp); 1798 clob_int |= abi_int; 1799 clob_fp |= abi_fp; 1800 for (Reg r = 0; r < 32u; ++r) { 1801 if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && 1802 rv_reg_is_callee_int(r)) 1803 out[NATIVE_REG_INT] |= 1u << r; 1804 if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r)) 1805 out[NATIVE_REG_FP] |= 1u << r; 1806 } 1807 return ncls; 1808 } 1809 1810 static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type, 1811 int* variadic, u32* nparams); 1812 1813 /* Optimizer entry point: the full frame is supplied up front, so the prologue 1814 * is emitted final the moment it is built — no NOP region, no func_end patch 1815 * (rv_func_end skips patching when known_frame). rv_build_prologue emits the 1816 * sret spill and the variadic register-save stores inline, so there is no 1817 * separate entry-save emission. Slot creation order matches the single-pass 1818 * path: callee-saves first (only recorded for rv64), then static slots, then 1819 * the sret entry-save slot. 
*/ 1820 static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, 1821 const NativeKnownFrameDesc* frame, 1822 NativeFrameSlot* out_slots) { 1823 RvNativeTarget* a = rv_of(t); 1824 MCEmitter* mc = t->mc; 1825 u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES]; 1826 u32 n_int, n_fp, frame_size, fp_pair_off, nwords, i; 1827 u32 words[RV_KNOWN_PROLOGUE_WORDS]; 1828 rv_func_begin_common(t, fd); 1829 a->frame.known_frame = 1; 1830 if (frame) { 1831 u32 cs[NATIVE_CALL_PLAN_CLASSES]; 1832 u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES); 1833 a->frame.has_alloca = frame->has_alloca; 1834 if (ncs) rv_reserve_callee_saves(t, cs, ncs); 1835 for (i = 0; i < frame->nslots; ++i) { 1836 NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]); 1837 if (out_slots) out_slots[i] = slot; 1838 } 1839 rv_reserve_entry_saves(a); 1840 native_frame_note_outgoing(&a->frame, frame->max_outgoing); 1841 } 1842 /* Frame is final: size and offsets are settled, so emit the exact prologue. 1843 */ 1844 frame_size = rv_frame_size(a); 1845 fp_pair_off = rv_fp_pair_off(a, frame_size); 1846 a->frame_size_final = frame_size; 1847 a->fp_pair_off = fp_pair_off; 1848 a->prologue_pos = mc->pos(mc); 1849 /* Leaf no-frame tier (aa64 slim_prologue equivalent): a leaf with no 1850 * callee-saves, no body slots, no outgoing args, no sret/variadic and 1851 * register-only params never reads s0 (no frame slots / stack args) nor 1852 * clobbers ra (no calls). Emit no prologue at all; rv_func_end emits a bare 1853 * `ret`. cum_off==0 already implies no sret slot and no param spills, but the 1854 * extra guards keep the intent explicit. Inline asm is excluded: it can 1855 * clobber ra opaquely, and without the saved record the bare `ret` would 1856 * return through the destroyed link register. 
*/ 1857 a->slim_prologue = frame && frame->is_leaf && !frame->has_asm && 1858 !frame->reads_frame && a->frame.ncallee_saves == 0 && 1859 !a->frame.has_alloca && a->frame.cum_off == 0 && 1860 a->frame.max_outgoing == 0 && !a->has_sret && 1861 !a->is_variadic && 1862 rv_signature_stack_bytes(t, fd->fn_type, NULL, NULL) == 0; 1863 if (a->slim_prologue) { 1864 a->minimal_prologue_words = 0; 1865 native_frame_set_final(&a->frame); 1866 return; 1867 } 1868 n_int = rv_collect_int_saves(a, int_regs); 1869 n_fp = rv_collect_fp_saves(a, fp_regs); 1870 nwords = rv_build_prologue(a, words, RV_KNOWN_PROLOGUE_WORDS, frame_size, 1871 fp_pair_off, int_regs, n_int, fp_regs, n_fp); 1872 for (i = 0; i < nwords; ++i) rv64_emit32(mc, words[i]); 1873 a->minimal_prologue_words = nwords; 1874 native_frame_set_final(&a->frame); 1875 } 1876 1877 /* ============================ params / ABI helpers 1878 * ============================ */ 1879 1880 static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi, 1881 const NativeCallDesc* desc, u32 i, 1882 ABIArgInfo* scratch) { 1883 /* Synthesized for unnamed (variadic) args, or untyped calls. RISC-V LP64D 1884 * passes variadic FP args in INTEGER registers (as their bit pattern), not 1885 * the FP pool — so a variadic float part is ABI_CLASS_INT. */ 1886 int variadic = abi && i >= abi->nparams; 1887 u32 gpr = rv_of(t)->variant->ptr_bytes; /* GPR width: 4 ilp32 / 8 lp64 */ 1888 u32 sz, align; 1889 int is_fp; 1890 if (abi && i < abi->nparams) return &abi->params[i]; 1891 sz = native_type_size(t, desc->args[i].type); 1892 align = native_type_align(t, desc->args[i].type); 1893 /* A variadic FP arg rides the INTEGER pool as its bit pattern (RISC-V passes 1894 * unnamed FP args in GPRs), so it is INT-class here. 
*/ 1895 is_fp = !variadic && cg_type_is_float(t->c, desc->args[i].type); 1896 memset(scratch, 0, sizeof *scratch); 1897 scratch->kind = ABI_ARG_DIRECT; 1898 /* A scalar wider than one GPR (an 8-byte i64 / soft-double on ilp32) rides a 1899 * register pair, matching the named-arg classifier (abi_rv64.c). Synthesize 1900 * one INT part per GPR-word so the per-part marshaller fills both registers 1901 * (low word in the lower-numbered reg) instead of dropping the high half into 1902 * a single register. FP-class args (hardware-float, size<=GPR) stay single. */ 1903 if (!is_fp && sz > gpr) { 1904 u32 nparts = (sz + gpr - 1u) / gpr, p; 1905 ABIArgPart* parts = arena_zarray(t->c->tu, ABIArgPart, nparts); 1906 for (p = 0; p < nparts; ++p) { 1907 u32 off = p * gpr; 1908 parts[p].cls = ABI_CLASS_INT; 1909 parts[p].loc = ABI_LOC_REG; 1910 parts[p].size = (sz - off) < gpr ? (sz - off) : gpr; 1911 parts[p].align = gpr; 1912 parts[p].src_offset = off; 1913 } 1914 scratch->nparts = (u16)nparts; 1915 scratch->parts = parts; 1916 return scratch; 1917 } 1918 scratch->nparts = 1; 1919 scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1); 1920 ((ABIArgPart*)scratch->parts)[0].cls = is_fp ? ABI_CLASS_FP : ABI_CLASS_INT; 1921 ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; 1922 ((ABIArgPart*)scratch->parts)[0].size = sz; 1923 ((ABIArgPart*)scratch->parts)[0].align = align; 1924 return scratch; 1925 } 1926 1927 /* Outgoing stack-slot size/align: the xlen-word (gp_slot_bytes: 8 lp64d / 1928 * 4 ilp32) is the natural slot stride; stack ABI alignment caps at 16. */ 1929 static u32 rv_part_stack_size(const RiscvVariant* v, const ABIArgPart* part) { 1930 u32 slot = v->gp_slot_bytes; 1931 return align_up_u32(part->size ? part->size : slot, slot); 1932 } 1933 static u32 rv_part_stack_align(const RiscvVariant* v, const ABIArgPart* part) { 1934 u32 slot = v->gp_slot_bytes; 1935 u32 al = part->align ? 
part->align : slot; 1936 if (al < slot) al = slot; 1937 if (al > 16u) al = 16u; 1938 return al; 1939 } 1940 1941 static KitCgTypeId rv_part_scalar_type(const ABIArgPart* part) { 1942 if (part->cls == ABI_CLASS_FP) { 1943 if (part->size <= 4u) return builtin_id(KIT_CG_BUILTIN_F32); 1944 return builtin_id(KIT_CG_BUILTIN_F64); 1945 } 1946 switch (part->size) { 1947 case 1u: 1948 return builtin_id(KIT_CG_BUILTIN_I8); 1949 case 2u: 1950 return builtin_id(KIT_CG_BUILTIN_I16); 1951 case 4u: 1952 return builtin_id(KIT_CG_BUILTIN_I32); 1953 default: 1954 return builtin_id(KIT_CG_BUILTIN_I64); 1955 } 1956 } 1957 1958 static u32 rv_class_stack_size(const RiscvVariant* v, const ABIArgInfo* ai) { 1959 u32 slot = v->gp_slot_bytes; 1960 u32 total = 0, p; 1961 if (!ai || ai->kind == ABI_ARG_IGNORE) return 0; 1962 if (ai->kind == ABI_ARG_INDIRECT) return v->ptr_bytes; 1963 for (p = 0; p < ai->nparts; ++p) { 1964 total = align_up_u32(total, rv_part_stack_align(v, &ai->parts[p])); 1965 total += rv_part_stack_size(v, &ai->parts[p]); 1966 } 1967 return align_up_u32(total ? total : slot, slot); 1968 } 1969 1970 static u32 rv_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { 1971 const RiscvVariant* v = rv_of(t)->variant; 1972 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); 1973 /* sret consumes a0 as the implicit first integer argument. */ 1974 u32 next_int = (abi && abi->has_sret) ? 
1u : 0u; 1975 u32 next_fp = 0, stack = 0, i, p; 1976 for (i = 0; i < desc->nargs; ++i) { 1977 ABIArgInfo tmp; 1978 const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp); 1979 int force_stack = 1980 abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams; 1981 if (ai->kind == ABI_ARG_IGNORE) continue; 1982 if (force_stack) { 1983 stack += rv_class_stack_size(v, ai); 1984 continue; 1985 } 1986 if (ai->kind == ABI_ARG_INDIRECT) { 1987 if (next_int < 8u) 1988 next_int++; 1989 else 1990 stack += v->ptr_bytes; 1991 continue; 1992 } 1993 for (p = 0; p < ai->nparts; ++p) { 1994 const ABIArgPart* part = &ai->parts[p]; 1995 if (part->cls == ABI_CLASS_FP) { 1996 if (next_fp < 8u) 1997 next_fp++; 1998 else { 1999 stack = align_up_u32(stack, rv_part_stack_align(v, part)); 2000 stack += rv_part_stack_size(v, part); 2001 } 2002 } else { 2003 if (next_int < 8u) 2004 next_int++; 2005 else { 2006 stack = align_up_u32(stack, rv_part_stack_align(v, part)); 2007 stack += rv_part_stack_size(v, part); 2008 } 2009 } 2010 } 2011 } 2012 return align_up_u32(stack, 16u); 2013 } 2014 2015 static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type, 2016 int* variadic, u32* nparams) { 2017 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type); 2018 NativeCallDesc d; 2019 if (variadic) *variadic = abi ? (int)abi->variadic : 0; 2020 if (nparams) *nparams = abi ? abi->nparams : 0u; 2021 memset(&d, 0, sizeof d); 2022 d.fn_type = fn_type; 2023 d.nargs = abi ? abi->nparams : 0u; 2024 if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs); 2025 return rv_call_stack_size(t, &d); 2026 } 2027 2028 static u32 rv_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) { 2029 return rv_call_stack_size(t, desc); 2030 } 2031 2032 /* Resolve a NativeLoc to an addressable NativeAddr (frame/stack/addr). 
*/ 2033 static NativeAddr rv_loc_addr(RvNativeTarget* a, NativeLoc loc, u32 offset) { 2034 NativeAddr addr; 2035 memset(&addr, 0, sizeof addr); 2036 switch ((NativeLocKind)loc.kind) { 2037 case NATIVE_LOC_FRAME: 2038 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 2039 addr.base.frame = loc.v.frame; 2040 addr.base_type = loc.type; 2041 addr.offset = (i32)offset; 2042 return addr; 2043 case NATIVE_LOC_STACK: 2044 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 2045 addr.base.frame = loc.v.stack.slot; 2046 addr.base_type = loc.type; 2047 addr.offset = loc.v.stack.offset + (i32)offset; 2048 return addr; 2049 case NATIVE_LOC_ADDR: 2050 addr = loc.v.addr; 2051 addr.offset += (i32)offset; 2052 return addr; 2053 default: 2054 rv_panic(a, "location is not addressable"); 2055 } 2056 return addr; 2057 } 2058 2059 static void rv_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src, 2060 u32 offset, u32 size) { 2061 RvNativeTarget* a = rv_of(t); 2062 if (src.kind == NATIVE_LOC_REG) { 2063 rv_move(t, dst, src); 2064 return; 2065 } 2066 if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK || 2067 src.kind == NATIVE_LOC_ADDR) { 2068 NativeAddr addr = rv_loc_addr(a, src, offset); 2069 addr.base_type = dst.type; 2070 rv_emit_mem(a, 1, dst, addr, native_mem_for_type(t, dst.type, size)); 2071 return; 2072 } 2073 if (src.kind == NATIVE_LOC_IMM) { 2074 rv_emit_load_imm(a->variant, t->mc, rv_is_64(t, dst.type) ? 
1u : 0u, 2075 loc_reg(dst), src.v.imm); 2076 return; 2077 } 2078 rv_panic(a, "unsupported part source"); 2079 } 2080 2081 static void rv_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src, 2082 u32 offset, u32 size) { 2083 RvNativeTarget* a = rv_of(t); 2084 if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK || 2085 dst.kind == NATIVE_LOC_ADDR) { 2086 NativeAddr addr = rv_loc_addr(a, dst, offset); 2087 addr.base_type = src.type; 2088 rv_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size)); 2089 return; 2090 } 2091 if (dst.kind == NATIVE_LOC_REG) { 2092 rv_move(t, dst, src); 2093 return; 2094 } 2095 rv_panic(a, "unsupported part destination"); 2096 } 2097 2098 static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) { 2099 NativeAddr addr = rv_loc_addr(rv_of(t), src, 0); 2100 rv_load_addr(t, dst, addr); 2101 } 2102 2103 static void rv_store_outgoing_part(NativeTarget* t, int tail_call, 2104 u32 stack_off, NativeLoc src, u32 size) { 2105 NativeAddr addr; 2106 memset(&addr, 0, sizeof addr); 2107 addr.base_kind = NATIVE_ADDR_BASE_REG; 2108 addr.base_type = src.type; 2109 if (tail_call) { 2110 /* A sibling call reuses the caller's frame: its outgoing stack args land in 2111 * the caller's incoming-arg window ([s0 + 16 + va_save + off]) — physically 2112 * the same address the tail-callee will read at [sp+off] once the teardown 2113 * has restored sp to the caller's entry sp (the CFA). */ 2114 addr.base.reg = RV_S0; 2115 addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off); 2116 } else { 2117 addr.base.reg = RV_SP; 2118 addr.offset = (i32)stack_off; 2119 } 2120 rv_emit_mem(rv_of(t), 0, src, addr, native_mem_for_type(t, src.type, size)); 2121 } 2122 2123 /* NativeTarget bind_param: route incoming param (ABI loc) into dst. 
*/ 2124 static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p, 2125 NativeLoc dst) { 2126 RvNativeTarget* a = rv_of(t); 2127 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); 2128 const ABIArgInfo* ai = 2129 p->index < abi->nparams ? &abi->params[p->index] : NULL; 2130 int to_reg = dst.kind == NATIVE_LOC_REG; 2131 u32 i; 2132 if (!ai || ai->kind == ABI_ARG_IGNORE) return; 2133 if (ai->kind == ABI_ARG_INDIRECT) { 2134 NativeLoc src = native_loc_reg( 2135 builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 2136 a->next_param_int < 8u ? RV_A0 + a->next_param_int : RV_TMP0); 2137 NativeAddr d_addr, from; 2138 AggregateAccess access; 2139 if (a->next_param_int < 8u) { 2140 a->next_param_int++; 2141 } else { 2142 NativeAddr sa; 2143 memset(&sa, 0, sizeof sa); 2144 sa.base_kind = NATIVE_ADDR_BASE_REG; 2145 sa.base.reg = RV_S0; 2146 sa.offset = rv_s0_off_in_arg(a, a->next_param_stack); 2147 sa.base_type = src.type; 2148 rv_emit_mem(a, 1, src, sa, 2149 native_mem_for_type(t, src.type, a->variant->ptr_bytes)); 2150 a->next_param_stack += a->variant->ptr_bytes; 2151 } 2152 if (dst.kind != NATIVE_LOC_FRAME) 2153 rv_panic(a, "indirect parameter requires a frame destination"); 2154 memset(&d_addr, 0, sizeof d_addr); 2155 d_addr.base_kind = NATIVE_ADDR_BASE_FRAME; 2156 d_addr.base.frame = dst.v.frame; 2157 d_addr.base_type = p->type; 2158 memset(&from, 0, sizeof from); 2159 from.base_kind = NATIVE_ADDR_BASE_REG; 2160 from.base.reg = loc_reg(src); 2161 from.base_type = p->type; 2162 memset(&access, 0, sizeof access); 2163 access.type = p->type; 2164 access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type); 2165 access.align = p->align ? p->align : native_type_align(t, p->type); 2166 rv_copy_bytes(t, d_addr, from, access); 2167 return; 2168 } 2169 for (i = 0; i < ai->nparts; ++i) { 2170 const ABIArgPart* part = &ai->parts[i]; 2171 NativeAllocClass cls = 2172 part->cls == ABI_CLASS_FP ? 
NATIVE_REG_FP : NATIVE_REG_INT; 2173 NativeLoc src; 2174 if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) { 2175 src = native_loc_reg(p->type, cls, RV_FA0 + a->next_param_fp++); 2176 } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) { 2177 src = native_loc_reg(p->type, cls, RV_A0 + a->next_param_int++); 2178 } else { 2179 Reg tmp = (cls == NATIVE_REG_FP) ? RV_FTMP0 : RV_TMP0; 2180 NativeAddr sa; 2181 src = native_loc_reg(p->type, cls, tmp); 2182 a->next_param_stack = align_up_u32( 2183 a->next_param_stack, rv_part_stack_align(a->variant, part)); 2184 memset(&sa, 0, sizeof sa); 2185 sa.base_kind = NATIVE_ADDR_BASE_REG; 2186 sa.base.reg = RV_S0; 2187 sa.base_type = p->type; 2188 sa.offset = rv_s0_off_in_arg(a, a->next_param_stack); 2189 rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, p->type, part->size)); 2190 a->next_param_stack += rv_part_stack_size(a->variant, part); 2191 } 2192 if (dst.kind == NATIVE_LOC_NONE) { 2193 /* unused parameter; cursors already advanced */ 2194 } else if (to_reg) { 2195 NativeLoc d = native_loc_reg(dst.type ? 
dst.type : p->type, 2196 (NativeAllocClass)dst.cls, (Reg)dst.v.reg); 2197 if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) && 2198 (NativeAllocClass)src.cls == (NativeAllocClass)d.cls)) 2199 rv_move(t, d, src); 2200 } else { 2201 rv_store_part( 2202 t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src, 2203 0, part->size); 2204 } 2205 } 2206 a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); 2207 } 2208 2209 /* ============================ calls / returns ============================ */ 2210 2211 typedef NativeArgMove RvArgMove; 2212 2213 static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) { 2214 if (m->is_addr) 2215 rv_addr_of_loc(t, m->dst, m->src); 2216 else 2217 rv_load_part(t, m->dst, m->src, m->src_offset, m->size); 2218 } 2219 2220 /* Parallel-copy register arg moves via the shared scheduler; cycles break 2221 * through the int/fp emit scratch (t1 / ft1). */ 2222 static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, 2223 u32 n) { 2224 NativeArgShuffle s; 2225 if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args"); 2226 memset(&s, 0, sizeof s); 2227 s.t = t; 2228 s.emit_one = rv_emit_one_arg_move; 2229 s.reg_move = rv_move; 2230 s.scratch[NATIVE_REG_INT] = RV_TMP1; 2231 s.scratch[NATIVE_REG_FP] = RV_FTMP1; 2232 native_arg_shuffle(&s, moves, n); 2233 } 2234 2235 static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, 2236 NativeCallPlan* plan) { 2237 RvNativeTarget* a = rv_of(t); 2238 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); 2239 NativeCallPlanRet* rets; 2240 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2241 memset(plan, 0, sizeof *plan); 2242 rets = desc->nresults ? 
arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL; 2243 plan->callee = desc->callee; 2244 plan->rets = rets; 2245 plan->flags = desc->flags; 2246 plan->has_sret = abi && abi->has_sret; 2247 plan->is_variadic = abi && abi->variadic; 2248 plan->stack_arg_size = rv_call_stack_size(t, desc); 2249 if (plan->stack_arg_size > a->frame.max_outgoing) 2250 a->frame.max_outgoing = plan->stack_arg_size; 2251 /* Indirect callee in an arg register would be clobbered by arg loads. */ 2252 if (plan->callee.kind == NATIVE_LOC_REG && 2253 (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT && 2254 plan->callee.v.reg >= RV_A0 && plan->callee.v.reg <= RV_A7) { 2255 NativeLoc scratch = 2256 native_loc_reg(plan->callee.type, NATIVE_REG_INT, RV_TMP0); 2257 rv_move(t, scratch, plan->callee); 2258 plan->callee = scratch; 2259 } 2260 { 2261 /* sret returns pass the hidden destination pointer as the implicit first 2262 * integer argument (a0), so the real args start at a1. */ 2263 u32 next_int = (abi && abi->has_sret) ? 
1u : 0u; 2264 u32 next_fp = 0, stack = 0, nmoves = 0, i, p; 2265 int tail = (desc->flags & CG_CALL_TAIL) != 0; 2266 RvArgMove moves[RV_MAX_REG_ARG_MOVES]; 2267 for (i = 0; i < desc->nargs; ++i) { 2268 ABIArgInfo tmp; 2269 const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp); 2270 int force_stack = 2271 abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams; 2272 if (ai->kind == ABI_ARG_IGNORE) continue; 2273 if (force_stack) { 2274 NativeLoc tmpreg = 2275 native_loc_reg(desc->args[i].type, NATIVE_REG_INT, RV_TMP0); 2276 u32 slot = a->variant->gp_slot_bytes; /* xlen-word: 8 lp64d / 4 ilp32 */ 2277 u32 n = rv_class_stack_size(a->variant, ai), off = 0; 2278 while (off < n) { 2279 rv_load_part(t, tmpreg, desc->args[i], off, slot); 2280 rv_store_outgoing_part(t, tail, stack + off, tmpreg, slot); 2281 off += slot; 2282 } 2283 stack += n; 2284 continue; 2285 } 2286 if (ai->kind == ABI_ARG_INDIRECT) { 2287 u32 ptr_sz = a->variant->ptr_bytes; 2288 if (next_int < 8u) { 2289 RvArgMove* m = &moves[nmoves++]; 2290 m->dst = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0 + next_int++); 2291 m->src = desc->args[i]; 2292 m->src_offset = 0; 2293 m->size = ptr_sz; 2294 m->is_addr = 1; 2295 } else { 2296 NativeLoc ptr = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0); 2297 rv_addr_of_loc(t, ptr, desc->args[i]); 2298 rv_store_outgoing_part(t, tail, stack, ptr, ptr_sz); 2299 stack += ptr_sz; 2300 } 2301 continue; 2302 } 2303 for (p = 0; p < ai->nparts; ++p) { 2304 const ABIArgPart* part = &ai->parts[p]; 2305 NativeAllocClass cls = 2306 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2307 if ((cls == NATIVE_REG_FP && next_fp < 8u) || 2308 (cls == NATIVE_REG_INT && next_int < 8u)) { 2309 RvArgMove* m = &moves[nmoves++]; 2310 Reg areg = 2311 cls == NATIVE_REG_FP ? 
RV_FA0 + next_fp++ : RV_A0 + next_int++; 2312 m->dst = native_loc_reg(desc->args[i].type, cls, areg); 2313 m->src = desc->args[i]; 2314 m->src_offset = part->src_offset; 2315 m->size = part->size; 2316 m->is_addr = 0; 2317 } else { 2318 Reg tmp = cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0; 2319 NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp); 2320 rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); 2321 stack = align_up_u32(stack, rv_part_stack_align(a->variant, part)); 2322 rv_store_outgoing_part(t, tail, stack, tmpreg, part->size); 2323 stack += rv_part_stack_size(a->variant, part); 2324 } 2325 } 2326 } 2327 rv_emit_reg_arg_moves(t, moves, nmoves); 2328 if (abi && abi->has_sret) { 2329 /* sret pointer goes in a0; arg loads have completed. A tail call forwards 2330 * the caller's own incoming sret pointer (spilled at entry) so the 2331 * sibling writes the result into the caller's caller's destination; 2332 * otherwise pass the address of this call's result slot. */ 2333 NativeLoc a0 = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0); 2334 if (tail) 2335 rv_load_part(t, a0, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0, 2336 a->variant->ptr_bytes); 2337 else if (desc->nresults) 2338 rv_addr_of_loc(t, a0, desc->results[0]); 2339 } 2340 } 2341 if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) { 2342 u32 nr = 0, ni = 0, nf = 0, p; 2343 for (p = 0; p < abi->ret.nparts; ++p) { 2344 const ABIArgPart* part = &abi->ret.parts[p]; 2345 NativeAllocClass cls = 2346 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2347 KitCgTypeId pty = rv_part_scalar_type(part); 2348 Reg rreg = cls == NATIVE_REG_FP ? 
RV_FA0 + nf++ : RV_A0 + ni++; 2349 rets[nr].src = native_loc_reg(pty, cls, rreg); 2350 rets[nr].dst = desc->results[0]; 2351 if (rets[nr].dst.kind == NATIVE_LOC_FRAME) 2352 rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame, 2353 (i32)part->src_offset); 2354 else if (rets[nr].dst.kind == NATIVE_LOC_STACK) { 2355 rets[nr].dst.v.stack.offset += (i32)part->src_offset; 2356 rets[nr].dst.type = pty; 2357 } 2358 rets[nr].mem = native_mem_for_type(t, pty, part->size); 2359 nr++; 2360 } 2361 plan->nrets = nr; 2362 } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) { 2363 plan->nrets = 0; 2364 } else if (!abi && desc->nresults) { 2365 rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, RV_A0); 2366 rets[0].dst = desc->results[0]; 2367 rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0); 2368 plan->nrets = 1; 2369 } 2370 } 2371 2372 /* Emit a sibling (tail) call: tear the frame down to the caller's entry state 2373 * and jump (no link) to the callee. Outgoing args are already in the arg regs / 2374 * the caller's incoming-arg window. At -O0 there are no callee-saves, and the 2375 * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of 2376 * the not-yet-final frame_size — so no func_end patching is needed. 
*/ 2377 static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) { 2378 RvNativeTarget* a = rv_of(t); 2379 const RiscvVariant* v = a->variant; 2380 MCEmitter* mc = t->mc; 2381 i32 cfa = (i32)(v->frame_save_size + rv_va_save_sz(a)); 2382 int indirect = callee.kind == NATIVE_LOC_REG; 2383 u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES]; 2384 u32 n_int = rv_collect_int_saves(a, int_regs); 2385 u32 n_fp = rv_collect_fp_saves(a, fp_regs); 2386 i32 i; 2387 /* Stage an indirect callee into a reserved scratch (t1) BEFORE the teardown: 2388 * regalloc parks the function pointer in a callee-saved register so it 2389 * survives arg marshalling, and the callee-save / s0 / ra restores below 2390 * would otherwise overwrite it. t1 is reserved (never allocable) and 2391 * untouched by the restore loop (which only uses t0 for far offsets). */ 2392 if (indirect) rv64_emit32(mc, rv_addi(RV_TMP1, loc_reg(callee), 0)); 2393 /* Restore callee-saves before tearing the frame down (O1 path; none at -O0). 2394 * Their save offsets are s0-relative via rv_save_off, so the restore is 2395 * frame-size- and teardown-order-independent. 
*/ 2396 for (i = (i32)n_int - 1; i >= 0; --i) 2397 rv_load_s0(v, mc, 0, int_regs[i], rv_save_off(a, n_int, (u32)i)); 2398 for (i = (i32)n_fp - 1; i >= 0; --i) 2399 rv_load_s0(v, mc, 1, fp_regs[i], rv_save_off(a, n_int, n_int + (u32)i)); 2400 rv64_emit32(mc, rv_ld_ptr(v, RV_RA, RV_S0, (i32)v->ptr_bytes)); 2401 rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa)); 2402 rv64_emit32(mc, rv_ld_ptr(v, RV_S0, RV_S0, 0)); 2403 if (callee.kind == NATIVE_LOC_GLOBAL) { 2404 u32 pos = mc->pos(mc); 2405 rv64_emit32(mc, rv_auipc(RV_TMP0, 0)); 2406 rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0)); 2407 mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym, 2408 callee.v.global.addend, 0, 0); 2409 } else if (indirect) { 2410 rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP1, 0)); 2411 } else { 2412 rv_panic(a, "unsupported tail call target"); 2413 } 2414 } 2415 2416 static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) { 2417 MCEmitter* mc = t->mc; 2418 ObjSecId sec = mc->section_id; 2419 if (plan->flags & CG_CALL_TAIL) { 2420 rv_emit_tail_site(t, plan->callee); 2421 return; 2422 } 2423 if (plan->callee.kind == NATIVE_LOC_GLOBAL) { 2424 u32 pos = mc->pos(mc); 2425 rv64_emit32(mc, rv_auipc(RV_RA, 0)); 2426 rv64_emit32(mc, rv_jalr(RV_RA, RV_RA, 0)); 2427 mc->emit_reloc_at(mc, sec, pos, R_RV_CALL, plan->callee.v.global.sym, 2428 plan->callee.v.global.addend, 0, 0); 2429 return; 2430 } 2431 if (plan->callee.kind == NATIVE_LOC_REG) { 2432 rv64_emit32(mc, rv_jalr(RV_RA, loc_reg(plan->callee), 0)); 2433 return; 2434 } 2435 rv_panic(rv_of(t), "unsupported call target"); 2436 } 2437 2438 static void rv_plan_ret(NativeTarget* t, const CGFuncDesc* fd, 2439 const NativeLoc* value, 2440 NativeCallPlanRet** out_rets, u32* out_nrets) { 2441 RvNativeTarget* a = rv_of(t); 2442 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type); 2443 NativeCallPlanRet* rets = NULL; 2444 u32 nr = 0; 2445 if (value) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4); 2446 if (value && 
abi && abi->ret.kind == ABI_ARG_INDIRECT) { 2447 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2448 NativeLoc dstp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1); 2449 NativeLoc saved = native_loc_stack(i64t, a->sret_ptr_slot, 0); 2450 NativeAddr dst_addr, src_addr; 2451 AggregateAccess access; 2452 rv_load_part(t, dstp, saved, 0, a->variant->ptr_bytes); 2453 memset(&dst_addr, 0, sizeof dst_addr); 2454 dst_addr.base_kind = NATIVE_ADDR_BASE_REG; 2455 dst_addr.base.reg = RV_TMP1; 2456 dst_addr.base_type = value->type; 2457 src_addr = rv_loc_addr(a, *value, 0); 2458 src_addr.base_type = value->type; 2459 memset(&access, 0, sizeof access); 2460 access.type = value->type; 2461 access.size = (u32)cg_type_size(t->c, value->type); 2462 access.align = native_type_align(t, value->type); 2463 rv_copy_bytes(t, dst_addr, src_addr, access); 2464 *out_rets = NULL; 2465 *out_nrets = 0; 2466 return; 2467 } 2468 if (value && abi && abi->ret.kind == ABI_ARG_DIRECT) { 2469 u32 ni = 0, nf = 0, p; 2470 for (p = 0; p < abi->ret.nparts; ++p) { 2471 const ABIArgPart* part = &abi->ret.parts[p]; 2472 NativeAllocClass cls = 2473 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2474 KitCgTypeId pty = rv_part_scalar_type(part); 2475 Reg rreg = cls == NATIVE_REG_FP ? 
RV_FA0 + nf++ : RV_A0 + ni++; 2476 rets[nr].src = *value; 2477 if (rets[nr].src.kind == NATIVE_LOC_FRAME) 2478 rets[nr].src = 2479 native_loc_stack(pty, value->v.frame, (i32)part->src_offset); 2480 else if (rets[nr].src.kind == NATIVE_LOC_STACK) { 2481 rets[nr].src.v.stack.offset += (i32)part->src_offset; 2482 rets[nr].src.type = pty; 2483 } 2484 rets[nr].dst = native_loc_reg(pty, cls, rreg); 2485 rets[nr].mem = native_mem_for_type(t, pty, part->size); 2486 nr++; 2487 } 2488 } else if (value) { 2489 rets[0].src = *value; 2490 rets[0].dst = native_loc_reg(value->type, NATIVE_REG_INT, RV_A0); 2491 rets[0].mem = native_mem_for_type(t, value->type, 0); 2492 nr = 1; 2493 } 2494 *out_rets = rets; 2495 *out_nrets = nr; 2496 } 2497 2498 static void rv_ret(NativeTarget* t) { 2499 RvNativeTarget* a = rv_of(t); 2500 rv_jump(t, a->epilogue_label); 2501 } 2502 2503 /* ============================ alloca ============================ */ 2504 2505 static void rv_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size, 2506 u32 align) { 2507 RvNativeTarget* a = rv_of(t); 2508 MCEmitter* mc = t->mc; 2509 u32 rsz = loc_reg(size); 2510 u32 rd = loc_reg(dst); 2511 u32 al = align ? align : 16u; 2512 if (al < 16u) al = 16u; 2513 /* round up: t0 = (size + (al-1)) & ~(al-1) */ 2514 rv64_emit32(mc, rv_addi(RV_TMP0, rsz, (i32)(al - 1u))); 2515 rv_emit_load_imm(a->variant, mc, 1, RV_TMP1, -(i64)al); 2516 rv64_emit32(mc, rv_and(RV_TMP0, RV_TMP0, RV_TMP1)); 2517 rv64_emit32(mc, rv_sub(RV_SP, RV_SP, RV_TMP0)); 2518 a->frame.has_alloca = 1; 2519 /* dst = sp + max_outgoing (patched in func_end) */ 2520 if (a->npatches == a->patches_cap) { 2521 u32 cap = a->patches_cap ? 
a->patches_cap * 2u : 8u; 2522 RvPatch* nb = arena_zarray(t->c->tu, RvPatch, cap); 2523 if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches); 2524 a->patches = nb; 2525 a->patches_cap = cap; 2526 } 2527 a->patches[a->npatches].kind = RV_PATCH_ALLOCA; 2528 a->patches[a->npatches].pos = mc->pos(mc); 2529 a->patches[a->npatches].dst_reg = rd; 2530 a->npatches++; 2531 a->nalloca++; 2532 rv64_emit32(mc, RV_NOP); /* placeholder for addi dst, sp, max_outgoing */ 2533 } 2534 2535 /* ============================ TLS / bitfield / atomics 2536 * ============================ */ 2537 2538 static void rv_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym, 2539 i64 addend) { 2540 MCEmitter* mc = t->mc; 2541 u32 sec = mc->section_id; 2542 u32 rd = loc_reg(dst); 2543 /* Local-Exec only, matching aa64 (aa_tls_addr_of) and x64 (x64_tls_addr_of): 2544 * kit links the whole module statically, so every _Thread_local symbol is 2545 * resolved within the image and TPREL is always valid. An Initial-Exec GOT 2546 * path (R_RV_TLS_GOT_HI20) used to be emitted for extern-via-GOT symbols 2547 * under -fPIE (the hosted default), but the linker has no layout/apply for 2548 * that reloc, so it produced a hard "unsupported reloc kind" link failure 2549 * rather than a working binary. */ 2550 /* lui t0, %tprel_hi(sym); add t0, tp, t0; addi dst, t0, %tprel_lo(sym). 
*/ 2551 { 2552 u32 hp = mc->pos(mc); 2553 rv64_emit32(mc, rv_lui(RV_TMP0, 0)); 2554 mc->emit_reloc_at(mc, sec, hp, R_RV_TPREL_HI20, sym, addend, 0, 0); 2555 rv64_emit32(mc, rv_add(RV_TMP0, RV_TP, RV_TMP0)); 2556 { 2557 u32 lp = mc->pos(mc); 2558 rv64_emit32(mc, rv_addi(rd, RV_TMP0, 0)); 2559 mc->emit_reloc_at(mc, sec, lp, R_RV_TPREL_LO12_I, sym, addend, 0, 0); 2560 } 2561 } 2562 } 2563 static void rv_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr ra, 2564 BitFieldAccess bf) { 2565 RvNativeTarget* a = rv_of(t); 2566 const RiscvVariant* v = a->variant; 2567 MCEmitter* mc = t->mc; 2568 u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; 2569 u32 rd = loc_reg(dst); 2570 u32 base; 2571 i32 off; 2572 u32 lsb = bf.bit_offset; 2573 u32 width = bf.bit_width ? bf.bit_width : 1u; 2574 /* Shift left so the field's MSB lands at the register top (XLEN-1), then shift 2575 * right to sign/zero extend it down. Shifts are XLEN-wide. */ 2576 u32 sh_left = v->xlen - (lsb + width); 2577 u32 sh_right = v->xlen - width; 2578 ra.offset += (i32)bf.storage_offset; 2579 rv_resolve_mem_addr(a, &ra, &base, &off); 2580 rv64_emit32(mc, enc_int_load(v, storage_bytes, 0, rd, base, off)); 2581 rv64_emit32(mc, rv_slli(rd, rd, sh_left)); 2582 if (bf.signed_) 2583 rv64_emit32(mc, rv_srai(rd, rd, sh_right)); 2584 else 2585 rv64_emit32(mc, rv_srli(rd, rd, sh_right)); 2586 } 2587 static void rv_bitfield_store(NativeTarget* t, NativeAddr ra, NativeLoc src, 2588 BitFieldAccess bf) { 2589 RvNativeTarget* a = rv_of(t); 2590 const RiscvVariant* v = a->variant; 2591 MCEmitter* mc = t->mc; 2592 u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; 2593 u32 src_reg = loc_reg(src); 2594 u32 base; 2595 i32 off; 2596 u32 lsb = bf.bit_offset; 2597 u32 width = bf.bit_width ? bf.bit_width : 1u; 2598 u64 ones = width >= 64u ? 
~(u64)0 : (((u64)1 << width) - 1u); 2599 u64 mask_in = ones << lsb; 2600 ra.offset += (i32)bf.storage_offset; 2601 /* Resolve the field address; rv_resolve_mem_addr may use RV_TMP0/RV_TMP1, so 2602 * stabilize the base into RV_TMP1 before consuming the scratch temps. */ 2603 rv_resolve_mem_addr(a, &ra, &base, &off); 2604 if (base != RV_S0 && base != RV_TMP1) { 2605 rv_emit_addr_adjust(v, mc, RV_TMP1, base, off); 2606 base = RV_TMP1; 2607 off = 0; 2608 } else if (base == RV_TMP1 && off != 0) { 2609 rv_emit_addr_adjust(v, mc, RV_TMP1, RV_TMP1, off); 2610 off = 0; 2611 } 2612 /* word in RV_TMP2; merged via RV_TMP0 (clear mask, then shifted src). */ 2613 rv64_emit32(mc, enc_int_load(v, storage_bytes, 0, RV_TMP2, base, off)); 2614 rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)~mask_in); 2615 rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP0)); 2616 rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)ones); 2617 rv64_emit32(mc, rv_and(RV_TMP0, src_reg, RV_TMP0)); 2618 if (lsb) rv64_emit32(mc, rv_slli(RV_TMP0, RV_TMP0, lsb)); 2619 rv64_emit32(mc, rv_or(RV_TMP2, RV_TMP2, RV_TMP0)); 2620 rv64_emit32(mc, enc_int_store(v, storage_bytes, RV_TMP2, base, off)); 2621 } 2622 static int rv_order_acquire(KitCgMemOrder o) { 2623 return o == KIT_CG_MO_CONSUME || o == KIT_CG_MO_ACQUIRE || 2624 o == KIT_CG_MO_ACQ_REL || o == KIT_CG_MO_SEQ_CST; 2625 } 2626 static int rv_order_release(KitCgMemOrder o) { 2627 return o == KIT_CG_MO_RELEASE || o == KIT_CG_MO_ACQ_REL || 2628 o == KIT_CG_MO_SEQ_CST; 2629 } 2630 2631 /* Materialize the atomic operand address into RV_TMP0 (a bare pointer, since 2632 * LR/SC and AMO take a base register with no offset) and return it. 
*/ 2633 static u32 rv_atomic_addr_reg(RvNativeTarget* a, NativeAddr addr) { 2634 NativeLoc dst = 2635 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0); 2636 rv_load_addr(&a->base, dst, addr); 2637 return RV_TMP0; 2638 } 2639 2640 static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, 2641 MemAccess mem, KitCgMemOrder mo) { 2642 RvNativeTarget* a = rv_of(t); 2643 MCEmitter* mc = t->mc; 2644 u32 sf = 2645 (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u; 2646 u32 base = rv_atomic_addr_reg(a, addr); 2647 if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw()); 2648 if (rv_order_acquire(mo)) { 2649 /* lr.w/d as an ordered load (aq=1). */ 2650 rv64_emit32(mc, sf ? rv_lr_d(loc_reg(dst), base, 1, 0) 2651 : rv_lr_w(loc_reg(dst), base, 1, 0)); 2652 } else { 2653 rv64_emit32(mc, enc_int_load(a->variant, 2654 mem.size ? mem.size 2655 : native_type_size(t, dst.type), 2656 0, loc_reg(dst), base, 0)); 2657 } 2658 } 2659 2660 static void rv_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src, 2661 MemAccess mem, KitCgMemOrder mo) { 2662 RvNativeTarget* a = rv_of(t); 2663 MCEmitter* mc = t->mc; 2664 u32 sz = mem.size ? mem.size : native_type_size(t, src.type); 2665 /* RV_TMP0 holds the address; never collides with src (an allocable reg). */ 2666 u32 base = rv_atomic_addr_reg(a, addr); 2667 if (rv_order_release(mo)) rv64_emit32(mc, rv_fence_rw_rw()); 2668 rv64_emit32(mc, enc_int_store(a->variant, sz, loc_reg(src), base, 0)); 2669 if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw()); 2670 } 2671 2672 static void rv_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst, 2673 NativeAddr addr, NativeLoc val, MemAccess mem, 2674 KitCgMemOrder mo) { 2675 RvNativeTarget* a = rv_of(t); 2676 const RiscvVariant* v = a->variant; 2677 MCEmitter* mc = t->mc; 2678 u32 sf = (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 
1u : 0u; 2679 /* W-form add/sub apply only to a 32-bit value on rv64; on rv32 the base ops 2680 * are the 32-bit ops. */ 2681 int w = !sf && v->has_w_forms; 2682 u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */ 2683 u32 vreg = loc_reg(val); 2684 u32 rd = loc_reg(dst); 2685 u32 aq = (u32)rv_order_acquire(mo); 2686 u32 rl = (u32)rv_order_release(mo); 2687 MCLabel retry = mc->label_new(mc); 2688 /* LR/SC loop: dst = *base; new = dst op val; sc new; retry on failure. 2689 * RV_TMP1 carries the SC status, RV_TMP3 the computed new value. */ 2690 mc->label_place(mc, retry); 2691 rv64_emit32(mc, sf ? rv_lr_d(rd, base, aq, 0) : rv_lr_w(rd, base, aq, 0)); 2692 switch (op) { 2693 case KIT_CG_ATOMIC_XCHG: 2694 rv64_emit32(mc, rv_addi(RV_TMP3, vreg, 0)); 2695 break; 2696 case KIT_CG_ATOMIC_ADD: 2697 rv64_emit32(mc, 2698 w ? rv_addw(RV_TMP3, rd, vreg) : rv_add(RV_TMP3, rd, vreg)); 2699 break; 2700 case KIT_CG_ATOMIC_SUB: 2701 rv64_emit32(mc, 2702 w ? rv_subw(RV_TMP3, rd, vreg) : rv_sub(RV_TMP3, rd, vreg)); 2703 break; 2704 case KIT_CG_ATOMIC_AND: 2705 rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg)); 2706 break; 2707 case KIT_CG_ATOMIC_OR: 2708 rv64_emit32(mc, rv_or(RV_TMP3, rd, vreg)); 2709 break; 2710 case KIT_CG_ATOMIC_XOR: 2711 rv64_emit32(mc, rv_xor(RV_TMP3, rd, vreg)); 2712 break; 2713 case KIT_CG_ATOMIC_NAND: 2714 rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg)); 2715 rv64_emit32(mc, rv_xori(RV_TMP3, RV_TMP3, -1)); 2716 break; 2717 default: 2718 rv_panic(a, "unsupported atomic rmw op"); 2719 } 2720 rv64_emit32(mc, sf ? 
rv_sc_d(RV_TMP1, base, RV_TMP3, 0, rl) 2721 : rv_sc_w(RV_TMP1, base, RV_TMP3, 0, rl)); 2722 rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0)); 2723 mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0); 2724 } 2725 2726 static void rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, 2727 NativeAddr addr, NativeLoc expected, 2728 NativeLoc desired, MemAccess mem, 2729 KitCgMemOrder success, KitCgMemOrder failure) { 2730 RvNativeTarget* a = rv_of(t); 2731 MCEmitter* mc = t->mc; 2732 u32 sf = 2733 (mem.size ? mem.size : native_type_size(t, prior.type)) == 8u ? 1u : 0u; 2734 u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */ 2735 u32 rprior = loc_reg(prior); 2736 u32 rexp = loc_reg(expected); 2737 u32 rdes = loc_reg(desired); 2738 u32 rok = loc_reg(ok); 2739 u32 aq = (u32)rv_order_acquire(success); 2740 u32 rl = (u32)rv_order_release(success); 2741 MCLabel retry = mc->label_new(mc); 2742 MCLabel fail = mc->label_new(mc); 2743 MCLabel done = mc->label_new(mc); 2744 (void)failure; 2745 mc->label_place(mc, retry); 2746 rv64_emit32(mc, 2747 sf ? rv_lr_d(rprior, base, aq, 0) : rv_lr_w(rprior, base, aq, 0)); 2748 /* if (prior != expected) -> fail */ 2749 rv64_emit32(mc, rv_bne(rprior, rexp, 0)); 2750 mc->emit_label_ref(mc, fail, R_RV_BRANCH, 4, 0); 2751 /* sc.w/d status, desired, (base); retry on failure. */ 2752 rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, rdes, 0, rl) 2753 : rv_sc_w(RV_TMP1, base, rdes, 0, rl)); 2754 rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0)); 2755 mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0); 2756 /* ok = 1; jump done. 
*/ 2757 rv_emit_load_imm(a->variant, mc, 0, rok, 1); 2758 rv64_emit32(mc, rv_jal(RV_ZERO, 0)); 2759 mc->emit_label_ref(mc, done, R_RV_JAL, 4, 0); 2760 mc->label_place(mc, fail); 2761 rv_emit_load_imm(a->variant, mc, 0, rok, 0); 2762 mc->label_place(mc, done); 2763 } 2764 2765 static void rv_fence(NativeTarget* t, KitCgMemOrder mo) { 2766 if (mo == KIT_CG_MO_RELAXED) return; 2767 rv64_emit32(t->mc, rv_fence_rw_rw()); 2768 } 2769 /* ---- variadics (LP64D ABI_VA_LIST_POINTER) ---- 2770 * va_list is a single void* to the next argument slot. The prologue spilled 2771 * unconsumed a-regs into the 64-byte save area at [s0+16); incoming stack args 2772 * follow contiguously, so a uniform 8-byte stride covers both. `ap` is a 2773 * NativeAddr that addresses the va_list object itself. */ 2774 2775 static void rv_va_start_core(RvNativeTarget* a, NativeAddr ap) { 2776 NativeTarget* t = &a->base; 2777 const RiscvVariant* v = a->variant; 2778 MCEmitter* mc = t->mc; 2779 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 2780 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2781 u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes; 2782 if (vai.kind != ABI_VA_LIST_POINTER) 2783 rv_panic(a, "unsupported va_list layout"); 2784 if (!a->is_variadic) rv_panic(a, "va_start: function not variadic"); 2785 /* *ap = s0 + frame_save + next_param_int*gp_slot (skip named-int slots). */ 2786 rv64_emit32(mc, rv_addi(RV_TMP1, RV_S0, 2787 (i32)v->frame_save_size + 2788 (i32)(a->next_param_int * slot))); 2789 rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1), ap, 2790 native_mem_for_type(t, i64t, v->ptr_bytes)); 2791 } 2792 2793 /* Wide / aggregate va_arg: a value too large for a single GPR (an 8-byte 2794 * i64 / soft-double on ilp32) occupies consecutive GP slots in the save area 2795 * and cannot move through one register. Read the cursor, advance it past the 2796 * whole span, then byte-copy the value from the (saved) cursor into the 2797 * destination memory. 
RV_TMP2 holds the cursor across the rv_copy_bytes call, 2798 * which itself uses RV_TMP0/RV_TMP1/RV_TMP3. */ 2799 static void rv_va_arg_wide(RvNativeTarget* a, NativeAddr dst, NativeAddr ap, 2800 u32 sz) { 2801 NativeTarget* t = &a->base; 2802 const RiscvVariant* v = a->variant; 2803 MCEmitter* mc = t->mc; 2804 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2805 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 2806 u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes; 2807 u32 span = align_up_u32(sz, slot); 2808 NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP2); 2809 NativeLoc nxt = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1); 2810 NativeAddr src; 2811 AggregateAccess acc; 2812 /* cur = *ap; *ap = cur + span. */ 2813 rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes)); 2814 rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP2, (i32)span)); 2815 rv_emit_mem(a, 0, nxt, ap, native_mem_for_type(t, i64t, v->ptr_bytes)); 2816 /* Copy sz bytes from [cur] to the destination. */ 2817 memset(&src, 0, sizeof src); 2818 src.base_kind = NATIVE_ADDR_BASE_REG; 2819 src.base.reg = RV_TMP2; 2820 src.base_type = i64t; 2821 memset(&acc, 0, sizeof acc); 2822 acc.type = i64t; 2823 acc.size = sz; 2824 acc.align = slot; 2825 rv_copy_bytes(t, dst, src, acc); 2826 } 2827 2828 static void rv_va_arg_core(RvNativeTarget* a, NativeLoc dst, NativeAddr ap, 2829 KitCgTypeId type) { 2830 NativeTarget* t = &a->base; 2831 const RiscvVariant* v = a->variant; 2832 MCEmitter* mc = t->mc; 2833 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 2834 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2835 u32 sz = native_type_size(t, type); 2836 u32 slot = vai.gp_slot_size ? 
vai.gp_slot_size : v->gp_slot_bytes; 2837 NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1); 2838 NativeAddr from; 2839 if (vai.kind != ABI_VA_LIST_POINTER) 2840 rv_panic(a, "unsupported va_list layout"); 2841 if (dst.kind != NATIVE_LOC_REG) rv_panic(a, "va_arg destination must be reg"); 2842 /* cur = *ap; load value from [cur]; *ap = cur + slot (one GP-slot stride). */ 2843 rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes)); 2844 memset(&from, 0, sizeof from); 2845 from.base_kind = NATIVE_ADDR_BASE_REG; 2846 from.base.reg = RV_TMP1; 2847 from.base_type = type; 2848 if (native_loc_is_fp(dst)) { 2849 /* Variadic FP args sit in the integer save area as their bit pattern; 2850 * load into RV_TMP2 and bitcast into the FPR. The fmv_d_x (double) path is 2851 * RV64-only — on rv32 doubles are passed soft and never reach here. */ 2852 NativeLoc itmp = native_loc_reg(type, NATIVE_REG_INT, RV_TMP2); 2853 rv_emit_mem(a, 1, itmp, from, native_mem_for_type(t, type, sz)); 2854 rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(loc_reg(dst), RV_TMP2) 2855 : rv_fmv_w_x(loc_reg(dst), RV_TMP2)); 2856 } else { 2857 rv_emit_mem(a, 1, dst, from, native_mem_for_type(t, type, sz)); 2858 } 2859 rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, (i32)slot)); 2860 rv_emit_mem(a, 0, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes)); 2861 } 2862 2863 static void rv_va_copy_core(RvNativeTarget* a, NativeAddr dst_ap, 2864 NativeAddr src_ap) { 2865 NativeTarget* t = &a->base; 2866 u32 ptr = a->variant->ptr_bytes; 2867 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2868 NativeLoc tmp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1); 2869 /* va_list is a single pointer-width slot. 
*/ 2870 rv_emit_mem(a, 1, tmp, src_ap, native_mem_for_type(t, i64t, ptr)); 2871 rv_emit_mem(a, 0, tmp, dst_ap, native_mem_for_type(t, i64t, ptr)); 2872 } 2873 2874 static NativeAddr rv_va_addr_from_ptr(NativeLoc ap_ptr) { 2875 NativeAddr addr; 2876 memset(&addr, 0, sizeof addr); 2877 addr.base_kind = NATIVE_ADDR_BASE_REG; 2878 addr.cls = NATIVE_REG_INT; 2879 addr.base.reg = ap_ptr.v.reg; 2880 addr.base_type = ap_ptr.type; 2881 return addr; 2882 } 2883 2884 static void rv_va_start_native(NativeTarget* t, NativeLoc ap_ptr) { 2885 rv_va_start_core(rv_of(t), rv_va_addr_from_ptr(ap_ptr)); 2886 } 2887 /* A scalar whose value cannot move through one GPR (size > GPR width, e.g. an 2888 * 8-byte i64 / soft-double on ilp32). pass_native_emit hands such a va_arg its 2889 * memory destination directly rather than a scratch register. */ 2890 static int rv_va_arg_is_wide(NativeTarget* t, KitCgTypeId type) { 2891 return native_type_size(t, type) > rv_of(t)->variant->ptr_bytes; 2892 } 2893 2894 static void rv_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, 2895 KitCgTypeId type) { 2896 RvNativeTarget* a = rv_of(t); 2897 if (rv_va_arg_is_wide(t, type)) { 2898 rv_va_arg_wide(a, rv_loc_addr(a, dst, 0), rv_va_addr_from_ptr(ap_ptr), 2899 native_type_size(t, type)); 2900 return; 2901 } 2902 rv_va_arg_core(a, dst, rv_va_addr_from_ptr(ap_ptr), type); 2903 } 2904 static void rv_va_end_native(NativeTarget* t, NativeLoc ap_ptr) { 2905 (void)t; 2906 (void)ap_ptr; 2907 } 2908 static void rv_va_copy_native(NativeTarget* t, NativeLoc dst, NativeLoc src) { 2909 rv_va_copy_core(rv_of(t), rv_va_addr_from_ptr(dst), rv_va_addr_from_ptr(src)); 2910 } 2911 /* Software popcount of RV_TMP1 (already width-normalized) into rd, using 2912 * RV_TMP1/RV_TMP2/RV_TMP3 as scratch. Mirrors the legacy bit-twiddling. On rv32 2913 * only the 32-bit (is64==0) path is reachable for a single register. 
*/
static void rv_emit_popcount(const RiscvVariant* v, MCEmitter* mc, u32 rd,
                             int is64) {
  /* SWAR constants; the 64-bit forms are used only on the is64 path. */
  const i64 pair_mask = is64 ? (i64)0x5555555555555555ll : (i64)0x55555555;
  const i64 quad_mask = is64 ? (i64)0x3333333333333333ll : (i64)0x33333333;
  const i64 nibble_mask = is64 ? (i64)0x0f0f0f0f0f0f0f0fll : (i64)0x0f0f0f0f;
  const i64 byte_ones = is64 ? (i64)0x0101010101010101ll : (i64)0x01010101;
  const u32 sum_shift = is64 ? 56u : 24u;
  const u32 x = RV_TMP1; /* running value */
  const u32 y = RV_TMP2; /* shifted/masked partial */
  const u32 k = RV_TMP3; /* current constant */

  /* x -= (x >> 1) & pair_mask: 2-bit counts. */
  rv64_emit32(mc, rv_srli(y, x, 1));
  rv_emit_load_imm(v, mc, 1, k, pair_mask);
  rv64_emit32(mc, rv_and(y, y, k));
  rv64_emit32(mc, rv_sub(x, x, y));

  /* x = (x & quad_mask) + ((x >> 2) & quad_mask): 4-bit counts. */
  rv_emit_load_imm(v, mc, 1, k, quad_mask);
  rv64_emit32(mc, rv_and(y, x, k));
  rv64_emit32(mc, rv_srli(x, x, 2));
  rv64_emit32(mc, rv_and(x, x, k));
  rv64_emit32(mc, rv_add(x, x, y));

  /* x = (x + (x >> 4)) & nibble_mask: per-byte counts. */
  rv64_emit32(mc, rv_srli(y, x, 4));
  rv64_emit32(mc, rv_add(x, x, y));
  rv_emit_load_imm(v, mc, 1, k, nibble_mask);
  rv64_emit32(mc, rv_and(x, x, k));

  /* Multiply by 0x01..01 to sum the bytes into the top byte, then shift it
   * down into rd. */
  rv_emit_load_imm(v, mc, 1, k, byte_ones);
  rv64_emit32(mc, rv_mul(x, x, k));
  rv64_emit32(mc, rv_srli(rd, x, sum_shift));

  /* The 32-bit SWAR sum lives in product bits [24,32); since the multiply is
   * 64-bit, bits [32,64) survive the >>24 and must be masked off. (The 64-bit
   * path's >>56 already isolates the top byte, so it needs no mask.) */
  if (!is64) rv64_emit32(mc, rv_andi(rd, rd, 0xff));
}

/* Inline byte-granule copy/set between bare base registers (memcpy/memmove/
 * memset intrinsics). dir<0 copies high-to-low (memmove backward). The 8-byte
 * granule (ld/sd) and zero-extending lwu are RV64-only; on rv32 the widest
 * granule is 4 bytes via lw/sw.
*/ 2946 static void rv_intrin_copy(const RiscvVariant* v, MCEmitter* mc, u32 dr, u32 sr, 2947 u32 n, int backward) { 2948 int wide = v->ptr_bytes == 8u; 2949 if (!backward) { 2950 u32 i = 0; 2951 while (wide && i + 8u <= n) { 2952 rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i)); 2953 rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i)); 2954 i += 8u; 2955 } 2956 while (i + 4u <= n) { 2957 rv64_emit32(mc, wide ? rv_lwu(RV_TMP3, sr, (i32)i) 2958 : rv_lw(RV_TMP3, sr, (i32)i)); 2959 rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i)); 2960 i += 4u; 2961 } 2962 while (i + 2u <= n) { 2963 rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i)); 2964 rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i)); 2965 i += 2u; 2966 } 2967 while (i < n) { 2968 rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i)); 2969 rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i)); 2970 i += 1u; 2971 } 2972 } else { 2973 u32 i = n; 2974 while (wide && i >= 8u) { 2975 i -= 8u; 2976 rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i)); 2977 rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i)); 2978 } 2979 while (i >= 4u) { 2980 i -= 4u; 2981 rv64_emit32(mc, wide ? rv_lwu(RV_TMP3, sr, (i32)i) 2982 : rv_lw(RV_TMP3, sr, (i32)i)); 2983 rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i)); 2984 } 2985 while (i >= 2u) { 2986 i -= 2u; 2987 rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i)); 2988 rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i)); 2989 } 2990 while (i >= 1u) { 2991 i -= 1u; 2992 rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i)); 2993 rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i)); 2994 } 2995 } 2996 } 2997 2998 static void rv_intrinsic(NativeTarget* t, IntrinKind kind, 2999 const NativeLoc* dsts, u32 ndst, const NativeLoc* args, 3000 u32 narg) { 3001 RvNativeTarget* a = rv_of(t); 3002 const RiscvVariant* v = a->variant; 3003 MCEmitter* mc = t->mc; 3004 (void)ndst; 3005 (void)narg; 3006 switch (kind) { 3007 case INTRIN_NONE: 3008 break; 3009 case INTRIN_EXPECT: 3010 case INTRIN_ASSUME_ALIGNED: { 3011 /* dst = val (hint dropped). 
*/ 3012 if (args[0].kind == NATIVE_LOC_IMM) 3013 rv_emit_load_imm(v, mc, rv_is_64(t, dsts[0].type) ? 1u : 0u, 3014 loc_reg(dsts[0]), args[0].v.imm); 3015 else 3016 rv_move(t, dsts[0], args[0]); 3017 return; 3018 } 3019 case INTRIN_PREFETCH: 3020 return; 3021 case INTRIN_TRAP: 3022 rv64_emit32(mc, rv_ebreak()); 3023 return; 3024 case INTRIN_SYSCALL: 3025 if (ndst == 1u && narg >= 1u && narg <= 7u) { 3026 static const u32 syscall_regs[7] = {RV_A7, RV_A0, RV_A1, RV_A2, 3027 RV_A3, RV_A4, RV_A5}; 3028 RvArgMove moves[7]; 3029 for (u32 i = 0; i < narg; ++i) { 3030 RvArgMove* m = &moves[i]; 3031 memset(m, 0, sizeof *m); 3032 m->dst = 3033 native_loc_reg(dsts[0].type, NATIVE_REG_INT, syscall_regs[i]); 3034 m->src = args[i]; 3035 m->size = t->c->target.ptr_size; 3036 } 3037 rv_emit_reg_arg_moves(t, moves, narg); 3038 rv64_emit32(mc, rv_ecall()); 3039 rv_move(t, dsts[0], native_loc_reg(dsts[0].type, NATIVE_REG_INT, RV_A0)); 3040 } 3041 return; 3042 case INTRIN_BSWAP: { 3043 u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type); 3044 switch (width) { 3045 case 2: { 3046 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 3047 /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */ 3048 rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff)); 3049 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */ 3050 rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8)); 3051 rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2)); 3052 rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8)); 3053 rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff)); 3054 rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3)); 3055 return; 3056 } 3057 case 4: { 3058 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 3059 /* SRLIW is RV64-only; on rv32 SRLI on a 32-bit reg is equivalent. */ 3060 int w = v->has_w_forms; 3061 rv64_emit32(mc, w ? rv_srliw(RV_TMP1, rs, 24) : rv_srli(RV_TMP1, rs, 24)); 3062 rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff)); 3063 rv64_emit32(mc, w ? 
rv_srliw(RV_TMP2, rs, 16) : rv_srli(RV_TMP2, rs, 16)); 3064 rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); 3065 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); 3066 rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); 3067 rv64_emit32(mc, w ? rv_srliw(RV_TMP2, rs, 8) : rv_srli(RV_TMP2, rs, 8)); 3068 rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); 3069 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16)); 3070 rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); 3071 rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff)); 3072 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24)); 3073 rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2)); 3074 /* Canonicalize to a 32-bit value in a 64-bit reg (RV64 only); on rv32 3075 * the result already occupies the whole register. */ 3076 if (w) { 3077 rv64_emit32(mc, rv_slli(rd, rd, 32)); 3078 rv64_emit32(mc, rv_srli(rd, rd, 32)); 3079 } 3080 return; 3081 } 3082 case 8: { 3083 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 3084 int i; 3085 rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0)); 3086 for (i = 0; i < 8; ++i) { 3087 int sh = 56 - 8 * i; 3088 if (i == 0) { 3089 rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff)); 3090 } else { 3091 rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i))); 3092 rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); 3093 } 3094 if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh)); 3095 rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); 3096 } 3097 rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0)); 3098 return; 3099 } 3100 default: 3101 break; 3102 } 3103 return; 3104 } 3105 case INTRIN_POPCOUNT: { 3106 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 3107 int is64 = rv_is_64(t, args[0].type); 3108 /* The narrow-in-wide normalization clears the high 32 bits of a 64-bit 3109 * reg; on rv32 there are none, so it is skipped. 
*/ 3110 int nrm = !is64 && v->xlen == 64u; 3111 rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0)); 3112 if (nrm) { 3113 rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32)); 3114 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32)); 3115 } 3116 rv_emit_popcount(v, mc, rd, is64); 3117 return; 3118 } 3119 case INTRIN_CTZ: { 3120 /* ctz(x) = popcount((x & -x) - 1) for x != 0. */ 3121 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 3122 int is64 = rv_is_64(t, args[0].type); 3123 int nrm = !is64 && v->xlen == 64u; 3124 rv64_emit32(mc, rv_sub(RV_TMP1, RV_ZERO, rs)); 3125 rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, rs)); 3126 rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, -1)); 3127 if (nrm) { 3128 rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32)); 3129 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32)); 3130 } 3131 rv_emit_popcount(v, mc, rd, is64); 3132 return; 3133 } 3134 case INTRIN_CLZ: { 3135 /* Fold the high bit downward, then clz = popcount(~folded). */ 3136 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 3137 int is64 = rv_is_64(t, args[0].type); 3138 int nrm = !is64 && v->xlen == 64u; 3139 u32 shifts[6] = {1, 2, 4, 8, 16, 32}; 3140 u32 ns = is64 ? 6u : 5u, i; 3141 rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0)); 3142 if (nrm) { 3143 rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32)); 3144 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32)); 3145 } 3146 for (i = 0; i < ns; ++i) { 3147 rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, shifts[i])); 3148 rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); 3149 } 3150 rv64_emit32(mc, rv_xori(RV_TMP1, RV_TMP1, -1)); 3151 if (nrm) { 3152 rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32)); 3153 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32)); 3154 } 3155 rv_emit_popcount(v, mc, rd, is64); 3156 return; 3157 } 3158 case INTRIN_SADD_OVERFLOW: 3159 case INTRIN_SSUB_OVERFLOW: { 3160 /* dsts: [val, ovf]. ADD: ovf=((a^r)&(b^r))>>(w-1); 3161 * SUB: ovf=((a^b)&(a^r))>>(w-1). 
*/ 3162 int is64 = rv_is_64(t, dsts[0].type); 3163 int w = !is64 && v->has_w_forms; /* narrow op on rv64 -> W-form */ 3164 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 3165 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 3166 u32 sh = is64 ? 63u : 31u; 3167 if (kind == INTRIN_SADD_OVERFLOW) 3168 rv64_emit32(mc, w ? rv_addw(RV_TMP2, ra, rb) : rv_add(RV_TMP2, ra, rb)); 3169 else 3170 rv64_emit32(mc, w ? rv_subw(RV_TMP2, ra, rb) : rv_sub(RV_TMP2, ra, rb)); 3171 rv64_emit32(mc, rv_xor(RV_TMP3, ra, RV_TMP2)); /* a ^ r */ 3172 if (kind == INTRIN_SADD_OVERFLOW) { 3173 rv64_emit32(mc, rv_xor(rovf, rb, RV_TMP2)); /* b ^ r */ 3174 rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3)); 3175 } else { 3176 rv64_emit32(mc, rv_xor(rovf, ra, rb)); /* a ^ b */ 3177 rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3)); 3178 } 3179 rv64_emit32(mc, w ? rv_srliw(rovf, rovf, sh) : rv_srli(rovf, rovf, sh)); 3180 rv64_emit32(mc, rv_andi(rovf, rovf, 1)); 3181 rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0)); 3182 return; 3183 } 3184 case INTRIN_UADD_OVERFLOW: 3185 case INTRIN_USUB_OVERFLOW: { 3186 int is64 = rv_is_64(t, dsts[0].type); 3187 /* `single`: the value fills the whole native register (rv64 i64 or any 3188 * rv32 value), so the native carry/borrow sequence applies directly; the 3189 * `!single` branch is the rv64 32-bit-in-64-bit-register implementation 3190 * (zero-extend + srli-32), reachable only on rv64. 
*/ 3191 int single = is64 || v->xlen == 32u; 3192 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 3193 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 3194 if (!single) { 3195 rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32)); 3196 rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32)); 3197 rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32)); 3198 rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32)); 3199 ra = RV_TMP2; 3200 rb = RV_TMP3; 3201 } 3202 if (kind == INTRIN_UADD_OVERFLOW) { 3203 if (single) { 3204 rv64_emit32(mc, rv_add(RV_TMP2, ra, rb)); 3205 rv64_emit32(mc, rv_sltu(rovf, RV_TMP2, ra)); 3206 } else { 3207 rv64_emit32(mc, rv_add(RV_TMP2, ra, rb)); 3208 rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32)); 3209 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 3210 rv64_emit32(mc, rv_addiw(RV_TMP2, RV_TMP2, 0)); 3211 } 3212 } else { 3213 rv64_emit32(mc, rv_sltu(rovf, ra, rb)); 3214 rv64_emit32(mc, single ? rv_sub(RV_TMP2, ra, rb) 3215 : rv_subw(RV_TMP2, ra, rb)); 3216 } 3217 rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0)); 3218 return; 3219 } 3220 case INTRIN_SMUL_OVERFLOW: { 3221 int is64 = rv_is_64(t, dsts[0].type); 3222 /* `single`: native-width product overflow via MUL + MULH and a sign-bit 3223 * compare (shift xlen-1). rv64 i64 and any rv32 value take this path; the 3224 * `!single` branch is the rv64 32-bit-in-64-bit-register sequence. */ 3225 int single = is64 || v->xlen == 32u; 3226 u32 sh = is64 ? 
63u : 31u; 3227 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 3228 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 3229 if (single) { 3230 rv64_emit32(mc, rv_mul(RV_TMP2, ra, rb)); 3231 rv64_emit32(mc, rv_mulh(RV_TMP3, ra, rb)); 3232 rv64_emit32(mc, rv_srai(rovf, RV_TMP2, sh)); 3233 rv64_emit32(mc, rv_xor(rovf, RV_TMP3, rovf)); 3234 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 3235 rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0)); 3236 } else { 3237 rv64_emit32(mc, rv_addiw(RV_TMP2, ra, 0)); 3238 rv64_emit32(mc, rv_addiw(RV_TMP3, rb, 0)); 3239 rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3)); 3240 rv64_emit32(mc, rv_addiw(RV_TMP3, RV_TMP2, 0)); 3241 rv64_emit32(mc, rv_xor(rovf, RV_TMP2, RV_TMP3)); 3242 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 3243 rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0)); 3244 } 3245 return; 3246 } 3247 case INTRIN_UMUL_OVERFLOW: { 3248 int is64 = rv_is_64(t, dsts[0].type); 3249 /* `single`: native-width product, overflow = (high word != 0) via MULHU. 3250 * rv64 i64 and any rv32 value take this path; `!single` is the rv64 3251 * 32-bit-in-64-bit-register sequence. 
*/ 3252 int single = is64 || v->xlen == 32u; 3253 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 3254 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 3255 if (single) { 3256 rv64_emit32(mc, rv_mulhu(rovf, ra, rb)); 3257 rv64_emit32(mc, rv_mul(rd, ra, rb)); 3258 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 3259 } else { 3260 rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32)); 3261 rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32)); 3262 rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32)); 3263 rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32)); 3264 rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3)); 3265 rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32)); 3266 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 3267 rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0)); 3268 } 3269 return; 3270 } 3271 case INTRIN_MEMMOVE: { 3272 u32 dr, sr, n; 3273 if (narg != 3u || args[0].kind != NATIVE_LOC_REG || 3274 args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM) 3275 rv_panic(a, "unsupported memory intrinsic operands"); 3276 if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) 3277 rv_panic(a, "unsupported memory intrinsic size"); 3278 dr = loc_reg(args[0]); 3279 sr = loc_reg(args[1]); 3280 n = (u32)args[2].v.imm; 3281 rv_intrin_copy(v, mc, dr, sr, n, /*reverse (overlap-safe)=*/1); 3282 return; 3283 } 3284 case INTRIN_CPU_NOP: 3285 rv64_emit32(mc, rv_nop()); 3286 return; 3287 case INTRIN_CPU_YIELD: 3288 rv64_emit32(mc, rv_pause()); 3289 return; 3290 case INTRIN_ISB: 3291 rv64_emit32(mc, rv_fence_i()); 3292 return; 3293 case INTRIN_DMB: 3294 case INTRIN_DSB: 3295 rv64_emit32(mc, rv_fence_rw_rw()); 3296 return; 3297 case INTRIN_WFI: 3298 rv64_emit32(mc, rv_wfi()); 3299 return; 3300 case INTRIN_FRAME_ADDRESS: 3301 case INTRIN_RETURN_ADDRESS: 3302 /* Walk the s0 frame-record chain. kit's RISC-V prologue anchors s0 at the 3303 * saved pair: [s0] = caller's s0, [s0 + ptr_bytes] = saved ra (this 3304 * frame's return address). 
NOTE: this differs from the psABI's 3305 * ra@s0-8 / fp@s0-16 layout — kit stores the pair at and above s0. A 3306 * function that reads its frame is forced off the frameless-leaf tier 3307 * (see NativeKnownFrameDesc.reads_frame), so s0 is always valid here. The 3308 * level is constant, so the walk unrolls to `level` dependent loads. */ 3309 if (ndst == 1u) { 3310 u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM) 3311 ? (u32)args[0].v.imm 3312 : 0u; 3313 u32 rd = loc_reg(dsts[0]); 3314 rv64_emit32(mc, rv_addi(rd, RV_S0, 0)); /* rd = s0 */ 3315 for (u32 i = 0; i < level; ++i) 3316 rv64_emit32(mc, rv_ld_ptr(v, rd, rd, 0)); /* rd = *(rd) */ 3317 if (kind == INTRIN_RETURN_ADDRESS) 3318 rv64_emit32(mc, rv_ld_ptr(v, rd, rd, (i32)v->ptr_bytes)); 3319 } 3320 return; 3321 default: 3322 break; 3323 } 3324 rv_panic(a, "unsupported compiler intrinsic"); 3325 } 3326 /* ============================ inline asm ============================ */ 3327 3328 _Noreturn static void rv_asm_panic_at(Compiler* c, SrcLoc loc, 3329 const char* msg) { 3330 compiler_panic(c, loc, "rv64 inline asm: %s", msg); 3331 } 3332 _Noreturn static void rv_asm_panic(NativeDirectTarget* d, const char* msg) { 3333 rv_asm_panic_at(d->base.c, d->loc, msg); 3334 } 3335 3336 /* constraint_body / constraint_early / match_index are shared 3337 * (cg/native_asm.h). */ 3338 3339 /* Build a bound register pseudo-operand in the rv64 inline shape. */ 3340 static void rv_asm_bound_reg(Operand* out, KitCgTypeId type, 3341 NativeAllocClass cls, Reg reg) { 3342 memset(out, 0, sizeof *out); 3343 out->kind = RV64_INLINE_OPK_REG; 3344 out->pad[0] = 3345 (cls == NATIVE_REG_FP) ? 
RV64_INLINE_OPCLS_FP : RV64_INLINE_OPCLS_INT; 3346 out->type = type; 3347 out->v.local = (CGLocal)reg; 3348 } 3349 static void rv_asm_bound_mem(Operand* out, KitCgTypeId type, Reg base) { 3350 memset(out, 0, sizeof *out); 3351 out->kind = OPK_INDIRECT; 3352 out->type = type; 3353 out->v.ind.base = (CGLocal)base; 3354 out->v.ind.index = CG_LOCAL_NONE; 3355 out->v.ind.ofs = 0; 3356 } 3357 3358 /* Parse a clobber register name into (class, reg). Returns 0 for the special 3359 * "cc"/"memory" clobbers and panics on an unknown register. RV64 dwarf: int 3360 * x0..x31 = 0..31, fp f0..f31 = 32..63. */ 3361 static int rv_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name, 3362 NativeAllocClass* cls_out, Reg* reg_out) { 3363 Slice s = pool_slice(c->global, name); 3364 char buf[16]; 3365 uint32_t dwarf; 3366 if (!s.s || !s.len) return 0; 3367 if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0; 3368 if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0; 3369 if (s.len >= sizeof buf) rv_asm_panic_at(c, loc, "clobber name is too long"); 3370 memcpy(buf, s.s, s.len); 3371 buf[s.len] = '\0'; 3372 if (rv64_register_index(buf, &dwarf) != 0) 3373 rv_asm_panic_at(c, loc, "unknown clobber register"); 3374 if (dwarf <= 31u) { 3375 *cls_out = NATIVE_REG_INT; 3376 *reg_out = (Reg)dwarf; 3377 return 1; 3378 } 3379 if (dwarf >= 32u && dwarf <= 63u) { 3380 *cls_out = NATIVE_REG_FP; 3381 *reg_out = (Reg)(dwarf - 32u); 3382 return 1; 3383 } 3384 rv_asm_panic_at(c, loc, "unsupported clobber register"); 3385 return 0; 3386 } 3387 3388 static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, 3389 u32 nclob, u32* int_mask, u32* fp_mask) { 3390 u32 i; 3391 *int_mask = 0; 3392 *fp_mask = 0; 3393 for (i = 0; i < nclob; ++i) { 3394 NativeAllocClass cls; 3395 Reg reg; 3396 if (!rv_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, ®)) continue; 3397 if (cls == NATIVE_REG_INT) 3398 *int_mask |= 1u << reg; 3399 else 3400 *fp_mask |= 1u << reg; 3401 } 3402 } 3403 3404 
static int rv_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg, 3405 const char* constraint, 3406 NativeAsmRegPin* pin) { 3407 NativeAsmRegPinStatus st = 3408 native_asm_resolve_pin(d->native, reg, constraint, pin); 3409 if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0; 3410 if (st != NATIVE_ASM_REG_PIN_OK) 3411 rv_asm_panic(d, native_asm_pin_status_message(st)); 3412 return 1; 3413 } 3414 3415 /* Pick a free register from the arch's caller-saved allocable pools for an 3416 * asm operand the direct path must self-allocate. */ 3417 static Reg rv_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls, 3418 u32 allowed_mask, u32* used_int, u32* used_fp) { 3419 /* int: a0..a7 (10..17) then t-temps that aren't emit scratch. */ 3420 static const Reg int_pool[] = {10u, 11u, 12u, 13u, 14u, 15u, 3421 16u, 17u, 29u, 30u, 31u}; 3422 /* fp: fa0..fa7 (10..17) then ft caller-saved. */ 3423 static const Reg fp_pool[] = {10u, 11u, 12u, 13u, 14u, 15u, 16u, 17u, 3424 4u, 5u, 6u, 7u, 28u, 29u, 30u, 31u}; 3425 const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool; 3426 u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0]) 3427 : (u32)(sizeof int_pool / sizeof int_pool[0]); 3428 u32* used = cls == NATIVE_REG_FP ? used_fp : used_int; 3429 u32 i; 3430 for (i = 0; i < n; ++i) { 3431 Reg r = pool[i]; 3432 if (allowed_mask && (allowed_mask & (1u << r)) == 0) continue; 3433 if ((*used & (1u << r)) != 0) continue; 3434 *used |= 1u << r; 3435 return r; 3436 } 3437 rv_asm_panic(d, "out of registers for asm operands"); 3438 return REG_NONE; 3439 } 3440 3441 /* Direct (-O0) path: resolve a semantic Operand to a NativeAddr. 
*/ 3442 static NativeAddr rv_direct_addr(NativeDirectTarget* d, Operand op) { 3443 NativeAddr addr; 3444 memset(&addr, 0, sizeof addr); 3445 switch ((OpKind)op.kind) { 3446 case OPK_LOCAL: 3447 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3448 addr.base.frame = d->locals[op.v.local - 1u].home; 3449 addr.base_type = op.type; 3450 return addr; 3451 case OPK_INDIRECT: 3452 addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE; 3453 addr.base.frame = d->locals[op.v.ind.base - 1u].home; 3454 addr.cls = d->locals[op.v.ind.base - 1u].cls; 3455 addr.base_type = d->locals[op.v.ind.base - 1u].type; 3456 addr.offset = op.v.ind.ofs; 3457 return addr; 3458 default: 3459 rv_asm_panic(d, "operand is not addressable"); 3460 } 3461 } 3462 3463 /* Materialize an OPK_INDIRECT (frame-value) base into a register, returning a 3464 * plain register-based NativeAddr. */ 3465 static NativeAddr rv_direct_materialize_addr(NativeDirectTarget* d, 3466 Operand op) { 3467 RvNativeTarget* a = rv_of(d->native); 3468 NativeAddr addr = rv_direct_addr(d, op); 3469 if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 3470 NativeLoc base = native_loc_reg(addr.base_type, NATIVE_REG_INT, RV_TMP1); 3471 NativeAddr load; 3472 memset(&load, 0, sizeof load); 3473 load.base_kind = NATIVE_ADDR_BASE_FRAME; 3474 load.base.frame = addr.base.frame; 3475 load.base_type = addr.base_type; 3476 rv_emit_mem(a, 1, base, load, 3477 native_mem_for_type(d->native, addr.base_type, 8)); 3478 addr.base_kind = NATIVE_ADDR_BASE_REG; 3479 addr.base.reg = RV_TMP1; 3480 } 3481 return addr; 3482 } 3483 3484 static void rv_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op, 3485 NativeLoc dst) { 3486 RvNativeTarget* a = rv_of(d->native); 3487 NativeAddr addr; 3488 memset(&addr, 0, sizeof addr); 3489 switch ((OpKind)op.kind) { 3490 case OPK_IMM: 3491 if ((NativeAllocClass)dst.cls != NATIVE_REG_INT) 3492 rv_asm_panic(d, "floating-point immediate asm input is unsupported"); 3493 d->native->load_imm(d->native, dst, op.v.imm); 3494 
return; 3495 case OPK_LOCAL: 3496 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3497 addr.base.frame = d->locals[op.v.local - 1u].home; 3498 addr.base_type = op.type; 3499 rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0)); 3500 return; 3501 case OPK_GLOBAL: 3502 addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; 3503 addr.base.global.sym = op.v.global.sym; 3504 addr.base.global.addend = op.v.global.addend; 3505 addr.base_type = op.type; 3506 d->native->load_addr(d->native, dst, addr); 3507 return; 3508 case OPK_INDIRECT: 3509 addr = rv_direct_materialize_addr(d, op); 3510 rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0)); 3511 return; 3512 } 3513 rv_asm_panic(d, "unsupported asm input operand"); 3514 } 3515 3516 static void rv_direct_load_address_to_reg(NativeDirectTarget* d, Operand op, 3517 NativeLoc dst) { 3518 d->native->load_addr(d->native, dst, rv_direct_addr(d, op)); 3519 } 3520 3521 static void rv_direct_store_reg_to_operand(NativeDirectTarget* d, Operand op, 3522 NativeLoc src) { 3523 RvNativeTarget* a = rv_of(d->native); 3524 NativeAddr addr; 3525 memset(&addr, 0, sizeof addr); 3526 if (op.kind == OPK_LOCAL) { 3527 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3528 addr.base.frame = d->locals[op.v.local - 1u].home; 3529 addr.base_type = op.type; 3530 } else { 3531 addr = rv_direct_materialize_addr(d, op); 3532 } 3533 rv_emit_mem(a, 0, src, addr, native_mem_for_type(d->native, op.type, 0)); 3534 } 3535 3536 /* Callee-saved registers an asm block clobbers must be spilled/restored around 3537 * the block (the only ABI duty the allocator cannot discharge itself). */ 3538 typedef struct RvAsmSavedClobber { 3539 NativeFrameSlot slot; 3540 NativeAllocClass cls; 3541 Reg reg; 3542 KitCgTypeId type; 3543 } RvAsmSavedClobber; 3544 3545 /* A clobber save slot is register-width: ptr_bytes for an integer reg (4 on 3546 * rv32, 8 on rv64) but always 8 for an FP reg (fsd, even on rv32d). 
*/ 3547 static u32 rv_asm_save_bytes(const RvNativeTarget* a, const RvAsmSavedClobber* s) { 3548 return s->cls == NATIVE_REG_FP ? 8u : a->variant->ptr_bytes; 3549 } 3550 static void rv_asm_save_one(RvNativeTarget* a, RvAsmSavedClobber* s) { 3551 NativeFrameSlotDesc desc; 3552 NativeAddr addr; 3553 u32 sz = rv_asm_save_bytes(a, s); 3554 memset(&desc, 0, sizeof desc); 3555 desc.type = s->type; 3556 desc.size = sz; 3557 desc.align = sz; 3558 desc.kind = NATIVE_FRAME_SLOT_SAVE; 3559 s->slot = a->base.frame_slot(&a->base, &desc); 3560 memset(&addr, 0, sizeof addr); 3561 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3562 addr.base.frame = s->slot; 3563 addr.base_type = s->type; 3564 rv_emit_mem(a, 0, native_loc_reg(s->type, s->cls, s->reg), addr, 3565 native_mem_for_type(&a->base, s->type, sz)); 3566 } 3567 static void rv_asm_restore_one(RvNativeTarget* a, const RvAsmSavedClobber* s) { 3568 NativeAddr addr; 3569 u32 sz = rv_asm_save_bytes(a, s); 3570 memset(&addr, 0, sizeof addr); 3571 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3572 addr.base.frame = s->slot; 3573 addr.base_type = s->type; 3574 rv_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr, 3575 native_mem_for_type(&a->base, s->type, sz)); 3576 } 3577 3578 /* psABI callee-saved: integer s0..s11 (x8,x9,x18..x27), fp fs0..fs11 3579 * (f8,f9,f18..f27). x8 is the frame pointer and never asm-clobbered. 
*/ 3580 static int rv_reg_is_callee_int(Reg r) { 3581 return r == 9u || (r >= 18u && r <= 27u); 3582 } 3583 static int rv_reg_is_callee_fp(Reg r) { 3584 return r == 8u || r == 9u || (r >= 18u && r <= 27u); 3585 } 3586 3587 static RvAsmSavedClobber* rv_asm_save_callee_clobbers(RvNativeTarget* a, 3588 u32 int_mask, u32 fp_mask, 3589 u32* nsaved_out) { 3590 RvAsmSavedClobber* saved = 3591 arena_zarray(a->base.c->tu, RvAsmSavedClobber, 24u); 3592 KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64); 3593 KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64); 3594 u32 n = 0; 3595 Reg r; 3596 for (r = 0; r <= 31u; ++r) { 3597 if ((int_mask & (1u << r)) == 0 || !rv_reg_is_callee_int(r)) continue; 3598 saved[n].cls = NATIVE_REG_INT; 3599 saved[n].reg = r; 3600 saved[n].type = i64; 3601 rv_asm_save_one(a, &saved[n++]); 3602 } 3603 for (r = 0; r <= 31u; ++r) { 3604 if ((fp_mask & (1u << r)) == 0 || !rv_reg_is_callee_fp(r)) continue; 3605 saved[n].cls = NATIVE_REG_FP; 3606 saved[n].reg = r; 3607 saved[n].type = f64; 3608 rv_asm_save_one(a, &saved[n++]); 3609 } 3610 *nsaved_out = n; 3611 return saved; 3612 } 3613 3614 /* ---- NativeTarget (optimizer) asm hook ---- 3615 * The optimizer pre-allocated every operand register and arranged surrounding 3616 * data flow, so this binds pre-allocated registers to the template and only 3617 * materializes memory-operand bases into the reserved scratch + spills the 3618 * callee-saved registers the asm clobbers. 
*/ 3619 3620 static NativeAddr rv_asm_loc_to_addr(RvNativeTarget* a, SrcLoc loc, 3621 NativeLoc src) { 3622 NativeAddr addr; 3623 memset(&addr, 0, sizeof addr); 3624 addr.base_type = src.type; 3625 switch ((NativeLocKind)src.kind) { 3626 case NATIVE_LOC_FRAME: 3627 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3628 addr.base.frame = src.v.frame; 3629 return addr; 3630 case NATIVE_LOC_ADDR: 3631 return src.v.addr; 3632 case NATIVE_LOC_GLOBAL: 3633 addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; 3634 addr.base.global.sym = src.v.global.sym; 3635 addr.base.global.addend = src.v.global.addend; 3636 return addr; 3637 case NATIVE_LOC_REG: 3638 addr.base_kind = NATIVE_ADDR_BASE_REG; 3639 addr.cls = NATIVE_REG_INT; 3640 addr.base.reg = src.v.reg; 3641 return addr; 3642 default: 3643 rv_asm_panic_at(a->base.c, loc, "unsupported memory asm operand"); 3644 } 3645 } 3646 3647 /* Resolve a memory-constraint operand to a single base register with zero 3648 * offset, folding any frame/global/offset into a reserved scratch register. */ 3649 static Reg rv_asm_native_mem_base(RvNativeTarget* a, SrcLoc loc, NativeLoc src, 3650 u32* ntmp) { 3651 NativeAddr addr = rv_asm_loc_to_addr(a, loc, src); 3652 u32 base; 3653 i32 off; 3654 Reg dst; 3655 if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) 3656 rv_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported"); 3657 rv_resolve_mem_addr(a, &addr, &base, &off); 3658 if (off == 0 && base != RV_TMP0 && base != RV_TMP1) return (Reg)base; 3659 if (*ntmp >= 2u) 3660 rv_asm_panic_at(a->base.c, loc, "too many memory asm operands"); 3661 dst = (*ntmp == 0u) ? 
RV_TMP0 : RV_TMP1; 3662 (*ntmp)++; 3663 rv_emit_addr_adjust(a->variant, a->base.mc, dst, base, off); 3664 return dst; 3665 } 3666 3667 static u32 rv_asm_reg_mem_size(RvNativeTarget* a, NativeAllocClass cls, 3668 KitCgTypeId type) { 3669 u32 sz = native_type_size(&a->base, type); 3670 if (cls == NATIVE_REG_INT && sz > a->variant->ptr_bytes) 3671 sz = a->variant->ptr_bytes; 3672 return sz; 3673 } 3674 3675 static Reg rv_asm_stage_reg(RvNativeTarget* a, SrcLoc loc, NativeAllocClass cls, 3676 u32* nint, u32* nfp) { 3677 static const Reg int_regs[] = {RV_TMP2, RV_TMP3}; 3678 static const Reg fp_regs[] = {RV_FTMP0, RV_FTMP1}; 3679 if (cls == NATIVE_REG_FP) { 3680 if (*nfp >= (u32)(sizeof fp_regs / sizeof fp_regs[0])) 3681 rv_asm_panic_at(a->base.c, loc, "too many staged fp asm operands"); 3682 return fp_regs[(*nfp)++]; 3683 } 3684 if (*nint >= (u32)(sizeof int_regs / sizeof int_regs[0])) 3685 rv_asm_panic_at(a->base.c, loc, "too many staged integer asm operands"); 3686 return int_regs[(*nint)++]; 3687 } 3688 3689 static void rv_asm_load_loc_to_reg(RvNativeTarget* a, SrcLoc loc, NativeLoc src, 3690 NativeLoc dst) { 3691 NativeTarget* t = &a->base; 3692 NativeAllocClass cls = (NativeAllocClass)dst.cls; 3693 if (src.kind == NATIVE_LOC_REG) { 3694 if (src.v.reg != dst.v.reg || src.cls != dst.cls) t->move(t, dst, src); 3695 return; 3696 } 3697 if (src.kind == NATIVE_LOC_IMM) { 3698 if (cls != NATIVE_REG_INT) 3699 rv_asm_panic_at(t->c, loc, 3700 "floating-point immediate asm input is unsupported"); 3701 t->load_imm(t, dst, src.v.imm); 3702 return; 3703 } 3704 rv_emit_mem(a, 1, dst, rv_asm_loc_to_addr(a, loc, src), 3705 native_mem_for_type(t, dst.type, 3706 rv_asm_reg_mem_size(a, cls, dst.type))); 3707 } 3708 3709 static void rv_asm_store_reg_to_loc(RvNativeTarget* a, SrcLoc loc, NativeLoc dst, 3710 NativeLoc src) { 3711 NativeTarget* t = &a->base; 3712 NativeAllocClass cls = (NativeAllocClass)src.cls; 3713 if (dst.kind == NATIVE_LOC_REG) { 3714 if (dst.v.reg != src.v.reg || 
dst.cls != src.cls) t->move(t, dst, src); 3715 return; 3716 } 3717 rv_emit_mem(a, 0, src, rv_asm_loc_to_addr(a, loc, dst), 3718 native_mem_for_type(t, src.type, 3719 rv_asm_reg_mem_size(a, cls, src.type))); 3720 } 3721 3722 static void rv_asm_bind_native(RvNativeTarget* a, SrcLoc loc, Operand* out, 3723 const char* constraint, KitCgTypeId type, 3724 NativeLoc src, u32* ntmp) { 3725 const char* body = native_asm_constraint_body(constraint); 3726 NativeAsmConstraintInfo info; 3727 if (native_asm_constraint_reg_info(&a->base, constraint, &info)) { 3728 if (src.kind != NATIVE_LOC_REG) 3729 rv_asm_panic_at(a->base.c, loc, "register asm operand not in a register"); 3730 if (info.fixed_reg != REG_NONE && info.fixed_reg != (Reg)src.v.reg) 3731 rv_asm_panic_at(a->base.c, loc, 3732 "fixed-register asm operand in wrong register"); 3733 if (info.allowed_mask && 3734 ((Reg)src.v.reg >= 32 || 3735 (info.allowed_mask & (1u << (Reg)src.v.reg)) == 0)) 3736 rv_asm_panic_at(a->base.c, loc, 3737 "register asm operand violates constraint register set"); 3738 rv_asm_bound_reg(out, type, info.cls, (Reg)src.v.reg); 3739 } else if (body[0] == 'i') { 3740 if (src.kind != NATIVE_LOC_IMM) 3741 rv_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate"); 3742 memset(out, 0, sizeof *out); 3743 out->kind = OPK_IMM; 3744 out->type = type; 3745 out->v.imm = src.v.imm; 3746 } else if (body[0] == 'm') { 3747 rv_asm_bound_mem(out, type, rv_asm_native_mem_base(a, loc, src, ntmp)); 3748 } else { 3749 rv_asm_panic_at(a->base.c, loc, "unsupported asm constraint"); 3750 } 3751 } 3752 3753 static void rv_asm_block_native(NativeTarget* t, const char* tmpl, 3754 const AsmConstraint* outs, u32 nout, 3755 NativeLoc* out_locs, const AsmConstraint* ins, 3756 u32 nin, const NativeLoc* in_locs, 3757 const Sym* clobbers, u32 nclob) { 3758 RvNativeTarget* a = rv_of(t); 3759 Compiler* c = t->c; 3760 SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; 3761 Operand* bound_outs = nout ? 
arena_zarray(c->tu, Operand, nout) : NULL; 3762 Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; 3763 u8* staged_outs = nout ? arena_zarray(c->tu, u8, nout) : NULL; 3764 u32 ntmp = 0, nstage_int = 0, nstage_fp = 0, i; 3765 Rv64Asm* asmh; 3766 3767 for (i = 0; i < nout; ++i) { 3768 KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type; 3769 NativeLoc outloc = out_locs[i]; 3770 NativeAsmConstraintInfo info; 3771 NativeAsmPinnedLoc pinned = 3772 native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc); 3773 if (pinned.has_pin) { 3774 if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK) 3775 rv_asm_panic_at(c, loc, 3776 native_asm_pin_status_message(pinned.pin_status)); 3777 if (pinned.wrong_reg) 3778 rv_asm_panic_at(c, loc, "hard-register asm operand in wrong register"); 3779 outloc = pinned.loc; 3780 if (pinned.needs_stage) { 3781 staged_outs[i] = 1u; 3782 if (outs[i].dir == KIT_CG_ASM_INOUT) 3783 rv_asm_load_loc_to_reg(a, loc, out_locs[i], outloc); 3784 } 3785 } else if (native_asm_constraint_reg_info(t, outs[i].str, &info) && 3786 info.allowed_mask == 0 && outloc.kind != NATIVE_LOC_REG) { 3787 Reg r = rv_asm_stage_reg(a, loc, info.cls, &nstage_int, &nstage_fp); 3788 outloc = native_loc_reg(type, info.cls, r); 3789 staged_outs[i] = 1u; 3790 if (outs[i].dir == KIT_CG_ASM_INOUT) 3791 rv_asm_load_loc_to_reg(a, loc, out_locs[i], outloc); 3792 } 3793 rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, outloc, &ntmp); 3794 } 3795 for (i = 0; i < nin; ++i) { 3796 const char* body = native_asm_constraint_body(ins[i].str); 3797 int matched = native_asm_match_index(body); 3798 KitCgTypeId type; 3799 NativeLoc inloc; 3800 if (matched >= 0) { 3801 if ((u32)matched >= nout) 3802 rv_asm_panic_at(c, loc, "matching constraint out of range"); 3803 bound_ins[i] = bound_outs[matched]; 3804 continue; 3805 } 3806 type = ins[i].type ? 
ins[i].type : in_locs[i].type; 3807 inloc = in_locs[i]; 3808 { 3809 NativeAsmConstraintInfo info; 3810 NativeAsmPinnedLoc pinned = 3811 native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc); 3812 if (pinned.has_pin) { 3813 if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK) 3814 rv_asm_panic_at(c, loc, 3815 native_asm_pin_status_message(pinned.pin_status)); 3816 if (pinned.wrong_reg) 3817 rv_asm_panic_at(c, loc, 3818 "hard-register asm operand in wrong register"); 3819 inloc = pinned.loc; 3820 if (pinned.needs_stage) 3821 rv_asm_load_loc_to_reg(a, loc, in_locs[i], inloc); 3822 } else if (native_asm_constraint_reg_info(t, ins[i].str, &info) && 3823 info.allowed_mask == 0 && inloc.kind != NATIVE_LOC_REG) { 3824 Reg r = rv_asm_stage_reg(a, loc, info.cls, &nstage_int, &nstage_fp); 3825 inloc = native_loc_reg(type, info.cls, r); 3826 rv_asm_load_loc_to_reg(a, loc, in_locs[i], inloc); 3827 } 3828 } 3829 rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp); 3830 } 3831 3832 /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber 3833 * masks and rv_known_callee_saves folded the callee-saved ones into the 3834 * function's saved set, so the prologue/epilogue already preserve them. */ 3835 asmh = rv64_asm_open(c); 3836 rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, 3837 nclob); 3838 rv64_asm_run_template(asmh, t->mc, tmpl); 3839 rv64_asm_close(asmh); 3840 3841 for (i = 0; i < nout; ++i) { 3842 NativeAllocClass cls; 3843 NativeLoc src; 3844 if (!staged_outs || !staged_outs[i]) continue; 3845 if (bound_outs[i].kind != RV64_INLINE_OPK_REG) continue; 3846 cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP ? NATIVE_REG_FP 3847 : NATIVE_REG_INT; 3848 src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); 3849 rv_asm_store_reg_to_loc(a, loc, out_locs[i], src); 3850 } 3851 } 3852 /* file_scope_asm + finalize are shared (cg/native_asm.h). 
*/ 3853 3854 static void rv_trap(NativeTarget* t) { rv64_emit32(t->mc, rv_ebreak()); } 3855 static void rv_set_loc(NativeTarget* t, SrcLoc loc) { 3856 rv_of(t)->loc = loc; 3857 if (t->mc->set_loc) t->mc->set_loc(t->mc, loc); 3858 } 3859 3860 /* ============================ construction ============================ */ 3861 3862 NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj, 3863 MCEmitter* mc) { 3864 RvNativeTarget* a = arena_znew(c->tu, RvNativeTarget); 3865 NativeTarget* t; 3866 if (!a) return NULL; 3867 t = &a->base; 3868 t->c = c; 3869 t->obj = obj; 3870 t->mc = mc; 3871 a->variant = riscv_variant_for_kind(c->target.arch); 3872 native_frame_init(&a->frame, c); 3873 t->regs = &rv_reg_info; 3874 t->class_for_type = native_class_for_type_fp_le8; 3875 t->imm_legal = rv_imm_legal; 3876 t->addr_legal = rv_addr_legal; 3877 t->func_begin = rv_func_begin; 3878 t->func_begin_known_frame = rv_func_begin_known_frame; 3879 t->note_frame_state = NULL; 3880 /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved 3881 * set; rv_func_begin_known_frame derives the records from the masks. 
*/ 3882 t->reserve_callee_saves = rv_reserve_callee_saves; 3883 t->signature_stack_bytes = rv_signature_stack_bytes; 3884 t->call_stack_bytes = rv_call_stack_bytes; 3885 t->has_store_zero_reg = 1; 3886 t->store_zero_reg = RV_ZERO; 3887 t->func_end = rv_func_end; 3888 t->frame_slot = rv_frame_slot; 3889 t->frame_slot_debug_loc = rv_frame_slot_debug_loc; 3890 t->bind_param = rv_bind_native_param; 3891 t->label_new = rv_label_new; 3892 t->label_place = rv_label_place; 3893 t->jump = rv_jump; 3894 t->cmp_branch = rv_cmp_branch; 3895 t->indirect_branch = rv_indirect_branch; 3896 t->load_label_addr = rv_load_label_addr; 3897 t->move = rv_move; 3898 t->load_imm = rv_load_imm; 3899 t->load_const = rv_load_const; 3900 t->load_addr = rv_load_addr; 3901 t->load = rv_load; 3902 t->store = rv_store; 3903 t->tls_addr_of = rv_tls_addr_of; 3904 t->copy_bytes = rv_copy_bytes; 3905 t->set_bytes = rv_set_bytes; 3906 t->bitfield_load = rv_bitfield_load; 3907 t->bitfield_store = rv_bitfield_store; 3908 t->binop = rv_binop; 3909 t->unop = rv_unop; 3910 t->cmp = rv_cmp; 3911 t->convert = rv_convert; 3912 t->alloca_ = rv_alloca; 3913 t->spill = rv_spill; 3914 t->reload = rv_reload; 3915 t->plan_call = rv_plan_call; 3916 t->emit_call = rv_emit_call; 3917 t->plan_ret = rv_plan_ret; 3918 t->ret = rv_ret; 3919 t->atomic_load = rv_atomic_load; 3920 t->atomic_store = rv_atomic_store; 3921 t->atomic_rmw = rv_atomic_rmw; 3922 t->atomic_cas = rv_atomic_cas; 3923 t->fence = rv_fence; 3924 t->va_start_ = rv_va_start_native; 3925 t->va_arg_ = rv_va_arg_native; 3926 t->va_end_ = rv_va_end_native; 3927 t->va_copy_ = rv_va_copy_native; 3928 t->intrinsic = rv_intrinsic; 3929 t->asm_block = rv_asm_block_native; 3930 t->file_scope_asm = native_file_scope_asm; 3931 t->trap = rv_trap; 3932 t->set_loc = rv_set_loc; 3933 t->finalize = native_finalize; 3934 return t; 3935 } 3936 3937 /* ============================ NativeOps (-O0) ============================ */ 3938 3939 static void 
rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p, 3940 CGLocal local, NativeDirectLocal* l) { 3941 NativeLoc dst; 3942 (void)local; 3943 memset(&dst, 0, sizeof dst); 3944 dst.kind = NATIVE_LOC_FRAME; 3945 dst.type = p->type; 3946 dst.v.frame = l->home; 3947 rv_bind_native_param(d->native, p, dst); 3948 } 3949 3950 /* A sibling call is realizable when its outgoing stack-argument area fits the 3951 * window the caller itself received (so the args land in the caller's incoming 3952 * slots without overflowing into the caller's caller's frame). Register-only 3953 * calls (the common case) always qualify. Mirrors aa64's aa_no_tail. */ 3954 static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) { 3955 RvNativeTarget* a = rv_of(d->native); 3956 NativeCallDesc nd; 3957 NativeLoc* args = NULL; 3958 NativeLoc* results = NULL; 3959 u32 i, stack; 3960 if (a->frame.ncallee_saves) 3961 return "rv64 tail call: callee-saved registers in use"; 3962 memset(&nd, 0, sizeof nd); 3963 u32 nresults = call->result != CG_LOCAL_NONE ? 
1u : 0u; 3964 if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs); 3965 if (nresults) results = arena_zarray(d->base.c->tu, NativeLoc, nresults); 3966 for (i = 0; i < call->nargs; ++i) { 3967 args[i].kind = NATIVE_LOC_FRAME; 3968 args[i].type = d->locals[call->args[i] - 1u].type; 3969 args[i].cls = d->locals[call->args[i] - 1u].cls; 3970 args[i].v.frame = d->locals[call->args[i] - 1u].home; 3971 } 3972 if (nresults) { 3973 results[0].kind = NATIVE_LOC_FRAME; 3974 results[0].type = d->locals[call->result - 1u].type; 3975 results[0].cls = d->locals[call->result - 1u].cls; 3976 results[0].v.frame = d->locals[call->result - 1u].home; 3977 } 3978 nd.fn_type = call->fn_type; 3979 nd.args = args; 3980 nd.results = results; 3981 nd.nargs = call->nargs; 3982 nd.nresults = nresults; 3983 stack = rv_call_stack_size(d->native, &nd); 3984 if (stack > a->incoming_stack_size) 3985 return "rv64 tail call: stack argument area too small"; 3986 return NULL; 3987 } 3988 3989 /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg` 3990 * and return a register-based NativeAddr. An OPK_LOCAL holds the va_list object 3991 * itself, so we take its frame address; an OPK_INDIRECT holds the pointer in 3992 * memory and must be loaded. The va cores use TMP1/TMP2 internally, so `reg` 3993 * must be distinct from those (callers pass TMP0 / TMP3). */ 3994 /* ap_addr is the pointer value &ap (the va_list object's address). For an 3995 * OPK_LOCAL the local HOLDS that pointer, so load its home value; an 3996 * OPK_INDIRECT names *(base+ofs), whose address base+ofs is the pointer. 3997 * Mirrors aa64's aa_direct_pointer_addr. 
*/ 3998 static NativeAddr rv_direct_pointer_addr(NativeDirectTarget* d, Operand op) { 3999 RvNativeTarget* a = rv_of(d->native); 4000 NativeAddr addr; 4001 memset(&addr, 0, sizeof addr); 4002 if (op.kind == OPK_LOCAL) { 4003 NativeLoc base = native_loc_reg(op.type, NATIVE_REG_INT, RV_TMP1); 4004 NativeAddr load; 4005 memset(&load, 0, sizeof load); 4006 load.base_kind = NATIVE_ADDR_BASE_FRAME; 4007 load.base.frame = d->locals[op.v.local - 1u].home; 4008 load.base_type = op.type; 4009 rv_emit_mem(a, 1, base, load, native_mem_for_type(d->native, op.type, 8)); 4010 addr.base_kind = NATIVE_ADDR_BASE_REG; 4011 addr.base.reg = RV_TMP1; 4012 addr.base_type = op.type; 4013 return addr; 4014 } 4015 return rv_direct_materialize_addr(d, op); 4016 } 4017 4018 static NativeAddr rv_direct_va_base(NativeDirectTarget* d, Operand ap_addr, 4019 Reg reg) { 4020 NativeLoc dst = 4021 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); 4022 NativeAddr addr; 4023 d->native->load_addr(d->native, dst, rv_direct_pointer_addr(d, ap_addr)); 4024 memset(&addr, 0, sizeof addr); 4025 addr.base_kind = NATIVE_ADDR_BASE_REG; 4026 addr.cls = NATIVE_REG_INT; 4027 addr.base.reg = reg; 4028 addr.base_type = builtin_id(KIT_CG_BUILTIN_I64); 4029 return addr; 4030 } 4031 4032 static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) { 4033 rv_va_start_core(rv_of(d->native), rv_direct_va_base(d, ap_addr, RV_TMP3)); 4034 } 4035 static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, 4036 KitCgTypeId type) { 4037 RvNativeTarget* a = rv_of(d->native); 4038 NativeAllocClass cls; 4039 /* A value too wide for one GPR (8-byte i64 / soft-double on ilp32) is copied 4040 * straight from the save area into its destination memory. 
*/ 4041 if (rv_va_arg_is_wide(d->native, type)) { 4042 rv_va_arg_wide(a, rv_direct_addr(d, dst), 4043 rv_direct_va_base(d, ap_addr, RV_TMP3), 4044 native_type_size(d->native, type)); 4045 return; 4046 } 4047 /* Float-ABI-aware class: a soft (or wider-than-flen) float is INT-class so 4048 * the va_arg fetch never lands a double in an FP register on rv32. */ 4049 cls = native_class_for_type_fp_le8(d->native, type); 4050 NativeLoc res = native_loc_reg(type, cls, 4051 cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0); 4052 NativeAddr dst_addr; 4053 rv_va_arg_core(a, res, rv_direct_va_base(d, ap_addr, RV_TMP3), type); 4054 /* Store the fetched value back into the semantic destination. */ 4055 dst_addr = rv_direct_addr(d, dst); 4056 if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 4057 NativeLoc base = 4058 native_loc_reg(dst_addr.base_type, NATIVE_REG_INT, RV_TMP1); 4059 NativeAddr load; 4060 memset(&load, 0, sizeof load); 4061 load.base_kind = NATIVE_ADDR_BASE_FRAME; 4062 load.base.frame = dst_addr.base.frame; 4063 load.base_type = dst_addr.base_type; 4064 rv_emit_mem(a, 1, base, load, 4065 native_mem_for_type(d->native, dst_addr.base_type, 8)); 4066 dst_addr.base_kind = NATIVE_ADDR_BASE_REG; 4067 dst_addr.base.reg = RV_TMP1; 4068 } 4069 rv_emit_mem( 4070 a, 0, res, dst_addr, 4071 native_mem_for_type(d->native, type, native_type_size(d->native, type))); 4072 } 4073 static void rv_va_end_(NativeDirectTarget* d, Operand ap_addr) { 4074 (void)d; 4075 (void)ap_addr; 4076 } 4077 static void rv_va_copy_(NativeDirectTarget* d, Operand dst, Operand src) { 4078 RvNativeTarget* a = rv_of(d->native); 4079 NativeAddr src_ap = rv_direct_va_base(d, src, RV_TMP0); 4080 NativeAddr dst_ap = rv_direct_va_base(d, dst, RV_TMP3); 4081 rv_va_copy_core(a, dst_ap, src_ap); 4082 } 4083 4084 static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl, 4085 const AsmConstraint* outs, u32 nout, 4086 Operand* out_ops, const AsmConstraint* ins, 4087 u32 nin, const Operand* in_ops, 
4088 const Sym* clobbers, u32 nclob, 4089 u32 clobber_abi_sets) { 4090 RvNativeTarget* a = rv_of(d->native); 4091 Compiler* c = d->base.c; 4092 Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; 4093 Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; 4094 u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp; 4095 RvAsmSavedClobber* saved; 4096 u32 nsaved, i; 4097 Rv64Asm* asmh; 4098 4099 rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp); 4100 native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp); 4101 clob_int |= abi_int; 4102 clob_fp |= abi_fp; 4103 /* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer 4104 * so the operand allocator never hands them out. */ 4105 used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) | 4106 (1u << RV_GP) | (1u << RV_TP) | (1u << RV_TMP0) | (1u << RV_TMP1) | 4107 (1u << RV_TMP2) | (1u << RV_TMP3) | (1u << RV_S0); 4108 used_fp = 4109 clob_fp | (1u << RV_FTMP0) | (1u << RV_FTMP1) | (1u << 2u) | (1u << 3u); 4110 4111 for (i = 0; i < nout; ++i) { 4112 const char* body = native_asm_constraint_body(outs[i].str); 4113 KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; 4114 NativeAsmRegPin pin; 4115 if (rv_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) { 4116 /* GNU local register variable: pin to the named hard register. */ 4117 if (pin.cls == NATIVE_REG_FP) { 4118 used_fp |= 1u << pin.reg; 4119 clob_fp |= 1u << pin.reg; 4120 } else { 4121 used_int |= 1u << pin.reg; 4122 clob_int |= 1u << pin.reg; 4123 } 4124 rv_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg); 4125 } else { 4126 NativeAsmConstraintInfo info; 4127 if (native_asm_constraint_reg_info(d->native, outs[i].str, &info)) { 4128 Reg reg = info.fixed_reg != REG_NONE 4129 ? 
info.fixed_reg 4130 : rv_asm_alloc_reg(d, info.cls, info.allowed_mask, 4131 &used_int, &used_fp); 4132 if (info.cls == NATIVE_REG_FP) { 4133 used_fp |= 1u << reg; 4134 if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg; 4135 } else { 4136 used_int |= 1u << reg; 4137 if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg; 4138 } 4139 rv_asm_bound_reg(&bound_outs[i], type, info.cls, reg); 4140 } else if (body[0] == 'm') { 4141 Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp); 4142 rv_asm_bound_mem(&bound_outs[i], type, reg); 4143 } else { 4144 rv_asm_panic(d, "unsupported output constraint"); 4145 } 4146 } 4147 } 4148 4149 for (i = 0; i < nin; ++i) { 4150 const char* body = native_asm_constraint_body(ins[i].str); 4151 int matched = native_asm_match_index(body); 4152 KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; 4153 if (matched >= 0) { 4154 if ((u32)matched >= nout) 4155 rv_asm_panic(d, "matching constraint out of range"); 4156 if (native_asm_constraint_early(outs[matched].str)) 4157 rv_asm_panic(d, "matching input names early-clobber output"); 4158 if (bound_outs[matched].kind != RV64_INLINE_OPK_REG) 4159 rv_asm_panic(d, "matching constraint requires register output"); 4160 bound_ins[i] = bound_outs[matched]; 4161 continue; 4162 } 4163 NativeAsmRegPin pin; 4164 if (rv_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) { 4165 /* GNU local register variable: pin to the named hard register. */ 4166 if (pin.cls == NATIVE_REG_FP) { 4167 used_fp |= 1u << pin.reg; 4168 clob_fp |= 1u << pin.reg; 4169 } else { 4170 used_int |= 1u << pin.reg; 4171 clob_int |= 1u << pin.reg; 4172 } 4173 rv_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg); 4174 } else { 4175 NativeAsmConstraintInfo info; 4176 if (native_asm_constraint_reg_info(d->native, ins[i].str, &info)) { 4177 Reg reg = info.fixed_reg != REG_NONE 4178 ? 
info.fixed_reg 4179 : rv_asm_alloc_reg(d, info.cls, info.allowed_mask, 4180 &used_int, &used_fp); 4181 if (info.cls == NATIVE_REG_FP) { 4182 used_fp |= 1u << reg; 4183 if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg; 4184 } else { 4185 used_int |= 1u << reg; 4186 if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg; 4187 } 4188 rv_asm_bound_reg(&bound_ins[i], type, info.cls, reg); 4189 } else if (body[0] == 'i') { 4190 if (in_ops[i].kind != OPK_IMM) 4191 rv_asm_panic(d, "immediate constraint requires immediate operand"); 4192 bound_ins[i] = in_ops[i]; 4193 } else if (body[0] == 'm') { 4194 Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp); 4195 rv_asm_bound_mem(&bound_ins[i], type, reg); 4196 } else { 4197 rv_asm_panic(d, "unsupported input constraint"); 4198 } 4199 } 4200 } 4201 4202 saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); 4203 for (i = 0; i < nout; ++i) { 4204 if (bound_outs[i].kind == RV64_INLINE_OPK_REG) { 4205 NativeAllocClass cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP 4206 ? NATIVE_REG_FP 4207 : NATIVE_REG_INT; 4208 if (outs[i].dir == KIT_CG_ASM_INOUT) { 4209 rv_direct_load_operand_to_reg( 4210 d, out_ops[i], 4211 native_loc_reg(bound_outs[i].type, cls, 4212 (Reg)bound_outs[i].v.local)); 4213 } 4214 } else if (bound_outs[i].kind == OPK_INDIRECT) { 4215 NativeLoc loc = 4216 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 4217 (Reg)bound_outs[i].v.ind.base); 4218 rv_direct_load_address_to_reg(d, out_ops[i], loc); 4219 } 4220 } 4221 for (i = 0; i < nin; ++i) { 4222 if (bound_ins[i].kind == RV64_INLINE_OPK_REG) { 4223 NativeAllocClass cls = bound_ins[i].pad[0] == RV64_INLINE_OPCLS_FP 4224 ? 
NATIVE_REG_FP 4225 : NATIVE_REG_INT; 4226 rv_direct_load_operand_to_reg( 4227 d, in_ops[i], 4228 native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local)); 4229 } else if (bound_ins[i].kind == OPK_INDIRECT) { 4230 NativeLoc loc = 4231 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 4232 (Reg)bound_ins[i].v.ind.base); 4233 rv_direct_load_address_to_reg(d, in_ops[i], loc); 4234 } 4235 } 4236 asmh = rv64_asm_open(c); 4237 rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, 4238 nclob); 4239 rv64_asm_run_template(asmh, d->native->mc, tmpl); 4240 rv64_asm_close(asmh); 4241 4242 for (i = 0; i < nout; ++i) { 4243 NativeAllocClass cls; 4244 NativeLoc src; 4245 if (bound_outs[i].kind != RV64_INLINE_OPK_REG) continue; 4246 cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP ? NATIVE_REG_FP 4247 : NATIVE_REG_INT; 4248 src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); 4249 rv_direct_store_reg_to_operand(d, out_ops[i], src); 4250 } 4251 for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]); 4252 } 4253 4254 static const NativeOps rv_direct_ops = { 4255 .bind_param = rv_bind_param, 4256 .tail_call_unrealizable_reason = rv_no_tail, 4257 .va_start_ = rv_va_start_, 4258 .va_arg_ = rv_va_arg_, 4259 .va_end_ = rv_va_end_, 4260 .va_copy_ = rv_va_copy_, 4261 .asm_block = rv_direct_asm_block, 4262 }; 4263 4264 const NativeOps* rv64_native_direct_ops(void) { return &rv_direct_ops; }