native.c (147781B)
1 /* src/arch/rv64/native.c โ RISC-V (RV64GC, LP64D) NativeTarget implementation. 2 * 3 * Mirrors the aa64 reference (src/arch/aa64/native.c): a physical-emission 4 * NativeTarget driven at -O0 by the shared NativeDirectTarget and at -O1+ by 5 * the optimizer emit path. ABI decisions go through the abi/ interface; this 6 * file owns only ISA emission and the RV64 frame layout. 7 * 8 * Frame model (single, top-record): s0 (x8) is the frame pointer anchored at 9 * the saved s0/ra pair; slots live below s0 at positive byte offsets `off` 10 * (address = s0 - off); outgoing args sit at the bottom of the frame (sp+0..). 11 * frame_size = align16(16 + cum_off + max_outgoing + va_save_sz) 12 * fp_pair_off = frame_size - 16 - va_save_sz (saved pair, sp-relative) 13 * CFA = s0 + (frame_size - fp_pair_off) 14 * RISC-V has no condition flags: comparisons materialize a 0/1 via SLT/SLTU or 15 * FLT/FLE; branches compare two registers directly. x0 is a hardware zero. */ 16 17 #include <string.h> 18 19 #include "abi/abi.h" 20 #include "arch/rv64/asm.h" 21 #include "arch/rv64/isa.h" 22 #include "arch/rv64/regs.h" 23 #include "arch/rv64/rv64.h" 24 #include "asm/asm.h" 25 #include "asm/asm_lex.h" 26 #include "cg/native_argmove.h" 27 #include "cg/native_asm.h" 28 #include "cg/native_direct_target.h" 29 #include "cg/native_frame.h" 30 #include "cg/type.h" 31 #include "core/arena.h" 32 #include "core/bytes.h" 33 #include "core/pool.h" 34 #include "core/slice.h" 35 #include "obj/obj.h" 36 37 enum { 38 RV_TMP0 = 5u, /* t0: emit-internal scratch (reserved, never allocable) */ 39 RV_TMP1 = 6u, /* t1: emit-internal scratch */ 40 RV_TMP2 = 7u, /* t2: emit-internal scratch (reserved in phys table) */ 41 RV_TMP3 = 28u, /* t3: emit-internal scratch (reserved in phys table) */ 42 RV_FTMP0 = 0u, /* ft0: emit-internal FP scratch */ 43 RV_FTMP1 = 1u, /* ft1: emit-internal FP scratch */ 44 RV_FA0 = 10u, /* fa0..fa7 = f10..f17 (FP arg/return registers) */ 45 RV_FA7 = 17u, 46 /* Single-pass (-O0) worst-case prologue: sp adjust (3) + far save pair (7) 47 * + sret spill (1) + variadic GP spills (8). No callee-saves at -O0. */ 48 RV_PROLOGUE_WORDS = 32u, 49 /* Known-frame (-O1) prologues are emitted directly, not into the fixed -O0 50 * NOP region, and additionally save callee-saved registers (up to 11 int + 12 51 * fp, each up to 4 words for a far s0-relative offset) on top of the header, 52 * sret, and variadic spills. Size the build buffer for the worst case. */ 53 RV_KNOWN_PROLOGUE_WORDS = 192u, 54 RV_FRAME_SAVE_SIZE = 16u, 55 }; 56 57 /* s1..s11 (11) + fs0..fs11 (12); separate int/fp collect arrays use this cap. 58 */ 59 #define RV_MAX_CALLEE_SAVES 16u 60 #define RV_MAX_REG_ARG_MOVES 16u 61 62 extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc); 63 extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs, 64 u32 end_ofs); 65 66 /* ============================ low-level emit ============================ */ 67 68 void rv64_emit32(MCEmitter* mc, u32 word) { 69 u8 b[4]; 70 u32 ofs = obj_pos(mc->obj, mc->section_id); 71 wr_u32_le(b, word); 72 mc->emit_bytes(mc, b, sizeof b); 73 if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); 74 } 75 76 void rv64_emit16(MCEmitter* mc, u32 halfword) { 77 u8 b[2]; 78 u32 ofs = obj_pos(mc->obj, mc->section_id); 79 b[0] = (u8)(halfword & 0xff); 80 b[1] = (u8)((halfword >> 8) & 0xff); 81 mc->emit_bytes(mc, b, sizeof b); 82 if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); 83 } 84 85 static void rv_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) { 86 u8 b[4]; 87 wr_u32_le(b, word); 88 obj_patch(obj, sec, off, b, sizeof b); 89 } 90 91 static int fits_i12(i64 v) { return v >= -2048 && v <= 2047; } 92 static int fits_i32(i64 v) { 93 return v >= (i64)(i32)0x80000000 && v <= (i64)(i32)0x7fffffff; 94 } 95 96 static u32 align_up_u32(u32 v, u32 align) { 97 u32 mask = align ? align - 1u : 0u; 98 return (v + mask) & ~mask; 99 } 100 101 static i64 floor_div_4096(i64 v) { 102 if (v >= 0) return v / 4096; 103 return -((-v + 4095) / 4096); 104 } 105 106 static void rv_emit_li32(MCEmitter* mc, u32 rd, i32 imm) { 107 if (imm >= -2048 && imm <= 2047) { 108 rv64_emit32(mc, rv_addi(rd, RV_ZERO, imm)); 109 return; 110 } 111 { 112 i64 hi64 = floor_div_4096((i64)imm + 0x800); 113 i32 hi = (i32)hi64; 114 i32 lo = (i32)((i64)imm - hi64 * 4096); 115 rv64_emit32(mc, rv_lui(rd, (u32)hi & 0xfffffu)); 116 if (lo) rv64_emit32(mc, rv_addiw(rd, rd, lo)); 117 } 118 } 119 120 static i32 sext12(u32 v) { 121 v &= 0xfffu; 122 return (v & 0x800u) ? (i32)v - 4096 : (i32)v; 123 } 124 125 static void rv_emit_li64(MCEmitter* mc, u32 rd, u64 imm) { 126 if (fits_i32((i64)imm)) { 127 rv_emit_li32(mc, rd, (i32)(i64)imm); 128 return; 129 } 130 { 131 i32 lo = sext12((u32)imm); 132 u64 hi = (imm - (u64)(i64)lo) >> 12; 133 rv_emit_li64(mc, rd, hi); 134 rv64_emit32(mc, rv_slli(rd, rd, 12)); 135 if (lo) rv64_emit32(mc, rv_addi(rd, rd, lo)); 136 } 137 } 138 139 /* sf!=0 selects a full 64-bit materialization; sf==0 a 32-bit value. */ 140 static void rv_emit_load_imm(MCEmitter* mc, u32 sf, u32 rd, i64 imm) { 141 if (!sf) { 142 rv_emit_li32(mc, rd, (i32)imm); 143 return; 144 } 145 if (fits_i32(imm)) 146 rv_emit_li32(mc, rd, (i32)imm); 147 else 148 rv_emit_li64(mc, rd, (u64)imm); 149 } 150 151 /* rd = base + off, materializing the offset when it exceeds imm12. Uses RV_TMP1 152 * as scratch for the wide path, so callers must keep RV_TMP1 free. */ 153 static void rv_emit_addr_adjust(MCEmitter* mc, u32 rd, u32 base, i32 off) { 154 if (off == 0) { 155 if (rd != base) rv64_emit32(mc, rv_addi(rd, base, 0)); 156 return; 157 } 158 if (fits_i12(off)) { 159 rv64_emit32(mc, rv_addi(rd, base, off)); 160 return; 161 } 162 rv_emit_load_imm(mc, 1, RV_TMP1, (i64)off); 163 rv64_emit32(mc, rv_add(rd, base, RV_TMP1)); 164 } 165 166 static u32 enc_int_store(u32 nbytes, u32 src, u32 base, i32 off) { 167 switch (nbytes) { 168 case 1: 169 return rv_sb(src, base, off); 170 case 2: 171 return rv_sh(src, base, off); 172 case 4: 173 return rv_sw(src, base, off); 174 default: 175 return rv_sd(src, base, off); 176 } 177 } 178 static u32 enc_int_load(u32 nbytes, int sign_ext, u32 rd, u32 base, i32 off) { 179 switch (nbytes) { 180 case 1: 181 return sign_ext ? rv_lb(rd, base, off) : rv_lbu(rd, base, off); 182 case 2: 183 return sign_ext ? rv_lh(rd, base, off) : rv_lhu(rd, base, off); 184 case 4: 185 return sign_ext ? rv_lw(rd, base, off) : rv_lwu(rd, base, off); 186 default: 187 return rv_ld(rd, base, off); 188 } 189 } 190 191 /* ============================ target state ============================ */ 192 193 /* Frame slots and callee-save records live in the shared NativeFrame 194 * bookkeeping (cg/native_frame.h); these aliases keep the rv64-local spellings. 195 */ 196 typedef NativeFrameSlotEntry RvNativeSlot; 197 typedef NativeFrameCalleeSave RvCalleeSave; 198 199 typedef enum RvPatchKind { RV_PATCH_ALLOCA } RvPatchKind; 200 201 typedef struct RvPatch { 202 u8 kind; /* RvPatchKind */ 203 u32 pos; 204 u32 dst_reg; 205 } RvPatch; 206 207 typedef struct RvNativeTarget { 208 NativeTarget base; 209 SrcLoc loc; 210 const CGFuncDesc* func; 211 212 /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save 213 * set, and the known_frame / has_alloca / frame_final flags. */ 214 NativeFrame frame; 215 u32 frame_size_final; 216 u32 fp_pair_off; 217 u32 minimal_prologue_words; /* known-frame path: exact prologue length, else 0 218 */ 219 220 /* Known-frame (-O1) leaf no-frame tier (aa64's slim_prologue equivalent), 221 * settled in rv_func_begin_known_frame; always 0 on the single-pass path. A 222 * leaf with no callee-saves, no body slots, no outgoing args, no 223 * sret/variadic and register-only params never reads s0 nor clobbers ra, so 224 * it emits NO prologue and a bare `ret` โ the whole frame setup/teardown is 225 * elided. RISC-V has no pre/post-indexed store, so aa64's fp_at_bottom fold 226 * would save zero instructions on a kept frame and is intentionally not 227 * ported (see doc/plan/ARCH.md ยง2); this leaf tier is the rv64 win. */ 228 u8 slim_prologue; 229 230 u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */ 231 u32 next_param_int; 232 u32 next_param_fp; 233 u32 next_param_stack; 234 u8 has_sret; 235 u8 is_variadic; 236 NativeFrameSlot sret_ptr_slot; 237 238 RvPatch* patches; 239 u32 npatches; 240 u32 patches_cap; 241 u32 nalloca; 242 243 u32 func_start; 244 u32 prologue_pos; 245 MCLabel epilogue_label; 246 } RvNativeTarget; 247 248 static RvNativeTarget* rv_of(NativeTarget* t) { return (RvNativeTarget*)t; } 249 250 static _Noreturn void rv_panic(RvNativeTarget* a, const char* msg) { 251 compiler_panic(a->base.c, a->loc, "rv64 native target: %s", msg); 252 } 253 254 static RvNativeSlot* rv_slot_get(RvNativeTarget* a, NativeFrameSlot fs) { 255 return native_frame_slot_at(&a->frame, fs); 256 } 257 258 /* s0-relative byte offset of a frame slot's base (address = s0 + ret). */ 259 static i32 rv_s0_off_slot(const RvNativeSlot* s) { return -(i32)s->off; } 260 261 /* s0-relative byte offset of incoming stack arg at byte_off. Stack args sit 262 * just above the saved pair; the 64-byte variadic GP save area (when present) 263 * is contiguous with them at [s0+16). */ 264 static i32 rv_s0_off_in_arg(const RvNativeTarget* a, u32 byte_off) { 265 u32 base = a->is_variadic ? 16u + 64u : 16u; 266 return (i32)(base + byte_off); 267 } 268 269 static u32 rv_va_save_sz(const RvNativeTarget* a) { 270 /* ABI-derived: the variadic register-save area is gp_reg_count*gp_slot_size 271 * (a0..a7 = 64 bytes for LP64D). Only present in variadic functions. */ 272 return a->is_variadic ? native_frame_va_save_bytes(a->base.c->abi) : 0u; 273 } 274 275 /* Callee-saved registers are homed just below the locals at rv_save_off(), 8 276 * bytes each โ they are NOT frame slots, so the frame size must reserve their 277 * bytes explicitly. Zero at -O0 (no callee-saves are taken). */ 278 static u32 rv_callee_save_bytes(const RvNativeTarget* a) { 279 return a->frame.ncallee_saves * 8u; 280 } 281 282 static u32 rv_frame_size(const RvNativeTarget* a) { 283 u32 raw = RV_FRAME_SAVE_SIZE + a->frame.cum_off + rv_callee_save_bytes(a) + 284 a->frame.max_outgoing + rv_va_save_sz(a); 285 return align_up_u32(raw, 16u); 286 } 287 288 static u32 rv_fp_pair_off(const RvNativeTarget* a, u32 frame_size) { 289 return frame_size - RV_FRAME_SAVE_SIZE - rv_va_save_sz(a); 290 } 291 292 /* ============================ type helpers ============================ */ 293 294 /* Scalar size/align/mem/class/loc constructors are shared in native_target.h 295 * (native_type_size, native_type_align, native_mem_for_type, 296 * native_class_for_type_fp_le8, native_loc_reg, native_loc_stack, 297 * native_loc_is_fp). loc_reg's mask is arch-specific and stays here. */ 298 299 /* A scalar value occupies a 64-bit register when it is pointer-sized or wider, 300 * else it is a 32-bit value (drives ADDW vs ADD selection etc). */ 301 static int rv_is_64(NativeTarget* t, KitCgTypeId type) { 302 return native_type_size(t, type) >= 8u || cg_type_is_ptr(t->c, type); 303 } 304 305 static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; } 306 307 /* ============================ register tables ============================ */ 308 309 #define RV_PHYS_INT_ARG(r, idx) \ 310 {.reg = (r), \ 311 .cls = NATIVE_REG_INT, \ 312 .abi_index = (idx), \ 313 .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ 314 ((idx) < 2u ? NATIVE_REG_RET : 0), \ 315 .spill_cost = 1u, \ 316 .copy_cost = 1u} 317 #define RV_PHYS_INT_CALLER(r) \ 318 {.reg = (r), \ 319 .cls = NATIVE_REG_INT, \ 320 .abi_index = 0xffu, \ 321 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ 322 .spill_cost = 1u, \ 323 .copy_cost = 1u} 324 #define RV_PHYS_INT_CALLEE(r) \ 325 {.reg = (r), \ 326 .cls = NATIVE_REG_INT, \ 327 .abi_index = 0xffu, \ 328 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ 329 .spill_cost = 4u, \ 330 .copy_cost = 1u} 331 #define RV_PHYS_INT_RESERVED(r) \ 332 {.reg = (r), \ 333 .cls = NATIVE_REG_INT, \ 334 .abi_index = 0xffu, \ 335 .flags = NATIVE_REG_RESERVED, \ 336 .spill_cost = 0u, \ 337 .copy_cost = 0u} 338 339 /* t0..t3 (x5,x6,x7,x28) are emit-internal scratch (RV_TMP0..RV_TMP3), reserved 340 * and never handed to the allocator or driver. t4/t5 are the driver scratch 341 * pool (disjoint from the emit temps so a hook can never clobber an operand the 342 * driver parked there). t6 is the lone caller-saved allocable (the -O0 cache's 343 * only caller-saved home); s1..s11 are appended callee-saved, chosen under 344 * pressure (and saved by the optimizer prologue at -O1). */ 345 static const Reg rv_int_allocable[] = {31u, 9u, 18u, 19u, 20u, 21u, 346 22u, 23u, 24u, 25u, 26u, 27u}; 347 static const Reg rv_int_scratch[] = {29u, 30u}; /* t4, t5 */ 348 349 static const NativePhysRegInfo rv_int_phys[] = { 350 RV_PHYS_INT_RESERVED(0u), /* zero */ 351 RV_PHYS_INT_RESERVED(1u), /* ra */ 352 RV_PHYS_INT_RESERVED(2u), /* sp */ 353 RV_PHYS_INT_RESERVED(3u), /* gp */ 354 RV_PHYS_INT_RESERVED(4u), /* tp */ 355 RV_PHYS_INT_RESERVED(5u), /* t0 = TMP0 */ 356 RV_PHYS_INT_RESERVED(6u), /* t1 = TMP1 */ 357 RV_PHYS_INT_RESERVED(7u), /* t2 = TMP2 (emit) */ 358 RV_PHYS_INT_RESERVED(8u), /* s0/fp */ 359 RV_PHYS_INT_CALLEE(9u), /* s1 */ 360 RV_PHYS_INT_ARG(10u, 0u), RV_PHYS_INT_ARG(11u, 1u), 361 RV_PHYS_INT_ARG(12u, 2u), RV_PHYS_INT_ARG(13u, 3u), 362 RV_PHYS_INT_ARG(14u, 4u), RV_PHYS_INT_ARG(15u, 5u), 363 RV_PHYS_INT_ARG(16u, 6u), RV_PHYS_INT_ARG(17u, 7u), 364 RV_PHYS_INT_CALLEE(18u), RV_PHYS_INT_CALLEE(19u), 365 RV_PHYS_INT_CALLEE(20u), RV_PHYS_INT_CALLEE(21u), 366 RV_PHYS_INT_CALLEE(22u), RV_PHYS_INT_CALLEE(23u), 367 RV_PHYS_INT_CALLEE(24u), RV_PHYS_INT_CALLEE(25u), 368 RV_PHYS_INT_CALLEE(26u), RV_PHYS_INT_CALLEE(27u), 369 RV_PHYS_INT_RESERVED(28u), /* t3 = TMP3 (emit) */ 370 RV_PHYS_INT_RESERVED(29u), /* t4 = driver scratch */ 371 RV_PHYS_INT_RESERVED(30u), /* t5 = driver scratch */ 372 RV_PHYS_INT_CALLER(31u), /* t6 = caller-saved allocable */ 373 }; 374 375 #define RV_PHYS_FP_ARG(r, idx) \ 376 {.reg = (r), \ 377 .cls = NATIVE_REG_FP, \ 378 .abi_index = (idx), \ 379 .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \ 380 ((idx) < 2u ? NATIVE_REG_RET : 0), \ 381 .spill_cost = 1u, \ 382 .copy_cost = 1u} 383 #define RV_PHYS_FP_CALLER(r) \ 384 {.reg = (r), \ 385 .cls = NATIVE_REG_FP, \ 386 .abi_index = 0xffu, \ 387 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \ 388 .spill_cost = 1u, \ 389 .copy_cost = 1u} 390 #define RV_PHYS_FP_CALLEE(r) \ 391 {.reg = (r), \ 392 .cls = NATIVE_REG_FP, \ 393 .abi_index = 0xffu, \ 394 .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \ 395 .spill_cost = 4u, \ 396 .copy_cost = 1u} 397 #define RV_PHYS_FP_RESERVED(r) \ 398 {.reg = (r), \ 399 .cls = NATIVE_REG_FP, \ 400 .abi_index = 0xffu, \ 401 .flags = NATIVE_REG_RESERVED, \ 402 .spill_cost = 0u, \ 403 .copy_cost = 0u} 404 405 /* Caller-saved allocable first (ft4..ft7, ft8..ft11), then callee (fs0..fs11). 406 * ft0/ft1 reserved as emit-internal scratch; ft2/ft3 driver scratch. */ 407 static const Reg rv_fp_allocable[] = {4u, 5u, 6u, 7u, 28u, 29u, 30u, 408 31u, 8u, 9u, 18u, 19u, 20u, 21u, 409 22u, 23u, 24u, 25u, 26u, 27u}; 410 static const Reg rv_fp_scratch[] = {2u, 3u}; /* ft2, ft3 */ 411 412 static const NativePhysRegInfo rv_fp_phys[] = { 413 RV_PHYS_FP_RESERVED(0u), /* ft0 = FTMP0 */ 414 RV_PHYS_FP_RESERVED(1u), /* ft1 = FTMP1 */ 415 RV_PHYS_FP_RESERVED(2u), /* ft2 = scratch */ 416 RV_PHYS_FP_RESERVED(3u), /* ft3 = scratch */ 417 RV_PHYS_FP_CALLER(4u), RV_PHYS_FP_CALLER(5u), RV_PHYS_FP_CALLER(6u), 418 RV_PHYS_FP_CALLER(7u), RV_PHYS_FP_CALLEE(8u), RV_PHYS_FP_CALLEE(9u), 419 RV_PHYS_FP_ARG(10u, 0u), RV_PHYS_FP_ARG(11u, 1u), RV_PHYS_FP_ARG(12u, 2u), 420 RV_PHYS_FP_ARG(13u, 3u), RV_PHYS_FP_ARG(14u, 4u), RV_PHYS_FP_ARG(15u, 5u), 421 RV_PHYS_FP_ARG(16u, 6u), RV_PHYS_FP_ARG(17u, 7u), RV_PHYS_FP_CALLEE(18u), 422 RV_PHYS_FP_CALLEE(19u), RV_PHYS_FP_CALLEE(20u), RV_PHYS_FP_CALLEE(21u), 423 RV_PHYS_FP_CALLEE(22u), RV_PHYS_FP_CALLEE(23u), RV_PHYS_FP_CALLEE(24u), 424 RV_PHYS_FP_CALLEE(25u), RV_PHYS_FP_CALLEE(26u), RV_PHYS_FP_CALLEE(27u), 425 RV_PHYS_FP_CALLER(28u), RV_PHYS_FP_CALLER(29u), RV_PHYS_FP_CALLER(30u), 426 RV_PHYS_FP_CALLER(31u), 427 }; 428 429 static const NativeAllocClassInfo rv_classes[] = { 430 {.cls = NATIVE_REG_INT, 431 .allocable = rv_int_allocable, 432 .nallocable = sizeof rv_int_allocable / sizeof rv_int_allocable[0], 433 .scratch = rv_int_scratch, 434 .nscratch = sizeof rv_int_scratch / sizeof rv_int_scratch[0], 435 .phys = rv_int_phys, 436 .nphys = sizeof rv_int_phys / sizeof rv_int_phys[0], 437 /* t0-t6 (5-7,28-31) + a0-a7 (10-17) */ 438 .caller_saved_mask = 0xf00400e0u | 0x0001fc00u, 439 /* s0-s11 (8,9,18-27) */ 440 .callee_saved_mask = 0x0ffc0300u, 441 .arg_mask = 0x0001fc00u, 442 .ret_mask = 0x00000c00u, 443 /* zero,ra,sp,gp,tp,t0,t1,t2,s0 (bits 0-8) + t3 (bit 28). t4/t5 are the 444 * driver scratch pool (reserved-from-alloc but listed in scratch[]). */ 445 .reserved_mask = 0x000001ffu | (1u << 28)}, 446 {.cls = NATIVE_REG_FP, 447 .allocable = rv_fp_allocable, 448 .nallocable = sizeof rv_fp_allocable / sizeof rv_fp_allocable[0], 449 .scratch = rv_fp_scratch, 450 .nscratch = sizeof rv_fp_scratch / sizeof rv_fp_scratch[0], 451 .phys = rv_fp_phys, 452 .nphys = sizeof rv_fp_phys / sizeof rv_fp_phys[0], 453 /* ft0-ft7 (0-7), fa0-fa7 (10-17), ft8-ft11 (28-31) */ 454 .caller_saved_mask = 0xf00400ffu | 0x0001fc00u, 455 /* fs0-fs11 (8,9,18-27) */ 456 .callee_saved_mask = 0x0ffc0300u, 457 .arg_mask = 0x0001fc00u, 458 .ret_mask = 0x00000c00u, 459 .reserved_mask = 0x0000000fu /* ft0-ft3 */}, 460 }; 461 462 /* Resolve a register name ("a7", "fa0", ...) to its (class, Reg). Powers the 463 * optimizer's inline-asm clobber masks and explicit hard-register operands 464 * ("{a7}" from a GNU local register variable). x0..x31 are DWARF 0..31; the 465 * FP bank f0..f31 is DWARF 32..63. Returns non-zero for a non-register name 466 * (cc/memory/unknown), which the caller skips. */ 467 static int rv_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out, 468 NativeAllocClass* cls_out) { 469 char buf[16]; 470 uint32_t dwarf; 471 (void)ri; 472 if (!name.s || !name.len || name.len >= sizeof buf) return 1; 473 memcpy(buf, name.s, name.len); 474 buf[name.len] = '\0'; 475 if (rv64_register_index(buf, &dwarf) != 0) return 1; 476 if (dwarf <= 31u) { 477 *cls_out = NATIVE_REG_INT; 478 *out = (Reg)dwarf; 479 return 0; 480 } 481 if (dwarf >= 32u && dwarf <= 63u) { 482 *cls_out = NATIVE_REG_FP; 483 *out = (Reg)(dwarf - 32u); 484 return 0; 485 } 486 return 1; 487 } 488 489 static int rv_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, 490 Reg reg) { 491 (void)ri; 492 if (cls == NATIVE_REG_INT) { 493 if (reg == 9u) return 1; /* s1 */ 494 if (reg >= 10u && reg <= 17u) return 1; /* a0..a7 */ 495 if (reg >= 18u && reg <= 27u) return 1; /* s2..s11 */ 496 if (reg == 31u) return 1; /* t6 */ 497 return 0; 498 } 499 if (cls == NATIVE_REG_FP) return reg >= 4u && reg <= 31u; 500 return 0; 501 } 502 503 static const NativeRegInfo rv_reg_info = { 504 .classes = rv_classes, 505 .nclasses = sizeof rv_classes / sizeof rv_classes[0], 506 .resolve_name = rv_resolve_name, 507 .asm_operand_reg_ok = rv_asm_operand_reg_ok, 508 }; 509 510 /* ============================ legality ============================ */ 511 512 static int rv_imm_legal(NativeTarget* t, NativeImmUse use, u32 op, 513 KitCgTypeId type, i64 imm) { 514 (void)t; 515 (void)type; 516 switch (use) { 517 case NATIVE_IMM_MOVE: 518 return 1; 519 case NATIVE_IMM_BINOP: 520 switch ((BinOp)op) { 521 case BO_IADD: 522 return fits_i12(imm); 523 case BO_ISUB: 524 return fits_i12(-imm); /* emitted as ADDI with negated imm */ 525 case BO_AND: 526 case BO_OR: 527 case BO_XOR: 528 return fits_i12(imm); 529 case BO_SHL: 530 case BO_SHR_S: 531 case BO_SHR_U: 532 return imm >= 0 && imm <= 63; 533 default: 534 return 0; 535 } 536 case NATIVE_IMM_CMP: 537 return imm == 0; /* compares need both ends in registers (SLT/branch) */ 538 case NATIVE_IMM_ADDR_OFFSET: 539 return fits_i12(imm); 540 } 541 return 0; 542 } 543 544 static int rv_addr_legal(NativeTarget* t, const NativeAddr* addr, 545 MemAccess mem) { 546 (void)t; 547 (void)mem; 548 if (!addr) return 0; 549 if (addr->index_kind != NATIVE_ADDR_INDEX_NONE) return 0; 550 if (addr->base_kind != NATIVE_ADDR_BASE_REG && 551 addr->base_kind != NATIVE_ADDR_BASE_FRAME) 552 return 0; 553 return fits_i12(addr->offset); 554 } 555 556 /* ============================ memory ============================ */ 557 558 /* Materialize the runtime address of a global into `dst`, including addend. */ 559 static void rv_emit_global_addr(RvNativeTarget* a, u32 dst, ObjSymId sym, 560 i64 addend) { 561 NativeTarget* t = &a->base; 562 MCEmitter* mc = t->mc; 563 u32 sec = mc->section_id; 564 if (obj_symbol_extern_via_got(t->c, t->obj, sym)) { 565 u32 ap = mc->pos(mc); 566 rv64_emit32(mc, rv_auipc(dst, 0)); 567 mc->emit_reloc_at(mc, sec, ap, R_RV_GOT_HI20, sym, 0, 0, 0); 568 { 569 Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi")); 570 ObjSymId anchor = 571 obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0); 572 u32 lp = mc->pos(mc); 573 rv64_emit32(mc, rv_ld(dst, dst, 0)); 574 mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0); 575 } 576 } else { 577 u32 ap = mc->pos(mc); 578 rv64_emit32(mc, rv_auipc(dst, 0)); 579 mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0); 580 { 581 Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi")); 582 ObjSymId anchor = 583 obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0); 584 u32 lp = mc->pos(mc); 585 rv64_emit32(mc, rv_addi(dst, dst, 0)); 586 mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0); 587 } 588 } 589 if (addend) rv_emit_addr_adjust(mc, dst, dst, (i32)addend); 590 } 591 592 /* Fold (base_reg << 0) + (index << scale) into RV_TMP0 via Zba. */ 593 static u32 rv_fold_index(RvNativeTarget* a, u32 base, u32 idx, u8 log2_scale) { 594 MCEmitter* mc = a->base.mc; 595 switch (log2_scale) { 596 case 0: 597 rv64_emit32(mc, rv_add(RV_TMP0, base, idx)); 598 break; 599 case 1: 600 rv64_emit32(mc, rv_sh1add(RV_TMP0, idx, base)); 601 break; 602 case 2: 603 rv64_emit32(mc, rv_sh2add(RV_TMP0, idx, base)); 604 break; 605 default: 606 rv64_emit32(mc, rv_sh3add(RV_TMP0, idx, base)); 607 break; 608 } 609 return RV_TMP0; 610 } 611 612 /* Resolve any NativeAddr to a base register + imm12 offset. RISC-V has no 613 * indexed load/store, so an index is folded into RV_TMP0 via Zba; far offsets 614 * and FRAME/FRAME_VALUE/GLOBAL bases are materialized into RV_TMP0/RV_TMP1. */ 615 static void rv_resolve_mem_addr(RvNativeTarget* a, const NativeAddr* addr, 616 u32* base_out, i32* off_out) { 617 MCEmitter* mc = a->base.mc; 618 u32 base; 619 i32 off; 620 switch (addr->base_kind) { 621 case NATIVE_ADDR_BASE_REG: 622 base = addr->base.reg & 0x1fu; 623 off = addr->offset; 624 break; 625 case NATIVE_ADDR_BASE_FRAME: { 626 RvNativeSlot* s = rv_slot_get(a, addr->base.frame); 627 base = RV_S0; 628 off = rv_s0_off_slot(s) + addr->offset; 629 break; 630 } 631 case NATIVE_ADDR_BASE_FRAME_VALUE: { 632 RvNativeSlot* s = rv_slot_get(a, addr->base.frame); 633 rv64_emit32(mc, rv_ld(RV_TMP0, RV_S0, rv_s0_off_slot(s))); 634 base = RV_TMP0; 635 off = addr->offset; 636 break; 637 } 638 case NATIVE_ADDR_BASE_GLOBAL: 639 rv_emit_global_addr(a, RV_TMP0, addr->base.global.sym, 640 addr->base.global.addend); 641 base = RV_TMP0; 642 off = addr->offset; 643 break; 644 default: 645 rv_panic(a, "unsupported address base"); 646 } 647 if (addr->index_kind == NATIVE_ADDR_INDEX_REG) { 648 base = rv_fold_index(a, base, addr->index.reg & 0x1fu, addr->log2_scale); 649 } else if (addr->index_kind == NATIVE_ADDR_INDEX_FRAME_VALUE) { 650 RvNativeSlot* s = rv_slot_get(a, addr->index.frame); 651 rv64_emit32(mc, rv_ld(RV_TMP1, RV_S0, rv_s0_off_slot(s))); 652 base = rv_fold_index(a, base, RV_TMP1, addr->log2_scale); 653 } 654 if (!fits_i12(off)) { 655 rv_emit_load_imm(mc, 1, RV_TMP1, (i64)off); 656 rv64_emit32(mc, rv_add(RV_TMP0, base, RV_TMP1)); 657 base = RV_TMP0; 658 off = 0; 659 } 660 *base_out = base; 661 *off_out = off; 662 } 663 664 /* Central load/store primitive. is_load: 1 load into reg, 0 store reg to mem. 665 */ 666 static void rv_emit_mem(RvNativeTarget* a, int is_load, NativeLoc reg, 667 NativeAddr addr, MemAccess mem) { 668 NativeTarget* t = &a->base; 669 MCEmitter* mc = t->mc; 670 u32 r = loc_reg(reg); 671 int fp = native_loc_is_fp(reg); 672 u32 sz = mem.size ? mem.size : native_type_size(t, reg.type); 673 u32 base; 674 i32 off; 675 676 rv_resolve_mem_addr(a, &addr, &base, &off); 677 if (fp) { 678 rv64_emit32( 679 mc, is_load ? (sz == 8u ? rv_fld(r, base, off) : rv_flw(r, base, off)) 680 : (sz == 8u ? rv_fsd(r, base, off) : rv_fsw(r, base, off))); 681 } else { 682 rv64_emit32(mc, is_load ? enc_int_load(sz, 0, r, base, off) 683 : enc_int_store(sz, r, base, off)); 684 } 685 } 686 687 /* ============================ moves / data ============================ */ 688 689 static void rv_move(NativeTarget* t, NativeLoc dst, NativeLoc src) { 690 MCEmitter* mc = t->mc; 691 int dfp = native_loc_is_fp(dst), sfp = native_loc_is_fp(src); 692 u32 rd = loc_reg(dst), rs = loc_reg(src); 693 if (dfp && sfp) { 694 u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S; 695 if (rd == rs) return; 696 rv64_emit32(mc, rv_fsgnj(fmt, rd, rs, rs)); 697 return; 698 } 699 if (!dfp && sfp) { 700 u32 sz = native_type_size(t, src.type); 701 rv64_emit32(mc, sz == 8u ? rv_fmv_x_d(rd, rs) : rv_fmv_x_w(rd, rs)); 702 return; 703 } 704 if (dfp && !sfp) { 705 u32 sz = native_type_size(t, dst.type); 706 rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(rd, rs) : rv_fmv_w_x(rd, rs)); 707 return; 708 } 709 if (rd == rs) return; 710 rv64_emit32(mc, rv_addi(rd, rs, 0)); 711 } 712 713 static void rv_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) { 714 rv_emit_load_imm(t->mc, rv_is_64(t, dst.type) ? 1u : 0u, loc_reg(dst), imm); 715 } 716 717 static void rv_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) { 718 RvNativeTarget* a = rv_of(t); 719 u64 v = 0; 720 u32 i; 721 if (!native_loc_is_fp(dst)) { 722 for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u); 723 rv_load_imm(t, dst, (i64)v); 724 return; 725 } 726 /* FP constant: materialize the bit pattern in TMP0, bitcast into the FPR. */ 727 for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u); 728 rv_emit_load_imm(t->mc, 1, RV_TMP0, (i64)v); 729 if (cb.size == 8u) 730 rv64_emit32(t->mc, rv_fmv_d_x(loc_reg(dst), RV_TMP0)); 731 else 732 rv64_emit32(t->mc, rv_fmv_w_x(loc_reg(dst), RV_TMP0)); 733 (void)a; 734 } 735 736 static void rv_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) { 737 RvNativeTarget* a = rv_of(t); 738 MCEmitter* mc = t->mc; 739 u32 rd = loc_reg(dst); 740 u32 base; 741 i32 off; 742 if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL) { 743 rv_emit_global_addr(a, rd, addr.base.global.sym, 744 addr.base.global.addend + addr.offset); 745 base = rd; 746 off = 0; 747 } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 748 /* Load the pointer stored in the frame slot, then add the offset. */ 749 RvNativeSlot* s = rv_slot_get(a, addr.base.frame); 750 rv64_emit32(mc, rv_ld(rd, RV_S0, rv_s0_off_slot(s))); 751 base = rd; 752 off = addr.offset; 753 } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME) { 754 RvNativeSlot* s = rv_slot_get(a, addr.base.frame); 755 base = RV_S0; 756 off = rv_s0_off_slot(s) + addr.offset; 757 } else if (addr.base_kind == NATIVE_ADDR_BASE_REG) { 758 base = addr.base.reg & 0x1fu; 759 off = addr.offset; 760 } else { 761 rv_panic(a, "unsupported address base in load_addr"); 762 } 763 /* Fold any index via Zba sh{1,2,3}add (index << scale) + base. */ 764 if (addr.index_kind == NATIVE_ADDR_INDEX_REG) { 765 u32 idx = addr.index.reg & 0x1fu; 766 if (off != 0 || base != rd) rv_emit_addr_adjust(mc, rd, base, off); 767 switch (addr.log2_scale) { 768 case 0: 769 rv64_emit32(mc, rv_add(rd, rd, idx)); 770 break; 771 case 1: 772 rv64_emit32(mc, rv_sh1add(rd, idx, rd)); 773 break; 774 case 2: 775 rv64_emit32(mc, rv_sh2add(rd, idx, rd)); 776 break; 777 default: 778 rv64_emit32(mc, rv_sh3add(rd, idx, rd)); 779 break; 780 } 781 return; 782 } 783 rv_emit_addr_adjust(mc, rd, base, off); 784 } 785 786 static void rv_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, 787 MemAccess mem) { 788 rv_emit_mem(rv_of(t), 1, dst, addr, mem); 789 } 790 static void rv_store(NativeTarget* t, NativeAddr addr, NativeLoc src, 791 MemAccess mem) { 792 rv_emit_mem(rv_of(t), 0, src, addr, mem); 793 } 794 795 /* copy_bytes: resolve dst and src to dedicated pointer regs (RV_TMP3 / RV_TMP0) 796 * once, then copy granule-by-granule advancing both pointers. dst is resolved 797 * first because its base may itself live in RV_TMP1 (the transfer reg, e.g. the 798 * sret pointer from plan_ret); capturing it into RV_TMP3 before src resolution 799 * (which may clobber RV_TMP1 for far offsets) keeps it live. Advancing the 800 * pointers keeps every load/store at offset 0, so no offset ever exceeds imm12 801 * and the transfer reg never aliases a base. */ 802 static void rv_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src, 803 AggregateAccess access) { 804 MCEmitter* mc = t->mc; 805 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 806 u32 rem = access.size; 807 rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst); 808 rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0), src); 809 while (rem) { 810 u32 sz = rem >= 8u ? 8u : rem >= 4u ? 4u : rem >= 2u ? 2u : 1u; 811 rv64_emit32(mc, enc_int_load(sz, 0, RV_TMP1, RV_TMP0, 0)); 812 rv64_emit32(mc, enc_int_store(sz, RV_TMP1, RV_TMP3, 0)); 813 rv64_emit32(mc, rv_addi(RV_TMP0, RV_TMP0, (i32)sz)); 814 rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, (i32)sz)); 815 rem -= sz; 816 } 817 } 818 819 static void rv_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value, 820 AggregateAccess access) { 821 MCEmitter* mc = t->mc; 822 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 823 u32 bv = loc_reg(byte_value); 824 u32 rem = access.size; 825 rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst); 826 while (rem) { 827 rv64_emit32(mc, rv_sb(bv, RV_TMP3, 0)); 828 rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, 1)); 829 rem -= 1u; 830 } 831 } 832 833 /* ============================ arithmetic ============================ */ 834 835 static void rv_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop, 836 NativeLoc bop) { 837 MCEmitter* mc = t->mc; 838 u32 rd = loc_reg(dst); 839 u32 ra = loc_reg(aop); 840 int sf = rv_is_64(t, dst.type); 841 int b_imm = bop.kind == NATIVE_LOC_IMM; 842 u32 rb = b_imm ? 0u : loc_reg(bop); 843 i64 imm = b_imm ? bop.v.imm : 0; 844 845 switch (op) { 846 case BO_FADD: 847 case BO_FSUB: 848 case BO_FMUL: 849 case BO_FDIV: { 850 u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S; 851 switch (op) { 852 case BO_FADD: 853 rv64_emit32(mc, rv_fadd(fmt, rd, ra, rb)); 854 break; 855 case BO_FSUB: 856 rv64_emit32(mc, rv_fsub(fmt, rd, ra, rb)); 857 break; 858 case BO_FMUL: 859 rv64_emit32(mc, rv_fmul(fmt, rd, ra, rb)); 860 break; 861 default: 862 rv64_emit32(mc, rv_fdiv(fmt, rd, ra, rb)); 863 break; 864 } 865 return; 866 } 867 case BO_IADD: 868 if (b_imm) { 869 rv64_emit32( 870 mc, sf ? rv_addi(rd, ra, (i32)imm) : rv_addiw(rd, ra, (i32)imm)); 871 } else { 872 rv64_emit32(mc, sf ? rv_add(rd, ra, rb) : rv_addw(rd, ra, rb)); 873 } 874 return; 875 case BO_ISUB: 876 if (b_imm) { 877 rv64_emit32( 878 mc, sf ? rv_addi(rd, ra, (i32)-imm) : rv_addiw(rd, ra, (i32)-imm)); 879 } else { 880 rv64_emit32(mc, sf ? rv_sub(rd, ra, rb) : rv_subw(rd, ra, rb)); 881 } 882 return; 883 case BO_IMUL: 884 rv64_emit32(mc, sf ? rv_mul(rd, ra, rb) : rv_mulw(rd, ra, rb)); 885 return; 886 case BO_SDIV: 887 rv64_emit32(mc, sf ? rv_div(rd, ra, rb) : rv_divw(rd, ra, rb)); 888 return; 889 case BO_UDIV: 890 rv64_emit32(mc, sf ? rv_divu(rd, ra, rb) : rv_divuw(rd, ra, rb)); 891 return; 892 case BO_SREM: 893 rv64_emit32(mc, sf ? rv_rem(rd, ra, rb) : rv_remw(rd, ra, rb)); 894 return; 895 case BO_UREM: 896 rv64_emit32(mc, sf ? rv_remu(rd, ra, rb) : rv_remuw(rd, ra, rb)); 897 return; 898 case BO_AND: 899 rv64_emit32(mc, b_imm ? rv_andi(rd, ra, (i32)imm) : rv_and(rd, ra, rb)); 900 return; 901 case BO_OR: 902 rv64_emit32(mc, b_imm ? rv_ori(rd, ra, (i32)imm) : rv_or(rd, ra, rb)); 903 return; 904 case BO_XOR: 905 rv64_emit32(mc, b_imm ? rv_xori(rd, ra, (i32)imm) : rv_xor(rd, ra, rb)); 906 return; 907 case BO_SHL: 908 if (b_imm) 909 rv64_emit32(mc, sf ? rv_slli(rd, ra, (u32)imm & 63u) 910 : rv_slliw(rd, ra, (u32)imm & 31u)); 911 else 912 rv64_emit32(mc, sf ? rv_sll(rd, ra, rb) : rv_sllw(rd, ra, rb)); 913 return; 914 case BO_SHR_U: 915 if (b_imm) 916 rv64_emit32(mc, sf ? rv_srli(rd, ra, (u32)imm & 63u) 917 : rv_srliw(rd, ra, (u32)imm & 31u)); 918 else 919 rv64_emit32(mc, sf ? rv_srl(rd, ra, rb) : rv_srlw(rd, ra, rb)); 920 return; 921 case BO_SHR_S: 922 if (b_imm) 923 rv64_emit32(mc, sf ? rv_srai(rd, ra, (u32)imm & 63u) 924 : rv_sraiw(rd, ra, (u32)imm & 31u)); 925 else 926 rv64_emit32(mc, sf ? rv_sra(rd, ra, rb) : rv_sraw(rd, ra, rb)); 927 return; 928 default: 929 rv_panic(rv_of(t), "unsupported binop"); 930 } 931 } 932 933 static void rv_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) { 934 MCEmitter* mc = t->mc; 935 u32 rd = loc_reg(dst), rs = loc_reg(src); 936 int sf = rv_is_64(t, dst.type); 937 switch (op) { 938 case UO_NEG: 939 rv64_emit32(mc, sf ? rv_sub(rd, RV_ZERO, rs) : rv_subw(rd, RV_ZERO, rs)); 940 return; 941 case UO_FNEG: { 942 u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S; 943 rv64_emit32(mc, rv_fsgnjn(fmt, rd, rs, rs)); 944 return; 945 } 946 case UO_BNOT: 947 rv64_emit32(mc, rv_xori(rd, rs, -1)); 948 return; 949 case UO_NOT: 950 rv64_emit32(mc, rv_sltiu(rd, rs, 1)); 951 return; 952 default: 953 rv_panic(rv_of(t), "unsupported unop"); 954 } 955 } 956 957 /* Sign/zero-extend a 32-bit operand into a 64-bit register for comparison. 958 * Returns the register to compare. */ 959 static u32 rv_cmp_ext(NativeTarget* t, int is_signed, NativeLoc op, u32 tmp) { 960 MCEmitter* mc = t->mc; 961 u32 r = loc_reg(op); 962 if (rv_is_64(t, op.type)) return r; 963 if (is_signed) { 964 rv64_emit32(mc, rv_addiw(tmp, r, 0)); /* sign-extend low 32 */ 965 } else { 966 rv64_emit32(mc, rv_slli(tmp, r, 32)); 967 rv64_emit32(mc, rv_srli(tmp, tmp, 32)); 968 } 969 return tmp; 970 } 971 972 static int cmp_is_signed(CmpOp op) { 973 switch (op) { 974 case CMP_LT_U: 975 case CMP_LE_U: 976 case CMP_GT_U: 977 case CMP_GE_U: 978 return 0; 979 default: 980 return 1; 981 } 982 } 983 984 /* Emit a 0/1 comparison result into rd from two integer registers. */ 985 static void rv_emit_icmp(NativeTarget* t, CmpOp op, u32 rd, u32 ra, u32 rb) { 986 MCEmitter* mc = t->mc; 987 switch (op) { 988 case CMP_EQ: 989 rv64_emit32(mc, rv_sub(rd, ra, rb)); 990 rv64_emit32(mc, rv_sltiu(rd, rd, 1)); 991 return; 992 case CMP_NE: 993 rv64_emit32(mc, rv_sub(rd, ra, rb)); 994 rv64_emit32(mc, rv_sltu(rd, RV_ZERO, rd)); 995 return; 996 case CMP_LT_S: 997 rv64_emit32(mc, rv_slt(rd, ra, rb)); 998 return; 999 case CMP_LT_U: 1000 rv64_emit32(mc, rv_sltu(rd, ra, rb)); 1001 return; 1002 case CMP_GT_S: 1003 rv64_emit32(mc, rv_slt(rd, rb, ra)); 1004 return; 1005 case CMP_GT_U: 1006 rv64_emit32(mc, rv_sltu(rd, rb, ra)); 1007 return; 1008 case CMP_GE_S: 1009 rv64_emit32(mc, rv_slt(rd, ra, rb)); 1010 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1011 return; 1012 case CMP_GE_U: 1013 rv64_emit32(mc, rv_sltu(rd, ra, rb)); 1014 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1015 return; 1016 case CMP_LE_S: 1017 rv64_emit32(mc, rv_slt(rd, rb, ra)); 1018 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1019 return; 1020 case CMP_LE_U: 1021 rv64_emit32(mc, rv_sltu(rd, rb, ra)); 1022 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1023 return; 1024 default: 1025 rv_panic(rv_of(t), "unsupported integer cmp"); 1026 } 1027 } 1028 1029 /* Format-dispatching wrappers over the ordered FP compares (feq/flt/fle are 1030 * ordered: they yield 0 on NaN; flt/fle are signaling, raising NV on NaN โ 1031 * pre-existing for ordered ops, and the boolean result is still correct). */ 1032 static u32 rv_feq_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) { 1033 return fmt == RV_FMT_D ? rv_feq_d(rd, ra, rb) : rv_feq_s(rd, ra, rb); 1034 } 1035 static u32 rv_flt_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) { 1036 return fmt == RV_FMT_D ? rv_flt_d(rd, ra, rb) : rv_flt_s(rd, ra, rb); 1037 } 1038 static u32 rv_fle_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) { 1039 return fmt == RV_FMT_D ? rv_fle_d(rd, ra, rb) : rv_fle_s(rd, ra, rb); 1040 } 1041 1042 static void rv_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc aop, 1043 NativeLoc bop) { 1044 MCEmitter* mc = t->mc; 1045 u32 rd = loc_reg(dst); 1046 /* FP-ness is self-describing from the opcode (FP block starts at CMP_OEQ_F). 1047 * Unordered predicates use unordered-R == NOT(ordered-not-R): the ordered 1048 * compare into rd, then `xori rd,rd,1`. ONE/UEQ have no single ordered 1049 * primitive and OR the two strict relations (a<b | a>b) via scratch RV_TMP2 1050 * (x7, reserved & never allocable, so it can't alias rd). */ 1051 if (op >= CMP_OEQ_F) { 1052 u32 fmt = native_type_size(t, aop.type) == 8u ? RV_FMT_D : RV_FMT_S; 1053 u32 ra = loc_reg(aop), rb = loc_reg(bop); 1054 switch (op) { 1055 case CMP_OEQ_F: 1056 rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb)); 1057 return; 1058 case CMP_UNE_F: /* !(OEQ) */ 1059 rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb)); 1060 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1061 return; 1062 case CMP_OLT_F: 1063 rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb)); 1064 return; 1065 case CMP_OLE_F: 1066 rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb)); 1067 return; 1068 case CMP_OGT_F: 1069 rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra)); 1070 return; 1071 case CMP_OGE_F: 1072 rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra)); 1073 return; 1074 case CMP_UGE_F: /* !(OLT) */ 1075 rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb)); 1076 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1077 return; 1078 case CMP_UGT_F: /* !(OLE) */ 1079 rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb)); 1080 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1081 return; 1082 case CMP_ULE_F: /* !(OGT) */ 1083 rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra)); 1084 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1085 return; 1086 case CMP_ULT_F: /* !(OGE) */ 1087 rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra)); 1088 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1089 return; 1090 case CMP_ONE_F: /* ordered & !=: (a<b) | (a>b) */ 1091 rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb)); 1092 rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra)); 1093 rv64_emit32(mc, rv_or(rd, rd, RV_TMP2)); 1094 return; 1095 case CMP_UEQ_F: /* unordered | ==: !((a<b) | (a>b)) */ 1096 rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb)); 1097 rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra)); 1098 rv64_emit32(mc, rv_or(rd, rd, RV_TMP2)); 1099 rv64_emit32(mc, rv_xori(rd, rd, 1)); 1100 return; 1101 default: 1102 rv_panic(rv_of(t), "unsupported fp cmp"); 1103 } 1104 } 1105 { 1106 int sg = cmp_is_signed(op); 1107 u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0); 1108 u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1); 1109 rv_emit_icmp(t, op, rd, ra, rb); 1110 } 1111 } 1112 1113 static void rv_convert(NativeTarget* t, ConvKind op, NativeLoc dst, 1114 NativeLoc src) { 1115 MCEmitter* mc = t->mc; 1116 u32 rd = loc_reg(dst), rs = loc_reg(src); 1117 u32 src_sz = native_type_size(t, src.type); 1118 u32 dst_sz = native_type_size(t, dst.type); 1119 switch (op) { 1120 case CV_SEXT: 1121 if (src_sz >= 4u) { 1122 rv64_emit32(mc, rv_addiw(rd, rs, 0)); 1123 } else { 1124 u32 sh = 64u - src_sz * 8u; 1125 rv64_emit32(mc, rv_slli(rd, rs, sh)); 1126 rv64_emit32(mc, rv_srai(rd, rd, sh)); 1127 } 1128 return; 1129 case CV_ZEXT: { 1130 u32 sh = 64u - src_sz * 8u; 1131 rv64_emit32(mc, rv_slli(rd, rs, sh)); 1132 rv64_emit32(mc, rv_srli(rd, rd, sh)); 1133 return; 1134 } 1135 case CV_TRUNC: 1136 if (rd != rs || dst_sz <= 4u) 1137 rv64_emit32(mc, rv_addi(rd, rs, 0)); /* low bits; users re-narrow */ 1138 return; 1139 case CV_ITOF_S: 1140 if (native_type_size(t, dst.type) == 8u) 1141 rv64_emit32(mc, 1142 src_sz == 8u ? rv_fcvt_d_l(rd, rs) : rv_fcvt_d_w(rd, rs)); 1143 else 1144 rv64_emit32(mc, 1145 src_sz == 8u ? rv_fcvt_s_l(rd, rs) : rv_fcvt_s_w(rd, rs)); 1146 return; 1147 case CV_ITOF_U: 1148 if (native_type_size(t, dst.type) == 8u) 1149 rv64_emit32(mc, 1150 src_sz == 8u ? rv_fcvt_d_lu(rd, rs) : rv_fcvt_d_wu(rd, rs)); 1151 else 1152 rv64_emit32(mc, 1153 src_sz == 8u ? rv_fcvt_s_lu(rd, rs) : rv_fcvt_s_wu(rd, rs)); 1154 return; 1155 case CV_FTOI_S: 1156 if (src_sz == 8u) 1157 rv64_emit32(mc, 1158 dst_sz == 8u ? rv_fcvt_l_d(rd, rs) : rv_fcvt_w_d(rd, rs)); 1159 else 1160 rv64_emit32(mc, 1161 dst_sz == 8u ? rv_fcvt_l_s(rd, rs) : rv_fcvt_w_s(rd, rs)); 1162 return; 1163 case CV_FTOI_U: 1164 if (src_sz == 8u) 1165 rv64_emit32(mc, 1166 dst_sz == 8u ? rv_fcvt_lu_d(rd, rs) : rv_fcvt_wu_d(rd, rs)); 1167 else 1168 rv64_emit32(mc, 1169 dst_sz == 8u ? rv_fcvt_lu_s(rd, rs) : rv_fcvt_wu_s(rd, rs)); 1170 return; 1171 case CV_FEXT: 1172 rv64_emit32(mc, rv_fcvt_d_s(rd, rs)); 1173 return; 1174 case CV_FTRUNC: 1175 rv64_emit32(mc, rv_fcvt_s_d(rd, rs)); 1176 return; 1177 case CV_BITCAST: 1178 rv_move(t, dst, src); 1179 return; 1180 default: 1181 rv_panic(rv_of(t), "unsupported convert"); 1182 } 1183 } 1184 1185 /* ============================ spill / reload ============================ */ 1186 1187 static void rv_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot, 1188 MemAccess mem) { 1189 NativeAddr addr; 1190 memset(&addr, 0, sizeof addr); 1191 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1192 addr.base.frame = slot; 1193 addr.base_type = src.type; 1194 rv_emit_mem(rv_of(t), 0, src, addr, mem); 1195 } 1196 static void rv_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot, 1197 MemAccess mem) { 1198 NativeAddr addr; 1199 memset(&addr, 0, sizeof addr); 1200 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1201 addr.base.frame = slot; 1202 addr.base_type = dst.type; 1203 rv_emit_mem(rv_of(t), 1, dst, addr, mem); 1204 } 1205 1206 /* ============================ control flow ============================ */ 1207 1208 static MCLabel rv_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); } 1209 static void rv_label_place(NativeTarget* t, MCLabel l) { 1210 t->mc->label_place(t->mc, l); 1211 } 1212 static void rv_jump(NativeTarget* t, MCLabel l) { 1213 rv64_emit32(t->mc, rv_jal(RV_ZERO, 0)); 1214 t->mc->emit_label_ref(t->mc, l, R_RV_JAL, 4, 0); 1215 } 1216 1217 static void rv_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc aop, 1218 NativeLoc bop, MCLabel l) { 1219 MCEmitter* mc = t->mc; 1220 /* RISC-V B-type branches reach only ยฑ4 KiB, which a single (especially 1221 * -O0) function can exceed between a branch and its target. Rather than a 1222 * lone conditional branch to the label, emit a short *inverted* branch 1223 * that skips an unconditional `jal` (ยฑ1 MiB) to the target. The inverted 1224 * branch's displacement is the constant SKIP_JAL (skip just the jal) and 1225 * so is always in range; the jal carries the long reach. See rv_jump. */ 1226 enum { SKIP_JAL = 8 }; /* branch over the 4-byte jal that follows it */ 1227 /* FP compares have no register-register branch form: materialize the 0/1 1228 * into TMP0 via rv_cmp (handles all 12 predicates), then branch on nonzero. 1229 */ 1230 if (op >= CMP_OEQ_F) { 1231 NativeLoc tmp = 1232 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0); 1233 rv_cmp(t, op, tmp, aop, bop); 1234 /* Skip the jal when the result is 0 (condition false). */ 1235 rv64_emit32(mc, rv_beq(RV_TMP0, RV_ZERO, SKIP_JAL)); 1236 rv_jump(t, l); 1237 return; 1238 } 1239 { 1240 int sg = cmp_is_signed(op); 1241 u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0); 1242 u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1); 1243 u32 word; 1244 /* Encode the *inverse* of `op`, skipping the jal when NOT taken. */ 1245 switch (op) { 1246 case CMP_EQ: 1247 word = rv_bne(ra, rb, SKIP_JAL); 1248 break; 1249 case CMP_NE: 1250 word = rv_beq(ra, rb, SKIP_JAL); 1251 break; 1252 case CMP_LT_S: 1253 word = rv_bge(ra, rb, SKIP_JAL); 1254 break; 1255 case CMP_GE_S: 1256 word = rv_blt(ra, rb, SKIP_JAL); 1257 break; 1258 case CMP_LT_U: 1259 word = rv_bgeu(ra, rb, SKIP_JAL); 1260 break; 1261 case CMP_GE_U: 1262 word = rv_bltu(ra, rb, SKIP_JAL); 1263 break; 1264 case CMP_GT_S: 1265 word = rv_bge(rb, ra, SKIP_JAL); 1266 break; 1267 case CMP_LE_S: 1268 word = rv_blt(rb, ra, SKIP_JAL); 1269 break; 1270 case CMP_GT_U: 1271 word = rv_bgeu(rb, ra, SKIP_JAL); 1272 break; 1273 case CMP_LE_U: 1274 word = rv_bltu(rb, ra, SKIP_JAL); 1275 break; 1276 default: 1277 rv_panic(rv_of(t), "unsupported cmp_branch"); 1278 } 1279 rv64_emit32(mc, word); 1280 rv_jump(t, l); 1281 } 1282 } 1283 1284 static void rv_indirect_branch(NativeTarget* t, NativeLoc addr, 1285 const MCLabel* valid_targets, u32 ntargets) { 1286 (void)valid_targets; 1287 (void)ntargets; 1288 rv64_emit32(t->mc, rv_jalr(RV_ZERO, loc_reg(addr), 0)); 1289 } 1290 1291 static void rv_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) { 1292 /* `&&label` address-take: auipc/addi with a %pcrel_hi/%pcrel_lo relocation 1293 * pair against the label's per-block local symbol โ the same form 1294 * rv_emit_global_addr uses for a global โ so a compressing/re-encoding 1295 * assembler recomputes the displacement (a baked offset would break under 1296 * the C extension). */ 1297 MCEmitter* mc = t->mc; 1298 u32 rd = loc_reg(dst); 1299 u32 sec = mc->section_id; 1300 ObjSymId sym = mc_label_symbol(mc, l); 1301 u32 ap = mc->pos(mc); 1302 rv64_emit32(mc, rv_auipc(rd, 0)); 1303 mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0); 1304 { 1305 Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi")); 1306 ObjSymId anchor = obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0); 1307 u32 lp = mc->pos(mc); 1308 rv64_emit32(mc, rv_addi(rd, rd, 0)); 1309 mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0); 1310 } 1311 } 1312 1313 /* ============================ frame / lifecycle ============================ 1314 */ 1315 1316 static NativeFrameSlot rv_frame_slot(NativeTarget* t, 1317 const NativeFrameSlotDesc* d) { 1318 return native_frame_slot_alloc(&rv_of(t)->frame, d); 1319 } 1320 1321 static int rv_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot, 1322 CGDebugLoc* out) { 1323 RvNativeTarget* a = rv_of(t); 1324 RvNativeSlot* s; 1325 if (!out) return 0; 1326 memset(out, 0, sizeof *out); 1327 if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0; 1328 s = rv_slot_get(a, slot); 1329 out->kind = CG_DEBUG_LOC_FRAME; 1330 /* rv64 slots are addressed s0/fp-relative (rv_s0_off_slot); the hosted dbg 1331 * snapshot seeds the frame base with s0, matching aa64's FP-relative 1332 * convention. */ 1333 out->v.frame_ofs = rv_s0_off_slot(s); 1334 return 1; 1335 } 1336 1337 static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) { 1338 RvNativeTarget* a = rv_of(t); 1339 MCEmitter* mc = t->mc; 1340 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type); 1341 a->func = fd; 1342 a->loc = fd->loc; 1343 /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing, 1344 * callee-save set, and known_frame/has_alloca/frame_final. */ 1345 native_frame_reset(&a->frame); 1346 a->incoming_stack_size = 0; 1347 a->next_param_int = 0; 1348 a->next_param_fp = 0; 1349 a->next_param_stack = 0; 1350 a->has_sret = (abi && abi->has_sret) ? 1u : 0u; 1351 a->is_variadic = (abi && abi->variadic) ? 1u : 0u; 1352 a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE; 1353 a->npatches = 0; 1354 a->nalloca = 0; 1355 a->minimal_prologue_words = 0; 1356 a->slim_prologue = 0; 1357 1358 mc->set_section(mc, fd->text_section_id); 1359 mc->emit_align(mc, 4, 0); 1360 a->func_start = mc->pos(mc); 1361 mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start); 1362 if (mc->cfi_startproc) mc->cfi_startproc(mc); 1363 a->epilogue_label = mc->label_new(mc); 1364 } 1365 1366 /* sret: reserve a hidden slot for the incoming destination pointer (a0). */ 1367 static void rv_reserve_entry_saves(RvNativeTarget* a) { 1368 NativeTarget* t = &a->base; 1369 if (a->has_sret) { 1370 NativeFrameSlotDesc sd; 1371 memset(&sd, 0, sizeof sd); 1372 sd.type = builtin_id(KIT_CG_BUILTIN_I64); 1373 sd.size = 8; 1374 sd.align = 8; 1375 sd.kind = NATIVE_FRAME_SLOT_SAVE; 1376 a->sret_ptr_slot = t->frame_slot(t, &sd); 1377 a->next_param_int = 1; /* a0 consumed by the sret pointer */ 1378 } 1379 } 1380 1381 static void rv_emit_entry_save_stores(RvNativeTarget* a) { 1382 NativeTarget* t = &a->base; 1383 if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) { 1384 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 1385 NativeAddr addr; 1386 memset(&addr, 0, sizeof addr); 1387 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1388 addr.base.frame = a->sret_ptr_slot; 1389 addr.base_type = i64t; 1390 rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_A0), addr, 1391 native_mem_for_type(t, i64t, 8)); 1392 } 1393 } 1394 1395 /* Collect the callee-saves the body used (none at -O0). */ 1396 static u32 rv_collect_int_saves(RvNativeTarget* a, u32* regs) { 1397 u32 n = 0, i; 1398 for (i = 0; i < a->frame.ncallee_saves; ++i) 1399 if (a->frame.callee_saves[i].cls == NATIVE_REG_INT) 1400 regs[n++] = a->frame.callee_saves[i].reg; 1401 return n; 1402 } 1403 static u32 rv_collect_fp_saves(RvNativeTarget* a, u32* regs) { 1404 u32 n = 0, i; 1405 for (i = 0; i < a->frame.ncallee_saves; ++i) 1406 if (a->frame.callee_saves[i].cls == NATIVE_REG_FP) 1407 regs[n++] = a->frame.callee_saves[i].reg; 1408 return n; 1409 } 1410 1411 /* s0-relative offset of the i-th saved register (saves stack below locals). */ 1412 static i32 rv_save_off(RvNativeTarget* a, u32 idx) { 1413 return -(i32)(a->frame.cum_off) - 8 - 8 * (i32)idx; 1414 } 1415 1416 static void rv_load_s0(MCEmitter* mc, int fp, u32 reg, i32 off) { 1417 if (fits_i12(off)) { 1418 rv64_emit32(mc, fp ? rv_fld(reg, RV_S0, off) : rv_ld(reg, RV_S0, off)); 1419 return; 1420 } 1421 rv_emit_load_imm(mc, 1, RV_TMP0, (i64)off); 1422 rv64_emit32(mc, rv_add(RV_TMP0, RV_S0, RV_TMP0)); 1423 rv64_emit32(mc, fp ? rv_fld(reg, RV_TMP0, 0) : rv_ld(reg, RV_TMP0, 0)); 1424 } 1425 1426 /* Build the prologue instruction sequence into words[]. Returns count. */ 1427 static u32 rv_build_prologue(RvNativeTarget* a, u32* words, u32 cap, 1428 u32 frame_size, u32 fp_pair_off, 1429 const u32* int_regs, u32 n_int, const u32* fp_regs, 1430 u32 n_fp) { 1431 u32 wi = 0; 1432 #define PUSH(w) \ 1433 do { \ 1434 if (wi >= cap) rv_panic(a, "prologue placeholder overflow"); \ 1435 words[wi++] = (w); \ 1436 } while (0) 1437 /* sp -= frame_size */ 1438 if (fits_i12(-(i32)frame_size)) { 1439 PUSH(rv_addi(RV_SP, RV_SP, -(i32)frame_size)); 1440 } else { 1441 i32 neg = -(i32)frame_size; 1442 i32 hi = (i32)(((i64)neg + 0x800) >> 12); 1443 i32 lo = neg - (i32)((u32)hi << 12); 1444 PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu)); 1445 if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo)); 1446 PUSH(rv_add(RV_SP, RV_SP, RV_TMP0)); 1447 } 1448 /* save s0/ra at [sp + fp_pair_off], set s0 = sp + fp_pair_off */ 1449 if (fits_i12((i32)fp_pair_off + 8)) { 1450 PUSH(rv_sd(RV_S0, RV_SP, (i32)fp_pair_off)); 1451 PUSH(rv_sd(RV_RA, RV_SP, (i32)fp_pair_off + 8)); 1452 PUSH(rv_addi(RV_S0, RV_SP, (i32)fp_pair_off)); 1453 } else { 1454 i32 off = (i32)fp_pair_off; 1455 i32 hi = (i32)(((i64)off + 0x800) >> 12); 1456 i32 lo = off - (i32)((u32)hi << 12); 1457 PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu)); 1458 if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo)); 1459 PUSH(rv_add(RV_TMP0, RV_SP, RV_TMP0)); 1460 PUSH(rv_sd(RV_S0, RV_TMP0, 0)); 1461 PUSH(rv_sd(RV_RA, RV_TMP0, 8)); 1462 PUSH(rv_addi(RV_S0, RV_TMP0, 0)); 1463 } 1464 /* sret a0 spill */ 1465 if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) { 1466 RvNativeSlot* s = rv_slot_get(a, a->sret_ptr_slot); 1467 PUSH(rv_sd(RV_A0, RV_S0, rv_s0_off_slot(s))); 1468 } 1469 /* variadic GP save area: spill unconsumed a-regs at [s0 + 16 + i*8] */ 1470 if (a->is_variadic) { 1471 u32 i; 1472 for (i = a->next_param_int; i < 8u; ++i) 1473 PUSH(rv_sd(RV_A0 + i, RV_S0, 16 + (i32)i * 8)); 1474 } 1475 /* callee saves */ 1476 { 1477 u32 i; 1478 for (i = 0; i < n_int; ++i) { 1479 i32 off = rv_save_off(a, i); 1480 if (fits_i12(off)) { 1481 PUSH(rv_sd(int_regs[i], RV_S0, off)); 1482 } else { 1483 /* rare; emitted directly is fine in the known-frame path, but the 1484 * single-pass placeholder must hold these too. Use the wide form. */ 1485 i32 hi = (i32)(((i64)off + 0x800) >> 12); 1486 i32 lo = off - (i32)((u32)hi << 12); 1487 PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu)); 1488 if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo)); 1489 PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0)); 1490 PUSH(rv_sd(int_regs[i], RV_TMP0, 0)); 1491 } 1492 } 1493 for (i = 0; i < n_fp; ++i) { 1494 i32 off = rv_save_off(a, n_int + i); 1495 if (fits_i12(off)) { 1496 PUSH(rv_fsd(fp_regs[i], RV_S0, off)); 1497 } else { 1498 i32 hi = (i32)(((i64)off + 0x800) >> 12); 1499 i32 lo = off - (i32)((u32)hi << 12); 1500 PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu)); 1501 if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo)); 1502 PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0)); 1503 PUSH(rv_fsd(fp_regs[i], RV_TMP0, 0)); 1504 } 1505 } 1506 } 1507 #undef PUSH 1508 return wi; 1509 } 1510 1511 static void rv_func_begin(NativeTarget* t, const CGFuncDesc* fd) { 1512 RvNativeTarget* a = rv_of(t); 1513 MCEmitter* mc = t->mc; 1514 u32 i; 1515 rv_func_begin_common(t, fd); 1516 a->prologue_pos = mc->pos(mc); 1517 for (i = 0; i < RV_PROLOGUE_WORDS; ++i) rv64_emit32(mc, RV_NOP); 1518 rv_reserve_entry_saves(a); 1519 rv_emit_entry_save_stores(a); 1520 } 1521 1522 static void rv_func_end(NativeTarget* t) { 1523 RvNativeTarget* a = rv_of(t); 1524 MCEmitter* mc = t->mc; 1525 ObjBuilder* obj = t->obj; 1526 ObjSecId sec = a->func->text_section_id; 1527 u32 int_regs[16], fp_regs[16]; 1528 u32 n_int = rv_collect_int_saves(a, int_regs); 1529 u32 n_fp = rv_collect_fp_saves(a, fp_regs); 1530 u32 frame_size = rv_frame_size(a); 1531 u32 fp_pair_off = rv_fp_pair_off(a, frame_size); 1532 u32 end; 1533 i32 i; 1534 a->frame_size_final = frame_size; 1535 a->fp_pair_off = fp_pair_off; 1536 1537 /* epilogue */ 1538 mc->label_place(mc, a->epilogue_label); 1539 if (a->slim_prologue) { 1540 /* Frameless leaf: no callee-saves, no s0/ra to reload, sp untouched. */ 1541 rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0)); 1542 } else { 1543 for (i = (i32)n_int - 1; i >= 0; --i) 1544 rv_load_s0(mc, 0, int_regs[i], rv_save_off(a, (u32)i)); 1545 for (i = (i32)n_fp - 1; i >= 0; --i) 1546 rv_load_s0(mc, 1, fp_regs[i], rv_save_off(a, n_int + (u32)i)); 1547 if (a->frame.has_alloca) 1548 rv_emit_addr_adjust(mc, RV_SP, RV_S0, -(i32)fp_pair_off); 1549 rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8)); 1550 rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0)); 1551 /* sp += frame_size */ 1552 if (fits_i12((i32)frame_size)) { 1553 rv64_emit32(mc, rv_addi(RV_SP, RV_SP, (i32)frame_size)); 1554 } else { 1555 rv_emit_load_imm(mc, 1, RV_TMP0, (i64)frame_size); 1556 rv64_emit32(mc, rv_add(RV_SP, RV_SP, RV_TMP0)); 1557 } 1558 rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0)); 1559 } 1560 1561 /* patch prologue */ 1562 if (!a->frame.known_frame) { 1563 u32 words[RV_PROLOGUE_WORDS]; 1564 u32 nwords, k; 1565 for (k = 0; k < RV_PROLOGUE_WORDS; ++k) words[k] = RV_NOP; 1566 nwords = rv_build_prologue(a, words, RV_PROLOGUE_WORDS, frame_size, 1567 fp_pair_off, int_regs, n_int, fp_regs, n_fp); 1568 (void)nwords; 1569 for (k = 0; k < RV_PROLOGUE_WORDS; ++k) 1570 rv_patch32(obj, sec, a->prologue_pos + k * 4u, words[k]); 1571 } 1572 /* patch alloca sites: addi dst, sp, max_outgoing */ 1573 { 1574 u32 mo = align_up_u32(a->frame.max_outgoing, 16u); 1575 u32 k; 1576 if (mo > 2047u) rv_panic(a, "max_outgoing too large for alloca patch"); 1577 for (k = 0; k < a->npatches; ++k) 1578 rv_patch32(obj, sec, a->patches[k].pos, 1579 rv_addi(a->patches[k].dst_reg, RV_SP, (i32)mo)); 1580 } 1581 1582 /* CFI: CFA = s0 + (frame_size - fp_pair_off) */ 1583 if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) { 1584 if (a->slim_prologue) { 1585 /* Frameless leaf: CFA = sp (unchanged from entry) and the return address 1586 * stays live in ra (the CIE default), so no saved-register rules. The 1587 * state holds from the first instruction (offset 0). */ 1588 mc->cfi_set_next_pc_offset(mc, 0); 1589 mc->cfi_def_cfa(mc, RV_SP, 0); 1590 } else { 1591 i32 cfa = (i32)frame_size - (i32)fp_pair_off; 1592 u32 post = a->prologue_pos + (a->frame.known_frame 1593 ? a->minimal_prologue_words * 4u 1594 : RV_PROLOGUE_WORDS * 4u); 1595 u32 k; 1596 mc->cfi_set_next_pc_offset(mc, post - a->func_start); 1597 mc->cfi_def_cfa(mc, RV_S0, cfa); 1598 mc->cfi_offset(mc, RV_S0, -cfa); 1599 mc->cfi_offset(mc, RV_RA, -cfa + 8); 1600 for (k = 0; k < n_int; ++k) 1601 mc->cfi_offset(mc, int_regs[k], rv_save_off(a, k) - cfa); 1602 for (k = 0; k < n_fp; ++k) 1603 mc->cfi_offset(mc, 32u + fp_regs[k], rv_save_off(a, n_int + k) - cfa); 1604 } 1605 } 1606 1607 end = mc->pos(mc); 1608 obj_symbol_define(obj, a->func->sym, sec, (u64)a->func_start, 1609 (u64)(end - a->func_start)); 1610 if (a->func->atomize) 1611 obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->func->sym, 1612 0); 1613 if (mc->debug) debug_func_pc_range(mc->debug, sec, a->func_start, end); 1614 if (mc->cfi_endproc) mc->cfi_endproc(mc); 1615 mc_end_function(mc); 1616 a->func = NULL; 1617 } 1618 1619 /* rv64 homes its callee-saves below the locals at rv_save_off(idx) rather than 1620 * in frame slots, so alloc_slots=0: native_frame just records the {reg,cls} set 1621 * derived from the optimizer's per-class used-masks. */ 1622 static void rv_reserve_callee_saves(NativeTarget* t, const u32* used, 1623 u32 nclasses) { 1624 native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0); 1625 } 1626 1627 static int rv_reg_is_callee_int(Reg r); 1628 static int rv_reg_is_callee_fp(Reg r); 1629 static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, 1630 u32 nclob, u32* int_mask, u32* fp_mask); 1631 1632 /* Expand the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into 1633 * this target's per-class caller/callee-saved register masks. */ 1634 /* abi_clobber_masks is shared as native_asm_abi_clobber_masks 1635 * (cg/native_asm.h); it reads the masks from t->regs->classes. */ 1636 1637 /* Build the callee-saved set the prologue must preserve: the allocator-assigned 1638 * callee-saved registers (frame->callee_saved_used) plus any an inline-asm 1639 * block clobbers. The latter are opaque to the optimizer's operand scan, so it 1640 * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral 1641 * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks 1642 * and keep only the callee-saved ones โ rv_reg_is_callee_int excludes s0 (the 1643 * frame pointer, preserved by the prologue head, not as an ordinary 1644 * callee-save). This is the same register selection the per-block spill used, 1645 * hoisted into the prologue. Writes up to `cap` per-class masks into `out` and 1646 * returns the class count to reserve. */ 1647 static u32 rv_known_callee_saves(NativeTarget* t, 1648 const NativeKnownFrameDesc* frame, u32* out, 1649 u32 cap) { 1650 u32 ncls = frame->ncallee_classes; 1651 u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp; 1652 if (ncls > cap) ncls = cap; 1653 for (u32 c = 0; c < ncls; ++c) 1654 out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u; 1655 if (frame->asm_clobbers && frame->nasm_clobbers) { 1656 RvNativeTarget* a = rv_of(t); 1657 SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; 1658 rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers, 1659 &clob_int, &clob_fp); 1660 } 1661 native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int, 1662 &abi_fp); 1663 clob_int |= abi_int; 1664 clob_fp |= abi_fp; 1665 for (Reg r = 0; r < 32u; ++r) { 1666 if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) && 1667 rv_reg_is_callee_int(r)) 1668 out[NATIVE_REG_INT] |= 1u << r; 1669 if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r)) 1670 out[NATIVE_REG_FP] |= 1u << r; 1671 } 1672 return ncls; 1673 } 1674 1675 static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type, 1676 int* variadic, u32* nparams); 1677 1678 /* Optimizer entry point: the full frame is supplied up front, so the prologue 1679 * is emitted final the moment it is built โ no NOP region, no func_end patch 1680 * (rv_func_end skips patching when known_frame). rv_build_prologue emits the 1681 * sret spill and the variadic register-save stores inline, so there is no 1682 * separate entry-save emission. Slot creation order matches the single-pass 1683 * path: callee-saves first (only recorded for rv64), then static slots, then 1684 * the sret entry-save slot. */ 1685 static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd, 1686 const NativeKnownFrameDesc* frame, 1687 NativeFrameSlot* out_slots) { 1688 RvNativeTarget* a = rv_of(t); 1689 MCEmitter* mc = t->mc; 1690 u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES]; 1691 u32 n_int, n_fp, frame_size, fp_pair_off, nwords, i; 1692 u32 words[RV_KNOWN_PROLOGUE_WORDS]; 1693 rv_func_begin_common(t, fd); 1694 a->frame.known_frame = 1; 1695 if (frame) { 1696 u32 cs[NATIVE_CALL_PLAN_CLASSES]; 1697 u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES); 1698 a->frame.has_alloca = frame->has_alloca; 1699 if (ncs) rv_reserve_callee_saves(t, cs, ncs); 1700 for (i = 0; i < frame->nslots; ++i) { 1701 NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]); 1702 if (out_slots) out_slots[i] = slot; 1703 } 1704 rv_reserve_entry_saves(a); 1705 native_frame_note_outgoing(&a->frame, frame->max_outgoing); 1706 } 1707 /* Frame is final: size and offsets are settled, so emit the exact prologue. 1708 */ 1709 frame_size = rv_frame_size(a); 1710 fp_pair_off = rv_fp_pair_off(a, frame_size); 1711 a->frame_size_final = frame_size; 1712 a->fp_pair_off = fp_pair_off; 1713 a->prologue_pos = mc->pos(mc); 1714 /* Leaf no-frame tier (aa64 slim_prologue equivalent): a leaf with no 1715 * callee-saves, no body slots, no outgoing args, no sret/variadic and 1716 * register-only params never reads s0 (no frame slots / stack args) nor 1717 * clobbers ra (no calls). Emit no prologue at all; rv_func_end emits a bare 1718 * `ret`. cum_off==0 already implies no sret slot and no param spills, but the 1719 * extra guards keep the intent explicit. Inline asm is excluded: it can 1720 * clobber ra opaquely, and without the saved record the bare `ret` would 1721 * return through the destroyed link register. */ 1722 a->slim_prologue = frame && frame->is_leaf && !frame->has_asm && 1723 a->frame.ncallee_saves == 0 && !a->frame.has_alloca && 1724 a->frame.cum_off == 0 && a->frame.max_outgoing == 0 && 1725 !a->has_sret && !a->is_variadic && 1726 rv_signature_stack_bytes(t, fd->fn_type, NULL, NULL) == 0; 1727 if (a->slim_prologue) { 1728 a->minimal_prologue_words = 0; 1729 native_frame_set_final(&a->frame); 1730 return; 1731 } 1732 n_int = rv_collect_int_saves(a, int_regs); 1733 n_fp = rv_collect_fp_saves(a, fp_regs); 1734 nwords = rv_build_prologue(a, words, RV_KNOWN_PROLOGUE_WORDS, frame_size, 1735 fp_pair_off, int_regs, n_int, fp_regs, n_fp); 1736 for (i = 0; i < nwords; ++i) rv64_emit32(mc, words[i]); 1737 a->minimal_prologue_words = nwords; 1738 native_frame_set_final(&a->frame); 1739 } 1740 1741 /* ============================ params / ABI helpers 1742 * ============================ */ 1743 1744 static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi, 1745 const NativeCallDesc* desc, u32 i, 1746 ABIArgInfo* scratch) { 1747 /* Synthesized for unnamed (variadic) args, or untyped calls. RISC-V LP64D 1748 * passes variadic FP args in INTEGER registers (as their bit pattern), not 1749 * the FP pool โ so a variadic float part is ABI_CLASS_INT. */ 1750 int variadic = abi && i >= abi->nparams; 1751 if (abi && i < abi->nparams) return &abi->params[i]; 1752 memset(scratch, 0, sizeof *scratch); 1753 scratch->kind = ABI_ARG_DIRECT; 1754 scratch->nparts = 1; 1755 scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1); 1756 ((ABIArgPart*)scratch->parts)[0].cls = 1757 (!variadic && cg_type_is_float(t->c, desc->args[i].type)) ? ABI_CLASS_FP 1758 : ABI_CLASS_INT; 1759 ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; 1760 ((ABIArgPart*)scratch->parts)[0].size = 1761 native_type_size(t, desc->args[i].type); 1762 ((ABIArgPart*)scratch->parts)[0].align = 1763 native_type_align(t, desc->args[i].type); 1764 return scratch; 1765 } 1766 1767 static u32 rv_part_stack_size(const ABIArgPart* part) { 1768 return align_up_u32(part->size ? part->size : 8u, 8u); 1769 } 1770 static u32 rv_part_stack_align(const ABIArgPart* part) { 1771 u32 al = part->align ? part->align : 8u; 1772 if (al < 8u) al = 8u; 1773 if (al > 16u) al = 16u; 1774 return al; 1775 } 1776 1777 static KitCgTypeId rv_part_scalar_type(const ABIArgPart* part) { 1778 if (part->cls == ABI_CLASS_FP) { 1779 if (part->size <= 4u) return builtin_id(KIT_CG_BUILTIN_F32); 1780 return builtin_id(KIT_CG_BUILTIN_F64); 1781 } 1782 switch (part->size) { 1783 case 1u: 1784 return builtin_id(KIT_CG_BUILTIN_I8); 1785 case 2u: 1786 return builtin_id(KIT_CG_BUILTIN_I16); 1787 case 4u: 1788 return builtin_id(KIT_CG_BUILTIN_I32); 1789 default: 1790 return builtin_id(KIT_CG_BUILTIN_I64); 1791 } 1792 } 1793 1794 static u32 rv_class_stack_size(const ABIArgInfo* ai) { 1795 u32 total = 0, p; 1796 if (!ai || ai->kind == ABI_ARG_IGNORE) return 0; 1797 if (ai->kind == ABI_ARG_INDIRECT) return 8u; 1798 for (p = 0; p < ai->nparts; ++p) { 1799 total = align_up_u32(total, rv_part_stack_align(&ai->parts[p])); 1800 total += rv_part_stack_size(&ai->parts[p]); 1801 } 1802 return align_up_u32(total ? total : 8u, 8u); 1803 } 1804 1805 static u32 rv_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) { 1806 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); 1807 /* sret consumes a0 as the implicit first integer argument. */ 1808 u32 next_int = (abi && abi->has_sret) ? 1u : 0u; 1809 u32 next_fp = 0, stack = 0, i, p; 1810 for (i = 0; i < desc->nargs; ++i) { 1811 ABIArgInfo tmp; 1812 const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp); 1813 int force_stack = 1814 abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams; 1815 if (ai->kind == ABI_ARG_IGNORE) continue; 1816 if (force_stack) { 1817 stack += rv_class_stack_size(ai); 1818 continue; 1819 } 1820 if (ai->kind == ABI_ARG_INDIRECT) { 1821 if (next_int < 8u) 1822 next_int++; 1823 else 1824 stack += 8u; 1825 continue; 1826 } 1827 for (p = 0; p < ai->nparts; ++p) { 1828 const ABIArgPart* part = &ai->parts[p]; 1829 if (part->cls == ABI_CLASS_FP) { 1830 if (next_fp < 8u) 1831 next_fp++; 1832 else { 1833 stack = align_up_u32(stack, rv_part_stack_align(part)); 1834 stack += rv_part_stack_size(part); 1835 } 1836 } else { 1837 if (next_int < 8u) 1838 next_int++; 1839 else { 1840 stack = align_up_u32(stack, rv_part_stack_align(part)); 1841 stack += rv_part_stack_size(part); 1842 } 1843 } 1844 } 1845 } 1846 return align_up_u32(stack, 16u); 1847 } 1848 1849 static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type, 1850 int* variadic, u32* nparams) { 1851 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type); 1852 NativeCallDesc d; 1853 if (variadic) *variadic = abi ? (int)abi->variadic : 0; 1854 if (nparams) *nparams = abi ? abi->nparams : 0u; 1855 memset(&d, 0, sizeof d); 1856 d.fn_type = fn_type; 1857 d.nargs = abi ? abi->nparams : 0u; 1858 if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs); 1859 return rv_call_stack_size(t, &d); 1860 } 1861 1862 static u32 rv_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) { 1863 return rv_call_stack_size(t, desc); 1864 } 1865 1866 /* Resolve a NativeLoc to an addressable NativeAddr (frame/stack/addr). */ 1867 static NativeAddr rv_loc_addr(RvNativeTarget* a, NativeLoc loc, u32 offset) { 1868 NativeAddr addr; 1869 memset(&addr, 0, sizeof addr); 1870 switch ((NativeLocKind)loc.kind) { 1871 case NATIVE_LOC_FRAME: 1872 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1873 addr.base.frame = loc.v.frame; 1874 addr.base_type = loc.type; 1875 addr.offset = (i32)offset; 1876 return addr; 1877 case NATIVE_LOC_STACK: 1878 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1879 addr.base.frame = loc.v.stack.slot; 1880 addr.base_type = loc.type; 1881 addr.offset = loc.v.stack.offset + (i32)offset; 1882 return addr; 1883 case NATIVE_LOC_ADDR: 1884 addr = loc.v.addr; 1885 addr.offset += (i32)offset; 1886 return addr; 1887 default: 1888 rv_panic(a, "location is not addressable"); 1889 } 1890 return addr; 1891 } 1892 1893 static void rv_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src, 1894 u32 offset, u32 size) { 1895 RvNativeTarget* a = rv_of(t); 1896 if (src.kind == NATIVE_LOC_REG) { 1897 rv_move(t, dst, src); 1898 return; 1899 } 1900 if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK || 1901 src.kind == NATIVE_LOC_ADDR) { 1902 NativeAddr addr = rv_loc_addr(a, src, offset); 1903 addr.base_type = dst.type; 1904 rv_emit_mem(a, 1, dst, addr, native_mem_for_type(t, dst.type, size)); 1905 return; 1906 } 1907 if (src.kind == NATIVE_LOC_IMM) { 1908 rv_emit_load_imm(t->mc, rv_is_64(t, dst.type) ? 1u : 0u, loc_reg(dst), 1909 src.v.imm); 1910 return; 1911 } 1912 rv_panic(a, "unsupported part source"); 1913 } 1914 1915 static void rv_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src, 1916 u32 offset, u32 size) { 1917 RvNativeTarget* a = rv_of(t); 1918 if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK || 1919 dst.kind == NATIVE_LOC_ADDR) { 1920 NativeAddr addr = rv_loc_addr(a, dst, offset); 1921 addr.base_type = src.type; 1922 rv_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size)); 1923 return; 1924 } 1925 if (dst.kind == NATIVE_LOC_REG) { 1926 rv_move(t, dst, src); 1927 return; 1928 } 1929 rv_panic(a, "unsupported part destination"); 1930 } 1931 1932 static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) { 1933 NativeAddr addr = rv_loc_addr(rv_of(t), src, 0); 1934 rv_load_addr(t, dst, addr); 1935 } 1936 1937 static void rv_store_outgoing_part(NativeTarget* t, int tail_call, 1938 u32 stack_off, NativeLoc src, u32 size) { 1939 NativeAddr addr; 1940 memset(&addr, 0, sizeof addr); 1941 addr.base_kind = NATIVE_ADDR_BASE_REG; 1942 addr.base_type = src.type; 1943 if (tail_call) { 1944 /* A sibling call reuses the caller's frame: its outgoing stack args land in 1945 * the caller's incoming-arg window ([s0 + 16 + va_save + off]) โ physically 1946 * the same address the tail-callee will read at [sp+off] once the teardown 1947 * has restored sp to the caller's entry sp (the CFA). */ 1948 addr.base.reg = RV_S0; 1949 addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off); 1950 } else { 1951 addr.base.reg = RV_SP; 1952 addr.offset = (i32)stack_off; 1953 } 1954 rv_emit_mem(rv_of(t), 0, src, addr, native_mem_for_type(t, src.type, size)); 1955 } 1956 1957 /* NativeTarget bind_param: route incoming param (ABI loc) into dst. */ 1958 static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p, 1959 NativeLoc dst) { 1960 RvNativeTarget* a = rv_of(t); 1961 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type); 1962 const ABIArgInfo* ai = 1963 p->index < abi->nparams ? &abi->params[p->index] : NULL; 1964 int to_reg = dst.kind == NATIVE_LOC_REG; 1965 u32 i; 1966 if (!ai || ai->kind == ABI_ARG_IGNORE) return; 1967 if (ai->kind == ABI_ARG_INDIRECT) { 1968 NativeLoc src = native_loc_reg( 1969 builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 1970 a->next_param_int < 8u ? RV_A0 + a->next_param_int : RV_TMP0); 1971 NativeAddr d_addr, from; 1972 AggregateAccess access; 1973 if (a->next_param_int < 8u) { 1974 a->next_param_int++; 1975 } else { 1976 NativeAddr sa; 1977 memset(&sa, 0, sizeof sa); 1978 sa.base_kind = NATIVE_ADDR_BASE_REG; 1979 sa.base.reg = RV_S0; 1980 sa.offset = rv_s0_off_in_arg(a, a->next_param_stack); 1981 sa.base_type = src.type; 1982 rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, src.type, 8)); 1983 a->next_param_stack += 8u; 1984 } 1985 if (dst.kind != NATIVE_LOC_FRAME) 1986 rv_panic(a, "indirect parameter requires a frame destination"); 1987 memset(&d_addr, 0, sizeof d_addr); 1988 d_addr.base_kind = NATIVE_ADDR_BASE_FRAME; 1989 d_addr.base.frame = dst.v.frame; 1990 d_addr.base_type = p->type; 1991 memset(&from, 0, sizeof from); 1992 from.base_kind = NATIVE_ADDR_BASE_REG; 1993 from.base.reg = loc_reg(src); 1994 from.base_type = p->type; 1995 memset(&access, 0, sizeof access); 1996 access.type = p->type; 1997 access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type); 1998 access.align = p->align ? p->align : native_type_align(t, p->type); 1999 rv_copy_bytes(t, d_addr, from, access); 2000 return; 2001 } 2002 for (i = 0; i < ai->nparts; ++i) { 2003 const ABIArgPart* part = &ai->parts[i]; 2004 NativeAllocClass cls = 2005 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2006 NativeLoc src; 2007 if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) { 2008 src = native_loc_reg(p->type, cls, RV_FA0 + a->next_param_fp++); 2009 } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) { 2010 src = native_loc_reg(p->type, cls, RV_A0 + a->next_param_int++); 2011 } else { 2012 Reg tmp = (cls == NATIVE_REG_FP) ? RV_FTMP0 : RV_TMP0; 2013 NativeAddr sa; 2014 src = native_loc_reg(p->type, cls, tmp); 2015 a->next_param_stack = 2016 align_up_u32(a->next_param_stack, rv_part_stack_align(part)); 2017 memset(&sa, 0, sizeof sa); 2018 sa.base_kind = NATIVE_ADDR_BASE_REG; 2019 sa.base.reg = RV_S0; 2020 sa.base_type = p->type; 2021 sa.offset = rv_s0_off_in_arg(a, a->next_param_stack); 2022 rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, p->type, part->size)); 2023 a->next_param_stack += rv_part_stack_size(part); 2024 } 2025 if (dst.kind == NATIVE_LOC_NONE) { 2026 /* unused parameter; cursors already advanced */ 2027 } else if (to_reg) { 2028 NativeLoc d = native_loc_reg(dst.type ? dst.type : p->type, 2029 (NativeAllocClass)dst.cls, (Reg)dst.v.reg); 2030 if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) && 2031 (NativeAllocClass)src.cls == (NativeAllocClass)d.cls)) 2032 rv_move(t, d, src); 2033 } else { 2034 rv_store_part( 2035 t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src, 2036 0, part->size); 2037 } 2038 } 2039 a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); 2040 } 2041 2042 /* ============================ calls / returns ============================ */ 2043 2044 typedef NativeArgMove RvArgMove; 2045 2046 static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) { 2047 if (m->is_addr) 2048 rv_addr_of_loc(t, m->dst, m->src); 2049 else 2050 rv_load_part(t, m->dst, m->src, m->src_offset, m->size); 2051 } 2052 2053 /* Parallel-copy register arg moves via the shared scheduler; cycles break 2054 * through the int/fp emit scratch (t1 / ft1). */ 2055 static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves, 2056 u32 n) { 2057 NativeArgShuffle s; 2058 if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args"); 2059 memset(&s, 0, sizeof s); 2060 s.t = t; 2061 s.emit_one = rv_emit_one_arg_move; 2062 s.reg_move = rv_move; 2063 s.scratch[NATIVE_REG_INT] = RV_TMP1; 2064 s.scratch[NATIVE_REG_FP] = RV_FTMP1; 2065 native_arg_shuffle(&s, moves, n); 2066 } 2067 2068 static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, 2069 NativeCallPlan* plan) { 2070 RvNativeTarget* a = rv_of(t); 2071 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type); 2072 NativeCallPlanRet* rets; 2073 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2074 memset(plan, 0, sizeof *plan); 2075 rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL; 2076 plan->callee = desc->callee; 2077 plan->rets = rets; 2078 plan->flags = desc->flags; 2079 plan->has_sret = abi && abi->has_sret; 2080 plan->is_variadic = abi && abi->variadic; 2081 plan->stack_arg_size = rv_call_stack_size(t, desc); 2082 if (plan->stack_arg_size > a->frame.max_outgoing) 2083 a->frame.max_outgoing = plan->stack_arg_size; 2084 /* Indirect callee in an arg register would be clobbered by arg loads. */ 2085 if (plan->callee.kind == NATIVE_LOC_REG && 2086 (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT && 2087 plan->callee.v.reg >= RV_A0 && plan->callee.v.reg <= RV_A7) { 2088 NativeLoc scratch = 2089 native_loc_reg(plan->callee.type, NATIVE_REG_INT, RV_TMP0); 2090 rv_move(t, scratch, plan->callee); 2091 plan->callee = scratch; 2092 } 2093 { 2094 /* sret returns pass the hidden destination pointer as the implicit first 2095 * integer argument (a0), so the real args start at a1. */ 2096 u32 next_int = (abi && abi->has_sret) ? 1u : 0u; 2097 u32 next_fp = 0, stack = 0, nmoves = 0, i, p; 2098 int tail = (desc->flags & CG_CALL_TAIL) != 0; 2099 RvArgMove moves[RV_MAX_REG_ARG_MOVES]; 2100 for (i = 0; i < desc->nargs; ++i) { 2101 ABIArgInfo tmp; 2102 const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp); 2103 int force_stack = 2104 abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams; 2105 if (ai->kind == ABI_ARG_IGNORE) continue; 2106 if (force_stack) { 2107 NativeLoc tmpreg = 2108 native_loc_reg(desc->args[i].type, NATIVE_REG_INT, RV_TMP0); 2109 u32 n = rv_class_stack_size(ai), off = 0; 2110 while (off < n) { 2111 rv_load_part(t, tmpreg, desc->args[i], off, 8); 2112 rv_store_outgoing_part(t, tail, stack + off, tmpreg, 8); 2113 off += 8; 2114 } 2115 stack += n; 2116 continue; 2117 } 2118 if (ai->kind == ABI_ARG_INDIRECT) { 2119 if (next_int < 8u) { 2120 RvArgMove* m = &moves[nmoves++]; 2121 m->dst = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0 + next_int++); 2122 m->src = desc->args[i]; 2123 m->src_offset = 0; 2124 m->size = 8; 2125 m->is_addr = 1; 2126 } else { 2127 NativeLoc ptr = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0); 2128 rv_addr_of_loc(t, ptr, desc->args[i]); 2129 rv_store_outgoing_part(t, tail, stack, ptr, 8); 2130 stack += 8u; 2131 } 2132 continue; 2133 } 2134 for (p = 0; p < ai->nparts; ++p) { 2135 const ABIArgPart* part = &ai->parts[p]; 2136 NativeAllocClass cls = 2137 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2138 if ((cls == NATIVE_REG_FP && next_fp < 8u) || 2139 (cls == NATIVE_REG_INT && next_int < 8u)) { 2140 RvArgMove* m = &moves[nmoves++]; 2141 Reg areg = 2142 cls == NATIVE_REG_FP ? RV_FA0 + next_fp++ : RV_A0 + next_int++; 2143 m->dst = native_loc_reg(desc->args[i].type, cls, areg); 2144 m->src = desc->args[i]; 2145 m->src_offset = part->src_offset; 2146 m->size = part->size; 2147 m->is_addr = 0; 2148 } else { 2149 Reg tmp = cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0; 2150 NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp); 2151 rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); 2152 stack = align_up_u32(stack, rv_part_stack_align(part)); 2153 rv_store_outgoing_part(t, tail, stack, tmpreg, part->size); 2154 stack += rv_part_stack_size(part); 2155 } 2156 } 2157 } 2158 rv_emit_reg_arg_moves(t, moves, nmoves); 2159 if (abi && abi->has_sret && desc->nresults) { 2160 /* sret pointer goes in a0; arg loads have completed. A tail call forwards 2161 * the caller's own incoming sret pointer (spilled at entry) so the 2162 * sibling writes the result into the caller's caller's destination; 2163 * otherwise pass the address of this call's result slot. */ 2164 NativeLoc a0 = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0); 2165 if (tail) 2166 rv_load_part(t, a0, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0, 8); 2167 else 2168 rv_addr_of_loc(t, a0, desc->results[0]); 2169 } 2170 } 2171 if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) { 2172 u32 nr = 0, ni = 0, nf = 0, p; 2173 for (p = 0; p < abi->ret.nparts; ++p) { 2174 const ABIArgPart* part = &abi->ret.parts[p]; 2175 NativeAllocClass cls = 2176 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2177 KitCgTypeId pty = rv_part_scalar_type(part); 2178 Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++; 2179 rets[nr].src = native_loc_reg(pty, cls, rreg); 2180 rets[nr].dst = desc->results[0]; 2181 if (rets[nr].dst.kind == NATIVE_LOC_FRAME) 2182 rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame, 2183 (i32)part->src_offset); 2184 else if (rets[nr].dst.kind == NATIVE_LOC_STACK) { 2185 rets[nr].dst.v.stack.offset += (i32)part->src_offset; 2186 rets[nr].dst.type = pty; 2187 } 2188 rets[nr].mem = native_mem_for_type(t, pty, part->size); 2189 nr++; 2190 } 2191 plan->nrets = nr; 2192 } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) { 2193 plan->nrets = 0; 2194 } else if (!abi && desc->nresults) { 2195 rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, RV_A0); 2196 rets[0].dst = desc->results[0]; 2197 rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0); 2198 plan->nrets = 1; 2199 } 2200 } 2201 2202 /* Emit a sibling (tail) call: tear the frame down to the caller's entry state 2203 * and jump (no link) to the callee. Outgoing args are already in the arg regs / 2204 * the caller's incoming-arg window. At -O0 there are no callee-saves, and the 2205 * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of 2206 * the not-yet-final frame_size โ so no func_end patching is needed. */ 2207 static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) { 2208 RvNativeTarget* a = rv_of(t); 2209 MCEmitter* mc = t->mc; 2210 i32 cfa = (i32)(RV_FRAME_SAVE_SIZE + rv_va_save_sz(a)); 2211 int indirect = callee.kind == NATIVE_LOC_REG; 2212 u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES]; 2213 u32 n_int = rv_collect_int_saves(a, int_regs); 2214 u32 n_fp = rv_collect_fp_saves(a, fp_regs); 2215 i32 i; 2216 /* Stage an indirect callee into a reserved scratch (t1) BEFORE the teardown: 2217 * regalloc parks the function pointer in a callee-saved register so it 2218 * survives arg marshalling, and the callee-save / s0 / ra restores below 2219 * would otherwise overwrite it. t1 is reserved (never allocable) and 2220 * untouched by the restore loop (which only uses t0 for far offsets). */ 2221 if (indirect) rv64_emit32(mc, rv_addi(RV_TMP1, loc_reg(callee), 0)); 2222 /* Restore callee-saves before tearing the frame down (O1 path; none at -O0). 2223 * Their save offsets are s0-relative via rv_save_off, so the restore is 2224 * frame-size- and teardown-order-independent. */ 2225 for (i = (i32)n_int - 1; i >= 0; --i) 2226 rv_load_s0(mc, 0, int_regs[i], rv_save_off(a, (u32)i)); 2227 for (i = (i32)n_fp - 1; i >= 0; --i) 2228 rv_load_s0(mc, 1, fp_regs[i], rv_save_off(a, n_int + (u32)i)); 2229 rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8)); 2230 rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa)); 2231 rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0)); 2232 if (callee.kind == NATIVE_LOC_GLOBAL) { 2233 u32 pos = mc->pos(mc); 2234 rv64_emit32(mc, rv_auipc(RV_TMP0, 0)); 2235 rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0)); 2236 mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym, 2237 callee.v.global.addend, 0, 0); 2238 } else if (indirect) { 2239 rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP1, 0)); 2240 } else { 2241 rv_panic(a, "unsupported tail call target"); 2242 } 2243 } 2244 2245 static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) { 2246 MCEmitter* mc = t->mc; 2247 ObjSecId sec = mc->section_id; 2248 if (plan->flags & CG_CALL_TAIL) { 2249 rv_emit_tail_site(t, plan->callee); 2250 return; 2251 } 2252 if (plan->callee.kind == NATIVE_LOC_GLOBAL) { 2253 u32 pos = mc->pos(mc); 2254 rv64_emit32(mc, rv_auipc(RV_RA, 0)); 2255 rv64_emit32(mc, rv_jalr(RV_RA, RV_RA, 0)); 2256 mc->emit_reloc_at(mc, sec, pos, R_RV_CALL, plan->callee.v.global.sym, 2257 plan->callee.v.global.addend, 0, 0); 2258 return; 2259 } 2260 if (plan->callee.kind == NATIVE_LOC_REG) { 2261 rv64_emit32(mc, rv_jalr(RV_RA, loc_reg(plan->callee), 0)); 2262 return; 2263 } 2264 rv_panic(rv_of(t), "unsupported call target"); 2265 } 2266 2267 static void rv_plan_ret(NativeTarget* t, const CGFuncDesc* fd, 2268 const NativeLoc* values, u32 nvalues, 2269 NativeCallPlanRet** out_rets, u32* out_nrets) { 2270 RvNativeTarget* a = rv_of(t); 2271 const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type); 2272 NativeCallPlanRet* rets = NULL; 2273 u32 nr = 0; 2274 if (nvalues > 1u) rv_panic(a, "multiple returns unsupported"); 2275 if (nvalues) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4); 2276 if (nvalues && abi && abi->ret.kind == ABI_ARG_INDIRECT) { 2277 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2278 NativeLoc dstp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1); 2279 NativeLoc saved = native_loc_stack(i64t, a->sret_ptr_slot, 0); 2280 NativeAddr dst_addr, src_addr; 2281 AggregateAccess access; 2282 rv_load_part(t, dstp, saved, 0, 8); 2283 memset(&dst_addr, 0, sizeof dst_addr); 2284 dst_addr.base_kind = NATIVE_ADDR_BASE_REG; 2285 dst_addr.base.reg = RV_TMP1; 2286 dst_addr.base_type = values[0].type; 2287 src_addr = rv_loc_addr(a, values[0], 0); 2288 src_addr.base_type = values[0].type; 2289 memset(&access, 0, sizeof access); 2290 access.type = values[0].type; 2291 access.size = (u32)cg_type_size(t->c, values[0].type); 2292 access.align = native_type_align(t, values[0].type); 2293 rv_copy_bytes(t, dst_addr, src_addr, access); 2294 *out_rets = NULL; 2295 *out_nrets = 0; 2296 return; 2297 } 2298 if (nvalues && abi && abi->ret.kind == ABI_ARG_DIRECT) { 2299 u32 ni = 0, nf = 0, p; 2300 for (p = 0; p < abi->ret.nparts; ++p) { 2301 const ABIArgPart* part = &abi->ret.parts[p]; 2302 NativeAllocClass cls = 2303 part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; 2304 KitCgTypeId pty = rv_part_scalar_type(part); 2305 Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++; 2306 rets[nr].src = values[0]; 2307 if (rets[nr].src.kind == NATIVE_LOC_FRAME) 2308 rets[nr].src = 2309 native_loc_stack(pty, values[0].v.frame, (i32)part->src_offset); 2310 else if (rets[nr].src.kind == NATIVE_LOC_STACK) { 2311 rets[nr].src.v.stack.offset += (i32)part->src_offset; 2312 rets[nr].src.type = pty; 2313 } 2314 rets[nr].dst = native_loc_reg(pty, cls, rreg); 2315 rets[nr].mem = native_mem_for_type(t, pty, part->size); 2316 nr++; 2317 } 2318 } else if (nvalues) { 2319 rets[0].src = values[0]; 2320 rets[0].dst = native_loc_reg(values[0].type, NATIVE_REG_INT, RV_A0); 2321 rets[0].mem = native_mem_for_type(t, values[0].type, 0); 2322 nr = 1; 2323 } 2324 *out_rets = rets; 2325 *out_nrets = nr; 2326 } 2327 2328 static void rv_ret(NativeTarget* t) { 2329 RvNativeTarget* a = rv_of(t); 2330 rv_jump(t, a->epilogue_label); 2331 } 2332 2333 /* ============================ alloca ============================ */ 2334 2335 static void rv_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size, 2336 u32 align) { 2337 RvNativeTarget* a = rv_of(t); 2338 MCEmitter* mc = t->mc; 2339 u32 rsz = loc_reg(size); 2340 u32 rd = loc_reg(dst); 2341 u32 al = align ? align : 16u; 2342 if (al < 16u) al = 16u; 2343 /* round up: t0 = (size + (al-1)) & ~(al-1) */ 2344 rv64_emit32(mc, rv_addi(RV_TMP0, rsz, (i32)(al - 1u))); 2345 rv_emit_load_imm(mc, 1, RV_TMP1, -(i64)al); 2346 rv64_emit32(mc, rv_and(RV_TMP0, RV_TMP0, RV_TMP1)); 2347 rv64_emit32(mc, rv_sub(RV_SP, RV_SP, RV_TMP0)); 2348 a->frame.has_alloca = 1; 2349 /* dst = sp + max_outgoing (patched in func_end) */ 2350 if (a->npatches == a->patches_cap) { 2351 u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u; 2352 RvPatch* nb = arena_zarray(t->c->tu, RvPatch, cap); 2353 if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches); 2354 a->patches = nb; 2355 a->patches_cap = cap; 2356 } 2357 a->patches[a->npatches].kind = RV_PATCH_ALLOCA; 2358 a->patches[a->npatches].pos = mc->pos(mc); 2359 a->patches[a->npatches].dst_reg = rd; 2360 a->npatches++; 2361 a->nalloca++; 2362 rv64_emit32(mc, RV_NOP); /* placeholder for addi dst, sp, max_outgoing */ 2363 } 2364 2365 /* ============================ TLS / bitfield / atomics 2366 * ============================ */ 2367 2368 static void rv_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym, 2369 i64 addend) { 2370 MCEmitter* mc = t->mc; 2371 u32 sec = mc->section_id; 2372 u32 rd = loc_reg(dst); 2373 /* Local-Exec only, matching aa64 (aa_tls_addr_of) and x64 (x64_tls_addr_of): 2374 * kit links the whole module statically, so every _Thread_local symbol is 2375 * resolved within the image and TPREL is always valid. An Initial-Exec GOT 2376 * path (R_RV_TLS_GOT_HI20) used to be emitted for extern-via-GOT symbols 2377 * under -fPIE (the hosted default), but the linker has no layout/apply for 2378 * that reloc, so it produced a hard "unsupported reloc kind" link failure 2379 * rather than a working binary. */ 2380 /* lui t0, %tprel_hi(sym); add t0, tp, t0; addi dst, t0, %tprel_lo(sym). */ 2381 { 2382 u32 hp = mc->pos(mc); 2383 rv64_emit32(mc, rv_lui(RV_TMP0, 0)); 2384 mc->emit_reloc_at(mc, sec, hp, R_RV_TPREL_HI20, sym, addend, 0, 0); 2385 rv64_emit32(mc, rv_add(RV_TMP0, RV_TP, RV_TMP0)); 2386 { 2387 u32 lp = mc->pos(mc); 2388 rv64_emit32(mc, rv_addi(rd, RV_TMP0, 0)); 2389 mc->emit_reloc_at(mc, sec, lp, R_RV_TPREL_LO12_I, sym, addend, 0, 0); 2390 } 2391 } 2392 } 2393 static void rv_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr ra, 2394 BitFieldAccess bf) { 2395 RvNativeTarget* a = rv_of(t); 2396 MCEmitter* mc = t->mc; 2397 u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; 2398 u32 rd = loc_reg(dst); 2399 u32 base; 2400 i32 off; 2401 u32 lsb = bf.bit_offset; 2402 u32 width = bf.bit_width ? bf.bit_width : 1u; 2403 /* Shift left so the field's MSB lands at bit 63, then shift right to 2404 * sign/zero extend it down. Use 64-bit shifts throughout. */ 2405 u32 sh_left = 64u - (lsb + width); 2406 u32 sh_right = 64u - width; 2407 ra.offset += (i32)bf.storage_offset; 2408 rv_resolve_mem_addr(a, &ra, &base, &off); 2409 rv64_emit32(mc, enc_int_load(storage_bytes, 0, rd, base, off)); 2410 rv64_emit32(mc, rv_slli(rd, rd, sh_left)); 2411 if (bf.signed_) 2412 rv64_emit32(mc, rv_srai(rd, rd, sh_right)); 2413 else 2414 rv64_emit32(mc, rv_srli(rd, rd, sh_right)); 2415 } 2416 static void rv_bitfield_store(NativeTarget* t, NativeAddr ra, NativeLoc src, 2417 BitFieldAccess bf) { 2418 RvNativeTarget* a = rv_of(t); 2419 MCEmitter* mc = t->mc; 2420 u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; 2421 u32 src_reg = loc_reg(src); 2422 u32 base; 2423 i32 off; 2424 u32 lsb = bf.bit_offset; 2425 u32 width = bf.bit_width ? bf.bit_width : 1u; 2426 u64 ones = width >= 64u ? ~(u64)0 : (((u64)1 << width) - 1u); 2427 u64 mask_in = ones << lsb; 2428 ra.offset += (i32)bf.storage_offset; 2429 /* Resolve the field address; rv_resolve_mem_addr may use RV_TMP0/RV_TMP1, so 2430 * stabilize the base into RV_TMP1 before consuming the scratch temps. */ 2431 rv_resolve_mem_addr(a, &ra, &base, &off); 2432 if (base != RV_S0 && base != RV_TMP1) { 2433 rv_emit_addr_adjust(mc, RV_TMP1, base, off); 2434 base = RV_TMP1; 2435 off = 0; 2436 } else if (base == RV_TMP1 && off != 0) { 2437 rv_emit_addr_adjust(mc, RV_TMP1, RV_TMP1, off); 2438 off = 0; 2439 } 2440 /* word in RV_TMP2; merged via RV_TMP0 (clear mask, then shifted src). */ 2441 rv64_emit32(mc, enc_int_load(storage_bytes, 0, RV_TMP2, base, off)); 2442 rv_emit_load_imm(mc, 1, RV_TMP0, (i64)~mask_in); 2443 rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP0)); 2444 rv_emit_load_imm(mc, 1, RV_TMP0, (i64)ones); 2445 rv64_emit32(mc, rv_and(RV_TMP0, src_reg, RV_TMP0)); 2446 if (lsb) rv64_emit32(mc, rv_slli(RV_TMP0, RV_TMP0, lsb)); 2447 rv64_emit32(mc, rv_or(RV_TMP2, RV_TMP2, RV_TMP0)); 2448 rv64_emit32(mc, enc_int_store(storage_bytes, RV_TMP2, base, off)); 2449 } 2450 static int rv_order_acquire(KitCgMemOrder o) { 2451 return o == KIT_CG_MO_CONSUME || o == KIT_CG_MO_ACQUIRE || 2452 o == KIT_CG_MO_ACQ_REL || o == KIT_CG_MO_SEQ_CST; 2453 } 2454 static int rv_order_release(KitCgMemOrder o) { 2455 return o == KIT_CG_MO_RELEASE || o == KIT_CG_MO_ACQ_REL || 2456 o == KIT_CG_MO_SEQ_CST; 2457 } 2458 2459 /* Materialize the atomic operand address into RV_TMP0 (a bare pointer, since 2460 * LR/SC and AMO take a base register with no offset) and return it. */ 2461 static u32 rv_atomic_addr_reg(RvNativeTarget* a, NativeAddr addr) { 2462 NativeLoc dst = 2463 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0); 2464 rv_load_addr(&a->base, dst, addr); 2465 return RV_TMP0; 2466 } 2467 2468 static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, 2469 MemAccess mem, KitCgMemOrder mo) { 2470 RvNativeTarget* a = rv_of(t); 2471 MCEmitter* mc = t->mc; 2472 u32 sf = 2473 (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u; 2474 u32 base = rv_atomic_addr_reg(a, addr); 2475 if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw()); 2476 if (rv_order_acquire(mo)) { 2477 /* lr.w/d as an ordered load (aq=1). */ 2478 rv64_emit32(mc, sf ? rv_lr_d(loc_reg(dst), base, 1, 0) 2479 : rv_lr_w(loc_reg(dst), base, 1, 0)); 2480 } else { 2481 rv64_emit32( 2482 mc, enc_int_load(mem.size ? mem.size : native_type_size(t, dst.type), 0, 2483 loc_reg(dst), base, 0)); 2484 } 2485 } 2486 2487 static void rv_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src, 2488 MemAccess mem, KitCgMemOrder mo) { 2489 RvNativeTarget* a = rv_of(t); 2490 MCEmitter* mc = t->mc; 2491 u32 sz = mem.size ? mem.size : native_type_size(t, src.type); 2492 /* RV_TMP0 holds the address; never collides with src (an allocable reg). */ 2493 u32 base = rv_atomic_addr_reg(a, addr); 2494 if (rv_order_release(mo)) rv64_emit32(mc, rv_fence_rw_rw()); 2495 rv64_emit32(mc, enc_int_store(sz, loc_reg(src), base, 0)); 2496 if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw()); 2497 } 2498 2499 static void rv_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst, 2500 NativeAddr addr, NativeLoc val, MemAccess mem, 2501 KitCgMemOrder mo) { 2502 RvNativeTarget* a = rv_of(t); 2503 MCEmitter* mc = t->mc; 2504 u32 sf = 2505 (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u; 2506 u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */ 2507 u32 vreg = loc_reg(val); 2508 u32 rd = loc_reg(dst); 2509 u32 aq = (u32)rv_order_acquire(mo); 2510 u32 rl = (u32)rv_order_release(mo); 2511 MCLabel retry = mc->label_new(mc); 2512 /* LR/SC loop: dst = *base; new = dst op val; sc new; retry on failure. 2513 * RV_TMP1 carries the SC status, RV_TMP3 the computed new value. */ 2514 mc->label_place(mc, retry); 2515 rv64_emit32(mc, sf ? rv_lr_d(rd, base, aq, 0) : rv_lr_w(rd, base, aq, 0)); 2516 switch (op) { 2517 case KIT_CG_ATOMIC_XCHG: 2518 rv64_emit32(mc, rv_addi(RV_TMP3, vreg, 0)); 2519 break; 2520 case KIT_CG_ATOMIC_ADD: 2521 rv64_emit32(mc, 2522 sf ? rv_add(RV_TMP3, rd, vreg) : rv_addw(RV_TMP3, rd, vreg)); 2523 break; 2524 case KIT_CG_ATOMIC_SUB: 2525 rv64_emit32(mc, 2526 sf ? rv_sub(RV_TMP3, rd, vreg) : rv_subw(RV_TMP3, rd, vreg)); 2527 break; 2528 case KIT_CG_ATOMIC_AND: 2529 rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg)); 2530 break; 2531 case KIT_CG_ATOMIC_OR: 2532 rv64_emit32(mc, rv_or(RV_TMP3, rd, vreg)); 2533 break; 2534 case KIT_CG_ATOMIC_XOR: 2535 rv64_emit32(mc, rv_xor(RV_TMP3, rd, vreg)); 2536 break; 2537 case KIT_CG_ATOMIC_NAND: 2538 rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg)); 2539 rv64_emit32(mc, rv_xori(RV_TMP3, RV_TMP3, -1)); 2540 break; 2541 default: 2542 rv_panic(a, "unsupported atomic rmw op"); 2543 } 2544 rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, RV_TMP3, 0, rl) 2545 : rv_sc_w(RV_TMP1, base, RV_TMP3, 0, rl)); 2546 rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0)); 2547 mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0); 2548 } 2549 2550 static void rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, 2551 NativeAddr addr, NativeLoc expected, 2552 NativeLoc desired, MemAccess mem, 2553 KitCgMemOrder success, KitCgMemOrder failure) { 2554 RvNativeTarget* a = rv_of(t); 2555 MCEmitter* mc = t->mc; 2556 u32 sf = 2557 (mem.size ? mem.size : native_type_size(t, prior.type)) == 8u ? 1u : 0u; 2558 u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */ 2559 u32 rprior = loc_reg(prior); 2560 u32 rexp = loc_reg(expected); 2561 u32 rdes = loc_reg(desired); 2562 u32 rok = loc_reg(ok); 2563 u32 aq = (u32)rv_order_acquire(success); 2564 u32 rl = (u32)rv_order_release(success); 2565 MCLabel retry = mc->label_new(mc); 2566 MCLabel fail = mc->label_new(mc); 2567 MCLabel done = mc->label_new(mc); 2568 (void)failure; 2569 mc->label_place(mc, retry); 2570 rv64_emit32(mc, 2571 sf ? rv_lr_d(rprior, base, aq, 0) : rv_lr_w(rprior, base, aq, 0)); 2572 /* if (prior != expected) -> fail */ 2573 rv64_emit32(mc, rv_bne(rprior, rexp, 0)); 2574 mc->emit_label_ref(mc, fail, R_RV_BRANCH, 4, 0); 2575 /* sc.w/d status, desired, (base); retry on failure. */ 2576 rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, rdes, 0, rl) 2577 : rv_sc_w(RV_TMP1, base, rdes, 0, rl)); 2578 rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0)); 2579 mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0); 2580 /* ok = 1; jump done. */ 2581 rv_emit_load_imm(mc, 0, rok, 1); 2582 rv64_emit32(mc, rv_jal(RV_ZERO, 0)); 2583 mc->emit_label_ref(mc, done, R_RV_JAL, 4, 0); 2584 mc->label_place(mc, fail); 2585 rv_emit_load_imm(mc, 0, rok, 0); 2586 mc->label_place(mc, done); 2587 } 2588 2589 static void rv_fence(NativeTarget* t, KitCgMemOrder mo) { 2590 if (mo == KIT_CG_MO_RELAXED) return; 2591 rv64_emit32(t->mc, rv_fence_rw_rw()); 2592 } 2593 /* ---- variadics (LP64D ABI_VA_LIST_POINTER) ---- 2594 * va_list is a single void* to the next argument slot. The prologue spilled 2595 * unconsumed a-regs into the 64-byte save area at [s0+16); incoming stack args 2596 * follow contiguously, so a uniform 8-byte stride covers both. `ap` is a 2597 * NativeAddr that addresses the va_list object itself. */ 2598 2599 static void rv_va_start_core(RvNativeTarget* a, NativeAddr ap) { 2600 NativeTarget* t = &a->base; 2601 MCEmitter* mc = t->mc; 2602 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 2603 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2604 if (vai.kind != ABI_VA_LIST_POINTER) 2605 rv_panic(a, "unsupported va_list layout"); 2606 if (!a->is_variadic) rv_panic(a, "va_start: function not variadic"); 2607 /* *ap = s0 + 16 + next_param_int*8 (skip past named-int save slots). */ 2608 rv64_emit32(mc, rv_addi(RV_TMP1, RV_S0, 16 + (i32)(a->next_param_int * 8u))); 2609 rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1), ap, 2610 native_mem_for_type(t, i64t, 8)); 2611 } 2612 2613 static void rv_va_arg_core(RvNativeTarget* a, NativeLoc dst, NativeAddr ap, 2614 KitCgTypeId type) { 2615 NativeTarget* t = &a->base; 2616 MCEmitter* mc = t->mc; 2617 ABIVaListInfo vai = abi_va_list_layout(t->c->abi); 2618 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2619 u32 sz = native_type_size(t, type); 2620 NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1); 2621 NativeAddr from; 2622 if (vai.kind != ABI_VA_LIST_POINTER) 2623 rv_panic(a, "unsupported va_list layout"); 2624 if (dst.kind != NATIVE_LOC_REG) rv_panic(a, "va_arg destination must be reg"); 2625 /* cur = *ap; load value from [cur]; *ap = cur + 8 (each slot is 8 bytes). */ 2626 rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, 8)); 2627 memset(&from, 0, sizeof from); 2628 from.base_kind = NATIVE_ADDR_BASE_REG; 2629 from.base.reg = RV_TMP1; 2630 from.base_type = type; 2631 if (native_loc_is_fp(dst)) { 2632 /* Variadic FP args sit in the integer save area as their bit pattern; 2633 * load into RV_TMP2 and bitcast into the FPR. */ 2634 NativeLoc itmp = native_loc_reg(type, NATIVE_REG_INT, RV_TMP2); 2635 rv_emit_mem(a, 1, itmp, from, native_mem_for_type(t, type, sz)); 2636 rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(loc_reg(dst), RV_TMP2) 2637 : rv_fmv_w_x(loc_reg(dst), RV_TMP2)); 2638 } else { 2639 rv_emit_mem(a, 1, dst, from, native_mem_for_type(t, type, sz)); 2640 } 2641 rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, 8)); 2642 rv_emit_mem(a, 0, cur, ap, native_mem_for_type(t, i64t, 8)); 2643 } 2644 2645 static void rv_va_copy_core(RvNativeTarget* a, NativeAddr dst_ap, 2646 NativeAddr src_ap) { 2647 NativeTarget* t = &a->base; 2648 KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64); 2649 NativeLoc tmp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1); 2650 /* va_list is a single 8-byte pointer. */ 2651 rv_emit_mem(a, 1, tmp, src_ap, native_mem_for_type(t, i64t, 8)); 2652 rv_emit_mem(a, 0, tmp, dst_ap, native_mem_for_type(t, i64t, 8)); 2653 } 2654 2655 static NativeAddr rv_va_addr_from_ptr(NativeLoc ap_ptr) { 2656 NativeAddr addr; 2657 memset(&addr, 0, sizeof addr); 2658 addr.base_kind = NATIVE_ADDR_BASE_REG; 2659 addr.cls = NATIVE_REG_INT; 2660 addr.base.reg = ap_ptr.v.reg; 2661 addr.base_type = ap_ptr.type; 2662 return addr; 2663 } 2664 2665 static void rv_va_start_native(NativeTarget* t, NativeLoc ap_ptr) { 2666 rv_va_start_core(rv_of(t), rv_va_addr_from_ptr(ap_ptr)); 2667 } 2668 static void rv_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr, 2669 KitCgTypeId type) { 2670 rv_va_arg_core(rv_of(t), dst, rv_va_addr_from_ptr(ap_ptr), type); 2671 } 2672 static void rv_va_end_native(NativeTarget* t, NativeLoc ap_ptr) { 2673 (void)t; 2674 (void)ap_ptr; 2675 } 2676 static void rv_va_copy_native(NativeTarget* t, NativeLoc dst, NativeLoc src) { 2677 rv_va_copy_core(rv_of(t), rv_va_addr_from_ptr(dst), rv_va_addr_from_ptr(src)); 2678 } 2679 /* Software popcount of RV_TMP1 (already width-normalized) into rd, using 2680 * RV_TMP1/RV_TMP2/RV_TMP3 as scratch. Mirrors the legacy bit-twiddling. */ 2681 static void rv_emit_popcount(MCEmitter* mc, u32 rd, int is64) { 2682 rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 1)); 2683 rv_emit_load_imm(mc, 1, RV_TMP3, 2684 is64 ? (i64)0x5555555555555555ll : (i64)0x55555555); 2685 rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP3)); 2686 rv64_emit32(mc, rv_sub(RV_TMP1, RV_TMP1, RV_TMP2)); 2687 rv_emit_load_imm(mc, 1, RV_TMP3, 2688 is64 ? (i64)0x3333333333333333ll : (i64)0x33333333); 2689 rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP1, RV_TMP3)); 2690 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 2)); 2691 rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3)); 2692 rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2)); 2693 rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 4)); 2694 rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2)); 2695 rv_emit_load_imm(mc, 1, RV_TMP3, 2696 is64 ? (i64)0x0f0f0f0f0f0f0f0fll : (i64)0x0f0f0f0f); 2697 rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3)); 2698 rv_emit_load_imm(mc, 1, RV_TMP3, 2699 is64 ? (i64)0x0101010101010101ll : (i64)0x01010101); 2700 rv64_emit32(mc, rv_mul(RV_TMP1, RV_TMP1, RV_TMP3)); 2701 rv64_emit32(mc, rv_srli(rd, RV_TMP1, is64 ? 56u : 24u)); 2702 /* The 32-bit SWAR sum lives in product bits [24,32); since the multiply is 2703 * 64-bit, bits [32,64) survive the >>24 and must be masked off. (The 64-bit 2704 * path's >>56 already isolates the top byte, so it needs no mask.) */ 2705 if (!is64) rv64_emit32(mc, rv_andi(rd, rd, 0xff)); 2706 } 2707 2708 /* Inline byte-granule copy/set between bare base registers (memcpy/memmove/ 2709 * memset intrinsics). dir<0 copies high-to-low (memmove backward). */ 2710 static void rv_intrin_copy(MCEmitter* mc, u32 dr, u32 sr, u32 n, int backward) { 2711 if (!backward) { 2712 u32 i = 0; 2713 while (i + 8u <= n) { 2714 rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i)); 2715 rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i)); 2716 i += 8u; 2717 } 2718 while (i + 4u <= n) { 2719 rv64_emit32(mc, rv_lwu(RV_TMP3, sr, (i32)i)); 2720 rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i)); 2721 i += 4u; 2722 } 2723 while (i + 2u <= n) { 2724 rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i)); 2725 rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i)); 2726 i += 2u; 2727 } 2728 while (i < n) { 2729 rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i)); 2730 rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i)); 2731 i += 1u; 2732 } 2733 } else { 2734 u32 i = n; 2735 while (i >= 8u) { 2736 i -= 8u; 2737 rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i)); 2738 rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i)); 2739 } 2740 while (i >= 4u) { 2741 i -= 4u; 2742 rv64_emit32(mc, rv_lwu(RV_TMP3, sr, (i32)i)); 2743 rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i)); 2744 } 2745 while (i >= 2u) { 2746 i -= 2u; 2747 rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i)); 2748 rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i)); 2749 } 2750 while (i >= 1u) { 2751 i -= 1u; 2752 rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i)); 2753 rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i)); 2754 } 2755 } 2756 } 2757 2758 static void rv_intrinsic(NativeTarget* t, IntrinKind kind, 2759 const NativeLoc* dsts, u32 ndst, const NativeLoc* args, 2760 u32 narg) { 2761 RvNativeTarget* a = rv_of(t); 2762 MCEmitter* mc = t->mc; 2763 (void)ndst; 2764 (void)narg; 2765 switch (kind) { 2766 case INTRIN_NONE: 2767 break; 2768 case INTRIN_EXPECT: 2769 case INTRIN_ASSUME_ALIGNED: { 2770 /* dst = val (hint dropped). */ 2771 if (args[0].kind == NATIVE_LOC_IMM) 2772 rv_emit_load_imm(mc, rv_is_64(t, dsts[0].type) ? 1u : 0u, 2773 loc_reg(dsts[0]), args[0].v.imm); 2774 else 2775 rv_move(t, dsts[0], args[0]); 2776 return; 2777 } 2778 case INTRIN_PREFETCH: 2779 return; 2780 case INTRIN_TRAP: 2781 rv64_emit32(mc, rv_ebreak()); 2782 return; 2783 case INTRIN_BSWAP: { 2784 u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type); 2785 switch (width) { 2786 case 2: { 2787 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 2788 /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */ 2789 rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff)); 2790 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */ 2791 rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8)); 2792 rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2)); 2793 rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8)); 2794 rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff)); 2795 rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3)); 2796 return; 2797 } 2798 case 4: { 2799 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 2800 rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24)); 2801 rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff)); 2802 rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16)); 2803 rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); 2804 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); 2805 rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); 2806 rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8)); 2807 rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); 2808 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16)); 2809 rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); 2810 rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff)); 2811 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24)); 2812 rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2)); 2813 rv64_emit32(mc, rv_slli(rd, rd, 32)); 2814 rv64_emit32(mc, rv_srli(rd, rd, 32)); 2815 return; 2816 } 2817 case 8: { 2818 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 2819 int i; 2820 rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0)); 2821 for (i = 0; i < 8; ++i) { 2822 int sh = 56 - 8 * i; 2823 if (i == 0) { 2824 rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff)); 2825 } else { 2826 rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i))); 2827 rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff)); 2828 } 2829 if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh)); 2830 rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); 2831 } 2832 rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0)); 2833 return; 2834 } 2835 default: 2836 break; 2837 } 2838 return; 2839 } 2840 case INTRIN_POPCOUNT: { 2841 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 2842 int is64 = rv_is_64(t, args[0].type); 2843 rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0)); 2844 if (!is64) { 2845 rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32)); 2846 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32)); 2847 } 2848 rv_emit_popcount(mc, rd, is64); 2849 return; 2850 } 2851 case INTRIN_CTZ: { 2852 /* ctz(x) = popcount((x & -x) - 1) for x != 0. */ 2853 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 2854 int is64 = rv_is_64(t, args[0].type); 2855 rv64_emit32(mc, rv_sub(RV_TMP1, RV_ZERO, rs)); 2856 rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, rs)); 2857 rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, -1)); 2858 if (!is64) { 2859 rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32)); 2860 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32)); 2861 } 2862 rv_emit_popcount(mc, rd, is64); 2863 return; 2864 } 2865 case INTRIN_CLZ: { 2866 /* Fold the high bit downward, then clz = popcount(~folded). */ 2867 u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]); 2868 int is64 = rv_is_64(t, args[0].type); 2869 u32 shifts[6] = {1, 2, 4, 8, 16, 32}; 2870 u32 ns = is64 ? 6u : 5u, i; 2871 rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0)); 2872 if (!is64) { 2873 rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32)); 2874 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32)); 2875 } 2876 for (i = 0; i < ns; ++i) { 2877 rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, shifts[i])); 2878 rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2)); 2879 } 2880 rv64_emit32(mc, rv_xori(RV_TMP1, RV_TMP1, -1)); 2881 if (!is64) { 2882 rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32)); 2883 rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32)); 2884 } 2885 rv_emit_popcount(mc, rd, is64); 2886 return; 2887 } 2888 case INTRIN_SADD_OVERFLOW: 2889 case INTRIN_SSUB_OVERFLOW: { 2890 /* dsts: [val, ovf]. ADD: ovf=((a^r)&(b^r))>>(w-1); 2891 * SUB: ovf=((a^b)&(a^r))>>(w-1). */ 2892 int is64 = rv_is_64(t, dsts[0].type); 2893 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 2894 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 2895 u32 sh = is64 ? 63u : 31u; 2896 if (kind == INTRIN_SADD_OVERFLOW) 2897 rv64_emit32(mc, 2898 is64 ? rv_add(RV_TMP2, ra, rb) : rv_addw(RV_TMP2, ra, rb)); 2899 else 2900 rv64_emit32(mc, 2901 is64 ? rv_sub(RV_TMP2, ra, rb) : rv_subw(RV_TMP2, ra, rb)); 2902 rv64_emit32(mc, rv_xor(RV_TMP3, ra, RV_TMP2)); /* a ^ r */ 2903 if (kind == INTRIN_SADD_OVERFLOW) { 2904 rv64_emit32(mc, rv_xor(rovf, rb, RV_TMP2)); /* b ^ r */ 2905 rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3)); 2906 } else { 2907 rv64_emit32(mc, rv_xor(rovf, ra, rb)); /* a ^ b */ 2908 rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3)); 2909 } 2910 rv64_emit32(mc, 2911 is64 ? rv_srli(rovf, rovf, sh) : rv_srliw(rovf, rovf, sh)); 2912 rv64_emit32(mc, rv_andi(rovf, rovf, 1)); 2913 rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0)); 2914 return; 2915 } 2916 case INTRIN_UADD_OVERFLOW: 2917 case INTRIN_USUB_OVERFLOW: { 2918 int is64 = rv_is_64(t, dsts[0].type); 2919 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 2920 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 2921 if (!is64) { 2922 rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32)); 2923 rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32)); 2924 rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32)); 2925 rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32)); 2926 ra = RV_TMP2; 2927 rb = RV_TMP3; 2928 } 2929 if (kind == INTRIN_UADD_OVERFLOW) { 2930 if (is64) { 2931 rv64_emit32(mc, rv_add(RV_TMP2, ra, rb)); 2932 rv64_emit32(mc, rv_sltu(rovf, RV_TMP2, ra)); 2933 } else { 2934 rv64_emit32(mc, rv_add(RV_TMP2, ra, rb)); 2935 rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32)); 2936 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 2937 rv64_emit32(mc, rv_addiw(RV_TMP2, RV_TMP2, 0)); 2938 } 2939 } else { 2940 rv64_emit32(mc, rv_sltu(rovf, ra, rb)); 2941 rv64_emit32(mc, 2942 is64 ? rv_sub(RV_TMP2, ra, rb) : rv_subw(RV_TMP2, ra, rb)); 2943 } 2944 rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0)); 2945 return; 2946 } 2947 case INTRIN_SMUL_OVERFLOW: { 2948 int is64 = rv_is_64(t, dsts[0].type); 2949 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 2950 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 2951 if (is64) { 2952 rv64_emit32(mc, rv_mul(RV_TMP2, ra, rb)); 2953 rv64_emit32(mc, rv_mulh(RV_TMP3, ra, rb)); 2954 rv64_emit32(mc, rv_srai(rovf, RV_TMP2, 63)); 2955 rv64_emit32(mc, rv_xor(rovf, RV_TMP3, rovf)); 2956 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 2957 rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0)); 2958 } else { 2959 rv64_emit32(mc, rv_addiw(RV_TMP2, ra, 0)); 2960 rv64_emit32(mc, rv_addiw(RV_TMP3, rb, 0)); 2961 rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3)); 2962 rv64_emit32(mc, rv_addiw(RV_TMP3, RV_TMP2, 0)); 2963 rv64_emit32(mc, rv_xor(rovf, RV_TMP2, RV_TMP3)); 2964 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 2965 rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0)); 2966 } 2967 return; 2968 } 2969 case INTRIN_UMUL_OVERFLOW: { 2970 int is64 = rv_is_64(t, dsts[0].type); 2971 u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]); 2972 u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]); 2973 if (is64) { 2974 rv64_emit32(mc, rv_mulhu(rovf, ra, rb)); 2975 rv64_emit32(mc, rv_mul(rd, ra, rb)); 2976 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 2977 } else { 2978 rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32)); 2979 rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32)); 2980 rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32)); 2981 rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32)); 2982 rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3)); 2983 rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32)); 2984 rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf)); 2985 rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0)); 2986 } 2987 return; 2988 } 2989 case INTRIN_MEMCPY: 2990 case INTRIN_MEMMOVE: { 2991 u32 dr, sr, n; 2992 if (narg != 3u || args[0].kind != NATIVE_LOC_REG || 2993 args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM) 2994 rv_panic(a, "unsupported memory intrinsic operands"); 2995 if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) 2996 rv_panic(a, "unsupported memory intrinsic size"); 2997 dr = loc_reg(args[0]); 2998 sr = loc_reg(args[1]); 2999 n = (u32)args[2].v.imm; 3000 rv_intrin_copy(mc, dr, sr, n, kind == INTRIN_MEMMOVE); 3001 return; 3002 } 3003 case INTRIN_MEMSET: { 3004 u32 dr, n, src; 3005 if (narg != 3u || args[0].kind != NATIVE_LOC_REG || 3006 args[2].kind != NATIVE_LOC_IMM) 3007 rv_panic(a, "unsupported memset operands"); 3008 if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll) 3009 rv_panic(a, "unsupported memset size"); 3010 dr = loc_reg(args[0]); 3011 n = (u32)args[2].v.imm; 3012 if (args[1].kind == NATIVE_LOC_IMM) { 3013 u32 byte = (u32)(args[1].v.imm & 0xffu); 3014 if (byte == 0) { 3015 src = RV_ZERO; 3016 } else { 3017 u64 b = byte; 3018 b |= b << 8; 3019 b |= b << 16; 3020 b |= b << 32; 3021 rv_emit_load_imm(mc, 1, RV_TMP3, (i64)b); 3022 src = RV_TMP3; 3023 } 3024 } else { 3025 /* Replicate the low byte of a register value across 8 bytes. */ 3026 u32 rb = loc_reg(args[1]); 3027 rv64_emit32(mc, rv_andi(RV_TMP3, rb, 0xff)); 3028 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 8)); 3029 rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2)); 3030 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 16)); 3031 rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2)); 3032 rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 32)); 3033 rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2)); 3034 src = RV_TMP3; 3035 } 3036 { 3037 u32 i = 0; 3038 while (i + 8u <= n) { 3039 rv64_emit32(mc, rv_sd(src, dr, (i32)i)); 3040 i += 8u; 3041 } 3042 while (i + 4u <= n) { 3043 rv64_emit32(mc, rv_sw(src, dr, (i32)i)); 3044 i += 4u; 3045 } 3046 while (i + 2u <= n) { 3047 rv64_emit32(mc, rv_sh(src, dr, (i32)i)); 3048 i += 2u; 3049 } 3050 while (i < n) { 3051 rv64_emit32(mc, rv_sb(src, dr, (i32)i)); 3052 i += 1u; 3053 } 3054 } 3055 return; 3056 } 3057 case INTRIN_CPU_NOP: 3058 rv64_emit32(mc, rv_nop()); 3059 return; 3060 case INTRIN_CPU_YIELD: 3061 rv64_emit32(mc, rv_pause()); 3062 return; 3063 case INTRIN_ISB: 3064 rv64_emit32(mc, rv_fence_i()); 3065 return; 3066 case INTRIN_DMB: 3067 case INTRIN_DSB: 3068 rv64_emit32(mc, rv_fence_rw_rw()); 3069 return; 3070 case INTRIN_WFI: 3071 rv64_emit32(mc, rv_wfi()); 3072 return; 3073 default: 3074 break; 3075 } 3076 rv_panic(a, "unsupported compiler intrinsic"); 3077 } 3078 /* ============================ inline asm ============================ */ 3079 3080 _Noreturn static void rv_asm_panic_at(Compiler* c, SrcLoc loc, 3081 const char* msg) { 3082 compiler_panic(c, loc, "rv64 inline asm: %s", msg); 3083 } 3084 _Noreturn static void rv_asm_panic(NativeDirectTarget* d, const char* msg) { 3085 rv_asm_panic_at(d->base.c, d->loc, msg); 3086 } 3087 3088 /* constraint_body / constraint_early / match_index are shared 3089 * (cg/native_asm.h). */ 3090 3091 /* Build a bound register pseudo-operand in the rv64 inline shape. */ 3092 static void rv_asm_bound_reg(Operand* out, KitCgTypeId type, 3093 NativeAllocClass cls, Reg reg) { 3094 memset(out, 0, sizeof *out); 3095 out->kind = RV64_INLINE_OPK_REG; 3096 out->pad[0] = 3097 (cls == NATIVE_REG_FP) ? RV64_INLINE_OPCLS_FP : RV64_INLINE_OPCLS_INT; 3098 out->type = type; 3099 out->v.local = (CGLocal)reg; 3100 } 3101 static void rv_asm_bound_mem(Operand* out, KitCgTypeId type, Reg base) { 3102 memset(out, 0, sizeof *out); 3103 out->kind = OPK_INDIRECT; 3104 out->type = type; 3105 out->v.ind.base = (CGLocal)base; 3106 out->v.ind.index = CG_LOCAL_NONE; 3107 out->v.ind.ofs = 0; 3108 } 3109 3110 /* Parse a clobber register name into (class, reg). Returns 0 for the special 3111 * "cc"/"memory" clobbers and panics on an unknown register. RV64 dwarf: int 3112 * x0..x31 = 0..31, fp f0..f31 = 32..63. */ 3113 static int rv_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name, 3114 NativeAllocClass* cls_out, Reg* reg_out) { 3115 Slice s = pool_slice(c->global, name); 3116 char buf[16]; 3117 uint32_t dwarf; 3118 if (!s.s || !s.len) return 0; 3119 if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0; 3120 if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0; 3121 if (s.len >= sizeof buf) rv_asm_panic_at(c, loc, "clobber name is too long"); 3122 memcpy(buf, s.s, s.len); 3123 buf[s.len] = '\0'; 3124 if (rv64_register_index(buf, &dwarf) != 0) 3125 rv_asm_panic_at(c, loc, "unknown clobber register"); 3126 if (dwarf <= 31u) { 3127 *cls_out = NATIVE_REG_INT; 3128 *reg_out = (Reg)dwarf; 3129 return 1; 3130 } 3131 if (dwarf >= 32u && dwarf <= 63u) { 3132 *cls_out = NATIVE_REG_FP; 3133 *reg_out = (Reg)(dwarf - 32u); 3134 return 1; 3135 } 3136 rv_asm_panic_at(c, loc, "unsupported clobber register"); 3137 return 0; 3138 } 3139 3140 static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers, 3141 u32 nclob, u32* int_mask, u32* fp_mask) { 3142 u32 i; 3143 *int_mask = 0; 3144 *fp_mask = 0; 3145 for (i = 0; i < nclob; ++i) { 3146 NativeAllocClass cls; 3147 Reg reg; 3148 if (!rv_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, ®)) continue; 3149 if (cls == NATIVE_REG_INT) 3150 *int_mask |= 1u << reg; 3151 else 3152 *fp_mask |= 1u << reg; 3153 } 3154 } 3155 3156 static NativeAllocClass rv_asm_constraint_class(NativeDirectTarget* d, 3157 const char* body) { 3158 if (body[0] == 'r') return NATIVE_REG_INT; 3159 if (body[0] == 'f') return NATIVE_REG_FP; 3160 rv_asm_panic(d, "constraint is not a register constraint"); 3161 return NATIVE_REG_INT; 3162 } 3163 3164 static int rv_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg, 3165 const char* constraint, 3166 NativeAsmRegPin* pin) { 3167 NativeAsmRegPinStatus st = 3168 native_asm_resolve_pin(d->native, reg, constraint, pin); 3169 if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0; 3170 if (st != NATIVE_ASM_REG_PIN_OK) 3171 rv_asm_panic(d, native_asm_pin_status_message(st)); 3172 return 1; 3173 } 3174 3175 /* Pick a free register from the arch's caller-saved allocable pools for an 3176 * asm operand the direct path must self-allocate. */ 3177 static Reg rv_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls, 3178 u32* used_int, u32* used_fp) { 3179 /* int: a0..a7 (10..17) then t-temps that aren't emit scratch. */ 3180 static const Reg int_pool[] = {10u, 11u, 12u, 13u, 14u, 15u, 3181 16u, 17u, 29u, 30u, 31u}; 3182 /* fp: fa0..fa7 (10..17) then ft caller-saved. */ 3183 static const Reg fp_pool[] = {10u, 11u, 12u, 13u, 14u, 15u, 16u, 17u, 3184 4u, 5u, 6u, 7u, 28u, 29u, 30u, 31u}; 3185 const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool; 3186 u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0]) 3187 : (u32)(sizeof int_pool / sizeof int_pool[0]); 3188 u32* used = cls == NATIVE_REG_FP ? used_fp : used_int; 3189 u32 i; 3190 for (i = 0; i < n; ++i) { 3191 Reg r = pool[i]; 3192 if ((*used & (1u << r)) != 0) continue; 3193 *used |= 1u << r; 3194 return r; 3195 } 3196 rv_asm_panic(d, "out of registers for asm operands"); 3197 return REG_NONE; 3198 } 3199 3200 /* Direct (-O0) path: resolve a semantic Operand to a NativeAddr. */ 3201 static NativeAddr rv_direct_addr(NativeDirectTarget* d, Operand op) { 3202 NativeAddr addr; 3203 memset(&addr, 0, sizeof addr); 3204 switch ((OpKind)op.kind) { 3205 case OPK_LOCAL: 3206 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3207 addr.base.frame = d->locals[op.v.local - 1u].home; 3208 addr.base_type = op.type; 3209 return addr; 3210 case OPK_INDIRECT: 3211 addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE; 3212 addr.base.frame = d->locals[op.v.ind.base - 1u].home; 3213 addr.cls = d->locals[op.v.ind.base - 1u].cls; 3214 addr.base_type = d->locals[op.v.ind.base - 1u].type; 3215 addr.offset = op.v.ind.ofs; 3216 return addr; 3217 default: 3218 rv_asm_panic(d, "operand is not addressable"); 3219 } 3220 } 3221 3222 /* Materialize an OPK_INDIRECT (frame-value) base into a register, returning a 3223 * plain register-based NativeAddr. */ 3224 static NativeAddr rv_direct_materialize_addr(NativeDirectTarget* d, 3225 Operand op) { 3226 RvNativeTarget* a = rv_of(d->native); 3227 NativeAddr addr = rv_direct_addr(d, op); 3228 if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 3229 NativeLoc base = native_loc_reg(addr.base_type, NATIVE_REG_INT, RV_TMP1); 3230 NativeAddr load; 3231 memset(&load, 0, sizeof load); 3232 load.base_kind = NATIVE_ADDR_BASE_FRAME; 3233 load.base.frame = addr.base.frame; 3234 load.base_type = addr.base_type; 3235 rv_emit_mem(a, 1, base, load, 3236 native_mem_for_type(d->native, addr.base_type, 8)); 3237 addr.base_kind = NATIVE_ADDR_BASE_REG; 3238 addr.base.reg = RV_TMP1; 3239 } 3240 return addr; 3241 } 3242 3243 static void rv_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op, 3244 NativeLoc dst) { 3245 RvNativeTarget* a = rv_of(d->native); 3246 NativeAddr addr; 3247 memset(&addr, 0, sizeof addr); 3248 switch ((OpKind)op.kind) { 3249 case OPK_IMM: 3250 if ((NativeAllocClass)dst.cls != NATIVE_REG_INT) 3251 rv_asm_panic(d, "floating-point immediate asm input is unsupported"); 3252 d->native->load_imm(d->native, dst, op.v.imm); 3253 return; 3254 case OPK_LOCAL: 3255 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3256 addr.base.frame = d->locals[op.v.local - 1u].home; 3257 addr.base_type = op.type; 3258 rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0)); 3259 return; 3260 case OPK_GLOBAL: 3261 addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; 3262 addr.base.global.sym = op.v.global.sym; 3263 addr.base.global.addend = op.v.global.addend; 3264 addr.base_type = op.type; 3265 d->native->load_addr(d->native, dst, addr); 3266 return; 3267 case OPK_INDIRECT: 3268 addr = rv_direct_materialize_addr(d, op); 3269 rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0)); 3270 return; 3271 } 3272 rv_asm_panic(d, "unsupported asm input operand"); 3273 } 3274 3275 static void rv_direct_load_address_to_reg(NativeDirectTarget* d, Operand op, 3276 NativeLoc dst) { 3277 d->native->load_addr(d->native, dst, rv_direct_addr(d, op)); 3278 } 3279 3280 static void rv_direct_store_reg_to_operand(NativeDirectTarget* d, Operand op, 3281 NativeLoc src) { 3282 RvNativeTarget* a = rv_of(d->native); 3283 NativeAddr addr; 3284 memset(&addr, 0, sizeof addr); 3285 if (op.kind == OPK_LOCAL) { 3286 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3287 addr.base.frame = d->locals[op.v.local - 1u].home; 3288 addr.base_type = op.type; 3289 } else { 3290 addr = rv_direct_materialize_addr(d, op); 3291 } 3292 rv_emit_mem(a, 0, src, addr, native_mem_for_type(d->native, op.type, 0)); 3293 } 3294 3295 /* Callee-saved registers an asm block clobbers must be spilled/restored around 3296 * the block (the only ABI duty the allocator cannot discharge itself). */ 3297 typedef struct RvAsmSavedClobber { 3298 NativeFrameSlot slot; 3299 NativeAllocClass cls; 3300 Reg reg; 3301 KitCgTypeId type; 3302 } RvAsmSavedClobber; 3303 3304 static void rv_asm_save_one(RvNativeTarget* a, RvAsmSavedClobber* s) { 3305 NativeFrameSlotDesc desc; 3306 NativeAddr addr; 3307 memset(&desc, 0, sizeof desc); 3308 desc.type = s->type; 3309 desc.size = 8; 3310 desc.align = 8; 3311 desc.kind = NATIVE_FRAME_SLOT_SAVE; 3312 s->slot = a->base.frame_slot(&a->base, &desc); 3313 memset(&addr, 0, sizeof addr); 3314 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3315 addr.base.frame = s->slot; 3316 addr.base_type = s->type; 3317 rv_emit_mem(a, 0, native_loc_reg(s->type, s->cls, s->reg), addr, 3318 native_mem_for_type(&a->base, s->type, 8)); 3319 } 3320 static void rv_asm_restore_one(RvNativeTarget* a, const RvAsmSavedClobber* s) { 3321 NativeAddr addr; 3322 memset(&addr, 0, sizeof addr); 3323 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3324 addr.base.frame = s->slot; 3325 addr.base_type = s->type; 3326 rv_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr, 3327 native_mem_for_type(&a->base, s->type, 8)); 3328 } 3329 3330 /* psABI callee-saved: integer s0..s11 (x8,x9,x18..x27), fp fs0..fs11 3331 * (f8,f9,f18..f27). x8 is the frame pointer and never asm-clobbered. */ 3332 static int rv_reg_is_callee_int(Reg r) { 3333 return r == 9u || (r >= 18u && r <= 27u); 3334 } 3335 static int rv_reg_is_callee_fp(Reg r) { 3336 return r == 8u || r == 9u || (r >= 18u && r <= 27u); 3337 } 3338 3339 static RvAsmSavedClobber* rv_asm_save_callee_clobbers(RvNativeTarget* a, 3340 u32 int_mask, u32 fp_mask, 3341 u32* nsaved_out) { 3342 RvAsmSavedClobber* saved = 3343 arena_zarray(a->base.c->tu, RvAsmSavedClobber, 24u); 3344 KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64); 3345 KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64); 3346 u32 n = 0; 3347 Reg r; 3348 for (r = 0; r <= 31u; ++r) { 3349 if ((int_mask & (1u << r)) == 0 || !rv_reg_is_callee_int(r)) continue; 3350 saved[n].cls = NATIVE_REG_INT; 3351 saved[n].reg = r; 3352 saved[n].type = i64; 3353 rv_asm_save_one(a, &saved[n++]); 3354 } 3355 for (r = 0; r <= 31u; ++r) { 3356 if ((fp_mask & (1u << r)) == 0 || !rv_reg_is_callee_fp(r)) continue; 3357 saved[n].cls = NATIVE_REG_FP; 3358 saved[n].reg = r; 3359 saved[n].type = f64; 3360 rv_asm_save_one(a, &saved[n++]); 3361 } 3362 *nsaved_out = n; 3363 return saved; 3364 } 3365 3366 /* ---- NativeTarget (optimizer) asm hook ---- 3367 * The optimizer pre-allocated every operand register and arranged surrounding 3368 * data flow, so this binds pre-allocated registers to the template and only 3369 * materializes memory-operand bases into the reserved scratch + spills the 3370 * callee-saved registers the asm clobbers. */ 3371 3372 static NativeAddr rv_asm_loc_to_addr(RvNativeTarget* a, SrcLoc loc, 3373 NativeLoc src) { 3374 NativeAddr addr; 3375 memset(&addr, 0, sizeof addr); 3376 addr.base_type = src.type; 3377 switch ((NativeLocKind)src.kind) { 3378 case NATIVE_LOC_FRAME: 3379 addr.base_kind = NATIVE_ADDR_BASE_FRAME; 3380 addr.base.frame = src.v.frame; 3381 return addr; 3382 case NATIVE_LOC_ADDR: 3383 return src.v.addr; 3384 case NATIVE_LOC_GLOBAL: 3385 addr.base_kind = NATIVE_ADDR_BASE_GLOBAL; 3386 addr.base.global.sym = src.v.global.sym; 3387 addr.base.global.addend = src.v.global.addend; 3388 return addr; 3389 case NATIVE_LOC_REG: 3390 addr.base_kind = NATIVE_ADDR_BASE_REG; 3391 addr.cls = NATIVE_REG_INT; 3392 addr.base.reg = src.v.reg; 3393 return addr; 3394 default: 3395 rv_asm_panic_at(a->base.c, loc, "unsupported memory asm operand"); 3396 } 3397 } 3398 3399 /* Resolve a memory-constraint operand to a single base register with zero 3400 * offset, folding any frame/global/offset into a reserved scratch register. */ 3401 static Reg rv_asm_native_mem_base(RvNativeTarget* a, SrcLoc loc, NativeLoc src, 3402 u32* ntmp) { 3403 NativeAddr addr = rv_asm_loc_to_addr(a, loc, src); 3404 u32 base; 3405 i32 off; 3406 Reg dst; 3407 if (addr.index_kind != NATIVE_ADDR_INDEX_NONE) 3408 rv_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported"); 3409 rv_resolve_mem_addr(a, &addr, &base, &off); 3410 if (off == 0 && base != RV_TMP0 && base != RV_TMP1) return (Reg)base; 3411 if (*ntmp >= 2u) 3412 rv_asm_panic_at(a->base.c, loc, "too many memory asm operands"); 3413 dst = (*ntmp == 0u) ? RV_TMP0 : RV_TMP1; 3414 (*ntmp)++; 3415 rv_emit_addr_adjust(a->base.mc, dst, base, off); 3416 return dst; 3417 } 3418 3419 static void rv_asm_bind_native(RvNativeTarget* a, SrcLoc loc, Operand* out, 3420 const char* constraint, KitCgTypeId type, 3421 NativeLoc src, u32* ntmp) { 3422 const char* body = native_asm_constraint_body(constraint); 3423 if (body[0] == 'r' || body[0] == 'f') { 3424 NativeAllocClass cls = (body[0] == 'f') ? NATIVE_REG_FP : NATIVE_REG_INT; 3425 if (src.kind != NATIVE_LOC_REG) 3426 rv_asm_panic_at(a->base.c, loc, "register asm operand not in a register"); 3427 rv_asm_bound_reg(out, type, cls, (Reg)src.v.reg); 3428 } else if (body[0] == 'i') { 3429 if (src.kind != NATIVE_LOC_IMM) 3430 rv_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate"); 3431 memset(out, 0, sizeof *out); 3432 out->kind = OPK_IMM; 3433 out->type = type; 3434 out->v.imm = src.v.imm; 3435 } else if (body[0] == 'm') { 3436 rv_asm_bound_mem(out, type, rv_asm_native_mem_base(a, loc, src, ntmp)); 3437 } else { 3438 rv_asm_panic_at(a->base.c, loc, "unsupported asm constraint"); 3439 } 3440 } 3441 3442 static void rv_asm_block_native(NativeTarget* t, const char* tmpl, 3443 const AsmConstraint* outs, u32 nout, 3444 NativeLoc* out_locs, const AsmConstraint* ins, 3445 u32 nin, const NativeLoc* in_locs, 3446 const Sym* clobbers, u32 nclob) { 3447 RvNativeTarget* a = rv_of(t); 3448 Compiler* c = t->c; 3449 SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0}; 3450 Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; 3451 Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; 3452 u32 ntmp = 0, i; 3453 Rv64Asm* asmh; 3454 3455 for (i = 0; i < nout; ++i) { 3456 KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type; 3457 rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i], 3458 &ntmp); 3459 } 3460 for (i = 0; i < nin; ++i) { 3461 const char* body = native_asm_constraint_body(ins[i].str); 3462 int matched = native_asm_match_index(body); 3463 KitCgTypeId type; 3464 NativeLoc inloc; 3465 if (matched >= 0) { 3466 if ((u32)matched >= nout) 3467 rv_asm_panic_at(c, loc, "matching constraint out of range"); 3468 bound_ins[i] = bound_outs[matched]; 3469 continue; 3470 } 3471 type = ins[i].type ? ins[i].type : in_locs[i].type; 3472 inloc = in_locs[i]; 3473 /* A register-constrained input that lives in a frame slot (address-taken 3474 * local) must be loaded into a reserved scratch first. */ 3475 if (body[0] == 'r' && inloc.kind != NATIVE_LOC_REG) { 3476 Reg r; 3477 if (ntmp >= 2u) rv_asm_panic_at(c, loc, "too many memory asm operands"); 3478 r = (ntmp == 0u) ? RV_TMP0 : RV_TMP1; 3479 ntmp++; 3480 inloc = native_loc_reg(type, NATIVE_REG_INT, r); 3481 rv_emit_mem(a, 1, inloc, rv_asm_loc_to_addr(a, loc, in_locs[i]), 3482 native_mem_for_type(t, type, native_type_size(t, type))); 3483 } 3484 rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp); 3485 } 3486 3487 /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber 3488 * masks and rv_known_callee_saves folded the callee-saved ones into the 3489 * function's saved set, so the prologue/epilogue already preserve them. */ 3490 asmh = rv64_asm_open(c); 3491 rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, 3492 nclob); 3493 rv64_asm_run_template(asmh, t->mc, tmpl); 3494 rv64_asm_close(asmh); 3495 } 3496 /* file_scope_asm + finalize are shared (cg/native_asm.h). */ 3497 3498 static void rv_trap(NativeTarget* t) { rv64_emit32(t->mc, rv_ebreak()); } 3499 static void rv_set_loc(NativeTarget* t, SrcLoc loc) { 3500 rv_of(t)->loc = loc; 3501 if (t->mc->set_loc) t->mc->set_loc(t->mc, loc); 3502 } 3503 3504 /* ============================ construction ============================ */ 3505 3506 NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj, 3507 MCEmitter* mc) { 3508 RvNativeTarget* a = arena_znew(c->tu, RvNativeTarget); 3509 NativeTarget* t; 3510 if (!a) return NULL; 3511 t = &a->base; 3512 t->c = c; 3513 t->obj = obj; 3514 t->mc = mc; 3515 native_frame_init(&a->frame, c); 3516 t->regs = &rv_reg_info; 3517 t->class_for_type = native_class_for_type_fp_le8; 3518 t->imm_legal = rv_imm_legal; 3519 t->addr_legal = rv_addr_legal; 3520 t->func_begin = rv_func_begin; 3521 t->func_begin_known_frame = rv_func_begin_known_frame; 3522 t->note_frame_state = NULL; 3523 /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved 3524 * set; rv_func_begin_known_frame derives the records from the masks. */ 3525 t->reserve_callee_saves = rv_reserve_callee_saves; 3526 t->signature_stack_bytes = rv_signature_stack_bytes; 3527 t->call_stack_bytes = rv_call_stack_bytes; 3528 t->has_store_zero_reg = 1; 3529 t->store_zero_reg = RV_ZERO; 3530 t->func_end = rv_func_end; 3531 t->frame_slot = rv_frame_slot; 3532 t->frame_slot_debug_loc = rv_frame_slot_debug_loc; 3533 t->bind_param = rv_bind_native_param; 3534 t->label_new = rv_label_new; 3535 t->label_place = rv_label_place; 3536 t->jump = rv_jump; 3537 t->cmp_branch = rv_cmp_branch; 3538 t->indirect_branch = rv_indirect_branch; 3539 t->load_label_addr = rv_load_label_addr; 3540 t->move = rv_move; 3541 t->load_imm = rv_load_imm; 3542 t->load_const = rv_load_const; 3543 t->load_addr = rv_load_addr; 3544 t->load = rv_load; 3545 t->store = rv_store; 3546 t->tls_addr_of = rv_tls_addr_of; 3547 t->copy_bytes = rv_copy_bytes; 3548 t->set_bytes = rv_set_bytes; 3549 t->bitfield_load = rv_bitfield_load; 3550 t->bitfield_store = rv_bitfield_store; 3551 t->binop = rv_binop; 3552 t->unop = rv_unop; 3553 t->cmp = rv_cmp; 3554 t->convert = rv_convert; 3555 t->alloca_ = rv_alloca; 3556 t->spill = rv_spill; 3557 t->reload = rv_reload; 3558 t->plan_call = rv_plan_call; 3559 t->emit_call = rv_emit_call; 3560 t->plan_ret = rv_plan_ret; 3561 t->ret = rv_ret; 3562 t->atomic_load = rv_atomic_load; 3563 t->atomic_store = rv_atomic_store; 3564 t->atomic_rmw = rv_atomic_rmw; 3565 t->atomic_cas = rv_atomic_cas; 3566 t->fence = rv_fence; 3567 t->va_start_ = rv_va_start_native; 3568 t->va_arg_ = rv_va_arg_native; 3569 t->va_end_ = rv_va_end_native; 3570 t->va_copy_ = rv_va_copy_native; 3571 t->intrinsic = rv_intrinsic; 3572 t->asm_block = rv_asm_block_native; 3573 t->file_scope_asm = native_file_scope_asm; 3574 t->trap = rv_trap; 3575 t->set_loc = rv_set_loc; 3576 t->finalize = native_finalize; 3577 return t; 3578 } 3579 3580 /* ============================ NativeOps (-O0) ============================ */ 3581 3582 static void rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p, 3583 CGLocal local, NativeDirectLocal* l) { 3584 NativeLoc dst; 3585 (void)local; 3586 memset(&dst, 0, sizeof dst); 3587 dst.kind = NATIVE_LOC_FRAME; 3588 dst.type = p->type; 3589 dst.v.frame = l->home; 3590 rv_bind_native_param(d->native, p, dst); 3591 } 3592 3593 /* A sibling call is realizable when its outgoing stack-argument area fits the 3594 * window the caller itself received (so the args land in the caller's incoming 3595 * slots without overflowing into the caller's caller's frame). Register-only 3596 * calls (the common case) always qualify. Mirrors aa64's aa_no_tail. */ 3597 static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) { 3598 RvNativeTarget* a = rv_of(d->native); 3599 NativeCallDesc nd; 3600 NativeLoc* args = NULL; 3601 NativeLoc* results = NULL; 3602 u32 i, stack; 3603 if (a->frame.ncallee_saves) 3604 return "rv64 tail call: callee-saved registers in use"; 3605 memset(&nd, 0, sizeof nd); 3606 if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs); 3607 if (call->nresults) 3608 results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults); 3609 for (i = 0; i < call->nargs; ++i) { 3610 args[i].kind = NATIVE_LOC_FRAME; 3611 args[i].type = d->locals[call->args[i] - 1u].type; 3612 args[i].cls = d->locals[call->args[i] - 1u].cls; 3613 args[i].v.frame = d->locals[call->args[i] - 1u].home; 3614 } 3615 for (i = 0; i < call->nresults; ++i) { 3616 results[i].kind = NATIVE_LOC_FRAME; 3617 results[i].type = d->locals[call->results[i] - 1u].type; 3618 results[i].cls = d->locals[call->results[i] - 1u].cls; 3619 results[i].v.frame = d->locals[call->results[i] - 1u].home; 3620 } 3621 nd.fn_type = call->fn_type; 3622 nd.args = args; 3623 nd.results = results; 3624 nd.nargs = call->nargs; 3625 nd.nresults = call->nresults; 3626 stack = rv_call_stack_size(d->native, &nd); 3627 if (stack > a->incoming_stack_size) 3628 return "rv64 tail call: stack argument area too small"; 3629 return NULL; 3630 } 3631 3632 /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg` 3633 * and return a register-based NativeAddr. An OPK_LOCAL holds the va_list object 3634 * itself, so we take its frame address; an OPK_INDIRECT holds the pointer in 3635 * memory and must be loaded. The va cores use TMP1/TMP2 internally, so `reg` 3636 * must be distinct from those (callers pass TMP0 / TMP3). */ 3637 /* ap_addr is the pointer value &ap (the va_list object's address). For an 3638 * OPK_LOCAL the local HOLDS that pointer, so load its home value; an 3639 * OPK_INDIRECT names *(base+ofs), whose address base+ofs is the pointer. 3640 * Mirrors aa64's aa_direct_pointer_addr. */ 3641 static NativeAddr rv_direct_pointer_addr(NativeDirectTarget* d, Operand op) { 3642 RvNativeTarget* a = rv_of(d->native); 3643 NativeAddr addr; 3644 memset(&addr, 0, sizeof addr); 3645 if (op.kind == OPK_LOCAL) { 3646 NativeLoc base = native_loc_reg(op.type, NATIVE_REG_INT, RV_TMP1); 3647 NativeAddr load; 3648 memset(&load, 0, sizeof load); 3649 load.base_kind = NATIVE_ADDR_BASE_FRAME; 3650 load.base.frame = d->locals[op.v.local - 1u].home; 3651 load.base_type = op.type; 3652 rv_emit_mem(a, 1, base, load, native_mem_for_type(d->native, op.type, 8)); 3653 addr.base_kind = NATIVE_ADDR_BASE_REG; 3654 addr.base.reg = RV_TMP1; 3655 addr.base_type = op.type; 3656 return addr; 3657 } 3658 return rv_direct_materialize_addr(d, op); 3659 } 3660 3661 static NativeAddr rv_direct_va_base(NativeDirectTarget* d, Operand ap_addr, 3662 Reg reg) { 3663 NativeLoc dst = 3664 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); 3665 NativeAddr addr; 3666 d->native->load_addr(d->native, dst, rv_direct_pointer_addr(d, ap_addr)); 3667 memset(&addr, 0, sizeof addr); 3668 addr.base_kind = NATIVE_ADDR_BASE_REG; 3669 addr.cls = NATIVE_REG_INT; 3670 addr.base.reg = reg; 3671 addr.base_type = builtin_id(KIT_CG_BUILTIN_I64); 3672 return addr; 3673 } 3674 3675 static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) { 3676 rv_va_start_core(rv_of(d->native), rv_direct_va_base(d, ap_addr, RV_TMP3)); 3677 } 3678 static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, 3679 KitCgTypeId type) { 3680 RvNativeTarget* a = rv_of(d->native); 3681 int is_fp = cg_type_is_float(d->base.c, type); 3682 NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT, 3683 is_fp ? RV_FTMP0 : RV_TMP0); 3684 NativeAddr dst_addr; 3685 rv_va_arg_core(a, res, rv_direct_va_base(d, ap_addr, RV_TMP3), type); 3686 /* Store the fetched value back into the semantic destination. */ 3687 dst_addr = rv_direct_addr(d, dst); 3688 if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { 3689 NativeLoc base = 3690 native_loc_reg(dst_addr.base_type, NATIVE_REG_INT, RV_TMP1); 3691 NativeAddr load; 3692 memset(&load, 0, sizeof load); 3693 load.base_kind = NATIVE_ADDR_BASE_FRAME; 3694 load.base.frame = dst_addr.base.frame; 3695 load.base_type = dst_addr.base_type; 3696 rv_emit_mem(a, 1, base, load, 3697 native_mem_for_type(d->native, dst_addr.base_type, 8)); 3698 dst_addr.base_kind = NATIVE_ADDR_BASE_REG; 3699 dst_addr.base.reg = RV_TMP1; 3700 } 3701 rv_emit_mem( 3702 a, 0, res, dst_addr, 3703 native_mem_for_type(d->native, type, native_type_size(d->native, type))); 3704 } 3705 static void rv_va_end_(NativeDirectTarget* d, Operand ap_addr) { 3706 (void)d; 3707 (void)ap_addr; 3708 } 3709 static void rv_va_copy_(NativeDirectTarget* d, Operand dst, Operand src) { 3710 RvNativeTarget* a = rv_of(d->native); 3711 NativeAddr src_ap = rv_direct_va_base(d, src, RV_TMP0); 3712 NativeAddr dst_ap = rv_direct_va_base(d, dst, RV_TMP3); 3713 rv_va_copy_core(a, dst_ap, src_ap); 3714 } 3715 3716 static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl, 3717 const AsmConstraint* outs, u32 nout, 3718 Operand* out_ops, const AsmConstraint* ins, 3719 u32 nin, const Operand* in_ops, 3720 const Sym* clobbers, u32 nclob, 3721 u32 clobber_abi_sets) { 3722 RvNativeTarget* a = rv_of(d->native); 3723 Compiler* c = d->base.c; 3724 Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL; 3725 Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL; 3726 u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp; 3727 RvAsmSavedClobber* saved; 3728 u32 nsaved, i; 3729 Rv64Asm* asmh; 3730 3731 rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp); 3732 native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp); 3733 clob_int |= abi_int; 3734 clob_fp |= abi_fp; 3735 /* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer 3736 * so the operand allocator never hands them out. */ 3737 used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) | 3738 (1u << RV_GP) | (1u << RV_TP) | (1u << RV_TMP0) | (1u << RV_TMP1) | 3739 (1u << RV_TMP2) | (1u << RV_TMP3) | (1u << RV_S0); 3740 used_fp = 3741 clob_fp | (1u << RV_FTMP0) | (1u << RV_FTMP1) | (1u << 2u) | (1u << 3u); 3742 3743 for (i = 0; i < nout; ++i) { 3744 const char* body = native_asm_constraint_body(outs[i].str); 3745 KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; 3746 NativeAsmRegPin pin; 3747 if (rv_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) { 3748 /* GNU local register variable: pin to the named hard register. */ 3749 if (pin.cls == NATIVE_REG_FP) { 3750 used_fp |= 1u << pin.reg; 3751 clob_fp |= 1u << pin.reg; 3752 } else { 3753 used_int |= 1u << pin.reg; 3754 clob_int |= 1u << pin.reg; 3755 } 3756 rv_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg); 3757 } else if (body[0] == 'r' || body[0] == 'f') { 3758 NativeAllocClass cls = rv_asm_constraint_class(d, body); 3759 Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp); 3760 rv_asm_bound_reg(&bound_outs[i], type, cls, reg); 3761 } else if (body[0] == 'm') { 3762 Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); 3763 rv_asm_bound_mem(&bound_outs[i], type, reg); 3764 } else { 3765 rv_asm_panic(d, "unsupported output constraint"); 3766 } 3767 } 3768 3769 for (i = 0; i < nin; ++i) { 3770 const char* body = native_asm_constraint_body(ins[i].str); 3771 int matched = native_asm_match_index(body); 3772 KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; 3773 if (matched >= 0) { 3774 if ((u32)matched >= nout) 3775 rv_asm_panic(d, "matching constraint out of range"); 3776 if (native_asm_constraint_early(outs[matched].str)) 3777 rv_asm_panic(d, "matching input names early-clobber output"); 3778 if (bound_outs[matched].kind != RV64_INLINE_OPK_REG) 3779 rv_asm_panic(d, "matching constraint requires register output"); 3780 bound_ins[i] = bound_outs[matched]; 3781 continue; 3782 } 3783 NativeAsmRegPin pin; 3784 if (rv_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) { 3785 /* GNU local register variable: pin to the named hard register. */ 3786 if (pin.cls == NATIVE_REG_FP) { 3787 used_fp |= 1u << pin.reg; 3788 clob_fp |= 1u << pin.reg; 3789 } else { 3790 used_int |= 1u << pin.reg; 3791 clob_int |= 1u << pin.reg; 3792 } 3793 rv_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg); 3794 } else if (body[0] == 'r' || body[0] == 'f') { 3795 NativeAllocClass cls = rv_asm_constraint_class(d, body); 3796 Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp); 3797 rv_asm_bound_reg(&bound_ins[i], type, cls, reg); 3798 } else if (body[0] == 'i') { 3799 if (in_ops[i].kind != OPK_IMM) 3800 rv_asm_panic(d, "immediate constraint requires immediate operand"); 3801 bound_ins[i] = in_ops[i]; 3802 } else if (body[0] == 'm') { 3803 Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); 3804 rv_asm_bound_mem(&bound_ins[i], type, reg); 3805 } else { 3806 rv_asm_panic(d, "unsupported input constraint"); 3807 } 3808 } 3809 3810 saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); 3811 for (i = 0; i < nout; ++i) { 3812 if (bound_outs[i].kind == RV64_INLINE_OPK_REG) { 3813 NativeAllocClass cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP 3814 ? NATIVE_REG_FP 3815 : NATIVE_REG_INT; 3816 if (outs[i].dir == KIT_CG_ASM_INOUT) { 3817 rv_direct_load_operand_to_reg( 3818 d, out_ops[i], 3819 native_loc_reg(bound_outs[i].type, cls, 3820 (Reg)bound_outs[i].v.local)); 3821 } 3822 } else if (bound_outs[i].kind == OPK_INDIRECT) { 3823 NativeLoc loc = 3824 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 3825 (Reg)bound_outs[i].v.ind.base); 3826 rv_direct_load_address_to_reg(d, out_ops[i], loc); 3827 } 3828 } 3829 for (i = 0; i < nin; ++i) { 3830 if (bound_ins[i].kind == RV64_INLINE_OPK_REG) { 3831 NativeAllocClass cls = bound_ins[i].pad[0] == RV64_INLINE_OPCLS_FP 3832 ? NATIVE_REG_FP 3833 : NATIVE_REG_INT; 3834 rv_direct_load_operand_to_reg( 3835 d, in_ops[i], 3836 native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local)); 3837 } else if (bound_ins[i].kind == OPK_INDIRECT) { 3838 NativeLoc loc = 3839 native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, 3840 (Reg)bound_ins[i].v.ind.base); 3841 rv_direct_load_address_to_reg(d, in_ops[i], loc); 3842 } 3843 } 3844 asmh = rv64_asm_open(c); 3845 rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, 3846 nclob); 3847 rv64_asm_run_template(asmh, d->native->mc, tmpl); 3848 rv64_asm_close(asmh); 3849 3850 for (i = 0; i < nout; ++i) { 3851 NativeAllocClass cls; 3852 NativeLoc src; 3853 if (bound_outs[i].kind != RV64_INLINE_OPK_REG) continue; 3854 cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP ? NATIVE_REG_FP 3855 : NATIVE_REG_INT; 3856 src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local); 3857 rv_direct_store_reg_to_operand(d, out_ops[i], src); 3858 } 3859 for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]); 3860 } 3861 3862 static const NativeOps rv_direct_ops = { 3863 .bind_param = rv_bind_param, 3864 .tail_call_unrealizable_reason = rv_no_tail, 3865 .va_start_ = rv_va_start_, 3866 .va_arg_ = rv_va_arg_, 3867 .va_end_ = rv_va_end_, 3868 .va_copy_ = rv_va_copy_, 3869 .asm_block = rv_direct_asm_block, 3870 }; 3871 3872 const NativeOps* rv64_native_direct_ops(void) { return &rv_direct_ops; }