kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

native.c (147781B)


      1 /* src/arch/rv64/native.c — RISC-V (RV64GC, LP64D) NativeTarget implementation.
      2  *
      3  * Mirrors the aa64 reference (src/arch/aa64/native.c): a physical-emission
      4  * NativeTarget driven at -O0 by the shared NativeDirectTarget and at -O1+ by
      5  * the optimizer emit path. ABI decisions go through the abi/ interface; this
      6  * file owns only ISA emission and the RV64 frame layout.
      7  *
      8  * Frame model (single, top-record): s0 (x8) is the frame pointer anchored at
      9  * the saved s0/ra pair; slots live below s0 at positive byte offsets `off`
     10  * (address = s0 - off); outgoing args sit at the bottom of the frame (sp+0..).
     11  *   frame_size  = align16(16 + cum_off + callee_save_bytes + max_outgoing + va_save_sz)
     12  *   fp_pair_off = frame_size - 16 - va_save_sz   (saved pair, sp-relative)
     13  *   CFA = s0 + (frame_size - fp_pair_off)
     14  * RISC-V has no condition flags: comparisons materialize a 0/1 via SLT/SLTU or
     15  * FLT/FLE; branches compare two registers directly. x0 is a hardware zero. */
     16 
     17 #include <string.h>
     18 
     19 #include "abi/abi.h"
     20 #include "arch/rv64/asm.h"
     21 #include "arch/rv64/isa.h"
     22 #include "arch/rv64/regs.h"
     23 #include "arch/rv64/rv64.h"
     24 #include "asm/asm.h"
     25 #include "asm/asm_lex.h"
     26 #include "cg/native_argmove.h"
     27 #include "cg/native_asm.h"
     28 #include "cg/native_direct_target.h"
     29 #include "cg/native_frame.h"
     30 #include "cg/type.h"
     31 #include "core/arena.h"
     32 #include "core/bytes.h"
     33 #include "core/pool.h"
     34 #include "core/slice.h"
     35 #include "obj/obj.h"
     36 
     /* Fixed register numbers and buffer sizes used throughout the rv64 emitter. */
     enum {
       /* Bytes occupied by the saved s0/ra pair at the frame-pointer anchor. */
       RV_FRAME_SAVE_SIZE = 16u,

       /* Single-pass (-O0) worst-case prologue, in instruction words: sp adjust
        * (3) + far save pair (7) + sret spill (1) + variadic GP spills (8). No
        * callee-saves are taken at -O0. */
       RV_PROLOGUE_WORDS = 32u,

       /* Known-frame (-O1) prologues are emitted directly rather than into the
        * fixed -O0 NOP region. On top of the header, sret and variadic spills
        * they also save callee-saved registers (up to 11 int + 12 fp, each up to
        * 4 words for a far s0-relative offset), so the build buffer is sized for
        * that worst case. */
       RV_KNOWN_PROLOGUE_WORDS = 192u,

       /* Integer scratch registers owned by the emitter. */
       RV_TMP0 = 5u,  /* t0: reserved, never allocable */
       RV_TMP1 = 6u,  /* t1 */
       RV_TMP2 = 7u,  /* t2: reserved in the phys table */
       RV_TMP3 = 28u, /* t3: reserved in the phys table */

       /* Floating-point scratch registers owned by the emitter. */
       RV_FTMP0 = 0u, /* ft0 */
       RV_FTMP1 = 1u, /* ft1 */

       /* FP argument/return registers fa0..fa7 are f10..f17. */
       RV_FA0 = 10u,
       RV_FA7 = 17u,
     };
     56 
     /* Capacity of a callee-save collect array. The int bank (s1..s11, 11
      * registers) and the fp bank (fs0..fs11, 12 registers) are collected into
      * separate arrays, and each of them uses this cap. */
     #define RV_MAX_CALLEE_SAVES 16u
     /* NOTE(review): presumably the cap on register argument moves staged for a
      * single call; the use site is outside this view, so confirm. */
     #define RV_MAX_REG_ARG_MOVES 16u
     61 
     62 extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
     63 extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs,
     64                                 u32 end_ofs);
     65 
     66 /* ============================ low-level emit ============================ */
     67 
     68 void rv64_emit32(MCEmitter* mc, u32 word) {
     69   u8 b[4];
     70   u32 ofs = obj_pos(mc->obj, mc->section_id);
     71   wr_u32_le(b, word);
     72   mc->emit_bytes(mc, b, sizeof b);
     73   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
     74 }
     75 
     76 void rv64_emit16(MCEmitter* mc, u32 halfword) {
     77   u8 b[2];
     78   u32 ofs = obj_pos(mc->obj, mc->section_id);
     79   b[0] = (u8)(halfword & 0xff);
     80   b[1] = (u8)((halfword >> 8) & 0xff);
     81   mc->emit_bytes(mc, b, sizeof b);
     82   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
     83 }
     84 
     85 static void rv_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) {
     86   u8 b[4];
     87   wr_u32_le(b, word);
     88   obj_patch(obj, sec, off, b, sizeof b);
     89 }
     90 
     /* Returns 1 when `v` is representable as a signed 12-bit immediate
      * (-2048..2047), else 0. */
     static int fits_i12(i64 v) {
       if (v < -2048) return 0;
       return v <= 2047;
     }
     /* Returns 1 when `v` is representable as a signed 32-bit integer, else 0. */
     static int fits_i32(i64 v) {
       const i64 lowest = -(i64)2147483647 - 1;
       const i64 highest = (i64)2147483647;

       return lowest <= v && v <= highest;
     }
     95 
     /* Round `v` up to the next multiple of `align` using mask arithmetic, so
      * `align` is expected to be a power of two. An `align` of 0 returns `v`
      * unchanged. The addition wraps modulo 2^32 like any u32 arithmetic. */
     static u32 align_up_u32(u32 v, u32 align) {
       u32 low_bits;

       if (align == 0u) return v;
       low_bits = align - 1u;
       return (v + low_bits) & ~low_bits;
     }
    100 
    /* Divide `v` by 4096, rounding toward negative infinity (C's `/` truncates
     * toward zero, which is off by one for negative non-multiples). */
    static i64 floor_div_4096(i64 v) {
      i64 quotient = v / 4096;

      if (v % 4096 < 0) quotient -= 1;
      return quotient;
    }
    105 
    106 static void rv_emit_li32(MCEmitter* mc, u32 rd, i32 imm) {
    107   if (imm >= -2048 && imm <= 2047) {
    108     rv64_emit32(mc, rv_addi(rd, RV_ZERO, imm));
    109     return;
    110   }
    111   {
    112     i64 hi64 = floor_div_4096((i64)imm + 0x800);
    113     i32 hi = (i32)hi64;
    114     i32 lo = (i32)((i64)imm - hi64 * 4096);
    115     rv64_emit32(mc, rv_lui(rd, (u32)hi & 0xfffffu));
    116     if (lo) rv64_emit32(mc, rv_addiw(rd, rd, lo));
    117   }
    118 }
    119 
    /* Interpret the low 12 bits of `v` as a two's-complement value and return
     * it sign-extended (-2048..2047). Higher bits of `v` are ignored. */
    static i32 sext12(u32 v) {
      i32 low = (i32)(v & 0xfffu);

      if (low >= 2048) low -= 4096;
      return low;
    }
    124 
    125 static void rv_emit_li64(MCEmitter* mc, u32 rd, u64 imm) {
    126   if (fits_i32((i64)imm)) {
    127     rv_emit_li32(mc, rd, (i32)(i64)imm);
    128     return;
    129   }
    130   {
    131     i32 lo = sext12((u32)imm);
    132     u64 hi = (imm - (u64)(i64)lo) >> 12;
    133     rv_emit_li64(mc, rd, hi);
    134     rv64_emit32(mc, rv_slli(rd, rd, 12));
    135     if (lo) rv64_emit32(mc, rv_addi(rd, rd, lo));
    136   }
    137 }
    138 
    139 /* sf!=0 selects a full 64-bit materialization; sf==0 a 32-bit value. */
    140 static void rv_emit_load_imm(MCEmitter* mc, u32 sf, u32 rd, i64 imm) {
    141   if (!sf) {
    142     rv_emit_li32(mc, rd, (i32)imm);
    143     return;
    144   }
    145   if (fits_i32(imm))
    146     rv_emit_li32(mc, rd, (i32)imm);
    147   else
    148     rv_emit_li64(mc, rd, (u64)imm);
    149 }
    150 
    151 /* rd = base + off, materializing the offset when it exceeds imm12. Uses RV_TMP1
    152  * as scratch for the wide path, so callers must keep RV_TMP1 free. */
    153 static void rv_emit_addr_adjust(MCEmitter* mc, u32 rd, u32 base, i32 off) {
    154   if (off == 0) {
    155     if (rd != base) rv64_emit32(mc, rv_addi(rd, base, 0));
    156     return;
    157   }
    158   if (fits_i12(off)) {
    159     rv64_emit32(mc, rv_addi(rd, base, off));
    160     return;
    161   }
    162   rv_emit_load_imm(mc, 1, RV_TMP1, (i64)off);
    163   rv64_emit32(mc, rv_add(rd, base, RV_TMP1));
    164 }
    165 
    166 static u32 enc_int_store(u32 nbytes, u32 src, u32 base, i32 off) {
    167   switch (nbytes) {
    168     case 1:
    169       return rv_sb(src, base, off);
    170     case 2:
    171       return rv_sh(src, base, off);
    172     case 4:
    173       return rv_sw(src, base, off);
    174     default:
    175       return rv_sd(src, base, off);
    176   }
    177 }
    178 static u32 enc_int_load(u32 nbytes, int sign_ext, u32 rd, u32 base, i32 off) {
    179   switch (nbytes) {
    180     case 1:
    181       return sign_ext ? rv_lb(rd, base, off) : rv_lbu(rd, base, off);
    182     case 2:
    183       return sign_ext ? rv_lh(rd, base, off) : rv_lhu(rd, base, off);
    184     case 4:
    185       return sign_ext ? rv_lw(rd, base, off) : rv_lwu(rd, base, off);
    186     default:
    187       return rv_ld(rd, base, off);
    188   }
    189 }
    190 
    191 /* ============================ target state ============================ */
    192 
    /* Frame slots and callee-save records live in the shared NativeFrame
     * bookkeeping (cg/native_frame.h). These typedefs add no rv64-specific state;
     * they only keep the rv64-local spellings used throughout this file. */
    typedef NativeFrameSlotEntry RvNativeSlot;
    typedef NativeFrameCalleeSave RvCalleeSave;
    198 
    /* Kinds of deferred patch recorded in RvPatch. Only the alloca kind exists.
     * NOTE(review): presumably alloca sites are fixed up once the final frame
     * layout is known; the patching code is outside this view. */
    typedef enum RvPatchKind { RV_PATCH_ALLOCA } RvPatchKind;
    200 
    /* One deferred patch record, stored in RvNativeTarget.patches. */
    typedef struct RvPatch {
      u8 kind; /* RvPatchKind */
      u32 pos; /* NOTE(review): presumably the text offset to patch; confirm */
      u32 dst_reg; /* NOTE(review): presumably the register written at `pos` */
    } RvPatch;
    206 
    /* Per-function state of the rv64 NativeTarget. `base` is the first member,
     * which is what lets rv_of() cast a NativeTarget* back to this type. */
    typedef struct RvNativeTarget {
      NativeTarget base;
      SrcLoc loc; /* location reported by rv_panic */
      const CGFuncDesc* func;

      /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save
       * set, and the known_frame / has_alloca / frame_final flags. */
      NativeFrame frame;
      u32 frame_size_final;
      u32 fp_pair_off;
      /* Known-frame path: exact prologue length; 0 otherwise. */
      u32 minimal_prologue_words;

      /* Known-frame (-O1) leaf no-frame tier (aa64's slim_prologue equivalent),
       * settled in rv_func_begin_known_frame; always 0 on the single-pass path. A
       * leaf with no callee-saves, no body slots, no outgoing args, no
       * sret/variadic and register-only params never reads s0 nor clobbers ra, so
       * it emits NO prologue and a bare `ret` — the whole frame setup/teardown is
       * elided. RISC-V has no pre/post-indexed store, so aa64's fp_at_bottom fold
       * would save zero instructions on a kept frame and is intentionally not
       * ported (see doc/plan/ARCH.md §2); this leaf tier is the rv64 win. */
      u8 slim_prologue;

      u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */
      u32 next_param_int;
      u32 next_param_fp;
      u32 next_param_stack;
      u8 has_sret;
      /* Non-zero for variadic functions: enables the register-save area
       * (rv_va_save_sz) and shifts incoming stack args (rv_s0_off_in_arg). */
      u8 is_variadic;
      NativeFrameSlot sret_ptr_slot;

      /* Growable array of deferred patches: `npatches` used of `patches_cap`. */
      RvPatch* patches;
      u32 npatches;
      u32 patches_cap;
      u32 nalloca;

      u32 func_start;
      u32 prologue_pos;
      MCLabel epilogue_label;
    } RvNativeTarget;
    247 
    248 static RvNativeTarget* rv_of(NativeTarget* t) { return (RvNativeTarget*)t; }
    249 
    /* Abort compilation with an rv64-target diagnostic. `msg` is appended to the
     * "rv64 native target: " prefix and reported at the target's current source
     * location through compiler_panic; this function does not return. */
    static _Noreturn void rv_panic(RvNativeTarget* a, const char* msg) {
      compiler_panic(a->base.c, a->loc, "rv64 native target: %s", msg);
    }
    253 
    254 static RvNativeSlot* rv_slot_get(RvNativeTarget* a, NativeFrameSlot fs) {
    255   return native_frame_slot_at(&a->frame, fs);
    256 }
    257 
    258 /* s0-relative byte offset of a frame slot's base (address = s0 + ret). */
    259 static i32 rv_s0_off_slot(const RvNativeSlot* s) { return -(i32)s->off; }
    260 
    261 /* s0-relative byte offset of incoming stack arg at byte_off. Stack args sit
    262  * just above the saved pair; the 64-byte variadic GP save area (when present)
    263  * is contiguous with them at [s0+16). */
    264 static i32 rv_s0_off_in_arg(const RvNativeTarget* a, u32 byte_off) {
    265   u32 base = a->is_variadic ? 16u + 64u : 16u;
    266   return (i32)(base + byte_off);
    267 }
    268 
    269 static u32 rv_va_save_sz(const RvNativeTarget* a) {
    270   /* ABI-derived: the variadic register-save area is gp_reg_count*gp_slot_size
    271    * (a0..a7 = 64 bytes for LP64D). Only present in variadic functions. */
    272   return a->is_variadic ? native_frame_va_save_bytes(a->base.c->abi) : 0u;
    273 }
    274 
    275 /* Callee-saved registers are homed just below the locals at rv_save_off(), 8
    276  * bytes each โ€” they are NOT frame slots, so the frame size must reserve their
    277  * bytes explicitly. Zero at -O0 (no callee-saves are taken). */
    278 static u32 rv_callee_save_bytes(const RvNativeTarget* a) {
    279   return a->frame.ncallee_saves * 8u;
    280 }
    281 
    282 static u32 rv_frame_size(const RvNativeTarget* a) {
    283   u32 raw = RV_FRAME_SAVE_SIZE + a->frame.cum_off + rv_callee_save_bytes(a) +
    284             a->frame.max_outgoing + rv_va_save_sz(a);
    285   return align_up_u32(raw, 16u);
    286 }
    287 
    288 static u32 rv_fp_pair_off(const RvNativeTarget* a, u32 frame_size) {
    289   return frame_size - RV_FRAME_SAVE_SIZE - rv_va_save_sz(a);
    290 }
    291 
    292 /* ============================ type helpers ============================ */
    293 
    294 /* Scalar size/align/mem/class/loc constructors are shared in native_target.h
    295  * (native_type_size, native_type_align, native_mem_for_type,
    296  * native_class_for_type_fp_le8, native_loc_reg, native_loc_stack,
    297  * native_loc_is_fp). loc_reg's mask is arch-specific and stays here. */
    298 
    299 /* A scalar value occupies a 64-bit register when it is pointer-sized or wider,
    300  * else it is a 32-bit value (drives ADDW vs ADD selection etc). */
    301 static int rv_is_64(NativeTarget* t, KitCgTypeId type) {
    302   return native_type_size(t, type) >= 8u || cg_type_is_ptr(t->c, type);
    303 }
    304 
    305 static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; }
    306 
    307 /* ============================ register tables ============================ */
    308 
    /* Initializer macros for rows of the integer NativePhysRegInfo table. */

    /* Argument register `r` with ABI argument index `idx`: caller-saved, and
     * additionally flagged as a return register for the first two indices. Note
     * that it does not carry NATIVE_REG_ALLOCABLE. */
    #define RV_PHYS_INT_ARG(r, idx)                        \
      {.reg = (r),                                         \
       .cls = NATIVE_REG_INT,                              \
       .abi_index = (idx),                                 \
       .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
                ((idx) < 2u ? NATIVE_REG_RET : 0),         \
       .spill_cost = 1u,                                   \
       .copy_cost = 1u}
    /* Allocable caller-saved register with no ABI index (0xff). */
    #define RV_PHYS_INT_CALLER(r)                               \
      {.reg = (r),                                              \
       .cls = NATIVE_REG_INT,                                   \
       .abi_index = 0xffu,                                      \
       .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED * 0 | NATIVE_REG_CALLER_SAVED, \
       .spill_cost = 1u,                                        \
       .copy_cost = 1u}
    /* Allocable callee-saved register; its spill cost (4) is higher than a
     * caller-saved register's (1). */
    #define RV_PHYS_INT_CALLEE(r)                               \
      {.reg = (r),                                              \
       .cls = NATIVE_REG_INT,                                   \
       .abi_index = 0xffu,                                      \
       .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
       .spill_cost = 4u,                                        \
       .copy_cost = 1u}
    /* Reserved register: never allocated, zero costs. */
    #define RV_PHYS_INT_RESERVED(r)  \
      {.reg = (r),                   \
       .cls = NATIVE_REG_INT,        \
       .abi_index = 0xffu,           \
       .flags = NATIVE_REG_RESERVED, \
       .spill_cost = 0u,             \
       .copy_cost = 0u}
    338 
    /* Integer allocation order and driver scratch pool.
     *
     * t0..t3 (x5,x6,x7,x28) are emit-internal scratch (RV_TMP0..RV_TMP3), reserved
     * and never handed to the allocator or driver. t4/t5 are the driver scratch
     * pool (disjoint from the emit temps so a hook can never clobber an operand the
     * driver parked there). t6 is the lone caller-saved allocable (the -O0 cache's
     * only caller-saved home) and is listed first; s1..s11 follow as callee-saved
     * registers, chosen under pressure (and saved by the optimizer prologue at
     * -O1). */
    static const Reg rv_int_allocable[] = {31u, 9u,  18u, 19u, 20u, 21u,
                                           22u, 23u, 24u, 25u, 26u, 27u};
    static const Reg rv_int_scratch[] = {29u, 30u}; /* t4, t5 */
    348 
    /* Physical integer register table, one row per register in x0..x31 order:
     * x0..x8 reserved, s1 callee-saved, a0..a7 argument registers, s2..s11
     * callee-saved, t3..t5 reserved, t6 caller-saved allocable. */
    static const NativePhysRegInfo rv_int_phys[] = {
        RV_PHYS_INT_RESERVED(0u), /* zero */
        RV_PHYS_INT_RESERVED(1u), /* ra */
        RV_PHYS_INT_RESERVED(2u), /* sp */
        RV_PHYS_INT_RESERVED(3u), /* gp */
        RV_PHYS_INT_RESERVED(4u), /* tp */
        RV_PHYS_INT_RESERVED(5u), /* t0 = TMP0 */
        RV_PHYS_INT_RESERVED(6u), /* t1 = TMP1 */
        RV_PHYS_INT_RESERVED(7u), /* t2 = TMP2 (emit) */
        RV_PHYS_INT_RESERVED(8u), /* s0/fp */
        RV_PHYS_INT_CALLEE(9u),   /* s1 */
        /* a0..a7 with ABI argument indices 0..7 */
        RV_PHYS_INT_ARG(10u, 0u),  RV_PHYS_INT_ARG(11u, 1u),
        RV_PHYS_INT_ARG(12u, 2u),  RV_PHYS_INT_ARG(13u, 3u),
        RV_PHYS_INT_ARG(14u, 4u),  RV_PHYS_INT_ARG(15u, 5u),
        RV_PHYS_INT_ARG(16u, 6u),  RV_PHYS_INT_ARG(17u, 7u),
        /* s2..s11 */
        RV_PHYS_INT_CALLEE(18u),   RV_PHYS_INT_CALLEE(19u),
        RV_PHYS_INT_CALLEE(20u),   RV_PHYS_INT_CALLEE(21u),
        RV_PHYS_INT_CALLEE(22u),   RV_PHYS_INT_CALLEE(23u),
        RV_PHYS_INT_CALLEE(24u),   RV_PHYS_INT_CALLEE(25u),
        RV_PHYS_INT_CALLEE(26u),   RV_PHYS_INT_CALLEE(27u),
        RV_PHYS_INT_RESERVED(28u), /* t3 = TMP3 (emit) */
        RV_PHYS_INT_RESERVED(29u), /* t4 = driver scratch */
        RV_PHYS_INT_RESERVED(30u), /* t5 = driver scratch */
        RV_PHYS_INT_CALLER(31u),   /* t6 = caller-saved allocable */
    };
    374 
    /* Initializer macros for rows of the floating-point NativePhysRegInfo table;
     * they mirror the integer ones with cls = NATIVE_REG_FP. */

    /* FP argument register `r` with ABI argument index `idx`: caller-saved, and
     * additionally flagged as a return register for the first two indices. */
    #define RV_PHYS_FP_ARG(r, idx)                         \
      {.reg = (r),                                         \
       .cls = NATIVE_REG_FP,                               \
       .abi_index = (idx),                                 \
       .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
                ((idx) < 2u ? NATIVE_REG_RET : 0),         \
       .spill_cost = 1u,                                   \
       .copy_cost = 1u}
    /* Allocable caller-saved FP register with no ABI index (0xff). */
    #define RV_PHYS_FP_CALLER(r)                                \
      {.reg = (r),                                              \
       .cls = NATIVE_REG_FP,                                    \
       .abi_index = 0xffu,                                      \
       .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
       .spill_cost = 1u,                                        \
       .copy_cost = 1u}
    /* Allocable callee-saved FP register; its spill cost (4) is higher than a
     * caller-saved register's (1). */
    #define RV_PHYS_FP_CALLEE(r)                                \
      {.reg = (r),                                              \
       .cls = NATIVE_REG_FP,                                    \
       .abi_index = 0xffu,                                      \
       .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
       .spill_cost = 4u,                                        \
       .copy_cost = 1u}
    /* Reserved FP register: never allocated, zero costs. */
    #define RV_PHYS_FP_RESERVED(r)   \
      {.reg = (r),                   \
       .cls = NATIVE_REG_FP,         \
       .abi_index = 0xffu,           \
       .flags = NATIVE_REG_RESERVED, \
       .spill_cost = 0u,             \
       .copy_cost = 0u}
    404 
    /* FP allocation order and driver scratch pool. Caller-saved registers come
     * first (ft4..ft7 = f4..f7, then ft8..ft11 = f28..f31), followed by the
     * callee-saved fs0..fs11 (f8, f9, f18..f27). ft0/ft1 are reserved as
     * emit-internal scratch and ft2/ft3 are the driver scratch pool, so none of
     * those four appear in the allocable list. */
    static const Reg rv_fp_allocable[] = {4u,  5u,  6u,  7u,  28u, 29u, 30u,
                                          31u, 8u,  9u,  18u, 19u, 20u, 21u,
                                          22u, 23u, 24u, 25u, 26u, 27u};
    static const Reg rv_fp_scratch[] = {2u, 3u}; /* ft2, ft3 */
    411 
    /* Physical FP register table, one row per register in f0..f31 order:
     * ft0..ft3 reserved, ft4..ft7 caller-saved, fs0/fs1 callee-saved, fa0..fa7
     * argument registers (ABI indices 0..7), fs2..fs11 callee-saved, ft8..ft11
     * caller-saved. */
    static const NativePhysRegInfo rv_fp_phys[] = {
        RV_PHYS_FP_RESERVED(0u), /* ft0 = FTMP0 */
        RV_PHYS_FP_RESERVED(1u), /* ft1 = FTMP1 */
        RV_PHYS_FP_RESERVED(2u), /* ft2 = scratch */
        RV_PHYS_FP_RESERVED(3u), /* ft3 = scratch */
        RV_PHYS_FP_CALLER(4u),   RV_PHYS_FP_CALLER(5u),   RV_PHYS_FP_CALLER(6u),
        RV_PHYS_FP_CALLER(7u),   RV_PHYS_FP_CALLEE(8u),   RV_PHYS_FP_CALLEE(9u),
        RV_PHYS_FP_ARG(10u, 0u), RV_PHYS_FP_ARG(11u, 1u), RV_PHYS_FP_ARG(12u, 2u),
        RV_PHYS_FP_ARG(13u, 3u), RV_PHYS_FP_ARG(14u, 4u), RV_PHYS_FP_ARG(15u, 5u),
        RV_PHYS_FP_ARG(16u, 6u), RV_PHYS_FP_ARG(17u, 7u), RV_PHYS_FP_CALLEE(18u),
        RV_PHYS_FP_CALLEE(19u),  RV_PHYS_FP_CALLEE(20u),  RV_PHYS_FP_CALLEE(21u),
        RV_PHYS_FP_CALLEE(22u),  RV_PHYS_FP_CALLEE(23u),  RV_PHYS_FP_CALLEE(24u),
        RV_PHYS_FP_CALLEE(25u),  RV_PHYS_FP_CALLEE(26u),  RV_PHYS_FP_CALLEE(27u),
        RV_PHYS_FP_CALLER(28u),  RV_PHYS_FP_CALLER(29u),  RV_PHYS_FP_CALLER(30u),
        RV_PHYS_FP_CALLER(31u),
    };
    428 
    429 static const NativeAllocClassInfo rv_classes[] = {
    430     {.cls = NATIVE_REG_INT,
    431      .allocable = rv_int_allocable,
    432      .nallocable = sizeof rv_int_allocable / sizeof rv_int_allocable[0],
    433      .scratch = rv_int_scratch,
    434      .nscratch = sizeof rv_int_scratch / sizeof rv_int_scratch[0],
    435      .phys = rv_int_phys,
    436      .nphys = sizeof rv_int_phys / sizeof rv_int_phys[0],
    437      /* t0-t6 (5-7,28-31) + a0-a7 (10-17) */
    438      .caller_saved_mask = 0xf00400e0u | 0x0001fc00u,
    439      /* s0-s11 (8,9,18-27) */
    440      .callee_saved_mask = 0x0ffc0300u,
    441      .arg_mask = 0x0001fc00u,
    442      .ret_mask = 0x00000c00u,
    443      /* zero,ra,sp,gp,tp,t0,t1,t2,s0 (bits 0-8) + t3 (bit 28). t4/t5 are the
    444       * driver scratch pool (reserved-from-alloc but listed in scratch[]). */
    445      .reserved_mask = 0x000001ffu | (1u << 28)},
    446     {.cls = NATIVE_REG_FP,
    447      .allocable = rv_fp_allocable,
    448      .nallocable = sizeof rv_fp_allocable / sizeof rv_fp_allocable[0],
    449      .scratch = rv_fp_scratch,
    450      .nscratch = sizeof rv_fp_scratch / sizeof rv_fp_scratch[0],
    451      .phys = rv_fp_phys,
    452      .nphys = sizeof rv_fp_phys / sizeof rv_fp_phys[0],
    453      /* ft0-ft7 (0-7), fa0-fa7 (10-17), ft8-ft11 (28-31) */
    454      .caller_saved_mask = 0xf00400ffu | 0x0001fc00u,
    455      /* fs0-fs11 (8,9,18-27) */
    456      .callee_saved_mask = 0x0ffc0300u,
    457      .arg_mask = 0x0001fc00u,
    458      .ret_mask = 0x00000c00u,
    459      .reserved_mask = 0x0000000fu /* ft0-ft3 */},
    460 };
    461 
    462 /* Resolve a register name ("a7", "fa0", ...) to its (class, Reg). Powers the
    463  * optimizer's inline-asm clobber masks and explicit hard-register operands
    464  * ("{a7}" from a GNU local register variable). x0..x31 are DWARF 0..31; the
    465  * FP bank f0..f31 is DWARF 32..63. Returns non-zero for a non-register name
    466  * (cc/memory/unknown), which the caller skips. */
    467 static int rv_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out,
    468                            NativeAllocClass* cls_out) {
    469   char buf[16];
    470   uint32_t dwarf;
    471   (void)ri;
    472   if (!name.s || !name.len || name.len >= sizeof buf) return 1;
    473   memcpy(buf, name.s, name.len);
    474   buf[name.len] = '\0';
    475   if (rv64_register_index(buf, &dwarf) != 0) return 1;
    476   if (dwarf <= 31u) {
    477     *cls_out = NATIVE_REG_INT;
    478     *out = (Reg)dwarf;
    479     return 0;
    480   }
    481   if (dwarf >= 32u && dwarf <= 63u) {
    482     *cls_out = NATIVE_REG_FP;
    483     *out = (Reg)(dwarf - 32u);
    484     return 0;
    485   }
    486   return 1;
    487 }
    488 
    489 static int rv_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls,
    490                                  Reg reg) {
    491   (void)ri;
    492   if (cls == NATIVE_REG_INT) {
    493     if (reg == 9u) return 1;                /* s1 */
    494     if (reg >= 10u && reg <= 17u) return 1; /* a0..a7 */
    495     if (reg >= 18u && reg <= 27u) return 1; /* s2..s11 */
    496     if (reg == 31u) return 1;               /* t6 */
    497     return 0;
    498   }
    499   if (cls == NATIVE_REG_FP) return reg >= 4u && reg <= 31u;
    500   return 0;
    501 }
    502 
    /* Register description of the rv64 target: the two allocation classes (int,
     * fp) plus the name-resolution and inline-asm operand-legality callbacks
     * defined above. */
    static const NativeRegInfo rv_reg_info = {
        .classes = rv_classes,
        .nclasses = sizeof rv_classes / sizeof rv_classes[0],
        .resolve_name = rv_resolve_name,
        .asm_operand_reg_ok = rv_asm_operand_reg_ok,
    };
    509 
    510 /* ============================ legality ============================ */
    511 
    512 static int rv_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
    513                         KitCgTypeId type, i64 imm) {
    514   (void)t;
    515   (void)type;
    516   switch (use) {
    517     case NATIVE_IMM_MOVE:
    518       return 1;
    519     case NATIVE_IMM_BINOP:
    520       switch ((BinOp)op) {
    521         case BO_IADD:
    522           return fits_i12(imm);
    523         case BO_ISUB:
    524           return fits_i12(-imm); /* emitted as ADDI with negated imm */
    525         case BO_AND:
    526         case BO_OR:
    527         case BO_XOR:
    528           return fits_i12(imm);
    529         case BO_SHL:
    530         case BO_SHR_S:
    531         case BO_SHR_U:
    532           return imm >= 0 && imm <= 63;
    533         default:
    534           return 0;
    535       }
    536     case NATIVE_IMM_CMP:
    537       return imm == 0; /* compares need both ends in registers (SLT/branch) */
    538     case NATIVE_IMM_ADDR_OFFSET:
    539       return fits_i12(imm);
    540   }
    541   return 0;
    542 }
    543 
    544 static int rv_addr_legal(NativeTarget* t, const NativeAddr* addr,
    545                          MemAccess mem) {
    546   (void)t;
    547   (void)mem;
    548   if (!addr) return 0;
    549   if (addr->index_kind != NATIVE_ADDR_INDEX_NONE) return 0;
    550   if (addr->base_kind != NATIVE_ADDR_BASE_REG &&
    551       addr->base_kind != NATIVE_ADDR_BASE_FRAME)
    552     return 0;
    553   return fits_i12(addr->offset);
    554 }
    555 
    556 /* ============================ memory ============================ */
    557 
    558 /* Materialize the runtime address of a global into `dst`, including addend. */
    559 static void rv_emit_global_addr(RvNativeTarget* a, u32 dst, ObjSymId sym,
    560                                 i64 addend) {
    561   NativeTarget* t = &a->base;
    562   MCEmitter* mc = t->mc;
    563   u32 sec = mc->section_id;
    564   if (obj_symbol_extern_via_got(t->c, t->obj, sym)) {
    565     u32 ap = mc->pos(mc);
    566     rv64_emit32(mc, rv_auipc(dst, 0));
    567     mc->emit_reloc_at(mc, sec, ap, R_RV_GOT_HI20, sym, 0, 0, 0);
    568     {
    569       Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
    570       ObjSymId anchor =
    571           obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
    572       u32 lp = mc->pos(mc);
    573       rv64_emit32(mc, rv_ld(dst, dst, 0));
    574       mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
    575     }
    576   } else {
    577     u32 ap = mc->pos(mc);
    578     rv64_emit32(mc, rv_auipc(dst, 0));
    579     mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0);
    580     {
    581       Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
    582       ObjSymId anchor =
    583           obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
    584       u32 lp = mc->pos(mc);
    585       rv64_emit32(mc, rv_addi(dst, dst, 0));
    586       mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
    587     }
    588   }
    589   if (addend) rv_emit_addr_adjust(mc, dst, dst, (i32)addend);
    590 }
    591 
    592 /* Fold (base_reg << 0) + (index << scale) into RV_TMP0 via Zba. */
    593 static u32 rv_fold_index(RvNativeTarget* a, u32 base, u32 idx, u8 log2_scale) {
    594   MCEmitter* mc = a->base.mc;
    595   switch (log2_scale) {
    596     case 0:
    597       rv64_emit32(mc, rv_add(RV_TMP0, base, idx));
    598       break;
    599     case 1:
    600       rv64_emit32(mc, rv_sh1add(RV_TMP0, idx, base));
    601       break;
    602     case 2:
    603       rv64_emit32(mc, rv_sh2add(RV_TMP0, idx, base));
    604       break;
    605     default:
    606       rv64_emit32(mc, rv_sh3add(RV_TMP0, idx, base));
    607       break;
    608   }
    609   return RV_TMP0;
    610 }
    611 
    612 /* Resolve any NativeAddr to a base register + imm12 offset. RISC-V has no
    613  * indexed load/store, so an index is folded into RV_TMP0 via Zba; far offsets
    614  * and FRAME/FRAME_VALUE/GLOBAL bases are materialized into RV_TMP0/RV_TMP1. */
    615 static void rv_resolve_mem_addr(RvNativeTarget* a, const NativeAddr* addr,
    616                                 u32* base_out, i32* off_out) {
    617   MCEmitter* mc = a->base.mc;
    618   u32 base;
    619   i32 off;
    620   switch (addr->base_kind) {
    621     case NATIVE_ADDR_BASE_REG:
    622       base = addr->base.reg & 0x1fu;
    623       off = addr->offset;
    624       break;
    625     case NATIVE_ADDR_BASE_FRAME: {
    626       RvNativeSlot* s = rv_slot_get(a, addr->base.frame);
    627       base = RV_S0;
    628       off = rv_s0_off_slot(s) + addr->offset;
    629       break;
    630     }
    631     case NATIVE_ADDR_BASE_FRAME_VALUE: {
    632       RvNativeSlot* s = rv_slot_get(a, addr->base.frame);
    633       rv64_emit32(mc, rv_ld(RV_TMP0, RV_S0, rv_s0_off_slot(s)));
    634       base = RV_TMP0;
    635       off = addr->offset;
    636       break;
    637     }
    638     case NATIVE_ADDR_BASE_GLOBAL:
    639       rv_emit_global_addr(a, RV_TMP0, addr->base.global.sym,
    640                           addr->base.global.addend);
    641       base = RV_TMP0;
    642       off = addr->offset;
    643       break;
    644     default:
    645       rv_panic(a, "unsupported address base");
    646   }
    647   if (addr->index_kind == NATIVE_ADDR_INDEX_REG) {
    648     base = rv_fold_index(a, base, addr->index.reg & 0x1fu, addr->log2_scale);
    649   } else if (addr->index_kind == NATIVE_ADDR_INDEX_FRAME_VALUE) {
    650     RvNativeSlot* s = rv_slot_get(a, addr->index.frame);
    651     rv64_emit32(mc, rv_ld(RV_TMP1, RV_S0, rv_s0_off_slot(s)));
    652     base = rv_fold_index(a, base, RV_TMP1, addr->log2_scale);
    653   }
    654   if (!fits_i12(off)) {
    655     rv_emit_load_imm(mc, 1, RV_TMP1, (i64)off);
    656     rv64_emit32(mc, rv_add(RV_TMP0, base, RV_TMP1));
    657     base = RV_TMP0;
    658     off = 0;
    659   }
    660   *base_out = base;
    661   *off_out = off;
    662 }
    663 
    664 /* Central load/store primitive. is_load: 1 load into reg, 0 store reg to mem.
    665  */
    666 static void rv_emit_mem(RvNativeTarget* a, int is_load, NativeLoc reg,
    667                         NativeAddr addr, MemAccess mem) {
    668   NativeTarget* t = &a->base;
    669   MCEmitter* mc = t->mc;
    670   u32 r = loc_reg(reg);
    671   int fp = native_loc_is_fp(reg);
    672   u32 sz = mem.size ? mem.size : native_type_size(t, reg.type);
    673   u32 base;
    674   i32 off;
    675 
    676   rv_resolve_mem_addr(a, &addr, &base, &off);
    677   if (fp) {
    678     rv64_emit32(
    679         mc, is_load ? (sz == 8u ? rv_fld(r, base, off) : rv_flw(r, base, off))
    680                     : (sz == 8u ? rv_fsd(r, base, off) : rv_fsw(r, base, off)));
    681   } else {
    682     rv64_emit32(mc, is_load ? enc_int_load(sz, 0, r, base, off)
    683                             : enc_int_store(sz, r, base, off));
    684   }
    685 }
    686 
    687 /* ============================ moves / data ============================ */
    688 
    689 static void rv_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
    690   MCEmitter* mc = t->mc;
    691   int dfp = native_loc_is_fp(dst), sfp = native_loc_is_fp(src);
    692   u32 rd = loc_reg(dst), rs = loc_reg(src);
    693   if (dfp && sfp) {
    694     u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
    695     if (rd == rs) return;
    696     rv64_emit32(mc, rv_fsgnj(fmt, rd, rs, rs));
    697     return;
    698   }
    699   if (!dfp && sfp) {
    700     u32 sz = native_type_size(t, src.type);
    701     rv64_emit32(mc, sz == 8u ? rv_fmv_x_d(rd, rs) : rv_fmv_x_w(rd, rs));
    702     return;
    703   }
    704   if (dfp && !sfp) {
    705     u32 sz = native_type_size(t, dst.type);
    706     rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(rd, rs) : rv_fmv_w_x(rd, rs));
    707     return;
    708   }
    709   if (rd == rs) return;
    710   rv64_emit32(mc, rv_addi(rd, rs, 0));
    711 }
    712 
    713 static void rv_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) {
    714   rv_emit_load_imm(t->mc, rv_is_64(t, dst.type) ? 1u : 0u, loc_reg(dst), imm);
    715 }
    716 
    717 static void rv_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) {
    718   RvNativeTarget* a = rv_of(t);
    719   u64 v = 0;
    720   u32 i;
    721   if (!native_loc_is_fp(dst)) {
    722     for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
    723     rv_load_imm(t, dst, (i64)v);
    724     return;
    725   }
    726   /* FP constant: materialize the bit pattern in TMP0, bitcast into the FPR. */
    727   for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
    728   rv_emit_load_imm(t->mc, 1, RV_TMP0, (i64)v);
    729   if (cb.size == 8u)
    730     rv64_emit32(t->mc, rv_fmv_d_x(loc_reg(dst), RV_TMP0));
    731   else
    732     rv64_emit32(t->mc, rv_fmv_w_x(loc_reg(dst), RV_TMP0));
    733   (void)a;
    734 }
    735 
    736 static void rv_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
    737   RvNativeTarget* a = rv_of(t);
    738   MCEmitter* mc = t->mc;
    739   u32 rd = loc_reg(dst);
    740   u32 base;
    741   i32 off;
    742   if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL) {
    743     rv_emit_global_addr(a, rd, addr.base.global.sym,
    744                         addr.base.global.addend + addr.offset);
    745     base = rd;
    746     off = 0;
    747   } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
    748     /* Load the pointer stored in the frame slot, then add the offset. */
    749     RvNativeSlot* s = rv_slot_get(a, addr.base.frame);
    750     rv64_emit32(mc, rv_ld(rd, RV_S0, rv_s0_off_slot(s)));
    751     base = rd;
    752     off = addr.offset;
    753   } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME) {
    754     RvNativeSlot* s = rv_slot_get(a, addr.base.frame);
    755     base = RV_S0;
    756     off = rv_s0_off_slot(s) + addr.offset;
    757   } else if (addr.base_kind == NATIVE_ADDR_BASE_REG) {
    758     base = addr.base.reg & 0x1fu;
    759     off = addr.offset;
    760   } else {
    761     rv_panic(a, "unsupported address base in load_addr");
    762   }
    763   /* Fold any index via Zba sh{1,2,3}add (index << scale) + base. */
    764   if (addr.index_kind == NATIVE_ADDR_INDEX_REG) {
    765     u32 idx = addr.index.reg & 0x1fu;
    766     if (off != 0 || base != rd) rv_emit_addr_adjust(mc, rd, base, off);
    767     switch (addr.log2_scale) {
    768       case 0:
    769         rv64_emit32(mc, rv_add(rd, rd, idx));
    770         break;
    771       case 1:
    772         rv64_emit32(mc, rv_sh1add(rd, idx, rd));
    773         break;
    774       case 2:
    775         rv64_emit32(mc, rv_sh2add(rd, idx, rd));
    776         break;
    777       default:
    778         rv64_emit32(mc, rv_sh3add(rd, idx, rd));
    779         break;
    780     }
    781     return;
    782   }
    783   rv_emit_addr_adjust(mc, rd, base, off);
    784 }
    785 
    786 static void rv_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
    787                     MemAccess mem) {
    788   rv_emit_mem(rv_of(t), 1, dst, addr, mem);
    789 }
    790 static void rv_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
    791                      MemAccess mem) {
    792   rv_emit_mem(rv_of(t), 0, src, addr, mem);
    793 }
    794 
    795 /* copy_bytes: resolve dst and src to dedicated pointer regs (RV_TMP3 / RV_TMP0)
    796  * once, then copy granule-by-granule advancing both pointers. dst is resolved
    797  * first because its base may itself live in RV_TMP1 (the transfer reg, e.g. the
    798  * sret pointer from plan_ret); capturing it into RV_TMP3 before src resolution
    799  * (which may clobber RV_TMP1 for far offsets) keeps it live. Advancing the
    800  * pointers keeps every load/store at offset 0, so no offset ever exceeds imm12
    801  * and the transfer reg never aliases a base. */
    802 static void rv_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
    803                           AggregateAccess access) {
    804   MCEmitter* mc = t->mc;
    805   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
    806   u32 rem = access.size;
    807   rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst);
    808   rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0), src);
    809   while (rem) {
    810     u32 sz = rem >= 8u ? 8u : rem >= 4u ? 4u : rem >= 2u ? 2u : 1u;
    811     rv64_emit32(mc, enc_int_load(sz, 0, RV_TMP1, RV_TMP0, 0));
    812     rv64_emit32(mc, enc_int_store(sz, RV_TMP1, RV_TMP3, 0));
    813     rv64_emit32(mc, rv_addi(RV_TMP0, RV_TMP0, (i32)sz));
    814     rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, (i32)sz));
    815     rem -= sz;
    816   }
    817 }
    818 
    819 static void rv_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
    820                          AggregateAccess access) {
    821   MCEmitter* mc = t->mc;
    822   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
    823   u32 bv = loc_reg(byte_value);
    824   u32 rem = access.size;
    825   rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst);
    826   while (rem) {
    827     rv64_emit32(mc, rv_sb(bv, RV_TMP3, 0));
    828     rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, 1));
    829     rem -= 1u;
    830   }
    831 }
    832 
    833 /* ============================ arithmetic ============================ */
    834 
    835 static void rv_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop,
    836                      NativeLoc bop) {
    837   MCEmitter* mc = t->mc;
    838   u32 rd = loc_reg(dst);
    839   u32 ra = loc_reg(aop);
    840   int sf = rv_is_64(t, dst.type);
    841   int b_imm = bop.kind == NATIVE_LOC_IMM;
    842   u32 rb = b_imm ? 0u : loc_reg(bop);
    843   i64 imm = b_imm ? bop.v.imm : 0;
    844 
    845   switch (op) {
    846     case BO_FADD:
    847     case BO_FSUB:
    848     case BO_FMUL:
    849     case BO_FDIV: {
    850       u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
    851       switch (op) {
    852         case BO_FADD:
    853           rv64_emit32(mc, rv_fadd(fmt, rd, ra, rb));
    854           break;
    855         case BO_FSUB:
    856           rv64_emit32(mc, rv_fsub(fmt, rd, ra, rb));
    857           break;
    858         case BO_FMUL:
    859           rv64_emit32(mc, rv_fmul(fmt, rd, ra, rb));
    860           break;
    861         default:
    862           rv64_emit32(mc, rv_fdiv(fmt, rd, ra, rb));
    863           break;
    864       }
    865       return;
    866     }
    867     case BO_IADD:
    868       if (b_imm) {
    869         rv64_emit32(
    870             mc, sf ? rv_addi(rd, ra, (i32)imm) : rv_addiw(rd, ra, (i32)imm));
    871       } else {
    872         rv64_emit32(mc, sf ? rv_add(rd, ra, rb) : rv_addw(rd, ra, rb));
    873       }
    874       return;
    875     case BO_ISUB:
    876       if (b_imm) {
    877         rv64_emit32(
    878             mc, sf ? rv_addi(rd, ra, (i32)-imm) : rv_addiw(rd, ra, (i32)-imm));
    879       } else {
    880         rv64_emit32(mc, sf ? rv_sub(rd, ra, rb) : rv_subw(rd, ra, rb));
    881       }
    882       return;
    883     case BO_IMUL:
    884       rv64_emit32(mc, sf ? rv_mul(rd, ra, rb) : rv_mulw(rd, ra, rb));
    885       return;
    886     case BO_SDIV:
    887       rv64_emit32(mc, sf ? rv_div(rd, ra, rb) : rv_divw(rd, ra, rb));
    888       return;
    889     case BO_UDIV:
    890       rv64_emit32(mc, sf ? rv_divu(rd, ra, rb) : rv_divuw(rd, ra, rb));
    891       return;
    892     case BO_SREM:
    893       rv64_emit32(mc, sf ? rv_rem(rd, ra, rb) : rv_remw(rd, ra, rb));
    894       return;
    895     case BO_UREM:
    896       rv64_emit32(mc, sf ? rv_remu(rd, ra, rb) : rv_remuw(rd, ra, rb));
    897       return;
    898     case BO_AND:
    899       rv64_emit32(mc, b_imm ? rv_andi(rd, ra, (i32)imm) : rv_and(rd, ra, rb));
    900       return;
    901     case BO_OR:
    902       rv64_emit32(mc, b_imm ? rv_ori(rd, ra, (i32)imm) : rv_or(rd, ra, rb));
    903       return;
    904     case BO_XOR:
    905       rv64_emit32(mc, b_imm ? rv_xori(rd, ra, (i32)imm) : rv_xor(rd, ra, rb));
    906       return;
    907     case BO_SHL:
    908       if (b_imm)
    909         rv64_emit32(mc, sf ? rv_slli(rd, ra, (u32)imm & 63u)
    910                            : rv_slliw(rd, ra, (u32)imm & 31u));
    911       else
    912         rv64_emit32(mc, sf ? rv_sll(rd, ra, rb) : rv_sllw(rd, ra, rb));
    913       return;
    914     case BO_SHR_U:
    915       if (b_imm)
    916         rv64_emit32(mc, sf ? rv_srli(rd, ra, (u32)imm & 63u)
    917                            : rv_srliw(rd, ra, (u32)imm & 31u));
    918       else
    919         rv64_emit32(mc, sf ? rv_srl(rd, ra, rb) : rv_srlw(rd, ra, rb));
    920       return;
    921     case BO_SHR_S:
    922       if (b_imm)
    923         rv64_emit32(mc, sf ? rv_srai(rd, ra, (u32)imm & 63u)
    924                            : rv_sraiw(rd, ra, (u32)imm & 31u));
    925       else
    926         rv64_emit32(mc, sf ? rv_sra(rd, ra, rb) : rv_sraw(rd, ra, rb));
    927       return;
    928     default:
    929       rv_panic(rv_of(t), "unsupported binop");
    930   }
    931 }
    932 
    933 static void rv_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
    934   MCEmitter* mc = t->mc;
    935   u32 rd = loc_reg(dst), rs = loc_reg(src);
    936   int sf = rv_is_64(t, dst.type);
    937   switch (op) {
    938     case UO_NEG:
    939       rv64_emit32(mc, sf ? rv_sub(rd, RV_ZERO, rs) : rv_subw(rd, RV_ZERO, rs));
    940       return;
    941     case UO_FNEG: {
    942       u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
    943       rv64_emit32(mc, rv_fsgnjn(fmt, rd, rs, rs));
    944       return;
    945     }
    946     case UO_BNOT:
    947       rv64_emit32(mc, rv_xori(rd, rs, -1));
    948       return;
    949     case UO_NOT:
    950       rv64_emit32(mc, rv_sltiu(rd, rs, 1));
    951       return;
    952     default:
    953       rv_panic(rv_of(t), "unsupported unop");
    954   }
    955 }
    956 
    957 /* Sign/zero-extend a 32-bit operand into a 64-bit register for comparison.
    958  * Returns the register to compare. */
    959 static u32 rv_cmp_ext(NativeTarget* t, int is_signed, NativeLoc op, u32 tmp) {
    960   MCEmitter* mc = t->mc;
    961   u32 r = loc_reg(op);
    962   if (rv_is_64(t, op.type)) return r;
    963   if (is_signed) {
    964     rv64_emit32(mc, rv_addiw(tmp, r, 0)); /* sign-extend low 32 */
    965   } else {
    966     rv64_emit32(mc, rv_slli(tmp, r, 32));
    967     rv64_emit32(mc, rv_srli(tmp, tmp, 32));
    968   }
    969   return tmp;
    970 }
    971 
    972 static int cmp_is_signed(CmpOp op) {
    973   switch (op) {
    974     case CMP_LT_U:
    975     case CMP_LE_U:
    976     case CMP_GT_U:
    977     case CMP_GE_U:
    978       return 0;
    979     default:
    980       return 1;
    981   }
    982 }
    983 
    984 /* Emit a 0/1 comparison result into rd from two integer registers. */
    985 static void rv_emit_icmp(NativeTarget* t, CmpOp op, u32 rd, u32 ra, u32 rb) {
    986   MCEmitter* mc = t->mc;
    987   switch (op) {
    988     case CMP_EQ:
    989       rv64_emit32(mc, rv_sub(rd, ra, rb));
    990       rv64_emit32(mc, rv_sltiu(rd, rd, 1));
    991       return;
    992     case CMP_NE:
    993       rv64_emit32(mc, rv_sub(rd, ra, rb));
    994       rv64_emit32(mc, rv_sltu(rd, RV_ZERO, rd));
    995       return;
    996     case CMP_LT_S:
    997       rv64_emit32(mc, rv_slt(rd, ra, rb));
    998       return;
    999     case CMP_LT_U:
   1000       rv64_emit32(mc, rv_sltu(rd, ra, rb));
   1001       return;
   1002     case CMP_GT_S:
   1003       rv64_emit32(mc, rv_slt(rd, rb, ra));
   1004       return;
   1005     case CMP_GT_U:
   1006       rv64_emit32(mc, rv_sltu(rd, rb, ra));
   1007       return;
   1008     case CMP_GE_S:
   1009       rv64_emit32(mc, rv_slt(rd, ra, rb));
   1010       rv64_emit32(mc, rv_xori(rd, rd, 1));
   1011       return;
   1012     case CMP_GE_U:
   1013       rv64_emit32(mc, rv_sltu(rd, ra, rb));
   1014       rv64_emit32(mc, rv_xori(rd, rd, 1));
   1015       return;
   1016     case CMP_LE_S:
   1017       rv64_emit32(mc, rv_slt(rd, rb, ra));
   1018       rv64_emit32(mc, rv_xori(rd, rd, 1));
   1019       return;
   1020     case CMP_LE_U:
   1021       rv64_emit32(mc, rv_sltu(rd, rb, ra));
   1022       rv64_emit32(mc, rv_xori(rd, rd, 1));
   1023       return;
   1024     default:
   1025       rv_panic(rv_of(t), "unsupported integer cmp");
   1026   }
   1027 }
   1028 
   1029 /* Format-dispatching wrappers over the ordered FP compares (feq/flt/fle are
   1030  * ordered: they yield 0 on NaN; flt/fle are signaling, raising NV on NaN โ€”
   1031  * pre-existing for ordered ops, and the boolean result is still correct). */
   1032 static u32 rv_feq_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
   1033   return fmt == RV_FMT_D ? rv_feq_d(rd, ra, rb) : rv_feq_s(rd, ra, rb);
   1034 }
   1035 static u32 rv_flt_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
   1036   return fmt == RV_FMT_D ? rv_flt_d(rd, ra, rb) : rv_flt_s(rd, ra, rb);
   1037 }
   1038 static u32 rv_fle_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
   1039   return fmt == RV_FMT_D ? rv_fle_d(rd, ra, rb) : rv_fle_s(rd, ra, rb);
   1040 }
   1041 
   1042 static void rv_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc aop,
   1043                    NativeLoc bop) {
   1044   MCEmitter* mc = t->mc;
   1045   u32 rd = loc_reg(dst);
   1046   /* FP-ness is self-describing from the opcode (FP block starts at CMP_OEQ_F).
   1047    * Unordered predicates use unordered-R == NOT(ordered-not-R): the ordered
   1048    * compare into rd, then `xori rd,rd,1`. ONE/UEQ have no single ordered
   1049    * primitive and OR the two strict relations (a<b | a>b) via scratch RV_TMP2
   1050    * (x7, reserved & never allocable, so it can't alias rd). */
   1051   if (op >= CMP_OEQ_F) {
   1052     u32 fmt = native_type_size(t, aop.type) == 8u ? RV_FMT_D : RV_FMT_S;
   1053     u32 ra = loc_reg(aop), rb = loc_reg(bop);
   1054     switch (op) {
   1055       case CMP_OEQ_F:
   1056         rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb));
   1057         return;
   1058       case CMP_UNE_F: /* !(OEQ) */
   1059         rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb));
   1060         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1061         return;
   1062       case CMP_OLT_F:
   1063         rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
   1064         return;
   1065       case CMP_OLE_F:
   1066         rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb));
   1067         return;
   1068       case CMP_OGT_F:
   1069         rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra));
   1070         return;
   1071       case CMP_OGE_F:
   1072         rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra));
   1073         return;
   1074       case CMP_UGE_F: /* !(OLT) */
   1075         rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
   1076         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1077         return;
   1078       case CMP_UGT_F: /* !(OLE) */
   1079         rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb));
   1080         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1081         return;
   1082       case CMP_ULE_F: /* !(OGT) */
   1083         rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra));
   1084         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1085         return;
   1086       case CMP_ULT_F: /* !(OGE) */
   1087         rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra));
   1088         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1089         return;
   1090       case CMP_ONE_F: /* ordered & !=: (a<b) | (a>b) */
   1091         rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
   1092         rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra));
   1093         rv64_emit32(mc, rv_or(rd, rd, RV_TMP2));
   1094         return;
   1095       case CMP_UEQ_F: /* unordered | ==: !((a<b) | (a>b)) */
   1096         rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
   1097         rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra));
   1098         rv64_emit32(mc, rv_or(rd, rd, RV_TMP2));
   1099         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1100         return;
   1101       default:
   1102         rv_panic(rv_of(t), "unsupported fp cmp");
   1103     }
   1104   }
   1105   {
   1106     int sg = cmp_is_signed(op);
   1107     u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0);
   1108     u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1);
   1109     rv_emit_icmp(t, op, rd, ra, rb);
   1110   }
   1111 }
   1112 
   1113 static void rv_convert(NativeTarget* t, ConvKind op, NativeLoc dst,
   1114                        NativeLoc src) {
   1115   MCEmitter* mc = t->mc;
   1116   u32 rd = loc_reg(dst), rs = loc_reg(src);
   1117   u32 src_sz = native_type_size(t, src.type);
   1118   u32 dst_sz = native_type_size(t, dst.type);
   1119   switch (op) {
   1120     case CV_SEXT:
   1121       if (src_sz >= 4u) {
   1122         rv64_emit32(mc, rv_addiw(rd, rs, 0));
   1123       } else {
   1124         u32 sh = 64u - src_sz * 8u;
   1125         rv64_emit32(mc, rv_slli(rd, rs, sh));
   1126         rv64_emit32(mc, rv_srai(rd, rd, sh));
   1127       }
   1128       return;
   1129     case CV_ZEXT: {
   1130       u32 sh = 64u - src_sz * 8u;
   1131       rv64_emit32(mc, rv_slli(rd, rs, sh));
   1132       rv64_emit32(mc, rv_srli(rd, rd, sh));
   1133       return;
   1134     }
   1135     case CV_TRUNC:
   1136       if (rd != rs || dst_sz <= 4u)
   1137         rv64_emit32(mc, rv_addi(rd, rs, 0)); /* low bits; users re-narrow */
   1138       return;
   1139     case CV_ITOF_S:
   1140       if (native_type_size(t, dst.type) == 8u)
   1141         rv64_emit32(mc,
   1142                     src_sz == 8u ? rv_fcvt_d_l(rd, rs) : rv_fcvt_d_w(rd, rs));
   1143       else
   1144         rv64_emit32(mc,
   1145                     src_sz == 8u ? rv_fcvt_s_l(rd, rs) : rv_fcvt_s_w(rd, rs));
   1146       return;
   1147     case CV_ITOF_U:
   1148       if (native_type_size(t, dst.type) == 8u)
   1149         rv64_emit32(mc,
   1150                     src_sz == 8u ? rv_fcvt_d_lu(rd, rs) : rv_fcvt_d_wu(rd, rs));
   1151       else
   1152         rv64_emit32(mc,
   1153                     src_sz == 8u ? rv_fcvt_s_lu(rd, rs) : rv_fcvt_s_wu(rd, rs));
   1154       return;
   1155     case CV_FTOI_S:
   1156       if (src_sz == 8u)
   1157         rv64_emit32(mc,
   1158                     dst_sz == 8u ? rv_fcvt_l_d(rd, rs) : rv_fcvt_w_d(rd, rs));
   1159       else
   1160         rv64_emit32(mc,
   1161                     dst_sz == 8u ? rv_fcvt_l_s(rd, rs) : rv_fcvt_w_s(rd, rs));
   1162       return;
   1163     case CV_FTOI_U:
   1164       if (src_sz == 8u)
   1165         rv64_emit32(mc,
   1166                     dst_sz == 8u ? rv_fcvt_lu_d(rd, rs) : rv_fcvt_wu_d(rd, rs));
   1167       else
   1168         rv64_emit32(mc,
   1169                     dst_sz == 8u ? rv_fcvt_lu_s(rd, rs) : rv_fcvt_wu_s(rd, rs));
   1170       return;
   1171     case CV_FEXT:
   1172       rv64_emit32(mc, rv_fcvt_d_s(rd, rs));
   1173       return;
   1174     case CV_FTRUNC:
   1175       rv64_emit32(mc, rv_fcvt_s_d(rd, rs));
   1176       return;
   1177     case CV_BITCAST:
   1178       rv_move(t, dst, src);
   1179       return;
   1180     default:
   1181       rv_panic(rv_of(t), "unsupported convert");
   1182   }
   1183 }
   1184 
   1185 /* ============================ spill / reload ============================ */
   1186 
   1187 static void rv_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot,
   1188                      MemAccess mem) {
   1189   NativeAddr addr;
   1190   memset(&addr, 0, sizeof addr);
   1191   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1192   addr.base.frame = slot;
   1193   addr.base_type = src.type;
   1194   rv_emit_mem(rv_of(t), 0, src, addr, mem);
   1195 }
   1196 static void rv_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot,
   1197                       MemAccess mem) {
   1198   NativeAddr addr;
   1199   memset(&addr, 0, sizeof addr);
   1200   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1201   addr.base.frame = slot;
   1202   addr.base_type = dst.type;
   1203   rv_emit_mem(rv_of(t), 1, dst, addr, mem);
   1204 }
   1205 
   1206 /* ============================ control flow ============================ */
   1207 
   1208 static MCLabel rv_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); }
   1209 static void rv_label_place(NativeTarget* t, MCLabel l) {
   1210   t->mc->label_place(t->mc, l);
   1211 }
   1212 static void rv_jump(NativeTarget* t, MCLabel l) {
   1213   rv64_emit32(t->mc, rv_jal(RV_ZERO, 0));
   1214   t->mc->emit_label_ref(t->mc, l, R_RV_JAL, 4, 0);
   1215 }
   1216 
   1217 static void rv_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc aop,
   1218                           NativeLoc bop, MCLabel l) {
   1219   MCEmitter* mc = t->mc;
   1220   /* RISC-V B-type branches reach only ยฑ4 KiB, which a single (especially
   1221    * -O0) function can exceed between a branch and its target. Rather than a
   1222    * lone conditional branch to the label, emit a short *inverted* branch
   1223    * that skips an unconditional `jal` (ยฑ1 MiB) to the target. The inverted
   1224    * branch's displacement is the constant SKIP_JAL (skip just the jal) and
   1225    * so is always in range; the jal carries the long reach. See rv_jump. */
   1226   enum { SKIP_JAL = 8 }; /* branch over the 4-byte jal that follows it */
   1227   /* FP compares have no register-register branch form: materialize the 0/1
   1228    * into TMP0 via rv_cmp (handles all 12 predicates), then branch on nonzero.
   1229    */
   1230   if (op >= CMP_OEQ_F) {
   1231     NativeLoc tmp =
   1232         native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0);
   1233     rv_cmp(t, op, tmp, aop, bop);
   1234     /* Skip the jal when the result is 0 (condition false). */
   1235     rv64_emit32(mc, rv_beq(RV_TMP0, RV_ZERO, SKIP_JAL));
   1236     rv_jump(t, l);
   1237     return;
   1238   }
   1239   {
   1240     int sg = cmp_is_signed(op);
   1241     u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0);
   1242     u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1);
   1243     u32 word;
   1244     /* Encode the *inverse* of `op`, skipping the jal when NOT taken. */
   1245     switch (op) {
   1246       case CMP_EQ:
   1247         word = rv_bne(ra, rb, SKIP_JAL);
   1248         break;
   1249       case CMP_NE:
   1250         word = rv_beq(ra, rb, SKIP_JAL);
   1251         break;
   1252       case CMP_LT_S:
   1253         word = rv_bge(ra, rb, SKIP_JAL);
   1254         break;
   1255       case CMP_GE_S:
   1256         word = rv_blt(ra, rb, SKIP_JAL);
   1257         break;
   1258       case CMP_LT_U:
   1259         word = rv_bgeu(ra, rb, SKIP_JAL);
   1260         break;
   1261       case CMP_GE_U:
   1262         word = rv_bltu(ra, rb, SKIP_JAL);
   1263         break;
   1264       case CMP_GT_S:
   1265         word = rv_bge(rb, ra, SKIP_JAL);
   1266         break;
   1267       case CMP_LE_S:
   1268         word = rv_blt(rb, ra, SKIP_JAL);
   1269         break;
   1270       case CMP_GT_U:
   1271         word = rv_bgeu(rb, ra, SKIP_JAL);
   1272         break;
   1273       case CMP_LE_U:
   1274         word = rv_bltu(rb, ra, SKIP_JAL);
   1275         break;
   1276       default:
   1277         rv_panic(rv_of(t), "unsupported cmp_branch");
   1278     }
   1279     rv64_emit32(mc, word);
   1280     rv_jump(t, l);
   1281   }
   1282 }
   1283 
   1284 static void rv_indirect_branch(NativeTarget* t, NativeLoc addr,
   1285                                const MCLabel* valid_targets, u32 ntargets) {
   1286   (void)valid_targets;
   1287   (void)ntargets;
   1288   rv64_emit32(t->mc, rv_jalr(RV_ZERO, loc_reg(addr), 0));
   1289 }
   1290 
   1291 static void rv_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) {
   1292   /* `&&label` address-take: auipc/addi with a %pcrel_hi/%pcrel_lo relocation
   1293    * pair against the label's per-block local symbol โ€” the same form
   1294    * rv_emit_global_addr uses for a global โ€” so a compressing/re-encoding
   1295    * assembler recomputes the displacement (a baked offset would break under
   1296    * the C extension). */
   1297   MCEmitter* mc = t->mc;
   1298   u32 rd = loc_reg(dst);
   1299   u32 sec = mc->section_id;
   1300   ObjSymId sym = mc_label_symbol(mc, l);
   1301   u32 ap = mc->pos(mc);
   1302   rv64_emit32(mc, rv_auipc(rd, 0));
   1303   mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0);
   1304   {
   1305     Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
   1306     ObjSymId anchor = obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
   1307     u32 lp = mc->pos(mc);
   1308     rv64_emit32(mc, rv_addi(rd, rd, 0));
   1309     mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
   1310   }
   1311 }
   1312 
   1313 /* ============================ frame / lifecycle ============================
   1314  */
   1315 
   1316 static NativeFrameSlot rv_frame_slot(NativeTarget* t,
   1317                                      const NativeFrameSlotDesc* d) {
   1318   return native_frame_slot_alloc(&rv_of(t)->frame, d);
   1319 }
   1320 
   1321 static int rv_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
   1322                                    CGDebugLoc* out) {
   1323   RvNativeTarget* a = rv_of(t);
   1324   RvNativeSlot* s;
   1325   if (!out) return 0;
   1326   memset(out, 0, sizeof *out);
   1327   if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0;
   1328   s = rv_slot_get(a, slot);
   1329   out->kind = CG_DEBUG_LOC_FRAME;
   1330   /* rv64 slots are addressed s0/fp-relative (rv_s0_off_slot); the hosted dbg
   1331    * snapshot seeds the frame base with s0, matching aa64's FP-relative
   1332    * convention. */
   1333   out->v.frame_ofs = rv_s0_off_slot(s);
   1334   return 1;
   1335 }
   1336 
   1337 static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
   1338   RvNativeTarget* a = rv_of(t);
   1339   MCEmitter* mc = t->mc;
   1340   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
   1341   a->func = fd;
   1342   a->loc = fd->loc;
   1343   /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
   1344    * callee-save set, and known_frame/has_alloca/frame_final. */
   1345   native_frame_reset(&a->frame);
   1346   a->incoming_stack_size = 0;
   1347   a->next_param_int = 0;
   1348   a->next_param_fp = 0;
   1349   a->next_param_stack = 0;
   1350   a->has_sret = (abi && abi->has_sret) ? 1u : 0u;
   1351   a->is_variadic = (abi && abi->variadic) ? 1u : 0u;
   1352   a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
   1353   a->npatches = 0;
   1354   a->nalloca = 0;
   1355   a->minimal_prologue_words = 0;
   1356   a->slim_prologue = 0;
   1357 
   1358   mc->set_section(mc, fd->text_section_id);
   1359   mc->emit_align(mc, 4, 0);
   1360   a->func_start = mc->pos(mc);
   1361   mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
   1362   if (mc->cfi_startproc) mc->cfi_startproc(mc);
   1363   a->epilogue_label = mc->label_new(mc);
   1364 }
   1365 
   1366 /* sret: reserve a hidden slot for the incoming destination pointer (a0). */
   1367 static void rv_reserve_entry_saves(RvNativeTarget* a) {
   1368   NativeTarget* t = &a->base;
   1369   if (a->has_sret) {
   1370     NativeFrameSlotDesc sd;
   1371     memset(&sd, 0, sizeof sd);
   1372     sd.type = builtin_id(KIT_CG_BUILTIN_I64);
   1373     sd.size = 8;
   1374     sd.align = 8;
   1375     sd.kind = NATIVE_FRAME_SLOT_SAVE;
   1376     a->sret_ptr_slot = t->frame_slot(t, &sd);
   1377     a->next_param_int = 1; /* a0 consumed by the sret pointer */
   1378   }
   1379 }
   1380 
   1381 static void rv_emit_entry_save_stores(RvNativeTarget* a) {
   1382   NativeTarget* t = &a->base;
   1383   if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
   1384     KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   1385     NativeAddr addr;
   1386     memset(&addr, 0, sizeof addr);
   1387     addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1388     addr.base.frame = a->sret_ptr_slot;
   1389     addr.base_type = i64t;
   1390     rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_A0), addr,
   1391                 native_mem_for_type(t, i64t, 8));
   1392   }
   1393 }
   1394 
   1395 /* Collect the callee-saves the body used (none at -O0). */
   1396 static u32 rv_collect_int_saves(RvNativeTarget* a, u32* regs) {
   1397   u32 n = 0, i;
   1398   for (i = 0; i < a->frame.ncallee_saves; ++i)
   1399     if (a->frame.callee_saves[i].cls == NATIVE_REG_INT)
   1400       regs[n++] = a->frame.callee_saves[i].reg;
   1401   return n;
   1402 }
   1403 static u32 rv_collect_fp_saves(RvNativeTarget* a, u32* regs) {
   1404   u32 n = 0, i;
   1405   for (i = 0; i < a->frame.ncallee_saves; ++i)
   1406     if (a->frame.callee_saves[i].cls == NATIVE_REG_FP)
   1407       regs[n++] = a->frame.callee_saves[i].reg;
   1408   return n;
   1409 }
   1410 
   1411 /* s0-relative offset of the i-th saved register (saves stack below locals). */
   1412 static i32 rv_save_off(RvNativeTarget* a, u32 idx) {
   1413   return -(i32)(a->frame.cum_off) - 8 - 8 * (i32)idx;
   1414 }
   1415 
   1416 static void rv_load_s0(MCEmitter* mc, int fp, u32 reg, i32 off) {
   1417   if (fits_i12(off)) {
   1418     rv64_emit32(mc, fp ? rv_fld(reg, RV_S0, off) : rv_ld(reg, RV_S0, off));
   1419     return;
   1420   }
   1421   rv_emit_load_imm(mc, 1, RV_TMP0, (i64)off);
   1422   rv64_emit32(mc, rv_add(RV_TMP0, RV_S0, RV_TMP0));
   1423   rv64_emit32(mc, fp ? rv_fld(reg, RV_TMP0, 0) : rv_ld(reg, RV_TMP0, 0));
   1424 }
   1425 
   1426 /* Build the prologue instruction sequence into words[]. Returns count. */
   1427 static u32 rv_build_prologue(RvNativeTarget* a, u32* words, u32 cap,
   1428                              u32 frame_size, u32 fp_pair_off,
   1429                              const u32* int_regs, u32 n_int, const u32* fp_regs,
   1430                              u32 n_fp) {
   1431   u32 wi = 0;
   1432 #define PUSH(w)                                                  \
   1433   do {                                                           \
   1434     if (wi >= cap) rv_panic(a, "prologue placeholder overflow"); \
   1435     words[wi++] = (w);                                           \
   1436   } while (0)
   1437   /* sp -= frame_size */
   1438   if (fits_i12(-(i32)frame_size)) {
   1439     PUSH(rv_addi(RV_SP, RV_SP, -(i32)frame_size));
   1440   } else {
   1441     i32 neg = -(i32)frame_size;
   1442     i32 hi = (i32)(((i64)neg + 0x800) >> 12);
   1443     i32 lo = neg - (i32)((u32)hi << 12);
   1444     PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
   1445     if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo));
   1446     PUSH(rv_add(RV_SP, RV_SP, RV_TMP0));
   1447   }
   1448   /* save s0/ra at [sp + fp_pair_off], set s0 = sp + fp_pair_off */
   1449   if (fits_i12((i32)fp_pair_off + 8)) {
   1450     PUSH(rv_sd(RV_S0, RV_SP, (i32)fp_pair_off));
   1451     PUSH(rv_sd(RV_RA, RV_SP, (i32)fp_pair_off + 8));
   1452     PUSH(rv_addi(RV_S0, RV_SP, (i32)fp_pair_off));
   1453   } else {
   1454     i32 off = (i32)fp_pair_off;
   1455     i32 hi = (i32)(((i64)off + 0x800) >> 12);
   1456     i32 lo = off - (i32)((u32)hi << 12);
   1457     PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
   1458     if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo));
   1459     PUSH(rv_add(RV_TMP0, RV_SP, RV_TMP0));
   1460     PUSH(rv_sd(RV_S0, RV_TMP0, 0));
   1461     PUSH(rv_sd(RV_RA, RV_TMP0, 8));
   1462     PUSH(rv_addi(RV_S0, RV_TMP0, 0));
   1463   }
   1464   /* sret a0 spill */
   1465   if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
   1466     RvNativeSlot* s = rv_slot_get(a, a->sret_ptr_slot);
   1467     PUSH(rv_sd(RV_A0, RV_S0, rv_s0_off_slot(s)));
   1468   }
   1469   /* variadic GP save area: spill unconsumed a-regs at [s0 + 16 + i*8] */
   1470   if (a->is_variadic) {
   1471     u32 i;
   1472     for (i = a->next_param_int; i < 8u; ++i)
   1473       PUSH(rv_sd(RV_A0 + i, RV_S0, 16 + (i32)i * 8));
   1474   }
   1475   /* callee saves */
   1476   {
   1477     u32 i;
   1478     for (i = 0; i < n_int; ++i) {
   1479       i32 off = rv_save_off(a, i);
   1480       if (fits_i12(off)) {
   1481         PUSH(rv_sd(int_regs[i], RV_S0, off));
   1482       } else {
   1483         /* rare; emitted directly is fine in the known-frame path, but the
   1484          * single-pass placeholder must hold these too. Use the wide form. */
   1485         i32 hi = (i32)(((i64)off + 0x800) >> 12);
   1486         i32 lo = off - (i32)((u32)hi << 12);
   1487         PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
   1488         if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo));
   1489         PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0));
   1490         PUSH(rv_sd(int_regs[i], RV_TMP0, 0));
   1491       }
   1492     }
   1493     for (i = 0; i < n_fp; ++i) {
   1494       i32 off = rv_save_off(a, n_int + i);
   1495       if (fits_i12(off)) {
   1496         PUSH(rv_fsd(fp_regs[i], RV_S0, off));
   1497       } else {
   1498         i32 hi = (i32)(((i64)off + 0x800) >> 12);
   1499         i32 lo = off - (i32)((u32)hi << 12);
   1500         PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
   1501         if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo));
   1502         PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0));
   1503         PUSH(rv_fsd(fp_regs[i], RV_TMP0, 0));
   1504       }
   1505     }
   1506   }
   1507 #undef PUSH
   1508   return wi;
   1509 }
   1510 
   1511 static void rv_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
   1512   RvNativeTarget* a = rv_of(t);
   1513   MCEmitter* mc = t->mc;
   1514   u32 i;
   1515   rv_func_begin_common(t, fd);
   1516   a->prologue_pos = mc->pos(mc);
   1517   for (i = 0; i < RV_PROLOGUE_WORDS; ++i) rv64_emit32(mc, RV_NOP);
   1518   rv_reserve_entry_saves(a);
   1519   rv_emit_entry_save_stores(a);
   1520 }
   1521 
   1522 static void rv_func_end(NativeTarget* t) {
   1523   RvNativeTarget* a = rv_of(t);
   1524   MCEmitter* mc = t->mc;
   1525   ObjBuilder* obj = t->obj;
   1526   ObjSecId sec = a->func->text_section_id;
   1527   u32 int_regs[16], fp_regs[16];
   1528   u32 n_int = rv_collect_int_saves(a, int_regs);
   1529   u32 n_fp = rv_collect_fp_saves(a, fp_regs);
   1530   u32 frame_size = rv_frame_size(a);
   1531   u32 fp_pair_off = rv_fp_pair_off(a, frame_size);
   1532   u32 end;
   1533   i32 i;
   1534   a->frame_size_final = frame_size;
   1535   a->fp_pair_off = fp_pair_off;
   1536 
   1537   /* epilogue */
   1538   mc->label_place(mc, a->epilogue_label);
   1539   if (a->slim_prologue) {
   1540     /* Frameless leaf: no callee-saves, no s0/ra to reload, sp untouched. */
   1541     rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
   1542   } else {
   1543     for (i = (i32)n_int - 1; i >= 0; --i)
   1544       rv_load_s0(mc, 0, int_regs[i], rv_save_off(a, (u32)i));
   1545     for (i = (i32)n_fp - 1; i >= 0; --i)
   1546       rv_load_s0(mc, 1, fp_regs[i], rv_save_off(a, n_int + (u32)i));
   1547     if (a->frame.has_alloca)
   1548       rv_emit_addr_adjust(mc, RV_SP, RV_S0, -(i32)fp_pair_off);
   1549     rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
   1550     rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));
   1551     /* sp += frame_size */
   1552     if (fits_i12((i32)frame_size)) {
   1553       rv64_emit32(mc, rv_addi(RV_SP, RV_SP, (i32)frame_size));
   1554     } else {
   1555       rv_emit_load_imm(mc, 1, RV_TMP0, (i64)frame_size);
   1556       rv64_emit32(mc, rv_add(RV_SP, RV_SP, RV_TMP0));
   1557     }
   1558     rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
   1559   }
   1560 
   1561   /* patch prologue */
   1562   if (!a->frame.known_frame) {
   1563     u32 words[RV_PROLOGUE_WORDS];
   1564     u32 nwords, k;
   1565     for (k = 0; k < RV_PROLOGUE_WORDS; ++k) words[k] = RV_NOP;
   1566     nwords = rv_build_prologue(a, words, RV_PROLOGUE_WORDS, frame_size,
   1567                                fp_pair_off, int_regs, n_int, fp_regs, n_fp);
   1568     (void)nwords;
   1569     for (k = 0; k < RV_PROLOGUE_WORDS; ++k)
   1570       rv_patch32(obj, sec, a->prologue_pos + k * 4u, words[k]);
   1571   }
   1572   /* patch alloca sites: addi dst, sp, max_outgoing */
   1573   {
   1574     u32 mo = align_up_u32(a->frame.max_outgoing, 16u);
   1575     u32 k;
   1576     if (mo > 2047u) rv_panic(a, "max_outgoing too large for alloca patch");
   1577     for (k = 0; k < a->npatches; ++k)
   1578       rv_patch32(obj, sec, a->patches[k].pos,
   1579                  rv_addi(a->patches[k].dst_reg, RV_SP, (i32)mo));
   1580   }
   1581 
   1582   /* CFI: CFA = s0 + (frame_size - fp_pair_off) */
   1583   if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
   1584     if (a->slim_prologue) {
   1585       /* Frameless leaf: CFA = sp (unchanged from entry) and the return address
   1586        * stays live in ra (the CIE default), so no saved-register rules. The
   1587        * state holds from the first instruction (offset 0). */
   1588       mc->cfi_set_next_pc_offset(mc, 0);
   1589       mc->cfi_def_cfa(mc, RV_SP, 0);
   1590     } else {
   1591       i32 cfa = (i32)frame_size - (i32)fp_pair_off;
   1592       u32 post = a->prologue_pos + (a->frame.known_frame
   1593                                         ? a->minimal_prologue_words * 4u
   1594                                         : RV_PROLOGUE_WORDS * 4u);
   1595       u32 k;
   1596       mc->cfi_set_next_pc_offset(mc, post - a->func_start);
   1597       mc->cfi_def_cfa(mc, RV_S0, cfa);
   1598       mc->cfi_offset(mc, RV_S0, -cfa);
   1599       mc->cfi_offset(mc, RV_RA, -cfa + 8);
   1600       for (k = 0; k < n_int; ++k)
   1601         mc->cfi_offset(mc, int_regs[k], rv_save_off(a, k) - cfa);
   1602       for (k = 0; k < n_fp; ++k)
   1603         mc->cfi_offset(mc, 32u + fp_regs[k], rv_save_off(a, n_int + k) - cfa);
   1604     }
   1605   }
   1606 
   1607   end = mc->pos(mc);
   1608   obj_symbol_define(obj, a->func->sym, sec, (u64)a->func_start,
   1609                     (u64)(end - a->func_start));
   1610   if (a->func->atomize)
   1611     obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->func->sym,
   1612                     0);
   1613   if (mc->debug) debug_func_pc_range(mc->debug, sec, a->func_start, end);
   1614   if (mc->cfi_endproc) mc->cfi_endproc(mc);
   1615   mc_end_function(mc);
   1616   a->func = NULL;
   1617 }
   1618 
   1619 /* rv64 homes its callee-saves below the locals at rv_save_off(idx) rather than
   1620  * in frame slots, so alloc_slots=0: native_frame just records the {reg,cls} set
   1621  * derived from the optimizer's per-class used-masks. */
   1622 static void rv_reserve_callee_saves(NativeTarget* t, const u32* used,
   1623                                     u32 nclasses) {
   1624   native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0);
   1625 }
   1626 
   1627 static int rv_reg_is_callee_int(Reg r);
   1628 static int rv_reg_is_callee_fp(Reg r);
   1629 static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
   1630                                  u32 nclob, u32* int_mask, u32* fp_mask);
   1631 
/* Expanding the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into
 * this target's per-class caller/callee-saved register masks is not done in
 * this file: abi_clobber_masks is shared as native_asm_abi_clobber_masks
 * (cg/native_asm.h), which reads the masks from t->regs->classes. */
   1636 
   1637 /* Build the callee-saved set the prologue must preserve: the allocator-assigned
   1638  * callee-saved registers (frame->callee_saved_used) plus any an inline-asm
   1639  * block clobbers. The latter are opaque to the optimizer's operand scan, so it
   1640  * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
   1641  * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks
   1642  * and keep only the callee-saved ones โ€” rv_reg_is_callee_int excludes s0 (the
   1643  * frame pointer, preserved by the prologue head, not as an ordinary
   1644  * callee-save). This is the same register selection the per-block spill used,
   1645  * hoisted into the prologue. Writes up to `cap` per-class masks into `out` and
   1646  * returns the class count to reserve. */
   1647 static u32 rv_known_callee_saves(NativeTarget* t,
   1648                                  const NativeKnownFrameDesc* frame, u32* out,
   1649                                  u32 cap) {
   1650   u32 ncls = frame->ncallee_classes;
   1651   u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
   1652   if (ncls > cap) ncls = cap;
   1653   for (u32 c = 0; c < ncls; ++c)
   1654     out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
   1655   if (frame->asm_clobbers && frame->nasm_clobbers) {
   1656     RvNativeTarget* a = rv_of(t);
   1657     SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   1658     rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
   1659                          &clob_int, &clob_fp);
   1660   }
   1661   native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int,
   1662                                &abi_fp);
   1663   clob_int |= abi_int;
   1664   clob_fp |= abi_fp;
   1665   for (Reg r = 0; r < 32u; ++r) {
   1666     if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
   1667         rv_reg_is_callee_int(r))
   1668       out[NATIVE_REG_INT] |= 1u << r;
   1669     if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r))
   1670       out[NATIVE_REG_FP] |= 1u << r;
   1671   }
   1672   return ncls;
   1673 }
   1674 
   1675 static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
   1676                                     int* variadic, u32* nparams);
   1677 
/* Optimizer entry point: the full frame is supplied up front, so the prologue
 * is emitted final the moment it is built — no NOP region, no func_end patch
 * (rv_func_end skips patching when known_frame). rv_build_prologue emits the
 * sret spill and the variadic register-save stores inline, so there is no
 * separate entry-save emission. Slot creation order matches the single-pass
 * path: callee-saves first (only recorded for rv64), then static slots, then
 * the sret entry-save slot.
 *
 *   t          target
 *   fd         function descriptor, forwarded to rv_func_begin_common
 *   frame      known-frame description; may be NULL, in which case no
 *              callee-saves, slots or outgoing area are registered
 *   out_slots  optional; when non-NULL receives the frame slot created for
 *              each frame->slots[i], at the same index */
static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
                                      const NativeKnownFrameDesc* frame,
                                      NativeFrameSlot* out_slots) {
  RvNativeTarget* a = rv_of(t);
  MCEmitter* mc = t->mc;
  u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
  u32 n_int, n_fp, frame_size, fp_pair_off, nwords, i;
  u32 words[RV_KNOWN_PROLOGUE_WORDS];
  rv_func_begin_common(t, fd);
  a->frame.known_frame = 1;
  if (frame) {
    u32 cs[NATIVE_CALL_PLAN_CLASSES];
    u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
    a->frame.has_alloca = frame->has_alloca;
    /* Callee-saves are recorded before any slot is created. */
    if (ncs) rv_reserve_callee_saves(t, cs, ncs);
    for (i = 0; i < frame->nslots; ++i) {
      NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]);
      if (out_slots) out_slots[i] = slot;
    }
    /* Entry-save slots come after the static slots. */
    rv_reserve_entry_saves(a);
    native_frame_note_outgoing(&a->frame, frame->max_outgoing);
  }
  /* Frame is final: size and offsets are settled, so emit the exact
   * prologue. */
  frame_size = rv_frame_size(a);
  fp_pair_off = rv_fp_pair_off(a, frame_size);
  a->frame_size_final = frame_size;
  a->fp_pair_off = fp_pair_off;
  a->prologue_pos = mc->pos(mc);
  /* Leaf no-frame tier (aa64 slim_prologue equivalent): a leaf with no
   * callee-saves, no body slots, no outgoing args, no sret/variadic and
   * register-only params never reads s0 (no frame slots / stack args) nor
   * clobbers ra (no calls). Emit no prologue at all; rv_func_end emits a bare
   * `ret`. cum_off==0 already implies no sret slot and no param spills, but the
   * extra guards keep the intent explicit. Inline asm is excluded: it can
   * clobber ra opaquely, and without the saved record the bare `ret` would
   * return through the destroyed link register. */
  a->slim_prologue = frame && frame->is_leaf && !frame->has_asm &&
                     a->frame.ncallee_saves == 0 && !a->frame.has_alloca &&
                     a->frame.cum_off == 0 && a->frame.max_outgoing == 0 &&
                     !a->has_sret && !a->is_variadic &&
                     rv_signature_stack_bytes(t, fd->fn_type, NULL, NULL) == 0;
  if (a->slim_prologue) {
    /* Zero prologue words: rv_func_end's CFI for this tier starts at offset
     * 0. */
    a->minimal_prologue_words = 0;
    native_frame_set_final(&a->frame);
    return;
  }
  n_int = rv_collect_int_saves(a, int_regs);
  n_fp = rv_collect_fp_saves(a, fp_regs);
  nwords = rv_build_prologue(a, words, RV_KNOWN_PROLOGUE_WORDS, frame_size,
                             fp_pair_off, int_regs, n_int, fp_regs, n_fp);
  for (i = 0; i < nwords; ++i) rv64_emit32(mc, words[i]);
  /* Remembered so rv_func_end can locate the first post-prologue pc. */
  a->minimal_prologue_words = nwords;
  native_frame_set_final(&a->frame);
}
   1740 
   1741 /* ============================ params / ABI helpers
   1742  * ============================ */
   1743 
   1744 static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
   1745                                       const NativeCallDesc* desc, u32 i,
   1746                                       ABIArgInfo* scratch) {
   1747   /* Synthesized for unnamed (variadic) args, or untyped calls. RISC-V LP64D
   1748    * passes variadic FP args in INTEGER registers (as their bit pattern), not
   1749    * the FP pool โ€” so a variadic float part is ABI_CLASS_INT. */
   1750   int variadic = abi && i >= abi->nparams;
   1751   if (abi && i < abi->nparams) return &abi->params[i];
   1752   memset(scratch, 0, sizeof *scratch);
   1753   scratch->kind = ABI_ARG_DIRECT;
   1754   scratch->nparts = 1;
   1755   scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
   1756   ((ABIArgPart*)scratch->parts)[0].cls =
   1757       (!variadic && cg_type_is_float(t->c, desc->args[i].type)) ? ABI_CLASS_FP
   1758                                                                 : ABI_CLASS_INT;
   1759   ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
   1760   ((ABIArgPart*)scratch->parts)[0].size =
   1761       native_type_size(t, desc->args[i].type);
   1762   ((ABIArgPart*)scratch->parts)[0].align =
   1763       native_type_align(t, desc->args[i].type);
   1764   return scratch;
   1765 }
   1766 
   1767 static u32 rv_part_stack_size(const ABIArgPart* part) {
   1768   return align_up_u32(part->size ? part->size : 8u, 8u);
   1769 }
   1770 static u32 rv_part_stack_align(const ABIArgPart* part) {
   1771   u32 al = part->align ? part->align : 8u;
   1772   if (al < 8u) al = 8u;
   1773   if (al > 16u) al = 16u;
   1774   return al;
   1775 }
   1776 
   1777 static KitCgTypeId rv_part_scalar_type(const ABIArgPart* part) {
   1778   if (part->cls == ABI_CLASS_FP) {
   1779     if (part->size <= 4u) return builtin_id(KIT_CG_BUILTIN_F32);
   1780     return builtin_id(KIT_CG_BUILTIN_F64);
   1781   }
   1782   switch (part->size) {
   1783     case 1u:
   1784       return builtin_id(KIT_CG_BUILTIN_I8);
   1785     case 2u:
   1786       return builtin_id(KIT_CG_BUILTIN_I16);
   1787     case 4u:
   1788       return builtin_id(KIT_CG_BUILTIN_I32);
   1789     default:
   1790       return builtin_id(KIT_CG_BUILTIN_I64);
   1791   }
   1792 }
   1793 
   1794 static u32 rv_class_stack_size(const ABIArgInfo* ai) {
   1795   u32 total = 0, p;
   1796   if (!ai || ai->kind == ABI_ARG_IGNORE) return 0;
   1797   if (ai->kind == ABI_ARG_INDIRECT) return 8u;
   1798   for (p = 0; p < ai->nparts; ++p) {
   1799     total = align_up_u32(total, rv_part_stack_align(&ai->parts[p]));
   1800     total += rv_part_stack_size(&ai->parts[p]);
   1801   }
   1802   return align_up_u32(total ? total : 8u, 8u);
   1803 }
   1804 
   1805 static u32 rv_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) {
   1806   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
   1807   /* sret consumes a0 as the implicit first integer argument. */
   1808   u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
   1809   u32 next_fp = 0, stack = 0, i, p;
   1810   for (i = 0; i < desc->nargs; ++i) {
   1811     ABIArgInfo tmp;
   1812     const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp);
   1813     int force_stack =
   1814         abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
   1815     if (ai->kind == ABI_ARG_IGNORE) continue;
   1816     if (force_stack) {
   1817       stack += rv_class_stack_size(ai);
   1818       continue;
   1819     }
   1820     if (ai->kind == ABI_ARG_INDIRECT) {
   1821       if (next_int < 8u)
   1822         next_int++;
   1823       else
   1824         stack += 8u;
   1825       continue;
   1826     }
   1827     for (p = 0; p < ai->nparts; ++p) {
   1828       const ABIArgPart* part = &ai->parts[p];
   1829       if (part->cls == ABI_CLASS_FP) {
   1830         if (next_fp < 8u)
   1831           next_fp++;
   1832         else {
   1833           stack = align_up_u32(stack, rv_part_stack_align(part));
   1834           stack += rv_part_stack_size(part);
   1835         }
   1836       } else {
   1837         if (next_int < 8u)
   1838           next_int++;
   1839         else {
   1840           stack = align_up_u32(stack, rv_part_stack_align(part));
   1841           stack += rv_part_stack_size(part);
   1842         }
   1843       }
   1844     }
   1845   }
   1846   return align_up_u32(stack, 16u);
   1847 }
   1848 
   1849 static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
   1850                                     int* variadic, u32* nparams) {
   1851   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type);
   1852   NativeCallDesc d;
   1853   if (variadic) *variadic = abi ? (int)abi->variadic : 0;
   1854   if (nparams) *nparams = abi ? abi->nparams : 0u;
   1855   memset(&d, 0, sizeof d);
   1856   d.fn_type = fn_type;
   1857   d.nargs = abi ? abi->nparams : 0u;
   1858   if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs);
   1859   return rv_call_stack_size(t, &d);
   1860 }
   1861 
/* Outgoing stack-argument bytes for the call `desc`; forwards directly to
 * rv_call_stack_size and returns its result unchanged. */
static u32 rv_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) {
  return rv_call_stack_size(t, desc);
}
   1865 
   1866 /* Resolve a NativeLoc to an addressable NativeAddr (frame/stack/addr). */
   1867 static NativeAddr rv_loc_addr(RvNativeTarget* a, NativeLoc loc, u32 offset) {
   1868   NativeAddr addr;
   1869   memset(&addr, 0, sizeof addr);
   1870   switch ((NativeLocKind)loc.kind) {
   1871     case NATIVE_LOC_FRAME:
   1872       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1873       addr.base.frame = loc.v.frame;
   1874       addr.base_type = loc.type;
   1875       addr.offset = (i32)offset;
   1876       return addr;
   1877     case NATIVE_LOC_STACK:
   1878       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1879       addr.base.frame = loc.v.stack.slot;
   1880       addr.base_type = loc.type;
   1881       addr.offset = loc.v.stack.offset + (i32)offset;
   1882       return addr;
   1883     case NATIVE_LOC_ADDR:
   1884       addr = loc.v.addr;
   1885       addr.offset += (i32)offset;
   1886       return addr;
   1887     default:
   1888       rv_panic(a, "location is not addressable");
   1889   }
   1890   return addr;
   1891 }
   1892 
   1893 static void rv_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
   1894                          u32 offset, u32 size) {
   1895   RvNativeTarget* a = rv_of(t);
   1896   if (src.kind == NATIVE_LOC_REG) {
   1897     rv_move(t, dst, src);
   1898     return;
   1899   }
   1900   if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK ||
   1901       src.kind == NATIVE_LOC_ADDR) {
   1902     NativeAddr addr = rv_loc_addr(a, src, offset);
   1903     addr.base_type = dst.type;
   1904     rv_emit_mem(a, 1, dst, addr, native_mem_for_type(t, dst.type, size));
   1905     return;
   1906   }
   1907   if (src.kind == NATIVE_LOC_IMM) {
   1908     rv_emit_load_imm(t->mc, rv_is_64(t, dst.type) ? 1u : 0u, loc_reg(dst),
   1909                      src.v.imm);
   1910     return;
   1911   }
   1912   rv_panic(a, "unsupported part source");
   1913 }
   1914 
   1915 static void rv_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
   1916                           u32 offset, u32 size) {
   1917   RvNativeTarget* a = rv_of(t);
   1918   if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK ||
   1919       dst.kind == NATIVE_LOC_ADDR) {
   1920     NativeAddr addr = rv_loc_addr(a, dst, offset);
   1921     addr.base_type = src.type;
   1922     rv_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size));
   1923     return;
   1924   }
   1925   if (dst.kind == NATIVE_LOC_REG) {
   1926     rv_move(t, dst, src);
   1927     return;
   1928   }
   1929   rv_panic(a, "unsupported part destination");
   1930 }
   1931 
   1932 static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   1933   NativeAddr addr = rv_loc_addr(rv_of(t), src, 0);
   1934   rv_load_addr(t, dst, addr);
   1935 }
   1936 
   1937 static void rv_store_outgoing_part(NativeTarget* t, int tail_call,
   1938                                    u32 stack_off, NativeLoc src, u32 size) {
   1939   NativeAddr addr;
   1940   memset(&addr, 0, sizeof addr);
   1941   addr.base_kind = NATIVE_ADDR_BASE_REG;
   1942   addr.base_type = src.type;
   1943   if (tail_call) {
   1944     /* A sibling call reuses the caller's frame: its outgoing stack args land in
   1945      * the caller's incoming-arg window ([s0 + 16 + va_save + off]) โ€” physically
   1946      * the same address the tail-callee will read at [sp+off] once the teardown
   1947      * has restored sp to the caller's entry sp (the CFA). */
   1948     addr.base.reg = RV_S0;
   1949     addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off);
   1950   } else {
   1951     addr.base.reg = RV_SP;
   1952     addr.offset = (i32)stack_off;
   1953   }
   1954   rv_emit_mem(rv_of(t), 0, src, addr, native_mem_for_type(t, src.type, size));
   1955 }
   1956 
   1957 /* NativeTarget bind_param: route incoming param (ABI loc) into dst. */
   1958 static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p,
   1959                                  NativeLoc dst) {
   1960   RvNativeTarget* a = rv_of(t);
   1961   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
   1962   const ABIArgInfo* ai =
   1963       p->index < abi->nparams ? &abi->params[p->index] : NULL;
   1964   int to_reg = dst.kind == NATIVE_LOC_REG;
   1965   u32 i;
   1966   if (!ai || ai->kind == ABI_ARG_IGNORE) return;
   1967   if (ai->kind == ABI_ARG_INDIRECT) {
   1968     NativeLoc src = native_loc_reg(
   1969         builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   1970         a->next_param_int < 8u ? RV_A0 + a->next_param_int : RV_TMP0);
   1971     NativeAddr d_addr, from;
   1972     AggregateAccess access;
   1973     if (a->next_param_int < 8u) {
   1974       a->next_param_int++;
   1975     } else {
   1976       NativeAddr sa;
   1977       memset(&sa, 0, sizeof sa);
   1978       sa.base_kind = NATIVE_ADDR_BASE_REG;
   1979       sa.base.reg = RV_S0;
   1980       sa.offset = rv_s0_off_in_arg(a, a->next_param_stack);
   1981       sa.base_type = src.type;
   1982       rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, src.type, 8));
   1983       a->next_param_stack += 8u;
   1984     }
   1985     if (dst.kind != NATIVE_LOC_FRAME)
   1986       rv_panic(a, "indirect parameter requires a frame destination");
   1987     memset(&d_addr, 0, sizeof d_addr);
   1988     d_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1989     d_addr.base.frame = dst.v.frame;
   1990     d_addr.base_type = p->type;
   1991     memset(&from, 0, sizeof from);
   1992     from.base_kind = NATIVE_ADDR_BASE_REG;
   1993     from.base.reg = loc_reg(src);
   1994     from.base_type = p->type;
   1995     memset(&access, 0, sizeof access);
   1996     access.type = p->type;
   1997     access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type);
   1998     access.align = p->align ? p->align : native_type_align(t, p->type);
   1999     rv_copy_bytes(t, d_addr, from, access);
   2000     return;
   2001   }
   2002   for (i = 0; i < ai->nparts; ++i) {
   2003     const ABIArgPart* part = &ai->parts[i];
   2004     NativeAllocClass cls =
   2005         part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2006     NativeLoc src;
   2007     if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) {
   2008       src = native_loc_reg(p->type, cls, RV_FA0 + a->next_param_fp++);
   2009     } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) {
   2010       src = native_loc_reg(p->type, cls, RV_A0 + a->next_param_int++);
   2011     } else {
   2012       Reg tmp = (cls == NATIVE_REG_FP) ? RV_FTMP0 : RV_TMP0;
   2013       NativeAddr sa;
   2014       src = native_loc_reg(p->type, cls, tmp);
   2015       a->next_param_stack =
   2016           align_up_u32(a->next_param_stack, rv_part_stack_align(part));
   2017       memset(&sa, 0, sizeof sa);
   2018       sa.base_kind = NATIVE_ADDR_BASE_REG;
   2019       sa.base.reg = RV_S0;
   2020       sa.base_type = p->type;
   2021       sa.offset = rv_s0_off_in_arg(a, a->next_param_stack);
   2022       rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, p->type, part->size));
   2023       a->next_param_stack += rv_part_stack_size(part);
   2024     }
   2025     if (dst.kind == NATIVE_LOC_NONE) {
   2026       /* unused parameter; cursors already advanced */
   2027     } else if (to_reg) {
   2028       NativeLoc d = native_loc_reg(dst.type ? dst.type : p->type,
   2029                                    (NativeAllocClass)dst.cls, (Reg)dst.v.reg);
   2030       if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) &&
   2031             (NativeAllocClass)src.cls == (NativeAllocClass)d.cls))
   2032         rv_move(t, d, src);
   2033     } else {
   2034       rv_store_part(
   2035           t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src,
   2036           0, part->size);
   2037     }
   2038   }
   2039   a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
   2040 }
   2041 
   2042 /* ============================ calls / returns ============================ */
   2043 
   2044 typedef NativeArgMove RvArgMove;
   2045 
   2046 static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
   2047   if (m->is_addr)
   2048     rv_addr_of_loc(t, m->dst, m->src);
   2049   else
   2050     rv_load_part(t, m->dst, m->src, m->src_offset, m->size);
   2051 }
   2052 
   2053 /* Parallel-copy register arg moves via the shared scheduler; cycles break
   2054  * through the int/fp emit scratch (t1 / ft1). */
   2055 static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
   2056                                   u32 n) {
   2057   NativeArgShuffle s;
   2058   if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args");
   2059   memset(&s, 0, sizeof s);
   2060   s.t = t;
   2061   s.emit_one = rv_emit_one_arg_move;
   2062   s.reg_move = rv_move;
   2063   s.scratch[NATIVE_REG_INT] = RV_TMP1;
   2064   s.scratch[NATIVE_REG_FP] = RV_FTMP1;
   2065   native_arg_shuffle(&s, moves, n);
   2066 }
   2067 
   2068 static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   2069                          NativeCallPlan* plan) {
   2070   RvNativeTarget* a = rv_of(t);
   2071   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
   2072   NativeCallPlanRet* rets;
   2073   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2074   memset(plan, 0, sizeof *plan);
   2075   rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL;
   2076   plan->callee = desc->callee;
   2077   plan->rets = rets;
   2078   plan->flags = desc->flags;
   2079   plan->has_sret = abi && abi->has_sret;
   2080   plan->is_variadic = abi && abi->variadic;
   2081   plan->stack_arg_size = rv_call_stack_size(t, desc);
   2082   if (plan->stack_arg_size > a->frame.max_outgoing)
   2083     a->frame.max_outgoing = plan->stack_arg_size;
   2084   /* Indirect callee in an arg register would be clobbered by arg loads. */
   2085   if (plan->callee.kind == NATIVE_LOC_REG &&
   2086       (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
   2087       plan->callee.v.reg >= RV_A0 && plan->callee.v.reg <= RV_A7) {
   2088     NativeLoc scratch =
   2089         native_loc_reg(plan->callee.type, NATIVE_REG_INT, RV_TMP0);
   2090     rv_move(t, scratch, plan->callee);
   2091     plan->callee = scratch;
   2092   }
   2093   {
   2094     /* sret returns pass the hidden destination pointer as the implicit first
   2095      * integer argument (a0), so the real args start at a1. */
   2096     u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
   2097     u32 next_fp = 0, stack = 0, nmoves = 0, i, p;
   2098     int tail = (desc->flags & CG_CALL_TAIL) != 0;
   2099     RvArgMove moves[RV_MAX_REG_ARG_MOVES];
   2100     for (i = 0; i < desc->nargs; ++i) {
   2101       ABIArgInfo tmp;
   2102       const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp);
   2103       int force_stack =
   2104           abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
   2105       if (ai->kind == ABI_ARG_IGNORE) continue;
   2106       if (force_stack) {
   2107         NativeLoc tmpreg =
   2108             native_loc_reg(desc->args[i].type, NATIVE_REG_INT, RV_TMP0);
   2109         u32 n = rv_class_stack_size(ai), off = 0;
   2110         while (off < n) {
   2111           rv_load_part(t, tmpreg, desc->args[i], off, 8);
   2112           rv_store_outgoing_part(t, tail, stack + off, tmpreg, 8);
   2113           off += 8;
   2114         }
   2115         stack += n;
   2116         continue;
   2117       }
   2118       if (ai->kind == ABI_ARG_INDIRECT) {
   2119         if (next_int < 8u) {
   2120           RvArgMove* m = &moves[nmoves++];
   2121           m->dst = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0 + next_int++);
   2122           m->src = desc->args[i];
   2123           m->src_offset = 0;
   2124           m->size = 8;
   2125           m->is_addr = 1;
   2126         } else {
   2127           NativeLoc ptr = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0);
   2128           rv_addr_of_loc(t, ptr, desc->args[i]);
   2129           rv_store_outgoing_part(t, tail, stack, ptr, 8);
   2130           stack += 8u;
   2131         }
   2132         continue;
   2133       }
   2134       for (p = 0; p < ai->nparts; ++p) {
   2135         const ABIArgPart* part = &ai->parts[p];
   2136         NativeAllocClass cls =
   2137             part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2138         if ((cls == NATIVE_REG_FP && next_fp < 8u) ||
   2139             (cls == NATIVE_REG_INT && next_int < 8u)) {
   2140           RvArgMove* m = &moves[nmoves++];
   2141           Reg areg =
   2142               cls == NATIVE_REG_FP ? RV_FA0 + next_fp++ : RV_A0 + next_int++;
   2143           m->dst = native_loc_reg(desc->args[i].type, cls, areg);
   2144           m->src = desc->args[i];
   2145           m->src_offset = part->src_offset;
   2146           m->size = part->size;
   2147           m->is_addr = 0;
   2148         } else {
   2149           Reg tmp = cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0;
   2150           NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp);
   2151           rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
   2152           stack = align_up_u32(stack, rv_part_stack_align(part));
   2153           rv_store_outgoing_part(t, tail, stack, tmpreg, part->size);
   2154           stack += rv_part_stack_size(part);
   2155         }
   2156       }
   2157     }
   2158     rv_emit_reg_arg_moves(t, moves, nmoves);
   2159     if (abi && abi->has_sret && desc->nresults) {
   2160       /* sret pointer goes in a0; arg loads have completed. A tail call forwards
   2161        * the caller's own incoming sret pointer (spilled at entry) so the
   2162        * sibling writes the result into the caller's caller's destination;
   2163        * otherwise pass the address of this call's result slot. */
   2164       NativeLoc a0 = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0);
   2165       if (tail)
   2166         rv_load_part(t, a0, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0, 8);
   2167       else
   2168         rv_addr_of_loc(t, a0, desc->results[0]);
   2169     }
   2170   }
   2171   if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
   2172     u32 nr = 0, ni = 0, nf = 0, p;
   2173     for (p = 0; p < abi->ret.nparts; ++p) {
   2174       const ABIArgPart* part = &abi->ret.parts[p];
   2175       NativeAllocClass cls =
   2176           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2177       KitCgTypeId pty = rv_part_scalar_type(part);
   2178       Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++;
   2179       rets[nr].src = native_loc_reg(pty, cls, rreg);
   2180       rets[nr].dst = desc->results[0];
   2181       if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
   2182         rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame,
   2183                                         (i32)part->src_offset);
   2184       else if (rets[nr].dst.kind == NATIVE_LOC_STACK) {
   2185         rets[nr].dst.v.stack.offset += (i32)part->src_offset;
   2186         rets[nr].dst.type = pty;
   2187       }
   2188       rets[nr].mem = native_mem_for_type(t, pty, part->size);
   2189       nr++;
   2190     }
   2191     plan->nrets = nr;
   2192   } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) {
   2193     plan->nrets = 0;
   2194   } else if (!abi && desc->nresults) {
   2195     rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, RV_A0);
   2196     rets[0].dst = desc->results[0];
   2197     rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0);
   2198     plan->nrets = 1;
   2199   }
   2200 }
   2201 
   2202 /* Emit a sibling (tail) call: tear the frame down to the caller's entry state
   2203  * and jump (no link) to the callee. Outgoing args are already in the arg regs /
   2204  * the caller's incoming-arg window. At -O0 there are no callee-saves, and the
   2205  * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of
   2206  * the not-yet-final frame_size โ€” so no func_end patching is needed. */
   2207 static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) {
   2208   RvNativeTarget* a = rv_of(t);
   2209   MCEmitter* mc = t->mc;
   2210   i32 cfa = (i32)(RV_FRAME_SAVE_SIZE + rv_va_save_sz(a));
   2211   int indirect = callee.kind == NATIVE_LOC_REG;
   2212   u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
   2213   u32 n_int = rv_collect_int_saves(a, int_regs);
   2214   u32 n_fp = rv_collect_fp_saves(a, fp_regs);
   2215   i32 i;
   2216   /* Stage an indirect callee into a reserved scratch (t1) BEFORE the teardown:
   2217    * regalloc parks the function pointer in a callee-saved register so it
   2218    * survives arg marshalling, and the callee-save / s0 / ra restores below
   2219    * would otherwise overwrite it. t1 is reserved (never allocable) and
   2220    * untouched by the restore loop (which only uses t0 for far offsets). */
   2221   if (indirect) rv64_emit32(mc, rv_addi(RV_TMP1, loc_reg(callee), 0));
   2222   /* Restore callee-saves before tearing the frame down (O1 path; none at -O0).
   2223    * Their save offsets are s0-relative via rv_save_off, so the restore is
   2224    * frame-size- and teardown-order-independent. */
   2225   for (i = (i32)n_int - 1; i >= 0; --i)
   2226     rv_load_s0(mc, 0, int_regs[i], rv_save_off(a, (u32)i));
   2227   for (i = (i32)n_fp - 1; i >= 0; --i)
   2228     rv_load_s0(mc, 1, fp_regs[i], rv_save_off(a, n_int + (u32)i));
   2229   rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
   2230   rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa));
   2231   rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));
   2232   if (callee.kind == NATIVE_LOC_GLOBAL) {
   2233     u32 pos = mc->pos(mc);
   2234     rv64_emit32(mc, rv_auipc(RV_TMP0, 0));
   2235     rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0));
   2236     mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym,
   2237                       callee.v.global.addend, 0, 0);
   2238   } else if (indirect) {
   2239     rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP1, 0));
   2240   } else {
   2241     rv_panic(a, "unsupported tail call target");
   2242   }
   2243 }
   2244 
   2245 static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
   2246   MCEmitter* mc = t->mc;
   2247   ObjSecId sec = mc->section_id;
   2248   if (plan->flags & CG_CALL_TAIL) {
   2249     rv_emit_tail_site(t, plan->callee);
   2250     return;
   2251   }
   2252   if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
   2253     u32 pos = mc->pos(mc);
   2254     rv64_emit32(mc, rv_auipc(RV_RA, 0));
   2255     rv64_emit32(mc, rv_jalr(RV_RA, RV_RA, 0));
   2256     mc->emit_reloc_at(mc, sec, pos, R_RV_CALL, plan->callee.v.global.sym,
   2257                       plan->callee.v.global.addend, 0, 0);
   2258     return;
   2259   }
   2260   if (plan->callee.kind == NATIVE_LOC_REG) {
   2261     rv64_emit32(mc, rv_jalr(RV_RA, loc_reg(plan->callee), 0));
   2262     return;
   2263   }
   2264   rv_panic(rv_of(t), "unsupported call target");
   2265 }
   2266 
   2267 static void rv_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
   2268                         const NativeLoc* values, u32 nvalues,
   2269                         NativeCallPlanRet** out_rets, u32* out_nrets) {
   2270   RvNativeTarget* a = rv_of(t);
   2271   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
   2272   NativeCallPlanRet* rets = NULL;
   2273   u32 nr = 0;
   2274   if (nvalues > 1u) rv_panic(a, "multiple returns unsupported");
   2275   if (nvalues) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4);
   2276   if (nvalues && abi && abi->ret.kind == ABI_ARG_INDIRECT) {
   2277     KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2278     NativeLoc dstp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
   2279     NativeLoc saved = native_loc_stack(i64t, a->sret_ptr_slot, 0);
   2280     NativeAddr dst_addr, src_addr;
   2281     AggregateAccess access;
   2282     rv_load_part(t, dstp, saved, 0, 8);
   2283     memset(&dst_addr, 0, sizeof dst_addr);
   2284     dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
   2285     dst_addr.base.reg = RV_TMP1;
   2286     dst_addr.base_type = values[0].type;
   2287     src_addr = rv_loc_addr(a, values[0], 0);
   2288     src_addr.base_type = values[0].type;
   2289     memset(&access, 0, sizeof access);
   2290     access.type = values[0].type;
   2291     access.size = (u32)cg_type_size(t->c, values[0].type);
   2292     access.align = native_type_align(t, values[0].type);
   2293     rv_copy_bytes(t, dst_addr, src_addr, access);
   2294     *out_rets = NULL;
   2295     *out_nrets = 0;
   2296     return;
   2297   }
   2298   if (nvalues && abi && abi->ret.kind == ABI_ARG_DIRECT) {
   2299     u32 ni = 0, nf = 0, p;
   2300     for (p = 0; p < abi->ret.nparts; ++p) {
   2301       const ABIArgPart* part = &abi->ret.parts[p];
   2302       NativeAllocClass cls =
   2303           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2304       KitCgTypeId pty = rv_part_scalar_type(part);
   2305       Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++;
   2306       rets[nr].src = values[0];
   2307       if (rets[nr].src.kind == NATIVE_LOC_FRAME)
   2308         rets[nr].src =
   2309             native_loc_stack(pty, values[0].v.frame, (i32)part->src_offset);
   2310       else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
   2311         rets[nr].src.v.stack.offset += (i32)part->src_offset;
   2312         rets[nr].src.type = pty;
   2313       }
   2314       rets[nr].dst = native_loc_reg(pty, cls, rreg);
   2315       rets[nr].mem = native_mem_for_type(t, pty, part->size);
   2316       nr++;
   2317     }
   2318   } else if (nvalues) {
   2319     rets[0].src = values[0];
   2320     rets[0].dst = native_loc_reg(values[0].type, NATIVE_REG_INT, RV_A0);
   2321     rets[0].mem = native_mem_for_type(t, values[0].type, 0);
   2322     nr = 1;
   2323   }
   2324   *out_rets = rets;
   2325   *out_nrets = nr;
   2326 }
   2327 
   2328 static void rv_ret(NativeTarget* t) {
   2329   RvNativeTarget* a = rv_of(t);
   2330   rv_jump(t, a->epilogue_label);
   2331 }
   2332 
   2333 /* ============================ alloca ============================ */
   2334 
   2335 static void rv_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
   2336                       u32 align) {
   2337   RvNativeTarget* a = rv_of(t);
   2338   MCEmitter* mc = t->mc;
   2339   u32 rsz = loc_reg(size);
   2340   u32 rd = loc_reg(dst);
   2341   u32 al = align ? align : 16u;
   2342   if (al < 16u) al = 16u;
   2343   /* round up: t0 = (size + (al-1)) & ~(al-1) */
   2344   rv64_emit32(mc, rv_addi(RV_TMP0, rsz, (i32)(al - 1u)));
   2345   rv_emit_load_imm(mc, 1, RV_TMP1, -(i64)al);
   2346   rv64_emit32(mc, rv_and(RV_TMP0, RV_TMP0, RV_TMP1));
   2347   rv64_emit32(mc, rv_sub(RV_SP, RV_SP, RV_TMP0));
   2348   a->frame.has_alloca = 1;
   2349   /* dst = sp + max_outgoing (patched in func_end) */
   2350   if (a->npatches == a->patches_cap) {
   2351     u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
   2352     RvPatch* nb = arena_zarray(t->c->tu, RvPatch, cap);
   2353     if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
   2354     a->patches = nb;
   2355     a->patches_cap = cap;
   2356   }
   2357   a->patches[a->npatches].kind = RV_PATCH_ALLOCA;
   2358   a->patches[a->npatches].pos = mc->pos(mc);
   2359   a->patches[a->npatches].dst_reg = rd;
   2360   a->npatches++;
   2361   a->nalloca++;
   2362   rv64_emit32(mc, RV_NOP); /* placeholder for addi dst, sp, max_outgoing */
   2363 }
   2364 
   2365 /* ============================ TLS / bitfield / atomics
   2366  * ============================ */
   2367 
   2368 static void rv_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
   2369                            i64 addend) {
   2370   MCEmitter* mc = t->mc;
   2371   u32 sec = mc->section_id;
   2372   u32 rd = loc_reg(dst);
   2373   /* Local-Exec only, matching aa64 (aa_tls_addr_of) and x64 (x64_tls_addr_of):
   2374    * kit links the whole module statically, so every _Thread_local symbol is
   2375    * resolved within the image and TPREL is always valid. An Initial-Exec GOT
   2376    * path (R_RV_TLS_GOT_HI20) used to be emitted for extern-via-GOT symbols
   2377    * under -fPIE (the hosted default), but the linker has no layout/apply for
   2378    * that reloc, so it produced a hard "unsupported reloc kind" link failure
   2379    * rather than a working binary. */
   2380   /* lui t0, %tprel_hi(sym); add t0, tp, t0; addi dst, t0, %tprel_lo(sym). */
   2381   {
   2382     u32 hp = mc->pos(mc);
   2383     rv64_emit32(mc, rv_lui(RV_TMP0, 0));
   2384     mc->emit_reloc_at(mc, sec, hp, R_RV_TPREL_HI20, sym, addend, 0, 0);
   2385     rv64_emit32(mc, rv_add(RV_TMP0, RV_TP, RV_TMP0));
   2386     {
   2387       u32 lp = mc->pos(mc);
   2388       rv64_emit32(mc, rv_addi(rd, RV_TMP0, 0));
   2389       mc->emit_reloc_at(mc, sec, lp, R_RV_TPREL_LO12_I, sym, addend, 0, 0);
   2390     }
   2391   }
   2392 }
   2393 static void rv_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr ra,
   2394                              BitFieldAccess bf) {
   2395   RvNativeTarget* a = rv_of(t);
   2396   MCEmitter* mc = t->mc;
   2397   u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
   2398   u32 rd = loc_reg(dst);
   2399   u32 base;
   2400   i32 off;
   2401   u32 lsb = bf.bit_offset;
   2402   u32 width = bf.bit_width ? bf.bit_width : 1u;
   2403   /* Shift left so the field's MSB lands at bit 63, then shift right to
   2404    * sign/zero extend it down. Use 64-bit shifts throughout. */
   2405   u32 sh_left = 64u - (lsb + width);
   2406   u32 sh_right = 64u - width;
   2407   ra.offset += (i32)bf.storage_offset;
   2408   rv_resolve_mem_addr(a, &ra, &base, &off);
   2409   rv64_emit32(mc, enc_int_load(storage_bytes, 0, rd, base, off));
   2410   rv64_emit32(mc, rv_slli(rd, rd, sh_left));
   2411   if (bf.signed_)
   2412     rv64_emit32(mc, rv_srai(rd, rd, sh_right));
   2413   else
   2414     rv64_emit32(mc, rv_srli(rd, rd, sh_right));
   2415 }
   2416 static void rv_bitfield_store(NativeTarget* t, NativeAddr ra, NativeLoc src,
   2417                               BitFieldAccess bf) {
   2418   RvNativeTarget* a = rv_of(t);
   2419   MCEmitter* mc = t->mc;
   2420   u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
   2421   u32 src_reg = loc_reg(src);
   2422   u32 base;
   2423   i32 off;
   2424   u32 lsb = bf.bit_offset;
   2425   u32 width = bf.bit_width ? bf.bit_width : 1u;
   2426   u64 ones = width >= 64u ? ~(u64)0 : (((u64)1 << width) - 1u);
   2427   u64 mask_in = ones << lsb;
   2428   ra.offset += (i32)bf.storage_offset;
   2429   /* Resolve the field address; rv_resolve_mem_addr may use RV_TMP0/RV_TMP1, so
   2430    * stabilize the base into RV_TMP1 before consuming the scratch temps. */
   2431   rv_resolve_mem_addr(a, &ra, &base, &off);
   2432   if (base != RV_S0 && base != RV_TMP1) {
   2433     rv_emit_addr_adjust(mc, RV_TMP1, base, off);
   2434     base = RV_TMP1;
   2435     off = 0;
   2436   } else if (base == RV_TMP1 && off != 0) {
   2437     rv_emit_addr_adjust(mc, RV_TMP1, RV_TMP1, off);
   2438     off = 0;
   2439   }
   2440   /* word in RV_TMP2; merged via RV_TMP0 (clear mask, then shifted src). */
   2441   rv64_emit32(mc, enc_int_load(storage_bytes, 0, RV_TMP2, base, off));
   2442   rv_emit_load_imm(mc, 1, RV_TMP0, (i64)~mask_in);
   2443   rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP0));
   2444   rv_emit_load_imm(mc, 1, RV_TMP0, (i64)ones);
   2445   rv64_emit32(mc, rv_and(RV_TMP0, src_reg, RV_TMP0));
   2446   if (lsb) rv64_emit32(mc, rv_slli(RV_TMP0, RV_TMP0, lsb));
   2447   rv64_emit32(mc, rv_or(RV_TMP2, RV_TMP2, RV_TMP0));
   2448   rv64_emit32(mc, enc_int_store(storage_bytes, RV_TMP2, base, off));
   2449 }
   2450 static int rv_order_acquire(KitCgMemOrder o) {
   2451   return o == KIT_CG_MO_CONSUME || o == KIT_CG_MO_ACQUIRE ||
   2452          o == KIT_CG_MO_ACQ_REL || o == KIT_CG_MO_SEQ_CST;
   2453 }
   2454 static int rv_order_release(KitCgMemOrder o) {
   2455   return o == KIT_CG_MO_RELEASE || o == KIT_CG_MO_ACQ_REL ||
   2456          o == KIT_CG_MO_SEQ_CST;
   2457 }
   2458 
   2459 /* Materialize the atomic operand address into RV_TMP0 (a bare pointer, since
   2460  * LR/SC and AMO take a base register with no offset) and return it. */
   2461 static u32 rv_atomic_addr_reg(RvNativeTarget* a, NativeAddr addr) {
   2462   NativeLoc dst =
   2463       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0);
   2464   rv_load_addr(&a->base, dst, addr);
   2465   return RV_TMP0;
   2466 }
   2467 
   2468 static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
   2469                            MemAccess mem, KitCgMemOrder mo) {
   2470   RvNativeTarget* a = rv_of(t);
   2471   MCEmitter* mc = t->mc;
   2472   u32 sf =
   2473       (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u;
   2474   u32 base = rv_atomic_addr_reg(a, addr);
   2475   if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw());
   2476   if (rv_order_acquire(mo)) {
   2477     /* lr.w/d as an ordered load (aq=1). */
   2478     rv64_emit32(mc, sf ? rv_lr_d(loc_reg(dst), base, 1, 0)
   2479                        : rv_lr_w(loc_reg(dst), base, 1, 0));
   2480   } else {
   2481     rv64_emit32(
   2482         mc, enc_int_load(mem.size ? mem.size : native_type_size(t, dst.type), 0,
   2483                          loc_reg(dst), base, 0));
   2484   }
   2485 }
   2486 
   2487 static void rv_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
   2488                             MemAccess mem, KitCgMemOrder mo) {
   2489   RvNativeTarget* a = rv_of(t);
   2490   MCEmitter* mc = t->mc;
   2491   u32 sz = mem.size ? mem.size : native_type_size(t, src.type);
   2492   /* RV_TMP0 holds the address; never collides with src (an allocable reg). */
   2493   u32 base = rv_atomic_addr_reg(a, addr);
   2494   if (rv_order_release(mo)) rv64_emit32(mc, rv_fence_rw_rw());
   2495   rv64_emit32(mc, enc_int_store(sz, loc_reg(src), base, 0));
   2496   if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw());
   2497 }
   2498 
   2499 static void rv_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst,
   2500                           NativeAddr addr, NativeLoc val, MemAccess mem,
   2501                           KitCgMemOrder mo) {
   2502   RvNativeTarget* a = rv_of(t);
   2503   MCEmitter* mc = t->mc;
   2504   u32 sf =
   2505       (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u;
   2506   u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */
   2507   u32 vreg = loc_reg(val);
   2508   u32 rd = loc_reg(dst);
   2509   u32 aq = (u32)rv_order_acquire(mo);
   2510   u32 rl = (u32)rv_order_release(mo);
   2511   MCLabel retry = mc->label_new(mc);
   2512   /* LR/SC loop: dst = *base; new = dst op val; sc new; retry on failure.
   2513    * RV_TMP1 carries the SC status, RV_TMP3 the computed new value. */
   2514   mc->label_place(mc, retry);
   2515   rv64_emit32(mc, sf ? rv_lr_d(rd, base, aq, 0) : rv_lr_w(rd, base, aq, 0));
   2516   switch (op) {
   2517     case KIT_CG_ATOMIC_XCHG:
   2518       rv64_emit32(mc, rv_addi(RV_TMP3, vreg, 0));
   2519       break;
   2520     case KIT_CG_ATOMIC_ADD:
   2521       rv64_emit32(mc,
   2522                   sf ? rv_add(RV_TMP3, rd, vreg) : rv_addw(RV_TMP3, rd, vreg));
   2523       break;
   2524     case KIT_CG_ATOMIC_SUB:
   2525       rv64_emit32(mc,
   2526                   sf ? rv_sub(RV_TMP3, rd, vreg) : rv_subw(RV_TMP3, rd, vreg));
   2527       break;
   2528     case KIT_CG_ATOMIC_AND:
   2529       rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg));
   2530       break;
   2531     case KIT_CG_ATOMIC_OR:
   2532       rv64_emit32(mc, rv_or(RV_TMP3, rd, vreg));
   2533       break;
   2534     case KIT_CG_ATOMIC_XOR:
   2535       rv64_emit32(mc, rv_xor(RV_TMP3, rd, vreg));
   2536       break;
   2537     case KIT_CG_ATOMIC_NAND:
   2538       rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg));
   2539       rv64_emit32(mc, rv_xori(RV_TMP3, RV_TMP3, -1));
   2540       break;
   2541     default:
   2542       rv_panic(a, "unsupported atomic rmw op");
   2543   }
   2544   rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, RV_TMP3, 0, rl)
   2545                      : rv_sc_w(RV_TMP1, base, RV_TMP3, 0, rl));
   2546   rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0));
   2547   mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0);
   2548 }
   2549 
   2550 static void rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
   2551                           NativeAddr addr, NativeLoc expected,
   2552                           NativeLoc desired, MemAccess mem,
   2553                           KitCgMemOrder success, KitCgMemOrder failure) {
   2554   RvNativeTarget* a = rv_of(t);
   2555   MCEmitter* mc = t->mc;
   2556   u32 sf =
   2557       (mem.size ? mem.size : native_type_size(t, prior.type)) == 8u ? 1u : 0u;
   2558   u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */
   2559   u32 rprior = loc_reg(prior);
   2560   u32 rexp = loc_reg(expected);
   2561   u32 rdes = loc_reg(desired);
   2562   u32 rok = loc_reg(ok);
   2563   u32 aq = (u32)rv_order_acquire(success);
   2564   u32 rl = (u32)rv_order_release(success);
   2565   MCLabel retry = mc->label_new(mc);
   2566   MCLabel fail = mc->label_new(mc);
   2567   MCLabel done = mc->label_new(mc);
   2568   (void)failure;
   2569   mc->label_place(mc, retry);
   2570   rv64_emit32(mc,
   2571               sf ? rv_lr_d(rprior, base, aq, 0) : rv_lr_w(rprior, base, aq, 0));
   2572   /* if (prior != expected) -> fail */
   2573   rv64_emit32(mc, rv_bne(rprior, rexp, 0));
   2574   mc->emit_label_ref(mc, fail, R_RV_BRANCH, 4, 0);
   2575   /* sc.w/d status, desired, (base); retry on failure. */
   2576   rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, rdes, 0, rl)
   2577                      : rv_sc_w(RV_TMP1, base, rdes, 0, rl));
   2578   rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0));
   2579   mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0);
   2580   /* ok = 1; jump done. */
   2581   rv_emit_load_imm(mc, 0, rok, 1);
   2582   rv64_emit32(mc, rv_jal(RV_ZERO, 0));
   2583   mc->emit_label_ref(mc, done, R_RV_JAL, 4, 0);
   2584   mc->label_place(mc, fail);
   2585   rv_emit_load_imm(mc, 0, rok, 0);
   2586   mc->label_place(mc, done);
   2587 }
   2588 
   2589 static void rv_fence(NativeTarget* t, KitCgMemOrder mo) {
   2590   if (mo == KIT_CG_MO_RELAXED) return;
   2591   rv64_emit32(t->mc, rv_fence_rw_rw());
   2592 }
   2593 /* ---- variadics (LP64D ABI_VA_LIST_POINTER) ----
   2594  * va_list is a single void* to the next argument slot. The prologue spilled
   2595  * unconsumed a-regs into the 64-byte save area at [s0+16); incoming stack args
   2596  * follow contiguously, so a uniform 8-byte stride covers both. `ap` is a
   2597  * NativeAddr that addresses the va_list object itself. */
   2598 
   2599 static void rv_va_start_core(RvNativeTarget* a, NativeAddr ap) {
   2600   NativeTarget* t = &a->base;
   2601   MCEmitter* mc = t->mc;
   2602   ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
   2603   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2604   if (vai.kind != ABI_VA_LIST_POINTER)
   2605     rv_panic(a, "unsupported va_list layout");
   2606   if (!a->is_variadic) rv_panic(a, "va_start: function not variadic");
   2607   /* *ap = s0 + 16 + next_param_int*8 (skip past named-int save slots). */
   2608   rv64_emit32(mc, rv_addi(RV_TMP1, RV_S0, 16 + (i32)(a->next_param_int * 8u)));
   2609   rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1), ap,
   2610               native_mem_for_type(t, i64t, 8));
   2611 }
   2612 
   2613 static void rv_va_arg_core(RvNativeTarget* a, NativeLoc dst, NativeAddr ap,
   2614                            KitCgTypeId type) {
   2615   NativeTarget* t = &a->base;
   2616   MCEmitter* mc = t->mc;
   2617   ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
   2618   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2619   u32 sz = native_type_size(t, type);
   2620   NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
   2621   NativeAddr from;
   2622   if (vai.kind != ABI_VA_LIST_POINTER)
   2623     rv_panic(a, "unsupported va_list layout");
   2624   if (dst.kind != NATIVE_LOC_REG) rv_panic(a, "va_arg destination must be reg");
   2625   /* cur = *ap; load value from [cur]; *ap = cur + 8 (each slot is 8 bytes). */
   2626   rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, 8));
   2627   memset(&from, 0, sizeof from);
   2628   from.base_kind = NATIVE_ADDR_BASE_REG;
   2629   from.base.reg = RV_TMP1;
   2630   from.base_type = type;
   2631   if (native_loc_is_fp(dst)) {
   2632     /* Variadic FP args sit in the integer save area as their bit pattern;
   2633      * load into RV_TMP2 and bitcast into the FPR. */
   2634     NativeLoc itmp = native_loc_reg(type, NATIVE_REG_INT, RV_TMP2);
   2635     rv_emit_mem(a, 1, itmp, from, native_mem_for_type(t, type, sz));
   2636     rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(loc_reg(dst), RV_TMP2)
   2637                              : rv_fmv_w_x(loc_reg(dst), RV_TMP2));
   2638   } else {
   2639     rv_emit_mem(a, 1, dst, from, native_mem_for_type(t, type, sz));
   2640   }
   2641   rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, 8));
   2642   rv_emit_mem(a, 0, cur, ap, native_mem_for_type(t, i64t, 8));
   2643 }
   2644 
   2645 static void rv_va_copy_core(RvNativeTarget* a, NativeAddr dst_ap,
   2646                             NativeAddr src_ap) {
   2647   NativeTarget* t = &a->base;
   2648   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2649   NativeLoc tmp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
   2650   /* va_list is a single 8-byte pointer. */
   2651   rv_emit_mem(a, 1, tmp, src_ap, native_mem_for_type(t, i64t, 8));
   2652   rv_emit_mem(a, 0, tmp, dst_ap, native_mem_for_type(t, i64t, 8));
   2653 }
   2654 
   2655 static NativeAddr rv_va_addr_from_ptr(NativeLoc ap_ptr) {
   2656   NativeAddr addr;
   2657   memset(&addr, 0, sizeof addr);
   2658   addr.base_kind = NATIVE_ADDR_BASE_REG;
   2659   addr.cls = NATIVE_REG_INT;
   2660   addr.base.reg = ap_ptr.v.reg;
   2661   addr.base_type = ap_ptr.type;
   2662   return addr;
   2663 }
   2664 
   2665 static void rv_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
   2666   rv_va_start_core(rv_of(t), rv_va_addr_from_ptr(ap_ptr));
   2667 }
   2668 static void rv_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
   2669                              KitCgTypeId type) {
   2670   rv_va_arg_core(rv_of(t), dst, rv_va_addr_from_ptr(ap_ptr), type);
   2671 }
   2672 static void rv_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
   2673   (void)t;
   2674   (void)ap_ptr;
   2675 }
   2676 static void rv_va_copy_native(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   2677   rv_va_copy_core(rv_of(t), rv_va_addr_from_ptr(dst), rv_va_addr_from_ptr(src));
   2678 }
   2679 /* Software popcount of RV_TMP1 (already width-normalized) into rd, using
   2680  * RV_TMP1/RV_TMP2/RV_TMP3 as scratch. Mirrors the legacy bit-twiddling. */
   2681 static void rv_emit_popcount(MCEmitter* mc, u32 rd, int is64) {
   2682   rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 1));
   2683   rv_emit_load_imm(mc, 1, RV_TMP3,
   2684                    is64 ? (i64)0x5555555555555555ll : (i64)0x55555555);
   2685   rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP3));
   2686   rv64_emit32(mc, rv_sub(RV_TMP1, RV_TMP1, RV_TMP2));
   2687   rv_emit_load_imm(mc, 1, RV_TMP3,
   2688                    is64 ? (i64)0x3333333333333333ll : (i64)0x33333333);
   2689   rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP1, RV_TMP3));
   2690   rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 2));
   2691   rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3));
   2692   rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2));
   2693   rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 4));
   2694   rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2));
   2695   rv_emit_load_imm(mc, 1, RV_TMP3,
   2696                    is64 ? (i64)0x0f0f0f0f0f0f0f0fll : (i64)0x0f0f0f0f);
   2697   rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3));
   2698   rv_emit_load_imm(mc, 1, RV_TMP3,
   2699                    is64 ? (i64)0x0101010101010101ll : (i64)0x01010101);
   2700   rv64_emit32(mc, rv_mul(RV_TMP1, RV_TMP1, RV_TMP3));
   2701   rv64_emit32(mc, rv_srli(rd, RV_TMP1, is64 ? 56u : 24u));
   2702   /* The 32-bit SWAR sum lives in product bits [24,32); since the multiply is
   2703    * 64-bit, bits [32,64) survive the >>24 and must be masked off. (The 64-bit
   2704    * path's >>56 already isolates the top byte, so it needs no mask.) */
   2705   if (!is64) rv64_emit32(mc, rv_andi(rd, rd, 0xff));
   2706 }
   2707 
   2708 /* Inline byte-granule copy/set between bare base registers (memcpy/memmove/
   2709  * memset intrinsics). dir<0 copies high-to-low (memmove backward). */
   2710 static void rv_intrin_copy(MCEmitter* mc, u32 dr, u32 sr, u32 n, int backward) {
   2711   if (!backward) {
   2712     u32 i = 0;
   2713     while (i + 8u <= n) {
   2714       rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i));
   2715       rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i));
   2716       i += 8u;
   2717     }
   2718     while (i + 4u <= n) {
   2719       rv64_emit32(mc, rv_lwu(RV_TMP3, sr, (i32)i));
   2720       rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i));
   2721       i += 4u;
   2722     }
   2723     while (i + 2u <= n) {
   2724       rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i));
   2725       rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i));
   2726       i += 2u;
   2727     }
   2728     while (i < n) {
   2729       rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i));
   2730       rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i));
   2731       i += 1u;
   2732     }
   2733   } else {
   2734     u32 i = n;
   2735     while (i >= 8u) {
   2736       i -= 8u;
   2737       rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i));
   2738       rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i));
   2739     }
   2740     while (i >= 4u) {
   2741       i -= 4u;
   2742       rv64_emit32(mc, rv_lwu(RV_TMP3, sr, (i32)i));
   2743       rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i));
   2744     }
   2745     while (i >= 2u) {
   2746       i -= 2u;
   2747       rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i));
   2748       rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i));
   2749     }
   2750     while (i >= 1u) {
   2751       i -= 1u;
   2752       rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i));
   2753       rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i));
   2754     }
   2755   }
   2756 }
   2757 
   2758 static void rv_intrinsic(NativeTarget* t, IntrinKind kind,
   2759                          const NativeLoc* dsts, u32 ndst, const NativeLoc* args,
   2760                          u32 narg) {
   2761   RvNativeTarget* a = rv_of(t);
   2762   MCEmitter* mc = t->mc;
   2763   (void)ndst;
   2764   (void)narg;
   2765   switch (kind) {
   2766     case INTRIN_NONE:
   2767       break;
   2768     case INTRIN_EXPECT:
   2769     case INTRIN_ASSUME_ALIGNED: {
   2770       /* dst = val (hint dropped). */
   2771       if (args[0].kind == NATIVE_LOC_IMM)
   2772         rv_emit_load_imm(mc, rv_is_64(t, dsts[0].type) ? 1u : 0u,
   2773                          loc_reg(dsts[0]), args[0].v.imm);
   2774       else
   2775         rv_move(t, dsts[0], args[0]);
   2776       return;
   2777     }
   2778     case INTRIN_PREFETCH:
   2779       return;
   2780     case INTRIN_TRAP:
   2781       rv64_emit32(mc, rv_ebreak());
   2782       return;
   2783     case INTRIN_BSWAP: {
   2784       u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
   2785       switch (width) {
   2786         case 2: {
   2787           u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   2788           /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */
   2789           rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff));
   2790           rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */
   2791           rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8));
   2792           rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2));
   2793           rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8));
   2794           rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff));
   2795           rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3));
   2796           return;
   2797         }
   2798         case 4: {
   2799           u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   2800           rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24));
   2801           rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff));
   2802           rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16));
   2803           rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
   2804           rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8));
   2805           rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
   2806           rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8));
   2807           rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
   2808           rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16));
   2809           rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
   2810           rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
   2811           rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24));
   2812           rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2));
   2813           rv64_emit32(mc, rv_slli(rd, rd, 32));
   2814           rv64_emit32(mc, rv_srli(rd, rd, 32));
   2815           return;
   2816         }
   2817         case 8: {
   2818           u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   2819           int i;
   2820           rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0));
   2821           for (i = 0; i < 8; ++i) {
   2822             int sh = 56 - 8 * i;
   2823             if (i == 0) {
   2824               rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
   2825             } else {
   2826               rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i)));
   2827               rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
   2828             }
   2829             if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh));
   2830             rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
   2831           }
   2832           rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0));
   2833           return;
   2834         }
   2835         default:
   2836           break;
   2837       }
   2838       return;
   2839     }
   2840     case INTRIN_POPCOUNT: {
   2841       u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   2842       int is64 = rv_is_64(t, args[0].type);
   2843       rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0));
   2844       if (!is64) {
   2845         rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
   2846         rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
   2847       }
   2848       rv_emit_popcount(mc, rd, is64);
   2849       return;
   2850     }
   2851     case INTRIN_CTZ: {
   2852       /* ctz(x) = popcount((x & -x) - 1) for x != 0. */
   2853       u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   2854       int is64 = rv_is_64(t, args[0].type);
   2855       rv64_emit32(mc, rv_sub(RV_TMP1, RV_ZERO, rs));
   2856       rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, rs));
   2857       rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, -1));
   2858       if (!is64) {
   2859         rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
   2860         rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
   2861       }
   2862       rv_emit_popcount(mc, rd, is64);
   2863       return;
   2864     }
   2865     case INTRIN_CLZ: {
   2866       /* Fold the high bit downward, then clz = popcount(~folded). */
   2867       u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   2868       int is64 = rv_is_64(t, args[0].type);
   2869       u32 shifts[6] = {1, 2, 4, 8, 16, 32};
   2870       u32 ns = is64 ? 6u : 5u, i;
   2871       rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0));
   2872       if (!is64) {
   2873         rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
   2874         rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
   2875       }
   2876       for (i = 0; i < ns; ++i) {
   2877         rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, shifts[i]));
   2878         rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
   2879       }
   2880       rv64_emit32(mc, rv_xori(RV_TMP1, RV_TMP1, -1));
   2881       if (!is64) {
   2882         rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
   2883         rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
   2884       }
   2885       rv_emit_popcount(mc, rd, is64);
   2886       return;
   2887     }
   2888     case INTRIN_SADD_OVERFLOW:
   2889     case INTRIN_SSUB_OVERFLOW: {
   2890       /* dsts: [val, ovf]. ADD: ovf=((a^r)&(b^r))>>(w-1);
   2891        * SUB: ovf=((a^b)&(a^r))>>(w-1). */
   2892       int is64 = rv_is_64(t, dsts[0].type);
   2893       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   2894       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   2895       u32 sh = is64 ? 63u : 31u;
   2896       if (kind == INTRIN_SADD_OVERFLOW)
   2897         rv64_emit32(mc,
   2898                     is64 ? rv_add(RV_TMP2, ra, rb) : rv_addw(RV_TMP2, ra, rb));
   2899       else
   2900         rv64_emit32(mc,
   2901                     is64 ? rv_sub(RV_TMP2, ra, rb) : rv_subw(RV_TMP2, ra, rb));
   2902       rv64_emit32(mc, rv_xor(RV_TMP3, ra, RV_TMP2)); /* a ^ r */
   2903       if (kind == INTRIN_SADD_OVERFLOW) {
   2904         rv64_emit32(mc, rv_xor(rovf, rb, RV_TMP2)); /* b ^ r */
   2905         rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3));
   2906       } else {
   2907         rv64_emit32(mc, rv_xor(rovf, ra, rb)); /* a ^ b */
   2908         rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3));
   2909       }
   2910       rv64_emit32(mc,
   2911                   is64 ? rv_srli(rovf, rovf, sh) : rv_srliw(rovf, rovf, sh));
   2912       rv64_emit32(mc, rv_andi(rovf, rovf, 1));
   2913       rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
   2914       return;
   2915     }
   2916     case INTRIN_UADD_OVERFLOW:
   2917     case INTRIN_USUB_OVERFLOW: {
   2918       int is64 = rv_is_64(t, dsts[0].type);
   2919       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   2920       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   2921       if (!is64) {
   2922         rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32));
   2923         rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32));
   2924         rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32));
   2925         rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32));
   2926         ra = RV_TMP2;
   2927         rb = RV_TMP3;
   2928       }
   2929       if (kind == INTRIN_UADD_OVERFLOW) {
   2930         if (is64) {
   2931           rv64_emit32(mc, rv_add(RV_TMP2, ra, rb));
   2932           rv64_emit32(mc, rv_sltu(rovf, RV_TMP2, ra));
   2933         } else {
   2934           rv64_emit32(mc, rv_add(RV_TMP2, ra, rb));
   2935           rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32));
   2936           rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   2937           rv64_emit32(mc, rv_addiw(RV_TMP2, RV_TMP2, 0));
   2938         }
   2939       } else {
   2940         rv64_emit32(mc, rv_sltu(rovf, ra, rb));
   2941         rv64_emit32(mc,
   2942                     is64 ? rv_sub(RV_TMP2, ra, rb) : rv_subw(RV_TMP2, ra, rb));
   2943       }
   2944       rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
   2945       return;
   2946     }
   2947     case INTRIN_SMUL_OVERFLOW: {
   2948       int is64 = rv_is_64(t, dsts[0].type);
   2949       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   2950       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   2951       if (is64) {
   2952         rv64_emit32(mc, rv_mul(RV_TMP2, ra, rb));
   2953         rv64_emit32(mc, rv_mulh(RV_TMP3, ra, rb));
   2954         rv64_emit32(mc, rv_srai(rovf, RV_TMP2, 63));
   2955         rv64_emit32(mc, rv_xor(rovf, RV_TMP3, rovf));
   2956         rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   2957         rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
   2958       } else {
   2959         rv64_emit32(mc, rv_addiw(RV_TMP2, ra, 0));
   2960         rv64_emit32(mc, rv_addiw(RV_TMP3, rb, 0));
   2961         rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3));
   2962         rv64_emit32(mc, rv_addiw(RV_TMP3, RV_TMP2, 0));
   2963         rv64_emit32(mc, rv_xor(rovf, RV_TMP2, RV_TMP3));
   2964         rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   2965         rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0));
   2966       }
   2967       return;
   2968     }
   2969     case INTRIN_UMUL_OVERFLOW: {
   2970       int is64 = rv_is_64(t, dsts[0].type);
   2971       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   2972       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   2973       if (is64) {
   2974         rv64_emit32(mc, rv_mulhu(rovf, ra, rb));
   2975         rv64_emit32(mc, rv_mul(rd, ra, rb));
   2976         rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   2977       } else {
   2978         rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32));
   2979         rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32));
   2980         rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32));
   2981         rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32));
   2982         rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3));
   2983         rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32));
   2984         rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   2985         rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0));
   2986       }
   2987       return;
   2988     }
   2989     case INTRIN_MEMCPY:
   2990     case INTRIN_MEMMOVE: {
   2991       u32 dr, sr, n;
   2992       if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
   2993           args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM)
   2994         rv_panic(a, "unsupported memory intrinsic operands");
   2995       if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
   2996         rv_panic(a, "unsupported memory intrinsic size");
   2997       dr = loc_reg(args[0]);
   2998       sr = loc_reg(args[1]);
   2999       n = (u32)args[2].v.imm;
   3000       rv_intrin_copy(mc, dr, sr, n, kind == INTRIN_MEMMOVE);
   3001       return;
   3002     }
   3003     case INTRIN_MEMSET: {
   3004       u32 dr, n, src;
   3005       if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
   3006           args[2].kind != NATIVE_LOC_IMM)
   3007         rv_panic(a, "unsupported memset operands");
   3008       if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
   3009         rv_panic(a, "unsupported memset size");
   3010       dr = loc_reg(args[0]);
   3011       n = (u32)args[2].v.imm;
   3012       if (args[1].kind == NATIVE_LOC_IMM) {
   3013         u32 byte = (u32)(args[1].v.imm & 0xffu);
   3014         if (byte == 0) {
   3015           src = RV_ZERO;
   3016         } else {
   3017           u64 b = byte;
   3018           b |= b << 8;
   3019           b |= b << 16;
   3020           b |= b << 32;
   3021           rv_emit_load_imm(mc, 1, RV_TMP3, (i64)b);
   3022           src = RV_TMP3;
   3023         }
   3024       } else {
   3025         /* Replicate the low byte of a register value across 8 bytes. */
   3026         u32 rb = loc_reg(args[1]);
   3027         rv64_emit32(mc, rv_andi(RV_TMP3, rb, 0xff));
   3028         rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 8));
   3029         rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
   3030         rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 16));
   3031         rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
   3032         rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 32));
   3033         rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
   3034         src = RV_TMP3;
   3035       }
   3036       {
   3037         u32 i = 0;
   3038         while (i + 8u <= n) {
   3039           rv64_emit32(mc, rv_sd(src, dr, (i32)i));
   3040           i += 8u;
   3041         }
   3042         while (i + 4u <= n) {
   3043           rv64_emit32(mc, rv_sw(src, dr, (i32)i));
   3044           i += 4u;
   3045         }
   3046         while (i + 2u <= n) {
   3047           rv64_emit32(mc, rv_sh(src, dr, (i32)i));
   3048           i += 2u;
   3049         }
   3050         while (i < n) {
   3051           rv64_emit32(mc, rv_sb(src, dr, (i32)i));
   3052           i += 1u;
   3053         }
   3054       }
   3055       return;
   3056     }
   3057     case INTRIN_CPU_NOP:
   3058       rv64_emit32(mc, rv_nop());
   3059       return;
   3060     case INTRIN_CPU_YIELD:
   3061       rv64_emit32(mc, rv_pause());
   3062       return;
   3063     case INTRIN_ISB:
   3064       rv64_emit32(mc, rv_fence_i());
   3065       return;
   3066     case INTRIN_DMB:
   3067     case INTRIN_DSB:
   3068       rv64_emit32(mc, rv_fence_rw_rw());
   3069       return;
   3070     case INTRIN_WFI:
   3071       rv64_emit32(mc, rv_wfi());
   3072       return;
   3073     default:
   3074       break;
   3075   }
   3076   rv_panic(a, "unsupported compiler intrinsic");
   3077 }
   3078 /* ============================ inline asm ============================ */
   3079 
/* Report an inline-asm error at `loc` through compiler_panic, prefixing `msg`
 * with "rv64 inline asm: ". Declared _Noreturn. */
_Noreturn static void rv_asm_panic_at(Compiler* c, SrcLoc loc,
                                      const char* msg) {
  compiler_panic(c, loc, "rv64 inline asm: %s", msg);
}
/* Direct-path convenience wrapper: panic with `msg` using the compiler and
 * source location recorded on the direct target `d`. */
_Noreturn static void rv_asm_panic(NativeDirectTarget* d, const char* msg) {
  rv_asm_panic_at(d->base.c, d->loc, msg);
}
   3087 
   3088 /* constraint_body / constraint_early / match_index are shared
   3089  * (cg/native_asm.h). */
   3090 
   3091 /* Build a bound register pseudo-operand in the rv64 inline shape. */
   3092 static void rv_asm_bound_reg(Operand* out, KitCgTypeId type,
   3093                              NativeAllocClass cls, Reg reg) {
   3094   memset(out, 0, sizeof *out);
   3095   out->kind = RV64_INLINE_OPK_REG;
   3096   out->pad[0] =
   3097       (cls == NATIVE_REG_FP) ? RV64_INLINE_OPCLS_FP : RV64_INLINE_OPCLS_INT;
   3098   out->type = type;
   3099   out->v.local = (CGLocal)reg;
   3100 }
   3101 static void rv_asm_bound_mem(Operand* out, KitCgTypeId type, Reg base) {
   3102   memset(out, 0, sizeof *out);
   3103   out->kind = OPK_INDIRECT;
   3104   out->type = type;
   3105   out->v.ind.base = (CGLocal)base;
   3106   out->v.ind.index = CG_LOCAL_NONE;
   3107   out->v.ind.ofs = 0;
   3108 }
   3109 
   3110 /* Parse a clobber register name into (class, reg). Returns 0 for the special
   3111  * "cc"/"memory" clobbers and panics on an unknown register. RV64 dwarf: int
   3112  * x0..x31 = 0..31, fp f0..f31 = 32..63. */
   3113 static int rv_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
   3114                                     NativeAllocClass* cls_out, Reg* reg_out) {
   3115   Slice s = pool_slice(c->global, name);
   3116   char buf[16];
   3117   uint32_t dwarf;
   3118   if (!s.s || !s.len) return 0;
   3119   if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0;
   3120   if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0;
   3121   if (s.len >= sizeof buf) rv_asm_panic_at(c, loc, "clobber name is too long");
   3122   memcpy(buf, s.s, s.len);
   3123   buf[s.len] = '\0';
   3124   if (rv64_register_index(buf, &dwarf) != 0)
   3125     rv_asm_panic_at(c, loc, "unknown clobber register");
   3126   if (dwarf <= 31u) {
   3127     *cls_out = NATIVE_REG_INT;
   3128     *reg_out = (Reg)dwarf;
   3129     return 1;
   3130   }
   3131   if (dwarf >= 32u && dwarf <= 63u) {
   3132     *cls_out = NATIVE_REG_FP;
   3133     *reg_out = (Reg)(dwarf - 32u);
   3134     return 1;
   3135   }
   3136   rv_asm_panic_at(c, loc, "unsupported clobber register");
   3137   return 0;
   3138 }
   3139 
   3140 static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
   3141                                  u32 nclob, u32* int_mask, u32* fp_mask) {
   3142   u32 i;
   3143   *int_mask = 0;
   3144   *fp_mask = 0;
   3145   for (i = 0; i < nclob; ++i) {
   3146     NativeAllocClass cls;
   3147     Reg reg;
   3148     if (!rv_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, &reg)) continue;
   3149     if (cls == NATIVE_REG_INT)
   3150       *int_mask |= 1u << reg;
   3151     else
   3152       *fp_mask |= 1u << reg;
   3153   }
   3154 }
   3155 
   3156 static NativeAllocClass rv_asm_constraint_class(NativeDirectTarget* d,
   3157                                                 const char* body) {
   3158   if (body[0] == 'r') return NATIVE_REG_INT;
   3159   if (body[0] == 'f') return NATIVE_REG_FP;
   3160   rv_asm_panic(d, "constraint is not a register constraint");
   3161   return NATIVE_REG_INT;
   3162 }
   3163 
   3164 static int rv_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg,
   3165                                        const char* constraint,
   3166                                        NativeAsmRegPin* pin) {
   3167   NativeAsmRegPinStatus st =
   3168       native_asm_resolve_pin(d->native, reg, constraint, pin);
   3169   if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0;
   3170   if (st != NATIVE_ASM_REG_PIN_OK)
   3171     rv_asm_panic(d, native_asm_pin_status_message(st));
   3172   return 1;
   3173 }
   3174 
   3175 /* Pick a free register from the arch's caller-saved allocable pools for an
   3176  * asm operand the direct path must self-allocate. */
   3177 static Reg rv_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls,
   3178                             u32* used_int, u32* used_fp) {
   3179   /* int: a0..a7 (10..17) then t-temps that aren't emit scratch. */
   3180   static const Reg int_pool[] = {10u, 11u, 12u, 13u, 14u, 15u,
   3181                                  16u, 17u, 29u, 30u, 31u};
   3182   /* fp: fa0..fa7 (10..17) then ft caller-saved. */
   3183   static const Reg fp_pool[] = {10u, 11u, 12u, 13u, 14u, 15u, 16u, 17u,
   3184                                 4u,  5u,  6u,  7u,  28u, 29u, 30u, 31u};
   3185   const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool;
   3186   u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0])
   3187                                : (u32)(sizeof int_pool / sizeof int_pool[0]);
   3188   u32* used = cls == NATIVE_REG_FP ? used_fp : used_int;
   3189   u32 i;
   3190   for (i = 0; i < n; ++i) {
   3191     Reg r = pool[i];
   3192     if ((*used & (1u << r)) != 0) continue;
   3193     *used |= 1u << r;
   3194     return r;
   3195   }
   3196   rv_asm_panic(d, "out of registers for asm operands");
   3197   return REG_NONE;
   3198 }
   3199 
   3200 /* Direct (-O0) path: resolve a semantic Operand to a NativeAddr. */
   3201 static NativeAddr rv_direct_addr(NativeDirectTarget* d, Operand op) {
   3202   NativeAddr addr;
   3203   memset(&addr, 0, sizeof addr);
   3204   switch ((OpKind)op.kind) {
   3205     case OPK_LOCAL:
   3206       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3207       addr.base.frame = d->locals[op.v.local - 1u].home;
   3208       addr.base_type = op.type;
   3209       return addr;
   3210     case OPK_INDIRECT:
   3211       addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
   3212       addr.base.frame = d->locals[op.v.ind.base - 1u].home;
   3213       addr.cls = d->locals[op.v.ind.base - 1u].cls;
   3214       addr.base_type = d->locals[op.v.ind.base - 1u].type;
   3215       addr.offset = op.v.ind.ofs;
   3216       return addr;
   3217     default:
   3218       rv_asm_panic(d, "operand is not addressable");
   3219   }
   3220 }
   3221 
   3222 /* Materialize an OPK_INDIRECT (frame-value) base into a register, returning a
   3223  * plain register-based NativeAddr. */
   3224 static NativeAddr rv_direct_materialize_addr(NativeDirectTarget* d,
   3225                                              Operand op) {
   3226   RvNativeTarget* a = rv_of(d->native);
   3227   NativeAddr addr = rv_direct_addr(d, op);
   3228   if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
   3229     NativeLoc base = native_loc_reg(addr.base_type, NATIVE_REG_INT, RV_TMP1);
   3230     NativeAddr load;
   3231     memset(&load, 0, sizeof load);
   3232     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   3233     load.base.frame = addr.base.frame;
   3234     load.base_type = addr.base_type;
   3235     rv_emit_mem(a, 1, base, load,
   3236                 native_mem_for_type(d->native, addr.base_type, 8));
   3237     addr.base_kind = NATIVE_ADDR_BASE_REG;
   3238     addr.base.reg = RV_TMP1;
   3239   }
   3240   return addr;
   3241 }
   3242 
   3243 static void rv_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op,
   3244                                           NativeLoc dst) {
   3245   RvNativeTarget* a = rv_of(d->native);
   3246   NativeAddr addr;
   3247   memset(&addr, 0, sizeof addr);
   3248   switch ((OpKind)op.kind) {
   3249     case OPK_IMM:
   3250       if ((NativeAllocClass)dst.cls != NATIVE_REG_INT)
   3251         rv_asm_panic(d, "floating-point immediate asm input is unsupported");
   3252       d->native->load_imm(d->native, dst, op.v.imm);
   3253       return;
   3254     case OPK_LOCAL:
   3255       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3256       addr.base.frame = d->locals[op.v.local - 1u].home;
   3257       addr.base_type = op.type;
   3258       rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
   3259       return;
   3260     case OPK_GLOBAL:
   3261       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
   3262       addr.base.global.sym = op.v.global.sym;
   3263       addr.base.global.addend = op.v.global.addend;
   3264       addr.base_type = op.type;
   3265       d->native->load_addr(d->native, dst, addr);
   3266       return;
   3267     case OPK_INDIRECT:
   3268       addr = rv_direct_materialize_addr(d, op);
   3269       rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
   3270       return;
   3271   }
   3272   rv_asm_panic(d, "unsupported asm input operand");
   3273 }
   3274 
   3275 static void rv_direct_load_address_to_reg(NativeDirectTarget* d, Operand op,
   3276                                           NativeLoc dst) {
   3277   d->native->load_addr(d->native, dst, rv_direct_addr(d, op));
   3278 }
   3279 
   3280 static void rv_direct_store_reg_to_operand(NativeDirectTarget* d, Operand op,
   3281                                            NativeLoc src) {
   3282   RvNativeTarget* a = rv_of(d->native);
   3283   NativeAddr addr;
   3284   memset(&addr, 0, sizeof addr);
   3285   if (op.kind == OPK_LOCAL) {
   3286     addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3287     addr.base.frame = d->locals[op.v.local - 1u].home;
   3288     addr.base_type = op.type;
   3289   } else {
   3290     addr = rv_direct_materialize_addr(d, op);
   3291   }
   3292   rv_emit_mem(a, 0, src, addr, native_mem_for_type(d->native, op.type, 0));
   3293 }
   3294 
/* Callee-saved registers an asm block clobbers must be spilled/restored around
 * the block (the only ABI duty the allocator cannot discharge itself).
 * One record describes a single saved register. */
typedef struct RvAsmSavedClobber {
  NativeFrameSlot slot; /* save slot; filled in by rv_asm_save_one */
  NativeAllocClass cls; /* register class of `reg` */
  Reg reg;              /* register number within `cls` */
  KitCgTypeId type;     /* type used for the save/restore memory access */
} RvAsmSavedClobber;
   3303 
   3304 static void rv_asm_save_one(RvNativeTarget* a, RvAsmSavedClobber* s) {
   3305   NativeFrameSlotDesc desc;
   3306   NativeAddr addr;
   3307   memset(&desc, 0, sizeof desc);
   3308   desc.type = s->type;
   3309   desc.size = 8;
   3310   desc.align = 8;
   3311   desc.kind = NATIVE_FRAME_SLOT_SAVE;
   3312   s->slot = a->base.frame_slot(&a->base, &desc);
   3313   memset(&addr, 0, sizeof addr);
   3314   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3315   addr.base.frame = s->slot;
   3316   addr.base_type = s->type;
   3317   rv_emit_mem(a, 0, native_loc_reg(s->type, s->cls, s->reg), addr,
   3318               native_mem_for_type(&a->base, s->type, 8));
   3319 }
   3320 static void rv_asm_restore_one(RvNativeTarget* a, const RvAsmSavedClobber* s) {
   3321   NativeAddr addr;
   3322   memset(&addr, 0, sizeof addr);
   3323   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3324   addr.base.frame = s->slot;
   3325   addr.base_type = s->type;
   3326   rv_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr,
   3327               native_mem_for_type(&a->base, s->type, 8));
   3328 }
   3329 
   3330 /* psABI callee-saved: integer s0..s11 (x8,x9,x18..x27), fp fs0..fs11
   3331  * (f8,f9,f18..f27). x8 is the frame pointer and never asm-clobbered. */
   3332 static int rv_reg_is_callee_int(Reg r) {
   3333   return r == 9u || (r >= 18u && r <= 27u);
   3334 }
   3335 static int rv_reg_is_callee_fp(Reg r) {
   3336   return r == 8u || r == 9u || (r >= 18u && r <= 27u);
   3337 }
   3338 
   3339 static RvAsmSavedClobber* rv_asm_save_callee_clobbers(RvNativeTarget* a,
   3340                                                       u32 int_mask, u32 fp_mask,
   3341                                                       u32* nsaved_out) {
   3342   RvAsmSavedClobber* saved =
   3343       arena_zarray(a->base.c->tu, RvAsmSavedClobber, 24u);
   3344   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   3345   KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
   3346   u32 n = 0;
   3347   Reg r;
   3348   for (r = 0; r <= 31u; ++r) {
   3349     if ((int_mask & (1u << r)) == 0 || !rv_reg_is_callee_int(r)) continue;
   3350     saved[n].cls = NATIVE_REG_INT;
   3351     saved[n].reg = r;
   3352     saved[n].type = i64;
   3353     rv_asm_save_one(a, &saved[n++]);
   3354   }
   3355   for (r = 0; r <= 31u; ++r) {
   3356     if ((fp_mask & (1u << r)) == 0 || !rv_reg_is_callee_fp(r)) continue;
   3357     saved[n].cls = NATIVE_REG_FP;
   3358     saved[n].reg = r;
   3359     saved[n].type = f64;
   3360     rv_asm_save_one(a, &saved[n++]);
   3361   }
   3362   *nsaved_out = n;
   3363   return saved;
   3364 }
   3365 
   3366 /* ---- NativeTarget (optimizer) asm hook ----
   3367  * The optimizer pre-allocated every operand register and arranged surrounding
   3368  * data flow, so this binds pre-allocated registers to the template and only
   3369  * materializes memory-operand bases into the reserved scratch + spills the
   3370  * callee-saved registers the asm clobbers. */
   3371 
   3372 static NativeAddr rv_asm_loc_to_addr(RvNativeTarget* a, SrcLoc loc,
   3373                                      NativeLoc src) {
   3374   NativeAddr addr;
   3375   memset(&addr, 0, sizeof addr);
   3376   addr.base_type = src.type;
   3377   switch ((NativeLocKind)src.kind) {
   3378     case NATIVE_LOC_FRAME:
   3379       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3380       addr.base.frame = src.v.frame;
   3381       return addr;
   3382     case NATIVE_LOC_ADDR:
   3383       return src.v.addr;
   3384     case NATIVE_LOC_GLOBAL:
   3385       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
   3386       addr.base.global.sym = src.v.global.sym;
   3387       addr.base.global.addend = src.v.global.addend;
   3388       return addr;
   3389     case NATIVE_LOC_REG:
   3390       addr.base_kind = NATIVE_ADDR_BASE_REG;
   3391       addr.cls = NATIVE_REG_INT;
   3392       addr.base.reg = src.v.reg;
   3393       return addr;
   3394     default:
   3395       rv_asm_panic_at(a->base.c, loc, "unsupported memory asm operand");
   3396   }
   3397 }
   3398 
   3399 /* Resolve a memory-constraint operand to a single base register with zero
   3400  * offset, folding any frame/global/offset into a reserved scratch register. */
   3401 static Reg rv_asm_native_mem_base(RvNativeTarget* a, SrcLoc loc, NativeLoc src,
   3402                                   u32* ntmp) {
   3403   NativeAddr addr = rv_asm_loc_to_addr(a, loc, src);
   3404   u32 base;
   3405   i32 off;
   3406   Reg dst;
   3407   if (addr.index_kind != NATIVE_ADDR_INDEX_NONE)
   3408     rv_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported");
   3409   rv_resolve_mem_addr(a, &addr, &base, &off);
   3410   if (off == 0 && base != RV_TMP0 && base != RV_TMP1) return (Reg)base;
   3411   if (*ntmp >= 2u)
   3412     rv_asm_panic_at(a->base.c, loc, "too many memory asm operands");
   3413   dst = (*ntmp == 0u) ? RV_TMP0 : RV_TMP1;
   3414   (*ntmp)++;
   3415   rv_emit_addr_adjust(a->base.mc, dst, base, off);
   3416   return dst;
   3417 }
   3418 
   3419 static void rv_asm_bind_native(RvNativeTarget* a, SrcLoc loc, Operand* out,
   3420                                const char* constraint, KitCgTypeId type,
   3421                                NativeLoc src, u32* ntmp) {
   3422   const char* body = native_asm_constraint_body(constraint);
   3423   if (body[0] == 'r' || body[0] == 'f') {
   3424     NativeAllocClass cls = (body[0] == 'f') ? NATIVE_REG_FP : NATIVE_REG_INT;
   3425     if (src.kind != NATIVE_LOC_REG)
   3426       rv_asm_panic_at(a->base.c, loc, "register asm operand not in a register");
   3427     rv_asm_bound_reg(out, type, cls, (Reg)src.v.reg);
   3428   } else if (body[0] == 'i') {
   3429     if (src.kind != NATIVE_LOC_IMM)
   3430       rv_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate");
   3431     memset(out, 0, sizeof *out);
   3432     out->kind = OPK_IMM;
   3433     out->type = type;
   3434     out->v.imm = src.v.imm;
   3435   } else if (body[0] == 'm') {
   3436     rv_asm_bound_mem(out, type, rv_asm_native_mem_base(a, loc, src, ntmp));
   3437   } else {
   3438     rv_asm_panic_at(a->base.c, loc, "unsupported asm constraint");
   3439   }
   3440 }
   3441 
   3442 static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
   3443                                 const AsmConstraint* outs, u32 nout,
   3444                                 NativeLoc* out_locs, const AsmConstraint* ins,
   3445                                 u32 nin, const NativeLoc* in_locs,
   3446                                 const Sym* clobbers, u32 nclob) {
   3447   RvNativeTarget* a = rv_of(t);
   3448   Compiler* c = t->c;
   3449   SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   3450   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   3451   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
   3452   u32 ntmp = 0, i;
   3453   Rv64Asm* asmh;
   3454 
   3455   for (i = 0; i < nout; ++i) {
   3456     KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
   3457     rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
   3458                        &ntmp);
   3459   }
   3460   for (i = 0; i < nin; ++i) {
   3461     const char* body = native_asm_constraint_body(ins[i].str);
   3462     int matched = native_asm_match_index(body);
   3463     KitCgTypeId type;
   3464     NativeLoc inloc;
   3465     if (matched >= 0) {
   3466       if ((u32)matched >= nout)
   3467         rv_asm_panic_at(c, loc, "matching constraint out of range");
   3468       bound_ins[i] = bound_outs[matched];
   3469       continue;
   3470     }
   3471     type = ins[i].type ? ins[i].type : in_locs[i].type;
   3472     inloc = in_locs[i];
   3473     /* A register-constrained input that lives in a frame slot (address-taken
   3474      * local) must be loaded into a reserved scratch first. */
   3475     if (body[0] == 'r' && inloc.kind != NATIVE_LOC_REG) {
   3476       Reg r;
   3477       if (ntmp >= 2u) rv_asm_panic_at(c, loc, "too many memory asm operands");
   3478       r = (ntmp == 0u) ? RV_TMP0 : RV_TMP1;
   3479       ntmp++;
   3480       inloc = native_loc_reg(type, NATIVE_REG_INT, r);
   3481       rv_emit_mem(a, 1, inloc, rv_asm_loc_to_addr(a, loc, in_locs[i]),
   3482                   native_mem_for_type(t, type, native_type_size(t, type)));
   3483     }
   3484     rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
   3485   }
   3486 
   3487   /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
   3488    * masks and rv_known_callee_saves folded the callee-saved ones into the
   3489    * function's saved set, so the prologue/epilogue already preserve them. */
   3490   asmh = rv64_asm_open(c);
   3491   rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
   3492                    nclob);
   3493   rv64_asm_run_template(asmh, t->mc, tmpl);
   3494   rv64_asm_close(asmh);
   3495 }
   3496 /* file_scope_asm + finalize are shared (cg/native_asm.h). */
   3497 
   3498 static void rv_trap(NativeTarget* t) { rv64_emit32(t->mc, rv_ebreak()); }
   3499 static void rv_set_loc(NativeTarget* t, SrcLoc loc) {
   3500   rv_of(t)->loc = loc;
   3501   if (t->mc->set_loc) t->mc->set_loc(t->mc, loc);
   3502 }
   3503 
   3504 /* ============================ construction ============================ */
   3505 
   3506 NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj,
   3507                                      MCEmitter* mc) {
   3508   RvNativeTarget* a = arena_znew(c->tu, RvNativeTarget);
   3509   NativeTarget* t;
   3510   if (!a) return NULL;
   3511   t = &a->base;
   3512   t->c = c;
   3513   t->obj = obj;
   3514   t->mc = mc;
   3515   native_frame_init(&a->frame, c);
   3516   t->regs = &rv_reg_info;
   3517   t->class_for_type = native_class_for_type_fp_le8;
   3518   t->imm_legal = rv_imm_legal;
   3519   t->addr_legal = rv_addr_legal;
   3520   t->func_begin = rv_func_begin;
   3521   t->func_begin_known_frame = rv_func_begin_known_frame;
   3522   t->note_frame_state = NULL;
   3523   /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
   3524    * set; rv_func_begin_known_frame derives the records from the masks. */
   3525   t->reserve_callee_saves = rv_reserve_callee_saves;
   3526   t->signature_stack_bytes = rv_signature_stack_bytes;
   3527   t->call_stack_bytes = rv_call_stack_bytes;
   3528   t->has_store_zero_reg = 1;
   3529   t->store_zero_reg = RV_ZERO;
   3530   t->func_end = rv_func_end;
   3531   t->frame_slot = rv_frame_slot;
   3532   t->frame_slot_debug_loc = rv_frame_slot_debug_loc;
   3533   t->bind_param = rv_bind_native_param;
   3534   t->label_new = rv_label_new;
   3535   t->label_place = rv_label_place;
   3536   t->jump = rv_jump;
   3537   t->cmp_branch = rv_cmp_branch;
   3538   t->indirect_branch = rv_indirect_branch;
   3539   t->load_label_addr = rv_load_label_addr;
   3540   t->move = rv_move;
   3541   t->load_imm = rv_load_imm;
   3542   t->load_const = rv_load_const;
   3543   t->load_addr = rv_load_addr;
   3544   t->load = rv_load;
   3545   t->store = rv_store;
   3546   t->tls_addr_of = rv_tls_addr_of;
   3547   t->copy_bytes = rv_copy_bytes;
   3548   t->set_bytes = rv_set_bytes;
   3549   t->bitfield_load = rv_bitfield_load;
   3550   t->bitfield_store = rv_bitfield_store;
   3551   t->binop = rv_binop;
   3552   t->unop = rv_unop;
   3553   t->cmp = rv_cmp;
   3554   t->convert = rv_convert;
   3555   t->alloca_ = rv_alloca;
   3556   t->spill = rv_spill;
   3557   t->reload = rv_reload;
   3558   t->plan_call = rv_plan_call;
   3559   t->emit_call = rv_emit_call;
   3560   t->plan_ret = rv_plan_ret;
   3561   t->ret = rv_ret;
   3562   t->atomic_load = rv_atomic_load;
   3563   t->atomic_store = rv_atomic_store;
   3564   t->atomic_rmw = rv_atomic_rmw;
   3565   t->atomic_cas = rv_atomic_cas;
   3566   t->fence = rv_fence;
   3567   t->va_start_ = rv_va_start_native;
   3568   t->va_arg_ = rv_va_arg_native;
   3569   t->va_end_ = rv_va_end_native;
   3570   t->va_copy_ = rv_va_copy_native;
   3571   t->intrinsic = rv_intrinsic;
   3572   t->asm_block = rv_asm_block_native;
   3573   t->file_scope_asm = native_file_scope_asm;
   3574   t->trap = rv_trap;
   3575   t->set_loc = rv_set_loc;
   3576   t->finalize = native_finalize;
   3577   return t;
   3578 }
   3579 
   3580 /* ============================ NativeOps (-O0) ============================ */
   3581 
   3582 static void rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
   3583                           CGLocal local, NativeDirectLocal* l) {
   3584   NativeLoc dst;
   3585   (void)local;
   3586   memset(&dst, 0, sizeof dst);
   3587   dst.kind = NATIVE_LOC_FRAME;
   3588   dst.type = p->type;
   3589   dst.v.frame = l->home;
   3590   rv_bind_native_param(d->native, p, dst);
   3591 }
   3592 
   3593 /* A sibling call is realizable when its outgoing stack-argument area fits the
   3594  * window the caller itself received (so the args land in the caller's incoming
   3595  * slots without overflowing into the caller's caller's frame). Register-only
   3596  * calls (the common case) always qualify. Mirrors aa64's aa_no_tail. */
   3597 static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
   3598   RvNativeTarget* a = rv_of(d->native);
   3599   NativeCallDesc nd;
   3600   NativeLoc* args = NULL;
   3601   NativeLoc* results = NULL;
   3602   u32 i, stack;
   3603   if (a->frame.ncallee_saves)
   3604     return "rv64 tail call: callee-saved registers in use";
   3605   memset(&nd, 0, sizeof nd);
   3606   if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
   3607   if (call->nresults)
   3608     results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults);
   3609   for (i = 0; i < call->nargs; ++i) {
   3610     args[i].kind = NATIVE_LOC_FRAME;
   3611     args[i].type = d->locals[call->args[i] - 1u].type;
   3612     args[i].cls = d->locals[call->args[i] - 1u].cls;
   3613     args[i].v.frame = d->locals[call->args[i] - 1u].home;
   3614   }
   3615   for (i = 0; i < call->nresults; ++i) {
   3616     results[i].kind = NATIVE_LOC_FRAME;
   3617     results[i].type = d->locals[call->results[i] - 1u].type;
   3618     results[i].cls = d->locals[call->results[i] - 1u].cls;
   3619     results[i].v.frame = d->locals[call->results[i] - 1u].home;
   3620   }
   3621   nd.fn_type = call->fn_type;
   3622   nd.args = args;
   3623   nd.results = results;
   3624   nd.nargs = call->nargs;
   3625   nd.nresults = call->nresults;
   3626   stack = rv_call_stack_size(d->native, &nd);
   3627   if (stack > a->incoming_stack_size)
   3628     return "rv64 tail call: stack argument area too small";
   3629   return NULL;
   3630 }
   3631 
   3632 /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`
   3633  * and return a register-based NativeAddr. An OPK_LOCAL holds the va_list object
   3634  * itself, so we take its frame address; an OPK_INDIRECT holds the pointer in
   3635  * memory and must be loaded. The va cores use TMP1/TMP2 internally, so `reg`
   3636  * must be distinct from those (callers pass TMP0 / TMP3). */
   3637 /* ap_addr is the pointer value &ap (the va_list object's address). For an
   3638  * OPK_LOCAL the local HOLDS that pointer, so load its home value; an
   3639  * OPK_INDIRECT names *(base+ofs), whose address base+ofs is the pointer.
   3640  * Mirrors aa64's aa_direct_pointer_addr. */
   3641 static NativeAddr rv_direct_pointer_addr(NativeDirectTarget* d, Operand op) {
   3642   RvNativeTarget* a = rv_of(d->native);
   3643   NativeAddr addr;
   3644   memset(&addr, 0, sizeof addr);
   3645   if (op.kind == OPK_LOCAL) {
   3646     NativeLoc base = native_loc_reg(op.type, NATIVE_REG_INT, RV_TMP1);
   3647     NativeAddr load;
   3648     memset(&load, 0, sizeof load);
   3649     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   3650     load.base.frame = d->locals[op.v.local - 1u].home;
   3651     load.base_type = op.type;
   3652     rv_emit_mem(a, 1, base, load, native_mem_for_type(d->native, op.type, 8));
   3653     addr.base_kind = NATIVE_ADDR_BASE_REG;
   3654     addr.base.reg = RV_TMP1;
   3655     addr.base_type = op.type;
   3656     return addr;
   3657   }
   3658   return rv_direct_materialize_addr(d, op);
   3659 }
   3660 
   3661 static NativeAddr rv_direct_va_base(NativeDirectTarget* d, Operand ap_addr,
   3662                                     Reg reg) {
   3663   NativeLoc dst =
   3664       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg);
   3665   NativeAddr addr;
   3666   d->native->load_addr(d->native, dst, rv_direct_pointer_addr(d, ap_addr));
   3667   memset(&addr, 0, sizeof addr);
   3668   addr.base_kind = NATIVE_ADDR_BASE_REG;
   3669   addr.cls = NATIVE_REG_INT;
   3670   addr.base.reg = reg;
   3671   addr.base_type = builtin_id(KIT_CG_BUILTIN_I64);
   3672   return addr;
   3673 }
   3674 
   3675 static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) {
   3676   rv_va_start_core(rv_of(d->native), rv_direct_va_base(d, ap_addr, RV_TMP3));
   3677 }
   3678 static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr,
   3679                        KitCgTypeId type) {
   3680   RvNativeTarget* a = rv_of(d->native);
   3681   int is_fp = cg_type_is_float(d->base.c, type);
   3682   NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT,
   3683                                  is_fp ? RV_FTMP0 : RV_TMP0);
   3684   NativeAddr dst_addr;
   3685   rv_va_arg_core(a, res, rv_direct_va_base(d, ap_addr, RV_TMP3), type);
   3686   /* Store the fetched value back into the semantic destination. */
   3687   dst_addr = rv_direct_addr(d, dst);
   3688   if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
   3689     NativeLoc base =
   3690         native_loc_reg(dst_addr.base_type, NATIVE_REG_INT, RV_TMP1);
   3691     NativeAddr load;
   3692     memset(&load, 0, sizeof load);
   3693     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   3694     load.base.frame = dst_addr.base.frame;
   3695     load.base_type = dst_addr.base_type;
   3696     rv_emit_mem(a, 1, base, load,
   3697                 native_mem_for_type(d->native, dst_addr.base_type, 8));
   3698     dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
   3699     dst_addr.base.reg = RV_TMP1;
   3700   }
   3701   rv_emit_mem(
   3702       a, 0, res, dst_addr,
   3703       native_mem_for_type(d->native, type, native_type_size(d->native, type)));
   3704 }
   3705 static void rv_va_end_(NativeDirectTarget* d, Operand ap_addr) {
   3706   (void)d;
   3707   (void)ap_addr;
   3708 }
   3709 static void rv_va_copy_(NativeDirectTarget* d, Operand dst, Operand src) {
   3710   RvNativeTarget* a = rv_of(d->native);
   3711   NativeAddr src_ap = rv_direct_va_base(d, src, RV_TMP0);
   3712   NativeAddr dst_ap = rv_direct_va_base(d, dst, RV_TMP3);
   3713   rv_va_copy_core(a, dst_ap, src_ap);
   3714 }
   3715 
   3716 static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
   3717                                 const AsmConstraint* outs, u32 nout,
   3718                                 Operand* out_ops, const AsmConstraint* ins,
   3719                                 u32 nin, const Operand* in_ops,
   3720                                 const Sym* clobbers, u32 nclob,
   3721                                 u32 clobber_abi_sets) {
   3722   RvNativeTarget* a = rv_of(d->native);
   3723   Compiler* c = d->base.c;
   3724   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   3725   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
   3726   u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
   3727   RvAsmSavedClobber* saved;
   3728   u32 nsaved, i;
   3729   Rv64Asm* asmh;
   3730 
   3731   rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
   3732   native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp);
   3733   clob_int |= abi_int;
   3734   clob_fp |= abi_fp;
   3735   /* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer
   3736    * so the operand allocator never hands them out. */
   3737   used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) |
   3738              (1u << RV_GP) | (1u << RV_TP) | (1u << RV_TMP0) | (1u << RV_TMP1) |
   3739              (1u << RV_TMP2) | (1u << RV_TMP3) | (1u << RV_S0);
   3740   used_fp =
   3741       clob_fp | (1u << RV_FTMP0) | (1u << RV_FTMP1) | (1u << 2u) | (1u << 3u);
   3742 
   3743   for (i = 0; i < nout; ++i) {
   3744     const char* body = native_asm_constraint_body(outs[i].str);
   3745     KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
   3746     NativeAsmRegPin pin;
   3747     if (rv_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) {
   3748       /* GNU local register variable: pin to the named hard register. */
   3749       if (pin.cls == NATIVE_REG_FP) {
   3750         used_fp |= 1u << pin.reg;
   3751         clob_fp |= 1u << pin.reg;
   3752       } else {
   3753         used_int |= 1u << pin.reg;
   3754         clob_int |= 1u << pin.reg;
   3755       }
   3756       rv_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg);
   3757     } else if (body[0] == 'r' || body[0] == 'f') {
   3758       NativeAllocClass cls = rv_asm_constraint_class(d, body);
   3759       Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp);
   3760       rv_asm_bound_reg(&bound_outs[i], type, cls, reg);
   3761     } else if (body[0] == 'm') {
   3762       Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp);
   3763       rv_asm_bound_mem(&bound_outs[i], type, reg);
   3764     } else {
   3765       rv_asm_panic(d, "unsupported output constraint");
   3766     }
   3767   }
   3768 
   3769   for (i = 0; i < nin; ++i) {
   3770     const char* body = native_asm_constraint_body(ins[i].str);
   3771     int matched = native_asm_match_index(body);
   3772     KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
   3773     if (matched >= 0) {
   3774       if ((u32)matched >= nout)
   3775         rv_asm_panic(d, "matching constraint out of range");
   3776       if (native_asm_constraint_early(outs[matched].str))
   3777         rv_asm_panic(d, "matching input names early-clobber output");
   3778       if (bound_outs[matched].kind != RV64_INLINE_OPK_REG)
   3779         rv_asm_panic(d, "matching constraint requires register output");
   3780       bound_ins[i] = bound_outs[matched];
   3781       continue;
   3782     }
   3783     NativeAsmRegPin pin;
   3784     if (rv_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) {
   3785       /* GNU local register variable: pin to the named hard register. */
   3786       if (pin.cls == NATIVE_REG_FP) {
   3787         used_fp |= 1u << pin.reg;
   3788         clob_fp |= 1u << pin.reg;
   3789       } else {
   3790         used_int |= 1u << pin.reg;
   3791         clob_int |= 1u << pin.reg;
   3792       }
   3793       rv_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg);
   3794     } else if (body[0] == 'r' || body[0] == 'f') {
   3795       NativeAllocClass cls = rv_asm_constraint_class(d, body);
   3796       Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp);
   3797       rv_asm_bound_reg(&bound_ins[i], type, cls, reg);
   3798     } else if (body[0] == 'i') {
   3799       if (in_ops[i].kind != OPK_IMM)
   3800         rv_asm_panic(d, "immediate constraint requires immediate operand");
   3801       bound_ins[i] = in_ops[i];
   3802     } else if (body[0] == 'm') {
   3803       Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp);
   3804       rv_asm_bound_mem(&bound_ins[i], type, reg);
   3805     } else {
   3806       rv_asm_panic(d, "unsupported input constraint");
   3807     }
   3808   }
   3809 
   3810   saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
   3811   for (i = 0; i < nout; ++i) {
   3812     if (bound_outs[i].kind == RV64_INLINE_OPK_REG) {
   3813       NativeAllocClass cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP
   3814                                  ? NATIVE_REG_FP
   3815                                  : NATIVE_REG_INT;
   3816       if (outs[i].dir == KIT_CG_ASM_INOUT) {
   3817         rv_direct_load_operand_to_reg(
   3818             d, out_ops[i],
   3819             native_loc_reg(bound_outs[i].type, cls,
   3820                            (Reg)bound_outs[i].v.local));
   3821       }
   3822     } else if (bound_outs[i].kind == OPK_INDIRECT) {
   3823       NativeLoc loc =
   3824           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   3825                          (Reg)bound_outs[i].v.ind.base);
   3826       rv_direct_load_address_to_reg(d, out_ops[i], loc);
   3827     }
   3828   }
   3829   for (i = 0; i < nin; ++i) {
   3830     if (bound_ins[i].kind == RV64_INLINE_OPK_REG) {
   3831       NativeAllocClass cls = bound_ins[i].pad[0] == RV64_INLINE_OPCLS_FP
   3832                                  ? NATIVE_REG_FP
   3833                                  : NATIVE_REG_INT;
   3834       rv_direct_load_operand_to_reg(
   3835           d, in_ops[i],
   3836           native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local));
   3837     } else if (bound_ins[i].kind == OPK_INDIRECT) {
   3838       NativeLoc loc =
   3839           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   3840                          (Reg)bound_ins[i].v.ind.base);
   3841       rv_direct_load_address_to_reg(d, in_ops[i], loc);
   3842     }
   3843   }
   3844   asmh = rv64_asm_open(c);
   3845   rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
   3846                    nclob);
   3847   rv64_asm_run_template(asmh, d->native->mc, tmpl);
   3848   rv64_asm_close(asmh);
   3849 
   3850   for (i = 0; i < nout; ++i) {
   3851     NativeAllocClass cls;
   3852     NativeLoc src;
   3853     if (bound_outs[i].kind != RV64_INLINE_OPK_REG) continue;
   3854     cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP ? NATIVE_REG_FP
   3855                                                        : NATIVE_REG_INT;
   3856     src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
   3857     rv_direct_store_reg_to_operand(d, out_ops[i], src);
   3858   }
   3859   for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]);
   3860 }
   3861 
   3862 static const NativeOps rv_direct_ops = {
   3863     .bind_param = rv_bind_param,
   3864     .tail_call_unrealizable_reason = rv_no_tail,
   3865     .va_start_ = rv_va_start_,
   3866     .va_arg_ = rv_va_arg_,
   3867     .va_end_ = rv_va_end_,
   3868     .va_copy_ = rv_va_copy_,
   3869     .asm_block = rv_direct_asm_block,
   3870 };
   3871 
   3872 const NativeOps* rv64_native_direct_ops(void) { return &rv_direct_ops; }