native.c - kit

native.c (168825B)
      1 /* src/arch/rv64/native.c — RISC-V (RV64GC, LP64D) NativeTarget implementation.
      2  *
      3  * Mirrors the aa64 reference (src/arch/aa64/native.c): a physical-emission
      4  * NativeTarget driven at -O0 by the shared NativeDirectTarget and at -O1+ by
      5  * the optimizer emit path. ABI decisions go through the abi/ interface; this
      6  * file owns only ISA emission and the RV64 frame layout.
      7  *
      8  * Frame model (single, top-record): s0 (x8) is the frame pointer anchored at
      9  * the saved s0/ra pair; slots live below s0 at positive byte offsets `off`
     10  * (address = s0 - off); outgoing args sit at the bottom of the frame (sp+0..).
     11  *   frame_size  = align16(16 + cum_off + callee_save_bytes + max_outgoing + va_save_sz)
     12  *   fp_pair_off = frame_size - 16 - va_save_sz   (saved pair, sp-relative)
     13  *   CFA = s0 + (frame_size - fp_pair_off)
     14  * RISC-V has no condition flags: comparisons materialize a 0/1 via SLT/SLTU or
     15  * FLT/FLE; branches compare two registers directly. x0 is a hardware zero. */
     16 
     17 #include <string.h>
     18 
     19 #include "abi/abi.h"
     20 #include "arch/riscv/asm.h"
     21 #include "arch/riscv/isa.h"
     22 #include "arch/riscv/regs.h"
     23 #include "arch/riscv/rv64.h"
     24 #include "arch/riscv/variant.h"
     25 #include "asm/asm.h"
     26 #include "asm/asm_lex.h"
     27 #include "cg/native_argmove.h"
     28 #include "cg/native_asm.h"
     29 #include "cg/native_direct_target.h"
     30 #include "cg/native_frame.h"
     31 #include "cg/type.h"
     32 #include "core/arena.h"
     33 #include "core/bytes.h"
     34 #include "core/pool.h"
     35 #include "core/slice.h"
     36 #include "obj/obj.h"
     37 
/* File-private constants: raw register numbers for the emit-internal scratch
 * registers and the FP argument range, plus sizing limits for prologue
 * buffers. */
enum {
  RV_TMP0 = 5u,  /* t0: emit-internal scratch (reserved, never allocable) */
  RV_TMP1 = 6u,  /* t1: emit-internal scratch */
  RV_TMP2 = 7u,  /* t2: emit-internal scratch (reserved in phys table) */
  RV_TMP3 = 28u, /* t3: emit-internal scratch (reserved in phys table) */
  RV_FTMP0 = 0u, /* ft0: emit-internal FP scratch */
  RV_FTMP1 = 1u, /* ft1: emit-internal FP scratch */
  RV_FA0 = 10u,  /* fa0..fa7 = f10..f17 (FP arg/return registers) */
  RV_FA7 = 17u,
  /* Single-pass (-O0) worst-case prologue: sp adjust (3) + far save pair (7)
   * + sret spill (1) + variadic GP spills (8). No callee-saves at -O0. */
  RV_PROLOGUE_WORDS = 32u,
  /* Known-frame (-O1) prologues are emitted directly, not into the fixed -O0
   * NOP region, and additionally save callee-saved registers (up to 11 int + 12
   * fp, each up to 4 words for a far s0-relative offset) on top of the header,
   * sret, and variadic spills. Size the build buffer for the worst case. */
  RV_KNOWN_PROLOGUE_WORDS = 192u,
  /* Bytes of the saved s0/ra pair. NOTE(review): the frame helpers visible in
   * this file read variant->frame_save_size rather than this constant —
   * confirm it is still used. */
  RV_FRAME_SAVE_SIZE = 16u,
};

/* Capacity of each per-class callee-save collection array. The int bank has
 * at most 11 candidates (s1..s11) and the FP bank at most 12 (fs0..fs11); the
 * int and fp arrays are separate, so each only has to hold its own class. */
#define RV_MAX_CALLEE_SAVES 16u
/* NOTE(review): presumably the cap on register-argument moves tracked per
 * call — its use is not visible in this part of the file. */
#define RV_MAX_REG_ARG_MOVES 16u
     62 
     63 extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
     64 extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs,
     65                                 u32 end_ofs);
     66 
     67 /* ============================ low-level emit ============================ */
     68 
     69 void rv64_emit32(MCEmitter* mc, u32 word) {
     70   u8 b[4];
     71   u32 ofs = obj_pos(mc->obj, mc->section_id);
     72   wr_u32_le(b, word);
     73   mc->emit_bytes(mc, b, sizeof b);
     74   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
     75 }
     76 
     77 void rv64_emit16(MCEmitter* mc, u32 halfword) {
     78   u8 b[2];
     79   u32 ofs = obj_pos(mc->obj, mc->section_id);
     80   b[0] = (u8)(halfword & 0xff);
     81   b[1] = (u8)((halfword >> 8) & 0xff);
     82   mc->emit_bytes(mc, b, sizeof b);
     83   if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
     84 }
     85 
     86 static void rv_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) {
     87   u8 b[4];
     88   wr_u32_le(b, word);
     89   obj_patch(obj, sec, off, b, sizeof b);
     90 }
     91 
     92 static int fits_i12(i64 v) { return v >= -2048 && v <= 2047; }
     93 static int fits_i32(i64 v) {
     94   return v >= (i64)(i32)0x80000000 && v <= (i64)(i32)0x7fffffff;
     95 }
     96 
     97 static u32 align_up_u32(u32 v, u32 align) {
     98   u32 mask = align ? align - 1u : 0u;
     99   return (v + mask) & ~mask;
    100 }
    101 
    102 static i64 floor_div_4096(i64 v) {
    103   if (v >= 0) return v / 4096;
    104   return -((-v + 4095) / 4096);
    105 }
    106 
    107 static void rv_emit_li32(const RiscvVariant* v, MCEmitter* mc, u32 rd,
    108                          i32 imm) {
    109   if (imm >= -2048 && imm <= 2047) {
    110     rv64_emit32(mc, rv_addi(rd, RV_ZERO, imm));
    111     return;
    112   }
    113   {
    114     i64 hi64 = floor_div_4096((i64)imm + 0x800);
    115     i32 hi = (i32)hi64;
    116     i32 lo = (i32)((i64)imm - hi64 * 4096);
    117     rv64_emit32(mc, rv_lui(rd, (u32)hi & 0xfffffu));
    118     /* ADDIW is RV64-only; on RV32 the value fits 32 bits so plain ADDI is
    119      * exact (and identical to ADDIW's low result on RV64). */
    120     if (lo)
    121       rv64_emit32(mc, v->has_w_forms ? rv_addiw(rd, rd, lo) : rv_addi(rd, rd, lo));
    122   }
    123 }
    124 
    125 static i32 sext12(u32 v) {
    126   v &= 0xfffu;
    127   return (v & 0x800u) ? (i32)v - 4096 : (i32)v;
    128 }
    129 
    130 /* Builds a full XLEN-wide value. The recursion / slli-12 chain assembles bits
    131  * above 32 and is only ever reached on rv64 (a single rv32 register cannot hold
    132  * a value wider than 32 bits — the cg layer legalizes those into pairs). */
    133 static void rv_emit_li64(const RiscvVariant* v, MCEmitter* mc, u32 rd, u64 imm) {
    134   if (fits_i32((i64)imm)) {
    135     rv_emit_li32(v, mc, rd, (i32)(i64)imm);
    136     return;
    137   }
    138   {
    139     i32 lo = sext12((u32)imm);
    140     u64 hi = (imm - (u64)(i64)lo) >> 12;
    141     rv_emit_li64(v, mc, rd, hi);
    142     rv64_emit32(mc, rv_slli(rd, rd, 12));
    143     if (lo) rv64_emit32(mc, rv_addi(rd, rd, lo));
    144   }
    145 }
    146 
    147 /* sf!=0 selects a full native-width materialization; sf==0 a 32-bit value. On
    148  * rv32 the native width is 32, so the wide branch collapses to the 32-bit
    149  * path. */
    150 static void rv_emit_load_imm(const RiscvVariant* v, MCEmitter* mc, u32 sf,
    151                              u32 rd, i64 imm) {
    152   if (!sf || v->xlen == 32u) {
    153     rv_emit_li32(v, mc, rd, (i32)imm);
    154     return;
    155   }
    156   if (fits_i32(imm))
    157     rv_emit_li32(v, mc, rd, (i32)imm);
    158   else
    159     rv_emit_li64(v, mc, rd, (u64)imm);
    160 }
    161 
    162 /* rd = base + off, materializing the offset when it exceeds imm12. Uses RV_TMP1
    163  * as scratch for the wide path, so callers must keep RV_TMP1 free. */
    164 static void rv_emit_addr_adjust(const RiscvVariant* v, MCEmitter* mc, u32 rd,
    165                                 u32 base, i32 off) {
    166   if (off == 0) {
    167     if (rd != base) rv64_emit32(mc, rv_addi(rd, base, 0));
    168     return;
    169   }
    170   if (fits_i12(off)) {
    171     rv64_emit32(mc, rv_addi(rd, base, off));
    172     return;
    173   }
    174   rv_emit_load_imm(v, mc, 1, RV_TMP1, (i64)off);
    175   rv64_emit32(mc, rv_add(rd, base, RV_TMP1));
    176 }
    177 
    178 static u32 enc_int_store(const RiscvVariant* v, u32 nbytes, u32 src, u32 base,
    179                          i32 off) {
    180   switch (nbytes) {
    181     case 1:
    182       return rv_sb(src, base, off);
    183     case 2:
    184       return rv_sh(src, base, off);
    185     case 4:
    186       return rv_sw(src, base, off);
    187     default:
    188       /* The widest GPR store is SD on rv64, SW on rv32. */
    189       return v->ptr_bytes == 8u ? rv_sd(src, base, off) : rv_sw(src, base, off);
    190   }
    191 }
    192 static u32 enc_int_load(const RiscvVariant* v, u32 nbytes, int sign_ext, u32 rd,
    193                         u32 base, i32 off) {
    194   switch (nbytes) {
    195     case 1:
    196       return sign_ext ? rv_lb(rd, base, off) : rv_lbu(rd, base, off);
    197     case 2:
    198       return sign_ext ? rv_lh(rd, base, off) : rv_lhu(rd, base, off);
    199     case 4:
    200       /* LWU (zero-extending 32-bit load) is RV64-only; on rv32 a 4-byte load
    201        * is just LW (no wider container to zero-extend into). */
    202       return sign_ext || v->xlen == 32u ? rv_lw(rd, base, off)
    203                                         : rv_lwu(rd, base, off);
    204     default:
    205       /* The widest GPR load is LD on rv64, LW on rv32. */
    206       return v->ptr_bytes == 8u ? rv_ld(rd, base, off) : rv_lw(rd, base, off);
    207   }
    208 }
    209 
    210 /* Pointer-width GPR load/store (GOT entries, frame-value bases, saved ra/s0,
    211  * sret/indirect/va_list pointers): LD/SD on rv64, LW/SW on rv32. */
    212 static u32 rv_ld_ptr(const RiscvVariant* v, u32 rd, u32 base, i32 off) {
    213   return v->ptr_bytes == 8u ? rv_ld(rd, base, off) : rv_lw(rd, base, off);
    214 }
    215 static u32 rv_sd_ptr(const RiscvVariant* v, u32 src, u32 base, i32 off) {
    216   return v->ptr_bytes == 8u ? rv_sd(src, base, off) : rv_sw(src, base, off);
    217 }
    218 
    219 /* ============================ target state ============================ */
    220 
    221 /* Frame slots and callee-save records live in the shared NativeFrame
    222  * bookkeeping (cg/native_frame.h); these aliases keep the rv64-local spellings.
    223  */
    224 typedef NativeFrameSlotEntry RvNativeSlot;
    225 typedef NativeFrameCalleeSave RvCalleeSave;
    226 
/* Kinds of deferred patch recorded while emitting a function; RV_PATCH_ALLOCA
 * is the only kind defined here. */
typedef enum RvPatchKind { RV_PATCH_ALLOCA } RvPatchKind;

/* One deferred patch record.
 * NOTE(review): pos is presumably the text offset of the instruction(s) to
 * rewrite and dst_reg the register the patched sequence targets — the code
 * that consumes these fields is not visible here; confirm. */
typedef struct RvPatch {
  u8 kind; /* RvPatchKind */
  u32 pos;
  u32 dst_reg;
} RvPatch;
    234 
/* Per-target state for the RISC-V NativeTarget implementation. */
typedef struct RvNativeTarget {
  /* Must remain the first member: rv_of() recovers the RvNativeTarget by
   * casting a NativeTarget* directly. */
  NativeTarget base;
  /* Immutable per-XLEN descriptor (rv32 / rv64), set once in the constructor
   * from c->target.arch. Every XLEN-dependent emit site reads it; with the
   * rv64 variant each site reproduces the historical literal exactly. */
  const RiscvVariant* variant;
  SrcLoc loc; /* location reported by rv_panic */
  const CGFuncDesc* func;

  /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save
   * set, and the known_frame / has_alloca / frame_final flags. */
  NativeFrame frame;
  u32 frame_size_final;
  u32 fp_pair_off;
  /* Known-frame path: exact prologue length, else 0. */
  u32 minimal_prologue_words;

  /* Known-frame (-O1) leaf no-frame tier (aa64's slim_prologue equivalent),
   * settled in rv_func_begin_known_frame; always 0 on the single-pass path. A
   * leaf with no callee-saves, no body slots, no outgoing args, no
   * sret/variadic and register-only params never reads s0 nor clobbers ra, so
   * it emits NO prologue and a bare `ret` — the whole frame setup/teardown is
   * elided. RISC-V has no pre/post-indexed store, so aa64's fp_at_bottom fold
   * would save zero instructions on a kept frame and is intentionally not
   * ported (see doc/plan/ARCH.md §2); this leaf tier is the rv64 win. */
  u8 slim_prologue;

  u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */
  u32 next_param_int;
  u32 next_param_fp;
  u32 next_param_stack;
  u8 has_sret;
  u8 is_variadic; /* gates the variadic register-save area (rv_va_save_sz) */
  NativeFrameSlot sret_ptr_slot;

  /* Deferred patch records (see RvPatch) with their count and capacity. */
  RvPatch* patches;
  u32 npatches;
  u32 patches_cap;
  u32 nalloca;

  u32 func_start;
  u32 prologue_pos;
  MCLabel epilogue_label;
} RvNativeTarget;
    279 
/* Downcast a generic NativeTarget to this backend's state. Valid because
 * `base` is the first member of RvNativeTarget. */
static RvNativeTarget* rv_of(NativeTarget* t) { return (RvNativeTarget*)t; }
    281 
/* Abort compilation via compiler_panic, reporting `msg` at the target's
 * current source location (a->loc). Does not return. The message prefix says
 * "rv64" regardless of the active variant. */
static _Noreturn void rv_panic(RvNativeTarget* a, const char* msg) {
  compiler_panic(a->base.c, a->loc, "rv64 native target: %s", msg);
}
    285 
/* Look up the frame-slot record for `fs` in the target's shared NativeFrame
 * (thin wrapper over native_frame_slot_at). */
static RvNativeSlot* rv_slot_get(RvNativeTarget* a, NativeFrameSlot fs) {
  return native_frame_slot_at(&a->frame, fs);
}
    289 
    290 /* s0-relative byte offset of a frame slot's base (address = s0 + ret). */
    291 static i32 rv_s0_off_slot(const RvNativeSlot* s) { return -(i32)s->off; }
    292 
    293 static u32 rv_va_save_sz(const RvNativeTarget* a) {
    294   /* ABI-derived: the variadic register-save area is gp_reg_count*gp_slot_size
    295    * (a0..a7 = 64 bytes for LP64D, 32 for ILP32). Only present in variadics. */
    296   return a->is_variadic ? native_frame_va_save_bytes(a->base.c->abi) : 0u;
    297 }
    298 
    299 /* s0-relative byte offset of incoming stack arg at byte_off. Stack args sit
    300  * just above the saved pair; the variadic GP save area (when present) is
    301  * contiguous with them at [s0 + frame_save_size). */
    302 static i32 rv_s0_off_in_arg(const RvNativeTarget* a, u32 byte_off) {
    303   u32 base = a->variant->frame_save_size;
    304   if (a->is_variadic) base += rv_va_save_sz(a);
    305   return (i32)(base + byte_off);
    306 }
    307 
    308 /* Callee-saved registers are homed just below the locals at rv_save_off() —
    309  * they are NOT frame slots, so the frame size must reserve their bytes
    310  * explicitly. Integer saves are ptr_bytes wide (sd on rv64, sw on rv32); FP
    311  * saves are always 8 bytes (fsd, even on rv32d). On rv64 both are 8 so the sum
    312  * is identical to the historical ncallee_saves*8. Zero at -O0. */
    313 static u32 rv_callee_save_bytes(const RvNativeTarget* a) {
    314   u32 ptr = a->variant->ptr_bytes;
    315   u32 i, bytes = 0;
    316   for (i = 0; i < a->frame.ncallee_saves; ++i)
    317     bytes += a->frame.callee_saves[i].cls == NATIVE_REG_FP ? 8u : ptr;
    318   return bytes;
    319 }
    320 
    321 static u32 rv_frame_size(const RvNativeTarget* a) {
    322   u32 raw = a->variant->frame_save_size + a->frame.cum_off +
    323             rv_callee_save_bytes(a) + a->frame.max_outgoing + rv_va_save_sz(a);
    324   return align_up_u32(raw, 16u);
    325 }
    326 
    327 static u32 rv_fp_pair_off(const RvNativeTarget* a, u32 frame_size) {
    328   return frame_size - a->variant->frame_save_size - rv_va_save_sz(a);
    329 }
    330 
    331 /* ============================ type helpers ============================ */
    332 
    333 /* Scalar size/align/mem/class/loc constructors are shared in native_target.h
    334  * (native_type_size, native_type_align, native_mem_for_type,
    335  * native_class_for_type_fp_le8, native_loc_reg, native_loc_stack,
    336  * native_loc_is_fp). loc_reg's mask is arch-specific and stays here. */
    337 
    338 /* True when a scalar value is WIDER than XLEN's natural single-register width,
    339  * i.e. it needs the "wide" (rv64 64-bit) ops rather than the base ops. On rv64
    340  * a pointer is 8 bytes and counts as wide alongside i64/double; on rv32 a
    341  * pointer is 4 bytes and fits a single 32-bit register, so it is NOT wide and
    342  * the base (non-W) ops apply. (Kept named rv_is_64 to minimize churn; for the
    343  * rv64 variant the result is byte-identical to the old predicate.) */
    344 static int rv_is_64(NativeTarget* t, KitCgTypeId type) {
    345   const RiscvVariant* v = rv_of(t)->variant;
    346   return native_type_size(t, type) >= 8u ||
    347          (v->xlen == 64u && cg_type_is_ptr(t->c, type));
    348 }
    349 
    350 static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; }
    351 
    352 /* ============================ register tables ============================ */
    353 
/* Initializer macros for one NativePhysRegInfo entry in the integer bank.
 *   RV_PHYS_INT_ARG(r, idx)  argument register with ABI index idx; caller-saved,
 *                            and also a return register when idx < 2. Not
 *                            flagged allocable.
 *   RV_PHYS_INT_CALLER(r)    allocable, caller-saved.
 *   RV_PHYS_INT_CALLEE(r)    allocable, callee-saved (spill_cost 4 vs 1).
 *   RV_PHYS_INT_RESERVED(r)  reserved; zero costs.
 * Non-argument entries use abi_index 0xff. */
#define RV_PHYS_INT_ARG(r, idx)                        \
  {.reg = (r),                                         \
   .cls = NATIVE_REG_INT,                              \
   .abi_index = (idx),                                 \
   .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
            ((idx) < 2u ? NATIVE_REG_RET : 0),         \
   .spill_cost = 1u,                                   \
   .copy_cost = 1u}
#define RV_PHYS_INT_CALLER(r)                               \
  {.reg = (r),                                              \
   .cls = NATIVE_REG_INT,                                   \
   .abi_index = 0xffu,                                      \
   .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
   .spill_cost = 1u,                                        \
   .copy_cost = 1u}
#define RV_PHYS_INT_CALLEE(r)                               \
  {.reg = (r),                                              \
   .cls = NATIVE_REG_INT,                                   \
   .abi_index = 0xffu,                                      \
   .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
   .spill_cost = 4u,                                        \
   .copy_cost = 1u}
#define RV_PHYS_INT_RESERVED(r)  \
  {.reg = (r),                   \
   .cls = NATIVE_REG_INT,        \
   .abi_index = 0xffu,           \
   .flags = NATIVE_REG_RESERVED, \
   .spill_cost = 0u,             \
   .copy_cost = 0u}
    383 
/* t0..t3 (x5,x6,x7,x28) are emit-internal scratch (RV_TMP0..RV_TMP3), reserved
 * and never handed to the allocator or driver. t4/t5 are the driver scratch
 * pool (disjoint from the emit temps so a hook can never clobber an operand the
 * driver parked there). t6 is the lone caller-saved allocable (the -O0 cache's
 * only caller-saved home); s1..s11 are appended callee-saved, chosen under
 * pressure (and saved by the optimizer prologue at -O1). */
/* Allocation order for the integer bank: t6 first, then s1, s2..s11. */
static const Reg rv_int_allocable[] = {31u, 9u,  18u, 19u, 20u, 21u,
                                       22u, 23u, 24u, 25u, 26u, 27u};
static const Reg rv_int_scratch[] = {29u, 30u}; /* t4, t5 */

/* Per-register descriptors for x0..x31, listed in register-number order. */
static const NativePhysRegInfo rv_int_phys[] = {
    RV_PHYS_INT_RESERVED(0u), /* zero */
    RV_PHYS_INT_RESERVED(1u), /* ra */
    RV_PHYS_INT_RESERVED(2u), /* sp */
    RV_PHYS_INT_RESERVED(3u), /* gp */
    RV_PHYS_INT_RESERVED(4u), /* tp */
    RV_PHYS_INT_RESERVED(5u), /* t0 = TMP0 */
    RV_PHYS_INT_RESERVED(6u), /* t1 = TMP1 */
    RV_PHYS_INT_RESERVED(7u), /* t2 = TMP2 (emit) */
    RV_PHYS_INT_RESERVED(8u), /* s0/fp */
    RV_PHYS_INT_CALLEE(9u),   /* s1 */
    /* a0..a7 (x10..x17), ABI argument index 0..7 */
    RV_PHYS_INT_ARG(10u, 0u),  RV_PHYS_INT_ARG(11u, 1u),
    RV_PHYS_INT_ARG(12u, 2u),  RV_PHYS_INT_ARG(13u, 3u),
    RV_PHYS_INT_ARG(14u, 4u),  RV_PHYS_INT_ARG(15u, 5u),
    RV_PHYS_INT_ARG(16u, 6u),  RV_PHYS_INT_ARG(17u, 7u),
    /* s2..s11 (x18..x27) */
    RV_PHYS_INT_CALLEE(18u),   RV_PHYS_INT_CALLEE(19u),
    RV_PHYS_INT_CALLEE(20u),   RV_PHYS_INT_CALLEE(21u),
    RV_PHYS_INT_CALLEE(22u),   RV_PHYS_INT_CALLEE(23u),
    RV_PHYS_INT_CALLEE(24u),   RV_PHYS_INT_CALLEE(25u),
    RV_PHYS_INT_CALLEE(26u),   RV_PHYS_INT_CALLEE(27u),
    RV_PHYS_INT_RESERVED(28u), /* t3 = TMP3 (emit) */
    RV_PHYS_INT_RESERVED(29u), /* t4 = driver scratch */
    RV_PHYS_INT_RESERVED(30u), /* t5 = driver scratch */
    RV_PHYS_INT_CALLER(31u),   /* t6 = caller-saved allocable */
};
    419 
/* Initializer macros for one NativePhysRegInfo entry in the FP bank; they
 * mirror the RV_PHYS_INT_* macros with cls = NATIVE_REG_FP.
 *   RV_PHYS_FP_ARG(r, idx)  argument register with ABI index idx; caller-saved,
 *                           and also a return register when idx < 2.
 *   RV_PHYS_FP_CALLER(r)    allocable, caller-saved.
 *   RV_PHYS_FP_CALLEE(r)    allocable, callee-saved (spill_cost 4 vs 1).
 *   RV_PHYS_FP_RESERVED(r)  reserved; zero costs. */
#define RV_PHYS_FP_ARG(r, idx)                         \
  {.reg = (r),                                         \
   .cls = NATIVE_REG_FP,                               \
   .abi_index = (idx),                                 \
   .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
            ((idx) < 2u ? NATIVE_REG_RET : 0),         \
   .spill_cost = 1u,                                   \
   .copy_cost = 1u}
#define RV_PHYS_FP_CALLER(r)                                \
  {.reg = (r),                                              \
   .cls = NATIVE_REG_FP,                                    \
   .abi_index = 0xffu,                                      \
   .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
   .spill_cost = 1u,                                        \
   .copy_cost = 1u}
#define RV_PHYS_FP_CALLEE(r)                                \
  {.reg = (r),                                              \
   .cls = NATIVE_REG_FP,                                    \
   .abi_index = 0xffu,                                      \
   .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
   .spill_cost = 4u,                                        \
   .copy_cost = 1u}
#define RV_PHYS_FP_RESERVED(r)   \
  {.reg = (r),                   \
   .cls = NATIVE_REG_FP,         \
   .abi_index = 0xffu,           \
   .flags = NATIVE_REG_RESERVED, \
   .spill_cost = 0u,             \
   .copy_cost = 0u}
    449 
/* Caller-saved allocable first (ft4..ft7, ft8..ft11), then callee (fs0..fs11).
 * ft0/ft1 reserved as emit-internal scratch; ft2/ft3 driver scratch. */
static const Reg rv_fp_allocable[] = {4u,  5u,  6u,  7u,  28u, 29u, 30u,
                                      31u, 8u,  9u,  18u, 19u, 20u, 21u,
                                      22u, 23u, 24u, 25u, 26u, 27u};
static const Reg rv_fp_scratch[] = {2u, 3u}; /* ft2, ft3 */

/* Per-register descriptors for f0..f31, listed in register-number order:
 * f0..f3 reserved, f4..f7 caller-saved, f8/f9 callee-saved, f10..f17 argument
 * registers (ABI index 0..7), f18..f27 callee-saved, f28..f31 caller-saved. */
static const NativePhysRegInfo rv_fp_phys[] = {
    RV_PHYS_FP_RESERVED(0u), /* ft0 = FTMP0 */
    RV_PHYS_FP_RESERVED(1u), /* ft1 = FTMP1 */
    RV_PHYS_FP_RESERVED(2u), /* ft2 = scratch */
    RV_PHYS_FP_RESERVED(3u), /* ft3 = scratch */
    RV_PHYS_FP_CALLER(4u),   RV_PHYS_FP_CALLER(5u),   RV_PHYS_FP_CALLER(6u),
    RV_PHYS_FP_CALLER(7u),   RV_PHYS_FP_CALLEE(8u),   RV_PHYS_FP_CALLEE(9u),
    RV_PHYS_FP_ARG(10u, 0u), RV_PHYS_FP_ARG(11u, 1u), RV_PHYS_FP_ARG(12u, 2u),
    RV_PHYS_FP_ARG(13u, 3u), RV_PHYS_FP_ARG(14u, 4u), RV_PHYS_FP_ARG(15u, 5u),
    RV_PHYS_FP_ARG(16u, 6u), RV_PHYS_FP_ARG(17u, 7u), RV_PHYS_FP_CALLEE(18u),
    RV_PHYS_FP_CALLEE(19u),  RV_PHYS_FP_CALLEE(20u),  RV_PHYS_FP_CALLEE(21u),
    RV_PHYS_FP_CALLEE(22u),  RV_PHYS_FP_CALLEE(23u),  RV_PHYS_FP_CALLEE(24u),
    RV_PHYS_FP_CALLEE(25u),  RV_PHYS_FP_CALLEE(26u),  RV_PHYS_FP_CALLEE(27u),
    RV_PHYS_FP_CALLER(28u),  RV_PHYS_FP_CALLER(29u),  RV_PHYS_FP_CALLER(30u),
    RV_PHYS_FP_CALLER(31u),
};
    473 
    474 static const NativeAllocClassInfo rv_classes[] = {
    475     {.cls = NATIVE_REG_INT,
    476      .allocable = rv_int_allocable,
    477      .nallocable = sizeof rv_int_allocable / sizeof rv_int_allocable[0],
    478      .scratch = rv_int_scratch,
    479      .nscratch = sizeof rv_int_scratch / sizeof rv_int_scratch[0],
    480      .phys = rv_int_phys,
    481      .nphys = sizeof rv_int_phys / sizeof rv_int_phys[0],
    482      /* t0-t6 (5-7,28-31) + a0-a7 (10-17) */
    483      .caller_saved_mask = 0xf00400e0u | 0x0001fc00u,
    484      /* s0-s11 (8,9,18-27) */
    485      .callee_saved_mask = 0x0ffc0300u,
    486      .arg_mask = 0x0001fc00u,
    487      .ret_mask = 0x00000c00u,
    488      /* zero,ra,sp,gp,tp,t0,t1,t2,s0 (bits 0-8) + t3 (bit 28). t4/t5 are the
    489       * driver scratch pool (reserved-from-alloc but listed in scratch[]). */
    490      .reserved_mask = 0x000001ffu | (1u << 28)},
    491     {.cls = NATIVE_REG_FP,
    492      .allocable = rv_fp_allocable,
    493      .nallocable = sizeof rv_fp_allocable / sizeof rv_fp_allocable[0],
    494      .scratch = rv_fp_scratch,
    495      .nscratch = sizeof rv_fp_scratch / sizeof rv_fp_scratch[0],
    496      .phys = rv_fp_phys,
    497      .nphys = sizeof rv_fp_phys / sizeof rv_fp_phys[0],
    498      /* ft0-ft7 (0-7), fa0-fa7 (10-17), ft8-ft11 (28-31) */
    499      .caller_saved_mask = 0xf00400ffu | 0x0001fc00u,
    500      /* fs0-fs11 (8,9,18-27) */
    501      .callee_saved_mask = 0x0ffc0300u,
    502      .arg_mask = 0x0001fc00u,
    503      .ret_mask = 0x00000c00u,
    504      .reserved_mask = 0x0000000fu /* ft0-ft3 */},
    505 };
    506 
    507 /* Resolve a register name ("a7", "fa0", ...) to its (class, Reg). Powers the
    508  * optimizer's inline-asm clobber masks and explicit hard-register operands
    509  * ("{a7}" from a GNU local register variable). x0..x31 are DWARF 0..31; the
    510  * FP bank f0..f31 is DWARF 32..63. Returns non-zero for a non-register name
    511  * (cc/memory/unknown), which the caller skips. */
    512 static int rv_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out,
    513                            NativeAllocClass* cls_out) {
    514   char buf[16];
    515   uint32_t dwarf;
    516   (void)ri;
    517   if (!name.s || !name.len || name.len >= sizeof buf) return 1;
    518   memcpy(buf, name.s, name.len);
    519   buf[name.len] = '\0';
    520   if (rv64_register_index(buf, &dwarf) != 0) return 1;
    521   if (dwarf <= 31u) {
    522     *cls_out = NATIVE_REG_INT;
    523     *out = (Reg)dwarf;
    524     return 0;
    525   }
    526   if (dwarf >= 32u && dwarf <= 63u) {
    527     *cls_out = NATIVE_REG_FP;
    528     *out = (Reg)(dwarf - 32u);
    529     return 0;
    530   }
    531   return 1;
    532 }
    533 
    534 static int rv_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls,
    535                                  Reg reg) {
    536   (void)ri;
    537   if (cls == NATIVE_REG_INT) {
    538     if (reg == 9u) return 1;                /* s1 */
    539     if (reg >= 10u && reg <= 17u) return 1; /* a0..a7 */
    540     if (reg >= 18u && reg <= 27u) return 1; /* s2..s11 */
    541     if (reg == 31u) return 1;               /* t6 */
    542     return 0;
    543   }
    544   if (cls == NATIVE_REG_FP) return reg >= 4u && reg <= 31u;
    545   return 0;
    546 }
    547 
    548 static int rv_asm_constraint_reg(const NativeRegInfo* ri, const char* body,
    549                                  NativeAllocClass* cls_out, Reg* fixed_out,
    550                                  u32* allowed_mask_out) {
    551   (void)ri;
    552   if (!body || !body[0]) return 0;
    553   if (fixed_out) *fixed_out = REG_NONE;
    554   if (allowed_mask_out) *allowed_mask_out = 0;
    555   if (body[0] == 'r' && body[1] == '\0') {
    556     if (cls_out) *cls_out = NATIVE_REG_INT;
    557     return 1;
    558   }
    559   if (body[0] == 'f' && body[1] == '\0') {
    560     if (cls_out) *cls_out = NATIVE_REG_FP;
    561     return 1;
    562   }
    563   if (body[0] == 'c' && body[1] == 'r' && body[2] == '\0') {
    564     if (cls_out) *cls_out = NATIVE_REG_INT;
    565     if (allowed_mask_out) *allowed_mask_out = 0x0000ff00u; /* x8..x15 */
    566     return 1;
    567   }
    568   if (body[0] == 'c' && body[1] == 'f' && body[2] == '\0') {
    569     if (cls_out) *cls_out = NATIVE_REG_FP;
    570     if (allowed_mask_out) *allowed_mask_out = 0x0000ff00u; /* f8..f15 */
    571     return 1;
    572   }
    573   return 0;
    574 }
    575 
/* Register-model descriptor for this backend: the two allocation classes
 * (integer, FP) plus the name-resolution and inline-asm constraint hooks
 * defined above. */
static const NativeRegInfo rv_reg_info = {
    .classes = rv_classes,
    .nclasses = sizeof rv_classes / sizeof rv_classes[0],
    .resolve_name = rv_resolve_name,
    .asm_operand_reg_ok = rv_asm_operand_reg_ok,
    .asm_constraint_reg = rv_asm_constraint_reg,
};
    583 
    584 /* ============================ legality ============================ */
    585 
    586 static int rv_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
    587                         KitCgTypeId type, i64 imm) {
    588   /* SLLI/SRLI/SRAI shamt is shamt_bits wide: 6 bits (max 63) on rv64, 5 bits
    589    * (max 31) on rv32. */
    590   i64 shamt_max = (i64)((1u << rv_of(t)->variant->shamt_bits) - 1u);
    591   (void)type;
    592   switch (use) {
    593     case NATIVE_IMM_MOVE:
    594       return 1;
    595     case NATIVE_IMM_BINOP:
    596       switch ((BinOp)op) {
    597         case BO_IADD:
    598           return fits_i12(imm);
    599         case BO_ISUB:
    600           return fits_i12(-imm); /* emitted as ADDI with negated imm */
    601         case BO_AND:
    602         case BO_OR:
    603         case BO_XOR:
    604           return fits_i12(imm);
    605         case BO_SHL:
    606         case BO_SHR_S:
    607         case BO_SHR_U:
    608           return imm >= 0 && imm <= shamt_max;
    609         default:
    610           return 0;
    611       }
    612     case NATIVE_IMM_CMP:
    613       return imm == 0; /* compares need both ends in registers (SLT/branch) */
    614     case NATIVE_IMM_ADDR_OFFSET:
    615       return fits_i12(imm);
    616   }
    617   return 0;
    618 }
    619 
    620 static int rv_addr_legal(NativeTarget* t, const NativeAddr* addr,
    621                          MemAccess mem) {
    622   (void)t;
    623   (void)mem;
    624   if (!addr) return 0;
    625   if (addr->index_kind != NATIVE_ADDR_INDEX_NONE) return 0;
    626   if (addr->base_kind != NATIVE_ADDR_BASE_REG &&
    627       addr->base_kind != NATIVE_ADDR_BASE_FRAME)
    628     return 0;
    629   return fits_i12(addr->offset);
    630 }
    631 
    632 /* ============================ memory ============================ */
    633 
    634 /* Materialize the runtime address of a global into `dst`, including addend. */
    635 static void rv_emit_global_addr(RvNativeTarget* a, u32 dst, ObjSymId sym,
    636                                 i64 addend) {
    637   NativeTarget* t = &a->base;
    638   MCEmitter* mc = t->mc;
    639   u32 sec = mc->section_id;
    640   if (obj_symbol_extern_via_got(t->c, t->obj, sym)) {
    641     u32 ap = mc->pos(mc);
    642     rv64_emit32(mc, rv_auipc(dst, 0));
    643     mc->emit_reloc_at(mc, sec, ap, R_RV_GOT_HI20, sym, 0, 0, 0);
    644     {
    645       Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
    646       ObjSymId anchor =
    647           obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
    648       u32 lp = mc->pos(mc);
    649       rv64_emit32(mc, rv_ld_ptr(a->variant, dst, dst, 0));
    650       mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
    651     }
    652   } else {
    653     u32 ap = mc->pos(mc);
    654     rv64_emit32(mc, rv_auipc(dst, 0));
    655     mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0);
    656     {
    657       Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
    658       ObjSymId anchor =
    659           obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
    660       u32 lp = mc->pos(mc);
    661       rv64_emit32(mc, rv_addi(dst, dst, 0));
    662       mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
    663     }
    664   }
    665   if (addend) rv_emit_addr_adjust(a->variant, mc, dst, dst, (i32)addend);
    666 }
    667 
    668 /* Fold (base_reg << 0) + (index << scale) into RV_TMP0 via Zba. */
    669 static u32 rv_fold_index(RvNativeTarget* a, u32 base, u32 idx, u8 log2_scale) {
    670   MCEmitter* mc = a->base.mc;
    671   switch (log2_scale) {
    672     case 0:
    673       rv64_emit32(mc, rv_add(RV_TMP0, base, idx));
    674       break;
    675     case 1:
    676       rv64_emit32(mc, rv_sh1add(RV_TMP0, idx, base));
    677       break;
    678     case 2:
    679       rv64_emit32(mc, rv_sh2add(RV_TMP0, idx, base));
    680       break;
    681     default:
    682       rv64_emit32(mc, rv_sh3add(RV_TMP0, idx, base));
    683       break;
    684   }
    685   return RV_TMP0;
    686 }
    687 
    688 /* Resolve any NativeAddr to a base register + imm12 offset. RISC-V has no
    689  * indexed load/store, so an index is folded into RV_TMP0 via Zba; far offsets
    690  * and FRAME/FRAME_VALUE/GLOBAL bases are materialized into RV_TMP0/RV_TMP1. */
    691 static void rv_resolve_mem_addr(RvNativeTarget* a, const NativeAddr* addr,
    692                                 u32* base_out, i32* off_out) {
    693   MCEmitter* mc = a->base.mc;
    694   u32 base;
    695   i32 off;
    696   switch (addr->base_kind) {
    697     case NATIVE_ADDR_BASE_REG:
    698       base = addr->base.reg & 0x1fu;
    699       off = addr->offset;
    700       break;
    701     case NATIVE_ADDR_BASE_FRAME: {
    702       RvNativeSlot* s = rv_slot_get(a, addr->base.frame);
    703       base = RV_S0;
    704       off = rv_s0_off_slot(s) + addr->offset;
    705       break;
    706     }
    707     case NATIVE_ADDR_BASE_FRAME_VALUE: {
    708       RvNativeSlot* s = rv_slot_get(a, addr->base.frame);
    709       rv64_emit32(mc, rv_ld_ptr(a->variant, RV_TMP0, RV_S0, rv_s0_off_slot(s)));
    710       base = RV_TMP0;
    711       off = addr->offset;
    712       break;
    713     }
    714     case NATIVE_ADDR_BASE_GLOBAL:
    715       rv_emit_global_addr(a, RV_TMP0, addr->base.global.sym,
    716                           addr->base.global.addend);
    717       base = RV_TMP0;
    718       off = addr->offset;
    719       break;
    720     default:
    721       rv_panic(a, "unsupported address base");
    722   }
    723   if (addr->index_kind == NATIVE_ADDR_INDEX_REG) {
    724     base = rv_fold_index(a, base, addr->index.reg & 0x1fu, addr->log2_scale);
    725   } else if (addr->index_kind == NATIVE_ADDR_INDEX_FRAME_VALUE) {
    726     RvNativeSlot* s = rv_slot_get(a, addr->index.frame);
    727     rv64_emit32(mc, rv_ld_ptr(a->variant, RV_TMP1, RV_S0, rv_s0_off_slot(s)));
    728     base = rv_fold_index(a, base, RV_TMP1, addr->log2_scale);
    729   }
    730   if (!fits_i12(off)) {
    731     rv_emit_load_imm(a->variant, mc, 1, RV_TMP1, (i64)off);
    732     rv64_emit32(mc, rv_add(RV_TMP0, base, RV_TMP1));
    733     base = RV_TMP0;
    734     off = 0;
    735   }
    736   *base_out = base;
    737   *off_out = off;
    738 }
    739 
    740 /* Central load/store primitive. is_load: 1 load into reg, 0 store reg to mem.
    741  */
    742 static void rv_emit_mem(RvNativeTarget* a, int is_load, NativeLoc reg,
    743                         NativeAddr addr, MemAccess mem) {
    744   NativeTarget* t = &a->base;
    745   MCEmitter* mc = t->mc;
    746   u32 r = loc_reg(reg);
    747   int fp = native_loc_is_fp(reg);
    748   u32 sz = mem.size ? mem.size : native_type_size(t, reg.type);
    749   u32 base;
    750   i32 off;
    751 
    752   rv_resolve_mem_addr(a, &addr, &base, &off);
    753   if (fp) {
    754     rv64_emit32(
    755         mc, is_load ? (sz == 8u ? rv_fld(r, base, off) : rv_flw(r, base, off))
    756                     : (sz == 8u ? rv_fsd(r, base, off) : rv_fsw(r, base, off)));
    757   } else {
    758     rv64_emit32(mc, is_load ? enc_int_load(a->variant, sz, 0, r, base, off)
    759                             : enc_int_store(a->variant, sz, r, base, off));
    760   }
    761 }
    762 
    763 /* ============================ moves / data ============================ */
    764 
    765 static void rv_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
    766   MCEmitter* mc = t->mc;
    767   int dfp = native_loc_is_fp(dst), sfp = native_loc_is_fp(src);
    768   u32 rd = loc_reg(dst), rs = loc_reg(src);
    769   if (dfp && sfp) {
    770     u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
    771     if (rd == rs) return;
    772     rv64_emit32(mc, rv_fsgnj(fmt, rd, rs, rs));
    773     return;
    774   }
    775   if (!dfp && sfp) {
    776     u32 sz = native_type_size(t, src.type);
    777     rv64_emit32(mc, sz == 8u ? rv_fmv_x_d(rd, rs) : rv_fmv_x_w(rd, rs));
    778     return;
    779   }
    780   if (dfp && !sfp) {
    781     u32 sz = native_type_size(t, dst.type);
    782     rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(rd, rs) : rv_fmv_w_x(rd, rs));
    783     return;
    784   }
    785   if (rd == rs) return;
    786   rv64_emit32(mc, rv_addi(rd, rs, 0));
    787 }
    788 
    789 static void rv_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) {
    790   rv_emit_load_imm(rv_of(t)->variant, t->mc, rv_is_64(t, dst.type) ? 1u : 0u,
    791                    loc_reg(dst), imm);
    792 }
    793 
    794 static void rv_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) {
    795   RvNativeTarget* a = rv_of(t);
    796   u64 v = 0;
    797   u32 i;
    798   if (!native_loc_is_fp(dst)) {
    799     for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
    800     rv_load_imm(t, dst, (i64)v);
    801     return;
    802   }
    803   /* FP constant: materialize the bit pattern in TMP0, bitcast into the FPR. */
    804   for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
    805   rv_emit_load_imm(a->variant, t->mc, 1, RV_TMP0, (i64)v);
    806   if (cb.size == 8u)
    807     rv64_emit32(t->mc, rv_fmv_d_x(loc_reg(dst), RV_TMP0));
    808   else
    809     rv64_emit32(t->mc, rv_fmv_w_x(loc_reg(dst), RV_TMP0));
    810   (void)a;
    811 }
    812 
    813 static void rv_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
    814   RvNativeTarget* a = rv_of(t);
    815   MCEmitter* mc = t->mc;
    816   u32 rd = loc_reg(dst);
    817   u32 base;
    818   i32 off;
    819   if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL) {
    820     rv_emit_global_addr(a, rd, addr.base.global.sym,
    821                         addr.base.global.addend + addr.offset);
    822     base = rd;
    823     off = 0;
    824   } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
    825     /* Load the pointer stored in the frame slot, then add the offset. */
    826     RvNativeSlot* s = rv_slot_get(a, addr.base.frame);
    827     rv64_emit32(mc, rv_ld_ptr(a->variant, rd, RV_S0, rv_s0_off_slot(s)));
    828     base = rd;
    829     off = addr.offset;
    830   } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME) {
    831     RvNativeSlot* s = rv_slot_get(a, addr.base.frame);
    832     base = RV_S0;
    833     off = rv_s0_off_slot(s) + addr.offset;
    834   } else if (addr.base_kind == NATIVE_ADDR_BASE_REG) {
    835     base = addr.base.reg & 0x1fu;
    836     off = addr.offset;
    837   } else {
    838     rv_panic(a, "unsupported address base in load_addr");
    839   }
    840   /* Fold any index via Zba sh{1,2,3}add (index << scale) + base. */
    841   if (addr.index_kind == NATIVE_ADDR_INDEX_REG) {
    842     u32 idx = addr.index.reg & 0x1fu;
    843     if (off != 0 || base != rd)
    844       rv_emit_addr_adjust(a->variant, mc, rd, base, off);
    845     switch (addr.log2_scale) {
    846       case 0:
    847         rv64_emit32(mc, rv_add(rd, rd, idx));
    848         break;
    849       case 1:
    850         rv64_emit32(mc, rv_sh1add(rd, idx, rd));
    851         break;
    852       case 2:
    853         rv64_emit32(mc, rv_sh2add(rd, idx, rd));
    854         break;
    855       default:
    856         rv64_emit32(mc, rv_sh3add(rd, idx, rd));
    857         break;
    858     }
    859     return;
    860   }
    861   rv_emit_addr_adjust(a->variant, mc, rd, base, off);
    862 }
    863 
    864 static void rv_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
    865                     MemAccess mem) {
    866   rv_emit_mem(rv_of(t), 1, dst, addr, mem);
    867 }
    868 static void rv_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
    869                      MemAccess mem) {
    870   rv_emit_mem(rv_of(t), 0, src, addr, mem);
    871 }
    872 
    873 /* copy_bytes: resolve dst and src to dedicated pointer regs (RV_TMP3 / RV_TMP0)
    874  * once, then copy granule-by-granule advancing both pointers. dst is resolved
    875  * first because its base may itself live in RV_TMP1 (the transfer reg, e.g. the
    876  * sret pointer from plan_ret); capturing it into RV_TMP3 before src resolution
    877  * (which may clobber RV_TMP1 for far offsets) keeps it live. Advancing the
    878  * pointers keeps every load/store at offset 0, so no offset ever exceeds imm12
    879  * and the transfer reg never aliases a base. */
    880 static void rv_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
    881                           AggregateAccess access) {
    882   RvNativeTarget* a = rv_of(t);
    883   const RiscvVariant* v = a->variant;
    884   MCEmitter* mc = t->mc;
    885   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
    886   u32 rem = access.size;
    887   u32 maxg = v->ptr_bytes; /* widest granule: 8 on rv64, 4 on rv32 */
    888   rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst);
    889   rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0), src);
    890   while (rem) {
    891     u32 sz = rem >= 8u && maxg >= 8u ? 8u
    892              : rem >= 4u             ? 4u
    893              : rem >= 2u             ? 2u
    894                                      : 1u;
    895     rv64_emit32(mc, enc_int_load(v, sz, 0, RV_TMP1, RV_TMP0, 0));
    896     rv64_emit32(mc, enc_int_store(v, sz, RV_TMP1, RV_TMP3, 0));
    897     rv64_emit32(mc, rv_addi(RV_TMP0, RV_TMP0, (i32)sz));
    898     rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, (i32)sz));
    899     rem -= sz;
    900   }
    901 }
    902 
    903 static void rv_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
    904                          AggregateAccess access) {
    905   MCEmitter* mc = t->mc;
    906   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
    907   u32 bv = loc_reg(byte_value);
    908   u32 rem = access.size;
    909   rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst);
    910   while (rem) {
    911     rv64_emit32(mc, rv_sb(bv, RV_TMP3, 0));
    912     rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, 1));
    913     rem -= 1u;
    914   }
    915 }
    916 
    917 /* ============================ arithmetic ============================ */
    918 
    919 static void rv_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop,
    920                      NativeLoc bop) {
    921   const RiscvVariant* v = rv_of(t)->variant;
    922   MCEmitter* mc = t->mc;
    923   u32 rd = loc_reg(dst);
    924   u32 ra = loc_reg(aop);
    925   int sf = rv_is_64(t, dst.type);
    926   /* The W-form ops (ADDW/SUBW/MULW/SLLW/...) are RV64-only and act on a 32-bit
    927    * value held in a 64-bit register. They are emitted only for a narrow value
    928    * on rv64; on rv32 the BASE ops ARE the 32-bit ops, so `w` is always 0 and we
    929    * fall to the base ops. */
    930   int w = !sf && v->has_w_forms;
    931   /* Immediate shamt mask: 5-bit (&31) for a W-form / rv32 op, else shamt_bits
    932    * (6-bit &63 on rv64) for the native-width op. */
    933   u32 shmask = w ? 31u : ((1u << v->shamt_bits) - 1u);
    934   int b_imm = bop.kind == NATIVE_LOC_IMM;
    935   u32 rb = b_imm ? 0u : loc_reg(bop);
    936   i64 imm = b_imm ? bop.v.imm : 0;
    937 
    938   switch (op) {
    939     case BO_FADD:
    940     case BO_FSUB:
    941     case BO_FMUL:
    942     case BO_FDIV: {
    943       u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
    944       switch (op) {
    945         case BO_FADD:
    946           rv64_emit32(mc, rv_fadd(fmt, rd, ra, rb));
    947           break;
    948         case BO_FSUB:
    949           rv64_emit32(mc, rv_fsub(fmt, rd, ra, rb));
    950           break;
    951         case BO_FMUL:
    952           rv64_emit32(mc, rv_fmul(fmt, rd, ra, rb));
    953           break;
    954         default:
    955           rv64_emit32(mc, rv_fdiv(fmt, rd, ra, rb));
    956           break;
    957       }
    958       return;
    959     }
    960     case BO_IADD:
    961       if (b_imm) {
    962         rv64_emit32(
    963             mc, w ? rv_addiw(rd, ra, (i32)imm) : rv_addi(rd, ra, (i32)imm));
    964       } else {
    965         rv64_emit32(mc, w ? rv_addw(rd, ra, rb) : rv_add(rd, ra, rb));
    966       }
    967       return;
    968     case BO_ISUB:
    969       if (b_imm) {
    970         rv64_emit32(
    971             mc, w ? rv_addiw(rd, ra, (i32)-imm) : rv_addi(rd, ra, (i32)-imm));
    972       } else {
    973         rv64_emit32(mc, w ? rv_subw(rd, ra, rb) : rv_sub(rd, ra, rb));
    974       }
    975       return;
    976     case BO_IMUL:
    977       rv64_emit32(mc, w ? rv_mulw(rd, ra, rb) : rv_mul(rd, ra, rb));
    978       return;
    979     case BO_SDIV:
    980       rv64_emit32(mc, w ? rv_divw(rd, ra, rb) : rv_div(rd, ra, rb));
    981       return;
    982     case BO_UDIV:
    983       rv64_emit32(mc, w ? rv_divuw(rd, ra, rb) : rv_divu(rd, ra, rb));
    984       return;
    985     case BO_SREM:
    986       rv64_emit32(mc, w ? rv_remw(rd, ra, rb) : rv_rem(rd, ra, rb));
    987       return;
    988     case BO_UREM:
    989       rv64_emit32(mc, w ? rv_remuw(rd, ra, rb) : rv_remu(rd, ra, rb));
    990       return;
    991     case BO_AND:
    992       rv64_emit32(mc, b_imm ? rv_andi(rd, ra, (i32)imm) : rv_and(rd, ra, rb));
    993       return;
    994     case BO_OR:
    995       rv64_emit32(mc, b_imm ? rv_ori(rd, ra, (i32)imm) : rv_or(rd, ra, rb));
    996       return;
    997     case BO_XOR:
    998       rv64_emit32(mc, b_imm ? rv_xori(rd, ra, (i32)imm) : rv_xor(rd, ra, rb));
    999       return;
   1000     case BO_SHL:
   1001       if (b_imm)
   1002         rv64_emit32(mc, w ? rv_slliw(rd, ra, (u32)imm & shmask)
   1003                           : rv_slli(rd, ra, (u32)imm & shmask));
   1004       else
   1005         rv64_emit32(mc, w ? rv_sllw(rd, ra, rb) : rv_sll(rd, ra, rb));
   1006       return;
   1007     case BO_SHR_U:
   1008       if (b_imm)
   1009         rv64_emit32(mc, w ? rv_srliw(rd, ra, (u32)imm & shmask)
   1010                           : rv_srli(rd, ra, (u32)imm & shmask));
   1011       else
   1012         rv64_emit32(mc, w ? rv_srlw(rd, ra, rb) : rv_srl(rd, ra, rb));
   1013       return;
   1014     case BO_SHR_S:
   1015       if (b_imm)
   1016         rv64_emit32(mc, w ? rv_sraiw(rd, ra, (u32)imm & shmask)
   1017                           : rv_srai(rd, ra, (u32)imm & shmask));
   1018       else
   1019         rv64_emit32(mc, w ? rv_sraw(rd, ra, rb) : rv_sra(rd, ra, rb));
   1020       return;
   1021     default:
   1022       rv_panic(rv_of(t), "unsupported binop");
   1023   }
   1024 }
   1025 
   1026 static void rv_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
   1027   const RiscvVariant* v = rv_of(t)->variant;
   1028   MCEmitter* mc = t->mc;
   1029   u32 rd = loc_reg(dst), rs = loc_reg(src);
   1030   int sf = rv_is_64(t, dst.type);
   1031   int w = !sf && v->has_w_forms; /* SUBW is RV64-only; base SUB on rv32 */
   1032   switch (op) {
   1033     case UO_NEG:
   1034       rv64_emit32(mc, w ? rv_subw(rd, RV_ZERO, rs) : rv_sub(rd, RV_ZERO, rs));
   1035       return;
   1036     case UO_FNEG: {
   1037       u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
   1038       rv64_emit32(mc, rv_fsgnjn(fmt, rd, rs, rs));
   1039       return;
   1040     }
   1041     case UO_BNOT:
   1042       rv64_emit32(mc, rv_xori(rd, rs, -1));
   1043       return;
   1044     case UO_NOT:
   1045       rv64_emit32(mc, rv_sltiu(rd, rs, 1));
   1046       return;
   1047     default:
   1048       rv_panic(rv_of(t), "unsupported unop");
   1049   }
   1050 }
   1051 
   1052 /* Sign/zero-extend a 32-bit operand into a 64-bit register for comparison.
   1053  * Returns the register to compare. */
   1054 static u32 rv_cmp_ext(NativeTarget* t, int is_signed, NativeLoc op, u32 tmp) {
   1055   const RiscvVariant* v = rv_of(t)->variant;
   1056   MCEmitter* mc = t->mc;
   1057   u32 r = loc_reg(op);
   1058   /* On rv32 a 32-bit operand already fills the whole register — there is no
   1059    * wider container to canonicalize into, so the extension is a no-op. */
   1060   if (v->xlen == 32u) return r;
   1061   if (rv_is_64(t, op.type)) return r;
   1062   if (is_signed) {
   1063     rv64_emit32(mc, rv_addiw(tmp, r, 0)); /* sign-extend low 32 */
   1064   } else {
   1065     rv64_emit32(mc, rv_slli(tmp, r, 32));
   1066     rv64_emit32(mc, rv_srli(tmp, tmp, 32));
   1067   }
   1068   return tmp;
   1069 }
   1070 
   1071 static int cmp_is_signed(CmpOp op) {
   1072   switch (op) {
   1073     case CMP_LT_U:
   1074     case CMP_LE_U:
   1075     case CMP_GT_U:
   1076     case CMP_GE_U:
   1077       return 0;
   1078     default:
   1079       return 1;
   1080   }
   1081 }
   1082 
   1083 /* Emit a 0/1 comparison result into rd from two integer registers. */
   1084 static void rv_emit_icmp(NativeTarget* t, CmpOp op, u32 rd, u32 ra, u32 rb) {
   1085   MCEmitter* mc = t->mc;
   1086   switch (op) {
   1087     case CMP_EQ:
   1088       rv64_emit32(mc, rv_sub(rd, ra, rb));
   1089       rv64_emit32(mc, rv_sltiu(rd, rd, 1));
   1090       return;
   1091     case CMP_NE:
   1092       rv64_emit32(mc, rv_sub(rd, ra, rb));
   1093       rv64_emit32(mc, rv_sltu(rd, RV_ZERO, rd));
   1094       return;
   1095     case CMP_LT_S:
   1096       rv64_emit32(mc, rv_slt(rd, ra, rb));
   1097       return;
   1098     case CMP_LT_U:
   1099       rv64_emit32(mc, rv_sltu(rd, ra, rb));
   1100       return;
   1101     case CMP_GT_S:
   1102       rv64_emit32(mc, rv_slt(rd, rb, ra));
   1103       return;
   1104     case CMP_GT_U:
   1105       rv64_emit32(mc, rv_sltu(rd, rb, ra));
   1106       return;
   1107     case CMP_GE_S:
   1108       rv64_emit32(mc, rv_slt(rd, ra, rb));
   1109       rv64_emit32(mc, rv_xori(rd, rd, 1));
   1110       return;
   1111     case CMP_GE_U:
   1112       rv64_emit32(mc, rv_sltu(rd, ra, rb));
   1113       rv64_emit32(mc, rv_xori(rd, rd, 1));
   1114       return;
   1115     case CMP_LE_S:
   1116       rv64_emit32(mc, rv_slt(rd, rb, ra));
   1117       rv64_emit32(mc, rv_xori(rd, rd, 1));
   1118       return;
   1119     case CMP_LE_U:
   1120       rv64_emit32(mc, rv_sltu(rd, rb, ra));
   1121       rv64_emit32(mc, rv_xori(rd, rd, 1));
   1122       return;
   1123     default:
   1124       rv_panic(rv_of(t), "unsupported integer cmp");
   1125   }
   1126 }
   1127 
   1128 /* Format-dispatching wrappers over the ordered FP compares (feq/flt/fle are
   1129  * ordered: they yield 0 on NaN; flt/fle are signaling, raising NV on NaN —
   1130  * pre-existing for ordered ops, and the boolean result is still correct). */
   1131 static u32 rv_feq_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
   1132   return fmt == RV_FMT_D ? rv_feq_d(rd, ra, rb) : rv_feq_s(rd, ra, rb);
   1133 }
   1134 static u32 rv_flt_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
   1135   return fmt == RV_FMT_D ? rv_flt_d(rd, ra, rb) : rv_flt_s(rd, ra, rb);
   1136 }
   1137 static u32 rv_fle_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
   1138   return fmt == RV_FMT_D ? rv_fle_d(rd, ra, rb) : rv_fle_s(rd, ra, rb);
   1139 }
   1140 
   1141 static void rv_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc aop,
   1142                    NativeLoc bop) {
   1143   MCEmitter* mc = t->mc;
   1144   u32 rd = loc_reg(dst);
   1145   /* FP-ness is self-describing from the opcode (FP block starts at CMP_OEQ_F).
   1146    * Unordered predicates use unordered-R == NOT(ordered-not-R): the ordered
   1147    * compare into rd, then `xori rd,rd,1`. ONE/UEQ have no single ordered
   1148    * primitive and OR the two strict relations (a<b | a>b) via scratch RV_TMP2
   1149    * (x7, reserved & never allocable, so it can't alias rd). */
   1150   if (op >= CMP_OEQ_F) {
   1151     u32 fmt = native_type_size(t, aop.type) == 8u ? RV_FMT_D : RV_FMT_S;
   1152     u32 ra = loc_reg(aop), rb = loc_reg(bop);
   1153     switch (op) {
   1154       case CMP_OEQ_F:
   1155         rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb));
   1156         return;
   1157       case CMP_UNE_F: /* !(OEQ) */
   1158         rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb));
   1159         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1160         return;
   1161       case CMP_OLT_F:
   1162         rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
   1163         return;
   1164       case CMP_OLE_F:
   1165         rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb));
   1166         return;
   1167       case CMP_OGT_F:
   1168         rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra));
   1169         return;
   1170       case CMP_OGE_F:
   1171         rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra));
   1172         return;
   1173       case CMP_UGE_F: /* !(OLT) */
   1174         rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
   1175         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1176         return;
   1177       case CMP_UGT_F: /* !(OLE) */
   1178         rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb));
   1179         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1180         return;
   1181       case CMP_ULE_F: /* !(OGT) */
   1182         rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra));
   1183         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1184         return;
   1185       case CMP_ULT_F: /* !(OGE) */
   1186         rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra));
   1187         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1188         return;
   1189       case CMP_ONE_F: /* ordered & !=: (a<b) | (a>b) */
   1190         rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
   1191         rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra));
   1192         rv64_emit32(mc, rv_or(rd, rd, RV_TMP2));
   1193         return;
   1194       case CMP_UEQ_F: /* unordered | ==: !((a<b) | (a>b)) */
   1195         rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
   1196         rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra));
   1197         rv64_emit32(mc, rv_or(rd, rd, RV_TMP2));
   1198         rv64_emit32(mc, rv_xori(rd, rd, 1));
   1199         return;
   1200       default:
   1201         rv_panic(rv_of(t), "unsupported fp cmp");
   1202     }
   1203   }
   1204   {
   1205     int sg = cmp_is_signed(op);
   1206     u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0);
   1207     u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1);
   1208     rv_emit_icmp(t, op, rd, ra, rb);
   1209   }
   1210 }
   1211 
   1212 static void rv_convert(NativeTarget* t, ConvKind op, NativeLoc dst,
   1213                        NativeLoc src) {
   1214   const RiscvVariant* v = rv_of(t)->variant;
   1215   MCEmitter* mc = t->mc;
   1216   u32 rd = loc_reg(dst), rs = loc_reg(src);
   1217   u32 src_sz = native_type_size(t, src.type);
   1218   u32 dst_sz = native_type_size(t, dst.type);
   1219   /* `il` (int-side wide): the 64-bit-integer fcvt L-forms are RV64-only; on
   1220    * rv32 only the w/wu forms exist and a 64-bit int<->fp is legalized to a
   1221    * libcall before reaching here. */
   1222   int il = v->has_w_forms;
   1223   switch (op) {
   1224     case CV_SEXT:
   1225       if (src_sz >= 4u) {
   1226         /* ADDIW sign-extends bits[31:0] into a 64-bit reg (RV64). On rv32 a
   1227          * 4-byte value already spans the whole register, so a plain move (or
   1228          * nothing when rd==rs) is the sign extension. */
   1229         if (v->has_w_forms)
   1230           rv64_emit32(mc, rv_addiw(rd, rs, 0));
   1231         else if (rd != rs)
   1232           rv64_emit32(mc, rv_addi(rd, rs, 0));
   1233       } else {
   1234         u32 sh = v->xlen - src_sz * 8u;
   1235         rv64_emit32(mc, rv_slli(rd, rs, sh));
   1236         rv64_emit32(mc, rv_srai(rd, rd, sh));
   1237       }
   1238       return;
   1239     case CV_ZEXT: {
   1240       u32 sh = v->xlen - src_sz * 8u;
   1241       rv64_emit32(mc, rv_slli(rd, rs, sh));
   1242       rv64_emit32(mc, rv_srli(rd, rd, sh));
   1243       return;
   1244     }
   1245     case CV_TRUNC:
   1246       if (rd != rs || dst_sz <= 4u)
   1247         rv64_emit32(mc, rv_addi(rd, rs, 0)); /* low bits; users re-narrow */
   1248       return;
   1249     case CV_ITOF_S:
   1250       if (native_type_size(t, dst.type) == 8u)
   1251         rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_d_l(rd, rs)
   1252                                            : rv_fcvt_d_w(rd, rs));
   1253       else
   1254         rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_s_l(rd, rs)
   1255                                            : rv_fcvt_s_w(rd, rs));
   1256       return;
   1257     case CV_ITOF_U:
   1258       if (native_type_size(t, dst.type) == 8u)
   1259         rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_d_lu(rd, rs)
   1260                                            : rv_fcvt_d_wu(rd, rs));
   1261       else
   1262         rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_s_lu(rd, rs)
   1263                                            : rv_fcvt_s_wu(rd, rs));
   1264       return;
   1265     case CV_FTOI_S:
   1266       if (src_sz == 8u)
   1267         rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_l_d(rd, rs)
   1268                                            : rv_fcvt_w_d(rd, rs));
   1269       else
   1270         rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_l_s(rd, rs)
   1271                                            : rv_fcvt_w_s(rd, rs));
   1272       return;
   1273     case CV_FTOI_U:
   1274       if (src_sz == 8u)
   1275         rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_lu_d(rd, rs)
   1276                                            : rv_fcvt_wu_d(rd, rs));
   1277       else
   1278         rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_lu_s(rd, rs)
   1279                                            : rv_fcvt_wu_s(rd, rs));
   1280       return;
   1281     case CV_FEXT:
   1282       rv64_emit32(mc, rv_fcvt_d_s(rd, rs));
   1283       return;
   1284     case CV_FTRUNC:
   1285       rv64_emit32(mc, rv_fcvt_s_d(rd, rs));
   1286       return;
   1287     case CV_BITCAST:
   1288       rv_move(t, dst, src);
   1289       return;
   1290     default:
   1291       rv_panic(rv_of(t), "unsupported convert");
   1292   }
   1293 }
   1294 
   1295 /* ============================ spill / reload ============================ */
   1296 
   1297 static void rv_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot,
   1298                      MemAccess mem) {
   1299   NativeAddr addr;
   1300   memset(&addr, 0, sizeof addr);
   1301   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1302   addr.base.frame = slot;
   1303   addr.base_type = src.type;
   1304   rv_emit_mem(rv_of(t), 0, src, addr, mem);
   1305 }
   1306 static void rv_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot,
   1307                       MemAccess mem) {
   1308   NativeAddr addr;
   1309   memset(&addr, 0, sizeof addr);
   1310   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1311   addr.base.frame = slot;
   1312   addr.base_type = dst.type;
   1313   rv_emit_mem(rv_of(t), 1, dst, addr, mem);
   1314 }
   1315 
   1316 /* ============================ control flow ============================ */
   1317 
   1318 static MCLabel rv_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); }
   1319 static void rv_label_place(NativeTarget* t, MCLabel l) {
   1320   t->mc->label_place(t->mc, l);
   1321 }
   1322 static void rv_jump(NativeTarget* t, MCLabel l) {
   1323   rv64_emit32(t->mc, rv_jal(RV_ZERO, 0));
   1324   t->mc->emit_label_ref(t->mc, l, R_RV_JAL, 4, 0);
   1325 }
   1326 
   1327 static void rv_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc aop,
   1328                           NativeLoc bop, MCLabel l) {
   1329   MCEmitter* mc = t->mc;
   1330   /* RISC-V B-type branches reach only ±4 KiB, which a single (especially
   1331    * -O0) function can exceed between a branch and its target. Rather than a
   1332    * lone conditional branch to the label, emit a short *inverted* branch
   1333    * that skips an unconditional `jal` (±1 MiB) to the target. The inverted
   1334    * branch's displacement is the constant SKIP_JAL (skip just the jal) and
   1335    * so is always in range; the jal carries the long reach. See rv_jump. */
   1336   enum { SKIP_JAL = 8 }; /* branch over the 4-byte jal that follows it */
   1337   /* FP compares have no register-register branch form: materialize the 0/1
   1338    * into TMP0 via rv_cmp (handles all 12 predicates), then branch on nonzero.
   1339    */
   1340   if (op >= CMP_OEQ_F) {
   1341     NativeLoc tmp =
   1342         native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0);
   1343     rv_cmp(t, op, tmp, aop, bop);
   1344     /* Skip the jal when the result is 0 (condition false). */
   1345     rv64_emit32(mc, rv_beq(RV_TMP0, RV_ZERO, SKIP_JAL));
   1346     rv_jump(t, l);
   1347     return;
   1348   }
   1349   {
   1350     int sg = cmp_is_signed(op);
   1351     u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0);
   1352     u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1);
   1353     u32 word;
   1354     /* Encode the *inverse* of `op`, skipping the jal when NOT taken. */
   1355     switch (op) {
   1356       case CMP_EQ:
   1357         word = rv_bne(ra, rb, SKIP_JAL);
   1358         break;
   1359       case CMP_NE:
   1360         word = rv_beq(ra, rb, SKIP_JAL);
   1361         break;
   1362       case CMP_LT_S:
   1363         word = rv_bge(ra, rb, SKIP_JAL);
   1364         break;
   1365       case CMP_GE_S:
   1366         word = rv_blt(ra, rb, SKIP_JAL);
   1367         break;
   1368       case CMP_LT_U:
   1369         word = rv_bgeu(ra, rb, SKIP_JAL);
   1370         break;
   1371       case CMP_GE_U:
   1372         word = rv_bltu(ra, rb, SKIP_JAL);
   1373         break;
   1374       case CMP_GT_S:
   1375         word = rv_bge(rb, ra, SKIP_JAL);
   1376         break;
   1377       case CMP_LE_S:
   1378         word = rv_blt(rb, ra, SKIP_JAL);
   1379         break;
   1380       case CMP_GT_U:
   1381         word = rv_bgeu(rb, ra, SKIP_JAL);
   1382         break;
   1383       case CMP_LE_U:
   1384         word = rv_bltu(rb, ra, SKIP_JAL);
   1385         break;
   1386       default:
   1387         rv_panic(rv_of(t), "unsupported cmp_branch");
   1388     }
   1389     rv64_emit32(mc, word);
   1390     rv_jump(t, l);
   1391   }
   1392 }
   1393 
   1394 static void rv_indirect_branch(NativeTarget* t, NativeLoc addr,
   1395                                const MCLabel* valid_targets, u32 ntargets) {
   1396   (void)valid_targets;
   1397   (void)ntargets;
   1398   rv64_emit32(t->mc, rv_jalr(RV_ZERO, loc_reg(addr), 0));
   1399 }
   1400 
   1401 static void rv_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) {
   1402   /* `&&label` address-take: auipc/addi with a %pcrel_hi/%pcrel_lo relocation
   1403    * pair against the label's per-block local symbol — the same form
   1404    * rv_emit_global_addr uses for a global — so a compressing/re-encoding
   1405    * assembler recomputes the displacement (a baked offset would break under
   1406    * the C extension). */
   1407   MCEmitter* mc = t->mc;
   1408   u32 rd = loc_reg(dst);
   1409   u32 sec = mc->section_id;
   1410   ObjSymId sym = mc_label_symbol(mc, l);
   1411   u32 ap = mc->pos(mc);
   1412   rv64_emit32(mc, rv_auipc(rd, 0));
   1413   mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0);
   1414   {
   1415     Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
   1416     ObjSymId anchor = obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
   1417     u32 lp = mc->pos(mc);
   1418     rv64_emit32(mc, rv_addi(rd, rd, 0));
   1419     mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
   1420   }
   1421 }
   1422 
   1423 /* ============================ frame / lifecycle ============================
   1424  */
   1425 
   1426 static NativeFrameSlot rv_frame_slot(NativeTarget* t,
   1427                                      const NativeFrameSlotDesc* d) {
   1428   return native_frame_slot_alloc(&rv_of(t)->frame, d);
   1429 }
   1430 
   1431 static int rv_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
   1432                                    CGDebugLoc* out) {
   1433   RvNativeTarget* a = rv_of(t);
   1434   RvNativeSlot* s;
   1435   if (!out) return 0;
   1436   memset(out, 0, sizeof *out);
   1437   if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0;
   1438   s = rv_slot_get(a, slot);
   1439   out->kind = CG_DEBUG_LOC_FRAME;
   1440   /* rv64 slots are addressed s0/fp-relative (rv_s0_off_slot); the hosted dbg
   1441    * snapshot seeds the frame base with s0, matching aa64's FP-relative
   1442    * convention. */
   1443   out->v.frame_ofs = rv_s0_off_slot(s);
   1444   return 1;
   1445 }
   1446 
   1447 static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
   1448   RvNativeTarget* a = rv_of(t);
   1449   MCEmitter* mc = t->mc;
   1450   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
   1451   a->func = fd;
   1452   a->loc = fd->loc;
   1453   /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
   1454    * callee-save set, and known_frame/has_alloca/frame_final. */
   1455   native_frame_reset(&a->frame);
   1456   a->incoming_stack_size = 0;
   1457   a->next_param_int = 0;
   1458   a->next_param_fp = 0;
   1459   a->next_param_stack = 0;
   1460   a->has_sret = (abi && abi->has_sret) ? 1u : 0u;
   1461   a->is_variadic = (abi && abi->variadic) ? 1u : 0u;
   1462   a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
   1463   a->npatches = 0;
   1464   a->nalloca = 0;
   1465   a->minimal_prologue_words = 0;
   1466   a->slim_prologue = 0;
   1467 
   1468   mc->set_section(mc, fd->text_section_id);
   1469   mc->emit_align(mc, 4, 0);
   1470   a->func_start = mc->pos(mc);
   1471   mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
   1472   if (mc->cfi_startproc) mc->cfi_startproc(mc);
   1473   a->epilogue_label = mc->label_new(mc);
   1474 }
   1475 
   1476 /* sret: reserve a hidden slot for the incoming destination pointer (a0). */
   1477 static void rv_reserve_entry_saves(RvNativeTarget* a) {
   1478   NativeTarget* t = &a->base;
   1479   if (a->has_sret) {
   1480     NativeFrameSlotDesc sd;
   1481     u32 ptr = a->variant->ptr_bytes;
   1482     memset(&sd, 0, sizeof sd);
   1483     sd.type = builtin_id(KIT_CG_BUILTIN_I64);
   1484     sd.size = ptr; /* a pointer slot: 8 on rv64, 4 on rv32 */
   1485     sd.align = ptr;
   1486     sd.kind = NATIVE_FRAME_SLOT_SAVE;
   1487     a->sret_ptr_slot = t->frame_slot(t, &sd);
   1488     a->next_param_int = 1; /* a0 consumed by the sret pointer */
   1489   }
   1490 }
   1491 
   1492 static void rv_emit_entry_save_stores(RvNativeTarget* a) {
   1493   NativeTarget* t = &a->base;
   1494   if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
   1495     KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   1496     u32 ptr = a->variant->ptr_bytes;
   1497     NativeAddr addr;
   1498     memset(&addr, 0, sizeof addr);
   1499     addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   1500     addr.base.frame = a->sret_ptr_slot;
   1501     addr.base_type = i64t;
   1502     rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_A0), addr,
   1503                 native_mem_for_type(t, i64t, ptr));
   1504   }
   1505 }
   1506 
   1507 /* Collect the callee-saves the body used (none at -O0). */
   1508 static u32 rv_collect_int_saves(RvNativeTarget* a, u32* regs) {
   1509   u32 n = 0, i;
   1510   for (i = 0; i < a->frame.ncallee_saves; ++i)
   1511     if (a->frame.callee_saves[i].cls == NATIVE_REG_INT)
   1512       regs[n++] = a->frame.callee_saves[i].reg;
   1513   return n;
   1514 }
   1515 static u32 rv_collect_fp_saves(RvNativeTarget* a, u32* regs) {
   1516   u32 n = 0, i;
   1517   for (i = 0; i < a->frame.ncallee_saves; ++i)
   1518     if (a->frame.callee_saves[i].cls == NATIVE_REG_FP)
   1519       regs[n++] = a->frame.callee_saves[i].reg;
   1520   return n;
   1521 }
   1522 
   1523 /* s0-relative offset of a saved register, below the locals. The flat index runs
   1524  * 0..n_int-1 over integer saves (each ptr_bytes wide) then n_int..n_int+n_fp-1
   1525  * over fp saves (each 8 bytes wide, fsd). On rv64 ptr_bytes==8 so this reduces
   1526  * to the historical uniform -cum_off-8-8*idx layout, byte-for-byte. */
   1527 static i32 rv_save_off(RvNativeTarget* a, u32 n_int, u32 idx) {
   1528   i32 base = -(i32)(a->frame.cum_off);
   1529   u32 ptr = a->variant->ptr_bytes;
   1530   if (idx < n_int) return base - (i32)ptr * (i32)(idx + 1u);
   1531   return base - (i32)(ptr * n_int) - 8 * (i32)(idx - n_int + 1u);
   1532 }
   1533 
   1534 static void rv_load_s0(const RiscvVariant* v, MCEmitter* mc, int fp, u32 reg,
   1535                        i32 off) {
   1536   if (fits_i12(off)) {
   1537     rv64_emit32(mc, fp ? rv_fld(reg, RV_S0, off)
   1538                        : rv_ld_ptr(v, reg, RV_S0, off));
   1539     return;
   1540   }
   1541   rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)off);
   1542   rv64_emit32(mc, rv_add(RV_TMP0, RV_S0, RV_TMP0));
   1543   rv64_emit32(mc, fp ? rv_fld(reg, RV_TMP0, 0) : rv_ld_ptr(v, reg, RV_TMP0, 0));
   1544 }
   1545 
   1546 /* Build the prologue instruction sequence into words[]. Returns count. */
   1547 static u32 rv_build_prologue(RvNativeTarget* a, u32* words, u32 cap,
   1548                              u32 frame_size, u32 fp_pair_off,
   1549                              const u32* int_regs, u32 n_int, const u32* fp_regs,
   1550                              u32 n_fp) {
   1551   const RiscvVariant* v = a->variant;
   1552   u32 ptr = v->ptr_bytes;                 /* saved-pair / int-save stride */
   1553   u32 gp_slot = v->gp_slot_bytes;         /* vararg GP-slot stride */
   1554   u32 fsz = v->frame_save_size;           /* saved ra+s0 pair base offset */
   1555   u32 wi = 0;
   1556   /* lui+ADD{I,IW} materializes a 32-bit constant in TMP0; ADDIW is RV64-only so
   1557    * use plain ADDI on rv32 (the value already fits 32 bits). */
   1558 #define ADDI_LO(rd, lo) (v->has_w_forms ? rv_addiw((rd), (rd), (lo)) : rv_addi((rd), (rd), (lo)))
   1559 #define PUSH(w)                                                  \
   1560   do {                                                           \
   1561     if (wi >= cap) rv_panic(a, "prologue placeholder overflow"); \
   1562     words[wi++] = (w);                                           \
   1563   } while (0)
   1564   /* sp -= frame_size */
   1565   if (fits_i12(-(i32)frame_size)) {
   1566     PUSH(rv_addi(RV_SP, RV_SP, -(i32)frame_size));
   1567   } else {
   1568     i32 neg = -(i32)frame_size;
   1569     i32 hi = (i32)(((i64)neg + 0x800) >> 12);
   1570     i32 lo = neg - (i32)((u32)hi << 12);
   1571     PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
   1572     if (lo) PUSH(ADDI_LO(RV_TMP0, lo));
   1573     PUSH(rv_add(RV_SP, RV_SP, RV_TMP0));
   1574   }
   1575   /* save s0/ra at [sp + fp_pair_off], set s0 = sp + fp_pair_off. The saved-pair
   1576    * internal stride is ptr_bytes (s0 at +0, ra at +ptr). */
   1577   if (fits_i12((i32)fp_pair_off + (i32)ptr)) {
   1578     PUSH(rv_sd_ptr(v, RV_S0, RV_SP, (i32)fp_pair_off));
   1579     PUSH(rv_sd_ptr(v, RV_RA, RV_SP, (i32)fp_pair_off + (i32)ptr));
   1580     PUSH(rv_addi(RV_S0, RV_SP, (i32)fp_pair_off));
   1581   } else {
   1582     i32 off = (i32)fp_pair_off;
   1583     i32 hi = (i32)(((i64)off + 0x800) >> 12);
   1584     i32 lo = off - (i32)((u32)hi << 12);
   1585     PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
   1586     if (lo) PUSH(ADDI_LO(RV_TMP0, lo));
   1587     PUSH(rv_add(RV_TMP0, RV_SP, RV_TMP0));
   1588     PUSH(rv_sd_ptr(v, RV_S0, RV_TMP0, 0));
   1589     PUSH(rv_sd_ptr(v, RV_RA, RV_TMP0, (i32)ptr));
   1590     PUSH(rv_addi(RV_S0, RV_TMP0, 0));
   1591   }
   1592   /* sret a0 spill */
   1593   if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
   1594     RvNativeSlot* s = rv_slot_get(a, a->sret_ptr_slot);
   1595     PUSH(rv_sd_ptr(v, RV_A0, RV_S0, rv_s0_off_slot(s)));
   1596   }
   1597   /* variadic GP save area: spill unconsumed a-regs at
   1598    * [s0 + frame_save_size + i*gp_slot_bytes] */
   1599   if (a->is_variadic) {
   1600     u32 i;
   1601     for (i = a->next_param_int; i < 8u; ++i)
   1602       PUSH(rv_sd_ptr(v, RV_A0 + i, RV_S0, (i32)fsz + (i32)i * (i32)gp_slot));
   1603   }
   1604   /* callee saves: integer with the pointer-width store (sw/sd), fp with fsd. */
   1605   {
   1606     u32 i;
   1607     for (i = 0; i < n_int; ++i) {
   1608       i32 off = rv_save_off(a, n_int, i);
   1609       if (fits_i12(off)) {
   1610         PUSH(rv_sd_ptr(v, int_regs[i], RV_S0, off));
   1611       } else {
   1612         /* rare; emitted directly is fine in the known-frame path, but the
   1613          * single-pass placeholder must hold these too. Use the wide form. */
   1614         i32 hi = (i32)(((i64)off + 0x800) >> 12);
   1615         i32 lo = off - (i32)((u32)hi << 12);
   1616         PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
   1617         if (lo) PUSH(ADDI_LO(RV_TMP0, lo));
   1618         PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0));
   1619         PUSH(rv_sd_ptr(v, int_regs[i], RV_TMP0, 0));
   1620       }
   1621     }
   1622     for (i = 0; i < n_fp; ++i) {
   1623       i32 off = rv_save_off(a, n_int, n_int + i);
   1624       if (fits_i12(off)) {
   1625         PUSH(rv_fsd(fp_regs[i], RV_S0, off));
   1626       } else {
   1627         i32 hi = (i32)(((i64)off + 0x800) >> 12);
   1628         i32 lo = off - (i32)((u32)hi << 12);
   1629         PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
   1630         if (lo) PUSH(ADDI_LO(RV_TMP0, lo));
   1631         PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0));
   1632         PUSH(rv_fsd(fp_regs[i], RV_TMP0, 0));
   1633       }
   1634     }
   1635   }
   1636 #undef PUSH
   1637 #undef ADDI_LO
   1638   return wi;
   1639 }
   1640 
   1641 static void rv_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
   1642   RvNativeTarget* a = rv_of(t);
   1643   MCEmitter* mc = t->mc;
   1644   u32 i;
   1645   rv_func_begin_common(t, fd);
   1646   a->prologue_pos = mc->pos(mc);
   1647   for (i = 0; i < RV_PROLOGUE_WORDS; ++i) rv64_emit32(mc, RV_NOP);
   1648   rv_reserve_entry_saves(a);
   1649   rv_emit_entry_save_stores(a);
   1650 }
   1651 
   1652 static void rv_func_end(NativeTarget* t) {
   1653   RvNativeTarget* a = rv_of(t);
   1654   MCEmitter* mc = t->mc;
   1655   ObjBuilder* obj = t->obj;
   1656   ObjSecId sec = a->func->text_section_id;
   1657   u32 int_regs[16], fp_regs[16];
   1658   u32 n_int = rv_collect_int_saves(a, int_regs);
   1659   u32 n_fp = rv_collect_fp_saves(a, fp_regs);
   1660   u32 frame_size = rv_frame_size(a);
   1661   u32 fp_pair_off = rv_fp_pair_off(a, frame_size);
   1662   u32 end;
   1663   i32 i;
   1664   a->frame_size_final = frame_size;
   1665   a->fp_pair_off = fp_pair_off;
   1666 
   1667   /* epilogue */
   1668   mc->label_place(mc, a->epilogue_label);
   1669   if (a->slim_prologue) {
   1670     /* Frameless leaf: no callee-saves, no s0/ra to reload, sp untouched. */
   1671     rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
   1672   } else {
   1673     const RiscvVariant* v = a->variant;
   1674     for (i = (i32)n_int - 1; i >= 0; --i)
   1675       rv_load_s0(v, mc, 0, int_regs[i], rv_save_off(a, n_int, (u32)i));
   1676     for (i = (i32)n_fp - 1; i >= 0; --i)
   1677       rv_load_s0(v, mc, 1, fp_regs[i], rv_save_off(a, n_int, n_int + (u32)i));
   1678     if (a->frame.has_alloca)
   1679       rv_emit_addr_adjust(v, mc, RV_SP, RV_S0, -(i32)fp_pair_off);
   1680     /* Reload ra/s0 from the saved pair (s0 at +0, ra at +ptr_bytes), pointer
   1681      * width. */
   1682     rv64_emit32(mc, rv_ld_ptr(v, RV_RA, RV_S0, (i32)v->ptr_bytes));
   1683     rv64_emit32(mc, rv_ld_ptr(v, RV_S0, RV_S0, 0));
   1684     /* sp += frame_size */
   1685     if (fits_i12((i32)frame_size)) {
   1686       rv64_emit32(mc, rv_addi(RV_SP, RV_SP, (i32)frame_size));
   1687     } else {
   1688       rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)frame_size);
   1689       rv64_emit32(mc, rv_add(RV_SP, RV_SP, RV_TMP0));
   1690     }
   1691     rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
   1692   }
   1693 
   1694   /* patch prologue */
   1695   if (!a->frame.known_frame) {
   1696     u32 words[RV_PROLOGUE_WORDS];
   1697     u32 nwords, k;
   1698     for (k = 0; k < RV_PROLOGUE_WORDS; ++k) words[k] = RV_NOP;
   1699     nwords = rv_build_prologue(a, words, RV_PROLOGUE_WORDS, frame_size,
   1700                                fp_pair_off, int_regs, n_int, fp_regs, n_fp);
   1701     (void)nwords;
   1702     for (k = 0; k < RV_PROLOGUE_WORDS; ++k)
   1703       rv_patch32(obj, sec, a->prologue_pos + k * 4u, words[k]);
   1704   }
   1705   /* patch alloca sites: addi dst, sp, max_outgoing */
   1706   {
   1707     u32 mo = align_up_u32(a->frame.max_outgoing, 16u);
   1708     u32 k;
   1709     if (mo > 2047u) rv_panic(a, "max_outgoing too large for alloca patch");
   1710     for (k = 0; k < a->npatches; ++k)
   1711       rv_patch32(obj, sec, a->patches[k].pos,
   1712                  rv_addi(a->patches[k].dst_reg, RV_SP, (i32)mo));
   1713   }
   1714 
   1715   /* CFI: CFA = s0 + (frame_size - fp_pair_off) */
   1716   if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
   1717     if (a->slim_prologue) {
   1718       /* Frameless leaf: CFA = sp (unchanged from entry) and the return address
   1719        * stays live in ra (the CIE default), so no saved-register rules. The
   1720        * state holds from the first instruction (offset 0). */
   1721       mc->cfi_set_next_pc_offset(mc, 0);
   1722       mc->cfi_def_cfa(mc, RV_SP, 0);
   1723     } else {
   1724       i32 cfa = (i32)frame_size - (i32)fp_pair_off;
   1725       u32 post = a->prologue_pos + (a->frame.known_frame
   1726                                         ? a->minimal_prologue_words * 4u
   1727                                         : RV_PROLOGUE_WORDS * 4u);
   1728       u32 k;
   1729       mc->cfi_set_next_pc_offset(mc, post - a->func_start);
   1730       mc->cfi_def_cfa(mc, RV_S0, cfa);
   1731       mc->cfi_offset(mc, RV_S0, -cfa);
   1732       /* ra is saved at the saved-pair stride above s0 (ptr_bytes). */
   1733       mc->cfi_offset(mc, RV_RA, -cfa + (i32)a->variant->ptr_bytes);
   1734       for (k = 0; k < n_int; ++k)
   1735         mc->cfi_offset(mc, int_regs[k], rv_save_off(a, n_int, k) - cfa);
   1736       for (k = 0; k < n_fp; ++k)
   1737         mc->cfi_offset(mc, 32u + fp_regs[k],
   1738                        rv_save_off(a, n_int, n_int + k) - cfa);
   1739     }
   1740   }
   1741 
   1742   end = mc->pos(mc);
   1743   obj_symbol_define(obj, a->func->sym, sec, (u64)a->func_start,
   1744                     (u64)(end - a->func_start));
   1745   if (a->func->atomize)
   1746     obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->func->sym,
   1747                     0);
   1748   if (mc->debug) debug_func_pc_range(mc->debug, sec, a->func_start, end);
   1749   if (mc->cfi_endproc) mc->cfi_endproc(mc);
   1750   mc_end_function(mc);
   1751   a->func = NULL;
   1752 }
   1753 
   1754 /* rv64 homes its callee-saves below the locals at rv_save_off(idx) rather than
   1755  * in frame slots, so alloc_slots=0: native_frame just records the {reg,cls} set
   1756  * derived from the optimizer's per-class used-masks. */
   1757 static void rv_reserve_callee_saves(NativeTarget* t, const u32* used,
   1758                                     u32 nclasses) {
   1759   native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0);
   1760 }
   1761 
/* Forward declarations for helpers used by rv_known_callee_saves below: the
 * two callee-saved predicates and the resolver that turns inline-asm clobber
 * names into per-class register masks. */
static int rv_reg_is_callee_int(Reg r);
static int rv_reg_is_callee_fp(Reg r);
static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
                                 u32 nclob, u32* int_mask, u32* fp_mask);
   1766 
/* Expanding the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into
 * this target's per-class caller/callee-saved register masks is not done by a
 * local helper: abi_clobber_masks is shared as native_asm_abi_clobber_masks
 * (cg/native_asm.h), which reads the masks from t->regs->classes. */
   1771 
   1772 /* Build the callee-saved set the prologue must preserve: the allocator-assigned
   1773  * callee-saved registers (frame->callee_saved_used) plus any an inline-asm
   1774  * block clobbers. The latter are opaque to the optimizer's operand scan, so it
   1775  * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
   1776  * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks
   1777  * and keep only the callee-saved ones — rv_reg_is_callee_int excludes s0 (the
   1778  * frame pointer, preserved by the prologue head, not as an ordinary
   1779  * callee-save). This is the same register selection the per-block spill used,
   1780  * hoisted into the prologue. Writes up to `cap` per-class masks into `out` and
   1781  * returns the class count to reserve. */
   1782 static u32 rv_known_callee_saves(NativeTarget* t,
   1783                                  const NativeKnownFrameDesc* frame, u32* out,
   1784                                  u32 cap) {
   1785   u32 ncls = frame->ncallee_classes;
   1786   u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
   1787   if (ncls > cap) ncls = cap;
   1788   for (u32 c = 0; c < ncls; ++c)
   1789     out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
   1790   if (frame->asm_clobbers && frame->nasm_clobbers) {
   1791     RvNativeTarget* a = rv_of(t);
   1792     SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
   1793     rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
   1794                          &clob_int, &clob_fp);
   1795   }
   1796   native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int,
   1797                                &abi_fp);
   1798   clob_int |= abi_int;
   1799   clob_fp |= abi_fp;
   1800   for (Reg r = 0; r < 32u; ++r) {
   1801     if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
   1802         rv_reg_is_callee_int(r))
   1803       out[NATIVE_REG_INT] |= 1u << r;
   1804     if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r))
   1805       out[NATIVE_REG_FP] |= 1u << r;
   1806   }
   1807   return ncls;
   1808 }
   1809 
/* Forward declaration: rv_func_begin_known_frame below uses this to test
 * whether the function's own signature needs any stack-passed bytes. */
static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
                                    int* variadic, u32* nparams);
   1812 
   1813 /* Optimizer entry point: the full frame is supplied up front, so the prologue
   1814  * is emitted final the moment it is built — no NOP region, no func_end patch
   1815  * (rv_func_end skips patching when known_frame). rv_build_prologue emits the
   1816  * sret spill and the variadic register-save stores inline, so there is no
   1817  * separate entry-save emission. Slot creation order matches the single-pass
   1818  * path: callee-saves first (only recorded for rv64), then static slots, then
   1819  * the sret entry-save slot. */
   1820 static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
   1821                                       const NativeKnownFrameDesc* frame,
   1822                                       NativeFrameSlot* out_slots) {
   1823   RvNativeTarget* a = rv_of(t);
   1824   MCEmitter* mc = t->mc;
   1825   u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
   1826   u32 n_int, n_fp, frame_size, fp_pair_off, nwords, i;
   1827   u32 words[RV_KNOWN_PROLOGUE_WORDS];
   1828   rv_func_begin_common(t, fd);
   1829   a->frame.known_frame = 1;
   1830   if (frame) {
   1831     u32 cs[NATIVE_CALL_PLAN_CLASSES];
   1832     u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
   1833     a->frame.has_alloca = frame->has_alloca;
   1834     if (ncs) rv_reserve_callee_saves(t, cs, ncs);
   1835     for (i = 0; i < frame->nslots; ++i) {
   1836       NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]);
   1837       if (out_slots) out_slots[i] = slot;
   1838     }
   1839     rv_reserve_entry_saves(a);
   1840     native_frame_note_outgoing(&a->frame, frame->max_outgoing);
   1841   }
   1842   /* Frame is final: size and offsets are settled, so emit the exact prologue.
   1843    */
   1844   frame_size = rv_frame_size(a);
   1845   fp_pair_off = rv_fp_pair_off(a, frame_size);
   1846   a->frame_size_final = frame_size;
   1847   a->fp_pair_off = fp_pair_off;
   1848   a->prologue_pos = mc->pos(mc);
   1849   /* Leaf no-frame tier (aa64 slim_prologue equivalent): a leaf with no
   1850    * callee-saves, no body slots, no outgoing args, no sret/variadic and
   1851    * register-only params never reads s0 (no frame slots / stack args) nor
   1852    * clobbers ra (no calls). Emit no prologue at all; rv_func_end emits a bare
   1853    * `ret`. cum_off==0 already implies no sret slot and no param spills, but the
   1854    * extra guards keep the intent explicit. Inline asm is excluded: it can
   1855    * clobber ra opaquely, and without the saved record the bare `ret` would
   1856    * return through the destroyed link register. */
   1857   a->slim_prologue = frame && frame->is_leaf && !frame->has_asm &&
   1858                      !frame->reads_frame && a->frame.ncallee_saves == 0 &&
   1859                      !a->frame.has_alloca && a->frame.cum_off == 0 &&
   1860                      a->frame.max_outgoing == 0 && !a->has_sret &&
   1861                      !a->is_variadic &&
   1862                      rv_signature_stack_bytes(t, fd->fn_type, NULL, NULL) == 0;
   1863   if (a->slim_prologue) {
   1864     a->minimal_prologue_words = 0;
   1865     native_frame_set_final(&a->frame);
   1866     return;
   1867   }
   1868   n_int = rv_collect_int_saves(a, int_regs);
   1869   n_fp = rv_collect_fp_saves(a, fp_regs);
   1870   nwords = rv_build_prologue(a, words, RV_KNOWN_PROLOGUE_WORDS, frame_size,
   1871                              fp_pair_off, int_regs, n_int, fp_regs, n_fp);
   1872   for (i = 0; i < nwords; ++i) rv64_emit32(mc, words[i]);
   1873   a->minimal_prologue_words = nwords;
   1874   native_frame_set_final(&a->frame);
   1875 }
   1876 
   1877 /* ============================ params / ABI helpers
   1878  * ============================ */
   1879 
   1880 static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
   1881                                       const NativeCallDesc* desc, u32 i,
   1882                                       ABIArgInfo* scratch) {
   1883   /* Synthesized for unnamed (variadic) args, or untyped calls. RISC-V LP64D
   1884    * passes variadic FP args in INTEGER registers (as their bit pattern), not
   1885    * the FP pool — so a variadic float part is ABI_CLASS_INT. */
   1886   int variadic = abi && i >= abi->nparams;
   1887   u32 gpr = rv_of(t)->variant->ptr_bytes; /* GPR width: 4 ilp32 / 8 lp64 */
   1888   u32 sz, align;
   1889   int is_fp;
   1890   if (abi && i < abi->nparams) return &abi->params[i];
   1891   sz = native_type_size(t, desc->args[i].type);
   1892   align = native_type_align(t, desc->args[i].type);
   1893   /* A variadic FP arg rides the INTEGER pool as its bit pattern (RISC-V passes
   1894    * unnamed FP args in GPRs), so it is INT-class here. */
   1895   is_fp = !variadic && cg_type_is_float(t->c, desc->args[i].type);
   1896   memset(scratch, 0, sizeof *scratch);
   1897   scratch->kind = ABI_ARG_DIRECT;
   1898   /* A scalar wider than one GPR (an 8-byte i64 / soft-double on ilp32) rides a
   1899    * register pair, matching the named-arg classifier (abi_rv64.c). Synthesize
   1900    * one INT part per GPR-word so the per-part marshaller fills both registers
   1901    * (low word in the lower-numbered reg) instead of dropping the high half into
   1902    * a single register. FP-class args (hardware-float, size<=GPR) stay single. */
   1903   if (!is_fp && sz > gpr) {
   1904     u32 nparts = (sz + gpr - 1u) / gpr, p;
   1905     ABIArgPart* parts = arena_zarray(t->c->tu, ABIArgPart, nparts);
   1906     for (p = 0; p < nparts; ++p) {
   1907       u32 off = p * gpr;
   1908       parts[p].cls = ABI_CLASS_INT;
   1909       parts[p].loc = ABI_LOC_REG;
   1910       parts[p].size = (sz - off) < gpr ? (sz - off) : gpr;
   1911       parts[p].align = gpr;
   1912       parts[p].src_offset = off;
   1913     }
   1914     scratch->nparts = (u16)nparts;
   1915     scratch->parts = parts;
   1916     return scratch;
   1917   }
   1918   scratch->nparts = 1;
   1919   scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
   1920   ((ABIArgPart*)scratch->parts)[0].cls = is_fp ? ABI_CLASS_FP : ABI_CLASS_INT;
   1921   ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
   1922   ((ABIArgPart*)scratch->parts)[0].size = sz;
   1923   ((ABIArgPart*)scratch->parts)[0].align = align;
   1924   return scratch;
   1925 }
   1926 
   1927 /* Outgoing stack-slot size/align: the xlen-word (gp_slot_bytes: 8 lp64d /
   1928  * 4 ilp32) is the natural slot stride; stack ABI alignment caps at 16. */
   1929 static u32 rv_part_stack_size(const RiscvVariant* v, const ABIArgPart* part) {
   1930   u32 slot = v->gp_slot_bytes;
   1931   return align_up_u32(part->size ? part->size : slot, slot);
   1932 }
   1933 static u32 rv_part_stack_align(const RiscvVariant* v, const ABIArgPart* part) {
   1934   u32 slot = v->gp_slot_bytes;
   1935   u32 al = part->align ? part->align : slot;
   1936   if (al < slot) al = slot;
   1937   if (al > 16u) al = 16u;
   1938   return al;
   1939 }
   1940 
   1941 static KitCgTypeId rv_part_scalar_type(const ABIArgPart* part) {
   1942   if (part->cls == ABI_CLASS_FP) {
   1943     if (part->size <= 4u) return builtin_id(KIT_CG_BUILTIN_F32);
   1944     return builtin_id(KIT_CG_BUILTIN_F64);
   1945   }
   1946   switch (part->size) {
   1947     case 1u:
   1948       return builtin_id(KIT_CG_BUILTIN_I8);
   1949     case 2u:
   1950       return builtin_id(KIT_CG_BUILTIN_I16);
   1951     case 4u:
   1952       return builtin_id(KIT_CG_BUILTIN_I32);
   1953     default:
   1954       return builtin_id(KIT_CG_BUILTIN_I64);
   1955   }
   1956 }
   1957 
   1958 static u32 rv_class_stack_size(const RiscvVariant* v, const ABIArgInfo* ai) {
   1959   u32 slot = v->gp_slot_bytes;
   1960   u32 total = 0, p;
   1961   if (!ai || ai->kind == ABI_ARG_IGNORE) return 0;
   1962   if (ai->kind == ABI_ARG_INDIRECT) return v->ptr_bytes;
   1963   for (p = 0; p < ai->nparts; ++p) {
   1964     total = align_up_u32(total, rv_part_stack_align(v, &ai->parts[p]));
   1965     total += rv_part_stack_size(v, &ai->parts[p]);
   1966   }
   1967   return align_up_u32(total ? total : slot, slot);
   1968 }
   1969 
   1970 static u32 rv_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) {
   1971   const RiscvVariant* v = rv_of(t)->variant;
   1972   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
   1973   /* sret consumes a0 as the implicit first integer argument. */
   1974   u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
   1975   u32 next_fp = 0, stack = 0, i, p;
   1976   for (i = 0; i < desc->nargs; ++i) {
   1977     ABIArgInfo tmp;
   1978     const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp);
   1979     int force_stack =
   1980         abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
   1981     if (ai->kind == ABI_ARG_IGNORE) continue;
   1982     if (force_stack) {
   1983       stack += rv_class_stack_size(v, ai);
   1984       continue;
   1985     }
   1986     if (ai->kind == ABI_ARG_INDIRECT) {
   1987       if (next_int < 8u)
   1988         next_int++;
   1989       else
   1990         stack += v->ptr_bytes;
   1991       continue;
   1992     }
   1993     for (p = 0; p < ai->nparts; ++p) {
   1994       const ABIArgPart* part = &ai->parts[p];
   1995       if (part->cls == ABI_CLASS_FP) {
   1996         if (next_fp < 8u)
   1997           next_fp++;
   1998         else {
   1999           stack = align_up_u32(stack, rv_part_stack_align(v, part));
   2000           stack += rv_part_stack_size(v, part);
   2001         }
   2002       } else {
   2003         if (next_int < 8u)
   2004           next_int++;
   2005         else {
   2006           stack = align_up_u32(stack, rv_part_stack_align(v, part));
   2007           stack += rv_part_stack_size(v, part);
   2008         }
   2009       }
   2010     }
   2011   }
   2012   return align_up_u32(stack, 16u);
   2013 }
   2014 
   2015 static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
   2016                                     int* variadic, u32* nparams) {
   2017   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type);
   2018   NativeCallDesc d;
   2019   if (variadic) *variadic = abi ? (int)abi->variadic : 0;
   2020   if (nparams) *nparams = abi ? abi->nparams : 0u;
   2021   memset(&d, 0, sizeof d);
   2022   d.fn_type = fn_type;
   2023   d.nargs = abi ? abi->nparams : 0u;
   2024   if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs);
   2025   return rv_call_stack_size(t, &d);
   2026 }
   2027 
   2028 static u32 rv_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) {
   2029   return rv_call_stack_size(t, desc);
   2030 }
   2031 
   2032 /* Resolve a NativeLoc to an addressable NativeAddr (frame/stack/addr). */
   2033 static NativeAddr rv_loc_addr(RvNativeTarget* a, NativeLoc loc, u32 offset) {
   2034   NativeAddr addr;
   2035   memset(&addr, 0, sizeof addr);
   2036   switch ((NativeLocKind)loc.kind) {
   2037     case NATIVE_LOC_FRAME:
   2038       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   2039       addr.base.frame = loc.v.frame;
   2040       addr.base_type = loc.type;
   2041       addr.offset = (i32)offset;
   2042       return addr;
   2043     case NATIVE_LOC_STACK:
   2044       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   2045       addr.base.frame = loc.v.stack.slot;
   2046       addr.base_type = loc.type;
   2047       addr.offset = loc.v.stack.offset + (i32)offset;
   2048       return addr;
   2049     case NATIVE_LOC_ADDR:
   2050       addr = loc.v.addr;
   2051       addr.offset += (i32)offset;
   2052       return addr;
   2053     default:
   2054       rv_panic(a, "location is not addressable");
   2055   }
   2056   return addr;
   2057 }
   2058 
   2059 static void rv_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
   2060                          u32 offset, u32 size) {
   2061   RvNativeTarget* a = rv_of(t);
   2062   if (src.kind == NATIVE_LOC_REG) {
   2063     rv_move(t, dst, src);
   2064     return;
   2065   }
   2066   if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK ||
   2067       src.kind == NATIVE_LOC_ADDR) {
   2068     NativeAddr addr = rv_loc_addr(a, src, offset);
   2069     addr.base_type = dst.type;
   2070     rv_emit_mem(a, 1, dst, addr, native_mem_for_type(t, dst.type, size));
   2071     return;
   2072   }
   2073   if (src.kind == NATIVE_LOC_IMM) {
   2074     rv_emit_load_imm(a->variant, t->mc, rv_is_64(t, dst.type) ? 1u : 0u,
   2075                      loc_reg(dst), src.v.imm);
   2076     return;
   2077   }
   2078   rv_panic(a, "unsupported part source");
   2079 }
   2080 
   2081 static void rv_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
   2082                           u32 offset, u32 size) {
   2083   RvNativeTarget* a = rv_of(t);
   2084   if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK ||
   2085       dst.kind == NATIVE_LOC_ADDR) {
   2086     NativeAddr addr = rv_loc_addr(a, dst, offset);
   2087     addr.base_type = src.type;
   2088     rv_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size));
   2089     return;
   2090   }
   2091   if (dst.kind == NATIVE_LOC_REG) {
   2092     rv_move(t, dst, src);
   2093     return;
   2094   }
   2095   rv_panic(a, "unsupported part destination");
   2096 }
   2097 
   2098 static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   2099   NativeAddr addr = rv_loc_addr(rv_of(t), src, 0);
   2100   rv_load_addr(t, dst, addr);
   2101 }
   2102 
   2103 static void rv_store_outgoing_part(NativeTarget* t, int tail_call,
   2104                                    u32 stack_off, NativeLoc src, u32 size) {
   2105   NativeAddr addr;
   2106   memset(&addr, 0, sizeof addr);
   2107   addr.base_kind = NATIVE_ADDR_BASE_REG;
   2108   addr.base_type = src.type;
   2109   if (tail_call) {
   2110     /* A sibling call reuses the caller's frame: its outgoing stack args land in
   2111      * the caller's incoming-arg window ([s0 + 16 + va_save + off]) — physically
   2112      * the same address the tail-callee will read at [sp+off] once the teardown
   2113      * has restored sp to the caller's entry sp (the CFA). */
   2114     addr.base.reg = RV_S0;
   2115     addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off);
   2116   } else {
   2117     addr.base.reg = RV_SP;
   2118     addr.offset = (i32)stack_off;
   2119   }
   2120   rv_emit_mem(rv_of(t), 0, src, addr, native_mem_for_type(t, src.type, size));
   2121 }
   2122 
   2123 /* NativeTarget bind_param: route incoming param (ABI loc) into dst. */
   2124 static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p,
   2125                                  NativeLoc dst) {
   2126   RvNativeTarget* a = rv_of(t);
   2127   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
   2128   const ABIArgInfo* ai =
   2129       p->index < abi->nparams ? &abi->params[p->index] : NULL;
   2130   int to_reg = dst.kind == NATIVE_LOC_REG;
   2131   u32 i;
   2132   if (!ai || ai->kind == ABI_ARG_IGNORE) return;
   2133   if (ai->kind == ABI_ARG_INDIRECT) {
   2134     NativeLoc src = native_loc_reg(
   2135         builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   2136         a->next_param_int < 8u ? RV_A0 + a->next_param_int : RV_TMP0);
   2137     NativeAddr d_addr, from;
   2138     AggregateAccess access;
   2139     if (a->next_param_int < 8u) {
   2140       a->next_param_int++;
   2141     } else {
   2142       NativeAddr sa;
   2143       memset(&sa, 0, sizeof sa);
   2144       sa.base_kind = NATIVE_ADDR_BASE_REG;
   2145       sa.base.reg = RV_S0;
   2146       sa.offset = rv_s0_off_in_arg(a, a->next_param_stack);
   2147       sa.base_type = src.type;
   2148       rv_emit_mem(a, 1, src, sa,
   2149                   native_mem_for_type(t, src.type, a->variant->ptr_bytes));
   2150       a->next_param_stack += a->variant->ptr_bytes;
   2151     }
   2152     if (dst.kind != NATIVE_LOC_FRAME)
   2153       rv_panic(a, "indirect parameter requires a frame destination");
   2154     memset(&d_addr, 0, sizeof d_addr);
   2155     d_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   2156     d_addr.base.frame = dst.v.frame;
   2157     d_addr.base_type = p->type;
   2158     memset(&from, 0, sizeof from);
   2159     from.base_kind = NATIVE_ADDR_BASE_REG;
   2160     from.base.reg = loc_reg(src);
   2161     from.base_type = p->type;
   2162     memset(&access, 0, sizeof access);
   2163     access.type = p->type;
   2164     access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type);
   2165     access.align = p->align ? p->align : native_type_align(t, p->type);
   2166     rv_copy_bytes(t, d_addr, from, access);
   2167     return;
   2168   }
   2169   for (i = 0; i < ai->nparts; ++i) {
   2170     const ABIArgPart* part = &ai->parts[i];
   2171     NativeAllocClass cls =
   2172         part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2173     NativeLoc src;
   2174     if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) {
   2175       src = native_loc_reg(p->type, cls, RV_FA0 + a->next_param_fp++);
   2176     } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) {
   2177       src = native_loc_reg(p->type, cls, RV_A0 + a->next_param_int++);
   2178     } else {
   2179       Reg tmp = (cls == NATIVE_REG_FP) ? RV_FTMP0 : RV_TMP0;
   2180       NativeAddr sa;
   2181       src = native_loc_reg(p->type, cls, tmp);
   2182       a->next_param_stack = align_up_u32(
   2183           a->next_param_stack, rv_part_stack_align(a->variant, part));
   2184       memset(&sa, 0, sizeof sa);
   2185       sa.base_kind = NATIVE_ADDR_BASE_REG;
   2186       sa.base.reg = RV_S0;
   2187       sa.base_type = p->type;
   2188       sa.offset = rv_s0_off_in_arg(a, a->next_param_stack);
   2189       rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, p->type, part->size));
   2190       a->next_param_stack += rv_part_stack_size(a->variant, part);
   2191     }
   2192     if (dst.kind == NATIVE_LOC_NONE) {
   2193       /* unused parameter; cursors already advanced */
   2194     } else if (to_reg) {
   2195       NativeLoc d = native_loc_reg(dst.type ? dst.type : p->type,
   2196                                    (NativeAllocClass)dst.cls, (Reg)dst.v.reg);
   2197       if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) &&
   2198             (NativeAllocClass)src.cls == (NativeAllocClass)d.cls))
   2199         rv_move(t, d, src);
   2200     } else {
   2201       rv_store_part(
   2202           t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src,
   2203           0, part->size);
   2204     }
   2205   }
   2206   a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
   2207 }
   2208 
   2209 /* ============================ calls / returns ============================ */
   2210 
/* Target-local spelling of the shared NativeArgMove record. */
typedef NativeArgMove RvArgMove;
   2212 
   2213 static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
   2214   if (m->is_addr)
   2215     rv_addr_of_loc(t, m->dst, m->src);
   2216   else
   2217     rv_load_part(t, m->dst, m->src, m->src_offset, m->size);
   2218 }
   2219 
   2220 /* Parallel-copy register arg moves via the shared scheduler; cycles break
   2221  * through the int/fp emit scratch (t1 / ft1). */
   2222 static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
   2223                                   u32 n) {
   2224   NativeArgShuffle s;
   2225   if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args");
   2226   memset(&s, 0, sizeof s);
   2227   s.t = t;
   2228   s.emit_one = rv_emit_one_arg_move;
   2229   s.reg_move = rv_move;
   2230   s.scratch[NATIVE_REG_INT] = RV_TMP1;
   2231   s.scratch[NATIVE_REG_FP] = RV_FTMP1;
   2232   native_arg_shuffle(&s, moves, n);
   2233 }
   2234 
   2235 static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
   2236                          NativeCallPlan* plan) {
   2237   RvNativeTarget* a = rv_of(t);
   2238   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
   2239   NativeCallPlanRet* rets;
   2240   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2241   memset(plan, 0, sizeof *plan);
   2242   rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL;
   2243   plan->callee = desc->callee;
   2244   plan->rets = rets;
   2245   plan->flags = desc->flags;
   2246   plan->has_sret = abi && abi->has_sret;
   2247   plan->is_variadic = abi && abi->variadic;
   2248   plan->stack_arg_size = rv_call_stack_size(t, desc);
   2249   if (plan->stack_arg_size > a->frame.max_outgoing)
   2250     a->frame.max_outgoing = plan->stack_arg_size;
   2251   /* Indirect callee in an arg register would be clobbered by arg loads. */
   2252   if (plan->callee.kind == NATIVE_LOC_REG &&
   2253       (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
   2254       plan->callee.v.reg >= RV_A0 && plan->callee.v.reg <= RV_A7) {
   2255     NativeLoc scratch =
   2256         native_loc_reg(plan->callee.type, NATIVE_REG_INT, RV_TMP0);
   2257     rv_move(t, scratch, plan->callee);
   2258     plan->callee = scratch;
   2259   }
   2260   {
   2261     /* sret returns pass the hidden destination pointer as the implicit first
   2262      * integer argument (a0), so the real args start at a1. */
   2263     u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
   2264     u32 next_fp = 0, stack = 0, nmoves = 0, i, p;
   2265     int tail = (desc->flags & CG_CALL_TAIL) != 0;
   2266     RvArgMove moves[RV_MAX_REG_ARG_MOVES];
   2267     for (i = 0; i < desc->nargs; ++i) {
   2268       ABIArgInfo tmp;
   2269       const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp);
   2270       int force_stack =
   2271           abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
   2272       if (ai->kind == ABI_ARG_IGNORE) continue;
   2273       if (force_stack) {
   2274         NativeLoc tmpreg =
   2275             native_loc_reg(desc->args[i].type, NATIVE_REG_INT, RV_TMP0);
   2276         u32 slot = a->variant->gp_slot_bytes; /* xlen-word: 8 lp64d / 4 ilp32 */
   2277         u32 n = rv_class_stack_size(a->variant, ai), off = 0;
   2278         while (off < n) {
   2279           rv_load_part(t, tmpreg, desc->args[i], off, slot);
   2280           rv_store_outgoing_part(t, tail, stack + off, tmpreg, slot);
   2281           off += slot;
   2282         }
   2283         stack += n;
   2284         continue;
   2285       }
   2286       if (ai->kind == ABI_ARG_INDIRECT) {
   2287         u32 ptr_sz = a->variant->ptr_bytes;
   2288         if (next_int < 8u) {
   2289           RvArgMove* m = &moves[nmoves++];
   2290           m->dst = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0 + next_int++);
   2291           m->src = desc->args[i];
   2292           m->src_offset = 0;
   2293           m->size = ptr_sz;
   2294           m->is_addr = 1;
   2295         } else {
   2296           NativeLoc ptr = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0);
   2297           rv_addr_of_loc(t, ptr, desc->args[i]);
   2298           rv_store_outgoing_part(t, tail, stack, ptr, ptr_sz);
   2299           stack += ptr_sz;
   2300         }
   2301         continue;
   2302       }
   2303       for (p = 0; p < ai->nparts; ++p) {
   2304         const ABIArgPart* part = &ai->parts[p];
   2305         NativeAllocClass cls =
   2306             part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2307         if ((cls == NATIVE_REG_FP && next_fp < 8u) ||
   2308             (cls == NATIVE_REG_INT && next_int < 8u)) {
   2309           RvArgMove* m = &moves[nmoves++];
   2310           Reg areg =
   2311               cls == NATIVE_REG_FP ? RV_FA0 + next_fp++ : RV_A0 + next_int++;
   2312           m->dst = native_loc_reg(desc->args[i].type, cls, areg);
   2313           m->src = desc->args[i];
   2314           m->src_offset = part->src_offset;
   2315           m->size = part->size;
   2316           m->is_addr = 0;
   2317         } else {
   2318           Reg tmp = cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0;
   2319           NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp);
   2320           rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
   2321           stack = align_up_u32(stack, rv_part_stack_align(a->variant, part));
   2322           rv_store_outgoing_part(t, tail, stack, tmpreg, part->size);
   2323           stack += rv_part_stack_size(a->variant, part);
   2324         }
   2325       }
   2326     }
   2327     rv_emit_reg_arg_moves(t, moves, nmoves);
   2328     if (abi && abi->has_sret) {
   2329       /* sret pointer goes in a0; arg loads have completed. A tail call forwards
   2330        * the caller's own incoming sret pointer (spilled at entry) so the
   2331        * sibling writes the result into the caller's caller's destination;
   2332        * otherwise pass the address of this call's result slot. */
   2333       NativeLoc a0 = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0);
   2334       if (tail)
   2335         rv_load_part(t, a0, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0,
   2336                      a->variant->ptr_bytes);
   2337       else if (desc->nresults)
   2338         rv_addr_of_loc(t, a0, desc->results[0]);
   2339     }
   2340   }
   2341   if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
   2342     u32 nr = 0, ni = 0, nf = 0, p;
   2343     for (p = 0; p < abi->ret.nparts; ++p) {
   2344       const ABIArgPart* part = &abi->ret.parts[p];
   2345       NativeAllocClass cls =
   2346           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2347       KitCgTypeId pty = rv_part_scalar_type(part);
   2348       Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++;
   2349       rets[nr].src = native_loc_reg(pty, cls, rreg);
   2350       rets[nr].dst = desc->results[0];
   2351       if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
   2352         rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame,
   2353                                         (i32)part->src_offset);
   2354       else if (rets[nr].dst.kind == NATIVE_LOC_STACK) {
   2355         rets[nr].dst.v.stack.offset += (i32)part->src_offset;
   2356         rets[nr].dst.type = pty;
   2357       }
   2358       rets[nr].mem = native_mem_for_type(t, pty, part->size);
   2359       nr++;
   2360     }
   2361     plan->nrets = nr;
   2362   } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) {
   2363     plan->nrets = 0;
   2364   } else if (!abi && desc->nresults) {
   2365     rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, RV_A0);
   2366     rets[0].dst = desc->results[0];
   2367     rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0);
   2368     plan->nrets = 1;
   2369   }
   2370 }
   2371 
/* Emit a sibling (tail) call: tear the frame down to the caller's entry state
 * and jump (no link) to the callee. Outgoing args are already in the arg regs /
 * the caller's incoming-arg window. At -O0 there are no callee-saves, and the
 * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of
 * the not-yet-final frame_size — so no func_end patching is needed.
 *
 * `callee` is either a NATIVE_LOC_GLOBAL symbol (auipc+jalr pair carrying an
 * R_RV_CALL relocation) or a NATIVE_LOC_REG function pointer (jalr through t1).
 * Any other kind panics — note the teardown instructions have already been
 * emitted by the time that check runs. */
static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) {
  RvNativeTarget* a = rv_of(t);
  const RiscvVariant* v = a->variant;
  MCEmitter* mc = t->mc;
  /* Distance from s0 up to the caller's entry sp. */
  i32 cfa = (i32)(v->frame_save_size + rv_va_save_sz(a));
  int indirect = callee.kind == NATIVE_LOC_REG;
  u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
  u32 n_int = rv_collect_int_saves(a, int_regs);
  u32 n_fp = rv_collect_fp_saves(a, fp_regs);
  i32 i;
  /* Stage an indirect callee into a reserved scratch (t1) BEFORE the teardown:
   * regalloc parks the function pointer in a callee-saved register so it
   * survives arg marshalling, and the callee-save / s0 / ra restores below
   * would otherwise overwrite it. t1 is reserved (never allocable) and
   * untouched by the restore loop (which only uses t0 for far offsets). */
  if (indirect) rv64_emit32(mc, rv_addi(RV_TMP1, loc_reg(callee), 0));
  /* Restore callee-saves before tearing the frame down (O1 path; none at -O0).
   * Their save offsets are s0-relative via rv_save_off, so the restore is
   * frame-size- and teardown-order-independent. */
  for (i = (i32)n_int - 1; i >= 0; --i)
    rv_load_s0(v, mc, 0, int_regs[i], rv_save_off(a, n_int, (u32)i));
  /* FP saves are indexed after the integer ones in the save area. */
  for (i = (i32)n_fp - 1; i >= 0; --i)
    rv_load_s0(v, mc, 1, fp_regs[i], rv_save_off(a, n_int, n_int + (u32)i));
  /* Order matters from here: ra and sp are both derived from the still-live
   * s0, so s0 itself must be reloaded last. */
  rv64_emit32(mc, rv_ld_ptr(v, RV_RA, RV_S0, (i32)v->ptr_bytes));
  rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa));
  rv64_emit32(mc, rv_ld_ptr(v, RV_S0, RV_S0, 0));
  if (callee.kind == NATIVE_LOC_GLOBAL) {
    /* auipc t0 / jalr x0, t0: jump without writing a link register; the
     * R_RV_CALL relocation is attached at the auipc's position. */
    u32 pos = mc->pos(mc);
    rv64_emit32(mc, rv_auipc(RV_TMP0, 0));
    rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0));
    mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym,
                      callee.v.global.addend, 0, 0);
  } else if (indirect) {
    /* Jump through the copy staged in t1 above. */
    rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP1, 0));
  } else {
    rv_panic(a, "unsupported tail call target");
  }
}
   2415 
   2416 static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
   2417   MCEmitter* mc = t->mc;
   2418   ObjSecId sec = mc->section_id;
   2419   if (plan->flags & CG_CALL_TAIL) {
   2420     rv_emit_tail_site(t, plan->callee);
   2421     return;
   2422   }
   2423   if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
   2424     u32 pos = mc->pos(mc);
   2425     rv64_emit32(mc, rv_auipc(RV_RA, 0));
   2426     rv64_emit32(mc, rv_jalr(RV_RA, RV_RA, 0));
   2427     mc->emit_reloc_at(mc, sec, pos, R_RV_CALL, plan->callee.v.global.sym,
   2428                       plan->callee.v.global.addend, 0, 0);
   2429     return;
   2430   }
   2431   if (plan->callee.kind == NATIVE_LOC_REG) {
   2432     rv64_emit32(mc, rv_jalr(RV_RA, loc_reg(plan->callee), 0));
   2433     return;
   2434   }
   2435   rv_panic(rv_of(t), "unsupported call target");
   2436 }
   2437 
   2438 static void rv_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
   2439                         const NativeLoc* value,
   2440                         NativeCallPlanRet** out_rets, u32* out_nrets) {
   2441   RvNativeTarget* a = rv_of(t);
   2442   const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
   2443   NativeCallPlanRet* rets = NULL;
   2444   u32 nr = 0;
   2445   if (value) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4);
   2446   if (value && abi && abi->ret.kind == ABI_ARG_INDIRECT) {
   2447     KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2448     NativeLoc dstp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
   2449     NativeLoc saved = native_loc_stack(i64t, a->sret_ptr_slot, 0);
   2450     NativeAddr dst_addr, src_addr;
   2451     AggregateAccess access;
   2452     rv_load_part(t, dstp, saved, 0, a->variant->ptr_bytes);
   2453     memset(&dst_addr, 0, sizeof dst_addr);
   2454     dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
   2455     dst_addr.base.reg = RV_TMP1;
   2456     dst_addr.base_type = value->type;
   2457     src_addr = rv_loc_addr(a, *value, 0);
   2458     src_addr.base_type = value->type;
   2459     memset(&access, 0, sizeof access);
   2460     access.type = value->type;
   2461     access.size = (u32)cg_type_size(t->c, value->type);
   2462     access.align = native_type_align(t, value->type);
   2463     rv_copy_bytes(t, dst_addr, src_addr, access);
   2464     *out_rets = NULL;
   2465     *out_nrets = 0;
   2466     return;
   2467   }
   2468   if (value && abi && abi->ret.kind == ABI_ARG_DIRECT) {
   2469     u32 ni = 0, nf = 0, p;
   2470     for (p = 0; p < abi->ret.nparts; ++p) {
   2471       const ABIArgPart* part = &abi->ret.parts[p];
   2472       NativeAllocClass cls =
   2473           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
   2474       KitCgTypeId pty = rv_part_scalar_type(part);
   2475       Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++;
   2476       rets[nr].src = *value;
   2477       if (rets[nr].src.kind == NATIVE_LOC_FRAME)
   2478         rets[nr].src =
   2479             native_loc_stack(pty, value->v.frame, (i32)part->src_offset);
   2480       else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
   2481         rets[nr].src.v.stack.offset += (i32)part->src_offset;
   2482         rets[nr].src.type = pty;
   2483       }
   2484       rets[nr].dst = native_loc_reg(pty, cls, rreg);
   2485       rets[nr].mem = native_mem_for_type(t, pty, part->size);
   2486       nr++;
   2487     }
   2488   } else if (value) {
   2489     rets[0].src = *value;
   2490     rets[0].dst = native_loc_reg(value->type, NATIVE_REG_INT, RV_A0);
   2491     rets[0].mem = native_mem_for_type(t, value->type, 0);
   2492     nr = 1;
   2493   }
   2494   *out_rets = rets;
   2495   *out_nrets = nr;
   2496 }
   2497 
   2498 static void rv_ret(NativeTarget* t) {
   2499   RvNativeTarget* a = rv_of(t);
   2500   rv_jump(t, a->epilogue_label);
   2501 }
   2502 
   2503 /* ============================ alloca ============================ */
   2504 
   2505 static void rv_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
   2506                       u32 align) {
   2507   RvNativeTarget* a = rv_of(t);
   2508   MCEmitter* mc = t->mc;
   2509   u32 rsz = loc_reg(size);
   2510   u32 rd = loc_reg(dst);
   2511   u32 al = align ? align : 16u;
   2512   if (al < 16u) al = 16u;
   2513   /* round up: t0 = (size + (al-1)) & ~(al-1) */
   2514   rv64_emit32(mc, rv_addi(RV_TMP0, rsz, (i32)(al - 1u)));
   2515   rv_emit_load_imm(a->variant, mc, 1, RV_TMP1, -(i64)al);
   2516   rv64_emit32(mc, rv_and(RV_TMP0, RV_TMP0, RV_TMP1));
   2517   rv64_emit32(mc, rv_sub(RV_SP, RV_SP, RV_TMP0));
   2518   a->frame.has_alloca = 1;
   2519   /* dst = sp + max_outgoing (patched in func_end) */
   2520   if (a->npatches == a->patches_cap) {
   2521     u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
   2522     RvPatch* nb = arena_zarray(t->c->tu, RvPatch, cap);
   2523     if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
   2524     a->patches = nb;
   2525     a->patches_cap = cap;
   2526   }
   2527   a->patches[a->npatches].kind = RV_PATCH_ALLOCA;
   2528   a->patches[a->npatches].pos = mc->pos(mc);
   2529   a->patches[a->npatches].dst_reg = rd;
   2530   a->npatches++;
   2531   a->nalloca++;
   2532   rv64_emit32(mc, RV_NOP); /* placeholder for addi dst, sp, max_outgoing */
   2533 }
   2534 
/* ===================== TLS / bitfield / atomics ===================== */
   2537 
   2538 static void rv_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
   2539                            i64 addend) {
   2540   MCEmitter* mc = t->mc;
   2541   u32 sec = mc->section_id;
   2542   u32 rd = loc_reg(dst);
   2543   /* Local-Exec only, matching aa64 (aa_tls_addr_of) and x64 (x64_tls_addr_of):
   2544    * kit links the whole module statically, so every _Thread_local symbol is
   2545    * resolved within the image and TPREL is always valid. An Initial-Exec GOT
   2546    * path (R_RV_TLS_GOT_HI20) used to be emitted for extern-via-GOT symbols
   2547    * under -fPIE (the hosted default), but the linker has no layout/apply for
   2548    * that reloc, so it produced a hard "unsupported reloc kind" link failure
   2549    * rather than a working binary. */
   2550   /* lui t0, %tprel_hi(sym); add t0, tp, t0; addi dst, t0, %tprel_lo(sym). */
   2551   {
   2552     u32 hp = mc->pos(mc);
   2553     rv64_emit32(mc, rv_lui(RV_TMP0, 0));
   2554     mc->emit_reloc_at(mc, sec, hp, R_RV_TPREL_HI20, sym, addend, 0, 0);
   2555     rv64_emit32(mc, rv_add(RV_TMP0, RV_TP, RV_TMP0));
   2556     {
   2557       u32 lp = mc->pos(mc);
   2558       rv64_emit32(mc, rv_addi(rd, RV_TMP0, 0));
   2559       mc->emit_reloc_at(mc, sec, lp, R_RV_TPREL_LO12_I, sym, addend, 0, 0);
   2560     }
   2561   }
   2562 }
   2563 static void rv_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr ra,
   2564                              BitFieldAccess bf) {
   2565   RvNativeTarget* a = rv_of(t);
   2566   const RiscvVariant* v = a->variant;
   2567   MCEmitter* mc = t->mc;
   2568   u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
   2569   u32 rd = loc_reg(dst);
   2570   u32 base;
   2571   i32 off;
   2572   u32 lsb = bf.bit_offset;
   2573   u32 width = bf.bit_width ? bf.bit_width : 1u;
   2574   /* Shift left so the field's MSB lands at the register top (XLEN-1), then shift
   2575    * right to sign/zero extend it down. Shifts are XLEN-wide. */
   2576   u32 sh_left = v->xlen - (lsb + width);
   2577   u32 sh_right = v->xlen - width;
   2578   ra.offset += (i32)bf.storage_offset;
   2579   rv_resolve_mem_addr(a, &ra, &base, &off);
   2580   rv64_emit32(mc, enc_int_load(v, storage_bytes, 0, rd, base, off));
   2581   rv64_emit32(mc, rv_slli(rd, rd, sh_left));
   2582   if (bf.signed_)
   2583     rv64_emit32(mc, rv_srai(rd, rd, sh_right));
   2584   else
   2585     rv64_emit32(mc, rv_srli(rd, rd, sh_right));
   2586 }
/* Store register `src` into a bit-field with a read-modify-write of its
 * storage unit:
 *   t2 = load(unit); t2 &= ~(ones << lsb);
 *   t0 = (src & ones) << lsb; t2 |= t0; store(unit) = t2
 * A zero storage size is treated as 4 bytes and a zero width as 1 bit.
 * Uses t0, t1 and t2 as scratch; `src` must not be one of them. */
static void rv_bitfield_store(NativeTarget* t, NativeAddr ra, NativeLoc src,
                              BitFieldAccess bf) {
  RvNativeTarget* a = rv_of(t);
  const RiscvVariant* v = a->variant;
  MCEmitter* mc = t->mc;
  u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
  u32 src_reg = loc_reg(src);
  u32 base;
  i32 off;
  u32 lsb = bf.bit_offset;
  u32 width = bf.bit_width ? bf.bit_width : 1u;
  /* `ones` = low `width` bits set; width >= 64 is special-cased because a
   * 64-bit shift by 64 is undefined in C. */
  u64 ones = width >= 64u ? ~(u64)0 : (((u64)1 << width) - 1u);
  /* Field mask positioned inside the storage unit. */
  u64 mask_in = ones << lsb;
  ra.offset += (i32)bf.storage_offset;
  /* Resolve the field address; rv_resolve_mem_addr may use RV_TMP0/RV_TMP1, so
   * stabilize the base into RV_TMP1 before consuming the scratch temps. */
  rv_resolve_mem_addr(a, &ra, &base, &off);
  if (base != RV_S0 && base != RV_TMP1) {
    /* Any base other than s0/t1 (t0 included) is folded into t1 so the t0
     * uses below cannot disturb it. */
    rv_emit_addr_adjust(v, mc, RV_TMP1, base, off);
    base = RV_TMP1;
    off = 0;
  } else if (base == RV_TMP1 && off != 0) {
    rv_emit_addr_adjust(v, mc, RV_TMP1, RV_TMP1, off);
    off = 0;
  }
  /* word in RV_TMP2; merged via RV_TMP0 (clear mask, then shifted src). */
  rv64_emit32(mc, enc_int_load(v, storage_bytes, 0, RV_TMP2, base, off));
  rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)~mask_in);
  rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP0)); /* clear the old field */
  rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)ones);
  rv64_emit32(mc, rv_and(RV_TMP0, src_reg, RV_TMP0)); /* truncate src to width */
  if (lsb) rv64_emit32(mc, rv_slli(RV_TMP0, RV_TMP0, lsb));
  rv64_emit32(mc, rv_or(RV_TMP2, RV_TMP2, RV_TMP0));
  rv64_emit32(mc, enc_int_store(v, storage_bytes, RV_TMP2, base, off));
}
   2622 static int rv_order_acquire(KitCgMemOrder o) {
   2623   return o == KIT_CG_MO_CONSUME || o == KIT_CG_MO_ACQUIRE ||
   2624          o == KIT_CG_MO_ACQ_REL || o == KIT_CG_MO_SEQ_CST;
   2625 }
   2626 static int rv_order_release(KitCgMemOrder o) {
   2627   return o == KIT_CG_MO_RELEASE || o == KIT_CG_MO_ACQ_REL ||
   2628          o == KIT_CG_MO_SEQ_CST;
   2629 }
   2630 
   2631 /* Materialize the atomic operand address into RV_TMP0 (a bare pointer, since
   2632  * LR/SC and AMO take a base register with no offset) and return it. */
   2633 static u32 rv_atomic_addr_reg(RvNativeTarget* a, NativeAddr addr) {
   2634   NativeLoc dst =
   2635       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0);
   2636   rv_load_addr(&a->base, dst, addr);
   2637   return RV_TMP0;
   2638 }
   2639 
   2640 static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
   2641                            MemAccess mem, KitCgMemOrder mo) {
   2642   RvNativeTarget* a = rv_of(t);
   2643   MCEmitter* mc = t->mc;
   2644   u32 sf =
   2645       (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u;
   2646   u32 base = rv_atomic_addr_reg(a, addr);
   2647   if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw());
   2648   if (rv_order_acquire(mo)) {
   2649     /* lr.w/d as an ordered load (aq=1). */
   2650     rv64_emit32(mc, sf ? rv_lr_d(loc_reg(dst), base, 1, 0)
   2651                        : rv_lr_w(loc_reg(dst), base, 1, 0));
   2652   } else {
   2653     rv64_emit32(mc, enc_int_load(a->variant,
   2654                                  mem.size ? mem.size
   2655                                           : native_type_size(t, dst.type),
   2656                                  0, loc_reg(dst), base, 0));
   2657   }
   2658 }
   2659 
   2660 static void rv_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
   2661                             MemAccess mem, KitCgMemOrder mo) {
   2662   RvNativeTarget* a = rv_of(t);
   2663   MCEmitter* mc = t->mc;
   2664   u32 sz = mem.size ? mem.size : native_type_size(t, src.type);
   2665   /* RV_TMP0 holds the address; never collides with src (an allocable reg). */
   2666   u32 base = rv_atomic_addr_reg(a, addr);
   2667   if (rv_order_release(mo)) rv64_emit32(mc, rv_fence_rw_rw());
   2668   rv64_emit32(mc, enc_int_store(a->variant, sz, loc_reg(src), base, 0));
   2669   if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw());
   2670 }
   2671 
   2672 static void rv_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst,
   2673                           NativeAddr addr, NativeLoc val, MemAccess mem,
   2674                           KitCgMemOrder mo) {
   2675   RvNativeTarget* a = rv_of(t);
   2676   const RiscvVariant* v = a->variant;
   2677   MCEmitter* mc = t->mc;
   2678   u32 sf = (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u;
   2679   /* W-form add/sub apply only to a 32-bit value on rv64; on rv32 the base ops
   2680    * are the 32-bit ops. */
   2681   int w = !sf && v->has_w_forms;
   2682   u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */
   2683   u32 vreg = loc_reg(val);
   2684   u32 rd = loc_reg(dst);
   2685   u32 aq = (u32)rv_order_acquire(mo);
   2686   u32 rl = (u32)rv_order_release(mo);
   2687   MCLabel retry = mc->label_new(mc);
   2688   /* LR/SC loop: dst = *base; new = dst op val; sc new; retry on failure.
   2689    * RV_TMP1 carries the SC status, RV_TMP3 the computed new value. */
   2690   mc->label_place(mc, retry);
   2691   rv64_emit32(mc, sf ? rv_lr_d(rd, base, aq, 0) : rv_lr_w(rd, base, aq, 0));
   2692   switch (op) {
   2693     case KIT_CG_ATOMIC_XCHG:
   2694       rv64_emit32(mc, rv_addi(RV_TMP3, vreg, 0));
   2695       break;
   2696     case KIT_CG_ATOMIC_ADD:
   2697       rv64_emit32(mc,
   2698                   w ? rv_addw(RV_TMP3, rd, vreg) : rv_add(RV_TMP3, rd, vreg));
   2699       break;
   2700     case KIT_CG_ATOMIC_SUB:
   2701       rv64_emit32(mc,
   2702                   w ? rv_subw(RV_TMP3, rd, vreg) : rv_sub(RV_TMP3, rd, vreg));
   2703       break;
   2704     case KIT_CG_ATOMIC_AND:
   2705       rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg));
   2706       break;
   2707     case KIT_CG_ATOMIC_OR:
   2708       rv64_emit32(mc, rv_or(RV_TMP3, rd, vreg));
   2709       break;
   2710     case KIT_CG_ATOMIC_XOR:
   2711       rv64_emit32(mc, rv_xor(RV_TMP3, rd, vreg));
   2712       break;
   2713     case KIT_CG_ATOMIC_NAND:
   2714       rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg));
   2715       rv64_emit32(mc, rv_xori(RV_TMP3, RV_TMP3, -1));
   2716       break;
   2717     default:
   2718       rv_panic(a, "unsupported atomic rmw op");
   2719   }
   2720   rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, RV_TMP3, 0, rl)
   2721                      : rv_sc_w(RV_TMP1, base, RV_TMP3, 0, rl));
   2722   rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0));
   2723   mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0);
   2724 }
   2725 
   2726 static void rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
   2727                           NativeAddr addr, NativeLoc expected,
   2728                           NativeLoc desired, MemAccess mem,
   2729                           KitCgMemOrder success, KitCgMemOrder failure) {
   2730   RvNativeTarget* a = rv_of(t);
   2731   MCEmitter* mc = t->mc;
   2732   u32 sf =
   2733       (mem.size ? mem.size : native_type_size(t, prior.type)) == 8u ? 1u : 0u;
   2734   u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */
   2735   u32 rprior = loc_reg(prior);
   2736   u32 rexp = loc_reg(expected);
   2737   u32 rdes = loc_reg(desired);
   2738   u32 rok = loc_reg(ok);
   2739   u32 aq = (u32)rv_order_acquire(success);
   2740   u32 rl = (u32)rv_order_release(success);
   2741   MCLabel retry = mc->label_new(mc);
   2742   MCLabel fail = mc->label_new(mc);
   2743   MCLabel done = mc->label_new(mc);
   2744   (void)failure;
   2745   mc->label_place(mc, retry);
   2746   rv64_emit32(mc,
   2747               sf ? rv_lr_d(rprior, base, aq, 0) : rv_lr_w(rprior, base, aq, 0));
   2748   /* if (prior != expected) -> fail */
   2749   rv64_emit32(mc, rv_bne(rprior, rexp, 0));
   2750   mc->emit_label_ref(mc, fail, R_RV_BRANCH, 4, 0);
   2751   /* sc.w/d status, desired, (base); retry on failure. */
   2752   rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, rdes, 0, rl)
   2753                      : rv_sc_w(RV_TMP1, base, rdes, 0, rl));
   2754   rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0));
   2755   mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0);
   2756   /* ok = 1; jump done. */
   2757   rv_emit_load_imm(a->variant, mc, 0, rok, 1);
   2758   rv64_emit32(mc, rv_jal(RV_ZERO, 0));
   2759   mc->emit_label_ref(mc, done, R_RV_JAL, 4, 0);
   2760   mc->label_place(mc, fail);
   2761   rv_emit_load_imm(a->variant, mc, 0, rok, 0);
   2762   mc->label_place(mc, done);
   2763 }
   2764 
   2765 static void rv_fence(NativeTarget* t, KitCgMemOrder mo) {
   2766   if (mo == KIT_CG_MO_RELAXED) return;
   2767   rv64_emit32(t->mc, rv_fence_rw_rw());
   2768 }
   2769 /* ---- variadics (LP64D ABI_VA_LIST_POINTER) ----
   2770  * va_list is a single void* to the next argument slot. The prologue spilled
   2771  * unconsumed a-regs into the 64-byte save area at [s0+16); incoming stack args
   2772  * follow contiguously, so a uniform 8-byte stride covers both. `ap` is a
   2773  * NativeAddr that addresses the va_list object itself. */
   2774 
   2775 static void rv_va_start_core(RvNativeTarget* a, NativeAddr ap) {
   2776   NativeTarget* t = &a->base;
   2777   const RiscvVariant* v = a->variant;
   2778   MCEmitter* mc = t->mc;
   2779   ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
   2780   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2781   u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes;
   2782   if (vai.kind != ABI_VA_LIST_POINTER)
   2783     rv_panic(a, "unsupported va_list layout");
   2784   if (!a->is_variadic) rv_panic(a, "va_start: function not variadic");
   2785   /* *ap = s0 + frame_save + next_param_int*gp_slot (skip named-int slots). */
   2786   rv64_emit32(mc, rv_addi(RV_TMP1, RV_S0,
   2787                           (i32)v->frame_save_size +
   2788                               (i32)(a->next_param_int * slot)));
   2789   rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1), ap,
   2790               native_mem_for_type(t, i64t, v->ptr_bytes));
   2791 }
   2792 
   2793 /* Wide / aggregate va_arg: a value too large for a single GPR (an 8-byte
   2794  * i64 / soft-double on ilp32) occupies consecutive GP slots in the save area
   2795  * and cannot move through one register. Read the cursor, advance it past the
   2796  * whole span, then byte-copy the value from the (saved) cursor into the
   2797  * destination memory. RV_TMP2 holds the cursor across the rv_copy_bytes call,
   2798  * which itself uses RV_TMP0/RV_TMP1/RV_TMP3. */
   2799 static void rv_va_arg_wide(RvNativeTarget* a, NativeAddr dst, NativeAddr ap,
   2800                            u32 sz) {
   2801   NativeTarget* t = &a->base;
   2802   const RiscvVariant* v = a->variant;
   2803   MCEmitter* mc = t->mc;
   2804   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2805   ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
   2806   u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes;
   2807   u32 span = align_up_u32(sz, slot);
   2808   NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP2);
   2809   NativeLoc nxt = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
   2810   NativeAddr src;
   2811   AggregateAccess acc;
   2812   /* cur = *ap; *ap = cur + span. */
   2813   rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes));
   2814   rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP2, (i32)span));
   2815   rv_emit_mem(a, 0, nxt, ap, native_mem_for_type(t, i64t, v->ptr_bytes));
   2816   /* Copy sz bytes from [cur] to the destination. */
   2817   memset(&src, 0, sizeof src);
   2818   src.base_kind = NATIVE_ADDR_BASE_REG;
   2819   src.base.reg = RV_TMP2;
   2820   src.base_type = i64t;
   2821   memset(&acc, 0, sizeof acc);
   2822   acc.type = i64t;
   2823   acc.size = sz;
   2824   acc.align = slot;
   2825   rv_copy_bytes(t, dst, src, acc);
   2826 }
   2827 
   2828 static void rv_va_arg_core(RvNativeTarget* a, NativeLoc dst, NativeAddr ap,
   2829                            KitCgTypeId type) {
   2830   NativeTarget* t = &a->base;
   2831   const RiscvVariant* v = a->variant;
   2832   MCEmitter* mc = t->mc;
   2833   ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
   2834   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2835   u32 sz = native_type_size(t, type);
   2836   u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes;
   2837   NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
   2838   NativeAddr from;
   2839   if (vai.kind != ABI_VA_LIST_POINTER)
   2840     rv_panic(a, "unsupported va_list layout");
   2841   if (dst.kind != NATIVE_LOC_REG) rv_panic(a, "va_arg destination must be reg");
   2842   /* cur = *ap; load value from [cur]; *ap = cur + slot (one GP-slot stride). */
   2843   rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes));
   2844   memset(&from, 0, sizeof from);
   2845   from.base_kind = NATIVE_ADDR_BASE_REG;
   2846   from.base.reg = RV_TMP1;
   2847   from.base_type = type;
   2848   if (native_loc_is_fp(dst)) {
   2849     /* Variadic FP args sit in the integer save area as their bit pattern;
   2850      * load into RV_TMP2 and bitcast into the FPR. The fmv_d_x (double) path is
   2851      * RV64-only — on rv32 doubles are passed soft and never reach here. */
   2852     NativeLoc itmp = native_loc_reg(type, NATIVE_REG_INT, RV_TMP2);
   2853     rv_emit_mem(a, 1, itmp, from, native_mem_for_type(t, type, sz));
   2854     rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(loc_reg(dst), RV_TMP2)
   2855                              : rv_fmv_w_x(loc_reg(dst), RV_TMP2));
   2856   } else {
   2857     rv_emit_mem(a, 1, dst, from, native_mem_for_type(t, type, sz));
   2858   }
   2859   rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, (i32)slot));
   2860   rv_emit_mem(a, 0, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes));
   2861 }
   2862 
   2863 static void rv_va_copy_core(RvNativeTarget* a, NativeAddr dst_ap,
   2864                             NativeAddr src_ap) {
   2865   NativeTarget* t = &a->base;
   2866   u32 ptr = a->variant->ptr_bytes;
   2867   KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
   2868   NativeLoc tmp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
   2869   /* va_list is a single pointer-width slot. */
   2870   rv_emit_mem(a, 1, tmp, src_ap, native_mem_for_type(t, i64t, ptr));
   2871   rv_emit_mem(a, 0, tmp, dst_ap, native_mem_for_type(t, i64t, ptr));
   2872 }
   2873 
   2874 static NativeAddr rv_va_addr_from_ptr(NativeLoc ap_ptr) {
   2875   NativeAddr addr;
   2876   memset(&addr, 0, sizeof addr);
   2877   addr.base_kind = NATIVE_ADDR_BASE_REG;
   2878   addr.cls = NATIVE_REG_INT;
   2879   addr.base.reg = ap_ptr.v.reg;
   2880   addr.base_type = ap_ptr.type;
   2881   return addr;
   2882 }
   2883 
   2884 static void rv_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
   2885   rv_va_start_core(rv_of(t), rv_va_addr_from_ptr(ap_ptr));
   2886 }
   2887 /* A scalar whose value cannot move through one GPR (size > GPR width, e.g. an
   2888  * 8-byte i64 / soft-double on ilp32). pass_native_emit hands such a va_arg its
   2889  * memory destination directly rather than a scratch register. */
   2890 static int rv_va_arg_is_wide(NativeTarget* t, KitCgTypeId type) {
   2891   return native_type_size(t, type) > rv_of(t)->variant->ptr_bytes;
   2892 }
   2893 
   2894 static void rv_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
   2895                              KitCgTypeId type) {
   2896   RvNativeTarget* a = rv_of(t);
   2897   if (rv_va_arg_is_wide(t, type)) {
   2898     rv_va_arg_wide(a, rv_loc_addr(a, dst, 0), rv_va_addr_from_ptr(ap_ptr),
   2899                    native_type_size(t, type));
   2900     return;
   2901   }
   2902   rv_va_arg_core(a, dst, rv_va_addr_from_ptr(ap_ptr), type);
   2903 }
/* NativeTarget entry point for va_end: emits nothing; both parameters are
 * intentionally unused. */
static void rv_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
  (void)t;
  (void)ap_ptr;
}
   2908 static void rv_va_copy_native(NativeTarget* t, NativeLoc dst, NativeLoc src) {
   2909   rv_va_copy_core(rv_of(t), rv_va_addr_from_ptr(dst), rv_va_addr_from_ptr(src));
   2910 }
   2911 /* Software popcount of RV_TMP1 (already width-normalized) into rd, using
   2912  * RV_TMP1/RV_TMP2/RV_TMP3 as scratch. Mirrors the legacy bit-twiddling. On rv32
   2913  * only the 32-bit (is64==0) path is reachable for a single register. */
   2914 static void rv_emit_popcount(const RiscvVariant* v, MCEmitter* mc, u32 rd,
   2915                              int is64) {
   2916   rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 1));
   2917   rv_emit_load_imm(v, mc, 1, RV_TMP3,
   2918                    is64 ? (i64)0x5555555555555555ll : (i64)0x55555555);
   2919   rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP3));
   2920   rv64_emit32(mc, rv_sub(RV_TMP1, RV_TMP1, RV_TMP2));
   2921   rv_emit_load_imm(v, mc, 1, RV_TMP3,
   2922                    is64 ? (i64)0x3333333333333333ll : (i64)0x33333333);
   2923   rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP1, RV_TMP3));
   2924   rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 2));
   2925   rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3));
   2926   rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2));
   2927   rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 4));
   2928   rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2));
   2929   rv_emit_load_imm(v, mc, 1, RV_TMP3,
   2930                    is64 ? (i64)0x0f0f0f0f0f0f0f0fll : (i64)0x0f0f0f0f);
   2931   rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3));
   2932   rv_emit_load_imm(v, mc, 1, RV_TMP3,
   2933                    is64 ? (i64)0x0101010101010101ll : (i64)0x01010101);
   2934   rv64_emit32(mc, rv_mul(RV_TMP1, RV_TMP1, RV_TMP3));
   2935   rv64_emit32(mc, rv_srli(rd, RV_TMP1, is64 ? 56u : 24u));
   2936   /* The 32-bit SWAR sum lives in product bits [24,32); since the multiply is
   2937    * 64-bit, bits [32,64) survive the >>24 and must be masked off. (The 64-bit
   2938    * path's >>56 already isolates the top byte, so it needs no mask.) */
   2939   if (!is64) rv64_emit32(mc, rv_andi(rd, rd, 0xff));
   2940 }
   2941 
   2942 /* Inline byte-granule copy/set between bare base registers (memcpy/memmove/
   2943  * memset intrinsics). dir<0 copies high-to-low (memmove backward). The 8-byte
   2944  * granule (ld/sd) and zero-extending lwu are RV64-only; on rv32 the widest
   2945  * granule is 4 bytes via lw/sw. */
   2946 static void rv_intrin_copy(const RiscvVariant* v, MCEmitter* mc, u32 dr, u32 sr,
   2947                            u32 n, int backward) {
   2948   int wide = v->ptr_bytes == 8u;
   2949   if (!backward) {
   2950     u32 i = 0;
   2951     while (wide && i + 8u <= n) {
   2952       rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i));
   2953       rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i));
   2954       i += 8u;
   2955     }
   2956     while (i + 4u <= n) {
   2957       rv64_emit32(mc, wide ? rv_lwu(RV_TMP3, sr, (i32)i)
   2958                            : rv_lw(RV_TMP3, sr, (i32)i));
   2959       rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i));
   2960       i += 4u;
   2961     }
   2962     while (i + 2u <= n) {
   2963       rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i));
   2964       rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i));
   2965       i += 2u;
   2966     }
   2967     while (i < n) {
   2968       rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i));
   2969       rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i));
   2970       i += 1u;
   2971     }
   2972   } else {
   2973     u32 i = n;
   2974     while (wide && i >= 8u) {
   2975       i -= 8u;
   2976       rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i));
   2977       rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i));
   2978     }
   2979     while (i >= 4u) {
   2980       i -= 4u;
   2981       rv64_emit32(mc, wide ? rv_lwu(RV_TMP3, sr, (i32)i)
   2982                            : rv_lw(RV_TMP3, sr, (i32)i));
   2983       rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i));
   2984     }
   2985     while (i >= 2u) {
   2986       i -= 2u;
   2987       rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i));
   2988       rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i));
   2989     }
   2990     while (i >= 1u) {
   2991       i -= 1u;
   2992       rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i));
   2993       rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i));
   2994     }
   2995   }
   2996 }
   2997 
   2998 static void rv_intrinsic(NativeTarget* t, IntrinKind kind,
   2999                          const NativeLoc* dsts, u32 ndst, const NativeLoc* args,
   3000                          u32 narg) {
   3001   RvNativeTarget* a = rv_of(t);
   3002   const RiscvVariant* v = a->variant;
   3003   MCEmitter* mc = t->mc;
   3004   (void)ndst;
   3005   (void)narg;
   3006   switch (kind) {
   3007     case INTRIN_NONE:
   3008       break;
   3009     case INTRIN_EXPECT:
   3010     case INTRIN_ASSUME_ALIGNED: {
   3011       /* dst = val (hint dropped). */
   3012       if (args[0].kind == NATIVE_LOC_IMM)
   3013         rv_emit_load_imm(v, mc, rv_is_64(t, dsts[0].type) ? 1u : 0u,
   3014                          loc_reg(dsts[0]), args[0].v.imm);
   3015       else
   3016         rv_move(t, dsts[0], args[0]);
   3017       return;
   3018     }
   3019     case INTRIN_PREFETCH:
   3020       return;
   3021     case INTRIN_TRAP:
   3022       rv64_emit32(mc, rv_ebreak());
   3023       return;
   3024     case INTRIN_SYSCALL:
   3025       if (ndst == 1u && narg >= 1u && narg <= 7u) {
   3026         static const u32 syscall_regs[7] = {RV_A7, RV_A0, RV_A1, RV_A2,
   3027                                             RV_A3, RV_A4, RV_A5};
   3028         RvArgMove moves[7];
   3029         for (u32 i = 0; i < narg; ++i) {
   3030           RvArgMove* m = &moves[i];
   3031           memset(m, 0, sizeof *m);
   3032           m->dst =
   3033               native_loc_reg(dsts[0].type, NATIVE_REG_INT, syscall_regs[i]);
   3034           m->src = args[i];
   3035           m->size = t->c->target.ptr_size;
   3036         }
   3037         rv_emit_reg_arg_moves(t, moves, narg);
   3038         rv64_emit32(mc, rv_ecall());
   3039         rv_move(t, dsts[0], native_loc_reg(dsts[0].type, NATIVE_REG_INT, RV_A0));
   3040       }
   3041       return;
   3042     case INTRIN_BSWAP: {
   3043       u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
   3044       switch (width) {
   3045         case 2: {
   3046           u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   3047           /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */
   3048           rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff));
   3049           rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */
   3050           rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8));
   3051           rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2));
   3052           rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8));
   3053           rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff));
   3054           rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3));
   3055           return;
   3056         }
   3057         case 4: {
   3058           u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   3059           /* SRLIW is RV64-only; on rv32 SRLI on a 32-bit reg is equivalent. */
   3060           int w = v->has_w_forms;
   3061           rv64_emit32(mc, w ? rv_srliw(RV_TMP1, rs, 24) : rv_srli(RV_TMP1, rs, 24));
   3062           rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff));
   3063           rv64_emit32(mc, w ? rv_srliw(RV_TMP2, rs, 16) : rv_srli(RV_TMP2, rs, 16));
   3064           rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
   3065           rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8));
   3066           rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
   3067           rv64_emit32(mc, w ? rv_srliw(RV_TMP2, rs, 8) : rv_srli(RV_TMP2, rs, 8));
   3068           rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
   3069           rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16));
   3070           rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
   3071           rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
   3072           rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24));
   3073           rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2));
   3074           /* Canonicalize to a 32-bit value in a 64-bit reg (RV64 only); on rv32
   3075            * the result already occupies the whole register. */
   3076           if (w) {
   3077             rv64_emit32(mc, rv_slli(rd, rd, 32));
   3078             rv64_emit32(mc, rv_srli(rd, rd, 32));
   3079           }
   3080           return;
   3081         }
   3082         case 8: {
   3083           u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   3084           int i;
   3085           rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0));
   3086           for (i = 0; i < 8; ++i) {
   3087             int sh = 56 - 8 * i;
   3088             if (i == 0) {
   3089               rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
   3090             } else {
   3091               rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i)));
   3092               rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
   3093             }
   3094             if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh));
   3095             rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
   3096           }
   3097           rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0));
   3098           return;
   3099         }
   3100         default:
   3101           break;
   3102       }
   3103       return;
   3104     }
   3105     case INTRIN_POPCOUNT: {
   3106       u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   3107       int is64 = rv_is_64(t, args[0].type);
   3108       /* The narrow-in-wide normalization clears the high 32 bits of a 64-bit
   3109        * reg; on rv32 there are none, so it is skipped. */
   3110       int nrm = !is64 && v->xlen == 64u;
   3111       rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0));
   3112       if (nrm) {
   3113         rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
   3114         rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
   3115       }
   3116       rv_emit_popcount(v, mc, rd, is64);
   3117       return;
   3118     }
   3119     case INTRIN_CTZ: {
   3120       /* ctz(x) = popcount((x & -x) - 1) for x != 0. */
   3121       u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   3122       int is64 = rv_is_64(t, args[0].type);
   3123       int nrm = !is64 && v->xlen == 64u;
   3124       rv64_emit32(mc, rv_sub(RV_TMP1, RV_ZERO, rs));
   3125       rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, rs));
   3126       rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, -1));
   3127       if (nrm) {
   3128         rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
   3129         rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
   3130       }
   3131       rv_emit_popcount(v, mc, rd, is64);
   3132       return;
   3133     }
   3134     case INTRIN_CLZ: {
   3135       /* Fold the high bit downward, then clz = popcount(~folded). */
   3136       u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
   3137       int is64 = rv_is_64(t, args[0].type);
   3138       int nrm = !is64 && v->xlen == 64u;
   3139       u32 shifts[6] = {1, 2, 4, 8, 16, 32};
   3140       u32 ns = is64 ? 6u : 5u, i;
   3141       rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0));
   3142       if (nrm) {
   3143         rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
   3144         rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
   3145       }
   3146       for (i = 0; i < ns; ++i) {
   3147         rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, shifts[i]));
   3148         rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
   3149       }
   3150       rv64_emit32(mc, rv_xori(RV_TMP1, RV_TMP1, -1));
   3151       if (nrm) {
   3152         rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
   3153         rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
   3154       }
   3155       rv_emit_popcount(v, mc, rd, is64);
   3156       return;
   3157     }
   3158     case INTRIN_SADD_OVERFLOW:
   3159     case INTRIN_SSUB_OVERFLOW: {
   3160       /* dsts: [val, ovf]. ADD: ovf=((a^r)&(b^r))>>(w-1);
   3161        * SUB: ovf=((a^b)&(a^r))>>(w-1). */
   3162       int is64 = rv_is_64(t, dsts[0].type);
   3163       int w = !is64 && v->has_w_forms; /* narrow op on rv64 -> W-form */
   3164       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   3165       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   3166       u32 sh = is64 ? 63u : 31u;
   3167       if (kind == INTRIN_SADD_OVERFLOW)
   3168         rv64_emit32(mc, w ? rv_addw(RV_TMP2, ra, rb) : rv_add(RV_TMP2, ra, rb));
   3169       else
   3170         rv64_emit32(mc, w ? rv_subw(RV_TMP2, ra, rb) : rv_sub(RV_TMP2, ra, rb));
   3171       rv64_emit32(mc, rv_xor(RV_TMP3, ra, RV_TMP2)); /* a ^ r */
   3172       if (kind == INTRIN_SADD_OVERFLOW) {
   3173         rv64_emit32(mc, rv_xor(rovf, rb, RV_TMP2)); /* b ^ r */
   3174         rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3));
   3175       } else {
   3176         rv64_emit32(mc, rv_xor(rovf, ra, rb)); /* a ^ b */
   3177         rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3));
   3178       }
   3179       rv64_emit32(mc, w ? rv_srliw(rovf, rovf, sh) : rv_srli(rovf, rovf, sh));
   3180       rv64_emit32(mc, rv_andi(rovf, rovf, 1));
   3181       rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
   3182       return;
   3183     }
   3184     case INTRIN_UADD_OVERFLOW:
   3185     case INTRIN_USUB_OVERFLOW: {
   3186       int is64 = rv_is_64(t, dsts[0].type);
   3187       /* `single`: the value fills the whole native register (rv64 i64 or any
   3188        * rv32 value), so the native carry/borrow sequence applies directly; the
   3189        * `!single` branch is the rv64 32-bit-in-64-bit-register implementation
   3190        * (zero-extend + srli-32), reachable only on rv64. */
   3191       int single = is64 || v->xlen == 32u;
   3192       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   3193       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   3194       if (!single) {
   3195         rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32));
   3196         rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32));
   3197         rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32));
   3198         rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32));
   3199         ra = RV_TMP2;
   3200         rb = RV_TMP3;
   3201       }
   3202       if (kind == INTRIN_UADD_OVERFLOW) {
   3203         if (single) {
   3204           rv64_emit32(mc, rv_add(RV_TMP2, ra, rb));
   3205           rv64_emit32(mc, rv_sltu(rovf, RV_TMP2, ra));
   3206         } else {
   3207           rv64_emit32(mc, rv_add(RV_TMP2, ra, rb));
   3208           rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32));
   3209           rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   3210           rv64_emit32(mc, rv_addiw(RV_TMP2, RV_TMP2, 0));
   3211         }
   3212       } else {
   3213         rv64_emit32(mc, rv_sltu(rovf, ra, rb));
   3214         rv64_emit32(mc, single ? rv_sub(RV_TMP2, ra, rb)
   3215                                : rv_subw(RV_TMP2, ra, rb));
   3216       }
   3217       rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
   3218       return;
   3219     }
   3220     case INTRIN_SMUL_OVERFLOW: {
   3221       int is64 = rv_is_64(t, dsts[0].type);
   3222       /* `single`: native-width product overflow via MUL + MULH and a sign-bit
   3223        * compare (shift xlen-1). rv64 i64 and any rv32 value take this path; the
   3224        * `!single` branch is the rv64 32-bit-in-64-bit-register sequence. */
   3225       int single = is64 || v->xlen == 32u;
   3226       u32 sh = is64 ? 63u : 31u;
   3227       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   3228       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   3229       if (single) {
   3230         rv64_emit32(mc, rv_mul(RV_TMP2, ra, rb));
   3231         rv64_emit32(mc, rv_mulh(RV_TMP3, ra, rb));
   3232         rv64_emit32(mc, rv_srai(rovf, RV_TMP2, sh));
   3233         rv64_emit32(mc, rv_xor(rovf, RV_TMP3, rovf));
   3234         rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   3235         rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
   3236       } else {
   3237         rv64_emit32(mc, rv_addiw(RV_TMP2, ra, 0));
   3238         rv64_emit32(mc, rv_addiw(RV_TMP3, rb, 0));
   3239         rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3));
   3240         rv64_emit32(mc, rv_addiw(RV_TMP3, RV_TMP2, 0));
   3241         rv64_emit32(mc, rv_xor(rovf, RV_TMP2, RV_TMP3));
   3242         rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   3243         rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0));
   3244       }
   3245       return;
   3246     }
   3247     case INTRIN_UMUL_OVERFLOW: {
   3248       int is64 = rv_is_64(t, dsts[0].type);
   3249       /* `single`: native-width product, overflow = (high word != 0) via MULHU.
   3250        * rv64 i64 and any rv32 value take this path; `!single` is the rv64
   3251        * 32-bit-in-64-bit-register sequence. */
   3252       int single = is64 || v->xlen == 32u;
   3253       u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
   3254       u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
   3255       if (single) {
   3256         rv64_emit32(mc, rv_mulhu(rovf, ra, rb));
   3257         rv64_emit32(mc, rv_mul(rd, ra, rb));
   3258         rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   3259       } else {
   3260         rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32));
   3261         rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32));
   3262         rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32));
   3263         rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32));
   3264         rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3));
   3265         rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32));
   3266         rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
   3267         rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0));
   3268       }
   3269       return;
   3270     }
   3271     case INTRIN_MEMMOVE: {
   3272       u32 dr, sr, n;
   3273       if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
   3274           args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM)
   3275         rv_panic(a, "unsupported memory intrinsic operands");
   3276       if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
   3277         rv_panic(a, "unsupported memory intrinsic size");
   3278       dr = loc_reg(args[0]);
   3279       sr = loc_reg(args[1]);
   3280       n = (u32)args[2].v.imm;
   3281       rv_intrin_copy(v, mc, dr, sr, n, /*reverse (overlap-safe)=*/1);
   3282       return;
   3283     }
   3284     case INTRIN_CPU_NOP:
   3285       rv64_emit32(mc, rv_nop());
   3286       return;
   3287     case INTRIN_CPU_YIELD:
   3288       rv64_emit32(mc, rv_pause());
   3289       return;
   3290     case INTRIN_ISB:
   3291       rv64_emit32(mc, rv_fence_i());
   3292       return;
   3293     case INTRIN_DMB:
   3294     case INTRIN_DSB:
   3295       rv64_emit32(mc, rv_fence_rw_rw());
   3296       return;
   3297     case INTRIN_WFI:
   3298       rv64_emit32(mc, rv_wfi());
   3299       return;
   3300     case INTRIN_FRAME_ADDRESS:
   3301     case INTRIN_RETURN_ADDRESS:
   3302       /* Walk the s0 frame-record chain. kit's RISC-V prologue anchors s0 at the
   3303        * saved pair: [s0] = caller's s0, [s0 + ptr_bytes] = saved ra (this
   3304        * frame's return address). NOTE: this differs from the psABI's
   3305        * ra@s0-8 / fp@s0-16 layout — kit stores the pair at and above s0. A
   3306        * function that reads its frame is forced off the frameless-leaf tier
   3307        * (see NativeKnownFrameDesc.reads_frame), so s0 is always valid here. The
   3308        * level is constant, so the walk unrolls to `level` dependent loads. */
   3309       if (ndst == 1u) {
   3310         u32 level = (narg >= 1u && args[0].kind == NATIVE_LOC_IMM)
   3311                         ? (u32)args[0].v.imm
   3312                         : 0u;
   3313         u32 rd = loc_reg(dsts[0]);
   3314         rv64_emit32(mc, rv_addi(rd, RV_S0, 0)); /* rd = s0 */
   3315         for (u32 i = 0; i < level; ++i)
   3316           rv64_emit32(mc, rv_ld_ptr(v, rd, rd, 0)); /* rd = *(rd) */
   3317         if (kind == INTRIN_RETURN_ADDRESS)
   3318           rv64_emit32(mc, rv_ld_ptr(v, rd, rd, (i32)v->ptr_bytes));
   3319       }
   3320       return;
   3321     default:
   3322       break;
   3323   }
   3324   rv_panic(a, "unsupported compiler intrinsic");
   3325 }
   3326 /* ============================ inline asm ============================ */
   3327 
/* Report an inline-asm error at source location `loc` through
 * compiler_panic, prefixing `msg` with "rv64 inline asm: ". Does not
 * return. */
_Noreturn static void rv_asm_panic_at(Compiler* c, SrcLoc loc,
                                      const char* msg) {
  compiler_panic(c, loc, "rv64 inline asm: %s", msg);
}
/* Same as rv_asm_panic_at, taking the compiler and source location from the
 * direct target `d`. Does not return. */
_Noreturn static void rv_asm_panic(NativeDirectTarget* d, const char* msg) {
  rv_asm_panic_at(d->base.c, d->loc, msg);
}
   3335 
   3336 /* constraint_body / constraint_early / match_index are shared
   3337  * (cg/native_asm.h). */
   3338 
   3339 /* Build a bound register pseudo-operand in the rv64 inline shape. */
   3340 static void rv_asm_bound_reg(Operand* out, KitCgTypeId type,
   3341                              NativeAllocClass cls, Reg reg) {
   3342   memset(out, 0, sizeof *out);
   3343   out->kind = RV64_INLINE_OPK_REG;
   3344   out->pad[0] =
   3345       (cls == NATIVE_REG_FP) ? RV64_INLINE_OPCLS_FP : RV64_INLINE_OPCLS_INT;
   3346   out->type = type;
   3347   out->v.local = (CGLocal)reg;
   3348 }
   3349 static void rv_asm_bound_mem(Operand* out, KitCgTypeId type, Reg base) {
   3350   memset(out, 0, sizeof *out);
   3351   out->kind = OPK_INDIRECT;
   3352   out->type = type;
   3353   out->v.ind.base = (CGLocal)base;
   3354   out->v.ind.index = CG_LOCAL_NONE;
   3355   out->v.ind.ofs = 0;
   3356 }
   3357 
   3358 /* Parse a clobber register name into (class, reg). Returns 0 for the special
   3359  * "cc"/"memory" clobbers and panics on an unknown register. RV64 dwarf: int
   3360  * x0..x31 = 0..31, fp f0..f31 = 32..63. */
   3361 static int rv_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
   3362                                     NativeAllocClass* cls_out, Reg* reg_out) {
   3363   Slice s = pool_slice(c->global, name);
   3364   char buf[16];
   3365   uint32_t dwarf;
   3366   if (!s.s || !s.len) return 0;
   3367   if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0;
   3368   if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0;
   3369   if (s.len >= sizeof buf) rv_asm_panic_at(c, loc, "clobber name is too long");
   3370   memcpy(buf, s.s, s.len);
   3371   buf[s.len] = '\0';
   3372   if (rv64_register_index(buf, &dwarf) != 0)
   3373     rv_asm_panic_at(c, loc, "unknown clobber register");
   3374   if (dwarf <= 31u) {
   3375     *cls_out = NATIVE_REG_INT;
   3376     *reg_out = (Reg)dwarf;
   3377     return 1;
   3378   }
   3379   if (dwarf >= 32u && dwarf <= 63u) {
   3380     *cls_out = NATIVE_REG_FP;
   3381     *reg_out = (Reg)(dwarf - 32u);
   3382     return 1;
   3383   }
   3384   rv_asm_panic_at(c, loc, "unsupported clobber register");
   3385   return 0;
   3386 }
   3387 
   3388 static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
   3389                                  u32 nclob, u32* int_mask, u32* fp_mask) {
   3390   u32 i;
   3391   *int_mask = 0;
   3392   *fp_mask = 0;
   3393   for (i = 0; i < nclob; ++i) {
   3394     NativeAllocClass cls;
   3395     Reg reg;
   3396     if (!rv_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, &reg)) continue;
   3397     if (cls == NATIVE_REG_INT)
   3398       *int_mask |= 1u << reg;
   3399     else
   3400       *fp_mask |= 1u << reg;
   3401   }
   3402 }
   3403 
   3404 static int rv_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg,
   3405                                        const char* constraint,
   3406                                        NativeAsmRegPin* pin) {
   3407   NativeAsmRegPinStatus st =
   3408       native_asm_resolve_pin(d->native, reg, constraint, pin);
   3409   if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0;
   3410   if (st != NATIVE_ASM_REG_PIN_OK)
   3411     rv_asm_panic(d, native_asm_pin_status_message(st));
   3412   return 1;
   3413 }
   3414 
   3415 /* Pick a free register from the arch's caller-saved allocable pools for an
   3416  * asm operand the direct path must self-allocate. */
   3417 static Reg rv_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls,
   3418                             u32 allowed_mask, u32* used_int, u32* used_fp) {
   3419   /* int: a0..a7 (10..17) then t-temps that aren't emit scratch. */
   3420   static const Reg int_pool[] = {10u, 11u, 12u, 13u, 14u, 15u,
   3421                                  16u, 17u, 29u, 30u, 31u};
   3422   /* fp: fa0..fa7 (10..17) then ft caller-saved. */
   3423   static const Reg fp_pool[] = {10u, 11u, 12u, 13u, 14u, 15u, 16u, 17u,
   3424                                 4u,  5u,  6u,  7u,  28u, 29u, 30u, 31u};
   3425   const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool;
   3426   u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0])
   3427                                : (u32)(sizeof int_pool / sizeof int_pool[0]);
   3428   u32* used = cls == NATIVE_REG_FP ? used_fp : used_int;
   3429   u32 i;
   3430   for (i = 0; i < n; ++i) {
   3431     Reg r = pool[i];
   3432     if (allowed_mask && (allowed_mask & (1u << r)) == 0) continue;
   3433     if ((*used & (1u << r)) != 0) continue;
   3434     *used |= 1u << r;
   3435     return r;
   3436   }
   3437   rv_asm_panic(d, "out of registers for asm operands");
   3438   return REG_NONE;
   3439 }
   3440 
   3441 /* Direct (-O0) path: resolve a semantic Operand to a NativeAddr. */
   3442 static NativeAddr rv_direct_addr(NativeDirectTarget* d, Operand op) {
   3443   NativeAddr addr;
   3444   memset(&addr, 0, sizeof addr);
   3445   switch ((OpKind)op.kind) {
   3446     case OPK_LOCAL:
   3447       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3448       addr.base.frame = d->locals[op.v.local - 1u].home;
   3449       addr.base_type = op.type;
   3450       return addr;
   3451     case OPK_INDIRECT:
   3452       addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
   3453       addr.base.frame = d->locals[op.v.ind.base - 1u].home;
   3454       addr.cls = d->locals[op.v.ind.base - 1u].cls;
   3455       addr.base_type = d->locals[op.v.ind.base - 1u].type;
   3456       addr.offset = op.v.ind.ofs;
   3457       return addr;
   3458     default:
   3459       rv_asm_panic(d, "operand is not addressable");
   3460   }
   3461 }
   3462 
   3463 /* Materialize an OPK_INDIRECT (frame-value) base into a register, returning a
   3464  * plain register-based NativeAddr. */
   3465 static NativeAddr rv_direct_materialize_addr(NativeDirectTarget* d,
   3466                                              Operand op) {
   3467   RvNativeTarget* a = rv_of(d->native);
   3468   NativeAddr addr = rv_direct_addr(d, op);
   3469   if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
   3470     NativeLoc base = native_loc_reg(addr.base_type, NATIVE_REG_INT, RV_TMP1);
   3471     NativeAddr load;
   3472     memset(&load, 0, sizeof load);
   3473     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   3474     load.base.frame = addr.base.frame;
   3475     load.base_type = addr.base_type;
   3476     rv_emit_mem(a, 1, base, load,
   3477                 native_mem_for_type(d->native, addr.base_type, 8));
   3478     addr.base_kind = NATIVE_ADDR_BASE_REG;
   3479     addr.base.reg = RV_TMP1;
   3480   }
   3481   return addr;
   3482 }
   3483 
   3484 static void rv_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op,
   3485                                           NativeLoc dst) {
   3486   RvNativeTarget* a = rv_of(d->native);
   3487   NativeAddr addr;
   3488   memset(&addr, 0, sizeof addr);
   3489   switch ((OpKind)op.kind) {
   3490     case OPK_IMM:
   3491       if ((NativeAllocClass)dst.cls != NATIVE_REG_INT)
   3492         rv_asm_panic(d, "floating-point immediate asm input is unsupported");
   3493       d->native->load_imm(d->native, dst, op.v.imm);
   3494       return;
   3495     case OPK_LOCAL:
   3496       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3497       addr.base.frame = d->locals[op.v.local - 1u].home;
   3498       addr.base_type = op.type;
   3499       rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
   3500       return;
   3501     case OPK_GLOBAL:
   3502       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
   3503       addr.base.global.sym = op.v.global.sym;
   3504       addr.base.global.addend = op.v.global.addend;
   3505       addr.base_type = op.type;
   3506       d->native->load_addr(d->native, dst, addr);
   3507       return;
   3508     case OPK_INDIRECT:
   3509       addr = rv_direct_materialize_addr(d, op);
   3510       rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
   3511       return;
   3512   }
   3513   rv_asm_panic(d, "unsupported asm input operand");
   3514 }
   3515 
/* Direct path: load the ADDRESS of `op` (as described by rv_direct_addr) into
 * `dst` through the target's load_addr hook. Operands rv_direct_addr cannot
 * describe make it panic. */
static void rv_direct_load_address_to_reg(NativeDirectTarget* d, Operand op,
                                          NativeLoc dst) {
  d->native->load_addr(d->native, dst, rv_direct_addr(d, op));
}
   3520 
   3521 static void rv_direct_store_reg_to_operand(NativeDirectTarget* d, Operand op,
   3522                                            NativeLoc src) {
   3523   RvNativeTarget* a = rv_of(d->native);
   3524   NativeAddr addr;
   3525   memset(&addr, 0, sizeof addr);
   3526   if (op.kind == OPK_LOCAL) {
   3527     addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3528     addr.base.frame = d->locals[op.v.local - 1u].home;
   3529     addr.base_type = op.type;
   3530   } else {
   3531     addr = rv_direct_materialize_addr(d, op);
   3532   }
   3533   rv_emit_mem(a, 0, src, addr, native_mem_for_type(d->native, op.type, 0));
   3534 }
   3535 
/* Callee-saved registers an asm block clobbers must be spilled/restored around
 * the block (the only ABI duty the allocator cannot discharge itself).
 * One record per saved register; filled by rv_asm_save_callee_clobbers and
 * rv_asm_save_one, read back by rv_asm_restore_one. */
typedef struct RvAsmSavedClobber {
  NativeFrameSlot slot; /* frame slot holding the saved value (set on save) */
  NativeAllocClass cls; /* NATIVE_REG_INT or NATIVE_REG_FP */
  Reg reg;              /* register number within its class */
  KitCgTypeId type;     /* type used for the save/restore access */
} RvAsmSavedClobber;
   3544 
   3545 /* A clobber save slot is register-width: ptr_bytes for an integer reg (4 on
   3546  * rv32, 8 on rv64) but always 8 for an FP reg (fsd, even on rv32d). */
   3547 static u32 rv_asm_save_bytes(const RvNativeTarget* a, const RvAsmSavedClobber* s) {
   3548   return s->cls == NATIVE_REG_FP ? 8u : a->variant->ptr_bytes;
   3549 }
   3550 static void rv_asm_save_one(RvNativeTarget* a, RvAsmSavedClobber* s) {
   3551   NativeFrameSlotDesc desc;
   3552   NativeAddr addr;
   3553   u32 sz = rv_asm_save_bytes(a, s);
   3554   memset(&desc, 0, sizeof desc);
   3555   desc.type = s->type;
   3556   desc.size = sz;
   3557   desc.align = sz;
   3558   desc.kind = NATIVE_FRAME_SLOT_SAVE;
   3559   s->slot = a->base.frame_slot(&a->base, &desc);
   3560   memset(&addr, 0, sizeof addr);
   3561   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3562   addr.base.frame = s->slot;
   3563   addr.base_type = s->type;
   3564   rv_emit_mem(a, 0, native_loc_reg(s->type, s->cls, s->reg), addr,
   3565               native_mem_for_type(&a->base, s->type, sz));
   3566 }
   3567 static void rv_asm_restore_one(RvNativeTarget* a, const RvAsmSavedClobber* s) {
   3568   NativeAddr addr;
   3569   u32 sz = rv_asm_save_bytes(a, s);
   3570   memset(&addr, 0, sizeof addr);
   3571   addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3572   addr.base.frame = s->slot;
   3573   addr.base_type = s->type;
   3574   rv_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr,
   3575               native_mem_for_type(&a->base, s->type, sz));
   3576 }
   3577 
   3578 /* psABI callee-saved: integer s0..s11 (x8,x9,x18..x27), fp fs0..fs11
   3579  * (f8,f9,f18..f27). x8 is the frame pointer and never asm-clobbered. */
   3580 static int rv_reg_is_callee_int(Reg r) {
   3581   return r == 9u || (r >= 18u && r <= 27u);
   3582 }
   3583 static int rv_reg_is_callee_fp(Reg r) {
   3584   return r == 8u || r == 9u || (r >= 18u && r <= 27u);
   3585 }
   3586 
   3587 static RvAsmSavedClobber* rv_asm_save_callee_clobbers(RvNativeTarget* a,
   3588                                                       u32 int_mask, u32 fp_mask,
   3589                                                       u32* nsaved_out) {
   3590   RvAsmSavedClobber* saved =
   3591       arena_zarray(a->base.c->tu, RvAsmSavedClobber, 24u);
   3592   KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
   3593   KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
   3594   u32 n = 0;
   3595   Reg r;
   3596   for (r = 0; r <= 31u; ++r) {
   3597     if ((int_mask & (1u << r)) == 0 || !rv_reg_is_callee_int(r)) continue;
   3598     saved[n].cls = NATIVE_REG_INT;
   3599     saved[n].reg = r;
   3600     saved[n].type = i64;
   3601     rv_asm_save_one(a, &saved[n++]);
   3602   }
   3603   for (r = 0; r <= 31u; ++r) {
   3604     if ((fp_mask & (1u << r)) == 0 || !rv_reg_is_callee_fp(r)) continue;
   3605     saved[n].cls = NATIVE_REG_FP;
   3606     saved[n].reg = r;
   3607     saved[n].type = f64;
   3608     rv_asm_save_one(a, &saved[n++]);
   3609   }
   3610   *nsaved_out = n;
   3611   return saved;
   3612 }
   3613 
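/* ---- NativeTarget (optimizer) asm hook ----
 * The optimizer pre-allocated every operand register and arranged surrounding
 * data flow, so this binds pre-allocated registers to the template, stages
 * register-constraint operands that did not arrive in a register through the
 * reserved scratch registers, and materializes memory-operand bases into the
 * reserved scratch. Callee-saved registers the asm clobbers are not spilled
 * per block: they are folded into the function's saved set, so the
 * prologue/epilogue preserve them. */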
   3619 
   3620 static NativeAddr rv_asm_loc_to_addr(RvNativeTarget* a, SrcLoc loc,
   3621                                      NativeLoc src) {
   3622   NativeAddr addr;
   3623   memset(&addr, 0, sizeof addr);
   3624   addr.base_type = src.type;
   3625   switch ((NativeLocKind)src.kind) {
   3626     case NATIVE_LOC_FRAME:
   3627       addr.base_kind = NATIVE_ADDR_BASE_FRAME;
   3628       addr.base.frame = src.v.frame;
   3629       return addr;
   3630     case NATIVE_LOC_ADDR:
   3631       return src.v.addr;
   3632     case NATIVE_LOC_GLOBAL:
   3633       addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
   3634       addr.base.global.sym = src.v.global.sym;
   3635       addr.base.global.addend = src.v.global.addend;
   3636       return addr;
   3637     case NATIVE_LOC_REG:
   3638       addr.base_kind = NATIVE_ADDR_BASE_REG;
   3639       addr.cls = NATIVE_REG_INT;
   3640       addr.base.reg = src.v.reg;
   3641       return addr;
   3642     default:
   3643       rv_asm_panic_at(a->base.c, loc, "unsupported memory asm operand");
   3644   }
   3645 }
   3646 
   3647 /* Resolve a memory-constraint operand to a single base register with zero
   3648  * offset, folding any frame/global/offset into a reserved scratch register. */
   3649 static Reg rv_asm_native_mem_base(RvNativeTarget* a, SrcLoc loc, NativeLoc src,
   3650                                   u32* ntmp) {
   3651   NativeAddr addr = rv_asm_loc_to_addr(a, loc, src);
   3652   u32 base;
   3653   i32 off;
   3654   Reg dst;
   3655   if (addr.index_kind != NATIVE_ADDR_INDEX_NONE)
   3656     rv_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported");
   3657   rv_resolve_mem_addr(a, &addr, &base, &off);
   3658   if (off == 0 && base != RV_TMP0 && base != RV_TMP1) return (Reg)base;
   3659   if (*ntmp >= 2u)
   3660     rv_asm_panic_at(a->base.c, loc, "too many memory asm operands");
   3661   dst = (*ntmp == 0u) ? RV_TMP0 : RV_TMP1;
   3662   (*ntmp)++;
   3663   rv_emit_addr_adjust(a->variant, a->base.mc, dst, base, off);
   3664   return dst;
   3665 }
   3666 
   3667 static u32 rv_asm_reg_mem_size(RvNativeTarget* a, NativeAllocClass cls,
   3668                                KitCgTypeId type) {
   3669   u32 sz = native_type_size(&a->base, type);
   3670   if (cls == NATIVE_REG_INT && sz > a->variant->ptr_bytes)
   3671     sz = a->variant->ptr_bytes;
   3672   return sz;
   3673 }
   3674 
   3675 static Reg rv_asm_stage_reg(RvNativeTarget* a, SrcLoc loc, NativeAllocClass cls,
   3676                             u32* nint, u32* nfp) {
   3677   static const Reg int_regs[] = {RV_TMP2, RV_TMP3};
   3678   static const Reg fp_regs[] = {RV_FTMP0, RV_FTMP1};
   3679   if (cls == NATIVE_REG_FP) {
   3680     if (*nfp >= (u32)(sizeof fp_regs / sizeof fp_regs[0]))
   3681       rv_asm_panic_at(a->base.c, loc, "too many staged fp asm operands");
   3682     return fp_regs[(*nfp)++];
   3683   }
   3684   if (*nint >= (u32)(sizeof int_regs / sizeof int_regs[0]))
   3685     rv_asm_panic_at(a->base.c, loc, "too many staged integer asm operands");
   3686   return int_regs[(*nint)++];
   3687 }
   3688 
   3689 static void rv_asm_load_loc_to_reg(RvNativeTarget* a, SrcLoc loc, NativeLoc src,
   3690                                    NativeLoc dst) {
   3691   NativeTarget* t = &a->base;
   3692   NativeAllocClass cls = (NativeAllocClass)dst.cls;
   3693   if (src.kind == NATIVE_LOC_REG) {
   3694     if (src.v.reg != dst.v.reg || src.cls != dst.cls) t->move(t, dst, src);
   3695     return;
   3696   }
   3697   if (src.kind == NATIVE_LOC_IMM) {
   3698     if (cls != NATIVE_REG_INT)
   3699       rv_asm_panic_at(t->c, loc,
   3700                       "floating-point immediate asm input is unsupported");
   3701     t->load_imm(t, dst, src.v.imm);
   3702     return;
   3703   }
   3704   rv_emit_mem(a, 1, dst, rv_asm_loc_to_addr(a, loc, src),
   3705               native_mem_for_type(t, dst.type,
   3706                                   rv_asm_reg_mem_size(a, cls, dst.type)));
   3707 }
   3708 
   3709 static void rv_asm_store_reg_to_loc(RvNativeTarget* a, SrcLoc loc, NativeLoc dst,
   3710                                     NativeLoc src) {
   3711   NativeTarget* t = &a->base;
   3712   NativeAllocClass cls = (NativeAllocClass)src.cls;
   3713   if (dst.kind == NATIVE_LOC_REG) {
   3714     if (dst.v.reg != src.v.reg || dst.cls != src.cls) t->move(t, dst, src);
   3715     return;
   3716   }
   3717   rv_emit_mem(a, 0, src, rv_asm_loc_to_addr(a, loc, dst),
   3718               native_mem_for_type(t, src.type,
   3719                                   rv_asm_reg_mem_size(a, cls, src.type)));
   3720 }
   3721 
   3722 static void rv_asm_bind_native(RvNativeTarget* a, SrcLoc loc, Operand* out,
   3723                                const char* constraint, KitCgTypeId type,
   3724                                NativeLoc src, u32* ntmp) {
   3725   const char* body = native_asm_constraint_body(constraint);
   3726   NativeAsmConstraintInfo info;
   3727   if (native_asm_constraint_reg_info(&a->base, constraint, &info)) {
   3728     if (src.kind != NATIVE_LOC_REG)
   3729       rv_asm_panic_at(a->base.c, loc, "register asm operand not in a register");
   3730     if (info.fixed_reg != REG_NONE && info.fixed_reg != (Reg)src.v.reg)
   3731       rv_asm_panic_at(a->base.c, loc,
   3732                       "fixed-register asm operand in wrong register");
   3733     if (info.allowed_mask &&
   3734         ((Reg)src.v.reg >= 32 ||
   3735          (info.allowed_mask & (1u << (Reg)src.v.reg)) == 0))
   3736       rv_asm_panic_at(a->base.c, loc,
   3737                       "register asm operand violates constraint register set");
   3738     rv_asm_bound_reg(out, type, info.cls, (Reg)src.v.reg);
   3739   } else if (body[0] == 'i') {
   3740     if (src.kind != NATIVE_LOC_IMM)
   3741       rv_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate");
   3742     memset(out, 0, sizeof *out);
   3743     out->kind = OPK_IMM;
   3744     out->type = type;
   3745     out->v.imm = src.v.imm;
   3746   } else if (body[0] == 'm') {
   3747     rv_asm_bound_mem(out, type, rv_asm_native_mem_base(a, loc, src, ntmp));
   3748   } else {
   3749     rv_asm_panic_at(a->base.c, loc, "unsupported asm constraint");
   3750   }
   3751 }
   3752 
/* NativeTarget asm_block hook (optimizer path): emit one inline-asm block.
 *
 *   tmpl              asm template text, run through the rv64 assembler
 *   outs/out_locs     nout output constraints and where each result lives
 *   ins/in_locs       nin input constraints and where each value lives
 *   clobbers/nclob    clobber list, forwarded to rv64_inline_bind
 *
 * Three phases: (1) bind outputs, (2) bind inputs, (3) run the template and
 * store staged outputs back. Diagnostics are reported at the enclosing
 * function's location (a->func->loc), or a zero SrcLoc when no function is
 * recorded. */
static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
                                const AsmConstraint* outs, u32 nout,
                                NativeLoc* out_locs, const AsmConstraint* ins,
                                u32 nin, const NativeLoc* in_locs,
                                const Sym* clobbers, u32 nclob) {
  RvNativeTarget* a = rv_of(t);
  Compiler* c = t->c;
  SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
  Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
  Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
  /* staged_outs[i] != 0: output i was bound to a temporary register and must
   * be copied to out_locs[i] after the template runs. */
  u8* staged_outs = nout ? arena_zarray(c->tu, u8, nout) : NULL;
  /* ntmp counts memory-base scratches; nstage_* count staging registers. */
  u32 ntmp = 0, nstage_int = 0, nstage_fp = 0, i;
  Rv64Asm* asmh;

  /* Phase 1: outputs. */
  for (i = 0; i < nout; ++i) {
    /* Prefer the constraint's declared type; fall back to the location's. */
    KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
    NativeLoc outloc = out_locs[i];
    NativeAsmConstraintInfo info;
    NativeAsmPinnedLoc pinned =
        native_asm_prepare_pinned_loc(t, outs[i].reg, outs[i].str, type, outloc);
    if (pinned.has_pin) {
      /* Hard-register operand: bind to the pinned location. */
      if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
        rv_asm_panic_at(c, loc,
                        native_asm_pin_status_message(pinned.pin_status));
      if (pinned.wrong_reg)
        rv_asm_panic_at(c, loc, "hard-register asm operand in wrong register");
      outloc = pinned.loc;
      if (pinned.needs_stage) {
        staged_outs[i] = 1u;
        /* An in/out operand also needs its current value loaded first. */
        if (outs[i].dir == KIT_CG_ASM_INOUT)
          rv_asm_load_loc_to_reg(a, loc, out_locs[i], outloc);
      }
    } else if (native_asm_constraint_reg_info(t, outs[i].str, &info) &&
               info.allowed_mask == 0 && outloc.kind != NATIVE_LOC_REG) {
      /* Register constraint with no allowed-register mask, but the operand
       * did not arrive in a register: stage it through a scratch. */
      Reg r = rv_asm_stage_reg(a, loc, info.cls, &nstage_int, &nstage_fp);
      outloc = native_loc_reg(type, info.cls, r);
      staged_outs[i] = 1u;
      if (outs[i].dir == KIT_CG_ASM_INOUT)
        rv_asm_load_loc_to_reg(a, loc, out_locs[i], outloc);
    }
    rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, outloc, &ntmp);
  }
  /* Phase 2: inputs. */
  for (i = 0; i < nin; ++i) {
    const char* body = native_asm_constraint_body(ins[i].str);
    int matched = native_asm_match_index(body);
    KitCgTypeId type;
    NativeLoc inloc;
    if (matched >= 0) {
      /* Matching constraint: reuse the bound operand of the output it names;
       * no load is emitted here. */
      if ((u32)matched >= nout)
        rv_asm_panic_at(c, loc, "matching constraint out of range");
      bound_ins[i] = bound_outs[matched];
      continue;
    }
    type = ins[i].type ? ins[i].type : in_locs[i].type;
    inloc = in_locs[i];
    {
      NativeAsmConstraintInfo info;
      NativeAsmPinnedLoc pinned =
          native_asm_prepare_pinned_loc(t, ins[i].reg, ins[i].str, type, inloc);
      if (pinned.has_pin) {
        if (pinned.pin_status != NATIVE_ASM_REG_PIN_OK)
          rv_asm_panic_at(c, loc,
                          native_asm_pin_status_message(pinned.pin_status));
        if (pinned.wrong_reg)
          rv_asm_panic_at(c, loc,
                          "hard-register asm operand in wrong register");
        inloc = pinned.loc;
        /* Inputs are always loaded when staged (unlike outputs, which load
         * only for in/out operands). */
        if (pinned.needs_stage)
          rv_asm_load_loc_to_reg(a, loc, in_locs[i], inloc);
      } else if (native_asm_constraint_reg_info(t, ins[i].str, &info) &&
                 info.allowed_mask == 0 && inloc.kind != NATIVE_LOC_REG) {
        Reg r = rv_asm_stage_reg(a, loc, info.cls, &nstage_int, &nstage_fp);
        inloc = native_loc_reg(type, info.cls, r);
        rv_asm_load_loc_to_reg(a, loc, in_locs[i], inloc);
      }
    }
    rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
  }

  /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
   * masks and rv_known_callee_saves folded the callee-saved ones into the
   * function's saved set, so the prologue/epilogue already preserve them. */
  asmh = rv64_asm_open(c);
  rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
                   nclob);
  rv64_asm_run_template(asmh, t->mc, tmpl);
  rv64_asm_close(asmh);

  /* Phase 3: copy staged register outputs back to their real locations.
   * Outputs bound as anything other than a register are skipped. */
  for (i = 0; i < nout; ++i) {
    NativeAllocClass cls;
    NativeLoc src;
    if (!staged_outs || !staged_outs[i]) continue;
    if (bound_outs[i].kind != RV64_INLINE_OPK_REG) continue;
    /* Recover class and register from the bound operand (see
     * rv_asm_bound_reg: class tag in pad[0], register in v.local). */
    cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP ? NATIVE_REG_FP
                                                       : NATIVE_REG_INT;
    src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
    rv_asm_store_reg_to_loc(a, loc, out_locs[i], src);
  }
}
   3852 /* file_scope_asm + finalize are shared (cg/native_asm.h). */
   3853 
/* NativeTarget trap hook: emit a single 32-bit ebreak instruction. */
static void rv_trap(NativeTarget* t) { rv64_emit32(t->mc, rv_ebreak()); }
/* NativeTarget set_loc hook: remember `loc` on the rv target and forward it
 * to the machine-code emitter when that emitter provides a set_loc hook. */
static void rv_set_loc(NativeTarget* t, SrcLoc loc) {
  rv_of(t)->loc = loc;
  if (t->mc->set_loc) t->mc->set_loc(t->mc, loc);
}
   3859 
   3860 /* ============================ construction ============================ */
   3861 
   3862 NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj,
   3863                                      MCEmitter* mc) {
   3864   RvNativeTarget* a = arena_znew(c->tu, RvNativeTarget);
   3865   NativeTarget* t;
   3866   if (!a) return NULL;
   3867   t = &a->base;
   3868   t->c = c;
   3869   t->obj = obj;
   3870   t->mc = mc;
   3871   a->variant = riscv_variant_for_kind(c->target.arch);
   3872   native_frame_init(&a->frame, c);
   3873   t->regs = &rv_reg_info;
   3874   t->class_for_type = native_class_for_type_fp_le8;
   3875   t->imm_legal = rv_imm_legal;
   3876   t->addr_legal = rv_addr_legal;
   3877   t->func_begin = rv_func_begin;
   3878   t->func_begin_known_frame = rv_func_begin_known_frame;
   3879   t->note_frame_state = NULL;
   3880   /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
   3881    * set; rv_func_begin_known_frame derives the records from the masks. */
   3882   t->reserve_callee_saves = rv_reserve_callee_saves;
   3883   t->signature_stack_bytes = rv_signature_stack_bytes;
   3884   t->call_stack_bytes = rv_call_stack_bytes;
   3885   t->has_store_zero_reg = 1;
   3886   t->store_zero_reg = RV_ZERO;
   3887   t->func_end = rv_func_end;
   3888   t->frame_slot = rv_frame_slot;
   3889   t->frame_slot_debug_loc = rv_frame_slot_debug_loc;
   3890   t->bind_param = rv_bind_native_param;
   3891   t->label_new = rv_label_new;
   3892   t->label_place = rv_label_place;
   3893   t->jump = rv_jump;
   3894   t->cmp_branch = rv_cmp_branch;
   3895   t->indirect_branch = rv_indirect_branch;
   3896   t->load_label_addr = rv_load_label_addr;
   3897   t->move = rv_move;
   3898   t->load_imm = rv_load_imm;
   3899   t->load_const = rv_load_const;
   3900   t->load_addr = rv_load_addr;
   3901   t->load = rv_load;
   3902   t->store = rv_store;
   3903   t->tls_addr_of = rv_tls_addr_of;
   3904   t->copy_bytes = rv_copy_bytes;
   3905   t->set_bytes = rv_set_bytes;
   3906   t->bitfield_load = rv_bitfield_load;
   3907   t->bitfield_store = rv_bitfield_store;
   3908   t->binop = rv_binop;
   3909   t->unop = rv_unop;
   3910   t->cmp = rv_cmp;
   3911   t->convert = rv_convert;
   3912   t->alloca_ = rv_alloca;
   3913   t->spill = rv_spill;
   3914   t->reload = rv_reload;
   3915   t->plan_call = rv_plan_call;
   3916   t->emit_call = rv_emit_call;
   3917   t->plan_ret = rv_plan_ret;
   3918   t->ret = rv_ret;
   3919   t->atomic_load = rv_atomic_load;
   3920   t->atomic_store = rv_atomic_store;
   3921   t->atomic_rmw = rv_atomic_rmw;
   3922   t->atomic_cas = rv_atomic_cas;
   3923   t->fence = rv_fence;
   3924   t->va_start_ = rv_va_start_native;
   3925   t->va_arg_ = rv_va_arg_native;
   3926   t->va_end_ = rv_va_end_native;
   3927   t->va_copy_ = rv_va_copy_native;
   3928   t->intrinsic = rv_intrinsic;
   3929   t->asm_block = rv_asm_block_native;
   3930   t->file_scope_asm = native_file_scope_asm;
   3931   t->trap = rv_trap;
   3932   t->set_loc = rv_set_loc;
   3933   t->finalize = native_finalize;
   3934   return t;
   3935 }
   3936 
   3937 /* ============================ NativeOps (-O0) ============================ */
   3938 
   3939 static void rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
   3940                           CGLocal local, NativeDirectLocal* l) {
   3941   NativeLoc dst;
   3942   (void)local;
   3943   memset(&dst, 0, sizeof dst);
   3944   dst.kind = NATIVE_LOC_FRAME;
   3945   dst.type = p->type;
   3946   dst.v.frame = l->home;
   3947   rv_bind_native_param(d->native, p, dst);
   3948 }
   3949 
   3950 /* A sibling call is realizable when its outgoing stack-argument area fits the
   3951  * window the caller itself received (so the args land in the caller's incoming
   3952  * slots without overflowing into the caller's caller's frame). Register-only
   3953  * calls (the common case) always qualify. Mirrors aa64's aa_no_tail. */
   3954 static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
   3955   RvNativeTarget* a = rv_of(d->native);
   3956   NativeCallDesc nd;
   3957   NativeLoc* args = NULL;
   3958   NativeLoc* results = NULL;
   3959   u32 i, stack;
   3960   if (a->frame.ncallee_saves)
   3961     return "rv64 tail call: callee-saved registers in use";
   3962   memset(&nd, 0, sizeof nd);
   3963   u32 nresults = call->result != CG_LOCAL_NONE ? 1u : 0u;
   3964   if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
   3965   if (nresults) results = arena_zarray(d->base.c->tu, NativeLoc, nresults);
   3966   for (i = 0; i < call->nargs; ++i) {
   3967     args[i].kind = NATIVE_LOC_FRAME;
   3968     args[i].type = d->locals[call->args[i] - 1u].type;
   3969     args[i].cls = d->locals[call->args[i] - 1u].cls;
   3970     args[i].v.frame = d->locals[call->args[i] - 1u].home;
   3971   }
   3972   if (nresults) {
   3973     results[0].kind = NATIVE_LOC_FRAME;
   3974     results[0].type = d->locals[call->result - 1u].type;
   3975     results[0].cls = d->locals[call->result - 1u].cls;
   3976     results[0].v.frame = d->locals[call->result - 1u].home;
   3977   }
   3978   nd.fn_type = call->fn_type;
   3979   nd.args = args;
   3980   nd.results = results;
   3981   nd.nargs = call->nargs;
   3982   nd.nresults = nresults;
   3983   stack = rv_call_stack_size(d->native, &nd);
   3984   if (stack > a->incoming_stack_size)
   3985     return "rv64 tail call: stack argument area too small";
   3986   return NULL;
   3987 }
   3988 
   3989 /* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`
   3990  * and return a register-based NativeAddr. An OPK_LOCAL holds the va_list object
   3991  * itself, so we take its frame address; an OPK_INDIRECT holds the pointer in
   3992  * memory and must be loaded. The va cores use TMP1/TMP2 internally, so `reg`
   3993  * must be distinct from those (callers pass TMP0 / TMP3). */
   3994 /* ap_addr is the pointer value &ap (the va_list object's address). For an
   3995  * OPK_LOCAL the local HOLDS that pointer, so load its home value; an
   3996  * OPK_INDIRECT names *(base+ofs), whose address base+ofs is the pointer.
   3997  * Mirrors aa64's aa_direct_pointer_addr. */
   3998 static NativeAddr rv_direct_pointer_addr(NativeDirectTarget* d, Operand op) {
   3999   RvNativeTarget* a = rv_of(d->native);
   4000   NativeAddr addr;
   4001   memset(&addr, 0, sizeof addr);
   4002   if (op.kind == OPK_LOCAL) {
   4003     NativeLoc base = native_loc_reg(op.type, NATIVE_REG_INT, RV_TMP1);
   4004     NativeAddr load;
   4005     memset(&load, 0, sizeof load);
   4006     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   4007     load.base.frame = d->locals[op.v.local - 1u].home;
   4008     load.base_type = op.type;
   4009     rv_emit_mem(a, 1, base, load, native_mem_for_type(d->native, op.type, 8));
   4010     addr.base_kind = NATIVE_ADDR_BASE_REG;
   4011     addr.base.reg = RV_TMP1;
   4012     addr.base_type = op.type;
   4013     return addr;
   4014   }
   4015   return rv_direct_materialize_addr(d, op);
   4016 }
   4017 
   4018 static NativeAddr rv_direct_va_base(NativeDirectTarget* d, Operand ap_addr,
   4019                                     Reg reg) {
   4020   NativeLoc dst =
   4021       native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg);
   4022   NativeAddr addr;
   4023   d->native->load_addr(d->native, dst, rv_direct_pointer_addr(d, ap_addr));
   4024   memset(&addr, 0, sizeof addr);
   4025   addr.base_kind = NATIVE_ADDR_BASE_REG;
   4026   addr.cls = NATIVE_REG_INT;
   4027   addr.base.reg = reg;
   4028   addr.base_type = builtin_id(KIT_CG_BUILTIN_I64);
   4029   return addr;
   4030 }
   4031 
/* Direct (-O0) va_start: resolve the va_list address into RV_TMP3 and hand it
 * to the shared rv_va_start_core. */
static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) {
  rv_va_start_core(rv_of(d->native), rv_direct_va_base(d, ap_addr, RV_TMP3));
}
   4035 static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr,
   4036                        KitCgTypeId type) {
   4037   RvNativeTarget* a = rv_of(d->native);
   4038   NativeAllocClass cls;
   4039   /* A value too wide for one GPR (8-byte i64 / soft-double on ilp32) is copied
   4040    * straight from the save area into its destination memory. */
   4041   if (rv_va_arg_is_wide(d->native, type)) {
   4042     rv_va_arg_wide(a, rv_direct_addr(d, dst),
   4043                    rv_direct_va_base(d, ap_addr, RV_TMP3),
   4044                    native_type_size(d->native, type));
   4045     return;
   4046   }
   4047   /* Float-ABI-aware class: a soft (or wider-than-flen) float is INT-class so
   4048    * the va_arg fetch never lands a double in an FP register on rv32. */
   4049   cls = native_class_for_type_fp_le8(d->native, type);
   4050   NativeLoc res = native_loc_reg(type, cls,
   4051                              cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0);
   4052   NativeAddr dst_addr;
   4053   rv_va_arg_core(a, res, rv_direct_va_base(d, ap_addr, RV_TMP3), type);
   4054   /* Store the fetched value back into the semantic destination. */
   4055   dst_addr = rv_direct_addr(d, dst);
   4056   if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
   4057     NativeLoc base =
   4058         native_loc_reg(dst_addr.base_type, NATIVE_REG_INT, RV_TMP1);
   4059     NativeAddr load;
   4060     memset(&load, 0, sizeof load);
   4061     load.base_kind = NATIVE_ADDR_BASE_FRAME;
   4062     load.base.frame = dst_addr.base.frame;
   4063     load.base_type = dst_addr.base_type;
   4064     rv_emit_mem(a, 1, base, load,
   4065                 native_mem_for_type(d->native, dst_addr.base_type, 8));
   4066     dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
   4067     dst_addr.base.reg = RV_TMP1;
   4068   }
   4069   rv_emit_mem(
   4070       a, 0, res, dst_addr,
   4071       native_mem_for_type(d->native, type, native_type_size(d->native, type)));
   4072 }
   4073 static void rv_va_end_(NativeDirectTarget* d, Operand ap_addr) {
   4074   (void)d;
   4075   (void)ap_addr;
   4076 }
   4077 static void rv_va_copy_(NativeDirectTarget* d, Operand dst, Operand src) {
   4078   RvNativeTarget* a = rv_of(d->native);
   4079   NativeAddr src_ap = rv_direct_va_base(d, src, RV_TMP0);
   4080   NativeAddr dst_ap = rv_direct_va_base(d, dst, RV_TMP3);
   4081   rv_va_copy_core(a, dst_ap, src_ap);
   4082 }
   4083 
   4084 static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
   4085                                 const AsmConstraint* outs, u32 nout,
   4086                                 Operand* out_ops, const AsmConstraint* ins,
   4087                                 u32 nin, const Operand* in_ops,
   4088                                 const Sym* clobbers, u32 nclob,
   4089                                 u32 clobber_abi_sets) {
   4090   RvNativeTarget* a = rv_of(d->native);
   4091   Compiler* c = d->base.c;
   4092   Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
   4093   Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
   4094   u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
   4095   RvAsmSavedClobber* saved;
   4096   u32 nsaved, i;
   4097   Rv64Asm* asmh;
   4098 
   4099   rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
   4100   native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp);
   4101   clob_int |= abi_int;
   4102   clob_fp |= abi_fp;
   4103   /* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer
   4104    * so the operand allocator never hands them out. */
   4105   used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) |
   4106              (1u << RV_GP) | (1u << RV_TP) | (1u << RV_TMP0) | (1u << RV_TMP1) |
   4107              (1u << RV_TMP2) | (1u << RV_TMP3) | (1u << RV_S0);
   4108   used_fp =
   4109       clob_fp | (1u << RV_FTMP0) | (1u << RV_FTMP1) | (1u << 2u) | (1u << 3u);
   4110 
   4111   for (i = 0; i < nout; ++i) {
   4112     const char* body = native_asm_constraint_body(outs[i].str);
   4113     KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
   4114     NativeAsmRegPin pin;
   4115     if (rv_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) {
   4116       /* GNU local register variable: pin to the named hard register. */
   4117       if (pin.cls == NATIVE_REG_FP) {
   4118         used_fp |= 1u << pin.reg;
   4119         clob_fp |= 1u << pin.reg;
   4120       } else {
   4121         used_int |= 1u << pin.reg;
   4122         clob_int |= 1u << pin.reg;
   4123       }
   4124       rv_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg);
   4125     } else {
   4126       NativeAsmConstraintInfo info;
   4127       if (native_asm_constraint_reg_info(d->native, outs[i].str, &info)) {
   4128         Reg reg = info.fixed_reg != REG_NONE
   4129                       ? info.fixed_reg
   4130                       : rv_asm_alloc_reg(d, info.cls, info.allowed_mask,
   4131                                          &used_int, &used_fp);
   4132         if (info.cls == NATIVE_REG_FP) {
   4133           used_fp |= 1u << reg;
   4134           if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg;
   4135         } else {
   4136           used_int |= 1u << reg;
   4137           if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg;
   4138         }
   4139         rv_asm_bound_reg(&bound_outs[i], type, info.cls, reg);
   4140       } else if (body[0] == 'm') {
   4141         Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp);
   4142         rv_asm_bound_mem(&bound_outs[i], type, reg);
   4143       } else {
   4144         rv_asm_panic(d, "unsupported output constraint");
   4145       }
   4146     }
   4147   }
   4148 
   4149   for (i = 0; i < nin; ++i) {
   4150     const char* body = native_asm_constraint_body(ins[i].str);
   4151     int matched = native_asm_match_index(body);
   4152     KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
   4153     if (matched >= 0) {
   4154       if ((u32)matched >= nout)
   4155         rv_asm_panic(d, "matching constraint out of range");
   4156       if (native_asm_constraint_early(outs[matched].str))
   4157         rv_asm_panic(d, "matching input names early-clobber output");
   4158       if (bound_outs[matched].kind != RV64_INLINE_OPK_REG)
   4159         rv_asm_panic(d, "matching constraint requires register output");
   4160       bound_ins[i] = bound_outs[matched];
   4161       continue;
   4162     }
   4163     NativeAsmRegPin pin;
   4164     if (rv_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) {
   4165       /* GNU local register variable: pin to the named hard register. */
   4166       if (pin.cls == NATIVE_REG_FP) {
   4167         used_fp |= 1u << pin.reg;
   4168         clob_fp |= 1u << pin.reg;
   4169       } else {
   4170         used_int |= 1u << pin.reg;
   4171         clob_int |= 1u << pin.reg;
   4172       }
   4173       rv_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg);
   4174     } else {
   4175       NativeAsmConstraintInfo info;
   4176       if (native_asm_constraint_reg_info(d->native, ins[i].str, &info)) {
   4177         Reg reg = info.fixed_reg != REG_NONE
   4178                       ? info.fixed_reg
   4179                       : rv_asm_alloc_reg(d, info.cls, info.allowed_mask,
   4180                                          &used_int, &used_fp);
   4181         if (info.cls == NATIVE_REG_FP) {
   4182           used_fp |= 1u << reg;
   4183           if (info.fixed_reg != REG_NONE) clob_fp |= 1u << reg;
   4184         } else {
   4185           used_int |= 1u << reg;
   4186           if (info.fixed_reg != REG_NONE) clob_int |= 1u << reg;
   4187         }
   4188         rv_asm_bound_reg(&bound_ins[i], type, info.cls, reg);
   4189       } else if (body[0] == 'i') {
   4190         if (in_ops[i].kind != OPK_IMM)
   4191           rv_asm_panic(d, "immediate constraint requires immediate operand");
   4192         bound_ins[i] = in_ops[i];
   4193       } else if (body[0] == 'm') {
   4194         Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, 0, &used_int, &used_fp);
   4195         rv_asm_bound_mem(&bound_ins[i], type, reg);
   4196       } else {
   4197         rv_asm_panic(d, "unsupported input constraint");
   4198       }
   4199     }
   4200   }
   4201 
   4202   saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
   4203   for (i = 0; i < nout; ++i) {
   4204     if (bound_outs[i].kind == RV64_INLINE_OPK_REG) {
   4205       NativeAllocClass cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP
   4206                                  ? NATIVE_REG_FP
   4207                                  : NATIVE_REG_INT;
   4208       if (outs[i].dir == KIT_CG_ASM_INOUT) {
   4209         rv_direct_load_operand_to_reg(
   4210             d, out_ops[i],
   4211             native_loc_reg(bound_outs[i].type, cls,
   4212                            (Reg)bound_outs[i].v.local));
   4213       }
   4214     } else if (bound_outs[i].kind == OPK_INDIRECT) {
   4215       NativeLoc loc =
   4216           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   4217                          (Reg)bound_outs[i].v.ind.base);
   4218       rv_direct_load_address_to_reg(d, out_ops[i], loc);
   4219     }
   4220   }
   4221   for (i = 0; i < nin; ++i) {
   4222     if (bound_ins[i].kind == RV64_INLINE_OPK_REG) {
   4223       NativeAllocClass cls = bound_ins[i].pad[0] == RV64_INLINE_OPCLS_FP
   4224                                  ? NATIVE_REG_FP
   4225                                  : NATIVE_REG_INT;
   4226       rv_direct_load_operand_to_reg(
   4227           d, in_ops[i],
   4228           native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local));
   4229     } else if (bound_ins[i].kind == OPK_INDIRECT) {
   4230       NativeLoc loc =
   4231           native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
   4232                          (Reg)bound_ins[i].v.ind.base);
   4233       rv_direct_load_address_to_reg(d, in_ops[i], loc);
   4234     }
   4235   }
   4236   asmh = rv64_asm_open(c);
   4237   rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
   4238                    nclob);
   4239   rv64_asm_run_template(asmh, d->native->mc, tmpl);
   4240   rv64_asm_close(asmh);
   4241 
   4242   for (i = 0; i < nout; ++i) {
   4243     NativeAllocClass cls;
   4244     NativeLoc src;
   4245     if (bound_outs[i].kind != RV64_INLINE_OPK_REG) continue;
   4246     cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP ? NATIVE_REG_FP
   4247                                                        : NATIVE_REG_INT;
   4248     src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
   4249     rv_direct_store_reg_to_operand(d, out_ops[i], src);
   4250   }
   4251   for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]);
   4252 }
   4253 
   4254 static const NativeOps rv_direct_ops = {
   4255     .bind_param = rv_bind_param,
   4256     .tail_call_unrealizable_reason = rv_no_tail,
   4257     .va_start_ = rv_va_start_,
   4258     .va_arg_ = rv_va_arg_,
   4259     .va_end_ = rv_va_end_,
   4260     .va_copy_ = rv_va_copy_,
   4261     .asm_block = rv_direct_asm_block,
   4262 };
   4263 
   4264 const NativeOps* rv64_native_direct_ops(void) { return &rv_direct_ops; }
	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README