kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

asm.c (55616B)


      1 /* RV64 assembler — descriptor-table driven.
      2  *
      3  * Mnemonic → Rv64InsnDesc via rv64_asm_find; operand parsing dispatches
      4  * on the format kind. The descriptor's `match` field already carries
      5  * the funct3/funct7/opcode bits; the parser only needs to fill in the
      6  * register operands and immediate.
      7  *
      8  * Aliases (li, mv, ret, jr, j, nop, sext.w, beqz, bnez) are recognized
      9  * by their alias rows in the descriptor table and rewritten to the
     10  * canonical encoding here. Inline rv_* encoders in isa.h remain the
     11  * hot path for codegen; the assembler uses them to assemble the
     12  * machine word once it has the operand values. */
     13 
     14 #include "arch/riscv/asm.h"
     15 
     16 #include <string.h>
     17 
     18 #include "arch/riscv/isa.h"
     19 #include "arch/riscv/regs.h"
     20 #include "arch/riscv/rv64.h"
     21 #include "arch/riscv/variant.h"
     22 #include "asm/asm_helpers.h"
     23 #include "core/arena.h"
     24 #include "core/pool.h"
     25 #include "core/slice.h"
     26 #include "core/strbuf.h"
     27 #include "obj/obj.h"
     28 
     29 struct Rv64Asm {
     30   ArchAsm base;
     31   Compiler* c;
     32 
     33   /* Inline-asm bound state (set by rv64_inline_bind, cleared otherwise).
     34    * Operand indexing per GCC convention: 0..nout-1 are outputs, then
     35    * nout..nout+nin-1 are inputs. Templates address into this combined
     36    * list via %N / %zN / %aN / %w[name] / %x[name]. */
     37   const AsmConstraint* outs;
     38   Operand* out_ops;
     39   const AsmConstraint* ins;
     40   const Operand* in_ops;
     41   const Sym* clobbers;
     42   u32 nout;
     43   u32 nin;
     44   u32 nclob;
     45 };
     46 
     47 typedef struct Rv64Asm Rv64Asm;
     48 
     /* Which relocation modifier, if any, decorates the 12-bit offset of a
      * memory operand. RV_MEMMOD_NONE means the offset is an ordinary numeric
      * displacement held in `disp`; the other two correspond to `%lo(...)` and
      * `%pcrel_lo(...)`. */
     typedef enum RvMemMod {
       RV_MEMMOD_NONE = 0,
       RV_MEMMOD_LO = 1,
       RV_MEMMOD_PCREL_LO = 2,
     } RvMemMod;
     56 
     57 typedef struct Rv64Mem {
     58   i32 disp;
     59   u32 base;
     60   RvMemMod mod; /* reloc modifier on the offset, or RV_MEMMOD_NONE */
     61   ObjSymId sym; /* symbol when mod != NONE */
     62   i64 off;      /* addend when mod != NONE */
     63 } Rv64Mem;
     64 
     65 static int sym_to_cstr(AsmDriver* d, Sym s, char* out, size_t cap) {
     66   Slice sl = pool_slice(asm_driver_pool(d), s);
     67   if (!sl.s || sl.len >= cap) return 0;
     68   memcpy(out, sl.s, sl.len);
     69   out[sl.len] = '\0';
     70   return 1;
     71 }
     72 
     73 /* True if `s` begins with the NUL-terminated literal `pfx` (length-explicit).
     74  */
     75 static bool slice_has_prefix_cstr(Slice s, const char* pfx, size_t n) {
     76   return s.len >= n && memcmp(s.s, pfx, n) == 0;
     77 }
     78 
     79 static int rv_reg_from_name(AsmDriver* d, Sym s, u32* reg_out, int* fp_out) {
     80   char name[16];
     81   uint32_t dwarf = 0;
     82   if (!sym_to_cstr(d, s, name, sizeof name)) return 0;
     83   if (rv64_register_index(name, &dwarf) != 0) return 0;
     84   if (reg_out) *reg_out = dwarf & 31u;
     85   if (fp_out) *fp_out = dwarf >= 32u;
     86   return 1;
     87 }
     88 
     89 static u32 parse_reg(AsmDriver* d, int* fp_out) {
     90   AsmTok t = asm_driver_next(d);
     91   u32 r;
     92   if (t.kind != ASM_TOK_IDENT || !rv_reg_from_name(d, t.v.ident, &r, fp_out))
     93     asm_driver_panic(d, "rv64 asm: bad register");
     94   return r;
     95 }
     96 
     97 static u32 parse_xreg(AsmDriver* d) {
     98   int fp = 0;
     99   u32 r = parse_reg(d, &fp);
    100   if (fp) asm_driver_panic(d, "rv64 asm: expected integer register");
    101   return r;
    102 }
    103 
    104 static u32 parse_freg(AsmDriver* d) {
    105   int fp = 0;
    106   u32 r = parse_reg(d, &fp);
    107   if (!fp) asm_driver_panic(d, "rv64 asm: expected float register");
    108   return r;
    109 }
    110 
    111 static void expect_comma(AsmDriver* d) {
    112   if (!asm_driver_eat_comma(d)) asm_driver_panic(d, "rv64 asm: expected ','");
    113 }
    114 
    115 /* Parse a CSR operand: a standard CSR name (mstatus, mtvec, ...) or a bare
    116  * numeric expression. Returns the 12-bit CSR number. */
    117 static u32 parse_csr(AsmDriver* d) {
    118   AsmTok t = asm_driver_peek(d);
    119   if (t.kind == ASM_TOK_IDENT) {
    120     u16 num;
    121     if (rv64_csr_num_from_name(pool_slice(asm_driver_pool(d), t.v.ident),
    122                                &num)) {
    123       (void)asm_driver_next(d); /* consume the name */
    124       return (u32)num & 0xfffu;
    125     }
    126   }
    127   return (u32)asm_driver_parse_const(d) & 0xfffu;
    128 }
    129 
    /* Where a `%mod(sym)` relocation operand sits in the instruction: the
     * upper 20-bit field of lui/auipc (HI20), a 12-bit I-type immediate as in
     * addi/loads (LO_I), or a 12-bit S-type immediate as in stores (LO_S). */
    typedef enum RvModPos {
      RV_MODPOS_HI20 = 0,
      RV_MODPOS_LO_I = 1,
      RV_MODPOS_LO_S = 2,
    } RvModPos;
    137 
    138 /* Map a relocation-modifier name (`hi`, `lo`, `pcrel_hi`, `pcrel_lo`,
    139  * `got_pcrel_hi`) to the RelocKind appropriate for `pos`. Panics on a name
    140  * that is not valid at this operand position. */
    141 static RelocKind rv_mod_to_reloc(AsmDriver* d, Slice name, RvModPos pos) {
    142   if (pos == RV_MODPOS_HI20) {
    143     if (slice_eq_cstr(name, "hi")) return R_RV_HI20;
    144     if (slice_eq_cstr(name, "pcrel_hi")) return R_RV_PCREL_HI20;
    145     if (slice_eq_cstr(name, "got_pcrel_hi")) return R_RV_GOT_HI20;
    146   } else {
    147     int store = (pos == RV_MODPOS_LO_S);
    148     if (slice_eq_cstr(name, "lo")) return store ? R_RV_LO12_S : R_RV_LO12_I;
    149     if (slice_eq_cstr(name, "pcrel_lo"))
    150       return store ? R_RV_PCREL_LO12_S : R_RV_PCREL_LO12_I;
    151   }
    152   asm_driver_panic(d, "rv64 asm: relocation modifier not valid here");
    153 }
    154 
    155 /* If the next token is `%`, parse a `%mod(sym{+off})` relocation operand,
    156  * emit the relocation at the current emit position (where the about-to-be-
    157  * returned instruction word will land), and return 1. The caller encodes a
    158  * zero placeholder in the immediate field. Returns 0 if there is no modifier
    159  * (leaving the stream untouched for normal constant parsing). A leading `%`
    160  * is unambiguous here: modulo is infix and never starts an operand. */
    161 static int rv_parse_mod_reloc(AsmDriver* d, RvModPos pos, ObjSymId* sym_out,
    162                               i64* off_out, RelocKind* kind_out) {
    163   if (!asm_driver_tok_is_punct(asm_driver_peek(d), '%')) return 0;
    164   (void)asm_driver_next(d); /* eat '%' */
    165   AsmTok name = asm_driver_next(d);
    166   if (name.kind != ASM_TOK_IDENT)
    167     asm_driver_panic(d, "rv64 asm: expected relocation modifier name");
    168   Slice nm = pool_slice(asm_driver_pool(d), name.v.ident);
    169   asm_driver_expect_punct(d, '(', "'(' after relocation modifier");
    170   ObjSymId sym = OBJ_SYM_NONE;
    171   i64 off = 0;
    172   asm_driver_parse_sym_expr(d, &sym, &off);
    173   asm_driver_expect_punct(d, ')', "')' after %mod(sym)");
    174   RelocKind k = rv_mod_to_reloc(d, nm, pos);
    175   if (sym_out) *sym_out = sym;
    176   if (off_out) *off_out = off;
    177   if (kind_out) *kind_out = k;
    178   return 1;
    179 }
    180 
    181 /* Parse a RISC-V rounding-mode mnemonic (the comma is already consumed) into
    182  * its 3-bit funct3 value. cc -S emits this suffix on fcvt/fsqrt when the mode
    183  * isn't the default `dyn`, so the round-trip (and clang) re-encode the exact
    184  * mode rather than guessing a default. */
    185 static u32 rv_parse_rm_name(AsmDriver* d) {
    186   AsmTok t = asm_driver_next(d);
    187   Slice s;
    188   if (t.kind != ASM_TOK_IDENT)
    189     asm_driver_panic(d, "rv64 asm: expected rounding mode");
    190   s = pool_slice(asm_driver_pool(d), t.v.ident);
    191   if (slice_eq_cstr(s, "rne")) return 0u;
    192   if (slice_eq_cstr(s, "rtz")) return 1u;
    193   if (slice_eq_cstr(s, "rdn")) return 2u;
    194   if (slice_eq_cstr(s, "rup")) return 3u;
    195   if (slice_eq_cstr(s, "rmm")) return 4u;
    196   if (slice_eq_cstr(s, "dyn")) return 7u;
    197   asm_driver_panic(d, "rv64 asm: unknown rounding mode");
    198 }
    199 
    200 /* Emit a relocation for a U-type / I-type immediate `%mod(sym)` operand at
    201  * the current instruction position; returns 1 if one was present. */
    202 static int rv_emit_imm_mod_reloc(AsmDriver* d, RvModPos pos) {
    203   ObjSymId sym;
    204   i64 off;
    205   RelocKind k;
    206   if (!rv_parse_mod_reloc(d, pos, &sym, &off, &k)) return 0;
    207   MCEmitter* mc = asm_driver_mc(d);
    208   mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), k, sym, off, 0, 0);
    209   return 1;
    210 }
    211 
    212 static Rv64Mem parse_mem(AsmDriver* d) {
    213   Rv64Mem m;
    214   m.disp = 0;
    215   m.mod = RV_MEMMOD_NONE;
    216   m.sym = OBJ_SYM_NONE;
    217   m.off = 0;
    218   if (asm_driver_tok_is_punct(asm_driver_peek(d), '%')) {
    219     /* `%lo(sym)(base)` / `%pcrel_lo(label)(base)` — record the modifier; the
    220      * load/store caller emits the I- or S-type relocation. */
    221     ObjSymId sym;
    222     i64 off;
    223     RelocKind k;
    224     (void)rv_parse_mod_reloc(d, RV_MODPOS_LO_I, &sym, &off, &k);
    225     m.mod = (k == R_RV_PCREL_LO12_I) ? RV_MEMMOD_PCREL_LO : RV_MEMMOD_LO;
    226     m.sym = sym;
    227     m.off = off;
    228   } else {
    229     m.disp = (i32)asm_driver_parse_const(d);
    230   }
    231   asm_driver_expect_punct(d, '(', "'(' in rv64 memory operand");
    232   m.base = parse_xreg(d);
    233   asm_driver_expect_punct(d, ')', "')' in rv64 memory operand");
    234   return m;
    235 }
    236 
    237 /* Emit the I/S-type relocation recorded by parse_mem for a `%lo`/`%pcrel_lo`
    238  * memory offset, picking the S-type variant for stores. */
    239 static void rv_emit_mem_mod_reloc(AsmDriver* d, const Rv64Mem* m,
    240                                   int is_store) {
    241   if (m->mod == RV_MEMMOD_NONE) return;
    242   RelocKind k = (m->mod == RV_MEMMOD_PCREL_LO)
    243                     ? (is_store ? R_RV_PCREL_LO12_S : R_RV_PCREL_LO12_I)
    244                     : (is_store ? R_RV_LO12_S : R_RV_LO12_I);
    245   MCEmitter* mc = asm_driver_mc(d);
    246   mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), k, m->sym, m->off, 0, 0);
    247 }
    248 
    249 /* Fence pred/succ parser — accepts a string like "rw" / "iorw" / "0" /
    250  * a numeric literal. Returns the 4-bit mask: bit3=i, bit2=o, bit1=r,
    251  * bit0=w. */
    252 static u32 parse_fence_mask(AsmDriver* d) {
    253   AsmTok t = asm_driver_peek(d);
    254   if (t.kind == ASM_TOK_NUM) {
    255     (void)asm_driver_next(d);
    256     return (u32)asm_driver_parse_const(d) & 0xfu;
    257   }
    258   if (t.kind == ASM_TOK_IDENT) {
    259     char name[8];
    260     AsmTok tt = asm_driver_next(d);
    261     if (!sym_to_cstr(d, tt.v.ident, name, sizeof name))
    262       asm_driver_panic(d, "rv64 asm: bad fence mask");
    263     u32 mask = 0;
    264     for (const char* p = name; *p; ++p) {
    265       switch (*p) {
    266         case 'i':
    267           mask |= 8u;
    268           break;
    269         case 'o':
    270           mask |= 4u;
    271           break;
    272         case 'r':
    273           mask |= 2u;
    274           break;
    275         case 'w':
    276           mask |= 1u;
    277           break;
    278         default:
    279           asm_driver_panic(d, "rv64 asm: bad fence char");
    280       }
    281     }
    282     return mask;
    283   }
    284   asm_driver_panic(d, "rv64 asm: bad fence operand");
    285 }
    286 
    287 /* The XLEN variant for the assembly target. Reached off the AsmDriver's
    288  * Compiler so the stateless encoders can gate rv32-vs-rv64 behavior
    289  * (shamt width, addiw availability). */
    290 static const RiscvVariant* rv_asm_variant(AsmDriver* d) {
    291   return riscv_variant_for_kind(asm_driver_compiler(d)->target.arch);
    292 }
    293 
    294 /* Field overlay onto a descriptor's `match` word.
    295  *
    296  * For most formats the descriptor's match already pins opcode +
    297  * funct3 + funct7. We OR in the per-operand fields. For shift-imm and
    298  * AMO families the layouts diverge from the basic R/I templates — we
    299  * handle those explicitly below. */
    300 
    /* R-type overlay: OR rd (bits 11:7), rs1 (19:15) and rs2 (24:20) into
     * `match`. Register numbers are masked to five bits. */
    static u32 enc_r(u32 match, u32 rd, u32 rs1, u32 rs2) {
      u32 word = match;
      word |= (rd & 0x1fu) << 7;
      word |= (rs1 & 0x1fu) << 15;
      word |= (rs2 & 0x1fu) << 20;
      return word;
    }
    /* I-type overlay: OR rd (bits 11:7), rs1 (19:15) and the low 12 bits of
     * `imm12` (31:20) into `match`. */
    static u32 enc_i(u32 match, u32 rd, u32 rs1, i32 imm12) {
      u32 imm_field = (u32)imm12 & 0xfffu;
      u32 word = match;
      word |= (rd & 0x1fu) << 7;
      word |= (rs1 & 0x1fu) << 15;
      word |= imm_field << 20;
      return word;
    }
    /* S-type overlay: the low 12 bits of `imm12` are split into imm[11:5]
     * (bits 31:25) and imm[4:0] (bits 11:7); rs2 goes to 24:20 and rs1 to
     * 19:15. */
    static u32 enc_s(u32 match, u32 rs2, u32 rs1, i32 imm12) {
      u32 imm = (u32)imm12 & 0xfffu;
      u32 imm_hi = imm >> 5;
      u32 imm_lo = imm & 0x1fu;
      u32 word = match;
      word |= imm_lo << 7;
      word |= (rs1 & 0x1fu) << 15;
      word |= (rs2 & 0x1fu) << 20;
      word |= imm_hi << 25;
      return word;
    }
    /* B-type overlay: scatter the byte displacement `imm13` as
     * imm[12] -> bit 31, imm[10:5] -> 30:25, imm[4:1] -> 11:8, imm[11] -> bit 7,
     * with rs1 in 19:15 and rs2 in 24:20. Bit 0 of the displacement is
     * dropped. */
    static u32 enc_b(u32 match, u32 rs1, u32 rs2, i32 imm13) {
      u32 disp = (u32)imm13;
      u32 word = match;
      word |= ((disp >> 11) & 1u) << 7;
      word |= ((disp >> 1) & 0xfu) << 8;
      word |= (rs1 & 0x1fu) << 15;
      word |= (rs2 & 0x1fu) << 20;
      word |= ((disp >> 5) & 0x3fu) << 25;
      word |= ((disp >> 12) & 1u) << 31;
      return word;
    }
    /* U-type overlay: the low 20 bits of `imm20` land in bits 31:12 and rd in
     * bits 11:7. */
    static u32 enc_u(u32 match, u32 rd, u32 imm20) {
      u32 upper = (imm20 & 0xfffffu) << 12;
      u32 dest = (rd & 0x1fu) << 7;
      return match | upper | dest;
    }
    /* J-type overlay: scatter the byte displacement `imm21` as
     * imm[20] -> bit 31, imm[10:1] -> 30:21, imm[11] -> bit 20,
     * imm[19:12] -> 19:12, with rd in 11:7. Bit 0 is dropped. */
    static u32 enc_j(u32 match, u32 rd, i32 imm21) {
      u32 disp = (u32)imm21;
      u32 word = match;
      word |= (rd & 0x1fu) << 7;
      word |= ((disp >> 12) & 0xffu) << 12;
      word |= ((disp >> 11) & 1u) << 20;
      word |= ((disp >> 1) & 0x3ffu) << 21;
      word |= ((disp >> 20) & 1u) << 31;
      return word;
    }
    /* R4-type overlay: rs3 in bits 31:27, rs2 in 24:20, rs1 in 19:15, the 3-bit
     * rounding mode `rm` in 14:12 and rd in 11:7, OR-ed into `match`. */
    static u32 enc_r4(u32 match, u32 rd, u32 rs1, u32 rs2, u32 rs3, u32 rm) {
      u32 word = match;
      word |= (rd & 0x1fu) << 7;
      word |= (rm & 0x7u) << 12;
      word |= (rs1 & 0x1fu) << 15;
      word |= (rs2 & 0x1fu) << 20;
      word |= (rs3 & 0x1fu) << 27;
      return word;
    }
    333 
    334 /* SLLI/SRLI/SRAI shift-imm. The shamt occupies bits 25:20 on rv64 (6-bit,
    335  * funct6 in match) but only bits 24:20 on rv32 (5-bit; bit 25 belongs to
    336  * funct7 and MUST stay 0, else the word reads as a different funct7). The
    337  * variant's shamt_bits drives the mask; an rv32 shamt >= 32 is rejected. */
    338 static u32 enc_ishift(AsmDriver* d, u32 match, u32 rd, u32 rs1, u32 shamt) {
    339   u32 shamt_bits = rv_asm_variant(d)->shamt_bits;
    340   u32 shamt_mask = (shamt_bits == 5u) ? 0x1fu : 0x3fu;
    341   if (shamt > shamt_mask)
    342     asm_driver_panic(d, "rv64 asm: shift amount out of range for target XLEN");
    343   return match | ((shamt & shamt_mask) << 20) | ((rs1 & 0x1fu) << 15) |
    344          ((rd & 0x1fu) << 7);
    345 }
    /* Word shift-immediate overlay (RV64_FMT_I_SHIFTW; presumably the
     * slliw/srliw/sraiw family — confirm against the descriptor table). The
     * shamt is always 5 bits, in bits 24:20; funct7 is already pinned by
     * `match`. An out-of-range shamt is silently truncated to its low five
     * bits. */
    static u32 enc_ishiftw(u32 match, u32 rd, u32 rs1, u32 shamt) {
      u32 word = match;
      word |= (rd & 0x1fu) << 7;
      word |= (rs1 & 0x1fu) << 15;
      word |= (shamt & 0x1fu) << 20;
      return word;
    }
    /* AMO overlay: aq in bit 26, rl in bit 25, rs2 in 24:20, rs1 in 19:15 and
     * rd in 11:7. The aq/rl bits are meant to come from optional .aq/.rl
     * mnemonic suffixes; for now mnemonics arrive bare. */
    static u32 enc_amo(u32 match, u32 aq, u32 rl, u32 rd, u32 rs1, u32 rs2) {
      u32 ordering = ((aq & 1u) << 26) | ((rl & 1u) << 25);
      u32 regs = ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
                 ((rd & 0x1fu) << 7);
      return match | ordering | regs;
    }
    357 
    358 static u32 c_reg3(AsmDriver* d, u32 r) {
    359   if (r < 8u || r > 15u)
    360     asm_driver_panic(d,
    361                      "rv64 asm: compressed register must be x8..x15/f8..f15");
    362   return r - 8u;
    363 }
    364 
    /* Compressed CI-format overlay: a 6-bit immediate with imm[5] in bit 12
     * and imm[4:0] in bits 6:2, plus a full 5-bit rd in bits 11:7. */
    static u32 enc_c_ci(u32 match, u32 rd, i32 imm) {
      u32 imm6 = (u32)imm & 0x3fu;
      u32 word = match;
      word |= (imm6 & 0x1fu) << 2;
      word |= (rd & 0x1fu) << 7;
      word |= ((imm6 >> 5) & 1u) << 12;
      return word;
    }
    370 
    /* Compressed CR-format overlay: rd/rs1 in bits 11:7 and rs2 in bits 6:2,
     * both full 5-bit register numbers. */
    static u32 enc_c_cr(u32 match, u32 rd_rs1, u32 rs2) {
      u32 word = match;
      word |= (rs2 & 0x1fu) << 2;
      word |= (rd_rs1 & 0x1fu) << 7;
      return word;
    }
    374 
    /* c.addi16sp immediate overlay. The low 10 bits of `imm` are scattered as
     * imm[9] -> bit 12, imm[4] -> bit 6, imm[6] -> bit 5, imm[8:7] -> bits 4:3,
     * imm[5] -> bit 2. imm[3:0] are not encoded. */
    static u32 enc_c_addi16sp(u32 match, i32 imm) {
      u32 v = (u32)imm & 0x3ffu;
      u32 word = match;
      word |= ((v >> 5) & 1u) << 2;
      word |= ((v >> 7) & 3u) << 3;
      word |= ((v >> 6) & 1u) << 5;
      word |= ((v >> 4) & 1u) << 6;
      word |= ((v >> 9) & 1u) << 12;
      return word;
    }
    381 
    /* c.addi4spn overlay. The 8-bit immediate field in bits 12:5 is laid out as
     * imm[5:4 | 9:6 | 2 | 3] from the top down; the 3-bit compressed rd sits in
     * bits 4:2. imm[1:0] are not encoded. */
    static u32 enc_c_addi4spn(u32 match, u32 rd3, u32 imm) {
      u32 field = 0;
      field |= (imm >> 3) & 1u;
      field |= ((imm >> 2) & 1u) << 1;
      field |= ((imm >> 6) & 0xfu) << 2;
      field |= ((imm >> 4) & 3u) << 6;
      return match | ((field & 0xffu) << 5) | ((rd3 & 7u) << 2);
    }
    387 
    /* Compressed register-based load overlay (CL format). off[5:3] always goes
     * to bits 12:10, the compressed base register to 9:7 and the compressed rd
     * to 4:2. The remaining offset bits depend on the access width:
     *   wide64 (8-byte): off[7:6] -> bits 6:5
     *   otherwise (4-byte): off[2] -> bit 6, off[6] -> bit 5 */
    static u32 enc_c_lwld(u32 match, u32 rd3, u32 rs1_3, u32 off, int wide64) {
      u32 word = match;
      word |= (rd3 & 7u) << 2;
      word |= (rs1_3 & 7u) << 7;
      word |= ((off >> 3) & 7u) << 10;
      if (wide64) {
        word |= ((off >> 6) & 3u) << 5;
      } else {
        word |= ((off >> 6) & 1u) << 5;
        word |= ((off >> 2) & 1u) << 6;
      }
      return word;
    }
    397 
    398 static u32 enc_c_swld(u32 match, u32 rs2_3, u32 rs1_3, u32 off, int wide64) {
    399   return enc_c_lwld(match, rs2_3, rs1_3, off, wide64);
    400 }
    401 
    /* Compressed stack-pointer-relative load overlay. off[5] goes to bit 12 and
     * the full 5-bit rd to bits 11:7 in both widths; the rest differs:
     *   wide64 (8-byte): off[4:3] -> bits 6:5, off[8:6] -> bits 4:2
     *   otherwise (4-byte): off[4:2] -> bits 6:4, off[7:6] -> bits 3:2 */
    static u32 enc_c_lwsp(u32 match, u32 rd, u32 off, int wide64) {
      u32 word = match;
      word |= (rd & 0x1fu) << 7;
      word |= ((off >> 5) & 1u) << 12;
      if (wide64) {
        word |= ((off >> 6) & 7u) << 2;
        word |= ((off >> 3) & 3u) << 5;
      } else {
        word |= ((off >> 6) & 3u) << 2;
        word |= ((off >> 2) & 7u) << 4;
      }
      return word;
    }
    410 
    /* Compressed stack-pointer-relative store overlay. A 6-bit offset field
     * occupies bits 12:7 and the full 5-bit rs2 bits 6:2. Field contents:
     *   wide64 (8-byte): off[5:3] in the upper three bits, off[8:6] in the lower
     *   otherwise (4-byte): off[5:2] in the upper four bits, off[7:6] in the
     *   lower two */
    static u32 enc_c_swsp(u32 match, u32 rs2, u32 off, int wide64) {
      u32 upper = wide64 ? (((off >> 3) & 7u) << 3) : (((off >> 2) & 0xfu) << 2);
      u32 lower = wide64 ? ((off >> 6) & 7u) : ((off >> 6) & 3u);
      u32 field = (upper | lower) & 0x3fu;
      return match | (field << 7) | ((rs2 & 0x1fu) << 2);
    }
    419 
    /* Compressed branch (CB format) overlay. The low 9 bits of the byte
     * displacement are scattered as off[8] -> bit 12, off[4:3] -> bits 11:10,
     * off[7:6] -> bits 6:5, off[2:1] -> bits 4:3, off[5] -> bit 2; the
     * compressed rs1 sits in bits 9:7. Bit 0 is dropped. */
    static u32 enc_c_cb_imm(u32 match, u32 rs1_3, i32 imm) {
      u32 disp = (u32)imm & 0x1ffu;
      u32 word = match;
      word |= ((disp >> 5) & 1u) << 2;
      word |= ((disp >> 1) & 3u) << 3;
      word |= ((disp >> 6) & 3u) << 5;
      word |= (rs1_3 & 7u) << 7;
      word |= ((disp >> 3) & 3u) << 10;
      word |= ((disp >> 8) & 1u) << 12;
      return word;
    }
    426 
    /* Compressed ALU-immediate overlay on a 3-bit register: a 6-bit immediate
     * with imm[5] in bit 12 and imm[4:0] in bits 6:2, and the compressed rd in
     * bits 9:7. */
    static u32 enc_c_cb_alu_imm(u32 match, u32 rd3, i32 imm) {
      u32 imm6 = (u32)imm & 0x3fu;
      u32 word = match;
      word |= (imm6 & 0x1fu) << 2;
      word |= (rd3 & 7u) << 7;
      word |= ((imm6 >> 5) & 1u) << 12;
      return word;
    }
    432 
    /* Compressed jump (CJ format) overlay. The low 12 bits of the byte
     * displacement fill bits 12:2 in the order
     * off[11 | 4 | 9:8 | 10 | 6 | 7 | 3:1 | 5] from bit 12 downwards. Bit 0 is
     * dropped. */
    static u32 enc_c_cj(u32 match, i32 imm) {
      u32 disp = (u32)imm & 0xfffu;
      u32 word = match;
      word |= ((disp >> 5) & 1u) << 2;
      word |= ((disp >> 1) & 7u) << 3;
      word |= ((disp >> 7) & 1u) << 6;
      word |= ((disp >> 6) & 1u) << 7;
      word |= ((disp >> 10) & 1u) << 8;
      word |= ((disp >> 8) & 3u) << 9;
      word |= ((disp >> 4) & 1u) << 11;
      word |= ((disp >> 11) & 1u) << 12;
      return word;
    }
    440 
    441 /* Parse a branch/jump target operand. With a symbolic target (a label), emit
    442  * the relocation at the current position — which is exactly where the caller
    443  * is about to write this instruction word — and return 0 as the placeholder
    444  * immediate. With a bare constant, return it as the PC-relative byte
    445  * displacement (preserving the existing numeric-offset corpus behavior). */
    446 static i32 rv_reloc_target(AsmDriver* d, RelocKind kind) {
    447   ObjSymId sym = OBJ_SYM_NONE;
    448   i64 off = 0;
    449   asm_driver_parse_sym_expr(d, &sym, &off);
    450   if (sym != OBJ_SYM_NONE) {
    451     MCEmitter* mc = asm_driver_mc(d);
    452     mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), kind, sym, off, 0, 0);
    453     return 0;
    454   }
    455   return (i32)off;
    456 }
    457 
    458 /* Per-format parser — reads the operand list off the driver and returns
    459  * the encoded 32-bit word, given the matched descriptor. */
    460 static u32 assemble_one(AsmDriver* d, const Rv64InsnDesc* desc) {
    461   u32 m = desc->match;
    462   u32 rd = 0, rs1 = 0, rs2 = 0;
    463   i32 imm = 0;
    464   Rv64Mem mem;
    465 
    466   switch ((Rv64Format)desc->fmt) {
    467     case RV64_FMT_R:
    468       /* Two-operand aliases: snez/neg/negw — rd, rs (rs1=x0). */
    469       if (desc->flags & RV64_ASMFL_ALIAS) {
    470         rd = parse_xreg(d);
    471         expect_comma(d);
    472         rs2 = parse_xreg(d);
    473         return enc_r(m, rd, 0u, rs2);
    474       }
    475       rd = parse_xreg(d);
    476       expect_comma(d);
    477       rs1 = parse_xreg(d);
    478       expect_comma(d);
    479       rs2 = parse_xreg(d);
    480       return enc_r(m, rd, rs1, rs2);
    481 
    482     case RV64_FMT_R4: {
    483       u32 rs3;
    484       rd = parse_freg(d);
    485       expect_comma(d);
    486       rs1 = parse_freg(d);
    487       expect_comma(d);
    488       rs2 = parse_freg(d);
    489       expect_comma(d);
    490       rs3 = parse_freg(d);
    491       return enc_r4(m, rd, rs1, rs2, rs3, 0x7u);
    492     }
    493 
    494     case RV64_FMT_I:
    495       /* Aliases first. `li` is handled earlier by rv64_emit_pseudo (it may
    496        * need a multi-word expansion), so it never reaches here. */
    497       if (desc->flags & RV64_ASMFL_ALIAS) {
    498         if (slice_eq_cstr(desc->mnemonic, "mv")) {
    499           /* Standard two-operand `mv rd, rs` = `addi rd, rs, 0`. (A %pcrel_lo
    500            * low-half is emitted as the canonical `addi rd, rs, %pcrel_lo(L)`,
    501            * not a non-standard 3-operand `mv`, so it lands in the ADDI path
    502            * below — matching clang.) */
    503           rd = parse_xreg(d);
    504           expect_comma(d);
    505           rs1 = parse_xreg(d);
    506           return enc_i(m, rd, rs1, 0);
    507         }
    508         if (slice_eq_cstr(desc->mnemonic, "sext.w")) {
    509           rd = parse_xreg(d);
    510           expect_comma(d);
    511           rs1 = parse_xreg(d);
    512           return enc_i(m, rd, rs1, 0);
    513         }
    514         if (slice_eq_cstr(desc->mnemonic, "seqz") ||
    515             slice_eq_cstr(desc->mnemonic, "not")) {
    516           rd = parse_xreg(d);
    517           expect_comma(d);
    518           rs1 = parse_xreg(d);
    519           /* match already has imm12 + funct3 + op pinned. */
    520           return m | ((rs1 & 0x1fu) << 15) | ((rd & 0x1fu) << 7);
    521         }
    522       }
    523       rd = parse_xreg(d);
    524       expect_comma(d);
    525       rs1 = parse_xreg(d);
    526       expect_comma(d);
    527       /* `addi rd, rs1, %lo(sym)` / `%pcrel_lo(label)` → R_RV_LO12_I. */
    528       if (rv_emit_imm_mod_reloc(d, RV_MODPOS_LO_I)) return enc_i(m, rd, rs1, 0);
    529       imm = (i32)asm_driver_parse_const(d);
    530       return enc_i(m, rd, rs1, imm);
    531 
    532     case RV64_FMT_I_SHIFT:
    533       rd = parse_xreg(d);
    534       expect_comma(d);
    535       rs1 = parse_xreg(d);
    536       expect_comma(d);
    537       return enc_ishift(d, m, rd, rs1, (u32)asm_driver_parse_const(d));
    538 
    539     case RV64_FMT_I_SHIFTW:
    540       rd = parse_xreg(d);
    541       expect_comma(d);
    542       rs1 = parse_xreg(d);
    543       expect_comma(d);
    544       return enc_ishiftw(m, rd, rs1, (u32)asm_driver_parse_const(d));
    545 
    546     case RV64_FMT_U:
    547       rd = parse_xreg(d);
    548       expect_comma(d);
    549       /* `lui rd, %hi(sym)` → R_RV_HI20; `auipc rd, %pcrel_hi(sym)` →
    550        * R_RV_PCREL_HI20 (or %got_pcrel_hi → R_RV_GOT_HI20). */
    551       if (rv_emit_imm_mod_reloc(d, RV_MODPOS_HI20)) return enc_u(m, rd, 0);
    552       imm = (i32)asm_driver_parse_const(d);
    553       /* LUI/AUIPC immediate is the upper-20 value: the input is interpreted
    554        * as the literal 20-bit value (already shifted-out form). */
    555       return enc_u(m, rd, (u32)imm);
    556 
    557     case RV64_FMT_J:
    558       /* `j label` / `jal rd, label` accept a symbolic target (R_RV_JAL) or a
    559        * bare numeric displacement. */
    560       if ((desc->flags & RV64_ASMFL_ALIAS) &&
    561           slice_eq_cstr(desc->mnemonic, "j")) {
    562         return enc_j(m, 0u, rv_reloc_target(d, R_RV_JAL));
    563       }
    564       rd = parse_xreg(d);
    565       expect_comma(d);
    566       return enc_j(m, rd, rv_reloc_target(d, R_RV_JAL));
    567 
    568     case RV64_FMT_B:
    569       /* `beq rs1, rs2, label` (and beqz/bnez aliases) accept a symbolic target
    570        * (R_RV_BRANCH) or a bare numeric displacement. */
    571       if (desc->flags & RV64_ASMFL_ALIAS) {
    572         /* beqz / bnez: rs, off. */
    573         rs1 = parse_xreg(d);
    574         expect_comma(d);
    575         return enc_b(m, rs1, 0u, rv_reloc_target(d, R_RV_BRANCH));
    576       }
    577       rs1 = parse_xreg(d);
    578       expect_comma(d);
    579       rs2 = parse_xreg(d);
    580       expect_comma(d);
    581       return enc_b(m, rs1, rs2, rv_reloc_target(d, R_RV_BRANCH));
    582 
    583     case RV64_FMT_LOAD:
    584       rd = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
    585       expect_comma(d);
    586       mem = parse_mem(d);
    587       rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/0);
    588       return enc_i(m, rd, mem.base, mem.disp);
    589 
    590     case RV64_FMT_FP_LOAD:
    591       rd = parse_freg(d);
    592       expect_comma(d);
    593       mem = parse_mem(d);
    594       rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/0);
    595       return enc_i(m, rd, mem.base, mem.disp);
    596 
    597     case RV64_FMT_STORE:
    598       rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
    599       expect_comma(d);
    600       mem = parse_mem(d);
    601       rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/1);
    602       return enc_s(m, rs2, mem.base, mem.disp);
    603 
    604     case RV64_FMT_FP_STORE:
    605       rs2 = parse_freg(d);
    606       expect_comma(d);
    607       mem = parse_mem(d);
    608       rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/1);
    609       return enc_s(m, rs2, mem.base, mem.disp);
    610 
    611     case RV64_FMT_JALR:
    612       if ((desc->flags & RV64_ASMFL_ALIAS) &&
    613           slice_eq_cstr(desc->mnemonic, "jr")) {
    614         rs1 = parse_xreg(d);
    615         return enc_i(m, 0u, rs1, 0);
    616       }
    617       rd = parse_xreg(d);
    618       if (!asm_driver_eat_comma(d)) {
    619         if (slice_eq_cstr(desc->mnemonic, "jalr"))
    620           return enc_i(m, RV_RA, rd, 0);
    621         asm_driver_panic(d, "rv64 asm: expected ','");
    622       }
    623       /* Accept both `jalr rd, imm(rs1)` and `jalr rd, rs1, imm`. */
    624       {
    625         AsmTok t = asm_driver_peek(d);
    626         if (t.kind == ASM_TOK_IDENT) {
    627           /* register first → register form */
    628           rs1 = parse_xreg(d);
    629           if (asm_driver_eat_comma(d)) {
    630             imm = (i32)asm_driver_parse_const(d);
    631           } else {
    632             imm = 0;
    633           }
    634           return enc_i(m, rd, rs1, imm);
    635         }
    636       }
    637       mem = parse_mem(d);
    638       return enc_i(m, rd, mem.base, mem.disp);
    639 
    640     case RV64_FMT_FENCE: {
    641       u32 pred, succ;
    642       pred = parse_fence_mask(d);
    643       expect_comma(d);
    644       succ = parse_fence_mask(d);
    645       return m | (pred << 24) | (succ << 20);
    646     }
    647 
    648     case RV64_FMT_SYSTEM:
    649       /* No operands. nop/ret/ecall/ebreak. */
    650       return m;
    651 
    652     case RV64_FMT_FP_RM:
    653       rd = parse_freg(d);
    654       expect_comma(d);
    655       rs1 = parse_freg(d);
    656       expect_comma(d);
    657       rs2 = parse_freg(d);
    658       /* Use DYN(=7) rounding mode by default. */
    659       return enc_r(m | (0x7u << 12), rd, rs1, rs2);
    660 
    661     case RV64_FMT_FP_R:
    662       if (desc->flags & RV64_ASMFL_FP) {
    663         rd = parse_freg(d);
    664       } else {
    665         rd = parse_xreg(d);
    666       }
    667       expect_comma(d);
    668       rs1 = parse_freg(d);
    669       expect_comma(d);
    670       rs2 = parse_freg(d);
    671       return enc_r(m, rd, rs1, rs2);
    672 
    673     case RV64_FMT_FP_CVT:
    674       if (desc->flags & RV64_ASMFL_FP) {
    675         rd = parse_freg(d);
    676         expect_comma(d);
    677         /* Source: integer reg for fcvt.s.w etc (no FP flag would
    678          * indicate); but since we have ASMFL_FP set on dest, source may
    679          * be either. Disambiguate by mnemonic. */
    680         if (slice_has_prefix_cstr(desc->mnemonic, "fcvt.s.", 7) &&
    681             (desc->mnemonic.s[7] == 'w' || desc->mnemonic.s[7] == 'l')) {
    682           rs1 = parse_xreg(d);
    683         } else if (slice_has_prefix_cstr(desc->mnemonic, "fcvt.d.", 7) &&
    684                    (desc->mnemonic.s[7] == 'w' || desc->mnemonic.s[7] == 'l')) {
    685           rs1 = parse_xreg(d);
    686         } else if (slice_eq_cstr(desc->mnemonic, "fmv.w.x") ||
    687                    slice_eq_cstr(desc->mnemonic, "fmv.d.x")) {
    688           rs1 = parse_xreg(d);
    689         } else {
    690           rs1 = parse_freg(d);
    691         }
    692       } else {
    693         rd = parse_xreg(d);
    694         expect_comma(d);
    695         rs1 = parse_freg(d);
    696       }
    697       /* match encodes rs2 (type selector); OR in rd/rs1 and the rounding mode.
    698        * An explicit `, <rm>` suffix (cc -S emits it for non-default modes, and
    699        * clang/gas accept it) takes precedence; otherwise a bare conversion
    700        * mnemonic encodes the dynamic rounding mode (DYN=7), matching gas/clang
    701        * for hand-written assembly. (Codegen's C float->int truncation is RTZ,
    702        * but that path uses the rv_fcvt_* encoders directly and supplies its own
    703        * rm; the text assembler must follow the assembler convention.) fmv
    704        * bit-moves carry no rounding (rm=0). */
    705       {
    706         u32 funct7 = (m >> 25) & 0x7fu;
    707         u32 rm;
    708         if (asm_driver_eat_comma(d)) {
    709           rm = rv_parse_rm_name(d);
    710         } else {
    711           switch (funct7) {
    712             case 0x70: /* fmv.x.w */
    713             case 0x71: /* fmv.x.d */
    714             case 0x78: /* fmv.w.x */
    715             case 0x79: /* fmv.d.x */
    716               rm = 0x0u;
    717               break;
    718             default: /* fcvt families: DYN (explicit suffix overrides above) */
    719               rm = 0x7u;
    720               break;
    721           }
    722         }
    723         return m | (rm << 12) | ((rs1 & 0x1fu) << 15) | ((rd & 0x1fu) << 7);
    724       }
    725 
    726     case RV64_FMT_AMO:
    727       rd = parse_xreg(d);
    728       expect_comma(d);
    729       rs2 = parse_xreg(d);
    730       expect_comma(d);
    731       asm_driver_expect_punct(d, '(', "'(' in rv64 amo operand");
    732       rs1 = parse_xreg(d);
    733       asm_driver_expect_punct(d, ')', "')' in rv64 amo operand");
    734       return enc_amo(m, 0u, 0u, rd, rs1, rs2);
    735 
    736     case RV64_FMT_LR:
    737       rd = parse_xreg(d);
    738       expect_comma(d);
    739       asm_driver_expect_punct(d, '(', "'(' in rv64 lr operand");
    740       rs1 = parse_xreg(d);
    741       asm_driver_expect_punct(d, ')', "')' in rv64 lr operand");
    742       return enc_amo(m, 0u, 0u, rd, rs1, 0u);
    743 
    744     case RV64_FMT_CSR: {
    745       u32 csr;
    746       rd = parse_xreg(d);
    747       expect_comma(d);
    748       csr = parse_csr(d);
    749       expect_comma(d);
    750       rs1 = parse_xreg(d);
    751       return enc_i(m, rd, rs1, (i32)csr);
    752     }
    753 
    754     case RV64_FMT_CSRI: {
    755       u32 csr;
    756       rd = parse_xreg(d);
    757       expect_comma(d);
    758       csr = parse_csr(d);
    759       expect_comma(d);
    760       u32 uimm = (u32)asm_driver_parse_const(d) & 0x1fu;
    761       return enc_i(m, rd, uimm, (i32)csr);
    762     }
    763 
    764     case RV64_FMT_CSR_PSEUDO: {
    765       /* 2-operand CSR pseudos. The match word already pins funct3+opcode; we
    766        * supply x0 for the implicit rd or rs1 per the mnemonic. */
    767       u32 csr;
    768       if (slice_eq_cstr(desc->mnemonic, "csrr")) {
    769         /* csrr rd, csr = csrrs rd, csr, x0 */
    770         rd = parse_xreg(d);
    771         expect_comma(d);
    772         csr = parse_csr(d);
    773         return enc_i(m, rd, 0u, (i32)csr);
    774       }
    775       /* csrw/csrs/csrc csr, rs   and   csrwi/csrsi/csrci csr, uimm:
    776        * destination is x0, csr comes first. */
    777       csr = parse_csr(d);
    778       expect_comma(d);
    779       if (slice_eq_cstr(desc->mnemonic, "csrwi") ||
    780           slice_eq_cstr(desc->mnemonic, "csrsi") ||
    781           slice_eq_cstr(desc->mnemonic, "csrci")) {
    782         u32 uimm = (u32)asm_driver_parse_const(d) & 0x1fu;
    783         return enc_i(m, 0u, uimm, (i32)csr);
    784       }
    785       rs1 = parse_xreg(d);
    786       return enc_i(m, 0u, rs1, (i32)csr);
    787     }
    788 
    789     case RV64_FMT_CR:
    790       if (slice_eq_cstr(desc->mnemonic, "c.jr") ||
    791           slice_eq_cstr(desc->mnemonic, "c.jalr")) {
    792         rs1 = parse_xreg(d);
    793         return enc_c_cr(m, rs1, 0u);
    794       }
    795       rd = parse_xreg(d);
    796       expect_comma(d);
    797       rs2 = parse_xreg(d);
    798       return enc_c_cr(m, rd, rs2);
    799 
    800     case RV64_FMT_CI:
    801       if (slice_eq_cstr(desc->mnemonic, "c.lwsp") ||
    802           slice_eq_cstr(desc->mnemonic, "c.ldsp") ||
    803           slice_eq_cstr(desc->mnemonic, "c.fldsp")) {
    804         rd = slice_eq_cstr(desc->mnemonic, "c.fldsp") ? parse_freg(d)
    805                                                       : parse_xreg(d);
    806         expect_comma(d);
    807         mem = parse_mem(d);
    808         if (mem.base != RV_SP)
    809           asm_driver_panic(d, "rv64 asm: compressed stack load needs sp base");
    810         return enc_c_lwsp(m, rd, (u32)mem.disp,
    811                           !slice_eq_cstr(desc->mnemonic, "c.lwsp"));
    812       }
    813       rd = parse_xreg(d);
    814       expect_comma(d);
    815       imm = (i32)asm_driver_parse_const(d);
    816       if (slice_eq_cstr(desc->mnemonic, "c.lui") && ((u32)imm & 0xfffu) == 0)
    817         imm >>= 12;
    818       if (slice_eq_cstr(desc->mnemonic, "c.addi16sp")) {
    819         if (rd != RV_SP)
    820           asm_driver_panic(d, "rv64 asm: c.addi16sp needs sp destination");
    821         return enc_c_addi16sp(m, imm);
    822       }
    823       return enc_c_ci(m, rd, imm);
    824 
    825     case RV64_FMT_CSS:
    826       rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
    827       expect_comma(d);
    828       mem = parse_mem(d);
    829       if (mem.base != RV_SP)
    830         asm_driver_panic(d, "rv64 asm: compressed stack store needs sp base");
    831       return enc_c_swsp(m, rs2, (u32)mem.disp,
    832                         !slice_eq_cstr(desc->mnemonic, "c.swsp"));
    833 
    834     case RV64_FMT_CIW:
    835       rd = parse_xreg(d);
    836       expect_comma(d);
    837       rs1 = parse_xreg(d);
    838       expect_comma(d);
    839       if (rs1 != RV_SP)
    840         asm_driver_panic(d, "rv64 asm: c.addi4spn needs sp source");
    841       imm = (i32)asm_driver_parse_const(d);
    842       return enc_c_addi4spn(m, c_reg3(d, rd), (u32)imm);
    843 
    844     case RV64_FMT_CL:
    845       rd = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
    846       expect_comma(d);
    847       mem = parse_mem(d);
    848       return enc_c_lwld(m, c_reg3(d, rd), c_reg3(d, mem.base), (u32)mem.disp,
    849                         !slice_eq_cstr(desc->mnemonic, "c.lw"));
    850 
    851     case RV64_FMT_CS:
    852       rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
    853       expect_comma(d);
    854       mem = parse_mem(d);
    855       return enc_c_swld(m, c_reg3(d, rs2), c_reg3(d, mem.base), (u32)mem.disp,
    856                         !slice_eq_cstr(desc->mnemonic, "c.sw"));
    857 
    858     case RV64_FMT_CA:
    859       rd = parse_xreg(d);
    860       expect_comma(d);
    861       rs2 = parse_xreg(d);
    862       return m | (c_reg3(d, rd) << 7) | (c_reg3(d, rs2) << 2);
    863 
    864     case RV64_FMT_CB:
    865       rs1 = parse_xreg(d);
    866       expect_comma(d);
    867       imm = (i32)asm_driver_parse_const(d);
    868       if (slice_eq_cstr(desc->mnemonic, "c.beqz") ||
    869           slice_eq_cstr(desc->mnemonic, "c.bnez")) {
    870         return enc_c_cb_imm(m, c_reg3(d, rs1), imm);
    871       }
    872       return enc_c_cb_alu_imm(m, c_reg3(d, rs1), imm);
    873 
    874     case RV64_FMT_CJ:
    875       imm = (i32)asm_driver_parse_const(d);
    876       return enc_c_cj(m, imm);
    877 
    878     case RV64_FMT_C_NONE:
    879       return m;
    880 
    881     default:
    882       asm_driver_panic(d, "rv64 asm: unsupported format");
    883   }
    884 }
    885 
    886 /* ============================================================
    887  * Multi-word pseudo-instruction expansion.
    888  *
    889  * call/tail/la/lla expand to a PC-relative AUIPC + (JALR | ADDI) pair;
    890  * `li` with a constant that does not fit a 12-bit signed immediate
    891  * expands to an LUI/ADDI(W)/SLLI chain (no relocations). Each 32-bit
    892  * word goes out through rv64_emit32 — the same path assemble_one's
    893  * single-word result uses — and relocations are attached via
    894  * mc->emit_reloc_at at the appropriate word offset. */
    895 
    896 /* 12-bit signed immediate range check for li short-circuit. */
    897 static bool rv_fits_i12(i64 v) { return v >= -2048 && v <= 2047; }
    898 
    899 /* Sign-extend the low 12 bits of v. */
    900 static i64 rv_sext12(i64 v) {
    901   return (i64)((((u64)v & 0xfffu) ^ 0x800u)) - 0x800;
    902 }
    903 
    904 /* Emit an AUIPC rd,0 + a R_RV_PCREL_HI20(sym) reloc, then create a local
    905  * `.LpcrelHi` anchor at the AUIPC offset and return that anchor symbol so
    906  * the paired low-half reloc can reference it. Mirrors native.c's
    907  * rv_emit_global_addr (the non-GOT branch). */
    908 static ObjSymId rv_emit_pcrel_hi(AsmDriver* d, u32 rd, ObjSymId sym,
    909                                  i64 addend) {
    910   MCEmitter* mc = asm_driver_mc(d);
    911   ObjBuilder* obj = asm_driver_ob(d);
    912   Compiler* c = asm_driver_compiler(d);
    913   u32 sec = mc->section_id;
    914   u32 ap = mc->pos(mc);
    915   rv64_emit32(mc, rv_auipc(rd, 0));
    916   mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, addend, 0, 0);
    917   Sym an = pool_intern_slice(c->global, SLICE_LIT(".LpcrelHi"));
    918   return obj_symbol(obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
    919 }
    920 
    921 /* call/tail: AUIPC <link>,0 + JALR <rd>,<link>,0 with one R_RV_CALL reloc
    922  * at the AUIPC. `link` is the register the AUIPC materializes into and the
    923  * JALR's base; `rd` is the JALR link-register (ra for call, zero for
    924  * tail). The linker patches both words from the single R_RV_CALL reloc. */
    925 static void rv_emit_call_pseudo(AsmDriver* d, u32 link, u32 rd) {
    926   MCEmitter* mc = asm_driver_mc(d);
    927   ObjSymId sym = OBJ_SYM_NONE;
    928   i64 off = 0;
    929   asm_driver_parse_sym_expr(d, &sym, &off);
    930   if (sym == OBJ_SYM_NONE)
    931     asm_driver_panic(d, "rv64 asm: call/tail target must be a symbol");
    932   u32 sec = mc->section_id;
    933   u32 ap = mc->pos(mc);
    934   rv64_emit32(mc, rv_auipc(link, 0));
    935   rv64_emit32(mc, rv_jalr(rd, link, 0));
    936   mc->emit_reloc_at(mc, sec, ap, R_RV_CALL, sym, off, 0, 0);
    937 }
    938 
    939 /* la/lla rd, sym: AUIPC rd,%pcrel_hi(sym) + ADDI rd,rd,%pcrel_lo(anchor).
    940  * kit's static Local-Exec model has no GOT, so `la` == `lla`. */
    941 static void rv_emit_la_pseudo(AsmDriver* d) {
    942   MCEmitter* mc = asm_driver_mc(d);
    943   u32 rd = parse_xreg(d);
    944   expect_comma(d);
    945   ObjSymId sym = OBJ_SYM_NONE;
    946   i64 off = 0;
    947   asm_driver_parse_sym_expr(d, &sym, &off);
    948   if (sym == OBJ_SYM_NONE)
    949     asm_driver_panic(d, "rv64 asm: la/lla target must be a symbol");
    950   ObjSymId anchor = rv_emit_pcrel_hi(d, rd, sym, off);
    951   u32 sec = mc->section_id;
    952   u32 lp = mc->pos(mc);
    953   rv64_emit32(mc, rv_addi(rd, rd, 0));
    954   mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
    955 }
    956 
    957 /* LUI immediate that sign-extends to a negative 32-bit value: bit 19 of
    958  * the 20-bit field is set, i.e. Hi20 >= 0x80000. */
    959 #define RV_LUI_HI20_SIGN 0x80000LL
    960 
    961 /* Materialize a constant into `rd` via the LLVM RISCVMatInt sequence: for
    962  * values fitting a signed 32-bit range, LUI + ADDI/ADDIW; otherwise a
    963  * recursive top-down hi20/lo12 split with SLLI shifts that absorb trailing
    964  * zeros. No relocations.
    965  *
    966  * On rv64, after an LUI the low-half add uses ADDIW only when the LUI value
    967  * is negative in 32-bit form (Hi20 >= RV_LUI_HI20_SIGN): there the add must
    968  * wrap in 32-bit arithmetic and re-sign-extend to land in range. When the
    969  * LUI value is non-negative in its low 32 bits, plain ADDI keeps the
    970  * 64-bit result correct (matching LLVM's generateInstSeqImpl).
    971  *
    972  * On rv32 there is no ADDIW and the GPR is 32 bits wide, so every constant
    973  * fits a LUI + ADDI pair (the add already wraps mod 2^32). The variant's
    974  * has_w_forms gates both the ADDIW use and the >32-bit recursion below. */
    975 static void rv_emit_li_value(MCEmitter* mc, const RiscvVariant* variant,
    976                              u32 rd, i64 val) {
    977   if (!variant->has_w_forms || (val >= -2147483648LL && val <= 2147483647LL)) {
    978     i64 hi20 = ((val + 0x800) >> 12) & 0xfffffLL;
    979     i64 lo12 = rv_sext12(val);
    980     if (hi20) rv64_emit32(mc, rv_lui(rd, (u32)hi20));
    981     if (lo12 || hi20 == 0) {
    982       u32 src = hi20 ? rd : (u32)RV_ZERO;
    983       if (variant->has_w_forms && hi20 >= RV_LUI_HI20_SIGN)
    984         rv64_emit32(mc, rv_addiw(rd, src, (i32)lo12));
    985       else
    986         rv64_emit32(mc, rv_addi(rd, src, (i32)lo12));
    987     }
    988     return;
    989   }
    990   /* >32-bit: split off the low 12 bits, recurse on the (shifted) high
    991    * part, then SLLI back and ADD the low bits. The subtraction is done in
    992    * unsigned space so it cannot signed-overflow at the int64 extremes
    993    * (e.g. val=INT64_MAX, lo12=-1); the result has its low 12 bits clear,
    994    * and the arithmetic right shift recovers the sign-extended high part. */
    995   i64 lo12 = rv_sext12(val);
    996   i64 hi = (i64)((u64)val - (u64)lo12) >> 12;
    997   u32 shift = 12;
    998   /* Absorb trailing zeros of the high part into the shift amount. */
    999   while ((hi & 1) == 0) {
   1000     hi >>= 1;
   1001     ++shift;
   1002   }
   1003   rv_emit_li_value(mc, variant, rd, hi);
   1004   rv64_emit32(mc, rv_slli(rd, rd, shift));
   1005   if (lo12) rv64_emit32(mc, rv_addi(rd, rd, (i32)lo12));
   1006 }
   1007 
   1008 /* Dispatch a multi-word pseudo. Returns true if it consumed the operands
   1009  * and emitted its expansion; false to fall through to the single-word
   1010  * path. `li` is handled here only when its immediate exceeds the 12-bit
   1011  * signed range the alias row encodes directly. */
   1012 static bool rv64_emit_pseudo(AsmDriver* d, const Rv64InsnDesc* desc) {
   1013   MCEmitter* mc = asm_driver_mc(d);
   1014   if (desc->fmt == RV64_FMT_PSEUDO) {
   1015     if (slice_eq_cstr(desc->mnemonic, "call")) {
   1016       rv_emit_call_pseudo(d, RV_RA, RV_RA);
   1017       return true;
   1018     }
   1019     if (slice_eq_cstr(desc->mnemonic, "tail")) {
   1020       /* Standard RISC-V `tail` materializes the address into t1 (x6). kit
   1021        * codegen uses t0 for its own tail-call temp, so a `cc -S`-fused
   1022        * `tail sym` re-assembles to t1 not t0 — execution-equivalent (both are
   1023        * caller-saved temps clobbered by the tail jump; cross-exec still
   1024        * matches), only the byte image differs on tail-call cases. Keeping the
   1025        * assembler's `tail` standard preserves clang/gas interop. */
   1026       rv_emit_call_pseudo(d, RV_T1, RV_ZERO);
   1027       return true;
   1028     }
   1029     /* la / lla — identical PC-relative expansion in kit. */
   1030     rv_emit_la_pseudo(d);
   1031     return true;
   1032   }
   1033   if ((desc->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(desc->mnemonic, "li")) {
   1034     /* Peek the immediate without consuming the destination register: the
   1035      * single-word alias path re-parses both. We commit to the multi-word
   1036      * path only for out-of-range constants, leaving the existing 12-bit
   1037      * fast path (and its golden behavior) untouched. */
   1038     u32 rd = parse_xreg(d);
   1039     expect_comma(d);
   1040     i64 imm = asm_driver_parse_const(d);
   1041     if (rv_fits_i12(imm)) {
   1042       rv64_emit32(mc, rv_addi(rd, RV_ZERO, (i32)imm));
   1043     } else {
   1044       rv_emit_li_value(mc, rv_asm_variant(d), rd, imm);
   1045     }
   1046     return true;
   1047   }
   1048   return false;
   1049 }
   1050 
   1051 static void rv64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) {
   1052   MCEmitter* mc = asm_driver_mc(d);
   1053   const Rv64InsnDesc* desc;
   1054   u8 av = rv_asm_variant(d)->xlen == 32u ? (u8)RV_AV_RV32 : (u8)RV_AV_RV64;
   1055   (void)base;
   1056   (void)asm_driver_cur_section(d);
   1057   desc = rv64_asm_find(pool_slice(asm_driver_pool(d), mnemonic), av);
   1058   if (!desc)
   1059     asm_driver_panic(d, av == (u8)RV_AV_RV32 ? "rv32 asm: unsupported instruction"
   1060                                              : "rv64 asm: unsupported instruction");
   1061   if (rv64_emit_pseudo(d, desc)) return;
   1062   if (desc->flags & RV64_ASMFL_C16)
   1063     rv64_emit16(mc, assemble_one(d, desc));
   1064   else
   1065     rv64_emit32(mc, assemble_one(d, desc));
   1066 }
   1067 
   1068 static void rv64_arch_asm_destroy(ArchAsm* base) { (void)base; }
   1069 
   1070 /* ---- textual-assembly operand syntax (printer <-> parser) ----------------
   1071  *
   1072  * Inverse of the `.s` parsers above (rv_parse_mod_reloc / rv_reloc_target and
   1073  * the call/la pseudo expanders): how a relocated rv64 operand is spelled in
   1074  * `cc -S` so the same text re-assembles under kit-as. RISC-V uses the same
   1075  * `%hi`/`%lo`/`%pcrel_hi`/`%pcrel_lo` operator syntax on every object format,
   1076  * so `fmt` is unused. See ArchAsmOps and src/api/asm_emit.c. */
   1077 static int rv64_reloc_operand(u16 kind, KitObjFmt fmt, ArchRelocOperand* out) {
   1078   (void)fmt;
   1079   out->prefix = "";
   1080   out->suffix = "";
   1081   out->addend_bias = 0;
   1082   out->emit_anchor = 0;
   1083   out->ref_anchor = 0;
   1084   switch (kind) {
   1085     case R_RV_PCREL_HI20:
   1086       out->surg = ARCH_RELOC_SURG_TAIL;
   1087       out->prefix = "%pcrel_hi(";
   1088       out->suffix = ")";
   1089       out->emit_anchor = 1; /* define a unique anchor label at this AUIPC */
   1090       return 1;
   1091     case R_RV_GOT_HI20:
   1092       out->surg = ARCH_RELOC_SURG_TAIL;
   1093       out->prefix = "%got_pcrel_hi(";
   1094       out->suffix = ")";
   1095       out->emit_anchor = 1;
   1096       return 1;
   1097     case R_RV_PCREL_LO12_I:
   1098     case R_RV_PCREL_LO12_S:
   1099       out->surg = ARCH_RELOC_SURG_RV_LO12;
   1100       out->prefix = "%pcrel_lo(";
   1101       out->suffix = ")";
   1102       out->ref_anchor = 1; /* references the preceding AUIPC's anchor label */
   1103       return 1;
   1104     case R_RV_HI20:
   1105       out->surg = ARCH_RELOC_SURG_TAIL;
   1106       out->prefix = "%hi(";
   1107       out->suffix = ")";
   1108       return 1;
   1109     case R_RV_LO12_I:
   1110     case R_RV_LO12_S:
   1111       out->surg = ARCH_RELOC_SURG_RV_LO12;
   1112       out->prefix = "%lo(";
   1113       out->suffix = ")";
   1114       return 1;
   1115     case R_RV_BRANCH:
   1116     case R_RV_JAL:
   1117       out->surg = ARCH_RELOC_SURG_TAIL;
   1118       return 1;
   1119     default:
   1120       return 0; /* R_ABS*, R_RV_RVC_*, R_RV_RELAX, TLS, ... → keep numeric */
   1121   }
   1122 }
   1123 
   1124 /* Intra-section local branches whose target codegen resolved in place (no
   1125  * relocation): the disassembler renders the target numerically, so cc -S
   1126  * synthesizes a label there. `j`/`jal x0` are JAL aliases; the conditional
   1127  * branches are B-type. `call`/`tail` are excluded — they carry R_RV_CALL. */
   1128 static int rv64_is_local_branch(KitSlice m) {
   1129   if (m.len == 1 && m.s[0] == 'j') return 1;
   1130   if (m.len == 3 && memcmp(m.s, "jal", 3) == 0) return 1;
   1131   if (m.len == 3 && memcmp(m.s, "beq", 3) == 0) return 1;
   1132   if (m.len == 3 && memcmp(m.s, "bne", 3) == 0) return 1;
   1133   if (m.len == 3 && memcmp(m.s, "blt", 3) == 0) return 1;
   1134   if (m.len == 3 && memcmp(m.s, "bge", 3) == 0) return 1;
   1135   if (m.len == 4 && memcmp(m.s, "bltu", 4) == 0) return 1;
   1136   if (m.len == 4 && memcmp(m.s, "bgeu", 4) == 0) return 1;
   1137   if (m.len == 4 && memcmp(m.s, "beqz", 4) == 0) return 1;
   1138   if (m.len == 4 && memcmp(m.s, "bnez", 4) == 0) return 1;
   1139   if (m.len == 4 && memcmp(m.s, "blez", 4) == 0) return 1;
   1140   if (m.len == 4 && memcmp(m.s, "bgez", 4) == 0) return 1;
   1141   if (m.len == 4 && memcmp(m.s, "bltz", 4) == 0) return 1;
   1142   if (m.len == 4 && memcmp(m.s, "bgtz", 4) == 0) return 1;
   1143   if (m.len == 6 && memcmp(m.s, "c.beqz", 6) == 0) return 1;
   1144   if (m.len == 6 && memcmp(m.s, "c.bnez", 6) == 0) return 1;
   1145   if (m.len == 3 && memcmp(m.s, "c.j", 3) == 0) return 1;
   1146   return 0;
   1147 }
   1148 
   1149 /* R_RV_CALL fuses an AUIPC+JALR pair into a single `call`/`tail sym` pseudo
   1150  * (the canonical `.s` spelling the assembler re-expands to the same pair +
   1151  * reloc). The reloc sits on the AUIPC; the JALR partner carries no reloc. A
   1152  * tail call links into x0 (the JALR's rd is `zero`); a regular call links into
   1153  * ra. We read that from the partner JALR's disassembled text. */
   1154 static int rv64_reloc_call_pair(u16 kind, KitSlice pair_mnemonic,
   1155                                 KitSlice pair_ops, const char** mnemonic_out) {
   1156   if (kind != R_RV_CALL) return 0;
   1157   /* The partner JALR links into ra (regular call) or x0 (tail). The
   1158    * disassembler renders the x0-link, zero-immediate form as the `jr rs`
   1159    * alias, and the ra form as `jalr ra, 0(ra)`. So a `jr` partner is always a
   1160    * tail; a `jalr` partner is a tail iff its link register is `zero`. */
   1161   if (pair_mnemonic.len == 2 && memcmp(pair_mnemonic.s, "jr", 2) == 0) {
   1162     *mnemonic_out = "tail";
   1163     return 1;
   1164   }
   1165   if (pair_mnemonic.len == 4 && memcmp(pair_mnemonic.s, "jalr", 4) == 0) {
   1166     if (pair_ops.len >= 4 && memcmp(pair_ops.s, "zero", 4) == 0)
   1167       *mnemonic_out = "tail";
   1168     else
   1169       *mnemonic_out = "call";
   1170     return 1;
   1171   }
   1172   return 0;
   1173 }
   1174 
   1175 const ArchAsmOps rv64_asm_ops = {
   1176     .reloc_operand = rv64_reloc_operand,
   1177     .is_local_branch = rv64_is_local_branch,
   1178     .reloc_call_pair = rv64_reloc_call_pair,
   1179 };
   1180 
   1181 ArchAsm* rv64_arch_asm_new(Compiler* c) {
   1182   Rv64Asm* a = arena_new(c->tu, Rv64Asm);
   1183   memset(a, 0, sizeof *a);
   1184   a->base.insn = rv64_arch_asm_insn;
   1185   a->base.destroy = rv64_arch_asm_destroy;
   1186   a->c = c;
   1187   return &a->base;
   1188 }
   1189 
   1190 /* ============================================================
   1191  * Inline-asm template walker (parallel to aa64 asm.c §"inline-asm
   1192  * template walker"). The walker substitutes %N / %[name] / %% / %a%w%x
   1193  * placeholders into a per-line StrBuf, then re-lexes each line through
   1194  * rv64_arch_asm_insn for assembly. Statement separators recognised are
   1195  * '\n' and ';' (outside parens / quoted strings).
   1196  * ============================================================ */
   1197 
   1198 Rv64Asm* rv64_asm_open(Compiler* c) {
   1199   Rv64Asm* a = arena_new(c->tu, Rv64Asm);
   1200   memset(a, 0, sizeof *a);
   1201   a->base.insn = rv64_arch_asm_insn;
   1202   a->base.destroy = rv64_arch_asm_destroy;
   1203   a->c = c;
   1204   return a;
   1205 }
   1206 
   1207 void rv64_asm_close(Rv64Asm* a) { (void)a; }
   1208 
   1209 void rv64_inline_bind(Rv64Asm* a, const AsmConstraint* outs, u32 nout,
   1210                       Operand* out_ops, const AsmConstraint* ins, u32 nin,
   1211                       const Operand* in_ops, const Sym* clobbers, u32 nclob) {
   1212   a->outs = outs;
   1213   a->out_ops = out_ops;
   1214   a->ins = ins;
   1215   a->in_ops = in_ops;
   1216   a->clobbers = clobbers;
   1217   a->nout = nout;
   1218   a->nin = nin;
   1219   a->nclob = nclob;
   1220 }
   1221 
   1222 /* Per-line rendered buffer cap. Inline asm rarely emits more than a
   1223  * handful of insns per block; one substituted line fits comfortably.
   1224  * Truncation panics — the operator grammar should never grow a single
   1225  * line beyond this without a deliberate reason. */
   1226 #define RV64_INLINE_LINE_CAP 1024
   1227 
   1228 _Noreturn static void inline_panic(Rv64Asm* a, const char* msg) {
   1229   SrcLoc loc = {0, 0, 0};
   1230   compiler_panic(a->c, loc, "rv64 inline asm: %.*s",
   1231                  SLICE_ARG(slice_from_cstr(msg)));
   1232 }
   1233 
   1234 /* Render a 5-bit integer register number using its canonical psABI name. */
   1235 static void render_xreg(StrBuf* sb, u32 reg) {
   1236   const char* nm = rv64_register_name(reg & 0x1fu);
   1237   if (!nm) {
   1238     strbuf_putc(sb, 'x');
   1239     if ((reg & 0x1fu) >= 10u)
   1240       strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) / 10u)));
   1241     strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) % 10u)));
   1242     return;
   1243   }
   1244   strbuf_puts(sb, nm);
   1245 }
   1246 
   1247 /* Render an FP register by its canonical psABI name (e.g., fa0). */
   1248 static void render_freg(StrBuf* sb, u32 reg) {
   1249   const char* nm = rv64_register_name(32u + (reg & 0x1fu));
   1250   if (!nm) {
   1251     strbuf_putc(sb, 'f');
   1252     if ((reg & 0x1fu) >= 10u)
   1253       strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) / 10u)));
   1254     strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) % 10u)));
   1255     return;
   1256   }
   1257   strbuf_puts(sb, nm);
   1258 }
   1259 
   1260 /* Render a signed 64-bit integer. Inline asm immediates appear bare in
   1261  * RISC-V (no '#' prefix), matching the standalone .s parser. */
   1262 static void render_imm(StrBuf* sb, i64 v) { strbuf_put_i64(sb, v); }
   1263 
   1264 /* Render addressing form `disp(base)`. */
   1265 static void render_indirect(Rv64Asm* a, StrBuf* sb, Reg base, i32 ofs) {
   1266   (void)a;
   1267   if (ofs != 0)
   1268     strbuf_put_i64(sb, (i64)ofs);
   1269   else
   1270     strbuf_putc(sb, '0');
   1271   strbuf_putc(sb, '(');
   1272   render_xreg(sb, (u32)base);
   1273   strbuf_putc(sb, ')');
   1274 }
   1275 
   1276 /* Resolve operand index → render into sb. form:
   1277  *   0 = default (per-kind),
   1278  *   1 = %wN (width hint; on rv64 same as default xreg form),
   1279  *   2 = %xN (force 64-bit reg form — identical to default for rv64),
   1280  *   3 = %aN (memory addressing form).
   1281  *   4 = %zN (RISC-V GCC: emits "zero" if operand is imm 0, else reg). */
   1282 static void render_operand(Rv64Asm* a, StrBuf* sb, u32 idx, int form) {
   1283   u32 ntot = a->nout + a->nin;
   1284   if (idx >= ntot) inline_panic(a, "operand index out of range");
   1285   const Operand* op =
   1286       (idx < a->nout) ? &a->out_ops[idx] : &a->in_ops[idx - a->nout];
   1287   switch (form) {
   1288     case 1: /* %wN — accept any reg/imm; rv64 has no narrower spelling. */
   1289     case 2: /* %xN — same. */
   1290       if (op->kind == RV64_INLINE_OPK_REG) {
   1291         if (op->pad[0] == RV64_INLINE_OPCLS_FP)
   1292           render_freg(sb, (u32)op->v.local);
   1293         else
   1294           render_xreg(sb, (u32)op->v.local);
   1295         return;
   1296       }
   1297       if (op->kind == OPK_IMM) {
   1298         render_imm(sb, op->v.imm);
   1299         return;
   1300       }
   1301       inline_panic(a, "%w/%x on unsupported operand kind");
   1302     case 3: /* %aN — memory addressing form */
   1303       if (op->kind != OPK_INDIRECT) inline_panic(a, "%a on non-memory operand");
   1304       if (op->v.ind.index != CG_LOCAL_NONE)
   1305         inline_panic(a,
   1306                      "%a on indexed memory operand: rv64 inline asm "
   1307                      "requires base+disp only");
   1308       render_indirect(a, sb, (Reg)op->v.ind.base, op->v.ind.ofs);
   1309       return;
   1310     case 4: /* %zN — zero-or-reg */
   1311       if (op->kind == OPK_IMM && op->v.imm == 0) {
   1312         strbuf_puts(sb, "zero");
   1313         return;
   1314       }
   1315       if (op->kind == RV64_INLINE_OPK_REG) {
   1316         if (op->pad[0] == RV64_INLINE_OPCLS_FP)
   1317           render_freg(sb, (u32)op->v.local);
   1318         else
   1319           render_xreg(sb, (u32)op->v.local);
   1320         return;
   1321       }
   1322       inline_panic(a, "%z on unsupported operand kind");
   1323     default:
   1324       break;
   1325   }
   1326   switch (op->kind) {
   1327     case RV64_INLINE_OPK_REG:
   1328       if (op->pad[0] == RV64_INLINE_OPCLS_FP)
   1329         render_freg(sb, (u32)op->v.local);
   1330       else
   1331         render_xreg(sb, (u32)op->v.local);
   1332       return;
   1333     case OPK_IMM:
   1334       render_imm(sb, op->v.imm);
   1335       return;
   1336     case OPK_INDIRECT:
   1337       if (op->v.ind.index != CG_LOCAL_NONE)
   1338         inline_panic(a,
   1339                      "indexed memory operand in inline asm: rv64 requires "
   1340                      "base+disp only");
   1341       render_indirect(a, sb, (Reg)op->v.ind.base, op->v.ind.ofs);
   1342       return;
   1343     default:
   1344       inline_panic(a, "unsupported operand kind for %N");
   1345   }
   1346 }
   1347 
   1348 /* Resolve a `%[name]` operand by looking up `needle` against the
   1349  * constraint.name fields on the combined outs+ins list. Returns the
   1350  * combined index, or (u32)-1 on miss. */
   1351 static u32 lookup_named(Rv64Asm* a, Sym needle) {
   1352   for (u32 k = 0; k < a->nout; ++k) {
   1353     if (a->outs[k].name == needle) return k;
   1354   }
   1355   for (u32 k = 0; k < a->nin; ++k) {
   1356     if (a->ins[k].name == needle) return a->nout + k;
   1357   }
   1358   return (u32)-1;
   1359 }
   1360 
   1361 /* Lex one line of substituted asm and dispatch via rv64_arch_asm_insn. */
   1362 static void run_one_line(Rv64Asm* a, MCEmitter* mc, const char* text,
   1363                          size_t len) {
   1364   /* Skip blank lines. */
   1365   size_t i;
   1366   for (i = 0; i < len; ++i) {
   1367     if (text[i] != ' ' && text[i] != '\t') break;
   1368   }
   1369   if (i == len) return;
   1370 
   1371   AsmLexer* lx = asm_lex_open_mem(a->c, "<inline-asm>", text, len);
   1372   AsmDriver* d = asm_driver_open_inline(a->c, mc, lx);
   1373 
   1374   /* The first non-trivial token must be the mnemonic identifier. */
   1375   AsmTok t = asm_driver_peek(d);
   1376   while (t.kind == ASM_TOK_NEWLINE) {
   1377     (void)asm_driver_next(d);
   1378     t = asm_driver_peek(d);
   1379   }
   1380   if (t.kind == ASM_TOK_EOF) {
   1381     asm_driver_close_inline(d);
   1382     asm_lex_close(lx);
   1383     return;
   1384   }
   1385   if (t.kind != ASM_TOK_IDENT)
   1386     inline_panic(a, "expected mnemonic at start of inline asm line");
   1387   (void)asm_driver_next(d);
   1388   Sym mn = t.v.ident;
   1389   /* Compose `fcvt.s.w` etc. — rv64 has dotted mnemonics; the standalone
   1390    * lexer already strings them together as a single IDENT in most paths.
   1391    * Mirror the aa64 composite handling for safety. */
   1392   AsmTok dot = asm_driver_peek(d);
   1393   while (asm_driver_tok_is_punct(dot, '.')) {
   1394     (void)asm_driver_next(d);
   1395     AsmTok rest = asm_driver_next(d);
   1396     if (rest.kind != ASM_TOK_IDENT)
   1397       inline_panic(a, "composite mnemonic: expected ident after '.'");
   1398     Slice hsl = pool_slice(asm_driver_pool(d), mn);
   1399     Slice rsl = pool_slice(asm_driver_pool(d), rest.v.ident);
   1400     size_t hn = hsl.len, rn = rsl.len;
   1401     char buf[64];
   1402     if (hn + 1 + rn >= sizeof buf)
   1403       inline_panic(a, "composite mnemonic too long");
   1404     for (size_t k = 0; k < hn; ++k) buf[k] = hsl.s[k];
   1405     buf[hn] = '.';
   1406     for (size_t k = 0; k < rn; ++k) buf[hn + 1 + k] = rsl.s[k];
   1407     mn = pool_intern_slice(asm_driver_pool(d),
   1408                            (Slice){.s = buf, .len = hn + 1 + rn});
   1409     dot = asm_driver_peek(d);
   1410   }
   1411   rv64_arch_asm_insn(&a->base, d, mn);
   1412   asm_driver_close_inline(d);
   1413   asm_lex_close(lx);
   1414 }
   1415 
   1416 /* Substitute placeholders into one line's StrBuf, then dispatch. */
   1417 static void render_and_run_line(Rv64Asm* a, MCEmitter* mc, StrBuf* sb,
   1418                                 const char* start, const char* end) {
   1419   strbuf_reset(sb);
   1420   for (const char* p = start; p < end; ++p) {
   1421     char c = *p;
   1422     if (c != '%') {
   1423       strbuf_putc(sb, c);
   1424       continue;
   1425     }
   1426     /* Placeholder. */
   1427     if (p + 1 >= end) inline_panic(a, "trailing '%' in template");
   1428     char n = *(p + 1);
   1429     if (n == '%') {
   1430       strbuf_putc(sb, '%');
   1431       ++p;
   1432       continue;
   1433     }
   1434     if (n == '[') {
   1435       const char* nbeg = p + 2;
   1436       const char* nend = nbeg;
   1437       while (nend < end && *nend != ']') ++nend;
   1438       if (nend == end) inline_panic(a, "unterminated %[name]");
   1439       size_t nlen = (size_t)(nend - nbeg);
   1440       Sym needle =
   1441           pool_intern_slice(a->c->global, (Slice){.s = nbeg, .len = nlen});
   1442       u32 idx = lookup_named(a, needle);
   1443       if (idx == (u32)-1)
   1444         inline_panic(a, "%[name] does not match any constraint");
   1445       p = nend; /* loop's ++p steps past the ']' */
   1446       render_operand(a, sb, idx, 0);
   1447       continue;
   1448     }
   1449     int form = 0; /* 0=default, 1=w, 2=x, 3=a, 4=z */
   1450     if (n == 'w' || n == 'x' || n == 'a' || n == 'z') {
   1451       form = (n == 'w') ? 1 : (n == 'x') ? 2 : (n == 'a') ? 3 : 4;
   1452       ++p;
   1453       if (p + 1 >= end) inline_panic(a, "trailing '%' modifier in template");
   1454       n = *(p + 1);
   1455     }
   1456     if (n == '[') {
   1457       const char* nbeg = p + 2;
   1458       const char* nend = nbeg;
   1459       while (nend < end && *nend != ']') ++nend;
   1460       if (nend == end) inline_panic(a, "unterminated %[name]");
   1461       size_t nlen = (size_t)(nend - nbeg);
   1462       Sym needle =
   1463           pool_intern_slice(a->c->global, (Slice){.s = nbeg, .len = nlen});
   1464       u32 idx = lookup_named(a, needle);
   1465       if (idx == (u32)-1)
   1466         inline_panic(a, "%[name] does not match any constraint");
   1467       p = nend;
   1468       render_operand(a, sb, idx, form);
   1469       continue;
   1470     }
   1471     if (n < '0' || n > '9') inline_panic(a, "expected digit after '%'");
   1472     u32 idx = (u32)(n - '0');
   1473     ++p;
   1474     /* GCC syntax permits up to two digits (%0..%99). */
   1475     if (p + 1 < end && *(p + 1) >= '0' && *(p + 1) <= '9') {
   1476       idx = idx * 10 + (u32)(*(p + 1) - '0');
   1477       ++p;
   1478     }
   1479     render_operand(a, sb, idx, form);
   1480   }
   1481   if (sb->truncated) inline_panic(a, "inline asm line buffer overflow");
   1482   run_one_line(a, mc, strbuf_cstr(sb), strbuf_len(sb));
   1483 }
   1484 
   1485 void rv64_asm_run_template(Rv64Asm* a, MCEmitter* mc, const char* tmpl) {
   1486   if (!tmpl || !*tmpl) return;
   1487 
   1488   char buf[RV64_INLINE_LINE_CAP];
   1489   StrBuf sb;
   1490   strbuf_init(&sb, buf, sizeof buf);
   1491 
   1492   /* Walk tmpl, splitting on '\n' and ';'. Track paren depth and quote
   1493    * state so that a literal ';' inside `( ... )` (memory operand) or a
   1494    * quoted string is not mistaken for a statement separator. RISC-V uses
   1495    * `disp(base)` for memory, hence we track parens. */
   1496   const char* line_start = tmpl;
   1497   int paren = 0;
   1498   char quote = 0;
   1499   for (const char* p = tmpl;; ++p) {
   1500     char c = *p;
   1501     if (c == '\0') {
   1502       render_and_run_line(a, mc, &sb, line_start, p);
   1503       break;
   1504     }
   1505     if (quote) {
   1506       if (c == '\\' && *(p + 1)) {
   1507         ++p;
   1508         continue;
   1509       }
   1510       if (c == quote) quote = 0;
   1511       continue;
   1512     }
   1513     if (c == '"' || c == '\'') {
   1514       quote = c;
   1515       continue;
   1516     }
   1517     if (c == '(') {
   1518       ++paren;
   1519       continue;
   1520     }
   1521     if (c == ')') {
   1522       if (paren) --paren;
   1523       continue;
   1524     }
   1525     if (paren == 0 && (c == '\n' || c == ';')) {
   1526       render_and_run_line(a, mc, &sb, line_start, p);
   1527       line_start = p + 1;
   1528     }
   1529   }
   1530 }