commit bf5b73bcb4b8e9cffd12aa173a5a5759cbe71381
parent 6ea2c9a48aa1a8670eb9a9005cd63d8b7c002a28
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 3 Jun 2026 17:02:26 -0700
rv32: bring up and complete riscv32-none-elf cross target
Refactor src/arch/rv64 -> src/arch/riscv as one XLEN-parameterized RISC-V
backend (RiscvVariant descriptor) and wire riscv32-none-elf end to end. RV64 /
x64 / aa64 are fully non-regressed.
Backend bring-up:
- WS0 RiscvVariant (src/arch/riscv/variant.{h,c}) + KIT_ARCH_RV32_ENABLED.
- WS1 rv64->riscv rename; variant threaded through native.c (rv64 byte-identical).
- WS2 ISA availability-mask (RV_AV_RV32/RV64) + asm/disasm/link/dbg XLEN param;
link_arch_rv32 (lw stubs), rv32_dbg_ops (min=2/max=4, RVC->step-over).
- WS3 arch_impl_rv32 + registry; -march via variant->isa_prefix; rv32 default
rv32imafc_zicsr_zifencei (D cleared); ilp32f predefined macros.
- WS4a KitFloatAbi + KitTargetSpec.float_abi + KitTargetOptions.abi; -mabi
parse/resolve/validate (ilp32d-without-D and width-mismatch rejected);
medlow/medany mcmodel aliases.
- WS4b shared RISC-V ABI classifier (descriptor {gpr_bytes,aggregate,flen,
float_abi}) + rv32_vtable + registry.
- WS5 ELFCLASS32 emit/read/link + reloc_riscv32.c + EI_CLASS machine
disambiguation (obj_elf_machine_class); width-aware jump-table label
relocs (R_ABS32 on rv32); layout_dyn clean-panics on an ELF32 dynamic/PIE
link (was an ELF64 SEGV).
- WS7 mk/rt.mk variants fixed (riscv32-elf=ilp32/rv32imac, +riscv32-elf-hardfloat
=ilp32f/rv32imafc); mk/lib_srcs.mk ABI guard widened. RT_CFLAGS/RT_ASFLAGS
now include ARCH_FLAGS (the -mabi/-march were silently dropped). ELF
e_flags float-ABI derived from target.float_abi (was hardcoded SINGLE,
mislabelling ilp32 soft).
WS6 64-bit-value legalization (the blocker): rv32 8-byte scalars (long long/i64
AND soft double) are now memory-resident and legalized to 2-word lane ops --
inline add/sub/and/or/xor/neg/bnot/compare (carry/borrow via sltu; no compiler-rt
64-bit add exists), __*di3 for mul/div/rem/shift, __*df*/__*sf* for soft
double/single, i64<->float via __floatdisf/__fixsfdi, __*di2 for
clz/ctz/popcount/bswap, 2-lane 64-bit consts. The allocator binds one register
per value, so memory residence + the multi-part ABI path is the only correct
representation. nd_* guards panic on any 8-byte value reaching a single-register
op. Verified under qemu-system-riscv32 at -O0/-O1 for both ilp32f and ilp32.
Freestanding link policy is target-derived, host-irrelevant: kit stamps
EI_OSABI=ELFOSABI_STANDALONE on *-none-elf objects (fixes "none decodes as
Linux"), kit ld derives the PIC default via driver_default_pic (hosted PIE /
freestanding no-PIE) scanning all inputs, and auto-links a runtime only when one
exists (driver_runtime_has_variant) -- so a freestanding rv32 link needs no
-no-pie / -nostdlib. .eh_frame suppressed for KIT_OS_FREESTANDING; new -Ttext /
-nostdlib flags.
Tests: rv32_decode_test (test-isa, 31 checks), rv32_jit_test (test-rv32-jit),
rv32_class32 ELFCLASS32 round-trip (test-elf), test/smoke/rv32.sh
(test-smoke-rv32, 6 lanes incl. a kit-ld end-to-end lane), and rv32 wired as a
cross arch in the Toy (test/toy/run.sh path X, 240/15) and C (test/parse/run.sh
path E, 439/36) corpus lanes via the shared test/lib/exec_rv32_bare.sh. Reds are
left red on purpose -- real remaining gaps (i128, i64 atomics/overflow/varargs,
TLS) are tracked in doc/plan/RV32.md. RV64/x64/aa64 fully non-regressed.
Diffstat:
93 files changed, 14450 insertions(+), 10399 deletions(-)
diff --git a/doc/plan/RV32.md b/doc/plan/RV32.md
@@ -1,5 +1,132 @@
# Plan: RISC-V 32-bit (`riscv32-none-elf`) support
+## Status — 2026-06-03 (branch `rv32`) — core complete; cross-test gaps tracked
+
+`riscv32-none-elf` (`rv32imafc_zicsr_zifencei`, both `ilp32f` and `ilp32`) is a
+working cross target. WS6 — the flagged "hardest part", 64-bit-value legalization
+— is **done and behaviorally verified under `qemu-system-riscv32`** at -O0 and -O1
+for both ABIs. The full kit toolchain (`kit cc → kit ld → qemu-system`) builds and
+runs a correct bare-metal rv32 image with **no special flags** (freestanding
+defaults to no-PIE and links no auto-runtime). **RV64 / x64 / aa64 fully
+non-regressed**: asm goldens byte-identical, isa (rv64 21 + rv32 31)/0,
+abi-classify 367/0, elf 41/0, link 122/0 + x64 79/0, cg-api 544/0, smoke-rv64 3/0,
+dwarf/driver/interp green.
+
+Both corpora now run on qemu-system-riscv32 as a cross arch: **Toy `240 pass / 15
+red`** (`test/toy/run.sh`, path X) and **C `439 pass / 36 red`** (`test/parse/run.sh`,
+path E). **The reds are deliberately left red** (no skip sidecars) — they are the
+real remaining rv32 gaps, enumerated in the checklist below.
+
+### Done & verified ✅
+- [x] **WS0–WS5, WS7** — variant scaffold, XLEN-parameterized backend, `arch_impl_rv32`
+ + `-march`/`-mabi`/macros, shared ABI classifier + `rv32_vtable`, ELFCLASS32
+ emit/read/link + `reloc_riscv32.c`, `mk/rt.mk` variants. (See git history.)
+- [x] **WS6 — 64-bit-value pair-legalization (THE blocker) — DONE.** rv32 8-byte scalars
+ (`long long`/i64 AND soft `double`) are **memory-resident** (`api_is_wide8_scalar_type`
+ forces `CG_LOCAL_MEMORY_REQUIRED`; `cg_ir_lower`/`pass_native_emit` size>word checks made
+ `> ptr_size`), mirroring the proven i128/wide16 model. The allocator binds one register per
+ value, so memory residence + the multi-part ABI path (`ABIArgPart.src_offset`,
+ `rv_load_part`/`rv_store_part`) is the only correct representation. (`src/cg/arith.c`,
+ `src/cg/wide.c`):
+ - add/sub/and/or/xor/neg/bnot — **inline 2-word lane ops** (carry/borrow via `sltu`); no
+ compiler-rt 64-bit add helper exists, so these *must* be inline.
+ - i64 compares — inline lane eq/lt (signed-hi/unsigned-lo); `if(i64)` = `(lo|hi)!=0`.
+ - i64 mul/div/rem/shift → `__*di3`; soft `double` → `__*df*`; i64↔float → `__floatdisf`/
+ `__fixsfdi`/…; **soft single** f32 under `ilp32` → `__*sf*`; i64 clz/ctz/popcount/bswap →
+ `__*di2`; 64-bit consts → two lanes.
+ - `nd_*` guards (`native_direct_target.c`) **panic** on any 8-byte value reaching a
+ single-register binop/unop/cmp/convert/load_imm/load_const — loud, never truncation.
+- [x] **Runtime (`make rt`) — DONE.** Both `riscv32-elf` (ilp32) and `riscv32-elf-hardfloat`
+ (ilp32f) build with kit's own cc. Fixed `mk/rt.mk`: `RT_CFLAGS`/`RT_ASFLAGS` now include
+ `RT_<v>_ARCH_FLAGS` (the `-mabi`/`-march` were silently dropped — every variant built ilp32f).
+- [x] **ELF e_flags float-ABI** — `emit.c`/`link.c` derive the RISC-V float-ABI bits from
+ `target.float_abi` (the static descriptor hardcoded SINGLE, mislabelling `ilp32` soft);
+ rv64/x64/aa64 byte-identical.
+- [x] **Freestanding policy (host-irrelevant, target-derived):**
+ - kit stamps **`EI_OSABI=ELFOSABI_STANDALONE`** on `*-none-elf` objects (`emit.c`) so they
+ round-trip as `KIT_OS_FREESTANDING` instead of decoding back to Linux (the "none → Linux"
+ bug). `kit ld` derives the PIC default from the *target* via `driver_default_pic` (hosted
+ → PIE, freestanding → no-PIE) and scans all inputs for a freestanding object — the host's
+ default never leaks onto a cross target. So `kit ld` for rv32 needs **no `-no-pie`**.
+ - `kit ld` auto-links a runtime only for targets that have one (`driver_runtime_has_variant`);
+ a freestanding target supplies its own, so **no `-nostdlib`** is needed. New `-Ttext ADDR`
+ and `-nostdlib`/`--no-default-libs` flags remain available.
+ - `.eh_frame` suppressed for `KIT_OS_FREESTANDING` (`src/arch/mc.c`); hosted byte-identical.
+ - `layout_dyn` emits a clean diagnostic for an ELF32 dynamic/PIE link (was an ELF64 SEGV).
+ - jump-table / label-address slots are width-aware (`R_ABS32` on rv32, `R_ABS64` on 64-bit)
+ in `nd_local_static_data_label_addr` — fixes switch jump tables on rv32.
+- [x] **WS9 tests + CI wiring:** `test/arch/rv32_decode_test.c` (→ `test-isa`, 31 checks),
+ `test/link/rv32_jit_test.c` (→ `test-rv32-jit`, exit-77 host gate),
+ `test/elf/unit/rv32_class32.c` (ELFCLASS32 round-trip, → `test-elf`),
+ `test/smoke/rv32.sh` (→ `test-smoke-rv32`): 6 lanes — ilp32f + ilp32 × {-O0,-O1} covering i64
+ + soft-double + soft-single, a `kit ld` end-to-end lane, a negative control. Wired in
+ `mk/test.mk`/`mk/test_unit.mk` (`test-rv32-jit`, `test-smoke-rv32`).
+- [x] **Toy + C cross lanes (rv32 as an arch).** Shared bare-metal runner
+ `test/lib/exec_rv32_bare.sh` (clang startup → `kit cc`/parse-runner → `kit ld` → qemu-system,
+ SiFive-finisher exit oracle; entry symbol configurable — `main` for Toy, `test_main` for C).
+ Toy: `test/toy/run.sh` `cross_one_rv32` (rv32 in default `TOY_CROSS_ARCHS`, path X) — **240/15**.
+ C: `test/parse/run.sh` `kit_lane_E` rv32 branch + `kit_test_target.h` rv32 arm (path E,
+ `KIT_TEST_ARCH=rv32`) — **439/36**. Both opt-in; reds left red.
+
+### Remaining ⚠️ — clear checklist
+
+**A. rv32 codegen gaps surfaced by the cross lanes (the reds — left red on purpose, no skips).**
+Toy `240/15`, C `439/36`; the 51 reds cluster into:
+- [ ] **`__int128`** (C: `i128_02`…`i128_13+`, ~15 cases — the largest C bucket). rv32 has no
+ `__int128` (runtime `INT128=0`; the 16-byte-scalar path is dead on rv32). Decide: reject
+ `__int128` on rv32 at the front end with a clear diagnostic (cleanest), or legalize it (a
+ 4-word version of the wide8 work — large). Until then these are compile-fail/wrong-result.
+- [ ] **i64 atomics** (`@atomic_*<i64>` / `__atomic_*_8`; Toy 17/22/59/73/74/75/77, C
+ `builtin_*_atomic_long`). rv32 `A` has no 64-bit AMO/`lr.d`/`sc.d`; needs `__atomic_*_8`
+ libcalls (libatomic / a lock), absent freestanding. Provide 8-byte `__atomic_*` in `rt/`, or
+ document as a hard rv32 limitation.
+- [ ] **64-bit `*_overflow` intrinsics** (Toy 58_overflow_record, C `builtin_26_sadd_overflow`).
+ Legalize i64 sadd/uadd/ssub/usub/smul/umul-overflow on rv32 (the 64-bit operand reaches the
+ backend un-split today → trap), à la the clz/ctz wide8 routing in `arith.c`. 32-bit works.
+- [ ] **i64 varargs** (Toy 133_varargs_mixed_types — wrong result, not a hang). Audit the rv32
+ `va_arg` path for an 8-byte value (even-pair fetch from the vararg save area).
+- [ ] **thread-local storage** (Toy 141, C `6_7_1_03_thread_local_basic`, `gnu_thread_storage_01`).
+ TLS needs a thread pointer the bare-metal image never sets up — likely a genuine freestanding
+ limitation (the Linux lanes get it from the OS); document, or provide a static-TLS model.
+- [ ] **toy soft-float compare lowering** (Toy 153_fp_cmp_negation_b — `kit cc` "addr operand is
+ not an lvalue", rv32-only, not reproducible in C). An eager soft-fp compare feeding an
+ empty-then/else block hits an lvalue path the rv64 delayed-`SV_CMP` form avoids. Narrow.
+- [ ] **123_spec_demo** (Toy, hangs) — triage which of the above it exercises.
+- Test-environment mismatches (NOT rv32 codegen bugs; an `.rv32.skip` sidecar exists for them but
+ none is committed): Toy 145_baremetal_privileged_aa64 (aa64 intrinsics), 20_cg_api_inline_asm_full
+ + C `asm_01_grammar` (inline-asm constraints/grammar), 47_target_arch_switch (selects its expected
+ exit code by target arch).
+
+**B. Pre-existing follow-ups (orthogonal to the cross tests).**
+- [ ] Optional `make` targets `test-toy-rv32` / `test-parse-rv32` (opt-in; not in
+ `DEFAULT_TEST_TARGETS` while reds exist).
+- [ ] **`test/asm/` rv32 byte-golden lane + `regen-rv32.sh`** (rv32 arm in `test/asm/run.sh` /
+ `kit_unit.h` + committed clang/llvm-objdump goldens; `kit_test_target.h` already has rv32).
+- [ ] **CSR pseudo-ops in the assembler** (`csrs`/`csrw`/`csrr`/… + CSR names) — a general
+ RISC-V-assembler feature (missing on rv64 too; new `RV64_FMT_CSR_{R,W,WI}` + CSR-name table +
+ disasm print cases). Until then the smoke/cross startup stub is clang-assembled.
+
+**Out of scope (decided):** `kit ld` ELF32 dynamic/PIE — rv32 is static-only; `layout_dyn`
+clean-panics on an ELF32 dynamic/PIE link and that is the intended behavior.
+
+### Where to look
+- WS6 legalization: `src/cg/wide.c`, `src/cg/arith.c` (binop/unop/cmp/convert + soft-fp + clz/ctz),
+ `src/cg/{value,local,memory,call,control}.c`, `src/opt/{cg_ir_lower.c,pass_native_emit.c}`,
+ `src/cg/native_direct_target.c` (`nd_*` panics + `nd_local_static_data_label_addr`).
+- Backend: `src/arch/riscv/{variant.{h,c},native.c,isa.{c,h},disasm.c,asm.c,link.c,dbg.c,arch.c}`.
+- ABI: `src/abi/abi_rv64.c` + `src/abi/registry.c`.
+- ELF / kit ld / freestanding policy: `src/obj/elf/{elf.h,emit.c,read.c,link.c,link_dyn.c}` +
+ `reloc_riscv32.c`; `driver/cmd/ld.c` (`-Ttext`/`-nostdlib`/PIC-from-target), `driver/lib/target.c`
+ (`driver_default_pic`), `driver/lib/runtime.{c,h}` (`driver_runtime_has_variant`),
+ `src/api/object_detect.c` (EI_OSABI → os), `src/link/{link.c,link_layout.c}`, `src/api/link.c`.
+- Runtime/intrinsics: `mk/rt.mk` (ARCH_FLAGS), `src/cg/type.c` (rv32 ≡ rv64 for intrinsics).
+- Tests: `test/smoke/rv32.sh`, `test/lib/{check_rv32_env.sh,exec_rv32_bare.sh,kit_test_target.h}`,
+ `test/toy/run.sh` (`cross_one_rv32`), `test/parse/run.sh` (`kit_lane_E` rv32 branch),
+ `test/arch/rv32_decode_test.c`, `test/link/rv32_jit_test.c`, `test/elf/unit/rv32_class32.c`,
+ `mk/test.mk`, `mk/test_unit.mk`.
+
+---
+
## Context
`kit` today targets `riscv64` (LP64D) via a single backend in `src/arch/rv64/`. We want a
diff --git a/driver/cmd/cc.c b/driver/cmd/cc.c
@@ -761,6 +761,14 @@ static int cc_record_mcmodel(CcOptions* o, const char* val) {
o->target.code_model = KIT_CM_LARGE;
return 0;
}
+ if (driver_streq(val, "medlow")) {
+ o->target.code_model = KIT_CM_SMALL;
+ return 0;
+ }
+ if (driver_streq(val, "medany")) {
+ o->target.code_model = KIT_CM_MEDIUM;
+ return 0;
+ }
driver_errf(CC_TOOL, "unknown -mcmodel value: %.*s",
KIT_SLICE_ARG(kit_slice_cstr(val)));
return 1;
diff --git a/driver/cmd/ld.c b/driver/cmd/ld.c
@@ -77,6 +77,10 @@ typedef struct LdOptions {
int output_seen;
const char* entry; /* -e */
const char* script_path; /* -T */
+ int text_base_set; /* -Ttext seen */
+ uint64_t text_base; /* -Ttext ADDR: static ET_EXEC image base */
+ int no_default_libs; /* -nostdlib / --no-default-libs */
+ int pic_explicit; /* -static / -pie / -no-pie / -shared seen */
const char* support_dir; /* --support-dir */
uint16_t pe_subsystem; /* KitPeSubsystem */
/* PT_INTERP path. NULL means "let libkit pick the target default
@@ -335,6 +339,29 @@ static int hex_nibble(char c) {
return -1;
}
+/* Parse a -Ttext address: 0x<hex> or decimal. Returns 0 on success. */
+static int ld_parse_addr(const char* s, uint64_t* out) {
+ uint64_t v = 0;
+ const char* p;
+ if (!s || !s[0]) return 1;
+ if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
+ p = s + 2;
+ if (!*p) return 1;
+ for (; *p; ++p) {
+ int n = hex_nibble(*p);
+ if (n < 0) return 1;
+ v = (v << 4) | (uint64_t)n;
+ }
+ } else {
+ for (p = s; *p; ++p) {
+ if (*p < '0' || *p > '9') return 1;
+ v = v * 10u + (uint64_t)(*p - '0');
+ }
+ }
+ *out = v;
+ return 0;
+}
+
/* Parse `--build-id=...` argument into options. Accepts "none", "sha256",
* "uuid", or "0x<even-hex>". Returns 0 on success, 1 on bad value. */
static int ld_parse_build_id(LdOptions* o, const char* val) {
@@ -658,6 +685,41 @@ static int ld_parse(int argc, char** argv, LdOptions* o) {
o->script_path = argv[i];
continue;
}
+ /* -Ttext ADDR / -Ttext=ADDR: set the static ET_EXEC image (text) base, for
+ * freestanding images that must load at a fixed address (e.g. the qemu
+ * `virt` RAM base 0x80000000). Matched here; the bare "-T" above is an exact
+ * match, so it never swallows -Ttext. No effect on PIE/scripted layouts. */
+ {
+ const char* tval = NULL;
+ if (driver_streq(a, "-Ttext")) {
+ if (++i >= argc) {
+ driver_errf(LD_TOOL, "-Ttext requires an argument");
+ return 1;
+ }
+ tval = argv[i];
+ } else if ((tval = arg_eq_value(a, "-Ttext")) != NULL) {
+ /* -Ttext=ADDR */
+ }
+ if (tval) {
+ uint64_t v;
+ if (ld_parse_addr(tval, &v) != 0) {
+ driver_errf(LD_TOOL, "-Ttext: invalid address: %s", tval);
+ return 1;
+ }
+ o->text_base = v;
+ o->text_base_set = 1;
+ continue;
+ }
+ }
+ /* -nostdlib / --no-default-libs: do not auto-resolve and link kit's
+ * compiler runtime. A freestanding image (e.g. riscv32-none-elf) supplies
+ * its own libkit_rt.a on the command line, and a pure-linker invocation
+ * should not require a per-target runtime archive to exist. */
+ if (driver_streq(a, "-nostdlib") || driver_streq(a, "--no-default-libs") ||
+ driver_streq(a, "-nodefaultlibs")) {
+ o->no_default_libs = 1;
+ continue;
+ }
if (driver_streq(a, "--support-dir")) {
if (++i >= argc) {
driver_errf(LD_TOOL, "--support-dir requires an argument");
@@ -779,11 +841,13 @@ static int ld_parse(int argc, char** argv, LdOptions* o) {
if (driver_streq(a, "-static")) {
o->target.pic = KIT_PIC_NONE;
+ o->pic_explicit = 1;
continue;
}
if (driver_streq(a, "-pie")) {
o->target.pic = KIT_PIC_PIE;
o->pie = 1;
+ o->pic_explicit = 1;
continue;
}
if (driver_streq(a, "-dynamic-linker") ||
@@ -801,6 +865,7 @@ static int ld_parse(int argc, char** argv, LdOptions* o) {
}
if (driver_streq(a, "-no-pie")) {
o->target.pic = KIT_PIC_NONE;
+ o->pic_explicit = 1;
continue;
}
@@ -809,6 +874,7 @@ static int ld_parse(int argc, char** argv, LdOptions* o) {
/* Shared objects must be position-independent. Force PIC unless
* the caller has explicitly chosen PIE (which is also fine). */
if (o->target.pic == KIT_PIC_NONE) o->target.pic = KIT_PIC_PIC;
+ o->pic_explicit = 1;
continue;
}
if (driver_streq(a, "-r") || driver_streq(a, "--relocatable")) {
@@ -1126,19 +1192,44 @@ static int ld_run_link(LdOptions* o) {
obj_in[i].len = obj_lf[i].data.size;
}
- /* Auto-detect target from the first object input, falling back to
- * the host target. PIC overrides parsed from flags are preserved. */
+ /* Auto-detect target from the object inputs, falling back to the host target.
+ * The PIC default is a property of the *target*, not the host: hosted targets
+ * default to PIE, freestanding ones to no-PIE (driver_default_pic). The host's
+ * default PIC must never leak onto a detected cross target; only an EXPLICIT
+ * -static/-pie/-no-pie/-shared wins.
+ *
+ * Detection takes the arch from the first object, but the OS from any object:
+ * a freestanding (`*-none-elf`, EI_OSABI=STANDALONE) object means a freestanding
+ * link even if a foreign object (e.g. a clang-assembled startup stub, which
+ * stamps EI_OSABI=SysV and so decodes as Linux) appears first. */
if (o->nobject_files > 0) {
KitTargetSpec detected;
if (kit_detect_target(obj_lf[0].data.data, obj_lf[0].data.size,
&detected) == KIT_OK) {
uint8_t pic = o->target.pic;
+ uint32_t oi;
+ for (oi = 1; oi < o->nobject_files; ++oi) {
+ KitTargetSpec t;
+ if (kit_detect_target(obj_lf[oi].data.data, obj_lf[oi].data.size, &t) ==
+ KIT_OK &&
+ t.os == KIT_OS_FREESTANDING)
+ detected.os = KIT_OS_FREESTANDING;
+ }
o->target = detected;
- if (pic != KIT_PIC_NONE) o->target.pic = pic;
+ if (o->pic_explicit)
+ o->target.pic = pic;
+ else
+ o->target.pic = driver_default_pic(o->target.obj, o->target.os);
}
}
- if (!o->relocatable) {
+ /* Auto-link kit's compiler runtime only for targets that have one. A
+ * freestanding/static-only target (e.g. riscv32-none-elf) has no auto-runtime
+ * variant — it is implicitly -nostdlib and supplies its own libkit_rt.a on the
+ * command line — so skip resolution rather than erroring. -nostdlib forces the
+ * skip for any target. */
+ if (!o->relocatable && !o->no_default_libs &&
+ driver_runtime_has_variant(o->target)) {
if (driver_runtime_resolve(o->env, o->support_dir, o->driver_path,
&runtime) != 0) {
driver_errf(LD_TOOL, "support dir not found");
@@ -1259,6 +1350,8 @@ static int ld_run_link(LdOptions* o) {
: o->shared ? KIT_LINK_OUTPUT_SHARED
: KIT_LINK_OUTPUT_EXE;
lopts.entry = kit_slice_cstr(o->entry);
+ lopts.text_base_set = o->text_base_set;
+ lopts.text_base = o->text_base;
lopts.linker_script = script;
lopts.build_id_mode = o->build_id_mode;
lopts.build_id_bytes = o->build_id_bytes;
diff --git a/driver/cmd/run.c b/driver/cmd/run.c
@@ -432,6 +432,14 @@ static int run_record_mcmodel(RunOptions* o, const char* val) {
o->target.code_model = KIT_CM_LARGE;
return 0;
}
+ if (driver_streq(val, "medlow")) {
+ o->target.code_model = KIT_CM_SMALL;
+ return 0;
+ }
+ if (driver_streq(val, "medany")) {
+ o->target.code_model = KIT_CM_MEDIUM;
+ return 0;
+ }
driver_errf(RUN_TOOL, "unknown -mcmodel value: %.*s",
KIT_SLICE_ARG(kit_slice_cstr(val)));
return 1;
diff --git a/driver/driver.h b/driver/driver.h
@@ -162,6 +162,7 @@ typedef struct DriverTargetFeatures {
KitSlice isa;
KitSlice cpu;
KitSlice tune;
+ KitSlice abi;
} DriverTargetFeatures;
/* Shared target-feature parser. Canonical spelling is `-mattr=+foo,-bar`;
diff --git a/driver/lib/runtime.c b/driver/lib/runtime.c
@@ -330,6 +330,10 @@ static const RuntimeVariant* rt_variant_for_target(KitTargetSpec target) {
return NULL;
}
+int driver_runtime_has_variant(KitTargetSpec target) {
+ return rt_variant_for_target(target) != NULL;
+}
+
static int rt_prepare_pp(DriverEnv* env, const DriverRuntimeSupport* support,
const RuntimeVariant* variant, int assembler,
KitPreprocessOptions* pp, char*** owned_dirs,
diff --git a/driver/lib/runtime.h b/driver/lib/runtime.h
@@ -49,4 +49,9 @@ int driver_runtime_prepare_archive(DriverEnv* env, const char* tool,
DriverRuntimeArchive* out);
void driver_runtime_archive_fini(DriverEnv* env, DriverRuntimeArchive* a);
+/* True if kit knows how to build/provide a compiler runtime for `target`. A
+ * freestanding/static-only target (e.g. riscv32-none-elf) has none, so a linker
+ * should not try to auto-link one — the freestanding image supplies its own. */
+int driver_runtime_has_variant(KitTargetSpec target);
+
#endif
diff --git a/driver/lib/target.c b/driver/lib/target.c
@@ -162,6 +162,18 @@ int driver_target_features_try_consume(DriverTargetFeatures* tf, DriverEnv* env,
tf->isa = kit_slice_cstr(argv[*i]);
return 1;
}
+ if (driver_strneq(a, "-mabi=", 6)) {
+ tf->abi = kit_slice_cstr(a + 6);
+ return 1;
+ }
+ if (driver_streq(a, "-mabi")) {
+ if (++(*i) >= argc) {
+ driver_errf(tool, "-mabi requires an argument");
+ return -1;
+ }
+ tf->abi = kit_slice_cstr(argv[*i]);
+ return 1;
+ }
if (driver_strneq(a, "-mcpu=", 6)) {
tf->cpu = kit_slice_cstr(a + 6);
return 1;
@@ -211,6 +223,7 @@ int driver_target_options(const DriverTargetFeatures* tf, const char* tool,
out->isa = tf->isa;
out->cpu = tf->cpu;
out->tune = tf->tune;
+ out->abi = tf->abi;
out->features = tf->features;
out->nfeatures = tf->nfeatures;
return 0;
diff --git a/include/kit/config.h b/include/kit/config.h
@@ -26,6 +26,7 @@
/* Backend architectures. */
#define KIT_ARCH_AA64_ENABLED 1
#define KIT_ARCH_X64_ENABLED 1
+#define KIT_ARCH_RV32_ENABLED 1
#define KIT_ARCH_RV64_ENABLED 1
#define KIT_ARCH_WASM_ENABLED 1
#define KIT_ARCH_C_TARGET_ENABLED 1
diff --git a/include/kit/core.h b/include/kit/core.h
@@ -158,6 +158,18 @@ typedef enum KitCodeModel {
KIT_CM_LARGE,
} KitCodeModel;
+/* Float ABI axis, orthogonal to XLEN. Currently meaningful for RISC-V:
+ * SOFT = no FP args in FP regs (ilp32/lp64), SINGLE = float in FP regs but
+ * double soft (ilp32f/lp64f), DOUBLE = float+double in FP regs (ilp32d/lp64d).
+ * DEFAULT (=0) means "derive from -march" and is the safe unset sentinel for
+ * every spec-construction site that does not set it. */
+typedef enum KitFloatAbi {
+ KIT_FLOAT_ABI_DEFAULT,
+ KIT_FLOAT_ABI_SOFT,
+ KIT_FLOAT_ABI_SINGLE,
+ KIT_FLOAT_ABI_DOUBLE,
+} KitFloatAbi;
+
typedef struct KitTargetSpec {
KitArchKind arch;
KitOSKind os;
@@ -167,6 +179,7 @@ typedef struct KitTargetSpec {
bool big_endian;
uint8_t pic; /* KitPic */
uint8_t code_model; /* KitCodeModel */
+ uint8_t float_abi; /* KitFloatAbi */
} KitTargetSpec;
typedef struct KitTargetFeature {
@@ -179,6 +192,7 @@ typedef struct KitTargetOptions {
KitSlice isa; /* optional ISA/profile string, e.g. rv64gc or x86-64-v3 */
KitSlice cpu; /* optional target CPU/profile name */
KitSlice tune; /* optional tuning CPU/profile name */
+ KitSlice abi; /* optional ABI string, e.g. ilp32f / lp64d (-mabi) */
const KitTargetFeature* features;
uint32_t nfeatures;
} KitTargetOptions;
diff --git a/include/kit/link.h b/include/kit/link.h
@@ -166,6 +166,11 @@ typedef struct KitLinkSessionOptions {
uint16_t pe_subsystem; /* KitPeSubsystem; 0 => target default */
KitSlice interp_path;
KitSlice entry;
+ /* Static ET_EXEC image (text) base override, from `kit ld -Ttext ADDR`. When
+ * text_base_set is true it overrides the default static base; ignored for
+ * PIE/shared (base 0) and scripted layouts (the script pins vaddrs). */
+ bool text_base_set;
+ uint64_t text_base;
const KitLinkScript* linker_script;
uint8_t build_id_mode; /* KitBuildIdMode */
const uint8_t* build_id_bytes;
diff --git a/mk/lib_srcs.mk b/mk/lib_srcs.mk
@@ -23,7 +23,7 @@ flatobjs = $(foreach s,$(1),$(BUILD_DIR)/$(3)/$(dir $(patsubst $(2)/%,%,$(s)))$(
define arch-feature-off
LIB_SRCS_ARCH_AA64 := $$(filter-out %/$(1),$$(LIB_SRCS_ARCH_AA64))
LIB_SRCS_ARCH_X64 := $$(filter-out %/$(1),$$(LIB_SRCS_ARCH_X64))
-LIB_SRCS_ARCH_RV64 := $$(filter-out %/$(1),$$(LIB_SRCS_ARCH_RV64))
+LIB_SRCS_ARCH_RISCV := $$(filter-out %/$(1),$$(LIB_SRCS_ARCH_RISCV))
LIB_SRCS_NONARCH += $(2)
endef
@@ -52,7 +52,7 @@ LIB_SRCS_NONARCH = $(LIB_SRCS_ABI_CORE) \
# hundreds on every `make` invocation.
LIB_SRCS_ARCH_AA64 := $(shell find src/arch/aa64 -name '*.c' 2>/dev/null)
LIB_SRCS_ARCH_X64 := $(shell find src/arch/x64 -name '*.c' 2>/dev/null)
-LIB_SRCS_ARCH_RV64 := $(shell find src/arch/rv64 -name '*.c' 2>/dev/null)
+LIB_SRCS_ARCH_RISCV := $(shell find src/arch/riscv -name '*.c' 2>/dev/null)
LIB_SRCS_ARCH_WASM := $(shell find src/arch/wasm -name '*.c' 2>/dev/null)
LIB_SRCS_ARCH_C_TARGET := $(shell find src/arch/c_target -name '*.c' 2>/dev/null)
ifneq ($(KIT_OPT_ENABLED),1)
@@ -68,7 +68,7 @@ ifneq ($(KIT_DBG_ENABLED),1)
$(eval $(call arch-feature-off,dbg.c,src/arch/dbg_stubs.c))
endif
ifneq ($(KIT_EMU_ENABLED),1)
-LIB_SRCS_ARCH_RV64 := $(filter-out %/emu.c,$(LIB_SRCS_ARCH_RV64))
+LIB_SRCS_ARCH_RISCV := $(filter-out %/emu.c,$(LIB_SRCS_ARCH_RISCV))
LIB_SRCS_NONARCH += src/arch/emu_stubs.c
endif
ifneq ($(KIT_INTERP_ENABLED),1)
@@ -186,8 +186,8 @@ endif
ifeq ($(KIT_ARCH_X64_ENABLED),1)
LIB_SRCS += $(LIB_SRCS_ARCH_X64)
endif
-ifeq ($(KIT_ARCH_RV64_ENABLED),1)
-LIB_SRCS += $(LIB_SRCS_ARCH_RV64)
+ifneq ($(filter 1,$(KIT_ARCH_RV32_ENABLED) $(KIT_ARCH_RV64_ENABLED)),)
+LIB_SRCS += $(LIB_SRCS_ARCH_RISCV)
endif
ifeq ($(KIT_ARCH_WASM_ENABLED),1)
LIB_SRCS += $(LIB_SRCS_ARCH_WASM)
@@ -230,7 +230,7 @@ ifeq ($(KIT_OBJ_COFF_ENABLED),1)
LIB_SRCS += $(LIB_SRC_ABI_WIN64_X64)
endif
endif
-ifneq ($(filter 1,$(KIT_ARCH_RV64_ENABLED) $(KIT_ARCH_C_TARGET_ENABLED)),)
+ifneq ($(filter 1,$(KIT_ARCH_RV32_ENABLED) $(KIT_ARCH_RV64_ENABLED) $(KIT_ARCH_C_TARGET_ENABLED)),)
ifeq ($(KIT_OBJ_ELF_ENABLED),1)
LIB_SRCS += $(LIB_SRC_ABI_RV64)
endif
diff --git a/mk/rt.mk b/mk/rt.mk
@@ -26,6 +26,7 @@ RT_VARIANTS = \
wasm32 \
riscv32-elf \
riscv32-elf-save-restore \
+ riscv32-elf-hardfloat \
arm-eabi-thumb2 \
arm-eabi-thumb1
@@ -136,18 +137,29 @@ RT_wasm32_TARGET = wasm32-unknown-unknown
RT_wasm32_ABI = ilp32
RT_wasm32_INT128 = 0
+# Soft-float embedded RV32 (ABI=ilp32, no F/D): integer layout is ilp32 and
+# all FP is soft, so it reuses RT_ABI_SRCS_ilp32 / RT_ABI_INC ilp32_le.
RT_riscv32-elf_TARGET = riscv32-unknown-elf
RT_riscv32-elf_ABI = ilp32
RT_riscv32-elf_INT128 = 0
RT_riscv32-elf_CORO = riscv32
-RT_riscv32-elf_ARCH_FLAGS = -mabi=ilp32 -march=rv32imafd
+RT_riscv32-elf_ARCH_FLAGS = -mabi=ilp32 -march=rv32imac
RT_riscv32-elf-save-restore_TARGET = riscv32-unknown-elf
RT_riscv32-elf-save-restore_ABI = ilp32
RT_riscv32-elf-save-restore_INT128 = 0
RT_riscv32-elf-save-restore_CORO = riscv32
RT_riscv32-elf-save-restore_SAVE_RESTORE = 1
-RT_riscv32-elf-save-restore_ARCH_FLAGS = -mabi=ilp32 -march=rv32imafd
+RT_riscv32-elf-save-restore_ARCH_FLAGS = -mabi=ilp32 -march=rv32imac
+
+# Hard-float (single-precision) RV32: ilp32f passes float in FP regs, but the
+# integer layout is still ilp32 (double stays soft), so ABI=ilp32 here too and
+# it reuses the same RT_ABI_SRCS_ilp32 / RT_ABI_INC ilp32_le as the soft variant.
+RT_riscv32-elf-hardfloat_TARGET = riscv32-unknown-elf
+RT_riscv32-elf-hardfloat_ABI = ilp32
+RT_riscv32-elf-hardfloat_INT128 = 0
+RT_riscv32-elf-hardfloat_CORO = riscv32
+RT_riscv32-elf-hardfloat_ARCH_FLAGS = -mabi=ilp32f -march=rv32imafc
RT_arm-eabi-thumb2_TARGET = arm-none-eabi
RT_arm-eabi-thumb2_ABI = ilp32
@@ -229,12 +241,14 @@ RT_SRCS_$(1) := \
RT_CFLAGS_$(1) := \
$$(RT_COMMON_CFLAGS) $$(RT_LIB_INCS) \
-target $$(RT_$(1)_TARGET) \
+ $$(RT_$(1)_ARCH_FLAGS) \
-DHAS_INT128=$$(RT_$(1)_INT128) \
$$(RT_ABI_INC_$$(RT_$(1)_ABI)) \
$$(if $$(RT_$(1)_LDBL128),$$(RT_LDBL128_FLAGS)) \
$$(if $$(RT_$(1)_CORO),-Irt/include)
RT_ASFLAGS_$(1) := \
-target $$(RT_$(1)_TARGET) \
+ $$(RT_$(1)_ARCH_FLAGS) \
-DHAS_INT128=$$(RT_$(1)_INT128) \
-D__ASSEMBLER__=1 \
$$(RT_ABI_INC_$$(RT_$(1)_ABI)) \
diff --git a/mk/test.mk b/mk/test.mk
@@ -104,8 +104,10 @@ TEST_TARGETS = \
test-link-x64 \
test-rv64-inline \
test-rv64-jit \
+ test-rv32-jit \
test-rv64-tls-link \
test-smoke-rv64 \
+ test-smoke-rv32 \
test-smoke-x64 \
test-toy \
test-wasm \
@@ -134,6 +136,7 @@ DEFAULT_TEST_TARGETS = \
test-aa64-inline \
test-rv64-inline \
test-rv64-jit \
+ test-rv32-jit \
test-rv64-tls-link \
test-emu \
test-emu-unit \
@@ -315,10 +318,12 @@ test-dbg-red: bin
# form). Internal arch/ surface — needs -Isrc.
AA64_ISA_TEST_BIN = build/test/aa64_isa_test
RV64_DECODE_TEST_BIN = build/test/rv64_decode_test
+RV32_DECODE_TEST_BIN = build/test/rv32_decode_test
-test-isa: $(AA64_ISA_TEST_BIN) $(RV64_DECODE_TEST_BIN)
+test-isa: $(AA64_ISA_TEST_BIN) $(RV64_DECODE_TEST_BIN) $(RV32_DECODE_TEST_BIN)
$(AA64_ISA_TEST_BIN)
$(RV64_DECODE_TEST_BIN)
+ $(RV32_DECODE_TEST_BIN)
@@ -465,6 +470,20 @@ test-rv64-jit: $(RV64_JIT_TEST_BIN)
exit $$rc; \
fi
+# rv32 JIT smoke test. Mirror of test-rv64-jit: builds a tiny rv32 ELF .o in
+# memory, runs it through kit_link_session in JIT-output mode, and skips native
+# execution on non-riscv32 hosts after exercising the JIT mapping/reloc path.
+RV32_JIT_TEST_BIN = build/test/rv32_jit_test
+
+test-rv32-jit: $(RV32_JIT_TEST_BIN)
+ @$(RV32_JIT_TEST_BIN); rc=$$?; \
+ if [ $$rc -eq 77 ]; then \
+ echo " (rv32_jit_test SKIPPED on non-rv32 host)"; \
+ exit 0; \
+ else \
+ exit $$rc; \
+ fi
+
# Link-only regression for rv64 TLS Local-Exec lowering (runs on any host).
test-rv64-tls-link: bin
@@ -831,6 +850,12 @@ test-smoke-x64:
test-smoke-rv64:
bash test/smoke/rv64.sh
+# test-smoke-rv32: behavioral oracle for riscv32-none-elf codegen under
+# qemu-system-riscv32 (ilp32f + ilp32 lanes, i64 + soft-float). Skips if the
+# rv32 toolchain/qemu prerequisites are absent (see test/lib/check_rv32_env.sh).
+test-smoke-rv32:
+ bash test/smoke/rv32.sh
+
# test-parse-rv64-wide: end-to-end coverage of the rv64 128-bit scalar types
# — __int128 (i128_*) and IEEE-754 binary128 long double (ldbl128_*) — built
# with kit and run on riscv64. Exercises the soft-float / i128 lowering to
diff --git a/mk/test_unit.mk b/mk/test_unit.mk
@@ -31,7 +31,7 @@ UNIT_CFLAGS_INTERNAL = $(HOST_CFLAGS) -Iinclude -Isrc -Itest
UNIT_TESTS_PUBLIC := \
ar_test target_test cg_api_test cg_switch_test cg_fp_cmp_test hash_test \
- rv64_jit_test aa64_inline_test rv64_inline_test x64_inline_test \
+ rv64_jit_test rv32_jit_test aa64_inline_test rv64_inline_test x64_inline_test \
strength_reduce_test
ar_test_SRC := test/ar/ar_test.c
target_test_SRC := test/api/target_test.c
@@ -41,13 +41,14 @@ cg_switch_test_SRC := test/api/cg_switch_test.c
cg_fp_cmp_test_SRC := test/api/cg_fp_cmp_test.c
strength_reduce_test_SRC := test/cg/strength_reduce_test.c
rv64_jit_test_SRC := test/link/rv64_jit_test.c
+rv32_jit_test_SRC := test/link/rv32_jit_test.c
aa64_inline_test_SRC := test/arch/aa64_inline_test.c
rv64_inline_test_SRC := test/arch/rv64_inline_test.c
x64_inline_test_SRC := test/arch/x64_inline_test.c
UNIT_TESTS_INTERNAL := \
dwarf_test debug_roundtrip_unit debug_cfi_unit \
- aa64_isa_test rv64_decode_test aa64_sweep_gen \
+ aa64_isa_test rv64_decode_test rv32_decode_test aa64_sweep_gen \
reloc_uleb128_unit emu_rv64_unit_test interp_smoke_test \
rv64_interp_smoke_test abi_classify_test ir_recorder_test \
native_direct_target_test x64_dbg_test cg_ir_lower_test tiny_inline_test
@@ -56,6 +57,7 @@ debug_roundtrip_unit_SRC := test/debug/roundtrip_unit.c
debug_cfi_unit_SRC := test/debug/cfi_unit.c
aa64_isa_test_SRC := test/arch/aa64_isa_test.c
rv64_decode_test_SRC := test/arch/rv64_decode_test.c
+rv32_decode_test_SRC := test/arch/rv32_decode_test.c
aa64_sweep_gen_SRC := test/arch/aa64_sweep_gen.c
reloc_uleb128_unit_SRC := test/link/reloc_uleb128_unit.c
emu_rv64_unit_test_SRC := test/emu/rv64_vm_unit_test.c
diff --git a/src/abi/abi_internal.h b/src/abi/abi_internal.h
@@ -21,6 +21,7 @@ typedef struct ABIVtable {
extern const ABIVtable aapcs64_vtable;
extern const ABIVtable sysv_x64_vtable;
extern const ABIVtable rv64_vtable;
+extern const ABIVtable rv32_vtable;
extern const ABIVtable wasm32_vtable;
/* Apple Darwin variants — selected when (arch, os) matches. See
* abi.c::select_vtable. */
diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c
@@ -1,26 +1,36 @@
-/* RISC-V LP64D ABI dispatch.
+/* RISC-V ABI dispatch (shared LP64* / ILP32* classifier).
*
- * Covers the subset the cg test harness exercises plus the LP64D
- * floating-point aggregate refinements per the RISC-V psABI:
+ * One descriptor-parameterized classifier serves both XLENs. The descriptor is
+ * derived per call from a->c->target:
+ * gpr_bytes = target.ptr_size (4 on rv32, 8 on rv64)
+ * aggregate_gpr_bytes = 2 * gpr_bytes (8 on rv32, 16 on rv64)
+ * flen = FP register width in bytes from target.float_abi:
+ * DOUBLE -> 8, SINGLE -> 4, SOFT -> 0,
+ * DEFAULT(unset) -> gpr_bytes (preserves the old
+ * rv64 LP64D behavior byte-for-byte).
+ *
+ * Covers the subset the cg test harness exercises plus the RISC-V psABI
+ * floating-point aggregate refinements:
* void -> IGNORE
- * integer ≤ 8B -> DIRECT, one INT part (a0..a7 for args; a0 for return)
+ * integer ≤ XLEN -> DIRECT, one INT part (a0..a7 for args; a0 for return)
* pointer -> DIRECT, one INT part
- * float/double -> DIRECT, one FP part (fa0..fa7 for args; fa0 for return)
+ * float/double -> DIRECT, one FP part when FP-eligible (fa0..fa7 for args;
+ * fa0 for return); otherwise INT (and a 2*XLEN scalar
+ * becomes a GPR pair).
* small struct -> DIRECT:
* * homogeneous FP aggregate (1 or 2 same-kind FP fields,
* ignoring empty/zero-size fields and zero-length arrays)
- * -> FP parts (fa pair);
- * * one FP + one INT scalar (in either order, ≤ 16 B)
+ * -> FP parts (fa pair) when FP-eligible;
+ * * one FP + one INT scalar (in either order, ≤ 2*XLEN)
* -> (fa, a) or (a, fa) pair;
- * * otherwise INT parts up to 16 B (passed in up to 2 GPRs).
+ * * otherwise INT parts up to 2*XLEN (passed in up to 2 GPRs).
* large struct -> INDIRECT (sret for return; byval for args)
*
- * Long double is IEEE-754 binary128 (quad) on rv64. Like __int128 it is a
- * 16-byte scalar passed/returned in an aligned pair of integer registers
- * (a0:a1 .. a6:a7; low-order half in the lower-numbered register), split
- * low-in-register / high-on-stack when only one register remains, and wholly
- * on the stack otherwise. classify_scalar() handles both via the size==16
- * INT-pair path below (there are no 128-bit FP registers in RV64GC).
+ * Long double is IEEE-754 binary128 (quad) and __int128 are 16-byte scalars
+ * passed/returned in an aligned pair of integer registers (low-order half in
+ * the lower-numbered register). On rv64 this is the size==2*gpr_bytes pair
+ * path; there are no 128-bit FP registers. On rv32 a 64-bit scalar (i64, or a
+ * soft-float double) is the size==2*gpr_bytes even-GPR pair.
*
* Variadic args bypass these rules entirely and always go through the
* integer register file / stack (handled at the caller / callee sites). */
@@ -32,7 +42,43 @@
#include "core/arena.h"
#include "core/core.h"
-enum { RV64_ABI_AGGREGATE_GPR_BYTES = 16, RV64_ABI_GPR_BYTES = 8 };
+/* Per-call ABI descriptor derived from the target spec. */
+typedef struct RiscvAbiDesc {
+ u32 gpr_bytes; /* XLEN in bytes: 4 (rv32) or 8 (rv64) */
+ u32 aggregate_gpr_bytes; /* 2 * gpr_bytes: the small-struct register cap */
+ u32 flen; /* FP register width in bytes: 0, 4, or 8 */
+} RiscvAbiDesc;
+
+static RiscvAbiDesc riscv_abi_desc(TargetABI* a) {
+ RiscvAbiDesc d;
+ d.gpr_bytes = a->c->target.ptr_size ? a->c->target.ptr_size : 8u;
+ d.aggregate_gpr_bytes = 2u * d.gpr_bytes;
+ switch (a->c->target.float_abi) {
+ case KIT_FLOAT_ABI_DOUBLE:
+ d.flen = 8u;
+ break;
+ case KIT_FLOAT_ABI_SINGLE:
+ d.flen = 4u;
+ break;
+ case KIT_FLOAT_ABI_SOFT:
+ d.flen = 0u;
+ break;
+ case KIT_FLOAT_ABI_DEFAULT:
+ default:
+ /* Unset: preserve the historical rv64 LP64D behavior, i.e. treat the FP
+ * register width as the GPR width (flen == 8 on rv64). */
+ d.flen = d.gpr_bytes;
+ break;
+ }
+ return d;
+}
+
+/* An FP scalar of `size` bytes can be carried in an FP register iff the float
+ * ABI is hard and the value fits: float (4) needs flen>=4; double (8) needs
+ * flen>=8. With soft float (flen==0) nothing is FP-eligible. */
+static int riscv_fp_eligible(u32 flen, u32 size) {
+ return flen != 0u && size <= flen;
+}
/* Walk a record collecting the leaf scalars in ABI order, skipping
* zero-size members (empty structs, zero-length arrays, zero-width
@@ -44,12 +90,12 @@ typedef struct AbiLeaf {
u8 scalar_kind; /* ABIScalarKind */
} AbiLeaf;
-static u32 rv64_collect_leaves(TargetABI* a, KitCgTypeId tid, u32 base_off,
- AbiLeaf* out, u32 cap, u32 written) {
+static u32 riscv_collect_leaves(TargetABI* a, KitCgTypeId tid, u32 base_off,
+ AbiLeaf* out, u32 cap, u32 written) {
const CgType* t = cg_type_get(a->c, tid);
if (!t) return written + 1u; /* poison: treat as too-many */
if (t->kind == KIT_CG_TYPE_ALIAS)
- return rv64_collect_leaves(a, t->alias.base, base_off, out, cap, written);
+ return riscv_collect_leaves(a, t->alias.base, base_off, out, cap, written);
if (t->kind == KIT_CG_TYPE_RECORD) {
if (t->record.is_union) return cap + 1u; /* unions: bail */
for (u32 i = 0; i < t->record.nfields; ++i) {
@@ -59,7 +105,7 @@ static u32 rv64_collect_leaves(TargetABI* a, KitCgTypeId tid, u32 base_off,
* the psABI (treat the whole record as GPR-pair). */
if (f->bit_width != 0) return cap + 1u;
u32 off = base_off + (u32)f->offset;
- written = rv64_collect_leaves(a, f->type, off, out, cap, written);
+ written = riscv_collect_leaves(a, f->type, off, out, cap, written);
if (written > cap) return written;
}
return written;
@@ -70,7 +116,7 @@ static u32 rv64_collect_leaves(TargetABI* a, KitCgTypeId tid, u32 base_off,
if (elem.size == 0) return written;
for (u64 i = 0; i < t->array.count; ++i) {
u32 off = base_off + (u32)(i * elem.size);
- written = rv64_collect_leaves(a, t->array.elem, off, out, cap, written);
+ written = riscv_collect_leaves(a, t->array.elem, off, out, cap, written);
if (written > cap) return written;
}
return written;
@@ -86,21 +132,29 @@ static u32 rv64_collect_leaves(TargetABI* a, KitCgTypeId tid, u32 base_off,
}
static void classify_scalar(TargetABI* a, KitCgTypeId t, ABIArgInfo* out) {
+ RiscvAbiDesc d = riscv_abi_desc(a);
ABITypeInfo ti = abi_internal_type_info(a, t);
- if (ti.size == 16 &&
+ /* A scalar twice the GPR width that lives in the integer/long-double space
+ * (or a soft-float double) is carried as an aligned pair of GPRs. On rv64
+ * this is the 16-byte long double / __int128 pair; on rv32 it is the 8-byte
+ * i64 / soft-double pair. A double is only excluded from the pair here when
+ * it is FP-eligible (handled by the single-FP-part path below). */
+ int fp_part = (ti.scalar_kind == ABI_SC_FLOAT) &&
+ riscv_fp_eligible(d.flen, ti.size);
+ if (ti.size == 2u * d.gpr_bytes && !fp_part &&
(ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT)) {
ABIArgPart* parts = arena_array(a->c->tu, ABIArgPart, 2);
memset(parts, 0, sizeof(ABIArgPart) * 2);
parts[0].cls = ABI_CLASS_INT;
parts[0].loc = ABI_LOC_REG;
- parts[0].size = 8;
- parts[0].align = 8;
+ parts[0].size = d.gpr_bytes;
+ parts[0].align = d.gpr_bytes;
parts[0].src_offset = 0;
parts[1].cls = ABI_CLASS_INT;
parts[1].loc = ABI_LOC_REG;
- parts[1].size = 8;
- parts[1].align = 8;
- parts[1].src_offset = 8;
+ parts[1].size = d.gpr_bytes;
+ parts[1].align = d.gpr_bytes;
+ parts[1].src_offset = d.gpr_bytes;
out->kind = ABI_ARG_DIRECT;
out->flags = ABI_AF_NONE;
out->parts = parts;
@@ -114,7 +168,7 @@ static void classify_scalar(TargetABI* a, KitCgTypeId t, ABIArgInfo* out) {
ABIArgPart* parts = arena_new(a->c->tu, ABIArgPart);
memset(parts, 0, sizeof *parts);
- parts->cls = (ti.scalar_kind == ABI_SC_FLOAT) ? ABI_CLASS_FP : ABI_CLASS_INT;
+ parts->cls = fp_part ? ABI_CLASS_FP : ABI_CLASS_INT;
parts->loc = ABI_LOC_REG;
parts->size = ti.size;
parts->align = ti.align;
@@ -131,16 +185,22 @@ static void classify_void(ABIArgInfo* out) {
/* Try the psABI floating-point aggregate refinements. Returns 1 if `out`
* was populated, 0 to fall back to the generic GPR-pair packing. */
-static int rv64_classify_fp_aggregate(TargetABI* a, KitCgTypeId t,
- ABIArgInfo* out) {
+static int riscv_classify_fp_aggregate(TargetABI* a, KitCgTypeId t,
+ const RiscvAbiDesc* d, ABIArgInfo* out) {
AbiLeaf leaves[2];
- u32 n = rv64_collect_leaves(a, t, 0, leaves, /*cap=*/2u, /*written=*/0u);
+ u32 n = riscv_collect_leaves(a, t, 0, leaves, /*cap=*/2u, /*written=*/0u);
/* n > 2: bail; n == 0: caller already handled zero-size aggregates. */
if (n == 0 || n > 2) return 0;
u32 nfp = 0;
for (u32 i = 0; i < n; ++i) {
- if (leaves[i].scalar_kind == ABI_SC_FLOAT) ++nfp;
+ if (leaves[i].scalar_kind == ABI_SC_FLOAT) {
+ /* An FP leaf only stays in the FP file when it is FP-eligible. With
+ * soft float, or a double wider than flen, the aggregate must fall
+ * back to the GPR-pair path. */
+ if (!riscv_fp_eligible(d->flen, leaves[i].size)) return 0;
+ ++nfp;
+ }
/* ABI_SC_INT, ABI_SC_BOOL, ABI_SC_PTR all go to the GPR side. */
}
if (nfp == 0) return 0; /* pure-INT goes through the GPR-pair path. */
@@ -167,25 +227,26 @@ static int rv64_classify_fp_aggregate(TargetABI* a, KitCgTypeId t,
static void classify_aggregate(TargetABI* a, KitCgTypeId t, ABIArgInfo* out,
int is_return) {
+ RiscvAbiDesc d = riscv_abi_desc(a);
ABITypeInfo ti = abi_internal_type_info(a, t);
if (ti.size == 0) {
classify_void(out);
return;
}
- if (ti.size <= RV64_ABI_AGGREGATE_GPR_BYTES) {
+ if (ti.size <= d.aggregate_gpr_bytes) {
/* Per psABI: try the FP-aware refinement first (HFA / fp+int pair). */
- if (rv64_classify_fp_aggregate(a, t, out)) return;
- u32 nparts = (ti.size + RV64_ABI_GPR_BYTES - 1u) / RV64_ABI_GPR_BYTES;
+ if (riscv_classify_fp_aggregate(a, t, &d, out)) return;
+ u32 nparts = (ti.size + d.gpr_bytes - 1u) / d.gpr_bytes;
ABIArgPart* parts = arena_array(a->c->tu, ABIArgPart, nparts);
memset(parts, 0, sizeof(ABIArgPart) * nparts);
u32 off = 0;
for (u32 i = 0; i < nparts; ++i) {
- u32 chunk = (ti.size - off > RV64_ABI_GPR_BYTES) ? RV64_ABI_GPR_BYTES
- : (ti.size - off);
+ u32 chunk =
+ (ti.size - off > d.gpr_bytes) ? d.gpr_bytes : (ti.size - off);
parts[i].cls = ABI_CLASS_INT;
parts[i].loc = ABI_LOC_REG;
parts[i].size = chunk;
- parts[i].align = RV64_ABI_GPR_BYTES;
+ parts[i].align = d.gpr_bytes;
parts[i].src_offset = off;
off += chunk;
}
@@ -197,7 +258,7 @@ static void classify_aggregate(TargetABI* a, KitCgTypeId t, ABIArgInfo* out,
} else {
out->kind = ABI_ARG_INDIRECT;
out->flags = is_return ? ABI_AF_SRET : ABI_AF_BYVAL;
- out->indirect_align = ti.align ? ti.align : RV64_ABI_GPR_BYTES;
+ out->indirect_align = ti.align ? ti.align : d.gpr_bytes;
out->parts = NULL;
out->nparts = 0;
}
@@ -224,7 +285,7 @@ static void classify_one(TargetABI* a, KitCgTypeId t, ABIArgInfo* out,
}
}
-static ABIFuncInfo* rv64_compute_func_info(TargetABI* a, KitCgTypeId fn) {
+static ABIFuncInfo* riscv_compute_func_info(TargetABI* a, KitCgTypeId fn) {
ABIFuncInfo* info = arena_new(a->c->tu, ABIFuncInfo);
const CgType* fnty = cg_type_get(a->c, fn);
memset(info, 0, sizeof *info);
@@ -248,7 +309,7 @@ static ABIFuncInfo* rv64_compute_func_info(TargetABI* a, KitCgTypeId fn) {
}
const ABIVtable rv64_vtable = {
- .compute_func_info = rv64_compute_func_info,
+ .compute_func_info = riscv_compute_func_info,
.va_list_info = {8, 8, ABI_SC_PTR, 0, 0, 0},
/* LP64D va_list is a plain pointer, but the variadic register-save area is
* the 8 integer arg registers (a0..a7) spilled contiguously = 64 bytes; FP
@@ -262,3 +323,17 @@ const ABIVtable rv64_vtable = {
.gp_slot_size = 8,
.fp_slot_size = 0},
};
+
+const ABIVtable rv32_vtable = {
+ .compute_func_info = riscv_compute_func_info,
+ .va_list_info = {4, 4, ABI_SC_PTR, 0, 0, 0},
+ /* ILP32* va_list is a plain 4-byte pointer; the variadic register-save
+ * area is the 8 integer arg registers (a0..a7) spilled contiguously =
+ * 32 bytes. FP varargs are passed in GPRs, so there is no FP save area. */
+ .va_list_layout = {.type = {4, 4, ABI_SC_PTR, 0, 0, 0},
+ .kind = ABI_VA_LIST_POINTER,
+ .gp_reg_count = 8,
+ .fp_reg_count = 0,
+ .gp_slot_size = 4,
+ .fp_slot_size = 0},
+};
diff --git a/src/abi/registry.c b/src/abi/registry.c
@@ -15,6 +15,8 @@ typedef struct ABIImpl {
#define KIT_ABI_X64_ENABLED (KIT_ARCH_X64_ENABLED || KIT_ARCH_C_TARGET_ENABLED)
#define KIT_ABI_RV64_ENABLED \
(KIT_ARCH_RV64_ENABLED || KIT_ARCH_C_TARGET_ENABLED)
+#define KIT_ABI_RV32_ENABLED \
+ (KIT_ARCH_RV32_ENABLED || KIT_ARCH_C_TARGET_ENABLED)
static const ABIImpl abi_impls[] = {
#if KIT_ABI_AA64_ENABLED && KIT_OBJ_ELF_ENABLED
@@ -38,11 +40,14 @@ static const ABIImpl abi_impls[] = {
#if KIT_ABI_RV64_ENABLED && KIT_OBJ_ELF_ENABLED
{KIT_ARCH_RV64, KIT_OBJ_ELF, &rv64_vtable},
#endif
+#if KIT_ABI_RV32_ENABLED && KIT_OBJ_ELF_ENABLED
+ {KIT_ARCH_RV32, KIT_OBJ_ELF, &rv32_vtable},
+#endif
#if KIT_ARCH_WASM_ENABLED && KIT_OBJ_WASM_ENABLED
{KIT_ARCH_WASM, KIT_OBJ_WASM, &wasm32_vtable},
#endif
#if !KIT_ABI_AA64_ENABLED && !KIT_ABI_X64_ENABLED && !KIT_ABI_RV64_ENABLED && \
- !KIT_ARCH_WASM_ENABLED
+ !KIT_ABI_RV32_ENABLED && !KIT_ARCH_WASM_ENABLED
{KIT_ARCH_WASM, KIT_OBJ_WASM, NULL},
#endif
};
@@ -59,3 +64,4 @@ const ABIVtable* abi_vtable_lookup(KitArchKind arch, KitObjFmt obj) {
#undef KIT_ABI_AA64_ENABLED
#undef KIT_ABI_X64_ENABLED
#undef KIT_ABI_RV64_ENABLED
+#undef KIT_ABI_RV32_ENABLED
diff --git a/src/api/core.c b/src/api/core.c
@@ -101,6 +101,77 @@ KitStatus kit_target_new(const KitContext* ctx, const KitTargetOptions* opts,
opts->features[i].enabled);
}
+ /* Resolve & validate the float ABI for RISC-V targets only. Other arches
+ * leave float_abi at KIT_FLOAT_ABI_DEFAULT. */
+ if (opts->spec.arch == KIT_ARCH_RV32 || opts->spec.arch == KIT_ARCH_RV64) {
+ u32 fidx, didx;
+ int has_f = arch_target_feature_index(arch, kit_slice_cstr("f"), &fidx) &&
+ bitset_get(t->feature_words, t->nfeature_words, fidx);
+ int has_d = arch_target_feature_index(arch, kit_slice_cstr("d"), &didx) &&
+ bitset_get(t->feature_words, t->nfeature_words, didx);
+ KitFloatAbi fa;
+
+ if (opts->abi.s && opts->abi.len) {
+ int is_ilp32 = 0;
+ int is_lp64 = 0;
+ if (kit_slice_eq_cstr(opts->abi, "ilp32") ||
+ kit_slice_eq_cstr(opts->abi, "ilp32f") ||
+ kit_slice_eq_cstr(opts->abi, "ilp32d")) {
+ is_ilp32 = 1;
+ } else if (kit_slice_eq_cstr(opts->abi, "lp64") ||
+ kit_slice_eq_cstr(opts->abi, "lp64f") ||
+ kit_slice_eq_cstr(opts->abi, "lp64d")) {
+ is_lp64 = 1;
+ } else {
+ target_diag(ctx, "unsupported ABI for %s: %.*s", arch->name,
+ KIT_SLICE_ARG(opts->abi));
+ kit_target_free(t);
+ return KIT_INVALID;
+ }
+ /* Width prefix must match pointer size. */
+ if ((is_ilp32 && t->spec.ptr_size != 4u) ||
+ (is_lp64 && t->spec.ptr_size != 8u)) {
+ target_diag(ctx, "ABI %.*s does not match pointer width for %s",
+ KIT_SLICE_ARG(opts->abi), arch->name);
+ kit_target_free(t);
+ return KIT_INVALID;
+ }
+ if (kit_slice_eq_cstr(opts->abi, "ilp32d") ||
+ kit_slice_eq_cstr(opts->abi, "lp64d")) {
+ fa = KIT_FLOAT_ABI_DOUBLE;
+ } else if (kit_slice_eq_cstr(opts->abi, "ilp32f") ||
+ kit_slice_eq_cstr(opts->abi, "lp64f")) {
+ fa = KIT_FLOAT_ABI_SINGLE;
+ } else {
+ fa = KIT_FLOAT_ABI_SOFT;
+ }
+ } else {
+ /* Derive from the resolved -march feature bits. */
+ if (has_d)
+ fa = KIT_FLOAT_ABI_DOUBLE;
+ else if (has_f)
+ fa = KIT_FLOAT_ABI_SINGLE;
+ else
+ fa = KIT_FLOAT_ABI_SOFT;
+ }
+
+ if (fa == KIT_FLOAT_ABI_SINGLE && !has_f) {
+ target_diag(ctx,
+ "hardware single-float ABI requires the 'f' extension for %s",
+ arch->name);
+ kit_target_free(t);
+ return KIT_INVALID;
+ }
+ if (fa == KIT_FLOAT_ABI_DOUBLE && !has_d) {
+ target_diag(ctx,
+ "hardware double-float ABI requires the 'd' extension for %s",
+ arch->name);
+ kit_target_free(t);
+ return KIT_INVALID;
+ }
+ t->spec.float_abi = (uint8_t)fa;
+ }
+
*out = t;
return KIT_OK;
}
diff --git a/src/api/link.c b/src/api/link.c
@@ -139,6 +139,7 @@ KitStatus kit_link_session_new(KitCompiler* c,
break;
}
if (opts->linker_script) link_set_script(l, opts->linker_script);
+ if (opts->text_base_set) link_set_text_base(l, opts->text_base);
if (opts->entry.s && opts->entry.len) {
link_set_entry(l, opts->entry);
} else if (opts->pe_subsystem == KIT_PE_SUBSYSTEM_WINDOWS_GUI &&
diff --git a/src/api/object_detect.c b/src/api/object_detect.c
@@ -148,6 +148,11 @@ static KitStatus detect_elf(const u8* d, size_t len, KitTargetSpec* out) {
default:
return KIT_UNSUPPORTED;
}
+ /* EI_CLASS must agree with the arch's pointer width: 32-bit arches are
+ * ELFCLASS32, 64-bit arches ELFCLASS64. EM_RISCV is already disambiguated
+ * by class above; this also rejects a class/machine mismatch such as a
+ * 64-bit arch object whose EI_CLASS byte claims ELFCLASS32. */
+ if (ei_class != ((out->ptr_size == 4) ? 1u : 2u)) return KIT_MALFORMED;
if (ei_osabi == 0 || ei_osabi == 3)
out->os = KIT_OS_LINUX;
else
diff --git a/src/arch/mc.c b/src/arch/mc.c
@@ -757,6 +757,30 @@ void mc_emit_eh_frame(MCEmitter* m) {
mc->eh_frame_emitted = 1;
return;
}
+ /* Freestanding (bare-metal, target.os == none): emit no .eh_frame. kit marks
+ * .eh_frame SF_ALLOC so a hosted unwinder can consume it, but a bare-metal
+ * link (e.g. riscv32-none-elf) has no unwinder and would otherwise have to
+ * /DISCARD/ the orphaned ALLOC section. Drop CFI entirely there; hosted output
+ * (linux/macos/windows/freebsd/wasi) is unaffected and byte-identical. */
+ if (m->c->target.os == KIT_OS_FREESTANDING) {
+ heap = m->c->ctx->heap;
+ for (i = 0; i < mc->nfdes; ++i) {
+ if (mc->fdes[i].directives) {
+ heap->free(heap, mc->fdes[i].directives,
+ sizeof(CfiDirective) * mc->fdes[i].dir_cap);
+ mc->fdes[i].directives = NULL;
+ mc->fdes[i].dir_cap = 0;
+ }
+ }
+ if (mc->fdes) {
+ heap->free(heap, mc->fdes, sizeof(CfiFde) * mc->fdes_cap);
+ mc->fdes = NULL;
+ mc->fdes_cap = 0;
+ mc->nfdes = 0;
+ }
+ mc->eh_frame_emitted = 1;
+ return;
+ }
heap = m->c->ctx->heap;
fde_pe = (u8)(DW_EH_PE_pcrel | DW_EH_PE_sdata4);
diff --git a/src/arch/native_target.h b/src/arch/native_target.h
@@ -576,12 +576,25 @@ static inline MemAccess native_mem_for_type(NativeTarget* t, KitCgTypeId type,
return m;
}
-/* FP register class for a scalar type, FP only when float and <= 8 bytes.
- * aa64 keeps its own (the predicate is the same, but it pairs with a distinct
- * mem helper). */
+/* FP register class for a scalar type: a float value lives in an FP register
+ * only when the hardware float ABI has a register that wide. flen comes from
+ * the target float ABI (SINGLE->4, DOUBLE->8, SOFT->0); the DEFAULT/unset
+ * sentinel maps to the pointer width, which preserves the historical "FP iff
+ * float and <= 8 bytes" behavior for lp64d / x86-64 and yields the correct
+ * rv32 soft-double result (double is 8 bytes > flen=4 on ilp32f, > 0 on ilp32,
+ * so it is INT-class and never bit-cast through an FP register via fmv.d.x).
+ * aa64 keeps its own (same predicate, distinct mem helper). */
static inline NativeAllocClass native_class_for_type_fp_le8(NativeTarget* t,
KitCgTypeId type) {
- if (type && cg_type_is_float(t->c, type) && cg_type_size(t->c, type) <= 8u)
+ u32 flen;
+ switch (t->c->target.float_abi) {
+ case KIT_FLOAT_ABI_SINGLE: flen = 4u; break;
+ case KIT_FLOAT_ABI_DOUBLE: flen = 8u; break;
+ case KIT_FLOAT_ABI_SOFT: flen = 0u; break;
+ default: flen = t->c->target.ptr_size; break; /* DEFAULT: historical */
+ }
+ if (type && flen && cg_type_is_float(t->c, type) &&
+ cg_type_size(t->c, type) <= flen)
return NATIVE_REG_FP;
return NATIVE_REG_INT;
}
diff --git a/src/arch/registry.c b/src/arch/registry.c
@@ -21,6 +21,9 @@
#if KIT_ARCH_AA64_ENABLED
extern const ArchImpl arch_impl_aa64;
#endif
+#if KIT_ARCH_RV32_ENABLED
+extern const ArchImpl arch_impl_rv32;
+#endif
#if KIT_ARCH_RV64_ENABLED
extern const ArchImpl arch_impl_rv64;
#endif
@@ -47,6 +50,9 @@ static const ArchImpl* const arch_impls[] = {
#if KIT_ARCH_X64_ENABLED
&arch_impl_x64,
#endif
+#if KIT_ARCH_RV32_ENABLED
+ &arch_impl_rv32,
+#endif
#if KIT_ARCH_RV64_ENABLED
&arch_impl_rv64,
#endif
@@ -54,7 +60,7 @@ static const ArchImpl* const arch_impls[] = {
&arch_impl_wasm,
#endif
#if !KIT_ARCH_AA64_ENABLED && !KIT_ARCH_X64_ENABLED && \
- !KIT_ARCH_RV64_ENABLED && !KIT_ARCH_WASM_ENABLED
+ !KIT_ARCH_RV32_ENABLED && !KIT_ARCH_RV64_ENABLED && !KIT_ARCH_WASM_ENABLED
NULL,
#endif
};
diff --git a/src/arch/riscv/arch.c b/src/arch/riscv/arch.c
@@ -0,0 +1,424 @@
+#include "arch/arch.h"
+
+#include <string.h>
+
+#include "arch/riscv/asm.h"
+#include "arch/riscv/disasm.h"
+#include "arch/riscv/regs.h"
+#include "arch/riscv/rv64.h"
+#include "arch/riscv/variant.h"
+#include "cg/native_direct_target.h"
+#include "core/bytes.h"
+#include "link/link_arch.h"
+#include "obj/obj.h"
+
+extern const LinkArchDesc link_arch_rv64;
+extern const ArchDbgOps rv64_dbg_ops;
+extern const ArchEmuOps rv64_emu_ops;
+extern const ArchDwarfOps rv64_dwarf_ops;
+extern const ArchAsmOps rv64_asm_ops;
+extern const LinkArchDesc link_arch_rv32;
+extern const ArchDbgOps rv32_dbg_ops;
+
+static int rv64_register_at_public(uint32_t idx, KitArchReg* out) {
+ const char* nm = NULL;
+ int rc;
+ if (!out) return 1;
+ rc = rv64_register_iter_get(idx, &out->dwarf_idx, &nm);
+ if (rc == 0) out->name = kit_slice_cstr(nm);
+ return rc;
+}
+
+static SrcLoc rv64_no_loc(void) {
+ SrcLoc l = {0, 0, 0};
+ return l;
+}
+
+static int rv64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) {
+ const Section* s;
+ u8 cur[4];
+ u32 word;
+ u32 b;
+
+ (void)c;
+ if (!fx) return 1;
+ s = obj_section_get(fx->obj, fx->sec_id);
+ if (!s) return 0;
+
+ /* INTRA_AUIPC_ADDI is a width=8 pair; other kinds patch a single 4-byte
+ * instruction. Read the first word only for the 4-byte cases. */
+ if (fx->kind != R_RV_INTRA_AUIPC_ADDI) {
+ if (fx->width != 4) return 1;
+ buf_read(&s->bytes, fx->offset, cur, 4);
+ word = rd_u32_le(cur);
+ } else {
+ buf_read(&s->bytes, fx->offset, cur, 4);
+ word = rd_u32_le(cur);
+ }
+ b = (u32)fx->disp;
+
+ switch (fx->kind) {
+ case R_RV_BRANCH:
+ /* B-type reaches ±4 KiB. Conditional branches are emitted over a jal
+ * (see rv_cmp_branch) so this only carries small fixed displacements;
+ * a violation is a backend bug, not silently-truncated code. */
+ if ((i64)fx->disp < -(i64)(1 << 12) || (i64)fx->disp >= (i64)(1 << 12))
+ compiler_panic(c, rv64_no_loc(), "rv64: BRANCH out of range (±4KiB)");
+ word &= 0x01fff07fu;
+ word |= ((b >> 12) & 1u) << 31;
+ word |= ((b >> 5) & 0x3fu) << 25;
+ word |= ((b >> 1) & 0xfu) << 8;
+ word |= ((b >> 11) & 1u) << 7;
+ break;
+ case R_RV_JAL:
+ /* J-type reaches ±1 MiB — ample for intra-function jumps (including the
+ * long leg of a conditional branch). Fail loudly rather than wrap. */
+ if ((i64)fx->disp < -(i64)(1 << 20) || (i64)fx->disp >= (i64)(1 << 20))
+ compiler_panic(c, rv64_no_loc(), "rv64: JAL out of range (±1MiB)");
+ word &= 0x00000fffu;
+ word |= ((b >> 20) & 1u) << 31;
+ word |= ((b >> 1) & 0x3ffu) << 21;
+ word |= ((b >> 11) & 1u) << 20;
+ word |= ((b >> 12) & 0xffu) << 12;
+ break;
+ case R_RV_INTRA_AUIPC_ADDI: {
+ /* width=8: patch both the AUIPC at fx->offset and the ADDI at
+ * fx->offset+4. disp is the byte offset from the AUIPC PC to the
+ * target label. */
+ u8 cur2[4];
+ u32 word2;
+ i32 disp = (i32)fx->disp;
+ /* hi20 is the top 20 bits of (disp + 0x800) so the sign-extended
+ * 12-bit lo12 cancels out. */
+ u32 hi20 = (u32)((disp + 0x800) >> 12) & 0xfffffu;
+ u32 lo12 = (u32)disp & 0xfffu;
+ if (fx->width != 8) return 1;
+ /* AUIPC: keep rd (bits 11:7) and opcode (bits 6:0); patch imm[31:12]. */
+ word = (word & 0x00000fffu) | (hi20 << 12);
+ wr_u32_le(cur, word);
+ obj_patch(fx->obj, fx->sec_id, fx->offset, cur, 4);
+ buf_read(&s->bytes, fx->offset + 4, cur2, 4);
+ word2 = rd_u32_le(cur2);
+ /* ADDI: keep rs1/funct3/rd/opcode (bits 19:0); patch imm[11:0]. */
+ word2 = (word2 & 0x000fffffu) | (lo12 << 20);
+ wr_u32_le(cur2, word2);
+ obj_patch(fx->obj, fx->sec_id, fx->offset + 4, cur2, 4);
+ return 0;
+ }
+ default:
+ return 1;
+ }
+
+ wr_u32_le(cur, word);
+ obj_patch(fx->obj, fx->sec_id, fx->offset, cur, 4);
+ return 0;
+}
+
+/* Mirrors `clang --target=riscv64-linux-gnu -E -dM` for the in-scope
+ * RV64GC profile: I/M/F/D/A/C + Zicsr-minimal. Macros that depend on
+ * extensions outside scope (V, B, Zve*, Zfh, …) are deliberately
+ * absent. ABI variant is lp64d. */
+static const KitPredefinedMacro rv64_predefined_macros[] = {
+ {KIT_SLICE_LIT("__riscv"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_xlen"), KIT_SLICE_LIT("64")},
+ {KIT_SLICE_LIT("__riscv_float_abi_double"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_atomic"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_mul"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_div"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_muldiv"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_compressed"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_flen"), KIT_SLICE_LIT("64")},
+ {KIT_SLICE_LIT("__riscv_fdiv"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_fsqrt"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_zicsr"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_zifencei"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_arch_test"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__LP64__"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("_LP64"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__ORDER_LITTLE_ENDIAN__"), KIT_SLICE_LIT("1234")},
+ {KIT_SLICE_LIT("__ORDER_BIG_ENDIAN__"), KIT_SLICE_LIT("4321")},
+ {KIT_SLICE_LIT("__BYTE_ORDER__"), KIT_SLICE_LIT("__ORDER_LITTLE_ENDIAN__")},
+ {KIT_SLICE_LIT("__LITTLE_ENDIAN__"), KIT_SLICE_LIT("1")},
+};
+
+/* Mirrors `clang --target=riscv32-linux-gnu -march=rv32imafc_zicsr_zifencei
+ * -mabi=ilp32f -E -dM` for the DEFAULT ilp32f hard-single profile:
+ * I/M/F/A/C + Zicsr-minimal, single-precision float ABI (double is soft).
+ * __riscv_flen=32 (no D). __ILP32__/_ILP32 replace __LP64__/_LP64.
+ *
+ * Known v1 limitation: predefined macros are a static (ptr,count) table
+ * consumed without a Target (src/api/compile.c), so this fixed table reflects
+ * the default ilp32f profile only. Soft-float ilp32 codegen correctness is
+ * driven by c->target.float_abi, not these macros. */
+static const KitPredefinedMacro rv32_predefined_macros[] = {
+ {KIT_SLICE_LIT("__riscv"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_xlen"), KIT_SLICE_LIT("32")},
+ {KIT_SLICE_LIT("__riscv_float_abi_single"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_flen"), KIT_SLICE_LIT("32")},
+ {KIT_SLICE_LIT("__riscv_fdiv"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_fsqrt"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_atomic"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_mul"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_div"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_muldiv"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_compressed"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_zicsr"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_zifencei"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__riscv_arch_test"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__ILP32__"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("_ILP32"), KIT_SLICE_LIT("1")},
+ {KIT_SLICE_LIT("__ORDER_LITTLE_ENDIAN__"), KIT_SLICE_LIT("1234")},
+ {KIT_SLICE_LIT("__ORDER_BIG_ENDIAN__"), KIT_SLICE_LIT("4321")},
+ {KIT_SLICE_LIT("__BYTE_ORDER__"), KIT_SLICE_LIT("__ORDER_LITTLE_ENDIAN__")},
+ {KIT_SLICE_LIT("__LITTLE_ENDIAN__"), KIT_SLICE_LIT("1")},
+};
+
+enum {
+ RV64_FEAT_I = 0,
+ RV64_FEAT_M,
+ RV64_FEAT_A,
+ RV64_FEAT_F,
+ RV64_FEAT_D,
+ RV64_FEAT_C,
+ RV64_FEAT_ZICSR,
+ RV64_FEAT_ZIFENCEI,
+};
+
+static const ArchTargetFeature rv64_target_features[] = {
+ {"i"}, {"m"}, {"a"}, {"f"}, {"d"}, {"c"}, {"zicsr"}, {"zifencei"},
+};
+
+static void rv64_feature_set(u64* words, u32 nwords, u32 idx) {
+ if (!words || idx / 64u >= nwords) return;
+ words[idx / 64u] |= 1ull << (idx % 64u);
+}
+
+static void rv64_feature_clear(u64* words, u32 nwords, u32 idx) {
+ if (!words || idx / 64u >= nwords) return;
+ words[idx / 64u] &= ~(1ull << (idx % 64u));
+}
+
+static void rv64_feature_disable_all(u64* words, u32 nwords) {
+ rv64_feature_clear(words, nwords, RV64_FEAT_I);
+ rv64_feature_clear(words, nwords, RV64_FEAT_M);
+ rv64_feature_clear(words, nwords, RV64_FEAT_A);
+ rv64_feature_clear(words, nwords, RV64_FEAT_F);
+ rv64_feature_clear(words, nwords, RV64_FEAT_D);
+ rv64_feature_clear(words, nwords, RV64_FEAT_C);
+ rv64_feature_clear(words, nwords, RV64_FEAT_ZICSR);
+ rv64_feature_clear(words, nwords, RV64_FEAT_ZIFENCEI);
+}
+
+static void rv64_feature_enable_g(u64* words, u32 nwords) {
+ rv64_feature_set(words, nwords, RV64_FEAT_I);
+ rv64_feature_set(words, nwords, RV64_FEAT_M);
+ rv64_feature_set(words, nwords, RV64_FEAT_A);
+ rv64_feature_set(words, nwords, RV64_FEAT_F);
+ rv64_feature_set(words, nwords, RV64_FEAT_D);
+ rv64_feature_set(words, nwords, RV64_FEAT_ZICSR);
+ rv64_feature_set(words, nwords, RV64_FEAT_ZIFENCEI);
+}
+
+static int rv64_has_prefix(const char* p, const char* end, const char* lit) {
+ size_t n = strlen(lit);
+ return (size_t)(end - p) >= n && memcmp(p, lit, n) == 0;
+}
+
+static void rv64_skip_version(const char** pp, const char* end) {
+ const char* p = *pp;
+ while (p < end && ((*p >= '0' && *p <= '9') || *p == 'p')) ++p;
+ *pp = p;
+}
+
+static KitStatus rv64_target_feature_apply_isa(const Target* target,
+ KitSlice isa, u64* words,
+ u32 nwords) {
+ const char* p;
+ const char* end;
+ const RiscvVariant* v = riscv_variant_for_kind(target->arch);
+ if (isa.len < 5 || memcmp(isa.s, v->isa_prefix, 4) != 0) return KIT_UNSUPPORTED;
+ p = isa.s + 4;
+ end = isa.s + isa.len;
+ rv64_feature_disable_all(words, nwords);
+ while (p < end) {
+ if (*p == '_') {
+ ++p;
+ continue;
+ }
+ switch (*p) {
+ case 'i':
+ rv64_feature_set(words, nwords, RV64_FEAT_I);
+ ++p;
+ rv64_skip_version(&p, end);
+ continue;
+ case 'm':
+ rv64_feature_set(words, nwords, RV64_FEAT_M);
+ ++p;
+ rv64_skip_version(&p, end);
+ continue;
+ case 'a':
+ rv64_feature_set(words, nwords, RV64_FEAT_A);
+ ++p;
+ rv64_skip_version(&p, end);
+ continue;
+ case 'f':
+ rv64_feature_set(words, nwords, RV64_FEAT_F);
+ ++p;
+ rv64_skip_version(&p, end);
+ continue;
+ case 'd':
+ rv64_feature_set(words, nwords, RV64_FEAT_D);
+ ++p;
+ rv64_skip_version(&p, end);
+ continue;
+ case 'c':
+ rv64_feature_set(words, nwords, RV64_FEAT_C);
+ ++p;
+ rv64_skip_version(&p, end);
+ continue;
+ case 'g':
+ rv64_feature_enable_g(words, nwords);
+ ++p;
+ rv64_skip_version(&p, end);
+ continue;
+ case 'z':
+ if (rv64_has_prefix(p, end, "zicsr")) {
+ rv64_feature_set(words, nwords, RV64_FEAT_ZICSR);
+ p += 5;
+ rv64_skip_version(&p, end);
+ continue;
+ }
+ if (rv64_has_prefix(p, end, "zifencei")) {
+ rv64_feature_set(words, nwords, RV64_FEAT_ZIFENCEI);
+ p += 8;
+ rv64_skip_version(&p, end);
+ continue;
+ }
+ break;
+ }
+ return KIT_UNSUPPORTED;
+ }
+ return KIT_OK;
+}
+
+static void rv64_target_feature_defaults(const Target* target, u64* words,
+ u32 nwords) {
+ rv64_feature_set(words, nwords, RV64_FEAT_I);
+ rv64_feature_set(words, nwords, RV64_FEAT_M);
+ rv64_feature_set(words, nwords, RV64_FEAT_A);
+ rv64_feature_set(words, nwords, RV64_FEAT_F);
+ /* rv32 default profile is rv32imafc_zicsr_zifencei (ilp32f hard-single) —
+ * no D. rv64 keeps the full G+C (lp64d) profile including D. */
+ if (target->arch != KIT_ARCH_RV32) rv64_feature_set(words, nwords, RV64_FEAT_D);
+ rv64_feature_set(words, nwords, RV64_FEAT_C);
+ rv64_feature_set(words, nwords, RV64_FEAT_ZICSR);
+ rv64_feature_set(words, nwords, RV64_FEAT_ZIFENCEI);
+}
+
+static CgTarget* rv64_backend_make(Compiler* c, ObjBuilder* o,
+ const KitCodeOptions* opts) {
+ MCEmitter* mc = NULL;
+ Debug* debug = NULL;
+ CgTarget* t;
+ NativeTarget* native;
+ NativeDirectTargetConfig cfg;
+ if (cg_mc_debug_new(c, o, opts, &mc, &debug) != KIT_OK) return NULL;
+ native = rv64_native_target_new(c, o, mc);
+ if (!native) return NULL;
+ memset(&cfg, 0, sizeof cfg);
+ cfg.native = native;
+ cfg.ops = rv64_native_direct_ops();
+ t = native_direct_target_new(c, o, &cfg);
+ if (t) t->debug = debug;
+ return t;
+}
+
+static CgTarget* rv64_semantic_target_new(Compiler* c, ObjBuilder* o,
+ MCEmitter* mc) {
+ NativeTarget* native;
+ NativeDirectTargetConfig cfg;
+ if (!mc) mc = mc_new(c, o);
+ native = rv64_native_target_new(c, o, mc);
+ if (!native) return NULL;
+ memset(&cfg, 0, sizeof cfg);
+ cfg.native = native;
+ cfg.ops = rv64_native_direct_ops();
+ return native_direct_target_new(c, o, &cfg);
+}
+
+const ArchImpl arch_impl_rv64 = {
+ .backend = {.name = "rv64", .make = rv64_backend_make},
+ .kind = KIT_ARCH_RV64,
+ .name = "rv64",
+ .cgtarget_new = rv64_semantic_target_new,
+ .asm_new = rv64_arch_asm_new,
+ .disasm_new = rv64_disasm_new,
+ .apply_label_fixup = rv64_apply_label_fixup,
+ .decode = &rv64_decode_ops,
+ .emu = &rv64_emu_ops,
+ .link = &link_arch_rv64,
+ .dwarf = &rv64_dwarf_ops,
+ .dbg = &rv64_dbg_ops,
+ .asm_ops = &rv64_asm_ops,
+ .predefined_macros = rv64_predefined_macros,
+ .npredefined_macros =
+ (u32)(sizeof rv64_predefined_macros / sizeof rv64_predefined_macros[0]),
+ .target_features = rv64_target_features,
+ .ntarget_features =
+ (u32)(sizeof rv64_target_features / sizeof rv64_target_features[0]),
+ .target_feature_defaults = rv64_target_feature_defaults,
+ .target_feature_apply_isa = rv64_target_feature_apply_isa,
+ .register_name = rv64_register_name,
+ .register_index = rv64_register_index,
+ .register_count = rv64_register_iter_size,
+ .register_at = rv64_register_at_public,
+ /* RISC-V psABI: return address in x1 (ra). 4-byte aligned insns
+ * (cover 2-byte C-ext too via code_align=2). Data align -8 for
+ * doubleword stack stride. CFA = sp at entry. */
+ .cfi_return_addr_reg = 1u,
+ .cfi_code_align_factor = 2,
+ .cfi_data_align_factor = -8,
+ .cfi_cfa_init_reg = 2u,
+ .cfi_cfa_init_offset = 0,
+};
+
+/* RV32 shares nearly all of the RISC-V backend with rv64 — the per-XLEN
+ * differences are threaded through RiscvVariant inside the shared functions.
+ * Differs only in: backend/arch names + kind, the link descriptor + dbg ops
+ * (rv_lw / 2-byte min insn), the ilp32f predefined-macro table, and the CFI
+ * data alignment factor (-4 word stride vs rv64's -8 doubleword). */
+const ArchImpl arch_impl_rv32 = {
+ .backend = {.name = "rv32", .make = rv64_backend_make},
+ .kind = KIT_ARCH_RV32,
+ .name = "rv32",
+ .cgtarget_new = rv64_semantic_target_new,
+ .asm_new = rv64_arch_asm_new,
+ .disasm_new = rv64_disasm_new,
+ .apply_label_fixup = rv64_apply_label_fixup,
+ .decode = &rv64_decode_ops,
+ .emu = &rv64_emu_ops,
+ .link = &link_arch_rv32,
+ .dwarf = &rv64_dwarf_ops,
+ .dbg = &rv32_dbg_ops,
+ .asm_ops = &rv64_asm_ops,
+ .predefined_macros = rv32_predefined_macros,
+ .npredefined_macros =
+ (u32)(sizeof rv32_predefined_macros / sizeof rv32_predefined_macros[0]),
+ .target_features = rv64_target_features,
+ .ntarget_features =
+ (u32)(sizeof rv64_target_features / sizeof rv64_target_features[0]),
+ .target_feature_defaults = rv64_target_feature_defaults,
+ .target_feature_apply_isa = rv64_target_feature_apply_isa,
+ .register_name = rv64_register_name,
+ .register_index = rv64_register_index,
+ .register_count = rv64_register_iter_size,
+ .register_at = rv64_register_at_public,
+ /* RISC-V psABI: return address in x1 (ra). 4-byte aligned insns
+ * (cover 2-byte C-ext too via code_align=2). Data align -4 for
+ * word stack stride (rv32). CFA = sp at entry. */
+ .cfi_return_addr_reg = 1u,
+ .cfi_code_align_factor = 2,
+ .cfi_data_align_factor = -4,
+ .cfi_cfa_init_reg = 2u,
+ .cfi_cfa_init_offset = 0,
+};
diff --git a/src/arch/riscv/asm.c b/src/arch/riscv/asm.c
@@ -0,0 +1,1492 @@
+/* RV64 assembler — descriptor-table driven.
+ *
+ * Mnemonic → Rv64InsnDesc via rv64_asm_find; operand parsing dispatches
+ * on the format kind. The descriptor's `match` field already carries
+ * the funct3/funct7/opcode bits; the parser only needs to fill in the
+ * register operands and immediate.
+ *
+ * Aliases (li, mv, ret, jr, j, nop, sext.w, beqz, bnez) are recognized
+ * by their alias rows in the descriptor table and rewritten to the
+ * canonical encoding here. Inline rv_* encoders in isa.h remain the
+ * hot path for codegen; the assembler uses them to assemble the
+ * machine word once it has the operand values. */
+
+#include "arch/riscv/asm.h"
+
+#include <string.h>
+
+#include "arch/riscv/isa.h"
+#include "arch/riscv/regs.h"
+#include "arch/riscv/rv64.h"
+#include "arch/riscv/variant.h"
+#include "asm/asm_helpers.h"
+#include "core/arena.h"
+#include "core/pool.h"
+#include "core/slice.h"
+#include "core/strbuf.h"
+#include "obj/obj.h"
+
+struct Rv64Asm {
+ ArchAsm base;
+ Compiler* c;
+
+ /* Inline-asm bound state (set by rv64_inline_bind, cleared otherwise).
+ * Operand indexing per GCC convention: 0..nout-1 are outputs, then
+ * nout..nout+nin-1 are inputs. Templates address into this combined
+ * list via %N / %zN / %aN / %w[name] / %x[name]. */
+ const AsmConstraint* outs;
+ Operand* out_ops;
+ const AsmConstraint* ins;
+ const Operand* in_ops;
+ const Sym* clobbers;
+ u32 nout;
+ u32 nin;
+ u32 nclob;
+};
+
+typedef struct Rv64Asm Rv64Asm;
+
+/* Relocation modifier on a 12-bit immediate offset (`%lo`/`%pcrel_lo`).
+ * RV_MEMMOD_NONE means a plain numeric displacement in `disp`. */
+typedef enum RvMemMod {
+ RV_MEMMOD_NONE = 0,
+ RV_MEMMOD_LO,
+ RV_MEMMOD_PCREL_LO,
+} RvMemMod;
+
+typedef struct Rv64Mem {
+ i32 disp;
+ u32 base;
+ RvMemMod mod; /* reloc modifier on the offset, or RV_MEMMOD_NONE */
+ ObjSymId sym; /* symbol when mod != NONE */
+ i64 off; /* addend when mod != NONE */
+} Rv64Mem;
+
+static int sym_to_cstr(AsmDriver* d, Sym s, char* out, size_t cap) {
+ Slice sl = pool_slice(asm_driver_pool(d), s);
+ if (!sl.s || sl.len >= cap) return 0;
+ memcpy(out, sl.s, sl.len);
+ out[sl.len] = '\0';
+ return 1;
+}
+
+/* True if `s` begins with the NUL-terminated literal `pfx` (length-explicit).
+ */
+static bool slice_has_prefix_cstr(Slice s, const char* pfx, size_t n) {
+ return s.len >= n && memcmp(s.s, pfx, n) == 0;
+}
+
+static int rv_reg_from_name(AsmDriver* d, Sym s, u32* reg_out, int* fp_out) {
+ char name[16];
+ uint32_t dwarf = 0;
+ if (!sym_to_cstr(d, s, name, sizeof name)) return 0;
+ if (rv64_register_index(name, &dwarf) != 0) return 0;
+ if (reg_out) *reg_out = dwarf & 31u;
+ if (fp_out) *fp_out = dwarf >= 32u;
+ return 1;
+}
+
+static u32 parse_reg(AsmDriver* d, int* fp_out) {
+ AsmTok t = asm_driver_next(d);
+ u32 r;
+ if (t.kind != ASM_TOK_IDENT || !rv_reg_from_name(d, t.v.ident, &r, fp_out))
+ asm_driver_panic(d, "rv64 asm: bad register");
+ return r;
+}
+
+static u32 parse_xreg(AsmDriver* d) {
+ int fp = 0;
+ u32 r = parse_reg(d, &fp);
+ if (fp) asm_driver_panic(d, "rv64 asm: expected integer register");
+ return r;
+}
+
+static u32 parse_freg(AsmDriver* d) {
+ int fp = 0;
+ u32 r = parse_reg(d, &fp);
+ if (!fp) asm_driver_panic(d, "rv64 asm: expected float register");
+ return r;
+}
+
+static void expect_comma(AsmDriver* d) {
+ if (!asm_driver_eat_comma(d)) asm_driver_panic(d, "rv64 asm: expected ','");
+}
+
+/* Position of a `%mod(sym)` relocation operand: the 20-bit upper field of
+ * lui/auipc, or a 12-bit I-type (addi/load) or S-type (store) immediate. */
+typedef enum RvModPos {
+ RV_MODPOS_HI20,
+ RV_MODPOS_LO_I,
+ RV_MODPOS_LO_S,
+} RvModPos;
+
+/* Map a relocation-modifier name (`hi`, `lo`, `pcrel_hi`, `pcrel_lo`,
+ * `got_pcrel_hi`) to the RelocKind appropriate for `pos`. Panics on a name
+ * that is not valid at this operand position. */
+static RelocKind rv_mod_to_reloc(AsmDriver* d, Slice name, RvModPos pos) {
+ if (pos == RV_MODPOS_HI20) {
+ if (slice_eq_cstr(name, "hi")) return R_RV_HI20;
+ if (slice_eq_cstr(name, "pcrel_hi")) return R_RV_PCREL_HI20;
+ if (slice_eq_cstr(name, "got_pcrel_hi")) return R_RV_GOT_HI20;
+ } else {
+ int store = (pos == RV_MODPOS_LO_S);
+ if (slice_eq_cstr(name, "lo")) return store ? R_RV_LO12_S : R_RV_LO12_I;
+ if (slice_eq_cstr(name, "pcrel_lo"))
+ return store ? R_RV_PCREL_LO12_S : R_RV_PCREL_LO12_I;
+ }
+ asm_driver_panic(d, "rv64 asm: relocation modifier not valid here");
+}
+
+/* If the next token is `%`, parse a `%mod(sym{+off})` relocation operand,
+ * emit the relocation at the current emit position (where the about-to-be-
+ * returned instruction word will land), and return 1. The caller encodes a
+ * zero placeholder in the immediate field. Returns 0 if there is no modifier
+ * (leaving the stream untouched for normal constant parsing). A leading `%`
+ * is unambiguous here: modulo is infix and never starts an operand. */
+static int rv_parse_mod_reloc(AsmDriver* d, RvModPos pos, ObjSymId* sym_out,
+ i64* off_out, RelocKind* kind_out) {
+ if (!asm_driver_tok_is_punct(asm_driver_peek(d), '%')) return 0;
+ (void)asm_driver_next(d); /* eat '%' */
+ AsmTok name = asm_driver_next(d);
+ if (name.kind != ASM_TOK_IDENT)
+ asm_driver_panic(d, "rv64 asm: expected relocation modifier name");
+ Slice nm = pool_slice(asm_driver_pool(d), name.v.ident);
+ asm_driver_expect_punct(d, '(', "'(' after relocation modifier");
+ ObjSymId sym = OBJ_SYM_NONE;
+ i64 off = 0;
+ asm_driver_parse_sym_expr(d, &sym, &off);
+ asm_driver_expect_punct(d, ')', "')' after %mod(sym)");
+ RelocKind k = rv_mod_to_reloc(d, nm, pos);
+ if (sym_out) *sym_out = sym;
+ if (off_out) *off_out = off;
+ if (kind_out) *kind_out = k;
+ return 1;
+}
+
+/* Parse a RISC-V rounding-mode mnemonic (the comma is already consumed) into
+ * its 3-bit funct3 value. cc -S emits this suffix on fcvt/fsqrt when the mode
+ * isn't the default `dyn`, so the round-trip (and clang) re-encode the exact
+ * mode rather than guessing a default. */
+static u32 rv_parse_rm_name(AsmDriver* d) {
+ AsmTok t = asm_driver_next(d);
+ Slice s;
+ if (t.kind != ASM_TOK_IDENT)
+ asm_driver_panic(d, "rv64 asm: expected rounding mode");
+ s = pool_slice(asm_driver_pool(d), t.v.ident);
+ if (slice_eq_cstr(s, "rne")) return 0u;
+ if (slice_eq_cstr(s, "rtz")) return 1u;
+ if (slice_eq_cstr(s, "rdn")) return 2u;
+ if (slice_eq_cstr(s, "rup")) return 3u;
+ if (slice_eq_cstr(s, "rmm")) return 4u;
+ if (slice_eq_cstr(s, "dyn")) return 7u;
+ asm_driver_panic(d, "rv64 asm: unknown rounding mode");
+}
+
+/* Emit a relocation for a U-type / I-type immediate `%mod(sym)` operand at
+ * the current instruction position; returns 1 if one was present. */
+static int rv_emit_imm_mod_reloc(AsmDriver* d, RvModPos pos) {
+ ObjSymId sym;
+ i64 off;
+ RelocKind k;
+ if (!rv_parse_mod_reloc(d, pos, &sym, &off, &k)) return 0;
+ MCEmitter* mc = asm_driver_mc(d);
+ mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), k, sym, off, 0, 0);
+ return 1;
+}
+
+static Rv64Mem parse_mem(AsmDriver* d) {
+ Rv64Mem m;
+ m.disp = 0;
+ m.mod = RV_MEMMOD_NONE;
+ m.sym = OBJ_SYM_NONE;
+ m.off = 0;
+ if (asm_driver_tok_is_punct(asm_driver_peek(d), '%')) {
+ /* `%lo(sym)(base)` / `%pcrel_lo(label)(base)` — record the modifier; the
+ * load/store caller emits the I- or S-type relocation. */
+ ObjSymId sym;
+ i64 off;
+ RelocKind k;
+ (void)rv_parse_mod_reloc(d, RV_MODPOS_LO_I, &sym, &off, &k);
+ m.mod = (k == R_RV_PCREL_LO12_I) ? RV_MEMMOD_PCREL_LO : RV_MEMMOD_LO;
+ m.sym = sym;
+ m.off = off;
+ } else {
+ m.disp = (i32)asm_driver_parse_const(d);
+ }
+ asm_driver_expect_punct(d, '(', "'(' in rv64 memory operand");
+ m.base = parse_xreg(d);
+ asm_driver_expect_punct(d, ')', "')' in rv64 memory operand");
+ return m;
+}
+
+/* Emit the I/S-type relocation recorded by parse_mem for a `%lo`/`%pcrel_lo`
+ * memory offset, picking the S-type variant for stores. */
+static void rv_emit_mem_mod_reloc(AsmDriver* d, const Rv64Mem* m,
+ int is_store) {
+ if (m->mod == RV_MEMMOD_NONE) return;
+ RelocKind k = (m->mod == RV_MEMMOD_PCREL_LO)
+ ? (is_store ? R_RV_PCREL_LO12_S : R_RV_PCREL_LO12_I)
+ : (is_store ? R_RV_LO12_S : R_RV_LO12_I);
+ MCEmitter* mc = asm_driver_mc(d);
+ mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), k, m->sym, m->off, 0, 0);
+}
+
+/* Fence pred/succ parser — accepts a string like "rw" / "iorw" / "0" /
+ * a numeric literal. Returns the 4-bit mask: bit3=i, bit2=o, bit1=r,
+ * bit0=w. */
+static u32 parse_fence_mask(AsmDriver* d) {
+ AsmTok t = asm_driver_peek(d);
+ if (t.kind == ASM_TOK_NUM) {
+ (void)asm_driver_next(d);
+ return (u32)asm_driver_parse_const(d) & 0xfu;
+ }
+ if (t.kind == ASM_TOK_IDENT) {
+ char name[8];
+ AsmTok tt = asm_driver_next(d);
+ if (!sym_to_cstr(d, tt.v.ident, name, sizeof name))
+ asm_driver_panic(d, "rv64 asm: bad fence mask");
+ u32 mask = 0;
+ for (const char* p = name; *p; ++p) {
+ switch (*p) {
+ case 'i':
+ mask |= 8u;
+ break;
+ case 'o':
+ mask |= 4u;
+ break;
+ case 'r':
+ mask |= 2u;
+ break;
+ case 'w':
+ mask |= 1u;
+ break;
+ default:
+ asm_driver_panic(d, "rv64 asm: bad fence char");
+ }
+ }
+ return mask;
+ }
+ asm_driver_panic(d, "rv64 asm: bad fence operand");
+}
+
+/* The XLEN variant for the assembly target. Reached off the AsmDriver's
+ * Compiler so the stateless encoders can gate rv32-vs-rv64 behavior
+ * (shamt width, addiw availability). */
+static const RiscvVariant* rv_asm_variant(AsmDriver* d) {
+ return riscv_variant_for_kind(asm_driver_compiler(d)->target.arch);
+}
+
+/* Field overlay onto a descriptor's `match` word.
+ *
+ * For most formats the descriptor's match already pins opcode +
+ * funct3 + funct7. We OR in the per-operand fields. For shift-imm and
+ * AMO families the layouts diverge from the basic R/I templates — we
+ * handle those explicitly below. */
+
+static u32 enc_r(u32 match, u32 rd, u32 rs1, u32 rs2) {
+ return match | ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
+ ((rd & 0x1fu) << 7);
+}
+static u32 enc_i(u32 match, u32 rd, u32 rs1, i32 imm12) {
+ return match | (((u32)imm12 & 0xfffu) << 20) | ((rs1 & 0x1fu) << 15) |
+ ((rd & 0x1fu) << 7);
+}
+static u32 enc_s(u32 match, u32 rs2, u32 rs1, i32 imm12) {
+ u32 ui = (u32)imm12 & 0xfffu;
+ return match | ((ui >> 5) << 25) | ((rs2 & 0x1fu) << 20) |
+ ((rs1 & 0x1fu) << 15) | ((ui & 0x1fu) << 7);
+}
+static u32 enc_b(u32 match, u32 rs1, u32 rs2, i32 imm13) {
+ u32 ui = (u32)imm13;
+ return match | (((ui >> 12) & 1u) << 31) | (((ui >> 5) & 0x3fu) << 25) |
+ ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
+ (((ui >> 1) & 0xfu) << 8) | (((ui >> 11) & 1u) << 7);
+}
+static u32 enc_u(u32 match, u32 rd, u32 imm20) {
+ return match | ((imm20 & 0xfffffu) << 12) | ((rd & 0x1fu) << 7);
+}
+static u32 enc_j(u32 match, u32 rd, i32 imm21) {
+ u32 ui = (u32)imm21;
+ return match | (((ui >> 20) & 1u) << 31) | (((ui >> 1) & 0x3ffu) << 21) |
+ (((ui >> 11) & 1u) << 20) | (((ui >> 12) & 0xffu) << 12) |
+ ((rd & 0x1fu) << 7);
+}
+static u32 enc_r4(u32 match, u32 rd, u32 rs1, u32 rs2, u32 rs3, u32 rm) {
+ return match | ((rs3 & 0x1fu) << 27) | ((rs2 & 0x1fu) << 20) |
+ ((rs1 & 0x1fu) << 15) | ((rm & 0x7u) << 12) | ((rd & 0x1fu) << 7);
+}
+
+/* SLLI/SRLI/SRAI shift-imm. The shamt occupies bits 25:20 on rv64 (6-bit,
+ * funct6 in match) but only bits 24:20 on rv32 (5-bit; bit 25 belongs to
+ * funct7 and MUST stay 0, else the word reads as a different funct7). The
+ * variant's shamt_bits drives the mask; an rv32 shamt >= 32 is rejected. */
+static u32 enc_ishift(AsmDriver* d, u32 match, u32 rd, u32 rs1, u32 shamt) {
+ u32 shamt_bits = rv_asm_variant(d)->shamt_bits;
+ u32 shamt_mask = (shamt_bits == 5u) ? 0x1fu : 0x3fu;
+ if (shamt > shamt_mask)
+ asm_driver_panic(d, "rv64 asm: shift amount out of range for target XLEN");
+ return match | ((shamt & shamt_mask) << 20) | ((rs1 & 0x1fu) << 15) |
+ ((rd & 0x1fu) << 7);
+}
+/* RV32 word shift-imm: shamt5 occupies bits 24:20 (funct7 already pinned). */
+static u32 enc_ishiftw(u32 match, u32 rd, u32 rs1, u32 shamt) {
+ return match | ((shamt & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
+ ((rd & 0x1fu) << 7);
+}
+/* AMO: aq/rl bits 26/25 — we accept them as optional .aq/.rl suffixes
+ * on the mnemonic. For now mnemonics arrive bare. */
+static u32 enc_amo(u32 match, u32 aq, u32 rl, u32 rd, u32 rs1, u32 rs2) {
+ return match | ((aq & 1u) << 26) | ((rl & 1u) << 25) | ((rs2 & 0x1fu) << 20) |
+ ((rs1 & 0x1fu) << 15) | ((rd & 0x1fu) << 7);
+}
+
+static u32 c_reg3(AsmDriver* d, u32 r) {
+ if (r < 8u || r > 15u)
+ asm_driver_panic(d,
+ "rv64 asm: compressed register must be x8..x15/f8..f15");
+ return r - 8u;
+}
+
+static u32 enc_c_ci(u32 match, u32 rd, i32 imm) {
+ u32 u = (u32)imm & 0x3fu;
+ return match | (((u >> 5) & 1u) << 12) | ((rd & 0x1fu) << 7) |
+ ((u & 0x1fu) << 2);
+}
+
+static u32 enc_c_cr(u32 match, u32 rd_rs1, u32 rs2) {
+ return match | ((rd_rs1 & 0x1fu) << 7) | ((rs2 & 0x1fu) << 2);
+}
+
+static u32 enc_c_addi16sp(u32 match, i32 imm) {
+ u32 u = (u32)imm & 0x3ffu;
+ return match | (((u >> 9) & 1u) << 12) | (((u >> 4) & 1u) << 6) |
+ (((u >> 6) & 1u) << 5) | (((u >> 7) & 3u) << 3) |
+ (((u >> 5) & 1u) << 2);
+}
+
+static u32 enc_c_addi4spn(u32 match, u32 rd3, u32 imm) {
+ u32 enc = (((imm >> 4) & 3u) << 6) | (((imm >> 6) & 0xfu) << 2) |
+ (((imm >> 2) & 1u) << 1) | ((imm >> 3) & 1u);
+ return match | ((enc & 0xffu) << 5) | ((rd3 & 7u) << 2);
+}
+
+static u32 enc_c_lwld(u32 match, u32 rd3, u32 rs1_3, u32 off, int wide64) {
+ if (wide64) {
+ return match | (((off >> 3) & 7u) << 10) | ((rs1_3 & 7u) << 7) |
+ (((off >> 6) & 3u) << 5) | ((rd3 & 7u) << 2);
+ }
+ return match | (((off >> 3) & 7u) << 10) | ((rs1_3 & 7u) << 7) |
+ (((off >> 2) & 1u) << 6) | (((off >> 6) & 1u) << 5) |
+ ((rd3 & 7u) << 2);
+}
+
+static u32 enc_c_swld(u32 match, u32 rs2_3, u32 rs1_3, u32 off, int wide64) {
+ return enc_c_lwld(match, rs2_3, rs1_3, off, wide64);
+}
+
+static u32 enc_c_lwsp(u32 match, u32 rd, u32 off, int wide64) {
+ if (wide64) {
+ return match | (((off >> 5) & 1u) << 12) | ((rd & 0x1fu) << 7) |
+ (((off >> 3) & 3u) << 5) | (((off >> 6) & 7u) << 2);
+ }
+ return match | (((off >> 5) & 1u) << 12) | ((rd & 0x1fu) << 7) |
+ (((off >> 2) & 7u) << 4) | (((off >> 6) & 3u) << 2);
+}
+
+static u32 enc_c_swsp(u32 match, u32 rs2, u32 off, int wide64) {
+ u32 imm6;
+ if (wide64)
+ imm6 = (((off >> 3) & 7u) << 3) | ((off >> 6) & 7u);
+ else
+ imm6 = (((off >> 2) & 0xfu) << 2) | ((off >> 6) & 3u);
+ return match | ((imm6 & 0x3fu) << 7) | ((rs2 & 0x1fu) << 2);
+}
+
+static u32 enc_c_cb_imm(u32 match, u32 rs1_3, i32 imm) {
+ u32 u = (u32)imm & 0x1ffu;
+ return match | (((u >> 8) & 1u) << 12) | (((u >> 3) & 3u) << 10) |
+ ((rs1_3 & 7u) << 7) | (((u >> 6) & 3u) << 5) | (((u >> 1) & 3u) << 3) |
+ (((u >> 5) & 1u) << 2);
+}
+
+static u32 enc_c_cb_alu_imm(u32 match, u32 rd3, i32 imm) {
+ u32 u = (u32)imm & 0x3fu;
+ return match | (((u >> 5) & 1u) << 12) | ((rd3 & 7u) << 7) |
+ ((u & 0x1fu) << 2);
+}
+
+static u32 enc_c_cj(u32 match, i32 imm) {
+ u32 u = (u32)imm & 0xfffu;
+ return match | (((u >> 11) & 1u) << 12) | (((u >> 4) & 1u) << 11) |
+ (((u >> 8) & 3u) << 9) | (((u >> 10) & 1u) << 8) |
+ (((u >> 6) & 1u) << 7) | (((u >> 7) & 1u) << 6) |
+ (((u >> 1) & 7u) << 3) | (((u >> 5) & 1u) << 2);
+}
+
+/* Parse a branch/jump target operand. With a symbolic target (a label), emit
+ * the relocation at the current position — which is exactly where the caller
+ * is about to write this instruction word — and return 0 as the placeholder
+ * immediate. With a bare constant, return it as the PC-relative byte
+ * displacement (preserving the existing numeric-offset corpus behavior). */
+static i32 rv_reloc_target(AsmDriver* d, RelocKind kind) {
+ ObjSymId sym = OBJ_SYM_NONE;
+ i64 off = 0;
+ asm_driver_parse_sym_expr(d, &sym, &off);
+ if (sym != OBJ_SYM_NONE) {
+ MCEmitter* mc = asm_driver_mc(d);
+ mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), kind, sym, off, 0, 0);
+ return 0;
+ }
+ return (i32)off;
+}
+
+/* Per-format parser — reads the operand list off the driver and returns
+ * the encoded 32-bit word, given the matched descriptor. */
+static u32 assemble_one(AsmDriver* d, const Rv64InsnDesc* desc) {
+ u32 m = desc->match;
+ u32 rd = 0, rs1 = 0, rs2 = 0;
+ i32 imm = 0;
+ Rv64Mem mem;
+
+ switch ((Rv64Format)desc->fmt) {
+ case RV64_FMT_R:
+ /* Two-operand aliases: snez/neg/negw — rd, rs (rs1=x0). */
+ if (desc->flags & RV64_ASMFL_ALIAS) {
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs2 = parse_xreg(d);
+ return enc_r(m, rd, 0u, rs2);
+ }
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ expect_comma(d);
+ rs2 = parse_xreg(d);
+ return enc_r(m, rd, rs1, rs2);
+
+ case RV64_FMT_R4: {
+ u32 rs3;
+ rd = parse_freg(d);
+ expect_comma(d);
+ rs1 = parse_freg(d);
+ expect_comma(d);
+ rs2 = parse_freg(d);
+ expect_comma(d);
+ rs3 = parse_freg(d);
+ return enc_r4(m, rd, rs1, rs2, rs3, 0x7u);
+ }
+
+ case RV64_FMT_I:
+ /* Aliases first. `li` is handled earlier by rv64_emit_pseudo (it may
+ * need a multi-word expansion), so it never reaches here. */
+ if (desc->flags & RV64_ASMFL_ALIAS) {
+ if (slice_eq_cstr(desc->mnemonic, "mv")) {
+ /* Standard two-operand `mv rd, rs` = `addi rd, rs, 0`. (A %pcrel_lo
+ * low-half is emitted as the canonical `addi rd, rs, %pcrel_lo(L)`,
+ * not a non-standard 3-operand `mv`, so it lands in the ADDI path
+ * below — matching clang.) */
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ return enc_i(m, rd, rs1, 0);
+ }
+ if (slice_eq_cstr(desc->mnemonic, "sext.w")) {
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ return enc_i(m, rd, rs1, 0);
+ }
+ if (slice_eq_cstr(desc->mnemonic, "seqz") ||
+ slice_eq_cstr(desc->mnemonic, "not")) {
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ /* match already has imm12 + funct3 + op pinned. */
+ return m | ((rs1 & 0x1fu) << 15) | ((rd & 0x1fu) << 7);
+ }
+ }
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ expect_comma(d);
+ /* `addi rd, rs1, %lo(sym)` / `%pcrel_lo(label)` → R_RV_LO12_I. */
+ if (rv_emit_imm_mod_reloc(d, RV_MODPOS_LO_I)) return enc_i(m, rd, rs1, 0);
+ imm = (i32)asm_driver_parse_const(d);
+ return enc_i(m, rd, rs1, imm);
+
+ case RV64_FMT_I_SHIFT:
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ expect_comma(d);
+ return enc_ishift(d, m, rd, rs1, (u32)asm_driver_parse_const(d));
+
+ case RV64_FMT_I_SHIFTW:
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ expect_comma(d);
+ return enc_ishiftw(m, rd, rs1, (u32)asm_driver_parse_const(d));
+
+ case RV64_FMT_U:
+ rd = parse_xreg(d);
+ expect_comma(d);
+ /* `lui rd, %hi(sym)` → R_RV_HI20; `auipc rd, %pcrel_hi(sym)` →
+ * R_RV_PCREL_HI20 (or %got_pcrel_hi → R_RV_GOT_HI20). */
+ if (rv_emit_imm_mod_reloc(d, RV_MODPOS_HI20)) return enc_u(m, rd, 0);
+ imm = (i32)asm_driver_parse_const(d);
+ /* LUI/AUIPC immediate is the upper-20 value: the input is interpreted
+ * as the literal 20-bit value (already shifted-out form). */
+ return enc_u(m, rd, (u32)imm);
+
+ case RV64_FMT_J:
+ /* `j label` / `jal rd, label` accept a symbolic target (R_RV_JAL) or a
+ * bare numeric displacement. */
+ if ((desc->flags & RV64_ASMFL_ALIAS) &&
+ slice_eq_cstr(desc->mnemonic, "j")) {
+ return enc_j(m, 0u, rv_reloc_target(d, R_RV_JAL));
+ }
+ rd = parse_xreg(d);
+ expect_comma(d);
+ return enc_j(m, rd, rv_reloc_target(d, R_RV_JAL));
+
+ case RV64_FMT_B:
+ /* `beq rs1, rs2, label` (and beqz/bnez aliases) accept a symbolic target
+ * (R_RV_BRANCH) or a bare numeric displacement. */
+ if (desc->flags & RV64_ASMFL_ALIAS) {
+ /* beqz / bnez: rs, off. */
+ rs1 = parse_xreg(d);
+ expect_comma(d);
+ return enc_b(m, rs1, 0u, rv_reloc_target(d, R_RV_BRANCH));
+ }
+ rs1 = parse_xreg(d);
+ expect_comma(d);
+ rs2 = parse_xreg(d);
+ expect_comma(d);
+ return enc_b(m, rs1, rs2, rv_reloc_target(d, R_RV_BRANCH));
+
+ case RV64_FMT_LOAD:
+ rd = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
+ expect_comma(d);
+ mem = parse_mem(d);
+ rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/0);
+ return enc_i(m, rd, mem.base, mem.disp);
+
+ case RV64_FMT_FP_LOAD:
+ rd = parse_freg(d);
+ expect_comma(d);
+ mem = parse_mem(d);
+ rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/0);
+ return enc_i(m, rd, mem.base, mem.disp);
+
+ case RV64_FMT_STORE:
+ rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
+ expect_comma(d);
+ mem = parse_mem(d);
+ rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/1);
+ return enc_s(m, rs2, mem.base, mem.disp);
+
+ case RV64_FMT_FP_STORE:
+ rs2 = parse_freg(d);
+ expect_comma(d);
+ mem = parse_mem(d);
+ rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/1);
+ return enc_s(m, rs2, mem.base, mem.disp);
+
+ case RV64_FMT_JALR:
+ if ((desc->flags & RV64_ASMFL_ALIAS) &&
+ slice_eq_cstr(desc->mnemonic, "jr")) {
+ rs1 = parse_xreg(d);
+ return enc_i(m, 0u, rs1, 0);
+ }
+ rd = parse_xreg(d);
+ if (!asm_driver_eat_comma(d)) {
+ if (slice_eq_cstr(desc->mnemonic, "jalr"))
+ return enc_i(m, RV_RA, rd, 0);
+ asm_driver_panic(d, "rv64 asm: expected ','");
+ }
+ /* Accept both `jalr rd, imm(rs1)` and `jalr rd, rs1, imm`. */
+ {
+ AsmTok t = asm_driver_peek(d);
+ if (t.kind == ASM_TOK_IDENT) {
+ /* register first → register form */
+ rs1 = parse_xreg(d);
+ if (asm_driver_eat_comma(d)) {
+ imm = (i32)asm_driver_parse_const(d);
+ } else {
+ imm = 0;
+ }
+ return enc_i(m, rd, rs1, imm);
+ }
+ }
+ mem = parse_mem(d);
+ return enc_i(m, rd, mem.base, mem.disp);
+
+ case RV64_FMT_FENCE: {
+ u32 pred, succ;
+ pred = parse_fence_mask(d);
+ expect_comma(d);
+ succ = parse_fence_mask(d);
+ return m | (pred << 24) | (succ << 20);
+ }
+
+ case RV64_FMT_SYSTEM:
+ /* No operands. nop/ret/ecall/ebreak. */
+ return m;
+
+ case RV64_FMT_FP_RM:
+ rd = parse_freg(d);
+ expect_comma(d);
+ rs1 = parse_freg(d);
+ expect_comma(d);
+ rs2 = parse_freg(d);
+ /* Use DYN(=7) rounding mode by default. */
+ return enc_r(m | (0x7u << 12), rd, rs1, rs2);
+
+ case RV64_FMT_FP_R:
+ if (desc->flags & RV64_ASMFL_FP) {
+ rd = parse_freg(d);
+ } else {
+ rd = parse_xreg(d);
+ }
+ expect_comma(d);
+ rs1 = parse_freg(d);
+ expect_comma(d);
+ rs2 = parse_freg(d);
+ return enc_r(m, rd, rs1, rs2);
+
+ case RV64_FMT_FP_CVT:
+ if (desc->flags & RV64_ASMFL_FP) {
+ rd = parse_freg(d);
+ expect_comma(d);
+ /* Source: integer reg for fcvt.s.w etc (no FP flag would
+ * indicate); but since we have ASMFL_FP set on dest, source may
+ * be either. Disambiguate by mnemonic. */
+ if (slice_has_prefix_cstr(desc->mnemonic, "fcvt.s.", 7) &&
+ (desc->mnemonic.s[7] == 'w' || desc->mnemonic.s[7] == 'l')) {
+ rs1 = parse_xreg(d);
+ } else if (slice_has_prefix_cstr(desc->mnemonic, "fcvt.d.", 7) &&
+ (desc->mnemonic.s[7] == 'w' || desc->mnemonic.s[7] == 'l')) {
+ rs1 = parse_xreg(d);
+ } else if (slice_eq_cstr(desc->mnemonic, "fmv.w.x") ||
+ slice_eq_cstr(desc->mnemonic, "fmv.d.x")) {
+ rs1 = parse_xreg(d);
+ } else {
+ rs1 = parse_freg(d);
+ }
+ } else {
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_freg(d);
+ }
+ /* match encodes rs2 (type selector); OR in rd/rs1 and the rounding mode.
+ * An explicit `, <rm>` suffix (cc -S emits it for non-default modes, and
+ * clang/gas accept it) takes precedence; otherwise the rm is fixed per
+ * conversion family (mirrors the rv_fcvt_* encoders in isa.h, the codegen
+ * source of truth): fp->int truncates (RTZ=1); int->fp and fp->fp use the
+ * default DYN=7; fmv bit-moves carry no rounding (rm=0). */
+ {
+ u32 funct7 = (m >> 25) & 0x7fu;
+ u32 rm;
+ if (asm_driver_eat_comma(d)) {
+ rm = rv_parse_rm_name(d);
+ } else {
+ switch (funct7) {
+ case 0x60: /* fcvt.{w,wu,l,lu}.s */
+ case 0x61: /* fcvt.{w,wu,l,lu}.d */
+ rm = 0x1u; /* RTZ */
+ break;
+ case 0x70: /* fmv.x.w */
+ case 0x71: /* fmv.x.d */
+ case 0x78: /* fmv.w.x */
+ case 0x79: /* fmv.d.x */
+ rm = 0x0u;
+ break;
+ default: /* int->fp (0x68/0x69) and fp<->fp (0x20/0x21): DYN */
+ rm = 0x7u;
+ break;
+ }
+ }
+ return m | (rm << 12) | ((rs1 & 0x1fu) << 15) | ((rd & 0x1fu) << 7);
+ }
+
+ case RV64_FMT_AMO:
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs2 = parse_xreg(d);
+ expect_comma(d);
+ asm_driver_expect_punct(d, '(', "'(' in rv64 amo operand");
+ rs1 = parse_xreg(d);
+ asm_driver_expect_punct(d, ')', "')' in rv64 amo operand");
+ return enc_amo(m, 0u, 0u, rd, rs1, rs2);
+
+ case RV64_FMT_LR:
+ rd = parse_xreg(d);
+ expect_comma(d);
+ asm_driver_expect_punct(d, '(', "'(' in rv64 lr operand");
+ rs1 = parse_xreg(d);
+ asm_driver_expect_punct(d, ')', "')' in rv64 lr operand");
+ return enc_amo(m, 0u, 0u, rd, rs1, 0u);
+
+ case RV64_FMT_CSR: {
+ i32 csr;
+ rd = parse_xreg(d);
+ expect_comma(d);
+ csr = (i32)asm_driver_parse_const(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ return enc_i(m, rd, rs1, csr);
+ }
+
+ case RV64_FMT_CSRI: {
+ i32 csr;
+ rd = parse_xreg(d);
+ expect_comma(d);
+ csr = (i32)asm_driver_parse_const(d);
+ expect_comma(d);
+ u32 uimm = (u32)asm_driver_parse_const(d) & 0x1fu;
+ return enc_i(m, rd, uimm, csr);
+ }
+
+ case RV64_FMT_CR:
+ if (slice_eq_cstr(desc->mnemonic, "c.jr") ||
+ slice_eq_cstr(desc->mnemonic, "c.jalr")) {
+ rs1 = parse_xreg(d);
+ return enc_c_cr(m, rs1, 0u);
+ }
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs2 = parse_xreg(d);
+ return enc_c_cr(m, rd, rs2);
+
+ case RV64_FMT_CI:
+ if (slice_eq_cstr(desc->mnemonic, "c.lwsp") ||
+ slice_eq_cstr(desc->mnemonic, "c.ldsp") ||
+ slice_eq_cstr(desc->mnemonic, "c.fldsp")) {
+ rd = slice_eq_cstr(desc->mnemonic, "c.fldsp") ? parse_freg(d)
+ : parse_xreg(d);
+ expect_comma(d);
+ mem = parse_mem(d);
+ if (mem.base != RV_SP)
+ asm_driver_panic(d, "rv64 asm: compressed stack load needs sp base");
+ return enc_c_lwsp(m, rd, (u32)mem.disp,
+ !slice_eq_cstr(desc->mnemonic, "c.lwsp"));
+ }
+ rd = parse_xreg(d);
+ expect_comma(d);
+ imm = (i32)asm_driver_parse_const(d);
+ if (slice_eq_cstr(desc->mnemonic, "c.lui") && ((u32)imm & 0xfffu) == 0)
+ imm >>= 12;
+ if (slice_eq_cstr(desc->mnemonic, "c.addi16sp")) {
+ if (rd != RV_SP)
+ asm_driver_panic(d, "rv64 asm: c.addi16sp needs sp destination");
+ return enc_c_addi16sp(m, imm);
+ }
+ return enc_c_ci(m, rd, imm);
+
+ case RV64_FMT_CSS:
+ rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
+ expect_comma(d);
+ mem = parse_mem(d);
+ if (mem.base != RV_SP)
+ asm_driver_panic(d, "rv64 asm: compressed stack store needs sp base");
+ return enc_c_swsp(m, rs2, (u32)mem.disp,
+ !slice_eq_cstr(desc->mnemonic, "c.swsp"));
+
+ case RV64_FMT_CIW:
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs1 = parse_xreg(d);
+ expect_comma(d);
+ if (rs1 != RV_SP)
+ asm_driver_panic(d, "rv64 asm: c.addi4spn needs sp source");
+ imm = (i32)asm_driver_parse_const(d);
+ return enc_c_addi4spn(m, c_reg3(d, rd), (u32)imm);
+
+ case RV64_FMT_CL:
+ rd = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
+ expect_comma(d);
+ mem = parse_mem(d);
+ return enc_c_lwld(m, c_reg3(d, rd), c_reg3(d, mem.base), (u32)mem.disp,
+ !slice_eq_cstr(desc->mnemonic, "c.lw"));
+
+ case RV64_FMT_CS:
+ rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
+ expect_comma(d);
+ mem = parse_mem(d);
+ return enc_c_swld(m, c_reg3(d, rs2), c_reg3(d, mem.base), (u32)mem.disp,
+ !slice_eq_cstr(desc->mnemonic, "c.sw"));
+
+ case RV64_FMT_CA:
+ rd = parse_xreg(d);
+ expect_comma(d);
+ rs2 = parse_xreg(d);
+ return m | (c_reg3(d, rd) << 7) | (c_reg3(d, rs2) << 2);
+
+ case RV64_FMT_CB:
+ rs1 = parse_xreg(d);
+ expect_comma(d);
+ imm = (i32)asm_driver_parse_const(d);
+ if (slice_eq_cstr(desc->mnemonic, "c.beqz") ||
+ slice_eq_cstr(desc->mnemonic, "c.bnez")) {
+ return enc_c_cb_imm(m, c_reg3(d, rs1), imm);
+ }
+ return enc_c_cb_alu_imm(m, c_reg3(d, rs1), imm);
+
+ case RV64_FMT_CJ:
+ imm = (i32)asm_driver_parse_const(d);
+ return enc_c_cj(m, imm);
+
+ case RV64_FMT_C_NONE:
+ return m;
+
+ default:
+ asm_driver_panic(d, "rv64 asm: unsupported format");
+ }
+}
+
+/* ============================================================
+ * Multi-word pseudo-instruction expansion.
+ *
+ * call/tail/la/lla expand to a PC-relative AUIPC + (JALR | ADDI) pair;
+ * `li` with a constant that does not fit a 12-bit signed immediate
+ * expands to an LUI/ADDI(W)/SLLI chain (no relocations). Each 32-bit
+ * word goes out through rv64_emit32 — the same path assemble_one's
+ * single-word result uses — and relocations are attached via
+ * mc->emit_reloc_at at the appropriate word offset. */
+
+/* 12-bit signed immediate range check for li short-circuit. */
+static bool rv_fits_i12(i64 v) { return v >= -2048 && v <= 2047; }
+
+/* Sign-extend the low 12 bits of v. */
+static i64 rv_sext12(i64 v) {
+ return (i64)((((u64)v & 0xfffu) ^ 0x800u)) - 0x800;
+}
+
+/* Emit an AUIPC rd,0 + a R_RV_PCREL_HI20(sym) reloc, then create a local
+ * `.LpcrelHi` anchor at the AUIPC offset and return that anchor symbol so
+ * the paired low-half reloc can reference it. Mirrors native.c's
+ * rv_emit_global_addr (the non-GOT branch). */
+static ObjSymId rv_emit_pcrel_hi(AsmDriver* d, u32 rd, ObjSymId sym,
+ i64 addend) {
+ MCEmitter* mc = asm_driver_mc(d);
+ ObjBuilder* obj = asm_driver_ob(d);
+ Compiler* c = asm_driver_compiler(d);
+ u32 sec = mc->section_id;
+ u32 ap = mc->pos(mc);
+ rv64_emit32(mc, rv_auipc(rd, 0));
+ mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, addend, 0, 0);
+ Sym an = pool_intern_slice(c->global, SLICE_LIT(".LpcrelHi"));
+ return obj_symbol(obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
+}
+
+/* call/tail: AUIPC <link>,0 + JALR <rd>,<link>,0 with one R_RV_CALL reloc
+ * at the AUIPC. `link` is the register the AUIPC materializes into and the
+ * JALR's base; `rd` is the JALR link-register (ra for call, zero for
+ * tail). The linker patches both words from the single R_RV_CALL reloc. */
+static void rv_emit_call_pseudo(AsmDriver* d, u32 link, u32 rd) {
+ MCEmitter* mc = asm_driver_mc(d);
+ ObjSymId sym = OBJ_SYM_NONE;
+ i64 off = 0;
+ asm_driver_parse_sym_expr(d, &sym, &off);
+ if (sym == OBJ_SYM_NONE)
+ asm_driver_panic(d, "rv64 asm: call/tail target must be a symbol");
+ u32 sec = mc->section_id;
+ u32 ap = mc->pos(mc);
+ rv64_emit32(mc, rv_auipc(link, 0));
+ rv64_emit32(mc, rv_jalr(rd, link, 0));
+ mc->emit_reloc_at(mc, sec, ap, R_RV_CALL, sym, off, 0, 0);
+}
+
+/* la/lla rd, sym: AUIPC rd,%pcrel_hi(sym) + ADDI rd,rd,%pcrel_lo(anchor).
+ * kit's static Local-Exec model has no GOT, so `la` == `lla`. */
+static void rv_emit_la_pseudo(AsmDriver* d) {
+ MCEmitter* mc = asm_driver_mc(d);
+ u32 rd = parse_xreg(d);
+ expect_comma(d);
+ ObjSymId sym = OBJ_SYM_NONE;
+ i64 off = 0;
+ asm_driver_parse_sym_expr(d, &sym, &off);
+ if (sym == OBJ_SYM_NONE)
+ asm_driver_panic(d, "rv64 asm: la/lla target must be a symbol");
+ ObjSymId anchor = rv_emit_pcrel_hi(d, rd, sym, off);
+ u32 sec = mc->section_id;
+ u32 lp = mc->pos(mc);
+ rv64_emit32(mc, rv_addi(rd, rd, 0));
+ mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
+}
+
+/* LUI immediate that sign-extends to a negative 32-bit value: bit 19 of
+ * the 20-bit field is set, i.e. Hi20 >= 0x80000. */
+#define RV_LUI_HI20_SIGN 0x80000LL
+
+/* Materialize a constant into `rd` via the LLVM RISCVMatInt sequence: for
+ * values fitting a signed 32-bit range, LUI + ADDI/ADDIW; otherwise a
+ * recursive top-down hi20/lo12 split with SLLI shifts that absorb trailing
+ * zeros. No relocations.
+ *
+ * On rv64, after an LUI the low-half add uses ADDIW only when the LUI value
+ * is negative in 32-bit form (Hi20 >= RV_LUI_HI20_SIGN): there the add must
+ * wrap in 32-bit arithmetic and re-sign-extend to land in range. When the
+ * LUI value is non-negative in its low 32 bits, plain ADDI keeps the
+ * 64-bit result correct (matching LLVM's generateInstSeqImpl).
+ *
+ * On rv32 there is no ADDIW and the GPR is 32 bits wide, so every constant
+ * fits a LUI + ADDI pair (the add already wraps mod 2^32). The variant's
+ * has_w_forms gates both the ADDIW use and the >32-bit recursion below. */
+static void rv_emit_li_value(MCEmitter* mc, const RiscvVariant* variant,
+ u32 rd, i64 val) {
+ if (!variant->has_w_forms || (val >= -2147483648LL && val <= 2147483647LL)) {
+ i64 hi20 = ((val + 0x800) >> 12) & 0xfffffLL;
+ i64 lo12 = rv_sext12(val);
+ if (hi20) rv64_emit32(mc, rv_lui(rd, (u32)hi20));
+ if (lo12 || hi20 == 0) {
+ u32 src = hi20 ? rd : (u32)RV_ZERO;
+ if (variant->has_w_forms && hi20 >= RV_LUI_HI20_SIGN)
+ rv64_emit32(mc, rv_addiw(rd, src, (i32)lo12));
+ else
+ rv64_emit32(mc, rv_addi(rd, src, (i32)lo12));
+ }
+ return;
+ }
+ /* >32-bit: split off the low 12 bits, recurse on the (shifted) high
+ * part, then SLLI back and ADD the low bits. The subtraction is done in
+ * unsigned space so it cannot signed-overflow at the int64 extremes
+ * (e.g. val=INT64_MAX, lo12=-1); the result has its low 12 bits clear,
+ * and the arithmetic right shift recovers the sign-extended high part. */
+ i64 lo12 = rv_sext12(val);
+ i64 hi = (i64)((u64)val - (u64)lo12) >> 12;
+ u32 shift = 12;
+ /* Absorb trailing zeros of the high part into the shift amount. */
+ while ((hi & 1) == 0) {
+ hi >>= 1;
+ ++shift;
+ }
+ rv_emit_li_value(mc, variant, rd, hi);
+ rv64_emit32(mc, rv_slli(rd, rd, shift));
+ if (lo12) rv64_emit32(mc, rv_addi(rd, rd, (i32)lo12));
+}
+
+/* Dispatch a multi-word pseudo. Returns true if it consumed the operands
+ * and emitted its expansion; false to fall through to the single-word
+ * path. `li` is handled here only when its immediate exceeds the 12-bit
+ * signed range the alias row encodes directly. */
+static bool rv64_emit_pseudo(AsmDriver* d, const Rv64InsnDesc* desc) {
+ MCEmitter* mc = asm_driver_mc(d);
+ if (desc->fmt == RV64_FMT_PSEUDO) {
+ if (slice_eq_cstr(desc->mnemonic, "call")) {
+ rv_emit_call_pseudo(d, RV_RA, RV_RA);
+ return true;
+ }
+ if (slice_eq_cstr(desc->mnemonic, "tail")) {
+ /* Standard RISC-V `tail` materializes the address into t1 (x6). kit
+ * codegen uses t0 for its own tail-call temp, so a `cc -S`-fused
+ * `tail sym` re-assembles to t1 not t0 — execution-equivalent (both are
+ * caller-saved temps clobbered by the tail jump; cross-exec still
+ * matches), only the byte image differs on tail-call cases. Keeping the
+ * assembler's `tail` standard preserves clang/gas interop. */
+ rv_emit_call_pseudo(d, RV_T1, RV_ZERO);
+ return true;
+ }
+ /* la / lla — identical PC-relative expansion in kit. */
+ rv_emit_la_pseudo(d);
+ return true;
+ }
+ if ((desc->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(desc->mnemonic, "li")) {
+ /* Peek the immediate without consuming the destination register: the
+ * single-word alias path re-parses both. We commit to the multi-word
+ * path only for out-of-range constants, leaving the existing 12-bit
+ * fast path (and its golden behavior) untouched. */
+ u32 rd = parse_xreg(d);
+ expect_comma(d);
+ i64 imm = asm_driver_parse_const(d);
+ if (rv_fits_i12(imm)) {
+ rv64_emit32(mc, rv_addi(rd, RV_ZERO, (i32)imm));
+ } else {
+ rv_emit_li_value(mc, rv_asm_variant(d), rd, imm);
+ }
+ return true;
+ }
+ return false;
+}
+
+static void rv64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) {
+ MCEmitter* mc = asm_driver_mc(d);
+ const Rv64InsnDesc* desc;
+ u8 av = rv_asm_variant(d)->xlen == 32u ? (u8)RV_AV_RV32 : (u8)RV_AV_RV64;
+ (void)base;
+ (void)asm_driver_cur_section(d);
+ desc = rv64_asm_find(pool_slice(asm_driver_pool(d), mnemonic), av);
+ if (!desc)
+ asm_driver_panic(d, av == (u8)RV_AV_RV32 ? "rv32 asm: unsupported instruction"
+ : "rv64 asm: unsupported instruction");
+ if (rv64_emit_pseudo(d, desc)) return;
+ if (desc->flags & RV64_ASMFL_C16)
+ rv64_emit16(mc, assemble_one(d, desc));
+ else
+ rv64_emit32(mc, assemble_one(d, desc));
+}
+
+static void rv64_arch_asm_destroy(ArchAsm* base) { (void)base; }
+
+/* ---- textual-assembly operand syntax (printer <-> parser) ----------------
+ *
+ * Inverse of the `.s` parsers above (rv_parse_mod_reloc / rv_reloc_target and
+ * the call/la pseudo expanders): how a relocated rv64 operand is spelled in
+ * `cc -S` so the same text re-assembles under kit-as. RISC-V uses the same
+ * `%hi`/`%lo`/`%pcrel_hi`/`%pcrel_lo` operator syntax on every object format,
+ * so `fmt` is unused. See ArchAsmOps and src/api/asm_emit.c. */
+static int rv64_reloc_operand(u16 kind, KitObjFmt fmt, ArchRelocOperand* out) {
+ (void)fmt;
+ out->prefix = "";
+ out->suffix = "";
+ out->addend_bias = 0;
+ out->emit_anchor = 0;
+ out->ref_anchor = 0;
+ switch (kind) {
+ case R_RV_PCREL_HI20:
+ out->surg = ARCH_RELOC_SURG_TAIL;
+ out->prefix = "%pcrel_hi(";
+ out->suffix = ")";
+ out->emit_anchor = 1; /* define a unique anchor label at this AUIPC */
+ return 1;
+ case R_RV_GOT_HI20:
+ out->surg = ARCH_RELOC_SURG_TAIL;
+ out->prefix = "%got_pcrel_hi(";
+ out->suffix = ")";
+ out->emit_anchor = 1;
+ return 1;
+ case R_RV_PCREL_LO12_I:
+ case R_RV_PCREL_LO12_S:
+ out->surg = ARCH_RELOC_SURG_RV_LO12;
+ out->prefix = "%pcrel_lo(";
+ out->suffix = ")";
+ out->ref_anchor = 1; /* references the preceding AUIPC's anchor label */
+ return 1;
+ case R_RV_HI20:
+ out->surg = ARCH_RELOC_SURG_TAIL;
+ out->prefix = "%hi(";
+ out->suffix = ")";
+ return 1;
+ case R_RV_LO12_I:
+ case R_RV_LO12_S:
+ out->surg = ARCH_RELOC_SURG_RV_LO12;
+ out->prefix = "%lo(";
+ out->suffix = ")";
+ return 1;
+ case R_RV_BRANCH:
+ case R_RV_JAL:
+ out->surg = ARCH_RELOC_SURG_TAIL;
+ return 1;
+ default:
+ return 0; /* R_ABS*, R_RV_RVC_*, R_RV_RELAX, TLS, ... → keep numeric */
+ }
+}
+
+/* Intra-section local branches whose target codegen resolved in place (no
+ * relocation): the disassembler renders the target numerically, so cc -S
+ * synthesizes a label there. `j`/`jal x0` are JAL aliases; the conditional
+ * branches are B-type. `call`/`tail` are excluded — they carry R_RV_CALL. */
+static int rv64_is_local_branch(KitSlice m) {
+ if (m.len == 1 && m.s[0] == 'j') return 1;
+ if (m.len == 3 && memcmp(m.s, "jal", 3) == 0) return 1;
+ if (m.len == 3 && memcmp(m.s, "beq", 3) == 0) return 1;
+ if (m.len == 3 && memcmp(m.s, "bne", 3) == 0) return 1;
+ if (m.len == 3 && memcmp(m.s, "blt", 3) == 0) return 1;
+ if (m.len == 3 && memcmp(m.s, "bge", 3) == 0) return 1;
+ if (m.len == 4 && memcmp(m.s, "bltu", 4) == 0) return 1;
+ if (m.len == 4 && memcmp(m.s, "bgeu", 4) == 0) return 1;
+ if (m.len == 4 && memcmp(m.s, "beqz", 4) == 0) return 1;
+ if (m.len == 4 && memcmp(m.s, "bnez", 4) == 0) return 1;
+ if (m.len == 4 && memcmp(m.s, "blez", 4) == 0) return 1;
+ if (m.len == 4 && memcmp(m.s, "bgez", 4) == 0) return 1;
+ if (m.len == 4 && memcmp(m.s, "bltz", 4) == 0) return 1;
+ if (m.len == 4 && memcmp(m.s, "bgtz", 4) == 0) return 1;
+ if (m.len == 6 && memcmp(m.s, "c.beqz", 6) == 0) return 1;
+ if (m.len == 6 && memcmp(m.s, "c.bnez", 6) == 0) return 1;
+ if (m.len == 3 && memcmp(m.s, "c.j", 3) == 0) return 1;
+ return 0;
+}
+
+/* R_RV_CALL fuses an AUIPC+JALR pair into a single `call`/`tail sym` pseudo
+ * (the canonical `.s` spelling the assembler re-expands to the same pair +
+ * reloc). The reloc sits on the AUIPC; the JALR partner carries no reloc. A
+ * tail call links into x0 (the JALR's rd is `zero`); a regular call links into
+ * ra. We read that from the partner JALR's disassembled text. */
+static int rv64_reloc_call_pair(u16 kind, KitSlice pair_mnemonic,
+ KitSlice pair_ops, const char** mnemonic_out) {
+ if (kind != R_RV_CALL) return 0;
+ /* The partner JALR links into ra (regular call) or x0 (tail). The
+ * disassembler renders the x0-link, zero-immediate form as the `jr rs`
+ * alias, and the ra form as `jalr ra, 0(ra)`. So a `jr` partner is always a
+ * tail; a `jalr` partner is a tail iff its link register is `zero`. */
+ if (pair_mnemonic.len == 2 && memcmp(pair_mnemonic.s, "jr", 2) == 0) {
+ *mnemonic_out = "tail";
+ return 1;
+ }
+ if (pair_mnemonic.len == 4 && memcmp(pair_mnemonic.s, "jalr", 4) == 0) {
+ if (pair_ops.len >= 4 && memcmp(pair_ops.s, "zero", 4) == 0)
+ *mnemonic_out = "tail";
+ else
+ *mnemonic_out = "call";
+ return 1;
+ }
+ return 0;
+}
+
+const ArchAsmOps rv64_asm_ops = {
+ .reloc_operand = rv64_reloc_operand,
+ .is_local_branch = rv64_is_local_branch,
+ .reloc_call_pair = rv64_reloc_call_pair,
+};
+
+ArchAsm* rv64_arch_asm_new(Compiler* c) {
+ Rv64Asm* a = arena_new(c->tu, Rv64Asm);
+ memset(a, 0, sizeof *a);
+ a->base.insn = rv64_arch_asm_insn;
+ a->base.destroy = rv64_arch_asm_destroy;
+ a->c = c;
+ return &a->base;
+}
+
+/* ============================================================
+ * Inline-asm template walker (parallel to aa64 asm.c §"inline-asm
+ * template walker"). The walker substitutes %N / %[name] / %% / %a%w%x
+ * placeholders into a per-line StrBuf, then re-lexes each line through
+ * rv64_arch_asm_insn for assembly. Statement separators recognised are
+ * '\n' and ';' (outside parens / quoted strings).
+ * ============================================================ */
+
+Rv64Asm* rv64_asm_open(Compiler* c) {
+ Rv64Asm* a = arena_new(c->tu, Rv64Asm);
+ memset(a, 0, sizeof *a);
+ a->base.insn = rv64_arch_asm_insn;
+ a->base.destroy = rv64_arch_asm_destroy;
+ a->c = c;
+ return a;
+}
+
+void rv64_asm_close(Rv64Asm* a) { (void)a; }
+
+void rv64_inline_bind(Rv64Asm* a, const AsmConstraint* outs, u32 nout,
+ Operand* out_ops, const AsmConstraint* ins, u32 nin,
+ const Operand* in_ops, const Sym* clobbers, u32 nclob) {
+ a->outs = outs;
+ a->out_ops = out_ops;
+ a->ins = ins;
+ a->in_ops = in_ops;
+ a->clobbers = clobbers;
+ a->nout = nout;
+ a->nin = nin;
+ a->nclob = nclob;
+}
+
+/* Per-line rendered buffer cap. Inline asm rarely emits more than a
+ * handful of insns per block; one substituted line fits comfortably.
+ * Truncation panics — the operator grammar should never grow a single
+ * line beyond this without a deliberate reason. */
+#define RV64_INLINE_LINE_CAP 1024
+
+_Noreturn static void inline_panic(Rv64Asm* a, const char* msg) {
+ SrcLoc loc = {0, 0, 0};
+ compiler_panic(a->c, loc, "rv64 inline asm: %.*s",
+ SLICE_ARG(slice_from_cstr(msg)));
+}
+
+/* Render a 5-bit integer register number using its canonical psABI name. */
+static void render_xreg(StrBuf* sb, u32 reg) {
+ const char* nm = rv64_register_name(reg & 0x1fu);
+ if (!nm) {
+ strbuf_putc(sb, 'x');
+ if ((reg & 0x1fu) >= 10u)
+ strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) / 10u)));
+ strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) % 10u)));
+ return;
+ }
+ strbuf_puts(sb, nm);
+}
+
+/* Render an FP register by its canonical psABI name (e.g., fa0). */
+static void render_freg(StrBuf* sb, u32 reg) {
+ const char* nm = rv64_register_name(32u + (reg & 0x1fu));
+ if (!nm) {
+ strbuf_putc(sb, 'f');
+ if ((reg & 0x1fu) >= 10u)
+ strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) / 10u)));
+ strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) % 10u)));
+ return;
+ }
+ strbuf_puts(sb, nm);
+}
+
+/* Render a signed 64-bit integer. Inline asm immediates appear bare in
+ * RISC-V (no '#' prefix), matching the standalone .s parser. */
+static void render_imm(StrBuf* sb, i64 v) { strbuf_put_i64(sb, v); }
+
+/* Render addressing form `disp(base)`. */
+static void render_indirect(Rv64Asm* a, StrBuf* sb, Reg base, i32 ofs) {
+ (void)a;
+ if (ofs != 0)
+ strbuf_put_i64(sb, (i64)ofs);
+ else
+ strbuf_putc(sb, '0');
+ strbuf_putc(sb, '(');
+ render_xreg(sb, (u32)base);
+ strbuf_putc(sb, ')');
+}
+
+/* Resolve operand index → render into sb. form:
+ * 0 = default (per-kind),
+ * 1 = %wN (width hint; on rv64 same as default xreg form),
+ * 2 = %xN (force 64-bit reg form — identical to default for rv64),
+ * 3 = %aN (memory addressing form).
+ * 4 = %zN (RISC-V GCC: emits "zero" if operand is imm 0, else reg). */
+static void render_operand(Rv64Asm* a, StrBuf* sb, u32 idx, int form) {
+ u32 ntot = a->nout + a->nin;
+ if (idx >= ntot) inline_panic(a, "operand index out of range");
+ const Operand* op =
+ (idx < a->nout) ? &a->out_ops[idx] : &a->in_ops[idx - a->nout];
+ switch (form) {
+ case 1: /* %wN — accept any reg/imm; rv64 has no narrower spelling. */
+ case 2: /* %xN — same. */
+ if (op->kind == RV64_INLINE_OPK_REG) {
+ if (op->pad[0] == RV64_INLINE_OPCLS_FP)
+ render_freg(sb, (u32)op->v.local);
+ else
+ render_xreg(sb, (u32)op->v.local);
+ return;
+ }
+ if (op->kind == OPK_IMM) {
+ render_imm(sb, op->v.imm);
+ return;
+ }
+ inline_panic(a, "%w/%x on unsupported operand kind");
+ case 3: /* %aN — memory addressing form */
+ if (op->kind != OPK_INDIRECT) inline_panic(a, "%a on non-memory operand");
+ if (op->v.ind.index != CG_LOCAL_NONE)
+ inline_panic(a,
+ "%a on indexed memory operand: rv64 inline asm "
+ "requires base+disp only");
+ render_indirect(a, sb, (Reg)op->v.ind.base, op->v.ind.ofs);
+ return;
+ case 4: /* %zN — zero-or-reg */
+ if (op->kind == OPK_IMM && op->v.imm == 0) {
+ strbuf_puts(sb, "zero");
+ return;
+ }
+ if (op->kind == RV64_INLINE_OPK_REG) {
+ if (op->pad[0] == RV64_INLINE_OPCLS_FP)
+ render_freg(sb, (u32)op->v.local);
+ else
+ render_xreg(sb, (u32)op->v.local);
+ return;
+ }
+ inline_panic(a, "%z on unsupported operand kind");
+ default:
+ break;
+ }
+ switch (op->kind) {
+ case RV64_INLINE_OPK_REG:
+ if (op->pad[0] == RV64_INLINE_OPCLS_FP)
+ render_freg(sb, (u32)op->v.local);
+ else
+ render_xreg(sb, (u32)op->v.local);
+ return;
+ case OPK_IMM:
+ render_imm(sb, op->v.imm);
+ return;
+ case OPK_INDIRECT:
+ if (op->v.ind.index != CG_LOCAL_NONE)
+ inline_panic(a,
+ "indexed memory operand in inline asm: rv64 requires "
+ "base+disp only");
+ render_indirect(a, sb, (Reg)op->v.ind.base, op->v.ind.ofs);
+ return;
+ default:
+ inline_panic(a, "unsupported operand kind for %N");
+ }
+}
+
+/* Resolve a `%[name]` operand by looking up `needle` against the
+ * constraint.name fields on the combined outs+ins list. Returns the
+ * combined index, or (u32)-1 on miss. */
+static u32 lookup_named(Rv64Asm* a, Sym needle) {
+ for (u32 k = 0; k < a->nout; ++k) {
+ if (a->outs[k].name == needle) return k;
+ }
+ for (u32 k = 0; k < a->nin; ++k) {
+ if (a->ins[k].name == needle) return a->nout + k;
+ }
+ return (u32)-1;
+}
+
+/* Lex one line of substituted asm and dispatch via rv64_arch_asm_insn. */
+static void run_one_line(Rv64Asm* a, MCEmitter* mc, const char* text,
+ size_t len) {
+ /* Skip blank lines. */
+ size_t i;
+ for (i = 0; i < len; ++i) {
+ if (text[i] != ' ' && text[i] != '\t') break;
+ }
+ if (i == len) return;
+
+ AsmLexer* lx = asm_lex_open_mem(a->c, "<inline-asm>", text, len);
+ AsmDriver* d = asm_driver_open_inline(a->c, mc, lx);
+
+ /* The first non-trivial token must be the mnemonic identifier. */
+ AsmTok t = asm_driver_peek(d);
+ while (t.kind == ASM_TOK_NEWLINE) {
+ (void)asm_driver_next(d);
+ t = asm_driver_peek(d);
+ }
+ if (t.kind == ASM_TOK_EOF) {
+ asm_driver_close_inline(d);
+ asm_lex_close(lx);
+ return;
+ }
+ if (t.kind != ASM_TOK_IDENT)
+ inline_panic(a, "expected mnemonic at start of inline asm line");
+ (void)asm_driver_next(d);
+ Sym mn = t.v.ident;
+ /* Compose `fcvt.s.w` etc. — rv64 has dotted mnemonics; the standalone
+ * lexer already strings them together as a single IDENT in most paths.
+ * Mirror the aa64 composite handling for safety. */
+ AsmTok dot = asm_driver_peek(d);
+ while (asm_driver_tok_is_punct(dot, '.')) {
+ (void)asm_driver_next(d);
+ AsmTok rest = asm_driver_next(d);
+ if (rest.kind != ASM_TOK_IDENT)
+ inline_panic(a, "composite mnemonic: expected ident after '.'");
+ Slice hsl = pool_slice(asm_driver_pool(d), mn);
+ Slice rsl = pool_slice(asm_driver_pool(d), rest.v.ident);
+ size_t hn = hsl.len, rn = rsl.len;
+ char buf[64];
+ if (hn + 1 + rn >= sizeof buf)
+ inline_panic(a, "composite mnemonic too long");
+ for (size_t k = 0; k < hn; ++k) buf[k] = hsl.s[k];
+ buf[hn] = '.';
+ for (size_t k = 0; k < rn; ++k) buf[hn + 1 + k] = rsl.s[k];
+ mn = pool_intern_slice(asm_driver_pool(d),
+ (Slice){.s = buf, .len = hn + 1 + rn});
+ dot = asm_driver_peek(d);
+ }
+ rv64_arch_asm_insn(&a->base, d, mn);
+ asm_driver_close_inline(d);
+ asm_lex_close(lx);
+}
+
+/* Substitute placeholders into one line's StrBuf, then dispatch. */
+static void render_and_run_line(Rv64Asm* a, MCEmitter* mc, StrBuf* sb,
+ const char* start, const char* end) {
+ strbuf_reset(sb);
+ for (const char* p = start; p < end; ++p) {
+ char c = *p;
+ if (c != '%') {
+ strbuf_putc(sb, c);
+ continue;
+ }
+ /* Placeholder. */
+ if (p + 1 >= end) inline_panic(a, "trailing '%' in template");
+ char n = *(p + 1);
+ if (n == '%') {
+ strbuf_putc(sb, '%');
+ ++p;
+ continue;
+ }
+ if (n == '[') {
+ const char* nbeg = p + 2;
+ const char* nend = nbeg;
+ while (nend < end && *nend != ']') ++nend;
+ if (nend == end) inline_panic(a, "unterminated %[name]");
+ size_t nlen = (size_t)(nend - nbeg);
+ Sym needle =
+ pool_intern_slice(a->c->global, (Slice){.s = nbeg, .len = nlen});
+ u32 idx = lookup_named(a, needle);
+ if (idx == (u32)-1)
+ inline_panic(a, "%[name] does not match any constraint");
+ p = nend; /* loop's ++p steps past the ']' */
+ render_operand(a, sb, idx, 0);
+ continue;
+ }
+ int form = 0; /* 0=default, 1=w, 2=x, 3=a, 4=z */
+ if (n == 'w' || n == 'x' || n == 'a' || n == 'z') {
+ form = (n == 'w') ? 1 : (n == 'x') ? 2 : (n == 'a') ? 3 : 4;
+ ++p;
+ if (p + 1 >= end) inline_panic(a, "trailing '%' modifier in template");
+ n = *(p + 1);
+ }
+ if (n == '[') {
+ const char* nbeg = p + 2;
+ const char* nend = nbeg;
+ while (nend < end && *nend != ']') ++nend;
+ if (nend == end) inline_panic(a, "unterminated %[name]");
+ size_t nlen = (size_t)(nend - nbeg);
+ Sym needle =
+ pool_intern_slice(a->c->global, (Slice){.s = nbeg, .len = nlen});
+ u32 idx = lookup_named(a, needle);
+ if (idx == (u32)-1)
+ inline_panic(a, "%[name] does not match any constraint");
+ p = nend;
+ render_operand(a, sb, idx, form);
+ continue;
+ }
+ if (n < '0' || n > '9') inline_panic(a, "expected digit after '%'");
+ u32 idx = (u32)(n - '0');
+ ++p;
+ /* GCC syntax permits up to two digits (%0..%99). */
+ if (p + 1 < end && *(p + 1) >= '0' && *(p + 1) <= '9') {
+ idx = idx * 10 + (u32)(*(p + 1) - '0');
+ ++p;
+ }
+ render_operand(a, sb, idx, form);
+ }
+ if (sb->truncated) inline_panic(a, "inline asm line buffer overflow");
+ run_one_line(a, mc, strbuf_cstr(sb), strbuf_len(sb));
+}
+
+void rv64_asm_run_template(Rv64Asm* a, MCEmitter* mc, const char* tmpl) {
+ if (!tmpl || !*tmpl) return;
+
+ char buf[RV64_INLINE_LINE_CAP];
+ StrBuf sb;
+ strbuf_init(&sb, buf, sizeof buf);
+
+ /* Walk tmpl, splitting on '\n' and ';'. Track paren depth and quote
+ * state so that a literal ';' inside `( ... )` (memory operand) or a
+ * quoted string is not mistaken for a statement separator. RISC-V uses
+ * `disp(base)` for memory, hence we track parens. */
+ const char* line_start = tmpl;
+ int paren = 0;
+ char quote = 0;
+ for (const char* p = tmpl;; ++p) {
+ char c = *p;
+ if (c == '\0') {
+ render_and_run_line(a, mc, &sb, line_start, p);
+ break;
+ }
+ if (quote) {
+ if (c == '\\' && *(p + 1)) {
+ ++p;
+ continue;
+ }
+ if (c == quote) quote = 0;
+ continue;
+ }
+ if (c == '"' || c == '\'') {
+ quote = c;
+ continue;
+ }
+ if (c == '(') {
+ ++paren;
+ continue;
+ }
+ if (c == ')') {
+ if (paren) --paren;
+ continue;
+ }
+ if (paren == 0 && (c == '\n' || c == ';')) {
+ render_and_run_line(a, mc, &sb, line_start, p);
+ line_start = p + 1;
+ }
+ }
+}
diff --git a/src/arch/rv64/asm.h b/src/arch/riscv/asm.h
diff --git a/src/arch/riscv/dbg.c b/src/arch/riscv/dbg.c
@@ -0,0 +1,441 @@
+/* RISC-V 64 lifter for the displaced-step shim.
+ *
+ * Lays out a fixed-up copy of one insn in the session scratch slot
+ * (DBG_DISPLACED_SLOT_BYTES bytes), followed by an EBREAK sentinel the
+ * session arms an internal bp on.
+ *
+ * Supported families:
+ * - JAL rd, offset — synthesize:
+ * slot[0] AUIPC t0, hi20(target) ; t0 = pc_runtime + hi20
+ * slot[4] ADDI t0, t0, lo12 ; (optional) fixup
+ * slot[8] JALR rd, t0, 0 ; rd = pc+4_runtime; PC = t0
+ * slot[N] EBREAK
+ * The JALR's "return address" lands at the EBREAK sentinel, but since
+ * control transfers to the user target we never execute it; the
+ * session's stale internal_bp is cleared by the next prepare and the
+ * finalize step gates on PC == return_pc so it stays a no-op when
+ * control left the slot.
+ *
+ * Note that an unconditional JAL with rd != x0 writes the runtime
+ * (scratch) PC+4 into rd. For RISC-V calls (the dynamic linker /
+ * PLT trampolines pass arguments via rd=ra), this is acceptable in
+ * practice because the saved return address is rebuilt by the
+ * epilogue anyway; kit's JIT debugger uses the shim only to
+ * single-step through code it has emitted, and the producer's call
+ * sequences re-establish ra in the prologue of the callee. For a
+ * true displaced-step debugger this would need a "patch ra" pass —
+ * v1 leaves that to the user via the unwind step.
+ *
+ * - JALR rd, rs1, imm — copied verbatim; the EBREAK after never
+ * fires because the indirect branch transfers control. Same caveat
+ * about rd as JAL.
+ *
+ * - BEQ/BNE/BLT/BGE/BLTU/BGEU rs1, rs2, offset — trampoline form:
+ * slot[0] Bcc rs1, rs2, +12 ; taken → slot+12 (target seq)
+ * slot[4] J +12 ; not-taken → slot+16 (EBREAK)
+ * (JAL x0, +12)
+ * slot[8] EBREAK
+ * slot[12] AUIPC t0, hi20(target)
+ * slot[16] ADDI t0, t0, lo12
+ * slot[20] JALR x0, t0, 0
+ * slot[24] EBREAK (sentinel: taken path sentinel)
+ * Sentinel offset is slot[8] for the not-taken fallthrough; the
+ * taken path branches away so it doesn't matter whether slot[24]
+ * is an EBREAK or not, but we put one there as a safety net.
+ *
+ * Branch immediates in RV64I are 13-bit signed, so the in-shim
+ * Bcc-then-J/J pattern always fits.
+ *
+ * - AUIPC rd, imm20 — replace with LUI rd, abs_hi20:
+ * slot[0] LUI rd, abs_hi20
+ * slot[4] EBREAK
+ * where abs_hi20 = (orig_pc + (imm20 << 12)) >> 12, masked to 20
+ * bits. Note that AUIPC computes pc + (imm << 12); LUI computes
+ * imm << 12. So we feed LUI the hi-20 of (orig_pc & ~0xfff) +
+ * (imm << 12), i.e. the bits we want at the top of rd.
+ *
+ * - LUI rd, imm20 — copied verbatim (no PC dependency).
+ *
+ * - System / ALU / load / store / misc — copied verbatim + EBREAK.
+ *
+ * Not supported (caller will fall back to step-over via internal bp):
+ * - RVC compressed instructions (16-bit). The producer does not emit
+ * them, but they may appear if the JIT ever loads pre-built code.
+ * - Vector instructions. Not produced by kit's RV64 backend.
+ */
+
+#include "dbg/dbg.h"
+
+#include <string.h>
+
+#include "arch/riscv/isa.h"
+
+#define SHIM_T0 RV_T0 /* x5 — caller-saved temp, safe inside a shim */
+
+uint32_t dbg_rv64_brk_word(void) { return rv_ebreak(); }
+
+static void put_u32(uint8_t* w, uint32_t off, uint32_t v) {
+ memcpy(w + off, &v, sizeof(v));
+}
+
+/* Sign-extend a `bits`-wide field whose raw value is `v`. */
+static int64_t sign_extend(uint64_t v, int bits) {
+ uint64_t m = 1ull << (bits - 1);
+ return (int64_t)((v ^ m) - m);
+}
+
+/* Decode RV64 fields. */
+static uint32_t rv_opcode(uint32_t insn) { return insn & 0x7fu; }
+static uint32_t rv_rd(uint32_t insn) { return (insn >> 7) & 0x1fu; }
+static uint32_t rv_funct3(uint32_t insn) { return (insn >> 12) & 0x7u; }
+static uint32_t rv_rs1(uint32_t insn) { return (insn >> 15) & 0x1fu; }
+static uint32_t rv_rs2(uint32_t insn) { return (insn >> 20) & 0x1fu; }
+
+/* J-type 20-bit immediate (sign-extended into 21-bit byte offset). */
+static int64_t rv_j_imm(uint32_t insn) {
+ uint64_t imm = ((uint64_t)((insn >> 31) & 1u) << 20) |
+ ((uint64_t)((insn >> 21) & 0x3ffu) << 1) |
+ ((uint64_t)((insn >> 20) & 1u) << 11) |
+ ((uint64_t)((insn >> 12) & 0xffu) << 12);
+ return sign_extend(imm, 21);
+}
+
+/* B-type 12-bit immediate (sign-extended 13-bit byte offset). */
+static int64_t rv_b_imm(uint32_t insn) {
+ uint64_t imm = ((uint64_t)((insn >> 31) & 1u) << 12) |
+ ((uint64_t)((insn >> 7) & 1u) << 11) |
+ ((uint64_t)((insn >> 25) & 0x3fu) << 5) |
+ ((uint64_t)((insn >> 8) & 0xfu) << 1);
+ return sign_extend(imm, 13);
+}
+
+/* U-type 20-bit immediate, returned as the raw 20-bit field (consumer
+ * shifts it left by 12). */
+static uint32_t rv_u_imm20(uint32_t insn) { return (insn >> 12) & 0xfffffu; }
+
+/* Decompose a 64-bit absolute target into a 32-bit AUIPC/LUI hi20 +
+ * ADDI lo12 pair such that:
+ * lui rd, hi20 -> rd = (sign_ext_32(hi20 << 12))
+ * addi rd, rd, lo12 -> rd = (sign_ext_32(hi20 << 12) +
+ * sign_ext_12(lo12))
+ * == sign_ext_32(target_low32)
+ * Returns 1 if the absolute target's low 32 bits cannot represent the
+ * full target (i.e. the target lives outside the sign-extended 32-bit
+ * range). The RV64 ABI's "medlow" code model assumes targets fit in
+ * the 32-bit sign-extended window around 0; for a JIT image that lives
+ * higher in the address space we panic at the caller. */
+static int rv_split_hi_lo(uint64_t target, uint32_t* hi20, int32_t* lo12,
+ int* sext32) {
+ int64_t s = (int64_t)target;
+ int64_t sext = (int64_t)(int32_t)(uint32_t)target;
+ *sext32 = (s == sext) ? 1 : 0;
+ /* hi20 chosen so addi's sign-extended 12-bit lo cancels out. */
+ uint32_t low32 = (uint32_t)target;
+ uint32_t hi = (low32 + 0x800u) >> 12;
+ int32_t lo = (int32_t)(low32 - (hi << 12));
+ *hi20 = hi & 0xfffffu;
+ *lo12 = lo;
+ return 0;
+}
+
+/* Emit "li t0, target" using AUIPC+ADDI when the target is in PC-rel
+ * range, otherwise LUI+ADDI. Returns the number of words written into
+ * `w` starting at offset `off`. The shim runs at `shim_runtime_pc` (the
+ * scratch slot's runtime address), and the AUIPC variant uses that. */
+static uint32_t emit_materialize_target(uint8_t* w, uint32_t off,
+ uint64_t target,
+ uint64_t shim_runtime_pc) {
+ int64_t pc_rel = (int64_t)target - (int64_t)shim_runtime_pc;
+ /* AUIPC offset is signed 32-bit (imm20 << 12). If pc_rel fits in the
+ * 32-bit sign-extended range and the low 12 bits' sign-extension
+ * carries correctly, prefer AUIPC + ADDI (PIC-friendly). Otherwise
+ * fall back to LUI + ADDI (assumes target's low32 is the full
+ * address — caller arranges for medlow targets). */
+ if (pc_rel >= -(int64_t)0x80000000 && pc_rel <= (int64_t)0x7fffffff) {
+ uint32_t hi20 = ((uint32_t)(int32_t)pc_rel + 0x800u) >> 12;
+ int32_t lo12 = (int32_t)((uint32_t)(int32_t)pc_rel - (hi20 << 12));
+ put_u32(w, off + 0, rv_auipc(SHIM_T0, hi20 & 0xfffffu));
+ put_u32(w, off + 4, rv_addi(SHIM_T0, SHIM_T0, lo12));
+ return 2;
+ } else {
+ uint32_t hi20;
+ int32_t lo12;
+ int sext32;
+ (void)rv_split_hi_lo(target, &hi20, &lo12, &sext32);
+ put_u32(w, off + 0, rv_lui(SHIM_T0, hi20));
+ put_u32(w, off + 4, rv_addi(SHIM_T0, SHIM_T0, lo12));
+ return 2;
+ }
+}
+
+int dbg_rv64_build_shim(uint32_t orig_insn, uint64_t orig_pc,
+ void* scratch_write, uint64_t scratch_runtime,
+ u32* brk_offset) {
+ uint8_t* w = (uint8_t*)scratch_write;
+ uint32_t brk = rv_ebreak();
+ uint32_t op;
+
+ if (!brk_offset) return 1;
+ *brk_offset = 0;
+
+ op = rv_opcode(orig_insn);
+
+ /* ---- JAL rd, offset ----------------------------------------------
+ * Semantics: rd = orig_pc + 4; pc = orig_pc + imm. We must reproduce
+ * the *user-visible* link value (orig_pc + 4), not the runtime
+ * scratch-relative one. Layout:
+ * slot[0..] materialize_target(t0, orig_pc + imm)
+ * slot[m] materialize rd <- (orig_pc + 4) (skipped when rd==x0)
+ * slot[m+] JALR x0, t0, 0 (unconditional jump; no link)
+ * slot[end] EBREAK
+ * For rd==x0 this collapses to the plain "jump to target" form. */
+ if (op == RV_JAL) {
+ int64_t imm = rv_j_imm(orig_insn);
+ uint64_t target = orig_pc + (uint64_t)imm;
+ uint32_t rd = rv_rd(orig_insn);
+ uint32_t n_words;
+ n_words = emit_materialize_target(w, 0, target, scratch_runtime);
+ if (rd != RV_ZERO) {
+ /* link = orig_pc + 4. Synthesize via LUI + ADDI using low-32
+ * decomposition; if the link value doesn't fit a 32-bit sign-
+ * extended window, we still emit the same two-word sequence and
+ * the high bits get truncated — acceptable for the JIT case
+ * where orig_pc is always within the image's 32-bit sign-ext
+ * range. */
+ uint64_t link = orig_pc + 4u;
+ uint32_t hi20;
+ int32_t lo12;
+ int sext32;
+ (void)rv_split_hi_lo(link, &hi20, &lo12, &sext32);
+ put_u32(w, 4 * n_words, rv_lui(rd, hi20));
+ ++n_words;
+ put_u32(w, 4 * n_words, rv_addi(rd, rd, lo12));
+ ++n_words;
+ }
+ put_u32(w, 4 * n_words, rv_jalr(RV_ZERO, SHIM_T0, 0));
+ ++n_words;
+ put_u32(w, 4 * n_words, brk);
+ *brk_offset = 4 * n_words;
+ return 0;
+ }
+
+ /* ---- JALR rd, rs1, imm -------------------------------------------
+ * Semantics: tmp = (regs[rs1] + sign_ext_12(imm)) & ~1; rd = orig_pc + 4;
+ * pc = tmp.
+ * Like JAL, rd must receive the *user-visible* link (orig_pc + 4).
+ * Layout:
+ * slot[0] JALR x0, rs1, imm ; jump-only form (no link write)
+ * -- but JALR is a single insn,
+ * so we cannot also write rd
+ * before jumping. We instead:
+ * slot[0] compute t0 = (regs[rs1] + imm) & ~1
+ * (ADDI t0, rs1, imm; ANDI t0, t0, -2)
+ * slot[8] materialize rd <- (orig_pc + 4) (if rd != x0)
+ * slot[N] JALR x0, t0, 0
+ * slot[N+4] EBREAK
+ * Note rs1 might be t0 itself; ADDI computes t0 = rs1 + imm BEFORE
+ * overwriting t0, which is fine because each insn reads its sources
+ * before writing rd. */
+ if (op == RV_JALR) {
+ uint32_t rd = rv_rd(orig_insn);
+ uint32_t rs1 = rv_rs1(orig_insn);
+ int32_t imm = (int32_t)((orig_insn >> 20) & 0xfffu);
+ if (imm & 0x800) imm -= 0x1000;
+ put_u32(w, 0, rv_addi(SHIM_T0, rs1, imm));
+ put_u32(w, 4, rv_andi(SHIM_T0, SHIM_T0, -2));
+ uint32_t off = 8;
+ if (rd != RV_ZERO) {
+ uint64_t link = orig_pc + 4u;
+ uint32_t hi20;
+ int32_t lo12;
+ int sext32;
+ (void)rv_split_hi_lo(link, &hi20, &lo12, &sext32);
+ put_u32(w, off, rv_lui(rd, hi20));
+ off += 4;
+ put_u32(w, off, rv_addi(rd, rd, lo12));
+ off += 4;
+ }
+ put_u32(w, off, rv_jalr(RV_ZERO, SHIM_T0, 0));
+ off += 4;
+ put_u32(w, off, brk);
+ *brk_offset = off;
+ return 0;
+ }
+
+ /* ---- Bcc rs1, rs2, offset ---------------------------------------- */
+ if (op == RV_BRANCH) {
+ int64_t imm = rv_b_imm(orig_insn);
+ uint64_t target = orig_pc + (uint64_t)imm;
+ uint32_t f3 = rv_funct3(orig_insn);
+ uint32_t rs1 = rv_rs1(orig_insn);
+ uint32_t rs2 = rv_rs2(orig_insn);
+ /* Trampoline layout:
+ * slot[0] Bcc rs1, rs2, +12 (taken -> slot[12])
+ * slot[4] JAL x0, +12 (not-taken fallthrough -> slot[16])
+ * ... wait — we want non-taken to
+ * fall through to the EBREAK at
+ * slot[8]. Simpler: place EBREAK
+ * at slot[4] for not-taken, and
+ * the take-target sequence at
+ * slot[8..]. The Bcc's +12 then
+ * becomes +8.
+ *
+ * Revised:
+ * slot[0] Bcc rs1, rs2, +8 (taken -> slot[8] = target seq)
+ * slot[4] EBREAK (not-taken sentinel)
+ * slot[8] AUIPC t0, hi20(target)
+ * slot[12] ADDI t0, t0, lo12
+ * slot[16] JALR x0, t0, 0
+ * slot[20] EBREAK (safety; never reached) */
+ uint32_t new_branch = rv_b(8, rs2, rs1, f3, RV_BRANCH);
+ uint32_t n_words;
+ put_u32(w, 0, new_branch);
+ put_u32(w, 4, brk);
+ n_words = emit_materialize_target(w, 8, target, scratch_runtime + 8u);
+ put_u32(w, 8 + 4 * n_words, rv_jalr(RV_ZERO, SHIM_T0, 0));
+ put_u32(w, 8 + 4 * n_words + 4, brk);
+ *brk_offset = 4;
+ return 0;
+ }
+
+ /* ---- AUIPC rd, imm20 --------------------------------------------- */
+ if (op == RV_AUIPC) {
+ uint32_t imm20 = rv_u_imm20(orig_insn);
+ uint32_t rd = rv_rd(orig_insn);
+ /* AUIPC computes rd = orig_pc + sign_ext_32(imm20 << 12). We
+ * synthesize that absolute value into rd using LUI + ADDI. */
+ uint64_t auipc_val = (uint64_t)((int64_t)orig_pc +
+ (int64_t)(int32_t)((int32_t)(imm20 << 12)));
+ uint32_t hi20;
+ int32_t lo12;
+ int sext32;
+ (void)rv_split_hi_lo(auipc_val, &hi20, &lo12, &sext32);
+ put_u32(w, 0, rv_lui(rd, hi20));
+ put_u32(w, 4, rv_addi(rd, rd, lo12));
+ put_u32(w, 8, brk);
+ *brk_offset = 8;
+ return 0;
+ }
+
+ /* ---- default: no PC-relative operand — copy verbatim ------------- */
+ put_u32(w, 0, orig_insn);
+ put_u32(w, 4, brk);
+ *brk_offset = 4;
+ return 0;
+}
+
+static KitStatus rv64_dbg_breakpoint_patch(u8* out, u32 cap, u32* len_out) {
+ uint32_t brk = dbg_rv64_brk_word();
+ if (!out || !len_out) return KIT_INVALID;
+ if (cap < 4u) return KIT_INVALID;
+ memcpy(out, &brk, sizeof(brk));
+ *len_out = 4u;
+ return KIT_OK;
+}
+
+static u64 rv64_dbg_breakpoint_addr_from_fault_pc(u64 fault_pc) {
+ return fault_pc;
+}
+
+static KitStatus rv64_dbg_decode_insn(const u8* bytes, u32 len, u64 pc,
+ ArchDbgInsn* out) {
+ if (!bytes || !out) return KIT_INVALID;
+ if (len < 4u) return KIT_UNSUPPORTED;
+ memset(out, 0, sizeof(*out));
+ out->pc = pc;
+ out->len = 4u;
+ memcpy(out->bytes, bytes, 4u);
+ return KIT_OK;
+}
+
+static KitStatus rv64_dbg_build_displaced_shim(
+ const ArchDbgInsn* insn, void* scratch_write, u64 scratch_runtime,
+ u32 scratch_cap, u32* sentinel_off, u64* fallthrough_pc) {
+ uint32_t word = 0;
+ if (!insn || !scratch_write || !sentinel_off || !fallthrough_pc)
+ return KIT_INVALID;
+ if (insn->len != 4u) return KIT_UNSUPPORTED;
+ if (scratch_cap < 28u) return KIT_INVALID;
+ memcpy(&word, insn->bytes, sizeof(word));
+ if (dbg_rv64_build_shim(word, insn->pc, scratch_write, scratch_runtime,
+ sentinel_off) != 0) {
+ return KIT_UNSUPPORTED;
+ }
+ *fallthrough_pc = insn->pc + 4u;
+ return KIT_OK;
+}
+
+static int rv64_dbg_is_call(const ArchDbgInsn* insn) {
+ uint32_t word = 0;
+ uint32_t op;
+ if (!insn || insn->len != 4u) return 0;
+ memcpy(&word, insn->bytes, sizeof(word));
+ op = rv_opcode(word);
+ if (op != RV_JAL && op != RV_JALR) return 0;
+ return rv_rd(word) != RV_ZERO;
+}
+
+static KitStatus rv64_dbg_direct_call_target(const ArchDbgInsn* insn,
+ u64* target_out) {
+ uint32_t word = 0;
+ if (!insn || !target_out) return KIT_INVALID;
+ if (insn->len != 4u) return KIT_UNSUPPORTED;
+ memcpy(&word, insn->bytes, sizeof(word));
+ if (rv_opcode(word) != RV_JAL || rv_rd(word) == RV_ZERO) return KIT_NOT_FOUND;
+ *target_out = insn->pc + (u64)rv_j_imm(word);
+ return KIT_OK;
+}
+
+static KitStatus rv64_dbg_direct_jump_target(const ArchDbgInsn* insn,
+ u64* target_out) {
+ uint32_t word = 0;
+ if (!insn || !target_out) return KIT_INVALID;
+ if (insn->len != 4u) return KIT_UNSUPPORTED;
+ memcpy(&word, insn->bytes, sizeof(word));
+ if (rv_opcode(word) != RV_JAL || rv_rd(word) != RV_ZERO) return KIT_NOT_FOUND;
+ *target_out = insn->pc + (u64)rv_j_imm(word);
+ return KIT_OK;
+}
+
+static KitStatus rv64_dbg_link_register_return_address(
+ const KitUnwindFrame* frame, u64* target_out) {
+ if (!frame || !target_out) return KIT_INVALID;
+ if (frame->regs[RV_RA] == 0) return KIT_NOT_FOUND;
+ *target_out = frame->regs[RV_RA];
+ return KIT_OK;
+}
+
+const ArchDbgOps rv64_dbg_ops = {
+ .min_insn_len = 4u,
+ .max_insn_len = 4u,
+ .breakpoint_patch = rv64_dbg_breakpoint_patch,
+ .breakpoint_addr_from_fault_pc = rv64_dbg_breakpoint_addr_from_fault_pc,
+ .decode_insn = rv64_dbg_decode_insn,
+ .build_displaced_shim = rv64_dbg_build_displaced_shim,
+ .is_call = rv64_dbg_is_call,
+ .direct_call_target = rv64_dbg_direct_call_target,
+ .direct_jump_target = rv64_dbg_direct_jump_target,
+ .link_register_return_address = rv64_dbg_link_register_return_address,
+};
+
+/* RV32 shares every helper with RV64: the 4-byte insn encodings and the
+ * medlow LUI+ADDI materialization in dbg_rv64_build_shim are XLEN-neutral
+ * and naturally correct for rv32's 32-bit addresses (the >32-bit LUI
+ * fallback in rv_split_hi_lo never fires). The only difference is the
+ * advertised min_insn_len=2 (RVC is 2-byte). Per the file header, RVC
+ * (2-byte) displaced-step is NOT supported: rv64_dbg_decode_insn returns
+ * KIT_UNSUPPORTED for len<4 and rv64_dbg_build_displaced_shim requires
+ * len==4, so RVC insns fall back to step-over via internal bp. That is
+ * the intended v1 behavior; full RVC shims are deferred. */
+const ArchDbgOps rv32_dbg_ops = {
+ .min_insn_len = 2u,
+ .max_insn_len = 4u,
+ .breakpoint_patch = rv64_dbg_breakpoint_patch,
+ .breakpoint_addr_from_fault_pc = rv64_dbg_breakpoint_addr_from_fault_pc,
+ .decode_insn = rv64_dbg_decode_insn,
+ .build_displaced_shim = rv64_dbg_build_displaced_shim,
+ .is_call = rv64_dbg_is_call,
+ .direct_call_target = rv64_dbg_direct_call_target,
+ .direct_jump_target = rv64_dbg_direct_jump_target,
+ .link_register_return_address = rv64_dbg_link_register_return_address,
+};
diff --git a/src/arch/riscv/disasm.c b/src/arch/riscv/disasm.c
@@ -0,0 +1,474 @@
+/* RV64 disassembler — descriptor-table driven.
+ *
+ * Decodes a 4-byte word by linear-scan over `rv64_insn_table` and
+ * dispatches operand printing on the matched format. Compressed (RV64C)
+ * instructions are 16-bit: a halfword whose low 2 bits are not 0b11
+ * goes through the C-decode path; the iterator advances by 2 bytes.
+ *
+ * Unknown words/halfwords fall back to ".word"/".hword" placeholders. */
+
+#include "arch/riscv/disasm.h"
+
+#include <string.h>
+
+#include "arch/riscv/isa.h"
+#include "arch/riscv/variant.h"
+#include "core/heap.h"
+#include "core/strbuf.h"
+
+#define RV64_DASM_MNEM_CAP 16u
+#define RV64_DASM_OPS_CAP 96u
+#define RV64_DASM_ANN_CAP 64u
+#define RV64_ENCODING_UNKNOWN 0xffffffffu
+
+typedef struct Rv64InsnFormatter {
+ ArchInsnFormatter base;
+ Compiler* c;
+ Heap* heap;
+ char mnem_buf[RV64_DASM_MNEM_CAP];
+ char ops_buf[RV64_DASM_OPS_CAP];
+ char ann_buf[RV64_DASM_ANN_CAP];
+ StrBuf mnem;
+ StrBuf ops;
+ StrBuf ann;
+} Rv64InsnFormatter;
+
+typedef struct Rv64Disasm {
+ ArchDisasm base;
+ Rv64InsnFormatter fmt;
+} Rv64Disasm;
+
+static KitStatus rv64_format_insn(ArchInsnFormatter*, const KitDecodedInsn*,
+ KitInsn*);
+static void rv64_formatter_destroy(ArchInsnFormatter*);
+
+/* RV_AV_* mask of the arch being disassembled. Derived from the Compiler's
+ * target; defaults (and any non-RISC-V kind) map to rv64 so the historical
+ * decode path is byte-identical. */
+static u8 rv_av_for_compiler(Compiler* c) {
+ const RiscvVariant* v = riscv_variant_for_kind(c->target.arch);
+ return v->xlen == 32u ? (u8)RV_AV_RV32 : (u8)RV_AV_RV64;
+}
+
+static u32 rv_read_u32_le(const u8* b) {
+ return (u32)b[0] | ((u32)b[1] << 8) | ((u32)b[2] << 16) | ((u32)b[3] << 24);
+}
+
+static u32 rv_read_u16_le(const u8* b) { return (u32)b[0] | ((u32)b[1] << 8); }
+
+static void rv_fmt_emit_fallback32(Rv64InsnFormatter* f, u32 word) {
+ strbuf_reset(&f->mnem);
+ strbuf_puts(&f->mnem, ".word");
+ strbuf_reset(&f->ops);
+ strbuf_put_hex_u64(&f->ops, (u64)word);
+}
+
+static void rv_fmt_emit_fallback16(Rv64InsnFormatter* f, u32 hw) {
+ strbuf_reset(&f->mnem);
+ strbuf_puts(&f->mnem, ".hword");
+ strbuf_reset(&f->ops);
+ strbuf_put_hex_u64(&f->ops, (u64)hw);
+}
+
+static u32 rv64_desc_encoding_id(const Rv64InsnDesc* desc) {
+ u32 i;
+ if (!desc) return RV64_ENCODING_UNKNOWN;
+ for (i = 0; i < rv64_insn_table_n; ++i) {
+ if (desc == &rv64_insn_table[i]) return i;
+ }
+ return RV64_ENCODING_UNKNOWN;
+}
+
+static u32 rv64_semantic_opcode(u32 word, u32 nbytes) {
+ u32 op, funct3, funct7;
+ if (nbytes != 4u) return RV64_DEC_UNKNOWN;
+ if (word == rv_ecall()) return RV64_DEC_ECALL;
+ if (word == rv_ebreak()) return RV64_DEC_EBREAK;
+ op = word & 0x7fu;
+ funct3 = (word >> 12) & 0x7u;
+ funct7 = (word >> 25) & 0x7fu;
+ if (op == RV_OP_IMM && funct3 == 0u) return RV64_DEC_ADDI;
+ if (op == RV_OP && funct3 == 0u && funct7 == 0u) return RV64_DEC_ADD;
+ if (op == RV_AUIPC) return RV64_DEC_AUIPC;
+ if (op == RV_LOAD && funct3 == 3u) return RV64_DEC_LD;
+ if (op == RV_STORE && funct3 == 3u) return RV64_DEC_SD;
+ if (op == RV_JALR && funct3 == 0u) return RV64_DEC_JALR;
+ return RV64_DEC_UNKNOWN;
+}
+
+static void rv_decop_none(KitDecodedOperand* o) {
+ memset(o, 0, sizeof(*o));
+ o->kind = KIT_DECOP_NONE;
+ o->index_reg = REG_NONE;
+}
+
+static void rv_decop_reg(KitDecodedOperand* o, u32 reg, u8 width_bits) {
+ rv_decop_none(o);
+ o->kind = KIT_DECOP_REG;
+ o->width_bits = width_bits;
+ o->reg = reg;
+}
+
+static void rv_decop_imm(KitDecodedOperand* o, i64 imm) {
+ rv_decop_none(o);
+ o->kind = KIT_DECOP_IMM;
+ o->imm = imm;
+}
+
+static void rv_decop_sysreg(KitDecodedOperand* o, u32 reg) {
+ rv_decop_none(o);
+ o->kind = KIT_DECOP_SYSREG;
+ o->reg = reg;
+}
+
+static void rv_decop_mem(KitDecodedOperand* o, u32 base, i64 imm,
+ u8 width_bits) {
+ rv_decop_none(o);
+ o->kind = KIT_DECOP_MEM;
+ o->width_bits = width_bits;
+ o->reg = base;
+ o->imm = imm;
+}
+
+static void rv_decop_pcrel(KitDecodedOperand* o, u64 pc, i64 disp) {
+ rv_decop_none(o);
+ o->kind = KIT_DECOP_PCREL;
+ o->imm = (i64)(pc + (u64)disp);
+}
+
+static u8 rv_load_width_bits(u32 funct3) {
+ switch (funct3 & 7u) {
+ case 0:
+ case 4:
+ return 8;
+ case 1:
+ case 5:
+ return 16;
+ case 2:
+ case 6:
+ return 32;
+ case 3:
+ return 64;
+ default:
+ return 0;
+ }
+}
+
+static u16 rv64_decode_flags(const Rv64InsnDesc* desc, u32 word) {
+ u16 flags = 0;
+ Rv64Format fmt;
+ if (!desc) return 0;
+ fmt = (Rv64Format)desc->fmt;
+ switch (fmt) {
+ case RV64_FMT_B:
+ case RV64_FMT_CB:
+ case RV64_FMT_CJ:
+ flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_BRANCH;
+ break;
+ case RV64_FMT_J:
+ flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_BRANCH;
+ if (((word >> 7) & 0x1fu) == RV_RA) flags |= KIT_DECODE_CALL;
+ break;
+ case RV64_FMT_JALR: {
+ u32 rd = (word >> 7) & 0x1fu;
+ u32 rs1 = (word >> 15) & 0x1fu;
+ flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_BRANCH;
+ if (rd == RV_RA) flags |= KIT_DECODE_CALL;
+ if (rd == RV_ZERO && rs1 == RV_RA) flags |= KIT_DECODE_RET;
+ break;
+ }
+ case RV64_FMT_CR:
+ if (slice_eq_cstr(desc->mnemonic, "c.jr") ||
+ slice_eq_cstr(desc->mnemonic, "c.jalr")) {
+ flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_BRANCH;
+ if (slice_eq_cstr(desc->mnemonic, "c.jalr")) flags |= KIT_DECODE_CALL;
+ }
+ break;
+ case RV64_FMT_SYSTEM:
+ if (word == rv_ecall() || word == rv_ebreak())
+ flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_TRAP;
+ break;
+ case RV64_FMT_C_NONE:
+ if ((word & 0xffffu) == 0x9002u)
+ flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_TRAP;
+ break;
+ case RV64_FMT_LOAD:
+ case RV64_FMT_STORE:
+ case RV64_FMT_FP_LOAD:
+ case RV64_FMT_FP_STORE:
+ case RV64_FMT_AMO:
+ case RV64_FMT_LR:
+ case RV64_FMT_CL:
+ case RV64_FMT_CS:
+ case RV64_FMT_CSS:
+ flags |= KIT_DECODE_MEMORY;
+ break;
+ default:
+ break;
+ }
+ return flags;
+}
+
+static void rv64_decode_operands(const Rv64InsnDesc* desc, u32 word, u64 pc,
+ const RiscvVariant* variant,
+ KitDecodedInsn* out) {
+ Rv64Format fmt;
+ if (!desc) return;
+ fmt = (Rv64Format)desc->fmt;
+ switch (fmt) {
+ case RV64_FMT_R:
+ case RV64_FMT_FP_R:
+ case RV64_FMT_FP_RM: {
+ Rv64R r = rv64_r_unpack(word);
+ out->noperands = 3;
+ rv_decop_reg(&out->operands[0], r.rd, 64);
+ rv_decop_reg(&out->operands[1], r.rs1, 64);
+ rv_decop_reg(&out->operands[2], r.rs2, 64);
+ break;
+ }
+ case RV64_FMT_I: {
+ Rv64I i = rv64_i_unpack(word);
+ out->noperands = 3;
+ rv_decop_reg(&out->operands[0], i.rd, 64);
+ rv_decop_reg(&out->operands[1], i.rs1, 64);
+ rv_decop_imm(&out->operands[2], rv64_sext(i.imm12, 12));
+ break;
+ }
+ case RV64_FMT_I_SHIFT:
+ case RV64_FMT_I_SHIFTW: {
+ Rv64I i = rv64_i_unpack(word);
+ /* SLLIW/SRLIW/SRAIW (I_SHIFTW) are always a 5-bit shamt. The plain
+ * SLLI/SRLI/SRAI shamt is 6-bit on rv64 but 5-bit on rv32 (bit 25 is
+ * funct7 there), so the mask follows variant->shamt_bits. */
+ u32 shamt_mask =
+ (fmt == RV64_FMT_I_SHIFTW || variant->shamt_bits == 5u) ? 0x1fu
+ : 0x3fu;
+ out->noperands = 3;
+ rv_decop_reg(&out->operands[0], i.rd, 64);
+ rv_decop_reg(&out->operands[1], i.rs1, 64);
+ rv_decop_imm(&out->operands[2], (i64)(i.imm12 & shamt_mask));
+ break;
+ }
+ case RV64_FMT_LOAD:
+ case RV64_FMT_FP_LOAD: {
+ Rv64I i = rv64_i_unpack(word);
+ out->noperands = 2;
+ rv_decop_reg(&out->operands[0], i.rd, 64);
+ rv_decop_mem(&out->operands[1], i.rs1, rv64_sext(i.imm12, 12),
+ rv_load_width_bits(i.funct3));
+ break;
+ }
+ case RV64_FMT_S:
+ case RV64_FMT_STORE:
+ case RV64_FMT_FP_STORE: {
+ Rv64S s = rv64_s_unpack(word);
+ out->noperands = 2;
+ rv_decop_reg(&out->operands[0], s.rs2, 64);
+ rv_decop_mem(&out->operands[1], s.rs1, rv64_sext(s.imm12, 12),
+ rv_load_width_bits(s.funct3));
+ break;
+ }
+ case RV64_FMT_B: {
+ Rv64B b = rv64_b_unpack(word);
+ out->noperands = 3;
+ rv_decop_reg(&out->operands[0], b.rs1, 64);
+ rv_decop_reg(&out->operands[1], b.rs2, 64);
+ rv_decop_pcrel(&out->operands[2], pc, rv64_sext(b.imm13, 13));
+ break;
+ }
+ case RV64_FMT_U: {
+ Rv64U u = rv64_u_unpack(word);
+ out->noperands = 2;
+ rv_decop_reg(&out->operands[0], u.rd, 64);
+ rv_decop_imm(&out->operands[1], (i64)(i32)u.imm32_hi20);
+ break;
+ }
+ case RV64_FMT_J: {
+ Rv64J j = rv64_j_unpack(word);
+ out->noperands = 2;
+ rv_decop_reg(&out->operands[0], j.rd, 64);
+ rv_decop_pcrel(&out->operands[1], pc, rv64_sext(j.imm21, 21));
+ break;
+ }
+ case RV64_FMT_JALR: {
+ Rv64I i = rv64_i_unpack(word);
+ out->noperands = 2;
+ rv_decop_reg(&out->operands[0], i.rd, 64);
+ rv_decop_mem(&out->operands[1], i.rs1, rv64_sext(i.imm12, 12), 64);
+ break;
+ }
+ case RV64_FMT_CSR: {
+ Rv64I i = rv64_i_unpack(word);
+ out->noperands = 3;
+ rv_decop_reg(&out->operands[0], i.rd, 64);
+ rv_decop_sysreg(&out->operands[1], i.imm12);
+ rv_decop_reg(&out->operands[2], i.rs1, 64);
+ break;
+ }
+ case RV64_FMT_CSRI: {
+ Rv64I i = rv64_i_unpack(word);
+ out->noperands = 3;
+ rv_decop_reg(&out->operands[0], i.rd, 64);
+ rv_decop_sysreg(&out->operands[1], i.imm12);
+ rv_decop_imm(&out->operands[2], (i64)i.rs1);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+static KitStatus rv64_decode_one(Compiler* c, const u8* bytes, size_t len,
+ u64 pc, KitDecodedInsn* out) {
+ const Rv64InsnDesc* desc;
+ const RiscvVariant* variant;
+ u8 av;
+ u32 first_hw;
+ u32 word;
+ u32 encoding_id;
+ if (!bytes || !out) return KIT_INVALID;
+ if (len < 2u) return KIT_MALFORMED;
+ variant = riscv_variant_for_kind(c->target.arch);
+ av = rv_av_for_compiler(c);
+ memset(out, 0, sizeof(*out));
+ for (u32 i = 0; i < KIT_DECODE_MAX_OPERANDS; ++i)
+ rv_decop_none(&out->operands[i]);
+
+ first_hw = rv_read_u16_le(bytes);
+ if ((first_hw & 3u) != 3u) {
+ word = first_hw;
+ desc = rv64_disasm_find_c(first_hw, av);
+ out->nbytes = 2;
+ } else {
+ if (len < 4u) return KIT_MALFORMED;
+ word = rv_read_u32_le(bytes);
+ desc = rv64_disasm_find(word, av);
+ out->nbytes = 4;
+ }
+
+ encoding_id = rv64_desc_encoding_id(desc);
+ out->pc = pc;
+ out->bytes = bytes;
+ out->encoding_id = encoding_id;
+ out->opcode = rv64_semantic_opcode(word, out->nbytes);
+ out->flags = rv64_decode_flags(desc, word);
+ out->arch[0] = word;
+ out->arch[1] = desc ? desc->fmt : 0xffu;
+ rv64_decode_operands(desc, word, pc, variant, out);
+ return KIT_OK;
+}
+
+static KitStatus rv64_decode_block(Compiler* c, const u8* bytes, size_t len,
+ u64 pc, KitDecodedInsn* out, u32 cap,
+ u32* n_out) {
+ u32 n = 0;
+ if (n_out) *n_out = 0;
+ if (!bytes || !out || !n_out) return KIT_INVALID;
+ while (n < cap && len > 0) {
+ KitStatus st = rv64_decode_one(c, bytes, len, pc, &out[n]);
+ if (st != KIT_OK) return n ? KIT_OK : st;
+ bytes += out[n].nbytes;
+ len -= out[n].nbytes;
+ pc += out[n].nbytes;
+ ++n;
+ if (out[n - 1u].flags & KIT_DECODE_TERMINATOR) break;
+ }
+ *n_out = n;
+ return KIT_OK;
+}
+
+static void rv64_formatter_init(Rv64InsnFormatter* f, Compiler* c, Heap* h) {
+ memset(f, 0, sizeof(*f));
+ f->c = c;
+ f->heap = h;
+ f->base.format = rv64_format_insn;
+ f->base.destroy = rv64_formatter_destroy;
+ strbuf_init(&f->mnem, f->mnem_buf, sizeof f->mnem_buf);
+ strbuf_init(&f->ops, f->ops_buf, sizeof f->ops_buf);
+ strbuf_init(&f->ann, f->ann_buf, sizeof f->ann_buf);
+}
+
+static KitStatus rv64_format_insn(ArchInsnFormatter* base,
+ const KitDecodedInsn* insn, KitInsn* out) {
+ Rv64InsnFormatter* f = (Rv64InsnFormatter*)base;
+ const Rv64InsnDesc* desc;
+ u32 word;
+ if (!f || !insn || !out) return KIT_INVALID;
+ word = (u32)insn->arch[0];
+ {
+ u8 av = rv_av_for_compiler(f->c);
+ desc = insn->nbytes == 2u ? rv64_disasm_find_c(word, av)
+ : rv64_disasm_find(word, av);
+ }
+ if (desc) {
+ strbuf_reset(&f->mnem);
+ strbuf_put_slice(&f->mnem, desc->mnemonic);
+ strbuf_reset(&f->ops);
+ rv64_print_operands(&f->ops, desc, word, insn->pc);
+ } else if (insn->nbytes == 2u) {
+ rv_fmt_emit_fallback16(f, word);
+ } else {
+ rv_fmt_emit_fallback32(f, word);
+ }
+
+ strbuf_reset(&f->ann);
+ out->vaddr = insn->pc;
+ out->bytes = insn->bytes;
+ out->nbytes = insn->nbytes;
+ out->mnemonic = strbuf_slice(&f->mnem);
+ out->operands = strbuf_slice(&f->ops);
+ out->annotation = strbuf_slice(&f->ann);
+ return KIT_OK;
+}
+
+static void rv64_formatter_destroy(ArchInsnFormatter* base) {
+ Rv64InsnFormatter* f = (Rv64InsnFormatter*)base;
+ if (!f) return;
+ f->heap->free(f->heap, f, sizeof(*f));
+}
+
+static ArchInsnFormatter* rv64_formatter_new(Compiler* c) {
+ Heap* h = (Heap*)c->ctx->heap;
+ Rv64InsnFormatter* f =
+ (Rv64InsnFormatter*)h->alloc(h, sizeof(*f), _Alignof(Rv64InsnFormatter));
+ if (!f) return NULL;
+ rv64_formatter_init(f, c, h);
+ return &f->base;
+}
+
+static u32 rv_decode(ArchDisasm* base, const u8* bytes, size_t len, u64 vaddr,
+ KitInsn* out) {
+ Rv64Disasm* d = (Rv64Disasm*)base;
+ KitDecodedInsn insn;
+ KitStatus st = rv64_decode_one(d->fmt.c, bytes, len, vaddr, &insn);
+ if (st != KIT_OK) return 0;
+ st = rv64_format_insn(&d->fmt.base, &insn, out);
+ if (st != KIT_OK) return 0;
+ return insn.nbytes;
+}
+
+static void rv64_destroy(ArchDisasm* base) {
+ Rv64Disasm* d = (Rv64Disasm*)base;
+ d->fmt.heap->free(d->fmt.heap, d, sizeof(*d));
+}
+
+ArchDisasm* rv64_disasm_new(Compiler* c) {
+ Heap* h = (Heap*)c->ctx->heap;
+ Rv64Disasm* d = (Rv64Disasm*)h->alloc(h, sizeof(*d), _Alignof(Rv64Disasm));
+ if (!d) return NULL;
+ memset(d, 0, sizeof(*d));
+ d->base.decode = rv_decode;
+ d->base.destroy = rv64_destroy;
+ rv64_formatter_init(&d->fmt, c, h);
+ return &d->base;
+}
+
+const ArchDecodeOps rv64_decode_ops = {
+ .min_insn_len = 2,
+ .max_insn_len = 4,
+ .decode_one = rv64_decode_one,
+ .decode_block = rv64_decode_block,
+ .formatter_new = rv64_formatter_new,
+ .format = rv64_format_insn,
+ .formatter_free = rv64_formatter_destroy,
+};
diff --git a/src/arch/rv64/disasm.h b/src/arch/riscv/disasm.h
diff --git a/src/arch/riscv/emu.c b/src/arch/riscv/emu.c
@@ -0,0 +1,511 @@
+#include "emu/emu.h"
+
+#include <string.h>
+
+#include "arch/arch.h"
+#include "arch/riscv/isa.h"
+#include "core/slice.h"
+
+#define RV64_EMU_SYM_XREG "__emu_rv64_xreg"
+#define RV64_EMU_SYM_SET_XREG "__emu_rv64_set_xreg"
+#define RV64_EMU_SYM_JALR "__emu_rv64_jalr"
+
+typedef struct Rv64EmuCPUState {
+ u64 x[32];
+ u64 f[32];
+ u32 fcsr;
+ u64 reserved_addr;
+ int has_reservation;
+} Rv64EmuCPUState;
+
+typedef struct Rv64EmuLiftSyms {
+ KitCgSym xreg;
+ KitCgSym set_xreg;
+ KitCgSym load64;
+ KitCgSym load64_checked;
+ KitCgSym store64;
+ KitCgSym jalr;
+ KitCgSym syscall;
+ KitCgTypeId xreg_fn;
+ KitCgTypeId set_xreg_fn;
+ KitCgTypeId load64_fn;
+ KitCgTypeId load64_checked_fn;
+ KitCgTypeId store64_fn;
+ KitCgTypeId jalr_fn;
+ KitCgTypeId syscall_fn;
+ KitCgTypeId thread_ptr;
+ KitCgTypeId i32;
+ KitCgTypeId i64;
+ KitCgTypeId i64_ptr;
+ KitCgTypeId void_ty;
+} Rv64EmuLiftSyms;
+
+static KitCgSym rv64_emu_decl_helper(KitCompiler* c, KitCg* cg,
+ const char* name, KitCgTypeId type) {
+ KitCgDecl d;
+ memset(&d, 0, sizeof(d));
+ d.kind = KIT_CG_DECL_FUNC;
+ d.linkage_name = kit_sym_intern(c, kit_slice_cstr(name));
+ d.display_name = d.linkage_name;
+ d.linkage_name = kit_cg_c_linkage_name(c, d.linkage_name);
+ d.type = type;
+ d.sym.bind = KIT_SB_GLOBAL;
+ d.sym.visibility = KIT_CG_VIS_DEFAULT;
+ return kit_cg_decl(cg, d);
+}
+
+static KitCgTypeId rv64_emu_func_type(KitCompiler* c, KitCgTypeId ret,
+ const KitCgTypeId* params, u32 nparams) {
+ KitCgFuncParam p[5];
+ KitCgFuncResult result;
+ KitCgFuncSig sig;
+ u32 i;
+ memset(p, 0, sizeof(p));
+ for (i = 0; i < nparams; ++i) p[i].type = params[i];
+ memset(&sig, 0, sizeof(sig));
+ memset(&result, 0, sizeof(result));
+ result.type = ret;
+ sig.results = &result;
+ sig.nresults = 1;
+ sig.params = p;
+ sig.nparams = nparams;
+ sig.call_conv = KIT_CG_CC_TARGET_C;
+ return kit_cg_type_func(c, sig);
+}
+
+static void rv64_emu_lift_syms_init(KitCompiler* c, KitCg* cg,
+ Rv64EmuLiftSyms* out) {
+ KitCgBuiltinTypes bi = kit_cg_builtin_types(c);
+ KitCgTypeId params[5];
+ memset(out, 0, sizeof(*out));
+ out->void_ty = bi.id[KIT_CG_BUILTIN_VOID];
+ out->i32 = bi.id[KIT_CG_BUILTIN_I32];
+ out->i64 = bi.id[KIT_CG_BUILTIN_I64];
+ out->i64_ptr = kit_cg_type_ptr(c, out->i64, 0);
+ out->thread_ptr = emu_thread_type((Compiler*)c);
+
+ params[0] = out->thread_ptr;
+ params[1] = out->i32;
+ out->xreg_fn = rv64_emu_func_type(c, out->i64, params, 2);
+
+ params[0] = out->thread_ptr;
+ params[1] = out->i32;
+ params[2] = out->i64;
+ out->set_xreg_fn = rv64_emu_func_type(c, out->void_ty, params, 3);
+
+ params[0] = out->thread_ptr;
+ params[1] = out->i64;
+ out->load64_fn = rv64_emu_func_type(c, out->i64, params, 2);
+
+ {
+ KitCgTypeId load_params[5];
+ load_params[0] = out->thread_ptr;
+ load_params[1] = out->i64;
+ load_params[2] = out->i64;
+ load_params[3] = out->i64;
+ load_params[4] = out->i64_ptr;
+ out->load64_checked_fn = rv64_emu_func_type(c, out->i64, load_params, 5);
+ }
+
+ {
+ KitCgTypeId store_params[5];
+ store_params[0] = out->thread_ptr;
+ store_params[1] = out->i64;
+ store_params[2] = out->i64;
+ store_params[3] = out->i64;
+ store_params[4] = out->i64;
+ out->store64_fn = rv64_emu_func_type(c, out->i64, store_params, 5);
+ }
+
+ {
+ KitCgTypeId jalr_params[5];
+ jalr_params[0] = out->thread_ptr;
+ jalr_params[1] = out->i64;
+ jalr_params[2] = out->i64;
+ jalr_params[3] = out->i64;
+ jalr_params[4] = out->i64;
+ out->jalr_fn = rv64_emu_func_type(c, out->i64, jalr_params, 5);
+ }
+
+ params[0] = out->thread_ptr;
+ params[1] = out->i64;
+ out->syscall_fn = rv64_emu_func_type(c, out->i64, params, 2);
+
+ out->xreg = rv64_emu_decl_helper(c, cg, RV64_EMU_SYM_XREG, out->xreg_fn);
+ out->set_xreg =
+ rv64_emu_decl_helper(c, cg, RV64_EMU_SYM_SET_XREG, out->set_xreg_fn);
+ out->load64 = rv64_emu_decl_helper(c, cg, EMU_SYM_LOAD64, out->load64_fn);
+ out->load64_checked = rv64_emu_decl_helper(c, cg, EMU_SYM_LOAD64_CHECKED,
+ out->load64_checked_fn);
+ out->store64 = rv64_emu_decl_helper(c, cg, EMU_SYM_STORE64, out->store64_fn);
+ out->jalr = rv64_emu_decl_helper(c, cg, RV64_EMU_SYM_JALR, out->jalr_fn);
+ out->syscall = rv64_emu_decl_helper(c, cg, EMU_SYM_SYSCALL, out->syscall_fn);
+}
+
+static KitCgMemAccess rv64_emu_mem(KitCgTypeId type) {
+ KitCgMemAccess m;
+ memset(&m, 0, sizeof(m));
+ m.type = type;
+ return m;
+}
+
+static void rv64_emu_push_thread(KitCg* cg, KitCgLocal thread,
+ KitCgTypeId thread_ptr) {
+ kit_cg_push_local(cg, thread);
+ kit_cg_load(cg, rv64_emu_mem(thread_ptr));
+}
+
+static void rv64_emu_push_xreg(KitCg* cg, const Rv64EmuLiftSyms* s,
+ KitCgLocal thread, u32 reg) {
+ if (reg == 0u) {
+ kit_cg_push_int(cg, 0, s->i64);
+ return;
+ }
+ rv64_emu_push_thread(cg, thread, s->thread_ptr);
+ kit_cg_push_int(cg, reg, s->i32);
+ kit_cg_call_symbol(cg, s->xreg, 2, (KitCgCallAttrs){0});
+}
+
+static void rv64_emu_store_xreg_from_tmp(KitCg* cg, const Rv64EmuLiftSyms* s,
+ KitCgLocal thread, KitCgLocal tmp,
+ u32 reg) {
+ if (reg == 0u) return;
+ rv64_emu_push_thread(cg, thread, s->thread_ptr);
+ kit_cg_push_int(cg, reg, s->i32);
+ kit_cg_push_local(cg, tmp);
+ kit_cg_load(cg, rv64_emu_mem(s->i64));
+ kit_cg_call_symbol(cg, s->set_xreg, 3, (KitCgCallAttrs){0});
+}
+
+static void rv64_emu_store_xreg_from_stack(KitCg* cg, const Rv64EmuLiftSyms* s,
+ KitCgLocal thread, u32 reg,
+ KitCgLocal tmp) {
+ kit_cg_push_local(cg, tmp);
+ kit_cg_swap(cg);
+ kit_cg_store(cg, rv64_emu_mem(s->i64));
+ if (reg == 0u) return;
+ rv64_emu_store_xreg_from_tmp(cg, s, thread, tmp, reg);
+}
+
+static void rv64_emu_store_local_from_stack(KitCg* cg, const Rv64EmuLiftSyms* s,
+ KitCgLocal local) {
+ kit_cg_push_local(cg, local);
+ kit_cg_swap(cg);
+ kit_cg_store(cg, rv64_emu_mem(s->i64));
+}
+
+static void rv64_emu_push_local_value(KitCg* cg, const Rv64EmuLiftSyms* s,
+ KitCgLocal local) {
+ kit_cg_push_local(cg, local);
+ kit_cg_load(cg, rv64_emu_mem(s->i64));
+}
+
+static void rv64_emu_push_addr(KitCg* cg, const Rv64EmuLiftSyms* s,
+ KitCgLocal thread,
+ const KitDecodedOperand* mem) {
+ rv64_emu_push_xreg(cg, s, thread, mem->reg);
+ if (mem->imm) {
+ kit_cg_push_int(cg, (u64)mem->imm, s->i64);
+ kit_cg_int_binop(cg, KIT_CG_INT_ADD, 0);
+ }
+}
+
+static KitStatus rv64_emu_lift_block(Compiler* compiler, KitCg* cg,
+ const KitDecodedInsn* insts, u32 n,
+ const EmuLiftCtx* ctx) {
+ Rv64EmuLiftSyms syms;
+ KitCgLocal thread;
+ KitCgLocal tmp;
+ KitCgLocal fault_next;
+ KitCgLocalAttrs attrs;
+ u64 next_pc;
+ u32 i;
+ KitCompiler* c;
+
+ if (!compiler || !cg || !insts || !ctx) return KIT_INVALID;
+ c = (KitCompiler*)compiler;
+ rv64_emu_lift_syms_init(c, cg, &syms);
+
+ kit_cg_func_begin(cg, ctx->block_sym);
+ memset(&attrs, 0, sizeof(attrs));
+ attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("thread"));
+ thread = kit_cg_param(cg, 0, syms.thread_ptr, attrs);
+ attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("tmp"));
+ tmp = kit_cg_local(cg, syms.i64, attrs);
+ attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("fault_next"));
+ fault_next = kit_cg_local(cg, syms.i64, attrs);
+
+ next_pc = ctx->guest_pc;
+ for (i = 0; i < n; ++i) {
+ const KitDecodedInsn* in = &insts[i];
+ next_pc = in->pc + in->nbytes;
+ switch (in->opcode) {
+ case RV64_DEC_ADDI: {
+ u32 rd = in->operands[0].reg;
+ u32 rs1 = in->operands[1].reg;
+ i64 imm = in->operands[2].imm;
+ rv64_emu_push_xreg(cg, &syms, thread, rs1);
+ kit_cg_push_int(cg, (u64)imm, syms.i64);
+ kit_cg_int_binop(cg, KIT_CG_INT_ADD, 0);
+ rv64_emu_store_xreg_from_stack(cg, &syms, thread, rd, tmp);
+ break;
+ }
+ case RV64_DEC_ADD: {
+ u32 rd = in->operands[0].reg;
+ u32 rs1 = in->operands[1].reg;
+ u32 rs2 = in->operands[2].reg;
+ rv64_emu_push_xreg(cg, &syms, thread, rs1);
+ rv64_emu_push_xreg(cg, &syms, thread, rs2);
+ kit_cg_int_binop(cg, KIT_CG_INT_ADD, 0);
+ rv64_emu_store_xreg_from_stack(cg, &syms, thread, rd, tmp);
+ break;
+ }
+ case RV64_DEC_AUIPC: {
+ u32 rd = in->operands[0].reg;
+ i64 imm = in->operands[1].imm;
+ kit_cg_push_int(cg, (u64)(in->pc + (u64)imm), syms.i64);
+ rv64_emu_store_xreg_from_stack(cg, &syms, thread, rd, tmp);
+ break;
+ }
+ case RV64_DEC_LD: {
+ u32 rd = in->operands[0].reg;
+ KitCgLabel ok = kit_cg_label_new(cg);
+ rv64_emu_push_thread(cg, thread, syms.thread_ptr);
+ rv64_emu_push_addr(cg, &syms, thread, &in->operands[1]);
+ kit_cg_push_int(cg, in->pc, syms.i64);
+ kit_cg_push_int(cg, next_pc, syms.i64);
+ kit_cg_push_local_addr(cg, tmp);
+ kit_cg_call_symbol(cg, syms.load64_checked, 5, (KitCgCallAttrs){0});
+ rv64_emu_store_local_from_stack(cg, &syms, fault_next);
+ rv64_emu_push_local_value(cg, &syms, fault_next);
+ kit_cg_push_int(cg, 0, syms.i64);
+ kit_cg_int_cmp(cg, KIT_CG_INT_NE);
+ kit_cg_branch_false(cg, ok);
+ rv64_emu_push_local_value(cg, &syms, fault_next);
+ kit_cg_ret(cg);
+ kit_cg_label_place(cg, ok);
+ rv64_emu_store_xreg_from_tmp(cg, &syms, thread, tmp, rd);
+ break;
+ }
+ case RV64_DEC_SD: {
+ rv64_emu_push_thread(cg, thread, syms.thread_ptr);
+ rv64_emu_push_addr(cg, &syms, thread, &in->operands[1]);
+ rv64_emu_push_xreg(cg, &syms, thread, in->operands[0].reg);
+ kit_cg_push_int(cg, in->pc, syms.i64);
+ kit_cg_push_int(cg, next_pc, syms.i64);
+ kit_cg_call_symbol(cg, syms.store64, 5, (KitCgCallAttrs){0});
+ kit_cg_ret(cg);
+ kit_cg_func_end(cg);
+ return KIT_OK;
+ }
+ case RV64_DEC_JALR: {
+ Rv64I ji = rv64_i_unpack(in->arch[0]);
+ u32 rd = ji.rd;
+ u32 rs1 = ji.rs1;
+ i64 imm = rv64_sext(ji.imm12, 12);
+ rv64_emu_push_thread(cg, thread, syms.thread_ptr);
+ kit_cg_push_int(cg, rd, syms.i64);
+ kit_cg_push_int(cg, rs1, syms.i64);
+ kit_cg_push_int(cg, (u64)imm, syms.i64);
+ kit_cg_push_int(cg, next_pc, syms.i64);
+ kit_cg_call_symbol(cg, syms.jalr, 5, (KitCgCallAttrs){0});
+ kit_cg_ret(cg);
+ kit_cg_func_end(cg);
+ return KIT_OK;
+ break;
+ }
+ case RV64_DEC_ECALL:
+ rv64_emu_push_thread(cg, thread, syms.thread_ptr);
+ kit_cg_push_int(cg, next_pc, syms.i64);
+ kit_cg_call_symbol(cg, syms.syscall, 2, (KitCgCallAttrs){0});
+ kit_cg_ret(cg);
+ kit_cg_func_end(cg);
+ return KIT_OK;
+ default:
+ kit_cg_push_int(cg, in->pc, syms.i64);
+ kit_cg_ret(cg);
+ kit_cg_func_end(cg);
+ return KIT_OK;
+ }
+ }
+
+ kit_cg_push_int(cg, next_pc, syms.i64);
+ kit_cg_ret(cg);
+ kit_cg_func_end(cg);
+ return KIT_OK;
+}
+
+static EmuCPUState* rv64_emu_cpu_new(Compiler* c, u64 initial_pc,
+ u64 initial_sp) {
+ EmuCPUState* cpu = emu_cpu_new_with_arch_state(c, KIT_ARCH_RV64, initial_pc,
+ sizeof(Rv64EmuCPUState),
+ _Alignof(Rv64EmuCPUState));
+ Rv64EmuCPUState* rv = (Rv64EmuCPUState*)emu_cpu_arch_state(cpu);
+ if (rv) rv->x[2] = initial_sp;
+ return cpu;
+}
+
+static Rv64EmuCPUState* rv64_thread_state(EmuThread* thread) {
+ return thread ? (Rv64EmuCPUState*)emu_cpu_arch_state(emu_thread_cpu(thread))
+ : NULL;
+}
+
+u64 emu_rv64_xreg(EmuThread* thread, u32 i) {
+ Rv64EmuCPUState* rv = rv64_thread_state(thread);
+ if (!rv || i >= 32u) return 0;
+ return i == 0u ? 0u : rv->x[i];
+}
+
+void emu_rv64_set_xreg(EmuThread* thread, u32 i, u64 v) {
+ Rv64EmuCPUState* rv = rv64_thread_state(thread);
+ if (!rv || i >= 32u || i == 0u) return;
+ rv->x[i] = v;
+}
+
+static u64 rv64_get_syscall_no(EmuThread* thread) {
+ return emu_rv64_xreg(thread, 17u);
+}
+
+static u64 rv64_get_syscall_arg(EmuThread* thread, u32 index) {
+ static const u32 regs[6] = {10u, 11u, 12u, 13u, 14u, 15u};
+ return index < 6u ? emu_rv64_xreg(thread, regs[index]) : 0;
+}
+
+static void rv64_set_syscall_result(EmuThread* thread, u64 value) {
+ emu_rv64_set_xreg(thread, 10u, value);
+}
+
+static u64 rv64_get_sp(EmuThread* thread) { return emu_rv64_xreg(thread, 2u); }
+
+static void rv64_set_sp(EmuThread* thread, u64 value) {
+ emu_rv64_set_xreg(thread, 2u, value);
+}
+
+static u64 rv64_get_tp(EmuThread* thread) { return emu_rv64_xreg(thread, 4u); }
+
+static void rv64_set_tp(EmuThread* thread, u64 value) {
+ emu_rv64_set_xreg(thread, 4u, value);
+}
+
+static void rv64_signal_wr64(u8* p, u64 v) {
+ u32 i;
+ for (i = 0; i < 8u; ++i) p[i] = (u8)(v >> (8u * i));
+}
+
+static u64 rv64_signal_rd64(const u8* p) {
+ return (u64)p[0] | ((u64)p[1] << 8) | ((u64)p[2] << 16) | ((u64)p[3] << 24) |
+ ((u64)p[4] << 32) | ((u64)p[5] << 40) | ((u64)p[6] << 48) |
+ ((u64)p[7] << 56);
+}
+
+static u64 rv64_signal_context_size(EmuProcess* process, EmuThread* thread) {
+ (void)process;
+ (void)thread;
+ return 32u * 8u;
+}
+
+static KitStatus rv64_save_signal_context(EmuProcess* process,
+ EmuThread* thread, u8* dst,
+ u64 size) {
+ u32 i;
+ (void)process;
+ if (!thread || !dst || size < 32u * 8u) return KIT_INVALID;
+ for (i = 0; i < 32u; ++i)
+ rv64_signal_wr64(dst + (u64)i * 8u, emu_rv64_xreg(thread, i));
+ return KIT_OK;
+}
+
+static KitStatus rv64_restore_signal_context(EmuProcess* process,
+ EmuThread* thread, const u8* src,
+ u64 size) {
+ u32 i;
+ (void)process;
+ if (!thread || !src || size < 32u * 8u) return KIT_INVALID;
+ for (i = 0; i < 32u; ++i)
+ emu_rv64_set_xreg(thread, i, rv64_signal_rd64(src + (u64)i * 8u));
+ return KIT_OK;
+}
+
+static KitStatus rv64_set_signal_handler_args(EmuProcess* process,
+ EmuThread* thread, int signo,
+ u64 siginfo, u64 ucontext) {
+ (void)process;
+ if (!thread) return KIT_INVALID;
+ emu_rv64_set_xreg(thread, 10u, (u64)signo);
+ emu_rv64_set_xreg(thread, 11u, siginfo);
+ emu_rv64_set_xreg(thread, 12u, ucontext);
+ return KIT_OK;
+}
+
+static u64 rv64_signal_stack_align(EmuProcess* process, EmuThread* thread) {
+ (void)process;
+ (void)thread;
+ return 16u;
+}
+
+static KitStatus rv64_emit_import_thunk(EmuProcess* process, u64 thunk_vaddr) {
+ u8 code[4];
+ u32 word = 0x00008067u;
+ u32 i;
+ if (!process) return KIT_INVALID;
+ for (i = 0; i < 4u; ++i) code[i] = (u8)(word >> (8u * i));
+ return emu_addr_space_copy_in(&process->image.addr_space, thunk_vaddr, code,
+ sizeof(code));
+}
+
+u64 emu_rv64_jalr(EmuThread* thread, u64 rd, u64 rs1, u64 imm, u64 next_pc) {
+ EmuImportBinding* b = NULL;
+ u64 target;
+ if (rd != 0u) emu_rv64_set_xreg(thread, (u32)rd, next_pc);
+ target = emu_rv64_xreg(thread, (u32)rs1) + imm;
+ target &= ~1ull;
+ if (emu_dl_resolve_import_thunk(thread ? thread->process : NULL, target,
+ &b) == KIT_OK &&
+ b) {
+ u64 args[3];
+ u64 result = 0;
+ args[0] = emu_rv64_xreg(thread, 10u);
+ args[1] = emu_rv64_xreg(thread, 11u);
+ args[2] = emu_rv64_xreg(thread, 12u);
+ if (emu_call_host_import(thread, b, args, 3u, &result) != KIT_OK) {
+ emu_cpu_trap_fault(emu_thread_cpu(thread));
+ return next_pc;
+ }
+ if (b->signature.result != KIT_EMU_VALUE_VOID)
+ emu_rv64_set_xreg(thread, 10u, result);
+ return next_pc;
+ }
+ return target;
+}
+
+static void* rv64_resolve_runtime_helper(void* emu, KitSlice name) {
+ (void)emu;
+ if (kit_slice_eq_cstr(name, RV64_EMU_SYM_XREG)) return (void*)emu_rv64_xreg;
+ if (kit_slice_eq_cstr(name, RV64_EMU_SYM_SET_XREG))
+ return (void*)emu_rv64_set_xreg;
+ if (kit_slice_eq_cstr(name, RV64_EMU_SYM_JALR)) return (void*)emu_rv64_jalr;
+ return NULL;
+}
+
+const ArchEmuOps rv64_emu_ops = {
+ .cpu_new = rv64_emu_cpu_new,
+ .block_fn_type = emu_block_fn_type,
+ .lift_block = rv64_emu_lift_block,
+ .get_gpr = emu_rv64_xreg,
+ .set_gpr = emu_rv64_set_xreg,
+ .get_syscall_no = rv64_get_syscall_no,
+ .get_syscall_arg = rv64_get_syscall_arg,
+ .set_syscall_result = rv64_set_syscall_result,
+ .get_sp = rv64_get_sp,
+ .set_sp = rv64_set_sp,
+ .get_tp = rv64_get_tp,
+ .set_tp = rv64_set_tp,
+ .signal_context_size = rv64_signal_context_size,
+ .save_signal_context = rv64_save_signal_context,
+ .restore_signal_context = rv64_restore_signal_context,
+ .set_signal_handler_args = rv64_set_signal_handler_args,
+ .signal_stack_align = rv64_signal_stack_align,
+ .import_thunk_size = 4u,
+ .emit_import_thunk = rv64_emit_import_thunk,
+ .resolve_runtime_helper = rv64_resolve_runtime_helper,
+};
diff --git a/src/arch/riscv/isa.c b/src/arch/riscv/isa.c
@@ -0,0 +1,2112 @@
+/* RV64 instruction descriptor table + operand print dispatch.
+ *
+ * Mirrors the aa64_isa.c pattern. Each row records (mnemonic, match,
+ * mask, format, flags); rv64_disasm_find returns the first row whose
+ * masked bits match the word, and rv64_print_operands renders the
+ * operand text using the format's unpack helper.
+ *
+ * Row ordering: first-match wins. Aliases (rows with RV64_ASMFL_ALIAS)
+ * use tighter masks placed BEFORE the canonical row they alias so the
+ * disassembler renders the alias spelling. The assembler accepts both
+ * forms via rv64_asm_find which prefers the canonical row. */
+
+#include "arch/riscv/isa.h"
+
+#include <string.h>
+
+#include "core/slice.h"
+#include "core/strbuf.h"
+
+/* True if `s` begins with the NUL-terminated literal `pfx` (length-explicit).
+ */
+static bool slice_has_prefix_cstr(Slice s, const char* pfx, size_t n) {
+ return s.len >= n && memcmp(s.s, pfx, n) == 0;
+}
+
+/* Family-match bit patterns. The opcode (bits 6:0) plus
+ * funct3/funct7/funct5 selectors narrow each match. For aliases we pin
+ * specific register fields (e.g. rs1=x0 for `li`, rd=x0 for `j`). */
+
+/* Helper: build a 32-bit match for R-type with fixed funct7/funct3/op. */
+#define MATCH_R(funct7, funct3, op) \
+ (((u32)(funct7) << 25) | ((u32)(funct3) << 12) | (u32)(op))
+#define MASK_R (0xfe00707fu) /* funct7 + funct3 + opcode */
+
+#define MATCH_I(funct3, op) (((u32)(funct3) << 12) | (u32)(op))
+#define MASK_I (0x0000707fu) /* funct3 + opcode */
+
+#define MATCH_S(funct3, op) (((u32)(funct3) << 12) | (u32)(op))
+#define MASK_S (0x0000707fu)
+
+#define MATCH_B(funct3, op) (((u32)(funct3) << 12) | (u32)(op))
+#define MASK_B (0x0000707fu)
+
+#define MATCH_U(op) ((u32)(op))
+#define MASK_U (0x0000007fu)
+
+#define MATCH_J(op) ((u32)(op))
+#define MASK_J (0x0000007fu)
+
+/* FP fused multiply-add/sub: rs3(31:27) fmt(26:25) rs2 rs1 rm rd op. */
+#define MATCH_R4(fmt, op) (((u32)(fmt) << 25) | (u32)(op))
+#define MASK_R4 (0x0600007fu)
+
+/* I-type shift in RV64: funct6 (bits 31:26) is the selector + opcode +
+ * funct3. shamt occupies bits 25:20. */
+#define MATCH_ISHIFT(funct6, funct3, op) \
+ (((u32)(funct6) << 26) | ((u32)(funct3) << 12) | (u32)(op))
+#define MASK_ISHIFT (0xfc00707fu)
+
+/* I-type shift in 32-bit (W) form uses 7-bit funct7 + 5-bit shamt. */
+#define MATCH_ISHIFTW(funct7, funct3, op) \
+ (((u32)(funct7) << 25) | ((u32)(funct3) << 12) | (u32)(op))
+#define MASK_ISHIFTW (0xfe00707fu)
+
+/* AMO: aq/rl bits 26/25 vary, so mask must exclude them. funct5 is
+ * bits[31:27]. */
+#define MATCH_AMO(funct5, funct3, op) \
+ (((u32)(funct5) << 27) | ((u32)(funct3) << 12) | (u32)(op))
+#define MASK_AMO (0xf800707fu)
+#define MATCH_AMO_ORDER(funct5, aq, rl, funct3, op) \
+ (((u32)(funct5) << 27) | ((u32)(aq) << 26) | ((u32)(rl) << 25) | \
+ ((u32)(funct3) << 12) | (u32)(op))
+#define MASK_AMO_ORDER (MASK_AMO | (3u << 25))
+
+/* FP arithmetic with rm — rm field (funct3) is don't-care. funct7
+ * encodes op-major and format. */
+#define MATCH_FP_RM(funct7, op) (((u32)(funct7) << 25) | (u32)(op))
+#define MASK_FP_RM (0xfe00007fu)
+
+/* FP R-type with fixed funct3 (compare or sign-injection variants). */
+#define MATCH_FP_R(funct7, funct3, op) MATCH_R((funct7), (funct3), (op))
+#define MASK_FP_R MASK_R
+
+/* FP conversion: funct7 + rs2 (type selector) + funct3-as-rm don't-care
+ * + opcode. The rs2 field (bits 24:20) selects integer width / signedness. */
+#define MATCH_FP_CVT(funct7, rs2, op) \
+ (((u32)(funct7) << 25) | ((u32)(rs2) << 20) | (u32)(op))
+#define MASK_FP_CVT (0xfff0007fu)
+
+/* SYSTEM (ECALL/EBREAK) — full 32-bit value matches a single instruction. */
+#define MATCH_FULL(w) ((u32)(w))
+#define MASK_FULL (0xffffffffu)
+
+/* CSR — Zicsr. csr (imm12) is don't-care, but funct3+opcode pin the op. */
+#define MATCH_CSR(funct3) (((u32)(funct3) << 12) | (u32)RV_SYSTEM)
+#define MASK_CSR (0x0000707fu)
+
+/* Compressed 16-bit instructions live in low 16 bits of the descriptor
+ * word; the mask zeroes bits 16+ to ensure a match against the C-decode
+ * path which presents the halfword in low 16 bits. */
+#define MATCH_C(w16) ((u32)(w16))
+
+/* Mnemonic Slice literal for a static table row (compile-time length). */
+#define MN(s) {{(s)}, sizeof(s) - 1}
+
+const Rv64InsnDesc rv64_insn_table[] = {
+ /* =================================================================
+ * RV64I base — integer register ops (R-type, OP=0x33)
+ * ================================================================= */
+ {MN("add"), MATCH_R(0x00, 0x0, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("sub"), MATCH_R(0x20, 0x0, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("sll"), MATCH_R(0x00, 0x1, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("slt"), MATCH_R(0x00, 0x2, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("sltu"), MATCH_R(0x00, 0x3, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("xor"), MATCH_R(0x00, 0x4, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("srl"), MATCH_R(0x00, 0x5, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("sra"), MATCH_R(0x20, 0x5, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("or"), MATCH_R(0x00, 0x6, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("and"), MATCH_R(0x00, 0x7, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+
+ /* 32-bit (W) variants — OP_32 = 0x3b (RV64-only major opcode) */
+ {MN("addw"), MATCH_R(0x00, 0x0, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+ {MN("subw"), MATCH_R(0x20, 0x0, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+ {MN("sllw"), MATCH_R(0x00, 0x1, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+ {MN("srlw"), MATCH_R(0x00, 0x5, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+ {MN("sraw"), MATCH_R(0x20, 0x5, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+
+ /* ---- I-type immediate ALU (OP_IMM=0x13) ----
+ * Aliases: `li rd, imm` = ADDI rd, x0, imm (rs1=x0).
+ * `mv rd, rs1` = ADDI rd, rs1, 0 (imm=0).
+ * `nop` = ADDI x0, x0, 0 (full word fixed). */
+ {MN("nop"),
+ 0x00000013u,
+ 0xffffffffu,
+ RV64_FMT_SYSTEM,
+ RV64_ASMFL_ALIAS,
+ 0, {0}},
+ {MN("li"), 0x00000013u, 0x000f807fu, RV64_FMT_I, RV64_ASMFL_ALIAS, 0, {0}},
+ /* mv: ADDI with imm=0. mask requires imm12=0 + funct3=0 + op. */
+ {MN("mv"), 0x00000013u, 0xfff0707fu, RV64_FMT_I, RV64_ASMFL_ALIAS, 0, {0}},
+ /* seqz: SLTIU rd, rs, 1 — funct3=3, imm12=1, op=OP_IMM. */
+ {MN("seqz"),
+ 0x00103013u,
+ 0xfff0707fu,
+ RV64_FMT_I,
+ RV64_ASMFL_ALIAS,
+ 0, {0}},
+ /* snez: SLTU rd, x0, rs2 — rs1=x0, funct3=3, op=OP. */
+ {MN("snez"),
+ 0x00003033u,
+ 0xfe0ff07fu,
+ RV64_FMT_R,
+ RV64_ASMFL_ALIAS,
+ 0, {0}},
+ /* not: XORI rd, rs, -1 — imm12=0xfff, funct3=4, op=OP_IMM. */
+ {MN("not"), 0xfff04013u, 0xfff0707fu, RV64_FMT_I, RV64_ASMFL_ALIAS, 0, {0}},
+ /* neg: SUB rd, x0, rs2 — rs1=x0, funct7=0x20, funct3=0. */
+ {MN("neg"), 0x40000033u, 0xfe0ff07fu, RV64_FMT_R, RV64_ASMFL_ALIAS, 0, {0}},
+ /* negw: SUBW rd, x0, rs2 (RV64-only, SUBW major opcode). */
+ {MN("negw"),
+ 0x4000003bu,
+ 0xfe0ff07fu,
+ RV64_FMT_R,
+ RV64_ASMFL_ALIAS,
+ RV_AV_RV64, {0}},
+ {MN("addi"), MATCH_I(0x0, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, 0, {0}},
+ {MN("slti"), MATCH_I(0x2, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, 0, {0}},
+ {MN("sltiu"), MATCH_I(0x3, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, 0, {0}},
+ {MN("xori"), MATCH_I(0x4, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, 0, {0}},
+ {MN("ori"), MATCH_I(0x6, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, 0, {0}},
+ {MN("andi"), MATCH_I(0x7, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, 0, {0}},
+
+ /* RV64I shift-imm: funct6 in bits 31:26, shamt in 25:20. */
+ {MN("slli"),
+ MATCH_ISHIFT(0x00, 0x1, RV_OP_IMM),
+ MASK_ISHIFT,
+ RV64_FMT_I_SHIFT,
+ 0,
+ 0, {0}},
+ {MN("srli"),
+ MATCH_ISHIFT(0x00, 0x5, RV_OP_IMM),
+ MASK_ISHIFT,
+ RV64_FMT_I_SHIFT,
+ 0,
+ 0, {0}},
+ {MN("srai"),
+ MATCH_ISHIFT(0x10, 0x5, RV_OP_IMM),
+ MASK_ISHIFT,
+ RV64_FMT_I_SHIFT,
+ 0,
+ 0, {0}},
+
+ /* OP_IMM_32: ADDIW + word shifts. sext.w alias = ADDIW rd, rs, 0.
+ * OP_IMM_32 major opcode (0x1b) is absent on rv32 — all RV64-only. */
+ {MN("sext.w"),
+ 0x0000001bu,
+ 0xfff0707fu,
+ RV64_FMT_I,
+ RV64_ASMFL_ALIAS,
+ RV_AV_RV64, {0}},
+ {MN("addiw"), MATCH_I(0x0, RV_OP_IMM_32), MASK_I, RV64_FMT_I, 0,
+ RV_AV_RV64, {0}},
+ {MN("slliw"),
+ MATCH_ISHIFTW(0x00, 0x1, RV_OP_IMM_32),
+ MASK_ISHIFTW,
+ RV64_FMT_I_SHIFTW,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("srliw"),
+ MATCH_ISHIFTW(0x00, 0x5, RV_OP_IMM_32),
+ MASK_ISHIFTW,
+ RV64_FMT_I_SHIFTW,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("sraiw"),
+ MATCH_ISHIFTW(0x20, 0x5, RV_OP_IMM_32),
+ MASK_ISHIFTW,
+ RV64_FMT_I_SHIFTW,
+ 0,
+ RV_AV_RV64, {0}},
+
+ /* ---- LUI / AUIPC ---- */
+ {MN("lui"), MATCH_U(RV_LUI), MASK_U, RV64_FMT_U, 0, 0, {0}},
+ {MN("auipc"), MATCH_U(RV_AUIPC), MASK_U, RV64_FMT_U, 0, 0, {0}},
+
+ /* ---- Loads (I-type, op=LOAD=0x03) ---- */
+ {MN("lb"), MATCH_I(0x0, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, 0, {0}},
+ {MN("lh"), MATCH_I(0x1, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, 0, {0}},
+ {MN("lw"), MATCH_I(0x2, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, 0, {0}},
+ {MN("ld"), MATCH_I(0x3, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0,
+ RV_AV_RV64, {0}}, /* LD funct3=3 RV64-only */
+ {MN("lbu"), MATCH_I(0x4, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, 0, {0}},
+ {MN("lhu"), MATCH_I(0x5, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, 0, {0}},
+ {MN("lwu"), MATCH_I(0x6, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0,
+ RV_AV_RV64, {0}}, /* LWU funct3=6 RV64-only */
+
+ /* ---- Stores (S-type, op=STORE=0x23) ---- */
+ {MN("sb"), MATCH_S(0x0, RV_STORE), MASK_S, RV64_FMT_STORE, 0, 0, {0}},
+ {MN("sh"), MATCH_S(0x1, RV_STORE), MASK_S, RV64_FMT_STORE, 0, 0, {0}},
+ {MN("sw"), MATCH_S(0x2, RV_STORE), MASK_S, RV64_FMT_STORE, 0, 0, {0}},
+ {MN("sd"), MATCH_S(0x3, RV_STORE), MASK_S, RV64_FMT_STORE, 0,
+ RV_AV_RV64, {0}}, /* SD funct3=3 RV64-only */
+
+ /* ---- Branches (B-type, op=BRANCH=0x63) ----
+ * Aliases: `beqz rs, off` = BEQ rs, x0, off; `bnez rs, off` = BNE. */
+ {MN("beqz"),
+ 0x00000063u,
+ 0x01f0707fu,
+ RV64_FMT_B,
+ RV64_ASMFL_ALIAS,
+ 0, {0}},
+ {MN("bnez"),
+ 0x00001063u,
+ 0x01f0707fu,
+ RV64_FMT_B,
+ RV64_ASMFL_ALIAS,
+ 0, {0}},
+ {MN("beq"), MATCH_B(0x0, RV_BRANCH), MASK_B, RV64_FMT_B, 0, 0, {0}},
+ {MN("bne"), MATCH_B(0x1, RV_BRANCH), MASK_B, RV64_FMT_B, 0, 0, {0}},
+ {MN("blt"), MATCH_B(0x4, RV_BRANCH), MASK_B, RV64_FMT_B, 0, 0, {0}},
+ {MN("bge"), MATCH_B(0x5, RV_BRANCH), MASK_B, RV64_FMT_B, 0, 0, {0}},
+ {MN("bltu"), MATCH_B(0x6, RV_BRANCH), MASK_B, RV64_FMT_B, 0, 0, {0}},
+ {MN("bgeu"), MATCH_B(0x7, RV_BRANCH), MASK_B, RV64_FMT_B, 0, 0, {0}},
+
+ /* ---- JAL / JALR ----
+ * `j off` = JAL x0, off (rd=x0).
+ * `jal off` = JAL ra, off (rd=ra, single-operand form).
+ * `ret` = JALR x0, 0(ra) (rd=x0 + rs1=ra + imm=0).
+ * `jr rs` = JALR x0, 0(rs) (rd=x0, imm=0).
+ * `jalr rs` = JALR ra, 0(rs) (rd=ra, imm=0). */
+ {MN("ret"),
+ 0x00008067u,
+ 0xffffffffu,
+ RV64_FMT_SYSTEM,
+ RV64_ASMFL_ALIAS,
+ 0, {0}},
+ {MN("jr"),
+ 0x00000067u,
+ 0xfff07fffu,
+ RV64_FMT_JALR,
+ RV64_ASMFL_ALIAS,
+ 0, {0}},
+ {MN("j"), 0x0000006fu, 0x00000fffu, RV64_FMT_J, RV64_ASMFL_ALIAS, 0, {0}},
+ {MN("jal"), MATCH_J(RV_JAL), MASK_J, RV64_FMT_J, 0, 0, {0}},
+ {MN("jalr"), MATCH_I(0x0, RV_JALR), MASK_I, RV64_FMT_JALR, 0, 0, {0}},
+
+ /* ---- Multi-word pseudo-instructions ----
+ * `call sym` = AUIPC ra, %pcrel_hi(sym); JALR ra, %pcrel_lo(ra) — one
+ * R_RV_CALL reloc at the AUIPC; the linker patches both.
+ * `tail sym` = AUIPC t1, ...; JALR zero, t1 — same R_RV_CALL reloc.
+ * `la rd,sym` / `lla rd,sym` = AUIPC rd, %pcrel_hi(sym); ADDI rd, rd,
+ * %pcrel_lo. kit's static Local-Exec model treats `la`
+ * and `lla` identically (no GOT indirection). The match
+ * column is unused: RV64_FMT_PSEUDO dispatches on the
+ * mnemonic and emits the expansion directly. */
+ {MN("call"), 0u, 0u, RV64_FMT_PSEUDO, RV64_ASMFL_PSEUDO, 0, {0}},
+ {MN("tail"), 0u, 0u, RV64_FMT_PSEUDO, RV64_ASMFL_PSEUDO, 0, {0}},
+ {MN("la"), 0u, 0u, RV64_FMT_PSEUDO, RV64_ASMFL_PSEUDO, 0, {0}},
+ {MN("lla"), 0u, 0u, RV64_FMT_PSEUDO, RV64_ASMFL_PSEUDO, 0, {0}},
+
+ /* ---- FENCE ---- */
+ {MN("fence"), MATCH_I(0x0, RV_FENCE), MASK_I, RV64_FMT_FENCE, 0, 0, {0}},
+ {MN("fence.i"),
+ MATCH_FULL(0x0000100fu),
+ MASK_FULL,
+ RV64_FMT_SYSTEM,
+ 0,
+ 0, {0}},
+
+ /* ---- System (ECALL/EBREAK) ---- */
+ {MN("ecall"),
+ MATCH_FULL(0x00000073u),
+ MASK_FULL,
+ RV64_FMT_SYSTEM,
+ 0,
+ 0, {0}},
+ {MN("ebreak"),
+ MATCH_FULL(0x00100073u),
+ MASK_FULL,
+ RV64_FMT_SYSTEM,
+ 0,
+ 0, {0}},
+
+ /* =================================================================
+ * Zicsr (CSR access) — RV_SYSTEM with funct3 ∈ {1..3, 5..7}.
+ * ================================================================= */
+ {MN("csrrw"), MATCH_CSR(0x1), MASK_CSR, RV64_FMT_CSR, 0, 0, {0}},
+ {MN("csrrs"), MATCH_CSR(0x2), MASK_CSR, RV64_FMT_CSR, 0, 0, {0}},
+ {MN("csrrc"), MATCH_CSR(0x3), MASK_CSR, RV64_FMT_CSR, 0, 0, {0}},
+ {MN("csrrwi"), MATCH_CSR(0x5), MASK_CSR, RV64_FMT_CSRI, 0, 0, {0}},
+ {MN("csrrsi"), MATCH_CSR(0x6), MASK_CSR, RV64_FMT_CSRI, 0, 0, {0}},
+ {MN("csrrci"), MATCH_CSR(0x7), MASK_CSR, RV64_FMT_CSRI, 0, 0, {0}},
+
+ /* =================================================================
+ * RV64M (multiply / divide) — funct7 = 0x01
+ * ================================================================= */
+ {MN("mul"), MATCH_R(0x01, 0x0, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("mulh"), MATCH_R(0x01, 0x1, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("mulhsu"), MATCH_R(0x01, 0x2, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("mulhu"), MATCH_R(0x01, 0x3, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("div"), MATCH_R(0x01, 0x4, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("divu"), MATCH_R(0x01, 0x5, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("rem"), MATCH_R(0x01, 0x6, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ {MN("remu"), MATCH_R(0x01, 0x7, RV_OP), MASK_R, RV64_FMT_R, 0, 0, {0}},
+ /* W-form multiply/divide — OP_32 major opcode, RV64-only. */
+ {MN("mulw"), MATCH_R(0x01, 0x0, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+ {MN("divw"), MATCH_R(0x01, 0x4, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+ {MN("divuw"), MATCH_R(0x01, 0x5, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+ {MN("remw"), MATCH_R(0x01, 0x6, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+ {MN("remuw"), MATCH_R(0x01, 0x7, RV_OP_32), MASK_R, RV64_FMT_R, 0,
+ RV_AV_RV64, {0}},
+
+ /* =================================================================
+ * RV32F / RV32D — single and double precision FP
+ * ================================================================= */
+ /* FP fused multiply-add/subtract — rm defaults to dyn in the assembler. */
+ {MN("fmadd.s"),
+ MATCH_R4(RV_FMT_S, RV_MADD),
+ MASK_R4,
+ RV64_FMT_R4,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fmsub.s"),
+ MATCH_R4(RV_FMT_S, RV_MSUB),
+ MASK_R4,
+ RV64_FMT_R4,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fnmsub.s"),
+ MATCH_R4(RV_FMT_S, RV_NMSUB),
+ MASK_R4,
+ RV64_FMT_R4,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fnmadd.s"),
+ MATCH_R4(RV_FMT_S, RV_NMADD),
+ MASK_R4,
+ RV64_FMT_R4,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fmadd.d"),
+ MATCH_R4(RV_FMT_D, RV_MADD),
+ MASK_R4,
+ RV64_FMT_R4,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fmsub.d"),
+ MATCH_R4(RV_FMT_D, RV_MSUB),
+ MASK_R4,
+ RV64_FMT_R4,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fnmsub.d"),
+ MATCH_R4(RV_FMT_D, RV_NMSUB),
+ MASK_R4,
+ RV64_FMT_R4,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fnmadd.d"),
+ MATCH_R4(RV_FMT_D, RV_NMADD),
+ MASK_R4,
+ RV64_FMT_R4,
+ RV64_ASMFL_FP,
+ 0, {0}},
+
+ /* FP arithmetic — rm field (funct3) is the rounding mode and prints
+ * as the DYN(=7) default suppressed. funct7 low bits select fmt. */
+ {MN("fadd.s"),
+ MATCH_FP_RM(0x00, RV_OP_FP),
+ MASK_FP_RM,
+ RV64_FMT_FP_RM,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fsub.s"),
+ MATCH_FP_RM(0x04, RV_OP_FP),
+ MASK_FP_RM,
+ RV64_FMT_FP_RM,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fmul.s"),
+ MATCH_FP_RM(0x08, RV_OP_FP),
+ MASK_FP_RM,
+ RV64_FMT_FP_RM,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fdiv.s"),
+ MATCH_FP_RM(0x0c, RV_OP_FP),
+ MASK_FP_RM,
+ RV64_FMT_FP_RM,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fadd.d"),
+ MATCH_FP_RM(0x01, RV_OP_FP),
+ MASK_FP_RM,
+ RV64_FMT_FP_RM,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fsub.d"),
+ MATCH_FP_RM(0x05, RV_OP_FP),
+ MASK_FP_RM,
+ RV64_FMT_FP_RM,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fmul.d"),
+ MATCH_FP_RM(0x09, RV_OP_FP),
+ MASK_FP_RM,
+ RV64_FMT_FP_RM,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fdiv.d"),
+ MATCH_FP_RM(0x0d, RV_OP_FP),
+ MASK_FP_RM,
+ RV64_FMT_FP_RM,
+ RV64_ASMFL_FP,
+ 0, {0}},
+
+ /* FP sqrt — funct7 = 0x2c (S) / 0x2d (D), rs2 must be 0. */
+ {MN("fsqrt.s"),
+ MATCH_FP_CVT(0x2c, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fsqrt.d"),
+ MATCH_FP_CVT(0x2d, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+
+ /* FP min/max — funct7 = 0x14/0x15, funct3 = 0 (min) / 1 (max). */
+ {MN("fmin.s"),
+ MATCH_FP_R(0x14, 0x0, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fmax.s"),
+ MATCH_FP_R(0x14, 0x1, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fmin.d"),
+ MATCH_FP_R(0x15, 0x0, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fmax.d"),
+ MATCH_FP_R(0x15, 0x1, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+
+ /* FP sign-injection — funct7 = 0x10/0x11, funct3 = 0/1/2 = J/JN/JX. */
+ {MN("fsgnj.s"),
+ MATCH_FP_R(0x10, 0x0, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fsgnjn.s"),
+ MATCH_FP_R(0x10, 0x1, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fsgnjx.s"),
+ MATCH_FP_R(0x10, 0x2, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fsgnj.d"),
+ MATCH_FP_R(0x11, 0x0, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fsgnjn.d"),
+ MATCH_FP_R(0x11, 0x1, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fsgnjx.d"),
+ MATCH_FP_R(0x11, 0x2, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_FP | RV64_ASMFL_NORM,
+ 0, {0}},
+
+ /* FP compare — funct7 = 0x50 (S) / 0x51 (D), funct3 = 0/1/2 = LE/LT/EQ.
+ * rd is integer GPR (not FP). */
+ {MN("fle.s"),
+ MATCH_FP_R(0x50, 0x0, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("flt.s"),
+ MATCH_FP_R(0x50, 0x1, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("feq.s"),
+ MATCH_FP_R(0x50, 0x2, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("fle.d"),
+ MATCH_FP_R(0x51, 0x0, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("flt.d"),
+ MATCH_FP_R(0x51, 0x1, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_NORM,
+ 0, {0}},
+ {MN("feq.d"),
+ MATCH_FP_R(0x51, 0x2, RV_OP_FP),
+ MASK_FP_R,
+ RV64_FMT_FP_R,
+ RV64_ASMFL_NORM,
+ 0, {0}},
+
+ /* FP classification — rd is GPR, rs1 is FPR, rs2=0, rm/funct3=1. */
+ {MN("fclass.s"),
+ MATCH_FP_R(0x70, 0x1, RV_OP_FP) | (0u << 20),
+ MASK_FP_CVT | (7u << 12),
+ RV64_FMT_FP_CVT,
+ 0,
+ 0, {0}},
+ {MN("fclass.d"),
+ MATCH_FP_R(0x71, 0x1, RV_OP_FP) | (0u << 20),
+ MASK_FP_CVT | (7u << 12),
+ RV64_FMT_FP_CVT,
+ 0,
+ 0, {0}},
+
+ /* FP conversions — funct7 selects {direction, fmt}, rs2 selects
+ * integer width/signedness. */
+ {MN("fcvt.w.s"),
+ MATCH_FP_CVT(0x60, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ 0, {0}},
+ {MN("fcvt.wu.s"),
+ MATCH_FP_CVT(0x60, 0x1, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ 0, {0}},
+ {MN("fcvt.l.s"),
+ MATCH_FP_CVT(0x60, 0x2, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ RV_AV_RV64, {0}}, /* 64-bit int dest needs 64-bit GPR */
+ {MN("fcvt.lu.s"),
+ MATCH_FP_CVT(0x60, 0x3, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("fcvt.w.d"),
+ MATCH_FP_CVT(0x61, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ 0, {0}},
+ {MN("fcvt.wu.d"),
+ MATCH_FP_CVT(0x61, 0x1, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ 0, {0}},
+ {MN("fcvt.l.d"),
+ MATCH_FP_CVT(0x61, 0x2, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("fcvt.lu.d"),
+ MATCH_FP_CVT(0x61, 0x3, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("fcvt.s.w"),
+ MATCH_FP_CVT(0x68, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fcvt.s.wu"),
+ MATCH_FP_CVT(0x68, 0x1, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fcvt.s.l"),
+ MATCH_FP_CVT(0x68, 0x2, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ RV_AV_RV64, {0}},
+ {MN("fcvt.s.lu"),
+ MATCH_FP_CVT(0x68, 0x3, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ RV_AV_RV64, {0}},
+ {MN("fcvt.d.w"),
+ MATCH_FP_CVT(0x69, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fcvt.d.wu"),
+ MATCH_FP_CVT(0x69, 0x1, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fcvt.d.l"),
+ MATCH_FP_CVT(0x69, 0x2, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ RV_AV_RV64, {0}},
+ {MN("fcvt.d.lu"),
+ MATCH_FP_CVT(0x69, 0x3, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ RV_AV_RV64, {0}},
+ {MN("fcvt.s.d"),
+ MATCH_FP_CVT(0x20, 0x1, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fcvt.d.s"),
+ MATCH_FP_CVT(0x21, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+
+ /* FP bitcast moves — funct7 + rs2=0 + funct3=0 fixed. */
+ {MN("fmv.x.w"),
+ MATCH_FP_CVT(0x70, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ 0, {0}},
+ {MN("fmv.w.x"),
+ MATCH_FP_CVT(0x78, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fmv.x.d"),
+ MATCH_FP_CVT(0x71, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ 0,
+ RV_AV_RV64, {0}}, /* moves a 64-bit double through a GPR */
+ {MN("fmv.d.x"),
+ MATCH_FP_CVT(0x79, 0x0, RV_OP_FP),
+ MASK_FP_CVT,
+ RV64_FMT_FP_CVT,
+ RV64_ASMFL_FP,
+ RV_AV_RV64, {0}},
+
+ /* FP load/store */
+ {MN("flw"),
+ MATCH_I(0x2, RV_LOAD_FP),
+ MASK_I,
+ RV64_FMT_FP_LOAD,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fld"),
+ MATCH_I(0x3, RV_LOAD_FP),
+ MASK_I,
+ RV64_FMT_FP_LOAD,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fsw"),
+ MATCH_S(0x2, RV_STORE_FP),
+ MASK_S,
+ RV64_FMT_FP_STORE,
+ RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("fsd"),
+ MATCH_S(0x3, RV_STORE_FP),
+ MASK_S,
+ RV64_FMT_FP_STORE,
+ RV64_ASMFL_FP,
+ 0, {0}},
+
+ /* =================================================================
+ * RV64A (atomic) — AMO funct5 + funct3 (W=2, D=3). aq/rl vary, so
+ * mask leaves bits 26:25 free. We expose the .aq/.rl ordering
+ * suffixes via the disassembler's annotation, but the row mnemonic
+ * itself is the bare form (e.g. "amoadd.w").
+ * ================================================================= */
+ {MN("lr.w.aq"),
+ MATCH_AMO_ORDER(0x02, 1, 0, 0x2, RV_AMO),
+ MASK_AMO_ORDER | (0x1fu << 20),
+ RV64_FMT_LR,
+ 0,
+ 0, {0}},
+ {MN("lr.w.rl"),
+ MATCH_AMO_ORDER(0x02, 0, 1, 0x2, RV_AMO),
+ MASK_AMO_ORDER | (0x1fu << 20),
+ RV64_FMT_LR,
+ 0,
+ 0, {0}},
+ {MN("lr.w.aqrl"),
+ MATCH_AMO_ORDER(0x02, 1, 1, 0x2, RV_AMO),
+ MASK_AMO_ORDER | (0x1fu << 20),
+ RV64_FMT_LR,
+ 0,
+ 0, {0}},
+ {MN("lr.d.aq"),
+ MATCH_AMO_ORDER(0x02, 1, 0, 0x3, RV_AMO),
+ MASK_AMO_ORDER | (0x1fu << 20),
+ RV64_FMT_LR,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("lr.d.rl"),
+ MATCH_AMO_ORDER(0x02, 0, 1, 0x3, RV_AMO),
+ MASK_AMO_ORDER | (0x1fu << 20),
+ RV64_FMT_LR,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("lr.d.aqrl"),
+ MATCH_AMO_ORDER(0x02, 1, 1, 0x3, RV_AMO),
+ MASK_AMO_ORDER | (0x1fu << 20),
+ RV64_FMT_LR,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("sc.w.aq"),
+ MATCH_AMO_ORDER(0x03, 1, 0, 0x2, RV_AMO),
+ MASK_AMO_ORDER,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("sc.w.rl"),
+ MATCH_AMO_ORDER(0x03, 0, 1, 0x2, RV_AMO),
+ MASK_AMO_ORDER,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("sc.w.aqrl"),
+ MATCH_AMO_ORDER(0x03, 1, 1, 0x2, RV_AMO),
+ MASK_AMO_ORDER,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("sc.d.aq"),
+ MATCH_AMO_ORDER(0x03, 1, 0, 0x3, RV_AMO),
+ MASK_AMO_ORDER,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("sc.d.rl"),
+ MATCH_AMO_ORDER(0x03, 0, 1, 0x3, RV_AMO),
+ MASK_AMO_ORDER,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("sc.d.aqrl"),
+ MATCH_AMO_ORDER(0x03, 1, 1, 0x3, RV_AMO),
+ MASK_AMO_ORDER,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+/* `av` tags the doubleword (.d) atomics RV_AV_RV64 (RV64-only); word (.w)
+ * forms pass 0 (BOTH). The av byte sits between flags and pad[1]. */
+#define RV64_AMO_ORDER_ROWS(mn, f5, f3, av) \
+ {MN(mn ".aq"), \
+ MATCH_AMO_ORDER(f5, 1, 0, f3, RV_AMO), \
+ MASK_AMO_ORDER, \
+ RV64_FMT_AMO, \
+ 0, \
+ (av), {0}}, \
+ {MN(mn ".rl"), \
+ MATCH_AMO_ORDER(f5, 0, 1, f3, RV_AMO), \
+ MASK_AMO_ORDER, \
+ RV64_FMT_AMO, \
+ 0, \
+ (av), {0}}, \
+ { \
+ MN(mn ".aqrl"), MATCH_AMO_ORDER(f5, 1, 1, f3, RV_AMO), MASK_AMO_ORDER, \
+ RV64_FMT_AMO, 0, (av), {0} \
+ }
+ RV64_AMO_ORDER_ROWS("amoswap.w", RV_AMO_SWAP, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amoadd.w", RV_AMO_ADD, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amoxor.w", RV_AMO_XOR, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amoand.w", RV_AMO_AND, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amoor.w", RV_AMO_OR, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amomin.w", RV_AMO_MIN, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amomax.w", RV_AMO_MAX, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amominu.w", RV_AMO_MINU, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amomaxu.w", RV_AMO_MAXU, 0x2, 0),
+ RV64_AMO_ORDER_ROWS("amoswap.d", RV_AMO_SWAP, 0x3, RV_AV_RV64),
+ RV64_AMO_ORDER_ROWS("amoadd.d", RV_AMO_ADD, 0x3, RV_AV_RV64),
+ RV64_AMO_ORDER_ROWS("amoxor.d", RV_AMO_XOR, 0x3, RV_AV_RV64),
+ RV64_AMO_ORDER_ROWS("amoand.d", RV_AMO_AND, 0x3, RV_AV_RV64),
+ RV64_AMO_ORDER_ROWS("amoor.d", RV_AMO_OR, 0x3, RV_AV_RV64),
+ RV64_AMO_ORDER_ROWS("amomin.d", RV_AMO_MIN, 0x3, RV_AV_RV64),
+ RV64_AMO_ORDER_ROWS("amomax.d", RV_AMO_MAX, 0x3, RV_AV_RV64),
+ RV64_AMO_ORDER_ROWS("amominu.d", RV_AMO_MINU, 0x3, RV_AV_RV64),
+ RV64_AMO_ORDER_ROWS("amomaxu.d", RV_AMO_MAXU, 0x3, RV_AV_RV64),
+ {MN("lr.w"),
+ MATCH_AMO(0x02, 0x2, RV_AMO),
+ MASK_AMO | (0x1fu << 20),
+ RV64_FMT_LR,
+ 0,
+ 0, {0}},
+ {MN("lr.d"),
+ MATCH_AMO(0x02, 0x3, RV_AMO),
+ MASK_AMO | (0x1fu << 20),
+ RV64_FMT_LR,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("sc.w"),
+ MATCH_AMO(0x03, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("sc.d"),
+ MATCH_AMO(0x03, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amoswap.w"),
+ MATCH_AMO(RV_AMO_SWAP, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amoadd.w"),
+ MATCH_AMO(RV_AMO_ADD, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amoxor.w"),
+ MATCH_AMO(RV_AMO_XOR, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amoand.w"),
+ MATCH_AMO(RV_AMO_AND, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amoor.w"),
+ MATCH_AMO(RV_AMO_OR, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amomin.w"),
+ MATCH_AMO(RV_AMO_MIN, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amomax.w"),
+ MATCH_AMO(RV_AMO_MAX, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amominu.w"),
+ MATCH_AMO(RV_AMO_MINU, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amomaxu.w"),
+ MATCH_AMO(RV_AMO_MAXU, 0x2, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ 0, {0}},
+ {MN("amoswap.d"),
+ MATCH_AMO(RV_AMO_SWAP, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amoadd.d"),
+ MATCH_AMO(RV_AMO_ADD, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amoxor.d"),
+ MATCH_AMO(RV_AMO_XOR, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amoand.d"),
+ MATCH_AMO(RV_AMO_AND, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amoor.d"),
+ MATCH_AMO(RV_AMO_OR, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amomin.d"),
+ MATCH_AMO(RV_AMO_MIN, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amomax.d"),
+ MATCH_AMO(RV_AMO_MAX, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amominu.d"),
+ MATCH_AMO(RV_AMO_MINU, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+ {MN("amomaxu.d"),
+ MATCH_AMO(RV_AMO_MAXU, 0x3, RV_AMO),
+ MASK_AMO,
+ RV64_FMT_AMO,
+ 0,
+ RV_AV_RV64, {0}},
+
+ /* =================================================================
+ * RV64C compressed — assembler rows. The disassembler uses the
+ * dynamic C decoder below, so 32-bit decode skips these rows.
+ * ================================================================= */
+ {MN("c.nop"), 0x0001u, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.ebreak"), 0x9002u, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.jr"), 0x8002u, 0xf07fu, RV64_FMT_CR, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.jalr"), 0x9002u, 0xf07fu, RV64_FMT_CR, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.mv"), 0x8002u, 0xf003u, RV64_FMT_CR, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.add"), 0x9002u, 0xf003u, RV64_FMT_CR, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.li"), 0x4001u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.addi"), 0x0001u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, 0, {0}},
+ /* q1/f3=001: c.addiw on rv64, but the SAME encoding is c.jal on rv32. */
+ {MN("c.addiw"), 0x2001u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16,
+ RV_AV_RV64, {0}},
+ {MN("c.jal"), 0x2001u, 0xe003u, RV64_FMT_CJ, RV64_ASMFL_C16,
+ RV_AV_RV32, {0}},
+ {MN("c.slli"), 0x0002u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.lui"), 0x6001u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.addi16sp"), 0x6101u, 0xef83u, RV64_FMT_CI, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.lwsp"), 0x4002u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, 0, {0}},
+ /* q2/f3=011: c.ldsp on rv64, c.flwsp on rv32 (same encoding). */
+ {MN("c.ldsp"), 0x6002u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16,
+ RV_AV_RV64, {0}},
+ {MN("c.flwsp"), 0x6002u, 0xe003u, RV64_FMT_CI,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP, RV_AV_RV32, {0}},
+ {MN("c.fldsp"),
+ 0x2002u,
+ 0xe003u,
+ RV64_FMT_CI,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("c.swsp"), 0xc002u, 0xe003u, RV64_FMT_CSS, RV64_ASMFL_C16, 0, {0}},
+ /* q2/f3=111: c.sdsp on rv64, c.fswsp on rv32 (same encoding). */
+ {MN("c.sdsp"), 0xe002u, 0xe003u, RV64_FMT_CSS, RV64_ASMFL_C16,
+ RV_AV_RV64, {0}},
+ {MN("c.fswsp"), 0xe002u, 0xe003u, RV64_FMT_CSS,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP, RV_AV_RV32, {0}},
+ {MN("c.fsdsp"),
+ 0xa002u,
+ 0xe003u,
+ RV64_FMT_CSS,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("c.addi4spn"), 0x0000u, 0xe003u, RV64_FMT_CIW, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.lw"), 0x4000u, 0xe003u, RV64_FMT_CL, RV64_ASMFL_C16, 0, {0}},
+ /* q0/f3=011: c.ld on rv64, c.flw on rv32 (same encoding). */
+ {MN("c.ld"), 0x6000u, 0xe003u, RV64_FMT_CL, RV64_ASMFL_C16,
+ RV_AV_RV64, {0}},
+ {MN("c.flw"), 0x6000u, 0xe003u, RV64_FMT_CL,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP, RV_AV_RV32, {0}},
+ {MN("c.fld"),
+ 0x2000u,
+ 0xe003u,
+ RV64_FMT_CL,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("c.sw"), 0xc000u, 0xe003u, RV64_FMT_CS, RV64_ASMFL_C16, 0, {0}},
+ /* q0/f3=111: c.sd on rv64, c.fsw on rv32 (same encoding). */
+ {MN("c.sd"), 0xe000u, 0xe003u, RV64_FMT_CS, RV64_ASMFL_C16,
+ RV_AV_RV64, {0}},
+ {MN("c.fsw"), 0xe000u, 0xe003u, RV64_FMT_CS,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP, RV_AV_RV32, {0}},
+ {MN("c.fsd"),
+ 0xa000u,
+ 0xe003u,
+ RV64_FMT_CS,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP,
+ 0, {0}},
+ {MN("c.srli"), 0x8001u, 0xec03u, RV64_FMT_CB, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.srai"), 0x8401u, 0xec03u, RV64_FMT_CB, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.andi"), 0x8801u, 0xec03u, RV64_FMT_CB, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.sub"), 0x8c01u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.xor"), 0x8c21u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.or"), 0x8c41u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.and"), 0x8c61u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, 0, {0}},
+ /* c.subw/c.addw are RV64-only (their CA slot is reserved on rv32). */
+ {MN("c.subw"), 0x9c01u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16,
+ RV_AV_RV64, {0}},
+ {MN("c.addw"), 0x9c21u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16,
+ RV_AV_RV64, {0}},
+ {MN("c.j"), 0xa001u, 0xe003u, RV64_FMT_CJ, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.beqz"), 0xc001u, 0xe003u, RV64_FMT_CB, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.bnez"), 0xe001u, 0xe003u, RV64_FMT_CB, RV64_ASMFL_C16, 0, {0}},
+};
+#undef RV64_AMO_ORDER_ROWS
+
+const u32 rv64_insn_table_n =
+ (u32)(sizeof rv64_insn_table / sizeof rv64_insn_table[0]);
+
+/* A row is available for `av_wanted` when its av column is 0 (BOTH) or
+ * its av mask intersects the wanted arch. */
+static bool rv_av_ok(u8 av, u8 av_wanted) {
+ return av == 0u || (av & av_wanted) != 0u;
+}
+
+const Rv64InsnDesc* rv64_disasm_find(u32 word, u8 av_wanted) {
+ for (u32 i = 0; i < rv64_insn_table_n; ++i) {
+ const Rv64InsnDesc* d = &rv64_insn_table[i];
+ if ((d->flags & RV64_ASMFL_C16)) continue; /* 32-bit decode path */
+ if ((d->flags & RV64_ASMFL_PSEUDO)) continue; /* assembler-only expansion */
+ if (!rv_av_ok(d->av, av_wanted)) continue; /* wrong-XLEN row */
+ if ((word & d->mask) == d->match) return d;
+ }
+ return NULL;
+}
+
+const Rv64InsnDesc* rv64_asm_find(Slice mnemonic, u8 av_wanted) {
+ /* Prefer canonical (non-alias) rows when both spellings exist; the
+ * caller can still write the alias and we'll match it on a second
+ * pass. Aliases share encoding with the canonical row so the choice
+ * is purely for diagnostics. Rows whose av excludes the target arch
+ * are skipped so e.g. `ld`/`addiw` are not assemblable under rv32. */
+ if (!mnemonic.s) return NULL;
+ for (u32 i = 0; i < rv64_insn_table_n; ++i) {
+ const Rv64InsnDesc* d = &rv64_insn_table[i];
+ if ((d->flags & RV64_ASMFL_ALIAS)) continue;
+ if (!rv_av_ok(d->av, av_wanted)) continue;
+ if (slice_eq(d->mnemonic, mnemonic)) return d;
+ }
+ for (u32 i = 0; i < rv64_insn_table_n; ++i) {
+ const Rv64InsnDesc* d = &rv64_insn_table[i];
+ if (!rv_av_ok(d->av, av_wanted)) continue;
+ if (slice_eq(d->mnemonic, mnemonic)) return d;
+ }
+ return NULL;
+}
+
+/* =====================================================================
+ * Compressed-instruction decode.
+ *
+ * RV64C instructions are 16 bits; bits[1:0] (op-quadrant) is 00/01/10
+ * (11 means uncompressed/32-bit). bits[15:13] (funct3) further select.
+ *
+ * For the disassembler we expose a small set of the common encodings;
+ * less common ones decode as .hword. */
+
+static u32 rv64c_lookup_simple(u32 w) {
+ u32 op = w & 0x3u;
+ u32 f3 = (w >> 13) & 0x7u;
+ /* C.NOP: funct3=000, op=01, rd/rs1=x0, imm=0 → word=0x0001 */
+ if (w == 0x0001u) return 1; /* index in table-c below */
+ /* C.EBREAK: 0x9002 */
+ if (w == 0x9002u) return 2;
+ (void)op;
+ (void)f3;
+ return 0;
+}
+
+/* The C-extension descriptors are stored in a private table indexed by
+ * an internal enum. They are minimal — most C-format instructions print
+ * with custom operand printers. */
+static const Rv64InsnDesc rv64_c_table[] = {
+ /* index 0 reserved (no match). */
+ {MN("c.unknown"), 0, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.nop"), 0x0001u, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, 0, {0}},
+ {MN("c.ebreak"), 0x9002u, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, 0, {0}},
+};
+
+#undef MN
+
+const Rv64InsnDesc* rv64_disasm_find_c(u32 word, u8 av_wanted) {
+ u32 hw = word & 0xffffu;
+ u32 idx = rv64c_lookup_simple(hw);
+ /* True when decoding for rv32: several RVC quadrant slots whose integer
+ * doubleword meaning is RV64-only carry an FP load/store meaning instead
+ * (RV32FC), and q1/f3=001 is c.jal not c.addiw. */
+ bool rv32 = (av_wanted & RV_AV_RV32) != 0u;
+ if (idx) return &rv64_c_table[idx];
+ /* Pattern-match remaining common C-instructions. We use a tiny static
+ * scratch descriptor that the printer interprets by funct3+op. */
+ static Rv64InsnDesc dyn;
+ u32 op = hw & 0x3u;
+ u32 f3 = (hw >> 13) & 0x7u;
+ if (op == 3u) return NULL; /* uncompressed */
+
+ /* C.JR / C.JALR / C.MV / C.ADD — quadrant 2, funct3=100 */
+ if (op == 2u && f3 == 4u) {
+ u32 funct4 = (hw >> 12) & 0xfu;
+ u32 rd_rs1 = (hw >> 7) & 0x1fu;
+ u32 rs2 = (hw >> 2) & 0x1fu;
+ if (funct4 == 0x8u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr(rs2 == 0 ? "c.jr" : "c.mv"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CR,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return rd_rs1 == 0 ? NULL : &dyn;
+ }
+ if (funct4 == 0x9u) {
+ if (rs2 == 0 && rd_rs1 == 0) {
+ dyn = rv64_c_table[2]; /* c.ebreak */
+ return &dyn;
+ }
+ dyn = (Rv64InsnDesc){slice_from_cstr(rs2 == 0 ? "c.jalr" : "c.add"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CR,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ }
+ /* C.LI / C.ADDI / C.LUI — quadrant 1 */
+ if (op == 1u && f3 == 2u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.li"), hw, 0xffffu, RV64_FMT_CI,
+ RV64_ASMFL_C16, 0, {0}};
+ return &dyn;
+ }
+ if (op == 1u && f3 == 1u) {
+ /* q1/f3=001: c.addiw on rv64, c.jal on rv32 (same encoding). */
+ dyn = rv32 ? (Rv64InsnDesc){slice_from_cstr("c.jal"), hw, 0xffffu,
+ RV64_FMT_CJ, RV64_ASMFL_C16,
+ 0, {0}}
+ : (Rv64InsnDesc){slice_from_cstr("c.addiw"), hw, 0xffffu,
+ RV64_FMT_CI, RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 1u && f3 == 0u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.addi"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CI,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 1u && f3 == 3u) {
+ u32 rd = (hw >> 7) & 0x1fu;
+ dyn = (Rv64InsnDesc){slice_from_cstr(rd == 2u ? "c.addi16sp" : "c.lui"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CI,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 1u && f3 == 4u) {
+ u32 top = (hw >> 10) & 0x3u;
+ if (top == 0u || top == 1u || top == 2u) {
+ static const char* const names[3] = {"c.srli", "c.srai", "c.andi"};
+ dyn = (Rv64InsnDesc){slice_from_cstr(names[top]),
+ hw,
+ 0xffffu,
+ RV64_FMT_CB,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ {
+ u32 bit12 = (hw >> 12) & 1u;
+ u32 subop = (hw >> 5) & 0x3u;
+ static const char* const ca0[4] = {"c.sub", "c.xor", "c.or", "c.and"};
+ static const char* const ca1[4] = {"c.subw", "c.addw", NULL, NULL};
+ /* bit12==1 selects c.subw/c.addw — RV64-only; reserved on rv32. */
+ const char* name = bit12 ? (rv32 ? NULL : ca1[subop]) : ca0[subop];
+ if (!name) return NULL;
+ dyn = (Rv64InsnDesc){slice_from_cstr(name), hw, 0xffffu, RV64_FMT_CA,
+ RV64_ASMFL_C16, 0, {0}};
+ return &dyn;
+ }
+ }
+ if (op == 1u && f3 == 5u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.j"), hw, 0xffffu, RV64_FMT_CJ,
+ RV64_ASMFL_C16, 0, {0}};
+ return &dyn;
+ }
+ if (op == 1u && f3 == 6u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.beqz"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CB,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 1u && f3 == 7u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.bnez"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CB,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ /* C.LWSP / C.LDSP — quadrant 2, funct3=010/011 */
+ if (op == 2u && f3 == 2u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.lwsp"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CI,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 2u && f3 == 3u) {
+ /* q2/f3=011: c.ldsp on rv64, c.flwsp on rv32 (same encoding). */
+ if (rv32)
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.flwsp"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CI,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP,
+ 0,
+ {0}};
+ else
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.ldsp"), hw, 0xffffu,
+ RV64_FMT_CI, RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 2u && f3 == 0u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.slli"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CI,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 2u && f3 == 1u) {
+ dyn = (Rv64InsnDesc){
+ slice_from_cstr("c.fldsp"), hw, 0xffffu, RV64_FMT_CI,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP, 0, {0}};
+ return &dyn;
+ }
+ /* C.SWSP / C.SDSP — quadrant 2, funct3=110/111 */
+ if (op == 2u && f3 == 6u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.swsp"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CSS,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 2u && f3 == 7u) {
+ /* q2/f3=111: c.sdsp on rv64, c.fswsp on rv32 (same encoding). */
+ if (rv32)
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.fswsp"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CSS,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP,
+ 0,
+ {0}};
+ else
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.sdsp"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CSS,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ if (op == 2u && f3 == 5u) {
+ dyn = (Rv64InsnDesc){
+ slice_from_cstr("c.fsdsp"), hw, 0xffffu, RV64_FMT_CSS,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP, 0, {0}};
+ return &dyn;
+ }
+ /* C.ADDI4SPN — quadrant 0, funct3=000 */
+ if (op == 0u && f3 == 0u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.addi4spn"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CIW,
+ RV64_ASMFL_C16,
+ 0, {0}};
+ return &dyn;
+ }
+ /* C.LW / C.LD — quadrant 0, funct3=010/011 */
+ if (op == 0u && f3 == 2u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.lw"), hw, 0xffffu, RV64_FMT_CL,
+ RV64_ASMFL_C16, 0, {0}};
+ return &dyn;
+ }
+ if (op == 0u && f3 == 3u) {
+ /* q0/f3=011: c.ld on rv64, c.flw on rv32 (same encoding). */
+ if (rv32)
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.flw"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CL,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP,
+ 0,
+ {0}};
+ else
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.ld"), hw, 0xffffu, RV64_FMT_CL,
+ RV64_ASMFL_C16, 0, {0}};
+ return &dyn;
+ }
+ if (op == 0u && f3 == 1u) {
+ dyn = (Rv64InsnDesc){
+ slice_from_cstr("c.fld"), hw, 0xffffu, RV64_FMT_CL,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP, 0, {0}};
+ return &dyn;
+ }
+ if (op == 0u && f3 == 6u) {
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.sw"), hw, 0xffffu, RV64_FMT_CS,
+ RV64_ASMFL_C16, 0, {0}};
+ return &dyn;
+ }
+ if (op == 0u && f3 == 7u) {
+ /* q0/f3=111: c.sd on rv64, c.fsw on rv32 (same encoding). */
+ if (rv32)
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.fsw"),
+ hw,
+ 0xffffu,
+ RV64_FMT_CS,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP,
+ 0,
+ {0}};
+ else
+ dyn = (Rv64InsnDesc){slice_from_cstr("c.sd"), hw, 0xffffu, RV64_FMT_CS,
+ RV64_ASMFL_C16, 0, {0}};
+ return &dyn;
+ }
+ if (op == 0u && f3 == 5u) {
+ dyn = (Rv64InsnDesc){
+ slice_from_cstr("c.fsd"), hw, 0xffffu, RV64_FMT_CS,
+ RV64_ASMFL_C16 | RV64_ASMFL_FP, 0, {0}};
+ return &dyn;
+ }
+ return NULL;
+}
+
+/* =====================================================================
+ * Operand print — one helper per format. */
+
+static const char* const RV_XNAMES[32] = {
+ "zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2", "s0", "s1", "a0",
+ "a1", "a2", "a3", "a4", "a5", "a6", "a7", "s2", "s3", "s4", "s5",
+ "s6", "s7", "s8", "s9", "s10", "s11", "t3", "t4", "t5", "t6",
+};
+
+static const char* const RV_FNAMES[32] = {
+ "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7",
+ "fs0", "fs1", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5",
+ "fa6", "fa7", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7",
+ "fs8", "fs9", "fs10", "fs11", "ft8", "ft9", "ft10", "ft11",
+};
+
+static void p_xreg(StrBuf* sb, u32 r) { strbuf_puts(sb, RV_XNAMES[r & 31u]); }
+static void p_freg(StrBuf* sb, u32 r) { strbuf_puts(sb, RV_FNAMES[r & 31u]); }
+static void p_sep(StrBuf* sb) { strbuf_puts(sb, ", "); }
+static void p_mem(StrBuf* sb, i64 off, u32 base) {
+ strbuf_put_i64(sb, off);
+ strbuf_putc(sb, '(');
+ p_xreg(sb, base);
+ strbuf_putc(sb, ')');
+}
+static void p_rel(StrBuf* sb, u64 vaddr, i64 off) {
+ if (vaddr)
+ strbuf_put_hex_u64(sb, vaddr + (u64)off);
+ else {
+ strbuf_putc(sb, '#');
+ strbuf_put_i64(sb, off);
+ }
+}
+
+static void print_r(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ Rv64R f = rv64_r_unpack(w);
+ /* Two-operand aliases (snez/neg/negw) drop rs1=x0 from the print. */
+ if (d->flags & RV64_ASMFL_ALIAS) {
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_xreg(sb, f.rs2);
+ return;
+ }
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_xreg(sb, f.rs1);
+ p_sep(sb);
+ p_xreg(sb, f.rs2);
+}
+
+static void print_r4(StrBuf* sb, u32 w) {
+ u32 rd = (w >> 7) & 0x1fu;
+ u32 rs1 = (w >> 15) & 0x1fu;
+ u32 rs2 = (w >> 20) & 0x1fu;
+ u32 rs3 = (w >> 27) & 0x1fu;
+ p_freg(sb, rd);
+ p_sep(sb);
+ p_freg(sb, rs1);
+ p_sep(sb);
+ p_freg(sb, rs2);
+ p_sep(sb);
+ p_freg(sb, rs3);
+}
+
+static void print_i(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ Rv64I f = rv64_i_unpack(w);
+ i64 imm = rv64_sext((u64)f.imm12, 12);
+ /* Alias: `li rd, imm` — print rd, imm. */
+ if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "li")) {
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ strbuf_put_i64(sb, imm);
+ return;
+ }
+ /* Alias: `mv rd, rs1` — print rd, rs1. */
+ if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "mv")) {
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_xreg(sb, f.rs1);
+ return;
+ }
+ /* Alias: `sext.w rd, rs1` — print rd, rs1. */
+ if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "sext.w")) {
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_xreg(sb, f.rs1);
+ return;
+ }
+ /* Alias: `seqz rd, rs` / `not rd, rs` — print rd, rs (drop imm). */
+ if ((d->flags & RV64_ASMFL_ALIAS) && (slice_eq_cstr(d->mnemonic, "seqz") ||
+ slice_eq_cstr(d->mnemonic, "not"))) {
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_xreg(sb, f.rs1);
+ return;
+ }
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_xreg(sb, f.rs1);
+ p_sep(sb);
+ strbuf_put_i64(sb, imm);
+}
+
+static void print_i_shift(StrBuf* sb, u32 w) {
+ /* shamt is 6 bits for RV64 shift-imm. */
+ u32 rd = (w >> 7) & 0x1fu;
+ u32 rs1 = (w >> 15) & 0x1fu;
+ u32 shamt = (w >> 20) & 0x3fu;
+ p_xreg(sb, rd);
+ p_sep(sb);
+ p_xreg(sb, rs1);
+ p_sep(sb);
+ strbuf_put_u64(sb, (u64)shamt);
+}
+
+static void print_i_shiftw(StrBuf* sb, u32 w) {
+ u32 rd = (w >> 7) & 0x1fu;
+ u32 rs1 = (w >> 15) & 0x1fu;
+ u32 shamt = (w >> 20) & 0x1fu;
+ p_xreg(sb, rd);
+ p_sep(sb);
+ p_xreg(sb, rs1);
+ p_sep(sb);
+ strbuf_put_u64(sb, (u64)shamt);
+}
+
+static void print_u(StrBuf* sb, u32 w) {
+ Rv64U f = rv64_u_unpack(w);
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ /* The immediate is the upper-20 already shifted into bits 31:12; print
+ * the raw 20-bit value the assembler expects. */
+ strbuf_put_hex_u64(sb, (u64)(f.imm32_hi20 >> 12));
+}
+
+static void print_load(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ Rv64I f = rv64_i_unpack(w);
+ i64 imm = rv64_sext((u64)f.imm12, 12);
+ if (d->flags & RV64_ASMFL_FP)
+ p_freg(sb, f.rd);
+ else
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_mem(sb, imm, f.rs1);
+}
+
+static void print_store(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ Rv64S f = rv64_s_unpack(w);
+ i64 imm = rv64_sext((u64)f.imm12, 12);
+ if (d->flags & RV64_ASMFL_FP)
+ p_freg(sb, f.rs2);
+ else
+ p_xreg(sb, f.rs2);
+ p_sep(sb);
+ p_mem(sb, imm, f.rs1);
+}
+
+static void print_b(StrBuf* sb, u32 w, u64 vaddr, const Rv64InsnDesc* d) {
+ Rv64B f = rv64_b_unpack(w);
+ i64 off = rv64_sext((u64)f.imm13, 13);
+ if ((d->flags & RV64_ASMFL_ALIAS) && (slice_eq_cstr(d->mnemonic, "beqz") ||
+ slice_eq_cstr(d->mnemonic, "bnez"))) {
+ p_xreg(sb, f.rs1);
+ p_sep(sb);
+ p_rel(sb, vaddr, off);
+ return;
+ }
+ p_xreg(sb, f.rs1);
+ p_sep(sb);
+ p_xreg(sb, f.rs2);
+ p_sep(sb);
+ p_rel(sb, vaddr, off);
+}
+
+static void print_j(StrBuf* sb, u32 w, u64 vaddr, const Rv64InsnDesc* d) {
+ Rv64J f = rv64_j_unpack(w);
+ i64 off = rv64_sext((u64)f.imm21, 21);
+ if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "j")) {
+ p_rel(sb, vaddr, off);
+ return;
+ }
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_rel(sb, vaddr, off);
+}
+
+static void print_jalr(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ Rv64I f = rv64_i_unpack(w);
+ i64 imm = rv64_sext((u64)f.imm12, 12);
+ if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "jr")) {
+ p_xreg(sb, f.rs1);
+ return;
+ }
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_mem(sb, imm, f.rs1);
+}
+
+static void print_fence(StrBuf* sb, u32 w) {
+ u32 pred = (w >> 24) & 0xfu;
+ u32 succ = (w >> 20) & 0xfu;
+ static const char order_chars[5] = {'w', 'r', 'o', 'i', '\0'};
+ /* pred/succ: bit3=i, bit2=o, bit1=r, bit0=w; print iorw left-to-right. */
+ char buf[8];
+ u32 k = 0;
+ if (pred & 8u) buf[k++] = 'i';
+ if (pred & 4u) buf[k++] = 'o';
+ if (pred & 2u) buf[k++] = 'r';
+ if (pred & 1u) buf[k++] = 'w';
+ if (!k) buf[k++] = '0';
+ buf[k] = '\0';
+ strbuf_puts(sb, buf);
+ p_sep(sb);
+ k = 0;
+ if (succ & 8u) buf[k++] = 'i';
+ if (succ & 4u) buf[k++] = 'o';
+ if (succ & 2u) buf[k++] = 'r';
+ if (succ & 1u) buf[k++] = 'w';
+ if (!k) buf[k++] = '0';
+ buf[k] = '\0';
+ strbuf_puts(sb, buf);
+ (void)order_chars;
+}
+
+static void print_csr(StrBuf* sb, u32 w) {
+ Rv64I f = rv64_i_unpack(w);
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ strbuf_put_hex_u64(sb, (u64)f.imm12);
+ p_sep(sb);
+ p_xreg(sb, f.rs1);
+}
+
+static void print_csri(StrBuf* sb, u32 w) {
+ Rv64I f = rv64_i_unpack(w);
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ strbuf_put_hex_u64(sb, (u64)f.imm12);
+ p_sep(sb);
+ strbuf_put_u64(sb, (u64)f.rs1);
+}
+
+static void print_fp_rm(StrBuf* sb, u32 w) {
+ Rv64R f = rv64_r_unpack(w);
+ p_freg(sb, f.rd);
+ p_sep(sb);
+ p_freg(sb, f.rs1);
+ p_sep(sb);
+ p_freg(sb, f.rs2);
+}
+
+static void print_fp_r(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ Rv64R f = rv64_r_unpack(w);
+ if (d->flags & RV64_ASMFL_FP) {
+ p_freg(sb, f.rd);
+ p_sep(sb);
+ p_freg(sb, f.rs1);
+ p_sep(sb);
+ p_freg(sb, f.rs2);
+ } else {
+ /* FP compare: rd is GPR. */
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_freg(sb, f.rs1);
+ p_sep(sb);
+ p_freg(sb, f.rs2);
+ }
+}
+
+static void print_fp_cvt(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ Rv64R f = rv64_r_unpack(w);
+ /* rd is FP for: fcvt.s.*, fcvt.d.*, fmv.w.x, fmv.d.x, fsqrt.{s,d}.
+ * GPR for: fcvt.w.*, fcvt.l.*, fmv.x.w, fmv.x.d. */
+ if (d->flags & RV64_ASMFL_FP)
+ p_freg(sb, f.rd);
+ else
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ /* rs1: FP if mnemonic is fcvt.X.{S,D} or fsqrt or fmv.x.{w,d};
+ * GPR if mnemonic is fcvt.{S,D}.{w,wu,l,lu} or fmv.{w,d}.x. */
+ int rs1_is_fp = 1;
+ if (slice_eq_cstr(d->mnemonic, "fmv.w.x") ||
+ slice_eq_cstr(d->mnemonic, "fmv.d.x") ||
+ slice_has_prefix_cstr(d->mnemonic, "fcvt.s.", 7) ||
+ slice_has_prefix_cstr(d->mnemonic, "fcvt.d.", 7)) {
+ /* These have rs1 as integer GPR (source is integer). Exception:
+ * fcvt.s.d / fcvt.d.s have rs1 as FP. */
+ if (slice_eq_cstr(d->mnemonic, "fcvt.s.d") ||
+ slice_eq_cstr(d->mnemonic, "fcvt.d.s"))
+ rs1_is_fp = 1;
+ else
+ rs1_is_fp = 0;
+ }
+ if (rs1_is_fp)
+ p_freg(sb, f.rs1);
+ else
+ p_xreg(sb, f.rs1);
+ /* Explicit rounding mode for the rounding conversions (fcvt / fsqrt) when it
+ * isn't the default `dyn` — fmv and fclass carry no rounding mode. Matches
+ * the objdump/clang convention (an omitted suffix means dyn), so a third-
+ * party assembler re-encodes our fp->int truncation (rtz) exactly rather
+ * than substituting its own default. */
+ if (slice_has_prefix_cstr(d->mnemonic, "fcvt.", 5) ||
+ slice_has_prefix_cstr(d->mnemonic, "fsqrt.", 6)) {
+ u32 rm = (w >> 12) & 7u;
+ static const char* const RMN[8] = {"rne", "rtz", "rdn", "rup",
+ "rmm", 0, 0, "dyn"};
+ if (rm != 7u && RMN[rm]) {
+ p_sep(sb);
+ strbuf_puts(sb, RMN[rm]);
+ }
+ }
+}
+
+static void print_amo(StrBuf* sb, u32 w) {
+ Rv64R f = rv64_r_unpack(w);
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ p_xreg(sb, f.rs2);
+ p_sep(sb);
+ strbuf_putc(sb, '(');
+ p_xreg(sb, f.rs1);
+ strbuf_putc(sb, ')');
+}
+
+static void print_lr(StrBuf* sb, u32 w) {
+ Rv64R f = rv64_r_unpack(w);
+ p_xreg(sb, f.rd);
+ p_sep(sb);
+ strbuf_putc(sb, '(');
+ p_xreg(sb, f.rs1);
+ strbuf_putc(sb, ')');
+}
+
+/* ---- compressed printers ---- */
+
+static void print_cr(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ u32 hw = w & 0xffffu;
+ u32 rd_rs1 = (hw >> 7) & 0x1fu;
+ u32 rs2 = (hw >> 2) & 0x1fu;
+ if (slice_eq_cstr(d->mnemonic, "c.jr") ||
+ slice_eq_cstr(d->mnemonic, "c.jalr")) {
+ p_xreg(sb, rd_rs1);
+ } else {
+ /* c.mv / c.add */
+ p_xreg(sb, rd_rs1);
+ p_sep(sb);
+ p_xreg(sb, rs2);
+ }
+}
+
+static void print_ci(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ u32 hw = w & 0xffffu;
+ u32 rd_rs1 = (hw >> 7) & 0x1fu;
+ /* immediate is split across bits 12 and 6:2 (signed 6-bit for most). */
+ u32 imm5 = (hw >> 12) & 1u;
+ u32 imm4_0 = (hw >> 2) & 0x1fu;
+ i64 imm;
+ if (slice_eq_cstr(d->mnemonic, "c.lui")) {
+ /* nzimm[17:12] = bits 12, 6:2 — signed extended to 18 bits. */
+ u64 raw = (u64)((imm5 << 5) | imm4_0);
+ imm = (i64)((u64)rv64_sext(raw, 6) << 12);
+ p_xreg(sb, rd_rs1);
+ p_sep(sb);
+ strbuf_put_hex_u64(sb, (u64)imm);
+ return;
+ }
+ if (slice_eq_cstr(d->mnemonic, "c.addi16sp")) {
+ /* nzimm[9|4|6|8:7|5] (scrambled). Just decode for print. */
+ u32 b9 = (hw >> 12) & 1u;
+ u32 b4 = (hw >> 6) & 1u;
+ u32 b6 = (hw >> 5) & 1u;
+ u32 b87 = (hw >> 3) & 3u;
+ u32 b5 = (hw >> 2) & 1u;
+ u64 raw = ((u64)b9 << 9) | ((u64)b87 << 7) | ((u64)b6 << 6) |
+ ((u64)b5 << 5) | ((u64)b4 << 4);
+ imm = rv64_sext(raw, 10);
+ p_xreg(sb, rd_rs1);
+ p_sep(sb);
+ strbuf_put_i64(sb, imm);
+ return;
+ }
+ if (slice_eq_cstr(d->mnemonic, "c.lwsp")) {
+ /* offset[5|4:2|7:6] scaled by 4. */
+ u32 b5 = imm5;
+ u32 b4_2 = (imm4_0 >> 2) & 7u;
+ u32 b7_6 = imm4_0 & 3u;
+ u32 off = (b7_6 << 6) | (b5 << 5) | (b4_2 << 2);
+ p_xreg(sb, rd_rs1);
+ p_sep(sb);
+ p_mem(sb, (i64)off, 2u);
+ return;
+ }
+ if (slice_eq_cstr(d->mnemonic, "c.ldsp") ||
+ slice_eq_cstr(d->mnemonic, "c.fldsp")) {
+ /* offset[5|4:3|8:6] scaled by 8. */
+ u32 b5 = imm5;
+ u32 b4_3 = (imm4_0 >> 3) & 3u;
+ u32 b8_6 = imm4_0 & 7u;
+ u32 off = (b8_6 << 6) | (b5 << 5) | (b4_3 << 3);
+ if (d->flags & RV64_ASMFL_FP)
+ p_freg(sb, rd_rs1);
+ else
+ p_xreg(sb, rd_rs1);
+ p_sep(sb);
+ p_mem(sb, (i64)off, 2u);
+ return;
+ }
+ if (slice_eq_cstr(d->mnemonic, "c.slli")) {
+ u32 shamt = (imm5 << 5) | imm4_0;
+ p_xreg(sb, rd_rs1);
+ p_sep(sb);
+ strbuf_put_u64(sb, (u64)shamt);
+ return;
+ }
+ /* c.li / c.addi — signed 6-bit immediate. */
+ imm = rv64_sext((u64)((imm5 << 5) | imm4_0), 6);
+ p_xreg(sb, rd_rs1);
+ p_sep(sb);
+ strbuf_put_i64(sb, imm);
+}
+
+static void print_css(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ u32 hw = w & 0xffffu;
+ u32 rs2 = (hw >> 2) & 0x1fu;
+ u32 imm6 = (hw >> 7) & 0x3fu;
+ u32 off;
+ if (slice_eq_cstr(d->mnemonic, "c.swsp")) {
+ /* offset[5:2|7:6] scaled by 4. */
+ u32 b5_2 = (imm6 >> 2) & 0xfu;
+ u32 b7_6 = imm6 & 3u;
+ off = (b7_6 << 6) | (b5_2 << 2);
+ p_xreg(sb, rs2);
+ p_sep(sb);
+ p_mem(sb, (i64)off, 2u);
+ return;
+ }
+ /* c.sdsp / c.fsdsp — offset[5:3|8:6] scaled by 8. */
+ {
+ u32 b5_3 = (imm6 >> 3) & 7u;
+ u32 b8_6 = imm6 & 7u;
+ off = (b8_6 << 6) | (b5_3 << 3);
+ if (d->flags & RV64_ASMFL_FP)
+ p_freg(sb, rs2);
+ else
+ p_xreg(sb, rs2);
+ p_sep(sb);
+ p_mem(sb, (i64)off, 2u);
+ }
+}
+
+static void print_ciw(StrBuf* sb, u32 w) {
+ u32 hw = w & 0xffffu;
+ u32 rd3 = (hw >> 2) & 7u;
+ /* nzuimm[5:4|9:6|2|3] scaled by 4 — encoded into bits 12:5. */
+ u32 imm = (hw >> 5) & 0xffu;
+ u32 b5_4 = (imm >> 6) & 3u;
+ u32 b9_6 = (imm >> 2) & 0xfu;
+ u32 b2 = (imm >> 1) & 1u;
+ u32 b3 = imm & 1u;
+ u32 off = (b9_6 << 6) | (b5_4 << 4) | (b3 << 3) | (b2 << 2);
+ p_xreg(sb, RVC_REG3(rd3));
+ p_sep(sb);
+ strbuf_puts(sb, "sp");
+ p_sep(sb);
+ strbuf_put_u64(sb, (u64)off);
+}
+
+static void print_cl(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ u32 hw = w & 0xffffu;
+ u32 rd3 = (hw >> 2) & 7u;
+ u32 rs1_3 = (hw >> 7) & 7u;
+ u32 b5_3 = (hw >> 10) & 7u;
+ u32 lo = (hw >> 5) & 3u;
+ u32 off;
+ if (slice_eq_cstr(d->mnemonic, "c.lw")) {
+ /* offset[5:3|2|6] scaled by 4. */
+ u32 b2 = (lo >> 1) & 1u;
+ u32 b6 = lo & 1u;
+ off = (b6 << 6) | (b5_3 << 3) | (b2 << 2);
+ } else {
+ /* c.ld: offset[5:3|7:6] scaled by 8. */
+ off = (lo << 6) | (b5_3 << 3);
+ }
+ if (d->flags & RV64_ASMFL_FP)
+ p_freg(sb, RVC_REG3(rd3));
+ else
+ p_xreg(sb, RVC_REG3(rd3));
+ p_sep(sb);
+ p_mem(sb, (i64)off, RVC_REG3(rs1_3));
+}
+
+static void print_cs(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
+ u32 hw = w & 0xffffu;
+ u32 rs2_3 = (hw >> 2) & 7u;
+ u32 rs1_3 = (hw >> 7) & 7u;
+ u32 b5_3 = (hw >> 10) & 7u;
+ u32 lo = (hw >> 5) & 3u;
+ u32 off;
+ if (slice_eq_cstr(d->mnemonic, "c.sw")) {
+ u32 b2 = (lo >> 1) & 1u;
+ u32 b6 = lo & 1u;
+ off = (b6 << 6) | (b5_3 << 3) | (b2 << 2);
+ } else {
+ off = (lo << 6) | (b5_3 << 3);
+ }
+ if (d->flags & RV64_ASMFL_FP)
+ p_freg(sb, RVC_REG3(rs2_3));
+ else
+ p_xreg(sb, RVC_REG3(rs2_3));
+ p_sep(sb);
+ p_mem(sb, (i64)off, RVC_REG3(rs1_3));
+}
+
+static void print_ca(StrBuf* sb, u32 w) {
+ u32 hw = w & 0xffffu;
+ u32 rd3 = (hw >> 7) & 7u;
+ u32 rs2_3 = (hw >> 2) & 7u;
+ p_xreg(sb, RVC_REG3(rd3));
+ p_sep(sb);
+ p_xreg(sb, RVC_REG3(rs2_3));
+}
+
+static void print_cb(StrBuf* sb, u32 w, u64 vaddr, const Rv64InsnDesc* d) {
+ u32 hw = w & 0xffffu;
+ u32 rs1_3 = (hw >> 7) & 7u;
+ if (slice_eq_cstr(d->mnemonic, "c.srli") ||
+ slice_eq_cstr(d->mnemonic, "c.srai") ||
+ slice_eq_cstr(d->mnemonic, "c.andi")) {
+ u32 imm = (((hw >> 12) & 1u) << 5) | ((hw >> 2) & 0x1fu);
+ p_xreg(sb, RVC_REG3(rs1_3));
+ p_sep(sb);
+ if (slice_eq_cstr(d->mnemonic, "c.andi"))
+ strbuf_put_i64(sb, rv64_sext((u64)imm, 6));
+ else
+ strbuf_put_u64(sb, (u64)imm);
+ return;
+ }
+ /* offset[8|4:3|7:6|2:1|5] scaled by 2. */
+ u32 b8 = (hw >> 12) & 1u;
+ u32 b4_3 = (hw >> 10) & 3u;
+ u32 b7_6 = (hw >> 5) & 3u;
+ u32 b2_1 = (hw >> 3) & 3u;
+ u32 b5 = (hw >> 2) & 1u;
+ u64 raw = ((u64)b8 << 8) | ((u64)b7_6 << 6) | ((u64)b5 << 5) |
+ ((u64)b4_3 << 3) | ((u64)b2_1 << 1);
+ i64 off = rv64_sext(raw, 9);
+ p_xreg(sb, RVC_REG3(rs1_3));
+ p_sep(sb);
+ p_rel(sb, vaddr, off);
+}
+
+static void print_cj(StrBuf* sb, u32 w, u64 vaddr) {
+ u32 hw = w & 0xffffu;
+ /* offset[11|4|9:8|10|6|7|3:1|5] scaled by 2. */
+ u32 b11 = (hw >> 12) & 1u;
+ u32 b4 = (hw >> 11) & 1u;
+ u32 b9_8 = (hw >> 9) & 3u;
+ u32 b10 = (hw >> 8) & 1u;
+ u32 b6 = (hw >> 7) & 1u;
+ u32 b7 = (hw >> 6) & 1u;
+ u32 b3_1 = (hw >> 3) & 7u;
+ u32 b5 = (hw >> 2) & 1u;
+ u64 raw = ((u64)b11 << 11) | ((u64)b10 << 10) | ((u64)b9_8 << 8) |
+ ((u64)b7 << 7) | ((u64)b6 << 6) | ((u64)b5 << 5) | ((u64)b4 << 4) |
+ ((u64)b3_1 << 1);
+ i64 off = rv64_sext(raw, 12);
+ p_rel(sb, vaddr, off);
+}
+
+void rv64_print_operands(StrBuf* sb, const Rv64InsnDesc* desc, u32 word,
+ u64 vaddr) {
+ switch ((Rv64Format)desc->fmt) {
+ case RV64_FMT_R:
+ print_r(sb, word, desc);
+ break;
+ case RV64_FMT_R4:
+ print_r4(sb, word);
+ break;
+ case RV64_FMT_I:
+ print_i(sb, word, desc);
+ break;
+ case RV64_FMT_I_SHIFT:
+ print_i_shift(sb, word);
+ break;
+ case RV64_FMT_I_SHIFTW:
+ print_i_shiftw(sb, word);
+ break;
+ case RV64_FMT_S:
+ print_store(sb, word, desc);
+ break;
+ case RV64_FMT_B:
+ print_b(sb, word, vaddr, desc);
+ break;
+ case RV64_FMT_U:
+ print_u(sb, word);
+ break;
+ case RV64_FMT_J:
+ print_j(sb, word, vaddr, desc);
+ break;
+ case RV64_FMT_LOAD:
+ print_load(sb, word, desc);
+ break;
+ case RV64_FMT_STORE:
+ print_store(sb, word, desc);
+ break;
+ case RV64_FMT_JALR:
+ print_jalr(sb, word, desc);
+ break;
+ case RV64_FMT_FENCE:
+ print_fence(sb, word);
+ break;
+ case RV64_FMT_SYSTEM:
+ break; /* no operands */
+ case RV64_FMT_FP_RM:
+ print_fp_rm(sb, word);
+ break;
+ case RV64_FMT_FP_R:
+ print_fp_r(sb, word, desc);
+ break;
+ case RV64_FMT_FP_CVT:
+ print_fp_cvt(sb, word, desc);
+ break;
+ case RV64_FMT_FP_LOAD:
+ print_load(sb, word, desc);
+ break;
+ case RV64_FMT_FP_STORE:
+ print_store(sb, word, desc);
+ break;
+ case RV64_FMT_AMO:
+ print_amo(sb, word);
+ break;
+ case RV64_FMT_LR:
+ print_lr(sb, word);
+ break;
+ case RV64_FMT_CSR:
+ print_csr(sb, word);
+ break;
+ case RV64_FMT_CSRI:
+ print_csri(sb, word);
+ break;
+ case RV64_FMT_CR:
+ print_cr(sb, word, desc);
+ break;
+ case RV64_FMT_CI:
+ print_ci(sb, word, desc);
+ break;
+ case RV64_FMT_CSS:
+ print_css(sb, word, desc);
+ break;
+ case RV64_FMT_CIW:
+ print_ciw(sb, word);
+ break;
+ case RV64_FMT_CL:
+ print_cl(sb, word, desc);
+ break;
+ case RV64_FMT_CS:
+ print_cs(sb, word, desc);
+ break;
+ case RV64_FMT_CA:
+ print_ca(sb, word);
+ break;
+ case RV64_FMT_CB:
+ print_cb(sb, word, vaddr, desc);
+ break;
+ case RV64_FMT_CJ:
+ print_cj(sb, word, vaddr);
+ break;
+ case RV64_FMT_C_NONE:
+ break;
+ case RV64_FMT_PSEUDO:
+ /* Assembler-only multi-word pseudo; rv64_disasm_find never returns
+ * these rows, so the printer is never reached for this format. */
+ break;
+ }
+}
diff --git a/src/arch/riscv/isa.h b/src/arch/riscv/isa.h
@@ -0,0 +1,808 @@
+/* RV64 instruction encoders + descriptor table — single source of truth
+ * for every instruction the encoder, decoder, and disassembler need to
+ * agree on. Mirrors the aa64_isa.[ch] pattern.
+ *
+ * The bottom of this header (after the `rv_*` inline encoders) declares
+ * the format-kind enum and per-format pack/unpack helpers. The
+ * descriptor table itself lives in isa.c. */
+
+#ifndef KIT_RV64_ISA_H
+#define KIT_RV64_ISA_H
+
+#include "core/core.h"
+#include "core/slice.h"
+#include "core/strbuf.h"
+
+/* ---- Named registers (DWARF / psABI numbering matches HW) ---- */
+enum {
+ RV_X0 = 0,
+ RV_ZERO = 0,
+ RV_X1 = 1,
+ RV_RA = 1,
+ RV_X2 = 2,
+ RV_SP = 2,
+ RV_X3 = 3,
+ RV_GP = 3,
+ RV_X4 = 4,
+ RV_TP = 4,
+ RV_X5 = 5,
+ RV_T0 = 5,
+ RV_X6 = 6,
+ RV_T1 = 6,
+ RV_X7 = 7,
+ RV_T2 = 7,
+ RV_X8 = 8,
+ RV_S0 = 8,
+ RV_FP = 8,
+ RV_X9 = 9,
+ RV_S1 = 9,
+ RV_X10 = 10,
+ RV_A0 = 10,
+ RV_X11 = 11,
+ RV_A1 = 11,
+ RV_X12 = 12,
+ RV_A2 = 12,
+ RV_X13 = 13,
+ RV_A3 = 13,
+ RV_X14 = 14,
+ RV_A4 = 14,
+ RV_X15 = 15,
+ RV_A5 = 15,
+ RV_X16 = 16,
+ RV_A6 = 16,
+ RV_X17 = 17,
+ RV_A7 = 17,
+ RV_X18 = 18,
+ RV_S2 = 18,
+ RV_X27 = 27,
+ RV_S11 = 27,
+ RV_X28 = 28,
+ RV_T3 = 28,
+ RV_X29 = 29,
+ RV_T4 = 29,
+ RV_X30 = 30,
+ RV_T5 = 30,
+ RV_X31 = 31,
+ RV_T6 = 31,
+};
+
+#define RV_NOP 0x00000013u /* ADDI x0, x0, 0 */
+
+/* ---- Format helpers ----
+ *
+ * R-type: funct7(31:25) rs2(24:20) rs1(19:15) funct3(14:12) rd(11:7) op(6:0)
+ * I-type: imm(31:20) rs1(19:15) funct3(14:12) rd(11:7) op(6:0)
+ * S-type: imm[11:5](31:25) rs2(24:20) rs1(19:15) funct3(14:12) imm[4:0](11:7)
+ * op(6:0) B-type: imm[12](31) imm[10:5](30:25) rs2(24:20) rs1(19:15)
+ * funct3(14:12) imm[4:1](11:8) imm[11](7) op(6:0) U-type: imm[31:12](31:12)
+ * rd(11:7) op(6:0) J-type: imm[20](31) imm[10:1](30:21) imm[11](20)
+ * imm[19:12](19:12) rd(11:7) op(6:0)
+ */
+
+static inline u32 rv_r(u32 funct7, u32 rs2, u32 rs1, u32 funct3, u32 rd,
+ u32 op) {
+ return ((funct7 & 0x7fu) << 25) | ((rs2 & 0x1fu) << 20) |
+ ((rs1 & 0x1fu) << 15) | ((funct3 & 0x7u) << 12) | ((rd & 0x1fu) << 7) |
+ (op & 0x7fu);
+}
+static inline u32 rv_i(i32 imm12, u32 rs1, u32 funct3, u32 rd, u32 op) {
+ return (((u32)imm12 & 0xfffu) << 20) | ((rs1 & 0x1fu) << 15) |
+ ((funct3 & 0x7u) << 12) | ((rd & 0x1fu) << 7) | (op & 0x7fu);
+}
+static inline u32 rv_s(i32 imm12, u32 rs2, u32 rs1, u32 funct3, u32 op) {
+ u32 ui = (u32)imm12 & 0xfffu;
+ return ((ui >> 5) << 25) | ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
+ ((funct3 & 0x7u) << 12) | ((ui & 0x1fu) << 7) | (op & 0x7fu);
+}
+static inline u32 rv_b(i32 imm13, u32 rs2, u32 rs1, u32 funct3, u32 op) {
+ u32 ui = (u32)imm13;
+ return (((ui >> 12) & 1u) << 31) | (((ui >> 5) & 0x3fu) << 25) |
+ ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
+ ((funct3 & 0x7u) << 12) | (((ui >> 1) & 0xfu) << 8) |
+ (((ui >> 11) & 1u) << 7) | (op & 0x7fu);
+}
+static inline u32 rv_u(u32 imm32_hi20, u32 rd, u32 op) {
+ return (imm32_hi20 & 0xfffff000u) | ((rd & 0x1fu) << 7) | (op & 0x7fu);
+}
+static inline u32 rv_j(i32 imm21, u32 rd, u32 op) {
+ u32 ui = (u32)imm21;
+ return (((ui >> 20) & 1u) << 31) | (((ui >> 1) & 0x3ffu) << 21) |
+ (((ui >> 11) & 1u) << 20) | (((ui >> 12) & 0xffu) << 12) |
+ ((rd & 0x1fu) << 7) | (op & 0x7fu);
+}
+
+/* ---- Integer ops (RV32I/RV64I) ---- */
+
+#define RV_OP 0x33u
+#define RV_OP_IMM 0x13u
+#define RV_OP_32 0x3bu
+#define RV_OP_IMM_32 0x1bu
+#define RV_LUI 0x37u
+#define RV_AUIPC 0x17u
+#define RV_LOAD 0x03u
+#define RV_STORE 0x23u
+#define RV_BRANCH 0x63u
+#define RV_JAL 0x6fu
+#define RV_JALR 0x67u
+#define RV_LOAD_FP 0x07u
+#define RV_STORE_FP 0x27u
+#define RV_OP_FP 0x53u
+#define RV_MADD 0x43u
+#define RV_MSUB 0x47u
+#define RV_NMSUB 0x4bu
+#define RV_NMADD 0x4fu
+#define RV_AMO 0x2fu
+#define RV_FENCE 0x0fu
+#define RV_SYSTEM 0x73u
+
+static inline u32 rv_add(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x0, rd, RV_OP);
+}
+static inline u32 rv_sub(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x20, rs2, rs1, 0x0, rd, RV_OP);
+}
+static inline u32 rv_sll(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x1, rd, RV_OP);
+}
+static inline u32 rv_slt(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x2, rd, RV_OP);
+}
+static inline u32 rv_sltu(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x3, rd, RV_OP);
+}
+static inline u32 rv_xor(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x4, rd, RV_OP);
+}
+static inline u32 rv_srl(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x5, rd, RV_OP);
+}
+static inline u32 rv_sra(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x20, rs2, rs1, 0x5, rd, RV_OP);
+}
+static inline u32 rv_or(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x6, rd, RV_OP);
+}
+static inline u32 rv_and(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x7, rd, RV_OP);
+}
+
+static inline u32 rv_addw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x0, rd, RV_OP_32);
+}
+static inline u32 rv_subw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x20, rs2, rs1, 0x0, rd, RV_OP_32);
+}
+static inline u32 rv_sllw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x1, rd, RV_OP_32);
+}
+static inline u32 rv_srlw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x00, rs2, rs1, 0x5, rd, RV_OP_32);
+}
+static inline u32 rv_sraw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x20, rs2, rs1, 0x5, rd, RV_OP_32);
+}
+
+static inline u32 rv_addi(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x0, rd, RV_OP_IMM);
+}
+static inline u32 rv_slti(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x2, rd, RV_OP_IMM);
+}
+static inline u32 rv_sltiu(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x3, rd, RV_OP_IMM);
+}
+static inline u32 rv_xori(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x4, rd, RV_OP_IMM);
+}
+static inline u32 rv_ori(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x6, rd, RV_OP_IMM);
+}
+static inline u32 rv_andi(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x7, rd, RV_OP_IMM);
+}
+
+/* Shift-immediate forms. RV64I uses a 6-bit shamt in bits 25:20 and a
+ * 6-bit funct6 in bits 31:26 (so the funct7-vs-shamt[5] split that
+ * rv_r() does is wrong here — we hand-assemble these). */
+static inline u32 rv_slli(u32 rd, u32 rs1, u32 sh) {
+ return (0x00u << 26) | ((sh & 0x3fu) << 20) | ((rs1 & 0x1fu) << 15) |
+ (0x1u << 12) | ((rd & 0x1fu) << 7) | RV_OP_IMM;
+}
+static inline u32 rv_srli(u32 rd, u32 rs1, u32 sh) {
+ return (0x00u << 26) | ((sh & 0x3fu) << 20) | ((rs1 & 0x1fu) << 15) |
+ (0x5u << 12) | ((rd & 0x1fu) << 7) | RV_OP_IMM;
+}
+static inline u32 rv_srai(u32 rd, u32 rs1, u32 sh) {
+ return (0x10u << 26) | ((sh & 0x3fu) << 20) | ((rs1 & 0x1fu) << 15) |
+ (0x5u << 12) | ((rd & 0x1fu) << 7) | RV_OP_IMM;
+}
+
+static inline u32 rv_addiw(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x0, rd, RV_OP_IMM_32);
+}
+static inline u32 rv_slliw(u32 rd, u32 rs1, u32 sh) {
+ return rv_r(0x00, sh & 0x1fu, rs1, 0x1, rd, RV_OP_IMM_32);
+}
+static inline u32 rv_srliw(u32 rd, u32 rs1, u32 sh) {
+ return rv_r(0x00, sh & 0x1fu, rs1, 0x5, rd, RV_OP_IMM_32);
+}
+static inline u32 rv_sraiw(u32 rd, u32 rs1, u32 sh) {
+ return rv_r(0x20, sh & 0x1fu, rs1, 0x5, rd, RV_OP_IMM_32);
+}
+
+static inline u32 rv_lui(u32 rd, u32 imm20) {
+ return ((imm20 & 0xfffffu) << 12) | ((rd & 0x1fu) << 7) | RV_LUI;
+}
+static inline u32 rv_auipc(u32 rd, u32 imm20) {
+ return ((imm20 & 0xfffffu) << 12) | ((rd & 0x1fu) << 7) | RV_AUIPC;
+}
+
+/* M extension */
+static inline u32 rv_mul(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x0, rd, RV_OP);
+}
+static inline u32 rv_mulh(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x1, rd, RV_OP);
+}
+static inline u32 rv_mulhsu(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x2, rd, RV_OP);
+}
+static inline u32 rv_mulhu(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x3, rd, RV_OP);
+}
+static inline u32 rv_div(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x4, rd, RV_OP);
+}
+static inline u32 rv_divu(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x5, rd, RV_OP);
+}
+static inline u32 rv_rem(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x6, rd, RV_OP);
+}
+static inline u32 rv_remu(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x7, rd, RV_OP);
+}
+static inline u32 rv_mulw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x0, rd, RV_OP_32);
+}
+static inline u32 rv_divw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x4, rd, RV_OP_32);
+}
+static inline u32 rv_divuw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x5, rd, RV_OP_32);
+}
+static inline u32 rv_remw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x6, rd, RV_OP_32);
+}
+static inline u32 rv_remuw(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x01, rs2, rs1, 0x7, rd, RV_OP_32);
+}
+
+/* Zba (address-generation) subset — assumed available on rv64 targets.
+ * SH{1,2,3}ADD rd, rs1, rs2 computes rd = (rs1 << {1,2,3}) + rs2 in one
+ * instruction (funct7=0x10, opcode=OP). Used by load/store to fold an
+ * indexed effective address `base + (index << log2_scale)` into a single
+ * scratch register without an explicit shift+add pair. */
+static inline u32 rv_sh1add(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x10, rs2, rs1, 0x2, rd, RV_OP);
+}
+static inline u32 rv_sh2add(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x10, rs2, rs1, 0x4, rd, RV_OP);
+}
+static inline u32 rv_sh3add(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x10, rs2, rs1, 0x6, rd, RV_OP);
+}
+
+/* Loads (funct3: 0=LB,1=LH,2=LW,3=LD,4=LBU,5=LHU,6=LWU) */
+static inline u32 rv_lb(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x0, rd, RV_LOAD);
+}
+static inline u32 rv_lh(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x1, rd, RV_LOAD);
+}
+static inline u32 rv_lw(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x2, rd, RV_LOAD);
+}
+static inline u32 rv_ld(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x3, rd, RV_LOAD);
+}
+static inline u32 rv_lbu(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x4, rd, RV_LOAD);
+}
+static inline u32 rv_lhu(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x5, rd, RV_LOAD);
+}
+static inline u32 rv_lwu(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x6, rd, RV_LOAD);
+}
+
+/* Stores (funct3: 0=SB,1=SH,2=SW,3=SD) */
+static inline u32 rv_sb(u32 rs2, u32 rs1, i32 imm) {
+ return rv_s(imm, rs2, rs1, 0x0, RV_STORE);
+}
+static inline u32 rv_sh(u32 rs2, u32 rs1, i32 imm) {
+ return rv_s(imm, rs2, rs1, 0x1, RV_STORE);
+}
+static inline u32 rv_sw(u32 rs2, u32 rs1, i32 imm) {
+ return rv_s(imm, rs2, rs1, 0x2, RV_STORE);
+}
+static inline u32 rv_sd(u32 rs2, u32 rs1, i32 imm) {
+ return rv_s(imm, rs2, rs1, 0x3, RV_STORE);
+}
+
+/* Branches */
+static inline u32 rv_beq(u32 rs1, u32 rs2, i32 imm) {
+ return rv_b(imm, rs2, rs1, 0x0, RV_BRANCH);
+}
+static inline u32 rv_bne(u32 rs1, u32 rs2, i32 imm) {
+ return rv_b(imm, rs2, rs1, 0x1, RV_BRANCH);
+}
+static inline u32 rv_blt(u32 rs1, u32 rs2, i32 imm) {
+ return rv_b(imm, rs2, rs1, 0x4, RV_BRANCH);
+}
+static inline u32 rv_bge(u32 rs1, u32 rs2, i32 imm) {
+ return rv_b(imm, rs2, rs1, 0x5, RV_BRANCH);
+}
+static inline u32 rv_bltu(u32 rs1, u32 rs2, i32 imm) {
+ return rv_b(imm, rs2, rs1, 0x6, RV_BRANCH);
+}
+static inline u32 rv_bgeu(u32 rs1, u32 rs2, i32 imm) {
+ return rv_b(imm, rs2, rs1, 0x7, RV_BRANCH);
+}
+
+/* Jumps */
+static inline u32 rv_jal(u32 rd, i32 imm21) { return rv_j(imm21, rd, RV_JAL); }
+static inline u32 rv_jalr(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x0, rd, RV_JALR);
+}
+
+/* Convenience: jr / ret / j / nop */
+static inline u32 rv_jr(u32 rs1) { return rv_jalr(RV_ZERO, rs1, 0); }
+static inline u32 rv_ret_(void) { return rv_jalr(RV_ZERO, RV_RA, 0); }
+static inline u32 rv_nop(void) { return RV_NOP; }
+
+/* System */
+static inline u32 rv_ecall(void) { return rv_i(0, 0, 0, 0, RV_SYSTEM); }
+static inline u32 rv_ebreak(void) { return rv_i(1, 0, 0, 0, RV_SYSTEM); }
+/* WFI: wait-for-interrupt, SYSTEM funct12=0x105 (privileged). */
+static inline u32 rv_wfi(void) { return 0x10500073u; }
+
+/* FENCE: pred/succ each 4 bits in imm[11:8]/imm[7:4]. fm bits 11:8 of imm */
+static inline u32 rv_fence_rw_rw(void) {
+ return rv_i((i32)0x033, 0, 0, 0, RV_FENCE);
+}
+/* FENCE.I: instruction-stream sync (Zifencei). funct3=1 in the MISC-MEM major
+ * opcode (0x0F). Used to lower the ISB intrinsic. */
+static inline u32 rv_fence_i(void) { return 0x0000100Fu; }
+/* PAUSE (Zihintpause): a FENCE with pred=W, succ=none. Used for cpu_yield;
+ * decodes as a plain FENCE on hardware lacking the extension, which is a safe
+ * (stronger) no-op hint. */
+static inline u32 rv_pause(void) { return 0x0100000Fu; }
+
+/* ---- FP (F + D extensions) ----
+ * funct7 layout: bits[6:2] op-major (e.g. 0x00 FADD, 0x01 FSUB, ...);
+ * bits[1:0] = fmt (00=S, 01=D). rm (rounding mode) in funct3; 0x7 = DYN. */
+
+#define RV_FMT_S 0u
+#define RV_FMT_D 1u
+
+static inline u32 rv_fadd(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
+ return rv_r((0x00u << 2) | fmt, rs2, rs1, 0x7, rd, RV_OP_FP);
+}
+static inline u32 rv_fsub(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
+ return rv_r((0x01u << 2) | fmt, rs2, rs1, 0x7, rd, RV_OP_FP);
+}
+static inline u32 rv_fmul(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
+ return rv_r((0x02u << 2) | fmt, rs2, rs1, 0x7, rd, RV_OP_FP);
+}
+static inline u32 rv_fdiv(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
+ return rv_r((0x03u << 2) | fmt, rs2, rs1, 0x7, rd, RV_OP_FP);
+}
+/* FSGNJ.fmt rd, rs1, rs2 — used to implement FMV.fmt rd, rs (sgnj rs, rs). */
+static inline u32 rv_fsgnj(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
+ return rv_r((0x04u << 2) | fmt, rs2, rs1, 0x0, rd, RV_OP_FP);
+}
+static inline u32 rv_fsgnjn(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
+ return rv_r((0x04u << 2) | fmt, rs2, rs1, 0x1, rd, RV_OP_FP);
+}
+/* FCVT — integer/FP conversions. funct7 = 0x18..0x1d depending on direction;
+ * rs2 encodes the partner type:
+ * 0x60(W <- S) 0x61(W <- D)
+ * 0x68(S <- W) 0x69(D <- W) etc
+ * We assemble explicitly via rv_r to be obvious. */
+static inline u32 rv_fcvt(u32 funct7, u32 rs2_sel, u32 rd, u32 rs1, u32 rm) {
+ return rv_r(funct7, rs2_sel, rs1, rm, rd, RV_OP_FP);
+}
+/* FCVT.W.S rd, rs1 (signed i32 from f32, rtz=001) : funct7=0x60 rs2=0 */
+static inline u32 rv_fcvt_w_s(u32 rd, u32 rs1) {
+ return rv_fcvt(0x60, 0x0, rd, rs1, 0x1);
+}
+static inline u32 rv_fcvt_wu_s(u32 rd, u32 rs1) {
+ return rv_fcvt(0x60, 0x1, rd, rs1, 0x1);
+}
+static inline u32 rv_fcvt_l_s(u32 rd, u32 rs1) {
+ return rv_fcvt(0x60, 0x2, rd, rs1, 0x1);
+}
+static inline u32 rv_fcvt_lu_s(u32 rd, u32 rs1) {
+ return rv_fcvt(0x60, 0x3, rd, rs1, 0x1);
+}
+static inline u32 rv_fcvt_w_d(u32 rd, u32 rs1) {
+ return rv_fcvt(0x61, 0x0, rd, rs1, 0x1);
+}
+static inline u32 rv_fcvt_wu_d(u32 rd, u32 rs1) {
+ return rv_fcvt(0x61, 0x1, rd, rs1, 0x1);
+}
+static inline u32 rv_fcvt_l_d(u32 rd, u32 rs1) {
+ return rv_fcvt(0x61, 0x2, rd, rs1, 0x1);
+}
+static inline u32 rv_fcvt_lu_d(u32 rd, u32 rs1) {
+ return rv_fcvt(0x61, 0x3, rd, rs1, 0x1);
+}
+static inline u32 rv_fcvt_s_w(u32 rd, u32 rs1) {
+ return rv_fcvt(0x68, 0x0, rd, rs1, 0x7);
+}
+static inline u32 rv_fcvt_s_wu(u32 rd, u32 rs1) {
+ return rv_fcvt(0x68, 0x1, rd, rs1, 0x7);
+}
+static inline u32 rv_fcvt_s_l(u32 rd, u32 rs1) {
+ return rv_fcvt(0x68, 0x2, rd, rs1, 0x7);
+}
+static inline u32 rv_fcvt_s_lu(u32 rd, u32 rs1) {
+ return rv_fcvt(0x68, 0x3, rd, rs1, 0x7);
+}
+static inline u32 rv_fcvt_d_w(u32 rd, u32 rs1) {
+ return rv_fcvt(0x69, 0x0, rd, rs1, 0x7);
+}
+static inline u32 rv_fcvt_d_wu(u32 rd, u32 rs1) {
+ return rv_fcvt(0x69, 0x1, rd, rs1, 0x7);
+}
+static inline u32 rv_fcvt_d_l(u32 rd, u32 rs1) {
+ return rv_fcvt(0x69, 0x2, rd, rs1, 0x7);
+}
+static inline u32 rv_fcvt_d_lu(u32 rd, u32 rs1) {
+ return rv_fcvt(0x69, 0x3, rd, rs1, 0x7);
+}
+/* FCVT.S.D / FCVT.D.S */
+static inline u32 rv_fcvt_s_d(u32 rd, u32 rs1) {
+ return rv_fcvt(0x20, 0x1, rd, rs1, 0x7);
+}
+static inline u32 rv_fcvt_d_s(u32 rd, u32 rs1) {
+ return rv_fcvt(0x21, 0x0, rd, rs1, 0x7);
+}
+
+/* FMV.X.W / FMV.W.X / FMV.X.D / FMV.D.X — bitcast between GPR and FPR. */
+static inline u32 rv_fmv_x_w(u32 rd, u32 rs1) {
+ return rv_fcvt(0x70, 0x0, rd, rs1, 0x0);
+}
+static inline u32 rv_fmv_w_x(u32 rd, u32 rs1) {
+ return rv_fcvt(0x78, 0x0, rd, rs1, 0x0);
+}
+static inline u32 rv_fmv_x_d(u32 rd, u32 rs1) {
+ return rv_fcvt(0x71, 0x0, rd, rs1, 0x0);
+}
+static inline u32 rv_fmv_d_x(u32 rd, u32 rs1) {
+ return rv_fcvt(0x79, 0x0, rd, rs1, 0x0);
+}
+
+/* FP compares — rd is integer GPR. funct7 = 0x50/0x51 (S/D). rm: 0=LE, 1=LT,
+ * 2=EQ. */
+static inline u32 rv_feq_s(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x50, rs2, rs1, 0x2, rd, RV_OP_FP);
+}
+static inline u32 rv_flt_s(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x50, rs2, rs1, 0x1, rd, RV_OP_FP);
+}
+static inline u32 rv_fle_s(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x50, rs2, rs1, 0x0, rd, RV_OP_FP);
+}
+static inline u32 rv_feq_d(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x51, rs2, rs1, 0x2, rd, RV_OP_FP);
+}
+static inline u32 rv_flt_d(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x51, rs2, rs1, 0x1, rd, RV_OP_FP);
+}
+static inline u32 rv_fle_d(u32 rd, u32 rs1, u32 rs2) {
+ return rv_r(0x51, rs2, rs1, 0x0, rd, RV_OP_FP);
+}
+
+static inline u32 rv_flw(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x2, rd, RV_LOAD_FP);
+}
+static inline u32 rv_fld(u32 rd, u32 rs1, i32 imm) {
+ return rv_i(imm, rs1, 0x3, rd, RV_LOAD_FP);
+}
+static inline u32 rv_fsw(u32 rs2, u32 rs1, i32 imm) {
+ return rv_s(imm, rs2, rs1, 0x2, RV_STORE_FP);
+}
+static inline u32 rv_fsd(u32 rs2, u32 rs1, i32 imm) {
+ return rv_s(imm, rs2, rs1, 0x3, RV_STORE_FP);
+}
+
+/* ---- A extension (LR/SC + AMO) ----
+ * AMO funct7 layout: aq(26) rl(25) funct5(31:27) op-specific.
+ * funct3 selects width: 0x2 = W (32-bit), 0x3 = D (64-bit). */
+static inline u32 rv_amo(u32 funct5, u32 aq, u32 rl, u32 rd, u32 rs1, u32 rs2,
+ u32 funct3) {
+ u32 funct7 = (funct5 << 2) | ((aq & 1u) << 1) | (rl & 1u);
+ return rv_r(funct7, rs2, rs1, funct3, rd, RV_AMO);
+}
+static inline u32 rv_lr_w(u32 rd, u32 rs1, u32 aq, u32 rl) {
+ return rv_amo(0x02, aq, rl, rd, rs1, 0, 0x2);
+}
+static inline u32 rv_lr_d(u32 rd, u32 rs1, u32 aq, u32 rl) {
+ return rv_amo(0x02, aq, rl, rd, rs1, 0, 0x3);
+}
+static inline u32 rv_sc_w(u32 rd, u32 rs1, u32 rs2, u32 aq, u32 rl) {
+ return rv_amo(0x03, aq, rl, rd, rs1, rs2, 0x2);
+}
+static inline u32 rv_sc_d(u32 rd, u32 rs1, u32 rs2, u32 aq, u32 rl) {
+ return rv_amo(0x03, aq, rl, rd, rs1, rs2, 0x3);
+}
+
+/* Other A-extension AMO funct5 codes (W and D widths via funct3). */
+#define RV_AMO_SWAP 0x01u
+#define RV_AMO_ADD 0x00u
+#define RV_AMO_XOR 0x04u
+#define RV_AMO_AND 0x0Cu
+#define RV_AMO_OR 0x08u
+#define RV_AMO_MIN 0x10u
+#define RV_AMO_MAX 0x14u
+#define RV_AMO_MINU 0x18u
+#define RV_AMO_MAXU 0x1Cu
+
+/* Zicsr — CSR instructions. csr in imm[11:0]; funct3 selects op.
+ * csrrw=1, csrrs=2, csrrc=3, csrrwi=5, csrrsi=6, csrrci=7 */
+static inline u32 rv_csrrw(u32 rd, u32 csr, u32 rs1) {
+ return rv_i((i32)(csr & 0xfffu), rs1, 0x1, rd, RV_SYSTEM);
+}
+static inline u32 rv_csrrs(u32 rd, u32 csr, u32 rs1) {
+ return rv_i((i32)(csr & 0xfffu), rs1, 0x2, rd, RV_SYSTEM);
+}
+static inline u32 rv_csrrc(u32 rd, u32 csr, u32 rs1) {
+ return rv_i((i32)(csr & 0xfffu), rs1, 0x3, rd, RV_SYSTEM);
+}
+static inline u32 rv_csrrwi(u32 rd, u32 csr, u32 uimm) {
+ return rv_i((i32)(csr & 0xfffu), uimm & 0x1fu, 0x5, rd, RV_SYSTEM);
+}
+static inline u32 rv_csrrsi(u32 rd, u32 csr, u32 uimm) {
+ return rv_i((i32)(csr & 0xfffu), uimm & 0x1fu, 0x6, rd, RV_SYSTEM);
+}
+static inline u32 rv_csrrci(u32 rd, u32 csr, u32 uimm) {
+ return rv_i((i32)(csr & 0xfffu), uimm & 0x1fu, 0x7, rd, RV_SYSTEM);
+}
+
+/* ===================================================================
+ * Format kinds — one per encoding family the descriptor table dispatches
+ * on. R-type splits by funct3/funct7 selectors; I/S/B/U/J each carry a
+ * distinct immediate layout. The C-extension formats (CR/CI/CSS/CIW/CL/
+ * CS/CB/CJ) are 16-bit; the disassembler picks 16 vs 32 by checking the
+ * bottom two bits of the first halfword (00/01/10 → compressed, 11 → 32).
+ * =================================================================== */
+typedef enum Rv64Format {
+ RV64_FMT_R, /* funct7 rs2 rs1 funct3 rd op — most ALU ops */
+ RV64_FMT_R4, /* fused FMA: rs3 funct2 rs2 rs1 funct3 rd op */
+ RV64_FMT_I, /* imm[11:0] rs1 funct3 rd op — ALU-imm, loads, jalr */
+ RV64_FMT_I_SHIFT, /* shift-imm (shamt6/funct6) — RV64 SLLI/SRLI/SRAI */
+ RV64_FMT_I_SHIFTW, /* RV32 word-shift (shamt5/funct7) — SLLIW/SRLIW/SRAIW */
+ RV64_FMT_S, /* store */
+ RV64_FMT_B, /* branch */
+ RV64_FMT_U, /* LUI/AUIPC */
+ RV64_FMT_J, /* JAL */
+ RV64_FMT_LOAD, /* I-type load: rd, imm(rs1) — printer uses memory syntax */
+ RV64_FMT_STORE, /* S-type store: rs2, imm(rs1) */
+ RV64_FMT_JALR, /* JALR: rd, imm(rs1) — memory-style operand syntax */
+ RV64_FMT_FENCE, /* FENCE pred,succ */
+ RV64_FMT_SYSTEM, /* ECALL/EBREAK — no operands */
+ RV64_FMT_FP_RM, /* FP arithmetic with rm: funct7 rs2 rs1 rm rd op */
+ RV64_FMT_FP_R, /* FP R-type without rm-as-mnemonic-suffix (cmp/sgnj) */
+ RV64_FMT_FP_CVT, /* FP conversion: rs2 is type selector, rs1 is src */
+ RV64_FMT_FP_LOAD, /* fld/flw — rd[FP], imm(rs1) */
+ RV64_FMT_FP_STORE, /* fsd/fsw — rs2[FP], imm(rs1) */
+ RV64_FMT_AMO, /* atomic: rd, rs2, (rs1) */
+ RV64_FMT_LR, /* LR.W/D: rd, (rs1) — no rs2 */
+ RV64_FMT_CSR, /* csrr*: rd, csr, rs1 */
+ RV64_FMT_CSRI, /* csrr*i: rd, csr, uimm5 */
+ /* ---- Compressed (16-bit) formats ---- */
+ RV64_FMT_CR, /* funct4 rd/rs1 rs2 op (e.g. C.MV, C.ADD, C.JR, C.JALR) */
+ RV64_FMT_CI, /* funct3 imm rd/rs1 imm op (e.g. C.ADDI, C.LI, C.LUI) */
+ RV64_FMT_CSS, /* funct3 imm rs2 op (stack store: C.SDSP, C.SWSP) */
+ RV64_FMT_CIW, /* funct3 imm rd' op (C.ADDI4SPN) */
+ RV64_FMT_CL, /* funct3 imm rs1' imm rd' op (C.LD, C.LW) */
+ RV64_FMT_CS, /* funct3 imm rs1' imm rs2' op (C.SD, C.SW) */
+ RV64_FMT_CA, /* funct6 rd'/rs1' funct2 rs2' op (C.AND, C.OR, ...) */
+ RV64_FMT_CB, /* branch: funct3 imm rs1' imm op (C.BEQZ, C.BNEZ) */
+ RV64_FMT_CJ, /* jump: funct3 imm op (C.J, C.JAL_unused on RV64) */
+ RV64_FMT_C_NONE, /* known opcode with no operands (C.NOP, C.EBREAK) */
+ /* Assembler-only multi-word pseudo-instruction (call/tail/la/lla). The
+ * descriptor's `match` is unused; the assembler dispatches on mnemonic
+ * and emits the AUIPC+JALR / AUIPC+ADDI expansion directly. */
+ RV64_FMT_PSEUDO,
+} Rv64Format;
+
+typedef enum Rv64DecodedOpcode {
+ RV64_DEC_UNKNOWN = 0,
+ RV64_DEC_ADDI,
+ RV64_DEC_ADD,
+ RV64_DEC_AUIPC,
+ RV64_DEC_LD,
+ RV64_DEC_SD,
+ RV64_DEC_JALR,
+ RV64_DEC_ECALL,
+ RV64_DEC_EBREAK,
+} Rv64DecodedOpcode;
+
+/* ---- AsmFlags column on Rv64InsnDesc ---- */
+#define RV64_ASMFL_ALIAS 0x01u /* row is an alias (preferred print form) */
+#define RV64_ASMFL_FP 0x02u /* operands take f-register prefix */
+#define RV64_ASMFL_NORM 0x04u /* FP_RM row prints without rm suffix */
+#define RV64_ASMFL_C16 0x08u /* 16-bit compressed instruction */
+/* Assembler-only multi-word pseudo (call/tail/la/lla). These expand to
+ * several 32-bit words and never participate in disassembly — the decoder
+ * sees the individual auipc/jalr/addi words instead. rv64_disasm_find
+ * skips rows carrying this flag. */
+#define RV64_ASMFL_PSEUDO 0x10u
+
+/* ---- Availability column (`av`) on Rv64InsnDesc ----
+ * Which XLEN variant(s) a row is valid on. A row with av==0 is implicitly
+ * available on BOTH (this is how the ~200 untouched `{0, 0}` pad
+ * initializers stay correct — av lands in the first pad byte and reads 0).
+ * Rows whose encoding only exists / changes meaning per XLEN are tagged
+ * explicitly; the find functions skip a row when its av is set and excludes
+ * the wanted arch. */
+#define RV_AV_RV32 0x1u
+#define RV_AV_RV64 0x2u
+#define RV_AV_BOTH (RV_AV_RV32 | RV_AV_RV64)
+
+/* ===================================================================
+ * Per-format field structs + pack/unpack pure functions.
+ * =================================================================== */
+
+typedef struct Rv64R {
+ u32 funct7, rs2, rs1, funct3, rd, op;
+} Rv64R;
+typedef struct Rv64I {
+ u32 imm12, rs1, funct3, rd, op;
+} Rv64I;
+typedef struct Rv64S {
+ u32 imm12, rs2, rs1, funct3, op;
+} Rv64S;
+typedef struct Rv64B {
+ u32 imm13, rs2, rs1, funct3, op;
+} Rv64B;
+typedef struct Rv64U {
+ u32 imm32_hi20, rd, op;
+} Rv64U;
+typedef struct Rv64J {
+ u32 imm21, rd, op;
+} Rv64J;
+
+static inline Rv64R rv64_r_unpack(u32 w) {
+ Rv64R f;
+ f.funct7 = (w >> 25) & 0x7fu;
+ f.rs2 = (w >> 20) & 0x1fu;
+ f.rs1 = (w >> 15) & 0x1fu;
+ f.funct3 = (w >> 12) & 0x7u;
+ f.rd = (w >> 7) & 0x1fu;
+ f.op = w & 0x7fu;
+ return f;
+}
+static inline Rv64I rv64_i_unpack(u32 w) {
+ Rv64I f;
+ f.imm12 = (w >> 20) & 0xfffu;
+ f.rs1 = (w >> 15) & 0x1fu;
+ f.funct3 = (w >> 12) & 0x7u;
+ f.rd = (w >> 7) & 0x1fu;
+ f.op = w & 0x7fu;
+ return f;
+}
+static inline Rv64S rv64_s_unpack(u32 w) {
+ Rv64S f;
+ f.imm12 = (((w >> 25) & 0x7fu) << 5) | ((w >> 7) & 0x1fu);
+ f.rs2 = (w >> 20) & 0x1fu;
+ f.rs1 = (w >> 15) & 0x1fu;
+ f.funct3 = (w >> 12) & 0x7u;
+ f.op = w & 0x7fu;
+ return f;
+}
+static inline Rv64B rv64_b_unpack(u32 w) {
+ Rv64B f;
+ f.imm13 = (((w >> 31) & 1u) << 12) | (((w >> 7) & 1u) << 11) |
+ (((w >> 25) & 0x3fu) << 5) | (((w >> 8) & 0xfu) << 1);
+ f.rs2 = (w >> 20) & 0x1fu;
+ f.rs1 = (w >> 15) & 0x1fu;
+ f.funct3 = (w >> 12) & 0x7u;
+ f.op = w & 0x7fu;
+ return f;
+}
+static inline Rv64U rv64_u_unpack(u32 w) {
+ Rv64U f;
+ f.imm32_hi20 = w & 0xfffff000u;
+ f.rd = (w >> 7) & 0x1fu;
+ f.op = w & 0x7fu;
+ return f;
+}
+static inline Rv64J rv64_j_unpack(u32 w) {
+ Rv64J f;
+ f.imm21 = (((w >> 31) & 1u) << 20) | (((w >> 12) & 0xffu) << 12) |
+ (((w >> 20) & 1u) << 11) | (((w >> 21) & 0x3ffu) << 1);
+ f.rd = (w >> 7) & 0x1fu;
+ f.op = w & 0x7fu;
+ return f;
+}
+
+/* Sign-extend an n-bit value held in the low bits of v to i64. */
+static inline i64 rv64_sext(u64 v, u32 nbits) {
+ u64 mask = (nbits >= 64u) ? ~0ull : ((1ull << nbits) - 1ull);
+ v &= mask;
+ u64 sign = (nbits == 0u) ? 0ull : (1ull << (nbits - 1u));
+ if (v & sign) v |= ~mask;
+ return (i64)v;
+}
+
+/* ===================================================================
+ * Compressed (RV64C) helpers — 16-bit instructions.
+ *
+ * Layout (per RVC quadrant): bits[1:0] (op) select the quadrant:
+ * 00 → Q0 (stack-relative & load/store narrow),
+ * 01 → Q1 (constant/branch),
+ * 10 → Q2 (stack pointer access & jumps & MV/ADD).
+ * 11 is reserved for 32-bit (uncompressed) instructions, so the
+ * disassembler picks 16-bit when (halfword & 3) != 3.
+ *
+ * The "narrow" register fields rs1' / rs2' / rd' are 3-bit and encode
+ * x8..x15; macro RVC_REG3 unfolds: r' → 8 + r'. */
+#define RVC_REG3(r3) ((u32)(8u + ((r3) & 7u)))
+
+typedef struct Rv64C {
+ u32 word;
+} Rv64C; /* 16-bit halfword in low 16 bits */
+
+/* ===================================================================
+ * Descriptor table.
+ * =================================================================== */
+
+typedef struct Rv64InsnDesc {
+ Slice mnemonic;
+ u32 match;
+ u32 mask;
+ u8 fmt; /* Rv64Format */
+ u8 flags; /* RV64_ASMFL_* */
+ u8 av; /* RV_AV_* availability mask; 0 == available on BOTH */
+ u8 pad[1];
+} Rv64InsnDesc;
+
+extern const Rv64InsnDesc rv64_insn_table[];
+extern const u32 rv64_insn_table_n;
+
+/* Linear-scan lookup. Returns the matching descriptor or NULL. First
+ * match wins; ordering puts more-specific entries (aliases, fixed-Rd
+ * forms) before broader ones. `av_wanted` is the RV_AV_* mask of the
+ * decoding arch (RV_AV_RV32 / RV_AV_RV64); rows whose av is set and
+ * excludes it are skipped. Pass RV_AV_RV64 to reproduce the historical
+ * rv64-only behavior exactly. */
+const Rv64InsnDesc* rv64_disasm_find(u32 word, u8 av_wanted);
+
+/* Compressed-instruction (16-bit) variant. Pass the halfword in the low
+ * 16 bits of `word`. `av_wanted` branches the ambiguous quadrant slots
+ * whose meaning differs between rv32 and rv64. Returns NULL if no
+ * descriptor matches. */
+const Rv64InsnDesc* rv64_disasm_find_c(u32 word, u8 av_wanted);
+
+/* Mnemonic → descriptor for the assembler. Returns NULL if not found.
+ * Ignores ALIAS-only rows when those would produce ambiguous parses
+ * (the canonical form is always reachable). `av_wanted` filters rows by
+ * target arch so e.g. `ld`/`addiw` are not assemblable under rv32. */
+const Rv64InsnDesc* rv64_asm_find(Slice mnemonic, u8 av_wanted);
+
+/* ===================================================================
+ * Operand print / parse dispatch.
+ *
+ * rv64_print_operands renders the operand text (everything after the
+ * mnemonic) for `word` into `sb`, using `desc->fmt` to dispatch.
+ * Mnemonic itself is in `desc->mnemonic`; the caller writes it before
+ * calling this helper. `vaddr` is the instruction's virtual address for
+ * PC-relative formats; pass 0 if not known. */
+void rv64_print_operands(StrBuf* sb, const Rv64InsnDesc* desc, u32 word,
+ u64 vaddr);
+
+#endif /* KIT_RV64_ISA_H */
diff --git a/src/arch/riscv/link.c b/src/arch/riscv/link.c
@@ -0,0 +1,145 @@
+/* RV64 link-time arch descriptor. See link_arch.h for the contract.
+ *
+ * The PLT0/PLT-entry/IPLT-stub byte layouts here mirror what used to
+ * live inline in link_dyn.c (PLT) and link_layout.c (IPLT) before the
+ * vtable refactor; comments preserve the WHY (notably the +0x800 bias
+ * on AUIPC immediates). */
+
+#include "arch/riscv/isa.h"
+#include "core/bytes.h"
+#include "core/core.h"
+#include "link/link_arch.h"
+
+/* PLT0 is 8 canonical NOPs (32 bytes); each PLT entry and IPLT stub is
+ * 4 instructions (16 bytes) / 3 instructions (12 bytes) respectively.
+ * Encoded once here so the descriptor and emitters stay in sync. */
+#define RV64_PLT0_SIZE 32u
+#define RV64_PLT_ENTRY_SIZE 16u
+#define RV64_IPLT_STUB_SIZE 12u
+
+/* Split a PC-relative displacement into the (hi20, lo12) pair consumed
+ * by the AUIPC + I-type sequence. The +0x800 bias is the standard
+ * RISC-V two-instruction PCREL trick: AUIPC adds an upper-20 immediate
+ * shifted left 12, then the second instruction adds a sign-extended
+ * 12-bit lo12. If we naively split disp into (disp>>12, disp&0xfff)
+ * the lo12 sign-extends as a *negative* number whenever bit 11 is set,
+ * which underflows the AUIPC result by 0x1000. Adding 0x800 before
+ * the shift rounds the high half up in exactly the cases that need it
+ * so AUIPC + sign-extended-lo12 reconstructs disp correctly. */
+static inline void rv64_split_pcrel(i64 disp, u32* hi20_out, u32* lo12_out) {
+ *hi20_out = (u32)(((u64)(disp + 0x800)) >> 12) & 0xfffffu;
+ *lo12_out = (u32)((u64)disp & 0xfffu);
+}
+
+/* PLT0 under DF_1_NOW is never executed — the loader resolves every
+ * JUMP_SLOT before transferring control — but we still emit it in
+ * canonical form (8 NOPs) so disassemblers and unwinders see a well-
+ * formed prologue at the top of .plt. */
+static void rv64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
+ u32 i;
+ (void)plt0_vaddr;
+ (void)gotplt_vaddr;
+ for (i = 0; i < RV64_PLT0_SIZE; i += 4u) wr_u32_le(dst + i, rv_nop());
+}
+
+/* Per-import PLT entry: load the GOT slot pre-filled by the loader
+ * (R_RISCV_JUMP_SLOT) and tail-call through it. t1 is the standard
+ * psABI scratch for the trampoline return-address (clobbered by the
+ * lazy resolver in the non-BIND_NOW path); t3 holds the slot pointer. */
+static void rv64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
+ i64 disp = (i64)slot_vaddr - (i64)entry_vaddr;
+ u32 hi20;
+ u32 lo12;
+ rv64_split_pcrel(disp, &hi20, &lo12);
+ wr_u32_le(dst + 0, rv_auipc(RV_T3, hi20));
+ wr_u32_le(dst + 4, rv_ld(RV_T3, RV_T3, (i32)lo12));
+ wr_u32_le(dst + 8, rv_jalr(RV_T1, RV_T3, 0));
+ wr_u32_le(dst + 12, rv_nop());
+}
+
+/* IPLT stub: load .igot.plt[i] (filled at startup by the resolver) and
+ * tail-call to it. The stub->slot displacement is invariant under the
+ * segment-base shift (both addresses live in the same image), so we
+ * bake it directly into the instructions and report zero apply-time
+ * relocs — unlike aarch64, which cannot encode a 32-bit pcrel inline. */
+static u32 rv64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
+ LinkArchIPltReloc out[2]) {
+ i64 disp = (i64)slot_vaddr - (i64)stub_vaddr;
+ u32 hi20;
+ u32 lo12;
+ (void)out;
+ rv64_split_pcrel(disp, &hi20, &lo12);
+ wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20));
+ wr_u32_le(dst + 4, rv_ld(RV_T1, RV_T1, (i32)lo12));
+ wr_u32_le(dst + 8, rv_jr(RV_T1));
+ return 0u;
+}
+
+/* RV32 PLT entry: identical to rv64_emit_plt_entry except the GOT slot
+ * is 4 bytes (one XLEN word), so the load is LW not LD. Entry stays
+ * 16 bytes / 4 insns; the AUIPC + (hi20,lo12) split is XLEN-neutral. */
+static void rv32_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
+ i64 disp = (i64)slot_vaddr - (i64)entry_vaddr;
+ u32 hi20;
+ u32 lo12;
+ rv64_split_pcrel(disp, &hi20, &lo12);
+ wr_u32_le(dst + 0, rv_auipc(RV_T3, hi20));
+ wr_u32_le(dst + 4, rv_lw(RV_T3, RV_T3, (i32)lo12));
+ wr_u32_le(dst + 8, rv_jalr(RV_T1, RV_T3, 0));
+ wr_u32_le(dst + 12, rv_nop());
+}
+
+/* RV32 IPLT stub: identical to rv64_emit_iplt_stub except the
+ * .igot.plt slot is 4 bytes, so LW not LD. Stub stays 12 bytes /
+ * 3 insns; displacement is baked inline, so zero apply-time relocs. */
+static u32 rv32_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
+ LinkArchIPltReloc out[2]) {
+ i64 disp = (i64)slot_vaddr - (i64)stub_vaddr;
+ u32 hi20;
+ u32 lo12;
+ (void)out;
+ rv64_split_pcrel(disp, &hi20, &lo12);
+ wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20));
+ wr_u32_le(dst + 4, rv_lw(RV_T1, RV_T1, (i32)lo12));
+ wr_u32_le(dst + 8, rv_jr(RV_T1));
+ return 0u;
+}
+
+/* A direct rv64 call (R_RV_CALL = AUIPC+JALR) reaches only ±2GiB. In the JIT,
+ * an external SK_ABS target (a host libc symbol resolved to an arbitrary
+ * address) can lie farther than that from the JIT-allocated code region, where
+ * link_reloc_apply would panic "RV CALL out of range". Reporting these as
+ * branch relocs routes them through the JIT call-stub pass, which reuses
+ * emit_iplt_stub (AUIPC+LD+JR) to reach an arbitrary address held in an
+ * in-image slot — the same safety net aa64 and x64 already wire. */
+static int rv64_is_branch_reloc(RelocKind kind) {
+ return kind == R_RV_CALL || kind == R_PLT32;
+}
+
+const LinkArchDesc link_arch_rv64 = {
+ .plt0_size = RV64_PLT0_SIZE,
+ .plt_entry_size = RV64_PLT_ENTRY_SIZE,
+ .iplt_stub_size = RV64_IPLT_STUB_SIZE,
+ .global_pointer_symbol = "__global_pointer$",
+ .global_pointer_rw_offset = 0x800u,
+ .emit_plt0 = rv64_emit_plt0,
+ .emit_plt_entry = rv64_emit_plt_entry,
+ .emit_iplt_stub = rv64_emit_iplt_stub,
+ .needs_jit_call_stub = rv64_is_branch_reloc,
+};
+
+/* RV32 link descriptor: identical to rv64 (PLT0/entry/stub byte sizes,
+ * __global_pointer$ + 0x800 RW bias, canonical 8-NOP PLT0, and the JIT
+ * call-stub predicate) EXCEPT the PLT/IPLT emitters load 4-byte GOT
+ * slots with LW instead of LD. */
+const LinkArchDesc link_arch_rv32 = {
+ .plt0_size = RV64_PLT0_SIZE,
+ .plt_entry_size = RV64_PLT_ENTRY_SIZE,
+ .iplt_stub_size = RV64_IPLT_STUB_SIZE,
+ .global_pointer_symbol = "__global_pointer$",
+ .global_pointer_rw_offset = 0x800u,
+ .emit_plt0 = rv64_emit_plt0,
+ .emit_plt_entry = rv32_emit_plt_entry,
+ .emit_iplt_stub = rv32_emit_iplt_stub,
+ .needs_jit_call_stub = rv64_is_branch_reloc,
+};
diff --git a/src/arch/riscv/native.c b/src/arch/riscv/native.c
@@ -0,0 +1,4048 @@
+/* src/arch/rv64/native.c — RISC-V (RV64GC, LP64D) NativeTarget implementation.
+ *
+ * Mirrors the aa64 reference (src/arch/aa64/native.c): a physical-emission
+ * NativeTarget driven at -O0 by the shared NativeDirectTarget and at -O1+ by
+ * the optimizer emit path. ABI decisions go through the abi/ interface; this
+ * file owns only ISA emission and the RV64 frame layout.
+ *
+ * Frame model (single, top-record): s0 (x8) is the frame pointer anchored at
+ * the saved s0/ra pair; slots live below s0 at positive byte offsets `off`
+ * (address = s0 - off); outgoing args sit at the bottom of the frame (sp+0..).
+ * frame_size = align16(16 + cum_off + max_outgoing + va_save_sz)
+ * fp_pair_off = frame_size - 16 - va_save_sz (saved pair, sp-relative)
+ * CFA = s0 + (frame_size - fp_pair_off)
+ * RISC-V has no condition flags: comparisons materialize a 0/1 via SLT/SLTU or
+ * FLT/FLE; branches compare two registers directly. x0 is a hardware zero. */
+
+#include <string.h>
+
+#include "abi/abi.h"
+#include "arch/riscv/asm.h"
+#include "arch/riscv/isa.h"
+#include "arch/riscv/regs.h"
+#include "arch/riscv/rv64.h"
+#include "arch/riscv/variant.h"
+#include "asm/asm.h"
+#include "asm/asm_lex.h"
+#include "cg/native_argmove.h"
+#include "cg/native_asm.h"
+#include "cg/native_direct_target.h"
+#include "cg/native_frame.h"
+#include "cg/type.h"
+#include "core/arena.h"
+#include "core/bytes.h"
+#include "core/pool.h"
+#include "core/slice.h"
+#include "obj/obj.h"
+
+enum {
+ RV_TMP0 = 5u, /* t0: emit-internal scratch (reserved, never allocable) */
+ RV_TMP1 = 6u, /* t1: emit-internal scratch */
+ RV_TMP2 = 7u, /* t2: emit-internal scratch (reserved in phys table) */
+ RV_TMP3 = 28u, /* t3: emit-internal scratch (reserved in phys table) */
+ RV_FTMP0 = 0u, /* ft0: emit-internal FP scratch */
+ RV_FTMP1 = 1u, /* ft1: emit-internal FP scratch */
+ RV_FA0 = 10u, /* fa0..fa7 = f10..f17 (FP arg/return registers) */
+ RV_FA7 = 17u,
+ /* Single-pass (-O0) worst-case prologue: sp adjust (3) + far save pair (7)
+ * + sret spill (1) + variadic GP spills (8). No callee-saves at -O0. */
+ RV_PROLOGUE_WORDS = 32u,
+ /* Known-frame (-O1) prologues are emitted directly, not into the fixed -O0
+ * NOP region, and additionally save callee-saved registers (up to 11 int + 12
+ * fp, each up to 4 words for a far s0-relative offset) on top of the header,
+ * sret, and variadic spills. Size the build buffer for the worst case. */
+ RV_KNOWN_PROLOGUE_WORDS = 192u,
+ RV_FRAME_SAVE_SIZE = 16u,
+};
+
+/* s1..s11 (11) + fs0..fs11 (12); separate int/fp collect arrays use this cap.
+ */
+#define RV_MAX_CALLEE_SAVES 16u
+#define RV_MAX_REG_ARG_MOVES 16u
+
+extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
+extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs,
+ u32 end_ofs);
+
+/* ============================ low-level emit ============================ */
+
+void rv64_emit32(MCEmitter* mc, u32 word) {
+ u8 b[4];
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ wr_u32_le(b, word);
+ mc->emit_bytes(mc, b, sizeof b);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
+void rv64_emit16(MCEmitter* mc, u32 halfword) {
+ u8 b[2];
+ u32 ofs = obj_pos(mc->obj, mc->section_id);
+ b[0] = (u8)(halfword & 0xff);
+ b[1] = (u8)((halfword >> 8) & 0xff);
+ mc->emit_bytes(mc, b, sizeof b);
+ if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
+}
+
+static void rv_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) {
+ u8 b[4];
+ wr_u32_le(b, word);
+ obj_patch(obj, sec, off, b, sizeof b);
+}
+
+static int fits_i12(i64 v) { return v >= -2048 && v <= 2047; }
+static int fits_i32(i64 v) {
+ return v >= (i64)(i32)0x80000000 && v <= (i64)(i32)0x7fffffff;
+}
+
+static u32 align_up_u32(u32 v, u32 align) {
+ u32 mask = align ? align - 1u : 0u;
+ return (v + mask) & ~mask;
+}
+
+static i64 floor_div_4096(i64 v) {
+ if (v >= 0) return v / 4096;
+ return -((-v + 4095) / 4096);
+}
+
+static void rv_emit_li32(const RiscvVariant* v, MCEmitter* mc, u32 rd,
+ i32 imm) {
+ if (imm >= -2048 && imm <= 2047) {
+ rv64_emit32(mc, rv_addi(rd, RV_ZERO, imm));
+ return;
+ }
+ {
+ i64 hi64 = floor_div_4096((i64)imm + 0x800);
+ i32 hi = (i32)hi64;
+ i32 lo = (i32)((i64)imm - hi64 * 4096);
+ rv64_emit32(mc, rv_lui(rd, (u32)hi & 0xfffffu));
+ /* ADDIW is RV64-only; on RV32 the value fits 32 bits so plain ADDI is
+ * exact (and identical to ADDIW's low result on RV64). */
+ if (lo)
+ rv64_emit32(mc, v->has_w_forms ? rv_addiw(rd, rd, lo) : rv_addi(rd, rd, lo));
+ }
+}
+
+static i32 sext12(u32 v) {
+ v &= 0xfffu;
+ return (v & 0x800u) ? (i32)v - 4096 : (i32)v;
+}
+
+/* Builds a full XLEN-wide value. The recursion / slli-12 chain assembles bits
+ * above 32 and is only ever reached on rv64 (a single rv32 register cannot hold
+ * a value wider than 32 bits — the cg layer legalizes those into pairs). */
+static void rv_emit_li64(const RiscvVariant* v, MCEmitter* mc, u32 rd, u64 imm) {
+ if (fits_i32((i64)imm)) {
+ rv_emit_li32(v, mc, rd, (i32)(i64)imm);
+ return;
+ }
+ {
+ i32 lo = sext12((u32)imm);
+ u64 hi = (imm - (u64)(i64)lo) >> 12;
+ rv_emit_li64(v, mc, rd, hi);
+ rv64_emit32(mc, rv_slli(rd, rd, 12));
+ if (lo) rv64_emit32(mc, rv_addi(rd, rd, lo));
+ }
+}
+
+/* sf!=0 selects a full native-width materialization; sf==0 a 32-bit value. On
+ * rv32 the native width is 32, so the wide branch collapses to the 32-bit
+ * path. */
+static void rv_emit_load_imm(const RiscvVariant* v, MCEmitter* mc, u32 sf,
+ u32 rd, i64 imm) {
+ if (!sf || v->xlen == 32u) {
+ rv_emit_li32(v, mc, rd, (i32)imm);
+ return;
+ }
+ if (fits_i32(imm))
+ rv_emit_li32(v, mc, rd, (i32)imm);
+ else
+ rv_emit_li64(v, mc, rd, (u64)imm);
+}
+
+/* rd = base + off, materializing the offset when it exceeds imm12. Uses RV_TMP1
+ * as scratch for the wide path, so callers must keep RV_TMP1 free. */
+static void rv_emit_addr_adjust(const RiscvVariant* v, MCEmitter* mc, u32 rd,
+ u32 base, i32 off) {
+ if (off == 0) {
+ if (rd != base) rv64_emit32(mc, rv_addi(rd, base, 0));
+ return;
+ }
+ if (fits_i12(off)) {
+ rv64_emit32(mc, rv_addi(rd, base, off));
+ return;
+ }
+ rv_emit_load_imm(v, mc, 1, RV_TMP1, (i64)off);
+ rv64_emit32(mc, rv_add(rd, base, RV_TMP1));
+}
+
+static u32 enc_int_store(const RiscvVariant* v, u32 nbytes, u32 src, u32 base,
+ i32 off) {
+ switch (nbytes) {
+ case 1:
+ return rv_sb(src, base, off);
+ case 2:
+ return rv_sh(src, base, off);
+ case 4:
+ return rv_sw(src, base, off);
+ default:
+ /* The widest GPR store is SD on rv64, SW on rv32. */
+ return v->ptr_bytes == 8u ? rv_sd(src, base, off) : rv_sw(src, base, off);
+ }
+}
+static u32 enc_int_load(const RiscvVariant* v, u32 nbytes, int sign_ext, u32 rd,
+ u32 base, i32 off) {
+ switch (nbytes) {
+ case 1:
+ return sign_ext ? rv_lb(rd, base, off) : rv_lbu(rd, base, off);
+ case 2:
+ return sign_ext ? rv_lh(rd, base, off) : rv_lhu(rd, base, off);
+ case 4:
+ /* LWU (zero-extending 32-bit load) is RV64-only; on rv32 a 4-byte load
+ * is just LW (no wider container to zero-extend into). */
+ return sign_ext || v->xlen == 32u ? rv_lw(rd, base, off)
+ : rv_lwu(rd, base, off);
+ default:
+ /* The widest GPR load is LD on rv64, LW on rv32. */
+ return v->ptr_bytes == 8u ? rv_ld(rd, base, off) : rv_lw(rd, base, off);
+ }
+}
+
+/* Pointer-width GPR load/store (GOT entries, frame-value bases, saved ra/s0,
+ * sret/indirect/va_list pointers): LD/SD on rv64, LW/SW on rv32. */
+static u32 rv_ld_ptr(const RiscvVariant* v, u32 rd, u32 base, i32 off) {
+ return v->ptr_bytes == 8u ? rv_ld(rd, base, off) : rv_lw(rd, base, off);
+}
+static u32 rv_sd_ptr(const RiscvVariant* v, u32 src, u32 base, i32 off) {
+ return v->ptr_bytes == 8u ? rv_sd(src, base, off) : rv_sw(src, base, off);
+}
+
+/* ============================ target state ============================ */
+
+/* Frame slots and callee-save records live in the shared NativeFrame
+ * bookkeeping (cg/native_frame.h); these aliases keep the rv64-local spellings.
+ */
+typedef NativeFrameSlotEntry RvNativeSlot;
+typedef NativeFrameCalleeSave RvCalleeSave;
+
+typedef enum RvPatchKind { RV_PATCH_ALLOCA } RvPatchKind;
+
+typedef struct RvPatch {
+ u8 kind; /* RvPatchKind */
+ u32 pos;
+ u32 dst_reg;
+} RvPatch;
+
+typedef struct RvNativeTarget {
+ NativeTarget base;
+ /* Immutable per-XLEN descriptor (rv32 / rv64), set once in the constructor
+ * from c->target.arch. Every XLEN-dependent emit site reads it; with the
+ * rv64 variant each site reproduces the historical literal exactly. */
+ const RiscvVariant* variant;
+ SrcLoc loc;
+ const CGFuncDesc* func;
+
+ /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save
+ * set, and the known_frame / has_alloca / frame_final flags. */
+ NativeFrame frame;
+ u32 frame_size_final;
+ u32 fp_pair_off;
+ u32 minimal_prologue_words; /* known-frame path: exact prologue length, else 0
+ */
+
+ /* Known-frame (-O1) leaf no-frame tier (aa64's slim_prologue equivalent),
+ * settled in rv_func_begin_known_frame; always 0 on the single-pass path. A
+ * leaf with no callee-saves, no body slots, no outgoing args, no
+ * sret/variadic and register-only params never reads s0 nor clobbers ra, so
+ * it emits NO prologue and a bare `ret` — the whole frame setup/teardown is
+ * elided. RISC-V has no pre/post-indexed store, so aa64's fp_at_bottom fold
+ * would save zero instructions on a kept frame and is intentionally not
+ * ported (see doc/plan/ARCH.md §2); this leaf tier is the rv64 win. */
+ u8 slim_prologue;
+
+ u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */
+ u32 next_param_int;
+ u32 next_param_fp;
+ u32 next_param_stack;
+ u8 has_sret;
+ u8 is_variadic;
+ NativeFrameSlot sret_ptr_slot;
+
+ RvPatch* patches;
+ u32 npatches;
+ u32 patches_cap;
+ u32 nalloca;
+
+ u32 func_start;
+ u32 prologue_pos;
+ MCLabel epilogue_label;
+} RvNativeTarget;
+
+static RvNativeTarget* rv_of(NativeTarget* t) { return (RvNativeTarget*)t; }
+
+static _Noreturn void rv_panic(RvNativeTarget* a, const char* msg) {
+ compiler_panic(a->base.c, a->loc, "rv64 native target: %s", msg);
+}
+
+static RvNativeSlot* rv_slot_get(RvNativeTarget* a, NativeFrameSlot fs) {
+ return native_frame_slot_at(&a->frame, fs);
+}
+
+/* s0-relative byte offset of a frame slot's base (address = s0 + ret). */
+static i32 rv_s0_off_slot(const RvNativeSlot* s) { return -(i32)s->off; }
+
+static u32 rv_va_save_sz(const RvNativeTarget* a) {
+ /* ABI-derived: the variadic register-save area is gp_reg_count*gp_slot_size
+ * (a0..a7 = 64 bytes for LP64D, 32 for ILP32). Only present in variadics. */
+ return a->is_variadic ? native_frame_va_save_bytes(a->base.c->abi) : 0u;
+}
+
+/* s0-relative byte offset of incoming stack arg at byte_off. Stack args sit
+ * just above the saved pair; the variadic GP save area (when present) is
+ * contiguous with them at [s0 + frame_save_size). */
+static i32 rv_s0_off_in_arg(const RvNativeTarget* a, u32 byte_off) {
+ u32 base = a->variant->frame_save_size;
+ if (a->is_variadic) base += rv_va_save_sz(a);
+ return (i32)(base + byte_off);
+}
+
+/* Callee-saved registers are homed just below the locals at rv_save_off() —
+ * they are NOT frame slots, so the frame size must reserve their bytes
+ * explicitly. Integer saves are ptr_bytes wide (sd on rv64, sw on rv32); FP
+ * saves are always 8 bytes (fsd, even on rv32d). On rv64 both are 8 so the sum
+ * is identical to the historical ncallee_saves*8. Zero at -O0. */
+static u32 rv_callee_save_bytes(const RvNativeTarget* a) {
+ u32 ptr = a->variant->ptr_bytes;
+ u32 i, bytes = 0;
+ for (i = 0; i < a->frame.ncallee_saves; ++i)
+ bytes += a->frame.callee_saves[i].cls == NATIVE_REG_FP ? 8u : ptr;
+ return bytes;
+}
+
+static u32 rv_frame_size(const RvNativeTarget* a) {
+ u32 raw = a->variant->frame_save_size + a->frame.cum_off +
+ rv_callee_save_bytes(a) + a->frame.max_outgoing + rv_va_save_sz(a);
+ return align_up_u32(raw, 16u);
+}
+
+static u32 rv_fp_pair_off(const RvNativeTarget* a, u32 frame_size) {
+ return frame_size - a->variant->frame_save_size - rv_va_save_sz(a);
+}
+
+/* ============================ type helpers ============================ */
+
+/* Scalar size/align/mem/class/loc constructors are shared in native_target.h
+ * (native_type_size, native_type_align, native_mem_for_type,
+ * native_class_for_type_fp_le8, native_loc_reg, native_loc_stack,
+ * native_loc_is_fp). loc_reg's mask is arch-specific and stays here. */
+
+/* True when a scalar value is WIDER than XLEN's natural single-register width,
+ * i.e. it needs the "wide" (rv64 64-bit) ops rather than the base ops. On rv64
+ * a pointer is 8 bytes and counts as wide alongside i64/double; on rv32 a
+ * pointer is 4 bytes and fits a single 32-bit register, so it is NOT wide and
+ * the base (non-W) ops apply. (Kept named rv_is_64 to minimize churn; for the
+ * rv64 variant the result is byte-identical to the old predicate.) */
+static int rv_is_64(NativeTarget* t, KitCgTypeId type) {
+ const RiscvVariant* v = rv_of(t)->variant;
+ return native_type_size(t, type) >= 8u ||
+ (v->xlen == 64u && cg_type_is_ptr(t->c, type));
+}
+
+static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; }
+
+/* ============================ register tables ============================ */
+
+#define RV_PHYS_INT_ARG(r, idx) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_INT, \
+ .abi_index = (idx), \
+ .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
+ ((idx) < 2u ? NATIVE_REG_RET : 0), \
+ .spill_cost = 1u, \
+ .copy_cost = 1u}
+#define RV_PHYS_INT_CALLER(r) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_INT, \
+ .abi_index = 0xffu, \
+ .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
+ .spill_cost = 1u, \
+ .copy_cost = 1u}
+#define RV_PHYS_INT_CALLEE(r) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_INT, \
+ .abi_index = 0xffu, \
+ .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
+ .spill_cost = 4u, \
+ .copy_cost = 1u}
+#define RV_PHYS_INT_RESERVED(r) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_INT, \
+ .abi_index = 0xffu, \
+ .flags = NATIVE_REG_RESERVED, \
+ .spill_cost = 0u, \
+ .copy_cost = 0u}
+
+/* t0..t3 (x5,x6,x7,x28) are emit-internal scratch (RV_TMP0..RV_TMP3), reserved
+ * and never handed to the allocator or driver. t4/t5 are the driver scratch
+ * pool (disjoint from the emit temps so a hook can never clobber an operand the
+ * driver parked there). t6 is the lone caller-saved allocable (the -O0 cache's
+ * only caller-saved home); s1..s11 are appended callee-saved, chosen under
+ * pressure (and saved by the optimizer prologue at -O1). */
+static const Reg rv_int_allocable[] = {31u, 9u, 18u, 19u, 20u, 21u,
+ 22u, 23u, 24u, 25u, 26u, 27u};
+static const Reg rv_int_scratch[] = {29u, 30u}; /* t4, t5 */
+
+static const NativePhysRegInfo rv_int_phys[] = {
+ RV_PHYS_INT_RESERVED(0u), /* zero */
+ RV_PHYS_INT_RESERVED(1u), /* ra */
+ RV_PHYS_INT_RESERVED(2u), /* sp */
+ RV_PHYS_INT_RESERVED(3u), /* gp */
+ RV_PHYS_INT_RESERVED(4u), /* tp */
+ RV_PHYS_INT_RESERVED(5u), /* t0 = TMP0 */
+ RV_PHYS_INT_RESERVED(6u), /* t1 = TMP1 */
+ RV_PHYS_INT_RESERVED(7u), /* t2 = TMP2 (emit) */
+ RV_PHYS_INT_RESERVED(8u), /* s0/fp */
+ RV_PHYS_INT_CALLEE(9u), /* s1 */
+ RV_PHYS_INT_ARG(10u, 0u), RV_PHYS_INT_ARG(11u, 1u),
+ RV_PHYS_INT_ARG(12u, 2u), RV_PHYS_INT_ARG(13u, 3u),
+ RV_PHYS_INT_ARG(14u, 4u), RV_PHYS_INT_ARG(15u, 5u),
+ RV_PHYS_INT_ARG(16u, 6u), RV_PHYS_INT_ARG(17u, 7u),
+ RV_PHYS_INT_CALLEE(18u), RV_PHYS_INT_CALLEE(19u),
+ RV_PHYS_INT_CALLEE(20u), RV_PHYS_INT_CALLEE(21u),
+ RV_PHYS_INT_CALLEE(22u), RV_PHYS_INT_CALLEE(23u),
+ RV_PHYS_INT_CALLEE(24u), RV_PHYS_INT_CALLEE(25u),
+ RV_PHYS_INT_CALLEE(26u), RV_PHYS_INT_CALLEE(27u),
+ RV_PHYS_INT_RESERVED(28u), /* t3 = TMP3 (emit) */
+ RV_PHYS_INT_RESERVED(29u), /* t4 = driver scratch */
+ RV_PHYS_INT_RESERVED(30u), /* t5 = driver scratch */
+ RV_PHYS_INT_CALLER(31u), /* t6 = caller-saved allocable */
+};
+
+#define RV_PHYS_FP_ARG(r, idx) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_FP, \
+ .abi_index = (idx), \
+ .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
+ ((idx) < 2u ? NATIVE_REG_RET : 0), \
+ .spill_cost = 1u, \
+ .copy_cost = 1u}
+#define RV_PHYS_FP_CALLER(r) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_FP, \
+ .abi_index = 0xffu, \
+ .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
+ .spill_cost = 1u, \
+ .copy_cost = 1u}
+#define RV_PHYS_FP_CALLEE(r) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_FP, \
+ .abi_index = 0xffu, \
+ .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
+ .spill_cost = 4u, \
+ .copy_cost = 1u}
+#define RV_PHYS_FP_RESERVED(r) \
+ {.reg = (r), \
+ .cls = NATIVE_REG_FP, \
+ .abi_index = 0xffu, \
+ .flags = NATIVE_REG_RESERVED, \
+ .spill_cost = 0u, \
+ .copy_cost = 0u}
+
+/* Caller-saved allocable first (ft4..ft7, ft8..ft11), then callee (fs0..fs11).
+ * ft0/ft1 reserved as emit-internal scratch; ft2/ft3 driver scratch. */
+static const Reg rv_fp_allocable[] = {4u, 5u, 6u, 7u, 28u, 29u, 30u,
+ 31u, 8u, 9u, 18u, 19u, 20u, 21u,
+ 22u, 23u, 24u, 25u, 26u, 27u};
+static const Reg rv_fp_scratch[] = {2u, 3u}; /* ft2, ft3 */
+
+static const NativePhysRegInfo rv_fp_phys[] = {
+ RV_PHYS_FP_RESERVED(0u), /* ft0 = FTMP0 */
+ RV_PHYS_FP_RESERVED(1u), /* ft1 = FTMP1 */
+ RV_PHYS_FP_RESERVED(2u), /* ft2 = scratch */
+ RV_PHYS_FP_RESERVED(3u), /* ft3 = scratch */
+ RV_PHYS_FP_CALLER(4u), RV_PHYS_FP_CALLER(5u), RV_PHYS_FP_CALLER(6u),
+ RV_PHYS_FP_CALLER(7u), RV_PHYS_FP_CALLEE(8u), RV_PHYS_FP_CALLEE(9u),
+ RV_PHYS_FP_ARG(10u, 0u), RV_PHYS_FP_ARG(11u, 1u), RV_PHYS_FP_ARG(12u, 2u),
+ RV_PHYS_FP_ARG(13u, 3u), RV_PHYS_FP_ARG(14u, 4u), RV_PHYS_FP_ARG(15u, 5u),
+ RV_PHYS_FP_ARG(16u, 6u), RV_PHYS_FP_ARG(17u, 7u), RV_PHYS_FP_CALLEE(18u),
+ RV_PHYS_FP_CALLEE(19u), RV_PHYS_FP_CALLEE(20u), RV_PHYS_FP_CALLEE(21u),
+ RV_PHYS_FP_CALLEE(22u), RV_PHYS_FP_CALLEE(23u), RV_PHYS_FP_CALLEE(24u),
+ RV_PHYS_FP_CALLEE(25u), RV_PHYS_FP_CALLEE(26u), RV_PHYS_FP_CALLEE(27u),
+ RV_PHYS_FP_CALLER(28u), RV_PHYS_FP_CALLER(29u), RV_PHYS_FP_CALLER(30u),
+ RV_PHYS_FP_CALLER(31u),
+};
+
+static const NativeAllocClassInfo rv_classes[] = {
+ {.cls = NATIVE_REG_INT,
+ .allocable = rv_int_allocable,
+ .nallocable = sizeof rv_int_allocable / sizeof rv_int_allocable[0],
+ .scratch = rv_int_scratch,
+ .nscratch = sizeof rv_int_scratch / sizeof rv_int_scratch[0],
+ .phys = rv_int_phys,
+ .nphys = sizeof rv_int_phys / sizeof rv_int_phys[0],
+ /* t0-t6 (5-7,28-31) + a0-a7 (10-17) */
+ .caller_saved_mask = 0xf00400e0u | 0x0001fc00u,
+ /* s0-s11 (8,9,18-27) */
+ .callee_saved_mask = 0x0ffc0300u,
+ .arg_mask = 0x0001fc00u,
+ .ret_mask = 0x00000c00u,
+ /* zero,ra,sp,gp,tp,t0,t1,t2,s0 (bits 0-8) + t3 (bit 28). t4/t5 are the
+ * driver scratch pool (reserved-from-alloc but listed in scratch[]). */
+ .reserved_mask = 0x000001ffu | (1u << 28)},
+ {.cls = NATIVE_REG_FP,
+ .allocable = rv_fp_allocable,
+ .nallocable = sizeof rv_fp_allocable / sizeof rv_fp_allocable[0],
+ .scratch = rv_fp_scratch,
+ .nscratch = sizeof rv_fp_scratch / sizeof rv_fp_scratch[0],
+ .phys = rv_fp_phys,
+ .nphys = sizeof rv_fp_phys / sizeof rv_fp_phys[0],
+ /* ft0-ft7 (0-7), fa0-fa7 (10-17), ft8-ft11 (28-31) */
+ .caller_saved_mask = 0xf00400ffu | 0x0001fc00u,
+ /* fs0-fs11 (8,9,18-27) */
+ .callee_saved_mask = 0x0ffc0300u,
+ .arg_mask = 0x0001fc00u,
+ .ret_mask = 0x00000c00u,
+ .reserved_mask = 0x0000000fu /* ft0-ft3 */},
+};
+
+/* Resolve a register name ("a7", "fa0", ...) to its (class, Reg). Powers the
+ * optimizer's inline-asm clobber masks and explicit hard-register operands
+ * ("{a7}" from a GNU local register variable). x0..x31 are DWARF 0..31; the
+ * FP bank f0..f31 is DWARF 32..63. Returns non-zero for a non-register name
+ * (cc/memory/unknown), which the caller skips. */
+static int rv_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out,
+ NativeAllocClass* cls_out) {
+ char buf[16];
+ uint32_t dwarf;
+ (void)ri;
+ if (!name.s || !name.len || name.len >= sizeof buf) return 1;
+ memcpy(buf, name.s, name.len);
+ buf[name.len] = '\0';
+ if (rv64_register_index(buf, &dwarf) != 0) return 1;
+ if (dwarf <= 31u) {
+ *cls_out = NATIVE_REG_INT;
+ *out = (Reg)dwarf;
+ return 0;
+ }
+ if (dwarf >= 32u && dwarf <= 63u) {
+ *cls_out = NATIVE_REG_FP;
+ *out = (Reg)(dwarf - 32u);
+ return 0;
+ }
+ return 1;
+}
+
+static int rv_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls,
+ Reg reg) {
+ (void)ri;
+ if (cls == NATIVE_REG_INT) {
+ if (reg == 9u) return 1; /* s1 */
+ if (reg >= 10u && reg <= 17u) return 1; /* a0..a7 */
+ if (reg >= 18u && reg <= 27u) return 1; /* s2..s11 */
+ if (reg == 31u) return 1; /* t6 */
+ return 0;
+ }
+ if (cls == NATIVE_REG_FP) return reg >= 4u && reg <= 31u;
+ return 0;
+}
+
+static const NativeRegInfo rv_reg_info = {
+ .classes = rv_classes,
+ .nclasses = sizeof rv_classes / sizeof rv_classes[0],
+ .resolve_name = rv_resolve_name,
+ .asm_operand_reg_ok = rv_asm_operand_reg_ok,
+};
+
+/* ============================ legality ============================ */
+
+static int rv_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
+ KitCgTypeId type, i64 imm) {
+ /* SLLI/SRLI/SRAI shamt is shamt_bits wide: 6 bits (max 63) on rv64, 5 bits
+ * (max 31) on rv32. */
+ i64 shamt_max = (i64)((1u << rv_of(t)->variant->shamt_bits) - 1u);
+ (void)type;
+ switch (use) {
+ case NATIVE_IMM_MOVE:
+ return 1;
+ case NATIVE_IMM_BINOP:
+ switch ((BinOp)op) {
+ case BO_IADD:
+ return fits_i12(imm);
+ case BO_ISUB:
+ return fits_i12(-imm); /* emitted as ADDI with negated imm */
+ case BO_AND:
+ case BO_OR:
+ case BO_XOR:
+ return fits_i12(imm);
+ case BO_SHL:
+ case BO_SHR_S:
+ case BO_SHR_U:
+ return imm >= 0 && imm <= shamt_max;
+ default:
+ return 0;
+ }
+ case NATIVE_IMM_CMP:
+ return imm == 0; /* compares need both ends in registers (SLT/branch) */
+ case NATIVE_IMM_ADDR_OFFSET:
+ return fits_i12(imm);
+ }
+ return 0;
+}
+
+static int rv_addr_legal(NativeTarget* t, const NativeAddr* addr,
+ MemAccess mem) {
+ (void)t;
+ (void)mem;
+ if (!addr) return 0;
+ if (addr->index_kind != NATIVE_ADDR_INDEX_NONE) return 0;
+ if (addr->base_kind != NATIVE_ADDR_BASE_REG &&
+ addr->base_kind != NATIVE_ADDR_BASE_FRAME)
+ return 0;
+ return fits_i12(addr->offset);
+}
+
+/* ============================ memory ============================ */
+
+/* Materialize the runtime address of a global into `dst`, including addend. */
+static void rv_emit_global_addr(RvNativeTarget* a, u32 dst, ObjSymId sym,
+ i64 addend) {
+ NativeTarget* t = &a->base;
+ MCEmitter* mc = t->mc;
+ u32 sec = mc->section_id;
+ if (obj_symbol_extern_via_got(t->c, t->obj, sym)) {
+ u32 ap = mc->pos(mc);
+ rv64_emit32(mc, rv_auipc(dst, 0));
+ mc->emit_reloc_at(mc, sec, ap, R_RV_GOT_HI20, sym, 0, 0, 0);
+ {
+ Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
+ ObjSymId anchor =
+ obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
+ u32 lp = mc->pos(mc);
+ rv64_emit32(mc, rv_ld_ptr(a->variant, dst, dst, 0));
+ mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
+ }
+ } else {
+ u32 ap = mc->pos(mc);
+ rv64_emit32(mc, rv_auipc(dst, 0));
+ mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0);
+ {
+ Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
+ ObjSymId anchor =
+ obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
+ u32 lp = mc->pos(mc);
+ rv64_emit32(mc, rv_addi(dst, dst, 0));
+ mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
+ }
+ }
+ if (addend) rv_emit_addr_adjust(a->variant, mc, dst, dst, (i32)addend);
+}
+
+/* Fold (base_reg << 0) + (index << scale) into RV_TMP0 via Zba. */
+static u32 rv_fold_index(RvNativeTarget* a, u32 base, u32 idx, u8 log2_scale) {
+ MCEmitter* mc = a->base.mc;
+ switch (log2_scale) {
+ case 0:
+ rv64_emit32(mc, rv_add(RV_TMP0, base, idx));
+ break;
+ case 1:
+ rv64_emit32(mc, rv_sh1add(RV_TMP0, idx, base));
+ break;
+ case 2:
+ rv64_emit32(mc, rv_sh2add(RV_TMP0, idx, base));
+ break;
+ default:
+ rv64_emit32(mc, rv_sh3add(RV_TMP0, idx, base));
+ break;
+ }
+ return RV_TMP0;
+}
+
+/* Resolve any NativeAddr to a base register + imm12 offset. RISC-V has no
+ * indexed load/store, so an index is folded into RV_TMP0 via Zba; far offsets
+ * and FRAME/FRAME_VALUE/GLOBAL bases are materialized into RV_TMP0/RV_TMP1. */
+static void rv_resolve_mem_addr(RvNativeTarget* a, const NativeAddr* addr,
+ u32* base_out, i32* off_out) {
+ MCEmitter* mc = a->base.mc;
+ u32 base;
+ i32 off;
+ switch (addr->base_kind) {
+ case NATIVE_ADDR_BASE_REG:
+ base = addr->base.reg & 0x1fu;
+ off = addr->offset;
+ break;
+ case NATIVE_ADDR_BASE_FRAME: {
+ RvNativeSlot* s = rv_slot_get(a, addr->base.frame);
+ base = RV_S0;
+ off = rv_s0_off_slot(s) + addr->offset;
+ break;
+ }
+ case NATIVE_ADDR_BASE_FRAME_VALUE: {
+ RvNativeSlot* s = rv_slot_get(a, addr->base.frame);
+ rv64_emit32(mc, rv_ld_ptr(a->variant, RV_TMP0, RV_S0, rv_s0_off_slot(s)));
+ base = RV_TMP0;
+ off = addr->offset;
+ break;
+ }
+ case NATIVE_ADDR_BASE_GLOBAL:
+ rv_emit_global_addr(a, RV_TMP0, addr->base.global.sym,
+ addr->base.global.addend);
+ base = RV_TMP0;
+ off = addr->offset;
+ break;
+ default:
+ rv_panic(a, "unsupported address base");
+ }
+ if (addr->index_kind == NATIVE_ADDR_INDEX_REG) {
+ base = rv_fold_index(a, base, addr->index.reg & 0x1fu, addr->log2_scale);
+ } else if (addr->index_kind == NATIVE_ADDR_INDEX_FRAME_VALUE) {
+ RvNativeSlot* s = rv_slot_get(a, addr->index.frame);
+ rv64_emit32(mc, rv_ld_ptr(a->variant, RV_TMP1, RV_S0, rv_s0_off_slot(s)));
+ base = rv_fold_index(a, base, RV_TMP1, addr->log2_scale);
+ }
+ if (!fits_i12(off)) {
+ rv_emit_load_imm(a->variant, mc, 1, RV_TMP1, (i64)off);
+ rv64_emit32(mc, rv_add(RV_TMP0, base, RV_TMP1));
+ base = RV_TMP0;
+ off = 0;
+ }
+ *base_out = base;
+ *off_out = off;
+}
+
+/* Central load/store primitive. is_load: 1 load into reg, 0 store reg to mem.
+ */
+static void rv_emit_mem(RvNativeTarget* a, int is_load, NativeLoc reg,
+ NativeAddr addr, MemAccess mem) {
+ NativeTarget* t = &a->base;
+ MCEmitter* mc = t->mc;
+ u32 r = loc_reg(reg);
+ int fp = native_loc_is_fp(reg);
+ u32 sz = mem.size ? mem.size : native_type_size(t, reg.type);
+ u32 base;
+ i32 off;
+
+ rv_resolve_mem_addr(a, &addr, &base, &off);
+ if (fp) {
+ rv64_emit32(
+ mc, is_load ? (sz == 8u ? rv_fld(r, base, off) : rv_flw(r, base, off))
+ : (sz == 8u ? rv_fsd(r, base, off) : rv_fsw(r, base, off)));
+ } else {
+ rv64_emit32(mc, is_load ? enc_int_load(a->variant, sz, 0, r, base, off)
+ : enc_int_store(a->variant, sz, r, base, off));
+ }
+}
+
+/* ============================ moves / data ============================ */
+
+static void rv_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
+ MCEmitter* mc = t->mc;
+ int dfp = native_loc_is_fp(dst), sfp = native_loc_is_fp(src);
+ u32 rd = loc_reg(dst), rs = loc_reg(src);
+ if (dfp && sfp) {
+ u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
+ if (rd == rs) return;
+ rv64_emit32(mc, rv_fsgnj(fmt, rd, rs, rs));
+ return;
+ }
+ if (!dfp && sfp) {
+ u32 sz = native_type_size(t, src.type);
+ rv64_emit32(mc, sz == 8u ? rv_fmv_x_d(rd, rs) : rv_fmv_x_w(rd, rs));
+ return;
+ }
+ if (dfp && !sfp) {
+ u32 sz = native_type_size(t, dst.type);
+ rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(rd, rs) : rv_fmv_w_x(rd, rs));
+ return;
+ }
+ if (rd == rs) return;
+ rv64_emit32(mc, rv_addi(rd, rs, 0));
+}
+
+static void rv_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) {
+ rv_emit_load_imm(rv_of(t)->variant, t->mc, rv_is_64(t, dst.type) ? 1u : 0u,
+ loc_reg(dst), imm);
+}
+
+static void rv_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) {
+ RvNativeTarget* a = rv_of(t);
+ u64 v = 0;
+ u32 i;
+ if (!native_loc_is_fp(dst)) {
+ for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
+ rv_load_imm(t, dst, (i64)v);
+ return;
+ }
+ /* FP constant: materialize the bit pattern in TMP0, bitcast into the FPR. */
+ for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
+ rv_emit_load_imm(a->variant, t->mc, 1, RV_TMP0, (i64)v);
+ if (cb.size == 8u)
+ rv64_emit32(t->mc, rv_fmv_d_x(loc_reg(dst), RV_TMP0));
+ else
+ rv64_emit32(t->mc, rv_fmv_w_x(loc_reg(dst), RV_TMP0));
+ (void)a;
+}
+
+static void rv_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ u32 rd = loc_reg(dst);
+ u32 base;
+ i32 off;
+ if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL) {
+ rv_emit_global_addr(a, rd, addr.base.global.sym,
+ addr.base.global.addend + addr.offset);
+ base = rd;
+ off = 0;
+ } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
+ /* Load the pointer stored in the frame slot, then add the offset. */
+ RvNativeSlot* s = rv_slot_get(a, addr.base.frame);
+ rv64_emit32(mc, rv_ld_ptr(a->variant, rd, RV_S0, rv_s0_off_slot(s)));
+ base = rd;
+ off = addr.offset;
+ } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME) {
+ RvNativeSlot* s = rv_slot_get(a, addr.base.frame);
+ base = RV_S0;
+ off = rv_s0_off_slot(s) + addr.offset;
+ } else if (addr.base_kind == NATIVE_ADDR_BASE_REG) {
+ base = addr.base.reg & 0x1fu;
+ off = addr.offset;
+ } else {
+ rv_panic(a, "unsupported address base in load_addr");
+ }
+ /* Fold any index via Zba sh{1,2,3}add (index << scale) + base. */
+ if (addr.index_kind == NATIVE_ADDR_INDEX_REG) {
+ u32 idx = addr.index.reg & 0x1fu;
+ if (off != 0 || base != rd)
+ rv_emit_addr_adjust(a->variant, mc, rd, base, off);
+ switch (addr.log2_scale) {
+ case 0:
+ rv64_emit32(mc, rv_add(rd, rd, idx));
+ break;
+ case 1:
+ rv64_emit32(mc, rv_sh1add(rd, idx, rd));
+ break;
+ case 2:
+ rv64_emit32(mc, rv_sh2add(rd, idx, rd));
+ break;
+ default:
+ rv64_emit32(mc, rv_sh3add(rd, idx, rd));
+ break;
+ }
+ return;
+ }
+ rv_emit_addr_adjust(a->variant, mc, rd, base, off);
+}
+
+static void rv_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
+ MemAccess mem) {
+ rv_emit_mem(rv_of(t), 1, dst, addr, mem);
+}
+static void rv_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
+ MemAccess mem) {
+ rv_emit_mem(rv_of(t), 0, src, addr, mem);
+}
+
+/* copy_bytes: resolve dst and src to dedicated pointer regs (RV_TMP3 / RV_TMP0)
+ * once, then copy granule-by-granule advancing both pointers. dst is resolved
+ * first because its base may itself live in RV_TMP1 (the transfer reg, e.g. the
+ * sret pointer from plan_ret); capturing it into RV_TMP3 before src resolution
+ * (which may clobber RV_TMP1 for far offsets) keeps it live. Advancing the
+ * pointers keeps every load/store at offset 0, so no offset ever exceeds imm12
+ * and the transfer reg never aliases a base. */
+static void rv_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
+ AggregateAccess access) {
+ RvNativeTarget* a = rv_of(t);
+ const RiscvVariant* v = a->variant;
+ MCEmitter* mc = t->mc;
+ KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+ u32 rem = access.size;
+ u32 maxg = v->ptr_bytes; /* widest granule: 8 on rv64, 4 on rv32 */
+ rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst);
+ rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0), src);
+ while (rem) {
+ u32 sz = rem >= 8u && maxg >= 8u ? 8u
+ : rem >= 4u ? 4u
+ : rem >= 2u ? 2u
+ : 1u;
+ rv64_emit32(mc, enc_int_load(v, sz, 0, RV_TMP1, RV_TMP0, 0));
+ rv64_emit32(mc, enc_int_store(v, sz, RV_TMP1, RV_TMP3, 0));
+ rv64_emit32(mc, rv_addi(RV_TMP0, RV_TMP0, (i32)sz));
+ rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, (i32)sz));
+ rem -= sz;
+ }
+}
+
+static void rv_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
+ AggregateAccess access) {
+ MCEmitter* mc = t->mc;
+ KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+ u32 bv = loc_reg(byte_value);
+ u32 rem = access.size;
+ rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst);
+ while (rem) {
+ rv64_emit32(mc, rv_sb(bv, RV_TMP3, 0));
+ rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, 1));
+ rem -= 1u;
+ }
+}
+
+/* ============================ arithmetic ============================ */
+
+static void rv_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop,
+ NativeLoc bop) {
+ const RiscvVariant* v = rv_of(t)->variant;
+ MCEmitter* mc = t->mc;
+ u32 rd = loc_reg(dst);
+ u32 ra = loc_reg(aop);
+ int sf = rv_is_64(t, dst.type);
+ /* The W-form ops (ADDW/SUBW/MULW/SLLW/...) are RV64-only and act on a 32-bit
+ * value held in a 64-bit register. They are emitted only for a narrow value
+ * on rv64; on rv32 the BASE ops ARE the 32-bit ops, so `w` is always 0 and we
+ * fall to the base ops. */
+ int w = !sf && v->has_w_forms;
+ /* Immediate shamt mask: 5-bit (&31) for a W-form / rv32 op, else shamt_bits
+ * (6-bit &63 on rv64) for the native-width op. */
+ u32 shmask = w ? 31u : ((1u << v->shamt_bits) - 1u);
+ int b_imm = bop.kind == NATIVE_LOC_IMM;
+ u32 rb = b_imm ? 0u : loc_reg(bop);
+ i64 imm = b_imm ? bop.v.imm : 0;
+
+ switch (op) {
+ case BO_FADD:
+ case BO_FSUB:
+ case BO_FMUL:
+ case BO_FDIV: {
+ u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
+ switch (op) {
+ case BO_FADD:
+ rv64_emit32(mc, rv_fadd(fmt, rd, ra, rb));
+ break;
+ case BO_FSUB:
+ rv64_emit32(mc, rv_fsub(fmt, rd, ra, rb));
+ break;
+ case BO_FMUL:
+ rv64_emit32(mc, rv_fmul(fmt, rd, ra, rb));
+ break;
+ default:
+ rv64_emit32(mc, rv_fdiv(fmt, rd, ra, rb));
+ break;
+ }
+ return;
+ }
+ case BO_IADD:
+ if (b_imm) {
+ rv64_emit32(
+ mc, w ? rv_addiw(rd, ra, (i32)imm) : rv_addi(rd, ra, (i32)imm));
+ } else {
+ rv64_emit32(mc, w ? rv_addw(rd, ra, rb) : rv_add(rd, ra, rb));
+ }
+ return;
+ case BO_ISUB:
+ if (b_imm) {
+ rv64_emit32(
+ mc, w ? rv_addiw(rd, ra, (i32)-imm) : rv_addi(rd, ra, (i32)-imm));
+ } else {
+ rv64_emit32(mc, w ? rv_subw(rd, ra, rb) : rv_sub(rd, ra, rb));
+ }
+ return;
+ case BO_IMUL:
+ rv64_emit32(mc, w ? rv_mulw(rd, ra, rb) : rv_mul(rd, ra, rb));
+ return;
+ case BO_SDIV:
+ rv64_emit32(mc, w ? rv_divw(rd, ra, rb) : rv_div(rd, ra, rb));
+ return;
+ case BO_UDIV:
+ rv64_emit32(mc, w ? rv_divuw(rd, ra, rb) : rv_divu(rd, ra, rb));
+ return;
+ case BO_SREM:
+ rv64_emit32(mc, w ? rv_remw(rd, ra, rb) : rv_rem(rd, ra, rb));
+ return;
+ case BO_UREM:
+ rv64_emit32(mc, w ? rv_remuw(rd, ra, rb) : rv_remu(rd, ra, rb));
+ return;
+ case BO_AND:
+ rv64_emit32(mc, b_imm ? rv_andi(rd, ra, (i32)imm) : rv_and(rd, ra, rb));
+ return;
+ case BO_OR:
+ rv64_emit32(mc, b_imm ? rv_ori(rd, ra, (i32)imm) : rv_or(rd, ra, rb));
+ return;
+ case BO_XOR:
+ rv64_emit32(mc, b_imm ? rv_xori(rd, ra, (i32)imm) : rv_xor(rd, ra, rb));
+ return;
+ case BO_SHL:
+ if (b_imm)
+ rv64_emit32(mc, w ? rv_slliw(rd, ra, (u32)imm & shmask)
+ : rv_slli(rd, ra, (u32)imm & shmask));
+ else
+ rv64_emit32(mc, w ? rv_sllw(rd, ra, rb) : rv_sll(rd, ra, rb));
+ return;
+ case BO_SHR_U:
+ if (b_imm)
+ rv64_emit32(mc, w ? rv_srliw(rd, ra, (u32)imm & shmask)
+ : rv_srli(rd, ra, (u32)imm & shmask));
+ else
+ rv64_emit32(mc, w ? rv_srlw(rd, ra, rb) : rv_srl(rd, ra, rb));
+ return;
+ case BO_SHR_S:
+ if (b_imm)
+ rv64_emit32(mc, w ? rv_sraiw(rd, ra, (u32)imm & shmask)
+ : rv_srai(rd, ra, (u32)imm & shmask));
+ else
+ rv64_emit32(mc, w ? rv_sraw(rd, ra, rb) : rv_sra(rd, ra, rb));
+ return;
+ default:
+ rv_panic(rv_of(t), "unsupported binop");
+ }
+}
+
+static void rv_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
+ const RiscvVariant* v = rv_of(t)->variant;
+ MCEmitter* mc = t->mc;
+ u32 rd = loc_reg(dst), rs = loc_reg(src);
+ int sf = rv_is_64(t, dst.type);
+ int w = !sf && v->has_w_forms; /* SUBW is RV64-only; base SUB on rv32 */
+ switch (op) {
+ case UO_NEG:
+ rv64_emit32(mc, w ? rv_subw(rd, RV_ZERO, rs) : rv_sub(rd, RV_ZERO, rs));
+ return;
+ case UO_FNEG: {
+ u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
+ rv64_emit32(mc, rv_fsgnjn(fmt, rd, rs, rs));
+ return;
+ }
+ case UO_BNOT:
+ rv64_emit32(mc, rv_xori(rd, rs, -1));
+ return;
+ case UO_NOT:
+ rv64_emit32(mc, rv_sltiu(rd, rs, 1));
+ return;
+ default:
+ rv_panic(rv_of(t), "unsupported unop");
+ }
+}
+
+/* Sign/zero-extend a 32-bit operand into a 64-bit register for comparison.
+ * Returns the register to compare. */
+static u32 rv_cmp_ext(NativeTarget* t, int is_signed, NativeLoc op, u32 tmp) {
+ const RiscvVariant* v = rv_of(t)->variant;
+ MCEmitter* mc = t->mc;
+ u32 r = loc_reg(op);
+ /* On rv32 a 32-bit operand already fills the whole register — there is no
+ * wider container to canonicalize into, so the extension is a no-op. */
+ if (v->xlen == 32u) return r;
+ if (rv_is_64(t, op.type)) return r;
+ if (is_signed) {
+ rv64_emit32(mc, rv_addiw(tmp, r, 0)); /* sign-extend low 32 */
+ } else {
+ rv64_emit32(mc, rv_slli(tmp, r, 32));
+ rv64_emit32(mc, rv_srli(tmp, tmp, 32));
+ }
+ return tmp;
+}
+
+static int cmp_is_signed(CmpOp op) {
+ switch (op) {
+ case CMP_LT_U:
+ case CMP_LE_U:
+ case CMP_GT_U:
+ case CMP_GE_U:
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+/* Emit a 0/1 comparison result into rd from two integer registers. */
+static void rv_emit_icmp(NativeTarget* t, CmpOp op, u32 rd, u32 ra, u32 rb) {
+ MCEmitter* mc = t->mc;
+ switch (op) {
+ case CMP_EQ:
+ rv64_emit32(mc, rv_sub(rd, ra, rb));
+ rv64_emit32(mc, rv_sltiu(rd, rd, 1));
+ return;
+ case CMP_NE:
+ rv64_emit32(mc, rv_sub(rd, ra, rb));
+ rv64_emit32(mc, rv_sltu(rd, RV_ZERO, rd));
+ return;
+ case CMP_LT_S:
+ rv64_emit32(mc, rv_slt(rd, ra, rb));
+ return;
+ case CMP_LT_U:
+ rv64_emit32(mc, rv_sltu(rd, ra, rb));
+ return;
+ case CMP_GT_S:
+ rv64_emit32(mc, rv_slt(rd, rb, ra));
+ return;
+ case CMP_GT_U:
+ rv64_emit32(mc, rv_sltu(rd, rb, ra));
+ return;
+ case CMP_GE_S:
+ rv64_emit32(mc, rv_slt(rd, ra, rb));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ case CMP_GE_U:
+ rv64_emit32(mc, rv_sltu(rd, ra, rb));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ case CMP_LE_S:
+ rv64_emit32(mc, rv_slt(rd, rb, ra));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ case CMP_LE_U:
+ rv64_emit32(mc, rv_sltu(rd, rb, ra));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ default:
+ rv_panic(rv_of(t), "unsupported integer cmp");
+ }
+}
+
+/* Format-dispatching wrappers over the ordered FP compares (feq/flt/fle are
+ * ordered: they yield 0 on NaN; flt/fle are signaling, raising NV on NaN —
+ * pre-existing for ordered ops, and the boolean result is still correct). */
+static u32 rv_feq_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
+ return fmt == RV_FMT_D ? rv_feq_d(rd, ra, rb) : rv_feq_s(rd, ra, rb);
+}
+static u32 rv_flt_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
+ return fmt == RV_FMT_D ? rv_flt_d(rd, ra, rb) : rv_flt_s(rd, ra, rb);
+}
+static u32 rv_fle_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
+ return fmt == RV_FMT_D ? rv_fle_d(rd, ra, rb) : rv_fle_s(rd, ra, rb);
+}
+
+static void rv_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc aop,
+ NativeLoc bop) {
+ MCEmitter* mc = t->mc;
+ u32 rd = loc_reg(dst);
+ /* FP-ness is self-describing from the opcode (FP block starts at CMP_OEQ_F).
+ * Unordered predicates use unordered-R == NOT(ordered-not-R): the ordered
+ * compare into rd, then `xori rd,rd,1`. ONE/UEQ have no single ordered
+ * primitive and OR the two strict relations (a<b | a>b) via scratch RV_TMP2
+ * (x7, reserved & never allocable, so it can't alias rd). */
+ if (op >= CMP_OEQ_F) {
+ u32 fmt = native_type_size(t, aop.type) == 8u ? RV_FMT_D : RV_FMT_S;
+ u32 ra = loc_reg(aop), rb = loc_reg(bop);
+ switch (op) {
+ case CMP_OEQ_F:
+ rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb));
+ return;
+ case CMP_UNE_F: /* !(OEQ) */
+ rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ case CMP_OLT_F:
+ rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
+ return;
+ case CMP_OLE_F:
+ rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb));
+ return;
+ case CMP_OGT_F:
+ rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra));
+ return;
+ case CMP_OGE_F:
+ rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra));
+ return;
+ case CMP_UGE_F: /* !(OLT) */
+ rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ case CMP_UGT_F: /* !(OLE) */
+ rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ case CMP_ULE_F: /* !(OGT) */
+ rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ case CMP_ULT_F: /* !(OGE) */
+ rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ case CMP_ONE_F: /* ordered & !=: (a<b) | (a>b) */
+ rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
+ rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra));
+ rv64_emit32(mc, rv_or(rd, rd, RV_TMP2));
+ return;
+ case CMP_UEQ_F: /* unordered | ==: !((a<b) | (a>b)) */
+ rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
+ rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra));
+ rv64_emit32(mc, rv_or(rd, rd, RV_TMP2));
+ rv64_emit32(mc, rv_xori(rd, rd, 1));
+ return;
+ default:
+ rv_panic(rv_of(t), "unsupported fp cmp");
+ }
+ }
+ {
+ int sg = cmp_is_signed(op);
+ u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0);
+ u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1);
+ rv_emit_icmp(t, op, rd, ra, rb);
+ }
+}
+
+static void rv_convert(NativeTarget* t, ConvKind op, NativeLoc dst,
+ NativeLoc src) {
+ const RiscvVariant* v = rv_of(t)->variant;
+ MCEmitter* mc = t->mc;
+ u32 rd = loc_reg(dst), rs = loc_reg(src);
+ u32 src_sz = native_type_size(t, src.type);
+ u32 dst_sz = native_type_size(t, dst.type);
+ /* `il` (int-side wide): the 64-bit-integer fcvt L-forms are RV64-only; on
+ * rv32 only the w/wu forms exist and a 64-bit int<->fp is legalized to a
+ * libcall before reaching here. */
+ int il = v->has_w_forms;
+ switch (op) {
+ case CV_SEXT:
+ if (src_sz >= 4u) {
+ /* ADDIW sign-extends bits[31:0] into a 64-bit reg (RV64). On rv32 a
+ * 4-byte value already spans the whole register, so a plain move (or
+ * nothing when rd==rs) is the sign extension. */
+ if (v->has_w_forms)
+ rv64_emit32(mc, rv_addiw(rd, rs, 0));
+ else if (rd != rs)
+ rv64_emit32(mc, rv_addi(rd, rs, 0));
+ } else {
+ u32 sh = v->xlen - src_sz * 8u;
+ rv64_emit32(mc, rv_slli(rd, rs, sh));
+ rv64_emit32(mc, rv_srai(rd, rd, sh));
+ }
+ return;
+ case CV_ZEXT: {
+ u32 sh = v->xlen - src_sz * 8u;
+ rv64_emit32(mc, rv_slli(rd, rs, sh));
+ rv64_emit32(mc, rv_srli(rd, rd, sh));
+ return;
+ }
+ case CV_TRUNC:
+ if (rd != rs || dst_sz <= 4u)
+ rv64_emit32(mc, rv_addi(rd, rs, 0)); /* low bits; users re-narrow */
+ return;
+ case CV_ITOF_S:
+ if (native_type_size(t, dst.type) == 8u)
+ rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_d_l(rd, rs)
+ : rv_fcvt_d_w(rd, rs));
+ else
+ rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_s_l(rd, rs)
+ : rv_fcvt_s_w(rd, rs));
+ return;
+ case CV_ITOF_U:
+ if (native_type_size(t, dst.type) == 8u)
+ rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_d_lu(rd, rs)
+ : rv_fcvt_d_wu(rd, rs));
+ else
+ rv64_emit32(mc, il && src_sz == 8u ? rv_fcvt_s_lu(rd, rs)
+ : rv_fcvt_s_wu(rd, rs));
+ return;
+ case CV_FTOI_S:
+ if (src_sz == 8u)
+ rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_l_d(rd, rs)
+ : rv_fcvt_w_d(rd, rs));
+ else
+ rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_l_s(rd, rs)
+ : rv_fcvt_w_s(rd, rs));
+ return;
+ case CV_FTOI_U:
+ if (src_sz == 8u)
+ rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_lu_d(rd, rs)
+ : rv_fcvt_wu_d(rd, rs));
+ else
+ rv64_emit32(mc, il && dst_sz == 8u ? rv_fcvt_lu_s(rd, rs)
+ : rv_fcvt_wu_s(rd, rs));
+ return;
+ case CV_FEXT:
+ rv64_emit32(mc, rv_fcvt_d_s(rd, rs));
+ return;
+ case CV_FTRUNC:
+ rv64_emit32(mc, rv_fcvt_s_d(rd, rs));
+ return;
+ case CV_BITCAST:
+ rv_move(t, dst, src);
+ return;
+ default:
+ rv_panic(rv_of(t), "unsupported convert");
+ }
+}
+
+/* ============================ spill / reload ============================ */
+
+static void rv_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot,
+ MemAccess mem) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = slot;
+ addr.base_type = src.type;
+ rv_emit_mem(rv_of(t), 0, src, addr, mem);
+}
+static void rv_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot,
+ MemAccess mem) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = slot;
+ addr.base_type = dst.type;
+ rv_emit_mem(rv_of(t), 1, dst, addr, mem);
+}
+
+/* ============================ control flow ============================ */
+
+static MCLabel rv_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); }
+static void rv_label_place(NativeTarget* t, MCLabel l) {
+ t->mc->label_place(t->mc, l);
+}
+static void rv_jump(NativeTarget* t, MCLabel l) {
+ rv64_emit32(t->mc, rv_jal(RV_ZERO, 0));
+ t->mc->emit_label_ref(t->mc, l, R_RV_JAL, 4, 0);
+}
+
+static void rv_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc aop,
+ NativeLoc bop, MCLabel l) {
+ MCEmitter* mc = t->mc;
+ /* RISC-V B-type branches reach only ±4 KiB, which a single (especially
+ * -O0) function can exceed between a branch and its target. Rather than a
+ * lone conditional branch to the label, emit a short *inverted* branch
+ * that skips an unconditional `jal` (±1 MiB) to the target. The inverted
+ * branch's displacement is the constant SKIP_JAL (skip just the jal) and
+ * so is always in range; the jal carries the long reach. See rv_jump. */
+ enum { SKIP_JAL = 8 }; /* branch over the 4-byte jal that follows it */
+ /* FP compares have no register-register branch form: materialize the 0/1
+ * into TMP0 via rv_cmp (handles all 12 predicates), then branch on nonzero.
+ */
+ if (op >= CMP_OEQ_F) {
+ NativeLoc tmp =
+ native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0);
+ rv_cmp(t, op, tmp, aop, bop);
+ /* Skip the jal when the result is 0 (condition false). */
+ rv64_emit32(mc, rv_beq(RV_TMP0, RV_ZERO, SKIP_JAL));
+ rv_jump(t, l);
+ return;
+ }
+ {
+ int sg = cmp_is_signed(op);
+ u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0);
+ u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1);
+ u32 word;
+ /* Encode the *inverse* of `op`, skipping the jal when NOT taken. */
+ switch (op) {
+ case CMP_EQ:
+ word = rv_bne(ra, rb, SKIP_JAL);
+ break;
+ case CMP_NE:
+ word = rv_beq(ra, rb, SKIP_JAL);
+ break;
+ case CMP_LT_S:
+ word = rv_bge(ra, rb, SKIP_JAL);
+ break;
+ case CMP_GE_S:
+ word = rv_blt(ra, rb, SKIP_JAL);
+ break;
+ case CMP_LT_U:
+ word = rv_bgeu(ra, rb, SKIP_JAL);
+ break;
+ case CMP_GE_U:
+ word = rv_bltu(ra, rb, SKIP_JAL);
+ break;
+ case CMP_GT_S:
+ word = rv_bge(rb, ra, SKIP_JAL);
+ break;
+ case CMP_LE_S:
+ word = rv_blt(rb, ra, SKIP_JAL);
+ break;
+ case CMP_GT_U:
+ word = rv_bgeu(rb, ra, SKIP_JAL);
+ break;
+ case CMP_LE_U:
+ word = rv_bltu(rb, ra, SKIP_JAL);
+ break;
+ default:
+ rv_panic(rv_of(t), "unsupported cmp_branch");
+ }
+ rv64_emit32(mc, word);
+ rv_jump(t, l);
+ }
+}
+
+static void rv_indirect_branch(NativeTarget* t, NativeLoc addr,
+ const MCLabel* valid_targets, u32 ntargets) {
+ (void)valid_targets;
+ (void)ntargets;
+ rv64_emit32(t->mc, rv_jalr(RV_ZERO, loc_reg(addr), 0));
+}
+
+static void rv_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) {
+ /* `&&label` address-take: auipc/addi with a %pcrel_hi/%pcrel_lo relocation
+ * pair against the label's per-block local symbol — the same form
+ * rv_emit_global_addr uses for a global — so a compressing/re-encoding
+ * assembler recomputes the displacement (a baked offset would break under
+ * the C extension). */
+ MCEmitter* mc = t->mc;
+ u32 rd = loc_reg(dst);
+ u32 sec = mc->section_id;
+ ObjSymId sym = mc_label_symbol(mc, l);
+ u32 ap = mc->pos(mc);
+ rv64_emit32(mc, rv_auipc(rd, 0));
+ mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0);
+ {
+ Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
+ ObjSymId anchor = obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
+ u32 lp = mc->pos(mc);
+ rv64_emit32(mc, rv_addi(rd, rd, 0));
+ mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
+ }
+}
+
+/* ============================ frame / lifecycle ============================
+ */
+
+static NativeFrameSlot rv_frame_slot(NativeTarget* t,
+ const NativeFrameSlotDesc* d) {
+ return native_frame_slot_alloc(&rv_of(t)->frame, d);
+}
+
+static int rv_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
+ CGDebugLoc* out) {
+ RvNativeTarget* a = rv_of(t);
+ RvNativeSlot* s;
+ if (!out) return 0;
+ memset(out, 0, sizeof *out);
+ if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0;
+ s = rv_slot_get(a, slot);
+ out->kind = CG_DEBUG_LOC_FRAME;
+ /* rv64 slots are addressed s0/fp-relative (rv_s0_off_slot); the hosted dbg
+ * snapshot seeds the frame base with s0, matching aa64's FP-relative
+ * convention. */
+ out->v.frame_ofs = rv_s0_off_slot(s);
+ return 1;
+}
+
+static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
+ a->func = fd;
+ a->loc = fd->loc;
+ /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
+ * callee-save set, and known_frame/has_alloca/frame_final. */
+ native_frame_reset(&a->frame);
+ a->incoming_stack_size = 0;
+ a->next_param_int = 0;
+ a->next_param_fp = 0;
+ a->next_param_stack = 0;
+ a->has_sret = (abi && abi->has_sret) ? 1u : 0u;
+ a->is_variadic = (abi && abi->variadic) ? 1u : 0u;
+ a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
+ a->npatches = 0;
+ a->nalloca = 0;
+ a->minimal_prologue_words = 0;
+ a->slim_prologue = 0;
+
+ mc->set_section(mc, fd->text_section_id);
+ mc->emit_align(mc, 4, 0);
+ a->func_start = mc->pos(mc);
+ mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
+ if (mc->cfi_startproc) mc->cfi_startproc(mc);
+ a->epilogue_label = mc->label_new(mc);
+}
+
+/* sret: reserve a hidden slot for the incoming destination pointer (a0). */
+static void rv_reserve_entry_saves(RvNativeTarget* a) {
+ NativeTarget* t = &a->base;
+ if (a->has_sret) {
+ NativeFrameSlotDesc sd;
+ u32 ptr = a->variant->ptr_bytes;
+ memset(&sd, 0, sizeof sd);
+ sd.type = builtin_id(KIT_CG_BUILTIN_I64);
+ sd.size = ptr; /* a pointer slot: 8 on rv64, 4 on rv32 */
+ sd.align = ptr;
+ sd.kind = NATIVE_FRAME_SLOT_SAVE;
+ a->sret_ptr_slot = t->frame_slot(t, &sd);
+ a->next_param_int = 1; /* a0 consumed by the sret pointer */
+ }
+}
+
+static void rv_emit_entry_save_stores(RvNativeTarget* a) {
+ NativeTarget* t = &a->base;
+ if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
+ KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+ u32 ptr = a->variant->ptr_bytes;
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = a->sret_ptr_slot;
+ addr.base_type = i64t;
+ rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_A0), addr,
+ native_mem_for_type(t, i64t, ptr));
+ }
+}
+
+/* Collect the callee-saves the body used (none at -O0). */
+static u32 rv_collect_int_saves(RvNativeTarget* a, u32* regs) {
+ u32 n = 0, i;
+ for (i = 0; i < a->frame.ncallee_saves; ++i)
+ if (a->frame.callee_saves[i].cls == NATIVE_REG_INT)
+ regs[n++] = a->frame.callee_saves[i].reg;
+ return n;
+}
+static u32 rv_collect_fp_saves(RvNativeTarget* a, u32* regs) {
+ u32 n = 0, i;
+ for (i = 0; i < a->frame.ncallee_saves; ++i)
+ if (a->frame.callee_saves[i].cls == NATIVE_REG_FP)
+ regs[n++] = a->frame.callee_saves[i].reg;
+ return n;
+}
+
+/* s0-relative offset of a saved register, below the locals. The flat index runs
+ * 0..n_int-1 over integer saves (each ptr_bytes wide) then n_int..n_int+n_fp-1
+ * over fp saves (each 8 bytes wide, fsd). On rv64 ptr_bytes==8 so this reduces
+ * to the historical uniform -cum_off-8-8*idx layout, byte-for-byte. */
+static i32 rv_save_off(RvNativeTarget* a, u32 n_int, u32 idx) {
+ i32 base = -(i32)(a->frame.cum_off);
+ u32 ptr = a->variant->ptr_bytes;
+ if (idx < n_int) return base - (i32)ptr * (i32)(idx + 1u);
+ return base - (i32)(ptr * n_int) - 8 * (i32)(idx - n_int + 1u);
+}
+
+static void rv_load_s0(const RiscvVariant* v, MCEmitter* mc, int fp, u32 reg,
+ i32 off) {
+ if (fits_i12(off)) {
+ rv64_emit32(mc, fp ? rv_fld(reg, RV_S0, off)
+ : rv_ld_ptr(v, reg, RV_S0, off));
+ return;
+ }
+ rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)off);
+ rv64_emit32(mc, rv_add(RV_TMP0, RV_S0, RV_TMP0));
+ rv64_emit32(mc, fp ? rv_fld(reg, RV_TMP0, 0) : rv_ld_ptr(v, reg, RV_TMP0, 0));
+}
+
+/* Build the prologue instruction sequence into words[]. Returns count. */
+static u32 rv_build_prologue(RvNativeTarget* a, u32* words, u32 cap,
+ u32 frame_size, u32 fp_pair_off,
+ const u32* int_regs, u32 n_int, const u32* fp_regs,
+ u32 n_fp) {
+ const RiscvVariant* v = a->variant;
+ u32 ptr = v->ptr_bytes; /* saved-pair / int-save stride */
+ u32 gp_slot = v->gp_slot_bytes; /* vararg GP-slot stride */
+ u32 fsz = v->frame_save_size; /* saved ra+s0 pair base offset */
+ u32 wi = 0;
+ /* lui+ADD{I,IW} materializes a 32-bit constant in TMP0; ADDIW is RV64-only so
+ * use plain ADDI on rv32 (the value already fits 32 bits). */
+#define ADDI_LO(rd, lo) (v->has_w_forms ? rv_addiw((rd), (rd), (lo)) : rv_addi((rd), (rd), (lo)))
+#define PUSH(w) \
+ do { \
+ if (wi >= cap) rv_panic(a, "prologue placeholder overflow"); \
+ words[wi++] = (w); \
+ } while (0)
+ /* sp -= frame_size */
+ if (fits_i12(-(i32)frame_size)) {
+ PUSH(rv_addi(RV_SP, RV_SP, -(i32)frame_size));
+ } else {
+ i32 neg = -(i32)frame_size;
+ i32 hi = (i32)(((i64)neg + 0x800) >> 12);
+ i32 lo = neg - (i32)((u32)hi << 12);
+ PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
+ if (lo) PUSH(ADDI_LO(RV_TMP0, lo));
+ PUSH(rv_add(RV_SP, RV_SP, RV_TMP0));
+ }
+ /* save s0/ra at [sp + fp_pair_off], set s0 = sp + fp_pair_off. The saved-pair
+ * internal stride is ptr_bytes (s0 at +0, ra at +ptr). */
+ if (fits_i12((i32)fp_pair_off + (i32)ptr)) {
+ PUSH(rv_sd_ptr(v, RV_S0, RV_SP, (i32)fp_pair_off));
+ PUSH(rv_sd_ptr(v, RV_RA, RV_SP, (i32)fp_pair_off + (i32)ptr));
+ PUSH(rv_addi(RV_S0, RV_SP, (i32)fp_pair_off));
+ } else {
+ i32 off = (i32)fp_pair_off;
+ i32 hi = (i32)(((i64)off + 0x800) >> 12);
+ i32 lo = off - (i32)((u32)hi << 12);
+ PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
+ if (lo) PUSH(ADDI_LO(RV_TMP0, lo));
+ PUSH(rv_add(RV_TMP0, RV_SP, RV_TMP0));
+ PUSH(rv_sd_ptr(v, RV_S0, RV_TMP0, 0));
+ PUSH(rv_sd_ptr(v, RV_RA, RV_TMP0, (i32)ptr));
+ PUSH(rv_addi(RV_S0, RV_TMP0, 0));
+ }
+ /* sret a0 spill */
+ if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
+ RvNativeSlot* s = rv_slot_get(a, a->sret_ptr_slot);
+ PUSH(rv_sd_ptr(v, RV_A0, RV_S0, rv_s0_off_slot(s)));
+ }
+ /* variadic GP save area: spill unconsumed a-regs at
+ * [s0 + frame_save_size + i*gp_slot_bytes] */
+ if (a->is_variadic) {
+ u32 i;
+ for (i = a->next_param_int; i < 8u; ++i)
+ PUSH(rv_sd_ptr(v, RV_A0 + i, RV_S0, (i32)fsz + (i32)i * (i32)gp_slot));
+ }
+ /* callee saves: integer with the pointer-width store (sw/sd), fp with fsd. */
+ {
+ u32 i;
+ for (i = 0; i < n_int; ++i) {
+ i32 off = rv_save_off(a, n_int, i);
+ if (fits_i12(off)) {
+ PUSH(rv_sd_ptr(v, int_regs[i], RV_S0, off));
+ } else {
+ /* rare; emitted directly is fine in the known-frame path, but the
+ * single-pass placeholder must hold these too. Use the wide form. */
+ i32 hi = (i32)(((i64)off + 0x800) >> 12);
+ i32 lo = off - (i32)((u32)hi << 12);
+ PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
+ if (lo) PUSH(ADDI_LO(RV_TMP0, lo));
+ PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0));
+ PUSH(rv_sd_ptr(v, int_regs[i], RV_TMP0, 0));
+ }
+ }
+ for (i = 0; i < n_fp; ++i) {
+ i32 off = rv_save_off(a, n_int, n_int + i);
+ if (fits_i12(off)) {
+ PUSH(rv_fsd(fp_regs[i], RV_S0, off));
+ } else {
+ i32 hi = (i32)(((i64)off + 0x800) >> 12);
+ i32 lo = off - (i32)((u32)hi << 12);
+ PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
+ if (lo) PUSH(ADDI_LO(RV_TMP0, lo));
+ PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0));
+ PUSH(rv_fsd(fp_regs[i], RV_TMP0, 0));
+ }
+ }
+ }
+#undef PUSH
+#undef ADDI_LO
+ return wi;
+}
+
+static void rv_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ u32 i;
+ rv_func_begin_common(t, fd);
+ a->prologue_pos = mc->pos(mc);
+ for (i = 0; i < RV_PROLOGUE_WORDS; ++i) rv64_emit32(mc, RV_NOP);
+ rv_reserve_entry_saves(a);
+ rv_emit_entry_save_stores(a);
+}
+
+static void rv_func_end(NativeTarget* t) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ ObjBuilder* obj = t->obj;
+ ObjSecId sec = a->func->text_section_id;
+ u32 int_regs[16], fp_regs[16];
+ u32 n_int = rv_collect_int_saves(a, int_regs);
+ u32 n_fp = rv_collect_fp_saves(a, fp_regs);
+ u32 frame_size = rv_frame_size(a);
+ u32 fp_pair_off = rv_fp_pair_off(a, frame_size);
+ u32 end;
+ i32 i;
+ a->frame_size_final = frame_size;
+ a->fp_pair_off = fp_pair_off;
+
+ /* epilogue */
+ mc->label_place(mc, a->epilogue_label);
+ if (a->slim_prologue) {
+ /* Frameless leaf: no callee-saves, no s0/ra to reload, sp untouched. */
+ rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
+ } else {
+ const RiscvVariant* v = a->variant;
+ for (i = (i32)n_int - 1; i >= 0; --i)
+ rv_load_s0(v, mc, 0, int_regs[i], rv_save_off(a, n_int, (u32)i));
+ for (i = (i32)n_fp - 1; i >= 0; --i)
+ rv_load_s0(v, mc, 1, fp_regs[i], rv_save_off(a, n_int, n_int + (u32)i));
+ if (a->frame.has_alloca)
+ rv_emit_addr_adjust(v, mc, RV_SP, RV_S0, -(i32)fp_pair_off);
+ /* Reload ra/s0 from the saved pair (s0 at +0, ra at +ptr_bytes), pointer
+ * width. */
+ rv64_emit32(mc, rv_ld_ptr(v, RV_RA, RV_S0, (i32)v->ptr_bytes));
+ rv64_emit32(mc, rv_ld_ptr(v, RV_S0, RV_S0, 0));
+ /* sp += frame_size */
+ if (fits_i12((i32)frame_size)) {
+ rv64_emit32(mc, rv_addi(RV_SP, RV_SP, (i32)frame_size));
+ } else {
+ rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)frame_size);
+ rv64_emit32(mc, rv_add(RV_SP, RV_SP, RV_TMP0));
+ }
+ rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
+ }
+
+ /* patch prologue */
+ if (!a->frame.known_frame) {
+ u32 words[RV_PROLOGUE_WORDS];
+ u32 nwords, k;
+ for (k = 0; k < RV_PROLOGUE_WORDS; ++k) words[k] = RV_NOP;
+ nwords = rv_build_prologue(a, words, RV_PROLOGUE_WORDS, frame_size,
+ fp_pair_off, int_regs, n_int, fp_regs, n_fp);
+ (void)nwords;
+ for (k = 0; k < RV_PROLOGUE_WORDS; ++k)
+ rv_patch32(obj, sec, a->prologue_pos + k * 4u, words[k]);
+ }
+ /* patch alloca sites: addi dst, sp, max_outgoing */
+ {
+ u32 mo = align_up_u32(a->frame.max_outgoing, 16u);
+ u32 k;
+ if (mo > 2047u) rv_panic(a, "max_outgoing too large for alloca patch");
+ for (k = 0; k < a->npatches; ++k)
+ rv_patch32(obj, sec, a->patches[k].pos,
+ rv_addi(a->patches[k].dst_reg, RV_SP, (i32)mo));
+ }
+
+ /* CFI: CFA = s0 + (frame_size - fp_pair_off) */
+ if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
+ if (a->slim_prologue) {
+ /* Frameless leaf: CFA = sp (unchanged from entry) and the return address
+ * stays live in ra (the CIE default), so no saved-register rules. The
+ * state holds from the first instruction (offset 0). */
+ mc->cfi_set_next_pc_offset(mc, 0);
+ mc->cfi_def_cfa(mc, RV_SP, 0);
+ } else {
+ i32 cfa = (i32)frame_size - (i32)fp_pair_off;
+ u32 post = a->prologue_pos + (a->frame.known_frame
+ ? a->minimal_prologue_words * 4u
+ : RV_PROLOGUE_WORDS * 4u);
+ u32 k;
+ mc->cfi_set_next_pc_offset(mc, post - a->func_start);
+ mc->cfi_def_cfa(mc, RV_S0, cfa);
+ mc->cfi_offset(mc, RV_S0, -cfa);
+ /* ra is saved at the saved-pair stride above s0 (ptr_bytes). */
+ mc->cfi_offset(mc, RV_RA, -cfa + (i32)a->variant->ptr_bytes);
+ for (k = 0; k < n_int; ++k)
+ mc->cfi_offset(mc, int_regs[k], rv_save_off(a, n_int, k) - cfa);
+ for (k = 0; k < n_fp; ++k)
+ mc->cfi_offset(mc, 32u + fp_regs[k],
+ rv_save_off(a, n_int, n_int + k) - cfa);
+ }
+ }
+
+ end = mc->pos(mc);
+ obj_symbol_define(obj, a->func->sym, sec, (u64)a->func_start,
+ (u64)(end - a->func_start));
+ if (a->func->atomize)
+ obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->func->sym,
+ 0);
+ if (mc->debug) debug_func_pc_range(mc->debug, sec, a->func_start, end);
+ if (mc->cfi_endproc) mc->cfi_endproc(mc);
+ mc_end_function(mc);
+ a->func = NULL;
+}
+
+/* rv64 homes its callee-saves below the locals at rv_save_off(idx) rather than
+ * in frame slots, so alloc_slots=0: native_frame just records the {reg,cls} set
+ * derived from the optimizer's per-class used-masks. */
+static void rv_reserve_callee_saves(NativeTarget* t, const u32* used,
+ u32 nclasses) {
+ native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0);
+}
+
+static int rv_reg_is_callee_int(Reg r);
+static int rv_reg_is_callee_fp(Reg r);
+static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+ u32 nclob, u32* int_mask, u32* fp_mask);
+
+/* Expand the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into
+ * this target's per-class caller/callee-saved register masks. */
+/* abi_clobber_masks is shared as native_asm_abi_clobber_masks
+ * (cg/native_asm.h); it reads the masks from t->regs->classes. */
+
+/* Build the callee-saved set the prologue must preserve: the allocator-assigned
+ * callee-saved registers (frame->callee_saved_used) plus any an inline-asm
+ * block clobbers. The latter are opaque to the optimizer's operand scan, so it
+ * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
+ * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks
+ * and keep only the callee-saved ones — rv_reg_is_callee_int excludes s0 (the
+ * frame pointer, preserved by the prologue head, not as an ordinary
+ * callee-save). This is the same register selection the per-block spill used,
+ * hoisted into the prologue. Writes up to `cap` per-class masks into `out` and
+ * returns the class count to reserve. */
+static u32 rv_known_callee_saves(NativeTarget* t,
+ const NativeKnownFrameDesc* frame, u32* out,
+ u32 cap) {
+ u32 ncls = frame->ncallee_classes;
+ u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
+ if (ncls > cap) ncls = cap;
+ for (u32 c = 0; c < ncls; ++c)
+ out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
+ if (frame->asm_clobbers && frame->nasm_clobbers) {
+ RvNativeTarget* a = rv_of(t);
+ SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+ rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
+ &clob_int, &clob_fp);
+ }
+ native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int,
+ &abi_fp);
+ clob_int |= abi_int;
+ clob_fp |= abi_fp;
+ for (Reg r = 0; r < 32u; ++r) {
+ if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
+ rv_reg_is_callee_int(r))
+ out[NATIVE_REG_INT] |= 1u << r;
+ if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r))
+ out[NATIVE_REG_FP] |= 1u << r;
+ }
+ return ncls;
+}
+
+static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
+ int* variadic, u32* nparams);
+
+/* Optimizer entry point: the full frame is supplied up front, so the prologue
+ * is emitted final the moment it is built — no NOP region, no func_end patch
+ * (rv_func_end skips patching when known_frame). rv_build_prologue emits the
+ * sret spill and the variadic register-save stores inline, so there is no
+ * separate entry-save emission. Slot creation order matches the single-pass
+ * path: callee-saves first (only recorded for rv64), then static slots, then
+ * the sret entry-save slot. */
+static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
+ const NativeKnownFrameDesc* frame,
+ NativeFrameSlot* out_slots) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
+ u32 n_int, n_fp, frame_size, fp_pair_off, nwords, i;
+ u32 words[RV_KNOWN_PROLOGUE_WORDS];
+ rv_func_begin_common(t, fd);
+ a->frame.known_frame = 1;
+ if (frame) {
+ u32 cs[NATIVE_CALL_PLAN_CLASSES];
+ u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
+ a->frame.has_alloca = frame->has_alloca;
+ if (ncs) rv_reserve_callee_saves(t, cs, ncs);
+ for (i = 0; i < frame->nslots; ++i) {
+ NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]);
+ if (out_slots) out_slots[i] = slot;
+ }
+ rv_reserve_entry_saves(a);
+ native_frame_note_outgoing(&a->frame, frame->max_outgoing);
+ }
+ /* Frame is final: size and offsets are settled, so emit the exact prologue.
+ */
+ frame_size = rv_frame_size(a);
+ fp_pair_off = rv_fp_pair_off(a, frame_size);
+ a->frame_size_final = frame_size;
+ a->fp_pair_off = fp_pair_off;
+ a->prologue_pos = mc->pos(mc);
+ /* Leaf no-frame tier (aa64 slim_prologue equivalent): a leaf with no
+ * callee-saves, no body slots, no outgoing args, no sret/variadic and
+ * register-only params never reads s0 (no frame slots / stack args) nor
+ * clobbers ra (no calls). Emit no prologue at all; rv_func_end emits a bare
+ * `ret`. cum_off==0 already implies no sret slot and no param spills, but the
+ * extra guards keep the intent explicit. Inline asm is excluded: it can
+ * clobber ra opaquely, and without the saved record the bare `ret` would
+ * return through the destroyed link register. */
+ a->slim_prologue = frame && frame->is_leaf && !frame->has_asm &&
+ a->frame.ncallee_saves == 0 && !a->frame.has_alloca &&
+ a->frame.cum_off == 0 && a->frame.max_outgoing == 0 &&
+ !a->has_sret && !a->is_variadic &&
+ rv_signature_stack_bytes(t, fd->fn_type, NULL, NULL) == 0;
+ if (a->slim_prologue) {
+ a->minimal_prologue_words = 0;
+ native_frame_set_final(&a->frame);
+ return;
+ }
+ n_int = rv_collect_int_saves(a, int_regs);
+ n_fp = rv_collect_fp_saves(a, fp_regs);
+ nwords = rv_build_prologue(a, words, RV_KNOWN_PROLOGUE_WORDS, frame_size,
+ fp_pair_off, int_regs, n_int, fp_regs, n_fp);
+ for (i = 0; i < nwords; ++i) rv64_emit32(mc, words[i]);
+ a->minimal_prologue_words = nwords;
+ native_frame_set_final(&a->frame);
+}
+
+/* ============================ params / ABI helpers
+ * ============================ */
+
+static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
+ const NativeCallDesc* desc, u32 i,
+ ABIArgInfo* scratch) {
+ /* Synthesized for unnamed (variadic) args, or untyped calls. RISC-V LP64D
+ * passes variadic FP args in INTEGER registers (as their bit pattern), not
+ * the FP pool — so a variadic float part is ABI_CLASS_INT. */
+ int variadic = abi && i >= abi->nparams;
+ if (abi && i < abi->nparams) return &abi->params[i];
+ memset(scratch, 0, sizeof *scratch);
+ scratch->kind = ABI_ARG_DIRECT;
+ scratch->nparts = 1;
+ scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
+ ((ABIArgPart*)scratch->parts)[0].cls =
+ (!variadic && cg_type_is_float(t->c, desc->args[i].type)) ? ABI_CLASS_FP
+ : ABI_CLASS_INT;
+ ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
+ ((ABIArgPart*)scratch->parts)[0].size =
+ native_type_size(t, desc->args[i].type);
+ ((ABIArgPart*)scratch->parts)[0].align =
+ native_type_align(t, desc->args[i].type);
+ return scratch;
+}
+
+/* Outgoing stack-slot size/align: the xlen-word (gp_slot_bytes: 8 lp64d /
+ * 4 ilp32) is the natural slot stride; stack ABI alignment caps at 16. */
+static u32 rv_part_stack_size(const RiscvVariant* v, const ABIArgPart* part) {
+ u32 slot = v->gp_slot_bytes;
+ return align_up_u32(part->size ? part->size : slot, slot);
+}
+static u32 rv_part_stack_align(const RiscvVariant* v, const ABIArgPart* part) {
+ u32 slot = v->gp_slot_bytes;
+ u32 al = part->align ? part->align : slot;
+ if (al < slot) al = slot;
+ if (al > 16u) al = 16u;
+ return al;
+}
+
+static KitCgTypeId rv_part_scalar_type(const ABIArgPart* part) {
+ if (part->cls == ABI_CLASS_FP) {
+ if (part->size <= 4u) return builtin_id(KIT_CG_BUILTIN_F32);
+ return builtin_id(KIT_CG_BUILTIN_F64);
+ }
+ switch (part->size) {
+ case 1u:
+ return builtin_id(KIT_CG_BUILTIN_I8);
+ case 2u:
+ return builtin_id(KIT_CG_BUILTIN_I16);
+ case 4u:
+ return builtin_id(KIT_CG_BUILTIN_I32);
+ default:
+ return builtin_id(KIT_CG_BUILTIN_I64);
+ }
+}
+
+static u32 rv_class_stack_size(const RiscvVariant* v, const ABIArgInfo* ai) {
+ u32 slot = v->gp_slot_bytes;
+ u32 total = 0, p;
+ if (!ai || ai->kind == ABI_ARG_IGNORE) return 0;
+ if (ai->kind == ABI_ARG_INDIRECT) return v->ptr_bytes;
+ for (p = 0; p < ai->nparts; ++p) {
+ total = align_up_u32(total, rv_part_stack_align(v, &ai->parts[p]));
+ total += rv_part_stack_size(v, &ai->parts[p]);
+ }
+ return align_up_u32(total ? total : slot, slot);
+}
+
+static u32 rv_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) {
+ const RiscvVariant* v = rv_of(t)->variant;
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
+ /* sret consumes a0 as the implicit first integer argument. */
+ u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
+ u32 next_fp = 0, stack = 0, i, p;
+ for (i = 0; i < desc->nargs; ++i) {
+ ABIArgInfo tmp;
+ const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp);
+ int force_stack =
+ abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
+ if (ai->kind == ABI_ARG_IGNORE) continue;
+ if (force_stack) {
+ stack += rv_class_stack_size(v, ai);
+ continue;
+ }
+ if (ai->kind == ABI_ARG_INDIRECT) {
+ if (next_int < 8u)
+ next_int++;
+ else
+ stack += v->ptr_bytes;
+ continue;
+ }
+ for (p = 0; p < ai->nparts; ++p) {
+ const ABIArgPart* part = &ai->parts[p];
+ if (part->cls == ABI_CLASS_FP) {
+ if (next_fp < 8u)
+ next_fp++;
+ else {
+ stack = align_up_u32(stack, rv_part_stack_align(v, part));
+ stack += rv_part_stack_size(v, part);
+ }
+ } else {
+ if (next_int < 8u)
+ next_int++;
+ else {
+ stack = align_up_u32(stack, rv_part_stack_align(v, part));
+ stack += rv_part_stack_size(v, part);
+ }
+ }
+ }
+ }
+ return align_up_u32(stack, 16u);
+}
+
+static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
+ int* variadic, u32* nparams) {
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type);
+ NativeCallDesc d;
+ if (variadic) *variadic = abi ? (int)abi->variadic : 0;
+ if (nparams) *nparams = abi ? abi->nparams : 0u;
+ memset(&d, 0, sizeof d);
+ d.fn_type = fn_type;
+ d.nargs = abi ? abi->nparams : 0u;
+ if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs);
+ return rv_call_stack_size(t, &d);
+}
+
+static u32 rv_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) {
+ return rv_call_stack_size(t, desc);
+}
+
+/* Resolve a NativeLoc to an addressable NativeAddr (frame/stack/addr). */
+static NativeAddr rv_loc_addr(RvNativeTarget* a, NativeLoc loc, u32 offset) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ switch ((NativeLocKind)loc.kind) {
+ case NATIVE_LOC_FRAME:
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = loc.v.frame;
+ addr.base_type = loc.type;
+ addr.offset = (i32)offset;
+ return addr;
+ case NATIVE_LOC_STACK:
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = loc.v.stack.slot;
+ addr.base_type = loc.type;
+ addr.offset = loc.v.stack.offset + (i32)offset;
+ return addr;
+ case NATIVE_LOC_ADDR:
+ addr = loc.v.addr;
+ addr.offset += (i32)offset;
+ return addr;
+ default:
+ rv_panic(a, "location is not addressable");
+ }
+ return addr;
+}
+
+static void rv_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
+ u32 offset, u32 size) {
+ RvNativeTarget* a = rv_of(t);
+ if (src.kind == NATIVE_LOC_REG) {
+ rv_move(t, dst, src);
+ return;
+ }
+ if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK ||
+ src.kind == NATIVE_LOC_ADDR) {
+ NativeAddr addr = rv_loc_addr(a, src, offset);
+ addr.base_type = dst.type;
+ rv_emit_mem(a, 1, dst, addr, native_mem_for_type(t, dst.type, size));
+ return;
+ }
+ if (src.kind == NATIVE_LOC_IMM) {
+ rv_emit_load_imm(a->variant, t->mc, rv_is_64(t, dst.type) ? 1u : 0u,
+ loc_reg(dst), src.v.imm);
+ return;
+ }
+ rv_panic(a, "unsupported part source");
+}
+
+static void rv_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
+ u32 offset, u32 size) {
+ RvNativeTarget* a = rv_of(t);
+ if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK ||
+ dst.kind == NATIVE_LOC_ADDR) {
+ NativeAddr addr = rv_loc_addr(a, dst, offset);
+ addr.base_type = src.type;
+ rv_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size));
+ return;
+ }
+ if (dst.kind == NATIVE_LOC_REG) {
+ rv_move(t, dst, src);
+ return;
+ }
+ rv_panic(a, "unsupported part destination");
+}
+
+static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
+ NativeAddr addr = rv_loc_addr(rv_of(t), src, 0);
+ rv_load_addr(t, dst, addr);
+}
+
+static void rv_store_outgoing_part(NativeTarget* t, int tail_call,
+ u32 stack_off, NativeLoc src, u32 size) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.base_type = src.type;
+ if (tail_call) {
+ /* A sibling call reuses the caller's frame: its outgoing stack args land in
+ * the caller's incoming-arg window ([s0 + 16 + va_save + off]) — physically
+ * the same address the tail-callee will read at [sp+off] once the teardown
+ * has restored sp to the caller's entry sp (the CFA). */
+ addr.base.reg = RV_S0;
+ addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off);
+ } else {
+ addr.base.reg = RV_SP;
+ addr.offset = (i32)stack_off;
+ }
+ rv_emit_mem(rv_of(t), 0, src, addr, native_mem_for_type(t, src.type, size));
+}
+
+/* NativeTarget bind_param: route incoming param (ABI loc) into dst. */
+static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p,
+ NativeLoc dst) {
+ RvNativeTarget* a = rv_of(t);
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
+ const ABIArgInfo* ai =
+ p->index < abi->nparams ? &abi->params[p->index] : NULL;
+ int to_reg = dst.kind == NATIVE_LOC_REG;
+ u32 i;
+ if (!ai || ai->kind == ABI_ARG_IGNORE) return;
+ if (ai->kind == ABI_ARG_INDIRECT) {
+ NativeLoc src = native_loc_reg(
+ builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
+ a->next_param_int < 8u ? RV_A0 + a->next_param_int : RV_TMP0);
+ NativeAddr d_addr, from;
+ AggregateAccess access;
+ if (a->next_param_int < 8u) {
+ a->next_param_int++;
+ } else {
+ NativeAddr sa;
+ memset(&sa, 0, sizeof sa);
+ sa.base_kind = NATIVE_ADDR_BASE_REG;
+ sa.base.reg = RV_S0;
+ sa.offset = rv_s0_off_in_arg(a, a->next_param_stack);
+ sa.base_type = src.type;
+ rv_emit_mem(a, 1, src, sa,
+ native_mem_for_type(t, src.type, a->variant->ptr_bytes));
+ a->next_param_stack += a->variant->ptr_bytes;
+ }
+ if (dst.kind != NATIVE_LOC_FRAME)
+ rv_panic(a, "indirect parameter requires a frame destination");
+ memset(&d_addr, 0, sizeof d_addr);
+ d_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ d_addr.base.frame = dst.v.frame;
+ d_addr.base_type = p->type;
+ memset(&from, 0, sizeof from);
+ from.base_kind = NATIVE_ADDR_BASE_REG;
+ from.base.reg = loc_reg(src);
+ from.base_type = p->type;
+ memset(&access, 0, sizeof access);
+ access.type = p->type;
+ access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type);
+ access.align = p->align ? p->align : native_type_align(t, p->type);
+ rv_copy_bytes(t, d_addr, from, access);
+ return;
+ }
+ for (i = 0; i < ai->nparts; ++i) {
+ const ABIArgPart* part = &ai->parts[i];
+ NativeAllocClass cls =
+ part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+ NativeLoc src;
+ if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) {
+ src = native_loc_reg(p->type, cls, RV_FA0 + a->next_param_fp++);
+ } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) {
+ src = native_loc_reg(p->type, cls, RV_A0 + a->next_param_int++);
+ } else {
+ Reg tmp = (cls == NATIVE_REG_FP) ? RV_FTMP0 : RV_TMP0;
+ NativeAddr sa;
+ src = native_loc_reg(p->type, cls, tmp);
+ a->next_param_stack = align_up_u32(
+ a->next_param_stack, rv_part_stack_align(a->variant, part));
+ memset(&sa, 0, sizeof sa);
+ sa.base_kind = NATIVE_ADDR_BASE_REG;
+ sa.base.reg = RV_S0;
+ sa.base_type = p->type;
+ sa.offset = rv_s0_off_in_arg(a, a->next_param_stack);
+ rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, p->type, part->size));
+ a->next_param_stack += rv_part_stack_size(a->variant, part);
+ }
+ if (dst.kind == NATIVE_LOC_NONE) {
+ /* unused parameter; cursors already advanced */
+ } else if (to_reg) {
+ NativeLoc d = native_loc_reg(dst.type ? dst.type : p->type,
+ (NativeAllocClass)dst.cls, (Reg)dst.v.reg);
+ if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) &&
+ (NativeAllocClass)src.cls == (NativeAllocClass)d.cls))
+ rv_move(t, d, src);
+ } else {
+ rv_store_part(
+ t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src,
+ 0, part->size);
+ }
+ }
+ a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
+}
+
+/* ============================ calls / returns ============================ */
+
+typedef NativeArgMove RvArgMove;
+
+static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
+ if (m->is_addr)
+ rv_addr_of_loc(t, m->dst, m->src);
+ else
+ rv_load_part(t, m->dst, m->src, m->src_offset, m->size);
+}
+
+/* Parallel-copy register arg moves via the shared scheduler; cycles break
+ * through the int/fp emit scratch (t1 / ft1). */
+static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
+ u32 n) {
+ NativeArgShuffle s;
+ if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args");
+ memset(&s, 0, sizeof s);
+ s.t = t;
+ s.emit_one = rv_emit_one_arg_move;
+ s.reg_move = rv_move;
+ s.scratch[NATIVE_REG_INT] = RV_TMP1;
+ s.scratch[NATIVE_REG_FP] = RV_FTMP1;
+ native_arg_shuffle(&s, moves, n);
+}
+
+static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
+ NativeCallPlan* plan) {
+ RvNativeTarget* a = rv_of(t);
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
+ NativeCallPlanRet* rets;
+ KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+ memset(plan, 0, sizeof *plan);
+ rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL;
+ plan->callee = desc->callee;
+ plan->rets = rets;
+ plan->flags = desc->flags;
+ plan->has_sret = abi && abi->has_sret;
+ plan->is_variadic = abi && abi->variadic;
+ plan->stack_arg_size = rv_call_stack_size(t, desc);
+ if (plan->stack_arg_size > a->frame.max_outgoing)
+ a->frame.max_outgoing = plan->stack_arg_size;
+ /* Indirect callee in an arg register would be clobbered by arg loads. */
+ if (plan->callee.kind == NATIVE_LOC_REG &&
+ (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
+ plan->callee.v.reg >= RV_A0 && plan->callee.v.reg <= RV_A7) {
+ NativeLoc scratch =
+ native_loc_reg(plan->callee.type, NATIVE_REG_INT, RV_TMP0);
+ rv_move(t, scratch, plan->callee);
+ plan->callee = scratch;
+ }
+ {
+ /* sret returns pass the hidden destination pointer as the implicit first
+ * integer argument (a0), so the real args start at a1. */
+ u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
+ u32 next_fp = 0, stack = 0, nmoves = 0, i, p;
+ int tail = (desc->flags & CG_CALL_TAIL) != 0;
+ RvArgMove moves[RV_MAX_REG_ARG_MOVES];
+ for (i = 0; i < desc->nargs; ++i) {
+ ABIArgInfo tmp;
+ const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp);
+ int force_stack =
+ abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
+ if (ai->kind == ABI_ARG_IGNORE) continue;
+ if (force_stack) {
+ NativeLoc tmpreg =
+ native_loc_reg(desc->args[i].type, NATIVE_REG_INT, RV_TMP0);
+ u32 slot = a->variant->gp_slot_bytes; /* xlen-word: 8 lp64d / 4 ilp32 */
+ u32 n = rv_class_stack_size(a->variant, ai), off = 0;
+ while (off < n) {
+ rv_load_part(t, tmpreg, desc->args[i], off, slot);
+ rv_store_outgoing_part(t, tail, stack + off, tmpreg, slot);
+ off += slot;
+ }
+ stack += n;
+ continue;
+ }
+ if (ai->kind == ABI_ARG_INDIRECT) {
+ u32 ptr_sz = a->variant->ptr_bytes;
+ if (next_int < 8u) {
+ RvArgMove* m = &moves[nmoves++];
+ m->dst = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0 + next_int++);
+ m->src = desc->args[i];
+ m->src_offset = 0;
+ m->size = ptr_sz;
+ m->is_addr = 1;
+ } else {
+ NativeLoc ptr = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0);
+ rv_addr_of_loc(t, ptr, desc->args[i]);
+ rv_store_outgoing_part(t, tail, stack, ptr, ptr_sz);
+ stack += ptr_sz;
+ }
+ continue;
+ }
+ for (p = 0; p < ai->nparts; ++p) {
+ const ABIArgPart* part = &ai->parts[p];
+ NativeAllocClass cls =
+ part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+ if ((cls == NATIVE_REG_FP && next_fp < 8u) ||
+ (cls == NATIVE_REG_INT && next_int < 8u)) {
+ RvArgMove* m = &moves[nmoves++];
+ Reg areg =
+ cls == NATIVE_REG_FP ? RV_FA0 + next_fp++ : RV_A0 + next_int++;
+ m->dst = native_loc_reg(desc->args[i].type, cls, areg);
+ m->src = desc->args[i];
+ m->src_offset = part->src_offset;
+ m->size = part->size;
+ m->is_addr = 0;
+ } else {
+ Reg tmp = cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0;
+ NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp);
+ rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
+ stack = align_up_u32(stack, rv_part_stack_align(a->variant, part));
+ rv_store_outgoing_part(t, tail, stack, tmpreg, part->size);
+ stack += rv_part_stack_size(a->variant, part);
+ }
+ }
+ }
+ rv_emit_reg_arg_moves(t, moves, nmoves);
+ if (abi && abi->has_sret && desc->nresults) {
+ /* sret pointer goes in a0; arg loads have completed. A tail call forwards
+ * the caller's own incoming sret pointer (spilled at entry) so the
+ * sibling writes the result into the caller's caller's destination;
+ * otherwise pass the address of this call's result slot. */
+ NativeLoc a0 = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0);
+ if (tail)
+ rv_load_part(t, a0, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0,
+ a->variant->ptr_bytes);
+ else
+ rv_addr_of_loc(t, a0, desc->results[0]);
+ }
+ }
+ if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
+ u32 nr = 0, ni = 0, nf = 0, p;
+ for (p = 0; p < abi->ret.nparts; ++p) {
+ const ABIArgPart* part = &abi->ret.parts[p];
+ NativeAllocClass cls =
+ part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+ KitCgTypeId pty = rv_part_scalar_type(part);
+ Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++;
+ rets[nr].src = native_loc_reg(pty, cls, rreg);
+ rets[nr].dst = desc->results[0];
+ if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
+ rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame,
+ (i32)part->src_offset);
+ else if (rets[nr].dst.kind == NATIVE_LOC_STACK) {
+ rets[nr].dst.v.stack.offset += (i32)part->src_offset;
+ rets[nr].dst.type = pty;
+ }
+ rets[nr].mem = native_mem_for_type(t, pty, part->size);
+ nr++;
+ }
+ plan->nrets = nr;
+ } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) {
+ plan->nrets = 0;
+ } else if (!abi && desc->nresults) {
+ rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, RV_A0);
+ rets[0].dst = desc->results[0];
+ rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0);
+ plan->nrets = 1;
+ }
+}
+
+/* Emit a sibling (tail) call: tear the frame down to the caller's entry state
+ * and jump (no link) to the callee. Outgoing args are already in the arg regs /
+ * the caller's incoming-arg window. At -O0 there are no callee-saves, and the
+ * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of
+ * the not-yet-final frame_size — so no func_end patching is needed. */
+static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) {
+ RvNativeTarget* a = rv_of(t);
+ const RiscvVariant* v = a->variant;
+ MCEmitter* mc = t->mc;
+ i32 cfa = (i32)(v->frame_save_size + rv_va_save_sz(a));
+ int indirect = callee.kind == NATIVE_LOC_REG;
+ u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
+ u32 n_int = rv_collect_int_saves(a, int_regs);
+ u32 n_fp = rv_collect_fp_saves(a, fp_regs);
+ i32 i;
+ /* Stage an indirect callee into a reserved scratch (t1) BEFORE the teardown:
+ * regalloc parks the function pointer in a callee-saved register so it
+ * survives arg marshalling, and the callee-save / s0 / ra restores below
+ * would otherwise overwrite it. t1 is reserved (never allocable) and
+ * untouched by the restore loop (which only uses t0 for far offsets). */
+ if (indirect) rv64_emit32(mc, rv_addi(RV_TMP1, loc_reg(callee), 0));
+ /* Restore callee-saves before tearing the frame down (O1 path; none at -O0).
+ * Their save offsets are s0-relative via rv_save_off, so the restore is
+ * frame-size- and teardown-order-independent. */
+ for (i = (i32)n_int - 1; i >= 0; --i)
+ rv_load_s0(v, mc, 0, int_regs[i], rv_save_off(a, n_int, (u32)i));
+ for (i = (i32)n_fp - 1; i >= 0; --i)
+ rv_load_s0(v, mc, 1, fp_regs[i], rv_save_off(a, n_int, n_int + (u32)i));
+ rv64_emit32(mc, rv_ld_ptr(v, RV_RA, RV_S0, (i32)v->ptr_bytes));
+ rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa));
+ rv64_emit32(mc, rv_ld_ptr(v, RV_S0, RV_S0, 0));
+ if (callee.kind == NATIVE_LOC_GLOBAL) {
+ u32 pos = mc->pos(mc);
+ rv64_emit32(mc, rv_auipc(RV_TMP0, 0));
+ rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0));
+ mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym,
+ callee.v.global.addend, 0, 0);
+ } else if (indirect) {
+ rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP1, 0));
+ } else {
+ rv_panic(a, "unsupported tail call target");
+ }
+}
+
+static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
+ MCEmitter* mc = t->mc;
+ ObjSecId sec = mc->section_id;
+ if (plan->flags & CG_CALL_TAIL) {
+ rv_emit_tail_site(t, plan->callee);
+ return;
+ }
+ if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
+ u32 pos = mc->pos(mc);
+ rv64_emit32(mc, rv_auipc(RV_RA, 0));
+ rv64_emit32(mc, rv_jalr(RV_RA, RV_RA, 0));
+ mc->emit_reloc_at(mc, sec, pos, R_RV_CALL, plan->callee.v.global.sym,
+ plan->callee.v.global.addend, 0, 0);
+ return;
+ }
+ if (plan->callee.kind == NATIVE_LOC_REG) {
+ rv64_emit32(mc, rv_jalr(RV_RA, loc_reg(plan->callee), 0));
+ return;
+ }
+ rv_panic(rv_of(t), "unsupported call target");
+}
+
+static void rv_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
+ const NativeLoc* values, u32 nvalues,
+ NativeCallPlanRet** out_rets, u32* out_nrets) {
+ RvNativeTarget* a = rv_of(t);
+ const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
+ NativeCallPlanRet* rets = NULL;
+ u32 nr = 0;
+ if (nvalues > 1u) rv_panic(a, "multiple returns unsupported");
+ if (nvalues) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4);
+ if (nvalues && abi && abi->ret.kind == ABI_ARG_INDIRECT) {
+ KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+ NativeLoc dstp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
+ NativeLoc saved = native_loc_stack(i64t, a->sret_ptr_slot, 0);
+ NativeAddr dst_addr, src_addr;
+ AggregateAccess access;
+ rv_load_part(t, dstp, saved, 0, a->variant->ptr_bytes);
+ memset(&dst_addr, 0, sizeof dst_addr);
+ dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
+ dst_addr.base.reg = RV_TMP1;
+ dst_addr.base_type = values[0].type;
+ src_addr = rv_loc_addr(a, values[0], 0);
+ src_addr.base_type = values[0].type;
+ memset(&access, 0, sizeof access);
+ access.type = values[0].type;
+ access.size = (u32)cg_type_size(t->c, values[0].type);
+ access.align = native_type_align(t, values[0].type);
+ rv_copy_bytes(t, dst_addr, src_addr, access);
+ *out_rets = NULL;
+ *out_nrets = 0;
+ return;
+ }
+ if (nvalues && abi && abi->ret.kind == ABI_ARG_DIRECT) {
+ u32 ni = 0, nf = 0, p;
+ for (p = 0; p < abi->ret.nparts; ++p) {
+ const ABIArgPart* part = &abi->ret.parts[p];
+ NativeAllocClass cls =
+ part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+ KitCgTypeId pty = rv_part_scalar_type(part);
+ Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++;
+ rets[nr].src = values[0];
+ if (rets[nr].src.kind == NATIVE_LOC_FRAME)
+ rets[nr].src =
+ native_loc_stack(pty, values[0].v.frame, (i32)part->src_offset);
+ else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
+ rets[nr].src.v.stack.offset += (i32)part->src_offset;
+ rets[nr].src.type = pty;
+ }
+ rets[nr].dst = native_loc_reg(pty, cls, rreg);
+ rets[nr].mem = native_mem_for_type(t, pty, part->size);
+ nr++;
+ }
+ } else if (nvalues) {
+ rets[0].src = values[0];
+ rets[0].dst = native_loc_reg(values[0].type, NATIVE_REG_INT, RV_A0);
+ rets[0].mem = native_mem_for_type(t, values[0].type, 0);
+ nr = 1;
+ }
+ *out_rets = rets;
+ *out_nrets = nr;
+}
+
+static void rv_ret(NativeTarget* t) {
+ RvNativeTarget* a = rv_of(t);
+ rv_jump(t, a->epilogue_label);
+}
+
+/* ============================ alloca ============================ */
+
+static void rv_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
+ u32 align) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ u32 rsz = loc_reg(size);
+ u32 rd = loc_reg(dst);
+ u32 al = align ? align : 16u;
+ if (al < 16u) al = 16u;
+ /* round up: t0 = (size + (al-1)) & ~(al-1) */
+ rv64_emit32(mc, rv_addi(RV_TMP0, rsz, (i32)(al - 1u)));
+ rv_emit_load_imm(a->variant, mc, 1, RV_TMP1, -(i64)al);
+ rv64_emit32(mc, rv_and(RV_TMP0, RV_TMP0, RV_TMP1));
+ rv64_emit32(mc, rv_sub(RV_SP, RV_SP, RV_TMP0));
+ a->frame.has_alloca = 1;
+ /* dst = sp + max_outgoing (patched in func_end) */
+ if (a->npatches == a->patches_cap) {
+ u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
+ RvPatch* nb = arena_zarray(t->c->tu, RvPatch, cap);
+ if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
+ a->patches = nb;
+ a->patches_cap = cap;
+ }
+ a->patches[a->npatches].kind = RV_PATCH_ALLOCA;
+ a->patches[a->npatches].pos = mc->pos(mc);
+ a->patches[a->npatches].dst_reg = rd;
+ a->npatches++;
+ a->nalloca++;
+ rv64_emit32(mc, RV_NOP); /* placeholder for addi dst, sp, max_outgoing */
+}
+
+/* ============================ TLS / bitfield / atomics
+ * ============================ */
+
+static void rv_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
+ i64 addend) {
+ MCEmitter* mc = t->mc;
+ u32 sec = mc->section_id;
+ u32 rd = loc_reg(dst);
+ /* Local-Exec only, matching aa64 (aa_tls_addr_of) and x64 (x64_tls_addr_of):
+ * kit links the whole module statically, so every _Thread_local symbol is
+ * resolved within the image and TPREL is always valid. An Initial-Exec GOT
+ * path (R_RV_TLS_GOT_HI20) used to be emitted for extern-via-GOT symbols
+ * under -fPIE (the hosted default), but the linker has no layout/apply for
+ * that reloc, so it produced a hard "unsupported reloc kind" link failure
+ * rather than a working binary. */
+ /* lui t0, %tprel_hi(sym); add t0, tp, t0; addi dst, t0, %tprel_lo(sym). */
+ {
+ u32 hp = mc->pos(mc);
+ rv64_emit32(mc, rv_lui(RV_TMP0, 0));
+ mc->emit_reloc_at(mc, sec, hp, R_RV_TPREL_HI20, sym, addend, 0, 0);
+ rv64_emit32(mc, rv_add(RV_TMP0, RV_TP, RV_TMP0));
+ {
+ u32 lp = mc->pos(mc);
+ rv64_emit32(mc, rv_addi(rd, RV_TMP0, 0));
+ mc->emit_reloc_at(mc, sec, lp, R_RV_TPREL_LO12_I, sym, addend, 0, 0);
+ }
+ }
+}
+static void rv_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr ra,
+ BitFieldAccess bf) {
+ RvNativeTarget* a = rv_of(t);
+ const RiscvVariant* v = a->variant;
+ MCEmitter* mc = t->mc;
+ u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
+ u32 rd = loc_reg(dst);
+ u32 base;
+ i32 off;
+ u32 lsb = bf.bit_offset;
+ u32 width = bf.bit_width ? bf.bit_width : 1u;
+ /* Shift left so the field's MSB lands at the register top (XLEN-1), then shift
+ * right to sign/zero extend it down. Shifts are XLEN-wide. */
+ u32 sh_left = v->xlen - (lsb + width);
+ u32 sh_right = v->xlen - width;
+ ra.offset += (i32)bf.storage_offset;
+ rv_resolve_mem_addr(a, &ra, &base, &off);
+ rv64_emit32(mc, enc_int_load(v, storage_bytes, 0, rd, base, off));
+ rv64_emit32(mc, rv_slli(rd, rd, sh_left));
+ if (bf.signed_)
+ rv64_emit32(mc, rv_srai(rd, rd, sh_right));
+ else
+ rv64_emit32(mc, rv_srli(rd, rd, sh_right));
+}
+static void rv_bitfield_store(NativeTarget* t, NativeAddr ra, NativeLoc src,
+ BitFieldAccess bf) {
+ RvNativeTarget* a = rv_of(t);
+ const RiscvVariant* v = a->variant;
+ MCEmitter* mc = t->mc;
+ u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
+ u32 src_reg = loc_reg(src);
+ u32 base;
+ i32 off;
+ u32 lsb = bf.bit_offset;
+ u32 width = bf.bit_width ? bf.bit_width : 1u;
+ u64 ones = width >= 64u ? ~(u64)0 : (((u64)1 << width) - 1u);
+ u64 mask_in = ones << lsb;
+ ra.offset += (i32)bf.storage_offset;
+ /* Resolve the field address; rv_resolve_mem_addr may use RV_TMP0/RV_TMP1, so
+ * stabilize the base into RV_TMP1 before consuming the scratch temps. */
+ rv_resolve_mem_addr(a, &ra, &base, &off);
+ if (base != RV_S0 && base != RV_TMP1) {
+ rv_emit_addr_adjust(v, mc, RV_TMP1, base, off);
+ base = RV_TMP1;
+ off = 0;
+ } else if (base == RV_TMP1 && off != 0) {
+ rv_emit_addr_adjust(v, mc, RV_TMP1, RV_TMP1, off);
+ off = 0;
+ }
+ /* word in RV_TMP2; merged via RV_TMP0 (clear mask, then shifted src). */
+ rv64_emit32(mc, enc_int_load(v, storage_bytes, 0, RV_TMP2, base, off));
+ rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)~mask_in);
+ rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP0));
+ rv_emit_load_imm(v, mc, 1, RV_TMP0, (i64)ones);
+ rv64_emit32(mc, rv_and(RV_TMP0, src_reg, RV_TMP0));
+ if (lsb) rv64_emit32(mc, rv_slli(RV_TMP0, RV_TMP0, lsb));
+ rv64_emit32(mc, rv_or(RV_TMP2, RV_TMP2, RV_TMP0));
+ rv64_emit32(mc, enc_int_store(v, storage_bytes, RV_TMP2, base, off));
+}
+static int rv_order_acquire(KitCgMemOrder o) {
+ return o == KIT_CG_MO_CONSUME || o == KIT_CG_MO_ACQUIRE ||
+ o == KIT_CG_MO_ACQ_REL || o == KIT_CG_MO_SEQ_CST;
+}
+static int rv_order_release(KitCgMemOrder o) {
+ return o == KIT_CG_MO_RELEASE || o == KIT_CG_MO_ACQ_REL ||
+ o == KIT_CG_MO_SEQ_CST;
+}
+
+/* Materialize the atomic operand address into RV_TMP0 (a bare pointer, since
+ * LR/SC and AMO take a base register with no offset) and return it. */
+static u32 rv_atomic_addr_reg(RvNativeTarget* a, NativeAddr addr) {
+ NativeLoc dst =
+ native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0);
+ rv_load_addr(&a->base, dst, addr);
+ return RV_TMP0;
+}
+
+static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
+ MemAccess mem, KitCgMemOrder mo) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ u32 sf =
+ (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u;
+ u32 base = rv_atomic_addr_reg(a, addr);
+ if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw());
+ if (rv_order_acquire(mo)) {
+ /* lr.w/d as an ordered load (aq=1). */
+ rv64_emit32(mc, sf ? rv_lr_d(loc_reg(dst), base, 1, 0)
+ : rv_lr_w(loc_reg(dst), base, 1, 0));
+ } else {
+ rv64_emit32(mc, enc_int_load(a->variant,
+ mem.size ? mem.size
+ : native_type_size(t, dst.type),
+ 0, loc_reg(dst), base, 0));
+ }
+}
+
+static void rv_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
+ MemAccess mem, KitCgMemOrder mo) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ u32 sz = mem.size ? mem.size : native_type_size(t, src.type);
+ /* RV_TMP0 holds the address; never collides with src (an allocable reg). */
+ u32 base = rv_atomic_addr_reg(a, addr);
+ if (rv_order_release(mo)) rv64_emit32(mc, rv_fence_rw_rw());
+ rv64_emit32(mc, enc_int_store(a->variant, sz, loc_reg(src), base, 0));
+ if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw());
+}
+
+static void rv_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst,
+ NativeAddr addr, NativeLoc val, MemAccess mem,
+ KitCgMemOrder mo) {
+ RvNativeTarget* a = rv_of(t);
+ const RiscvVariant* v = a->variant;
+ MCEmitter* mc = t->mc;
+ u32 sf = (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u;
+ /* W-form add/sub apply only to a 32-bit value on rv64; on rv32 the base ops
+ * are the 32-bit ops. */
+ int w = !sf && v->has_w_forms;
+ u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */
+ u32 vreg = loc_reg(val);
+ u32 rd = loc_reg(dst);
+ u32 aq = (u32)rv_order_acquire(mo);
+ u32 rl = (u32)rv_order_release(mo);
+ MCLabel retry = mc->label_new(mc);
+ /* LR/SC loop: dst = *base; new = dst op val; sc new; retry on failure.
+ * RV_TMP1 carries the SC status, RV_TMP3 the computed new value. */
+ mc->label_place(mc, retry);
+ rv64_emit32(mc, sf ? rv_lr_d(rd, base, aq, 0) : rv_lr_w(rd, base, aq, 0));
+ switch (op) {
+ case KIT_CG_ATOMIC_XCHG:
+ rv64_emit32(mc, rv_addi(RV_TMP3, vreg, 0));
+ break;
+ case KIT_CG_ATOMIC_ADD:
+ rv64_emit32(mc,
+ w ? rv_addw(RV_TMP3, rd, vreg) : rv_add(RV_TMP3, rd, vreg));
+ break;
+ case KIT_CG_ATOMIC_SUB:
+ rv64_emit32(mc,
+ w ? rv_subw(RV_TMP3, rd, vreg) : rv_sub(RV_TMP3, rd, vreg));
+ break;
+ case KIT_CG_ATOMIC_AND:
+ rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg));
+ break;
+ case KIT_CG_ATOMIC_OR:
+ rv64_emit32(mc, rv_or(RV_TMP3, rd, vreg));
+ break;
+ case KIT_CG_ATOMIC_XOR:
+ rv64_emit32(mc, rv_xor(RV_TMP3, rd, vreg));
+ break;
+ case KIT_CG_ATOMIC_NAND:
+ rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg));
+ rv64_emit32(mc, rv_xori(RV_TMP3, RV_TMP3, -1));
+ break;
+ default:
+ rv_panic(a, "unsupported atomic rmw op");
+ }
+ rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, RV_TMP3, 0, rl)
+ : rv_sc_w(RV_TMP1, base, RV_TMP3, 0, rl));
+ rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0));
+ mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0);
+}
+
+static void rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
+ NativeAddr addr, NativeLoc expected,
+ NativeLoc desired, MemAccess mem,
+ KitCgMemOrder success, KitCgMemOrder failure) {
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ u32 sf =
+ (mem.size ? mem.size : native_type_size(t, prior.type)) == 8u ? 1u : 0u;
+ u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */
+ u32 rprior = loc_reg(prior);
+ u32 rexp = loc_reg(expected);
+ u32 rdes = loc_reg(desired);
+ u32 rok = loc_reg(ok);
+ u32 aq = (u32)rv_order_acquire(success);
+ u32 rl = (u32)rv_order_release(success);
+ MCLabel retry = mc->label_new(mc);
+ MCLabel fail = mc->label_new(mc);
+ MCLabel done = mc->label_new(mc);
+ (void)failure;
+ mc->label_place(mc, retry);
+ rv64_emit32(mc,
+ sf ? rv_lr_d(rprior, base, aq, 0) : rv_lr_w(rprior, base, aq, 0));
+ /* if (prior != expected) -> fail */
+ rv64_emit32(mc, rv_bne(rprior, rexp, 0));
+ mc->emit_label_ref(mc, fail, R_RV_BRANCH, 4, 0);
+ /* sc.w/d status, desired, (base); retry on failure. */
+ rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, rdes, 0, rl)
+ : rv_sc_w(RV_TMP1, base, rdes, 0, rl));
+ rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0));
+ mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0);
+ /* ok = 1; jump done. */
+ rv_emit_load_imm(a->variant, mc, 0, rok, 1);
+ rv64_emit32(mc, rv_jal(RV_ZERO, 0));
+ mc->emit_label_ref(mc, done, R_RV_JAL, 4, 0);
+ mc->label_place(mc, fail);
+ rv_emit_load_imm(a->variant, mc, 0, rok, 0);
+ mc->label_place(mc, done);
+}
+
+static void rv_fence(NativeTarget* t, KitCgMemOrder mo) {
+ if (mo == KIT_CG_MO_RELAXED) return;
+ rv64_emit32(t->mc, rv_fence_rw_rw());
+}
+/* ---- variadics (LP64D ABI_VA_LIST_POINTER) ----
+ * va_list is a single void* to the next argument slot. The prologue spilled
+ * unconsumed a-regs into the 64-byte save area at [s0+16); incoming stack args
+ * follow contiguously, so a uniform 8-byte stride covers both. `ap` is a
+ * NativeAddr that addresses the va_list object itself. */
+
+static void rv_va_start_core(RvNativeTarget* a, NativeAddr ap) {
+ NativeTarget* t = &a->base;
+ const RiscvVariant* v = a->variant;
+ MCEmitter* mc = t->mc;
+ ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
+ KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+ u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes;
+ if (vai.kind != ABI_VA_LIST_POINTER)
+ rv_panic(a, "unsupported va_list layout");
+ if (!a->is_variadic) rv_panic(a, "va_start: function not variadic");
+ /* *ap = s0 + frame_save + next_param_int*gp_slot (skip named-int slots). */
+ rv64_emit32(mc, rv_addi(RV_TMP1, RV_S0,
+ (i32)v->frame_save_size +
+ (i32)(a->next_param_int * slot)));
+ rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1), ap,
+ native_mem_for_type(t, i64t, v->ptr_bytes));
+}
+
+static void rv_va_arg_core(RvNativeTarget* a, NativeLoc dst, NativeAddr ap,
+ KitCgTypeId type) {
+ NativeTarget* t = &a->base;
+ const RiscvVariant* v = a->variant;
+ MCEmitter* mc = t->mc;
+ ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
+ KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+ u32 sz = native_type_size(t, type);
+ u32 slot = vai.gp_slot_size ? vai.gp_slot_size : v->gp_slot_bytes;
+ NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
+ NativeAddr from;
+ if (vai.kind != ABI_VA_LIST_POINTER)
+ rv_panic(a, "unsupported va_list layout");
+ if (dst.kind != NATIVE_LOC_REG) rv_panic(a, "va_arg destination must be reg");
+ /* cur = *ap; load value from [cur]; *ap = cur + slot (one GP-slot stride). */
+ rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes));
+ memset(&from, 0, sizeof from);
+ from.base_kind = NATIVE_ADDR_BASE_REG;
+ from.base.reg = RV_TMP1;
+ from.base_type = type;
+ if (native_loc_is_fp(dst)) {
+ /* Variadic FP args sit in the integer save area as their bit pattern;
+ * load into RV_TMP2 and bitcast into the FPR. The fmv_d_x (double) path is
+ * RV64-only — on rv32 doubles are passed soft and never reach here. */
+ NativeLoc itmp = native_loc_reg(type, NATIVE_REG_INT, RV_TMP2);
+ rv_emit_mem(a, 1, itmp, from, native_mem_for_type(t, type, sz));
+ rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(loc_reg(dst), RV_TMP2)
+ : rv_fmv_w_x(loc_reg(dst), RV_TMP2));
+ } else {
+ rv_emit_mem(a, 1, dst, from, native_mem_for_type(t, type, sz));
+ }
+ rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, (i32)slot));
+ rv_emit_mem(a, 0, cur, ap, native_mem_for_type(t, i64t, v->ptr_bytes));
+}
+
+static void rv_va_copy_core(RvNativeTarget* a, NativeAddr dst_ap,
+ NativeAddr src_ap) {
+ NativeTarget* t = &a->base;
+ u32 ptr = a->variant->ptr_bytes;
+ KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
+ NativeLoc tmp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
+ /* va_list is a single pointer-width slot. */
+ rv_emit_mem(a, 1, tmp, src_ap, native_mem_for_type(t, i64t, ptr));
+ rv_emit_mem(a, 0, tmp, dst_ap, native_mem_for_type(t, i64t, ptr));
+}
+
+static NativeAddr rv_va_addr_from_ptr(NativeLoc ap_ptr) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.cls = NATIVE_REG_INT;
+ addr.base.reg = ap_ptr.v.reg;
+ addr.base_type = ap_ptr.type;
+ return addr;
+}
+
+static void rv_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
+ rv_va_start_core(rv_of(t), rv_va_addr_from_ptr(ap_ptr));
+}
+static void rv_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
+ KitCgTypeId type) {
+ rv_va_arg_core(rv_of(t), dst, rv_va_addr_from_ptr(ap_ptr), type);
+}
+static void rv_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
+ (void)t;
+ (void)ap_ptr;
+}
+static void rv_va_copy_native(NativeTarget* t, NativeLoc dst, NativeLoc src) {
+ rv_va_copy_core(rv_of(t), rv_va_addr_from_ptr(dst), rv_va_addr_from_ptr(src));
+}
+/* Software popcount of RV_TMP1 (already width-normalized) into rd, using
+ * RV_TMP1/RV_TMP2/RV_TMP3 as scratch. Mirrors the legacy bit-twiddling. On rv32
+ * only the 32-bit (is64==0) path is reachable for a single register. */
+static void rv_emit_popcount(const RiscvVariant* v, MCEmitter* mc, u32 rd,
+ int is64) {
+ rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 1));
+ rv_emit_load_imm(v, mc, 1, RV_TMP3,
+ is64 ? (i64)0x5555555555555555ll : (i64)0x55555555);
+ rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP3));
+ rv64_emit32(mc, rv_sub(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv_emit_load_imm(v, mc, 1, RV_TMP3,
+ is64 ? (i64)0x3333333333333333ll : (i64)0x33333333);
+ rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP1, RV_TMP3));
+ rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 2));
+ rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3));
+ rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 4));
+ rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv_emit_load_imm(v, mc, 1, RV_TMP3,
+ is64 ? (i64)0x0f0f0f0f0f0f0f0fll : (i64)0x0f0f0f0f);
+ rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3));
+ rv_emit_load_imm(v, mc, 1, RV_TMP3,
+ is64 ? (i64)0x0101010101010101ll : (i64)0x01010101);
+ rv64_emit32(mc, rv_mul(RV_TMP1, RV_TMP1, RV_TMP3));
+ rv64_emit32(mc, rv_srli(rd, RV_TMP1, is64 ? 56u : 24u));
+ /* The 32-bit SWAR sum lives in product bits [24,32); since the multiply is
+ * 64-bit, bits [32,64) survive the >>24 and must be masked off. (The 64-bit
+ * path's >>56 already isolates the top byte, so it needs no mask.) */
+ if (!is64) rv64_emit32(mc, rv_andi(rd, rd, 0xff));
+}
+
+/* Inline byte-granule copy/set between bare base registers (memcpy/memmove/
+ * memset intrinsics). dir<0 copies high-to-low (memmove backward). The 8-byte
+ * granule (ld/sd) and zero-extending lwu are RV64-only; on rv32 the widest
+ * granule is 4 bytes via lw/sw. */
+static void rv_intrin_copy(const RiscvVariant* v, MCEmitter* mc, u32 dr, u32 sr,
+ u32 n, int backward) {
+ int wide = v->ptr_bytes == 8u;
+ if (!backward) {
+ u32 i = 0;
+ while (wide && i + 8u <= n) {
+ rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i));
+ rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i));
+ i += 8u;
+ }
+ while (i + 4u <= n) {
+ rv64_emit32(mc, wide ? rv_lwu(RV_TMP3, sr, (i32)i)
+ : rv_lw(RV_TMP3, sr, (i32)i));
+ rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i));
+ i += 4u;
+ }
+ while (i + 2u <= n) {
+ rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i));
+ rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i));
+ i += 2u;
+ }
+ while (i < n) {
+ rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i));
+ rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i));
+ i += 1u;
+ }
+ } else {
+ u32 i = n;
+ while (wide && i >= 8u) {
+ i -= 8u;
+ rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i));
+ rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i));
+ }
+ while (i >= 4u) {
+ i -= 4u;
+ rv64_emit32(mc, wide ? rv_lwu(RV_TMP3, sr, (i32)i)
+ : rv_lw(RV_TMP3, sr, (i32)i));
+ rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i));
+ }
+ while (i >= 2u) {
+ i -= 2u;
+ rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i));
+ rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i));
+ }
+ while (i >= 1u) {
+ i -= 1u;
+ rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i));
+ rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i));
+ }
+ }
+}
+
+static void rv_intrinsic(NativeTarget* t, IntrinKind kind,
+ const NativeLoc* dsts, u32 ndst, const NativeLoc* args,
+ u32 narg) {
+ RvNativeTarget* a = rv_of(t);
+ const RiscvVariant* v = a->variant;
+ MCEmitter* mc = t->mc;
+ (void)ndst;
+ (void)narg;
+ switch (kind) {
+ case INTRIN_NONE:
+ break;
+ case INTRIN_EXPECT:
+ case INTRIN_ASSUME_ALIGNED: {
+ /* dst = val (hint dropped). */
+ if (args[0].kind == NATIVE_LOC_IMM)
+ rv_emit_load_imm(v, mc, rv_is_64(t, dsts[0].type) ? 1u : 0u,
+ loc_reg(dsts[0]), args[0].v.imm);
+ else
+ rv_move(t, dsts[0], args[0]);
+ return;
+ }
+ case INTRIN_PREFETCH:
+ return;
+ case INTRIN_TRAP:
+ rv64_emit32(mc, rv_ebreak());
+ return;
+ case INTRIN_BSWAP: {
+ u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
+ switch (width) {
+ case 2: {
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */
+ rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */
+ rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8));
+ rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8));
+ rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff));
+ rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3));
+ return;
+ }
+ case 4: {
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ /* SRLIW is RV64-only; on rv32 SRLI on a 32-bit reg is equivalent. */
+ int w = v->has_w_forms;
+ rv64_emit32(mc, w ? rv_srliw(RV_TMP1, rs, 24) : rv_srli(RV_TMP1, rs, 24));
+ rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff));
+ rv64_emit32(mc, w ? rv_srliw(RV_TMP2, rs, 16) : rv_srli(RV_TMP2, rs, 16));
+ rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8));
+ rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv64_emit32(mc, w ? rv_srliw(RV_TMP2, rs, 8) : rv_srli(RV_TMP2, rs, 8));
+ rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16));
+ rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+ rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24));
+ rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2));
+ /* Canonicalize to a 32-bit value in a 64-bit reg (RV64 only); on rv32
+ * the result already occupies the whole register. */
+ if (w) {
+ rv64_emit32(mc, rv_slli(rd, rd, 32));
+ rv64_emit32(mc, rv_srli(rd, rd, 32));
+ }
+ return;
+ }
+ case 8: {
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ int i;
+ rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0));
+ for (i = 0; i < 8; ++i) {
+ int sh = 56 - 8 * i;
+ if (i == 0) {
+ rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
+ } else {
+ rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i)));
+ rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
+ }
+ if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh));
+ rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+ }
+ rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0));
+ return;
+ }
+ default:
+ break;
+ }
+ return;
+ }
+ case INTRIN_POPCOUNT: {
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ int is64 = rv_is_64(t, args[0].type);
+ /* The narrow-in-wide normalization clears the high 32 bits of a 64-bit
+ * reg; on rv32 there are none, so it is skipped. */
+ int nrm = !is64 && v->xlen == 64u;
+ rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0));
+ if (nrm) {
+ rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
+ rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
+ }
+ rv_emit_popcount(v, mc, rd, is64);
+ return;
+ }
+ case INTRIN_CTZ: {
+ /* ctz(x) = popcount((x & -x) - 1) for x != 0. */
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ int is64 = rv_is_64(t, args[0].type);
+ int nrm = !is64 && v->xlen == 64u;
+ rv64_emit32(mc, rv_sub(RV_TMP1, RV_ZERO, rs));
+ rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, rs));
+ rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, -1));
+ if (nrm) {
+ rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
+ rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
+ }
+ rv_emit_popcount(v, mc, rd, is64);
+ return;
+ }
+ case INTRIN_CLZ: {
+ /* Fold the high bit downward, then clz = popcount(~folded). */
+ u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
+ int is64 = rv_is_64(t, args[0].type);
+ int nrm = !is64 && v->xlen == 64u;
+ u32 shifts[6] = {1, 2, 4, 8, 16, 32};
+ u32 ns = is64 ? 6u : 5u, i;
+ rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0));
+ if (nrm) {
+ rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
+ rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
+ }
+ for (i = 0; i < ns; ++i) {
+ rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, shifts[i]));
+ rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
+ }
+ rv64_emit32(mc, rv_xori(RV_TMP1, RV_TMP1, -1));
+ if (nrm) {
+ rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
+ rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
+ }
+ rv_emit_popcount(v, mc, rd, is64);
+ return;
+ }
+ case INTRIN_SADD_OVERFLOW:
+ case INTRIN_SSUB_OVERFLOW: {
+ /* dsts: [val, ovf]. ADD: ovf=((a^r)&(b^r))>>(w-1);
+ * SUB: ovf=((a^b)&(a^r))>>(w-1). */
+ int is64 = rv_is_64(t, dsts[0].type);
+ int w = !is64 && v->has_w_forms; /* narrow op on rv64 -> W-form */
+ u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
+ u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
+ u32 sh = is64 ? 63u : 31u;
+ if (kind == INTRIN_SADD_OVERFLOW)
+ rv64_emit32(mc, w ? rv_addw(RV_TMP2, ra, rb) : rv_add(RV_TMP2, ra, rb));
+ else
+ rv64_emit32(mc, w ? rv_subw(RV_TMP2, ra, rb) : rv_sub(RV_TMP2, ra, rb));
+ rv64_emit32(mc, rv_xor(RV_TMP3, ra, RV_TMP2)); /* a ^ r */
+ if (kind == INTRIN_SADD_OVERFLOW) {
+ rv64_emit32(mc, rv_xor(rovf, rb, RV_TMP2)); /* b ^ r */
+ rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3));
+ } else {
+ rv64_emit32(mc, rv_xor(rovf, ra, rb)); /* a ^ b */
+ rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3));
+ }
+ rv64_emit32(mc, w ? rv_srliw(rovf, rovf, sh) : rv_srli(rovf, rovf, sh));
+ rv64_emit32(mc, rv_andi(rovf, rovf, 1));
+ rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
+ return;
+ }
+ case INTRIN_UADD_OVERFLOW:
+ case INTRIN_USUB_OVERFLOW: {
+ int is64 = rv_is_64(t, dsts[0].type);
+ /* `single`: the value fills the whole native register (rv64 i64 or any
+ * rv32 value), so the native carry/borrow sequence applies directly; the
+ * `!single` branch is the rv64 32-bit-in-64-bit-register implementation
+ * (zero-extend + srli-32), reachable only on rv64. */
+ int single = is64 || v->xlen == 32u;
+ u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
+ u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
+ if (!single) {
+ rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32));
+ rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32));
+ rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32));
+ rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32));
+ ra = RV_TMP2;
+ rb = RV_TMP3;
+ }
+ if (kind == INTRIN_UADD_OVERFLOW) {
+ if (single) {
+ rv64_emit32(mc, rv_add(RV_TMP2, ra, rb));
+ rv64_emit32(mc, rv_sltu(rovf, RV_TMP2, ra));
+ } else {
+ rv64_emit32(mc, rv_add(RV_TMP2, ra, rb));
+ rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32));
+ rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
+ rv64_emit32(mc, rv_addiw(RV_TMP2, RV_TMP2, 0));
+ }
+ } else {
+ rv64_emit32(mc, rv_sltu(rovf, ra, rb));
+ rv64_emit32(mc, single ? rv_sub(RV_TMP2, ra, rb)
+ : rv_subw(RV_TMP2, ra, rb));
+ }
+ rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
+ return;
+ }
+ case INTRIN_SMUL_OVERFLOW: {
+ int is64 = rv_is_64(t, dsts[0].type);
+ /* `single`: native-width product overflow via MUL + MULH and a sign-bit
+ * compare (shift xlen-1). rv64 i64 and any rv32 value take this path; the
+ * `!single` branch is the rv64 32-bit-in-64-bit-register sequence. */
+ int single = is64 || v->xlen == 32u;
+ u32 sh = is64 ? 63u : 31u;
+ u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
+ u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
+ if (single) {
+ rv64_emit32(mc, rv_mul(RV_TMP2, ra, rb));
+ rv64_emit32(mc, rv_mulh(RV_TMP3, ra, rb));
+ rv64_emit32(mc, rv_srai(rovf, RV_TMP2, sh));
+ rv64_emit32(mc, rv_xor(rovf, RV_TMP3, rovf));
+ rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
+ rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
+ } else {
+ rv64_emit32(mc, rv_addiw(RV_TMP2, ra, 0));
+ rv64_emit32(mc, rv_addiw(RV_TMP3, rb, 0));
+ rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3));
+ rv64_emit32(mc, rv_addiw(RV_TMP3, RV_TMP2, 0));
+ rv64_emit32(mc, rv_xor(rovf, RV_TMP2, RV_TMP3));
+ rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
+ rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0));
+ }
+ return;
+ }
+ case INTRIN_UMUL_OVERFLOW: {
+ int is64 = rv_is_64(t, dsts[0].type);
+ /* `single`: native-width product, overflow = (high word != 0) via MULHU.
+ * rv64 i64 and any rv32 value take this path; `!single` is the rv64
+ * 32-bit-in-64-bit-register sequence. */
+ int single = is64 || v->xlen == 32u;
+ u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
+ u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
+ if (single) {
+ rv64_emit32(mc, rv_mulhu(rovf, ra, rb));
+ rv64_emit32(mc, rv_mul(rd, ra, rb));
+ rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
+ } else {
+ rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32));
+ rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32));
+ rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32));
+ rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32));
+ rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3));
+ rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32));
+ rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
+ rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0));
+ }
+ return;
+ }
+ case INTRIN_MEMCPY:
+ case INTRIN_MEMMOVE: {
+ u32 dr, sr, n;
+ if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
+ args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM)
+ rv_panic(a, "unsupported memory intrinsic operands");
+ if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
+ rv_panic(a, "unsupported memory intrinsic size");
+ dr = loc_reg(args[0]);
+ sr = loc_reg(args[1]);
+ n = (u32)args[2].v.imm;
+ rv_intrin_copy(v, mc, dr, sr, n, kind == INTRIN_MEMMOVE);
+ return;
+ }
+ case INTRIN_MEMSET: {
+ u32 dr, n, src;
+ int wide = v->ptr_bytes == 8u; /* 8-byte sd stores only on rv64 */
+ if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
+ args[2].kind != NATIVE_LOC_IMM)
+ rv_panic(a, "unsupported memset operands");
+ if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
+ rv_panic(a, "unsupported memset size");
+ dr = loc_reg(args[0]);
+ n = (u32)args[2].v.imm;
+ if (args[1].kind == NATIVE_LOC_IMM) {
+ u32 byte = (u32)(args[1].v.imm & 0xffu);
+ if (byte == 0) {
+ src = RV_ZERO;
+ } else {
+ u64 b = byte;
+ b |= b << 8;
+ b |= b << 16;
+ if (wide) b |= b << 32;
+ rv_emit_load_imm(v, mc, 1, RV_TMP3, (i64)b);
+ src = RV_TMP3;
+ }
+ } else {
+ /* Replicate the low byte across the register width (4 or 8 bytes). */
+ u32 rb = loc_reg(args[1]);
+ rv64_emit32(mc, rv_andi(RV_TMP3, rb, 0xff));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 8));
+ rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 16));
+ rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
+ if (wide) {
+ rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 32));
+ rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
+ }
+ src = RV_TMP3;
+ }
+ {
+ u32 i = 0;
+ while (wide && i + 8u <= n) {
+ rv64_emit32(mc, rv_sd(src, dr, (i32)i));
+ i += 8u;
+ }
+ while (i + 4u <= n) {
+ rv64_emit32(mc, rv_sw(src, dr, (i32)i));
+ i += 4u;
+ }
+ while (i + 2u <= n) {
+ rv64_emit32(mc, rv_sh(src, dr, (i32)i));
+ i += 2u;
+ }
+ while (i < n) {
+ rv64_emit32(mc, rv_sb(src, dr, (i32)i));
+ i += 1u;
+ }
+ }
+ return;
+ }
+ case INTRIN_CPU_NOP:
+ rv64_emit32(mc, rv_nop());
+ return;
+ case INTRIN_CPU_YIELD:
+ rv64_emit32(mc, rv_pause());
+ return;
+ case INTRIN_ISB:
+ rv64_emit32(mc, rv_fence_i());
+ return;
+ case INTRIN_DMB:
+ case INTRIN_DSB:
+ rv64_emit32(mc, rv_fence_rw_rw());
+ return;
+ case INTRIN_WFI:
+ rv64_emit32(mc, rv_wfi());
+ return;
+ default:
+ break;
+ }
+ rv_panic(a, "unsupported compiler intrinsic");
+}
+/* ============================ inline asm ============================ */
+
+_Noreturn static void rv_asm_panic_at(Compiler* c, SrcLoc loc,
+ const char* msg) {
+ compiler_panic(c, loc, "rv64 inline asm: %s", msg);
+}
+_Noreturn static void rv_asm_panic(NativeDirectTarget* d, const char* msg) {
+ rv_asm_panic_at(d->base.c, d->loc, msg);
+}
+
+/* constraint_body / constraint_early / match_index are shared
+ * (cg/native_asm.h). */
+
+/* Build a bound register pseudo-operand in the rv64 inline shape. */
+static void rv_asm_bound_reg(Operand* out, KitCgTypeId type,
+ NativeAllocClass cls, Reg reg) {
+ memset(out, 0, sizeof *out);
+ out->kind = RV64_INLINE_OPK_REG;
+ out->pad[0] =
+ (cls == NATIVE_REG_FP) ? RV64_INLINE_OPCLS_FP : RV64_INLINE_OPCLS_INT;
+ out->type = type;
+ out->v.local = (CGLocal)reg;
+}
+static void rv_asm_bound_mem(Operand* out, KitCgTypeId type, Reg base) {
+ memset(out, 0, sizeof *out);
+ out->kind = OPK_INDIRECT;
+ out->type = type;
+ out->v.ind.base = (CGLocal)base;
+ out->v.ind.index = CG_LOCAL_NONE;
+ out->v.ind.ofs = 0;
+}
+
+/* Parse a clobber register name into (class, reg). Returns 0 for the special
+ * "cc"/"memory" clobbers and panics on an unknown register. RV64 dwarf: int
+ * x0..x31 = 0..31, fp f0..f31 = 32..63. */
+static int rv_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
+ NativeAllocClass* cls_out, Reg* reg_out) {
+ Slice s = pool_slice(c->global, name);
+ char buf[16];
+ uint32_t dwarf;
+ if (!s.s || !s.len) return 0;
+ if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0;
+ if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0;
+ if (s.len >= sizeof buf) rv_asm_panic_at(c, loc, "clobber name is too long");
+ memcpy(buf, s.s, s.len);
+ buf[s.len] = '\0';
+ if (rv64_register_index(buf, &dwarf) != 0)
+ rv_asm_panic_at(c, loc, "unknown clobber register");
+ if (dwarf <= 31u) {
+ *cls_out = NATIVE_REG_INT;
+ *reg_out = (Reg)dwarf;
+ return 1;
+ }
+ if (dwarf >= 32u && dwarf <= 63u) {
+ *cls_out = NATIVE_REG_FP;
+ *reg_out = (Reg)(dwarf - 32u);
+ return 1;
+ }
+ rv_asm_panic_at(c, loc, "unsupported clobber register");
+ return 0;
+}
+
+static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
+ u32 nclob, u32* int_mask, u32* fp_mask) {
+ u32 i;
+ *int_mask = 0;
+ *fp_mask = 0;
+ for (i = 0; i < nclob; ++i) {
+ NativeAllocClass cls;
+ Reg reg;
+ if (!rv_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, ®)) continue;
+ if (cls == NATIVE_REG_INT)
+ *int_mask |= 1u << reg;
+ else
+ *fp_mask |= 1u << reg;
+ }
+}
+
+static NativeAllocClass rv_asm_constraint_class(NativeDirectTarget* d,
+ const char* body) {
+ if (body[0] == 'r') return NATIVE_REG_INT;
+ if (body[0] == 'f') return NATIVE_REG_FP;
+ rv_asm_panic(d, "constraint is not a register constraint");
+ return NATIVE_REG_INT;
+}
+
+static int rv_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg,
+ const char* constraint,
+ NativeAsmRegPin* pin) {
+ NativeAsmRegPinStatus st =
+ native_asm_resolve_pin(d->native, reg, constraint, pin);
+ if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0;
+ if (st != NATIVE_ASM_REG_PIN_OK)
+ rv_asm_panic(d, native_asm_pin_status_message(st));
+ return 1;
+}
+
+/* Pick a free register from the arch's caller-saved allocable pools for an
+ * asm operand the direct path must self-allocate. */
+static Reg rv_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls,
+ u32* used_int, u32* used_fp) {
+ /* int: a0..a7 (10..17) then t-temps that aren't emit scratch. */
+ static const Reg int_pool[] = {10u, 11u, 12u, 13u, 14u, 15u,
+ 16u, 17u, 29u, 30u, 31u};
+ /* fp: fa0..fa7 (10..17) then ft caller-saved. */
+ static const Reg fp_pool[] = {10u, 11u, 12u, 13u, 14u, 15u, 16u, 17u,
+ 4u, 5u, 6u, 7u, 28u, 29u, 30u, 31u};
+ const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool;
+ u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0])
+ : (u32)(sizeof int_pool / sizeof int_pool[0]);
+ u32* used = cls == NATIVE_REG_FP ? used_fp : used_int;
+ u32 i;
+ for (i = 0; i < n; ++i) {
+ Reg r = pool[i];
+ if ((*used & (1u << r)) != 0) continue;
+ *used |= 1u << r;
+ return r;
+ }
+ rv_asm_panic(d, "out of registers for asm operands");
+ return REG_NONE;
+}
+
+/* Direct (-O0) path: resolve a semantic Operand to a NativeAddr. */
+static NativeAddr rv_direct_addr(NativeDirectTarget* d, Operand op) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ switch ((OpKind)op.kind) {
+ case OPK_LOCAL:
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = d->locals[op.v.local - 1u].home;
+ addr.base_type = op.type;
+ return addr;
+ case OPK_INDIRECT:
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
+ addr.base.frame = d->locals[op.v.ind.base - 1u].home;
+ addr.cls = d->locals[op.v.ind.base - 1u].cls;
+ addr.base_type = d->locals[op.v.ind.base - 1u].type;
+ addr.offset = op.v.ind.ofs;
+ return addr;
+ default:
+ rv_asm_panic(d, "operand is not addressable");
+ }
+}
+
+/* Materialize an OPK_INDIRECT (frame-value) base into a register, returning a
+ * plain register-based NativeAddr. */
+static NativeAddr rv_direct_materialize_addr(NativeDirectTarget* d,
+ Operand op) {
+ RvNativeTarget* a = rv_of(d->native);
+ NativeAddr addr = rv_direct_addr(d, op);
+ if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
+ NativeLoc base = native_loc_reg(addr.base_type, NATIVE_REG_INT, RV_TMP1);
+ NativeAddr load;
+ memset(&load, 0, sizeof load);
+ load.base_kind = NATIVE_ADDR_BASE_FRAME;
+ load.base.frame = addr.base.frame;
+ load.base_type = addr.base_type;
+ rv_emit_mem(a, 1, base, load,
+ native_mem_for_type(d->native, addr.base_type, 8));
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.base.reg = RV_TMP1;
+ }
+ return addr;
+}
+
+static void rv_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op,
+ NativeLoc dst) {
+ RvNativeTarget* a = rv_of(d->native);
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ switch ((OpKind)op.kind) {
+ case OPK_IMM:
+ if ((NativeAllocClass)dst.cls != NATIVE_REG_INT)
+ rv_asm_panic(d, "floating-point immediate asm input is unsupported");
+ d->native->load_imm(d->native, dst, op.v.imm);
+ return;
+ case OPK_LOCAL:
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = d->locals[op.v.local - 1u].home;
+ addr.base_type = op.type;
+ rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
+ return;
+ case OPK_GLOBAL:
+ addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
+ addr.base.global.sym = op.v.global.sym;
+ addr.base.global.addend = op.v.global.addend;
+ addr.base_type = op.type;
+ d->native->load_addr(d->native, dst, addr);
+ return;
+ case OPK_INDIRECT:
+ addr = rv_direct_materialize_addr(d, op);
+ rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
+ return;
+ }
+ rv_asm_panic(d, "unsupported asm input operand");
+}
+
+static void rv_direct_load_address_to_reg(NativeDirectTarget* d, Operand op,
+ NativeLoc dst) {
+ d->native->load_addr(d->native, dst, rv_direct_addr(d, op));
+}
+
+static void rv_direct_store_reg_to_operand(NativeDirectTarget* d, Operand op,
+ NativeLoc src) {
+ RvNativeTarget* a = rv_of(d->native);
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ if (op.kind == OPK_LOCAL) {
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = d->locals[op.v.local - 1u].home;
+ addr.base_type = op.type;
+ } else {
+ addr = rv_direct_materialize_addr(d, op);
+ }
+ rv_emit_mem(a, 0, src, addr, native_mem_for_type(d->native, op.type, 0));
+}
+
+/* Callee-saved registers an asm block clobbers must be spilled/restored around
+ * the block (the only ABI duty the allocator cannot discharge itself). */
+typedef struct RvAsmSavedClobber {
+ NativeFrameSlot slot;
+ NativeAllocClass cls;
+ Reg reg;
+ KitCgTypeId type;
+} RvAsmSavedClobber;
+
+/* A clobber save slot is register-width: ptr_bytes for an integer reg (4 on
+ * rv32, 8 on rv64) but always 8 for an FP reg (fsd, even on rv32d). */
+static u32 rv_asm_save_bytes(const RvNativeTarget* a, const RvAsmSavedClobber* s) {
+ return s->cls == NATIVE_REG_FP ? 8u : a->variant->ptr_bytes;
+}
+static void rv_asm_save_one(RvNativeTarget* a, RvAsmSavedClobber* s) {
+ NativeFrameSlotDesc desc;
+ NativeAddr addr;
+ u32 sz = rv_asm_save_bytes(a, s);
+ memset(&desc, 0, sizeof desc);
+ desc.type = s->type;
+ desc.size = sz;
+ desc.align = sz;
+ desc.kind = NATIVE_FRAME_SLOT_SAVE;
+ s->slot = a->base.frame_slot(&a->base, &desc);
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = s->slot;
+ addr.base_type = s->type;
+ rv_emit_mem(a, 0, native_loc_reg(s->type, s->cls, s->reg), addr,
+ native_mem_for_type(&a->base, s->type, sz));
+}
+static void rv_asm_restore_one(RvNativeTarget* a, const RvAsmSavedClobber* s) {
+ NativeAddr addr;
+ u32 sz = rv_asm_save_bytes(a, s);
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = s->slot;
+ addr.base_type = s->type;
+ rv_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr,
+ native_mem_for_type(&a->base, s->type, sz));
+}
+
+/* psABI callee-saved: integer s0..s11 (x8,x9,x18..x27), fp fs0..fs11
+ * (f8,f9,f18..f27). x8 is the frame pointer and never asm-clobbered. */
+static int rv_reg_is_callee_int(Reg r) {
+ return r == 9u || (r >= 18u && r <= 27u);
+}
+static int rv_reg_is_callee_fp(Reg r) {
+ return r == 8u || r == 9u || (r >= 18u && r <= 27u);
+}
+
+static RvAsmSavedClobber* rv_asm_save_callee_clobbers(RvNativeTarget* a,
+ u32 int_mask, u32 fp_mask,
+ u32* nsaved_out) {
+ RvAsmSavedClobber* saved =
+ arena_zarray(a->base.c->tu, RvAsmSavedClobber, 24u);
+ KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
+ KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
+ u32 n = 0;
+ Reg r;
+ for (r = 0; r <= 31u; ++r) {
+ if ((int_mask & (1u << r)) == 0 || !rv_reg_is_callee_int(r)) continue;
+ saved[n].cls = NATIVE_REG_INT;
+ saved[n].reg = r;
+ saved[n].type = i64;
+ rv_asm_save_one(a, &saved[n++]);
+ }
+ for (r = 0; r <= 31u; ++r) {
+ if ((fp_mask & (1u << r)) == 0 || !rv_reg_is_callee_fp(r)) continue;
+ saved[n].cls = NATIVE_REG_FP;
+ saved[n].reg = r;
+ saved[n].type = f64;
+ rv_asm_save_one(a, &saved[n++]);
+ }
+ *nsaved_out = n;
+ return saved;
+}
+
+/* ---- NativeTarget (optimizer) asm hook ----
+ * The optimizer pre-allocated every operand register and arranged surrounding
+ * data flow, so this binds pre-allocated registers to the template and only
+ * materializes memory-operand bases into the reserved scratch + spills the
+ * callee-saved registers the asm clobbers. */
+
+static NativeAddr rv_asm_loc_to_addr(RvNativeTarget* a, SrcLoc loc,
+ NativeLoc src) {
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ addr.base_type = src.type;
+ switch ((NativeLocKind)src.kind) {
+ case NATIVE_LOC_FRAME:
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = src.v.frame;
+ return addr;
+ case NATIVE_LOC_ADDR:
+ return src.v.addr;
+ case NATIVE_LOC_GLOBAL:
+ addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
+ addr.base.global.sym = src.v.global.sym;
+ addr.base.global.addend = src.v.global.addend;
+ return addr;
+ case NATIVE_LOC_REG:
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.cls = NATIVE_REG_INT;
+ addr.base.reg = src.v.reg;
+ return addr;
+ default:
+ rv_asm_panic_at(a->base.c, loc, "unsupported memory asm operand");
+ }
+}
+
+/* Resolve a memory-constraint operand to a single base register with zero
+ * offset, folding any frame/global/offset into a reserved scratch register. */
+static Reg rv_asm_native_mem_base(RvNativeTarget* a, SrcLoc loc, NativeLoc src,
+ u32* ntmp) {
+ NativeAddr addr = rv_asm_loc_to_addr(a, loc, src);
+ u32 base;
+ i32 off;
+ Reg dst;
+ if (addr.index_kind != NATIVE_ADDR_INDEX_NONE)
+ rv_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported");
+ rv_resolve_mem_addr(a, &addr, &base, &off);
+ if (off == 0 && base != RV_TMP0 && base != RV_TMP1) return (Reg)base;
+ if (*ntmp >= 2u)
+ rv_asm_panic_at(a->base.c, loc, "too many memory asm operands");
+ dst = (*ntmp == 0u) ? RV_TMP0 : RV_TMP1;
+ (*ntmp)++;
+ rv_emit_addr_adjust(a->variant, a->base.mc, dst, base, off);
+ return dst;
+}
+
+static void rv_asm_bind_native(RvNativeTarget* a, SrcLoc loc, Operand* out,
+ const char* constraint, KitCgTypeId type,
+ NativeLoc src, u32* ntmp) {
+ const char* body = native_asm_constraint_body(constraint);
+ if (body[0] == 'r' || body[0] == 'f') {
+ NativeAllocClass cls = (body[0] == 'f') ? NATIVE_REG_FP : NATIVE_REG_INT;
+ if (src.kind != NATIVE_LOC_REG)
+ rv_asm_panic_at(a->base.c, loc, "register asm operand not in a register");
+ rv_asm_bound_reg(out, type, cls, (Reg)src.v.reg);
+ } else if (body[0] == 'i') {
+ if (src.kind != NATIVE_LOC_IMM)
+ rv_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate");
+ memset(out, 0, sizeof *out);
+ out->kind = OPK_IMM;
+ out->type = type;
+ out->v.imm = src.v.imm;
+ } else if (body[0] == 'm') {
+ rv_asm_bound_mem(out, type, rv_asm_native_mem_base(a, loc, src, ntmp));
+ } else {
+ rv_asm_panic_at(a->base.c, loc, "unsupported asm constraint");
+ }
+}
+
+static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
+ const AsmConstraint* outs, u32 nout,
+ NativeLoc* out_locs, const AsmConstraint* ins,
+ u32 nin, const NativeLoc* in_locs,
+ const Sym* clobbers, u32 nclob) {
+ RvNativeTarget* a = rv_of(t);
+ Compiler* c = t->c;
+ SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
+ Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
+ Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
+ u32 ntmp = 0, i;
+ Rv64Asm* asmh;
+
+ for (i = 0; i < nout; ++i) {
+ KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
+ rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
+ &ntmp);
+ }
+ for (i = 0; i < nin; ++i) {
+ const char* body = native_asm_constraint_body(ins[i].str);
+ int matched = native_asm_match_index(body);
+ KitCgTypeId type;
+ NativeLoc inloc;
+ if (matched >= 0) {
+ if ((u32)matched >= nout)
+ rv_asm_panic_at(c, loc, "matching constraint out of range");
+ bound_ins[i] = bound_outs[matched];
+ continue;
+ }
+ type = ins[i].type ? ins[i].type : in_locs[i].type;
+ inloc = in_locs[i];
+ /* A register-constrained input that lives in a frame slot (address-taken
+ * local) must be loaded into a reserved scratch first. */
+ if (body[0] == 'r' && inloc.kind != NATIVE_LOC_REG) {
+ Reg r;
+ if (ntmp >= 2u) rv_asm_panic_at(c, loc, "too many memory asm operands");
+ r = (ntmp == 0u) ? RV_TMP0 : RV_TMP1;
+ ntmp++;
+ inloc = native_loc_reg(type, NATIVE_REG_INT, r);
+ rv_emit_mem(a, 1, inloc, rv_asm_loc_to_addr(a, loc, in_locs[i]),
+ native_mem_for_type(t, type, native_type_size(t, type)));
+ }
+ rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
+ }
+
+ /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
+ * masks and rv_known_callee_saves folded the callee-saved ones into the
+ * function's saved set, so the prologue/epilogue already preserve them. */
+ asmh = rv64_asm_open(c);
+ rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
+ nclob);
+ rv64_asm_run_template(asmh, t->mc, tmpl);
+ rv64_asm_close(asmh);
+}
+/* file_scope_asm + finalize are shared (cg/native_asm.h). */
+
+static void rv_trap(NativeTarget* t) { rv64_emit32(t->mc, rv_ebreak()); }
+static void rv_set_loc(NativeTarget* t, SrcLoc loc) {
+ rv_of(t)->loc = loc;
+ if (t->mc->set_loc) t->mc->set_loc(t->mc, loc);
+}
+
+/* ============================ construction ============================ */
+
+NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj,
+ MCEmitter* mc) {
+ RvNativeTarget* a = arena_znew(c->tu, RvNativeTarget);
+ NativeTarget* t;
+ if (!a) return NULL;
+ t = &a->base;
+ t->c = c;
+ t->obj = obj;
+ t->mc = mc;
+ a->variant = riscv_variant_for_kind(c->target.arch);
+ native_frame_init(&a->frame, c);
+ t->regs = &rv_reg_info;
+ t->class_for_type = native_class_for_type_fp_le8;
+ t->imm_legal = rv_imm_legal;
+ t->addr_legal = rv_addr_legal;
+ t->func_begin = rv_func_begin;
+ t->func_begin_known_frame = rv_func_begin_known_frame;
+ t->note_frame_state = NULL;
+ /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
+ * set; rv_func_begin_known_frame derives the records from the masks. */
+ t->reserve_callee_saves = rv_reserve_callee_saves;
+ t->signature_stack_bytes = rv_signature_stack_bytes;
+ t->call_stack_bytes = rv_call_stack_bytes;
+ t->has_store_zero_reg = 1;
+ t->store_zero_reg = RV_ZERO;
+ t->func_end = rv_func_end;
+ t->frame_slot = rv_frame_slot;
+ t->frame_slot_debug_loc = rv_frame_slot_debug_loc;
+ t->bind_param = rv_bind_native_param;
+ t->label_new = rv_label_new;
+ t->label_place = rv_label_place;
+ t->jump = rv_jump;
+ t->cmp_branch = rv_cmp_branch;
+ t->indirect_branch = rv_indirect_branch;
+ t->load_label_addr = rv_load_label_addr;
+ t->move = rv_move;
+ t->load_imm = rv_load_imm;
+ t->load_const = rv_load_const;
+ t->load_addr = rv_load_addr;
+ t->load = rv_load;
+ t->store = rv_store;
+ t->tls_addr_of = rv_tls_addr_of;
+ t->copy_bytes = rv_copy_bytes;
+ t->set_bytes = rv_set_bytes;
+ t->bitfield_load = rv_bitfield_load;
+ t->bitfield_store = rv_bitfield_store;
+ t->binop = rv_binop;
+ t->unop = rv_unop;
+ t->cmp = rv_cmp;
+ t->convert = rv_convert;
+ t->alloca_ = rv_alloca;
+ t->spill = rv_spill;
+ t->reload = rv_reload;
+ t->plan_call = rv_plan_call;
+ t->emit_call = rv_emit_call;
+ t->plan_ret = rv_plan_ret;
+ t->ret = rv_ret;
+ t->atomic_load = rv_atomic_load;
+ t->atomic_store = rv_atomic_store;
+ t->atomic_rmw = rv_atomic_rmw;
+ t->atomic_cas = rv_atomic_cas;
+ t->fence = rv_fence;
+ t->va_start_ = rv_va_start_native;
+ t->va_arg_ = rv_va_arg_native;
+ t->va_end_ = rv_va_end_native;
+ t->va_copy_ = rv_va_copy_native;
+ t->intrinsic = rv_intrinsic;
+ t->asm_block = rv_asm_block_native;
+ t->file_scope_asm = native_file_scope_asm;
+ t->trap = rv_trap;
+ t->set_loc = rv_set_loc;
+ t->finalize = native_finalize;
+ return t;
+}
+
+/* ============================ NativeOps (-O0) ============================ */
+
+static void rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
+ CGLocal local, NativeDirectLocal* l) {
+ NativeLoc dst;
+ (void)local;
+ memset(&dst, 0, sizeof dst);
+ dst.kind = NATIVE_LOC_FRAME;
+ dst.type = p->type;
+ dst.v.frame = l->home;
+ rv_bind_native_param(d->native, p, dst);
+}
+
+/* A sibling call is realizable when its outgoing stack-argument area fits the
+ * window the caller itself received (so the args land in the caller's incoming
+ * slots without overflowing into the caller's caller's frame). Register-only
+ * calls (the common case) always qualify. Mirrors aa64's aa_no_tail. */
+static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
+ RvNativeTarget* a = rv_of(d->native);
+ NativeCallDesc nd;
+ NativeLoc* args = NULL;
+ NativeLoc* results = NULL;
+ u32 i, stack;
+ if (a->frame.ncallee_saves)
+ return "rv64 tail call: callee-saved registers in use";
+ memset(&nd, 0, sizeof nd);
+ if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
+ if (call->nresults)
+ results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults);
+ for (i = 0; i < call->nargs; ++i) {
+ args[i].kind = NATIVE_LOC_FRAME;
+ args[i].type = d->locals[call->args[i] - 1u].type;
+ args[i].cls = d->locals[call->args[i] - 1u].cls;
+ args[i].v.frame = d->locals[call->args[i] - 1u].home;
+ }
+ for (i = 0; i < call->nresults; ++i) {
+ results[i].kind = NATIVE_LOC_FRAME;
+ results[i].type = d->locals[call->results[i] - 1u].type;
+ results[i].cls = d->locals[call->results[i] - 1u].cls;
+ results[i].v.frame = d->locals[call->results[i] - 1u].home;
+ }
+ nd.fn_type = call->fn_type;
+ nd.args = args;
+ nd.results = results;
+ nd.nargs = call->nargs;
+ nd.nresults = call->nresults;
+ stack = rv_call_stack_size(d->native, &nd);
+ if (stack > a->incoming_stack_size)
+ return "rv64 tail call: stack argument area too small";
+ return NULL;
+}
+
+/* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`
+ * and return a register-based NativeAddr. An OPK_LOCAL holds the va_list object
+ * itself, so we take its frame address; an OPK_INDIRECT holds the pointer in
+ * memory and must be loaded. The va cores use TMP1/TMP2 internally, so `reg`
+ * must be distinct from those (callers pass TMP0 / TMP3). */
+/* ap_addr is the pointer value &ap (the va_list object's address). For an
+ * OPK_LOCAL the local HOLDS that pointer, so load its home value; an
+ * OPK_INDIRECT names *(base+ofs), whose address base+ofs is the pointer.
+ * Mirrors aa64's aa_direct_pointer_addr. */
+static NativeAddr rv_direct_pointer_addr(NativeDirectTarget* d, Operand op) {
+ RvNativeTarget* a = rv_of(d->native);
+ NativeAddr addr;
+ memset(&addr, 0, sizeof addr);
+ if (op.kind == OPK_LOCAL) {
+ NativeLoc base = native_loc_reg(op.type, NATIVE_REG_INT, RV_TMP1);
+ NativeAddr load;
+ memset(&load, 0, sizeof load);
+ load.base_kind = NATIVE_ADDR_BASE_FRAME;
+ load.base.frame = d->locals[op.v.local - 1u].home;
+ load.base_type = op.type;
+ rv_emit_mem(a, 1, base, load, native_mem_for_type(d->native, op.type, 8));
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.base.reg = RV_TMP1;
+ addr.base_type = op.type;
+ return addr;
+ }
+ return rv_direct_materialize_addr(d, op);
+}
+
+static NativeAddr rv_direct_va_base(NativeDirectTarget* d, Operand ap_addr,
+ Reg reg) {
+ NativeLoc dst =
+ native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg);
+ NativeAddr addr;
+ d->native->load_addr(d->native, dst, rv_direct_pointer_addr(d, ap_addr));
+ memset(&addr, 0, sizeof addr);
+ addr.base_kind = NATIVE_ADDR_BASE_REG;
+ addr.cls = NATIVE_REG_INT;
+ addr.base.reg = reg;
+ addr.base_type = builtin_id(KIT_CG_BUILTIN_I64);
+ return addr;
+}
+
+static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) {
+ rv_va_start_core(rv_of(d->native), rv_direct_va_base(d, ap_addr, RV_TMP3));
+}
+static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr,
+ KitCgTypeId type) {
+ RvNativeTarget* a = rv_of(d->native);
+ /* Float-ABI-aware class: a soft (or wider-than-flen) float is INT-class so
+ * the va_arg fetch never lands a double in an FP register on rv32. */
+ NativeAllocClass cls = native_class_for_type_fp_le8(d->native, type);
+ NativeLoc res = native_loc_reg(type, cls,
+ cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0);
+ NativeAddr dst_addr;
+ rv_va_arg_core(a, res, rv_direct_va_base(d, ap_addr, RV_TMP3), type);
+ /* Store the fetched value back into the semantic destination. */
+ dst_addr = rv_direct_addr(d, dst);
+ if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
+ NativeLoc base =
+ native_loc_reg(dst_addr.base_type, NATIVE_REG_INT, RV_TMP1);
+ NativeAddr load;
+ memset(&load, 0, sizeof load);
+ load.base_kind = NATIVE_ADDR_BASE_FRAME;
+ load.base.frame = dst_addr.base.frame;
+ load.base_type = dst_addr.base_type;
+ rv_emit_mem(a, 1, base, load,
+ native_mem_for_type(d->native, dst_addr.base_type, 8));
+ dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
+ dst_addr.base.reg = RV_TMP1;
+ }
+ rv_emit_mem(
+ a, 0, res, dst_addr,
+ native_mem_for_type(d->native, type, native_type_size(d->native, type)));
+}
+static void rv_va_end_(NativeDirectTarget* d, Operand ap_addr) {
+ (void)d;
+ (void)ap_addr;
+}
+static void rv_va_copy_(NativeDirectTarget* d, Operand dst, Operand src) {
+ RvNativeTarget* a = rv_of(d->native);
+ NativeAddr src_ap = rv_direct_va_base(d, src, RV_TMP0);
+ NativeAddr dst_ap = rv_direct_va_base(d, dst, RV_TMP3);
+ rv_va_copy_core(a, dst_ap, src_ap);
+}
+
+static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
+ const AsmConstraint* outs, u32 nout,
+ Operand* out_ops, const AsmConstraint* ins,
+ u32 nin, const Operand* in_ops,
+ const Sym* clobbers, u32 nclob,
+ u32 clobber_abi_sets) {
+ RvNativeTarget* a = rv_of(d->native);
+ Compiler* c = d->base.c;
+ Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
+ Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
+ u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
+ RvAsmSavedClobber* saved;
+ u32 nsaved, i;
+ Rv64Asm* asmh;
+
+ rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
+ native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp);
+ clob_int |= abi_int;
+ clob_fp |= abi_fp;
+ /* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer
+ * so the operand allocator never hands them out. */
+ used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) |
+ (1u << RV_GP) | (1u << RV_TP) | (1u << RV_TMP0) | (1u << RV_TMP1) |
+ (1u << RV_TMP2) | (1u << RV_TMP3) | (1u << RV_S0);
+ used_fp =
+ clob_fp | (1u << RV_FTMP0) | (1u << RV_FTMP1) | (1u << 2u) | (1u << 3u);
+
+ for (i = 0; i < nout; ++i) {
+ const char* body = native_asm_constraint_body(outs[i].str);
+ KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
+ NativeAsmRegPin pin;
+ if (rv_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) {
+ /* GNU local register variable: pin to the named hard register. */
+ if (pin.cls == NATIVE_REG_FP) {
+ used_fp |= 1u << pin.reg;
+ clob_fp |= 1u << pin.reg;
+ } else {
+ used_int |= 1u << pin.reg;
+ clob_int |= 1u << pin.reg;
+ }
+ rv_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg);
+ } else if (body[0] == 'r' || body[0] == 'f') {
+ NativeAllocClass cls = rv_asm_constraint_class(d, body);
+ Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp);
+ rv_asm_bound_reg(&bound_outs[i], type, cls, reg);
+ } else if (body[0] == 'm') {
+ Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp);
+ rv_asm_bound_mem(&bound_outs[i], type, reg);
+ } else {
+ rv_asm_panic(d, "unsupported output constraint");
+ }
+ }
+
+ for (i = 0; i < nin; ++i) {
+ const char* body = native_asm_constraint_body(ins[i].str);
+ int matched = native_asm_match_index(body);
+ KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
+ if (matched >= 0) {
+ if ((u32)matched >= nout)
+ rv_asm_panic(d, "matching constraint out of range");
+ if (native_asm_constraint_early(outs[matched].str))
+ rv_asm_panic(d, "matching input names early-clobber output");
+ if (bound_outs[matched].kind != RV64_INLINE_OPK_REG)
+ rv_asm_panic(d, "matching constraint requires register output");
+ bound_ins[i] = bound_outs[matched];
+ continue;
+ }
+ NativeAsmRegPin pin;
+ if (rv_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) {
+ /* GNU local register variable: pin to the named hard register. */
+ if (pin.cls == NATIVE_REG_FP) {
+ used_fp |= 1u << pin.reg;
+ clob_fp |= 1u << pin.reg;
+ } else {
+ used_int |= 1u << pin.reg;
+ clob_int |= 1u << pin.reg;
+ }
+ rv_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg);
+ } else if (body[0] == 'r' || body[0] == 'f') {
+ NativeAllocClass cls = rv_asm_constraint_class(d, body);
+ Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp);
+ rv_asm_bound_reg(&bound_ins[i], type, cls, reg);
+ } else if (body[0] == 'i') {
+ if (in_ops[i].kind != OPK_IMM)
+ rv_asm_panic(d, "immediate constraint requires immediate operand");
+ bound_ins[i] = in_ops[i];
+ } else if (body[0] == 'm') {
+ Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp);
+ rv_asm_bound_mem(&bound_ins[i], type, reg);
+ } else {
+ rv_asm_panic(d, "unsupported input constraint");
+ }
+ }
+
+ saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
+ for (i = 0; i < nout; ++i) {
+ if (bound_outs[i].kind == RV64_INLINE_OPK_REG) {
+ NativeAllocClass cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP
+ ? NATIVE_REG_FP
+ : NATIVE_REG_INT;
+ if (outs[i].dir == KIT_CG_ASM_INOUT) {
+ rv_direct_load_operand_to_reg(
+ d, out_ops[i],
+ native_loc_reg(bound_outs[i].type, cls,
+ (Reg)bound_outs[i].v.local));
+ }
+ } else if (bound_outs[i].kind == OPK_INDIRECT) {
+ NativeLoc loc =
+ native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
+ (Reg)bound_outs[i].v.ind.base);
+ rv_direct_load_address_to_reg(d, out_ops[i], loc);
+ }
+ }
+ for (i = 0; i < nin; ++i) {
+ if (bound_ins[i].kind == RV64_INLINE_OPK_REG) {
+ NativeAllocClass cls = bound_ins[i].pad[0] == RV64_INLINE_OPCLS_FP
+ ? NATIVE_REG_FP
+ : NATIVE_REG_INT;
+ rv_direct_load_operand_to_reg(
+ d, in_ops[i],
+ native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local));
+ } else if (bound_ins[i].kind == OPK_INDIRECT) {
+ NativeLoc loc =
+ native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
+ (Reg)bound_ins[i].v.ind.base);
+ rv_direct_load_address_to_reg(d, in_ops[i], loc);
+ }
+ }
+ asmh = rv64_asm_open(c);
+ rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
+ nclob);
+ rv64_asm_run_template(asmh, d->native->mc, tmpl);
+ rv64_asm_close(asmh);
+
+ for (i = 0; i < nout; ++i) {
+ NativeAllocClass cls;
+ NativeLoc src;
+ if (bound_outs[i].kind != RV64_INLINE_OPK_REG) continue;
+ cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP ? NATIVE_REG_FP
+ : NATIVE_REG_INT;
+ src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
+ rv_direct_store_reg_to_operand(d, out_ops[i], src);
+ }
+ for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]);
+}
+
+static const NativeOps rv_direct_ops = {
+ .bind_param = rv_bind_param,
+ .tail_call_unrealizable_reason = rv_no_tail,
+ .va_start_ = rv_va_start_,
+ .va_arg_ = rv_va_arg_,
+ .va_end_ = rv_va_end_,
+ .va_copy_ = rv_va_copy_,
+ .asm_block = rv_direct_asm_block,
+};
+
+const NativeOps* rv64_native_direct_ops(void) { return &rv_direct_ops; }
diff --git a/src/arch/riscv/regs.c b/src/arch/riscv/regs.c
@@ -0,0 +1,99 @@
+/* RV64 register name table -- DWARF index <-> psABI assembler name.
+ *
+ * RISC-V DWARF numbering uses 0..31 for x-registers and 32..63 for
+ * f-registers. Canonical names use psABI spellings; xN/fN aliases are
+ * accepted by lookup. */
+
+#include "arch/riscv/regs.h"
+
+#include <stdint.h>
+
+#include "core/core.h"
+#include "core/slice.h"
+
+typedef struct Rv64Reg {
+ uint32_t dwarf_idx;
+ const char* name;
+} Rv64Reg;
+
+static const Rv64Reg RV64_REGS[] = {
+ {0, "zero"}, {1, "ra"}, {2, "sp"}, {3, "gp"}, {4, "tp"},
+ {5, "t0"}, {6, "t1"}, {7, "t2"}, {8, "s0"}, {9, "s1"},
+ {10, "a0"}, {11, "a1"}, {12, "a2"}, {13, "a3"}, {14, "a4"},
+ {15, "a5"}, {16, "a6"}, {17, "a7"}, {18, "s2"}, {19, "s3"},
+ {20, "s4"}, {21, "s5"}, {22, "s6"}, {23, "s7"}, {24, "s8"},
+ {25, "s9"}, {26, "s10"}, {27, "s11"}, {28, "t3"}, {29, "t4"},
+ {30, "t5"}, {31, "t6"},
+
+ {32, "ft0"}, {33, "ft1"}, {34, "ft2"}, {35, "ft3"}, {36, "ft4"},
+ {37, "ft5"}, {38, "ft6"}, {39, "ft7"}, {40, "fs0"}, {41, "fs1"},
+ {42, "fa0"}, {43, "fa1"}, {44, "fa2"}, {45, "fa3"}, {46, "fa4"},
+ {47, "fa5"}, {48, "fa6"}, {49, "fa7"}, {50, "fs2"}, {51, "fs3"},
+ {52, "fs4"}, {53, "fs5"}, {54, "fs6"}, {55, "fs7"}, {56, "fs8"},
+ {57, "fs9"}, {58, "fs10"}, {59, "fs11"}, {60, "ft8"}, {61, "ft9"},
+ {62, "ft10"}, {63, "ft11"},
+};
+
+static const uint32_t RV64_REGS_N =
+ (uint32_t)(sizeof RV64_REGS / sizeof RV64_REGS[0]);
+
+static int parse_num_suffix(const char* name, char prefix, uint32_t max,
+ uint32_t* out) {
+ uint32_t v = 0;
+ const char* p;
+ if (!name || name[0] != prefix || name[1] == '\0') return 1;
+ p = name + 1;
+ while (*p) {
+ if (*p < '0' || *p > '9') return 1;
+ v = v * 10u + (uint32_t)(*p - '0');
+ if (v > max) return 1;
+ ++p;
+ }
+ if (out) *out = v;
+ return 0;
+}
+
+const char* rv64_register_name(uint32_t dwarf_idx) {
+ uint32_t i;
+ for (i = 0; i < RV64_REGS_N; ++i) {
+ if (RV64_REGS[i].dwarf_idx == dwarf_idx) return RV64_REGS[i].name;
+ }
+ return NULL;
+}
+
+int rv64_register_index(const char* name, uint32_t* idx_out) {
+ uint32_t i;
+ uint32_t n;
+ Slice q;
+ if (!name) return 1;
+ q = slice_from_cstr(name);
+ for (i = 0; i < RV64_REGS_N; ++i) {
+ if (slice_eq_cstr(q, RV64_REGS[i].name)) {
+ if (idx_out) *idx_out = RV64_REGS[i].dwarf_idx;
+ return 0;
+ }
+ }
+ if (!parse_num_suffix(name, 'x', 31, &n)) {
+ if (idx_out) *idx_out = n;
+ return 0;
+ }
+ if (!parse_num_suffix(name, 'f', 31, &n)) {
+ if (idx_out) *idx_out = 32u + n;
+ return 0;
+ }
+ if (slice_eq_cstr(q, "fp")) {
+ if (idx_out) *idx_out = 8u;
+ return 0;
+ }
+ return 1;
+}
+
+uint32_t rv64_register_iter_size(void) { return RV64_REGS_N; }
+
+int rv64_register_iter_get(uint32_t i, uint32_t* dwarf_out,
+ const char** name_out) {
+ if (i >= RV64_REGS_N) return 1;
+ if (dwarf_out) *dwarf_out = RV64_REGS[i].dwarf_idx;
+ if (name_out) *name_out = RV64_REGS[i].name;
+ return 0;
+}
diff --git a/src/arch/rv64/regs.h b/src/arch/riscv/regs.h
diff --git a/src/arch/rv64/rv64.h b/src/arch/riscv/rv64.h
diff --git a/src/arch/riscv/variant.c b/src/arch/riscv/variant.c
@@ -0,0 +1,30 @@
+/* The two immutable RISC-V XLEN variants. See variant.h for the contract. */
+#include "arch/riscv/variant.h"
+
+const RiscvVariant riscv_variant_rv32 = {
+ .kind = KIT_ARCH_RV32,
+ .name = "rv32",
+ .isa_prefix = "rv32",
+ .xlen = 32u,
+ .ptr_bytes = 4u,
+ .gp_slot_bytes = 4u,
+ .has_w_forms = 0u,
+ .shamt_bits = 5u,
+ .frame_save_size = 8u, /* 2 * ptr_bytes */
+};
+
+const RiscvVariant riscv_variant_rv64 = {
+ .kind = KIT_ARCH_RV64,
+ .name = "rv64",
+ .isa_prefix = "rv64",
+ .xlen = 64u,
+ .ptr_bytes = 8u,
+ .gp_slot_bytes = 8u,
+ .has_w_forms = 1u,
+ .shamt_bits = 6u,
+ .frame_save_size = 16u, /* 2 * ptr_bytes */
+};
+
+const RiscvVariant* riscv_variant_for_kind(KitArchKind kind) {
+ return kind == KIT_ARCH_RV32 ? &riscv_variant_rv32 : &riscv_variant_rv64;
+}
diff --git a/src/arch/riscv/variant.h b/src/arch/riscv/variant.h
@@ -0,0 +1,40 @@
+/* RISC-V XLEN variant descriptor — an immutable per-XLEN table threaded
+ * through the otherwise stateless decode / asm / disasm / link / dbg / native
+ * paths so a single RISC-V backend serves both rv32 and rv64. It is always
+ * reached through a context (RvNativeTarget.variant, or
+ * riscv_variant_for_kind(c->target.arch) in the stateless paths), never as
+ * ambient global state.
+ *
+ * Three distinct widths are deliberately split out so rv32 cannot be derived
+ * by conflating them (all three are 8 on rv64, which is why the rv64-only code
+ * could get away with a single literal):
+ * - ptr_bytes pointer / native GPR width (4 / 8)
+ * - gp_slot_bytes ABI vararg-save & callee-save slot stride (4 / 8)
+ * - frame_save_size saved ra+s0 pair size = 2 * ptr_bytes (8 / 16)
+ */
+#ifndef KIT_ARCH_RISCV_VARIANT_H
+#define KIT_ARCH_RISCV_VARIANT_H
+
+#include "core/core.h" /* u8 / u32, KitArchKind */
+
+typedef struct RiscvVariant {
+ KitArchKind kind; /* KIT_ARCH_RV32 / KIT_ARCH_RV64 */
+ const char* name; /* "rv32" / "rv64" */
+ const char* isa_prefix; /* "rv32" / "rv64" — for -march parsing */
+ u8 xlen; /* 32 / 64 */
+ u8 ptr_bytes; /* 4 / 8 — pointer & native register width */
+ u8 gp_slot_bytes; /* 4 / 8 — vararg-save & callee-save slot stride */
+ u8 has_w_forms; /* 0 rv32 / 1 rv64 — ADDW/ADDIW/SLLIW/... */
+ u8 shamt_bits; /* 5 rv32 / 6 rv64 — SLLI/SRLI/SRAI immediate */
+ u32 frame_save_size; /* 2 * ptr_bytes (8 rv32 / 16 rv64) */
+} RiscvVariant;
+
+extern const RiscvVariant riscv_variant_rv32;
+extern const RiscvVariant riscv_variant_rv64;
+
+/* Returns the variant for KIT_ARCH_RV32 / KIT_ARCH_RV64. Any other arch maps
+ * to the rv64 variant (the historical default), so shared code that only ever
+ * sees a RISC-V kind never dereferences NULL. */
+const RiscvVariant* riscv_variant_for_kind(KitArchKind kind);
+
+#endif
diff --git a/src/arch/rv64/arch.c b/src/arch/rv64/arch.c
@@ -1,347 +0,0 @@
-#include "arch/arch.h"
-
-#include <string.h>
-
-#include "arch/rv64/asm.h"
-#include "arch/rv64/disasm.h"
-#include "arch/rv64/regs.h"
-#include "arch/rv64/rv64.h"
-#include "cg/native_direct_target.h"
-#include "core/bytes.h"
-#include "link/link_arch.h"
-#include "obj/obj.h"
-
-extern const LinkArchDesc link_arch_rv64;
-extern const ArchDbgOps rv64_dbg_ops;
-extern const ArchEmuOps rv64_emu_ops;
-extern const ArchDwarfOps rv64_dwarf_ops;
-extern const ArchAsmOps rv64_asm_ops;
-
-static int rv64_register_at_public(uint32_t idx, KitArchReg* out) {
- const char* nm = NULL;
- int rc;
- if (!out) return 1;
- rc = rv64_register_iter_get(idx, &out->dwarf_idx, &nm);
- if (rc == 0) out->name = kit_slice_cstr(nm);
- return rc;
-}
-
-static SrcLoc rv64_no_loc(void) {
- SrcLoc l = {0, 0, 0};
- return l;
-}
-
-static int rv64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) {
- const Section* s;
- u8 cur[4];
- u32 word;
- u32 b;
-
- (void)c;
- if (!fx) return 1;
- s = obj_section_get(fx->obj, fx->sec_id);
- if (!s) return 0;
-
- /* INTRA_AUIPC_ADDI is a width=8 pair; other kinds patch a single 4-byte
- * instruction. Read the first word only for the 4-byte cases. */
- if (fx->kind != R_RV_INTRA_AUIPC_ADDI) {
- if (fx->width != 4) return 1;
- buf_read(&s->bytes, fx->offset, cur, 4);
- word = rd_u32_le(cur);
- } else {
- buf_read(&s->bytes, fx->offset, cur, 4);
- word = rd_u32_le(cur);
- }
- b = (u32)fx->disp;
-
- switch (fx->kind) {
- case R_RV_BRANCH:
- /* B-type reaches ±4 KiB. Conditional branches are emitted over a jal
- * (see rv_cmp_branch) so this only carries small fixed displacements;
- * a violation is a backend bug, not silently-truncated code. */
- if ((i64)fx->disp < -(i64)(1 << 12) || (i64)fx->disp >= (i64)(1 << 12))
- compiler_panic(c, rv64_no_loc(), "rv64: BRANCH out of range (±4KiB)");
- word &= 0x01fff07fu;
- word |= ((b >> 12) & 1u) << 31;
- word |= ((b >> 5) & 0x3fu) << 25;
- word |= ((b >> 1) & 0xfu) << 8;
- word |= ((b >> 11) & 1u) << 7;
- break;
- case R_RV_JAL:
- /* J-type reaches ±1 MiB — ample for intra-function jumps (including the
- * long leg of a conditional branch). Fail loudly rather than wrap. */
- if ((i64)fx->disp < -(i64)(1 << 20) || (i64)fx->disp >= (i64)(1 << 20))
- compiler_panic(c, rv64_no_loc(), "rv64: JAL out of range (±1MiB)");
- word &= 0x00000fffu;
- word |= ((b >> 20) & 1u) << 31;
- word |= ((b >> 1) & 0x3ffu) << 21;
- word |= ((b >> 11) & 1u) << 20;
- word |= ((b >> 12) & 0xffu) << 12;
- break;
- case R_RV_INTRA_AUIPC_ADDI: {
- /* width=8: patch both the AUIPC at fx->offset and the ADDI at
- * fx->offset+4. disp is the byte offset from the AUIPC PC to the
- * target label. */
- u8 cur2[4];
- u32 word2;
- i32 disp = (i32)fx->disp;
- /* hi20 is the top 20 bits of (disp + 0x800) so the sign-extended
- * 12-bit lo12 cancels out. */
- u32 hi20 = (u32)((disp + 0x800) >> 12) & 0xfffffu;
- u32 lo12 = (u32)disp & 0xfffu;
- if (fx->width != 8) return 1;
- /* AUIPC: keep rd (bits 11:7) and opcode (bits 6:0); patch imm[31:12]. */
- word = (word & 0x00000fffu) | (hi20 << 12);
- wr_u32_le(cur, word);
- obj_patch(fx->obj, fx->sec_id, fx->offset, cur, 4);
- buf_read(&s->bytes, fx->offset + 4, cur2, 4);
- word2 = rd_u32_le(cur2);
- /* ADDI: keep rs1/funct3/rd/opcode (bits 19:0); patch imm[11:0]. */
- word2 = (word2 & 0x000fffffu) | (lo12 << 20);
- wr_u32_le(cur2, word2);
- obj_patch(fx->obj, fx->sec_id, fx->offset + 4, cur2, 4);
- return 0;
- }
- default:
- return 1;
- }
-
- wr_u32_le(cur, word);
- obj_patch(fx->obj, fx->sec_id, fx->offset, cur, 4);
- return 0;
-}
-
-/* Mirrors `clang --target=riscv64-linux-gnu -E -dM` for the in-scope
- * RV64GC profile: I/M/F/D/A/C + Zicsr-minimal. Macros that depend on
- * extensions outside scope (V, B, Zve*, Zfh, …) are deliberately
- * absent. ABI variant is lp64d. */
-static const KitPredefinedMacro rv64_predefined_macros[] = {
- {KIT_SLICE_LIT("__riscv"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_xlen"), KIT_SLICE_LIT("64")},
- {KIT_SLICE_LIT("__riscv_float_abi_double"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_atomic"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_mul"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_div"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_muldiv"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_compressed"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_flen"), KIT_SLICE_LIT("64")},
- {KIT_SLICE_LIT("__riscv_fdiv"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_fsqrt"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_zicsr"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_zifencei"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__riscv_arch_test"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__LP64__"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("_LP64"), KIT_SLICE_LIT("1")},
- {KIT_SLICE_LIT("__ORDER_LITTLE_ENDIAN__"), KIT_SLICE_LIT("1234")},
- {KIT_SLICE_LIT("__ORDER_BIG_ENDIAN__"), KIT_SLICE_LIT("4321")},
- {KIT_SLICE_LIT("__BYTE_ORDER__"), KIT_SLICE_LIT("__ORDER_LITTLE_ENDIAN__")},
- {KIT_SLICE_LIT("__LITTLE_ENDIAN__"), KIT_SLICE_LIT("1")},
-};
-
-enum {
- RV64_FEAT_I = 0,
- RV64_FEAT_M,
- RV64_FEAT_A,
- RV64_FEAT_F,
- RV64_FEAT_D,
- RV64_FEAT_C,
- RV64_FEAT_ZICSR,
- RV64_FEAT_ZIFENCEI,
-};
-
-static const ArchTargetFeature rv64_target_features[] = {
- {"i"}, {"m"}, {"a"}, {"f"}, {"d"}, {"c"}, {"zicsr"}, {"zifencei"},
-};
-
-static void rv64_feature_set(u64* words, u32 nwords, u32 idx) {
- if (!words || idx / 64u >= nwords) return;
- words[idx / 64u] |= 1ull << (idx % 64u);
-}
-
-static void rv64_feature_clear(u64* words, u32 nwords, u32 idx) {
- if (!words || idx / 64u >= nwords) return;
- words[idx / 64u] &= ~(1ull << (idx % 64u));
-}
-
-static void rv64_feature_disable_all(u64* words, u32 nwords) {
- rv64_feature_clear(words, nwords, RV64_FEAT_I);
- rv64_feature_clear(words, nwords, RV64_FEAT_M);
- rv64_feature_clear(words, nwords, RV64_FEAT_A);
- rv64_feature_clear(words, nwords, RV64_FEAT_F);
- rv64_feature_clear(words, nwords, RV64_FEAT_D);
- rv64_feature_clear(words, nwords, RV64_FEAT_C);
- rv64_feature_clear(words, nwords, RV64_FEAT_ZICSR);
- rv64_feature_clear(words, nwords, RV64_FEAT_ZIFENCEI);
-}
-
-static void rv64_feature_enable_g(u64* words, u32 nwords) {
- rv64_feature_set(words, nwords, RV64_FEAT_I);
- rv64_feature_set(words, nwords, RV64_FEAT_M);
- rv64_feature_set(words, nwords, RV64_FEAT_A);
- rv64_feature_set(words, nwords, RV64_FEAT_F);
- rv64_feature_set(words, nwords, RV64_FEAT_D);
- rv64_feature_set(words, nwords, RV64_FEAT_ZICSR);
- rv64_feature_set(words, nwords, RV64_FEAT_ZIFENCEI);
-}
-
-static int rv64_has_prefix(const char* p, const char* end, const char* lit) {
- size_t n = strlen(lit);
- return (size_t)(end - p) >= n && memcmp(p, lit, n) == 0;
-}
-
-static void rv64_skip_version(const char** pp, const char* end) {
- const char* p = *pp;
- while (p < end && ((*p >= '0' && *p <= '9') || *p == 'p')) ++p;
- *pp = p;
-}
-
-static KitStatus rv64_target_feature_apply_isa(const Target* target,
- KitSlice isa, u64* words,
- u32 nwords) {
- const char* p;
- const char* end;
- (void)target;
- if (isa.len < 5 || memcmp(isa.s, "rv64", 4) != 0) return KIT_UNSUPPORTED;
- p = isa.s + 4;
- end = isa.s + isa.len;
- rv64_feature_disable_all(words, nwords);
- while (p < end) {
- if (*p == '_') {
- ++p;
- continue;
- }
- switch (*p) {
- case 'i':
- rv64_feature_set(words, nwords, RV64_FEAT_I);
- ++p;
- rv64_skip_version(&p, end);
- continue;
- case 'm':
- rv64_feature_set(words, nwords, RV64_FEAT_M);
- ++p;
- rv64_skip_version(&p, end);
- continue;
- case 'a':
- rv64_feature_set(words, nwords, RV64_FEAT_A);
- ++p;
- rv64_skip_version(&p, end);
- continue;
- case 'f':
- rv64_feature_set(words, nwords, RV64_FEAT_F);
- ++p;
- rv64_skip_version(&p, end);
- continue;
- case 'd':
- rv64_feature_set(words, nwords, RV64_FEAT_D);
- ++p;
- rv64_skip_version(&p, end);
- continue;
- case 'c':
- rv64_feature_set(words, nwords, RV64_FEAT_C);
- ++p;
- rv64_skip_version(&p, end);
- continue;
- case 'g':
- rv64_feature_enable_g(words, nwords);
- ++p;
- rv64_skip_version(&p, end);
- continue;
- case 'z':
- if (rv64_has_prefix(p, end, "zicsr")) {
- rv64_feature_set(words, nwords, RV64_FEAT_ZICSR);
- p += 5;
- rv64_skip_version(&p, end);
- continue;
- }
- if (rv64_has_prefix(p, end, "zifencei")) {
- rv64_feature_set(words, nwords, RV64_FEAT_ZIFENCEI);
- p += 8;
- rv64_skip_version(&p, end);
- continue;
- }
- break;
- }
- return KIT_UNSUPPORTED;
- }
- return KIT_OK;
-}
-
-static void rv64_target_feature_defaults(const Target* target, u64* words,
- u32 nwords) {
- (void)target;
- rv64_feature_set(words, nwords, RV64_FEAT_I);
- rv64_feature_set(words, nwords, RV64_FEAT_M);
- rv64_feature_set(words, nwords, RV64_FEAT_A);
- rv64_feature_set(words, nwords, RV64_FEAT_F);
- rv64_feature_set(words, nwords, RV64_FEAT_D);
- rv64_feature_set(words, nwords, RV64_FEAT_C);
- rv64_feature_set(words, nwords, RV64_FEAT_ZICSR);
- rv64_feature_set(words, nwords, RV64_FEAT_ZIFENCEI);
-}
-
-static CgTarget* rv64_backend_make(Compiler* c, ObjBuilder* o,
- const KitCodeOptions* opts) {
- MCEmitter* mc = NULL;
- Debug* debug = NULL;
- CgTarget* t;
- NativeTarget* native;
- NativeDirectTargetConfig cfg;
- if (cg_mc_debug_new(c, o, opts, &mc, &debug) != KIT_OK) return NULL;
- native = rv64_native_target_new(c, o, mc);
- if (!native) return NULL;
- memset(&cfg, 0, sizeof cfg);
- cfg.native = native;
- cfg.ops = rv64_native_direct_ops();
- t = native_direct_target_new(c, o, &cfg);
- if (t) t->debug = debug;
- return t;
-}
-
-static CgTarget* rv64_semantic_target_new(Compiler* c, ObjBuilder* o,
- MCEmitter* mc) {
- NativeTarget* native;
- NativeDirectTargetConfig cfg;
- if (!mc) mc = mc_new(c, o);
- native = rv64_native_target_new(c, o, mc);
- if (!native) return NULL;
- memset(&cfg, 0, sizeof cfg);
- cfg.native = native;
- cfg.ops = rv64_native_direct_ops();
- return native_direct_target_new(c, o, &cfg);
-}
-
-const ArchImpl arch_impl_rv64 = {
- .backend = {.name = "rv64", .make = rv64_backend_make},
- .kind = KIT_ARCH_RV64,
- .name = "rv64",
- .cgtarget_new = rv64_semantic_target_new,
- .asm_new = rv64_arch_asm_new,
- .disasm_new = rv64_disasm_new,
- .apply_label_fixup = rv64_apply_label_fixup,
- .decode = &rv64_decode_ops,
- .emu = &rv64_emu_ops,
- .link = &link_arch_rv64,
- .dwarf = &rv64_dwarf_ops,
- .dbg = &rv64_dbg_ops,
- .asm_ops = &rv64_asm_ops,
- .predefined_macros = rv64_predefined_macros,
- .npredefined_macros =
- (u32)(sizeof rv64_predefined_macros / sizeof rv64_predefined_macros[0]),
- .target_features = rv64_target_features,
- .ntarget_features =
- (u32)(sizeof rv64_target_features / sizeof rv64_target_features[0]),
- .target_feature_defaults = rv64_target_feature_defaults,
- .target_feature_apply_isa = rv64_target_feature_apply_isa,
- .register_name = rv64_register_name,
- .register_index = rv64_register_index,
- .register_count = rv64_register_iter_size,
- .register_at = rv64_register_at_public,
- /* RISC-V psABI: return address in x1 (ra). 4-byte aligned insns
- * (cover 2-byte C-ext too via code_align=2). Data align -8 for
- * doubleword stack stride. CFA = sp at entry. */
- .cfi_return_addr_reg = 1u,
- .cfi_code_align_factor = 2,
- .cfi_data_align_factor = -8,
- .cfi_cfa_init_reg = 2u,
- .cfi_cfa_init_offset = 0,
-};
diff --git a/src/arch/rv64/asm.c b/src/arch/rv64/asm.c
@@ -1,1469 +0,0 @@
-/* RV64 assembler — descriptor-table driven.
- *
- * Mnemonic → Rv64InsnDesc via rv64_asm_find; operand parsing dispatches
- * on the format kind. The descriptor's `match` field already carries
- * the funct3/funct7/opcode bits; the parser only needs to fill in the
- * register operands and immediate.
- *
- * Aliases (li, mv, ret, jr, j, nop, sext.w, beqz, bnez) are recognized
- * by their alias rows in the descriptor table and rewritten to the
- * canonical encoding here. Inline rv_* encoders in isa.h remain the
- * hot path for codegen; the assembler uses them to assemble the
- * machine word once it has the operand values. */
-
-#include "arch/rv64/asm.h"
-
-#include <string.h>
-
-#include "arch/rv64/isa.h"
-#include "arch/rv64/regs.h"
-#include "arch/rv64/rv64.h"
-#include "asm/asm_helpers.h"
-#include "core/arena.h"
-#include "core/pool.h"
-#include "core/slice.h"
-#include "core/strbuf.h"
-#include "obj/obj.h"
-
-struct Rv64Asm {
- ArchAsm base;
- Compiler* c;
-
- /* Inline-asm bound state (set by rv64_inline_bind, cleared otherwise).
- * Operand indexing per GCC convention: 0..nout-1 are outputs, then
- * nout..nout+nin-1 are inputs. Templates address into this combined
- * list via %N / %zN / %aN / %w[name] / %x[name]. */
- const AsmConstraint* outs;
- Operand* out_ops;
- const AsmConstraint* ins;
- const Operand* in_ops;
- const Sym* clobbers;
- u32 nout;
- u32 nin;
- u32 nclob;
-};
-
-typedef struct Rv64Asm Rv64Asm;
-
-/* Relocation modifier on a 12-bit immediate offset (`%lo`/`%pcrel_lo`).
- * RV_MEMMOD_NONE means a plain numeric displacement in `disp`. */
-typedef enum RvMemMod {
- RV_MEMMOD_NONE = 0,
- RV_MEMMOD_LO,
- RV_MEMMOD_PCREL_LO,
-} RvMemMod;
-
-typedef struct Rv64Mem {
- i32 disp;
- u32 base;
- RvMemMod mod; /* reloc modifier on the offset, or RV_MEMMOD_NONE */
- ObjSymId sym; /* symbol when mod != NONE */
- i64 off; /* addend when mod != NONE */
-} Rv64Mem;
-
-static int sym_to_cstr(AsmDriver* d, Sym s, char* out, size_t cap) {
- Slice sl = pool_slice(asm_driver_pool(d), s);
- if (!sl.s || sl.len >= cap) return 0;
- memcpy(out, sl.s, sl.len);
- out[sl.len] = '\0';
- return 1;
-}
-
-/* True if `s` begins with the NUL-terminated literal `pfx` (length-explicit).
- */
-static bool slice_has_prefix_cstr(Slice s, const char* pfx, size_t n) {
- return s.len >= n && memcmp(s.s, pfx, n) == 0;
-}
-
-static int rv_reg_from_name(AsmDriver* d, Sym s, u32* reg_out, int* fp_out) {
- char name[16];
- uint32_t dwarf = 0;
- if (!sym_to_cstr(d, s, name, sizeof name)) return 0;
- if (rv64_register_index(name, &dwarf) != 0) return 0;
- if (reg_out) *reg_out = dwarf & 31u;
- if (fp_out) *fp_out = dwarf >= 32u;
- return 1;
-}
-
-static u32 parse_reg(AsmDriver* d, int* fp_out) {
- AsmTok t = asm_driver_next(d);
- u32 r;
- if (t.kind != ASM_TOK_IDENT || !rv_reg_from_name(d, t.v.ident, &r, fp_out))
- asm_driver_panic(d, "rv64 asm: bad register");
- return r;
-}
-
-static u32 parse_xreg(AsmDriver* d) {
- int fp = 0;
- u32 r = parse_reg(d, &fp);
- if (fp) asm_driver_panic(d, "rv64 asm: expected integer register");
- return r;
-}
-
-static u32 parse_freg(AsmDriver* d) {
- int fp = 0;
- u32 r = parse_reg(d, &fp);
- if (!fp) asm_driver_panic(d, "rv64 asm: expected float register");
- return r;
-}
-
-static void expect_comma(AsmDriver* d) {
- if (!asm_driver_eat_comma(d)) asm_driver_panic(d, "rv64 asm: expected ','");
-}
-
-/* Position of a `%mod(sym)` relocation operand: the 20-bit upper field of
- * lui/auipc, or a 12-bit I-type (addi/load) or S-type (store) immediate. */
-typedef enum RvModPos {
- RV_MODPOS_HI20,
- RV_MODPOS_LO_I,
- RV_MODPOS_LO_S,
-} RvModPos;
-
-/* Map a relocation-modifier name (`hi`, `lo`, `pcrel_hi`, `pcrel_lo`,
- * `got_pcrel_hi`) to the RelocKind appropriate for `pos`. Panics on a name
- * that is not valid at this operand position. */
-static RelocKind rv_mod_to_reloc(AsmDriver* d, Slice name, RvModPos pos) {
- if (pos == RV_MODPOS_HI20) {
- if (slice_eq_cstr(name, "hi")) return R_RV_HI20;
- if (slice_eq_cstr(name, "pcrel_hi")) return R_RV_PCREL_HI20;
- if (slice_eq_cstr(name, "got_pcrel_hi")) return R_RV_GOT_HI20;
- } else {
- int store = (pos == RV_MODPOS_LO_S);
- if (slice_eq_cstr(name, "lo")) return store ? R_RV_LO12_S : R_RV_LO12_I;
- if (slice_eq_cstr(name, "pcrel_lo"))
- return store ? R_RV_PCREL_LO12_S : R_RV_PCREL_LO12_I;
- }
- asm_driver_panic(d, "rv64 asm: relocation modifier not valid here");
-}
-
-/* If the next token is `%`, parse a `%mod(sym{+off})` relocation operand,
- * emit the relocation at the current emit position (where the about-to-be-
- * returned instruction word will land), and return 1. The caller encodes a
- * zero placeholder in the immediate field. Returns 0 if there is no modifier
- * (leaving the stream untouched for normal constant parsing). A leading `%`
- * is unambiguous here: modulo is infix and never starts an operand. */
-static int rv_parse_mod_reloc(AsmDriver* d, RvModPos pos, ObjSymId* sym_out,
- i64* off_out, RelocKind* kind_out) {
- if (!asm_driver_tok_is_punct(asm_driver_peek(d), '%')) return 0;
- (void)asm_driver_next(d); /* eat '%' */
- AsmTok name = asm_driver_next(d);
- if (name.kind != ASM_TOK_IDENT)
- asm_driver_panic(d, "rv64 asm: expected relocation modifier name");
- Slice nm = pool_slice(asm_driver_pool(d), name.v.ident);
- asm_driver_expect_punct(d, '(', "'(' after relocation modifier");
- ObjSymId sym = OBJ_SYM_NONE;
- i64 off = 0;
- asm_driver_parse_sym_expr(d, &sym, &off);
- asm_driver_expect_punct(d, ')', "')' after %mod(sym)");
- RelocKind k = rv_mod_to_reloc(d, nm, pos);
- if (sym_out) *sym_out = sym;
- if (off_out) *off_out = off;
- if (kind_out) *kind_out = k;
- return 1;
-}
-
-/* Parse a RISC-V rounding-mode mnemonic (the comma is already consumed) into
- * its 3-bit funct3 value. cc -S emits this suffix on fcvt/fsqrt when the mode
- * isn't the default `dyn`, so the round-trip (and clang) re-encode the exact
- * mode rather than guessing a default. */
-static u32 rv_parse_rm_name(AsmDriver* d) {
- AsmTok t = asm_driver_next(d);
- Slice s;
- if (t.kind != ASM_TOK_IDENT)
- asm_driver_panic(d, "rv64 asm: expected rounding mode");
- s = pool_slice(asm_driver_pool(d), t.v.ident);
- if (slice_eq_cstr(s, "rne")) return 0u;
- if (slice_eq_cstr(s, "rtz")) return 1u;
- if (slice_eq_cstr(s, "rdn")) return 2u;
- if (slice_eq_cstr(s, "rup")) return 3u;
- if (slice_eq_cstr(s, "rmm")) return 4u;
- if (slice_eq_cstr(s, "dyn")) return 7u;
- asm_driver_panic(d, "rv64 asm: unknown rounding mode");
-}
-
-/* Emit a relocation for a U-type / I-type immediate `%mod(sym)` operand at
- * the current instruction position; returns 1 if one was present. */
-static int rv_emit_imm_mod_reloc(AsmDriver* d, RvModPos pos) {
- ObjSymId sym;
- i64 off;
- RelocKind k;
- if (!rv_parse_mod_reloc(d, pos, &sym, &off, &k)) return 0;
- MCEmitter* mc = asm_driver_mc(d);
- mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), k, sym, off, 0, 0);
- return 1;
-}
-
-static Rv64Mem parse_mem(AsmDriver* d) {
- Rv64Mem m;
- m.disp = 0;
- m.mod = RV_MEMMOD_NONE;
- m.sym = OBJ_SYM_NONE;
- m.off = 0;
- if (asm_driver_tok_is_punct(asm_driver_peek(d), '%')) {
- /* `%lo(sym)(base)` / `%pcrel_lo(label)(base)` — record the modifier; the
- * load/store caller emits the I- or S-type relocation. */
- ObjSymId sym;
- i64 off;
- RelocKind k;
- (void)rv_parse_mod_reloc(d, RV_MODPOS_LO_I, &sym, &off, &k);
- m.mod = (k == R_RV_PCREL_LO12_I) ? RV_MEMMOD_PCREL_LO : RV_MEMMOD_LO;
- m.sym = sym;
- m.off = off;
- } else {
- m.disp = (i32)asm_driver_parse_const(d);
- }
- asm_driver_expect_punct(d, '(', "'(' in rv64 memory operand");
- m.base = parse_xreg(d);
- asm_driver_expect_punct(d, ')', "')' in rv64 memory operand");
- return m;
-}
-
-/* Emit the I/S-type relocation recorded by parse_mem for a `%lo`/`%pcrel_lo`
- * memory offset, picking the S-type variant for stores. */
-static void rv_emit_mem_mod_reloc(AsmDriver* d, const Rv64Mem* m,
- int is_store) {
- if (m->mod == RV_MEMMOD_NONE) return;
- RelocKind k = (m->mod == RV_MEMMOD_PCREL_LO)
- ? (is_store ? R_RV_PCREL_LO12_S : R_RV_PCREL_LO12_I)
- : (is_store ? R_RV_LO12_S : R_RV_LO12_I);
- MCEmitter* mc = asm_driver_mc(d);
- mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), k, m->sym, m->off, 0, 0);
-}
-
-/* Fence pred/succ parser — accepts a string like "rw" / "iorw" / "0" /
- * a numeric literal. Returns the 4-bit mask: bit3=i, bit2=o, bit1=r,
- * bit0=w. */
-static u32 parse_fence_mask(AsmDriver* d) {
- AsmTok t = asm_driver_peek(d);
- if (t.kind == ASM_TOK_NUM) {
- (void)asm_driver_next(d);
- return (u32)asm_driver_parse_const(d) & 0xfu;
- }
- if (t.kind == ASM_TOK_IDENT) {
- char name[8];
- AsmTok tt = asm_driver_next(d);
- if (!sym_to_cstr(d, tt.v.ident, name, sizeof name))
- asm_driver_panic(d, "rv64 asm: bad fence mask");
- u32 mask = 0;
- for (const char* p = name; *p; ++p) {
- switch (*p) {
- case 'i':
- mask |= 8u;
- break;
- case 'o':
- mask |= 4u;
- break;
- case 'r':
- mask |= 2u;
- break;
- case 'w':
- mask |= 1u;
- break;
- default:
- asm_driver_panic(d, "rv64 asm: bad fence char");
- }
- }
- return mask;
- }
- asm_driver_panic(d, "rv64 asm: bad fence operand");
-}
-
-/* Field overlay onto a descriptor's `match` word.
- *
- * For most formats the descriptor's match already pins opcode +
- * funct3 + funct7. We OR in the per-operand fields. For shift-imm and
- * AMO families the layouts diverge from the basic R/I templates — we
- * handle those explicitly below. */
-
-static u32 enc_r(u32 match, u32 rd, u32 rs1, u32 rs2) {
- return match | ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
- ((rd & 0x1fu) << 7);
-}
-static u32 enc_i(u32 match, u32 rd, u32 rs1, i32 imm12) {
- return match | (((u32)imm12 & 0xfffu) << 20) | ((rs1 & 0x1fu) << 15) |
- ((rd & 0x1fu) << 7);
-}
-static u32 enc_s(u32 match, u32 rs2, u32 rs1, i32 imm12) {
- u32 ui = (u32)imm12 & 0xfffu;
- return match | ((ui >> 5) << 25) | ((rs2 & 0x1fu) << 20) |
- ((rs1 & 0x1fu) << 15) | ((ui & 0x1fu) << 7);
-}
-static u32 enc_b(u32 match, u32 rs1, u32 rs2, i32 imm13) {
- u32 ui = (u32)imm13;
- return match | (((ui >> 12) & 1u) << 31) | (((ui >> 5) & 0x3fu) << 25) |
- ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
- (((ui >> 1) & 0xfu) << 8) | (((ui >> 11) & 1u) << 7);
-}
-static u32 enc_u(u32 match, u32 rd, u32 imm20) {
- return match | ((imm20 & 0xfffffu) << 12) | ((rd & 0x1fu) << 7);
-}
-static u32 enc_j(u32 match, u32 rd, i32 imm21) {
- u32 ui = (u32)imm21;
- return match | (((ui >> 20) & 1u) << 31) | (((ui >> 1) & 0x3ffu) << 21) |
- (((ui >> 11) & 1u) << 20) | (((ui >> 12) & 0xffu) << 12) |
- ((rd & 0x1fu) << 7);
-}
-static u32 enc_r4(u32 match, u32 rd, u32 rs1, u32 rs2, u32 rs3, u32 rm) {
- return match | ((rs3 & 0x1fu) << 27) | ((rs2 & 0x1fu) << 20) |
- ((rs1 & 0x1fu) << 15) | ((rm & 0x7u) << 12) | ((rd & 0x1fu) << 7);
-}
-
-/* RV64I shift-imm: shamt6 occupies bits 25:20; funct6 already in match. */
-static u32 enc_ishift(u32 match, u32 rd, u32 rs1, u32 shamt) {
- return match | ((shamt & 0x3fu) << 20) | ((rs1 & 0x1fu) << 15) |
- ((rd & 0x1fu) << 7);
-}
-/* RV32 word shift-imm: shamt5 occupies bits 24:20 (funct7 already pinned). */
-static u32 enc_ishiftw(u32 match, u32 rd, u32 rs1, u32 shamt) {
- return match | ((shamt & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
- ((rd & 0x1fu) << 7);
-}
-/* AMO: aq/rl bits 26/25 — we accept them as optional .aq/.rl suffixes
- * on the mnemonic. For now mnemonics arrive bare. */
-static u32 enc_amo(u32 match, u32 aq, u32 rl, u32 rd, u32 rs1, u32 rs2) {
- return match | ((aq & 1u) << 26) | ((rl & 1u) << 25) | ((rs2 & 0x1fu) << 20) |
- ((rs1 & 0x1fu) << 15) | ((rd & 0x1fu) << 7);
-}
-
-static u32 c_reg3(AsmDriver* d, u32 r) {
- if (r < 8u || r > 15u)
- asm_driver_panic(d,
- "rv64 asm: compressed register must be x8..x15/f8..f15");
- return r - 8u;
-}
-
-static u32 enc_c_ci(u32 match, u32 rd, i32 imm) {
- u32 u = (u32)imm & 0x3fu;
- return match | (((u >> 5) & 1u) << 12) | ((rd & 0x1fu) << 7) |
- ((u & 0x1fu) << 2);
-}
-
-static u32 enc_c_cr(u32 match, u32 rd_rs1, u32 rs2) {
- return match | ((rd_rs1 & 0x1fu) << 7) | ((rs2 & 0x1fu) << 2);
-}
-
-static u32 enc_c_addi16sp(u32 match, i32 imm) {
- u32 u = (u32)imm & 0x3ffu;
- return match | (((u >> 9) & 1u) << 12) | (((u >> 4) & 1u) << 6) |
- (((u >> 6) & 1u) << 5) | (((u >> 7) & 3u) << 3) |
- (((u >> 5) & 1u) << 2);
-}
-
-static u32 enc_c_addi4spn(u32 match, u32 rd3, u32 imm) {
- u32 enc = (((imm >> 4) & 3u) << 6) | (((imm >> 6) & 0xfu) << 2) |
- (((imm >> 2) & 1u) << 1) | ((imm >> 3) & 1u);
- return match | ((enc & 0xffu) << 5) | ((rd3 & 7u) << 2);
-}
-
-static u32 enc_c_lwld(u32 match, u32 rd3, u32 rs1_3, u32 off, int wide64) {
- if (wide64) {
- return match | (((off >> 3) & 7u) << 10) | ((rs1_3 & 7u) << 7) |
- (((off >> 6) & 3u) << 5) | ((rd3 & 7u) << 2);
- }
- return match | (((off >> 3) & 7u) << 10) | ((rs1_3 & 7u) << 7) |
- (((off >> 2) & 1u) << 6) | (((off >> 6) & 1u) << 5) |
- ((rd3 & 7u) << 2);
-}
-
-static u32 enc_c_swld(u32 match, u32 rs2_3, u32 rs1_3, u32 off, int wide64) {
- return enc_c_lwld(match, rs2_3, rs1_3, off, wide64);
-}
-
-static u32 enc_c_lwsp(u32 match, u32 rd, u32 off, int wide64) {
- if (wide64) {
- return match | (((off >> 5) & 1u) << 12) | ((rd & 0x1fu) << 7) |
- (((off >> 3) & 3u) << 5) | (((off >> 6) & 7u) << 2);
- }
- return match | (((off >> 5) & 1u) << 12) | ((rd & 0x1fu) << 7) |
- (((off >> 2) & 7u) << 4) | (((off >> 6) & 3u) << 2);
-}
-
-static u32 enc_c_swsp(u32 match, u32 rs2, u32 off, int wide64) {
- u32 imm6;
- if (wide64)
- imm6 = (((off >> 3) & 7u) << 3) | ((off >> 6) & 7u);
- else
- imm6 = (((off >> 2) & 0xfu) << 2) | ((off >> 6) & 3u);
- return match | ((imm6 & 0x3fu) << 7) | ((rs2 & 0x1fu) << 2);
-}
-
-static u32 enc_c_cb_imm(u32 match, u32 rs1_3, i32 imm) {
- u32 u = (u32)imm & 0x1ffu;
- return match | (((u >> 8) & 1u) << 12) | (((u >> 3) & 3u) << 10) |
- ((rs1_3 & 7u) << 7) | (((u >> 6) & 3u) << 5) | (((u >> 1) & 3u) << 3) |
- (((u >> 5) & 1u) << 2);
-}
-
-static u32 enc_c_cb_alu_imm(u32 match, u32 rd3, i32 imm) {
- u32 u = (u32)imm & 0x3fu;
- return match | (((u >> 5) & 1u) << 12) | ((rd3 & 7u) << 7) |
- ((u & 0x1fu) << 2);
-}
-
-static u32 enc_c_cj(u32 match, i32 imm) {
- u32 u = (u32)imm & 0xfffu;
- return match | (((u >> 11) & 1u) << 12) | (((u >> 4) & 1u) << 11) |
- (((u >> 8) & 3u) << 9) | (((u >> 10) & 1u) << 8) |
- (((u >> 6) & 1u) << 7) | (((u >> 7) & 1u) << 6) |
- (((u >> 1) & 7u) << 3) | (((u >> 5) & 1u) << 2);
-}
-
-/* Parse a branch/jump target operand. With a symbolic target (a label), emit
- * the relocation at the current position — which is exactly where the caller
- * is about to write this instruction word — and return 0 as the placeholder
- * immediate. With a bare constant, return it as the PC-relative byte
- * displacement (preserving the existing numeric-offset corpus behavior). */
-static i32 rv_reloc_target(AsmDriver* d, RelocKind kind) {
- ObjSymId sym = OBJ_SYM_NONE;
- i64 off = 0;
- asm_driver_parse_sym_expr(d, &sym, &off);
- if (sym != OBJ_SYM_NONE) {
- MCEmitter* mc = asm_driver_mc(d);
- mc->emit_reloc_at(mc, mc->section_id, mc->pos(mc), kind, sym, off, 0, 0);
- return 0;
- }
- return (i32)off;
-}
-
-/* Per-format parser — reads the operand list off the driver and returns
- * the encoded 32-bit word, given the matched descriptor. */
-static u32 assemble_one(AsmDriver* d, const Rv64InsnDesc* desc) {
- u32 m = desc->match;
- u32 rd = 0, rs1 = 0, rs2 = 0;
- i32 imm = 0;
- Rv64Mem mem;
-
- switch ((Rv64Format)desc->fmt) {
- case RV64_FMT_R:
- /* Two-operand aliases: snez/neg/negw — rd, rs (rs1=x0). */
- if (desc->flags & RV64_ASMFL_ALIAS) {
- rd = parse_xreg(d);
- expect_comma(d);
- rs2 = parse_xreg(d);
- return enc_r(m, rd, 0u, rs2);
- }
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- expect_comma(d);
- rs2 = parse_xreg(d);
- return enc_r(m, rd, rs1, rs2);
-
- case RV64_FMT_R4: {
- u32 rs3;
- rd = parse_freg(d);
- expect_comma(d);
- rs1 = parse_freg(d);
- expect_comma(d);
- rs2 = parse_freg(d);
- expect_comma(d);
- rs3 = parse_freg(d);
- return enc_r4(m, rd, rs1, rs2, rs3, 0x7u);
- }
-
- case RV64_FMT_I:
- /* Aliases first. `li` is handled earlier by rv64_emit_pseudo (it may
- * need a multi-word expansion), so it never reaches here. */
- if (desc->flags & RV64_ASMFL_ALIAS) {
- if (slice_eq_cstr(desc->mnemonic, "mv")) {
- /* Standard two-operand `mv rd, rs` = `addi rd, rs, 0`. (A %pcrel_lo
- * low-half is emitted as the canonical `addi rd, rs, %pcrel_lo(L)`,
- * not a non-standard 3-operand `mv`, so it lands in the ADDI path
- * below — matching clang.) */
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- return enc_i(m, rd, rs1, 0);
- }
- if (slice_eq_cstr(desc->mnemonic, "sext.w")) {
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- return enc_i(m, rd, rs1, 0);
- }
- if (slice_eq_cstr(desc->mnemonic, "seqz") ||
- slice_eq_cstr(desc->mnemonic, "not")) {
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- /* match already has imm12 + funct3 + op pinned. */
- return m | ((rs1 & 0x1fu) << 15) | ((rd & 0x1fu) << 7);
- }
- }
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- expect_comma(d);
- /* `addi rd, rs1, %lo(sym)` / `%pcrel_lo(label)` → R_RV_LO12_I. */
- if (rv_emit_imm_mod_reloc(d, RV_MODPOS_LO_I)) return enc_i(m, rd, rs1, 0);
- imm = (i32)asm_driver_parse_const(d);
- return enc_i(m, rd, rs1, imm);
-
- case RV64_FMT_I_SHIFT:
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- expect_comma(d);
- return enc_ishift(m, rd, rs1, (u32)asm_driver_parse_const(d));
-
- case RV64_FMT_I_SHIFTW:
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- expect_comma(d);
- return enc_ishiftw(m, rd, rs1, (u32)asm_driver_parse_const(d));
-
- case RV64_FMT_U:
- rd = parse_xreg(d);
- expect_comma(d);
- /* `lui rd, %hi(sym)` → R_RV_HI20; `auipc rd, %pcrel_hi(sym)` →
- * R_RV_PCREL_HI20 (or %got_pcrel_hi → R_RV_GOT_HI20). */
- if (rv_emit_imm_mod_reloc(d, RV_MODPOS_HI20)) return enc_u(m, rd, 0);
- imm = (i32)asm_driver_parse_const(d);
- /* LUI/AUIPC immediate is the upper-20 value: the input is interpreted
- * as the literal 20-bit value (already shifted-out form). */
- return enc_u(m, rd, (u32)imm);
-
- case RV64_FMT_J:
- /* `j label` / `jal rd, label` accept a symbolic target (R_RV_JAL) or a
- * bare numeric displacement. */
- if ((desc->flags & RV64_ASMFL_ALIAS) &&
- slice_eq_cstr(desc->mnemonic, "j")) {
- return enc_j(m, 0u, rv_reloc_target(d, R_RV_JAL));
- }
- rd = parse_xreg(d);
- expect_comma(d);
- return enc_j(m, rd, rv_reloc_target(d, R_RV_JAL));
-
- case RV64_FMT_B:
- /* `beq rs1, rs2, label` (and beqz/bnez aliases) accept a symbolic target
- * (R_RV_BRANCH) or a bare numeric displacement. */
- if (desc->flags & RV64_ASMFL_ALIAS) {
- /* beqz / bnez: rs, off. */
- rs1 = parse_xreg(d);
- expect_comma(d);
- return enc_b(m, rs1, 0u, rv_reloc_target(d, R_RV_BRANCH));
- }
- rs1 = parse_xreg(d);
- expect_comma(d);
- rs2 = parse_xreg(d);
- expect_comma(d);
- return enc_b(m, rs1, rs2, rv_reloc_target(d, R_RV_BRANCH));
-
- case RV64_FMT_LOAD:
- rd = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
- expect_comma(d);
- mem = parse_mem(d);
- rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/0);
- return enc_i(m, rd, mem.base, mem.disp);
-
- case RV64_FMT_FP_LOAD:
- rd = parse_freg(d);
- expect_comma(d);
- mem = parse_mem(d);
- rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/0);
- return enc_i(m, rd, mem.base, mem.disp);
-
- case RV64_FMT_STORE:
- rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
- expect_comma(d);
- mem = parse_mem(d);
- rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/1);
- return enc_s(m, rs2, mem.base, mem.disp);
-
- case RV64_FMT_FP_STORE:
- rs2 = parse_freg(d);
- expect_comma(d);
- mem = parse_mem(d);
- rv_emit_mem_mod_reloc(d, &mem, /*is_store=*/1);
- return enc_s(m, rs2, mem.base, mem.disp);
-
- case RV64_FMT_JALR:
- if ((desc->flags & RV64_ASMFL_ALIAS) &&
- slice_eq_cstr(desc->mnemonic, "jr")) {
- rs1 = parse_xreg(d);
- return enc_i(m, 0u, rs1, 0);
- }
- rd = parse_xreg(d);
- if (!asm_driver_eat_comma(d)) {
- if (slice_eq_cstr(desc->mnemonic, "jalr"))
- return enc_i(m, RV_RA, rd, 0);
- asm_driver_panic(d, "rv64 asm: expected ','");
- }
- /* Accept both `jalr rd, imm(rs1)` and `jalr rd, rs1, imm`. */
- {
- AsmTok t = asm_driver_peek(d);
- if (t.kind == ASM_TOK_IDENT) {
- /* register first → register form */
- rs1 = parse_xreg(d);
- if (asm_driver_eat_comma(d)) {
- imm = (i32)asm_driver_parse_const(d);
- } else {
- imm = 0;
- }
- return enc_i(m, rd, rs1, imm);
- }
- }
- mem = parse_mem(d);
- return enc_i(m, rd, mem.base, mem.disp);
-
- case RV64_FMT_FENCE: {
- u32 pred, succ;
- pred = parse_fence_mask(d);
- expect_comma(d);
- succ = parse_fence_mask(d);
- return m | (pred << 24) | (succ << 20);
- }
-
- case RV64_FMT_SYSTEM:
- /* No operands. nop/ret/ecall/ebreak. */
- return m;
-
- case RV64_FMT_FP_RM:
- rd = parse_freg(d);
- expect_comma(d);
- rs1 = parse_freg(d);
- expect_comma(d);
- rs2 = parse_freg(d);
- /* Use DYN(=7) rounding mode by default. */
- return enc_r(m | (0x7u << 12), rd, rs1, rs2);
-
- case RV64_FMT_FP_R:
- if (desc->flags & RV64_ASMFL_FP) {
- rd = parse_freg(d);
- } else {
- rd = parse_xreg(d);
- }
- expect_comma(d);
- rs1 = parse_freg(d);
- expect_comma(d);
- rs2 = parse_freg(d);
- return enc_r(m, rd, rs1, rs2);
-
- case RV64_FMT_FP_CVT:
- if (desc->flags & RV64_ASMFL_FP) {
- rd = parse_freg(d);
- expect_comma(d);
- /* Source: integer reg for fcvt.s.w etc (no FP flag would
- * indicate); but since we have ASMFL_FP set on dest, source may
- * be either. Disambiguate by mnemonic. */
- if (slice_has_prefix_cstr(desc->mnemonic, "fcvt.s.", 7) &&
- (desc->mnemonic.s[7] == 'w' || desc->mnemonic.s[7] == 'l')) {
- rs1 = parse_xreg(d);
- } else if (slice_has_prefix_cstr(desc->mnemonic, "fcvt.d.", 7) &&
- (desc->mnemonic.s[7] == 'w' || desc->mnemonic.s[7] == 'l')) {
- rs1 = parse_xreg(d);
- } else if (slice_eq_cstr(desc->mnemonic, "fmv.w.x") ||
- slice_eq_cstr(desc->mnemonic, "fmv.d.x")) {
- rs1 = parse_xreg(d);
- } else {
- rs1 = parse_freg(d);
- }
- } else {
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_freg(d);
- }
- /* match encodes rs2 (type selector); OR in rd/rs1 and the rounding mode.
- * An explicit `, <rm>` suffix (cc -S emits it for non-default modes, and
- * clang/gas accept it) takes precedence; otherwise the rm is fixed per
- * conversion family (mirrors the rv_fcvt_* encoders in isa.h, the codegen
- * source of truth): fp->int truncates (RTZ=1); int->fp and fp->fp use the
- * default DYN=7; fmv bit-moves carry no rounding (rm=0). */
- {
- u32 funct7 = (m >> 25) & 0x7fu;
- u32 rm;
- if (asm_driver_eat_comma(d)) {
- rm = rv_parse_rm_name(d);
- } else {
- switch (funct7) {
- case 0x60: /* fcvt.{w,wu,l,lu}.s */
- case 0x61: /* fcvt.{w,wu,l,lu}.d */
- rm = 0x1u; /* RTZ */
- break;
- case 0x70: /* fmv.x.w */
- case 0x71: /* fmv.x.d */
- case 0x78: /* fmv.w.x */
- case 0x79: /* fmv.d.x */
- rm = 0x0u;
- break;
- default: /* int->fp (0x68/0x69) and fp<->fp (0x20/0x21): DYN */
- rm = 0x7u;
- break;
- }
- }
- return m | (rm << 12) | ((rs1 & 0x1fu) << 15) | ((rd & 0x1fu) << 7);
- }
-
- case RV64_FMT_AMO:
- rd = parse_xreg(d);
- expect_comma(d);
- rs2 = parse_xreg(d);
- expect_comma(d);
- asm_driver_expect_punct(d, '(', "'(' in rv64 amo operand");
- rs1 = parse_xreg(d);
- asm_driver_expect_punct(d, ')', "')' in rv64 amo operand");
- return enc_amo(m, 0u, 0u, rd, rs1, rs2);
-
- case RV64_FMT_LR:
- rd = parse_xreg(d);
- expect_comma(d);
- asm_driver_expect_punct(d, '(', "'(' in rv64 lr operand");
- rs1 = parse_xreg(d);
- asm_driver_expect_punct(d, ')', "')' in rv64 lr operand");
- return enc_amo(m, 0u, 0u, rd, rs1, 0u);
-
- case RV64_FMT_CSR: {
- i32 csr;
- rd = parse_xreg(d);
- expect_comma(d);
- csr = (i32)asm_driver_parse_const(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- return enc_i(m, rd, rs1, csr);
- }
-
- case RV64_FMT_CSRI: {
- i32 csr;
- rd = parse_xreg(d);
- expect_comma(d);
- csr = (i32)asm_driver_parse_const(d);
- expect_comma(d);
- u32 uimm = (u32)asm_driver_parse_const(d) & 0x1fu;
- return enc_i(m, rd, uimm, csr);
- }
-
- case RV64_FMT_CR:
- if (slice_eq_cstr(desc->mnemonic, "c.jr") ||
- slice_eq_cstr(desc->mnemonic, "c.jalr")) {
- rs1 = parse_xreg(d);
- return enc_c_cr(m, rs1, 0u);
- }
- rd = parse_xreg(d);
- expect_comma(d);
- rs2 = parse_xreg(d);
- return enc_c_cr(m, rd, rs2);
-
- case RV64_FMT_CI:
- if (slice_eq_cstr(desc->mnemonic, "c.lwsp") ||
- slice_eq_cstr(desc->mnemonic, "c.ldsp") ||
- slice_eq_cstr(desc->mnemonic, "c.fldsp")) {
- rd = slice_eq_cstr(desc->mnemonic, "c.fldsp") ? parse_freg(d)
- : parse_xreg(d);
- expect_comma(d);
- mem = parse_mem(d);
- if (mem.base != RV_SP)
- asm_driver_panic(d, "rv64 asm: compressed stack load needs sp base");
- return enc_c_lwsp(m, rd, (u32)mem.disp,
- !slice_eq_cstr(desc->mnemonic, "c.lwsp"));
- }
- rd = parse_xreg(d);
- expect_comma(d);
- imm = (i32)asm_driver_parse_const(d);
- if (slice_eq_cstr(desc->mnemonic, "c.lui") && ((u32)imm & 0xfffu) == 0)
- imm >>= 12;
- if (slice_eq_cstr(desc->mnemonic, "c.addi16sp")) {
- if (rd != RV_SP)
- asm_driver_panic(d, "rv64 asm: c.addi16sp needs sp destination");
- return enc_c_addi16sp(m, imm);
- }
- return enc_c_ci(m, rd, imm);
-
- case RV64_FMT_CSS:
- rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
- expect_comma(d);
- mem = parse_mem(d);
- if (mem.base != RV_SP)
- asm_driver_panic(d, "rv64 asm: compressed stack store needs sp base");
- return enc_c_swsp(m, rs2, (u32)mem.disp,
- !slice_eq_cstr(desc->mnemonic, "c.swsp"));
-
- case RV64_FMT_CIW:
- rd = parse_xreg(d);
- expect_comma(d);
- rs1 = parse_xreg(d);
- expect_comma(d);
- if (rs1 != RV_SP)
- asm_driver_panic(d, "rv64 asm: c.addi4spn needs sp source");
- imm = (i32)asm_driver_parse_const(d);
- return enc_c_addi4spn(m, c_reg3(d, rd), (u32)imm);
-
- case RV64_FMT_CL:
- rd = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
- expect_comma(d);
- mem = parse_mem(d);
- return enc_c_lwld(m, c_reg3(d, rd), c_reg3(d, mem.base), (u32)mem.disp,
- !slice_eq_cstr(desc->mnemonic, "c.lw"));
-
- case RV64_FMT_CS:
- rs2 = (desc->flags & RV64_ASMFL_FP) ? parse_freg(d) : parse_xreg(d);
- expect_comma(d);
- mem = parse_mem(d);
- return enc_c_swld(m, c_reg3(d, rs2), c_reg3(d, mem.base), (u32)mem.disp,
- !slice_eq_cstr(desc->mnemonic, "c.sw"));
-
- case RV64_FMT_CA:
- rd = parse_xreg(d);
- expect_comma(d);
- rs2 = parse_xreg(d);
- return m | (c_reg3(d, rd) << 7) | (c_reg3(d, rs2) << 2);
-
- case RV64_FMT_CB:
- rs1 = parse_xreg(d);
- expect_comma(d);
- imm = (i32)asm_driver_parse_const(d);
- if (slice_eq_cstr(desc->mnemonic, "c.beqz") ||
- slice_eq_cstr(desc->mnemonic, "c.bnez")) {
- return enc_c_cb_imm(m, c_reg3(d, rs1), imm);
- }
- return enc_c_cb_alu_imm(m, c_reg3(d, rs1), imm);
-
- case RV64_FMT_CJ:
- imm = (i32)asm_driver_parse_const(d);
- return enc_c_cj(m, imm);
-
- case RV64_FMT_C_NONE:
- return m;
-
- default:
- asm_driver_panic(d, "rv64 asm: unsupported format");
- }
-}
-
-/* ============================================================
- * Multi-word pseudo-instruction expansion.
- *
- * call/tail/la/lla expand to a PC-relative AUIPC + (JALR | ADDI) pair;
- * `li` with a constant that does not fit a 12-bit signed immediate
- * expands to an LUI/ADDI(W)/SLLI chain (no relocations). Each 32-bit
- * word goes out through rv64_emit32 — the same path assemble_one's
- * single-word result uses — and relocations are attached via
- * mc->emit_reloc_at at the appropriate word offset. */
-
-/* 12-bit signed immediate range check for li short-circuit. */
-static bool rv_fits_i12(i64 v) { return v >= -2048 && v <= 2047; }
-
-/* Sign-extend the low 12 bits of v. */
-static i64 rv_sext12(i64 v) {
- return (i64)((((u64)v & 0xfffu) ^ 0x800u)) - 0x800;
-}
-
-/* Emit an AUIPC rd,0 + a R_RV_PCREL_HI20(sym) reloc, then create a local
- * `.LpcrelHi` anchor at the AUIPC offset and return that anchor symbol so
- * the paired low-half reloc can reference it. Mirrors native.c's
- * rv_emit_global_addr (the non-GOT branch). */
-static ObjSymId rv_emit_pcrel_hi(AsmDriver* d, u32 rd, ObjSymId sym,
- i64 addend) {
- MCEmitter* mc = asm_driver_mc(d);
- ObjBuilder* obj = asm_driver_ob(d);
- Compiler* c = asm_driver_compiler(d);
- u32 sec = mc->section_id;
- u32 ap = mc->pos(mc);
- rv64_emit32(mc, rv_auipc(rd, 0));
- mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, addend, 0, 0);
- Sym an = pool_intern_slice(c->global, SLICE_LIT(".LpcrelHi"));
- return obj_symbol(obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
-}
-
-/* call/tail: AUIPC <link>,0 + JALR <rd>,<link>,0 with one R_RV_CALL reloc
- * at the AUIPC. `link` is the register the AUIPC materializes into and the
- * JALR's base; `rd` is the JALR link-register (ra for call, zero for
- * tail). The linker patches both words from the single R_RV_CALL reloc. */
-static void rv_emit_call_pseudo(AsmDriver* d, u32 link, u32 rd) {
- MCEmitter* mc = asm_driver_mc(d);
- ObjSymId sym = OBJ_SYM_NONE;
- i64 off = 0;
- asm_driver_parse_sym_expr(d, &sym, &off);
- if (sym == OBJ_SYM_NONE)
- asm_driver_panic(d, "rv64 asm: call/tail target must be a symbol");
- u32 sec = mc->section_id;
- u32 ap = mc->pos(mc);
- rv64_emit32(mc, rv_auipc(link, 0));
- rv64_emit32(mc, rv_jalr(rd, link, 0));
- mc->emit_reloc_at(mc, sec, ap, R_RV_CALL, sym, off, 0, 0);
-}
-
-/* la/lla rd, sym: AUIPC rd,%pcrel_hi(sym) + ADDI rd,rd,%pcrel_lo(anchor).
- * kit's static Local-Exec model has no GOT, so `la` == `lla`. */
-static void rv_emit_la_pseudo(AsmDriver* d) {
- MCEmitter* mc = asm_driver_mc(d);
- u32 rd = parse_xreg(d);
- expect_comma(d);
- ObjSymId sym = OBJ_SYM_NONE;
- i64 off = 0;
- asm_driver_parse_sym_expr(d, &sym, &off);
- if (sym == OBJ_SYM_NONE)
- asm_driver_panic(d, "rv64 asm: la/lla target must be a symbol");
- ObjSymId anchor = rv_emit_pcrel_hi(d, rd, sym, off);
- u32 sec = mc->section_id;
- u32 lp = mc->pos(mc);
- rv64_emit32(mc, rv_addi(rd, rd, 0));
- mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
-}
-
-/* LUI immediate that sign-extends to a negative 32-bit value: bit 19 of
- * the 20-bit field is set, i.e. Hi20 >= 0x80000. */
-#define RV_LUI_HI20_SIGN 0x80000LL
-
-/* Materialize a 64-bit constant into `rd` via the LLVM RISCVMatInt
- * sequence: for values fitting a signed 32-bit range, LUI + ADDI/ADDIW;
- * otherwise a recursive top-down hi20/lo12 split with SLLI shifts that
- * absorb trailing zeros. No relocations.
- *
- * After an LUI, the low-half add uses ADDIW only when the LUI value is
- * negative in 32-bit form (Hi20 >= RV_LUI_HI20_SIGN): there the add must
- * wrap in 32-bit arithmetic and re-sign-extend to land in range. When the
- * LUI value is non-negative in its low 32 bits, plain ADDI keeps the
- * 64-bit result correct (matching LLVM's generateInstSeqImpl). */
-static void rv_emit_li_value(MCEmitter* mc, u32 rd, i64 val) {
- if (val >= -2147483648LL && val <= 2147483647LL) {
- i64 hi20 = ((val + 0x800) >> 12) & 0xfffffLL;
- i64 lo12 = rv_sext12(val);
- if (hi20) rv64_emit32(mc, rv_lui(rd, (u32)hi20));
- if (lo12 || hi20 == 0) {
- u32 src = hi20 ? rd : (u32)RV_ZERO;
- if (hi20 >= RV_LUI_HI20_SIGN)
- rv64_emit32(mc, rv_addiw(rd, src, (i32)lo12));
- else
- rv64_emit32(mc, rv_addi(rd, src, (i32)lo12));
- }
- return;
- }
- /* >32-bit: split off the low 12 bits, recurse on the (shifted) high
- * part, then SLLI back and ADD the low bits. The subtraction is done in
- * unsigned space so it cannot signed-overflow at the int64 extremes
- * (e.g. val=INT64_MAX, lo12=-1); the result has its low 12 bits clear,
- * and the arithmetic right shift recovers the sign-extended high part. */
- i64 lo12 = rv_sext12(val);
- i64 hi = (i64)((u64)val - (u64)lo12) >> 12;
- u32 shift = 12;
- /* Absorb trailing zeros of the high part into the shift amount. */
- while ((hi & 1) == 0) {
- hi >>= 1;
- ++shift;
- }
- rv_emit_li_value(mc, rd, hi);
- rv64_emit32(mc, rv_slli(rd, rd, shift));
- if (lo12) rv64_emit32(mc, rv_addi(rd, rd, (i32)lo12));
-}
-
-/* Dispatch a multi-word pseudo. Returns true if it consumed the operands
- * and emitted its expansion; false to fall through to the single-word
- * path. `li` is handled here only when its immediate exceeds the 12-bit
- * signed range the alias row encodes directly. */
-static bool rv64_emit_pseudo(AsmDriver* d, const Rv64InsnDesc* desc) {
- MCEmitter* mc = asm_driver_mc(d);
- if (desc->fmt == RV64_FMT_PSEUDO) {
- if (slice_eq_cstr(desc->mnemonic, "call")) {
- rv_emit_call_pseudo(d, RV_RA, RV_RA);
- return true;
- }
- if (slice_eq_cstr(desc->mnemonic, "tail")) {
- /* Standard RISC-V `tail` materializes the address into t1 (x6). kit
- * codegen uses t0 for its own tail-call temp, so a `cc -S`-fused
- * `tail sym` re-assembles to t1 not t0 — execution-equivalent (both are
- * caller-saved temps clobbered by the tail jump; cross-exec still
- * matches), only the byte image differs on tail-call cases. Keeping the
- * assembler's `tail` standard preserves clang/gas interop. */
- rv_emit_call_pseudo(d, RV_T1, RV_ZERO);
- return true;
- }
- /* la / lla — identical PC-relative expansion in kit. */
- rv_emit_la_pseudo(d);
- return true;
- }
- if ((desc->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(desc->mnemonic, "li")) {
- /* Peek the immediate without consuming the destination register: the
- * single-word alias path re-parses both. We commit to the multi-word
- * path only for out-of-range constants, leaving the existing 12-bit
- * fast path (and its golden behavior) untouched. */
- u32 rd = parse_xreg(d);
- expect_comma(d);
- i64 imm = asm_driver_parse_const(d);
- if (rv_fits_i12(imm)) {
- rv64_emit32(mc, rv_addi(rd, RV_ZERO, (i32)imm));
- } else {
- rv_emit_li_value(mc, rd, imm);
- }
- return true;
- }
- return false;
-}
-
-static void rv64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) {
- MCEmitter* mc = asm_driver_mc(d);
- const Rv64InsnDesc* desc;
- (void)base;
- (void)asm_driver_cur_section(d);
- desc = rv64_asm_find(pool_slice(asm_driver_pool(d), mnemonic));
- if (!desc) asm_driver_panic(d, "rv64 asm: unsupported instruction");
- if (rv64_emit_pseudo(d, desc)) return;
- if (desc->flags & RV64_ASMFL_C16)
- rv64_emit16(mc, assemble_one(d, desc));
- else
- rv64_emit32(mc, assemble_one(d, desc));
-}
-
-static void rv64_arch_asm_destroy(ArchAsm* base) { (void)base; }
-
-/* ---- textual-assembly operand syntax (printer <-> parser) ----------------
- *
- * Inverse of the `.s` parsers above (rv_parse_mod_reloc / rv_reloc_target and
- * the call/la pseudo expanders): how a relocated rv64 operand is spelled in
- * `cc -S` so the same text re-assembles under kit-as. RISC-V uses the same
- * `%hi`/`%lo`/`%pcrel_hi`/`%pcrel_lo` operator syntax on every object format,
- * so `fmt` is unused. See ArchAsmOps and src/api/asm_emit.c. */
-static int rv64_reloc_operand(u16 kind, KitObjFmt fmt, ArchRelocOperand* out) {
- (void)fmt;
- out->prefix = "";
- out->suffix = "";
- out->addend_bias = 0;
- out->emit_anchor = 0;
- out->ref_anchor = 0;
- switch (kind) {
- case R_RV_PCREL_HI20:
- out->surg = ARCH_RELOC_SURG_TAIL;
- out->prefix = "%pcrel_hi(";
- out->suffix = ")";
- out->emit_anchor = 1; /* define a unique anchor label at this AUIPC */
- return 1;
- case R_RV_GOT_HI20:
- out->surg = ARCH_RELOC_SURG_TAIL;
- out->prefix = "%got_pcrel_hi(";
- out->suffix = ")";
- out->emit_anchor = 1;
- return 1;
- case R_RV_PCREL_LO12_I:
- case R_RV_PCREL_LO12_S:
- out->surg = ARCH_RELOC_SURG_RV_LO12;
- out->prefix = "%pcrel_lo(";
- out->suffix = ")";
- out->ref_anchor = 1; /* references the preceding AUIPC's anchor label */
- return 1;
- case R_RV_HI20:
- out->surg = ARCH_RELOC_SURG_TAIL;
- out->prefix = "%hi(";
- out->suffix = ")";
- return 1;
- case R_RV_LO12_I:
- case R_RV_LO12_S:
- out->surg = ARCH_RELOC_SURG_RV_LO12;
- out->prefix = "%lo(";
- out->suffix = ")";
- return 1;
- case R_RV_BRANCH:
- case R_RV_JAL:
- out->surg = ARCH_RELOC_SURG_TAIL;
- return 1;
- default:
- return 0; /* R_ABS*, R_RV_RVC_*, R_RV_RELAX, TLS, ... → keep numeric */
- }
-}
-
-/* Intra-section local branches whose target codegen resolved in place (no
- * relocation): the disassembler renders the target numerically, so cc -S
- * synthesizes a label there. `j`/`jal x0` are JAL aliases; the conditional
- * branches are B-type. `call`/`tail` are excluded — they carry R_RV_CALL. */
-static int rv64_is_local_branch(KitSlice m) {
- if (m.len == 1 && m.s[0] == 'j') return 1;
- if (m.len == 3 && memcmp(m.s, "jal", 3) == 0) return 1;
- if (m.len == 3 && memcmp(m.s, "beq", 3) == 0) return 1;
- if (m.len == 3 && memcmp(m.s, "bne", 3) == 0) return 1;
- if (m.len == 3 && memcmp(m.s, "blt", 3) == 0) return 1;
- if (m.len == 3 && memcmp(m.s, "bge", 3) == 0) return 1;
- if (m.len == 4 && memcmp(m.s, "bltu", 4) == 0) return 1;
- if (m.len == 4 && memcmp(m.s, "bgeu", 4) == 0) return 1;
- if (m.len == 4 && memcmp(m.s, "beqz", 4) == 0) return 1;
- if (m.len == 4 && memcmp(m.s, "bnez", 4) == 0) return 1;
- if (m.len == 4 && memcmp(m.s, "blez", 4) == 0) return 1;
- if (m.len == 4 && memcmp(m.s, "bgez", 4) == 0) return 1;
- if (m.len == 4 && memcmp(m.s, "bltz", 4) == 0) return 1;
- if (m.len == 4 && memcmp(m.s, "bgtz", 4) == 0) return 1;
- if (m.len == 6 && memcmp(m.s, "c.beqz", 6) == 0) return 1;
- if (m.len == 6 && memcmp(m.s, "c.bnez", 6) == 0) return 1;
- if (m.len == 3 && memcmp(m.s, "c.j", 3) == 0) return 1;
- return 0;
-}
-
-/* R_RV_CALL fuses an AUIPC+JALR pair into a single `call`/`tail sym` pseudo
- * (the canonical `.s` spelling the assembler re-expands to the same pair +
- * reloc). The reloc sits on the AUIPC; the JALR partner carries no reloc. A
- * tail call links into x0 (the JALR's rd is `zero`); a regular call links into
- * ra. We read that from the partner JALR's disassembled text. */
-static int rv64_reloc_call_pair(u16 kind, KitSlice pair_mnemonic,
- KitSlice pair_ops, const char** mnemonic_out) {
- if (kind != R_RV_CALL) return 0;
- /* The partner JALR links into ra (regular call) or x0 (tail). The
- * disassembler renders the x0-link, zero-immediate form as the `jr rs`
- * alias, and the ra form as `jalr ra, 0(ra)`. So a `jr` partner is always a
- * tail; a `jalr` partner is a tail iff its link register is `zero`. */
- if (pair_mnemonic.len == 2 && memcmp(pair_mnemonic.s, "jr", 2) == 0) {
- *mnemonic_out = "tail";
- return 1;
- }
- if (pair_mnemonic.len == 4 && memcmp(pair_mnemonic.s, "jalr", 4) == 0) {
- if (pair_ops.len >= 4 && memcmp(pair_ops.s, "zero", 4) == 0)
- *mnemonic_out = "tail";
- else
- *mnemonic_out = "call";
- return 1;
- }
- return 0;
-}
-
-const ArchAsmOps rv64_asm_ops = {
- .reloc_operand = rv64_reloc_operand,
- .is_local_branch = rv64_is_local_branch,
- .reloc_call_pair = rv64_reloc_call_pair,
-};
-
-ArchAsm* rv64_arch_asm_new(Compiler* c) {
- Rv64Asm* a = arena_new(c->tu, Rv64Asm);
- memset(a, 0, sizeof *a);
- a->base.insn = rv64_arch_asm_insn;
- a->base.destroy = rv64_arch_asm_destroy;
- a->c = c;
- return &a->base;
-}
-
-/* ============================================================
- * Inline-asm template walker (parallel to aa64 asm.c §"inline-asm
- * template walker"). The walker substitutes %N / %[name] / %% / %a%w%x
- * placeholders into a per-line StrBuf, then re-lexes each line through
- * rv64_arch_asm_insn for assembly. Statement separators recognised are
- * '\n' and ';' (outside parens / quoted strings).
- * ============================================================ */
-
-Rv64Asm* rv64_asm_open(Compiler* c) {
- Rv64Asm* a = arena_new(c->tu, Rv64Asm);
- memset(a, 0, sizeof *a);
- a->base.insn = rv64_arch_asm_insn;
- a->base.destroy = rv64_arch_asm_destroy;
- a->c = c;
- return a;
-}
-
-void rv64_asm_close(Rv64Asm* a) { (void)a; }
-
-void rv64_inline_bind(Rv64Asm* a, const AsmConstraint* outs, u32 nout,
- Operand* out_ops, const AsmConstraint* ins, u32 nin,
- const Operand* in_ops, const Sym* clobbers, u32 nclob) {
- a->outs = outs;
- a->out_ops = out_ops;
- a->ins = ins;
- a->in_ops = in_ops;
- a->clobbers = clobbers;
- a->nout = nout;
- a->nin = nin;
- a->nclob = nclob;
-}
-
-/* Per-line rendered buffer cap. Inline asm rarely emits more than a
- * handful of insns per block; one substituted line fits comfortably.
- * Truncation panics — the operator grammar should never grow a single
- * line beyond this without a deliberate reason. */
-#define RV64_INLINE_LINE_CAP 1024
-
-_Noreturn static void inline_panic(Rv64Asm* a, const char* msg) {
- SrcLoc loc = {0, 0, 0};
- compiler_panic(a->c, loc, "rv64 inline asm: %.*s",
- SLICE_ARG(slice_from_cstr(msg)));
-}
-
-/* Render a 5-bit integer register number using its canonical psABI name. */
-static void render_xreg(StrBuf* sb, u32 reg) {
- const char* nm = rv64_register_name(reg & 0x1fu);
- if (!nm) {
- strbuf_putc(sb, 'x');
- if ((reg & 0x1fu) >= 10u)
- strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) / 10u)));
- strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) % 10u)));
- return;
- }
- strbuf_puts(sb, nm);
-}
-
-/* Render an FP register by its canonical psABI name (e.g., fa0). */
-static void render_freg(StrBuf* sb, u32 reg) {
- const char* nm = rv64_register_name(32u + (reg & 0x1fu));
- if (!nm) {
- strbuf_putc(sb, 'f');
- if ((reg & 0x1fu) >= 10u)
- strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) / 10u)));
- strbuf_putc(sb, (char)('0' + ((reg & 0x1fu) % 10u)));
- return;
- }
- strbuf_puts(sb, nm);
-}
-
-/* Render a signed 64-bit integer. Inline asm immediates appear bare in
- * RISC-V (no '#' prefix), matching the standalone .s parser. */
-static void render_imm(StrBuf* sb, i64 v) { strbuf_put_i64(sb, v); }
-
-/* Render addressing form `disp(base)`. */
-static void render_indirect(Rv64Asm* a, StrBuf* sb, Reg base, i32 ofs) {
- (void)a;
- if (ofs != 0)
- strbuf_put_i64(sb, (i64)ofs);
- else
- strbuf_putc(sb, '0');
- strbuf_putc(sb, '(');
- render_xreg(sb, (u32)base);
- strbuf_putc(sb, ')');
-}
-
-/* Resolve operand index → render into sb. form:
- * 0 = default (per-kind),
- * 1 = %wN (width hint; on rv64 same as default xreg form),
- * 2 = %xN (force 64-bit reg form — identical to default for rv64),
- * 3 = %aN (memory addressing form).
- * 4 = %zN (RISC-V GCC: emits "zero" if operand is imm 0, else reg). */
-static void render_operand(Rv64Asm* a, StrBuf* sb, u32 idx, int form) {
- u32 ntot = a->nout + a->nin;
- if (idx >= ntot) inline_panic(a, "operand index out of range");
- const Operand* op =
- (idx < a->nout) ? &a->out_ops[idx] : &a->in_ops[idx - a->nout];
- switch (form) {
- case 1: /* %wN — accept any reg/imm; rv64 has no narrower spelling. */
- case 2: /* %xN — same. */
- if (op->kind == RV64_INLINE_OPK_REG) {
- if (op->pad[0] == RV64_INLINE_OPCLS_FP)
- render_freg(sb, (u32)op->v.local);
- else
- render_xreg(sb, (u32)op->v.local);
- return;
- }
- if (op->kind == OPK_IMM) {
- render_imm(sb, op->v.imm);
- return;
- }
- inline_panic(a, "%w/%x on unsupported operand kind");
- case 3: /* %aN — memory addressing form */
- if (op->kind != OPK_INDIRECT) inline_panic(a, "%a on non-memory operand");
- if (op->v.ind.index != CG_LOCAL_NONE)
- inline_panic(a,
- "%a on indexed memory operand: rv64 inline asm "
- "requires base+disp only");
- render_indirect(a, sb, (Reg)op->v.ind.base, op->v.ind.ofs);
- return;
- case 4: /* %zN — zero-or-reg */
- if (op->kind == OPK_IMM && op->v.imm == 0) {
- strbuf_puts(sb, "zero");
- return;
- }
- if (op->kind == RV64_INLINE_OPK_REG) {
- if (op->pad[0] == RV64_INLINE_OPCLS_FP)
- render_freg(sb, (u32)op->v.local);
- else
- render_xreg(sb, (u32)op->v.local);
- return;
- }
- inline_panic(a, "%z on unsupported operand kind");
- default:
- break;
- }
- switch (op->kind) {
- case RV64_INLINE_OPK_REG:
- if (op->pad[0] == RV64_INLINE_OPCLS_FP)
- render_freg(sb, (u32)op->v.local);
- else
- render_xreg(sb, (u32)op->v.local);
- return;
- case OPK_IMM:
- render_imm(sb, op->v.imm);
- return;
- case OPK_INDIRECT:
- if (op->v.ind.index != CG_LOCAL_NONE)
- inline_panic(a,
- "indexed memory operand in inline asm: rv64 requires "
- "base+disp only");
- render_indirect(a, sb, (Reg)op->v.ind.base, op->v.ind.ofs);
- return;
- default:
- inline_panic(a, "unsupported operand kind for %N");
- }
-}
-
-/* Resolve a `%[name]` operand by looking up `needle` against the
- * constraint.name fields on the combined outs+ins list. Returns the
- * combined index, or (u32)-1 on miss. */
-static u32 lookup_named(Rv64Asm* a, Sym needle) {
- for (u32 k = 0; k < a->nout; ++k) {
- if (a->outs[k].name == needle) return k;
- }
- for (u32 k = 0; k < a->nin; ++k) {
- if (a->ins[k].name == needle) return a->nout + k;
- }
- return (u32)-1;
-}
-
-/* Lex one line of substituted asm and dispatch via rv64_arch_asm_insn. */
-static void run_one_line(Rv64Asm* a, MCEmitter* mc, const char* text,
- size_t len) {
- /* Skip blank lines. */
- size_t i;
- for (i = 0; i < len; ++i) {
- if (text[i] != ' ' && text[i] != '\t') break;
- }
- if (i == len) return;
-
- AsmLexer* lx = asm_lex_open_mem(a->c, "<inline-asm>", text, len);
- AsmDriver* d = asm_driver_open_inline(a->c, mc, lx);
-
- /* The first non-trivial token must be the mnemonic identifier. */
- AsmTok t = asm_driver_peek(d);
- while (t.kind == ASM_TOK_NEWLINE) {
- (void)asm_driver_next(d);
- t = asm_driver_peek(d);
- }
- if (t.kind == ASM_TOK_EOF) {
- asm_driver_close_inline(d);
- asm_lex_close(lx);
- return;
- }
- if (t.kind != ASM_TOK_IDENT)
- inline_panic(a, "expected mnemonic at start of inline asm line");
- (void)asm_driver_next(d);
- Sym mn = t.v.ident;
- /* Compose `fcvt.s.w` etc. — rv64 has dotted mnemonics; the standalone
- * lexer already strings them together as a single IDENT in most paths.
- * Mirror the aa64 composite handling for safety. */
- AsmTok dot = asm_driver_peek(d);
- while (asm_driver_tok_is_punct(dot, '.')) {
- (void)asm_driver_next(d);
- AsmTok rest = asm_driver_next(d);
- if (rest.kind != ASM_TOK_IDENT)
- inline_panic(a, "composite mnemonic: expected ident after '.'");
- Slice hsl = pool_slice(asm_driver_pool(d), mn);
- Slice rsl = pool_slice(asm_driver_pool(d), rest.v.ident);
- size_t hn = hsl.len, rn = rsl.len;
- char buf[64];
- if (hn + 1 + rn >= sizeof buf)
- inline_panic(a, "composite mnemonic too long");
- for (size_t k = 0; k < hn; ++k) buf[k] = hsl.s[k];
- buf[hn] = '.';
- for (size_t k = 0; k < rn; ++k) buf[hn + 1 + k] = rsl.s[k];
- mn = pool_intern_slice(asm_driver_pool(d),
- (Slice){.s = buf, .len = hn + 1 + rn});
- dot = asm_driver_peek(d);
- }
- rv64_arch_asm_insn(&a->base, d, mn);
- asm_driver_close_inline(d);
- asm_lex_close(lx);
-}
-
-/* Substitute placeholders into one line's StrBuf, then dispatch. */
-static void render_and_run_line(Rv64Asm* a, MCEmitter* mc, StrBuf* sb,
- const char* start, const char* end) {
- strbuf_reset(sb);
- for (const char* p = start; p < end; ++p) {
- char c = *p;
- if (c != '%') {
- strbuf_putc(sb, c);
- continue;
- }
- /* Placeholder. */
- if (p + 1 >= end) inline_panic(a, "trailing '%' in template");
- char n = *(p + 1);
- if (n == '%') {
- strbuf_putc(sb, '%');
- ++p;
- continue;
- }
- if (n == '[') {
- const char* nbeg = p + 2;
- const char* nend = nbeg;
- while (nend < end && *nend != ']') ++nend;
- if (nend == end) inline_panic(a, "unterminated %[name]");
- size_t nlen = (size_t)(nend - nbeg);
- Sym needle =
- pool_intern_slice(a->c->global, (Slice){.s = nbeg, .len = nlen});
- u32 idx = lookup_named(a, needle);
- if (idx == (u32)-1)
- inline_panic(a, "%[name] does not match any constraint");
- p = nend; /* loop's ++p steps past the ']' */
- render_operand(a, sb, idx, 0);
- continue;
- }
- int form = 0; /* 0=default, 1=w, 2=x, 3=a, 4=z */
- if (n == 'w' || n == 'x' || n == 'a' || n == 'z') {
- form = (n == 'w') ? 1 : (n == 'x') ? 2 : (n == 'a') ? 3 : 4;
- ++p;
- if (p + 1 >= end) inline_panic(a, "trailing '%' modifier in template");
- n = *(p + 1);
- }
- if (n == '[') {
- const char* nbeg = p + 2;
- const char* nend = nbeg;
- while (nend < end && *nend != ']') ++nend;
- if (nend == end) inline_panic(a, "unterminated %[name]");
- size_t nlen = (size_t)(nend - nbeg);
- Sym needle =
- pool_intern_slice(a->c->global, (Slice){.s = nbeg, .len = nlen});
- u32 idx = lookup_named(a, needle);
- if (idx == (u32)-1)
- inline_panic(a, "%[name] does not match any constraint");
- p = nend;
- render_operand(a, sb, idx, form);
- continue;
- }
- if (n < '0' || n > '9') inline_panic(a, "expected digit after '%'");
- u32 idx = (u32)(n - '0');
- ++p;
- /* GCC syntax permits up to two digits (%0..%99). */
- if (p + 1 < end && *(p + 1) >= '0' && *(p + 1) <= '9') {
- idx = idx * 10 + (u32)(*(p + 1) - '0');
- ++p;
- }
- render_operand(a, sb, idx, form);
- }
- if (sb->truncated) inline_panic(a, "inline asm line buffer overflow");
- run_one_line(a, mc, strbuf_cstr(sb), strbuf_len(sb));
-}
-
-void rv64_asm_run_template(Rv64Asm* a, MCEmitter* mc, const char* tmpl) {
- if (!tmpl || !*tmpl) return;
-
- char buf[RV64_INLINE_LINE_CAP];
- StrBuf sb;
- strbuf_init(&sb, buf, sizeof buf);
-
- /* Walk tmpl, splitting on '\n' and ';'. Track paren depth and quote
- * state so that a literal ';' inside `( ... )` (memory operand) or a
- * quoted string is not mistaken for a statement separator. RISC-V uses
- * `disp(base)` for memory, hence we track parens. */
- const char* line_start = tmpl;
- int paren = 0;
- char quote = 0;
- for (const char* p = tmpl;; ++p) {
- char c = *p;
- if (c == '\0') {
- render_and_run_line(a, mc, &sb, line_start, p);
- break;
- }
- if (quote) {
- if (c == '\\' && *(p + 1)) {
- ++p;
- continue;
- }
- if (c == quote) quote = 0;
- continue;
- }
- if (c == '"' || c == '\'') {
- quote = c;
- continue;
- }
- if (c == '(') {
- ++paren;
- continue;
- }
- if (c == ')') {
- if (paren) --paren;
- continue;
- }
- if (paren == 0 && (c == '\n' || c == ';')) {
- render_and_run_line(a, mc, &sb, line_start, p);
- line_start = p + 1;
- }
- }
-}
diff --git a/src/arch/rv64/dbg.c b/src/arch/rv64/dbg.c
@@ -1,419 +0,0 @@
-/* RISC-V 64 lifter for the displaced-step shim.
- *
- * Lays out a fixed-up copy of one insn in the session scratch slot
- * (DBG_DISPLACED_SLOT_BYTES bytes), followed by an EBREAK sentinel the
- * session arms an internal bp on.
- *
- * Supported families:
- * - JAL rd, offset — synthesize:
- * slot[0] AUIPC t0, hi20(target) ; t0 = pc_runtime + hi20
- * slot[4] ADDI t0, t0, lo12 ; (optional) fixup
- * slot[8] JALR rd, t0, 0 ; rd = pc+4_runtime; PC = t0
- * slot[N] EBREAK
- * The JALR's "return address" lands at the EBREAK sentinel, but since
- * control transfers to the user target we never execute it; the
- * session's stale internal_bp is cleared by the next prepare and the
- * finalize step gates on PC == return_pc so it stays a no-op when
- * control left the slot.
- *
- * Note that an unconditional JAL with rd != x0 writes the runtime
- * (scratch) PC+4 into rd. For RISC-V calls (the dynamic linker /
- * PLT trampolines pass arguments via rd=ra), this is acceptable in
- * practice because the saved return address is rebuilt by the
- * epilogue anyway; kit's JIT debugger uses the shim only to
- * single-step through code it has emitted, and the producer's call
- * sequences re-establish ra in the prologue of the callee. For a
- * true displaced-step debugger this would need a "patch ra" pass —
- * v1 leaves that to the user via the unwind step.
- *
- * - JALR rd, rs1, imm — copied verbatim; the EBREAK after never
- * fires because the indirect branch transfers control. Same caveat
- * about rd as JAL.
- *
- * - BEQ/BNE/BLT/BGE/BLTU/BGEU rs1, rs2, offset — trampoline form:
- * slot[0] Bcc rs1, rs2, +12 ; taken → slot+12 (target seq)
- * slot[4] J +12 ; not-taken → slot+16 (EBREAK)
- * (JAL x0, +12)
- * slot[8] EBREAK
- * slot[12] AUIPC t0, hi20(target)
- * slot[16] ADDI t0, t0, lo12
- * slot[20] JALR x0, t0, 0
- * slot[24] EBREAK (sentinel: taken path sentinel)
- * Sentinel offset is slot[8] for the not-taken fallthrough; the
- * taken path branches away so it doesn't matter whether slot[24]
- * is an EBREAK or not, but we put one there as a safety net.
- *
- * Branch immediates in RV64I are 13-bit signed, so the in-shim
- * Bcc-then-J/J pattern always fits.
- *
- * - AUIPC rd, imm20 — replace with LUI rd, abs_hi20:
- * slot[0] LUI rd, abs_hi20
- * slot[4] EBREAK
- * where abs_hi20 = (orig_pc + (imm20 << 12)) >> 12, masked to 20
- * bits. Note that AUIPC computes pc + (imm << 12); LUI computes
- * imm << 12. So we feed LUI the hi-20 of (orig_pc & ~0xfff) +
- * (imm << 12), i.e. the bits we want at the top of rd.
- *
- * - LUI rd, imm20 — copied verbatim (no PC dependency).
- *
- * - System / ALU / load / store / misc — copied verbatim + EBREAK.
- *
- * Not supported (caller will fall back to step-over via internal bp):
- * - RVC compressed instructions (16-bit). The producer does not emit
- * them, but they may appear if the JIT ever loads pre-built code.
- * - Vector instructions. Not produced by kit's RV64 backend.
- */
-
-#include "dbg/dbg.h"
-
-#include <string.h>
-
-#include "arch/rv64/isa.h"
-
-#define SHIM_T0 RV_T0 /* x5 — caller-saved temp, safe inside a shim */
-
-uint32_t dbg_rv64_brk_word(void) { return rv_ebreak(); }
-
-static void put_u32(uint8_t* w, uint32_t off, uint32_t v) {
- memcpy(w + off, &v, sizeof(v));
-}
-
-/* Sign-extend a `bits`-wide field whose raw value is `v`. */
-static int64_t sign_extend(uint64_t v, int bits) {
- uint64_t m = 1ull << (bits - 1);
- return (int64_t)((v ^ m) - m);
-}
-
-/* Decode RV64 fields. */
-static uint32_t rv_opcode(uint32_t insn) { return insn & 0x7fu; }
-static uint32_t rv_rd(uint32_t insn) { return (insn >> 7) & 0x1fu; }
-static uint32_t rv_funct3(uint32_t insn) { return (insn >> 12) & 0x7u; }
-static uint32_t rv_rs1(uint32_t insn) { return (insn >> 15) & 0x1fu; }
-static uint32_t rv_rs2(uint32_t insn) { return (insn >> 20) & 0x1fu; }
-
-/* J-type 20-bit immediate (sign-extended into 21-bit byte offset). */
-static int64_t rv_j_imm(uint32_t insn) {
- uint64_t imm = ((uint64_t)((insn >> 31) & 1u) << 20) |
- ((uint64_t)((insn >> 21) & 0x3ffu) << 1) |
- ((uint64_t)((insn >> 20) & 1u) << 11) |
- ((uint64_t)((insn >> 12) & 0xffu) << 12);
- return sign_extend(imm, 21);
-}
-
-/* B-type 12-bit immediate (sign-extended 13-bit byte offset). */
-static int64_t rv_b_imm(uint32_t insn) {
- uint64_t imm = ((uint64_t)((insn >> 31) & 1u) << 12) |
- ((uint64_t)((insn >> 7) & 1u) << 11) |
- ((uint64_t)((insn >> 25) & 0x3fu) << 5) |
- ((uint64_t)((insn >> 8) & 0xfu) << 1);
- return sign_extend(imm, 13);
-}
-
-/* U-type 20-bit immediate, returned as the raw 20-bit field (consumer
- * shifts it left by 12). */
-static uint32_t rv_u_imm20(uint32_t insn) { return (insn >> 12) & 0xfffffu; }
-
-/* Decompose a 64-bit absolute target into a 32-bit AUIPC/LUI hi20 +
- * ADDI lo12 pair such that:
- * lui rd, hi20 -> rd = (sign_ext_32(hi20 << 12))
- * addi rd, rd, lo12 -> rd = (sign_ext_32(hi20 << 12) +
- * sign_ext_12(lo12))
- * == sign_ext_32(target_low32)
- * Returns 1 if the absolute target's low 32 bits cannot represent the
- * full target (i.e. the target lives outside the sign-extended 32-bit
- * range). The RV64 ABI's "medlow" code model assumes targets fit in
- * the 32-bit sign-extended window around 0; for a JIT image that lives
- * higher in the address space we panic at the caller. */
-static int rv_split_hi_lo(uint64_t target, uint32_t* hi20, int32_t* lo12,
- int* sext32) {
- int64_t s = (int64_t)target;
- int64_t sext = (int64_t)(int32_t)(uint32_t)target;
- *sext32 = (s == sext) ? 1 : 0;
- /* hi20 chosen so addi's sign-extended 12-bit lo cancels out. */
- uint32_t low32 = (uint32_t)target;
- uint32_t hi = (low32 + 0x800u) >> 12;
- int32_t lo = (int32_t)(low32 - (hi << 12));
- *hi20 = hi & 0xfffffu;
- *lo12 = lo;
- return 0;
-}
-
-/* Emit "li t0, target" using AUIPC+ADDI when the target is in PC-rel
- * range, otherwise LUI+ADDI. Returns the number of words written into
- * `w` starting at offset `off`. The shim runs at `shim_runtime_pc` (the
- * scratch slot's runtime address), and the AUIPC variant uses that. */
-static uint32_t emit_materialize_target(uint8_t* w, uint32_t off,
- uint64_t target,
- uint64_t shim_runtime_pc) {
- int64_t pc_rel = (int64_t)target - (int64_t)shim_runtime_pc;
- /* AUIPC offset is signed 32-bit (imm20 << 12). If pc_rel fits in the
- * 32-bit sign-extended range and the low 12 bits' sign-extension
- * carries correctly, prefer AUIPC + ADDI (PIC-friendly). Otherwise
- * fall back to LUI + ADDI (assumes target's low32 is the full
- * address — caller arranges for medlow targets). */
- if (pc_rel >= -(int64_t)0x80000000 && pc_rel <= (int64_t)0x7fffffff) {
- uint32_t hi20 = ((uint32_t)(int32_t)pc_rel + 0x800u) >> 12;
- int32_t lo12 = (int32_t)((uint32_t)(int32_t)pc_rel - (hi20 << 12));
- put_u32(w, off + 0, rv_auipc(SHIM_T0, hi20 & 0xfffffu));
- put_u32(w, off + 4, rv_addi(SHIM_T0, SHIM_T0, lo12));
- return 2;
- } else {
- uint32_t hi20;
- int32_t lo12;
- int sext32;
- (void)rv_split_hi_lo(target, &hi20, &lo12, &sext32);
- put_u32(w, off + 0, rv_lui(SHIM_T0, hi20));
- put_u32(w, off + 4, rv_addi(SHIM_T0, SHIM_T0, lo12));
- return 2;
- }
-}
-
-int dbg_rv64_build_shim(uint32_t orig_insn, uint64_t orig_pc,
- void* scratch_write, uint64_t scratch_runtime,
- u32* brk_offset) {
- uint8_t* w = (uint8_t*)scratch_write;
- uint32_t brk = rv_ebreak();
- uint32_t op;
-
- if (!brk_offset) return 1;
- *brk_offset = 0;
-
- op = rv_opcode(orig_insn);
-
- /* ---- JAL rd, offset ----------------------------------------------
- * Semantics: rd = orig_pc + 4; pc = orig_pc + imm. We must reproduce
- * the *user-visible* link value (orig_pc + 4), not the runtime
- * scratch-relative one. Layout:
- * slot[0..] materialize_target(t0, orig_pc + imm)
- * slot[m] materialize rd <- (orig_pc + 4) (skipped when rd==x0)
- * slot[m+] JALR x0, t0, 0 (unconditional jump; no link)
- * slot[end] EBREAK
- * For rd==x0 this collapses to the plain "jump to target" form. */
- if (op == RV_JAL) {
- int64_t imm = rv_j_imm(orig_insn);
- uint64_t target = orig_pc + (uint64_t)imm;
- uint32_t rd = rv_rd(orig_insn);
- uint32_t n_words;
- n_words = emit_materialize_target(w, 0, target, scratch_runtime);
- if (rd != RV_ZERO) {
- /* link = orig_pc + 4. Synthesize via LUI + ADDI using low-32
- * decomposition; if the link value doesn't fit a 32-bit sign-
- * extended window, we still emit the same two-word sequence and
- * the high bits get truncated — acceptable for the JIT case
- * where orig_pc is always within the image's 32-bit sign-ext
- * range. */
- uint64_t link = orig_pc + 4u;
- uint32_t hi20;
- int32_t lo12;
- int sext32;
- (void)rv_split_hi_lo(link, &hi20, &lo12, &sext32);
- put_u32(w, 4 * n_words, rv_lui(rd, hi20));
- ++n_words;
- put_u32(w, 4 * n_words, rv_addi(rd, rd, lo12));
- ++n_words;
- }
- put_u32(w, 4 * n_words, rv_jalr(RV_ZERO, SHIM_T0, 0));
- ++n_words;
- put_u32(w, 4 * n_words, brk);
- *brk_offset = 4 * n_words;
- return 0;
- }
-
- /* ---- JALR rd, rs1, imm -------------------------------------------
- * Semantics: tmp = (regs[rs1] + sign_ext_12(imm)) & ~1; rd = orig_pc + 4;
- * pc = tmp.
- * Like JAL, rd must receive the *user-visible* link (orig_pc + 4).
- * Layout:
- * slot[0] JALR x0, rs1, imm ; jump-only form (no link write)
- * -- but JALR is a single insn,
- * so we cannot also write rd
- * before jumping. We instead:
- * slot[0] compute t0 = (regs[rs1] + imm) & ~1
- * (ADDI t0, rs1, imm; ANDI t0, t0, -2)
- * slot[8] materialize rd <- (orig_pc + 4) (if rd != x0)
- * slot[N] JALR x0, t0, 0
- * slot[N+4] EBREAK
- * Note rs1 might be t0 itself; ADDI computes t0 = rs1 + imm BEFORE
- * overwriting t0, which is fine because each insn reads its sources
- * before writing rd. */
- if (op == RV_JALR) {
- uint32_t rd = rv_rd(orig_insn);
- uint32_t rs1 = rv_rs1(orig_insn);
- int32_t imm = (int32_t)((orig_insn >> 20) & 0xfffu);
- if (imm & 0x800) imm -= 0x1000;
- put_u32(w, 0, rv_addi(SHIM_T0, rs1, imm));
- put_u32(w, 4, rv_andi(SHIM_T0, SHIM_T0, -2));
- uint32_t off = 8;
- if (rd != RV_ZERO) {
- uint64_t link = orig_pc + 4u;
- uint32_t hi20;
- int32_t lo12;
- int sext32;
- (void)rv_split_hi_lo(link, &hi20, &lo12, &sext32);
- put_u32(w, off, rv_lui(rd, hi20));
- off += 4;
- put_u32(w, off, rv_addi(rd, rd, lo12));
- off += 4;
- }
- put_u32(w, off, rv_jalr(RV_ZERO, SHIM_T0, 0));
- off += 4;
- put_u32(w, off, brk);
- *brk_offset = off;
- return 0;
- }
-
- /* ---- Bcc rs1, rs2, offset ---------------------------------------- */
- if (op == RV_BRANCH) {
- int64_t imm = rv_b_imm(orig_insn);
- uint64_t target = orig_pc + (uint64_t)imm;
- uint32_t f3 = rv_funct3(orig_insn);
- uint32_t rs1 = rv_rs1(orig_insn);
- uint32_t rs2 = rv_rs2(orig_insn);
- /* Trampoline layout:
- * slot[0] Bcc rs1, rs2, +12 (taken -> slot[12])
- * slot[4] JAL x0, +12 (not-taken fallthrough -> slot[16])
- * ... wait — we want non-taken to
- * fall through to the EBREAK at
- * slot[8]. Simpler: place EBREAK
- * at slot[4] for not-taken, and
- * the take-target sequence at
- * slot[8..]. The Bcc's +12 then
- * becomes +8.
- *
- * Revised:
- * slot[0] Bcc rs1, rs2, +8 (taken -> slot[8] = target seq)
- * slot[4] EBREAK (not-taken sentinel)
- * slot[8] AUIPC t0, hi20(target)
- * slot[12] ADDI t0, t0, lo12
- * slot[16] JALR x0, t0, 0
- * slot[20] EBREAK (safety; never reached) */
- uint32_t new_branch = rv_b(8, rs2, rs1, f3, RV_BRANCH);
- uint32_t n_words;
- put_u32(w, 0, new_branch);
- put_u32(w, 4, brk);
- n_words = emit_materialize_target(w, 8, target, scratch_runtime + 8u);
- put_u32(w, 8 + 4 * n_words, rv_jalr(RV_ZERO, SHIM_T0, 0));
- put_u32(w, 8 + 4 * n_words + 4, brk);
- *brk_offset = 4;
- return 0;
- }
-
- /* ---- AUIPC rd, imm20 --------------------------------------------- */
- if (op == RV_AUIPC) {
- uint32_t imm20 = rv_u_imm20(orig_insn);
- uint32_t rd = rv_rd(orig_insn);
- /* AUIPC computes rd = orig_pc + sign_ext_32(imm20 << 12). We
- * synthesize that absolute value into rd using LUI + ADDI. */
- uint64_t auipc_val = (uint64_t)((int64_t)orig_pc +
- (int64_t)(int32_t)((int32_t)(imm20 << 12)));
- uint32_t hi20;
- int32_t lo12;
- int sext32;
- (void)rv_split_hi_lo(auipc_val, &hi20, &lo12, &sext32);
- put_u32(w, 0, rv_lui(rd, hi20));
- put_u32(w, 4, rv_addi(rd, rd, lo12));
- put_u32(w, 8, brk);
- *brk_offset = 8;
- return 0;
- }
-
- /* ---- default: no PC-relative operand — copy verbatim ------------- */
- put_u32(w, 0, orig_insn);
- put_u32(w, 4, brk);
- *brk_offset = 4;
- return 0;
-}
-
-static KitStatus rv64_dbg_breakpoint_patch(u8* out, u32 cap, u32* len_out) {
- uint32_t brk = dbg_rv64_brk_word();
- if (!out || !len_out) return KIT_INVALID;
- if (cap < 4u) return KIT_INVALID;
- memcpy(out, &brk, sizeof(brk));
- *len_out = 4u;
- return KIT_OK;
-}
-
-static u64 rv64_dbg_breakpoint_addr_from_fault_pc(u64 fault_pc) {
- return fault_pc;
-}
-
-static KitStatus rv64_dbg_decode_insn(const u8* bytes, u32 len, u64 pc,
- ArchDbgInsn* out) {
- if (!bytes || !out) return KIT_INVALID;
- if (len < 4u) return KIT_UNSUPPORTED;
- memset(out, 0, sizeof(*out));
- out->pc = pc;
- out->len = 4u;
- memcpy(out->bytes, bytes, 4u);
- return KIT_OK;
-}
-
-static KitStatus rv64_dbg_build_displaced_shim(
- const ArchDbgInsn* insn, void* scratch_write, u64 scratch_runtime,
- u32 scratch_cap, u32* sentinel_off, u64* fallthrough_pc) {
- uint32_t word = 0;
- if (!insn || !scratch_write || !sentinel_off || !fallthrough_pc)
- return KIT_INVALID;
- if (insn->len != 4u) return KIT_UNSUPPORTED;
- if (scratch_cap < 28u) return KIT_INVALID;
- memcpy(&word, insn->bytes, sizeof(word));
- if (dbg_rv64_build_shim(word, insn->pc, scratch_write, scratch_runtime,
- sentinel_off) != 0) {
- return KIT_UNSUPPORTED;
- }
- *fallthrough_pc = insn->pc + 4u;
- return KIT_OK;
-}
-
-static int rv64_dbg_is_call(const ArchDbgInsn* insn) {
- uint32_t word = 0;
- uint32_t op;
- if (!insn || insn->len != 4u) return 0;
- memcpy(&word, insn->bytes, sizeof(word));
- op = rv_opcode(word);
- if (op != RV_JAL && op != RV_JALR) return 0;
- return rv_rd(word) != RV_ZERO;
-}
-
-static KitStatus rv64_dbg_direct_call_target(const ArchDbgInsn* insn,
- u64* target_out) {
- uint32_t word = 0;
- if (!insn || !target_out) return KIT_INVALID;
- if (insn->len != 4u) return KIT_UNSUPPORTED;
- memcpy(&word, insn->bytes, sizeof(word));
- if (rv_opcode(word) != RV_JAL || rv_rd(word) == RV_ZERO) return KIT_NOT_FOUND;
- *target_out = insn->pc + (u64)rv_j_imm(word);
- return KIT_OK;
-}
-
-static KitStatus rv64_dbg_direct_jump_target(const ArchDbgInsn* insn,
- u64* target_out) {
- uint32_t word = 0;
- if (!insn || !target_out) return KIT_INVALID;
- if (insn->len != 4u) return KIT_UNSUPPORTED;
- memcpy(&word, insn->bytes, sizeof(word));
- if (rv_opcode(word) != RV_JAL || rv_rd(word) != RV_ZERO) return KIT_NOT_FOUND;
- *target_out = insn->pc + (u64)rv_j_imm(word);
- return KIT_OK;
-}
-
-static KitStatus rv64_dbg_link_register_return_address(
- const KitUnwindFrame* frame, u64* target_out) {
- if (!frame || !target_out) return KIT_INVALID;
- if (frame->regs[RV_RA] == 0) return KIT_NOT_FOUND;
- *target_out = frame->regs[RV_RA];
- return KIT_OK;
-}
-
-const ArchDbgOps rv64_dbg_ops = {
- .min_insn_len = 4u,
- .max_insn_len = 4u,
- .breakpoint_patch = rv64_dbg_breakpoint_patch,
- .breakpoint_addr_from_fault_pc = rv64_dbg_breakpoint_addr_from_fault_pc,
- .decode_insn = rv64_dbg_decode_insn,
- .build_displaced_shim = rv64_dbg_build_displaced_shim,
- .is_call = rv64_dbg_is_call,
- .direct_call_target = rv64_dbg_direct_call_target,
- .direct_jump_target = rv64_dbg_direct_jump_target,
- .link_register_return_address = rv64_dbg_link_register_return_address,
-};
diff --git a/src/arch/rv64/disasm.c b/src/arch/rv64/disasm.c
@@ -1,453 +0,0 @@
-/* RV64 disassembler — descriptor-table driven.
- *
- * Decodes a 4-byte word by linear-scan over `rv64_insn_table` and
- * dispatches operand printing on the matched format. Compressed (RV64C)
- * instructions are 16-bit: a halfword whose low 2 bits are not 0b11
- * goes through the C-decode path; the iterator advances by 2 bytes.
- *
- * Unknown words/halfwords fall back to ".word"/".hword" placeholders. */
-
-#include "arch/rv64/disasm.h"
-
-#include <string.h>
-
-#include "arch/rv64/isa.h"
-#include "core/heap.h"
-#include "core/strbuf.h"
-
-#define RV64_DASM_MNEM_CAP 16u
-#define RV64_DASM_OPS_CAP 96u
-#define RV64_DASM_ANN_CAP 64u
-#define RV64_ENCODING_UNKNOWN 0xffffffffu
-
-typedef struct Rv64InsnFormatter {
- ArchInsnFormatter base;
- Compiler* c;
- Heap* heap;
- char mnem_buf[RV64_DASM_MNEM_CAP];
- char ops_buf[RV64_DASM_OPS_CAP];
- char ann_buf[RV64_DASM_ANN_CAP];
- StrBuf mnem;
- StrBuf ops;
- StrBuf ann;
-} Rv64InsnFormatter;
-
-typedef struct Rv64Disasm {
- ArchDisasm base;
- Rv64InsnFormatter fmt;
-} Rv64Disasm;
-
-static KitStatus rv64_format_insn(ArchInsnFormatter*, const KitDecodedInsn*,
- KitInsn*);
-static void rv64_formatter_destroy(ArchInsnFormatter*);
-
-static u32 rv_read_u32_le(const u8* b) {
- return (u32)b[0] | ((u32)b[1] << 8) | ((u32)b[2] << 16) | ((u32)b[3] << 24);
-}
-
-static u32 rv_read_u16_le(const u8* b) { return (u32)b[0] | ((u32)b[1] << 8); }
-
-static void rv_fmt_emit_fallback32(Rv64InsnFormatter* f, u32 word) {
- strbuf_reset(&f->mnem);
- strbuf_puts(&f->mnem, ".word");
- strbuf_reset(&f->ops);
- strbuf_put_hex_u64(&f->ops, (u64)word);
-}
-
-static void rv_fmt_emit_fallback16(Rv64InsnFormatter* f, u32 hw) {
- strbuf_reset(&f->mnem);
- strbuf_puts(&f->mnem, ".hword");
- strbuf_reset(&f->ops);
- strbuf_put_hex_u64(&f->ops, (u64)hw);
-}
-
-static u32 rv64_desc_encoding_id(const Rv64InsnDesc* desc) {
- u32 i;
- if (!desc) return RV64_ENCODING_UNKNOWN;
- for (i = 0; i < rv64_insn_table_n; ++i) {
- if (desc == &rv64_insn_table[i]) return i;
- }
- return RV64_ENCODING_UNKNOWN;
-}
-
-static u32 rv64_semantic_opcode(u32 word, u32 nbytes) {
- u32 op, funct3, funct7;
- if (nbytes != 4u) return RV64_DEC_UNKNOWN;
- if (word == rv_ecall()) return RV64_DEC_ECALL;
- if (word == rv_ebreak()) return RV64_DEC_EBREAK;
- op = word & 0x7fu;
- funct3 = (word >> 12) & 0x7u;
- funct7 = (word >> 25) & 0x7fu;
- if (op == RV_OP_IMM && funct3 == 0u) return RV64_DEC_ADDI;
- if (op == RV_OP && funct3 == 0u && funct7 == 0u) return RV64_DEC_ADD;
- if (op == RV_AUIPC) return RV64_DEC_AUIPC;
- if (op == RV_LOAD && funct3 == 3u) return RV64_DEC_LD;
- if (op == RV_STORE && funct3 == 3u) return RV64_DEC_SD;
- if (op == RV_JALR && funct3 == 0u) return RV64_DEC_JALR;
- return RV64_DEC_UNKNOWN;
-}
-
-static void rv_decop_none(KitDecodedOperand* o) {
- memset(o, 0, sizeof(*o));
- o->kind = KIT_DECOP_NONE;
- o->index_reg = REG_NONE;
-}
-
-static void rv_decop_reg(KitDecodedOperand* o, u32 reg, u8 width_bits) {
- rv_decop_none(o);
- o->kind = KIT_DECOP_REG;
- o->width_bits = width_bits;
- o->reg = reg;
-}
-
-static void rv_decop_imm(KitDecodedOperand* o, i64 imm) {
- rv_decop_none(o);
- o->kind = KIT_DECOP_IMM;
- o->imm = imm;
-}
-
-static void rv_decop_sysreg(KitDecodedOperand* o, u32 reg) {
- rv_decop_none(o);
- o->kind = KIT_DECOP_SYSREG;
- o->reg = reg;
-}
-
-static void rv_decop_mem(KitDecodedOperand* o, u32 base, i64 imm,
- u8 width_bits) {
- rv_decop_none(o);
- o->kind = KIT_DECOP_MEM;
- o->width_bits = width_bits;
- o->reg = base;
- o->imm = imm;
-}
-
-static void rv_decop_pcrel(KitDecodedOperand* o, u64 pc, i64 disp) {
- rv_decop_none(o);
- o->kind = KIT_DECOP_PCREL;
- o->imm = (i64)(pc + (u64)disp);
-}
-
-static u8 rv_load_width_bits(u32 funct3) {
- switch (funct3 & 7u) {
- case 0:
- case 4:
- return 8;
- case 1:
- case 5:
- return 16;
- case 2:
- case 6:
- return 32;
- case 3:
- return 64;
- default:
- return 0;
- }
-}
-
-static u16 rv64_decode_flags(const Rv64InsnDesc* desc, u32 word) {
- u16 flags = 0;
- Rv64Format fmt;
- if (!desc) return 0;
- fmt = (Rv64Format)desc->fmt;
- switch (fmt) {
- case RV64_FMT_B:
- case RV64_FMT_CB:
- case RV64_FMT_CJ:
- flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_BRANCH;
- break;
- case RV64_FMT_J:
- flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_BRANCH;
- if (((word >> 7) & 0x1fu) == RV_RA) flags |= KIT_DECODE_CALL;
- break;
- case RV64_FMT_JALR: {
- u32 rd = (word >> 7) & 0x1fu;
- u32 rs1 = (word >> 15) & 0x1fu;
- flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_BRANCH;
- if (rd == RV_RA) flags |= KIT_DECODE_CALL;
- if (rd == RV_ZERO && rs1 == RV_RA) flags |= KIT_DECODE_RET;
- break;
- }
- case RV64_FMT_CR:
- if (slice_eq_cstr(desc->mnemonic, "c.jr") ||
- slice_eq_cstr(desc->mnemonic, "c.jalr")) {
- flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_BRANCH;
- if (slice_eq_cstr(desc->mnemonic, "c.jalr")) flags |= KIT_DECODE_CALL;
- }
- break;
- case RV64_FMT_SYSTEM:
- if (word == rv_ecall() || word == rv_ebreak())
- flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_TRAP;
- break;
- case RV64_FMT_C_NONE:
- if ((word & 0xffffu) == 0x9002u)
- flags |= KIT_DECODE_TERMINATOR | KIT_DECODE_TRAP;
- break;
- case RV64_FMT_LOAD:
- case RV64_FMT_STORE:
- case RV64_FMT_FP_LOAD:
- case RV64_FMT_FP_STORE:
- case RV64_FMT_AMO:
- case RV64_FMT_LR:
- case RV64_FMT_CL:
- case RV64_FMT_CS:
- case RV64_FMT_CSS:
- flags |= KIT_DECODE_MEMORY;
- break;
- default:
- break;
- }
- return flags;
-}
-
-static void rv64_decode_operands(const Rv64InsnDesc* desc, u32 word, u64 pc,
- KitDecodedInsn* out) {
- Rv64Format fmt;
- if (!desc) return;
- fmt = (Rv64Format)desc->fmt;
- switch (fmt) {
- case RV64_FMT_R:
- case RV64_FMT_FP_R:
- case RV64_FMT_FP_RM: {
- Rv64R r = rv64_r_unpack(word);
- out->noperands = 3;
- rv_decop_reg(&out->operands[0], r.rd, 64);
- rv_decop_reg(&out->operands[1], r.rs1, 64);
- rv_decop_reg(&out->operands[2], r.rs2, 64);
- break;
- }
- case RV64_FMT_I: {
- Rv64I i = rv64_i_unpack(word);
- out->noperands = 3;
- rv_decop_reg(&out->operands[0], i.rd, 64);
- rv_decop_reg(&out->operands[1], i.rs1, 64);
- rv_decop_imm(&out->operands[2], rv64_sext(i.imm12, 12));
- break;
- }
- case RV64_FMT_I_SHIFT:
- case RV64_FMT_I_SHIFTW: {
- Rv64I i = rv64_i_unpack(word);
- out->noperands = 3;
- rv_decop_reg(&out->operands[0], i.rd, 64);
- rv_decop_reg(&out->operands[1], i.rs1, 64);
- rv_decop_imm(&out->operands[2], fmt == RV64_FMT_I_SHIFTW
- ? (i.imm12 & 0x1f)
- : (i.imm12 & 0x3f));
- break;
- }
- case RV64_FMT_LOAD:
- case RV64_FMT_FP_LOAD: {
- Rv64I i = rv64_i_unpack(word);
- out->noperands = 2;
- rv_decop_reg(&out->operands[0], i.rd, 64);
- rv_decop_mem(&out->operands[1], i.rs1, rv64_sext(i.imm12, 12),
- rv_load_width_bits(i.funct3));
- break;
- }
- case RV64_FMT_S:
- case RV64_FMT_STORE:
- case RV64_FMT_FP_STORE: {
- Rv64S s = rv64_s_unpack(word);
- out->noperands = 2;
- rv_decop_reg(&out->operands[0], s.rs2, 64);
- rv_decop_mem(&out->operands[1], s.rs1, rv64_sext(s.imm12, 12),
- rv_load_width_bits(s.funct3));
- break;
- }
- case RV64_FMT_B: {
- Rv64B b = rv64_b_unpack(word);
- out->noperands = 3;
- rv_decop_reg(&out->operands[0], b.rs1, 64);
- rv_decop_reg(&out->operands[1], b.rs2, 64);
- rv_decop_pcrel(&out->operands[2], pc, rv64_sext(b.imm13, 13));
- break;
- }
- case RV64_FMT_U: {
- Rv64U u = rv64_u_unpack(word);
- out->noperands = 2;
- rv_decop_reg(&out->operands[0], u.rd, 64);
- rv_decop_imm(&out->operands[1], (i64)(i32)u.imm32_hi20);
- break;
- }
- case RV64_FMT_J: {
- Rv64J j = rv64_j_unpack(word);
- out->noperands = 2;
- rv_decop_reg(&out->operands[0], j.rd, 64);
- rv_decop_pcrel(&out->operands[1], pc, rv64_sext(j.imm21, 21));
- break;
- }
- case RV64_FMT_JALR: {
- Rv64I i = rv64_i_unpack(word);
- out->noperands = 2;
- rv_decop_reg(&out->operands[0], i.rd, 64);
- rv_decop_mem(&out->operands[1], i.rs1, rv64_sext(i.imm12, 12), 64);
- break;
- }
- case RV64_FMT_CSR: {
- Rv64I i = rv64_i_unpack(word);
- out->noperands = 3;
- rv_decop_reg(&out->operands[0], i.rd, 64);
- rv_decop_sysreg(&out->operands[1], i.imm12);
- rv_decop_reg(&out->operands[2], i.rs1, 64);
- break;
- }
- case RV64_FMT_CSRI: {
- Rv64I i = rv64_i_unpack(word);
- out->noperands = 3;
- rv_decop_reg(&out->operands[0], i.rd, 64);
- rv_decop_sysreg(&out->operands[1], i.imm12);
- rv_decop_imm(&out->operands[2], (i64)i.rs1);
- break;
- }
- default:
- break;
- }
-}
-
-static KitStatus rv64_decode_one(Compiler* c, const u8* bytes, size_t len,
- u64 pc, KitDecodedInsn* out) {
- const Rv64InsnDesc* desc;
- u32 first_hw;
- u32 word;
- u32 encoding_id;
- (void)c;
- if (!bytes || !out) return KIT_INVALID;
- if (len < 2u) return KIT_MALFORMED;
- memset(out, 0, sizeof(*out));
- for (u32 i = 0; i < KIT_DECODE_MAX_OPERANDS; ++i)
- rv_decop_none(&out->operands[i]);
-
- first_hw = rv_read_u16_le(bytes);
- if ((first_hw & 3u) != 3u) {
- word = first_hw;
- desc = rv64_disasm_find_c(first_hw);
- out->nbytes = 2;
- } else {
- if (len < 4u) return KIT_MALFORMED;
- word = rv_read_u32_le(bytes);
- desc = rv64_disasm_find(word);
- out->nbytes = 4;
- }
-
- encoding_id = rv64_desc_encoding_id(desc);
- out->pc = pc;
- out->bytes = bytes;
- out->encoding_id = encoding_id;
- out->opcode = rv64_semantic_opcode(word, out->nbytes);
- out->flags = rv64_decode_flags(desc, word);
- out->arch[0] = word;
- out->arch[1] = desc ? desc->fmt : 0xffu;
- rv64_decode_operands(desc, word, pc, out);
- return KIT_OK;
-}
-
-static KitStatus rv64_decode_block(Compiler* c, const u8* bytes, size_t len,
- u64 pc, KitDecodedInsn* out, u32 cap,
- u32* n_out) {
- u32 n = 0;
- if (n_out) *n_out = 0;
- if (!bytes || !out || !n_out) return KIT_INVALID;
- while (n < cap && len > 0) {
- KitStatus st = rv64_decode_one(c, bytes, len, pc, &out[n]);
- if (st != KIT_OK) return n ? KIT_OK : st;
- bytes += out[n].nbytes;
- len -= out[n].nbytes;
- pc += out[n].nbytes;
- ++n;
- if (out[n - 1u].flags & KIT_DECODE_TERMINATOR) break;
- }
- *n_out = n;
- return KIT_OK;
-}
-
-static void rv64_formatter_init(Rv64InsnFormatter* f, Compiler* c, Heap* h) {
- memset(f, 0, sizeof(*f));
- f->c = c;
- f->heap = h;
- f->base.format = rv64_format_insn;
- f->base.destroy = rv64_formatter_destroy;
- strbuf_init(&f->mnem, f->mnem_buf, sizeof f->mnem_buf);
- strbuf_init(&f->ops, f->ops_buf, sizeof f->ops_buf);
- strbuf_init(&f->ann, f->ann_buf, sizeof f->ann_buf);
-}
-
-static KitStatus rv64_format_insn(ArchInsnFormatter* base,
- const KitDecodedInsn* insn, KitInsn* out) {
- Rv64InsnFormatter* f = (Rv64InsnFormatter*)base;
- const Rv64InsnDesc* desc;
- u32 word;
- if (!f || !insn || !out) return KIT_INVALID;
- word = (u32)insn->arch[0];
- desc = insn->nbytes == 2u ? rv64_disasm_find_c(word) : rv64_disasm_find(word);
- if (desc) {
- strbuf_reset(&f->mnem);
- strbuf_put_slice(&f->mnem, desc->mnemonic);
- strbuf_reset(&f->ops);
- rv64_print_operands(&f->ops, desc, word, insn->pc);
- } else if (insn->nbytes == 2u) {
- rv_fmt_emit_fallback16(f, word);
- } else {
- rv_fmt_emit_fallback32(f, word);
- }
-
- strbuf_reset(&f->ann);
- out->vaddr = insn->pc;
- out->bytes = insn->bytes;
- out->nbytes = insn->nbytes;
- out->mnemonic = strbuf_slice(&f->mnem);
- out->operands = strbuf_slice(&f->ops);
- out->annotation = strbuf_slice(&f->ann);
- return KIT_OK;
-}
-
-static void rv64_formatter_destroy(ArchInsnFormatter* base) {
- Rv64InsnFormatter* f = (Rv64InsnFormatter*)base;
- if (!f) return;
- f->heap->free(f->heap, f, sizeof(*f));
-}
-
-static ArchInsnFormatter* rv64_formatter_new(Compiler* c) {
- Heap* h = (Heap*)c->ctx->heap;
- Rv64InsnFormatter* f =
- (Rv64InsnFormatter*)h->alloc(h, sizeof(*f), _Alignof(Rv64InsnFormatter));
- if (!f) return NULL;
- rv64_formatter_init(f, c, h);
- return &f->base;
-}
-
-static u32 rv_decode(ArchDisasm* base, const u8* bytes, size_t len, u64 vaddr,
- KitInsn* out) {
- Rv64Disasm* d = (Rv64Disasm*)base;
- KitDecodedInsn insn;
- KitStatus st = rv64_decode_one(d->fmt.c, bytes, len, vaddr, &insn);
- if (st != KIT_OK) return 0;
- st = rv64_format_insn(&d->fmt.base, &insn, out);
- if (st != KIT_OK) return 0;
- return insn.nbytes;
-}
-
-static void rv64_destroy(ArchDisasm* base) {
- Rv64Disasm* d = (Rv64Disasm*)base;
- d->fmt.heap->free(d->fmt.heap, d, sizeof(*d));
-}
-
-ArchDisasm* rv64_disasm_new(Compiler* c) {
- Heap* h = (Heap*)c->ctx->heap;
- Rv64Disasm* d = (Rv64Disasm*)h->alloc(h, sizeof(*d), _Alignof(Rv64Disasm));
- if (!d) return NULL;
- memset(d, 0, sizeof(*d));
- d->base.decode = rv_decode;
- d->base.destroy = rv64_destroy;
- rv64_formatter_init(&d->fmt, c, h);
- return &d->base;
-}
-
-const ArchDecodeOps rv64_decode_ops = {
- .min_insn_len = 2,
- .max_insn_len = 4,
- .decode_one = rv64_decode_one,
- .decode_block = rv64_decode_block,
- .formatter_new = rv64_formatter_new,
- .format = rv64_format_insn,
- .formatter_free = rv64_formatter_destroy,
-};
diff --git a/src/arch/rv64/emu.c b/src/arch/rv64/emu.c
@@ -1,511 +0,0 @@
-#include "emu/emu.h"
-
-#include <string.h>
-
-#include "arch/arch.h"
-#include "arch/rv64/isa.h"
-#include "core/slice.h"
-
-#define RV64_EMU_SYM_XREG "__emu_rv64_xreg"
-#define RV64_EMU_SYM_SET_XREG "__emu_rv64_set_xreg"
-#define RV64_EMU_SYM_JALR "__emu_rv64_jalr"
-
-typedef struct Rv64EmuCPUState {
- u64 x[32];
- u64 f[32];
- u32 fcsr;
- u64 reserved_addr;
- int has_reservation;
-} Rv64EmuCPUState;
-
-typedef struct Rv64EmuLiftSyms {
- KitCgSym xreg;
- KitCgSym set_xreg;
- KitCgSym load64;
- KitCgSym load64_checked;
- KitCgSym store64;
- KitCgSym jalr;
- KitCgSym syscall;
- KitCgTypeId xreg_fn;
- KitCgTypeId set_xreg_fn;
- KitCgTypeId load64_fn;
- KitCgTypeId load64_checked_fn;
- KitCgTypeId store64_fn;
- KitCgTypeId jalr_fn;
- KitCgTypeId syscall_fn;
- KitCgTypeId thread_ptr;
- KitCgTypeId i32;
- KitCgTypeId i64;
- KitCgTypeId i64_ptr;
- KitCgTypeId void_ty;
-} Rv64EmuLiftSyms;
-
-static KitCgSym rv64_emu_decl_helper(KitCompiler* c, KitCg* cg,
- const char* name, KitCgTypeId type) {
- KitCgDecl d;
- memset(&d, 0, sizeof(d));
- d.kind = KIT_CG_DECL_FUNC;
- d.linkage_name = kit_sym_intern(c, kit_slice_cstr(name));
- d.display_name = d.linkage_name;
- d.linkage_name = kit_cg_c_linkage_name(c, d.linkage_name);
- d.type = type;
- d.sym.bind = KIT_SB_GLOBAL;
- d.sym.visibility = KIT_CG_VIS_DEFAULT;
- return kit_cg_decl(cg, d);
-}
-
-static KitCgTypeId rv64_emu_func_type(KitCompiler* c, KitCgTypeId ret,
- const KitCgTypeId* params, u32 nparams) {
- KitCgFuncParam p[5];
- KitCgFuncResult result;
- KitCgFuncSig sig;
- u32 i;
- memset(p, 0, sizeof(p));
- for (i = 0; i < nparams; ++i) p[i].type = params[i];
- memset(&sig, 0, sizeof(sig));
- memset(&result, 0, sizeof(result));
- result.type = ret;
- sig.results = &result;
- sig.nresults = 1;
- sig.params = p;
- sig.nparams = nparams;
- sig.call_conv = KIT_CG_CC_TARGET_C;
- return kit_cg_type_func(c, sig);
-}
-
-static void rv64_emu_lift_syms_init(KitCompiler* c, KitCg* cg,
- Rv64EmuLiftSyms* out) {
- KitCgBuiltinTypes bi = kit_cg_builtin_types(c);
- KitCgTypeId params[5];
- memset(out, 0, sizeof(*out));
- out->void_ty = bi.id[KIT_CG_BUILTIN_VOID];
- out->i32 = bi.id[KIT_CG_BUILTIN_I32];
- out->i64 = bi.id[KIT_CG_BUILTIN_I64];
- out->i64_ptr = kit_cg_type_ptr(c, out->i64, 0);
- out->thread_ptr = emu_thread_type((Compiler*)c);
-
- params[0] = out->thread_ptr;
- params[1] = out->i32;
- out->xreg_fn = rv64_emu_func_type(c, out->i64, params, 2);
-
- params[0] = out->thread_ptr;
- params[1] = out->i32;
- params[2] = out->i64;
- out->set_xreg_fn = rv64_emu_func_type(c, out->void_ty, params, 3);
-
- params[0] = out->thread_ptr;
- params[1] = out->i64;
- out->load64_fn = rv64_emu_func_type(c, out->i64, params, 2);
-
- {
- KitCgTypeId load_params[5];
- load_params[0] = out->thread_ptr;
- load_params[1] = out->i64;
- load_params[2] = out->i64;
- load_params[3] = out->i64;
- load_params[4] = out->i64_ptr;
- out->load64_checked_fn = rv64_emu_func_type(c, out->i64, load_params, 5);
- }
-
- {
- KitCgTypeId store_params[5];
- store_params[0] = out->thread_ptr;
- store_params[1] = out->i64;
- store_params[2] = out->i64;
- store_params[3] = out->i64;
- store_params[4] = out->i64;
- out->store64_fn = rv64_emu_func_type(c, out->i64, store_params, 5);
- }
-
- {
- KitCgTypeId jalr_params[5];
- jalr_params[0] = out->thread_ptr;
- jalr_params[1] = out->i64;
- jalr_params[2] = out->i64;
- jalr_params[3] = out->i64;
- jalr_params[4] = out->i64;
- out->jalr_fn = rv64_emu_func_type(c, out->i64, jalr_params, 5);
- }
-
- params[0] = out->thread_ptr;
- params[1] = out->i64;
- out->syscall_fn = rv64_emu_func_type(c, out->i64, params, 2);
-
- out->xreg = rv64_emu_decl_helper(c, cg, RV64_EMU_SYM_XREG, out->xreg_fn);
- out->set_xreg =
- rv64_emu_decl_helper(c, cg, RV64_EMU_SYM_SET_XREG, out->set_xreg_fn);
- out->load64 = rv64_emu_decl_helper(c, cg, EMU_SYM_LOAD64, out->load64_fn);
- out->load64_checked = rv64_emu_decl_helper(c, cg, EMU_SYM_LOAD64_CHECKED,
- out->load64_checked_fn);
- out->store64 = rv64_emu_decl_helper(c, cg, EMU_SYM_STORE64, out->store64_fn);
- out->jalr = rv64_emu_decl_helper(c, cg, RV64_EMU_SYM_JALR, out->jalr_fn);
- out->syscall = rv64_emu_decl_helper(c, cg, EMU_SYM_SYSCALL, out->syscall_fn);
-}
-
-static KitCgMemAccess rv64_emu_mem(KitCgTypeId type) {
- KitCgMemAccess m;
- memset(&m, 0, sizeof(m));
- m.type = type;
- return m;
-}
-
-static void rv64_emu_push_thread(KitCg* cg, KitCgLocal thread,
- KitCgTypeId thread_ptr) {
- kit_cg_push_local(cg, thread);
- kit_cg_load(cg, rv64_emu_mem(thread_ptr));
-}
-
-static void rv64_emu_push_xreg(KitCg* cg, const Rv64EmuLiftSyms* s,
- KitCgLocal thread, u32 reg) {
- if (reg == 0u) {
- kit_cg_push_int(cg, 0, s->i64);
- return;
- }
- rv64_emu_push_thread(cg, thread, s->thread_ptr);
- kit_cg_push_int(cg, reg, s->i32);
- kit_cg_call_symbol(cg, s->xreg, 2, (KitCgCallAttrs){0});
-}
-
-static void rv64_emu_store_xreg_from_tmp(KitCg* cg, const Rv64EmuLiftSyms* s,
- KitCgLocal thread, KitCgLocal tmp,
- u32 reg) {
- if (reg == 0u) return;
- rv64_emu_push_thread(cg, thread, s->thread_ptr);
- kit_cg_push_int(cg, reg, s->i32);
- kit_cg_push_local(cg, tmp);
- kit_cg_load(cg, rv64_emu_mem(s->i64));
- kit_cg_call_symbol(cg, s->set_xreg, 3, (KitCgCallAttrs){0});
-}
-
-static void rv64_emu_store_xreg_from_stack(KitCg* cg, const Rv64EmuLiftSyms* s,
- KitCgLocal thread, u32 reg,
- KitCgLocal tmp) {
- kit_cg_push_local(cg, tmp);
- kit_cg_swap(cg);
- kit_cg_store(cg, rv64_emu_mem(s->i64));
- if (reg == 0u) return;
- rv64_emu_store_xreg_from_tmp(cg, s, thread, tmp, reg);
-}
-
-static void rv64_emu_store_local_from_stack(KitCg* cg, const Rv64EmuLiftSyms* s,
- KitCgLocal local) {
- kit_cg_push_local(cg, local);
- kit_cg_swap(cg);
- kit_cg_store(cg, rv64_emu_mem(s->i64));
-}
-
-static void rv64_emu_push_local_value(KitCg* cg, const Rv64EmuLiftSyms* s,
- KitCgLocal local) {
- kit_cg_push_local(cg, local);
- kit_cg_load(cg, rv64_emu_mem(s->i64));
-}
-
-static void rv64_emu_push_addr(KitCg* cg, const Rv64EmuLiftSyms* s,
- KitCgLocal thread,
- const KitDecodedOperand* mem) {
- rv64_emu_push_xreg(cg, s, thread, mem->reg);
- if (mem->imm) {
- kit_cg_push_int(cg, (u64)mem->imm, s->i64);
- kit_cg_int_binop(cg, KIT_CG_INT_ADD, 0);
- }
-}
-
-static KitStatus rv64_emu_lift_block(Compiler* compiler, KitCg* cg,
- const KitDecodedInsn* insts, u32 n,
- const EmuLiftCtx* ctx) {
- Rv64EmuLiftSyms syms;
- KitCgLocal thread;
- KitCgLocal tmp;
- KitCgLocal fault_next;
- KitCgLocalAttrs attrs;
- u64 next_pc;
- u32 i;
- KitCompiler* c;
-
- if (!compiler || !cg || !insts || !ctx) return KIT_INVALID;
- c = (KitCompiler*)compiler;
- rv64_emu_lift_syms_init(c, cg, &syms);
-
- kit_cg_func_begin(cg, ctx->block_sym);
- memset(&attrs, 0, sizeof(attrs));
- attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("thread"));
- thread = kit_cg_param(cg, 0, syms.thread_ptr, attrs);
- attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("tmp"));
- tmp = kit_cg_local(cg, syms.i64, attrs);
- attrs.name = kit_sym_intern(c, KIT_SLICE_LIT("fault_next"));
- fault_next = kit_cg_local(cg, syms.i64, attrs);
-
- next_pc = ctx->guest_pc;
- for (i = 0; i < n; ++i) {
- const KitDecodedInsn* in = &insts[i];
- next_pc = in->pc + in->nbytes;
- switch (in->opcode) {
- case RV64_DEC_ADDI: {
- u32 rd = in->operands[0].reg;
- u32 rs1 = in->operands[1].reg;
- i64 imm = in->operands[2].imm;
- rv64_emu_push_xreg(cg, &syms, thread, rs1);
- kit_cg_push_int(cg, (u64)imm, syms.i64);
- kit_cg_int_binop(cg, KIT_CG_INT_ADD, 0);
- rv64_emu_store_xreg_from_stack(cg, &syms, thread, rd, tmp);
- break;
- }
- case RV64_DEC_ADD: {
- u32 rd = in->operands[0].reg;
- u32 rs1 = in->operands[1].reg;
- u32 rs2 = in->operands[2].reg;
- rv64_emu_push_xreg(cg, &syms, thread, rs1);
- rv64_emu_push_xreg(cg, &syms, thread, rs2);
- kit_cg_int_binop(cg, KIT_CG_INT_ADD, 0);
- rv64_emu_store_xreg_from_stack(cg, &syms, thread, rd, tmp);
- break;
- }
- case RV64_DEC_AUIPC: {
- u32 rd = in->operands[0].reg;
- i64 imm = in->operands[1].imm;
- kit_cg_push_int(cg, (u64)(in->pc + (u64)imm), syms.i64);
- rv64_emu_store_xreg_from_stack(cg, &syms, thread, rd, tmp);
- break;
- }
- case RV64_DEC_LD: {
- u32 rd = in->operands[0].reg;
- KitCgLabel ok = kit_cg_label_new(cg);
- rv64_emu_push_thread(cg, thread, syms.thread_ptr);
- rv64_emu_push_addr(cg, &syms, thread, &in->operands[1]);
- kit_cg_push_int(cg, in->pc, syms.i64);
- kit_cg_push_int(cg, next_pc, syms.i64);
- kit_cg_push_local_addr(cg, tmp);
- kit_cg_call_symbol(cg, syms.load64_checked, 5, (KitCgCallAttrs){0});
- rv64_emu_store_local_from_stack(cg, &syms, fault_next);
- rv64_emu_push_local_value(cg, &syms, fault_next);
- kit_cg_push_int(cg, 0, syms.i64);
- kit_cg_int_cmp(cg, KIT_CG_INT_NE);
- kit_cg_branch_false(cg, ok);
- rv64_emu_push_local_value(cg, &syms, fault_next);
- kit_cg_ret(cg);
- kit_cg_label_place(cg, ok);
- rv64_emu_store_xreg_from_tmp(cg, &syms, thread, tmp, rd);
- break;
- }
- case RV64_DEC_SD: {
- rv64_emu_push_thread(cg, thread, syms.thread_ptr);
- rv64_emu_push_addr(cg, &syms, thread, &in->operands[1]);
- rv64_emu_push_xreg(cg, &syms, thread, in->operands[0].reg);
- kit_cg_push_int(cg, in->pc, syms.i64);
- kit_cg_push_int(cg, next_pc, syms.i64);
- kit_cg_call_symbol(cg, syms.store64, 5, (KitCgCallAttrs){0});
- kit_cg_ret(cg);
- kit_cg_func_end(cg);
- return KIT_OK;
- }
- case RV64_DEC_JALR: {
- Rv64I ji = rv64_i_unpack(in->arch[0]);
- u32 rd = ji.rd;
- u32 rs1 = ji.rs1;
- i64 imm = rv64_sext(ji.imm12, 12);
- rv64_emu_push_thread(cg, thread, syms.thread_ptr);
- kit_cg_push_int(cg, rd, syms.i64);
- kit_cg_push_int(cg, rs1, syms.i64);
- kit_cg_push_int(cg, (u64)imm, syms.i64);
- kit_cg_push_int(cg, next_pc, syms.i64);
- kit_cg_call_symbol(cg, syms.jalr, 5, (KitCgCallAttrs){0});
- kit_cg_ret(cg);
- kit_cg_func_end(cg);
- return KIT_OK;
- break;
- }
- case RV64_DEC_ECALL:
- rv64_emu_push_thread(cg, thread, syms.thread_ptr);
- kit_cg_push_int(cg, next_pc, syms.i64);
- kit_cg_call_symbol(cg, syms.syscall, 2, (KitCgCallAttrs){0});
- kit_cg_ret(cg);
- kit_cg_func_end(cg);
- return KIT_OK;
- default:
- kit_cg_push_int(cg, in->pc, syms.i64);
- kit_cg_ret(cg);
- kit_cg_func_end(cg);
- return KIT_OK;
- }
- }
-
- kit_cg_push_int(cg, next_pc, syms.i64);
- kit_cg_ret(cg);
- kit_cg_func_end(cg);
- return KIT_OK;
-}
-
-static EmuCPUState* rv64_emu_cpu_new(Compiler* c, u64 initial_pc,
- u64 initial_sp) {
- EmuCPUState* cpu = emu_cpu_new_with_arch_state(c, KIT_ARCH_RV64, initial_pc,
- sizeof(Rv64EmuCPUState),
- _Alignof(Rv64EmuCPUState));
- Rv64EmuCPUState* rv = (Rv64EmuCPUState*)emu_cpu_arch_state(cpu);
- if (rv) rv->x[2] = initial_sp;
- return cpu;
-}
-
-static Rv64EmuCPUState* rv64_thread_state(EmuThread* thread) {
- return thread ? (Rv64EmuCPUState*)emu_cpu_arch_state(emu_thread_cpu(thread))
- : NULL;
-}
-
-u64 emu_rv64_xreg(EmuThread* thread, u32 i) {
- Rv64EmuCPUState* rv = rv64_thread_state(thread);
- if (!rv || i >= 32u) return 0;
- return i == 0u ? 0u : rv->x[i];
-}
-
-void emu_rv64_set_xreg(EmuThread* thread, u32 i, u64 v) {
- Rv64EmuCPUState* rv = rv64_thread_state(thread);
- if (!rv || i >= 32u || i == 0u) return;
- rv->x[i] = v;
-}
-
-static u64 rv64_get_syscall_no(EmuThread* thread) {
- return emu_rv64_xreg(thread, 17u);
-}
-
-static u64 rv64_get_syscall_arg(EmuThread* thread, u32 index) {
- static const u32 regs[6] = {10u, 11u, 12u, 13u, 14u, 15u};
- return index < 6u ? emu_rv64_xreg(thread, regs[index]) : 0;
-}
-
-static void rv64_set_syscall_result(EmuThread* thread, u64 value) {
- emu_rv64_set_xreg(thread, 10u, value);
-}
-
-static u64 rv64_get_sp(EmuThread* thread) { return emu_rv64_xreg(thread, 2u); }
-
-static void rv64_set_sp(EmuThread* thread, u64 value) {
- emu_rv64_set_xreg(thread, 2u, value);
-}
-
-static u64 rv64_get_tp(EmuThread* thread) { return emu_rv64_xreg(thread, 4u); }
-
-static void rv64_set_tp(EmuThread* thread, u64 value) {
- emu_rv64_set_xreg(thread, 4u, value);
-}
-
-static void rv64_signal_wr64(u8* p, u64 v) {
- u32 i;
- for (i = 0; i < 8u; ++i) p[i] = (u8)(v >> (8u * i));
-}
-
-static u64 rv64_signal_rd64(const u8* p) {
- return (u64)p[0] | ((u64)p[1] << 8) | ((u64)p[2] << 16) | ((u64)p[3] << 24) |
- ((u64)p[4] << 32) | ((u64)p[5] << 40) | ((u64)p[6] << 48) |
- ((u64)p[7] << 56);
-}
-
-static u64 rv64_signal_context_size(EmuProcess* process, EmuThread* thread) {
- (void)process;
- (void)thread;
- return 32u * 8u;
-}
-
-static KitStatus rv64_save_signal_context(EmuProcess* process,
- EmuThread* thread, u8* dst,
- u64 size) {
- u32 i;
- (void)process;
- if (!thread || !dst || size < 32u * 8u) return KIT_INVALID;
- for (i = 0; i < 32u; ++i)
- rv64_signal_wr64(dst + (u64)i * 8u, emu_rv64_xreg(thread, i));
- return KIT_OK;
-}
-
-static KitStatus rv64_restore_signal_context(EmuProcess* process,
- EmuThread* thread, const u8* src,
- u64 size) {
- u32 i;
- (void)process;
- if (!thread || !src || size < 32u * 8u) return KIT_INVALID;
- for (i = 0; i < 32u; ++i)
- emu_rv64_set_xreg(thread, i, rv64_signal_rd64(src + (u64)i * 8u));
- return KIT_OK;
-}
-
-static KitStatus rv64_set_signal_handler_args(EmuProcess* process,
- EmuThread* thread, int signo,
- u64 siginfo, u64 ucontext) {
- (void)process;
- if (!thread) return KIT_INVALID;
- emu_rv64_set_xreg(thread, 10u, (u64)signo);
- emu_rv64_set_xreg(thread, 11u, siginfo);
- emu_rv64_set_xreg(thread, 12u, ucontext);
- return KIT_OK;
-}
-
-static u64 rv64_signal_stack_align(EmuProcess* process, EmuThread* thread) {
- (void)process;
- (void)thread;
- return 16u;
-}
-
-static KitStatus rv64_emit_import_thunk(EmuProcess* process, u64 thunk_vaddr) {
- u8 code[4];
- u32 word = 0x00008067u;
- u32 i;
- if (!process) return KIT_INVALID;
- for (i = 0; i < 4u; ++i) code[i] = (u8)(word >> (8u * i));
- return emu_addr_space_copy_in(&process->image.addr_space, thunk_vaddr, code,
- sizeof(code));
-}
-
-u64 emu_rv64_jalr(EmuThread* thread, u64 rd, u64 rs1, u64 imm, u64 next_pc) {
- EmuImportBinding* b = NULL;
- u64 target;
- if (rd != 0u) emu_rv64_set_xreg(thread, (u32)rd, next_pc);
- target = emu_rv64_xreg(thread, (u32)rs1) + imm;
- target &= ~1ull;
- if (emu_dl_resolve_import_thunk(thread ? thread->process : NULL, target,
- &b) == KIT_OK &&
- b) {
- u64 args[3];
- u64 result = 0;
- args[0] = emu_rv64_xreg(thread, 10u);
- args[1] = emu_rv64_xreg(thread, 11u);
- args[2] = emu_rv64_xreg(thread, 12u);
- if (emu_call_host_import(thread, b, args, 3u, &result) != KIT_OK) {
- emu_cpu_trap_fault(emu_thread_cpu(thread));
- return next_pc;
- }
- if (b->signature.result != KIT_EMU_VALUE_VOID)
- emu_rv64_set_xreg(thread, 10u, result);
- return next_pc;
- }
- return target;
-}
-
-static void* rv64_resolve_runtime_helper(void* emu, KitSlice name) {
- (void)emu;
- if (kit_slice_eq_cstr(name, RV64_EMU_SYM_XREG)) return (void*)emu_rv64_xreg;
- if (kit_slice_eq_cstr(name, RV64_EMU_SYM_SET_XREG))
- return (void*)emu_rv64_set_xreg;
- if (kit_slice_eq_cstr(name, RV64_EMU_SYM_JALR)) return (void*)emu_rv64_jalr;
- return NULL;
-}
-
-const ArchEmuOps rv64_emu_ops = {
- .cpu_new = rv64_emu_cpu_new,
- .block_fn_type = emu_block_fn_type,
- .lift_block = rv64_emu_lift_block,
- .get_gpr = emu_rv64_xreg,
- .set_gpr = emu_rv64_set_xreg,
- .get_syscall_no = rv64_get_syscall_no,
- .get_syscall_arg = rv64_get_syscall_arg,
- .set_syscall_result = rv64_set_syscall_result,
- .get_sp = rv64_get_sp,
- .set_sp = rv64_set_sp,
- .get_tp = rv64_get_tp,
- .set_tp = rv64_set_tp,
- .signal_context_size = rv64_signal_context_size,
- .save_signal_context = rv64_save_signal_context,
- .restore_signal_context = rv64_restore_signal_context,
- .set_signal_handler_args = rv64_set_signal_handler_args,
- .signal_stack_align = rv64_signal_stack_align,
- .import_thunk_size = 4u,
- .emit_import_thunk = rv64_emit_import_thunk,
- .resolve_runtime_helper = rv64_resolve_runtime_helper,
-};
diff --git a/src/arch/rv64/isa.c b/src/arch/rv64/isa.c
@@ -1,2018 +0,0 @@
-/* RV64 instruction descriptor table + operand print dispatch.
- *
- * Mirrors the aa64_isa.c pattern. Each row records (mnemonic, match,
- * mask, format, flags); rv64_disasm_find returns the first row whose
- * masked bits match the word, and rv64_print_operands renders the
- * operand text using the format's unpack helper.
- *
- * Row ordering: first-match wins. Aliases (rows with RV64_ASMFL_ALIAS)
- * use tighter masks placed BEFORE the canonical row they alias so the
- * disassembler renders the alias spelling. The assembler accepts both
- * forms via rv64_asm_find which prefers the canonical row. */
-
-#include "arch/rv64/isa.h"
-
-#include <string.h>
-
-#include "core/slice.h"
-#include "core/strbuf.h"
-
-/* True if `s` begins with the NUL-terminated literal `pfx` (length-explicit).
- */
-static bool slice_has_prefix_cstr(Slice s, const char* pfx, size_t n) {
- return s.len >= n && memcmp(s.s, pfx, n) == 0;
-}
-
-/* Family-match bit patterns. The opcode (bits 6:0) plus
- * funct3/funct7/funct5 selectors narrow each match. For aliases we pin
- * specific register fields (e.g. rs1=x0 for `li`, rd=x0 for `j`). */
-
-/* Helper: build a 32-bit match for R-type with fixed funct7/funct3/op. */
-#define MATCH_R(funct7, funct3, op) \
- (((u32)(funct7) << 25) | ((u32)(funct3) << 12) | (u32)(op))
-#define MASK_R (0xfe00707fu) /* funct7 + funct3 + opcode */
-
-#define MATCH_I(funct3, op) (((u32)(funct3) << 12) | (u32)(op))
-#define MASK_I (0x0000707fu) /* funct3 + opcode */
-
-#define MATCH_S(funct3, op) (((u32)(funct3) << 12) | (u32)(op))
-#define MASK_S (0x0000707fu)
-
-#define MATCH_B(funct3, op) (((u32)(funct3) << 12) | (u32)(op))
-#define MASK_B (0x0000707fu)
-
-#define MATCH_U(op) ((u32)(op))
-#define MASK_U (0x0000007fu)
-
-#define MATCH_J(op) ((u32)(op))
-#define MASK_J (0x0000007fu)
-
-/* FP fused multiply-add/sub: rs3(31:27) fmt(26:25) rs2 rs1 rm rd op. */
-#define MATCH_R4(fmt, op) (((u32)(fmt) << 25) | (u32)(op))
-#define MASK_R4 (0x0600007fu)
-
-/* I-type shift in RV64: funct6 (bits 31:26) is the selector + opcode +
- * funct3. shamt occupies bits 25:20. */
-#define MATCH_ISHIFT(funct6, funct3, op) \
- (((u32)(funct6) << 26) | ((u32)(funct3) << 12) | (u32)(op))
-#define MASK_ISHIFT (0xfc00707fu)
-
-/* I-type shift in 32-bit (W) form uses 7-bit funct7 + 5-bit shamt. */
-#define MATCH_ISHIFTW(funct7, funct3, op) \
- (((u32)(funct7) << 25) | ((u32)(funct3) << 12) | (u32)(op))
-#define MASK_ISHIFTW (0xfe00707fu)
-
-/* AMO: aq/rl bits 26/25 vary, so mask must exclude them. funct5 is
- * bits[31:27]. */
-#define MATCH_AMO(funct5, funct3, op) \
- (((u32)(funct5) << 27) | ((u32)(funct3) << 12) | (u32)(op))
-#define MASK_AMO (0xf800707fu)
-#define MATCH_AMO_ORDER(funct5, aq, rl, funct3, op) \
- (((u32)(funct5) << 27) | ((u32)(aq) << 26) | ((u32)(rl) << 25) | \
- ((u32)(funct3) << 12) | (u32)(op))
-#define MASK_AMO_ORDER (MASK_AMO | (3u << 25))
-
-/* FP arithmetic with rm — rm field (funct3) is don't-care. funct7
- * encodes op-major and format. */
-#define MATCH_FP_RM(funct7, op) (((u32)(funct7) << 25) | (u32)(op))
-#define MASK_FP_RM (0xfe00007fu)
-
-/* FP R-type with fixed funct3 (compare or sign-injection variants). */
-#define MATCH_FP_R(funct7, funct3, op) MATCH_R((funct7), (funct3), (op))
-#define MASK_FP_R MASK_R
-
-/* FP conversion: funct7 + rs2 (type selector) + funct3-as-rm don't-care
- * + opcode. The rs2 field (bits 24:20) selects integer width / signedness. */
-#define MATCH_FP_CVT(funct7, rs2, op) \
- (((u32)(funct7) << 25) | ((u32)(rs2) << 20) | (u32)(op))
-#define MASK_FP_CVT (0xfff0007fu)
-
-/* SYSTEM (ECALL/EBREAK) — full 32-bit value matches a single instruction. */
-#define MATCH_FULL(w) ((u32)(w))
-#define MASK_FULL (0xffffffffu)
-
-/* CSR — Zicsr. csr (imm12) is don't-care, but funct3+opcode pin the op. */
-#define MATCH_CSR(funct3) (((u32)(funct3) << 12) | (u32)RV_SYSTEM)
-#define MASK_CSR (0x0000707fu)
-
-/* Compressed 16-bit instructions live in low 16 bits of the descriptor
- * word; the mask zeroes bits 16+ to ensure a match against the C-decode
- * path which presents the halfword in low 16 bits. */
-#define MATCH_C(w16) ((u32)(w16))
-
-/* Mnemonic Slice literal for a static table row (compile-time length). */
-#define MN(s) {{(s)}, sizeof(s) - 1}
-
-const Rv64InsnDesc rv64_insn_table[] = {
- /* =================================================================
- * RV64I base — integer register ops (R-type, OP=0x33)
- * ================================================================= */
- {MN("add"), MATCH_R(0x00, 0x0, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("sub"), MATCH_R(0x20, 0x0, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("sll"), MATCH_R(0x00, 0x1, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("slt"), MATCH_R(0x00, 0x2, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("sltu"), MATCH_R(0x00, 0x3, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("xor"), MATCH_R(0x00, 0x4, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("srl"), MATCH_R(0x00, 0x5, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("sra"), MATCH_R(0x20, 0x5, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("or"), MATCH_R(0x00, 0x6, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("and"), MATCH_R(0x00, 0x7, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
-
- /* 32-bit (W) variants — OP_32 = 0x3b */
- {MN("addw"), MATCH_R(0x00, 0x0, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("subw"), MATCH_R(0x20, 0x0, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("sllw"), MATCH_R(0x00, 0x1, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("srlw"), MATCH_R(0x00, 0x5, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("sraw"), MATCH_R(0x20, 0x5, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
-
- /* ---- I-type immediate ALU (OP_IMM=0x13) ----
- * Aliases: `li rd, imm` = ADDI rd, x0, imm (rs1=x0).
- * `mv rd, rs1` = ADDI rd, rs1, 0 (imm=0).
- * `nop` = ADDI x0, x0, 0 (full word fixed). */
- {MN("nop"),
- 0x00000013u,
- 0xffffffffu,
- RV64_FMT_SYSTEM,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- {MN("li"), 0x00000013u, 0x000f807fu, RV64_FMT_I, RV64_ASMFL_ALIAS, {0, 0}},
- /* mv: ADDI with imm=0. mask requires imm12=0 + funct3=0 + op. */
- {MN("mv"), 0x00000013u, 0xfff0707fu, RV64_FMT_I, RV64_ASMFL_ALIAS, {0, 0}},
- /* seqz: SLTIU rd, rs, 1 — funct3=3, imm12=1, op=OP_IMM. */
- {MN("seqz"),
- 0x00103013u,
- 0xfff0707fu,
- RV64_FMT_I,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- /* snez: SLTU rd, x0, rs2 — rs1=x0, funct3=3, op=OP. */
- {MN("snez"),
- 0x00003033u,
- 0xfe0ff07fu,
- RV64_FMT_R,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- /* not: XORI rd, rs, -1 — imm12=0xfff, funct3=4, op=OP_IMM. */
- {MN("not"), 0xfff04013u, 0xfff0707fu, RV64_FMT_I, RV64_ASMFL_ALIAS, {0, 0}},
- /* neg: SUB rd, x0, rs2 — rs1=x0, funct7=0x20, funct3=0. */
- {MN("neg"), 0x40000033u, 0xfe0ff07fu, RV64_FMT_R, RV64_ASMFL_ALIAS, {0, 0}},
- /* negw: SUBW rd, x0, rs2. */
- {MN("negw"),
- 0x4000003bu,
- 0xfe0ff07fu,
- RV64_FMT_R,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- {MN("addi"), MATCH_I(0x0, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, {0, 0}},
- {MN("slti"), MATCH_I(0x2, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, {0, 0}},
- {MN("sltiu"), MATCH_I(0x3, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, {0, 0}},
- {MN("xori"), MATCH_I(0x4, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, {0, 0}},
- {MN("ori"), MATCH_I(0x6, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, {0, 0}},
- {MN("andi"), MATCH_I(0x7, RV_OP_IMM), MASK_I, RV64_FMT_I, 0, {0, 0}},
-
- /* RV64I shift-imm: funct6 in bits 31:26, shamt in 25:20. */
- {MN("slli"),
- MATCH_ISHIFT(0x00, 0x1, RV_OP_IMM),
- MASK_ISHIFT,
- RV64_FMT_I_SHIFT,
- 0,
- {0, 0}},
- {MN("srli"),
- MATCH_ISHIFT(0x00, 0x5, RV_OP_IMM),
- MASK_ISHIFT,
- RV64_FMT_I_SHIFT,
- 0,
- {0, 0}},
- {MN("srai"),
- MATCH_ISHIFT(0x10, 0x5, RV_OP_IMM),
- MASK_ISHIFT,
- RV64_FMT_I_SHIFT,
- 0,
- {0, 0}},
-
- /* OP_IMM_32: ADDIW + word shifts. sext.w alias = ADDIW rd, rs, 0. */
- {MN("sext.w"),
- 0x0000001bu,
- 0xfff0707fu,
- RV64_FMT_I,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- {MN("addiw"), MATCH_I(0x0, RV_OP_IMM_32), MASK_I, RV64_FMT_I, 0, {0, 0}},
- {MN("slliw"),
- MATCH_ISHIFTW(0x00, 0x1, RV_OP_IMM_32),
- MASK_ISHIFTW,
- RV64_FMT_I_SHIFTW,
- 0,
- {0, 0}},
- {MN("srliw"),
- MATCH_ISHIFTW(0x00, 0x5, RV_OP_IMM_32),
- MASK_ISHIFTW,
- RV64_FMT_I_SHIFTW,
- 0,
- {0, 0}},
- {MN("sraiw"),
- MATCH_ISHIFTW(0x20, 0x5, RV_OP_IMM_32),
- MASK_ISHIFTW,
- RV64_FMT_I_SHIFTW,
- 0,
- {0, 0}},
-
- /* ---- LUI / AUIPC ---- */
- {MN("lui"), MATCH_U(RV_LUI), MASK_U, RV64_FMT_U, 0, {0, 0}},
- {MN("auipc"), MATCH_U(RV_AUIPC), MASK_U, RV64_FMT_U, 0, {0, 0}},
-
- /* ---- Loads (I-type, op=LOAD=0x03) ---- */
- {MN("lb"), MATCH_I(0x0, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, {0, 0}},
- {MN("lh"), MATCH_I(0x1, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, {0, 0}},
- {MN("lw"), MATCH_I(0x2, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, {0, 0}},
- {MN("ld"), MATCH_I(0x3, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, {0, 0}},
- {MN("lbu"), MATCH_I(0x4, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, {0, 0}},
- {MN("lhu"), MATCH_I(0x5, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, {0, 0}},
- {MN("lwu"), MATCH_I(0x6, RV_LOAD), MASK_I, RV64_FMT_LOAD, 0, {0, 0}},
-
- /* ---- Stores (S-type, op=STORE=0x23) ---- */
- {MN("sb"), MATCH_S(0x0, RV_STORE), MASK_S, RV64_FMT_STORE, 0, {0, 0}},
- {MN("sh"), MATCH_S(0x1, RV_STORE), MASK_S, RV64_FMT_STORE, 0, {0, 0}},
- {MN("sw"), MATCH_S(0x2, RV_STORE), MASK_S, RV64_FMT_STORE, 0, {0, 0}},
- {MN("sd"), MATCH_S(0x3, RV_STORE), MASK_S, RV64_FMT_STORE, 0, {0, 0}},
-
- /* ---- Branches (B-type, op=BRANCH=0x63) ----
- * Aliases: `beqz rs, off` = BEQ rs, x0, off; `bnez rs, off` = BNE. */
- {MN("beqz"),
- 0x00000063u,
- 0x01f0707fu,
- RV64_FMT_B,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- {MN("bnez"),
- 0x00001063u,
- 0x01f0707fu,
- RV64_FMT_B,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- {MN("beq"), MATCH_B(0x0, RV_BRANCH), MASK_B, RV64_FMT_B, 0, {0, 0}},
- {MN("bne"), MATCH_B(0x1, RV_BRANCH), MASK_B, RV64_FMT_B, 0, {0, 0}},
- {MN("blt"), MATCH_B(0x4, RV_BRANCH), MASK_B, RV64_FMT_B, 0, {0, 0}},
- {MN("bge"), MATCH_B(0x5, RV_BRANCH), MASK_B, RV64_FMT_B, 0, {0, 0}},
- {MN("bltu"), MATCH_B(0x6, RV_BRANCH), MASK_B, RV64_FMT_B, 0, {0, 0}},
- {MN("bgeu"), MATCH_B(0x7, RV_BRANCH), MASK_B, RV64_FMT_B, 0, {0, 0}},
-
- /* ---- JAL / JALR ----
- * `j off` = JAL x0, off (rd=x0).
- * `jal off` = JAL ra, off (rd=ra, single-operand form).
- * `ret` = JALR x0, 0(ra) (rd=x0 + rs1=ra + imm=0).
- * `jr rs` = JALR x0, 0(rs) (rd=x0, imm=0).
- * `jalr rs` = JALR ra, 0(rs) (rd=ra, imm=0). */
- {MN("ret"),
- 0x00008067u,
- 0xffffffffu,
- RV64_FMT_SYSTEM,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- {MN("jr"),
- 0x00000067u,
- 0xfff07fffu,
- RV64_FMT_JALR,
- RV64_ASMFL_ALIAS,
- {0, 0}},
- {MN("j"), 0x0000006fu, 0x00000fffu, RV64_FMT_J, RV64_ASMFL_ALIAS, {0, 0}},
- {MN("jal"), MATCH_J(RV_JAL), MASK_J, RV64_FMT_J, 0, {0, 0}},
- {MN("jalr"), MATCH_I(0x0, RV_JALR), MASK_I, RV64_FMT_JALR, 0, {0, 0}},
-
- /* ---- Multi-word pseudo-instructions ----
- * `call sym` = AUIPC ra, %pcrel_hi(sym); JALR ra, %pcrel_lo(ra) — one
- * R_RV_CALL reloc at the AUIPC; the linker patches both.
- * `tail sym` = AUIPC t1, ...; JALR zero, t1 — same R_RV_CALL reloc.
- * `la rd,sym` / `lla rd,sym` = AUIPC rd, %pcrel_hi(sym); ADDI rd, rd,
- * %pcrel_lo. kit's static Local-Exec model treats `la`
- * and `lla` identically (no GOT indirection). The match
- * column is unused: RV64_FMT_PSEUDO dispatches on the
- * mnemonic and emits the expansion directly. */
- {MN("call"), 0u, 0u, RV64_FMT_PSEUDO, RV64_ASMFL_PSEUDO, {0, 0}},
- {MN("tail"), 0u, 0u, RV64_FMT_PSEUDO, RV64_ASMFL_PSEUDO, {0, 0}},
- {MN("la"), 0u, 0u, RV64_FMT_PSEUDO, RV64_ASMFL_PSEUDO, {0, 0}},
- {MN("lla"), 0u, 0u, RV64_FMT_PSEUDO, RV64_ASMFL_PSEUDO, {0, 0}},
-
- /* ---- FENCE ---- */
- {MN("fence"), MATCH_I(0x0, RV_FENCE), MASK_I, RV64_FMT_FENCE, 0, {0, 0}},
- {MN("fence.i"),
- MATCH_FULL(0x0000100fu),
- MASK_FULL,
- RV64_FMT_SYSTEM,
- 0,
- {0, 0}},
-
- /* ---- System (ECALL/EBREAK) ---- */
- {MN("ecall"),
- MATCH_FULL(0x00000073u),
- MASK_FULL,
- RV64_FMT_SYSTEM,
- 0,
- {0, 0}},
- {MN("ebreak"),
- MATCH_FULL(0x00100073u),
- MASK_FULL,
- RV64_FMT_SYSTEM,
- 0,
- {0, 0}},
-
- /* =================================================================
- * Zicsr (CSR access) — RV_SYSTEM with funct3 ∈ {1..3, 5..7}.
- * ================================================================= */
- {MN("csrrw"), MATCH_CSR(0x1), MASK_CSR, RV64_FMT_CSR, 0, {0, 0}},
- {MN("csrrs"), MATCH_CSR(0x2), MASK_CSR, RV64_FMT_CSR, 0, {0, 0}},
- {MN("csrrc"), MATCH_CSR(0x3), MASK_CSR, RV64_FMT_CSR, 0, {0, 0}},
- {MN("csrrwi"), MATCH_CSR(0x5), MASK_CSR, RV64_FMT_CSRI, 0, {0, 0}},
- {MN("csrrsi"), MATCH_CSR(0x6), MASK_CSR, RV64_FMT_CSRI, 0, {0, 0}},
- {MN("csrrci"), MATCH_CSR(0x7), MASK_CSR, RV64_FMT_CSRI, 0, {0, 0}},
-
- /* =================================================================
- * RV64M (multiply / divide) — funct7 = 0x01
- * ================================================================= */
- {MN("mul"), MATCH_R(0x01, 0x0, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("mulh"), MATCH_R(0x01, 0x1, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("mulhsu"), MATCH_R(0x01, 0x2, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("mulhu"), MATCH_R(0x01, 0x3, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("div"), MATCH_R(0x01, 0x4, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("divu"), MATCH_R(0x01, 0x5, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("rem"), MATCH_R(0x01, 0x6, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("remu"), MATCH_R(0x01, 0x7, RV_OP), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("mulw"), MATCH_R(0x01, 0x0, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("divw"), MATCH_R(0x01, 0x4, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("divuw"), MATCH_R(0x01, 0x5, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("remw"), MATCH_R(0x01, 0x6, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
- {MN("remuw"), MATCH_R(0x01, 0x7, RV_OP_32), MASK_R, RV64_FMT_R, 0, {0, 0}},
-
- /* =================================================================
- * RV32F / RV32D — single and double precision FP
- * ================================================================= */
- /* FP fused multiply-add/subtract — rm defaults to dyn in the assembler. */
- {MN("fmadd.s"),
- MATCH_R4(RV_FMT_S, RV_MADD),
- MASK_R4,
- RV64_FMT_R4,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fmsub.s"),
- MATCH_R4(RV_FMT_S, RV_MSUB),
- MASK_R4,
- RV64_FMT_R4,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fnmsub.s"),
- MATCH_R4(RV_FMT_S, RV_NMSUB),
- MASK_R4,
- RV64_FMT_R4,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fnmadd.s"),
- MATCH_R4(RV_FMT_S, RV_NMADD),
- MASK_R4,
- RV64_FMT_R4,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fmadd.d"),
- MATCH_R4(RV_FMT_D, RV_MADD),
- MASK_R4,
- RV64_FMT_R4,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fmsub.d"),
- MATCH_R4(RV_FMT_D, RV_MSUB),
- MASK_R4,
- RV64_FMT_R4,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fnmsub.d"),
- MATCH_R4(RV_FMT_D, RV_NMSUB),
- MASK_R4,
- RV64_FMT_R4,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fnmadd.d"),
- MATCH_R4(RV_FMT_D, RV_NMADD),
- MASK_R4,
- RV64_FMT_R4,
- RV64_ASMFL_FP,
- {0, 0}},
-
- /* FP arithmetic — rm field (funct3) is the rounding mode and prints
- * as the DYN(=7) default suppressed. funct7 low bits select fmt. */
- {MN("fadd.s"),
- MATCH_FP_RM(0x00, RV_OP_FP),
- MASK_FP_RM,
- RV64_FMT_FP_RM,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fsub.s"),
- MATCH_FP_RM(0x04, RV_OP_FP),
- MASK_FP_RM,
- RV64_FMT_FP_RM,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fmul.s"),
- MATCH_FP_RM(0x08, RV_OP_FP),
- MASK_FP_RM,
- RV64_FMT_FP_RM,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fdiv.s"),
- MATCH_FP_RM(0x0c, RV_OP_FP),
- MASK_FP_RM,
- RV64_FMT_FP_RM,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fadd.d"),
- MATCH_FP_RM(0x01, RV_OP_FP),
- MASK_FP_RM,
- RV64_FMT_FP_RM,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fsub.d"),
- MATCH_FP_RM(0x05, RV_OP_FP),
- MASK_FP_RM,
- RV64_FMT_FP_RM,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fmul.d"),
- MATCH_FP_RM(0x09, RV_OP_FP),
- MASK_FP_RM,
- RV64_FMT_FP_RM,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fdiv.d"),
- MATCH_FP_RM(0x0d, RV_OP_FP),
- MASK_FP_RM,
- RV64_FMT_FP_RM,
- RV64_ASMFL_FP,
- {0, 0}},
-
- /* FP sqrt — funct7 = 0x2c (S) / 0x2d (D), rs2 must be 0. */
- {MN("fsqrt.s"),
- MATCH_FP_CVT(0x2c, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fsqrt.d"),
- MATCH_FP_CVT(0x2d, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
-
- /* FP min/max — funct7 = 0x14/0x15, funct3 = 0 (min) / 1 (max). */
- {MN("fmin.s"),
- MATCH_FP_R(0x14, 0x0, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fmax.s"),
- MATCH_FP_R(0x14, 0x1, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fmin.d"),
- MATCH_FP_R(0x15, 0x0, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fmax.d"),
- MATCH_FP_R(0x15, 0x1, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
-
- /* FP sign-injection — funct7 = 0x10/0x11, funct3 = 0/1/2 = J/JN/JX. */
- {MN("fsgnj.s"),
- MATCH_FP_R(0x10, 0x0, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fsgnjn.s"),
- MATCH_FP_R(0x10, 0x1, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fsgnjx.s"),
- MATCH_FP_R(0x10, 0x2, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fsgnj.d"),
- MATCH_FP_R(0x11, 0x0, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fsgnjn.d"),
- MATCH_FP_R(0x11, 0x1, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fsgnjx.d"),
- MATCH_FP_R(0x11, 0x2, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_FP | RV64_ASMFL_NORM,
- {0, 0}},
-
- /* FP compare — funct7 = 0x50 (S) / 0x51 (D), funct3 = 0/1/2 = LE/LT/EQ.
- * rd is integer GPR (not FP). */
- {MN("fle.s"),
- MATCH_FP_R(0x50, 0x0, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_NORM,
- {0, 0}},
- {MN("flt.s"),
- MATCH_FP_R(0x50, 0x1, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_NORM,
- {0, 0}},
- {MN("feq.s"),
- MATCH_FP_R(0x50, 0x2, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_NORM,
- {0, 0}},
- {MN("fle.d"),
- MATCH_FP_R(0x51, 0x0, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_NORM,
- {0, 0}},
- {MN("flt.d"),
- MATCH_FP_R(0x51, 0x1, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_NORM,
- {0, 0}},
- {MN("feq.d"),
- MATCH_FP_R(0x51, 0x2, RV_OP_FP),
- MASK_FP_R,
- RV64_FMT_FP_R,
- RV64_ASMFL_NORM,
- {0, 0}},
-
- /* FP classification — rd is GPR, rs1 is FPR, rs2=0, rm/funct3=1. */
- {MN("fclass.s"),
- MATCH_FP_R(0x70, 0x1, RV_OP_FP) | (0u << 20),
- MASK_FP_CVT | (7u << 12),
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fclass.d"),
- MATCH_FP_R(0x71, 0x1, RV_OP_FP) | (0u << 20),
- MASK_FP_CVT | (7u << 12),
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
-
- /* FP conversions — funct7 selects {direction, fmt}, rs2 selects
- * integer width/signedness. */
- {MN("fcvt.w.s"),
- MATCH_FP_CVT(0x60, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fcvt.wu.s"),
- MATCH_FP_CVT(0x60, 0x1, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fcvt.l.s"),
- MATCH_FP_CVT(0x60, 0x2, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fcvt.lu.s"),
- MATCH_FP_CVT(0x60, 0x3, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fcvt.w.d"),
- MATCH_FP_CVT(0x61, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fcvt.wu.d"),
- MATCH_FP_CVT(0x61, 0x1, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fcvt.l.d"),
- MATCH_FP_CVT(0x61, 0x2, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fcvt.lu.d"),
- MATCH_FP_CVT(0x61, 0x3, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fcvt.s.w"),
- MATCH_FP_CVT(0x68, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.s.wu"),
- MATCH_FP_CVT(0x68, 0x1, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.s.l"),
- MATCH_FP_CVT(0x68, 0x2, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.s.lu"),
- MATCH_FP_CVT(0x68, 0x3, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.d.w"),
- MATCH_FP_CVT(0x69, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.d.wu"),
- MATCH_FP_CVT(0x69, 0x1, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.d.l"),
- MATCH_FP_CVT(0x69, 0x2, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.d.lu"),
- MATCH_FP_CVT(0x69, 0x3, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.s.d"),
- MATCH_FP_CVT(0x20, 0x1, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fcvt.d.s"),
- MATCH_FP_CVT(0x21, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
-
- /* FP bitcast moves — funct7 + rs2=0 + funct3=0 fixed. */
- {MN("fmv.x.w"),
- MATCH_FP_CVT(0x70, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fmv.w.x"),
- MATCH_FP_CVT(0x78, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fmv.x.d"),
- MATCH_FP_CVT(0x71, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- 0,
- {0, 0}},
- {MN("fmv.d.x"),
- MATCH_FP_CVT(0x79, 0x0, RV_OP_FP),
- MASK_FP_CVT,
- RV64_FMT_FP_CVT,
- RV64_ASMFL_FP,
- {0, 0}},
-
- /* FP load/store */
- {MN("flw"),
- MATCH_I(0x2, RV_LOAD_FP),
- MASK_I,
- RV64_FMT_FP_LOAD,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fld"),
- MATCH_I(0x3, RV_LOAD_FP),
- MASK_I,
- RV64_FMT_FP_LOAD,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fsw"),
- MATCH_S(0x2, RV_STORE_FP),
- MASK_S,
- RV64_FMT_FP_STORE,
- RV64_ASMFL_FP,
- {0, 0}},
- {MN("fsd"),
- MATCH_S(0x3, RV_STORE_FP),
- MASK_S,
- RV64_FMT_FP_STORE,
- RV64_ASMFL_FP,
- {0, 0}},
-
- /* =================================================================
- * RV64A (atomic) — AMO funct5 + funct3 (W=2, D=3). aq/rl vary, so
- * mask leaves bits 26:25 free. We expose the .aq/.rl ordering
- * suffixes via the disassembler's annotation, but the row mnemonic
- * itself is the bare form (e.g. "amoadd.w").
- * ================================================================= */
- {MN("lr.w.aq"),
- MATCH_AMO_ORDER(0x02, 1, 0, 0x2, RV_AMO),
- MASK_AMO_ORDER | (0x1fu << 20),
- RV64_FMT_LR,
- 0,
- {0, 0}},
- {MN("lr.w.rl"),
- MATCH_AMO_ORDER(0x02, 0, 1, 0x2, RV_AMO),
- MASK_AMO_ORDER | (0x1fu << 20),
- RV64_FMT_LR,
- 0,
- {0, 0}},
- {MN("lr.w.aqrl"),
- MATCH_AMO_ORDER(0x02, 1, 1, 0x2, RV_AMO),
- MASK_AMO_ORDER | (0x1fu << 20),
- RV64_FMT_LR,
- 0,
- {0, 0}},
- {MN("lr.d.aq"),
- MATCH_AMO_ORDER(0x02, 1, 0, 0x3, RV_AMO),
- MASK_AMO_ORDER | (0x1fu << 20),
- RV64_FMT_LR,
- 0,
- {0, 0}},
- {MN("lr.d.rl"),
- MATCH_AMO_ORDER(0x02, 0, 1, 0x3, RV_AMO),
- MASK_AMO_ORDER | (0x1fu << 20),
- RV64_FMT_LR,
- 0,
- {0, 0}},
- {MN("lr.d.aqrl"),
- MATCH_AMO_ORDER(0x02, 1, 1, 0x3, RV_AMO),
- MASK_AMO_ORDER | (0x1fu << 20),
- RV64_FMT_LR,
- 0,
- {0, 0}},
- {MN("sc.w.aq"),
- MATCH_AMO_ORDER(0x03, 1, 0, 0x2, RV_AMO),
- MASK_AMO_ORDER,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("sc.w.rl"),
- MATCH_AMO_ORDER(0x03, 0, 1, 0x2, RV_AMO),
- MASK_AMO_ORDER,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("sc.w.aqrl"),
- MATCH_AMO_ORDER(0x03, 1, 1, 0x2, RV_AMO),
- MASK_AMO_ORDER,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("sc.d.aq"),
- MATCH_AMO_ORDER(0x03, 1, 0, 0x3, RV_AMO),
- MASK_AMO_ORDER,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("sc.d.rl"),
- MATCH_AMO_ORDER(0x03, 0, 1, 0x3, RV_AMO),
- MASK_AMO_ORDER,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("sc.d.aqrl"),
- MATCH_AMO_ORDER(0x03, 1, 1, 0x3, RV_AMO),
- MASK_AMO_ORDER,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
-#define RV64_AMO_ORDER_ROWS(mn, f5, f3) \
- {MN(mn ".aq"), \
- MATCH_AMO_ORDER(f5, 1, 0, f3, RV_AMO), \
- MASK_AMO_ORDER, \
- RV64_FMT_AMO, \
- 0, \
- {0, 0}}, \
- {MN(mn ".rl"), \
- MATCH_AMO_ORDER(f5, 0, 1, f3, RV_AMO), \
- MASK_AMO_ORDER, \
- RV64_FMT_AMO, \
- 0, \
- {0, 0}}, \
- { \
- MN(mn ".aqrl"), MATCH_AMO_ORDER(f5, 1, 1, f3, RV_AMO), MASK_AMO_ORDER, \
- RV64_FMT_AMO, 0, {0, 0} \
- }
- RV64_AMO_ORDER_ROWS("amoswap.w", RV_AMO_SWAP, 0x2),
- RV64_AMO_ORDER_ROWS("amoadd.w", RV_AMO_ADD, 0x2),
- RV64_AMO_ORDER_ROWS("amoxor.w", RV_AMO_XOR, 0x2),
- RV64_AMO_ORDER_ROWS("amoand.w", RV_AMO_AND, 0x2),
- RV64_AMO_ORDER_ROWS("amoor.w", RV_AMO_OR, 0x2),
- RV64_AMO_ORDER_ROWS("amomin.w", RV_AMO_MIN, 0x2),
- RV64_AMO_ORDER_ROWS("amomax.w", RV_AMO_MAX, 0x2),
- RV64_AMO_ORDER_ROWS("amominu.w", RV_AMO_MINU, 0x2),
- RV64_AMO_ORDER_ROWS("amomaxu.w", RV_AMO_MAXU, 0x2),
- RV64_AMO_ORDER_ROWS("amoswap.d", RV_AMO_SWAP, 0x3),
- RV64_AMO_ORDER_ROWS("amoadd.d", RV_AMO_ADD, 0x3),
- RV64_AMO_ORDER_ROWS("amoxor.d", RV_AMO_XOR, 0x3),
- RV64_AMO_ORDER_ROWS("amoand.d", RV_AMO_AND, 0x3),
- RV64_AMO_ORDER_ROWS("amoor.d", RV_AMO_OR, 0x3),
- RV64_AMO_ORDER_ROWS("amomin.d", RV_AMO_MIN, 0x3),
- RV64_AMO_ORDER_ROWS("amomax.d", RV_AMO_MAX, 0x3),
- RV64_AMO_ORDER_ROWS("amominu.d", RV_AMO_MINU, 0x3),
- RV64_AMO_ORDER_ROWS("amomaxu.d", RV_AMO_MAXU, 0x3),
- {MN("lr.w"),
- MATCH_AMO(0x02, 0x2, RV_AMO),
- MASK_AMO | (0x1fu << 20),
- RV64_FMT_LR,
- 0,
- {0, 0}},
- {MN("lr.d"),
- MATCH_AMO(0x02, 0x3, RV_AMO),
- MASK_AMO | (0x1fu << 20),
- RV64_FMT_LR,
- 0,
- {0, 0}},
- {MN("sc.w"),
- MATCH_AMO(0x03, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("sc.d"),
- MATCH_AMO(0x03, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoswap.w"),
- MATCH_AMO(RV_AMO_SWAP, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoadd.w"),
- MATCH_AMO(RV_AMO_ADD, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoxor.w"),
- MATCH_AMO(RV_AMO_XOR, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoand.w"),
- MATCH_AMO(RV_AMO_AND, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoor.w"),
- MATCH_AMO(RV_AMO_OR, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amomin.w"),
- MATCH_AMO(RV_AMO_MIN, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amomax.w"),
- MATCH_AMO(RV_AMO_MAX, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amominu.w"),
- MATCH_AMO(RV_AMO_MINU, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amomaxu.w"),
- MATCH_AMO(RV_AMO_MAXU, 0x2, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoswap.d"),
- MATCH_AMO(RV_AMO_SWAP, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoadd.d"),
- MATCH_AMO(RV_AMO_ADD, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoxor.d"),
- MATCH_AMO(RV_AMO_XOR, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoand.d"),
- MATCH_AMO(RV_AMO_AND, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amoor.d"),
- MATCH_AMO(RV_AMO_OR, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amomin.d"),
- MATCH_AMO(RV_AMO_MIN, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amomax.d"),
- MATCH_AMO(RV_AMO_MAX, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amominu.d"),
- MATCH_AMO(RV_AMO_MINU, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
- {MN("amomaxu.d"),
- MATCH_AMO(RV_AMO_MAXU, 0x3, RV_AMO),
- MASK_AMO,
- RV64_FMT_AMO,
- 0,
- {0, 0}},
-
- /* =================================================================
- * RV64C compressed — assembler rows. The disassembler uses the
- * dynamic C decoder below, so 32-bit decode skips these rows.
- * ================================================================= */
- {MN("c.nop"), 0x0001u, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, {0, 0}},
- {MN("c.ebreak"), 0x9002u, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, {0, 0}},
- {MN("c.jr"), 0x8002u, 0xf07fu, RV64_FMT_CR, RV64_ASMFL_C16, {0, 0}},
- {MN("c.jalr"), 0x9002u, 0xf07fu, RV64_FMT_CR, RV64_ASMFL_C16, {0, 0}},
- {MN("c.mv"), 0x8002u, 0xf003u, RV64_FMT_CR, RV64_ASMFL_C16, {0, 0}},
- {MN("c.add"), 0x9002u, 0xf003u, RV64_FMT_CR, RV64_ASMFL_C16, {0, 0}},
- {MN("c.li"), 0x4001u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, {0, 0}},
- {MN("c.addi"), 0x0001u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, {0, 0}},
- {MN("c.addiw"), 0x2001u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, {0, 0}},
- {MN("c.slli"), 0x0002u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, {0, 0}},
- {MN("c.lui"), 0x6001u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, {0, 0}},
- {MN("c.addi16sp"), 0x6101u, 0xef83u, RV64_FMT_CI, RV64_ASMFL_C16, {0, 0}},
- {MN("c.lwsp"), 0x4002u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, {0, 0}},
- {MN("c.ldsp"), 0x6002u, 0xe003u, RV64_FMT_CI, RV64_ASMFL_C16, {0, 0}},
- {MN("c.fldsp"),
- 0x2002u,
- 0xe003u,
- RV64_FMT_CI,
- RV64_ASMFL_C16 | RV64_ASMFL_FP,
- {0, 0}},
- {MN("c.swsp"), 0xc002u, 0xe003u, RV64_FMT_CSS, RV64_ASMFL_C16, {0, 0}},
- {MN("c.sdsp"), 0xe002u, 0xe003u, RV64_FMT_CSS, RV64_ASMFL_C16, {0, 0}},
- {MN("c.fsdsp"),
- 0xa002u,
- 0xe003u,
- RV64_FMT_CSS,
- RV64_ASMFL_C16 | RV64_ASMFL_FP,
- {0, 0}},
- {MN("c.addi4spn"), 0x0000u, 0xe003u, RV64_FMT_CIW, RV64_ASMFL_C16, {0, 0}},
- {MN("c.lw"), 0x4000u, 0xe003u, RV64_FMT_CL, RV64_ASMFL_C16, {0, 0}},
- {MN("c.ld"), 0x6000u, 0xe003u, RV64_FMT_CL, RV64_ASMFL_C16, {0, 0}},
- {MN("c.fld"),
- 0x2000u,
- 0xe003u,
- RV64_FMT_CL,
- RV64_ASMFL_C16 | RV64_ASMFL_FP,
- {0, 0}},
- {MN("c.sw"), 0xc000u, 0xe003u, RV64_FMT_CS, RV64_ASMFL_C16, {0, 0}},
- {MN("c.sd"), 0xe000u, 0xe003u, RV64_FMT_CS, RV64_ASMFL_C16, {0, 0}},
- {MN("c.fsd"),
- 0xa000u,
- 0xe003u,
- RV64_FMT_CS,
- RV64_ASMFL_C16 | RV64_ASMFL_FP,
- {0, 0}},
- {MN("c.srli"), 0x8001u, 0xec03u, RV64_FMT_CB, RV64_ASMFL_C16, {0, 0}},
- {MN("c.srai"), 0x8401u, 0xec03u, RV64_FMT_CB, RV64_ASMFL_C16, {0, 0}},
- {MN("c.andi"), 0x8801u, 0xec03u, RV64_FMT_CB, RV64_ASMFL_C16, {0, 0}},
- {MN("c.sub"), 0x8c01u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, {0, 0}},
- {MN("c.xor"), 0x8c21u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, {0, 0}},
- {MN("c.or"), 0x8c41u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, {0, 0}},
- {MN("c.and"), 0x8c61u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, {0, 0}},
- {MN("c.subw"), 0x9c01u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, {0, 0}},
- {MN("c.addw"), 0x9c21u, 0xfc63u, RV64_FMT_CA, RV64_ASMFL_C16, {0, 0}},
- {MN("c.j"), 0xa001u, 0xe003u, RV64_FMT_CJ, RV64_ASMFL_C16, {0, 0}},
- {MN("c.beqz"), 0xc001u, 0xe003u, RV64_FMT_CB, RV64_ASMFL_C16, {0, 0}},
- {MN("c.bnez"), 0xe001u, 0xe003u, RV64_FMT_CB, RV64_ASMFL_C16, {0, 0}},
-};
-#undef RV64_AMO_ORDER_ROWS
-
-const u32 rv64_insn_table_n =
- (u32)(sizeof rv64_insn_table / sizeof rv64_insn_table[0]);
-
-const Rv64InsnDesc* rv64_disasm_find(u32 word) {
- for (u32 i = 0; i < rv64_insn_table_n; ++i) {
- const Rv64InsnDesc* d = &rv64_insn_table[i];
- if ((d->flags & RV64_ASMFL_C16)) continue; /* 32-bit decode path */
- if ((d->flags & RV64_ASMFL_PSEUDO)) continue; /* assembler-only expansion */
- if ((word & d->mask) == d->match) return d;
- }
- return NULL;
-}
-
-const Rv64InsnDesc* rv64_asm_find(Slice mnemonic) {
- /* Prefer canonical (non-alias) rows when both spellings exist; the
- * caller can still write the alias and we'll match it on a second
- * pass. Aliases share encoding with the canonical row so the choice
- * is purely for diagnostics. */
- if (!mnemonic.s) return NULL;
- for (u32 i = 0; i < rv64_insn_table_n; ++i) {
- const Rv64InsnDesc* d = &rv64_insn_table[i];
- if ((d->flags & RV64_ASMFL_ALIAS)) continue;
- if (slice_eq(d->mnemonic, mnemonic)) return d;
- }
- for (u32 i = 0; i < rv64_insn_table_n; ++i) {
- const Rv64InsnDesc* d = &rv64_insn_table[i];
- if (slice_eq(d->mnemonic, mnemonic)) return d;
- }
- return NULL;
-}
-
-/* =====================================================================
- * Compressed-instruction decode.
- *
- * RV64C instructions are 16 bits; bits[1:0] (op-quadrant) is 00/01/10
- * (11 means uncompressed/32-bit). bits[15:13] (funct3) further select.
- *
- * For the disassembler we expose a small set of the common encodings;
- * less common ones decode as .hword. */
-
-static u32 rv64c_lookup_simple(u32 w) {
- u32 op = w & 0x3u;
- u32 f3 = (w >> 13) & 0x7u;
- /* C.NOP: funct3=000, op=01, rd/rs1=x0, imm=0 → word=0x0001 */
- if (w == 0x0001u) return 1; /* index in table-c below */
- /* C.EBREAK: 0x9002 */
- if (w == 0x9002u) return 2;
- (void)op;
- (void)f3;
- return 0;
-}
-
-/* The C-extension descriptors are stored in a private table indexed by
- * an internal enum. They are minimal — most C-format instructions print
- * with custom operand printers. */
-static const Rv64InsnDesc rv64_c_table[] = {
- /* index 0 reserved (no match). */
- {MN("c.unknown"), 0, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, {0, 0}},
- {MN("c.nop"), 0x0001u, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, {0, 0}},
- {MN("c.ebreak"), 0x9002u, 0xffffu, RV64_FMT_C_NONE, RV64_ASMFL_C16, {0, 0}},
-};
-
-#undef MN
-
-const Rv64InsnDesc* rv64_disasm_find_c(u32 word) {
- u32 hw = word & 0xffffu;
- u32 idx = rv64c_lookup_simple(hw);
- if (idx) return &rv64_c_table[idx];
- /* Pattern-match remaining common C-instructions. We use a tiny static
- * scratch descriptor that the printer interprets by funct3+op. */
- static Rv64InsnDesc dyn;
- u32 op = hw & 0x3u;
- u32 f3 = (hw >> 13) & 0x7u;
- if (op == 3u) return NULL; /* uncompressed */
-
- /* C.JR / C.JALR / C.MV / C.ADD — quadrant 2, funct3=100 */
- if (op == 2u && f3 == 4u) {
- u32 funct4 = (hw >> 12) & 0xfu;
- u32 rd_rs1 = (hw >> 7) & 0x1fu;
- u32 rs2 = (hw >> 2) & 0x1fu;
- if (funct4 == 0x8u) {
- dyn = (Rv64InsnDesc){slice_from_cstr(rs2 == 0 ? "c.jr" : "c.mv"),
- hw,
- 0xffffu,
- RV64_FMT_CR,
- RV64_ASMFL_C16,
- {0, 0}};
- return rd_rs1 == 0 ? NULL : &dyn;
- }
- if (funct4 == 0x9u) {
- if (rs2 == 0 && rd_rs1 == 0) {
- dyn = rv64_c_table[2]; /* c.ebreak */
- return &dyn;
- }
- dyn = (Rv64InsnDesc){slice_from_cstr(rs2 == 0 ? "c.jalr" : "c.add"),
- hw,
- 0xffffu,
- RV64_FMT_CR,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- }
- /* C.LI / C.ADDI / C.LUI — quadrant 1 */
- if (op == 1u && f3 == 2u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.li"), hw, 0xffffu, RV64_FMT_CI,
- RV64_ASMFL_C16, {0, 0}};
- return &dyn;
- }
- if (op == 1u && f3 == 1u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.addiw"),
- hw,
- 0xffffu,
- RV64_FMT_CI,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 1u && f3 == 0u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.addi"),
- hw,
- 0xffffu,
- RV64_FMT_CI,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 1u && f3 == 3u) {
- u32 rd = (hw >> 7) & 0x1fu;
- dyn = (Rv64InsnDesc){slice_from_cstr(rd == 2u ? "c.addi16sp" : "c.lui"),
- hw,
- 0xffffu,
- RV64_FMT_CI,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 1u && f3 == 4u) {
- u32 top = (hw >> 10) & 0x3u;
- if (top == 0u || top == 1u || top == 2u) {
- static const char* const names[3] = {"c.srli", "c.srai", "c.andi"};
- dyn = (Rv64InsnDesc){slice_from_cstr(names[top]),
- hw,
- 0xffffu,
- RV64_FMT_CB,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- {
- u32 bit12 = (hw >> 12) & 1u;
- u32 subop = (hw >> 5) & 0x3u;
- static const char* const ca0[4] = {"c.sub", "c.xor", "c.or", "c.and"};
- static const char* const ca1[4] = {"c.subw", "c.addw", NULL, NULL};
- const char* name = bit12 ? ca1[subop] : ca0[subop];
- if (!name) return NULL;
- dyn = (Rv64InsnDesc){slice_from_cstr(name), hw, 0xffffu, RV64_FMT_CA,
- RV64_ASMFL_C16, {0, 0}};
- return &dyn;
- }
- }
- if (op == 1u && f3 == 5u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.j"), hw, 0xffffu, RV64_FMT_CJ,
- RV64_ASMFL_C16, {0, 0}};
- return &dyn;
- }
- if (op == 1u && f3 == 6u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.beqz"),
- hw,
- 0xffffu,
- RV64_FMT_CB,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 1u && f3 == 7u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.bnez"),
- hw,
- 0xffffu,
- RV64_FMT_CB,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- /* C.LWSP / C.LDSP — quadrant 2, funct3=010/011 */
- if (op == 2u && f3 == 2u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.lwsp"),
- hw,
- 0xffffu,
- RV64_FMT_CI,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 2u && f3 == 3u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.ldsp"),
- hw,
- 0xffffu,
- RV64_FMT_CI,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 2u && f3 == 0u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.slli"),
- hw,
- 0xffffu,
- RV64_FMT_CI,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 2u && f3 == 1u) {
- dyn = (Rv64InsnDesc){
- slice_from_cstr("c.fldsp"), hw, 0xffffu, RV64_FMT_CI,
- RV64_ASMFL_C16 | RV64_ASMFL_FP, {0, 0}};
- return &dyn;
- }
- /* C.SWSP / C.SDSP — quadrant 2, funct3=110/111 */
- if (op == 2u && f3 == 6u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.swsp"),
- hw,
- 0xffffu,
- RV64_FMT_CSS,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 2u && f3 == 7u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.sdsp"),
- hw,
- 0xffffu,
- RV64_FMT_CSS,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- if (op == 2u && f3 == 5u) {
- dyn = (Rv64InsnDesc){
- slice_from_cstr("c.fsdsp"), hw, 0xffffu, RV64_FMT_CSS,
- RV64_ASMFL_C16 | RV64_ASMFL_FP, {0, 0}};
- return &dyn;
- }
- /* C.ADDI4SPN — quadrant 0, funct3=000 */
- if (op == 0u && f3 == 0u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.addi4spn"),
- hw,
- 0xffffu,
- RV64_FMT_CIW,
- RV64_ASMFL_C16,
- {0, 0}};
- return &dyn;
- }
- /* C.LW / C.LD — quadrant 0, funct3=010/011 */
- if (op == 0u && f3 == 2u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.lw"), hw, 0xffffu, RV64_FMT_CL,
- RV64_ASMFL_C16, {0, 0}};
- return &dyn;
- }
- if (op == 0u && f3 == 3u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.ld"), hw, 0xffffu, RV64_FMT_CL,
- RV64_ASMFL_C16, {0, 0}};
- return &dyn;
- }
- if (op == 0u && f3 == 1u) {
- dyn = (Rv64InsnDesc){
- slice_from_cstr("c.fld"), hw, 0xffffu, RV64_FMT_CL,
- RV64_ASMFL_C16 | RV64_ASMFL_FP, {0, 0}};
- return &dyn;
- }
- if (op == 0u && f3 == 6u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.sw"), hw, 0xffffu, RV64_FMT_CS,
- RV64_ASMFL_C16, {0, 0}};
- return &dyn;
- }
- if (op == 0u && f3 == 7u) {
- dyn = (Rv64InsnDesc){slice_from_cstr("c.sd"), hw, 0xffffu, RV64_FMT_CS,
- RV64_ASMFL_C16, {0, 0}};
- return &dyn;
- }
- if (op == 0u && f3 == 5u) {
- dyn = (Rv64InsnDesc){
- slice_from_cstr("c.fsd"), hw, 0xffffu, RV64_FMT_CS,
- RV64_ASMFL_C16 | RV64_ASMFL_FP, {0, 0}};
- return &dyn;
- }
- return NULL;
-}
-
-/* =====================================================================
- * Operand print — one helper per format. */
-
-static const char* const RV_XNAMES[32] = {
- "zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2", "s0", "s1", "a0",
- "a1", "a2", "a3", "a4", "a5", "a6", "a7", "s2", "s3", "s4", "s5",
- "s6", "s7", "s8", "s9", "s10", "s11", "t3", "t4", "t5", "t6",
-};
-
-static const char* const RV_FNAMES[32] = {
- "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7",
- "fs0", "fs1", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5",
- "fa6", "fa7", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7",
- "fs8", "fs9", "fs10", "fs11", "ft8", "ft9", "ft10", "ft11",
-};
-
-static void p_xreg(StrBuf* sb, u32 r) { strbuf_puts(sb, RV_XNAMES[r & 31u]); }
-static void p_freg(StrBuf* sb, u32 r) { strbuf_puts(sb, RV_FNAMES[r & 31u]); }
-static void p_sep(StrBuf* sb) { strbuf_puts(sb, ", "); }
-static void p_mem(StrBuf* sb, i64 off, u32 base) {
- strbuf_put_i64(sb, off);
- strbuf_putc(sb, '(');
- p_xreg(sb, base);
- strbuf_putc(sb, ')');
-}
-static void p_rel(StrBuf* sb, u64 vaddr, i64 off) {
- if (vaddr)
- strbuf_put_hex_u64(sb, vaddr + (u64)off);
- else {
- strbuf_putc(sb, '#');
- strbuf_put_i64(sb, off);
- }
-}
-
-static void print_r(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- Rv64R f = rv64_r_unpack(w);
- /* Two-operand aliases (snez/neg/negw) drop rs1=x0 from the print. */
- if (d->flags & RV64_ASMFL_ALIAS) {
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_xreg(sb, f.rs2);
- return;
- }
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_xreg(sb, f.rs1);
- p_sep(sb);
- p_xreg(sb, f.rs2);
-}
-
-static void print_r4(StrBuf* sb, u32 w) {
- u32 rd = (w >> 7) & 0x1fu;
- u32 rs1 = (w >> 15) & 0x1fu;
- u32 rs2 = (w >> 20) & 0x1fu;
- u32 rs3 = (w >> 27) & 0x1fu;
- p_freg(sb, rd);
- p_sep(sb);
- p_freg(sb, rs1);
- p_sep(sb);
- p_freg(sb, rs2);
- p_sep(sb);
- p_freg(sb, rs3);
-}
-
-static void print_i(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- Rv64I f = rv64_i_unpack(w);
- i64 imm = rv64_sext((u64)f.imm12, 12);
- /* Alias: `li rd, imm` — print rd, imm. */
- if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "li")) {
- p_xreg(sb, f.rd);
- p_sep(sb);
- strbuf_put_i64(sb, imm);
- return;
- }
- /* Alias: `mv rd, rs1` — print rd, rs1. */
- if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "mv")) {
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_xreg(sb, f.rs1);
- return;
- }
- /* Alias: `sext.w rd, rs1` — print rd, rs1. */
- if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "sext.w")) {
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_xreg(sb, f.rs1);
- return;
- }
- /* Alias: `seqz rd, rs` / `not rd, rs` — print rd, rs (drop imm). */
- if ((d->flags & RV64_ASMFL_ALIAS) && (slice_eq_cstr(d->mnemonic, "seqz") ||
- slice_eq_cstr(d->mnemonic, "not"))) {
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_xreg(sb, f.rs1);
- return;
- }
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_xreg(sb, f.rs1);
- p_sep(sb);
- strbuf_put_i64(sb, imm);
-}
-
-static void print_i_shift(StrBuf* sb, u32 w) {
- /* shamt is 6 bits for RV64 shift-imm. */
- u32 rd = (w >> 7) & 0x1fu;
- u32 rs1 = (w >> 15) & 0x1fu;
- u32 shamt = (w >> 20) & 0x3fu;
- p_xreg(sb, rd);
- p_sep(sb);
- p_xreg(sb, rs1);
- p_sep(sb);
- strbuf_put_u64(sb, (u64)shamt);
-}
-
-static void print_i_shiftw(StrBuf* sb, u32 w) {
- u32 rd = (w >> 7) & 0x1fu;
- u32 rs1 = (w >> 15) & 0x1fu;
- u32 shamt = (w >> 20) & 0x1fu;
- p_xreg(sb, rd);
- p_sep(sb);
- p_xreg(sb, rs1);
- p_sep(sb);
- strbuf_put_u64(sb, (u64)shamt);
-}
-
-static void print_u(StrBuf* sb, u32 w) {
- Rv64U f = rv64_u_unpack(w);
- p_xreg(sb, f.rd);
- p_sep(sb);
- /* The immediate is the upper-20 already shifted into bits 31:12; print
- * the raw 20-bit value the assembler expects. */
- strbuf_put_hex_u64(sb, (u64)(f.imm32_hi20 >> 12));
-}
-
-static void print_load(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- Rv64I f = rv64_i_unpack(w);
- i64 imm = rv64_sext((u64)f.imm12, 12);
- if (d->flags & RV64_ASMFL_FP)
- p_freg(sb, f.rd);
- else
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_mem(sb, imm, f.rs1);
-}
-
-static void print_store(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- Rv64S f = rv64_s_unpack(w);
- i64 imm = rv64_sext((u64)f.imm12, 12);
- if (d->flags & RV64_ASMFL_FP)
- p_freg(sb, f.rs2);
- else
- p_xreg(sb, f.rs2);
- p_sep(sb);
- p_mem(sb, imm, f.rs1);
-}
-
-static void print_b(StrBuf* sb, u32 w, u64 vaddr, const Rv64InsnDesc* d) {
- Rv64B f = rv64_b_unpack(w);
- i64 off = rv64_sext((u64)f.imm13, 13);
- if ((d->flags & RV64_ASMFL_ALIAS) && (slice_eq_cstr(d->mnemonic, "beqz") ||
- slice_eq_cstr(d->mnemonic, "bnez"))) {
- p_xreg(sb, f.rs1);
- p_sep(sb);
- p_rel(sb, vaddr, off);
- return;
- }
- p_xreg(sb, f.rs1);
- p_sep(sb);
- p_xreg(sb, f.rs2);
- p_sep(sb);
- p_rel(sb, vaddr, off);
-}
-
-static void print_j(StrBuf* sb, u32 w, u64 vaddr, const Rv64InsnDesc* d) {
- Rv64J f = rv64_j_unpack(w);
- i64 off = rv64_sext((u64)f.imm21, 21);
- if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "j")) {
- p_rel(sb, vaddr, off);
- return;
- }
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_rel(sb, vaddr, off);
-}
-
-static void print_jalr(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- Rv64I f = rv64_i_unpack(w);
- i64 imm = rv64_sext((u64)f.imm12, 12);
- if ((d->flags & RV64_ASMFL_ALIAS) && slice_eq_cstr(d->mnemonic, "jr")) {
- p_xreg(sb, f.rs1);
- return;
- }
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_mem(sb, imm, f.rs1);
-}
-
-static void print_fence(StrBuf* sb, u32 w) {
- u32 pred = (w >> 24) & 0xfu;
- u32 succ = (w >> 20) & 0xfu;
- static const char order_chars[5] = {'w', 'r', 'o', 'i', '\0'};
- /* pred/succ: bit3=i, bit2=o, bit1=r, bit0=w; print iorw left-to-right. */
- char buf[8];
- u32 k = 0;
- if (pred & 8u) buf[k++] = 'i';
- if (pred & 4u) buf[k++] = 'o';
- if (pred & 2u) buf[k++] = 'r';
- if (pred & 1u) buf[k++] = 'w';
- if (!k) buf[k++] = '0';
- buf[k] = '\0';
- strbuf_puts(sb, buf);
- p_sep(sb);
- k = 0;
- if (succ & 8u) buf[k++] = 'i';
- if (succ & 4u) buf[k++] = 'o';
- if (succ & 2u) buf[k++] = 'r';
- if (succ & 1u) buf[k++] = 'w';
- if (!k) buf[k++] = '0';
- buf[k] = '\0';
- strbuf_puts(sb, buf);
- (void)order_chars;
-}
-
-static void print_csr(StrBuf* sb, u32 w) {
- Rv64I f = rv64_i_unpack(w);
- p_xreg(sb, f.rd);
- p_sep(sb);
- strbuf_put_hex_u64(sb, (u64)f.imm12);
- p_sep(sb);
- p_xreg(sb, f.rs1);
-}
-
-static void print_csri(StrBuf* sb, u32 w) {
- Rv64I f = rv64_i_unpack(w);
- p_xreg(sb, f.rd);
- p_sep(sb);
- strbuf_put_hex_u64(sb, (u64)f.imm12);
- p_sep(sb);
- strbuf_put_u64(sb, (u64)f.rs1);
-}
-
-static void print_fp_rm(StrBuf* sb, u32 w) {
- Rv64R f = rv64_r_unpack(w);
- p_freg(sb, f.rd);
- p_sep(sb);
- p_freg(sb, f.rs1);
- p_sep(sb);
- p_freg(sb, f.rs2);
-}
-
-static void print_fp_r(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- Rv64R f = rv64_r_unpack(w);
- if (d->flags & RV64_ASMFL_FP) {
- p_freg(sb, f.rd);
- p_sep(sb);
- p_freg(sb, f.rs1);
- p_sep(sb);
- p_freg(sb, f.rs2);
- } else {
- /* FP compare: rd is GPR. */
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_freg(sb, f.rs1);
- p_sep(sb);
- p_freg(sb, f.rs2);
- }
-}
-
-static void print_fp_cvt(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- Rv64R f = rv64_r_unpack(w);
- /* rd is FP for: fcvt.s.*, fcvt.d.*, fmv.w.x, fmv.d.x, fsqrt.{s,d}.
- * GPR for: fcvt.w.*, fcvt.l.*, fmv.x.w, fmv.x.d. */
- if (d->flags & RV64_ASMFL_FP)
- p_freg(sb, f.rd);
- else
- p_xreg(sb, f.rd);
- p_sep(sb);
- /* rs1: FP if mnemonic is fcvt.X.{S,D} or fsqrt or fmv.x.{w,d};
- * GPR if mnemonic is fcvt.{S,D}.{w,wu,l,lu} or fmv.{w,d}.x. */
- int rs1_is_fp = 1;
- if (slice_eq_cstr(d->mnemonic, "fmv.w.x") ||
- slice_eq_cstr(d->mnemonic, "fmv.d.x") ||
- slice_has_prefix_cstr(d->mnemonic, "fcvt.s.", 7) ||
- slice_has_prefix_cstr(d->mnemonic, "fcvt.d.", 7)) {
- /* These have rs1 as integer GPR (source is integer). Exception:
- * fcvt.s.d / fcvt.d.s have rs1 as FP. */
- if (slice_eq_cstr(d->mnemonic, "fcvt.s.d") ||
- slice_eq_cstr(d->mnemonic, "fcvt.d.s"))
- rs1_is_fp = 1;
- else
- rs1_is_fp = 0;
- }
- if (rs1_is_fp)
- p_freg(sb, f.rs1);
- else
- p_xreg(sb, f.rs1);
- /* Explicit rounding mode for the rounding conversions (fcvt / fsqrt) when it
- * isn't the default `dyn` — fmv and fclass carry no rounding mode. Matches
- * the objdump/clang convention (an omitted suffix means dyn), so a third-
- * party assembler re-encodes our fp->int truncation (rtz) exactly rather
- * than substituting its own default. */
- if (slice_has_prefix_cstr(d->mnemonic, "fcvt.", 5) ||
- slice_has_prefix_cstr(d->mnemonic, "fsqrt.", 6)) {
- u32 rm = (w >> 12) & 7u;
- static const char* const RMN[8] = {"rne", "rtz", "rdn", "rup",
- "rmm", 0, 0, "dyn"};
- if (rm != 7u && RMN[rm]) {
- p_sep(sb);
- strbuf_puts(sb, RMN[rm]);
- }
- }
-}
-
-static void print_amo(StrBuf* sb, u32 w) {
- Rv64R f = rv64_r_unpack(w);
- p_xreg(sb, f.rd);
- p_sep(sb);
- p_xreg(sb, f.rs2);
- p_sep(sb);
- strbuf_putc(sb, '(');
- p_xreg(sb, f.rs1);
- strbuf_putc(sb, ')');
-}
-
-static void print_lr(StrBuf* sb, u32 w) {
- Rv64R f = rv64_r_unpack(w);
- p_xreg(sb, f.rd);
- p_sep(sb);
- strbuf_putc(sb, '(');
- p_xreg(sb, f.rs1);
- strbuf_putc(sb, ')');
-}
-
-/* ---- compressed printers ---- */
-
-static void print_cr(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- u32 hw = w & 0xffffu;
- u32 rd_rs1 = (hw >> 7) & 0x1fu;
- u32 rs2 = (hw >> 2) & 0x1fu;
- if (slice_eq_cstr(d->mnemonic, "c.jr") ||
- slice_eq_cstr(d->mnemonic, "c.jalr")) {
- p_xreg(sb, rd_rs1);
- } else {
- /* c.mv / c.add */
- p_xreg(sb, rd_rs1);
- p_sep(sb);
- p_xreg(sb, rs2);
- }
-}
-
-static void print_ci(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- u32 hw = w & 0xffffu;
- u32 rd_rs1 = (hw >> 7) & 0x1fu;
- /* immediate is split across bits 12 and 6:2 (signed 6-bit for most). */
- u32 imm5 = (hw >> 12) & 1u;
- u32 imm4_0 = (hw >> 2) & 0x1fu;
- i64 imm;
- if (slice_eq_cstr(d->mnemonic, "c.lui")) {
- /* nzimm[17:12] = bits 12, 6:2 — signed extended to 18 bits. */
- u64 raw = (u64)((imm5 << 5) | imm4_0);
- imm = rv64_sext(raw, 6) << 12;
- p_xreg(sb, rd_rs1);
- p_sep(sb);
- strbuf_put_hex_u64(sb, (u64)imm);
- return;
- }
- if (slice_eq_cstr(d->mnemonic, "c.addi16sp")) {
- /* nzimm[9|4|6|8:7|5] (scrambled). Just decode for print. */
- u32 b9 = (hw >> 12) & 1u;
- u32 b4 = (hw >> 6) & 1u;
- u32 b6 = (hw >> 5) & 1u;
- u32 b87 = (hw >> 3) & 3u;
- u32 b5 = (hw >> 2) & 1u;
- u64 raw = ((u64)b9 << 9) | ((u64)b87 << 7) | ((u64)b6 << 6) |
- ((u64)b5 << 5) | ((u64)b4 << 4);
- imm = rv64_sext(raw, 10);
- p_xreg(sb, rd_rs1);
- p_sep(sb);
- strbuf_put_i64(sb, imm);
- return;
- }
- if (slice_eq_cstr(d->mnemonic, "c.lwsp")) {
- /* offset[5|4:2|7:6] scaled by 4. */
- u32 b5 = imm5;
- u32 b4_2 = (imm4_0 >> 2) & 7u;
- u32 b7_6 = imm4_0 & 3u;
- u32 off = (b7_6 << 6) | (b5 << 5) | (b4_2 << 2);
- p_xreg(sb, rd_rs1);
- p_sep(sb);
- p_mem(sb, (i64)off, 2u);
- return;
- }
- if (slice_eq_cstr(d->mnemonic, "c.ldsp") ||
- slice_eq_cstr(d->mnemonic, "c.fldsp")) {
- /* offset[5|4:3|8:6] scaled by 8. */
- u32 b5 = imm5;
- u32 b4_3 = (imm4_0 >> 3) & 3u;
- u32 b8_6 = imm4_0 & 7u;
- u32 off = (b8_6 << 6) | (b5 << 5) | (b4_3 << 3);
- if (d->flags & RV64_ASMFL_FP)
- p_freg(sb, rd_rs1);
- else
- p_xreg(sb, rd_rs1);
- p_sep(sb);
- p_mem(sb, (i64)off, 2u);
- return;
- }
- if (slice_eq_cstr(d->mnemonic, "c.slli")) {
- u32 shamt = (imm5 << 5) | imm4_0;
- p_xreg(sb, rd_rs1);
- p_sep(sb);
- strbuf_put_u64(sb, (u64)shamt);
- return;
- }
- /* c.li / c.addi — signed 6-bit immediate. */
- imm = rv64_sext((u64)((imm5 << 5) | imm4_0), 6);
- p_xreg(sb, rd_rs1);
- p_sep(sb);
- strbuf_put_i64(sb, imm);
-}
-
-static void print_css(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- u32 hw = w & 0xffffu;
- u32 rs2 = (hw >> 2) & 0x1fu;
- u32 imm6 = (hw >> 7) & 0x3fu;
- u32 off;
- if (slice_eq_cstr(d->mnemonic, "c.swsp")) {
- /* offset[5:2|7:6] scaled by 4. */
- u32 b5_2 = (imm6 >> 2) & 0xfu;
- u32 b7_6 = imm6 & 3u;
- off = (b7_6 << 6) | (b5_2 << 2);
- p_xreg(sb, rs2);
- p_sep(sb);
- p_mem(sb, (i64)off, 2u);
- return;
- }
- /* c.sdsp / c.fsdsp — offset[5:3|8:6] scaled by 8. */
- {
- u32 b5_3 = (imm6 >> 3) & 7u;
- u32 b8_6 = imm6 & 7u;
- off = (b8_6 << 6) | (b5_3 << 3);
- if (d->flags & RV64_ASMFL_FP)
- p_freg(sb, rs2);
- else
- p_xreg(sb, rs2);
- p_sep(sb);
- p_mem(sb, (i64)off, 2u);
- }
-}
-
-static void print_ciw(StrBuf* sb, u32 w) {
- u32 hw = w & 0xffffu;
- u32 rd3 = (hw >> 2) & 7u;
- /* nzuimm[5:4|9:6|2|3] scaled by 4 — encoded into bits 12:5. */
- u32 imm = (hw >> 5) & 0xffu;
- u32 b5_4 = (imm >> 6) & 3u;
- u32 b9_6 = (imm >> 2) & 0xfu;
- u32 b2 = (imm >> 1) & 1u;
- u32 b3 = imm & 1u;
- u32 off = (b9_6 << 6) | (b5_4 << 4) | (b3 << 3) | (b2 << 2);
- p_xreg(sb, RVC_REG3(rd3));
- p_sep(sb);
- strbuf_puts(sb, "sp");
- p_sep(sb);
- strbuf_put_u64(sb, (u64)off);
-}
-
-static void print_cl(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- u32 hw = w & 0xffffu;
- u32 rd3 = (hw >> 2) & 7u;
- u32 rs1_3 = (hw >> 7) & 7u;
- u32 b5_3 = (hw >> 10) & 7u;
- u32 lo = (hw >> 5) & 3u;
- u32 off;
- if (slice_eq_cstr(d->mnemonic, "c.lw")) {
- /* offset[5:3|2|6] scaled by 4. */
- u32 b2 = (lo >> 1) & 1u;
- u32 b6 = lo & 1u;
- off = (b6 << 6) | (b5_3 << 3) | (b2 << 2);
- } else {
- /* c.ld: offset[5:3|7:6] scaled by 8. */
- off = (lo << 6) | (b5_3 << 3);
- }
- if (d->flags & RV64_ASMFL_FP)
- p_freg(sb, RVC_REG3(rd3));
- else
- p_xreg(sb, RVC_REG3(rd3));
- p_sep(sb);
- p_mem(sb, (i64)off, RVC_REG3(rs1_3));
-}
-
-static void print_cs(StrBuf* sb, u32 w, const Rv64InsnDesc* d) {
- u32 hw = w & 0xffffu;
- u32 rs2_3 = (hw >> 2) & 7u;
- u32 rs1_3 = (hw >> 7) & 7u;
- u32 b5_3 = (hw >> 10) & 7u;
- u32 lo = (hw >> 5) & 3u;
- u32 off;
- if (slice_eq_cstr(d->mnemonic, "c.sw")) {
- u32 b2 = (lo >> 1) & 1u;
- u32 b6 = lo & 1u;
- off = (b6 << 6) | (b5_3 << 3) | (b2 << 2);
- } else {
- off = (lo << 6) | (b5_3 << 3);
- }
- if (d->flags & RV64_ASMFL_FP)
- p_freg(sb, RVC_REG3(rs2_3));
- else
- p_xreg(sb, RVC_REG3(rs2_3));
- p_sep(sb);
- p_mem(sb, (i64)off, RVC_REG3(rs1_3));
-}
-
-static void print_ca(StrBuf* sb, u32 w) {
- u32 hw = w & 0xffffu;
- u32 rd3 = (hw >> 7) & 7u;
- u32 rs2_3 = (hw >> 2) & 7u;
- p_xreg(sb, RVC_REG3(rd3));
- p_sep(sb);
- p_xreg(sb, RVC_REG3(rs2_3));
-}
-
-static void print_cb(StrBuf* sb, u32 w, u64 vaddr, const Rv64InsnDesc* d) {
- u32 hw = w & 0xffffu;
- u32 rs1_3 = (hw >> 7) & 7u;
- if (slice_eq_cstr(d->mnemonic, "c.srli") ||
- slice_eq_cstr(d->mnemonic, "c.srai") ||
- slice_eq_cstr(d->mnemonic, "c.andi")) {
- u32 imm = (((hw >> 12) & 1u) << 5) | ((hw >> 2) & 0x1fu);
- p_xreg(sb, RVC_REG3(rs1_3));
- p_sep(sb);
- if (slice_eq_cstr(d->mnemonic, "c.andi"))
- strbuf_put_i64(sb, rv64_sext((u64)imm, 6));
- else
- strbuf_put_u64(sb, (u64)imm);
- return;
- }
- /* offset[8|4:3|7:6|2:1|5] scaled by 2. */
- u32 b8 = (hw >> 12) & 1u;
- u32 b4_3 = (hw >> 10) & 3u;
- u32 b7_6 = (hw >> 5) & 3u;
- u32 b2_1 = (hw >> 3) & 3u;
- u32 b5 = (hw >> 2) & 1u;
- u64 raw = ((u64)b8 << 8) | ((u64)b7_6 << 6) | ((u64)b5 << 5) |
- ((u64)b4_3 << 3) | ((u64)b2_1 << 1);
- i64 off = rv64_sext(raw, 9);
- p_xreg(sb, RVC_REG3(rs1_3));
- p_sep(sb);
- p_rel(sb, vaddr, off);
-}
-
-static void print_cj(StrBuf* sb, u32 w, u64 vaddr) {
- u32 hw = w & 0xffffu;
- /* offset[11|4|9:8|10|6|7|3:1|5] scaled by 2. */
- u32 b11 = (hw >> 12) & 1u;
- u32 b4 = (hw >> 11) & 1u;
- u32 b9_8 = (hw >> 9) & 3u;
- u32 b10 = (hw >> 8) & 1u;
- u32 b6 = (hw >> 7) & 1u;
- u32 b7 = (hw >> 6) & 1u;
- u32 b3_1 = (hw >> 3) & 7u;
- u32 b5 = (hw >> 2) & 1u;
- u64 raw = ((u64)b11 << 11) | ((u64)b10 << 10) | ((u64)b9_8 << 8) |
- ((u64)b7 << 7) | ((u64)b6 << 6) | ((u64)b5 << 5) | ((u64)b4 << 4) |
- ((u64)b3_1 << 1);
- i64 off = rv64_sext(raw, 12);
- p_rel(sb, vaddr, off);
-}
-
-void rv64_print_operands(StrBuf* sb, const Rv64InsnDesc* desc, u32 word,
- u64 vaddr) {
- switch ((Rv64Format)desc->fmt) {
- case RV64_FMT_R:
- print_r(sb, word, desc);
- break;
- case RV64_FMT_R4:
- print_r4(sb, word);
- break;
- case RV64_FMT_I:
- print_i(sb, word, desc);
- break;
- case RV64_FMT_I_SHIFT:
- print_i_shift(sb, word);
- break;
- case RV64_FMT_I_SHIFTW:
- print_i_shiftw(sb, word);
- break;
- case RV64_FMT_S:
- print_store(sb, word, desc);
- break;
- case RV64_FMT_B:
- print_b(sb, word, vaddr, desc);
- break;
- case RV64_FMT_U:
- print_u(sb, word);
- break;
- case RV64_FMT_J:
- print_j(sb, word, vaddr, desc);
- break;
- case RV64_FMT_LOAD:
- print_load(sb, word, desc);
- break;
- case RV64_FMT_STORE:
- print_store(sb, word, desc);
- break;
- case RV64_FMT_JALR:
- print_jalr(sb, word, desc);
- break;
- case RV64_FMT_FENCE:
- print_fence(sb, word);
- break;
- case RV64_FMT_SYSTEM:
- break; /* no operands */
- case RV64_FMT_FP_RM:
- print_fp_rm(sb, word);
- break;
- case RV64_FMT_FP_R:
- print_fp_r(sb, word, desc);
- break;
- case RV64_FMT_FP_CVT:
- print_fp_cvt(sb, word, desc);
- break;
- case RV64_FMT_FP_LOAD:
- print_load(sb, word, desc);
- break;
- case RV64_FMT_FP_STORE:
- print_store(sb, word, desc);
- break;
- case RV64_FMT_AMO:
- print_amo(sb, word);
- break;
- case RV64_FMT_LR:
- print_lr(sb, word);
- break;
- case RV64_FMT_CSR:
- print_csr(sb, word);
- break;
- case RV64_FMT_CSRI:
- print_csri(sb, word);
- break;
- case RV64_FMT_CR:
- print_cr(sb, word, desc);
- break;
- case RV64_FMT_CI:
- print_ci(sb, word, desc);
- break;
- case RV64_FMT_CSS:
- print_css(sb, word, desc);
- break;
- case RV64_FMT_CIW:
- print_ciw(sb, word);
- break;
- case RV64_FMT_CL:
- print_cl(sb, word, desc);
- break;
- case RV64_FMT_CS:
- print_cs(sb, word, desc);
- break;
- case RV64_FMT_CA:
- print_ca(sb, word);
- break;
- case RV64_FMT_CB:
- print_cb(sb, word, vaddr, desc);
- break;
- case RV64_FMT_CJ:
- print_cj(sb, word, vaddr);
- break;
- case RV64_FMT_C_NONE:
- break;
- case RV64_FMT_PSEUDO:
- /* Assembler-only multi-word pseudo; rv64_disasm_find never returns
- * these rows, so the printer is never reached for this format. */
- break;
- }
-}
diff --git a/src/arch/rv64/isa.h b/src/arch/rv64/isa.h
@@ -1,790 +0,0 @@
-/* RV64 instruction encoders + descriptor table — single source of truth
- * for every instruction the encoder, decoder, and disassembler need to
- * agree on. Mirrors the aa64_isa.[ch] pattern.
- *
- * The bottom of this header (after the `rv_*` inline encoders) declares
- * the format-kind enum and per-format pack/unpack helpers. The
- * descriptor table itself lives in isa.c. */
-
-#ifndef KIT_RV64_ISA_H
-#define KIT_RV64_ISA_H
-
-#include "core/core.h"
-#include "core/slice.h"
-#include "core/strbuf.h"
-
-/* ---- Named registers (DWARF / psABI numbering matches HW) ---- */
-enum {
- RV_X0 = 0,
- RV_ZERO = 0,
- RV_X1 = 1,
- RV_RA = 1,
- RV_X2 = 2,
- RV_SP = 2,
- RV_X3 = 3,
- RV_GP = 3,
- RV_X4 = 4,
- RV_TP = 4,
- RV_X5 = 5,
- RV_T0 = 5,
- RV_X6 = 6,
- RV_T1 = 6,
- RV_X7 = 7,
- RV_T2 = 7,
- RV_X8 = 8,
- RV_S0 = 8,
- RV_FP = 8,
- RV_X9 = 9,
- RV_S1 = 9,
- RV_X10 = 10,
- RV_A0 = 10,
- RV_X11 = 11,
- RV_A1 = 11,
- RV_X12 = 12,
- RV_A2 = 12,
- RV_X13 = 13,
- RV_A3 = 13,
- RV_X14 = 14,
- RV_A4 = 14,
- RV_X15 = 15,
- RV_A5 = 15,
- RV_X16 = 16,
- RV_A6 = 16,
- RV_X17 = 17,
- RV_A7 = 17,
- RV_X18 = 18,
- RV_S2 = 18,
- RV_X27 = 27,
- RV_S11 = 27,
- RV_X28 = 28,
- RV_T3 = 28,
- RV_X29 = 29,
- RV_T4 = 29,
- RV_X30 = 30,
- RV_T5 = 30,
- RV_X31 = 31,
- RV_T6 = 31,
-};
-
-#define RV_NOP 0x00000013u /* ADDI x0, x0, 0 */
-
-/* ---- Format helpers ----
- *
- * R-type: funct7(31:25) rs2(24:20) rs1(19:15) funct3(14:12) rd(11:7) op(6:0)
- * I-type: imm(31:20) rs1(19:15) funct3(14:12) rd(11:7) op(6:0)
- * S-type: imm[11:5](31:25) rs2(24:20) rs1(19:15) funct3(14:12) imm[4:0](11:7)
- * op(6:0) B-type: imm[12](31) imm[10:5](30:25) rs2(24:20) rs1(19:15)
- * funct3(14:12) imm[4:1](11:8) imm[11](7) op(6:0) U-type: imm[31:12](31:12)
- * rd(11:7) op(6:0) J-type: imm[20](31) imm[10:1](30:21) imm[11](20)
- * imm[19:12](19:12) rd(11:7) op(6:0)
- */
-
-static inline u32 rv_r(u32 funct7, u32 rs2, u32 rs1, u32 funct3, u32 rd,
- u32 op) {
- return ((funct7 & 0x7fu) << 25) | ((rs2 & 0x1fu) << 20) |
- ((rs1 & 0x1fu) << 15) | ((funct3 & 0x7u) << 12) | ((rd & 0x1fu) << 7) |
- (op & 0x7fu);
-}
-static inline u32 rv_i(i32 imm12, u32 rs1, u32 funct3, u32 rd, u32 op) {
- return (((u32)imm12 & 0xfffu) << 20) | ((rs1 & 0x1fu) << 15) |
- ((funct3 & 0x7u) << 12) | ((rd & 0x1fu) << 7) | (op & 0x7fu);
-}
-static inline u32 rv_s(i32 imm12, u32 rs2, u32 rs1, u32 funct3, u32 op) {
- u32 ui = (u32)imm12 & 0xfffu;
- return ((ui >> 5) << 25) | ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
- ((funct3 & 0x7u) << 12) | ((ui & 0x1fu) << 7) | (op & 0x7fu);
-}
-static inline u32 rv_b(i32 imm13, u32 rs2, u32 rs1, u32 funct3, u32 op) {
- u32 ui = (u32)imm13;
- return (((ui >> 12) & 1u) << 31) | (((ui >> 5) & 0x3fu) << 25) |
- ((rs2 & 0x1fu) << 20) | ((rs1 & 0x1fu) << 15) |
- ((funct3 & 0x7u) << 12) | (((ui >> 1) & 0xfu) << 8) |
- (((ui >> 11) & 1u) << 7) | (op & 0x7fu);
-}
-static inline u32 rv_u(u32 imm32_hi20, u32 rd, u32 op) {
- return (imm32_hi20 & 0xfffff000u) | ((rd & 0x1fu) << 7) | (op & 0x7fu);
-}
-static inline u32 rv_j(i32 imm21, u32 rd, u32 op) {
- u32 ui = (u32)imm21;
- return (((ui >> 20) & 1u) << 31) | (((ui >> 1) & 0x3ffu) << 21) |
- (((ui >> 11) & 1u) << 20) | (((ui >> 12) & 0xffu) << 12) |
- ((rd & 0x1fu) << 7) | (op & 0x7fu);
-}
-
-/* ---- Integer ops (RV32I/RV64I) ---- */
-
-#define RV_OP 0x33u
-#define RV_OP_IMM 0x13u
-#define RV_OP_32 0x3bu
-#define RV_OP_IMM_32 0x1bu
-#define RV_LUI 0x37u
-#define RV_AUIPC 0x17u
-#define RV_LOAD 0x03u
-#define RV_STORE 0x23u
-#define RV_BRANCH 0x63u
-#define RV_JAL 0x6fu
-#define RV_JALR 0x67u
-#define RV_LOAD_FP 0x07u
-#define RV_STORE_FP 0x27u
-#define RV_OP_FP 0x53u
-#define RV_MADD 0x43u
-#define RV_MSUB 0x47u
-#define RV_NMSUB 0x4bu
-#define RV_NMADD 0x4fu
-#define RV_AMO 0x2fu
-#define RV_FENCE 0x0fu
-#define RV_SYSTEM 0x73u
-
-static inline u32 rv_add(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x0, rd, RV_OP);
-}
-static inline u32 rv_sub(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x20, rs2, rs1, 0x0, rd, RV_OP);
-}
-static inline u32 rv_sll(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x1, rd, RV_OP);
-}
-static inline u32 rv_slt(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x2, rd, RV_OP);
-}
-static inline u32 rv_sltu(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x3, rd, RV_OP);
-}
-static inline u32 rv_xor(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x4, rd, RV_OP);
-}
-static inline u32 rv_srl(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x5, rd, RV_OP);
-}
-static inline u32 rv_sra(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x20, rs2, rs1, 0x5, rd, RV_OP);
-}
-static inline u32 rv_or(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x6, rd, RV_OP);
-}
-static inline u32 rv_and(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x7, rd, RV_OP);
-}
-
-static inline u32 rv_addw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x0, rd, RV_OP_32);
-}
-static inline u32 rv_subw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x20, rs2, rs1, 0x0, rd, RV_OP_32);
-}
-static inline u32 rv_sllw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x1, rd, RV_OP_32);
-}
-static inline u32 rv_srlw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x00, rs2, rs1, 0x5, rd, RV_OP_32);
-}
-static inline u32 rv_sraw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x20, rs2, rs1, 0x5, rd, RV_OP_32);
-}
-
-static inline u32 rv_addi(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x0, rd, RV_OP_IMM);
-}
-static inline u32 rv_slti(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x2, rd, RV_OP_IMM);
-}
-static inline u32 rv_sltiu(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x3, rd, RV_OP_IMM);
-}
-static inline u32 rv_xori(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x4, rd, RV_OP_IMM);
-}
-static inline u32 rv_ori(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x6, rd, RV_OP_IMM);
-}
-static inline u32 rv_andi(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x7, rd, RV_OP_IMM);
-}
-
-/* Shift-immediate forms. RV64I uses a 6-bit shamt in bits 25:20 and a
- * 6-bit funct6 in bits 31:26 (so the funct7-vs-shamt[5] split that
- * rv_r() does is wrong here — we hand-assemble these). */
-static inline u32 rv_slli(u32 rd, u32 rs1, u32 sh) {
- return (0x00u << 26) | ((sh & 0x3fu) << 20) | ((rs1 & 0x1fu) << 15) |
- (0x1u << 12) | ((rd & 0x1fu) << 7) | RV_OP_IMM;
-}
-static inline u32 rv_srli(u32 rd, u32 rs1, u32 sh) {
- return (0x00u << 26) | ((sh & 0x3fu) << 20) | ((rs1 & 0x1fu) << 15) |
- (0x5u << 12) | ((rd & 0x1fu) << 7) | RV_OP_IMM;
-}
-static inline u32 rv_srai(u32 rd, u32 rs1, u32 sh) {
- return (0x10u << 26) | ((sh & 0x3fu) << 20) | ((rs1 & 0x1fu) << 15) |
- (0x5u << 12) | ((rd & 0x1fu) << 7) | RV_OP_IMM;
-}
-
-static inline u32 rv_addiw(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x0, rd, RV_OP_IMM_32);
-}
-static inline u32 rv_slliw(u32 rd, u32 rs1, u32 sh) {
- return rv_r(0x00, sh & 0x1fu, rs1, 0x1, rd, RV_OP_IMM_32);
-}
-static inline u32 rv_srliw(u32 rd, u32 rs1, u32 sh) {
- return rv_r(0x00, sh & 0x1fu, rs1, 0x5, rd, RV_OP_IMM_32);
-}
-static inline u32 rv_sraiw(u32 rd, u32 rs1, u32 sh) {
- return rv_r(0x20, sh & 0x1fu, rs1, 0x5, rd, RV_OP_IMM_32);
-}
-
-static inline u32 rv_lui(u32 rd, u32 imm20) {
- return ((imm20 & 0xfffffu) << 12) | ((rd & 0x1fu) << 7) | RV_LUI;
-}
-static inline u32 rv_auipc(u32 rd, u32 imm20) {
- return ((imm20 & 0xfffffu) << 12) | ((rd & 0x1fu) << 7) | RV_AUIPC;
-}
-
-/* M extension */
-static inline u32 rv_mul(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x0, rd, RV_OP);
-}
-static inline u32 rv_mulh(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x1, rd, RV_OP);
-}
-static inline u32 rv_mulhsu(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x2, rd, RV_OP);
-}
-static inline u32 rv_mulhu(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x3, rd, RV_OP);
-}
-static inline u32 rv_div(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x4, rd, RV_OP);
-}
-static inline u32 rv_divu(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x5, rd, RV_OP);
-}
-static inline u32 rv_rem(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x6, rd, RV_OP);
-}
-static inline u32 rv_remu(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x7, rd, RV_OP);
-}
-static inline u32 rv_mulw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x0, rd, RV_OP_32);
-}
-static inline u32 rv_divw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x4, rd, RV_OP_32);
-}
-static inline u32 rv_divuw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x5, rd, RV_OP_32);
-}
-static inline u32 rv_remw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x6, rd, RV_OP_32);
-}
-static inline u32 rv_remuw(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x01, rs2, rs1, 0x7, rd, RV_OP_32);
-}
-
-/* Zba (address-generation) subset — assumed available on rv64 targets.
- * SH{1,2,3}ADD rd, rs1, rs2 computes rd = (rs1 << {1,2,3}) + rs2 in one
- * instruction (funct7=0x10, opcode=OP). Used by load/store to fold an
- * indexed effective address `base + (index << log2_scale)` into a single
- * scratch register without an explicit shift+add pair. */
-static inline u32 rv_sh1add(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x10, rs2, rs1, 0x2, rd, RV_OP);
-}
-static inline u32 rv_sh2add(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x10, rs2, rs1, 0x4, rd, RV_OP);
-}
-static inline u32 rv_sh3add(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x10, rs2, rs1, 0x6, rd, RV_OP);
-}
-
-/* Loads (funct3: 0=LB,1=LH,2=LW,3=LD,4=LBU,5=LHU,6=LWU) */
-static inline u32 rv_lb(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x0, rd, RV_LOAD);
-}
-static inline u32 rv_lh(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x1, rd, RV_LOAD);
-}
-static inline u32 rv_lw(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x2, rd, RV_LOAD);
-}
-static inline u32 rv_ld(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x3, rd, RV_LOAD);
-}
-static inline u32 rv_lbu(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x4, rd, RV_LOAD);
-}
-static inline u32 rv_lhu(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x5, rd, RV_LOAD);
-}
-static inline u32 rv_lwu(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x6, rd, RV_LOAD);
-}
-
-/* Stores (funct3: 0=SB,1=SH,2=SW,3=SD) */
-static inline u32 rv_sb(u32 rs2, u32 rs1, i32 imm) {
- return rv_s(imm, rs2, rs1, 0x0, RV_STORE);
-}
-static inline u32 rv_sh(u32 rs2, u32 rs1, i32 imm) {
- return rv_s(imm, rs2, rs1, 0x1, RV_STORE);
-}
-static inline u32 rv_sw(u32 rs2, u32 rs1, i32 imm) {
- return rv_s(imm, rs2, rs1, 0x2, RV_STORE);
-}
-static inline u32 rv_sd(u32 rs2, u32 rs1, i32 imm) {
- return rv_s(imm, rs2, rs1, 0x3, RV_STORE);
-}
-
-/* Branches */
-static inline u32 rv_beq(u32 rs1, u32 rs2, i32 imm) {
- return rv_b(imm, rs2, rs1, 0x0, RV_BRANCH);
-}
-static inline u32 rv_bne(u32 rs1, u32 rs2, i32 imm) {
- return rv_b(imm, rs2, rs1, 0x1, RV_BRANCH);
-}
-static inline u32 rv_blt(u32 rs1, u32 rs2, i32 imm) {
- return rv_b(imm, rs2, rs1, 0x4, RV_BRANCH);
-}
-static inline u32 rv_bge(u32 rs1, u32 rs2, i32 imm) {
- return rv_b(imm, rs2, rs1, 0x5, RV_BRANCH);
-}
-static inline u32 rv_bltu(u32 rs1, u32 rs2, i32 imm) {
- return rv_b(imm, rs2, rs1, 0x6, RV_BRANCH);
-}
-static inline u32 rv_bgeu(u32 rs1, u32 rs2, i32 imm) {
- return rv_b(imm, rs2, rs1, 0x7, RV_BRANCH);
-}
-
-/* Jumps */
-static inline u32 rv_jal(u32 rd, i32 imm21) { return rv_j(imm21, rd, RV_JAL); }
-static inline u32 rv_jalr(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x0, rd, RV_JALR);
-}
-
-/* Convenience: jr / ret / j / nop */
-static inline u32 rv_jr(u32 rs1) { return rv_jalr(RV_ZERO, rs1, 0); }
-static inline u32 rv_ret_(void) { return rv_jalr(RV_ZERO, RV_RA, 0); }
-static inline u32 rv_nop(void) { return RV_NOP; }
-
-/* System */
-static inline u32 rv_ecall(void) { return rv_i(0, 0, 0, 0, RV_SYSTEM); }
-static inline u32 rv_ebreak(void) { return rv_i(1, 0, 0, 0, RV_SYSTEM); }
-/* WFI: wait-for-interrupt, SYSTEM funct12=0x105 (privileged). */
-static inline u32 rv_wfi(void) { return 0x10500073u; }
-
-/* FENCE: pred/succ each 4 bits in imm[11:8]/imm[7:4]. fm bits 11:8 of imm */
-static inline u32 rv_fence_rw_rw(void) {
- return rv_i((i32)0x033, 0, 0, 0, RV_FENCE);
-}
-/* FENCE.I: instruction-stream sync (Zifencei). funct3=1 in the MISC-MEM major
- * opcode (0x0F). Used to lower the ISB intrinsic. */
-static inline u32 rv_fence_i(void) { return 0x0000100Fu; }
-/* PAUSE (Zihintpause): a FENCE with pred=W, succ=none. Used for cpu_yield;
- * decodes as a plain FENCE on hardware lacking the extension, which is a safe
- * (stronger) no-op hint. */
-static inline u32 rv_pause(void) { return 0x0100000Fu; }
-
-/* ---- FP (F + D extensions) ----
- * funct7 layout: bits[6:2] op-major (e.g. 0x00 FADD, 0x01 FSUB, ...);
- * bits[1:0] = fmt (00=S, 01=D). rm (rounding mode) in funct3; 0x7 = DYN. */
-
-#define RV_FMT_S 0u
-#define RV_FMT_D 1u
-
-static inline u32 rv_fadd(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
- return rv_r((0x00u << 2) | fmt, rs2, rs1, 0x7, rd, RV_OP_FP);
-}
-static inline u32 rv_fsub(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
- return rv_r((0x01u << 2) | fmt, rs2, rs1, 0x7, rd, RV_OP_FP);
-}
-static inline u32 rv_fmul(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
- return rv_r((0x02u << 2) | fmt, rs2, rs1, 0x7, rd, RV_OP_FP);
-}
-static inline u32 rv_fdiv(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
- return rv_r((0x03u << 2) | fmt, rs2, rs1, 0x7, rd, RV_OP_FP);
-}
-/* FSGNJ.fmt rd, rs1, rs2 — used to implement FMV.fmt rd, rs (sgnj rs, rs). */
-static inline u32 rv_fsgnj(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
- return rv_r((0x04u << 2) | fmt, rs2, rs1, 0x0, rd, RV_OP_FP);
-}
-static inline u32 rv_fsgnjn(u32 fmt, u32 rd, u32 rs1, u32 rs2) {
- return rv_r((0x04u << 2) | fmt, rs2, rs1, 0x1, rd, RV_OP_FP);
-}
-/* FCVT — integer/FP conversions. funct7 = 0x18..0x1d depending on direction;
- * rs2 encodes the partner type:
- * 0x60(W <- S) 0x61(W <- D)
- * 0x68(S <- W) 0x69(D <- W) etc
- * We assemble explicitly via rv_r to be obvious. */
-static inline u32 rv_fcvt(u32 funct7, u32 rs2_sel, u32 rd, u32 rs1, u32 rm) {
- return rv_r(funct7, rs2_sel, rs1, rm, rd, RV_OP_FP);
-}
-/* FCVT.W.S rd, rs1 (signed i32 from f32, rtz=001) : funct7=0x60 rs2=0 */
-static inline u32 rv_fcvt_w_s(u32 rd, u32 rs1) {
- return rv_fcvt(0x60, 0x0, rd, rs1, 0x1);
-}
-static inline u32 rv_fcvt_wu_s(u32 rd, u32 rs1) {
- return rv_fcvt(0x60, 0x1, rd, rs1, 0x1);
-}
-static inline u32 rv_fcvt_l_s(u32 rd, u32 rs1) {
- return rv_fcvt(0x60, 0x2, rd, rs1, 0x1);
-}
-static inline u32 rv_fcvt_lu_s(u32 rd, u32 rs1) {
- return rv_fcvt(0x60, 0x3, rd, rs1, 0x1);
-}
-static inline u32 rv_fcvt_w_d(u32 rd, u32 rs1) {
- return rv_fcvt(0x61, 0x0, rd, rs1, 0x1);
-}
-static inline u32 rv_fcvt_wu_d(u32 rd, u32 rs1) {
- return rv_fcvt(0x61, 0x1, rd, rs1, 0x1);
-}
-static inline u32 rv_fcvt_l_d(u32 rd, u32 rs1) {
- return rv_fcvt(0x61, 0x2, rd, rs1, 0x1);
-}
-static inline u32 rv_fcvt_lu_d(u32 rd, u32 rs1) {
- return rv_fcvt(0x61, 0x3, rd, rs1, 0x1);
-}
-static inline u32 rv_fcvt_s_w(u32 rd, u32 rs1) {
- return rv_fcvt(0x68, 0x0, rd, rs1, 0x7);
-}
-static inline u32 rv_fcvt_s_wu(u32 rd, u32 rs1) {
- return rv_fcvt(0x68, 0x1, rd, rs1, 0x7);
-}
-static inline u32 rv_fcvt_s_l(u32 rd, u32 rs1) {
- return rv_fcvt(0x68, 0x2, rd, rs1, 0x7);
-}
-static inline u32 rv_fcvt_s_lu(u32 rd, u32 rs1) {
- return rv_fcvt(0x68, 0x3, rd, rs1, 0x7);
-}
-static inline u32 rv_fcvt_d_w(u32 rd, u32 rs1) {
- return rv_fcvt(0x69, 0x0, rd, rs1, 0x7);
-}
-static inline u32 rv_fcvt_d_wu(u32 rd, u32 rs1) {
- return rv_fcvt(0x69, 0x1, rd, rs1, 0x7);
-}
-static inline u32 rv_fcvt_d_l(u32 rd, u32 rs1) {
- return rv_fcvt(0x69, 0x2, rd, rs1, 0x7);
-}
-static inline u32 rv_fcvt_d_lu(u32 rd, u32 rs1) {
- return rv_fcvt(0x69, 0x3, rd, rs1, 0x7);
-}
-/* FCVT.S.D / FCVT.D.S */
-static inline u32 rv_fcvt_s_d(u32 rd, u32 rs1) {
- return rv_fcvt(0x20, 0x1, rd, rs1, 0x7);
-}
-static inline u32 rv_fcvt_d_s(u32 rd, u32 rs1) {
- return rv_fcvt(0x21, 0x0, rd, rs1, 0x7);
-}
-
-/* FMV.X.W / FMV.W.X / FMV.X.D / FMV.D.X — bitcast between GPR and FPR. */
-static inline u32 rv_fmv_x_w(u32 rd, u32 rs1) {
- return rv_fcvt(0x70, 0x0, rd, rs1, 0x0);
-}
-static inline u32 rv_fmv_w_x(u32 rd, u32 rs1) {
- return rv_fcvt(0x78, 0x0, rd, rs1, 0x0);
-}
-static inline u32 rv_fmv_x_d(u32 rd, u32 rs1) {
- return rv_fcvt(0x71, 0x0, rd, rs1, 0x0);
-}
-static inline u32 rv_fmv_d_x(u32 rd, u32 rs1) {
- return rv_fcvt(0x79, 0x0, rd, rs1, 0x0);
-}
-
-/* FP compares — rd is integer GPR. funct7 = 0x50/0x51 (S/D). rm: 0=LE, 1=LT,
- * 2=EQ. */
-static inline u32 rv_feq_s(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x50, rs2, rs1, 0x2, rd, RV_OP_FP);
-}
-static inline u32 rv_flt_s(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x50, rs2, rs1, 0x1, rd, RV_OP_FP);
-}
-static inline u32 rv_fle_s(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x50, rs2, rs1, 0x0, rd, RV_OP_FP);
-}
-static inline u32 rv_feq_d(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x51, rs2, rs1, 0x2, rd, RV_OP_FP);
-}
-static inline u32 rv_flt_d(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x51, rs2, rs1, 0x1, rd, RV_OP_FP);
-}
-static inline u32 rv_fle_d(u32 rd, u32 rs1, u32 rs2) {
- return rv_r(0x51, rs2, rs1, 0x0, rd, RV_OP_FP);
-}
-
-static inline u32 rv_flw(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x2, rd, RV_LOAD_FP);
-}
-static inline u32 rv_fld(u32 rd, u32 rs1, i32 imm) {
- return rv_i(imm, rs1, 0x3, rd, RV_LOAD_FP);
-}
-static inline u32 rv_fsw(u32 rs2, u32 rs1, i32 imm) {
- return rv_s(imm, rs2, rs1, 0x2, RV_STORE_FP);
-}
-static inline u32 rv_fsd(u32 rs2, u32 rs1, i32 imm) {
- return rv_s(imm, rs2, rs1, 0x3, RV_STORE_FP);
-}
-
-/* ---- A extension (LR/SC + AMO) ----
- * AMO funct7 layout: aq(26) rl(25) funct5(31:27) op-specific.
- * funct3 selects width: 0x2 = W (32-bit), 0x3 = D (64-bit). */
-static inline u32 rv_amo(u32 funct5, u32 aq, u32 rl, u32 rd, u32 rs1, u32 rs2,
- u32 funct3) {
- u32 funct7 = (funct5 << 2) | ((aq & 1u) << 1) | (rl & 1u);
- return rv_r(funct7, rs2, rs1, funct3, rd, RV_AMO);
-}
-static inline u32 rv_lr_w(u32 rd, u32 rs1, u32 aq, u32 rl) {
- return rv_amo(0x02, aq, rl, rd, rs1, 0, 0x2);
-}
-static inline u32 rv_lr_d(u32 rd, u32 rs1, u32 aq, u32 rl) {
- return rv_amo(0x02, aq, rl, rd, rs1, 0, 0x3);
-}
-static inline u32 rv_sc_w(u32 rd, u32 rs1, u32 rs2, u32 aq, u32 rl) {
- return rv_amo(0x03, aq, rl, rd, rs1, rs2, 0x2);
-}
-static inline u32 rv_sc_d(u32 rd, u32 rs1, u32 rs2, u32 aq, u32 rl) {
- return rv_amo(0x03, aq, rl, rd, rs1, rs2, 0x3);
-}
-
-/* Other A-extension AMO funct5 codes (W and D widths via funct3). */
-#define RV_AMO_SWAP 0x01u
-#define RV_AMO_ADD 0x00u
-#define RV_AMO_XOR 0x04u
-#define RV_AMO_AND 0x0Cu
-#define RV_AMO_OR 0x08u
-#define RV_AMO_MIN 0x10u
-#define RV_AMO_MAX 0x14u
-#define RV_AMO_MINU 0x18u
-#define RV_AMO_MAXU 0x1Cu
-
-/* Zicsr — CSR instructions. csr in imm[11:0]; funct3 selects op.
- * csrrw=1, csrrs=2, csrrc=3, csrrwi=5, csrrsi=6, csrrci=7 */
-static inline u32 rv_csrrw(u32 rd, u32 csr, u32 rs1) {
- return rv_i((i32)(csr & 0xfffu), rs1, 0x1, rd, RV_SYSTEM);
-}
-static inline u32 rv_csrrs(u32 rd, u32 csr, u32 rs1) {
- return rv_i((i32)(csr & 0xfffu), rs1, 0x2, rd, RV_SYSTEM);
-}
-static inline u32 rv_csrrc(u32 rd, u32 csr, u32 rs1) {
- return rv_i((i32)(csr & 0xfffu), rs1, 0x3, rd, RV_SYSTEM);
-}
-static inline u32 rv_csrrwi(u32 rd, u32 csr, u32 uimm) {
- return rv_i((i32)(csr & 0xfffu), uimm & 0x1fu, 0x5, rd, RV_SYSTEM);
-}
-static inline u32 rv_csrrsi(u32 rd, u32 csr, u32 uimm) {
- return rv_i((i32)(csr & 0xfffu), uimm & 0x1fu, 0x6, rd, RV_SYSTEM);
-}
-static inline u32 rv_csrrci(u32 rd, u32 csr, u32 uimm) {
- return rv_i((i32)(csr & 0xfffu), uimm & 0x1fu, 0x7, rd, RV_SYSTEM);
-}
-
-/* ===================================================================
- * Format kinds — one per encoding family the descriptor table dispatches
- * on. R-type splits by funct3/funct7 selectors; I/S/B/U/J each carry a
- * distinct immediate layout. The C-extension formats (CR/CI/CSS/CIW/CL/
- * CS/CB/CJ) are 16-bit; the disassembler picks 16 vs 32 by checking the
- * bottom two bits of the first halfword (00/01/10 → compressed, 11 → 32).
- * =================================================================== */
-typedef enum Rv64Format {
- RV64_FMT_R, /* funct7 rs2 rs1 funct3 rd op — most ALU ops */
- RV64_FMT_R4, /* fused FMA: rs3 funct2 rs2 rs1 funct3 rd op */
- RV64_FMT_I, /* imm[11:0] rs1 funct3 rd op — ALU-imm, loads, jalr */
- RV64_FMT_I_SHIFT, /* shift-imm (shamt6/funct6) — RV64 SLLI/SRLI/SRAI */
- RV64_FMT_I_SHIFTW, /* RV32 word-shift (shamt5/funct7) — SLLIW/SRLIW/SRAIW */
- RV64_FMT_S, /* store */
- RV64_FMT_B, /* branch */
- RV64_FMT_U, /* LUI/AUIPC */
- RV64_FMT_J, /* JAL */
- RV64_FMT_LOAD, /* I-type load: rd, imm(rs1) — printer uses memory syntax */
- RV64_FMT_STORE, /* S-type store: rs2, imm(rs1) */
- RV64_FMT_JALR, /* JALR: rd, imm(rs1) — memory-style operand syntax */
- RV64_FMT_FENCE, /* FENCE pred,succ */
- RV64_FMT_SYSTEM, /* ECALL/EBREAK — no operands */
- RV64_FMT_FP_RM, /* FP arithmetic with rm: funct7 rs2 rs1 rm rd op */
- RV64_FMT_FP_R, /* FP R-type without rm-as-mnemonic-suffix (cmp/sgnj) */
- RV64_FMT_FP_CVT, /* FP conversion: rs2 is type selector, rs1 is src */
- RV64_FMT_FP_LOAD, /* fld/flw — rd[FP], imm(rs1) */
- RV64_FMT_FP_STORE, /* fsd/fsw — rs2[FP], imm(rs1) */
- RV64_FMT_AMO, /* atomic: rd, rs2, (rs1) */
- RV64_FMT_LR, /* LR.W/D: rd, (rs1) — no rs2 */
- RV64_FMT_CSR, /* csrr*: rd, csr, rs1 */
- RV64_FMT_CSRI, /* csrr*i: rd, csr, uimm5 */
- /* ---- Compressed (16-bit) formats ---- */
- RV64_FMT_CR, /* funct4 rd/rs1 rs2 op (e.g. C.MV, C.ADD, C.JR, C.JALR) */
- RV64_FMT_CI, /* funct3 imm rd/rs1 imm op (e.g. C.ADDI, C.LI, C.LUI) */
- RV64_FMT_CSS, /* funct3 imm rs2 op (stack store: C.SDSP, C.SWSP) */
- RV64_FMT_CIW, /* funct3 imm rd' op (C.ADDI4SPN) */
- RV64_FMT_CL, /* funct3 imm rs1' imm rd' op (C.LD, C.LW) */
- RV64_FMT_CS, /* funct3 imm rs1' imm rs2' op (C.SD, C.SW) */
- RV64_FMT_CA, /* funct6 rd'/rs1' funct2 rs2' op (C.AND, C.OR, ...) */
- RV64_FMT_CB, /* branch: funct3 imm rs1' imm op (C.BEQZ, C.BNEZ) */
- RV64_FMT_CJ, /* jump: funct3 imm op (C.J, C.JAL_unused on RV64) */
- RV64_FMT_C_NONE, /* known opcode with no operands (C.NOP, C.EBREAK) */
- /* Assembler-only multi-word pseudo-instruction (call/tail/la/lla). The
- * descriptor's `match` is unused; the assembler dispatches on mnemonic
- * and emits the AUIPC+JALR / AUIPC+ADDI expansion directly. */
- RV64_FMT_PSEUDO,
-} Rv64Format;
-
-typedef enum Rv64DecodedOpcode {
- RV64_DEC_UNKNOWN = 0,
- RV64_DEC_ADDI,
- RV64_DEC_ADD,
- RV64_DEC_AUIPC,
- RV64_DEC_LD,
- RV64_DEC_SD,
- RV64_DEC_JALR,
- RV64_DEC_ECALL,
- RV64_DEC_EBREAK,
-} Rv64DecodedOpcode;
-
-/* ---- AsmFlags column on Rv64InsnDesc ---- */
-#define RV64_ASMFL_ALIAS 0x01u /* row is an alias (preferred print form) */
-#define RV64_ASMFL_FP 0x02u /* operands take f-register prefix */
-#define RV64_ASMFL_NORM 0x04u /* FP_RM row prints without rm suffix */
-#define RV64_ASMFL_C16 0x08u /* 16-bit compressed instruction */
-/* Assembler-only multi-word pseudo (call/tail/la/lla). These expand to
- * several 32-bit words and never participate in disassembly — the decoder
- * sees the individual auipc/jalr/addi words instead. rv64_disasm_find
- * skips rows carrying this flag. */
-#define RV64_ASMFL_PSEUDO 0x10u
-
-/* ===================================================================
- * Per-format field structs + pack/unpack pure functions.
- * =================================================================== */
-
-typedef struct Rv64R {
- u32 funct7, rs2, rs1, funct3, rd, op;
-} Rv64R;
-typedef struct Rv64I {
- u32 imm12, rs1, funct3, rd, op;
-} Rv64I;
-typedef struct Rv64S {
- u32 imm12, rs2, rs1, funct3, op;
-} Rv64S;
-typedef struct Rv64B {
- u32 imm13, rs2, rs1, funct3, op;
-} Rv64B;
-typedef struct Rv64U {
- u32 imm32_hi20, rd, op;
-} Rv64U;
-typedef struct Rv64J {
- u32 imm21, rd, op;
-} Rv64J;
-
-static inline Rv64R rv64_r_unpack(u32 w) {
- Rv64R f;
- f.funct7 = (w >> 25) & 0x7fu;
- f.rs2 = (w >> 20) & 0x1fu;
- f.rs1 = (w >> 15) & 0x1fu;
- f.funct3 = (w >> 12) & 0x7u;
- f.rd = (w >> 7) & 0x1fu;
- f.op = w & 0x7fu;
- return f;
-}
-static inline Rv64I rv64_i_unpack(u32 w) {
- Rv64I f;
- f.imm12 = (w >> 20) & 0xfffu;
- f.rs1 = (w >> 15) & 0x1fu;
- f.funct3 = (w >> 12) & 0x7u;
- f.rd = (w >> 7) & 0x1fu;
- f.op = w & 0x7fu;
- return f;
-}
-static inline Rv64S rv64_s_unpack(u32 w) {
- Rv64S f;
- f.imm12 = (((w >> 25) & 0x7fu) << 5) | ((w >> 7) & 0x1fu);
- f.rs2 = (w >> 20) & 0x1fu;
- f.rs1 = (w >> 15) & 0x1fu;
- f.funct3 = (w >> 12) & 0x7u;
- f.op = w & 0x7fu;
- return f;
-}
-static inline Rv64B rv64_b_unpack(u32 w) {
- Rv64B f;
- f.imm13 = (((w >> 31) & 1u) << 12) | (((w >> 7) & 1u) << 11) |
- (((w >> 25) & 0x3fu) << 5) | (((w >> 8) & 0xfu) << 1);
- f.rs2 = (w >> 20) & 0x1fu;
- f.rs1 = (w >> 15) & 0x1fu;
- f.funct3 = (w >> 12) & 0x7u;
- f.op = w & 0x7fu;
- return f;
-}
-static inline Rv64U rv64_u_unpack(u32 w) {
- Rv64U f;
- f.imm32_hi20 = w & 0xfffff000u;
- f.rd = (w >> 7) & 0x1fu;
- f.op = w & 0x7fu;
- return f;
-}
-static inline Rv64J rv64_j_unpack(u32 w) {
- Rv64J f;
- f.imm21 = (((w >> 31) & 1u) << 20) | (((w >> 12) & 0xffu) << 12) |
- (((w >> 20) & 1u) << 11) | (((w >> 21) & 0x3ffu) << 1);
- f.rd = (w >> 7) & 0x1fu;
- f.op = w & 0x7fu;
- return f;
-}
-
-/* Sign-extend an n-bit value held in the low bits of v to i64. */
-static inline i64 rv64_sext(u64 v, u32 nbits) {
- u64 mask = (nbits >= 64u) ? ~0ull : ((1ull << nbits) - 1ull);
- v &= mask;
- u64 sign = (nbits == 0u) ? 0ull : (1ull << (nbits - 1u));
- if (v & sign) v |= ~mask;
- return (i64)v;
-}
-
-/* ===================================================================
- * Compressed (RV64C) helpers — 16-bit instructions.
- *
- * Layout (per RVC quadrant): bits[1:0] (op) select the quadrant:
- * 00 → Q0 (stack-relative & load/store narrow),
- * 01 → Q1 (constant/branch),
- * 10 → Q2 (stack pointer access & jumps & MV/ADD).
- * 11 is reserved for 32-bit (uncompressed) instructions, so the
- * disassembler picks 16-bit when (halfword & 3) != 3.
- *
- * The "narrow" register fields rs1' / rs2' / rd' are 3-bit and encode
- * x8..x15; macro RVC_REG3 unfolds: r' → 8 + r'. */
-#define RVC_REG3(r3) ((u32)(8u + ((r3) & 7u)))
-
-typedef struct Rv64C {
- u32 word;
-} Rv64C; /* 16-bit halfword in low 16 bits */
-
-/* ===================================================================
- * Descriptor table.
- * =================================================================== */
-
-typedef struct Rv64InsnDesc {
- Slice mnemonic;
- u32 match;
- u32 mask;
- u8 fmt; /* Rv64Format */
- u8 flags; /* RV64_ASMFL_* */
- u8 pad[2];
-} Rv64InsnDesc;
-
-extern const Rv64InsnDesc rv64_insn_table[];
-extern const u32 rv64_insn_table_n;
-
-/* Linear-scan lookup. Returns the matching descriptor or NULL. First
- * match wins; ordering puts more-specific entries (aliases, fixed-Rd
- * forms) before broader ones. */
-const Rv64InsnDesc* rv64_disasm_find(u32 word);
-
-/* Compressed-instruction (16-bit) variant. Pass the halfword in the low
- * 16 bits of `word`. Returns NULL if no descriptor matches. */
-const Rv64InsnDesc* rv64_disasm_find_c(u32 word);
-
-/* Mnemonic → descriptor for the assembler. Returns NULL if not found.
- * Ignores ALIAS-only rows when those would produce ambiguous parses
- * (the canonical form is always reachable). */
-const Rv64InsnDesc* rv64_asm_find(Slice mnemonic);
-
-/* ===================================================================
- * Operand print / parse dispatch.
- *
- * rv64_print_operands renders the operand text (everything after the
- * mnemonic) for `word` into `sb`, using `desc->fmt` to dispatch.
- * Mnemonic itself is in `desc->mnemonic`; the caller writes it before
- * calling this helper. `vaddr` is the instruction's virtual address for
- * PC-relative formats; pass 0 if not known. */
-void rv64_print_operands(StrBuf* sb, const Rv64InsnDesc* desc, u32 word,
- u64 vaddr);
-
-#endif /* KIT_RV64_ISA_H */
diff --git a/src/arch/rv64/link.c b/src/arch/rv64/link.c
@@ -1,99 +0,0 @@
-/* RV64 link-time arch descriptor. See link_arch.h for the contract.
- *
- * The PLT0/PLT-entry/IPLT-stub byte layouts here mirror what used to
- * live inline in link_dyn.c (PLT) and link_layout.c (IPLT) before the
- * vtable refactor; comments preserve the WHY (notably the +0x800 bias
- * on AUIPC immediates). */
-
-#include "arch/rv64/isa.h"
-#include "core/bytes.h"
-#include "core/core.h"
-#include "link/link_arch.h"
-
-/* PLT0 is 8 canonical NOPs (32 bytes); each PLT entry and IPLT stub is
- * 4 instructions (16 bytes) / 3 instructions (12 bytes) respectively.
- * Encoded once here so the descriptor and emitters stay in sync. */
-#define RV64_PLT0_SIZE 32u
-#define RV64_PLT_ENTRY_SIZE 16u
-#define RV64_IPLT_STUB_SIZE 12u
-
-/* Split a PC-relative displacement into the (hi20, lo12) pair consumed
- * by the AUIPC + I-type sequence. The +0x800 bias is the standard
- * RISC-V two-instruction PCREL trick: AUIPC adds an upper-20 immediate
- * shifted left 12, then the second instruction adds a sign-extended
- * 12-bit lo12. If we naively split disp into (disp>>12, disp&0xfff)
- * the lo12 sign-extends as a *negative* number whenever bit 11 is set,
- * which underflows the AUIPC result by 0x1000. Adding 0x800 before
- * the shift rounds the high half up in exactly the cases that need it
- * so AUIPC + sign-extended-lo12 reconstructs disp correctly. */
-static inline void rv64_split_pcrel(i64 disp, u32* hi20_out, u32* lo12_out) {
- *hi20_out = (u32)(((u64)(disp + 0x800)) >> 12) & 0xfffffu;
- *lo12_out = (u32)((u64)disp & 0xfffu);
-}
-
-/* PLT0 under DF_1_NOW is never executed — the loader resolves every
- * JUMP_SLOT before transferring control — but we still emit it in
- * canonical form (8 NOPs) so disassemblers and unwinders see a well-
- * formed prologue at the top of .plt. */
-static void rv64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
- u32 i;
- (void)plt0_vaddr;
- (void)gotplt_vaddr;
- for (i = 0; i < RV64_PLT0_SIZE; i += 4u) wr_u32_le(dst + i, rv_nop());
-}
-
-/* Per-import PLT entry: load the GOT slot pre-filled by the loader
- * (R_RISCV_JUMP_SLOT) and tail-call through it. t1 is the standard
- * psABI scratch for the trampoline return-address (clobbered by the
- * lazy resolver in the non-BIND_NOW path); t3 holds the slot pointer. */
-static void rv64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
- i64 disp = (i64)slot_vaddr - (i64)entry_vaddr;
- u32 hi20;
- u32 lo12;
- rv64_split_pcrel(disp, &hi20, &lo12);
- wr_u32_le(dst + 0, rv_auipc(RV_T3, hi20));
- wr_u32_le(dst + 4, rv_ld(RV_T3, RV_T3, (i32)lo12));
- wr_u32_le(dst + 8, rv_jalr(RV_T1, RV_T3, 0));
- wr_u32_le(dst + 12, rv_nop());
-}
-
-/* IPLT stub: load .igot.plt[i] (filled at startup by the resolver) and
- * tail-call to it. The stub->slot displacement is invariant under the
- * segment-base shift (both addresses live in the same image), so we
- * bake it directly into the instructions and report zero apply-time
- * relocs — unlike aarch64, which cannot encode a 32-bit pcrel inline. */
-static u32 rv64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
- LinkArchIPltReloc out[2]) {
- i64 disp = (i64)slot_vaddr - (i64)stub_vaddr;
- u32 hi20;
- u32 lo12;
- (void)out;
- rv64_split_pcrel(disp, &hi20, &lo12);
- wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20));
- wr_u32_le(dst + 4, rv_ld(RV_T1, RV_T1, (i32)lo12));
- wr_u32_le(dst + 8, rv_jr(RV_T1));
- return 0u;
-}
-
-/* A direct rv64 call (R_RV_CALL = AUIPC+JALR) reaches only ±2GiB. In the JIT,
- * an external SK_ABS target (a host libc symbol resolved to an arbitrary
- * address) can lie farther than that from the JIT-allocated code region, where
- * link_reloc_apply would panic "RV CALL out of range". Reporting these as
- * branch relocs routes them through the JIT call-stub pass, which reuses
- * emit_iplt_stub (AUIPC+LD+JR) to reach an arbitrary address held in an
- * in-image slot — the same safety net aa64 and x64 already wire. */
-static int rv64_is_branch_reloc(RelocKind kind) {
- return kind == R_RV_CALL || kind == R_PLT32;
-}
-
-const LinkArchDesc link_arch_rv64 = {
- .plt0_size = RV64_PLT0_SIZE,
- .plt_entry_size = RV64_PLT_ENTRY_SIZE,
- .iplt_stub_size = RV64_IPLT_STUB_SIZE,
- .global_pointer_symbol = "__global_pointer$",
- .global_pointer_rw_offset = 0x800u,
- .emit_plt0 = rv64_emit_plt0,
- .emit_plt_entry = rv64_emit_plt_entry,
- .emit_iplt_stub = rv64_emit_iplt_stub,
- .needs_jit_call_stub = rv64_is_branch_reloc,
-};
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -1,3872 +0,0 @@
-/* src/arch/rv64/native.c — RISC-V (RV64GC, LP64D) NativeTarget implementation.
- *
- * Mirrors the aa64 reference (src/arch/aa64/native.c): a physical-emission
- * NativeTarget driven at -O0 by the shared NativeDirectTarget and at -O1+ by
- * the optimizer emit path. ABI decisions go through the abi/ interface; this
- * file owns only ISA emission and the RV64 frame layout.
- *
- * Frame model (single, top-record): s0 (x8) is the frame pointer anchored at
- * the saved s0/ra pair; slots live below s0 at positive byte offsets `off`
- * (address = s0 - off); outgoing args sit at the bottom of the frame (sp+0..).
- * frame_size = align16(16 + cum_off + max_outgoing + va_save_sz)
- * fp_pair_off = frame_size - 16 - va_save_sz (saved pair, sp-relative)
- * CFA = s0 + (frame_size - fp_pair_off)
- * RISC-V has no condition flags: comparisons materialize a 0/1 via SLT/SLTU or
- * FLT/FLE; branches compare two registers directly. x0 is a hardware zero. */
-
-#include <string.h>
-
-#include "abi/abi.h"
-#include "arch/rv64/asm.h"
-#include "arch/rv64/isa.h"
-#include "arch/rv64/regs.h"
-#include "arch/rv64/rv64.h"
-#include "asm/asm.h"
-#include "asm/asm_lex.h"
-#include "cg/native_argmove.h"
-#include "cg/native_asm.h"
-#include "cg/native_direct_target.h"
-#include "cg/native_frame.h"
-#include "cg/type.h"
-#include "core/arena.h"
-#include "core/bytes.h"
-#include "core/pool.h"
-#include "core/slice.h"
-#include "obj/obj.h"
-
-enum {
- RV_TMP0 = 5u, /* t0: emit-internal scratch (reserved, never allocable) */
- RV_TMP1 = 6u, /* t1: emit-internal scratch */
- RV_TMP2 = 7u, /* t2: emit-internal scratch (reserved in phys table) */
- RV_TMP3 = 28u, /* t3: emit-internal scratch (reserved in phys table) */
- RV_FTMP0 = 0u, /* ft0: emit-internal FP scratch */
- RV_FTMP1 = 1u, /* ft1: emit-internal FP scratch */
- RV_FA0 = 10u, /* fa0..fa7 = f10..f17 (FP arg/return registers) */
- RV_FA7 = 17u,
- /* Single-pass (-O0) worst-case prologue: sp adjust (3) + far save pair (7)
- * + sret spill (1) + variadic GP spills (8). No callee-saves at -O0. */
- RV_PROLOGUE_WORDS = 32u,
- /* Known-frame (-O1) prologues are emitted directly, not into the fixed -O0
- * NOP region, and additionally save callee-saved registers (up to 11 int + 12
- * fp, each up to 4 words for a far s0-relative offset) on top of the header,
- * sret, and variadic spills. Size the build buffer for the worst case. */
- RV_KNOWN_PROLOGUE_WORDS = 192u,
- RV_FRAME_SAVE_SIZE = 16u,
-};
-
-/* s1..s11 (11) + fs0..fs11 (12); separate int/fp collect arrays use this cap.
- */
-#define RV_MAX_CALLEE_SAVES 16u
-#define RV_MAX_REG_ARG_MOVES 16u
-
-extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
-extern void debug_func_pc_range(Debug*, ObjSecId text_section, u32 begin_ofs,
- u32 end_ofs);
-
-/* ============================ low-level emit ============================ */
-
-void rv64_emit32(MCEmitter* mc, u32 word) {
- u8 b[4];
- u32 ofs = obj_pos(mc->obj, mc->section_id);
- wr_u32_le(b, word);
- mc->emit_bytes(mc, b, sizeof b);
- if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
-}
-
-void rv64_emit16(MCEmitter* mc, u32 halfword) {
- u8 b[2];
- u32 ofs = obj_pos(mc->obj, mc->section_id);
- b[0] = (u8)(halfword & 0xff);
- b[1] = (u8)((halfword >> 8) & 0xff);
- mc->emit_bytes(mc, b, sizeof b);
- if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
-}
-
-static void rv_patch32(ObjBuilder* obj, ObjSecId sec, u32 off, u32 word) {
- u8 b[4];
- wr_u32_le(b, word);
- obj_patch(obj, sec, off, b, sizeof b);
-}
-
-static int fits_i12(i64 v) { return v >= -2048 && v <= 2047; }
-static int fits_i32(i64 v) {
- return v >= (i64)(i32)0x80000000 && v <= (i64)(i32)0x7fffffff;
-}
-
-static u32 align_up_u32(u32 v, u32 align) {
- u32 mask = align ? align - 1u : 0u;
- return (v + mask) & ~mask;
-}
-
-static i64 floor_div_4096(i64 v) {
- if (v >= 0) return v / 4096;
- return -((-v + 4095) / 4096);
-}
-
-static void rv_emit_li32(MCEmitter* mc, u32 rd, i32 imm) {
- if (imm >= -2048 && imm <= 2047) {
- rv64_emit32(mc, rv_addi(rd, RV_ZERO, imm));
- return;
- }
- {
- i64 hi64 = floor_div_4096((i64)imm + 0x800);
- i32 hi = (i32)hi64;
- i32 lo = (i32)((i64)imm - hi64 * 4096);
- rv64_emit32(mc, rv_lui(rd, (u32)hi & 0xfffffu));
- if (lo) rv64_emit32(mc, rv_addiw(rd, rd, lo));
- }
-}
-
-static i32 sext12(u32 v) {
- v &= 0xfffu;
- return (v & 0x800u) ? (i32)v - 4096 : (i32)v;
-}
-
-static void rv_emit_li64(MCEmitter* mc, u32 rd, u64 imm) {
- if (fits_i32((i64)imm)) {
- rv_emit_li32(mc, rd, (i32)(i64)imm);
- return;
- }
- {
- i32 lo = sext12((u32)imm);
- u64 hi = (imm - (u64)(i64)lo) >> 12;
- rv_emit_li64(mc, rd, hi);
- rv64_emit32(mc, rv_slli(rd, rd, 12));
- if (lo) rv64_emit32(mc, rv_addi(rd, rd, lo));
- }
-}
-
-/* sf!=0 selects a full 64-bit materialization; sf==0 a 32-bit value. */
-static void rv_emit_load_imm(MCEmitter* mc, u32 sf, u32 rd, i64 imm) {
- if (!sf) {
- rv_emit_li32(mc, rd, (i32)imm);
- return;
- }
- if (fits_i32(imm))
- rv_emit_li32(mc, rd, (i32)imm);
- else
- rv_emit_li64(mc, rd, (u64)imm);
-}
-
-/* rd = base + off, materializing the offset when it exceeds imm12. Uses RV_TMP1
- * as scratch for the wide path, so callers must keep RV_TMP1 free. */
-static void rv_emit_addr_adjust(MCEmitter* mc, u32 rd, u32 base, i32 off) {
- if (off == 0) {
- if (rd != base) rv64_emit32(mc, rv_addi(rd, base, 0));
- return;
- }
- if (fits_i12(off)) {
- rv64_emit32(mc, rv_addi(rd, base, off));
- return;
- }
- rv_emit_load_imm(mc, 1, RV_TMP1, (i64)off);
- rv64_emit32(mc, rv_add(rd, base, RV_TMP1));
-}
-
-static u32 enc_int_store(u32 nbytes, u32 src, u32 base, i32 off) {
- switch (nbytes) {
- case 1:
- return rv_sb(src, base, off);
- case 2:
- return rv_sh(src, base, off);
- case 4:
- return rv_sw(src, base, off);
- default:
- return rv_sd(src, base, off);
- }
-}
-static u32 enc_int_load(u32 nbytes, int sign_ext, u32 rd, u32 base, i32 off) {
- switch (nbytes) {
- case 1:
- return sign_ext ? rv_lb(rd, base, off) : rv_lbu(rd, base, off);
- case 2:
- return sign_ext ? rv_lh(rd, base, off) : rv_lhu(rd, base, off);
- case 4:
- return sign_ext ? rv_lw(rd, base, off) : rv_lwu(rd, base, off);
- default:
- return rv_ld(rd, base, off);
- }
-}
-
-/* ============================ target state ============================ */
-
-/* Frame slots and callee-save records live in the shared NativeFrame
- * bookkeeping (cg/native_frame.h); these aliases keep the rv64-local spellings.
- */
-typedef NativeFrameSlotEntry RvNativeSlot;
-typedef NativeFrameCalleeSave RvCalleeSave;
-
-typedef enum RvPatchKind { RV_PATCH_ALLOCA } RvPatchKind;
-
-typedef struct RvPatch {
- u8 kind; /* RvPatchKind */
- u32 pos;
- u32 dst_reg;
-} RvPatch;
-
-typedef struct RvNativeTarget {
- NativeTarget base;
- SrcLoc loc;
- const CGFuncDesc* func;
-
- /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save
- * set, and the known_frame / has_alloca / frame_final flags. */
- NativeFrame frame;
- u32 frame_size_final;
- u32 fp_pair_off;
- u32 minimal_prologue_words; /* known-frame path: exact prologue length, else 0
- */
-
- /* Known-frame (-O1) leaf no-frame tier (aa64's slim_prologue equivalent),
- * settled in rv_func_begin_known_frame; always 0 on the single-pass path. A
- * leaf with no callee-saves, no body slots, no outgoing args, no
- * sret/variadic and register-only params never reads s0 nor clobbers ra, so
- * it emits NO prologue and a bare `ret` — the whole frame setup/teardown is
- * elided. RISC-V has no pre/post-indexed store, so aa64's fp_at_bottom fold
- * would save zero instructions on a kept frame and is intentionally not
- * ported (see doc/plan/ARCH.md §2); this leaf tier is the rv64 win. */
- u8 slim_prologue;
-
- u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */
- u32 next_param_int;
- u32 next_param_fp;
- u32 next_param_stack;
- u8 has_sret;
- u8 is_variadic;
- NativeFrameSlot sret_ptr_slot;
-
- RvPatch* patches;
- u32 npatches;
- u32 patches_cap;
- u32 nalloca;
-
- u32 func_start;
- u32 prologue_pos;
- MCLabel epilogue_label;
-} RvNativeTarget;
-
-static RvNativeTarget* rv_of(NativeTarget* t) { return (RvNativeTarget*)t; }
-
-static _Noreturn void rv_panic(RvNativeTarget* a, const char* msg) {
- compiler_panic(a->base.c, a->loc, "rv64 native target: %s", msg);
-}
-
-static RvNativeSlot* rv_slot_get(RvNativeTarget* a, NativeFrameSlot fs) {
- return native_frame_slot_at(&a->frame, fs);
-}
-
-/* s0-relative byte offset of a frame slot's base (address = s0 + ret). */
-static i32 rv_s0_off_slot(const RvNativeSlot* s) { return -(i32)s->off; }
-
-/* s0-relative byte offset of incoming stack arg at byte_off. Stack args sit
- * just above the saved pair; the 64-byte variadic GP save area (when present)
- * is contiguous with them at [s0+16). */
-static i32 rv_s0_off_in_arg(const RvNativeTarget* a, u32 byte_off) {
- u32 base = a->is_variadic ? 16u + 64u : 16u;
- return (i32)(base + byte_off);
-}
-
-static u32 rv_va_save_sz(const RvNativeTarget* a) {
- /* ABI-derived: the variadic register-save area is gp_reg_count*gp_slot_size
- * (a0..a7 = 64 bytes for LP64D). Only present in variadic functions. */
- return a->is_variadic ? native_frame_va_save_bytes(a->base.c->abi) : 0u;
-}
-
-/* Callee-saved registers are homed just below the locals at rv_save_off(), 8
- * bytes each — they are NOT frame slots, so the frame size must reserve their
- * bytes explicitly. Zero at -O0 (no callee-saves are taken). */
-static u32 rv_callee_save_bytes(const RvNativeTarget* a) {
- return a->frame.ncallee_saves * 8u;
-}
-
-static u32 rv_frame_size(const RvNativeTarget* a) {
- u32 raw = RV_FRAME_SAVE_SIZE + a->frame.cum_off + rv_callee_save_bytes(a) +
- a->frame.max_outgoing + rv_va_save_sz(a);
- return align_up_u32(raw, 16u);
-}
-
-static u32 rv_fp_pair_off(const RvNativeTarget* a, u32 frame_size) {
- return frame_size - RV_FRAME_SAVE_SIZE - rv_va_save_sz(a);
-}
-
-/* ============================ type helpers ============================ */
-
-/* Scalar size/align/mem/class/loc constructors are shared in native_target.h
- * (native_type_size, native_type_align, native_mem_for_type,
- * native_class_for_type_fp_le8, native_loc_reg, native_loc_stack,
- * native_loc_is_fp). loc_reg's mask is arch-specific and stays here. */
-
-/* A scalar value occupies a 64-bit register when it is pointer-sized or wider,
- * else it is a 32-bit value (drives ADDW vs ADD selection etc). */
-static int rv_is_64(NativeTarget* t, KitCgTypeId type) {
- return native_type_size(t, type) >= 8u || cg_type_is_ptr(t->c, type);
-}
-
-static u32 loc_reg(NativeLoc loc) { return loc.v.reg & 0x1fu; }
-
-/* ============================ register tables ============================ */
-
-#define RV_PHYS_INT_ARG(r, idx) \
- {.reg = (r), \
- .cls = NATIVE_REG_INT, \
- .abi_index = (idx), \
- .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
- ((idx) < 2u ? NATIVE_REG_RET : 0), \
- .spill_cost = 1u, \
- .copy_cost = 1u}
-#define RV_PHYS_INT_CALLER(r) \
- {.reg = (r), \
- .cls = NATIVE_REG_INT, \
- .abi_index = 0xffu, \
- .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
- .spill_cost = 1u, \
- .copy_cost = 1u}
-#define RV_PHYS_INT_CALLEE(r) \
- {.reg = (r), \
- .cls = NATIVE_REG_INT, \
- .abi_index = 0xffu, \
- .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
- .spill_cost = 4u, \
- .copy_cost = 1u}
-#define RV_PHYS_INT_RESERVED(r) \
- {.reg = (r), \
- .cls = NATIVE_REG_INT, \
- .abi_index = 0xffu, \
- .flags = NATIVE_REG_RESERVED, \
- .spill_cost = 0u, \
- .copy_cost = 0u}
-
-/* t0..t3 (x5,x6,x7,x28) are emit-internal scratch (RV_TMP0..RV_TMP3), reserved
- * and never handed to the allocator or driver. t4/t5 are the driver scratch
- * pool (disjoint from the emit temps so a hook can never clobber an operand the
- * driver parked there). t6 is the lone caller-saved allocable (the -O0 cache's
- * only caller-saved home); s1..s11 are appended callee-saved, chosen under
- * pressure (and saved by the optimizer prologue at -O1). */
-static const Reg rv_int_allocable[] = {31u, 9u, 18u, 19u, 20u, 21u,
- 22u, 23u, 24u, 25u, 26u, 27u};
-static const Reg rv_int_scratch[] = {29u, 30u}; /* t4, t5 */
-
-static const NativePhysRegInfo rv_int_phys[] = {
- RV_PHYS_INT_RESERVED(0u), /* zero */
- RV_PHYS_INT_RESERVED(1u), /* ra */
- RV_PHYS_INT_RESERVED(2u), /* sp */
- RV_PHYS_INT_RESERVED(3u), /* gp */
- RV_PHYS_INT_RESERVED(4u), /* tp */
- RV_PHYS_INT_RESERVED(5u), /* t0 = TMP0 */
- RV_PHYS_INT_RESERVED(6u), /* t1 = TMP1 */
- RV_PHYS_INT_RESERVED(7u), /* t2 = TMP2 (emit) */
- RV_PHYS_INT_RESERVED(8u), /* s0/fp */
- RV_PHYS_INT_CALLEE(9u), /* s1 */
- RV_PHYS_INT_ARG(10u, 0u), RV_PHYS_INT_ARG(11u, 1u),
- RV_PHYS_INT_ARG(12u, 2u), RV_PHYS_INT_ARG(13u, 3u),
- RV_PHYS_INT_ARG(14u, 4u), RV_PHYS_INT_ARG(15u, 5u),
- RV_PHYS_INT_ARG(16u, 6u), RV_PHYS_INT_ARG(17u, 7u),
- RV_PHYS_INT_CALLEE(18u), RV_PHYS_INT_CALLEE(19u),
- RV_PHYS_INT_CALLEE(20u), RV_PHYS_INT_CALLEE(21u),
- RV_PHYS_INT_CALLEE(22u), RV_PHYS_INT_CALLEE(23u),
- RV_PHYS_INT_CALLEE(24u), RV_PHYS_INT_CALLEE(25u),
- RV_PHYS_INT_CALLEE(26u), RV_PHYS_INT_CALLEE(27u),
- RV_PHYS_INT_RESERVED(28u), /* t3 = TMP3 (emit) */
- RV_PHYS_INT_RESERVED(29u), /* t4 = driver scratch */
- RV_PHYS_INT_RESERVED(30u), /* t5 = driver scratch */
- RV_PHYS_INT_CALLER(31u), /* t6 = caller-saved allocable */
-};
-
-#define RV_PHYS_FP_ARG(r, idx) \
- {.reg = (r), \
- .cls = NATIVE_REG_FP, \
- .abi_index = (idx), \
- .flags = NATIVE_REG_CALLER_SAVED | NATIVE_REG_ARG | \
- ((idx) < 2u ? NATIVE_REG_RET : 0), \
- .spill_cost = 1u, \
- .copy_cost = 1u}
-#define RV_PHYS_FP_CALLER(r) \
- {.reg = (r), \
- .cls = NATIVE_REG_FP, \
- .abi_index = 0xffu, \
- .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLER_SAVED, \
- .spill_cost = 1u, \
- .copy_cost = 1u}
-#define RV_PHYS_FP_CALLEE(r) \
- {.reg = (r), \
- .cls = NATIVE_REG_FP, \
- .abi_index = 0xffu, \
- .flags = NATIVE_REG_ALLOCABLE | NATIVE_REG_CALLEE_SAVED, \
- .spill_cost = 4u, \
- .copy_cost = 1u}
-#define RV_PHYS_FP_RESERVED(r) \
- {.reg = (r), \
- .cls = NATIVE_REG_FP, \
- .abi_index = 0xffu, \
- .flags = NATIVE_REG_RESERVED, \
- .spill_cost = 0u, \
- .copy_cost = 0u}
-
-/* Caller-saved allocable first (ft4..ft7, ft8..ft11), then callee (fs0..fs11).
- * ft0/ft1 reserved as emit-internal scratch; ft2/ft3 driver scratch. */
-static const Reg rv_fp_allocable[] = {4u, 5u, 6u, 7u, 28u, 29u, 30u,
- 31u, 8u, 9u, 18u, 19u, 20u, 21u,
- 22u, 23u, 24u, 25u, 26u, 27u};
-static const Reg rv_fp_scratch[] = {2u, 3u}; /* ft2, ft3 */
-
-static const NativePhysRegInfo rv_fp_phys[] = {
- RV_PHYS_FP_RESERVED(0u), /* ft0 = FTMP0 */
- RV_PHYS_FP_RESERVED(1u), /* ft1 = FTMP1 */
- RV_PHYS_FP_RESERVED(2u), /* ft2 = scratch */
- RV_PHYS_FP_RESERVED(3u), /* ft3 = scratch */
- RV_PHYS_FP_CALLER(4u), RV_PHYS_FP_CALLER(5u), RV_PHYS_FP_CALLER(6u),
- RV_PHYS_FP_CALLER(7u), RV_PHYS_FP_CALLEE(8u), RV_PHYS_FP_CALLEE(9u),
- RV_PHYS_FP_ARG(10u, 0u), RV_PHYS_FP_ARG(11u, 1u), RV_PHYS_FP_ARG(12u, 2u),
- RV_PHYS_FP_ARG(13u, 3u), RV_PHYS_FP_ARG(14u, 4u), RV_PHYS_FP_ARG(15u, 5u),
- RV_PHYS_FP_ARG(16u, 6u), RV_PHYS_FP_ARG(17u, 7u), RV_PHYS_FP_CALLEE(18u),
- RV_PHYS_FP_CALLEE(19u), RV_PHYS_FP_CALLEE(20u), RV_PHYS_FP_CALLEE(21u),
- RV_PHYS_FP_CALLEE(22u), RV_PHYS_FP_CALLEE(23u), RV_PHYS_FP_CALLEE(24u),
- RV_PHYS_FP_CALLEE(25u), RV_PHYS_FP_CALLEE(26u), RV_PHYS_FP_CALLEE(27u),
- RV_PHYS_FP_CALLER(28u), RV_PHYS_FP_CALLER(29u), RV_PHYS_FP_CALLER(30u),
- RV_PHYS_FP_CALLER(31u),
-};
-
-static const NativeAllocClassInfo rv_classes[] = {
- {.cls = NATIVE_REG_INT,
- .allocable = rv_int_allocable,
- .nallocable = sizeof rv_int_allocable / sizeof rv_int_allocable[0],
- .scratch = rv_int_scratch,
- .nscratch = sizeof rv_int_scratch / sizeof rv_int_scratch[0],
- .phys = rv_int_phys,
- .nphys = sizeof rv_int_phys / sizeof rv_int_phys[0],
- /* t0-t6 (5-7,28-31) + a0-a7 (10-17) */
- .caller_saved_mask = 0xf00400e0u | 0x0001fc00u,
- /* s0-s11 (8,9,18-27) */
- .callee_saved_mask = 0x0ffc0300u,
- .arg_mask = 0x0001fc00u,
- .ret_mask = 0x00000c00u,
- /* zero,ra,sp,gp,tp,t0,t1,t2,s0 (bits 0-8) + t3 (bit 28). t4/t5 are the
- * driver scratch pool (reserved-from-alloc but listed in scratch[]). */
- .reserved_mask = 0x000001ffu | (1u << 28)},
- {.cls = NATIVE_REG_FP,
- .allocable = rv_fp_allocable,
- .nallocable = sizeof rv_fp_allocable / sizeof rv_fp_allocable[0],
- .scratch = rv_fp_scratch,
- .nscratch = sizeof rv_fp_scratch / sizeof rv_fp_scratch[0],
- .phys = rv_fp_phys,
- .nphys = sizeof rv_fp_phys / sizeof rv_fp_phys[0],
- /* ft0-ft7 (0-7), fa0-fa7 (10-17), ft8-ft11 (28-31) */
- .caller_saved_mask = 0xf00400ffu | 0x0001fc00u,
- /* fs0-fs11 (8,9,18-27) */
- .callee_saved_mask = 0x0ffc0300u,
- .arg_mask = 0x0001fc00u,
- .ret_mask = 0x00000c00u,
- .reserved_mask = 0x0000000fu /* ft0-ft3 */},
-};
-
-/* Resolve a register name ("a7", "fa0", ...) to its (class, Reg). Powers the
- * optimizer's inline-asm clobber masks and explicit hard-register operands
- * ("{a7}" from a GNU local register variable). x0..x31 are DWARF 0..31; the
- * FP bank f0..f31 is DWARF 32..63. Returns non-zero for a non-register name
- * (cc/memory/unknown), which the caller skips. */
-static int rv_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out,
- NativeAllocClass* cls_out) {
- char buf[16];
- uint32_t dwarf;
- (void)ri;
- if (!name.s || !name.len || name.len >= sizeof buf) return 1;
- memcpy(buf, name.s, name.len);
- buf[name.len] = '\0';
- if (rv64_register_index(buf, &dwarf) != 0) return 1;
- if (dwarf <= 31u) {
- *cls_out = NATIVE_REG_INT;
- *out = (Reg)dwarf;
- return 0;
- }
- if (dwarf >= 32u && dwarf <= 63u) {
- *cls_out = NATIVE_REG_FP;
- *out = (Reg)(dwarf - 32u);
- return 0;
- }
- return 1;
-}
-
-static int rv_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls,
- Reg reg) {
- (void)ri;
- if (cls == NATIVE_REG_INT) {
- if (reg == 9u) return 1; /* s1 */
- if (reg >= 10u && reg <= 17u) return 1; /* a0..a7 */
- if (reg >= 18u && reg <= 27u) return 1; /* s2..s11 */
- if (reg == 31u) return 1; /* t6 */
- return 0;
- }
- if (cls == NATIVE_REG_FP) return reg >= 4u && reg <= 31u;
- return 0;
-}
-
-static const NativeRegInfo rv_reg_info = {
- .classes = rv_classes,
- .nclasses = sizeof rv_classes / sizeof rv_classes[0],
- .resolve_name = rv_resolve_name,
- .asm_operand_reg_ok = rv_asm_operand_reg_ok,
-};
-
-/* ============================ legality ============================ */
-
-static int rv_imm_legal(NativeTarget* t, NativeImmUse use, u32 op,
- KitCgTypeId type, i64 imm) {
- (void)t;
- (void)type;
- switch (use) {
- case NATIVE_IMM_MOVE:
- return 1;
- case NATIVE_IMM_BINOP:
- switch ((BinOp)op) {
- case BO_IADD:
- return fits_i12(imm);
- case BO_ISUB:
- return fits_i12(-imm); /* emitted as ADDI with negated imm */
- case BO_AND:
- case BO_OR:
- case BO_XOR:
- return fits_i12(imm);
- case BO_SHL:
- case BO_SHR_S:
- case BO_SHR_U:
- return imm >= 0 && imm <= 63;
- default:
- return 0;
- }
- case NATIVE_IMM_CMP:
- return imm == 0; /* compares need both ends in registers (SLT/branch) */
- case NATIVE_IMM_ADDR_OFFSET:
- return fits_i12(imm);
- }
- return 0;
-}
-
-static int rv_addr_legal(NativeTarget* t, const NativeAddr* addr,
- MemAccess mem) {
- (void)t;
- (void)mem;
- if (!addr) return 0;
- if (addr->index_kind != NATIVE_ADDR_INDEX_NONE) return 0;
- if (addr->base_kind != NATIVE_ADDR_BASE_REG &&
- addr->base_kind != NATIVE_ADDR_BASE_FRAME)
- return 0;
- return fits_i12(addr->offset);
-}
-
-/* ============================ memory ============================ */
-
-/* Materialize the runtime address of a global into `dst`, including addend. */
-static void rv_emit_global_addr(RvNativeTarget* a, u32 dst, ObjSymId sym,
- i64 addend) {
- NativeTarget* t = &a->base;
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- if (obj_symbol_extern_via_got(t->c, t->obj, sym)) {
- u32 ap = mc->pos(mc);
- rv64_emit32(mc, rv_auipc(dst, 0));
- mc->emit_reloc_at(mc, sec, ap, R_RV_GOT_HI20, sym, 0, 0, 0);
- {
- Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
- ObjSymId anchor =
- obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
- u32 lp = mc->pos(mc);
- rv64_emit32(mc, rv_ld(dst, dst, 0));
- mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
- }
- } else {
- u32 ap = mc->pos(mc);
- rv64_emit32(mc, rv_auipc(dst, 0));
- mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0);
- {
- Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
- ObjSymId anchor =
- obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
- u32 lp = mc->pos(mc);
- rv64_emit32(mc, rv_addi(dst, dst, 0));
- mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
- }
- }
- if (addend) rv_emit_addr_adjust(mc, dst, dst, (i32)addend);
-}
-
-/* Fold (base_reg << 0) + (index << scale) into RV_TMP0 via Zba. */
-static u32 rv_fold_index(RvNativeTarget* a, u32 base, u32 idx, u8 log2_scale) {
- MCEmitter* mc = a->base.mc;
- switch (log2_scale) {
- case 0:
- rv64_emit32(mc, rv_add(RV_TMP0, base, idx));
- break;
- case 1:
- rv64_emit32(mc, rv_sh1add(RV_TMP0, idx, base));
- break;
- case 2:
- rv64_emit32(mc, rv_sh2add(RV_TMP0, idx, base));
- break;
- default:
- rv64_emit32(mc, rv_sh3add(RV_TMP0, idx, base));
- break;
- }
- return RV_TMP0;
-}
-
-/* Resolve any NativeAddr to a base register + imm12 offset. RISC-V has no
- * indexed load/store, so an index is folded into RV_TMP0 via Zba; far offsets
- * and FRAME/FRAME_VALUE/GLOBAL bases are materialized into RV_TMP0/RV_TMP1. */
-static void rv_resolve_mem_addr(RvNativeTarget* a, const NativeAddr* addr,
- u32* base_out, i32* off_out) {
- MCEmitter* mc = a->base.mc;
- u32 base;
- i32 off;
- switch (addr->base_kind) {
- case NATIVE_ADDR_BASE_REG:
- base = addr->base.reg & 0x1fu;
- off = addr->offset;
- break;
- case NATIVE_ADDR_BASE_FRAME: {
- RvNativeSlot* s = rv_slot_get(a, addr->base.frame);
- base = RV_S0;
- off = rv_s0_off_slot(s) + addr->offset;
- break;
- }
- case NATIVE_ADDR_BASE_FRAME_VALUE: {
- RvNativeSlot* s = rv_slot_get(a, addr->base.frame);
- rv64_emit32(mc, rv_ld(RV_TMP0, RV_S0, rv_s0_off_slot(s)));
- base = RV_TMP0;
- off = addr->offset;
- break;
- }
- case NATIVE_ADDR_BASE_GLOBAL:
- rv_emit_global_addr(a, RV_TMP0, addr->base.global.sym,
- addr->base.global.addend);
- base = RV_TMP0;
- off = addr->offset;
- break;
- default:
- rv_panic(a, "unsupported address base");
- }
- if (addr->index_kind == NATIVE_ADDR_INDEX_REG) {
- base = rv_fold_index(a, base, addr->index.reg & 0x1fu, addr->log2_scale);
- } else if (addr->index_kind == NATIVE_ADDR_INDEX_FRAME_VALUE) {
- RvNativeSlot* s = rv_slot_get(a, addr->index.frame);
- rv64_emit32(mc, rv_ld(RV_TMP1, RV_S0, rv_s0_off_slot(s)));
- base = rv_fold_index(a, base, RV_TMP1, addr->log2_scale);
- }
- if (!fits_i12(off)) {
- rv_emit_load_imm(mc, 1, RV_TMP1, (i64)off);
- rv64_emit32(mc, rv_add(RV_TMP0, base, RV_TMP1));
- base = RV_TMP0;
- off = 0;
- }
- *base_out = base;
- *off_out = off;
-}
-
-/* Central load/store primitive. is_load: 1 load into reg, 0 store reg to mem.
- */
-static void rv_emit_mem(RvNativeTarget* a, int is_load, NativeLoc reg,
- NativeAddr addr, MemAccess mem) {
- NativeTarget* t = &a->base;
- MCEmitter* mc = t->mc;
- u32 r = loc_reg(reg);
- int fp = native_loc_is_fp(reg);
- u32 sz = mem.size ? mem.size : native_type_size(t, reg.type);
- u32 base;
- i32 off;
-
- rv_resolve_mem_addr(a, &addr, &base, &off);
- if (fp) {
- rv64_emit32(
- mc, is_load ? (sz == 8u ? rv_fld(r, base, off) : rv_flw(r, base, off))
- : (sz == 8u ? rv_fsd(r, base, off) : rv_fsw(r, base, off)));
- } else {
- rv64_emit32(mc, is_load ? enc_int_load(sz, 0, r, base, off)
- : enc_int_store(sz, r, base, off));
- }
-}
-
-/* ============================ moves / data ============================ */
-
-static void rv_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
- MCEmitter* mc = t->mc;
- int dfp = native_loc_is_fp(dst), sfp = native_loc_is_fp(src);
- u32 rd = loc_reg(dst), rs = loc_reg(src);
- if (dfp && sfp) {
- u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
- if (rd == rs) return;
- rv64_emit32(mc, rv_fsgnj(fmt, rd, rs, rs));
- return;
- }
- if (!dfp && sfp) {
- u32 sz = native_type_size(t, src.type);
- rv64_emit32(mc, sz == 8u ? rv_fmv_x_d(rd, rs) : rv_fmv_x_w(rd, rs));
- return;
- }
- if (dfp && !sfp) {
- u32 sz = native_type_size(t, dst.type);
- rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(rd, rs) : rv_fmv_w_x(rd, rs));
- return;
- }
- if (rd == rs) return;
- rv64_emit32(mc, rv_addi(rd, rs, 0));
-}
-
-static void rv_load_imm(NativeTarget* t, NativeLoc dst, i64 imm) {
- rv_emit_load_imm(t->mc, rv_is_64(t, dst.type) ? 1u : 0u, loc_reg(dst), imm);
-}
-
-static void rv_load_const(NativeTarget* t, NativeLoc dst, ConstBytes cb) {
- RvNativeTarget* a = rv_of(t);
- u64 v = 0;
- u32 i;
- if (!native_loc_is_fp(dst)) {
- for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
- rv_load_imm(t, dst, (i64)v);
- return;
- }
- /* FP constant: materialize the bit pattern in TMP0, bitcast into the FPR. */
- for (i = 0; i < cb.size && i < 8u; ++i) v |= (u64)cb.bytes[i] << (i * 8u);
- rv_emit_load_imm(t->mc, 1, RV_TMP0, (i64)v);
- if (cb.size == 8u)
- rv64_emit32(t->mc, rv_fmv_d_x(loc_reg(dst), RV_TMP0));
- else
- rv64_emit32(t->mc, rv_fmv_w_x(loc_reg(dst), RV_TMP0));
- (void)a;
-}
-
-static void rv_load_addr(NativeTarget* t, NativeLoc dst, NativeAddr addr) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 rd = loc_reg(dst);
- u32 base;
- i32 off;
- if (addr.base_kind == NATIVE_ADDR_BASE_GLOBAL) {
- rv_emit_global_addr(a, rd, addr.base.global.sym,
- addr.base.global.addend + addr.offset);
- base = rd;
- off = 0;
- } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
- /* Load the pointer stored in the frame slot, then add the offset. */
- RvNativeSlot* s = rv_slot_get(a, addr.base.frame);
- rv64_emit32(mc, rv_ld(rd, RV_S0, rv_s0_off_slot(s)));
- base = rd;
- off = addr.offset;
- } else if (addr.base_kind == NATIVE_ADDR_BASE_FRAME) {
- RvNativeSlot* s = rv_slot_get(a, addr.base.frame);
- base = RV_S0;
- off = rv_s0_off_slot(s) + addr.offset;
- } else if (addr.base_kind == NATIVE_ADDR_BASE_REG) {
- base = addr.base.reg & 0x1fu;
- off = addr.offset;
- } else {
- rv_panic(a, "unsupported address base in load_addr");
- }
- /* Fold any index via Zba sh{1,2,3}add (index << scale) + base. */
- if (addr.index_kind == NATIVE_ADDR_INDEX_REG) {
- u32 idx = addr.index.reg & 0x1fu;
- if (off != 0 || base != rd) rv_emit_addr_adjust(mc, rd, base, off);
- switch (addr.log2_scale) {
- case 0:
- rv64_emit32(mc, rv_add(rd, rd, idx));
- break;
- case 1:
- rv64_emit32(mc, rv_sh1add(rd, idx, rd));
- break;
- case 2:
- rv64_emit32(mc, rv_sh2add(rd, idx, rd));
- break;
- default:
- rv64_emit32(mc, rv_sh3add(rd, idx, rd));
- break;
- }
- return;
- }
- rv_emit_addr_adjust(mc, rd, base, off);
-}
-
-static void rv_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
- MemAccess mem) {
- rv_emit_mem(rv_of(t), 1, dst, addr, mem);
-}
-static void rv_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
- MemAccess mem) {
- rv_emit_mem(rv_of(t), 0, src, addr, mem);
-}
-
-/* copy_bytes: resolve dst and src to dedicated pointer regs (RV_TMP3 / RV_TMP0)
- * once, then copy granule-by-granule advancing both pointers. dst is resolved
- * first because its base may itself live in RV_TMP1 (the transfer reg, e.g. the
- * sret pointer from plan_ret); capturing it into RV_TMP3 before src resolution
- * (which may clobber RV_TMP1 for far offsets) keeps it live. Advancing the
- * pointers keeps every load/store at offset 0, so no offset ever exceeds imm12
- * and the transfer reg never aliases a base. */
-static void rv_copy_bytes(NativeTarget* t, NativeAddr dst, NativeAddr src,
- AggregateAccess access) {
- MCEmitter* mc = t->mc;
- KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
- u32 rem = access.size;
- rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst);
- rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0), src);
- while (rem) {
- u32 sz = rem >= 8u ? 8u : rem >= 4u ? 4u : rem >= 2u ? 2u : 1u;
- rv64_emit32(mc, enc_int_load(sz, 0, RV_TMP1, RV_TMP0, 0));
- rv64_emit32(mc, enc_int_store(sz, RV_TMP1, RV_TMP3, 0));
- rv64_emit32(mc, rv_addi(RV_TMP0, RV_TMP0, (i32)sz));
- rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, (i32)sz));
- rem -= sz;
- }
-}
-
-static void rv_set_bytes(NativeTarget* t, NativeAddr dst, NativeLoc byte_value,
- AggregateAccess access) {
- MCEmitter* mc = t->mc;
- KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
- u32 bv = loc_reg(byte_value);
- u32 rem = access.size;
- rv_load_addr(t, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP3), dst);
- while (rem) {
- rv64_emit32(mc, rv_sb(bv, RV_TMP3, 0));
- rv64_emit32(mc, rv_addi(RV_TMP3, RV_TMP3, 1));
- rem -= 1u;
- }
-}
-
-/* ============================ arithmetic ============================ */
-
-static void rv_binop(NativeTarget* t, BinOp op, NativeLoc dst, NativeLoc aop,
- NativeLoc bop) {
- MCEmitter* mc = t->mc;
- u32 rd = loc_reg(dst);
- u32 ra = loc_reg(aop);
- int sf = rv_is_64(t, dst.type);
- int b_imm = bop.kind == NATIVE_LOC_IMM;
- u32 rb = b_imm ? 0u : loc_reg(bop);
- i64 imm = b_imm ? bop.v.imm : 0;
-
- switch (op) {
- case BO_FADD:
- case BO_FSUB:
- case BO_FMUL:
- case BO_FDIV: {
- u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
- switch (op) {
- case BO_FADD:
- rv64_emit32(mc, rv_fadd(fmt, rd, ra, rb));
- break;
- case BO_FSUB:
- rv64_emit32(mc, rv_fsub(fmt, rd, ra, rb));
- break;
- case BO_FMUL:
- rv64_emit32(mc, rv_fmul(fmt, rd, ra, rb));
- break;
- default:
- rv64_emit32(mc, rv_fdiv(fmt, rd, ra, rb));
- break;
- }
- return;
- }
- case BO_IADD:
- if (b_imm) {
- rv64_emit32(
- mc, sf ? rv_addi(rd, ra, (i32)imm) : rv_addiw(rd, ra, (i32)imm));
- } else {
- rv64_emit32(mc, sf ? rv_add(rd, ra, rb) : rv_addw(rd, ra, rb));
- }
- return;
- case BO_ISUB:
- if (b_imm) {
- rv64_emit32(
- mc, sf ? rv_addi(rd, ra, (i32)-imm) : rv_addiw(rd, ra, (i32)-imm));
- } else {
- rv64_emit32(mc, sf ? rv_sub(rd, ra, rb) : rv_subw(rd, ra, rb));
- }
- return;
- case BO_IMUL:
- rv64_emit32(mc, sf ? rv_mul(rd, ra, rb) : rv_mulw(rd, ra, rb));
- return;
- case BO_SDIV:
- rv64_emit32(mc, sf ? rv_div(rd, ra, rb) : rv_divw(rd, ra, rb));
- return;
- case BO_UDIV:
- rv64_emit32(mc, sf ? rv_divu(rd, ra, rb) : rv_divuw(rd, ra, rb));
- return;
- case BO_SREM:
- rv64_emit32(mc, sf ? rv_rem(rd, ra, rb) : rv_remw(rd, ra, rb));
- return;
- case BO_UREM:
- rv64_emit32(mc, sf ? rv_remu(rd, ra, rb) : rv_remuw(rd, ra, rb));
- return;
- case BO_AND:
- rv64_emit32(mc, b_imm ? rv_andi(rd, ra, (i32)imm) : rv_and(rd, ra, rb));
- return;
- case BO_OR:
- rv64_emit32(mc, b_imm ? rv_ori(rd, ra, (i32)imm) : rv_or(rd, ra, rb));
- return;
- case BO_XOR:
- rv64_emit32(mc, b_imm ? rv_xori(rd, ra, (i32)imm) : rv_xor(rd, ra, rb));
- return;
- case BO_SHL:
- if (b_imm)
- rv64_emit32(mc, sf ? rv_slli(rd, ra, (u32)imm & 63u)
- : rv_slliw(rd, ra, (u32)imm & 31u));
- else
- rv64_emit32(mc, sf ? rv_sll(rd, ra, rb) : rv_sllw(rd, ra, rb));
- return;
- case BO_SHR_U:
- if (b_imm)
- rv64_emit32(mc, sf ? rv_srli(rd, ra, (u32)imm & 63u)
- : rv_srliw(rd, ra, (u32)imm & 31u));
- else
- rv64_emit32(mc, sf ? rv_srl(rd, ra, rb) : rv_srlw(rd, ra, rb));
- return;
- case BO_SHR_S:
- if (b_imm)
- rv64_emit32(mc, sf ? rv_srai(rd, ra, (u32)imm & 63u)
- : rv_sraiw(rd, ra, (u32)imm & 31u));
- else
- rv64_emit32(mc, sf ? rv_sra(rd, ra, rb) : rv_sraw(rd, ra, rb));
- return;
- default:
- rv_panic(rv_of(t), "unsupported binop");
- }
-}
-
-static void rv_unop(NativeTarget* t, UnOp op, NativeLoc dst, NativeLoc src) {
- MCEmitter* mc = t->mc;
- u32 rd = loc_reg(dst), rs = loc_reg(src);
- int sf = rv_is_64(t, dst.type);
- switch (op) {
- case UO_NEG:
- rv64_emit32(mc, sf ? rv_sub(rd, RV_ZERO, rs) : rv_subw(rd, RV_ZERO, rs));
- return;
- case UO_FNEG: {
- u32 fmt = native_type_size(t, dst.type) == 8u ? RV_FMT_D : RV_FMT_S;
- rv64_emit32(mc, rv_fsgnjn(fmt, rd, rs, rs));
- return;
- }
- case UO_BNOT:
- rv64_emit32(mc, rv_xori(rd, rs, -1));
- return;
- case UO_NOT:
- rv64_emit32(mc, rv_sltiu(rd, rs, 1));
- return;
- default:
- rv_panic(rv_of(t), "unsupported unop");
- }
-}
-
-/* Sign/zero-extend a 32-bit operand into a 64-bit register for comparison.
- * Returns the register to compare. */
-static u32 rv_cmp_ext(NativeTarget* t, int is_signed, NativeLoc op, u32 tmp) {
- MCEmitter* mc = t->mc;
- u32 r = loc_reg(op);
- if (rv_is_64(t, op.type)) return r;
- if (is_signed) {
- rv64_emit32(mc, rv_addiw(tmp, r, 0)); /* sign-extend low 32 */
- } else {
- rv64_emit32(mc, rv_slli(tmp, r, 32));
- rv64_emit32(mc, rv_srli(tmp, tmp, 32));
- }
- return tmp;
-}
-
-static int cmp_is_signed(CmpOp op) {
- switch (op) {
- case CMP_LT_U:
- case CMP_LE_U:
- case CMP_GT_U:
- case CMP_GE_U:
- return 0;
- default:
- return 1;
- }
-}
-
-/* Emit a 0/1 comparison result into rd from two integer registers. */
-static void rv_emit_icmp(NativeTarget* t, CmpOp op, u32 rd, u32 ra, u32 rb) {
- MCEmitter* mc = t->mc;
- switch (op) {
- case CMP_EQ:
- rv64_emit32(mc, rv_sub(rd, ra, rb));
- rv64_emit32(mc, rv_sltiu(rd, rd, 1));
- return;
- case CMP_NE:
- rv64_emit32(mc, rv_sub(rd, ra, rb));
- rv64_emit32(mc, rv_sltu(rd, RV_ZERO, rd));
- return;
- case CMP_LT_S:
- rv64_emit32(mc, rv_slt(rd, ra, rb));
- return;
- case CMP_LT_U:
- rv64_emit32(mc, rv_sltu(rd, ra, rb));
- return;
- case CMP_GT_S:
- rv64_emit32(mc, rv_slt(rd, rb, ra));
- return;
- case CMP_GT_U:
- rv64_emit32(mc, rv_sltu(rd, rb, ra));
- return;
- case CMP_GE_S:
- rv64_emit32(mc, rv_slt(rd, ra, rb));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- case CMP_GE_U:
- rv64_emit32(mc, rv_sltu(rd, ra, rb));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- case CMP_LE_S:
- rv64_emit32(mc, rv_slt(rd, rb, ra));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- case CMP_LE_U:
- rv64_emit32(mc, rv_sltu(rd, rb, ra));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- default:
- rv_panic(rv_of(t), "unsupported integer cmp");
- }
-}
-
-/* Format-dispatching wrappers over the ordered FP compares (feq/flt/fle are
- * ordered: they yield 0 on NaN; flt/fle are signaling, raising NV on NaN —
- * pre-existing for ordered ops, and the boolean result is still correct). */
-static u32 rv_feq_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
- return fmt == RV_FMT_D ? rv_feq_d(rd, ra, rb) : rv_feq_s(rd, ra, rb);
-}
-static u32 rv_flt_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
- return fmt == RV_FMT_D ? rv_flt_d(rd, ra, rb) : rv_flt_s(rd, ra, rb);
-}
-static u32 rv_fle_fmt(u32 fmt, u32 rd, u32 ra, u32 rb) {
- return fmt == RV_FMT_D ? rv_fle_d(rd, ra, rb) : rv_fle_s(rd, ra, rb);
-}
-
-static void rv_cmp(NativeTarget* t, CmpOp op, NativeLoc dst, NativeLoc aop,
- NativeLoc bop) {
- MCEmitter* mc = t->mc;
- u32 rd = loc_reg(dst);
- /* FP-ness is self-describing from the opcode (FP block starts at CMP_OEQ_F).
- * Unordered predicates use unordered-R == NOT(ordered-not-R): the ordered
- * compare into rd, then `xori rd,rd,1`. ONE/UEQ have no single ordered
- * primitive and OR the two strict relations (a<b | a>b) via scratch RV_TMP2
- * (x7, reserved & never allocable, so it can't alias rd). */
- if (op >= CMP_OEQ_F) {
- u32 fmt = native_type_size(t, aop.type) == 8u ? RV_FMT_D : RV_FMT_S;
- u32 ra = loc_reg(aop), rb = loc_reg(bop);
- switch (op) {
- case CMP_OEQ_F:
- rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb));
- return;
- case CMP_UNE_F: /* !(OEQ) */
- rv64_emit32(mc, rv_feq_fmt(fmt, rd, ra, rb));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- case CMP_OLT_F:
- rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
- return;
- case CMP_OLE_F:
- rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb));
- return;
- case CMP_OGT_F:
- rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra));
- return;
- case CMP_OGE_F:
- rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra));
- return;
- case CMP_UGE_F: /* !(OLT) */
- rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- case CMP_UGT_F: /* !(OLE) */
- rv64_emit32(mc, rv_fle_fmt(fmt, rd, ra, rb));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- case CMP_ULE_F: /* !(OGT) */
- rv64_emit32(mc, rv_flt_fmt(fmt, rd, rb, ra));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- case CMP_ULT_F: /* !(OGE) */
- rv64_emit32(mc, rv_fle_fmt(fmt, rd, rb, ra));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- case CMP_ONE_F: /* ordered & !=: (a<b) | (a>b) */
- rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
- rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra));
- rv64_emit32(mc, rv_or(rd, rd, RV_TMP2));
- return;
- case CMP_UEQ_F: /* unordered | ==: !((a<b) | (a>b)) */
- rv64_emit32(mc, rv_flt_fmt(fmt, rd, ra, rb));
- rv64_emit32(mc, rv_flt_fmt(fmt, RV_TMP2, rb, ra));
- rv64_emit32(mc, rv_or(rd, rd, RV_TMP2));
- rv64_emit32(mc, rv_xori(rd, rd, 1));
- return;
- default:
- rv_panic(rv_of(t), "unsupported fp cmp");
- }
- }
- {
- int sg = cmp_is_signed(op);
- u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0);
- u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1);
- rv_emit_icmp(t, op, rd, ra, rb);
- }
-}
-
-static void rv_convert(NativeTarget* t, ConvKind op, NativeLoc dst,
- NativeLoc src) {
- MCEmitter* mc = t->mc;
- u32 rd = loc_reg(dst), rs = loc_reg(src);
- u32 src_sz = native_type_size(t, src.type);
- u32 dst_sz = native_type_size(t, dst.type);
- switch (op) {
- case CV_SEXT:
- if (src_sz >= 4u) {
- rv64_emit32(mc, rv_addiw(rd, rs, 0));
- } else {
- u32 sh = 64u - src_sz * 8u;
- rv64_emit32(mc, rv_slli(rd, rs, sh));
- rv64_emit32(mc, rv_srai(rd, rd, sh));
- }
- return;
- case CV_ZEXT: {
- u32 sh = 64u - src_sz * 8u;
- rv64_emit32(mc, rv_slli(rd, rs, sh));
- rv64_emit32(mc, rv_srli(rd, rd, sh));
- return;
- }
- case CV_TRUNC:
- if (rd != rs || dst_sz <= 4u)
- rv64_emit32(mc, rv_addi(rd, rs, 0)); /* low bits; users re-narrow */
- return;
- case CV_ITOF_S:
- if (native_type_size(t, dst.type) == 8u)
- rv64_emit32(mc,
- src_sz == 8u ? rv_fcvt_d_l(rd, rs) : rv_fcvt_d_w(rd, rs));
- else
- rv64_emit32(mc,
- src_sz == 8u ? rv_fcvt_s_l(rd, rs) : rv_fcvt_s_w(rd, rs));
- return;
- case CV_ITOF_U:
- if (native_type_size(t, dst.type) == 8u)
- rv64_emit32(mc,
- src_sz == 8u ? rv_fcvt_d_lu(rd, rs) : rv_fcvt_d_wu(rd, rs));
- else
- rv64_emit32(mc,
- src_sz == 8u ? rv_fcvt_s_lu(rd, rs) : rv_fcvt_s_wu(rd, rs));
- return;
- case CV_FTOI_S:
- if (src_sz == 8u)
- rv64_emit32(mc,
- dst_sz == 8u ? rv_fcvt_l_d(rd, rs) : rv_fcvt_w_d(rd, rs));
- else
- rv64_emit32(mc,
- dst_sz == 8u ? rv_fcvt_l_s(rd, rs) : rv_fcvt_w_s(rd, rs));
- return;
- case CV_FTOI_U:
- if (src_sz == 8u)
- rv64_emit32(mc,
- dst_sz == 8u ? rv_fcvt_lu_d(rd, rs) : rv_fcvt_wu_d(rd, rs));
- else
- rv64_emit32(mc,
- dst_sz == 8u ? rv_fcvt_lu_s(rd, rs) : rv_fcvt_wu_s(rd, rs));
- return;
- case CV_FEXT:
- rv64_emit32(mc, rv_fcvt_d_s(rd, rs));
- return;
- case CV_FTRUNC:
- rv64_emit32(mc, rv_fcvt_s_d(rd, rs));
- return;
- case CV_BITCAST:
- rv_move(t, dst, src);
- return;
- default:
- rv_panic(rv_of(t), "unsupported convert");
- }
-}
-
-/* ============================ spill / reload ============================ */
-
-static void rv_spill(NativeTarget* t, NativeLoc src, NativeFrameSlot slot,
- MemAccess mem) {
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = slot;
- addr.base_type = src.type;
- rv_emit_mem(rv_of(t), 0, src, addr, mem);
-}
-static void rv_reload(NativeTarget* t, NativeLoc dst, NativeFrameSlot slot,
- MemAccess mem) {
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = slot;
- addr.base_type = dst.type;
- rv_emit_mem(rv_of(t), 1, dst, addr, mem);
-}
-
-/* ============================ control flow ============================ */
-
-static MCLabel rv_label_new(NativeTarget* t) { return t->mc->label_new(t->mc); }
-static void rv_label_place(NativeTarget* t, MCLabel l) {
- t->mc->label_place(t->mc, l);
-}
-static void rv_jump(NativeTarget* t, MCLabel l) {
- rv64_emit32(t->mc, rv_jal(RV_ZERO, 0));
- t->mc->emit_label_ref(t->mc, l, R_RV_JAL, 4, 0);
-}
-
-static void rv_cmp_branch(NativeTarget* t, CmpOp op, NativeLoc aop,
- NativeLoc bop, MCLabel l) {
- MCEmitter* mc = t->mc;
- /* RISC-V B-type branches reach only ±4 KiB, which a single (especially
- * -O0) function can exceed between a branch and its target. Rather than a
- * lone conditional branch to the label, emit a short *inverted* branch
- * that skips an unconditional `jal` (±1 MiB) to the target. The inverted
- * branch's displacement is the constant SKIP_JAL (skip just the jal) and
- * so is always in range; the jal carries the long reach. See rv_jump. */
- enum { SKIP_JAL = 8 }; /* branch over the 4-byte jal that follows it */
- /* FP compares have no register-register branch form: materialize the 0/1
- * into TMP0 via rv_cmp (handles all 12 predicates), then branch on nonzero.
- */
- if (op >= CMP_OEQ_F) {
- NativeLoc tmp =
- native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0);
- rv_cmp(t, op, tmp, aop, bop);
- /* Skip the jal when the result is 0 (condition false). */
- rv64_emit32(mc, rv_beq(RV_TMP0, RV_ZERO, SKIP_JAL));
- rv_jump(t, l);
- return;
- }
- {
- int sg = cmp_is_signed(op);
- u32 ra = rv_cmp_ext(t, sg, aop, RV_TMP0);
- u32 rb = rv_cmp_ext(t, sg, bop, RV_TMP1);
- u32 word;
- /* Encode the *inverse* of `op`, skipping the jal when NOT taken. */
- switch (op) {
- case CMP_EQ:
- word = rv_bne(ra, rb, SKIP_JAL);
- break;
- case CMP_NE:
- word = rv_beq(ra, rb, SKIP_JAL);
- break;
- case CMP_LT_S:
- word = rv_bge(ra, rb, SKIP_JAL);
- break;
- case CMP_GE_S:
- word = rv_blt(ra, rb, SKIP_JAL);
- break;
- case CMP_LT_U:
- word = rv_bgeu(ra, rb, SKIP_JAL);
- break;
- case CMP_GE_U:
- word = rv_bltu(ra, rb, SKIP_JAL);
- break;
- case CMP_GT_S:
- word = rv_bge(rb, ra, SKIP_JAL);
- break;
- case CMP_LE_S:
- word = rv_blt(rb, ra, SKIP_JAL);
- break;
- case CMP_GT_U:
- word = rv_bgeu(rb, ra, SKIP_JAL);
- break;
- case CMP_LE_U:
- word = rv_bltu(rb, ra, SKIP_JAL);
- break;
- default:
- rv_panic(rv_of(t), "unsupported cmp_branch");
- }
- rv64_emit32(mc, word);
- rv_jump(t, l);
- }
-}
-
-static void rv_indirect_branch(NativeTarget* t, NativeLoc addr,
- const MCLabel* valid_targets, u32 ntargets) {
- (void)valid_targets;
- (void)ntargets;
- rv64_emit32(t->mc, rv_jalr(RV_ZERO, loc_reg(addr), 0));
-}
-
-static void rv_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) {
- /* `&&label` address-take: auipc/addi with a %pcrel_hi/%pcrel_lo relocation
- * pair against the label's per-block local symbol — the same form
- * rv_emit_global_addr uses for a global — so a compressing/re-encoding
- * assembler recomputes the displacement (a baked offset would break under
- * the C extension). */
- MCEmitter* mc = t->mc;
- u32 rd = loc_reg(dst);
- u32 sec = mc->section_id;
- ObjSymId sym = mc_label_symbol(mc, l);
- u32 ap = mc->pos(mc);
- rv64_emit32(mc, rv_auipc(rd, 0));
- mc->emit_reloc_at(mc, sec, ap, R_RV_PCREL_HI20, sym, 0, 0, 0);
- {
- Sym an = pool_intern_slice(t->c->global, SLICE_LIT(".LpcrelHi"));
- ObjSymId anchor = obj_symbol(t->obj, an, SB_LOCAL, SK_OBJ, sec, (u64)ap, 0);
- u32 lp = mc->pos(mc);
- rv64_emit32(mc, rv_addi(rd, rd, 0));
- mc->emit_reloc_at(mc, sec, lp, R_RV_PCREL_LO12_I, anchor, 0, 0, 0);
- }
-}
-
-/* ============================ frame / lifecycle ============================
- */
-
-static NativeFrameSlot rv_frame_slot(NativeTarget* t,
- const NativeFrameSlotDesc* d) {
- return native_frame_slot_alloc(&rv_of(t)->frame, d);
-}
-
-static int rv_frame_slot_debug_loc(NativeTarget* t, NativeFrameSlot slot,
- CGDebugLoc* out) {
- RvNativeTarget* a = rv_of(t);
- RvNativeSlot* s;
- if (!out) return 0;
- memset(out, 0, sizeof *out);
- if (slot == NATIVE_FRAME_SLOT_NONE || slot > a->frame.nslots) return 0;
- s = rv_slot_get(a, slot);
- out->kind = CG_DEBUG_LOC_FRAME;
- /* rv64 slots are addressed s0/fp-relative (rv_s0_off_slot); the hosted dbg
- * snapshot seeds the frame base with s0, matching aa64's FP-relative
- * convention. */
- out->v.frame_ofs = rv_s0_off_slot(s);
- return 1;
-}
-
-static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
- a->func = fd;
- a->loc = fd->loc;
- /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
- * callee-save set, and known_frame/has_alloca/frame_final. */
- native_frame_reset(&a->frame);
- a->incoming_stack_size = 0;
- a->next_param_int = 0;
- a->next_param_fp = 0;
- a->next_param_stack = 0;
- a->has_sret = (abi && abi->has_sret) ? 1u : 0u;
- a->is_variadic = (abi && abi->variadic) ? 1u : 0u;
- a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
- a->npatches = 0;
- a->nalloca = 0;
- a->minimal_prologue_words = 0;
- a->slim_prologue = 0;
-
- mc->set_section(mc, fd->text_section_id);
- mc->emit_align(mc, 4, 0);
- a->func_start = mc->pos(mc);
- mc_begin_function(mc, fd->sym, fd->text_section_id, a->func_start);
- if (mc->cfi_startproc) mc->cfi_startproc(mc);
- a->epilogue_label = mc->label_new(mc);
-}
-
-/* sret: reserve a hidden slot for the incoming destination pointer (a0). */
-static void rv_reserve_entry_saves(RvNativeTarget* a) {
- NativeTarget* t = &a->base;
- if (a->has_sret) {
- NativeFrameSlotDesc sd;
- memset(&sd, 0, sizeof sd);
- sd.type = builtin_id(KIT_CG_BUILTIN_I64);
- sd.size = 8;
- sd.align = 8;
- sd.kind = NATIVE_FRAME_SLOT_SAVE;
- a->sret_ptr_slot = t->frame_slot(t, &sd);
- a->next_param_int = 1; /* a0 consumed by the sret pointer */
- }
-}
-
-static void rv_emit_entry_save_stores(RvNativeTarget* a) {
- NativeTarget* t = &a->base;
- if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
- KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = a->sret_ptr_slot;
- addr.base_type = i64t;
- rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_A0), addr,
- native_mem_for_type(t, i64t, 8));
- }
-}
-
-/* Collect the callee-saves the body used (none at -O0). */
-static u32 rv_collect_int_saves(RvNativeTarget* a, u32* regs) {
- u32 n = 0, i;
- for (i = 0; i < a->frame.ncallee_saves; ++i)
- if (a->frame.callee_saves[i].cls == NATIVE_REG_INT)
- regs[n++] = a->frame.callee_saves[i].reg;
- return n;
-}
-static u32 rv_collect_fp_saves(RvNativeTarget* a, u32* regs) {
- u32 n = 0, i;
- for (i = 0; i < a->frame.ncallee_saves; ++i)
- if (a->frame.callee_saves[i].cls == NATIVE_REG_FP)
- regs[n++] = a->frame.callee_saves[i].reg;
- return n;
-}
-
-/* s0-relative offset of the i-th saved register (saves stack below locals). */
-static i32 rv_save_off(RvNativeTarget* a, u32 idx) {
- return -(i32)(a->frame.cum_off) - 8 - 8 * (i32)idx;
-}
-
-static void rv_load_s0(MCEmitter* mc, int fp, u32 reg, i32 off) {
- if (fits_i12(off)) {
- rv64_emit32(mc, fp ? rv_fld(reg, RV_S0, off) : rv_ld(reg, RV_S0, off));
- return;
- }
- rv_emit_load_imm(mc, 1, RV_TMP0, (i64)off);
- rv64_emit32(mc, rv_add(RV_TMP0, RV_S0, RV_TMP0));
- rv64_emit32(mc, fp ? rv_fld(reg, RV_TMP0, 0) : rv_ld(reg, RV_TMP0, 0));
-}
-
-/* Build the prologue instruction sequence into words[]. Returns count. */
-static u32 rv_build_prologue(RvNativeTarget* a, u32* words, u32 cap,
- u32 frame_size, u32 fp_pair_off,
- const u32* int_regs, u32 n_int, const u32* fp_regs,
- u32 n_fp) {
- u32 wi = 0;
-#define PUSH(w) \
- do { \
- if (wi >= cap) rv_panic(a, "prologue placeholder overflow"); \
- words[wi++] = (w); \
- } while (0)
- /* sp -= frame_size */
- if (fits_i12(-(i32)frame_size)) {
- PUSH(rv_addi(RV_SP, RV_SP, -(i32)frame_size));
- } else {
- i32 neg = -(i32)frame_size;
- i32 hi = (i32)(((i64)neg + 0x800) >> 12);
- i32 lo = neg - (i32)((u32)hi << 12);
- PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
- if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo));
- PUSH(rv_add(RV_SP, RV_SP, RV_TMP0));
- }
- /* save s0/ra at [sp + fp_pair_off], set s0 = sp + fp_pair_off */
- if (fits_i12((i32)fp_pair_off + 8)) {
- PUSH(rv_sd(RV_S0, RV_SP, (i32)fp_pair_off));
- PUSH(rv_sd(RV_RA, RV_SP, (i32)fp_pair_off + 8));
- PUSH(rv_addi(RV_S0, RV_SP, (i32)fp_pair_off));
- } else {
- i32 off = (i32)fp_pair_off;
- i32 hi = (i32)(((i64)off + 0x800) >> 12);
- i32 lo = off - (i32)((u32)hi << 12);
- PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
- if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo));
- PUSH(rv_add(RV_TMP0, RV_SP, RV_TMP0));
- PUSH(rv_sd(RV_S0, RV_TMP0, 0));
- PUSH(rv_sd(RV_RA, RV_TMP0, 8));
- PUSH(rv_addi(RV_S0, RV_TMP0, 0));
- }
- /* sret a0 spill */
- if (a->has_sret && a->sret_ptr_slot != NATIVE_FRAME_SLOT_NONE) {
- RvNativeSlot* s = rv_slot_get(a, a->sret_ptr_slot);
- PUSH(rv_sd(RV_A0, RV_S0, rv_s0_off_slot(s)));
- }
- /* variadic GP save area: spill unconsumed a-regs at [s0 + 16 + i*8] */
- if (a->is_variadic) {
- u32 i;
- for (i = a->next_param_int; i < 8u; ++i)
- PUSH(rv_sd(RV_A0 + i, RV_S0, 16 + (i32)i * 8));
- }
- /* callee saves */
- {
- u32 i;
- for (i = 0; i < n_int; ++i) {
- i32 off = rv_save_off(a, i);
- if (fits_i12(off)) {
- PUSH(rv_sd(int_regs[i], RV_S0, off));
- } else {
- /* rare; emitted directly is fine in the known-frame path, but the
- * single-pass placeholder must hold these too. Use the wide form. */
- i32 hi = (i32)(((i64)off + 0x800) >> 12);
- i32 lo = off - (i32)((u32)hi << 12);
- PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
- if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo));
- PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0));
- PUSH(rv_sd(int_regs[i], RV_TMP0, 0));
- }
- }
- for (i = 0; i < n_fp; ++i) {
- i32 off = rv_save_off(a, n_int + i);
- if (fits_i12(off)) {
- PUSH(rv_fsd(fp_regs[i], RV_S0, off));
- } else {
- i32 hi = (i32)(((i64)off + 0x800) >> 12);
- i32 lo = off - (i32)((u32)hi << 12);
- PUSH(rv_lui(RV_TMP0, (u32)hi & 0xfffffu));
- if (lo) PUSH(rv_addiw(RV_TMP0, RV_TMP0, lo));
- PUSH(rv_add(RV_TMP0, RV_S0, RV_TMP0));
- PUSH(rv_fsd(fp_regs[i], RV_TMP0, 0));
- }
- }
- }
-#undef PUSH
- return wi;
-}
-
-static void rv_func_begin(NativeTarget* t, const CGFuncDesc* fd) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 i;
- rv_func_begin_common(t, fd);
- a->prologue_pos = mc->pos(mc);
- for (i = 0; i < RV_PROLOGUE_WORDS; ++i) rv64_emit32(mc, RV_NOP);
- rv_reserve_entry_saves(a);
- rv_emit_entry_save_stores(a);
-}
-
-static void rv_func_end(NativeTarget* t) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- ObjBuilder* obj = t->obj;
- ObjSecId sec = a->func->text_section_id;
- u32 int_regs[16], fp_regs[16];
- u32 n_int = rv_collect_int_saves(a, int_regs);
- u32 n_fp = rv_collect_fp_saves(a, fp_regs);
- u32 frame_size = rv_frame_size(a);
- u32 fp_pair_off = rv_fp_pair_off(a, frame_size);
- u32 end;
- i32 i;
- a->frame_size_final = frame_size;
- a->fp_pair_off = fp_pair_off;
-
- /* epilogue */
- mc->label_place(mc, a->epilogue_label);
- if (a->slim_prologue) {
- /* Frameless leaf: no callee-saves, no s0/ra to reload, sp untouched. */
- rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
- } else {
- for (i = (i32)n_int - 1; i >= 0; --i)
- rv_load_s0(mc, 0, int_regs[i], rv_save_off(a, (u32)i));
- for (i = (i32)n_fp - 1; i >= 0; --i)
- rv_load_s0(mc, 1, fp_regs[i], rv_save_off(a, n_int + (u32)i));
- if (a->frame.has_alloca)
- rv_emit_addr_adjust(mc, RV_SP, RV_S0, -(i32)fp_pair_off);
- rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
- rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));
- /* sp += frame_size */
- if (fits_i12((i32)frame_size)) {
- rv64_emit32(mc, rv_addi(RV_SP, RV_SP, (i32)frame_size));
- } else {
- rv_emit_load_imm(mc, 1, RV_TMP0, (i64)frame_size);
- rv64_emit32(mc, rv_add(RV_SP, RV_SP, RV_TMP0));
- }
- rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
- }
-
- /* patch prologue */
- if (!a->frame.known_frame) {
- u32 words[RV_PROLOGUE_WORDS];
- u32 nwords, k;
- for (k = 0; k < RV_PROLOGUE_WORDS; ++k) words[k] = RV_NOP;
- nwords = rv_build_prologue(a, words, RV_PROLOGUE_WORDS, frame_size,
- fp_pair_off, int_regs, n_int, fp_regs, n_fp);
- (void)nwords;
- for (k = 0; k < RV_PROLOGUE_WORDS; ++k)
- rv_patch32(obj, sec, a->prologue_pos + k * 4u, words[k]);
- }
- /* patch alloca sites: addi dst, sp, max_outgoing */
- {
- u32 mo = align_up_u32(a->frame.max_outgoing, 16u);
- u32 k;
- if (mo > 2047u) rv_panic(a, "max_outgoing too large for alloca patch");
- for (k = 0; k < a->npatches; ++k)
- rv_patch32(obj, sec, a->patches[k].pos,
- rv_addi(a->patches[k].dst_reg, RV_SP, (i32)mo));
- }
-
- /* CFI: CFA = s0 + (frame_size - fp_pair_off) */
- if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
- if (a->slim_prologue) {
- /* Frameless leaf: CFA = sp (unchanged from entry) and the return address
- * stays live in ra (the CIE default), so no saved-register rules. The
- * state holds from the first instruction (offset 0). */
- mc->cfi_set_next_pc_offset(mc, 0);
- mc->cfi_def_cfa(mc, RV_SP, 0);
- } else {
- i32 cfa = (i32)frame_size - (i32)fp_pair_off;
- u32 post = a->prologue_pos + (a->frame.known_frame
- ? a->minimal_prologue_words * 4u
- : RV_PROLOGUE_WORDS * 4u);
- u32 k;
- mc->cfi_set_next_pc_offset(mc, post - a->func_start);
- mc->cfi_def_cfa(mc, RV_S0, cfa);
- mc->cfi_offset(mc, RV_S0, -cfa);
- mc->cfi_offset(mc, RV_RA, -cfa + 8);
- for (k = 0; k < n_int; ++k)
- mc->cfi_offset(mc, int_regs[k], rv_save_off(a, k) - cfa);
- for (k = 0; k < n_fp; ++k)
- mc->cfi_offset(mc, 32u + fp_regs[k], rv_save_off(a, n_int + k) - cfa);
- }
- }
-
- end = mc->pos(mc);
- obj_symbol_define(obj, a->func->sym, sec, (u64)a->func_start,
- (u64)(end - a->func_start));
- if (a->func->atomize)
- obj_atom_define(obj, sec, a->func_start, end - a->func_start, a->func->sym,
- 0);
- if (mc->debug) debug_func_pc_range(mc->debug, sec, a->func_start, end);
- if (mc->cfi_endproc) mc->cfi_endproc(mc);
- mc_end_function(mc);
- a->func = NULL;
-}
-
-/* rv64 homes its callee-saves below the locals at rv_save_off(idx) rather than
- * in frame slots, so alloc_slots=0: native_frame just records the {reg,cls} set
- * derived from the optimizer's per-class used-masks. */
-static void rv_reserve_callee_saves(NativeTarget* t, const u32* used,
- u32 nclasses) {
- native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0);
-}
-
-static int rv_reg_is_callee_int(Reg r);
-static int rv_reg_is_callee_fp(Reg r);
-static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
- u32 nclob, u32* int_mask, u32* fp_mask);
-
-/* Expand the arch-neutral clobber-ABI sets (KitCgAsmClobberAbiSet bits) into
- * this target's per-class caller/callee-saved register masks. */
-/* abi_clobber_masks is shared as native_asm_abi_clobber_masks
- * (cg/native_asm.h); it reads the masks from t->regs->classes. */
-
-/* Build the callee-saved set the prologue must preserve: the allocator-assigned
- * callee-saved registers (frame->callee_saved_used) plus any an inline-asm
- * block clobbers. The latter are opaque to the optimizer's operand scan, so it
- * forwards the raw clobber names (frame->asm_clobbers) and the arch-neutral
- * clobber-ABI sets (frame->asm_clobber_abi_sets); we resolve both into masks
- * and keep only the callee-saved ones — rv_reg_is_callee_int excludes s0 (the
- * frame pointer, preserved by the prologue head, not as an ordinary
- * callee-save). This is the same register selection the per-block spill used,
- * hoisted into the prologue. Writes up to `cap` per-class masks into `out` and
- * returns the class count to reserve. */
-static u32 rv_known_callee_saves(NativeTarget* t,
- const NativeKnownFrameDesc* frame, u32* out,
- u32 cap) {
- u32 ncls = frame->ncallee_classes;
- u32 clob_int = 0, clob_fp = 0, abi_int, abi_fp;
- if (ncls > cap) ncls = cap;
- for (u32 c = 0; c < ncls; ++c)
- out[c] = frame->callee_saved_used ? frame->callee_saved_used[c] : 0u;
- if (frame->asm_clobbers && frame->nasm_clobbers) {
- RvNativeTarget* a = rv_of(t);
- SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
- rv_asm_clobber_masks(t->c, loc, frame->asm_clobbers, frame->nasm_clobbers,
- &clob_int, &clob_fp);
- }
- native_asm_abi_clobber_masks(t, frame->asm_clobber_abi_sets, &abi_int,
- &abi_fp);
- clob_int |= abi_int;
- clob_fp |= abi_fp;
- for (Reg r = 0; r < 32u; ++r) {
- if (NATIVE_REG_INT < ncls && (clob_int & (1u << r)) &&
- rv_reg_is_callee_int(r))
- out[NATIVE_REG_INT] |= 1u << r;
- if (NATIVE_REG_FP < ncls && (clob_fp & (1u << r)) && rv_reg_is_callee_fp(r))
- out[NATIVE_REG_FP] |= 1u << r;
- }
- return ncls;
-}
-
-static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
- int* variadic, u32* nparams);
-
-/* Optimizer entry point: the full frame is supplied up front, so the prologue
- * is emitted final the moment it is built — no NOP region, no func_end patch
- * (rv_func_end skips patching when known_frame). rv_build_prologue emits the
- * sret spill and the variadic register-save stores inline, so there is no
- * separate entry-save emission. Slot creation order matches the single-pass
- * path: callee-saves first (only recorded for rv64), then static slots, then
- * the sret entry-save slot. */
-static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
- const NativeKnownFrameDesc* frame,
- NativeFrameSlot* out_slots) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
- u32 n_int, n_fp, frame_size, fp_pair_off, nwords, i;
- u32 words[RV_KNOWN_PROLOGUE_WORDS];
- rv_func_begin_common(t, fd);
- a->frame.known_frame = 1;
- if (frame) {
- u32 cs[NATIVE_CALL_PLAN_CLASSES];
- u32 ncs = rv_known_callee_saves(t, frame, cs, NATIVE_CALL_PLAN_CLASSES);
- a->frame.has_alloca = frame->has_alloca;
- if (ncs) rv_reserve_callee_saves(t, cs, ncs);
- for (i = 0; i < frame->nslots; ++i) {
- NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]);
- if (out_slots) out_slots[i] = slot;
- }
- rv_reserve_entry_saves(a);
- native_frame_note_outgoing(&a->frame, frame->max_outgoing);
- }
- /* Frame is final: size and offsets are settled, so emit the exact prologue.
- */
- frame_size = rv_frame_size(a);
- fp_pair_off = rv_fp_pair_off(a, frame_size);
- a->frame_size_final = frame_size;
- a->fp_pair_off = fp_pair_off;
- a->prologue_pos = mc->pos(mc);
- /* Leaf no-frame tier (aa64 slim_prologue equivalent): a leaf with no
- * callee-saves, no body slots, no outgoing args, no sret/variadic and
- * register-only params never reads s0 (no frame slots / stack args) nor
- * clobbers ra (no calls). Emit no prologue at all; rv_func_end emits a bare
- * `ret`. cum_off==0 already implies no sret slot and no param spills, but the
- * extra guards keep the intent explicit. Inline asm is excluded: it can
- * clobber ra opaquely, and without the saved record the bare `ret` would
- * return through the destroyed link register. */
- a->slim_prologue = frame && frame->is_leaf && !frame->has_asm &&
- a->frame.ncallee_saves == 0 && !a->frame.has_alloca &&
- a->frame.cum_off == 0 && a->frame.max_outgoing == 0 &&
- !a->has_sret && !a->is_variadic &&
- rv_signature_stack_bytes(t, fd->fn_type, NULL, NULL) == 0;
- if (a->slim_prologue) {
- a->minimal_prologue_words = 0;
- native_frame_set_final(&a->frame);
- return;
- }
- n_int = rv_collect_int_saves(a, int_regs);
- n_fp = rv_collect_fp_saves(a, fp_regs);
- nwords = rv_build_prologue(a, words, RV_KNOWN_PROLOGUE_WORDS, frame_size,
- fp_pair_off, int_regs, n_int, fp_regs, n_fp);
- for (i = 0; i < nwords; ++i) rv64_emit32(mc, words[i]);
- a->minimal_prologue_words = nwords;
- native_frame_set_final(&a->frame);
-}
-
-/* ============================ params / ABI helpers
- * ============================ */
-
-static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi,
- const NativeCallDesc* desc, u32 i,
- ABIArgInfo* scratch) {
- /* Synthesized for unnamed (variadic) args, or untyped calls. RISC-V LP64D
- * passes variadic FP args in INTEGER registers (as their bit pattern), not
- * the FP pool — so a variadic float part is ABI_CLASS_INT. */
- int variadic = abi && i >= abi->nparams;
- if (abi && i < abi->nparams) return &abi->params[i];
- memset(scratch, 0, sizeof *scratch);
- scratch->kind = ABI_ARG_DIRECT;
- scratch->nparts = 1;
- scratch->parts = arena_zarray(t->c->tu, ABIArgPart, 1);
- ((ABIArgPart*)scratch->parts)[0].cls =
- (!variadic && cg_type_is_float(t->c, desc->args[i].type)) ? ABI_CLASS_FP
- : ABI_CLASS_INT;
- ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG;
- ((ABIArgPart*)scratch->parts)[0].size =
- native_type_size(t, desc->args[i].type);
- ((ABIArgPart*)scratch->parts)[0].align =
- native_type_align(t, desc->args[i].type);
- return scratch;
-}
-
-static u32 rv_part_stack_size(const ABIArgPart* part) {
- return align_up_u32(part->size ? part->size : 8u, 8u);
-}
-static u32 rv_part_stack_align(const ABIArgPart* part) {
- u32 al = part->align ? part->align : 8u;
- if (al < 8u) al = 8u;
- if (al > 16u) al = 16u;
- return al;
-}
-
-static KitCgTypeId rv_part_scalar_type(const ABIArgPart* part) {
- if (part->cls == ABI_CLASS_FP) {
- if (part->size <= 4u) return builtin_id(KIT_CG_BUILTIN_F32);
- return builtin_id(KIT_CG_BUILTIN_F64);
- }
- switch (part->size) {
- case 1u:
- return builtin_id(KIT_CG_BUILTIN_I8);
- case 2u:
- return builtin_id(KIT_CG_BUILTIN_I16);
- case 4u:
- return builtin_id(KIT_CG_BUILTIN_I32);
- default:
- return builtin_id(KIT_CG_BUILTIN_I64);
- }
-}
-
-static u32 rv_class_stack_size(const ABIArgInfo* ai) {
- u32 total = 0, p;
- if (!ai || ai->kind == ABI_ARG_IGNORE) return 0;
- if (ai->kind == ABI_ARG_INDIRECT) return 8u;
- for (p = 0; p < ai->nparts; ++p) {
- total = align_up_u32(total, rv_part_stack_align(&ai->parts[p]));
- total += rv_part_stack_size(&ai->parts[p]);
- }
- return align_up_u32(total ? total : 8u, 8u);
-}
-
-static u32 rv_call_stack_size(NativeTarget* t, const NativeCallDesc* desc) {
- const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
- /* sret consumes a0 as the implicit first integer argument. */
- u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
- u32 next_fp = 0, stack = 0, i, p;
- for (i = 0; i < desc->nargs; ++i) {
- ABIArgInfo tmp;
- const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp);
- int force_stack =
- abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
- if (ai->kind == ABI_ARG_IGNORE) continue;
- if (force_stack) {
- stack += rv_class_stack_size(ai);
- continue;
- }
- if (ai->kind == ABI_ARG_INDIRECT) {
- if (next_int < 8u)
- next_int++;
- else
- stack += 8u;
- continue;
- }
- for (p = 0; p < ai->nparts; ++p) {
- const ABIArgPart* part = &ai->parts[p];
- if (part->cls == ABI_CLASS_FP) {
- if (next_fp < 8u)
- next_fp++;
- else {
- stack = align_up_u32(stack, rv_part_stack_align(part));
- stack += rv_part_stack_size(part);
- }
- } else {
- if (next_int < 8u)
- next_int++;
- else {
- stack = align_up_u32(stack, rv_part_stack_align(part));
- stack += rv_part_stack_size(part);
- }
- }
- }
- }
- return align_up_u32(stack, 16u);
-}
-
-static u32 rv_signature_stack_bytes(NativeTarget* t, KitCgTypeId fn_type,
- int* variadic, u32* nparams) {
- const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fn_type);
- NativeCallDesc d;
- if (variadic) *variadic = abi ? (int)abi->variadic : 0;
- if (nparams) *nparams = abi ? abi->nparams : 0u;
- memset(&d, 0, sizeof d);
- d.fn_type = fn_type;
- d.nargs = abi ? abi->nparams : 0u;
- if (d.nargs) d.args = arena_zarray(t->c->tu, NativeLoc, d.nargs);
- return rv_call_stack_size(t, &d);
-}
-
-static u32 rv_call_stack_bytes(NativeTarget* t, const NativeCallDesc* desc) {
- return rv_call_stack_size(t, desc);
-}
-
-/* Resolve a NativeLoc to an addressable NativeAddr (frame/stack/addr). */
-static NativeAddr rv_loc_addr(RvNativeTarget* a, NativeLoc loc, u32 offset) {
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- switch ((NativeLocKind)loc.kind) {
- case NATIVE_LOC_FRAME:
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = loc.v.frame;
- addr.base_type = loc.type;
- addr.offset = (i32)offset;
- return addr;
- case NATIVE_LOC_STACK:
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = loc.v.stack.slot;
- addr.base_type = loc.type;
- addr.offset = loc.v.stack.offset + (i32)offset;
- return addr;
- case NATIVE_LOC_ADDR:
- addr = loc.v.addr;
- addr.offset += (i32)offset;
- return addr;
- default:
- rv_panic(a, "location is not addressable");
- }
- return addr;
-}
-
-static void rv_load_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
- u32 offset, u32 size) {
- RvNativeTarget* a = rv_of(t);
- if (src.kind == NATIVE_LOC_REG) {
- rv_move(t, dst, src);
- return;
- }
- if (src.kind == NATIVE_LOC_FRAME || src.kind == NATIVE_LOC_STACK ||
- src.kind == NATIVE_LOC_ADDR) {
- NativeAddr addr = rv_loc_addr(a, src, offset);
- addr.base_type = dst.type;
- rv_emit_mem(a, 1, dst, addr, native_mem_for_type(t, dst.type, size));
- return;
- }
- if (src.kind == NATIVE_LOC_IMM) {
- rv_emit_load_imm(t->mc, rv_is_64(t, dst.type) ? 1u : 0u, loc_reg(dst),
- src.v.imm);
- return;
- }
- rv_panic(a, "unsupported part source");
-}
-
-static void rv_store_part(NativeTarget* t, NativeLoc dst, NativeLoc src,
- u32 offset, u32 size) {
- RvNativeTarget* a = rv_of(t);
- if (dst.kind == NATIVE_LOC_FRAME || dst.kind == NATIVE_LOC_STACK ||
- dst.kind == NATIVE_LOC_ADDR) {
- NativeAddr addr = rv_loc_addr(a, dst, offset);
- addr.base_type = src.type;
- rv_emit_mem(a, 0, src, addr, native_mem_for_type(t, src.type, size));
- return;
- }
- if (dst.kind == NATIVE_LOC_REG) {
- rv_move(t, dst, src);
- return;
- }
- rv_panic(a, "unsupported part destination");
-}
-
-static void rv_addr_of_loc(NativeTarget* t, NativeLoc dst, NativeLoc src) {
- NativeAddr addr = rv_loc_addr(rv_of(t), src, 0);
- rv_load_addr(t, dst, addr);
-}
-
-static void rv_store_outgoing_part(NativeTarget* t, int tail_call,
- u32 stack_off, NativeLoc src, u32 size) {
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_REG;
- addr.base_type = src.type;
- if (tail_call) {
- /* A sibling call reuses the caller's frame: its outgoing stack args land in
- * the caller's incoming-arg window ([s0 + 16 + va_save + off]) — physically
- * the same address the tail-callee will read at [sp+off] once the teardown
- * has restored sp to the caller's entry sp (the CFA). */
- addr.base.reg = RV_S0;
- addr.offset = rv_s0_off_in_arg(rv_of(t), stack_off);
- } else {
- addr.base.reg = RV_SP;
- addr.offset = (i32)stack_off;
- }
- rv_emit_mem(rv_of(t), 0, src, addr, native_mem_for_type(t, src.type, size));
-}
-
-/* NativeTarget bind_param: route incoming param (ABI loc) into dst. */
-static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p,
- NativeLoc dst) {
- RvNativeTarget* a = rv_of(t);
- const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, a->func->fn_type);
- const ABIArgInfo* ai =
- p->index < abi->nparams ? &abi->params[p->index] : NULL;
- int to_reg = dst.kind == NATIVE_LOC_REG;
- u32 i;
- if (!ai || ai->kind == ABI_ARG_IGNORE) return;
- if (ai->kind == ABI_ARG_INDIRECT) {
- NativeLoc src = native_loc_reg(
- builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
- a->next_param_int < 8u ? RV_A0 + a->next_param_int : RV_TMP0);
- NativeAddr d_addr, from;
- AggregateAccess access;
- if (a->next_param_int < 8u) {
- a->next_param_int++;
- } else {
- NativeAddr sa;
- memset(&sa, 0, sizeof sa);
- sa.base_kind = NATIVE_ADDR_BASE_REG;
- sa.base.reg = RV_S0;
- sa.offset = rv_s0_off_in_arg(a, a->next_param_stack);
- sa.base_type = src.type;
- rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, src.type, 8));
- a->next_param_stack += 8u;
- }
- if (dst.kind != NATIVE_LOC_FRAME)
- rv_panic(a, "indirect parameter requires a frame destination");
- memset(&d_addr, 0, sizeof d_addr);
- d_addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- d_addr.base.frame = dst.v.frame;
- d_addr.base_type = p->type;
- memset(&from, 0, sizeof from);
- from.base_kind = NATIVE_ADDR_BASE_REG;
- from.base.reg = loc_reg(src);
- from.base_type = p->type;
- memset(&access, 0, sizeof access);
- access.type = p->type;
- access.size = p->size ? p->size : (u32)cg_type_size(t->c, p->type);
- access.align = p->align ? p->align : native_type_align(t, p->type);
- rv_copy_bytes(t, d_addr, from, access);
- return;
- }
- for (i = 0; i < ai->nparts; ++i) {
- const ABIArgPart* part = &ai->parts[i];
- NativeAllocClass cls =
- part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
- NativeLoc src;
- if (cls == NATIVE_REG_FP && a->next_param_fp < 8u) {
- src = native_loc_reg(p->type, cls, RV_FA0 + a->next_param_fp++);
- } else if (cls == NATIVE_REG_INT && a->next_param_int < 8u) {
- src = native_loc_reg(p->type, cls, RV_A0 + a->next_param_int++);
- } else {
- Reg tmp = (cls == NATIVE_REG_FP) ? RV_FTMP0 : RV_TMP0;
- NativeAddr sa;
- src = native_loc_reg(p->type, cls, tmp);
- a->next_param_stack =
- align_up_u32(a->next_param_stack, rv_part_stack_align(part));
- memset(&sa, 0, sizeof sa);
- sa.base_kind = NATIVE_ADDR_BASE_REG;
- sa.base.reg = RV_S0;
- sa.base_type = p->type;
- sa.offset = rv_s0_off_in_arg(a, a->next_param_stack);
- rv_emit_mem(a, 1, src, sa, native_mem_for_type(t, p->type, part->size));
- a->next_param_stack += rv_part_stack_size(part);
- }
- if (dst.kind == NATIVE_LOC_NONE) {
- /* unused parameter; cursors already advanced */
- } else if (to_reg) {
- NativeLoc d = native_loc_reg(dst.type ? dst.type : p->type,
- (NativeAllocClass)dst.cls, (Reg)dst.v.reg);
- if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) &&
- (NativeAllocClass)src.cls == (NativeAllocClass)d.cls))
- rv_move(t, d, src);
- } else {
- rv_store_part(
- t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src,
- 0, part->size);
- }
- }
- a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u);
-}
-
-/* ============================ calls / returns ============================ */
-
-typedef NativeArgMove RvArgMove;
-
-static void rv_emit_one_arg_move(NativeTarget* t, const NativeArgMove* m) {
- if (m->is_addr)
- rv_addr_of_loc(t, m->dst, m->src);
- else
- rv_load_part(t, m->dst, m->src, m->src_offset, m->size);
-}
-
-/* Parallel-copy register arg moves via the shared scheduler; cycles break
- * through the int/fp emit scratch (t1 / ft1). */
-static void rv_emit_reg_arg_moves(NativeTarget* t, NativeArgMove* moves,
- u32 n) {
- NativeArgShuffle s;
- if (n > RV_MAX_REG_ARG_MOVES) rv_panic(rv_of(t), "too many register args");
- memset(&s, 0, sizeof s);
- s.t = t;
- s.emit_one = rv_emit_one_arg_move;
- s.reg_move = rv_move;
- s.scratch[NATIVE_REG_INT] = RV_TMP1;
- s.scratch[NATIVE_REG_FP] = RV_FTMP1;
- native_arg_shuffle(&s, moves, n);
-}
-
-static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
- NativeCallPlan* plan) {
- RvNativeTarget* a = rv_of(t);
- const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, desc->fn_type);
- NativeCallPlanRet* rets;
- KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
- memset(plan, 0, sizeof *plan);
- rets = desc->nresults ? arena_zarray(t->c->tu, NativeCallPlanRet, 4) : NULL;
- plan->callee = desc->callee;
- plan->rets = rets;
- plan->flags = desc->flags;
- plan->has_sret = abi && abi->has_sret;
- plan->is_variadic = abi && abi->variadic;
- plan->stack_arg_size = rv_call_stack_size(t, desc);
- if (plan->stack_arg_size > a->frame.max_outgoing)
- a->frame.max_outgoing = plan->stack_arg_size;
- /* Indirect callee in an arg register would be clobbered by arg loads. */
- if (plan->callee.kind == NATIVE_LOC_REG &&
- (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
- plan->callee.v.reg >= RV_A0 && plan->callee.v.reg <= RV_A7) {
- NativeLoc scratch =
- native_loc_reg(plan->callee.type, NATIVE_REG_INT, RV_TMP0);
- rv_move(t, scratch, plan->callee);
- plan->callee = scratch;
- }
- {
- /* sret returns pass the hidden destination pointer as the implicit first
- * integer argument (a0), so the real args start at a1. */
- u32 next_int = (abi && abi->has_sret) ? 1u : 0u;
- u32 next_fp = 0, stack = 0, nmoves = 0, i, p;
- int tail = (desc->flags & CG_CALL_TAIL) != 0;
- RvArgMove moves[RV_MAX_REG_ARG_MOVES];
- for (i = 0; i < desc->nargs; ++i) {
- ABIArgInfo tmp;
- const ABIArgInfo* ai = rv_param_abi(t, abi, desc, i, &tmp);
- int force_stack =
- abi && abi->variadic && abi->vararg_on_stack && i >= abi->nparams;
- if (ai->kind == ABI_ARG_IGNORE) continue;
- if (force_stack) {
- NativeLoc tmpreg =
- native_loc_reg(desc->args[i].type, NATIVE_REG_INT, RV_TMP0);
- u32 n = rv_class_stack_size(ai), off = 0;
- while (off < n) {
- rv_load_part(t, tmpreg, desc->args[i], off, 8);
- rv_store_outgoing_part(t, tail, stack + off, tmpreg, 8);
- off += 8;
- }
- stack += n;
- continue;
- }
- if (ai->kind == ABI_ARG_INDIRECT) {
- if (next_int < 8u) {
- RvArgMove* m = &moves[nmoves++];
- m->dst = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0 + next_int++);
- m->src = desc->args[i];
- m->src_offset = 0;
- m->size = 8;
- m->is_addr = 1;
- } else {
- NativeLoc ptr = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP0);
- rv_addr_of_loc(t, ptr, desc->args[i]);
- rv_store_outgoing_part(t, tail, stack, ptr, 8);
- stack += 8u;
- }
- continue;
- }
- for (p = 0; p < ai->nparts; ++p) {
- const ABIArgPart* part = &ai->parts[p];
- NativeAllocClass cls =
- part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
- if ((cls == NATIVE_REG_FP && next_fp < 8u) ||
- (cls == NATIVE_REG_INT && next_int < 8u)) {
- RvArgMove* m = &moves[nmoves++];
- Reg areg =
- cls == NATIVE_REG_FP ? RV_FA0 + next_fp++ : RV_A0 + next_int++;
- m->dst = native_loc_reg(desc->args[i].type, cls, areg);
- m->src = desc->args[i];
- m->src_offset = part->src_offset;
- m->size = part->size;
- m->is_addr = 0;
- } else {
- Reg tmp = cls == NATIVE_REG_FP ? RV_FTMP0 : RV_TMP0;
- NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, tmp);
- rv_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size);
- stack = align_up_u32(stack, rv_part_stack_align(part));
- rv_store_outgoing_part(t, tail, stack, tmpreg, part->size);
- stack += rv_part_stack_size(part);
- }
- }
- }
- rv_emit_reg_arg_moves(t, moves, nmoves);
- if (abi && abi->has_sret && desc->nresults) {
- /* sret pointer goes in a0; arg loads have completed. A tail call forwards
- * the caller's own incoming sret pointer (spilled at entry) so the
- * sibling writes the result into the caller's caller's destination;
- * otherwise pass the address of this call's result slot. */
- NativeLoc a0 = native_loc_reg(i64t, NATIVE_REG_INT, RV_A0);
- if (tail)
- rv_load_part(t, a0, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0, 8);
- else
- rv_addr_of_loc(t, a0, desc->results[0]);
- }
- }
- if (abi && abi->ret.kind == ABI_ARG_DIRECT && desc->nresults) {
- u32 nr = 0, ni = 0, nf = 0, p;
- for (p = 0; p < abi->ret.nparts; ++p) {
- const ABIArgPart* part = &abi->ret.parts[p];
- NativeAllocClass cls =
- part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
- KitCgTypeId pty = rv_part_scalar_type(part);
- Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++;
- rets[nr].src = native_loc_reg(pty, cls, rreg);
- rets[nr].dst = desc->results[0];
- if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
- rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame,
- (i32)part->src_offset);
- else if (rets[nr].dst.kind == NATIVE_LOC_STACK) {
- rets[nr].dst.v.stack.offset += (i32)part->src_offset;
- rets[nr].dst.type = pty;
- }
- rets[nr].mem = native_mem_for_type(t, pty, part->size);
- nr++;
- }
- plan->nrets = nr;
- } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) {
- plan->nrets = 0;
- } else if (!abi && desc->nresults) {
- rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, RV_A0);
- rets[0].dst = desc->results[0];
- rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0);
- plan->nrets = 1;
- }
-}
-
-/* Emit a sibling (tail) call: tear the frame down to the caller's entry state
- * and jump (no link) to the callee. Outgoing args are already in the arg regs /
- * the caller's incoming-arg window. At -O0 there are no callee-saves, and the
- * sp restore uses the CFA offset (s0 + 16 + va_save), which is independent of
- * the not-yet-final frame_size — so no func_end patching is needed. */
-static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- i32 cfa = (i32)(RV_FRAME_SAVE_SIZE + rv_va_save_sz(a));
- int indirect = callee.kind == NATIVE_LOC_REG;
- u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
- u32 n_int = rv_collect_int_saves(a, int_regs);
- u32 n_fp = rv_collect_fp_saves(a, fp_regs);
- i32 i;
- /* Stage an indirect callee into a reserved scratch (t1) BEFORE the teardown:
- * regalloc parks the function pointer in a callee-saved register so it
- * survives arg marshalling, and the callee-save / s0 / ra restores below
- * would otherwise overwrite it. t1 is reserved (never allocable) and
- * untouched by the restore loop (which only uses t0 for far offsets). */
- if (indirect) rv64_emit32(mc, rv_addi(RV_TMP1, loc_reg(callee), 0));
- /* Restore callee-saves before tearing the frame down (O1 path; none at -O0).
- * Their save offsets are s0-relative via rv_save_off, so the restore is
- * frame-size- and teardown-order-independent. */
- for (i = (i32)n_int - 1; i >= 0; --i)
- rv_load_s0(mc, 0, int_regs[i], rv_save_off(a, (u32)i));
- for (i = (i32)n_fp - 1; i >= 0; --i)
- rv_load_s0(mc, 1, fp_regs[i], rv_save_off(a, n_int + (u32)i));
- rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
- rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa));
- rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));
- if (callee.kind == NATIVE_LOC_GLOBAL) {
- u32 pos = mc->pos(mc);
- rv64_emit32(mc, rv_auipc(RV_TMP0, 0));
- rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0));
- mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym,
- callee.v.global.addend, 0, 0);
- } else if (indirect) {
- rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP1, 0));
- } else {
- rv_panic(a, "unsupported tail call target");
- }
-}
-
-static void rv_emit_call(NativeTarget* t, const NativeCallPlan* plan) {
- MCEmitter* mc = t->mc;
- ObjSecId sec = mc->section_id;
- if (plan->flags & CG_CALL_TAIL) {
- rv_emit_tail_site(t, plan->callee);
- return;
- }
- if (plan->callee.kind == NATIVE_LOC_GLOBAL) {
- u32 pos = mc->pos(mc);
- rv64_emit32(mc, rv_auipc(RV_RA, 0));
- rv64_emit32(mc, rv_jalr(RV_RA, RV_RA, 0));
- mc->emit_reloc_at(mc, sec, pos, R_RV_CALL, plan->callee.v.global.sym,
- plan->callee.v.global.addend, 0, 0);
- return;
- }
- if (plan->callee.kind == NATIVE_LOC_REG) {
- rv64_emit32(mc, rv_jalr(RV_RA, loc_reg(plan->callee), 0));
- return;
- }
- rv_panic(rv_of(t), "unsupported call target");
-}
-
-static void rv_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
- const NativeLoc* values, u32 nvalues,
- NativeCallPlanRet** out_rets, u32* out_nrets) {
- RvNativeTarget* a = rv_of(t);
- const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
- NativeCallPlanRet* rets = NULL;
- u32 nr = 0;
- if (nvalues > 1u) rv_panic(a, "multiple returns unsupported");
- if (nvalues) rets = arena_zarray(t->c->tu, NativeCallPlanRet, 4);
- if (nvalues && abi && abi->ret.kind == ABI_ARG_INDIRECT) {
- KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
- NativeLoc dstp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
- NativeLoc saved = native_loc_stack(i64t, a->sret_ptr_slot, 0);
- NativeAddr dst_addr, src_addr;
- AggregateAccess access;
- rv_load_part(t, dstp, saved, 0, 8);
- memset(&dst_addr, 0, sizeof dst_addr);
- dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
- dst_addr.base.reg = RV_TMP1;
- dst_addr.base_type = values[0].type;
- src_addr = rv_loc_addr(a, values[0], 0);
- src_addr.base_type = values[0].type;
- memset(&access, 0, sizeof access);
- access.type = values[0].type;
- access.size = (u32)cg_type_size(t->c, values[0].type);
- access.align = native_type_align(t, values[0].type);
- rv_copy_bytes(t, dst_addr, src_addr, access);
- *out_rets = NULL;
- *out_nrets = 0;
- return;
- }
- if (nvalues && abi && abi->ret.kind == ABI_ARG_DIRECT) {
- u32 ni = 0, nf = 0, p;
- for (p = 0; p < abi->ret.nparts; ++p) {
- const ABIArgPart* part = &abi->ret.parts[p];
- NativeAllocClass cls =
- part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
- KitCgTypeId pty = rv_part_scalar_type(part);
- Reg rreg = cls == NATIVE_REG_FP ? RV_FA0 + nf++ : RV_A0 + ni++;
- rets[nr].src = values[0];
- if (rets[nr].src.kind == NATIVE_LOC_FRAME)
- rets[nr].src =
- native_loc_stack(pty, values[0].v.frame, (i32)part->src_offset);
- else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
- rets[nr].src.v.stack.offset += (i32)part->src_offset;
- rets[nr].src.type = pty;
- }
- rets[nr].dst = native_loc_reg(pty, cls, rreg);
- rets[nr].mem = native_mem_for_type(t, pty, part->size);
- nr++;
- }
- } else if (nvalues) {
- rets[0].src = values[0];
- rets[0].dst = native_loc_reg(values[0].type, NATIVE_REG_INT, RV_A0);
- rets[0].mem = native_mem_for_type(t, values[0].type, 0);
- nr = 1;
- }
- *out_rets = rets;
- *out_nrets = nr;
-}
-
-static void rv_ret(NativeTarget* t) {
- RvNativeTarget* a = rv_of(t);
- rv_jump(t, a->epilogue_label);
-}
-
-/* ============================ alloca ============================ */
-
-static void rv_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
- u32 align) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 rsz = loc_reg(size);
- u32 rd = loc_reg(dst);
- u32 al = align ? align : 16u;
- if (al < 16u) al = 16u;
- /* round up: t0 = (size + (al-1)) & ~(al-1) */
- rv64_emit32(mc, rv_addi(RV_TMP0, rsz, (i32)(al - 1u)));
- rv_emit_load_imm(mc, 1, RV_TMP1, -(i64)al);
- rv64_emit32(mc, rv_and(RV_TMP0, RV_TMP0, RV_TMP1));
- rv64_emit32(mc, rv_sub(RV_SP, RV_SP, RV_TMP0));
- a->frame.has_alloca = 1;
- /* dst = sp + max_outgoing (patched in func_end) */
- if (a->npatches == a->patches_cap) {
- u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
- RvPatch* nb = arena_zarray(t->c->tu, RvPatch, cap);
- if (a->patches) memcpy(nb, a->patches, sizeof(*nb) * a->npatches);
- a->patches = nb;
- a->patches_cap = cap;
- }
- a->patches[a->npatches].kind = RV_PATCH_ALLOCA;
- a->patches[a->npatches].pos = mc->pos(mc);
- a->patches[a->npatches].dst_reg = rd;
- a->npatches++;
- a->nalloca++;
- rv64_emit32(mc, RV_NOP); /* placeholder for addi dst, sp, max_outgoing */
-}
-
-/* ============================ TLS / bitfield / atomics
- * ============================ */
-
-static void rv_tls_addr_of(NativeTarget* t, NativeLoc dst, ObjSymId sym,
- i64 addend) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- u32 rd = loc_reg(dst);
- /* Local-Exec only, matching aa64 (aa_tls_addr_of) and x64 (x64_tls_addr_of):
- * kit links the whole module statically, so every _Thread_local symbol is
- * resolved within the image and TPREL is always valid. An Initial-Exec GOT
- * path (R_RV_TLS_GOT_HI20) used to be emitted for extern-via-GOT symbols
- * under -fPIE (the hosted default), but the linker has no layout/apply for
- * that reloc, so it produced a hard "unsupported reloc kind" link failure
- * rather than a working binary. */
- /* lui t0, %tprel_hi(sym); add t0, tp, t0; addi dst, t0, %tprel_lo(sym). */
- {
- u32 hp = mc->pos(mc);
- rv64_emit32(mc, rv_lui(RV_TMP0, 0));
- mc->emit_reloc_at(mc, sec, hp, R_RV_TPREL_HI20, sym, addend, 0, 0);
- rv64_emit32(mc, rv_add(RV_TMP0, RV_TP, RV_TMP0));
- {
- u32 lp = mc->pos(mc);
- rv64_emit32(mc, rv_addi(rd, RV_TMP0, 0));
- mc->emit_reloc_at(mc, sec, lp, R_RV_TPREL_LO12_I, sym, addend, 0, 0);
- }
- }
-}
-static void rv_bitfield_load(NativeTarget* t, NativeLoc dst, NativeAddr ra,
- BitFieldAccess bf) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
- u32 rd = loc_reg(dst);
- u32 base;
- i32 off;
- u32 lsb = bf.bit_offset;
- u32 width = bf.bit_width ? bf.bit_width : 1u;
- /* Shift left so the field's MSB lands at bit 63, then shift right to
- * sign/zero extend it down. Use 64-bit shifts throughout. */
- u32 sh_left = 64u - (lsb + width);
- u32 sh_right = 64u - width;
- ra.offset += (i32)bf.storage_offset;
- rv_resolve_mem_addr(a, &ra, &base, &off);
- rv64_emit32(mc, enc_int_load(storage_bytes, 0, rd, base, off));
- rv64_emit32(mc, rv_slli(rd, rd, sh_left));
- if (bf.signed_)
- rv64_emit32(mc, rv_srai(rd, rd, sh_right));
- else
- rv64_emit32(mc, rv_srli(rd, rd, sh_right));
-}
-static void rv_bitfield_store(NativeTarget* t, NativeAddr ra, NativeLoc src,
- BitFieldAccess bf) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
- u32 src_reg = loc_reg(src);
- u32 base;
- i32 off;
- u32 lsb = bf.bit_offset;
- u32 width = bf.bit_width ? bf.bit_width : 1u;
- u64 ones = width >= 64u ? ~(u64)0 : (((u64)1 << width) - 1u);
- u64 mask_in = ones << lsb;
- ra.offset += (i32)bf.storage_offset;
- /* Resolve the field address; rv_resolve_mem_addr may use RV_TMP0/RV_TMP1, so
- * stabilize the base into RV_TMP1 before consuming the scratch temps. */
- rv_resolve_mem_addr(a, &ra, &base, &off);
- if (base != RV_S0 && base != RV_TMP1) {
- rv_emit_addr_adjust(mc, RV_TMP1, base, off);
- base = RV_TMP1;
- off = 0;
- } else if (base == RV_TMP1 && off != 0) {
- rv_emit_addr_adjust(mc, RV_TMP1, RV_TMP1, off);
- off = 0;
- }
- /* word in RV_TMP2; merged via RV_TMP0 (clear mask, then shifted src). */
- rv64_emit32(mc, enc_int_load(storage_bytes, 0, RV_TMP2, base, off));
- rv_emit_load_imm(mc, 1, RV_TMP0, (i64)~mask_in);
- rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP0));
- rv_emit_load_imm(mc, 1, RV_TMP0, (i64)ones);
- rv64_emit32(mc, rv_and(RV_TMP0, src_reg, RV_TMP0));
- if (lsb) rv64_emit32(mc, rv_slli(RV_TMP0, RV_TMP0, lsb));
- rv64_emit32(mc, rv_or(RV_TMP2, RV_TMP2, RV_TMP0));
- rv64_emit32(mc, enc_int_store(storage_bytes, RV_TMP2, base, off));
-}
-static int rv_order_acquire(KitCgMemOrder o) {
- return o == KIT_CG_MO_CONSUME || o == KIT_CG_MO_ACQUIRE ||
- o == KIT_CG_MO_ACQ_REL || o == KIT_CG_MO_SEQ_CST;
-}
-static int rv_order_release(KitCgMemOrder o) {
- return o == KIT_CG_MO_RELEASE || o == KIT_CG_MO_ACQ_REL ||
- o == KIT_CG_MO_SEQ_CST;
-}
-
-/* Materialize the atomic operand address into RV_TMP0 (a bare pointer, since
- * LR/SC and AMO take a base register with no offset) and return it. */
-static u32 rv_atomic_addr_reg(RvNativeTarget* a, NativeAddr addr) {
- NativeLoc dst =
- native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, RV_TMP0);
- rv_load_addr(&a->base, dst, addr);
- return RV_TMP0;
-}
-
-static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr,
- MemAccess mem, KitCgMemOrder mo) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 sf =
- (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u;
- u32 base = rv_atomic_addr_reg(a, addr);
- if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw());
- if (rv_order_acquire(mo)) {
- /* lr.w/d as an ordered load (aq=1). */
- rv64_emit32(mc, sf ? rv_lr_d(loc_reg(dst), base, 1, 0)
- : rv_lr_w(loc_reg(dst), base, 1, 0));
- } else {
- rv64_emit32(
- mc, enc_int_load(mem.size ? mem.size : native_type_size(t, dst.type), 0,
- loc_reg(dst), base, 0));
- }
-}
-
-static void rv_atomic_store(NativeTarget* t, NativeAddr addr, NativeLoc src,
- MemAccess mem, KitCgMemOrder mo) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 sz = mem.size ? mem.size : native_type_size(t, src.type);
- /* RV_TMP0 holds the address; never collides with src (an allocable reg). */
- u32 base = rv_atomic_addr_reg(a, addr);
- if (rv_order_release(mo)) rv64_emit32(mc, rv_fence_rw_rw());
- rv64_emit32(mc, enc_int_store(sz, loc_reg(src), base, 0));
- if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw());
-}
-
-static void rv_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst,
- NativeAddr addr, NativeLoc val, MemAccess mem,
- KitCgMemOrder mo) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 sf =
- (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u;
- u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */
- u32 vreg = loc_reg(val);
- u32 rd = loc_reg(dst);
- u32 aq = (u32)rv_order_acquire(mo);
- u32 rl = (u32)rv_order_release(mo);
- MCLabel retry = mc->label_new(mc);
- /* LR/SC loop: dst = *base; new = dst op val; sc new; retry on failure.
- * RV_TMP1 carries the SC status, RV_TMP3 the computed new value. */
- mc->label_place(mc, retry);
- rv64_emit32(mc, sf ? rv_lr_d(rd, base, aq, 0) : rv_lr_w(rd, base, aq, 0));
- switch (op) {
- case KIT_CG_ATOMIC_XCHG:
- rv64_emit32(mc, rv_addi(RV_TMP3, vreg, 0));
- break;
- case KIT_CG_ATOMIC_ADD:
- rv64_emit32(mc,
- sf ? rv_add(RV_TMP3, rd, vreg) : rv_addw(RV_TMP3, rd, vreg));
- break;
- case KIT_CG_ATOMIC_SUB:
- rv64_emit32(mc,
- sf ? rv_sub(RV_TMP3, rd, vreg) : rv_subw(RV_TMP3, rd, vreg));
- break;
- case KIT_CG_ATOMIC_AND:
- rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg));
- break;
- case KIT_CG_ATOMIC_OR:
- rv64_emit32(mc, rv_or(RV_TMP3, rd, vreg));
- break;
- case KIT_CG_ATOMIC_XOR:
- rv64_emit32(mc, rv_xor(RV_TMP3, rd, vreg));
- break;
- case KIT_CG_ATOMIC_NAND:
- rv64_emit32(mc, rv_and(RV_TMP3, rd, vreg));
- rv64_emit32(mc, rv_xori(RV_TMP3, RV_TMP3, -1));
- break;
- default:
- rv_panic(a, "unsupported atomic rmw op");
- }
- rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, RV_TMP3, 0, rl)
- : rv_sc_w(RV_TMP1, base, RV_TMP3, 0, rl));
- rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0));
- mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0);
-}
-
-static void rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok,
- NativeAddr addr, NativeLoc expected,
- NativeLoc desired, MemAccess mem,
- KitCgMemOrder success, KitCgMemOrder failure) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- u32 sf =
- (mem.size ? mem.size : native_type_size(t, prior.type)) == 8u ? 1u : 0u;
- u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */
- u32 rprior = loc_reg(prior);
- u32 rexp = loc_reg(expected);
- u32 rdes = loc_reg(desired);
- u32 rok = loc_reg(ok);
- u32 aq = (u32)rv_order_acquire(success);
- u32 rl = (u32)rv_order_release(success);
- MCLabel retry = mc->label_new(mc);
- MCLabel fail = mc->label_new(mc);
- MCLabel done = mc->label_new(mc);
- (void)failure;
- mc->label_place(mc, retry);
- rv64_emit32(mc,
- sf ? rv_lr_d(rprior, base, aq, 0) : rv_lr_w(rprior, base, aq, 0));
- /* if (prior != expected) -> fail */
- rv64_emit32(mc, rv_bne(rprior, rexp, 0));
- mc->emit_label_ref(mc, fail, R_RV_BRANCH, 4, 0);
- /* sc.w/d status, desired, (base); retry on failure. */
- rv64_emit32(mc, sf ? rv_sc_d(RV_TMP1, base, rdes, 0, rl)
- : rv_sc_w(RV_TMP1, base, rdes, 0, rl));
- rv64_emit32(mc, rv_bne(RV_TMP1, RV_ZERO, 0));
- mc->emit_label_ref(mc, retry, R_RV_BRANCH, 4, 0);
- /* ok = 1; jump done. */
- rv_emit_load_imm(mc, 0, rok, 1);
- rv64_emit32(mc, rv_jal(RV_ZERO, 0));
- mc->emit_label_ref(mc, done, R_RV_JAL, 4, 0);
- mc->label_place(mc, fail);
- rv_emit_load_imm(mc, 0, rok, 0);
- mc->label_place(mc, done);
-}
-
-static void rv_fence(NativeTarget* t, KitCgMemOrder mo) {
- if (mo == KIT_CG_MO_RELAXED) return;
- rv64_emit32(t->mc, rv_fence_rw_rw());
-}
-/* ---- variadics (LP64D ABI_VA_LIST_POINTER) ----
- * va_list is a single void* to the next argument slot. The prologue spilled
- * unconsumed a-regs into the 64-byte save area at [s0+16); incoming stack args
- * follow contiguously, so a uniform 8-byte stride covers both. `ap` is a
- * NativeAddr that addresses the va_list object itself. */
-
-static void rv_va_start_core(RvNativeTarget* a, NativeAddr ap) {
- NativeTarget* t = &a->base;
- MCEmitter* mc = t->mc;
- ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
- KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
- if (vai.kind != ABI_VA_LIST_POINTER)
- rv_panic(a, "unsupported va_list layout");
- if (!a->is_variadic) rv_panic(a, "va_start: function not variadic");
- /* *ap = s0 + 16 + next_param_int*8 (skip past named-int save slots). */
- rv64_emit32(mc, rv_addi(RV_TMP1, RV_S0, 16 + (i32)(a->next_param_int * 8u)));
- rv_emit_mem(a, 0, native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1), ap,
- native_mem_for_type(t, i64t, 8));
-}
-
-static void rv_va_arg_core(RvNativeTarget* a, NativeLoc dst, NativeAddr ap,
- KitCgTypeId type) {
- NativeTarget* t = &a->base;
- MCEmitter* mc = t->mc;
- ABIVaListInfo vai = abi_va_list_layout(t->c->abi);
- KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
- u32 sz = native_type_size(t, type);
- NativeLoc cur = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
- NativeAddr from;
- if (vai.kind != ABI_VA_LIST_POINTER)
- rv_panic(a, "unsupported va_list layout");
- if (dst.kind != NATIVE_LOC_REG) rv_panic(a, "va_arg destination must be reg");
- /* cur = *ap; load value from [cur]; *ap = cur + 8 (each slot is 8 bytes). */
- rv_emit_mem(a, 1, cur, ap, native_mem_for_type(t, i64t, 8));
- memset(&from, 0, sizeof from);
- from.base_kind = NATIVE_ADDR_BASE_REG;
- from.base.reg = RV_TMP1;
- from.base_type = type;
- if (native_loc_is_fp(dst)) {
- /* Variadic FP args sit in the integer save area as their bit pattern;
- * load into RV_TMP2 and bitcast into the FPR. */
- NativeLoc itmp = native_loc_reg(type, NATIVE_REG_INT, RV_TMP2);
- rv_emit_mem(a, 1, itmp, from, native_mem_for_type(t, type, sz));
- rv64_emit32(mc, sz == 8u ? rv_fmv_d_x(loc_reg(dst), RV_TMP2)
- : rv_fmv_w_x(loc_reg(dst), RV_TMP2));
- } else {
- rv_emit_mem(a, 1, dst, from, native_mem_for_type(t, type, sz));
- }
- rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, 8));
- rv_emit_mem(a, 0, cur, ap, native_mem_for_type(t, i64t, 8));
-}
-
-static void rv_va_copy_core(RvNativeTarget* a, NativeAddr dst_ap,
- NativeAddr src_ap) {
- NativeTarget* t = &a->base;
- KitCgTypeId i64t = builtin_id(KIT_CG_BUILTIN_I64);
- NativeLoc tmp = native_loc_reg(i64t, NATIVE_REG_INT, RV_TMP1);
- /* va_list is a single 8-byte pointer. */
- rv_emit_mem(a, 1, tmp, src_ap, native_mem_for_type(t, i64t, 8));
- rv_emit_mem(a, 0, tmp, dst_ap, native_mem_for_type(t, i64t, 8));
-}
-
-static NativeAddr rv_va_addr_from_ptr(NativeLoc ap_ptr) {
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_REG;
- addr.cls = NATIVE_REG_INT;
- addr.base.reg = ap_ptr.v.reg;
- addr.base_type = ap_ptr.type;
- return addr;
-}
-
-static void rv_va_start_native(NativeTarget* t, NativeLoc ap_ptr) {
- rv_va_start_core(rv_of(t), rv_va_addr_from_ptr(ap_ptr));
-}
-static void rv_va_arg_native(NativeTarget* t, NativeLoc dst, NativeLoc ap_ptr,
- KitCgTypeId type) {
- rv_va_arg_core(rv_of(t), dst, rv_va_addr_from_ptr(ap_ptr), type);
-}
-static void rv_va_end_native(NativeTarget* t, NativeLoc ap_ptr) {
- (void)t;
- (void)ap_ptr;
-}
-static void rv_va_copy_native(NativeTarget* t, NativeLoc dst, NativeLoc src) {
- rv_va_copy_core(rv_of(t), rv_va_addr_from_ptr(dst), rv_va_addr_from_ptr(src));
-}
-/* Software popcount of RV_TMP1 (already width-normalized) into rd, using
- * RV_TMP1/RV_TMP2/RV_TMP3 as scratch. Mirrors the legacy bit-twiddling. */
-static void rv_emit_popcount(MCEmitter* mc, u32 rd, int is64) {
- rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 1));
- rv_emit_load_imm(mc, 1, RV_TMP3,
- is64 ? (i64)0x5555555555555555ll : (i64)0x55555555);
- rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP2, RV_TMP3));
- rv64_emit32(mc, rv_sub(RV_TMP1, RV_TMP1, RV_TMP2));
- rv_emit_load_imm(mc, 1, RV_TMP3,
- is64 ? (i64)0x3333333333333333ll : (i64)0x33333333);
- rv64_emit32(mc, rv_and(RV_TMP2, RV_TMP1, RV_TMP3));
- rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 2));
- rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3));
- rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, 4));
- rv64_emit32(mc, rv_add(RV_TMP1, RV_TMP1, RV_TMP2));
- rv_emit_load_imm(mc, 1, RV_TMP3,
- is64 ? (i64)0x0f0f0f0f0f0f0f0fll : (i64)0x0f0f0f0f);
- rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP3));
- rv_emit_load_imm(mc, 1, RV_TMP3,
- is64 ? (i64)0x0101010101010101ll : (i64)0x01010101);
- rv64_emit32(mc, rv_mul(RV_TMP1, RV_TMP1, RV_TMP3));
- rv64_emit32(mc, rv_srli(rd, RV_TMP1, is64 ? 56u : 24u));
- /* The 32-bit SWAR sum lives in product bits [24,32); since the multiply is
- * 64-bit, bits [32,64) survive the >>24 and must be masked off. (The 64-bit
- * path's >>56 already isolates the top byte, so it needs no mask.) */
- if (!is64) rv64_emit32(mc, rv_andi(rd, rd, 0xff));
-}
-
-/* Inline byte-granule copy/set between bare base registers (memcpy/memmove/
- * memset intrinsics). dir<0 copies high-to-low (memmove backward). */
-static void rv_intrin_copy(MCEmitter* mc, u32 dr, u32 sr, u32 n, int backward) {
- if (!backward) {
- u32 i = 0;
- while (i + 8u <= n) {
- rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i));
- rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i));
- i += 8u;
- }
- while (i + 4u <= n) {
- rv64_emit32(mc, rv_lwu(RV_TMP3, sr, (i32)i));
- rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i));
- i += 4u;
- }
- while (i + 2u <= n) {
- rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i));
- rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i));
- i += 2u;
- }
- while (i < n) {
- rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i));
- rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i));
- i += 1u;
- }
- } else {
- u32 i = n;
- while (i >= 8u) {
- i -= 8u;
- rv64_emit32(mc, rv_ld(RV_TMP3, sr, (i32)i));
- rv64_emit32(mc, rv_sd(RV_TMP3, dr, (i32)i));
- }
- while (i >= 4u) {
- i -= 4u;
- rv64_emit32(mc, rv_lwu(RV_TMP3, sr, (i32)i));
- rv64_emit32(mc, rv_sw(RV_TMP3, dr, (i32)i));
- }
- while (i >= 2u) {
- i -= 2u;
- rv64_emit32(mc, rv_lhu(RV_TMP3, sr, (i32)i));
- rv64_emit32(mc, rv_sh(RV_TMP3, dr, (i32)i));
- }
- while (i >= 1u) {
- i -= 1u;
- rv64_emit32(mc, rv_lbu(RV_TMP3, sr, (i32)i));
- rv64_emit32(mc, rv_sb(RV_TMP3, dr, (i32)i));
- }
- }
-}
-
-static void rv_intrinsic(NativeTarget* t, IntrinKind kind,
- const NativeLoc* dsts, u32 ndst, const NativeLoc* args,
- u32 narg) {
- RvNativeTarget* a = rv_of(t);
- MCEmitter* mc = t->mc;
- (void)ndst;
- (void)narg;
- switch (kind) {
- case INTRIN_NONE:
- break;
- case INTRIN_EXPECT:
- case INTRIN_ASSUME_ALIGNED: {
- /* dst = val (hint dropped). */
- if (args[0].kind == NATIVE_LOC_IMM)
- rv_emit_load_imm(mc, rv_is_64(t, dsts[0].type) ? 1u : 0u,
- loc_reg(dsts[0]), args[0].v.imm);
- else
- rv_move(t, dsts[0], args[0]);
- return;
- }
- case INTRIN_PREFETCH:
- return;
- case INTRIN_TRAP:
- rv64_emit32(mc, rv_ebreak());
- return;
- case INTRIN_BSWAP: {
- u32 width = abi_cg_sizeof(t->c->abi, dsts[0].type);
- switch (width) {
- case 2: {
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- /* rd = ((rs & 0xff) << 8) | ((rs >> 8) & 0xff). */
- rv64_emit32(mc, rv_addi(RV_TMP2, RV_ZERO, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8)); /* 0xff00 */
- rv64_emit32(mc, rv_slli(RV_TMP1, rs, 8));
- rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_srli(RV_TMP3, rs, 8));
- rv64_emit32(mc, rv_andi(RV_TMP3, RV_TMP3, 0xff));
- rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP3));
- return;
- }
- case 4: {
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- rv64_emit32(mc, rv_srliw(RV_TMP1, rs, 24));
- rv64_emit32(mc, rv_andi(RV_TMP1, RV_TMP1, 0xff));
- rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 16));
- rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 8));
- rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_srliw(RV_TMP2, rs, 8));
- rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 16));
- rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, 24));
- rv64_emit32(mc, rv_or(rd, RV_TMP1, RV_TMP2));
- rv64_emit32(mc, rv_slli(rd, rd, 32));
- rv64_emit32(mc, rv_srli(rd, rd, 32));
- return;
- }
- case 8: {
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- int i;
- rv64_emit32(mc, rv_addi(RV_TMP1, RV_ZERO, 0));
- for (i = 0; i < 8; ++i) {
- int sh = 56 - 8 * i;
- if (i == 0) {
- rv64_emit32(mc, rv_andi(RV_TMP2, rs, 0xff));
- } else {
- rv64_emit32(mc, rv_srli(RV_TMP2, rs, (u32)(8 * i)));
- rv64_emit32(mc, rv_andi(RV_TMP2, RV_TMP2, 0xff));
- }
- if (sh) rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP2, (u32)sh));
- rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
- }
- rv64_emit32(mc, rv_addi(rd, RV_TMP1, 0));
- return;
- }
- default:
- break;
- }
- return;
- }
- case INTRIN_POPCOUNT: {
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- int is64 = rv_is_64(t, args[0].type);
- rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0));
- if (!is64) {
- rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
- rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
- }
- rv_emit_popcount(mc, rd, is64);
- return;
- }
- case INTRIN_CTZ: {
- /* ctz(x) = popcount((x & -x) - 1) for x != 0. */
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- int is64 = rv_is_64(t, args[0].type);
- rv64_emit32(mc, rv_sub(RV_TMP1, RV_ZERO, rs));
- rv64_emit32(mc, rv_and(RV_TMP1, RV_TMP1, rs));
- rv64_emit32(mc, rv_addi(RV_TMP1, RV_TMP1, -1));
- if (!is64) {
- rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
- rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
- }
- rv_emit_popcount(mc, rd, is64);
- return;
- }
- case INTRIN_CLZ: {
- /* Fold the high bit downward, then clz = popcount(~folded). */
- u32 rd = loc_reg(dsts[0]), rs = loc_reg(args[0]);
- int is64 = rv_is_64(t, args[0].type);
- u32 shifts[6] = {1, 2, 4, 8, 16, 32};
- u32 ns = is64 ? 6u : 5u, i;
- rv64_emit32(mc, rv_addi(RV_TMP1, rs, 0));
- if (!is64) {
- rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
- rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
- }
- for (i = 0; i < ns; ++i) {
- rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP1, shifts[i]));
- rv64_emit32(mc, rv_or(RV_TMP1, RV_TMP1, RV_TMP2));
- }
- rv64_emit32(mc, rv_xori(RV_TMP1, RV_TMP1, -1));
- if (!is64) {
- rv64_emit32(mc, rv_slli(RV_TMP1, RV_TMP1, 32));
- rv64_emit32(mc, rv_srli(RV_TMP1, RV_TMP1, 32));
- }
- rv_emit_popcount(mc, rd, is64);
- return;
- }
- case INTRIN_SADD_OVERFLOW:
- case INTRIN_SSUB_OVERFLOW: {
- /* dsts: [val, ovf]. ADD: ovf=((a^r)&(b^r))>>(w-1);
- * SUB: ovf=((a^b)&(a^r))>>(w-1). */
- int is64 = rv_is_64(t, dsts[0].type);
- u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
- u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
- u32 sh = is64 ? 63u : 31u;
- if (kind == INTRIN_SADD_OVERFLOW)
- rv64_emit32(mc,
- is64 ? rv_add(RV_TMP2, ra, rb) : rv_addw(RV_TMP2, ra, rb));
- else
- rv64_emit32(mc,
- is64 ? rv_sub(RV_TMP2, ra, rb) : rv_subw(RV_TMP2, ra, rb));
- rv64_emit32(mc, rv_xor(RV_TMP3, ra, RV_TMP2)); /* a ^ r */
- if (kind == INTRIN_SADD_OVERFLOW) {
- rv64_emit32(mc, rv_xor(rovf, rb, RV_TMP2)); /* b ^ r */
- rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3));
- } else {
- rv64_emit32(mc, rv_xor(rovf, ra, rb)); /* a ^ b */
- rv64_emit32(mc, rv_and(rovf, rovf, RV_TMP3));
- }
- rv64_emit32(mc,
- is64 ? rv_srli(rovf, rovf, sh) : rv_srliw(rovf, rovf, sh));
- rv64_emit32(mc, rv_andi(rovf, rovf, 1));
- rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
- return;
- }
- case INTRIN_UADD_OVERFLOW:
- case INTRIN_USUB_OVERFLOW: {
- int is64 = rv_is_64(t, dsts[0].type);
- u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
- u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
- if (!is64) {
- rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32));
- rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32));
- rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32));
- rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32));
- ra = RV_TMP2;
- rb = RV_TMP3;
- }
- if (kind == INTRIN_UADD_OVERFLOW) {
- if (is64) {
- rv64_emit32(mc, rv_add(RV_TMP2, ra, rb));
- rv64_emit32(mc, rv_sltu(rovf, RV_TMP2, ra));
- } else {
- rv64_emit32(mc, rv_add(RV_TMP2, ra, rb));
- rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32));
- rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
- rv64_emit32(mc, rv_addiw(RV_TMP2, RV_TMP2, 0));
- }
- } else {
- rv64_emit32(mc, rv_sltu(rovf, ra, rb));
- rv64_emit32(mc,
- is64 ? rv_sub(RV_TMP2, ra, rb) : rv_subw(RV_TMP2, ra, rb));
- }
- rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
- return;
- }
- case INTRIN_SMUL_OVERFLOW: {
- int is64 = rv_is_64(t, dsts[0].type);
- u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
- u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
- if (is64) {
- rv64_emit32(mc, rv_mul(RV_TMP2, ra, rb));
- rv64_emit32(mc, rv_mulh(RV_TMP3, ra, rb));
- rv64_emit32(mc, rv_srai(rovf, RV_TMP2, 63));
- rv64_emit32(mc, rv_xor(rovf, RV_TMP3, rovf));
- rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
- rv64_emit32(mc, rv_addi(rd, RV_TMP2, 0));
- } else {
- rv64_emit32(mc, rv_addiw(RV_TMP2, ra, 0));
- rv64_emit32(mc, rv_addiw(RV_TMP3, rb, 0));
- rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3));
- rv64_emit32(mc, rv_addiw(RV_TMP3, RV_TMP2, 0));
- rv64_emit32(mc, rv_xor(rovf, RV_TMP2, RV_TMP3));
- rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
- rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0));
- }
- return;
- }
- case INTRIN_UMUL_OVERFLOW: {
- int is64 = rv_is_64(t, dsts[0].type);
- u32 ra = loc_reg(args[0]), rb = loc_reg(args[1]);
- u32 rd = loc_reg(dsts[0]), rovf = loc_reg(dsts[1]);
- if (is64) {
- rv64_emit32(mc, rv_mulhu(rovf, ra, rb));
- rv64_emit32(mc, rv_mul(rd, ra, rb));
- rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
- } else {
- rv64_emit32(mc, rv_slli(RV_TMP2, ra, 32));
- rv64_emit32(mc, rv_srli(RV_TMP2, RV_TMP2, 32));
- rv64_emit32(mc, rv_slli(RV_TMP3, rb, 32));
- rv64_emit32(mc, rv_srli(RV_TMP3, RV_TMP3, 32));
- rv64_emit32(mc, rv_mul(RV_TMP2, RV_TMP2, RV_TMP3));
- rv64_emit32(mc, rv_srli(rovf, RV_TMP2, 32));
- rv64_emit32(mc, rv_sltu(rovf, RV_ZERO, rovf));
- rv64_emit32(mc, rv_addiw(rd, RV_TMP2, 0));
- }
- return;
- }
- case INTRIN_MEMCPY:
- case INTRIN_MEMMOVE: {
- u32 dr, sr, n;
- if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
- args[1].kind != NATIVE_LOC_REG || args[2].kind != NATIVE_LOC_IMM)
- rv_panic(a, "unsupported memory intrinsic operands");
- if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
- rv_panic(a, "unsupported memory intrinsic size");
- dr = loc_reg(args[0]);
- sr = loc_reg(args[1]);
- n = (u32)args[2].v.imm;
- rv_intrin_copy(mc, dr, sr, n, kind == INTRIN_MEMMOVE);
- return;
- }
- case INTRIN_MEMSET: {
- u32 dr, n, src;
- if (narg != 3u || args[0].kind != NATIVE_LOC_REG ||
- args[2].kind != NATIVE_LOC_IMM)
- rv_panic(a, "unsupported memset operands");
- if (args[2].v.imm < 0 || args[2].v.imm > 0xffffffffll)
- rv_panic(a, "unsupported memset size");
- dr = loc_reg(args[0]);
- n = (u32)args[2].v.imm;
- if (args[1].kind == NATIVE_LOC_IMM) {
- u32 byte = (u32)(args[1].v.imm & 0xffu);
- if (byte == 0) {
- src = RV_ZERO;
- } else {
- u64 b = byte;
- b |= b << 8;
- b |= b << 16;
- b |= b << 32;
- rv_emit_load_imm(mc, 1, RV_TMP3, (i64)b);
- src = RV_TMP3;
- }
- } else {
- /* Replicate the low byte of a register value across 8 bytes. */
- u32 rb = loc_reg(args[1]);
- rv64_emit32(mc, rv_andi(RV_TMP3, rb, 0xff));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 8));
- rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 16));
- rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
- rv64_emit32(mc, rv_slli(RV_TMP2, RV_TMP3, 32));
- rv64_emit32(mc, rv_or(RV_TMP3, RV_TMP3, RV_TMP2));
- src = RV_TMP3;
- }
- {
- u32 i = 0;
- while (i + 8u <= n) {
- rv64_emit32(mc, rv_sd(src, dr, (i32)i));
- i += 8u;
- }
- while (i + 4u <= n) {
- rv64_emit32(mc, rv_sw(src, dr, (i32)i));
- i += 4u;
- }
- while (i + 2u <= n) {
- rv64_emit32(mc, rv_sh(src, dr, (i32)i));
- i += 2u;
- }
- while (i < n) {
- rv64_emit32(mc, rv_sb(src, dr, (i32)i));
- i += 1u;
- }
- }
- return;
- }
- case INTRIN_CPU_NOP:
- rv64_emit32(mc, rv_nop());
- return;
- case INTRIN_CPU_YIELD:
- rv64_emit32(mc, rv_pause());
- return;
- case INTRIN_ISB:
- rv64_emit32(mc, rv_fence_i());
- return;
- case INTRIN_DMB:
- case INTRIN_DSB:
- rv64_emit32(mc, rv_fence_rw_rw());
- return;
- case INTRIN_WFI:
- rv64_emit32(mc, rv_wfi());
- return;
- default:
- break;
- }
- rv_panic(a, "unsupported compiler intrinsic");
-}
-/* ============================ inline asm ============================ */
-
-_Noreturn static void rv_asm_panic_at(Compiler* c, SrcLoc loc,
- const char* msg) {
- compiler_panic(c, loc, "rv64 inline asm: %s", msg);
-}
-_Noreturn static void rv_asm_panic(NativeDirectTarget* d, const char* msg) {
- rv_asm_panic_at(d->base.c, d->loc, msg);
-}
-
-/* constraint_body / constraint_early / match_index are shared
- * (cg/native_asm.h). */
-
-/* Build a bound register pseudo-operand in the rv64 inline shape. */
-static void rv_asm_bound_reg(Operand* out, KitCgTypeId type,
- NativeAllocClass cls, Reg reg) {
- memset(out, 0, sizeof *out);
- out->kind = RV64_INLINE_OPK_REG;
- out->pad[0] =
- (cls == NATIVE_REG_FP) ? RV64_INLINE_OPCLS_FP : RV64_INLINE_OPCLS_INT;
- out->type = type;
- out->v.local = (CGLocal)reg;
-}
-static void rv_asm_bound_mem(Operand* out, KitCgTypeId type, Reg base) {
- memset(out, 0, sizeof *out);
- out->kind = OPK_INDIRECT;
- out->type = type;
- out->v.ind.base = (CGLocal)base;
- out->v.ind.index = CG_LOCAL_NONE;
- out->v.ind.ofs = 0;
-}
-
-/* Parse a clobber register name into (class, reg). Returns 0 for the special
- * "cc"/"memory" clobbers and panics on an unknown register. RV64 dwarf: int
- * x0..x31 = 0..31, fp f0..f31 = 32..63. */
-static int rv_asm_parse_reg_clobber(Compiler* c, SrcLoc loc, Sym name,
- NativeAllocClass* cls_out, Reg* reg_out) {
- Slice s = pool_slice(c->global, name);
- char buf[16];
- uint32_t dwarf;
- if (!s.s || !s.len) return 0;
- if (s.len == 2 && s.s[0] == 'c' && s.s[1] == 'c') return 0;
- if (s.len == 6 && memcmp(s.s, "memory", 6) == 0) return 0;
- if (s.len >= sizeof buf) rv_asm_panic_at(c, loc, "clobber name is too long");
- memcpy(buf, s.s, s.len);
- buf[s.len] = '\0';
- if (rv64_register_index(buf, &dwarf) != 0)
- rv_asm_panic_at(c, loc, "unknown clobber register");
- if (dwarf <= 31u) {
- *cls_out = NATIVE_REG_INT;
- *reg_out = (Reg)dwarf;
- return 1;
- }
- if (dwarf >= 32u && dwarf <= 63u) {
- *cls_out = NATIVE_REG_FP;
- *reg_out = (Reg)(dwarf - 32u);
- return 1;
- }
- rv_asm_panic_at(c, loc, "unsupported clobber register");
- return 0;
-}
-
-static void rv_asm_clobber_masks(Compiler* c, SrcLoc loc, const Sym* clobbers,
- u32 nclob, u32* int_mask, u32* fp_mask) {
- u32 i;
- *int_mask = 0;
- *fp_mask = 0;
- for (i = 0; i < nclob; ++i) {
- NativeAllocClass cls;
- Reg reg;
- if (!rv_asm_parse_reg_clobber(c, loc, clobbers[i], &cls, ®)) continue;
- if (cls == NATIVE_REG_INT)
- *int_mask |= 1u << reg;
- else
- *fp_mask |= 1u << reg;
- }
-}
-
-static NativeAllocClass rv_asm_constraint_class(NativeDirectTarget* d,
- const char* body) {
- if (body[0] == 'r') return NATIVE_REG_INT;
- if (body[0] == 'f') return NATIVE_REG_FP;
- rv_asm_panic(d, "constraint is not a register constraint");
- return NATIVE_REG_INT;
-}
-
-static int rv_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg,
- const char* constraint,
- NativeAsmRegPin* pin) {
- NativeAsmRegPinStatus st =
- native_asm_resolve_pin(d->native, reg, constraint, pin);
- if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0;
- if (st != NATIVE_ASM_REG_PIN_OK)
- rv_asm_panic(d, native_asm_pin_status_message(st));
- return 1;
-}
-
-/* Pick a free register from the arch's caller-saved allocable pools for an
- * asm operand the direct path must self-allocate. */
-static Reg rv_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls,
- u32* used_int, u32* used_fp) {
- /* int: a0..a7 (10..17) then t-temps that aren't emit scratch. */
- static const Reg int_pool[] = {10u, 11u, 12u, 13u, 14u, 15u,
- 16u, 17u, 29u, 30u, 31u};
- /* fp: fa0..fa7 (10..17) then ft caller-saved. */
- static const Reg fp_pool[] = {10u, 11u, 12u, 13u, 14u, 15u, 16u, 17u,
- 4u, 5u, 6u, 7u, 28u, 29u, 30u, 31u};
- const Reg* pool = cls == NATIVE_REG_FP ? fp_pool : int_pool;
- u32 n = cls == NATIVE_REG_FP ? (u32)(sizeof fp_pool / sizeof fp_pool[0])
- : (u32)(sizeof int_pool / sizeof int_pool[0]);
- u32* used = cls == NATIVE_REG_FP ? used_fp : used_int;
- u32 i;
- for (i = 0; i < n; ++i) {
- Reg r = pool[i];
- if ((*used & (1u << r)) != 0) continue;
- *used |= 1u << r;
- return r;
- }
- rv_asm_panic(d, "out of registers for asm operands");
- return REG_NONE;
-}
-
-/* Direct (-O0) path: resolve a semantic Operand to a NativeAddr. */
-static NativeAddr rv_direct_addr(NativeDirectTarget* d, Operand op) {
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- switch ((OpKind)op.kind) {
- case OPK_LOCAL:
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = d->locals[op.v.local - 1u].home;
- addr.base_type = op.type;
- return addr;
- case OPK_INDIRECT:
- addr.base_kind = NATIVE_ADDR_BASE_FRAME_VALUE;
- addr.base.frame = d->locals[op.v.ind.base - 1u].home;
- addr.cls = d->locals[op.v.ind.base - 1u].cls;
- addr.base_type = d->locals[op.v.ind.base - 1u].type;
- addr.offset = op.v.ind.ofs;
- return addr;
- default:
- rv_asm_panic(d, "operand is not addressable");
- }
-}
-
-/* Materialize an OPK_INDIRECT (frame-value) base into a register, returning a
- * plain register-based NativeAddr. */
-static NativeAddr rv_direct_materialize_addr(NativeDirectTarget* d,
- Operand op) {
- RvNativeTarget* a = rv_of(d->native);
- NativeAddr addr = rv_direct_addr(d, op);
- if (addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
- NativeLoc base = native_loc_reg(addr.base_type, NATIVE_REG_INT, RV_TMP1);
- NativeAddr load;
- memset(&load, 0, sizeof load);
- load.base_kind = NATIVE_ADDR_BASE_FRAME;
- load.base.frame = addr.base.frame;
- load.base_type = addr.base_type;
- rv_emit_mem(a, 1, base, load,
- native_mem_for_type(d->native, addr.base_type, 8));
- addr.base_kind = NATIVE_ADDR_BASE_REG;
- addr.base.reg = RV_TMP1;
- }
- return addr;
-}
-
-static void rv_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op,
- NativeLoc dst) {
- RvNativeTarget* a = rv_of(d->native);
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- switch ((OpKind)op.kind) {
- case OPK_IMM:
- if ((NativeAllocClass)dst.cls != NATIVE_REG_INT)
- rv_asm_panic(d, "floating-point immediate asm input is unsupported");
- d->native->load_imm(d->native, dst, op.v.imm);
- return;
- case OPK_LOCAL:
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = d->locals[op.v.local - 1u].home;
- addr.base_type = op.type;
- rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
- return;
- case OPK_GLOBAL:
- addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
- addr.base.global.sym = op.v.global.sym;
- addr.base.global.addend = op.v.global.addend;
- addr.base_type = op.type;
- d->native->load_addr(d->native, dst, addr);
- return;
- case OPK_INDIRECT:
- addr = rv_direct_materialize_addr(d, op);
- rv_emit_mem(a, 1, dst, addr, native_mem_for_type(d->native, op.type, 0));
- return;
- }
- rv_asm_panic(d, "unsupported asm input operand");
-}
-
-static void rv_direct_load_address_to_reg(NativeDirectTarget* d, Operand op,
- NativeLoc dst) {
- d->native->load_addr(d->native, dst, rv_direct_addr(d, op));
-}
-
-static void rv_direct_store_reg_to_operand(NativeDirectTarget* d, Operand op,
- NativeLoc src) {
- RvNativeTarget* a = rv_of(d->native);
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- if (op.kind == OPK_LOCAL) {
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = d->locals[op.v.local - 1u].home;
- addr.base_type = op.type;
- } else {
- addr = rv_direct_materialize_addr(d, op);
- }
- rv_emit_mem(a, 0, src, addr, native_mem_for_type(d->native, op.type, 0));
-}
-
-/* Callee-saved registers an asm block clobbers must be spilled/restored around
- * the block (the only ABI duty the allocator cannot discharge itself). */
-typedef struct RvAsmSavedClobber {
- NativeFrameSlot slot;
- NativeAllocClass cls;
- Reg reg;
- KitCgTypeId type;
-} RvAsmSavedClobber;
-
-static void rv_asm_save_one(RvNativeTarget* a, RvAsmSavedClobber* s) {
- NativeFrameSlotDesc desc;
- NativeAddr addr;
- memset(&desc, 0, sizeof desc);
- desc.type = s->type;
- desc.size = 8;
- desc.align = 8;
- desc.kind = NATIVE_FRAME_SLOT_SAVE;
- s->slot = a->base.frame_slot(&a->base, &desc);
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = s->slot;
- addr.base_type = s->type;
- rv_emit_mem(a, 0, native_loc_reg(s->type, s->cls, s->reg), addr,
- native_mem_for_type(&a->base, s->type, 8));
-}
-static void rv_asm_restore_one(RvNativeTarget* a, const RvAsmSavedClobber* s) {
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = s->slot;
- addr.base_type = s->type;
- rv_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr,
- native_mem_for_type(&a->base, s->type, 8));
-}
-
-/* psABI callee-saved: integer s0..s11 (x8,x9,x18..x27), fp fs0..fs11
- * (f8,f9,f18..f27). x8 is the frame pointer and never asm-clobbered. */
-static int rv_reg_is_callee_int(Reg r) {
- return r == 9u || (r >= 18u && r <= 27u);
-}
-static int rv_reg_is_callee_fp(Reg r) {
- return r == 8u || r == 9u || (r >= 18u && r <= 27u);
-}
-
-static RvAsmSavedClobber* rv_asm_save_callee_clobbers(RvNativeTarget* a,
- u32 int_mask, u32 fp_mask,
- u32* nsaved_out) {
- RvAsmSavedClobber* saved =
- arena_zarray(a->base.c->tu, RvAsmSavedClobber, 24u);
- KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
- KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
- u32 n = 0;
- Reg r;
- for (r = 0; r <= 31u; ++r) {
- if ((int_mask & (1u << r)) == 0 || !rv_reg_is_callee_int(r)) continue;
- saved[n].cls = NATIVE_REG_INT;
- saved[n].reg = r;
- saved[n].type = i64;
- rv_asm_save_one(a, &saved[n++]);
- }
- for (r = 0; r <= 31u; ++r) {
- if ((fp_mask & (1u << r)) == 0 || !rv_reg_is_callee_fp(r)) continue;
- saved[n].cls = NATIVE_REG_FP;
- saved[n].reg = r;
- saved[n].type = f64;
- rv_asm_save_one(a, &saved[n++]);
- }
- *nsaved_out = n;
- return saved;
-}
-
-/* ---- NativeTarget (optimizer) asm hook ----
- * The optimizer pre-allocated every operand register and arranged surrounding
- * data flow, so this binds pre-allocated registers to the template and only
- * materializes memory-operand bases into the reserved scratch + spills the
- * callee-saved registers the asm clobbers. */
-
-static NativeAddr rv_asm_loc_to_addr(RvNativeTarget* a, SrcLoc loc,
- NativeLoc src) {
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- addr.base_type = src.type;
- switch ((NativeLocKind)src.kind) {
- case NATIVE_LOC_FRAME:
- addr.base_kind = NATIVE_ADDR_BASE_FRAME;
- addr.base.frame = src.v.frame;
- return addr;
- case NATIVE_LOC_ADDR:
- return src.v.addr;
- case NATIVE_LOC_GLOBAL:
- addr.base_kind = NATIVE_ADDR_BASE_GLOBAL;
- addr.base.global.sym = src.v.global.sym;
- addr.base.global.addend = src.v.global.addend;
- return addr;
- case NATIVE_LOC_REG:
- addr.base_kind = NATIVE_ADDR_BASE_REG;
- addr.cls = NATIVE_REG_INT;
- addr.base.reg = src.v.reg;
- return addr;
- default:
- rv_asm_panic_at(a->base.c, loc, "unsupported memory asm operand");
- }
-}
-
-/* Resolve a memory-constraint operand to a single base register with zero
- * offset, folding any frame/global/offset into a reserved scratch register. */
-static Reg rv_asm_native_mem_base(RvNativeTarget* a, SrcLoc loc, NativeLoc src,
- u32* ntmp) {
- NativeAddr addr = rv_asm_loc_to_addr(a, loc, src);
- u32 base;
- i32 off;
- Reg dst;
- if (addr.index_kind != NATIVE_ADDR_INDEX_NONE)
- rv_asm_panic_at(a->base.c, loc, "indexed memory asm operand unsupported");
- rv_resolve_mem_addr(a, &addr, &base, &off);
- if (off == 0 && base != RV_TMP0 && base != RV_TMP1) return (Reg)base;
- if (*ntmp >= 2u)
- rv_asm_panic_at(a->base.c, loc, "too many memory asm operands");
- dst = (*ntmp == 0u) ? RV_TMP0 : RV_TMP1;
- (*ntmp)++;
- rv_emit_addr_adjust(a->base.mc, dst, base, off);
- return dst;
-}
-
-static void rv_asm_bind_native(RvNativeTarget* a, SrcLoc loc, Operand* out,
- const char* constraint, KitCgTypeId type,
- NativeLoc src, u32* ntmp) {
- const char* body = native_asm_constraint_body(constraint);
- if (body[0] == 'r' || body[0] == 'f') {
- NativeAllocClass cls = (body[0] == 'f') ? NATIVE_REG_FP : NATIVE_REG_INT;
- if (src.kind != NATIVE_LOC_REG)
- rv_asm_panic_at(a->base.c, loc, "register asm operand not in a register");
- rv_asm_bound_reg(out, type, cls, (Reg)src.v.reg);
- } else if (body[0] == 'i') {
- if (src.kind != NATIVE_LOC_IMM)
- rv_asm_panic_at(a->base.c, loc, "immediate asm operand is not immediate");
- memset(out, 0, sizeof *out);
- out->kind = OPK_IMM;
- out->type = type;
- out->v.imm = src.v.imm;
- } else if (body[0] == 'm') {
- rv_asm_bound_mem(out, type, rv_asm_native_mem_base(a, loc, src, ntmp));
- } else {
- rv_asm_panic_at(a->base.c, loc, "unsupported asm constraint");
- }
-}
-
-static void rv_asm_block_native(NativeTarget* t, const char* tmpl,
- const AsmConstraint* outs, u32 nout,
- NativeLoc* out_locs, const AsmConstraint* ins,
- u32 nin, const NativeLoc* in_locs,
- const Sym* clobbers, u32 nclob) {
- RvNativeTarget* a = rv_of(t);
- Compiler* c = t->c;
- SrcLoc loc = a->func ? a->func->loc : (SrcLoc){0, 0, 0};
- Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
- Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
- u32 ntmp = 0, i;
- Rv64Asm* asmh;
-
- for (i = 0; i < nout; ++i) {
- KitCgTypeId type = outs[i].type ? outs[i].type : out_locs[i].type;
- rv_asm_bind_native(a, loc, &bound_outs[i], outs[i].str, type, out_locs[i],
- &ntmp);
- }
- for (i = 0; i < nin; ++i) {
- const char* body = native_asm_constraint_body(ins[i].str);
- int matched = native_asm_match_index(body);
- KitCgTypeId type;
- NativeLoc inloc;
- if (matched >= 0) {
- if ((u32)matched >= nout)
- rv_asm_panic_at(c, loc, "matching constraint out of range");
- bound_ins[i] = bound_outs[matched];
- continue;
- }
- type = ins[i].type ? ins[i].type : in_locs[i].type;
- inloc = in_locs[i];
- /* A register-constrained input that lives in a frame slot (address-taken
- * local) must be loaded into a reserved scratch first. */
- if (body[0] == 'r' && inloc.kind != NATIVE_LOC_REG) {
- Reg r;
- if (ntmp >= 2u) rv_asm_panic_at(c, loc, "too many memory asm operands");
- r = (ntmp == 0u) ? RV_TMP0 : RV_TMP1;
- ntmp++;
- inloc = native_loc_reg(type, NATIVE_REG_INT, r);
- rv_emit_mem(a, 1, inloc, rv_asm_loc_to_addr(a, loc, in_locs[i]),
- native_mem_for_type(t, type, native_type_size(t, type)));
- }
- rv_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, &ntmp);
- }
-
- /* No per-block callee-saved spill here: plan_frame forwarded the asm clobber
- * masks and rv_known_callee_saves folded the callee-saved ones into the
- * function's saved set, so the prologue/epilogue already preserve them. */
- asmh = rv64_asm_open(c);
- rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
- nclob);
- rv64_asm_run_template(asmh, t->mc, tmpl);
- rv64_asm_close(asmh);
-}
-/* file_scope_asm + finalize are shared (cg/native_asm.h). */
-
-static void rv_trap(NativeTarget* t) { rv64_emit32(t->mc, rv_ebreak()); }
-static void rv_set_loc(NativeTarget* t, SrcLoc loc) {
- rv_of(t)->loc = loc;
- if (t->mc->set_loc) t->mc->set_loc(t->mc, loc);
-}
-
-/* ============================ construction ============================ */
-
-NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj,
- MCEmitter* mc) {
- RvNativeTarget* a = arena_znew(c->tu, RvNativeTarget);
- NativeTarget* t;
- if (!a) return NULL;
- t = &a->base;
- t->c = c;
- t->obj = obj;
- t->mc = mc;
- native_frame_init(&a->frame, c);
- t->regs = &rv_reg_info;
- t->class_for_type = native_class_for_type_fp_le8;
- t->imm_legal = rv_imm_legal;
- t->addr_legal = rv_addr_legal;
- t->func_begin = rv_func_begin;
- t->func_begin_known_frame = rv_func_begin_known_frame;
- t->note_frame_state = NULL;
- /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
- * set; rv_func_begin_known_frame derives the records from the masks. */
- t->reserve_callee_saves = rv_reserve_callee_saves;
- t->signature_stack_bytes = rv_signature_stack_bytes;
- t->call_stack_bytes = rv_call_stack_bytes;
- t->has_store_zero_reg = 1;
- t->store_zero_reg = RV_ZERO;
- t->func_end = rv_func_end;
- t->frame_slot = rv_frame_slot;
- t->frame_slot_debug_loc = rv_frame_slot_debug_loc;
- t->bind_param = rv_bind_native_param;
- t->label_new = rv_label_new;
- t->label_place = rv_label_place;
- t->jump = rv_jump;
- t->cmp_branch = rv_cmp_branch;
- t->indirect_branch = rv_indirect_branch;
- t->load_label_addr = rv_load_label_addr;
- t->move = rv_move;
- t->load_imm = rv_load_imm;
- t->load_const = rv_load_const;
- t->load_addr = rv_load_addr;
- t->load = rv_load;
- t->store = rv_store;
- t->tls_addr_of = rv_tls_addr_of;
- t->copy_bytes = rv_copy_bytes;
- t->set_bytes = rv_set_bytes;
- t->bitfield_load = rv_bitfield_load;
- t->bitfield_store = rv_bitfield_store;
- t->binop = rv_binop;
- t->unop = rv_unop;
- t->cmp = rv_cmp;
- t->convert = rv_convert;
- t->alloca_ = rv_alloca;
- t->spill = rv_spill;
- t->reload = rv_reload;
- t->plan_call = rv_plan_call;
- t->emit_call = rv_emit_call;
- t->plan_ret = rv_plan_ret;
- t->ret = rv_ret;
- t->atomic_load = rv_atomic_load;
- t->atomic_store = rv_atomic_store;
- t->atomic_rmw = rv_atomic_rmw;
- t->atomic_cas = rv_atomic_cas;
- t->fence = rv_fence;
- t->va_start_ = rv_va_start_native;
- t->va_arg_ = rv_va_arg_native;
- t->va_end_ = rv_va_end_native;
- t->va_copy_ = rv_va_copy_native;
- t->intrinsic = rv_intrinsic;
- t->asm_block = rv_asm_block_native;
- t->file_scope_asm = native_file_scope_asm;
- t->trap = rv_trap;
- t->set_loc = rv_set_loc;
- t->finalize = native_finalize;
- return t;
-}
-
-/* ============================ NativeOps (-O0) ============================ */
-
-static void rv_bind_param(NativeDirectTarget* d, const CGParamDesc* p,
- CGLocal local, NativeDirectLocal* l) {
- NativeLoc dst;
- (void)local;
- memset(&dst, 0, sizeof dst);
- dst.kind = NATIVE_LOC_FRAME;
- dst.type = p->type;
- dst.v.frame = l->home;
- rv_bind_native_param(d->native, p, dst);
-}
-
-/* A sibling call is realizable when its outgoing stack-argument area fits the
- * window the caller itself received (so the args land in the caller's incoming
- * slots without overflowing into the caller's caller's frame). Register-only
- * calls (the common case) always qualify. Mirrors aa64's aa_no_tail. */
-static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
- RvNativeTarget* a = rv_of(d->native);
- NativeCallDesc nd;
- NativeLoc* args = NULL;
- NativeLoc* results = NULL;
- u32 i, stack;
- if (a->frame.ncallee_saves)
- return "rv64 tail call: callee-saved registers in use";
- memset(&nd, 0, sizeof nd);
- if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);
- if (call->nresults)
- results = arena_zarray(d->base.c->tu, NativeLoc, call->nresults);
- for (i = 0; i < call->nargs; ++i) {
- args[i].kind = NATIVE_LOC_FRAME;
- args[i].type = d->locals[call->args[i] - 1u].type;
- args[i].cls = d->locals[call->args[i] - 1u].cls;
- args[i].v.frame = d->locals[call->args[i] - 1u].home;
- }
- for (i = 0; i < call->nresults; ++i) {
- results[i].kind = NATIVE_LOC_FRAME;
- results[i].type = d->locals[call->results[i] - 1u].type;
- results[i].cls = d->locals[call->results[i] - 1u].cls;
- results[i].v.frame = d->locals[call->results[i] - 1u].home;
- }
- nd.fn_type = call->fn_type;
- nd.args = args;
- nd.results = results;
- nd.nargs = call->nargs;
- nd.nresults = call->nresults;
- stack = rv_call_stack_size(d->native, &nd);
- if (stack > a->incoming_stack_size)
- return "rv64 tail call: stack argument area too small";
- return NULL;
-}
-
-/* Resolve a pointer-typed Operand (the address of a va_list object) into `reg`
- * and return a register-based NativeAddr. An OPK_LOCAL holds the va_list object
- * itself, so we take its frame address; an OPK_INDIRECT holds the pointer in
- * memory and must be loaded. The va cores use TMP1/TMP2 internally, so `reg`
- * must be distinct from those (callers pass TMP0 / TMP3). */
-/* ap_addr is the pointer value &ap (the va_list object's address). For an
- * OPK_LOCAL the local HOLDS that pointer, so load its home value; an
- * OPK_INDIRECT names *(base+ofs), whose address base+ofs is the pointer.
- * Mirrors aa64's aa_direct_pointer_addr. */
-static NativeAddr rv_direct_pointer_addr(NativeDirectTarget* d, Operand op) {
- RvNativeTarget* a = rv_of(d->native);
- NativeAddr addr;
- memset(&addr, 0, sizeof addr);
- if (op.kind == OPK_LOCAL) {
- NativeLoc base = native_loc_reg(op.type, NATIVE_REG_INT, RV_TMP1);
- NativeAddr load;
- memset(&load, 0, sizeof load);
- load.base_kind = NATIVE_ADDR_BASE_FRAME;
- load.base.frame = d->locals[op.v.local - 1u].home;
- load.base_type = op.type;
- rv_emit_mem(a, 1, base, load, native_mem_for_type(d->native, op.type, 8));
- addr.base_kind = NATIVE_ADDR_BASE_REG;
- addr.base.reg = RV_TMP1;
- addr.base_type = op.type;
- return addr;
- }
- return rv_direct_materialize_addr(d, op);
-}
-
-static NativeAddr rv_direct_va_base(NativeDirectTarget* d, Operand ap_addr,
- Reg reg) {
- NativeLoc dst =
- native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg);
- NativeAddr addr;
- d->native->load_addr(d->native, dst, rv_direct_pointer_addr(d, ap_addr));
- memset(&addr, 0, sizeof addr);
- addr.base_kind = NATIVE_ADDR_BASE_REG;
- addr.cls = NATIVE_REG_INT;
- addr.base.reg = reg;
- addr.base_type = builtin_id(KIT_CG_BUILTIN_I64);
- return addr;
-}
-
-static void rv_va_start_(NativeDirectTarget* d, Operand ap_addr) {
- rv_va_start_core(rv_of(d->native), rv_direct_va_base(d, ap_addr, RV_TMP3));
-}
-static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr,
- KitCgTypeId type) {
- RvNativeTarget* a = rv_of(d->native);
- int is_fp = cg_type_is_float(d->base.c, type);
- NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT,
- is_fp ? RV_FTMP0 : RV_TMP0);
- NativeAddr dst_addr;
- rv_va_arg_core(a, res, rv_direct_va_base(d, ap_addr, RV_TMP3), type);
- /* Store the fetched value back into the semantic destination. */
- dst_addr = rv_direct_addr(d, dst);
- if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) {
- NativeLoc base =
- native_loc_reg(dst_addr.base_type, NATIVE_REG_INT, RV_TMP1);
- NativeAddr load;
- memset(&load, 0, sizeof load);
- load.base_kind = NATIVE_ADDR_BASE_FRAME;
- load.base.frame = dst_addr.base.frame;
- load.base_type = dst_addr.base_type;
- rv_emit_mem(a, 1, base, load,
- native_mem_for_type(d->native, dst_addr.base_type, 8));
- dst_addr.base_kind = NATIVE_ADDR_BASE_REG;
- dst_addr.base.reg = RV_TMP1;
- }
- rv_emit_mem(
- a, 0, res, dst_addr,
- native_mem_for_type(d->native, type, native_type_size(d->native, type)));
-}
-static void rv_va_end_(NativeDirectTarget* d, Operand ap_addr) {
- (void)d;
- (void)ap_addr;
-}
-static void rv_va_copy_(NativeDirectTarget* d, Operand dst, Operand src) {
- RvNativeTarget* a = rv_of(d->native);
- NativeAddr src_ap = rv_direct_va_base(d, src, RV_TMP0);
- NativeAddr dst_ap = rv_direct_va_base(d, dst, RV_TMP3);
- rv_va_copy_core(a, dst_ap, src_ap);
-}
-
-static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl,
- const AsmConstraint* outs, u32 nout,
- Operand* out_ops, const AsmConstraint* ins,
- u32 nin, const Operand* in_ops,
- const Sym* clobbers, u32 nclob,
- u32 clobber_abi_sets) {
- RvNativeTarget* a = rv_of(d->native);
- Compiler* c = d->base.c;
- Operand* bound_outs = nout ? arena_zarray(c->tu, Operand, nout) : NULL;
- Operand* bound_ins = nin ? arena_zarray(c->tu, Operand, nin) : NULL;
- u32 clob_int, clob_fp, abi_int, abi_fp, used_int, used_fp;
- RvAsmSavedClobber* saved;
- u32 nsaved, i;
- Rv64Asm* asmh;
-
- rv_asm_clobber_masks(c, d->loc, clobbers, nclob, &clob_int, &clob_fp);
- native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp);
- clob_int |= abi_int;
- clob_fp |= abi_fp;
- /* Reserve emit scratch (t0/t1/t2/t3), sp/gp/tp/zero/ra and the frame pointer
- * so the operand allocator never hands them out. */
- used_int = clob_int | (1u << RV_ZERO) | (1u << RV_RA) | (1u << RV_SP) |
- (1u << RV_GP) | (1u << RV_TP) | (1u << RV_TMP0) | (1u << RV_TMP1) |
- (1u << RV_TMP2) | (1u << RV_TMP3) | (1u << RV_S0);
- used_fp =
- clob_fp | (1u << RV_FTMP0) | (1u << RV_FTMP1) | (1u << 2u) | (1u << 3u);
-
- for (i = 0; i < nout; ++i) {
- const char* body = native_asm_constraint_body(outs[i].str);
- KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type;
- NativeAsmRegPin pin;
- if (rv_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) {
- /* GNU local register variable: pin to the named hard register. */
- if (pin.cls == NATIVE_REG_FP) {
- used_fp |= 1u << pin.reg;
- clob_fp |= 1u << pin.reg;
- } else {
- used_int |= 1u << pin.reg;
- clob_int |= 1u << pin.reg;
- }
- rv_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg);
- } else if (body[0] == 'r' || body[0] == 'f') {
- NativeAllocClass cls = rv_asm_constraint_class(d, body);
- Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp);
- rv_asm_bound_reg(&bound_outs[i], type, cls, reg);
- } else if (body[0] == 'm') {
- Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp);
- rv_asm_bound_mem(&bound_outs[i], type, reg);
- } else {
- rv_asm_panic(d, "unsupported output constraint");
- }
- }
-
- for (i = 0; i < nin; ++i) {
- const char* body = native_asm_constraint_body(ins[i].str);
- int matched = native_asm_match_index(body);
- KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type;
- if (matched >= 0) {
- if ((u32)matched >= nout)
- rv_asm_panic(d, "matching constraint out of range");
- if (native_asm_constraint_early(outs[matched].str))
- rv_asm_panic(d, "matching input names early-clobber output");
- if (bound_outs[matched].kind != RV64_INLINE_OPK_REG)
- rv_asm_panic(d, "matching constraint requires register output");
- bound_ins[i] = bound_outs[matched];
- continue;
- }
- NativeAsmRegPin pin;
- if (rv_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) {
- /* GNU local register variable: pin to the named hard register. */
- if (pin.cls == NATIVE_REG_FP) {
- used_fp |= 1u << pin.reg;
- clob_fp |= 1u << pin.reg;
- } else {
- used_int |= 1u << pin.reg;
- clob_int |= 1u << pin.reg;
- }
- rv_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg);
- } else if (body[0] == 'r' || body[0] == 'f') {
- NativeAllocClass cls = rv_asm_constraint_class(d, body);
- Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp);
- rv_asm_bound_reg(&bound_ins[i], type, cls, reg);
- } else if (body[0] == 'i') {
- if (in_ops[i].kind != OPK_IMM)
- rv_asm_panic(d, "immediate constraint requires immediate operand");
- bound_ins[i] = in_ops[i];
- } else if (body[0] == 'm') {
- Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp);
- rv_asm_bound_mem(&bound_ins[i], type, reg);
- } else {
- rv_asm_panic(d, "unsupported input constraint");
- }
- }
-
- saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
- for (i = 0; i < nout; ++i) {
- if (bound_outs[i].kind == RV64_INLINE_OPK_REG) {
- NativeAllocClass cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP
- ? NATIVE_REG_FP
- : NATIVE_REG_INT;
- if (outs[i].dir == KIT_CG_ASM_INOUT) {
- rv_direct_load_operand_to_reg(
- d, out_ops[i],
- native_loc_reg(bound_outs[i].type, cls,
- (Reg)bound_outs[i].v.local));
- }
- } else if (bound_outs[i].kind == OPK_INDIRECT) {
- NativeLoc loc =
- native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
- (Reg)bound_outs[i].v.ind.base);
- rv_direct_load_address_to_reg(d, out_ops[i], loc);
- }
- }
- for (i = 0; i < nin; ++i) {
- if (bound_ins[i].kind == RV64_INLINE_OPK_REG) {
- NativeAllocClass cls = bound_ins[i].pad[0] == RV64_INLINE_OPCLS_FP
- ? NATIVE_REG_FP
- : NATIVE_REG_INT;
- rv_direct_load_operand_to_reg(
- d, in_ops[i],
- native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local));
- } else if (bound_ins[i].kind == OPK_INDIRECT) {
- NativeLoc loc =
- native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT,
- (Reg)bound_ins[i].v.ind.base);
- rv_direct_load_address_to_reg(d, in_ops[i], loc);
- }
- }
- asmh = rv64_asm_open(c);
- rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers,
- nclob);
- rv64_asm_run_template(asmh, d->native->mc, tmpl);
- rv64_asm_close(asmh);
-
- for (i = 0; i < nout; ++i) {
- NativeAllocClass cls;
- NativeLoc src;
- if (bound_outs[i].kind != RV64_INLINE_OPK_REG) continue;
- cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP ? NATIVE_REG_FP
- : NATIVE_REG_INT;
- src = native_loc_reg(bound_outs[i].type, cls, (Reg)bound_outs[i].v.local);
- rv_direct_store_reg_to_operand(d, out_ops[i], src);
- }
- for (i = nsaved; i > 0; --i) rv_asm_restore_one(a, &saved[i - 1u]);
-}
-
-static const NativeOps rv_direct_ops = {
- .bind_param = rv_bind_param,
- .tail_call_unrealizable_reason = rv_no_tail,
- .va_start_ = rv_va_start_,
- .va_arg_ = rv_va_arg_,
- .va_end_ = rv_va_end_,
- .va_copy_ = rv_va_copy_,
- .asm_block = rv_direct_asm_block,
-};
-
-const NativeOps* rv64_native_direct_ops(void) { return &rv_direct_ops; }
diff --git a/src/arch/rv64/regs.c b/src/arch/rv64/regs.c
@@ -1,99 +0,0 @@
-/* RV64 register name table -- DWARF index <-> psABI assembler name.
- *
- * RISC-V DWARF numbering uses 0..31 for x-registers and 32..63 for
- * f-registers. Canonical names use psABI spellings; xN/fN aliases are
- * accepted by lookup. */
-
-#include "arch/rv64/regs.h"
-
-#include <stdint.h>
-
-#include "core/core.h"
-#include "core/slice.h"
-
-typedef struct Rv64Reg {
- uint32_t dwarf_idx;
- const char* name;
-} Rv64Reg;
-
-static const Rv64Reg RV64_REGS[] = {
- {0, "zero"}, {1, "ra"}, {2, "sp"}, {3, "gp"}, {4, "tp"},
- {5, "t0"}, {6, "t1"}, {7, "t2"}, {8, "s0"}, {9, "s1"},
- {10, "a0"}, {11, "a1"}, {12, "a2"}, {13, "a3"}, {14, "a4"},
- {15, "a5"}, {16, "a6"}, {17, "a7"}, {18, "s2"}, {19, "s3"},
- {20, "s4"}, {21, "s5"}, {22, "s6"}, {23, "s7"}, {24, "s8"},
- {25, "s9"}, {26, "s10"}, {27, "s11"}, {28, "t3"}, {29, "t4"},
- {30, "t5"}, {31, "t6"},
-
- {32, "ft0"}, {33, "ft1"}, {34, "ft2"}, {35, "ft3"}, {36, "ft4"},
- {37, "ft5"}, {38, "ft6"}, {39, "ft7"}, {40, "fs0"}, {41, "fs1"},
- {42, "fa0"}, {43, "fa1"}, {44, "fa2"}, {45, "fa3"}, {46, "fa4"},
- {47, "fa5"}, {48, "fa6"}, {49, "fa7"}, {50, "fs2"}, {51, "fs3"},
- {52, "fs4"}, {53, "fs5"}, {54, "fs6"}, {55, "fs7"}, {56, "fs8"},
- {57, "fs9"}, {58, "fs10"}, {59, "fs11"}, {60, "ft8"}, {61, "ft9"},
- {62, "ft10"}, {63, "ft11"},
-};
-
-static const uint32_t RV64_REGS_N =
- (uint32_t)(sizeof RV64_REGS / sizeof RV64_REGS[0]);
-
-static int parse_num_suffix(const char* name, char prefix, uint32_t max,
- uint32_t* out) {
- uint32_t v = 0;
- const char* p;
- if (!name || name[0] != prefix || name[1] == '\0') return 1;
- p = name + 1;
- while (*p) {
- if (*p < '0' || *p > '9') return 1;
- v = v * 10u + (uint32_t)(*p - '0');
- if (v > max) return 1;
- ++p;
- }
- if (out) *out = v;
- return 0;
-}
-
-const char* rv64_register_name(uint32_t dwarf_idx) {
- uint32_t i;
- for (i = 0; i < RV64_REGS_N; ++i) {
- if (RV64_REGS[i].dwarf_idx == dwarf_idx) return RV64_REGS[i].name;
- }
- return NULL;
-}
-
-int rv64_register_index(const char* name, uint32_t* idx_out) {
- uint32_t i;
- uint32_t n;
- Slice q;
- if (!name) return 1;
- q = slice_from_cstr(name);
- for (i = 0; i < RV64_REGS_N; ++i) {
- if (slice_eq_cstr(q, RV64_REGS[i].name)) {
- if (idx_out) *idx_out = RV64_REGS[i].dwarf_idx;
- return 0;
- }
- }
- if (!parse_num_suffix(name, 'x', 31, &n)) {
- if (idx_out) *idx_out = n;
- return 0;
- }
- if (!parse_num_suffix(name, 'f', 31, &n)) {
- if (idx_out) *idx_out = 32u + n;
- return 0;
- }
- if (slice_eq_cstr(q, "fp")) {
- if (idx_out) *idx_out = 8u;
- return 0;
- }
- return 1;
-}
-
-uint32_t rv64_register_iter_size(void) { return RV64_REGS_N; }
-
-int rv64_register_iter_get(uint32_t i, uint32_t* dwarf_out,
- const char** name_out) {
- if (i >= RV64_REGS_N) return 1;
- if (dwarf_out) *dwarf_out = RV64_REGS[i].dwarf_idx;
- if (name_out) *name_out = RV64_REGS[i].name;
- return 0;
-}
diff --git a/src/cg/arith.c b/src/cg/arith.c
@@ -202,6 +202,8 @@ void api_cg_cmp(KitCg* g, CmpOp cop) {
int api_try_i128_convert(KitCg* g, ConvKind ck, KitCgTypeId sty,
KitCgTypeId dty, ApiSValue* v);
+int api_try_wide8_convert(KitCg* g, ConvKind ck, KitCgTypeId sty,
+ KitCgTypeId dty, ApiSValue* v);
void api_cg_convert_kind(KitCg* g, KitCgTypeId dst_type, ConvKind ck) {
ApiSValue v;
@@ -232,11 +234,17 @@ void api_cg_convert_kind(KitCg* g, KitCgTypeId dst_type, ConvKind ck) {
i64 folded;
if (api_try_fold_int_convert(g, ck, sty, dty, v.op.v.imm, &folded)) {
api_release(g, &v);
- api_push(g, api_make_sv(api_op_imm(folded, dty), dty));
+ /* A folded result of rv32 8-byte type must be memory-resident (two lanes),
+ * not a bare i64 immediate the backend would truncate. */
+ if (api_is_wide8_scalar_type(g->c, dty))
+ api_push(g, api_make_wide8_int_const(g, folded, dty));
+ else
+ api_push(g, api_make_sv(api_op_imm(folded, dty), dty));
return;
}
}
if (api_try_i128_convert(g, ck, sty, dty, &v)) return;
+ if (api_try_wide8_convert(g, ck, sty, dty, &v)) return;
if (ck == CV_BITCAST && abi_cg_sizeof(g->c->abi, sty) == 16 &&
abi_cg_sizeof(g->c->abi, dty) == 16 &&
(api_is_f128_type(g->c, sty) || api_is_f128_type(g->c, dty))) {
@@ -306,6 +314,23 @@ int api_i128_stack_top(KitCg* g, u32 depth) {
return api_is_i128_type(g->c, api_sv_type(&g->stack[g->sp - 1u - depth]));
}
+/* "Wider than the target machine word but NOT i128" — i.e. a 64-bit integer on
+ * a 32-bit target (rv32). The native backend handles add/sub/and/or/xor on such
+ * values as register pairs, but mul/div/shift must be lowered to a __*di3
+ * runtime call (see api_wideint64_binop). i128 routes through its own ti3 path
+ * (api_i128_*), so it is explicitly excluded here. */
+static int api_int_is_wide64(KitCg* g, KitCgTypeId ty) {
+ if (!g) return 0;
+ if (api_is_i128_type(g->c, ty)) return 0;
+ if (kit_cg_type_int_width((KitCompiler*)g->c, ty) == 0) return 0;
+ return abi_cg_sizeof(g->c->abi, ty) > g->c->target.ptr_size;
+}
+
+static int api_wide64_stack_top(KitCg* g, u32 depth) {
+ if (!g || g->sp <= depth) return 0;
+ return api_int_is_wide64(g, api_sv_type(&g->stack[g->sp - 1u - depth]));
+}
+
static int api_binop_is_shift(BinOp iop) {
return iop == BO_SHL || iop == BO_SHR_U || iop == BO_SHR_S;
}
@@ -353,6 +378,314 @@ static void api_i128_binop(KitCg* g, BinOp iop) {
api_runtime_call_values(g, name, i128, ps, 2, args);
}
+/* Runtime helper name for a 64-bit-integer mul/div/rem/shift on a 32-bit
+ * target. Mirrors api_i128_binop_helper but with the compiler-rt *di3 names.
+ * Returns NULL for ops the inline backend handles (add/sub/and/or/xor). */
+static const char* api_wideint64_binop_helper(BinOp op) {
+ switch (op) {
+ case BO_IMUL:
+ return "__muldi3";
+ case BO_SDIV:
+ return "__divdi3";
+ case BO_UDIV:
+ return "__udivdi3";
+ case BO_SREM:
+ return "__moddi3";
+ case BO_UREM:
+ return "__umoddi3";
+ case BO_SHL:
+ return "__ashldi3";
+ case BO_SHR_U:
+ return "__lshrdi3";
+ case BO_SHR_S:
+ return "__ashrdi3";
+ default:
+ return NULL;
+ }
+}
+
+/* Lower a 64-bit mul/div/rem/shift to a runtime call. Mirrors api_i128_binop
+ * but ret/params are builtin i64; the shift-count param is i32 (the __ashldi3
+ * family takes (i64 value, i32 count) per compiler-rt). */
+static void api_wideint64_binop(KitCg* g, BinOp iop) {
+ KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ const char* name = api_wideint64_binop_helper(iop);
+ KitCgTypeId ps[2];
+ ApiSValue args[2];
+ if (!name) {
+ compiler_panic(g->c, g->cur_loc, "KitCg: unsupported wide i64 binop");
+ return;
+ }
+ args[1] = api_pop(g);
+ args[0] = api_pop(g);
+ ps[0] = i64;
+ ps[1] = api_binop_is_shift(iop) ? i32 : i64;
+ api_runtime_call_values(g, name, i64, ps, 2, args);
+}
+
+/* ============================================================
+ * wide8 inline 2-word lane arithmetic (rv32 i64)
+ *
+ * On rv32 a 64-bit integer is a memory-resident 8-byte scalar. add/sub/and/or/
+ * xor/neg/not and compares have no compiler-rt helper (they would recurse), so
+ * they are emitted INLINE here as 32-bit lane ops, mirroring the i128 lane
+ * primitives but operating on register-class i32 lanes loaded from / stored to
+ * the value's memory home. mul/div/rem/shift route to __*di3 (api_wideint64_*).
+ * ============================================================ */
+
+static i32 wide8_lo_off(KitCg* g) { return g->c->target.big_endian ? 4 : 0; }
+static i32 wide8_hi_off(KitCg* g) { return g->c->target.big_endian ? 0 : 4; }
+
+/* Emit one i32 binop into a fresh temp and return it. */
+static Operand wide8_i32_binop(KitCg* g, BinOp op, Operand a, Operand b) {
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ CGLocal r = api_alloc_temp_local(g, i32);
+ Operand d = api_op_local(r, i32);
+ g->target->binop(g->target, op, d, a, b);
+ return d;
+}
+
+/* Emit one i32 compare (0/1 result) into a fresh temp and return it. */
+static Operand wide8_i32_cmp(KitCg* g, CmpOp op, Operand a, Operand b) {
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ CGLocal r = api_alloc_temp_local(g, i32);
+ Operand d = api_op_local(r, i32);
+ g->target->cmp(g->target, op, d, a, b);
+ return d;
+}
+
+/* (lo | hi) of the 8-byte value `v` as an i32, for a truthiness test. Consumes
+ * nothing on the value stack (caller owns *v). */
+Operand api_wide8_or_lanes(KitCg* g, ApiSValue* v, KitCgTypeId ty) {
+ Operand addr = api_wide8_addr(g, v, ty);
+ Operand lo = api_wide8_load_lane(g, addr, wide8_lo_off(g));
+ Operand hi = api_wide8_load_lane(g, addr, wide8_hi_off(g));
+ return wide8_i32_binop(g, BO_OR, lo, hi);
+}
+
+/* add/sub/and/or/xor on two 8-byte ints, result pushed as a fresh 8-byte value.
+ * add/sub carry/borrow through the high lane via an sltu (CMP_LT_U). */
+static void api_wide64_binop_inline(KitCg* g, BinOp iop) {
+ ApiSValue b = api_pop(g);
+ ApiSValue a = api_pop(g);
+ KitCgTypeId ty = a.type ? a.type : b.type;
+ int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
+ Operand aa = api_wide8_addr(g, &a, ty);
+ Operand ab = api_wide8_addr(g, &b, ty);
+ Operand alo = api_wide8_load_lane(g, aa, lo);
+ Operand ahi = api_wide8_load_lane(g, aa, hi);
+ Operand blo = api_wide8_load_lane(g, ab, lo);
+ Operand bhi = api_wide8_load_lane(g, ab, hi);
+ CGLocal res = api_wide8_temp_local(g, ty);
+ ApiSValue res_lv = api_make_lv(api_op_local(res, ty), ty);
+ Operand ar = api_lvalue_addr(g, &res_lv, cg_type_ptr_to(g->c, ty));
+ Operand rlo;
+ Operand rhi;
+ switch (iop) {
+ case BO_AND:
+ case BO_OR:
+ case BO_XOR:
+ rlo = wide8_i32_binop(g, iop, alo, blo);
+ rhi = wide8_i32_binop(g, iop, ahi, bhi);
+ break;
+ case BO_IADD: {
+ Operand carry;
+ rlo = wide8_i32_binop(g, BO_IADD, alo, blo);
+ carry = wide8_i32_cmp(g, CMP_LT_U, rlo, alo); /* unsigned wrap -> carry */
+ rhi = wide8_i32_binop(g, BO_IADD, ahi, bhi);
+ rhi = wide8_i32_binop(g, BO_IADD, rhi, carry);
+ break;
+ }
+ case BO_ISUB: {
+ Operand borrow = wide8_i32_cmp(g, CMP_LT_U, alo, blo);
+ rlo = wide8_i32_binop(g, BO_ISUB, alo, blo);
+ rhi = wide8_i32_binop(g, BO_ISUB, ahi, bhi);
+ rhi = wide8_i32_binop(g, BO_ISUB, rhi, borrow);
+ break;
+ }
+ default:
+ compiler_panic(g->c, g->cur_loc, "KitCg: unsupported wide i64 inline binop");
+ return;
+ }
+ api_wide8_store_lane(g, ar, lo, rlo);
+ api_wide8_store_lane(g, ar, hi, rhi);
+ api_release(g, &a);
+ api_release(g, &b);
+ api_push(g, api_make_sv(api_op_local(res, ty), ty));
+}
+
+/* neg / bnot on an 8-byte int. NEG is two's complement: lo = 0-lo with borrow
+ * into hi = 0-hi-borrow. BNOT is lane-wise xor -1. */
+static void api_wide64_unop_inline(KitCg* g, UnOp iop) {
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ ApiSValue a = api_pop(g);
+ KitCgTypeId ty = a.type ? a.type : a.op.type;
+ int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
+ Operand aa = api_wide8_addr(g, &a, ty);
+ Operand alo = api_wide8_load_lane(g, aa, lo);
+ Operand ahi = api_wide8_load_lane(g, aa, hi);
+ CGLocal res = api_wide8_temp_local(g, ty);
+ ApiSValue res_lv = api_make_lv(api_op_local(res, ty), ty);
+ Operand ar = api_lvalue_addr(g, &res_lv, cg_type_ptr_to(g->c, ty));
+ Operand rlo;
+ Operand rhi;
+ if (iop == UO_BNOT) {
+ rlo = wide8_i32_binop(g, BO_XOR, alo, api_op_imm(-1, i32));
+ rhi = wide8_i32_binop(g, BO_XOR, ahi, api_op_imm(-1, i32));
+ } else { /* UO_NEG: 0 - value */
+ Operand zero = api_op_imm(0, i32);
+ Operand borrow = wide8_i32_cmp(g, CMP_LT_U, zero, alo); /* 0<lo -> borrow */
+ rlo = wide8_i32_binop(g, BO_ISUB, zero, alo);
+ rhi = wide8_i32_binop(g, BO_ISUB, zero, ahi);
+ rhi = wide8_i32_binop(g, BO_ISUB, rhi, borrow);
+ }
+ api_wide8_store_lane(g, ar, lo, rlo);
+ api_wide8_store_lane(g, ar, hi, rhi);
+ api_release(g, &a);
+ api_push(g, api_make_sv(api_op_local(res, ty), ty));
+}
+
+/* a < b over 8-byte lanes: (a_hi <{s,u} b_hi) | (a_hi==b_hi & a_lo <u b_lo).
+ * The high lane uses the signed/unsigned relation; the low lane is always
+ * unsigned. Returns an i32 0/1. */
+static Operand wide8_lt(KitCg* g, int is_signed, Operand alo, Operand ahi,
+ Operand blo, Operand bhi) {
+ Operand hi_lt = wide8_i32_cmp(g, is_signed ? CMP_LT_S : CMP_LT_U, ahi, bhi);
+ Operand hi_eq = wide8_i32_cmp(g, CMP_EQ, ahi, bhi);
+ Operand lo_lt = wide8_i32_cmp(g, CMP_LT_U, alo, blo);
+ Operand t = wide8_i32_binop(g, BO_AND, hi_eq, lo_lt);
+ return wide8_i32_binop(g, BO_OR, hi_lt, t);
+}
+
+static Operand wide8_eq(KitCg* g, Operand alo, Operand ahi, Operand blo,
+ Operand bhi) {
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ Operand dlo = wide8_i32_binop(g, BO_XOR, alo, blo);
+ Operand dhi = wide8_i32_binop(g, BO_XOR, ahi, bhi);
+ Operand diff = wide8_i32_binop(g, BO_OR, dlo, dhi);
+ return wide8_i32_cmp(g, CMP_EQ, diff, api_op_imm(0, i32));
+}
+
+static int cmp_is_signed_rel(CmpOp op) {
+ return op == CMP_LT_S || op == CMP_LE_S || op == CMP_GT_S || op == CMP_GE_S;
+}
+
+/* 8-byte int compare -> eager i32 0/1 value (not a delayed SV_CMP). */
+static void api_wide64_cmp_inline(KitCg* g, CmpOp cop) {
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ int sg = cmp_is_signed_rel(cop);
+ ApiSValue b = api_pop(g);
+ ApiSValue a = api_pop(g);
+ KitCgTypeId ty = a.type ? a.type : b.type;
+ int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
+ Operand aa = api_wide8_addr(g, &a, ty);
+ Operand ab = api_wide8_addr(g, &b, ty);
+ Operand alo = api_wide8_load_lane(g, aa, lo);
+ Operand ahi = api_wide8_load_lane(g, aa, hi);
+ Operand blo = api_wide8_load_lane(g, ab, lo);
+ Operand bhi = api_wide8_load_lane(g, ab, hi);
+ Operand one = api_op_imm(1, i32);
+ Operand res;
+ switch (cop) {
+ case CMP_EQ:
+ res = wide8_eq(g, alo, ahi, blo, bhi);
+ break;
+ case CMP_NE:
+ res = wide8_i32_binop(g, BO_XOR, wide8_eq(g, alo, ahi, blo, bhi), one);
+ break;
+ case CMP_LT_S:
+ case CMP_LT_U:
+ res = wide8_lt(g, sg, alo, ahi, blo, bhi);
+ break;
+ case CMP_GT_S:
+ case CMP_GT_U: /* a>b == b<a */
+ res = wide8_lt(g, sg, blo, bhi, alo, ahi);
+ break;
+ case CMP_LE_S:
+ case CMP_LE_U: /* a<=b == !(b<a) */
+ res = wide8_i32_binop(g, BO_XOR, wide8_lt(g, sg, blo, bhi, alo, ahi), one);
+ break;
+ case CMP_GE_S:
+ case CMP_GE_U: /* a>=b == !(a<b) */
+ res = wide8_i32_binop(g, BO_XOR, wide8_lt(g, sg, alo, ahi, blo, bhi), one);
+ break;
+ default:
+ compiler_panic(g->c, g->cur_loc, "KitCg: unsupported wide i64 compare");
+ return;
+ }
+ api_release(g, &a);
+ api_release(g, &b);
+ api_push(g, api_make_sv(res, i32));
+}
+
+/* int<->i64 conversions on rv32 (sext/zext/trunc/bitcast across the 4<->8
+ * boundary, and i64->bool). Returns 1 if it handled (and consumed) *v. The
+ * i64<->float conversions are routed to libcalls in kit_cg_*_to_float /
+ * kit_cg_float_to_* and never reach here. */
+int api_try_wide8_convert(KitCg* g, ConvKind ck, KitCgTypeId sty,
+ KitCgTypeId dty, ApiSValue* v) {
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ int s_wide = api_is_wide8_scalar_type(g->c, sty);
+ int d_wide = api_is_wide8_scalar_type(g->c, dty);
+ int lo = wide8_lo_off(g), hi = wide8_hi_off(g);
+ if (!s_wide && !d_wide) return 0;
+ if (s_wide && d_wide) {
+ /* i64<->soft-double reinterpret (same 8-byte layout) or i64<->u64. */
+ v->type = dty;
+ v->op.type = dty;
+ api_push(g, *v);
+ return 1;
+ }
+ if (d_wide) {
+ /* narrower int -> i64: low lane is the (converted-to-i32) source; high lane
+ * is the sign-extension (CV_SEXT) or zero (CV_ZEXT/CV_BITCAST of a ptr). */
+ int sext = (ck == CV_SEXT);
+ Operand src32;
+ CGLocal res;
+ ApiSValue res_lv;
+ Operand ar;
+ Operand hival;
+ if (api_unalias_type(g->c, sty) != i32) {
+ api_push(g, *v);
+ api_cg_convert_kind(g, i32, ck == CV_SEXT ? CV_SEXT : CV_ZEXT);
+ *v = api_pop(g);
+ }
+ src32 = api_force_local(g, v, i32);
+ res = api_wide8_temp_local(g, dty);
+ res_lv = api_make_lv(api_op_local(res, dty), dty);
+ ar = api_lvalue_addr(g, &res_lv, cg_type_ptr_to(g->c, dty));
+ api_wide8_store_lane(g, ar, lo, src32);
+ if (sext)
+ hival = wide8_i32_binop(g, BO_SHR_S, src32, api_op_imm(31, i32));
+ else
+ hival = api_op_imm(0, i32);
+ api_wide8_store_lane(g, ar, hi, hival);
+ api_release(g, v);
+ api_push(g, api_make_sv(api_op_local(res, dty), dty));
+ return 1;
+ }
+ /* s_wide: i64 -> narrower. _Bool is "any bit set"; else take the low lane and
+ * truncate/convert further. */
+ if (api_is_bool_type(g->c, dty)) {
+ Operand orl = api_wide8_or_lanes(g, v, sty);
+ api_release(g, v);
+ api_push(g, api_make_sv(orl, i32));
+ kit_cg_push_int(g, 0, i32);
+ api_cg_cmp(g, CMP_NE);
+ api_cg_convert_kind(g, dty, CV_TRUNC);
+ return 1;
+ }
+ {
+ Operand addr = api_wide8_addr(g, v, sty);
+ Operand lolane = api_wide8_load_lane(g, addr, lo);
+ api_release(g, v);
+ api_push(g, api_make_sv(lolane, i32));
+ if (api_unalias_type(g->c, dty) != i32) api_cg_convert_kind(g, dty, CV_TRUNC);
+ return 1;
+ }
+}
+
static void api_i128_unop(KitCg* g, UnOp iop) {
KitCgTypeId i128 = builtin_id(KIT_CG_BUILTIN_I128);
const char* name = NULL;
@@ -483,6 +816,17 @@ void kit_cg_int_binop(KitCg* g, KitCgIntBinOp op, uint32_t flags) {
api_i128_binop(g, iop);
return;
}
+ /* 64-bit int on a 32-bit target (rv32): mul/div/rem/shift become __*di3
+ * runtime calls; add/sub/and/or/xor are emitted inline as 2-word lane ops
+ * (no compiler-rt helper exists for them). Both keep the value memory-resident
+ * so the allocator never tries to put 8 bytes in one 4-byte register. */
+ if (g && (api_wide64_stack_top(g, 0) || api_wide64_stack_top(g, 1))) {
+ if (api_wideint64_binop_helper(iop))
+ api_wideint64_binop(g, iop);
+ else
+ api_wide64_binop_inline(g, iop);
+ return;
+ }
api_cg_binop(g, iop, flags);
}
@@ -492,6 +836,25 @@ void kit_cg_int_unop(KitCg* g, KitCgIntUnOp op, uint32_t flags) {
api_i128_unop(g, iop);
return;
}
+ /* rv32 64-bit int: neg/bnot are inline 2-word lane ops; logical-not (!x) is
+ * the full-value truthiness test (lo|hi)==0. */
+ if (g && api_wide64_stack_top(g, 0)) {
+ if (iop == UO_NEG || iop == UO_BNOT) {
+ api_wide64_unop_inline(g, iop);
+ return;
+ }
+ if (iop == UO_NOT) {
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ ApiSValue v = api_pop(g);
+ KitCgTypeId ty = v.type ? v.type : v.op.type;
+ Operand orl = api_wide8_or_lanes(g, &v, ty);
+ api_release(g, &v);
+ api_push(g, api_make_sv(orl, i32));
+ kit_cg_push_int(g, 0, i32);
+ api_cg_cmp(g, CMP_EQ);
+ return;
+ }
+ }
api_cg_unop(g, iop, flags);
}
@@ -501,6 +864,10 @@ void kit_cg_int_cmp(KitCg* g, KitCgIntCmpOp op) {
api_i128_cmp(g, cop);
return;
}
+ if (g && (api_wide64_stack_top(g, 0) || api_wide64_stack_top(g, 1))) {
+ api_wide64_cmp_inline(g, cop);
+ return;
+ }
api_cg_cmp(g, cop);
}
@@ -559,11 +926,75 @@ const char* api_f128_binop_helper(KitCgFpBinOp op) {
return NULL;
}
+/* Runtime helper name for double (f64) arithmetic on a target that lacks a
+ * hardware double unit. Mirrors api_f128_binop_helper with the __*df3 names. */
+static const char* api_softdf_binop_helper(KitCgFpBinOp op) {
+ switch (op) {
+ case KIT_CG_FP_ADD:
+ return "__adddf3";
+ case KIT_CG_FP_SUB:
+ return "__subdf3";
+ case KIT_CG_FP_MUL:
+ return "__muldf3";
+ case KIT_CG_FP_DIV:
+ return "__divdf3";
+ }
+ return NULL;
+}
+
int api_f128_stack_top(KitCg* g, u32 depth) {
if (!g || g->sp <= depth) return 0;
return api_is_f128_type(g->c, api_sv_type(&g->stack[g->sp - 1u - depth]));
}
+/* True when the target has no hardware double: float_abi is SOFT (ilp32/lp64,
+ * no FP regs) or SINGLE (ilp32f/lp64f, only float in FP regs — double is always
+ * soft). DOUBLE (rv64 lp64d) and DEFAULT (x64/aa64 hardware-double targets that
+ * never set float_abi) keep the inline hardware path, so existing rv64/x64/aa64
+ * codegen is unchanged. */
+static int api_target_double_is_soft(KitCg* g) {
+ if (!g) return 0;
+ return g->c->target.float_abi == KIT_FLOAT_ABI_SOFT ||
+ g->c->target.float_abi == KIT_FLOAT_ABI_SINGLE;
+}
+
+/* True when ty is a 64-bit float (double) AND the target lacks hardware double.
+ * f128 is handled by the separate api_f128_* path, so width must be exactly 64. */
+static int api_type_is_soft_double(KitCg* g, KitCgTypeId ty) {
+ if (!api_target_double_is_soft(g)) return 0;
+ return kit_cg_type_float_width((KitCompiler*)g->c, ty) == 64;
+}
+
+static int api_soft_double_stack_top(KitCg* g, u32 depth) {
+ if (!g || g->sp <= depth) return 0;
+ return api_type_is_soft_double(g, api_sv_type(&g->stack[g->sp - 1u - depth]));
+}
+
+/* f32 under pure-soft ilp32/lp64 (float_abi SOFT, no FP unit): single-precision
+ * arithmetic/compare/convert is also a libcall. Under SINGLE (ilp32f) float is
+ * hardware (fadd.s etc.) so this is false; DOUBLE/DEFAULT keep hardware too. */
+static int api_type_is_soft_single(KitCg* g, KitCgTypeId ty) {
+ if (!g || g->c->target.float_abi != KIT_FLOAT_ABI_SOFT) return 0;
+ return kit_cg_type_float_width((KitCompiler*)g->c, ty) == 32;
+}
+
+static int api_soft_single_stack_top(KitCg* g, u32 depth) {
+ if (!g || g->sp <= depth) return 0;
+ return api_type_is_soft_single(g, api_sv_type(&g->stack[g->sp - 1u - depth]));
+}
+
+/* Runtime helper for f32 arithmetic on a soft-float target (mirrors
+ * api_softdf_binop_helper with the __*sf3 names). */
+static const char* api_softsf_binop_helper(KitCgFpBinOp op) {
+ switch (op) {
+ case KIT_CG_FP_ADD: return "__addsf3";
+ case KIT_CG_FP_SUB: return "__subsf3";
+ case KIT_CG_FP_MUL: return "__mulsf3";
+ case KIT_CG_FP_DIV: return "__divsf3";
+ }
+ return NULL;
+}
+
void api_f128_call_unary(KitCg* g, const char* name, KitCgTypeId ret,
KitCgTypeId param) {
ApiSValue args[1];
@@ -589,6 +1020,34 @@ void kit_cg_fp_binop(KitCg* g, KitCgFpBinOp op, uint32_t flags) {
api_runtime_call_values(g, name, f128, ps, 2, args);
return;
}
+ if (api_soft_double_stack_top(g, 0) || api_soft_double_stack_top(g, 1)) {
+ KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
+ KitCgTypeId ps[2];
+ ApiSValue args[2];
+ const char* name = api_softdf_binop_helper(op);
+ if (!name)
+ compiler_panic(g->c, g->cur_loc, "KitCg: unsupported soft double binop");
+ args[1] = api_pop(g);
+ args[0] = api_pop(g);
+ ps[0] = f64;
+ ps[1] = f64;
+ api_runtime_call_values(g, name, f64, ps, 2, args);
+ return;
+ }
+ if (api_soft_single_stack_top(g, 0) || api_soft_single_stack_top(g, 1)) {
+ KitCgTypeId f32 = builtin_id(KIT_CG_BUILTIN_F32);
+ KitCgTypeId ps[2];
+ ApiSValue args[2];
+ const char* name = api_softsf_binop_helper(op);
+ if (!name)
+ compiler_panic(g->c, g->cur_loc, "KitCg: unsupported soft single binop");
+ args[1] = api_pop(g);
+ args[0] = api_pop(g);
+ ps[0] = f32;
+ ps[1] = f32;
+ api_runtime_call_values(g, name, f32, ps, 2, args);
+ return;
+ }
api_cg_binop(g, api_map_fp_binop(op), 0);
}
@@ -603,19 +1062,33 @@ void kit_cg_fp_unop(KitCg* g, KitCgFpUnOp op, uint32_t flags) {
api_f128_call_unary(g, "__negtf2", f128, f128);
return;
}
+ /* Soft float has no FP unit, so negation is a libcall too (the inline FNEG
+ * path emits fsgnj on an FP register, which does not exist here). */
+ if (api_soft_double_stack_top(g, 0)) {
+ KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
+ api_f128_call_unary(g, "__negdf2", f64, f64);
+ return;
+ }
+ if (api_soft_single_stack_top(g, 0)) {
+ KitCgTypeId f32 = builtin_id(KIT_CG_BUILTIN_F32);
+ api_f128_call_unary(g, "__negsf2", f32, f32);
+ return;
+ }
api_cg_unop(g, UO_FNEG, 0);
}
-/* f128 single-libcall comparison: call `name(a,b)` and test its i32 three-way
- * result against 0 with `icmp`. Consumes the two f128 operands on the stack and
- * pushes the i32 boolean. */
-static void api_f128_cmp_call(KitCg* g, const char* name, CmpOp icmp) {
- KitCgTypeId f128 = builtin_id(KIT_CG_BUILTIN_F128);
+/* Soft-float single-libcall comparison: call `name(a,b)` (both operands of type
+ * `opty`) and test its i32 three-way result against 0 with `icmp`. Consumes the
+ * two operands on the stack and pushes the i32 boolean. Shared by the f128 (tf)
+ * and soft-double (df) paths — only the helper name and operand type differ; the
+ * compiler-rt NaN-sign convention is identical for both. */
+static void api_softfp_cmp_call(KitCg* g, const char* name, KitCgTypeId opty,
+ CmpOp icmp) {
KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
KitCgTypeId ps[2];
ApiSValue args[2];
- ps[0] = f128;
- ps[1] = f128;
+ ps[0] = opty;
+ ps[1] = opty;
args[1] = api_pop(g);
args[0] = api_pop(g);
api_runtime_call_values(g, name, i32, ps, 2, args);
@@ -623,81 +1096,116 @@ static void api_f128_cmp_call(KitCg* g, const char* name, CmpOp icmp) {
api_cg_cmp(g, icmp);
}
-/* UEQ and ONE are the only f128 predicates that cannot be a single libcall:
- * "equal" and "unordered" both yield a nonzero magnitude from __eqtf2/__netf2,
- * so they need a separate __unordtf2 to split them.
- * UEQ = (__eqtf2(a,b) == 0) || (__unordtf2(a,b) != 0)
- * ONE = (__netf2(a,b) != 0) && (__unordtf2(a,b) == 0)
+/* UEQ and ONE are the only soft-float predicates that cannot be a single
+ * libcall: "equal" and "unordered" both yield a nonzero magnitude from
+ * __eq*2/__ne*2, so they need a separate __unord*2 to split them.
+ * UEQ = (__eq*2(a,b) == 0) || (__unord*2(a,b) != 0)
+ * ONE = (__ne*2(a,b) != 0) && (__unord*2(a,b) == 0)
+ * `suffix` is "tf" (f128) or "df" (double); `opty` the matching operand type.
* The operands are dup'd (kit_cg_dup copies into a fresh owned local) so each
* libcall consumes its own copy. */
-static void api_f128_cmp_with_unord(KitCg* g, KitCgFpCmpOp op) {
- const char* relname = (op == KIT_CG_FP_UEQ) ? "__eqtf2" : "__netf2";
+static void api_softfp_cmp_with_unord(KitCg* g, KitCgFpCmpOp op,
+ const char* suffix, KitCgTypeId opty) {
+ char relname[16];
+ char unordname[16];
CmpOp relcmp = (op == KIT_CG_FP_UEQ) ? CMP_EQ : CMP_NE;
+ const char* rel = (op == KIT_CG_FP_UEQ) ? "eq" : "ne";
+ snprintf(relname, sizeof relname, "__%s%s2", rel, suffix);
+ snprintf(unordname, sizeof unordname, "__unord%s2", suffix);
/* [a, b] -> [a, b, a, b] */
kit_cg_dup2(g);
/* relation on the top (dup'd) copy: [a, b, R] */
- api_f128_cmp_call(g, relname, relcmp);
+ api_softfp_cmp_call(g, relname, opty, relcmp);
/* bring the original a, b back to TOS with R underneath: [R, a, b] */
kit_cg_rot3(g);
kit_cg_rot3(g);
if (op == KIT_CG_FP_UEQ) {
- api_f128_cmp_call(g, "__unordtf2", CMP_NE); /* [R, unordered?] */
- api_cg_binop(g, BO_OR, 0); /* R || unordered */
+ api_softfp_cmp_call(g, unordname, opty, CMP_NE); /* [R, unordered?] */
+ api_cg_binop(g, BO_OR, 0); /* R || unordered */
} else {
- api_f128_cmp_call(g, "__unordtf2", CMP_EQ); /* [R, ordered?] */
- api_cg_binop(g, BO_AND, 0); /* R && ordered */
+ api_softfp_cmp_call(g, unordname, opty, CMP_EQ); /* [R, ordered?] */
+ api_cg_binop(g, BO_AND, 0); /* R && ordered */
+ }
+}
+
+/* Emit a soft-float comparison for either f128 (suffix "tf", opty f128) or
+ * soft double (suffix "df", opty f64). The predicate->helper mapping and the
+ * compiler-rt NaN-sign convention are XLEN/width-neutral, so a single body
+ * serves both — only the suffix and operand type vary. */
+static void api_softfp_cmp(KitCg* g, KitCgFpCmpOp op, const char* suffix,
+ KitCgTypeId opty) {
+ char name[16];
+ switch (op) {
+ case KIT_CG_FP_OEQ:
+ snprintf(name, sizeof name, "__eq%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_EQ);
+ return;
+ case KIT_CG_FP_UNE:
+ snprintf(name, sizeof name, "__ne%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_NE);
+ return;
+ case KIT_CG_FP_OLT:
+ snprintf(name, sizeof name, "__lt%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_LT_S);
+ return;
+ case KIT_CG_FP_OLE:
+ snprintf(name, sizeof name, "__le%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_LE_S);
+ return;
+ case KIT_CG_FP_OGT:
+ snprintf(name, sizeof name, "__gt%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_GT_S);
+ return;
+ case KIT_CG_FP_OGE:
+ snprintf(name, sizeof name, "__ge%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_GE_S);
+ return;
+ /* unordered duals via the opposite-sign helper (NaN flips the test): */
+ case KIT_CG_FP_UGE:
+ snprintf(name, sizeof name, "__lt%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_GE_S);
+ return;
+ case KIT_CG_FP_UGT:
+ snprintf(name, sizeof name, "__le%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_GT_S);
+ return;
+ case KIT_CG_FP_ULT:
+ snprintf(name, sizeof name, "__ge%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_LT_S);
+ return;
+ case KIT_CG_FP_ULE:
+ snprintf(name, sizeof name, "__gt%s2", suffix);
+ api_softfp_cmp_call(g, name, opty, CMP_LE_S);
+ return;
+ case KIT_CG_FP_UEQ:
+ case KIT_CG_FP_ONE:
+ api_softfp_cmp_with_unord(g, op, suffix, opty);
+ return;
}
}
void kit_cg_fp_cmp(KitCg* g, KitCgFpCmpOp op) {
+ /* f128/long double and soft double are both soft-float: the comparison is a
+ * libcall returning a three-way i32 we test against 0. kit's runtime uses the
+ * standard compiler-rt sign convention (rt/lib/impl/fp_compare_impl.inc):
+ * __le-family (__eq*2/__ne*2/__lt*2/__le*2): NaN -> +1
+ * __ge-family (__ge*2/__gt*2): NaN -> -1
+ * so each ordered predicate AND its unordered dual maps to one libcall,
+ * choosing the helper whose NaN sign makes the integer test fall the right
+ * way (ordered: NaN must fail; unordered: NaN must pass). Only UEQ/ONE, which
+ * must split "equal" from "unordered", need a second (__unord*2) call. The
+ * convention is width-neutral, so the same logic drives the tf and df
+ * suffixes via api_softfp_cmp. */
if (api_f128_stack_top(g, 0) || api_f128_stack_top(g, 1)) {
- /* f128/long double is soft-float: the comparison is a libcall returning a
- * three-way i32 we test against 0. kit's runtime uses the standard
- * compiler-rt sign convention (rt/lib/impl/fp_compare_impl.inc):
- * __le-family (__eqtf2/__netf2/__lttf2/__letf2): NaN -> +1
- * __ge-family (__getf2/__gttf2): NaN -> -1
- * so each ordered predicate AND its unordered dual maps to one libcall,
- * choosing the helper whose NaN sign makes the integer test fall the right
- * way (ordered: NaN must fail; unordered: NaN must pass). Only UEQ/ONE,
- * which must split "equal" from "unordered", need a second (__unordtf2)
- * call. */
- switch (op) {
- case KIT_CG_FP_OEQ:
- api_f128_cmp_call(g, "__eqtf2", CMP_EQ);
- return;
- case KIT_CG_FP_UNE:
- api_f128_cmp_call(g, "__netf2", CMP_NE);
- return;
- case KIT_CG_FP_OLT:
- api_f128_cmp_call(g, "__lttf2", CMP_LT_S);
- return;
- case KIT_CG_FP_OLE:
- api_f128_cmp_call(g, "__letf2", CMP_LE_S);
- return;
- case KIT_CG_FP_OGT:
- api_f128_cmp_call(g, "__gttf2", CMP_GT_S);
- return;
- case KIT_CG_FP_OGE:
- api_f128_cmp_call(g, "__getf2", CMP_GE_S);
- return;
- /* unordered duals via the opposite-sign helper (NaN flips the test): */
- case KIT_CG_FP_UGE:
- api_f128_cmp_call(g, "__lttf2", CMP_GE_S);
- return;
- case KIT_CG_FP_UGT:
- api_f128_cmp_call(g, "__letf2", CMP_GT_S);
- return;
- case KIT_CG_FP_ULT:
- api_f128_cmp_call(g, "__getf2", CMP_LT_S);
- return;
- case KIT_CG_FP_ULE:
- api_f128_cmp_call(g, "__gttf2", CMP_LE_S);
- return;
- case KIT_CG_FP_UEQ:
- case KIT_CG_FP_ONE:
- api_f128_cmp_with_unord(g, op);
- return;
- }
+ api_softfp_cmp(g, op, "tf", builtin_id(KIT_CG_BUILTIN_F128));
+ return;
+ }
+ if (api_soft_double_stack_top(g, 0) || api_soft_double_stack_top(g, 1)) {
+ api_softfp_cmp(g, op, "df", builtin_id(KIT_CG_BUILTIN_F64));
+ return;
+ }
+ if (api_soft_single_stack_top(g, 0) || api_soft_single_stack_top(g, 1)) {
+ api_softfp_cmp(g, op, "sf", builtin_id(KIT_CG_BUILTIN_F32));
return;
}
api_cg_cmp(g, api_map_fp_cmp(op));
@@ -738,6 +1246,14 @@ void kit_cg_fpext(KitCg* g, KitCgTypeId dst) {
api_f128_call_unary(g, name, dty, sty);
return;
}
+ /* float -> soft double: runtime widen via __extendsfdf2. */
+ if (api_type_is_soft_double(g, dty)) {
+ ApiSValue v = api_pop(g);
+ KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
+ api_push(g, v);
+ api_f128_call_unary(g, "__extendsfdf2", dty, sty);
+ return;
+ }
api_cg_convert_kind(g, dst, CV_FEXT);
}
@@ -752,6 +1268,14 @@ void kit_cg_fptrunc(KitCg* g, KitCgTypeId dst) {
api_f128_call_unary(g, name, dty, f128);
return;
}
+ /* soft double -> float: runtime narrow via __truncdfsf2. */
+ if (api_soft_double_stack_top(g, 0)) {
+ ApiSValue v = api_pop(g);
+ KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
+ api_push(g, v);
+ api_f128_call_unary(g, "__truncdfsf2", dty, sty);
+ return;
+ }
api_cg_convert_kind(g, dst, CV_FTRUNC);
}
@@ -770,6 +1294,33 @@ void kit_cg_sint_to_float(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
return;
}
+ /* signed int -> soft double: __floatsidf (i32) / __floatdidf (i64). */
+ if (api_type_is_soft_double(g, resolve_type(g->c, dst))) {
+ ApiSValue v = api_pop(g);
+ KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
+ u32 sz = (u32)abi_cg_sizeof(g->c->abi, sty);
+ KitCgTypeId pty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
+ : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
+ : builtin_id(KIT_CG_BUILTIN_I32));
+ const char* name =
+ sz > 8 ? "__floattidf" : (sz > 4 ? "__floatdidf" : "__floatsidf");
+ api_push(g, v);
+ api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
+ return;
+ }
+ /* signed i64 -> hardware single float: rv32 has no fcvt.s.l, so the i64->f32
+ * conversion is a __floatdisf runtime call (mirrors clang under ilp32f). */
+ if (api_wide64_stack_top(g, 0)) {
+ api_f128_call_unary(g, "__floatdisf", resolve_type(g->c, dst),
+ builtin_id(KIT_CG_BUILTIN_I64));
+ return;
+ }
+ /* i32 -> soft single float (ilp32, no FPU): __floatsisf. */
+ if (api_type_is_soft_single(g, resolve_type(g->c, dst))) {
+ api_f128_call_unary(g, "__floatsisf", resolve_type(g->c, dst),
+ builtin_id(KIT_CG_BUILTIN_I32));
+ return;
+ }
api_cg_convert_kind(g, dst, CV_ITOF_S);
}
@@ -788,6 +1339,32 @@ void kit_cg_uint_to_float(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
return;
}
+ /* unsigned int -> soft double: __floatunsidf (i32) / __floatundidf (i64). */
+ if (api_type_is_soft_double(g, resolve_type(g->c, dst))) {
+ ApiSValue v = api_pop(g);
+ KitCgTypeId sty = api_unalias_type(g->c, api_sv_type(&v));
+ u32 sz = (u32)abi_cg_sizeof(g->c->abi, sty);
+ KitCgTypeId pty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
+ : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
+ : builtin_id(KIT_CG_BUILTIN_I32));
+ const char* name =
+ sz > 8 ? "__floatuntidf" : (sz > 4 ? "__floatundidf" : "__floatunsidf");
+ api_push(g, v);
+ api_f128_call_unary(g, name, resolve_type(g->c, dst), pty);
+ return;
+ }
+ /* unsigned i64 -> hardware single float: __floatundisf. */
+ if (api_wide64_stack_top(g, 0)) {
+ api_f128_call_unary(g, "__floatundisf", resolve_type(g->c, dst),
+ builtin_id(KIT_CG_BUILTIN_I64));
+ return;
+ }
+ /* u32 -> soft single float (ilp32, no FPU): __floatunsisf. */
+ if (api_type_is_soft_single(g, resolve_type(g->c, dst))) {
+ api_f128_call_unary(g, "__floatunsisf", resolve_type(g->c, dst),
+ builtin_id(KIT_CG_BUILTIN_I32));
+ return;
+ }
api_cg_convert_kind(g, dst, CV_ITOF_U);
}
@@ -805,6 +1382,34 @@ void kit_cg_float_to_sint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
return;
}
+ /* soft double -> signed int: __fixdfsi (i32) / __fixdfdi (i64). */
+ if (api_soft_double_stack_top(g, 0)) {
+ KitCgTypeId dty = resolve_type(g->c, dst);
+ KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
+ u32 sz = (u32)abi_cg_sizeof(g->c->abi, dty);
+ KitCgTypeId rty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
+ : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
+ : builtin_id(KIT_CG_BUILTIN_I32));
+ const char* name =
+ sz > 8 ? "__fixdfti" : (sz > 4 ? "__fixdfdi" : "__fixdfsi");
+ api_f128_call_unary(g, name, rty, f64);
+ if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
+ return;
+ }
+ /* hardware single float -> i64: rv32 has no fcvt.l.s, so __fixsfdi. */
+ if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) {
+ api_f128_call_unary(g, "__fixsfdi", resolve_type(g->c, dst),
+ builtin_id(KIT_CG_BUILTIN_F32));
+ return;
+ }
+ /* soft single float -> signed int <=32 (ilp32, no FPU): __fixsfsi. */
+ if (api_soft_single_stack_top(g, 0)) {
+ KitCgTypeId dty = resolve_type(g->c, dst);
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ api_f128_call_unary(g, "__fixsfsi", i32, builtin_id(KIT_CG_BUILTIN_F32));
+ if (i32 != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
+ return;
+ }
api_cg_convert_kind(g, dst, CV_FTOI_S);
}
@@ -822,6 +1427,34 @@ void kit_cg_float_to_uint(KitCg* g, KitCgTypeId dst, KitCgRounding rounding) {
if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
return;
}
+ /* soft double -> unsigned int: __fixunsdfsi (i32) / __fixunsdfdi (i64). */
+ if (api_soft_double_stack_top(g, 0)) {
+ KitCgTypeId dty = resolve_type(g->c, dst);
+ KitCgTypeId f64 = builtin_id(KIT_CG_BUILTIN_F64);
+ u32 sz = (u32)abi_cg_sizeof(g->c->abi, dty);
+ KitCgTypeId rty = sz > 8 ? builtin_id(KIT_CG_BUILTIN_I128)
+ : (sz > 4 ? builtin_id(KIT_CG_BUILTIN_I64)
+ : builtin_id(KIT_CG_BUILTIN_I32));
+ const char* name =
+ sz > 8 ? "__fixunsdfti" : (sz > 4 ? "__fixunsdfdi" : "__fixunsdfsi");
+ api_f128_call_unary(g, name, rty, f64);
+ if (rty != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
+ return;
+ }
+ /* hardware single float -> u64: __fixunssfdi. */
+ if (api_is_wide8_scalar_type(g->c, resolve_type(g->c, dst))) {
+ api_f128_call_unary(g, "__fixunssfdi", resolve_type(g->c, dst),
+ builtin_id(KIT_CG_BUILTIN_F32));
+ return;
+ }
+ /* soft single float -> unsigned int <=32 (ilp32, no FPU): __fixunssfsi. */
+ if (api_soft_single_stack_top(g, 0)) {
+ KitCgTypeId dty = resolve_type(g->c, dst);
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ api_f128_call_unary(g, "__fixunssfsi", i32, builtin_id(KIT_CG_BUILTIN_F32));
+ if (i32 != dty) api_cg_convert_kind(g, dty, CV_TRUNC);
+ return;
+ }
api_cg_convert_kind(g, dst, CV_FTOI_U);
}
@@ -939,6 +1572,29 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs,
u32 ndst = 0;
Heap* h;
if (!g) return;
+ /* rv32: clz/ctz/popcount/bswap on a 64-bit value cannot be the single-register
+ * software sequence the backend emits (it would shift by 32 — an illegal rv32
+ * shamt). Route them to the compiler-rt __*di2 helpers, which decompose into
+ * 32-bit operations. (32-bit forms still lower inline.) */
+ if (nargs == 1 && api_wide64_stack_top(g, 0)) {
+ const char* name = NULL;
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ KitCgTypeId i64 = builtin_id(KIT_CG_BUILTIN_I64);
+ KitCgTypeId ret = i32;
+ switch (intrin) {
+ case KIT_CG_INTRIN_CLZ: name = "__clzdi2"; break;
+ case KIT_CG_INTRIN_CTZ: name = "__ctzdi2"; break;
+ case KIT_CG_INTRIN_POPCOUNT: name = "__popcountdi2"; break;
+ case KIT_CG_INTRIN_BSWAP: name = "__bswapdi2"; ret = i64; break;
+ default: break;
+ }
+ if (name) {
+ ApiSValue arg = api_pop(g);
+ KitCgTypeId ps[1] = {i64};
+ api_runtime_call_values(g, name, ret, ps, 1, &arg);
+ return;
+ }
+ }
T = g->target;
h = g->c->ctx->heap;
rty = resolve_type(g->c, result_type);
diff --git a/src/cg/call.c b/src/cg/call.c
@@ -48,6 +48,12 @@ static CGLocal api_materialize_call_local(KitCg* g, ApiSValue* arg,
if (api_sv_op_is(arg, OPK_IMM) && api_is_wide16_scalar_type(g->c, ty)) {
*arg = api_make_wide16_int_const(g, arg->op.v.imm, ty);
}
+ /* Same for an rv32 8-byte immediate argument: materialize it as a 2-lane
+ * memory value so the multi-part ABI path marshals both words into the GPR
+ * pair, instead of load_imm'ing only the low word into one register. */
+ if (api_sv_op_is(arg, OPK_IMM) && api_is_wide8_scalar_type(g->c, ty)) {
+ *arg = api_make_wide8_int_const(g, arg->op.v.imm, ty);
+ }
op = api_force_local_unless_imm(g, arg, src_ty);
if (op.kind == OPK_LOCAL &&
api_unalias_type(g->c, op.type) == api_unalias_type(g->c, ty)) {
diff --git a/src/cg/control.c b/src/cg/control.c
@@ -50,6 +50,17 @@ void api_branch_if(KitCg* g, ApiSValue* v, int branch_when_true, Label label) {
api_branch_if(g, &cmp, branch_when_true, label);
return;
}
+ /* rv32 8-byte int (or soft double) truthiness: branch on (lo | hi) != 0. The
+ * value is memory-resident, so a single-register CMP_NE-vs-zero would only see
+ * the low word; OR the two lanes into an i32 first. */
+ if (api_is_wide8_scalar_type(g->c, ty)) {
+ KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32);
+ Operand orl = api_wide8_or_lanes(g, v, ty);
+ Operand zero = api_op_imm(0, i32);
+ T->cmp_branch(T, branch_when_true ? CMP_NE : CMP_EQ, orl, zero, label);
+ api_release(g, v);
+ return;
+ }
{
Operand a = api_force_local(g, v, ty);
Operand zero = api_op_imm(0, ty);
diff --git a/src/cg/internal.h b/src/cg/internal.h
@@ -373,6 +373,7 @@ int api_type_is_float(Compiler* c, KitCgTypeId ty);
int api_is_f128_type(Compiler* c, KitCgTypeId ty);
int api_is_i128_type(Compiler* c, KitCgTypeId ty);
int api_is_wide16_scalar_type(Compiler* c, KitCgTypeId ty);
+int api_is_wide8_scalar_type(Compiler* c, KitCgTypeId ty);
Operand api_op_imm(i64 v, KitCgTypeId ty);
Operand api_op_local(CGLocal r, KitCgTypeId ty);
Operand api_op_global(ObjSymId sym, i64 addend, KitCgTypeId ty);
@@ -433,6 +434,13 @@ ApiSValue api_make_wide16_int_const(KitCg* g, i64 value, KitCgTypeId ty);
void api_encode_binary128_from_double(KitCg* g, double value, u8 out[16]);
ApiSValue api_make_f128_const(KitCg* g, double value, KitCgTypeId ty);
ApiSValue api_wide16_materialize_lvalue(KitCg* g, ApiSValue* v, KitCgTypeId ty);
+CGLocal api_wide8_temp_local(KitCg* g, KitCgTypeId ty);
+ApiSValue api_make_wide8_const_bits(KitCg* g, u64 bits, KitCgTypeId ty);
+ApiSValue api_make_wide8_int_const(KitCg* g, i64 value, KitCgTypeId ty);
+Operand api_wide8_addr(KitCg* g, ApiSValue* v, KitCgTypeId ty);
+Operand api_wide8_load_lane(KitCg* g, Operand addr, i32 off);
+void api_wide8_store_lane(KitCg* g, Operand addr, i32 off, Operand val);
+Operand api_wide8_or_lanes(KitCg* g, ApiSValue* v, KitCgTypeId ty);
KitCgSym api_runtime_helper(KitCg* g, const char* name, KitCgTypeId ret,
const KitCgTypeId* params, u32 nparams);
void api_runtime_call_values(KitCg* g, const char* name, KitCgTypeId ret,
diff --git a/src/cg/local.c b/src/cg/local.c
@@ -5,9 +5,10 @@ int api_local_requires_memory(KitCg* g, KitCgTypeId ty, KitCgLocalAttrs attrs) {
KIT_CG_LOCAL_COMPILER_TEMP;
if (g && g->debug && attrs.name && (attrs.flags & hidden_flags) == 0)
return 1;
- /* Aggregates (records, arrays), wide16 (f128/i128), vararg state, and any
- * non-scalar type must live in memory. */
+ /* Aggregates (records, arrays), wide16 (f128/i128), wide8 (rv32 i64/double),
+ * vararg state, and any non-scalar type must live in memory. */
if (api_is_wide16_scalar_type(g->c, ty)) return 1;
+ if (api_is_wide8_scalar_type(g->c, ty)) return 1;
return !(cg_type_is_int(g->c, ty) || cg_type_is_float(g->c, ty) ||
cg_type_is_ptr(g->c, ty));
}
diff --git a/src/cg/memory.c b/src/cg/memory.c
@@ -12,6 +12,13 @@ void kit_cg_push_int(KitCg* g, uint64_t value, KitCgTypeId type) {
api_push(g, api_make_wide16_int_const(g, (i64)value, ty));
return;
}
+ /* rv32 8-byte int: the 64-bit value fits in op.v.imm, but the value is
+ * memory-resident, so materialize it as two 32-bit lanes (a register
+ * load_imm of an i64 on rv32 would keep only the low word). */
+ if (api_is_wide8_scalar_type(g->c, ty)) {
+ api_push(g, api_make_wide8_int_const(g, (i64)value, ty));
+ return;
+ }
api_push(g, api_make_sv(api_op_imm((i64)value, ty), ty));
}
@@ -33,6 +40,18 @@ void kit_cg_push_float(KitCg* g, double value, KitCgTypeId type) {
api_push(g, api_make_f128_const(g, value, ty));
return;
}
+ /* rv32 soft double: the 8-byte value is memory-resident, so materialize the
+ * IEEE-754 binary64 pattern as two 32-bit lanes (a register load_const of an
+ * 8-byte value on rv32 would keep only the low word). */
+ if (api_is_wide8_scalar_type(g->c, ty)) {
+ union {
+ double d;
+ u64 u;
+ } bits;
+ bits.d = value;
+ api_push(g, api_make_wide8_const_bits(g, bits.u, ty));
+ return;
+ }
T = g->target;
cb.type = ty;
cb.size = (u32)abi_cg_sizeof(g->c->abi, type);
@@ -422,6 +441,12 @@ void kit_cg_store(KitCg* g, KitCgMemAccess access) {
api_is_wide16_scalar_type(g->c, ty)) {
rv = api_make_wide16_int_const(g, rv.op.v.imm, ty);
}
+ /* Same for an rv32 8-byte immediate: lower it to a 2-lane memory value so the
+ * store moves a full 64-bit value rather than load_imm'ing only the low word. */
+ if (!is_bitfield && api_sv_op_is(&rv, OPK_IMM) &&
+ api_is_wide8_scalar_type(g->c, ty)) {
+ rv = api_make_wide8_int_const(g, rv.op.v.imm, ty);
+ }
/* General scalar / bit-field store. Compute the source operand first so its
* local lifetime doesn't overlap any addressing arith. */
diff --git a/src/cg/native_direct_target.c b/src/cg/native_direct_target.c
@@ -1120,16 +1120,25 @@ static void nd_local_static_data_label_addr(CgTarget* t, Label target,
NativeDirectTarget* d = nd_of(t);
u32 off;
u8 zero[8];
- (void)width;
+ RelocKind kind;
(void)address_space;
if (!d->local_static_active)
nd_panic(d, "label address outside local static data");
- if (width != 8u) nd_panic(d, "unsupported local static label address width");
+ /* A jump-table / label-address slot is one target pointer wide: 8 bytes
+ * (R_ABS64) on a 64-bit target, 4 bytes (R_ABS32) on rv32/ELFCLASS32. */
+ if (width == 8u)
+ kind = R_ABS64;
+ else if (width == 4u)
+ kind = R_ABS32;
+ else {
+ nd_panic(d, "unsupported local static label address width");
+ return;
+ }
memset(zero, 0, sizeof zero);
off = d->local_static_base + d->local_static_size;
obj_write(t->obj, d->local_static_sec, zero, width);
d->native->mc->emit_label_data_reloc(d->native->mc, d->local_static_sec, off,
- nd_mc_label(d, target), R_ABS64, width,
+ nd_mc_label(d, target), kind, width,
addend);
d->local_static_size += width;
}
@@ -1190,9 +1199,17 @@ static void nd_continue_to(CgTarget* t, CGScope scope) {
nd_jump(t, s->continue_label);
}
+static int nd_is_wide64_int(NativeDirectTarget* d, KitCgTypeId ty);
+static int nd_is_soft_double(NativeDirectTarget* d, KitCgTypeId ty);
+
static void nd_load_imm(CgTarget* t, Operand dst, i64 imm) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc reg = nd_dst_reg(d, dst);
+ NativeLoc reg;
+ if (nd_is_wide64_int(d, dst.type))
+ nd_panic(d,
+ "64-bit integer immediate reached the rv32 backend un-lowered "
+ "(cg should materialize it as two 32-bit lanes)");
+ reg = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, load_imm, "target does not emit immediates");
d->native->load_imm(d->native, reg, imm);
nd_dst_writeback(d, dst, reg);
@@ -1200,7 +1217,12 @@ static void nd_load_imm(CgTarget* t, Operand dst, i64 imm) {
static void nd_load_const(CgTarget* t, Operand dst, ConstBytes cbytes) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc reg = nd_dst_reg(d, dst);
+ NativeLoc reg;
+ if (nd_is_wide64_int(d, dst.type) || nd_is_soft_double(d, dst.type))
+ nd_panic(d,
+ "8-byte constant reached the rv32 backend un-lowered (cg should "
+ "materialize it as two 32-bit lanes)");
+ reg = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, load_const, "target does not emit byte constants");
d->native->load_const(d->native, reg, cbytes);
nd_dst_writeback(d, dst, reg);
@@ -1434,11 +1456,47 @@ static void nd_bitfield_store(CgTarget* t, Operand record_addr, Operand src,
nd_addr_temps_release(d, &temps);
}
+/* Last line of defense against an unlowered wide/soft-float op reaching the
+ * machine backend. The cg-layer gates in src/cg/arith.c route i64 mul/div/shift
+ * (rv32) and all soft-double arith/convert/compare to runtime calls; if one
+ * escapes, the native backend would silently emit wrong code. These guards turn
+ * that into a loud compiler_panic. Every condition is gated on ptr_size==4
+ * (rv32) and/or float_abi SOFT|SINGLE, so x64/aa64/rv64 (ptr_size 8, or double
+ * in FP regs) never trip them. */
+static int nd_is_wide64_int(NativeDirectTarget* d, KitCgTypeId ty) {
+ if (d->base.c->target.ptr_size != 4) return 0; /* rv32 only */
+ if (kit_cg_type_int_width((KitCompiler*)d->base.c, ty) == 0) return 0;
+ return cg_type_size(d->base.c, ty) > d->base.c->target.ptr_size;
+}
+
+static int nd_is_soft_double(NativeDirectTarget* d, KitCgTypeId ty) {
+ u8 abi = d->base.c->target.float_abi;
+ if (abi != KIT_FLOAT_ABI_SOFT && abi != KIT_FLOAT_ABI_SINGLE) return 0;
+ return kit_cg_type_float_width((KitCompiler*)d->base.c, ty) == 64;
+}
+
static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc ar = nd_materialize_operand(d, a);
- NativeLoc br = nd_rhs_imm_or_reg(d, NATIVE_IMM_BINOP, (u32)op, b);
- NativeLoc dr = nd_dst_reg(d, dst);
+ NativeLoc ar;
+ NativeLoc br;
+ NativeLoc dr;
+ /* No 8-byte value reaches a single GPR op on rv32: the cg layer lowers i64
+ * add/sub/and/or/xor to inline 2-word lane sequences and mul/div/rem/shift to
+ * __*di3 runtime calls (src/cg/arith.c). Anything that slips through here would
+ * silently compute only the low word, so fail loudly instead. */
+ if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, dst.type)) {
+ nd_panic(d,
+ "64-bit integer arithmetic reached the rv32 backend un-lowered "
+ "(cg should emit a 2-word lane sequence or a __*di3 runtime call)");
+ }
+ if (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, dst.type)) {
+ nd_panic(d,
+ "soft-float double arithmetic reached the backend un-lowered "
+ "(should be a __*df3 runtime call)");
+ }
+ ar = nd_materialize_operand(d, a);
+ br = nd_rhs_imm_or_reg(d, NATIVE_IMM_BINOP, (u32)op, b);
+ dr = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, binop, "target does not emit binary ops");
d->native->binop(d->native, op, dr, ar, br);
nd_dst_writeback(d, dst, dr);
@@ -1448,8 +1506,22 @@ static void nd_binop(CgTarget* t, BinOp op, Operand dst, Operand a, Operand b) {
static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc ar = nd_materialize_operand(d, a);
- NativeLoc dr = nd_dst_reg(d, dst);
+ NativeLoc ar;
+ NativeLoc dr;
+ /* i64 neg/bnot stay inline as register pairs, and soft-double FNEG stays
+ * inline as a high-word sign-bit flip (v1), so both are allowlisted. Any
+ * OTHER soft-double unop reaching the backend is an unlowered escape. */
+ if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, dst.type)) {
+ nd_panic(d,
+ "64-bit integer unary op reached the rv32 backend un-lowered "
+ "(cg should emit a 2-word lane sequence)");
+ }
+ if (op != UO_FNEG &&
+ (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, dst.type))) {
+ nd_panic(d, "soft-float double unary op reached the backend un-lowered");
+ }
+ ar = nd_materialize_operand(d, a);
+ dr = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, unop, "target does not emit unary ops");
d->native->unop(d->native, op, dr, ar);
nd_dst_writeback(d, dst, dr);
@@ -1458,9 +1530,25 @@ static void nd_unop(CgTarget* t, UnOp op, Operand dst, Operand a) {
static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc ar = nd_materialize_operand(d, a);
- NativeLoc br = nd_rhs_imm_or_reg(d, NATIVE_IMM_CMP, (u32)op, b);
- NativeLoc dr = nd_dst_reg(d, dst);
+ NativeLoc ar;
+ NativeLoc br;
+ NativeLoc dr;
+ /* i64 compares are lowered to inline 2-word lane sequences and soft-double
+ * compares to __*df2 runtime calls (src/cg/arith.c); neither reaches a single
+ * GPR compare here. */
+ if (nd_is_wide64_int(d, a.type) || nd_is_wide64_int(d, b.type)) {
+ nd_panic(d,
+ "64-bit integer compare reached the rv32 backend un-lowered "
+ "(cg should emit a 2-word lane sequence)");
+ }
+ if (nd_is_soft_double(d, a.type) || nd_is_soft_double(d, b.type)) {
+ nd_panic(d,
+ "soft-float double compare reached the backend un-lowered "
+ "(should be a __*df2 runtime call)");
+ }
+ ar = nd_materialize_operand(d, a);
+ br = nd_rhs_imm_or_reg(d, NATIVE_IMM_CMP, (u32)op, b);
+ dr = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, cmp, "target does not emit compares");
d->native->cmp(d->native, op, dr, ar, br);
nd_dst_writeback(d, dst, dr);
@@ -1470,8 +1558,23 @@ static void nd_cmp(CgTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
static void nd_convert(CgTarget* t, ConvKind op, Operand dst, Operand src) {
NativeDirectTarget* d = nd_of(t);
- NativeLoc sr = nd_materialize_operand(d, src);
- NativeLoc dr = nd_dst_reg(d, dst);
+ NativeLoc sr;
+ NativeLoc dr;
+ /* i64<->i32 sext/zext/trunc are lowered to inline lane ops (src/cg/arith.c
+ * api_try_wide8_convert) and i64<->float / soft-double conversions to runtime
+ * calls; none reaches a single-register convert here. */
+ if (nd_is_wide64_int(d, src.type) || nd_is_wide64_int(d, dst.type)) {
+ nd_panic(d,
+ "64-bit integer conversion reached the rv32 backend un-lowered "
+ "(cg should emit a 2-word lane sequence or a runtime call)");
+ }
+ if (nd_is_soft_double(d, src.type) || nd_is_soft_double(d, dst.type)) {
+ nd_panic(d,
+ "soft-float double conversion reached the backend un-lowered "
+ "(should be a runtime call)");
+ }
+ sr = nd_materialize_operand(d, src);
+ dr = nd_dst_reg(d, dst);
ND_REQUIRE_NATIVE(d, convert, "target does not emit converts");
d->native->convert(d->native, op, dr, sr);
nd_dst_writeback(d, dst, dr);
diff --git a/src/cg/type.c b/src/cg/type.c
@@ -988,6 +988,9 @@ int kit_cg_target_supports_intrinsic(KitCompiler* c, KitCgIntrinsic intrin) {
KitArchKind arch;
if (!c) return 0;
arch = c->target.arch;
+ /* rv32 and rv64 share one RISC-V backend (src/arch/riscv), so the set of
+ * lowerable intrinsics is identical; decide as rv64 for both. */
+ if (arch == KIT_ARCH_RV32) arch = KIT_ARCH_RV64;
switch (intrin) {
/* Portable intrinsics every backend (native + wasm + C-source) lowers.
* The C-source backend runs under the host's native arch, so it is covered
diff --git a/src/cg/value.c b/src/cg/value.c
@@ -25,6 +25,20 @@ int api_is_wide16_scalar_type(Compiler* c, KitCgTypeId ty) {
return api_is_f128_type(c, ty) || api_is_i128_type(c, ty);
}
+/* rv32 only: an 8-byte scalar (long long / int64_t, and — under ilp32f/ilp32 —
+ * soft double) is twice the 4-byte machine word. Like wide16 (i128/f128) on a
+ * 64-bit target it cannot live in a single GPR, so it is forced memory-resident
+ * and every operation is legalized to a 2-word lane sequence: add/sub/and/or/
+ * xor/neg/not/compare inline (src/cg/wide.c, no compiler-rt helper exists for
+ * 64-bit add), mul/div/rem/shift and all soft-double arith/convert to runtime
+ * calls. Defined as exactly 8 bytes on a 4-byte-pointer target, so it matches
+ * only rv32 i64/double and never fires on rv64/x64/aa64 (ptr_size 8). */
+int api_is_wide8_scalar_type(Compiler* c, KitCgTypeId ty) {
+ if (c->target.ptr_size != 4u) return 0;
+ if (!(cg_type_is_int(c, ty) || cg_type_is_float(c, ty))) return 0;
+ return abi_cg_sizeof(c->abi, ty) == 8u;
+}
+
Operand api_op_imm(i64 v, KitCgTypeId ty) {
Operand o;
memset(&o, 0, sizeof o);
@@ -241,6 +255,13 @@ CGLocal api_alloc_temp_local(KitCg* g, KitCgTypeId ty) {
d.size = abi_cg_sizeof(g->c->abi, ty);
d.align = abi_cg_alignof(g->c->abi, ty);
}
+ /* An rv32 8-byte scalar temp (i64/soft-double arithmetic result, call result,
+ * etc.) must live in memory so its two words are addressable for lane ops and
+ * the multi-part ABI path; the allocator gives an unflagged scalar a single
+ * register, which would truncate it. (wide16 temps are already forced via the
+ * size>word auto-home in cg_ir_lower.) */
+ if (ty && api_is_wide8_scalar_type(g->c, ty))
+ d.flags |= CG_LOCAL_MEMORY_REQUIRED;
local = g->target->local(g->target, &d);
if (local == CG_LOCAL_NONE) {
compiler_panic(g->c, g->cur_loc,
diff --git a/src/cg/wide.c b/src/cg/wide.c
@@ -129,6 +129,108 @@ ApiSValue api_make_f128_const(KitCg* g, double value, KitCgTypeId ty) {
return api_make_lv(api_op_local(local, ty), ty);
}
+/* ============================================================
+ * wide8 — rv32 8-byte (2-word) scalar lane plumbing
+ *
+ * On rv32 a long long / int64_t (and, under ilp32f/ilp32, a soft double) is two
+ * machine words. Like the wide16 (i128/f128) scalars above it is memory-resident
+ * (api_is_wide8_scalar_type forces CG_LOCAL_MEMORY_REQUIRED), but its arithmetic
+ * is done INLINE as 2-word lane sequences (src/cg/arith.c) rather than via a
+ * runtime call, because compiler-rt has no 64-bit add/sub/and/or/xor helper. The
+ * lane size is the 4-byte word (ptr_size); the low word is at offset 0 on a
+ * little-endian target (rv32 is LE; the big-endian offsets are kept for parity
+ * with the wide16 helpers). These primitives are the inline analogue of
+ * api_store_f128_bytes / api_i128_addr / api_i128_load_lane.
+ * ============================================================ */
+
+/* Allocate an 8-byte memory-resident, address-taken scalar temp. */
+CGLocal api_wide8_temp_local(KitCg* g, KitCgTypeId ty) {
+ CGLocalDesc d;
+ memset(&d, 0, sizeof d);
+ d.type = ty;
+ d.size = 8;
+ d.align = (u32)abi_cg_alignof(g->c->abi, ty);
+ if (!d.align) d.align = 8;
+ d.flags = CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED;
+ return g->target->local(g->target, &d);
+}
+
+/* Byte offset of the low / high 32-bit lane within an 8-byte scalar. */
+static i32 api_wide8_lo_off(KitCg* g) { return g->c->target.big_endian ? 4 : 0; }
+static i32 api_wide8_hi_off(KitCg* g) { return g->c->target.big_endian ? 0 : 4; }
+
+/* Materialize a 64-bit constant bit pattern into a fresh memory-resident scalar,
+ * storing its two 32-bit lanes, and return the value backed by that local. Used
+ * for both i64 immediates (bits = (u64)imm) and soft-double constants (bits =
+ * the IEEE-754 binary64 encoding). */
+ApiSValue api_make_wide8_const_bits(KitCg* g, u64 bits, KitCgTypeId ty) {
+ KitCgTypeId i32_ty = builtin_id(KIT_CG_BUILTIN_I32);
+ KitCgTypeId ptr_ty = cg_type_ptr_to(g->c, ty);
+ CGLocal local = api_wide8_temp_local(g, ty);
+ CGLocal ar = api_alloc_temp_local(g, ptr_ty);
+ Operand base = api_op_local(ar, ptr_ty);
+ MemAccess ma;
+ memset(&ma, 0, sizeof ma);
+ ma.type = i32_ty;
+ ma.size = 4;
+ ma.align = 4;
+ g->target->addr_of(g->target, base, api_op_local(local, ty));
+ g->target->store(g->target, api_op_indirect(ar, api_wide8_lo_off(g), i32_ty),
+ api_op_imm((i64)(i32)(u32)(bits & 0xffffffffu), i32_ty), ma);
+ g->target->store(g->target, api_op_indirect(ar, api_wide8_hi_off(g), i32_ty),
+ api_op_imm((i64)(i32)(u32)(bits >> 32), i32_ty), ma);
+ api_release_temp_local(g, ar);
+ return api_make_sv(api_op_local(local, ty), ty);
+}
+
+ApiSValue api_make_wide8_int_const(KitCg* g, i64 value, KitCgTypeId ty) {
+ return api_make_wide8_const_bits(g, (u64)value, ty);
+}
+
+/* Materialize an 8-byte value as an lvalue and return a pointer local to it.
+ * An immediate is first lowered to a 2-lane memory constant. */
+Operand api_wide8_addr(KitCg* g, ApiSValue* v, KitCgTypeId ty) {
+ ApiSValue lv;
+ if (api_sv_op_is(v, OPK_IMM)) {
+ lv = api_make_wide8_int_const(g, v->op.v.imm, ty);
+ } else {
+ lv = *v;
+ }
+ lv.type = ty;
+ lv.op.type = ty;
+ lv.lvalue = 1;
+ return api_lvalue_addr(g, &lv, cg_type_ptr_to(g->c, ty));
+}
+
+/* Load a 32-bit lane (at byte offset `off`) of the scalar addressed by `addr`
+ * into a fresh i32 temp; returns the temp operand. */
+Operand api_wide8_load_lane(KitCg* g, Operand addr, i32 off) {
+ KitCgTypeId i32_ty = builtin_id(KIT_CG_BUILTIN_I32);
+ CGLocal rr = api_alloc_temp_local(g, i32_ty);
+ Operand dst = api_op_local(rr, i32_ty);
+ MemAccess ma;
+ memset(&ma, 0, sizeof ma);
+ ma.type = i32_ty;
+ ma.size = 4;
+ ma.align = 4;
+ g->target->load(g->target, dst, api_op_indirect(addr.v.local, off, i32_ty),
+ ma);
+ return dst;
+}
+
+/* Store an i32 `val` into the 32-bit lane (byte offset `off`) addressed by
+ * `addr`. */
+void api_wide8_store_lane(KitCg* g, Operand addr, i32 off, Operand val) {
+ KitCgTypeId i32_ty = builtin_id(KIT_CG_BUILTIN_I32);
+ MemAccess ma;
+ memset(&ma, 0, sizeof ma);
+ ma.type = i32_ty;
+ ma.size = 4;
+ ma.align = 4;
+ g->target->store(g->target, api_op_indirect(addr.v.local, off, i32_ty), val,
+ ma);
+}
+
ApiSValue api_wide16_materialize_lvalue(KitCg* g, ApiSValue* v,
KitCgTypeId ty) {
if (v->op.kind == OPK_LOCAL &&
diff --git a/src/link/link.c b/src/link/link.c
@@ -377,6 +377,12 @@ void link_set_pie(Linker* l, int enable) {
l->emit_pie = enable ? 1 : 0;
}
+void link_set_text_base(Linker* l, u64 base) {
+ if (!l) return;
+ l->text_base_set = 1;
+ l->text_base = base;
+}
+
void link_set_pe_subsystem(Linker* l, u16 subsystem) {
if (!l) return;
l->pe_subsystem = subsystem;
diff --git a/src/link/link.h b/src/link/link.h
@@ -209,6 +209,9 @@ void link_set_jit_mode(Linker*, int enable);
* emit_static_exe; both may be set in the same link (the IFUNC ctor
* still wants to run on the exe path regardless of PIE). */
void link_set_pie(Linker*, int enable);
+/* Override the static ET_EXEC image (text) base, from `kit ld -Ttext ADDR`. No
+ * effect on PIE/shared (base 0) or scripted layout (script pins vaddrs). */
+void link_set_text_base(Linker*, u64 base);
void link_set_pe_subsystem(Linker*, u16 subsystem);
/* Runtime loader path written into PT_INTERP / .interp. NULL leaves the
diff --git a/src/link/link_internal.h b/src/link/link_internal.h
@@ -211,6 +211,10 @@ struct Linker {
* script's output sections in declaration order. Borrowed; the
* script and every sub-object must outlive link_resolve. */
const KitLinkScript* script;
+ /* -Ttext override of the static ET_EXEC image base. text_base_set==0 leaves
+ * the default IMAGE_BASE_STATIC; ignored on PIE/shared/scripted layouts. */
+ int text_base_set;
+ u64 text_base;
int gc_sections;
int strip_debug;
/* Set by kit_link_exe before link_resolve. When 1, layout_iplt
@@ -594,6 +598,10 @@ struct LinkImage {
* self-describing headers PT_LOAD / build-id PT_NOTE, and only shifts
* file offsets to make room for ehdr+phdrs. */
u8 scripted;
+ /* -Ttext: when text_base_set, the static-exe image base override, mirrored
+ * from Linker at link_resolve time. Ignored if pie/scripted. */
+ int text_base_set;
+ u64 text_base;
};
/* Page granularity used for ELF segment alignment and the file-offset /
diff --git a/src/link/link_layout.c b/src/link/link_layout.c
@@ -1197,6 +1197,8 @@ LinkImage* link_resolve(Linker* l) {
img = link_image_alloc(l->c);
h = img->heap;
img->linker = l;
+ img->text_base_set = l->text_base_set;
+ img->text_base = l->text_base;
img->ninput_maps = LinkInputs_count(&l->inputs);
metrics_count(l->c, "link.inputs", img->ninput_maps);
diff --git a/src/obj/elf/elf.h b/src/obj/elf/elf.h
@@ -32,10 +32,16 @@
#define ELFMAG1 'E'
#define ELFMAG2 'L'
#define ELFMAG3 'F'
+#define ELFCLASS32 1
#define ELFCLASS64 2
#define ELFDATA2LSB 1
#define EV_CURRENT 1
#define ELFOSABI_NONE 0
+/* Bare-metal / freestanding (`*-none-elf`). kit stamps this so a freestanding
+ * object round-trips as KIT_OS_FREESTANDING instead of decoding back to Linux
+ * (EI_OSABI=SysV/0 is ambiguous and is otherwise read as a hosted Linux/PIE
+ * default — the bug that forced rv32 links to be special-cased). */
+#define ELFOSABI_STANDALONE 0xFF
#define ELFOSABI_GNU \
3 /* a.k.a. ELFOSABI_LINUX — required when \
the file uses STT_GNU_IFUNC / STB_GNU_UNIQUE. */
@@ -63,6 +69,17 @@
#define ELF64_RELA_SIZE 24u
#define ELF64_DYN_SIZE 16u
+/* ELFCLASS32 (ELF32) on-disk record sizes. RV32 ET_REL/ET_EXEC use these
+ * via the `is32` flag derived from EI_CLASS / Compiler.target.ptr_size==4.
+ * Note: Elf32_Sym and Elf32_Phdr REORDER fields relative to their ELF64
+ * counterparts (not just narrower) — see emit.c / read.c / link.c. */
+#define ELF32_EHDR_SIZE 52
+#define ELF32_SHDR_SIZE 40
+#define ELF32_PHDR_SIZE 32
+#define ELF32_SYM_SIZE 16u
+#define ELF32_RELA_SIZE 12u
+#define ELF32_DYN_SIZE 8u
+
/* ---- special section indices ---- */
#define SHN_UNDEF 0u
#define SHN_ABS 0xfff1u
@@ -120,6 +137,12 @@
#define ELF64_R_TYPE(i) ((u32)((i) & 0xffffffffu))
#define ELF64_R_INFO(s, t) ((((u64)(s)) << 32) | ((u64)(t) & 0xffffffffull))
+/* ELF32 r_info packs the symbol index in the high 24 bits and an 8-bit
+ * reloc type in the low byte. RV32 reloc type codes (<= 61) all fit. */
+#define ELF32_R_SYM(i) ((u32)((i) >> 8))
+#define ELF32_R_TYPE(i) ((u32)((i) & 0xffu))
+#define ELF32_R_INFO(s, t) ((u32)((((u32)(s)) << 8) | ((u32)(t) & 0xffu)))
+
/* ---- kit SymKind/SymBind/SymVis -> ELF wire value, single source ----
* One table per axis, indexed by the kit obj.h enum, wrapped in a
* static-inline accessor so unused TUs that include elf.h don't trip
@@ -382,6 +405,11 @@ u32 elf_x86_64_reloc_from(u32 elf_type);
u32 elf_riscv64_reloc_to(u32 kind /* RelocKind */);
u32 elf_riscv64_reloc_from(u32 elf_type);
+/* RV32 (ELFCLASS32) reloc table. Same XLEN-neutral kinds as riscv64; the
+ * 64-bit-only kinds (R_ABS64 / R_RV_ADD64 / R_RV_SUB64) are unsupported. */
+u32 elf_riscv32_reloc_to(u32 kind /* RelocKind */);
+u32 elf_riscv32_reloc_from(u32 elf_type);
+
/* ---- little-endian byte writers (Writer-based) ----
* Writes go through the shared writer_u*_le helpers (core/bytes.h); the
* elf_wr_* aliases keep the ELF spelling at existing call sites. Reads
@@ -394,4 +422,22 @@ u32 elf_riscv64_reloc_from(u32 elf_type);
#define elf_wr_u32 writer_u32_le
#define elf_wr_u64 writer_u64_le
+/* Native-width address/offset/size field: 4 bytes on ELFCLASS32, 8 on
+ * ELFCLASS64. Used wherever the Ehdr/Shdr/Sym widths shrink under is32. */
+static inline void elf_wr_addr(Writer* w, int is32, u64 v) {
+ if (is32)
+ elf_wr_u32(w, (u32)v);
+ else
+ elf_wr_u64(w, v);
+}
+
+static inline u64 elf_rd_addr(const u8* p, int is32) {
+ return is32 ? (u64)rd_u32_le(p) : rd_u64_le(p);
+}
+
+/* EI_CLASS-aware arch-ops lookup. obj_elf_machine() keys on e_machine
+ * alone, which cannot tell RV32 (EM_RISCV+ELFCLASS32) from RV64
+ * (EM_RISCV+ELFCLASS64); this variant disambiguates via ei_class. */
+const struct ObjElfArchOps* obj_elf_machine_class(u32 e_machine, u8 ei_class);
+
#endif /* KIT_OBJ_ELF_H */
diff --git a/src/obj/elf/emit.c b/src/obj/elf/emit.c
@@ -214,10 +214,17 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
if (c->target.big_endian) {
compiler_panic(c, SRCLOC_NONE, "emit_elf: big-endian ELF not supported");
}
- if (c->target.ptr_size != 8) {
- compiler_panic(c, SRCLOC_NONE, "emit_elf: ptr_size %u (expected 8)",
+ /* is32 selects ELFCLASS32 (RV32) record widths/layouts everywhere
+ * below; ptr_size==8 is the established ELFCLASS64 path. */
+ if (c->target.ptr_size != 8 && c->target.ptr_size != 4) {
+ compiler_panic(c, SRCLOC_NONE, "emit_elf: ptr_size %u (expected 4 or 8)",
(u32)c->target.ptr_size);
}
+ int is32 = (c->target.ptr_size == 4);
+ u32 sym_size = is32 ? ELF32_SYM_SIZE : ELF64_SYM_SIZE;
+ u32 rela_size = is32 ? ELF32_RELA_SIZE : ELF64_RELA_SIZE;
+ u32 ehdr_size = is32 ? ELF32_EHDR_SIZE : ELF64_EHDR_SIZE;
+ u32 shdr_size = is32 ? ELF32_SHDR_SIZE : ELF64_SHDR_SIZE;
/* ---- pass 1: plan ELF section list ------------------------------ */
@@ -293,29 +300,47 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
obj_symiter_free(it);
}
u32 max_syms = 1 + (nobjsec - 1) + nobjsym;
- u8* symtab = (u8*)arena_alloc(c->scratch, (size_t)ELF64_SYM_SIZE * max_syms,
+ u8* symtab = (u8*)arena_alloc(c->scratch, (size_t)sym_size * max_syms,
_Alignof(u64));
u32 nsyms = 0;
- memset(&symtab[nsyms * ELF64_SYM_SIZE], 0, ELF64_SYM_SIZE);
+ memset(&symtab[nsyms * sym_size], 0, sym_size);
nsyms = 1; /* index 0: STN_UNDEF */
-/* Helper to emit one Elf64_Sym record at index `idx` into symtab. */
-#define WRITE_SYM(idx, st_name, st_info, st_other, st_shndx, st_value, \
- st_size) \
- do { \
- u8* slot = &symtab[(idx) * ELF64_SYM_SIZE]; \
- slot[0] = (u8)((st_name)); \
- slot[1] = (u8)((st_name) >> 8); \
- slot[2] = (u8)((st_name) >> 16); \
- slot[3] = (u8)((st_name) >> 24); \
- slot[4] = (u8)((st_info)); \
- slot[5] = (u8)((st_other)); \
- slot[6] = (u8)((st_shndx)); \
- slot[7] = (u8)((st_shndx) >> 8); \
- for (int _b = 0; _b < 8; ++_b) \
- slot[8 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \
- for (int _b = 0; _b < 8; ++_b) \
- slot[16 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \
+/* Helper to emit one symbol record at index `idx` into symtab.
+ * Elf64_Sym (24B) and Elf32_Sym (16B) REORDER fields: ELF32 places
+ * st_value/st_size BEFORE st_info/st_other/st_shndx, so select the byte
+ * layout by `is32` rather than just narrowing widths. */
+#define WRITE_SYM(idx, st_name, st_info, st_other, st_shndx, st_value, \
+ st_size) \
+ do { \
+ u8* slot = &symtab[(idx) * sym_size]; \
+ if (is32) { \
+ slot[0] = (u8)((st_name)); \
+ slot[1] = (u8)((st_name) >> 8); \
+ slot[2] = (u8)((st_name) >> 16); \
+ slot[3] = (u8)((st_name) >> 24); \
+ for (int _b = 0; _b < 4; ++_b) \
+ slot[4 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \
+ for (int _b = 0; _b < 4; ++_b) \
+ slot[8 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \
+ slot[12] = (u8)((st_info)); \
+ slot[13] = (u8)((st_other)); \
+ slot[14] = (u8)((st_shndx)); \
+ slot[15] = (u8)((st_shndx) >> 8); \
+ } else { \
+ slot[0] = (u8)((st_name)); \
+ slot[1] = (u8)((st_name) >> 8); \
+ slot[2] = (u8)((st_name) >> 16); \
+ slot[3] = (u8)((st_name) >> 24); \
+ slot[4] = (u8)((st_info)); \
+ slot[5] = (u8)((st_other)); \
+ slot[6] = (u8)((st_shndx)); \
+ slot[7] = (u8)((st_shndx) >> 8); \
+ for (int _b = 0; _b < 8; ++_b) \
+ slot[8 + _b] = (u8)((u64)(st_value) >> (_b * 8)); \
+ for (int _b = 0; _b < 8; ++_b) \
+ slot[16 + _b] = (u8)((u64)(st_size) >> (_b * 8)); \
+ } \
} while (0)
/* No automatic STT_SECTION synthesis. Section symbols are emitted
@@ -434,7 +459,7 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
typedef struct RelaPlan {
u32 obj_section; /* obj section the rela applies to */
u8* bytes; /* arena-allocated rela bytes */
- u32 size; /* bytes count = nrelocs * 24 */
+ u32 size; /* bytes count = nrelocs * rela_size (24 or 12) */
} RelaPlan;
RelaPlan* rela_plans = arena_zarray(c->scratch, RelaPlan, nobjsec);
@@ -445,8 +470,8 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
if (!host || host->removed) continue;
u32 nr = obj_reloc_count(ob, si);
if (!nr) continue;
- u8* buf = (u8*)arena_alloc(c->scratch, (size_t)ELF64_RELA_SIZE * nr,
- _Alignof(u64));
+ u8* buf =
+ (u8*)arena_alloc(c->scratch, (size_t)rela_size * nr, _Alignof(u64));
u32 j = 0;
for (u32 i = 0; i < total_relocs; ++i) {
const Reloc* r = obj_reloc_at(ob, i);
@@ -468,17 +493,27 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
} else {
sym_elf_idx = sym_to_elf[r->sym];
}
- u8* slot = &buf[j * ELF64_RELA_SIZE];
- for (int b = 0; b < 8; ++b) slot[b] = (u8)((u64)r->offset >> (b * 8));
- u64 info = ELF64_R_INFO(sym_elf_idx, etype);
- for (int b = 0; b < 8; ++b) slot[8 + b] = (u8)(info >> (b * 8));
- for (int b = 0; b < 8; ++b)
- slot[16 + b] = (u8)((u64)r->addend >> (b * 8));
+ /* Elf32_Rela (12B): r_offset@0, r_info@4 (ELF32_R_INFO, 8-bit
+ * type), r_addend@8 — all 4-byte. Elf64_Rela (24B): all 8-byte. */
+ u8* slot = &buf[j * rela_size];
+ if (is32) {
+ for (int b = 0; b < 4; ++b) slot[b] = (u8)((u32)r->offset >> (b * 8));
+ u32 info = ELF32_R_INFO(sym_elf_idx, etype);
+ for (int b = 0; b < 4; ++b) slot[4 + b] = (u8)(info >> (b * 8));
+ for (int b = 0; b < 4; ++b)
+ slot[8 + b] = (u8)((u32)r->addend >> (b * 8));
+ } else {
+ for (int b = 0; b < 8; ++b) slot[b] = (u8)((u64)r->offset >> (b * 8));
+ u64 info = ELF64_R_INFO(sym_elf_idx, etype);
+ for (int b = 0; b < 8; ++b) slot[8 + b] = (u8)(info >> (b * 8));
+ for (int b = 0; b < 8; ++b)
+ slot[16 + b] = (u8)((u64)r->addend >> (b * 8));
+ }
++j;
}
rela_plans[nrela_plans].obj_section = si;
rela_plans[nrela_plans].bytes = buf;
- rela_plans[nrela_plans].size = nr * ELF64_RELA_SIZE;
+ rela_plans[nrela_plans].size = nr * rela_size;
nrela_plans++;
}
@@ -502,8 +537,8 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
es->name_len = nlen;
es->sh_type = SHT_RELA;
es->sh_flags = SHF_INFO_LINK;
- es->sh_addralign = 8;
- es->sh_entsize = ELF64_RELA_SIZE;
+ es->sh_addralign = is32 ? 4 : 8;
+ es->sh_entsize = rela_size;
es->sh_info = obj_to_elf[si]; /* section the relas apply to */
/* sh_link filled below once we know symtab's elf index. */
es->raw_bytes = rela_plans[ri].bytes;
@@ -520,10 +555,10 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
es->name_len = 7;
es->sh_type = SHT_SYMTAB;
es->sh_flags = 0;
- es->sh_addralign = 8;
- es->sh_entsize = ELF64_SYM_SIZE;
+ es->sh_addralign = is32 ? 4 : 8;
+ es->sh_entsize = sym_size;
es->raw_bytes = symtab;
- es->sh_size = (u64)nsyms * ELF64_SYM_SIZE;
+ es->sh_size = (u64)nsyms * sym_size;
es->sh_info = nlocals; /* first non-local symbol */
idx_symtab = nsecs;
nsecs++;
@@ -582,7 +617,7 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
/* ---- pass 5: assign file offsets -------------------------------- */
- u64 cur = ELF64_EHDR_SIZE;
+ u64 cur = ehdr_size;
for (u32 i = 1; i < nsecs; ++i) {
ElfSec* es = &secs[i];
if (es->is_nobits) {
@@ -597,7 +632,8 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
es->sh_offset = cur;
cur += es->sh_size;
}
- cur = ALIGN_UP(cur, (u64)8);
+ /* ELF32 toolchains conventionally align the SHT to 4; ELF64 to 8. */
+ cur = ALIGN_UP(cur, (u64)(is32 ? 4 : 8));
u64 e_shoff = cur;
/* ---- pass 6: write Ehdr ----------------------------------------- */
@@ -607,7 +643,7 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
ident[EI_MAG1] = ELFMAG1;
ident[EI_MAG2] = ELFMAG2;
ident[EI_MAG3] = ELFMAG3;
- ident[EI_CLASS] = ELFCLASS64;
+ ident[EI_CLASS] = is32 ? ELFCLASS32 : ELFCLASS64;
ident[EI_DATA] = ELFDATA2LSB;
ident[EI_VERSION] = EV_CURRENT;
/* SysV is the canonical OSABI for relocatable AArch64 .o; clang and
@@ -617,7 +653,16 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
* Exception: GNU extensions (STT_GNU_IFUNC, SHF_GNU_RETAIN, ...)
* require EI_OSABI=ELFOSABI_GNU. Clang sets it for any TU using a
* GNU-flavored marker; we mirror that so roundtrip is byte-stable. */
- ident[EI_OSABI] = ELFOSABI_NONE;
+ /* A freestanding (`*-none-elf`) target stamps ELFOSABI_STANDALONE so the
+ * object round-trips as bare-metal rather than decoding back to a hosted
+ * Linux/PIE default; hosted targets keep SysV. (A GNU extension below still
+ * upgrades to ELFOSABI_GNU when required — those markers imply a GNU loader.) */
+ {
+ Compiler* osc = obj_compiler(ob);
+ ident[EI_OSABI] = (osc && osc->target.os == KIT_OS_FREESTANDING)
+ ? ELFOSABI_STANDALONE
+ : ELFOSABI_NONE;
+ }
{
ObjSymIter* it = obj_symiter_new(ob);
ObjSymEntry e;
@@ -644,21 +689,42 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
* else synthesize a sensible per-arch default. RV64 kit targets the
* Linux psABI's lp64d soft-relax convention (RVC + double-float ABI). */
u32 e_flags;
- if (!obj_get_elf_e_flags(ob, &e_flags)) e_flags = elf->e_flags;
+ if (!obj_get_elf_e_flags(ob, &e_flags)) {
+ e_flags = elf->e_flags;
+ /* rv32 (ptr_size 4): ilp32 and ilp32f share KIT_ARCH_RV32, so the static
+ * descriptor's float-ABI bits (a placeholder SINGLE) cannot be right for
+ * both. Derive them from -mabi (float_abi); RVC and other descriptor bits
+ * are kept. rv64 (ptr_size 8) is left untouched, preserving its e_flags. */
+ if (e_machine == EM_RISCV) {
+ Compiler* ec = obj_compiler(ob);
+ if (ec && ec->target.ptr_size == 4u) {
+ u32 fa = EF_RISCV_FLOAT_ABI_SOFT;
+ if (ec->target.float_abi == KIT_FLOAT_ABI_SINGLE)
+ fa = EF_RISCV_FLOAT_ABI_SINGLE;
+ else if (ec->target.float_abi == KIT_FLOAT_ABI_DOUBLE)
+ fa = EF_RISCV_FLOAT_ABI_DOUBLE;
+ else if (ec->target.float_abi == KIT_FLOAT_ABI_DEFAULT)
+ fa = EF_RISCV_FLOAT_ABI_SINGLE; /* rv32 default profile is ilp32f */
+ e_flags = (e_flags & ~(u32)EF_RISCV_FLOAT_ABI_MASK) | fa;
+ }
+ }
+ }
kit_writer_seek(w, 0);
kit_writer_write(w, ident, EI_NIDENT);
elf_wr_u16(w, ET_REL);
elf_wr_u16(w, (u16)e_machine);
elf_wr_u32(w, EV_CURRENT);
- elf_wr_u64(w, 0); /* e_entry */
- elf_wr_u64(w, 0); /* e_phoff */
- elf_wr_u64(w, e_shoff); /* e_shoff */
+ /* e_entry/e_phoff/e_shoff are native-width (4B on ELF32, 8B on ELF64);
+ * the field ORDER is identical, only the widths shrink. */
+ elf_wr_addr(w, is32, 0); /* e_entry */
+ elf_wr_addr(w, is32, 0); /* e_phoff */
+ elf_wr_addr(w, is32, e_shoff); /* e_shoff */
elf_wr_u32(w, e_flags); /* e_flags */
- elf_wr_u16(w, ELF64_EHDR_SIZE); /* e_ehsize */
+ elf_wr_u16(w, (u16)ehdr_size); /* e_ehsize */
elf_wr_u16(w, 0); /* e_phentsize */
elf_wr_u16(w, 0); /* e_phnum */
- elf_wr_u16(w, ELF64_SHDR_SIZE); /* e_shentsize */
+ elf_wr_u16(w, (u16)shdr_size); /* e_shentsize */
elf_wr_u16(w, (u16)nsecs); /* e_shnum */
elf_wr_u16(w, (u16)idx_shstrtab); /* e_shstrndx */
@@ -684,15 +750,18 @@ void emit_elf(Compiler* c, ObjBuilder* ob, Writer* w) {
kit_writer_seek(w, e_shoff);
for (u32 i = 0; i < nsecs; ++i) {
const ElfSec* es = &secs[i];
+ /* Elf32_Shdr (40B) and Elf64_Shdr (64B) share field ORDER; only
+ * sh_flags/sh_addr/sh_offset/sh_size/sh_addralign/sh_entsize narrow
+ * from u64 to u32 under is32. */
elf_wr_u32(w, es->sh_name);
elf_wr_u32(w, es->sh_type);
- elf_wr_u64(w, es->sh_flags);
- elf_wr_u64(w, es->sh_addr);
- elf_wr_u64(w, es->sh_offset);
- elf_wr_u64(w, es->sh_size);
+ elf_wr_addr(w, is32, es->sh_flags);
+ elf_wr_addr(w, is32, es->sh_addr);
+ elf_wr_addr(w, is32, es->sh_offset);
+ elf_wr_addr(w, is32, es->sh_size);
elf_wr_u32(w, es->sh_link);
elf_wr_u32(w, es->sh_info);
- elf_wr_u64(w, es->sh_addralign);
- elf_wr_u64(w, es->sh_entsize);
+ elf_wr_addr(w, is32, es->sh_addralign);
+ elf_wr_addr(w, is32, es->sh_entsize);
}
}
diff --git a/src/obj/elf/link.c b/src/obj/elf/link.c
@@ -104,6 +104,54 @@ typedef struct __attribute__((packed)) Shdr64 {
u64 sh_entsize;
} Shdr64;
+/* ---- ELF32 wire structs (RV32 static ET_EXEC) ----
+ *
+ * Ehdr32/Shdr32 keep the ELF64 field ORDER, only narrowing the
+ * native-width members to u32. Phdr32 REORDERS p_flags to AFTER the
+ * sizes (vs Phdr64 where p_flags is field #2) — the packed struct below
+ * encodes that order, so the by-name field assignments in the phdr build
+ * loop stay correct under either class. */
+typedef struct __attribute__((packed)) Ehdr32 {
+ u8 e_ident[EI_NIDENT];
+ u16 e_type;
+ u16 e_machine;
+ u32 e_version;
+ u32 e_entry;
+ u32 e_phoff;
+ u32 e_shoff;
+ u32 e_flags;
+ u16 e_ehsize;
+ u16 e_phentsize;
+ u16 e_phnum;
+ u16 e_shentsize;
+ u16 e_shnum;
+ u16 e_shstrndx;
+} Ehdr32;
+
+typedef struct __attribute__((packed)) Phdr32 {
+ u32 p_type;
+ u32 p_offset;
+ u32 p_vaddr;
+ u32 p_paddr;
+ u32 p_filesz;
+ u32 p_memsz;
+ u32 p_flags;
+ u32 p_align;
+} Phdr32;
+
+typedef struct __attribute__((packed)) Shdr32 {
+ u32 sh_name;
+ u32 sh_type;
+ u32 sh_flags;
+ u32 sh_addr;
+ u32 sh_offset;
+ u32 sh_size;
+ u32 sh_link;
+ u32 sh_info;
+ u32 sh_addralign;
+ u32 sh_entsize;
+} Shdr32;
+
#define PT_NOTE 4
#define PT_TLS 7
@@ -140,6 +188,87 @@ static u32 perms_to_pflags(u32 secflags) {
return f;
}
+/* ---- class-aware header serializers ----
+ *
+ * The writer builds every header in the wide Ehdr64/Phdr64/Shdr64
+ * in-memory form, then serializes to the on-disk class. On ELFCLASS64
+ * the bytes are the wide struct verbatim (preserving the existing
+ * byte-exact RV64/x86_64/aa64 output). On ELFCLASS32 the native-width
+ * fields narrow to u32 and Phdr fields reorder (p_flags after sizes).
+ * The serialized record sizes are the wire constants in elf.h. */
+static size_t elf_ehdr_sz(int class32) {
+ return class32 ? ELF32_EHDR_SIZE : sizeof(Ehdr64);
+}
+static size_t elf_phdr_sz(int class32) {
+ return class32 ? ELF32_PHDR_SIZE : sizeof(Phdr64);
+}
+static size_t elf_shdr_sz(int class32) {
+ return class32 ? ELF32_SHDR_SIZE : sizeof(Shdr64);
+}
+
+static void write_ehdr(Writer* w, const Ehdr64* e, int class32) {
+ if (!class32) {
+ write_bytes(w, e, sizeof(*e));
+ return;
+ }
+ Ehdr32 e32;
+ memset(&e32, 0, sizeof e32);
+ memcpy(e32.e_ident, e->e_ident, EI_NIDENT);
+ e32.e_type = e->e_type;
+ e32.e_machine = e->e_machine;
+ e32.e_version = e->e_version;
+ e32.e_entry = (u32)e->e_entry;
+ e32.e_phoff = (u32)e->e_phoff;
+ e32.e_shoff = (u32)e->e_shoff;
+ e32.e_flags = e->e_flags;
+ e32.e_ehsize = e->e_ehsize;
+ e32.e_phentsize = e->e_phentsize;
+ e32.e_phnum = e->e_phnum;
+ e32.e_shentsize = e->e_shentsize;
+ e32.e_shnum = e->e_shnum;
+ e32.e_shstrndx = e->e_shstrndx;
+ write_bytes(w, &e32, sizeof e32);
+}
+
+static void write_phdrs(Writer* w, const Phdr64* phdrs, u32 n, int class32) {
+ if (!class32) {
+ write_bytes(w, phdrs, sizeof(Phdr64) * n);
+ return;
+ }
+ for (u32 i = 0; i < n; ++i) {
+ const Phdr64* p = &phdrs[i];
+ Phdr32 p32;
+ p32.p_type = p->p_type;
+ p32.p_offset = (u32)p->p_offset;
+ p32.p_vaddr = (u32)p->p_vaddr;
+ p32.p_paddr = (u32)p->p_paddr;
+ p32.p_filesz = (u32)p->p_filesz;
+ p32.p_memsz = (u32)p->p_memsz;
+ p32.p_flags = p->p_flags;
+ p32.p_align = (u32)p->p_align;
+ write_bytes(w, &p32, sizeof p32);
+ }
+}
+
+static void write_shdr(Writer* w, const Shdr64* s, int class32) {
+ if (!class32) {
+ write_bytes(w, s, sizeof(*s));
+ return;
+ }
+ Shdr32 s32;
+ s32.sh_name = s->sh_name;
+ s32.sh_type = s->sh_type;
+ s32.sh_flags = (u32)s->sh_flags;
+ s32.sh_addr = (u32)s->sh_addr;
+ s32.sh_offset = (u32)s->sh_offset;
+ s32.sh_size = (u32)s->sh_size;
+ s32.sh_link = s->sh_link;
+ s32.sh_info = s->sh_info;
+ s32.sh_addralign = (u32)s->sh_addralign;
+ s32.sh_entsize = (u32)s->sh_entsize;
+ write_bytes(w, &s32, sizeof s32);
+}
+
/* Scripted-layout post-pass: vaddrs are already final (the script
* pinned them via `. = …`), so only file offsets need to bump to
* leave room for ehdr+phdrs. Mirror of shift_image_addresses but
@@ -544,8 +673,26 @@ static u8 sym_kind_to_st_type(u8 kind) {
static u8 sym_bind_to_st_bind(u8 bind) { return elf_st_bind(bind); }
-/* Produces one Elf64_Sym record on the wire from a SymRec. */
-static void write_sym_rec(Writer* w, const SymRec* r) {
+/* Produces one symbol record on the wire from a SymRec. Elf32_Sym (16B)
+ * REORDERS fields vs Elf64_Sym (24B): st_value/st_size come BEFORE
+ * st_info/st_other/st_shndx, so select the byte layout by class32. */
+static void write_sym_rec(Writer* w, const SymRec* r, int class32) {
+ if (class32) {
+ u8 buf[ELF32_SYM_SIZE];
+ u32 i;
+ buf[0] = (u8)(r->st_name);
+ buf[1] = (u8)(r->st_name >> 8);
+ buf[2] = (u8)(r->st_name >> 16);
+ buf[3] = (u8)(r->st_name >> 24);
+ for (i = 0; i < 4; ++i) buf[4 + i] = (u8)(r->st_value >> (i * 8));
+ for (i = 0; i < 4; ++i) buf[8 + i] = (u8)(r->st_size >> (i * 8));
+ buf[12] = r->st_info;
+ buf[13] = r->st_other;
+ buf[14] = (u8)(r->st_shndx);
+ buf[15] = (u8)(r->st_shndx >> 8);
+ write_bytes(w, buf, sizeof buf);
+ return;
+ }
u8 buf[ELF64_SYM_SIZE];
buf[0] = (u8)(r->st_name);
buf[1] = (u8)(r->st_name >> 8);
@@ -662,6 +809,15 @@ void link_emit_elf(LinkImage* img, Writer* w) {
Compiler* c = img->c;
const ObjElfArchOps* arch = elf_arch_or_panic(c, "link_emit_elf");
u32 e_machine = arch->e_machine;
+ /* class32: ELFCLASS32 (RV32) output, derived from target ptr width.
+ * ptr_size must be 4 or 8 (every supported arch sets one of these). */
+ int class32 = (c->target.ptr_size == 4);
+ /* RV32 is static-only in v1: dynamic linking (link_dyn.c) and the
+ * PIE re-serialize block below remain ELFCLASS64. Gate here so the
+ * dynamic path is never reached for a 32-bit image. */
+ if (class32 && (img->pie || img->dyn))
+ compiler_panic(c, SRCLOC_NONE,
+ "rv32: dynamic/PIE linking unsupported; static only");
if (img->entry_sym == LINK_SYM_NONE)
compiler_panic(c, SRCLOC_NONE, "link_emit_elf: no resolved entry symbol");
/* IFUNC trampolines: layout_iplt builds the .iplt stubs + .igot.plt
@@ -682,7 +838,11 @@ void link_emit_elf(LinkImage* img, Writer* w) {
* self-describing memory image. */
int pie = img->pie;
int scripted = img->scripted;
- u64 img_base = (pie || scripted) ? 0ULL : IMAGE_BASE_STATIC;
+ /* Static ET_EXEC base: a `kit ld -Ttext ADDR` override (e.g. 0x80000000 for a
+ * qemu `virt` image) wins over IMAGE_BASE_STATIC; PIE/scripted keep base 0. */
+ u64 img_base = (pie || scripted) ? 0ULL
+ : img->text_base_set ? img->text_base
+ : IMAGE_BASE_STATIC;
/* ---- plan number of program headers ----
*
@@ -699,12 +859,15 @@ void link_emit_elf(LinkImage* img, Writer* w) {
u32 nphdr_total = nphdr_headers + img->nsegments + nphdr_buildid + has_tls +
nphdr_extra_dyn;
u64 build_id_note_bytes = scripted ? 0ULL : BUILD_ID_NOTE_BYTES;
+ /* Class-selected on-disk header sizes (ELF32: 52/32/40, ELF64: 64/56/64). */
+ u64 ehdr_sz = elf_ehdr_sz(class32);
+ u64 phent_sz = elf_phdr_sz(class32);
u64 headers_size =
- sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64) + build_id_note_bytes;
+ ehdr_sz + (u64)nphdr_total * phent_sz + build_id_note_bytes;
u64 headers_load = ALIGN_UP(headers_size, (u64)PAGE_SIZE);
/* The build-id note lives inside the headers PT_LOAD at this offset. */
- u64 build_id_off = sizeof(Ehdr64) + (u64)nphdr_total * sizeof(Phdr64);
+ u64 build_id_off = ehdr_sz + (u64)nphdr_total * phent_sz;
u64 build_id_addr = img_base + build_id_off;
/* ---- shift image addresses, apply relocations ----
@@ -1090,7 +1253,8 @@ void link_emit_elf(LinkImage* img, Writer* w) {
}
}
u64 symtab_off = ALIGN_UP(dbg_cursor, (u64)8u);
- u64 symtab_size = (u64)ELF64_SYM_SIZE * nsyms_emit;
+ u32 sym_size = class32 ? ELF32_SYM_SIZE : ELF64_SYM_SIZE;
+ u64 symtab_size = (u64)sym_size * nsyms_emit;
u64 strtab_off = symtab_off + symtab_size;
u64 strtab_size = strtab.len;
u64 shstrtab_off = strtab_off + strtab_size;
@@ -1234,7 +1398,7 @@ void link_emit_elf(LinkImage* img, Writer* w) {
ehdr.e_ident[1] = ELFMAG1;
ehdr.e_ident[2] = ELFMAG2;
ehdr.e_ident[3] = ELFMAG3;
- ehdr.e_ident[4] = ELFCLASS64;
+ ehdr.e_ident[4] = class32 ? ELFCLASS32 : ELFCLASS64;
ehdr.e_ident[5] = ELFDATA2LSB;
ehdr.e_ident[6] = EV_CURRENT;
ehdr.e_ident[7] = ELFOSABI_NONE;
@@ -1242,21 +1406,38 @@ void link_emit_elf(LinkImage* img, Writer* w) {
ehdr.e_machine = (u16)e_machine;
ehdr.e_version = EV_CURRENT;
ehdr.e_entry = img_base + LinkSyms_at(&img->syms, img->entry_sym - 1)->vaddr;
- ehdr.e_phoff = sizeof(Ehdr64);
+ ehdr.e_phoff = ehdr_sz;
ehdr.e_shoff = shdr_off;
- ehdr.e_flags = 0;
- ehdr.e_ehsize = sizeof(Ehdr64);
- ehdr.e_phentsize = sizeof(Phdr64);
+ /* e_flags carries the arch ABI bits (RISC-V float-ABI / RVC). This was
+ * previously hardcoded 0 for all arches; writing arch->e_flags lands
+ * the RV32/RV64 flags. (RV64 descriptor e_flags is unchanged, so its
+ * header now reflects RVC|FLOAT_ABI_DOUBLE — see integration notes.) */
+ ehdr.e_flags = arch->e_flags;
+ /* rv32: ilp32 and ilp32f share KIT_ARCH_RV32, so the descriptor's float-ABI
+ * bits are a placeholder. Override them from -mabi so the executable's ABI
+ * matches its objects (and a soft ilp32 image isn't mislabelled single). */
+ if (e_machine == EM_RISCV && class32) {
+ u32 fa = EF_RISCV_FLOAT_ABI_SOFT;
+ if (c->target.float_abi == KIT_FLOAT_ABI_SINGLE)
+ fa = EF_RISCV_FLOAT_ABI_SINGLE;
+ else if (c->target.float_abi == KIT_FLOAT_ABI_DOUBLE)
+ fa = EF_RISCV_FLOAT_ABI_DOUBLE;
+ else if (c->target.float_abi == KIT_FLOAT_ABI_DEFAULT)
+ fa = EF_RISCV_FLOAT_ABI_SINGLE;
+ ehdr.e_flags = (ehdr.e_flags & ~(u32)EF_RISCV_FLOAT_ABI_MASK) | fa;
+ }
+ ehdr.e_ehsize = (u16)ehdr_sz;
+ ehdr.e_phentsize = (u16)phent_sz;
ehdr.e_phnum = (u16)nphdr_total;
- ehdr.e_shentsize = sizeof(Shdr64);
+ ehdr.e_shentsize = (u16)elf_shdr_sz(class32);
ehdr.e_shnum = (u16)nshdr;
ehdr.e_shstrndx = (u16)shndx_shstrtab;
/* ---- write ehdr, phdrs, build-id note, pad ---- */
u64 cur_off;
- write_bytes(w, &ehdr, sizeof(ehdr));
- write_bytes(w, phdrs, sizeof(Phdr64) * nphdr_total);
- cur_off = sizeof(ehdr) + sizeof(Phdr64) * nphdr_total;
+ write_ehdr(w, &ehdr, class32);
+ write_phdrs(w, phdrs, nphdr_total, class32);
+ cur_off = ehdr_sz + phent_sz * nphdr_total;
/* .note.gnu.build-id wire format:
* u32 namesz = 4 ("GNU\0")
@@ -1338,7 +1519,7 @@ void link_emit_elf(LinkImage* img, Writer* w) {
}
{
u32 i;
- for (i = 0; i < nsyms_emit; ++i) write_sym_rec(w, &recs[i]);
+ for (i = 0; i < nsyms_emit; ++i) write_sym_rec(w, &recs[i], class32);
cur_off += symtab_size;
}
if (strtab.len) {
@@ -1360,7 +1541,7 @@ void link_emit_elf(LinkImage* img, Writer* w) {
u32 i;
/* shdr 0: NULL */
memset(&sh, 0, sizeof(sh));
- write_bytes(w, &sh, sizeof(sh));
+ write_shdr(w, &sh, class32);
/* Locate dyn-section names (interned earlier in layout_dyn) so
* we can override sh_type / sh_link / sh_info / sh_entsize for
* .dynsym / .dynstr / .gnu.hash / .rela.dyn / .rela.plt /
@@ -1448,7 +1629,7 @@ void link_emit_elf(LinkImage* img, Writer* w) {
sh.sh_entsize = 8;
}
}
- write_bytes(w, &sh, sizeof(sh));
+ write_shdr(w, &sh, class32);
}
/* shdr: .note.gnu.build-id (allocatable; in headers PT_LOAD) */
memset(&sh, 0, sizeof(sh));
@@ -1459,7 +1640,7 @@ void link_emit_elf(LinkImage* img, Writer* w) {
sh.sh_offset = build_id_off;
sh.sh_size = BUILD_ID_NOTE_BYTES;
sh.sh_addralign = 4;
- write_bytes(w, &sh, sizeof(sh));
+ write_shdr(w, &sh, class32);
/* shdr: .symtab */
memset(&sh, 0, sizeof(sh));
sh.sh_name = sh_name_symtab;
@@ -1471,8 +1652,8 @@ void link_emit_elf(LinkImage* img, Writer* w) {
sh.sh_link = shndx_strtab;
sh.sh_info = first_global_idx;
sh.sh_addralign = 8;
- sh.sh_entsize = ELF64_SYM_SIZE;
- write_bytes(w, &sh, sizeof(sh));
+ sh.sh_entsize = sym_size;
+ write_shdr(w, &sh, class32);
/* shdr: .strtab */
memset(&sh, 0, sizeof(sh));
sh.sh_name = sh_name_strtab;
@@ -1480,7 +1661,7 @@ void link_emit_elf(LinkImage* img, Writer* w) {
sh.sh_offset = strtab_off;
sh.sh_size = strtab_size;
sh.sh_addralign = 1;
- write_bytes(w, &sh, sizeof(sh));
+ write_shdr(w, &sh, class32);
/* shdr: .shstrtab */
memset(&sh, 0, sizeof(sh));
sh.sh_name = sh_name_shstrtab;
@@ -1488,7 +1669,7 @@ void link_emit_elf(LinkImage* img, Writer* w) {
sh.sh_offset = shstrtab_off;
sh.sh_size = shstrtab_size;
sh.sh_addralign = 1;
- write_bytes(w, &sh, sizeof(sh));
+ write_shdr(w, &sh, class32);
}
heap->free(heap, phdrs, sizeof(Phdr64) * nphdr_total);
diff --git a/src/obj/elf/link_dyn.c b/src/obj/elf/link_dyn.c
@@ -507,6 +507,17 @@ void layout_dyn(Linker* l, LinkImage* img) {
if (!l->emit_pie) return;
+ /* The dynamic-section layout below is ELF64-only (Elf64_Sym/Dyn/Rela wire
+ * sizes, 8-byte GOT slots). rv32 is a static-only v1 target, so a dynamic /
+ * PIE rv32 link is unsupported — fail with a clear diagnostic instead of
+ * crashing on the ELF64 assumptions. Link rv32 images statically (kit ld
+ * -no-pie, or a -T script for bare-metal section placement). */
+ if (img->c->target.ptr_size == 4u) {
+ compiler_panic(img->c, SRCLOC_NONE,
+ "link: dynamic/PIE linking is not supported for 32-bit "
+ "RISC-V (ELFCLASS32); link statically (kit ld -no-pie)");
+ }
+
arch = link_arch_desc_for(l->c);
if (!arch)
compiler_panic(img->c, SRCLOC_NONE, "link: layout_dyn: no arch descriptor");
diff --git a/src/obj/elf/read.c b/src/obj/elf/read.c
@@ -38,17 +38,33 @@ typedef struct ShdrRec {
u64 sh_entsize;
} ShdrRec;
-static void parse_shdr(const u8* p, ShdrRec* out) {
- out->sh_name = rd_u32_le(p + 0);
- out->sh_type = rd_u32_le(p + 4);
- out->sh_flags = rd_u64_le(p + 8);
- out->sh_addr = rd_u64_le(p + 16);
- out->sh_offset = rd_u64_le(p + 24);
- out->sh_size = rd_u64_le(p + 32);
- out->sh_link = rd_u32_le(p + 40);
- out->sh_info = rd_u32_le(p + 44);
- out->sh_addralign = rd_u64_le(p + 48);
- out->sh_entsize = rd_u64_le(p + 56);
+static void parse_shdr(const u8* p, int is32, ShdrRec* out) {
+ /* Elf32_Shdr (40B) shares field order with Elf64_Shdr (64B); only the
+ * flags/addr/offset/size/addralign/entsize fields narrow to u32 and
+ * shift the following offsets. The ShdrRec stays u64-wide. */
+ if (is32) {
+ out->sh_name = rd_u32_le(p + 0);
+ out->sh_type = rd_u32_le(p + 4);
+ out->sh_flags = rd_u32_le(p + 8);
+ out->sh_addr = rd_u32_le(p + 12);
+ out->sh_offset = rd_u32_le(p + 16);
+ out->sh_size = rd_u32_le(p + 20);
+ out->sh_link = rd_u32_le(p + 24);
+ out->sh_info = rd_u32_le(p + 28);
+ out->sh_addralign = rd_u32_le(p + 32);
+ out->sh_entsize = rd_u32_le(p + 36);
+ } else {
+ out->sh_name = rd_u32_le(p + 0);
+ out->sh_type = rd_u32_le(p + 4);
+ out->sh_flags = rd_u64_le(p + 8);
+ out->sh_addr = rd_u64_le(p + 16);
+ out->sh_offset = rd_u64_le(p + 24);
+ out->sh_size = rd_u64_le(p + 32);
+ out->sh_link = rd_u32_le(p + 40);
+ out->sh_info = rd_u32_le(p + 44);
+ out->sh_addralign = rd_u64_le(p + 48);
+ out->sh_entsize = rd_u64_le(p + 56);
+ }
}
/* ---- mappers ---- */
@@ -239,38 +255,51 @@ static Sym intern_cstr(Compiler* c, const char* s) {
* malformed sub-table would otherwise abort a useful inspection: a bad
* .dynamic / .dynsym / dyn-reloc table is skipped rather than panicked. */
static void read_elf_image(Compiler* c, ObjBuilder* ob, const u8* data,
- size_t len, u16 e_type, const ShdrRec* shdrs,
- u16 e_shnum, const u32* elf_to_obj,
- u32 (*reloc_from)(u32)) {
+ size_t len, u16 e_type, int is32,
+ const ShdrRec* shdrs, u16 e_shnum,
+ const u32* elf_to_obj, u32 (*reloc_from)(u32)) {
+ u32 phdr_size = is32 ? ELF32_PHDR_SIZE : ELF64_PHDR_SIZE;
+ u32 sym_size = is32 ? ELF32_SYM_SIZE : ELF64_SYM_SIZE;
+ u32 rela_size = is32 ? ELF32_RELA_SIZE : ELF64_RELA_SIZE;
+ u32 rel_size = is32 ? 8u : 16u;
+ u32 dyn_size = is32 ? ELF32_DYN_SIZE : ELF64_DYN_SIZE;
ObjImage* im =
obj_image_ensure(ob, e_type == ET_DYN ? OBJ_KIND_DYN : OBJ_KIND_EXEC);
if (!im) compiler_panic(c, SRCLOC_NONE, "read_elf: obj_image_ensure failed");
- obj_image_set_entry(im, rd_u64_le(data + 24));
+ /* e_entry is at offset 24 in both Ehdr32/Ehdr64, native width. */
+ obj_image_set_entry(im, elf_rd_addr(data + 24, is32));
/* Program headers -> segments (+ PT_INTERP string, image base). */
{
- u64 e_phoff = rd_u64_le(data + 32);
- u16 e_phentsize = rd_u16_le(data + 54);
- u16 e_phnum = rd_u16_le(data + 56);
+ /* e_phoff: 4B@28 on ELF32, 8B@32 on ELF64. e_phentsize/e_phnum
+ * shift accordingly (42/44 vs 54/56). */
+ u64 e_phoff = is32 ? (u64)rd_u32_le(data + 28) : rd_u64_le(data + 32);
+ u16 e_phentsize = rd_u16_le(data + (is32 ? 42 : 54));
+ u16 e_phnum = rd_u16_le(data + (is32 ? 44 : 56));
int have_base = 0;
u64 image_base = 0;
if (e_phnum) {
- if (e_phentsize != ELF64_PHDR_SIZE)
+ if (e_phentsize != phdr_size)
compiler_panic(c, SRCLOC_NONE, "read_elf: unexpected e_phentsize %u",
(u32)e_phentsize);
- if (e_phoff + (u64)e_phnum * ELF64_PHDR_SIZE > len)
+ if (e_phoff + (u64)e_phnum * phdr_size > len)
compiler_panic(c, SRCLOC_NONE,
"read_elf: program header table out of range");
for (u16 i = 0; i < e_phnum; ++i) {
- const u8* p = data + e_phoff + (u64)i * ELF64_PHDR_SIZE;
+ const u8* p = data + e_phoff + (u64)i * phdr_size;
+ /* Elf32_Phdr REORDERS p_flags AFTER the sizes:
+ * p_type@0,p_offset@4,p_vaddr@8,p_paddr@12,p_filesz@16,
+ * p_memsz@20,p_flags@24,p_align@28 (all u32).
+ * Elf64_Phdr: p_type@0,p_flags@4,p_offset@8,p_vaddr@16,
+ * p_filesz@32,p_memsz@40,p_align@48. */
u32 p_type = rd_u32_le(p + 0);
- u32 p_flags = rd_u32_le(p + 4);
- u64 p_offset = rd_u64_le(p + 8);
- u64 p_vaddr = rd_u64_le(p + 16);
- u64 p_filesz = rd_u64_le(p + 32);
- u64 p_memsz = rd_u64_le(p + 40);
- u64 p_align = rd_u64_le(p + 48);
+ u32 p_flags = is32 ? rd_u32_le(p + 24) : rd_u32_le(p + 4);
+ u64 p_offset = is32 ? (u64)rd_u32_le(p + 4) : rd_u64_le(p + 8);
+ u64 p_vaddr = is32 ? (u64)rd_u32_le(p + 8) : rd_u64_le(p + 16);
+ u64 p_filesz = is32 ? (u64)rd_u32_le(p + 16) : rd_u64_le(p + 32);
+ u64 p_memsz = is32 ? (u64)rd_u32_le(p + 20) : rd_u64_le(p + 40);
+ u64 p_align = is32 ? (u64)rd_u32_le(p + 28) : rd_u64_le(p + 48);
ObjSegment seg;
seg.name = intern_cstr(c, pt_type_name(p_type));
seg.vaddr = p_vaddr;
@@ -318,9 +347,10 @@ static void read_elf_image(Compiler* c, ObjBuilder* ob, const u8* data,
u64 dynstr_sz = str_sh->sh_size;
const u8* dynp = data + dsh->sh_offset;
u64 dynsz = dsh->sh_size;
- for (u64 off = 0; off + 16 <= dynsz; off += 16) {
- u64 tag = rd_u64_le(dynp + off);
- u64 val = rd_u64_le(dynp + off + 8);
+ /* ELF32 DT entries are 8B (d_tag:u32, d_un:u32); ELF64 16B. */
+ for (u64 off = 0; off + dyn_size <= dynsz; off += dyn_size) {
+ u64 tag = elf_rd_addr(dynp + off, is32);
+ u64 val = elf_rd_addr(dynp + off + (is32 ? 4 : 8), is32);
if (tag == DT_NULL) break;
if (tag != DT_NEEDED && tag != DT_SONAME && tag != DT_RPATH &&
tag != DT_RUNPATH)
@@ -354,23 +384,26 @@ static void read_elf_image(Compiler* c, ObjBuilder* ob, const u8* data,
u32 ndynsym = 0;
if (dynsym_idx) {
const ShdrRec* sh = &shdrs[dynsym_idx];
- if (sh->sh_entsize == ELF64_SYM_SIZE &&
- (sh->sh_size % ELF64_SYM_SIZE) == 0 && sh->sh_link < e_shnum &&
- sh->sh_offset + sh->sh_size <= len) {
+ if (sh->sh_entsize == sym_size && (sh->sh_size % sym_size) == 0 &&
+ sh->sh_link < e_shnum && sh->sh_offset + sh->sh_size <= len) {
const ShdrRec* str_sh = &shdrs[sh->sh_link];
if (str_sh->sh_offset + str_sh->sh_size <= len) {
const u8* strtab = data + str_sh->sh_offset;
u64 strtab_sz = str_sh->sh_size;
const u8* base = data + sh->sh_offset;
- ndynsym = (u32)(sh->sh_size / ELF64_SYM_SIZE);
+ ndynsym = (u32)(sh->sh_size / sym_size);
dynsym_names = arena_zarray(c->scratch, Sym, ndynsym ? ndynsym : 1);
for (u32 i = 1; i < ndynsym; ++i) {
- const u8* p = base + (u64)i * ELF64_SYM_SIZE;
+ const u8* p = base + (u64)i * sym_size;
+ /* Elf32_Sym REORDERS: st_name@0, st_value@4, st_size@8,
+ * st_info@12, st_other@13, st_shndx@14. Elf64_Sym:
+ * st_name@0, st_info@4, st_other@5, st_shndx@6,
+ * st_value@8, st_size@16. */
u32 st_name = rd_u32_le(p + 0);
- u8 st_info = p[4];
- u16 st_shndx = rd_u16_le(p + 6);
- u64 st_value = rd_u64_le(p + 8);
- u64 st_size = rd_u64_le(p + 16);
+ u8 st_info = is32 ? p[12] : p[4];
+ u16 st_shndx = is32 ? rd_u16_le(p + 14) : rd_u16_le(p + 6);
+ u64 st_value = is32 ? (u64)rd_u32_le(p + 4) : rd_u64_le(p + 8);
+ u64 st_size = is32 ? (u64)rd_u32_le(p + 8) : rd_u64_le(p + 16);
u32 nlen;
const char* nm = strtab_lookup(strtab, strtab_sz, st_name, &nlen);
Sym sn =
@@ -403,18 +436,22 @@ static void read_elf_image(Compiler* c, ObjBuilder* ob, const u8* data,
if (!is_rela && !is_rel) continue;
if (!(sh->sh_flags & SHF_ALLOC))
continue; /* link-time relocs: not dynamic */
- entsize = is_rela ? ELF64_RELA_SIZE : 16;
+ entsize = is_rela ? rela_size : rel_size;
if (sh->sh_entsize != entsize || (sh->sh_size % entsize) != 0) continue;
if (sh->sh_offset + sh->sh_size > len) continue;
nrec = (u32)(sh->sh_size / entsize);
base = data + sh->sh_offset;
for (j = 0; j < nrec; ++j) {
+ /* Elf32_Rela (12B): r_offset@0, r_info@4 (ELF32 packing),
+ * r_addend@8. Elf64_Rela (24B): r_offset@0, r_info@8, r_addend@16. */
const u8* p = base + (u64)j * entsize;
- u64 r_offset = rd_u64_le(p + 0);
- u64 r_info = rd_u64_le(p + 8);
- i64 r_addend = is_rela ? (i64)rd_u64_le(p + 16) : 0;
- u32 esym = ELF64_R_SYM(r_info);
- u32 kind = reloc_from(ELF64_R_TYPE(r_info));
+ u64 r_offset = elf_rd_addr(p + 0, is32);
+ u64 r_info = is32 ? (u64)rd_u32_le(p + 4) : rd_u64_le(p + 8);
+ i64 r_addend =
+ is_rela ? (is32 ? (i64)(i32)rd_u32_le(p + 8) : (i64)rd_u64_le(p + 16))
+ : 0;
+ u32 esym = is32 ? ELF32_R_SYM(r_info) : ELF64_R_SYM(r_info);
+ u32 kind = reloc_from(is32 ? ELF32_R_TYPE(r_info) : ELF64_R_TYPE(r_info));
ObjImageReloc dr;
if (kind == (u32)-1) continue; /* unmodeled dyn reloc type: skip */
dr.section = OBJ_SEC_NONE; /* offset is a vaddr, not section-relative */
@@ -431,20 +468,35 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
size_t len) {
(void)name;
- if (len < ELF64_EHDR_SIZE)
+ /* Need at least the e_ident to read EI_CLASS; the full min-length
+ * check below uses the class-selected ehdr size. */
+ if (len < EI_NIDENT)
compiler_panic(c, SRCLOC_NONE, "read_elf: input shorter than ELF header");
if (data[EI_MAG0] != ELFMAG0 || data[EI_MAG1] != ELFMAG1 ||
data[EI_MAG2] != ELFMAG2 || data[EI_MAG3] != ELFMAG3)
compiler_panic(c, SRCLOC_NONE, "read_elf: bad ELF magic");
- if (data[EI_CLASS] != ELFCLASS64)
- compiler_panic(c, SRCLOC_NONE, "read_elf: not ELFCLASS64 (got %u)",
+ /* Accept both classes; is32 (EI_CLASS==ELFCLASS32) drives every
+ * stride/offset/field-order decision below. RV32 and RV64 share
+ * EM_RISCV — the reader cannot tell them apart by e_machine, only by
+ * EI_CLASS, so is32 is the single source of truth here. */
+ if (data[EI_CLASS] != ELFCLASS64 && data[EI_CLASS] != ELFCLASS32)
+ compiler_panic(c, SRCLOC_NONE, "read_elf: not ELFCLASS32/64 (got %u)",
data[EI_CLASS]);
if (data[EI_DATA] != ELFDATA2LSB)
compiler_panic(c, SRCLOC_NONE, "read_elf: not ELFDATA2LSB (got %u)",
data[EI_DATA]);
+ int is32 = (data[EI_CLASS] == ELFCLASS32);
+ u32 ehdr_size = is32 ? ELF32_EHDR_SIZE : ELF64_EHDR_SIZE;
+ u32 shdr_size = is32 ? ELF32_SHDR_SIZE : ELF64_SHDR_SIZE;
+ u32 sym_size = is32 ? ELF32_SYM_SIZE : ELF64_SYM_SIZE;
+ u32 rela_size = is32 ? ELF32_RELA_SIZE : ELF64_RELA_SIZE;
+ u32 rel_size = is32 ? 8u : 16u;
+ if (len < ehdr_size)
+ compiler_panic(c, SRCLOC_NONE, "read_elf: input shorter than ELF header");
+
u16 e_type = rd_u16_le(data + 16);
/* ET_REL parses to the section/symbol/reloc view only. ET_EXEC/ET_DYN
* additionally get the linked-image view (read_elf_image, below); their
@@ -457,9 +509,9 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
(u32)e_type);
u16 e_machine = rd_u16_le(data + 18);
- const ObjFormatImpl* fmt = obj_format_lookup(KIT_OBJ_ELF);
- const ObjElfArchOps* arch =
- fmt && fmt->elf_machine ? fmt->elf_machine(e_machine) : NULL;
+ /* EM_RISCV is shared by RV32/RV64; disambiguate by EI_CLASS via
+ * obj_elf_machine_class (obj_elf_machine keys on e_machine alone). */
+ const ObjElfArchOps* arch = obj_elf_machine_class(e_machine, data[EI_CLASS]);
u32 (*reloc_from)(u32);
if (!arch || !arch->reloc_from) {
compiler_panic(c, SRCLOC_NONE, "read_elf: unsupported e_machine 0x%x",
@@ -467,11 +519,15 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
}
reloc_from = arch->reloc_from;
- u64 e_shoff = rd_u64_le(data + 40);
- u32 e_flags = rd_u32_le(data + 48);
- u16 e_shentsize = rd_u16_le(data + 58);
- u16 e_shnum = rd_u16_le(data + 60);
- u16 e_shstrndx = rd_u16_le(data + 62);
+ /* Post-e_version Ehdr fields narrow + shift under ELF32: e_entry/
+ * e_phoff/e_shoff are 4B (vs 8B), so e_flags@36, e_phentsize@42,
+ * e_phnum@44, e_shentsize@46, e_shnum@48, e_shstrndx@50 (vs 48/54/
+ * 56/58/60/62 on ELF64). */
+ u64 e_shoff = is32 ? (u64)rd_u32_le(data + 32) : rd_u64_le(data + 40);
+ u32 e_flags = rd_u32_le(data + (is32 ? 36 : 48));
+ u16 e_shentsize = rd_u16_le(data + (is32 ? 46 : 58));
+ u16 e_shnum = rd_u16_le(data + (is32 ? 48 : 60));
+ u16 e_shstrndx = rd_u16_le(data + (is32 ? 50 : 62));
/* A fully section-stripped image (objcopy --strip-sections, packers,
* some release binaries) sets e_shoff/e_shnum to zero: the section
@@ -481,10 +537,10 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
* ET_REL with no sections carries no model state, so still reject it. */
int has_sht = (e_shoff != 0 && e_shnum != 0);
if (has_sht) {
- if (e_shentsize != ELF64_SHDR_SIZE)
+ if (e_shentsize != shdr_size)
compiler_panic(c, SRCLOC_NONE, "read_elf: unexpected e_shentsize %u",
(u32)e_shentsize);
- if (e_shoff + (u64)e_shnum * ELF64_SHDR_SIZE > len)
+ if (e_shoff + (u64)e_shnum * shdr_size > len)
compiler_panic(c, SRCLOC_NONE,
"read_elf: section header table out of range");
if (e_shstrndx >= e_shnum)
@@ -504,7 +560,7 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
if (has_sht) {
shdrs = arena_array(c->scratch, ShdrRec, e_shnum);
for (u32 i = 0; i < e_shnum; ++i)
- parse_shdr(data + e_shoff + (u64)i * ELF64_SHDR_SIZE, &shdrs[i]);
+ parse_shdr(data + e_shoff + (u64)i * shdr_size, is32, &shdrs[i]);
const ShdrRec* shstr_sh = &shdrs[e_shstrndx];
if (shstr_sh->sh_offset + shstr_sh->sh_size > len)
@@ -605,13 +661,13 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
if (symtab_shndx) {
const ShdrRec* sh = &shdrs[symtab_shndx];
- if (sh->sh_entsize != ELF64_SYM_SIZE)
+ if (sh->sh_entsize != sym_size)
compiler_panic(c, SRCLOC_NONE, "read_elf: .symtab entsize %llu != %u",
- (unsigned long long)sh->sh_entsize, (u32)ELF64_SYM_SIZE);
- if (sh->sh_size % ELF64_SYM_SIZE)
+ (unsigned long long)sh->sh_entsize, sym_size);
+ if (sh->sh_size % sym_size)
compiler_panic(c, SRCLOC_NONE,
"read_elf: .symtab size %llu not a multiple of %u",
- (unsigned long long)sh->sh_size, (u32)ELF64_SYM_SIZE);
+ (unsigned long long)sh->sh_size, sym_size);
if (sh->sh_link >= e_shnum)
compiler_panic(c, SRCLOC_NONE, "read_elf: .symtab sh_link %u out of range",
sh->sh_link);
@@ -621,18 +677,21 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
const u8* strtab = data + str_sh->sh_offset;
u64 strtab_sz = str_sh->sh_size;
- nsyms = (u32)(sh->sh_size / ELF64_SYM_SIZE);
+ nsyms = (u32)(sh->sh_size / sym_size);
sym_elf_to_obj = arena_zarray(c->scratch, u32, nsyms ? nsyms : 1);
const u8* base = data + sh->sh_offset;
for (u32 i = 1; i < nsyms; ++i) { /* skip index 0 */
- const u8* p = base + (u64)i * ELF64_SYM_SIZE;
+ const u8* p = base + (u64)i * sym_size;
+ /* Elf32_Sym REORDERS: st_name@0, st_value@4, st_size@8, st_info@12,
+ * st_other@13, st_shndx@14. Elf64_Sym: st_name@0, st_info@4,
+ * st_other@5, st_shndx@6, st_value@8, st_size@16. */
u32 st_name = rd_u32_le(p + 0);
- u8 st_info = p[4];
- u8 st_other = p[5];
- u16 st_shndx = rd_u16_le(p + 6);
- u64 st_value = rd_u64_le(p + 8);
- u64 st_size = rd_u64_le(p + 16);
+ u8 st_info = is32 ? p[12] : p[4];
+ u8 st_other = is32 ? p[13] : p[5];
+ u16 st_shndx = is32 ? rd_u16_le(p + 14) : rd_u16_le(p + 6);
+ u64 st_value = is32 ? (u64)rd_u32_le(p + 4) : rd_u64_le(p + 8);
+ u64 st_size = is32 ? (u64)rd_u32_le(p + 8) : rd_u64_le(p + 16);
u32 nlen;
const char* nm = strtab_lookup(strtab, strtab_sz, st_name, &nlen);
@@ -688,7 +747,7 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
* SHF_ALLOC, so this is a no-op for relocatable objects. */
if (sh->sh_flags & SHF_ALLOC) continue;
- u32 entsize = is_rela ? ELF64_RELA_SIZE : 16;
+ u32 entsize = is_rela ? rela_size : rel_size;
if (sh->sh_entsize != entsize)
compiler_panic(c, SRCLOC_NONE, "read_elf: rela entsize %llu != %u",
(unsigned long long)sh->sh_entsize, entsize);
@@ -701,12 +760,17 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
u32 nrec = (u32)(sh->sh_size / entsize);
const u8* base = data + sh->sh_offset;
for (u32 j = 0; j < nrec; ++j) {
+ /* Elf32_Rela (12B): r_offset@0, r_info@4 (ELF32 packing, 8-bit
+ * type), r_addend@8. Elf64_Rela (24B): r_offset@0, r_info@8,
+ * r_addend@16. */
const u8* p = base + (u64)j * entsize;
- u64 r_offset = rd_u64_le(p + 0);
- u64 r_info = rd_u64_le(p + 8);
- i64 r_addend = is_rela ? (i64)rd_u64_le(p + 16) : 0;
- u32 esym = ELF64_R_SYM(r_info);
- u32 etype = ELF64_R_TYPE(r_info);
+ u64 r_offset = elf_rd_addr(p + 0, is32);
+ u64 r_info = is32 ? (u64)rd_u32_le(p + 4) : rd_u64_le(p + 8);
+ i64 r_addend =
+ is_rela ? (is32 ? (i64)(i32)rd_u32_le(p + 8) : (i64)rd_u64_le(p + 16))
+ : 0;
+ u32 esym = is32 ? ELF32_R_SYM(r_info) : ELF64_R_SYM(r_info);
+ u32 etype = is32 ? ELF32_R_TYPE(r_info) : ELF64_R_TYPE(r_info);
u32 kind = reloc_from(etype);
if (kind == (u32)-1)
@@ -754,7 +818,7 @@ ObjBuilder* read_elf(Compiler* c, const char* name, const u8* data,
/* ET_EXEC / ET_DYN: attach the linked-image view (segments + dynamic). */
if (e_type != ET_REL)
- read_elf_image(c, ob, data, len, e_type, shdrs, e_shnum, elf_to_obj,
+ read_elf_image(c, ob, data, len, e_type, is32, shdrs, e_shnum, elf_to_obj,
reloc_from);
obj_finalize(ob);
@@ -843,9 +907,11 @@ ObjBuilder* read_elf_dso(Compiler* c, const char* name, const u8* data,
if (e_shstrndx >= e_shnum)
compiler_panic(c, SRCLOC_NONE, "read_elf_dso: e_shstrndx out of range");
+ /* read_elf_dso is ELFCLASS64-only (panics above on other classes), so
+ * parse with the ELF64 layout (is32 = 0). */
ShdrRec* shdrs = arena_array(c->scratch, ShdrRec, e_shnum);
for (u32 i = 0; i < e_shnum; ++i)
- parse_shdr(data + e_shoff + (u64)i * ELF64_SHDR_SIZE, &shdrs[i]);
+ parse_shdr(data + e_shoff + (u64)i * ELF64_SHDR_SIZE, 0, &shdrs[i]);
/* Locate .dynsym (preferred over .symtab — a stripped DSO carries
* only .dynsym) and its associated strtab via sh_link. */
diff --git a/src/obj/elf/reloc_riscv32.c b/src/obj/elf/reloc_riscv32.c
@@ -0,0 +1,177 @@
+/* RelocKind <-> RISC-V ELF reloc-type mapping (ELFCLASS32 / RV32 ILP32*).
+ *
+ * Clone of reloc_riscv64.c. The RISC-V reloc type codes are XLEN-neutral
+ * and fit in the 8-bit ELF32 r_info type field, so every code-relative /
+ * data-relative kind is reused verbatim. The only divergences are the
+ * 64-bit-wide kinds:
+ * - R_ABS64 -> unsupported on RV32 (ELF_R_RISCV_NONE)
+ * - R_RV_ADD64 / R_RV_SUB64 -> unsupported on RV32 (ELF_R_RISCV_NONE)
+ * and, on the read side, ELF_R_RISCV_64 / ADD64 / SUB64 map to the
+ * (u32)-1 sentinel so the reader diagnoses them rather than silently
+ * fabricating a 64-bit RelocKind for a 32-bit object.
+ *
+ * R_ABS32 -> ELF_R_RISCV_32 is the RV32 primary absolute reloc. */
+
+#include "obj/elf/elf.h"
+
+u32 elf_riscv32_reloc_to(u32 kind /* RelocKind */) {
+ switch (kind) {
+ case R_NONE:
+ return ELF_R_RISCV_NONE;
+ /* R_ABS64 / R_RV_ADD64 / R_RV_SUB64 are 64-bit-only: unsupported on
+ * RV32 (fall through to NONE so emit_elf flags them). */
+ case R_ABS32:
+ return ELF_R_RISCV_32;
+ case R_PC32:
+ return ELF_R_RISCV_32_PCREL;
+ case R_RV_HI20:
+ return ELF_R_RISCV_HI20;
+ case R_RV_LO12_I:
+ return ELF_R_RISCV_LO12_I;
+ case R_RV_LO12_S:
+ return ELF_R_RISCV_LO12_S;
+ case R_RV_BRANCH:
+ return ELF_R_RISCV_BRANCH;
+ case R_RV_JAL:
+ return ELF_R_RISCV_JAL;
+ case R_RV_CALL:
+ return ELF_R_RISCV_CALL;
+ case R_PLT32:
+ return ELF_R_RISCV_CALL_PLT;
+ case R_RV_PCREL_HI20:
+ return ELF_R_RISCV_PCREL_HI20;
+ case R_RV_PCREL_LO12_I:
+ return ELF_R_RISCV_PCREL_LO12_I;
+ case R_RV_PCREL_LO12_S:
+ return ELF_R_RISCV_PCREL_LO12_S;
+ case R_RV_GOT_HI20:
+ return ELF_R_RISCV_GOT_HI20;
+ case R_RV_TLS_GOT_HI20:
+ return ELF_R_RISCV_TLS_GOT_HI20;
+ case R_RV_TPREL_HI20:
+ return ELF_R_RISCV_TPREL_HI20;
+ case R_RV_TPREL_LO12_I:
+ return ELF_R_RISCV_TPREL_LO12_I;
+ case R_RV_TPREL_LO12_S:
+ return ELF_R_RISCV_TPREL_LO12_S;
+ case R_RV_TPREL_ADD:
+ return ELF_R_RISCV_TPREL_ADD;
+ case R_RV_ADD8:
+ return ELF_R_RISCV_ADD8;
+ case R_RV_ADD16:
+ return ELF_R_RISCV_ADD16;
+ case R_RV_ADD32:
+ return ELF_R_RISCV_ADD32;
+ case R_RV_SUB8:
+ return ELF_R_RISCV_SUB8;
+ case R_RV_SUB16:
+ return ELF_R_RISCV_SUB16;
+ case R_RV_SUB32:
+ return ELF_R_RISCV_SUB32;
+ case R_RV_ALIGN:
+ return ELF_R_RISCV_ALIGN;
+ case R_RV_RVC_BRANCH:
+ return ELF_R_RISCV_RVC_BRANCH;
+ case R_RV_RVC_JUMP:
+ return ELF_R_RISCV_RVC_JUMP;
+ case R_RV_RELAX:
+ return ELF_R_RISCV_RELAX;
+ case R_RV_SUB6:
+ return ELF_R_RISCV_SUB6;
+ case R_RV_SET6:
+ return ELF_R_RISCV_SET6;
+ case R_RV_SET8:
+ return ELF_R_RISCV_SET8;
+ case R_RV_SET16:
+ return ELF_R_RISCV_SET16;
+ case R_RV_SET32:
+ return ELF_R_RISCV_SET32;
+ case R_RV_SET_ULEB128:
+ return ELF_R_RISCV_SET_ULEB128;
+ case R_RV_SUB_ULEB128:
+ return ELF_R_RISCV_SUB_ULEB128;
+ default:
+ return ELF_R_RISCV_NONE;
+ }
+}
+
+u32 elf_riscv32_reloc_from(u32 elf_type) {
+ switch (elf_type) {
+ case ELF_R_RISCV_NONE:
+ return R_NONE;
+ /* ELF_R_RISCV_64 / ADD64 / SUB64 are 64-bit-only: not valid in an
+ * ELFCLASS32 object — fall through to the (u32)-1 sentinel. */
+ case ELF_R_RISCV_32:
+ return R_ABS32;
+ case ELF_R_RISCV_32_PCREL:
+ return R_PC32;
+ case ELF_R_RISCV_HI20:
+ return R_RV_HI20;
+ case ELF_R_RISCV_LO12_I:
+ return R_RV_LO12_I;
+ case ELF_R_RISCV_LO12_S:
+ return R_RV_LO12_S;
+ case ELF_R_RISCV_BRANCH:
+ return R_RV_BRANCH;
+ case ELF_R_RISCV_JAL:
+ return R_RV_JAL;
+ case ELF_R_RISCV_CALL:
+ return R_RV_CALL;
+ case ELF_R_RISCV_CALL_PLT:
+ return R_PLT32;
+ case ELF_R_RISCV_PCREL_HI20:
+ return R_RV_PCREL_HI20;
+ case ELF_R_RISCV_PCREL_LO12_I:
+ return R_RV_PCREL_LO12_I;
+ case ELF_R_RISCV_PCREL_LO12_S:
+ return R_RV_PCREL_LO12_S;
+ case ELF_R_RISCV_GOT_HI20:
+ return R_RV_GOT_HI20;
+ case ELF_R_RISCV_TLS_GOT_HI20:
+ return R_RV_TLS_GOT_HI20;
+ case ELF_R_RISCV_TPREL_HI20:
+ return R_RV_TPREL_HI20;
+ case ELF_R_RISCV_TPREL_LO12_I:
+ return R_RV_TPREL_LO12_I;
+ case ELF_R_RISCV_TPREL_LO12_S:
+ return R_RV_TPREL_LO12_S;
+ case ELF_R_RISCV_TPREL_ADD:
+ return R_RV_TPREL_ADD;
+ case ELF_R_RISCV_ADD8:
+ return R_RV_ADD8;
+ case ELF_R_RISCV_ADD16:
+ return R_RV_ADD16;
+ case ELF_R_RISCV_ADD32:
+ return R_RV_ADD32;
+ case ELF_R_RISCV_SUB8:
+ return R_RV_SUB8;
+ case ELF_R_RISCV_SUB16:
+ return R_RV_SUB16;
+ case ELF_R_RISCV_SUB32:
+ return R_RV_SUB32;
+ case ELF_R_RISCV_ALIGN:
+ return R_RV_ALIGN;
+ case ELF_R_RISCV_RVC_BRANCH:
+ return R_RV_RVC_BRANCH;
+ case ELF_R_RISCV_RVC_JUMP:
+ return R_RV_RVC_JUMP;
+ case ELF_R_RISCV_RELAX:
+ return R_RV_RELAX;
+ case ELF_R_RISCV_SUB6:
+ return R_RV_SUB6;
+ case ELF_R_RISCV_SET6:
+ return R_RV_SET6;
+ case ELF_R_RISCV_SET8:
+ return R_RV_SET8;
+ case ELF_R_RISCV_SET16:
+ return R_RV_SET16;
+ case ELF_R_RISCV_SET32:
+ return R_RV_SET32;
+ case ELF_R_RISCV_SET_ULEB128:
+ return R_RV_SET_ULEB128;
+ case ELF_R_RISCV_SUB_ULEB128:
+ return R_RV_SUB_ULEB128;
+ default:
+ return (u32)-1; /* sentinel */
+ }
+}
diff --git a/src/obj/registry.c b/src/obj/registry.c
@@ -104,7 +104,25 @@ static const ObjElfArchOps obj_elf_arch_ops[] = {
.reloc_from = elf_riscv64_reloc_from,
},
#endif
-#if !KIT_ARCH_AA64_ENABLED && !KIT_ARCH_X64_ENABLED && !KIT_ARCH_RV64_ENABLED
+#if KIT_ARCH_RV32_ENABLED
+ {
+ /* RV32 shares EM_RISCV with RV64; obj_elf_machine_class uses
+ * EI_CLASS to disambiguate. Default float ABI is ilp32f (SINGLE:
+ * float in FP regs, double always soft). Static-only — no musl
+ * interp; the dyn r_* fields are unused on the static path. */
+ .arch = KIT_ARCH_RV32,
+ .e_machine = EM_RISCV,
+ .e_flags = EF_RISCV_RVC | EF_RISCV_FLOAT_ABI_SINGLE,
+ .default_musl_interp = NULL,
+ .r_relative = ELF_R_RISCV_RELATIVE,
+ .r_glob_dat = ELF_R_RISCV_32,
+ .r_jump_slot = ELF_R_RISCV_JUMP_SLOT,
+ .reloc_to = elf_riscv32_reloc_to,
+ .reloc_from = elf_riscv32_reloc_from,
+ },
+#endif
+#if !KIT_ARCH_AA64_ENABLED && !KIT_ARCH_X64_ENABLED && \
+ !KIT_ARCH_RV64_ENABLED && !KIT_ARCH_RV32_ENABLED
{.arch = KIT_ARCH_WASM},
#endif
};
@@ -128,6 +146,33 @@ static const ObjElfArchOps* obj_elf_machine(u32 e_machine) {
}
return NULL;
}
+
+/* EI_CLASS-aware variant. EM_RISCV is shared by RV32 (ELFCLASS32) and
+ * RV64 (ELFCLASS64); selecting by e_machine alone would always pick the
+ * first table entry. For RISC-V we narrow the match by ptr-class so the
+ * reader resolves the correct reloc_from table; non-RISC-V archs (one
+ * entry per e_machine) match on e_machine as before. */
+const ObjElfArchOps* obj_elf_machine_class(u32 e_machine, u8 ei_class) {
+ u32 i;
+ for (i = 0; i < (u32)(sizeof obj_elf_arch_ops / sizeof obj_elf_arch_ops[0]);
+ ++i) {
+ const ObjElfArchOps* a = &obj_elf_arch_ops[i];
+ if (!a->e_machine || a->e_machine != e_machine) continue;
+ if (e_machine == EM_RISCV) {
+ int want32 = (ei_class == ELFCLASS32);
+ int is_rv32 = (a->arch == KIT_ARCH_RV32);
+ if (want32 != is_rv32) continue;
+ }
+ return a;
+ }
+ return NULL;
+}
+#else
+const ObjElfArchOps* obj_elf_machine_class(u32 e_machine, u8 ei_class) {
+ (void)e_machine;
+ (void)ei_class;
+ return NULL;
+}
#endif
#if KIT_OBJ_MACHO_ENABLED
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -46,7 +46,7 @@ static _Noreturn void lower_panic(CgIrLower* l, SrcLoc loc, const char* msg) {
}
static u8 local_reg_class(Compiler* c, KitCgTypeId ty) {
- return cg_type_is_float(c, ty) ? RC_FP : RC_INT;
+ return opt_value_reg_class(c, ty);
}
static OptCGFuncDesc lower_func_desc(Arena* a, const struct CGFuncDesc* in) {
@@ -184,11 +184,15 @@ static void lower_locals(CgIrLower* l) {
m->align = in->desc.align;
m->cls = local_reg_class(l->c, in->desc.type);
/* Aggregates and oversized scalars cannot live in a single PReg; they need
- * a memory home regardless of whether their address is taken. */
+ * a memory home regardless of whether their address is taken. "Oversized"
+ * is wider than the machine word (ptr_size): 8 on rv64/x64/aa64, 4 on rv32
+ * — so an 8-byte i64/double on rv32 is homed in memory like an i128 is on a
+ * 64-bit target (the cg layer also flags these CG_LOCAL_MEMORY_REQUIRED). */
m->address_taken = local_needs_home(in) ||
local_address_used_in_cg_ir(l->c, l->src, in->id) ||
cg_type_is_aggregate(l->c, in->desc.type) ||
- cg_type_size(l->c, in->desc.type) > 8u;
+ cg_type_size(l->c, in->desc.type) >
+ (u64)l->c->target.ptr_size;
PReg r = ir_alloc_preg(l->f, in->desc.type, m->cls);
if (m->address_taken) {
diff --git a/src/opt/ir.c b/src/opt/ir.c
@@ -17,9 +17,33 @@
#include <string.h>
+#include <kit/cg.h>
+
#include "core/arena.h"
#include "core/core.h"
+/* Register class for a value of type `ty`. A float lands in an FP register only
+ * when the float ABI has a register that wide (flen); a soft float, or a float
+ * wider than flen (double under ilp32f/ilp32), is INT-class and carried in GPRs
+ * (a GPR pair for a 2-word double) like an integer of the same width, so it is
+ * never bit-cast through an FP register (illegal fmv.d.x on rv32). flen:
+ * SINGLE 4, DOUBLE 8, SOFT 0; DEFAULT maps to the pointer width, preserving
+ * lp64d / x86-64 / rv64 (double 8 <= 8 -> FP). EVERY value-class decision in
+ * the optimizer routes through here so they agree — the verifier cross-checks
+ * the SSA value class, param/local class, and physical-reg class against it. */
+u8 opt_value_reg_class(Compiler* c, KitCgTypeId ty) {
+ KitCompiler* pc = (KitCompiler*)c;
+ u32 flen;
+ if (kit_cg_type_kind(pc, ty) != KIT_CG_TYPE_FLOAT) return RC_INT;
+ switch (c->target.float_abi) {
+ case KIT_FLOAT_ABI_SINGLE: flen = 4u; break;
+ case KIT_FLOAT_ABI_DOUBLE: flen = 8u; break;
+ case KIT_FLOAT_ABI_SOFT: flen = 0u; break;
+ default: flen = c->target.ptr_size; break; /* DEFAULT: historical */
+ }
+ return (flen && kit_cg_type_size(pc, ty) <= (uint64_t)flen) ? RC_FP : RC_INT;
+}
+
/* ---- val table ---- */
static void val_table_grow(Func* f, u32 needed) {
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -43,6 +43,10 @@ typedef NativeAllocClass RegClass;
#define RC_FP NATIVE_REG_FP
#define RC_VEC NATIVE_REG_VEC
+/* The single float-ABI-aware value register-class decision (defined in ir.c).
+ * All optimizer passes route through it so their classifications agree. */
+u8 opt_value_reg_class(Compiler* c, KitCgTypeId ty);
+
#define CG_REG_ALLOCABLE NATIVE_REG_ALLOCABLE
#define CG_REG_CALLER_SAVED NATIVE_REG_CALLER_SAVED
#define CG_REG_CALLEE_SAVED NATIVE_REG_CALLEE_SAVED
diff --git a/src/opt/pass_addr_fold.c b/src/opt/pass_addr_fold.c
@@ -477,10 +477,7 @@ void opt_promote_scalar_locals(Func* f) {
}
}
if (rejected || !touched_count) continue;
- u8 cls =
- (kit_cg_type_kind((KitCompiler*)f->c, slot->type) == KIT_CG_TYPE_FLOAT)
- ? RC_FP
- : RC_INT;
+ u8 cls = opt_value_reg_class(f->c, slot->type);
PReg preg = ir_alloc_preg(f, slot->type, cls);
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
diff --git a/src/opt/pass_analysis.c b/src/opt/pass_analysis.c
@@ -39,8 +39,7 @@ static int verify_stage_is_ssa(const char* stage) {
}
static u8 verify_type_reg_class(Func* f, KitCgTypeId ty) {
- KitCgTypeKind kind = kit_cg_type_kind((KitCompiler*)f->c, ty);
- return kind == KIT_CG_TYPE_FLOAT ? RC_FP : RC_INT;
+ return opt_value_reg_class(f->c, ty);
}
static void verify_frame_slot(Func* f, const char* stage, FrameSlot slot,
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -510,8 +510,12 @@ static Reg loc_avoid_reg(NativeLoc l) {
}
static int type_is_aggregate_or_large(NativeEmitCtx* e, KitCgTypeId type) {
+ /* "Large" = wider than one machine word (ptr_size): such a value cannot move
+ * through a single register, so IR_COPY/IR_LOAD/IR_STORE of it must go through
+ * copy_bytes. 8 on rv64/x64/aa64, 4 on rv32 (so an 8-byte i64/double is large
+ * there and is copied as two words rather than truncated into one register). */
return type && (cg_type_is_aggregate(e->c, type) ||
- type_size_or(e->c, type, 8u) > 8u);
+ type_size_or(e->c, type, 8u) > e->c->target.ptr_size);
}
/* Copy an aggregate / oversized value between two memory locations. dst and
@@ -689,7 +693,8 @@ static void emit_call(NativeEmitCtx* e, Inst* in) {
NativeLoc tmp = loc_frame(rty, class_for_type(e, rty), result_slot);
result_mem = mem_for_type(e->c, rty);
if (final_result.kind != NATIVE_LOC_REG &&
- (cg_type_is_aggregate(e->c, rty) || type_size_or(e->c, rty, 8u) > 8u)) {
+ (cg_type_is_aggregate(e->c, rty) ||
+ type_size_or(e->c, rty, 8u) > e->c->target.ptr_size)) {
/* Aggregate / oversized result: move bytes rather than a scalar copy
* (which would exceed the single-register width). The result was either
* written in parts by plan_call's rets, or by the callee via the sret
diff --git a/src/opt/pass_ssa.c b/src/opt/pass_ssa.c
@@ -38,8 +38,7 @@ typedef struct RegRenameCtx {
} RegRenameCtx;
static u8 ssa_type_class(Func* f, KitCgTypeId ty) {
- KitCgTypeKind kind = kit_cg_type_kind((KitCompiler*)f->c, ty);
- return kind == KIT_CG_TYPE_FLOAT ? RC_FP : RC_INT;
+ return opt_value_reg_class(f->c, ty);
}
static u32 opnd_slot_id(const Operand* op) {
diff --git a/test/arch/rv32_decode_test.c b/test/arch/rv32_decode_test.c
@@ -0,0 +1,192 @@
+/* RV32 structured decode test.
+ *
+ * Mirror of rv64_decode_test.c for the XLEN-parameterized RISC-V decoder: the
+ * same ArchDecodeOps path, but built for KIT_ARCH_RV32 so the decoder selects
+ * the RV_AV_RV32 instruction table (5-bit shamt, lw/sw rather than ld/sd, and
+ * the rv32 meaning of the ambiguous compressed quadrants). bytes decode into
+ * KitDecodedInsn records and the formatter renders those same records, so
+ * decode<->format agreement is the structural oracle. */
+
+#include <kit/compile.h>
+#include <kit/core.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "arch/arch.h"
+#include "arch/riscv/isa.h"
+#include "lib/kit_unit.h"
+
+static KitUnit g_u;
+#define EXPECT(cond, ...) CU_EXPECT(&g_u, cond, __VA_ARGS__)
+
+static KitCompiler* new_compiler(void) {
+ KitTargetSpec t = kit_unit_target(KIT_ARCH_RV32, KIT_OS_LINUX, KIT_OBJ_ELF);
+ KitCompiler* c = NULL;
+ /* kit_unit_target defaults to a 64-bit pointer; rv32 is a 4-byte target. The
+ * decode path keys off the arch, not ptr_size, but keep the spec honest. */
+ t.ptr_size = 4;
+ t.ptr_align = 4;
+ if (kit_unit_compiler_new(&g_u, t, &c) != KIT_OK || !c) {
+ fprintf(stderr, "compiler_new failed\n");
+ exit(2);
+ }
+ return c;
+}
+
+static void put32(unsigned char* b, size_t off, unsigned v) {
+ b[off + 0] = (unsigned char)v;
+ b[off + 1] = (unsigned char)(v >> 8);
+ b[off + 2] = (unsigned char)(v >> 16);
+ b[off + 3] = (unsigned char)(v >> 24);
+}
+
+static void decode_addi(KitCompiler* pub) {
+ Compiler* c = (Compiler*)pub;
+ unsigned char bytes[4];
+ KitDecodedInsn insn;
+ KitStatus st;
+
+ put32(bytes, 0, rv_addi(RV_A0, RV_ZERO, 42));
+ memset(&insn, 0, sizeof(insn));
+ st = arch_decode_one(c, bytes, sizeof(bytes), 0x1000, &insn);
+ EXPECT(st == KIT_OK, "decode_one(addi) status %d", (int)st);
+ EXPECT(insn.pc == 0x1000, "pc = 0x%llx", (unsigned long long)insn.pc);
+ EXPECT(insn.nbytes == 4, "nbytes = %u", (unsigned)insn.nbytes);
+ EXPECT(insn.opcode == RV64_DEC_ADDI, "opcode = %u, want ADDI",
+ (unsigned)insn.opcode);
+ EXPECT((insn.flags & KIT_DECODE_TERMINATOR) == 0,
+ "addi should not be a terminator");
+ EXPECT(insn.noperands == 3, "addi operand count = %u",
+ (unsigned)insn.noperands);
+ EXPECT(
+ insn.operands[0].kind == KIT_DECOP_REG && insn.operands[0].reg == RV_A0,
+ "addi rd operand wrong");
+ EXPECT(
+ insn.operands[1].kind == KIT_DECOP_REG && insn.operands[1].reg == RV_ZERO,
+ "addi rs1 operand wrong");
+ EXPECT(insn.operands[2].kind == KIT_DECOP_IMM && insn.operands[2].imm == 42,
+ "addi imm operand wrong");
+}
+
+static void decode_block_stops_at_ecall(KitCompiler* pub) {
+ Compiler* c = (Compiler*)pub;
+ unsigned char bytes[16];
+ KitDecodedInsn insts[4];
+ u32 n = 0;
+ KitStatus st;
+
+ put32(bytes, 0, rv_addi(RV_A0, RV_ZERO, 42));
+ put32(bytes, 4, rv_addi(RV_A7, RV_ZERO, 93));
+ put32(bytes, 8, rv_ecall());
+ put32(bytes, 12, rv_addi(RV_A0, RV_ZERO, 7));
+
+ memset(insts, 0, sizeof(insts));
+ st = arch_decode_block(c, bytes, sizeof(bytes), 0x2000, insts, 4, &n);
+ EXPECT(st == KIT_OK, "decode_block status %d", (int)st);
+ EXPECT(n == 3, "decode_block count = %u", (unsigned)n);
+ EXPECT(insts[2].nbytes == 4, "ecall nbytes = %u", (unsigned)insts[2].nbytes);
+ EXPECT(insts[2].opcode == RV64_DEC_ECALL, "ecall opcode = %u",
+ (unsigned)insts[2].opcode);
+ EXPECT((insts[2].flags & KIT_DECODE_TERMINATOR) != 0,
+ "ecall should terminate block");
+ EXPECT((insts[2].flags & KIT_DECODE_TRAP) != 0,
+ "ecall should be marked trap");
+}
+
+static void format_decoded_record(KitCompiler* pub) {
+ Compiler* c = (Compiler*)pub;
+ unsigned char bytes[4];
+ KitDecodedInsn insn;
+ ArchInsnFormatter* fmt;
+ KitInsn text;
+ KitStatus st;
+
+ put32(bytes, 0, rv_addi(RV_A0, RV_ZERO, 42));
+ st = arch_decode_one(c, bytes, sizeof(bytes), 0x3000, &insn);
+ EXPECT(st == KIT_OK, "decode_one for format status %d", (int)st);
+ fmt = arch_insn_formatter_new(c);
+ EXPECT(fmt != NULL, "formatter_new returned NULL");
+ if (!fmt) return;
+ memset(&text, 0, sizeof(text));
+ st = arch_format_insn(fmt, &insn, &text);
+ EXPECT(st == KIT_OK, "format status %d", (int)st);
+ EXPECT(kit_slice_eq_cstr(text.mnemonic, "li"), "mnemonic = %.*s",
+ KIT_SLICE_ARG(text.mnemonic));
+ EXPECT(text.operands.s && strstr(text.operands.s, "a0"),
+ "operands missing a0: %.*s", KIT_SLICE_ARG(text.operands));
+ EXPECT(text.operands.s && strstr(text.operands.s, "42"),
+ "operands missing 42: %.*s", KIT_SLICE_ARG(text.operands));
+ arch_insn_formatter_free(fmt);
+}
+
+/* rv32 loads are lw (XLEN-wide), not ld. The same opcode bytes are RV64-only as
+ * an `ld` would be a different funct3; here we encode an `lw` and confirm the
+ * rv32 formatter renders it as "lw" with the byte offset preserved. */
+static void format_lw_is_lw(KitCompiler* pub) {
+ Compiler* c = (Compiler*)pub;
+ unsigned char bytes[4];
+ KitDecodedInsn insn;
+ ArchInsnFormatter* fmt;
+ KitInsn text;
+ KitStatus st;
+
+ put32(bytes, 0, rv_lw(RV_A0, RV_SP, 8));
+ memset(&insn, 0, sizeof(insn));
+ st = arch_decode_one(c, bytes, sizeof(bytes), 0x4000, &insn);
+ EXPECT(st == KIT_OK, "decode_one(lw) status %d", (int)st);
+ EXPECT(insn.nbytes == 4, "lw nbytes = %u", (unsigned)insn.nbytes);
+ fmt = arch_insn_formatter_new(c);
+ if (!fmt) return;
+ memset(&text, 0, sizeof(text));
+ st = arch_format_insn(fmt, &insn, &text);
+ EXPECT(st == KIT_OK, "format(lw) status %d", (int)st);
+ EXPECT(kit_slice_eq_cstr(text.mnemonic, "lw"), "lw mnemonic = %.*s",
+ KIT_SLICE_ARG(text.mnemonic));
+ EXPECT(text.operands.s && strstr(text.operands.s, "a0"),
+ "lw operands missing a0: %.*s", KIT_SLICE_ARG(text.operands));
+ arch_insn_formatter_free(fmt);
+}
+
+/* slli takes a 5-bit shift amount on rv32 (vs 6 on rv64). Encode the maximum
+ * legal rv32 shamt (31) and confirm it decodes and formats with that amount;
+ * the rv32 decoder must accept it (bit 25 clear) and round-trip the value. */
+static void format_slli_5bit(KitCompiler* pub) {
+ Compiler* c = (Compiler*)pub;
+ unsigned char bytes[4];
+ KitDecodedInsn insn;
+ ArchInsnFormatter* fmt;
+ KitInsn text;
+ KitStatus st;
+
+ put32(bytes, 0, rv_slli(RV_A0, RV_A1, 31));
+ memset(&insn, 0, sizeof(insn));
+ st = arch_decode_one(c, bytes, sizeof(bytes), 0x5000, &insn);
+ EXPECT(st == KIT_OK, "decode_one(slli 31) status %d", (int)st);
+ EXPECT(insn.nbytes == 4, "slli nbytes = %u", (unsigned)insn.nbytes);
+ fmt = arch_insn_formatter_new(c);
+ if (!fmt) return;
+ memset(&text, 0, sizeof(text));
+ st = arch_format_insn(fmt, &insn, &text);
+ EXPECT(st == KIT_OK, "format(slli) status %d", (int)st);
+ EXPECT(kit_slice_eq_cstr(text.mnemonic, "slli"), "slli mnemonic = %.*s",
+ KIT_SLICE_ARG(text.mnemonic));
+ EXPECT(text.operands.s && strstr(text.operands.s, "31"),
+ "slli operands missing shamt 31: %.*s", KIT_SLICE_ARG(text.operands));
+ arch_insn_formatter_free(fmt);
+}
+
+int main(void) {
+ KitCompiler* c;
+ kit_unit_init(&g_u);
+ c = new_compiler();
+ decode_addi(c);
+ decode_block_stops_at_ecall(c);
+ format_decoded_record(c);
+ format_lw_is_lw(c);
+ format_slli_5bit(c);
+ kit_compiler_free(c);
+ kit_unit_summary(&g_u, "rv32_decode_test");
+ return kit_unit_status(&g_u);
+}
diff --git a/test/arch/rv64_decode_test.c b/test/arch/rv64_decode_test.c
@@ -12,7 +12,7 @@
#include <string.h>
#include "arch/arch.h"
-#include "arch/rv64/isa.h"
+#include "arch/riscv/isa.h"
#include "lib/kit_unit.h"
/* Shared test context replaces the per-file heap/diag/counter globals;
diff --git a/test/elf/CORPUS.md b/test/elf/CORPUS.md
@@ -172,7 +172,7 @@ exits cleanly (no segfault).
| `truncated_ehdr.elf` | < 64 bytes |
| `bad_magic.elf` | first 4 bytes wrong |
| `e_machine_x86.elf` | machine mismatch (when arch-validated) |
-| `wrong_class.elf` | ELFCLASS32 in a 64-bit pipeline |
+| `wrong_class.elf` | 64-bit machine tagged ELFCLASS32 (class/arch mismatch) |
| `wrong_endian.elf` | ELFDATA2MSB in an LSB pipeline |
| `sh_offset_oob.elf` | `sh_offset + sh_size > file_size` |
| `sh_link_oob.elf` | `sh_link >= e_shnum` |
diff --git a/test/elf/bad/gen.py b/test/elf/bad/gen.py
@@ -94,10 +94,15 @@ def m_bad_magic(buf):
return b
-@case("wrong_class", "not ELFCLASS64")
+# The base object is 64-bit (e.g. aarch64). ELFCLASS32 is a valid class on its
+# own now (riscv32 uses it), so the reader no longer blanket-rejects it — but
+# kit_detect_target requires EI_CLASS to match the arch's pointer width, so a
+# 64-bit machine tagged ELFCLASS32 is a mismatch caught before read_elf, with
+# the same "not a recognized object file" text bad_magic/e_machine_unknown use.
+@case("wrong_class", "not a recognized object file")
def m_wrong_class(buf):
b = bytearray(buf)
- b[4] = 1 # ELFCLASS32
+ b[4] = 1 # ELFCLASS32 on a 64-bit-machine object -> class/arch mismatch
return b
diff --git a/test/elf/bad/wrong_class.expect b/test/elf/bad/wrong_class.expect
@@ -1 +1 @@
-not ELFCLASS64
-\ No newline at end of file
+not a recognized object file
+\ No newline at end of file
diff --git a/test/elf/unit/rv32_class32.c b/test/elf/unit/rv32_class32.c
@@ -0,0 +1,291 @@
+/* ELFCLASS32 write->read round-trip — the first ELFCLASS32 consumer test
+ * for the kit rv32 port.
+ *
+ * Builds a tiny riscv32-none-elf relocatable in memory: one .text section
+ * with a few bytes, a GLOBAL FUNC symbol, and one R_ABS32 relocation (the
+ * RV32 primary absolute reloc, which the ELF emitter lowers to
+ * ELF_R_RISCV_32). Then:
+ *
+ * kit_obj_builder_emit(ob, mem_writer)
+ * kit_obj_open("rv32_class32", bytes, len)
+ *
+ * and asserts the readback is a 32-bit (ELFCLASS32) RISC-V object whose
+ * symbol and relocation survived intact.
+ *
+ * Unlike the AArch64/x64/rv64 unit tests this does NOT use
+ * kit_test_target_init: that helper hardcodes ptr_size=8 and has no rv32
+ * branch. Instead we build a fixed rv32 KitTargetSpec directly.
+ *
+ * Exit 0 = pass; non-zero = fail (with one-line stderr explanations). */
+
+#include <kit/core.h>
+#include <kit/object.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* ---- env ---- */
+
+static void* heap_alloc(KitHeap* h, size_t n, size_t a) {
+ (void)h;
+ (void)a;
+ return n ? malloc(n) : NULL;
+}
+static void* heap_realloc(KitHeap* h, void* p, size_t o, size_t n, size_t a) {
+ (void)h;
+ (void)o;
+ (void)a;
+ return realloc(p, n);
+}
+static void heap_free(KitHeap* h, void* p, size_t n) {
+ (void)h;
+ (void)n;
+ free(p);
+}
+static KitHeap g_heap = {heap_alloc, heap_realloc, heap_free, NULL};
+
+static void diag_emit(KitDiagSink* s, KitDiagKind k, KitSrcLoc loc,
+ const char* fmt, va_list ap) {
+ static const char* names[] = {"note", "warning", "error", "fatal"};
+ (void)s;
+ (void)loc;
+ fprintf(stderr, "%s: ", names[k]);
+ vfprintf(stderr, fmt, ap);
+ fputc('\n', stderr);
+}
+static KitDiagSink g_diag = {diag_emit, NULL, 0, 0};
+
+/* ---- assertion helpers ---- */
+
+static int g_failures;
+#define CHECK(cond, ...) \
+ do { \
+ if (!(cond)) { \
+ fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \
+ fprintf(stderr, __VA_ARGS__); \
+ fputc('\n', stderr); \
+ g_failures++; \
+ } \
+ } while (0)
+
+/* ---- ELF on-the-wire constants (raw-byte checks) ---- */
+
+#define EI_CLASS 4
+#define ELFCLASS32 1
+#define EM_RISCV 243
+
+/* ---- input fixture ---- */
+
+/* RV32I: addi a0, zero, 0 ; jalr zero, 0(ra) — little-endian.
+ * Exact instruction encoding is irrelevant to this object-shape test; the
+ * bytes only need to round-trip verbatim. */
+static const uint8_t TEXT_BYTES[8] = {
+ 0x13, 0x05, 0x00, 0x00, 0x67, 0x80, 0x00, 0x00,
+};
+
+#define RELOC_OFFSET 4u
+
+int main(void) {
+ /* Fixed rv32 target: ELFCLASS32, RISC-V, little-endian, 4-byte pointers. */
+ KitTargetSpec target;
+ memset(&target, 0, sizeof target);
+ target.arch = KIT_ARCH_RV32;
+ target.os = KIT_OS_LINUX;
+ target.obj = KIT_OBJ_ELF;
+ target.ptr_size = 4;
+ target.ptr_align = 4;
+ target.big_endian = 0;
+
+ KitContext ctx;
+ memset(&ctx, 0, sizeof ctx);
+ ctx.heap = &g_heap;
+ ctx.diag = &g_diag;
+ ctx.now = -1;
+
+ KitTargetOptions target_opts;
+ memset(&target_opts, 0, sizeof target_opts);
+ target_opts.spec = target;
+ KitTarget* kt = NULL;
+ if (kit_target_new(&ctx, &target_opts, &kt) != KIT_OK || !kt) {
+ fprintf(stderr, "FAIL: kit_target_new (rv32)\n");
+ return 1;
+ }
+ KitCompiler* cc = NULL;
+ if (kit_compiler_new(kt, &ctx, &cc) != KIT_OK || !cc) {
+ fprintf(stderr, "FAIL: kit_compiler_new\n");
+ kit_target_free(kt);
+ return 1;
+ }
+
+ /* ---- build ---- */
+ KitObjBuilder* in = NULL;
+ CHECK(kit_obj_builder_new(cc, &in) == KIT_OK && in, "kit_obj_builder_new");
+
+ KitObjSection sec_text = KIT_SECTION_NONE;
+ KitObjSectionDesc text_desc = {
+ .name = kit_sym_intern(cc, KIT_SLICE_LIT(".text")),
+ .kind = KIT_SEC_TEXT,
+ .flags = KIT_SF_ALLOC | KIT_SF_EXEC,
+ .align = 4,
+ .entsize = 0,
+ };
+ CHECK(kit_obj_builder_section(in, &text_desc, &sec_text) == KIT_OK,
+ "section .text");
+ CHECK(kit_obj_builder_write(in, sec_text, TEXT_BYTES, sizeof TEXT_BYTES) ==
+ KIT_OK,
+ "write .text");
+
+ KitObjSymbol sym_func = KIT_OBJ_SYMBOL_NONE;
+ KitObjSymbolDesc func_desc = {
+ .name = kit_sym_intern(cc, KIT_SLICE_LIT("rv32_sym")),
+ .bind = KIT_SB_GLOBAL,
+ .kind = KIT_SK_FUNC,
+ .section = sec_text,
+ .value = 0,
+ .size = sizeof TEXT_BYTES,
+ };
+ CHECK(kit_obj_builder_symbol(in, &func_desc, &sym_func) == KIT_OK,
+ "symbol rv32_sym");
+
+ /* One 32-bit absolute reloc. R_ABS32 -> ELF_R_RISCV_32 on RV32, so an
+ * Elf32_Rela survives the round-trip; aimed at rv32_sym itself. */
+ KitObjRelocDesc reloc_desc = {
+ .section = sec_text,
+ .offset = RELOC_OFFSET,
+ .kind = {.arch = target.arch,
+ .obj_fmt = target.obj,
+ .code = KIT_RELOC_ABS32},
+ .symbol = sym_func,
+ .addend = 0,
+ };
+ CHECK(kit_obj_builder_reloc(in, &reloc_desc) == KIT_OK, "reloc .text");
+
+ CHECK(kit_obj_builder_finalize(in) == KIT_OK, "finalize");
+
+ /* ---- emit to memory ---- */
+ KitWriter* w = NULL;
+ (void)kit_writer_mem(&g_heap, &w);
+ CHECK(kit_obj_builder_emit(in, w) == KIT_OK, "emit");
+ size_t out_len = 0;
+ const uint8_t* out_data = kit_writer_mem_bytes(w, &out_len);
+
+ /* Sanity: ELF magic. */
+ CHECK(out_len >= 52, "ELF emit produced too few bytes (%zu)", out_len);
+ CHECK(out_len >= 4 && out_data[0] == 0x7f && out_data[1] == 'E' &&
+ out_data[2] == 'L' && out_data[3] == 'F',
+ "ELF emit output missing ELF magic");
+
+ /* (a) Raw-byte class/machine checks on the emitted image. For ELFCLASS32
+ * e_machine is a little-endian uint16 at offset 18; for ELFCLASS64 it is
+ * also at 18, so this offset is valid regardless. e_ident[EI_CLASS] must
+ * be ELFCLASS32 (==1) for the rv32 target. */
+ if (out_len > EI_CLASS) {
+ CHECK(out_data[EI_CLASS] == ELFCLASS32,
+ "e_ident[EI_CLASS]=%u after emit, want ELFCLASS32=%u",
+ out_data[EI_CLASS], ELFCLASS32);
+ }
+ if (out_len >= 20) {
+ unsigned machine =
+ (unsigned)out_data[18] | ((unsigned)out_data[19] << 8);
+ CHECK(machine == EM_RISCV, "e_machine=%u after emit, want EM_RISCV=%u",
+ machine, EM_RISCV);
+ }
+
+ /* Round-trip: copy to a private buffer first — the mem writer's storage is
+ * freed on close. */
+ uint8_t* roundtrip = (uint8_t*)malloc(out_len ? out_len : 1);
+ memcpy(roundtrip, out_data, out_len);
+ kit_writer_close(w);
+
+ /* ---- reopen ---- */
+ KitSlice input = {.data = roundtrip, .len = out_len};
+ KitObjFile* back = NULL;
+ CHECK(kit_obj_open(&ctx, KIT_SLICE_LIT("rv32_class32"), &input, &back) ==
+ KIT_OK &&
+ back,
+ "kit_obj_open failed");
+
+ /* (b) The reader resolves the image to an rv32 target spec. */
+ if (back) {
+ KitTargetSpec got = kit_obj_target(back);
+ CHECK(got.arch == KIT_ARCH_RV32, "reopened arch=%d, want KIT_ARCH_RV32=%d",
+ (int)got.arch, (int)KIT_ARCH_RV32);
+ CHECK(got.ptr_size == 4, "reopened ptr_size=%u, want 4",
+ (unsigned)got.ptr_size);
+ CHECK(got.obj == KIT_OBJ_ELF, "reopened obj fmt is not ELF");
+ }
+
+ /* (c) The symbol survives: GLOBAL FUNC in .text at value 0. */
+ if (back) {
+ KitObjSection text = KIT_SECTION_NONE;
+ CHECK(kit_obj_section_by_name(back, KIT_SLICE_LIT(".text"), &text) ==
+ KIT_OK,
+ ".text not present after roundtrip");
+
+ KitObjSymInfo si;
+ if (kit_obj_symbol_by_name(back, KIT_SLICE_LIT("rv32_sym"), &si) ==
+ KIT_OK) {
+ CHECK(si.bind == KIT_SB_GLOBAL, "rv32_sym bind=%u", si.bind);
+ CHECK(si.kind == KIT_SK_FUNC, "rv32_sym kind=%u (want SK_FUNC)",
+ si.kind);
+ CHECK(si.value == 0, "rv32_sym value=%llu, want 0",
+ (unsigned long long)si.value);
+ if (text != KIT_SECTION_NONE) {
+ CHECK(si.section == text, "rv32_sym not bound to .text");
+ }
+ } else {
+ CHECK(0, "missing 'rv32_sym' symbol after roundtrip");
+ }
+ }
+
+ /* (d) The relocation survives: one R_ABS32 in .text at RELOC_OFFSET against
+ * rv32_sym, addend 0. */
+ if (back) {
+ KitObjRelocIter* it = NULL;
+ KitObjReloc found;
+ int ntext = 0;
+ int have = 0;
+ KitObjSection text = KIT_SECTION_NONE;
+ (void)kit_obj_section_by_name(back, KIT_SLICE_LIT(".text"), &text);
+
+ CHECK(kit_obj_reliter_new(back, &it) == KIT_OK && it, "reliter_new");
+ if (it) {
+ KitObjReloc r;
+ while (kit_obj_reliter_next(it, &r) == KIT_ITER_ITEM) {
+ if (r.section == text) {
+ ++ntext;
+ found = r;
+ have = 1;
+ }
+ }
+ kit_obj_reliter_free(it);
+ }
+ CHECK(ntext == 1, ".text reloc count = %d, want 1", ntext);
+ CHECK(have, "no reloc on .text after roundtrip");
+ if (have) {
+ CHECK(found.kind.code == KIT_RELOC_ABS32,
+ ".text reloc code=%u (want ABS32=%u)", found.kind.code,
+ KIT_RELOC_ABS32);
+ CHECK(found.offset == RELOC_OFFSET, ".text reloc offset=%llu, want %u",
+ (unsigned long long)found.offset, RELOC_OFFSET);
+ CHECK(found.addend == 0, ".text reloc addend=%lld",
+ (long long)found.addend);
+ CHECK(kit_slice_eq_cstr(found.sym_name, "rv32_sym"),
+ "reloc target name = %.*s", KIT_SLICE_ARG(found.sym_name));
+ }
+ }
+
+ if (back) kit_obj_free(back);
+ free(roundtrip);
+ kit_obj_builder_free(in);
+ kit_compiler_free(cc);
+ kit_target_free(kt);
+
+ if (g_failures) {
+ fprintf(stderr, "%d failure(s)\n", g_failures);
+ return g_failures;
+ }
+ fputs("rv32_class32: OK\n", stderr);
+ return 0;
+}
diff --git a/test/emu/rv64_smoke_test.c b/test/emu/rv64_smoke_test.c
@@ -46,7 +46,7 @@
* (static inline) and ELF64 layout constants. The accompanying white-box unit
* tests for the decoder / address space / syscall units live in
* rv64_vm_unit_test.c, which links the library objects directly. */
-#include "arch/rv64/isa.h"
+#include "arch/riscv/isa.h"
#include "core/core.h"
#include "lib/kit_unit.h"
#include "obj/elf/elf.h"
diff --git a/test/emu/rv64_vm_unit_test.c b/test/emu/rv64_vm_unit_test.c
@@ -17,7 +17,7 @@
#include <unistd.h>
#include "arch/arch.h"
-#include "arch/rv64/isa.h"
+#include "arch/riscv/isa.h"
#include "core/core.h"
#include "emu/emu.h"
#include "lib/kit_unit.h"
diff --git a/test/lib/check_rv32_env.sh b/test/lib/check_rv32_env.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# test/lib/check_rv32_env.sh — kit rv32 "doctor".
+#
+# Prerequisite check for the rv32 behavioral-oracle lane (test/smoke/rv32.sh).
+# Unlike rv64 (qemu-user / podman, a riscv64-linux ELF), the rv32 target is
+# freestanding `-none-elf`: it runs as a bare-metal image under
+# qemu-system-riscv32 -machine virt, with a startup stub that enables the FPU
+# (mstatus.FS) for the single-hard-float (ilp32f) variant and a SiFive test
+# finisher at 0x100000 to turn a program result into a qemu exit code.
+#
+# Each checked tool is reported as a one-liner (OK / MISSING) with what was
+# looked for and how to install it. Source it and call check_rv32_env to
+# populate the RV32_* globals; run it directly for a standalone report.
+#
+# After check_rv32_env returns, these globals are set:
+# RV32_HAVE_CLANG_TARGET 0/1 — clang accepts --target=riscv32-unknown-elf
+# RV32_HAVE_LLD 0/1 — ld.lld on PATH (bare-metal link fallback)
+# RV32_HAVE_QEMU_SYSTEM 0/1 — qemu-system-riscv32 on PATH
+# RV32_QEMU_SYSTEM_BIN path or empty
+# RV32_READY 0/1 — clang target + qemu-system both usable
+
+_rv32_os_tag() {
+ case "$(uname -s 2>/dev/null)" in
+ Darwin) echo darwin ;;
+ Linux)
+ if [ -r /etc/os-release ]; then
+ . /etc/os-release
+ case "${ID:-}:${ID_LIKE:-}" in
+ *alpine*) echo alpine ;;
+ *debian*|*ubuntu*) echo debian ;;
+ *fedora*|*rhel*) echo fedora ;;
+ *) echo linux ;;
+ esac
+ else echo linux; fi ;;
+ *) echo other ;;
+ esac
+}
+
+_rv32_hint_clang() {
+ case "$(_rv32_os_tag)" in
+ darwin) echo "brew install llvm (clang 16+ ships the riscv32 target)" ;;
+ debian) echo "apt install clang lld" ;;
+ fedora) echo "dnf install clang lld" ;;
+ alpine) echo "apk add clang lld" ;;
+ *) echo "install an LLVM/clang with the riscv32 target" ;;
+ esac
+}
+
+_rv32_hint_qemu_system() {
+ case "$(_rv32_os_tag)" in
+ darwin) echo "brew install qemu" ;;
+ debian) echo "apt install qemu-system-misc" ;;
+ fedora) echo "dnf install qemu-system-riscv" ;;
+ alpine) echo "apk add qemu-system-riscv32" ;;
+ *) echo "install qemu (qemu-system-riscv32)" ;;
+ esac
+}
+
+check_rv32_env() {
+ RV32_HAVE_CLANG_TARGET=0
+ RV32_HAVE_LLD=0
+ RV32_HAVE_QEMU_SYSTEM=0
+ RV32_QEMU_SYSTEM_BIN=""
+
+ # clang with the riscv32 target: probe by compiling an empty TU.
+ if command -v clang >/dev/null 2>&1; then
+ if echo 'int _e(void){return 0;}' | \
+ clang --target=riscv32-unknown-elf -march=rv32imac -mabi=ilp32 \
+ -ffreestanding -nostdlib -c -x c - -o /dev/null >/dev/null 2>&1; then
+ RV32_HAVE_CLANG_TARGET=1
+ echo " OK clang --target=riscv32-unknown-elf"
+ else
+ echo " MISSING clang riscv32 target — install: $(_rv32_hint_clang)"
+ fi
+ else
+ echo " MISSING clang — install: $(_rv32_hint_clang)"
+ fi
+
+ if command -v ld.lld >/dev/null 2>&1; then
+ RV32_HAVE_LLD=1
+ echo " OK ld.lld"
+ else
+ echo " MISSING ld.lld — install: $(_rv32_hint_clang)"
+ fi
+
+ if command -v qemu-system-riscv32 >/dev/null 2>&1; then
+ RV32_HAVE_QEMU_SYSTEM=1
+ RV32_QEMU_SYSTEM_BIN="$(command -v qemu-system-riscv32)"
+ echo " OK qemu-system-riscv32 ($RV32_QEMU_SYSTEM_BIN)"
+ else
+ echo " MISSING qemu-system-riscv32 — install: $(_rv32_hint_qemu_system)"
+ fi
+
+ RV32_READY=0
+ if [ "$RV32_HAVE_CLANG_TARGET" -eq 1 ] && [ "$RV32_HAVE_QEMU_SYSTEM" -eq 1 ]; then
+ RV32_READY=1
+ echo " READY rv32 behavioral oracle available"
+ else
+ echo " BLOCKED rv32 behavioral oracle needs clang riscv32 target + qemu-system-riscv32"
+ fi
+}
+
+# Run standalone: report and exit non-zero if blocked.
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+ check_rv32_env
+ [ "${RV32_READY:-0}" -eq 1 ]
+fi
diff --git a/test/lib/exec_rv32_bare.sh b/test/lib/exec_rv32_bare.sh
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# test/lib/exec_rv32_bare.sh — shared bare-metal execution helper for the rv32
+# cross-test lane (path "V") of the corpus harnesses (test/toy/run.sh and
+# test/parse/run.sh).
+#
+# rv32 is a freestanding `-none-elf` target with no qemu-user / podman path
+# (unlike the aa64/x64/rv64 Linux cross lanes that go through exec_target.sh).
+# A corpus object — whose `main` returns an exit code — is instead linked with a
+# bare-metal startup that sets the stack, enables the FPU (ilp32f), calls main,
+# and reports its return through a SiFive test finisher, then run under
+# qemu-system-riscv32 -machine virt. The qemu exit code equals main's return
+# (0 -> 0x5555 -> qemu exit 0; N -> 0x3333|(N<<16) -> qemu exit N), so the
+# corpus's existing `rc == expected` oracle applies unchanged.
+#
+# The link uses `kit ld` (a freestanding rv32 target defaults to no-PIE and
+# auto-links no runtime, so the corpus + the kit runtime archive are supplied
+# explicitly), exercising the full kit toolchain end to end. The startup stub is
+# clang-assembled because kit's inline assembler does not yet emit the `csrs`
+# CSR pseudo used to enable the FPU.
+#
+# Public API (after sourcing):
+# rv32_bare_setup <workdir> populate RV32_BARE_OK (0/1) and cache
+# the startup/wrapper/linkscript + rt.
+# rv32_bare_run <obj> <work> <rcfile> link <obj> into a bootable image and
+# run it; write the qemu exit code to
+# <rcfile>. Echoes a one-line reason and
+# returns: 0 ran (rc in <rcfile>),
+# 2 link/build failure (caller decides).
+
+# shellcheck disable=SC2034 # RV32_BARE_* are consumed by the sourcing harness.
+
+_rv32_bare_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+# shellcheck source=check_rv32_env.sh
+. "$_rv32_bare_root/test/lib/check_rv32_env.sh"
+
+RV32_BARE_OK=0
+RV32_BARE_KIT="${KIT:-$_rv32_bare_root/build/kit}"
+# Entry symbol the wrapper calls and reports the exit code of. Toy cases define
+# `main`; the C parse corpus defines `test_main`. Set before rv32_bare_setup.
+RV32_BARE_ENTRY="${RV32_BARE_ENTRY:-main}"
+RV32_BARE_QEMU=""
+RV32_BARE_RT=""
+RV32_BARE_START=""
+RV32_BARE_WRAP=""
+RV32_BARE_LDS=""
+# ilp32f: hardware single float + soft double + i64 — the verified rv32 profile.
+RV32_BARE_MARCH="rv32imafc_zicsr_zifencei"
+RV32_BARE_MABI="ilp32f"
+
+rv32_bare_setup() {
+ local work="$1"
+ RV32_BARE_OK=0
+ check_rv32_env >/dev/null 2>&1
+ if [ "${RV32_HAVE_CLANG_TARGET:-0}" -ne 1 ] ||
+ [ "${RV32_HAVE_QEMU_SYSTEM:-0}" -ne 1 ]; then
+ return 0
+ fi
+ RV32_BARE_QEMU="${RV32_QEMU_SYSTEM_BIN:-qemu-system-riscv32}"
+ [ -x "$RV32_BARE_KIT" ] || return 0
+
+ # The freestanding runtime (i64 mul/div/shift + soft double helpers). Build it
+ # on demand; without it, link of any corpus case touching those would fail.
+ RV32_BARE_RT="$_rv32_bare_root/build/rt/riscv32-elf-hardfloat/libkit_rt.a"
+ if [ ! -f "$RV32_BARE_RT" ]; then
+ make -C "$_rv32_bare_root" rt-riscv32-elf-hardfloat >/dev/null 2>&1 || true
+ fi
+ [ -f "$RV32_BARE_RT" ] || return 0
+
+ mkdir -p "$work"
+ RV32_BARE_START="$work/_rv32_start.o"
+ RV32_BARE_WRAP="$work/_rv32_wrap.o"
+ RV32_BARE_LDS="$work/_rv32.lds"
+
+ # Bare-metal reset entry: stack at the top of `virt` RAM, enable the FPU
+ # (mstatus.FS=Initial) for ilp32f, then call the C wrapper.
+ cat > "$work/_rv32_start.S" <<'EOF'
+.section .text.start,"ax",@progbits
+.globl _start
+_start:
+ li sp, 0x80100000
+ li t0, 0x2000
+ csrs mstatus, t0
+ call _rv32_cmain
+1: j 1b
+EOF
+ # The wrapper calls the corpus's main() and maps its return onto the SiFive
+ # test finisher at 0x100000. Compiled by kit (exercises rv32 codegen for the
+ # finisher store + the call); main may return i32 or i64 — the low word is the
+ # exit code.
+ cat > "$work/_rv32_wrap.c" <<EOF
+#define FINISHER ((volatile unsigned int*)0x100000)
+extern int ${RV32_BARE_ENTRY}(void);
+__attribute__((noreturn)) void _rv32_cmain(void) {
+ int code = ${RV32_BARE_ENTRY}();
+ *FINISHER = code ? (0x3333u | ((unsigned)code << 16)) : 0x5555u;
+ for (;;) {}
+}
+EOF
+ cat > "$RV32_BARE_LDS" <<'EOF'
+ENTRY(_start)
+SECTIONS {
+ . = 0x80000000;
+ .text : { *(.text.start) *(.text*) }
+ .rodata : { *(.rodata*) }
+ .data : { *(.data*) }
+ .bss : { *(.bss*) *(COMMON) }
+ /DISCARD/ : { *(.riscv.attributes) *(.comment) }
+}
+EOF
+ if ! clang --target=riscv32-unknown-elf -march=rv32imafc -mabi="$RV32_BARE_MABI" \
+ -ffreestanding -nostdlib -c "$work/_rv32_start.S" -o "$RV32_BARE_START" \
+ >/dev/null 2>&1; then
+ return 0
+ fi
+ if ! "$RV32_BARE_KIT" cc -target riscv32-none-elf -march="$RV32_BARE_MARCH" \
+ -mabi="$RV32_BARE_MABI" -O1 -ffreestanding -c "$work/_rv32_wrap.c" \
+ -o "$RV32_BARE_WRAP" >/dev/null 2>&1; then
+ return 0
+ fi
+ RV32_BARE_OK=1
+}
+
+rv32_bare_run() { # <obj> <work> <rcfile>
+ local obj="$1" work="$2" rcfile="$3"
+ local elf="$work/$(basename "$obj").elf"
+ local lderr="$work/$(basename "$obj").rv32ld.err"
+ if [ "$RV32_BARE_OK" -ne 1 ]; then
+ echo "rv32 bare-metal toolchain unavailable"; return 2
+ fi
+ if ! "$RV32_BARE_KIT" ld -T "$RV32_BARE_LDS" -e _start \
+ "$RV32_BARE_START" "$RV32_BARE_WRAP" "$obj" "$RV32_BARE_RT" \
+ -o "$elf" 2>"$lderr"; then
+ echo "kit ld (rv32) failed: $(head -n1 "$lderr" 2>/dev/null)"; return 2
+ fi
+ local rc=0
+ timeout 20 "$RV32_BARE_QEMU" -machine virt -bios none -kernel "$elf" \
+ -nographic -no-reboot >/dev/null 2>&1 || rc=$?
+ printf '%s' "$rc" > "$rcfile"
+ return 0
+}
diff --git a/test/lib/kit_test_target.h b/test/lib/kit_test_target.h
@@ -70,9 +70,24 @@ static inline int kit_test_target_init(KitTargetSpec* t) {
t->arch = KIT_ARCH_RV64;
return 0;
}
+ if (!strcmp(a, "rv32") || !strcmp(a, "riscv32")) {
+ if (t->os == KIT_OS_MACOS) {
+ fprintf(stderr, "kit_test_target: rv32 has no macOS target\n");
+ return -1;
+ }
+ /* riscv32-none-elf: freestanding, 4-byte pointers, ilp32f (hardware single
+ * float, soft double + i64) — the verified rv32 profile. float_abi must be
+ * set so `double` routes to soft-float (rv32 has no D). */
+ t->arch = KIT_ARCH_RV32;
+ t->os = KIT_OS_FREESTANDING;
+ t->ptr_size = 4;
+ t->ptr_align = 4;
+ t->float_abi = KIT_FLOAT_ABI_SINGLE;
+ return 0;
+ }
fprintf(stderr,
"kit_test_target: unrecognized KIT_TEST_ARCH=\"%s\" "
- "(expected aa64/x64/rv64)\n",
+ "(expected aa64/x64/rv64/rv32)\n",
a);
return -1;
}
diff --git a/test/link/rv32_jit_test.c b/test/link/rv32_jit_test.c
@@ -0,0 +1,345 @@
+/* RV32 JIT smoke test.
+ *
+ * Builds a tiny ELF relocatable object in memory for rv32 containing
+ * one function:
+ *
+ * .text
+ * .globl rv32_jit_answer
+ * rv32_jit_answer:
+ * addi a0, zero, 42 # 0x02a00513
+ * jalr zero, ra, 0 # 0x00008067 (ret)
+ *
+ * Feeds it through kit_link_session in KIT_LINK_OUTPUT_JIT mode,
+ * which exercises the rv32 path of:
+ * - executable-memory reservation + W^X protect cycle
+ * - relocation application (none needed here, but the path runs)
+ * - symbol resolution / lookup by C-mangled name
+ * - icache flush (fence.i / __builtin___clear_cache on riscv hosts)
+ *
+ * If we are running on a rv32 host, the test then *calls* the JITed
+ * function and asserts the return is 42 — that's the native-host
+ * execution leg the parity checklist asked for. On non-rv32 hosts
+ * we still build the image (verifying the in-memory machinery is wired
+ * end-to-end) but SKIP the actual call: the bytes are valid rv32 but
+ * the host CPU can't decode them. The test prints "SKIP <reason>" and
+ * exits 77 (the GNU autotools "skipped" convention) when this happens.
+ *
+ * Wired into mk/test.mk via test-rv32-jit. Always builds; calls only on
+ * rv32 hosts. This mirrors the rv64 JIT smoke test (test/link/rv64_jit_test.c):
+ * have the code path in place for the day someone runs kit on a rv32 dev box.
+ *
+ * The two base-ISA encodings are identical on rv32 and rv64 (they are RV32I
+ * instructions reused unchanged by RV64I), so the encoding constants are the
+ * same as the rv64 sibling test. */
+
+#include <kit/core.h>
+#include <kit/jit.h>
+#include <kit/link.h>
+#include <kit/object.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "lib/kit_unit.h"
+
+/* Native execution requires the host CPU to be rv32 (any OS that gives
+ * us POSIX mmap + mprotect, which on rv32 means Linux today). Anywhere
+ * else we still build the JIT image but skip the call. */
+#if defined(__riscv) && (__riscv_xlen == 32)
+#define RV32_HOST_NATIVE 1
+#else
+#define RV32_HOST_NATIVE 0
+#endif
+
+/* ---- host glue: heap + diag come from the shared KitUnit ---- */
+static KitUnit g_u;
+
+/* ---- execmem with W^X dual-mapping (mirrors test/link/harness) ---- */
+static int xm_to_posix(int p) {
+ int q = 0;
+ if (p & KIT_PROT_READ) q |= PROT_READ;
+ if (p & KIT_PROT_WRITE) q |= PROT_WRITE;
+ if (p & KIT_PROT_EXEC) q |= PROT_EXEC;
+ return q;
+}
+
+#if defined(__linux__)
+#include <sys/syscall.h>
+#define XM_DUAL_LINUX 1
+#else
+#define XM_DUAL_LINUX 0
+#endif
+
+typedef struct XmTok {
+ void* w;
+ void* r;
+ size_t n;
+} XmTok;
+
+static KitStatus xm_reserve_single(size_t n, KitExecMemRegion* out) {
+ void* p =
+ mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (p == MAP_FAILED) return KIT_NOMEM;
+ out->write = out->runtime = p;
+ out->size = n;
+ out->token = NULL;
+ return KIT_OK;
+}
+
+static KitStatus xm_reserve(void* u, size_t n, int p, KitExecMemRegion* out) {
+ (void)u;
+ if (!out || !n) return KIT_INVALID;
+ if (!(p & KIT_PROT_EXEC)) return xm_reserve_single(n, out);
+#if XM_DUAL_LINUX
+ {
+ int fd = (int)syscall(SYS_memfd_create, "kit-rv32-jit-test", 0u);
+ void *w, *r;
+ XmTok* tok;
+ if (fd < 0) return KIT_NOMEM;
+ if (ftruncate(fd, (off_t)n) != 0) {
+ close(fd);
+ return KIT_NOMEM;
+ }
+ w = mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (w == MAP_FAILED) {
+ close(fd);
+ return KIT_NOMEM;
+ }
+ r = mmap(NULL, n, PROT_READ, MAP_SHARED, fd, 0);
+ close(fd);
+ if (r == MAP_FAILED) {
+ munmap(w, n);
+ return KIT_NOMEM;
+ }
+ tok = (XmTok*)malloc(sizeof(*tok));
+ if (!tok) {
+ munmap(r, n);
+ munmap(w, n);
+ return KIT_NOMEM;
+ }
+ tok->w = w;
+ tok->r = r;
+ tok->n = n;
+ out->write = w;
+ out->runtime = r;
+ out->size = n;
+ out->token = tok;
+ return KIT_OK;
+ }
+#else
+ return xm_reserve_single(n, out);
+#endif
+}
+
+static KitStatus xm_protect(void* u, void* a, size_t n, int p) {
+ (void)u;
+ return mprotect(a, n, xm_to_posix(p)) == 0 ? KIT_OK : KIT_IO;
+}
+
+static void xm_release(void* u, KitExecMemRegion* region) {
+ (void)u;
+ if (!region || !region->size) return;
+ if (region->token) {
+ XmTok* tok = (XmTok*)region->token;
+ if (tok->r && tok->r != tok->w) munmap(tok->r, tok->n);
+ if (tok->w) munmap(tok->w, tok->n);
+ free(tok);
+ } else if (region->write) {
+ munmap(region->write, region->size);
+ }
+ region->write = region->runtime = NULL;
+ region->size = 0;
+ region->token = NULL;
+}
+
+static void xm_flush(void* u, void* a, size_t n) {
+ (void)u;
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#if defined(__riscv)
+ /* Local-hart self-modify ordering; __builtin___clear_cache below also
+ * issues the cross-hart syscall on Linux. */
+ __asm__ __volatile__("fence.i" ::: "memory");
+#endif
+ __builtin___clear_cache((char*)a, (char*)a + n);
+#else
+ (void)a;
+ (void)n;
+#endif
+}
+
+static KitExecMem g_execmem = {
+ 16 * 1024, xm_reserve, xm_protect, xm_release, xm_flush, NULL,
+};
+
+/* ---- rv32 instruction encodings used by the test ---- */
+/* These are RV32I base-ISA instructions, byte-identical to the rv64 forms. */
+/* `addi a0, zero, 42` — I-type: imm[11:0]=42, rs1=0, funct3=000 (ADDI),
+ * rd=10 (a0), opcode=0010011. */
+#define ENC_ADDI_A0_ZERO_42 0x02a00513u
+/* `jalr zero, 0(ra)` (= ret) — I-type: imm=0, rs1=1 (ra), funct3=000,
+ * rd=0 (zero), opcode=1100111. */
+#define ENC_RET 0x00008067u
+
+/* ---- the test ---- */
+typedef int (*answer_fn)(void);
+
+int main(void) {
+ /* Page size for the execmem. Same dance as the other runners. */
+ {
+ long ps = sysconf(_SC_PAGESIZE);
+ if (ps > 0) g_execmem.page_size = (size_t)ps;
+ }
+
+ KitTargetSpec target =
+ kit_unit_target(KIT_ARCH_RV32, KIT_OS_LINUX, KIT_OBJ_ELF);
+ /* kit_unit_target hardcodes ptr_size/ptr_align=8 for the rv64-era suite;
+ * rv32 is ILP32 and the ELF class (ELFCLASS32 vs 64) is selected from
+ * ptr_size, so narrow them here. */
+ target.ptr_size = 4;
+ target.ptr_align = 4;
+
+ kit_unit_init(&g_u);
+ g_u.ctx.now = -1;
+
+ KitCompiler* c = NULL;
+ if (kit_unit_compiler_new(&g_u, target, &c) != KIT_OK || !c) {
+ fprintf(stderr, "rv32_jit_test: compiler_new failed\n");
+ return 2;
+ }
+
+ /* Build the object. */
+ KitObjBuilder* ob = NULL;
+ if (kit_obj_builder_new(c, &ob) != KIT_OK || !ob) {
+ fprintf(stderr, "rv32_jit_test: obj_builder_new failed\n");
+ kit_compiler_free(c);
+ return 2;
+ }
+
+ KitObjSectionDesc sec_desc;
+ memset(&sec_desc, 0, sizeof(sec_desc));
+ sec_desc.name = kit_sym_intern(c, KIT_SLICE_LIT(".text"));
+ sec_desc.kind = KIT_SEC_TEXT;
+ sec_desc.flags = KIT_SF_EXEC | KIT_SF_ALLOC;
+ sec_desc.align = 4;
+ KitObjSection text = KIT_SECTION_NONE;
+ if (kit_obj_builder_section(ob, &sec_desc, &text) != KIT_OK) {
+ fprintf(stderr, "rv32_jit_test: section failed\n");
+ return 2;
+ }
+
+ uint32_t code[2] = {ENC_ADDI_A0_ZERO_42, ENC_RET};
+ if (kit_obj_builder_write(ob, text, code, sizeof(code)) != KIT_OK) {
+ fprintf(stderr, "rv32_jit_test: write failed\n");
+ return 2;
+ }
+
+ KitObjSymbolDesc sym_desc;
+ memset(&sym_desc, 0, sizeof(sym_desc));
+ sym_desc.name = kit_sym_intern(c, KIT_SLICE_LIT("rv32_jit_answer"));
+ sym_desc.bind = KIT_SB_GLOBAL;
+ sym_desc.kind = KIT_SK_FUNC;
+ sym_desc.section = text;
+ sym_desc.value = 0;
+ sym_desc.size = sizeof(code);
+ KitObjSymbol sym = KIT_OBJ_SYMBOL_NONE;
+ if (kit_obj_builder_symbol(ob, &sym_desc, &sym) != KIT_OK) {
+ fprintf(stderr, "rv32_jit_test: symbol failed\n");
+ return 2;
+ }
+
+ if (kit_obj_builder_finalize(ob) != KIT_OK) {
+ fprintf(stderr, "rv32_jit_test: finalize failed\n");
+ return 2;
+ }
+
+ /* JIT the object. The host's execmem is the W^X dual-map above; for
+ * this test we don't need TLS so the jit_host->tls vtable is NULL. */
+ KitJitHost jhost;
+ memset(&jhost, 0, sizeof(jhost));
+ jhost.execmem = &g_execmem;
+ jhost.tls = NULL;
+
+ KitLinkSessionOptions opts;
+ memset(&opts, 0, sizeof(opts));
+ opts.output_kind = KIT_LINK_OUTPUT_JIT;
+ opts.entry = KIT_SLICE_LIT("rv32_jit_answer");
+ opts.jit_host = &jhost;
+
+ KitLinkSession* sess = NULL;
+ if (kit_link_session_new(c, &opts, &sess) != KIT_OK || !sess) {
+ fprintf(stderr, "rv32_jit_test: link_session_new failed\n");
+ return 1;
+ }
+ if (kit_link_session_add_obj(sess, ob) != KIT_OK) {
+ fprintf(stderr, "rv32_jit_test: add_obj failed\n");
+ kit_link_session_free(sess);
+ return 1;
+ }
+
+ KitJit* jit = NULL;
+ if (kit_link_session_jit(sess, &jit) != KIT_OK || !jit) {
+ fprintf(stderr, "rv32_jit_test: link_session_jit failed\n");
+ kit_link_session_free(sess);
+ return 1;
+ }
+ kit_link_session_free(sess);
+
+ void* fn = kit_jit_lookup(jit, KIT_SLICE_LIT("rv32_jit_answer"));
+ if (!fn) {
+ fprintf(stderr, "rv32_jit_test: lookup failed\n");
+ kit_jit_free(jit);
+ kit_compiler_free(c);
+ return 1;
+ }
+
+ /* Reading back the first instruction bytes through the runtime alias
+ * is always safe and verifies the bytes survived the W^X dance plus
+ * the icache-flush hook fired without crashing. This is the portable
+ * check on non-rv32 hosts. */
+ uint32_t got = 0;
+ memcpy(&got, fn, sizeof(got));
+ if (got != ENC_ADDI_A0_ZERO_42) {
+ fprintf(stderr,
+ "rv32_jit_test: bytes corrupted at runtime alias: got 0x%08x "
+ "expected 0x%08x\n",
+ (unsigned)got, (unsigned)ENC_ADDI_A0_ZERO_42);
+ kit_jit_free(jit);
+ kit_compiler_free(c);
+ return 1;
+ }
+
+#if RV32_HOST_NATIVE
+ /* Real execution on a rv32 host. */
+ {
+ answer_fn f = (answer_fn)(uintptr_t)fn;
+ int r = f();
+ if (r != 42) {
+ fprintf(stderr, "rv32_jit_test: jit fn returned %d, expected 42\n", r);
+ kit_jit_free(jit);
+ kit_compiler_free(c);
+ return 1;
+ }
+ printf("rv32_jit_test: PASS (native rv32 execution returned 42)\n");
+ }
+#else
+ /* Non-rv32 host: JIT plumbing worked end-to-end (image built,
+ * permissions flipped, lookup resolved, bytes intact at the runtime
+ * alias). Skip the actual call — calling rv32 bytes on a non-rv32
+ * CPU would SIGILL. Exit-code 77 is the GNU autotools convention
+ * for "skipped" so test wrappers can distinguish from pass/fail. */
+ printf(
+ "rv32_jit_test: SKIP — non-rv32 host (image built, "
+ "lookup OK, bytes intact)\n");
+ kit_jit_free(jit);
+ kit_compiler_free(c);
+ return 77;
+#endif
+
+ kit_jit_free(jit);
+ kit_compiler_free(c);
+ return 0;
+}
diff --git a/test/parse/run.sh b/test/parse/run.sh
@@ -88,6 +88,10 @@ case "$KIT_TEST_ARCH" in
aa64|aarch64|arm64) TEST_ARCH=aa64; CLANG_TRIPLE=aarch64-linux-gnu; EXEC_ARCH=aarch64 ;;
x64|x86_64|amd64) TEST_ARCH=x64; CLANG_TRIPLE=x86_64-linux-gnu; EXEC_ARCH=x64 ;;
rv64|riscv64) TEST_ARCH=rv64; CLANG_TRIPLE=riscv64-linux-gnu; EXEC_ARCH=rv64 ;;
+ # rv32 is freestanding: the E lane runs bare-metal under qemu-system-riscv32
+ # via exec_rv32_bare.sh, not exec_target's qemu-user path. CLANG_TRIPLE is
+ # only for clang probes; the kit target comes from KIT_TEST_ARCH.
+ rv32|riscv32) TEST_ARCH=rv32; CLANG_TRIPLE=riscv32-unknown-elf; EXEC_ARCH=rv32 ;;
*) printf 'unknown KIT_TEST_ARCH=%s\n' "$KIT_TEST_ARCH" >&2; exit 2 ;;
esac
export KIT_TEST_ARCH
@@ -96,6 +100,7 @@ case "$TEST_ARCH" in
aa64) RT_AR="$ROOT/build/rt/aarch64-linux/libkit_rt.a" ;;
x64) RT_AR="$ROOT/build/rt/x86_64-linux/libkit_rt.a" ;;
rv64) RT_AR="$ROOT/build/rt/riscv64-linux/libkit_rt.a" ;;
+ rv32) RT_AR="$ROOT/build/rt/riscv32-elf-hardfloat/libkit_rt.a" ;; # used by exec_rv32_bare
esac
RT_LINK_ARGS=()
if [ -f "$RT_AR" ]; then
@@ -181,6 +186,16 @@ export have_qemu have_podman is_aarch64 QEMU_BIN EXEC_TARGET_MOUNT_ROOT
# shellcheck source=../lib/exec_target.sh
source "$ROOT/test/lib/exec_target.sh"
+# rv32 is freestanding: the E lane runs bare-metal under qemu-system-riscv32.
+if [ "$TEST_ARCH" = "rv32" ]; then
+ # The parse corpus's entry is test_main() (path C bridges main->test_main).
+ RV32_BARE_ENTRY=test_main
+ export RV32_BARE_ENTRY
+ # shellcheck source=../lib/exec_rv32_bare.sh
+ . "$ROOT/test/lib/exec_rv32_bare.sh"
+ rv32_bare_setup "$BUILD_DIR/rv32"
+fi
+
# ---- harness binaries ------------------------------------------------------
printf 'Checking harness...\n'
@@ -344,6 +359,29 @@ kit_lane_R() {
}
kit_lane_E() {
+ # rv32: freestanding bare-metal. parse-runner --emit -> kit ld with a startup
+ # that calls main() and reports its return via a SiFive finisher -> run under
+ # qemu-system-riscv32 (test/lib/exec_rv32_bare.sh). The qemu exit equals
+ # main()'s return, so the corpus rc==expected oracle applies. Gaps stay RED.
+ if [ "$TEST_ARCH" = "rv32" ]; then
+ local exp_byte rc reason t0 dt run_rc
+ if [ "${RV32_BARE_OK:-0}" -ne 1 ]; then
+ kit_skip "$KIT_NAME/E" "no rv32 runner (clang riscv32 + qemu-system-riscv32)"
+ return
+ fi
+ _parse_emit_obj || return
+ exp_byte=$(( KIT_EXPECTED & 0xff ))
+ t0=$(kit_now_ms)
+ reason="$(rv32_bare_run "$PARSE_OBJ" "$KIT_WORK" "$KIT_WORK/exec.rc")"
+ run_rc=$?
+ dt=$(( $(kit_now_ms) - t0 ))
+ kit_time E "$dt"
+ if [ "$run_rc" -eq 2 ]; then kit_fail "$KIT_NAME/E" "$reason, ${dt}ms"; return; fi
+ rc="$(cat "$KIT_WORK/exec.rc" 2>/dev/null || echo 99)"
+ if [ "$rc" -eq "$exp_byte" ]; then kit_pass "$KIT_NAME/E (${dt}ms)"
+ else kit_fail "$KIT_NAME/E" "expected $exp_byte got $rc (qemu-system-riscv32), ${dt}ms"; fi
+ return
+ fi
if [ $have_exe_runner -ne 1 ] || [ $have_clang_cross -ne 1 ] || [ $have_start_obj -ne 1 ]; then
kit_skip "$KIT_NAME/E" "no link-exe-runner, $TEST_ARCH clang, or start.o"
return
diff --git a/test/smoke/rv32.sh b/test/smoke/rv32.sh
@@ -0,0 +1,218 @@
+#!/usr/bin/env bash
+# test/smoke/rv32.sh — behavioral oracle for kit's riscv32-none-elf codegen.
+#
+# Unlike rv64 (qemu-user / podman, a riscv64-linux ELF), rv32 is freestanding
+# `-none-elf`: kit compiles the app, we link a bare-metal image at the `virt`
+# machine's RAM base (0x80000000) with a startup stub that enables the FPU
+# (mstatus.FS, required before any fadd.s under ilp32f) and a SiFive test
+# finisher at 0x100000 that turns a program result into a qemu exit code
+# (0x5555 -> poweroff/exit 0; 0x3333|(code<<16) -> exit code). Run under
+# qemu-system-riscv32 -machine virt -bios none -kernel.
+#
+# Scope: the full verified codegen surface — 32-bit integer + pointer, hardware
+# single-float (ilp32f), control flow, AND (WS6) 64-bit-value legalization:
+# long long carry/borrow/bitwise/compare/convert inline as GPR pairs, i64
+# mul/div/shift via __*di3 runtime calls, and soft `double` arith/compare/
+# convert via __*df3 calls. Two ABI lanes are exercised: ilp32f (hardware single
+# float, soft double) and ilp32 (pure soft float). The i64-mul/div/shift and
+# double cases link kit's freestanding runtime (libkit_rt.a) for the helpers;
+# the inline i64 cases need no runtime.
+#
+# kit compiles app.c (the code under test); the startup stub is clang-assembled
+# and the final bare-metal link uses ld.lld (kit's static ELF base-addr control
+# for `virt` is exercised separately). The .eh_frame kit emits is discarded by
+# the bare-metal link script (no unwinder in a freestanding image).
+#
+# Skipped (per the shared kit_exit convention) if clang lacks the riscv32
+# target, ld.lld is missing, or qemu-system-riscv32 is unavailable.
+
+set -u
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+BUILD_DIR="$ROOT/build/test/smoke-rv32"
+mkdir -p "$BUILD_DIR"
+
+KIT_KIT_DIR="$ROOT/test/lib"
+# shellcheck source=../lib/kit_sh_kit.sh
+. "$ROOT/test/lib/kit_sh_kit.sh"
+kit_report_init
+KIT_SKIP_IS_FAILURE=1
+
+# ---- prerequisites (the rv32 doctor) --------------------------------------
+# shellcheck source=../lib/check_rv32_env.sh
+. "$ROOT/test/lib/check_rv32_env.sh"
+check_rv32_env
+if [ "${RV32_HAVE_CLANG_TARGET:-0}" -eq 0 ]; then
+ skip_test "smoke-rv32" "clang riscv32 target unavailable"; kit_summary test-smoke-rv32; kit_exit
+fi
+if [ "${RV32_HAVE_LLD:-0}" -eq 0 ]; then
+ skip_test "smoke-rv32" "ld.lld unavailable"; kit_summary test-smoke-rv32; kit_exit
+fi
+if [ "${RV32_HAVE_QEMU_SYSTEM:-0}" -eq 0 ]; then
+ skip_test "smoke-rv32" "qemu-system-riscv32 unavailable"; kit_summary test-smoke-rv32; kit_exit
+fi
+
+KIT="$ROOT/build/kit"
+QEMU="${RV32_QEMU_SYSTEM_BIN:-qemu-system-riscv32}"
+
+# Per-ABI startup stubs. ilp32f must enable the FPU (mstatus.FS) before any
+# fadd.s; pure-soft ilp32 has no FPU and skips it.
+cat > "$BUILD_DIR/start_hf.S" <<'EOF'
+.section .text.start,"ax",@progbits
+.globl _start
+_start:
+ li sp, 0x80100000
+ li t0, 0x2000 /* mstatus.FS = Initial: enable the FPU */
+ csrs mstatus, t0
+ call cmain
+1: j 1b
+EOF
+cat > "$BUILD_DIR/start_soft.S" <<'EOF'
+.section .text.start,"ax",@progbits
+.globl _start
+_start:
+ li sp, 0x80100000
+ call cmain
+1: j 1b
+EOF
+cat > "$BUILD_DIR/link.ld" <<'EOF'
+ENTRY(_start)
+SECTIONS {
+ . = 0x80000000;
+ .text : { *(.text.start) *(.text*) }
+ .rodata : { *(.rodata*) }
+ .data : { *(.data*) }
+ .bss : { *(.bss*) *(COMMON) }
+ /DISCARD/ : { *(.eh_frame) *(.eh_frame_hdr) *(.riscv.attributes) *(.comment) }
+}
+EOF
+
+# The app under test. `hw_float` is gated so the pure-soft ilp32 lane (no FPU,
+# rv32imac) does not emit fadd.s. Every check returns a distinct nonzero code on
+# failure so a qemu exit pinpoints the broken case.
+cat > "$BUILD_DIR/app.c" <<'EOF'
+#define FINISHER ((volatile unsigned int*)0x100000)
+__attribute__((noreturn)) static void finish(int code){
+ *FINISHER = code ? (0x3333u | ((unsigned)code << 16)) : 0x5555u;
+ for(;;){}
+}
+static int compute(void){
+ /* 32-bit integer / control flow */
+ int acc = 0; for (int i = 0; i < 10; i++) acc += i;
+ if (acc != 45) return 1;
+ volatile unsigned u = 0x12345678u; u ^= 0x0F0F0F0Fu;
+ if (u != (0x12345678u ^ 0x0F0F0F0Fu)) return 2;
+#ifdef HW_FLOAT
+ volatile float f = 1.5f; f = f + 2.5f; /* HW fadd.s -> 4.0 */
+ if (f != 4.0f) return 3;
+#endif
+ /* i64 inline: carry / borrow / bitwise / compare / convert (no runtime) */
+ volatile unsigned long long a = 0xFFFFFFFFull; a += 1;
+ if (a != 0x100000000ull) return 4;
+ volatile unsigned long long b = 0x100000000ull; b -= 1;
+ if (b != 0x0FFFFFFFFull) return 5;
+ volatile unsigned long long c = 0x1122334455667788ull;
+ if ((c ^ 0xFFFFFFFFFFFFFFFFull) != 0xEEDDCCBBAA998877ull) return 6;
+ volatile long long x = 0x1234567800000000ll, y = 0x1234567700000001ll;
+ if (!(x > y) || !(y < x) || (x == y)) return 7;
+ volatile int s32 = -7; volatile long long s64 = s32;
+ if (s64 != -7) return 8;
+ volatile long long big = 0x00000000FAFAFAFAll;
+ if ((unsigned)(int)big != 0xFAFAFAFAu) return 9;
+ if (a) { } else return 10; /* truthiness (hi word set) */
+ /* i64 runtime: mul / udiv / umod / shifts (__muldi3/__udivdi3/...) */
+ volatile unsigned long long m = 0x0000000100000001ull;
+ if (m * 3ull != 0x0000000300000003ull) return 11;
+ volatile unsigned long long d = 0xFFFFFFFFFFFFFFFFull;
+ if (d / 0xFFFFFFFFull != 0x0000000100000001ull) return 12;
+ if (d % 7ull != (0xFFFFFFFFFFFFFFFFull % 7ull)) return 13;
+ volatile unsigned long long sh = 1ull;
+ if ((sh << 40) != 0x0000010000000000ull) return 14;
+ volatile long long sar = -0x4000000000000000ll;
+ if ((sar >> 36) != (-0x4000000000000000ll >> 36)) return 15;
+ /* soft double: arith / compare / convert (__adddf3/__muldf3/__floatsidf/...) */
+ volatile double p = 1.5, q = 2.25;
+ if (p + q != 3.75) return 16;
+ if (p * q != 3.375) return 17;
+ if (!(p < q) || (p >= q)) return 18;
+ volatile int iv = 7; volatile double dv = iv;
+ if (dv != 7.0) return 19;
+ volatile double dd = 3.75; if ((int)dd != 3) return 20;
+ volatile long long L = 5000000000ll; volatile double dL = L;
+ if (dL != 5000000000.0) return 21;
+ volatile double dbig = 5000000000.0;
+ if ((long long)dbig != 5000000000ll) return 22;
+ return 0;
+}
+void cmain(void){ finish(compute()); }
+EOF
+
+# Build kit's freestanding runtime archive for the helpers (i64 mul/div/shift +
+# soft double). Built on demand; skip the runtime-dependent checks if it cannot
+# be produced (e.g. a partial checkout) rather than failing spuriously.
+RT_HF="$ROOT/build/rt/riscv32-elf-hardfloat/libkit_rt.a"
+RT_SF="$ROOT/build/rt/riscv32-elf/libkit_rt.a"
+make -C "$ROOT" rt-riscv32-elf-hardfloat rt-riscv32-elf >/dev/null 2>&1 || true
+
+run_lane() { # <name> <march> <mabi> <startsrc> <rtarchive> <cppdef>
+ local name="$1" march="$2" mabi="$3" startsrc="$4" rt="$5" def="$6"
+ local clang_mabi="$mabi" clang_march
+ clang_march="${march%%_*}" # clang wants e.g. rv32imafc (no _zicsr_zifencei)
+ if [ ! -f "$rt" ]; then
+ skip_test "$name" "runtime archive $rt missing"; return; fi
+ local so="$BUILD_DIR/$name.start.o"
+ clang --target=riscv32-unknown-elf -march="$clang_march" -mabi="$clang_mabi" \
+ -ffreestanding -nostdlib -c "$startsrc" -o "$so" 2>/dev/null
+ local O
+ for O in -O0 -O1; do
+ local o="$BUILD_DIR/$name$O.o" elf="$BUILD_DIR/$name$O.elf"
+ if ! "$KIT" cc -target riscv32-none-elf -march="$march" -mabi="$mabi" $O $def \
+ -ffreestanding -c "$BUILD_DIR/app.c" -o "$o" 2>"$BUILD_DIR/$name$O.cc.err"; then
+ not_ok "$name $O (kit cc)" "$BUILD_DIR/$name$O.cc.err"; continue; fi
+ if ! ld.lld -T "$BUILD_DIR/link.ld" "$so" "$o" "$rt" -o "$elf" \
+ 2>"$BUILD_DIR/$name$O.ld.err"; then
+ not_ok "$name $O (link)" "$BUILD_DIR/$name$O.ld.err"; continue; fi
+ local rc=0
+ timeout 20 "$QEMU" -machine virt -bios none -kernel "$elf" -nographic -no-reboot \
+ >/dev/null 2>&1 || rc=$?
+ if [ "$rc" -eq 0 ]; then ok "$name $O (qemu rc=0)";
+ else not_ok "$name $O" "expected exit 0, got $rc (failed check #$rc)"; fi
+ done
+}
+
+# ilp32f: hardware single float + soft double + i64.
+run_lane "ilp32f" "rv32imafc_zicsr_zifencei" "ilp32f" "$BUILD_DIR/start_hf.S" "$RT_HF" "-DHW_FLOAT"
+# ilp32: pure soft float (no FPU) + i64.
+run_lane "ilp32" "rv32imac_zicsr_zifencei" "ilp32" "$BUILD_DIR/start_soft.S" "$RT_SF" ""
+
+# Full kit toolchain: re-link the ilp32f objects with `kit ld` (not ld.lld) to
+# prove kit produces a correct bootable rv32 static image end-to-end. A
+# freestanding rv32 target defaults to non-PIE and auto-links no runtime, so no
+# -no-pie / -nostdlib is needed; -T places .text.start at the qemu `virt` RAM
+# base (0x80000000) and the runtime archive is supplied explicitly.
+if [ -f "$BUILD_DIR/ilp32f-O1.o" ] && [ -f "$RT_HF" ]; then
+ if "$KIT" ld -T "$BUILD_DIR/link.ld" -e _start \
+ "$BUILD_DIR/ilp32f.start.o" "$BUILD_DIR/ilp32f-O1.o" "$RT_HF" \
+ -o "$BUILD_DIR/kitld.elf" 2>"$BUILD_DIR/kitld.ld.err"; then
+ rc=0
+ timeout 20 "$QEMU" -machine virt -bios none -kernel "$BUILD_DIR/kitld.elf" \
+ -nographic -no-reboot >/dev/null 2>&1 || rc=$?
+ if [ "$rc" -eq 0 ]; then ok "kit-ld ilp32f -O1 (qemu rc=0)";
+ else not_ok "kit-ld ilp32f -O1" "expected exit 0, got $rc"; fi
+ else
+ not_ok "kit-ld ilp32f -O1 (link)" "$BUILD_DIR/kitld.ld.err"
+ fi
+fi
+
+# Negative control: a deliberately wrong result must produce a nonzero exit.
+sed 's/if (acc != 45) return 1;/if (acc != 45) return 1; return 99;/' "$BUILD_DIR/app.c" > "$BUILD_DIR/bad.c"
+if "$KIT" cc -target riscv32-none-elf -march=rv32imac_zicsr_zifencei -mabi=ilp32 -O1 \
+ -ffreestanding -c "$BUILD_DIR/bad.c" -o "$BUILD_DIR/bad.o" 2>/dev/null \
+ && ld.lld -T "$BUILD_DIR/link.ld" "$BUILD_DIR/ilp32.start.o" "$BUILD_DIR/bad.o" "$RT_SF" \
+ -o "$BUILD_DIR/bad.elf" 2>/dev/null; then
+ rc=0; timeout 20 "$QEMU" -machine virt -bios none -kernel "$BUILD_DIR/bad.elf" -nographic -no-reboot >/dev/null 2>&1 || rc=$?
+ if [ "$rc" -eq 99 ]; then ok "negative-control (qemu rc=99)";
+ else not_ok "negative-control" "expected exit 99, got $rc"; fi
+fi
+
+kit_summary test-smoke-rv32
+kit_exit
diff --git a/test/toy/run.sh b/test/toy/run.sh
@@ -63,7 +63,7 @@ case "$PATHS" in *X*) RUN_X=1;; *) RUN_X=0;; esac
case "$PATHS" in *C*) RUN_C=1;; *) RUN_C=0;; esac
case "$PATHS" in *W*) RUN_W=1;; *) RUN_W=0;; esac
case "$PATHS" in *I*) RUN_I=1;; *) RUN_I=0;; esac
-TOY_CROSS_ARCHS="${KIT_TOY_CROSS_ARCHS:-aa64 x64 rv64}"
+TOY_CROSS_ARCHS="${KIT_TOY_CROSS_ARCHS:-aa64 x64 rv64 rv32}"
TOY_OPT_LEVELS="${KIT_OPT_LEVELS:-0 1}"
HOST_CC="${CC:-cc}"
PAR="${KIT_TOY_PARALLEL:-1}"
@@ -303,9 +303,50 @@ EOF_START
printf '%s' "$start_o"
}
+# rv32 cross arch: freestanding (`-none-elf`), so it cannot use the Linux
+# qemu-user / exec_target path the other cross arches share. Compile for
+# riscv32-none-elf and run the bare-metal image under qemu-system-riscv32 via the
+# shared helper. Runs inline (one qemu-system boot per image) rather than through
+# the deferred exec_target queue.
+#
+# A `<name>.rv32.skip` sidecar opts a single case out (the standard per-lane
+# mechanism, like `.link.skip` / `.wasm.skip`) — but NONE are committed: the real
+# rv32 gaps surfaced by this lane (i64 atomics, the 64-bit overflow intrinsic,
+# i64 varargs, thread-local storage, a toy soft-float compare lowering) are left
+# RED on purpose, so they are not silently hidden. The sidecar exists only for
+# cases that are genuinely inapplicable to rv32 (and may be added later). The
+# shared asmnop skip (an aa64-only construct, already skipped for every non-aa64
+# cross arch) and the env-unavailable skip are the only built-in non-run verdicts.
+cross_one_rv32() {
+ local label="$KIT_BASE/X-O$KIT_OPT:rv32" rc reason exp obj cc_err
+ local skip="${KIT_SRC%.toy}.rv32.skip"
+ if [ -e "$skip" ]; then kit_skip "$label" "$(head -n1 "$skip")"; return; fi
+ if [ "${RV32_BARE_OK:-0}" -ne 1 ]; then
+ kit_skip "$label" "no rv32 runner (clang riscv32 + qemu-system-riscv32)"; return; fi
+ if grep -q 'asmnop' "$KIT_SRC" 2>/dev/null; then
+ kit_skip "$label" "asmnop is target-specific before toy asm selectors"; return; fi
+ obj="$KIT_WORK/$KIT_BASE.O$KIT_OPT.rv32.o"; cc_err="$KIT_WORK/rv32.cc.err"
+ if ! "$KIT" cc "-O$KIT_OPT" -target riscv32-none-elf \
+ -march=rv32imafc_zicsr_zifencei -mabi=ilp32f -ffreestanding \
+ -c "$KIT_SRC" -o "$obj" > "$KIT_WORK/rv32.cc.out" 2> "$cc_err"; then
+ kit_fail "$label" "kit cc -target riscv32-none-elf failed"
+ sed 's/^/ | /' "$cc_err"; return
+ fi
+ if [ -s "$cc_err" ]; then
+ kit_fail "$label" "kit cc rv32 wrote stderr"; sed 's/^/ | /' "$cc_err"; return; fi
+ reason="$(rv32_bare_run "$obj" "$KIT_WORK" "$KIT_WORK/rv32.rc")"
+ if [ $? -eq 2 ]; then kit_skip "$label" "$reason"; return; fi
+ rc="$(cat "$KIT_WORK/rv32.rc" 2>/dev/null || echo 99)"; exp=$(( KIT_EXPECTED & 255 ))
+ # Bare-metal has no stderr channel; the finisher maps main()'s return onto the
+ # qemu exit code, so compare it directly to the expected exit code.
+ if [ "$rc" -eq "$exp" ]; then kit_pass "$label"
+ else kit_fail "$label" "expected rc $exp, got $rc (qemu-system-riscv32)"; fi
+}
+
cross_one() {
local arch="$1"
local triple tag obj exe start_obj cc_err ld_err out err label
+ if [ "$arch" = "rv32" ]; then cross_one_rv32; return; fi
triple="$(cross_triple_for "$arch")" || {
kit_skip "$KIT_BASE/X-O$KIT_OPT:$arch" "unknown cross arch"
return
@@ -538,6 +579,14 @@ if [ "$RUN_X" -eq 1 ]; then
export EXEC_TARGET_MOUNT_ROOT
# shellcheck source=../lib/exec_target.sh
. "$ROOT/test/lib/exec_target.sh"
+ # rv32 is a cross arch in path X, but freestanding (qemu-system bare-metal),
+ # so it uses its own exec helper rather than exec_target's qemu-user path.
+ case " $TOY_CROSS_ARCHS " in
+ *" rv32 "*)
+ # shellcheck source=../lib/exec_rv32_bare.sh
+ . "$ROOT/test/lib/exec_rv32_bare.sh"
+ rv32_bare_setup "$BUILD_DIR/rv32" ;;
+ esac
fi
# ---- drive the corpora -----------------------------------------------------