kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 86a819d3f72a1da40cc323e83bc411fefd5fea50
parent 01252f062d3256bf8804f706e0226db5be82efcd
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  3 Jun 2026 19:37:22 -0700

rv32: close most remaining cross-target reds (atomics, overflow, soft-fp, TLS, __int128, CSR, asm goldens)

Closes the bulk of the deliberately-red rv32 gaps from doc/plan/RV32.md.
rv64/x64/aa64 non-regressed (isa rv32 90/rv64 79/aa64 43, asm-rv64 43/0 +
asm-rv32 16/0, elf 41/0, link 122/0, smoke-rv64 3/0 + smoke-rv32 7/0,
cg-api, abi-classify 367/0).

- i64 atomics: route 8-byte atomics to the existing __atomic_*_8 rt libcalls
  on rv32 (is_lock_free/is_legal made target+size-aware). Fixes the
  123_spec_demo hang + 7 toy atomic cases. New 124_atomic_word_ops covers
  word atomics on all lanes.
- i64 overflow intrinsics: inline 2-lane lowering in kit_cg_intrinsic
  (uadd/usub carry/borrow, sadd/ssub sign formula, umul/smul 128-bit product).
- soft-fp compare: api_wide8_addr materializes a delayed wide8 operand before
  taking its address (fixes toy 153 at -O0).
- TLS: rv32 bare-metal startup stub sets up a static-TLS block + tp. This
  required, and the commit therefore adds, a kit-ld linker-script fix: accept
  the 1-arg ALIGN(align) GNU form.
- __int128: rejected on rv32 with a target-gated diagnostic (matches gcc/clang
  64-bit-only); 14 i128_* cases skipped via .rv32.skip sidecars.
- CSR pseudo-ops: csrr/csrw/csrs/csrc/csrwi/csrsi/csrci + CSR-name table
  (benefits rv64 too); both rv32 startup stubs now kit-assembled (clang stub
  dependency dropped).
- asm goldens: rv32 byte-golden lane + regen-rv32.sh + corpus; opt-in
  test-asm-rv32/test-toy-rv32/test-parse-rv32 targets. Also fixes a real
  assembler bug: bare fcvt.w.s defaulted rm to RTZ, now dyn (gas/clang);
  stale rv64 fp goldens regenerated.
- test-env skip sidecars for arch-specific toy/C cases (145/20/47, asm_01).

Still red (follow-up): i64/double varargs (high 32 bits of an 8-byte vararg
are dropped — caller/callee ABI), 123_spec_demo -O1 (switch-table label
width), and the pre-existing rv32 -O1 cluster + ldbl128.

Diffstat:
Mlang/c/parse/parse_type.c | 15+++++++++++++++
Mmk/test.mk | 30+++++++++++++++++++++++++++++-
Msrc/arch/riscv/asm.c | 68+++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
Msrc/arch/riscv/isa.c | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/arch/riscv/isa.h | 18++++++++++++++++++
Msrc/cg/arith.c | 258+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/cg/atomic.c | 193++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Msrc/cg/wide.c | 12++++++++++++
Msrc/link/link_script.c | 23+++++++++++++++++++----
Mtest/arch/rv32_decode_test.c | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/arch/rv64_decode_test.c | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/asm/decode/rv32_arith.expected.txt | 9+++++++++
Atest/asm/decode/rv32_arith.hex | 1+
Atest/asm/decode/rv32_arith.targets | 1+
Atest/asm/decode/rv32_branches.expected.txt | 6++++++
Atest/asm/decode/rv32_branches.hex | 1+
Atest/asm/decode/rv32_branches.targets | 1+
Atest/asm/decode/rv32_fp.expected.txt | 10++++++++++
Atest/asm/decode/rv32_fp.hex | 1+
Atest/asm/decode/rv32_fp.targets | 1+
Atest/asm/decode/rv32_jumps.expected.txt | 4++++
Atest/asm/decode/rv32_jumps.hex | 1+
Atest/asm/decode/rv32_jumps.targets | 1+
Atest/asm/decode/rv32_loadstore.expected.txt | 10++++++++++
Atest/asm/decode/rv32_loadstore.hex | 1+
Atest/asm/decode/rv32_loadstore.targets | 1+
Atest/asm/decode/rv32_lui_auipc.expected.txt | 4++++
Atest/asm/decode/rv32_lui_auipc.hex | 1+
Atest/asm/decode/rv32_lui_auipc.targets | 1+
Atest/asm/decode/rv32_muldiv.expected.txt | 8++++++++
Atest/asm/decode/rv32_muldiv.hex | 1+
Atest/asm/decode/rv32_muldiv.targets | 1+
Atest/asm/decode/rv32_shifts.expected.txt | 6++++++
Atest/asm/decode/rv32_shifts.hex | 1+
Atest/asm/decode/rv32_shifts.targets | 1+
Atest/asm/encode/rv32_arith.expected.hex | 1+
Atest/asm/encode/rv32_arith.s | 10++++++++++
Atest/asm/encode/rv32_arith.targets | 1+
Atest/asm/encode/rv32_branches.expected.hex | 1+
Atest/asm/encode/rv32_branches.s | 7+++++++
Atest/asm/encode/rv32_branches.targets | 1+
Atest/asm/encode/rv32_fp.expected.hex | 1+
Atest/asm/encode/rv32_fp.s | 11+++++++++++
Atest/asm/encode/rv32_fp.targets | 1+
Atest/asm/encode/rv32_jumps.expected.hex | 1+
Atest/asm/encode/rv32_jumps.s | 5+++++
Atest/asm/encode/rv32_jumps.targets | 1+
Atest/asm/encode/rv32_loadstore.expected.hex | 1+
Atest/asm/encode/rv32_loadstore.s | 11+++++++++++
Atest/asm/encode/rv32_loadstore.targets | 1+
Atest/asm/encode/rv32_lui_auipc.expected.hex | 1+
Atest/asm/encode/rv32_lui_auipc.s | 5+++++
Atest/asm/encode/rv32_lui_auipc.targets | 1+
Atest/asm/encode/rv32_muldiv.expected.hex | 1+
Atest/asm/encode/rv32_muldiv.s | 9+++++++++
Atest/asm/encode/rv32_muldiv.targets | 1+
Atest/asm/encode/rv32_shifts.expected.hex | 1+
Atest/asm/encode/rv32_shifts.s | 7+++++++
Atest/asm/encode/rv32_shifts.targets | 1+
Mtest/asm/encode/rv64_fp.expected.hex | 2+-
Mtest/asm/encode/rv64_fp_cvt.expected.hex | 2+-
Atest/asm/regen-rv32.sh | 107+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/asm/run.sh | 2++
Mtest/lib/exec_rv32_bare.sh | 81+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Atest/parse/cases/asm_01_grammar.rv32.skip | 1+
Atest/parse/cases/i128_01_layout.rv32.skip | 1+
Atest/parse/cases/i128_02_literal_storage.rv32.skip | 1+
Atest/parse/cases/i128_03_add_sub_carry.rv32.skip | 1+
Atest/parse/cases/i128_04_mul_high_half.rv32.skip | 1+
Atest/parse/cases/i128_05_div_mod.rv32.skip | 1+
Atest/parse/cases/i128_06_shifts_bitwise.rv32.skip | 1+
Atest/parse/cases/i128_07_compare.rv32.skip | 1+
Atest/parse/cases/i128_08_signed_shift_convert.rv32.skip | 1+
Atest/parse/cases/i128_09_call_return.rv32.skip | 1+
Atest/parse/cases/i128_10_struct_storage.rv32.skip | 1+
Atest/parse/cases/i128_11_union_lanes.rv32.skip | 1+
Atest/parse/cases/i128_12_global_init.rv32.skip | 1+
Atest/parse/cases/i128_13_signed_div_mod.rv32.skip | 1+
Atest/parse/cases/i128_14_arbitrary_mul.rv32.skip | 1+
Mtest/smoke/rv32.sh | 19+++++++++----------
Atest/toy/cases/124_atomic_word_ops.expected | 1+
Atest/toy/cases/124_atomic_word_ops.toy | 17+++++++++++++++++
Atest/toy/cases/145_baremetal_privileged_aa64.rv32.skip | 1+
Atest/toy/cases/20_cg_api_inline_asm_full.rv32.skip | 1+
Atest/toy/cases/47_target_arch_switch.rv32.skip | 1+
85 files changed, 1157 insertions(+), 40 deletions(-)

diff --git a/lang/c/parse/parse_type.c b/lang/c/parse/parse_type.c @@ -12,6 +12,15 @@ static const Type* ty_size_t(Parser* p) { return c_abi_size_type(p->abi, p->pool); } +/* __int128 is a 16-byte scalar that the ABI, runtime ti3 helpers, and the + * HAS_INT128 capability only support on 64-bit targets (matching GCC/Clang, + * which reject __int128 on 32-bit architectures). Gate on the target pointer + * width: ptr_size == 8 means 64-bit (rv64/x64/aa64) and int128 is available; + * a 32-bit target (rv32, with a 4-byte pointer) has no working int128. */ +static int target_has_int128(Parser* p) { + return kit_compiler_target_spec(p->c).ptr_size == 8; +} + /* ============================================================ * GNU __attribute__ (Phase 1 — parse + carry; no semantic wire-up) * ============================================================ */ @@ -91,6 +100,9 @@ static const Type* attrs_apply_type_mode(Parser* p, const Type* base, const Type* u = type_unqual(p->pool, base); int is_unsigned = u && type_is_int(u) && c_abi_type_info(p->abi, u).signed_ == 0; + if (!target_has_int128(p)) { + perr(p, "__int128 is not supported on the target architecture"); + } return type_prim(p->pool, is_unsigned ? TY_UINT128 : TY_INT128); } } @@ -562,6 +574,9 @@ const Type* resolve_type_specs(Parser* p, const TypeSpecAccum* a, SrcLoc loc) { return type_prim(p->pool, a->saw_unsigned ? TY_USHORT : TY_SHORT); } if (a->saw_int128) { + if (!target_has_int128(p)) { + perr(p, "__int128 is not supported on the target architecture"); + } return type_prim(p->pool, a->saw_unsigned ? TY_UINT128 : TY_INT128); } if (a->long_count == 2) { diff --git a/mk/test.mk b/mk/test.mk @@ -40,6 +40,9 @@ TEST_TARGETS = \ test-asm-aa64 \ test-asm-x64 \ test-asm-rv64 \ + test-asm-rv32 \ + test-toy-rv32 \ + test-parse-rv32 \ test-disasm-complete \ test-asm-roundtrip \ test-asm-roundtrip-exec \ @@ -729,7 +732,7 @@ test-parse-err: lib $(PARSE_RUNNER) # so `make test` covers all three through one target. 
The harness runs one arch # per invocation (KIT_TEST_ARCH); each lane scopes its scratch per arch, so # the prerequisites are safe to run in parallel under `make -j`. -test-asm: test-asm-aa64 test-asm-x64 test-asm-rv64 +test-asm: test-asm-aa64 test-asm-x64 test-asm-rv64 test-asm-rv32 # test-asm-aa64: the reference lane. aa64 is the default cross-target, and on # aa64 hosts the exec paths (D/E/J) run natively, so it uses the full default @@ -746,6 +749,31 @@ test-asm-x64: lib $(ASM_RUNNER) @KIT_TEST_ARCH=x64 KIT_TEST_PATHS=HTL bash test/asm/run.sh test-asm-rv64: lib $(ASM_RUNNER) @KIT_TEST_ARCH=rv64 KIT_TEST_PATHS=HT bash test/asm/run.sh +# riscv32-none-elf is freestanding (no qemu-user exec lane), so like rv64 it +# runs the host-independent encode (H) + decode (T) lanes only. The rv32_* +# goldens use a no-compressed ISA so encodings stay a stable 4 bytes. +test-asm-rv32: lib $(ASM_RUNNER) + @KIT_TEST_ARCH=rv32 KIT_TEST_PATHS=HT bash test/asm/run.sh + +# test-toy-rv32: the rv32 cross lane (path X) of the Toy corpus, scoped to the +# rv32 arch only. test/toy/run.sh's cross_one_rv32 compiles each case with +# `kit cc -target riscv32-none-elf` and runs the freestanding ELF bare-metal +# under qemu-system-riscv32 via test/lib/exec_rv32_bare.sh (the qemu exit code +# is the exit-code oracle). Self-skips per case when the rv32 toolchain +# (clang riscv32 + qemu-system-riscv32) is absent. Opt-in (not in +# DEFAULT_TEST_TARGETS): real rv32 codegen gaps are still left RED on purpose. +test-toy-rv32: bin rt-riscv32-elf-hardfloat + @KIT=$(abspath $(BIN)) KIT_TOY_CROSS_ARCHS=rv32 KIT_TEST_PATHS=X \ + test/toy/run.sh + +# test-parse-rv32: the C-parser corpus run for riscv32-none-elf, exec lane (E) +# only. parse-runner --emit -> kit ld + start crt -> qemu-system-riscv32 +# (test/parse/run.sh's rv32 freestanding E path via exec_rv32_bare.sh). 
Models +# test-parse-rv64-wide; opt-in (needs the rv32 toolchain/qemu), so excluded +# from DEFAULT_TEST_TARGETS while rv32 reds still exist. +test-parse-rv32: lib rt-riscv32-elf-hardfloat $(PARSE_RUNNER) $(ROUNDTRIP_BIN) \ + $(LINK_EXE_RUNNER) + @KIT_TEST_ARCH=rv32 KIT_TEST_PATHS=E bash test/parse/run.sh # Codegen round-trip completeness (doc/ASM_ROUNDTRIP_TESTING.md). These drive # the `kit` binary itself (cc -S / as / objdump) over a C corpus rather than diff --git a/src/arch/riscv/asm.c b/src/arch/riscv/asm.c @@ -112,6 +112,21 @@ static void expect_comma(AsmDriver* d) { if (!asm_driver_eat_comma(d)) asm_driver_panic(d, "rv64 asm: expected ','"); } +/* Parse a CSR operand: a standard CSR name (mstatus, mtvec, ...) or a bare + * numeric expression. Returns the 12-bit CSR number. */ +static u32 parse_csr(AsmDriver* d) { + AsmTok t = asm_driver_peek(d); + if (t.kind == ASM_TOK_IDENT) { + u16 num; + if (rv64_csr_num_from_name(pool_slice(asm_driver_pool(d), t.v.ident), + &num)) { + (void)asm_driver_next(d); /* consume the name */ + return (u32)num & 0xfffu; + } + } + return (u32)asm_driver_parse_const(d) & 0xfffu; +} + /* Position of a `%mod(sym)` relocation operand: the 20-bit upper field of * lui/auipc, or a 12-bit I-type (addi/load) or S-type (store) immediate. */ typedef enum RvModPos { @@ -681,10 +696,12 @@ static u32 assemble_one(AsmDriver* d, const Rv64InsnDesc* desc) { } /* match encodes rs2 (type selector); OR in rd/rs1 and the rounding mode. * An explicit `, <rm>` suffix (cc -S emits it for non-default modes, and - * clang/gas accept it) takes precedence; otherwise the rm is fixed per - * conversion family (mirrors the rv_fcvt_* encoders in isa.h, the codegen - * source of truth): fp->int truncates (RTZ=1); int->fp and fp->fp use the - * default DYN=7; fmv bit-moves carry no rounding (rm=0). 
*/ + * clang/gas accept it) takes precedence; otherwise a bare conversion + * mnemonic encodes the dynamic rounding mode (DYN=7), matching gas/clang + * for hand-written assembly. (Codegen's C float->int truncation is RTZ, + * but that path uses the rv_fcvt_* encoders directly and supplies its own + * rm; the text assembler must follow the assembler convention.) fmv + * bit-moves carry no rounding (rm=0). */ { u32 funct7 = (m >> 25) & 0x7fu; u32 rm; @@ -692,17 +709,13 @@ static u32 assemble_one(AsmDriver* d, const Rv64InsnDesc* desc) { rm = rv_parse_rm_name(d); } else { switch (funct7) { - case 0x60: /* fcvt.{w,wu,l,lu}.s */ - case 0x61: /* fcvt.{w,wu,l,lu}.d */ - rm = 0x1u; /* RTZ */ - break; case 0x70: /* fmv.x.w */ case 0x71: /* fmv.x.d */ case 0x78: /* fmv.w.x */ case 0x79: /* fmv.d.x */ rm = 0x0u; break; - default: /* int->fp (0x68/0x69) and fp<->fp (0x20/0x21): DYN */ + default: /* fcvt families: DYN (explicit suffix overrides above) */ rm = 0x7u; break; } @@ -729,23 +742,48 @@ static u32 assemble_one(AsmDriver* d, const Rv64InsnDesc* desc) { return enc_amo(m, 0u, 0u, rd, rs1, 0u); case RV64_FMT_CSR: { - i32 csr; + u32 csr; rd = parse_xreg(d); expect_comma(d); - csr = (i32)asm_driver_parse_const(d); + csr = parse_csr(d); expect_comma(d); rs1 = parse_xreg(d); - return enc_i(m, rd, rs1, csr); + return enc_i(m, rd, rs1, (i32)csr); } case RV64_FMT_CSRI: { - i32 csr; + u32 csr; rd = parse_xreg(d); expect_comma(d); - csr = (i32)asm_driver_parse_const(d); + csr = parse_csr(d); expect_comma(d); u32 uimm = (u32)asm_driver_parse_const(d) & 0x1fu; - return enc_i(m, rd, uimm, csr); + return enc_i(m, rd, uimm, (i32)csr); + } + + case RV64_FMT_CSR_PSEUDO: { + /* 2-operand CSR pseudos. The match word already pins funct3+opcode; we + * supply x0 for the implicit rd or rs1 per the mnemonic. 
*/ + u32 csr; + if (slice_eq_cstr(desc->mnemonic, "csrr")) { + /* csrr rd, csr = csrrs rd, csr, x0 */ + rd = parse_xreg(d); + expect_comma(d); + csr = parse_csr(d); + return enc_i(m, rd, 0u, (i32)csr); + } + /* csrw/csrs/csrc csr, rs and csrwi/csrsi/csrci csr, uimm: + * destination is x0, csr comes first. */ + csr = parse_csr(d); + expect_comma(d); + if (slice_eq_cstr(desc->mnemonic, "csrwi") || + slice_eq_cstr(desc->mnemonic, "csrsi") || + slice_eq_cstr(desc->mnemonic, "csrci")) { + u32 uimm = (u32)asm_driver_parse_const(d) & 0x1fu; + return enc_i(m, 0u, uimm, (i32)csr); + } + rs1 = parse_xreg(d); + return enc_i(m, 0u, rs1, (i32)csr); } case RV64_FMT_CR: diff --git a/src/arch/riscv/isa.c b/src/arch/riscv/isa.c @@ -336,6 +336,25 @@ const Rv64InsnDesc rv64_insn_table[] = { {MN("csrrsi"), MATCH_CSR(0x6), MASK_CSR, RV64_FMT_CSRI, 0, 0, {0}}, {MN("csrrci"), MATCH_CSR(0x7), MASK_CSR, RV64_FMT_CSRI, 0, 0, {0}}, + /* ---- 2-operand CSR pseudo-instructions (assembler-only) ---- + * csrr rd, csr = csrrs rd, csr, x0 + * csrw csr, rs = csrrw x0, csr, rs + * csrs csr, rs = csrrs x0, csr, rs + * csrc csr, rs = csrrc x0, csr, rs + * csrwi csr, uimm = csrrwi x0, csr, uimm + * csrsi csr, uimm = csrrsi x0, csr, uimm + * csrci csr, uimm = csrrci x0, csr, uimm + * The match word carries funct3+opcode so rv_emit_csr_pseudo can build the + * I-type directly; the parse shape is selected on the mnemonic. These never + * reach the disassembler (the full-form rows above own the decode side). 
*/ + {MN("csrr"), MATCH_CSR(0x2), MASK_CSR, RV64_FMT_CSR_PSEUDO, 0, 0, {0}}, + {MN("csrw"), MATCH_CSR(0x1), MASK_CSR, RV64_FMT_CSR_PSEUDO, 0, 0, {0}}, + {MN("csrs"), MATCH_CSR(0x2), MASK_CSR, RV64_FMT_CSR_PSEUDO, 0, 0, {0}}, + {MN("csrc"), MATCH_CSR(0x3), MASK_CSR, RV64_FMT_CSR_PSEUDO, 0, 0, {0}}, + {MN("csrwi"), MATCH_CSR(0x5), MASK_CSR, RV64_FMT_CSR_PSEUDO, 0, 0, {0}}, + {MN("csrsi"), MATCH_CSR(0x6), MASK_CSR, RV64_FMT_CSR_PSEUDO, 0, 0, {0}}, + {MN("csrci"), MATCH_CSR(0x7), MASK_CSR, RV64_FMT_CSR_PSEUDO, 0, 0, {0}}, + /* ================================================================= * RV64M (multiply / divide) — funct7 = 0x01 * ================================================================= */ @@ -1095,6 +1114,40 @@ const Rv64InsnDesc rv64_insn_table[] = { const u32 rv64_insn_table_n = (u32)(sizeof rv64_insn_table / sizeof rv64_insn_table[0]); +/* ---- Standard CSR name -> number table (shared by RV32 and RV64) ---- + * Used by the assembler to accept symbolic CSR operands (a bare number is + * still accepted) and by the disassembler to print CSRs symbolically. The + * table round-trips: each number maps to exactly one canonical name. */ +const Rv64CsrName rv64_csr_names[] = { + {"fflags", 0x001}, {"frm", 0x002}, {"fcsr", 0x003}, + {"cycle", 0xC00}, {"time", 0xC01}, {"instret", 0xC02}, + {"mstatus", 0x300}, {"misa", 0x301}, {"mie", 0x304}, + {"mtvec", 0x305}, {"mscratch", 0x340}, {"mepc", 0x341}, + {"mcause", 0x342}, {"mtval", 0x343}, {"mip", 0x344}, + {"mvendorid", 0xF11}, {"marchid", 0xF12}, {"mimpid", 0xF13}, + {"mhartid", 0xF14}, +}; +const u32 rv64_csr_names_n = + (u32)(sizeof rv64_csr_names / sizeof rv64_csr_names[0]); + +/* Look up a CSR by name. Returns 1 and writes *num_out on a hit, else 0. 
*/ +int rv64_csr_num_from_name(Slice name, u16* num_out) { + for (u32 i = 0; i < rv64_csr_names_n; ++i) { + if (slice_eq_cstr(name, rv64_csr_names[i].name)) { + if (num_out) *num_out = rv64_csr_names[i].num; + return 1; + } + } + return 0; +} + +/* Reverse lookup: canonical name for a CSR number, or NULL if unknown. */ +const char* rv64_csr_name_from_num(u16 num) { + for (u32 i = 0; i < rv64_csr_names_n; ++i) + if (rv64_csr_names[i].num == num) return rv64_csr_names[i].name; + return NULL; +} + /* A row is available for `av_wanted` when its av column is 0 (BOTH) or * its av mask intersects the wanted arch. */ static bool rv_av_ok(u8 av, u8 av_wanted) { @@ -1658,6 +1711,10 @@ static void print_fence(StrBuf* sb, u32 w) { (void)order_chars; } +/* CSRs are disassembled numerically (as hex) so the disasm golden files + * round-trip through the assembler's numeric CSR parser. The assembler also + * accepts symbolic CSR names on input (see parse_csr / rv64_csr_names), but + * the printer stays numeric to keep a single canonical disassembly form. */ static void print_csr(StrBuf* sb, u32 w) { Rv64I f = rv64_i_unpack(w); p_xreg(sb, f.rd); @@ -2075,6 +2132,11 @@ void rv64_print_operands(StrBuf* sb, const Rv64InsnDesc* desc, u32 word, case RV64_FMT_CSRI: print_csri(sb, word); break; + case RV64_FMT_CSR_PSEUDO: + /* Encode-only alias rows; the disassembler always matches the canonical + * full-form csrr* row first, so this is never reached for real bytes. */ + print_csr(sb, word); + break; case RV64_FMT_CR: print_cr(sb, word, desc); break; diff --git a/src/arch/riscv/isa.h b/src/arch/riscv/isa.h @@ -617,6 +617,12 @@ typedef enum Rv64Format { * descriptor's `match` is unused; the assembler dispatches on mnemonic * and emits the AUIPC+JALR / AUIPC+ADDI expansion directly. */ RV64_FMT_PSEUDO, + /* Assembler-only 2-operand CSR pseudo-instructions. 
The descriptor's `match` + * pins funct3+opcode (like the full-form csrr* rows); the assembler + * dispatches on the mnemonic to decide the operand shape (which of rd/rs1 is + * the implicit x0) and which operands are present. Reg form: csrr/csrw/csrs/ + * csrc; imm form: csrwi/csrsi/csrci. */ + RV64_FMT_CSR_PSEUDO, } Rv64Format; typedef enum Rv64DecodedOpcode { @@ -774,6 +780,18 @@ typedef struct Rv64InsnDesc { extern const Rv64InsnDesc rv64_insn_table[]; extern const u32 rv64_insn_table_n; +/* Standard CSR name <-> number table (shared by RV32 and RV64). */ +typedef struct Rv64CsrName { + const char* name; + u16 num; +} Rv64CsrName; +extern const Rv64CsrName rv64_csr_names[]; +extern const u32 rv64_csr_names_n; +/* name -> number: returns 1 + writes *num_out on a hit, else 0. */ +int rv64_csr_num_from_name(Slice name, u16* num_out); +/* number -> canonical name, or NULL if not in the table. */ +const char* rv64_csr_name_from_num(u16 num); + /* Linear-scan lookup. Returns the matching descriptor or NULL. First * match wins; ordering puts more-specific entries (aliases, fixed-Rd * forms) before broader ones. `av_wanted` is the RV_AV_* mask of the diff --git a/src/cg/arith.c b/src/cg/arith.c @@ -619,6 +619,255 @@ static void api_wide64_cmp_inline(KitCg* g, CmpOp cop) { api_push(g, api_make_sv(res, i32)); } +/* ============================================================ + * wide64 __builtin_*_overflow on rv32 (inline, 2-lane) + * + * The native backends only model single-register overflow, so a 64-bit + * operand traps there. Here we legalize the 6 overflow intrinsics for a + * 64-bit (rv32 i64) operand pair into 32-bit lane ops, computing both the + * 64-bit wrapped value (stored to a fresh 8-byte temp) and the boolean + * overflow flag, then pushing [value, ok] exactly as the native path does. 
+ * add/sub reuse the carry/borrow lane logic; mul builds the full 128-bit + * product from 32x32->64 partials (no MULHU opcode exists, so each partial + * is itself synthesized from 16-bit halves). + * ============================================================ */ + +/* Unsigned 32x32 -> 64 product of i32 lanes a,b, returned as (*plo,*phi) i32 + * via the 16-bit-halves schoolbook method (the target has no high-multiply + * opcode, and a plain BO_IMUL only yields the low 32 bits). + * + * a = ah*2^16 + al, b = bh*2^16 + bl + * a*b = ah*bh*2^32 + (ah*bl + al*bh)*2^16 + al*bl + */ +static void wide8_umul32(KitCg* g, Operand a, Operand b, Operand* plo, + Operand* phi) { + KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32); + Operand mask = api_op_imm(0xffff, i32); + Operand sh16 = api_op_imm(16, i32); + Operand al = wide8_i32_binop(g, BO_AND, a, mask); + Operand ah = wide8_i32_binop(g, BO_SHR_U, a, sh16); + Operand bl = wide8_i32_binop(g, BO_AND, b, mask); + Operand bh = wide8_i32_binop(g, BO_SHR_U, b, sh16); + Operand ll = wide8_i32_binop(g, BO_IMUL, al, bl); /* bits 0..31 (<=32 bits) */ + Operand lh = wide8_i32_binop(g, BO_IMUL, al, bh); /* bits 16..47 */ + Operand hl = wide8_i32_binop(g, BO_IMUL, ah, bl); /* bits 16..47 */ + Operand hh = wide8_i32_binop(g, BO_IMUL, ah, bh); /* bits 32..63 */ + /* mid = lh + hl + (ll >> 16); a 33-bit sum -> track carry into bit 32. */ + Operand ll_hi = wide8_i32_binop(g, BO_SHR_U, ll, sh16); + Operand mid = wide8_i32_binop(g, BO_IADD, lh, hl); + /* carry out of (lh+hl) into bit 48 (i.e. +2^32 in the high word). 
*/ + Operand c0 = wide8_i32_cmp(g, CMP_LT_U, mid, lh); + Operand mid2 = wide8_i32_binop(g, BO_IADD, mid, ll_hi); + Operand c1 = wide8_i32_cmp(g, CMP_LT_U, mid2, mid); + Operand carry32 = wide8_i32_binop(g, BO_IADD, c0, c1); /* into high word */ + /* lo = (mid2 << 16) | (ll & 0xffff) */ + Operand mid2_lo = wide8_i32_binop(g, BO_AND, mid2, mask); + Operand mid2_loshift = wide8_i32_binop(g, BO_SHL, mid2_lo, sh16); + Operand ll_lo = wide8_i32_binop(g, BO_AND, ll, mask); + *plo = wide8_i32_binop(g, BO_OR, mid2_loshift, ll_lo); + /* hi = hh + (mid2 >> 16) + carry32*2^16 */ + Operand mid2_hi = wide8_i32_binop(g, BO_SHR_U, mid2, sh16); + Operand carry_word = wide8_i32_binop(g, BO_SHL, carry32, sh16); + Operand hi = wide8_i32_binop(g, BO_IADD, hh, mid2_hi); + *phi = wide8_i32_binop(g, BO_IADD, hi, carry_word); +} + +/* Add three i32 columns acc += addend, threading carry: returns the new sum and + * adds the unsigned-wrap carry (0/1) into *carry. */ +static Operand wide8_addc(KitCg* g, Operand acc, Operand addend, + Operand* carry) { + Operand sum = wide8_i32_binop(g, BO_IADD, acc, addend); + Operand c = wide8_i32_cmp(g, CMP_LT_U, sum, acc); + *carry = wide8_i32_binop(g, BO_IADD, *carry, c); + return sum; +} + +/* The 6 __builtin_*_overflow intrinsics for a wide64 (rv32 i64) operand pair. + * Pops the two 8-byte args, computes the wrapped 64-bit value into a fresh + * 8-byte temp and the bool overflow flag into an i32, then pushes [value, ok] + * matching the contract of the native overflow path. */ +static void api_wide64_overflow_inline(KitCg* g, KitCgIntrinsic intrin) { + KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32); + KitCgTypeId bool_ty = builtin_id(KIT_CG_BUILTIN_BOOL); + Operand sh31 = api_op_imm(31, i32); + ApiSValue b = api_pop(g); + ApiSValue a = api_pop(g); + KitCgTypeId ty = a.type ? 
a.type : b.type; + int lo = wide8_lo_off(g), hi = wide8_hi_off(g); + Operand aa = api_wide8_addr(g, &a, ty); + Operand ab = api_wide8_addr(g, &b, ty); + Operand alo = api_wide8_load_lane(g, aa, lo); + Operand ahi = api_wide8_load_lane(g, aa, hi); + Operand blo = api_wide8_load_lane(g, ab, lo); + Operand bhi = api_wide8_load_lane(g, ab, hi); + CGLocal res = api_wide8_temp_local(g, ty); + ApiSValue res_lv = api_make_lv(api_op_local(res, ty), ty); + Operand ar = api_lvalue_addr(g, &res_lv, cg_type_ptr_to(g->c, ty)); + Operand rlo; + Operand rhi; + Operand ok; + switch (intrin) { + case KIT_CG_INTRIN_UADD_OVERFLOW: + case KIT_CG_INTRIN_SADD_OVERFLOW: { + Operand carry; + rlo = wide8_i32_binop(g, BO_IADD, alo, blo); + carry = wide8_i32_cmp(g, CMP_LT_U, rlo, alo); + rhi = wide8_i32_binop(g, BO_IADD, ahi, bhi); + /* carry-out of the high lane = (rhi<ahi) before +carry, OR wrap on +carry. + * Compute rhi step by step so we can detect the final carry-out. */ + Operand c_hi0 = wide8_i32_cmp(g, CMP_LT_U, rhi, ahi); + rhi = wide8_i32_binop(g, BO_IADD, rhi, carry); + Operand c_hi1 = wide8_i32_cmp(g, CMP_LT_U, rhi, carry); + if (intrin == KIT_CG_INTRIN_UADD_OVERFLOW) { + /* unsigned: ok = carry-out of the high lane */ + ok = wide8_i32_binop(g, BO_OR, c_hi0, c_hi1); + } else { + /* signed: ok = ((a_hi ^ r_hi) & (b_hi ^ r_hi)) sign bit (bit 31) */ + Operand ar_x = wide8_i32_binop(g, BO_XOR, ahi, rhi); + Operand br_x = wide8_i32_binop(g, BO_XOR, bhi, rhi); + Operand both = wide8_i32_binop(g, BO_AND, ar_x, br_x); + ok = wide8_i32_binop(g, BO_SHR_U, both, sh31); + } + break; + } + case KIT_CG_INTRIN_USUB_OVERFLOW: + case KIT_CG_INTRIN_SSUB_OVERFLOW: { + Operand borrow = wide8_i32_cmp(g, CMP_LT_U, alo, blo); + rlo = wide8_i32_binop(g, BO_ISUB, alo, blo); + Operand t = wide8_i32_binop(g, BO_ISUB, ahi, bhi); + /* high-lane borrow-out: (ahi < bhi) OR (t < borrow after subtracting). 
*/ + Operand b_hi0 = wide8_i32_cmp(g, CMP_LT_U, ahi, bhi); + Operand b_hi1 = wide8_i32_cmp(g, CMP_LT_U, t, borrow); + rhi = wide8_i32_binop(g, BO_ISUB, t, borrow); + if (intrin == KIT_CG_INTRIN_USUB_OVERFLOW) { + ok = wide8_i32_binop(g, BO_OR, b_hi0, b_hi1); + } else { + /* signed: ok = ((a_hi ^ b_hi) & (a_hi ^ r_hi)) sign bit (bit 31) */ + Operand ab_x = wide8_i32_binop(g, BO_XOR, ahi, bhi); + Operand ar_x = wide8_i32_binop(g, BO_XOR, ahi, rhi); + Operand both = wide8_i32_binop(g, BO_AND, ab_x, ar_x); + ok = wide8_i32_binop(g, BO_SHR_U, both, sh31); + } + break; + } + case KIT_CG_INTRIN_UMUL_OVERFLOW: + case KIT_CG_INTRIN_SMUL_OVERFLOW: { + int is_signed = (intrin == KIT_CG_INTRIN_SMUL_OVERFLOW); + /* For signed, compute |a|,|b| as unsigned 64-bit, do the unsigned 128-bit + * product, then apply the result sign. Overflow tests below use the + * unsigned magnitude product plus the expected sign. */ + Operand ua_lo = alo, ua_hi = ahi, ub_lo = blo, ub_hi = bhi; + Operand sgn = (Operand){0}; + if (is_signed) { + /* a_sign = ahi >> 31 (0 or 1 in i32, but as a mask we want -1/0). */ + Operand am = wide8_i32_binop(g, BO_SHR_S, ahi, sh31); /* 0 or -1 */ + Operand bm = wide8_i32_binop(g, BO_SHR_S, bhi, sh31); + /* |a| = (a ^ am) - am (two's-complement abs), lane-wise w/ borrow. 
*/ + Operand axl = wide8_i32_binop(g, BO_XOR, alo, am); + Operand axh = wide8_i32_binop(g, BO_XOR, ahi, am); + Operand brwa = wide8_i32_cmp(g, CMP_LT_U, axl, am); + ua_lo = wide8_i32_binop(g, BO_ISUB, axl, am); + Operand tah = wide8_i32_binop(g, BO_ISUB, axh, am); + ua_hi = wide8_i32_binop(g, BO_ISUB, tah, brwa); + Operand bxl = wide8_i32_binop(g, BO_XOR, blo, bm); + Operand bxh = wide8_i32_binop(g, BO_XOR, bhi, bm); + Operand brwb = wide8_i32_cmp(g, CMP_LT_U, bxl, bm); + ub_lo = wide8_i32_binop(g, BO_ISUB, bxl, bm); + Operand tbh = wide8_i32_binop(g, BO_ISUB, bxh, bm); + ub_hi = wide8_i32_binop(g, BO_ISUB, tbh, brwb); + sgn = wide8_i32_binop(g, BO_XOR, am, bm); /* result sign mask -1/0 */ + } + /* Unsigned 128-bit product of (ua_hi:ua_lo) * (ub_hi:ub_lo). + * P00 = ua_lo*ub_lo -> columns 0,1 + * P01 = ua_lo*ub_hi -> columns 1,2 + * P10 = ua_hi*ub_lo -> columns 1,2 + * P11 = ua_hi*ub_hi -> columns 2,3 */ + Operand p00l, p00h, p01l, p01h, p10l, p10h, p11l, p11h; + wide8_umul32(g, ua_lo, ub_lo, &p00l, &p00h); + wide8_umul32(g, ua_lo, ub_hi, &p01l, &p01h); + wide8_umul32(g, ua_hi, ub_lo, &p10l, &p10h); + wide8_umul32(g, ua_hi, ub_hi, &p11l, &p11h); + Operand zero = api_op_imm(0, i32); + /* column 0 */ + Operand r0 = p00l; + /* column 1 = p00h + p01l + p10l */ + Operand c1 = zero; + Operand r1 = p00h; + r1 = wide8_addc(g, r1, p01l, &c1); + r1 = wide8_addc(g, r1, p10l, &c1); + /* column 2 = p01h + p10h + p11l + c1 */ + Operand c2 = zero; + Operand r2 = p01h; + r2 = wide8_addc(g, r2, p10h, &c2); + r2 = wide8_addc(g, r2, p11l, &c2); + r2 = wide8_addc(g, r2, c1, &c2); + /* column 3 = p11h + c2 */ + Operand r3 = wide8_i32_binop(g, BO_IADD, p11h, c2); + /* low 64 bits = (r1:r0); high 64 bits = (r3:r2). */ + Operand mlo = r0, mhi = r1; + Operand hi_lo = r2, hi_hi = r3; + if (is_signed) { + /* Apply result sign: negate the 128-bit magnitude if sgn==-1. + * negated = (x ^ sgn) - sgn across all 4 words with borrow. 
*/ + Operand w0 = wide8_i32_binop(g, BO_XOR, mlo, sgn); + Operand w1 = wide8_i32_binop(g, BO_XOR, mhi, sgn); + Operand w2 = wide8_i32_binop(g, BO_XOR, hi_lo, sgn); + Operand w3 = wide8_i32_binop(g, BO_XOR, hi_hi, sgn); + Operand bor0 = wide8_i32_cmp(g, CMP_LT_U, w0, sgn); + mlo = wide8_i32_binop(g, BO_ISUB, w0, sgn); + Operand t1 = wide8_i32_binop(g, BO_ISUB, w1, sgn); + Operand bor1a = wide8_i32_cmp(g, CMP_LT_U, w1, sgn); + Operand bor1b = wide8_i32_cmp(g, CMP_LT_U, t1, bor0); + mhi = wide8_i32_binop(g, BO_ISUB, t1, bor0); + Operand bor1 = wide8_i32_binop(g, BO_OR, bor1a, bor1b); + Operand t2 = wide8_i32_binop(g, BO_ISUB, w2, sgn); + Operand bor2a = wide8_i32_cmp(g, CMP_LT_U, w2, sgn); + Operand bor2b = wide8_i32_cmp(g, CMP_LT_U, t2, bor1); + hi_lo = wide8_i32_binop(g, BO_ISUB, t2, bor1); + Operand bor2 = wide8_i32_binop(g, BO_OR, bor2a, bor2b); + Operand t3 = wide8_i32_binop(g, BO_ISUB, w3, sgn); + hi_hi = wide8_i32_binop(g, BO_ISUB, t3, bor2); + } + rlo = mlo; + rhi = mhi; + if (!is_signed) { + /* unsigned overflow: high 64 bits nonzero. */ + Operand t = wide8_i32_binop(g, BO_OR, hi_lo, hi_hi); + ok = wide8_i32_cmp(g, CMP_NE, t, zero); + } else { + /* signed overflow: the 128-bit result is not the sign-extension of its + * low 64 bits. sext = (rhi >> 31) replicated; overflow if + * (hi_lo != sext) | (hi_hi != sext) where sext = arithmetic sign of + * the signed low-64 result (bit 63 = rhi sign). 
*/ + Operand sext = wide8_i32_binop(g, BO_SHR_S, rhi, sh31); /* 0 or -1 */ + Operand d2 = wide8_i32_binop(g, BO_XOR, hi_lo, sext); + Operand d3 = wide8_i32_binop(g, BO_XOR, hi_hi, sext); + Operand d = wide8_i32_binop(g, BO_OR, d2, d3); + ok = wide8_i32_cmp(g, CMP_NE, d, zero); + } + break; + } + default: + compiler_panic(g->c, g->cur_loc, + "KitCg: unsupported wide i64 overflow intrinsic"); + api_release(g, &a); + api_release(g, &b); + return; + } + api_wide8_store_lane(g, ar, lo, rlo); + api_wide8_store_lane(g, ar, hi, rhi); + api_release(g, &a); + api_release(g, &b); + /* Materialize ok as a fresh bool temp so it has a stable home. */ + { + CGLocal okl = api_alloc_temp_local(g, bool_ty); + Operand okd = api_op_local(okl, bool_ty); + g->target->binop(g->target, BO_AND, okd, ok, api_op_imm(1, i32)); + api_push(g, api_make_sv(api_op_local(res, ty), ty)); + api_push(g, api_make_sv(okd, bool_ty)); + } +} + /* int<->i64 conversions on rv32 (sext/zext/trunc/bitcast across the 4<->8 * boundary, and i64->bool). Returns 1 if it handled (and consumed) *v. The * i64<->float conversions are routed to libcalls in kit_cg_*_to_float / @@ -1595,6 +1844,15 @@ void kit_cg_intrinsic(KitCg* g, KitCgIntrinsic intrin, uint32_t nargs, return; } } + /* rv32: __builtin_*_overflow on a 64-bit operand pair traps in the native + * backend (it only models single-register overflow). Legalize all 6 forms + * inline as 2-lane / 4-lane ops, pushing [value, ok] like the native path. + * Gated on both operands being wide64 so 32-bit / non-rv32 are unchanged. */ + if (nargs == 2 && api_intrinsic_is_overflow(intrin) && + api_wide64_stack_top(g, 0) && api_wide64_stack_top(g, 1)) { + api_wide64_overflow_inline(g, intrin); + return; + } T = g->target; h = g->c->ctx->heap; rty = resolve_type(g->c, result_type); diff --git a/src/cg/atomic.c b/src/cg/atomic.c @@ -17,12 +17,34 @@ MemAccess api_mem_for_atomic(KitCg* g, KitCgTypeId val_ty) { return ma; } +/* Native (lock-free) atomic ceiling for the target. 
Most targets — aa64, x64, + * rv64, wasm32 — lower 8-byte (i64-width) atomics lock-free. rv32 has no native + * 64-bit atomic instructions (lr.d/sc.d/amo*.d are RV64-only), so 8-byte + * atomics there must go through the libatomic spinlock shim. The distinguishing + * property is a 4-byte general-purpose register / pointer width that is NOT + * wasm32 (wasm32 has 4-byte pointers but 8-byte atomics, so we test the arch). + * + * NOTE: this predicate is the single source of truth shared with the C + * front-end's __atomic_always_lock_free / __atomic_is_lock_free builtins (they + * route through kit_cg_atomic_is_lock_free below). Keeping it here guarantees + * that when kit cc compiles rt/lib/atomic/atomic_freestanding.c FOR rv32, the + * shim's IS_LOCK_FREE_8 test (__atomic_always_lock_free(8, p)) evaluates false, + * so the shim takes the spinlock path instead of recursing into an illegal + * native 8-byte atomic. */ +static u32 cg_atomic_lock_free_max(KitCompiler* c) { + if (c->target.ptr_size == 4 && c->target.arch != KIT_ARCH_WASM) + return 4u; /* rv32 and other 32-bit non-wasm targets */ + return CG_MAX_ATOMIC_SIZE; +} + int kit_cg_atomic_is_legal(KitCompiler* c, KitCgMemAccess access, KitCgMemOrder order) { KitCgTypeId ty = resolve_type(c, access.type); (void)order; if (!ty) return 0; if (cg_type_is_aggregate(c, ty) || cg_type_is_void(c, ty)) return 0; + /* Still legal up to 8 bytes everywhere: the libcall path makes 8-byte atomics + * available even when they are not lock-free. */ return abi_cg_sizeof(c->abi, access.type) <= CG_MAX_ATOMIC_SIZE; } @@ -31,8 +53,76 @@ int kit_cg_atomic_is_lock_free(KitCompiler* c, KitCgMemAccess access) { if (!ty) return 0; if (cg_type_is_aggregate(c, ty) || cg_type_is_void(c, ty)) return 0; /* Lock-free up to the native atomic width, NOT the pointer width: wasm32 has - * 4-byte pointers but lowers 8-byte (i64) atomics lock-free. 
*/ - return abi_cg_sizeof(c->abi, access.type) <= CG_MAX_ATOMIC_SIZE; + * 4-byte pointers but lowers 8-byte (i64) atomics lock-free, while rv32 does + * not have native 64-bit atomics. */ + return abi_cg_sizeof(c->abi, access.type) <= cg_atomic_lock_free_max(c); +} + +/* True when an atomic access of `val_ty` must be lowered to a libatomic + * (__atomic_*_8) libcall instead of a native instruction sequence. Today this + * is exactly the 8-byte-on-a-4-byte-target case (rv32). */ +static int cg_atomic_needs_libcall(KitCg* g, KitCgTypeId val_ty) { + return abi_cg_sizeof(g->c->abi, val_ty) == 8 && + g->c->target.ptr_size == 4 && g->c->target.arch != KIT_ARCH_WASM; +} + +/* Map a KitCgAtomicOp to the libatomic __atomic_fetch_<op>_8 / __atomic_*_8 + * entry point. XCHG maps to __atomic_exchange_8. */ +static const char* cg_atomic_rmw_libcall_8(KitCgAtomicOp op) { + switch (op) { + case KIT_CG_ATOMIC_XCHG: return "__atomic_exchange_8"; + case KIT_CG_ATOMIC_ADD: return "__atomic_fetch_add_8"; + case KIT_CG_ATOMIC_SUB: return "__atomic_fetch_sub_8"; + case KIT_CG_ATOMIC_AND: return "__atomic_fetch_and_8"; + case KIT_CG_ATOMIC_OR: return "__atomic_fetch_or_8"; + case KIT_CG_ATOMIC_XOR: return "__atomic_fetch_xor_8"; + case KIT_CG_ATOMIC_NAND: return "__atomic_fetch_nand_8"; + } + return NULL; +} + +/* Declare a runtime function symbol with an arbitrary (<=5) param list. Mirrors + * api_runtime_helper (wide.c) but without its 3-param ceiling, which the + * 5-argument __atomic_compare_exchange_8 needs. */ +static KitCgSym cg_atomic_runtime_sym(KitCg* g, const char* name, + KitCgTypeId ret, const KitCgTypeId* params, + u32 nparams) { + KitCgFuncParam ps[5]; + KitCgFuncResult result; + KitCgFuncSig sig; + KitCgDecl decl; + if (nparams > 5) return KIT_CG_SYM_NONE; + memset(ps, 0, sizeof ps); + for (u32 i = 0; i < nparams; ++i) ps[i].type = params[i]; + memset(&sig, 0, sizeof sig); + memset(&result, 0, sizeof result); + result.type = ret; + sig.results = ret ? 
&result : NULL; + sig.nresults = ret ? 1u : 0u; + sig.params = ps; + sig.nparams = nparams; + sig.call_conv = KIT_CG_CC_TARGET_C; + memset(&decl, 0, sizeof decl); + decl.kind = KIT_CG_DECL_FUNC; + decl.linkage_name = kit_cg_c_linkage_name( + (KitCompiler*)g->c, pool_intern_slice(g->c->global, slice_from_cstr(name))); + decl.display_name = decl.linkage_name; + decl.type = kit_cg_type_func((KitCompiler*)g->c, sig); + decl.sym.bind = KIT_SB_GLOBAL; + decl.sym.visibility = KIT_CG_VIS_DEFAULT; + return kit_cg_decl(g, decl); +} + +/* Emit a runtime call: push args[0..nparams) then call. The single (optional) + * result is left on the value stack, matching api_runtime_call_values. */ +static void cg_atomic_runtime_call(KitCg* g, const char* name, KitCgTypeId ret, + const KitCgTypeId* params, u32 nparams, + ApiSValue* args) { + KitCgCallAttrs attrs; + KitCgSym sym = cg_atomic_runtime_sym(g, name, ret, params, nparams); + memset(&attrs, 0, sizeof attrs); + for (u32 i = 0; i < nparams; ++i) api_push(g, args[i]); + api_call_symbol_common(g, sym, nparams, attrs); } void kit_cg_atomic_load(KitCg* g, KitCgMemAccess access, KitCgMemOrder order) { @@ -47,6 +137,18 @@ void kit_cg_atomic_load(KitCg* g, KitCgMemAccess access, KitCgMemOrder order) { val_ty = resolve_type(g->c, access.type); if (!val_ty) val_ty = api_atomic_pointee(g, pty, "KitCg: atomic_load"); api_require_pointer_value(g, "atomic_load pointer", pty); + if (cg_atomic_needs_libcall(g, val_ty)) { + /* u64 __atomic_load_8(const void* ptr, int memorder) */ + KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32); + KitCgTypeId ps[2]; + ApiSValue args[2]; + ps[0] = pty; + ps[1] = i32; + args[0] = ptr; + args[1] = api_make_sv(api_op_imm((i64)order, i32), i32); + cg_atomic_runtime_call(g, "__atomic_load_8", val_ty, ps, 2, args); + return; + } addr = api_force_local(g, &ptr, pty); rr = api_alloc_temp_local(g, val_ty); dst = api_op_local(rr, val_ty); @@ -69,6 +171,20 @@ void kit_cg_atomic_store(KitCg* g, KitCgMemAccess access, 
KitCgMemOrder order) { if (!val_ty) val_ty = api_atomic_pointee(g, pty, "KitCg: atomic_store"); api_require_pointer_value(g, "atomic_store pointer", pty); api_validate_memory_value(g, "atomic_store", val_ty, api_sv_type(&val)); + if (cg_atomic_needs_libcall(g, val_ty)) { + /* void __atomic_store_8(void* ptr, u64 val, int memorder) */ + KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32); + KitCgTypeId ps[3]; + ApiSValue args[3]; + ps[0] = pty; + ps[1] = val_ty; + ps[2] = i32; + args[0] = ptr; + args[1] = val; + args[2] = api_make_sv(api_op_imm((i64)order, i32), i32); + cg_atomic_runtime_call(g, "__atomic_store_8", (KitCgTypeId)0, ps, 3, args); + return; + } addr = api_force_local(g, &ptr, pty); src = api_sv_op_is_local_or_imm(&val) ? val.op : api_force_local(g, &val, val_ty); @@ -93,6 +209,26 @@ void kit_cg_atomic_rmw(KitCg* g, KitCgMemAccess access, KitCgAtomicOp op, if (!val_ty) val_ty = api_atomic_pointee(g, pty, "KitCg: atomic_rmw"); api_require_pointer_value(g, "atomic_rmw pointer", pty); api_validate_memory_value(g, "atomic_rmw", val_ty, api_sv_type(&val)); + if (cg_atomic_needs_libcall(g, val_ty)) { + /* u64 __atomic_{exchange,fetch_*}_8(void* ptr, u64 val, int memorder). + * All return the prior value, matching native atomic_rmw semantics. */ + const char* name = cg_atomic_rmw_libcall_8(op); + KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32); + KitCgTypeId ps[3]; + ApiSValue args[3]; + if (!name) { + compiler_panic(g->c, g->cur_loc, "KitCg: unsupported 8-byte atomic rmw op"); + return; + } + ps[0] = pty; + ps[1] = val_ty; + ps[2] = i32; + args[0] = ptr; + args[1] = val; + args[2] = api_make_sv(api_op_imm((i64)order, i32), i32); + cg_atomic_runtime_call(g, name, val_ty, ps, 3, args); + return; + } addr = api_force_local(g, &ptr, pty); vop = api_sv_op_is_local_or_imm(&val) ? 
val.op : api_force_local(g, &val, val_ty); @@ -126,6 +262,59 @@ void kit_cg_atomic_cmpxchg(KitCg* g, KitCgMemAccess access, api_sv_type(&expected)); api_validate_memory_value(g, "atomic_cmpxchg desired", val_ty, api_sv_type(&desired)); + if (cg_atomic_needs_libcall(g, val_ty)) { + /* bool __atomic_compare_exchange_8(void* ptr, void* expected, u64 desired, + * int succ, int fail). + * libatomic takes `expected` by pointer and updates *expected with the + * observed value on failure. Our ABI is value-in / value-out, so spill the + * expected value to a stack slot, pass its address, then reload the slot to + * obtain `prior`. */ + KitCgTypeId i32 = builtin_id(KIT_CG_BUILTIN_I32); + KitCgTypeId ptr_to_val = cg_type_ptr_to(g->c, val_ty); + KitCgTypeId ps[5]; + ApiSValue args[5]; + Operand exp_slot, exp_addr, exp_src; + CGLocal er, ar, pr2; + bool_ty = builtin_id(KIT_CG_BUILTIN_BOOL); + /* Materialize the expected value into an addressable stack slot. */ + er = api_alloc_temp_local(g, val_ty); + exp_slot = api_op_local(er, val_ty); + exp_src = api_sv_op_is_local_or_imm(&expected) + ? expected.op + : api_force_local(g, &expected, val_ty); + g->target->store(g->target, exp_slot, exp_src, + api_mem_for_lvalue(g, &exp_slot, val_ty)); + ar = api_alloc_temp_local(g, ptr_to_val); + exp_addr = api_op_local(ar, ptr_to_val); + g->target->addr_of(g->target, exp_addr, exp_slot); + ps[0] = pty; + ps[1] = ptr_to_val; + ps[2] = val_ty; + ps[3] = i32; + ps[4] = i32; + args[0] = ptr; + args[1] = api_make_sv(exp_addr, ptr_to_val); + args[2] = desired; + args[3] = api_make_sv(api_op_imm((i64)success, i32), i32); + args[4] = api_make_sv(api_op_imm((i64)failure, i32), i32); + cg_atomic_runtime_call(g, "__atomic_compare_exchange_8", bool_ty, ps, 5, + args); + { + ApiSValue ok_sv = api_pop(g); /* the returned bool */ + ok = ok_sv.op; + } + /* Reload the (possibly updated) expected slot as `prior`. 
*/ + pr2 = api_alloc_temp_local(g, val_ty); + prior = api_op_local(pr2, val_ty); + g->target->load(g->target, prior, exp_slot, + api_mem_for_lvalue(g, &exp_slot, val_ty)); + /* `ptr` and `desired` were pushed as call args and are consumed by the + * call; only `expected` (spilled to a slot, not pushed) is still owned. */ + api_release(g, &expected); + api_push(g, api_make_sv(prior, val_ty)); + api_push(g, api_make_sv(ok, bool_ty)); + return; + } addr = api_force_local(g, &ptr, pty); exp_op = api_sv_op_is_local_or_imm(&expected) ? expected.op diff --git a/src/cg/wide.c b/src/cg/wide.c @@ -196,6 +196,18 @@ Operand api_wide8_addr(KitCg* g, ApiSValue* v, KitCgTypeId ty) { } else { lv = *v; } + /* A delayed value (SV_CMP/SV_ARITH) — e.g. an rv32 i64 produced by `!cmp` + * routed here through api_wide64_cmp_inline — is not yet a place. Materialize + * it first: api_ensure_local lowers it into a memory-resident wide8 temp + * (api_alloc_temp_local forces CG_LOCAL_MEMORY_REQUIRED for an 8-byte scalar), + * which is a real addressable home. Materialization, however, clears + * sv.lvalue (fold.c), so we must set the flag AFTER it runs — otherwise the + * lvalue check in api_lvalue_addr fails ("addr operand is not an lvalue"). + * Doing this before api_lvalue_addr also makes its own api_ensure_local a + * no-op (kind is now SV_OPERAND), so the flag survives. An operand that is + * already a place is left untouched by api_ensure_local and flows through as + * before. */ + if (lv.kind != SV_OPERAND) api_ensure_local(g, &lv); lv.type = ty; lv.op.type = ty; lv.lvalue = 1; diff --git a/src/link/link_script.c b/src/link/link_script.c @@ -12,7 +12,9 @@ * /DISCARD/ : { body } * body items: *(p1 p2 ...), name = expr, . 
= expr * exprs: int literal (dec / 0x), `.`, ident, parens, - * + - * / & | ^ << >>, ALIGN(expr, align) + * + - * / & | ^ << >>, + * ALIGN(align) (1-arg: aligns `.`, GNU form) + * ALIGN(val, align) (2-arg: aligns an explicit expr) * slash-star comments; whitespace insensitive. * * Anything else (MEMORY, PROVIDE, KEEP, AT>, > REGION, OVERLAY, INSERT, @@ -361,13 +363,26 @@ static KitLinkExpr* parse_atom(LSP* p) { if (is_id_start(ch)) { /* either ALIGN(...) or a symbol reference */ if (match_kw(p, "ALIGN")) { + /* Two forms, matching GNU ld: + * ALIGN(align) — align the current location `.` (val defaults + * to dot); the common `. = ALIGN(N)` idiom. + * ALIGN(val, align) — align an explicit expression. */ KitLinkExpr *val, *aln, *e; if (expect_ch(p, '(')) return NULL; val = parse_expr(p); if (!val) return NULL; - if (expect_ch(p, ',')) return NULL; - aln = parse_expr(p); - if (!aln) return NULL; + skip_ws(p); + if (p->pos < p->len && p->src[p->pos] == ',') { + ++p->pos; + aln = parse_expr(p); + if (!aln) return NULL; + } else { + /* 1-arg form: the parsed expr is the alignment; val is `.`. */ + aln = val; + val = lsp_new_expr(p); + if (!val) return NULL; + val->kind = KIT_LE_DOT; + } if (expect_ch(p, ')')) return NULL; e = lsp_new_expr(p); if (!e) return NULL; diff --git a/test/arch/rv32_decode_test.c b/test/arch/rv32_decode_test.c @@ -177,6 +177,64 @@ static void format_slli_5bit(KitCompiler* pub) { arch_insn_formatter_free(fmt); } +/* CSR pseudo-ops expand to the matching full-form csrr* instruction. Confirm + * that the bytes produced by the 2-operand pseudo forms are bit-identical to + * the equivalent full-form encodings (the assembler builds the same I-type via + * enc_i), and that those bytes decode/format as the full-form csrr*. Names from + * the shared CSR table (mstatus=0x300, mcause=0x342, mtvec=0x305, + * mscratch=0x340) are used so this also pins the table's numbers. 
*/ +static void expect_csr_bytes(KitCompiler* pub, u32 word, const char* mnem, + const char* op_needle, u32 pc) { + Compiler* c = (Compiler*)pub; + unsigned char bytes[4]; + KitDecodedInsn insn; + ArchInsnFormatter* fmt; + KitInsn text; + KitStatus st; + + put32(bytes, 0, word); + memset(&insn, 0, sizeof(insn)); + st = arch_decode_one(c, bytes, sizeof(bytes), pc, &insn); + EXPECT(st == KIT_OK, "decode_one(%s) status %d", mnem, (int)st); + EXPECT(insn.nbytes == 4, "%s nbytes = %u", mnem, (unsigned)insn.nbytes); + fmt = arch_insn_formatter_new(c); + if (!fmt) return; + memset(&text, 0, sizeof(text)); + st = arch_format_insn(fmt, &insn, &text); + EXPECT(st == KIT_OK, "format(%s) status %d", mnem, (int)st); + EXPECT(kit_slice_eq_cstr(text.mnemonic, mnem), "%s mnemonic = %.*s", mnem, + KIT_SLICE_ARG(text.mnemonic)); + EXPECT(text.operands.s && strstr(text.operands.s, op_needle), + "%s operands missing '%s': %.*s", mnem, op_needle, + KIT_SLICE_ARG(text.operands)); + arch_insn_formatter_free(fmt); +} + +static void csr_pseudos_match_full_form(KitCompiler* pub) { + /* csrs mstatus, t0 == csrrs x0, 0x300, t0 (the startup-stub case). + * Pin the encoder to the architectural I-type word (csr=0x300 in bits + * 31:20, rs1=x5, funct3=010, rd=x0, opcode=0x73) rather than comparing the + * encoder with itself, which could never fail. */ + EXPECT(rv_csrrs(RV_ZERO, 0x300, RV_T0) == 0x3002a073u, + "csrrs x0, mstatus, t0 encodes as 0x%08x, want 0x3002a073", + (unsigned)rv_csrrs(RV_ZERO, 0x300, RV_T0)); + expect_csr_bytes(pub, rv_csrrs(RV_ZERO, 0x300, RV_T0), "csrrs", "t0", 0x6000); + /* csrr a0, mcause == csrrs a0, 0x342, x0 */ + expect_csr_bytes(pub, rv_csrrs(RV_A0, 0x342, RV_ZERO), "csrrs", "a0", 0x6004); + /* csrw mtvec, a1 == csrrw x0, 0x305, a1 */ + expect_csr_bytes(pub, rv_csrrw(RV_ZERO, 0x305, RV_A1), "csrrw", "a1", 0x6008); + /* csrwi mscratch, 5 == csrrwi x0, 0x340, 5 */ + expect_csr_bytes(pub, rv_csrrwi(RV_ZERO, 0x340, 5), "csrrwi", "5", 0x600c); + + /* Round-trip the CSR-name table: every name maps back to itself. 
*/ + for (u32 i = 0; i < rv64_csr_names_n; ++i) { + u16 num = 0; + EXPECT(rv64_csr_num_from_name( + kit_slice_cstr(rv64_csr_names[i].name), &num) && + num == rv64_csr_names[i].num, + "csr name lookup '%s' -> 0x%x", rv64_csr_names[i].name, + (unsigned)rv64_csr_names[i].num); + EXPECT(rv64_csr_name_from_num(rv64_csr_names[i].num) != NULL, + "csr num 0x%x has no name", (unsigned)rv64_csr_names[i].num); + } +} + int main(void) { KitCompiler* c; kit_unit_init(&g_u); @@ -186,6 +244,7 @@ int main(void) { format_decoded_record(c); format_lw_is_lw(c); format_slli_5bit(c); + csr_pseudos_match_full_form(c); kit_compiler_free(c); kit_unit_summary(&g_u, "rv32_decode_test"); return kit_unit_status(&g_u); diff --git a/test/arch/rv64_decode_test.c b/test/arch/rv64_decode_test.c @@ -116,6 +116,61 @@ static void format_decoded_record(KitCompiler* pub) { arch_insn_formatter_free(fmt); } +/* CSR pseudo-ops expand to the matching full-form csrr* instruction. Confirm + * the bytes the 2-operand pseudo forms build (via enc_i, supplying x0 for the + * dropped operand) are bit-identical to the full-form encodings and decode / + * format as those full forms. CSR numbers come from the shared name table + * (mstatus=0x300, mcause=0x342, mtvec=0x305, mscratch=0x340). 
*/ +static void expect_csr_bytes(KitCompiler* pub, u32 word, const char* mnem, + const char* op_needle, u32 pc) { + Compiler* c = (Compiler*)pub; + unsigned char bytes[4]; + KitDecodedInsn insn; + ArchInsnFormatter* fmt; + KitInsn text; + KitStatus st; + + put32(bytes, 0, word); + memset(&insn, 0, sizeof(insn)); + st = arch_decode_one(c, bytes, sizeof(bytes), pc, &insn); + EXPECT(st == KIT_OK, "decode_one(%s) status %d", mnem, (int)st); + EXPECT(insn.nbytes == 4, "%s nbytes = %u", mnem, (unsigned)insn.nbytes); + fmt = arch_insn_formatter_new(c); + if (!fmt) return; + memset(&text, 0, sizeof(text)); + st = arch_format_insn(fmt, &insn, &text); + EXPECT(st == KIT_OK, "format(%s) status %d", mnem, (int)st); + EXPECT(kit_slice_eq_cstr(text.mnemonic, mnem), "%s mnemonic = %.*s", mnem, + KIT_SLICE_ARG(text.mnemonic)); + EXPECT(text.operands.s && strstr(text.operands.s, op_needle), + "%s operands missing '%s': %.*s", mnem, op_needle, + KIT_SLICE_ARG(text.operands)); + arch_insn_formatter_free(fmt); +} + +static void csr_pseudos_match_full_form(KitCompiler* pub) { + /* csrs mstatus, t0 == csrrs x0, 0x300, t0 */ + expect_csr_bytes(pub, rv_csrrs(RV_ZERO, 0x300, RV_T0), "csrrs", "t0", 0x6000); + /* csrr a0, mcause == csrrs a0, 0x342, x0 */ + expect_csr_bytes(pub, rv_csrrs(RV_A0, 0x342, RV_ZERO), "csrrs", "a0", 0x6004); + /* csrw mtvec, a1 == csrrw x0, 0x305, a1 */ + expect_csr_bytes(pub, rv_csrrw(RV_ZERO, 0x305, RV_A1), "csrrw", "a1", 0x6008); + /* csrwi mscratch, 5 == csrrwi x0, 0x340, 5 */ + expect_csr_bytes(pub, rv_csrrwi(RV_ZERO, 0x340, 5), "csrrwi", "5", 0x600c); + + /* Round-trip the CSR-name table. 
*/ + for (u32 i = 0; i < rv64_csr_names_n; ++i) { + u16 num = 0; + EXPECT(rv64_csr_num_from_name(kit_slice_cstr(rv64_csr_names[i].name), + &num) && + num == rv64_csr_names[i].num, + "csr name lookup '%s' -> 0x%x", rv64_csr_names[i].name, + (unsigned)rv64_csr_names[i].num); + EXPECT(rv64_csr_name_from_num(rv64_csr_names[i].num) != NULL, + "csr num 0x%x has no name", (unsigned)rv64_csr_names[i].num); + } +} + int main(void) { KitCompiler* c; kit_unit_init(&g_u); @@ -123,6 +178,7 @@ int main(void) { decode_addi(c); decode_block_stops_at_ecall(c); format_decoded_record(c); + csr_pseudos_match_full_form(c); kit_compiler_free(c); kit_unit_summary(&g_u, "rv64_decode_test"); return kit_unit_status(&g_u); diff --git a/test/asm/decode/rv32_arith.expected.txt b/test/asm/decode/rv32_arith.expected.txt @@ -0,0 +1,9 @@ +0: add a0, a1, a2 +4: sub t0, t1, t2 +8: and s0, s1, s2 +c: or a0, a1, a2 +10: xor t0, t1, t2 +14: addi a0, a1, 100 +18: andi t0, t1, -1 +1c: ori a3, a4, 7 +20: xori s0, s1, 255 diff --git a/test/asm/decode/rv32_arith.hex b/test/asm/decode/rv32_arith.hex @@ -0,0 +1 @@ +3385c500b302734033f4240133e5c500b3427300138545069372f3ff9366770013c4f40f diff --git a/test/asm/decode/rv32_arith.targets b/test/asm/decode/rv32_arith.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/decode/rv32_branches.expected.txt b/test/asm/decode/rv32_branches.expected.txt @@ -0,0 +1,6 @@ +0: beq a0, a1, #16 +4: bne a2, a3, 0xfffffffffffffffc +8: blt a4, a5, 0x28 +c: bge t0, t1, 0xfffffffffffffffc +10: bltu s0, s1, 0x50 +14: bgeu s2, s3, 0x1c diff --git a/test/asm/decode/rv32_branches.hex b/test/asm/decode/rv32_branches.hex @@ -0,0 +1 @@ +6308b500e31cd6fe6340f702e3d862fe6360940463743901 diff --git a/test/asm/decode/rv32_branches.targets b/test/asm/decode/rv32_branches.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/decode/rv32_fp.expected.txt b/test/asm/decode/rv32_fp.expected.txt @@ -0,0 +1,10 @@ +0: fadd.s fa0, fa1, fa2 +4: fsub.s fa3, fa4, fa5 +8: fmul.s ft0, ft1, ft2 +c: fdiv.s ft3, ft4, 
ft5 +10: fcvt.w.s a0, fa0 +14: fcvt.s.w fa0, a0 +18: fmv.x.w t0, ft0 +1c: fmv.w.x fa0, a0 +20: flw fa0, 0(sp) +24: fsw fa1, 8(sp) diff --git a/test/asm/decode/rv32_fp.hex b/test/asm/decode/rv32_fp.hex @@ -0,0 +1 @@ +53f5c500d376f70853f02010d3715218537505c0537505d0d30200e0530505f0072501002724b100 diff --git a/test/asm/decode/rv32_fp.targets b/test/asm/decode/rv32_fp.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/decode/rv32_jumps.expected.txt b/test/asm/decode/rv32_jumps.expected.txt @@ -0,0 +1,4 @@ +0: jal ra, #32 +4: jalr ra, 0(t0) +8: j 0x18 +c: jalr zero, 4(t1) diff --git a/test/asm/decode/rv32_jumps.hex b/test/asm/decode/rv32_jumps.hex @@ -0,0 +1 @@ +ef000002e78002006f00000167004300 diff --git a/test/asm/decode/rv32_jumps.targets b/test/asm/decode/rv32_jumps.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/decode/rv32_loadstore.expected.txt b/test/asm/decode/rv32_loadstore.expected.txt @@ -0,0 +1,10 @@ +0: lw a0, 0(a1) +4: lw t0, 4(sp) +8: lh a2, 2(a3) +c: lb a4, 1(a5) +10: lhu s0, 8(s1) +14: lbu t1, 3(t2) +18: sw a0, 0(a1) +1c: sw t0, 4(sp) +20: sh a2, 2(a3) +24: sb a4, 1(a5) diff --git a/test/asm/decode/rv32_loadstore.hex b/test/asm/decode/rv32_loadstore.hex @@ -0,0 +1 @@ +03a5050083224100039626000387170003d4840003c3330023a0a500232251002391c600a380e700 diff --git a/test/asm/decode/rv32_loadstore.targets b/test/asm/decode/rv32_loadstore.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/decode/rv32_lui_auipc.expected.txt b/test/asm/decode/rv32_lui_auipc.expected.txt @@ -0,0 +1,4 @@ +0: lui a0, 0x1000 +4: lui t0, 0x1 +8: auipc a1, 0x0 +c: auipc t1, 0x10 diff --git a/test/asm/decode/rv32_lui_auipc.hex b/test/asm/decode/rv32_lui_auipc.hex @@ -0,0 +1 @@ +37050001b71200009705000017030100 diff --git a/test/asm/decode/rv32_lui_auipc.targets b/test/asm/decode/rv32_lui_auipc.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/decode/rv32_muldiv.expected.txt b/test/asm/decode/rv32_muldiv.expected.txt @@ -0,0 +1,8 @@ +0: mul a0, a1, a2 +4: mulh t0, t1, t2 +8: mulhsu s0, 
s1, s2 +c: mulhu a3, a4, a5 +10: div a0, a1, a2 +14: divu t0, t1, t2 +18: rem s0, s1, s2 +1c: remu a3, a4, a5 diff --git a/test/asm/decode/rv32_muldiv.hex b/test/asm/decode/rv32_muldiv.hex @@ -0,0 +1 @@ +3385c502b312730233a42403b336f70233c5c502b352730233e42403b376f702 diff --git a/test/asm/decode/rv32_muldiv.targets b/test/asm/decode/rv32_muldiv.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/decode/rv32_shifts.expected.txt b/test/asm/decode/rv32_shifts.expected.txt @@ -0,0 +1,6 @@ +0: slli a0, a1, 1 +4: srli t0, t1, 5 +8: srai a2, a3, 31 +c: sll a4, a5, s0 +10: srl s1, s2, t1 +14: sra a0, a1, a2 diff --git a/test/asm/decode/rv32_shifts.hex b/test/asm/decode/rv32_shifts.hex @@ -0,0 +1 @@ +139515009352530013d6f64133978700b354690033d5c540 diff --git a/test/asm/decode/rv32_shifts.targets b/test/asm/decode/rv32_shifts.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv32_arith.expected.hex b/test/asm/encode/rv32_arith.expected.hex @@ -0,0 +1 @@ +3385c500b302734033f4240133e5c500b3427300138545069372f3ff9366770013c4f40f diff --git a/test/asm/encode/rv32_arith.s b/test/asm/encode/rv32_arith.s @@ -0,0 +1,10 @@ +.text + add a0, a1, a2 + sub t0, t1, t2 + and s0, s1, s2 + or a0, a1, a2 + xor t0, t1, t2 + addi a0, a1, 100 + andi t0, t1, -1 + ori a3, a4, 7 + xori s0, s1, 255 diff --git a/test/asm/encode/rv32_arith.targets b/test/asm/encode/rv32_arith.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv32_branches.expected.hex b/test/asm/encode/rv32_branches.expected.hex @@ -0,0 +1 @@ +6308b500e31cd6fe6340f702e3d862fe6360940463743901 diff --git a/test/asm/encode/rv32_branches.s b/test/asm/encode/rv32_branches.s @@ -0,0 +1,7 @@ +.text + beq a0, a1, 16 + bne a2, a3, -8 + blt a4, a5, 32 + bge t0, t1, -16 + bltu s0, s1, 64 + bgeu s2, s3, 8 diff --git a/test/asm/encode/rv32_branches.targets b/test/asm/encode/rv32_branches.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv32_fp.expected.hex b/test/asm/encode/rv32_fp.expected.hex @@ -0,0 +1 @@ 
+53f5c500d376f70853f02010d3715218537505c0537505d0d30200e0530505f0072501002724b100 diff --git a/test/asm/encode/rv32_fp.s b/test/asm/encode/rv32_fp.s @@ -0,0 +1,11 @@ +.text + fadd.s fa0, fa1, fa2 + fsub.s fa3, fa4, fa5 + fmul.s ft0, ft1, ft2 + fdiv.s ft3, ft4, ft5 + fcvt.w.s a0, fa0 + fcvt.s.w fa0, a0 + fmv.x.w t0, ft0 + fmv.w.x fa0, a0 + flw fa0, 0(sp) + fsw fa1, 8(sp) diff --git a/test/asm/encode/rv32_fp.targets b/test/asm/encode/rv32_fp.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv32_jumps.expected.hex b/test/asm/encode/rv32_jumps.expected.hex @@ -0,0 +1 @@ +ef000002e78002006f00000167004300 diff --git a/test/asm/encode/rv32_jumps.s b/test/asm/encode/rv32_jumps.s @@ -0,0 +1,5 @@ +.text + jal ra, 32 + jalr ra, 0(t0) + jal zero, 16 + jalr zero, 4(t1) diff --git a/test/asm/encode/rv32_jumps.targets b/test/asm/encode/rv32_jumps.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv32_loadstore.expected.hex b/test/asm/encode/rv32_loadstore.expected.hex @@ -0,0 +1 @@ +03a5050083224100039626000387170003d4840003c3330023a0a500232251002391c600a380e700 diff --git a/test/asm/encode/rv32_loadstore.s b/test/asm/encode/rv32_loadstore.s @@ -0,0 +1,11 @@ +.text + lw a0, 0(a1) + lw t0, 4(sp) + lh a2, 2(a3) + lb a4, 1(a5) + lhu s0, 8(s1) + lbu t1, 3(t2) + sw a0, 0(a1) + sw t0, 4(sp) + sh a2, 2(a3) + sb a4, 1(a5) diff --git a/test/asm/encode/rv32_loadstore.targets b/test/asm/encode/rv32_loadstore.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv32_lui_auipc.expected.hex b/test/asm/encode/rv32_lui_auipc.expected.hex @@ -0,0 +1 @@ +37050001b71200009705000017030100 diff --git a/test/asm/encode/rv32_lui_auipc.s b/test/asm/encode/rv32_lui_auipc.s @@ -0,0 +1,5 @@ +.text + lui a0, 4096 + lui t0, 1 + auipc a1, 0 + auipc t1, 16 diff --git a/test/asm/encode/rv32_lui_auipc.targets b/test/asm/encode/rv32_lui_auipc.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv32_muldiv.expected.hex b/test/asm/encode/rv32_muldiv.expected.hex @@ -0,0 +1 @@ 
+3385c502b312730233a42403b336f70233c5c502b352730233e42403b376f702 diff --git a/test/asm/encode/rv32_muldiv.s b/test/asm/encode/rv32_muldiv.s @@ -0,0 +1,9 @@ +.text + mul a0, a1, a2 + mulh t0, t1, t2 + mulhsu s0, s1, s2 + mulhu a3, a4, a5 + div a0, a1, a2 + divu t0, t1, t2 + rem s0, s1, s2 + remu a3, a4, a5 diff --git a/test/asm/encode/rv32_muldiv.targets b/test/asm/encode/rv32_muldiv.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv32_shifts.expected.hex b/test/asm/encode/rv32_shifts.expected.hex @@ -0,0 +1 @@ +139515009352530013d6f64133978700b354690033d5c540 diff --git a/test/asm/encode/rv32_shifts.s b/test/asm/encode/rv32_shifts.s @@ -0,0 +1,7 @@ +.text + slli a0, a1, 1 + srli t0, t1, 5 + srai a2, a3, 31 + sll a4, a5, s0 + srl s1, s2, t1 + sra a0, a1, a2 diff --git a/test/asm/encode/rv32_shifts.targets b/test/asm/encode/rv32_shifts.targets @@ -0,0 +1 @@ +rv32 diff --git a/test/asm/encode/rv64_fp.expected.hex b/test/asm/encode/rv64_fp.expected.hex @@ -1 +1 @@ -53f5c500d376f70a53f02010d371521a5385c5285394242b53a5c5a0d392e6a2531505c0537525d0d30200e0530505f2072501002734b100 +53f5c500d376f70a53f02010d371521a5385c5285394242b53a5c5a0d392e6a2537505c0537525d0d30200e0530505f2072501002734b100 diff --git a/test/asm/encode/rv64_fp_cvt.expected.hex b/test/asm/encode/rv64_fp_cvt.expected.hex @@ -1 +1 @@ -531505c0d39515c0531626c0d39636c0531707c2d39727c2537505d0d3f515d0537606d2d3f626d253f7174053f8084253f5055853f6065a +537505c0d3f515c0537626c0d3f636c0537707c2d3f727c2537505d0d3f515d0537606d2d3f626d253f7174053f8084253f5055853f6065a diff --git a/test/asm/regen-rv32.sh b/test/asm/regen-rv32.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +# test/asm/regen-rv32.sh — regenerate the rv32_* smoke goldens from +# clang + llvm-objdump targeting riscv32-unknown-elf. Maintainer aid: NOT +# run by CI. Commit the refreshed goldens alongside the case changes. 
+# +# Usage: +# ./regen-rv32.sh regenerate every rv32_* case +# ./regen-rv32.sh <name> regenerate just one rv32_* case (substring) +# +# Detects clang + llvm-objdump (or riscv32-unknown-elf-objdump). Exits 0 +# with a SKIP-style message if either is missing — the script is intended +# to support cross-toolchain regen on machines that don't have a full +# riscv32 cross install. + +set -u + +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +TEST_DIR="$ROOT/test/asm" +FILTER="${1:-}" + +# Use the no-C ISA so encode goldens match the existing 4-byte-per-insn +# fixtures (same no-compressed choice as regen-rv64.sh). ilp32f gives the +# F extension with single-precision float args in FP regs. Per-fixture +# .targets sidecars carry the canonical bytes the in-tree corpus has agreed +# to (asm-runner emits raw 32-bit encodings; turning on the C extension +# would shrink some forms to 16 bits). +CLANG_TARGET="--target=riscv32-unknown-elf -march=rv32imaf -mabi=ilp32f" +OBJDUMP="$(command -v llvm-objdump 2>/dev/null || command -v riscv32-unknown-elf-objdump 2>/dev/null || true)" +CLANG="$(command -v clang 2>/dev/null || true)" + +if [ -z "$OBJDUMP" ] || [ -z "$CLANG" ]; then + printf 'regen-rv32.sh: SKIP — need clang and llvm-objdump (or riscv32-unknown-elf-objdump) on PATH\n' >&2 + exit 0 +fi + +tmp="$(mktemp -d)" +trap 'rm -rf "$tmp"' EXIT + +# regen_encode <case.s>: assemble one rv32_* encode case with clang and rewrite +# its .expected.hex golden from the .text bytes. Non-rv32 and filtered-out +# cases return 0 without doing anything; an assemble failure returns 1. +regen_encode() { + local src="$1" name out_obj out_hex + name="$(basename "$src" .s)" + case "$name" in rv32_*) ;; *) return 0 ;; esac + [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && return 0 + out_obj="$tmp/$name.o" + out_hex="$TEST_DIR/encode/$name.expected.hex" + # Bail out BEFORE touching the golden: with no object file the objdump + # pipeline below would overwrite the committed golden with an empty line. + if ! $CLANG $CLANG_TARGET -c "$src" -o "$out_obj"; then + printf ' skip encode/%s (clang failed to assemble)\n' "$name" >&2 + return 1 + fi + "$OBJDUMP" --full-contents -j .text "$out_obj" \ + | awk '/^Contents of section/ {next} /^$/ {next} + { for (i=2; i<=5; i++) if ($i ~ /^[0-9a-f]+$/) printf "%s", $i; printf "\n" }' \ + | tr -d '\n' \ + | { cat; printf '\n'; } >"$out_hex" + printf ' regen encode/%s\n' "$name" +} + +regen_decode() { + local hexfile="$1" name 
out_txt raw scratch + name="$(basename "$hexfile" .hex)" + case "$name" in rv32_*) ;; *) return 0 ;; esac + [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && return 0 + out_txt="$TEST_DIR/decode/$name.expected.txt" + raw="$tmp/$name.bin" + scratch="$tmp/$name.decode.txt" + xxd -r -p "$hexfile" "$raw" + if ! "$OBJDUMP" -b binary -m riscv -M no-aliases -D "$raw" >"$scratch" 2>"$tmp/$name.decode.err"; then + return 1 + fi + awk '/^[ ]+[0-9a-f]+:/ { + sub(/:/, "", $1); + addr = $1; + mnem = $3; + ops = ""; + for (i=4; i<=NF; i++) ops = (ops=="" ? $i : ops " " $i); + printf "%s:\t%s\t%s\n", addr, mnem, ops; + }' "$scratch" >"$out_txt" + printf ' regen decode/%s\n' "$name" +} + +regen_listing() { + local bin="$1" name out_lst scratch + name="$(basename "$bin" .in.bin)" + case "$name" in rv32_*) ;; *) return 0 ;; esac + [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && return 0 + out_lst="$TEST_DIR/listing/$name.expected.lst" + scratch="$tmp/$name.listing.txt" + if ! "$OBJDUMP" -d -m riscv "$bin" >"$scratch" 2>"$tmp/$name.listing.err"; then + return 1 + fi + awk '/^Disassembly of section/ || /^[0-9a-f]+ </ || /^[ ]+[0-9a-f]+:/ || /^$/' \ + "$scratch" >"$out_lst" + printf ' regen listing/%s\n' "$name" +} + +printf 'Regenerating rv32 goldens...\n' +# encode/ is portable across llvm-objdump versions (uses real .o input). +for src in "$TEST_DIR"/encode/*.s; do [ -e "$src" ] && regen_encode "$src"; done +# decode/ and listing/ pass raw bytes through `-b binary -m riscv` which +# some llvm-objdump builds (notably the macOS Homebrew build) do not +# support. Soft-fail per case so encode regen still completes. 
+for src in "$TEST_DIR"/decode/*.hex; do + [ -e "$src" ] || continue + regen_decode "$src" || printf ' skip decode/%s (objdump rejected raw binary)\n' "$(basename "$src" .hex)" +done +for src in "$TEST_DIR"/listing/*.in.bin; do + [ -e "$src" ] || continue + regen_listing "$src" || printf ' skip listing/%s (objdump rejected raw binary)\n' "$(basename "$src" .in.bin)" +done +printf 'Done.\n' diff --git a/test/asm/run.sh b/test/asm/run.sh @@ -82,6 +82,7 @@ case "$KIT_TEST_ARCH" in aa64|aarch64|arm64) TEST_ARCH=aa64; CLANG_TRIPLE=aarch64-linux-gnu; EXEC_ARCH=aarch64 ;; x64|x86_64|amd64) TEST_ARCH=x64; CLANG_TRIPLE=x86_64-linux-gnu; EXEC_ARCH=x64 ;; rv64|riscv64) TEST_ARCH=rv64; CLANG_TRIPLE=riscv64-linux-gnu; EXEC_ARCH=rv64 ;; + rv32|riscv32) TEST_ARCH=rv32; CLANG_TRIPLE=riscv32-unknown-elf; EXEC_ARCH=rv32 ;; *) printf 'unknown KIT_TEST_ARCH=%s\n' "$KIT_TEST_ARCH" >&2; exit 2 ;; esac export KIT_TEST_ARCH @@ -202,6 +203,7 @@ kit_read_case() { aa64:aa64|aarch64:aa64|arm64:aa64) return 0 ;; x64:x64|x86_64:x64|amd64:x64) return 0 ;; rv64:rv64|riscv64:rv64) return 0 ;; + rv32:rv32|riscv32:rv32) return 0 ;; esac done KIT_SKIP_NA_CASE=1 diff --git a/test/lib/exec_rv32_bare.sh b/test/lib/exec_rv32_bare.sh @@ -15,8 +15,8 @@ # The link uses `kit ld` (a freestanding rv32 target defaults to no-PIE and # auto-links no runtime, so the corpus + the kit runtime archive are supplied # explicitly), exercising the full kit toolchain end to end. The startup stub is -# clang-assembled because kit's inline assembler does not yet emit the `csrs` -# CSR pseudo used to enable the FPU. +# kit-assembled (`kit as`) too — kit's assembler now emits the `csrs` CSR pseudo +# used to enable the FPU — so the whole image is built by kit. 
# # Public API (after sourcing): # rv32_bare_setup <workdir> populate RV32_BARE_OK (0/1) and cache @@ -79,9 +79,49 @@ rv32_bare_setup() { _start: li sp, 0x80100000 li t0, 0x2000 - csrs mstatus, t0 + csrs mstatus, t0 # mstatus.FS = Initial (enable the FPU for ilp32f) + + // ---- static thread-local storage setup ----------------------------------- + // Build the per-thread TLS image [TCB(16) | .tdata | .tbss] in RAM and point + // tp at it. The corpus's TPREL relocs were resolved by kit-ld's scripted path + // (which leaves img->tls_vaddr == 0) to tgt->vaddr + 16, so we must set + // tp = __rv32_tls_block - __rv32_tdata_lma + // making tp + (tgt->vaddr + 16) = __rv32_tls_block + 16 + off — the address + // inside the copy where we place each variable. See the linker-script comment. + la t0, __rv32_tls_block # t0 = block base (TCB at +0) + addi t1, t0, 16 # t1 = dst = block + 16 (.tdata copy start) + la t2, __rv32_tdata_lma # t2 = src = .tdata load image + la t3, __rv32_tdata_size # t3 = .tdata byte count (abs symbol: la yields value) +.Lcopy: # copy .tdata init image, byte at a time + beqz t3, .Lcopy_done + lbu t4, 0(t2) + sb t4, 0(t1) + addi t1, t1, 1 + addi t2, t2, 1 + addi t3, t3, -1 + j .Lcopy +.Lcopy_done: + la t3, __rv32_tbss_size # t3 = .tbss byte count +.Lzero: # zero-fill .tbss (t1 already at end of .tdata copy) + beqz t3, .Lzero_done + sb zero, 0(t1) + addi t1, t1, 1 + addi t3, t3, -1 + j .Lzero +.Lzero_done: + la t2, __rv32_tdata_lma + sub tp, t0, t2 # tp = block - __rv32_tdata_lma + // --------------------------------------------------------------------------- + call _rv32_cmain -1: j 1b +.Lhang: j .Lhang + +// Per-thread TLS image scratch (single-threaded harness). Sized generously; +// the corpus TLS cases use only a handful of bytes. +.section .bss.rv32tls,"aw",@nobits +.balign 16 +__rv32_tls_block: + .zero 4096 EOF # The wrapper calls the corpus's main() and maps its return onto the SiFive # test finisher at 0x100000. 
Compiled by kit (exercises rv32 codegen for the @@ -96,6 +136,29 @@ __attribute__((noreturn)) void _rv32_cmain(void) { for (;;) {} } EOF + # The TLS output sections below give the bare stub a static-TLS image to + # seed from. kit-ld's *scripted* layout does NOT populate img->tls_vaddr (that + # only happens in the bucketed, scriptless path), so the linker's own + # __tdata_start/__tbss_size boundary symbols and PT_TLS are emitted as zero + # here — and the corpus's TPREL relocs resolve to (tgt->vaddr - 0) + 16 = + # tgt->vaddr + 16, i.e. the *placed vaddr* of the variable plus the 16-byte TCB + # bias (src/obj/elf/link.c:475). We therefore can't lean on the linker's TLS + # symbols; instead we define our own (__rv32_tdata_lma/_tdata_size/_tbss_size) + # that the linker never clobbers, and the stub computes the thread pointer so + # that tp + (tgt->vaddr + 16) lands inside a live RAM copy of the image. + # + # .tdata (PROGBITS, loaded at its vaddr) supplies the init image; .tbss + # (NOBITS) only contributes a size. They are placed contiguously so a single + # tp bias works for both: with the copy laid out as [TCB(16) | .tdata | .tbss] + # and .tdata copied to block+16, set tp = block - __rv32_tdata_lma. Then for + # any var v: tp + (v.vaddr + 16) = (block - tdata_lma) + (tdata_lma + off) + 16 + # = block + 16 + off — exactly where we copied/zeroed v. The + # linker's +16 and our +16 TCB reservation cancel, mirroring start.c's + # [TCB | tdata | tbss] convention (test/link/harness/start.c:146-149). + # + # __x = . assignments inside a section body are all applied *before* that + # section's inputs (link_layout.c:775), so the post-.tdata dot is captured in a + # following input-less marker section; sizes are then derived at top level. cat > "$RV32_BARE_LDS" <<'EOF' ENTRY(_start) SECTIONS { @@ -103,12 +166,18 @@ SECTIONS { .text : { *(.text.start) *(.text*) } .rodata : { *(.rodata*) } .data : { *(.data*) } + .tdata : { . 
= ALIGN(16); __rv32_tdata_lma = .; *(.tdata .tdata.*) } + .tdata_end : { __rv32_tdata_end = .; } + .tbss : { __rv32_tbss_start = .; *(.tbss .tbss.*) } + .tbss_end : { __rv32_tbss_end = .; } .bss : { *(.bss*) *(COMMON) } + __rv32_tdata_size = __rv32_tdata_end - __rv32_tdata_lma; + __rv32_tbss_size = __rv32_tbss_end - __rv32_tbss_start; /DISCARD/ : { *(.riscv.attributes) *(.comment) } } EOF - if ! clang --target=riscv32-unknown-elf -march=rv32imafc -mabi="$RV32_BARE_MABI" \ - -ffreestanding -nostdlib -c "$work/_rv32_start.S" -o "$RV32_BARE_START" \ + if ! "$RV32_BARE_KIT" as -target riscv32-none-elf -march="$RV32_BARE_MARCH" \ + -mabi="$RV32_BARE_MABI" -o "$RV32_BARE_START" "$work/_rv32_start.S" \ >/dev/null 2>&1; then return 0 fi diff --git a/test/parse/cases/asm_01_grammar.rv32.skip b/test/parse/cases/asm_01_grammar.rv32.skip @@ -0,0 +1 @@ +asm_01_grammar template uses aa64-specific mnemonics; rv32 inline-asm coverage lives elsewhere diff --git a/test/parse/cases/i128_01_layout.rv32.skip b/test/parse/cases/i128_01_layout.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_02_literal_storage.rv32.skip b/test/parse/cases/i128_02_literal_storage.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_03_add_sub_carry.rv32.skip b/test/parse/cases/i128_03_add_sub_carry.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_04_mul_high_half.rv32.skip b/test/parse/cases/i128_04_mul_high_half.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_05_div_mod.rv32.skip b/test/parse/cases/i128_05_div_mod.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit 
target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_06_shifts_bitwise.rv32.skip b/test/parse/cases/i128_06_shifts_bitwise.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_07_compare.rv32.skip b/test/parse/cases/i128_07_compare.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_08_signed_shift_convert.rv32.skip b/test/parse/cases/i128_08_signed_shift_convert.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_09_call_return.rv32.skip b/test/parse/cases/i128_09_call_return.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_10_struct_storage.rv32.skip b/test/parse/cases/i128_10_struct_storage.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_11_union_lanes.rv32.skip b/test/parse/cases/i128_11_union_lanes.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_12_global_init.rv32.skip b/test/parse/cases/i128_12_global_init.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_13_signed_div_mod.rv32.skip b/test/parse/cases/i128_13_signed_div_mod.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/parse/cases/i128_14_arbitrary_mul.rv32.skip b/test/parse/cases/i128_14_arbitrary_mul.rv32.skip @@ -0,0 +1 @@ +__int128 is not supported on 
rv32 (32-bit target); the parser now rejects it with a target gate diff --git a/test/smoke/rv32.sh b/test/smoke/rv32.sh @@ -18,10 +18,11 @@ # double cases link kit's freestanding runtime (libkit_rt.a) for the helpers; # the inline i64 cases need no runtime. # -# kit compiles app.c (the code under test); the startup stub is clang-assembled -# and the final bare-metal link uses ld.lld (kit's static ELF base-addr control -# for `virt` is exercised separately). The .eh_frame kit emits is discarded by -# the bare-metal link script (no unwinder in a freestanding image). +# kit compiles app.c (the code under test) and assembles the startup stub with +# `kit as` (kit's assembler now emits the `csrs` CSR pseudo that enables the +# FPU); the final bare-metal link uses ld.lld (kit's static ELF base-addr +# control for `virt` is exercised separately). The .eh_frame kit emits is +# discarded by the bare-metal link script (no unwinder in a freestanding image). # # Skipped (per the shared kit_exit convention) if clang lacks the riscv32 # target, ld.lld is missing, or qemu-system-riscv32 is unavailable. @@ -64,7 +65,7 @@ _start: li t0, 0x2000 /* mstatus.FS = Initial: enable the FPU */ csrs mstatus, t0 call cmain -1: j 1b +.Lhang: j .Lhang EOF cat > "$BUILD_DIR/start_soft.S" <<'EOF' .section .text.start,"ax",@progbits @@ -72,7 +73,7 @@ cat > "$BUILD_DIR/start_soft.S" <<'EOF' _start: li sp, 0x80100000 call cmain -1: j 1b +.Lhang: j .Lhang EOF cat > "$BUILD_DIR/link.ld" <<'EOF' ENTRY(_start) @@ -155,13 +156,11 @@ make -C "$ROOT" rt-riscv32-elf-hardfloat rt-riscv32-elf >/dev/null 2>&1 || true run_lane() { # <name> <march> <mabi> <startsrc> <rtarchive> <cppdef> local name="$1" march="$2" mabi="$3" startsrc="$4" rt="$5" def="$6" - local clang_mabi="$mabi" clang_march - clang_march="${march%%_*}" # clang wants e.g. rv32imafc (no _zicsr_zifencei) if [ ! 
-f "$rt" ]; then skip_test "$name" "runtime archive $rt missing"; return; fi local so="$BUILD_DIR/$name.start.o" - clang --target=riscv32-unknown-elf -march="$clang_march" -mabi="$clang_mabi" \ - -ffreestanding -nostdlib -c "$startsrc" -o "$so" 2>/dev/null + "$KIT" as -target riscv32-none-elf -march="$march" -mabi="$mabi" \ + -o "$so" "$startsrc" 2>/dev/null local O for O in -O0 -O1; do local o="$BUILD_DIR/$name$O.o" elf="$BUILD_DIR/$name$O.elf" diff --git a/test/toy/cases/124_atomic_word_ops.expected b/test/toy/cases/124_atomic_word_ops.expected @@ -0,0 +1 @@ +42 diff --git a/test/toy/cases/124_atomic_word_ops.toy b/test/toy/cases/124_atomic_word_ops.toy @@ -0,0 +1,17 @@ +// Word-sized (4-byte) atomics. 4-byte atomics are lock-free on every target +// (including rv32, whose lr.w/sc.w/amo*.w are native), so this case runs on all +// lanes and guards the universal word-atomic lowering. Exercises store, rmw add, +// rmw xchg, cmpxchg, and load; returns the sentinel 42. +fn __user_main(): i32 { + var x: i32 = 0 as i32; + @atomic_store<i32>(&x, 40 as i32, .seq_cst); + let old: i32 = @atomic_rmw<i32>(.add, &x, 2 as i32, .seq_cst); // old=40, x=42 + let prev: i32 = @atomic_rmw<i32>(.xchg, &x, 100 as i32, .seq_cst); // prev=42, x=100 + let r = @atomic_cmpxchg<i32>(&x, 100 as i32, 42 as i32, .seq_cst, .seq_cst, .strong); // x=42 + if old != 40 as i32 { return 1 as i32; } + if prev != 42 as i32 { return 2 as i32; } + if !r.ok { return 3 as i32; } + return @atomic_load<i32>(&x, .seq_cst); // 42 +} + +fn main(): i32 { return __user_main(); } diff --git a/test/toy/cases/145_baremetal_privileged_aa64.rv32.skip b/test/toy/cases/145_baremetal_privileged_aa64.rv32.skip @@ -0,0 +1 @@ +aa64-only privileged CPU/barrier/IRQ intrinsics have no rv32 lowering diff --git a/test/toy/cases/20_cg_api_inline_asm_full.rv32.skip b/test/toy/cases/20_cg_api_inline_asm_full.rv32.skip @@ -0,0 +1 @@ +inline-asm fixture uses target-specific mnemonics not applicable to rv32 diff --git 
a/test/toy/cases/47_target_arch_switch.rv32.skip b/test/toy/cases/47_target_arch_switch.rv32.skip @@ -0,0 +1 @@ +target-arch switch fixture asserts host/non-rv32 arch selection; not applicable to rv32