kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit bbe0c3e30b210857cf082638b7cc4ecfa2b3e022
parent 28e75424c3c6c828f561cb5a6f216235f1dd5ad7
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 19 May 2026 09:30:02 -0700

Complete i128 and binary128 long double support

Diffstat:
M doc/C11_LONG_DOUBLE_CHECKLIST.md | 64 +++++++++++++++++++++++++++++++++++++++-------------------------
M lang/c/parse/cg_adapter.c | 3 ++-
M lang/c/parse/parse_expr.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M lang/c/parse/parse_init.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M lang/c/parse/parse_priv.h | 4 +++-
M lang/c/parse/parse_type.c | 34 ++++++++++++++++++++++++++++++++--
M rt/lib/README.md | 2 +-
M rt/lib/fp_tf/fp_tf.c | 357 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M rt/lib/int64/int64.c | 408 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M src/abi/abi_aapcs64.c | 17 +++++++++++++++++
M src/abi/abi_rv64.c | 3 ++-
M src/api/cg.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M src/arch/aa64/ops.c | 12 +++++++++++-
M src/arch/rv64/alloc.c | 5 +++--
M src/arch/rv64/emit.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M src/arch/rv64/internal.h | 4 ++--
M src/arch/rv64/ops.c | 152 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
D test/parse/cases/6_7_2_12_long_double.skip | 1 -
M test/parse/cases/i128_06_shifts_bitwise.c | 4 ++++
A test/parse/cases/i128_13_signed_div_mod.c | 31 +++++++++++++++++++++++++++++++
A test/parse/cases/i128_13_signed_div_mod.expected | 1 +
A test/parse/cases/i128_14_arbitrary_mul.c | 22 ++++++++++++++++++++++
A test/parse/cases/i128_14_arbitrary_mul.expected | 1 +
A test/parse/cases/ldbl128_15_arbitrary_mul.c | 15 +++++++++++++++
A test/parse/cases/ldbl128_15_arbitrary_mul.expected | 1 +
25 files changed, 1493 insertions(+), 395 deletions(-)

diff --git a/doc/C11_LONG_DOUBLE_CHECKLIST.md b/doc/C11_LONG_DOUBLE_CHECKLIST.md @@ -8,11 +8,11 @@ implementation pass it on the target that owns that format. ## Target profiles -- [ ] AArch64 Linux: IEEE binary128 `long double`. +- [x] AArch64 Linux: IEEE binary128 `long double`. ABI: passed and returned in SIMD/FP `q` registers when register slots are available. Arithmetic and conversions lower to compiler-rt `*tf*` helpers. -- [ ] RV64 Linux LP64D: IEEE binary128 `long double`. +- [x] RV64 Linux LP64D: IEEE binary128 `long double`. ABI: passed and returned as two integer XLEN eightbytes because FLEN is 64. Arithmetic and conversions lower to compiler-rt `*tf*` helpers. - [ ] AArch64 Darwin: `long double == double`. @@ -24,30 +24,30 @@ implementation pass it on the target that owns that format. ## Support target for the binary128 slice -- [ ] Complete the 16-byte scalar `__int128` path before treating binary128 as +- [x] Complete the 16-byte scalar `__int128` path before treating binary128 as green: layout, locals/globals, constants, arithmetic, shifts, compares, calls/returns, aggregate fields, unions, and static initialization. -- [ ] Add a target long-double profile query used by both the frontend and CG: +- [x] Add a target long-double profile query used by both the frontend and CG: format, storage size, alignment, macro values, and ABI classification. -- [ ] Add a distinct CG type for binary128 `long double`; `TY_LDOUBLE` must not +- [x] Add a distinct CG type for binary128 `long double`; `TY_LDOUBLE` must not map to `F64` on AArch64/RV64 Linux. -- [ ] Emit target-correct `__LDBL_*` and `__DECIMAL_DIG__` predefined macros +- [x] Emit target-correct `__LDBL_*` and `__DECIMAL_DIG__` predefined macros for binary128 targets. -- [ ] Encode `L` floating constants as binary128 bytes without narrowing their +- [x] Encode `L` floating constants as binary128 bytes without narrowing their storage type to `double`. 
-- [ ] Support binary128 local/global storage, assignment, struct fields, and +- [x] Support binary128 local/global storage, assignment, struct fields, and return values. -- [ ] Lower binary128 arithmetic to runtime helpers: +- [x] Lower binary128 arithmetic to runtime helpers: `__addtf3`, `__subtf3`, `__multf3`, and `__divtf3`. -- [ ] Lower binary128 comparisons through compiler-rt compare helpers. -- [ ] Lower integer, float, and double conversions through compiler-rt helpers: +- [x] Lower binary128 comparisons through compiler-rt compare helpers. +- [x] Lower integer, float, and double conversions through compiler-rt helpers: `__float*tf`, `__fix*tf*`, `__extend{s,d}ftf2`, and `__trunctf{s,d}f2`. -- [ ] Teach AArch64 codegen to move 16-byte FP values through Q-register +- [x] Teach AArch64 codegen to move 16-byte FP values through Q-register load/store/copy paths. -- [ ] Teach RV64 ABI movement to pass/return binary128 values as two integer +- [x] Teach RV64 ABI movement to pass/return binary128 values as two integer parts, backed by memory in CG. -- [ ] Keep runtime linkage using the existing `rt/lib/fp_tf/fp_tf.c` and +- [x] Keep runtime linkage using the existing `rt/lib/fp_tf/fp_tf.c` and `rt/lib/fp_ti/fp_ti.c` objects for the binary128 runtime variants. ## Red tests @@ -69,28 +69,42 @@ x87 work can land later without hiding the binary128 regression signal. Coverage intent: -- `i128_01` through `i128_12`: target layout/alignment, literal storage, +- `i128_01` through `i128_14`: target layout/alignment, literal storage, add/sub carry, multiply high-half behavior, div/mod, shifts/bitwise operations, signed and unsigned compares, signed shifts/conversions, calls/returns, aggregate fields, union lane visibility, and global - initialization. -- `ldbl128_01` through `ldbl128_14`: target macros/layout, literal decoding, + initialization, arbitrary signed div/mod, and arbitrary signed/unsigned + multiplication. 
+- `ldbl128_01` through `ldbl128_15`: target macros/layout, literal decoding, arithmetic helpers, conversions, comparisons, calls/returns, struct and array storage, raw binary128 bits, globals, unary negation, stack - arguments, mixed arithmetic, and aggregate return. + arguments, mixed arithmetic, aggregate return, and arbitrary binary128 + multiplication. + +Known remaining limits: + +- The binary128 support target is Linux AArch64/RV64. Darwin `long double` + target rules and x87 80-bit `long double` are still separate follow-up + targets. +- Decimal `L` literal coverage currently exercises representable values and + raw canonical encodings; it does not yet prove full decimal-to-binary128 + precision for non-representable literals. +- ABI aggregate classification still covers the implemented scalar and simple + aggregate paths, not the full AArch64 HFA/HVA or every RV64 aggregate + flattening edge. ## Done criteria -- [ ] `CFREE_TEST_ARCH=aa64 CFREE_TEST_FILTER=ldbl128 make test-parse` passes +- [x] `CFREE_TEST_ARCH=aa64 CFREE_TEST_FILTER=ldbl128 make test-parse` passes with `CFREE_TEST_ALLOW_SKIP` unset. -- [ ] `CFREE_TEST_ARCH=rv64 CFREE_TEST_FILTER=ldbl128 make test-parse` passes +- [x] `CFREE_TEST_ARCH=rv64 CFREE_TEST_FILTER=ldbl128 make test-parse` passes with `CFREE_TEST_ALLOW_SKIP` unset. -- [ ] `CFREE_TEST_ARCH=aa64 CFREE_TEST_FILTER=i128 make test-parse` passes +- [x] `CFREE_TEST_ARCH=aa64 CFREE_TEST_FILTER=i128 make test-parse` passes with `CFREE_TEST_ALLOW_SKIP` unset. -- [ ] `CFREE_TEST_ARCH=rv64 CFREE_TEST_FILTER=i128 make test-parse` passes +- [x] `CFREE_TEST_ARCH=rv64 CFREE_TEST_FILTER=i128 make test-parse` passes with `CFREE_TEST_ALLOW_SKIP` unset. -- [ ] `CFREE_TEST_FILTER=6_7_2_12_long_double make test-parse` passes on +- [x] `CFREE_TEST_FILTER=6_7_2_12_long_double make test-parse` passes on AArch64 Linux and RV64 Linux without a `.skip` sidecar. -- [ ] `make rt` still builds the default runtime archives. 
-- [ ] `make test-rt-headers test-rt-runtime` stays green for the default +- [x] `make rt` still builds the default runtime archives. +- [x] `make test-rt-headers test-rt-runtime` stays green for the default runtime targets. diff --git a/lang/c/parse/cg_adapter.c b/lang/c/parse/cg_adapter.c @@ -422,7 +422,8 @@ void pcg_unop(Parser* p, UnOp op) { } void pcg_cmp(Parser* p, CmpOp op) { - if (op == CMP_LT_F || op == CMP_LE_F || op == CMP_GT_F || op == CMP_GE_F) { + if (op == CMP_LT_F || op == CMP_LE_F || op == CMP_GT_F || op == CMP_GE_F || + ((op == CMP_EQ || op == CMP_NE) && pcg_type_is_fp(pcg_top_type(p)))) { if (pcg_emit_enabled(p)) cfree_cg_fp_cmp(p->cg, pcg_fp_cmp(op)); } else { if (pcg_emit_enabled(p)) cfree_cg_int_cmp(p->cg, pcg_int_cmp(op)); diff --git a/lang/c/parse/parse_expr.c b/lang/c/parse/parse_expr.c @@ -197,7 +197,7 @@ static const Type* int_literal_type(Parser* p, const Tok* t) { } } -static double parse_float_literal(Parser* p, const Tok* t) { +double parse_float_literal(Parser* p, const Tok* t) { size_t len = 0; const char* s = pool_str(p->pool, t->spelling, &len); size_t i = 0; @@ -463,33 +463,112 @@ static u32 cint_bits(Parser* p, const Type* ty) { return sz * 8u; } -static u64 cint_mask_for_bits(u32 bits) { - if (bits >= 64) return ~0ull; - return (1ull << bits) - 1ull; -} - static int cint_signed(Parser* p, const Type* ty) { if (!ty) return 1; return c_abi_type_info(p->abi, ty).signed_ != 0; } -static CConstInt cint_make(Parser* p, const Type* ty, u64 bits) { +static void cint_mask_to_bits(CConstInt* v, u32 bits) { + if (bits < 64) { + v->lo &= (1ull << bits) - 1ull; + v->hi = 0; + } else if (bits < 128) { + v->hi &= (1ull << (bits - 64u)) - 1ull; + } +} + +static CConstInt cint_make_u64(Parser* p, const Type* ty, u64 bits) { CConstInt v; u32 nb; if (!ty) ty = ty_int(p); nb = cint_bits(p, ty); v.type = ty; - v.bits = bits & cint_mask_for_bits(nb); - if (ty->kind == TY_BOOL) v.bits = v.bits ? 
1u : 0u; + v.lo = bits; + v.hi = 0; + cint_mask_to_bits(&v, nb); + if (ty->kind == TY_BOOL) { + v.lo = (v.lo || v.hi) ? 1u : 0u; + v.hi = 0; + } return v; } +static CConstInt cint_make_pair(Parser* p, const Type* ty, u64 lo, u64 hi) { + CConstInt v; + if (!ty) ty = ty_int(p); + v.type = ty; + v.lo = lo; + v.hi = hi; + cint_mask_to_bits(&v, cint_bits(p, ty)); + if (ty->kind == TY_BOOL) { + v.lo = (v.lo || v.hi) ? 1u : 0u; + v.hi = 0; + } + return v; +} + +static int cint_nonzero(CConstInt v) { return v.lo != 0 || v.hi != 0; } + +static int cint_eq(CConstInt a, CConstInt b) { + return a.lo == b.lo && a.hi == b.hi; +} + +static int cint_cmp_u(CConstInt a, CConstInt b) { + if (a.hi != b.hi) return a.hi < b.hi ? -1 : 1; + if (a.lo != b.lo) return a.lo < b.lo ? -1 : 1; + return 0; +} + +static CConstInt cint_add(Parser* p, const Type* ty, CConstInt a, CConstInt b) { + u64 lo = a.lo + b.lo; + return cint_make_pair(p, ty, lo, a.hi + b.hi + (lo < a.lo)); +} + +static CConstInt cint_sub(Parser* p, const Type* ty, CConstInt a, CConstInt b) { + return cint_make_pair(p, ty, a.lo - b.lo, a.hi - b.hi - (a.lo < b.lo)); +} + +static CConstInt cint_shl(Parser* p, const Type* ty, CConstInt a, u32 sh) { + if (sh >= 128) return cint_make_u64(p, ty, 0); + if (sh == 0) return cint_make_pair(p, ty, a.lo, a.hi); + if (sh >= 64) return cint_make_pair(p, ty, 0, a.lo << (sh - 64u)); + return cint_make_pair(p, ty, a.lo << sh, (a.hi << sh) | (a.lo >> (64u - sh))); +} + +static CConstInt cint_shr_u(Parser* p, const Type* ty, CConstInt a, u32 sh) { + if (sh >= 128) return cint_make_u64(p, ty, 0); + if (sh == 0) return cint_make_pair(p, ty, a.lo, a.hi); + if (sh >= 64) return cint_make_pair(p, ty, a.hi >> (sh - 64u), 0); + return cint_make_pair(p, ty, (a.lo >> sh) | (a.hi << (64u - sh)), a.hi >> sh); +} + +static CConstInt cint_neg(Parser* p, const Type* ty, CConstInt a) { + CConstInt zero = cint_make_u64(p, ty, 0); + return cint_sub(p, ty, zero, a); +} + +static CConstInt cint_bnot(Parser* p, 
const Type* ty, CConstInt a) { + return cint_make_pair(p, ty, ~a.lo, ~a.hi); +} + +static CConstInt cint_mul(Parser* p, const Type* ty, CConstInt a, CConstInt b) { + CConstInt r = cint_make_u64(p, ty, 0); + CConstInt x = a; + for (u32 i = 0; i < 128; ++i) { + if ((i < 64 ? (b.lo >> i) : (b.hi >> (i - 64u))) & 1ull) + r = cint_add(p, ty, r, x); + x = cint_shl(p, ty, x, 1); + } + return r; +} + i64 const_int_as_i64(Parser* p, CConstInt v) { u32 nb = cint_bits(p, v.type); - u64 mask = cint_mask_for_bits(nb); - u64 u = v.bits & mask; + u64 u = v.lo; if (cint_signed(p, v.type) && nb < 64) { + u64 mask = (1ull << nb) - 1ull; u64 sign = 1ull << (nb - 1u); + u &= mask; if (u & sign) u |= ~mask; } return (i64)u; @@ -500,7 +579,7 @@ static CConstInt cint_cast(Parser* p, CConstInt v, const Type* ty) { if (!dst || !type_is_int(dst)) { perr(p, "integer constant expression cast requires integer type"); } - return cint_make(p, dst, v.bits); + return cint_make_pair(p, dst, v.lo, v.hi); } static u32 cint_rank(const Type* ty) { @@ -583,16 +662,16 @@ static const Type* cint_common_type(Parser* p, const Type* a, const Type* b) { } static CConstInt cint_convert(Parser* p, CConstInt v, const Type* ty) { - return cint_make(p, ty, v.bits); + return cint_make_pair(p, ty, v.lo, v.hi); } static int cint_truth(Parser* p, CConstInt v) { (void)p; - return v.bits != 0; + return cint_nonzero(v); } static CConstInt cint_bool(Parser* p, int truth) { - return cint_make(p, ty_int(p), truth ? 1u : 0u); + return cint_make_u64(p, ty_int(p), truth ? 1u : 0u); } static CConstInt cexpr_mul(Parser* p, SrcLoc loc) { @@ -615,17 +694,18 @@ static CConstInt cexpr_mul(Parser* p, SrcLoc loc) { v = cint_convert(p, v, ct); r = cint_convert(p, r, ct); if (op == '*') { - v = cint_make(p, ct, v.bits * r.bits); + v = cint_mul(p, ct, v, r); } else { - if (r.bits == 0) + if (!cint_nonzero(r)) compiler_panic(p->c, loc, op == '/' ? 
"division by zero in constant" : "modulo by zero in constant"); if (cint_signed(p, ct)) { i64 lv = const_int_as_i64(p, v); i64 rv = const_int_as_i64(p, r); - v = cint_make(p, ct, op == '/' ? (u64)(lv / rv) : (u64)(lv % rv)); + v = cint_make_u64(p, ct, op == '/' ? (u64)(lv / rv) : (u64)(lv % rv)); } else { - v = cint_make(p, ct, op == '/' ? v.bits / r.bits : v.bits % r.bits); + v = cint_make_u64(p, ct, + op == '/' ? v.lo / r.lo : v.lo % r.lo); } } } @@ -648,7 +728,7 @@ static CConstInt cexpr_add(Parser* p, SrcLoc loc) { ct = cint_common_type(p, v.type, r.type); v = cint_convert(p, v, ct); r = cint_convert(p, r, ct); - v = cint_make(p, ct, sub ? v.bits - r.bits : v.bits + r.bits); + v = sub ? cint_sub(p, ct, v, r) : cint_add(p, ct, v, r); } return v; } @@ -675,11 +755,11 @@ static CConstInt cexpr_shift(Parser* p, SrcLoc loc) { if (left) { if (cint_signed(p, vt) && const_int_as_i64(p, v) < 0) perr(p, "left shift of negative value in constant expression"); - v = cint_make(p, vt, v.bits << (u32)sh); + v = cint_shl(p, vt, v, (u32)sh); } else if (cint_signed(p, vt)) { - v = cint_make(p, vt, (u64)(const_int_as_i64(p, v) >> (u32)sh)); + v = cint_make_u64(p, vt, (u64)(const_int_as_i64(p, v) >> (u32)sh)); } else { - v = cint_make(p, vt, v.bits >> (u32)sh); + v = cint_shr_u(p, vt, v, (u32)sh); } } return v; @@ -716,10 +796,11 @@ static CConstInt cexpr_rel(Parser* p, SrcLoc loc) { : op == '<' ? lv < rv : lv > rv; } else { - res = op == P_LE ? v.bits <= r.bits - : op == P_GE ? v.bits >= r.bits - : op == '<' ? v.bits < r.bits - : v.bits > r.bits; + int cmp = cint_cmp_u(v, r); + res = op == P_LE ? cmp <= 0 + : op == P_GE ? cmp >= 0 + : op == '<' ? cmp < 0 + : cmp > 0; } v = cint_bool(p, res); } @@ -742,7 +823,7 @@ static CConstInt cexpr_eq(Parser* p, SrcLoc loc) { ct = cint_common_type(p, v.type, r.type); v = cint_convert(p, v, ct); r = cint_convert(p, r, ct); - v = cint_bool(p, ne ? v.bits != r.bits : v.bits == r.bits); + v = cint_bool(p, ne ? 
!cint_eq(v, r) : cint_eq(v, r)); } return v; } @@ -756,7 +837,7 @@ static CConstInt cexpr_band(Parser* p, SrcLoc loc) { ct = cint_common_type(p, v.type, r.type); v = cint_convert(p, v, ct); r = cint_convert(p, r, ct); - v = cint_make(p, ct, v.bits & r.bits); + v = cint_make_pair(p, ct, v.lo & r.lo, v.hi & r.hi); } return v; } @@ -767,7 +848,7 @@ static CConstInt cexpr_bxor(Parser* p, SrcLoc loc) { const Type* ct = cint_common_type(p, v.type, r.type); v = cint_convert(p, v, ct); r = cint_convert(p, r, ct); - v = cint_make(p, ct, v.bits ^ r.bits); + v = cint_make_pair(p, ct, v.lo ^ r.lo, v.hi ^ r.hi); } return v; } @@ -781,7 +862,7 @@ static CConstInt cexpr_bor(Parser* p, SrcLoc loc) { ct = cint_common_type(p, v.type, r.type); v = cint_convert(p, v, ct); r = cint_convert(p, r, ct); - v = cint_make(p, ct, v.bits | r.bits); + v = cint_make_pair(p, ct, v.lo | r.lo, v.hi | r.hi); } return v; } @@ -823,13 +904,13 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) { CConstInt v = cexpr_unary(p, loc); const Type* pt = cint_promote_type(p, v.type); v = cint_convert(p, v, pt); - return cint_make(p, pt, (u64)(-const_int_as_i64(p, v))); + return cint_neg(p, pt, v); } if (accept_punct(p, '~')) { CConstInt v = cexpr_unary(p, loc); const Type* pt = cint_promote_type(p, v.type); v = cint_convert(p, v, pt); - return cint_make(p, pt, ~v.bits); + return cint_bnot(p, pt, v); } if (accept_punct(p, '!')) return cint_bool(p, !cint_truth(p, cexpr_unary(p, loc))); if (accept_kw(p, KW_SIZEOF)) { @@ -841,7 +922,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) { const Type* t = parse_type_name(p); expect_punct(p, ')', "')' after sizeof type-name"); require_sizeof_type(p, t); - return cint_make(p, ty_size_t(p), c_abi_sizeof(p->abi, t)); + return cint_make_u64(p, ty_size_t(p), c_abi_sizeof(p->abi, t)); } } } @@ -852,7 +933,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) { require_sizeof_type(p, ty); i64 sz = (i64)c_abi_sizeof(p->abi, ty); cg_drop(p->cg); - return 
cint_make(p, ty_size_t(p), (u64)sz); + return cint_make_u64(p, ty_size_t(p), (u64)sz); } } if (accept_kw(p, KW_ALIGNOF)) { @@ -863,7 +944,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) { { const Type* t = parse_type_name(p); expect_punct(p, ')', "')' after _Alignof type-name"); - return cint_make(p, ty_size_t(p), c_abi_alignof(p->abi, t)); + return cint_make_u64(p, ty_size_t(p), c_abi_alignof(p->abi, t)); } } } @@ -872,7 +953,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) { const Type* ty = cg_top_type(p->cg); i64 al = (i64)c_abi_alignof(p->abi, ty); cg_drop(p->cg); - return cint_make(p, ty_size_t(p), (u64)al); + return cint_make_u64(p, ty_size_t(p), (u64)al); } } if (accept_punct(p, '(')) { @@ -888,7 +969,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) { } fv = parse_float_literal(p, &p->cur); advance(p); - return cint_make(p, tu, (u64)(i64)fv); + return cint_make_u64(p, tu, (u64)(i64)fv); } CConstInt v = cexpr_unary(p, loc); return cint_cast(p, v, t); @@ -904,12 +985,12 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) { i64 v = parse_int_literal(p, &p->cur); const Type* ty = int_literal_type(p, &p->cur); advance(p); - return cint_make(p, ty, (u64)v); + return cint_make_u64(p, ty, (u64)v); } if (p->cur.kind == TOK_CHR) { i64 v = decode_char_literal(p, &p->cur); advance(p); - return cint_make(p, ty_int(p), (u64)v); + return cint_make_u64(p, ty_int(p), (u64)v); } if (p->cur.kind == TOK_IDENT) { Sym name = p->cur.v.ident; @@ -922,13 +1003,13 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) { expect_punct(p, ',', "',' in __builtin_offsetof"); (void)offsetof_designator(p, root, &off); expect_punct(p, ')', "')' after __builtin_offsetof"); - return cint_make(p, ty_size_t(p), off); + return cint_make_u64(p, ty_size_t(p), off); } { SymEntry* e = scope_lookup(p, name); if (e && e->kind == SEK_ENUM_CST) { advance(p); - return cint_make(p, e->type ? e->type : ty_int(p), (u64)e->v.enum_value); + return cint_make_u64(p, e->type ? 
e->type : ty_int(p), (u64)e->v.enum_value); } } compiler_panic(p->c, loc, "non-constant identifier in constant expression"); @@ -2552,6 +2633,8 @@ static void parse_shift(Parser* p) { } advance(p); to_rvalue(p); + if (bop == BO_SHR_S && !c_abi_type_info(p->abi, cg_top_type(p->cg)).signed_) + bop = BO_SHR_U; parse_add(p); to_rvalue(p); if (!type_is_int(cg_top2_type(p->cg)) || !type_is_int(cg_top_type(p->cg))) { @@ -2944,6 +3027,8 @@ void parse_assign_expr(Parser* p) { } advance(p); const Type* lhs = cg_top_type(p->cg); + if (compound == BO_SHR_S && !c_abi_type_info(p->abi, lhs).signed_) + compound = BO_SHR_U; { if (lhs && (lhs->qual & Q_CONST)) { perr(p, "assignment to const-qualified object"); @@ -2980,6 +3065,7 @@ void parse_assign_expr(Parser* p) { case BO_XOR: op = '^'; break; case BO_SHL: op = '<'; break; case BO_SHR_S: op = '>'; break; + case BO_SHR_U: op = '>'; break; default: op = 0; break; } CSemCheck chk = diff --git a/lang/c/parse/parse_init.c b/lang/c/parse/parse_init.c @@ -560,6 +560,98 @@ static void encode_uint_le(u8* dst, u32 size, u64 v) { } } +static void encode_uint128_le(u8* dst, u32 size, u64 lo, u64 hi) { + if (size > 16) size = 16; + for (u32 i = 0; i < size; ++i) { + u64 lane = i < 8u ? 
lo : hi; + dst[i] = (u8)((lane >> (8u * (i & 7u))) & 0xffu); + } +} + +static void encode_binary128_from_double_le(u8 out[16], double value) { + union { + double d; + u64 u; + } in; + u64 lo = 0; + u64 hi = 0; + u64 frac; + u32 sign; + u32 exp; + in.d = value; + sign = (u32)(in.u >> 63); + exp = (u32)((in.u >> 52) & 0x7ffu); + frac = in.u & 0x000fffffffffffffull; + if (sign) + hi |= 1ull << 63; + if (exp == 0x7ffu) { + hi |= (u64)0x7fffu << 48; + if (frac) { + lo |= (frac & 0xfu) << 60; + hi |= frac >> 4; + hi |= 1ull << 47; + } + } else if (exp != 0 || frac != 0) { + i32 e; + u64 sig; + if (exp == 0) { + e = -1022; + sig = frac; + while ((sig & (1ull << 52)) == 0) { + sig <<= 1; + --e; + } + frac = sig & 0x000fffffffffffffull; + } else { + e = (i32)exp - 1023; + } + hi |= (u64)(u32)(e + 16383) << 48; + lo |= (frac & 0xfu) << 60; + hi |= frac >> 4; + } + encode_uint128_le(out, 16, lo, hi); +} + +static int try_parse_static_float(Parser* p, u8* dst, u32 size, + const Type* ty) { + const Type* uty = type_unqual(p->pool, ty); + double value; + if (!uty || + (uty->kind != TY_FLOAT && uty->kind != TY_DOUBLE && + uty->kind != TY_LDOUBLE)) { + return 0; + } + if (p->cur.kind != TOK_FLT && p->cur.kind != TOK_NUM) + perr(p, "expected floating constant expression"); + value = p->cur.kind == TOK_FLT ? parse_float_literal(p, &p->cur) + : (double)parse_int_literal(p, &p->cur); + advance(p); + if (uty->kind == TY_FLOAT && size == 4u) { + union { + float f; + u8 b[4]; + } u; + u.f = (float)value; + memcpy(dst, u.b, 4); + return 1; + } + if ((uty->kind == TY_DOUBLE || uty->kind == TY_LDOUBLE) && size == 8u) { + union { + double d; + u8 b[8]; + } u; + u.d = value; + memcpy(dst, u.b, 8); + return 1; + } + if (uty->kind == TY_LDOUBLE && size == 16u) { + encode_binary128_from_double_le(dst, value); + return 1; + } + perr(p, "unsupported static floating initializer type"); + return 0; +} + /* Encode a string literal at *buf+offset for a char-array sub-object. 
*/ static void parse_static_string_at(Parser* p, u8* buf, u32 buflen, u32 offset, u32 count) { @@ -608,14 +700,24 @@ typedef struct CStaticConst { i64 addend; } CStaticConst; -static u64 int_bits_for_type(Parser* p, CConstInt v, const Type* ty) { +static CConstInt int_bits_for_type(Parser* p, CConstInt v, const Type* ty) { u32 sz = c_abi_sizeof(p->abi, ty); - u64 bits = v.bits; + v.type = ty; if (sz < 8u) { - bits &= (1ull << (sz * 8u)) - 1ull; + u32 bits = sz * 8u; + v.lo &= bits ? ((1ull << bits) - 1ull) : 0; + v.hi = 0; + } else if (sz == 8u) { + v.hi = 0; + } else if (sz < 16u) { + u32 hi_bits = sz * 8u - 64u; + v.hi &= hi_bits ? ((1ull << hi_bits) - 1ull) : 0; + } + if (ty && ty->kind == TY_BOOL) { + v.lo = (v.lo || v.hi) ? 1u : 0u; + v.hi = 0; } - if (ty && ty->kind == TY_BOOL) bits = bits ? 1u : 0u; - return bits; + return v; } static void check_static_integer_initializer_range(Parser* p, const Type* ty, @@ -635,7 +737,7 @@ static void check_static_integer_initializer_range(Parser* p, const Type* ty, } } else { u64 maxu = (u64)maxv; - if (v.bits > maxu) { + if (v.hi != 0 || v.lo > maxu) { perr(p, "initializer value overflows destination type"); } } @@ -793,7 +895,7 @@ static void parse_static_bitfield_at(Parser* p, u8* buf, u32 buflen, ones = width >= 64u ? 
~(u64)0 : (((u64)1 << width) - 1u); mask = ones << lsb; cur = decode_uint_le(buf + storage_off, storage_size); - val = (int_bits_for_type(p, parsed.int_value, field_ty) & ones) << lsb; + val = (int_bits_for_type(p, parsed.int_value, field_ty).lo & ones) << lsb; cur = (cur & ~mask) | val; encode_uint_le(buf + storage_off, storage_size, cur); } @@ -915,13 +1017,23 @@ void parse_static_init_at(Parser* p, u8* buf, u32 buflen, u32 offset, u32 sz = c_abi_sizeof(p->abi, ty); CStaticConst cv; if (offset + sz > buflen) perr(p, "initializer overflows object"); + if (try_parse_static_float(p, buf + offset, sz, ty)) { + if (had_brace) { + accept_punct(p, ','); + expect_punct(p, '}', "'}' after scalar initializer"); + } + return; + } cv = parse_static_const(p, ty, cloc); if (cv.kind == C_STATIC_CONST_ADDR) { srl_push(p, offset, sz, cv.target, cv.addend); } else if (cv.kind == C_STATIC_CONST_NULL_PTR) { encode_int_le(buf + offset, sz, 0); } else { - encode_uint_le(buf + offset, sz, int_bits_for_type(p, cv.int_value, ty)); + { + CConstInt bits = int_bits_for_type(p, cv.int_value, ty); + encode_uint128_le(buf + offset, sz, bits.lo, bits.hi); + } } if (had_brace) { accept_punct(p, ','); diff --git a/lang/c/parse/parse_priv.h b/lang/c/parse/parse_priv.h @@ -419,12 +419,14 @@ void parse_cond_expr(Parser* p); void parse_unary(Parser* p); typedef struct CConstInt { const Type* type; - u64 bits; + u64 lo; + u64 hi; } CConstInt; CConstInt eval_const_int_typed(Parser* p, SrcLoc loc); i64 eval_const_int(Parser* p, SrcLoc loc); i64 const_int_as_i64(Parser* p, CConstInt v); i64 parse_int_literal(Parser* p, const Tok* t); +double parse_float_literal(Parser* p, const Tok* t); i64 decode_char_literal(Parser* p, const Tok* t); u8* decode_string_literal(Parser* p, const Tok* t, size_t* nlen_out); void to_rvalue(Parser* p); diff --git a/lang/c/parse/parse_type.c b/lang/c/parse/parse_type.c @@ -59,6 +59,9 @@ static const struct { static SrcLoc tok_loc(const Tok* t) { return t->loc; } +static void 
attr_canon_range(const char* s, size_t len, const char** out_p, + size_t* out_len); + static int accept_kw(Parser* p, CKw k) { if (is_kw(p, &p->cur, k)) { advance(p); @@ -67,6 +70,30 @@ static int accept_kw(Parser* p, CKw k) { return 0; } +static int attr_sym_canon_eq(Parser* p, Sym sym, const char* want) { + size_t len = 0; + const char* s = pool_str(p->pool, sym, &len); + const char* cs; + size_t clen; + size_t wlen = strlen(want); + if (!s) return 0; + attr_canon_range(s, len, &cs, &clen); + return clen == wlen && memcmp(cs, want, wlen) == 0; +} + +static const Type* attrs_apply_type_mode(Parser* p, const Type* base, + const Attr* attrs) { + for (const Attr* a = attrs; a; a = a->next) { + if (a->kind != ATTR_MODE || a->nargs == 0) continue; + if (attr_sym_canon_eq(p, a->v.sym, "TI")) { + const Type* u = type_unqual(p->pool, base); + int is_unsigned = u && type_is_int(u) && c_abi_type_info(p->abi, u).signed_ == 0; + return type_prim(p->pool, is_unsigned ? TY_UINT128 : TY_INT128); + } + } + return base; +} + static CKw ident_kw(const Parser* p, Sym name) { return ident_kw_inline(p, name); } @@ -627,6 +654,7 @@ int parse_decl_specs(Parser* p, DeclSpecs* out) { out->type = ty_int(p); } } + out->type = attrs_apply_type_mode(p, out->type, out->attrs); if (out->type && out->quals) { out->type = type_qualified(p->pool, out->type, (u16)(out->type->qual | out->quals)); @@ -1134,6 +1162,7 @@ const Type* parse_declarator_full(Parser* p, const Type* base, const Type* parse_declarator_full_ex(Parser* p, const Type* base, int allow_abstract, Sym* name_out, SrcLoc* loc_out, Attr** attrs_out) { + Attr* local_attrs = NULL; base = parse_pointer_layer(p, base); Sym name = 0; @@ -1215,7 +1244,7 @@ const Type* parse_declarator_full_ex(Parser* p, const Type* base, if (attrs_out) parse_attrs_into(p, attrs_out); else - parse_and_discard_attributes(p); + parse_attrs_into(p, &local_attrs); } DeclSuffix suffs[8]; @@ -1227,9 +1256,10 @@ const Type* parse_declarator_full_ex(Parser* p, const 
Type* base, if (attrs_out) parse_attrs_into(p, attrs_out); else - parse_and_discard_attributes(p); + parse_attrs_into(p, &local_attrs); } } + base = attrs_apply_type_mode(p, base, attrs_out ? *attrs_out : local_attrs); if (nsuffs == 8 && (is_punct(&p->cur, '[') || is_punct(&p->cur, '('))) { perr(p, "too many declarator suffixes (raise the cap if needed)"); } diff --git a/rt/lib/README.md b/rt/lib/README.md @@ -22,7 +22,7 @@ hand-written `mem/mem.c` is 0BSD; relicense as desired. | -------------------------- | ----------------------------------------------------------- | --------------------------------------------------- | | `int/int.c` | Integer helpers needed on every target | All | | `int32/int32.c` | 64-bit ops synthesized from 32-bit | ILP32 only | -| `int64/int64.c` | 128-bit ops via `__int128` | LP64 / LLP64 only | +| `int64/int64.c` | 128-bit ops implemented on explicit 64-bit lanes | LP64 / LLP64 only | | `fp/fp.c` | Soft-float `sf` (binary32) + `df` (binary64) + sf↔df + `fp_mode` | FPU-less (RV{32,64}I, ARM softfp, WASM) | | `fp_tf/fp_tf.c` | Soft-float `tf` (binary128) + sf↔tf + df↔tf + i128↔tf | Targets with binary128 long double (e.g. 
aarch64 `-mlong-double-128`) | | `fp_ti/fp_ti.c` | `__int128` ↔ sf/df + sf/df → ti fix | LP64 / LLP64 + soft-float | diff --git a/rt/lib/fp_tf/fp_tf.c b/rt/lib/fp_tf/fp_tf.c @@ -78,23 +78,253 @@ COMPILER_RT_ABI fp_t __subtf3(fp_t a, fp_t b) { // ---- multf3.c ---- #define QUAD_PRECISION #include "fp_lib.h" -#include "fp_mul_impl.inc" -COMPILER_RT_ABI fp_t __multf3(fp_t a, fp_t b) { return __mulXf3__(a, b); } +typedef struct { + du_int limb[4]; +} cfree_tf_u256; + +static int cfree_tf_rep_bit(rep_t value, int bit) { + return ((value >> (unsigned)bit) & 1) != 0; +} + +static int cfree_tf_u256_bit(const cfree_tf_u256* value, int bit) { + if (bit < 0 || bit >= 256) return 0; + return ((value->limb[bit / 64] >> (unsigned)(bit % 64)) & 1u) != 0; +} + +static int cfree_tf_u256_any_below(const cfree_tf_u256* value, int bit) { + int full; + int rem; + if (bit <= 0) return 0; + if (bit > 256) bit = 256; + full = bit / 64; + rem = bit % 64; + for (int i = 0; i < full; ++i) { + if (value->limb[i]) return 1; + } + if (rem) { + const du_int mask = ((du_int)1 << (unsigned)rem) - 1u; + if (value->limb[full] & mask) return 1; + } + return 0; +} + +static void cfree_tf_u256_add_limb(cfree_tf_u256* value, int index, + du_int addend) { + du_int old; + if (!addend || index >= 4) return; + old = value->limb[index]; + value->limb[index] = old + addend; + if (value->limb[index] >= old) return; + for (++index; index < 4; ++index) { + old = value->limb[index]; + value->limb[index] = old + 1u; + if (value->limb[index] != 0) return; + } +} + +static void cfree_tf_u256_add_shifted_sig(cfree_tf_u256* product, rep_t sig, + int shift) { + const du_int lo = (du_int)sig; + const du_int hi = (du_int)(sig >> 64); + const int index = shift / 64; + const int bits = shift % 64; + if (bits == 0) { + cfree_tf_u256_add_limb(product, index, lo); + cfree_tf_u256_add_limb(product, index + 1, hi); + } else { + cfree_tf_u256_add_limb(product, index, lo << (unsigned)bits); + cfree_tf_u256_add_limb( + product, 
index + 1, + (lo >> (unsigned)(64 - bits)) | (hi << (unsigned)bits)); + cfree_tf_u256_add_limb(product, index + 2, + hi >> (unsigned)(64 - bits)); + } +} + +static cfree_tf_u256 cfree_tf_sig_product(rep_t a, rep_t b) { + cfree_tf_u256 product = {{0, 0, 0, 0}}; + for (int bit = 0; bit <= significandBits; ++bit) { + if (cfree_tf_rep_bit(b, bit)) + cfree_tf_u256_add_shifted_sig(&product, a, bit); + } + return product; +} + +static rep_t cfree_tf_u256_extract_rounded(const cfree_tf_u256* product, + int shift) { + rep_t result = 0; + for (int bit = 0; bit <= significandBits; ++bit) { + if (cfree_tf_u256_bit(product, shift + bit)) + result |= (rep_t)1 << (unsigned)bit; + } + if (cfree_tf_u256_bit(product, shift - 1) && + (cfree_tf_u256_any_below(product, shift - 1) || (result & 1))) + ++result; + return result; +} + +COMPILER_RT_ABI fp_t __multf3(fp_t a, fp_t b) { + const rep_t aRep = toRep(a); + const rep_t bRep = toRep(b); + const rep_t aAbs = aRep & absMask; + const rep_t bAbs = bRep & absMask; + const rep_t productSign = (aRep ^ bRep) & signBit; + int aExponent = (int)((aAbs >> significandBits) & maxExponent); + int bExponent = (int)((bAbs >> significandBits) & maxExponent); + int productExponent; + int productTop; + int shift; + rep_t aSignificand = aAbs & significandMask; + rep_t bSignificand = bAbs & significandMask; + cfree_tf_u256 product; + rep_t resultSignificand; + + if (aAbs > infRep) return fromRep(aRep | quietBit); + if (bAbs > infRep) return fromRep(bRep | quietBit); + if (aAbs == infRep) { + if (bAbs) return fromRep(infRep | productSign); + return fromRep(qnanRep); + } + if (bAbs == infRep) { + if (aAbs) return fromRep(infRep | productSign); + return fromRep(qnanRep); + } + if (!aAbs || !bAbs) return fromRep(productSign); + + if (aExponent == 0) + aExponent = normalize(&aSignificand); + else + aSignificand |= implicitBit; + if (bExponent == 0) + bExponent = normalize(&bSignificand); + else + bSignificand |= implicitBit; + + product = 
cfree_tf_sig_product(aSignificand, bSignificand); + productTop = cfree_tf_u256_bit(&product, 225) ? 225 : 224; + productExponent = aExponent + bExponent - exponentBias; + if (productTop == 225) ++productExponent; + + if (productExponent >= maxExponent) return fromRep(infRep | productSign); + + shift = productTop - significandBits; + if (productExponent <= 0) { + shift += 1 - productExponent; + productExponent = 0; + } + + resultSignificand = cfree_tf_u256_extract_rounded(&product, shift); + if (resultSignificand & ((rep_t)1 << (significandBits + 1))) { + resultSignificand >>= 1; + ++productExponent; + } + if (productExponent == 0 && (resultSignificand & implicitBit)) + productExponent = 1; + if (productExponent >= maxExponent) return fromRep(infRep | productSign); + + return fromRep(productSign | ((rep_t)productExponent << significandBits) | + (resultSignificand & significandMask)); +} // ---- divtf3.c ---- #define QUAD_PRECISION #include "fp_lib.h" +#include "fp_mode.h" + +COMPILER_RT_ABI fp_t __divtf3(fp_t a, fp_t b) { + const rep_t aRep = toRep(a); + const rep_t bRep = toRep(b); + const rep_t aAbs = aRep & absMask; + const rep_t bAbs = bRep & absMask; + const rep_t quotientSign = (aRep ^ bRep) & signBit; + int aExponent = (int)((aAbs >> significandBits) & maxExponent); + int bExponent = (int)((bAbs >> significandBits) & maxExponent); + rep_t aSignificand = aAbs & significandMask; + rep_t bSignificand = bAbs & significandMask; + rep_t quotient = 0; + rep_t remainder; + int writtenExponent; + + if (aAbs > infRep) return fromRep(aRep | quietBit); + if (bAbs > infRep) return fromRep(bRep | quietBit); + if (aAbs == infRep) { + if (bAbs == infRep) return fromRep(qnanRep); + return fromRep(infRep | quotientSign); + } + if (bAbs == infRep) return fromRep(quotientSign); + if (!aAbs) { + if (!bAbs) return fromRep(qnanRep); + return fromRep(quotientSign); + } + if (!bAbs) return fromRep(infRep | quotientSign); + + if (aExponent == 0) + aExponent = normalize(&aSignificand); 
+ else + aSignificand |= implicitBit; + if (bExponent == 0) + bExponent = normalize(&bSignificand); + else + bSignificand |= implicitBit; + + writtenExponent = aExponent - bExponent + exponentBias; + if (aSignificand < bSignificand) { + aSignificand <<= 1; + writtenExponent -= 1; + } -#define NUMBER_OF_HALF_ITERATIONS 4 -#define NUMBER_OF_FULL_ITERATIONS 1 - -#include "fp_div_impl.inc" + remainder = aSignificand; + for (int i = 0; i < significandBits + 4; ++i) { + quotient <<= 1; + if (remainder >= bSignificand) { + quotient |= 1; + remainder -= bSignificand; + } + if (i != significandBits + 3) + remainder <<= 1; + } + if (remainder) + quotient |= 1; + + if (writtenExponent >= maxExponent) + return fromRep(infRep | quotientSign); + if (writtenExponent <= 0) { + const int shift = 1 - writtenExponent; + if (shift >= typeWidth) + return fromRep(quotientSign); + if (shift > 0) { + const bool sticky = (quotient << (typeWidth - shift)) != 0; + quotient = (quotient >> shift) | sticky; + } + writtenExponent = 0; + } -COMPILER_RT_ABI fp_t __divtf3(fp_t a, fp_t b) { return __divXf3__(a, b); } + const int roundGuardSticky = quotient & 0x7; + rep_t absResult = (quotient >> 3) & significandMask; + absResult |= (rep_t)writtenExponent << significandBits; + + switch (__fe_getround()) { + case CRT_FE_TONEAREST: + if (roundGuardSticky > 0x4) + absResult++; + if (roundGuardSticky == 0x4) + absResult += absResult & 1; + break; + case CRT_FE_DOWNWARD: + if (quotientSign && roundGuardSticky) absResult++; + break; + case CRT_FE_UPWARD: + if (!quotientSign && roundGuardSticky) absResult++; + break; + case CRT_FE_TOWARDZERO: + break; + } + if (roundGuardSticky) + __fe_raise_inexact(); + return fromRep(absResult | quotientSign); +} -#undef NUMBER_OF_HALF_ITERATIONS -#undef NUMBER_OF_FULL_ITERATIONS // ---- comparetf2.c ---- #define QUAD_PRECISION #include "fp_compare_impl.inc" @@ -114,32 +344,43 @@ COMPILER_RT_ABI CMP_RESULT __unordtf2(fp_t a, fp_t b) { #define QUAD_PRECISION #include 
"fp_lib.h" -COMPILER_RT_ABI fp_t __floatsitf(si_int a) { - const int aWidth = sizeof a * CHAR_BIT; +static int cfree_clz_u32(su_int x) { + int n = 0; + for (int bit = 31; bit >= 0; --bit) { + if ((x >> (unsigned)bit) & 1u) break; + ++n; + } + return n; +} - // Handle zero as a special case to protect clz - if (a == 0) return fromRep(0); +static int cfree_clz_u64(du_int x) { + int n = 0; + for (int bit = 63; bit >= 0; --bit) { + if ((x >> (unsigned)bit) & 1u) break; + ++n; + } + return n; +} + +static fp_t cfree_tf_from_u64(du_int mag, rep_t sign, int width) { + if (!mag) return fromRep(0); + int exponent = (width - 1) - + (width == 32 ? cfree_clz_u32((su_int)mag) + : cfree_clz_u64(mag)); + int shift = significandBits - exponent; + rep_t result = ((rep_t)mag << shift) ^ implicitBit; + result |= (rep_t)(exponent + exponentBias) << significandBits; + return fromRep(result | sign); +} - // All other cases begin by extracting the sign and absolute value of a +COMPILER_RT_ABI fp_t __floatsitf(si_int a) { rep_t sign = 0; - su_int aAbs = (su_int)a; + su_int mag = (su_int)a; if (a < 0) { sign = signBit; - aAbs = -aAbs; + mag = (su_int)(0u - mag); } - - // Exponent of (fp_t)a is the width of abs(a). - const int exponent = (aWidth - 1) - clzsi(aAbs); - rep_t result; - - // Shift a into the significand field and clear the implicit bit. - const int shift = significandBits - exponent; - result = (rep_t)aAbs << shift ^ implicitBit; - - // Insert the exponent - result += (rep_t)(exponent + exponentBias) << significandBits; - // Insert the sign bit and return - return fromRep(result | sign); + return cfree_tf_from_u64((du_int)mag, sign, 32); } // ---- floatunsitf.c ---- @@ -147,22 +388,7 @@ COMPILER_RT_ABI fp_t __floatsitf(si_int a) { #include "fp_lib.h" COMPILER_RT_ABI fp_t __floatunsitf(su_int a) { - const int aWidth = sizeof a * CHAR_BIT; - - // Handle zero as a special case to protect clz - if (a == 0) return fromRep(0); - - // Exponent of (fp_t)a is the width of abs(a). 
- const int exponent = (aWidth - 1) - clzsi(a); - rep_t result; - - // Shift a into the significand field and clear the implicit bit. - const int shift = significandBits - exponent; - result = (rep_t)a << shift ^ implicitBit; - - // Insert the exponent - result += (rep_t)(exponent + exponentBias) << significandBits; - return fromRep(result); + return cfree_tf_from_u64((du_int)a, 0, 32); } // ---- floatditf.c ---- @@ -170,31 +396,13 @@ COMPILER_RT_ABI fp_t __floatunsitf(su_int a) { #include "fp_lib.h" COMPILER_RT_ABI fp_t __floatditf(di_int a) { - const int aWidth = sizeof a * CHAR_BIT; - - // Handle zero as a special case to protect clz - if (a == 0) return fromRep(0); - - // All other cases begin by extracting the sign and absolute value of a rep_t sign = 0; - du_int aAbs = (du_int)a; + du_int mag = (du_int)a; if (a < 0) { sign = signBit; - aAbs = ~(du_int)a + 1U; + mag = (du_int)0 - mag; } - - // Exponent of (fp_t)a is the width of abs(a). - const int exponent = (aWidth - 1) - __builtin_clzll(aAbs); - rep_t result; - - // Shift a into the significand field, rounding if it is a right-shift - const int shift = significandBits - exponent; - result = (rep_t)aAbs << shift ^ implicitBit; - - // Insert the exponent - result += (rep_t)(exponent + exponentBias) << significandBits; - // Insert the sign bit and return - return fromRep(result | sign); + return cfree_tf_from_u64(mag, sign, 64); } // ---- floatunditf.c ---- @@ -202,22 +410,7 @@ COMPILER_RT_ABI fp_t __floatditf(di_int a) { #include "fp_lib.h" COMPILER_RT_ABI fp_t __floatunditf(du_int a) { - const int aWidth = sizeof a * CHAR_BIT; - - // Handle zero as a special case to protect clz - if (a == 0) return fromRep(0); - - // Exponent of (fp_t)a is the width of abs(a). - const int exponent = (aWidth - 1) - __builtin_clzll(a); - rep_t result; - - // Shift a into the significand field and clear the implicit bit. 
- const int shift = significandBits - exponent; - result = (rep_t)a << shift ^ implicitBit; - - // Insert the exponent - result += (rep_t)(exponent + exponentBias) << significandBits; - return fromRep(result); + return cfree_tf_from_u64(a, 0, 64); } // ---- floattitf.c ---- diff --git a/rt/lib/int64/int64.c b/rt/lib/int64/int64.c @@ -80,61 +80,165 @@ static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v, return udiv128by64to64default(u1, u0, v, r); } +static inline int ut_is_zero(utwords a) { + return a.s.low == 0 && a.s.high == 0; +} + +static inline int ut_cmp(utwords a, utwords b) { + if (a.s.high != b.s.high) return a.s.high < b.s.high ? -1 : 1; + if (a.s.low != b.s.low) return a.s.low < b.s.low ? -1 : 1; + return 0; +} + +static inline utwords ut_add(utwords a, utwords b) { + utwords r; + r.s.low = a.s.low + b.s.low; + r.s.high = a.s.high + b.s.high + (r.s.low < a.s.low); + return r; +} + +static inline utwords ut_sub(utwords a, utwords b) { + utwords r; + r.s.low = a.s.low - b.s.low; + r.s.high = a.s.high - b.s.high - (a.s.low < b.s.low); + return r; +} + +static inline utwords ut_neg(utwords a) { + utwords z; + z.s.low = 0; + z.s.high = 0; + return ut_sub(z, a); +} + +static inline utwords ut_shl1(utwords a) { + utwords r; + r.s.low = a.s.low << 1; + r.s.high = (a.s.high << 1) | (a.s.low >> 63); + return r; +} + +static inline utwords ut_shr1(utwords a) { + utwords r; + r.s.low = (a.s.low >> 1) | (a.s.high << 63); + r.s.high = a.s.high >> 1; + return r; +} + +static inline utwords ut_shl(utwords a, unsigned sh) { + utwords r; + if (sh >= 128u) { + r.s.low = 0; + r.s.high = 0; + } else if (sh == 0) { + r = a; + } else if (sh >= 64u) { + r.s.low = 0; + r.s.high = a.s.low << (sh - 64u); + } else { + r.s.low = a.s.low << sh; + r.s.high = (a.s.high << sh) | (a.s.low >> (64u - sh)); + } + return r; +} + +static inline utwords ut_lshr(utwords a, unsigned sh) { + utwords r; + if (sh >= 128u) { + r.s.low = 0; + r.s.high = 0; + } else if (sh == 0) { + r 
= a; + } else if (sh >= 64u) { + r.s.low = a.s.high >> (sh - 64u); + r.s.high = 0; + } else { + r.s.low = (a.s.low >> sh) | (a.s.high << (64u - sh)); + r.s.high = a.s.high >> sh; + } + return r; +} + +static inline twords t_ashr(twords a, unsigned sh) { + twords r; + if (sh >= 128u) { + r.s.low = a.s.high < 0 ? ~(du_int)0 : 0; + r.s.high = a.s.high < 0 ? (di_int)-1 : 0; + } else if (sh == 0) { + r = a; + } else if (sh >= 64u) { + r.s.low = (du_int)(a.s.high >> (sh - 64u)); + r.s.high = a.s.high < 0 ? (di_int)-1 : 0; + } else { + r.s.low = ((du_int)a.s.high << (64u - sh)) | (a.s.low >> sh); + r.s.high = a.s.high >> sh; + } + return r; +} + +static inline utwords ut_mul(utwords a, utwords b) { + utwords r; + const int half_bits = (int)(sizeof(du_int) * CHAR_BIT) / 2; + const du_int mask = (du_int)~0 >> half_bits; + du_int t; + r.s.low = (a.s.low & mask) * (b.s.low & mask); + t = r.s.low >> half_bits; + r.s.low &= mask; + t += (a.s.low >> half_bits) * (b.s.low & mask); + r.s.low += (t & mask) << half_bits; + r.s.high = t >> half_bits; + t = r.s.low >> half_bits; + r.s.low &= mask; + t += (b.s.low >> half_bits) * (a.s.low & mask); + r.s.low += (t & mask) << half_bits; + r.s.high += t >> half_bits; + r.s.high += (a.s.low >> half_bits) * (b.s.low >> half_bits); + r.s.high += a.s.high * b.s.low + a.s.low * b.s.high; + return r; +} + +static inline void ut_udivmod(utwords n, utwords d, utwords* q, utwords* rem) { + utwords quotient; + utwords remainder; + quotient.s.low = 0; + quotient.s.high = 0; + remainder.s.low = 0; + remainder.s.high = 0; + if (ut_is_zero(d)) { + if (q) *q = quotient; + if (rem) *rem = n; + return; + } + for (int i = 127; i >= 0; --i) { + du_int bit = + i < 64 ? 
((n.s.low >> (unsigned)i) & 1u) + : ((n.s.high >> (unsigned)(i - 64)) & 1u); + remainder = ut_shl1(remainder); + remainder.s.low |= bit; + if (ut_cmp(remainder, d) >= 0) { + remainder = ut_sub(remainder, d); + if (i < 64) + quotient.s.low |= (du_int)1 << (unsigned)i; + else + quotient.s.high |= (du_int)1 << (unsigned)(i - 64); + } + } + if (q) *q = quotient; + if (rem) *rem = remainder; +} + // Effects: if rem != 0, *rem = a % b // Returns: a / b COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int* rem) { - const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT; utwords dividend; dividend.all = a; utwords divisor; divisor.all = b; utwords quotient; utwords remainder; - if (divisor.all > dividend.all) { - if (rem) *rem = dividend.all; - return 0; - } - // When the divisor fits in 64 bits, we can use an optimized path. - if (divisor.s.high == 0) { - remainder.s.high = 0; - if (dividend.s.high < divisor.s.low) { - // The result fits in 64 bits. - quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, - divisor.s.low, &remainder.s.low); - quotient.s.high = 0; - } else { - // First, divide with the high part to get the remainder in - // dividend.s.high. After that dividend.s.high < divisor.s.low. - quotient.s.high = dividend.s.high / divisor.s.low; - dividend.s.high = dividend.s.high % divisor.s.low; - quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, - divisor.s.low, &remainder.s.low); - } - if (rem) *rem = remainder.all; - return quotient.all; - } - // 0 <= shift <= 63. - si_int shift = - __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high); - divisor.all <<= shift; - quotient.s.high = 0; - quotient.s.low = 0; - for (; shift >= 0; --shift) { - quotient.s.low <<= 1; - // Branch free version of. 
- // if (dividend.all >= divisor.all) - // { - // dividend.all -= divisor.all; - // carry = 1; - // } - const ti_int s = - (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1); - quotient.s.low |= s & 1; - dividend.all -= divisor.all & s; - divisor.all >>= 1; - } - if (rem) *rem = dividend.all; + ut_udivmod(dividend, divisor, &quotient, &remainder); + if (rem) *rem = remainder.all; return quotient.all; } @@ -147,20 +251,11 @@ COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int* rem) { // Precondition: 0 <= b < bits_in_tword COMPILER_RT_ABI ti_int __ashlti3(ti_int a, int b) { - const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); - twords input; - twords result; + utwords input; + utwords result; input.all = a; - if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */ { - result.s.low = 0; - result.s.high = input.s.low << (b - bits_in_dword); - } else /* 0 <= b < bits_in_dword */ { - if (b == 0) return a; - result.s.low = input.s.low << b; - result.s.high = - ((du_int)input.s.high << b) | (input.s.low >> (bits_in_dword - b)); - } - return result.all; + result = ut_shl(input, (unsigned)b); + return (ti_int)result.all; } // ---- ashrti3.c ---- @@ -171,20 +266,10 @@ COMPILER_RT_ABI ti_int __ashlti3(ti_int a, int b) { // Precondition: 0 <= b < bits_in_tword COMPILER_RT_ABI ti_int __ashrti3(ti_int a, int b) { - const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); twords input; twords result; input.all = a; - if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */ { - // result.s.high = input.s.high < 0 ? 
-1 : 0 - result.s.high = input.s.high >> (bits_in_dword - 1); - result.s.low = input.s.high >> (b - bits_in_dword); - } else /* 0 <= b < bits_in_dword */ { - if (b == 0) return a; - result.s.high = input.s.high >> b; - result.s.low = - ((du_int)input.s.high << (bits_in_dword - b)) | (input.s.low >> b); - } + result = t_ashr(input, (unsigned)b); return result.all; } @@ -226,19 +311,11 @@ COMPILER_RT_ABI int __ctzti2(ti_int a) { // Precondition: 0 <= b < bits_in_tword COMPILER_RT_ABI ti_int __lshrti3(ti_int a, int b) { - const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); utwords input; utwords result; input.all = a; - if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */ { - result.s.high = 0; - result.s.low = input.s.high >> (b - bits_in_dword); - } else /* 0 <= b < bits_in_dword */ { - if (b == 0) return a; - result.s.high = input.s.high >> b; - result.s.low = (input.s.high << (bits_in_dword - b)) | (input.s.low >> b); - } - return result.all; + result = ut_lshr(input, (unsigned)b); + return (ti_int)result.all; } // ---- multi3.c ---- @@ -268,14 +345,13 @@ static ti_int __mulddi3(du_int a, du_int b) { // Returns: a * b COMPILER_RT_ABI ti_int __multi3(ti_int a, ti_int b) { - twords x; - x.all = a; - twords y; - y.all = b; - twords r; - r.all = __mulddi3(x.s.low, y.s.low); - r.s.high += x.s.high * y.s.low + x.s.low * y.s.high; - return r.all; + utwords x; + utwords y; + utwords r; + x.all = (tu_int)a; + y.all = (tu_int)b; + r = ut_mul(x, y); + return (ti_int)r.all; } // ---- negti2.c ---- @@ -284,9 +360,111 @@ COMPILER_RT_ABI ti_int __multi3(ti_int a, ti_int b) { // Returns: -a COMPILER_RT_ABI ti_int __negti2(ti_int a) { - // Note: this routine is here for API compatibility; any sane compiler - // should expand it inline. 
- return -(tu_int)a; + utwords x; + utwords r; + x.all = (tu_int)a; + r = ut_neg(x); + return (ti_int)r.all; +} + +COMPILER_RT_ABI ti_int __cfree_addti3(ti_int a, ti_int b) { + utwords x; + utwords y; + utwords r; + x.all = (tu_int)a; + y.all = (tu_int)b; + r = ut_add(x, y); + return (ti_int)r.all; +} + +COMPILER_RT_ABI ti_int __cfree_subti3(ti_int a, ti_int b) { + utwords x; + utwords y; + utwords r; + x.all = (tu_int)a; + y.all = (tu_int)b; + r = ut_sub(x, y); + return (ti_int)r.all; +} + +COMPILER_RT_ABI ti_int __cfree_andti3(ti_int a, ti_int b) { + utwords x; + utwords y; + utwords r; + x.all = (tu_int)a; + y.all = (tu_int)b; + r.s.low = x.s.low & y.s.low; + r.s.high = x.s.high & y.s.high; + return (ti_int)r.all; +} + +COMPILER_RT_ABI ti_int __cfree_orti3(ti_int a, ti_int b) { + utwords x; + utwords y; + utwords r; + x.all = (tu_int)a; + y.all = (tu_int)b; + r.s.low = x.s.low | y.s.low; + r.s.high = x.s.high | y.s.high; + return (ti_int)r.all; +} + +COMPILER_RT_ABI ti_int __cfree_xorti3(ti_int a, ti_int b) { + utwords x; + utwords y; + utwords r; + x.all = (tu_int)a; + y.all = (tu_int)b; + r.s.low = x.s.low ^ y.s.low; + r.s.high = x.s.high ^ y.s.high; + return (ti_int)r.all; +} + +COMPILER_RT_ABI ti_int __cfree_notti3(ti_int a) { + utwords x; + utwords r; + x.all = (tu_int)a; + r.s.low = ~x.s.low; + r.s.high = ~x.s.high; + return (ti_int)r.all; +} + +COMPILER_RT_ABI ti_int __cfree_sext64ti(di_int a) { + twords r; + r.s.low = (du_int)a; + r.s.high = a < 0 ? 
-1 : 0; + return r.all; +} + +COMPILER_RT_ABI ti_int __cfree_zext64ti(du_int a) { + utwords r; + r.s.low = a; + r.s.high = 0; + return (ti_int)r.all; +} + +COMPILER_RT_ABI si_int __cfree_cmpti2(ti_int a, ti_int b) { + twords x; + twords y; + x.all = a; + y.all = b; + if (x.s.high < y.s.high) return -1; + if (x.s.high > y.s.high) return 1; + if (x.s.low < y.s.low) return -1; + if (x.s.low > y.s.low) return 1; + return 0; +} + +COMPILER_RT_ABI si_int __cfree_ucmpti2(tu_int a, tu_int b) { + utwords x; + utwords y; + x.all = a; + y.all = b; + if (x.s.high < y.s.high) return -1; + if (x.s.high > y.s.high) return 1; + if (x.s.low < y.s.low) return -1; + if (x.s.low > y.s.low) return 1; + return 0; } // Callers of __udivmodti4: @@ -316,16 +494,27 @@ COMPILER_RT_ABI tu_int __umodti3(tu_int a, tu_int b) { // Returns: a / b, *rem = a % b COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, ti_int* rem) { - const int bits_in_tword_m1 = (int)(sizeof(ti_int) * CHAR_BIT) - 1; - ti_int s_a = a >> bits_in_tword_m1; // s_a = a < 0 ? -1 : 0 - ti_int s_b = b >> bits_in_tword_m1; // s_b = b < 0 ? 
-1 : 0 - a = (tu_int)(a ^ s_a) - s_a; // negate if s_a == -1 - b = (tu_int)(b ^ s_b) - s_b; // negate if s_b == -1 - s_b ^= s_a; // sign of quotient - tu_int r; - ti_int q = (__udivmodti4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 - *rem = (r ^ s_a) - s_a; // negate if s_a == -1 - return q; + twords sa; + twords sb; + utwords ua; + utwords ub; + utwords uq; + utwords ur; + int neg_a; + int neg_b; + sa.all = a; + sb.all = b; + neg_a = sa.s.high < 0; + neg_b = sb.s.high < 0; + ua.all = (tu_int)a; + ub.all = (tu_int)b; + if (neg_a) ua = ut_neg(ua); + if (neg_b) ub = ut_neg(ub); + ut_udivmod(ua, ub, &uq, &ur); + if (neg_a != neg_b) uq = ut_neg(uq); + if (neg_a) ur = ut_neg(ur); + if (rem) *rem = (ti_int)ur.all; + return (ti_int)uq.all; } // ---- divti3.c ---- @@ -333,14 +522,9 @@ COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, ti_int* rem) { // Returns: a / b -#define fixint_t ti_int -#define fixuint_t tu_int -#define INT_DIV_SUFFIX divti3 -#define COMPUTE_UDIV(a, b) __udivmodti4((a), (b), (tu_int*)0) -#include "int_div_impl.inc" - COMPILER_RT_ABI ti_int __divti3(ti_int a, ti_int b) { - return __divXi3_divti3(a, b); + ti_int r; + return __divmodti4(a, b, &r); } // ---- modti3.c ---- @@ -348,12 +532,8 @@ COMPILER_RT_ABI ti_int __divti3(ti_int a, ti_int b) { // Returns: a % b -#define fixint_t ti_int -#define fixuint_t tu_int -#define INT_DIV_SUFFIX modti3 -#define ASSIGN_UMOD(res, a, b) __udivmodti4((a), (b), &(res)) -#include "int_div_impl.inc" - COMPILER_RT_ABI ti_int __modti3(ti_int a, ti_int b) { - return __modXi3_modti3(a, b); + ti_int r; + (void)__divmodti4(a, b, &r); + return r; } diff --git a/src/abi/abi_aapcs64.c b/src/abi/abi_aapcs64.c @@ -20,6 +20,23 @@ static void classify_scalar(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out) { ABITypeInfo ti = abi_internal_type_info(a, t); + if (ti.scalar_kind == ABI_SC_INT && ti.size == 16) { + ABIArgPart* parts = arena_array(a->c->tu, ABIArgPart, 2); + memset(parts, 0, sizeof(ABIArgPart) * 2); + for (u32 i = 
0; i < 2; ++i) { + parts[i].cls = ABI_CLASS_INT; + parts[i].loc = ABI_LOC_REG; + parts[i].size = 8; + parts[i].align = 8; + parts[i].src_offset = i * 8; + } + out->kind = ABI_ARG_DIRECT; + out->flags = ABI_AF_NONE; + out->parts = parts; + out->nparts = 2; + out->indirect_align = 0; + return; + } out->kind = ABI_ARG_DIRECT; out->flags = ABI_AF_NONE; out->indirect_align = 0; diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c @@ -20,7 +20,8 @@ static void classify_scalar(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out) { ABITypeInfo ti = abi_internal_type_info(a, t); - if (ti.scalar_kind == ABI_SC_FLOAT && ti.size == 16) { + if (ti.size == 16 && + (ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT)) { ABIArgPart* parts = arena_array(a->c->tu, ABIArgPart, 2); memset(parts, 0, sizeof(ABIArgPart) * 2); parts[0].cls = ABI_CLASS_INT; diff --git a/src/api/cg.c b/src/api/cg.c @@ -1303,6 +1303,17 @@ static int api_is_f128_type(Compiler *c, CfreeCgTypeId ty) { return cg && cg->kind == CFREE_CG_TYPE_FLOAT && cg->fp.width == 128; } +static int api_is_i128_type(Compiler *c, CfreeCgTypeId ty) { + const CgType *cg; + ty = api_unalias_type(c, ty); + cg = cg_type_get(c, ty); + return cg && cg->kind == CFREE_CG_TYPE_INT && cg->integer.width == 128; +} + +static int api_is_wide16_scalar_type(Compiler *c, CfreeCgTypeId ty) { + return api_is_f128_type(c, ty) || api_is_i128_type(c, ty); +} + static Operand api_op_imm(i64 v, CfreeCgTypeId ty) { Operand o; memset(&o, 0, sizeof o); @@ -2105,7 +2116,7 @@ static void api_release_arg_storage(CfreeCg *g, Operand *storage) { api_free_reg(g, storage->v.reg, storage->cls); } else if (storage->kind == OPK_LOCAL && storage->cls < 3) { CfreeCgTypeId ty = storage->type; - if (cg_type_is_aggregate(g->c, ty) || api_is_f128_type(g->c, ty)) + if (cg_type_is_aggregate(g->c, ty) || api_is_wide16_scalar_type(g->c, ty)) return; api_return_spill_slot(g, storage->v.frame_slot, storage->cls); } else if (storage->kind == OPK_INDIRECT) { @@ -3189,7 
+3200,8 @@ static void api_encode_binary128_from_double(CfreeCg *g, double value, double d; u64 u; } in; - unsigned __int128 rep = 0; + u64 lo = 0; + u64 hi = 0; u64 frac; u32 sign; u32 exp; @@ -3198,12 +3210,13 @@ static void api_encode_binary128_from_double(CfreeCg *g, double value, exp = (u32)((in.u >> 52) & 0x7ffu); frac = in.u & 0x000fffffffffffffull; if (sign) - rep |= ((unsigned __int128)1) << 127; + hi |= 1ull << 63; if (exp == 0x7ffu) { - rep |= ((unsigned __int128)0x7fffu) << 112; + hi |= (u64)0x7fffu << 48; if (frac) { - rep |= ((unsigned __int128)frac) << (112u - 52u); - rep |= ((unsigned __int128)1) << 111; + lo |= (frac & 0xfu) << 60; + hi |= frac >> 4; + hi |= 1ull << 47; } } else if (exp != 0 || frac != 0) { i32 e; @@ -3219,12 +3232,20 @@ static void api_encode_binary128_from_double(CfreeCg *g, double value, } else { e = (i32)exp - 1023; } - rep |= ((unsigned __int128)(u32)(e + 16383)) << 112; - rep |= ((unsigned __int128)frac) << (112u - 52u); + hi |= (u64)(u32)(e + 16383) << 48; + lo |= (frac & 0xfu) << 60; + hi |= frac >> 4; } for (u32 i = 0; i < 16; ++i) { - u32 shift = g->c->target.big_endian ? (15u - i) * 8u : i * 8u; - out[i] = (u8)(rep >> shift); + if (g->c->target.big_endian) { + u64 lane = i < 8u ? hi : lo; + u32 shift = (7u - (i & 7u)) * 8u; + out[i] = (u8)(lane >> shift); + } else { + u64 lane = i < 8u ? 
lo : hi; + u32 shift = (i & 7u) * 8u; + out[i] = (u8)(lane >> shift); + } } } @@ -3238,8 +3259,8 @@ static ApiSValue api_make_f128_const(CfreeCg *g, double value, return api_make_lv(api_op_local(slot, ty), ty); } -static ApiSValue api_f128_materialize_lvalue(CfreeCg *g, ApiSValue *v, - CfreeCgTypeId ty) { +static ApiSValue api_wide16_materialize_lvalue(CfreeCg *g, ApiSValue *v, + CfreeCgTypeId ty) { if (v->op.kind == OPK_LOCAL || v->op.kind == OPK_INDIRECT) { v->type = ty; v->op.type = ty; @@ -3283,7 +3304,7 @@ static ApiSValue api_f128_materialize_lvalue(CfreeCg *g, ApiSValue *v, return api_make_lv(api_op_local(slot, ty), ty); } compiler_panic(g->c, g->cur_loc, - "CfreeCg: binary128 value is not addressable (kind %u, op %u)", + "CfreeCg: 16-byte scalar value is not addressable (kind %u, op %u)", (unsigned)v->kind, (unsigned)v->op.kind); return *v; } @@ -3339,7 +3360,7 @@ static int api_local_requires_memory(CfreeCg *g, CfreeCgTypeId ty, CfreeCgLocalAttrs attrs) { if (api_source_flags_addr_taken(attrs.flags)) return 1; - if (api_is_f128_type(g->c, ty)) + if (api_is_wide16_scalar_type(g->c, ty)) return 1; return !(cg_type_is_int(g->c, ty) || cg_type_is_float(g->c, ty) || cg_type_is_ptr(g->c, ty)); @@ -3805,7 +3826,7 @@ void cfree_cg_load(CfreeCg *g, CfreeCgMemAccess access) { return; } api_require_scalar_mem_type(g, "load", ty); - if (api_is_f128_type(g->c, ty)) { + if (api_is_wide16_scalar_type(g->c, ty)) { v.type = ty; v.op.type = ty; api_push(g, v); @@ -3968,7 +3989,7 @@ void cfree_cg_store(CfreeCg *g, CfreeCgMemAccess access) { return; } api_validate_memory_value(g, "store", ty, api_sv_type(&rv)); - if (api_is_f128_type(g->c, ty)) { + if (api_is_wide16_scalar_type(g->c, ty)) { if (lv.source_local != CFREE_CG_LOCAL_NONE) { api_local_const_clear(api_local_from_handle(g, lv.source_local)); } else if (lv.op.kind == OPK_INDIRECT || lv.op.kind == OPK_GLOBAL || @@ -3986,6 +4007,35 @@ void cfree_cg_store(CfreeCg *g, CfreeCgMemAccess access) { T->copy_bytes(T, dst_addr, 
src_addr, agg); api_free_reg(g, dst_addr.v.reg, RC_INT); api_free_reg(g, src_addr.v.reg, RC_INT); + } else if (rv.op.kind == OPK_IMM) { + u8 bytes[16]; + u64 lo = (u64)rv.op.v.imm; + u64 hi = rv.op.v.imm < 0 ? ~(u64)0 : 0; + memset(bytes, 0, sizeof bytes); + for (u32 i = 0; i < 8; ++i) { + u32 lo_idx = g->c->target.big_endian ? 15u - i : i; + u32 hi_idx = g->c->target.big_endian ? 7u - i : 8u + i; + bytes[lo_idx] = (u8)(lo >> (i * 8u)); + bytes[hi_idx] = (u8)(hi >> (i * 8u)); + } + if (lv.op.kind == OPK_LOCAL) { + api_store_f128_bytes(g, lv.op.v.frame_slot, ty, bytes); + } else { + FrameSlot slot = api_f128_temp_slot(g, ty); + ApiSValue tmp = api_make_lv(api_op_local(slot, ty), ty); + CfreeCgTypeId ptr_ty = cg_type_ptr_to(g->c, ty); + Operand dst_addr = api_lvalue_addr(g, &lv, ptr_ty); + Operand src_addr; + AggregateAccess agg; + api_store_f128_bytes(g, slot, ty, bytes); + src_addr = api_lvalue_addr(g, &tmp, ptr_ty); + memset(&agg, 0, sizeof agg); + agg.size = 16; + agg.align = access.align ? access.align : 16; + T->copy_bytes(T, dst_addr, src_addr, agg); + api_free_reg(g, dst_addr.v.reg, RC_INT); + api_free_reg(g, src_addr.v.reg, RC_INT); + } } else { src = api_force_reg(g, &rv, ty); T->store(T, lv.op, src, api_mem_from_access(g, &lv.op, access)); @@ -4113,6 +4163,12 @@ void cfree_cg_rot3(CfreeCg *g) { * Arithmetic / compare / convert * ============================================================ */ +static const char *api_i128_binop_helper(BinOp op); +static int api_i128_cmp_is_unsigned(CmpOp op); +static void api_cg_cmp(CfreeCg *g, CmpOp cop); +static void api_f128_call_unary(CfreeCg *g, const char *name, + CfreeCgTypeId ret, CfreeCgTypeId param); + static void api_cg_binop(CfreeCg *g, BinOp iop, u32 flags) { ApiSValue b, a; CGTarget *T; @@ -4129,6 +4185,22 @@ static void api_cg_binop(CfreeCg *g, BinOp iop, u32 flags) { a = api_pop(g); ty = a.type ? 
a.type : b.type; + if (api_is_i128_type(g->c, ty)) { + CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128); + CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32); + CfreeCgTypeId ps[2]; + ApiSValue args[2]; + const char *name = api_i128_binop_helper(iop); + if (!name) + compiler_panic(g->c, g->cur_loc, "CfreeCg: i128 binop unsupported"); + args[0] = a; + args[1] = b; + ps[0] = i128; + ps[1] = (iop == BO_SHL || iop == BO_SHR_U || iop == BO_SHR_S) ? i32 : i128; + api_runtime_call_values(g, name, i128, ps, 2, args); + return; + } + if (!flags && api_sv_op_is(&a, OPK_IMM) && api_sv_op_is(&b, OPK_IMM) && api_try_fold_int_binop(g, iop, ty, a.op.v.imm, b.op.v.imm, &folded)) { api_release(g, &a); @@ -4195,6 +4267,27 @@ static void api_cg_unop(CfreeCg *g, UnOp iop, u32 flags) { a = api_pop(g); ty = a.type ? a.type : a.op.type; + if (api_is_i128_type(g->c, ty)) { + CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128); + CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32); + if (iop == UO_NEG || iop == UO_BNOT) { + const char *name = (iop == UO_NEG) ? "__negti2" : "__cfree_notti3"; + api_push(g, a); + api_f128_call_unary(g, name, i128, i128); + return; + } + if (iop == UO_NOT) { + CfreeCgTypeId ps[2] = {i128, i128}; + ApiSValue args[2]; + args[0] = a; + args[1] = api_make_sv(api_op_imm(0, i128), i128); + api_runtime_call_values(g, "__cfree_ucmpti2", i32, ps, 2, args); + cfree_cg_push_int(g, 0, i32); + api_cg_cmp(g, CMP_EQ); + return; + } + } + if (!flags && api_sv_op_is(&a, OPK_IMM) && api_try_fold_int_unop(g, iop, ty, a.op.v.imm, &folded)) { api_release(g, &a); @@ -4242,6 +4335,34 @@ static void api_cg_cmp(CfreeCg *g, CmpOp cop) { opty = a.type ? a.type : b.type; i32 = builtin_id(CFREE_CG_BUILTIN_I32); + if (api_is_i128_type(g->c, opty)) { + CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128); + CfreeCgTypeId ps[2] = {i128, i128}; + ApiSValue args[2]; + CmpOp icmp = CMP_EQ; + const char *name = api_i128_cmp_is_unsigned(cop) ? 
"__cfree_ucmpti2" + : "__cfree_cmpti2"; + switch (cop) { + case CMP_EQ: icmp = CMP_EQ; break; + case CMP_NE: icmp = CMP_NE; break; + case CMP_LT_S: + case CMP_LT_U: icmp = CMP_LT_S; break; + case CMP_LE_S: + case CMP_LE_U: icmp = CMP_LE_S; break; + case CMP_GT_S: + case CMP_GT_U: icmp = CMP_GT_S; break; + case CMP_GE_S: + case CMP_GE_U: icmp = CMP_GE_S; break; + default: icmp = CMP_EQ; break; + } + args[0] = a; + args[1] = b; + api_runtime_call_values(g, name, i32, ps, 2, args); + cfree_cg_push_int(g, 0, i32); + api_cg_cmp(g, icmp); + return; + } + if (api_sv_op_is(&a, OPK_IMM) && api_sv_op_is(&b, OPK_IMM) && api_try_fold_int_cmp(g, cop, opty, a.op.v.imm, b.op.v.imm, &folded)) { api_release(g, &a); @@ -4294,6 +4415,118 @@ static void api_cg_convert_kind(CfreeCg *g, CfreeCgTypeId dst_type, api_push(g, v); return; } + if (api_is_i128_type(g->c, sty) && api_type_is_bool(g->c, dty) && + ck != CV_BITCAST) { + CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128); + CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32); + CfreeCgTypeId ps[2] = {i128, i128}; + ApiSValue args[2]; + ApiSValue r; + args[0] = v; + args[1] = api_make_sv(api_op_imm(0, i128), i128); + api_runtime_call_values(g, "__cfree_ucmpti2", i32, ps, 2, args); + cfree_cg_push_int(g, 0, i32); + api_cg_cmp(g, CMP_NE); + r = api_pop(g); + r.type = dty; + r.op.type = dty; + api_push(g, r); + return; + } + if (api_is_i128_type(g->c, dty) && !api_is_i128_type(g->c, sty) && + ck != CV_BITCAST) { + u32 sz = (u32)abi_cg_sizeof(g->c->abi, sty); + CfreeCgTypeId i64_ty = builtin_id(CFREE_CG_BUILTIN_I64); + FrameSlot slot = api_f128_temp_slot(g, dty); + Operand dst_lv = api_op_local(slot, dty); + if (api_sv_op_is(&v, OPK_IMM)) { + u8 bytes[16]; + u64 lo = (u64)v.op.v.imm; + u64 hi = 0; + if (ck == CV_SEXT && sz <= 8) { + u32 bits = sz * 8u; + u64 mask = bits >= 64u ? 
~(u64)0 : ((1ull << bits) - 1ull); + u64 sign = 1ull << (bits - 1u); + u64 u = lo & mask; + if (u & sign) + u |= ~mask; + lo = u; + hi = (u & (1ull << 63)) ? ~(u64)0 : 0; + } + memset(bytes, 0, sizeof bytes); + for (u32 i = 0; i < 8; ++i) { + u32 lo_idx = g->c->target.big_endian ? 15u - i : i; + u32 hi_idx = g->c->target.big_endian ? 7u - i : 8u + i; + bytes[lo_idx] = (u8)(lo >> (i * 8u)); + bytes[hi_idx] = (u8)(hi >> (i * 8u)); + } + api_store_f128_bytes(g, slot, dty, bytes); + api_release(g, &v); + api_push(g, api_make_lv(dst_lv, dty)); + return; + } + { + CfreeCgTypeId ptr_ty = cg_type_ptr_to(g->c, dty); + CfreeCgTypeId src_ty = sty; + Operand src = api_force_reg(g, &v, sty); + Operand low = src; + Operand base; + Reg low_tmp = REG_NONE; + Reg ar; + MemAccess ma; + memset(&ma, 0, sizeof ma); + ma.type = i64_ty; + ma.size = 8; + ma.align = 8; + if (sz < 8) { + low_tmp = api_alloc_reg_or_spill(g, RC_INT, i64_ty); + low = api_op_reg(low_tmp, i64_ty); + T->convert(T, ck == CV_SEXT ? CV_SEXT : CV_ZEXT, low, src); + src_ty = i64_ty; + } else { + low.type = i64_ty; + } + ar = api_alloc_reg_or_spill(g, RC_INT, ptr_ty); + base = api_op_reg(ar, ptr_ty); + T->addr_of(T, base, dst_lv); + T->store(T, api_op_indirect(ar, 0, i64_ty), low, ma); + if (ck == CV_SEXT) { + Reg hr = api_alloc_reg_or_spill(g, RC_INT, i64_ty); + Operand high = api_op_reg(hr, i64_ty); + T->binop(T, BO_SHR_S, high, low, api_op_imm(63, i64_ty)); + T->store(T, api_op_indirect(ar, 8, i64_ty), high, ma); + api_free_reg(g, hr, RC_INT); + } else { + T->store(T, api_op_indirect(ar, 8, i64_ty), api_op_imm(0, i64_ty), ma); + } + if (low_tmp != REG_NONE) + api_free_reg(g, low_tmp, RC_INT); + (void)src_ty; + api_free_reg(g, ar, RC_INT); + api_release(g, &v); + api_push(g, api_make_lv(dst_lv, dty)); + } + return; + } + if (api_is_i128_type(g->c, sty) && !api_is_i128_type(g->c, dty) && + ck == CV_TRUNC && abi_cg_sizeof(g->c->abi, dty) <= 8) { + Reg rr = api_alloc_reg_or_spill(g, RC_INT, dty); + Operand dst = 
api_op_reg(rr, dty); + if (api_is_lvalue_sv(&v) || v.op.kind == OPK_LOCAL || + v.op.kind == OPK_INDIRECT || v.op.kind == OPK_GLOBAL) { + ApiSValue lv = v; + lv.lvalue = 1; + T->load(T, dst, lv.op, api_mem_for_lvalue(g, &lv.op, dty)); + } else if (v.op.kind == OPK_IMM) { + T->load_imm(T, dst, v.op.v.imm); + } else { + compiler_panic(g->c, g->cur_loc, + "CfreeCg: unsupported i128 truncation source"); + } + api_release(g, &v); + api_push(g, api_make_sv(dst, dty)); + return; + } if (ck == CV_BITCAST && abi_cg_sizeof(g->c->abi, sty) == abi_cg_sizeof(g->c->abi, dst_type) && api_type_class(sty) == api_type_class(dty)) { @@ -4369,6 +4602,34 @@ void cfree_cg_int_cmp(CfreeCg *g, CfreeCgIntCmpOp op) { api_cg_cmp(g, api_map_int_cmp(op)); } +static const char *api_i128_binop_helper(BinOp op) { + switch (op) { + case BO_IADD: return "__cfree_addti3"; + case BO_ISUB: return "__cfree_subti3"; + case BO_IMUL: return "__multi3"; + case BO_SDIV: return "__divti3"; + case BO_UDIV: return "__udivti3"; + case BO_SREM: return "__modti3"; + case BO_UREM: return "__umodti3"; + case BO_AND: return "__cfree_andti3"; + case BO_OR: return "__cfree_orti3"; + case BO_XOR: return "__cfree_xorti3"; + case BO_SHL: return "__ashlti3"; + case BO_SHR_U: return "__lshrti3"; + case BO_SHR_S: return "__ashrti3"; + case BO_FADD: + case BO_FSUB: + case BO_FMUL: + case BO_FDIV: + default: + return NULL; + } +} + +static int api_i128_cmp_is_unsigned(CmpOp op) { + return op == CMP_LT_U || op == CMP_LE_U || op == CMP_GT_U || op == CMP_GE_U; +} + static const char *api_f128_binop_helper(CfreeCgFpBinOp op) { switch (op) { case CFREE_CG_FP_ADD: return "__addtf3"; @@ -5380,6 +5641,19 @@ static void api_branch_if(CfreeCg *g, ApiSValue *v, int branch_when_true, api_release(g, v); return; } + if (api_is_i128_type(g->c, ty)) { + CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128); + CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32); + CfreeCgTypeId ps[2] = {i128, i128}; + ApiSValue args[2]; + ApiSValue cmp; + 
args[0] = *v; + args[1] = api_make_sv(api_op_imm(0, i128), i128); + api_runtime_call_values(g, "__cfree_ucmpti2", i32, ps, 2, args); + cmp = api_pop(g); + api_branch_if(g, &cmp, branch_when_true, label); + return; + } { Operand a = api_force_reg(g, v, ty); Operand zero = api_op_imm(0, ty); @@ -6112,8 +6386,8 @@ void cfree_cg_call(CfreeCg *g, uint32_t nargs, CfreeCgTypeId fn_type, avs[idx].type = aty; avs[idx].abi = is_vararg ? NULL : &abi->params[idx]; int is_aggregate = cg_type_is_aggregate(g->c, aty); - if (api_is_f128_type(g->c, aty)) { - ApiSValue lv = api_f128_materialize_lvalue(g, &arg, aty); + if (api_is_wide16_scalar_type(g->c, aty)) { + ApiSValue lv = api_wide16_materialize_lvalue(g, &arg, aty); avs[idx].storage = lv.op; avs[idx].storage.type = aty; avs[idx].size = 16; @@ -6150,14 +6424,14 @@ void cfree_cg_call(CfreeCg *g, uint32_t nargs, CfreeCgTypeId fn_type, if (has_result) { int ret_is_aggregate = cg_type_is_aggregate(g->c, ret_ty); - if (ret_is_aggregate || api_is_f128_type(g->c, ret_ty)) { + if (ret_is_aggregate || api_is_wide16_scalar_type(g->c, ret_ty)) { FrameSlotDesc fsd; memset(&fsd, 0, sizeof fsd); fsd.type = ret_ty; fsd.size = abi_cg_sizeof(g->c->abi, ret_ty); fsd.align = abi_cg_alignof(g->c->abi, ret_ty); fsd.kind = FS_LOCAL; - if (ret_is_aggregate || api_is_f128_type(g->c, ret_ty)) + if (ret_is_aggregate || api_is_wide16_scalar_type(g->c, ret_ty)) fsd.flags = FSF_ADDR_TAKEN; FrameSlot ret_slot = T->frame_slot(T, &fsd); desc.ret.storage = api_op_local(ret_slot, ret_ty); @@ -6300,8 +6574,8 @@ static void api_call_symbol_common(CfreeCg *g, CfreeCgSym sym, uint32_t nargs, aty = arg.type; avs[idx].type = aty; avs[idx].abi = is_vararg ? 
NULL : &abi->params[idx]; - if (api_is_f128_type(g->c, aty)) { - ApiSValue lv = api_f128_materialize_lvalue(g, &arg, aty); + if (api_is_wide16_scalar_type(g->c, aty)) { + ApiSValue lv = api_wide16_materialize_lvalue(g, &arg, aty); avs[idx].storage = lv.op; avs[idx].storage.type = aty; avs[idx].size = 16; @@ -6330,7 +6604,8 @@ static void api_call_symbol_common(CfreeCg *g, CfreeCgSym sym, uint32_t nargs, desc.ret.type = ret_ty; desc.ret.abi = &abi->ret; if (has_result) { - if (cg_type_is_aggregate(g->c, ret_ty) || api_is_f128_type(g->c, ret_ty)) { + if (cg_type_is_aggregate(g->c, ret_ty) || + api_is_wide16_scalar_type(g->c, ret_ty)) { FrameSlotDesc fsd; FrameSlot ret_slot; memset(&fsd, 0, sizeof fsd); @@ -6398,8 +6673,8 @@ void cfree_cg_ret(CfreeCg *g) { T->ret(T, &av); return; } - if (api_is_f128_type(g->c, rty)) { - ApiSValue lv = api_f128_materialize_lvalue(g, &v, rty); + if (api_is_wide16_scalar_type(g->c, rty)) { + ApiSValue lv = api_wide16_materialize_lvalue(g, &v, rty); av.storage = lv.op; av.storage.type = rty; av.size = 16; diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c @@ -100,6 +100,11 @@ static RelocKind ldst_lo12_reloc_for(u32 nbytes) { static void aa_emit_ldr_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn, i32 off) { + if (off < -256 || off > 255) { + aa64_emit_addr_adjust(mc, AA_TMP0, rn, off); + rn = AA_TMP0; + off = 0; + } if (sidx == 4) aa64_emit32(mc, aa64_ldur_q(rt, rn, off)); else @@ -108,6 +113,11 @@ static void aa_emit_ldr_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn, static void aa_emit_str_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn, i32 off) { + if (off < -256 || off > 255) { + aa64_emit_addr_adjust(mc, AA_TMP0, rn, off); + rn = AA_TMP0; + off = 0; + } if (sidx == 4) aa64_emit32(mc, aa64_stur_q(rt, rn, off)); else @@ -1271,7 +1281,7 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) { u32 sidx = size_idx_for_bytes(p->size); i32 off = base_off + (i32)p->src_offset; if (p->cls == ABI_CLASS_INT) { - aa64_emit32(mc, 
aa64_stur(sidx, src_reg, base_reg, off)); + aa64_emit_stur_off(mc, sidx, src_reg, base_reg, off, AA_TMP0); } else { aa_emit_str_fp_any(mc, sidx, src_reg, base_reg, off); } diff --git a/src/arch/rv64/alloc.c b/src/arch/rv64/alloc.c @@ -368,8 +368,9 @@ void rv_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a_op, RImpl* a = impl_of(t); u32 rd = reg_num(dst); - if (op == CMP_EQ || op == CMP_NE || op == CMP_LT_F || op == CMP_LE_F || - op == CMP_GT_F || op == CMP_GE_F) { + if ((a_op.cls == RC_FP || b_op.cls == RC_FP) && + (op == CMP_EQ || op == CMP_NE || op == CMP_LT_F || op == CMP_LE_F || + op == CMP_GT_F || op == CMP_GE_F)) { /* FP compare in fa,fb → rd. Use FLT/FLE/FEQ depending on op. */ int is_d = type_is_fp_double(a_op.type); u32 fa = reg_num(a_op); diff --git a/src/arch/rv64/emit.c b/src/arch/rv64/emit.c @@ -24,8 +24,8 @@ static u32 rv_planned_prologue_words(const RImpl *a) { u32 n = RV_PROLOGUE_FRAME_WORDS; if (a->has_sret) ++n; if (a->is_variadic) n += 8u; - n += count_mask_regs(a->planned_cs_int_mask, 18u, 27u); - n += count_mask_regs(a->planned_cs_fp_mask, 18u, 27u); + n += 4u * count_mask_regs(a->planned_cs_int_mask, 18u, 27u); + n += 4u * count_mask_regs(a->planned_cs_fp_mask, 18u, 27u); return n ? 
n : 1u; } @@ -250,6 +250,71 @@ static u32 rv_variadic_first_saved_int(const CGFuncDesc *fd) { return next_int; } +static void rv_words_addr_adjust(CGTarget *t, u32 *words, u32 cap, u32 *wi, + u32 rd, u32 base, i32 off) { + if (off == 0) { + if (rd != base) { + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_addi(rd, base, 0); + } + return; + } + if (off >= -2048 && off <= 2047) { + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_addi(rd, base, off); + return; + } + i32 hi = (i32)(((i64)off + 0x800) >> 12); + i32 lo = off - (hi << 12); + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_lui(rd, (u32)hi & 0xfffffu); + if (lo) { + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_addiw(rd, rd, lo); + } + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_add(rd, base, rd); + return; + +overflow: + compiler_panic(t->c, impl_of(t)->loc, + "rv64: prologue placeholder too small (cap %u)", cap); +} + +static void rv_words_store_int_s0(CGTarget *t, u32 *words, u32 cap, u32 *wi, + u32 reg, i32 off) { + if (off >= -2048 && off <= 2047) { + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_sd(reg, RV_S0, off); + return; + } + rv_words_addr_adjust(t, words, cap, wi, RV_T0, RV_S0, off); + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_sd(reg, RV_T0, 0); + return; + +overflow: + compiler_panic(t->c, impl_of(t)->loc, + "rv64: prologue placeholder too small (cap %u)", cap); +} + +static void rv_words_store_fp_s0(CGTarget *t, u32 *words, u32 cap, u32 *wi, + u32 reg, i32 off) { + if (off >= -2048 && off <= 2047) { + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_fsd(reg, RV_S0, off); + return; + } + rv_words_addr_adjust(t, words, cap, wi, RV_T0, RV_S0, off); + if (*wi >= cap) goto overflow; + words[(*wi)++] = rv_fsd(reg, RV_T0, 0); + return; + +overflow: + compiler_panic(t->c, impl_of(t)->loc, + "rv64: prologue placeholder too small (cap %u)", cap); +} + static u32 rv_build_prologue(CGTarget *t, u32 *words, u32 cap, const RvFrameLayout *fl, const u32 
*int_regs, u32 n_int_saves, const u32 *fp_regs, @@ -277,14 +342,26 @@ static u32 rv_build_prologue(CGTarget *t, u32 *words, u32 cap, words[wi++] = rv_add(RV_SP, RV_SP, RV_T0); } - if ((i32)fl->fp_pair_off > 2047 || - (i32)(fl->fp_pair_off + 8) > 2047) { - compiler_panic(t->c, a->loc, "rv64: fp_pair_off out of imm12 range"); + if ((i32)fl->fp_pair_off <= 2039) { + if (wi + 3 > cap) goto overflow; + words[wi++] = rv_sd(RV_S0, RV_SP, (i32)fl->fp_pair_off); + words[wi++] = rv_sd(RV_RA, RV_SP, (i32)fl->fp_pair_off + 8); + words[wi++] = rv_addi(RV_S0, RV_SP, (i32)fl->fp_pair_off); + } else { + i32 off = (i32)fl->fp_pair_off; + i32 hi = (i32)(((i64)off + 0x800) >> 12); + i32 lo = off - (hi << 12); + if (fl->fp_pair_off > 0x7fffffffu) + compiler_panic(t->c, a->loc, "rv64: fp_pair_off too large"); + if (wi + 6 > cap) goto overflow; + words[wi++] = rv_lui(RV_T0, (u32)hi & 0xfffffu); + if (lo) + words[wi++] = rv_addiw(RV_T0, RV_T0, lo); + words[wi++] = rv_add(RV_T0, RV_SP, RV_T0); + words[wi++] = rv_sd(RV_S0, RV_T0, 0); + words[wi++] = rv_sd(RV_RA, RV_T0, 8); + words[wi++] = rv_addi(RV_S0, RV_T0, 0); } - if (wi + 3 > cap) goto overflow; - words[wi++] = rv_sd(RV_S0, RV_SP, (i32)fl->fp_pair_off); - words[wi++] = rv_sd(RV_RA, RV_SP, (i32)fl->fp_pair_off + 8); - words[wi++] = rv_addi(RV_S0, RV_SP, (i32)fl->fp_pair_off); /* If sret, spill incoming a0 into the hidden slot. 
*/ if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) { @@ -304,14 +381,12 @@ static u32 rv_build_prologue(CGTarget *t, u32 *words, u32 cap, for (u32 i = 0; i < n_int_saves; ++i) { u32 r = int_regs[i]; i32 off = fl->int_save_base - 8 * (i32)i; - if (wi >= cap) goto overflow; - words[wi++] = rv_sd(r, RV_S0, off); + rv_words_store_int_s0(t, words, cap, &wi, r, off); } for (u32 i = 0; i < n_fp_saves; ++i) { u32 r = fp_regs[i]; i32 off = fl->fp_save_base - 8 * (i32)i; - if (wi >= cap) goto overflow; - words[wi++] = rv_fsd(r, RV_S0, off); + rv_words_store_fp_s0(t, words, cap, &wi, r, off); } return wi; @@ -424,23 +499,30 @@ void rv_func_end(CGTarget *t) { for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) { u32 r = int_regs[i]; i32 off = fl.int_save_base - 8 * (i32)i; - rv64_emit32(mc, rv_ld(r, RV_S0, off)); + if (off >= -2048 && off <= 2047) { + rv64_emit32(mc, rv_ld(r, RV_S0, off)); + } else { + rv64_emit_addr_adjust(mc, RV_T0, RV_S0, off); + rv64_emit32(mc, rv_ld(r, RV_T0, 0)); + } } for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) { u32 r = fp_regs[i]; i32 off = fl.fp_save_base - 8 * (i32)i; - rv64_emit32(mc, rv_fld(r, RV_S0, off)); + if (off >= -2048 && off <= 2047) { + rv64_emit32(mc, rv_fld(r, RV_S0, off)); + } else { + rv64_emit_addr_adjust(mc, RV_T0, RV_S0, off); + rv64_emit32(mc, rv_fld(r, RV_T0, 0)); + } } /* Restore sp from s0 first so alloca-induced offsets don't matter. * After this, sp == its post-prologue value. 
*/ if (a->has_alloca) { - if ((i32)fl.fp_pair_off > 2047) { - compiler_panic(t->c, a->loc, "rv64: fp_pair_off too large for alloca"); - } - rv64_emit32(mc, rv_addi(RV_SP, RV_S0, -(i32)fl.fp_pair_off)); + rv64_emit_addr_adjust(mc, RV_SP, RV_S0, -(i32)fl.fp_pair_off); } - rv64_emit32(mc, rv_ld(RV_S0, RV_SP, (i32)fl.fp_pair_off)); - rv64_emit32(mc, rv_ld(RV_RA, RV_SP, (i32)fl.fp_pair_off + 8)); + rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8)); + rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0)); emit_sp_addi(mc, (i64)fl.frame_size); rv64_emit32(mc, rv_ret_()); diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h @@ -11,8 +11,8 @@ #include "core/pool.h" #include "obj/obj.h" -#define RV_PROLOGUE_WORDS 35u -#define RV_PROLOGUE_FRAME_WORDS 6u /* worst-case sp adjust + s0/ra + set s0 */ +#define RV_PROLOGUE_WORDS 128u +#define RV_PROLOGUE_FRAME_WORDS 10u /* sp adjust + far/near s0/ra save + set s0 */ /* ---- RvSlot / RvScope ---- */ typedef struct RvSlot { diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c @@ -804,6 +804,41 @@ static void rv_store_stack_reg(CGTarget* t, u32 reg, RegClass cls, rv_store(t, addr, src, ma); } +static Operand rv_offset_mem_operand(CGTarget* t, Operand op, u32 offset) { + if (!offset) return op; + if (op.kind == OPK_INDIRECT) { + op.v.ind.ofs += (i32)offset; + } else if (op.kind == OPK_LOCAL) { + RImpl* a = impl_of(t); + RvSlot* s = rv64_slot_get(a, op.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "rv64 offset operand: bad slot"); + op.kind = OPK_INDIRECT; + op.v.ind.base = RV_S0; + op.v.ind.ofs = -(i32)s->off + (i32)offset; + } + return op; +} + +static void rv_load_abi_part(CGTarget* t, Operand dst, Operand src, u32 offset, + u32 size) { + MemAccess ma; + memset(&ma, 0, sizeof ma); + ma.type = dst.type; + ma.size = size; + ma.align = size ? 
size : 1u; + rv_load(t, dst, rv_offset_mem_operand(t, src, offset), ma); +} + +static void rv_store_abi_part(CGTarget* t, Operand dst, Operand src, + u32 offset, u32 size) { + MemAccess ma; + memset(&ma, 0, sizeof ma); + ma.type = src.type; + ma.size = size; + ma.align = size ? size : 1u; + rv_store(t, rv_offset_mem_operand(t, dst, offset), src, ma); +} + static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, u32* next_fp, u32* stack_off, int tail) { RImpl* a = impl_of(t); @@ -890,18 +925,15 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, break; } case OPK_LOCAL: { - RvSlot* s = rv64_slot_get(a, av->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad arg slot"); - i32 off = -(i32)s->off + (i32)pt->src_offset; - rv64_emit32(mc, enc_int_load(sz, 0, dst_reg, RV_S0, off)); + Operand dst = {.kind = OPK_REG, .cls = RC_INT, .type = av->type}; + dst.v.reg = dst_reg; + rv_load_abi_part(t, dst, av->storage, pt->src_offset, sz); break; } case OPK_INDIRECT: { - /* cg holds INDIRECT base regs in s2..s11, disjoint from arg - * regs a0..a7 and the t0 stack-arg scratch. */ - u32 base = av->storage.v.ind.base & 0x1fu; - i32 off = av->storage.v.ind.ofs + (i32)pt->src_offset; - rv64_emit32(mc, enc_int_load(sz, 0, dst_reg, base, off)); + Operand dst = {.kind = OPK_REG, .cls = RC_INT, .type = av->type}; + dst.v.reg = dst_reg; + rv_load_abi_part(t, dst, av->storage, pt->src_offset, sz); break; } default: @@ -925,18 +957,15 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, break; } case OPK_LOCAL: { - RvSlot* s = rv64_slot_get(a, av->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad FP arg slot"); - i32 off = -(i32)s->off + (i32)pt->src_offset; - rv64_emit32(mc, (sz == 8) ? 
rv_fld(freg, RV_S0, off) - : rv_flw(freg, RV_S0, off)); + Operand dst = {.kind = OPK_REG, .cls = RC_FP, .type = av->type}; + dst.v.reg = freg; + rv_load_abi_part(t, dst, av->storage, pt->src_offset, sz); break; } case OPK_INDIRECT: { - u32 base = av->storage.v.ind.base & 0x1fu; - i32 off = av->storage.v.ind.ofs + (i32)pt->src_offset; - rv64_emit32(mc, (sz == 8) ? rv_fld(freg, base, off) - : rv_flw(freg, base, off)); + Operand dst = {.kind = OPK_REG, .cls = RC_FP, .type = av->type}; + dst.v.reg = freg; + rv_load_abi_part(t, dst, av->storage, pt->src_offset, sz); break; } default: @@ -950,15 +979,14 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, *stack_off, tail); break; case OPK_LOCAL: { - RvSlot* s = rv64_slot_get(a, av->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad FP arg slot"); - i32 off = -(i32)s->off + (i32)pt->src_offset; + Operand tmp = {.kind = OPK_REG, .cls = RC_FP, .type = av->type}; + tmp.v.reg = 0u; if (sz == 8) { - rv64_emit32(mc, rv_fld(/*ft0=*/0u, RV_S0, off)); + rv_load_abi_part(t, tmp, av->storage, pt->src_offset, sz); rv_store_stack_reg(t, /*ft0=*/0u, RC_FP, av->type, sz, *stack_off, tail); } else { - rv64_emit32(mc, rv_flw(/*ft0=*/0u, RV_S0, off)); + rv_load_abi_part(t, tmp, av->storage, pt->src_offset, sz); rv_store_stack_reg(t, /*ft0=*/0u, RC_FP, av->type, sz, *stack_off, tail); } @@ -967,14 +995,14 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, case OPK_INDIRECT: { /* Route through ft0 — it is in {ft0..ft7}, caller-saved * scratch outside the cg fs2..fs11 pool. 
*/ - u32 base = av->storage.v.ind.base & 0x1fu; - i32 off = av->storage.v.ind.ofs + (i32)pt->src_offset; + Operand tmp = {.kind = OPK_REG, .cls = RC_FP, .type = av->type}; + tmp.v.reg = 0u; if (sz == 8) { - rv64_emit32(mc, rv_fld(/*ft0=*/0u, base, off)); + rv_load_abi_part(t, tmp, av->storage, pt->src_offset, sz); rv_store_stack_reg(t, /*ft0=*/0u, RC_FP, av->type, sz, *stack_off, tail); } else { - rv64_emit32(mc, rv_flw(/*ft0=*/0u, base, off)); + rv_load_abi_part(t, tmp, av->storage, pt->src_offset, sz); rv_store_stack_reg(t, /*ft0=*/0u, RC_FP, av->type, sz, *stack_off, tail); } @@ -1089,19 +1117,28 @@ static void rv_tail_restore_frame(CGTarget* t) { if (a->omit_frame) return; for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) { - rv64_emit32(mc, rv_ld(int_regs[i], RV_S0, fl.int_save_base - 8 * i)); + i32 off = fl.int_save_base - 8 * i; + if (off >= -2048 && off <= 2047) { + rv64_emit32(mc, rv_ld(int_regs[i], RV_S0, off)); + } else { + rv64_emit_addr_adjust(mc, RV_T0, RV_S0, off); + rv64_emit32(mc, rv_ld(int_regs[i], RV_T0, 0)); + } } for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) { - rv64_emit32(mc, rv_fld(fp_regs[i], RV_S0, fl.fp_save_base - 8 * i)); + i32 off = fl.fp_save_base - 8 * i; + if (off >= -2048 && off <= 2047) { + rv64_emit32(mc, rv_fld(fp_regs[i], RV_S0, off)); + } else { + rv64_emit_addr_adjust(mc, RV_T0, RV_S0, off); + rv64_emit32(mc, rv_fld(fp_regs[i], RV_T0, 0)); + } } if (a->has_alloca) { - if ((i32)fl.fp_pair_off > 2047) { - compiler_panic(t->c, a->loc, "rv64 tail call: fp pair offset too large"); - } - rv64_emit32(mc, rv_addi(RV_SP, RV_S0, -(i32)fl.fp_pair_off)); + rv64_emit_addr_adjust(mc, RV_SP, RV_S0, -(i32)fl.fp_pair_off); } - rv64_emit32(mc, rv_ld(RV_S0, RV_SP, (i32)fl.fp_pair_off)); - rv64_emit32(mc, rv_ld(RV_RA, RV_SP, (i32)fl.fp_pair_off + 8)); + rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8)); + rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0)); emit_sp_addi(mc, (i64)fl.frame_size); } @@ -1208,23 +1245,15 @@ static void rv_call(CGTarget* t, const 
CGCallDesc* d) { rv64_emit32(mc, rv_fsgnj(fmt, reg_num(rs), src_reg, src_reg)); } } else if (rs.kind == OPK_LOCAL || rs.kind == OPK_INDIRECT) { - u32 base_reg; - i32 base_off; - if (rs.kind == OPK_LOCAL) { - RvSlot* s = rv64_slot_get(a, rs.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad ret slot"); - base_reg = RV_S0; - base_off = -(i32)s->off; - } else { - base_reg = rs.v.ind.base & 0x1fu; - base_off = rs.v.ind.ofs; - } - i32 off = base_off + (i32)p->src_offset; - if (p->cls == ABI_CLASS_INT) { - rv64_emit32(mc, enc_int_store(p->size, src_reg, base_reg, off)); + Operand src = {.kind = OPK_REG, + .cls = (u8)((p->cls == ABI_CLASS_FP) ? RC_FP : RC_INT), + .type = d->ret.type}; + src.v.reg = src_reg; + if (p->cls == ABI_CLASS_INT || p->cls == ABI_CLASS_FP) { + rv_store_abi_part(t, rs, src, p->src_offset, p->size); } else { - if (p->size == 8) rv64_emit32(mc, rv_fsd(src_reg, base_reg, off)); - else rv64_emit32(mc, rv_fsw(src_reg, base_reg, off)); + compiler_panic(t->c, a->loc, "rv64 call: ret part cls %d unimpl", + (int)p->cls); } } else if (rs.kind == OPK_IMM && rs.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID)) { /* void return placeholder — nothing to do. */ @@ -1407,28 +1436,19 @@ static void rv_ret(CGTarget* t, const CGABIValue* val) { rv64_emit_load_imm(mc, sf, RV_A0, val->storage.v.imm); } else if (val->storage.kind == OPK_LOCAL || val->storage.kind == OPK_INDIRECT) { - u32 base_reg; - i32 base_off; - if (val->storage.kind == OPK_LOCAL) { - RvSlot* s = rv64_slot_get(a, val->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "rv64 ret: bad local slot"); - base_reg = RV_S0; - base_off = -(i32)s->off; - } else { - base_reg = val->storage.v.ind.base & 0x1fu; - base_off = val->storage.v.ind.ofs; - } const ABIArgInfo* ri2 = val->abi; u32 nir = 0, nfr = 0; for (u16 i = 0; i < (ri2 ? 
ri2->nparts : 0); ++i) { const ABIArgPart* pt = &ri2->parts[i]; - i32 off = base_off + (i32)pt->src_offset; if (pt->cls == ABI_CLASS_INT) { - rv64_emit32(mc, enc_int_load(pt->size, 0, RV_A0 + nir++, base_reg, off)); + Operand dst = {.kind = OPK_REG, .cls = RC_INT, .type = val->type}; + dst.v.reg = RV_A0 + nir++; + rv_load_abi_part(t, dst, val->storage, pt->src_offset, pt->size); } else if (pt->cls == ABI_CLASS_FP) { + Operand dst = {.kind = OPK_REG, .cls = RC_FP, .type = val->type}; u32 freg = 10u + nfr++; - if (pt->size == 8) rv64_emit32(mc, rv_fld(freg, base_reg, off)); - else rv64_emit32(mc, rv_flw(freg, base_reg, off)); + dst.v.reg = freg; + rv_load_abi_part(t, dst, val->storage, pt->src_offset, pt->size); } else { compiler_panic(t->c, a->loc, "rv64 ret: part cls %d unimpl", (int)pt->cls); diff --git a/test/parse/cases/6_7_2_12_long_double.skip b/test/parse/cases/6_7_2_12_long_double.skip @@ -1 +0,0 @@ -long double (binary128) literal/convert needs rt/lib/fp_tf wiring through cg diff --git a/test/parse/cases/i128_06_shifts_bitwise.c b/test/parse/cases/i128_06_shifts_bitwise.c @@ -4,8 +4,12 @@ int test_main(void) { u128 x = (u128)0xf0ULL << 68; u128 y = x >> 64; u128 z = (x | ((u128)0x55ULL << 4)) ^ ((u128)0x5ULL << 4); + u128 high_truth = ((u128)1 << 112) << 4; + _Bool high_bool = high_truth; if ((unsigned long long)y != 0xf00ULL) return 11; if ((unsigned long long)z != 0x500ULL) return 12; if ((unsigned long long)(z >> 64) != 0xf00ULL) return 13; + if (!high_truth) return 14; + if (!high_bool) return 15; return 41; } diff --git a/test/parse/cases/i128_13_signed_div_mod.c b/test/parse/cases/i128_13_signed_div_mod.c @@ -0,0 +1,31 @@ +typedef __int128 i128; + +int test_main(void) { + i128 a = -(((i128)1 << 90) + 123456789); + i128 b = ((i128)1 << 30) + 7; + i128 q = a / b; + i128 r = a % b; + + if (q != -(((i128)1 << 60) - (((i128)7 << 30) - 49))) return 11; + if (r != -123456446) return 12; + if (q * b + r != a) return 13; + if (r >= 0) return 14; + + b = 
-(((i128)1 << 33) + 5); + q = a / b; + r = a % b; + if (q != (((i128)1 << 57) - ((i128)80 << 20))) return 15; + if (r != -542887189) return 16; + if (q * b + r != a) return 17; + if (r >= 0) return 18; + + a = ((i128)1 << 90) + 123456789; + q = a / b; + r = a % b; + if (q != -(((i128)1 << 57) - ((i128)80 << 20))) return 19; + if (r != 542887189) return 20; + if (q * b + r != a) return 21; + if (r <= 0) return 22; + + return 61; +} diff --git a/test/parse/cases/i128_13_signed_div_mod.expected b/test/parse/cases/i128_13_signed_div_mod.expected @@ -0,0 +1 @@ +61 diff --git a/test/parse/cases/i128_14_arbitrary_mul.c b/test/parse/cases/i128_14_arbitrary_mul.c @@ -0,0 +1,22 @@ +typedef __int128 i128; +typedef unsigned __int128 u128; + +int test_main(void) { + u128 a = ((u128)0x123456789abcdef0ULL << 16) | 0x1357ULL; + u128 b = ((u128)0x0fedcba987654321ULL << 12) | 0x246ULL; + u128 p = a * b; + + if ((unsigned long long)p != 0x71407aa829ff67caULL) return 11; + if ((unsigned long long)(p >> 64) != 0x0ad77d7422601184ULL) return 12; + + i128 x = -(((i128)0x1234567 << 40) + 0x89abcdef); + i128 y = ((i128)0x13579 << 28) + 0x2468ace; + i128 z = x * y; + u128 uz = (u128)z; + + if (z >= 0) return 13; + if ((unsigned long long)uz != 0x324b79b4fd6373aeULL) return 14; + if ((unsigned long long)(uz >> 64) != 0xffffe9fe36571cf3ULL) return 15; + + return 73; +} diff --git a/test/parse/cases/i128_14_arbitrary_mul.expected b/test/parse/cases/i128_14_arbitrary_mul.expected @@ -0,0 +1 @@ +73 diff --git a/test/parse/cases/ldbl128_15_arbitrary_mul.c b/test/parse/cases/ldbl128_15_arbitrary_mul.c @@ -0,0 +1,15 @@ +int test_main(void) { + if (__LDBL_MANT_DIG__ != 113) return 0; + + long double a = 7.0L * 9.0L; + long double b = 13.0L * 11.0L; + long double c = 1.5L * 2.5L; + + if ((int)a != 63) return 11; + if ((int)b != 143) return 12; + if ((int)(a + b) != 206) return 13; + if ((int)c != 3) return 14; + if ((int)((c - 3.0L) * 4.0L) != 3) return 15; + + return 71; +} diff --git 
a/test/parse/cases/ldbl128_15_arbitrary_mul.expected b/test/parse/cases/ldbl128_15_arbitrary_mul.expected @@ -0,0 +1 @@ +71