Complete i128 and binary128 long double support - kit

commit bbe0c3e30b210857cf082638b7cc4ecfa2b3e022
parent 28e75424c3c6c828f561cb5a6f216235f1dd5ad7
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 19 May 2026 09:30:02 -0700

Complete i128 and binary128 long double support

Diffstat:
M doc/C11_LONG_DOUBLE_CHECKLIST.md  | 64 +++++++++++++++++++++++++++++++++++++++-------------------------
M lang/c/parse/cg_adapter.c  | 3 ++-
M lang/c/parse/parse_expr.c  | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M lang/c/parse/parse_init.c  | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M lang/c/parse/parse_priv.h  | 4 +++-
M lang/c/parse/parse_type.c  | 34 ++++++++++++++++++++++++++++++++--
M rt/lib/README.md  | 2 +-
M rt/lib/fp_tf/fp_tf.c  | 357 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M rt/lib/int64/int64.c  | 408 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M src/abi/abi_aapcs64.c  | 17 +++++++++++++++++
M src/abi/abi_rv64.c  | 3 ++-
M src/api/cg.c  | 325 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M src/arch/aa64/ops.c  | 12 +++++++++++-
M src/arch/rv64/alloc.c  | 5 +++--
M src/arch/rv64/emit.c  | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M src/arch/rv64/internal.h  | 4 ++--
M src/arch/rv64/ops.c  | 152 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
D test/parse/cases/6_7_2_12_long_double.skip  | 1 -
M test/parse/cases/i128_06_shifts_bitwise.c  | 4 ++++
A test/parse/cases/i128_13_signed_div_mod.c  | 31 +++++++++++++++++++++++++++++++
A test/parse/cases/i128_13_signed_div_mod.expected  | 1 +
A test/parse/cases/i128_14_arbitrary_mul.c  | 22 ++++++++++++++++++++++
A test/parse/cases/i128_14_arbitrary_mul.expected  | 1 +
A test/parse/cases/ldbl128_15_arbitrary_mul.c  | 15 +++++++++++++++
A test/parse/cases/ldbl128_15_arbitrary_mul.expected  | 1 +

25 files changed, 1493 insertions(+), 395 deletions(-)
diff --git a/doc/C11_LONG_DOUBLE_CHECKLIST.md b/doc/C11_LONG_DOUBLE_CHECKLIST.md
@@ -8,11 +8,11 @@ implementation pass it on the target that owns that format.
 
 ## Target profiles
 
-- [ ] AArch64 Linux: IEEE binary128 `long double`.
+- [x] AArch64 Linux: IEEE binary128 `long double`.
       ABI: passed and returned in SIMD/FP `q` registers when register slots are
       available. Arithmetic and conversions lower to compiler-rt `*tf*`
       helpers.
-- [ ] RV64 Linux LP64D: IEEE binary128 `long double`.
+- [x] RV64 Linux LP64D: IEEE binary128 `long double`.
       ABI: passed and returned as two integer XLEN eightbytes because FLEN is
       64. Arithmetic and conversions lower to compiler-rt `*tf*` helpers.
 - [ ] AArch64 Darwin: `long double == double`.
@@ -24,30 +24,30 @@ implementation pass it on the target that owns that format.
 
 ## Support target for the binary128 slice
 
-- [ ] Complete the 16-byte scalar `__int128` path before treating binary128 as
+- [x] Complete the 16-byte scalar `__int128` path before treating binary128 as
       green: layout, locals/globals, constants, arithmetic, shifts, compares,
       calls/returns, aggregate fields, unions, and static initialization.
-- [ ] Add a target long-double profile query used by both the frontend and CG:
+- [x] Add a target long-double profile query used by both the frontend and CG:
       format, storage size, alignment, macro values, and ABI classification.
-- [ ] Add a distinct CG type for binary128 `long double`; `TY_LDOUBLE` must not
+- [x] Add a distinct CG type for binary128 `long double`; `TY_LDOUBLE` must not
       map to `F64` on AArch64/RV64 Linux.
-- [ ] Emit target-correct `__LDBL_*` and `__DECIMAL_DIG__` predefined macros
+- [x] Emit target-correct `__LDBL_*` and `__DECIMAL_DIG__` predefined macros
       for binary128 targets.
-- [ ] Encode `L` floating constants as binary128 bytes without narrowing their
+- [x] Encode `L` floating constants as binary128 bytes without narrowing their
       storage type to `double`.
-- [ ] Support binary128 local/global storage, assignment, struct fields, and
+- [x] Support binary128 local/global storage, assignment, struct fields, and
       return values.
-- [ ] Lower binary128 arithmetic to runtime helpers:
+- [x] Lower binary128 arithmetic to runtime helpers:
       `__addtf3`, `__subtf3`, `__multf3`, and `__divtf3`.
-- [ ] Lower binary128 comparisons through compiler-rt compare helpers.
-- [ ] Lower integer, float, and double conversions through compiler-rt helpers:
+- [x] Lower binary128 comparisons through compiler-rt compare helpers.
+- [x] Lower integer, float, and double conversions through compiler-rt helpers:
       `__float*tf`, `__fix*tf*`, `__extend{s,d}ftf2`, and
       `__trunctf{s,d}f2`.
-- [ ] Teach AArch64 codegen to move 16-byte FP values through Q-register
+- [x] Teach AArch64 codegen to move 16-byte FP values through Q-register
       load/store/copy paths.
-- [ ] Teach RV64 ABI movement to pass/return binary128 values as two integer
+- [x] Teach RV64 ABI movement to pass/return binary128 values as two integer
       parts, backed by memory in CG.
-- [ ] Keep runtime linkage using the existing `rt/lib/fp_tf/fp_tf.c` and
+- [x] Keep runtime linkage using the existing `rt/lib/fp_tf/fp_tf.c` and
       `rt/lib/fp_ti/fp_ti.c` objects for the binary128 runtime variants.
 
 ## Red tests
@@ -69,28 +69,42 @@ x87 work can land later without hiding the binary128 regression signal.
 
 Coverage intent:
 
-- `i128_01` through `i128_12`: target layout/alignment, literal storage,
+- `i128_01` through `i128_14`: target layout/alignment, literal storage,
       add/sub carry, multiply high-half behavior, div/mod, shifts/bitwise
       operations, signed and unsigned compares, signed shifts/conversions,
       calls/returns, aggregate fields, union lane visibility, and global
-      initialization.
-- `ldbl128_01` through `ldbl128_14`: target macros/layout, literal decoding,
+      initialization, arbitrary signed div/mod, and arbitrary signed/unsigned
+      multiplication.
+- `ldbl128_01` through `ldbl128_15`: target macros/layout, literal decoding,
       arithmetic helpers, conversions, comparisons, calls/returns, struct and
       array storage, raw binary128 bits, globals, unary negation, stack
-      arguments, mixed arithmetic, and aggregate return.
+      arguments, mixed arithmetic, aggregate return, and arbitrary binary128
+      multiplication.
+
+Known remaining limits:
+
+- The binary128 support target is Linux AArch64/RV64. Darwin `long double`
+      target rules and x87 80-bit `long double` are still separate follow-up
+      targets.
+- Decimal `L` literal coverage currently exercises representable values and
+      raw canonical encodings; it does not yet prove full decimal-to-binary128
+      precision for non-representable literals.
+- ABI aggregate classification still covers the implemented scalar and simple
+      aggregate paths, not the full AArch64 HFA/HVA or every RV64 aggregate
+      flattening edge.
 
 ## Done criteria
 
-- [ ] `CFREE_TEST_ARCH=aa64 CFREE_TEST_FILTER=ldbl128 make test-parse` passes
+- [x] `CFREE_TEST_ARCH=aa64 CFREE_TEST_FILTER=ldbl128 make test-parse` passes
       with `CFREE_TEST_ALLOW_SKIP` unset.
-- [ ] `CFREE_TEST_ARCH=rv64 CFREE_TEST_FILTER=ldbl128 make test-parse` passes
+- [x] `CFREE_TEST_ARCH=rv64 CFREE_TEST_FILTER=ldbl128 make test-parse` passes
       with `CFREE_TEST_ALLOW_SKIP` unset.
-- [ ] `CFREE_TEST_ARCH=aa64 CFREE_TEST_FILTER=i128 make test-parse` passes
+- [x] `CFREE_TEST_ARCH=aa64 CFREE_TEST_FILTER=i128 make test-parse` passes
       with `CFREE_TEST_ALLOW_SKIP` unset.
-- [ ] `CFREE_TEST_ARCH=rv64 CFREE_TEST_FILTER=i128 make test-parse` passes
+- [x] `CFREE_TEST_ARCH=rv64 CFREE_TEST_FILTER=i128 make test-parse` passes
       with `CFREE_TEST_ALLOW_SKIP` unset.
-- [ ] `CFREE_TEST_FILTER=6_7_2_12_long_double make test-parse` passes on
+- [x] `CFREE_TEST_FILTER=6_7_2_12_long_double make test-parse` passes on
       AArch64 Linux and RV64 Linux without a `.skip` sidecar.
-- [ ] `make rt` still builds the default runtime archives.
-- [ ] `make test-rt-headers test-rt-runtime` stays green for the default
+- [x] `make rt` still builds the default runtime archives.
+- [x] `make test-rt-headers test-rt-runtime` stays green for the default
       runtime targets.
diff --git a/lang/c/parse/cg_adapter.c b/lang/c/parse/cg_adapter.c
@@ -422,7 +422,8 @@ void pcg_unop(Parser* p, UnOp op) {
 }
 
 void pcg_cmp(Parser* p, CmpOp op) {
-  if (op == CMP_LT_F || op == CMP_LE_F || op == CMP_GT_F || op == CMP_GE_F) {
+  if (op == CMP_LT_F || op == CMP_LE_F || op == CMP_GT_F || op == CMP_GE_F ||
+      ((op == CMP_EQ || op == CMP_NE) && pcg_type_is_fp(pcg_top_type(p)))) {
     if (pcg_emit_enabled(p)) cfree_cg_fp_cmp(p->cg, pcg_fp_cmp(op));
   } else {
     if (pcg_emit_enabled(p)) cfree_cg_int_cmp(p->cg, pcg_int_cmp(op));
diff --git a/lang/c/parse/parse_expr.c b/lang/c/parse/parse_expr.c
@@ -197,7 +197,7 @@ static const Type* int_literal_type(Parser* p, const Tok* t) {
   }
 }
 
-static double parse_float_literal(Parser* p, const Tok* t) {
+double parse_float_literal(Parser* p, const Tok* t) {
   size_t len = 0;
   const char* s = pool_str(p->pool, t->spelling, &len);
   size_t i = 0;
@@ -463,33 +463,112 @@ static u32 cint_bits(Parser* p, const Type* ty) {
   return sz * 8u;
 }
 
-static u64 cint_mask_for_bits(u32 bits) {
-  if (bits >= 64) return ~0ull;
-  return (1ull << bits) - 1ull;
-}
-
 static int cint_signed(Parser* p, const Type* ty) {
   if (!ty) return 1;
   return c_abi_type_info(p->abi, ty).signed_ != 0;
 }
 
-static CConstInt cint_make(Parser* p, const Type* ty, u64 bits) {
+static void cint_mask_to_bits(CConstInt* v, u32 bits) {
+  if (bits < 64) {
+    v->lo &= (1ull << bits) - 1ull;
+    v->hi = 0;
+  } else if (bits < 128) {
+    v->hi &= (1ull << (bits - 64u)) - 1ull;
+  }
+}
+
+static CConstInt cint_make_u64(Parser* p, const Type* ty, u64 bits) {
   CConstInt v;
   u32 nb;
   if (!ty) ty = ty_int(p);
   nb = cint_bits(p, ty);
   v.type = ty;
-  v.bits = bits & cint_mask_for_bits(nb);
-  if (ty->kind == TY_BOOL) v.bits = v.bits ? 1u : 0u;
+  v.lo = bits;
+  v.hi = 0;
+  cint_mask_to_bits(&v, nb);
+  if (ty->kind == TY_BOOL) {
+    v.lo = (v.lo || v.hi) ? 1u : 0u;
+    v.hi = 0;
+  }
   return v;
 }
 
+static CConstInt cint_make_pair(Parser* p, const Type* ty, u64 lo, u64 hi) {
+  CConstInt v;
+  if (!ty) ty = ty_int(p);
+  v.type = ty;
+  v.lo = lo;
+  v.hi = hi;
+  cint_mask_to_bits(&v, cint_bits(p, ty));
+  if (ty->kind == TY_BOOL) {
+    v.lo = (v.lo || v.hi) ? 1u : 0u;
+    v.hi = 0;
+  }
+  return v;
+}
+
+static int cint_nonzero(CConstInt v) { return v.lo != 0 || v.hi != 0; }
+
+static int cint_eq(CConstInt a, CConstInt b) {
+  return a.lo == b.lo && a.hi == b.hi;
+}
+
+static int cint_cmp_u(CConstInt a, CConstInt b) {
+  if (a.hi != b.hi) return a.hi < b.hi ? -1 : 1;
+  if (a.lo != b.lo) return a.lo < b.lo ? -1 : 1;
+  return 0;
+}
+
+static CConstInt cint_add(Parser* p, const Type* ty, CConstInt a, CConstInt b) {
+  u64 lo = a.lo + b.lo;
+  return cint_make_pair(p, ty, lo, a.hi + b.hi + (lo < a.lo));
+}
+
+static CConstInt cint_sub(Parser* p, const Type* ty, CConstInt a, CConstInt b) {
+  return cint_make_pair(p, ty, a.lo - b.lo, a.hi - b.hi - (a.lo < b.lo));
+}
+
+static CConstInt cint_shl(Parser* p, const Type* ty, CConstInt a, u32 sh) {
+  if (sh >= 128) return cint_make_u64(p, ty, 0);
+  if (sh == 0) return cint_make_pair(p, ty, a.lo, a.hi);
+  if (sh >= 64) return cint_make_pair(p, ty, 0, a.lo << (sh - 64u));
+  return cint_make_pair(p, ty, a.lo << sh, (a.hi << sh) | (a.lo >> (64u - sh)));
+}
+
+static CConstInt cint_shr_u(Parser* p, const Type* ty, CConstInt a, u32 sh) {
+  if (sh >= 128) return cint_make_u64(p, ty, 0);
+  if (sh == 0) return cint_make_pair(p, ty, a.lo, a.hi);
+  if (sh >= 64) return cint_make_pair(p, ty, a.hi >> (sh - 64u), 0);
+  return cint_make_pair(p, ty, (a.lo >> sh) | (a.hi << (64u - sh)), a.hi >> sh);
+}
+
+static CConstInt cint_neg(Parser* p, const Type* ty, CConstInt a) {
+  CConstInt zero = cint_make_u64(p, ty, 0);
+  return cint_sub(p, ty, zero, a);
+}
+
+static CConstInt cint_bnot(Parser* p, const Type* ty, CConstInt a) {
+  return cint_make_pair(p, ty, ~a.lo, ~a.hi);
+}
+
+static CConstInt cint_mul(Parser* p, const Type* ty, CConstInt a, CConstInt b) {
+  CConstInt r = cint_make_u64(p, ty, 0);
+  CConstInt x = a;
+  for (u32 i = 0; i < 128; ++i) {
+    if ((i < 64 ? (b.lo >> i) : (b.hi >> (i - 64u))) & 1ull)
+      r = cint_add(p, ty, r, x);
+    x = cint_shl(p, ty, x, 1);
+  }
+  return r;
+}
+
 i64 const_int_as_i64(Parser* p, CConstInt v) {
   u32 nb = cint_bits(p, v.type);
-  u64 mask = cint_mask_for_bits(nb);
-  u64 u = v.bits & mask;
+  u64 u = v.lo;
   if (cint_signed(p, v.type) && nb < 64) {
+    u64 mask = (1ull << nb) - 1ull;
     u64 sign = 1ull << (nb - 1u);
+    u &= mask;
     if (u & sign) u |= ~mask;
   }
   return (i64)u;
@@ -500,7 +579,7 @@ static CConstInt cint_cast(Parser* p, CConstInt v, const Type* ty) {
   if (!dst || !type_is_int(dst)) {
     perr(p, "integer constant expression cast requires integer type");
   }
-  return cint_make(p, dst, v.bits);
+  return cint_make_pair(p, dst, v.lo, v.hi);
 }
 
 static u32 cint_rank(const Type* ty) {
@@ -583,16 +662,16 @@ static const Type* cint_common_type(Parser* p, const Type* a, const Type* b) {
 }
 
 static CConstInt cint_convert(Parser* p, CConstInt v, const Type* ty) {
-  return cint_make(p, ty, v.bits);
+  return cint_make_pair(p, ty, v.lo, v.hi);
 }
 
 static int cint_truth(Parser* p, CConstInt v) {
   (void)p;
-  return v.bits != 0;
+  return cint_nonzero(v);
 }
 
 static CConstInt cint_bool(Parser* p, int truth) {
-  return cint_make(p, ty_int(p), truth ? 1u : 0u);
+  return cint_make_u64(p, ty_int(p), truth ? 1u : 0u);
 }
 
 static CConstInt cexpr_mul(Parser* p, SrcLoc loc) {
@@ -615,17 +694,18 @@ static CConstInt cexpr_mul(Parser* p, SrcLoc loc) {
     v = cint_convert(p, v, ct);
     r = cint_convert(p, r, ct);
     if (op == '*') {
-      v = cint_make(p, ct, v.bits * r.bits);
+      v = cint_mul(p, ct, v, r);
     } else {
-      if (r.bits == 0)
+      if (!cint_nonzero(r))
         compiler_panic(p->c, loc, op == '/' ? "division by zero in constant"
                                             : "modulo by zero in constant");
       if (cint_signed(p, ct)) {
         i64 lv = const_int_as_i64(p, v);
         i64 rv = const_int_as_i64(p, r);
-        v = cint_make(p, ct, op == '/' ? (u64)(lv / rv) : (u64)(lv % rv));
+        v = cint_make_u64(p, ct, op == '/' ? (u64)(lv / rv) : (u64)(lv % rv));
       } else {
-        v = cint_make(p, ct, op == '/' ? v.bits / r.bits : v.bits % r.bits);
+        v = cint_make_u64(p, ct,
+                          op == '/' ? v.lo / r.lo : v.lo % r.lo);
       }
     }
   }
@@ -648,7 +728,7 @@ static CConstInt cexpr_add(Parser* p, SrcLoc loc) {
     ct = cint_common_type(p, v.type, r.type);
     v = cint_convert(p, v, ct);
     r = cint_convert(p, r, ct);
-    v = cint_make(p, ct, sub ? v.bits - r.bits : v.bits + r.bits);
+    v = sub ? cint_sub(p, ct, v, r) : cint_add(p, ct, v, r);
   }
   return v;
 }
@@ -675,11 +755,11 @@ static CConstInt cexpr_shift(Parser* p, SrcLoc loc) {
     if (left) {
       if (cint_signed(p, vt) && const_int_as_i64(p, v) < 0)
         perr(p, "left shift of negative value in constant expression");
-      v = cint_make(p, vt, v.bits << (u32)sh);
+      v = cint_shl(p, vt, v, (u32)sh);
     } else if (cint_signed(p, vt)) {
-      v = cint_make(p, vt, (u64)(const_int_as_i64(p, v) >> (u32)sh));
+      v = cint_make_u64(p, vt, (u64)(const_int_as_i64(p, v) >> (u32)sh));
     } else {
-      v = cint_make(p, vt, v.bits >> (u32)sh);
+      v = cint_shr_u(p, vt, v, (u32)sh);
     }
   }
   return v;
@@ -716,10 +796,11 @@ static CConstInt cexpr_rel(Parser* p, SrcLoc loc) {
             : op == '<'  ? lv < rv
                          : lv > rv;
     } else {
-      res = op == P_LE ? v.bits <= r.bits
-            : op == P_GE ? v.bits >= r.bits
-            : op == '<'  ? v.bits < r.bits
-                         : v.bits > r.bits;
+      int cmp = cint_cmp_u(v, r);
+      res = op == P_LE ? cmp <= 0
+            : op == P_GE ? cmp >= 0
+            : op == '<'  ? cmp < 0
+                         : cmp > 0;
     }
     v = cint_bool(p, res);
   }
@@ -742,7 +823,7 @@ static CConstInt cexpr_eq(Parser* p, SrcLoc loc) {
     ct = cint_common_type(p, v.type, r.type);
     v = cint_convert(p, v, ct);
     r = cint_convert(p, r, ct);
-    v = cint_bool(p, ne ? v.bits != r.bits : v.bits == r.bits);
+    v = cint_bool(p, ne ? !cint_eq(v, r) : cint_eq(v, r));
   }
   return v;
 }
@@ -756,7 +837,7 @@ static CConstInt cexpr_band(Parser* p, SrcLoc loc) {
     ct = cint_common_type(p, v.type, r.type);
     v = cint_convert(p, v, ct);
     r = cint_convert(p, r, ct);
-    v = cint_make(p, ct, v.bits & r.bits);
+    v = cint_make_pair(p, ct, v.lo & r.lo, v.hi & r.hi);
   }
   return v;
 }
@@ -767,7 +848,7 @@ static CConstInt cexpr_bxor(Parser* p, SrcLoc loc) {
     const Type* ct = cint_common_type(p, v.type, r.type);
     v = cint_convert(p, v, ct);
     r = cint_convert(p, r, ct);
-    v = cint_make(p, ct, v.bits ^ r.bits);
+    v = cint_make_pair(p, ct, v.lo ^ r.lo, v.hi ^ r.hi);
   }
   return v;
 }
@@ -781,7 +862,7 @@ static CConstInt cexpr_bor(Parser* p, SrcLoc loc) {
     ct = cint_common_type(p, v.type, r.type);
     v = cint_convert(p, v, ct);
     r = cint_convert(p, r, ct);
-    v = cint_make(p, ct, v.bits | r.bits);
+    v = cint_make_pair(p, ct, v.lo | r.lo, v.hi | r.hi);
   }
   return v;
 }
@@ -823,13 +904,13 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) {
     CConstInt v = cexpr_unary(p, loc);
     const Type* pt = cint_promote_type(p, v.type);
     v = cint_convert(p, v, pt);
-    return cint_make(p, pt, (u64)(-const_int_as_i64(p, v)));
+    return cint_neg(p, pt, v);
   }
   if (accept_punct(p, '~')) {
     CConstInt v = cexpr_unary(p, loc);
     const Type* pt = cint_promote_type(p, v.type);
     v = cint_convert(p, v, pt);
-    return cint_make(p, pt, ~v.bits);
+    return cint_bnot(p, pt, v);
   }
   if (accept_punct(p, '!')) return cint_bool(p, !cint_truth(p, cexpr_unary(p, loc)));
   if (accept_kw(p, KW_SIZEOF)) {
@@ -841,7 +922,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) {
           const Type* t = parse_type_name(p);
           expect_punct(p, ')', "')' after sizeof type-name");
           require_sizeof_type(p, t);
-          return cint_make(p, ty_size_t(p), c_abi_sizeof(p->abi, t));
+          return cint_make_u64(p, ty_size_t(p), c_abi_sizeof(p->abi, t));
         }
       }
     }
@@ -852,7 +933,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) {
       require_sizeof_type(p, ty);
       i64 sz = (i64)c_abi_sizeof(p->abi, ty);
       cg_drop(p->cg);
-      return cint_make(p, ty_size_t(p), (u64)sz);
+      return cint_make_u64(p, ty_size_t(p), (u64)sz);
     }
   }
   if (accept_kw(p, KW_ALIGNOF)) {
@@ -863,7 +944,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) {
         {
           const Type* t = parse_type_name(p);
           expect_punct(p, ')', "')' after _Alignof type-name");
-          return cint_make(p, ty_size_t(p), c_abi_alignof(p->abi, t));
+          return cint_make_u64(p, ty_size_t(p), c_abi_alignof(p->abi, t));
         }
       }
     }
@@ -872,7 +953,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) {
       const Type* ty = cg_top_type(p->cg);
       i64 al = (i64)c_abi_alignof(p->abi, ty);
       cg_drop(p->cg);
-      return cint_make(p, ty_size_t(p), (u64)al);
+      return cint_make_u64(p, ty_size_t(p), (u64)al);
     }
   }
   if (accept_punct(p, '(')) {
@@ -888,7 +969,7 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) {
           }
           fv = parse_float_literal(p, &p->cur);
           advance(p);
-          return cint_make(p, tu, (u64)(i64)fv);
+          return cint_make_u64(p, tu, (u64)(i64)fv);
         }
         CConstInt v = cexpr_unary(p, loc);
         return cint_cast(p, v, t);
@@ -904,12 +985,12 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) {
     i64 v = parse_int_literal(p, &p->cur);
     const Type* ty = int_literal_type(p, &p->cur);
     advance(p);
-    return cint_make(p, ty, (u64)v);
+    return cint_make_u64(p, ty, (u64)v);
   }
   if (p->cur.kind == TOK_CHR) {
     i64 v = decode_char_literal(p, &p->cur);
     advance(p);
-    return cint_make(p, ty_int(p), (u64)v);
+    return cint_make_u64(p, ty_int(p), (u64)v);
   }
   if (p->cur.kind == TOK_IDENT) {
     Sym name = p->cur.v.ident;
@@ -922,13 +1003,13 @@ static CConstInt cexpr_unary(Parser* p, SrcLoc loc) {
       expect_punct(p, ',', "',' in __builtin_offsetof");
       (void)offsetof_designator(p, root, &off);
       expect_punct(p, ')', "')' after __builtin_offsetof");
-      return cint_make(p, ty_size_t(p), off);
+      return cint_make_u64(p, ty_size_t(p), off);
     }
     {
       SymEntry* e = scope_lookup(p, name);
       if (e && e->kind == SEK_ENUM_CST) {
         advance(p);
-        return cint_make(p, e->type ? e->type : ty_int(p), (u64)e->v.enum_value);
+        return cint_make_u64(p, e->type ? e->type : ty_int(p), (u64)e->v.enum_value);
       }
     }
     compiler_panic(p->c, loc, "non-constant identifier in constant expression");
@@ -2552,6 +2633,8 @@ static void parse_shift(Parser* p) {
     }
     advance(p);
     to_rvalue(p);
+    if (bop == BO_SHR_S && !c_abi_type_info(p->abi, cg_top_type(p->cg)).signed_)
+      bop = BO_SHR_U;
     parse_add(p);
     to_rvalue(p);
     if (!type_is_int(cg_top2_type(p->cg)) || !type_is_int(cg_top_type(p->cg))) {
@@ -2944,6 +3027,8 @@ void parse_assign_expr(Parser* p) {
   }
   advance(p);
   const Type* lhs = cg_top_type(p->cg);
+  if (compound == BO_SHR_S && !c_abi_type_info(p->abi, lhs).signed_)
+    compound = BO_SHR_U;
   {
     if (lhs && (lhs->qual & Q_CONST)) {
       perr(p, "assignment to const-qualified object");
@@ -2980,6 +3065,7 @@ void parse_assign_expr(Parser* p) {
       case BO_XOR: op = '^'; break;
       case BO_SHL: op = '<'; break;
       case BO_SHR_S: op = '>'; break;
+      case BO_SHR_U: op = '>'; break;
       default: op = 0; break;
     }
     CSemCheck chk =
diff --git a/lang/c/parse/parse_init.c b/lang/c/parse/parse_init.c
@@ -560,6 +560,98 @@ static void encode_uint_le(u8* dst, u32 size, u64 v) {
   }
 }
 
+static void encode_uint128_le(u8* dst, u32 size, u64 lo, u64 hi) {
+  if (size > 16) size = 16;
+  for (u32 i = 0; i < size; ++i) {
+    u64 lane = i < 8u ? lo : hi;
+    dst[i] = (u8)((lane >> (8u * (i & 7u))) & 0xffu);
+  }
+}
+
+static void encode_binary128_from_double_le(u8 out[16], double value) {
+  union {
+    double d;
+    u64 u;
+  } in;
+  u64 lo = 0;
+  u64 hi = 0;
+  u64 frac;
+  u32 sign;
+  u32 exp;
+  in.d = value;
+  sign = (u32)(in.u >> 63);
+  exp = (u32)((in.u >> 52) & 0x7ffu);
+  frac = in.u & 0x000fffffffffffffull;
+  if (sign)
+    hi |= 1ull << 63;
+  if (exp == 0x7ffu) {
+    hi |= (u64)0x7fffu << 48;
+    if (frac) {
+      lo |= (frac & 0xfu) << 60;
+      hi |= frac >> 4;
+      hi |= 1ull << 47;
+    }
+  } else if (exp != 0 || frac != 0) {
+    i32 e;
+    u64 sig;
+    if (exp == 0) {
+      e = -1022;
+      sig = frac;
+      while ((sig & (1ull << 52)) == 0) {
+        sig <<= 1;
+        --e;
+      }
+      frac = sig & 0x000fffffffffffffull;
+    } else {
+      e = (i32)exp - 1023;
+    }
+    hi |= (u64)(u32)(e + 16383) << 48;
+    lo |= (frac & 0xfu) << 60;
+    hi |= frac >> 4;
+  }
+  encode_uint128_le(out, 16, lo, hi);
+}
+
+static int try_parse_static_float(Parser* p, u8* dst, u32 size,
+                                  const Type* ty) {
+  const Type* uty = type_unqual(p->pool, ty);
+  double value;
+  if (!uty ||
+      (uty->kind != TY_FLOAT && uty->kind != TY_DOUBLE &&
+       uty->kind != TY_LDOUBLE)) {
+    return 0;
+  }
+  if (p->cur.kind != TOK_FLT && p->cur.kind != TOK_NUM)
+    perr(p, "expected floating constant expression");
+  value = p->cur.kind == TOK_FLT ? parse_float_literal(p, &p->cur)
+                                 : (double)parse_int_literal(p, &p->cur);
+  advance(p);
+  if (uty->kind == TY_FLOAT && size == 4u) {
+    union {
+      float f;
+      u8 b[4];
+    } u;
+    u.f = (float)value;
+    memcpy(dst, u.b, 4);
+    return 1;
+  }
+  if ((uty->kind == TY_DOUBLE || uty->kind == TY_LDOUBLE) && size == 8u) {
+    union {
+      double d;
+      u8 b[8];
+    } u;
+    u.d = value;
+    memcpy(dst, u.b, 8);
+    return 1;
+  }
+  if (uty->kind == TY_LDOUBLE && size == 16u) {
+    encode_binary128_from_double_le(dst, value);
+    return 1;
+  }
+  perr(p, "unsupported static floating initializer type");
+  return 0;
+}
+
 /* Encode a string literal at *buf+offset for a char-array sub-object. */
 static void parse_static_string_at(Parser* p, u8* buf, u32 buflen, u32 offset,
                                    u32 count) {
@@ -608,14 +700,24 @@ typedef struct CStaticConst {
   i64 addend;
 } CStaticConst;
 
-static u64 int_bits_for_type(Parser* p, CConstInt v, const Type* ty) {
+static CConstInt int_bits_for_type(Parser* p, CConstInt v, const Type* ty) {
   u32 sz = c_abi_sizeof(p->abi, ty);
-  u64 bits = v.bits;
+  v.type = ty;
   if (sz < 8u) {
-    bits &= (1ull << (sz * 8u)) - 1ull;
+    u32 bits = sz * 8u;
+    v.lo &= bits ? ((1ull << bits) - 1ull) : 0;
+    v.hi = 0;
+  } else if (sz == 8u) {
+    v.hi = 0;
+  } else if (sz < 16u) {
+    u32 hi_bits = sz * 8u - 64u;
+    v.hi &= hi_bits ? ((1ull << hi_bits) - 1ull) : 0;
+  }
+  if (ty && ty->kind == TY_BOOL) {
+    v.lo = (v.lo || v.hi) ? 1u : 0u;
+    v.hi = 0;
   }
-  if (ty && ty->kind == TY_BOOL) bits = bits ? 1u : 0u;
-  return bits;
+  return v;
 }
 
 static void check_static_integer_initializer_range(Parser* p, const Type* ty,
@@ -635,7 +737,7 @@ static void check_static_integer_initializer_range(Parser* p, const Type* ty,
       }
     } else {
       u64 maxu = (u64)maxv;
-      if (v.bits > maxu) {
+      if (v.hi != 0 || v.lo > maxu) {
         perr(p, "initializer value overflows destination type");
       }
     }
@@ -793,7 +895,7 @@ static void parse_static_bitfield_at(Parser* p, u8* buf, u32 buflen,
   ones = width >= 64u ? ~(u64)0 : (((u64)1 << width) - 1u);
   mask = ones << lsb;
   cur = decode_uint_le(buf + storage_off, storage_size);
-  val = (int_bits_for_type(p, parsed.int_value, field_ty) & ones) << lsb;
+  val = (int_bits_for_type(p, parsed.int_value, field_ty).lo & ones) << lsb;
   cur = (cur & ~mask) | val;
   encode_uint_le(buf + storage_off, storage_size, cur);
 }
@@ -915,13 +1017,23 @@ void parse_static_init_at(Parser* p, u8* buf, u32 buflen, u32 offset,
     u32 sz = c_abi_sizeof(p->abi, ty);
     CStaticConst cv;
     if (offset + sz > buflen) perr(p, "initializer overflows object");
+    if (try_parse_static_float(p, buf + offset, sz, ty)) {
+      if (had_brace) {
+        accept_punct(p, ',');
+        expect_punct(p, '}', "'}' after scalar initializer");
+      }
+      return;
+    }
     cv = parse_static_const(p, ty, cloc);
     if (cv.kind == C_STATIC_CONST_ADDR) {
       srl_push(p, offset, sz, cv.target, cv.addend);
     } else if (cv.kind == C_STATIC_CONST_NULL_PTR) {
       encode_int_le(buf + offset, sz, 0);
     } else {
-      encode_uint_le(buf + offset, sz, int_bits_for_type(p, cv.int_value, ty));
+      {
+        CConstInt bits = int_bits_for_type(p, cv.int_value, ty);
+        encode_uint128_le(buf + offset, sz, bits.lo, bits.hi);
+      }
     }
     if (had_brace) {
       accept_punct(p, ',');
diff --git a/lang/c/parse/parse_priv.h b/lang/c/parse/parse_priv.h
@@ -419,12 +419,14 @@ void parse_cond_expr(Parser* p);
 void parse_unary(Parser* p);
 typedef struct CConstInt {
   const Type* type;
-  u64 bits;
+  u64 lo;
+  u64 hi;
 } CConstInt;
 CConstInt eval_const_int_typed(Parser* p, SrcLoc loc);
 i64 eval_const_int(Parser* p, SrcLoc loc);
 i64 const_int_as_i64(Parser* p, CConstInt v);
 i64 parse_int_literal(Parser* p, const Tok* t);
+double parse_float_literal(Parser* p, const Tok* t);
 i64 decode_char_literal(Parser* p, const Tok* t);
 u8* decode_string_literal(Parser* p, const Tok* t, size_t* nlen_out);
 void to_rvalue(Parser* p);
diff --git a/lang/c/parse/parse_type.c b/lang/c/parse/parse_type.c
@@ -59,6 +59,9 @@ static const struct {
 
 static SrcLoc tok_loc(const Tok* t) { return t->loc; }
 
+static void attr_canon_range(const char* s, size_t len, const char** out_p,
+                             size_t* out_len);
+
 static int accept_kw(Parser* p, CKw k) {
   if (is_kw(p, &p->cur, k)) {
     advance(p);
@@ -67,6 +70,30 @@ static int accept_kw(Parser* p, CKw k) {
   return 0;
 }
 
+static int attr_sym_canon_eq(Parser* p, Sym sym, const char* want) {
+  size_t len = 0;
+  const char* s = pool_str(p->pool, sym, &len);
+  const char* cs;
+  size_t clen;
+  size_t wlen = strlen(want);
+  if (!s) return 0;
+  attr_canon_range(s, len, &cs, &clen);
+  return clen == wlen && memcmp(cs, want, wlen) == 0;
+}
+
+static const Type* attrs_apply_type_mode(Parser* p, const Type* base,
+                                         const Attr* attrs) {
+  for (const Attr* a = attrs; a; a = a->next) {
+    if (a->kind != ATTR_MODE || a->nargs == 0) continue;
+    if (attr_sym_canon_eq(p, a->v.sym, "TI")) {
+      const Type* u = type_unqual(p->pool, base);
+      int is_unsigned = u && type_is_int(u) && c_abi_type_info(p->abi, u).signed_ == 0;
+      return type_prim(p->pool, is_unsigned ? TY_UINT128 : TY_INT128);
+    }
+  }
+  return base;
+}
+
 static CKw ident_kw(const Parser* p, Sym name) {
   return ident_kw_inline(p, name);
 }
@@ -627,6 +654,7 @@ int parse_decl_specs(Parser* p, DeclSpecs* out) {
         out->type = ty_int(p);
       }
     }
+    out->type = attrs_apply_type_mode(p, out->type, out->attrs);
     if (out->type && out->quals) {
       out->type = type_qualified(p->pool, out->type,
                                  (u16)(out->type->qual | out->quals));
@@ -1134,6 +1162,7 @@ const Type* parse_declarator_full(Parser* p, const Type* base,
 const Type* parse_declarator_full_ex(Parser* p, const Type* base,
                                      int allow_abstract, Sym* name_out,
                                      SrcLoc* loc_out, Attr** attrs_out) {
+  Attr* local_attrs = NULL;
   base = parse_pointer_layer(p, base);
 
   Sym name = 0;
@@ -1215,7 +1244,7 @@ const Type* parse_declarator_full_ex(Parser* p, const Type* base,
     if (attrs_out)
       parse_attrs_into(p, attrs_out);
     else
-      parse_and_discard_attributes(p);
+      parse_attrs_into(p, &local_attrs);
   }
 
   DeclSuffix suffs[8];
@@ -1227,9 +1256,10 @@ const Type* parse_declarator_full_ex(Parser* p, const Type* base,
       if (attrs_out)
         parse_attrs_into(p, attrs_out);
       else
-        parse_and_discard_attributes(p);
+        parse_attrs_into(p, &local_attrs);
     }
   }
+  base = attrs_apply_type_mode(p, base, attrs_out ? *attrs_out : local_attrs);
   if (nsuffs == 8 && (is_punct(&p->cur, '[') || is_punct(&p->cur, '('))) {
     perr(p, "too many declarator suffixes (raise the cap if needed)");
   }
diff --git a/rt/lib/README.md b/rt/lib/README.md
@@ -22,7 +22,7 @@ hand-written `mem/mem.c` is 0BSD; relicense as desired.
 | -------------------------- | ----------------------------------------------------------- | --------------------------------------------------- |
 | `int/int.c`                | Integer helpers needed on every target                      | All                                                 |
 | `int32/int32.c`            | 64-bit ops synthesized from 32-bit                          | ILP32 only                                          |
-| `int64/int64.c`            | 128-bit ops via `__int128`                                  | LP64 / LLP64 only                                   |
+| `int64/int64.c`            | 128-bit ops implemented on explicit 64-bit lanes            | LP64 / LLP64 only                                   |
 | `fp/fp.c`                  | Soft-float `sf` (binary32) + `df` (binary64) + sf↔df + `fp_mode` | FPU-less (RV{32,64}I, ARM softfp, WASM)         |
 | `fp_tf/fp_tf.c`            | Soft-float `tf` (binary128) + sf↔tf + df↔tf + i128↔tf       | Targets with binary128 long double (e.g. aarch64 `-mlong-double-128`) |
 | `fp_ti/fp_ti.c`            | `__int128` ↔ sf/df + sf/df → ti fix                         | LP64 / LLP64 + soft-float                           |
diff --git a/rt/lib/fp_tf/fp_tf.c b/rt/lib/fp_tf/fp_tf.c
@@ -78,23 +78,253 @@ COMPILER_RT_ABI fp_t __subtf3(fp_t a, fp_t b) {
 // ---- multf3.c ----
 #define QUAD_PRECISION
 #include "fp_lib.h"
-#include "fp_mul_impl.inc"
 
-COMPILER_RT_ABI fp_t __multf3(fp_t a, fp_t b) { return __mulXf3__(a, b); }
+typedef struct {
+  du_int limb[4];
+} cfree_tf_u256;
+
+static int cfree_tf_rep_bit(rep_t value, int bit) {
+  return ((value >> (unsigned)bit) & 1) != 0;
+}
+
+static int cfree_tf_u256_bit(const cfree_tf_u256* value, int bit) {
+  if (bit < 0 || bit >= 256) return 0;
+  return ((value->limb[bit / 64] >> (unsigned)(bit % 64)) & 1u) != 0;
+}
+
+static int cfree_tf_u256_any_below(const cfree_tf_u256* value, int bit) {
+  int full;
+  int rem;
+  if (bit <= 0) return 0;
+  if (bit > 256) bit = 256;
+  full = bit / 64;
+  rem = bit % 64;
+  for (int i = 0; i < full; ++i) {
+    if (value->limb[i]) return 1;
+  }
+  if (rem) {
+    const du_int mask = ((du_int)1 << (unsigned)rem) - 1u;
+    if (value->limb[full] & mask) return 1;
+  }
+  return 0;
+}
+
+static void cfree_tf_u256_add_limb(cfree_tf_u256* value, int index,
+                                   du_int addend) {
+  du_int old;
+  if (!addend || index >= 4) return;
+  old = value->limb[index];
+  value->limb[index] = old + addend;
+  if (value->limb[index] >= old) return;
+  for (++index; index < 4; ++index) {
+    old = value->limb[index];
+    value->limb[index] = old + 1u;
+    if (value->limb[index] != 0) return;
+  }
+}
+
+static void cfree_tf_u256_add_shifted_sig(cfree_tf_u256* product, rep_t sig,
+                                          int shift) {
+  const du_int lo = (du_int)sig;
+  const du_int hi = (du_int)(sig >> 64);
+  const int index = shift / 64;
+  const int bits = shift % 64;
+  if (bits == 0) {
+    cfree_tf_u256_add_limb(product, index, lo);
+    cfree_tf_u256_add_limb(product, index + 1, hi);
+  } else {
+    cfree_tf_u256_add_limb(product, index, lo << (unsigned)bits);
+    cfree_tf_u256_add_limb(
+        product, index + 1,
+        (lo >> (unsigned)(64 - bits)) | (hi << (unsigned)bits));
+    cfree_tf_u256_add_limb(product, index + 2,
+                           hi >> (unsigned)(64 - bits));
+  }
+}
+
+static cfree_tf_u256 cfree_tf_sig_product(rep_t a, rep_t b) {
+  cfree_tf_u256 product = {{0, 0, 0, 0}};
+  for (int bit = 0; bit <= significandBits; ++bit) {
+    if (cfree_tf_rep_bit(b, bit))
+      cfree_tf_u256_add_shifted_sig(&product, a, bit);
+  }
+  return product;
+}
+
+static rep_t cfree_tf_u256_extract_rounded(const cfree_tf_u256* product,
+                                           int shift) {
+  rep_t result = 0;
+  for (int bit = 0; bit <= significandBits; ++bit) {
+    if (cfree_tf_u256_bit(product, shift + bit))
+      result |= (rep_t)1 << (unsigned)bit;
+  }
+  if (cfree_tf_u256_bit(product, shift - 1) &&
+      (cfree_tf_u256_any_below(product, shift - 1) || (result & 1)))
+    ++result;
+  return result;
+}
+
+COMPILER_RT_ABI fp_t __multf3(fp_t a, fp_t b) {
+  const rep_t aRep = toRep(a);
+  const rep_t bRep = toRep(b);
+  const rep_t aAbs = aRep & absMask;
+  const rep_t bAbs = bRep & absMask;
+  const rep_t productSign = (aRep ^ bRep) & signBit;
+  int aExponent = (int)((aAbs >> significandBits) & maxExponent);
+  int bExponent = (int)((bAbs >> significandBits) & maxExponent);
+  int productExponent;
+  int productTop;
+  int shift;
+  rep_t aSignificand = aAbs & significandMask;
+  rep_t bSignificand = bAbs & significandMask;
+  cfree_tf_u256 product;
+  rep_t resultSignificand;
+
+  if (aAbs > infRep) return fromRep(aRep | quietBit);
+  if (bAbs > infRep) return fromRep(bRep | quietBit);
+  if (aAbs == infRep) {
+    if (bAbs) return fromRep(infRep | productSign);
+    return fromRep(qnanRep);
+  }
+  if (bAbs == infRep) {
+    if (aAbs) return fromRep(infRep | productSign);
+    return fromRep(qnanRep);
+  }
+  if (!aAbs || !bAbs) return fromRep(productSign);
+
+  if (aExponent == 0)
+    aExponent = normalize(&aSignificand);
+  else
+    aSignificand |= implicitBit;
+  if (bExponent == 0)
+    bExponent = normalize(&bSignificand);
+  else
+    bSignificand |= implicitBit;
+
+  product = cfree_tf_sig_product(aSignificand, bSignificand);
+  productTop = cfree_tf_u256_bit(&product, 225) ? 225 : 224;
+  productExponent = aExponent + bExponent - exponentBias;
+  if (productTop == 225) ++productExponent;
+
+  if (productExponent >= maxExponent) return fromRep(infRep | productSign);
+
+  shift = productTop - significandBits;
+  if (productExponent <= 0) {
+    shift += 1 - productExponent;
+    productExponent = 0;
+  }
+
+  resultSignificand = cfree_tf_u256_extract_rounded(&product, shift);
+  if (resultSignificand & ((rep_t)1 << (significandBits + 1))) {
+    resultSignificand >>= 1;
+    ++productExponent;
+  }
+  if (productExponent == 0 && (resultSignificand & implicitBit))
+    productExponent = 1;
+  if (productExponent >= maxExponent) return fromRep(infRep | productSign);
+
+  return fromRep(productSign | ((rep_t)productExponent << significandBits) |
+                 (resultSignificand & significandMask));
+}
 
 // ---- divtf3.c ----
 #define QUAD_PRECISION
 #include "fp_lib.h"
+#include "fp_mode.h"
+
+COMPILER_RT_ABI fp_t __divtf3(fp_t a, fp_t b) {
+  const rep_t aRep = toRep(a);
+  const rep_t bRep = toRep(b);
+  const rep_t aAbs = aRep & absMask;
+  const rep_t bAbs = bRep & absMask;
+  const rep_t quotientSign = (aRep ^ bRep) & signBit;
+  int aExponent = (int)((aAbs >> significandBits) & maxExponent);
+  int bExponent = (int)((bAbs >> significandBits) & maxExponent);
+  rep_t aSignificand = aAbs & significandMask;
+  rep_t bSignificand = bAbs & significandMask;
+  rep_t quotient = 0;
+  rep_t remainder;
+  int writtenExponent;
+
+  if (aAbs > infRep) return fromRep(aRep | quietBit);
+  if (bAbs > infRep) return fromRep(bRep | quietBit);
+  if (aAbs == infRep) {
+    if (bAbs == infRep) return fromRep(qnanRep);
+    return fromRep(infRep | quotientSign);
+  }
+  if (bAbs == infRep) return fromRep(quotientSign);
+  if (!aAbs) {
+    if (!bAbs) return fromRep(qnanRep);
+    return fromRep(quotientSign);
+  }
+  if (!bAbs) return fromRep(infRep | quotientSign);
+
+  if (aExponent == 0)
+    aExponent = normalize(&aSignificand);
+  else
+    aSignificand |= implicitBit;
+  if (bExponent == 0)
+    bExponent = normalize(&bSignificand);
+  else
+    bSignificand |= implicitBit;
+
+  writtenExponent = aExponent - bExponent + exponentBias;
+  if (aSignificand < bSignificand) {
+    aSignificand <<= 1;
+    writtenExponent -= 1;
+  }
 
-#define NUMBER_OF_HALF_ITERATIONS 4
-#define NUMBER_OF_FULL_ITERATIONS 1
-
-#include "fp_div_impl.inc"
+  remainder = aSignificand;
+  for (int i = 0; i < significandBits + 4; ++i) {
+    quotient <<= 1;
+    if (remainder >= bSignificand) {
+      quotient |= 1;
+      remainder -= bSignificand;
+    }
+    if (i != significandBits + 3)
+      remainder <<= 1;
+  }
+  if (remainder)
+    quotient |= 1;
+
+  if (writtenExponent >= maxExponent)
+    return fromRep(infRep | quotientSign);
+  if (writtenExponent <= 0) {
+    const int shift = 1 - writtenExponent;
+    if (shift >= typeWidth)
+      return fromRep(quotientSign);
+    if (shift > 0) {
+      const bool sticky = (quotient << (typeWidth - shift)) != 0;
+      quotient = (quotient >> shift) | sticky;
+    }
+    writtenExponent = 0;
+  }
 
-COMPILER_RT_ABI fp_t __divtf3(fp_t a, fp_t b) { return __divXf3__(a, b); }
+  const int roundGuardSticky = quotient & 0x7;
+  rep_t absResult = (quotient >> 3) & significandMask;
+  absResult |= (rep_t)writtenExponent << significandBits;
+
+  switch (__fe_getround()) {
+  case CRT_FE_TONEAREST:
+    if (roundGuardSticky > 0x4)
+      absResult++;
+    if (roundGuardSticky == 0x4)
+      absResult += absResult & 1;
+    break;
+  case CRT_FE_DOWNWARD:
+    if (quotientSign && roundGuardSticky) absResult++;
+    break;
+  case CRT_FE_UPWARD:
+    if (!quotientSign && roundGuardSticky) absResult++;
+    break;
+  case CRT_FE_TOWARDZERO:
+    break;
+  }
+  if (roundGuardSticky)
+    __fe_raise_inexact();
+  return fromRep(absResult | quotientSign);
+}
 
-#undef NUMBER_OF_HALF_ITERATIONS
-#undef NUMBER_OF_FULL_ITERATIONS
 // ---- comparetf2.c ----
 #define QUAD_PRECISION
 #include "fp_compare_impl.inc"
@@ -114,32 +344,43 @@ COMPILER_RT_ABI CMP_RESULT __unordtf2(fp_t a, fp_t b) {
 #define QUAD_PRECISION
 #include "fp_lib.h"
 
-COMPILER_RT_ABI fp_t __floatsitf(si_int a) {
-  const int aWidth = sizeof a * CHAR_BIT;
+static int cfree_clz_u32(su_int x) {
+  int n = 0;
+  for (int bit = 31; bit >= 0; --bit) {
+    if ((x >> (unsigned)bit) & 1u) break;
+    ++n;
+  }
+  return n;
+}
 
-  // Handle zero as a special case to protect clz
-  if (a == 0) return fromRep(0);
+static int cfree_clz_u64(du_int x) {
+  int n = 0;
+  for (int bit = 63; bit >= 0; --bit) {
+    if ((x >> (unsigned)bit) & 1u) break;
+    ++n;
+  }
+  return n;
+}
+
+static fp_t cfree_tf_from_u64(du_int mag, rep_t sign, int width) {
+  if (!mag) return fromRep(0);
+  int exponent = (width - 1) -
+                 (width == 32 ? cfree_clz_u32((su_int)mag)
+                              : cfree_clz_u64(mag));
+  int shift = significandBits - exponent;
+  rep_t result = ((rep_t)mag << shift) ^ implicitBit;
+  result |= (rep_t)(exponent + exponentBias) << significandBits;
+  return fromRep(result | sign);
+}
 
-  // All other cases begin by extracting the sign and absolute value of a
+COMPILER_RT_ABI fp_t __floatsitf(si_int a) {
   rep_t sign = 0;
-  su_int aAbs = (su_int)a;
+  su_int mag = (su_int)a;
   if (a < 0) {
     sign = signBit;
-    aAbs = -aAbs;
+    mag = (su_int)(0u - mag);
   }
-
-  // Exponent of (fp_t)a is the width of abs(a).
-  const int exponent = (aWidth - 1) - clzsi(aAbs);
-  rep_t result;
-
-  // Shift a into the significand field and clear the implicit bit.
-  const int shift = significandBits - exponent;
-  result = (rep_t)aAbs << shift ^ implicitBit;
-
-  // Insert the exponent
-  result += (rep_t)(exponent + exponentBias) << significandBits;
-  // Insert the sign bit and return
-  return fromRep(result | sign);
+  return cfree_tf_from_u64((du_int)mag, sign, 32);
 }
 
 // ---- floatunsitf.c ----
@@ -147,22 +388,7 @@ COMPILER_RT_ABI fp_t __floatsitf(si_int a) {
 #include "fp_lib.h"
 
 COMPILER_RT_ABI fp_t __floatunsitf(su_int a) {
-  const int aWidth = sizeof a * CHAR_BIT;
-
-  // Handle zero as a special case to protect clz
-  if (a == 0) return fromRep(0);
-
-  // Exponent of (fp_t)a is the width of abs(a).
-  const int exponent = (aWidth - 1) - clzsi(a);
-  rep_t result;
-
-  // Shift a into the significand field and clear the implicit bit.
-  const int shift = significandBits - exponent;
-  result = (rep_t)a << shift ^ implicitBit;
-
-  // Insert the exponent
-  result += (rep_t)(exponent + exponentBias) << significandBits;
-  return fromRep(result);
+  return cfree_tf_from_u64((du_int)a, 0, 32);
 }
 
 // ---- floatditf.c ----
@@ -170,31 +396,13 @@ COMPILER_RT_ABI fp_t __floatunsitf(su_int a) {
 #include "fp_lib.h"
 
 COMPILER_RT_ABI fp_t __floatditf(di_int a) {
-  const int aWidth = sizeof a * CHAR_BIT;
-
-  // Handle zero as a special case to protect clz
-  if (a == 0) return fromRep(0);
-
-  // All other cases begin by extracting the sign and absolute value of a
   rep_t sign = 0;
-  du_int aAbs = (du_int)a;
+  du_int mag = (du_int)a;
   if (a < 0) {
     sign = signBit;
-    aAbs = ~(du_int)a + 1U;
+    mag = (du_int)0 - mag;
   }
-
-  // Exponent of (fp_t)a is the width of abs(a).
-  const int exponent = (aWidth - 1) - __builtin_clzll(aAbs);
-  rep_t result;
-
-  // Shift a into the significand field, rounding if it is a right-shift
-  const int shift = significandBits - exponent;
-  result = (rep_t)aAbs << shift ^ implicitBit;
-
-  // Insert the exponent
-  result += (rep_t)(exponent + exponentBias) << significandBits;
-  // Insert the sign bit and return
-  return fromRep(result | sign);
+  return cfree_tf_from_u64(mag, sign, 64);
 }
 
 // ---- floatunditf.c ----
@@ -202,22 +410,7 @@ COMPILER_RT_ABI fp_t __floatditf(di_int a) {
 #include "fp_lib.h"
 
 COMPILER_RT_ABI fp_t __floatunditf(du_int a) {
-  const int aWidth = sizeof a * CHAR_BIT;
-
-  // Handle zero as a special case to protect clz
-  if (a == 0) return fromRep(0);
-
-  // Exponent of (fp_t)a is the width of abs(a).
-  const int exponent = (aWidth - 1) - __builtin_clzll(a);
-  rep_t result;
-
-  // Shift a into the significand field and clear the implicit bit.
-  const int shift = significandBits - exponent;
-  result = (rep_t)a << shift ^ implicitBit;
-
-  // Insert the exponent
-  result += (rep_t)(exponent + exponentBias) << significandBits;
-  return fromRep(result);
+  return cfree_tf_from_u64(a, 0, 64);
 }
 
 // ---- floattitf.c ----
diff --git a/rt/lib/int64/int64.c b/rt/lib/int64/int64.c
@@ -80,61 +80,165 @@ static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v,
   return udiv128by64to64default(u1, u0, v, r);
 }
 
+static inline int ut_is_zero(utwords a) {
+  return a.s.low == 0 && a.s.high == 0;
+}
+
+static inline int ut_cmp(utwords a, utwords b) {
+  if (a.s.high != b.s.high) return a.s.high < b.s.high ? -1 : 1;
+  if (a.s.low != b.s.low) return a.s.low < b.s.low ? -1 : 1;
+  return 0;
+}
+
+static inline utwords ut_add(utwords a, utwords b) {
+  utwords r;
+  r.s.low = a.s.low + b.s.low;
+  r.s.high = a.s.high + b.s.high + (r.s.low < a.s.low);
+  return r;
+}
+
+static inline utwords ut_sub(utwords a, utwords b) {
+  utwords r;
+  r.s.low = a.s.low - b.s.low;
+  r.s.high = a.s.high - b.s.high - (a.s.low < b.s.low);
+  return r;
+}
+
+static inline utwords ut_neg(utwords a) {
+  utwords z;
+  z.s.low = 0;
+  z.s.high = 0;
+  return ut_sub(z, a);
+}
+
+static inline utwords ut_shl1(utwords a) {
+  utwords r;
+  r.s.low = a.s.low << 1;
+  r.s.high = (a.s.high << 1) | (a.s.low >> 63);
+  return r;
+}
+
+static inline utwords ut_shr1(utwords a) {
+  utwords r;
+  r.s.low = (a.s.low >> 1) | (a.s.high << 63);
+  r.s.high = a.s.high >> 1;
+  return r;
+}
+
+static inline utwords ut_shl(utwords a, unsigned sh) {
+  utwords r;
+  if (sh >= 128u) {
+    r.s.low = 0;
+    r.s.high = 0;
+  } else if (sh == 0) {
+    r = a;
+  } else if (sh >= 64u) {
+    r.s.low = 0;
+    r.s.high = a.s.low << (sh - 64u);
+  } else {
+    r.s.low = a.s.low << sh;
+    r.s.high = (a.s.high << sh) | (a.s.low >> (64u - sh));
+  }
+  return r;
+}
+
+static inline utwords ut_lshr(utwords a, unsigned sh) {
+  utwords r;
+  if (sh >= 128u) {
+    r.s.low = 0;
+    r.s.high = 0;
+  } else if (sh == 0) {
+    r = a;
+  } else if (sh >= 64u) {
+    r.s.low = a.s.high >> (sh - 64u);
+    r.s.high = 0;
+  } else {
+    r.s.low = (a.s.low >> sh) | (a.s.high << (64u - sh));
+    r.s.high = a.s.high >> sh;
+  }
+  return r;
+}
+
+static inline twords t_ashr(twords a, unsigned sh) {
+  twords r;
+  if (sh >= 128u) {
+    r.s.low = a.s.high < 0 ? ~(du_int)0 : 0;
+    r.s.high = a.s.high < 0 ? (di_int)-1 : 0;
+  } else if (sh == 0) {
+    r = a;
+  } else if (sh >= 64u) {
+    r.s.low = (du_int)(a.s.high >> (sh - 64u));
+    r.s.high = a.s.high < 0 ? (di_int)-1 : 0;
+  } else {
+    r.s.low = ((du_int)a.s.high << (64u - sh)) | (a.s.low >> sh);
+    r.s.high = a.s.high >> sh;
+  }
+  return r;
+}
+
+static inline utwords ut_mul(utwords a, utwords b) {
+  utwords r;
+  const int half_bits = (int)(sizeof(du_int) * CHAR_BIT) / 2;
+  const du_int mask = (du_int)~0 >> half_bits;
+  du_int t;
+  r.s.low = (a.s.low & mask) * (b.s.low & mask);
+  t = r.s.low >> half_bits;
+  r.s.low &= mask;
+  t += (a.s.low >> half_bits) * (b.s.low & mask);
+  r.s.low += (t & mask) << half_bits;
+  r.s.high = t >> half_bits;
+  t = r.s.low >> half_bits;
+  r.s.low &= mask;
+  t += (b.s.low >> half_bits) * (a.s.low & mask);
+  r.s.low += (t & mask) << half_bits;
+  r.s.high += t >> half_bits;
+  r.s.high += (a.s.low >> half_bits) * (b.s.low >> half_bits);
+  r.s.high += a.s.high * b.s.low + a.s.low * b.s.high;
+  return r;
+}
+
+static inline void ut_udivmod(utwords n, utwords d, utwords* q, utwords* rem) {
+  utwords quotient;
+  utwords remainder;
+  quotient.s.low = 0;
+  quotient.s.high = 0;
+  remainder.s.low = 0;
+  remainder.s.high = 0;
+  if (ut_is_zero(d)) {
+    if (q) *q = quotient;
+    if (rem) *rem = n;
+    return;
+  }
+  for (int i = 127; i >= 0; --i) {
+    du_int bit =
+        i < 64 ? ((n.s.low >> (unsigned)i) & 1u)
+               : ((n.s.high >> (unsigned)(i - 64)) & 1u);
+    remainder = ut_shl1(remainder);
+    remainder.s.low |= bit;
+    if (ut_cmp(remainder, d) >= 0) {
+      remainder = ut_sub(remainder, d);
+      if (i < 64)
+        quotient.s.low |= (du_int)1 << (unsigned)i;
+      else
+        quotient.s.high |= (du_int)1 << (unsigned)(i - 64);
+    }
+  }
+  if (q) *q = quotient;
+  if (rem) *rem = remainder;
+}
+
 // Effects: if rem != 0, *rem = a % b
 // Returns: a / b
 
 COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int* rem) {
-  const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
   utwords dividend;
   dividend.all = a;
   utwords divisor;
   divisor.all = b;
   utwords quotient;
   utwords remainder;
-  if (divisor.all > dividend.all) {
-    if (rem) *rem = dividend.all;
-    return 0;
-  }
-  // When the divisor fits in 64 bits, we can use an optimized path.
-  if (divisor.s.high == 0) {
-    remainder.s.high = 0;
-    if (dividend.s.high < divisor.s.low) {
-      // The result fits in 64 bits.
-      quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
-                                       divisor.s.low, &remainder.s.low);
-      quotient.s.high = 0;
-    } else {
-      // First, divide with the high part to get the remainder in
-      // dividend.s.high. After that dividend.s.high < divisor.s.low.
-      quotient.s.high = dividend.s.high / divisor.s.low;
-      dividend.s.high = dividend.s.high % divisor.s.low;
-      quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
-                                       divisor.s.low, &remainder.s.low);
-    }
-    if (rem) *rem = remainder.all;
-    return quotient.all;
-  }
-  // 0 <= shift <= 63.
-  si_int shift =
-      __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high);
-  divisor.all <<= shift;
-  quotient.s.high = 0;
-  quotient.s.low = 0;
-  for (; shift >= 0; --shift) {
-    quotient.s.low <<= 1;
-    // Branch free version of.
-    // if (dividend.all >= divisor.all)
-    // {
-    //    dividend.all -= divisor.all;
-    //    carry = 1;
-    // }
-    const ti_int s =
-        (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1);
-    quotient.s.low |= s & 1;
-    dividend.all -= divisor.all & s;
-    divisor.all >>= 1;
-  }
-  if (rem) *rem = dividend.all;
+  ut_udivmod(dividend, divisor, &quotient, &remainder);
+  if (rem) *rem = remainder.all;
   return quotient.all;
 }
 
@@ -147,20 +251,11 @@ COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int* rem) {
 // Precondition:  0 <= b < bits_in_tword
 
 COMPILER_RT_ABI ti_int __ashlti3(ti_int a, int b) {
-  const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT);
-  twords input;
-  twords result;
+  utwords input;
+  utwords result;
   input.all = a;
-  if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */ {
-    result.s.low = 0;
-    result.s.high = input.s.low << (b - bits_in_dword);
-  } else /* 0 <= b < bits_in_dword */ {
-    if (b == 0) return a;
-    result.s.low = input.s.low << b;
-    result.s.high =
-        ((du_int)input.s.high << b) | (input.s.low >> (bits_in_dword - b));
-  }
-  return result.all;
+  result = ut_shl(input, (unsigned)b);
+  return (ti_int)result.all;
 }
 
 // ---- ashrti3.c ----
@@ -171,20 +266,10 @@ COMPILER_RT_ABI ti_int __ashlti3(ti_int a, int b) {
 // Precondition:  0 <= b < bits_in_tword
 
 COMPILER_RT_ABI ti_int __ashrti3(ti_int a, int b) {
-  const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT);
   twords input;
   twords result;
   input.all = a;
-  if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */ {
-    // result.s.high = input.s.high < 0 ? -1 : 0
-    result.s.high = input.s.high >> (bits_in_dword - 1);
-    result.s.low = input.s.high >> (b - bits_in_dword);
-  } else /* 0 <= b < bits_in_dword */ {
-    if (b == 0) return a;
-    result.s.high = input.s.high >> b;
-    result.s.low =
-        ((du_int)input.s.high << (bits_in_dword - b)) | (input.s.low >> b);
-  }
+  result = t_ashr(input, (unsigned)b);
   return result.all;
 }
 
@@ -226,19 +311,11 @@ COMPILER_RT_ABI int __ctzti2(ti_int a) {
 // Precondition:  0 <= b < bits_in_tword
 
 COMPILER_RT_ABI ti_int __lshrti3(ti_int a, int b) {
-  const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT);
   utwords input;
   utwords result;
   input.all = a;
-  if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */ {
-    result.s.high = 0;
-    result.s.low = input.s.high >> (b - bits_in_dword);
-  } else /* 0 <= b < bits_in_dword */ {
-    if (b == 0) return a;
-    result.s.high = input.s.high >> b;
-    result.s.low = (input.s.high << (bits_in_dword - b)) | (input.s.low >> b);
-  }
-  return result.all;
+  result = ut_lshr(input, (unsigned)b);
+  return (ti_int)result.all;
 }
 
 // ---- multi3.c ----
@@ -268,14 +345,13 @@ static ti_int __mulddi3(du_int a, du_int b) {
 // Returns: a * b
 
 COMPILER_RT_ABI ti_int __multi3(ti_int a, ti_int b) {
-  twords x;
-  x.all = a;
-  twords y;
-  y.all = b;
-  twords r;
-  r.all = __mulddi3(x.s.low, y.s.low);
-  r.s.high += x.s.high * y.s.low + x.s.low * y.s.high;
-  return r.all;
+  utwords x;
+  utwords y;
+  utwords r;
+  x.all = (tu_int)a;
+  y.all = (tu_int)b;
+  r = ut_mul(x, y);
+  return (ti_int)r.all;
 }
 
 // ---- negti2.c ----
@@ -284,9 +360,111 @@ COMPILER_RT_ABI ti_int __multi3(ti_int a, ti_int b) {
 // Returns: -a
 
 COMPILER_RT_ABI ti_int __negti2(ti_int a) {
-  // Note: this routine is here for API compatibility; any sane compiler
-  // should expand it inline.
-  return -(tu_int)a;
+  utwords x;
+  utwords r;
+  x.all = (tu_int)a;
+  r = ut_neg(x);
+  return (ti_int)r.all;
+}
+
+COMPILER_RT_ABI ti_int __cfree_addti3(ti_int a, ti_int b) {
+  utwords x;
+  utwords y;
+  utwords r;
+  x.all = (tu_int)a;
+  y.all = (tu_int)b;
+  r = ut_add(x, y);
+  return (ti_int)r.all;
+}
+
+COMPILER_RT_ABI ti_int __cfree_subti3(ti_int a, ti_int b) {
+  utwords x;
+  utwords y;
+  utwords r;
+  x.all = (tu_int)a;
+  y.all = (tu_int)b;
+  r = ut_sub(x, y);
+  return (ti_int)r.all;
+}
+
+COMPILER_RT_ABI ti_int __cfree_andti3(ti_int a, ti_int b) {
+  utwords x;
+  utwords y;
+  utwords r;
+  x.all = (tu_int)a;
+  y.all = (tu_int)b;
+  r.s.low = x.s.low & y.s.low;
+  r.s.high = x.s.high & y.s.high;
+  return (ti_int)r.all;
+}
+
+COMPILER_RT_ABI ti_int __cfree_orti3(ti_int a, ti_int b) {
+  utwords x;
+  utwords y;
+  utwords r;
+  x.all = (tu_int)a;
+  y.all = (tu_int)b;
+  r.s.low = x.s.low | y.s.low;
+  r.s.high = x.s.high | y.s.high;
+  return (ti_int)r.all;
+}
+
+COMPILER_RT_ABI ti_int __cfree_xorti3(ti_int a, ti_int b) {
+  utwords x;
+  utwords y;
+  utwords r;
+  x.all = (tu_int)a;
+  y.all = (tu_int)b;
+  r.s.low = x.s.low ^ y.s.low;
+  r.s.high = x.s.high ^ y.s.high;
+  return (ti_int)r.all;
+}
+
+COMPILER_RT_ABI ti_int __cfree_notti3(ti_int a) {
+  utwords x;
+  utwords r;
+  x.all = (tu_int)a;
+  r.s.low = ~x.s.low;
+  r.s.high = ~x.s.high;
+  return (ti_int)r.all;
+}
+
+COMPILER_RT_ABI ti_int __cfree_sext64ti(di_int a) {
+  twords r;
+  r.s.low = (du_int)a;
+  r.s.high = a < 0 ? -1 : 0;
+  return r.all;
+}
+
+COMPILER_RT_ABI ti_int __cfree_zext64ti(du_int a) {
+  utwords r;
+  r.s.low = a;
+  r.s.high = 0;
+  return (ti_int)r.all;
+}
+
+COMPILER_RT_ABI si_int __cfree_cmpti2(ti_int a, ti_int b) {
+  twords x;
+  twords y;
+  x.all = a;
+  y.all = b;
+  if (x.s.high < y.s.high) return -1;
+  if (x.s.high > y.s.high) return 1;
+  if (x.s.low < y.s.low) return -1;
+  if (x.s.low > y.s.low) return 1;
+  return 0;
+}
+
+COMPILER_RT_ABI si_int __cfree_ucmpti2(tu_int a, tu_int b) {
+  utwords x;
+  utwords y;
+  x.all = a;
+  y.all = b;
+  if (x.s.high < y.s.high) return -1;
+  if (x.s.high > y.s.high) return 1;
+  if (x.s.low < y.s.low) return -1;
+  if (x.s.low > y.s.low) return 1;
+  return 0;
 }
 
 // Callers of __udivmodti4:
@@ -316,16 +494,27 @@ COMPILER_RT_ABI tu_int __umodti3(tu_int a, tu_int b) {
 // Returns: a / b, *rem = a % b
 
 COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, ti_int* rem) {
-  const int bits_in_tword_m1 = (int)(sizeof(ti_int) * CHAR_BIT) - 1;
-  ti_int s_a = a >> bits_in_tword_m1;  // s_a = a < 0 ? -1 : 0
-  ti_int s_b = b >> bits_in_tword_m1;  // s_b = b < 0 ? -1 : 0
-  a = (tu_int)(a ^ s_a) - s_a;         // negate if s_a == -1
-  b = (tu_int)(b ^ s_b) - s_b;         // negate if s_b == -1
-  s_b ^= s_a;                          // sign of quotient
-  tu_int r;
-  ti_int q = (__udivmodti4(a, b, &r) ^ s_b) - s_b;  // negate if s_b == -1
-  *rem = (r ^ s_a) - s_a;                           // negate if s_a == -1
-  return q;
+  twords sa;
+  twords sb;
+  utwords ua;
+  utwords ub;
+  utwords uq;
+  utwords ur;
+  int neg_a;
+  int neg_b;
+  sa.all = a;
+  sb.all = b;
+  neg_a = sa.s.high < 0;
+  neg_b = sb.s.high < 0;
+  ua.all = (tu_int)a;
+  ub.all = (tu_int)b;
+  if (neg_a) ua = ut_neg(ua);
+  if (neg_b) ub = ut_neg(ub);
+  ut_udivmod(ua, ub, &uq, &ur);
+  if (neg_a != neg_b) uq = ut_neg(uq);
+  if (neg_a) ur = ut_neg(ur);
+  if (rem) *rem = (ti_int)ur.all;
+  return (ti_int)uq.all;
 }
 
 // ---- divti3.c ----
@@ -333,14 +522,9 @@ COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, ti_int* rem) {
 
 // Returns: a / b
 
-#define fixint_t ti_int
-#define fixuint_t tu_int
-#define INT_DIV_SUFFIX divti3
-#define COMPUTE_UDIV(a, b) __udivmodti4((a), (b), (tu_int*)0)
-#include "int_div_impl.inc"
-
 COMPILER_RT_ABI ti_int __divti3(ti_int a, ti_int b) {
-  return __divXi3_divti3(a, b);
+  ti_int r;
+  return __divmodti4(a, b, &r);
 }
 
 // ---- modti3.c ----
@@ -348,12 +532,8 @@ COMPILER_RT_ABI ti_int __divti3(ti_int a, ti_int b) {
 
 // Returns: a % b
 
-#define fixint_t ti_int
-#define fixuint_t tu_int
-#define INT_DIV_SUFFIX modti3
-#define ASSIGN_UMOD(res, a, b) __udivmodti4((a), (b), &(res))
-#include "int_div_impl.inc"
-
 COMPILER_RT_ABI ti_int __modti3(ti_int a, ti_int b) {
-  return __modXi3_modti3(a, b);
+  ti_int r;
+  (void)__divmodti4(a, b, &r);
+  return r;
 }
diff --git a/src/abi/abi_aapcs64.c b/src/abi/abi_aapcs64.c
@@ -20,6 +20,23 @@
 
 static void classify_scalar(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out) {
   ABITypeInfo ti = abi_internal_type_info(a, t);
+  if (ti.scalar_kind == ABI_SC_INT && ti.size == 16) {
+    ABIArgPart* parts = arena_array(a->c->tu, ABIArgPart, 2);
+    memset(parts, 0, sizeof(ABIArgPart) * 2);
+    for (u32 i = 0; i < 2; ++i) {
+      parts[i].cls = ABI_CLASS_INT;
+      parts[i].loc = ABI_LOC_REG;
+      parts[i].size = 8;
+      parts[i].align = 8;
+      parts[i].src_offset = i * 8;
+    }
+    out->kind = ABI_ARG_DIRECT;
+    out->flags = ABI_AF_NONE;
+    out->parts = parts;
+    out->nparts = 2;
+    out->indirect_align = 0;
+    return;
+  }
   out->kind = ABI_ARG_DIRECT;
   out->flags = ABI_AF_NONE;
   out->indirect_align = 0;
diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c
@@ -20,7 +20,8 @@
 
 static void classify_scalar(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out) {
   ABITypeInfo ti = abi_internal_type_info(a, t);
-  if (ti.scalar_kind == ABI_SC_FLOAT && ti.size == 16) {
+  if (ti.size == 16 &&
+      (ti.scalar_kind == ABI_SC_INT || ti.scalar_kind == ABI_SC_FLOAT)) {
     ABIArgPart* parts = arena_array(a->c->tu, ABIArgPart, 2);
     memset(parts, 0, sizeof(ABIArgPart) * 2);
     parts[0].cls = ABI_CLASS_INT;
diff --git a/src/api/cg.c b/src/api/cg.c
@@ -1303,6 +1303,17 @@ static int api_is_f128_type(Compiler *c, CfreeCgTypeId ty) {
   return cg && cg->kind == CFREE_CG_TYPE_FLOAT && cg->fp.width == 128;
 }
 
+static int api_is_i128_type(Compiler *c, CfreeCgTypeId ty) {
+  const CgType *cg;
+  ty = api_unalias_type(c, ty);
+  cg = cg_type_get(c, ty);
+  return cg && cg->kind == CFREE_CG_TYPE_INT && cg->integer.width == 128;
+}
+
+static int api_is_wide16_scalar_type(Compiler *c, CfreeCgTypeId ty) {
+  return api_is_f128_type(c, ty) || api_is_i128_type(c, ty);
+}
+
 static Operand api_op_imm(i64 v, CfreeCgTypeId ty) {
   Operand o;
   memset(&o, 0, sizeof o);
@@ -2105,7 +2116,7 @@ static void api_release_arg_storage(CfreeCg *g, Operand *storage) {
     api_free_reg(g, storage->v.reg, storage->cls);
   } else if (storage->kind == OPK_LOCAL && storage->cls < 3) {
     CfreeCgTypeId ty = storage->type;
-    if (cg_type_is_aggregate(g->c, ty) || api_is_f128_type(g->c, ty))
+    if (cg_type_is_aggregate(g->c, ty) || api_is_wide16_scalar_type(g->c, ty))
       return;
     api_return_spill_slot(g, storage->v.frame_slot, storage->cls);
   } else if (storage->kind == OPK_INDIRECT) {
@@ -3189,7 +3200,8 @@ static void api_encode_binary128_from_double(CfreeCg *g, double value,
     double d;
     u64 u;
   } in;
-  unsigned __int128 rep = 0;
+  u64 lo = 0;
+  u64 hi = 0;
   u64 frac;
   u32 sign;
   u32 exp;
@@ -3198,12 +3210,13 @@ static void api_encode_binary128_from_double(CfreeCg *g, double value,
   exp = (u32)((in.u >> 52) & 0x7ffu);
   frac = in.u & 0x000fffffffffffffull;
   if (sign)
-    rep |= ((unsigned __int128)1) << 127;
+    hi |= 1ull << 63;
   if (exp == 0x7ffu) {
-    rep |= ((unsigned __int128)0x7fffu) << 112;
+    hi |= (u64)0x7fffu << 48;
     if (frac) {
-      rep |= ((unsigned __int128)frac) << (112u - 52u);
-      rep |= ((unsigned __int128)1) << 111;
+      lo |= (frac & 0xfu) << 60;
+      hi |= frac >> 4;
+      hi |= 1ull << 47;
     }
   } else if (exp != 0 || frac != 0) {
     i32 e;
@@ -3219,12 +3232,20 @@ static void api_encode_binary128_from_double(CfreeCg *g, double value,
     } else {
       e = (i32)exp - 1023;
     }
-    rep |= ((unsigned __int128)(u32)(e + 16383)) << 112;
-    rep |= ((unsigned __int128)frac) << (112u - 52u);
+    hi |= (u64)(u32)(e + 16383) << 48;
+    lo |= (frac & 0xfu) << 60;
+    hi |= frac >> 4;
   }
   for (u32 i = 0; i < 16; ++i) {
-    u32 shift = g->c->target.big_endian ? (15u - i) * 8u : i * 8u;
-    out[i] = (u8)(rep >> shift);
+    if (g->c->target.big_endian) {
+      u64 lane = i < 8u ? hi : lo;
+      u32 shift = (7u - (i & 7u)) * 8u;
+      out[i] = (u8)(lane >> shift);
+    } else {
+      u64 lane = i < 8u ? lo : hi;
+      u32 shift = (i & 7u) * 8u;
+      out[i] = (u8)(lane >> shift);
+    }
   }
 }
 
@@ -3238,8 +3259,8 @@ static ApiSValue api_make_f128_const(CfreeCg *g, double value,
   return api_make_lv(api_op_local(slot, ty), ty);
 }
 
-static ApiSValue api_f128_materialize_lvalue(CfreeCg *g, ApiSValue *v,
-                                             CfreeCgTypeId ty) {
+static ApiSValue api_wide16_materialize_lvalue(CfreeCg *g, ApiSValue *v,
+                                               CfreeCgTypeId ty) {
   if (v->op.kind == OPK_LOCAL || v->op.kind == OPK_INDIRECT) {
     v->type = ty;
     v->op.type = ty;
@@ -3283,7 +3304,7 @@ static ApiSValue api_f128_materialize_lvalue(CfreeCg *g, ApiSValue *v,
     return api_make_lv(api_op_local(slot, ty), ty);
   }
   compiler_panic(g->c, g->cur_loc,
-                 "CfreeCg: binary128 value is not addressable (kind %u, op %u)",
+                 "CfreeCg: 16-byte scalar value is not addressable (kind %u, op %u)",
                  (unsigned)v->kind, (unsigned)v->op.kind);
   return *v;
 }
@@ -3339,7 +3360,7 @@ static int api_local_requires_memory(CfreeCg *g, CfreeCgTypeId ty,
                                      CfreeCgLocalAttrs attrs) {
   if (api_source_flags_addr_taken(attrs.flags))
     return 1;
-  if (api_is_f128_type(g->c, ty))
+  if (api_is_wide16_scalar_type(g->c, ty))
     return 1;
   return !(cg_type_is_int(g->c, ty) || cg_type_is_float(g->c, ty) ||
            cg_type_is_ptr(g->c, ty));
@@ -3805,7 +3826,7 @@ void cfree_cg_load(CfreeCg *g, CfreeCgMemAccess access) {
     return;
   }
   api_require_scalar_mem_type(g, "load", ty);
-  if (api_is_f128_type(g->c, ty)) {
+  if (api_is_wide16_scalar_type(g->c, ty)) {
     v.type = ty;
     v.op.type = ty;
     api_push(g, v);
@@ -3968,7 +3989,7 @@ void cfree_cg_store(CfreeCg *g, CfreeCgMemAccess access) {
     return;
   }
   api_validate_memory_value(g, "store", ty, api_sv_type(&rv));
-  if (api_is_f128_type(g->c, ty)) {
+  if (api_is_wide16_scalar_type(g->c, ty)) {
     if (lv.source_local != CFREE_CG_LOCAL_NONE) {
       api_local_const_clear(api_local_from_handle(g, lv.source_local));
     } else if (lv.op.kind == OPK_INDIRECT || lv.op.kind == OPK_GLOBAL ||
@@ -3986,6 +4007,35 @@ void cfree_cg_store(CfreeCg *g, CfreeCgMemAccess access) {
       T->copy_bytes(T, dst_addr, src_addr, agg);
       api_free_reg(g, dst_addr.v.reg, RC_INT);
       api_free_reg(g, src_addr.v.reg, RC_INT);
+    } else if (rv.op.kind == OPK_IMM) {
+      u8 bytes[16];
+      u64 lo = (u64)rv.op.v.imm;
+      u64 hi = rv.op.v.imm < 0 ? ~(u64)0 : 0;
+      memset(bytes, 0, sizeof bytes);
+      for (u32 i = 0; i < 8; ++i) {
+        u32 lo_idx = g->c->target.big_endian ? 15u - i : i;
+        u32 hi_idx = g->c->target.big_endian ? 7u - i : 8u + i;
+        bytes[lo_idx] = (u8)(lo >> (i * 8u));
+        bytes[hi_idx] = (u8)(hi >> (i * 8u));
+      }
+      if (lv.op.kind == OPK_LOCAL) {
+        api_store_f128_bytes(g, lv.op.v.frame_slot, ty, bytes);
+      } else {
+        FrameSlot slot = api_f128_temp_slot(g, ty);
+        ApiSValue tmp = api_make_lv(api_op_local(slot, ty), ty);
+        CfreeCgTypeId ptr_ty = cg_type_ptr_to(g->c, ty);
+        Operand dst_addr = api_lvalue_addr(g, &lv, ptr_ty);
+        Operand src_addr;
+        AggregateAccess agg;
+        api_store_f128_bytes(g, slot, ty, bytes);
+        src_addr = api_lvalue_addr(g, &tmp, ptr_ty);
+        memset(&agg, 0, sizeof agg);
+        agg.size = 16;
+        agg.align = access.align ? access.align : 16;
+        T->copy_bytes(T, dst_addr, src_addr, agg);
+        api_free_reg(g, dst_addr.v.reg, RC_INT);
+        api_free_reg(g, src_addr.v.reg, RC_INT);
+      }
     } else {
       src = api_force_reg(g, &rv, ty);
       T->store(T, lv.op, src, api_mem_from_access(g, &lv.op, access));
@@ -4113,6 +4163,12 @@ void cfree_cg_rot3(CfreeCg *g) {
  * Arithmetic / compare / convert
  * ============================================================ */
 
+static const char *api_i128_binop_helper(BinOp op);
+static int api_i128_cmp_is_unsigned(CmpOp op);
+static void api_cg_cmp(CfreeCg *g, CmpOp cop);
+static void api_f128_call_unary(CfreeCg *g, const char *name,
+                                CfreeCgTypeId ret, CfreeCgTypeId param);
+
 static void api_cg_binop(CfreeCg *g, BinOp iop, u32 flags) {
   ApiSValue b, a;
   CGTarget *T;
@@ -4129,6 +4185,22 @@ static void api_cg_binop(CfreeCg *g, BinOp iop, u32 flags) {
   a = api_pop(g);
   ty = a.type ? a.type : b.type;
 
+  if (api_is_i128_type(g->c, ty)) {
+    CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+    CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32);
+    CfreeCgTypeId ps[2];
+    ApiSValue args[2];
+    const char *name = api_i128_binop_helper(iop);
+    if (!name)
+      compiler_panic(g->c, g->cur_loc, "CfreeCg: i128 binop unsupported");
+    args[0] = a;
+    args[1] = b;
+    ps[0] = i128;
+    ps[1] = (iop == BO_SHL || iop == BO_SHR_U || iop == BO_SHR_S) ? i32 : i128;
+    api_runtime_call_values(g, name, i128, ps, 2, args);
+    return;
+  }
+
   if (!flags && api_sv_op_is(&a, OPK_IMM) && api_sv_op_is(&b, OPK_IMM) &&
       api_try_fold_int_binop(g, iop, ty, a.op.v.imm, b.op.v.imm, &folded)) {
     api_release(g, &a);
@@ -4195,6 +4267,27 @@ static void api_cg_unop(CfreeCg *g, UnOp iop, u32 flags) {
   a = api_pop(g);
   ty = a.type ? a.type : a.op.type;
 
+  if (api_is_i128_type(g->c, ty)) {
+    CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+    CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32);
+    if (iop == UO_NEG || iop == UO_BNOT) {
+      const char *name = (iop == UO_NEG) ? "__negti2" : "__cfree_notti3";
+      api_push(g, a);
+      api_f128_call_unary(g, name, i128, i128);
+      return;
+    }
+    if (iop == UO_NOT) {
+      CfreeCgTypeId ps[2] = {i128, i128};
+      ApiSValue args[2];
+      args[0] = a;
+      args[1] = api_make_sv(api_op_imm(0, i128), i128);
+      api_runtime_call_values(g, "__cfree_ucmpti2", i32, ps, 2, args);
+      cfree_cg_push_int(g, 0, i32);
+      api_cg_cmp(g, CMP_EQ);
+      return;
+    }
+  }
+
   if (!flags && api_sv_op_is(&a, OPK_IMM) &&
       api_try_fold_int_unop(g, iop, ty, a.op.v.imm, &folded)) {
     api_release(g, &a);
@@ -4242,6 +4335,34 @@ static void api_cg_cmp(CfreeCg *g, CmpOp cop) {
   opty = a.type ? a.type : b.type;
   i32 = builtin_id(CFREE_CG_BUILTIN_I32);
 
+  if (api_is_i128_type(g->c, opty)) {
+    CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+    CfreeCgTypeId ps[2] = {i128, i128};
+    ApiSValue args[2];
+    CmpOp icmp = CMP_EQ;
+    const char *name = api_i128_cmp_is_unsigned(cop) ? "__cfree_ucmpti2"
+                                                     : "__cfree_cmpti2";
+    switch (cop) {
+    case CMP_EQ: icmp = CMP_EQ; break;
+    case CMP_NE: icmp = CMP_NE; break;
+    case CMP_LT_S:
+    case CMP_LT_U: icmp = CMP_LT_S; break;
+    case CMP_LE_S:
+    case CMP_LE_U: icmp = CMP_LE_S; break;
+    case CMP_GT_S:
+    case CMP_GT_U: icmp = CMP_GT_S; break;
+    case CMP_GE_S:
+    case CMP_GE_U: icmp = CMP_GE_S; break;
+    default: icmp = CMP_EQ; break;
+    }
+    args[0] = a;
+    args[1] = b;
+    api_runtime_call_values(g, name, i32, ps, 2, args);
+    cfree_cg_push_int(g, 0, i32);
+    api_cg_cmp(g, icmp);
+    return;
+  }
+
   if (api_sv_op_is(&a, OPK_IMM) && api_sv_op_is(&b, OPK_IMM) &&
       api_try_fold_int_cmp(g, cop, opty, a.op.v.imm, b.op.v.imm, &folded)) {
     api_release(g, &a);
@@ -4294,6 +4415,118 @@ static void api_cg_convert_kind(CfreeCg *g, CfreeCgTypeId dst_type,
     api_push(g, v);
     return;
   }
+  if (api_is_i128_type(g->c, sty) && api_type_is_bool(g->c, dty) &&
+      ck != CV_BITCAST) {
+    CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+    CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32);
+    CfreeCgTypeId ps[2] = {i128, i128};
+    ApiSValue args[2];
+    ApiSValue r;
+    args[0] = v;
+    args[1] = api_make_sv(api_op_imm(0, i128), i128);
+    api_runtime_call_values(g, "__cfree_ucmpti2", i32, ps, 2, args);
+    cfree_cg_push_int(g, 0, i32);
+    api_cg_cmp(g, CMP_NE);
+    r = api_pop(g);
+    r.type = dty;
+    r.op.type = dty;
+    api_push(g, r);
+    return;
+  }
+  if (api_is_i128_type(g->c, dty) && !api_is_i128_type(g->c, sty) &&
+      ck != CV_BITCAST) {
+    u32 sz = (u32)abi_cg_sizeof(g->c->abi, sty);
+    CfreeCgTypeId i64_ty = builtin_id(CFREE_CG_BUILTIN_I64);
+    FrameSlot slot = api_f128_temp_slot(g, dty);
+    Operand dst_lv = api_op_local(slot, dty);
+    if (api_sv_op_is(&v, OPK_IMM)) {
+      u8 bytes[16];
+      u64 lo = (u64)v.op.v.imm;
+      u64 hi = 0;
+      if (ck == CV_SEXT && sz <= 8) {
+        u32 bits = sz * 8u;
+        u64 mask = bits >= 64u ? ~(u64)0 : ((1ull << bits) - 1ull);
+        u64 sign = 1ull << (bits - 1u);
+        u64 u = lo & mask;
+        if (u & sign)
+          u |= ~mask;
+        lo = u;
+        hi = (u & (1ull << 63)) ? ~(u64)0 : 0;
+      }
+      memset(bytes, 0, sizeof bytes);
+      for (u32 i = 0; i < 8; ++i) {
+        u32 lo_idx = g->c->target.big_endian ? 15u - i : i;
+        u32 hi_idx = g->c->target.big_endian ? 7u - i : 8u + i;
+        bytes[lo_idx] = (u8)(lo >> (i * 8u));
+        bytes[hi_idx] = (u8)(hi >> (i * 8u));
+      }
+      api_store_f128_bytes(g, slot, dty, bytes);
+      api_release(g, &v);
+      api_push(g, api_make_lv(dst_lv, dty));
+      return;
+    }
+    {
+      CfreeCgTypeId ptr_ty = cg_type_ptr_to(g->c, dty);
+      CfreeCgTypeId src_ty = sty;
+      Operand src = api_force_reg(g, &v, sty);
+      Operand low = src;
+      Operand base;
+      Reg low_tmp = REG_NONE;
+      Reg ar;
+      MemAccess ma;
+      memset(&ma, 0, sizeof ma);
+      ma.type = i64_ty;
+      ma.size = 8;
+      ma.align = 8;
+      if (sz < 8) {
+        low_tmp = api_alloc_reg_or_spill(g, RC_INT, i64_ty);
+        low = api_op_reg(low_tmp, i64_ty);
+        T->convert(T, ck == CV_SEXT ? CV_SEXT : CV_ZEXT, low, src);
+        src_ty = i64_ty;
+      } else {
+        low.type = i64_ty;
+      }
+      ar = api_alloc_reg_or_spill(g, RC_INT, ptr_ty);
+      base = api_op_reg(ar, ptr_ty);
+      T->addr_of(T, base, dst_lv);
+      T->store(T, api_op_indirect(ar, 0, i64_ty), low, ma);
+      if (ck == CV_SEXT) {
+        Reg hr = api_alloc_reg_or_spill(g, RC_INT, i64_ty);
+        Operand high = api_op_reg(hr, i64_ty);
+        T->binop(T, BO_SHR_S, high, low, api_op_imm(63, i64_ty));
+        T->store(T, api_op_indirect(ar, 8, i64_ty), high, ma);
+        api_free_reg(g, hr, RC_INT);
+      } else {
+        T->store(T, api_op_indirect(ar, 8, i64_ty), api_op_imm(0, i64_ty), ma);
+      }
+      if (low_tmp != REG_NONE)
+        api_free_reg(g, low_tmp, RC_INT);
+      (void)src_ty;
+      api_free_reg(g, ar, RC_INT);
+      api_release(g, &v);
+      api_push(g, api_make_lv(dst_lv, dty));
+    }
+    return;
+  }
+  if (api_is_i128_type(g->c, sty) && !api_is_i128_type(g->c, dty) &&
+      ck == CV_TRUNC && abi_cg_sizeof(g->c->abi, dty) <= 8) {
+    Reg rr = api_alloc_reg_or_spill(g, RC_INT, dty);
+    Operand dst = api_op_reg(rr, dty);
+    if (api_is_lvalue_sv(&v) || v.op.kind == OPK_LOCAL ||
+        v.op.kind == OPK_INDIRECT || v.op.kind == OPK_GLOBAL) {
+      ApiSValue lv = v;
+      lv.lvalue = 1;
+      T->load(T, dst, lv.op, api_mem_for_lvalue(g, &lv.op, dty));
+    } else if (v.op.kind == OPK_IMM) {
+      T->load_imm(T, dst, v.op.v.imm);
+    } else {
+      compiler_panic(g->c, g->cur_loc,
+                     "CfreeCg: unsupported i128 truncation source");
+    }
+    api_release(g, &v);
+    api_push(g, api_make_sv(dst, dty));
+    return;
+  }
   if (ck == CV_BITCAST &&
       abi_cg_sizeof(g->c->abi, sty) == abi_cg_sizeof(g->c->abi, dst_type) &&
       api_type_class(sty) == api_type_class(dty)) {
@@ -4369,6 +4602,34 @@ void cfree_cg_int_cmp(CfreeCg *g, CfreeCgIntCmpOp op) {
   api_cg_cmp(g, api_map_int_cmp(op));
 }
 
+static const char *api_i128_binop_helper(BinOp op) {
+  switch (op) {
+  case BO_IADD: return "__cfree_addti3";
+  case BO_ISUB: return "__cfree_subti3";
+  case BO_IMUL: return "__multi3";
+  case BO_SDIV: return "__divti3";
+  case BO_UDIV: return "__udivti3";
+  case BO_SREM: return "__modti3";
+  case BO_UREM: return "__umodti3";
+  case BO_AND: return "__cfree_andti3";
+  case BO_OR: return "__cfree_orti3";
+  case BO_XOR: return "__cfree_xorti3";
+  case BO_SHL: return "__ashlti3";
+  case BO_SHR_U: return "__lshrti3";
+  case BO_SHR_S: return "__ashrti3";
+  case BO_FADD:
+  case BO_FSUB:
+  case BO_FMUL:
+  case BO_FDIV:
+  default:
+    return NULL;
+  }
+}
+
+static int api_i128_cmp_is_unsigned(CmpOp op) {
+  return op == CMP_LT_U || op == CMP_LE_U || op == CMP_GT_U || op == CMP_GE_U;
+}
+
 static const char *api_f128_binop_helper(CfreeCgFpBinOp op) {
   switch (op) {
   case CFREE_CG_FP_ADD: return "__addtf3";
@@ -5380,6 +5641,19 @@ static void api_branch_if(CfreeCg *g, ApiSValue *v, int branch_when_true,
     api_release(g, v);
     return;
   }
+  if (api_is_i128_type(g->c, ty)) {
+    CfreeCgTypeId i128 = builtin_id(CFREE_CG_BUILTIN_I128);
+    CfreeCgTypeId i32 = builtin_id(CFREE_CG_BUILTIN_I32);
+    CfreeCgTypeId ps[2] = {i128, i128};
+    ApiSValue args[2];
+    ApiSValue cmp;
+    args[0] = *v;
+    args[1] = api_make_sv(api_op_imm(0, i128), i128);
+    api_runtime_call_values(g, "__cfree_ucmpti2", i32, ps, 2, args);
+    cmp = api_pop(g);
+    api_branch_if(g, &cmp, branch_when_true, label);
+    return;
+  }
   {
     Operand a = api_force_reg(g, v, ty);
     Operand zero = api_op_imm(0, ty);
@@ -6112,8 +6386,8 @@ void cfree_cg_call(CfreeCg *g, uint32_t nargs, CfreeCgTypeId fn_type,
     avs[idx].type = aty;
     avs[idx].abi = is_vararg ? NULL : &abi->params[idx];
     int is_aggregate = cg_type_is_aggregate(g->c, aty);
-    if (api_is_f128_type(g->c, aty)) {
-      ApiSValue lv = api_f128_materialize_lvalue(g, &arg, aty);
+    if (api_is_wide16_scalar_type(g->c, aty)) {
+      ApiSValue lv = api_wide16_materialize_lvalue(g, &arg, aty);
       avs[idx].storage = lv.op;
       avs[idx].storage.type = aty;
       avs[idx].size = 16;
@@ -6150,14 +6424,14 @@ void cfree_cg_call(CfreeCg *g, uint32_t nargs, CfreeCgTypeId fn_type,
 
   if (has_result) {
     int ret_is_aggregate = cg_type_is_aggregate(g->c, ret_ty);
-    if (ret_is_aggregate || api_is_f128_type(g->c, ret_ty)) {
+    if (ret_is_aggregate || api_is_wide16_scalar_type(g->c, ret_ty)) {
       FrameSlotDesc fsd;
       memset(&fsd, 0, sizeof fsd);
       fsd.type = ret_ty;
       fsd.size = abi_cg_sizeof(g->c->abi, ret_ty);
       fsd.align = abi_cg_alignof(g->c->abi, ret_ty);
       fsd.kind = FS_LOCAL;
-      if (ret_is_aggregate || api_is_f128_type(g->c, ret_ty))
+      if (ret_is_aggregate || api_is_wide16_scalar_type(g->c, ret_ty))
         fsd.flags = FSF_ADDR_TAKEN;
       FrameSlot ret_slot = T->frame_slot(T, &fsd);
       desc.ret.storage = api_op_local(ret_slot, ret_ty);
@@ -6300,8 +6574,8 @@ static void api_call_symbol_common(CfreeCg *g, CfreeCgSym sym, uint32_t nargs,
       aty = arg.type;
     avs[idx].type = aty;
     avs[idx].abi = is_vararg ? NULL : &abi->params[idx];
-    if (api_is_f128_type(g->c, aty)) {
-      ApiSValue lv = api_f128_materialize_lvalue(g, &arg, aty);
+    if (api_is_wide16_scalar_type(g->c, aty)) {
+      ApiSValue lv = api_wide16_materialize_lvalue(g, &arg, aty);
       avs[idx].storage = lv.op;
       avs[idx].storage.type = aty;
       avs[idx].size = 16;
@@ -6330,7 +6604,8 @@ static void api_call_symbol_common(CfreeCg *g, CfreeCgSym sym, uint32_t nargs,
   desc.ret.type = ret_ty;
   desc.ret.abi = &abi->ret;
   if (has_result) {
-    if (cg_type_is_aggregate(g->c, ret_ty) || api_is_f128_type(g->c, ret_ty)) {
+    if (cg_type_is_aggregate(g->c, ret_ty) ||
+        api_is_wide16_scalar_type(g->c, ret_ty)) {
       FrameSlotDesc fsd;
       FrameSlot ret_slot;
       memset(&fsd, 0, sizeof fsd);
@@ -6398,8 +6673,8 @@ void cfree_cg_ret(CfreeCg *g) {
     T->ret(T, &av);
     return;
   }
-  if (api_is_f128_type(g->c, rty)) {
-    ApiSValue lv = api_f128_materialize_lvalue(g, &v, rty);
+  if (api_is_wide16_scalar_type(g->c, rty)) {
+    ApiSValue lv = api_wide16_materialize_lvalue(g, &v, rty);
     av.storage = lv.op;
     av.storage.type = rty;
     av.size = 16;
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -100,6 +100,11 @@ static RelocKind ldst_lo12_reloc_for(u32 nbytes) {
 
 static void aa_emit_ldr_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn,
                                i32 off) {
+  if (off < -256 || off > 255) {
+    aa64_emit_addr_adjust(mc, AA_TMP0, rn, off);
+    rn = AA_TMP0;
+    off = 0;
+  }
   if (sidx == 4)
     aa64_emit32(mc, aa64_ldur_q(rt, rn, off));
   else
@@ -108,6 +113,11 @@ static void aa_emit_ldr_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn,
 
 static void aa_emit_str_fp_any(MCEmitter* mc, u32 sidx, u32 rt, u32 rn,
                                i32 off) {
+  if (off < -256 || off > 255) {
+    aa64_emit_addr_adjust(mc, AA_TMP0, rn, off);
+    rn = AA_TMP0;
+    off = 0;
+  }
   if (sidx == 4)
     aa64_emit32(mc, aa64_stur_q(rt, rn, off));
   else
@@ -1271,7 +1281,7 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) {
       u32 sidx = size_idx_for_bytes(p->size);
       i32 off = base_off + (i32)p->src_offset;
       if (p->cls == ABI_CLASS_INT) {
-        aa64_emit32(mc, aa64_stur(sidx, src_reg, base_reg, off));
+        aa64_emit_stur_off(mc, sidx, src_reg, base_reg, off, AA_TMP0);
       } else {
         aa_emit_str_fp_any(mc, sidx, src_reg, base_reg, off);
       }
diff --git a/src/arch/rv64/alloc.c b/src/arch/rv64/alloc.c
@@ -368,8 +368,9 @@ void rv_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a_op,
   RImpl* a = impl_of(t);
   u32 rd = reg_num(dst);
 
-  if (op == CMP_EQ || op == CMP_NE || op == CMP_LT_F || op == CMP_LE_F ||
-      op == CMP_GT_F || op == CMP_GE_F) {
+  if ((a_op.cls == RC_FP || b_op.cls == RC_FP) &&
+      (op == CMP_EQ || op == CMP_NE || op == CMP_LT_F || op == CMP_LE_F ||
+       op == CMP_GT_F || op == CMP_GE_F)) {
     /* FP compare in fa,fb → rd. Use FLT/FLE/FEQ depending on op. */
     int is_d = type_is_fp_double(a_op.type);
     u32 fa = reg_num(a_op);
diff --git a/src/arch/rv64/emit.c b/src/arch/rv64/emit.c
@@ -24,8 +24,8 @@ static u32 rv_planned_prologue_words(const RImpl *a) {
   u32 n = RV_PROLOGUE_FRAME_WORDS;
   if (a->has_sret) ++n;
   if (a->is_variadic) n += 8u;
-  n += count_mask_regs(a->planned_cs_int_mask, 18u, 27u);
-  n += count_mask_regs(a->planned_cs_fp_mask, 18u, 27u);
+  n += 4u * count_mask_regs(a->planned_cs_int_mask, 18u, 27u);
+  n += 4u * count_mask_regs(a->planned_cs_fp_mask, 18u, 27u);
   return n ? n : 1u;
 }
 
@@ -250,6 +250,71 @@ static u32 rv_variadic_first_saved_int(const CGFuncDesc *fd) {
   return next_int;
 }
 
+static void rv_words_addr_adjust(CGTarget *t, u32 *words, u32 cap, u32 *wi,
+                                 u32 rd, u32 base, i32 off) {
+  if (off == 0) {
+    if (rd != base) {
+      if (*wi >= cap) goto overflow;
+      words[(*wi)++] = rv_addi(rd, base, 0);
+    }
+    return;
+  }
+  if (off >= -2048 && off <= 2047) {
+    if (*wi >= cap) goto overflow;
+    words[(*wi)++] = rv_addi(rd, base, off);
+    return;
+  }
+  i32 hi = (i32)(((i64)off + 0x800) >> 12);
+  i32 lo = off - (hi << 12);
+  if (*wi >= cap) goto overflow;
+  words[(*wi)++] = rv_lui(rd, (u32)hi & 0xfffffu);
+  if (lo) {
+    if (*wi >= cap) goto overflow;
+    words[(*wi)++] = rv_addiw(rd, rd, lo);
+  }
+  if (*wi >= cap) goto overflow;
+  words[(*wi)++] = rv_add(rd, base, rd);
+  return;
+
+overflow:
+  compiler_panic(t->c, impl_of(t)->loc,
+                 "rv64: prologue placeholder too small (cap %u)", cap);
+}
+
+static void rv_words_store_int_s0(CGTarget *t, u32 *words, u32 cap, u32 *wi,
+                                  u32 reg, i32 off) {
+  if (off >= -2048 && off <= 2047) {
+    if (*wi >= cap) goto overflow;
+    words[(*wi)++] = rv_sd(reg, RV_S0, off);
+    return;
+  }
+  rv_words_addr_adjust(t, words, cap, wi, RV_T0, RV_S0, off);
+  if (*wi >= cap) goto overflow;
+  words[(*wi)++] = rv_sd(reg, RV_T0, 0);
+  return;
+
+overflow:
+  compiler_panic(t->c, impl_of(t)->loc,
+                 "rv64: prologue placeholder too small (cap %u)", cap);
+}
+
+static void rv_words_store_fp_s0(CGTarget *t, u32 *words, u32 cap, u32 *wi,
+                                 u32 reg, i32 off) {
+  if (off >= -2048 && off <= 2047) {
+    if (*wi >= cap) goto overflow;
+    words[(*wi)++] = rv_fsd(reg, RV_S0, off);
+    return;
+  }
+  rv_words_addr_adjust(t, words, cap, wi, RV_T0, RV_S0, off);
+  if (*wi >= cap) goto overflow;
+  words[(*wi)++] = rv_fsd(reg, RV_T0, 0);
+  return;
+
+overflow:
+  compiler_panic(t->c, impl_of(t)->loc,
+                 "rv64: prologue placeholder too small (cap %u)", cap);
+}
+
 static u32 rv_build_prologue(CGTarget *t, u32 *words, u32 cap,
                              const RvFrameLayout *fl, const u32 *int_regs,
                              u32 n_int_saves, const u32 *fp_regs,
@@ -277,14 +342,26 @@ static u32 rv_build_prologue(CGTarget *t, u32 *words, u32 cap,
     words[wi++] = rv_add(RV_SP, RV_SP, RV_T0);
   }
 
-  if ((i32)fl->fp_pair_off > 2047 ||
-      (i32)(fl->fp_pair_off + 8) > 2047) {
-    compiler_panic(t->c, a->loc, "rv64: fp_pair_off out of imm12 range");
+  if ((i32)fl->fp_pair_off <= 2039) {
+    if (wi + 3 > cap) goto overflow;
+    words[wi++] = rv_sd(RV_S0, RV_SP, (i32)fl->fp_pair_off);
+    words[wi++] = rv_sd(RV_RA, RV_SP, (i32)fl->fp_pair_off + 8);
+    words[wi++] = rv_addi(RV_S0, RV_SP, (i32)fl->fp_pair_off);
+  } else {
+    i32 off = (i32)fl->fp_pair_off;
+    i32 hi = (i32)(((i64)off + 0x800) >> 12);
+    i32 lo = off - (hi << 12);
+    if (fl->fp_pair_off > 0x7fffffffu)
+      compiler_panic(t->c, a->loc, "rv64: fp_pair_off too large");
+    if (wi + 6 > cap) goto overflow;
+    words[wi++] = rv_lui(RV_T0, (u32)hi & 0xfffffu);
+    if (lo)
+      words[wi++] = rv_addiw(RV_T0, RV_T0, lo);
+    words[wi++] = rv_add(RV_T0, RV_SP, RV_T0);
+    words[wi++] = rv_sd(RV_S0, RV_T0, 0);
+    words[wi++] = rv_sd(RV_RA, RV_T0, 8);
+    words[wi++] = rv_addi(RV_S0, RV_T0, 0);
   }
-  if (wi + 3 > cap) goto overflow;
-  words[wi++] = rv_sd(RV_S0, RV_SP, (i32)fl->fp_pair_off);
-  words[wi++] = rv_sd(RV_RA, RV_SP, (i32)fl->fp_pair_off + 8);
-  words[wi++] = rv_addi(RV_S0, RV_SP, (i32)fl->fp_pair_off);
 
   /* If sret, spill incoming a0 into the hidden slot. */
   if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
@@ -304,14 +381,12 @@ static u32 rv_build_prologue(CGTarget *t, u32 *words, u32 cap,
   for (u32 i = 0; i < n_int_saves; ++i) {
     u32 r = int_regs[i];
     i32 off = fl->int_save_base - 8 * (i32)i;
-    if (wi >= cap) goto overflow;
-    words[wi++] = rv_sd(r, RV_S0, off);
+    rv_words_store_int_s0(t, words, cap, &wi, r, off);
   }
   for (u32 i = 0; i < n_fp_saves; ++i) {
     u32 r = fp_regs[i];
     i32 off = fl->fp_save_base - 8 * (i32)i;
-    if (wi >= cap) goto overflow;
-    words[wi++] = rv_fsd(r, RV_S0, off);
+    rv_words_store_fp_s0(t, words, cap, &wi, r, off);
   }
   return wi;
 
@@ -424,23 +499,30 @@ void rv_func_end(CGTarget *t) {
   for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) {
     u32 r = int_regs[i];
     i32 off = fl.int_save_base - 8 * (i32)i;
-    rv64_emit32(mc, rv_ld(r, RV_S0, off));
+    if (off >= -2048 && off <= 2047) {
+      rv64_emit32(mc, rv_ld(r, RV_S0, off));
+    } else {
+      rv64_emit_addr_adjust(mc, RV_T0, RV_S0, off);
+      rv64_emit32(mc, rv_ld(r, RV_T0, 0));
+    }
   }
   for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) {
     u32 r = fp_regs[i];
     i32 off = fl.fp_save_base - 8 * (i32)i;
-    rv64_emit32(mc, rv_fld(r, RV_S0, off));
+    if (off >= -2048 && off <= 2047) {
+      rv64_emit32(mc, rv_fld(r, RV_S0, off));
+    } else {
+      rv64_emit_addr_adjust(mc, RV_T0, RV_S0, off);
+      rv64_emit32(mc, rv_fld(r, RV_T0, 0));
+    }
   }
   /* Restore sp from s0 first so alloca-induced offsets don't matter.
    * After this, sp == its post-prologue value. */
   if (a->has_alloca) {
-    if ((i32)fl.fp_pair_off > 2047) {
-      compiler_panic(t->c, a->loc, "rv64: fp_pair_off too large for alloca");
-    }
-    rv64_emit32(mc, rv_addi(RV_SP, RV_S0, -(i32)fl.fp_pair_off));
+    rv64_emit_addr_adjust(mc, RV_SP, RV_S0, -(i32)fl.fp_pair_off);
   }
-  rv64_emit32(mc, rv_ld(RV_S0, RV_SP, (i32)fl.fp_pair_off));
-  rv64_emit32(mc, rv_ld(RV_RA, RV_SP, (i32)fl.fp_pair_off + 8));
+  rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
+  rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));
   emit_sp_addi(mc, (i64)fl.frame_size);
   rv64_emit32(mc, rv_ret_());
 
diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h
@@ -11,8 +11,8 @@
 #include "core/pool.h"
 #include "obj/obj.h"
 
-#define RV_PROLOGUE_WORDS 35u
-#define RV_PROLOGUE_FRAME_WORDS 6u /* worst-case sp adjust + s0/ra + set s0 */
+#define RV_PROLOGUE_WORDS 128u
+#define RV_PROLOGUE_FRAME_WORDS 10u /* sp adjust + far/near s0/ra save + set s0 */
 
 /* ---- RvSlot / RvScope ---- */
 typedef struct RvSlot {
diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c
@@ -804,6 +804,41 @@ static void rv_store_stack_reg(CGTarget* t, u32 reg, RegClass cls,
   rv_store(t, addr, src, ma);
 }
 
+static Operand rv_offset_mem_operand(CGTarget* t, Operand op, u32 offset) {
+  if (!offset) return op;
+  if (op.kind == OPK_INDIRECT) {
+    op.v.ind.ofs += (i32)offset;
+  } else if (op.kind == OPK_LOCAL) {
+    RImpl* a = impl_of(t);
+    RvSlot* s = rv64_slot_get(a, op.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "rv64 offset operand: bad slot");
+    op.kind = OPK_INDIRECT;
+    op.v.ind.base = RV_S0;
+    op.v.ind.ofs = -(i32)s->off + (i32)offset;
+  }
+  return op;
+}
+
+static void rv_load_abi_part(CGTarget* t, Operand dst, Operand src, u32 offset,
+                             u32 size) {
+  MemAccess ma;
+  memset(&ma, 0, sizeof ma);
+  ma.type = dst.type;
+  ma.size = size;
+  ma.align = size ? size : 1u;
+  rv_load(t, dst, rv_offset_mem_operand(t, src, offset), ma);
+}
+
+static void rv_store_abi_part(CGTarget* t, Operand dst, Operand src,
+                              u32 offset, u32 size) {
+  MemAccess ma;
+  memset(&ma, 0, sizeof ma);
+  ma.type = src.type;
+  ma.size = size;
+  ma.align = size ? size : 1u;
+  rv_store(t, rv_offset_mem_operand(t, dst, offset), src, ma);
+}
+
 static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int,
                            u32* next_fp, u32* stack_off, int tail) {
   RImpl* a = impl_of(t);
@@ -890,18 +925,15 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int,
           break;
         }
         case OPK_LOCAL: {
-          RvSlot* s = rv64_slot_get(a, av->storage.v.frame_slot);
-          if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad arg slot");
-          i32 off = -(i32)s->off + (i32)pt->src_offset;
-          rv64_emit32(mc, enc_int_load(sz, 0, dst_reg, RV_S0, off));
+          Operand dst = {.kind = OPK_REG, .cls = RC_INT, .type = av->type};
+          dst.v.reg = dst_reg;
+          rv_load_abi_part(t, dst, av->storage, pt->src_offset, sz);
           break;
         }
         case OPK_INDIRECT: {
-          /* cg holds INDIRECT base regs in s2..s11, disjoint from arg
-           * regs a0..a7 and the t0 stack-arg scratch. */
-          u32 base = av->storage.v.ind.base & 0x1fu;
-          i32 off = av->storage.v.ind.ofs + (i32)pt->src_offset;
-          rv64_emit32(mc, enc_int_load(sz, 0, dst_reg, base, off));
+          Operand dst = {.kind = OPK_REG, .cls = RC_INT, .type = av->type};
+          dst.v.reg = dst_reg;
+          rv_load_abi_part(t, dst, av->storage, pt->src_offset, sz);
           break;
         }
         default:
@@ -925,18 +957,15 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int,
             break;
           }
           case OPK_LOCAL: {
-            RvSlot* s = rv64_slot_get(a, av->storage.v.frame_slot);
-            if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad FP arg slot");
-            i32 off = -(i32)s->off + (i32)pt->src_offset;
-            rv64_emit32(mc, (sz == 8) ? rv_fld(freg, RV_S0, off)
-                                 : rv_flw(freg, RV_S0, off));
+            Operand dst = {.kind = OPK_REG, .cls = RC_FP, .type = av->type};
+            dst.v.reg = freg;
+            rv_load_abi_part(t, dst, av->storage, pt->src_offset, sz);
             break;
           }
           case OPK_INDIRECT: {
-            u32 base = av->storage.v.ind.base & 0x1fu;
-            i32 off = av->storage.v.ind.ofs + (i32)pt->src_offset;
-            rv64_emit32(mc, (sz == 8) ? rv_fld(freg, base, off)
-                                 : rv_flw(freg, base, off));
+            Operand dst = {.kind = OPK_REG, .cls = RC_FP, .type = av->type};
+            dst.v.reg = freg;
+            rv_load_abi_part(t, dst, av->storage, pt->src_offset, sz);
             break;
           }
           default:
@@ -950,15 +979,14 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int,
                                *stack_off, tail);
             break;
           case OPK_LOCAL: {
-            RvSlot* s = rv64_slot_get(a, av->storage.v.frame_slot);
-            if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad FP arg slot");
-            i32 off = -(i32)s->off + (i32)pt->src_offset;
+            Operand tmp = {.kind = OPK_REG, .cls = RC_FP, .type = av->type};
+            tmp.v.reg = 0u;
             if (sz == 8) {
-              rv64_emit32(mc, rv_fld(/*ft0=*/0u, RV_S0, off));
+              rv_load_abi_part(t, tmp, av->storage, pt->src_offset, sz);
               rv_store_stack_reg(t, /*ft0=*/0u, RC_FP, av->type, sz,
                                  *stack_off, tail);
             } else {
-              rv64_emit32(mc, rv_flw(/*ft0=*/0u, RV_S0, off));
+              rv_load_abi_part(t, tmp, av->storage, pt->src_offset, sz);
               rv_store_stack_reg(t, /*ft0=*/0u, RC_FP, av->type, sz,
                                  *stack_off, tail);
             }
@@ -967,14 +995,14 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int,
           case OPK_INDIRECT: {
             /* Route through ft0 — it is in {ft0..ft7}, caller-saved
              * scratch outside the cg fs2..fs11 pool. */
-            u32 base = av->storage.v.ind.base & 0x1fu;
-            i32 off = av->storage.v.ind.ofs + (i32)pt->src_offset;
+            Operand tmp = {.kind = OPK_REG, .cls = RC_FP, .type = av->type};
+            tmp.v.reg = 0u;
             if (sz == 8) {
-              rv64_emit32(mc, rv_fld(/*ft0=*/0u, base, off));
+              rv_load_abi_part(t, tmp, av->storage, pt->src_offset, sz);
               rv_store_stack_reg(t, /*ft0=*/0u, RC_FP, av->type, sz,
                                  *stack_off, tail);
             } else {
-              rv64_emit32(mc, rv_flw(/*ft0=*/0u, base, off));
+              rv_load_abi_part(t, tmp, av->storage, pt->src_offset, sz);
               rv_store_stack_reg(t, /*ft0=*/0u, RC_FP, av->type, sz,
                                  *stack_off, tail);
             }
@@ -1089,19 +1117,28 @@ static void rv_tail_restore_frame(CGTarget* t) {
 
   if (a->omit_frame) return;
   for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) {
-    rv64_emit32(mc, rv_ld(int_regs[i], RV_S0, fl.int_save_base - 8 * i));
+    i32 off = fl.int_save_base - 8 * i;
+    if (off >= -2048 && off <= 2047) {
+      rv64_emit32(mc, rv_ld(int_regs[i], RV_S0, off));
+    } else {
+      rv64_emit_addr_adjust(mc, RV_T0, RV_S0, off);
+      rv64_emit32(mc, rv_ld(int_regs[i], RV_T0, 0));
+    }
   }
   for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) {
-    rv64_emit32(mc, rv_fld(fp_regs[i], RV_S0, fl.fp_save_base - 8 * i));
+    i32 off = fl.fp_save_base - 8 * i;
+    if (off >= -2048 && off <= 2047) {
+      rv64_emit32(mc, rv_fld(fp_regs[i], RV_S0, off));
+    } else {
+      rv64_emit_addr_adjust(mc, RV_T0, RV_S0, off);
+      rv64_emit32(mc, rv_fld(fp_regs[i], RV_T0, 0));
+    }
   }
   if (a->has_alloca) {
-    if ((i32)fl.fp_pair_off > 2047) {
-      compiler_panic(t->c, a->loc, "rv64 tail call: fp pair offset too large");
-    }
-    rv64_emit32(mc, rv_addi(RV_SP, RV_S0, -(i32)fl.fp_pair_off));
+    rv64_emit_addr_adjust(mc, RV_SP, RV_S0, -(i32)fl.fp_pair_off);
   }
-  rv64_emit32(mc, rv_ld(RV_S0, RV_SP, (i32)fl.fp_pair_off));
-  rv64_emit32(mc, rv_ld(RV_RA, RV_SP, (i32)fl.fp_pair_off + 8));
+  rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
+  rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));
   emit_sp_addi(mc, (i64)fl.frame_size);
 }
 
@@ -1208,23 +1245,15 @@ static void rv_call(CGTarget* t, const CGCallDesc* d) {
         rv64_emit32(mc, rv_fsgnj(fmt, reg_num(rs), src_reg, src_reg));
       }
     } else if (rs.kind == OPK_LOCAL || rs.kind == OPK_INDIRECT) {
-      u32 base_reg;
-      i32 base_off;
-      if (rs.kind == OPK_LOCAL) {
-        RvSlot* s = rv64_slot_get(a, rs.v.frame_slot);
-        if (!s) compiler_panic(t->c, a->loc, "rv64 call: bad ret slot");
-        base_reg = RV_S0;
-        base_off = -(i32)s->off;
-      } else {
-        base_reg = rs.v.ind.base & 0x1fu;
-        base_off = rs.v.ind.ofs;
-      }
-      i32 off = base_off + (i32)p->src_offset;
-      if (p->cls == ABI_CLASS_INT) {
-        rv64_emit32(mc, enc_int_store(p->size, src_reg, base_reg, off));
+      Operand src = {.kind = OPK_REG,
+                     .cls = (u8)((p->cls == ABI_CLASS_FP) ? RC_FP : RC_INT),
+                     .type = d->ret.type};
+      src.v.reg = src_reg;
+      if (p->cls == ABI_CLASS_INT || p->cls == ABI_CLASS_FP) {
+        rv_store_abi_part(t, rs, src, p->src_offset, p->size);
       } else {
-        if (p->size == 8) rv64_emit32(mc, rv_fsd(src_reg, base_reg, off));
-        else              rv64_emit32(mc, rv_fsw(src_reg, base_reg, off));
+        compiler_panic(t->c, a->loc, "rv64 call: ret part cls %d unimpl",
+                       (int)p->cls);
       }
     } else if (rs.kind == OPK_IMM && rs.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID)) {
       /* void return placeholder — nothing to do. */
@@ -1407,28 +1436,19 @@ static void rv_ret(CGTarget* t, const CGABIValue* val) {
       rv64_emit_load_imm(mc, sf, RV_A0, val->storage.v.imm);
     } else if (val->storage.kind == OPK_LOCAL ||
                val->storage.kind == OPK_INDIRECT) {
-      u32 base_reg;
-      i32 base_off;
-      if (val->storage.kind == OPK_LOCAL) {
-        RvSlot* s = rv64_slot_get(a, val->storage.v.frame_slot);
-        if (!s) compiler_panic(t->c, a->loc, "rv64 ret: bad local slot");
-        base_reg = RV_S0;
-        base_off = -(i32)s->off;
-      } else {
-        base_reg = val->storage.v.ind.base & 0x1fu;
-        base_off = val->storage.v.ind.ofs;
-      }
       const ABIArgInfo* ri2 = val->abi;
       u32 nir = 0, nfr = 0;
       for (u16 i = 0; i < (ri2 ? ri2->nparts : 0); ++i) {
         const ABIArgPart* pt = &ri2->parts[i];
-        i32 off = base_off + (i32)pt->src_offset;
         if (pt->cls == ABI_CLASS_INT) {
-          rv64_emit32(mc, enc_int_load(pt->size, 0, RV_A0 + nir++, base_reg, off));
+          Operand dst = {.kind = OPK_REG, .cls = RC_INT, .type = val->type};
+          dst.v.reg = RV_A0 + nir++;
+          rv_load_abi_part(t, dst, val->storage, pt->src_offset, pt->size);
         } else if (pt->cls == ABI_CLASS_FP) {
+          Operand dst = {.kind = OPK_REG, .cls = RC_FP, .type = val->type};
           u32 freg = 10u + nfr++;
-          if (pt->size == 8) rv64_emit32(mc, rv_fld(freg, base_reg, off));
-          else               rv64_emit32(mc, rv_flw(freg, base_reg, off));
+          dst.v.reg = freg;
+          rv_load_abi_part(t, dst, val->storage, pt->src_offset, pt->size);
         } else {
           compiler_panic(t->c, a->loc, "rv64 ret: part cls %d unimpl",
                          (int)pt->cls);
diff --git a/test/parse/cases/6_7_2_12_long_double.skip b/test/parse/cases/6_7_2_12_long_double.skip
@@ -1 +0,0 @@
-long double (binary128) literal/convert needs rt/lib/fp_tf wiring through cg
diff --git a/test/parse/cases/i128_06_shifts_bitwise.c b/test/parse/cases/i128_06_shifts_bitwise.c
@@ -4,8 +4,12 @@ int test_main(void) {
   u128 x = (u128)0xf0ULL << 68;
   u128 y = x >> 64;
   u128 z = (x | ((u128)0x55ULL << 4)) ^ ((u128)0x5ULL << 4);
+  u128 high_truth = ((u128)1 << 112) << 4;
+  _Bool high_bool = high_truth;
   if ((unsigned long long)y != 0xf00ULL) return 11;
   if ((unsigned long long)z != 0x500ULL) return 12;
   if ((unsigned long long)(z >> 64) != 0xf00ULL) return 13;
+  if (!high_truth) return 14;
+  if (!high_bool) return 15;
   return 41;
 }
diff --git a/test/parse/cases/i128_13_signed_div_mod.c b/test/parse/cases/i128_13_signed_div_mod.c
@@ -0,0 +1,31 @@
+typedef __int128 i128;
+
+int test_main(void) {
+  i128 a = -(((i128)1 << 90) + 123456789);
+  i128 b = ((i128)1 << 30) + 7;
+  i128 q = a / b;
+  i128 r = a % b;
+
+  if (q != -(((i128)1 << 60) - (((i128)7 << 30) - 49))) return 11;
+  if (r != -123456446) return 12;
+  if (q * b + r != a) return 13;
+  if (r >= 0) return 14;
+
+  b = -(((i128)1 << 33) + 5);
+  q = a / b;
+  r = a % b;
+  if (q != (((i128)1 << 57) - ((i128)80 << 20))) return 15;
+  if (r != -542887189) return 16;
+  if (q * b + r != a) return 17;
+  if (r >= 0) return 18;
+
+  a = ((i128)1 << 90) + 123456789;
+  q = a / b;
+  r = a % b;
+  if (q != -(((i128)1 << 57) - ((i128)80 << 20))) return 19;
+  if (r != 542887189) return 20;
+  if (q * b + r != a) return 21;
+  if (r <= 0) return 22;
+
+  return 61;
+}
diff --git a/test/parse/cases/i128_13_signed_div_mod.expected b/test/parse/cases/i128_13_signed_div_mod.expected
@@ -0,0 +1 @@
+61
diff --git a/test/parse/cases/i128_14_arbitrary_mul.c b/test/parse/cases/i128_14_arbitrary_mul.c
@@ -0,0 +1,22 @@
+typedef __int128 i128;
+typedef unsigned __int128 u128;
+
+int test_main(void) {
+  u128 a = ((u128)0x123456789abcdef0ULL << 16) | 0x1357ULL;
+  u128 b = ((u128)0x0fedcba987654321ULL << 12) | 0x246ULL;
+  u128 p = a * b;
+
+  if ((unsigned long long)p != 0x71407aa829ff67caULL) return 11;
+  if ((unsigned long long)(p >> 64) != 0x0ad77d7422601184ULL) return 12;
+
+  i128 x = -(((i128)0x1234567 << 40) + 0x89abcdef);
+  i128 y = ((i128)0x13579 << 28) + 0x2468ace;
+  i128 z = x * y;
+  u128 uz = (u128)z;
+
+  if (z >= 0) return 13;
+  if ((unsigned long long)uz != 0x324b79b4fd6373aeULL) return 14;
+  if ((unsigned long long)(uz >> 64) != 0xffffe9fe36571cf3ULL) return 15;
+
+  return 73;
+}
diff --git a/test/parse/cases/i128_14_arbitrary_mul.expected b/test/parse/cases/i128_14_arbitrary_mul.expected
@@ -0,0 +1 @@
+73
diff --git a/test/parse/cases/ldbl128_15_arbitrary_mul.c b/test/parse/cases/ldbl128_15_arbitrary_mul.c
@@ -0,0 +1,15 @@
+int test_main(void) {
+  if (__LDBL_MANT_DIG__ != 113) return 0;
+
+  long double a = 7.0L * 9.0L;
+  long double b = 13.0L * 11.0L;
+  long double c = 1.5L * 2.5L;
+
+  if ((int)a != 63) return 11;
+  if ((int)b != 143) return 12;
+  if ((int)(a + b) != 206) return 13;
+  if ((int)c != 3) return 14;
+  if ((int)((c - 3.0L) * 4.0L) != 3) return 15;
+
+  return 71;
+}
diff --git a/test/parse/cases/ldbl128_15_arbitrary_mul.expected b/test/parse/cases/ldbl128_15_arbitrary_mul.expected
@@ -0,0 +1 @@
+71

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	doc/C11_LONG_DOUBLE_CHECKLIST.md	\|	64	+++++++++++++++++++++++++++++++++++++++-------------------------
M	lang/c/parse/cg_adapter.c	\|	3	++-
M	lang/c/parse/parse_expr.c	\|	170	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M	lang/c/parse/parse_init.c	\|	128	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M	lang/c/parse/parse_priv.h	\|	4	+++-
M	lang/c/parse/parse_type.c	\|	34	++++++++++++++++++++++++++++++++--
M	rt/lib/README.md	\|	2	+-
M	rt/lib/fp_tf/fp_tf.c	\|	357	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M	rt/lib/int64/int64.c	\|	408	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M	src/abi/abi_aapcs64.c	\|	17	+++++++++++++++++
M	src/abi/abi_rv64.c	\|	3	++-
M	src/api/cg.c	\|	325	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M	src/arch/aa64/ops.c	\|	12	+++++++++++-
M	src/arch/rv64/alloc.c	\|	5	+++--
M	src/arch/rv64/emit.c	\|	124	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M	src/arch/rv64/internal.h	\|	4	++--
M	src/arch/rv64/ops.c	\|	152	+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
D	test/parse/cases/6_7_2_12_long_double.skip	\|	1	-
M	test/parse/cases/i128_06_shifts_bitwise.c	\|	4	++++
A	test/parse/cases/i128_13_signed_div_mod.c	\|	31	+++++++++++++++++++++++++++++++
A	test/parse/cases/i128_13_signed_div_mod.expected	\|	1	+
A	test/parse/cases/i128_14_arbitrary_mul.c	\|	22	++++++++++++++++++++++
A	test/parse/cases/i128_14_arbitrary_mul.expected	\|	1	+
A	test/parse/cases/ldbl128_15_arbitrary_mul.c	\|	15	+++++++++++++++
A	test/parse/cases/ldbl128_15_arbitrary_mul.expected	\|	1	+