wasm: ir_emit lowerings for bitfields, aggregates, TLS, and conds - kit

commit 55817243963568fc08d1668e2d8752ffdae1eb5c
parent 164bf2ad58cd1d8878ae1c792f9a9b139de524de
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 12:02:19 -0700

wasm: ir_emit lowerings for bitfields, aggregates, TLS, and conds

Four independent backend lowerings that all live in src/arch/wasm/ir_emit.c
(plus a small obj_secnames hook):

- CG_IR_BITFIELD_{LOAD,STORE} lower to load + shift/mask + store over
  the storage unit, all in i64 arithmetic regardless of storage width
  (i64.load{8,16,32}_u zero-extends in, i64.store{8,16,32} truncates
  out, uniform 64-bit shift count keeps extraction width-agnostic).
  Unblocks the 5 bitfield test/parse cases.

- CG_IR_LOAD/CG_IR_COPY of an aggregate type route through a shared
  wasm_ir_emit_agg_move helper that materializes the two endpoint
  addresses with addr_of and emits memory.copy between them, instead
  of falling through to a scalar load. Unblocks the
  call_indirect_*_struct_* and call_large_const_global_struct_byval
  cases.

- cmp_branch, switch selector, and if-condition operands now go through
  wasm_ir_source_op, so an address-taken local (e.g. the `expected`
  out-param of __atomic_compare_exchange) is loaded from memory rather
  than read as an undefined wasm local. Unblocks
  rv64_atomic_widths_orders.

- Wasm has no thread-local storage model (one linear memory per
  instance), so a thread-local is just an ordinary data object:
  obj_secname_tdata/tbss name them .tdata/.tbss for CFREE_OBJ_WASM,
  and CG_IR_TLS_ADDR_OF lowers to a plain symbol address. Unblocks
  6_7_1_03_thread_local_basic and gnu_thread_storage_01.

Diffstat:
M src/arch/wasm/ir_emit.c  | 203 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M src/obj/obj_secnames.c  | 10 ++++++++++

2 files changed, 179 insertions(+), 34 deletions(-)
diff --git a/src/arch/wasm/ir_emit.c b/src/arch/wasm/ir_emit.c
@@ -254,6 +254,36 @@ static Operand wasm_ir_addr_op(WasmIrEmitter* e, CgSemOperand in, SrcLoc loc) {
   return out;
 }
 
+/* Aggregate move: copy one value of type `ty` from the home of `src` to the
+ * home of `dst` by emitting a byte copy (memory.copy) between the two.
+ *
+ * `dst` and `src` are both lvalue operands (frame slots, indirect `[ptr]`
+ * addressing, or global symbols). wasm_copy_bytes is handed a register
+ * operand for each endpoint, so each lvalue is first turned into an address
+ * operand and then materialized into a fresh pointer-typed integer temp with
+ * wasm_addr_of. The copy's size and alignment come from the ABI for `ty`.
+ *
+ * The destination is processed before the source at every step. */
+static void wasm_ir_emit_agg_move(WasmIrEmitter* e, CgSemOperand dst,
+                                  CgSemOperand src, CfreeCgTypeId ty,
+                                  SrcLoc loc) {
+  CGTarget* tgt = (CGTarget*)&e->target->base;
+  CfreeCgTypeId ptr_ty = cg_type_ptr_to(e->target->c, ty);
+  Operand dst_home = wasm_ir_addr_op(e, dst, loc);
+  Operand src_home = wasm_ir_addr_op(e, src, loc);
+  Operand dst_ptr, src_ptr;
+  AggregateAccess access;
+
+  /* Pointer-valued temp that will hold the destination address. */
+  memset(&dst_ptr, 0, sizeof dst_ptr);
+  dst_ptr.kind = OPK_REG;
+  dst_ptr.type = ptr_ty;
+  dst_ptr.cls = (u8)RC_INT;
+
+  /* Pointer-valued temp that will hold the source address. */
+  memset(&src_ptr, 0, sizeof src_ptr);
+  src_ptr.kind = OPK_REG;
+  src_ptr.type = ptr_ty;
+  src_ptr.cls = (u8)RC_INT;
+
+  /* Allocate the temps destination-first, then take both addresses. */
+  dst_ptr.v.reg = wasm_ir_temp_reg(e);
+  src_ptr.v.reg = wasm_ir_temp_reg(e);
+  wasm_addr_of(tgt, dst_ptr, dst_home);
+  wasm_addr_of(tgt, src_ptr, src_home);
+
+  memset(&access, 0, sizeof access);
+  access.type = ty;
+  access.size = (u32)abi_cg_sizeof(e->target->c->abi, ty);
+  access.align = (u32)abi_cg_alignof(e->target->c->abi, ty);
+  wasm_copy_bytes(tgt, dst_ptr, src_ptr, access);
+}
+
 static CGScope wasm_ir_scope_lookup(WasmIrEmitter* e, CGScope recorded,
                                     SrcLoc loc) {
   if ((u32)recorded >= e->scope_map_n || !e->scope_map[recorded])
@@ -359,7 +389,7 @@ static void wasm_ir_emit_switch(WasmIrEmitter* e, const CgIrInst* in) {
   const CgIrSwitchAux* aux = (const CgIrSwitchAux*)in->extra.aux;
   CGSwitchDesc d;
   memset(&d, 0, sizeof d);
-  d.selector = wasm_ir_value_op(e, in->opnds[0]);
+  d.selector = wasm_ir_source_op(e, in->opnds[0], in->loc);
   d.selector_type = aux->selector_type;
   d.default_label = aux->default_label;
   d.cases = aux->cases;
@@ -369,6 +399,98 @@ static void wasm_ir_emit_switch(WasmIrEmitter* e, const CgIrInst* in) {
   wasm_switch((CGTarget*)&e->target->base, &d);
 }
 
+/* Bitfields have no native wasm insert/extract, so lower to load + shift/mask
+ * + store over the storage unit. All arithmetic runs in i64 regardless of
+ * storage width: the load zero-extends into i64 (i64.load{8,16,32}_u), the
+ * store truncates back (i64.store{8,16,32}), and a uniform 64-bit shift count
+ * keeps the field-extraction math width-agnostic. storage_offset is always 0
+ * here — the frontend folds it into record_addr.
+ *
+ * WASM_BF_REG_BITS is the bit width of that i64 working value. The load path
+ * derives both of its shift counts from it, and the store path uses it to
+ * decide when the field mask is all ones. */
+#define WASM_BF_REG_BITS 64u
+
+/* Build a register operand of builtin type I64, integer register class,
+ * backed by a temp register freshly obtained from wasm_ir_temp_reg. */
+static Operand wasm_ir_temp_i64(WasmIrEmitter* e) {
+  Operand tmp;
+
+  memset(&tmp, 0, sizeof tmp);
+  tmp.cls = (u8)RC_INT;
+  tmp.type = builtin_id(CFREE_CG_BUILTIN_I64);
+  tmp.kind = OPK_REG;
+  tmp.v.reg = wasm_ir_temp_reg(e);
+  return tmp;
+}
+
+/* Build an immediate operand of builtin type I64, integer register class,
+ * carrying the constant `v`. */
+static Operand wasm_ir_imm_i64(i64 v) {
+  Operand konst;
+
+  memset(&konst, 0, sizeof konst);
+  konst.cls = (u8)RC_INT;
+  konst.type = builtin_id(CFREE_CG_BUILTIN_I64);
+  konst.kind = OPK_IMM;
+  konst.v.imm = v;
+  return konst;
+}
+
+/* Describe the bitfield's storage-unit access for the i64 load/store pair.
+ * Starts from a copy of bf->storage, retypes it to builtin I64, and keeps the
+ * recorded byte size; a recorded size of 0 is replaced by 4. */
+static MemAccess wasm_ir_bf_storage_mem(const BitFieldAccess* bf) {
+  MemAccess unit = bf->storage;
+
+  unit.type = builtin_id(CFREE_CG_BUILTIN_I64);
+  if (bf->storage.size)
+    unit.size = bf->storage.size;
+  else
+    unit.size = 4u;
+  return unit;
+}
+
+/* Emit a bitfield read. in->opnds[1] addresses the storage unit and
+ * in->opnds[0] receives the extracted field.
+ *
+ * The storage unit is loaded into an i64 temp and shifted left so the bits
+ * above the field fall off the top, then shifted right so the field's low bit
+ * lands at bit 0. The right shift is arithmetic for a signed field and
+ * logical otherwise, so it also performs the sign/zero extension. Either
+ * shift is omitted when its count is 0. A bit_width of 0 is treated as 1. */
+static void wasm_ir_emit_bitfield_load(WasmIrEmitter* e, const CgIrInst* in) {
+  const CgIrBitFieldAux* bf_aux = (const CgIrBitFieldAux*)in->extra.aux;
+  const BitFieldAccess* field = &bf_aux->access;
+  CGTarget* tgt = (CGTarget*)&e->target->base;
+  u32 nbits = field->bit_width ? field->bit_width : 1u;
+  u32 low_bit = field->bit_offset;
+  u32 down_shift = WASM_BF_REG_BITS - nbits; /* slide field back to bit 0 */
+  u32 up_shift = down_shift - low_bit;       /* bits above the field */
+  Operand unit_addr = wasm_ir_addr_op(e, in->opnds[1], in->loc);
+  Operand work = wasm_ir_temp_i64(e);
+  Operand result;
+  WasmIrDest dest;
+
+  wasm_load(tgt, work, unit_addr, wasm_ir_bf_storage_mem(field));
+  if (up_shift != 0)
+    wasm_binop(tgt, BO_SHL, work, work, wasm_ir_imm_i64((i64)up_shift));
+  if (down_shift != 0) {
+    if (field->signed_)
+      wasm_binop(tgt, BO_SHR_S, work, work, wasm_ir_imm_i64((i64)down_shift));
+    else
+      wasm_binop(tgt, BO_SHR_U, work, work, wasm_ir_imm_i64((i64)down_shift));
+  }
+
+  /* Resolve the destination only after the field value is ready, then narrow
+   * the i64 to the destination's type (a no-op copy when dst is i64). */
+  result = wasm_ir_dest_op(e, in->opnds[0], &dest);
+  wasm_convert(tgt, CV_TRUNC, result, work);
+  wasm_ir_dest_finish(e, &dest);
+}
+
+/* Emit a bitfield write as a read-modify-write of the storage unit.
+ * in->opnds[0] addresses the storage unit; in->opnds[1] is the value to store.
+ *
+ * The unit is loaded into an i64 temp, the field's bits are cleared with an
+ * AND against the inverted field mask, the new value (masked to the field
+ * width and shifted to the field's bit offset) is ORed in, and the unit is
+ * stored back. An immediate value is masked and shifted at compile time; any
+ * other value is zero-extended into a second i64 temp and masked/shifted with
+ * emitted instructions. A bit_width of 0 is treated as 1. */
+static void wasm_ir_emit_bitfield_store(WasmIrEmitter* e, const CgIrInst* in) {
+  const CgIrBitFieldAux* bf_aux = (const CgIrBitFieldAux*)in->extra.aux;
+  const BitFieldAccess* field = &bf_aux->access;
+  CGTarget* tgt = (CGTarget*)&e->target->base;
+  u32 nbits = field->bit_width ? field->bit_width : 1u;
+  u32 low_bit = field->bit_offset;
+  /* `nbits` low one-bits; special-cased so the shift count never reaches 64. */
+  u64 field_ones =
+      (nbits >= WASM_BF_REG_BITS) ? ~(u64)0 : (((u64)1 << nbits) - 1u);
+  u64 field_mask = field_ones << low_bit;
+  MemAccess unit_mem = wasm_ir_bf_storage_mem(field);
+  Operand unit_addr = wasm_ir_addr_op(e, in->opnds[0], in->loc);
+  Operand merged = wasm_ir_temp_i64(e);
+
+  /* Fetch the current unit and punch a hole where the field lives. */
+  wasm_load(tgt, merged, unit_addr, unit_mem);
+  wasm_binop(tgt, BO_AND, merged, merged, wasm_ir_imm_i64((i64)~field_mask));
+
+  /* NOTE(review): this compares the IR operand's kind against OPK_IMM, the
+   * constant also used for lowered Operands here — confirm both operand
+   * types share the same kind enum. */
+  if (in->opnds[1].kind != OPK_IMM) {
+    Operand value = wasm_ir_source_op(e, in->opnds[1], in->loc);
+    Operand widened = wasm_ir_temp_i64(e);
+    wasm_convert(tgt, CV_ZEXT, widened, value); /* widen field value to i64 */
+    wasm_binop(tgt, BO_AND, widened, widened,
+               wasm_ir_imm_i64((i64)field_ones));
+    if (low_bit != 0)
+      wasm_binop(tgt, BO_SHL, widened, widened,
+                 wasm_ir_imm_i64((i64)low_bit));
+    wasm_binop(tgt, BO_OR, merged, merged, widened);
+  } else {
+    /* Constant value: fold the mask and shift here, emit a single OR. */
+    u64 folded = ((u64)in->opnds[1].v.imm & field_ones) << low_bit;
+    wasm_binop(tgt, BO_OR, merged, merged, wasm_ir_imm_i64((i64)folded));
+  }
+
+  wasm_store(tgt, unit_addr, merged, unit_mem);
+}
+
 static void wasm_ir_emit_inst(WasmIrEmitter* e, const CgIrFunc* f,
                               const CgIrInst* in) {
   CGTarget* t = (CGTarget*)&e->target->base;
@@ -397,28 +519,8 @@ static void wasm_ir_emit_inst(WasmIrEmitter* e, const CgIrFunc* f,
       WasmIrDest d;
       Operand src, dst;
       if (wasm_ir_is_aggregate(e->target, in->opnds[0].type)) {
-        /* Aggregate value copy: lower to memory.copy between the two homes.
-         * copy_bytes wants both endpoints as pointer-valued registers, so
-         * materialize each slot address with addr_of first. */
-        AggregateAccess agg;
-        CfreeCgTypeId pty = cg_type_ptr_to(e->target->c, in->opnds[0].type);
-        Operand adst = wasm_ir_addr_op(e, in->opnds[0], in->loc);
-        Operand asrc = wasm_ir_addr_op(e, in->opnds[1], in->loc);
-        Operand dreg, sreg;
-        memset(&dreg, 0, sizeof dreg);
-        memset(&sreg, 0, sizeof sreg);
-        dreg.kind = sreg.kind = OPK_REG;
-        dreg.type = sreg.type = pty;
-        dreg.cls = sreg.cls = (u8)RC_INT;
-        dreg.v.reg = wasm_ir_temp_reg(e);
-        sreg.v.reg = wasm_ir_temp_reg(e);
-        wasm_addr_of(t, dreg, adst);
-        wasm_addr_of(t, sreg, asrc);
-        memset(&agg, 0, sizeof agg);
-        agg.type = in->opnds[0].type;
-        agg.size = (u32)abi_cg_sizeof(e->target->c->abi, in->opnds[0].type);
-        agg.align = (u32)abi_cg_alignof(e->target->c->abi, in->opnds[0].type);
-        wasm_copy_bytes(t, dreg, sreg, agg);
+        wasm_ir_emit_agg_move(e, in->opnds[0], in->opnds[1], in->opnds[0].type,
+                              in->loc);
         return;
       }
       src = wasm_ir_source_op(e, in->opnds[1], in->loc);
@@ -429,8 +531,18 @@ static void wasm_ir_emit_inst(WasmIrEmitter* e, const CgIrFunc* f,
     }
     case CG_IR_LOAD: {
       WasmIrDest d;
-      Operand addr = wasm_ir_addr_op(e, in->opnds[1], in->loc);
-      Operand dst = wasm_ir_dest_op(e, in->opnds[0], &d);
+      Operand addr, dst;
+      if (wasm_ir_is_aggregate(e->target, in->opnds[0].type)) {
+        /* Aggregate load: the source operand is the address of the aggregate
+         * (an indirect `[ptr]` or a global symbol), so its effective address
+         * is the source home. Lower to memory.copy into the destination's
+         * home rather than a scalar wasm load. */
+        wasm_ir_emit_agg_move(e, in->opnds[0], in->opnds[1], in->opnds[0].type,
+                              in->loc);
+        return;
+      }
+      addr = wasm_ir_addr_op(e, in->opnds[1], in->loc);
+      dst = wasm_ir_dest_op(e, in->opnds[0], &d);
       wasm_load(t, dst, addr, in->extra.mem);
       wasm_ir_dest_finish(e, &d);
       return;
@@ -449,9 +561,25 @@ static void wasm_ir_emit_inst(WasmIrEmitter* e, const CgIrFunc* f,
       wasm_ir_dest_finish(e, &d);
       return;
     }
-    case CG_IR_TLS_ADDR_OF:
-      wasm_ir_fail(e, in->loc, "wasm target: tls_addr_of not yet implemented");
+    case CG_IR_TLS_ADDR_OF: {
+      /* Wasm has no thread-local storage: a module instance owns one linear
+       * memory, so a thread-local resolves to a fixed data address. Lower to
+       * the symbol's (addend-adjusted) linear-memory address, exactly like a
+       * non-TLS addr_of of a global. */
+      const CgIrTlsAux* aux = (const CgIrTlsAux*)in->extra.aux;
+      WasmIrDest d;
+      Operand src;
+      Operand dst;
+      memset(&src, 0, sizeof src);
+      src.kind = OPK_GLOBAL;
+      src.type = in->opnds[0].type;
+      src.v.global.sym = aux->sym;
+      src.v.global.addend = aux->addend;
+      dst = wasm_ir_dest_op(e, in->opnds[0], &d);
+      wasm_addr_of(t, dst, src);
+      wasm_ir_dest_finish(e, &d);
       return;
+    }
     case CG_IR_AGG_COPY: {
       const CgIrAggAux* aux = (const CgIrAggAux*)in->extra.aux;
       Operand dst = wasm_ir_source_op(e, in->opnds[0], in->loc);
@@ -467,12 +595,10 @@ static void wasm_ir_emit_inst(WasmIrEmitter* e, const CgIrFunc* f,
       return;
     }
     case CG_IR_BITFIELD_LOAD:
-      wasm_ir_fail(e, in->loc,
-                   "wasm target: bitfield_load not yet implemented");
+      wasm_ir_emit_bitfield_load(e, in);
       return;
     case CG_IR_BITFIELD_STORE:
-      wasm_ir_fail(e, in->loc,
-                   "wasm target: bitfield_store not yet implemented");
+      wasm_ir_emit_bitfield_store(e, in);
       return;
     case CG_IR_BINOP: {
       WasmIrDest d;
@@ -519,8 +645,12 @@ static void wasm_ir_emit_inst(WasmIrEmitter* e, const CgIrFunc* f,
       return;
     case CG_IR_CMP_BRANCH: {
       const CgIrCmpBranchAux* aux = (const CgIrCmpBranchAux*)in->extra.aux;
-      wasm_cmp_branch(t, aux->op, wasm_ir_value_op(e, in->opnds[0]),
-                      wasm_ir_value_op(e, in->opnds[1]), aux->target);
+      /* Use source_op, not value_op: a compared operand may be an
+       * address-taken local that lives in linear memory (e.g. the `expected`
+       * out-param of __atomic_compare_exchange), which must be loaded rather
+       * than read as a bare wasm local. */
+      wasm_cmp_branch(t, aux->op, wasm_ir_source_op(e, in->opnds[0], in->loc),
+                      wasm_ir_source_op(e, in->opnds[1], in->loc), aux->target);
       return;
     }
     case CG_IR_SWITCH:
@@ -552,7 +682,12 @@ static void wasm_ir_emit_inst(WasmIrEmitter* e, const CgIrFunc* f,
       d.break_label = aux->desc.break_label;
       d.continue_label = aux->desc.continue_label;
       d.result_type = aux->desc.result_type;
-      d.cond = wasm_ir_value_op(e, aux->desc.cond);
+      /* Only SCOPE_IF consumes cond; source_op (not value_op) so an
+       * address-taken local condition is loaded from memory, not read as a
+       * bare wasm local. */
+      d.cond = aux->desc.kind == SCOPE_IF
+                   ? wasm_ir_source_op(e, aux->desc.cond, in->loc)
+                   : wasm_ir_value_op(e, aux->desc.cond);
       wasm_ir_bind_scope(e, aux->scope, wasm_scope_begin(t, &d), in->loc);
       return;
     }
diff --git a/src/obj/obj_secnames.c b/src/obj/obj_secnames.c
@@ -113,6 +113,12 @@ Sym obj_secname_tdata(Compiler* c) {
       /* MSVC `.tls$` convention; linker concatenates `.tls$*` sorted
        * by suffix.  See doc/WINDOWS.md §1.6. */
       return pool_intern_slice(c->global, SLICE_LIT(".tls$"));
+    case CFREE_OBJ_WASM:
+      /* Wasm has no thread-local storage model: a module instance owns a
+       * single linear memory, so a thread-local is just an ordinary
+       * data object. Keep the `.tdata` name (laid out like `.data`) and
+       * lower tls_addr_of to a plain symbol address. */
+      return pool_intern_slice(c->global, SLICE_LIT(".tdata"));
     default:
       return secname_panic_unimpl(c, ".tdata");
   }
@@ -128,6 +134,10 @@ Sym obj_secname_tbss(Compiler* c) {
       /* sorted-alphabetically-last so it falls at the tail of the TLS
        * image's zero-fill region.  See doc/WINDOWS.md §1.6. */
       return pool_intern_slice(c->global, SLICE_LIT(".tls$ZZZ"));
+    case CFREE_OBJ_WASM:
+      /* See obj_secname_tdata: wasm thread-locals are ordinary
+       * (zero-filled) data. */
+      return pool_intern_slice(c->global, SLICE_LIT(".tbss"));
     default:
       return secname_panic_unimpl(c, ".tbss");
   }

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/wasm/ir_emit.c	\|	203	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M	src/obj/obj_secnames.c	\|	10	++++++++++