commit 8ac1385246eaad90b7b36d075d58086f995dc88d
parent 9f744cea2b46ba8b526001fd7c4a425e531732ea
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 14 May 2026 13:13:49 -0700
Consolidate arch files into subdirs
Diffstat:
59 files changed, 6196 insertions(+), 6216 deletions(-)
diff --git a/doc/ASM.md b/doc/ASM.md
@@ -74,18 +74,18 @@ src/parse/parse_asm_helpers.h
src/parse/parse.c parse_asm_stmt: GNU asm("...") statement
grammar (volatile, goto, four colon-separated
lists, [name] symbolic operands).
-src/arch/aa64_isa.{h,c} per-format pack/unpack/print + AA64InsnDesc
+src/arch/aa64/isa.{h,c} per-format pack/unpack/print + AA64InsnDesc
table + alias flags. Shared between encoder,
decoder, and printer.
-src/arch/aa64_asm.{h,c} aa64 instruction parser: per-mnemonic dispatch
+src/arch/aa64/asm.{h,c} aa64 instruction parser: per-mnemonic dispatch
over the table → inline encoders.
aa64_inline_bind + aa64_asm_run_template
implement the inline-asm template walker.
-src/arch/aa64_disasm.{h,c} aa64 ArchDisasm impl wrapping aa64_disasm_find +
+src/arch/aa64/disasm.{h,c} aa64 ArchDisasm impl wrapping aa64_disasm_find +
aa64_print_operands; synthesizes b.<cond>.
-src/arch/aa64_regs.{h,c} canonical aarch64 register name list.
+src/arch/aa64/regs.{h,c} canonical aarch64 register name list.
src/arch/disasm.c arch_disasm_new dispatch on c->target.arch.
-src/arch/aarch64.c aa_asm_block: CGTarget vtable entry for inline
+src/arch/aa64/arch.c aa_asm_block: CGTarget vtable entry for inline
asm; opens AA64Asm, binds operands, runs
template, closes.
src/cg/cg.c cg_inline_asm: constraint binder (pops inputs,
diff --git a/doc/DBG.md b/doc/DBG.md
@@ -48,7 +48,7 @@ src/
bp.c breakpoint patch table (addr -> saved bytes, refcount)
step.c resume-mode state machine (insn / line / next / out)
displaced.c arch-neutral plumbing for out-of-line execution
- arch_aa64.c aa64 BRK encoding + PC-relative fixups for displaced
+ arch/aa64/dbg.c aa64 BRK encoding + PC-relative fixups for displaced
arch_x64.c (later)
arch_rv64.c (later)
mem.c read/write_mem with sigsetjmp bad-address guard
@@ -215,7 +215,7 @@ Invariants:
## 7. Software breakpoints
-aa64-specific encoding lives in `src/dbg/arch_aa64.c`; everything else
+aa64-specific encoding lives in `src/arch/aa64/dbg.c`; everything else
in `src/dbg/bp.c` is arch-neutral.
- Patch instruction: `BRK #0` (4 bytes on aa64; `0xCC` on x64 later).
@@ -384,10 +384,10 @@ the box.
- [x] `bp.c` — refcounted patch table, idempotent set/clear, read overlay
- [x] `mem.c` — guarded read/write via `dbg_os->guarded_copy`
- [x] `displaced.c` — scratch page + per-insn shim primitive
-- [x] `arch_aa64.c` — verbatim copy + B / BL / B.cond / CBZ / CBNZ /
+- [x] `arch/aa64/dbg.c` — verbatim copy + B / BL / B.cond / CBZ / CBNZ /
TBZ / TBNZ / ADR / ADRP / LDR-lit (W/X/SW) / BR / BLR / RET
- [x] `step.c` — `STEP_LINE` / `NEXT_LINE` / `STEP_OUT` state machines
-- [ ] `arch_aa64.c`: LDR-literal vector forms (S/D/Q register dest);
+- [ ] `arch/aa64/dbg.c`: LDR-literal vector forms (S/D/Q register dest);
currently decline. Common in optimized builds.
- [ ] `arch_x64.c`: INT3 + RIP-relative fixups for the same insn family
- [ ] `arch_rv64.c`: EBREAK + AUIPC/JAL/branch fixups
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -42,7 +42,7 @@ substitute a behaviorally similar shortcut without updating both documents.
## Completion Notes
- Implemented in `src/opt/pass_lower.c`, `src/opt/opt.c`, `src/opt/ir.h`,
- `src/arch/aarch64/opt_coord.c`, `src/arch/x64/opt_coord.c`,
+ `src/arch/aa64/opt_coord.c`, `src/arch/x64/opt_coord.c`,
`src/arch/rv64/opt_coord.c`, and `test/opt/opt_test.c`.
- Added `opt_dead_def_elim` pass (pre-RA backward walk with dynamic liveness,
removes cascading dead defs before rewrite).
diff --git a/doc/STAGE2.md b/doc/STAGE2.md
@@ -100,8 +100,8 @@ not been switched back on.
`src/debug/` and `src/link/`.
- [x] **B8.** `sizeof` accepts the no-parens **unary-expression** form
in constant-expression contexts (e.g. file-scope initializers). C99
- §6.5.3.4 standard, not an extension. Blocked `src/arch/aa64_isa.c`
- and `src/arch/aa64_regs.c`.
+ §6.5.3.4 standard, not an extension. Blocked `src/arch/aa64/isa.c`
+ and `src/arch/aa64/regs.c`.
- [x] **B9.** Block-scope `static T name[] = {...}` now completes the
incomplete array, mirroring B6's file-scope fix. Was blocking
`src/pp/pp.c`.
@@ -113,7 +113,7 @@ not been switched back on.
- [x] **C2.** `OPK_INDIRECT` on the indirect-return path (commit
f2d3e01).
- [x] **C0.** Stage-1 regalloc "no spillable victim (class 0)" panic
- fixed — was choking on the complex functions in `src/arch/aarch64.c`,
+ fixed — was choking on the complex functions in `src/arch/aa64/arch.c`,
`src/arch/rv64.c`, `src/cg/cg.c`, and `src/opt/opt.c`. Not a feature
gap; a regalloc bug surfaced by self-host pressure.
diff --git a/doc/TAILCALL.md b/doc/TAILCALL.md
@@ -90,7 +90,7 @@ When `flags & CG_CALL_TAIL`:
## Step 3 — AArch64 backend
-### `src/arch/aarch64/internal.h`
+### `src/arch/aa64/internal.h`
Worst-case inline teardown: 5 int-pair LDPs (x19–x28) + 4 fp-pair LDPs
(d8–d15) + 1 fp/lr LDP + 2 SP-add instructions = 12; use 14 for headroom.
@@ -114,7 +114,7 @@ a->ntail_sites = 0;
a->tail_sites_cap = 0;
```
-### `src/arch/aarch64/ops.c` — `aa_call`
+### `src/arch/aa64/ops.c` — `aa_call`
After the `emit_arg_value` loop and `max_outgoing` update, before the existing
BL/BLR emission:
@@ -159,7 +159,7 @@ if (d->flags & CG_CALL_TAIL) {
`aa_tail_site_push` is a small grow-array helper consistent with the existing
`add_patches` pattern.
-### `src/arch/aarch64/emit.c` — `aa_func_end`
+### `src/arch/aa64/emit.c` — `aa_func_end`
After computing `n_int_pairs`, `n_fp_pairs`, `frame_size`, `int_save_off`,
`fp_save_off`, `fp_lr_off` — before placing the epilogue label — patch each
@@ -227,8 +227,8 @@ compile and run via `cfree run` and verify correctness.
| `src/parse/parse_stmt.c` | attribute prefix detection; musttail return path |
| `src/parse/parse_expr.c` | `cg_tail_call` dispatch when `in_musttail` |
| `src/cg/cg.c` | factor `cg_call_impl`; implement `cg_tail_call` |
-| `src/arch/aarch64/internal.h` | constants, `AATailCallSite`, fields in `AAImpl` |
-| `src/arch/aarch64/ops.c` | tail-call branch in `aa_call`; `aa_tail_site_push` |
-| `src/arch/aarch64/emit.c` | init in `aa_func_begin`; patch loop in `aa_func_end` |
+| `src/arch/aa64/internal.h` | constants, `AATailCallSite`, fields in `AAImpl` |
+| `src/arch/aa64/ops.c` | tail-call branch in `aa_call`; `aa_tail_site_push` |
+| `src/arch/aa64/emit.c` | init in `aa_func_begin`; patch loop in `aa_func_end` |
| `test/parse/` | musttail attribute parse test |
| `test/cg/` | direct/indirect/e2e tail call tests |
diff --git a/doc/arch-registration-plan.md b/doc/arch-registration-plan.md
@@ -1,20 +0,0 @@
-# Architecture Registration Plan
-
-## Checklist
-
-- [x] Introduce one internal arch descriptor and registry lookup.
-- [x] Route existing arch dispatchers through that descriptor without changing behavior.
-- [x] Move ABI selection behind the arch descriptor.
-- [x] Move object-format relocation translators behind the arch descriptor.
-- [x] Move linker-only arch constants and stub emitters fully behind the descriptor.
-- [x] Move assembler/disassembler/register helpers behind arch-owned implementation files.
-- [x] Make `MCEmitter` delegate label fixup encoding to the arch descriptor.
-- [ ] Consolidate files into `src/arch/{aa64,rv64,x64}/` with one exposed implementation object per arch.
-- [ ] Teach the build to honor `CFREE_ARCHS` and compile only selected arch subtrees.
-- [ ] Add targeted subset-build tests for `aa64`, `x64`, `rv64`, and mixed subsets.
-
-## Phase 1
-
-Phase 1 is a refactor-only step. It adds the shared descriptor boundary and
-rewires existing centralized dispatchers to use it while all currently supported
-architectures remain compiled in by default.
diff --git a/src/api/stubs.c b/src/api/stubs.c
@@ -84,9 +84,9 @@ ObjBuilder* read_wasm(Compiler* c, const char* n, const u8* d, size_t l) {
/* Header-dep iterator lives in src/api/dep.c. */
/* Disassembler is real (src/api/disasm.c, src/arch/disasm.c,
- * src/arch/aa64_disasm.c). Per-arch register name lookups and the
+ * src/arch/aa64/disasm.c). Per-arch register name lookups and the
* indexed enumeration (cfree_arch_register_count / _at) are real
- * (src/api/arch_regs.c + src/arch/aa64_regs.c). */
+ * (src/api/arch_regs.c + src/arch/aa64/regs.c). */
/* Linker script parsing lives in src/link/link_script.c. */
@@ -94,7 +94,7 @@ ObjBuilder* read_wasm(Compiler* c, const char* n, const u8* d, size_t l) {
* src/link/link_jit.c. */
/* JIT session implementation lives in src/dbg/ (session.c, bp.c, step.c,
- * displaced.c, arch_aa64.c, mem.c). */
+ * displaced.c, mem.c) plus src/arch/aa64/dbg.c. */
/* DWARF consumer: the cfree_dwarf_* implementations live in src/debug/.
* Their stubs were removed when src/debug/dwarf_*.c took ownership of
diff --git a/src/arch/aa64.h b/src/arch/aa64/aa64.h
diff --git a/src/arch/aa64/alloc.c b/src/arch/aa64/alloc.c
@@ -0,0 +1,246 @@
+/* aa64/alloc.c — spill/reload, labels, control flow, structured scopes. */
+
+#include "arch/aa64/internal.h"
+
+/* ============================================================
+ * AAImpl accessor
+ * ============================================================ */
+
+AAImpl* impl_of(CGTarget* t) {
+  /* The backend state is reached by a plain pointer cast of the target. */
+  AAImpl* self = (AAImpl*)t;
+  return self;
+}
+
+/* ============================================================
+ * Slot accessor
+ * ============================================================ */
+
+AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs) {
+  /* Slot ids index a->slots from 1; FRAME_SLOT_NONE and ids beyond
+   * a->nslots have no backing entry. */
+  int valid = (fs != FRAME_SLOT_NONE) && !(fs > a->nslots);
+  return valid ? &a->slots[fs - 1] : NULL;
+}
+
+/* Resolve a register name to a backend register number and class.
+ *
+ * The pooled name is copied into a small NUL-terminated buffer and looked
+ * up with aa64_register_index. Indices 0..30 are reported as RC_INT
+ * registers with the same number; indices 64..95 are reported as RC_FP
+ * registers numbered 0..31. `out` and `cls_out` may each be NULL.
+ *
+ * Returns 0 on success, 1 if the name is missing, empty, 8 or more
+ * characters long, unknown, or maps outside those two ranges. */
+static int aa_resolve_reg_name(CGTarget* t, Sym name, Reg* out,
+                               RegClass* cls_out) {
+  size_t len = 0;
+  const char* s = pool_str(t->c->global, name, &len);
+  if (!s || !len) return 1;
+  char buf[8];
+  if (len >= sizeof buf) return 1;
+  memcpy(buf, s, len);
+  buf[len] = '\0';
+  u32 dwarf;
+  if (aa64_register_index(buf, &dwarf) != 0) return 1;
+  if (dwarf <= 30u) {
+    if (out) *out = (Reg)dwarf;
+    if (cls_out) *cls_out = RC_INT;
+    return 0;
+  }
+  if (dwarf >= 64u && dwarf <= 95u) {
+    /* FP registers are numbered from 0 within their own class. */
+    if (out) *out = (Reg)(dwarf - 64u);
+    if (cls_out) *cls_out = RC_FP;
+    return 0;
+  }
+  return 1;
+}
+
+/* Store register operand `src` into frame slot `slot` using access `ma`.
+ * Panics if `src` is not an OPK_REG operand. */
+static void aa_spill_reg(CGTarget* t, Operand src, FrameSlot slot,
+                         MemAccess ma) {
+  AAImpl* impl = impl_of(t);
+  if (src.kind != OPK_REG) {
+    compiler_panic(t->c, impl->loc, "aarch64 spill_reg: src is not OPK_REG");
+  }
+  /* Describe the slot as an OPK_LOCAL address operand and reuse aa_store. */
+  Operand slot_addr;
+  memset(&slot_addr, 0, sizeof slot_addr);
+  slot_addr.v.frame_slot = slot;
+  slot_addr.type = ma.type;
+  slot_addr.cls = RC_INT;
+  slot_addr.kind = OPK_LOCAL;
+  aa_store(t, slot_addr, src, ma);
+}
+
+/* Load frame slot `slot` into register operand `dst` using access `ma`.
+ * Panics if `dst` is not an OPK_REG operand. */
+static void aa_reload_reg(CGTarget* t, Operand dst, FrameSlot slot,
+                          MemAccess ma) {
+  AAImpl* impl = impl_of(t);
+  if (dst.kind != OPK_REG) {
+    compiler_panic(t->c, impl->loc, "aarch64 reload_reg: dst is not OPK_REG");
+  }
+  /* Describe the slot as an OPK_LOCAL address operand and reuse aa_load. */
+  Operand slot_addr;
+  memset(&slot_addr, 0, sizeof slot_addr);
+  slot_addr.v.frame_slot = slot;
+  slot_addr.type = ma.type;
+  slot_addr.cls = RC_INT;
+  slot_addr.kind = OPK_LOCAL;
+  aa_load(t, dst, slot_addr, ma);
+}
+
+/* ============================================================
+ * Labels / control flow
+ * ============================================================ */
+
+/* Allocate a fresh label from the machine-code emitter. */
+static Label aa_label_new(CGTarget* t) {
+  MCEmitter* mc = t->mc;
+  return (Label)mc->label_new(mc);
+}
+
+/* Place label `l` through the machine-code emitter. */
+static void aa_label_place(CGTarget* t, Label l) {
+  MCEmitter* mc = t->mc;
+  mc->label_place(mc, (MCLabel)l);
+}
+
+/* Emit an unconditional branch to `l`: the base B word first, then a
+ * JUMP26 label reference of width 4 recorded with the emitter. */
+void aa_jump(CGTarget* t, Label l) {
+  MCEmitter* emitter = t->mc;
+  aa64_emit32(emitter, aa64_b_base());
+  emitter->emit_label_ref(emitter, (MCLabel)l, R_AARCH64_JUMP26, 4, 0);
+}
+
+/* Map a CmpOp to the 4-bit condition value passed to aa64_b_cond /
+ * aa64_cset. Unknown ops yield 0x0, the same value as CMP_EQ. */
+static u32 cmp_to_cond(CmpOp op) {
+  u32 cond = 0x0u;
+  switch (op) {
+    /* equality */
+    case CMP_EQ: cond = 0x0u; break;
+    case CMP_NE: cond = 0x1u; break;
+    /* unsigned ordering */
+    case CMP_GE_U: cond = 0x2u; break;
+    case CMP_LT_U: cond = 0x3u; break;
+    case CMP_GT_U: cond = 0x8u; break;
+    case CMP_LE_U: cond = 0x9u; break;
+    /* signed ordering */
+    case CMP_GE_S: cond = 0xau; break;
+    case CMP_LT_S: cond = 0xbu; break;
+    case CMP_GT_S: cond = 0xcu; break;
+    case CMP_LE_S: cond = 0xdu; break;
+    default: break;
+  }
+  return cond;
+}
+
+/* Emit a flag-setting compare of `a_op` against `b_op` (SUBS with Rd=31).
+ * The operation width follows a_op's type. When b_op is an immediate that
+ * aa64_addsub_imm_fits accepts (and a_op is not itself an immediate), the
+ * immediate form is used; otherwise both operands are forced into
+ * registers, with b_op getting whichever of AA_TMP0/AA_TMP1 a_op did not
+ * end up in. */
+void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) {
+  MCEmitter* mc = t->mc;
+  const u32 sf = type_is_64(a_op.type) ? 1u : 0u;
+  const u32 zr = 31u;
+  u32 imm12 = 0;
+  u32 shift = 0;
+  int use_imm = b_op.kind == OPK_IMM && a_op.kind != OPK_IMM &&
+                aa64_addsub_imm_fits(b_op.v.imm, &imm12, &shift);
+
+  u32 lhs = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
+  if (use_imm) {
+    aa64_emit32(mc, aa64_subs_imm12(sf, zr, lhs, imm12, shift));
+    return;
+  }
+
+  u32 scratch = (lhs == AA_TMP0) ? AA_TMP1 : AA_TMP0;
+  u32 rhs = aa64_force_reg_int(t, b_op, sf, scratch);
+  aa64_emit32(mc, aa64_subs_reg(sf, zr, lhs, rhs));
+}
+
+/* Compare `a` with `b`, then branch to `l` when `op` holds: a B.cond word
+ * followed by a CONDBR19 label reference of width 4. */
+static void aa_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b,
+                          Label l) {
+  MCEmitter* emitter = t->mc;
+  emit_cmp_ab(t, a, b);
+  u32 cond = cmp_to_cond(op);
+  aa64_emit32(emitter, aa64_b_cond(cond));
+  emitter->emit_label_ref(emitter, (MCLabel)l, R_AARCH64_CONDBR19, 4, 0);
+}
+
+/* Compare `a` with `b` and materialize the result of `op` into `dst`
+ * with CSET; the CSET width follows dst's type. */
+static void aa_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
+  emit_cmp_ab(t, a, b);
+  u32 dst_is_64 = type_is_64(dst.type) ? 1u : 0u;
+  u32 cond = cmp_to_cond(op);
+  aa64_emit32(t->mc, aa64_cset(dst_is_64, reg_num(dst), cond));
+}
+
+/* ============================================================
+ * Structured scopes
+ * ============================================================ */
+
+/* Open a structured scope described by `d` and return its handle, which is
+ * the 1-based index of the new entry in a->scopes.
+ *
+ * SCOPE_IF allocates else/end labels and emits "compare cond with 0;
+ * branch-if-equal to else". SCOPE_LOOP and SCOPE_BLOCK only record the
+ * break/continue labels. Any other kind panics. */
+static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+
+  /* Grow the scope array (doubling, starting at 4) when it is full. */
+  if (a->nscopes == a->scopes_cap) {
+    u32 grown = a->scopes_cap ? a->scopes_cap * 2u : 4u;
+    AAScope* fresh = arena_array(t->c->tu, AAScope, grown);
+    if (a->scopes) memcpy(fresh, a->scopes, sizeof(AAScope) * a->nscopes);
+    a->scopes = fresh;
+    a->scopes_cap = grown;
+  }
+
+  AAScope* sc = a->scopes + a->nscopes;
+  sc->kind = (u8)d->kind;
+  sc->has_else = 0;
+  sc->else_label = 0;
+  sc->end_label = 0;
+  sc->break_label = d->break_label;
+  sc->continue_label = d->continue_label;
+
+  switch (d->kind) {
+    case SCOPE_IF: {
+      sc->else_label = mc->label_new(mc);
+      sc->end_label = mc->label_new(mc);
+      u32 sf = type_is_64(d->cond.type) ? 1u : 0u;
+      u32 rn = aa64_force_reg_int(t, d->cond, sf, AA_TMP0);
+      aa64_emit32(mc, aa64_subs_imm(sf, /*Rd=ZR*/ 31u, rn, 0));
+      aa64_emit32(mc, aa64_b_cond(0x0u /*EQ*/));
+      mc->emit_label_ref(mc, sc->else_label, R_AARCH64_CONDBR19, 4, 0);
+      break;
+    }
+    case SCOPE_LOOP:
+    case SCOPE_BLOCK:
+      /* bookkeep only */
+      break;
+    default:
+      compiler_panic(t->c, a->loc,
+                     "aarch64 scope_begin: kind %d not yet implemented",
+                     (int)d->kind);
+      break;
+  }
+
+  a->nscopes++;
+  return (CGScope)a->nscopes;
+}
+
+/* Switch scope `s` from its then-arm to its else-arm: emit a branch to the
+ * scope's end label, place the else label here, and mark has_else.
+ * Panics on CG_SCOPE_NONE or a handle beyond a->nscopes. */
+static void aa_scope_else(CGTarget* t, CGScope s) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  int in_range = (s != CG_SCOPE_NONE) && (s <= a->nscopes);
+  if (!in_range) {
+    compiler_panic(t->c, a->loc, "aarch64 scope_else: bad scope %u",
+                   (unsigned)s);
+  }
+  AAScope* sc = a->scopes + (s - 1);
+  aa64_emit32(mc, aa64_b_base());
+  mc->emit_label_ref(mc, sc->end_label, R_AARCH64_JUMP26, 4, 0);
+  mc->label_place(mc, sc->else_label);
+  sc->has_else = 1;
+}
+
+/* Close scope `s`. For SCOPE_IF the else label is placed here if no
+ * else-arm was opened, and the end label is always placed; other kinds
+ * emit nothing. Panics on CG_SCOPE_NONE or a handle beyond a->nscopes. */
+static void aa_scope_end(CGTarget* t, CGScope s) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  int in_range = (s != CG_SCOPE_NONE) && (s <= a->nscopes);
+  if (!in_range) {
+    compiler_panic(t->c, a->loc, "aarch64 scope_end: bad scope %u",
+                   (unsigned)s);
+  }
+  AAScope* sc = a->scopes + (s - 1);
+  if (sc->kind != SCOPE_IF) return;
+  if (!sc->has_else) mc->label_place(mc, sc->else_label);
+  mc->label_place(mc, sc->end_label);
+}
+
+/* Jump to the break label recorded for scope `s`.
+ * Panics on CG_SCOPE_NONE or a handle beyond a->nscopes. */
+static void aa_break_to(CGTarget* t, CGScope s) {
+  AAImpl* a = impl_of(t);
+  int in_range = (s != CG_SCOPE_NONE) && (s <= a->nscopes);
+  if (!in_range) {
+    compiler_panic(t->c, a->loc, "aarch64 break_to: bad scope %u", (unsigned)s);
+  }
+  aa_jump(t, a->scopes[s - 1].break_label);
+}
+
+/* Jump to the continue label recorded for scope `s`.
+ * Panics on CG_SCOPE_NONE or a handle beyond a->nscopes. */
+static void aa_continue_to(CGTarget* t, CGScope s) {
+  AAImpl* a = impl_of(t);
+  int in_range = (s != CG_SCOPE_NONE) && (s <= a->nscopes);
+  if (!in_range) {
+    compiler_panic(t->c, a->loc, "aarch64 continue_to: bad scope %u",
+                   (unsigned)s);
+  }
+  aa_jump(t, a->scopes[s - 1].continue_label);
+}
+
+/* Install this file's CGTarget entries (structured scopes, labels and
+ * compares, spill/reload, register-name lookup) on `t`. Per the original
+ * note, ops.c calls this after the basic ops vtable is populated. */
+void aa_alloc_vtable_init(CGTarget* t) {
+  /* structured scopes */
+  t->scope_begin = aa_scope_begin;
+  t->scope_else = aa_scope_else;
+  t->scope_end = aa_scope_end;
+  t->break_to = aa_break_to;
+  t->continue_to = aa_continue_to;
+
+  /* labels, jumps, compares */
+  t->label_new = aa_label_new;
+  t->label_place = aa_label_place;
+  t->jump = aa_jump;
+  t->cmp_branch = aa_cmp_branch;
+  t->cmp = aa_cmp;
+
+  /* register spill/reload and name resolution */
+  t->spill_reg = aa_spill_reg;
+  t->reload_reg = aa_reload_reg;
+  t->resolve_reg_name = aa_resolve_reg_name;
+}
diff --git a/src/arch/aa64/arch.c b/src/arch/aa64/arch.c
@@ -0,0 +1,97 @@
+#include "arch/arch.h"
+
+#include "abi/abi_internal.h"
+#include "arch/aa64/aa64.h"
+#include "arch/aa64/asm.h"
+#include "arch/aa64/disasm.h"
+#include "arch/aa64/regs.h"
+#include "core/bytes.h"
+#include "link/link_arch.h"
+#include "obj/elf.h"
+#include "obj/macho.h"
+#include "obj/obj.h"
+
+extern const LinkArchDesc link_arch_aa64;
+
+/* Pick the ABI vtable for `os`: the Apple arm64 table on macOS, the
+ * AAPCS64 table for every other OS kind. `c` is unused. */
+static const ABIVtable* aa64_abi_vtable(Compiler* c, CfreeOSKind os) {
+  (void)c;
+  if (os == CFREE_OS_MACOS) return &apple_arm64_vtable;
+  return &aapcs64_vtable;
+}
+
+/* Fill `out` with the register at position `idx` via
+ * aa64_register_iter_get. Returns 1 when `out` is NULL, otherwise the
+ * iterator's own return value. */
+static int aa64_register_at_public(uint32_t idx, CfreeArchReg* out) {
+  return out ? aa64_register_iter_get(idx, &out->dwarf_idx, &out->name) : 1;
+}
+
+static const ArchElfOps aa64_elf_ops = { /* ELF hooks for aa64; referenced by arch_impl_aa64.elf */
+ .e_machine = EM_AARCH64, /* machine id constant for this arch */
+ .e_flags = 0, /* no flag bits set */
+ .reloc_to = elf_aarch64_reloc_to, /* reloc translators; direction presumably internal -> ELF — confirm */
+ .reloc_from = elf_aarch64_reloc_from, /* presumably ELF -> internal — confirm */
+};
+
+static const ArchMachoOps aa64_macho_ops = { /* Mach-O hooks for aa64; referenced by arch_impl_aa64.macho */
+ .cputype = CPU_TYPE_ARM64, /* cpu type constant for this arch */
+ .cpusubtype = CPU_SUBTYPE_ARM64_ALL, /* cpu subtype constant */
+ .reloc_to = macho_aarch64_reloc_to, /* per-reloc translators supplied by the Mach-O aarch64 code */
+ .reloc_pcrel = macho_aarch64_reloc_pcrel,
+ .reloc_length = macho_aarch64_reloc_length,
+ .reloc_from = macho_aarch64_reloc_from,
+};
+
+/* Patch a resolved label displacement into the 4-byte branch instruction
+ * at fx->offset in section fx->sec_id of fx->obj.
+ *
+ * Supported kinds:
+ *   R_AARCH64_JUMP26 / R_AARCH64_CALL26 — imm26 in bits [25:0]
+ *   R_AARCH64_CONDBR19                  — imm19 in bits [23:5]
+ * Both immediates count 4-byte instructions, so the byte displacement must
+ * be a multiple of 4 and, once divided, fit the signed field. A
+ * displacement that cannot be encoded is reported as a failure instead of
+ * being truncated into a branch to the wrong address.
+ *
+ * Returns 0 on success (and, as before, when the section lookup yields
+ * NULL), 1 for a NULL/invalid fixup, an unsupported kind, or an
+ * unencodable displacement. `c` is unused. */
+static int aa64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) {
+  const Section* s;
+  u8 cur[4];
+  u32 word;
+  i64 disp;
+  i64 idisp;
+
+  (void)c;
+  if (!fx || fx->width != 4) return 1;
+  s = obj_section_get(fx->obj, fx->sec_id);
+  if (!s) return 0;
+
+  disp = (i64)fx->disp;
+  if ((disp & 3) != 0) return 1; /* branch targets are word-aligned */
+  idisp = disp / 4;              /* exact: disp is a multiple of 4 */
+
+  buf_read(&s->bytes, fx->offset, cur, 4);
+  word = rd_u32_le(cur);
+
+  switch (fx->kind) {
+    case R_AARCH64_JUMP26:
+    case R_AARCH64_CALL26: {
+      if (idisp < -((i64)1 << 25) || idisp >= ((i64)1 << 25)) return 1;
+      u32 imm26 = (u32)((u64)idisp & 0x03ffffffu);
+      word = (word & ~0x03ffffffu) | imm26;
+      break;
+    }
+    case R_AARCH64_CONDBR19: {
+      if (idisp < -((i64)1 << 18) || idisp >= ((i64)1 << 18)) return 1;
+      u32 imm19 = (u32)((u64)idisp & 0x7ffffu);
+      word = (word & ~(0x7ffffu << 5)) | (imm19 << 5);
+      break;
+    }
+    default:
+      return 1;
+  }
+
+  wr_u32_le(cur, word);
+  obj_patch(fx->obj, fx->sec_id, fx->offset, cur, 4);
+  return 0;
+}
+
+const ArchImpl arch_impl_aa64 = { /* aa64 architecture descriptor: gathers every per-arch hook in one object */
+ .kind = CFREE_ARCH_ARM_64,
+ .name = "aa64",
+ .abi_vtable = aa64_abi_vtable, /* OS-dependent ABI selection (defined in this file) */
+ .cgtarget_new = aa64_cgtarget_new,
+ .asm_new = aa64_arch_asm_new,
+ .disasm_new = aa64_disasm_new,
+ .apply_label_fixup = aa64_apply_label_fixup, /* branch-immediate patcher (defined in this file) */
+ .link = &link_arch_aa64, /* declared extern above; defined outside this file */
+ .elf = &aa64_elf_ops,
+ .macho = &aa64_macho_ops,
+ .register_name = aa64_register_name,
+ .register_index = aa64_register_index,
+ .register_count = aa64_register_iter_size,
+ .register_at = aa64_register_at_public, /* NULL-checking wrapper over aa64_register_iter_get */
+};
diff --git a/src/arch/aa64/asm.c b/src/arch/aa64/asm.c
@@ -0,0 +1,1379 @@
+/* AArch64 standalone .s instruction parser.
+ *
+ * Per-mnemonic dispatch: each entry in the mnemonic table names a
+ * parse function that reads operand tokens through the asm-driver
+ * surface and emits the encoded word via the inline encoders in
+ * arch/aa64/isa.h. Encoders are the single source of truth for bit
+ * layout — the disassembler shares them through aa64_*_unpack.
+ *
+ * Aliases (`mov`, `neg`, `cmp`, `mul`, ...) live in this table as
+ * dedicated rows that pick the canonical form's encoder with the
+ * alias-specific operand shape. When a mnemonic admits multiple
+ * forms (e.g. `mov` register-vs-immediate, `add` register-vs-
+ * immediate), the parser branches on operand shape after reading
+ * the first non-Rd operand. */
+
+#include "arch/aa64/asm.h"
+
+#include <string.h>
+
+#include "arch/aa64/isa.h"
+#include "arch/aa64/regs.h"
+#include "arch/arch.h"
+#include "core/arena.h"
+#include "core/pool.h"
+#include "core/strbuf.h"
+#include "asm/asm_lex.h"
+#include "obj/obj.h"
+#include "asm/asm_helpers.h"
+
+/* ---- public handle ---- */
+
+struct AA64Asm { /* aa64 assembler handle; `base` is what aa64_arch_asm_new hands out */
+ ArchAsm base; /* insn/destroy callbacks, set by aa64_asm_open; callbacks cast ArchAsm* back to AA64Asm* */
+ Compiler* c; /* compiler passed to aa64_asm_open */
+
+ /* Inline-asm bound state (set by aa64_inline_bind, cleared otherwise).
+ * Operand indexing per GCC convention: 0..nout-1 are outputs, then
+ * nout..nout+nin-1 are inputs. Templates address into this combined
+ * list via %N / %wN / %xN / %aN. out_ops is mutable (the binder fills
+ * in result locations); in_ops + constraints + clobbers are read-only
+ * borrows. */
+ const AsmConstraint* outs; /* output constraints, nout entries */
+ Operand* out_ops; /* output operands, nout entries */
+ const AsmConstraint* ins; /* input constraints, nin entries */
+ const Operand* in_ops; /* input operands, nin entries */
+ const Sym* clobbers; /* clobber names, nclob entries */
+ u32 nout;
+ u32 nin;
+ u32 nclob;
+};
+
+static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic);
+static void aa64_arch_asm_destroy(ArchAsm* base);
+
+/* Allocate a zeroed AA64Asm from the compiler's `tu` arena and wire up its
+ * ArchAsm callbacks. No inline-asm operands are bound yet. */
+AA64Asm* aa64_asm_open(Compiler* c) {
+  AA64Asm* self = arena_new(c->tu, AA64Asm);
+  memset(self, 0, sizeof(AA64Asm));
+  self->c = c;
+  self->base.destroy = aa64_arch_asm_destroy;
+  self->base.insn = aa64_arch_asm_insn;
+  return self;
+}
+
+/* Close an assembler handle. Currently a no-op: nothing is released here. */
+void aa64_asm_close(AA64Asm* a) {
+  (void)a;
+}
+
+/* ArchImpl constructor hook: open an AA64Asm and return its ArchAsm base. */
+ArchAsm* aa64_arch_asm_new(Compiler* c) {
+  AA64Asm* handle = aa64_asm_open(c);
+  return &handle->base;
+}
+
+/* ArchAsm.insn callback: recover the AA64Asm from its base and forward. */
+static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) {
+  AA64Asm* self = (AA64Asm*)base;
+  aa64_asm_insn(self, d, mnemonic);
+}
+
+/* ArchAsm.destroy callback: recover the AA64Asm from its base and close. */
+static void aa64_arch_asm_destroy(ArchAsm* base) {
+  AA64Asm* self = (AA64Asm*)base;
+  aa64_asm_close(self);
+}
+
+/* Record the inline-asm operand lists on `a`. Only the pointers and counts
+ * are stored; nothing is copied. */
+void aa64_inline_bind(AA64Asm* a,
+                      const AsmConstraint* outs, u32 nout, Operand* out_ops,
+                      const AsmConstraint* ins, u32 nin, const Operand* in_ops,
+                      const Sym* clobbers, u32 nclob) {
+  /* outputs */
+  a->nout = nout;
+  a->outs = outs;
+  a->out_ops = out_ops;
+  /* inputs */
+  a->nin = nin;
+  a->ins = ins;
+  a->in_ops = in_ops;
+  /* clobbers */
+  a->nclob = nclob;
+  a->clobbers = clobbers;
+}
+
+/* ---- helpers ---- */
+
+/* Shorthand for asm_driver_tok_is_punct. */
+static int tok_punct(AsmTok t, u32 p) {
+  return asm_driver_tok_is_punct(t, p);
+}
+
+/* ASCII case-insensitive equality between the length-delimited string
+ * a[0..an) and the NUL-terminated string b. Returns 1 only when both have
+ * the same length and match letter-for-letter ignoring A-Z/a-z case.
+ * Never reads past b's terminator. */
+static int icase_eq(const char* a, size_t an, const char* b) {
+  for (size_t k = 0; k != an; ++k) {
+    char lhs = a[k];
+    char rhs = b[k];
+    if (rhs == '\0') return 0; /* b is shorter than a */
+    if (lhs >= 'A' && lhs <= 'Z') lhs = (char)(lhs - 'A' + 'a');
+    if (rhs >= 'A' && rhs <= 'Z') rhs = (char)(rhs - 'A' + 'a');
+    if (lhs != rhs) return 0;
+  }
+  return b[an] == '\0';
+}
+
+/* Parsed register operand. parse_reg_from_ident fills one of these from
+ * an identifier; the register number lands in `num` and the width in
+ * `is64`. Recognized spellings (case-insensitive):
+ * w0..w30, wzr → is64=0, num=0..30 / 31
+ * x0..x30, xzr, lr (=x30) → is64=1, num=0..30 / 31
+ * sp → is64=1, num=31 (is_sp set)
+ * wsp → is64=0, num=31 (is_sp set)
+ * Aliases:
+ * fp = x29
+ * ip0 = x16, ip1 = x17 (PLT scratch — useful for hand-written PLTs)
+ * d<N> spellings are handled separately by parse_fp_d_reg_from_ident,
+ * which sets is_fp. */
+typedef struct AA64Reg {
+ u32 num;
+ u8 is64;
+ u8 is_sp; /* 1 if the spelling was "sp" / "wsp" */
+ u8 is_fp; /* 1 for SIMD/FP register spellings accepted in FP forms */
+ u8 pad;
+} AA64Reg;
+
+/* Try to interpret identifier `ident` as a general-purpose register.
+ *
+ * Accepts, case-insensitively: the fixed names sp, wsp, lr, fp, ip0, ip1,
+ * xzr, wzr, and the numbered forms w0..w30 / x0..x30. Register number 31
+ * is reachable only through its sp/wsp/xzr/wzr spellings, since a bare
+ * "x31"/"w31" would not say which of the two meanings is intended.
+ *
+ * On success fills out->num, out->is64, out->is_sp and clears out->is_fp,
+ * then returns 1. Returns 0 (leaving *out untouched) if the identifier is
+ * not a register spelling. */
+static int parse_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) {
+  static const struct {
+    const char* name;
+    u8 num;
+    u8 is64;
+    u8 is_sp;
+  } named[] = {
+      {"sp", 31, 1, 1},  {"wsp", 31, 0, 1}, {"lr", 30, 1, 0},
+      {"fp", 29, 1, 0},  {"ip0", 16, 1, 0}, {"ip1", 17, 1, 0},
+      {"xzr", 31, 1, 0}, {"wzr", 31, 0, 0},
+  };
+  size_t n = 0;
+  const char* p = pool_str(asm_driver_pool(d), ident, &n);
+  if (!p || !n) return 0;
+
+  for (size_t k = 0; k < sizeof named / sizeof named[0]; ++k) {
+    if (!icase_eq(p, n, named[k].name)) continue;
+    out->num = named[k].num;
+    out->is64 = named[k].is64;
+    out->is_sp = named[k].is_sp;
+    out->is_fp = 0;
+    return 1;
+  }
+
+  /* W/X<num>: a width letter followed by one or more decimal digits. */
+  int is_x = (p[0] == 'x' || p[0] == 'X');
+  int is_w = (p[0] == 'w' || p[0] == 'W');
+  if ((!is_x && !is_w) || n < 2) return 0;
+
+  u32 r = 0;
+  for (size_t i = 1; i < n; ++i) {
+    char c = p[i];
+    if (c < '0' || c > '9') return 0;
+    r = r * 10 + (u32)(c - '0');
+    /* Checked per digit so long digit strings cannot overflow r. */
+    if (r > 30) return 0;
+  }
+  out->num = r;
+  out->is64 = is_x ? 1 : 0;
+  out->is_sp = 0;
+  out->is_fp = 0;
+  return 1;
+}
+
+/* Try to interpret identifier `ident` as a D register spelling: 'd' or 'D'
+ * followed by decimal digits with value 0..31. On success fills *out
+ * (is64=1, is_fp=1, is_sp=0) and returns 1; otherwise returns 0 and leaves
+ * *out untouched. */
+static int parse_fp_d_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) {
+  size_t len = 0;
+  const char* text = pool_str(asm_driver_pool(d), ident, &len);
+  if (!text || len < 2) return 0;
+  if (text[0] != 'd' && text[0] != 'D') return 0;
+
+  u32 regno = 0;
+  size_t pos = 1;
+  while (pos < len) {
+    char ch = text[pos++];
+    if (ch < '0' || ch > '9') return 0;
+    regno = regno * 10 + (u32)(ch - '0');
+    if (regno > 31) return 0;
+  }
+
+  out->is_fp = 1;
+  out->is_sp = 0;
+  out->is64 = 1;
+  out->num = regno;
+  return 1;
+}
+
+/* Consume the next token and parse it as a general-purpose register;
+ * panics with "asm: expected register" if it is not one. */
+static AA64Reg parse_reg(AsmDriver* d) {
+  AsmTok tok = asm_driver_next(d);
+  AA64Reg reg;
+  memset(&reg, 0, sizeof reg);
+  int ok = tok.kind == ASM_TOK_IDENT &&
+           parse_reg_from_ident(d, tok.v.ident, &reg);
+  if (!ok) asm_driver_panic(d, "asm: expected register");
+  return reg;
+}
+
+/* Consume the next token and parse it as either a general-purpose register
+ * or, failing that, a D register; panics with "asm: expected register" if
+ * it is neither. */
+static AA64Reg parse_ldstp_reg(AsmDriver* d) {
+  AsmTok tok = asm_driver_next(d);
+  AA64Reg reg;
+  memset(&reg, 0, sizeof reg);
+  int ok = tok.kind == ASM_TOK_IDENT &&
+           (parse_reg_from_ident(d, tok.v.ident, &reg) ||
+            parse_fp_d_reg_from_ident(d, tok.v.ident, &reg));
+  if (!ok) {
+    asm_driver_panic(d, "asm: expected register");
+  }
+  return reg;
+}
+
+/* Panic if `r` was spelled sp/wsp; `what` names the instruction in the
+ * message. */
+static void reject_sp_reg(AsmDriver* d, AA64Reg r, const char* what) {
+  if (!r.is_sp) return;
+  asm_driver_panic(d, "asm: %s: SP register not allowed", what);
+}
+
+/* Panic if `r` is register 31 but was not spelled sp/wsp, i.e. a zero
+ * register was written where 31 means the stack pointer. */
+static void require_sp_spelling(AsmDriver* d, AA64Reg r, const char* what) {
+  int is_zero_reg = (r.num == 31u) && !r.is_sp;
+  if (!is_zero_reg) return;
+  asm_driver_panic(d, "asm: %s: zero register not allowed in SP operand",
+                   what);
+}
+
+/* Parse a constant immediate. A leading '#' is consumed if present but is
+ * not required (GNU as is equally lenient); the rest is handed to
+ * asm_driver_parse_const. Returns the value as an i64. */
+static i64 parse_imm_const(AsmDriver* d) {
+  (void)asm_driver_eat_punct(d, '#');
+  i64 value = asm_driver_parse_const(d);
+  return value;
+}
+
+/* Parse an operand that may reference a symbol. A leading '#' is consumed
+ * if present; the result is delivered through `sym_out` / `val_out` by
+ * asm_driver_parse_sym_expr. */
+static void parse_imm_sym(AsmDriver* d, ObjSymId* sym_out, i64* val_out) {
+  /* The hash is optional, so the return value is deliberately ignored. */
+  (void)asm_driver_eat_punct(d, '#');
+  asm_driver_parse_sym_expr(d, sym_out, val_out);
+}
+
+/* Emit one 32-bit instruction word, least-significant byte first, through
+ * the driver's machine-code emitter. */
+static void emit32(AsmDriver* d, u32 word) {
+  MCEmitter* mc = asm_driver_mc(d);
+  (void)asm_driver_cur_section(d);
+  u8 le[4];
+  for (u32 i = 0; i < 4u; ++i) {
+    le[i] = (u8)((word >> (8u * i)) & 0xff);
+  }
+  mc->emit_bytes(mc, le, 4);
+}
+
+/* Translate a condition-code identifier (case-insensitive) into its 4-bit
+ * value: eq ne cs/hs cc/lo mi pl vs vc hi ls ge lt gt le al → 0..14.
+ * Returns 1 and writes *out on a match; returns 0 and leaves *out
+ * untouched otherwise. */
+static int parse_cond_from_ident(AsmDriver* d, Sym ident, u32* out) {
+  static const struct {
+    const char* name;
+    u32 code;
+  } conds[] = {
+      {"eq", 0},  {"ne", 1},  {"cs", 2},  {"hs", 2},  {"cc", 3},  {"lo", 3},
+      {"mi", 4},  {"pl", 5},  {"vs", 6},  {"vc", 7},  {"hi", 8},  {"ls", 9},
+      {"ge", 10}, {"lt", 11}, {"gt", 12}, {"le", 13}, {"al", 14},
+  };
+  size_t len = 0;
+  const char* text = pool_str(asm_driver_pool(d), ident, &len);
+  if (!text) return 0;
+  for (size_t k = 0; k < sizeof conds / sizeof conds[0]; ++k) {
+    if (icase_eq(text, len, conds[k].name)) {
+      *out = conds[k].code;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* Consume the next token as a condition code and return its value; panics
+ * (naming `what`) if the token is not a recognized condition. */
+static u32 parse_cond(AsmDriver* d, const char* what) {
+  AsmTok tok = asm_driver_next(d);
+  u32 code = 0;
+  int ok = tok.kind == ASM_TOK_IDENT &&
+           parse_cond_from_ident(d, tok.v.ident, &code);
+  if (!ok) asm_driver_panic(d, "asm: %s: expected condition code", what);
+  return code;
+}
+
+/* Consume a ',' or panic, naming `what` in the message. */
+static void expect_comma(AsmDriver* d, const char* what) {
+  if (asm_driver_eat_comma(d)) return;
+  asm_driver_panic(d, "asm: expected ',' (%s)", what);
+}
+
+/* ---- per-mnemonic parsers ---- */
+
+/* ret [Xn] — Xn defaults to x30.
+ *
+ * The operand must be a 64-bit general-purpose register. `sp` is rejected:
+ * register field 31 in RET means the zero register, so accepting the SP
+ * spelling would silently encode a different register than was written. */
+static void p_ret(AsmDriver* d) {
+  if (asm_driver_at_eol(d)) {
+    emit32(d, aa64_ret(30));
+    return;
+  }
+  AA64Reg r = parse_reg(d);
+  if (!r.is64) asm_driver_panic(d, "asm: ret: 64-bit register expected");
+  reject_sp_reg(d, r, "ret");
+  emit32(d, aa64_ret(r.num));
+}
+
+/* br Xn — indirect branch through a 64-bit general-purpose register.
+ * `sp` is rejected: register field 31 in BR means the zero register, so
+ * the SP spelling would silently encode a different register. */
+static void p_br(AsmDriver* d) {
+  AA64Reg r = parse_reg(d);
+  if (!r.is64) asm_driver_panic(d, "asm: br: 64-bit register expected");
+  reject_sp_reg(d, r, "br");
+  emit32(d, aa64_br(r.num));
+}
+
+/* blr Xn — indirect call through a 64-bit general-purpose register.
+ * `sp` is rejected: register field 31 in BLR means the zero register, so
+ * the SP spelling would silently encode a different register. */
+static void p_blr(AsmDriver* d) {
+  AA64Reg r = parse_reg(d);
+  if (!r.is64) asm_driver_panic(d, "asm: blr: 64-bit register expected");
+  reject_sp_reg(d, r, "blr");
+  emit32(d, aa64_blr(r.num));
+}
+
+/* nop — takes no operands; emits the NOP word. */
+static void p_nop(AsmDriver* d) {
+  emit32(d, aa64_nop());
+}
+
+/* Operand parser shared by the barrier mnemonics (DMB / DSB / ISB / CLREX).
+ *
+ *   <mnemonic>            ; no operand → AA64_BARRIER_OPT_SY
+ *   <mnemonic> <option>   ; sy, ish, ishld, ishst, nsh, nshld, nshst,
+ *                         ; osh, oshld, oshst (case-insensitive), plus
+ *                         ; ld / st when allow_dmb_ld_st is non-zero
+ *   <mnemonic> #imm4      ; numeric form, 0..15 (the '#' is optional)
+ *
+ * Returns the 4-bit option value. An unknown option name or an immediate
+ * outside 0..15 panics. */
+static u32 parse_barrier_option(AsmDriver* d, int allow_dmb_ld_st) {
+  const struct {
+    const char* name;
+    u32 value;
+    int ld_st_only; /* entry is honoured only when allow_dmb_ld_st is set */
+  } options[] = {
+      {"sy", AA64_BARRIER_OPT_SY, 0},       {"ish", AA64_BARRIER_OPT_ISH, 0},
+      {"ishld", AA64_BARRIER_OPT_ISHLD, 0}, {"ishst", AA64_BARRIER_OPT_ISHST, 0},
+      {"nsh", AA64_BARRIER_OPT_NSH, 0},     {"nshld", AA64_BARRIER_OPT_NSHLD, 0},
+      {"nshst", AA64_BARRIER_OPT_NSHST, 0}, {"osh", AA64_BARRIER_OPT_OSH, 0},
+      {"oshld", AA64_BARRIER_OPT_OSHLD, 0}, {"oshst", AA64_BARRIER_OPT_OSHST, 0},
+      {"ld", AA64_BARRIER_OPT_LD, 1},       {"st", AA64_BARRIER_OPT_ST, 1},
+  };
+
+  if (asm_driver_at_eol(d)) return AA64_BARRIER_OPT_SY;
+
+  AsmTok tok = asm_driver_peek(d);
+  if (tok.kind == ASM_TOK_IDENT) {
+    (void)asm_driver_next(d);
+    size_t len = 0;
+    const char* text = pool_str(asm_driver_pool(d), tok.v.ident, &len);
+    for (size_t k = 0; k < sizeof options / sizeof options[0]; ++k) {
+      if (options[k].ld_st_only && !allow_dmb_ld_st) continue;
+      if (icase_eq(text, len, options[k].name)) return options[k].value;
+    }
+    asm_driver_panic(d, "asm: unknown barrier option");
+  }
+
+  /* Numeric form: '#imm4'. */
+  i64 imm = parse_imm_const(d);
+  if (imm < 0 || imm > 15)
+    asm_driver_panic(d, "asm: barrier imm out of range");
+  return (u32)imm;
+}
+
+/* dmb [<option>|#imm4] — the ld/st option names are accepted. */
+static void p_dmb(AsmDriver* d) {
+  emit32(d, aa64_dmb(parse_barrier_option(d, /*allow_dmb_ld_st=*/1)));
+}
+/* dsb [<option>|#imm4] — DSB takes the same option names as DMB, including
+ * the bare `ld` and `st` forms, so they are accepted here too. */
+static void p_dsb(AsmDriver* d) {
+  u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/1);
+  emit32(d, aa64_dsb(opt));
+}
+/* isb [<option>|#imm4] — the ld/st option names are not accepted. */
+static void p_isb(AsmDriver* d) {
+  emit32(d, aa64_isb(parse_barrier_option(d, /*allow_dmb_ld_st=*/0)));
+}
+/* clrex [<option>|#imm4] — the ld/st option names are not accepted. */
+static void p_clrex(AsmDriver* d) {
+  emit32(d, aa64_clrex(parse_barrier_option(d, /*allow_dmb_ld_st=*/0)));
+}
+
+/* mov Rd, <src> — register move or single-instruction immediate move.
+ *
+ * mov Rd, Rm → aa64_mov_reg (ORR Rd, ZR, Rm); widths must match.
+ * mov with sp/wsp on either side → ADD Rd, Rn, #0; a zero-register
+ * spelling on the other side is rejected.
+ * mov Rd, #imm → MOVZ if the value, masked to the register width,
+ * occupies a single 16-bit halfword at any position;
+ * MOVN if the complement of the value does;
+ * otherwise: panic (multi-step expansion deferred).
+ * An identifier that is not a register spelling is parsed as a constant
+ * expression through parse_imm_const. */
+static void p_mov(AsmDriver* d) {
+ AA64Reg rd = parse_reg(d);
+ expect_comma(d, "mov");
+ AsmTok t = asm_driver_peek(d); /* peek only: the token is consumed below once it proves to be a register */
+ if (t.kind == ASM_TOK_IDENT) {
+ AA64Reg src;
+ memset(&src, 0, sizeof src);
+ if (parse_reg_from_ident(d, t.v.ident, &src)) {
+ (void)asm_driver_next(d);
+ if (src.is64 != rd.is64)
+ asm_driver_panic(d, "asm: mov: register width mismatch");
+ /* A move to or from SP is emitted as `ADD Rd, Rn, #0`, since
+ * register 31 in the ORR form would be the zero register. */
+ if (rd.is_sp || src.is_sp) {
+ require_sp_spelling(d, rd, "mov sp"); /* panics when rd is xzr/wzr */
+ require_sp_spelling(d, src, "mov sp"); /* panics when src is xzr/wzr */
+ emit32(d, aa64_add_imm(rd.is64, rd.num, src.num, 0, 0));
+ return;
+ }
+ emit32(d, aa64_mov_reg(rd.is64, rd.num, src.num));
+ return;
+ }
+ /* fall through: identifier that is not a register → treat as
+ * symbol/equate via expression below. */
+ }
+ /* Immediate. */
+ i64 imm = parse_imm_const(d);
+ if (rd.is_sp) asm_driver_panic(d, "asm: mov: cannot move imm into SP");
+ u64 uv = (u64)imm;
+ u64 mask = rd.is64 ? ~0ull : 0xffffffffull;
+ uv &= mask; /* W destinations only see the low 32 bits of the value */
+ /* Try MOVZ with each halfword position (4 for X, 2 for W). */
+ for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) {
+ u64 shift = (u64)hw * 16;
+ u64 hwmask = 0xffffull << shift;
+ if ((uv & ~hwmask) == 0) { /* every bit outside this halfword is zero */
+ u32 v = (u32)((uv >> shift) & 0xffff);
+ emit32(d, aa64_movz(rd.is64, rd.num, v, hw));
+ return;
+ }
+ }
+ /* Try MOVN with one halfword (encodes ~imm in that halfword). */
+ u64 nv = (~uv) & mask;
+ for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) {
+ u64 shift = (u64)hw * 16;
+ u64 hwmask = 0xffffull << shift;
+ if ((nv & ~hwmask) == 0) { /* complement fits in this single halfword */
+ u32 v = (u32)((nv >> shift) & 0xffff);
+ emit32(d, aa64_movn(rd.is64, rd.num, v, hw));
+ return;
+ }
+ }
+ asm_driver_panic(d, "asm: mov: immediate cannot be encoded in one insn");
+}
+
+/* mvn Rd, Rm
+ *
+ * Both operands must be general registers of the same width. An SP
+ * spelling is rejected: only the register number is passed to the
+ * encoder, so `sp` would otherwise assemble as the zero register. */
+static void p_mvn(AsmDriver* d) {
+  AA64Reg rd = parse_reg(d);
+  expect_comma(d, "mvn");
+  AA64Reg rm = parse_reg(d);
+  if (rd.is_sp || rm.is_sp)
+    asm_driver_panic(d, "asm: mvn: SP register not allowed");
+  if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: mvn: width mismatch");
+  emit32(d, aa64_mvn(rd.is64, rd.num, rm.num));
+}
+
+/* movz / movn / movk Rd, #imm16[, lsl #shift]
+ *
+ * opc selects the instruction (AA64_MOVZ_OPC / AA64_MOVN_OPC /
+ * AA64_MOVK_OPC). Operand checks:
+ *   - Rd must not be spelled SP (register 31 here is the zero register);
+ *   - imm16 must lie in 0..0xffff (it used to be masked silently);
+ *   - shift must be 0/16/32/48, and only 0/16 for a W destination,
+ *     because hw<1> set with sf=0 is not a valid encoding. */
+static void p_movwide(AsmDriver* d, u32 opc) {
+  AA64Reg rd = parse_reg(d);
+  if (rd.is_sp)
+    asm_driver_panic(d, "asm: movz/n/k: SP register not allowed");
+  expect_comma(d, "movz/n/k");
+  i64 imm = parse_imm_const(d);
+  if (imm < 0 || imm > 0xffff)
+    asm_driver_panic(d, "asm: movz/n/k: imm16 out of range");
+  u32 hw = 0;
+  if (asm_driver_eat_comma(d)) {
+    /* lsl #N (N is 0/16/32/48). */
+    AsmTok lid = asm_driver_next(d);
+    if (lid.kind != ASM_TOK_IDENT)
+      asm_driver_panic(d, "asm: expected 'lsl'");
+    size_t ln = 0;
+    const char* lp = pool_str(asm_driver_pool(d), lid.v.ident, &ln);
+    if (!lp || !icase_eq(lp, ln, "lsl"))
+      asm_driver_panic(d, "asm: expected 'lsl'");
+    i64 sh = parse_imm_const(d);
+    if (sh < 0 || sh > 48 || sh % 16 != 0)
+      asm_driver_panic(d, "asm: movz/n/k: bad lsl shift");
+    if (!rd.is64 && sh > 16)
+      asm_driver_panic(d, "asm: movz/n/k: lsl #32/#48 needs a 64-bit register");
+    hw = (u32)(sh / 16);
+  }
+  u32 word = ((rd.is64 & 1u) << 31) | ((opc & 3u) << 29) |
+             AA64_MOVEWIDE_FAMILY_MATCH | ((hw & 3u) << 21) |
+             (((u32)imm & 0xffffu) << 5) | (rd.num & 0x1fu);
+  emit32(d, word);
+}
+
+/* svc / brk / hlt #imm16
+ *
+ * form selects the instruction: 0 = svc, 1 = brk, 2 = hlt; any other
+ * value panics. The immediate is a 16-bit field, so values outside
+ * 0..0xffff are rejected instead of being truncated. */
+static void p_except(AsmDriver* d, u32 form) {
+  i64 imm = parse_imm_const(d);
+  if (imm < 0 || imm > 0xffff)
+    asm_driver_panic(d, "asm: svc/brk/hlt: imm16 out of range");
+  switch (form) {
+    case 0: emit32(d, aa64_svc((u32)imm)); break;
+    case 1: emit32(d, aa64_brk((u32)imm)); break;
+    case 2: {
+      /* HLT: exception family with opc field (bits 23:21) = 2. */
+      u32 word = AA64_EXCEPT_FAMILY_MATCH | ((u32)2 << 21) |
+                 (((u32)imm & 0xffffu) << 5);
+      emit32(d, word);
+      break;
+    }
+    default: asm_driver_panic(d, "asm: bad exception form");
+  }
+}
+
+/* Read an optional `lsl|lsr|asr|ror #imm` shift modifier. The caller
+ * has already consumed the separating comma.
+ *
+ * Returns 1 and fills *shift_out (0=lsl, 1=lsr, 2=asr, 3=ror) and
+ * *imm6_out (0..63) when a modifier is present. Returns 0 without
+ * consuming any token and without touching the outputs when the next
+ * token is not one of the four shift keywords. A shift amount outside
+ * 0..63 panics; width-specific limits are left to the caller. */
+static int parse_shift_mod(AsmDriver* d, u32* shift_out, u32* imm6_out) {
+  AsmTok t = asm_driver_peek(d);
+  if (t.kind != ASM_TOK_IDENT) return 0;
+  size_t n = 0;
+  const char* p = pool_str(asm_driver_pool(d), t.v.ident, &n);
+  /* Same NULL guard the other pool_str call sites in this file use. */
+  if (!p) return 0;
+  u32 sh;
+  if (icase_eq(p, n, "lsl")) sh = 0;
+  else if (icase_eq(p, n, "lsr")) sh = 1;
+  else if (icase_eq(p, n, "asr")) sh = 2;
+  else if (icase_eq(p, n, "ror")) sh = 3;
+  else return 0;
+  (void)asm_driver_next(d);
+  i64 imm = parse_imm_const(d);
+  if (imm < 0 || imm > 63)
+    asm_driver_panic(d, "asm: shift amount out of range");
+  *shift_out = sh;
+  *imm6_out = (u32)imm;
+  return 1;
+}
+
+/* add / sub family.
+ * Forms:
+ *   add Rd, Rn, Rm[, lsl|lsr|asr #s]   shifted-register
+ *   add Rd, Rn, #imm                   immediate (0..0xfff)
+ *   add Rd, Rn, #imm, lsl #12          immediate w/ shift
+ * is_sub selects SUB over ADD; set_flags selects the S-suffixed
+ * (adds/subs) variant.
+ *
+ * Immediate form: Rn, and Rd when flags are not set, may be SP but must
+ * then be spelled as SP. Register form: no operand may be SP, `ror` is
+ * rejected (shift field 3 is reserved for add/sub), and a W-register
+ * shift amount must be below 32. */
+static void p_addsub(AsmDriver* d, int is_sub, int set_flags) {
+  AA64Reg rd = parse_reg(d);
+  expect_comma(d, "add/sub");
+  AA64Reg rn = parse_reg(d);
+  expect_comma(d, "add/sub");
+  AsmTok t = asm_driver_peek(d);
+  if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') ||
+      tok_punct(t, '+')) {
+    /* immediate form */
+    if (rd.is64 != rn.is64)
+      asm_driver_panic(d, "asm: add/sub imm: width mismatch");
+    require_sp_spelling(d, rn, "add/sub imm");
+    if (set_flags) {
+      reject_sp_reg(d, rd, "add/sub imm");
+    } else {
+      require_sp_spelling(d, rd, "add/sub imm");
+    }
+    i64 imm = parse_imm_const(d);
+    u32 sh = 0;
+    if (asm_driver_eat_comma(d)) {
+      AsmTok lid = asm_driver_next(d);
+      if (lid.kind != ASM_TOK_IDENT)
+        asm_driver_panic(d, "asm: expected 'lsl #12'");
+      size_t ln = 0;
+      const char* lp = pool_str(asm_driver_pool(d), lid.v.ident, &ln);
+      if (!lp || !icase_eq(lp, ln, "lsl"))
+        asm_driver_panic(d, "asm: expected 'lsl'");
+      i64 s = parse_imm_const(d);
+      if (s == 12) sh = 1;
+      else if (s == 0) sh = 0;
+      else asm_driver_panic(d, "asm: add/sub imm: lsl must be 0 or 12");
+    }
+    if (imm < 0 || imm > 0xfff)
+      asm_driver_panic(d, "asm: add/sub imm out of range");
+    u32 word = aa64_addsubimm_pack((AA64AddSubImm){
+        .sf = rd.is64, .op = (u32)is_sub, .S = (u32)set_flags, .sh = sh,
+        .imm12 = (u32)imm, .Rn = rn.num, .Rd = rd.num});
+    emit32(d, word);
+    return;
+  }
+  /* register form */
+  AA64Reg rm = parse_reg(d);
+  reject_sp_reg(d, rd, "add/sub reg");
+  reject_sp_reg(d, rn, "add/sub reg");
+  reject_sp_reg(d, rm, "add/sub reg");
+  if (rd.is64 != rm.is64 || rd.is64 != rn.is64)
+    asm_driver_panic(d, "asm: add/sub reg: width mismatch");
+  u32 shift = 0, imm6 = 0;
+  if (asm_driver_eat_comma(d)) {
+    if (!parse_shift_mod(d, &shift, &imm6))
+      asm_driver_panic(d, "asm: add/sub reg: expected shift modifier");
+  }
+  /* shift == 3 (ROR) is a reserved encoding for add/sub. */
+  if (shift == 3u)
+    asm_driver_panic(d, "asm: add/sub reg: ror shift not allowed");
+  /* With sf == 0 the top bit of imm6 must be clear. */
+  if (!rd.is64 && imm6 > 31u)
+    asm_driver_panic(d, "asm: add/sub reg: shift amount out of range");
+  u32 word = aa64_addsubsr_pack((AA64AddSubSR){
+      .sf = rd.is64, .op = (u32)is_sub, .S = (u32)set_flags,
+      .shift = shift, .Rm = rm.num, .imm6 = imm6, .Rn = rn.num,
+      .Rd = rd.num});
+  emit32(d, word);
+}
+
+/* cmp / cmn:
+ *   cmp Rn, Rm[, lsl|lsr|asr #s]   → SUBS ZR, Rn, Rm, ...
+ *   cmp Rn, #imm[, lsl #12]        → SUBS ZR, Rn, #imm
+ * is_neg != 0 selects cmn, which flips the op bit (ADDS instead of SUBS).
+ *
+ * In the register form a trailing comma must be followed by a shift
+ * modifier; `ror` is rejected (reserved for add/sub) and a W-register
+ * shift amount must be below 32. */
+static void p_cmp(AsmDriver* d, int is_neg /* cmn flips op */) {
+  AA64Reg rn = parse_reg(d);
+  expect_comma(d, "cmp");
+  AsmTok t = asm_driver_peek(d);
+  if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') ||
+      tok_punct(t, '+')) {
+    /* immediate form: Rn may be SP but must be spelled as SP. */
+    require_sp_spelling(d, rn, "cmp imm");
+    i64 imm = parse_imm_const(d);
+    u32 sh = 0;
+    if (asm_driver_eat_comma(d)) {
+      AsmTok lid = asm_driver_next(d);
+      size_t ln = 0;
+      const char* lp =
+          (lid.kind == ASM_TOK_IDENT)
+              ? pool_str(asm_driver_pool(d), lid.v.ident, &ln)
+              : NULL;
+      if (!lp || !icase_eq(lp, ln, "lsl"))
+        asm_driver_panic(d, "asm: cmp imm: expected 'lsl'");
+      i64 s = parse_imm_const(d);
+      if (s == 12) sh = 1;
+      else if (s != 0)
+        asm_driver_panic(d, "asm: cmp imm: lsl must be 0 or 12");
+    }
+    if (imm < 0 || imm > 0xfff)
+      asm_driver_panic(d, "asm: cmp imm out of range");
+    u32 word = aa64_addsubimm_pack(
+        (AA64AddSubImm){.sf = rn.is64, .op = (u32)(!is_neg), .S = 1,
+                        .sh = sh, .imm12 = (u32)imm, .Rn = rn.num,
+                        .Rd = AA64_ZR});
+    emit32(d, word);
+    return;
+  }
+  /* register form */
+  AA64Reg rm = parse_reg(d);
+  reject_sp_reg(d, rn, "cmp reg");
+  reject_sp_reg(d, rm, "cmp reg");
+  if (rm.is64 != rn.is64) asm_driver_panic(d, "asm: cmp: width mismatch");
+  u32 shift = 0, imm6 = 0;
+  if (asm_driver_eat_comma(d)) {
+    /* A comma promises a modifier; do not silently accept anything else. */
+    if (!parse_shift_mod(d, &shift, &imm6))
+      asm_driver_panic(d, "asm: cmp reg: expected shift modifier");
+  }
+  if (shift == 3u)
+    asm_driver_panic(d, "asm: cmp reg: ror shift not allowed");
+  if (!rn.is64 && imm6 > 31u)
+    asm_driver_panic(d, "asm: cmp reg: shift amount out of range");
+  u32 word = aa64_addsubsr_pack((AA64AddSubSR){
+      .sf = rn.is64, .op = (u32)(!is_neg), .S = 1, .shift = shift,
+      .Rm = rm.num, .imm6 = imm6, .Rn = rn.num, .Rd = AA64_ZR});
+  emit32(d, word);
+}
+
+/* csinc Rd, Rn, Rm, cond
+ *
+ * All three registers must share one width and none may be SP. The
+ * instruction word is assembled by hand on top of the 0x1A800400 base
+ * pattern. */
+static void p_csinc(AsmDriver* d) {
+  AA64Reg dst = parse_reg(d);
+  expect_comma(d, "csinc");
+  AA64Reg src_n = parse_reg(d);
+  expect_comma(d, "csinc");
+  AA64Reg src_m = parse_reg(d);
+  expect_comma(d, "csinc");
+  u32 cc = parse_cond(d, "csinc");
+
+  if (dst.is_sp || src_n.is_sp || src_m.is_sp)
+    asm_driver_panic(d, "asm: csinc: SP register not allowed");
+  if (dst.is64 != src_n.is64 || dst.is64 != src_m.is64)
+    asm_driver_panic(d, "asm: csinc: width mismatch");
+
+  u32 insn = 0x1A800400u;
+  insn |= (u32)dst.is64 << 31;      /* sf */
+  insn |= (src_m.num & 0x1fu) << 16; /* Rm */
+  insn |= (cc & 0xfu) << 12;         /* cond */
+  insn |= (src_n.num & 0x1fu) << 5;  /* Rn */
+  insn |= dst.num & 0x1fu;           /* Rd */
+  emit32(d, insn);
+}
+
+/* neg / negs Rd, Rm[, lsl|lsr|asr #s] → SUB / SUBS Rd, ZR, Rm, ...
+ *
+ * set_flags selects negs. No operand may be SP. A trailing comma must be
+ * followed by a shift modifier; `ror` is rejected (reserved for add/sub)
+ * and a W-register shift amount must be below 32. */
+static void p_neg(AsmDriver* d, int set_flags) {
+  AA64Reg rd = parse_reg(d);
+  expect_comma(d, "neg");
+  AA64Reg rm = parse_reg(d);
+  reject_sp_reg(d, rd, "neg");
+  reject_sp_reg(d, rm, "neg");
+  if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: neg: width mismatch");
+  u32 shift = 0, imm6 = 0;
+  if (asm_driver_eat_comma(d)) {
+    /* A comma promises a modifier; do not silently accept anything else. */
+    if (!parse_shift_mod(d, &shift, &imm6))
+      asm_driver_panic(d, "asm: neg: expected shift modifier");
+  }
+  if (shift == 3u) asm_driver_panic(d, "asm: neg: ror shift not allowed");
+  if (!rd.is64 && imm6 > 31u)
+    asm_driver_panic(d, "asm: neg: shift amount out of range");
+  u32 word = aa64_addsubsr_pack((AA64AddSubSR){
+      .sf = rd.is64, .op = 1, .S = (u32)set_flags, .shift = shift,
+      .Rm = rm.num, .imm6 = imm6, .Rn = AA64_ZR, .Rd = rd.num});
+  emit32(d, word);
+}
+
+/* Logical shifted-register family:
+ *   <op> Rd, Rn, Rm[, lsl|lsr|asr|ror #s]
+ *
+ * opc picks the operation (AA64_LOG_*_OPC) and N the inverted-operand
+ * variant (bic/orn/eon/bics pass N=1). No operand may be SP: only the
+ * register number reaches the encoder, so `sp` would assemble as the
+ * zero register. A trailing comma must be followed by a shift modifier,
+ * and a W-register shift amount must be below 32. */
+static void p_log_sr(AsmDriver* d, u32 opc, u32 N) {
+  AA64Reg rd = parse_reg(d);
+  expect_comma(d, "logical");
+  AA64Reg rn = parse_reg(d);
+  expect_comma(d, "logical");
+  AA64Reg rm = parse_reg(d);
+  if (rd.is_sp || rn.is_sp || rm.is_sp)
+    asm_driver_panic(d, "asm: logical: SP register not allowed");
+  if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
+    asm_driver_panic(d, "asm: logical: width mismatch");
+  u32 shift = 0, imm6 = 0;
+  if (asm_driver_eat_comma(d)) {
+    /* A comma promises a modifier; do not silently accept anything else. */
+    if (!parse_shift_mod(d, &shift, &imm6))
+      asm_driver_panic(d, "asm: logical: expected shift modifier");
+  }
+  if (!rd.is64 && imm6 > 31u)
+    asm_driver_panic(d, "asm: logical: shift amount out of range");
+  u32 word = aa64_logsr_pack((AA64LogSR){
+      .sf = rd.is64, .opc = opc, .shift = shift, .N = N, .Rm = rm.num,
+      .imm6 = imm6, .Rn = rn.num, .Rd = rd.num});
+  emit32(d, word);
+}
+
+/* Data-processing 3-source: madd/msub Rd, Rn, Rm, Ra.
+ *
+ * o0 selects the operation (0 = madd, 1 = msub). All four registers must
+ * share one width. SP spellings are rejected: only the register number
+ * reaches the encoder, so `sp` would assemble as the zero register. */
+static void p_dp3(AsmDriver* d, u32 o0) {
+  AA64Reg rd = parse_reg(d);
+  expect_comma(d, "dp3");
+  AA64Reg rn = parse_reg(d);
+  expect_comma(d, "dp3");
+  AA64Reg rm = parse_reg(d);
+  expect_comma(d, "dp3");
+  AA64Reg ra = parse_reg(d);
+  if (rd.is_sp || rn.is_sp || rm.is_sp || ra.is_sp)
+    asm_driver_panic(d, "asm: dp3: SP register not allowed");
+  if (rd.is64 != rn.is64 || rd.is64 != rm.is64 || rd.is64 != ra.is64)
+    asm_driver_panic(d, "asm: dp3: width mismatch");
+  u32 word = aa64_dp3_pack((AA64DP3){
+      .sf = rd.is64, .op31 = 0, .o0 = o0, .Rm = rm.num, .Ra = ra.num,
+      .Rn = rn.num, .Rd = rd.num});
+  emit32(d, word);
+}
+
+/* mul Rd, Rn, Rm → MADD Rd, Rn, Rm, ZR (o0 = 0)
+ * mneg Rd, Rn, Rm → same packing with o0 = 1
+ *
+ * All three registers must share one width. SP spellings are rejected:
+ * only the register number reaches the encoder, so `sp` would assemble
+ * as the zero register. */
+static void p_mul(AsmDriver* d, u32 o0) {
+  AA64Reg rd = parse_reg(d);
+  expect_comma(d, "mul");
+  AA64Reg rn = parse_reg(d);
+  expect_comma(d, "mul");
+  AA64Reg rm = parse_reg(d);
+  if (rd.is_sp || rn.is_sp || rm.is_sp)
+    asm_driver_panic(d, "asm: mul: SP register not allowed");
+  if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
+    asm_driver_panic(d, "asm: mul: width mismatch");
+  u32 word = aa64_dp3_pack((AA64DP3){
+      .sf = rd.is64, .op31 = 0, .o0 = o0, .Rm = rm.num, .Ra = AA64_ZR,
+      .Rn = rn.num, .Rd = rd.num});
+  emit32(d, word);
+}
+
+/* DP2: udiv/sdiv/lslv/lsrv/asrv/rorv Rd, Rn, Rm.
+ *
+ * opcode is the AA64_DP2_*_OP selector. All three registers must share
+ * one width. SP spellings are rejected: only the register number reaches
+ * the encoder, so `sp` would assemble as the zero register. */
+static void p_dp2(AsmDriver* d, u32 opcode) {
+  AA64Reg rd = parse_reg(d);
+  expect_comma(d, "dp2");
+  AA64Reg rn = parse_reg(d);
+  expect_comma(d, "dp2");
+  AA64Reg rm = parse_reg(d);
+  if (rd.is_sp || rn.is_sp || rm.is_sp)
+    asm_driver_panic(d, "asm: dp2: SP register not allowed");
+  if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
+    asm_driver_panic(d, "asm: dp2: width mismatch");
+  u32 word = aa64_dp2_pack((AA64DP2){.sf = rd.is64, .opcode = opcode,
+                                     .Rm = rm.num, .Rn = rn.num,
+                                     .Rd = rd.num});
+  emit32(d, word);
+}
+
+/* Branch immediate / conditional / compare-and-branch. */
+
+/* Emit B (op_bl == 0) or BL (op_bl != 0) with a zero imm26 and attach a
+ * JUMP26 / CALL26 relocation against `target` + `addend`.
+ *
+ * A branch with no symbol (target == OBJ_SYM_NONE) is rejected after the
+ * placeholder word has been emitted; const_disp is accepted for that
+ * case but currently unused. */
+static void emit_branch_imm(AsmDriver* d, u32 op_bl, ObjSymId target,
+                            i64 addend, i64 const_disp) {
+  MCEmitter* mc = asm_driver_mc(d);
+  emit32(d, aa64_brimm_pack((AA64BrImm){.op = op_bl, .imm26 = 0}));
+  /* The word just written starts four bytes behind the cursor. */
+  const u32 insn_ofs = mc->pos(mc) - 4;
+  const RelocKind kind = op_bl ? R_AARCH64_CALL26 : R_AARCH64_JUMP26;
+  if (target == OBJ_SYM_NONE) {
+    /* Pure constant displacement is rare in real .s; reject it. The
+     * recommended form is a label, letting the assembler compute the
+     * displacement. */
+    (void)const_disp;
+    asm_driver_panic(d, "asm: branch with pure constant disp not supported");
+  } else {
+    mc->emit_reloc_at(mc, asm_driver_cur_section(d), insn_ofs, kind, target,
+                      addend, 1, 0);
+  }
+}
+
+/* b / bl <sym>[+off]: op_bl == 0 emits B, non-zero emits BL. The target
+ * must resolve to a symbol; the offset becomes the relocation addend. */
+static void p_b(AsmDriver* d, u32 op_bl) {
+  ObjSymId target = OBJ_SYM_NONE;
+  i64 addend = 0;
+  /* GNU as accepts `b sym`, `bl sym+8`, etc. */
+  parse_imm_sym(d, &target, &addend);
+  if (target == OBJ_SYM_NONE)
+    asm_driver_panic(d, "asm: b/bl: symbolic target required");
+  emit_branch_imm(d, op_bl, target, addend, 0);
+}
+
+/* b.<cond> <sym>[+off]: emit the conditional branch with imm19 = 0 and
+ * an R_AARCH64_CONDBR19 relocation against the symbol. `cond` is the
+ * 4-bit condition code. */
+static void p_b_cond(AsmDriver* d, u32 cond) {
+  ObjSymId target = OBJ_SYM_NONE;
+  i64 addend = 0;
+  parse_imm_sym(d, &target, &addend);
+  if (target == OBJ_SYM_NONE)
+    asm_driver_panic(d, "asm: b.cond: symbolic target required");
+
+  emit32(d, aa64_brcond_pack((AA64BrCond){.imm19 = 0, .cond = cond}));
+
+  MCEmitter* mc = asm_driver_mc(d);
+  /* The word just written starts four bytes behind the cursor. */
+  const u32 insn_ofs = mc->pos(mc) - 4;
+  mc->emit_reloc_at(mc, asm_driver_cur_section(d), insn_ofs,
+                    R_AARCH64_CONDBR19, target, addend, 1, 0);
+}
+
+/* cbz / cbnz Rt, <sym>[+off]: op == 0 emits CBZ, op == 1 emits CBNZ.
+ * The word carries imm19 = 0 and an R_AARCH64_CONDBR19 relocation
+ * against the symbol. */
+static void p_cbz(AsmDriver* d, u32 op) {
+  AA64Reg rt = parse_reg(d);
+  expect_comma(d, "cbz");
+
+  ObjSymId target = OBJ_SYM_NONE;
+  i64 addend = 0;
+  parse_imm_sym(d, &target, &addend);
+  if (target == OBJ_SYM_NONE)
+    asm_driver_panic(d, "asm: cbz: symbolic target required");
+
+  const AA64CB fields = {.sf = rt.is64, .op = op, .imm19 = 0, .Rt = rt.num};
+  emit32(d, aa64_cb_pack(fields));
+
+  MCEmitter* mc = asm_driver_mc(d);
+  /* The word just written starts four bytes behind the cursor. */
+  const u32 insn_ofs = mc->pos(mc) - 4;
+  mc->emit_reloc_at(mc, asm_driver_cur_section(d), insn_ofs,
+                    R_AARCH64_CONDBR19, target, addend, 1, 0);
+}
+
+/* Parsed memory operand, as produced by parse_mem for the forms
+ * [Xn], [Xn, #imm] and [Xn, #imm]!.
+ *
+ * pre_index is 1 when the closing `]` was followed by `!` (pre-indexed).
+ * has_offset is 1 when an explicit `, imm` followed the base register.
+ * imm is the literal byte offset (no scaling). */
+typedef struct AA64Mem {
+ AA64Reg base;
+ i64 imm; /* byte offset (literal as written) */
+ u8 pre_index;
+ u8 has_offset;
+ u8 pad[2];
+} AA64Mem;
+
+/* Parse `[Xn]`, `[Xn, imm]` or either form followed by `!`.
+ *
+ * The base must be a 64-bit register; register 31 must be spelled as SP.
+ * Returns the operand with pre_index / has_offset set for the pieces
+ * that were present. Panics on a missing bracket. */
+static AA64Mem parse_mem(AsmDriver* d) {
+  AA64Mem mem;
+  memset(&mem, 0, sizeof mem);
+
+  if (!asm_driver_eat_punct(d, '['))
+    asm_driver_panic(d, "asm: expected '['");
+
+  mem.base = parse_reg(d);
+  if (!mem.base.is64)
+    asm_driver_panic(d, "asm: ldr/str: base register must be 64-bit");
+  require_sp_spelling(d, mem.base, "ldr/str base");
+
+  /* Optional `, imm` byte offset. */
+  if (asm_driver_eat_comma(d)) {
+    mem.has_offset = 1;
+    mem.imm = parse_imm_const(d);
+  }
+
+  if (!asm_driver_eat_punct(d, ']'))
+    asm_driver_panic(d, "asm: expected ']'");
+
+  /* A trailing `!` marks the pre-indexed form. */
+  mem.pre_index = asm_driver_eat_punct(d, '!') ? 1 : 0;
+  return mem;
+}
+
+/* ldr/str Rt, [Xn, #imm]
+ *
+ * Encoding choice, in order:
+ *   1. scaled unsigned imm12 when the offset is non-negative, a multiple
+ *      of the access size and at most 0xfff units;
+ *   2. unscaled signed imm9 (the LDUR/STUR form) when it is in -256..255;
+ *   3. otherwise panic.
+ * The pre-indexed form (`]!`) is not supported yet and panics. */
+static void p_ldr_str(AsmDriver* d, int is_load) {
+  AA64Reg rt = parse_reg(d);
+  reject_sp_reg(d, rt, "ldr/str");
+  expect_comma(d, "ldr/str");
+  AA64Mem m = parse_mem(d);
+
+  if (m.pre_index)
+    asm_driver_panic(d, "asm: ldr/str: pre-indexed form not yet supported");
+
+  const u32 size = rt.is64 ? 3u : 2u; /* log2 of the access size in bytes */
+  const u32 opc = is_load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR;
+
+  /* Scaled unsigned-imm12 first. */
+  if (m.imm >= 0) {
+    const u64 uoff = (u64)m.imm;
+    const u64 scale = (u64)1 << size;
+    if (uoff % scale == 0 && uoff / scale <= 0xfff) {
+      emit32(d, aa64_ldst_uimm_pack((AA64LdStUimm){
+                    .size = size, .V = 0, .opc = opc,
+                    .imm12 = (u32)(uoff / scale),
+                    .Rn = m.base.num, .Rt = rt.num}));
+      return;
+    }
+  }
+
+  /* Fall back to unscaled signed-imm9 (LDUR/STUR). */
+  if (m.imm >= -256 && m.imm <= 255) {
+    emit32(d, aa64_ldst_simm9_pack((AA64LdStSimm9){
+                  .size = size, .V = 0, .opc = opc,
+                  .imm9 = (u32)((u64)m.imm & 0x1ffu),
+                  .Rn = m.base.num, .Rt = rt.num}));
+    return;
+  }
+
+  asm_driver_panic(d, "asm: ldr/str: immediate out of range");
+}
+
+/* ldur/stur Rt, [Xn{, #imm}] — unscaled signed-imm9.
+ *
+ * The offset must lie in -256..255. A trailing `!` is rejected: the
+ * word emitted here is the plain unscaled-offset form, so accepting the
+ * marker would silently drop the requested writeback. */
+static void p_ldur_stur(AsmDriver* d, int is_load) {
+  AA64Reg rt = parse_reg(d);
+  reject_sp_reg(d, rt, "ldur/stur");
+  expect_comma(d, "ldur/stur");
+  AA64Mem m = parse_mem(d);
+  if (m.pre_index)
+    asm_driver_panic(d, "asm: ldur/stur: pre-indexed form not supported");
+  u32 size = rt.is64 ? 3u : 2u;
+  if (m.imm < -256 || m.imm > 255)
+    asm_driver_panic(d, "asm: ldur/stur: imm9 out of range");
+  /* Two's-complement truncation to the 9-bit field. */
+  u32 imm9 = (u32)((u64)m.imm & 0x1ffu);
+  u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){
+      .size = size, .V = 0,
+      .opc = is_load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR,
+      .imm9 = imm9, .Rn = m.base.num, .Rt = rt.num});
+  emit32(d, word);
+}
+
+/* ldp / stp Rt, Rt2, [Xn, #imm] or [Xn, #imm]!
+ *
+ * Both transfer registers must agree in width and register class and
+ * may not be SP. The byte offset must be a multiple of the element size
+ * (8 for 64-bit registers, otherwise 4) and, once scaled, fit the signed
+ * 7-bit field. `]!` selects the pre-indexed packer, otherwise the
+ * signed-offset packer is used. */
+static void p_ldp_stp(AsmDriver* d, int is_load) {
+  AA64Reg rt = parse_ldstp_reg(d);
+  expect_comma(d, "ldp/stp");
+  AA64Reg rt2 = parse_ldstp_reg(d);
+  expect_comma(d, "ldp/stp");
+  reject_sp_reg(d, rt, "ldp/stp");
+  reject_sp_reg(d, rt2, "ldp/stp");
+  if (rt.is_fp != rt2.is_fp || rt.is64 != rt2.is64)
+    asm_driver_panic(d, "asm: ldp/stp: width mismatch");
+
+  AA64Mem m = parse_mem(d);
+  const i64 elem = rt.is64 ? 8 : 4;
+  /* Unsigned modulo also handles negative offsets: elem is a power of 2. */
+  if ((u64)m.imm % (u64)elem != 0)
+    asm_driver_panic(d, "asm: ldp/stp: imm not scale-aligned");
+  const i64 scaled = m.imm / elem;
+  if (scaled < -64 || scaled > 63)
+    asm_driver_panic(d, "asm: ldp/stp: imm7 out of range");
+
+  u32 opc;
+  if (rt.is_fp)
+    opc = 1u;
+  else
+    opc = rt.is64 ? 2u : 0u;
+
+  AA64LdStPPre f = {.opc = opc,
+                    .V = rt.is_fp ? 1u : 0u,
+                    .L = is_load ? 1u : 0u,
+                    .imm7 = (u32)scaled & 0x7fu,
+                    .Rt2 = rt2.num,
+                    .Rn = m.base.num,
+                    .Rt = rt.num};
+  if (!m.pre_index) {
+    emit32(d, aa64_ldstp_soff_pack(f));
+    return;
+  }
+  emit32(d, aa64_ldstp_pre_pack(f));
+}
+
+/* adr / adrp Xd, sym[+off]
+ *
+ * is_adrp selects ADRP (page-relative, R_AARCH64_ADR_PREL_PG_HI21) over
+ * ADR (R_AARCH64_ADR_PREL_LO21). The word is emitted with a zero
+ * immediate and the relocation carries the symbol and addend.
+ *
+ * The destination must be a 64-bit general register. The encoding has no
+ * width bit and register 31 is not SP here, so a W or SP spelling would
+ * otherwise be mis-assembled silently. */
+static void p_adr(AsmDriver* d, int is_adrp) {
+  AA64Reg rd = parse_reg(d);
+  if (!rd.is64 || rd.is_sp)
+    asm_driver_panic(d, "asm: adr/adrp: destination must be a 64-bit register");
+  expect_comma(d, "adr");
+  ObjSymId sym = OBJ_SYM_NONE;
+  i64 off = 0;
+  parse_imm_sym(d, &sym, &off);
+  if (sym == OBJ_SYM_NONE)
+    asm_driver_panic(d, "asm: adr/adrp: symbol required");
+  AA64PCRelAdr f = {.op = is_adrp ? AA64_ADR_OP_ADRP : AA64_ADR_OP_ADR,
+                    .immlo = 0, .immhi = 0, .Rd = rd.num};
+  emit32(d, aa64_pcrel_adr_pack(f));
+  MCEmitter* mc = asm_driver_mc(d);
+  /* The word just written starts four bytes behind the cursor. */
+  u32 ofs = mc->pos(mc) - 4;
+  RelocKind k = is_adrp ? R_AARCH64_ADR_PREL_PG_HI21 : R_AARCH64_ADR_PREL_LO21;
+  mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, sym, off, 1, 0);
+}
+
+/* ---- mnemonic dispatch table ---- */
+
+/* Uniform parser entry point: consumes the operands that follow a
+ * mnemonic from the driver and emits the encoded instruction. */
+typedef void (*P_Fn)(AsmDriver*);
+
+/* One row of the mnemonic dispatch table: `name` is matched against the
+ * mnemonic case-insensitively by aa64_asm_insn, which then calls `fn`. */
+typedef struct AA64Mn {
+ const char* name;
+ P_Fn fn;
+ u32 arg; /* per-fn discriminator (alias parameter) */
+ /* NOTE(review): aa64_asm_insn never reads `arg` and every kTable row
+ * sets it to 0; the discriminators live in the wrapper functions. */
+} AA64Mn;
+
+/* Wrapper functions for the discriminator-taking parsers, since the
+ * table holds a uniform P_Fn pointer. Each wrapper binds one fixed
+ * argument tuple to a shared parser so it can sit in kTable. */
+
+/* add/sub family: p_addsub(d, is_sub, set_flags). */
+static void p_addsub_add(AsmDriver* d) { p_addsub(d, /*is_sub=*/0, 0); }
+static void p_addsub_adds(AsmDriver* d) { p_addsub(d, 0, 1); }
+static void p_addsub_sub(AsmDriver* d) { p_addsub(d, 1, 0); }
+static void p_addsub_subs(AsmDriver* d) { p_addsub(d, 1, 1); }
+/* cmp/cmn: p_cmp(d, is_neg); cmn passes 1. */
+static void p_cmp_w(AsmDriver* d) { p_cmp(d, 0); }
+static void p_cmn_w(AsmDriver* d) { p_cmp(d, 1); }
+static void p_csinc_(AsmDriver* d) { p_csinc(d); }
+/* neg/negs: p_neg(d, set_flags). */
+static void p_neg_w(AsmDriver* d) { p_neg(d, 0); }
+static void p_negs_w(AsmDriver* d) { p_neg(d, 1); }
+/* Logical shifted-register: p_log_sr(d, opc, N); N=1 selects the
+ * bic/orn/eon/bics variants. */
+static void p_and_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 0); }
+static void p_bic_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 1); }
+static void p_orr_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 0); }
+static void p_orn_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 1); }
+static void p_eor_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 0); }
+static void p_eon_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 1); }
+static void p_ands_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 0); }
+static void p_bics_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 1); }
+/* Multiply family: p_dp3 / p_mul take the o0 bit (1 = msub / mneg). */
+static void p_madd(AsmDriver* d) { p_dp3(d, 0); }
+static void p_msub(AsmDriver* d) { p_dp3(d, 1); }
+static void p_mul_w(AsmDriver* d) { p_mul(d, 0); }
+static void p_mneg_w(AsmDriver* d) { p_mul(d, 1); }
+/* Two-source data processing: p_dp2(d, opcode). */
+static void p_udiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_UDIV_OP); }
+static void p_sdiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_SDIV_OP); }
+static void p_lslv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSLV_OP); }
+static void p_lsrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSRV_OP); }
+static void p_asrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_ASRV_OP); }
+static void p_rorv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_RORV_OP); }
+/* Branches: p_b(d, op_bl) and p_cbz(d, op); 1 selects bl / cbnz. */
+static void p_b_(AsmDriver* d) { p_b(d, 0); }
+static void p_bl_(AsmDriver* d) { p_b(d, 1); }
+static void p_cbz_(AsmDriver* d) { p_cbz(d, 0); }
+static void p_cbnz_(AsmDriver* d) { p_cbz(d, 1); }
+/* Move-wide: p_movwide(d, opc). */
+static void p_movz_(AsmDriver* d) { p_movwide(d, AA64_MOVZ_OPC); }
+static void p_movn_(AsmDriver* d) { p_movwide(d, AA64_MOVN_OPC); }
+static void p_movk_(AsmDriver* d) { p_movwide(d, AA64_MOVK_OPC); }
+/* Exception generation: p_except(d, form) with 0=svc, 1=brk, 2=hlt. */
+static void p_svc_(AsmDriver* d) { p_except(d, 0); }
+static void p_brk_(AsmDriver* d) { p_except(d, 1); }
+static void p_hlt_(AsmDriver* d) { p_except(d, 2); }
+/* Loads/stores: the second argument is is_load. */
+static void p_ldr_(AsmDriver* d) { p_ldr_str(d, 1); }
+static void p_str_(AsmDriver* d) { p_ldr_str(d, 0); }
+static void p_ldur_(AsmDriver* d) { p_ldur_stur(d, 1); }
+static void p_stur_(AsmDriver* d) { p_ldur_stur(d, 0); }
+static void p_ldp_(AsmDriver* d) { p_ldp_stp(d, 1); }
+static void p_stp_(AsmDriver* d) { p_ldp_stp(d, 0); }
+/* PC-relative address: p_adr(d, is_adrp). */
+static void p_adr_(AsmDriver* d) { p_adr(d, 0); }
+static void p_adrp_(AsmDriver* d) { p_adr(d, 1); }
+
+/* b.cond family. cond codes follow the standard ARMv8 numbering; each
+ * wrapper passes its 4-bit condition code to p_b_cond. The synonym
+ * pairs cs/hs and cc/lo share a code (2 and 3). */
+static void p_b_eq(AsmDriver* d) { p_b_cond(d, 0); }
+static void p_b_ne(AsmDriver* d) { p_b_cond(d, 1); }
+static void p_b_cs(AsmDriver* d) { p_b_cond(d, 2); }
+static void p_b_hs(AsmDriver* d) { p_b_cond(d, 2); }
+static void p_b_cc(AsmDriver* d) { p_b_cond(d, 3); }
+static void p_b_lo(AsmDriver* d) { p_b_cond(d, 3); }
+static void p_b_mi(AsmDriver* d) { p_b_cond(d, 4); }
+static void p_b_pl(AsmDriver* d) { p_b_cond(d, 5); }
+static void p_b_vs(AsmDriver* d) { p_b_cond(d, 6); }
+static void p_b_vc(AsmDriver* d) { p_b_cond(d, 7); }
+static void p_b_hi(AsmDriver* d) { p_b_cond(d, 8); }
+static void p_b_ls(AsmDriver* d) { p_b_cond(d, 9); }
+static void p_b_ge(AsmDriver* d) { p_b_cond(d, 10); }
+static void p_b_lt(AsmDriver* d) { p_b_cond(d, 11); }
+static void p_b_gt(AsmDriver* d) { p_b_cond(d, 12); }
+static void p_b_le(AsmDriver* d) { p_b_cond(d, 13); }
+static void p_b_al(AsmDriver* d) { p_b_cond(d, 14); }
+
+/* Mnemonic → parser dispatch table, scanned linearly by aa64_asm_insn
+ * with a case-insensitive name comparison. The row with a NULL name
+ * terminates the scan. The composite `b.<cond>` spellings appear as
+ * single names. */
+static const AA64Mn kTable[] = {
+ {"nop", p_nop, 0},
+ {"dmb", p_dmb, 0},
+ {"dsb", p_dsb, 0},
+ {"isb", p_isb, 0},
+ {"clrex", p_clrex, 0},
+ {"ret", p_ret, 0},
+ {"br", p_br, 0},
+ {"blr", p_blr, 0},
+ {"mov", p_mov, 0},
+ {"mvn", p_mvn, 0},
+ {"movz", p_movz_, 0},
+ {"movn", p_movn_, 0},
+ {"movk", p_movk_, 0},
+ {"add", p_addsub_add, 0},
+ {"adds", p_addsub_adds, 0},
+ {"sub", p_addsub_sub, 0},
+ {"subs", p_addsub_subs, 0},
+ {"cmp", p_cmp_w, 0},
+ {"cmn", p_cmn_w, 0},
+ {"csinc", p_csinc_, 0},
+ {"neg", p_neg_w, 0},
+ {"negs", p_negs_w, 0},
+ {"and", p_and_w, 0},
+ {"bic", p_bic_w, 0},
+ {"orr", p_orr_w, 0},
+ {"orn", p_orn_w, 0},
+ {"eor", p_eor_w, 0},
+ {"eon", p_eon_w, 0},
+ {"ands", p_ands_w, 0},
+ {"bics", p_bics_w, 0},
+ {"madd", p_madd, 0},
+ {"msub", p_msub, 0},
+ {"mul", p_mul_w, 0},
+ {"mneg", p_mneg_w, 0},
+ {"udiv", p_udiv_w, 0},
+ {"sdiv", p_sdiv_w, 0},
+ {"lslv", p_lslv_w, 0},
+ {"lsrv", p_lsrv_w, 0},
+ {"asrv", p_asrv_w, 0},
+ {"rorv", p_rorv_w, 0},
+ {"b", p_b_, 0},
+ {"bl", p_bl_, 0},
+ {"cbz", p_cbz_, 0},
+ {"cbnz", p_cbnz_, 0},
+ {"svc", p_svc_, 0},
+ {"brk", p_brk_, 0},
+ {"hlt", p_hlt_, 0},
+ {"ldr", p_ldr_, 0},
+ {"str", p_str_, 0},
+ {"ldur", p_ldur_, 0},
+ {"stur", p_stur_, 0},
+ {"ldp", p_ldp_, 0},
+ {"stp", p_stp_, 0},
+ {"adr", p_adr_, 0},
+ {"adrp", p_adrp_, 0},
+ {"b.eq", p_b_eq, 0}, {"b.ne", p_b_ne, 0},
+ {"b.cs", p_b_cs, 0}, {"b.hs", p_b_hs, 0},
+ {"b.cc", p_b_cc, 0}, {"b.lo", p_b_lo, 0},
+ {"b.mi", p_b_mi, 0}, {"b.pl", p_b_pl, 0},
+ {"b.vs", p_b_vs, 0}, {"b.vc", p_b_vc, 0},
+ {"b.hi", p_b_hi, 0}, {"b.ls", p_b_ls, 0},
+ {"b.ge", p_b_ge, 0}, {"b.lt", p_b_lt, 0},
+ {"b.gt", p_b_gt, 0}, {"b.le", p_b_le, 0},
+ {"b.al", p_b_al, 0},
+ {NULL, NULL, 0},
+};
+
+/* Assemble one instruction: look `mnemonic` up in kTable
+ * (case-insensitive, first match wins) and hand the driver to that
+ * row's parser, which consumes the operands and emits the encoding.
+ *
+ * `a` is currently unused. A mnemonic with no table entry, or whose
+ * text cannot be fetched from the pool, panics with "unknown mnemonic". */
+void aa64_asm_insn(AA64Asm* a, AsmDriver* d, Sym mnemonic) {
+  (void)a;
+  size_t mn = 0;
+  const char* mp = pool_str(asm_driver_pool(d), mnemonic, &mn);
+  /* Same NULL guard the other pool_str call sites in this file use. */
+  if (mp) {
+    for (const AA64Mn* row = kTable; row->name; ++row) {
+      if (icase_eq(mp, mn, row->name)) {
+        row->fn(d);
+        return;
+      }
+    }
+  }
+  asm_driver_panic(d, "asm: unknown mnemonic");
+}
+
+/* ---- inline-asm template walker (Phase 4b Track C) ---- */
+
+/* Per-call rendered-line buffer. GCC's inline asm rarely emits more
+ * than a handful of instructions per block; one line of substituted
+ * text fits comfortably inside this. Truncation panics — the operator
+ * grammar should never grow a single line beyond this without a
+ * deliberate reason. */
+#define AA64_INLINE_LINE_CAP 1024
+
+/* Append the textual name of general register `reg` to sb: `xN` when
+ * is64 is non-zero, `wN` otherwise. Register #31 is always rendered as
+ * the zero register (xzr/wzr), never SP — for inline-asm v1 a bound
+ * operand always names a GP register. */
+static void render_reg(StrBuf* sb, u32 reg, int is64) {
+  if (reg != 31u) {
+    const u32 tens = reg / 10u;
+    const u32 ones = reg % 10u;
+    strbuf_putc(sb, is64 ? 'x' : 'w');
+    /* The tens digit is omitted for single-digit register numbers. */
+    if (tens != 0u) strbuf_putc(sb, (char)('0' + tens));
+    strbuf_putc(sb, (char)('0' + ones));
+    return;
+  }
+  strbuf_puts(sb, is64 ? "xzr" : "wzr");
+}
+
+/* Append `#<v>` to sb: the immediate marker, then the signed value as
+ * formatted by strbuf_put_i64. */
+static void render_imm(StrBuf* sb, i64 v) {
+  const char marker = '#';
+  strbuf_putc(sb, marker);
+  strbuf_put_i64(sb, v);
+}
+
+/* Append the addressing form for OPK_INDIRECT: `[xN]` when ofs is zero,
+ * `[xN, #ofs]` otherwise. The base is always rendered in its 64-bit
+ * spelling. */
+static void render_indirect(StrBuf* sb, Reg base, i32 ofs) {
+  const int has_offset = (ofs != 0);
+  strbuf_putc(sb, '[');
+  render_reg(sb, (u32)base, /*is64=*/1);
+  if (has_offset) {
+    strbuf_puts(sb, ", ");
+    render_imm(sb, (i64)ofs);
+  }
+  strbuf_putc(sb, ']');
+}
+
+/* Abort compilation with an "inline asm: <msg>" diagnostic. No source
+ * position is available here, so an all-zero SrcLoc is reported. */
+_Noreturn static void inline_panic(AA64Asm* a, const char* msg) {
+  const SrcLoc no_loc = {0, 0, 0};
+  compiler_panic(a->c, no_loc, "inline asm: %s", msg);
+}
+
+/* Render bound operand `idx` into sb. Indices count the outputs first,
+ * then the inputs.
+ *
+ * form: 0 = default rendering by operand kind,
+ *       1 = `%wN` (force 32-bit register name),
+ *       2 = `%xN` (force 64-bit register name),
+ *       3 = `%aN` (memory addressing form).
+ * Panics when idx is out of range, when a forced form does not match
+ * the operand kind, or when the kind has no default rendering. */
+static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) {
+  const u32 total = a->nout + a->nin;
+  if (idx >= total) inline_panic(a, "operand index out of range");
+
+  const Operand* op;
+  if (idx < a->nout)
+    op = &a->out_ops[idx];
+  else
+    op = &a->in_ops[idx - a->nout];
+
+  if (form == 1) { /* %wN — force 32-bit register form */
+    if (op->kind != OPK_REG) inline_panic(a, "%w on non-register operand");
+    render_reg(sb, (u32)op->v.reg, /*is64=*/0);
+    return;
+  }
+  if (form == 2) { /* %xN — force 64-bit register form */
+    if (op->kind != OPK_REG) inline_panic(a, "%x on non-register operand");
+    render_reg(sb, (u32)op->v.reg, /*is64=*/1);
+    return;
+  }
+  if (form == 3) { /* %aN — memory addressing form */
+    if (op->kind != OPK_INDIRECT) inline_panic(a, "%a on non-memory operand");
+    render_indirect(sb, op->v.ind.base, op->v.ind.ofs);
+    return;
+  }
+
+  /* Any other form: pick the rendering from the operand kind. Registers
+   * default to the 64-bit spelling. */
+  if (op->kind == OPK_REG) {
+    render_reg(sb, (u32)op->v.reg, /*is64=*/1);
+  } else if (op->kind == OPK_IMM) {
+    render_imm(sb, op->v.imm);
+  } else if (op->kind == OPK_INDIRECT) {
+    render_indirect(sb, op->v.ind.base, op->v.ind.ofs);
+  } else {
+    inline_panic(a, "unsupported operand kind for %N");
+  }
+}
+
+/* Assemble one already-substituted line of inline asm.
+ *
+ * Blank lines are ignored. Otherwise the text is lexed through a
+ * temporary in-memory lexer and inline driver, leading newlines and
+ * `#` linemarker lines are skipped, the mnemonic is read (with
+ * `head.tail` spellings such as `b.eq` re-joined into one symbol) and
+ * the operands are handed to aa64_asm_insn. Directives are not
+ * supported on this path. */
+static void run_one_line(AA64Asm* a, MCEmitter* mc, const char* text,
+                         size_t len) {
+  /* Nothing to do for an empty or whitespace-only line. */
+  size_t lead = 0;
+  while (lead < len && (text[lead] == ' ' || text[lead] == '\t')) ++lead;
+  if (lead == len) return;
+
+  AsmLexer* lx = asm_lex_open_mem(a->c, "<inline-asm>", text, len);
+  AsmDriver* d = asm_driver_open_inline(a->c, mc, lx);
+
+  /* Skip newlines and whole `#` linemarker lines ahead of the mnemonic. */
+  AsmTok t;
+  for (;;) {
+    t = asm_driver_peek(d);
+    if (t.kind == ASM_TOK_HASH) {
+      (void)asm_driver_next(d);
+      while (!asm_driver_at_eol(d)) (void)asm_driver_next(d);
+    } else if (t.kind == ASM_TOK_NEWLINE) {
+      (void)asm_driver_next(d);
+    } else {
+      break;
+    }
+  }
+
+  if (t.kind != ASM_TOK_EOF) {
+    if (t.kind != ASM_TOK_IDENT)
+      inline_panic(a, "expected mnemonic at start of inline asm line");
+    (void)asm_driver_next(d);
+    Sym mn = t.v.ident;
+
+    /* Compose `b.eq` etc. — same trick as the standalone driver. */
+    if (asm_driver_tok_is_punct(asm_driver_peek(d), '.')) {
+      (void)asm_driver_next(d);
+      AsmTok tail = asm_driver_next(d);
+      if (tail.kind != ASM_TOK_IDENT)
+        inline_panic(a, "composite mnemonic: expected ident after '.'");
+      size_t head_len = 0, tail_len = 0;
+      const char* head_str = pool_str(asm_driver_pool(d), mn, &head_len);
+      const char* tail_str =
+          pool_str(asm_driver_pool(d), tail.v.ident, &tail_len);
+      char joined[64];
+      const size_t joined_len = head_len + 1 + tail_len;
+      if (joined_len >= sizeof joined)
+        inline_panic(a, "composite mnemonic too long");
+      size_t w = 0;
+      for (size_t k = 0; k < head_len; ++k) joined[w++] = head_str[k];
+      joined[w++] = '.';
+      for (size_t k = 0; k < tail_len; ++k) joined[w++] = tail_str[k];
+      mn = pool_intern(asm_driver_pool(d), joined, joined_len);
+    }
+    aa64_asm_insn(a, d, mn);
+  }
+
+  asm_driver_close_inline(d);
+  asm_lex_close(lx);
+}
+
+/* Find the `]` that closes a `%[name]` placeholder. `nbeg` points just
+ * past the `[`; the scan never leaves [nbeg, end). Panics when the
+ * bracket is not closed on this line. */
+static const char* scan_operand_name(AA64Asm* a, const char* nbeg,
+                                     const char* end) {
+  const char* nend = nbeg;
+  while (nend < end && *nend != ']') ++nend;
+  if (nend == end) inline_panic(a, "unterminated %[name]");
+  return nend;
+}
+
+/* Map a symbolic operand name to its combined operand index (outputs
+ * first, then inputs) by comparing the interned name against the name
+ * Sym stored on each constraint. Panics when nothing matches. */
+static u32 lookup_named_operand(AA64Asm* a, const char* name, size_t len) {
+  Sym needle = pool_intern(a->c->global, name, len);
+  for (u32 k = 0; k < a->nout; ++k) {
+    if (a->outs[k].name == needle) return k;
+  }
+  for (u32 k = 0; k < a->nin; ++k) {
+    if (a->ins[k].name == needle) return a->nout + k;
+  }
+  inline_panic(a, "%[name] does not match any constraint");
+}
+
+/* Substitute placeholders into one line's StrBuf, then dispatch.
+ *
+ * The input range is [start, end) inside the template. sb is reset
+ * here, so each line starts from an empty buffer. Recognized
+ * placeholders:
+ *   %%                          literal '%'
+ *   %N, %NN                     operand by index (one or two digits)
+ *   %[name]                     operand by constraint name
+ *   %wN %xN %aN (and [name])    forced 32-bit / 64-bit / address form
+ * Anything else after '%' panics, as does a line that overflowed sb. */
+static void render_and_run_line(AA64Asm* a, MCEmitter* mc, StrBuf* sb,
+                                const char* start, const char* end) {
+  strbuf_reset(sb);
+  for (const char* p = start; p < end; ++p) {
+    char c = *p;
+    if (c != '%') {
+      strbuf_putc(sb, c);
+      continue;
+    }
+    /* Placeholder. */
+    if (p + 1 >= end) inline_panic(a, "trailing '%' in template");
+    char n = *(p + 1);
+    if (n == '%') {
+      strbuf_putc(sb, '%');
+      ++p;
+      continue;
+    }
+    int form = 0; /* 0=default, 1=w, 2=x, 3=a */
+    if (n == 'w' || n == 'x' || n == 'a') {
+      form = (n == 'w') ? 1 : (n == 'x') ? 2 : 3;
+      ++p;
+      if (p + 1 >= end) inline_panic(a, "trailing '%' modifier in template");
+      n = *(p + 1);
+    }
+    if (n == '[') {
+      /* %[name], or %w[name] / %x[name] / %a[name] when a modifier was
+       * consumed above; the modifier only changes the rendered form. */
+      const char* nbeg = p + 2;
+      const char* nend = scan_operand_name(a, nbeg, end);
+      u32 named = lookup_named_operand(a, nbeg, (size_t)(nend - nbeg));
+      p = nend; /* loop's ++p steps past the ']' */
+      render_operand(a, sb, named, form);
+      continue;
+    }
+    if (n < '0' || n > '9')
+      inline_panic(a, "expected digit after '%'");
+    u32 idx = (u32)(n - '0');
+    ++p;
+    /* GCC syntax permits up to two digits (%0..%99). */
+    if (p + 1 < end && *(p + 1) >= '0' && *(p + 1) <= '9') {
+      idx = idx * 10 + (u32)(*(p + 1) - '0');
+      ++p;
+    }
+    render_operand(a, sb, idx, form);
+  }
+  if (sb->truncated) inline_panic(a, "inline asm line buffer overflow");
+  run_one_line(a, mc, strbuf_cstr(sb), strbuf_len(sb));
+}
+
+/* Assemble an inline-asm template: split it into statements, substitute
+ * operands into each one and emit the result through `mc`.
+ *
+ * Statements end at '\n' or ';'. Bracket depth and quote state are
+ * tracked so that a separator inside `[ ... ]` or inside a quoted string
+ * (with backslash escapes) does not split a statement. A NULL or empty
+ * template is a no-op. */
+void aa64_asm_run_template(AA64Asm* a, MCEmitter* mc, const char* tmpl) {
+  if (tmpl == NULL || tmpl[0] == '\0') return;
+
+  char storage[AA64_INLINE_LINE_CAP];
+  StrBuf sb;
+  strbuf_init(&sb, storage, sizeof storage);
+
+  const char* line_start = tmpl;
+  const char* p = tmpl;
+  int depth = 0;     /* nesting level of '[' ... ']' */
+  char in_quote = 0; /* active quote character, or 0 outside quotes */
+  while (*p != '\0') {
+    const char c = *p;
+    if (in_quote) {
+      if (c == '\\' && p[1] != '\0') {
+        ++p; /* also step over the escaped character */
+      } else if (c == in_quote) {
+        in_quote = 0;
+      }
+    } else if (c == '"' || c == '\'') {
+      in_quote = c;
+    } else if (c == '[') {
+      ++depth;
+    } else if (c == ']') {
+      if (depth > 0) --depth;
+    } else if (depth == 0 && (c == '\n' || c == ';')) {
+      render_and_run_line(a, mc, &sb, line_start, p);
+      line_start = p + 1;
+    }
+    ++p;
+  }
+  /* Whatever follows the last separator forms the final statement. */
+  render_and_run_line(a, mc, &sb, line_start, p);
+}
diff --git a/src/arch/aa64_asm.h b/src/arch/aa64/asm.h
diff --git a/src/arch/aa64/dbg.c b/src/arch/aa64/dbg.c
@@ -0,0 +1,235 @@
+/* AArch64 lifter for the displaced-step shim.
+ *
+ * Lays out a fixed-up copy of one insn in the session scratch slot
+ * (DBG_DISPLACED_SLOT_BYTES bytes), followed by a BRK sentinel the
+ * session arms an internal bp on.
+ *
+ * Supported families:
+ * - any insn with no PC-relative operand (copied verbatim);
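+ * - B / BL / B.cond — re-encode the immediate (B/BL are declined when the target is out of range from the slot; B.cond then uses the trampoline below);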
+ * - CBZ / CBNZ / TBZ / TBNZ — always emit a trampoline:
+ * slot[0] cond-branch +2 words (taken → slot+8)
+ * slot[4] BRK (not-taken fallthrough)
+ * slot[8] LDR x16, =target
+ * slot[12] BR x16
+ * slot[16] literal pool (8 bytes, absolute target)
+ * - ADR / ADRP — replace with LDR Xd, =target:
+ * slot[0] LDR Xd, =target
+ * slot[4] BRK
+ * slot[8] literal pool (8 bytes)
+ * - LDR (literal), integer/LDRSW — synthesize indirect load:
+ * slot[0] LDR x16, =literal_addr
+ * slot[4] LDR Xt/Wt/LDRSW Xt, [x16]
+ * slot[8] BRK
+ * slot[12] literal pool (8 bytes, absolute literal addr)
+ * - BR / BLR / RET — copied verbatim; the BRK after never
+ * fires because the indirect branch transfers control. The session's
+ * stale internal_bp is cleared by the next prepare; finalize gates on
+ * PC == return_pc so it stays a no-op when control left the slot. */
+
+#include "dbg/dbg.h"
+
+#include <string.h>
+
+#include "arch/aa64/isa.h"
+
+#define SHIM_X16 16u /* IP0; safe to clobber inside a shim */
+
+/* Returns the 32-bit BRK instruction word produced by aa64_brk(0) — the same
+ * encoding dbg_aa64_build_shim plants after the displaced instruction. */
+uint32_t dbg_aa64_brk_word(void) {
+ return aa64_brk(0);
+}
+
+/* Nonzero when `v` is representable as a `bits`-wide two's-complement
+ * integer, i.e. -2^(bits-1) <= v < 2^(bits-1).  `bits` must be in 1..63. */
+static int fits_signed(int64_t v, int bits) {
+  const int64_t bound = (int64_t)1 << (bits - 1);
+  if (v < -bound) return 0;
+  return v < bound;
+}
+
+/* Encode LDR Xt, <literal> (64-bit integer form: opc=01, V=0).
+ *   01 011 0 00 imm19 Rt   →   0x58000000 | (imm19 << 5) | Rt
+ * `imm19` is the signed distance, in 4-byte words, from the LDR itself to the
+ * literal; only its low 19 bits are kept.  `Rt` is masked to 5 bits. */
+static uint32_t enc_ldr_lit_x(uint32_t Rt, int32_t imm19) {
+  const uint32_t off_field = ((uint32_t)imm19 & 0x7ffffu) << 5;
+  const uint32_t reg_field = Rt & 0x1fu;
+  return 0x58000000u | off_field | reg_field;
+}
+/* LDR Xt, [Xn, #0] / LDR Wt, [Xn, #0] / LDRSW Xt, [Xn, #0]. */
+/* enc_ldr64_reg: the 64-bit form, LDR Xt, [Xn] — delegates to
+ * aa64_ldr64_uimm12 with a zero immediate. */
+static uint32_t enc_ldr64_reg(uint32_t Rt, uint32_t Rn) {
+ return aa64_ldr64_uimm12(Rt, Rn, 0);
+}
+/* LDR Wt, [Xn, #0]: unsigned-offset load packed with size = 2 (32-bit
+ * access), V = 0 (integer register), opc = LDR and a zero imm12. */
+static uint32_t enc_ldr32_reg(uint32_t Rt, uint32_t Rn) {
+ return aa64_ldst_uimm_pack((AA64LdStUimm){
+ .size = 2, .V = 0, .opc = AA64_LDST_OPC_LDR, .imm12 = 0, .Rn = Rn,
+ .Rt = Rt});
+}
+/* LDRSW Xt, [Xn, #0]: same unsigned-offset format as enc_ldr32_reg but with
+ * opc = 2, which together with size = 2 selects the sign-extending 32-bit
+ * load into a 64-bit register. */
+static uint32_t enc_ldrsw_reg(uint32_t Rt, uint32_t Rn) {
+ return aa64_ldst_uimm_pack((AA64LdStUimm){
+ .size = 2, .V = 0, .opc = 2, .imm12 = 0, .Rn = Rn, .Rt = Rt});
+}
+
+/* Store the 32-bit value `v` at byte offset `off` of buffer `w`, in host byte
+ * order; memcpy keeps the store valid for any alignment. */
+static void put_u32(uint8_t* w, uint32_t off, uint32_t v) {
+  uint8_t* dst = &w[off];
+  memcpy(dst, &v, sizeof v);
+}
+/* Store the 64-bit value `v` at byte offset `off` of buffer `w`, in host byte
+ * order; memcpy keeps the store valid for any alignment. */
+static void put_u64(uint8_t* w, uint32_t off, uint64_t v) {
+  uint8_t* dst = &w[off];
+  memcpy(dst, &v, sizeof v);
+}
+
+/* Interpret the low `bits` bits of `v` as a two's-complement number and widen
+ * it to int64_t.  `v` must not have bits set above bit `bits - 1`; `bits`
+ * must be in 1..64. */
+static int64_t sign_extend(uint64_t v, int bits) {
+  const uint64_t sign_bit = 1ull << (bits - 1);
+  const uint64_t flipped = v ^ sign_bit;
+  /* Subtracting the sign bit after flipping it maps [2^(bits-1), 2^bits)
+   * onto the negative range and leaves smaller values unchanged. */
+  return (int64_t)(flipped - sign_bit);
+}
+
+/* Write `insn` at slot[0] followed by the BRK sentinel at slot[4].
+ * Returns the byte offset of the BRK (always 4). */
+static uint32_t put_insn_then_brk(uint8_t* w, uint32_t insn, uint32_t brk) {
+  put_u32(w, 0, insn);
+  put_u32(w, 4, brk);
+  return 4;
+}
+
+/* Write the trampoline shared by every conditional branch that cannot reach
+ * its target directly from the scratch slot:
+ *   slot[0]   cond_insn   caller-built branch with offset +8 (→ slot[8])
+ *   slot[4]   BRK         not-taken fallthrough
+ *   slot[8]   LDR x16, [pc + 8]   (reads slot[16])
+ *   slot[12]  BR  x16
+ *   slot[16]  8-byte absolute branch target
+ * Returns the byte offset of the BRK (always 4). */
+static uint32_t put_cond_trampoline(uint8_t* w, uint32_t cond_insn,
+                                    uint32_t brk, uint64_t target) {
+  put_u32(w, 0, cond_insn);
+  put_u32(w, 4, brk);
+  put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2));
+  put_u32(w, 12, aa64_br(SHIM_X16));
+  put_u64(w, 16, target);
+  return 4;
+}
+
+/* Build the displaced-step shim for one instruction.
+ *
+ *   orig_insn        the instruction word being stepped
+ *   orig_pc          address the instruction was fetched from
+ *   scratch_write    writable view of the scratch slot the shim is laid into
+ *   scratch_runtime  address the slot will execute at
+ *   shim_len         out: byte offset of the BRK sentinel inside the slot
+ *                    (4 for every form except LDR-literal, which uses 8);
+ *                    zeroed on entry
+ *
+ * Returns 0 when a shim was written and 1 when the instruction is declined
+ * (NULL argument, B/BL out of range from the slot, vector LDR-literal, or
+ * PRFM-literal).  PC-relative instructions are rewritten so they produce the
+ * result they would have produced at `orig_pc`; everything else is copied
+ * verbatim.  The synthesized sequences clobber x16. */
+int dbg_aa64_build_shim(uint32_t orig_insn, uint64_t orig_pc,
+                        void* scratch_write, uint64_t scratch_runtime,
+                        u32* shim_len) {
+  uint8_t* w = (uint8_t*)scratch_write;
+  uint32_t brk = aa64_brk(0);
+  int64_t pc_delta;
+  if (!shim_len) return 1;
+  *shim_len = 0;
+  if (!w) return 1; /* nowhere to write the shim */
+  /* Adding pc_delta to an offset relative to orig_pc makes it relative to
+   * the scratch slot instead. */
+  pc_delta = (int64_t)orig_pc - (int64_t)scratch_runtime;
+
+  /* ---- B / BL (imm26) ------------------------------------------------ */
+  if ((orig_insn & 0x7C000000u) == 0x14000000u) {
+    AA64BrImm f = aa64_brimm_unpack(orig_insn);
+    int64_t imm = sign_extend(f.imm26, 26);
+    int64_t new_off = imm * 4 + pc_delta;
+    if ((new_off & 3) || !fits_signed(new_off / 4, 26)) {
+      /* Out of B/BL range from scratch: fall back to LDR x30/PC trick is
+       * messy for BL (need to preserve LR). Decline. */
+      return 1;
+    }
+    f.imm26 = (uint32_t)((new_off / 4) & 0x3ffffffu);
+    *shim_len = put_insn_then_brk(w, aa64_brimm_pack(f), brk);
+    return 0;
+  }
+
+  /* ---- B.cond (imm19) ------------------------------------------------ */
+  if ((orig_insn & 0xFF000010u) == 0x54000000u) {
+    AA64BrCond f = aa64_brcond_unpack(orig_insn);
+    int64_t imm = sign_extend(f.imm19, 19);
+    int64_t new_off = imm * 4 + pc_delta;
+    if ((new_off & 3) || !fits_signed(new_off / 4, 19)) {
+      /* Target unreachable from the slot: B.cond +8 skips the BRK into the
+       * absolute-jump trampoline; the not-taken path falls into the BRK. */
+      uint64_t target = orig_pc + (uint64_t)(imm * 4);
+      /* Copy the decoded instruction (as the CBZ path does) so every field
+       * handed to aa64_brcond_pack is initialised, not just cond/imm19. */
+      AA64BrCond nf = f;
+      nf.imm19 = 2u; /* +8 bytes from slot[0] → slot[8] */
+      *shim_len = put_cond_trampoline(w, aa64_brcond_pack(nf), brk, target);
+      return 0;
+    }
+    f.imm19 = (uint32_t)((new_off / 4) & 0x7ffffu);
+    *shim_len = put_insn_then_brk(w, aa64_brcond_pack(f), brk);
+    return 0;
+  }
+
+  /* ---- CBZ / CBNZ (imm19) — always trampoline form ------------------- */
+  if ((orig_insn & 0x7E000000u) == 0x34000000u) {
+    AA64CB f = aa64_cb_unpack(orig_insn);
+    int64_t imm = sign_extend(f.imm19, 19);
+    uint64_t target = orig_pc + (uint64_t)(imm * 4);
+    AA64CB nf = f;
+    nf.imm19 = 2u; /* +8 → slot[8] */
+    *shim_len = put_cond_trampoline(w, aa64_cb_pack(nf), brk, target);
+    return 0;
+  }
+
+  /* ---- TBZ / TBNZ (imm14) — always trampoline ------------------------
+   * Field layout:
+   *   b5(31) 011011(30..25) op(24) b40(23..19) imm14(18..5) Rt(4..0). */
+  if ((orig_insn & 0x7E000000u) == 0x36000000u) {
+    uint32_t b5 = (orig_insn >> 31) & 1u;
+    uint32_t op = (orig_insn >> 24) & 1u;
+    uint32_t b40 = (orig_insn >> 19) & 0x1fu;
+    uint32_t Rt = orig_insn & 0x1fu;
+    uint32_t imm14_raw = (orig_insn >> 5) & 0x3fffu;
+    int64_t imm = sign_extend(imm14_raw, 14);
+    uint64_t target = orig_pc + (uint64_t)(imm * 4);
+    uint32_t new_imm14 = 2u; /* +8 → slot[8] */
+    uint32_t new_word =
+        (b5 << 31) | 0x36000000u | (op << 24) | (b40 << 19) |
+        ((new_imm14 & 0x3fffu) << 5) | (Rt & 0x1fu);
+    *shim_len = put_cond_trampoline(w, new_word, brk, target);
+    return 0;
+  }
+
+  /* ---- ADR / ADRP ---------------------------------------------------- */
+  if ((orig_insn & 0x1F000000u) == 0x10000000u) {
+    AA64PCRelAdr f = aa64_pcrel_adr_unpack(orig_insn);
+    uint64_t imm_raw = ((uint64_t)f.immhi << 2) | (uint64_t)f.immlo;
+    int64_t imm21 = sign_extend(imm_raw, 21);
+    uint64_t target;
+    if (f.op == AA64_ADR_OP_ADRP) {
+      /* ADRP works in 4 KiB pages relative to the page of the PC. */
+      target = (orig_pc & ~(uint64_t)0xFFF) + ((uint64_t)imm21 << 12);
+    } else {
+      target = orig_pc + (uint64_t)imm21;
+    }
+    /* LDR Xd, [pc + 8] — the literal sits at slot[8]. */
+    *shim_len = put_insn_then_brk(w, enc_ldr_lit_x(f.Rd, 2), brk);
+    put_u64(w, 8, target);
+    return 0;
+  }
+
+  /* ---- LDR (literal) — integer & LDRSW only -------------------------- */
+  if ((orig_insn & 0x3B000000u) == 0x18000000u) {
+    uint32_t opc = (orig_insn >> 30) & 3u;
+    uint32_t V = (orig_insn >> 26) & 1u;
+    uint32_t Rt = orig_insn & 0x1fu;
+    uint32_t imm19_raw = (orig_insn >> 5) & 0x7ffffu;
+    int64_t imm19 = sign_extend(imm19_raw, 19);
+    uint64_t literal_addr = orig_pc + (uint64_t)(imm19 * 4);
+    uint32_t load_insn;
+    if (V) return 1; /* vector forms (S/D/Q): not supported in v1 */
+    switch (opc) {
+      case 0: load_insn = enc_ldr32_reg(Rt, SHIM_X16); break; /* LDR Wt */
+      case 1: load_insn = enc_ldr64_reg(Rt, SHIM_X16); break; /* LDR Xt */
+      case 2: load_insn = enc_ldrsw_reg(Rt, SHIM_X16); break; /* LDRSW */
+      default: return 1; /* PRFM (literal): not meaningful here */
+    }
+    /* LDR x16, [pc + 12] — literal at slot[12]. */
+    put_u32(w, 0, enc_ldr_lit_x(SHIM_X16, 3));
+    put_u32(w, 4, load_insn);
+    put_u32(w, 8, brk);
+    put_u64(w, 12, literal_addr);
+    *shim_len = 8;
+    return 0;
+  }
+
+  /* ---- BR / BLR / RET (indirect) ------------------------------------- */
+  if ((orig_insn & 0xFE1FFC1Fu) == AA64_BR_REG_FAMILY_MATCH) {
+    /* Copy verbatim; the BRK after will not fire because control
+     * transfers to the register target. The session clears the stale
+     * internal bp on the next prepare. */
+    *shim_len = put_insn_then_brk(w, orig_insn, brk);
+    return 0;
+  }
+
+  /* ---- default: no PC-relative operand — copy verbatim --------------- */
+  *shim_len = put_insn_then_brk(w, orig_insn, brk);
+  return 0;
+}
diff --git a/src/arch/aa64/disasm.c b/src/arch/aa64/disasm.c
@@ -0,0 +1,133 @@
+/* AArch64 disassembler implementation.
+ *
+ * Decodes one 4-byte instruction word per call into a CfreeInsn whose
+ * string fields point into iterator-owned StrBufs. The decoder shares
+ * the aa64/isa.{h,c} descriptor table with the encoder: aa64_disasm_find
+ * matches the word; aa64_print_operands renders operand text via the
+ * format's unpack + per-format pretty-printer. Mnemonic rewriting (the
+ * one bit the printer can't own, because b.cond rolls cond into the
+ * "operand" text) happens here. */
+
+#include "arch/aa64/disasm.h"
+
+#include <string.h>
+
+#include "arch/aa64/isa.h"
+#include "core/heap.h"
+#include "core/strbuf.h"
+
+/* Enough for any aarch64 mnemonic-with-suffix ("b.cond" → "b.le", etc.). */
+#define AA64_DASM_MNEM_CAP 16u
+/* Operand text. The widest cases (LDP X, X, [SP, #-imm]!) fit easily. */
+#define AA64_DASM_OPS_CAP 96u
+/* Annotation overlay (symbol + addend). */
+#define AA64_DASM_ANN_CAP 96u
+
+/* Concrete AArch64 disassembler object.  `base` comes first so the
+ * ArchDisasm* handed to the decode/destroy callbacks can be cast back to
+ * AA64Disasm*.  The three StrBufs are initialised over the fixed arrays
+ * declared next to them, and the CfreeInsn strings returned by decode point
+ * into those buffers. */
+typedef struct AA64Disasm {
+ ArchDisasm base; /* vtable (decode / destroy); must stay the first member */
+ Compiler* c; /* compiler the instance was created for */
+ Heap* heap; /* allocator that owns this object; used again by destroy */
+ char mnem_buf[AA64_DASM_MNEM_CAP]; /* backing storage for `mnem` */
+ char ops_buf[AA64_DASM_OPS_CAP]; /* backing storage for `ops` */
+ char ann_buf[AA64_DASM_ANN_CAP]; /* backing storage for `ann` */
+ StrBuf mnem; /* mnemonic text of the last decoded instruction */
+ StrBuf ops; /* operand text of the last decoded instruction */
+ StrBuf ann; /* annotation text; reset to empty on every decode */
+} AA64Disasm;
+
+/* Condition-code suffixes indexed by the 4-bit cond field of B.cond.
+ * The pointers themselves are const too, so the table is fully read-only. */
+static const char* const aa64_cond_names[16] = {
+    "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
+    "hi", "ls", "ge", "lt", "gt", "le", "al", "nv",
+};
+
+/* Fill d->mnem with the mnemonic for `word`.  Every format uses the
+ * descriptor's mnemonic verbatim except B.cond, which is rendered as
+ * "b.<cond>" (GNU as / objdump style) with the suffix taken from the low
+ * four bits of the instruction word. */
+static void aa64_write_mnemonic(AA64Disasm* d, const AA64InsnDesc* desc,
+                                u32 word) {
+  StrBuf* out = &d->mnem;
+  strbuf_reset(out);
+  if (desc->fmt != AA64_FMT_BR_COND) {
+    strbuf_puts(out, desc->mnemonic);
+    return;
+  }
+  /* The condition moves into the mnemonic so the operand text can hold just
+   * the branch target. */
+  strbuf_puts(out, "b.");
+  strbuf_puts(out, aa64_cond_names[word & 0xfu]);
+}
+
+/* Fill d->ops with the operand text for `word`.
+ *
+ * All formats go through aa64_print_operands except B.cond: its condition
+ * has already been folded into the mnemonic, so only the target is printed —
+ * as an absolute hex address when `vaddr` is nonzero, otherwise as a signed
+ * "#<byte offset>". */
+static void aa64_write_operands(AA64Disasm* d, const AA64InsnDesc* desc,
+                                u32 word, u64 vaddr) {
+  StrBuf* out = &d->ops;
+  strbuf_reset(out);
+  if (desc->fmt != AA64_FMT_BR_COND) {
+    aa64_print_operands(out, desc, word, vaddr);
+    return;
+  }
+
+  AA64BrCond br = aa64_brcond_unpack(word);
+  i64 disp = (i64)((u64)br.imm19 & 0x7ffffu);
+  if (disp & 0x40000) disp |= ~(i64)0x7ffff; /* sign-extend from bit 18 */
+  disp *= 4;                                 /* words → bytes */
+
+  if (!vaddr) {
+    strbuf_puts(out, "#");
+    strbuf_put_i64(out, disp);
+    return;
+  }
+  strbuf_put_hex_u64(out, vaddr + (u64)disp);
+}
+
+/* Assemble a 32-bit value from four bytes stored least-significant first. */
+static u32 aa64_read_u32_le(const u8* b) {
+  u32 value = 0;
+  for (int i = 3; i >= 0; --i) {
+    value = (value << 8) | (u32)b[i];
+  }
+  return value;
+}
+
+/* Render a word that matched no descriptor as ".inst" with the raw word in
+ * hex as its only operand. */
+static void aa64_write_unknown(AA64Disasm* d, u32 word) {
+  StrBuf* mnem = &d->mnem;
+  StrBuf* ops = &d->ops;
+  strbuf_reset(mnem);
+  strbuf_reset(ops);
+  strbuf_puts(mnem, ".inst");
+  strbuf_put_hex_u64(ops, (u64)word);
+}
+
+/* ArchDisasm::decode for AArch64.
+ *
+ * Decodes the little-endian word at `bytes` and fills `out`.  Returns the
+ * number of bytes consumed: 4 on success, 0 (with `out` untouched) when fewer
+ * than four bytes are available.  The strings stored in `out` point into the
+ * disassembler's own buffers and are overwritten by the next call. */
+static u32 aa64_decode(ArchDisasm* base, const u8* bytes, size_t len, u64 vaddr,
+                       CfreeInsn* out) {
+  if (len < 4u) return 0;
+
+  AA64Disasm* self = (AA64Disasm*)base;
+  const u32 insn = aa64_read_u32_le(bytes);
+  const AA64InsnDesc* match = aa64_disasm_find(insn);
+  if (!match) {
+    aa64_write_unknown(self, insn);
+  } else {
+    aa64_write_mnemonic(self, match, insn);
+    aa64_write_operands(self, match, insn, vaddr);
+  }
+
+  /* Annotation overlay is owned by the public iterator (cfree_disasm_iter_*);
+   * at this level it is always the empty string. */
+  strbuf_reset(&self->ann);
+
+  out->mnemonic = strbuf_cstr(&self->mnem);
+  out->operands = strbuf_cstr(&self->ops);
+  out->annotation = strbuf_cstr(&self->ann);
+  out->bytes = bytes;
+  out->nbytes = 4;
+  out->vaddr = vaddr;
+  return 4;
+}
+
+/* ArchDisasm::destroy for AArch64: returns the object to the heap recorded in
+ * d->heap by aa64_disasm_new, passing the allocation size back to it. */
+static void aa64_destroy(ArchDisasm* base) {
+ AA64Disasm* d = (AA64Disasm*)base;
+ d->heap->free(d->heap, d, sizeof(*d));
+}
+
+/* Allocate an AArch64 disassembler from the compiler's environment heap.
+ * Returns a pointer to its ArchDisasm base, or NULL when the allocation
+ * fails.  Release it through the returned object's destroy callback. */
+ArchDisasm* aa64_disasm_new(Compiler* c) {
+  Heap* heap = (Heap*)c->env->heap;
+  AA64Disasm* self =
+      (AA64Disasm*)heap->alloc(heap, sizeof(*self), _Alignof(AA64Disasm));
+  if (self == 0) return 0;
+
+  memset(self, 0, sizeof(*self));
+  self->heap = heap;
+  self->c = c;
+
+  /* Each StrBuf writes into the fixed array embedded next to it. */
+  strbuf_init(&self->mnem, self->mnem_buf, sizeof self->mnem_buf);
+  strbuf_init(&self->ops, self->ops_buf, sizeof self->ops_buf);
+  strbuf_init(&self->ann, self->ann_buf, sizeof self->ann_buf);
+
+  self->base.destroy = aa64_destroy;
+  self->base.decode = aa64_decode;
+  return &self->base;
+}
diff --git a/src/arch/aa64/disasm.h b/src/arch/aa64/disasm.h
@@ -0,0 +1,14 @@
+#ifndef CFREE_ARCH_AA64_DISASM_H
+#define CFREE_ARCH_AA64_DISASM_H
+
+/* AArch64 disassembler — ArchDisasm implementation.
+ *
+ * Wraps aa64_disasm_find + aa64_print_operands (src/arch/aa64/isa.{h,c}).
+ * The dispatcher in src/arch/disasm.c constructs one of these when the
+ * compiler target is CFREE_ARCH_ARM_64. */
+
+#include "arch/arch.h"
+
+/* Create an AArch64 disassembler for the given compiler.  Returns NULL if
+ * the allocation fails; otherwise release the object through its destroy
+ * callback. */
+ArchDisasm* aa64_disasm_new(Compiler*);
+
+#endif
diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c
@@ -0,0 +1,523 @@
+/* aa64/emit.c — instruction encoding helpers, function lifecycle,
+ * frame layout, parameter ABI, address materialization. */
+
+#include "arch/aa64/internal.h"
+
+extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
+
+/* ============================================================
+ * Shared type / operand helpers
+ * ============================================================ */
+
+/* Nonzero when `t` should be handled as a 64-bit value: the I64 and F64
+ * builtins, plus every id at or above (2u << 6).
+ * NOTE(review): presumably ids >= (2u << 6) are non-builtin types
+ * (CG_BUILTIN_ID tags builtins with 1u << 6) — confirm against the type-id
+ * scheme. */
+int type_is_64(CfreeCgTypeId t) {
+  if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I64)) return 1;
+  if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64)) return 1;
+  return t >= (CfreeCgTypeId)(2u << 6);
+}
+
+/* Nonzero only for the F64 builtin type id. */
+int type_is_fp_double(CfreeCgTypeId t) {
+ return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64);
+}
+
+/* Signedness query.  Currently a stub: the argument is ignored and every
+ * type is reported as unsigned (0). */
+int type_is_signed(CfreeCgTypeId t) {
+ (void)t;
+ return 0;
+}
+
+/* Storage size in bytes for a type id: 1 for I8/BOOL, 2 for I16, 4 for
+ * I32/F32, and 8 for everything else. */
+u32 type_byte_size(CfreeCgTypeId t) {
+  const int one_byte = (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I8)) ||
+                       (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_BOOL));
+  if (one_byte) return 1;
+
+  if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I16)) return 2;
+
+  const int four_bytes = (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I32)) ||
+                         (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32));
+  return four_bytes ? 4 : 8;
+}
+
+/* Map an access width in bytes to the load/store size index (log2 of the
+ * width): 1→0, 2→1, 4→2.  Eight and any unrecognised width map to 3. */
+u32 size_idx_for_bytes(u32 nbytes) {
+  if (nbytes == 1) return 0;
+  if (nbytes == 2) return 1;
+  if (nbytes == 4) return 2;
+  return 3;
+}
+
+/* Register number of a register operand, reduced to the 5-bit field width
+ * used by the instruction encoders. */
+u32 reg_num(Operand op) { return op.v.reg & 0x1fu; }
+
+/* Write to `out`, in ascending order, every register number in
+ * [first, last] whose bit is set in `mask`; returns how many were written.
+ * `last` must be below 32 and `out` must have room for last - first + 1
+ * entries. */
+static u32 collect_mask_regs(u32 mask, u32 first, u32 last, u32* out) {
+  u32 count = 0;
+  u32 reg = first;
+  while (reg <= last) {
+    if ((mask >> reg) & 1u) {
+      out[count] = reg;
+      ++count;
+    }
+    ++reg;
+  }
+  return count;
+}
+
+/* ============================================================
+ * Low-level emission
+ * ============================================================ */
+
+/* Append one instruction word, little-endian, to the emitter's current
+ * section.  When a debug sink is attached, also record a row that maps the
+ * word's section offset to the emitter's current source location. */
+void aa64_emit32(MCEmitter* mc, u32 word) {
+  /* Sample the offset before emitting: it is the address of this word. */
+  const u32 start = obj_pos(mc->obj, mc->section_id);
+  u8 le[4] = {
+      (u8)(word & 0xff),
+      (u8)((word >> 8) & 0xff),
+      (u8)((word >> 16) & 0xff),
+      (u8)((word >> 24) & 0xff),
+  };
+  mc->emit_bytes(mc, le, 4);
+  if (!mc->debug) return;
+  debug_emit_row(mc->debug, mc->section_id, start, mc->loc);
+}
+
+/* Overwrite the four bytes at offset `ofs` of section `sec_id` with `word`,
+ * stored little-endian. */
+void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word) {
+  u8 le[4];
+  for (u32 i = 0; i < 4; ++i) {
+    le[i] = (u8)((word >> (8 * i)) & 0xff);
+  }
+  obj_patch(obj, sec_id, ofs, le, 4);
+}
+
+/* ============================================================
+ * Immediate encoding helpers
+ * ============================================================ */
+
+/* Materialise the constant `imm` in register Rd.
+ *
+ * `sf` selects the width: nonzero treats the value as 64 bits (four 16-bit
+ * slots), zero keeps only the low 32 bits (two slots).  Strategies are tried
+ * in order and the first that applies wins:
+ *   1. exactly one nonzero slot           → a single MOVZ;
+ *   2. the inverted value has at most one
+ *      nonzero slot                       → a single MOVN;
+ *   3. otherwise                          → MOVZ for the lowest nonzero slot,
+ *                                           then MOVK for each later nonzero
+ *                                           slot.
+ * Zero falls through to the final MOVZ Rd, #0. */
+void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm) {
+ const u32 nslots = sf ? 4u : 2u;
+ u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu);
+
+ /* 1: the value is one 16-bit chunk at some slot position. */
+ for (u32 i = 0; i < nslots; ++i) {
+ u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
+ u64 cleared = v & ~((u64)0xffffu << (i * 16));
+ if (slot != 0 && cleared == 0) {
+ aa64_emit32(mc, aa64_movz(sf, Rd, slot, i));
+ return;
+ }
+ }
+
+ /* 2: the bitwise inverse (within the operand width) is one chunk. */
+ {
+ u64 inv = sf ? ~v : ((~v) & 0xffffffffu);
+ for (u32 i = 0; i < nslots; ++i) {
+ u32 slot = (u32)((inv >> (i * 16)) & 0xffffu);
+ u64 cleared = inv & ~((u64)0xffffu << (i * 16));
+ if (cleared == 0) {
+ aa64_emit32(mc, aa64_movn(sf, Rd, slot, i));
+ return;
+ }
+ }
+ }
+
+ /* 3: general case — first nonzero slot via MOVZ, the rest via MOVK. */
+ int placed = 0;
+ for (u32 i = 0; i < nslots; ++i) {
+ u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
+ if (!placed) {
+ if (slot == 0) continue;
+ aa64_emit32(mc, aa64_movz(sf, Rd, slot, i));
+ placed = 1;
+ } else if (slot != 0) {
+ aa64_emit32(mc, aa64_movk(sf, Rd, slot, i));
+ }
+ }
+ /* Every slot was zero: the value is 0. */
+ if (!placed) aa64_emit32(mc, aa64_movz(sf, Rd, 0, 0));
+}
+
+/* Emit SP += imm using ADD (immediate) on register 31: one instruction when
+ * `imm` fits in 12 bits or is a 12-bit value shifted left by 12, otherwise a
+ * shifted ADD for bits 12..23 followed by an unshifted ADD for bits 0..11.
+ * NOTE(review): bits 24 and above of `imm` are dropped by the `& 0xfff` in
+ * the two-instruction form, so values >= 1 << 24 are silently truncated —
+ * callers must stay below that. */
+void emit_sp_add(MCEmitter* mc, u32 imm) {
+ if (imm <= 0xfff) {
+ aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm, 0));
+ } else if ((imm & 0xfff) == 0 && (imm >> 12) <= 0xfff) {
+ aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm >> 12, 1));
+ } else {
+ aa64_emit32(mc, aa64_add_imm(1, 31, 31, (imm >> 12) & 0xfff, 1));
+ aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm & 0xfff, 0));
+ }
+}
+
+/* ============================================================
+ * Function lifecycle
+ * ============================================================ */
+
+/* Start emitting a function.
+ *
+ * Selects the function's text section, aligns to 4 bytes, resets all
+ * per-function state in the backend, opens the CFI procedure and reserves
+ * AA_PROLOGUE_WORDS NOPs at prologue_pos; aa_func_end overwrites them once
+ * the frame size is known.  When the ABI uses an sret pointer an 8-byte spill
+ * slot is created for it.  For variadic functions a 64-byte and a 128-byte
+ * save area are created and registers 0..7 of each bank are stored into them
+ * immediately after the placeholder. */
+void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) {
+ AAImpl* a = impl_of(t);
+ MCEmitter* mc = t->mc;
+
+ mc->set_section(mc, fd->text_section_id);
+ mc->emit_align(mc, 4, 0);
+
+ /* Reset per-function bookkeeping. */
+ a->fd = fd;
+ a->func_start = mc->pos(mc);
+ a->next_param_int = 0;
+ a->next_param_fp = 0;
+ a->next_param_stack = 0;
+ a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
+ a->cum_off = 0;
+ a->max_outgoing = 0;
+ a->used_cs_int_mask = 0;
+ a->used_cs_fp_mask = 0;
+ a->nslots = 0;
+ a->nscopes = 0;
+ a->has_alloca = 0;
+ a->nadd_patches = 0;
+ a->sret_ptr_slot = FRAME_SLOT_NONE;
+ a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
+ a->gp_save_slot = FRAME_SLOT_NONE;
+ a->fp_save_slot = FRAME_SLOT_NONE;
+ a->epilogue_label = mc->label_new(mc);
+
+ mc->cfi_startproc(mc);
+
+ /* Placeholder prologue: patched with the real one in aa_func_end. */
+ a->prologue_pos = mc->pos(mc);
+ for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa64_emit32(mc, AA64_NOP);
+
+ if (a->has_sret) {
+ /* Slot for the sret pointer; the store itself is part of the patched
+ * prologue. */
+ FrameSlotDesc fsd = {
+ .type = CFREE_CG_TYPE_NONE,
+ .name = 0,
+ .loc = (SrcLoc){0, 0, 0},
+ .size = 8,
+ .align = 8,
+ .kind = FS_SPILL,
+ .flags = 0,
+ };
+ a->sret_ptr_slot = aa_frame_slot(t, &fsd);
+ }
+
+ if (a->is_variadic) {
+ /* 8 registers x 8 bytes. */
+ FrameSlotDesc gpd = {
+ .type = CFREE_CG_TYPE_NONE,
+ .name = 0,
+ .loc = (SrcLoc){0, 0, 0},
+ .size = 64,
+ .align = 8,
+ .kind = FS_SPILL,
+ .flags = 0,
+ };
+ a->gp_save_slot = aa_frame_slot(t, &gpd);
+ /* 8 registers at a 16-byte stride. */
+ FrameSlotDesc fpd = {
+ .type = CFREE_CG_TYPE_NONE,
+ .name = 0,
+ .loc = (SrcLoc){0, 0, 0},
+ .size = 128,
+ .align = 16,
+ .kind = FS_SPILL,
+ .flags = 0,
+ };
+ a->fp_save_slot = aa_frame_slot(t, &fpd);
+ AASlot* gs = aa64_slot_get(a, a->gp_save_slot);
+ AASlot* fs = aa64_slot_get(a, a->fp_save_slot);
+ /* Slot offsets count downwards from register 29, hence the negation. */
+ for (u32 i = 0; i < 8; ++i) {
+ aa64_emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i * 8));
+ }
+ for (u32 i = 0; i < 8; ++i) {
+ aa64_emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i * 16));
+ }
+ }
+}
+
+/* Append `word` to the prologue image in `words`, advancing `*wi`.  Panics
+ * instead of writing past the AA_PROLOGUE_WORDS slots that aa_func_begin
+ * reserved. */
+static void aa_prologue_push(CGTarget* t, AAImpl* a, u32* words, u32* wi,
+                             u32 word) {
+  if (*wi >= AA_PROLOGUE_WORDS) {
+    compiler_panic(
+        t->c, a->loc,
+        "aarch64: prologue placeholder too small (used %u of %u words)", *wi,
+        AA_PROLOGUE_WORDS);
+    return; /* only reached if compiler_panic ever returns */
+  }
+  words[*wi] = word;
+  *wi += 1;
+}
+
+/* Finish the current function: emit the epilogue, back-patch the prologue
+ * placeholder and the recorded alloca ADDs, define the function symbol and
+ * close the CFI procedure.
+ *
+ * Frame layout, as byte offsets from SP after the prologue:
+ *   [0, max_outgoing)          outgoing stack arguments
+ *   [int_save_off, ...)        used callee-saved x19..x28, 8 bytes each
+ *   [fp_save_off, ...)         used callee-saved fp regs 8..15, 8 bytes each
+ *   [locals_off, fp_lr_off)    frame slots (cum_off bytes plus padding)
+ *   [fp_lr_off, frame_size)    saved x29/x30; x29 = SP + fp_lr_off
+ * frame_size is rounded up to 16 bytes.
+ *
+ * Panics when the frame record offset cannot be encoded, when the prologue
+ * does not fit the placeholder, or when an alloca-related immediate is out
+ * of range. */
+void aa_func_end(CGTarget* t) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+
+  u32 int_regs[10];
+  u32 fp_regs[8];
+  u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs);
+  u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs);
+
+  u32 int_save_off = a->max_outgoing;
+  u32 fp_save_off = int_save_off + n_int_saves * 8u;
+  u32 locals_off = fp_save_off + n_fp_saves * 8u;
+  u32 fp_lr_off = locals_off + a->cum_off;
+  u32 frame_size = fp_lr_off + 16;
+  frame_size = (frame_size + 15u) & ~15u;
+  fp_lr_off = frame_size - 16;
+
+  /* aa64_stp_x / aa64_ldp_x keep only a 7-bit signed offset scaled by 8
+   * (-512..504 bytes).  A larger fp_lr_off used to be masked silently, so the
+   * frame record was stored at — and reloaded from — the wrong address.
+   * Refuse to emit that code.  Lifting the limit needs a different frame
+   * record sequence (e.g. addressing it through a scratch register). */
+  if (fp_lr_off > 504u) {
+    compiler_panic(t->c, a->loc,
+                   "aarch64: frame record offset %u out of STP/LDP imm7 range",
+                   fp_lr_off);
+  }
+
+  /* ---- epilogue ------------------------------------------------------ */
+  mc->label_place(mc, a->epilogue_label);
+
+  if (a->has_alloca) {
+    /* SP may have moved: recompute it from the frame pointer. */
+    if (fp_lr_off <= 0xfff) {
+      aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=*/29, fp_lr_off, 0));
+    } else {
+      compiler_panic(t->c, a->loc,
+                     "aarch64: has_alloca + fp_lr_off %u out of imm12 range",
+                     fp_lr_off);
+    }
+  }
+
+  /* Restore callee-saved registers in reverse order of the prologue saves. */
+  for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) {
+    u32 r0 = fp_regs[i];
+    aa64_emit32(mc, aa64_ldr_fp_uimm(3, r0, 31, fp_save_off + (u32)i * 8u));
+  }
+  for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) {
+    u32 r0 = int_regs[i];
+    aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31, int_save_off + (u32)i * 8u));
+  }
+  aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
+  emit_sp_add(mc, frame_size);
+  aa64_emit32(mc, aa64_ret(AA64_LR));
+
+  /* ---- prologue (patched over the NOP placeholder) ------------------- */
+  u32 pos = a->prologue_pos;
+  ObjBuilder* obj = t->obj;
+  u32 sec = a->fd->text_section_id;
+
+  u32 words[AA_PROLOGUE_WORDS];
+  for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) words[i] = AA64_NOP;
+  u32 wi = 0;
+
+  if (frame_size <= 0xfff) {
+    aa_prologue_push(t, a, words, &wi, aa64_sub_imm(1, 31, 31, frame_size, 0));
+  } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) {
+    aa_prologue_push(t, a, words, &wi,
+                     aa64_sub_imm(1, 31, 31, frame_size >> 12, 1));
+  } else {
+    if (wi + 2 > AA_PROLOGUE_WORDS) {
+      compiler_panic(t->c, a->loc,
+                     "aarch64: prologue overflow for frame_size %u",
+                     frame_size);
+    }
+    aa_prologue_push(t, a, words, &wi,
+                     aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1));
+    aa_prologue_push(t, a, words, &wi,
+                     aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0));
+  }
+  aa_prologue_push(t, a, words, &wi, aa64_stp_x(29, 30, 31, (i32)fp_lr_off));
+  aa_prologue_push(t, a, words, &wi, aa64_add_imm(1, 29, 31, fp_lr_off, 0));
+  if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
+    AASlot* s = aa64_slot_get(a, a->sret_ptr_slot);
+    if (s) {
+      /* Spill the incoming sret pointer (register 8) to its slot. */
+      aa_prologue_push(t, a, words, &wi, aa64_stur(3, 8, 29, -(i32)s->off));
+    }
+  }
+  for (u32 i = 0; i < n_int_saves; ++i) {
+    aa_prologue_push(t, a, words, &wi,
+                     aa64_str_uimm(3, int_regs[i], 31, int_save_off + i * 8u));
+  }
+  for (u32 i = 0; i < n_fp_saves; ++i) {
+    aa_prologue_push(
+        t, a, words, &wi,
+        aa64_str_fp_uimm(3, fp_regs[i], 31, fp_save_off + i * 8u));
+  }
+
+  for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) {
+    aa64_patch32(obj, sec, pos + i * 4u, words[i]);
+  }
+
+  /* ---- alloca patches: ADD dst, SP, #max_outgoing -------------------- */
+  if (a->max_outgoing > 0xfff) {
+    compiler_panic(
+        t->c, a->loc,
+        "aarch64: max_outgoing %u out of imm12 range for alloca patch",
+        a->max_outgoing);
+  }
+  for (u32 i = 0; i < a->nadd_patches; ++i) {
+    u32 dr = a->add_patches[i].dst_reg;
+    u32 word = aa64_add_imm(1, dr, /*Rn=SP*/ 31, a->max_outgoing, 0);
+    aa64_patch32(obj, sec, a->add_patches[i].pos, word);
+  }
+
+  u32 end = mc->pos(mc);
+  obj_symbol_define(obj, a->fd->sym, sec, (u64)a->func_start,
+                    (u64)(end - a->func_start));
+
+  mc->cfi_endproc(mc);
+  a->fd = NULL;
+}
+
+/* ============================================================
+ * Frame slots
+ * ============================================================ */
+
+/* Allocate a frame slot and return its handle.
+ *
+ * Slots grow away from the frame pointer: the new slot's `off` is the running
+ * total cum_off plus the slot size, rounded up to the requested alignment.
+ * A zero size is treated as 8 and a zero alignment as 1.  The slot table
+ * doubles (starting at 8 entries) from the translation-unit arena when full.
+ * The returned handle is the slot's 1-based index.
+ * NOTE(review): the rounding uses `align - 1` as a bit mask, so `align` is
+ * assumed to be a power of two — confirm callers guarantee that. */
+FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) {
+ AAImpl* a = impl_of(t);
+ if (a->nslots == a->slots_cap) {
+ u32 ncap = a->slots_cap ? a->slots_cap * 2 : 8;
+ AASlot* nbuf = arena_array(t->c->tu, AASlot, ncap);
+ if (a->slots) memcpy(nbuf, a->slots, sizeof(AASlot) * a->nslots);
+ a->slots = nbuf;
+ a->slots_cap = ncap;
+ }
+ u32 size = d->size ? d->size : 8;
+ u32 align = d->align ? d->align : 1;
+ u32 next = a->cum_off + size;
+ u32 mask = align - 1;
+ next = (next + mask) & ~mask;
+
+ AASlot* s = &a->slots[a->nslots];
+ s->off = next;
+ s->size = size;
+ s->align = align;
+ s->kind = d->kind;
+
+ a->cum_off = next;
+ a->nslots++;
+ return (FrameSlot)(a->nslots);
+}
+
+/* ============================================================
+ * Parameters
+ * ============================================================ */
+
+/* Move one incoming parameter from where the caller passed it into its frame
+ * slot (addressed as register 29 minus the slot offset).
+ *
+ *   ABI_ARG_IGNORE    nothing is emitted.
+ *   ABI_ARG_INDIRECT  the caller passed a pointer (next integer argument
+ *                     register, or the next 8-byte stack word once eight
+ *                     have been used); the pointee is copied into the slot in
+ *                     8/4/2/1-byte pieces through AA_TMP1.
+ *   otherwise         each ABI part is stored from the next integer or fp
+ *                     argument register, or — once eight registers of that
+ *                     class have been used — loaded from the caller's stack
+ *                     area at [x29 + 16 + offset] and then stored.
+ * Panics on an unknown slot or an unsupported ABI class.
+ * NOTE(review): the offsets handed to aa64_ldur/aa64_stur are not
+ * range-checked here, and aa64_stur keeps only the low 9 bits; large slot
+ * offsets, indirect aggregates over 255 bytes, or many stack parameters look
+ * like they would be encoded incorrectly — confirm and add a check. */
+void aa_param(CGTarget* t, const CGParamDesc* p) {
+ AAImpl* a = impl_of(t);
+ AASlot* s = aa64_slot_get(a, p->slot);
+ if (!s) {
+ compiler_panic(t->c, a->loc, "aarch64 param: bad slot");
+ }
+ const ABIArgInfo* ai = p->abi;
+
+ if (ai->kind == ABI_ARG_IGNORE) return;
+ if (ai->kind == ABI_ARG_INDIRECT) {
+ /* Locate the pointer to the caller's copy. */
+ u32 ptr_reg;
+ if (a->next_param_int < 8) {
+ ptr_reg = a->next_param_int++;
+ } else {
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ aa64_emit32(t->mc, aa64_ldur(3, AA_TMP0, 29, (i32)(16 + caller_off)));
+ ptr_reg = AA_TMP0;
+ }
+ /* Copy s->size bytes, widest pieces first. */
+ u32 nbytes = s->size;
+ u32 i = 0;
+ while (i + 8 <= nbytes) {
+ aa64_emit32(t->mc, aa64_ldur(3, AA_TMP1, ptr_reg, (i32)i));
+ aa64_emit32(t->mc, aa64_stur(3, AA_TMP1, 29, -(i32)s->off + (i32)i));
+ i += 8;
+ }
+ while (i + 4 <= nbytes) {
+ aa64_emit32(t->mc, aa64_ldur(2, AA_TMP1, ptr_reg, (i32)i));
+ aa64_emit32(t->mc, aa64_stur(2, AA_TMP1, 29, -(i32)s->off + (i32)i));
+ i += 4;
+ }
+ while (i + 2 <= nbytes) {
+ aa64_emit32(t->mc, aa64_ldur(1, AA_TMP1, ptr_reg, (i32)i));
+ aa64_emit32(t->mc, aa64_stur(1, AA_TMP1, 29, -(i32)s->off + (i32)i));
+ i += 2;
+ }
+ while (i < nbytes) {
+ aa64_emit32(t->mc, aa64_ldur(0, AA_TMP1, ptr_reg, (i32)i));
+ aa64_emit32(t->mc, aa64_stur(0, AA_TMP1, 29, -(i32)s->off + (i32)i));
+ i += 1;
+ }
+ return;
+ }
+ /* Direct: one store per ABI part, at the part's offset inside the slot. */
+ for (u16 i = 0; i < ai->nparts; ++i) {
+ const ABIArgPart* pt = &ai->parts[i];
+ u32 part_off = pt->src_offset;
+ u32 sz = pt->size;
+ u32 sidx = size_idx_for_bytes(sz);
+
+ if (pt->cls == ABI_CLASS_INT) {
+ if (a->next_param_int < 8) {
+ u32 reg = a->next_param_int++;
+ aa64_emit32(t->mc, aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off));
+ } else {
+ /* Out of integer argument registers: fetch from the caller's stack. */
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ aa64_emit32(t->mc, aa64_ldur(sidx, AA_TMP0, 29, (i32)(16 + caller_off)));
+ aa64_emit32(t->mc,
+ aa64_stur(sidx, AA_TMP0, 29,
+ -(i32)s->off + (i32)part_off));
+ }
+ } else if (pt->cls == ABI_CLASS_FP) {
+ if (a->next_param_fp < 8) {
+ u32 reg = a->next_param_fp++;
+ aa64_emit32(t->mc,
+ aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off));
+ } else {
+ /* Out of fp argument registers: fetch from the caller's stack. */
+ u32 caller_off = a->next_param_stack;
+ a->next_param_stack += 8;
+ aa64_emit32(t->mc,
+ aa64_ldur_fp(sidx, AA_FP_TMP0, 29,
+ (i32)(16 + caller_off)));
+ aa64_emit32(t->mc,
+ aa64_stur_fp(sidx, AA_FP_TMP0, 29,
+ -(i32)s->off + (i32)part_off));
+ }
+ } else {
+ compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl",
+ (int)pt->cls);
+ }
+ }
+}
+
+/* ============================================================
+ * Address materialization helpers
+ * ============================================================ */
+
+/* Nonzero when obj_symbol_extern_via_got says `sym` must be addressed
+ * through the GOT for this compiler/object pair. */
+static int use_got_for_sym(CGTarget* t, ObjSymId sym) {
+ return obj_symbol_extern_via_got(t->c, t->obj, sym);
+}
+
+/* Load the address of `sym` from its GOT entry into dst_reg.  Emits an
+ * ADRP + 64-bit LDR pair in the emitter's current section, with
+ * R_AARCH64_ADR_GOT_PAGE on the ADRP and R_AARCH64_LD64_GOT_LO12_NC on the
+ * LDR; dst_reg serves as both the page base and the destination. */
+void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym) {
+ MCEmitter* mc = t->mc;
+ u32 sec = mc->section_id;
+ u32 adrp_pos = mc->pos(mc);
+ aa64_emit32(mc, aa64_adrp_base(dst_reg));
+ mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_GOT_PAGE, sym, 0, 0, 0);
+ u32 ldr_pos = mc->pos(mc);
+ aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, dst_reg, dst_reg, 0));
+ mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LD64_GOT_LO12_NC, sym, 0, 0, 0);
+}
+
+/* Materialise the address `sym + addend` in dst_reg.
+ *
+ * GOT symbols: load the symbol address from the GOT, then apply a nonzero
+ * addend with aa64_emit_addr_adjust (the GOT relocations carry no addend).
+ * Everything else: ADRP + ADD with R_AARCH64_ADR_PREL_PG_HI21 and
+ * R_AARCH64_ADD_ABS_LO12_NC, both carrying the addend.
+ * NOTE(review): the GOT path narrows `addend` to i32 and passes dst_reg as
+ * both destination and base of aa64_emit_addr_adjust — confirm that helper
+ * supports Rd == base for every offset size. */
+void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend) {
+ MCEmitter* mc = t->mc;
+ if (use_got_for_sym(t, sym)) {
+ aa64_emit_got_load_addr(t, dst_reg, sym);
+ if (addend) aa64_emit_addr_adjust(mc, dst_reg, dst_reg, (i32)addend);
+ return;
+ }
+ u32 sec = mc->section_id;
+ u32 adrp_pos = mc->pos(mc);
+ aa64_emit32(mc, aa64_adrp_base(dst_reg));
+ mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, addend,
+ 0, 0);
+ u32 add_pos = mc->pos(mc);
+ aa64_emit32(mc, aa64_add_imm(1, dst_reg, dst_reg, 0, 0));
+ mc->emit_reloc_at(mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, addend, 0,
+ 0);
+}
+
+/* Emit Rd = base + off (64-bit).
+ *
+ *   off == 0        MOV Rd, base
+ *   |off| < 2^12    one ADD/SUB immediate
+ *   |off| < 2^24    up to two ADD/SUB immediates (bits 12..23, then 0..11)
+ *   otherwise       load `off` into a register and ADD (register form)
+ *
+ * Rd may equal base.  In the large-offset case that requires a scratch
+ * register for the constant: AA_TMP2 is clobbered (AA_TMP1 when base itself
+ * is AA_TMP2).  When Rd differs from base the emitted code is unchanged from
+ * before and no temporary is touched. */
+void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off) {
+  if (off == 0) {
+    aa64_emit32(mc, aa64_mov_reg(1, Rd, base));
+    return;
+  }
+  /* Negate in unsigned arithmetic: -off overflows a signed i32 when off is
+   * the most negative value. */
+  u32 abs_off = (off < 0) ? (0u - (u32)off) : (u32)off;
+  if (abs_off <= 0xfff) {
+    if (off < 0)
+      aa64_emit32(mc, aa64_sub_imm(1, Rd, base, abs_off, 0));
+    else
+      aa64_emit32(mc, aa64_add_imm(1, Rd, base, abs_off, 0));
+    return;
+  }
+  if ((abs_off >> 24) == 0) {
+    u32 hi = (abs_off >> 12) & 0xfff;
+    u32 lo = abs_off & 0xfff;
+    if (off < 0) {
+      if (hi) aa64_emit32(mc, aa64_sub_imm(1, Rd, base, hi, 1));
+      if (lo) aa64_emit32(mc, aa64_sub_imm(1, Rd, hi ? Rd : base, lo, 0));
+    } else {
+      if (hi) aa64_emit32(mc, aa64_add_imm(1, Rd, base, hi, 1));
+      if (lo) aa64_emit32(mc, aa64_add_imm(1, Rd, hi ? Rd : base, lo, 0));
+    }
+    return;
+  }
+  /* Large offset: the constant needs a register of its own.  Building it in
+   * Rd is only safe when Rd is not also the base — otherwise the base would
+   * be overwritten before the ADD and the result would be off + off. */
+  u32 imm_reg = Rd;
+  if ((Rd & 0x1fu) == (base & 0x1fu)) {
+    imm_reg = ((base & 0x1fu) == AA_TMP2) ? AA_TMP1 : AA_TMP2;
+  }
+  aa64_emit_load_imm(mc, 1, imm_reg, off);
+  aa64_emit32(mc, aa64_add(1, Rd, base, imm_reg));
+}
diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h
@@ -0,0 +1,306 @@
+/* arch/aa64/internal.h — private types and forward decls shared across
+ * emit.c / alloc.c / ops.c. NOT part of the public API. */
+#pragma once
+
+#include <string.h>
+
+#include "arch/aa64/asm.h"
+#include "arch/aa64/isa.h"
+#include "arch/aa64/regs.h"
+#include "arch/arch.h"
+#include "core/arena.h"
+#include "core/pool.h"
+#include "obj/obj.h"
+
+/* ============================================================
+ * Local encoding helpers (kept here, not in arch/aa64/isa.h).
+ * ============================================================ */
+
+#define AA64_NOP 0xD503201Fu
+
+/* Hidden backend temporaries. These must stay outside the allocable pools and
+ * outside optimizer scratch registers because CGTarget ops may clobber them
+ * while lowering a single operation. AA_FP_TMP0 names v31, not integer x31. */
+enum {
+  AA_TMP0 = 9u,     /* hidden scratch, register number 9 */
+  AA_TMP1 = 10u,    /* hidden scratch, register number 10 */
+  AA_TMP2 = 11u,    /* hidden scratch, register number 11 */
+  AA_FP_TMP0 = 31u, /* hidden FP scratch: v31, not the integer x31 encoding */
+};
+#define CG_BUILTIN_ID(k) ((CfreeCgTypeId)((1u << 6) | (u32)(k)))
+
+/* Load/store-pair encoders, signed-offset addressing ([Rn, #byte_off]).
+ *
+ * The _x variants use base opcodes 0xA9000000 / 0xA9400000 and the _d
+ * variants 0x6D000000 / 0x6D400000. byte_off is shifted right by 3 (the
+ * offset is encoded in units of 8 bytes) and truncated to the 7-bit imm7
+ * field; register numbers are masked to 5 bits. Nothing is range- or
+ * alignment-checked here. */
+static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
+  const u32 imm7 = (u32)(byte_off >> 3) & 0x7fu;
+  u32 word = 0xA9000000u;
+  word |= imm7 << 15;
+  word |= (Rt2 & 0x1f) << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
+  const u32 imm7 = (u32)(byte_off >> 3) & 0x7fu;
+  u32 word = 0xA9400000u;
+  word |= imm7 << 15;
+  word |= (Rt2 & 0x1f) << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
+  const u32 imm7 = (u32)(byte_off >> 3) & 0x7fu;
+  u32 word = 0x6D000000u;
+  word |= imm7 << 15;
+  word |= (Rt2 & 0x1f) << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
+  const u32 imm7 = (u32)(byte_off >> 3) & 0x7fu;
+  u32 word = 0x6D400000u;
+  word |= imm7 << 15;
+  word |= (Rt2 & 0x1f) << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* Unscaled-offset load/store encoders (STUR / LDUR and their FP forms).
+ *
+ * `size` is placed unmasked in bits 31:30, `simm9` is truncated to its low
+ * 9 bits (two's complement) and placed at bit 12; Rn/Rt are masked to
+ * 5 bits. The _fp variants differ only in the base opcode. */
+static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9) {
+  const u32 imm9 = (u32)simm9 & 0x1ffu;
+  u32 word = 0x38000000u | (size << 30);
+  word |= imm9 << 12;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9) {
+  const u32 imm9 = (u32)simm9 & 0x1ffu;
+  u32 word = 0x38400000u | (size << 30);
+  word |= imm9 << 12;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) {
+  const u32 imm9 = (u32)simm9 & 0x1ffu;
+  u32 word = 0x3C000000u | (size << 30);
+  word |= imm9 << 12;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) {
+  const u32 imm9 = (u32)simm9 & 0x1ffu;
+  u32 word = 0x3C400000u | (size << 30);
+  word |= imm9 << 12;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* Scaled unsigned-offset load/store encoders.
+ *
+ * `size` is both the log2 access size used to scale byte_off down and the
+ * value placed (unmasked) in bits 31:30. The scaled offset is truncated to
+ * the 12-bit imm12 field; misaligned or out-of-range offsets are not
+ * diagnosed. */
+static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
+  const u32 imm12 = (byte_off >> size) & 0xfffu;
+  u32 word = 0x39000000u | (size << 30);
+  word |= imm12 << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+static inline u32 aa64_ldr_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
+  const u32 imm12 = (byte_off >> size) & 0xfffu;
+  u32 word = 0x39400000u | (size << 30);
+  word |= imm12 << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+static inline u32 aa64_str_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
+  const u32 imm12 = (byte_off >> size) & 0xfffu;
+  u32 word = 0x3D000000u | (size << 30);
+  word |= imm12 << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* MRS Xt, TPIDR_EL0: read the TPIDR_EL0 system register into Xt. */
+static inline u32 aa64_mrs_tpidr_el0(u32 Rt) {
+  return 0xD53BD040u | (Rt & 0x1fu);
+}
+/* B / BL opcode words with a zero imm26 displacement field. The displacement
+ * is not set here; presumably a relocation or later patch fills it in —
+ * confirm at the call sites. */
+static inline u32 aa64_b_base(void) { return 0x14000000u; }
+static inline u32 aa64_bl_base(void) { return 0x94000000u; }
+
+/* ADRP Xd with a zero page immediate; emit.c pairs it with a page relocation
+ * recorded at the instruction's offset. */
+static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd & 0x1f); }
+
+/* FP/SIMD LDR, scaled unsigned offset. `size` scales byte_off down and is
+ * placed (unmasked) in bits 31:30; the scaled offset is truncated to 12 bits. */
+static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
+  const u32 imm12 = (byte_off >> size) & 0xfffu;
+  u32 word = 0x3D400000u | (size << 30);
+  word |= imm12 << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* FMOV (register): FP register-to-register move. `type` is masked to 2 bits
+ * and placed in bits 23:22. */
+static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn) {
+  u32 word = 0x1E204000u;
+  word |= (type & 3) << 22;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* SUBS (immediate), unshifted: Rd = Rn - imm12, setting flags. `sf` goes in
+ * bit 31 unmasked; imm12 is truncated to 12 bits. */
+static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12) {
+  u32 word = 0x71000000u | (sf << 31);
+  word |= (imm12 & 0xfff) << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* CSET Rd, eq — encoded as CSINC Rd, ZR, ZR, ne (the condition field holds
+ * the inverse of the condition being tested). */
+static inline u32 aa64_cset_eq(u32 sf, u32 Rd) {
+  const u32 zr = 31u;       /* both source operands are the zero register */
+  const u32 cond_ne = 0x1u; /* inverse of EQ */
+  return 0x1A800400u | (sf << 31) | (zr << 16) | (cond_ne << 12) | (zr << 5) |
+         (Rd & 0x1f);
+}
+
+/* FP <-> integer conversion encoders. `sf` goes in bit 31 unmasked, `type`
+ * is masked to 2 bits and placed in bits 23:22. For FCVTZS/FCVTZU Rd is the
+ * integer destination and Rn the FP source; for SCVTF/UCVTF it is the other
+ * way round. */
+static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn) {
+  u32 word = 0x1E380000u | (sf << 31);
+  word |= (type & 3) << 22;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+static inline u32 aa64_fcvtzu(u32 sf, u32 type, u32 Rd, u32 Rn) {
+  u32 word = 0x1E390000u | (sf << 31);
+  word |= (type & 3) << 22;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+static inline u32 aa64_scvtf(u32 sf, u32 type, u32 Rd, u32 Rn) {
+  u32 word = 0x1E220000u | (sf << 31);
+  word |= (type & 3) << 22;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+static inline u32 aa64_ucvtf(u32 sf, u32 type, u32 Rd, u32 Rn) {
+  u32 word = 0x1E230000u | (sf << 31);
+  word |= (type & 3) << 22;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* FCVT Dd, Sn: convert single precision (source) to double precision. */
+static inline u32 aa64_fcvt_d_s(u32 Rd, u32 Rn) {
+  return 0x1E22C000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
+}
+/* FCVT Sd, Dn: convert double precision (source) to single precision. */
+static inline u32 aa64_fcvt_s_d(u32 Rd, u32 Rn) {
+  return 0x1E624000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
+}
+
+/* FMOV between general-purpose and FP registers. The function name is
+ * <dst>_<src>: s/d are 32/64-bit FP registers, w/x are 32/64-bit integer
+ * registers. Rd is always the destination number and Rn the source number. */
+static inline u32 aa64_fmov_s_w(u32 Rd, u32 Rn) {
+  return 0x1E270000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
+}
+static inline u32 aa64_fmov_w_s(u32 Rd, u32 Rn) {
+  return 0x1E260000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
+}
+static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn) {
+  return 0x9E670000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
+}
+static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn) {
+  return 0x9E660000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
+}
+
+/* SUB (extended register), 64-bit, UXTX extend with no shift amount:
+ * Rd = Rn - Rm. */
+static inline u32 aa64_sub_extreg_x_uxtx(u32 Rd, u32 Rn, u32 Rm) {
+  u32 word = 0xCB206000u;
+  word |= (Rm & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* SUBS (shifted register) with shift type and amount left at zero:
+ * Rd = Rn - Rm, setting flags. `sf` goes in bit 31 unmasked. */
+static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
+  u32 word = 0x6B000000u | (sf << 31);
+  word |= (Rm & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* B.<cond> opcode word with a zero imm19 displacement; `cond` is masked to
+ * 4 bits and placed in bits 3:0. */
+static inline u32 aa64_b_cond(u32 cond) { return 0x54000000u | (cond & 0xfu); }
+
+/* CSINC Rd, Rn, Rm, cond. `sf` goes in bit 31 unmasked; cond is masked to
+ * 4 bits, registers to 5 bits. */
+static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) {
+  u32 word = 0x1A800400u | (sf << 31);
+  word |= (Rm & 0x1f) << 16;
+  word |= (cond & 0xfu) << 12;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+/* CSET Rd, cond: CSINC Rd, ZR, ZR with the condition's low bit flipped
+ * (which selects the inverse condition). */
+static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond) {
+  const u32 inverted = cond ^ 1u;
+  return aa64_csinc(sf, Rd, 31u, 31u, inverted);
+}
+
+/* Scalar FP arithmetic encoders: Rd = Rn <op> Rm. `type` is masked to 2 bits
+ * and placed in bits 23:22; the four functions differ only in base opcode. */
+static inline u32 aa64_fadd(u32 type, u32 Rd, u32 Rn, u32 Rm) {
+  u32 word = 0x1E202800u;
+  word |= (type & 3) << 22;
+  word |= (Rm & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+static inline u32 aa64_fsub(u32 type, u32 Rd, u32 Rn, u32 Rm) {
+  u32 word = 0x1E203800u;
+  word |= (type & 3) << 22;
+  word |= (Rm & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+static inline u32 aa64_fmul(u32 type, u32 Rd, u32 Rn, u32 Rm) {
+  u32 word = 0x1E200800u;
+  word |= (type & 3) << 22;
+  word |= (Rm & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+static inline u32 aa64_fdiv(u32 type, u32 Rd, u32 Rn, u32 Rm) {
+  u32 word = 0x1E201800u;
+  word |= (type & 3) << 22;
+  word |= (Rm & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* Bitfield-move encoders (SBFM / UBFM / BFM). `sf` is written to both bit 31
+ * and bit 22 (the N bit); immr and imms are masked to 6 bits. */
+static inline u32 aa64_sbfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
+  u32 word = 0x13000000u | (sf << 31) | (sf << 22);
+  word |= (immr & 0x3fu) << 16;
+  word |= (imms & 0x3fu) << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+static inline u32 aa64_ubfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
+  u32 word = 0x53000000u | (sf << 31) | (sf << 22);
+  word |= (immr & 0x3fu) << 16;
+  word |= (imms & 0x3fu) << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
+  u32 word = 0x33000000u | (sf << 31) | (sf << 22);
+  word |= (immr & 0x3fu) << 16;
+  word |= (imms & 0x3fu) << 10;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* ============================================================
+ * AAImpl types
+ * ============================================================ */
+
+#define AA_PROLOGUE_WORDS \
+ 22u /* worst case: sub sp + stp/add fp + sret + 5 int + 8 fp saves */
+
+/* Record describing one frame slot.
+ * NOTE(review): the field notes are inferred from names only; confirm against
+ * aa_frame_slot() / aa64_slot_get(). */
+typedef struct AASlot {
+  u32 off;   /* presumably the slot's offset within the frame — TODO confirm */
+  u32 size;  /* presumably the slot size in bytes */
+  u32 align; /* presumably the required alignment in bytes */
+  u8 kind;
+  u8 pad[3]; /* explicit padding after the single-byte `kind` */
+} AASlot;
+
+/* One entry of the backend's scope stack (see AAImpl.scopes).
+ * NOTE(review): the field notes are inferred from names only; confirm against
+ * the scope handling code. */
+typedef struct AAScope {
+  u8 kind;
+  u8 has_else; /* presumably set once an else arm has been opened */
+  u8 pad[2];   /* explicit padding */
+  MCLabel else_label;
+  MCLabel end_label;
+  Label break_label;
+  Label continue_label;
+} AAScope;
+
+/* State of the aa64 code generator; `base` (the generic CGTarget) is the
+ * first member.
+ * NOTE(review): apart from the two callee-saved masks, the grouping comments
+ * below are inferred from field names; confirm against emit.c / alloc.c /
+ * ops.c. */
+typedef struct AAImpl {
+  CGTarget base;
+  SrcLoc loc;
+  const CGFuncDesc* fd;
+
+  /* Positions and labels for the function being emitted (presumably). */
+  u32 func_start;
+  u32 prologue_pos;
+  MCLabel epilogue_label;
+
+  /* Frame-slot table: array, element count, capacity (by naming). */
+  AASlot* slots;
+  u32 nslots;
+  u32 slots_cap;
+  u32 cum_off;
+  u32 max_outgoing;
+
+  /* Parameter-assignment cursors (presumably next free int reg, FP reg and
+   * stack offset). */
+  u32 next_param_int;
+  u32 next_param_fp;
+  u32 next_param_stack;
+  u8 has_sret;
+  FrameSlot sret_ptr_slot;
+
+  u32 used_cs_int_mask; /* bit reg set when x19-x28 must be preserved */
+  u32 used_cs_fp_mask;  /* bit reg set when d8-d15 must be preserved */
+
+  /* Scope stack: array, element count, capacity (by naming). */
+  AAScope* scopes;
+  u32 nscopes;
+  u32 scopes_cap;
+
+  u8 has_alloca;
+  /* Recorded instruction positions with their destination register,
+   * presumably patched once the final frame size is known — TODO confirm. */
+  struct AAAllocaPatch {
+    u32 pos;
+    u32 dst_reg;
+  }* add_patches;
+  u32 nadd_patches;
+  u32 add_patches_cap;
+
+  u8 is_variadic;
+  FrameSlot gp_save_slot;
+  FrameSlot fp_save_slot;
+} AAImpl;
+
+/* ============================================================
+ * Cross-file forward declarations
+ * ============================================================ */
+
+/* emit.c helpers used in alloc.c / ops.c */
+void aa64_emit32(MCEmitter* mc, u32 word);
+void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word);
+void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm);
+void emit_sp_add(MCEmitter* mc, u32 imm);
+void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off);
+void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym);
+void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend);
+
+/* emit.c public surface */
+FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d);
+void aa_func_begin(CGTarget* t, const CGFuncDesc* fd);
+void aa_func_end(CGTarget* t);
+void aa_param(CGTarget* t, const CGParamDesc* p);
+
+/* alloc.c helpers used in emit.c / ops.c */
+AAImpl* impl_of(CGTarget* t);
+AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs);
+void aa_jump(CGTarget* t, Label l);
+
+/* ops.c helpers used in alloc.c */
+void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma);
+void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma);
+u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch);
+
+/* alloc.c helpers used in ops.c */
+void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op);
+void aa_alloc_vtable_init(CGTarget* t);
+void aa_coord_vtable_init(CGTarget* t);
+
+/* shared type helpers (defined in emit.c, used broadly) */
+int type_is_64(CfreeCgTypeId t);
+int type_is_fp_double(CfreeCgTypeId t);
+int type_is_signed(CfreeCgTypeId t);
+u32 type_byte_size(CfreeCgTypeId t);
+u32 size_idx_for_bytes(u32 nbytes);
+u32 reg_num(Operand op);
diff --git a/src/arch/aa64/isa.c b/src/arch/aa64/isa.c
@@ -0,0 +1,598 @@
+/* AArch64 instruction descriptor table + operand print/parse dispatch.
+ *
+ * The table mirrors the inline encoders in arch/aa64/isa.h: each row records
+ * (mnemonic, match, mask, format, flags) so the disassembler can identify
+ * a raw 32-bit word with one mask-and-compare and then dispatch on the
+ * format to extract operand fields via the same unpack functions the
+ * encoder uses. Encoder and decoder share the bit knowledge — when an
+ * opcode value or field position changes, both sides update at one site.
+ *
+ * Mask values include the family mask plus the bits that distinguish a
+ * specific instruction from its siblings in the same family. sf (bit 31)
+ * is intentionally a don't-care for formats where both 32- and 64-bit
+ * forms share one row; the unpacker reads sf separately when printing
+ * operands.
+ *
+ * Row ordering: first-match wins. Aliases (rows with AA64_ASMFL_ALIAS)
+ * are tighter masks placed BEFORE the canonical row they alias so the
+ * disassembler renders the alias spelling. The assembler accepts both
+ * spellings — they map to the same encoded word. */
+
+#include "arch/aa64/isa.h"
+
+#include <stddef.h>
+
+/* Instruction descriptor table, scanned front to back by aa64_disasm_find();
+ * the first row whose (word & mask) == match wins, so alias rows must come
+ * before every broader row they overlap with. */
+const AA64InsnDesc aa64_insn_table[] = {
+    /* ----- Move-wide immediate (MOVN / MOVZ / MOVK) ----- */
+    {"movn", 0x12800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}},
+    {"movz", 0x52800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}},
+    {"movk", 0x72800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}},
+
+    /* ----- Logical, shifted register -----
+     * Alias MOV Rd, Rm is ORR Rd, ZR, Rm with shift=0, imm6=0. The mask
+     * pins Rn (bits 9:5) to 11111 (ZR) and shift/imm6 to 0 so only the
+     * MOV spelling matches; broader ORR rows below catch the rest. */
+    {"mov", 0x2A0003E0u, 0x7FE0FFE0u, AA64_FMT_LOG_SR, AA64_ASMFL_ALIAS,
+     {0, 0}},
+    /* MVN Rd, Rm ≡ ORN Rd, ZR, Rm (logical N=1, Rn=ZR, no shift) */
+    {"mvn", 0x2A2003E0u, 0x7FE0FFE0u, AA64_FMT_LOG_SR, AA64_ASMFL_ALIAS,
+     {0, 0}},
+    {"and", 0x0A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
+    {"bic", 0x0A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
+    {"orr", 0x2A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
+    {"orn", 0x2A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
+    {"eor", 0x4A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
+    {"eon", 0x4A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
+    {"ands", 0x6A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
+    {"bics", 0x6A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
+
+    /* ----- Add/Sub, shifted register -----
+     * CMP/CMN come before NEG/NEGS: SUBS ZR, ZR, Rm matches both the CMP
+     * row (Rd=ZR) and the NEGS row (Rn=ZR), and the architecture's
+     * preferred disassembly when Rd=ZR is CMP. */
+    /* CMP Rn, Rm ≡ SUBS ZR, Rn, Rm. */
+    {"cmp", 0x6B00001Fu, 0x7F20001Fu, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS,
+     {0, 0}},
+    /* CMN Rn, Rm ≡ ADDS ZR, Rn, Rm. */
+    {"cmn", 0x2B00001Fu, 0x7F20001Fu, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS,
+     {0, 0}},
+    /* NEG Rd, Rm ≡ SUB Rd, ZR, Rm (Rn=ZR, shift=0, imm6=0). */
+    {"neg", 0x4B0003E0u, 0x7FE0FFE0u, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS,
+     {0, 0}},
+    {"negs", 0x6B0003E0u, 0x7FE0FFE0u, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS,
+     {0, 0}},
+    {"add", 0x0B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}},
+    {"adds", 0x2B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}},
+    {"sub", 0x4B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}},
+    {"subs", 0x6B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}},
+
+    /* ----- Data-processing 3-source -----
+     * MUL Rd, Rn, Rm ≡ MADD Rd, Rn, Rm, ZR (Ra=ZR, op31=0, o0=0). */
+    {"mul", 0x1B007C00u, 0x7FE0FC00u, AA64_FMT_DP3, AA64_ASMFL_ALIAS, {0, 0}},
+    /* MNEG Rd, Rn, Rm ≡ MSUB Rd, Rn, Rm, ZR. */
+    {"mneg", 0x1B00FC00u, 0x7FE0FC00u, AA64_FMT_DP3, AA64_ASMFL_ALIAS, {0, 0}},
+    {"madd", 0x1B000000u, 0x7FE08000u, AA64_FMT_DP3, 0, {0, 0}},
+    {"msub", 0x1B008000u, 0x7FE08000u, AA64_FMT_DP3, 0, {0, 0}},
+
+    /* ----- Data-processing 2-source ----- */
+    {"udiv", 0x1AC00800u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
+    {"sdiv", 0x1AC00C00u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
+    {"lslv", 0x1AC02000u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
+    {"lsrv", 0x1AC02400u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
+    {"asrv", 0x1AC02800u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
+    {"rorv", 0x1AC02C00u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
+
+    /* ----- Unconditional branch (register) -----
+     * RET aliases its no-operand spelling to RET X30 (Rn=11110). The
+     * tighter row matches when Rn=30 and prints "ret" without operands;
+     * the looser row below catches RET Xn for other Rn. */
+    {"ret", 0xD65F03C0u, 0xFFFFFFFFu, AA64_FMT_BR_REG,
+     AA64_ASMFL_ALIAS | AA64_ASMFL_NORN, {0, 0}},
+    {"br", 0xD61F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}},
+    {"blr", 0xD63F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}},
+    {"ret", 0xD65F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}},
+
+    /* ----- PC-relative addressing ----- */
+    {"adr", 0x10000000u, 0x9F000000u, AA64_FMT_PCREL_ADR, 0, {0, 0}},
+    {"adrp", 0x90000000u, 0x9F000000u, AA64_FMT_PCREL_ADR, 0, {0, 0}},
+
+    /* ----- Add/Sub immediate ----- */
+    {"add", 0x11000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}},
+    {"adds", 0x31000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}},
+    {"sub", 0x51000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}},
+    {"subs", 0x71000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}},
+
+    /* ----- Load/store, unsigned 12-bit immediate (scaled) -----
+     * Mask: family bits 29:27 + 25:24 + size(31:30) + V(26) + opc(23:22). */
+    {"strb", 0x39000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
+    {"ldrb", 0x39400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
+    {"strh", 0x79000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
+    {"ldrh", 0x79400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
+    {"str", 0xB9000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* 32 */
+    {"ldr", 0xB9400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
+    {"str", 0xF9000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1,
+     {0, 0}}, /* 64 */
+    {"ldr", 0xF9400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1,
+     {0, 0}},
+    /* SIMD/FP scaled loads/stores (V=1). size 0..2 select B/H/S; size=3
+     * selects D; the 128-bit Q form uses size=00 with opc bit 1 set and
+     * is not yet emitted by codegen. */
+    {"str", 0x3D000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* B */
+    {"ldr", 0x3D400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
+    {"str", 0x7D000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* H */
+    {"ldr", 0x7D400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
+    {"str", 0xBD000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* S */
+    {"ldr", 0xBD400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
+    {"str", 0xFD000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1,
+     {0, 0}}, /* D */
+    {"ldr", 0xFD400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1,
+     {0, 0}},
+
+    /* ----- Load/store, unscaled signed 9-bit immediate (LDUR/STUR) -----
+     * V=0 first, V=1 next. Per-row mask narrows size+V+opc; family mask
+     * pins the high family bits + the SIMM9-vs-other-variant selector. */
+    {"sturb", 0x38000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
+    {"ldurb", 0x38400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
+    {"sturh", 0x78000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
+    {"ldurh", 0x78400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
+    {"stur", 0xB8000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* 32 */
+    {"ldur", 0xB8400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
+    {"stur", 0xF8000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1,
+     {0, 0}},
+    {"ldur", 0xF8400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1,
+     {0, 0}},
+    {"stur", 0x3C000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* B */
+    {"ldur", 0x3C400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
+    {"stur", 0x7C000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* H */
+    {"ldur", 0x7C400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
+    {"stur", 0xBC000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* S */
+    {"ldur", 0xBC400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
+    {"stur", 0xFC000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1,
+     {0, 0}}, /* D */
+    {"ldur", 0xFC400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1,
+     {0, 0}},
+
+    /* ----- Load/store pair, pre-indexed (opc=10 X / opc=01 D) ----- */
+    {"stp", 0xA9800000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, AA64_ASMFL_SF1,
+     {0, 0}},
+    {"ldp", 0xA9C00000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, AA64_ASMFL_SF1,
+     {0, 0}},
+    {"stp", 0x6D800000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, 0, {0, 0}}, /* D */
+    {"ldp", 0x6DC00000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, 0, {0, 0}},
+
+    /* ----- Load/store pair, signed-offset ----- */
+    {"stp", 0xA9000000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, AA64_ASMFL_SF1,
+     {0, 0}},
+    {"ldp", 0xA9400000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, AA64_ASMFL_SF1,
+     {0, 0}},
+    {"stp", 0x6D000000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, 0, {0, 0}}, /* D */
+    {"ldp", 0x6D400000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, 0, {0, 0}},
+
+    /* ----- Unconditional branch (immediate) ----- */
+    {"b", 0x14000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}},
+    {"bl", 0x94000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}},
+
+    /* ----- Conditional branch (immediate) ----- */
+    {"b.cond", 0x54000000u, 0xFF000010u, AA64_FMT_BR_COND, 0, {0, 0}},
+
+    /* ----- Compare-and-branch ----- */
+    {"cbz", 0x34000000u, 0x7F000000u, AA64_FMT_CB, 0, {0, 0}},
+    {"cbnz", 0x35000000u, 0x7F000000u, AA64_FMT_CB, 0, {0, 0}},
+
+    /* ----- Exception generation ----- */
+    {"svc", 0xD4000001u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}},
+    {"brk", 0xD4200000u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}},
+    {"hlt", 0xD4400000u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}},
+
+    /* ----- Hint ----- */
+    {"nop", 0xD503201Fu, 0xFFFFFFFFu, AA64_FMT_HINT, 0, {0, 0}},
+
+    /* ----- Memory barriers (DMB / DSB / ISB / CLREX) -----
+     * Mask covers everything but CRm at bits[11:8]. */
+    {"dmb", 0xD50330BFu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}},
+    {"dsb", 0xD503309Fu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}},
+    {"isb", 0xD50330DFu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}},
+    {"clrex", 0xD503305Fu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}},
+};
+
+const u32 aa64_insn_table_n =
+ (u32)(sizeof aa64_insn_table / sizeof aa64_insn_table[0]);
+
+/* Return the first table row whose masked bits equal its match value for
+ * `word`, or NULL when no row matches. Rows are tried in table order, so
+ * alias rows placed earlier take precedence over the rows they specialize. */
+const AA64InsnDesc* aa64_disasm_find(u32 word) {
+  const AA64InsnDesc* row = aa64_insn_table;
+  const AA64InsnDesc* const end = aa64_insn_table + aa64_insn_table_n;
+  for (; row != end; ++row) {
+    if ((word & row->mask) == row->match) {
+      return row;
+    }
+  }
+  return NULL;
+}
+
+/* =====================================================================
+ * Operand print — one helper per format.
+ *
+ * Format choices for immediates:
+ * - branch displacements, signed add/sub imm, signed ldur/stur ofs:
+ * signed decimal.
+ * - MOVZ/MOVK halfword, logical bitmask, exception generation #imm:
+ * 0x-prefixed hex.
+ *
+ * Register naming: ZR alias for x31 in places where the encoding treats
+ * Rd/Rn=31 as the zero register (logical/arith), SP where it treats 31
+ * as the stack pointer (add/sub imm, ldr/str-uimm Rn, ldp/stp Rn).
+ *
+ * vaddr is folded into PC-relative branch operands when nonzero. */
+
+/* Append the name of general-purpose register `r`.
+ *
+ *   sf           nonzero selects the 64-bit (x) view, zero the 32-bit (w) view
+ *   sp_means_sp  nonzero when encoding 31 names the stack pointer in this
+ *                operand position, zero when it names the zero register
+ *
+ * Register 31 prints as sp / wsp or xzr / wzr accordingly; the 32-bit view of
+ * the stack pointer is spelled "wsp". */
+static void emit_reg(StrBuf* sb, u32 r, int sf, int sp_means_sp) {
+  if (r == 31u) {
+    if (sp_means_sp) {
+      strbuf_puts(sb, sf ? "sp" : "wsp");
+    } else {
+      strbuf_puts(sb, sf ? "xzr" : "wzr");
+    }
+    return;
+  }
+  strbuf_putc(sb, sf ? 'x' : 'w');
+  strbuf_put_u64(sb, (u64)r);
+}
+
+/* Append a SIMD/FP register name: the one-character `prefix` (callers in
+ * this file pass 'b', 'h', 's', 'd' or 'q') followed by the register number
+ * in decimal. */
+static void emit_vreg(StrBuf* sb, u32 r, char prefix) {
+  strbuf_putc(sb, prefix);
+  strbuf_put_u64(sb, (u64)r);
+}
+
+/* Append the two-letter name of a 4-bit condition code; only the low four
+ * bits of `cond` are used. */
+static void emit_cond(StrBuf* sb, u32 cond) {
+  static const char* const kCondNames[16] = {
+      "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
+      "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"};
+  const u32 idx = cond & 0xfu;
+  strbuf_puts(sb, kCondNames[idx]);
+}
+
+/* Sign-extend the low `nbits` bits of v to a full-width i64. Bits above the
+ * field are discarded first; nbits == 0 yields 0 and nbits >= 64 leaves the
+ * value unmasked. */
+static i64 sext(u64 v, u32 nbits) {
+  u64 low_mask = ~0ull;
+  if (nbits < 64u) low_mask = (1ull << nbits) - 1ull;
+
+  u64 sign_bit = 0ull;
+  if (nbits != 0u) sign_bit = 1ull << (nbits - 1u);
+
+  u64 bits = v & low_mask;
+  if ((bits & sign_bit) != 0) bits |= ~low_mask;
+  return (i64)bits;
+}
+
+/* Print MOVN/MOVZ/MOVK operands: "Rd, <imm16 in hex>" and, when the hw field
+ * is nonzero, ", lsl <hw * 16>". Rd=31 is printed as the zero register. */
+static void print_movewide(StrBuf* sb, u32 w) {
+  const AA64MoveWide mw = aa64_movewide_unpack(w);
+  emit_reg(sb, mw.Rd, (int)mw.sf, /*sp_means_sp=*/0);
+  strbuf_puts(sb, ", ");
+  strbuf_put_hex_u64(sb, (u64)mw.imm16);
+  if (mw.hw == 0) return;
+  strbuf_puts(sb, ", lsl ");
+  strbuf_put_u64(sb, (u64)(mw.hw * 16u));
+}
+
+/* Print logical (shifted register) operands.
+ *
+ * Alias rows (MOV / MVN) print "Rd, Rm". Canonical rows print "Rd, Rn, Rm"
+ * followed by ", <shift> #<imm6>" when either the shift type or the amount
+ * is nonzero. Register 31 is always the zero register here. */
+static void print_logsr(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
+  static const char* const kShift[4] = {"lsl", "lsr", "asr", "ror"};
+  const AA64LogSR ls = aa64_logsr_unpack(w);
+  const int is_alias = (d->flags & AA64_ASMFL_ALIAS) != 0;
+  const int sf = (int)ls.sf;
+
+  emit_reg(sb, ls.Rd, sf, 0);
+  strbuf_puts(sb, ", ");
+  if (!is_alias) {
+    emit_reg(sb, ls.Rn, sf, 0);
+    strbuf_puts(sb, ", ");
+  }
+  emit_reg(sb, ls.Rm, sf, 0);
+
+  if (is_alias) return;
+  if (ls.imm6 == 0 && ls.shift == 0) return;
+  strbuf_puts(sb, ", ");
+  strbuf_puts(sb, kShift[ls.shift & 3u]);
+  strbuf_puts(sb, " #");
+  strbuf_put_u64(sb, (u64)ls.imm6);
+}
+
+/* Print add/sub (shifted register) operands.
+ *
+ *   CMP / CMN (alias, mnemonic starts with 'c'):  "Rn, Rm[, <shift> #<imm6>]"
+ *   NEG / NEGS (alias):                           "Rd, Rm"
+ *   canonical ADD/ADDS/SUB/SUBS:                  "Rd, Rn, Rm[, <shift> #<imm6>]"
+ *
+ * The CMP/CMN table rows do not pin the shift type or amount, so the shift
+ * suffix has to be printed for them as well; dropping it would render
+ * `cmp x0, x1, lsl #2` as `cmp x0, x1`. The NEG/NEGS rows pin both fields to
+ * zero, so the suffix never appears for them. Shift encoding 3 is reserved
+ * for this format and is printed as "rsv". */
+static void print_addsubsr(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
+  static const char* const sh[4] = {"lsl", "lsr", "asr", "rsv"};
+  AA64AddSubSR f = aa64_addsubsr_unpack(w);
+  const int sf = (int)f.sf;
+  const int is_alias = (d->flags & AA64_ASMFL_ALIAS) != 0;
+  const int is_cmp = is_alias && d->mnemonic[0] == 'c';
+  const int is_neg = is_alias && !is_cmp;
+
+  if (!is_cmp) { /* CMP/CMN hide the (zero-register) destination */
+    emit_reg(sb, f.Rd, sf, 0);
+    strbuf_puts(sb, ", ");
+  }
+  if (!is_neg) { /* NEG/NEGS hide the (zero-register) first source */
+    emit_reg(sb, f.Rn, sf, 0);
+    strbuf_puts(sb, ", ");
+  }
+  emit_reg(sb, f.Rm, sf, 0);
+
+  if (f.imm6 || f.shift) {
+    strbuf_puts(sb, ", ");
+    strbuf_puts(sb, sh[f.shift & 3u]);
+    strbuf_puts(sb, " #");
+    strbuf_put_u64(sb, (u64)f.imm6);
+  }
+}
+
+/* Print data-processing (3 source) operands: "Rd, Rn, Rm" for the MUL / MNEG
+ * alias rows (whose Ra is the zero register and is omitted), otherwise
+ * "Rd, Rn, Rm, Ra". */
+static void print_dp3(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
+  const AA64DP3 dp = aa64_dp3_unpack(w);
+  const int sf = (int)dp.sf;
+
+  emit_reg(sb, dp.Rd, sf, 0);
+  strbuf_puts(sb, ", ");
+  emit_reg(sb, dp.Rn, sf, 0);
+  strbuf_puts(sb, ", ");
+  emit_reg(sb, dp.Rm, sf, 0);
+
+  if ((d->flags & AA64_ASMFL_ALIAS) != 0) return;
+  strbuf_puts(sb, ", ");
+  emit_reg(sb, dp.Ra, sf, 0);
+}
+
+/* Print data-processing (2 source) operands: "Rd, Rn, Rm", all in the width
+ * selected by sf, with register 31 shown as the zero register. */
+static void print_dp2(StrBuf* sb, u32 w) {
+  const AA64DP2 dp = aa64_dp2_unpack(w);
+  const u32 regs[3] = {dp.Rd, dp.Rn, dp.Rm};
+  for (u32 i = 0; i < 3u; ++i) {
+    if (i != 0) strbuf_puts(sb, ", ");
+    emit_reg(sb, regs[i], (int)dp.sf, 0);
+  }
+}
+
+/* Print the target register of BR / BLR / RET as a 64-bit register. Rows
+ * flagged AA64_ASMFL_NORN (the bare "ret" alias with its implicit X30) print
+ * nothing. */
+static void print_brreg(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
+  if ((d->flags & AA64_ASMFL_NORN) != 0) {
+    return;
+  }
+  const AA64BrReg br = aa64_brreg_unpack(w);
+  emit_reg(sb, br.Rn, /*sf=*/1, 0);
+}
+
+/* Print ADR / ADRP operands: "Xd, <target>".
+ *
+ * The 21-bit immediate is immhi:immlo, sign-extended; for ADRP it counts
+ * 4 KiB pages and is scaled by 4096. With a nonzero vaddr the absolute
+ * target is printed in hex (for ADRP, relative to vaddr with its low 12 bits
+ * cleared); with vaddr == 0 the displacement is printed as
+ * "#<signed decimal>". */
+static void print_pcrel(StrBuf* sb, u32 w, u64 vaddr) {
+  AA64PCRelAdr f = aa64_pcrel_adr_unpack(w);
+  const int is_adrp = (f.op == AA64_ADR_OP_ADRP);
+  emit_reg(sb, f.Rd, /*sf=*/1, 0);
+  strbuf_puts(sb, ", ");
+  i64 imm = sext(((u64)f.immhi << 2) | (u64)f.immlo, 21);
+  /* Scale by multiplication: left-shifting a negative signed value is
+   * undefined behaviour in C, and the displacement is negative for backward
+   * references. */
+  if (is_adrp) imm *= 4096;
+  if (vaddr) {
+    u64 base = is_adrp ? (vaddr & ~0xfffull) : vaddr;
+    strbuf_put_hex_u64(sb, base + (u64)imm);
+  } else {
+    strbuf_puts(sb, "#");
+    strbuf_put_i64(sb, imm);
+  }
+}
+
+/* Print add/sub (immediate) operands: "Rd, Rn, #imm12[, lsl #12]".
+ *
+ * Rn=31 always names the stack pointer. Rd=31 names the stack pointer only
+ * for the non-flag-setting ADD/SUB; for ADDS/SUBS (bit 29 of the word set, as
+ * in the 0x31000000 / 0x71000000 table rows) it names the zero register. */
+static void print_addsubimm(StrBuf* sb, u32 w) {
+  AA64AddSubImm f = aa64_addsubimm_unpack(w);
+  const int sets_flags = (int)((w >> 29) & 1u);
+  emit_reg(sb, f.Rd, (int)f.sf, /*sp_means_sp=*/!sets_flags);
+  strbuf_puts(sb, ", ");
+  emit_reg(sb, f.Rn, (int)f.sf, /*sp_means_sp=*/1);
+  strbuf_puts(sb, ", #");
+  strbuf_put_u64(sb, (u64)f.imm12);
+  if (f.sh) strbuf_puts(sb, ", lsl #12");
+}
+
+/* Return the low two bits of a load/store size field; the caller uses the
+ * result both as the log2 scale of the offset and to pick the register
+ * prefix. The descriptor argument is currently unused. */
+static u32 ldst_log2_size(const AA64InsnDesc* d, u32 size_field) {
+  (void)d;
+  return size_field & 3u;
+}
+
+/* Print scaled unsigned-offset load/store operands: "Rt, [Rn{, #byte_off}]".
+ *
+ * With V=0 the transfer register is x (size 3) or w (any other size); with
+ * V=1 it is b/h/s/d chosen by size. The base register is always 64-bit with
+ * 31 meaning SP. The offset is imm12 scaled by the access size and is
+ * omitted when zero. */
+static void print_ldst_uimm(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
+  static const char kFpPrefix[4] = {'b', 'h', 's', 'd'};
+  const AA64LdStUimm ls = aa64_ldst_uimm_unpack(w);
+  const u32 log2sz = ldst_log2_size(d, ls.size);
+
+  if (ls.V != 0) {
+    emit_vreg(sb, ls.Rt, kFpPrefix[log2sz]);
+  } else {
+    emit_reg(sb, ls.Rt, /*sf=*/(int)(log2sz == 3u), 0);
+  }
+
+  strbuf_puts(sb, ", [");
+  emit_reg(sb, ls.Rn, /*sf=*/1, 1);
+  const u32 byte_off = ls.imm12 << log2sz;
+  if (byte_off != 0) {
+    strbuf_puts(sb, ", #");
+    strbuf_put_u64(sb, (u64)byte_off);
+  }
+  strbuf_putc(sb, ']');
+}
+
+/* Print unscaled-offset load/store (LDUR/STUR) operands:
+ * "Rt, [Rn{, #simm9}]". Register selection follows the same size/V rules as
+ * the scaled form; the 9-bit offset is sign-extended, printed in signed
+ * decimal, and omitted when zero. The descriptor is unused. */
+static void print_ldst_simm9(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
+  static const char kFpPrefix[4] = {'b', 'h', 's', 'd'};
+  const AA64LdStSimm9 ls = aa64_ldst_simm9_unpack(w);
+  const u32 log2sz = ls.size & 3u;
+  (void)d;
+
+  if (ls.V != 0) {
+    emit_vreg(sb, ls.Rt, kFpPrefix[log2sz]);
+  } else {
+    emit_reg(sb, ls.Rt, /*sf=*/(int)(log2sz == 3u), 0);
+  }
+
+  strbuf_puts(sb, ", [");
+  emit_reg(sb, ls.Rn, /*sf=*/1, 1);
+  const i64 disp = sext((u64)ls.imm9, 9);
+  if (disp != 0) {
+    strbuf_puts(sb, ", #");
+    strbuf_put_i64(sb, disp);
+  }
+  strbuf_putc(sb, ']');
+}
+
+/* Print load/store-pair operands: "Rt, Rt2, [Rn{, #byte_off}]" plus a
+ * trailing '!' for the pre-indexed form.
+ *
+ * Register class and offset scale come from V and opc:
+ *   V=1: opc=0 -> s (x4), opc=1 -> d (x8), any other opc -> q (x16)
+ *   V=0: opc=2 -> x (x8), any other opc -> w (x4)
+ * The 7-bit offset is sign-extended, multiplied by the scale, and omitted
+ * when zero. The base register is 64-bit with 31 meaning SP. */
+static void print_ldstp_common(StrBuf* sb, AA64LdStPPre f, int pre) {
+  const int is_fp = (f.V == 1);
+  char prefix = 's';
+  int sf = 1;
+  i64 scale;
+
+  if (!is_fp) {
+    sf = (f.opc == 2);
+    scale = sf ? 8 : 4;
+  } else {
+    switch (f.opc) {
+      case 0:
+        prefix = 's';
+        scale = 4;
+        break;
+      case 1:
+        prefix = 'd';
+        scale = 8;
+        break;
+      default:
+        prefix = 'q';
+        scale = 16;
+        break;
+    }
+  }
+
+  if (!is_fp) {
+    emit_reg(sb, f.Rt, sf, 0);
+    strbuf_puts(sb, ", ");
+    emit_reg(sb, f.Rt2, sf, 0);
+  } else {
+    emit_vreg(sb, f.Rt, prefix);
+    strbuf_puts(sb, ", ");
+    emit_vreg(sb, f.Rt2, prefix);
+  }
+
+  strbuf_puts(sb, ", [");
+  emit_reg(sb, f.Rn, /*sf=*/1, 1);
+  const i64 disp = sext((u64)f.imm7, 7) * scale;
+  if (disp != 0) {
+    strbuf_puts(sb, ", #");
+    strbuf_put_i64(sb, disp);
+  }
+  strbuf_putc(sb, ']');
+  if (pre) strbuf_putc(sb, '!');
+}
+
+/* Pre-indexed pair: same operand layout as the signed-offset form, with the
+ * writeback marker '!' appended. */
+static void print_ldstp_pre(StrBuf* sb, u32 w) {
+  print_ldstp_common(sb, aa64_ldstp_pre_unpack(w), /*pre=*/1);
+}
+/* Signed-offset pair: no writeback marker. */
+static void print_ldstp_soff(StrBuf* sb, u32 w) {
+  print_ldstp_common(sb, aa64_ldstp_soff_unpack(w), /*pre=*/0);
+}
+
+/* Print the target of B / BL. The 26-bit immediate is sign-extended and
+ * multiplied by 4. With a nonzero vaddr the absolute target (vaddr +
+ * displacement) is printed in hex; otherwise the displacement is printed as
+ * "#<signed decimal>". */
+static void print_br_imm(StrBuf* sb, u32 w, u64 vaddr) {
+  const AA64BrImm br = aa64_brimm_unpack(w);
+  const i64 disp = sext((u64)br.imm26, 26) * 4;
+  if (vaddr == 0) {
+    strbuf_puts(sb, "#");
+    strbuf_put_i64(sb, disp);
+    return;
+  }
+  strbuf_put_hex_u64(sb, vaddr + (u64)disp);
+}
+
+/* Print the operands of a conditional branch: the condition name, a space,
+ * then the target (absolute hex when vaddr is nonzero, otherwise
+ * "#<signed decimal>" displacement).
+ *
+ * The table row carries the placeholder mnemonic "b.cond", so the condition
+ * is emitted here as part of the operand text; composing the canonical
+ * b.<cc> spelling is left to the code that prints the mnemonic. The
+ * descriptor argument is unused. */
+static void print_br_cond(StrBuf* sb, u32 w, u64 vaddr,
+                          const AA64InsnDesc* d) {
+  const AA64BrCond bc = aa64_brcond_unpack(w);
+  (void)d;
+  emit_cond(sb, bc.cond);
+  strbuf_putc(sb, ' ');
+  const i64 disp = sext((u64)bc.imm19, 19) * 4;
+  if (vaddr == 0) {
+    strbuf_puts(sb, "#");
+    strbuf_put_i64(sb, disp);
+    return;
+  }
+  strbuf_put_hex_u64(sb, vaddr + (u64)disp);
+}
+
+/* Print CBZ / CBNZ operands: "Rt, <target>". Rt uses the width selected by
+ * sf (31 is the zero register); the 19-bit displacement is sign-extended and
+ * multiplied by 4, then shown as an absolute hex address when vaddr is
+ * nonzero or as "#<signed decimal>" otherwise. */
+static void print_cb(StrBuf* sb, u32 w, u64 vaddr) {
+  const AA64CB cb = aa64_cb_unpack(w);
+  emit_reg(sb, cb.Rt, (int)cb.sf, 0);
+  strbuf_puts(sb, ", ");
+  const i64 disp = sext((u64)cb.imm19, 19) * 4;
+  if (vaddr == 0) {
+    strbuf_puts(sb, "#");
+    strbuf_put_i64(sb, disp);
+    return;
+  }
+  strbuf_put_hex_u64(sb, vaddr + (u64)disp);
+}
+
+/* Print the operand of SVC / BRK / HLT: '#' followed by the 16-bit immediate
+ * in hex. */
+static void print_except(StrBuf* sb, u32 w) {
+  AA64Except f = aa64_except_unpack(w);
+  strbuf_puts(sb, "#");
+  strbuf_put_hex_u64(sb, (u64)f.imm16);
+}
+
+/* Print the option operand of DMB / DSB / ISB / CLREX.
+ *
+ * ISB and CLREX with CRm == SY print nothing. In every other case a single
+ * space is written, followed by the symbolic option name when one applies or
+ * "#<CRm>" in decimal otherwise. The bare "ld" / "st" names are used only
+ * when the descriptor's mnemonic starts with "dm" (i.e. DMB); for the other
+ * mnemonics those CRm values fall back to the numeric form.
+ *
+ * NOTE(review): this is the only operand printer that writes a leading
+ * space; confirm the caller expects that. */
+static void print_barrier(StrBuf* sb, u32 w, const AA64InsnDesc* desc) {
+  const AA64Barrier bar = aa64_barrier_unpack(w);
+  const int operand_optional = (bar.op2 == AA64_BARRIER_OP2_ISB) ||
+                               (bar.op2 == AA64_BARRIER_OP2_CLREX);
+  if (operand_optional && bar.CRm == AA64_BARRIER_OPT_SY) return;
+
+  const int is_dmb = desc && desc->mnemonic && desc->mnemonic[0] == 'd' &&
+                     desc->mnemonic[1] == 'm';
+  const char* name = NULL;
+  switch (bar.CRm) {
+    case AA64_BARRIER_OPT_OSHLD: name = "oshld"; break;
+    case AA64_BARRIER_OPT_OSHST: name = "oshst"; break;
+    case AA64_BARRIER_OPT_OSH: name = "osh"; break;
+    case AA64_BARRIER_OPT_NSHLD: name = "nshld"; break;
+    case AA64_BARRIER_OPT_NSHST: name = "nshst"; break;
+    case AA64_BARRIER_OPT_NSH: name = "nsh"; break;
+    case AA64_BARRIER_OPT_ISHLD: name = "ishld"; break;
+    case AA64_BARRIER_OPT_ISHST: name = "ishst"; break;
+    case AA64_BARRIER_OPT_ISH: name = "ish"; break;
+    case AA64_BARRIER_OPT_LD:
+      if (is_dmb) name = "ld";
+      break;
+    case AA64_BARRIER_OPT_ST:
+      if (is_dmb) name = "st";
+      break;
+    case AA64_BARRIER_OPT_SY: name = "sy"; break;
+    default: break;
+  }
+
+  strbuf_putc(sb, ' ');
+  if (name == NULL) {
+    strbuf_puts(sb, "#");
+    strbuf_put_u64(sb, (u64)bar.CRm);
+    return;
+  }
+  strbuf_puts(sb, name);
+}
+
+/* Append the operand text for `word` to `sb`, dispatching on the format of
+ * the descriptor that matched it.
+ *
+ *   desc   table row for `word`; it is dereferenced unconditionally, so it
+ *          must not be NULL
+ *   vaddr  address of the instruction; when nonzero, PC-relative operands
+ *          are printed as absolute addresses, otherwise as displacements
+ *
+ * Only operands are written here, not the mnemonic. A format value not
+ * listed below prints nothing. */
+void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
+                         u64 vaddr) {
+  switch ((AA64Format)desc->fmt) {
+    case AA64_FMT_MOVEWIDE: print_movewide(sb, word); break;
+    case AA64_FMT_LOG_SR: print_logsr(sb, word, desc); break;
+    case AA64_FMT_ADDSUB_SR: print_addsubsr(sb, word, desc); break;
+    case AA64_FMT_DP3: print_dp3(sb, word, desc); break;
+    case AA64_FMT_DP2: print_dp2(sb, word); break;
+    case AA64_FMT_BR_REG: print_brreg(sb, word, desc); break;
+    case AA64_FMT_PCREL_ADR: print_pcrel(sb, word, vaddr); break;
+    case AA64_FMT_ADDSUB_IMM: print_addsubimm(sb, word); break;
+    case AA64_FMT_LDST_UIMM: print_ldst_uimm(sb, word, desc); break;
+    case AA64_FMT_LDSTP_PRE: print_ldstp_pre(sb, word); break;
+    case AA64_FMT_LDSTP_SOFF: print_ldstp_soff(sb, word); break;
+    case AA64_FMT_LDST_SIMM9: print_ldst_simm9(sb, word, desc); break;
+    case AA64_FMT_BR_IMM: print_br_imm(sb, word, vaddr); break;
+    case AA64_FMT_BR_COND: print_br_cond(sb, word, vaddr, desc); break;
+    case AA64_FMT_CB: print_cb(sb, word, vaddr); break;
+    case AA64_FMT_EXCEPT: print_except(sb, word); break;
+    case AA64_FMT_HINT: break; /* no operands for NOP */
+    case AA64_FMT_BARRIER: print_barrier(sb, word, desc); break;
+  }
+}
+
+/* =====================================================================
+ * Operand parse — phase-3 wires this up to the asm token stream. Phase
+ * 2 ships the signature so the assembler bring-up commit doesn't need to
+ * touch the descriptor table; the body returns 0 for every format until
+ * the per-format grammar is implemented. */
+
+/* Placeholder operand parser: ignores all three arguments and returns 0 for
+ * every format. The signature exists so callers can be wired up before the
+ * per-format grammar is implemented. */
+int aa64_parse_operands(struct AA64AsmTok* tok, const AA64InsnDesc* desc,
+                        void* fields_out) {
+  (void)tok;
+  (void)desc;
+  (void)fields_out;
+  return 0;
+}
diff --git a/src/arch/aa64_isa.h b/src/arch/aa64/isa.h
diff --git a/src/arch/aa64/link.c b/src/arch/aa64/link.c
@@ -0,0 +1,208 @@
+/* AArch64 link-time descriptor.
+ *
+ * Implements the LinkArchDesc contract from link_arch.h for the
+ * aarch64 ELF psABI: PLT0 + per-import PLT entries (lazy-resolve
+ * trampolines emitted in canonical form even under DF_1_NOW), and the
+ * 12-byte IPLT stub used by ifunc resolvers. All instruction bytes
+ * come from the encoders in arch/aa64/isa.h — no raw hex literals
+ * here.
+ *
+ * The byte layout matches the previous inline encodings in
+ * link_dyn.c (PLT) and link_layout.c (IPLT) so that switching the
+ * linker to descriptor dispatch is a no-op on the output image. */
+
+#include "arch/aa64/isa.h"
+#include "core/bytes.h"
+#include "core/core.h"
+#include "link/link_arch.h"
+#include "obj/elf.h"
+#include "obj/macho.h"
+#include "obj/obj.h"
+
+/* Fixed register assignments mandated by the AArch64 PLT ABI. */
+#define AA64_PLT_SCRATCH_X16 16u /* PLT/IPLT scratch (slot address) */
+#define AA64_PLT_SCRATCH_X17 17u /* PLT scratch (loaded function ptr) */
+
+/* PLT geometry. Documented in link_arch.h; redeclared here as the
+ * descriptor table needs them at file scope. */
+#define AA64_PLT0_SIZE 32u
+#define AA64_PLT_ENTRY_SIZE 16u
+#define AA64_IPLT_STUB_SIZE 12u
+
+/* PLT0 references .got.plt[2] (the lazy-resolve hook); the per-import
+ * entries start at .got.plt[3]. */
+#define AA64_GOTPLT_RESOLVER_INDEX 2u
+
+/* Page mask for ADRP: ADRP encodes (page(target) - page(PC)) >> 12,
+ * where page(x) clears the low 12 bits. */
+#define AA64_PAGE_MASK ((u64)0xfffu)
+
+/* Compute the (immlo, immhi) ADRP immediate halves for the page-
+ * relative displacement from `pc` to `target`. Both addresses are
+ * post-shift final image vaddrs; ADRP discards the low 12 bits of
+ * each before subtracting, so the result is invariant under any
+ * segment-base shift that moves both endpoints by the same delta. */
+static inline void aa64_adrp_imm_halves(u64 pc, u64 target, u32* immlo,
+                                        u32* immhi) {
+  i64 target_page = (i64)(target & ~AA64_PAGE_MASK);
+  i64 page_delta = (target_page - (i64)(pc & ~AA64_PAGE_MASK)) >> 12;
+  *immlo = (u32)(page_delta & 0x3);             /* low 2 bits of the delta */
+  *immhi = (u32)((page_delta >> 2) & 0x7ffff);  /* following 19 bits */
+}
+
+/* Emit one ADRP+LDR+ADD+BR sequence that materializes `slot_vaddr`
+ * (a .got.plt entry) into x16, loads the resolved function pointer
+ * into x17, and tail-calls it. Used by both PLT0 (after its STP) and
+ * each per-import entry — the only thing that varies is `pc`, which
+ * starts at the ADRP itself. */
+static void aa64_emit_adrp_load_br(u8* dst, u64 pc, u64 slot_vaddr) {
+  const u32 x16 = AA64_PLT_SCRATCH_X16;
+  const u32 x17 = AA64_PLT_SCRATCH_X17;
+  u32 lo, hi;
+  aa64_adrp_imm_halves(pc, slot_vaddr, &lo, &hi);
+  /* page_off is the slot's byte offset inside its 4 KiB page. The ADD takes
+   * it as-is; the LDR takes it in 8-byte units, so it is shifted down by 3. */
+  u32 page_off = (u32)(slot_vaddr & AA64_PAGE_MASK);
+  u32 scaled = (page_off >> 3) & 0xfffu;
+
+  wr_u32_le(dst + 0, aa64_adrp(x16, lo, hi));
+  wr_u32_le(dst + 4, aa64_ldr64_uimm12(x17, x16, scaled));
+  wr_u32_le(dst + 8, aa64_add_imm(/*sf=*/1, x16, x16, page_off, /*sh=*/0));
+  wr_u32_le(dst + 12, aa64_br(x17));
+}
+
+static void aa64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
+  /* Layout of the 32-byte PLT0 (eight instructions):
+   *   +0   stp  x16, x30, [sp, #-16]!
+   *   +4   adrp x16, page(.got.plt[2])
+   *   +8   ldr  x17, [x16, #lo12(.got.plt[2])]
+   *   +12  add  x16, x16, #lo12(.got.plt[2])
+   *   +16  br   x17
+   *   +20  nop ; nop ; nop
+   *
+   * With DF_1_NOW the loader fills every .got.plt slot from .rela.plt up
+   * front, so this trampoline is never reached; it is emitted anyway so
+   * that disassemblers and unwinders find the psABI-specified shape. */
+  u8* p = dst;
+
+  /* Pre-indexed pair store; imm7 is in units of 8 bytes, hence -16/8 = -2. */
+  wr_u32_le(p, aa64_stp64_pre(AA64_PLT_SCRATCH_X16, AA64_LR, AA64_SP,
+                              /*imm7_scaled=*/-2));
+  p += 4;
+
+  /* The ADRP is the second instruction, so its PC is plt0 + 4. */
+  aa64_emit_adrp_load_br(p, plt0_vaddr + 4u,
+                         gotplt_vaddr + 8u * AA64_GOTPLT_RESOLVER_INDEX);
+  p += 16;
+
+  for (int k = 0; k < 3; ++k) wr_u32_le(p + 4 * k, aa64_nop());
+}
+
+static void aa64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
+ /* Per-import 16-byte entry: ADRP+LDR+ADD+BR, with the ADRP as the entry's
+ * first instruction (so its PC is entry_vaddr). No leading STP here — the
+ * resolved function returns to the original caller, not into PLT0. */
+ aa64_emit_adrp_load_br(dst, entry_vaddr, slot_vaddr);
+}
+
+static u32 aa64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
+                               LinkArchIPltReloc out[2]) {
+  /* 12-byte IPLT stub, all through x16:
+   *   adrp x16, %page(slot) ; ldr x16, [x16, :lo12:slot] ; br x16
+   *
+   * Neither vaddr is consumed here. The two address-bearing instructions
+   * are written with zero immediates and described to the caller through
+   * `out`: ADR_PREL_PG_HI21 on the ADRP, LDST64_ABS_LO12_NC on the LDR.
+   * The linker applies both against the slot's synthetic local symbol
+   * after final vaddr assignment, the first point at which the
+   * page-relative displacement is known. Returns the entry count (2). */
+  const u32 x16 = AA64_PLT_SCRATCH_X16;
+  LinkArchIPltReloc* page_rel = &out[0];
+  LinkArchIPltReloc* lo12_rel = &out[1];
+  (void)stub_vaddr;
+  (void)slot_vaddr;
+
+  wr_u32_le(dst + 0, aa64_adrp(x16, /*immlo=*/0, /*immhi=*/0));
+  wr_u32_le(dst + 4, aa64_ldr64_uimm12(x16, x16, /*imm12_scaled=*/0));
+  wr_u32_le(dst + 8, aa64_br(x16));
+
+  page_rel->offset_in_stub = 0;
+  page_rel->width = 4;
+  page_rel->kind = R_AARCH64_ADR_PREL_PG_HI21;
+  lo12_rel->offset_in_stub = 4;
+  lo12_rel->width = 4;
+  lo12_rel->kind = R_AARCH64_LDST64_ABS_LO12_NC;
+  return 2;
+}
+
+static void aa64_emit_macho_stub(u8* out, u64 stub_vaddr, u64 got_slot_vaddr) {
+  /* Three-instruction stub that jumps through the slot, all via x16:
+   *   adrp x16, page(slot) ; ldr x16, [x16, #lo12(slot)] ; br x16
+   * The ADRP immediate is the page delta from the stub to the slot; the
+   * LDR immediate is the slot's in-page offset in 8-byte units. */
+  const u32 x16 = AA64_PLT_SCRATCH_X16;
+  u32 lo, hi;
+  aa64_adrp_imm_halves(stub_vaddr, got_slot_vaddr, &lo, &hi);
+  u32 scaled = ((u32)(got_slot_vaddr & AA64_PAGE_MASK) >> 3) & 0xfffu;
+
+  wr_u32_le(out + 0, aa64_adrp(x16, lo, hi));
+  wr_u32_le(out + 4, aa64_ldr64_uimm12(x16, x16, scaled));
+  wr_u32_le(out + 8, aa64_br(x16));
+}
+
+static int aa64_is_branch_reloc(RelocKind kind) {
+  return !(kind != R_AARCH64_CALL26 && kind != R_AARCH64_JUMP26);
+}
+
+static int aa64_is_got_load_reloc(RelocKind kind) {
+  return !(kind != R_AARCH64_ADR_GOT_PAGE && kind != R_AARCH64_LD64_GOT_LO12_NC);
+}
+
+static int aa64_is_tlvp_reloc(RelocKind kind) {
+  return !(kind != R_AARCH64_TLVP_LOAD_PAGE21 &&
+           kind != R_AARCH64_TLVP_LOAD_PAGEOFF12);
+}
+
+static int aa64_is_direct_page_reloc(RelocKind kind) {
+  /* Returns 1 for relocations that address a symbol directly through a
+   * page / page-offset pair: the two ADR_PREL_PG_HI21 variants and each
+   * ABS_LO12_NC companion (ADD plus the 8/16/32/64/128-bit load/store
+   * forms). Returns 0 for everything else. */
+  if (kind == R_AARCH64_ADR_PREL_PG_HI21) return 1;
+  if (kind == R_AARCH64_ADR_PREL_PG_HI21_NC) return 1;
+  if (kind == R_AARCH64_ADD_ABS_LO12_NC) return 1;
+  if (kind == R_AARCH64_LDST8_ABS_LO12_NC) return 1;
+  if (kind == R_AARCH64_LDST16_ABS_LO12_NC) return 1;
+  if (kind == R_AARCH64_LDST32_ABS_LO12_NC) return 1;
+  if (kind == R_AARCH64_LDST64_ABS_LO12_NC) return 1;
+  if (kind == R_AARCH64_LDST128_ABS_LO12_NC) return 1;
+  return 0;
+}
+
+const LinkArchDesc link_arch_aa64 = {
+ .e_machine = EM_AARCH64,
+ .default_musl_interp = "/lib/ld-musl-aarch64.so.1",
+ /* ELF dynamic relocation type numbers. */
+ .elf_r_relative = ELF_R_AARCH64_RELATIVE,
+ .elf_r_glob_dat = ELF_R_AARCH64_GLOB_DAT,
+ .elf_r_jump_slot = ELF_R_AARCH64_JUMP_SLOT,
+ /* Mach-O CPU identification. */
+ .macho_cputype = CPU_TYPE_ARM64,
+ .macho_cpusubtype = CPU_SUBTYPE_ARM64_ALL,
+ /* Stub geometry in bytes: 32 / 16 / 12 per the AA64_*_SIZE defines in this file. */
+ .plt0_size = AA64_PLT0_SIZE,
+ .plt_entry_size = AA64_PLT_ENTRY_SIZE,
+ .iplt_stub_size = AA64_IPLT_STUB_SIZE,
+ /* Stub emitters. The Mach-O stub is also three instructions, so it reuses the 12-byte IPLT size constant. */
+ .emit_plt0 = aa64_emit_plt0,
+ .emit_plt_entry = aa64_emit_plt_entry,
+ .emit_iplt_stub = aa64_emit_iplt_stub,
+ .macho_stub_size = AA64_IPLT_STUB_SIZE,
+ .emit_macho_stub = aa64_emit_macho_stub,
+ /* Relocation classifiers. needs_jit_call_stub shares the branch predicate (CALL26 / JUMP26). */
+ .is_branch_reloc = aa64_is_branch_reloc,
+ .is_got_load_reloc = aa64_is_got_load_reloc,
+ .is_tlvp_reloc = aa64_is_tlvp_reloc,
+ .is_direct_page_reloc = aa64_is_direct_page_reloc,
+ .needs_jit_call_stub = aa64_is_branch_reloc,
+};
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -0,0 +1,1925 @@
+/* aarch64/ops.c — data movement, arithmetic, calls, varargs, atomics,
+ * intrinsics, asm_block, set_loc, finalize/destroy, vtable constructor. */
+
+#include "arch/aa64/internal.h"
+
+/* ============================================================
+ * Data movement
+ * ============================================================ */
+
+static void aa_load_imm(CGTarget* t, Operand dst, i64 imm) {
+  /* dst's type picks the 64-bit (sf=1) or 32-bit (sf=0) immediate load. */
+  aa64_emit_load_imm(t->mc, type_is_64(dst.type) ? 1u : 0u, reg_num(dst), imm);
+}
+
+static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) {
+  AAImpl* a = impl_of(t);
+  if (dst.cls != RC_FP) {
+    compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1");
+  }
+
+  /* The literal is read back with a size-scaled LDR whose :lo12: relocation
+   * (LDST32/LDST64_ABS_LO12_NC) is unchecked, so it must be aligned to at
+   * least the access size (8 for an 8-byte literal, else 4) even when
+   * cb.align is unset or smaller; otherwise the low offset bits are lost. */
+  u32 access = (cb.size == 8) ? 8u : 4u;
+  u32 align = cb.align ? cb.align : 4;
+  if (align < access) align = access;
+
+  /* Emit the bytes into .rodata, remembering the section to return to. */
+  Sym ro_name = pool_intern_cstr(t->c->global, ".rodata");
+  ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, 1u);
+  u32 cur_section = t->mc->section_id;
+  t->mc->set_section(t->mc, ro);
+  u32 ro_off = obj_align_to(t->obj, ro, align);
+  t->mc->emit_bytes(t->mc, cb.bytes, cb.size);
+
+  /* Label it ".LCFP<n>": digits come out low-order first, then reversed. */
+  char namebuf[64];
+  static u32 lit_seq = 0;
+  int len = 0;
+  for (const char* p = ".LCFP"; *p; ++p) namebuf[len++] = *p;
+  char digits[16];
+  int nd = 0;
+  u32 v = lit_seq++;
+  do {
+    digits[nd++] = '0' + (char)(v % 10);
+    v /= 10;
+  } while (v);
+  while (nd > 0) namebuf[len++] = digits[--nd];
+  namebuf[len] = 0;
+  Sym sname = pool_intern_cstr(t->c->global, namebuf);
+  ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro, (u64)ro_off,
+                            (u64)cb.size);
+  t->mc->set_section(t->mc, cur_section);
+
+  u32 adrp_pos = t->mc->pos(t->mc);
+  aa64_emit32(t->mc, aa64_adrp_base(AA_TMP0));
+  t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21,
+                       sym, 0, 0, 0);
+  u32 ldr_pos = t->mc->pos(t->mc);
+  u32 sidx = (cb.size == 8) ? 3u : 2u;
+  aa64_emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0));
+  RelocKind lo12 = (cb.size == 8) ? R_AARCH64_LDST64_ABS_LO12_NC
+                                  : R_AARCH64_LDST32_ABS_LO12_NC;
+  t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0);
+}
+
+static void aa_copy(CGTarget* t, Operand dst, Operand src) {
+  u32 rd = reg_num(dst), rs = reg_num(src);
+  /* If either side is an FP register, emit FMOV (double vs single from
+   * dst's type); otherwise an integer MOV whose width follows dst's type. */
+  u32 word = (dst.cls != RC_FP && src.cls != RC_FP)
+                 ? aa64_mov_reg(type_is_64(dst.type) ? 1u : 0u, rd, rs)
+                 : aa64_fmov_reg(type_is_fp_double(dst.type) ? 1u : 0u, rd, rs);
+  aa64_emit32(t->mc, word);
+}
+
+/* ============================================================
+ * Load / store
+ * ============================================================ */
+
+static RelocKind ldst_lo12_reloc_for(u32 nbytes) {
+  /* Map an access width in bytes to its load/store :lo12: relocation.
+   * Widths 1, 2 and 4 get their own kind; every other width, 8 included,
+   * maps to the 64-bit variant. */
+  if (nbytes == 1) return R_AARCH64_LDST8_ABS_LO12_NC;
+  if (nbytes == 2) return R_AARCH64_LDST16_ABS_LO12_NC;
+  if (nbytes == 4) return R_AARCH64_LDST32_ABS_LO12_NC;
+  return R_AARCH64_LDST64_ABS_LO12_NC;
+}
+
+static int use_got_for_sym(CGTarget* t, ObjSymId sym) {
+ return obj_symbol_extern_via_got(t->c, t->obj, sym); /* shorthand: forwards the target's compiler and object */
+}
+
+static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg) {
+  /* Reduce `addr` to a base register (returned) plus a byte offset
+   * (*out_off). Locals are x29-relative, indirects use their own base;
+   * either keeps its offset only when it lies in [-256, 255], otherwise
+   * the full address is built in tmp_reg with offset 0. Globals are always
+   * built in tmp_reg. Any other operand kind panics. */
+  AAImpl* a = impl_of(t);
+  u32 base = 29;
+  i32 off = 0;
+  if (addr.kind == OPK_GLOBAL) {
+    emit_global_addr(t, tmp_reg, addr.v.global.sym, addr.v.global.addend);
+    *out_off = 0;
+    return tmp_reg;
+  }
+  if (addr.kind == OPK_LOCAL) {
+    AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_base: bad slot");
+    off = -(i32)s->off;
+  } else if (addr.kind == OPK_INDIRECT) {
+    base = addr.v.ind.base & 0x1f;
+    off = addr.v.ind.ofs;
+  } else {
+    compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d",
+                   (int)addr.kind);
+  }
+  if (off < -256 || off > 255) {
+    aa64_emit_addr_adjust(t->mc, tmp_reg, base, off);
+    *out_off = 0;
+    return tmp_reg;
+  }
+  *out_off = off;
+  return base;
+}
+
+void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) {
+  /* Load ma.size bytes (or sizeof addr.type when 0) from addr into dst. */
+  u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
+  u32 sidx = size_idx_for_bytes(sz);
+  u32 rt = reg_num(dst);
+  int fp = (dst.cls == RC_FP);
+
+  if (addr.kind == OPK_GLOBAL) {
+    MCEmitter* mc = t->mc;
+    u32 sec = mc->section_id;
+    ObjSymId sym = addr.v.global.sym;
+    i64 add = addr.v.global.addend;
+    if (use_got_for_sym(t, sym)) {
+      /* The GOT gives the bare symbol address, so the addend rides in the
+       * LDUR offset. That field spans [-256, 255] (see addr_base); larger
+       * addends are added to the base instead of being truncated. */
+      i32 off = (i32)add;
+      aa64_emit_got_load_addr(t, AA_TMP0, sym);
+      if (add < -256 || add > 255) {
+        aa64_emit_addr_adjust(mc, AA_TMP0, AA_TMP0, off);
+        off = 0;
+      }
+      aa64_emit32(mc, fp ? aa64_ldur_fp(sidx, rt, AA_TMP0, off)
+                         : aa64_ldur(sidx, rt, AA_TMP0, off));
+      return;
+    }
+    u32 adrp_pos = mc->pos(mc);
+    aa64_emit32(mc, aa64_adrp_base(AA_TMP0));
+    mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add,
+                      0, 0);
+    u32 ld_pos = mc->pos(mc);
+    aa64_emit32(mc, fp ? aa64_ldr_fp_uimm(sidx, rt, AA_TMP0, 0)
+                       : aa64_ldr_uimm(sidx, rt, AA_TMP0, 0));
+    mc->emit_reloc_at(mc, sec, ld_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0);
+    return;
+  }
+  i32 off;
+  u32 base = addr_base(t, addr, &off, AA_TMP0);
+  aa64_emit32(t->mc, fp ? aa64_ldur_fp(sidx, rt, base, off)
+                        : aa64_ldur(sidx, rt, base, off));
+}
+
+void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) {
+  /* Store ma.size bytes (or sizeof addr.type when 0) of src to addr. An
+   * immediate src is materialised in AA_TMP0, so the address then uses
+   * AA_TMP1 as its scratch instead. */
+  u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
+  u32 sidx = size_idx_for_bytes(sz);
+  u32 addr_tmp = (src.kind == OPK_IMM) ? AA_TMP1 : AA_TMP0;
+
+  if (addr.kind == OPK_GLOBAL) {
+    MCEmitter* mc = t->mc;
+    u32 sec = mc->section_id;
+    ObjSymId sym = addr.v.global.sym;
+    i64 add = addr.v.global.addend;
+
+    u32 src_reg;
+    u32 src_is_fp = 0;
+    if (src.kind == OPK_IMM) {
+      u32 sf = (sz == 8) ? 1u : 0u;
+      aa64_emit_load_imm(mc, sf, AA_TMP0, src.v.imm);
+      src_reg = AA_TMP0;
+    } else {
+      src_reg = reg_num(src);
+      src_is_fp = (src.cls == RC_FP) ? 1u : 0u;
+    }
+    u32 base = addr_tmp;
+    if (use_got_for_sym(t, sym)) {
+      /* The GOT gives the bare symbol address, so the addend rides in the
+       * STUR offset. That field spans [-256, 255] (see addr_base); larger
+       * addends are added to the base instead of being truncated. */
+      i32 off = (i32)add;
+      aa64_emit_got_load_addr(t, base, sym);
+      if (add < -256 || add > 255) {
+        aa64_emit_addr_adjust(mc, base, base, off);
+        off = 0;
+      }
+      aa64_emit32(mc, src_is_fp ? aa64_stur_fp(sidx, src_reg, base, off)
+                                : aa64_stur(sidx, src_reg, base, off));
+      return;
+    }
+    u32 adrp_pos = mc->pos(mc);
+    aa64_emit32(mc, aa64_adrp_base(base));
+    mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add,
+                      0, 0);
+    u32 st_pos = mc->pos(mc);
+    aa64_emit32(mc, src_is_fp ? aa64_str_fp_uimm(sidx, src_reg, base, 0)
+                              : aa64_str_uimm(sidx, src_reg, base, 0));
+    mc->emit_reloc_at(mc, sec, st_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0);
+    return;
+  }
+
+  i32 off;
+  u32 base = addr_base(t, addr, &off, addr_tmp);
+  if (src.kind == OPK_IMM) {
+    u32 sf = (sz == 8) ? 1u : 0u;
+    aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm);
+    aa64_emit32(t->mc, aa64_stur(sidx, AA_TMP0, base, off));
+    return;
+  }
+  u32 rs = reg_num(src);
+  aa64_emit32(t->mc, (src.cls == RC_FP) ? aa64_stur_fp(sidx, rs, base, off)
+                                        : aa64_stur(sidx, rs, base, off));
+}
+
+static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) {
+  /* Materialise the address of lvalue `lv` (local, indirect, global) in dst. */
+  AAImpl* a = impl_of(t);
+  u32 rd = reg_num(dst);
+  if (lv.kind == OPK_LOCAL) {
+    AASlot* s = aa64_slot_get(a, lv.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot");
+    /* SUB (immediate) holds only 12 bits; slots more than 4095 bytes below
+     * x29 go through the general address adjust, as addr_base already does. */
+    if (s->off <= 0xfff) aa64_emit32(t->mc, aa64_sub_imm(1, rd, 29, s->off, 0));
+    else aa64_emit_addr_adjust(t->mc, rd, 29, -(i32)s->off);
+  } else if (lv.kind == OPK_INDIRECT) {
+    i32 ofs = lv.v.ind.ofs;
+    u32 base = lv.v.ind.base & 0x1f;
+    if (ofs == 0) {
+      aa64_emit32(t->mc, aa64_mov_reg(1, rd, base));
+    } else if (ofs > 0 && ofs <= 0xfff) {
+      aa64_emit32(t->mc, aa64_add_imm(1, rd, base, (u32)ofs, 0));
+    } else if (ofs < 0 && ofs >= -0xfff) {
+      aa64_emit32(t->mc, aa64_sub_imm(1, rd, base, (u32)(-ofs), 0));
+    } else {
+      /* Beyond the imm12 forms: use the general adjust instead of panicking. */
+      aa64_emit_addr_adjust(t->mc, rd, base, ofs);
+    }
+  } else if (lv.kind == OPK_GLOBAL) {
+    ObjSymId sym = lv.v.global.sym;
+    i64 addend = lv.v.global.addend;
+    if (use_got_for_sym(t, sym)) {
+      aa64_emit_got_load_addr(t, rd, sym);
+      if (addend) aa64_emit_addr_adjust(t->mc, rd, rd, (i32)addend);
+      return;
+    }
+    u32 sec = t->mc->section_id;
+    u32 adrp_pos = t->mc->pos(t->mc);
+    aa64_emit32(t->mc, aa64_adrp_base(rd));
+    t->mc->emit_reloc_at(t->mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym,
+                         addend, 0, 0);
+    u32 add_pos = t->mc->pos(t->mc);
+    aa64_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0));
+    t->mc->emit_reloc_at(t->mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym,
+                         addend, 0, 0);
+  } else {
+    compiler_panic(t->c, a->loc, "aarch64: addr_of not implemented");
+  }
+}
+
+static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) {
+ MCEmitter* mc = t->mc;
+ u32 sec = mc->section_id;
+ u32 rd = reg_num(dst);
+ /* Two strategies: a descriptor call when the object format asks for one, otherwise thread pointer plus TPREL offset. */
+ if (obj_format_tls_via_descriptor(t->c)) {
+ /* TLV access via per-variable descriptor (Mach-O TLVP). The thunk's
+ * ABI is custom — x0 in/out as descriptor → TLV addr, all other
+ * regs preserved — so we materialize via x0 and copy to `dst` only
+ * when they differ. x0/x1 are scratch here (the regalloc only hands
+ * out x19-x28), and x30 was saved at the prologue.
+ *
+ * adrp x0, sym@TLVPPAGE ; R_AARCH64_TLVP_LOAD_PAGE21
+ * ldr x0, [x0, sym@TLVPPAGEOFF] ; R_AARCH64_TLVP_LOAD_PAGEOFF12
+ * ldr x1, [x0] ; descriptor[0] = thunk pointer
+ * blr x1 ; x0 in/out
+ * mov xdst, x0 ; only if dst != x0
+ *
+ * TLVP relocs do not carry an addend; nonzero addends are applied
+ * after the call as a follow-on ADD/SUB on `dst`. */
+ u32 adrp_pos = mc->pos(mc);
+ aa64_emit32(mc, aa64_adrp_base(/*Rd=*/0));
+ mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_TLVP_LOAD_PAGE21, sym, 0, 0,
+ 0);
+ u32 ldr_pos = mc->pos(mc);
+ aa64_emit32(mc,
+ aa64_ldr_uimm(/*size=*/3, /*Rt=*/0, /*Rn=*/0, /*byte_off=*/0));
+ mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_TLVP_LOAD_PAGEOFF12, sym, 0,
+ 0, 0);
+ aa64_emit32(mc,
+ aa64_ldr_uimm(/*size=*/3, /*Rt=*/1, /*Rn=*/0, /*byte_off=*/0));
+ aa64_emit32(mc, aa64_blr(/*Rn=*/1));
+ if (rd != 0) aa64_emit32(mc, aa64_mov_reg(/*sf=*/1, rd, /*Rm=*/0));
+ if (addend) aa64_emit_addr_adjust(mc, rd, rd, (i32)addend);
+ return;
+ }
+ /* Otherwise: read TPIDR_EL0 into AA_TMP0, then add the symbol's TPREL offset with two relocated ADDs. */
+ aa64_emit32(mc, aa64_mrs_tpidr_el0(AA_TMP0));
+ /* First ADD (sh=1) carries the HI12 relocation: dst = AA_TMP0 + high part. */
+ u32 hi_pos = mc->pos(mc);
+ aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, AA_TMP0, /*imm12=*/0, /*sh=*/1));
+ mc->emit_reloc_at(mc, sec, hi_pos, R_AARCH64_TLSLE_ADD_TPREL_HI12, sym,
+ addend, 0, 0);
+ /* Second ADD (sh=0) carries the LO12_NC relocation on top of dst; here the addend travels in the relocations. */
+ u32 lo_pos = mc->pos(mc);
+ aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/rd, /*imm12=*/0, /*sh=*/0));
+ mc->emit_reloc_at(mc, sec, lo_pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, sym,
+ addend, 0, 0);
+}
+
+/* ============================================================
+ * Aggregate helpers
+ * ============================================================ */
+
+static u32 agg_addr_reg(CGTarget* t, Operand op, u32 scratch) {
+  AAImpl* a = impl_of(t);
+  if (op.kind == OPK_REG) return reg_num(op);
+  if (op.kind != OPK_LOCAL) {
+    compiler_panic(t->c, a->loc, "aarch64 agg: address kind %d unsupported",
+                   (int)op.kind);
+  }
+  AASlot* s = aa64_slot_get(a, op.v.frame_slot);
+  if (!s) compiler_panic(t->c, a->loc, "aarch64 agg: bad slot");
+  aa64_emit32(t->mc, aa64_sub_imm(1, scratch, 29, s->off, 0));
+  return scratch;
+}
+
+static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr,
+                          AggregateAccess agg) {
+  /* Copy agg.size bytes from *src_addr to *dst_addr as a straight-line run
+   * of load/store pairs, widest chunks (8, 4, 2, 1 bytes) first.
+   * NOTE(review): offsets reach LDUR/STUR unchecked; addr_base treats
+   * [-256, 255] as their range, so larger copies presumably rely on the
+   * caller or the encoder — confirm. */
+  MCEmitter* mc = t->mc;
+  u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0);
+  u32 sr = agg_addr_reg(t, src_addr, (dr == AA_TMP1) ? AA_TMP2 : AA_TMP1);
+
+  /* The data temp must not alias either address register. AA_TMP2 is the
+   * usual choice, but when dr == AA_TMP1 the source address may have been
+   * built in AA_TMP2 above; fall back to whichever temp is still free. */
+  u32 data = AA_TMP2;
+  if (data == dr || data == sr) data = AA_TMP1;
+  if (data == dr || data == sr) data = AA_TMP0;
+
+  u32 nbytes = agg.size;
+  u32 i = 0;
+  for (u32 sidx = 4; sidx-- > 0;) { /* sidx 3..0 => 8, 4, 2, 1 bytes */
+    u32 step = 1u << sidx;
+    while (i + step <= nbytes) {
+      aa64_emit32(mc, aa64_ldur(sidx, data, sr, (i32)i));
+      aa64_emit32(mc, aa64_stur(sidx, data, dr, (i32)i));
+      i += step;
+    }
+  }
+}
+
+static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value,
+ AggregateAccess agg) {
+ MCEmitter* mc = t->mc;
+ u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0);
+ /* Fill agg.size bytes at dst_addr with one byte value; only an immediate fill byte is supported. */
+ u32 byte;
+ if (byte_value.kind == OPK_IMM) {
+ byte = (u32)(byte_value.v.imm & 0xffu);
+ } else {
+ compiler_panic(t->c, impl_of(t)->loc,
+ "aarch64 set_bytes: REG byte not yet supported");
+ }
+ u32 nbytes = agg.size;
+ /* Zero fill: store register 31 directly, widest chunks (8, 4, 2, 1 bytes) first; no scratch register is loaded. */
+ if (byte == 0) {
+ u32 i = 0;
+ while (i + 8 <= nbytes) {
+ aa64_emit32(mc, aa64_stur(3, 31, dr, (i32)i));
+ i += 8;
+ }
+ while (i + 4 <= nbytes) {
+ aa64_emit32(mc, aa64_stur(2, 31, dr, (i32)i));
+ i += 4;
+ }
+ while (i + 2 <= nbytes) {
+ aa64_emit32(mc, aa64_stur(1, 31, dr, (i32)i));
+ i += 2;
+ }
+ while (i < nbytes) {
+ aa64_emit32(mc, aa64_stur(0, 31, dr, (i32)i));
+ i += 1;
+ }
+ return;
+ }
+ /* Non-zero fill: replicate the byte into all eight byte lanes of a 64-bit pattern held in AA_TMP1. */
+ u64 b64 = byte;
+ b64 |= b64 << 8;
+ b64 |= b64 << 16;
+ b64 |= b64 << 32;
+ aa64_emit_load_imm(mc, /*sf=*/1u, AA_TMP1, (i64)b64);
+ /* Same 8/4/2/1 ladder as the zero case, storing from AA_TMP1 at each width. */
+ u32 i = 0;
+ while (i + 8 <= nbytes) {
+ aa64_emit32(mc, aa64_stur(3, AA_TMP1, dr, (i32)i));
+ i += 8;
+ }
+ while (i + 4 <= nbytes) {
+ aa64_emit32(mc, aa64_stur(2, AA_TMP1, dr, (i32)i));
+ i += 4;
+ }
+ while (i + 2 <= nbytes) {
+ aa64_emit32(mc, aa64_stur(1, AA_TMP1, dr, (i32)i));
+ i += 2;
+ }
+ while (i < nbytes) {
+ aa64_emit32(mc, aa64_stur(0, AA_TMP1, dr, (i32)i));
+ i += 1;
+ }
+}
+
+/* ============================================================
+ * Bitfields
+ * ============================================================ */
+
+static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr,
+ BitFieldAccess bf) {
+ MCEmitter* mc = t->mc;
+ u32 base = agg_addr_reg(t, record_addr, AA_TMP0);
+ u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; /* unset size defaults to 4 bytes */
+ u32 sf = (storage_bytes == 8u) ? 1u : 0u;
+ u32 sidx = size_idx_for_bytes(storage_bytes);
+ u32 rd = reg_num(dst);
+ /* Load the whole storage unit into dst, then extract bits [lsb, lsb + width - 1] in place. */
+ aa64_emit32(mc, aa64_ldur(sidx, rd, base, (i32)bf.storage_offset));
+ u32 lsb = bf.bit_offset;
+ u32 width = bf.bit_width ? bf.bit_width : 1u; /* a zero width is treated as 1 */
+ u32 imms = lsb + width - 1u;
+ if (bf.signed_) {
+ aa64_emit32(mc, aa64_sbfm(sf, rd, rd, lsb, imms)); /* signed field: SBFM form */
+ } else {
+ aa64_emit32(mc, aa64_ubfm(sf, rd, rd, lsb, imms)); /* unsigned field: UBFM form */
+ }
+}
+
+static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src,
+ BitFieldAccess bf) {
+ MCEmitter* mc = t->mc;
+ u32 base = agg_addr_reg(t, record_addr, AA_TMP0);
+ u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; /* unset size defaults to 4 bytes */
+ u32 sf = (storage_bytes == 8u) ? 1u : 0u;
+ u32 sidx = size_idx_for_bytes(storage_bytes);
+ /* Read-modify-write: fetch the current storage unit into AA_TMP1. */
+ aa64_emit32(mc, aa64_ldur(sidx, AA_TMP1, base, (i32)bf.storage_offset));
+ /* The new field value comes from a register, or from an immediate materialised in AA_TMP2. */
+ u32 src_reg;
+ if (src.kind == OPK_IMM) {
+ aa64_emit_load_imm(mc, sf, AA_TMP2, src.v.imm);
+ src_reg = AA_TMP2;
+ } else if (src.kind == OPK_REG) {
+ src_reg = reg_num(src);
+ } else {
+ compiler_panic(t->c, impl_of(t)->loc,
+ "aarch64 bitfield_store: src kind %d unsupported",
+ (int)src.kind);
+ }
+ /* BFM with immr = (reg_size - lsb) % reg_size and imms = width - 1 inserts the low `width` bits of src at bit lsb of AA_TMP1. */
+ u32 reg_size = sf ? 64u : 32u;
+ u32 lsb = bf.bit_offset;
+ u32 width = bf.bit_width ? bf.bit_width : 1u; /* a zero width is treated as 1 */
+ u32 immr = (reg_size - lsb) % reg_size;
+ u32 imms = width - 1u;
+ aa64_emit32(mc, aa64_bfm(sf, AA_TMP1, src_reg, immr, imms));
+ /* Write the merged storage unit back to the same offset. */
+ aa64_emit32(mc, aa64_stur(sidx, AA_TMP1, base, (i32)bf.storage_offset));
+}
+
+/* ============================================================
+ * Arithmetic helpers
+ * ============================================================ */
+
+u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch) {
+  if (op.kind != OPK_REG && op.kind != OPK_IMM) {
+    compiler_panic(t->c, impl_of(t)->loc,
+                   "aarch64 binop: operand kind %d unsupported", (int)op.kind);
+  }
+  if (op.kind == OPK_REG) return reg_num(op);
+  aa64_emit_load_imm(t->mc, sf, scratch, op.v.imm);
+  return scratch;
+}
+
+static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
+ Operand b_op) {
+ MCEmitter* mc = t->mc;
+ /* Floating-point ops: both inputs must be registers and dst an FP register; `type` is 1 for double, 0 otherwise. */
+ if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) {
+ if (a_op.kind != OPK_REG || b_op.kind != OPK_REG || dst.cls != RC_FP) {
+ compiler_panic(t->c, impl_of(t)->loc,
+ "aarch64 binop: FP op requires REG operands");
+ }
+ u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
+ u32 rd = reg_num(dst);
+ u32 rn = reg_num(a_op);
+ u32 rm = reg_num(b_op);
+ u32 w;
+ switch (op) {
+ case BO_FADD: w = aa64_fadd(type, rd, rn, rm); break;
+ case BO_FSUB: w = aa64_fsub(type, rd, rn, rm); break;
+ case BO_FMUL: w = aa64_fmul(type, rd, rn, rm); break;
+ case BO_FDIV: w = aa64_fdiv(type, rd, rn, rm); break;
+ default: w = 0; break; /* not reachable given the enclosing if */
+ }
+ aa64_emit32(mc, w);
+ return;
+ }
+ /* Integer ops from here on; sf is 1 for a 64-bit dst type, 0 otherwise. */
+ u32 sf = type_is_64(dst.type) ? 1u : 0u;
+ u32 rd = reg_num(dst);
+ /* For the commutative ops, move a lone immediate to the right-hand side so the immediate forms below can apply. */
+ switch (op) {
+ case BO_IADD:
+ case BO_AND:
+ case BO_OR:
+ case BO_XOR: {
+ if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) {
+ Operand t_op = a_op; a_op = b_op; b_op = t_op;
+ }
+ break;
+ }
+ default: break;
+ }
+ /* Try a single-instruction immediate encoding; a case that cannot encode the value breaks out to the register path. */
+ if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
+ u32 rn_reg = reg_num(a_op);
+ i64 imm = b_op.v.imm;
+ u32 imm12, sh, N, immr, imms;
+ switch (op) {
+ case BO_IADD:
+ if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
+ aa64_emit32(mc, aa64_add_imm(sf, rd, rn_reg, imm12, sh));
+ return;
+ }
+ break;
+ case BO_ISUB:
+ if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
+ aa64_emit32(mc, aa64_sub_imm(sf, rd, rn_reg, imm12, sh));
+ return;
+ }
+ break;
+ case BO_AND:
+ if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
+ aa64_emit32(mc, aa64_and_imm(sf, rd, rn_reg, N, immr, imms));
+ return;
+ }
+ break;
+ case BO_OR:
+ if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
+ aa64_emit32(mc, aa64_orr_imm(sf, rd, rn_reg, N, immr, imms));
+ return;
+ }
+ break;
+ case BO_XOR:
+ if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
+ aa64_emit32(mc, aa64_eor_imm(sf, rd, rn_reg, N, immr, imms));
+ return;
+ }
+ break;
+ case BO_SHL: {
+ u32 width = sf ? 64u : 32u;
+ u32 sh_amt = (u32)((u64)imm & (width - 1u)); /* shift count taken modulo the register width */
+ if (aa64_lsl_imm_fields(sh_amt, sf, &immr, &imms)) {
+ aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
+ return;
+ }
+ break;
+ }
+ case BO_SHR_U: {
+ u32 width = sf ? 64u : 32u;
+ u32 sh_amt = (u32)((u64)imm & (width - 1u));
+ if (aa64_lsr_imm_fields(sh_amt, sf, &immr, &imms)) {
+ aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
+ return;
+ }
+ break;
+ }
+ case BO_SHR_S: {
+ u32 width = sf ? 64u : 32u;
+ u32 sh_amt = (u32)((u64)imm & (width - 1u));
+ if (aa64_asr_imm_fields(sh_amt, sf, &immr, &imms)) {
+ aa64_emit32(mc, aa64_sbfm(sf, rd, rn_reg, immr, imms));
+ return;
+ }
+ break;
+ }
+ default: break;
+ }
+ }
+ /* Register-register path: any immediate operand is materialised into AA_TMP0 or AA_TMP1. */
+ u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
+ u32 rm =
+ aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0);
+ /* Remainders take two instructions: the quotient goes to AA_TMP2, then MSUB (presumably rn - quotient * rm) forms the result. */
+ u32 word;
+ switch (op) {
+ case BO_IADD: word = aa64_add(sf, rd, rn, rm); break;
+ case BO_ISUB: word = aa64_sub(sf, rd, rn, rm); break;
+ case BO_IMUL: word = aa64_mul(sf, rd, rn, rm); break;
+ case BO_AND: word = aa64_and(sf, rd, rn, rm); break;
+ case BO_OR: word = aa64_orr(sf, rd, rn, rm); break;
+ case BO_XOR: word = aa64_eor(sf, rd, rn, rm); break;
+ case BO_SHL: word = aa64_lslv(sf, rd, rn, rm); break;
+ case BO_SHR_U: word = aa64_lsrv(sf, rd, rn, rm); break;
+ case BO_SHR_S: word = aa64_asrv(sf, rd, rn, rm); break;
+ case BO_UDIV: word = aa64_udiv(sf, rd, rn, rm); break;
+ case BO_SDIV: word = aa64_sdiv(sf, rd, rn, rm); break;
+ case BO_SREM:
+ aa64_emit32(mc, aa64_sdiv(sf, AA_TMP2, rn, rm));
+ word = aa64_msub(sf, rd, AA_TMP2, rm, rn);
+ break;
+ case BO_UREM:
+ aa64_emit32(mc, aa64_udiv(sf, AA_TMP2, rn, rm));
+ word = aa64_msub(sf, rd, AA_TMP2, rm, rn);
+ break;
+ case BO_FADD: /* FP ops returned earlier; listed here only to share the panic below */
+ case BO_FSUB:
+ case BO_FMUL:
+ case BO_FDIV:
+ default:
+ compiler_panic(t->c, impl_of(t)->loc, "aarch64 binop: op %d unimpl",
+ (int)op);
+ }
+ aa64_emit32(mc, word);
+}
+
+static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) {
+  /* Integer unary ops into dst:
+   *   UO_NEG  - arithmetic negate
+   *   UO_BNOT - bitwise complement
+   *   UO_NOT  - logical not (compare with zero, then CSET on equal)
+   * The operation width follows dst's type; any other op panics. */
+  MCEmitter* mc = t->mc;
+  u32 sf = type_is_64(dst.type) ? 1u : 0u;
+  u32 rd = reg_num(dst);
+  /* An immediate operand is materialised into AA_TMP0 first. */
+  u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
+
+  if (op == UO_NEG) {
+    aa64_emit32(mc, aa64_neg(sf, rd, rn));
+  } else if (op == UO_BNOT) {
+    aa64_emit32(mc, aa64_mvn(sf, rd, rn));
+  } else if (op == UO_NOT) {
+    aa64_emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0));
+    aa64_emit32(mc, aa64_cset_eq(sf, rd));
+  } else {
+    compiler_panic(t->c, impl_of(t)->loc, "aarch64 unop: op %d unimpl",
+                   (int)op);
+  }
+}
+
+static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
+ AAImpl* a = impl_of(t);
+ MCEmitter* mc = t->mc;
+ u32 rd = reg_num(dst);
+ u32 rn = reg_num(src);
+ /* One case per conversion kind; each emits its instruction(s) and returns, unknown kinds panic. */
+ switch (k) {
+ case CV_SEXT: { /* sign-extend from the source width: SBFM over bits [0, src_bits - 1] */
+ if (src.cls != RC_INT || dst.cls != RC_INT) {
+ compiler_panic(t->c, a->loc, "aarch64 convert SEXT: bad classes");
+ }
+ u32 src_bits = type_byte_size(src.type) * 8u;
+ u32 sf_dst = type_is_64(dst.type) ? 1u : 0u;
+ aa64_emit32(mc, aa64_sbfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
+ return;
+ }
+ case CV_ZEXT: { /* zero-extend: 32-bit MOV for 32-bit sources, else 32-bit UBFM over the source bits */
+ if (src.cls != RC_INT || dst.cls != RC_INT) {
+ compiler_panic(t->c, a->loc, "aarch64 convert ZEXT: bad classes");
+ }
+ u32 src_bits = type_byte_size(src.type) * 8u;
+ if (src_bits == 32u) {
+ aa64_emit32(mc, aa64_mov_reg(0, rd, rn));
+ } else {
+ aa64_emit32(mc, aa64_ubfm(0, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
+ }
+ return;
+ }
+ case CV_TRUNC: { /* always a 32-bit register move, whatever the destination width */
+ aa64_emit32(mc, aa64_mov_reg(0, rd, rn));
+ return;
+ }
+ case CV_ITOF_S: { /* signed int -> FP; source width from src.type, FP precision from dst.type */
+ u32 sf_src = type_is_64(src.type) ? 1u : 0u;
+ u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
+ aa64_emit32(mc, aa64_scvtf(sf_src, type, rd, rn));
+ return;
+ }
+ case CV_ITOF_U: { /* unsigned int -> FP, same operand selection as ITOF_S */
+ u32 sf_src = type_is_64(src.type) ? 1u : 0u;
+ u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
+ aa64_emit32(mc, aa64_ucvtf(sf_src, type, rd, rn));
+ return;
+ }
+ case CV_FTOI_S: { /* FP -> signed int via FCVTZS */
+ if (src.cls != RC_FP || dst.cls != RC_INT) {
+ compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes");
+ }
+ u32 sf = type_is_64(dst.type) ? 1u : 0u;
+ u32 type = type_is_fp_double(src.type) ? 1u : 0u;
+ aa64_emit32(mc, aa64_fcvtzs(sf, type, rd, rn));
+ return;
+ }
+ case CV_FTOI_U: { /* FP -> unsigned int via FCVTZU */
+ if (src.cls != RC_FP || dst.cls != RC_INT) {
+ compiler_panic(t->c, a->loc, "aarch64 convert FTOI_U: bad classes");
+ }
+ u32 sf = type_is_64(dst.type) ? 1u : 0u;
+ u32 type = type_is_fp_double(src.type) ? 1u : 0u;
+ aa64_emit32(mc, aa64_fcvtzu(sf, type, rd, rn));
+ return;
+ }
+ case CV_FEXT: { /* presumably single -> double, going by the encoder name */
+ aa64_emit32(mc, aa64_fcvt_d_s(rd, rn));
+ return;
+ }
+ case CV_FTRUNC: { /* presumably double -> single, going by the encoder name */
+ aa64_emit32(mc, aa64_fcvt_s_d(rd, rn));
+ return;
+ }
+ case CV_BITCAST: { /* FMOV between register files; the 8-byte vs narrower form is picked from the FP-side type */
+ if (src.cls == RC_INT && dst.cls == RC_FP) {
+ u32 sz = type_byte_size(dst.type);
+ aa64_emit32(mc, sz == 8 ? aa64_fmov_d_x(rd, rn) : aa64_fmov_s_w(rd, rn));
+ } else if (src.cls == RC_FP && dst.cls == RC_INT) {
+ u32 sz = type_byte_size(src.type);
+ aa64_emit32(mc, sz == 8 ? aa64_fmov_x_d(rd, rn) : aa64_fmov_w_s(rd, rn));
+ } else {
+ compiler_panic(t->c, a->loc,
+ "aarch64 convert BITCAST: same-class not yet supported");
+ }
+ return;
+ }
+ default:
+ compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k);
+ }
+}
+
+/* ============================================================
+ * Calls
+ * ============================================================ */
+
+static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi,
+ const CGABIValue* av, u32* next_int, u32* next_fp,
+ u32* stack_off) { /* Emit the code that places one call argument `av`.
+ * `next_int` / `next_fp` are the running indices of the next free integer /
+ * FP argument register and `stack_off` is the running byte offset into the
+ * outgoing stack-argument area; all three are advanced in place. A part goes
+ * to the stack as soon as its register counter has reached 8. Unsupported
+ * storage kinds or ABI classes end in compiler_panic().
+ */
+ AAImpl* a = impl_of(t);
+ ABIArgInfo va_ai;
+ ABIArgPart va_pt;
+ const ABIArgInfo* ai = av->abi;
+ if (!ai) { /* no ABI descriptor supplied: build a one-part DIRECT descriptor from the value itself */
+ u32 sz = type_byte_size(av->type);
+ memset(&va_ai, 0, sizeof va_ai);
+ memset(&va_pt, 0, sizeof va_pt);
+ va_ai.kind = ABI_ARG_DIRECT;
+ va_ai.parts = &va_pt;
+ va_ai.nparts = 1;
+ va_pt.cls = (av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT; /* class follows the storage register class */
+ va_pt.size = sz;
+ va_pt.align = sz;
+ va_pt.src_offset = 0;
+ ai = &va_ai;
+ if (fi && fi->vararg_on_stack) { /* pushing both counters to 8 makes every later ">= 8" test choose the stack */
+ *next_int = 8;
+ *next_fp = 8;
+ }
+ }
+ if (ai->kind == ABI_ARG_IGNORE) return; /* nothing to pass */
+
+ if (ai->kind == ABI_ARG_INDIRECT) { /* passed by reference: the argument is the address of the storage */
+ u32 dst_reg;
+ int to_stack = (*next_int >= 8);
+ if (!to_stack)
+ dst_reg = (*next_int)++;
+ else
+ dst_reg = AA_TMP0; /* staged in the temporary, then spilled below */
+ if (av->storage.kind == OPK_LOCAL) {
+ AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
+ if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot");
+ aa64_emit32(t->mc, aa64_sub_imm(1, dst_reg, 29, s->off, 0)); /* dst = x29 - slot offset */
+ } else if (av->storage.kind == OPK_INDIRECT) {
+ aa64_emit_addr_adjust(t->mc, dst_reg, av->storage.v.ind.base & 0x1f,
+ av->storage.v.ind.ofs);
+ } else {
+ compiler_panic(t->c, a->loc,
+ "aarch64 call: INDIRECT arg storage kind %d unsupported",
+ (int)av->storage.kind);
+ }
+ if (to_stack) { /* store the address at [reg 31 + stack_off] and take an 8-byte slot */
+ aa64_emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off));
+ *stack_off += 8;
+ }
+ return;
+ }
+
+ for (u16 i = 0; i < ai->nparts; ++i) { /* DIRECT: every part is placed on its own */
+ const ABIArgPart* pt = &ai->parts[i];
+ u32 sz = pt->size;
+ u32 sidx = size_idx_for_bytes(sz);
+
+ if (pt->cls == ABI_CLASS_INT) {
+ int to_stack = (*next_int >= 8);
+ u32 dst_reg = to_stack ? AA_TMP0 : (*next_int)++;
+ switch (av->storage.kind) {
+ case OPK_IMM: {
+ u32 sf = (sz == 8) ? 1u : 0u;
+ aa64_emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm);
+ break;
+ }
+ case OPK_REG: {
+ u32 sf = (sz == 8) ? 1u : 0u;
+ aa64_emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage)));
+ break;
+ }
+ case OPK_LOCAL: {
+ AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
+ if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot");
+ i32 off = -(i32)s->off + (i32)pt->src_offset; /* slot lives below x29; src_offset selects the part */
+ aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off));
+ break;
+ }
+ case OPK_INDIRECT: {
+ Operand src; /* same base as the argument, displaced to this part */
+ memset(&src, 0, sizeof src);
+ src.kind = OPK_INDIRECT;
+ src.v.ind.base = av->storage.v.ind.base;
+ src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
+ i32 off;
+ u32 base = addr_base(t, src, &off, AA_TMP0);
+ aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, base, off));
+ break;
+ }
+ default:
+ compiler_panic(t->c, a->loc,
+ "aarch64 call: arg storage kind %d unsupported",
+ (int)av->storage.kind);
+ }
+ if (to_stack) { /* value was staged in AA_TMP0; spill it as a full 8-byte slot */
+ aa64_emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off));
+ *stack_off += 8;
+ }
+ } else if (pt->cls == ABI_CLASS_FP) {
+ int to_stack = (*next_fp >= 8);
+ if (!to_stack) {
+ u32 dst_reg = (*next_fp)++;
+ switch (av->storage.kind) {
+ case OPK_REG: {
+ u32 type = (sz == 8) ? 1u : 0u;
+ aa64_emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage)));
+ break;
+ }
+ case OPK_INDIRECT: {
+ Operand src;
+ memset(&src, 0, sizeof src);
+ src.kind = OPK_INDIRECT;
+ src.v.ind.base = av->storage.v.ind.base;
+ src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
+ i32 off;
+ u32 base = addr_base(t, src, &off, AA_TMP0);
+ aa64_emit32(t->mc, aa64_ldur_fp(sidx, dst_reg, base, off));
+ break;
+ }
+ default:
+ compiler_panic(t->c, a->loc,
+ "aarch64 call: FP arg storage kind %d unsupported",
+ (int)av->storage.kind);
+ }
+ } else {
+ switch (av->storage.kind) {
+ case OPK_REG: /* store the FP register straight into the outgoing slot */
+ aa64_emit32(t->mc, aa64_stur_fp(sidx, reg_num(av->storage), 31,
+ (i32)*stack_off));
+ break;
+ case OPK_INDIRECT: { /* memory to memory: bounce through AA_FP_TMP0 */
+ Operand src;
+ memset(&src, 0, sizeof src);
+ src.kind = OPK_INDIRECT;
+ src.v.ind.base = av->storage.v.ind.base;
+ src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
+ i32 off;
+ u32 base = addr_base(t, src, &off, AA_TMP0);
+ aa64_emit32(t->mc, aa64_ldur_fp(sidx, AA_FP_TMP0, base, off));
+ aa64_emit32(t->mc, aa64_stur_fp(sidx, AA_FP_TMP0, 31, (i32)*stack_off));
+ break;
+ }
+ default:
+ compiler_panic(
+ t->c, a->loc,
+ "aarch64 call: FP stack-arg storage kind %d unsupported",
+ (int)av->storage.kind);
+ }
+ *stack_off += 8; /* a stacked FP part also takes 8 bytes, whatever its size */
+ }
+ } else {
+ compiler_panic(t->c, a->loc, "aarch64 call: ABI class %d unimpl",
+ (int)pt->cls);
+ }
+ }
+}
+
+/* Emit a call described by `d`.
+ *
+ * In emission order:
+ *   1. When the ABI descriptor has has_sret set, x8 is loaded with the
+ *      address of the LOCAL destination slot (x29 - slot offset).
+ *   2. Every argument is placed by emit_arg_value(), which advances the
+ *      integer / FP register counters and the outgoing stack offset.
+ *   3. The 16-byte-rounded outgoing stack size is folded into
+ *      a->max_outgoing.
+ *   4. BL plus an R_AARCH64_CALL26 relocation is emitted for a GLOBAL callee,
+ *      BLR for a REG callee.
+ *   5. Unless the result is IGNORE / INDIRECT or has no parts, each return
+ *      part is moved from register <n> of its class (numbered separately for
+ *      INT and FP) into d->ret.storage.
+ *
+ * Panics via compiler_panic() when the ABI descriptor is missing or when an
+ * operand kind is not handled. */
+static void aa_call(CGTarget* t, const CGCallDesc* d) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+
+  /* The return handling below reads d->abi->ret unconditionally; diagnose a
+   * missing descriptor here instead of dereferencing a null pointer. */
+  if (!d->abi) {
+    compiler_panic(t->c, a->loc, "aarch64 call: missing ABI info");
+  }
+
+  u32 next_int = 0, next_fp = 0, stack_off = 0;
+
+  if (d->abi->has_sret) {
+    if (d->ret.storage.kind != OPK_LOCAL) {
+      compiler_panic(t->c, a->loc,
+                     "aarch64 call: sret destination must be LOCAL");
+    }
+    AASlot* s = aa64_slot_get(a, d->ret.storage.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot");
+    /* x8 = x29 - slot offset: address of the caller-provided result buffer. */
+    aa64_emit32(mc, aa64_sub_imm(1, 8, 29, s->off, 0));
+  }
+
+  for (u32 i = 0; i < d->nargs; ++i) {
+    emit_arg_value(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off);
+  }
+
+  /* Track the largest outgoing-argument area seen, rounded up to 16 bytes. */
+  u32 needed = (stack_off + 15u) & ~15u;
+  if (needed > a->max_outgoing) a->max_outgoing = needed;
+
+  if (d->callee.kind == OPK_GLOBAL) {
+    u32 bl_pos = mc->pos(mc);
+    aa64_emit32(mc, aa64_bl_base());
+    mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26,
+                      d->callee.v.global.sym, d->callee.v.global.addend, 0, 0);
+  } else if (d->callee.kind == OPK_REG) {
+    aa64_emit32(mc, aa64_blr(reg_num(d->callee)));
+  } else {
+    compiler_panic(t->c, a->loc, "aarch64 call: callee kind %d unsupported",
+                   (int)d->callee.kind);
+  }
+
+  const ABIArgInfo* ri = &d->abi->ret;
+  if (ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) {
+    return;
+  }
+  if (ri->nparts == 0) return;
+
+  Operand rs = d->ret.storage;
+  u32 next_int_ret = 0, next_fp_ret = 0;
+  for (u16 i = 0; i < ri->nparts; ++i) {
+    const ABIArgPart* p = &ri->parts[i];
+    u32 src_reg = 0;
+    if (p->cls == ABI_CLASS_INT) {
+      src_reg = next_int_ret++;
+    } else if (p->cls == ABI_CLASS_FP) {
+      src_reg = next_fp_ret++;
+    } else {
+      compiler_panic(t->c, a->loc, "aarch64 call: ret part cls %d unimpl",
+                     (int)p->cls);
+    }
+
+    if (rs.kind == OPK_REG) {
+      /* A single register can only receive a single-part result. */
+      if (ri->nparts != 1) {
+        compiler_panic(t->c, a->loc,
+                       "aarch64 call: REG ret_storage with %u parts",
+                       (unsigned)ri->nparts);
+      }
+      if (p->cls == ABI_CLASS_INT) {
+        u32 sf = (p->size == 8) ? 1u : 0u;
+        aa64_emit32(mc, aa64_mov_reg(sf, reg_num(rs), src_reg));
+      } else {
+        u32 type = (p->size == 8) ? 1u : 0u;
+        aa64_emit32(mc, aa64_fmov_reg(type, reg_num(rs), src_reg));
+      }
+    } else if (rs.kind == OPK_LOCAL || rs.kind == OPK_INDIRECT) {
+      u32 base_reg;
+      i32 base_off;
+      if (rs.kind == OPK_LOCAL) {
+        AASlot* s = aa64_slot_get(a, rs.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
+        base_reg = 29;
+        base_off = -(i32)s->off;
+      } else {
+        base_reg = rs.v.ind.base & 0x1f;
+        base_off = rs.v.ind.ofs;
+      }
+      /* Store the part at its own offset inside the destination. */
+      u32 sidx = size_idx_for_bytes(p->size);
+      i32 off = base_off + (i32)p->src_offset;
+      if (p->cls == ABI_CLASS_INT) {
+        aa64_emit32(mc, aa64_stur(sidx, src_reg, base_reg, off));
+      } else {
+        aa64_emit32(mc, aa64_stur_fp(sidx, src_reg, base_reg, off));
+      }
+    } else if (rs.kind == OPK_IMM && rs.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID)) {
+      /* void return placeholder */
+    } else {
+      compiler_panic(t->c, a->loc,
+                     "aarch64 call: ret_storage kind %d unsupported",
+                     (int)rs.kind);
+    }
+  }
+}
+
+/* Emit an unrolled copy of `nbytes` bytes from [base_reg + base_off] into the
+ * buffer addressed by x8, always using the widest of 8/4/2/1 bytes that still
+ * fits. AA_TMP0 carries each chunk and is clobbered. */
+static void aa_ret_copy_to_sret(MCEmitter* mc, u32 base_reg, i32 base_off,
+                                u32 nbytes) {
+  u32 done = 0;
+  for (u32 sidx = 4; sidx-- > 0;) {
+    u32 chunk = 1u << sidx;
+    while (done + chunk <= nbytes) {
+      aa64_emit32(mc, aa64_ldur(sidx, AA_TMP0, base_reg, base_off + (i32)done));
+      aa64_emit32(mc, aa64_str_uimm(sidx, AA_TMP0, 8, done));
+      done += chunk;
+    }
+  }
+}
+
+/* Emit the code for `return val;` and branch to the function epilogue.
+ *
+ * `val` may be NULL, in which case only the branch is emitted. Otherwise:
+ *   - INDIRECT result: x8 is reloaded from a->sret_ptr_slot when one exists,
+ *     then the aggregate is copied from its LOCAL slot (slot size) or from an
+ *     INDIRECT operand (val->size bytes, which must be non-zero) into [x8].
+ *   - REG / IMM storage: the value is moved or materialised into register 0
+ *     of the matching class.
+ *   - LOCAL / INDIRECT storage returned in registers: each ABI part is loaded
+ *     into the next register of its class. INT and FP parts are numbered
+ *     independently, the same way aa_call() reads them back.
+ *
+ * Panics via compiler_panic() on unknown slots, storage kinds or classes. */
+static void aa_ret(CGTarget* t, const CGABIValue* val) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+
+  if (val) {
+    const ABIArgInfo* ri = val->abi;
+    if (ri && ri->kind == ABI_ARG_INDIRECT) {
+      u32 src_base = 0;
+      i32 src_off = 0;
+      u32 nbytes = 0;
+      if (val->storage.kind == OPK_LOCAL) {
+        AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot");
+        src_base = 29;
+        src_off = -(i32)s->off;
+        nbytes = s->size;
+      } else if (val->storage.kind == OPK_INDIRECT) {
+        nbytes = val->size;
+        if (!nbytes) {
+          compiler_panic(t->c, a->loc,
+                         "aarch64 ret indirect: missing aggregate size");
+        }
+        src_base = val->storage.v.ind.base & 0x1f;
+        src_off = val->storage.v.ind.ofs;
+      } else {
+        compiler_panic(t->c, a->loc,
+                       "aarch64 ret indirect: storage kind %d unsupported",
+                       (int)val->storage.kind);
+      }
+      /* Reload the saved result pointer into x8 when the prologue spilled
+       * it; otherwise x8 is used as it stands. */
+      if (a->sret_ptr_slot != FRAME_SLOT_NONE) {
+        AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot);
+        if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off));
+      }
+      aa_ret_copy_to_sret(mc, src_base, src_off, nbytes);
+    } else if (val->storage.kind == OPK_REG) {
+      if (val->storage.cls == RC_FP) {
+        u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u;
+        aa64_emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage)));
+      } else {
+        u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
+        aa64_emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
+      }
+    } else if (val->storage.kind == OPK_IMM) {
+      u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
+      aa64_emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm);
+    } else if (val->storage.kind == OPK_LOCAL ||
+               val->storage.kind == OPK_INDIRECT) {
+      u32 base_reg;
+      i32 base_off;
+      if (val->storage.kind == OPK_LOCAL) {
+        AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot");
+        base_reg = 29;
+        base_off = -(i32)s->off;
+      } else {
+        base_reg = val->storage.v.ind.base & 0x1f;
+        base_off = val->storage.v.ind.ofs;
+      }
+      /* Number result registers per class so a mixed INT/FP part list lands
+       * where aa_call() expects it; homogeneous lists are unaffected. */
+      u32 next_int_ret = 0, next_fp_ret = 0;
+      u16 nparts = ri ? ri->nparts : 0;
+      for (u16 i = 0; i < nparts; ++i) {
+        const ABIArgPart* pt = &ri->parts[i];
+        u32 sidx = size_idx_for_bytes(pt->size);
+        i32 off = base_off + (i32)pt->src_offset;
+        if (pt->cls == ABI_CLASS_INT) {
+          aa64_emit32(mc, aa64_ldur(sidx, /*Rt=*/next_int_ret++, base_reg, off));
+        } else if (pt->cls == ABI_CLASS_FP) {
+          aa64_emit32(mc,
+                      aa64_ldur_fp(sidx, /*Rt=*/next_fp_ret++, base_reg, off));
+        } else {
+          compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl",
+                         (int)pt->cls);
+        }
+      }
+    }
+  }
+  /* Every return funnels through the shared epilogue. */
+  aa64_emit32(mc, aa64_b_base());
+  mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0);
+}
+
+/* ============================================================
+ * alloca
+ * ============================================================ */
+
+/* Emit a dynamic stack allocation: lower SP by the requested size rounded up
+ * to 16 bytes and leave the block address in register `d`.
+ *
+ * A constant size must be non-negative and, once rounded, at most 0xfff (a
+ * zero request still reserves 16 bytes). A register size is rounded at run
+ * time in AA_TMP0. The final "add d, sp, #0" is recorded in a->add_patches so
+ * its immediate can be rewritten later, and a->has_alloca is set.
+ * Alignments above 16 and other operand kinds panic. */
+static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) {
+  AAImpl* impl = impl_of(t);
+  MCEmitter* em = t->mc;
+
+  if (d.kind != OPK_REG) {
+    compiler_panic(t->c, impl->loc, "aarch64 alloca: dst must be REG");
+  }
+  if (align > 16) {
+    compiler_panic(t->c, impl->loc,
+                   "aarch64 alloca: align %u > 16 not yet supported", align);
+  }
+
+  switch (sz.kind) {
+    case OPK_IMM: {
+      i64 want = sz.v.imm;
+      if (want < 0) {
+        compiler_panic(t->c, impl->loc, "aarch64 alloca: negative size");
+      }
+      u64 bytes = ((u64)want + 15u) & ~(u64)15u;
+      if (bytes == 0) bytes = 16;
+      if (bytes > 0xfffu) {
+        compiler_panic(t->c, impl->loc,
+                       "aarch64 alloca: const size %llu too large for v1",
+                       (unsigned long long)bytes);
+      }
+      aa64_emit32(em, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=SP*/ 31, (u32)bytes, 0));
+      break;
+    }
+    case OPK_REG: {
+      u32 count_reg = reg_num(sz);
+      /* tmp = ((size + 15) >> 4) << 4, then sp -= tmp. */
+      aa64_emit32(em, aa64_add_imm(1, AA_TMP0, count_reg, 15u, 0));
+      aa64_emit32(em, aa64_ubfm(1, AA_TMP0, AA_TMP0, 4, 63));
+      aa64_emit32(em, aa64_ubfm(1, AA_TMP0, AA_TMP0, 60, 59));
+      aa64_emit32(em, aa64_sub_extreg_x_uxtx(/*SP*/ 31, /*SP*/ 31, AA_TMP0));
+      break;
+    }
+    default:
+      compiler_panic(t->c, impl->loc, "aarch64 alloca: size kind %d unsupported",
+                     (int)sz.kind);
+  }
+
+  /* Make room for one more patch record; the table doubles from 4 entries. */
+  if (impl->nadd_patches == impl->add_patches_cap) {
+    u32 grown = impl->add_patches_cap ? impl->add_patches_cap * 2 : 4;
+    struct AAAllocaPatch* fresh =
+        arena_array(t->c->tu, struct AAAllocaPatch, grown);
+    if (impl->add_patches)
+      memcpy(fresh, impl->add_patches, sizeof(*fresh) * impl->nadd_patches);
+    impl->add_patches = fresh;
+    impl->add_patches_cap = grown;
+  }
+
+  u32 out_reg = reg_num(d);
+  struct AAAllocaPatch* rec = &impl->add_patches[impl->nadd_patches];
+  rec->pos = em->pos(em);
+  rec->dst_reg = out_reg;
+  impl->nadd_patches++;
+
+  aa64_emit32(em, aa64_add_imm(1, out_reg, /*Rn=SP*/ 31, 0, 0));
+  impl->has_alloca = 1;
+}
+
+/* ============================================================
+ * Varargs
+ * ============================================================ */
+
+/* Emit code that sets register `dst` to x29 + `ofs`.
+ *
+ * A zero offset becomes a register move; a magnitude up to 0xfff becomes a
+ * single ADD/SUB immediate; anything larger is materialised into `dst` and
+ * added to x29. */
+static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs) {
+  /* Take the magnitude in unsigned arithmetic: negating INT32_MIN as a
+   * signed value would overflow. */
+  u32 mag = (ofs < 0) ? (0u - (u32)ofs) : (u32)ofs;
+
+  if (ofs == 0) {
+    aa64_emit32(mc, aa64_mov_reg(1, dst, 29));
+  } else if (mag <= 0xfff) {
+    if (ofs > 0)
+      aa64_emit32(mc, aa64_add_imm(1, dst, 29, mag, 0));
+    else
+      aa64_emit32(mc, aa64_sub_imm(1, dst, 29, mag, 0));
+  } else {
+    aa64_emit_load_imm(mc, 1, dst, ofs);
+    aa64_emit32(mc, aa64_add(1, dst, 29, dst));
+  }
+}
+
+/* Emit va_start: fill the 32-byte va_list addressed by register `ap_op`.
+ *
+ * Fields written, by byte offset:
+ *    0  x29 + 16 + a->next_param_stack   (next stacked argument)
+ *    8  end address of the GP register save slot
+ *   16  end address of the FP register save slot
+ *   24  (i32) next_param_int * 8  - 64   (offset into the GP save area)
+ *   28  (i32) next_param_fp  * 16 - 128  (offset into the FP save area)
+ *
+ * Panics when the current function is not variadic or when either save slot
+ * cannot be found. AA_TMP0 is clobbered. */
+static void aa_va_start_(CGTarget* t, Operand ap_op) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  if (!a->is_variadic) {
+    compiler_panic(t->c, a->loc, "aarch64 va_start: function not variadic");
+  }
+  u32 ap = reg_num(ap_op);
+  AASlot* gs = aa64_slot_get(a, a->gp_save_slot);
+  AASlot* fs = aa64_slot_get(a, a->fp_save_slot);
+  /* Every other slot lookup in this backend is checked; these two were
+   * dereferenced blindly. */
+  if (!gs || !fs) {
+    compiler_panic(t->c, a->loc,
+                   "aarch64 va_start: missing register save area");
+  }
+
+  {
+    u32 ofs = 16u + a->next_param_stack;
+    if (ofs <= 0xfff)
+      aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0));
+    else {
+      aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs);
+      aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0));
+    }
+    aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0));
+  }
+  /* Slots are addressed as x29 - off, so "- off + size" is one past the end. */
+  emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)gs->size);
+  aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 8));
+  emit_fp_off(mc, AA_TMP0, -(i32)fs->off + (i32)fs->size);
+  aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 16));
+  aa64_emit_load_imm(mc, 0, AA_TMP0,
+                     (i64)((i32)(a->next_param_int * 8u) - 64));
+  aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 24));
+  aa64_emit_load_imm(mc, 0, AA_TMP0,
+                     (i64)((i32)(a->next_param_fp * 16u) - 128));
+  aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 28));
+}
+
+/* Emit va_arg: fetch the next variadic argument of type `ty` into `dst`,
+ * reading and updating the va_list addressed by register `ap_op`.
+ *
+ * The class of `dst` selects which offset/top pair is used (28/16 with a
+ * 16-byte step for FP, 24/8 with an 8-byte step otherwise). A negative offset
+ * means the value is read from top + offset and the offset advanced;
+ * otherwise it is read through the pointer at offset 0, which is then bumped
+ * by 8. AA_TMP0..AA_TMP2 are clobbered. */
+static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op,
+                       CfreeCgTypeId ty) {
+  MCEmitter* em = t->mc;
+  u32 list = reg_num(ap_op);
+  int want_fp = (dst.cls == RC_FP);
+
+  u32 offs_at, top_at, step;
+  if (want_fp) {
+    offs_at = 28u;
+    top_at = 16u;
+    step = 16u;
+  } else {
+    offs_at = 24u;
+    top_at = 8u;
+    step = 8u;
+  }
+  u32 width = size_idx_for_bytes(type_byte_size(ty));
+
+  MCLabel from_stack = em->label_new(em);
+  MCLabel finished = em->label_new(em);
+
+  /* Offset >= 0: the register save area is used up. */
+  aa64_emit32(em, aa64_ldur(2, AA_TMP0, list, (i32)offs_at));
+  aa64_emit32(em, aa64_subs_imm(0, 31, AA_TMP0, 0));
+  aa64_emit32(em, aa64_b_cond(0xa /*GE*/));
+  em->emit_label_ref(em, from_stack, R_AARCH64_CONDBR19, 4, 0);
+
+  /* Register save area: address = top + sign-extended offset. */
+  aa64_emit32(em, aa64_ldur(3, AA_TMP1, list, (i32)top_at));
+  aa64_emit32(em, aa64_sbfm(1, AA_TMP2, AA_TMP0, 0, 31));
+  aa64_emit32(em, aa64_add(1, AA_TMP2, AA_TMP1, AA_TMP2));
+  if (want_fp) {
+    aa64_emit32(em, aa64_ldur_fp(width, reg_num(dst), AA_TMP2, 0));
+  } else {
+    aa64_emit32(em, aa64_ldur(width, reg_num(dst), AA_TMP2, 0));
+  }
+  aa64_emit32(em, aa64_add_imm(0, AA_TMP0, AA_TMP0, step, 0));
+  aa64_emit32(em, aa64_stur(2, AA_TMP0, list, (i32)offs_at));
+  aa64_emit32(em, aa64_b_base());
+  em->emit_label_ref(em, finished, R_AARCH64_JUMP26, 4, 0);
+
+  /* Stack area: read through the pointer at offset 0, then advance it. */
+  em->label_place(em, from_stack);
+  aa64_emit32(em, aa64_ldur(3, AA_TMP1, list, 0));
+  if (want_fp) {
+    aa64_emit32(em, aa64_ldur_fp(width, reg_num(dst), AA_TMP1, 0));
+  } else {
+    aa64_emit32(em, aa64_ldur(width, reg_num(dst), AA_TMP1, 0));
+  }
+  aa64_emit32(em, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0));
+  aa64_emit32(em, aa64_stur(3, AA_TMP1, list, 0));
+
+  em->label_place(em, finished);
+}
+
+/* va_end emits no code on this target; both parameters are unused. */
+static void aa_va_end_(CGTarget* t, Operand a) {
+  (void)a;
+  (void)t;
+}
+
+/* Emit va_copy: copy the 32-byte va_list at register `s` to register `d` as
+ * four 8-byte load/store pairs through AA_TMP0. */
+static void aa_va_copy_(CGTarget* t, Operand d, Operand s) {
+  MCEmitter* em = t->mc;
+  u32 to = reg_num(d);
+  u32 from = reg_num(s);
+  u32 at = 0;
+  while (at < 32u) {
+    aa64_emit32(em, aa64_ldur(3, AA_TMP0, from, (i32)at));
+    aa64_emit32(em, aa64_stur(3, AA_TMP0, to, (i32)at));
+    at += 8u;
+  }
+}
+
+/* ============================================================
+ * Atomics
+ * ============================================================ */
+
+/* Encoders for the acquire/release and exclusive load/store instructions.
+ * Each returns the 32-bit instruction word; `sf64` selects the 64-bit form
+ * when non-zero. Register numbers are masked to five bits. */
+
+/* LDAR Rt, [Rn] */
+static inline u32 aa64_ldar(u32 sf64, u32 Rt, u32 Rn) {
+  u32 word = sf64 ? 0xC8DFFC00u : 0x88DFFC00u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* STLR Rt, [Rn] */
+static inline u32 aa64_stlr(u32 sf64, u32 Rt, u32 Rn) {
+  u32 word = sf64 ? 0xC89FFC00u : 0x889FFC00u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* LDXR Rt, [Rn] */
+static inline u32 aa64_ldxr(u32 sf64, u32 Rt, u32 Rn) {
+  u32 word = sf64 ? 0xC85F7C00u : 0x885F7C00u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* LDAXR Rt, [Rn] */
+static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn) {
+  u32 word = sf64 ? 0xC85FFC00u : 0x885FFC00u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* STXR Ws, Rt, [Rn] -- Rs receives the status result. */
+static inline u32 aa64_stxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) {
+  u32 word = sf64 ? 0xC8007C00u : 0x88007C00u;
+  word |= (Rs & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* STLXR Ws, Rt, [Rn] -- Rs receives the status result. */
+static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) {
+  u32 word = sf64 ? 0xC800FC00u : 0x8800FC00u;
+  word |= (Rs & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* CBNZ Rt, <label>; the branch offset field is left zero for a later fixup. */
+static inline u32 aa64_cbnz(u32 sf64, u32 Rt) {
+  u32 word = 0x35000000u;
+  word |= sf64 << 31;
+  word |= Rt & 0x1f;
+  return word;
+}
+
+/* Non-zero when ordering `o` needs acquire semantics on the load side:
+ * ACQUIRE, ACQ_REL, SEQ_CST and CONSUME. */
+static int mem_order_is_acquire(MemOrder o) {
+  switch (o) {
+    case MO_ACQUIRE:
+    case MO_ACQ_REL:
+    case MO_SEQ_CST:
+    case MO_CONSUME:
+      return 1;
+    default:
+      return 0;
+  }
+}
+/* Non-zero when ordering `o` needs release semantics on the store side:
+ * RELEASE, ACQ_REL and SEQ_CST. */
+static int mem_order_is_release(MemOrder o) {
+  switch (o) {
+    case MO_RELEASE:
+    case MO_ACQ_REL:
+    case MO_SEQ_CST:
+      return 1;
+    default:
+      return 0;
+  }
+}
+
+/* Emit an atomic load of `ma.size` bytes from `addr` into register `dst`.
+ *
+ * `addr` is either a register holding the address or a LOCAL slot, whose
+ * address (x29 - slot offset) is formed in AA_TMP0. Orderings that need
+ * acquire semantics use the load-acquire instruction of the access width;
+ * all others use a plain load of that width.
+ *
+ * Panics on an unknown slot, an unsupported address kind, or a size whose
+ * index does not fit the two-bit size field. */
+static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma,
+                           MemOrder ord) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+
+  u32 base;
+  if (addr.kind == OPK_REG) {
+    base = reg_num(addr);
+  } else if (addr.kind == OPK_LOCAL) {
+    AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot");
+    base = AA_TMP0;
+    aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0));
+  } else {
+    compiler_panic(t->c, a->loc,
+                   "aarch64 atomic_load: addr kind %d unsupported",
+                   (int)addr.kind);
+  }
+
+  u32 sidx = size_idx_for_bytes(ma.size);
+  if (mem_order_is_acquire(ord)) {
+    if (sidx > 3u) {
+      compiler_panic(t->c, a->loc, "aarch64 atomic_load: size %u unsupported",
+                     (unsigned)ma.size);
+    }
+    /* Bits [31:30] of LDAR select the access width (byte, half, word,
+     * doubleword). Writing the real size index there keeps 1- and 2-byte
+     * acquire loads from reading four bytes, matching the relaxed path. */
+    u32 word = aa64_ldar(0, reg_num(dst), base);
+    aa64_emit32(mc, (word & 0x3fffffffu) | (sidx << 30));
+  } else {
+    aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0));
+  }
+}
+
+/* Emit an atomic store of `ma.size` bytes of `src` to `addr`.
+ *
+ * `src` is an immediate (materialised into AA_TMP1) or a register. `addr` is
+ * a register holding the address or a LOCAL slot, whose address (x29 - slot
+ * offset) is formed in AA_TMP0. Orderings that need release semantics use
+ * the store-release instruction of the access width; all others use a plain
+ * store of that width.
+ *
+ * Panics on unsupported operand kinds, an unknown slot, or a size whose
+ * index does not fit the two-bit size field. */
+static void aa_atomic_store(CGTarget* t, Operand addr, Operand src,
+                            MemAccess ma, MemOrder ord) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  u32 sf = (ma.size == 8) ? 1u : 0u;
+
+  u32 src_reg;
+  if (src.kind == OPK_IMM) {
+    src_reg = AA_TMP1;
+    aa64_emit_load_imm(mc, sf, src_reg, src.v.imm);
+  } else if (src.kind == OPK_REG) {
+    src_reg = reg_num(src);
+  } else {
+    compiler_panic(t->c, a->loc,
+                   "aarch64 atomic_store: src kind %d unsupported",
+                   (int)src.kind);
+  }
+  u32 base;
+  if (addr.kind == OPK_REG) {
+    base = reg_num(addr);
+  } else if (addr.kind == OPK_LOCAL) {
+    AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot");
+    base = AA_TMP0;
+    aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0));
+  } else {
+    compiler_panic(t->c, a->loc,
+                   "aarch64 atomic_store: addr kind %d unsupported",
+                   (int)addr.kind);
+  }
+
+  u32 sidx = size_idx_for_bytes(ma.size);
+  if (mem_order_is_release(ord)) {
+    if (sidx > 3u) {
+      compiler_panic(t->c, a->loc, "aarch64 atomic_store: size %u unsupported",
+                     (unsigned)ma.size);
+    }
+    /* Bits [31:30] of STLR select the access width. Using the real size
+     * index keeps a 1- or 2-byte release store from writing four bytes over
+     * its neighbours, matching the relaxed path. */
+    u32 word = aa64_stlr(0, src_reg, base);
+    aa64_emit32(mc, (word & 0x3fffffffu) | (sidx << 30));
+  } else {
+    aa64_emit32(mc, aa64_stur(sidx, src_reg, base, 0));
+  }
+}
+
+/* Emit the instruction(s) computing the value an atomic read-modify-write
+ * stores back: dst_new = prior <op> val. XCHG, and any operation not listed,
+ * simply copies `val`; NAND is an AND followed by a bitwise NOT. */
+static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf, u32 dst_new,
+                             u32 prior, u32 val) {
+  u32 insn;
+  switch (op) {
+    case AO_ADD:
+      insn = aa64_add(sf, dst_new, prior, val);
+      break;
+    case AO_SUB:
+      insn = aa64_sub(sf, dst_new, prior, val);
+      break;
+    case AO_AND:
+      insn = aa64_and(sf, dst_new, prior, val);
+      break;
+    case AO_OR:
+      insn = aa64_orr(sf, dst_new, prior, val);
+      break;
+    case AO_XOR:
+      insn = aa64_eor(sf, dst_new, prior, val);
+      break;
+    case AO_NAND:
+      aa64_emit32(mc, aa64_and(sf, dst_new, prior, val));
+      insn = aa64_mvn(sf, dst_new, dst_new);
+      break;
+    case AO_XCHG:
+    default:
+      insn = aa64_mov_reg(sf, dst_new, val);
+      break;
+  }
+  aa64_emit32(mc, insn);
+}
+
+/* Emit an atomic read-modify-write as a load-exclusive / store-exclusive
+ * retry loop. On exit register `dst` holds the value that was in memory
+ * before the update.
+ *
+ * Register use: AA_TMP0 = address, AA_TMP1 = operand and, after the store,
+ * the store-exclusive status, AA_TMP2 = new value.
+ *
+ * Because the status result overwrites AA_TMP1, the operand is
+ * re-materialised at the top of every iteration; otherwise a retry would
+ * combine with the status value instead of the operand. A register operand
+ * that the loop itself overwrites (it is `dst`, AA_TMP1 or AA_TMP2) is parked
+ * in AA_FP_TMP0 first.
+ * NOTE(review): assumes AA_FP_TMP0 is free scratch here, as in the other
+ * users in this file -- confirm.
+ * NOTE(review): sizes below 4 bytes still use the 32-bit exclusive forms.
+ *
+ * Panics on an unknown slot or an unsupported address / value kind. */
+static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr,
+                          Operand val, MemAccess ma, MemOrder ord) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  u32 sf = (ma.size == 8) ? 1u : 0u;
+
+  u32 base = AA_TMP0;
+  if (addr.kind == OPK_REG) {
+    aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr)));
+  } else if (addr.kind == OPK_LOCAL) {
+    AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot");
+    aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0));
+  } else {
+    compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported",
+                   (int)addr.kind);
+  }
+  if (val.kind != OPK_IMM && val.kind != OPK_REG) {
+    compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported",
+                   (int)val.kind);
+  }
+
+  u32 prior = reg_num(dst);
+  u32 vreg = AA_TMP1;
+
+  /* Park a register operand that would not survive an iteration. */
+  int parked = 0;
+  if (val.kind == OPK_REG) {
+    u32 src = reg_num(val);
+    if (src == prior || src == AA_TMP1 || src == AA_TMP2) {
+      aa64_emit32(mc, sf ? aa64_fmov_d_x(AA_FP_TMP0, src)
+                         : aa64_fmov_s_w(AA_FP_TMP0, src));
+      parked = 1;
+    }
+  }
+
+  int do_acq = mem_order_is_acquire(ord);
+  int do_rel = mem_order_is_release(ord);
+
+  MCLabel L_retry = mc->label_new(mc);
+  mc->label_place(mc, L_retry);
+
+  /* (Re)load the operand: the previous iteration's status clobbered it. */
+  if (val.kind == OPK_IMM) {
+    aa64_emit_load_imm(mc, sf, vreg, val.v.imm);
+  } else if (parked) {
+    aa64_emit32(mc, sf ? aa64_fmov_x_d(vreg, AA_FP_TMP0)
+                       : aa64_fmov_w_s(vreg, AA_FP_TMP0));
+  } else {
+    aa64_emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val)));
+  }
+
+  if (do_acq)
+    aa64_emit32(mc, aa64_ldaxr(sf, prior, base));
+  else
+    aa64_emit32(mc, aa64_ldxr(sf, prior, base));
+
+  emit_rmw_combine(mc, op, sf, AA_TMP2, prior, vreg);
+
+  /* Status lands in vreg: zero on success, non-zero means retry. */
+  if (do_rel)
+    aa64_emit32(mc, aa64_stlxr(sf, vreg, AA_TMP2, base));
+  else
+    aa64_emit32(mc, aa64_stxr(sf, vreg, AA_TMP2, base));
+
+  aa64_emit32(mc, aa64_cbnz(0, vreg));
+  mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
+}
+
+/* Emit an atomic compare-and-swap as a load-exclusive / store-exclusive loop.
+ *
+ * On exit register `prior` holds the value read from memory and register
+ * `ok` holds 1 when `desired` was stored, 0 when the comparison failed.
+ *
+ * Register use: AA_TMP0 = address, AA_TMP1 = expected, AA_TMP2 = desired.
+ * The store-exclusive status is written to the `ok` register, which is dead
+ * until the final result is set. Writing it to AA_TMP1 would destroy
+ * `expected` before a retry compares against it.
+ * NOTE(review): assumes `ok` differs from `prior` and from the AA_TMP
+ * registers -- confirm against the register allocator.
+ * NOTE(review): sizes below 4 bytes still use the 32-bit exclusive forms.
+ *
+ * The load is an acquire load when either the success or the failure
+ * ordering asks for acquire; the store is a release store when the success
+ * ordering asks for release.
+ *
+ * Panics on an unknown slot or unsupported operand kinds. */
+static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr,
+                          Operand expected, Operand desired, MemAccess ma,
+                          MemOrder succ, MemOrder fail) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  u32 sf = (ma.size == 8) ? 1u : 0u;
+
+  u32 base = AA_TMP0;
+  if (addr.kind == OPK_REG)
+    aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr)));
+  else if (addr.kind == OPK_LOCAL) {
+    AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
+    if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot");
+    aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0));
+  } else {
+    compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported",
+                   (int)addr.kind);
+  }
+  if (expected.kind == OPK_IMM)
+    aa64_emit_load_imm(mc, sf, AA_TMP1, expected.v.imm);
+  else if (expected.kind == OPK_REG)
+    aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP1, reg_num(expected)));
+  else
+    compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported",
+                   (int)expected.kind);
+  if (desired.kind == OPK_IMM)
+    aa64_emit_load_imm(mc, sf, AA_TMP2, desired.v.imm);
+  else if (desired.kind == OPK_REG)
+    aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP2, reg_num(desired)));
+  else
+    compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported",
+                   (int)desired.kind);
+
+  /* The load runs on both outcomes, so it must satisfy whichever ordering is
+   * stronger on the acquire side. */
+  int do_acq = mem_order_is_acquire(succ) || mem_order_is_acquire(fail);
+  int do_rel = mem_order_is_release(succ);
+
+  u32 prior_reg = reg_num(prior);
+  u32 status = reg_num(ok);
+
+  MCLabel L_retry = mc->label_new(mc);
+  MCLabel L_fail = mc->label_new(mc);
+  MCLabel L_done = mc->label_new(mc);
+
+  mc->label_place(mc, L_retry);
+  if (do_acq)
+    aa64_emit32(mc, aa64_ldaxr(sf, prior_reg, base));
+  else
+    aa64_emit32(mc, aa64_ldxr(sf, prior_reg, base));
+
+  aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, prior_reg, AA_TMP1));
+  aa64_emit32(mc, aa64_b_cond(0x1u /*NE*/));
+  mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0);
+
+  if (do_rel)
+    aa64_emit32(mc, aa64_stlxr(sf, status, AA_TMP2, base));
+  else
+    aa64_emit32(mc, aa64_stxr(sf, status, AA_TMP2, base));
+  aa64_emit32(mc, aa64_cbnz(0, status));
+  mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
+
+  aa64_emit_load_imm(mc, 0, status, 1);
+  aa64_emit32(mc, aa64_b_base());
+  mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
+
+  /* Comparison failed: drop the exclusive reservation and report 0. */
+  mc->label_place(mc, L_fail);
+  aa64_emit32(mc, aa64_clrex(AA64_BARRIER_OPT_SY));
+  aa64_emit_load_imm(mc, 0, status, 0);
+
+  mc->label_place(mc, L_done);
+}
+
+/* Emit a memory fence: nothing for MO_RELAXED, "dmb ish" for every other
+ * ordering. */
+static void aa_fence(CGTarget* t, MemOrder o) {
+  if (o != MO_RELAXED) {
+    aa64_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
+  }
+}
+
+/* ============================================================
+ * Intrinsics
+ * ============================================================ */
+
+/* Encoders for the instructions used only by the intrinsic lowering. Each
+ * returns the 32-bit instruction word; register numbers are masked to five
+ * bits. */
+
+/* REV16 Wd, Wn */
+static inline u32 aa64_rev16_w(u32 Rd, u32 Rn) {
+  u32 word = 0x5AC00400u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* REV Wd, Wn */
+static inline u32 aa64_rev_w(u32 Rd, u32 Rn) {
+  u32 word = 0x5AC00800u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* REV Xd, Xn */
+static inline u32 aa64_rev_x(u32 Rd, u32 Rn) {
+  u32 word = 0xDAC00C00u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* RBIT Rd, Rn (64-bit form when sf64 is non-zero) */
+static inline u32 aa64_rbit(u32 sf64, u32 Rd, u32 Rn) {
+  u32 word = sf64 ? 0xDAC00000u : 0x5AC00000u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* CLZ Rd, Rn (64-bit form when sf64 is non-zero) */
+static inline u32 aa64_clz(u32 sf64, u32 Rd, u32 Rn) {
+  u32 word = sf64 ? 0xDAC01000u : 0x5AC01000u;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* CNT Vd.8B, Vn.8B */
+static inline u32 aa64_cnt_8b(u32 Vd, u32 Vn) {
+  u32 word = 0x0E205800u;
+  word |= (Vn & 0x1f) << 5;
+  word |= Vd & 0x1f;
+  return word;
+}
+
+/* ADDV Bd, Vn.8B */
+static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn) {
+  u32 word = 0x0E31B800u;
+  word |= (Vn & 0x1f) << 5;
+  word |= Vd & 0x1f;
+  return word;
+}
+
+/* ADDS Rd, Rn, Rm (shifted-register form, no shift) */
+static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
+  u32 word = 0x2B000000u;
+  word |= sf << 31;
+  word |= (Rm & 0x1f) << 16;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* SMADDL Xd, Wn, Wm, Xa */
+static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) {
+  AA64DP3 fields = {
+      .sf = 1, .op31 = 1, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd};
+  return aa64_dp3_pack(fields);
+}
+
+/* SMULL Xd, Wn, Wm: SMADDL with the zero register as addend. */
+static inline u32 aa64_smull(u32 Rd, u32 Rn, u32 Rm) {
+  return aa64_smaddl(Rd, Rn, Rm, AA64_ZR);
+}
+
+/* SUBS Xd, Xn, Wm, SXTW */
+static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm) {
+  u32 word = 0xEB200000u;
+  word |= (Rm & 0x1f) << 16;
+  word |= 6u << 13;
+  word |= (Rn & 0x1f) << 5;
+  word |= Rd & 0x1f;
+  return word;
+}
+
+/* Emit an unrolled copy of `n` bytes from [sr] to [dr], walking offsets
+ * upward and always using the widest of 8/4/2/1 bytes that still fits.
+ * AA_TMP2 carries each chunk. Safe for overlapping ranges when dr <= sr. */
+static void aa_emit_copy_ascending(MCEmitter* mc, u32 dr, u32 sr, u32 n) {
+  u32 i = 0;
+  for (u32 sidx = 4; sidx-- > 0;) {
+    u32 chunk = 1u << sidx;
+    while (i + chunk <= n) {
+      aa64_emit32(mc, aa64_ldur(sidx, AA_TMP2, sr, (i32)i));
+      aa64_emit32(mc, aa64_stur(sidx, AA_TMP2, dr, (i32)i));
+      i += chunk;
+    }
+  }
+}
+
+/* Same as aa_emit_copy_ascending but walking offsets downward from `n`.
+ * Safe for overlapping ranges when dr >= sr. */
+static void aa_emit_copy_descending(MCEmitter* mc, u32 dr, u32 sr, u32 n) {
+  u32 i = n;
+  for (u32 sidx = 4; sidx-- > 0;) {
+    u32 chunk = 1u << sidx;
+    while (i >= chunk) {
+      i -= chunk;
+      aa64_emit32(mc, aa64_ldur(sidx, AA_TMP2, sr, (i32)i));
+      aa64_emit32(mc, aa64_stur(sidx, AA_TMP2, dr, (i32)i));
+    }
+  }
+}
+
+/* Lower one compiler intrinsic to inline code.
+ *
+ * `dsts` / `args` are the result and argument operands for `kind`; `nd` and
+ * `na` are not consulted. Supported kinds: POPCOUNT, CLZ, CTZ, BSWAP16/32/64,
+ * MEMCPY / MEMMOVE / MEMSET (register pointers and a constant length only),
+ * PREFETCH (emits nothing), ASSUME_ALIGNED and EXPECT (plain moves),
+ * UNREACHABLE / TRAP (BRK #0 / BRK #1), and ADD/SUB/MUL_OVERFLOW (MUL only
+ * for 32-bit results). Anything else panics.
+ *
+ * MEMMOVE: both pointers are run-time values, so the copy direction is
+ * chosen at run time. Ascending when dst <= src, descending otherwise. A
+ * fixed descending copy corrupts overlapping moves where dst < src. The
+ * compare clobbers the condition flags, as the overflow intrinsics already
+ * do. Lengths that fit in one chunk (0, 1, 2, 4, 8) need no direction check. */
+static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd,
+                         const Operand* args, u32 na) {
+  AAImpl* a = impl_of(t);
+  MCEmitter* mc = t->mc;
+  (void)nd;
+
+  switch (kind) {
+    case INTRIN_POPCOUNT: {
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      u32 sz_in = type_byte_size(src.type);
+      /* Count bits per byte in the vector unit, then sum the bytes. */
+      if (sz_in == 8)
+        aa64_emit32(mc, aa64_fmov_d_x(AA_FP_TMP0, reg_num(src)));
+      else
+        aa64_emit32(mc, aa64_fmov_s_w(AA_FP_TMP0, reg_num(src)));
+      aa64_emit32(mc, aa64_cnt_8b(AA_FP_TMP0, AA_FP_TMP0));
+      aa64_emit32(mc, aa64_addv_b_8b(AA_FP_TMP0, AA_FP_TMP0));
+      aa64_emit32(mc, aa64_fmov_w_s(reg_num(dst), AA_FP_TMP0));
+      return;
+    }
+    case INTRIN_CLZ: {
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      u32 sf = type_is_64(src.type) ? 1u : 0u;
+      aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src)));
+      return;
+    }
+    case INTRIN_CTZ: {
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      u32 sf = type_is_64(src.type) ? 1u : 0u;
+      /* Trailing zeros = leading zeros of the bit-reversed value. */
+      aa64_emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src)));
+      aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(dst)));
+      return;
+    }
+    case INTRIN_BSWAP16: {
+      aa64_emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0])));
+      return;
+    }
+    case INTRIN_BSWAP32: {
+      aa64_emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0])));
+      return;
+    }
+    case INTRIN_BSWAP64: {
+      aa64_emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0])));
+      return;
+    }
+    case INTRIN_MEMCPY:
+    case INTRIN_MEMMOVE: {
+      Operand da = args[0], sa = args[1], nb = args[2];
+      if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) {
+        compiler_panic(t->c, a->loc,
+                       "aarch64 intrinsic: %s with non-const n or non-REG ptr",
+                       kind == INTRIN_MEMCPY ? "memcpy" : "memmove");
+      }
+      u32 dr = reg_num(da);
+      u32 sr = reg_num(sa);
+      u32 n = (u32)nb.v.imm;
+      /* One load followed by one store is overlap-safe in either direction. */
+      int single_chunk = (n <= 8u) && ((n & (n - 1u)) == 0u);
+      if (kind == INTRIN_MEMCPY || single_chunk) {
+        aa_emit_copy_ascending(mc, dr, sr, n);
+      } else {
+        MCLabel L_down = mc->label_new(mc);
+        MCLabel L_done = mc->label_new(mc);
+
+        /* cmp dst, src ; b.hi -> copy from the top down. */
+        aa64_emit32(mc, aa64_subs_reg(1, /*Rd=ZR*/ 31u, dr, sr));
+        aa64_emit32(mc, aa64_b_cond(0x8u /*HI*/));
+        mc->emit_label_ref(mc, L_down, R_AARCH64_CONDBR19, 4, 0);
+
+        aa_emit_copy_ascending(mc, dr, sr, n);
+        aa64_emit32(mc, aa64_b_base());
+        mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
+
+        mc->label_place(mc, L_down);
+        aa_emit_copy_descending(mc, dr, sr, n);
+
+        mc->label_place(mc, L_done);
+      }
+      return;
+    }
+    case INTRIN_MEMSET: {
+      Operand da = args[0], bv = args[1], nb = args[2];
+      if (da.kind != OPK_REG || nb.kind != OPK_IMM) {
+        compiler_panic(
+            t->c, a->loc,
+            "aarch64 intrinsic: memset with non-const n / non-REG ptr");
+      }
+      u32 dr = reg_num(da);
+      u32 n = (u32)nb.v.imm;
+      u32 byte;
+      u32 src_reg;
+      if (bv.kind == OPK_IMM) {
+        byte = (u32)(bv.v.imm & 0xffu);
+        if (byte == 0) {
+          /* Register 31 reads as zero in a store's data position. */
+          src_reg = 31u;
+        } else {
+          /* Replicate the byte into all eight lanes at compile time. */
+          u64 b64 = byte;
+          b64 |= b64 << 8;
+          b64 |= b64 << 16;
+          b64 |= b64 << 32;
+          aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)b64);
+          src_reg = AA_TMP2;
+        }
+      } else if (bv.kind == OPK_REG) {
+        /* Replicate at run time: value * 0x0101010101010101.
+         * NOTE(review): this only splats correctly when the register holds
+         * a value in 0..255 -- confirm callers truncate first. */
+        aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)0x0101010101010101ll);
+        aa64_emit32(mc, aa64_madd(1, AA_TMP2, reg_num(bv), AA_TMP2, AA64_ZR));
+        src_reg = AA_TMP2;
+      } else {
+        compiler_panic(t->c, a->loc,
+                       "aarch64 intrinsic: memset byte kind %d unsupported",
+                       (int)bv.kind);
+      }
+      u32 i = 0;
+      while (i + 8 <= n) {
+        aa64_emit32(mc, aa64_stur(3, src_reg, dr, (i32)i));
+        i += 8;
+      }
+      while (i + 4 <= n) {
+        aa64_emit32(mc, aa64_stur(2, src_reg, dr, (i32)i));
+        i += 4;
+      }
+      while (i + 2 <= n) {
+        aa64_emit32(mc, aa64_stur(1, src_reg, dr, (i32)i));
+        i += 2;
+      }
+      while (i < n) {
+        aa64_emit32(mc, aa64_stur(0, src_reg, dr, (i32)i));
+        i += 1;
+      }
+      return;
+    }
+    case INTRIN_PREFETCH:
+      /* Nothing is emitted for prefetch. */
+      (void)args;
+      (void)na;
+      return;
+    case INTRIN_ASSUME_ALIGNED: {
+      Operand src = args[0];
+      Operand dst = dsts[0];
+      if (reg_num(src) != reg_num(dst)) {
+        aa64_emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src)));
+      }
+      return;
+    }
+    case INTRIN_EXPECT: {
+      Operand val = args[0];
+      Operand dst = dsts[0];
+      u32 sf = type_is_64(dst.type) ? 1u : 0u;
+      if (val.kind == OPK_REG) {
+        if (reg_num(val) != reg_num(dst)) {
+          aa64_emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val)));
+        }
+      } else if (val.kind == OPK_IMM) {
+        aa64_emit_load_imm(mc, sf, reg_num(dst), val.v.imm);
+      } else {
+        compiler_panic(t->c, a->loc,
+                       "aarch64 intrinsic: expect val kind %d unsupported",
+                       (int)val.kind);
+      }
+      return;
+    }
+    case INTRIN_UNREACHABLE:
+    case INTRIN_TRAP:
+      aa64_emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u));
+      return;
+    case INTRIN_ADD_OVERFLOW:
+    case INTRIN_SUB_OVERFLOW: {
+      Operand a_op = args[0], b_op = args[1];
+      Operand dval = dsts[0], dovf = dsts[1];
+      u32 sf = type_is_64(dval.type) ? 1u : 0u;
+      u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
+      /* Give the second operand whichever temporary the first did not take. */
+      u32 rb =
+          aa64_force_reg_int(t, b_op, sf,
+                             (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0);
+      u32 word = (kind == INTRIN_ADD_OVERFLOW)
+                     ? aa64_adds_reg(sf, reg_num(dval), ra, rb)
+                     : aa64_subs_reg(sf, reg_num(dval), ra, rb);
+      aa64_emit32(mc, word);
+      aa64_emit32(mc, aa64_cset(sf, reg_num(dovf), 0x6u /*VS*/));
+      return;
+    }
+    case INTRIN_MUL_OVERFLOW: {
+      Operand a_op = args[0], b_op = args[1];
+      Operand dval = dsts[0], dovf = dsts[1];
+      u32 sf = type_is_64(dval.type) ? 1u : 0u;
+      if (sf) {
+        compiler_panic(
+            t->c, a->loc,
+            "aarch64 intrinsic: mul_overflow on i64 not yet supported");
+      }
+      u32 ra = aa64_force_reg_int(t, a_op, 0, AA_TMP0);
+      u32 rb =
+          aa64_force_reg_int(t, b_op, 0,
+                             (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0);
+      /* Widening multiply; overflow iff the 64-bit product differs from the
+       * sign-extension of its own low 32 bits. */
+      aa64_emit32(mc, aa64_smull(AA_TMP2, ra, rb));
+      aa64_emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/ 31u, AA_TMP2, AA_TMP2));
+      aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/));
+      aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2));
+      return;
+    }
+    default:
+      compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported",
+                     (int)kind);
+  }
+}
+
+/* ============================================================
+ * Inline asm block
+ * ============================================================ */
+
+/* Lower one inline-asm statement.
+ *
+ * Each clobber name that resolves to a register is first checked against the
+ * ranges x19..x28 (INT) and 8..15 (FP); a hit sets that register's bit in the
+ * matching used_cs_*_mask. Names that do not resolve are skipped. The
+ * template is then bound to its operands and run through a freshly opened
+ * AA64Asm, which is closed again before returning. */
+static void aa_asm_block(CGTarget* t, const char* tmpl,
+                         const AsmConstraint* outs, u32 no, Operand* oo,
+                         const AsmConstraint* ins, u32 ni, const Operand* io,
+                         const Sym* clobs, u32 nc) {
+  AAImpl* impl = impl_of(t);
+
+  for (u32 k = 0; k < nc; ++k) {
+    Reg hw;
+    RegClass rc;
+    if (t->resolve_reg_name(t, clobs[k], &hw, &rc) == 0) {
+      int cs_int = (rc == RC_INT) && hw >= 19u && hw <= 28u;
+      int cs_fp = (rc == RC_FP) && hw >= 8u && hw <= 15u;
+      if (cs_int) {
+        impl->used_cs_int_mask |= 1u << hw;
+      } else if (cs_fp) {
+        impl->used_cs_fp_mask |= 1u << hw;
+      }
+    }
+  }
+
+  AA64Asm* as = aa64_asm_open(t->c);
+  aa64_inline_bind(as, outs, no, oo, ins, ni, io, clobs, nc);
+  aa64_asm_run_template(as, t->mc, tmpl);
+  aa64_asm_close(as);
+}
+
+/* ============================================================
+ * Lifecycle / vtable constructor
+ * ============================================================ */
+
+/* Record `loc` as the current source location, both in the backend state
+ * (used for diagnostics) and in the machine-code emitter. */
+static void aa_set_loc(CGTarget* t, SrcLoc loc) {
+  AAImpl* impl = impl_of(t);
+  MCEmitter* em = t->mc;
+  impl->loc = loc;
+  em->set_loc(em, loc);
+}
+
+/* Finalize hook: this backend has nothing to do here. */
+static void aa_finalize(CGTarget* t) {
+  (void)t;
+}
+
+/* Destroy hook: this backend has nothing to release here. */
+static void aa_destroy(CGTarget* t) {
+  (void)t;
+}
+
+/* compiler_defer() callback: `arg` is the CGTarget to free. */
+static void cgt_cleanup(void* arg) {
+  CGTarget* target = (CGTarget*)arg;
+  cgtarget_free(target);
+}
+
+/* Create the aarch64 code-generation target.
+ *
+ * The backend state is allocated from the translation-unit arena and
+ * zero-filled, the embedded CGTarget is wired to compiler `c`, object
+ * builder `o` and emitter `m`, and every vtable entry is pointed at its aa_*
+ * implementation. The allocator and coordination entries are installed by
+ * their own init helpers. cgt_cleanup is registered with compiler_defer()
+ * for the returned target. */
+CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
+  AAImpl* impl = arena_new(c->tu, AAImpl);
+  memset(impl, 0, sizeof *impl);
+
+  CGTarget* tgt = &impl->base;
+  tgt->c = c;
+  tgt->obj = o;
+  tgt->mc = m;
+
+  /* Function framing. */
+  tgt->func_begin = aa_func_begin;
+  tgt->func_end = aa_func_end;
+  tgt->frame_slot = aa_frame_slot;
+  tgt->param = aa_param;
+
+  /* Data movement. */
+  tgt->load_imm = aa_load_imm;
+  tgt->load_const = aa_load_const;
+  tgt->copy = aa_copy;
+  tgt->load = aa_load;
+  tgt->store = aa_store;
+  tgt->addr_of = aa_addr_of;
+  tgt->tls_addr_of = aa_tls_addr_of;
+  tgt->copy_bytes = aa_copy_bytes;
+  tgt->set_bytes = aa_set_bytes;
+  tgt->bitfield_load = aa_bitfield_load;
+  tgt->bitfield_store = aa_bitfield_store;
+
+  /* Arithmetic and conversions. */
+  tgt->binop = aa_binop;
+  tgt->unop = aa_unop;
+  tgt->convert = aa_convert;
+
+  /* Calls and returns. */
+  tgt->call = aa_call;
+  tgt->ret = aa_ret;
+
+  /* Dynamic stack allocation and varargs. */
+  tgt->alloca_ = aa_alloca_;
+  tgt->va_start_ = aa_va_start_;
+  tgt->va_arg_ = aa_va_arg_;
+  tgt->va_end_ = aa_va_end_;
+  tgt->va_copy_ = aa_va_copy_;
+
+  /* Atomics. */
+  tgt->atomic_load = aa_atomic_load;
+  tgt->atomic_store = aa_atomic_store;
+  tgt->atomic_rmw = aa_atomic_rmw;
+  tgt->atomic_cas = aa_atomic_cas;
+  tgt->fence = aa_fence;
+
+  /* Intrinsics and inline asm. */
+  tgt->intrinsic = aa_intrinsic;
+  tgt->asm_block = aa_asm_block;
+
+  /* Lifecycle. */
+  tgt->set_loc = aa_set_loc;
+  tgt->finalize = aa_finalize;
+  tgt->destroy = aa_destroy;
+
+  /* alloc/label/scope vtable entries */
+  aa_alloc_vtable_init(tgt);
+  aa_coord_vtable_init(tgt);
+
+  /* Reference type_is_signed so the compiler does not warn that it is
+   * unused. */
+  (void)type_is_signed;
+
+  compiler_defer(c, cgt_cleanup, tgt);
+  return tgt;
+}
diff --git a/src/arch/aa64/opt_coord.c b/src/arch/aa64/opt_coord.c
@@ -0,0 +1,96 @@
+/* aa64/opt_coord.c — opt/backend register coordination hooks.
+ * Static arrays so opt_machinize can query the backend instead of
+ * hard-coding arch knowledge. */
+
+#include "arch/aa64/internal.h"
+
+/* ============================================================
+ * Static register tables reported to caller-owned allocators. */
+
+static const Reg aa_int_allocable[] = {19, 20, 21, 22, 23, /* x19..x28 */
+ 24, 25, 26, 27, 28};
+static const Reg aa_fp_allocable[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23}; /* v8..v23 */
+/* Scratch registers, reported separately from the allocable sets. */
+static const Reg aa_int_scratch[] = {16, 17}; /* x16, x17 */
+static const Reg aa_fp_scratch[] = {24, 25}; /* v24, v25 */
+
+/* ============================================================
+ * Vtable methods */
+
+/* Report the static allocable-register table for `cls` via out/nregs.
+ * Classes other than RC_INT and RC_FP get an empty list (NULL, 0).
+ * The target handle is not consulted. */
+static void aa_get_allocable_regs(CGTarget* t, RegClass cls,
+                                  const Reg** out, u32* nregs) {
+  const Reg* table = NULL;
+  u32 count = 0;
+  (void)t;
+  if (cls == RC_INT) {
+    table = aa_int_allocable;
+    count = sizeof aa_int_allocable / sizeof aa_int_allocable[0];
+  } else if (cls == RC_FP) {
+    table = aa_fp_allocable;
+    count = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0];
+  }
+  *out = table;
+  *nregs = count;
+}
+
+/* Report the static scratch-register table for `cls` via out/nregs.
+ * Classes other than RC_INT and RC_FP get an empty list (NULL, 0).
+ * The target handle is not consulted. */
+static void aa_get_scratch_regs(CGTarget* t, RegClass cls,
+                                const Reg** out, u32* nregs) {
+  const Reg* table = NULL;
+  u32 count = 0;
+  (void)t;
+  if (cls == RC_INT) {
+    table = aa_int_scratch;
+    count = sizeof aa_int_scratch / sizeof aa_int_scratch[0];
+  } else if (cls == RC_FP) {
+    table = aa_fp_scratch;
+    count = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0];
+  }
+  *out = table;
+  *nregs = count;
+}
+
+/* Nonzero when `reg` of class `cls` is caller-saved under AAPCS64
+ * (RC_INT: x0-x18 and x30; RC_FP: v0-v7 and v16-v31). Any other class
+ * reports 0. The target handle is not consulted. */
+static int aa_is_caller_saved(CGTarget* t, RegClass cls, Reg reg) {
+  int clobbered = 0;
+  (void)t;
+  if (cls == RC_INT) {
+    clobbered = reg <= 18 || reg == 30;
+  } else if (cls == RC_FP) {
+    clobbered = reg <= 7 || reg >= 16;
+  }
+  return clobbered;
+}
+
+/* Set the bit for each listed register in the backend's used_cs_* mask.
+ * For RC_INT, x19-x28 are recorded in used_cs_int_mask; for RC_FP,
+ * v8-v15 are recorded in used_cs_fp_mask (the AAPCS64 callee-saved
+ * ranges). Registers outside those ranges, and any other class, are
+ * ignored. */
+static void aa_reserve_hard_regs(CGTarget* t, RegClass cls,
+                                 const Reg* regs, u32 n) {
+  AAImpl* impl = impl_of(t);
+  for (u32 k = 0; k != n; k++) {
+    const Reg r = regs[k];
+    if (cls == RC_INT && r >= 19u && r <= 28u) {
+      impl->used_cs_int_mask |= 1u << r;
+    } else if (cls == RC_FP && r >= 8u && r <= 15u) {
+      impl->used_cs_fp_mask |= 1u << r;
+    }
+  }
+}
+
+void aa_coord_vtable_init(CGTarget* t) { /* install coordination hooks */
+ t->get_allocable_regs = aa_get_allocable_regs;
+ t->get_scratch_regs = aa_get_scratch_regs;
+ t->is_caller_saved = aa_is_caller_saved;
+ t->reserve_hard_regs = aa_reserve_hard_regs;
+}
diff --git a/src/arch/aa64/regs.c b/src/arch/aa64/regs.c
@@ -0,0 +1,88 @@
+/* AArch64 register name table — DWARF index ↔ assembler name.
+ *
+ * DWARF register numbering for AArch64 (per the AAPCS64 ABI supplement):
+ * 0..30 X0..X30 (also W0..W30; same DWARF index)
+ * 31 SP (X31 / WSP)
+ * 32 PC
+ * 33 ELR (mode dependent; unused here)
+ * 64..95 V0..V31 (also B/H/S/D forms; same index)
+ *
+ * The canonical assembler spelling for v1 is the 64-bit form (Xn / Vn);
+ * disassembler output picks W/B/H/S/D based on instruction width
+ * separately. */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "arch/aa64/regs.h"
+#include "core/core.h"
+
+typedef struct AA64Reg {
+ uint32_t dwarf_idx; /* DWARF register number */
+ const char* name; /* canonical assembler spelling */
+} AA64Reg;
+
+static const AA64Reg AA64_REGS[] = {
+ {0, "x0"}, {1, "x1"}, {2, "x2"}, {3, "x3"}, {4, "x4"},
+ {5, "x5"}, {6, "x6"}, {7, "x7"}, {8, "x8"}, {9, "x9"},
+ {10, "x10"}, {11, "x11"}, {12, "x12"}, {13, "x13"}, {14, "x14"},
+ {15, "x15"}, {16, "x16"}, {17, "x17"}, {18, "x18"}, {19, "x19"},
+ {20, "x20"}, {21, "x21"}, {22, "x22"}, {23, "x23"}, {24, "x24"},
+ {25, "x25"}, {26, "x26"}, {27, "x27"}, {28, "x28"}, {29, "x29"},
+ {30, "x30"}, {31, "sp"}, {32, "pc"}, /* 33 (ELR) is not listed */
+ {64, "v0"}, {65, "v1"}, {66, "v2"}, {67, "v3"}, {68, "v4"},
+ {69, "v5"}, {70, "v6"}, {71, "v7"}, {72, "v8"}, {73, "v9"},
+ {74, "v10"}, {75, "v11"}, {76, "v12"}, {77, "v13"}, {78, "v14"},
+ {79, "v15"}, {80, "v16"}, {81, "v17"}, {82, "v18"}, {83, "v19"},
+ {84, "v20"}, {85, "v21"}, {86, "v22"}, {87, "v23"}, {88, "v24"},
+ {89, "v25"}, {90, "v26"}, {91, "v27"}, {92, "v28"}, {93, "v29"},
+ {94, "v30"}, {95, "v31"},
+};
+/* Number of rows in AA64_REGS. */
+static const uint32_t AA64_REGS_N = (uint32_t)(sizeof AA64_REGS /
+ sizeof AA64_REGS[0]);
+
+/* Canonical name for a DWARF index, or NULL when no table row has it. */
+const char* aa64_register_name(uint32_t dwarf_idx) {
+  const AA64Reg* const end = AA64_REGS + AA64_REGS_N;
+  for (const AA64Reg* row = AA64_REGS; row != end; ++row)
+    if (row->dwarf_idx == dwarf_idx) return row->name;
+  return NULL;
+}
+
+/* Map assembler spelling `name` to its DWARF index (case-sensitive).
+ *
+ * Accepted spellings: every AA64_REGS name, the Wn form of any Xn row
+ * (same DWARF index), and xzr / wzr / wsp, which all report slot 31.
+ * Returns 0 on success, storing the index through idx_out when it is
+ * non-NULL; returns 1 when name is NULL or not recognized. */
+int aa64_register_index(const char* name, uint32_t* idx_out) {
+  uint32_t i, idx = 31u;
+  int found;
+  if (!name) return 1;
+  /* Aliases that have no table row; all of them denote slot 31. */
+  found = !strcmp(name, "xzr") || !strcmp(name, "wzr") || !strcmp(name, "wsp");
+  for (i = 0; i < AA64_REGS_N && !found; ++i) {
+    const char* canon = AA64_REGS[i].name;
+    /* Wn shares Xn's DWARF index, so compare the text after the prefix. */
+    int w_alias = name[0] == 'w' && canon[0] == 'x' &&
+                  !strcmp(canon + 1, name + 1);
+    if (w_alias || !strcmp(canon, name)) {
+      idx = AA64_REGS[i].dwarf_idx;
+      found = 1;
+    }
+  }
+  if (!found) return 1;
+  if (idx_out) *idx_out = idx;
+  return 0;
+}
+
+uint32_t aa64_register_iter_size(void) { return AA64_REGS_N; } /* row count */
+
+/* Copy row i into the non-NULL outputs; return 1 when i is out of range. */
+int aa64_register_iter_get(uint32_t i, uint32_t* dwarf_out,
+                           const char** name_out) {
+  if (i < AA64_REGS_N && dwarf_out) *dwarf_out = AA64_REGS[i].dwarf_idx;
+  if (i < AA64_REGS_N && name_out) *name_out = AA64_REGS[i].name;
+  return i < AA64_REGS_N ? 0 : 1;
+}
diff --git a/src/arch/aa64_regs.h b/src/arch/aa64/regs.h
diff --git a/src/arch/aa64_asm.c b/src/arch/aa64_asm.c
@@ -1,1379 +0,0 @@
-/* AArch64 standalone .s instruction parser.
- *
- * Per-mnemonic dispatch: each entry in the mnemonic table names a
- * parse function that reads operand tokens through the asm-driver
- * surface and emits the encoded word via the inline encoders in
- * aa64_isa.h. Encoders are the single source of truth for bit
- * layout — the disassembler shares them through aa64_*_unpack.
- *
- * Aliases (`mov`, `neg`, `cmp`, `mul`, ...) live in this table as
- * dedicated rows that pick the canonical form's encoder with the
- * alias-specific operand shape. When a mnemonic admits multiple
- * forms (e.g. `mov` register-vs-immediate, `add` register-vs-
- * immediate), the parser branches on operand shape after reading
- * the first non-Rd operand. */
-
-#include "arch/aa64_asm.h"
-
-#include <string.h>
-
-#include "arch/aa64_isa.h"
-#include "arch/aa64_regs.h"
-#include "arch/arch.h"
-#include "core/arena.h"
-#include "core/pool.h"
-#include "core/strbuf.h"
-#include "asm/asm_lex.h"
-#include "obj/obj.h"
-#include "asm/asm_helpers.h"
-
-/* ---- public handle ---- */
-
-struct AA64Asm {
- ArchAsm base;
- Compiler* c;
-
- /* Inline-asm bound state (set by aa64_inline_bind, cleared otherwise).
- * Operand indexing per GCC convention: 0..nout-1 are outputs, then
- * nout..nout+nin-1 are inputs. Templates address into this combined
- * list via %N / %wN / %xN / %aN. out_ops is mutable (the binder fills
- * in result locations); in_ops + constraints + clobbers are read-only
- * borrows. */
- const AsmConstraint* outs;
- Operand* out_ops;
- const AsmConstraint* ins;
- const Operand* in_ops;
- const Sym* clobbers;
- u32 nout;
- u32 nin;
- u32 nclob;
-};
-
-static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic);
-static void aa64_arch_asm_destroy(ArchAsm* base);
-
-AA64Asm* aa64_asm_open(Compiler* c) {
- AA64Asm* a = arena_new(c->tu, AA64Asm);
- memset(a, 0, sizeof *a);
- a->base.insn = aa64_arch_asm_insn;
- a->base.destroy = aa64_arch_asm_destroy;
- a->c = c;
- return a;
-}
-
-void aa64_asm_close(AA64Asm* a) { (void)a; }
-
-ArchAsm* aa64_arch_asm_new(Compiler* c) {
- return &aa64_asm_open(c)->base;
-}
-
-static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) {
- aa64_asm_insn((AA64Asm*)base, d, mnemonic);
-}
-
-static void aa64_arch_asm_destroy(ArchAsm* base) {
- aa64_asm_close((AA64Asm*)base);
-}
-
-void aa64_inline_bind(AA64Asm* a,
- const AsmConstraint* outs, u32 nout, Operand* out_ops,
- const AsmConstraint* ins, u32 nin, const Operand* in_ops,
- const Sym* clobbers, u32 nclob) {
- a->outs = outs;
- a->out_ops = out_ops;
- a->ins = ins;
- a->in_ops = in_ops;
- a->clobbers = clobbers;
- a->nout = nout;
- a->nin = nin;
- a->nclob = nclob;
-}
-
-/* ---- helpers ---- */
-
-static int tok_punct(AsmTok t, u32 p) { return asm_driver_tok_is_punct(t, p); }
-
-static int icase_eq(const char* a, size_t an, const char* b) {
- size_t i;
- for (i = 0; i < an; ++i) {
- char x = a[i], y = b[i];
- if (x >= 'A' && x <= 'Z') x = (char)(x + ('a' - 'A'));
- if (y >= 'A' && y <= 'Z') y = (char)(y + ('a' - 'A'));
- if (x != y || !y) return 0;
- }
- return b[an] == '\0';
-}
-
-/* Parse a register operand. Returns the 5-bit encoded register number
- * via *reg_out and the form via *is64_out. Recognized forms (case-
- * insensitive):
- * w0..w30, wzr → is64=0, reg=0..30 / 31
- * x0..x30, xzr, lr (=x30) → is64=1, reg=0..30 / 31
- * sp → is64=1, reg=31 (sp_means_sp set)
- * wsp → is64=0, reg=31 (sp_means_sp set)
- * Aliases:
- * fp = x29
- * ip0 = x16, ip1 = x17 (PLT scratch — useful for hand-written PLTs) */
-typedef struct AA64Reg {
- u32 num;
- u8 is64;
- u8 is_sp; /* 1 if the spelling was "sp" / "wsp" */
- u8 is_fp; /* 1 for SIMD/FP register spellings accepted in FP forms */
- u8 pad;
-} AA64Reg;
-
-static int parse_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) {
- size_t n = 0;
- const char* p = pool_str(asm_driver_pool(d), ident, &n);
- if (!p || !n) return 0;
- /* "sp" */
- if (icase_eq(p, n, "sp")) {
- out->num = 31;
- out->is64 = 1;
- out->is_sp = 1;
- out->is_fp = 0;
- return 1;
- }
- if (icase_eq(p, n, "wsp")) {
- out->num = 31;
- out->is64 = 0;
- out->is_sp = 1;
- out->is_fp = 0;
- return 1;
- }
- if (icase_eq(p, n, "lr")) {
- out->num = 30;
- out->is64 = 1;
- out->is_sp = 0;
- out->is_fp = 0;
- return 1;
- }
- if (icase_eq(p, n, "fp")) {
- out->num = 29;
- out->is64 = 1;
- out->is_sp = 0;
- out->is_fp = 0;
- return 1;
- }
- if (icase_eq(p, n, "ip0")) {
- out->num = 16;
- out->is64 = 1;
- out->is_sp = 0;
- out->is_fp = 0;
- return 1;
- }
- if (icase_eq(p, n, "ip1")) {
- out->num = 17;
- out->is64 = 1;
- out->is_sp = 0;
- out->is_fp = 0;
- return 1;
- }
- if (icase_eq(p, n, "xzr")) {
- out->num = 31;
- out->is64 = 1;
- out->is_sp = 0;
- out->is_fp = 0;
- return 1;
- }
- if (icase_eq(p, n, "wzr")) {
- out->num = 31;
- out->is64 = 0;
- out->is_sp = 0;
- out->is_fp = 0;
- return 1;
- }
- /* W/X<num> */
- if ((p[0] == 'w' || p[0] == 'W' || p[0] == 'x' || p[0] == 'X') && n >= 2) {
- u32 r = 0;
- size_t i;
- for (i = 1; i < n; ++i) {
- char c = p[i];
- if (c < '0' || c > '9') return 0;
- r = r * 10 + (u32)(c - '0');
- if (r > 31) return 0;
- }
- out->num = r;
- out->is64 = (p[0] == 'x' || p[0] == 'X') ? 1 : 0;
- out->is_sp = 0;
- out->is_fp = 0;
- return 1;
- }
- return 0;
-}
-
-static int parse_fp_d_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) {
- size_t n = 0;
- const char* p = pool_str(asm_driver_pool(d), ident, &n);
- if (!p || n < 2 || (p[0] != 'd' && p[0] != 'D')) return 0;
- u32 r = 0;
- for (size_t i = 1; i < n; ++i) {
- char c = p[i];
- if (c < '0' || c > '9') return 0;
- r = r * 10 + (u32)(c - '0');
- if (r > 31) return 0;
- }
- out->num = r;
- out->is64 = 1;
- out->is_sp = 0;
- out->is_fp = 1;
- return 1;
-}
-
-static AA64Reg parse_reg(AsmDriver* d) {
- AsmTok t = asm_driver_next(d);
- AA64Reg r;
- memset(&r, 0, sizeof r);
- if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r))
- asm_driver_panic(d, "asm: expected register");
- return r;
-}
-
-static AA64Reg parse_ldstp_reg(AsmDriver* d) {
- AsmTok t = asm_driver_next(d);
- AA64Reg r;
- memset(&r, 0, sizeof r);
- if (t.kind != ASM_TOK_IDENT ||
- (!parse_reg_from_ident(d, t.v.ident, &r) &&
- !parse_fp_d_reg_from_ident(d, t.v.ident, &r))) {
- asm_driver_panic(d, "asm: expected register");
- }
- return r;
-}
-
-static void reject_sp_reg(AsmDriver* d, AA64Reg r, const char* what) {
- if (r.is_sp) asm_driver_panic(d, "asm: %s: SP register not allowed", what);
-}
-
-static void require_sp_spelling(AsmDriver* d, AA64Reg r, const char* what) {
- if (r.num == 31u && !r.is_sp)
- asm_driver_panic(d, "asm: %s: zero register not allowed in SP operand",
- what);
-}
-
-/* Parse "#imm" (with optional + / -) or a bare expression — GNU as is
- * lenient about the leading hash. Returns an i64. */
-static i64 parse_imm_const(AsmDriver* d) {
- (void)asm_driver_eat_punct(d, '#');
- return asm_driver_parse_const(d);
-}
-
-/* Parse a possibly-symbolic operand prefixed by '#'. */
-static void parse_imm_sym(AsmDriver* d, ObjSymId* sym_out, i64* val_out) {
- (void)asm_driver_eat_punct(d, '#');
- asm_driver_parse_sym_expr(d, sym_out, val_out);
-}
-
-static void emit32(AsmDriver* d, u32 word) {
- MCEmitter* mc = asm_driver_mc(d);
- (void)asm_driver_cur_section(d);
- u8 buf[4];
- buf[0] = (u8)(word & 0xff);
- buf[1] = (u8)((word >> 8) & 0xff);
- buf[2] = (u8)((word >> 16) & 0xff);
- buf[3] = (u8)((word >> 24) & 0xff);
- mc->emit_bytes(mc, buf, 4);
-}
-
-static int parse_cond_from_ident(AsmDriver* d, Sym ident, u32* out) {
- size_t n = 0;
- const char* s = pool_str(asm_driver_pool(d), ident, &n);
- if (!s) return 0;
- if (icase_eq(s, n, "eq")) *out = 0;
- else if (icase_eq(s, n, "ne")) *out = 1;
- else if (icase_eq(s, n, "cs") || icase_eq(s, n, "hs")) *out = 2;
- else if (icase_eq(s, n, "cc") || icase_eq(s, n, "lo")) *out = 3;
- else if (icase_eq(s, n, "mi")) *out = 4;
- else if (icase_eq(s, n, "pl")) *out = 5;
- else if (icase_eq(s, n, "vs")) *out = 6;
- else if (icase_eq(s, n, "vc")) *out = 7;
- else if (icase_eq(s, n, "hi")) *out = 8;
- else if (icase_eq(s, n, "ls")) *out = 9;
- else if (icase_eq(s, n, "ge")) *out = 10;
- else if (icase_eq(s, n, "lt")) *out = 11;
- else if (icase_eq(s, n, "gt")) *out = 12;
- else if (icase_eq(s, n, "le")) *out = 13;
- else if (icase_eq(s, n, "al")) *out = 14;
- else return 0;
- return 1;
-}
-
-static u32 parse_cond(AsmDriver* d, const char* what) {
- AsmTok t = asm_driver_next(d);
- u32 cond = 0;
- if (t.kind != ASM_TOK_IDENT || !parse_cond_from_ident(d, t.v.ident, &cond))
- asm_driver_panic(d, "asm: %s: expected condition code", what);
- return cond;
-}
-
-static void expect_comma(AsmDriver* d, const char* what) {
- if (!asm_driver_eat_comma(d))
- asm_driver_panic(d, "asm: expected ',' (%s)", what);
-}
-
-/* ---- per-mnemonic parsers ---- */
-
-/* ret [Xn] — Xn defaults to x30. */
-static void p_ret(AsmDriver* d) {
- if (asm_driver_at_eol(d)) {
- emit32(d, aa64_ret(30));
- return;
- }
- AA64Reg r = parse_reg(d);
- if (!r.is64) asm_driver_panic(d, "asm: ret: 64-bit register expected");
- emit32(d, aa64_ret(r.num));
-}
-
-static void p_br(AsmDriver* d) {
- AA64Reg r = parse_reg(d);
- if (!r.is64) asm_driver_panic(d, "asm: br: 64-bit register expected");
- emit32(d, aa64_br(r.num));
-}
-
-static void p_blr(AsmDriver* d) {
- AA64Reg r = parse_reg(d);
- if (!r.is64) asm_driver_panic(d, "asm: blr: 64-bit register expected");
- emit32(d, aa64_blr(r.num));
-}
-
-static void p_nop(AsmDriver* d) {
- (void)d;
- emit32(d, aa64_nop());
-}
-
-/* Memory barriers (DMB / DSB / ISB / CLREX).
- *
- * dmb <option> ; option in {sy, ish, nsh, osh, ld, st, ishld,
- * ishst, nshld, nshst, oshld, oshst}
- * dmb #imm4 ; numeric form
- * dsb <option> | #imm4
- * isb [<option>] ; option defaults to sy when omitted
- * clrex [#imm4] ; option defaults to sy (15) when omitted */
-static u32 parse_barrier_option(AsmDriver* d, int allow_dmb_ld_st) {
- if (asm_driver_at_eol(d)) return AA64_BARRIER_OPT_SY;
- AsmTok t = asm_driver_peek(d);
- if (t.kind == ASM_TOK_IDENT) {
- (void)asm_driver_next(d);
- size_t n = 0;
- const char* s = pool_str(asm_driver_pool(d), t.v.ident, &n);
- if (icase_eq(s, n, "sy")) return AA64_BARRIER_OPT_SY;
- if (icase_eq(s, n, "ish")) return AA64_BARRIER_OPT_ISH;
- if (icase_eq(s, n, "ishld")) return AA64_BARRIER_OPT_ISHLD;
- if (icase_eq(s, n, "ishst")) return AA64_BARRIER_OPT_ISHST;
- if (icase_eq(s, n, "nsh")) return AA64_BARRIER_OPT_NSH;
- if (icase_eq(s, n, "nshld")) return AA64_BARRIER_OPT_NSHLD;
- if (icase_eq(s, n, "nshst")) return AA64_BARRIER_OPT_NSHST;
- if (icase_eq(s, n, "osh")) return AA64_BARRIER_OPT_OSH;
- if (icase_eq(s, n, "oshld")) return AA64_BARRIER_OPT_OSHLD;
- if (icase_eq(s, n, "oshst")) return AA64_BARRIER_OPT_OSHST;
- if (allow_dmb_ld_st) {
- if (icase_eq(s, n, "ld")) return AA64_BARRIER_OPT_LD;
- if (icase_eq(s, n, "st")) return AA64_BARRIER_OPT_ST;
- }
- asm_driver_panic(d, "asm: unknown barrier option");
- }
- /* Numeric form: '#imm4'. */
- i64 imm = parse_imm_const(d);
- if (imm < 0 || imm > 15)
- asm_driver_panic(d, "asm: barrier imm out of range");
- return (u32)imm;
-}
-
-static void p_dmb(AsmDriver* d) {
- u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/1);
- emit32(d, aa64_dmb(opt));
-}
-static void p_dsb(AsmDriver* d) {
- u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0);
- emit32(d, aa64_dsb(opt));
-}
-static void p_isb(AsmDriver* d) {
- u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0);
- emit32(d, aa64_isb(opt));
-}
-static void p_clrex(AsmDriver* d) {
- u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0);
- emit32(d, aa64_clrex(opt));
-}
-
-/* mov:
- * mov Rd, Rm → ORR Rd, ZR, Rm
- * mov Rd, #imm → MOVZ (if imm fits in a single halfword unshifted)
- * MOVN (if ~imm fits)
- * otherwise: panic (multi-step expansion deferred). */
-static void p_mov(AsmDriver* d) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "mov");
- AsmTok t = asm_driver_peek(d);
- if (t.kind == ASM_TOK_IDENT) {
- AA64Reg src;
- memset(&src, 0, sizeof src);
- if (parse_reg_from_ident(d, t.v.ident, &src)) {
- (void)asm_driver_next(d);
- if (src.is64 != rd.is64)
- asm_driver_panic(d, "asm: mov: register width mismatch");
- /* mov involving SP encodes as `ADD Rd, Rsp, #0` per AArch64;
- * approximate with that exact form. */
- if (rd.is_sp || src.is_sp) {
- require_sp_spelling(d, rd, "mov sp");
- require_sp_spelling(d, src, "mov sp");
- emit32(d, aa64_add_imm(rd.is64, rd.num, src.num, 0, 0));
- return;
- }
- emit32(d, aa64_mov_reg(rd.is64, rd.num, src.num));
- return;
- }
- /* fall through: identifier that is not a register → treat as
- * symbol/equate via expression below. */
- }
- /* Immediate. */
- i64 imm = parse_imm_const(d);
- if (rd.is_sp) asm_driver_panic(d, "asm: mov: cannot move imm into SP");
- u64 uv = (u64)imm;
- u64 mask = rd.is64 ? ~0ull : 0xffffffffull;
- uv &= mask;
- /* Try MOVZ with one of four halfwords. */
- for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) {
- u64 shift = (u64)hw * 16;
- u64 hwmask = 0xffffull << shift;
- if ((uv & ~hwmask) == 0) {
- u32 v = (u32)((uv >> shift) & 0xffff);
- emit32(d, aa64_movz(rd.is64, rd.num, v, hw));
- return;
- }
- }
- /* Try MOVN with one halfword (encodes ~imm in that halfword). */
- u64 nv = (~uv) & mask;
- for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) {
- u64 shift = (u64)hw * 16;
- u64 hwmask = 0xffffull << shift;
- if ((nv & ~hwmask) == 0) {
- u32 v = (u32)((nv >> shift) & 0xffff);
- emit32(d, aa64_movn(rd.is64, rd.num, v, hw));
- return;
- }
- }
- asm_driver_panic(d, "asm: mov: immediate cannot be encoded in one insn");
-}
-
-/* mvn Rd, Rm */
-static void p_mvn(AsmDriver* d) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "mvn");
- AA64Reg rm = parse_reg(d);
- if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: mvn: width mismatch");
- emit32(d, aa64_mvn(rd.is64, rd.num, rm.num));
-}
-
-/* movz / movn / movk Rd, #imm[, lsl #shift] */
-static void p_movwide(AsmDriver* d, u32 opc) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "movz/n/k");
- i64 imm = parse_imm_const(d);
- u32 hw = 0;
- if (asm_driver_eat_comma(d)) {
- /* lsl #N (N is 0/16/32/48). */
- AsmTok lid = asm_driver_next(d);
- if (lid.kind != ASM_TOK_IDENT)
- asm_driver_panic(d, "asm: expected 'lsl'");
- size_t ln = 0;
- const char* lp = pool_str(asm_driver_pool(d), lid.v.ident, &ln);
- if (!lp || !icase_eq(lp, ln, "lsl"))
- asm_driver_panic(d, "asm: expected 'lsl'");
- i64 sh = parse_imm_const(d);
- if (sh % 16 != 0 || sh < 0 || sh > 48)
- asm_driver_panic(d, "asm: movz/n/k: bad lsl shift");
- hw = (u32)(sh / 16);
- }
- u32 word = ((rd.is64 & 1u) << 31) | ((opc & 3u) << 29) |
- AA64_MOVEWIDE_FAMILY_MATCH | ((hw & 3u) << 21) |
- (((u32)imm & 0xffffu) << 5) | (rd.num & 0x1fu);
- emit32(d, word);
-}
-
-/* svc / brk / hlt #imm */
-static void p_except(AsmDriver* d, u32 form) {
- i64 imm = parse_imm_const(d);
- switch (form) {
- case 0: emit32(d, aa64_svc((u32)imm)); break;
- case 1: emit32(d, aa64_brk((u32)imm)); break;
- case 2: {
- /* HLT */
- u32 word = AA64_EXCEPT_FAMILY_MATCH | ((u32)2 << 21) |
- (((u32)imm & 0xffffu) << 5);
- emit32(d, word);
- break;
- }
- default: asm_driver_panic(d, "asm: bad exception form");
- }
-}
-
-/* Read optional `, lsl|lsr|asr|ror #imm` shift modifier. Returns 1 if
- * present. */
-static int parse_shift_mod(AsmDriver* d, u32* shift_out, u32* imm6_out) {
- AsmTok t = asm_driver_peek(d);
- if (t.kind != ASM_TOK_IDENT) return 0;
- size_t n = 0;
- const char* p = pool_str(asm_driver_pool(d), t.v.ident, &n);
- u32 sh;
- if (icase_eq(p, n, "lsl")) sh = 0;
- else if (icase_eq(p, n, "lsr")) sh = 1;
- else if (icase_eq(p, n, "asr")) sh = 2;
- else if (icase_eq(p, n, "ror")) sh = 3;
- else return 0;
- (void)asm_driver_next(d);
- i64 imm = parse_imm_const(d);
- if (imm < 0 || imm > 63)
- asm_driver_panic(d, "asm: shift amount out of range");
- *shift_out = sh;
- *imm6_out = (u32)imm;
- return 1;
-}
-
-/* add / sub family.
- * Forms:
- * add Rd, Rn, Rm[, lsl #s] shifted-register
- * add Rd, Rn, #imm immediate
- * add Rd, Rn, #imm, lsl #12 immediate w/ shift
- * S-suffixed (adds/subs) sets flags. */
-static void p_addsub(AsmDriver* d, int is_sub, int set_flags) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "add/sub");
- AA64Reg rn = parse_reg(d);
- expect_comma(d, "add/sub");
- AsmTok t = asm_driver_peek(d);
- if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') ||
- tok_punct(t, '+')) {
- /* immediate form */
- if (rd.is64 != rn.is64)
- asm_driver_panic(d, "asm: add/sub imm: width mismatch");
- require_sp_spelling(d, rn, "add/sub imm");
- if (set_flags) {
- reject_sp_reg(d, rd, "add/sub imm");
- } else {
- require_sp_spelling(d, rd, "add/sub imm");
- }
- i64 imm = parse_imm_const(d);
- u32 sh = 0;
- if (asm_driver_eat_comma(d)) {
- AsmTok lid = asm_driver_next(d);
- if (lid.kind != ASM_TOK_IDENT)
- asm_driver_panic(d, "asm: expected 'lsl #12'");
- size_t ln = 0;
- const char* lp = pool_str(asm_driver_pool(d), lid.v.ident, &ln);
- if (!lp || !icase_eq(lp, ln, "lsl"))
- asm_driver_panic(d, "asm: expected 'lsl'");
- i64 s = parse_imm_const(d);
- if (s == 12) sh = 1;
- else if (s == 0) sh = 0;
- else asm_driver_panic(d, "asm: add/sub imm: lsl must be 0 or 12");
- }
- if (imm < 0 || imm > 0xfff)
- asm_driver_panic(d, "asm: add/sub imm out of range");
- u32 word = aa64_addsubimm_pack((AA64AddSubImm){
- .sf = rd.is64, .op = (u32)is_sub, .S = (u32)set_flags, .sh = sh,
- .imm12 = (u32)imm, .Rn = rn.num, .Rd = rd.num});
- emit32(d, word);
- return;
- }
- /* register form */
- AA64Reg rm = parse_reg(d);
- reject_sp_reg(d, rd, "add/sub reg");
- reject_sp_reg(d, rn, "add/sub reg");
- reject_sp_reg(d, rm, "add/sub reg");
- if (rd.is64 != rm.is64 || rd.is64 != rn.is64)
- asm_driver_panic(d, "asm: add/sub reg: width mismatch");
- u32 shift = 0, imm6 = 0;
- if (asm_driver_eat_comma(d)) {
- if (!parse_shift_mod(d, &shift, &imm6))
- asm_driver_panic(d, "asm: add/sub reg: expected shift modifier");
- }
- u32 word = aa64_addsubsr_pack((AA64AddSubSR){
- .sf = rd.is64, .op = (u32)is_sub, .S = (u32)set_flags,
- .shift = shift, .Rm = rm.num, .imm6 = imm6, .Rn = rn.num,
- .Rd = rd.num});
- emit32(d, word);
-}
-
-/* cmp Rn, Rm | cmp Rn, #imm → SUBS ZR, Rn, ... */
-static void p_cmp(AsmDriver* d, int is_neg /* cmn flips op */) {
- AA64Reg rn = parse_reg(d);
- expect_comma(d, "cmp");
- AsmTok t = asm_driver_peek(d);
- if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') ||
- tok_punct(t, '+')) {
- require_sp_spelling(d, rn, "cmp imm");
- i64 imm = parse_imm_const(d);
- u32 sh = 0;
- if (asm_driver_eat_comma(d)) {
- AsmTok lid = asm_driver_next(d);
- size_t ln = 0;
- const char* lp =
- (lid.kind == ASM_TOK_IDENT)
- ? pool_str(asm_driver_pool(d), lid.v.ident, &ln)
- : NULL;
- if (!lp || !icase_eq(lp, ln, "lsl"))
- asm_driver_panic(d, "asm: cmp imm: expected 'lsl'");
- i64 s = parse_imm_const(d);
- if (s == 12) sh = 1;
- else if (s != 0)
- asm_driver_panic(d, "asm: cmp imm: lsl must be 0 or 12");
- }
- if (imm < 0 || imm > 0xfff)
- asm_driver_panic(d, "asm: cmp imm out of range");
- u32 word = aa64_addsubimm_pack(
- (AA64AddSubImm){.sf = rn.is64, .op = (u32)(!is_neg), .S = 1,
- .sh = sh, .imm12 = (u32)imm, .Rn = rn.num,
- .Rd = AA64_ZR});
- emit32(d, word);
- return;
- }
- AA64Reg rm = parse_reg(d);
- reject_sp_reg(d, rn, "cmp reg");
- reject_sp_reg(d, rm, "cmp reg");
- if (rm.is64 != rn.is64) asm_driver_panic(d, "asm: cmp: width mismatch");
- u32 shift = 0, imm6 = 0;
- if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6);
- u32 word = aa64_addsubsr_pack((AA64AddSubSR){
- .sf = rn.is64, .op = (u32)(!is_neg), .S = 1, .shift = shift,
- .Rm = rm.num, .imm6 = imm6, .Rn = rn.num, .Rd = AA64_ZR});
- emit32(d, word);
-}
-
-static void p_csinc(AsmDriver* d) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "csinc");
- AA64Reg rn = parse_reg(d);
- expect_comma(d, "csinc");
- AA64Reg rm = parse_reg(d);
- expect_comma(d, "csinc");
- u32 cond = parse_cond(d, "csinc");
- if (rd.is_sp || rn.is_sp || rm.is_sp)
- asm_driver_panic(d, "asm: csinc: SP register not allowed");
- if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
- asm_driver_panic(d, "asm: csinc: width mismatch");
- u32 word = 0x1A800400u | ((u32)rd.is64 << 31) | ((rm.num & 0x1fu) << 16) |
- ((cond & 0xfu) << 12) | ((rn.num & 0x1fu) << 5) |
- (rd.num & 0x1fu);
- emit32(d, word);
-}
-
-/* neg / negs Rd, Rm → SUB / SUBS Rd, ZR, Rm */
-static void p_neg(AsmDriver* d, int set_flags) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "neg");
- AA64Reg rm = parse_reg(d);
- reject_sp_reg(d, rd, "neg");
- reject_sp_reg(d, rm, "neg");
- if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: neg: width mismatch");
- u32 shift = 0, imm6 = 0;
- if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6);
- u32 word = aa64_addsubsr_pack((AA64AddSubSR){
- .sf = rd.is64, .op = 1, .S = (u32)set_flags, .shift = shift,
- .Rm = rm.num, .imm6 = imm6, .Rn = AA64_ZR, .Rd = rd.num});
- emit32(d, word);
-}
-
-/* Logical shifted-register family. */
-static void p_log_sr(AsmDriver* d, u32 opc, u32 N) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "logical");
- AA64Reg rn = parse_reg(d);
- expect_comma(d, "logical");
- AA64Reg rm = parse_reg(d);
- if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
- asm_driver_panic(d, "asm: logical: width mismatch");
- u32 shift = 0, imm6 = 0;
- if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6);
- u32 word = aa64_logsr_pack((AA64LogSR){
- .sf = rd.is64, .opc = opc, .shift = shift, .N = N, .Rm = rm.num,
- .imm6 = imm6, .Rn = rn.num, .Rd = rd.num});
- emit32(d, word);
-}
-
-/* Data-processing 3-source: madd/msub Rd, Rn, Rm, Ra. */
-static void p_dp3(AsmDriver* d, u32 o0) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "dp3");
- AA64Reg rn = parse_reg(d);
- expect_comma(d, "dp3");
- AA64Reg rm = parse_reg(d);
- expect_comma(d, "dp3");
- AA64Reg ra = parse_reg(d);
- if (rd.is64 != rn.is64 || rd.is64 != rm.is64 || rd.is64 != ra.is64)
- asm_driver_panic(d, "asm: dp3: width mismatch");
- u32 word = aa64_dp3_pack((AA64DP3){
- .sf = rd.is64, .op31 = 0, .o0 = o0, .Rm = rm.num, .Ra = ra.num,
- .Rn = rn.num, .Rd = rd.num});
- emit32(d, word);
-}
-
-/* mul Rd, Rn, Rm → MADD Rd, Rn, Rm, ZR */
-static void p_mul(AsmDriver* d, u32 o0) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "mul");
- AA64Reg rn = parse_reg(d);
- expect_comma(d, "mul");
- AA64Reg rm = parse_reg(d);
- if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
- asm_driver_panic(d, "asm: mul: width mismatch");
- u32 word = aa64_dp3_pack((AA64DP3){
- .sf = rd.is64, .op31 = 0, .o0 = o0, .Rm = rm.num, .Ra = AA64_ZR,
- .Rn = rn.num, .Rd = rd.num});
- emit32(d, word);
-}
-
-/* DP2: udiv/sdiv/lslv/lsrv/asrv/rorv Rd, Rn, Rm. */
-static void p_dp2(AsmDriver* d, u32 opcode) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "dp2");
- AA64Reg rn = parse_reg(d);
- expect_comma(d, "dp2");
- AA64Reg rm = parse_reg(d);
- if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
- asm_driver_panic(d, "asm: dp2: width mismatch");
- u32 word = aa64_dp2_pack((AA64DP2){.sf = rd.is64, .opcode = opcode,
- .Rm = rm.num, .Rn = rn.num,
- .Rd = rd.num});
- emit32(d, word);
-}
-
-/* Branch immediate / conditional / compare-and-branch. */
-
-static void emit_branch_imm(AsmDriver* d, u32 op_bl, ObjSymId target,
- i64 addend, i64 const_disp) {
- MCEmitter* mc = asm_driver_mc(d);
- /* Emit a B/BL with imm26 = 0; record a CALL26/JUMP26 reloc against
- * either the symbol or the constant displacement. */
- u32 word = aa64_brimm_pack((AA64BrImm){.op = op_bl, .imm26 = 0});
- emit32(d, word);
- u32 ofs = mc->pos(mc) - 4;
- RelocKind k = op_bl ? R_AARCH64_CALL26 : R_AARCH64_JUMP26;
- if (target != OBJ_SYM_NONE) {
- mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, target,
- addend, 1, 0);
- } else {
- /* Pure constant displacement is rare in real .s; reject it now.
- * The recommended form is to use a label and let the assembler
- * compute the displacement. */
- (void)const_disp;
- asm_driver_panic(d, "asm: branch with pure constant disp not supported");
- }
-}
-
-static void p_b(AsmDriver* d, u32 op_bl) {
- ObjSymId sym = OBJ_SYM_NONE;
- i64 off = 0;
- /* GNU as accepts `b sym`, `bl sym+8`, etc. */
- parse_imm_sym(d, &sym, &off);
- if (sym == OBJ_SYM_NONE)
- asm_driver_panic(d, "asm: b/bl: symbolic target required");
- emit_branch_imm(d, op_bl, sym, off, 0);
-}
-
-static void p_b_cond(AsmDriver* d, u32 cond) {
- ObjSymId sym = OBJ_SYM_NONE;
- i64 off = 0;
- parse_imm_sym(d, &sym, &off);
- if (sym == OBJ_SYM_NONE)
- asm_driver_panic(d, "asm: b.cond: symbolic target required");
- /* Emit the instruction with imm19=0 + R_AARCH64_CONDBR19 reloc. */
- u32 word = aa64_brcond_pack((AA64BrCond){.imm19 = 0, .cond = cond});
- emit32(d, word);
- MCEmitter* mc = asm_driver_mc(d);
- u32 ofs = mc->pos(mc) - 4;
- mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs,
- R_AARCH64_CONDBR19, sym, off, 1, 0);
-}
-
-static void p_cbz(AsmDriver* d, u32 op) {
- AA64Reg rt = parse_reg(d);
- expect_comma(d, "cbz");
- ObjSymId sym = OBJ_SYM_NONE;
- i64 off = 0;
- parse_imm_sym(d, &sym, &off);
- if (sym == OBJ_SYM_NONE)
- asm_driver_panic(d, "asm: cbz: symbolic target required");
- u32 word = aa64_cb_pack((AA64CB){.sf = rt.is64, .op = op, .imm19 = 0,
- .Rt = rt.num});
- emit32(d, word);
- MCEmitter* mc = asm_driver_mc(d);
- u32 ofs = mc->pos(mc) - 4;
- mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs,
- R_AARCH64_CONDBR19, sym, off, 1, 0);
-}
-
-/* Memory-operand parser for [Xn], [Xn, #imm], [Xn, #imm]!.
- *
- * pre_index_out is 1 when the closing `]!` appeared (pre-indexed).
- * imm is the literal byte offset (no scaling). */
-typedef struct AA64Mem {
- AA64Reg base;
- i64 imm; /* byte offset (literal as written) */
- u8 pre_index;
- u8 has_offset;
- u8 pad[2];
-} AA64Mem;
-
-static AA64Mem parse_mem(AsmDriver* d) {
- AA64Mem m;
- memset(&m, 0, sizeof m);
- if (!asm_driver_eat_punct(d, '['))
- asm_driver_panic(d, "asm: expected '['");
- m.base = parse_reg(d);
- if (!m.base.is64)
- asm_driver_panic(d, "asm: ldr/str: base register must be 64-bit");
- require_sp_spelling(d, m.base, "ldr/str base");
- if (asm_driver_eat_comma(d)) {
- m.imm = parse_imm_const(d);
- m.has_offset = 1;
- }
- if (!asm_driver_eat_punct(d, ']'))
- asm_driver_panic(d, "asm: expected ']'");
- if (asm_driver_eat_punct(d, '!')) m.pre_index = 1;
- return m;
-}
-
-/* ldr/str Rt, [Xn, #imm] — chooses scaled or unscaled form based on
- * alignment of imm. */
-static void p_ldr_str(AsmDriver* d, int is_load) {
- AA64Reg rt = parse_reg(d);
- reject_sp_reg(d, rt, "ldr/str");
- expect_comma(d, "ldr/str");
- AA64Mem m = parse_mem(d);
- u32 size = rt.is64 ? 3u : 2u;
- u32 opc = is_load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR;
- if (!m.pre_index) {
- /* Try scaled unsigned-imm12 first. */
- u32 scale = 1u << size;
- if (m.imm >= 0 && (i64)((u64)m.imm % scale) == 0 &&
- (u64)m.imm / scale <= 0xfff) {
- u32 imm12 = (u32)((u64)m.imm / scale);
- u32 word = aa64_ldst_uimm_pack((AA64LdStUimm){
- .size = size, .V = 0, .opc = opc, .imm12 = imm12,
- .Rn = m.base.num, .Rt = rt.num});
- emit32(d, word);
- return;
- }
- /* Fall back to unscaled signed-imm9 (LDUR/STUR). */
- if (m.imm >= -256 && m.imm <= 255) {
- u32 imm9 = (u32)((u64)m.imm & 0x1ffu);
- u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){
- .size = size, .V = 0, .opc = opc, .imm9 = imm9,
- .Rn = m.base.num, .Rt = rt.num});
- emit32(d, word);
- return;
- }
- asm_driver_panic(d, "asm: ldr/str: immediate out of range");
- }
- asm_driver_panic(d, "asm: ldr/str: pre-indexed form not yet supported");
-}
-
-/* ldur/stur — unscaled signed-imm9. */
-static void p_ldur_stur(AsmDriver* d, int is_load) {
- AA64Reg rt = parse_reg(d);
- reject_sp_reg(d, rt, "ldur/stur");
- expect_comma(d, "ldur/stur");
- AA64Mem m = parse_mem(d);
- u32 size = rt.is64 ? 3u : 2u;
- if (m.imm < -256 || m.imm > 255)
- asm_driver_panic(d, "asm: ldur/stur: imm9 out of range");
- u32 imm9 = (u32)((u64)m.imm & 0x1ffu);
- u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){
- .size = size, .V = 0,
- .opc = is_load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR,
- .imm9 = imm9, .Rn = m.base.num, .Rt = rt.num});
- emit32(d, word);
-}
-
-/* ldp / stp Rt, Rt2, [Xn, #imm] or [Xn, #imm]! */
-static void p_ldp_stp(AsmDriver* d, int is_load) {
- AA64Reg rt = parse_ldstp_reg(d);
- expect_comma(d, "ldp/stp");
- AA64Reg rt2 = parse_ldstp_reg(d);
- expect_comma(d, "ldp/stp");
- reject_sp_reg(d, rt, "ldp/stp");
- reject_sp_reg(d, rt2, "ldp/stp");
- if (rt.is64 != rt2.is64 || rt.is_fp != rt2.is_fp)
- asm_driver_panic(d, "asm: ldp/stp: width mismatch");
- AA64Mem m = parse_mem(d);
- u32 scale = rt.is64 ? 8u : 4u;
- if ((i64)((u64)m.imm % scale) != 0)
- asm_driver_panic(d, "asm: ldp/stp: imm not scale-aligned");
- i64 imm7 = m.imm / (i64)scale;
- if (imm7 < -64 || imm7 > 63)
- asm_driver_panic(d, "asm: ldp/stp: imm7 out of range");
- AA64LdStPPre f = {.opc = rt.is_fp ? 1u : (rt.is64 ? 2u : 0u),
- .V = rt.is_fp ? 1u : 0u,
- .L = is_load ? 1u : 0u,
- .imm7 = (u32)imm7 & 0x7fu,
- .Rt2 = rt2.num,
- .Rn = m.base.num,
- .Rt = rt.num};
- if (m.pre_index)
- emit32(d, aa64_ldstp_pre_pack(f));
- else
- emit32(d, aa64_ldstp_soff_pack(f));
-}
-
-/* adr / adrp Rd, sym */
-static void p_adr(AsmDriver* d, int is_adrp) {
- AA64Reg rd = parse_reg(d);
- expect_comma(d, "adr");
- ObjSymId sym = OBJ_SYM_NONE;
- i64 off = 0;
- parse_imm_sym(d, &sym, &off);
- if (sym == OBJ_SYM_NONE)
- asm_driver_panic(d, "asm: adr/adrp: symbol required");
- AA64PCRelAdr f = {.op = is_adrp ? AA64_ADR_OP_ADRP : AA64_ADR_OP_ADR,
- .immlo = 0, .immhi = 0, .Rd = rd.num};
- emit32(d, aa64_pcrel_adr_pack(f));
- MCEmitter* mc = asm_driver_mc(d);
- u32 ofs = mc->pos(mc) - 4;
- RelocKind k = is_adrp ? R_AARCH64_ADR_PREL_PG_HI21 : R_AARCH64_ADR_PREL_LO21;
- mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, sym, off, 1, 0);
-}
-
-/* ---- mnemonic dispatch table ---- */
-
-typedef void (*P_Fn)(AsmDriver*);
-
-typedef struct AA64Mn {
- const char* name;
- P_Fn fn;
- u32 arg; /* per-fn discriminator (alias parameter) */
-} AA64Mn;
-
-/* Wrapper functions for the discriminator-taking parsers, since the
- * table holds a uniform P_Fn pointer. Each wraps a single (fn, arg)
- * tuple. */
-static void p_addsub_add(AsmDriver* d) { p_addsub(d, /*is_sub=*/0, 0); }
-static void p_addsub_adds(AsmDriver* d) { p_addsub(d, 0, 1); }
-static void p_addsub_sub(AsmDriver* d) { p_addsub(d, 1, 0); }
-static void p_addsub_subs(AsmDriver* d) { p_addsub(d, 1, 1); }
-static void p_cmp_w(AsmDriver* d) { p_cmp(d, 0); }
-static void p_cmn_w(AsmDriver* d) { p_cmp(d, 1); }
-static void p_csinc_(AsmDriver* d) { p_csinc(d); }
-static void p_neg_w(AsmDriver* d) { p_neg(d, 0); }
-static void p_negs_w(AsmDriver* d) { p_neg(d, 1); }
-static void p_and_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 0); }
-static void p_bic_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 1); }
-static void p_orr_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 0); }
-static void p_orn_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 1); }
-static void p_eor_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 0); }
-static void p_eon_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 1); }
-static void p_ands_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 0); }
-static void p_bics_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 1); }
-static void p_madd(AsmDriver* d) { p_dp3(d, 0); }
-static void p_msub(AsmDriver* d) { p_dp3(d, 1); }
-static void p_mul_w(AsmDriver* d) { p_mul(d, 0); }
-static void p_mneg_w(AsmDriver* d) { p_mul(d, 1); }
-static void p_udiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_UDIV_OP); }
-static void p_sdiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_SDIV_OP); }
-static void p_lslv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSLV_OP); }
-static void p_lsrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSRV_OP); }
-static void p_asrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_ASRV_OP); }
-static void p_rorv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_RORV_OP); }
-static void p_b_(AsmDriver* d) { p_b(d, 0); }
-static void p_bl_(AsmDriver* d) { p_b(d, 1); }
-static void p_cbz_(AsmDriver* d) { p_cbz(d, 0); }
-static void p_cbnz_(AsmDriver* d) { p_cbz(d, 1); }
-static void p_movz_(AsmDriver* d) { p_movwide(d, AA64_MOVZ_OPC); }
-static void p_movn_(AsmDriver* d) { p_movwide(d, AA64_MOVN_OPC); }
-static void p_movk_(AsmDriver* d) { p_movwide(d, AA64_MOVK_OPC); }
-static void p_svc_(AsmDriver* d) { p_except(d, 0); }
-static void p_brk_(AsmDriver* d) { p_except(d, 1); }
-static void p_hlt_(AsmDriver* d) { p_except(d, 2); }
-static void p_ldr_(AsmDriver* d) { p_ldr_str(d, 1); }
-static void p_str_(AsmDriver* d) { p_ldr_str(d, 0); }
-static void p_ldur_(AsmDriver* d) { p_ldur_stur(d, 1); }
-static void p_stur_(AsmDriver* d) { p_ldur_stur(d, 0); }
-static void p_ldp_(AsmDriver* d) { p_ldp_stp(d, 1); }
-static void p_stp_(AsmDriver* d) { p_ldp_stp(d, 0); }
-static void p_adr_(AsmDriver* d) { p_adr(d, 0); }
-static void p_adrp_(AsmDriver* d) { p_adr(d, 1); }
-
-/* b.cond family. cond codes follow the standard ARMv8 numbering. */
-static void p_b_eq(AsmDriver* d) { p_b_cond(d, 0); }
-static void p_b_ne(AsmDriver* d) { p_b_cond(d, 1); }
-static void p_b_cs(AsmDriver* d) { p_b_cond(d, 2); }
-static void p_b_hs(AsmDriver* d) { p_b_cond(d, 2); }
-static void p_b_cc(AsmDriver* d) { p_b_cond(d, 3); }
-static void p_b_lo(AsmDriver* d) { p_b_cond(d, 3); }
-static void p_b_mi(AsmDriver* d) { p_b_cond(d, 4); }
-static void p_b_pl(AsmDriver* d) { p_b_cond(d, 5); }
-static void p_b_vs(AsmDriver* d) { p_b_cond(d, 6); }
-static void p_b_vc(AsmDriver* d) { p_b_cond(d, 7); }
-static void p_b_hi(AsmDriver* d) { p_b_cond(d, 8); }
-static void p_b_ls(AsmDriver* d) { p_b_cond(d, 9); }
-static void p_b_ge(AsmDriver* d) { p_b_cond(d, 10); }
-static void p_b_lt(AsmDriver* d) { p_b_cond(d, 11); }
-static void p_b_gt(AsmDriver* d) { p_b_cond(d, 12); }
-static void p_b_le(AsmDriver* d) { p_b_cond(d, 13); }
-static void p_b_al(AsmDriver* d) { p_b_cond(d, 14); }
-
-static const AA64Mn kTable[] = {
- {"nop", p_nop, 0},
- {"dmb", p_dmb, 0},
- {"dsb", p_dsb, 0},
- {"isb", p_isb, 0},
- {"clrex", p_clrex, 0},
- {"ret", p_ret, 0},
- {"br", p_br, 0},
- {"blr", p_blr, 0},
- {"mov", p_mov, 0},
- {"mvn", p_mvn, 0},
- {"movz", p_movz_, 0},
- {"movn", p_movn_, 0},
- {"movk", p_movk_, 0},
- {"add", p_addsub_add, 0},
- {"adds", p_addsub_adds, 0},
- {"sub", p_addsub_sub, 0},
- {"subs", p_addsub_subs, 0},
- {"cmp", p_cmp_w, 0},
- {"cmn", p_cmn_w, 0},
- {"csinc", p_csinc_, 0},
- {"neg", p_neg_w, 0},
- {"negs", p_negs_w, 0},
- {"and", p_and_w, 0},
- {"bic", p_bic_w, 0},
- {"orr", p_orr_w, 0},
- {"orn", p_orn_w, 0},
- {"eor", p_eor_w, 0},
- {"eon", p_eon_w, 0},
- {"ands", p_ands_w, 0},
- {"bics", p_bics_w, 0},
- {"madd", p_madd, 0},
- {"msub", p_msub, 0},
- {"mul", p_mul_w, 0},
- {"mneg", p_mneg_w, 0},
- {"udiv", p_udiv_w, 0},
- {"sdiv", p_sdiv_w, 0},
- {"lslv", p_lslv_w, 0},
- {"lsrv", p_lsrv_w, 0},
- {"asrv", p_asrv_w, 0},
- {"rorv", p_rorv_w, 0},
- {"b", p_b_, 0},
- {"bl", p_bl_, 0},
- {"cbz", p_cbz_, 0},
- {"cbnz", p_cbnz_, 0},
- {"svc", p_svc_, 0},
- {"brk", p_brk_, 0},
- {"hlt", p_hlt_, 0},
- {"ldr", p_ldr_, 0},
- {"str", p_str_, 0},
- {"ldur", p_ldur_, 0},
- {"stur", p_stur_, 0},
- {"ldp", p_ldp_, 0},
- {"stp", p_stp_, 0},
- {"adr", p_adr_, 0},
- {"adrp", p_adrp_, 0},
- {"b.eq", p_b_eq, 0}, {"b.ne", p_b_ne, 0},
- {"b.cs", p_b_cs, 0}, {"b.hs", p_b_hs, 0},
- {"b.cc", p_b_cc, 0}, {"b.lo", p_b_lo, 0},
- {"b.mi", p_b_mi, 0}, {"b.pl", p_b_pl, 0},
- {"b.vs", p_b_vs, 0}, {"b.vc", p_b_vc, 0},
- {"b.hi", p_b_hi, 0}, {"b.ls", p_b_ls, 0},
- {"b.ge", p_b_ge, 0}, {"b.lt", p_b_lt, 0},
- {"b.gt", p_b_gt, 0}, {"b.le", p_b_le, 0},
- {"b.al", p_b_al, 0},
- {NULL, NULL, 0},
-};
-
-void aa64_asm_insn(AA64Asm* a, AsmDriver* d, Sym mnemonic) {
- (void)a;
- size_t mn = 0;
- const char* mp = pool_str(asm_driver_pool(d), mnemonic, &mn);
- for (const AA64Mn* row = kTable; row->name; ++row) {
- if (icase_eq(mp, mn, row->name)) {
- row->fn(d);
- return;
- }
- }
- asm_driver_panic(d, "asm: unknown mnemonic");
-}
-
-/* ---- inline-asm template walker (Phase 4b Track C) ---- */
-
-/* Per-call rendered-line buffer. GCC's inline asm rarely emits more
- * than a handful of instructions per block; one line of substituted
- * text fits comfortably inside this. Truncation panics — the operator
- * grammar should never grow a single line beyond this without a
- * deliberate reason. */
-#define AA64_INLINE_LINE_CAP 1024
-
-/* Render a 5-bit register number into the StrBuf using the requested
- * width form. is64 picks x-form vs w-form; SP / ZR encode as
- * register #31 and we render them as wzr/xzr or wsp/sp depending on
- * caller intent — for inline-asm v1 the bound operand always names a
- * GP register, never SP, so we emit wzr/xzr for #31. */
-static void render_reg(StrBuf* sb, u32 reg, int is64) {
- if (reg == 31u) {
- strbuf_puts(sb, is64 ? "xzr" : "wzr");
- return;
- }
- strbuf_putc(sb, is64 ? 'x' : 'w');
- if (reg >= 10u) strbuf_putc(sb, (char)('0' + (reg / 10u)));
- strbuf_putc(sb, (char)('0' + (reg % 10u)));
-}
-
-/* Render a signed 64-bit integer prefixed with '#'. */
-static void render_imm(StrBuf* sb, i64 v) {
- strbuf_putc(sb, '#');
- strbuf_put_i64(sb, v);
-}
-
-/* Render an addressing form `[xN, #ofs]` for OPK_INDIRECT. */
-static void render_indirect(StrBuf* sb, Reg base, i32 ofs) {
- strbuf_putc(sb, '[');
- render_reg(sb, (u32)base, /*is64=*/1);
- if (ofs != 0) {
- strbuf_puts(sb, ", ");
- render_imm(sb, (i64)ofs);
- }
- strbuf_putc(sb, ']');
-}
-
-_Noreturn static void inline_panic(AA64Asm* a, const char* msg) {
- SrcLoc loc = {0, 0, 0};
- compiler_panic(a->c, loc, "inline asm: %s", msg);
-}
-
-/* Resolve operand index N → (kind=0 forced default, 1=force-w, 2=force-x,
- * 3=address form `%aN`). Renders into sb. */
-static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) {
- u32 ntot = a->nout + a->nin;
- if (idx >= ntot) inline_panic(a, "operand index out of range");
- const Operand* op = (idx < a->nout) ? &a->out_ops[idx]
- : &a->in_ops[idx - a->nout];
- switch (form) {
- case 1: /* %wN — force 32-bit register form */
- if (op->kind != OPK_REG)
- inline_panic(a, "%w on non-register operand");
- render_reg(sb, (u32)op->v.reg, /*is64=*/0);
- return;
- case 2: /* %xN — force 64-bit register form */
- if (op->kind != OPK_REG)
- inline_panic(a, "%x on non-register operand");
- render_reg(sb, (u32)op->v.reg, /*is64=*/1);
- return;
- case 3: /* %aN — memory addressing form */
- if (op->kind != OPK_INDIRECT)
- inline_panic(a, "%a on non-memory operand");
- render_indirect(sb, op->v.ind.base, op->v.ind.ofs);
- return;
- default:
- break;
- }
- /* Default rendering by operand kind. */
- switch (op->kind) {
- case OPK_REG:
- render_reg(sb, (u32)op->v.reg, /*is64=*/1);
- return;
- case OPK_IMM:
- render_imm(sb, op->v.imm);
- return;
- case OPK_INDIRECT:
- render_indirect(sb, op->v.ind.base, op->v.ind.ofs);
- return;
- default:
- inline_panic(a, "unsupported operand kind for %N");
- }
-}
-
-/* Lex one line of substituted asm and dispatch via aa64_asm_insn. */
-static void run_one_line(AA64Asm* a, MCEmitter* mc, const char* text,
- size_t len) {
- /* Skip blank lines. */
- size_t i;
- for (i = 0; i < len; ++i) {
- if (text[i] != ' ' && text[i] != '\t') break;
- }
- if (i == len) return;
-
- AsmLexer* lx = asm_lex_open_mem(a->c, "<inline-asm>", text, len);
- AsmDriver* d = asm_driver_open_inline(a->c, mc, lx);
-
- /* The first non-trivial token must be the mnemonic identifier (or a
- * `.directive`, but inline asm doesn't normally use directives — leave
- * that path unsupported until needed). */
- AsmTok t = asm_driver_peek(d);
- while (t.kind == ASM_TOK_NEWLINE || t.kind == ASM_TOK_HASH) {
- (void)asm_driver_next(d);
- if (t.kind == ASM_TOK_HASH) {
- /* Skip cpp linemarker rest of line. */
- while (!asm_driver_at_eol(d)) (void)asm_driver_next(d);
- }
- t = asm_driver_peek(d);
- }
- if (t.kind == ASM_TOK_EOF) {
- asm_driver_close_inline(d);
- asm_lex_close(lx);
- return;
- }
- if (t.kind != ASM_TOK_IDENT)
- inline_panic(a, "expected mnemonic at start of inline asm line");
- (void)asm_driver_next(d);
- Sym mn = t.v.ident;
- /* Compose `b.eq` etc. — same trick as the standalone driver. */
- AsmTok dot = asm_driver_peek(d);
- if (asm_driver_tok_is_punct(dot, '.')) {
- (void)asm_driver_next(d);
- AsmTok rest = asm_driver_next(d);
- if (rest.kind != ASM_TOK_IDENT)
- inline_panic(a, "composite mnemonic: expected ident after '.'");
- size_t hn = 0, rn = 0;
- const char* hp = pool_str(asm_driver_pool(d), mn, &hn);
- const char* rp = pool_str(asm_driver_pool(d), rest.v.ident, &rn);
- char buf[64];
- if (hn + 1 + rn >= sizeof buf)
- inline_panic(a, "composite mnemonic too long");
- for (size_t k = 0; k < hn; ++k) buf[k] = hp[k];
- buf[hn] = '.';
- for (size_t k = 0; k < rn; ++k) buf[hn + 1 + k] = rp[k];
- mn = pool_intern(asm_driver_pool(d), buf, hn + 1 + rn);
- }
- aa64_asm_insn(a, d, mn);
- asm_driver_close_inline(d);
- asm_lex_close(lx);
-}
-
-/* Substitute placeholders into one line's StrBuf, then dispatch.
- *
- * The input range is [start, end) inside `tmpl`. Updates `*line_idx`
- * is not used — the caller resets the StrBuf between lines. */
-static void render_and_run_line(AA64Asm* a, MCEmitter* mc, StrBuf* sb,
- const char* start, const char* end) {
- strbuf_reset(sb);
- for (const char* p = start; p < end; ++p) {
- char c = *p;
- if (c != '%') {
- strbuf_putc(sb, c);
- continue;
- }
- /* Placeholder. */
- if (p + 1 >= end) inline_panic(a, "trailing '%' in template");
- char n = *(p + 1);
- if (n == '%') {
- strbuf_putc(sb, '%');
- ++p;
- continue;
- }
- if (n == '[') {
- /* %[name] — scan to the closing ']' and resolve against
- * AsmConstraint.name on the combined outs+ins list. Match by
- * comparing the named-bracket contents against the interned name
- * Sym stored on each constraint. */
- const char* nbeg = p + 2;
- const char* nend = nbeg;
- while (nend < end && *nend != ']') ++nend;
- if (nend == end) inline_panic(a, "unterminated %[name]");
- size_t nlen = (size_t)(nend - nbeg);
- Sym needle = pool_intern(a->c->global, nbeg, nlen);
- u32 idx = (u32)-1;
- for (u32 k = 0; k < a->nout; ++k) {
- if (a->outs[k].name == needle) { idx = k; break; }
- }
- if (idx == (u32)-1) {
- for (u32 k = 0; k < a->nin; ++k) {
- if (a->ins[k].name == needle) { idx = a->nout + k; break; }
- }
- }
- if (idx == (u32)-1)
- inline_panic(a, "%[name] does not match any constraint");
- p = nend; /* loop's ++p steps past the ']' */
- render_operand(a, sb, idx, 0);
- continue;
- }
- int form = 0; /* 0=default, 1=w, 2=x, 3=a */
- if (n == 'w' || n == 'x' || n == 'a') {
- form = (n == 'w') ? 1 : (n == 'x') ? 2 : 3;
- ++p;
- if (p + 1 >= end) inline_panic(a, "trailing '%' modifier in template");
- n = *(p + 1);
- }
- if (n == '[') {
- /* %w[name] / %x[name] / %a[name] — width modifier + symbolic
- * operand. Resolves the same way as %[name] but renders with the
- * declared form. */
- const char* nbeg = p + 2;
- const char* nend = nbeg;
- while (nend < end && *nend != ']') ++nend;
- if (nend == end) inline_panic(a, "unterminated %[name]");
- size_t nlen = (size_t)(nend - nbeg);
- Sym needle = pool_intern(a->c->global, nbeg, nlen);
- u32 idx = (u32)-1;
- for (u32 k = 0; k < a->nout; ++k) {
- if (a->outs[k].name == needle) { idx = k; break; }
- }
- if (idx == (u32)-1) {
- for (u32 k = 0; k < a->nin; ++k) {
- if (a->ins[k].name == needle) { idx = a->nout + k; break; }
- }
- }
- if (idx == (u32)-1)
- inline_panic(a, "%[name] does not match any constraint");
- p = nend; /* loop's ++p steps past the ']' */
- render_operand(a, sb, idx, form);
- continue;
- }
- if (n < '0' || n > '9')
- inline_panic(a, "expected digit after '%'");
- u32 idx = (u32)(n - '0');
- ++p;
- /* GCC syntax permits up to two digits (%0..%99). */
- if (p + 1 < end && *(p + 1) >= '0' && *(p + 1) <= '9') {
- idx = idx * 10 + (u32)(*(p + 1) - '0');
- ++p;
- }
- render_operand(a, sb, idx, form);
- }
- if (sb->truncated) inline_panic(a, "inline asm line buffer overflow");
- run_one_line(a, mc, strbuf_cstr(sb), strbuf_len(sb));
-}
-
-void aa64_asm_run_template(AA64Asm* a, MCEmitter* mc, const char* tmpl) {
- if (!tmpl || !*tmpl) return;
-
- char buf[AA64_INLINE_LINE_CAP];
- StrBuf sb;
- strbuf_init(&sb, buf, sizeof buf);
-
- /* Walk tmpl, splitting on '\n' and ';' line terminators. Track bracket
- * depth and quote state so that a literal ';' inside `[ ... ]` or a
- * quoted string is not mistaken for a statement separator. */
- const char* line_start = tmpl;
- int bracket = 0;
- char quote = 0;
- for (const char* p = tmpl;; ++p) {
- char c = *p;
- if (c == '\0') {
- render_and_run_line(a, mc, &sb, line_start, p);
- break;
- }
- if (quote) {
- if (c == '\\' && *(p + 1)) {
- ++p;
- continue;
- }
- if (c == quote) quote = 0;
- continue;
- }
- if (c == '"' || c == '\'') {
- quote = c;
- continue;
- }
- if (c == '[') {
- ++bracket;
- continue;
- }
- if (c == ']') {
- if (bracket) --bracket;
- continue;
- }
- if (bracket == 0 && (c == '\n' || c == ';')) {
- render_and_run_line(a, mc, &sb, line_start, p);
- line_start = p + 1;
- }
- }
-}
diff --git a/src/arch/aa64_disasm.c b/src/arch/aa64_disasm.c
@@ -1,133 +0,0 @@
-/* AArch64 disassembler implementation.
- *
- * Decodes one 4-byte instruction word per call into a CfreeInsn whose
- * string fields point into iterator-owned StrBufs. The decoder shares
- * the aa64_isa.{h,c} descriptor table with the encoder: aa64_disasm_find
- * matches the word; aa64_print_operands renders operand text via the
- * format's unpack + per-format pretty-printer. Mnemonic rewriting (the
- * one bit the printer can't own, because b.cond rolls cond into the
- * "operand" text) happens here. */
-
-#include "arch/aa64_disasm.h"
-
-#include <string.h>
-
-#include "arch/aa64_isa.h"
-#include "core/heap.h"
-#include "core/strbuf.h"
-
-/* Enough for any aarch64 mnemonic-with-suffix ("b.cond" → "b.le", etc.). */
-#define AA64_DASM_MNEM_CAP 16u
-/* Operand text. The widest cases (LDP X, X, [SP, #-imm]!) fit easily. */
-#define AA64_DASM_OPS_CAP 96u
-/* Annotation overlay (symbol + addend). */
-#define AA64_DASM_ANN_CAP 96u
-
-typedef struct AA64Disasm {
- ArchDisasm base;
- Compiler* c;
- Heap* heap;
- char mnem_buf[AA64_DASM_MNEM_CAP];
- char ops_buf[AA64_DASM_OPS_CAP];
- char ann_buf[AA64_DASM_ANN_CAP];
- StrBuf mnem;
- StrBuf ops;
- StrBuf ann;
-} AA64Disasm;
-
-static const char* aa64_cond_names[16] = {
- "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
- "hi", "ls", "ge", "lt", "gt", "le", "al", "nv",
-};
-
-static void aa64_write_mnemonic(AA64Disasm* d, const AA64InsnDesc* desc,
- u32 word) {
- strbuf_reset(&d->mnem);
- if (desc->fmt == AA64_FMT_BR_COND) {
- /* Synthesize "b.<cond>" so the operands buffer can hold just the
- * target. Matches GNU as / objdump conventions. */
- u32 cond = word & 0xfu;
- strbuf_puts(&d->mnem, "b.");
- strbuf_puts(&d->mnem, aa64_cond_names[cond]);
- return;
- }
- strbuf_puts(&d->mnem, desc->mnemonic);
-}
-
-static void aa64_write_operands(AA64Disasm* d, const AA64InsnDesc* desc,
- u32 word, u64 vaddr) {
- strbuf_reset(&d->ops);
- if (desc->fmt == AA64_FMT_BR_COND) {
- /* aa64_print_operands prints "<cond> <target>"; we already lifted
- * the cond into the mnemonic, so skip the dispatcher and inline
- * just the target. */
- AA64BrCond f = aa64_brcond_unpack(word);
- i64 ofs = (i64)((u64)f.imm19 & 0x7ffffu);
- /* sign-extend 19 bits */
- if (ofs & 0x40000) ofs |= ~(i64)0x7ffff;
- ofs *= 4;
- if (vaddr) {
- strbuf_put_hex_u64(&d->ops, vaddr + (u64)ofs);
- } else {
- strbuf_puts(&d->ops, "#");
- strbuf_put_i64(&d->ops, ofs);
- }
- return;
- }
- aa64_print_operands(&d->ops, desc, word, vaddr);
-}
-
-static u32 aa64_read_u32_le(const u8* b) {
- return (u32)b[0] | ((u32)b[1] << 8) | ((u32)b[2] << 16) | ((u32)b[3] << 24);
-}
-
-static void aa64_write_unknown(AA64Disasm* d, u32 word) {
- strbuf_reset(&d->mnem);
- strbuf_puts(&d->mnem, ".inst");
- strbuf_reset(&d->ops);
- strbuf_put_hex_u64(&d->ops, (u64)word);
-}
-
-static u32 aa64_decode(ArchDisasm* base, const u8* bytes, size_t len, u64 vaddr,
- CfreeInsn* out) {
- AA64Disasm* d = (AA64Disasm*)base;
- if (len < 4u) return 0;
- u32 word = aa64_read_u32_le(bytes);
- const AA64InsnDesc* desc = aa64_disasm_find(word);
- if (desc) {
- aa64_write_mnemonic(d, desc, word);
- aa64_write_operands(d, desc, word, vaddr);
- } else {
- aa64_write_unknown(d, word);
- }
- /* Annotation overlay is owned by the public iterator (cfree_disasm_iter_*).
- * The arch-level decoder leaves it empty. */
- strbuf_reset(&d->ann);
- out->vaddr = vaddr;
- out->bytes = bytes;
- out->nbytes = 4;
- out->mnemonic = strbuf_cstr(&d->mnem);
- out->operands = strbuf_cstr(&d->ops);
- out->annotation = strbuf_cstr(&d->ann);
- return 4;
-}
-
-static void aa64_destroy(ArchDisasm* base) {
- AA64Disasm* d = (AA64Disasm*)base;
- d->heap->free(d->heap, d, sizeof(*d));
-}
-
-ArchDisasm* aa64_disasm_new(Compiler* c) {
- Heap* h = (Heap*)c->env->heap;
- AA64Disasm* d = (AA64Disasm*)h->alloc(h, sizeof(*d), _Alignof(AA64Disasm));
- if (!d) return NULL;
- memset(d, 0, sizeof(*d));
- d->c = c;
- d->heap = h;
- d->base.decode = aa64_decode;
- d->base.destroy = aa64_destroy;
- strbuf_init(&d->mnem, d->mnem_buf, sizeof d->mnem_buf);
- strbuf_init(&d->ops, d->ops_buf, sizeof d->ops_buf);
- strbuf_init(&d->ann, d->ann_buf, sizeof d->ann_buf);
- return &d->base;
-}
diff --git a/src/arch/aa64_disasm.h b/src/arch/aa64_disasm.h
@@ -1,14 +0,0 @@
-#ifndef CFREE_ARCH_AA64_DISASM_H
-#define CFREE_ARCH_AA64_DISASM_H
-
-/* AArch64 disassembler — ArchDisasm implementation.
- *
- * Wraps aa64_disasm_find + aa64_print_operands (src/arch/aa64_isa.{h,c}).
- * The dispatcher in src/arch/disasm.c constructs one of these when the
- * compiler target is CFREE_ARCH_ARM_64. */
-
-#include "arch/arch.h"
-
-ArchDisasm* aa64_disasm_new(Compiler*);
-
-#endif
diff --git a/src/arch/aa64_isa.c b/src/arch/aa64_isa.c
@@ -1,598 +0,0 @@
-/* AArch64 instruction descriptor table + operand print/parse dispatch.
- *
- * The table mirrors the inline encoders in aa64_isa.h: each row records
- * (mnemonic, match, mask, format, flags) so the disassembler can identify
- * a raw 32-bit word with one mask-and-compare and then dispatch on the
- * format to extract operand fields via the same unpack functions the
- * encoder uses. Encoder and decoder share the bit knowledge — when an
- * opcode value or field position changes, both sides update at one site.
- *
- * Mask values include the family mask plus the bits that distinguish a
- * specific instruction from its siblings in the same family. sf (bit 31)
- * is intentionally a don't-care for formats where both 32- and 64-bit
- * forms share one row; the unpacker reads sf separately when printing
- * operands.
- *
- * Row ordering: first-match wins. Aliases (rows with AA64_ASMFL_ALIAS)
- * are tighter masks placed BEFORE the canonical row they alias so the
- * disassembler renders the alias spelling. The assembler accepts both
- * spellings — they map to the same encoded word. */
-
-#include "arch/aa64_isa.h"
-
-#include <stddef.h>
-
-const AA64InsnDesc aa64_insn_table[] = {
- /* ----- Move-wide immediate (MOVN / MOVZ / MOVK) ----- */
- {"movn", 0x12800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}},
- {"movz", 0x52800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}},
- {"movk", 0x72800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}},
-
- /* ----- Logical, shifted register -----
- * Alias MOV Rd, Rm is ORR Rd, ZR, Rm with shift=0, imm6=0. The mask
- * pins Rn (bits 9:5) to 11111 (ZR) and shift/imm6 to 0 so only the
- * MOV spelling matches; broader ORR rows below catch the rest. */
- {"mov", 0x2A0003E0u, 0x7FE0FFE0u, AA64_FMT_LOG_SR, AA64_ASMFL_ALIAS,
- {0, 0}},
- /* MVN Rd, Rm ≡ ORN Rd, ZR, Rm (logical N=1, Rn=ZR, no shift) */
- {"mvn", 0x2A2003E0u, 0x7FE0FFE0u, AA64_FMT_LOG_SR, AA64_ASMFL_ALIAS,
- {0, 0}},
- {"and", 0x0A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
- {"bic", 0x0A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
- {"orr", 0x2A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
- {"orn", 0x2A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
- {"eor", 0x4A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
- {"eon", 0x4A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
- {"ands", 0x6A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
- {"bics", 0x6A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}},
-
- /* ----- Add/Sub, shifted register -----
- * NEG Rd, Rm ≡ SUB Rd, ZR, Rm (Rn=ZR, shift=0, imm6=0). */
- {"neg", 0x4B0003E0u, 0x7FE0FFE0u, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS,
- {0, 0}},
- {"negs", 0x6B0003E0u, 0x7FE0FFE0u, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS,
- {0, 0}},
- /* CMP Rn, Rm ≡ SUBS ZR, Rn, Rm. */
- {"cmp", 0x6B00001Fu, 0x7F20001Fu, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS,
- {0, 0}},
- /* CMN Rn, Rm ≡ ADDS ZR, Rn, Rm. */
- {"cmn", 0x2B00001Fu, 0x7F20001Fu, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS,
- {0, 0}},
- {"add", 0x0B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}},
- {"adds", 0x2B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}},
- {"sub", 0x4B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}},
- {"subs", 0x6B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}},
-
- /* ----- Data-processing 3-source -----
- * MUL Rd, Rn, Rm ≡ MADD Rd, Rn, Rm, ZR (Ra=ZR, op31=0, o0=0). */
- {"mul", 0x1B007C00u, 0x7FE0FC00u, AA64_FMT_DP3, AA64_ASMFL_ALIAS, {0, 0}},
- /* MNEG Rd, Rn, Rm ≡ MSUB Rd, Rn, Rm, ZR. */
- {"mneg", 0x1B00FC00u, 0x7FE0FC00u, AA64_FMT_DP3, AA64_ASMFL_ALIAS, {0, 0}},
- {"madd", 0x1B000000u, 0x7FE08000u, AA64_FMT_DP3, 0, {0, 0}},
- {"msub", 0x1B008000u, 0x7FE08000u, AA64_FMT_DP3, 0, {0, 0}},
-
- /* ----- Data-processing 2-source ----- */
- {"udiv", 0x1AC00800u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
- {"sdiv", 0x1AC00C00u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
- {"lslv", 0x1AC02000u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
- {"lsrv", 0x1AC02400u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
- {"asrv", 0x1AC02800u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
- {"rorv", 0x1AC02C00u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}},
-
- /* ----- Unconditional branch (register) -----
- * RET aliases its no-operand spelling to RET X30 (Rn=11110). The
- * tighter row matches when Rn=30 and prints "ret" without operands;
- * the looser row below catches RET Xn for other Rn. */
- {"ret", 0xD65F03C0u, 0xFFFFFFFFu, AA64_FMT_BR_REG,
- AA64_ASMFL_ALIAS | AA64_ASMFL_NORN, {0, 0}},
- {"br", 0xD61F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}},
- {"blr", 0xD63F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}},
- {"ret", 0xD65F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}},
-
- /* ----- PC-relative addressing ----- */
- {"adr", 0x10000000u, 0x9F000000u, AA64_FMT_PCREL_ADR, 0, {0, 0}},
- {"adrp", 0x90000000u, 0x9F000000u, AA64_FMT_PCREL_ADR, 0, {0, 0}},
-
- /* ----- Add/Sub immediate ----- */
- {"add", 0x11000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}},
- {"adds", 0x31000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}},
- {"sub", 0x51000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}},
- {"subs", 0x71000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}},
-
- /* ----- Load/store, unsigned 12-bit immediate (scaled) -----
- * Mask: family bits 29:27 + 25:24 + size(31:30) + V(26) + opc(23:22). */
- {"strb", 0x39000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
- {"ldrb", 0x39400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
- {"strh", 0x79000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
- {"ldrh", 0x79400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
- {"str", 0xB9000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* 32 */
- {"ldr", 0xB9400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
- {"str", 0xF9000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1,
- {0, 0}}, /* 64 */
- {"ldr", 0xF9400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1,
- {0, 0}},
- /* SIMD/FP scaled loads/stores (V=1). size 0..2 select B/H/S; size=3
- * selects D; the 128-bit Q form uses size=00 with opc bit 1 set and
- * is not yet emitted by codegen. */
- {"str", 0x3D000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* B */
- {"ldr", 0x3D400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
- {"str", 0x7D000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* H */
- {"ldr", 0x7D400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
- {"str", 0xBD000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* S */
- {"ldr", 0xBD400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}},
- {"str", 0xFD000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1,
- {0, 0}}, /* D */
- {"ldr", 0xFD400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1,
- {0, 0}},
-
- /* ----- Load/store, unscaled signed 9-bit immediate (LDUR/STUR) -----
- * V=0 first, V=1 next. Per-row mask narrows size+V+opc; family mask
- * pins the high family bits + the SIMM9-vs-other-variant selector. */
- {"sturb", 0x38000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
- {"ldurb", 0x38400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
- {"sturh", 0x78000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
- {"ldurh", 0x78400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
- {"stur", 0xB8000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* 32 */
- {"ldur", 0xB8400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
- {"stur", 0xF8000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1,
- {0, 0}},
- {"ldur", 0xF8400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1,
- {0, 0}},
- {"stur", 0x3C000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* B */
- {"ldur", 0x3C400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
- {"stur", 0x7C000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* H */
- {"ldur", 0x7C400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
- {"stur", 0xBC000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* S */
- {"ldur", 0xBC400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}},
- {"stur", 0xFC000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1,
- {0, 0}}, /* D */
- {"ldur", 0xFC400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1,
- {0, 0}},
-
- /* ----- Load/store pair, pre-indexed (opc=10 X / opc=01 D) ----- */
- {"stp", 0xA9800000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, AA64_ASMFL_SF1,
- {0, 0}},
- {"ldp", 0xA9C00000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, AA64_ASMFL_SF1,
- {0, 0}},
- {"stp", 0x6D800000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, 0, {0, 0}}, /* D */
- {"ldp", 0x6DC00000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, 0, {0, 0}},
-
- /* ----- Load/store pair, signed-offset ----- */
- {"stp", 0xA9000000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, AA64_ASMFL_SF1,
- {0, 0}},
- {"ldp", 0xA9400000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, AA64_ASMFL_SF1,
- {0, 0}},
- {"stp", 0x6D000000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, 0, {0, 0}}, /* D */
- {"ldp", 0x6D400000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, 0, {0, 0}},
-
- /* ----- Unconditional branch (immediate) ----- */
- {"b", 0x14000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}},
- {"bl", 0x94000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}},
-
- /* ----- Conditional branch (immediate) ----- */
- {"b.cond", 0x54000000u, 0xFF000010u, AA64_FMT_BR_COND, 0, {0, 0}},
-
- /* ----- Compare-and-branch ----- */
- {"cbz", 0x34000000u, 0x7F000000u, AA64_FMT_CB, 0, {0, 0}},
- {"cbnz", 0x35000000u, 0x7F000000u, AA64_FMT_CB, 0, {0, 0}},
-
- /* ----- Exception generation ----- */
- {"svc", 0xD4000001u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}},
- {"brk", 0xD4200000u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}},
- {"hlt", 0xD4400000u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}},
-
- /* ----- Hint ----- */
- {"nop", 0xD503201Fu, 0xFFFFFFFFu, AA64_FMT_HINT, 0, {0, 0}},
-
- /* ----- Memory barriers (DMB / DSB / ISB / CLREX) -----
- * Mask covers everything but CRm at bits[11:8]. */
- {"dmb", 0xD50330BFu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}},
- {"dsb", 0xD503309Fu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}},
- {"isb", 0xD50330DFu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}},
- {"clrex", 0xD503305Fu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}},
-};
-
-const u32 aa64_insn_table_n =
- (u32)(sizeof aa64_insn_table / sizeof aa64_insn_table[0]);
-
-const AA64InsnDesc* aa64_disasm_find(u32 word) {
- for (u32 i = 0; i < aa64_insn_table_n; ++i) {
- const AA64InsnDesc* d = &aa64_insn_table[i];
- if ((word & d->mask) == d->match) return d;
- }
- return NULL;
-}
-
-/* =====================================================================
- * Operand print — one helper per format.
- *
- * Format choices for immediates:
- * - branch displacements, signed add/sub imm, signed ldur/stur ofs:
- * signed decimal.
- * - MOVZ/MOVK halfword, logical bitmask, exception generation #imm:
- * 0x-prefixed hex.
- *
- * Register naming: ZR alias for x31 in places where the encoding treats
- * Rd/Rn=31 as the zero register (logical/arith), SP where it treats 31
- * as the stack pointer (add/sub imm, ldr/str-uimm Rn, ldp/stp Rn).
- *
- * vaddr is folded into PC-relative branch operands when nonzero. */
-
-static void emit_reg(StrBuf* sb, u32 r, int sf, int sp_means_sp) {
- if (r == 31u) {
- if (sp_means_sp) strbuf_puts(sb, "sp");
- else if (sf) strbuf_puts(sb, "xzr");
- else strbuf_puts(sb, "wzr");
- return;
- }
- strbuf_putc(sb, sf ? 'x' : 'w');
- strbuf_put_u64(sb, (u64)r);
-}
-
-static void emit_vreg(StrBuf* sb, u32 r, char prefix) {
- strbuf_putc(sb, prefix);
- strbuf_put_u64(sb, (u64)r);
-}
-
-static void emit_cond(StrBuf* sb, u32 cond) {
- static const char* names[16] = {"eq", "ne", "cs", "cc", "mi", "pl",
- "vs", "vc", "hi", "ls", "ge", "lt",
- "gt", "le", "al", "nv"};
- strbuf_puts(sb, names[cond & 0xfu]);
-}
-
-/* Sign-extend an n-bit value held in the low bits of v to i64. */
-static i64 sext(u64 v, u32 nbits) {
- u64 mask = (nbits >= 64u) ? ~0ull : ((1ull << nbits) - 1ull);
- v &= mask;
- u64 sign = (nbits == 0u) ? 0ull : (1ull << (nbits - 1u));
- if (v & sign) v |= ~mask;
- return (i64)v;
-}
-
-static void print_movewide(StrBuf* sb, u32 w) {
- AA64MoveWide f = aa64_movewide_unpack(w);
- emit_reg(sb, f.Rd, (int)f.sf, /*sp_means_sp=*/0);
- strbuf_puts(sb, ", ");
- strbuf_put_hex_u64(sb, (u64)f.imm16);
- if (f.hw) {
- strbuf_puts(sb, ", lsl ");
- strbuf_put_u64(sb, (u64)(f.hw * 16u));
- }
-}
-
-static void print_logsr(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
- AA64LogSR f = aa64_logsr_unpack(w);
- if (d->flags & AA64_ASMFL_ALIAS) {
- /* MOV / MVN: Rd, Rm */
- emit_reg(sb, f.Rd, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rm, (int)f.sf, 0);
- return;
- }
- emit_reg(sb, f.Rd, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rn, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rm, (int)f.sf, 0);
- if (f.imm6 || f.shift) {
- static const char* sh[4] = {"lsl", "lsr", "asr", "ror"};
- strbuf_puts(sb, ", ");
- strbuf_puts(sb, sh[f.shift & 3u]);
- strbuf_puts(sb, " #");
- strbuf_put_u64(sb, (u64)f.imm6);
- }
-}
-
-static void print_addsubsr(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
- AA64AddSubSR f = aa64_addsubsr_unpack(w);
- if (d->flags & AA64_ASMFL_ALIAS) {
- /* NEG / NEGS / CMP / CMN. */
- if (d->mnemonic[0] == 'c') {
- /* CMP / CMN — print Rn, Rm */
- emit_reg(sb, f.Rn, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rm, (int)f.sf, 0);
- } else {
- /* NEG / NEGS — print Rd, Rm */
- emit_reg(sb, f.Rd, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rm, (int)f.sf, 0);
- }
- return;
- }
- emit_reg(sb, f.Rd, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rn, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rm, (int)f.sf, 0);
- if (f.imm6 || f.shift) {
- static const char* sh[4] = {"lsl", "lsr", "asr", "rsv"};
- strbuf_puts(sb, ", ");
- strbuf_puts(sb, sh[f.shift & 3u]);
- strbuf_puts(sb, " #");
- strbuf_put_u64(sb, (u64)f.imm6);
- }
-}
-
-static void print_dp3(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
- AA64DP3 f = aa64_dp3_unpack(w);
- /* MUL / MNEG alias drop Ra (which is ZR for the alias). */
- if (d->flags & AA64_ASMFL_ALIAS) {
- emit_reg(sb, f.Rd, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rn, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rm, (int)f.sf, 0);
- return;
- }
- emit_reg(sb, f.Rd, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rn, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rm, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Ra, (int)f.sf, 0);
-}
-
-static void print_dp2(StrBuf* sb, u32 w) {
- AA64DP2 f = aa64_dp2_unpack(w);
- emit_reg(sb, f.Rd, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rn, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rm, (int)f.sf, 0);
-}
-
-static void print_brreg(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
- AA64BrReg f = aa64_brreg_unpack(w);
- if (d->flags & AA64_ASMFL_NORN) return; /* RET (with implicit X30) */
- emit_reg(sb, f.Rn, /*sf=*/1, 0);
-}
-
-static void print_pcrel(StrBuf* sb, u32 w, u64 vaddr) {
- AA64PCRelAdr f = aa64_pcrel_adr_unpack(w);
- emit_reg(sb, f.Rd, /*sf=*/1, 0);
- strbuf_puts(sb, ", ");
- i64 imm = sext(((u64)f.immhi << 2) | (u64)f.immlo, 21);
- if (f.op == AA64_ADR_OP_ADRP) imm <<= 12;
- if (vaddr) {
- u64 base = (f.op == AA64_ADR_OP_ADRP) ? (vaddr & ~0xfffull) : vaddr;
- strbuf_put_hex_u64(sb, base + (u64)imm);
- } else {
- strbuf_puts(sb, "#");
- strbuf_put_i64(sb, imm);
- }
-}
-
-static void print_addsubimm(StrBuf* sb, u32 w) {
- AA64AddSubImm f = aa64_addsubimm_unpack(w);
- /* For these encodings, Rd/Rn=31 means SP. */
- emit_reg(sb, f.Rd, (int)f.sf, 1);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rn, (int)f.sf, 1);
- strbuf_puts(sb, ", #");
- strbuf_put_u64(sb, (u64)f.imm12);
- if (f.sh) strbuf_puts(sb, ", lsl #12");
-}
-
-static u32 ldst_log2_size(const AA64InsnDesc* d, u32 size_field) {
- (void)d;
- return size_field & 3u;
-}
-
-static void print_ldst_uimm(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
- AA64LdStUimm f = aa64_ldst_uimm_unpack(w);
- u32 sz = ldst_log2_size(d, f.size);
- /* Pick reg prefix: V=0 picks W/X by size; V=1 picks B/H/S/D by size. */
- if (f.V == 0) {
- emit_reg(sb, f.Rt, /*sf=*/(int)(sz == 3u), 0);
- } else {
- char p = (sz == 0u) ? 'b' : (sz == 1u) ? 'h' : (sz == 2u) ? 's' : 'd';
- emit_vreg(sb, f.Rt, p);
- }
- strbuf_puts(sb, ", [");
- emit_reg(sb, f.Rn, /*sf=*/1, 1);
- u32 byte_off = f.imm12 << sz;
- if (byte_off) {
- strbuf_puts(sb, ", #");
- strbuf_put_u64(sb, (u64)byte_off);
- }
- strbuf_putc(sb, ']');
-}
-
-static void print_ldst_simm9(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
- AA64LdStSimm9 f = aa64_ldst_simm9_unpack(w);
- u32 sz = f.size & 3u;
- (void)d;
- if (f.V == 0) {
- emit_reg(sb, f.Rt, /*sf=*/(int)(sz == 3u), 0);
- } else {
- char p = (sz == 0u) ? 'b' : (sz == 1u) ? 'h' : (sz == 2u) ? 's' : 'd';
- emit_vreg(sb, f.Rt, p);
- }
- strbuf_puts(sb, ", [");
- emit_reg(sb, f.Rn, /*sf=*/1, 1);
- i64 off = sext((u64)f.imm9, 9);
- if (off) {
- strbuf_puts(sb, ", #");
- strbuf_put_i64(sb, off);
- }
- strbuf_putc(sb, ']');
-}
-
-static void print_ldstp_common(StrBuf* sb, AA64LdStPPre f, int pre) {
- /* opc=10 → 64-bit X; opc=00 → 32-bit W; opc=01 (V=1) → D (FP);
- * opc=00 (V=1) → S; opc=10 (V=1) → Q (not yet emitted). */
- i64 scale;
- int is_fp = (f.V == 1);
- char fp_prefix = 's';
- int sf = 1;
- if (is_fp) {
- if (f.opc == 0) {
- fp_prefix = 's';
- scale = 4;
- } else if (f.opc == 1) {
- fp_prefix = 'd';
- scale = 8;
- } else {
- fp_prefix = 'q';
- scale = 16;
- }
- } else {
- sf = (f.opc == 2);
- scale = sf ? 8 : 4;
- }
- if (is_fp) {
- emit_vreg(sb, f.Rt, fp_prefix);
- strbuf_puts(sb, ", ");
- emit_vreg(sb, f.Rt2, fp_prefix);
- } else {
- emit_reg(sb, f.Rt, sf, 0);
- strbuf_puts(sb, ", ");
- emit_reg(sb, f.Rt2, sf, 0);
- }
- strbuf_puts(sb, ", [");
- emit_reg(sb, f.Rn, /*sf=*/1, 1);
- i64 byte_off = sext((u64)f.imm7, 7) * scale;
- if (byte_off) {
- strbuf_puts(sb, ", #");
- strbuf_put_i64(sb, byte_off);
- }
- strbuf_putc(sb, ']');
- if (pre) strbuf_putc(sb, '!');
-}
-
-static void print_ldstp_pre(StrBuf* sb, u32 w) {
- print_ldstp_common(sb, aa64_ldstp_pre_unpack(w), /*pre=*/1);
-}
-static void print_ldstp_soff(StrBuf* sb, u32 w) {
- print_ldstp_common(sb, aa64_ldstp_soff_unpack(w), /*pre=*/0);
-}
-
-static void print_br_imm(StrBuf* sb, u32 w, u64 vaddr) {
- AA64BrImm f = aa64_brimm_unpack(w);
- i64 ofs = sext((u64)f.imm26, 26) * 4;
- if (vaddr) {
- strbuf_put_hex_u64(sb, vaddr + (u64)ofs);
- } else {
- strbuf_puts(sb, "#");
- strbuf_put_i64(sb, ofs);
- }
-}
-
-static void print_br_cond(StrBuf* sb, u32 w, u64 vaddr,
- const AA64InsnDesc* d) {
- AA64BrCond f = aa64_brcond_unpack(w);
- (void)d;
- /* mnemonic is "b.cond"; we'll print cond as a suffix on the target.
- * The b.cond row keeps a single mnemonic for printing — for the asm
- * spelling to be canonical the writer will need to emit b.<cc>, which
- * is the printer's job at the dispatcher level (see aa64_print_operands). */
- emit_cond(sb, f.cond);
- strbuf_putc(sb, ' ');
- i64 ofs = sext((u64)f.imm19, 19) * 4;
- if (vaddr) {
- strbuf_put_hex_u64(sb, vaddr + (u64)ofs);
- } else {
- strbuf_puts(sb, "#");
- strbuf_put_i64(sb, ofs);
- }
-}
-
-static void print_cb(StrBuf* sb, u32 w, u64 vaddr) {
- AA64CB f = aa64_cb_unpack(w);
- emit_reg(sb, f.Rt, (int)f.sf, 0);
- strbuf_puts(sb, ", ");
- i64 ofs = sext((u64)f.imm19, 19) * 4;
- if (vaddr) {
- strbuf_put_hex_u64(sb, vaddr + (u64)ofs);
- } else {
- strbuf_puts(sb, "#");
- strbuf_put_i64(sb, ofs);
- }
-}
-
-static void print_except(StrBuf* sb, u32 w) {
- AA64Except f = aa64_except_unpack(w);
- strbuf_puts(sb, "#");
- strbuf_put_hex_u64(sb, (u64)f.imm16);
-}
-
-static void print_barrier(StrBuf* sb, u32 w, const AA64InsnDesc* desc) {
- AA64Barrier f = aa64_barrier_unpack(w);
- /* ISB and CLREX with the default CRm=SY (15) print without an
- * operand. DMB/DSB always carry an option. */
- int is_isb = (f.op2 == AA64_BARRIER_OP2_ISB);
- int is_clrex = (f.op2 == AA64_BARRIER_OP2_CLREX);
- if ((is_isb || is_clrex) && f.CRm == AA64_BARRIER_OPT_SY) return;
- const char* opt = NULL;
- switch (f.CRm) {
- case AA64_BARRIER_OPT_OSHLD: opt = "oshld"; break;
- case AA64_BARRIER_OPT_OSHST: opt = "oshst"; break;
- case AA64_BARRIER_OPT_OSH: opt = "osh"; break;
- case AA64_BARRIER_OPT_NSHLD: opt = "nshld"; break;
- case AA64_BARRIER_OPT_NSHST: opt = "nshst"; break;
- case AA64_BARRIER_OPT_NSH: opt = "nsh"; break;
- case AA64_BARRIER_OPT_ISHLD: opt = "ishld"; break;
- case AA64_BARRIER_OPT_ISHST: opt = "ishst"; break;
- case AA64_BARRIER_OPT_ISH: opt = "ish"; break;
- case AA64_BARRIER_OPT_LD: opt = (desc && desc->mnemonic &&
- desc->mnemonic[0] == 'd' &&
- desc->mnemonic[1] == 'm')
- ? "ld"
- : NULL; break;
- case AA64_BARRIER_OPT_ST: opt = (desc && desc->mnemonic &&
- desc->mnemonic[0] == 'd' &&
- desc->mnemonic[1] == 'm')
- ? "st"
- : NULL; break;
- case AA64_BARRIER_OPT_SY: opt = "sy"; break;
- default: break;
- }
- strbuf_putc(sb, ' ');
- if (opt) {
- strbuf_puts(sb, opt);
- } else {
- strbuf_puts(sb, "#");
- strbuf_put_u64(sb, (u64)f.CRm);
- }
-}
-
-void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
- u64 vaddr) {
- switch ((AA64Format)desc->fmt) {
- case AA64_FMT_MOVEWIDE: print_movewide(sb, word); break;
- case AA64_FMT_LOG_SR: print_logsr(sb, word, desc); break;
- case AA64_FMT_ADDSUB_SR: print_addsubsr(sb, word, desc); break;
- case AA64_FMT_DP3: print_dp3(sb, word, desc); break;
- case AA64_FMT_DP2: print_dp2(sb, word); break;
- case AA64_FMT_BR_REG: print_brreg(sb, word, desc); break;
- case AA64_FMT_PCREL_ADR: print_pcrel(sb, word, vaddr); break;
- case AA64_FMT_ADDSUB_IMM: print_addsubimm(sb, word); break;
- case AA64_FMT_LDST_UIMM: print_ldst_uimm(sb, word, desc); break;
- case AA64_FMT_LDSTP_PRE: print_ldstp_pre(sb, word); break;
- case AA64_FMT_LDSTP_SOFF: print_ldstp_soff(sb, word); break;
- case AA64_FMT_LDST_SIMM9: print_ldst_simm9(sb, word, desc); break;
- case AA64_FMT_BR_IMM: print_br_imm(sb, word, vaddr); break;
- case AA64_FMT_BR_COND: print_br_cond(sb, word, vaddr, desc); break;
- case AA64_FMT_CB: print_cb(sb, word, vaddr); break;
- case AA64_FMT_EXCEPT: print_except(sb, word); break;
- case AA64_FMT_HINT: break; /* no operands for NOP */
- case AA64_FMT_BARRIER: print_barrier(sb, word, desc); break;
- }
-}
-
-/* =====================================================================
- * Operand parse — phase-3 wires this up to the asm token stream. Phase
- * 2 ships the signature so the assembler bring-up commit doesn't need to
- * touch the descriptor table; the body returns 0 for every format until
- * the per-format grammar is implemented. */
-
-int aa64_parse_operands(struct AA64AsmTok* tok, const AA64InsnDesc* desc,
- void* fields_out) {
- (void)tok;
- (void)desc;
- (void)fields_out;
- return 0;
-}
diff --git a/src/arch/aa64_regs.c b/src/arch/aa64_regs.c
@@ -1,88 +0,0 @@
-/* AArch64 register name table — DWARF index ↔ assembler name.
- *
- * DWARF register numbering for AArch64 (per the AAPCS64 ABI supplement):
- * 0..30 X0..X30 (also W0..W30; same DWARF index)
- * 31 SP (X31 / WSP)
- * 32 PC
- * 33 ELR (mode dependent; unused here)
- * 64..95 V0..V31 (also B/H/S/D forms; same index)
- *
- * The canonical assembler spelling for v1 is the 64-bit form (Xn / Vn);
- * disassembler output picks W/B/H/S/D based on instruction width
- * separately. */
-
-#include <stdint.h>
-#include <string.h>
-
-#include "arch/aa64_regs.h"
-#include "core/core.h"
-
-typedef struct AA64Reg {
- uint32_t dwarf_idx;
- const char* name;
-} AA64Reg;
-
-static const AA64Reg AA64_REGS[] = {
- {0, "x0"}, {1, "x1"}, {2, "x2"}, {3, "x3"}, {4, "x4"},
- {5, "x5"}, {6, "x6"}, {7, "x7"}, {8, "x8"}, {9, "x9"},
- {10, "x10"}, {11, "x11"}, {12, "x12"}, {13, "x13"}, {14, "x14"},
- {15, "x15"}, {16, "x16"}, {17, "x17"}, {18, "x18"}, {19, "x19"},
- {20, "x20"}, {21, "x21"}, {22, "x22"}, {23, "x23"}, {24, "x24"},
- {25, "x25"}, {26, "x26"}, {27, "x27"}, {28, "x28"}, {29, "x29"},
- {30, "x30"}, {31, "sp"}, {32, "pc"},
- {64, "v0"}, {65, "v1"}, {66, "v2"}, {67, "v3"}, {68, "v4"},
- {69, "v5"}, {70, "v6"}, {71, "v7"}, {72, "v8"}, {73, "v9"},
- {74, "v10"}, {75, "v11"}, {76, "v12"}, {77, "v13"}, {78, "v14"},
- {79, "v15"}, {80, "v16"}, {81, "v17"}, {82, "v18"}, {83, "v19"},
- {84, "v20"}, {85, "v21"}, {86, "v22"}, {87, "v23"}, {88, "v24"},
- {89, "v25"}, {90, "v26"}, {91, "v27"}, {92, "v28"}, {93, "v29"},
- {94, "v30"}, {95, "v31"},
-};
-
-static const uint32_t AA64_REGS_N = (uint32_t)(sizeof AA64_REGS /
- sizeof AA64_REGS[0]);
-
-const char* aa64_register_name(uint32_t dwarf_idx) {
- uint32_t i;
- for (i = 0; i < AA64_REGS_N; ++i) {
- if (AA64_REGS[i].dwarf_idx == dwarf_idx) return AA64_REGS[i].name;
- }
- return NULL;
-}
-
-int aa64_register_index(const char* name, uint32_t* idx_out) {
- uint32_t i;
- if (!name) return 1;
- for (i = 0; i < AA64_REGS_N; ++i) {
- if (!strcmp(AA64_REGS[i].name, name)) {
- if (idx_out) *idx_out = AA64_REGS[i].dwarf_idx;
- return 0;
- }
- }
- /* Accept Wn alias for Xn (same DWARF index). */
- if (name[0] == 'w' && name[1] != '\0') {
- char buf[8];
- size_t n = strlen(name);
- if (n < sizeof buf) {
- buf[0] = 'x';
- memcpy(buf + 1, name + 1, n);
- return aa64_register_index(buf, idx_out);
- }
- }
- /* wzr / xzr aliases. */
- if (!strcmp(name, "wzr") || !strcmp(name, "xzr")) {
- if (idx_out) *idx_out = 31u; /* shares SP encoding slot; v1 picks SP */
- return 0;
- }
- return 1;
-}
-
-uint32_t aa64_register_iter_size(void) { return AA64_REGS_N; }
-
-int aa64_register_iter_get(uint32_t i, uint32_t* dwarf_out,
- const char** name_out) {
- if (i >= AA64_REGS_N) return 1;
- if (dwarf_out) *dwarf_out = AA64_REGS[i].dwarf_idx;
- if (name_out) *name_out = AA64_REGS[i].name;
- return 0;
-}
diff --git a/src/arch/aarch64/alloc.c b/src/arch/aarch64/alloc.c
@@ -1,246 +0,0 @@
-/* aarch64/alloc.c — spill/reload, labels, control flow, structured scopes. */
-
-#include "arch/aarch64/internal.h"
-
-/* ============================================================
- * AAImpl accessor
- * ============================================================ */
-
-AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
-
-/* ============================================================
- * Slot accessor
- * ============================================================ */
-
-AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs) {
- if (fs == FRAME_SLOT_NONE || fs > a->nslots) return NULL;
- return &a->slots[fs - 1];
-}
-
-static int aa_resolve_reg_name(CGTarget* t, Sym name, Reg* out,
- RegClass* cls_out) {
- (void)t;
- size_t len = 0;
- const char* s = pool_str(t->c->global, name, &len);
- if (!s || !len) return 1;
- char buf[8];
- if (len >= sizeof buf) return 1;
- memcpy(buf, s, len);
- buf[len] = '\0';
- u32 dwarf;
- if (aa64_register_index(buf, &dwarf) != 0) return 1;
- if (dwarf <= 30u) {
- if (out) *out = (Reg)dwarf;
- if (cls_out) *cls_out = RC_INT;
- return 0;
- }
- if (dwarf >= 64u && dwarf <= 95u) {
- if (out) *out = (Reg)(dwarf - 64u);
- if (cls_out) *cls_out = RC_FP;
- return 0;
- }
- return 1;
-}
-
-static void aa_spill_reg(CGTarget* t, Operand src, FrameSlot slot,
- MemAccess ma) {
- AAImpl* a = impl_of(t);
- if (src.kind != OPK_REG) {
- compiler_panic(t->c, a->loc, "aarch64 spill_reg: src is not OPK_REG");
- }
- Operand addr;
- memset(&addr, 0, sizeof addr);
- addr.kind = OPK_LOCAL;
- addr.cls = RC_INT;
- addr.type = ma.type;
- addr.v.frame_slot = slot;
- aa_store(t, addr, src, ma);
-}
-
-static void aa_reload_reg(CGTarget* t, Operand dst, FrameSlot slot,
- MemAccess ma) {
- AAImpl* a = impl_of(t);
- if (dst.kind != OPK_REG) {
- compiler_panic(t->c, a->loc, "aarch64 reload_reg: dst is not OPK_REG");
- }
- Operand addr;
- memset(&addr, 0, sizeof addr);
- addr.kind = OPK_LOCAL;
- addr.cls = RC_INT;
- addr.type = ma.type;
- addr.v.frame_slot = slot;
- aa_load(t, dst, addr, ma);
-}
-
-/* ============================================================
- * Labels / control flow
- * ============================================================ */
-
-static Label aa_label_new(CGTarget* t) {
- return (Label)t->mc->label_new(t->mc);
-}
-
-static void aa_label_place(CGTarget* t, Label l) {
- t->mc->label_place(t->mc, (MCLabel)l);
-}
-
-void aa_jump(CGTarget* t, Label l) {
- MCEmitter* mc = t->mc;
- aa64_emit32(mc, aa64_b_base());
- mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_JUMP26, 4, 0);
-}
-
-static u32 cmp_to_cond(CmpOp op) {
- switch (op) {
- case CMP_EQ: return 0x0u;
- case CMP_NE: return 0x1u;
- case CMP_LT_U: return 0x3u;
- case CMP_LE_U: return 0x9u;
- case CMP_GT_U: return 0x8u;
- case CMP_GE_U: return 0x2u;
- case CMP_LT_S: return 0xbu;
- case CMP_LE_S: return 0xdu;
- case CMP_GT_S: return 0xcu;
- case CMP_GE_S: return 0xau;
- default: return 0x0u;
- }
-}
-
-void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) {
- MCEmitter* mc = t->mc;
- u32 sf = type_is_64(a_op.type) ? 1u : 0u;
- if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
- u32 imm12, sh;
- if (aa64_addsub_imm_fits(b_op.v.imm, &imm12, &sh)) {
- u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- aa64_emit32(mc, aa64_subs_imm12(sf, /*Rd=ZR*/ 31u, rn, imm12, sh));
- return;
- }
- }
- u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 rm =
- aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0);
- aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, rn, rm));
-}
-
-static void aa_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b,
- Label l) {
- MCEmitter* mc = t->mc;
- emit_cmp_ab(t, a, b);
- aa64_emit32(mc, aa64_b_cond(cmp_to_cond(op)));
- mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_CONDBR19, 4, 0);
-}
-
-static void aa_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) {
- emit_cmp_ab(t, a, b);
- u32 sf_dst = type_is_64(dst.type) ? 1u : 0u;
- aa64_emit32(t->mc, aa64_cset(sf_dst, reg_num(dst), cmp_to_cond(op)));
-}
-
-/* ============================================================
- * Structured scopes
- * ============================================================ */
-
-static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) {
- AAImpl* a = impl_of(t);
- if (a->nscopes == a->scopes_cap) {
- u32 ncap = a->scopes_cap ? a->scopes_cap * 2u : 4u;
- AAScope* nb = arena_array(t->c->tu, AAScope, ncap);
- if (a->scopes) memcpy(nb, a->scopes, sizeof(AAScope) * a->nscopes);
- a->scopes = nb;
- a->scopes_cap = ncap;
- }
- AAScope* sc = &a->scopes[a->nscopes];
- sc->kind = (u8)d->kind;
- sc->has_else = 0;
- sc->else_label = 0;
- sc->end_label = 0;
- sc->break_label = d->break_label;
- sc->continue_label = d->continue_label;
-
- if (d->kind == SCOPE_IF) {
- sc->else_label = t->mc->label_new(t->mc);
- sc->end_label = t->mc->label_new(t->mc);
- u32 sf = type_is_64(d->cond.type) ? 1u : 0u;
- u32 rn = aa64_force_reg_int(t, d->cond, sf, AA_TMP0);
- aa64_emit32(t->mc, aa64_subs_imm(sf, /*Rd=ZR*/ 31u, rn, 0));
- aa64_emit32(t->mc, aa64_b_cond(0x0u /*EQ*/));
- t->mc->emit_label_ref(t->mc, sc->else_label, R_AARCH64_CONDBR19, 4, 0);
- } else if (d->kind == SCOPE_LOOP || d->kind == SCOPE_BLOCK) {
- /* bookkeep only */
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 scope_begin: kind %d not yet implemented",
- (int)d->kind);
- }
-
- a->nscopes++;
- return (CGScope)a->nscopes;
-}
-
-static void aa_scope_else(CGTarget* t, CGScope s) {
- AAImpl* a = impl_of(t);
- if (s == CG_SCOPE_NONE || s > a->nscopes) {
- compiler_panic(t->c, a->loc, "aarch64 scope_else: bad scope %u",
- (unsigned)s);
- }
- AAScope* sc = &a->scopes[s - 1];
- aa64_emit32(t->mc, aa64_b_base());
- t->mc->emit_label_ref(t->mc, sc->end_label, R_AARCH64_JUMP26, 4, 0);
- t->mc->label_place(t->mc, sc->else_label);
- sc->has_else = 1;
-}
-
-static void aa_scope_end(CGTarget* t, CGScope s) {
- AAImpl* a = impl_of(t);
- if (s == CG_SCOPE_NONE || s > a->nscopes) {
- compiler_panic(t->c, a->loc, "aarch64 scope_end: bad scope %u",
- (unsigned)s);
- }
- AAScope* sc = &a->scopes[s - 1];
- if (sc->kind == SCOPE_IF) {
- if (!sc->has_else) {
- t->mc->label_place(t->mc, sc->else_label);
- }
- t->mc->label_place(t->mc, sc->end_label);
- }
-}
-
-static void aa_break_to(CGTarget* t, CGScope s) {
- AAImpl* a = impl_of(t);
- if (s == CG_SCOPE_NONE || s > a->nscopes) {
- compiler_panic(t->c, a->loc, "aarch64 break_to: bad scope %u", (unsigned)s);
- }
- AAScope* sc = &a->scopes[s - 1];
- aa_jump(t, sc->break_label);
-}
-
-static void aa_continue_to(CGTarget* t, CGScope s) {
- AAImpl* a = impl_of(t);
- if (s == CG_SCOPE_NONE || s > a->nscopes) {
- compiler_panic(t->c, a->loc, "aarch64 continue_to: bad scope %u",
- (unsigned)s);
- }
- AAScope* sc = &a->scopes[s - 1];
- aa_jump(t, sc->continue_label);
-}
-
-/* Expose vtable entries to ops.c constructor via a registration helper.
- * ops.c calls this after the basic ops vtable is populated. */
-void aa_alloc_vtable_init(CGTarget* t) {
- t->spill_reg = aa_spill_reg;
- t->reload_reg = aa_reload_reg;
- t->resolve_reg_name = aa_resolve_reg_name;
-
- t->label_new = aa_label_new;
- t->label_place = aa_label_place;
- t->jump = aa_jump;
- t->cmp_branch = aa_cmp_branch;
- t->cmp = aa_cmp;
-
- t->scope_begin = aa_scope_begin;
- t->scope_else = aa_scope_else;
- t->scope_end = aa_scope_end;
- t->break_to = aa_break_to;
- t->continue_to = aa_continue_to;
-}
diff --git a/src/arch/aarch64/arch.c b/src/arch/aarch64/arch.c
@@ -1,95 +0,0 @@
-#include "arch/arch.h"
-
-#include "abi/abi_internal.h"
-#include "arch/aa64.h"
-#include "arch/aa64_asm.h"
-#include "arch/aa64_disasm.h"
-#include "arch/aa64_regs.h"
-#include "core/bytes.h"
-#include "link/link_arch.h"
-#include "obj/elf.h"
-#include "obj/macho.h"
-#include "obj/obj.h"
-
-static const ABIVtable* aa64_abi_vtable(Compiler* c, CfreeOSKind os) {
- (void)c;
- switch (os) {
- case CFREE_OS_MACOS:
- return &apple_arm64_vtable;
- default:
- return &aapcs64_vtable;
- }
-}
-
-static int aa64_register_at_public(uint32_t idx, CfreeArchReg* out) {
- if (!out) return 1;
- return aa64_register_iter_get(idx, &out->dwarf_idx, &out->name);
-}
-
-static const ArchElfOps aa64_elf_ops = {
- .e_machine = EM_AARCH64,
- .e_flags = 0,
- .reloc_to = elf_aarch64_reloc_to,
- .reloc_from = elf_aarch64_reloc_from,
-};
-
-static const ArchMachoOps aa64_macho_ops = {
- .cputype = CPU_TYPE_ARM64,
- .cpusubtype = CPU_SUBTYPE_ARM64_ALL,
- .reloc_to = macho_aarch64_reloc_to,
- .reloc_pcrel = macho_aarch64_reloc_pcrel,
- .reloc_length = macho_aarch64_reloc_length,
- .reloc_from = macho_aarch64_reloc_from,
-};
-
-static int aa64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) {
- const Section* s;
- u8 cur[4];
- u32 word;
-
- (void)c;
- if (!fx || fx->width != 4) return 1;
- s = obj_section_get(fx->obj, fx->sec_id);
- if (!s) return 0;
- buf_read(&s->bytes, fx->offset, cur, 4);
- word = rd_u32_le(cur);
-
- switch (fx->kind) {
- case R_AARCH64_JUMP26:
- case R_AARCH64_CALL26: {
- i64 idisp = fx->disp >> 2;
- u32 imm26 = (u32)(idisp & 0x03ffffffu);
- word = (word & ~0x03ffffffu) | imm26;
- break;
- }
- case R_AARCH64_CONDBR19: {
- i64 idisp = fx->disp >> 2;
- u32 imm19 = (u32)(idisp & 0x7ffffu);
- word = (word & ~(0x7ffffu << 5)) | (imm19 << 5);
- break;
- }
- default:
- return 1;
- }
-
- wr_u32_le(cur, word);
- obj_patch(fx->obj, fx->sec_id, fx->offset, cur, 4);
- return 0;
-}
-
-const ArchImpl arch_impl_aa64 = {
- .kind = CFREE_ARCH_ARM_64,
- .name = "aa64",
- .abi_vtable = aa64_abi_vtable,
- .cgtarget_new = aa64_cgtarget_new,
- .asm_new = aa64_arch_asm_new,
- .disasm_new = aa64_disasm_new,
- .apply_label_fixup = aa64_apply_label_fixup,
- .link = &link_arch_aa64,
- .elf = &aa64_elf_ops,
- .macho = &aa64_macho_ops,
- .register_name = aa64_register_name,
- .register_index = aa64_register_index,
- .register_count = aa64_register_iter_size,
- .register_at = aa64_register_at_public,
-};
diff --git a/src/arch/aarch64/emit.c b/src/arch/aarch64/emit.c
@@ -1,523 +0,0 @@
-/* aarch64/emit.c — instruction encoding helpers, function lifecycle,
- * frame layout, parameter ABI, address materialization. */
-
-#include "arch/aarch64/internal.h"
-
-extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
-
-/* ============================================================
- * Shared type / operand helpers
- * ============================================================ */
-
-int type_is_64(CfreeCgTypeId t) {
- return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I64) ||
- t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64) ||
- t >= (CfreeCgTypeId)(2u << 6);
-}
-
-int type_is_fp_double(CfreeCgTypeId t) {
- return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64);
-}
-
-int type_is_signed(CfreeCgTypeId t) {
- (void)t;
- return 0;
-}
-
-u32 type_byte_size(CfreeCgTypeId t) {
- if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I8) ||
- t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_BOOL))
- return 1;
- if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I16)) return 2;
- if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I32) ||
- t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32))
- return 4;
- return 8;
-}
-
-u32 size_idx_for_bytes(u32 nbytes) {
- switch (nbytes) {
- case 1:
- return 0;
- case 2:
- return 1;
- case 4:
- return 2;
- case 8:
- return 3;
- default:
- return 3;
- }
-}
-
-u32 reg_num(Operand op) { return op.v.reg & 0x1fu; }
-
-static u32 collect_mask_regs(u32 mask, u32 first, u32 last, u32* out) {
- u32 n = 0;
- for (u32 r = first; r <= last; ++r) {
- if (mask & (1u << r)) out[n++] = r;
- }
- return n;
-}
-
-/* ============================================================
- * Low-level emission
- * ============================================================ */
-
-void aa64_emit32(MCEmitter* mc, u32 word) {
- u32 ofs = obj_pos(mc->obj, mc->section_id);
- u8 b[4];
- b[0] = (u8)(word & 0xff);
- b[1] = (u8)((word >> 8) & 0xff);
- b[2] = (u8)((word >> 16) & 0xff);
- b[3] = (u8)((word >> 24) & 0xff);
- mc->emit_bytes(mc, b, 4);
- if (mc->debug) {
- debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
- }
-}
-
-void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word) {
- u8 b[4];
- b[0] = (u8)(word & 0xff);
- b[1] = (u8)((word >> 8) & 0xff);
- b[2] = (u8)((word >> 16) & 0xff);
- b[3] = (u8)((word >> 24) & 0xff);
- obj_patch(obj, sec_id, ofs, b, 4);
-}
-
-/* ============================================================
- * Immediate encoding helpers
- * ============================================================ */
-
-void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm) {
- const u32 nslots = sf ? 4u : 2u;
- u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu);
-
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
- u64 cleared = v & ~((u64)0xffffu << (i * 16));
- if (slot != 0 && cleared == 0) {
- aa64_emit32(mc, aa64_movz(sf, Rd, slot, i));
- return;
- }
- }
-
- {
- u64 inv = sf ? ~v : ((~v) & 0xffffffffu);
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((inv >> (i * 16)) & 0xffffu);
- u64 cleared = inv & ~((u64)0xffffu << (i * 16));
- if (cleared == 0) {
- aa64_emit32(mc, aa64_movn(sf, Rd, slot, i));
- return;
- }
- }
- }
-
- int placed = 0;
- for (u32 i = 0; i < nslots; ++i) {
- u32 slot = (u32)((v >> (i * 16)) & 0xffffu);
- if (!placed) {
- if (slot == 0) continue;
- aa64_emit32(mc, aa64_movz(sf, Rd, slot, i));
- placed = 1;
- } else if (slot != 0) {
- aa64_emit32(mc, aa64_movk(sf, Rd, slot, i));
- }
- }
- if (!placed) aa64_emit32(mc, aa64_movz(sf, Rd, 0, 0));
-}
-
-void emit_sp_add(MCEmitter* mc, u32 imm) {
- if (imm <= 0xfff) {
- aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm, 0));
- } else if ((imm & 0xfff) == 0 && (imm >> 12) <= 0xfff) {
- aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm >> 12, 1));
- } else {
- aa64_emit32(mc, aa64_add_imm(1, 31, 31, (imm >> 12) & 0xfff, 1));
- aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm & 0xfff, 0));
- }
-}
-
-/* ============================================================
- * Function lifecycle
- * ============================================================ */
-
-void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- mc->set_section(mc, fd->text_section_id);
- mc->emit_align(mc, 4, 0);
-
- a->fd = fd;
- a->func_start = mc->pos(mc);
- a->next_param_int = 0;
- a->next_param_fp = 0;
- a->next_param_stack = 0;
- a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
- a->cum_off = 0;
- a->max_outgoing = 0;
- a->used_cs_int_mask = 0;
- a->used_cs_fp_mask = 0;
- a->nslots = 0;
- a->nscopes = 0;
- a->has_alloca = 0;
- a->nadd_patches = 0;
- a->sret_ptr_slot = FRAME_SLOT_NONE;
- a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
- a->gp_save_slot = FRAME_SLOT_NONE;
- a->fp_save_slot = FRAME_SLOT_NONE;
- a->epilogue_label = mc->label_new(mc);
-
- mc->cfi_startproc(mc);
-
- a->prologue_pos = mc->pos(mc);
- for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa64_emit32(mc, AA64_NOP);
-
- if (a->has_sret) {
- FrameSlotDesc fsd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 8,
- .align = 8,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->sret_ptr_slot = aa_frame_slot(t, &fsd);
- }
-
- if (a->is_variadic) {
- FrameSlotDesc gpd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 64,
- .align = 8,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->gp_save_slot = aa_frame_slot(t, &gpd);
- FrameSlotDesc fpd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 128,
- .align = 16,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->fp_save_slot = aa_frame_slot(t, &fpd);
- AASlot* gs = aa64_slot_get(a, a->gp_save_slot);
- AASlot* fs = aa64_slot_get(a, a->fp_save_slot);
- for (u32 i = 0; i < 8; ++i) {
- aa64_emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i * 8));
- }
- for (u32 i = 0; i < 8; ++i) {
- aa64_emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i * 16));
- }
- }
-}
-
-void aa_func_end(CGTarget* t) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- u32 int_regs[10];
- u32 fp_regs[8];
- u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs);
- u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs);
-
- u32 outgoing_off = 0;
- u32 int_save_off = a->max_outgoing;
- u32 fp_save_off = int_save_off + n_int_saves * 8u;
- u32 locals_off = fp_save_off + n_fp_saves * 8u;
- u32 fp_lr_off = locals_off + a->cum_off;
- u32 frame_size = fp_lr_off + 16;
- frame_size = (frame_size + 15u) & ~15u;
- fp_lr_off = frame_size - 16;
-
- (void)outgoing_off;
-
- mc->label_place(mc, a->epilogue_label);
-
- if (a->has_alloca) {
- if (fp_lr_off <= 0xfff) {
- aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=*/29, fp_lr_off, 0));
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64: has_alloca + fp_lr_off %u out of imm12 range",
- fp_lr_off);
- }
- }
-
- for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) {
- u32 r0 = fp_regs[i];
- aa64_emit32(mc, aa64_ldr_fp_uimm(3, r0, 31,
- fp_save_off + (u32)i * 8u));
- }
- for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) {
- u32 r0 = int_regs[i];
- aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31,
- int_save_off + (u32)i * 8u));
- }
- aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
- emit_sp_add(mc, frame_size);
- aa64_emit32(mc, aa64_ret(AA64_LR));
-
- u32 pos = a->prologue_pos;
- ObjBuilder* obj = t->obj;
- u32 sec = a->fd->text_section_id;
-
- u32 words[AA_PROLOGUE_WORDS];
- for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) words[i] = AA64_NOP;
- u32 wi = 0;
-
- if (frame_size <= 0xfff) {
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0);
- } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) {
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1);
- } else {
- if (wi + 2 > AA_PROLOGUE_WORDS) {
- compiler_panic(t->c, a->loc,
- "aarch64: prologue overflow for frame_size %u",
- frame_size);
- }
- words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1);
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0);
- }
- words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
- words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
- if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
- AASlot* s = aa64_slot_get(a, a->sret_ptr_slot);
- if (s) {
- if (wi >= AA_PROLOGUE_WORDS) goto overflow;
- words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off);
- }
- }
- for (u32 i = 0; i < n_int_saves; ++i) {
- u32 r0 = int_regs[i];
- if (wi >= AA_PROLOGUE_WORDS) goto overflow;
- words[wi++] = aa64_str_uimm(3, r0, 31, int_save_off + i * 8u);
- }
- for (u32 i = 0; i < n_fp_saves; ++i) {
- u32 r0 = fp_regs[i];
- if (wi >= AA_PROLOGUE_WORDS) goto overflow;
- words[wi++] = aa64_str_fp_uimm(3, r0, 31, fp_save_off + i * 8u);
- }
- if (0) {
- overflow:
- compiler_panic(
- t->c, a->loc,
- "aarch64: prologue placeholder too small (used %u of %u words)", wi,
- AA_PROLOGUE_WORDS);
- }
-
- for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) {
- aa64_patch32(obj, sec, pos + i * 4u, words[i]);
- }
-
- if (a->max_outgoing > 0xfff) {
- compiler_panic(
- t->c, a->loc,
- "aarch64: max_outgoing %u out of imm12 range for alloca patch",
- a->max_outgoing);
- }
- for (u32 i = 0; i < a->nadd_patches; ++i) {
- u32 dr = a->add_patches[i].dst_reg;
- u32 word = aa64_add_imm(1, dr, /*Rn=SP*/ 31, a->max_outgoing, 0);
- aa64_patch32(obj, sec, a->add_patches[i].pos, word);
- }
-
- u32 end = mc->pos(mc);
- obj_symbol_define(obj, a->fd->sym, sec, (u64)a->func_start,
- (u64)(end - a->func_start));
-
- mc->cfi_endproc(mc);
- a->fd = NULL;
-}
-
-/* ============================================================
- * Frame slots
- * ============================================================ */
-
-FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) {
- AAImpl* a = impl_of(t);
- if (a->nslots == a->slots_cap) {
- u32 ncap = a->slots_cap ? a->slots_cap * 2 : 8;
- AASlot* nbuf = arena_array(t->c->tu, AASlot, ncap);
- if (a->slots) memcpy(nbuf, a->slots, sizeof(AASlot) * a->nslots);
- a->slots = nbuf;
- a->slots_cap = ncap;
- }
- u32 size = d->size ? d->size : 8;
- u32 align = d->align ? d->align : 1;
- u32 next = a->cum_off + size;
- u32 mask = align - 1;
- next = (next + mask) & ~mask;
-
- AASlot* s = &a->slots[a->nslots];
- s->off = next;
- s->size = size;
- s->align = align;
- s->kind = d->kind;
-
- a->cum_off = next;
- a->nslots++;
- return (FrameSlot)(a->nslots);
-}
-
-/* ============================================================
- * Parameters
- * ============================================================ */
-
-void aa_param(CGTarget* t, const CGParamDesc* p) {
- AAImpl* a = impl_of(t);
- AASlot* s = aa64_slot_get(a, p->slot);
- if (!s) {
- compiler_panic(t->c, a->loc, "aarch64 param: bad slot");
- }
- const ABIArgInfo* ai = p->abi;
-
- if (ai->kind == ABI_ARG_IGNORE) return;
- if (ai->kind == ABI_ARG_INDIRECT) {
- u32 ptr_reg;
- if (a->next_param_int < 8) {
- ptr_reg = a->next_param_int++;
- } else {
- u32 caller_off = a->next_param_stack;
- a->next_param_stack += 8;
- aa64_emit32(t->mc, aa64_ldur(3, AA_TMP0, 29, (i32)(16 + caller_off)));
- ptr_reg = AA_TMP0;
- }
- u32 nbytes = s->size;
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit32(t->mc, aa64_ldur(3, AA_TMP1, ptr_reg, (i32)i));
- aa64_emit32(t->mc, aa64_stur(3, AA_TMP1, 29, -(i32)s->off + (i32)i));
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit32(t->mc, aa64_ldur(2, AA_TMP1, ptr_reg, (i32)i));
- aa64_emit32(t->mc, aa64_stur(2, AA_TMP1, 29, -(i32)s->off + (i32)i));
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit32(t->mc, aa64_ldur(1, AA_TMP1, ptr_reg, (i32)i));
- aa64_emit32(t->mc, aa64_stur(1, AA_TMP1, 29, -(i32)s->off + (i32)i));
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit32(t->mc, aa64_ldur(0, AA_TMP1, ptr_reg, (i32)i));
- aa64_emit32(t->mc, aa64_stur(0, AA_TMP1, 29, -(i32)s->off + (i32)i));
- i += 1;
- }
- return;
- }
- for (u16 i = 0; i < ai->nparts; ++i) {
- const ABIArgPart* pt = &ai->parts[i];
- u32 part_off = pt->src_offset;
- u32 sz = pt->size;
- u32 sidx = size_idx_for_bytes(sz);
-
- if (pt->cls == ABI_CLASS_INT) {
- if (a->next_param_int < 8) {
- u32 reg = a->next_param_int++;
- aa64_emit32(t->mc, aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off));
- } else {
- u32 caller_off = a->next_param_stack;
- a->next_param_stack += 8;
- aa64_emit32(t->mc, aa64_ldur(sidx, AA_TMP0, 29, (i32)(16 + caller_off)));
- aa64_emit32(t->mc,
- aa64_stur(sidx, AA_TMP0, 29,
- -(i32)s->off + (i32)part_off));
- }
- } else if (pt->cls == ABI_CLASS_FP) {
- if (a->next_param_fp < 8) {
- u32 reg = a->next_param_fp++;
- aa64_emit32(t->mc,
- aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off));
- } else {
- u32 caller_off = a->next_param_stack;
- a->next_param_stack += 8;
- aa64_emit32(t->mc,
- aa64_ldur_fp(sidx, AA_FP_TMP0, 29,
- (i32)(16 + caller_off)));
- aa64_emit32(t->mc,
- aa64_stur_fp(sidx, AA_FP_TMP0, 29,
- -(i32)s->off + (i32)part_off));
- }
- } else {
- compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl",
- (int)pt->cls);
- }
- }
-}
-
-/* ============================================================
- * Address materialization helpers
- * ============================================================ */
-
-static int use_got_for_sym(CGTarget* t, ObjSymId sym) {
- return obj_symbol_extern_via_got(t->c, t->obj, sym);
-}
-
-void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(dst_reg));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_GOT_PAGE, sym, 0, 0, 0);
- u32 ldr_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, dst_reg, dst_reg, 0));
- mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LD64_GOT_LO12_NC, sym, 0, 0, 0);
-}
-
-void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend) {
- MCEmitter* mc = t->mc;
- if (use_got_for_sym(t, sym)) {
- aa64_emit_got_load_addr(t, dst_reg, sym);
- if (addend) aa64_emit_addr_adjust(mc, dst_reg, dst_reg, (i32)addend);
- return;
- }
- u32 sec = mc->section_id;
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(dst_reg));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, addend,
- 0, 0);
- u32 add_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_add_imm(1, dst_reg, dst_reg, 0, 0));
- mc->emit_reloc_at(mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, addend, 0,
- 0);
-}
-
-void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off) {
- if (off == 0) {
- aa64_emit32(mc, aa64_mov_reg(1, Rd, base));
- return;
- }
- u32 abs_off = (off < 0) ? (u32)(-off) : (u32)off;
- if (abs_off <= 0xfff) {
- if (off < 0)
- aa64_emit32(mc, aa64_sub_imm(1, Rd, base, abs_off, 0));
- else
- aa64_emit32(mc, aa64_add_imm(1, Rd, base, abs_off, 0));
- return;
- }
- if ((abs_off >> 24) == 0) {
- u32 hi = (abs_off >> 12) & 0xfff;
- u32 lo = abs_off & 0xfff;
- if (off < 0) {
- if (hi) aa64_emit32(mc, aa64_sub_imm(1, Rd, base, hi, 1));
- if (lo) aa64_emit32(mc, aa64_sub_imm(1, Rd, hi ? Rd : base, lo, 0));
- } else {
- if (hi) aa64_emit32(mc, aa64_add_imm(1, Rd, base, hi, 1));
- if (lo) aa64_emit32(mc, aa64_add_imm(1, Rd, hi ? Rd : base, lo, 0));
- }
- return;
- }
- aa64_emit_load_imm(mc, 1, Rd, off);
- aa64_emit32(mc, aa64_add(1, Rd, base, Rd));
-}
diff --git a/src/arch/aarch64/internal.h b/src/arch/aarch64/internal.h
@@ -1,306 +0,0 @@
-/* aarch64/internal.h — private types and forward decls shared across
- * emit.c / alloc.c / ops.c. NOT part of the public API. */
-#pragma once
-
-#include <string.h>
-
-#include "arch/aa64_asm.h"
-#include "arch/aa64_isa.h"
-#include "arch/aa64_regs.h"
-#include "arch/arch.h"
-#include "core/arena.h"
-#include "core/pool.h"
-#include "obj/obj.h"
-
-/* ============================================================
- * Local encoding helpers (kept here, not in aa64_isa.h).
- * ============================================================ */
-
-#define AA64_NOP 0xD503201Fu
-
-/* Hidden backend temporaries. These must stay outside the allocable pools and
- * outside optimizer scratch registers because CGTarget ops may clobber them
- * while lowering a single operation. AA_FP_TMP0 names v31, not integer x31. */
-enum {
- AA_TMP0 = 9u,
- AA_TMP1 = 10u,
- AA_TMP2 = 11u,
- AA_FP_TMP0 = 31u,
-};
-#define CG_BUILTIN_ID(k) ((CfreeCgTypeId)((1u << 6) | (u32)(k)))
-
-static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
- i32 sc = byte_off >> 3;
- return 0xA9000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
- i32 sc = byte_off >> 3;
- return 0xA9400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
- i32 sc = byte_off >> 3;
- return 0x6D000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) {
- i32 sc = byte_off >> 3;
- return 0x6D400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-
-static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9) {
- return 0x38000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9) {
- return 0x38400000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) {
- return 0x3C000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) {
- return 0x3C400000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-
-static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> size;
- return 0x39000000u | (size << 30) | ((sc & 0xfffu) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldr_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> size;
- return 0x39400000u | (size << 30) | ((sc & 0xfffu) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_str_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> size;
- return 0x3D000000u | (size << 30) | ((sc & 0xfffu) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-
-static inline u32 aa64_mrs_tpidr_el0(u32 Rt) {
- return 0xD53BD040u | (Rt & 0x1fu);
-}
-static inline u32 aa64_b_base(void) { return 0x14000000u; }
-static inline u32 aa64_bl_base(void) { return 0x94000000u; }
-
-static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd & 0x1f); }
-
-static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) {
- u32 sc = byte_off >> size;
- return 0x3D400000u | (size << 30) | ((sc & 0xfffu) << 10) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-
-static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn) {
- return 0x1E204000u | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12) {
- return 0x71000000u | (sf << 31) | ((imm12 & 0xfff) << 10) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_cset_eq(u32 sf, u32 Rd) {
- return 0x1A800400u | (sf << 31) | (31u << 16) | (0x1u << 12) | (31u << 5) |
- (Rd & 0x1f);
-}
-
-static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn) {
- return 0x1E380000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-static inline u32 aa64_fcvtzu(u32 sf, u32 type, u32 Rd, u32 Rn) {
- return 0x1E390000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-static inline u32 aa64_scvtf(u32 sf, u32 type, u32 Rd, u32 Rn) {
- return 0x1E220000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-static inline u32 aa64_ucvtf(u32 sf, u32 type, u32 Rd, u32 Rn) {
- return 0x1E230000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-
-static inline u32 aa64_fcvt_d_s(u32 Rd, u32 Rn) {
- return 0x1E22C000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fcvt_s_d(u32 Rd, u32 Rn) {
- return 0x1E624000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_fmov_s_w(u32 Rd, u32 Rn) {
- return 0x1E270000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fmov_w_s(u32 Rd, u32 Rn) {
- return 0x1E260000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn) {
- return 0x9E670000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn) {
- return 0x9E660000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_sub_extreg_x_uxtx(u32 Rd, u32 Rn, u32 Rm) {
- return 0xCB206000u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
- return 0x6B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-
-static inline u32 aa64_b_cond(u32 cond) { return 0x54000000u | (cond & 0xfu); }
-
-static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) {
- return 0x1A800400u | (sf << 31) | ((Rm & 0x1f) << 16) |
- ((cond & 0xfu) << 12) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond) {
- return aa64_csinc(sf, Rd, 31u, 31u, cond ^ 1u);
-}
-
-static inline u32 aa64_fadd(u32 type, u32 Rd, u32 Rn, u32 Rm) {
- return 0x1E202800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fsub(u32 type, u32 Rd, u32 Rn, u32 Rm) {
- return 0x1E203800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fmul(u32 type, u32 Rd, u32 Rn, u32 Rm) {
- return 0x1E200800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_fdiv(u32 type, u32 Rd, u32 Rn, u32 Rm) {
- return 0x1E201800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-static inline u32 aa64_sbfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
- return 0x13000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) |
- ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_ubfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
- return 0x53000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) |
- ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
- return 0x33000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) |
- ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-
-/* ============================================================
- * AAImpl types
- * ============================================================ */
-
-#define AA_PROLOGUE_WORDS \
- 22u /* worst case: sub sp + stp/add fp + sret + 5 int + 8 fp saves */
-
-typedef struct AASlot {
- u32 off;
- u32 size;
- u32 align;
- u8 kind;
- u8 pad[3];
-} AASlot;
-
-typedef struct AAScope {
- u8 kind;
- u8 has_else;
- u8 pad[2];
- MCLabel else_label;
- MCLabel end_label;
- Label break_label;
- Label continue_label;
-} AAScope;
-
-typedef struct AAImpl {
- CGTarget base;
- SrcLoc loc;
- const CGFuncDesc* fd;
-
- u32 func_start;
- u32 prologue_pos;
- MCLabel epilogue_label;
-
- AASlot* slots;
- u32 nslots;
- u32 slots_cap;
- u32 cum_off;
- u32 max_outgoing;
-
- u32 next_param_int;
- u32 next_param_fp;
- u32 next_param_stack;
- u8 has_sret;
- FrameSlot sret_ptr_slot;
-
- u32 used_cs_int_mask; /* bit reg set when x19-x28 must be preserved */
- u32 used_cs_fp_mask; /* bit reg set when d8-d15 must be preserved */
-
- AAScope* scopes;
- u32 nscopes;
- u32 scopes_cap;
-
- u8 has_alloca;
- struct AAAllocaPatch {
- u32 pos;
- u32 dst_reg;
- }* add_patches;
- u32 nadd_patches;
- u32 add_patches_cap;
-
- u8 is_variadic;
- FrameSlot gp_save_slot;
- FrameSlot fp_save_slot;
-} AAImpl;
-
-/* ============================================================
- * Cross-file forward declarations
- * ============================================================ */
-
-/* emit.c helpers used in alloc.c / ops.c */
-void aa64_emit32(MCEmitter* mc, u32 word);
-void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word);
-void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm);
-void emit_sp_add(MCEmitter* mc, u32 imm);
-void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off);
-void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym);
-void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend);
-
-/* emit.c public surface */
-FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d);
-void aa_func_begin(CGTarget* t, const CGFuncDesc* fd);
-void aa_func_end(CGTarget* t);
-void aa_param(CGTarget* t, const CGParamDesc* p);
-
-/* alloc.c helpers used in emit.c / ops.c */
-AAImpl* impl_of(CGTarget* t);
-AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs);
-void aa_jump(CGTarget* t, Label l);
-
-/* ops.c helpers used in alloc.c */
-void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma);
-void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma);
-u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch);
-
-/* alloc.c helpers used in ops.c */
-void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op);
-void aa_alloc_vtable_init(CGTarget* t);
-void aa_coord_vtable_init(CGTarget* t);
-
-/* shared type helpers (defined in emit.c, used broadly) */
-int type_is_64(CfreeCgTypeId t);
-int type_is_fp_double(CfreeCgTypeId t);
-int type_is_signed(CfreeCgTypeId t);
-u32 type_byte_size(CfreeCgTypeId t);
-u32 size_idx_for_bytes(u32 nbytes);
-u32 reg_num(Operand op);
diff --git a/src/arch/aarch64/ops.c b/src/arch/aarch64/ops.c
@@ -1,1925 +0,0 @@
-/* aarch64/ops.c — data movement, arithmetic, calls, varargs, atomics,
- * intrinsics, asm_block, set_loc, finalize/destroy, vtable constructor. */
-
-#include "arch/aarch64/internal.h"
-
-/* ============================================================
- * Data movement
- * ============================================================ */
-
-static void aa_load_imm(CGTarget* t, Operand dst, i64 imm) {
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- aa64_emit_load_imm(t->mc, sf, reg_num(dst), imm);
-}
-
-static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) {
- AAImpl* a = impl_of(t);
- if (dst.cls != RC_FP) {
- compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1");
- }
-
- Sym ro_name = pool_intern_cstr(t->c->global, ".rodata");
- ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, 1u);
-
- u32 cur_section = t->mc->section_id;
- t->mc->set_section(t->mc, ro);
- u32 ro_off = obj_align_to(t->obj, ro, cb.align ? cb.align : 4);
- t->mc->emit_bytes(t->mc, cb.bytes, cb.size);
-
- char namebuf[64];
- static u32 lit_seq = 0;
- int len = 0;
- {
- const char* prefix = ".LCFP";
- for (; prefix[len]; ++len) namebuf[len] = prefix[len];
- u32 v = lit_seq++;
- char tmp[16];
- int tn = 0;
- if (v == 0)
- tmp[tn++] = '0';
- else {
- while (v) {
- tmp[tn++] = '0' + (char)(v % 10);
- v /= 10;
- }
- }
- for (int i = tn - 1; i >= 0; --i) namebuf[len++] = tmp[i];
- namebuf[len] = 0;
- }
- Sym sname = pool_intern_cstr(t->c->global, namebuf);
- ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro, (u64)ro_off,
- (u64)cb.size);
-
- t->mc->set_section(t->mc, cur_section);
-
- u32 adrp_pos = t->mc->pos(t->mc);
- aa64_emit32(t->mc, aa64_adrp_base(AA_TMP0));
- t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21,
- sym, 0, 0, 0);
-
- u32 ldr_pos = t->mc->pos(t->mc);
- u32 sidx = (cb.size == 8) ? 3u : 2u;
- aa64_emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0));
- RelocKind lo12 = (cb.size == 8) ? R_AARCH64_LDST64_ABS_LO12_NC
- : R_AARCH64_LDST32_ABS_LO12_NC;
- t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0);
-}
-
-static void aa_copy(CGTarget* t, Operand dst, Operand src) {
- if (dst.cls == RC_FP || src.cls == RC_FP) {
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- aa64_emit32(t->mc, aa64_fmov_reg(type, reg_num(dst), reg_num(src)));
- return;
- }
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- aa64_emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src)));
-}
-
-/* ============================================================
- * Load / store
- * ============================================================ */
-
-static RelocKind ldst_lo12_reloc_for(u32 nbytes) {
- switch (nbytes) {
- case 1: return R_AARCH64_LDST8_ABS_LO12_NC;
- case 2: return R_AARCH64_LDST16_ABS_LO12_NC;
- case 4: return R_AARCH64_LDST32_ABS_LO12_NC;
- case 8: return R_AARCH64_LDST64_ABS_LO12_NC;
- default: return R_AARCH64_LDST64_ABS_LO12_NC;
- }
-}
-
-static int use_got_for_sym(CGTarget* t, ObjSymId sym) {
- return obj_symbol_extern_via_got(t->c, t->obj, sym);
-}
-
-static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg) {
- AAImpl* a = impl_of(t);
- if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_base: bad slot");
- i32 off = -(i32)s->off;
- if (off >= -256 && off <= 255) {
- *out_off = off;
- return 29;
- }
- aa64_emit_addr_adjust(t->mc, tmp_reg, 29, off);
- *out_off = 0;
- return tmp_reg;
- }
- if (addr.kind == OPK_INDIRECT) {
- i32 off = addr.v.ind.ofs;
- u32 base = addr.v.ind.base & 0x1f;
- if (off >= -256 && off <= 255) {
- *out_off = off;
- return base;
- }
- aa64_emit_addr_adjust(t->mc, tmp_reg, base, off);
- *out_off = 0;
- return tmp_reg;
- }
- if (addr.kind == OPK_GLOBAL) {
- emit_global_addr(t, tmp_reg, addr.v.global.sym, addr.v.global.addend);
- *out_off = 0;
- return tmp_reg;
- }
- compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d",
- (int)addr.kind);
-}
-
-void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) {
- u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
- u32 sidx = size_idx_for_bytes(sz);
-
- if (addr.kind == OPK_GLOBAL) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- ObjSymId sym = addr.v.global.sym;
- i64 add = addr.v.global.addend;
- if (use_got_for_sym(t, sym)) {
- aa64_emit_got_load_addr(t, AA_TMP0, sym);
- if (dst.cls == RC_FP) {
- aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP0, (i32)add));
- } else {
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP0, (i32)add));
- }
- return;
- }
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(AA_TMP0));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add,
- 0, 0);
- u32 ld_pos = mc->pos(mc);
- if (dst.cls == RC_FP) {
- aa64_emit32(mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0));
- } else {
- aa64_emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), AA_TMP0, 0));
- }
- mc->emit_reloc_at(mc, sec, ld_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0);
- return;
- }
-
- i32 off;
- u32 base = addr_base(t, addr, &off, AA_TMP0);
- if (dst.cls == RC_FP) {
- aa64_emit32(t->mc, aa64_ldur_fp(sidx, reg_num(dst), base, off));
- } else {
- aa64_emit32(t->mc, aa64_ldur(sidx, reg_num(dst), base, off));
- }
-}
-
-void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) {
- u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
- u32 sidx = size_idx_for_bytes(sz);
-
- if (addr.kind == OPK_GLOBAL) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- ObjSymId sym = addr.v.global.sym;
- i64 add = addr.v.global.addend;
-
- u32 src_reg;
- u32 src_is_fp = 0;
- if (src.kind == OPK_IMM) {
- u32 sf = (sz == 8) ? 1u : 0u;
- aa64_emit_load_imm(mc, sf, AA_TMP0, src.v.imm);
- src_reg = AA_TMP0;
- } else if (src.cls == RC_FP) {
- src_reg = reg_num(src);
- src_is_fp = 1;
- } else {
- src_reg = reg_num(src);
- }
- u32 base = (src.kind == OPK_IMM) ? AA_TMP1 : AA_TMP0;
- if (use_got_for_sym(t, sym)) {
- aa64_emit_got_load_addr(t, base, sym);
- if (src_is_fp) {
- aa64_emit32(mc, aa64_stur_fp(sidx, src_reg, base, (i32)add));
- } else {
- aa64_emit32(mc, aa64_stur(sidx, src_reg, base, (i32)add));
- }
- return;
- }
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(base));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add,
- 0, 0);
- u32 st_pos = mc->pos(mc);
- if (src_is_fp) {
- aa64_emit32(mc, aa64_str_fp_uimm(sidx, src_reg, base, 0));
- } else {
- aa64_emit32(mc, aa64_str_uimm(sidx, src_reg, base, 0));
- }
- mc->emit_reloc_at(mc, sec, st_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0);
- return;
- }
-
- i32 off;
- u32 addr_tmp = (src.kind == OPK_IMM) ? AA_TMP1 : AA_TMP0;
- u32 base = addr_base(t, addr, &off, addr_tmp);
-
- if (src.kind == OPK_IMM) {
- u32 sf = (sz == 8) ? 1u : 0u;
- aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm);
- aa64_emit32(t->mc, aa64_stur(sidx, AA_TMP0, base, off));
- return;
- }
- if (src.cls == RC_FP) {
- aa64_emit32(t->mc, aa64_stur_fp(sidx, reg_num(src), base, off));
- } else {
- aa64_emit32(t->mc, aa64_stur(sidx, reg_num(src), base, off));
- }
-}
-
-static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) {
- AAImpl* a = impl_of(t);
- if (lv.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, lv.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot");
- aa64_emit32(t->mc, aa64_sub_imm(1, reg_num(dst), 29, s->off, 0));
- return;
- }
- if (lv.kind == OPK_INDIRECT) {
- i32 ofs = lv.v.ind.ofs;
- u32 base = lv.v.ind.base & 0x1f;
- if (ofs == 0) {
- aa64_emit32(t->mc, aa64_mov_reg(1, reg_num(dst), base));
- } else if (ofs > 0 && ofs <= 0xfff) {
- aa64_emit32(t->mc, aa64_add_imm(1, reg_num(dst), base, (u32)ofs, 0));
- } else if (ofs < 0 && -ofs <= 0xfff) {
- aa64_emit32(t->mc, aa64_sub_imm(1, reg_num(dst), base, (u32)(-ofs), 0));
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 addr_of: indirect offset %d unsupported", ofs);
- }
- return;
- }
- if (lv.kind == OPK_GLOBAL) {
- u32 rd = reg_num(dst);
- ObjSymId sym = lv.v.global.sym;
- i64 addend = lv.v.global.addend;
- if (use_got_for_sym(t, sym)) {
- aa64_emit_got_load_addr(t, rd, sym);
- if (addend) aa64_emit_addr_adjust(t->mc, rd, rd, (i32)addend);
- return;
- }
- u32 sec = t->mc->section_id;
- u32 adrp_pos = t->mc->pos(t->mc);
- aa64_emit32(t->mc, aa64_adrp_base(rd));
- t->mc->emit_reloc_at(t->mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym,
- addend, 0, 0);
- u32 add_pos = t->mc->pos(t->mc);
- aa64_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0));
- t->mc->emit_reloc_at(t->mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym,
- addend, 0, 0);
- return;
- }
- compiler_panic(t->c, impl_of(t)->loc, "aarch64: addr_of not implemented");
-}
-
-static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) {
- MCEmitter* mc = t->mc;
- u32 sec = mc->section_id;
- u32 rd = reg_num(dst);
-
- if (obj_format_tls_via_descriptor(t->c)) {
- /* TLV access via per-variable descriptor (Mach-O TLVP). The thunk's
- * ABI is custom — x0 in/out as descriptor → TLV addr, all other
- * regs preserved — so we materialize via x0 and copy to `dst` only
- * when they differ. x0/x1 are scratch here (the regalloc only hands
- * out x19-x28), and x30 was saved at the prologue.
- *
- * adrp x0, sym@TLVPPAGE ; R_AARCH64_TLVP_LOAD_PAGE21
- * ldr x0, [x0, sym@TLVPPAGEOFF] ; R_AARCH64_TLVP_LOAD_PAGEOFF12
- * ldr x1, [x0] ; descriptor[0] = thunk pointer
- * blr x1 ; x0 in/out
- * mov xdst, x0 ; only if dst != x0
- *
- * TLVP relocs do not carry an addend; nonzero addends are applied
- * after the call as a follow-on ADD/SUB on `dst`. */
- u32 adrp_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_adrp_base(/*Rd=*/0));
- mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_TLVP_LOAD_PAGE21, sym, 0, 0,
- 0);
- u32 ldr_pos = mc->pos(mc);
- aa64_emit32(mc,
- aa64_ldr_uimm(/*size=*/3, /*Rt=*/0, /*Rn=*/0, /*byte_off=*/0));
- mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_TLVP_LOAD_PAGEOFF12, sym, 0,
- 0, 0);
- aa64_emit32(mc,
- aa64_ldr_uimm(/*size=*/3, /*Rt=*/1, /*Rn=*/0, /*byte_off=*/0));
- aa64_emit32(mc, aa64_blr(/*Rn=*/1));
- if (rd != 0) aa64_emit32(mc, aa64_mov_reg(/*sf=*/1, rd, /*Rm=*/0));
- if (addend) aa64_emit_addr_adjust(mc, rd, rd, (i32)addend);
- return;
- }
-
- aa64_emit32(mc, aa64_mrs_tpidr_el0(AA_TMP0));
-
- u32 hi_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, AA_TMP0, /*imm12=*/0, /*sh=*/1));
- mc->emit_reloc_at(mc, sec, hi_pos, R_AARCH64_TLSLE_ADD_TPREL_HI12, sym,
- addend, 0, 0);
-
- u32 lo_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/rd, /*imm12=*/0, /*sh=*/0));
- mc->emit_reloc_at(mc, sec, lo_pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, sym,
- addend, 0, 0);
-}
-
-/* ============================================================
- * Aggregate helpers
- * ============================================================ */
-
-static u32 agg_addr_reg(CGTarget* t, Operand op, u32 scratch) {
- if (op.kind == OPK_REG) return reg_num(op);
- if (op.kind == OPK_LOCAL) {
- AAImpl* a = impl_of(t);
- AASlot* s = aa64_slot_get(a, op.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 agg: bad slot");
- aa64_emit32(t->mc, aa64_sub_imm(1, scratch, 29, s->off, 0));
- return scratch;
- }
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 agg: address kind %d unsupported", (int)op.kind);
-}
-
-static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr,
- AggregateAccess agg) {
- MCEmitter* mc = t->mc;
- u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0);
- u32 sr = agg_addr_reg(t, src_addr,
- (dr == AA_TMP1) ? AA_TMP2 : AA_TMP1);
- u32 nbytes = agg.size;
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i));
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i));
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i));
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i));
- i += 1;
- }
-}
-
-static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value,
- AggregateAccess agg) {
- MCEmitter* mc = t->mc;
- u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0);
-
- u32 byte;
- if (byte_value.kind == OPK_IMM) {
- byte = (u32)(byte_value.v.imm & 0xffu);
- } else {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 set_bytes: REG byte not yet supported");
- }
- u32 nbytes = agg.size;
-
- if (byte == 0) {
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit32(mc, aa64_stur(3, 31, dr, (i32)i));
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit32(mc, aa64_stur(2, 31, dr, (i32)i));
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit32(mc, aa64_stur(1, 31, dr, (i32)i));
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit32(mc, aa64_stur(0, 31, dr, (i32)i));
- i += 1;
- }
- return;
- }
-
- u64 b64 = byte;
- b64 |= b64 << 8;
- b64 |= b64 << 16;
- b64 |= b64 << 32;
- aa64_emit_load_imm(mc, /*sf=*/1u, AA_TMP1, (i64)b64);
-
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit32(mc, aa64_stur(3, AA_TMP1, dr, (i32)i));
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit32(mc, aa64_stur(2, AA_TMP1, dr, (i32)i));
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit32(mc, aa64_stur(1, AA_TMP1, dr, (i32)i));
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit32(mc, aa64_stur(0, AA_TMP1, dr, (i32)i));
- i += 1;
- }
-}
-
-/* ============================================================
- * Bitfields
- * ============================================================ */
-
-static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr,
- BitFieldAccess bf) {
- MCEmitter* mc = t->mc;
- u32 base = agg_addr_reg(t, record_addr, AA_TMP0);
- u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
- u32 sf = (storage_bytes == 8u) ? 1u : 0u;
- u32 sidx = size_idx_for_bytes(storage_bytes);
- u32 rd = reg_num(dst);
-
- aa64_emit32(mc, aa64_ldur(sidx, rd, base, (i32)bf.storage_offset));
- u32 lsb = bf.bit_offset;
- u32 width = bf.bit_width ? bf.bit_width : 1u;
- u32 imms = lsb + width - 1u;
- if (bf.signed_) {
- aa64_emit32(mc, aa64_sbfm(sf, rd, rd, lsb, imms));
- } else {
- aa64_emit32(mc, aa64_ubfm(sf, rd, rd, lsb, imms));
- }
-}
-
-static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src,
- BitFieldAccess bf) {
- MCEmitter* mc = t->mc;
- u32 base = agg_addr_reg(t, record_addr, AA_TMP0);
- u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u;
- u32 sf = (storage_bytes == 8u) ? 1u : 0u;
- u32 sidx = size_idx_for_bytes(storage_bytes);
-
- aa64_emit32(mc, aa64_ldur(sidx, AA_TMP1, base, (i32)bf.storage_offset));
-
- u32 src_reg;
- if (src.kind == OPK_IMM) {
- aa64_emit_load_imm(mc, sf, AA_TMP2, src.v.imm);
- src_reg = AA_TMP2;
- } else if (src.kind == OPK_REG) {
- src_reg = reg_num(src);
- } else {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 bitfield_store: src kind %d unsupported",
- (int)src.kind);
- }
-
- u32 reg_size = sf ? 64u : 32u;
- u32 lsb = bf.bit_offset;
- u32 width = bf.bit_width ? bf.bit_width : 1u;
- u32 immr = (reg_size - lsb) % reg_size;
- u32 imms = width - 1u;
- aa64_emit32(mc, aa64_bfm(sf, AA_TMP1, src_reg, immr, imms));
-
- aa64_emit32(mc, aa64_stur(sidx, AA_TMP1, base, (i32)bf.storage_offset));
-}
-
-/* ============================================================
- * Arithmetic helpers
- * ============================================================ */
-
-u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch) {
- if (op.kind == OPK_REG) return reg_num(op);
- if (op.kind == OPK_IMM) {
- aa64_emit_load_imm(t->mc, sf, scratch, op.v.imm);
- return scratch;
- }
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 binop: operand kind %d unsupported", (int)op.kind);
-}
-
-static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op,
- Operand b_op) {
- MCEmitter* mc = t->mc;
-
- if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) {
- if (a_op.kind != OPK_REG || b_op.kind != OPK_REG || dst.cls != RC_FP) {
- compiler_panic(t->c, impl_of(t)->loc,
- "aarch64 binop: FP op requires REG operands");
- }
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- u32 rd = reg_num(dst);
- u32 rn = reg_num(a_op);
- u32 rm = reg_num(b_op);
- u32 w;
- switch (op) {
- case BO_FADD: w = aa64_fadd(type, rd, rn, rm); break;
- case BO_FSUB: w = aa64_fsub(type, rd, rn, rm); break;
- case BO_FMUL: w = aa64_fmul(type, rd, rn, rm); break;
- case BO_FDIV: w = aa64_fdiv(type, rd, rn, rm); break;
- default: w = 0; break;
- }
- aa64_emit32(mc, w);
- return;
- }
-
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 rd = reg_num(dst);
-
- switch (op) {
- case BO_IADD:
- case BO_AND:
- case BO_OR:
- case BO_XOR: {
- if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) {
- Operand t_op = a_op; a_op = b_op; b_op = t_op;
- }
- break;
- }
- default: break;
- }
-
- if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
- u32 rn_reg = reg_num(a_op);
- i64 imm = b_op.v.imm;
- u32 imm12, sh, N, immr, imms;
- switch (op) {
- case BO_IADD:
- if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
- aa64_emit32(mc, aa64_add_imm(sf, rd, rn_reg, imm12, sh));
- return;
- }
- break;
- case BO_ISUB:
- if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
- aa64_emit32(mc, aa64_sub_imm(sf, rd, rn_reg, imm12, sh));
- return;
- }
- break;
- case BO_AND:
- if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
- aa64_emit32(mc, aa64_and_imm(sf, rd, rn_reg, N, immr, imms));
- return;
- }
- break;
- case BO_OR:
- if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
- aa64_emit32(mc, aa64_orr_imm(sf, rd, rn_reg, N, immr, imms));
- return;
- }
- break;
- case BO_XOR:
- if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
- aa64_emit32(mc, aa64_eor_imm(sf, rd, rn_reg, N, immr, imms));
- return;
- }
- break;
- case BO_SHL: {
- u32 width = sf ? 64u : 32u;
- u32 sh_amt = (u32)((u64)imm & (width - 1u));
- if (aa64_lsl_imm_fields(sh_amt, sf, &immr, &imms)) {
- aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
- return;
- }
- break;
- }
- case BO_SHR_U: {
- u32 width = sf ? 64u : 32u;
- u32 sh_amt = (u32)((u64)imm & (width - 1u));
- if (aa64_lsr_imm_fields(sh_amt, sf, &immr, &imms)) {
- aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
- return;
- }
- break;
- }
- case BO_SHR_S: {
- u32 width = sf ? 64u : 32u;
- u32 sh_amt = (u32)((u64)imm & (width - 1u));
- if (aa64_asr_imm_fields(sh_amt, sf, &immr, &imms)) {
- aa64_emit32(mc, aa64_sbfm(sf, rd, rn_reg, immr, imms));
- return;
- }
- break;
- }
- default: break;
- }
- }
-
- u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 rm =
- aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0);
-
- u32 word;
- switch (op) {
- case BO_IADD: word = aa64_add(sf, rd, rn, rm); break;
- case BO_ISUB: word = aa64_sub(sf, rd, rn, rm); break;
- case BO_IMUL: word = aa64_mul(sf, rd, rn, rm); break;
- case BO_AND: word = aa64_and(sf, rd, rn, rm); break;
- case BO_OR: word = aa64_orr(sf, rd, rn, rm); break;
- case BO_XOR: word = aa64_eor(sf, rd, rn, rm); break;
- case BO_SHL: word = aa64_lslv(sf, rd, rn, rm); break;
- case BO_SHR_U: word = aa64_lsrv(sf, rd, rn, rm); break;
- case BO_SHR_S: word = aa64_asrv(sf, rd, rn, rm); break;
- case BO_UDIV: word = aa64_udiv(sf, rd, rn, rm); break;
- case BO_SDIV: word = aa64_sdiv(sf, rd, rn, rm); break;
- case BO_SREM:
- aa64_emit32(mc, aa64_sdiv(sf, AA_TMP2, rn, rm));
- word = aa64_msub(sf, rd, AA_TMP2, rm, rn);
- break;
- case BO_UREM:
- aa64_emit32(mc, aa64_udiv(sf, AA_TMP2, rn, rm));
- word = aa64_msub(sf, rd, AA_TMP2, rm, rn);
- break;
- case BO_FADD:
- case BO_FSUB:
- case BO_FMUL:
- case BO_FDIV:
- default:
- compiler_panic(t->c, impl_of(t)->loc, "aarch64 binop: op %d unimpl",
- (int)op);
- }
- aa64_emit32(mc, word);
-}
-
-static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) {
- MCEmitter* mc = t->mc;
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 rd = reg_num(dst);
- u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 word;
-
- switch (op) {
- case UO_NEG:
- word = aa64_neg(sf, rd, rn);
- break;
- case UO_BNOT:
- word = aa64_mvn(sf, rd, rn);
- break;
- case UO_NOT:
- aa64_emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0));
- word = aa64_cset_eq(sf, rd);
- break;
- default:
- compiler_panic(t->c, impl_of(t)->loc, "aarch64 unop: op %d unimpl",
- (int)op);
- }
- aa64_emit32(mc, word);
-}
-
-static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 rd = reg_num(dst);
- u32 rn = reg_num(src);
-
- switch (k) {
- case CV_SEXT: {
- if (src.cls != RC_INT || dst.cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 convert SEXT: bad classes");
- }
- u32 src_bits = type_byte_size(src.type) * 8u;
- u32 sf_dst = type_is_64(dst.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_sbfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
- return;
- }
- case CV_ZEXT: {
- if (src.cls != RC_INT || dst.cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 convert ZEXT: bad classes");
- }
- u32 src_bits = type_byte_size(src.type) * 8u;
- if (src_bits == 32u) {
- aa64_emit32(mc, aa64_mov_reg(0, rd, rn));
- } else {
- aa64_emit32(mc, aa64_ubfm(0, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
- }
- return;
- }
- case CV_TRUNC: {
- aa64_emit32(mc, aa64_mov_reg(0, rd, rn));
- return;
- }
- case CV_ITOF_S: {
- u32 sf_src = type_is_64(src.type) ? 1u : 0u;
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_scvtf(sf_src, type, rd, rn));
- return;
- }
- case CV_ITOF_U: {
- u32 sf_src = type_is_64(src.type) ? 1u : 0u;
- u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_ucvtf(sf_src, type, rd, rn));
- return;
- }
- case CV_FTOI_S: {
- if (src.cls != RC_FP || dst.cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes");
- }
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 type = type_is_fp_double(src.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_fcvtzs(sf, type, rd, rn));
- return;
- }
- case CV_FTOI_U: {
- if (src.cls != RC_FP || dst.cls != RC_INT) {
- compiler_panic(t->c, a->loc, "aarch64 convert FTOI_U: bad classes");
- }
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- u32 type = type_is_fp_double(src.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_fcvtzu(sf, type, rd, rn));
- return;
- }
- case CV_FEXT: {
- aa64_emit32(mc, aa64_fcvt_d_s(rd, rn));
- return;
- }
- case CV_FTRUNC: {
- aa64_emit32(mc, aa64_fcvt_s_d(rd, rn));
- return;
- }
- case CV_BITCAST: {
- if (src.cls == RC_INT && dst.cls == RC_FP) {
- u32 sz = type_byte_size(dst.type);
- aa64_emit32(mc, sz == 8 ? aa64_fmov_d_x(rd, rn) : aa64_fmov_s_w(rd, rn));
- } else if (src.cls == RC_FP && dst.cls == RC_INT) {
- u32 sz = type_byte_size(src.type);
- aa64_emit32(mc, sz == 8 ? aa64_fmov_x_d(rd, rn) : aa64_fmov_w_s(rd, rn));
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 convert BITCAST: same-class not yet supported");
- }
- return;
- }
- default:
- compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k);
- }
-}
-
-/* ============================================================
- * Calls
- * ============================================================ */
-
-static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi,
- const CGABIValue* av, u32* next_int, u32* next_fp,
- u32* stack_off) {
- AAImpl* a = impl_of(t);
- ABIArgInfo va_ai;
- ABIArgPart va_pt;
- const ABIArgInfo* ai = av->abi;
- if (!ai) {
- u32 sz = type_byte_size(av->type);
- memset(&va_ai, 0, sizeof va_ai);
- memset(&va_pt, 0, sizeof va_pt);
- va_ai.kind = ABI_ARG_DIRECT;
- va_ai.parts = &va_pt;
- va_ai.nparts = 1;
- va_pt.cls = (av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT;
- va_pt.size = sz;
- va_pt.align = sz;
- va_pt.src_offset = 0;
- ai = &va_ai;
- if (fi && fi->vararg_on_stack) {
- *next_int = 8;
- *next_fp = 8;
- }
- }
- if (ai->kind == ABI_ARG_IGNORE) return;
-
- if (ai->kind == ABI_ARG_INDIRECT) {
- u32 dst_reg;
- int to_stack = (*next_int >= 8);
- if (!to_stack)
- dst_reg = (*next_int)++;
- else
- dst_reg = AA_TMP0;
- if (av->storage.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot");
- aa64_emit32(t->mc, aa64_sub_imm(1, dst_reg, 29, s->off, 0));
- } else if (av->storage.kind == OPK_INDIRECT) {
- aa64_emit_addr_adjust(t->mc, dst_reg, av->storage.v.ind.base & 0x1f,
- av->storage.v.ind.ofs);
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 call: INDIRECT arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- if (to_stack) {
- aa64_emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off));
- *stack_off += 8;
- }
- return;
- }
-
- for (u16 i = 0; i < ai->nparts; ++i) {
- const ABIArgPart* pt = &ai->parts[i];
- u32 sz = pt->size;
- u32 sidx = size_idx_for_bytes(sz);
-
- if (pt->cls == ABI_CLASS_INT) {
- int to_stack = (*next_int >= 8);
- u32 dst_reg = to_stack ? AA_TMP0 : (*next_int)++;
- switch (av->storage.kind) {
- case OPK_IMM: {
- u32 sf = (sz == 8) ? 1u : 0u;
- aa64_emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm);
- break;
- }
- case OPK_REG: {
- u32 sf = (sz == 8) ? 1u : 0u;
- aa64_emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage)));
- break;
- }
- case OPK_LOCAL: {
- AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot");
- i32 off = -(i32)s->off + (i32)pt->src_offset;
- aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off));
- break;
- }
- case OPK_INDIRECT: {
- Operand src;
- memset(&src, 0, sizeof src);
- src.kind = OPK_INDIRECT;
- src.v.ind.base = av->storage.v.ind.base;
- src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
- i32 off;
- u32 base = addr_base(t, src, &off, AA_TMP0);
- aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, base, off));
- break;
- }
- default:
- compiler_panic(t->c, a->loc,
- "aarch64 call: arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- if (to_stack) {
- aa64_emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off));
- *stack_off += 8;
- }
- } else if (pt->cls == ABI_CLASS_FP) {
- int to_stack = (*next_fp >= 8);
- if (!to_stack) {
- u32 dst_reg = (*next_fp)++;
- switch (av->storage.kind) {
- case OPK_REG: {
- u32 type = (sz == 8) ? 1u : 0u;
- aa64_emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage)));
- break;
- }
- case OPK_INDIRECT: {
- Operand src;
- memset(&src, 0, sizeof src);
- src.kind = OPK_INDIRECT;
- src.v.ind.base = av->storage.v.ind.base;
- src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
- i32 off;
- u32 base = addr_base(t, src, &off, AA_TMP0);
- aa64_emit32(t->mc, aa64_ldur_fp(sidx, dst_reg, base, off));
- break;
- }
- default:
- compiler_panic(t->c, a->loc,
- "aarch64 call: FP arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- } else {
- switch (av->storage.kind) {
- case OPK_REG:
- aa64_emit32(t->mc, aa64_stur_fp(sidx, reg_num(av->storage), 31,
- (i32)*stack_off));
- break;
- case OPK_INDIRECT: {
- Operand src;
- memset(&src, 0, sizeof src);
- src.kind = OPK_INDIRECT;
- src.v.ind.base = av->storage.v.ind.base;
- src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset;
- i32 off;
- u32 base = addr_base(t, src, &off, AA_TMP0);
- aa64_emit32(t->mc, aa64_ldur_fp(sidx, AA_FP_TMP0, base, off));
- aa64_emit32(t->mc, aa64_stur_fp(sidx, AA_FP_TMP0, 31, (i32)*stack_off));
- break;
- }
- default:
- compiler_panic(
- t->c, a->loc,
- "aarch64 call: FP stack-arg storage kind %d unsupported",
- (int)av->storage.kind);
- }
- *stack_off += 8;
- }
- } else {
- compiler_panic(t->c, a->loc, "aarch64 call: ABI class %d unimpl",
- (int)pt->cls);
- }
- }
-}
-
-static void aa_call(CGTarget* t, const CGCallDesc* d) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- u32 next_int = 0, next_fp = 0, stack_off = 0;
-
- if (d->abi && d->abi->has_sret) {
- if (d->ret.storage.kind != OPK_LOCAL) {
- compiler_panic(t->c, a->loc,
- "aarch64 call: sret destination must be LOCAL");
- }
- AASlot* s = aa64_slot_get(a, d->ret.storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot");
- aa64_emit32(mc, aa64_sub_imm(1, 8, 29, s->off, 0));
- }
-
- for (u32 i = 0; i < d->nargs; ++i) {
- emit_arg_value(t, d->abi, &d->args[i], &next_int, &next_fp, &stack_off);
- }
-
- u32 needed = (stack_off + 15u) & ~15u;
- if (needed > a->max_outgoing) a->max_outgoing = needed;
-
- if (d->callee.kind == OPK_GLOBAL) {
- u32 bl_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_bl_base());
- mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26,
- d->callee.v.global.sym, d->callee.v.global.addend, 0, 0);
- } else if (d->callee.kind == OPK_REG) {
- aa64_emit32(mc, aa64_blr(reg_num(d->callee)));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 call: callee kind %d unsupported",
- (int)d->callee.kind);
- }
-
- const ABIArgInfo* ri = &d->abi->ret;
- if (ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) {
- return;
- }
- if (ri->nparts == 0) return;
-
- Operand rs = d->ret.storage;
- u32 next_int_ret = 0, next_fp_ret = 0;
- for (u16 i = 0; i < ri->nparts; ++i) {
- const ABIArgPart* p = &ri->parts[i];
- u32 src_reg;
- if (p->cls == ABI_CLASS_INT) {
- src_reg = next_int_ret++;
- } else if (p->cls == ABI_CLASS_FP) {
- src_reg = next_fp_ret++;
- } else {
- compiler_panic(t->c, a->loc, "aarch64 call: ret part cls %d unimpl",
- (int)p->cls);
- }
-
- if (rs.kind == OPK_REG) {
- if (ri->nparts != 1) {
- compiler_panic(t->c, a->loc,
- "aarch64 call: REG ret_storage with %u parts",
- (unsigned)ri->nparts);
- }
- if (p->cls == ABI_CLASS_INT) {
- u32 sf = (p->size == 8) ? 1u : 0u;
- aa64_emit32(mc, aa64_mov_reg(sf, reg_num(rs), src_reg));
- } else {
- u32 type = (p->size == 8) ? 1u : 0u;
- aa64_emit32(mc, aa64_fmov_reg(type, reg_num(rs), src_reg));
- }
- } else if (rs.kind == OPK_LOCAL || rs.kind == OPK_INDIRECT) {
- u32 base_reg;
- i32 base_off;
- if (rs.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, rs.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot");
- base_reg = 29;
- base_off = -(i32)s->off;
- } else {
- base_reg = rs.v.ind.base & 0x1f;
- base_off = rs.v.ind.ofs;
- }
- u32 sidx = size_idx_for_bytes(p->size);
- i32 off = base_off + (i32)p->src_offset;
- if (p->cls == ABI_CLASS_INT) {
- aa64_emit32(mc, aa64_stur(sidx, src_reg, base_reg, off));
- } else {
- aa64_emit32(mc, aa64_stur_fp(sidx, src_reg, base_reg, off));
- }
- } else if (rs.kind == OPK_IMM && rs.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID)) {
- /* void return placeholder */
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 call: ret_storage kind %d unsupported",
- (int)rs.kind);
- }
- }
-}
-
-static void aa_ret(CGTarget* t, const CGABIValue* val) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- if (val) {
- const ABIArgInfo* ri = val->abi;
- if (ri && ri->kind == ABI_ARG_INDIRECT) {
- if (val->storage.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot");
- if (a->sret_ptr_slot != FRAME_SLOT_NONE) {
- AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot);
- if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off));
- }
- u32 nbytes = s->size;
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP0, 29, -(i32)s->off + (i32)i));
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i));
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(2, AA_TMP0, 29, -(i32)s->off + (i32)i));
- aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i));
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(1, AA_TMP0, 29, -(i32)s->off + (i32)i));
- aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i));
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit32(mc, aa64_ldur(0, AA_TMP0, 29, -(i32)s->off + (i32)i));
- aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i));
- i += 1;
- }
- } else if (val->storage.kind == OPK_INDIRECT) {
- u32 nbytes = val->size;
- if (!nbytes) {
- compiler_panic(t->c, a->loc,
- "aarch64 ret indirect: missing aggregate size");
- }
- if (a->sret_ptr_slot != FRAME_SLOT_NONE) {
- AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot);
- if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off));
- }
- u32 base_reg = val->storage.v.ind.base & 0x1f;
- i32 base_off = val->storage.v.ind.ofs;
- u32 i = 0;
- while (i + 8 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP0, base_reg, base_off + (i32)i));
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i));
- i += 8;
- }
- while (i + 4 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(2, AA_TMP0, base_reg, base_off + (i32)i));
- aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i));
- i += 4;
- }
- while (i + 2 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(1, AA_TMP0, base_reg, base_off + (i32)i));
- aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i));
- i += 2;
- }
- while (i < nbytes) {
- aa64_emit32(mc, aa64_ldur(0, AA_TMP0, base_reg, base_off + (i32)i));
- aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i));
- i += 1;
- }
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 ret indirect: storage kind %d unsupported",
- (int)val->storage.kind);
- }
- } else if (val->storage.kind == OPK_REG) {
- if (val->storage.cls == RC_FP) {
- u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage)));
- } else {
- u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
- }
- } else if (val->storage.kind == OPK_IMM) {
- u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
- aa64_emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm);
- } else if (val->storage.kind == OPK_LOCAL ||
- val->storage.kind == OPK_INDIRECT) {
- u32 base_reg;
- i32 base_off;
- if (val->storage.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot");
- base_reg = 29;
- base_off = -(i32)s->off;
- } else {
- base_reg = val->storage.v.ind.base & 0x1f;
- base_off = val->storage.v.ind.ofs;
- }
- const ABIArgInfo* ri2 = val->abi;
- for (u16 i = 0; i < (ri2 ? ri2->nparts : 0); ++i) {
- const ABIArgPart* pt = &ri2->parts[i];
- u32 sidx = size_idx_for_bytes(pt->size);
- i32 off = base_off + (i32)pt->src_offset;
- if (pt->cls == ABI_CLASS_INT) {
- aa64_emit32(mc, aa64_ldur(sidx, /*Rt=*/i, base_reg, off));
- } else if (pt->cls == ABI_CLASS_FP) {
- aa64_emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, base_reg, off));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl",
- (int)pt->cls);
- }
- }
- }
- }
- u32 bpos = mc->pos(mc);
- aa64_emit32(mc, aa64_b_base());
- mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0);
- (void)bpos;
-}
-
-/* ============================================================
- * alloca
- * ============================================================ */
-
-static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
-
- if (d.kind != OPK_REG) {
- compiler_panic(t->c, a->loc, "aarch64 alloca: dst must be REG");
- }
- if (align > 16) {
- compiler_panic(t->c, a->loc,
- "aarch64 alloca: align %u > 16 not yet supported", align);
- }
-
- if (sz.kind == OPK_IMM) {
- i64 v = sz.v.imm;
- if (v < 0) {
- compiler_panic(t->c, a->loc, "aarch64 alloca: negative size");
- }
- u64 aligned = ((u64)v + 15u) & ~(u64)15u;
- if (aligned == 0) aligned = 16;
- if (aligned > 0xfffu) {
- compiler_panic(t->c, a->loc,
- "aarch64 alloca: const size %llu too large for v1",
- (unsigned long long)aligned);
- }
- aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=SP*/ 31, (u32)aligned, 0));
- } else if (sz.kind == OPK_REG) {
- u32 sz_reg = reg_num(sz);
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, sz_reg, 15u, 0));
- aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 4, 63));
- aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 60, 59));
- aa64_emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/ 31, /*SP*/ 31, AA_TMP0));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 alloca: size kind %d unsupported",
- (int)sz.kind);
- }
-
- if (a->nadd_patches == a->add_patches_cap) {
- u32 ncap = a->add_patches_cap ? a->add_patches_cap * 2 : 4;
- struct AAAllocaPatch* nb =
- arena_array(t->c->tu, struct AAAllocaPatch, ncap);
- if (a->add_patches)
- memcpy(nb, a->add_patches, sizeof(*nb) * a->nadd_patches);
- a->add_patches = nb;
- a->add_patches_cap = ncap;
- }
- u32 dst_reg = reg_num(d);
- a->add_patches[a->nadd_patches].pos = mc->pos(mc);
- a->add_patches[a->nadd_patches].dst_reg = dst_reg;
- a->nadd_patches++;
- aa64_emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/ 31, 0, 0));
- a->has_alloca = 1;
-}
-
-/* ============================================================
- * Varargs
- * ============================================================ */
-
-static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs) {
- if (ofs == 0)
- aa64_emit32(mc, aa64_mov_reg(1, dst, 29));
- else if (ofs > 0 && (u32)ofs <= 0xfff)
- aa64_emit32(mc, aa64_add_imm(1, dst, 29, (u32)ofs, 0));
- else if (ofs < 0 && (u32)(-ofs) <= 0xfff)
- aa64_emit32(mc, aa64_sub_imm(1, dst, 29, (u32)(-ofs), 0));
- else {
- aa64_emit_load_imm(mc, 1, dst, ofs);
- aa64_emit32(mc, aa64_add(1, dst, 29, dst));
- }
-}
-
-static void aa_va_start_(CGTarget* t, Operand ap_op) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- if (!a->is_variadic) {
- compiler_panic(t->c, a->loc, "aarch64 va_start: function not variadic");
- }
- u32 ap = reg_num(ap_op);
- AASlot* gs = aa64_slot_get(a, a->gp_save_slot);
- AASlot* fs = aa64_slot_get(a, a->fp_save_slot);
-
- {
- u32 ofs = 16u + a->next_param_stack;
- if (ofs <= 0xfff)
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0));
- else {
- aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs);
- aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0));
- }
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0));
- }
- emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)gs->size);
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 8));
- emit_fp_off(mc, AA_TMP0, -(i32)fs->off + (i32)fs->size);
- aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 16));
- aa64_emit_load_imm(mc, 0, AA_TMP0,
- (i64)((i32)(a->next_param_int * 8u) - 64));
- aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 24));
- aa64_emit_load_imm(mc, 0, AA_TMP0,
- (i64)((i32)(a->next_param_fp * 16u) - 128));
- aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 28));
-}
-
-static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op,
- CfreeCgTypeId ty) {
- MCEmitter* mc = t->mc;
- u32 ap = reg_num(ap_op);
- int is_fp = (dst.cls == RC_FP);
- u32 offs_field = is_fp ? 28u : 24u;
- u32 top_field = is_fp ? 16u : 8u;
- u32 stride_reg = is_fp ? 16u : 8u;
- u32 sz = type_byte_size(ty);
- u32 sidx = size_idx_for_bytes(sz);
-
- MCLabel L_stack = mc->label_new(mc);
- MCLabel L_done = mc->label_new(mc);
-
- aa64_emit32(mc, aa64_ldur(2, AA_TMP0, ap, (i32)offs_field));
- aa64_emit32(mc, aa64_subs_imm(0, 31, AA_TMP0, 0));
- aa64_emit32(mc, aa64_b_cond(0xa /*GE*/));
- mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0);
-
- aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, (i32)top_field));
- aa64_emit32(mc, aa64_sbfm(1, AA_TMP2, AA_TMP0, 0, 31));
- aa64_emit32(mc, aa64_add(1, AA_TMP2, AA_TMP1, AA_TMP2));
- if (is_fp)
- aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP2, 0));
- else
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP2, 0));
- aa64_emit32(mc, aa64_add_imm(0, AA_TMP0, AA_TMP0, stride_reg, 0));
- aa64_emit32(mc, aa64_stur(2, AA_TMP0, ap, (i32)offs_field));
- aa64_emit32(mc, aa64_b_base());
- mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
-
- mc->label_place(mc, L_stack);
- aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0));
- if (is_fp)
- aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0));
- else
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0));
- aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0));
- aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0));
-
- mc->label_place(mc, L_done);
-}
-
-static void aa_va_end_(CGTarget* t, Operand a) {
- (void)t;
- (void)a;
-}
-
-static void aa_va_copy_(CGTarget* t, Operand d, Operand s) {
- MCEmitter* mc = t->mc;
- u32 dr = reg_num(d);
- u32 sr = reg_num(s);
- for (u32 i = 0; i < 32u; i += 8u) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, (i32)i));
- }
-}
-
-/* ============================================================
- * Atomics
- * ============================================================ */
-
-static inline u32 aa64_ldar(u32 sf64, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC8DFFC00u : 0x88DFFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stlr(u32 sf64, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC89FFC00u : 0x889FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldxr(u32 sf64, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC85F7C00u : 0x885F7C00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC85FFC00u : 0x885FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC8007C00u : 0x88007C00u) | ((Rs & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) {
- return (sf64 ? 0xC800FC00u : 0x8800FC00u) | ((Rs & 0x1f) << 16) |
- ((Rn & 0x1f) << 5) | (Rt & 0x1f);
-}
-static inline u32 aa64_cbnz(u32 sf64, u32 Rt) {
- return 0x35000000u | (sf64 << 31) | (Rt & 0x1f);
-}
-
-static int mem_order_is_acquire(MemOrder o) {
- return o == MO_ACQUIRE || o == MO_ACQ_REL || o == MO_SEQ_CST ||
- o == MO_CONSUME;
-}
-static int mem_order_is_release(MemOrder o) {
- return o == MO_RELEASE || o == MO_ACQ_REL || o == MO_SEQ_CST;
-}
-
-static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma,
- MemOrder ord) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 sf = (ma.size == 8) ? 1u : 0u;
-
- u32 base;
- if (addr.kind == OPK_REG) {
- base = reg_num(addr);
- } else if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot");
- base = AA_TMP0;
- aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0));
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 atomic_load: addr kind %d unsupported",
- (int)addr.kind);
- }
- if (mem_order_is_acquire(ord)) {
- aa64_emit32(mc, aa64_ldar(sf, reg_num(dst), base));
- } else {
- u32 sidx = size_idx_for_bytes(ma.size);
- aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0));
- }
-}
-
-static void aa_atomic_store(CGTarget* t, Operand addr, Operand src,
- MemAccess ma, MemOrder ord) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 sf = (ma.size == 8) ? 1u : 0u;
-
- u32 src_reg;
- if (src.kind == OPK_IMM) {
- src_reg = AA_TMP1;
- aa64_emit_load_imm(mc, sf, src_reg, src.v.imm);
- } else if (src.kind == OPK_REG) {
- src_reg = reg_num(src);
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 atomic_store: src kind %d unsupported",
- (int)src.kind);
- }
- u32 base;
- if (addr.kind == OPK_REG) {
- base = reg_num(addr);
- } else if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot");
- base = AA_TMP0;
- aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0));
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 atomic_store: addr kind %d unsupported",
- (int)addr.kind);
- }
- if (mem_order_is_release(ord)) {
- aa64_emit32(mc, aa64_stlr(sf, src_reg, base));
- } else {
- u32 sidx = size_idx_for_bytes(ma.size);
- aa64_emit32(mc, aa64_stur(sidx, src_reg, base, 0));
- }
-}
-
-static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf, u32 dst_new,
- u32 prior, u32 val) {
- switch (op) {
- case AO_XCHG: aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val)); break;
- case AO_ADD: aa64_emit32(mc, aa64_add(sf, dst_new, prior, val)); break;
- case AO_SUB: aa64_emit32(mc, aa64_sub(sf, dst_new, prior, val)); break;
- case AO_AND: aa64_emit32(mc, aa64_and(sf, dst_new, prior, val)); break;
- case AO_OR: aa64_emit32(mc, aa64_orr(sf, dst_new, prior, val)); break;
- case AO_XOR: aa64_emit32(mc, aa64_eor(sf, dst_new, prior, val)); break;
- case AO_NAND:
- aa64_emit32(mc, aa64_and(sf, dst_new, prior, val));
- aa64_emit32(mc, aa64_mvn(sf, dst_new, dst_new));
- break;
- default:
- aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val));
- break;
- }
-}
-
-static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr,
- Operand val, MemAccess ma, MemOrder ord) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 sf = (ma.size == 8) ? 1u : 0u;
-
- u32 base = AA_TMP0;
- if (addr.kind == OPK_REG) {
- aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr)));
- } else if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot");
- aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported",
- (int)addr.kind);
- }
- u32 vreg = AA_TMP1;
- if (val.kind == OPK_IMM) {
- aa64_emit_load_imm(mc, sf, vreg, val.v.imm);
- } else if (val.kind == OPK_REG) {
- aa64_emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val)));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported",
- (int)val.kind);
- }
-
- int do_acq = mem_order_is_acquire(ord);
- int do_rel = mem_order_is_release(ord);
-
- MCLabel L_retry = mc->label_new(mc);
- mc->label_place(mc, L_retry);
-
- if (do_acq)
- aa64_emit32(mc, aa64_ldaxr(sf, reg_num(dst), base));
- else
- aa64_emit32(mc, aa64_ldxr(sf, reg_num(dst), base));
-
- emit_rmw_combine(mc, op, sf, AA_TMP2, reg_num(dst), vreg);
-
- if (do_rel)
- aa64_emit32(mc, aa64_stlxr(sf, vreg, AA_TMP2, base));
- else
- aa64_emit32(mc, aa64_stxr(sf, vreg, AA_TMP2, base));
-
- u32 cbnz_pos = mc->pos(mc);
- aa64_emit32(mc, aa64_cbnz(0, vreg));
- mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
- (void)cbnz_pos;
-}
-
-static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr,
- Operand expected, Operand desired, MemAccess ma,
- MemOrder succ, MemOrder fail) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- u32 sf = (ma.size == 8) ? 1u : 0u;
- (void)fail;
-
- u32 base = AA_TMP0;
- if (addr.kind == OPK_REG)
- aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr)));
- else if (addr.kind == OPK_LOCAL) {
- AASlot* s = aa64_slot_get(a, addr.v.frame_slot);
- if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot");
- aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0));
- } else {
- compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported",
- (int)addr.kind);
- }
- if (expected.kind == OPK_IMM)
- aa64_emit_load_imm(mc, sf, AA_TMP1, expected.v.imm);
- else if (expected.kind == OPK_REG)
- aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP1, reg_num(expected)));
- else
- compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported",
- (int)expected.kind);
- if (desired.kind == OPK_IMM)
- aa64_emit_load_imm(mc, sf, AA_TMP2, desired.v.imm);
- else if (desired.kind == OPK_REG)
- aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP2, reg_num(desired)));
- else
- compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported",
- (int)desired.kind);
-
- int do_acq = mem_order_is_acquire(succ);
- int do_rel = mem_order_is_release(succ);
-
- MCLabel L_retry = mc->label_new(mc);
- MCLabel L_fail = mc->label_new(mc);
- MCLabel L_done = mc->label_new(mc);
-
- mc->label_place(mc, L_retry);
- if (do_acq)
- aa64_emit32(mc, aa64_ldaxr(sf, reg_num(prior), base));
- else
- aa64_emit32(mc, aa64_ldxr(sf, reg_num(prior), base));
-
- aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, reg_num(prior), AA_TMP1));
- aa64_emit32(mc, aa64_b_cond(0x1u /*NE*/));
- mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0);
-
- if (do_rel)
- aa64_emit32(mc, aa64_stlxr(sf, AA_TMP1, AA_TMP2, base));
- else
- aa64_emit32(mc, aa64_stxr(sf, AA_TMP1, AA_TMP2, base));
- aa64_emit32(mc, aa64_cbnz(0, AA_TMP1));
- mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0);
-
- aa64_emit_load_imm(mc, 0, reg_num(ok), 1);
- aa64_emit32(mc, aa64_b_base());
- mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0);
-
- mc->label_place(mc, L_fail);
- aa64_emit32(mc, aa64_clrex(AA64_BARRIER_OPT_SY));
- aa64_emit_load_imm(mc, 0, reg_num(ok), 0);
-
- mc->label_place(mc, L_done);
-}
-
-static void aa_fence(CGTarget* t, MemOrder o) {
- (void)o;
- if (o == MO_RELAXED) return;
- aa64_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH));
-}
-
-/* ============================================================
- * Intrinsics
- * ============================================================ */
-
-static inline u32 aa64_rev16_w(u32 Rd, u32 Rn) {
- return 0x5AC00400u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_rev_w(u32 Rd, u32 Rn) {
- return 0x5AC00800u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_rev_x(u32 Rd, u32 Rn) {
- return 0xDAC00C00u | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_rbit(u32 sf64, u32 Rd, u32 Rn) {
- return (sf64 ? 0xDAC00000u : 0x5AC00000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_clz(u32 sf64, u32 Rd, u32 Rn) {
- return (sf64 ? 0xDAC01000u : 0x5AC01000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f);
-}
-static inline u32 aa64_cnt_8b(u32 Vd, u32 Vn) {
- return 0x0E205800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f);
-}
-static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn) {
- return 0x0E31B800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f);
-}
-static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
- return 0x2B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) {
- return aa64_dp3_pack((AA64DP3){
- .sf = 1, .op31 = 1, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd});
-}
-static inline u32 aa64_smull(u32 Rd, u32 Rn, u32 Rm) {
- return aa64_smaddl(Rd, Rn, Rm, AA64_ZR);
-}
-static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm) {
- return 0xEB200000u | ((Rm & 0x1f) << 16) | (6u << 13) | ((Rn & 0x1f) << 5) |
- (Rd & 0x1f);
-}
-
-static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd,
- const Operand* args, u32 na) {
- AAImpl* a = impl_of(t);
- MCEmitter* mc = t->mc;
- (void)nd;
-
- switch (kind) {
- case INTRIN_POPCOUNT: {
- Operand src = args[0];
- Operand dst = dsts[0];
- u32 sz_in = type_byte_size(src.type);
- if (sz_in == 8)
- aa64_emit32(mc, aa64_fmov_d_x(AA_FP_TMP0, reg_num(src)));
- else
- aa64_emit32(mc, aa64_fmov_s_w(AA_FP_TMP0, reg_num(src)));
- aa64_emit32(mc, aa64_cnt_8b(AA_FP_TMP0, AA_FP_TMP0));
- aa64_emit32(mc, aa64_addv_b_8b(AA_FP_TMP0, AA_FP_TMP0));
- aa64_emit32(mc, aa64_fmov_w_s(reg_num(dst), AA_FP_TMP0));
- return;
- }
- case INTRIN_CLZ: {
- Operand src = args[0];
- Operand dst = dsts[0];
- u32 sf = type_is_64(src.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src)));
- return;
- }
- case INTRIN_CTZ: {
- Operand src = args[0];
- Operand dst = dsts[0];
- u32 sf = type_is_64(src.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src)));
- aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(dst)));
- return;
- }
- case INTRIN_BSWAP16: {
- aa64_emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0])));
- return;
- }
- case INTRIN_BSWAP32: {
- aa64_emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0])));
- return;
- }
- case INTRIN_BSWAP64: {
- aa64_emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0])));
- return;
- }
- case INTRIN_MEMCPY:
- case INTRIN_MEMMOVE: {
- Operand da = args[0], sa = args[1], nb = args[2];
- if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) {
- compiler_panic(t->c, a->loc,
- "aarch64 intrinsic: %s with non-const n or non-REG ptr",
- kind == INTRIN_MEMCPY ? "memcpy" : "memmove");
- }
- u32 dr = reg_num(da);
- u32 sr = reg_num(sa);
- u32 n = (u32)nb.v.imm;
- if (kind == INTRIN_MEMCPY) {
- u32 i = 0;
- while (i + 8 <= n) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i));
- i += 8;
- }
- while (i + 4 <= n) {
- aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i));
- i += 4;
- }
- while (i + 2 <= n) {
- aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i));
- i += 2;
- }
- while (i < n) {
- aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i));
- i += 1;
- }
- } else {
- u32 i = n;
- while (i >= 8) {
- i -= 8;
- aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i));
- }
- while (i >= 4) {
- i -= 4;
- aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i));
- }
- while (i >= 2) {
- i -= 2;
- aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i));
- }
- while (i >= 1) {
- i -= 1;
- aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i));
- aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i));
- }
- }
- return;
- }
- case INTRIN_MEMSET: {
- Operand da = args[0], bv = args[1], nb = args[2];
- if (da.kind != OPK_REG || nb.kind != OPK_IMM) {
- compiler_panic(
- t->c, a->loc,
- "aarch64 intrinsic: memset with non-const n / non-REG ptr");
- }
- u32 dr = reg_num(da);
- u32 n = (u32)nb.v.imm;
- u32 byte;
- u32 src_reg;
- if (bv.kind == OPK_IMM) {
- byte = (u32)(bv.v.imm & 0xffu);
- if (byte == 0) {
- src_reg = 31u;
- } else {
- u64 b64 = byte;
- b64 |= b64 << 8;
- b64 |= b64 << 16;
- b64 |= b64 << 32;
- aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)b64);
- src_reg = AA_TMP2;
- }
- } else if (bv.kind == OPK_REG) {
- aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)0x0101010101010101ll);
- aa64_emit32(mc, aa64_madd(1, AA_TMP2, reg_num(bv), AA_TMP2, AA64_ZR));
- src_reg = AA_TMP2;
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 intrinsic: memset byte kind %d unsupported",
- (int)bv.kind);
- }
- u32 i = 0;
- while (i + 8 <= n) {
- aa64_emit32(mc, aa64_stur(3, src_reg, dr, (i32)i));
- i += 8;
- }
- while (i + 4 <= n) {
- aa64_emit32(mc, aa64_stur(2, src_reg, dr, (i32)i));
- i += 4;
- }
- while (i + 2 <= n) {
- aa64_emit32(mc, aa64_stur(1, src_reg, dr, (i32)i));
- i += 2;
- }
- while (i < n) {
- aa64_emit32(mc, aa64_stur(0, src_reg, dr, (i32)i));
- i += 1;
- }
- return;
- }
- case INTRIN_PREFETCH:
- (void)args;
- (void)na;
- return;
- case INTRIN_ASSUME_ALIGNED: {
- Operand src = args[0];
- Operand dst = dsts[0];
- if (reg_num(src) != reg_num(dst)) {
- aa64_emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src)));
- }
- return;
- }
- case INTRIN_EXPECT: {
- Operand val = args[0];
- Operand dst = dsts[0];
- u32 sf = type_is_64(dst.type) ? 1u : 0u;
- if (val.kind == OPK_REG) {
- if (reg_num(val) != reg_num(dst)) {
- aa64_emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val)));
- }
- } else if (val.kind == OPK_IMM) {
- aa64_emit_load_imm(mc, sf, reg_num(dst), val.v.imm);
- } else {
- compiler_panic(t->c, a->loc,
- "aarch64 intrinsic: expect val kind %d unsupported",
- (int)val.kind);
- }
- return;
- }
- case INTRIN_UNREACHABLE:
- case INTRIN_TRAP:
- aa64_emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u));
- return;
- case INTRIN_ADD_OVERFLOW:
- case INTRIN_SUB_OVERFLOW: {
- Operand a_op = args[0], b_op = args[1];
- Operand dval = dsts[0], dovf = dsts[1];
- u32 sf = type_is_64(dval.type) ? 1u : 0u;
- u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
- u32 rb =
- aa64_force_reg_int(t, b_op, sf,
- (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0);
- u32 word = (kind == INTRIN_ADD_OVERFLOW)
- ? aa64_adds_reg(sf, reg_num(dval), ra, rb)
- : aa64_subs_reg(sf, reg_num(dval), ra, rb);
- aa64_emit32(mc, word);
- aa64_emit32(mc, aa64_cset(sf, reg_num(dovf), 0x6u /*VS*/));
- return;
- }
- case INTRIN_MUL_OVERFLOW: {
- Operand a_op = args[0], b_op = args[1];
- Operand dval = dsts[0], dovf = dsts[1];
- u32 sf = type_is_64(dval.type) ? 1u : 0u;
- if (sf) {
- compiler_panic(
- t->c, a->loc,
- "aarch64 intrinsic: mul_overflow on i64 not yet supported");
- }
- u32 ra = aa64_force_reg_int(t, a_op, 0, AA_TMP0);
- u32 rb =
- aa64_force_reg_int(t, b_op, 0,
- (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0);
- aa64_emit32(mc, aa64_smull(AA_TMP2, ra, rb));
- aa64_emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/ 31u, AA_TMP2, AA_TMP2));
- aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/));
- aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2));
- return;
- }
- default:
- compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported",
- (int)kind);
- }
-}
-
-/* ============================================================
- * Inline asm block
- * ============================================================ */
-
-static void aa_asm_block(CGTarget* t, const char* tmpl,
- const AsmConstraint* outs, u32 no, Operand* oo,
- const AsmConstraint* ins, u32 ni, const Operand* io,
- const Sym* clobs, u32 nc) {
- AAImpl* a_impl = impl_of(t);
- for (u32 i = 0; i < nc; ++i) {
- Reg phys;
- RegClass cls;
- if (t->resolve_reg_name(t, clobs[i], &phys, &cls) != 0) continue;
- if (cls == RC_INT) {
- if (phys >= 19u && phys <= 28u)
- a_impl->used_cs_int_mask |= 1u << phys;
- } else if (cls == RC_FP) {
- if (phys >= 8u && phys <= 15u)
- a_impl->used_cs_fp_mask |= 1u << phys;
- }
- }
- AA64Asm* a = aa64_asm_open(t->c);
- aa64_inline_bind(a, outs, no, oo, ins, ni, io, clobs, nc);
- aa64_asm_run_template(a, t->mc, tmpl);
- aa64_asm_close(a);
-}
-
-/* ============================================================
- * Lifecycle / vtable constructor
- * ============================================================ */
-
-static void aa_set_loc(CGTarget* t, SrcLoc loc) {
- impl_of(t)->loc = loc;
- t->mc->set_loc(t->mc, loc);
-}
-
-static void aa_finalize(CGTarget* t) { (void)t; }
-
-static void aa_destroy(CGTarget* t) { (void)t; }
-
-static void cgt_cleanup(void* arg) { cgtarget_free((CGTarget*)arg); }
-
-CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
- AAImpl* a = arena_new(c->tu, AAImpl);
- memset(a, 0, sizeof *a);
-
- CGTarget* t = &a->base;
- t->c = c;
- t->obj = o;
- t->mc = m;
-
- t->func_begin = aa_func_begin;
- t->func_end = aa_func_end;
- t->frame_slot = aa_frame_slot;
- t->param = aa_param;
-
- t->load_imm = aa_load_imm;
- t->load_const = aa_load_const;
- t->copy = aa_copy;
- t->load = aa_load;
- t->store = aa_store;
- t->addr_of = aa_addr_of;
- t->tls_addr_of = aa_tls_addr_of;
- t->copy_bytes = aa_copy_bytes;
- t->set_bytes = aa_set_bytes;
- t->bitfield_load = aa_bitfield_load;
- t->bitfield_store = aa_bitfield_store;
-
- t->binop = aa_binop;
- t->unop = aa_unop;
- t->convert = aa_convert;
-
- t->call = aa_call;
- t->ret = aa_ret;
-
- t->alloca_ = aa_alloca_;
- t->va_start_ = aa_va_start_;
- t->va_arg_ = aa_va_arg_;
- t->va_end_ = aa_va_end_;
- t->va_copy_ = aa_va_copy_;
-
- t->atomic_load = aa_atomic_load;
- t->atomic_store = aa_atomic_store;
- t->atomic_rmw = aa_atomic_rmw;
- t->atomic_cas = aa_atomic_cas;
- t->fence = aa_fence;
-
- t->intrinsic = aa_intrinsic;
- t->asm_block = aa_asm_block;
-
- t->set_loc = aa_set_loc;
- t->finalize = aa_finalize;
- t->destroy = aa_destroy;
-
- /* alloc/label/scope vtable entries */
- aa_alloc_vtable_init(t);
- aa_coord_vtable_init(t);
-
- /* Suppress unused warning. */
- (void)type_is_signed;
-
- compiler_defer(c, cgt_cleanup, t);
- return t;
-}
diff --git a/src/arch/aarch64/opt_coord.c b/src/arch/aarch64/opt_coord.c
@@ -1,96 +0,0 @@
-/* aarch64/opt_coord.c — opt/backend register coordination hooks.
- * Static arrays so opt_machinize can query the backend instead of
- * hard-coding arch knowledge. */
-
-#include "arch/aarch64/internal.h"
-
-/* ============================================================
- * Static register tables reported to caller-owned allocators. */
-
-static const Reg aa_int_allocable[] = {19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28};
-static const Reg aa_fp_allocable[] = {8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23};
-
-static const Reg aa_int_scratch[] = {16, 17};
-static const Reg aa_fp_scratch[] = {24, 25};
-
-/* ============================================================
- * Vtable methods */
-
-static void aa_get_allocable_regs(CGTarget* t, RegClass cls,
- const Reg** out, u32* nregs) {
- (void)t;
- switch (cls) {
- case RC_INT:
- *out = aa_int_allocable;
- *nregs = sizeof aa_int_allocable / sizeof aa_int_allocable[0];
- break;
- case RC_FP:
- *out = aa_fp_allocable;
- *nregs = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0];
- break;
- default:
- *out = NULL;
- *nregs = 0;
- break;
- }
-}
-
-static void aa_get_scratch_regs(CGTarget* t, RegClass cls,
- const Reg** out, u32* nregs) {
- (void)t;
- switch (cls) {
- case RC_INT:
- *out = aa_int_scratch;
- *nregs = sizeof aa_int_scratch / sizeof aa_int_scratch[0];
- break;
- case RC_FP:
- *out = aa_fp_scratch;
- *nregs = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0];
- break;
- default:
- *out = NULL;
- *nregs = 0;
- break;
- }
-}
-
-static int aa_is_caller_saved(CGTarget* t, RegClass cls, Reg reg) {
- (void)t;
- switch (cls) {
- case RC_INT:
- /* AAPCS64 caller-saved: x0-x18, x30 */
- return reg <= 18 || reg == 30;
- case RC_FP:
- /* AAPCS64 caller-saved: v0-v7, v16-v31 */
- return reg <= 7 || reg >= 16;
- default:
- return 0;
- }
-}
-
-static void aa_reserve_hard_regs(CGTarget* t, RegClass cls,
- const Reg* regs, u32 n) {
- AAImpl* a = impl_of(t);
- for (u32 i = 0; i < n; ++i) {
- Reg r = regs[i];
- switch (cls) {
- case RC_INT:
- if (r >= 19u && r <= 28u) a->used_cs_int_mask |= 1u << r;
- break;
- case RC_FP:
- if (r >= 8u && r <= 15u) a->used_cs_fp_mask |= 1u << r;
- break;
- default:
- break;
- }
- }
-}
-
-void aa_coord_vtable_init(CGTarget* t) {
- t->get_allocable_regs = aa_get_allocable_regs;
- t->get_scratch_regs = aa_get_scratch_regs;
- t->is_caller_saved = aa_is_caller_saved;
- t->reserve_hard_regs = aa_reserve_hard_regs;
-}
diff --git a/src/arch/rv64/arch.c b/src/arch/rv64/arch.c
@@ -1,12 +1,14 @@
#include "arch/arch.h"
#include "abi/abi_internal.h"
-#include "arch/rv64.h"
+#include "arch/rv64/rv64.h"
#include "core/bytes.h"
#include "link/link_arch.h"
#include "obj/elf.h"
#include "obj/obj.h"
+extern const LinkArchDesc link_arch_rv64;
+
static const ABIVtable* rv64_abi_vtable(Compiler* c, CfreeOSKind os) {
(void)c;
(void)os;
diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h
@@ -5,8 +5,8 @@
#include <string.h>
#include "arch/arch.h"
-#include "arch/rv64.h"
-#include "arch/rv64_isa.h"
+#include "arch/rv64/rv64.h"
+#include "arch/rv64/isa.h"
#include "core/arena.h"
#include "core/pool.h"
#include "obj/obj.h"
diff --git a/src/arch/rv64_isa.h b/src/arch/rv64/isa.h
diff --git a/src/arch/rv64/link.c b/src/arch/rv64/link.c
@@ -0,0 +1,95 @@
+/* RV64 link-time arch descriptor. See link_arch.h for the contract.
+ *
+ * The PLT0/PLT-entry/IPLT-stub byte layouts here mirror what used to
+ * live inline in link_dyn.c (PLT) and link_layout.c (IPLT) before the
+ * vtable refactor; comments preserve the WHY (notably the +0x800 bias
+ * on AUIPC immediates). */
+
+#include "arch/rv64/isa.h"
+#include "core/bytes.h"
+#include "core/core.h"
+#include "link/link_arch.h"
+#include "obj/elf.h"
+
+/* PLT0 is 8 canonical NOPs (32 bytes); a PLT entry is 4 instructions
+ * (16 bytes) and an IPLT stub is 3 instructions (12 bytes). Defined
+ * once here so the descriptor and emitters stay in sync. */
+#define RV64_PLT0_SIZE 32u
+#define RV64_PLT_ENTRY_SIZE 16u
+#define RV64_IPLT_STUB_SIZE 12u
+
+/* Split a PC-relative displacement into the (hi20, lo12) pair consumed
+ * by the AUIPC + I-type sequence. The +0x800 bias is the standard
+ * RISC-V two-instruction PCREL trick: AUIPC adds an upper-20 immediate
+ * shifted left 12, then the second instruction adds a sign-extended
+ * 12-bit lo12. If we naively split disp into (disp>>12, disp&0xfff)
+ * the lo12 sign-extends as a *negative* number whenever bit 11 is set,
+ * which underflows the AUIPC result by 0x1000. Adding 0x800 before
+ * the shift rounds the high half up in exactly the cases that need it
+ * so AUIPC + sign-extended-lo12 reconstructs disp correctly. */
+static inline void rv64_split_pcrel(i64 disp, u32* hi20_out, u32* lo12_out) {
+ *hi20_out = (u32)(((u64)(disp + 0x800)) >> 12) & 0xfffffu;
+ *lo12_out = (u32)((u64)disp & 0xfffu);
+}
+
+/* PLT0 under DF_1_NOW is never executed — the loader resolves every
+ * JUMP_SLOT before transferring control — but we still emit it in
+ * canonical form (8 NOPs) so disassemblers and unwinders see a well-
+ * formed prologue at the top of .plt. */
+static void rv64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
+ u32 i;
+ (void)plt0_vaddr;
+ (void)gotplt_vaddr;
+ for (i = 0; i < RV64_PLT0_SIZE; i += 4u) wr_u32_le(dst + i, rv_nop());
+}
+
+/* Per-import PLT entry: load the GOT slot pre-filled by the loader
+ * (R_RISCV_JUMP_SLOT) and tail-call through it. t1 is the standard
+ * psABI scratch for the trampoline return-address (clobbered by the
+ * lazy resolver in the non-BIND_NOW path); t3 holds the slot pointer. */
+static void rv64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
+ i64 disp = (i64)slot_vaddr - (i64)entry_vaddr;
+ u32 hi20;
+ u32 lo12;
+ rv64_split_pcrel(disp, &hi20, &lo12);
+ wr_u32_le(dst + 0, rv_auipc(RV_T3, hi20));
+ wr_u32_le(dst + 4, rv_ld(RV_T3, RV_T3, (i32)lo12));
+ wr_u32_le(dst + 8, rv_jalr(RV_T1, RV_T3, 0));
+ wr_u32_le(dst + 12, rv_nop());
+}
+
+/* IPLT stub: load .igot.plt[i] (filled at startup by the resolver) and
+ * tail-call to it. The stub->slot displacement is invariant under the
+ * segment-base shift (both addresses live in the same image), so we
+ * bake it directly into the instructions and report zero apply-time
+ * relocs — unlike aarch64, which cannot encode a 32-bit pcrel inline. */
+static u32 rv64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
+ LinkArchIPltReloc out[2]) {
+ i64 disp = (i64)slot_vaddr - (i64)stub_vaddr;
+ u32 hi20;
+ u32 lo12;
+ (void)out;
+ rv64_split_pcrel(disp, &hi20, &lo12);
+ wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20));
+ wr_u32_le(dst + 4, rv_ld(RV_T1, RV_T1, (i32)lo12));
+ wr_u32_le(dst + 8, rv_jr(RV_T1));
+ return 0u;
+}
+
+const LinkArchDesc link_arch_rv64 = {
+ .e_machine = EM_RISCV,
+ .default_musl_interp = "/lib/ld-musl-riscv64.so.1",
+ /* RISC-V psABI has no dedicated GLOB_DAT — GOT-slot data imports
+ * use the generic absolute-64 reloc instead. */
+ .elf_r_relative = ELF_R_RISCV_RELATIVE,
+ .elf_r_glob_dat = ELF_R_RISCV_64,
+ .elf_r_jump_slot = ELF_R_RISCV_JUMP_SLOT,
+ .plt0_size = RV64_PLT0_SIZE,
+ .plt_entry_size = RV64_PLT_ENTRY_SIZE,
+ .iplt_stub_size = RV64_IPLT_STUB_SIZE,
+ .global_pointer_symbol = "__global_pointer$",
+ .global_pointer_rw_offset = 0x800u,
+ .emit_plt0 = rv64_emit_plt0,
+ .emit_plt_entry = rv64_emit_plt_entry,
+ .emit_iplt_stub = rv64_emit_iplt_stub,
+};
diff --git a/src/arch/rv64.h b/src/arch/rv64/rv64.h
diff --git a/src/arch/x64/alloc.c b/src/arch/x64/alloc.c
@@ -8,8 +8,8 @@
#include <string.h>
#include "arch/arch.h"
-#include "arch/x64.h"
-#include "arch/x64_isa.h"
+#include "arch/x64/x64.h"
+#include "arch/x64/isa.h"
#include "core/arena.h"
#include "core/pool.h"
#include "obj/obj.h"
diff --git a/src/arch/x64/arch.c b/src/arch/x64/arch.c
@@ -1,12 +1,14 @@
#include "arch/arch.h"
#include "abi/abi_internal.h"
-#include "arch/x64.h"
+#include "arch/x64/x64.h"
#include "core/bytes.h"
#include "link/link_arch.h"
#include "obj/elf.h"
#include "obj/obj.h"
+extern const LinkArchDesc link_arch_x64;
+
static const ABIVtable* x64_abi_vtable(Compiler* c, CfreeOSKind os) {
(void)c;
(void)os;
diff --git a/src/arch/x64/emit.c b/src/arch/x64/emit.c
@@ -7,8 +7,8 @@
#include <string.h>
#include "arch/arch.h"
-#include "arch/x64.h"
-#include "arch/x64_isa.h"
+#include "arch/x64/x64.h"
+#include "arch/x64/isa.h"
#include "core/arena.h"
#include "core/pool.h"
#include "obj/obj.h"
diff --git a/src/arch/x64/internal.h b/src/arch/x64/internal.h
@@ -6,15 +6,15 @@
* - Small type helpers (static inline)
* - Forward declarations of cross-file functions
*
- * NOT included by external consumers; use arch/x64.h for the public API. */
+ * NOT included by external consumers; use arch/x64/x64.h for the public API. */
#pragma once
#include <string.h>
#include "arch/arch.h"
-#include "arch/x64.h"
-#include "arch/x64_isa.h"
+#include "arch/x64/x64.h"
+#include "arch/x64/isa.h"
#include "core/arena.h"
#include "core/pool.h"
#include "obj/obj.h"
diff --git a/src/arch/x64/isa.h b/src/arch/x64/isa.h
@@ -0,0 +1,128 @@
+/* x86_64 ISA helpers shared by the sources under src/arch/x64/.
+ *
+ * Mostly constants, plus a few fixed-length PLT/NOP byte writers.
+ * General instruction encoders live in the backend .c files because
+ * they're variable length and depend on the MCEmitter byte stream (REX
+ * prefix, ModR/M, SIB, displacement). The disassembler doesn't share
+ * these yet; if/when it does, a parallel isa.c will host decode tables. */
+
+#ifndef CFREE_X64_ISA_H
+#define CFREE_X64_ISA_H
+
+#include "core/bytes.h"
+#include "core/core.h"
+
+/* ---- GPR numbering (HW encoding 0..15; DWARF order differs) ---- */
+enum {
+ X64_RAX = 0,
+ X64_RCX = 1,
+ X64_RDX = 2,
+ X64_RBX = 3,
+ X64_RSP = 4,
+ X64_RBP = 5,
+ X64_RSI = 6,
+ X64_RDI = 7,
+ X64_R8 = 8,
+ X64_R9 = 9,
+ X64_R10 = 10,
+ X64_R11 = 11,
+ X64_R12 = 12,
+ X64_R13 = 13,
+ X64_R14 = 14,
+ X64_R15 = 15,
+};
+
+/* SSE register numbering — xmm0..xmm15 share encoding with r0..r15. */
+enum {
+ X64_XMM0 = 0,
+ X64_XMM1 = 1,
+ X64_XMM2 = 2,
+ X64_XMM3 = 3,
+ X64_XMM4 = 4,
+ X64_XMM5 = 5,
+ X64_XMM6 = 6,
+ X64_XMM7 = 7,
+ X64_XMM8 = 8,
+ X64_XMM15 = 15,
+};
+
+/* Condition codes for Jcc / SETcc / CMOVcc: low nibble of the opcode byte. */
+enum {
+ X64_CC_O = 0x0,
+ X64_CC_NO = 0x1,
+ X64_CC_B = 0x2, /* below / CF=1 → CMP_LT_U */
+ X64_CC_AE = 0x3, /* above-or-equal / CF=0 → CMP_GE_U */
+ X64_CC_E = 0x4, /* equal / ZF=1 → CMP_EQ */
+ X64_CC_NE = 0x5, /* → CMP_NE */
+ X64_CC_BE = 0x6, /* below-or-equal / CF=1 or ZF=1 → CMP_LE_U */
+ X64_CC_A = 0x7, /* above / CF=0 and ZF=0 → CMP_GT_U */
+ X64_CC_S = 0x8,
+ X64_CC_NS = 0x9,
+ X64_CC_P = 0xA,
+ X64_CC_NP = 0xB,
+ X64_CC_L = 0xC, /* less (signed) / SF!=OF → CMP_LT_S */
+ X64_CC_GE = 0xD, /* → CMP_GE_S */
+ X64_CC_LE = 0xE, /* less-or-equal (signed) → CMP_LE_S */
+ X64_CC_G = 0xF, /* greater → CMP_GT_S */
+};
+
+/* REX prefix is 0x40 | W<<3 | R<<2 | X<<1 | B. */
+#define X64_REX_BASE 0x40u
+#define X64_REX_W 0x08u
+#define X64_REX_R 0x04u
+#define X64_REX_X 0x02u
+#define X64_REX_B 0x01u
+
+/* ---- Branch / NOP encoding constants ----
+ *
+ * Used by the linker to emit PLT entries and IPLT stubs without
+ * sprinkling raw hex into src/arch/x64/link.c. The shape is always the
+ * same RIP-relative indirect JMP plus padding NOPs. */
+
+/* JMP r/m64 — opcode FF /4. ModR/M for the RIP+disp32 form is
+ * mod=00, reg=/4 (JMP m64), r/m=101 → 0x25. */
+#define X64_OP_JMP_RM64 0xFFu
+#define X64_MODRM_JMP_RIPREL 0x25u
+
+/* Single-byte NOP. */
+#define X64_NOP1 0x90u
+
+/* Intel multi-byte ("long") NOP forms. The 6-byte form is the
+ * canonical IPLT-stub tail pad (NOPW 0(%rax,%rax,1)). */
+#define X64_NOP6_BYTE0 0x66u
+#define X64_NOP6_BYTE1 0x0Fu
+#define X64_NOP6_BYTE2 0x1Fu
+#define X64_NOP6_BYTE3 0x44u
+#define X64_NOP6_BYTE4 0x00u
+#define X64_NOP6_BYTE5 0x00u
+
+/* Sizes of the encoded forms above. */
+#define X64_JMP_RIPREL_SIZE 6u
+#define X64_NOP6_SIZE 6u
+
+/* Write a 6-byte `jmp [rip + disp32]` (FF 25 disp32) at dst. */
+static inline void x64_write_jmp_riprel(u8* dst, i32 disp32) {
+ dst[0] = X64_OP_JMP_RM64;
+ dst[1] = X64_MODRM_JMP_RIPREL;
+ wr_u32_le(dst + 2, (u32)disp32);
+}
+
+/* Fill nbytes at dst with single-byte NOPs (0x90). Matches the
+ * existing memset-then-patch pattern used to pad PLT entries to 16. */
+static inline void x64_write_nop_pad(u8* dst, u32 nbytes) {
+ u32 i;
+ for (i = 0; i < nbytes; ++i) dst[i] = X64_NOP1;
+}
+
+/* Write the canonical 6-byte multi-byte NOP (66 0F 1F 44 00 00) at
+ * dst. Used to pad the IPLT stub from 6 → 12 bytes. */
+static inline void x64_write_nop6(u8* dst) {
+ dst[0] = X64_NOP6_BYTE0;
+ dst[1] = X64_NOP6_BYTE1;
+ dst[2] = X64_NOP6_BYTE2;
+ dst[3] = X64_NOP6_BYTE3;
+ dst[4] = X64_NOP6_BYTE4;
+ dst[5] = X64_NOP6_BYTE5;
+}
+
+#endif
diff --git a/src/arch/x64/link.c b/src/arch/x64/link.c
@@ -0,0 +1,77 @@
+/* x86_64 link-time arch descriptor.
+ *
+ * Implements the LinkArchDesc contract from link/link_arch.h for
+ * EM_X86_64. The PLT/IPLT byte sequences here mirror the inline
+ * encodings previously living in link_dyn.c (PLT0 + per-import entry)
+ * and link_layout.c (IPLT stub) — kept identical byte-for-byte so the
+ * descriptor switchover is a pure refactor. All raw byte values come
+ * from named constants / inline writers in arch/x64/isa.h. */
+
+#include "link/link_arch.h"
+
+#include "arch/x64/isa.h"
+#include "core/bytes.h"
+#include "core/core.h"
+#include "obj/elf.h"
+
+/* PLT0 layout under DF_1_NOW: never executed (loader pre-binds every
+ * slot via .rela.plt before user code runs), so we just emit 32 bytes
+ * of single-byte NOPs. Self-documenting and trivially well-formed for
+ * disassemblers and unwinders that walk the section. */
+static void x64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
+ (void)plt0_vaddr;
+ (void)gotplt_vaddr;
+ x64_write_nop_pad(dst, 32u);
+}
+
+/* Per-import PLT entry (16 B):
+ *
+ * ff 25 disp32 ; jmpq *[rip + disp_to_slot] (6 B)
+ * 90 90 90 90 90 90 90 90 90 90 ; pad to 16 with single-byte NOPs
+ *
+ * disp32 is measured from the END of the JMP (entry_vaddr + 6) to the
+ * .got.plt slot. The 10-byte tail matches link_dyn.c's prior
+ * memset(0x90)+patch behavior exactly. */
+static void x64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
+ i64 disp = (i64)slot_vaddr - (i64)(entry_vaddr + X64_JMP_RIPREL_SIZE);
+ i32 disp32 = (i32)(u32)((u64)disp & 0xffffffffu);
+ x64_write_jmp_riprel(dst, disp32);
+ x64_write_nop_pad(dst + X64_JMP_RIPREL_SIZE,
+ 16u - X64_JMP_RIPREL_SIZE);
+}
+
+/* IPLT (ifunc) trampoline stub (12 B):
+ *
+ * ff 25 disp32 ; jmpq *[rip + disp_to_slot] (6 B)
+ * 66 0f 1f 44 00 00 ; 6-byte multibyte NOP (6 B)
+ *
+ * Like the PLT entry, disp32 is from the END of the JMP to the
+ * .igot.plt slot. The displacement is invariant under image-base
+ * shift (both ends move together), so it's encoded inline and we
+ * report zero apply-time relocations. */
+static u32 x64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
+ LinkArchIPltReloc out[2]) {
+ (void)out;
+ i64 disp = (i64)slot_vaddr - (i64)(stub_vaddr + X64_JMP_RIPREL_SIZE);
+ i32 disp32 = (i32)(u32)((u64)disp & 0xffffffffu);
+ x64_write_jmp_riprel(dst, disp32);
+ x64_write_nop6(dst + X64_JMP_RIPREL_SIZE);
+ return 0;
+}
+
+const LinkArchDesc link_arch_x64 = {
+ .e_machine = EM_X86_64,
+ .default_musl_interp = "/lib/ld-musl-x86_64.so.1",
+
+ .elf_r_relative = ELF_R_X86_64_RELATIVE,
+ .elf_r_glob_dat = ELF_R_X86_64_GLOB_DAT,
+ .elf_r_jump_slot = ELF_R_X86_64_JUMP_SLOT,
+
+ .plt0_size = 32u,
+ .plt_entry_size = 16u,
+ .iplt_stub_size = 12u,
+
+ .emit_plt0 = x64_emit_plt0,
+ .emit_plt_entry = x64_emit_plt_entry,
+ .emit_iplt_stub = x64_emit_iplt_stub,
+};
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -13,8 +13,8 @@
#include <string.h>
#include "arch/arch.h"
-#include "arch/x64.h"
-#include "arch/x64_isa.h"
+#include "arch/x64/x64.h"
+#include "arch/x64/isa.h"
#include "core/arena.h"
#include "core/pool.h"
#include "obj/obj.h"
diff --git a/src/arch/x64.h b/src/arch/x64/x64.h
diff --git a/src/arch/x64_isa.h b/src/arch/x64_isa.h
@@ -1,128 +0,0 @@
-/* x86_64 ISA helpers used by arch/x64.c.
- *
- * Only the constants here. Instruction encoders live in arch/x64.c
- * because they're variable length and depend on the MCEmitter byte
- * stream (REX prefix, ModR/M, SIB, displacement). The disassembler
- * doesn't share these yet; if/when it does, a parallel x64_isa.c will
- * host decode tables. */
-
-#ifndef CFREE_X64_ISA_H
-#define CFREE_X64_ISA_H
-
-#include "core/bytes.h"
-#include "core/core.h"
-
-/* ---- GPR numbering (DWARF / ABI matches HW encoding 0..15) ---- */
-enum {
- X64_RAX = 0,
- X64_RCX = 1,
- X64_RDX = 2,
- X64_RBX = 3,
- X64_RSP = 4,
- X64_RBP = 5,
- X64_RSI = 6,
- X64_RDI = 7,
- X64_R8 = 8,
- X64_R9 = 9,
- X64_R10 = 10,
- X64_R11 = 11,
- X64_R12 = 12,
- X64_R13 = 13,
- X64_R14 = 14,
- X64_R15 = 15,
-};
-
-/* SSE register numbering — xmm0..xmm15 share encoding with r0..r15. */
-enum {
- X64_XMM0 = 0,
- X64_XMM1 = 1,
- X64_XMM2 = 2,
- X64_XMM3 = 3,
- X64_XMM4 = 4,
- X64_XMM5 = 5,
- X64_XMM6 = 6,
- X64_XMM7 = 7,
- X64_XMM8 = 8,
- X64_XMM15 = 15,
-};
-
-/* Condition codes for Jcc / SETcc / CMOVcc. Encoded in the low nibble. */
-enum {
- X64_CC_O = 0x0,
- X64_CC_NO = 0x1,
- X64_CC_B = 0x2, /* below / CF=1 → CMP_LT_U */
- X64_CC_AE = 0x3, /* above-or-equal / CF=0 → CMP_GE_U */
- X64_CC_E = 0x4, /* equal / ZF=1 → CMP_EQ */
- X64_CC_NE = 0x5, /* → CMP_NE */
- X64_CC_BE = 0x6, /* below-or-equal / CF=1 or ZF=1 → CMP_LE_U */
- X64_CC_A = 0x7, /* above / CF=0 and ZF=0 → CMP_GT_U */
- X64_CC_S = 0x8,
- X64_CC_NS = 0x9,
- X64_CC_P = 0xA,
- X64_CC_NP = 0xB,
- X64_CC_L = 0xC, /* less (signed) / SF!=OF → CMP_LT_S */
- X64_CC_GE = 0xD, /* → CMP_GE_S */
- X64_CC_LE = 0xE, /* less-or-equal (signed) → CMP_LE_S */
- X64_CC_G = 0xF, /* greater → CMP_GT_S */
-};
-
-/* REX prefix is 0x40 | W<<3 | R<<2 | X<<1 | B. */
-#define X64_REX_BASE 0x40u
-#define X64_REX_W 0x08u
-#define X64_REX_R 0x04u
-#define X64_REX_X 0x02u
-#define X64_REX_B 0x01u
-
-/* ---- Branch / NOP encoding constants ----
- *
- * Used by the linker to emit PLT entries and IPLT stubs without
- * sprinkling raw hex into link_arch_x64.c. The shape is always the
- * same RIP-relative indirect JMP plus padding NOPs. */
-
-/* JMP r/m64 — opcode FF /4. ModR/M for the RIP+disp32 form is
- * mod=00, reg=/4 (JMP m64), r/m=101 → 0x25. */
-#define X64_OP_JMP_RM64 0xFFu
-#define X64_MODRM_JMP_RIPREL 0x25u
-
-/* Single-byte NOP. */
-#define X64_NOP1 0x90u
-
-/* Intel multi-byte ("long") NOP forms. The 6-byte form is the
- * canonical IPLT-stub tail pad (NOPW 0(%rax,%rax,1)). */
-#define X64_NOP6_BYTE0 0x66u
-#define X64_NOP6_BYTE1 0x0Fu
-#define X64_NOP6_BYTE2 0x1Fu
-#define X64_NOP6_BYTE3 0x44u
-#define X64_NOP6_BYTE4 0x00u
-#define X64_NOP6_BYTE5 0x00u
-
-/* Sizes of the encoded forms above. */
-#define X64_JMP_RIPREL_SIZE 6u
-#define X64_NOP6_SIZE 6u
-
-/* Write a 6-byte `jmp [rip + disp32]` (FF 25 disp32) at dst. */
-static inline void x64_write_jmp_riprel(u8* dst, i32 disp32) {
- dst[0] = X64_OP_JMP_RM64;
- dst[1] = X64_MODRM_JMP_RIPREL;
- wr_u32_le(dst + 2, (u32)disp32);
-}
-
-/* Fill nbytes at dst with single-byte NOPs (0x90). Matches the
- * existing memset-then-patch pattern used to pad PLT entries to 16. */
-static inline void x64_write_nop_pad(u8* dst, u32 nbytes) {
- u32 i;
- for (i = 0; i < nbytes; ++i) dst[i] = X64_NOP1;
-}
-
-/* Write the canonical 6-byte multi-byte NOP (66 0F 1F 44 00 00) at
- * dst. Used to pad the IPLT stub from 6 → 12 bytes. */
-static inline void x64_write_nop6(u8* dst) {
- dst[0] = X64_NOP6_BYTE0;
- dst[1] = X64_NOP6_BYTE1;
- dst[2] = X64_NOP6_BYTE2;
- dst[3] = X64_NOP6_BYTE3;
- dst[4] = X64_NOP6_BYTE4;
- dst[5] = X64_NOP6_BYTE5;
-}
-
-#endif
diff --git a/src/dbg/arch_aa64.c b/src/dbg/arch_aa64.c
@@ -1,235 +0,0 @@
-/* AArch64 lifter for the displaced-step shim.
- *
- * Lays out a fixed-up copy of one insn in the session scratch slot
- * (DBG_DISPLACED_SLOT_BYTES bytes), followed by a BRK sentinel the
- * session arms an internal bp on.
- *
- * Supported families:
- * - any insn with no PC-relative operand (copied verbatim);
- * - B / BL / B.cond — re-encode the immediate;
- * - CBZ / CBNZ / TBZ / TBNZ — always emit a trampoline:
- * slot[0] cond-branch +2 words (taken → slot+8)
- * slot[4] BRK (not-taken fallthrough)
- * slot[8] LDR x16, =target
- * slot[12] BR x16
- * slot[16] literal pool (8 bytes, absolute target)
- * - ADR / ADRP — replace with LDR Xd, =target:
- * slot[0] LDR Xd, =target
- * slot[4] BRK
- * slot[8] literal pool (8 bytes)
- * - LDR (literal), integer/LDRSW — synthesize indirect load:
- * slot[0] LDR x16, =literal_addr
- * slot[4] LDR Xt/Wt/LDRSW Xt, [x16]
- * slot[8] BRK
- * slot[12] literal pool (8 bytes, absolute literal addr)
- * - BR / BLR / RET — copied verbatim; the BRK after never
- * fires because the indirect branch transfers control. The session's
- * stale internal_bp is cleared by the next prepare; finalize gates on
- * PC == return_pc so it stays a no-op when control left the slot. */
-
-#include "dbg/dbg.h"
-
-#include <string.h>
-
-#include "arch/aa64_isa.h"
-
-#define SHIM_X16 16u /* IP0; safe to clobber inside a shim */
-
-uint32_t dbg_aa64_brk_word(void) {
- return aa64_brk(0);
-}
-
-static int fits_signed(int64_t v, int bits) {
- int64_t lim = (int64_t)1 << (bits - 1);
- return v >= -lim && v < lim;
-}
-
-/* LDR (literal) for integer Xt: opc=01, V=0, fixed bits 011_0_00.
- * 01 011 0 00 imm19 Rt → 0x58000000 | (imm19<<5) | Rt
- * imm19 is the signed word offset from the LDR's own PC. */
-static uint32_t enc_ldr_lit_x(uint32_t Rt, int32_t imm19) {
- return 0x58000000u | (((uint32_t)imm19 & 0x7ffffu) << 5) | (Rt & 0x1fu);
-}
-/* LDR Xt, [Xn, #0] / LDR Wt, [Xn, #0] / LDRSW Xt, [Xn, #0]. */
-static uint32_t enc_ldr64_reg(uint32_t Rt, uint32_t Rn) {
- return aa64_ldr64_uimm12(Rt, Rn, 0);
-}
-static uint32_t enc_ldr32_reg(uint32_t Rt, uint32_t Rn) {
- return aa64_ldst_uimm_pack((AA64LdStUimm){
- .size = 2, .V = 0, .opc = AA64_LDST_OPC_LDR, .imm12 = 0, .Rn = Rn,
- .Rt = Rt});
-}
-static uint32_t enc_ldrsw_reg(uint32_t Rt, uint32_t Rn) {
- return aa64_ldst_uimm_pack((AA64LdStUimm){
- .size = 2, .V = 0, .opc = 2, .imm12 = 0, .Rn = Rn, .Rt = Rt});
-}
-
-static void put_u32(uint8_t* w, uint32_t off, uint32_t v) {
- memcpy(w + off, &v, sizeof(v));
-}
-static void put_u64(uint8_t* w, uint32_t off, uint64_t v) {
- memcpy(w + off, &v, sizeof(v));
-}
-
-/* Sign-extend a `bits`-wide field whose raw value is `v`. */
-static int64_t sign_extend(uint64_t v, int bits) {
- uint64_t m = 1ull << (bits - 1);
- return (int64_t)((v ^ m) - m);
-}
-
-int dbg_aa64_build_shim(uint32_t orig_insn, uint64_t orig_pc,
- void* scratch_write, uint64_t scratch_runtime,
- u32* shim_len) {
- uint8_t* w = (uint8_t*)scratch_write;
- uint32_t brk = aa64_brk(0);
- int64_t pc_delta;
- if (!shim_len) return 1;
- *shim_len = 0;
- pc_delta = (int64_t)orig_pc - (int64_t)scratch_runtime;
-
- /* ---- B / BL (imm26) ------------------------------------------------ */
- if ((orig_insn & 0x7C000000u) == 0x14000000u) {
- AA64BrImm f = aa64_brimm_unpack(orig_insn);
- int64_t imm = sign_extend(f.imm26, 26);
- int64_t new_off = imm * 4 + pc_delta;
- if ((new_off & 3) || !fits_signed(new_off / 4, 26)) {
- /* Out of B/BL range from scratch: fall back to LDR x30/PC trick is
- * messy for BL (need to preserve LR). Decline. */
- return 1;
- }
- f.imm26 = (uint32_t)((new_off / 4) & 0x3ffffffu);
- put_u32(w, 0, aa64_brimm_pack(f));
- put_u32(w, 4, brk);
- *shim_len = 4;
- return 0;
- }
-
- /* ---- B.cond (imm19) ------------------------------------------------ */
- if ((orig_insn & 0xFF000010u) == 0x54000000u) {
- AA64BrCond f = aa64_brcond_unpack(orig_insn);
- int64_t imm = sign_extend(f.imm19, 19);
- int64_t new_off = imm * 4 + pc_delta;
- if ((new_off & 3) || !fits_signed(new_off / 4, 19)) {
- /* Synthesize: B.cond +8 (skip BRK) ; BRK ; LDR x16,=tgt ; BR x16 ;
- * literal. The "taken" path branches to slot+8, the "not-taken"
- * path falls through to BRK at slot+4. */
- uint64_t target = orig_pc + (uint64_t)(imm * 4);
- AA64BrCond nf;
- nf.cond = f.cond;
- nf.imm19 = 2u; /* +8 bytes from slot[0] → slot[8] */
- put_u32(w, 0, aa64_brcond_pack(nf));
- put_u32(w, 4, brk);
- put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2)); /* LDR x16, [pc+8] = slot[16] */
- put_u32(w, 12, aa64_br(SHIM_X16));
- put_u64(w, 16, target);
- *shim_len = 4;
- return 0;
- }
- f.imm19 = (uint32_t)((new_off / 4) & 0x7ffffu);
- put_u32(w, 0, aa64_brcond_pack(f));
- put_u32(w, 4, brk);
- *shim_len = 4;
- return 0;
- }
-
- /* ---- CBZ / CBNZ (imm19) — always trampoline form ------------------- */
- if ((orig_insn & 0x7E000000u) == 0x34000000u) {
- AA64CB f = aa64_cb_unpack(orig_insn);
- int64_t imm = sign_extend(f.imm19, 19);
- uint64_t target = orig_pc + (uint64_t)(imm * 4);
- AA64CB nf = f;
- nf.imm19 = 2u; /* +8 → slot[8] */
- put_u32(w, 0, aa64_cb_pack(nf));
- put_u32(w, 4, brk);
- put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2));
- put_u32(w, 12, aa64_br(SHIM_X16));
- put_u64(w, 16, target);
- *shim_len = 4;
- return 0;
- }
-
- /* ---- TBZ / TBNZ (imm14) — always trampoline ------------------------
- * b5 011011 op b40[18:14] imm14[18:5] -- wait, field layout:
- * b5(31) 011011(30..25) op(24) b40(23..19) imm14(18..5) Rt(4..0). */
- if ((orig_insn & 0x7E000000u) == 0x36000000u) {
- uint32_t b5 = (orig_insn >> 31) & 1u;
- uint32_t op = (orig_insn >> 24) & 1u;
- uint32_t b40 = (orig_insn >> 19) & 0x1fu;
- uint32_t Rt = orig_insn & 0x1fu;
- uint32_t imm14_raw = (orig_insn >> 5) & 0x3fffu;
- int64_t imm = sign_extend(imm14_raw, 14);
- uint64_t target = orig_pc + (uint64_t)(imm * 4);
- uint32_t new_imm14 = 2u; /* +8 → slot[8] */
- uint32_t new_word =
- (b5 << 31) | 0x36000000u | (op << 24) | (b40 << 19) |
- ((new_imm14 & 0x3fffu) << 5) | (Rt & 0x1fu);
- put_u32(w, 0, new_word);
- put_u32(w, 4, brk);
- put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2));
- put_u32(w, 12, aa64_br(SHIM_X16));
- put_u64(w, 16, target);
- *shim_len = 4;
- return 0;
- }
-
- /* ---- ADR / ADRP ---------------------------------------------------- */
- if ((orig_insn & 0x1F000000u) == 0x10000000u) {
- AA64PCRelAdr f = aa64_pcrel_adr_unpack(orig_insn);
- uint64_t imm_raw = ((uint64_t)f.immhi << 2) | (uint64_t)f.immlo;
- int64_t imm21 = sign_extend(imm_raw, 21);
- uint64_t target;
- if (f.op == AA64_ADR_OP_ADRP) {
- target = (orig_pc & ~(uint64_t)0xFFF) + ((uint64_t)imm21 << 12);
- } else {
- target = orig_pc + (uint64_t)imm21;
- }
- /* LDR Xd, [pc + 8] — the literal sits at slot[8]. */
- put_u32(w, 0, enc_ldr_lit_x(f.Rd, 2));
- put_u32(w, 4, brk);
- put_u64(w, 8, target);
- *shim_len = 4;
- return 0;
- }
-
- /* ---- LDR (literal) — integer & LDRSW only -------------------------- */
- if ((orig_insn & 0x3B000000u) == 0x18000000u) {
- uint32_t opc = (orig_insn >> 30) & 3u;
- uint32_t V = (orig_insn >> 26) & 1u;
- uint32_t Rt = orig_insn & 0x1fu;
- uint32_t imm19_raw = (orig_insn >> 5) & 0x7ffffu;
- int64_t imm19 = sign_extend(imm19_raw, 19);
- uint64_t literal_addr = orig_pc + (uint64_t)(imm19 * 4);
- uint32_t load_insn;
- if (V) return 1; /* vector forms (S/D/Q): not supported in v1 */
- switch (opc) {
- case 0: load_insn = enc_ldr32_reg(Rt, SHIM_X16); break; /* LDR Wt */
- case 1: load_insn = enc_ldr64_reg(Rt, SHIM_X16); break; /* LDR Xt */
- case 2: load_insn = enc_ldrsw_reg(Rt, SHIM_X16); break; /* LDRSW */
- default: return 1; /* PRFM (literal): not meaningful here */
- }
- /* LDR x16, [pc + 12] — literal at slot[12]. */
- put_u32(w, 0, enc_ldr_lit_x(SHIM_X16, 3));
- put_u32(w, 4, load_insn);
- put_u32(w, 8, brk);
- put_u64(w, 12, literal_addr);
- *shim_len = 8;
- return 0;
- }
-
- /* ---- BR / BLR / RET (indirect) ------------------------------------- */
- if ((orig_insn & 0xFE1FFC1Fu) == AA64_BR_REG_FAMILY_MATCH) {
- /* Copy verbatim; the BRK after will not fire because control
- * transfers to the register target. The session clears the stale
- * internal bp on the next prepare. */
- put_u32(w, 0, orig_insn);
- put_u32(w, 4, brk);
- *shim_len = 4;
- return 0;
- }
-
- /* ---- default: no PC-relative operand — copy verbatim --------------- */
- put_u32(w, 0, orig_insn);
- put_u32(w, 4, brk);
- *shim_len = 4;
- return 0;
-}
diff --git a/src/dbg/dbg.h b/src/dbg/dbg.h
@@ -3,7 +3,7 @@
/* Internal contracts for src/dbg/. The public CfreeJitSession entries are
* defined in session.c on top of these primitives; bp.c, step.c, mem.c,
- * displaced.c, and arch_aa64.c each own one slice. */
+ * displaced.c, and arch/aa64/dbg.c each own one slice. */
#include <cfree.h>
@@ -76,7 +76,7 @@ int dbg_mem_write(struct CfreeJitSession*, uint64_t addr, const void* src,
size_t n);
/* ---- displaced step ------------------------------------------------- */
-/* The session owns a single executable scratch region. arch_aa64.c writes
+/* The session owns a single executable scratch region. arch/aa64/dbg.c writes
* a fixed-up copy of the original insn plus a return-shim into it; the
* worker is then resumed with PC pointing at the scratch entry. The shim
* ends with a BRK that the fault classifier recognizes (via the bp table)
diff --git a/src/dbg/displaced.c b/src/dbg/displaced.c
@@ -86,7 +86,7 @@ int dbg_displaced_prepare(CfreeJitSession* s, uint64_t insn_pc,
return 1;
}
/* Flush the entire slot — trampoline forms write up to 24 bytes plus a
- * literal pool; arch_aa64.c returns the BRK *offset*, not the length. */
+ * literal pool; arch/aa64/dbg.c returns the BRK *offset*, not the length. */
if (s->c->env->execmem->flush_icache) {
s->c->env->execmem->flush_icache(s->c->env->execmem->user,
s->displaced.region.runtime,
diff --git a/src/link/link_arch.h b/src/link/link_arch.h
@@ -7,9 +7,8 @@
* Compiler.target.arch. Lets link_dyn.c / link_layout.c / link_elf.c
* stay arch-agnostic instead of branching on target.arch and hand-
* encoding instruction bytes inline. Each backend's descriptor lives
- * in its own translation unit (link_arch_aa64.c / _x64.c / _rv64.c)
- * and leans on the existing arch/<arch>_isa.h encoders for everything
- * but small format-specific constants.
+ * under src/arch/<arch>/ and leans on that arch's ISA encoders for
+ * everything but small format-specific constants.
*
* The struct intentionally collects only fields the LINKER needs.
* Code-generation arch dispatch belongs in CGTarget (arch/arch.h);
@@ -110,11 +109,6 @@ typedef struct LinkArchDesc {
int (*needs_jit_call_stub)(RelocKind);
} LinkArchDesc;
-/* Per-arch descriptors, defined in link_arch_<arch>.c. */
-extern const LinkArchDesc link_arch_aa64;
-extern const LinkArchDesc link_arch_x64;
-extern const LinkArchDesc link_arch_rv64;
-
/* Returns NULL for an unsupported arch. Callers panic with their own
* context-rich message rather than this helper picking one. */
const LinkArchDesc* link_arch_desc_for(const Compiler*);
diff --git a/src/link/link_arch_aa64.c b/src/link/link_arch_aa64.c
@@ -1,208 +0,0 @@
-/* AArch64 link-time descriptor.
- *
- * Implements the LinkArchDesc contract from link_arch.h for the
- * aarch64 ELF psABI: PLT0 + per-import PLT entries (lazy-resolve
- * trampolines emitted in canonical form even under DF_1_NOW), and the
- * 12-byte IPLT stub used by ifunc resolvers. All instruction bytes
- * come from the encoders in arch/aa64_isa.h — no raw hex literals
- * here.
- *
- * The byte layout matches the previous inline encodings in
- * link_dyn.c (PLT) and link_layout.c (IPLT) so that switching the
- * linker to descriptor dispatch is a no-op on the output image. */
-
-#include "arch/aa64_isa.h"
-#include "core/bytes.h"
-#include "core/core.h"
-#include "link/link_arch.h"
-#include "obj/elf.h"
-#include "obj/macho.h"
-#include "obj/obj.h"
-
-/* Fixed register assignments mandated by the AArch64 PLT ABI. */
-#define AA64_PLT_SCRATCH_X16 16u /* PLT/IPLT scratch (slot address) */
-#define AA64_PLT_SCRATCH_X17 17u /* PLT scratch (loaded function ptr) */
-
-/* PLT geometry. Documented in link_arch.h; redeclared here as the
- * descriptor table needs them at file scope. */
-#define AA64_PLT0_SIZE 32u
-#define AA64_PLT_ENTRY_SIZE 16u
-#define AA64_IPLT_STUB_SIZE 12u
-
-/* PLT0 references .got.plt[2] (the lazy-resolve hook); the per-import
- * entries start at .got.plt[3]. */
-#define AA64_GOTPLT_RESOLVER_INDEX 2u
-
-/* Page mask for ADRP: ADRP encodes (page(target) - page(PC)) >> 12,
- * where page(x) clears the low 12 bits. */
-#define AA64_PAGE_MASK ((u64)0xfffu)
-
-/* Compute the (immlo, immhi) ADRP immediate halves for the page-
- * relative displacement from `pc` to `target`. Both addresses are
- * post-shift final image vaddrs; ADRP discards the low 12 bits of
- * each before subtracting, so the result is invariant under any
- * segment-base shift that moves both endpoints by the same delta. */
-static inline void aa64_adrp_imm_halves(u64 pc, u64 target, u32* immlo,
- u32* immhi) {
- i64 page_disp = (i64)(target & ~AA64_PAGE_MASK) - (i64)(pc & ~AA64_PAGE_MASK);
- i64 imm21 = page_disp >> 12;
- *immlo = (u32)(imm21 & 0x3);
- *immhi = (u32)((imm21 >> 2) & 0x7ffff);
-}
-
-/* Emit one ADRP+LDR+ADD+BR sequence that materializes `slot_vaddr`
- * (a .got.plt entry) into x16, loads the resolved function pointer
- * into x17, and tail-calls it. Used by both PLT0 (after its STP) and
- * each per-import entry — the only thing that varies is `pc`, which
- * starts at the ADRP itself. */
-static void aa64_emit_adrp_load_br(u8* dst, u64 pc, u64 slot_vaddr) {
- u32 immlo, immhi;
- aa64_adrp_imm_halves(pc, slot_vaddr, &immlo, &immhi);
- u32 lo12 = (u32)(slot_vaddr & AA64_PAGE_MASK);
- /* LDR Xt encodes the byte offset divided by 8. .got.plt slots are
- * 8-byte aligned so the low 3 bits of lo12 are always 0. */
- u32 ldr_imm12 = (lo12 >> 3) & 0xfffu;
-
- wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi));
- wr_u32_le(dst + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X17,
- AA64_PLT_SCRATCH_X16, ldr_imm12));
- wr_u32_le(dst + 8, aa64_add_imm(/*sf=*/1, AA64_PLT_SCRATCH_X16,
- AA64_PLT_SCRATCH_X16, lo12, /*sh=*/0));
- wr_u32_le(dst + 12, aa64_br(AA64_PLT_SCRATCH_X17));
-}
-
-static void aa64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
- /* PLT0:
- * stp x16, x30, [sp, #-16]!
- * adrp x16, page(.got.plt[2])
- * ldr x17, [x16, #lo12(.got.plt[2])]
- * add x16, x16, #lo12(.got.plt[2])
- * br x17
- * nop ; nop ; nop
- *
- * Under DF_1_NOW the loader patches every .got.plt slot from
- * .rela.plt before running PLT0, so this trampoline never executes.
- * It is still emitted in canonical form so disassemblers and
- * unwinders see the layout the psABI specifies. */
- u64 slot2 = gotplt_vaddr + 8u * AA64_GOTPLT_RESOLVER_INDEX;
- /* The ADRP sits at plt0+4 (one instruction past the leading STP). */
- u64 adrp_pc = plt0_vaddr + 4u;
-
- /* `stp x16, x30, [sp, #-16]!` — pre-indexed pair store with imm7
- * scaled by 8, so the encoded field is -16/8 = -2. */
- wr_u32_le(dst + 0, aa64_stp64_pre(AA64_PLT_SCRATCH_X16, AA64_LR, AA64_SP,
- /*imm7_scaled=*/-2));
- aa64_emit_adrp_load_br(dst + 4, adrp_pc, slot2);
- wr_u32_le(dst + 20, aa64_nop());
- wr_u32_le(dst + 24, aa64_nop());
- wr_u32_le(dst + 28, aa64_nop());
-}
-
-static void aa64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
- /* Per-import 16-byte entry: ADRP+LDR+ADD+BR where ADRP's PC is the
- * entry's first instruction (no leading STP here — the resolved
- * function returns to the original caller, not into PLT0). */
- aa64_emit_adrp_load_br(dst, entry_vaddr, slot_vaddr);
-}
-
-static u32 aa64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
- LinkArchIPltReloc out[2]) {
- /* IPLT stub: ADRP x16, %page(slot) ; LDR x16, [x16, :lo12:slot] ;
- * BR x16.
- *
- * We deliberately emit the two address-bearing instructions with
- * zero immediates: the linker enqueues an ADR_PREL_PG_HI21 reloc on
- * the ADRP and an LDST64_ABS_LO12_NC reloc on the LDR, both
- * targeting the slot's synthetic local symbol. Reloc-apply runs
- * after final vaddr assignment, which is the only point at which
- * both endpoints' page-relative displacement is known. */
- (void)stub_vaddr;
- (void)slot_vaddr;
-
- wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, /*immlo=*/0,
- /*immhi=*/0));
- wr_u32_le(dst + 4,
- aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, AA64_PLT_SCRATCH_X16,
- /*imm12_scaled=*/0));
- wr_u32_le(dst + 8, aa64_br(AA64_PLT_SCRATCH_X16));
-
- out[0].offset_in_stub = 0;
- out[0].width = 4;
- out[0].kind = R_AARCH64_ADR_PREL_PG_HI21;
- out[1].offset_in_stub = 4;
- out[1].width = 4;
- out[1].kind = R_AARCH64_LDST64_ABS_LO12_NC;
- return 2;
-}
-
-static void aa64_emit_macho_stub(u8* out, u64 stub_vaddr, u64 got_slot_vaddr) {
- i64 page_s = ((i64)got_slot_vaddr) & ~(i64)0xfff;
- i64 page_p = ((i64)stub_vaddr) & ~(i64)0xfff;
- i64 imm21 = (page_s - page_p) >> 12;
- u32 immlo = (u32)(imm21 & 0x3u);
- u32 immhi = (u32)((imm21 >> 2) & 0x7ffffu);
- u32 lo12 = (u32)(got_slot_vaddr & 0xfffu);
- u32 imm12_ldr = (lo12 >> 3) & 0xfffu;
-
- wr_u32_le(out + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi));
- wr_u32_le(out + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16,
- AA64_PLT_SCRATCH_X16, imm12_ldr));
- wr_u32_le(out + 8, aa64_br(AA64_PLT_SCRATCH_X16));
-}
-
-static int aa64_is_branch_reloc(RelocKind kind) {
- return kind == R_AARCH64_CALL26 || kind == R_AARCH64_JUMP26;
-}
-
-static int aa64_is_got_load_reloc(RelocKind kind) {
- return kind == R_AARCH64_ADR_GOT_PAGE || kind == R_AARCH64_LD64_GOT_LO12_NC;
-}
-
-static int aa64_is_tlvp_reloc(RelocKind kind) {
- return kind == R_AARCH64_TLVP_LOAD_PAGE21 ||
- kind == R_AARCH64_TLVP_LOAD_PAGEOFF12;
-}
-
-static int aa64_is_direct_page_reloc(RelocKind kind) {
- switch (kind) {
- case R_AARCH64_ADR_PREL_PG_HI21:
- case R_AARCH64_ADR_PREL_PG_HI21_NC:
- case R_AARCH64_ADD_ABS_LO12_NC:
- case R_AARCH64_LDST8_ABS_LO12_NC:
- case R_AARCH64_LDST16_ABS_LO12_NC:
- case R_AARCH64_LDST32_ABS_LO12_NC:
- case R_AARCH64_LDST64_ABS_LO12_NC:
- case R_AARCH64_LDST128_ABS_LO12_NC:
- return 1;
- default:
- return 0;
- }
-}
-
-const LinkArchDesc link_arch_aa64 = {
- .e_machine = EM_AARCH64,
- .default_musl_interp = "/lib/ld-musl-aarch64.so.1",
-
- .elf_r_relative = ELF_R_AARCH64_RELATIVE,
- .elf_r_glob_dat = ELF_R_AARCH64_GLOB_DAT,
- .elf_r_jump_slot = ELF_R_AARCH64_JUMP_SLOT,
-
- .macho_cputype = CPU_TYPE_ARM64,
- .macho_cpusubtype = CPU_SUBTYPE_ARM64_ALL,
-
- .plt0_size = AA64_PLT0_SIZE,
- .plt_entry_size = AA64_PLT_ENTRY_SIZE,
- .iplt_stub_size = AA64_IPLT_STUB_SIZE,
-
- .emit_plt0 = aa64_emit_plt0,
- .emit_plt_entry = aa64_emit_plt_entry,
- .emit_iplt_stub = aa64_emit_iplt_stub,
- .macho_stub_size = AA64_IPLT_STUB_SIZE,
- .emit_macho_stub = aa64_emit_macho_stub,
-
- .is_branch_reloc = aa64_is_branch_reloc,
- .is_got_load_reloc = aa64_is_got_load_reloc,
- .is_tlvp_reloc = aa64_is_tlvp_reloc,
- .is_direct_page_reloc = aa64_is_direct_page_reloc,
- .needs_jit_call_stub = aa64_is_branch_reloc,
-};
diff --git a/src/link/link_arch_rv64.c b/src/link/link_arch_rv64.c
@@ -1,95 +0,0 @@
-/* RV64 link-time arch descriptor. See link_arch.h for the contract.
- *
- * The PLT0/PLT-entry/IPLT-stub byte layouts here mirror what used to
- * live inline in link_dyn.c (PLT) and link_layout.c (IPLT) before the
- * vtable refactor; comments preserve the WHY (notably the +0x800 bias
- * on AUIPC immediates). */
-
-#include "arch/rv64_isa.h"
-#include "core/bytes.h"
-#include "core/core.h"
-#include "link/link_arch.h"
-#include "obj/elf.h"
-
-/* PLT0 is 8 canonical NOPs (32 bytes); each PLT entry and IPLT stub is
- * 4 instructions (16 bytes) / 3 instructions (12 bytes) respectively.
- * Encoded once here so the descriptor and emitters stay in sync. */
-#define RV64_PLT0_SIZE 32u
-#define RV64_PLT_ENTRY_SIZE 16u
-#define RV64_IPLT_STUB_SIZE 12u
-
-/* Split a PC-relative displacement into the (hi20, lo12) pair consumed
- * by the AUIPC + I-type sequence. The +0x800 bias is the standard
- * RISC-V two-instruction PCREL trick: AUIPC adds an upper-20 immediate
- * shifted left 12, then the second instruction adds a sign-extended
- * 12-bit lo12. If we naively split disp into (disp>>12, disp&0xfff)
- * the lo12 sign-extends as a *negative* number whenever bit 11 is set,
- * which underflows the AUIPC result by 0x1000. Adding 0x800 before
- * the shift rounds the high half up in exactly the cases that need it
- * so AUIPC + sign-extended-lo12 reconstructs disp correctly. */
-static inline void rv64_split_pcrel(i64 disp, u32* hi20_out, u32* lo12_out) {
- *hi20_out = (u32)(((u64)(disp + 0x800)) >> 12) & 0xfffffu;
- *lo12_out = (u32)((u64)disp & 0xfffu);
-}
-
-/* PLT0 under DF_1_NOW is never executed — the loader resolves every
- * JUMP_SLOT before transferring control — but we still emit it in
- * canonical form (8 NOPs) so disassemblers and unwinders see a well-
- * formed prologue at the top of .plt. */
-static void rv64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
- u32 i;
- (void)plt0_vaddr;
- (void)gotplt_vaddr;
- for (i = 0; i < RV64_PLT0_SIZE; i += 4u) wr_u32_le(dst + i, rv_nop());
-}
-
-/* Per-import PLT entry: load the GOT slot pre-filled by the loader
- * (R_RISCV_JUMP_SLOT) and tail-call through it. t1 is the standard
- * psABI scratch for the trampoline return-address (clobbered by the
- * lazy resolver in the non-BIND_NOW path); t3 holds the slot pointer. */
-static void rv64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
- i64 disp = (i64)slot_vaddr - (i64)entry_vaddr;
- u32 hi20;
- u32 lo12;
- rv64_split_pcrel(disp, &hi20, &lo12);
- wr_u32_le(dst + 0, rv_auipc(RV_T3, hi20));
- wr_u32_le(dst + 4, rv_ld(RV_T3, RV_T3, (i32)lo12));
- wr_u32_le(dst + 8, rv_jalr(RV_T1, RV_T3, 0));
- wr_u32_le(dst + 12, rv_nop());
-}
-
-/* IPLT stub: load .igot.plt[i] (filled at startup by the resolver) and
- * tail-call to it. The stub->slot displacement is invariant under the
- * segment-base shift (both addresses live in the same image), so we
- * bake it directly into the instructions and report zero apply-time
- * relocs — unlike aarch64, which cannot encode a 32-bit pcrel inline. */
-static u32 rv64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
- LinkArchIPltReloc out[2]) {
- i64 disp = (i64)slot_vaddr - (i64)stub_vaddr;
- u32 hi20;
- u32 lo12;
- (void)out;
- rv64_split_pcrel(disp, &hi20, &lo12);
- wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20));
- wr_u32_le(dst + 4, rv_ld(RV_T1, RV_T1, (i32)lo12));
- wr_u32_le(dst + 8, rv_jr(RV_T1));
- return 0u;
-}
-
-const LinkArchDesc link_arch_rv64 = {
- .e_machine = EM_RISCV,
- .default_musl_interp = "/lib/ld-musl-riscv64.so.1",
- /* RISC-V psABI has no dedicated GLOB_DAT — GOT-slot data imports
- * use the generic absolute-64 reloc instead. */
- .elf_r_relative = ELF_R_RISCV_RELATIVE,
- .elf_r_glob_dat = ELF_R_RISCV_64,
- .elf_r_jump_slot = ELF_R_RISCV_JUMP_SLOT,
- .plt0_size = RV64_PLT0_SIZE,
- .plt_entry_size = RV64_PLT_ENTRY_SIZE,
- .iplt_stub_size = RV64_IPLT_STUB_SIZE,
- .global_pointer_symbol = "__global_pointer$",
- .global_pointer_rw_offset = 0x800u,
- .emit_plt0 = rv64_emit_plt0,
- .emit_plt_entry = rv64_emit_plt_entry,
- .emit_iplt_stub = rv64_emit_iplt_stub,
-};
diff --git a/src/link/link_arch_x64.c b/src/link/link_arch_x64.c
@@ -1,77 +0,0 @@
-/* x86_64 link-time arch descriptor.
- *
- * Implements the LinkArchDesc contract from link/link_arch.h for
- * EM_X86_64. The PLT/IPLT byte sequences here mirror the inline
- * encodings previously living in link_dyn.c (PLT0 + per-import entry)
- * and link_layout.c (IPLT stub) — kept identical byte-for-byte so the
- * descriptor switchover is a pure refactor. All raw byte values come
- * from named constants / inline writers in arch/x64_isa.h. */
-
-#include "link/link_arch.h"
-
-#include "arch/x64_isa.h"
-#include "core/bytes.h"
-#include "core/core.h"
-#include "obj/elf.h"
-
-/* PLT0 layout under DF_1_NOW: never executed (loader pre-binds every
- * slot via .rela.plt before user code runs), so we just emit 32 bytes
- * of single-byte NOPs. Self-documenting and trivially well-formed for
- * disassemblers and unwinders that walk the section. */
-static void x64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) {
- (void)plt0_vaddr;
- (void)gotplt_vaddr;
- x64_write_nop_pad(dst, 32u);
-}
-
-/* Per-import PLT entry (16 B):
- *
- * ff 25 disp32 ; jmpq *[rip + disp_to_slot] (6 B)
- * 90 90 90 90 90 90 90 90 90 90 ; pad to 16 with single-byte NOPs
- *
- * disp32 is measured from the END of the JMP (entry_vaddr + 6) to the
- * .got.plt slot. The 10-byte tail matches link_dyn.c's prior
- * memset(0x90)+patch behavior exactly. */
-static void x64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
- i64 disp = (i64)slot_vaddr - (i64)(entry_vaddr + X64_JMP_RIPREL_SIZE);
- i32 disp32 = (i32)(u32)((u64)disp & 0xffffffffu);
- x64_write_jmp_riprel(dst, disp32);
- x64_write_nop_pad(dst + X64_JMP_RIPREL_SIZE,
- 16u - X64_JMP_RIPREL_SIZE);
-}
-
-/* IPLT (ifunc) trampoline stub (12 B):
- *
- * ff 25 disp32 ; jmpq *[rip + disp_to_slot] (6 B)
- * 66 0f 1f 44 00 00 ; 6-byte multibyte NOP (6 B)
- *
- * Like the PLT entry, disp32 is from the END of the JMP to the
- * .igot.plt slot. The displacement is invariant under image-base
- * shift (both ends move together), so it's encoded inline and we
- * report zero apply-time relocations. */
-static u32 x64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
- LinkArchIPltReloc out[2]) {
- (void)out;
- i64 disp = (i64)slot_vaddr - (i64)(stub_vaddr + X64_JMP_RIPREL_SIZE);
- i32 disp32 = (i32)(u32)((u64)disp & 0xffffffffu);
- x64_write_jmp_riprel(dst, disp32);
- x64_write_nop6(dst + X64_JMP_RIPREL_SIZE);
- return 0;
-}
-
-const LinkArchDesc link_arch_x64 = {
- .e_machine = EM_X86_64,
- .default_musl_interp = "/lib/ld-musl-x86_64.so.1",
-
- .elf_r_relative = ELF_R_X86_64_RELATIVE,
- .elf_r_glob_dat = ELF_R_X86_64_GLOB_DAT,
- .elf_r_jump_slot = ELF_R_X86_64_JUMP_SLOT,
-
- .plt0_size = 32u,
- .plt_entry_size = 16u,
- .iplt_stub_size = 12u,
-
- .emit_plt0 = x64_emit_plt0,
- .emit_plt_entry = x64_emit_plt_entry,
- .emit_iplt_stub = x64_emit_iplt_stub,
-};
diff --git a/src/link/link_dyn.c b/src/link/link_dyn.c
@@ -514,7 +514,7 @@ void layout_dyn(Linker* l, LinkImage* img) {
img->pie = 1;
/* PT_INTERP path. Default to the canonical musl loader matching the
- * target arch (per-arch table in link_arch_<arch>.c) when the caller
+ * target arch (per-arch table in src/arch/<arch>/link.c) when the caller
* didn't set one. Drivers like cfree-cc always override via
* link_set_interp_path; this default is correctness for direct
* libcfree consumers. glibc users have to set their interp
diff --git a/test/arch/aa64_inline_test.c b/test/arch/aa64_inline_test.c
@@ -24,7 +24,7 @@
#include <stdlib.h>
#include <string.h>
-#include "arch/aa64_asm.h"
+#include "arch/aa64/asm.h"
#include "arch/arch.h"
#include "core/buf.h"
#include "core/core.h"
diff --git a/test/arch/aa64_isa_test.c b/test/arch/aa64_isa_test.c
@@ -7,14 +7,14 @@
* invariant: an alias-bearing word (e.g. ORR Rd, ZR, Rm) resolves to
* the alias spelling (MOV) rather than the canonical row.
*
- * Builds against the internal arch/aa64_isa.h surface (test.mk passes
+ * Builds against the internal arch/aa64/isa.h surface (test.mk passes
* -Isrc). No public-API dependency — this is a unit test of the
* descriptor table itself. */
#include <stdio.h>
#include <string.h>
-#include "arch/aa64_isa.h"
+#include "arch/aa64/isa.h"
#include "core/strbuf.h"
static int fails = 0;