kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 8ac1385246eaad90b7b36d075d58086f995dc88d
parent 9f744cea2b46ba8b526001fd7c4a425e531732ea
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 14 May 2026 13:13:49 -0700

Consolidate arch files into subdirs

Diffstat:
Mdoc/ASM.md | 10+++++-----
Mdoc/DBG.md | 8++++----
Mdoc/OPT1.md | 2+-
Mdoc/STAGE2.md | 6+++---
Mdoc/TAILCALL.md | 12++++++------
Ddoc/arch-registration-plan.md | 20--------------------
Msrc/api/stubs.c | 6+++---
Rsrc/arch/aa64.h -> src/arch/aa64/aa64.h | 0
Asrc/arch/aa64/alloc.c | 246+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/arch.c | 97+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/asm.c | 1379+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rsrc/arch/aa64_asm.h -> src/arch/aa64/asm.h | 0
Asrc/arch/aa64/dbg.c | 235+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/disasm.c | 133+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/disasm.h | 14++++++++++++++
Asrc/arch/aa64/emit.c | 523+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/internal.h | 306+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/isa.c | 598+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rsrc/arch/aa64_isa.h -> src/arch/aa64/isa.h | 0
Asrc/arch/aa64/link.c | 208+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/ops.c | 1925+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/opt_coord.c | 96+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/aa64/regs.c | 88+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rsrc/arch/aa64_regs.h -> src/arch/aa64/regs.h | 0
Dsrc/arch/aa64_asm.c | 1379-------------------------------------------------------------------------------
Dsrc/arch/aa64_disasm.c | 133-------------------------------------------------------------------------------
Dsrc/arch/aa64_disasm.h | 14--------------
Dsrc/arch/aa64_isa.c | 598-------------------------------------------------------------------------------
Dsrc/arch/aa64_regs.c | 88-------------------------------------------------------------------------------
Dsrc/arch/aarch64/alloc.c | 246-------------------------------------------------------------------------------
Dsrc/arch/aarch64/arch.c | 95-------------------------------------------------------------------------------
Dsrc/arch/aarch64/emit.c | 523-------------------------------------------------------------------------------
Dsrc/arch/aarch64/internal.h | 306-------------------------------------------------------------------------------
Dsrc/arch/aarch64/ops.c | 1925-------------------------------------------------------------------------------
Dsrc/arch/aarch64/opt_coord.c | 96-------------------------------------------------------------------------------
Msrc/arch/rv64/arch.c | 4+++-
Msrc/arch/rv64/internal.h | 4++--
Rsrc/arch/rv64_isa.h -> src/arch/rv64/isa.h | 0
Asrc/arch/rv64/link.c | 95+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rsrc/arch/rv64.h -> src/arch/rv64/rv64.h | 0
Msrc/arch/x64/alloc.c | 4++--
Msrc/arch/x64/arch.c | 4+++-
Msrc/arch/x64/emit.c | 4++--
Msrc/arch/x64/internal.h | 6+++---
Asrc/arch/x64/isa.h | 128+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/x64/link.c | 77+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/arch/x64/ops.c | 4++--
Rsrc/arch/x64.h -> src/arch/x64/x64.h | 0
Dsrc/arch/x64_isa.h | 128-------------------------------------------------------------------------------
Dsrc/dbg/arch_aa64.c | 235-------------------------------------------------------------------------------
Msrc/dbg/dbg.h | 4++--
Msrc/dbg/displaced.c | 2+-
Msrc/link/link_arch.h | 10++--------
Dsrc/link/link_arch_aa64.c | 208-------------------------------------------------------------------------------
Dsrc/link/link_arch_rv64.c | 95-------------------------------------------------------------------------------
Dsrc/link/link_arch_x64.c | 77-----------------------------------------------------------------------------
Msrc/link/link_dyn.c | 2+-
Mtest/arch/aa64_inline_test.c | 2+-
Mtest/arch/aa64_isa_test.c | 4++--
59 files changed, 6196 insertions(+), 6216 deletions(-)

diff --git a/doc/ASM.md b/doc/ASM.md @@ -74,18 +74,18 @@ src/parse/parse_asm_helpers.h src/parse/parse.c parse_asm_stmt: GNU asm("...") statement grammar (volatile, goto, four colon-separated lists, [name] symbolic operands). -src/arch/aa64_isa.{h,c} per-format pack/unpack/print + AA64InsnDesc +src/arch/aa64/isa.{h,c} per-format pack/unpack/print + AA64InsnDesc table + alias flags. Shared between encoder, decoder, and printer. -src/arch/aa64_asm.{h,c} aa64 instruction parser: per-mnemonic dispatch +src/arch/aa64/asm.{h,c} aa64 instruction parser: per-mnemonic dispatch over the table → inline encoders. aa64_inline_bind + aa64_asm_run_template implement the inline-asm template walker. -src/arch/aa64_disasm.{h,c} aa64 ArchDisasm impl wrapping aa64_disasm_find + +src/arch/aa64/disasm.{h,c} aa64 ArchDisasm impl wrapping aa64_disasm_find + aa64_print_operands; synthesizes b.<cond>. -src/arch/aa64_regs.{h,c} canonical aarch64 register name list. +src/arch/aa64/regs.{h,c} canonical aarch64 register name list. src/arch/disasm.c arch_disasm_new dispatch on c->target.arch. -src/arch/aarch64.c aa_asm_block: CGTarget vtable entry for inline +src/arch/aa64/arch.c aa_asm_block: CGTarget vtable entry for inline asm; opens AA64Asm, binds operands, runs template, closes. src/cg/cg.c cg_inline_asm: constraint binder (pops inputs, diff --git a/doc/DBG.md b/doc/DBG.md @@ -48,7 +48,7 @@ src/ bp.c breakpoint patch table (addr -> saved bytes, refcount) step.c resume-mode state machine (insn / line / next / out) displaced.c arch-neutral plumbing for out-of-line execution - arch_aa64.c aa64 BRK encoding + PC-relative fixups for displaced + arch/aa64/dbg.c aa64 BRK encoding + PC-relative fixups for displaced arch_x64.c (later) arch_rv64.c (later) mem.c read/write_mem with sigsetjmp bad-address guard @@ -215,7 +215,7 @@ Invariants: ## 7. 
Software breakpoints -aa64-specific encoding lives in `src/dbg/arch_aa64.c`; everything else +aa64-specific encoding lives in `src/arch/aa64/dbg.c`; everything else in `src/dbg/bp.c` is arch-neutral. - Patch instruction: `BRK #0` (4 bytes on aa64; `0xCC` on x64 later). @@ -384,10 +384,10 @@ the box. - [x] `bp.c` — refcounted patch table, idempotent set/clear, read overlay - [x] `mem.c` — guarded read/write via `dbg_os->guarded_copy` - [x] `displaced.c` — scratch page + per-insn shim primitive -- [x] `arch_aa64.c` — verbatim copy + B / BL / B.cond / CBZ / CBNZ / +- [x] `arch/aa64/dbg.c` — verbatim copy + B / BL / B.cond / CBZ / CBNZ / TBZ / TBNZ / ADR / ADRP / LDR-lit (W/X/SW) / BR / BLR / RET - [x] `step.c` — `STEP_LINE` / `NEXT_LINE` / `STEP_OUT` state machines -- [ ] `arch_aa64.c`: LDR-literal vector forms (S/D/Q register dest); +- [ ] `arch/aa64/dbg.c`: LDR-literal vector forms (S/D/Q register dest); currently decline. Common in optimized builds. - [ ] `arch_x64.c`: INT3 + RIP-relative fixups for the same insn family - [ ] `arch_rv64.c`: EBREAK + AUIPC/JAL/branch fixups diff --git a/doc/OPT1.md b/doc/OPT1.md @@ -42,7 +42,7 @@ substitute a behaviorally similar shortcut without updating both documents. ## Completion Notes - Implemented in `src/opt/pass_lower.c`, `src/opt/opt.c`, `src/opt/ir.h`, - `src/arch/aarch64/opt_coord.c`, `src/arch/x64/opt_coord.c`, + `src/arch/aa64/opt_coord.c`, `src/arch/x64/opt_coord.c`, `src/arch/rv64/opt_coord.c`, and `test/opt/opt_test.c`. - Added `opt_dead_def_elim` pass (pre-RA backward walk with dynamic liveness, removes cascading dead defs before rewrite). diff --git a/doc/STAGE2.md b/doc/STAGE2.md @@ -100,8 +100,8 @@ not been switched back on. `src/debug/` and `src/link/`. - [x] **B8.** `sizeof` accepts the no-parens **unary-expression** form in constant-expression contexts (e.g. file-scope initializers). C99 - §6.5.3.4 standard, not an extension. Blocked `src/arch/aa64_isa.c` - and `src/arch/aa64_regs.c`. 
+ §6.5.3.4 standard, not an extension. Blocked `src/arch/aa64/isa.c` + and `src/arch/aa64/regs.c`. - [x] **B9.** Block-scope `static T name[] = {...}` now completes the incomplete array, mirroring B6's file-scope fix. Was blocking `src/pp/pp.c`. @@ -113,7 +113,7 @@ not been switched back on. - [x] **C2.** `OPK_INDIRECT` on the indirect-return path (commit f2d3e01). - [x] **C0.** Stage-1 regalloc "no spillable victim (class 0)" panic - fixed — was choking on the complex functions in `src/arch/aarch64.c`, + fixed — was choking on the complex functions in `src/arch/aa64/arch.c`, `src/arch/rv64.c`, `src/cg/cg.c`, and `src/opt/opt.c`. Not a feature gap; a regalloc bug surfaced by self-host pressure. diff --git a/doc/TAILCALL.md b/doc/TAILCALL.md @@ -90,7 +90,7 @@ When `flags & CG_CALL_TAIL`: ## Step 3 — AArch64 backend -### `src/arch/aarch64/internal.h` +### `src/arch/aa64/internal.h` Worst-case inline teardown: 5 int-pair LDPs (x19–x28) + 4 fp-pair LDPs (d8–d15) + 1 fp/lr LDP + 2 SP-add instructions = 12; use 14 for headroom. @@ -114,7 +114,7 @@ a->ntail_sites = 0; a->tail_sites_cap = 0; ``` -### `src/arch/aarch64/ops.c` — `aa_call` +### `src/arch/aa64/ops.c` — `aa_call` After the `emit_arg_value` loop and `max_outgoing` update, before the existing BL/BLR emission: @@ -159,7 +159,7 @@ if (d->flags & CG_CALL_TAIL) { `aa_tail_site_push` is a small grow-array helper consistent with the existing `add_patches` pattern. -### `src/arch/aarch64/emit.c` — `aa_func_end` +### `src/arch/aa64/emit.c` — `aa_func_end` After computing `n_int_pairs`, `n_fp_pairs`, `frame_size`, `int_save_off`, `fp_save_off`, `fp_lr_off` — before placing the epilogue label — patch each @@ -227,8 +227,8 @@ compile and run via `cfree run` and verify correctness. 
| `src/parse/parse_stmt.c` | attribute prefix detection; musttail return path | | `src/parse/parse_expr.c` | `cg_tail_call` dispatch when `in_musttail` | | `src/cg/cg.c` | factor `cg_call_impl`; implement `cg_tail_call` | -| `src/arch/aarch64/internal.h` | constants, `AATailCallSite`, fields in `AAImpl` | -| `src/arch/aarch64/ops.c` | tail-call branch in `aa_call`; `aa_tail_site_push` | -| `src/arch/aarch64/emit.c` | init in `aa_func_begin`; patch loop in `aa_func_end` | +| `src/arch/aa64/internal.h` | constants, `AATailCallSite`, fields in `AAImpl` | +| `src/arch/aa64/ops.c` | tail-call branch in `aa_call`; `aa_tail_site_push` | +| `src/arch/aa64/emit.c` | init in `aa_func_begin`; patch loop in `aa_func_end` | | `test/parse/` | musttail attribute parse test | | `test/cg/` | direct/indirect/e2e tail call tests | diff --git a/doc/arch-registration-plan.md b/doc/arch-registration-plan.md @@ -1,20 +0,0 @@ -# Architecture Registration Plan - -## Checklist - -- [x] Introduce one internal arch descriptor and registry lookup. -- [x] Route existing arch dispatchers through that descriptor without changing behavior. -- [x] Move ABI selection behind the arch descriptor. -- [x] Move object-format relocation translators behind the arch descriptor. -- [x] Move linker-only arch constants and stub emitters fully behind the descriptor. -- [x] Move assembler/disassembler/register helpers behind arch-owned implementation files. -- [x] Make `MCEmitter` delegate label fixup encoding to the arch descriptor. -- [ ] Consolidate files into `src/arch/{aa64,rv64,x64}/` with one exposed implementation object per arch. -- [ ] Teach the build to honor `CFREE_ARCHS` and compile only selected arch subtrees. -- [ ] Add targeted subset-build tests for `aa64`, `x64`, `rv64`, and mixed subsets. - -## Phase 1 - -Phase 1 is a refactor-only step. 
It adds the shared descriptor boundary and -rewires existing centralized dispatchers to use it while all currently supported -architectures remain compiled in by default. diff --git a/src/api/stubs.c b/src/api/stubs.c @@ -84,9 +84,9 @@ ObjBuilder* read_wasm(Compiler* c, const char* n, const u8* d, size_t l) { /* Header-dep iterator lives in src/api/dep.c. */ /* Disassembler is real (src/api/disasm.c, src/arch/disasm.c, - * src/arch/aa64_disasm.c). Per-arch register name lookups and the + * src/arch/aa64/disasm.c). Per-arch register name lookups and the * indexed enumeration (cfree_arch_register_count / _at) are real - * (src/api/arch_regs.c + src/arch/aa64_regs.c). */ + * (src/api/arch_regs.c + src/arch/aa64/regs.c). */ /* Linker script parsing lives in src/link/link_script.c. */ @@ -94,7 +94,7 @@ ObjBuilder* read_wasm(Compiler* c, const char* n, const u8* d, size_t l) { * src/link/link_jit.c. */ /* JIT session implementation lives in src/dbg/ (session.c, bp.c, step.c, - * displaced.c, arch_aa64.c, mem.c). */ + * displaced.c, arch/aa64/dbg.c, mem.c). */ /* DWARF consumer: the cfree_dwarf_* implementations live in src/debug/. * Their stubs were removed when src/debug/dwarf_*.c took ownership of diff --git a/src/arch/aa64.h b/src/arch/aa64/aa64.h diff --git a/src/arch/aa64/alloc.c b/src/arch/aa64/alloc.c @@ -0,0 +1,246 @@ +/* aarch64/alloc.c — spill/reload, labels, control flow, structured scopes. 
*/ + +#include "arch/aa64/internal.h" + +/* ============================================================ + * AAImpl accessor + * ============================================================ */ + +AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; } + +/* ============================================================ + * Slot accessor + * ============================================================ */ + +AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs) { + if (fs == FRAME_SLOT_NONE || fs > a->nslots) return NULL; + return &a->slots[fs - 1]; +} + +static int aa_resolve_reg_name(CGTarget* t, Sym name, Reg* out, + RegClass* cls_out) { + (void)t; + size_t len = 0; + const char* s = pool_str(t->c->global, name, &len); + if (!s || !len) return 1; + char buf[8]; + if (len >= sizeof buf) return 1; + memcpy(buf, s, len); + buf[len] = '\0'; + u32 dwarf; + if (aa64_register_index(buf, &dwarf) != 0) return 1; + if (dwarf <= 30u) { + if (out) *out = (Reg)dwarf; + if (cls_out) *cls_out = RC_INT; + return 0; + } + if (dwarf >= 64u && dwarf <= 95u) { + if (out) *out = (Reg)(dwarf - 64u); + if (cls_out) *cls_out = RC_FP; + return 0; + } + return 1; +} + +static void aa_spill_reg(CGTarget* t, Operand src, FrameSlot slot, + MemAccess ma) { + AAImpl* a = impl_of(t); + if (src.kind != OPK_REG) { + compiler_panic(t->c, a->loc, "aarch64 spill_reg: src is not OPK_REG"); + } + Operand addr; + memset(&addr, 0, sizeof addr); + addr.kind = OPK_LOCAL; + addr.cls = RC_INT; + addr.type = ma.type; + addr.v.frame_slot = slot; + aa_store(t, addr, src, ma); +} + +static void aa_reload_reg(CGTarget* t, Operand dst, FrameSlot slot, + MemAccess ma) { + AAImpl* a = impl_of(t); + if (dst.kind != OPK_REG) { + compiler_panic(t->c, a->loc, "aarch64 reload_reg: dst is not OPK_REG"); + } + Operand addr; + memset(&addr, 0, sizeof addr); + addr.kind = OPK_LOCAL; + addr.cls = RC_INT; + addr.type = ma.type; + addr.v.frame_slot = slot; + aa_load(t, dst, addr, ma); +} + +/* 
============================================================ + * Labels / control flow + * ============================================================ */ + +static Label aa_label_new(CGTarget* t) { + return (Label)t->mc->label_new(t->mc); +} + +static void aa_label_place(CGTarget* t, Label l) { + t->mc->label_place(t->mc, (MCLabel)l); +} + +void aa_jump(CGTarget* t, Label l) { + MCEmitter* mc = t->mc; + aa64_emit32(mc, aa64_b_base()); + mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_JUMP26, 4, 0); +} + +static u32 cmp_to_cond(CmpOp op) { + switch (op) { + case CMP_EQ: return 0x0u; + case CMP_NE: return 0x1u; + case CMP_LT_U: return 0x3u; + case CMP_LE_U: return 0x9u; + case CMP_GT_U: return 0x8u; + case CMP_GE_U: return 0x2u; + case CMP_LT_S: return 0xbu; + case CMP_LE_S: return 0xdu; + case CMP_GT_S: return 0xcu; + case CMP_GE_S: return 0xau; + default: return 0x0u; + } +} + +void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) { + MCEmitter* mc = t->mc; + u32 sf = type_is_64(a_op.type) ? 1u : 0u; + if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) { + u32 imm12, sh; + if (aa64_addsub_imm_fits(b_op.v.imm, &imm12, &sh)) { + u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); + aa64_emit32(mc, aa64_subs_imm12(sf, /*Rd=ZR*/ 31u, rn, imm12, sh)); + return; + } + } + u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); + u32 rm = + aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0); + aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, rn, rm)); +} + +static void aa_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b, + Label l) { + MCEmitter* mc = t->mc; + emit_cmp_ab(t, a, b); + aa64_emit32(mc, aa64_b_cond(cmp_to_cond(op))); + mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_CONDBR19, 4, 0); +} + +static void aa_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) { + emit_cmp_ab(t, a, b); + u32 sf_dst = type_is_64(dst.type) ? 
1u : 0u; + aa64_emit32(t->mc, aa64_cset(sf_dst, reg_num(dst), cmp_to_cond(op))); +} + +/* ============================================================ + * Structured scopes + * ============================================================ */ + +static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) { + AAImpl* a = impl_of(t); + if (a->nscopes == a->scopes_cap) { + u32 ncap = a->scopes_cap ? a->scopes_cap * 2u : 4u; + AAScope* nb = arena_array(t->c->tu, AAScope, ncap); + if (a->scopes) memcpy(nb, a->scopes, sizeof(AAScope) * a->nscopes); + a->scopes = nb; + a->scopes_cap = ncap; + } + AAScope* sc = &a->scopes[a->nscopes]; + sc->kind = (u8)d->kind; + sc->has_else = 0; + sc->else_label = 0; + sc->end_label = 0; + sc->break_label = d->break_label; + sc->continue_label = d->continue_label; + + if (d->kind == SCOPE_IF) { + sc->else_label = t->mc->label_new(t->mc); + sc->end_label = t->mc->label_new(t->mc); + u32 sf = type_is_64(d->cond.type) ? 1u : 0u; + u32 rn = aa64_force_reg_int(t, d->cond, sf, AA_TMP0); + aa64_emit32(t->mc, aa64_subs_imm(sf, /*Rd=ZR*/ 31u, rn, 0)); + aa64_emit32(t->mc, aa64_b_cond(0x0u /*EQ*/)); + t->mc->emit_label_ref(t->mc, sc->else_label, R_AARCH64_CONDBR19, 4, 0); + } else if (d->kind == SCOPE_LOOP || d->kind == SCOPE_BLOCK) { + /* bookkeep only */ + } else { + compiler_panic(t->c, a->loc, + "aarch64 scope_begin: kind %d not yet implemented", + (int)d->kind); + } + + a->nscopes++; + return (CGScope)a->nscopes; +} + +static void aa_scope_else(CGTarget* t, CGScope s) { + AAImpl* a = impl_of(t); + if (s == CG_SCOPE_NONE || s > a->nscopes) { + compiler_panic(t->c, a->loc, "aarch64 scope_else: bad scope %u", + (unsigned)s); + } + AAScope* sc = &a->scopes[s - 1]; + aa64_emit32(t->mc, aa64_b_base()); + t->mc->emit_label_ref(t->mc, sc->end_label, R_AARCH64_JUMP26, 4, 0); + t->mc->label_place(t->mc, sc->else_label); + sc->has_else = 1; +} + +static void aa_scope_end(CGTarget* t, CGScope s) { + AAImpl* a = impl_of(t); + if (s == CG_SCOPE_NONE || s 
> a->nscopes) { + compiler_panic(t->c, a->loc, "aarch64 scope_end: bad scope %u", + (unsigned)s); + } + AAScope* sc = &a->scopes[s - 1]; + if (sc->kind == SCOPE_IF) { + if (!sc->has_else) { + t->mc->label_place(t->mc, sc->else_label); + } + t->mc->label_place(t->mc, sc->end_label); + } +} + +static void aa_break_to(CGTarget* t, CGScope s) { + AAImpl* a = impl_of(t); + if (s == CG_SCOPE_NONE || s > a->nscopes) { + compiler_panic(t->c, a->loc, "aarch64 break_to: bad scope %u", (unsigned)s); + } + AAScope* sc = &a->scopes[s - 1]; + aa_jump(t, sc->break_label); +} + +static void aa_continue_to(CGTarget* t, CGScope s) { + AAImpl* a = impl_of(t); + if (s == CG_SCOPE_NONE || s > a->nscopes) { + compiler_panic(t->c, a->loc, "aarch64 continue_to: bad scope %u", + (unsigned)s); + } + AAScope* sc = &a->scopes[s - 1]; + aa_jump(t, sc->continue_label); +} + +/* Expose vtable entries to ops.c constructor via a registration helper. + * ops.c calls this after the basic ops vtable is populated. */ +void aa_alloc_vtable_init(CGTarget* t) { + t->spill_reg = aa_spill_reg; + t->reload_reg = aa_reload_reg; + t->resolve_reg_name = aa_resolve_reg_name; + + t->label_new = aa_label_new; + t->label_place = aa_label_place; + t->jump = aa_jump; + t->cmp_branch = aa_cmp_branch; + t->cmp = aa_cmp; + + t->scope_begin = aa_scope_begin; + t->scope_else = aa_scope_else; + t->scope_end = aa_scope_end; + t->break_to = aa_break_to; + t->continue_to = aa_continue_to; +} diff --git a/src/arch/aa64/arch.c b/src/arch/aa64/arch.c @@ -0,0 +1,97 @@ +#include "arch/arch.h" + +#include "abi/abi_internal.h" +#include "arch/aa64/aa64.h" +#include "arch/aa64/asm.h" +#include "arch/aa64/disasm.h" +#include "arch/aa64/regs.h" +#include "core/bytes.h" +#include "link/link_arch.h" +#include "obj/elf.h" +#include "obj/macho.h" +#include "obj/obj.h" + +extern const LinkArchDesc link_arch_aa64; + +static const ABIVtable* aa64_abi_vtable(Compiler* c, CfreeOSKind os) { + (void)c; + switch (os) { + case CFREE_OS_MACOS: + 
return &apple_arm64_vtable; + default: + return &aapcs64_vtable; + } +} + +static int aa64_register_at_public(uint32_t idx, CfreeArchReg* out) { + if (!out) return 1; + return aa64_register_iter_get(idx, &out->dwarf_idx, &out->name); +} + +static const ArchElfOps aa64_elf_ops = { + .e_machine = EM_AARCH64, + .e_flags = 0, + .reloc_to = elf_aarch64_reloc_to, + .reloc_from = elf_aarch64_reloc_from, +}; + +static const ArchMachoOps aa64_macho_ops = { + .cputype = CPU_TYPE_ARM64, + .cpusubtype = CPU_SUBTYPE_ARM64_ALL, + .reloc_to = macho_aarch64_reloc_to, + .reloc_pcrel = macho_aarch64_reloc_pcrel, + .reloc_length = macho_aarch64_reloc_length, + .reloc_from = macho_aarch64_reloc_from, +}; + +static int aa64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) { + const Section* s; + u8 cur[4]; + u32 word; + + (void)c; + if (!fx || fx->width != 4) return 1; + s = obj_section_get(fx->obj, fx->sec_id); + if (!s) return 0; + buf_read(&s->bytes, fx->offset, cur, 4); + word = rd_u32_le(cur); + + switch (fx->kind) { + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: { + i64 idisp = fx->disp >> 2; + u32 imm26 = (u32)(idisp & 0x03ffffffu); + word = (word & ~0x03ffffffu) | imm26; + break; + } + case R_AARCH64_CONDBR19: { + i64 idisp = fx->disp >> 2; + u32 imm19 = (u32)(idisp & 0x7ffffu); + word = (word & ~(0x7ffffu << 5)) | (imm19 << 5); + break; + } + default: + return 1; + } + + wr_u32_le(cur, word); + obj_patch(fx->obj, fx->sec_id, fx->offset, cur, 4); + return 0; +} + +const ArchImpl arch_impl_aa64 = { + .kind = CFREE_ARCH_ARM_64, + .name = "aa64", + .abi_vtable = aa64_abi_vtable, + .cgtarget_new = aa64_cgtarget_new, + .asm_new = aa64_arch_asm_new, + .disasm_new = aa64_disasm_new, + .apply_label_fixup = aa64_apply_label_fixup, + .link = &link_arch_aa64, + .elf = &aa64_elf_ops, + .macho = &aa64_macho_ops, + .register_name = aa64_register_name, + .register_index = aa64_register_index, + .register_count = aa64_register_iter_size, + .register_at = aa64_register_at_public, +}; 
diff --git a/src/arch/aa64/asm.c b/src/arch/aa64/asm.c @@ -0,0 +1,1379 @@ +/* AArch64 standalone .s instruction parser. + * + * Per-mnemonic dispatch: each entry in the mnemonic table names a + * parse function that reads operand tokens through the asm-driver + * surface and emits the encoded word via the inline encoders in + * aa64_isa.h. Encoders are the single source of truth for bit + * layout — the disassembler shares them through aa64_*_unpack. + * + * Aliases (`mov`, `neg`, `cmp`, `mul`, ...) live in this table as + * dedicated rows that pick the canonical form's encoder with the + * alias-specific operand shape. When a mnemonic admits multiple + * forms (e.g. `mov` register-vs-immediate, `add` register-vs- + * immediate), the parser branches on operand shape after reading + * the first non-Rd operand. */ + +#include "arch/aa64/asm.h" + +#include <string.h> + +#include "arch/aa64/isa.h" +#include "arch/aa64/regs.h" +#include "arch/arch.h" +#include "core/arena.h" +#include "core/pool.h" +#include "core/strbuf.h" +#include "asm/asm_lex.h" +#include "obj/obj.h" +#include "asm/asm_helpers.h" + +/* ---- public handle ---- */ + +struct AA64Asm { + ArchAsm base; + Compiler* c; + + /* Inline-asm bound state (set by aa64_inline_bind, cleared otherwise). + * Operand indexing per GCC convention: 0..nout-1 are outputs, then + * nout..nout+nin-1 are inputs. Templates address into this combined + * list via %N / %wN / %xN / %aN. out_ops is mutable (the binder fills + * in result locations); in_ops + constraints + clobbers are read-only + * borrows. 
*/ + const AsmConstraint* outs; + Operand* out_ops; + const AsmConstraint* ins; + const Operand* in_ops; + const Sym* clobbers; + u32 nout; + u32 nin; + u32 nclob; +}; + +static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic); +static void aa64_arch_asm_destroy(ArchAsm* base); + +AA64Asm* aa64_asm_open(Compiler* c) { + AA64Asm* a = arena_new(c->tu, AA64Asm); + memset(a, 0, sizeof *a); + a->base.insn = aa64_arch_asm_insn; + a->base.destroy = aa64_arch_asm_destroy; + a->c = c; + return a; +} + +void aa64_asm_close(AA64Asm* a) { (void)a; } + +ArchAsm* aa64_arch_asm_new(Compiler* c) { + return &aa64_asm_open(c)->base; +} + +static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) { + aa64_asm_insn((AA64Asm*)base, d, mnemonic); +} + +static void aa64_arch_asm_destroy(ArchAsm* base) { + aa64_asm_close((AA64Asm*)base); +} + +void aa64_inline_bind(AA64Asm* a, + const AsmConstraint* outs, u32 nout, Operand* out_ops, + const AsmConstraint* ins, u32 nin, const Operand* in_ops, + const Sym* clobbers, u32 nclob) { + a->outs = outs; + a->out_ops = out_ops; + a->ins = ins; + a->in_ops = in_ops; + a->clobbers = clobbers; + a->nout = nout; + a->nin = nin; + a->nclob = nclob; +} + +/* ---- helpers ---- */ + +static int tok_punct(AsmTok t, u32 p) { return asm_driver_tok_is_punct(t, p); } + +static int icase_eq(const char* a, size_t an, const char* b) { + size_t i; + for (i = 0; i < an; ++i) { + char x = a[i], y = b[i]; + if (x >= 'A' && x <= 'Z') x = (char)(x + ('a' - 'A')); + if (y >= 'A' && y <= 'Z') y = (char)(y + ('a' - 'A')); + if (x != y || !y) return 0; + } + return b[an] == '\0'; +} + +/* Parse a register operand. Returns the 5-bit encoded register number + * via *reg_out and the form via *is64_out. 
Recognized forms (case- + * insensitive): + * w0..w30, wzr → is64=0, reg=0..30 / 31 + * x0..x30, xzr, lr (=x30) → is64=1, reg=0..30 / 31 + * sp → is64=1, reg=31 (sp_means_sp set) + * wsp → is64=0, reg=31 (sp_means_sp set) + * Aliases: + * fp = x29 + * ip0 = x16, ip1 = x17 (PLT scratch — useful for hand-written PLTs) */ +typedef struct AA64Reg { + u32 num; + u8 is64; + u8 is_sp; /* 1 if the spelling was "sp" / "wsp" */ + u8 is_fp; /* 1 for SIMD/FP register spellings accepted in FP forms */ + u8 pad; +} AA64Reg; + +static int parse_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) { + size_t n = 0; + const char* p = pool_str(asm_driver_pool(d), ident, &n); + if (!p || !n) return 0; + /* "sp" */ + if (icase_eq(p, n, "sp")) { + out->num = 31; + out->is64 = 1; + out->is_sp = 1; + out->is_fp = 0; + return 1; + } + if (icase_eq(p, n, "wsp")) { + out->num = 31; + out->is64 = 0; + out->is_sp = 1; + out->is_fp = 0; + return 1; + } + if (icase_eq(p, n, "lr")) { + out->num = 30; + out->is64 = 1; + out->is_sp = 0; + out->is_fp = 0; + return 1; + } + if (icase_eq(p, n, "fp")) { + out->num = 29; + out->is64 = 1; + out->is_sp = 0; + out->is_fp = 0; + return 1; + } + if (icase_eq(p, n, "ip0")) { + out->num = 16; + out->is64 = 1; + out->is_sp = 0; + out->is_fp = 0; + return 1; + } + if (icase_eq(p, n, "ip1")) { + out->num = 17; + out->is64 = 1; + out->is_sp = 0; + out->is_fp = 0; + return 1; + } + if (icase_eq(p, n, "xzr")) { + out->num = 31; + out->is64 = 1; + out->is_sp = 0; + out->is_fp = 0; + return 1; + } + if (icase_eq(p, n, "wzr")) { + out->num = 31; + out->is64 = 0; + out->is_sp = 0; + out->is_fp = 0; + return 1; + } + /* W/X<num> */ + if ((p[0] == 'w' || p[0] == 'W' || p[0] == 'x' || p[0] == 'X') && n >= 2) { + u32 r = 0; + size_t i; + for (i = 1; i < n; ++i) { + char c = p[i]; + if (c < '0' || c > '9') return 0; + r = r * 10 + (u32)(c - '0'); + if (r > 31) return 0; + } + out->num = r; + out->is64 = (p[0] == 'x' || p[0] == 'X') ? 
1 : 0; + out->is_sp = 0; + out->is_fp = 0; + return 1; + } + return 0; +} + +static int parse_fp_d_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) { + size_t n = 0; + const char* p = pool_str(asm_driver_pool(d), ident, &n); + if (!p || n < 2 || (p[0] != 'd' && p[0] != 'D')) return 0; + u32 r = 0; + for (size_t i = 1; i < n; ++i) { + char c = p[i]; + if (c < '0' || c > '9') return 0; + r = r * 10 + (u32)(c - '0'); + if (r > 31) return 0; + } + out->num = r; + out->is64 = 1; + out->is_sp = 0; + out->is_fp = 1; + return 1; +} + +static AA64Reg parse_reg(AsmDriver* d) { + AsmTok t = asm_driver_next(d); + AA64Reg r; + memset(&r, 0, sizeof r); + if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r)) + asm_driver_panic(d, "asm: expected register"); + return r; +} + +static AA64Reg parse_ldstp_reg(AsmDriver* d) { + AsmTok t = asm_driver_next(d); + AA64Reg r; + memset(&r, 0, sizeof r); + if (t.kind != ASM_TOK_IDENT || + (!parse_reg_from_ident(d, t.v.ident, &r) && + !parse_fp_d_reg_from_ident(d, t.v.ident, &r))) { + asm_driver_panic(d, "asm: expected register"); + } + return r; +} + +static void reject_sp_reg(AsmDriver* d, AA64Reg r, const char* what) { + if (r.is_sp) asm_driver_panic(d, "asm: %s: SP register not allowed", what); +} + +static void require_sp_spelling(AsmDriver* d, AA64Reg r, const char* what) { + if (r.num == 31u && !r.is_sp) + asm_driver_panic(d, "asm: %s: zero register not allowed in SP operand", + what); +} + +/* Parse "#imm" (with optional + / -) or a bare expression — GNU as is + * lenient about the leading hash. Returns an i64. */ +static i64 parse_imm_const(AsmDriver* d) { + (void)asm_driver_eat_punct(d, '#'); + return asm_driver_parse_const(d); +} + +/* Parse a possibly-symbolic operand prefixed by '#'. 
*/ +static void parse_imm_sym(AsmDriver* d, ObjSymId* sym_out, i64* val_out) { + (void)asm_driver_eat_punct(d, '#'); + asm_driver_parse_sym_expr(d, sym_out, val_out); +} + +static void emit32(AsmDriver* d, u32 word) { + MCEmitter* mc = asm_driver_mc(d); + (void)asm_driver_cur_section(d); + u8 buf[4]; + buf[0] = (u8)(word & 0xff); + buf[1] = (u8)((word >> 8) & 0xff); + buf[2] = (u8)((word >> 16) & 0xff); + buf[3] = (u8)((word >> 24) & 0xff); + mc->emit_bytes(mc, buf, 4); +} + +static int parse_cond_from_ident(AsmDriver* d, Sym ident, u32* out) { + size_t n = 0; + const char* s = pool_str(asm_driver_pool(d), ident, &n); + if (!s) return 0; + if (icase_eq(s, n, "eq")) *out = 0; + else if (icase_eq(s, n, "ne")) *out = 1; + else if (icase_eq(s, n, "cs") || icase_eq(s, n, "hs")) *out = 2; + else if (icase_eq(s, n, "cc") || icase_eq(s, n, "lo")) *out = 3; + else if (icase_eq(s, n, "mi")) *out = 4; + else if (icase_eq(s, n, "pl")) *out = 5; + else if (icase_eq(s, n, "vs")) *out = 6; + else if (icase_eq(s, n, "vc")) *out = 7; + else if (icase_eq(s, n, "hi")) *out = 8; + else if (icase_eq(s, n, "ls")) *out = 9; + else if (icase_eq(s, n, "ge")) *out = 10; + else if (icase_eq(s, n, "lt")) *out = 11; + else if (icase_eq(s, n, "gt")) *out = 12; + else if (icase_eq(s, n, "le")) *out = 13; + else if (icase_eq(s, n, "al")) *out = 14; + else return 0; + return 1; +} + +static u32 parse_cond(AsmDriver* d, const char* what) { + AsmTok t = asm_driver_next(d); + u32 cond = 0; + if (t.kind != ASM_TOK_IDENT || !parse_cond_from_ident(d, t.v.ident, &cond)) + asm_driver_panic(d, "asm: %s: expected condition code", what); + return cond; +} + +static void expect_comma(AsmDriver* d, const char* what) { + if (!asm_driver_eat_comma(d)) + asm_driver_panic(d, "asm: expected ',' (%s)", what); +} + +/* ---- per-mnemonic parsers ---- */ + +/* ret [Xn] — Xn defaults to x30. 
*/ +static void p_ret(AsmDriver* d) { + if (asm_driver_at_eol(d)) { + emit32(d, aa64_ret(30)); + return; + } + AA64Reg r = parse_reg(d); + if (!r.is64) asm_driver_panic(d, "asm: ret: 64-bit register expected"); + emit32(d, aa64_ret(r.num)); +} + +static void p_br(AsmDriver* d) { + AA64Reg r = parse_reg(d); + if (!r.is64) asm_driver_panic(d, "asm: br: 64-bit register expected"); + emit32(d, aa64_br(r.num)); +} + +static void p_blr(AsmDriver* d) { + AA64Reg r = parse_reg(d); + if (!r.is64) asm_driver_panic(d, "asm: blr: 64-bit register expected"); + emit32(d, aa64_blr(r.num)); +} + +static void p_nop(AsmDriver* d) { + (void)d; + emit32(d, aa64_nop()); +} + +/* Memory barriers (DMB / DSB / ISB / CLREX). + * + * dmb <option> ; option in {sy, ish, nsh, osh, ld, st, ishld, + * ishst, nshld, nshst, oshld, oshst} + * dmb #imm4 ; numeric form + * dsb <option> | #imm4 + * isb [<option>] ; option defaults to sy when omitted + * clrex [#imm4] ; option defaults to sy (15) when omitted */ +static u32 parse_barrier_option(AsmDriver* d, int allow_dmb_ld_st) { + if (asm_driver_at_eol(d)) return AA64_BARRIER_OPT_SY; + AsmTok t = asm_driver_peek(d); + if (t.kind == ASM_TOK_IDENT) { + (void)asm_driver_next(d); + size_t n = 0; + const char* s = pool_str(asm_driver_pool(d), t.v.ident, &n); + if (icase_eq(s, n, "sy")) return AA64_BARRIER_OPT_SY; + if (icase_eq(s, n, "ish")) return AA64_BARRIER_OPT_ISH; + if (icase_eq(s, n, "ishld")) return AA64_BARRIER_OPT_ISHLD; + if (icase_eq(s, n, "ishst")) return AA64_BARRIER_OPT_ISHST; + if (icase_eq(s, n, "nsh")) return AA64_BARRIER_OPT_NSH; + if (icase_eq(s, n, "nshld")) return AA64_BARRIER_OPT_NSHLD; + if (icase_eq(s, n, "nshst")) return AA64_BARRIER_OPT_NSHST; + if (icase_eq(s, n, "osh")) return AA64_BARRIER_OPT_OSH; + if (icase_eq(s, n, "oshld")) return AA64_BARRIER_OPT_OSHLD; + if (icase_eq(s, n, "oshst")) return AA64_BARRIER_OPT_OSHST; + if (allow_dmb_ld_st) { + if (icase_eq(s, n, "ld")) return AA64_BARRIER_OPT_LD; + if (icase_eq(s, n, "st")) 
return AA64_BARRIER_OPT_ST; + } + asm_driver_panic(d, "asm: unknown barrier option"); + } + /* Numeric form: '#imm4'. */ + i64 imm = parse_imm_const(d); + if (imm < 0 || imm > 15) + asm_driver_panic(d, "asm: barrier imm out of range"); + return (u32)imm; +} + +static void p_dmb(AsmDriver* d) { + u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/1); + emit32(d, aa64_dmb(opt)); +} +static void p_dsb(AsmDriver* d) { + u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0); + emit32(d, aa64_dsb(opt)); +} +static void p_isb(AsmDriver* d) { + u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0); + emit32(d, aa64_isb(opt)); +} +static void p_clrex(AsmDriver* d) { + u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0); + emit32(d, aa64_clrex(opt)); +} + +/* mov: + * mov Rd, Rm → ORR Rd, ZR, Rm + * mov Rd, #imm → MOVZ (if imm fits in a single halfword unshifted) + * MOVN (if ~imm fits) + * otherwise: panic (multi-step expansion deferred). */ +static void p_mov(AsmDriver* d) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "mov"); + AsmTok t = asm_driver_peek(d); + if (t.kind == ASM_TOK_IDENT) { + AA64Reg src; + memset(&src, 0, sizeof src); + if (parse_reg_from_ident(d, t.v.ident, &src)) { + (void)asm_driver_next(d); + if (src.is64 != rd.is64) + asm_driver_panic(d, "asm: mov: register width mismatch"); + /* mov involving SP encodes as `ADD Rd, Rsp, #0` per AArch64; + * approximate with that exact form. */ + if (rd.is_sp || src.is_sp) { + require_sp_spelling(d, rd, "mov sp"); + require_sp_spelling(d, src, "mov sp"); + emit32(d, aa64_add_imm(rd.is64, rd.num, src.num, 0, 0)); + return; + } + emit32(d, aa64_mov_reg(rd.is64, rd.num, src.num)); + return; + } + /* fall through: identifier that is not a register → treat as + * symbol/equate via expression below. */ + } + /* Immediate. */ + i64 imm = parse_imm_const(d); + if (rd.is_sp) asm_driver_panic(d, "asm: mov: cannot move imm into SP"); + u64 uv = (u64)imm; + u64 mask = rd.is64 ? 
~0ull : 0xffffffffull; + uv &= mask; + /* Try MOVZ with one of four halfwords. */ + for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) { + u64 shift = (u64)hw * 16; + u64 hwmask = 0xffffull << shift; + if ((uv & ~hwmask) == 0) { + u32 v = (u32)((uv >> shift) & 0xffff); + emit32(d, aa64_movz(rd.is64, rd.num, v, hw)); + return; + } + } + /* Try MOVN with one halfword (encodes ~imm in that halfword). */ + u64 nv = (~uv) & mask; + for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) { + u64 shift = (u64)hw * 16; + u64 hwmask = 0xffffull << shift; + if ((nv & ~hwmask) == 0) { + u32 v = (u32)((nv >> shift) & 0xffff); + emit32(d, aa64_movn(rd.is64, rd.num, v, hw)); + return; + } + } + asm_driver_panic(d, "asm: mov: immediate cannot be encoded in one insn"); +} + +/* mvn Rd, Rm */ +static void p_mvn(AsmDriver* d) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "mvn"); + AA64Reg rm = parse_reg(d); + if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: mvn: width mismatch"); + emit32(d, aa64_mvn(rd.is64, rd.num, rm.num)); +} + +/* movz / movn / movk Rd, #imm[, lsl #shift] */ +static void p_movwide(AsmDriver* d, u32 opc) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "movz/n/k"); + i64 imm = parse_imm_const(d); + u32 hw = 0; + if (asm_driver_eat_comma(d)) { + /* lsl #N (N is 0/16/32/48). 
*/ + AsmTok lid = asm_driver_next(d); + if (lid.kind != ASM_TOK_IDENT) + asm_driver_panic(d, "asm: expected 'lsl'"); + size_t ln = 0; + const char* lp = pool_str(asm_driver_pool(d), lid.v.ident, &ln); + if (!lp || !icase_eq(lp, ln, "lsl")) + asm_driver_panic(d, "asm: expected 'lsl'"); + i64 sh = parse_imm_const(d); + if (sh % 16 != 0 || sh < 0 || sh > 48) + asm_driver_panic(d, "asm: movz/n/k: bad lsl shift"); + hw = (u32)(sh / 16); + } + u32 word = ((rd.is64 & 1u) << 31) | ((opc & 3u) << 29) | + AA64_MOVEWIDE_FAMILY_MATCH | ((hw & 3u) << 21) | + (((u32)imm & 0xffffu) << 5) | (rd.num & 0x1fu); + emit32(d, word); +} + +/* svc / brk / hlt #imm */ +static void p_except(AsmDriver* d, u32 form) { + i64 imm = parse_imm_const(d); + switch (form) { + case 0: emit32(d, aa64_svc((u32)imm)); break; + case 1: emit32(d, aa64_brk((u32)imm)); break; + case 2: { + /* HLT */ + u32 word = AA64_EXCEPT_FAMILY_MATCH | ((u32)2 << 21) | + (((u32)imm & 0xffffu) << 5); + emit32(d, word); + break; + } + default: asm_driver_panic(d, "asm: bad exception form"); + } +} + +/* Read optional `, lsl|lsr|asr|ror #imm` shift modifier. Returns 1 if + * present. */ +static int parse_shift_mod(AsmDriver* d, u32* shift_out, u32* imm6_out) { + AsmTok t = asm_driver_peek(d); + if (t.kind != ASM_TOK_IDENT) return 0; + size_t n = 0; + const char* p = pool_str(asm_driver_pool(d), t.v.ident, &n); + u32 sh; + if (icase_eq(p, n, "lsl")) sh = 0; + else if (icase_eq(p, n, "lsr")) sh = 1; + else if (icase_eq(p, n, "asr")) sh = 2; + else if (icase_eq(p, n, "ror")) sh = 3; + else return 0; + (void)asm_driver_next(d); + i64 imm = parse_imm_const(d); + if (imm < 0 || imm > 63) + asm_driver_panic(d, "asm: shift amount out of range"); + *shift_out = sh; + *imm6_out = (u32)imm; + return 1; +} + +/* add / sub family. + * Forms: + * add Rd, Rn, Rm[, lsl #s] shifted-register + * add Rd, Rn, #imm immediate + * add Rd, Rn, #imm, lsl #12 immediate w/ shift + * S-suffixed (adds/subs) sets flags. 
*/ +static void p_addsub(AsmDriver* d, int is_sub, int set_flags) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "add/sub"); + AA64Reg rn = parse_reg(d); + expect_comma(d, "add/sub"); + AsmTok t = asm_driver_peek(d); + if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') || + tok_punct(t, '+')) { + /* immediate form */ + if (rd.is64 != rn.is64) + asm_driver_panic(d, "asm: add/sub imm: width mismatch"); + require_sp_spelling(d, rn, "add/sub imm"); + if (set_flags) { + reject_sp_reg(d, rd, "add/sub imm"); + } else { + require_sp_spelling(d, rd, "add/sub imm"); + } + i64 imm = parse_imm_const(d); + u32 sh = 0; + if (asm_driver_eat_comma(d)) { + AsmTok lid = asm_driver_next(d); + if (lid.kind != ASM_TOK_IDENT) + asm_driver_panic(d, "asm: expected 'lsl #12'"); + size_t ln = 0; + const char* lp = pool_str(asm_driver_pool(d), lid.v.ident, &ln); + if (!lp || !icase_eq(lp, ln, "lsl")) + asm_driver_panic(d, "asm: expected 'lsl'"); + i64 s = parse_imm_const(d); + if (s == 12) sh = 1; + else if (s == 0) sh = 0; + else asm_driver_panic(d, "asm: add/sub imm: lsl must be 0 or 12"); + } + if (imm < 0 || imm > 0xfff) + asm_driver_panic(d, "asm: add/sub imm out of range"); + u32 word = aa64_addsubimm_pack((AA64AddSubImm){ + .sf = rd.is64, .op = (u32)is_sub, .S = (u32)set_flags, .sh = sh, + .imm12 = (u32)imm, .Rn = rn.num, .Rd = rd.num}); + emit32(d, word); + return; + } + /* register form */ + AA64Reg rm = parse_reg(d); + reject_sp_reg(d, rd, "add/sub reg"); + reject_sp_reg(d, rn, "add/sub reg"); + reject_sp_reg(d, rm, "add/sub reg"); + if (rd.is64 != rm.is64 || rd.is64 != rn.is64) + asm_driver_panic(d, "asm: add/sub reg: width mismatch"); + u32 shift = 0, imm6 = 0; + if (asm_driver_eat_comma(d)) { + if (!parse_shift_mod(d, &shift, &imm6)) + asm_driver_panic(d, "asm: add/sub reg: expected shift modifier"); + } + u32 word = aa64_addsubsr_pack((AA64AddSubSR){ + .sf = rd.is64, .op = (u32)is_sub, .S = (u32)set_flags, + .shift = shift, .Rm = rm.num, .imm6 = imm6, .Rn = 
rn.num, + .Rd = rd.num}); + emit32(d, word); +} + +/* cmp Rn, Rm | cmp Rn, #imm → SUBS ZR, Rn, ... */ +static void p_cmp(AsmDriver* d, int is_neg /* cmn flips op */) { + AA64Reg rn = parse_reg(d); + expect_comma(d, "cmp"); + AsmTok t = asm_driver_peek(d); + if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') || + tok_punct(t, '+')) { + require_sp_spelling(d, rn, "cmp imm"); + i64 imm = parse_imm_const(d); + u32 sh = 0; + if (asm_driver_eat_comma(d)) { + AsmTok lid = asm_driver_next(d); + size_t ln = 0; + const char* lp = + (lid.kind == ASM_TOK_IDENT) + ? pool_str(asm_driver_pool(d), lid.v.ident, &ln) + : NULL; + if (!lp || !icase_eq(lp, ln, "lsl")) + asm_driver_panic(d, "asm: cmp imm: expected 'lsl'"); + i64 s = parse_imm_const(d); + if (s == 12) sh = 1; + else if (s != 0) + asm_driver_panic(d, "asm: cmp imm: lsl must be 0 or 12"); + } + if (imm < 0 || imm > 0xfff) + asm_driver_panic(d, "asm: cmp imm out of range"); + u32 word = aa64_addsubimm_pack( + (AA64AddSubImm){.sf = rn.is64, .op = (u32)(!is_neg), .S = 1, + .sh = sh, .imm12 = (u32)imm, .Rn = rn.num, + .Rd = AA64_ZR}); + emit32(d, word); + return; + } + AA64Reg rm = parse_reg(d); + reject_sp_reg(d, rn, "cmp reg"); + reject_sp_reg(d, rm, "cmp reg"); + if (rm.is64 != rn.is64) asm_driver_panic(d, "asm: cmp: width mismatch"); + u32 shift = 0, imm6 = 0; + if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6); + u32 word = aa64_addsubsr_pack((AA64AddSubSR){ + .sf = rn.is64, .op = (u32)(!is_neg), .S = 1, .shift = shift, + .Rm = rm.num, .imm6 = imm6, .Rn = rn.num, .Rd = AA64_ZR}); + emit32(d, word); +} + +static void p_csinc(AsmDriver* d) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "csinc"); + AA64Reg rn = parse_reg(d); + expect_comma(d, "csinc"); + AA64Reg rm = parse_reg(d); + expect_comma(d, "csinc"); + u32 cond = parse_cond(d, "csinc"); + if (rd.is_sp || rn.is_sp || rm.is_sp) + asm_driver_panic(d, "asm: csinc: SP register not allowed"); + if (rd.is64 != rn.is64 || rd.is64 != rm.is64) + 
asm_driver_panic(d, "asm: csinc: width mismatch"); + u32 word = 0x1A800400u | ((u32)rd.is64 << 31) | ((rm.num & 0x1fu) << 16) | + ((cond & 0xfu) << 12) | ((rn.num & 0x1fu) << 5) | + (rd.num & 0x1fu); + emit32(d, word); +} + +/* neg / negs Rd, Rm → SUB / SUBS Rd, ZR, Rm */ +static void p_neg(AsmDriver* d, int set_flags) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "neg"); + AA64Reg rm = parse_reg(d); + reject_sp_reg(d, rd, "neg"); + reject_sp_reg(d, rm, "neg"); + if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: neg: width mismatch"); + u32 shift = 0, imm6 = 0; + if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6); + u32 word = aa64_addsubsr_pack((AA64AddSubSR){ + .sf = rd.is64, .op = 1, .S = (u32)set_flags, .shift = shift, + .Rm = rm.num, .imm6 = imm6, .Rn = AA64_ZR, .Rd = rd.num}); + emit32(d, word); +} + +/* Logical shifted-register family. */ +static void p_log_sr(AsmDriver* d, u32 opc, u32 N) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "logical"); + AA64Reg rn = parse_reg(d); + expect_comma(d, "logical"); + AA64Reg rm = parse_reg(d); + if (rd.is64 != rn.is64 || rd.is64 != rm.is64) + asm_driver_panic(d, "asm: logical: width mismatch"); + u32 shift = 0, imm6 = 0; + if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6); + u32 word = aa64_logsr_pack((AA64LogSR){ + .sf = rd.is64, .opc = opc, .shift = shift, .N = N, .Rm = rm.num, + .imm6 = imm6, .Rn = rn.num, .Rd = rd.num}); + emit32(d, word); +} + +/* Data-processing 3-source: madd/msub Rd, Rn, Rm, Ra. 
*/ +static void p_dp3(AsmDriver* d, u32 o0) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "dp3"); + AA64Reg rn = parse_reg(d); + expect_comma(d, "dp3"); + AA64Reg rm = parse_reg(d); + expect_comma(d, "dp3"); + AA64Reg ra = parse_reg(d); + if (rd.is64 != rn.is64 || rd.is64 != rm.is64 || rd.is64 != ra.is64) + asm_driver_panic(d, "asm: dp3: width mismatch"); + u32 word = aa64_dp3_pack((AA64DP3){ + .sf = rd.is64, .op31 = 0, .o0 = o0, .Rm = rm.num, .Ra = ra.num, + .Rn = rn.num, .Rd = rd.num}); + emit32(d, word); +} + +/* mul Rd, Rn, Rm → MADD Rd, Rn, Rm, ZR */ +static void p_mul(AsmDriver* d, u32 o0) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "mul"); + AA64Reg rn = parse_reg(d); + expect_comma(d, "mul"); + AA64Reg rm = parse_reg(d); + if (rd.is64 != rn.is64 || rd.is64 != rm.is64) + asm_driver_panic(d, "asm: mul: width mismatch"); + u32 word = aa64_dp3_pack((AA64DP3){ + .sf = rd.is64, .op31 = 0, .o0 = o0, .Rm = rm.num, .Ra = AA64_ZR, + .Rn = rn.num, .Rd = rd.num}); + emit32(d, word); +} + +/* DP2: udiv/sdiv/lslv/lsrv/asrv/rorv Rd, Rn, Rm. */ +static void p_dp2(AsmDriver* d, u32 opcode) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "dp2"); + AA64Reg rn = parse_reg(d); + expect_comma(d, "dp2"); + AA64Reg rm = parse_reg(d); + if (rd.is64 != rn.is64 || rd.is64 != rm.is64) + asm_driver_panic(d, "asm: dp2: width mismatch"); + u32 word = aa64_dp2_pack((AA64DP2){.sf = rd.is64, .opcode = opcode, + .Rm = rm.num, .Rn = rn.num, + .Rd = rd.num}); + emit32(d, word); +} + +/* Branch immediate / conditional / compare-and-branch. */ + +static void emit_branch_imm(AsmDriver* d, u32 op_bl, ObjSymId target, + i64 addend, i64 const_disp) { + MCEmitter* mc = asm_driver_mc(d); + /* Emit a B/BL with imm26 = 0; record a CALL26/JUMP26 reloc against + * either the symbol or the constant displacement. */ + u32 word = aa64_brimm_pack((AA64BrImm){.op = op_bl, .imm26 = 0}); + emit32(d, word); + u32 ofs = mc->pos(mc) - 4; + RelocKind k = op_bl ? 
R_AARCH64_CALL26 : R_AARCH64_JUMP26; + if (target != OBJ_SYM_NONE) { + mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, target, + addend, 1, 0); + } else { + /* Pure constant displacement is rare in real .s; reject it now. + * The recommended form is to use a label and let the assembler + * compute the displacement. */ + (void)const_disp; + asm_driver_panic(d, "asm: branch with pure constant disp not supported"); + } +} + +static void p_b(AsmDriver* d, u32 op_bl) { + ObjSymId sym = OBJ_SYM_NONE; + i64 off = 0; + /* GNU as accepts `b sym`, `bl sym+8`, etc. */ + parse_imm_sym(d, &sym, &off); + if (sym == OBJ_SYM_NONE) + asm_driver_panic(d, "asm: b/bl: symbolic target required"); + emit_branch_imm(d, op_bl, sym, off, 0); +} + +static void p_b_cond(AsmDriver* d, u32 cond) { + ObjSymId sym = OBJ_SYM_NONE; + i64 off = 0; + parse_imm_sym(d, &sym, &off); + if (sym == OBJ_SYM_NONE) + asm_driver_panic(d, "asm: b.cond: symbolic target required"); + /* Emit the instruction with imm19=0 + R_AARCH64_CONDBR19 reloc. */ + u32 word = aa64_brcond_pack((AA64BrCond){.imm19 = 0, .cond = cond}); + emit32(d, word); + MCEmitter* mc = asm_driver_mc(d); + u32 ofs = mc->pos(mc) - 4; + mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, + R_AARCH64_CONDBR19, sym, off, 1, 0); +} + +static void p_cbz(AsmDriver* d, u32 op) { + AA64Reg rt = parse_reg(d); + expect_comma(d, "cbz"); + ObjSymId sym = OBJ_SYM_NONE; + i64 off = 0; + parse_imm_sym(d, &sym, &off); + if (sym == OBJ_SYM_NONE) + asm_driver_panic(d, "asm: cbz: symbolic target required"); + u32 word = aa64_cb_pack((AA64CB){.sf = rt.is64, .op = op, .imm19 = 0, + .Rt = rt.num}); + emit32(d, word); + MCEmitter* mc = asm_driver_mc(d); + u32 ofs = mc->pos(mc) - 4; + mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, + R_AARCH64_CONDBR19, sym, off, 1, 0); +} + +/* Memory-operand parser for [Xn], [Xn, #imm], [Xn, #imm]!. + * + * pre_index_out is 1 when the closing `]!` appeared (pre-indexed). 
+ * imm is the literal byte offset (no scaling). */ +typedef struct AA64Mem { + AA64Reg base; + i64 imm; /* byte offset (literal as written) */ + u8 pre_index; + u8 has_offset; + u8 pad[2]; +} AA64Mem; + +static AA64Mem parse_mem(AsmDriver* d) { + AA64Mem m; + memset(&m, 0, sizeof m); + if (!asm_driver_eat_punct(d, '[')) + asm_driver_panic(d, "asm: expected '['"); + m.base = parse_reg(d); + if (!m.base.is64) + asm_driver_panic(d, "asm: ldr/str: base register must be 64-bit"); + require_sp_spelling(d, m.base, "ldr/str base"); + if (asm_driver_eat_comma(d)) { + m.imm = parse_imm_const(d); + m.has_offset = 1; + } + if (!asm_driver_eat_punct(d, ']')) + asm_driver_panic(d, "asm: expected ']'"); + if (asm_driver_eat_punct(d, '!')) m.pre_index = 1; + return m; +} + +/* ldr/str Rt, [Xn, #imm] — chooses scaled or unscaled form based on + * alignment of imm. */ +static void p_ldr_str(AsmDriver* d, int is_load) { + AA64Reg rt = parse_reg(d); + reject_sp_reg(d, rt, "ldr/str"); + expect_comma(d, "ldr/str"); + AA64Mem m = parse_mem(d); + u32 size = rt.is64 ? 3u : 2u; + u32 opc = is_load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR; + if (!m.pre_index) { + /* Try scaled unsigned-imm12 first. */ + u32 scale = 1u << size; + if (m.imm >= 0 && (i64)((u64)m.imm % scale) == 0 && + (u64)m.imm / scale <= 0xfff) { + u32 imm12 = (u32)((u64)m.imm / scale); + u32 word = aa64_ldst_uimm_pack((AA64LdStUimm){ + .size = size, .V = 0, .opc = opc, .imm12 = imm12, + .Rn = m.base.num, .Rt = rt.num}); + emit32(d, word); + return; + } + /* Fall back to unscaled signed-imm9 (LDUR/STUR). 
*/ + if (m.imm >= -256 && m.imm <= 255) { + u32 imm9 = (u32)((u64)m.imm & 0x1ffu); + u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){ + .size = size, .V = 0, .opc = opc, .imm9 = imm9, + .Rn = m.base.num, .Rt = rt.num}); + emit32(d, word); + return; + } + asm_driver_panic(d, "asm: ldr/str: immediate out of range"); + } + asm_driver_panic(d, "asm: ldr/str: pre-indexed form not yet supported"); +} + +/* ldur/stur — unscaled signed-imm9. */ +static void p_ldur_stur(AsmDriver* d, int is_load) { + AA64Reg rt = parse_reg(d); + reject_sp_reg(d, rt, "ldur/stur"); + expect_comma(d, "ldur/stur"); + AA64Mem m = parse_mem(d); + u32 size = rt.is64 ? 3u : 2u; + if (m.imm < -256 || m.imm > 255) + asm_driver_panic(d, "asm: ldur/stur: imm9 out of range"); + u32 imm9 = (u32)((u64)m.imm & 0x1ffu); + u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){ + .size = size, .V = 0, + .opc = is_load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR, + .imm9 = imm9, .Rn = m.base.num, .Rt = rt.num}); + emit32(d, word); +} + +/* ldp / stp Rt, Rt2, [Xn, #imm] or [Xn, #imm]! */ +static void p_ldp_stp(AsmDriver* d, int is_load) { + AA64Reg rt = parse_ldstp_reg(d); + expect_comma(d, "ldp/stp"); + AA64Reg rt2 = parse_ldstp_reg(d); + expect_comma(d, "ldp/stp"); + reject_sp_reg(d, rt, "ldp/stp"); + reject_sp_reg(d, rt2, "ldp/stp"); + if (rt.is64 != rt2.is64 || rt.is_fp != rt2.is_fp) + asm_driver_panic(d, "asm: ldp/stp: width mismatch"); + AA64Mem m = parse_mem(d); + u32 scale = rt.is64 ? 8u : 4u; + if ((i64)((u64)m.imm % scale) != 0) + asm_driver_panic(d, "asm: ldp/stp: imm not scale-aligned"); + i64 imm7 = m.imm / (i64)scale; + if (imm7 < -64 || imm7 > 63) + asm_driver_panic(d, "asm: ldp/stp: imm7 out of range"); + AA64LdStPPre f = {.opc = rt.is_fp ? 1u : (rt.is64 ? 2u : 0u), + .V = rt.is_fp ? 1u : 0u, + .L = is_load ? 
1u : 0u, + .imm7 = (u32)imm7 & 0x7fu, + .Rt2 = rt2.num, + .Rn = m.base.num, + .Rt = rt.num}; + if (m.pre_index) + emit32(d, aa64_ldstp_pre_pack(f)); + else + emit32(d, aa64_ldstp_soff_pack(f)); +} + +/* adr / adrp Rd, sym */ +static void p_adr(AsmDriver* d, int is_adrp) { + AA64Reg rd = parse_reg(d); + expect_comma(d, "adr"); + ObjSymId sym = OBJ_SYM_NONE; + i64 off = 0; + parse_imm_sym(d, &sym, &off); + if (sym == OBJ_SYM_NONE) + asm_driver_panic(d, "asm: adr/adrp: symbol required"); + AA64PCRelAdr f = {.op = is_adrp ? AA64_ADR_OP_ADRP : AA64_ADR_OP_ADR, + .immlo = 0, .immhi = 0, .Rd = rd.num}; + emit32(d, aa64_pcrel_adr_pack(f)); + MCEmitter* mc = asm_driver_mc(d); + u32 ofs = mc->pos(mc) - 4; + RelocKind k = is_adrp ? R_AARCH64_ADR_PREL_PG_HI21 : R_AARCH64_ADR_PREL_LO21; + mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, sym, off, 1, 0); +} + +/* ---- mnemonic dispatch table ---- */ + +typedef void (*P_Fn)(AsmDriver*); + +typedef struct AA64Mn { + const char* name; + P_Fn fn; + u32 arg; /* per-fn discriminator (alias parameter) */ +} AA64Mn; + +/* Wrapper functions for the discriminator-taking parsers, since the + * table holds a uniform P_Fn pointer. Each wraps a single (fn, arg) + * tuple. 
*/ +static void p_addsub_add(AsmDriver* d) { p_addsub(d, /*is_sub=*/0, 0); } +static void p_addsub_adds(AsmDriver* d) { p_addsub(d, 0, 1); } +static void p_addsub_sub(AsmDriver* d) { p_addsub(d, 1, 0); } +static void p_addsub_subs(AsmDriver* d) { p_addsub(d, 1, 1); } +static void p_cmp_w(AsmDriver* d) { p_cmp(d, 0); } +static void p_cmn_w(AsmDriver* d) { p_cmp(d, 1); } +static void p_csinc_(AsmDriver* d) { p_csinc(d); } +static void p_neg_w(AsmDriver* d) { p_neg(d, 0); } +static void p_negs_w(AsmDriver* d) { p_neg(d, 1); } +static void p_and_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 0); } +static void p_bic_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 1); } +static void p_orr_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 0); } +static void p_orn_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 1); } +static void p_eor_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 0); } +static void p_eon_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 1); } +static void p_ands_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 0); } +static void p_bics_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 1); } +static void p_madd(AsmDriver* d) { p_dp3(d, 0); } +static void p_msub(AsmDriver* d) { p_dp3(d, 1); } +static void p_mul_w(AsmDriver* d) { p_mul(d, 0); } +static void p_mneg_w(AsmDriver* d) { p_mul(d, 1); } +static void p_udiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_UDIV_OP); } +static void p_sdiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_SDIV_OP); } +static void p_lslv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSLV_OP); } +static void p_lsrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSRV_OP); } +static void p_asrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_ASRV_OP); } +static void p_rorv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_RORV_OP); } +static void p_b_(AsmDriver* d) { p_b(d, 0); } +static void p_bl_(AsmDriver* d) { p_b(d, 1); } +static void p_cbz_(AsmDriver* d) { p_cbz(d, 0); } +static void p_cbnz_(AsmDriver* d) { p_cbz(d, 1); } +static void p_movz_(AsmDriver* d) { p_movwide(d, AA64_MOVZ_OPC); } 
+static void p_movn_(AsmDriver* d) { p_movwide(d, AA64_MOVN_OPC); } +static void p_movk_(AsmDriver* d) { p_movwide(d, AA64_MOVK_OPC); } +static void p_svc_(AsmDriver* d) { p_except(d, 0); } +static void p_brk_(AsmDriver* d) { p_except(d, 1); } +static void p_hlt_(AsmDriver* d) { p_except(d, 2); } +static void p_ldr_(AsmDriver* d) { p_ldr_str(d, 1); } +static void p_str_(AsmDriver* d) { p_ldr_str(d, 0); } +static void p_ldur_(AsmDriver* d) { p_ldur_stur(d, 1); } +static void p_stur_(AsmDriver* d) { p_ldur_stur(d, 0); } +static void p_ldp_(AsmDriver* d) { p_ldp_stp(d, 1); } +static void p_stp_(AsmDriver* d) { p_ldp_stp(d, 0); } +static void p_adr_(AsmDriver* d) { p_adr(d, 0); } +static void p_adrp_(AsmDriver* d) { p_adr(d, 1); } + +/* b.cond family. cond codes follow the standard ARMv8 numbering. */ +static void p_b_eq(AsmDriver* d) { p_b_cond(d, 0); } +static void p_b_ne(AsmDriver* d) { p_b_cond(d, 1); } +static void p_b_cs(AsmDriver* d) { p_b_cond(d, 2); } +static void p_b_hs(AsmDriver* d) { p_b_cond(d, 2); } +static void p_b_cc(AsmDriver* d) { p_b_cond(d, 3); } +static void p_b_lo(AsmDriver* d) { p_b_cond(d, 3); } +static void p_b_mi(AsmDriver* d) { p_b_cond(d, 4); } +static void p_b_pl(AsmDriver* d) { p_b_cond(d, 5); } +static void p_b_vs(AsmDriver* d) { p_b_cond(d, 6); } +static void p_b_vc(AsmDriver* d) { p_b_cond(d, 7); } +static void p_b_hi(AsmDriver* d) { p_b_cond(d, 8); } +static void p_b_ls(AsmDriver* d) { p_b_cond(d, 9); } +static void p_b_ge(AsmDriver* d) { p_b_cond(d, 10); } +static void p_b_lt(AsmDriver* d) { p_b_cond(d, 11); } +static void p_b_gt(AsmDriver* d) { p_b_cond(d, 12); } +static void p_b_le(AsmDriver* d) { p_b_cond(d, 13); } +static void p_b_al(AsmDriver* d) { p_b_cond(d, 14); } + +static const AA64Mn kTable[] = { + {"nop", p_nop, 0}, + {"dmb", p_dmb, 0}, + {"dsb", p_dsb, 0}, + {"isb", p_isb, 0}, + {"clrex", p_clrex, 0}, + {"ret", p_ret, 0}, + {"br", p_br, 0}, + {"blr", p_blr, 0}, + {"mov", p_mov, 0}, + {"mvn", p_mvn, 0}, + {"movz", p_movz_, 
0}, + {"movn", p_movn_, 0}, + {"movk", p_movk_, 0}, + {"add", p_addsub_add, 0}, + {"adds", p_addsub_adds, 0}, + {"sub", p_addsub_sub, 0}, + {"subs", p_addsub_subs, 0}, + {"cmp", p_cmp_w, 0}, + {"cmn", p_cmn_w, 0}, + {"csinc", p_csinc_, 0}, + {"neg", p_neg_w, 0}, + {"negs", p_negs_w, 0}, + {"and", p_and_w, 0}, + {"bic", p_bic_w, 0}, + {"orr", p_orr_w, 0}, + {"orn", p_orn_w, 0}, + {"eor", p_eor_w, 0}, + {"eon", p_eon_w, 0}, + {"ands", p_ands_w, 0}, + {"bics", p_bics_w, 0}, + {"madd", p_madd, 0}, + {"msub", p_msub, 0}, + {"mul", p_mul_w, 0}, + {"mneg", p_mneg_w, 0}, + {"udiv", p_udiv_w, 0}, + {"sdiv", p_sdiv_w, 0}, + {"lslv", p_lslv_w, 0}, + {"lsrv", p_lsrv_w, 0}, + {"asrv", p_asrv_w, 0}, + {"rorv", p_rorv_w, 0}, + {"b", p_b_, 0}, + {"bl", p_bl_, 0}, + {"cbz", p_cbz_, 0}, + {"cbnz", p_cbnz_, 0}, + {"svc", p_svc_, 0}, + {"brk", p_brk_, 0}, + {"hlt", p_hlt_, 0}, + {"ldr", p_ldr_, 0}, + {"str", p_str_, 0}, + {"ldur", p_ldur_, 0}, + {"stur", p_stur_, 0}, + {"ldp", p_ldp_, 0}, + {"stp", p_stp_, 0}, + {"adr", p_adr_, 0}, + {"adrp", p_adrp_, 0}, + {"b.eq", p_b_eq, 0}, {"b.ne", p_b_ne, 0}, + {"b.cs", p_b_cs, 0}, {"b.hs", p_b_hs, 0}, + {"b.cc", p_b_cc, 0}, {"b.lo", p_b_lo, 0}, + {"b.mi", p_b_mi, 0}, {"b.pl", p_b_pl, 0}, + {"b.vs", p_b_vs, 0}, {"b.vc", p_b_vc, 0}, + {"b.hi", p_b_hi, 0}, {"b.ls", p_b_ls, 0}, + {"b.ge", p_b_ge, 0}, {"b.lt", p_b_lt, 0}, + {"b.gt", p_b_gt, 0}, {"b.le", p_b_le, 0}, + {"b.al", p_b_al, 0}, + {NULL, NULL, 0}, +}; + +void aa64_asm_insn(AA64Asm* a, AsmDriver* d, Sym mnemonic) { + (void)a; + size_t mn = 0; + const char* mp = pool_str(asm_driver_pool(d), mnemonic, &mn); + for (const AA64Mn* row = kTable; row->name; ++row) { + if (icase_eq(mp, mn, row->name)) { + row->fn(d); + return; + } + } + asm_driver_panic(d, "asm: unknown mnemonic"); +} + +/* ---- inline-asm template walker (Phase 4b Track C) ---- */ + +/* Per-call rendered-line buffer. 
GCC's inline asm rarely emits more + * than a handful of instructions per block; one line of substituted + * text fits comfortably inside this. Truncation panics — the operator + * grammar should never grow a single line beyond this without a + * deliberate reason. */ +#define AA64_INLINE_LINE_CAP 1024 + +/* Render a 5-bit register number into the StrBuf using the requested + * width form. is64 picks x-form vs w-form; SP / ZR encode as + * register #31 and we render them as wzr/xzr or wsp/sp depending on + * caller intent — for inline-asm v1 the bound operand always names a + * GP register, never SP, so we emit wzr/xzr for #31. */ +static void render_reg(StrBuf* sb, u32 reg, int is64) { + if (reg == 31u) { + strbuf_puts(sb, is64 ? "xzr" : "wzr"); + return; + } + strbuf_putc(sb, is64 ? 'x' : 'w'); + if (reg >= 10u) strbuf_putc(sb, (char)('0' + (reg / 10u))); + strbuf_putc(sb, (char)('0' + (reg % 10u))); +} + +/* Render a signed 64-bit integer prefixed with '#'. */ +static void render_imm(StrBuf* sb, i64 v) { + strbuf_putc(sb, '#'); + strbuf_put_i64(sb, v); +} + +/* Render an addressing form `[xN, #ofs]` for OPK_INDIRECT. */ +static void render_indirect(StrBuf* sb, Reg base, i32 ofs) { + strbuf_putc(sb, '['); + render_reg(sb, (u32)base, /*is64=*/1); + if (ofs != 0) { + strbuf_puts(sb, ", "); + render_imm(sb, (i64)ofs); + } + strbuf_putc(sb, ']'); +} + +_Noreturn static void inline_panic(AA64Asm* a, const char* msg) { + SrcLoc loc = {0, 0, 0}; + compiler_panic(a->c, loc, "inline asm: %s", msg); +} + +/* Resolve operand index N → (kind=0 forced default, 1=force-w, 2=force-x, + * 3=address form `%aN`). Renders into sb. */ +static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) { + u32 ntot = a->nout + a->nin; + if (idx >= ntot) inline_panic(a, "operand index out of range"); + const Operand* op = (idx < a->nout) ? 
&a->out_ops[idx] + : &a->in_ops[idx - a->nout]; + switch (form) { + case 1: /* %wN — force 32-bit register form */ + if (op->kind != OPK_REG) + inline_panic(a, "%w on non-register operand"); + render_reg(sb, (u32)op->v.reg, /*is64=*/0); + return; + case 2: /* %xN — force 64-bit register form */ + if (op->kind != OPK_REG) + inline_panic(a, "%x on non-register operand"); + render_reg(sb, (u32)op->v.reg, /*is64=*/1); + return; + case 3: /* %aN — memory addressing form */ + if (op->kind != OPK_INDIRECT) + inline_panic(a, "%a on non-memory operand"); + render_indirect(sb, op->v.ind.base, op->v.ind.ofs); + return; + default: + break; + } + /* Default rendering by operand kind. */ + switch (op->kind) { + case OPK_REG: + render_reg(sb, (u32)op->v.reg, /*is64=*/1); + return; + case OPK_IMM: + render_imm(sb, op->v.imm); + return; + case OPK_INDIRECT: + render_indirect(sb, op->v.ind.base, op->v.ind.ofs); + return; + default: + inline_panic(a, "unsupported operand kind for %N"); + } +} + +/* Lex one line of substituted asm and dispatch via aa64_asm_insn. */ +static void run_one_line(AA64Asm* a, MCEmitter* mc, const char* text, + size_t len) { + /* Skip blank lines. */ + size_t i; + for (i = 0; i < len; ++i) { + if (text[i] != ' ' && text[i] != '\t') break; + } + if (i == len) return; + + AsmLexer* lx = asm_lex_open_mem(a->c, "<inline-asm>", text, len); + AsmDriver* d = asm_driver_open_inline(a->c, mc, lx); + + /* The first non-trivial token must be the mnemonic identifier (or a + * `.directive`, but inline asm doesn't normally use directives — leave + * that path unsupported until needed). */ + AsmTok t = asm_driver_peek(d); + while (t.kind == ASM_TOK_NEWLINE || t.kind == ASM_TOK_HASH) { + (void)asm_driver_next(d); + if (t.kind == ASM_TOK_HASH) { + /* Skip cpp linemarker rest of line. 
*/ + while (!asm_driver_at_eol(d)) (void)asm_driver_next(d); + } + t = asm_driver_peek(d); + } + if (t.kind == ASM_TOK_EOF) { + asm_driver_close_inline(d); + asm_lex_close(lx); + return; + } + if (t.kind != ASM_TOK_IDENT) + inline_panic(a, "expected mnemonic at start of inline asm line"); + (void)asm_driver_next(d); + Sym mn = t.v.ident; + /* Compose `b.eq` etc. — same trick as the standalone driver. */ + AsmTok dot = asm_driver_peek(d); + if (asm_driver_tok_is_punct(dot, '.')) { + (void)asm_driver_next(d); + AsmTok rest = asm_driver_next(d); + if (rest.kind != ASM_TOK_IDENT) + inline_panic(a, "composite mnemonic: expected ident after '.'"); + size_t hn = 0, rn = 0; + const char* hp = pool_str(asm_driver_pool(d), mn, &hn); + const char* rp = pool_str(asm_driver_pool(d), rest.v.ident, &rn); + char buf[64]; + if (hn + 1 + rn >= sizeof buf) + inline_panic(a, "composite mnemonic too long"); + for (size_t k = 0; k < hn; ++k) buf[k] = hp[k]; + buf[hn] = '.'; + for (size_t k = 0; k < rn; ++k) buf[hn + 1 + k] = rp[k]; + mn = pool_intern(asm_driver_pool(d), buf, hn + 1 + rn); + } + aa64_asm_insn(a, d, mn); + asm_driver_close_inline(d); + asm_lex_close(lx); +} + +/* Substitute placeholders into one line's StrBuf, then dispatch. + * + * The input range is [start, end) inside `tmpl`. Updates `*line_idx` + * is not used — the caller resets the StrBuf between lines. */ +static void render_and_run_line(AA64Asm* a, MCEmitter* mc, StrBuf* sb, + const char* start, const char* end) { + strbuf_reset(sb); + for (const char* p = start; p < end; ++p) { + char c = *p; + if (c != '%') { + strbuf_putc(sb, c); + continue; + } + /* Placeholder. */ + if (p + 1 >= end) inline_panic(a, "trailing '%' in template"); + char n = *(p + 1); + if (n == '%') { + strbuf_putc(sb, '%'); + ++p; + continue; + } + if (n == '[') { + /* %[name] — scan to the closing ']' and resolve against + * AsmConstraint.name on the combined outs+ins list. 
Match by + * comparing the named-bracket contents against the interned name + * Sym stored on each constraint. */ + const char* nbeg = p + 2; + const char* nend = nbeg; + while (nend < end && *nend != ']') ++nend; + if (nend == end) inline_panic(a, "unterminated %[name]"); + size_t nlen = (size_t)(nend - nbeg); + Sym needle = pool_intern(a->c->global, nbeg, nlen); + u32 idx = (u32)-1; + for (u32 k = 0; k < a->nout; ++k) { + if (a->outs[k].name == needle) { idx = k; break; } + } + if (idx == (u32)-1) { + for (u32 k = 0; k < a->nin; ++k) { + if (a->ins[k].name == needle) { idx = a->nout + k; break; } + } + } + if (idx == (u32)-1) + inline_panic(a, "%[name] does not match any constraint"); + p = nend; /* loop's ++p steps past the ']' */ + render_operand(a, sb, idx, 0); + continue; + } + int form = 0; /* 0=default, 1=w, 2=x, 3=a */ + if (n == 'w' || n == 'x' || n == 'a') { + form = (n == 'w') ? 1 : (n == 'x') ? 2 : 3; + ++p; + if (p + 1 >= end) inline_panic(a, "trailing '%' modifier in template"); + n = *(p + 1); + } + if (n == '[') { + /* %w[name] / %x[name] / %a[name] — width modifier + symbolic + * operand. Resolves the same way as %[name] but renders with the + * declared form. 
*/ + const char* nbeg = p + 2; + const char* nend = nbeg; + while (nend < end && *nend != ']') ++nend; + if (nend == end) inline_panic(a, "unterminated %[name]"); + size_t nlen = (size_t)(nend - nbeg); + Sym needle = pool_intern(a->c->global, nbeg, nlen); + u32 idx = (u32)-1; + for (u32 k = 0; k < a->nout; ++k) { + if (a->outs[k].name == needle) { idx = k; break; } + } + if (idx == (u32)-1) { + for (u32 k = 0; k < a->nin; ++k) { + if (a->ins[k].name == needle) { idx = a->nout + k; break; } + } + } + if (idx == (u32)-1) + inline_panic(a, "%[name] does not match any constraint"); + p = nend; /* loop's ++p steps past the ']' */ + render_operand(a, sb, idx, form); + continue; + } + if (n < '0' || n > '9') + inline_panic(a, "expected digit after '%'"); + u32 idx = (u32)(n - '0'); + ++p; + /* GCC syntax permits up to two digits (%0..%99). */ + if (p + 1 < end && *(p + 1) >= '0' && *(p + 1) <= '9') { + idx = idx * 10 + (u32)(*(p + 1) - '0'); + ++p; + } + render_operand(a, sb, idx, form); + } + if (sb->truncated) inline_panic(a, "inline asm line buffer overflow"); + run_one_line(a, mc, strbuf_cstr(sb), strbuf_len(sb)); +} + +void aa64_asm_run_template(AA64Asm* a, MCEmitter* mc, const char* tmpl) { + if (!tmpl || !*tmpl) return; + + char buf[AA64_INLINE_LINE_CAP]; + StrBuf sb; + strbuf_init(&sb, buf, sizeof buf); + + /* Walk tmpl, splitting on '\n' and ';' line terminators. Track bracket + * depth and quote state so that a literal ';' inside `[ ... ]` or a + * quoted string is not mistaken for a statement separator. 
*/ + const char* line_start = tmpl; + int bracket = 0; + char quote = 0; + for (const char* p = tmpl;; ++p) { + char c = *p; + if (c == '\0') { + render_and_run_line(a, mc, &sb, line_start, p); + break; + } + if (quote) { + if (c == '\\' && *(p + 1)) { + ++p; + continue; + } + if (c == quote) quote = 0; + continue; + } + if (c == '"' || c == '\'') { + quote = c; + continue; + } + if (c == '[') { + ++bracket; + continue; + } + if (c == ']') { + if (bracket) --bracket; + continue; + } + if (bracket == 0 && (c == '\n' || c == ';')) { + render_and_run_line(a, mc, &sb, line_start, p); + line_start = p + 1; + } + } +} diff --git a/src/arch/aa64_asm.h b/src/arch/aa64/asm.h diff --git a/src/arch/aa64/dbg.c b/src/arch/aa64/dbg.c @@ -0,0 +1,235 @@ +/* AArch64 lifter for the displaced-step shim. + * + * Lays out a fixed-up copy of one insn in the session scratch slot + * (DBG_DISPLACED_SLOT_BYTES bytes), followed by a BRK sentinel the + * session arms an internal bp on. + * + * Supported families: + * - any insn with no PC-relative operand (copied verbatim); + * - B / BL / B.cond — re-encode the immediate; + * - CBZ / CBNZ / TBZ / TBNZ — always emit a trampoline: + * slot[0] cond-branch +2 words (taken → slot+8) + * slot[4] BRK (not-taken fallthrough) + * slot[8] LDR x16, =target + * slot[12] BR x16 + * slot[16] literal pool (8 bytes, absolute target) + * - ADR / ADRP — replace with LDR Xd, =target: + * slot[0] LDR Xd, =target + * slot[4] BRK + * slot[8] literal pool (8 bytes) + * - LDR (literal), integer/LDRSW — synthesize indirect load: + * slot[0] LDR x16, =literal_addr + * slot[4] LDR Xt/Wt/LDRSW Xt, [x16] + * slot[8] BRK + * slot[12] literal pool (8 bytes, absolute literal addr) + * - BR / BLR / RET — copied verbatim; the BRK after never + * fires because the indirect branch transfers control. The session's + * stale internal_bp is cleared by the next prepare; finalize gates on + * PC == return_pc so it stays a no-op when control left the slot. 
*/ + +#include "dbg/dbg.h" + +#include <string.h> + +#include "arch/aa64/isa.h" + +#define SHIM_X16 16u /* IP0; safe to clobber inside a shim */ + +uint32_t dbg_aa64_brk_word(void) { + return aa64_brk(0); +} + +static int fits_signed(int64_t v, int bits) { + int64_t lim = (int64_t)1 << (bits - 1); + return v >= -lim && v < lim; +} + +/* LDR (literal) for integer Xt: opc=01, V=0, fixed bits 011_0_00. + * 01 011 0 00 imm19 Rt → 0x58000000 | (imm19<<5) | Rt + * imm19 is the signed word offset from the LDR's own PC. */ +static uint32_t enc_ldr_lit_x(uint32_t Rt, int32_t imm19) { + return 0x58000000u | (((uint32_t)imm19 & 0x7ffffu) << 5) | (Rt & 0x1fu); +} +/* LDR Xt, [Xn, #0] / LDR Wt, [Xn, #0] / LDRSW Xt, [Xn, #0]. */ +static uint32_t enc_ldr64_reg(uint32_t Rt, uint32_t Rn) { + return aa64_ldr64_uimm12(Rt, Rn, 0); +} +static uint32_t enc_ldr32_reg(uint32_t Rt, uint32_t Rn) { + return aa64_ldst_uimm_pack((AA64LdStUimm){ + .size = 2, .V = 0, .opc = AA64_LDST_OPC_LDR, .imm12 = 0, .Rn = Rn, + .Rt = Rt}); +} +static uint32_t enc_ldrsw_reg(uint32_t Rt, uint32_t Rn) { + return aa64_ldst_uimm_pack((AA64LdStUimm){ + .size = 2, .V = 0, .opc = 2, .imm12 = 0, .Rn = Rn, .Rt = Rt}); +} + +static void put_u32(uint8_t* w, uint32_t off, uint32_t v) { + memcpy(w + off, &v, sizeof(v)); +} +static void put_u64(uint8_t* w, uint32_t off, uint64_t v) { + memcpy(w + off, &v, sizeof(v)); +} + +/* Sign-extend a `bits`-wide field whose raw value is `v`. 
*/ +static int64_t sign_extend(uint64_t v, int bits) { + uint64_t m = 1ull << (bits - 1); + return (int64_t)((v ^ m) - m); +} + +int dbg_aa64_build_shim(uint32_t orig_insn, uint64_t orig_pc, + void* scratch_write, uint64_t scratch_runtime, + u32* shim_len) { + uint8_t* w = (uint8_t*)scratch_write; + uint32_t brk = aa64_brk(0); + int64_t pc_delta; + if (!shim_len) return 1; + *shim_len = 0; + pc_delta = (int64_t)orig_pc - (int64_t)scratch_runtime; + + /* ---- B / BL (imm26) ------------------------------------------------ */ + if ((orig_insn & 0x7C000000u) == 0x14000000u) { + AA64BrImm f = aa64_brimm_unpack(orig_insn); + int64_t imm = sign_extend(f.imm26, 26); + int64_t new_off = imm * 4 + pc_delta; + if ((new_off & 3) || !fits_signed(new_off / 4, 26)) { + /* Out of B/BL range from scratch: fall back to LDR x30/PC trick is + * messy for BL (need to preserve LR). Decline. */ + return 1; + } + f.imm26 = (uint32_t)((new_off / 4) & 0x3ffffffu); + put_u32(w, 0, aa64_brimm_pack(f)); + put_u32(w, 4, brk); + *shim_len = 4; + return 0; + } + + /* ---- B.cond (imm19) ------------------------------------------------ */ + if ((orig_insn & 0xFF000010u) == 0x54000000u) { + AA64BrCond f = aa64_brcond_unpack(orig_insn); + int64_t imm = sign_extend(f.imm19, 19); + int64_t new_off = imm * 4 + pc_delta; + if ((new_off & 3) || !fits_signed(new_off / 4, 19)) { + /* Synthesize: B.cond +8 (skip BRK) ; BRK ; LDR x16,=tgt ; BR x16 ; + * literal. The "taken" path branches to slot+8, the "not-taken" + * path falls through to BRK at slot+4. 
*/ + uint64_t target = orig_pc + (uint64_t)(imm * 4); + AA64BrCond nf; + nf.cond = f.cond; + nf.imm19 = 2u; /* +8 bytes from slot[0] → slot[8] */ + put_u32(w, 0, aa64_brcond_pack(nf)); + put_u32(w, 4, brk); + put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2)); /* LDR x16, [pc+8] = slot[16] */ + put_u32(w, 12, aa64_br(SHIM_X16)); + put_u64(w, 16, target); + *shim_len = 4; + return 0; + } + f.imm19 = (uint32_t)((new_off / 4) & 0x7ffffu); + put_u32(w, 0, aa64_brcond_pack(f)); + put_u32(w, 4, brk); + *shim_len = 4; + return 0; + } + + /* ---- CBZ / CBNZ (imm19) — always trampoline form ------------------- */ + if ((orig_insn & 0x7E000000u) == 0x34000000u) { + AA64CB f = aa64_cb_unpack(orig_insn); + int64_t imm = sign_extend(f.imm19, 19); + uint64_t target = orig_pc + (uint64_t)(imm * 4); + AA64CB nf = f; + nf.imm19 = 2u; /* +8 → slot[8] */ + put_u32(w, 0, aa64_cb_pack(nf)); + put_u32(w, 4, brk); + put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2)); + put_u32(w, 12, aa64_br(SHIM_X16)); + put_u64(w, 16, target); + *shim_len = 4; + return 0; + } + + /* ---- TBZ / TBNZ (imm14) — always trampoline ------------------------ + * b5 011011 op b40[18:14] imm14[18:5] -- wait, field layout: + * b5(31) 011011(30..25) op(24) b40(23..19) imm14(18..5) Rt(4..0). 
*/ + if ((orig_insn & 0x7E000000u) == 0x36000000u) { + uint32_t b5 = (orig_insn >> 31) & 1u; + uint32_t op = (orig_insn >> 24) & 1u; + uint32_t b40 = (orig_insn >> 19) & 0x1fu; + uint32_t Rt = orig_insn & 0x1fu; + uint32_t imm14_raw = (orig_insn >> 5) & 0x3fffu; + int64_t imm = sign_extend(imm14_raw, 14); + uint64_t target = orig_pc + (uint64_t)(imm * 4); + uint32_t new_imm14 = 2u; /* +8 → slot[8] */ + uint32_t new_word = + (b5 << 31) | 0x36000000u | (op << 24) | (b40 << 19) | + ((new_imm14 & 0x3fffu) << 5) | (Rt & 0x1fu); + put_u32(w, 0, new_word); + put_u32(w, 4, brk); + put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2)); + put_u32(w, 12, aa64_br(SHIM_X16)); + put_u64(w, 16, target); + *shim_len = 4; + return 0; + } + + /* ---- ADR / ADRP ---------------------------------------------------- */ + if ((orig_insn & 0x1F000000u) == 0x10000000u) { + AA64PCRelAdr f = aa64_pcrel_adr_unpack(orig_insn); + uint64_t imm_raw = ((uint64_t)f.immhi << 2) | (uint64_t)f.immlo; + int64_t imm21 = sign_extend(imm_raw, 21); + uint64_t target; + if (f.op == AA64_ADR_OP_ADRP) { + target = (orig_pc & ~(uint64_t)0xFFF) + ((uint64_t)imm21 << 12); + } else { + target = orig_pc + (uint64_t)imm21; + } + /* LDR Xd, [pc + 8] — the literal sits at slot[8]. 
*/ + put_u32(w, 0, enc_ldr_lit_x(f.Rd, 2)); + put_u32(w, 4, brk); + put_u64(w, 8, target); + *shim_len = 4; + return 0; + } + + /* ---- LDR (literal) — integer & LDRSW only -------------------------- */ + if ((orig_insn & 0x3B000000u) == 0x18000000u) { + uint32_t opc = (orig_insn >> 30) & 3u; + uint32_t V = (orig_insn >> 26) & 1u; + uint32_t Rt = orig_insn & 0x1fu; + uint32_t imm19_raw = (orig_insn >> 5) & 0x7ffffu; + int64_t imm19 = sign_extend(imm19_raw, 19); + uint64_t literal_addr = orig_pc + (uint64_t)(imm19 * 4); + uint32_t load_insn; + if (V) return 1; /* vector forms (S/D/Q): not supported in v1 */ + switch (opc) { + case 0: load_insn = enc_ldr32_reg(Rt, SHIM_X16); break; /* LDR Wt */ + case 1: load_insn = enc_ldr64_reg(Rt, SHIM_X16); break; /* LDR Xt */ + case 2: load_insn = enc_ldrsw_reg(Rt, SHIM_X16); break; /* LDRSW */ + default: return 1; /* PRFM (literal): not meaningful here */ + } + /* LDR x16, [pc + 12] — literal at slot[12]. */ + put_u32(w, 0, enc_ldr_lit_x(SHIM_X16, 3)); + put_u32(w, 4, load_insn); + put_u32(w, 8, brk); + put_u64(w, 12, literal_addr); + *shim_len = 8; + return 0; + } + + /* ---- BR / BLR / RET (indirect) ------------------------------------- */ + if ((orig_insn & 0xFE1FFC1Fu) == AA64_BR_REG_FAMILY_MATCH) { + /* Copy verbatim; the BRK after will not fire because control + * transfers to the register target. The session clears the stale + * internal bp on the next prepare. */ + put_u32(w, 0, orig_insn); + put_u32(w, 4, brk); + *shim_len = 4; + return 0; + } + + /* ---- default: no PC-relative operand — copy verbatim --------------- */ + put_u32(w, 0, orig_insn); + put_u32(w, 4, brk); + *shim_len = 4; + return 0; +} diff --git a/src/arch/aa64/disasm.c b/src/arch/aa64/disasm.c @@ -0,0 +1,133 @@ +/* AArch64 disassembler implementation. + * + * Decodes one 4-byte instruction word per call into a CfreeInsn whose + * string fields point into iterator-owned StrBufs. 
The decoder shares + * the aa64_isa.{h,c} descriptor table with the encoder: aa64_disasm_find + * matches the word; aa64_print_operands renders operand text via the + * format's unpack + per-format pretty-printer. Mnemonic rewriting (the + * one bit the printer can't own, because b.cond rolls cond into the + * "operand" text) happens here. */ + +#include "arch/aa64/disasm.h" + +#include <string.h> + +#include "arch/aa64/isa.h" +#include "core/heap.h" +#include "core/strbuf.h" + +/* Enough for any aarch64 mnemonic-with-suffix ("b.cond" → "b.le", etc.). */ +#define AA64_DASM_MNEM_CAP 16u +/* Operand text. The widest cases (LDP X, X, [SP, #-imm]!) fit easily. */ +#define AA64_DASM_OPS_CAP 96u +/* Annotation overlay (symbol + addend). */ +#define AA64_DASM_ANN_CAP 96u + +typedef struct AA64Disasm { + ArchDisasm base; + Compiler* c; + Heap* heap; + char mnem_buf[AA64_DASM_MNEM_CAP]; + char ops_buf[AA64_DASM_OPS_CAP]; + char ann_buf[AA64_DASM_ANN_CAP]; + StrBuf mnem; + StrBuf ops; + StrBuf ann; +} AA64Disasm; + +static const char* aa64_cond_names[16] = { + "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", + "hi", "ls", "ge", "lt", "gt", "le", "al", "nv", +}; + +static void aa64_write_mnemonic(AA64Disasm* d, const AA64InsnDesc* desc, + u32 word) { + strbuf_reset(&d->mnem); + if (desc->fmt == AA64_FMT_BR_COND) { + /* Synthesize "b.<cond>" so the operands buffer can hold just the + * target. Matches GNU as / objdump conventions. */ + u32 cond = word & 0xfu; + strbuf_puts(&d->mnem, "b."); + strbuf_puts(&d->mnem, aa64_cond_names[cond]); + return; + } + strbuf_puts(&d->mnem, desc->mnemonic); +} + +static void aa64_write_operands(AA64Disasm* d, const AA64InsnDesc* desc, + u32 word, u64 vaddr) { + strbuf_reset(&d->ops); + if (desc->fmt == AA64_FMT_BR_COND) { + /* aa64_print_operands prints "<cond> <target>"; we already lifted + * the cond into the mnemonic, so skip the dispatcher and inline + * just the target. 
*/ + AA64BrCond f = aa64_brcond_unpack(word); + i64 ofs = (i64)((u64)f.imm19 & 0x7ffffu); + /* sign-extend 19 bits */ + if (ofs & 0x40000) ofs |= ~(i64)0x7ffff; + ofs *= 4; + if (vaddr) { + strbuf_put_hex_u64(&d->ops, vaddr + (u64)ofs); + } else { + strbuf_puts(&d->ops, "#"); + strbuf_put_i64(&d->ops, ofs); + } + return; + } + aa64_print_operands(&d->ops, desc, word, vaddr); +} + +static u32 aa64_read_u32_le(const u8* b) { + return (u32)b[0] | ((u32)b[1] << 8) | ((u32)b[2] << 16) | ((u32)b[3] << 24); +} + +static void aa64_write_unknown(AA64Disasm* d, u32 word) { + strbuf_reset(&d->mnem); + strbuf_puts(&d->mnem, ".inst"); + strbuf_reset(&d->ops); + strbuf_put_hex_u64(&d->ops, (u64)word); +} + +static u32 aa64_decode(ArchDisasm* base, const u8* bytes, size_t len, u64 vaddr, + CfreeInsn* out) { + AA64Disasm* d = (AA64Disasm*)base; + if (len < 4u) return 0; + u32 word = aa64_read_u32_le(bytes); + const AA64InsnDesc* desc = aa64_disasm_find(word); + if (desc) { + aa64_write_mnemonic(d, desc, word); + aa64_write_operands(d, desc, word, vaddr); + } else { + aa64_write_unknown(d, word); + } + /* Annotation overlay is owned by the public iterator (cfree_disasm_iter_*). + * The arch-level decoder leaves it empty. 
*/ + strbuf_reset(&d->ann); + out->vaddr = vaddr; + out->bytes = bytes; + out->nbytes = 4; + out->mnemonic = strbuf_cstr(&d->mnem); + out->operands = strbuf_cstr(&d->ops); + out->annotation = strbuf_cstr(&d->ann); + return 4; +} + +static void aa64_destroy(ArchDisasm* base) { + AA64Disasm* d = (AA64Disasm*)base; + d->heap->free(d->heap, d, sizeof(*d)); +} + +ArchDisasm* aa64_disasm_new(Compiler* c) { + Heap* h = (Heap*)c->env->heap; + AA64Disasm* d = (AA64Disasm*)h->alloc(h, sizeof(*d), _Alignof(AA64Disasm)); + if (!d) return NULL; + memset(d, 0, sizeof(*d)); + d->c = c; + d->heap = h; + d->base.decode = aa64_decode; + d->base.destroy = aa64_destroy; + strbuf_init(&d->mnem, d->mnem_buf, sizeof d->mnem_buf); + strbuf_init(&d->ops, d->ops_buf, sizeof d->ops_buf); + strbuf_init(&d->ann, d->ann_buf, sizeof d->ann_buf); + return &d->base; +} diff --git a/src/arch/aa64/disasm.h b/src/arch/aa64/disasm.h @@ -0,0 +1,14 @@ +#ifndef CFREE_ARCH_AA64_DISASM_H +#define CFREE_ARCH_AA64_DISASM_H + +/* AArch64 disassembler — ArchDisasm implementation. + * + * Wraps aa64_disasm_find + aa64_print_operands (src/arch/aa64/isa.{h,c}). + * The dispatcher in src/arch/disasm.c constructs one of these when the + * compiler target is CFREE_ARCH_ARM_64. */ + +#include "arch/arch.h" + +ArchDisasm* aa64_disasm_new(Compiler*); + +#endif diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c @@ -0,0 +1,523 @@ +/* aarch64/emit.c — instruction encoding helpers, function lifecycle, + * frame layout, parameter ABI, address materialization. 
*/ + +#include "arch/aa64/internal.h" + +extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc); + +/* ============================================================ + * Shared type / operand helpers + * ============================================================ */ + +int type_is_64(CfreeCgTypeId t) { + return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I64) || + t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64) || + t >= (CfreeCgTypeId)(2u << 6); +} + +int type_is_fp_double(CfreeCgTypeId t) { + return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64); +} + +int type_is_signed(CfreeCgTypeId t) { + (void)t; + return 0; +} + +u32 type_byte_size(CfreeCgTypeId t) { + if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I8) || + t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_BOOL)) + return 1; + if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I16)) return 2; + if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I32) || + t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32)) + return 4; + return 8; +} + +u32 size_idx_for_bytes(u32 nbytes) { + switch (nbytes) { + case 1: + return 0; + case 2: + return 1; + case 4: + return 2; + case 8: + return 3; + default: + return 3; + } +} + +u32 reg_num(Operand op) { return op.v.reg & 0x1fu; } + +static u32 collect_mask_regs(u32 mask, u32 first, u32 last, u32* out) { + u32 n = 0; + for (u32 r = first; r <= last; ++r) { + if (mask & (1u << r)) out[n++] = r; + } + return n; +} + +/* ============================================================ + * Low-level emission + * ============================================================ */ + +void aa64_emit32(MCEmitter* mc, u32 word) { + u32 ofs = obj_pos(mc->obj, mc->section_id); + u8 b[4]; + b[0] = (u8)(word & 0xff); + b[1] = (u8)((word >> 8) & 0xff); + b[2] = (u8)((word >> 16) & 0xff); + b[3] = (u8)((word >> 24) & 0xff); + mc->emit_bytes(mc, b, 4); + if (mc->debug) { + debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); + } +} + +void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word) { + u8 b[4]; + b[0] = (u8)(word & 0xff); + b[1] = 
(u8)((word >> 8) & 0xff); + b[2] = (u8)((word >> 16) & 0xff); + b[3] = (u8)((word >> 24) & 0xff); + obj_patch(obj, sec_id, ofs, b, 4); +} + +/* ============================================================ + * Immediate encoding helpers + * ============================================================ */ + +void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm) { + const u32 nslots = sf ? 4u : 2u; + u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu); + + for (u32 i = 0; i < nslots; ++i) { + u32 slot = (u32)((v >> (i * 16)) & 0xffffu); + u64 cleared = v & ~((u64)0xffffu << (i * 16)); + if (slot != 0 && cleared == 0) { + aa64_emit32(mc, aa64_movz(sf, Rd, slot, i)); + return; + } + } + + { + u64 inv = sf ? ~v : ((~v) & 0xffffffffu); + for (u32 i = 0; i < nslots; ++i) { + u32 slot = (u32)((inv >> (i * 16)) & 0xffffu); + u64 cleared = inv & ~((u64)0xffffu << (i * 16)); + if (cleared == 0) { + aa64_emit32(mc, aa64_movn(sf, Rd, slot, i)); + return; + } + } + } + + int placed = 0; + for (u32 i = 0; i < nslots; ++i) { + u32 slot = (u32)((v >> (i * 16)) & 0xffffu); + if (!placed) { + if (slot == 0) continue; + aa64_emit32(mc, aa64_movz(sf, Rd, slot, i)); + placed = 1; + } else if (slot != 0) { + aa64_emit32(mc, aa64_movk(sf, Rd, slot, i)); + } + } + if (!placed) aa64_emit32(mc, aa64_movz(sf, Rd, 0, 0)); +} + +void emit_sp_add(MCEmitter* mc, u32 imm) { + if (imm <= 0xfff) { + aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm, 0)); + } else if ((imm & 0xfff) == 0 && (imm >> 12) <= 0xfff) { + aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm >> 12, 1)); + } else { + aa64_emit32(mc, aa64_add_imm(1, 31, 31, (imm >> 12) & 0xfff, 1)); + aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm & 0xfff, 0)); + } +} + +/* ============================================================ + * Function lifecycle + * ============================================================ */ + +void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + mc->set_section(mc, 
fd->text_section_id); + mc->emit_align(mc, 4, 0); + + a->fd = fd; + a->func_start = mc->pos(mc); + a->next_param_int = 0; + a->next_param_fp = 0; + a->next_param_stack = 0; + a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0; + a->cum_off = 0; + a->max_outgoing = 0; + a->used_cs_int_mask = 0; + a->used_cs_fp_mask = 0; + a->nslots = 0; + a->nscopes = 0; + a->has_alloca = 0; + a->nadd_patches = 0; + a->sret_ptr_slot = FRAME_SLOT_NONE; + a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0; + a->gp_save_slot = FRAME_SLOT_NONE; + a->fp_save_slot = FRAME_SLOT_NONE; + a->epilogue_label = mc->label_new(mc); + + mc->cfi_startproc(mc); + + a->prologue_pos = mc->pos(mc); + for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa64_emit32(mc, AA64_NOP); + + if (a->has_sret) { + FrameSlotDesc fsd = { + .type = CFREE_CG_TYPE_NONE, + .name = 0, + .loc = (SrcLoc){0, 0, 0}, + .size = 8, + .align = 8, + .kind = FS_SPILL, + .flags = 0, + }; + a->sret_ptr_slot = aa_frame_slot(t, &fsd); + } + + if (a->is_variadic) { + FrameSlotDesc gpd = { + .type = CFREE_CG_TYPE_NONE, + .name = 0, + .loc = (SrcLoc){0, 0, 0}, + .size = 64, + .align = 8, + .kind = FS_SPILL, + .flags = 0, + }; + a->gp_save_slot = aa_frame_slot(t, &gpd); + FrameSlotDesc fpd = { + .type = CFREE_CG_TYPE_NONE, + .name = 0, + .loc = (SrcLoc){0, 0, 0}, + .size = 128, + .align = 16, + .kind = FS_SPILL, + .flags = 0, + }; + a->fp_save_slot = aa_frame_slot(t, &fpd); + AASlot* gs = aa64_slot_get(a, a->gp_save_slot); + AASlot* fs = aa64_slot_get(a, a->fp_save_slot); + for (u32 i = 0; i < 8; ++i) { + aa64_emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i * 8)); + } + for (u32 i = 0; i < 8; ++i) { + aa64_emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i * 16)); + } + } +} + +void aa_func_end(CGTarget* t) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + u32 int_regs[10]; + u32 fp_regs[8]; + u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs); + u32 n_fp_saves = 
collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs); + + u32 outgoing_off = 0; + u32 int_save_off = a->max_outgoing; + u32 fp_save_off = int_save_off + n_int_saves * 8u; + u32 locals_off = fp_save_off + n_fp_saves * 8u; + u32 fp_lr_off = locals_off + a->cum_off; + u32 frame_size = fp_lr_off + 16; + frame_size = (frame_size + 15u) & ~15u; + fp_lr_off = frame_size - 16; + + (void)outgoing_off; + + mc->label_place(mc, a->epilogue_label); + + if (a->has_alloca) { + if (fp_lr_off <= 0xfff) { + aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=*/29, fp_lr_off, 0)); + } else { + compiler_panic(t->c, a->loc, + "aarch64: has_alloca + fp_lr_off %u out of imm12 range", + fp_lr_off); + } + } + + for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) { + u32 r0 = fp_regs[i]; + aa64_emit32(mc, aa64_ldr_fp_uimm(3, r0, 31, + fp_save_off + (u32)i * 8u)); + } + for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) { + u32 r0 = int_regs[i]; + aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31, + int_save_off + (u32)i * 8u)); + } + aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off)); + emit_sp_add(mc, frame_size); + aa64_emit32(mc, aa64_ret(AA64_LR)); + + u32 pos = a->prologue_pos; + ObjBuilder* obj = t->obj; + u32 sec = a->fd->text_section_id; + + u32 words[AA_PROLOGUE_WORDS]; + for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) words[i] = AA64_NOP; + u32 wi = 0; + + if (frame_size <= 0xfff) { + words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0); + } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) { + words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1); + } else { + if (wi + 2 > AA_PROLOGUE_WORDS) { + compiler_panic(t->c, a->loc, + "aarch64: prologue overflow for frame_size %u", + frame_size); + } + words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1); + words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0); + } + words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off); + words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0); + if (a->has_sret && 
a->sret_ptr_slot != FRAME_SLOT_NONE) { + AASlot* s = aa64_slot_get(a, a->sret_ptr_slot); + if (s) { + if (wi >= AA_PROLOGUE_WORDS) goto overflow; + words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off); + } + } + for (u32 i = 0; i < n_int_saves; ++i) { + u32 r0 = int_regs[i]; + if (wi >= AA_PROLOGUE_WORDS) goto overflow; + words[wi++] = aa64_str_uimm(3, r0, 31, int_save_off + i * 8u); + } + for (u32 i = 0; i < n_fp_saves; ++i) { + u32 r0 = fp_regs[i]; + if (wi >= AA_PROLOGUE_WORDS) goto overflow; + words[wi++] = aa64_str_fp_uimm(3, r0, 31, fp_save_off + i * 8u); + } + if (0) { + overflow: + compiler_panic( + t->c, a->loc, + "aarch64: prologue placeholder too small (used %u of %u words)", wi, + AA_PROLOGUE_WORDS); + } + + for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) { + aa64_patch32(obj, sec, pos + i * 4u, words[i]); + } + + if (a->max_outgoing > 0xfff) { + compiler_panic( + t->c, a->loc, + "aarch64: max_outgoing %u out of imm12 range for alloca patch", + a->max_outgoing); + } + for (u32 i = 0; i < a->nadd_patches; ++i) { + u32 dr = a->add_patches[i].dst_reg; + u32 word = aa64_add_imm(1, dr, /*Rn=SP*/ 31, a->max_outgoing, 0); + aa64_patch32(obj, sec, a->add_patches[i].pos, word); + } + + u32 end = mc->pos(mc); + obj_symbol_define(obj, a->fd->sym, sec, (u64)a->func_start, + (u64)(end - a->func_start)); + + mc->cfi_endproc(mc); + a->fd = NULL; +} + +/* ============================================================ + * Frame slots + * ============================================================ */ + +FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) { + AAImpl* a = impl_of(t); + if (a->nslots == a->slots_cap) { + u32 ncap = a->slots_cap ? a->slots_cap * 2 : 8; + AASlot* nbuf = arena_array(t->c->tu, AASlot, ncap); + if (a->slots) memcpy(nbuf, a->slots, sizeof(AASlot) * a->nslots); + a->slots = nbuf; + a->slots_cap = ncap; + } + u32 size = d->size ? d->size : 8; + u32 align = d->align ? 
d->align : 1; + u32 next = a->cum_off + size; + u32 mask = align - 1; + next = (next + mask) & ~mask; + + AASlot* s = &a->slots[a->nslots]; + s->off = next; + s->size = size; + s->align = align; + s->kind = d->kind; + + a->cum_off = next; + a->nslots++; + return (FrameSlot)(a->nslots); +} + +/* ============================================================ + * Parameters + * ============================================================ */ + +void aa_param(CGTarget* t, const CGParamDesc* p) { + AAImpl* a = impl_of(t); + AASlot* s = aa64_slot_get(a, p->slot); + if (!s) { + compiler_panic(t->c, a->loc, "aarch64 param: bad slot"); + } + const ABIArgInfo* ai = p->abi; + + if (ai->kind == ABI_ARG_IGNORE) return; + if (ai->kind == ABI_ARG_INDIRECT) { + u32 ptr_reg; + if (a->next_param_int < 8) { + ptr_reg = a->next_param_int++; + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + aa64_emit32(t->mc, aa64_ldur(3, AA_TMP0, 29, (i32)(16 + caller_off))); + ptr_reg = AA_TMP0; + } + u32 nbytes = s->size; + u32 i = 0; + while (i + 8 <= nbytes) { + aa64_emit32(t->mc, aa64_ldur(3, AA_TMP1, ptr_reg, (i32)i)); + aa64_emit32(t->mc, aa64_stur(3, AA_TMP1, 29, -(i32)s->off + (i32)i)); + i += 8; + } + while (i + 4 <= nbytes) { + aa64_emit32(t->mc, aa64_ldur(2, AA_TMP1, ptr_reg, (i32)i)); + aa64_emit32(t->mc, aa64_stur(2, AA_TMP1, 29, -(i32)s->off + (i32)i)); + i += 4; + } + while (i + 2 <= nbytes) { + aa64_emit32(t->mc, aa64_ldur(1, AA_TMP1, ptr_reg, (i32)i)); + aa64_emit32(t->mc, aa64_stur(1, AA_TMP1, 29, -(i32)s->off + (i32)i)); + i += 2; + } + while (i < nbytes) { + aa64_emit32(t->mc, aa64_ldur(0, AA_TMP1, ptr_reg, (i32)i)); + aa64_emit32(t->mc, aa64_stur(0, AA_TMP1, 29, -(i32)s->off + (i32)i)); + i += 1; + } + return; + } + for (u16 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* pt = &ai->parts[i]; + u32 part_off = pt->src_offset; + u32 sz = pt->size; + u32 sidx = size_idx_for_bytes(sz); + + if (pt->cls == ABI_CLASS_INT) { + if (a->next_param_int < 8) { + 
u32 reg = a->next_param_int++; + aa64_emit32(t->mc, aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off)); + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + aa64_emit32(t->mc, aa64_ldur(sidx, AA_TMP0, 29, (i32)(16 + caller_off))); + aa64_emit32(t->mc, + aa64_stur(sidx, AA_TMP0, 29, + -(i32)s->off + (i32)part_off)); + } + } else if (pt->cls == ABI_CLASS_FP) { + if (a->next_param_fp < 8) { + u32 reg = a->next_param_fp++; + aa64_emit32(t->mc, + aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off)); + } else { + u32 caller_off = a->next_param_stack; + a->next_param_stack += 8; + aa64_emit32(t->mc, + aa64_ldur_fp(sidx, AA_FP_TMP0, 29, + (i32)(16 + caller_off))); + aa64_emit32(t->mc, + aa64_stur_fp(sidx, AA_FP_TMP0, 29, + -(i32)s->off + (i32)part_off)); + } + } else { + compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl", + (int)pt->cls); + } + } +} + +/* ============================================================ + * Address materialization helpers + * ============================================================ */ + +static int use_got_for_sym(CGTarget* t, ObjSymId sym) { + return obj_symbol_extern_via_got(t->c, t->obj, sym); +} + +void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym) { + MCEmitter* mc = t->mc; + u32 sec = mc->section_id; + u32 adrp_pos = mc->pos(mc); + aa64_emit32(mc, aa64_adrp_base(dst_reg)); + mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_GOT_PAGE, sym, 0, 0, 0); + u32 ldr_pos = mc->pos(mc); + aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, dst_reg, dst_reg, 0)); + mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LD64_GOT_LO12_NC, sym, 0, 0, 0); +} + +void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend) { + MCEmitter* mc = t->mc; + if (use_got_for_sym(t, sym)) { + aa64_emit_got_load_addr(t, dst_reg, sym); + if (addend) aa64_emit_addr_adjust(mc, dst_reg, dst_reg, (i32)addend); + return; + } + u32 sec = mc->section_id; + u32 adrp_pos = mc->pos(mc); + aa64_emit32(mc, 
aa64_adrp_base(dst_reg)); + mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, addend, + 0, 0); + u32 add_pos = mc->pos(mc); + aa64_emit32(mc, aa64_add_imm(1, dst_reg, dst_reg, 0, 0)); + mc->emit_reloc_at(mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, addend, 0, + 0); +} + +void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off) { + if (off == 0) { + aa64_emit32(mc, aa64_mov_reg(1, Rd, base)); + return; + } + u32 abs_off = (off < 0) ? (u32)(-off) : (u32)off; + if (abs_off <= 0xfff) { + if (off < 0) + aa64_emit32(mc, aa64_sub_imm(1, Rd, base, abs_off, 0)); + else + aa64_emit32(mc, aa64_add_imm(1, Rd, base, abs_off, 0)); + return; + } + if ((abs_off >> 24) == 0) { + u32 hi = (abs_off >> 12) & 0xfff; + u32 lo = abs_off & 0xfff; + if (off < 0) { + if (hi) aa64_emit32(mc, aa64_sub_imm(1, Rd, base, hi, 1)); + if (lo) aa64_emit32(mc, aa64_sub_imm(1, Rd, hi ? Rd : base, lo, 0)); + } else { + if (hi) aa64_emit32(mc, aa64_add_imm(1, Rd, base, hi, 1)); + if (lo) aa64_emit32(mc, aa64_add_imm(1, Rd, hi ? Rd : base, lo, 0)); + } + return; + } + aa64_emit_load_imm(mc, 1, Rd, off); + aa64_emit32(mc, aa64_add(1, Rd, base, Rd)); +} diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h @@ -0,0 +1,306 @@ +/* aarch64/internal.h — private types and forward decls shared across + * emit.c / alloc.c / ops.c. NOT part of the public API. */ +#pragma once + +#include <string.h> + +#include "arch/aa64/asm.h" +#include "arch/aa64/isa.h" +#include "arch/aa64/regs.h" +#include "arch/arch.h" +#include "core/arena.h" +#include "core/pool.h" +#include "obj/obj.h" + +/* ============================================================ + * Local encoding helpers (kept here, not in aa64_isa.h). + * ============================================================ */ + +#define AA64_NOP 0xD503201Fu + +/* Hidden backend temporaries. 
These must stay outside the allocable pools and + * outside optimizer scratch registers because CGTarget ops may clobber them + * while lowering a single operation. AA_FP_TMP0 names v31, not integer x31. */ +enum { + AA_TMP0 = 9u, + AA_TMP1 = 10u, + AA_TMP2 = 11u, + AA_FP_TMP0 = 31u, +}; +#define CG_BUILTIN_ID(k) ((CfreeCgTypeId)((1u << 6) | (u32)(k))) + +static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { + i32 sc = byte_off >> 3; + return 0xA9000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { + i32 sc = byte_off >> 3; + return 0xA9400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { + i32 sc = byte_off >> 3; + return 0x6D000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { + i32 sc = byte_off >> 3; + return 0x6D400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} + +static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9) { + return 0x38000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9) { + return 0x38400000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) { + return 0x3C000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) { + return 0x3C400000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} + +static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { + u32 sc 
= byte_off >> size; + return 0x39000000u | (size << 30) | ((sc & 0xfffu) << 10) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_ldr_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { + u32 sc = byte_off >> size; + return 0x39400000u | (size << 30) | ((sc & 0xfffu) << 10) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_str_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { + u32 sc = byte_off >> size; + return 0x3D000000u | (size << 30) | ((sc & 0xfffu) << 10) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} + +static inline u32 aa64_mrs_tpidr_el0(u32 Rt) { + return 0xD53BD040u | (Rt & 0x1fu); +} +static inline u32 aa64_b_base(void) { return 0x14000000u; } +static inline u32 aa64_bl_base(void) { return 0x94000000u; } + +static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd & 0x1f); } + +static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { + u32 sc = byte_off >> size; + return 0x3D400000u | (size << 30) | ((sc & 0xfffu) << 10) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} + +static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn) { + return 0x1E204000u | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} + +static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12) { + return 0x71000000u | (sf << 31) | ((imm12 & 0xfff) << 10) | + ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} + +static inline u32 aa64_cset_eq(u32 sf, u32 Rd) { + return 0x1A800400u | (sf << 31) | (31u << 16) | (0x1u << 12) | (31u << 5) | + (Rd & 0x1f); +} + +static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn) { + return 0x1E380000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | + (Rd & 0x1f); +} +static inline u32 aa64_fcvtzu(u32 sf, u32 type, u32 Rd, u32 Rn) { + return 0x1E390000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | + (Rd & 0x1f); +} +static inline u32 aa64_scvtf(u32 sf, u32 type, u32 Rd, u32 Rn) { + return 0x1E220000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | + (Rd & 0x1f); +} 
+static inline u32 aa64_ucvtf(u32 sf, u32 type, u32 Rd, u32 Rn) { + return 0x1E230000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | + (Rd & 0x1f); +} + +static inline u32 aa64_fcvt_d_s(u32 Rd, u32 Rn) { + return 0x1E22C000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_fcvt_s_d(u32 Rd, u32 Rn) { + return 0x1E624000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} + +static inline u32 aa64_fmov_s_w(u32 Rd, u32 Rn) { + return 0x1E270000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_fmov_w_s(u32 Rd, u32 Rn) { + return 0x1E260000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn) { + return 0x9E670000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn) { + return 0x9E660000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} + +static inline u32 aa64_sub_extreg_x_uxtx(u32 Rd, u32 Rn, u32 Rm) { + return 0xCB206000u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} + +static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) { + return 0x6B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | + (Rd & 0x1f); +} + +static inline u32 aa64_b_cond(u32 cond) { return 0x54000000u | (cond & 0xfu); } + +static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) { + return 0x1A800400u | (sf << 31) | ((Rm & 0x1f) << 16) | + ((cond & 0xfu) << 12) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond) { + return aa64_csinc(sf, Rd, 31u, 31u, cond ^ 1u); +} + +static inline u32 aa64_fadd(u32 type, u32 Rd, u32 Rn, u32 Rm) { + return 0x1E202800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | + ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_fsub(u32 type, u32 Rd, u32 Rn, u32 Rm) { + return 0x1E203800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | + ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_fmul(u32 type, u32 Rd, u32 Rn, u32 Rm) { + return 0x1E200800u | ((type & 3) << 22) | ((Rm & 
0x1f) << 16) | + ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_fdiv(u32 type, u32 Rd, u32 Rn, u32 Rm) { + return 0x1E201800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | + ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} + +static inline u32 aa64_sbfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { + return 0x13000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) | + ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_ubfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { + return 0x53000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) | + ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { + return 0x33000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) | + ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} + +/* ============================================================ + * AAImpl types + * ============================================================ */ + +#define AA_PROLOGUE_WORDS \ + 22u /* worst case: sub sp + stp/add fp + sret + 5 int + 8 fp saves */ + +typedef struct AASlot { + u32 off; + u32 size; + u32 align; + u8 kind; + u8 pad[3]; +} AASlot; + +typedef struct AAScope { + u8 kind; + u8 has_else; + u8 pad[2]; + MCLabel else_label; + MCLabel end_label; + Label break_label; + Label continue_label; +} AAScope; + +typedef struct AAImpl { + CGTarget base; + SrcLoc loc; + const CGFuncDesc* fd; + + u32 func_start; + u32 prologue_pos; + MCLabel epilogue_label; + + AASlot* slots; + u32 nslots; + u32 slots_cap; + u32 cum_off; + u32 max_outgoing; + + u32 next_param_int; + u32 next_param_fp; + u32 next_param_stack; + u8 has_sret; + FrameSlot sret_ptr_slot; + + u32 used_cs_int_mask; /* bit reg set when x19-x28 must be preserved */ + u32 used_cs_fp_mask; /* bit reg set when d8-d15 must be preserved */ + + AAScope* scopes; + u32 nscopes; + u32 scopes_cap; + + u8 has_alloca; + struct AAAllocaPatch { + u32 pos; + u32 dst_reg; + 
}* add_patches; + u32 nadd_patches; + u32 add_patches_cap; + + u8 is_variadic; + FrameSlot gp_save_slot; + FrameSlot fp_save_slot; +} AAImpl; + +/* ============================================================ + * Cross-file forward declarations + * ============================================================ */ + +/* emit.c helpers used in alloc.c / ops.c */ +void aa64_emit32(MCEmitter* mc, u32 word); +void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word); +void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm); +void emit_sp_add(MCEmitter* mc, u32 imm); +void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off); +void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym); +void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend); + +/* emit.c public surface */ +FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d); +void aa_func_begin(CGTarget* t, const CGFuncDesc* fd); +void aa_func_end(CGTarget* t); +void aa_param(CGTarget* t, const CGParamDesc* p); + +/* alloc.c helpers used in emit.c / ops.c */ +AAImpl* impl_of(CGTarget* t); +AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs); +void aa_jump(CGTarget* t, Label l); + +/* ops.c helpers used in alloc.c */ +void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma); +void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma); +u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch); + +/* alloc.c helpers used in ops.c */ +void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op); +void aa_alloc_vtable_init(CGTarget* t); +void aa_coord_vtable_init(CGTarget* t); + +/* shared type helpers (defined in emit.c, used broadly) */ +int type_is_64(CfreeCgTypeId t); +int type_is_fp_double(CfreeCgTypeId t); +int type_is_signed(CfreeCgTypeId t); +u32 type_byte_size(CfreeCgTypeId t); +u32 size_idx_for_bytes(u32 nbytes); +u32 reg_num(Operand op); diff --git a/src/arch/aa64/isa.c b/src/arch/aa64/isa.c @@ -0,0 +1,598 @@ +/* AArch64 
instruction descriptor table + operand print/parse dispatch. + * + * The table mirrors the inline encoders in aa64_isa.h: each row records + * (mnemonic, match, mask, format, flags) so the disassembler can identify + * a raw 32-bit word with one mask-and-compare and then dispatch on the + * format to extract operand fields via the same unpack functions the + * encoder uses. Encoder and decoder share the bit knowledge — when an + * opcode value or field position changes, both sides update at one site. + * + * Mask values include the family mask plus the bits that distinguish a + * specific instruction from its siblings in the same family. sf (bit 31) + * is intentionally a don't-care for formats where both 32- and 64-bit + * forms share one row; the unpacker reads sf separately when printing + * operands. + * + * Row ordering: first-match wins. Aliases (rows with AA64_ASMFL_ALIAS) + * are tighter masks placed BEFORE the canonical row they alias so the + * disassembler renders the alias spelling. The assembler accepts both + * spellings — they map to the same encoded word. */ + +#include "arch/aa64/isa.h" + +#include <stddef.h> + +const AA64InsnDesc aa64_insn_table[] = { + /* ----- Move-wide immediate (MOVN / MOVZ / MOVK) ----- */ + {"movn", 0x12800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}}, + {"movz", 0x52800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}}, + {"movk", 0x72800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}}, + + /* ----- Logical, shifted register ----- + * Alias MOV Rd, Rm is ORR Rd, ZR, Rm with shift=0, imm6=0. The mask + * pins Rn (bits 9:5) to 11111 (ZR) and shift/imm6 to 0 so only the + * MOV spelling matches; broader ORR rows below catch the rest. 
*/ + {"mov", 0x2A0003E0u, 0x7FE0FFE0u, AA64_FMT_LOG_SR, AA64_ASMFL_ALIAS, + {0, 0}}, + /* MVN Rd, Rm ≡ ORN Rd, ZR, Rm (logical N=1, Rn=ZR, no shift) */ + {"mvn", 0x2A2003E0u, 0x7FE0FFE0u, AA64_FMT_LOG_SR, AA64_ASMFL_ALIAS, + {0, 0}}, + {"and", 0x0A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, + {"bic", 0x0A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, + {"orr", 0x2A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, + {"orn", 0x2A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, + {"eor", 0x4A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, + {"eon", 0x4A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, + {"ands", 0x6A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, + {"bics", 0x6A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, + + /* ----- Add/Sub, shifted register ----- + * NEG Rd, Rm ≡ SUB Rd, ZR, Rm (Rn=ZR, shift=0, imm6=0). */ + {"neg", 0x4B0003E0u, 0x7FE0FFE0u, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS, + {0, 0}}, + {"negs", 0x6B0003E0u, 0x7FE0FFE0u, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS, + {0, 0}}, + /* CMP Rn, Rm ≡ SUBS ZR, Rn, Rm. */ + {"cmp", 0x6B00001Fu, 0x7F20001Fu, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS, + {0, 0}}, + /* CMN Rn, Rm ≡ ADDS ZR, Rn, Rm. */ + {"cmn", 0x2B00001Fu, 0x7F20001Fu, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS, + {0, 0}}, + {"add", 0x0B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}}, + {"adds", 0x2B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}}, + {"sub", 0x4B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}}, + {"subs", 0x6B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}}, + + /* ----- Data-processing 3-source ----- + * MUL Rd, Rn, Rm ≡ MADD Rd, Rn, Rm, ZR (Ra=ZR, op31=0, o0=0). */ + {"mul", 0x1B007C00u, 0x7FE0FC00u, AA64_FMT_DP3, AA64_ASMFL_ALIAS, {0, 0}}, + /* MNEG Rd, Rn, Rm ≡ MSUB Rd, Rn, Rm, ZR. 
*/ + {"mneg", 0x1B00FC00u, 0x7FE0FC00u, AA64_FMT_DP3, AA64_ASMFL_ALIAS, {0, 0}}, + {"madd", 0x1B000000u, 0x7FE08000u, AA64_FMT_DP3, 0, {0, 0}}, + {"msub", 0x1B008000u, 0x7FE08000u, AA64_FMT_DP3, 0, {0, 0}}, + + /* ----- Data-processing 2-source ----- */ + {"udiv", 0x1AC00800u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, + {"sdiv", 0x1AC00C00u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, + {"lslv", 0x1AC02000u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, + {"lsrv", 0x1AC02400u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, + {"asrv", 0x1AC02800u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, + {"rorv", 0x1AC02C00u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, + + /* ----- Unconditional branch (register) ----- + * RET aliases its no-operand spelling to RET X30 (Rn=11110). The + * tighter row matches when Rn=30 and prints "ret" without operands; + * the looser row below catches RET Xn for other Rn. */ + {"ret", 0xD65F03C0u, 0xFFFFFFFFu, AA64_FMT_BR_REG, + AA64_ASMFL_ALIAS | AA64_ASMFL_NORN, {0, 0}}, + {"br", 0xD61F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}}, + {"blr", 0xD63F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}}, + {"ret", 0xD65F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}}, + + /* ----- PC-relative addressing ----- */ + {"adr", 0x10000000u, 0x9F000000u, AA64_FMT_PCREL_ADR, 0, {0, 0}}, + {"adrp", 0x90000000u, 0x9F000000u, AA64_FMT_PCREL_ADR, 0, {0, 0}}, + + /* ----- Add/Sub immediate ----- */ + {"add", 0x11000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}}, + {"adds", 0x31000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}}, + {"sub", 0x51000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}}, + {"subs", 0x71000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}}, + + /* ----- Load/store, unsigned 12-bit immediate (scaled) ----- + * Mask: family bits 29:27 + 25:24 + size(31:30) + V(26) + opc(23:22). 
*/ + {"strb", 0x39000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, + {"ldrb", 0x39400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, + {"strh", 0x79000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, + {"ldrh", 0x79400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, + {"str", 0xB9000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* 32 */ + {"ldr", 0xB9400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, + {"str", 0xF9000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1, + {0, 0}}, /* 64 */ + {"ldr", 0xF9400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1, + {0, 0}}, + /* SIMD/FP scaled loads/stores (V=1). size 0..2 select B/H/S; size=3 + * selects D; the 128-bit Q form uses size=00 with opc bit 1 set and + * is not yet emitted by codegen. */ + {"str", 0x3D000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* B */ + {"ldr", 0x3D400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, + {"str", 0x7D000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* H */ + {"ldr", 0x7D400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, + {"str", 0xBD000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* S */ + {"ldr", 0xBD400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, + {"str", 0xFD000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1, + {0, 0}}, /* D */ + {"ldr", 0xFD400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1, + {0, 0}}, + + /* ----- Load/store, unscaled signed 9-bit immediate (LDUR/STUR) ----- + * V=0 first, V=1 next. Per-row mask narrows size+V+opc; family mask + * pins the high family bits + the SIMM9-vs-other-variant selector. 
*/ + {"sturb", 0x38000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, + {"ldurb", 0x38400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, + {"sturh", 0x78000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, + {"ldurh", 0x78400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, + {"stur", 0xB8000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* 32 */ + {"ldur", 0xB8400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, + {"stur", 0xF8000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1, + {0, 0}}, + {"ldur", 0xF8400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1, + {0, 0}}, + {"stur", 0x3C000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* B */ + {"ldur", 0x3C400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, + {"stur", 0x7C000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* H */ + {"ldur", 0x7C400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, + {"stur", 0xBC000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* S */ + {"ldur", 0xBC400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, + {"stur", 0xFC000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1, + {0, 0}}, /* D */ + {"ldur", 0xFC400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1, + {0, 0}}, + + /* ----- Load/store pair, pre-indexed (opc=10 X / opc=01 D) ----- */ + {"stp", 0xA9800000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, AA64_ASMFL_SF1, + {0, 0}}, + {"ldp", 0xA9C00000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, AA64_ASMFL_SF1, + {0, 0}}, + {"stp", 0x6D800000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, 0, {0, 0}}, /* D */ + {"ldp", 0x6DC00000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, 0, {0, 0}}, + + /* ----- Load/store pair, signed-offset ----- */ + {"stp", 0xA9000000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, AA64_ASMFL_SF1, + {0, 0}}, + {"ldp", 0xA9400000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, AA64_ASMFL_SF1, + {0, 0}}, + {"stp", 0x6D000000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, 0, {0, 0}}, /* D */ + {"ldp", 0x6D400000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, 0, {0, 0}}, + + /* ----- 
Unconditional branch (immediate) ----- */ + {"b", 0x14000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}}, + {"bl", 0x94000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}}, + + /* ----- Conditional branch (immediate) ----- */ + {"b.cond", 0x54000000u, 0xFF000010u, AA64_FMT_BR_COND, 0, {0, 0}}, + + /* ----- Compare-and-branch ----- */ + {"cbz", 0x34000000u, 0x7F000000u, AA64_FMT_CB, 0, {0, 0}}, + {"cbnz", 0x35000000u, 0x7F000000u, AA64_FMT_CB, 0, {0, 0}}, + + /* ----- Exception generation ----- */ + {"svc", 0xD4000001u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}}, + {"brk", 0xD4200000u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}}, + {"hlt", 0xD4400000u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}}, + + /* ----- Hint ----- */ + {"nop", 0xD503201Fu, 0xFFFFFFFFu, AA64_FMT_HINT, 0, {0, 0}}, + + /* ----- Memory barriers (DMB / DSB / ISB / CLREX) ----- + * Mask covers everything but CRm at bits[11:8]. */ + {"dmb", 0xD50330BFu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}}, + {"dsb", 0xD503309Fu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}}, + {"isb", 0xD50330DFu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}}, + {"clrex", 0xD503305Fu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}}, +}; + +const u32 aa64_insn_table_n = + (u32)(sizeof aa64_insn_table / sizeof aa64_insn_table[0]); + +const AA64InsnDesc* aa64_disasm_find(u32 word) { + for (u32 i = 0; i < aa64_insn_table_n; ++i) { + const AA64InsnDesc* d = &aa64_insn_table[i]; + if ((word & d->mask) == d->match) return d; + } + return NULL; +} + +/* ===================================================================== + * Operand print — one helper per format. + * + * Format choices for immediates: + * - branch displacements, signed add/sub imm, signed ldur/stur ofs: + * signed decimal. + * - MOVZ/MOVK halfword, logical bitmask, exception generation #imm: + * 0x-prefixed hex. 
+ * + * Register naming: ZR alias for x31 in places where the encoding treats + * Rd/Rn=31 as the zero register (logical/arith), SP where it treats 31 + * as the stack pointer (add/sub imm, ldr/str-uimm Rn, ldp/stp Rn). + * + * vaddr is folded into PC-relative branch operands when nonzero. */ + +static void emit_reg(StrBuf* sb, u32 r, int sf, int sp_means_sp) { + if (r == 31u) { + if (sp_means_sp) strbuf_puts(sb, "sp"); + else if (sf) strbuf_puts(sb, "xzr"); + else strbuf_puts(sb, "wzr"); + return; + } + strbuf_putc(sb, sf ? 'x' : 'w'); + strbuf_put_u64(sb, (u64)r); +} + +static void emit_vreg(StrBuf* sb, u32 r, char prefix) { + strbuf_putc(sb, prefix); + strbuf_put_u64(sb, (u64)r); +} + +static void emit_cond(StrBuf* sb, u32 cond) { + static const char* names[16] = {"eq", "ne", "cs", "cc", "mi", "pl", + "vs", "vc", "hi", "ls", "ge", "lt", + "gt", "le", "al", "nv"}; + strbuf_puts(sb, names[cond & 0xfu]); +} + +/* Sign-extend an n-bit value held in the low bits of v to i64. */ +static i64 sext(u64 v, u32 nbits) { + u64 mask = (nbits >= 64u) ? ~0ull : ((1ull << nbits) - 1ull); + v &= mask; + u64 sign = (nbits == 0u) ? 
0ull : (1ull << (nbits - 1u)); + if (v & sign) v |= ~mask; + return (i64)v; +} + +static void print_movewide(StrBuf* sb, u32 w) { + AA64MoveWide f = aa64_movewide_unpack(w); + emit_reg(sb, f.Rd, (int)f.sf, /*sp_means_sp=*/0); + strbuf_puts(sb, ", "); + strbuf_put_hex_u64(sb, (u64)f.imm16); + if (f.hw) { + strbuf_puts(sb, ", lsl "); + strbuf_put_u64(sb, (u64)(f.hw * 16u)); + } +} + +static void print_logsr(StrBuf* sb, u32 w, const AA64InsnDesc* d) { + AA64LogSR f = aa64_logsr_unpack(w); + if (d->flags & AA64_ASMFL_ALIAS) { + /* MOV / MVN: Rd, Rm */ + emit_reg(sb, f.Rd, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rm, (int)f.sf, 0); + return; + } + emit_reg(sb, f.Rd, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rn, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rm, (int)f.sf, 0); + if (f.imm6 || f.shift) { + static const char* sh[4] = {"lsl", "lsr", "asr", "ror"}; + strbuf_puts(sb, ", "); + strbuf_puts(sb, sh[f.shift & 3u]); + strbuf_puts(sb, " #"); + strbuf_put_u64(sb, (u64)f.imm6); + } +} + +static void print_addsubsr(StrBuf* sb, u32 w, const AA64InsnDesc* d) { + AA64AddSubSR f = aa64_addsubsr_unpack(w); + if (d->flags & AA64_ASMFL_ALIAS) { + /* NEG / NEGS / CMP / CMN. 
*/ + if (d->mnemonic[0] == 'c') { + /* CMP / CMN — print Rn, Rm */ + emit_reg(sb, f.Rn, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rm, (int)f.sf, 0); + } else { + /* NEG / NEGS — print Rd, Rm */ + emit_reg(sb, f.Rd, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rm, (int)f.sf, 0); + } + return; + } + emit_reg(sb, f.Rd, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rn, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rm, (int)f.sf, 0); + if (f.imm6 || f.shift) { + static const char* sh[4] = {"lsl", "lsr", "asr", "rsv"}; + strbuf_puts(sb, ", "); + strbuf_puts(sb, sh[f.shift & 3u]); + strbuf_puts(sb, " #"); + strbuf_put_u64(sb, (u64)f.imm6); + } +} + +static void print_dp3(StrBuf* sb, u32 w, const AA64InsnDesc* d) { + AA64DP3 f = aa64_dp3_unpack(w); + /* MUL / MNEG alias drop Ra (which is ZR for the alias). */ + if (d->flags & AA64_ASMFL_ALIAS) { + emit_reg(sb, f.Rd, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rn, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rm, (int)f.sf, 0); + return; + } + emit_reg(sb, f.Rd, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rn, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rm, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Ra, (int)f.sf, 0); +} + +static void print_dp2(StrBuf* sb, u32 w) { + AA64DP2 f = aa64_dp2_unpack(w); + emit_reg(sb, f.Rd, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rn, (int)f.sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rm, (int)f.sf, 0); +} + +static void print_brreg(StrBuf* sb, u32 w, const AA64InsnDesc* d) { + AA64BrReg f = aa64_brreg_unpack(w); + if (d->flags & AA64_ASMFL_NORN) return; /* RET (with implicit X30) */ + emit_reg(sb, f.Rn, /*sf=*/1, 0); +} + +static void print_pcrel(StrBuf* sb, u32 w, u64 vaddr) { + AA64PCRelAdr f = aa64_pcrel_adr_unpack(w); + emit_reg(sb, f.Rd, /*sf=*/1, 0); + strbuf_puts(sb, ", "); + i64 imm = sext(((u64)f.immhi << 2) | (u64)f.immlo, 21); + if (f.op == 
AA64_ADR_OP_ADRP) imm <<= 12; + if (vaddr) { + u64 base = (f.op == AA64_ADR_OP_ADRP) ? (vaddr & ~0xfffull) : vaddr; + strbuf_put_hex_u64(sb, base + (u64)imm); + } else { + strbuf_puts(sb, "#"); + strbuf_put_i64(sb, imm); + } +} + +static void print_addsubimm(StrBuf* sb, u32 w) { + AA64AddSubImm f = aa64_addsubimm_unpack(w); + /* For these encodings, Rd/Rn=31 means SP. */ + emit_reg(sb, f.Rd, (int)f.sf, 1); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rn, (int)f.sf, 1); + strbuf_puts(sb, ", #"); + strbuf_put_u64(sb, (u64)f.imm12); + if (f.sh) strbuf_puts(sb, ", lsl #12"); +} + +static u32 ldst_log2_size(const AA64InsnDesc* d, u32 size_field) { + (void)d; + return size_field & 3u; +} + +static void print_ldst_uimm(StrBuf* sb, u32 w, const AA64InsnDesc* d) { + AA64LdStUimm f = aa64_ldst_uimm_unpack(w); + u32 sz = ldst_log2_size(d, f.size); + /* Pick reg prefix: V=0 picks W/X by size; V=1 picks B/H/S/D by size. */ + if (f.V == 0) { + emit_reg(sb, f.Rt, /*sf=*/(int)(sz == 3u), 0); + } else { + char p = (sz == 0u) ? 'b' : (sz == 1u) ? 'h' : (sz == 2u) ? 's' : 'd'; + emit_vreg(sb, f.Rt, p); + } + strbuf_puts(sb, ", ["); + emit_reg(sb, f.Rn, /*sf=*/1, 1); + u32 byte_off = f.imm12 << sz; + if (byte_off) { + strbuf_puts(sb, ", #"); + strbuf_put_u64(sb, (u64)byte_off); + } + strbuf_putc(sb, ']'); +} + +static void print_ldst_simm9(StrBuf* sb, u32 w, const AA64InsnDesc* d) { + AA64LdStSimm9 f = aa64_ldst_simm9_unpack(w); + u32 sz = f.size & 3u; + (void)d; + if (f.V == 0) { + emit_reg(sb, f.Rt, /*sf=*/(int)(sz == 3u), 0); + } else { + char p = (sz == 0u) ? 'b' : (sz == 1u) ? 'h' : (sz == 2u) ? 
's' : 'd'; + emit_vreg(sb, f.Rt, p); + } + strbuf_puts(sb, ", ["); + emit_reg(sb, f.Rn, /*sf=*/1, 1); + i64 off = sext((u64)f.imm9, 9); + if (off) { + strbuf_puts(sb, ", #"); + strbuf_put_i64(sb, off); + } + strbuf_putc(sb, ']'); +} + +static void print_ldstp_common(StrBuf* sb, AA64LdStPPre f, int pre) { + /* opc=10 → 64-bit X; opc=00 → 32-bit W; opc=01 (V=1) → D (FP); + * opc=00 (V=1) → S; opc=10 (V=1) → Q (not yet emitted). */ + i64 scale; + int is_fp = (f.V == 1); + char fp_prefix = 's'; + int sf = 1; + if (is_fp) { + if (f.opc == 0) { + fp_prefix = 's'; + scale = 4; + } else if (f.opc == 1) { + fp_prefix = 'd'; + scale = 8; + } else { + fp_prefix = 'q'; + scale = 16; + } + } else { + sf = (f.opc == 2); + scale = sf ? 8 : 4; + } + if (is_fp) { + emit_vreg(sb, f.Rt, fp_prefix); + strbuf_puts(sb, ", "); + emit_vreg(sb, f.Rt2, fp_prefix); + } else { + emit_reg(sb, f.Rt, sf, 0); + strbuf_puts(sb, ", "); + emit_reg(sb, f.Rt2, sf, 0); + } + strbuf_puts(sb, ", ["); + emit_reg(sb, f.Rn, /*sf=*/1, 1); + i64 byte_off = sext((u64)f.imm7, 7) * scale; + if (byte_off) { + strbuf_puts(sb, ", #"); + strbuf_put_i64(sb, byte_off); + } + strbuf_putc(sb, ']'); + if (pre) strbuf_putc(sb, '!'); +} + +static void print_ldstp_pre(StrBuf* sb, u32 w) { + print_ldstp_common(sb, aa64_ldstp_pre_unpack(w), /*pre=*/1); +} +static void print_ldstp_soff(StrBuf* sb, u32 w) { + print_ldstp_common(sb, aa64_ldstp_soff_unpack(w), /*pre=*/0); +} + +static void print_br_imm(StrBuf* sb, u32 w, u64 vaddr) { + AA64BrImm f = aa64_brimm_unpack(w); + i64 ofs = sext((u64)f.imm26, 26) * 4; + if (vaddr) { + strbuf_put_hex_u64(sb, vaddr + (u64)ofs); + } else { + strbuf_puts(sb, "#"); + strbuf_put_i64(sb, ofs); + } +} + +static void print_br_cond(StrBuf* sb, u32 w, u64 vaddr, + const AA64InsnDesc* d) { + AA64BrCond f = aa64_brcond_unpack(w); + (void)d; + /* mnemonic is "b.cond"; we'll print cond as a suffix on the target. 
+ * The b.cond row keeps a single mnemonic for printing — for the asm + * spelling to be canonical the writer will need to emit b.<cc>, which + * is the printer's job at the dispatcher level (see aa64_print_operands). */ + emit_cond(sb, f.cond); + strbuf_putc(sb, ' '); + i64 ofs = sext((u64)f.imm19, 19) * 4; + if (vaddr) { + strbuf_put_hex_u64(sb, vaddr + (u64)ofs); + } else { + strbuf_puts(sb, "#"); + strbuf_put_i64(sb, ofs); + } +} + +static void print_cb(StrBuf* sb, u32 w, u64 vaddr) { + AA64CB f = aa64_cb_unpack(w); + emit_reg(sb, f.Rt, (int)f.sf, 0); + strbuf_puts(sb, ", "); + i64 ofs = sext((u64)f.imm19, 19) * 4; + if (vaddr) { + strbuf_put_hex_u64(sb, vaddr + (u64)ofs); + } else { + strbuf_puts(sb, "#"); + strbuf_put_i64(sb, ofs); + } +} + +static void print_except(StrBuf* sb, u32 w) { + AA64Except f = aa64_except_unpack(w); + strbuf_puts(sb, "#"); + strbuf_put_hex_u64(sb, (u64)f.imm16); +} + +static void print_barrier(StrBuf* sb, u32 w, const AA64InsnDesc* desc) { + AA64Barrier f = aa64_barrier_unpack(w); + /* ISB and CLREX with the default CRm=SY (15) print without an + * operand. DMB/DSB always carry an option. */ + int is_isb = (f.op2 == AA64_BARRIER_OP2_ISB); + int is_clrex = (f.op2 == AA64_BARRIER_OP2_CLREX); + if ((is_isb || is_clrex) && f.CRm == AA64_BARRIER_OPT_SY) return; + const char* opt = NULL; + switch (f.CRm) { + case AA64_BARRIER_OPT_OSHLD: opt = "oshld"; break; + case AA64_BARRIER_OPT_OSHST: opt = "oshst"; break; + case AA64_BARRIER_OPT_OSH: opt = "osh"; break; + case AA64_BARRIER_OPT_NSHLD: opt = "nshld"; break; + case AA64_BARRIER_OPT_NSHST: opt = "nshst"; break; + case AA64_BARRIER_OPT_NSH: opt = "nsh"; break; + case AA64_BARRIER_OPT_ISHLD: opt = "ishld"; break; + case AA64_BARRIER_OPT_ISHST: opt = "ishst"; break; + case AA64_BARRIER_OPT_ISH: opt = "ish"; break; + case AA64_BARRIER_OPT_LD: opt = (desc && desc->mnemonic && + desc->mnemonic[0] == 'd' && + desc->mnemonic[1] == 'm') + ? 
"ld" + : NULL; break; + case AA64_BARRIER_OPT_ST: opt = (desc && desc->mnemonic && + desc->mnemonic[0] == 'd' && + desc->mnemonic[1] == 'm') + ? "st" + : NULL; break; + case AA64_BARRIER_OPT_SY: opt = "sy"; break; + default: break; + } + strbuf_putc(sb, ' '); + if (opt) { + strbuf_puts(sb, opt); + } else { + strbuf_puts(sb, "#"); + strbuf_put_u64(sb, (u64)f.CRm); + } +} + +void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word, + u64 vaddr) { + switch ((AA64Format)desc->fmt) { + case AA64_FMT_MOVEWIDE: print_movewide(sb, word); break; + case AA64_FMT_LOG_SR: print_logsr(sb, word, desc); break; + case AA64_FMT_ADDSUB_SR: print_addsubsr(sb, word, desc); break; + case AA64_FMT_DP3: print_dp3(sb, word, desc); break; + case AA64_FMT_DP2: print_dp2(sb, word); break; + case AA64_FMT_BR_REG: print_brreg(sb, word, desc); break; + case AA64_FMT_PCREL_ADR: print_pcrel(sb, word, vaddr); break; + case AA64_FMT_ADDSUB_IMM: print_addsubimm(sb, word); break; + case AA64_FMT_LDST_UIMM: print_ldst_uimm(sb, word, desc); break; + case AA64_FMT_LDSTP_PRE: print_ldstp_pre(sb, word); break; + case AA64_FMT_LDSTP_SOFF: print_ldstp_soff(sb, word); break; + case AA64_FMT_LDST_SIMM9: print_ldst_simm9(sb, word, desc); break; + case AA64_FMT_BR_IMM: print_br_imm(sb, word, vaddr); break; + case AA64_FMT_BR_COND: print_br_cond(sb, word, vaddr, desc); break; + case AA64_FMT_CB: print_cb(sb, word, vaddr); break; + case AA64_FMT_EXCEPT: print_except(sb, word); break; + case AA64_FMT_HINT: break; /* no operands for NOP */ + case AA64_FMT_BARRIER: print_barrier(sb, word, desc); break; + } +} + +/* ===================================================================== + * Operand parse — phase-3 wires this up to the asm token stream. Phase + * 2 ships the signature so the assembler bring-up commit doesn't need to + * touch the descriptor table; the body returns 0 for every format until + * the per-format grammar is implemented. 
*/

/* Placeholder operand parser: all three arguments are explicitly ignored
 * and 0 is returned unconditionally. */
int aa64_parse_operands(struct AA64AsmTok* tok, const AA64InsnDesc* desc,
                        void* fields_out) {
  (void)tok;
  (void)desc;
  (void)fields_out;
  return 0;
}
diff --git a/src/arch/aa64_isa.h b/src/arch/aa64/isa.h
diff --git a/src/arch/aa64/link.c b/src/arch/aa64/link.c
@@ -0,0 +1,208 @@
/* AArch64 link-time descriptor.
 *
 * Implements the LinkArchDesc contract from link_arch.h for the
 * aarch64 ELF psABI: PLT0 + per-import PLT entries (lazy-resolve
 * trampolines emitted in canonical form even under DF_1_NOW), and the
 * 12-byte IPLT stub used by ifunc resolvers. All instruction bytes
 * come from the encoders in arch/aa64/isa.h — no raw hex literals
 * here.
 *
 * The byte layout matches the previous inline encodings in
 * link_dyn.c (PLT) and link_layout.c (IPLT) so that switching the
 * linker to descriptor dispatch is a no-op on the output image. */

#include "arch/aa64/isa.h"
#include "core/bytes.h"
#include "core/core.h"
#include "link/link_arch.h"
#include "obj/elf.h"
#include "obj/macho.h"
#include "obj/obj.h"

/* Fixed register assignments mandated by the AArch64 PLT ABI. */
#define AA64_PLT_SCRATCH_X16 16u /* PLT/IPLT scratch (slot address) */
#define AA64_PLT_SCRATCH_X17 17u /* PLT scratch (loaded function ptr) */

/* PLT geometry. Documented in link_arch.h; redeclared here as the
 * descriptor table needs them at file scope. Sizes are in bytes; every
 * instruction word written by the emitters in this file is 4 bytes. */
#define AA64_PLT0_SIZE 32u
#define AA64_PLT_ENTRY_SIZE 16u
#define AA64_IPLT_STUB_SIZE 12u

/* PLT0 references .got.plt[2] (the lazy-resolve hook); the per-import
 * entries start at .got.plt[3]. */
#define AA64_GOTPLT_RESOLVER_INDEX 2u

/* Page mask for ADRP: ADRP encodes (page(target) - page(PC)) >> 12,
 * where page(x) clears the low 12 bits. */
#define AA64_PAGE_MASK ((u64)0xfffu)

/* Compute the (immlo, immhi) ADRP immediate halves for the page-
 * relative displacement from `pc` to `target`.
Both addresses are + * post-shift final image vaddrs; ADRP discards the low 12 bits of + * each before subtracting, so the result is invariant under any + * segment-base shift that moves both endpoints by the same delta. */ +static inline void aa64_adrp_imm_halves(u64 pc, u64 target, u32* immlo, + u32* immhi) { + i64 page_disp = (i64)(target & ~AA64_PAGE_MASK) - (i64)(pc & ~AA64_PAGE_MASK); + i64 imm21 = page_disp >> 12; + *immlo = (u32)(imm21 & 0x3); + *immhi = (u32)((imm21 >> 2) & 0x7ffff); +} + +/* Emit one ADRP+LDR+ADD+BR sequence that materializes `slot_vaddr` + * (a .got.plt entry) into x16, loads the resolved function pointer + * into x17, and tail-calls it. Used by both PLT0 (after its STP) and + * each per-import entry — the only thing that varies is `pc`, which + * starts at the ADRP itself. */ +static void aa64_emit_adrp_load_br(u8* dst, u64 pc, u64 slot_vaddr) { + u32 immlo, immhi; + aa64_adrp_imm_halves(pc, slot_vaddr, &immlo, &immhi); + u32 lo12 = (u32)(slot_vaddr & AA64_PAGE_MASK); + /* LDR Xt encodes the byte offset divided by 8. .got.plt slots are + * 8-byte aligned so the low 3 bits of lo12 are always 0. */ + u32 ldr_imm12 = (lo12 >> 3) & 0xfffu; + + wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi)); + wr_u32_le(dst + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X17, + AA64_PLT_SCRATCH_X16, ldr_imm12)); + wr_u32_le(dst + 8, aa64_add_imm(/*sf=*/1, AA64_PLT_SCRATCH_X16, + AA64_PLT_SCRATCH_X16, lo12, /*sh=*/0)); + wr_u32_le(dst + 12, aa64_br(AA64_PLT_SCRATCH_X17)); +} + +static void aa64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) { + /* PLT0: + * stp x16, x30, [sp, #-16]! + * adrp x16, page(.got.plt[2]) + * ldr x17, [x16, #lo12(.got.plt[2])] + * add x16, x16, #lo12(.got.plt[2]) + * br x17 + * nop ; nop ; nop + * + * Under DF_1_NOW the loader patches every .got.plt slot from + * .rela.plt before running PLT0, so this trampoline never executes. 
* It is still emitted in canonical form so disassemblers and
   * unwinders see the layout the psABI specifies. */
  u64 slot2 = gotplt_vaddr + 8u * AA64_GOTPLT_RESOLVER_INDEX;
  /* The ADRP sits at plt0+4 (one instruction past the leading STP). */
  u64 adrp_pc = plt0_vaddr + 4u;

  /* `stp x16, x30, [sp, #-16]!` — pre-indexed pair store with imm7
   * scaled by 8, so the encoded field is -16/8 = -2. */
  wr_u32_le(dst + 0, aa64_stp64_pre(AA64_PLT_SCRATCH_X16, AA64_LR, AA64_SP,
                                    /*imm7_scaled=*/-2));
  aa64_emit_adrp_load_br(dst + 4, adrp_pc, slot2);
  /* Bytes 20..31: three NOPs pad the entry out to 32 bytes. */
  wr_u32_le(dst + 20, aa64_nop());
  wr_u32_le(dst + 24, aa64_nop());
  wr_u32_le(dst + 28, aa64_nop());
}

/* Emit one per-import PLT entry at `dst`; `entry_vaddr` is the address of
 * the entry's first instruction and `slot_vaddr` its .got.plt slot. */
static void aa64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) {
  /* Per-import 16-byte entry: ADRP+LDR+ADD+BR where ADRP's PC is the
   * entry's first instruction (no leading STP here — the resolved
   * function returns to the original caller, not into PLT0). */
  aa64_emit_adrp_load_br(dst, entry_vaddr, slot_vaddr);
}

/* Emit the 12-byte IPLT stub at `dst` and describe in `out` the two
 * relocations the caller must apply to it. Returns the number of `out`
 * entries filled in (always 2). */
static u32 aa64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr,
                               LinkArchIPltReloc out[2]) {
  /* IPLT stub: ADRP x16, %page(slot) ; LDR x16, [x16, :lo12:slot] ;
   * BR x16.
   *
   * We deliberately emit the two address-bearing instructions with
   * zero immediates: the linker enqueues an ADR_PREL_PG_HI21 reloc on
   * the ADRP and an LDST64_ABS_LO12_NC reloc on the LDR, both
   * targeting the slot's synthetic local symbol. Reloc-apply runs
   * after final vaddr assignment, which is the only point at which
   * both endpoints' page-relative displacement is known.
*/ + (void)stub_vaddr; + (void)slot_vaddr; + + wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, /*immlo=*/0, + /*immhi=*/0)); + wr_u32_le(dst + 4, + aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, AA64_PLT_SCRATCH_X16, + /*imm12_scaled=*/0)); + wr_u32_le(dst + 8, aa64_br(AA64_PLT_SCRATCH_X16)); + + out[0].offset_in_stub = 0; + out[0].width = 4; + out[0].kind = R_AARCH64_ADR_PREL_PG_HI21; + out[1].offset_in_stub = 4; + out[1].width = 4; + out[1].kind = R_AARCH64_LDST64_ABS_LO12_NC; + return 2; +} + +static void aa64_emit_macho_stub(u8* out, u64 stub_vaddr, u64 got_slot_vaddr) { + i64 page_s = ((i64)got_slot_vaddr) & ~(i64)0xfff; + i64 page_p = ((i64)stub_vaddr) & ~(i64)0xfff; + i64 imm21 = (page_s - page_p) >> 12; + u32 immlo = (u32)(imm21 & 0x3u); + u32 immhi = (u32)((imm21 >> 2) & 0x7ffffu); + u32 lo12 = (u32)(got_slot_vaddr & 0xfffu); + u32 imm12_ldr = (lo12 >> 3) & 0xfffu; + + wr_u32_le(out + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi)); + wr_u32_le(out + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, + AA64_PLT_SCRATCH_X16, imm12_ldr)); + wr_u32_le(out + 8, aa64_br(AA64_PLT_SCRATCH_X16)); +} + +static int aa64_is_branch_reloc(RelocKind kind) { + return kind == R_AARCH64_CALL26 || kind == R_AARCH64_JUMP26; +} + +static int aa64_is_got_load_reloc(RelocKind kind) { + return kind == R_AARCH64_ADR_GOT_PAGE || kind == R_AARCH64_LD64_GOT_LO12_NC; +} + +static int aa64_is_tlvp_reloc(RelocKind kind) { + return kind == R_AARCH64_TLVP_LOAD_PAGE21 || + kind == R_AARCH64_TLVP_LOAD_PAGEOFF12; +} + +static int aa64_is_direct_page_reloc(RelocKind kind) { + switch (kind) { + case R_AARCH64_ADR_PREL_PG_HI21: + case R_AARCH64_ADR_PREL_PG_HI21_NC: + case R_AARCH64_ADD_ABS_LO12_NC: + case R_AARCH64_LDST8_ABS_LO12_NC: + case R_AARCH64_LDST16_ABS_LO12_NC: + case R_AARCH64_LDST32_ABS_LO12_NC: + case R_AARCH64_LDST64_ABS_LO12_NC: + case R_AARCH64_LDST128_ABS_LO12_NC: + return 1; + default: + return 0; + } +} + +const LinkArchDesc link_arch_aa64 = { + .e_machine = EM_AARCH64, + 
.default_musl_interp = "/lib/ld-musl-aarch64.so.1", + + .elf_r_relative = ELF_R_AARCH64_RELATIVE, + .elf_r_glob_dat = ELF_R_AARCH64_GLOB_DAT, + .elf_r_jump_slot = ELF_R_AARCH64_JUMP_SLOT, + + .macho_cputype = CPU_TYPE_ARM64, + .macho_cpusubtype = CPU_SUBTYPE_ARM64_ALL, + + .plt0_size = AA64_PLT0_SIZE, + .plt_entry_size = AA64_PLT_ENTRY_SIZE, + .iplt_stub_size = AA64_IPLT_STUB_SIZE, + + .emit_plt0 = aa64_emit_plt0, + .emit_plt_entry = aa64_emit_plt_entry, + .emit_iplt_stub = aa64_emit_iplt_stub, + .macho_stub_size = AA64_IPLT_STUB_SIZE, + .emit_macho_stub = aa64_emit_macho_stub, + + .is_branch_reloc = aa64_is_branch_reloc, + .is_got_load_reloc = aa64_is_got_load_reloc, + .is_tlvp_reloc = aa64_is_tlvp_reloc, + .is_direct_page_reloc = aa64_is_direct_page_reloc, + .needs_jit_call_stub = aa64_is_branch_reloc, +}; diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c @@ -0,0 +1,1925 @@ +/* aarch64/ops.c — data movement, arithmetic, calls, varargs, atomics, + * intrinsics, asm_block, set_loc, finalize/destroy, vtable constructor. */ + +#include "arch/aa64/internal.h" + +/* ============================================================ + * Data movement + * ============================================================ */ + +static void aa_load_imm(CGTarget* t, Operand dst, i64 imm) { + u32 sf = type_is_64(dst.type) ? 1u : 0u; + aa64_emit_load_imm(t->mc, sf, reg_num(dst), imm); +} + +static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) { + AAImpl* a = impl_of(t); + if (dst.cls != RC_FP) { + compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1"); + } + + Sym ro_name = pool_intern_cstr(t->c->global, ".rodata"); + ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, 1u); + + u32 cur_section = t->mc->section_id; + t->mc->set_section(t->mc, ro); + u32 ro_off = obj_align_to(t->obj, ro, cb.align ? 
cb.align : 4); + t->mc->emit_bytes(t->mc, cb.bytes, cb.size); + + char namebuf[64]; + static u32 lit_seq = 0; + int len = 0; + { + const char* prefix = ".LCFP"; + for (; prefix[len]; ++len) namebuf[len] = prefix[len]; + u32 v = lit_seq++; + char tmp[16]; + int tn = 0; + if (v == 0) + tmp[tn++] = '0'; + else { + while (v) { + tmp[tn++] = '0' + (char)(v % 10); + v /= 10; + } + } + for (int i = tn - 1; i >= 0; --i) namebuf[len++] = tmp[i]; + namebuf[len] = 0; + } + Sym sname = pool_intern_cstr(t->c->global, namebuf); + ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro, (u64)ro_off, + (u64)cb.size); + + t->mc->set_section(t->mc, cur_section); + + u32 adrp_pos = t->mc->pos(t->mc); + aa64_emit32(t->mc, aa64_adrp_base(AA_TMP0)); + t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, + sym, 0, 0, 0); + + u32 ldr_pos = t->mc->pos(t->mc); + u32 sidx = (cb.size == 8) ? 3u : 2u; + aa64_emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0)); + RelocKind lo12 = (cb.size == 8) ? R_AARCH64_LDST64_ABS_LO12_NC + : R_AARCH64_LDST32_ABS_LO12_NC; + t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0); +} + +static void aa_copy(CGTarget* t, Operand dst, Operand src) { + if (dst.cls == RC_FP || src.cls == RC_FP) { + u32 type = type_is_fp_double(dst.type) ? 1u : 0u; + aa64_emit32(t->mc, aa64_fmov_reg(type, reg_num(dst), reg_num(src))); + return; + } + u32 sf = type_is_64(dst.type) ? 
1u : 0u; + aa64_emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src))); +} + +/* ============================================================ + * Load / store + * ============================================================ */ + +static RelocKind ldst_lo12_reloc_for(u32 nbytes) { + switch (nbytes) { + case 1: return R_AARCH64_LDST8_ABS_LO12_NC; + case 2: return R_AARCH64_LDST16_ABS_LO12_NC; + case 4: return R_AARCH64_LDST32_ABS_LO12_NC; + case 8: return R_AARCH64_LDST64_ABS_LO12_NC; + default: return R_AARCH64_LDST64_ABS_LO12_NC; + } +} + +static int use_got_for_sym(CGTarget* t, ObjSymId sym) { + return obj_symbol_extern_via_got(t->c, t->obj, sym); +} + +static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg) { + AAImpl* a = impl_of(t); + if (addr.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_base: bad slot"); + i32 off = -(i32)s->off; + if (off >= -256 && off <= 255) { + *out_off = off; + return 29; + } + aa64_emit_addr_adjust(t->mc, tmp_reg, 29, off); + *out_off = 0; + return tmp_reg; + } + if (addr.kind == OPK_INDIRECT) { + i32 off = addr.v.ind.ofs; + u32 base = addr.v.ind.base & 0x1f; + if (off >= -256 && off <= 255) { + *out_off = off; + return base; + } + aa64_emit_addr_adjust(t->mc, tmp_reg, base, off); + *out_off = 0; + return tmp_reg; + } + if (addr.kind == OPK_GLOBAL) { + emit_global_addr(t, tmp_reg, addr.v.global.sym, addr.v.global.addend); + *out_off = 0; + return tmp_reg; + } + compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d", + (int)addr.kind); +} + +void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) { + u32 sz = ma.size ? 
ma.size : type_byte_size(addr.type);
  u32 sidx = size_idx_for_bytes(sz);

  if (addr.kind == OPK_GLOBAL) {
    MCEmitter* mc = t->mc;
    u32 sec = mc->section_id;
    ObjSymId sym = addr.v.global.sym;
    i64 add = addr.v.global.addend;
    if (use_got_for_sym(t, sym)) {
      /* GOT path: fetch the symbol's address into AA_TMP0, then load
       * through it with the addend as the instruction's offset.
       * NOTE(review): the addend is narrowed to i32 and passed to the
       * same load form addr_base restricts to [-256, 255] — confirm large
       * addends cannot reach here. */
      aa64_emit_got_load_addr(t, AA_TMP0, sym);
      if (dst.cls == RC_FP) {
        aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP0, (i32)add));
      } else {
        aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP0, (i32)add));
      }
      return;
    }
    /* Direct path: ADRP + load, both emitted with zero immediates and
     * fixed up by relocations that carry the symbol and addend. */
    u32 adrp_pos = mc->pos(mc);
    aa64_emit32(mc, aa64_adrp_base(AA_TMP0));
    mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add,
                      0, 0);
    u32 ld_pos = mc->pos(mc);
    if (dst.cls == RC_FP) {
      aa64_emit32(mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0));
    } else {
      aa64_emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), AA_TMP0, 0));
    }
    mc->emit_reloc_at(mc, sec, ld_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0);
    return;
  }

  /* Everything else: reduce to base+offset (AA_TMP0 as address scratch). */
  i32 off;
  u32 base = addr_base(t, addr, &off, AA_TMP0);
  if (dst.cls == RC_FP) {
    aa64_emit32(t->mc, aa64_ldur_fp(sidx, reg_num(dst), base, off));
  } else {
    aa64_emit32(t->mc, aa64_ldur(sidx, reg_num(dst), base, off));
  }
}

/* Store `src` to the location described by `addr`. The access size is
 * ma.size, or the byte size of addr's type when ma.size is 0. An immediate
 * source is first materialized into AA_TMP0, in which case AA_TMP1 serves
 * as the address scratch instead. */
void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) {
  u32 sz = ma.size ? ma.size : type_byte_size(addr.type);
  u32 sidx = size_idx_for_bytes(sz);

  if (addr.kind == OPK_GLOBAL) {
    MCEmitter* mc = t->mc;
    u32 sec = mc->section_id;
    ObjSymId sym = addr.v.global.sym;
    i64 add = addr.v.global.addend;

    u32 src_reg;
    u32 src_is_fp = 0;
    if (src.kind == OPK_IMM) {
      u32 sf = (sz == 8) ? 1u : 0u;
      aa64_emit_load_imm(mc, sf, AA_TMP0, src.v.imm);
      src_reg = AA_TMP0;
    } else if (src.cls == RC_FP) {
      src_reg = reg_num(src);
      src_is_fp = 1;
    } else {
      src_reg = reg_num(src);
    }
    u32 base = (src.kind == OPK_IMM) ?
AA_TMP1 : AA_TMP0;
    if (use_got_for_sym(t, sym)) {
      /* GOT path: symbol address into `base`, addend as the store offset. */
      aa64_emit_got_load_addr(t, base, sym);
      if (src_is_fp) {
        aa64_emit32(mc, aa64_stur_fp(sidx, src_reg, base, (i32)add));
      } else {
        aa64_emit32(mc, aa64_stur(sidx, src_reg, base, (i32)add));
      }
      return;
    }
    /* Direct path: ADRP + store with zero immediates, fixed up by relocs. */
    u32 adrp_pos = mc->pos(mc);
    aa64_emit32(mc, aa64_adrp_base(base));
    mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add,
                      0, 0);
    u32 st_pos = mc->pos(mc);
    if (src_is_fp) {
      aa64_emit32(mc, aa64_str_fp_uimm(sidx, src_reg, base, 0));
    } else {
      aa64_emit32(mc, aa64_str_uimm(sidx, src_reg, base, 0));
    }
    mc->emit_reloc_at(mc, sec, st_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0);
    return;
  }

  /* Non-global address: keep AA_TMP0 free for an immediate source by
   * using AA_TMP1 as the address scratch in that case. */
  i32 off;
  u32 addr_tmp = (src.kind == OPK_IMM) ? AA_TMP1 : AA_TMP0;
  u32 base = addr_base(t, addr, &off, addr_tmp);

  if (src.kind == OPK_IMM) {
    u32 sf = (sz == 8) ? 1u : 0u;
    aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm);
    aa64_emit32(t->mc, aa64_stur(sidx, AA_TMP0, base, off));
    return;
  }
  if (src.cls == RC_FP) {
    aa64_emit32(t->mc, aa64_stur_fp(sidx, reg_num(src), base, off));
  } else {
    aa64_emit32(t->mc, aa64_stur(sidx, reg_num(src), base, off));
  }
}

/* Compute the address of lvalue `lv` into dst's register. Handles frame
 * slots, register+offset, and globals; panics for any other kind. */
static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) {
  AAImpl* a = impl_of(t);
  if (lv.kind == OPK_LOCAL) {
    AASlot* s = aa64_slot_get(a, lv.v.frame_slot);
    if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot");
    /* Frame slots are addressed as x29 - s->off. */
    aa64_emit32(t->mc, aa64_sub_imm(1, reg_num(dst), 29, s->off, 0));
    return;
  }
  if (lv.kind == OPK_INDIRECT) {
    i32 ofs = lv.v.ind.ofs;
    u32 base = lv.v.ind.base & 0x1f;
    /* Only offsets whose magnitude is at most 0xfff are accepted here. */
    if (ofs == 0) {
      aa64_emit32(t->mc, aa64_mov_reg(1, reg_num(dst), base));
    } else if (ofs > 0 && ofs <= 0xfff) {
      aa64_emit32(t->mc, aa64_add_imm(1, reg_num(dst), base, (u32)ofs, 0));
    } else if (ofs < 0 && -ofs <= 0xfff) {
      aa64_emit32(t->mc, aa64_sub_imm(1, reg_num(dst), base, (u32)(-ofs), 0));
    } else {
      compiler_panic(t->c, a->loc,
                     "aarch64 addr_of: indirect offset %d unsupported",
ofs);
    }
    return;
  }
  if (lv.kind == OPK_GLOBAL) {
    u32 rd = reg_num(dst);
    ObjSymId sym = lv.v.global.sym;
    i64 addend = lv.v.global.addend;
    if (use_got_for_sym(t, sym)) {
      /* GOT path: load the symbol's address, then add the addend. */
      aa64_emit_got_load_addr(t, rd, sym);
      if (addend) aa64_emit_addr_adjust(t->mc, rd, rd, (i32)addend);
      return;
    }
    /* Direct path: ADRP + ADD with zero immediates; the relocations carry
     * both the symbol and the addend. */
    u32 sec = t->mc->section_id;
    u32 adrp_pos = t->mc->pos(t->mc);
    aa64_emit32(t->mc, aa64_adrp_base(rd));
    t->mc->emit_reloc_at(t->mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym,
                         addend, 0, 0);
    u32 add_pos = t->mc->pos(t->mc);
    aa64_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0));
    t->mc->emit_reloc_at(t->mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym,
                         addend, 0, 0);
    return;
  }
  compiler_panic(t->c, impl_of(t)->loc, "aarch64: addr_of not implemented");
}

/* Compute the address of thread-local `sym` (+ addend) into dst's register.
 * Uses the descriptor/thunk sequence when the object format asks for it,
 * otherwise a TPIDR_EL0-relative TLSLE ADD pair. */
static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) {
  MCEmitter* mc = t->mc;
  u32 sec = mc->section_id;
  u32 rd = reg_num(dst);

  if (obj_format_tls_via_descriptor(t->c)) {
    /* TLV access via per-variable descriptor (Mach-O TLVP). The thunk's
     * ABI is custom — x0 in/out as descriptor → TLV addr, all other
     * regs preserved — so we materialize via x0 and copy to `dst` only
     * when they differ. x0/x1 are scratch here (the regalloc only hands
     * out x19-x28), and x30 was saved at the prologue.
     *
     *   adrp x0, sym@TLVPPAGE           ; R_AARCH64_TLVP_LOAD_PAGE21
     *   ldr  x0, [x0, sym@TLVPPAGEOFF]  ; R_AARCH64_TLVP_LOAD_PAGEOFF12
     *   ldr  x1, [x0]                   ; descriptor[0] = thunk pointer
     *   blr  x1                         ; x0 in/out
     *   mov  xdst, x0                   ; only if dst != x0
     *
     * TLVP relocs do not carry an addend; nonzero addends are applied
     * after the call as a follow-on ADD/SUB on `dst`.
*/ + u32 adrp_pos = mc->pos(mc); + aa64_emit32(mc, aa64_adrp_base(/*Rd=*/0)); + mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_TLVP_LOAD_PAGE21, sym, 0, 0, + 0); + u32 ldr_pos = mc->pos(mc); + aa64_emit32(mc, + aa64_ldr_uimm(/*size=*/3, /*Rt=*/0, /*Rn=*/0, /*byte_off=*/0)); + mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_TLVP_LOAD_PAGEOFF12, sym, 0, + 0, 0); + aa64_emit32(mc, + aa64_ldr_uimm(/*size=*/3, /*Rt=*/1, /*Rn=*/0, /*byte_off=*/0)); + aa64_emit32(mc, aa64_blr(/*Rn=*/1)); + if (rd != 0) aa64_emit32(mc, aa64_mov_reg(/*sf=*/1, rd, /*Rm=*/0)); + if (addend) aa64_emit_addr_adjust(mc, rd, rd, (i32)addend); + return; + } + + aa64_emit32(mc, aa64_mrs_tpidr_el0(AA_TMP0)); + + u32 hi_pos = mc->pos(mc); + aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, AA_TMP0, /*imm12=*/0, /*sh=*/1)); + mc->emit_reloc_at(mc, sec, hi_pos, R_AARCH64_TLSLE_ADD_TPREL_HI12, sym, + addend, 0, 0); + + u32 lo_pos = mc->pos(mc); + aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/rd, /*imm12=*/0, /*sh=*/0)); + mc->emit_reloc_at(mc, sec, lo_pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, sym, + addend, 0, 0); +} + +/* ============================================================ + * Aggregate helpers + * ============================================================ */ + +static u32 agg_addr_reg(CGTarget* t, Operand op, u32 scratch) { + if (op.kind == OPK_REG) return reg_num(op); + if (op.kind == OPK_LOCAL) { + AAImpl* a = impl_of(t); + AASlot* s = aa64_slot_get(a, op.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 agg: bad slot"); + aa64_emit32(t->mc, aa64_sub_imm(1, scratch, 29, s->off, 0)); + return scratch; + } + compiler_panic(t->c, impl_of(t)->loc, + "aarch64 agg: address kind %d unsupported", (int)op.kind); +} + +static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr, + AggregateAccess agg) { + MCEmitter* mc = t->mc; + u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0); + u32 sr = agg_addr_reg(t, src_addr, + (dr == AA_TMP1) ? 
AA_TMP2 : AA_TMP1); + u32 nbytes = agg.size; + u32 i = 0; + while (i + 8 <= nbytes) { + aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); + i += 8; + } + while (i + 4 <= nbytes) { + aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); + i += 4; + } + while (i + 2 <= nbytes) { + aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); + i += 2; + } + while (i < nbytes) { + aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); + i += 1; + } +} + +static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value, + AggregateAccess agg) { + MCEmitter* mc = t->mc; + u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0); + + u32 byte; + if (byte_value.kind == OPK_IMM) { + byte = (u32)(byte_value.v.imm & 0xffu); + } else { + compiler_panic(t->c, impl_of(t)->loc, + "aarch64 set_bytes: REG byte not yet supported"); + } + u32 nbytes = agg.size; + + if (byte == 0) { + u32 i = 0; + while (i + 8 <= nbytes) { + aa64_emit32(mc, aa64_stur(3, 31, dr, (i32)i)); + i += 8; + } + while (i + 4 <= nbytes) { + aa64_emit32(mc, aa64_stur(2, 31, dr, (i32)i)); + i += 4; + } + while (i + 2 <= nbytes) { + aa64_emit32(mc, aa64_stur(1, 31, dr, (i32)i)); + i += 2; + } + while (i < nbytes) { + aa64_emit32(mc, aa64_stur(0, 31, dr, (i32)i)); + i += 1; + } + return; + } + + u64 b64 = byte; + b64 |= b64 << 8; + b64 |= b64 << 16; + b64 |= b64 << 32; + aa64_emit_load_imm(mc, /*sf=*/1u, AA_TMP1, (i64)b64); + + u32 i = 0; + while (i + 8 <= nbytes) { + aa64_emit32(mc, aa64_stur(3, AA_TMP1, dr, (i32)i)); + i += 8; + } + while (i + 4 <= nbytes) { + aa64_emit32(mc, aa64_stur(2, AA_TMP1, dr, (i32)i)); + i += 4; + } + while (i + 2 <= nbytes) { + aa64_emit32(mc, aa64_stur(1, AA_TMP1, dr, (i32)i)); + i += 2; + } + while (i < nbytes) { + aa64_emit32(mc, aa64_stur(0, AA_TMP1, dr, (i32)i)); + i += 1; + } +} + 
+/* ============================================================ + * Bitfields + * ============================================================ */ + +static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr, + BitFieldAccess bf) { + MCEmitter* mc = t->mc; + u32 base = agg_addr_reg(t, record_addr, AA_TMP0); + u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; + u32 sf = (storage_bytes == 8u) ? 1u : 0u; + u32 sidx = size_idx_for_bytes(storage_bytes); + u32 rd = reg_num(dst); + + aa64_emit32(mc, aa64_ldur(sidx, rd, base, (i32)bf.storage_offset)); + u32 lsb = bf.bit_offset; + u32 width = bf.bit_width ? bf.bit_width : 1u; + u32 imms = lsb + width - 1u; + if (bf.signed_) { + aa64_emit32(mc, aa64_sbfm(sf, rd, rd, lsb, imms)); + } else { + aa64_emit32(mc, aa64_ubfm(sf, rd, rd, lsb, imms)); + } +} + +static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src, + BitFieldAccess bf) { + MCEmitter* mc = t->mc; + u32 base = agg_addr_reg(t, record_addr, AA_TMP0); + u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; + u32 sf = (storage_bytes == 8u) ? 1u : 0u; + u32 sidx = size_idx_for_bytes(storage_bytes); + + aa64_emit32(mc, aa64_ldur(sidx, AA_TMP1, base, (i32)bf.storage_offset)); + + u32 src_reg; + if (src.kind == OPK_IMM) { + aa64_emit_load_imm(mc, sf, AA_TMP2, src.v.imm); + src_reg = AA_TMP2; + } else if (src.kind == OPK_REG) { + src_reg = reg_num(src); + } else { + compiler_panic(t->c, impl_of(t)->loc, + "aarch64 bitfield_store: src kind %d unsupported", + (int)src.kind); + } + + u32 reg_size = sf ? 64u : 32u; + u32 lsb = bf.bit_offset; + u32 width = bf.bit_width ? 
bf.bit_width : 1u; + u32 immr = (reg_size - lsb) % reg_size; + u32 imms = width - 1u; + aa64_emit32(mc, aa64_bfm(sf, AA_TMP1, src_reg, immr, imms)); + + aa64_emit32(mc, aa64_stur(sidx, AA_TMP1, base, (i32)bf.storage_offset)); +} + +/* ============================================================ + * Arithmetic helpers + * ============================================================ */ + +u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch) { + if (op.kind == OPK_REG) return reg_num(op); + if (op.kind == OPK_IMM) { + aa64_emit_load_imm(t->mc, sf, scratch, op.v.imm); + return scratch; + } + compiler_panic(t->c, impl_of(t)->loc, + "aarch64 binop: operand kind %d unsupported", (int)op.kind); +} + +static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, + Operand b_op) { + MCEmitter* mc = t->mc; + + if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) { + if (a_op.kind != OPK_REG || b_op.kind != OPK_REG || dst.cls != RC_FP) { + compiler_panic(t->c, impl_of(t)->loc, + "aarch64 binop: FP op requires REG operands"); + } + u32 type = type_is_fp_double(dst.type) ? 1u : 0u; + u32 rd = reg_num(dst); + u32 rn = reg_num(a_op); + u32 rm = reg_num(b_op); + u32 w; + switch (op) { + case BO_FADD: w = aa64_fadd(type, rd, rn, rm); break; + case BO_FSUB: w = aa64_fsub(type, rd, rn, rm); break; + case BO_FMUL: w = aa64_fmul(type, rd, rn, rm); break; + case BO_FDIV: w = aa64_fdiv(type, rd, rn, rm); break; + default: w = 0; break; + } + aa64_emit32(mc, w); + return; + } + + u32 sf = type_is_64(dst.type) ? 
1u : 0u;
  u32 rd = reg_num(dst);

  /* For the commutative ops, move a lone immediate to the right-hand side
   * so the immediate fast path below can see it. */
  switch (op) {
    case BO_IADD:
    case BO_AND:
    case BO_OR:
    case BO_XOR: {
      if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) {
        Operand t_op = a_op; a_op = b_op; b_op = t_op;
      }
      break;
    }
    default: break;
  }

  /* Immediate fast path: try the immediate-form encoding for the op. When
   * the value does not fit (the *_fits / *_encode / *_fields helper
   * returns 0) fall through to the register form below.
   * NOTE(review): a_op is passed to reg_num without checking that its
   * kind is OPK_REG — confirm callers never pass other kinds here. */
  if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) {
    u32 rn_reg = reg_num(a_op);
    i64 imm = b_op.v.imm;
    u32 imm12, sh, N, immr, imms;
    switch (op) {
      case BO_IADD:
        if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
          aa64_emit32(mc, aa64_add_imm(sf, rd, rn_reg, imm12, sh));
          return;
        }
        break;
      case BO_ISUB:
        if (aa64_addsub_imm_fits(imm, &imm12, &sh)) {
          aa64_emit32(mc, aa64_sub_imm(sf, rd, rn_reg, imm12, sh));
          return;
        }
        break;
      case BO_AND:
        if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
          aa64_emit32(mc, aa64_and_imm(sf, rd, rn_reg, N, immr, imms));
          return;
        }
        break;
      case BO_OR:
        if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
          aa64_emit32(mc, aa64_orr_imm(sf, rd, rn_reg, N, immr, imms));
          return;
        }
        break;
      case BO_XOR:
        if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) {
          aa64_emit32(mc, aa64_eor_imm(sf, rd, rn_reg, N, immr, imms));
          return;
        }
        break;
      /* Shift amounts are reduced modulo the register width first. */
      case BO_SHL: {
        u32 width = sf ? 64u : 32u;
        u32 sh_amt = (u32)((u64)imm & (width - 1u));
        if (aa64_lsl_imm_fields(sh_amt, sf, &immr, &imms)) {
          aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
          return;
        }
        break;
      }
      case BO_SHR_U: {
        u32 width = sf ? 64u : 32u;
        u32 sh_amt = (u32)((u64)imm & (width - 1u));
        if (aa64_lsr_imm_fields(sh_amt, sf, &immr, &imms)) {
          aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms));
          return;
        }
        break;
      }
      case BO_SHR_S: {
        u32 width = sf ? 64u : 32u;
        u32 sh_amt = (u32)((u64)imm & (width - 1u));
        if (aa64_asr_imm_fields(sh_amt, sf, &immr, &imms)) {
          aa64_emit32(mc, aa64_sbfm(sf, rd, rn_reg, immr, imms));
          return;
        }
        break;
      }
      default: break;
    }
  }

  /* Register form: force both operands into registers. The second scratch
   * is AA_TMP1 only when the first operand already took AA_TMP0. */
  u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
  u32 rm =
      aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0);

  u32 word;
  switch (op) {
    case BO_IADD: word = aa64_add(sf, rd, rn, rm); break;
    case BO_ISUB: word = aa64_sub(sf, rd, rn, rm); break;
    case BO_IMUL: word = aa64_mul(sf, rd, rn, rm); break;
    case BO_AND: word = aa64_and(sf, rd, rn, rm); break;
    case BO_OR: word = aa64_orr(sf, rd, rn, rm); break;
    case BO_XOR: word = aa64_eor(sf, rd, rn, rm); break;
    case BO_SHL: word = aa64_lslv(sf, rd, rn, rm); break;
    case BO_SHR_U: word = aa64_lsrv(sf, rd, rn, rm); break;
    case BO_SHR_S: word = aa64_asrv(sf, rd, rn, rm); break;
    case BO_UDIV: word = aa64_udiv(sf, rd, rn, rm); break;
    case BO_SDIV: word = aa64_sdiv(sf, rd, rn, rm); break;
    /* Remainders take two instructions: the quotient goes to AA_TMP2, then
     * MSUB combines quotient, divisor and dividend into rd. */
    case BO_SREM:
      aa64_emit32(mc, aa64_sdiv(sf, AA_TMP2, rn, rm));
      word = aa64_msub(sf, rd, AA_TMP2, rm, rn);
      break;
    case BO_UREM:
      aa64_emit32(mc, aa64_udiv(sf, AA_TMP2, rn, rm));
      word = aa64_msub(sf, rd, AA_TMP2, rm, rn);
      break;
    /* FP ops were handled by the early return at the top of the function. */
    case BO_FADD:
    case BO_FSUB:
    case BO_FMUL:
    case BO_FDIV:
    default:
      compiler_panic(t->c, impl_of(t)->loc, "aarch64 binop: op %d unimpl",
                     (int)op);
  }
  aa64_emit32(mc, word);
}

/* Emit the integer unary operation `op` on a_op into dst's register. An
 * immediate operand is first loaded into AA_TMP0. */
static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) {
  MCEmitter* mc = t->mc;
  u32 sf = type_is_64(dst.type) ?
1u : 0u;
  u32 rd = reg_num(dst);
  u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0);
  u32 word;

  switch (op) {
    case UO_NEG:
      word = aa64_neg(sf, rd, rn);
      break;
    case UO_BNOT:
      word = aa64_mvn(sf, rd, rn);
      break;
    case UO_NOT:
      /* Logical not: subtract 0 from rn with register 31 as destination,
       * then set rd from the EQ condition. */
      aa64_emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0));
      word = aa64_cset_eq(sf, rd);
      break;
    default:
      compiler_panic(t->c, impl_of(t)->loc, "aarch64 unop: op %d unimpl",
                     (int)op);
  }
  aa64_emit32(mc, word);
}

/* Emit the conversion `k` from src's register into dst's register. Several
 * kinds check the operand register classes and panic on a mismatch;
 * unknown kinds panic as well. */
static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) {
  AAImpl* a = impl_of(t);
  MCEmitter* mc = t->mc;
  u32 rd = reg_num(dst);
  u32 rn = reg_num(src);

  switch (k) {
    case CV_SEXT: {
      if (src.cls != RC_INT || dst.cls != RC_INT) {
        compiler_panic(t->c, a->loc, "aarch64 convert SEXT: bad classes");
      }
      /* Sign-extend from the source width, at the destination's width. */
      u32 src_bits = type_byte_size(src.type) * 8u;
      u32 sf_dst = type_is_64(dst.type) ? 1u : 0u;
      aa64_emit32(mc, aa64_sbfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
      return;
    }
    case CV_ZEXT: {
      if (src.cls != RC_INT || dst.cls != RC_INT) {
        compiler_panic(t->c, a->loc, "aarch64 convert ZEXT: bad classes");
      }
      /* Both branches use the 32-bit (sf = 0) form regardless of the
       * destination type. */
      u32 src_bits = type_byte_size(src.type) * 8u;
      if (src_bits == 32u) {
        aa64_emit32(mc, aa64_mov_reg(0, rd, rn));
      } else {
        aa64_emit32(mc, aa64_ubfm(0, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
      }
      return;
    }
    case CV_TRUNC: {
      /* Always a 32-bit register move, whatever the destination width. */
      aa64_emit32(mc, aa64_mov_reg(0, rd, rn));
      return;
    }
    case CV_ITOF_S: {
      u32 sf_src = type_is_64(src.type) ? 1u : 0u;
      u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
      aa64_emit32(mc, aa64_scvtf(sf_src, type, rd, rn));
      return;
    }
    case CV_ITOF_U: {
      u32 sf_src = type_is_64(src.type) ? 1u : 0u;
      u32 type = type_is_fp_double(dst.type) ? 1u : 0u;
      aa64_emit32(mc, aa64_ucvtf(sf_src, type, rd, rn));
      return;
    }
    case CV_FTOI_S: {
      if (src.cls != RC_FP || dst.cls != RC_INT) {
        compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes");
      }
      u32 sf = type_is_64(dst.type) ?
1u : 0u;
      u32 type = type_is_fp_double(src.type) ? 1u : 0u;
      aa64_emit32(mc, aa64_fcvtzs(sf, type, rd, rn));
      return;
    }
    case CV_FTOI_U: {
      if (src.cls != RC_FP || dst.cls != RC_INT) {
        compiler_panic(t->c, a->loc, "aarch64 convert FTOI_U: bad classes");
      }
      u32 sf = type_is_64(dst.type) ? 1u : 0u;
      u32 type = type_is_fp_double(src.type) ? 1u : 0u;
      aa64_emit32(mc, aa64_fcvtzu(sf, type, rd, rn));
      return;
    }
    case CV_FEXT: {
      aa64_emit32(mc, aa64_fcvt_d_s(rd, rn));
      return;
    }
    case CV_FTRUNC: {
      aa64_emit32(mc, aa64_fcvt_s_d(rd, rn));
      return;
    }
    case CV_BITCAST: {
      /* Only cross-class bitcasts are handled. The FP-side type's byte
       * size (8 or otherwise) picks between the two FMOV variants. */
      if (src.cls == RC_INT && dst.cls == RC_FP) {
        u32 sz = type_byte_size(dst.type);
        aa64_emit32(mc, sz == 8 ? aa64_fmov_d_x(rd, rn) : aa64_fmov_s_w(rd, rn));
      } else if (src.cls == RC_FP && dst.cls == RC_INT) {
        u32 sz = type_byte_size(src.type);
        aa64_emit32(mc, sz == 8 ? aa64_fmov_x_d(rd, rn) : aa64_fmov_w_s(rd, rn));
      } else {
        compiler_panic(t->c, a->loc,
                       "aarch64 convert BITCAST: same-class not yet supported");
      }
      return;
    }
    default:
      compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k);
  }
}

/* ============================================================
 * Calls
 * ============================================================ */

static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi,
                           const CGABIValue* av, u32* next_int, u32* next_fp,
                           u32* stack_off) {
  AAImpl* a = impl_of(t);
  ABIArgInfo va_ai;
  ABIArgPart va_pt;
  const ABIArgInfo* ai = av->abi;
  if (!ai) {
    /* No ABI info attached to this value: synthesize a single-part DIRECT
     * description from the value's own type and storage class. */
    u32 sz = type_byte_size(av->type);
    memset(&va_ai, 0, sizeof va_ai);
    memset(&va_pt, 0, sizeof va_pt);
    va_ai.kind = ABI_ARG_DIRECT;
    va_ai.parts = &va_pt;
    va_ai.nparts = 1;
    va_pt.cls = (av->storage.cls == RC_FP) ?
ABI_CLASS_FP : ABI_CLASS_INT; + va_pt.size = sz; + va_pt.align = sz; + va_pt.src_offset = 0; + ai = &va_ai; + if (fi && fi->vararg_on_stack) { + *next_int = 8; + *next_fp = 8; + } + } + if (ai->kind == ABI_ARG_IGNORE) return; + + if (ai->kind == ABI_ARG_INDIRECT) { + u32 dst_reg; + int to_stack = (*next_int >= 8); + if (!to_stack) + dst_reg = (*next_int)++; + else + dst_reg = AA_TMP0; + if (av->storage.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot"); + aa64_emit32(t->mc, aa64_sub_imm(1, dst_reg, 29, s->off, 0)); + } else if (av->storage.kind == OPK_INDIRECT) { + aa64_emit_addr_adjust(t->mc, dst_reg, av->storage.v.ind.base & 0x1f, + av->storage.v.ind.ofs); + } else { + compiler_panic(t->c, a->loc, + "aarch64 call: INDIRECT arg storage kind %d unsupported", + (int)av->storage.kind); + } + if (to_stack) { + aa64_emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off)); + *stack_off += 8; + } + return; + } + + for (u16 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* pt = &ai->parts[i]; + u32 sz = pt->size; + u32 sidx = size_idx_for_bytes(sz); + + if (pt->cls == ABI_CLASS_INT) { + int to_stack = (*next_int >= 8); + u32 dst_reg = to_stack ? AA_TMP0 : (*next_int)++; + switch (av->storage.kind) { + case OPK_IMM: { + u32 sf = (sz == 8) ? 1u : 0u; + aa64_emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm); + break; + } + case OPK_REG: { + u32 sf = (sz == 8) ? 
1u : 0u; + aa64_emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage))); + break; + } + case OPK_LOCAL: { + AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot"); + i32 off = -(i32)s->off + (i32)pt->src_offset; + aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off)); + break; + } + case OPK_INDIRECT: { + Operand src; + memset(&src, 0, sizeof src); + src.kind = OPK_INDIRECT; + src.v.ind.base = av->storage.v.ind.base; + src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; + i32 off; + u32 base = addr_base(t, src, &off, AA_TMP0); + aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, base, off)); + break; + } + default: + compiler_panic(t->c, a->loc, + "aarch64 call: arg storage kind %d unsupported", + (int)av->storage.kind); + } + if (to_stack) { + aa64_emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off)); + *stack_off += 8; + } + } else if (pt->cls == ABI_CLASS_FP) { + int to_stack = (*next_fp >= 8); + if (!to_stack) { + u32 dst_reg = (*next_fp)++; + switch (av->storage.kind) { + case OPK_REG: { + u32 type = (sz == 8) ? 
1u : 0u; + aa64_emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage))); + break; + } + case OPK_INDIRECT: { + Operand src; + memset(&src, 0, sizeof src); + src.kind = OPK_INDIRECT; + src.v.ind.base = av->storage.v.ind.base; + src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; + i32 off; + u32 base = addr_base(t, src, &off, AA_TMP0); + aa64_emit32(t->mc, aa64_ldur_fp(sidx, dst_reg, base, off)); + break; + } + default: + compiler_panic(t->c, a->loc, + "aarch64 call: FP arg storage kind %d unsupported", + (int)av->storage.kind); + } + } else { + switch (av->storage.kind) { + case OPK_REG: + aa64_emit32(t->mc, aa64_stur_fp(sidx, reg_num(av->storage), 31, + (i32)*stack_off)); + break; + case OPK_INDIRECT: { + Operand src; + memset(&src, 0, sizeof src); + src.kind = OPK_INDIRECT; + src.v.ind.base = av->storage.v.ind.base; + src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; + i32 off; + u32 base = addr_base(t, src, &off, AA_TMP0); + aa64_emit32(t->mc, aa64_ldur_fp(sidx, AA_FP_TMP0, base, off)); + aa64_emit32(t->mc, aa64_stur_fp(sidx, AA_FP_TMP0, 31, (i32)*stack_off)); + break; + } + default: + compiler_panic( + t->c, a->loc, + "aarch64 call: FP stack-arg storage kind %d unsupported", + (int)av->storage.kind); + } + *stack_off += 8; + } + } else { + compiler_panic(t->c, a->loc, "aarch64 call: ABI class %d unimpl", + (int)pt->cls); + } + } +} + +static void aa_call(CGTarget* t, const CGCallDesc* d) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + u32 next_int = 0, next_fp = 0, stack_off = 0; + + if (d->abi && d->abi->has_sret) { + if (d->ret.storage.kind != OPK_LOCAL) { + compiler_panic(t->c, a->loc, + "aarch64 call: sret destination must be LOCAL"); + } + AASlot* s = aa64_slot_get(a, d->ret.storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot"); + aa64_emit32(mc, aa64_sub_imm(1, 8, 29, s->off, 0)); + } + + for (u32 i = 0; i < d->nargs; ++i) { + emit_arg_value(t, d->abi, &d->args[i], &next_int, 
&next_fp, &stack_off); + } + + u32 needed = (stack_off + 15u) & ~15u; + if (needed > a->max_outgoing) a->max_outgoing = needed; + + if (d->callee.kind == OPK_GLOBAL) { + u32 bl_pos = mc->pos(mc); + aa64_emit32(mc, aa64_bl_base()); + mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26, + d->callee.v.global.sym, d->callee.v.global.addend, 0, 0); + } else if (d->callee.kind == OPK_REG) { + aa64_emit32(mc, aa64_blr(reg_num(d->callee))); + } else { + compiler_panic(t->c, a->loc, "aarch64 call: callee kind %d unsupported", + (int)d->callee.kind); + } + + const ABIArgInfo* ri = &d->abi->ret; + if (ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) { + return; + } + if (ri->nparts == 0) return; + + Operand rs = d->ret.storage; + u32 next_int_ret = 0, next_fp_ret = 0; + for (u16 i = 0; i < ri->nparts; ++i) { + const ABIArgPart* p = &ri->parts[i]; + u32 src_reg; + if (p->cls == ABI_CLASS_INT) { + src_reg = next_int_ret++; + } else if (p->cls == ABI_CLASS_FP) { + src_reg = next_fp_ret++; + } else { + compiler_panic(t->c, a->loc, "aarch64 call: ret part cls %d unimpl", + (int)p->cls); + } + + if (rs.kind == OPK_REG) { + if (ri->nparts != 1) { + compiler_panic(t->c, a->loc, + "aarch64 call: REG ret_storage with %u parts", + (unsigned)ri->nparts); + } + if (p->cls == ABI_CLASS_INT) { + u32 sf = (p->size == 8) ? 1u : 0u; + aa64_emit32(mc, aa64_mov_reg(sf, reg_num(rs), src_reg)); + } else { + u32 type = (p->size == 8) ? 
1u : 0u; + aa64_emit32(mc, aa64_fmov_reg(type, reg_num(rs), src_reg)); + } + } else if (rs.kind == OPK_LOCAL || rs.kind == OPK_INDIRECT) { + u32 base_reg; + i32 base_off; + if (rs.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, rs.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot"); + base_reg = 29; + base_off = -(i32)s->off; + } else { + base_reg = rs.v.ind.base & 0x1f; + base_off = rs.v.ind.ofs; + } + u32 sidx = size_idx_for_bytes(p->size); + i32 off = base_off + (i32)p->src_offset; + if (p->cls == ABI_CLASS_INT) { + aa64_emit32(mc, aa64_stur(sidx, src_reg, base_reg, off)); + } else { + aa64_emit32(mc, aa64_stur_fp(sidx, src_reg, base_reg, off)); + } + } else if (rs.kind == OPK_IMM && rs.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID)) { + /* void return placeholder */ + } else { + compiler_panic(t->c, a->loc, + "aarch64 call: ret_storage kind %d unsupported", + (int)rs.kind); + } + } +} + +static void aa_ret(CGTarget* t, const CGABIValue* val) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + if (val) { + const ABIArgInfo* ri = val->abi; + if (ri && ri->kind == ABI_ARG_INDIRECT) { + if (val->storage.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot"); + if (a->sret_ptr_slot != FRAME_SLOT_NONE) { + AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot); + if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off)); + } + u32 nbytes = s->size; + u32 i = 0; + while (i + 8 <= nbytes) { + aa64_emit32(mc, aa64_ldur(3, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); + i += 8; + } + while (i + 4 <= nbytes) { + aa64_emit32(mc, aa64_ldur(2, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); + i += 4; + } + while (i + 2 <= nbytes) { + aa64_emit32(mc, aa64_ldur(1, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); + i += 2; + } 
+ while (i < nbytes) { + aa64_emit32(mc, aa64_ldur(0, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); + i += 1; + } + } else if (val->storage.kind == OPK_INDIRECT) { + u32 nbytes = val->size; + if (!nbytes) { + compiler_panic(t->c, a->loc, + "aarch64 ret indirect: missing aggregate size"); + } + if (a->sret_ptr_slot != FRAME_SLOT_NONE) { + AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot); + if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off)); + } + u32 base_reg = val->storage.v.ind.base & 0x1f; + i32 base_off = val->storage.v.ind.ofs; + u32 i = 0; + while (i + 8 <= nbytes) { + aa64_emit32(mc, aa64_ldur(3, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); + i += 8; + } + while (i + 4 <= nbytes) { + aa64_emit32(mc, aa64_ldur(2, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); + i += 4; + } + while (i + 2 <= nbytes) { + aa64_emit32(mc, aa64_ldur(1, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); + i += 2; + } + while (i < nbytes) { + aa64_emit32(mc, aa64_ldur(0, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); + i += 1; + } + } else { + compiler_panic(t->c, a->loc, + "aarch64 ret indirect: storage kind %d unsupported", + (int)val->storage.kind); + } + } else if (val->storage.kind == OPK_REG) { + if (val->storage.cls == RC_FP) { + u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u; + aa64_emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage))); + } else { + u32 sf = type_is_64(val->storage.type) ? 1u : 0u; + aa64_emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage))); + } + } else if (val->storage.kind == OPK_IMM) { + u32 sf = type_is_64(val->storage.type) ? 
1u : 0u; + aa64_emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm); + } else if (val->storage.kind == OPK_LOCAL || + val->storage.kind == OPK_INDIRECT) { + u32 base_reg; + i32 base_off; + if (val->storage.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot"); + base_reg = 29; + base_off = -(i32)s->off; + } else { + base_reg = val->storage.v.ind.base & 0x1f; + base_off = val->storage.v.ind.ofs; + } + const ABIArgInfo* ri2 = val->abi; + for (u16 i = 0; i < (ri2 ? ri2->nparts : 0); ++i) { + const ABIArgPart* pt = &ri2->parts[i]; + u32 sidx = size_idx_for_bytes(pt->size); + i32 off = base_off + (i32)pt->src_offset; + if (pt->cls == ABI_CLASS_INT) { + aa64_emit32(mc, aa64_ldur(sidx, /*Rt=*/i, base_reg, off)); + } else if (pt->cls == ABI_CLASS_FP) { + aa64_emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, base_reg, off)); + } else { + compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl", + (int)pt->cls); + } + } + } + } + u32 bpos = mc->pos(mc); + aa64_emit32(mc, aa64_b_base()); + mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0); + (void)bpos; +} + +/* ============================================================ + * alloca + * ============================================================ */ + +static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + if (d.kind != OPK_REG) { + compiler_panic(t->c, a->loc, "aarch64 alloca: dst must be REG"); + } + if (align > 16) { + compiler_panic(t->c, a->loc, + "aarch64 alloca: align %u > 16 not yet supported", align); + } + + if (sz.kind == OPK_IMM) { + i64 v = sz.v.imm; + if (v < 0) { + compiler_panic(t->c, a->loc, "aarch64 alloca: negative size"); + } + u64 aligned = ((u64)v + 15u) & ~(u64)15u; + if (aligned == 0) aligned = 16; + if (aligned > 0xfffu) { + compiler_panic(t->c, a->loc, + "aarch64 alloca: const size %llu too large for v1", + (unsigned 
long long)aligned); + } + aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=SP*/ 31, (u32)aligned, 0)); + } else if (sz.kind == OPK_REG) { + u32 sz_reg = reg_num(sz); + aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, sz_reg, 15u, 0)); + aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 4, 63)); + aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 60, 59)); + aa64_emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/ 31, /*SP*/ 31, AA_TMP0)); + } else { + compiler_panic(t->c, a->loc, "aarch64 alloca: size kind %d unsupported", + (int)sz.kind); + } + + if (a->nadd_patches == a->add_patches_cap) { + u32 ncap = a->add_patches_cap ? a->add_patches_cap * 2 : 4; + struct AAAllocaPatch* nb = + arena_array(t->c->tu, struct AAAllocaPatch, ncap); + if (a->add_patches) + memcpy(nb, a->add_patches, sizeof(*nb) * a->nadd_patches); + a->add_patches = nb; + a->add_patches_cap = ncap; + } + u32 dst_reg = reg_num(d); + a->add_patches[a->nadd_patches].pos = mc->pos(mc); + a->add_patches[a->nadd_patches].dst_reg = dst_reg; + a->nadd_patches++; + aa64_emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/ 31, 0, 0)); + a->has_alloca = 1; +} + +/* ============================================================ + * Varargs + * ============================================================ */ + +static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs) { + if (ofs == 0) + aa64_emit32(mc, aa64_mov_reg(1, dst, 29)); + else if (ofs > 0 && (u32)ofs <= 0xfff) + aa64_emit32(mc, aa64_add_imm(1, dst, 29, (u32)ofs, 0)); + else if (ofs < 0 && (u32)(-ofs) <= 0xfff) + aa64_emit32(mc, aa64_sub_imm(1, dst, 29, (u32)(-ofs), 0)); + else { + aa64_emit_load_imm(mc, 1, dst, ofs); + aa64_emit32(mc, aa64_add(1, dst, 29, dst)); + } +} + +static void aa_va_start_(CGTarget* t, Operand ap_op) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + if (!a->is_variadic) { + compiler_panic(t->c, a->loc, "aarch64 va_start: function not variadic"); + } + u32 ap = reg_num(ap_op); + AASlot* gs = aa64_slot_get(a, a->gp_save_slot); + AASlot* fs = aa64_slot_get(a, 
a->fp_save_slot); + + { + u32 ofs = 16u + a->next_param_stack; + if (ofs <= 0xfff) + aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0)); + else { + aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs); + aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0)); + } + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0)); + } + emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)gs->size); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 8)); + emit_fp_off(mc, AA_TMP0, -(i32)fs->off + (i32)fs->size); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 16)); + aa64_emit_load_imm(mc, 0, AA_TMP0, + (i64)((i32)(a->next_param_int * 8u) - 64)); + aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 24)); + aa64_emit_load_imm(mc, 0, AA_TMP0, + (i64)((i32)(a->next_param_fp * 16u) - 128)); + aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 28)); +} + +static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op, + CfreeCgTypeId ty) { + MCEmitter* mc = t->mc; + u32 ap = reg_num(ap_op); + int is_fp = (dst.cls == RC_FP); + u32 offs_field = is_fp ? 28u : 24u; + u32 top_field = is_fp ? 16u : 8u; + u32 stride_reg = is_fp ? 
16u : 8u; + u32 sz = type_byte_size(ty); + u32 sidx = size_idx_for_bytes(sz); + + MCLabel L_stack = mc->label_new(mc); + MCLabel L_done = mc->label_new(mc); + + aa64_emit32(mc, aa64_ldur(2, AA_TMP0, ap, (i32)offs_field)); + aa64_emit32(mc, aa64_subs_imm(0, 31, AA_TMP0, 0)); + aa64_emit32(mc, aa64_b_cond(0xa /*GE*/)); + mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0); + + aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, (i32)top_field)); + aa64_emit32(mc, aa64_sbfm(1, AA_TMP2, AA_TMP0, 0, 31)); + aa64_emit32(mc, aa64_add(1, AA_TMP2, AA_TMP1, AA_TMP2)); + if (is_fp) + aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP2, 0)); + else + aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP2, 0)); + aa64_emit32(mc, aa64_add_imm(0, AA_TMP0, AA_TMP0, stride_reg, 0)); + aa64_emit32(mc, aa64_stur(2, AA_TMP0, ap, (i32)offs_field)); + aa64_emit32(mc, aa64_b_base()); + mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); + + mc->label_place(mc, L_stack); + aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0)); + if (is_fp) + aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0)); + else + aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0)); + aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0)); + aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0)); + + mc->label_place(mc, L_done); +} + +static void aa_va_end_(CGTarget* t, Operand a) { + (void)t; + (void)a; +} + +static void aa_va_copy_(CGTarget* t, Operand d, Operand s) { + MCEmitter* mc = t->mc; + u32 dr = reg_num(d); + u32 sr = reg_num(s); + for (u32 i = 0; i < 32u; i += 8u) { + aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, (i32)i)); + } +} + +/* ============================================================ + * Atomics + * ============================================================ */ + +static inline u32 aa64_ldar(u32 sf64, u32 Rt, u32 Rn) { + return (sf64 ? 
0xC8DFFC00u : 0x88DFFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_stlr(u32 sf64, u32 Rt, u32 Rn) { + return (sf64 ? 0xC89FFC00u : 0x889FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_ldxr(u32 sf64, u32 Rt, u32 Rn) { + return (sf64 ? 0xC85F7C00u : 0x885F7C00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn) { + return (sf64 ? 0xC85FFC00u : 0x885FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_stxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) { + return (sf64 ? 0xC8007C00u : 0x88007C00u) | ((Rs & 0x1f) << 16) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) { + return (sf64 ? 0xC800FC00u : 0x8800FC00u) | ((Rs & 0x1f) << 16) | + ((Rn & 0x1f) << 5) | (Rt & 0x1f); +} +static inline u32 aa64_cbnz(u32 sf64, u32 Rt) { + return 0x35000000u | (sf64 << 31) | (Rt & 0x1f); +} + +static int mem_order_is_acquire(MemOrder o) { + return o == MO_ACQUIRE || o == MO_ACQ_REL || o == MO_SEQ_CST || + o == MO_CONSUME; +} +static int mem_order_is_release(MemOrder o) { + return o == MO_RELEASE || o == MO_ACQ_REL || o == MO_SEQ_CST; +} + +static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma, + MemOrder ord) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + u32 sf = (ma.size == 8) ? 
1u : 0u; + + u32 base; + if (addr.kind == OPK_REG) { + base = reg_num(addr); + } else if (addr.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot"); + base = AA_TMP0; + aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, + "aarch64 atomic_load: addr kind %d unsupported", + (int)addr.kind); + } + if (mem_order_is_acquire(ord)) { + aa64_emit32(mc, aa64_ldar(sf, reg_num(dst), base)); + } else { + u32 sidx = size_idx_for_bytes(ma.size); + aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0)); + } +} + +static void aa_atomic_store(CGTarget* t, Operand addr, Operand src, + MemAccess ma, MemOrder ord) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + u32 sf = (ma.size == 8) ? 1u : 0u; + + u32 src_reg; + if (src.kind == OPK_IMM) { + src_reg = AA_TMP1; + aa64_emit_load_imm(mc, sf, src_reg, src.v.imm); + } else if (src.kind == OPK_REG) { + src_reg = reg_num(src); + } else { + compiler_panic(t->c, a->loc, + "aarch64 atomic_store: src kind %d unsupported", + (int)src.kind); + } + u32 base; + if (addr.kind == OPK_REG) { + base = reg_num(addr); + } else if (addr.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot"); + base = AA_TMP0; + aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, + "aarch64 atomic_store: addr kind %d unsupported", + (int)addr.kind); + } + if (mem_order_is_release(ord)) { + aa64_emit32(mc, aa64_stlr(sf, src_reg, base)); + } else { + u32 sidx = size_idx_for_bytes(ma.size); + aa64_emit32(mc, aa64_stur(sidx, src_reg, base, 0)); + } +} + +static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf, u32 dst_new, + u32 prior, u32 val) { + switch (op) { + case AO_XCHG: aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val)); break; + case AO_ADD: aa64_emit32(mc, aa64_add(sf, dst_new, prior, 
val)); break; + case AO_SUB: aa64_emit32(mc, aa64_sub(sf, dst_new, prior, val)); break; + case AO_AND: aa64_emit32(mc, aa64_and(sf, dst_new, prior, val)); break; + case AO_OR: aa64_emit32(mc, aa64_orr(sf, dst_new, prior, val)); break; + case AO_XOR: aa64_emit32(mc, aa64_eor(sf, dst_new, prior, val)); break; + case AO_NAND: + aa64_emit32(mc, aa64_and(sf, dst_new, prior, val)); + aa64_emit32(mc, aa64_mvn(sf, dst_new, dst_new)); + break; + default: + aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val)); + break; + } +} + +static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr, + Operand val, MemAccess ma, MemOrder ord) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + u32 sf = (ma.size == 8) ? 1u : 0u; + + u32 base = AA_TMP0; + if (addr.kind == OPK_REG) { + aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr))); + } else if (addr.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot"); + aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported", + (int)addr.kind); + } + u32 vreg = AA_TMP1; + if (val.kind == OPK_IMM) { + aa64_emit_load_imm(mc, sf, vreg, val.v.imm); + } else if (val.kind == OPK_REG) { + aa64_emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val))); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported", + (int)val.kind); + } + + int do_acq = mem_order_is_acquire(ord); + int do_rel = mem_order_is_release(ord); + + MCLabel L_retry = mc->label_new(mc); + mc->label_place(mc, L_retry); + + if (do_acq) + aa64_emit32(mc, aa64_ldaxr(sf, reg_num(dst), base)); + else + aa64_emit32(mc, aa64_ldxr(sf, reg_num(dst), base)); + + emit_rmw_combine(mc, op, sf, AA_TMP2, reg_num(dst), vreg); + + if (do_rel) + aa64_emit32(mc, aa64_stlxr(sf, vreg, AA_TMP2, base)); + else + aa64_emit32(mc, aa64_stxr(sf, vreg, AA_TMP2, base)); + + u32 cbnz_pos = 
mc->pos(mc); + aa64_emit32(mc, aa64_cbnz(0, vreg)); + mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); + (void)cbnz_pos; +} + +static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr, + Operand expected, Operand desired, MemAccess ma, + MemOrder succ, MemOrder fail) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + u32 sf = (ma.size == 8) ? 1u : 0u; + (void)fail; + + u32 base = AA_TMP0; + if (addr.kind == OPK_REG) + aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr))); + else if (addr.kind == OPK_LOCAL) { + AASlot* s = aa64_slot_get(a, addr.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot"); + aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0)); + } else { + compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported", + (int)addr.kind); + } + if (expected.kind == OPK_IMM) + aa64_emit_load_imm(mc, sf, AA_TMP1, expected.v.imm); + else if (expected.kind == OPK_REG) + aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP1, reg_num(expected))); + else + compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported", + (int)expected.kind); + if (desired.kind == OPK_IMM) + aa64_emit_load_imm(mc, sf, AA_TMP2, desired.v.imm); + else if (desired.kind == OPK_REG) + aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP2, reg_num(desired))); + else + compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported", + (int)desired.kind); + + int do_acq = mem_order_is_acquire(succ); + int do_rel = mem_order_is_release(succ); + + MCLabel L_retry = mc->label_new(mc); + MCLabel L_fail = mc->label_new(mc); + MCLabel L_done = mc->label_new(mc); + + mc->label_place(mc, L_retry); + if (do_acq) + aa64_emit32(mc, aa64_ldaxr(sf, reg_num(prior), base)); + else + aa64_emit32(mc, aa64_ldxr(sf, reg_num(prior), base)); + + aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, reg_num(prior), AA_TMP1)); + aa64_emit32(mc, aa64_b_cond(0x1u /*NE*/)); + mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0); + + 
if (do_rel) + aa64_emit32(mc, aa64_stlxr(sf, AA_TMP1, AA_TMP2, base)); + else + aa64_emit32(mc, aa64_stxr(sf, AA_TMP1, AA_TMP2, base)); + aa64_emit32(mc, aa64_cbnz(0, AA_TMP1)); + mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); + + aa64_emit_load_imm(mc, 0, reg_num(ok), 1); + aa64_emit32(mc, aa64_b_base()); + mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); + + mc->label_place(mc, L_fail); + aa64_emit32(mc, aa64_clrex(AA64_BARRIER_OPT_SY)); + aa64_emit_load_imm(mc, 0, reg_num(ok), 0); + + mc->label_place(mc, L_done); +} + +static void aa_fence(CGTarget* t, MemOrder o) { + (void)o; + if (o == MO_RELAXED) return; + aa64_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); +} + +/* ============================================================ + * Intrinsics + * ============================================================ */ + +static inline u32 aa64_rev16_w(u32 Rd, u32 Rn) { + return 0x5AC00400u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_rev_w(u32 Rd, u32 Rn) { + return 0x5AC00800u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_rev_x(u32 Rd, u32 Rn) { + return 0xDAC00C00u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_rbit(u32 sf64, u32 Rd, u32 Rn) { + return (sf64 ? 0xDAC00000u : 0x5AC00000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_clz(u32 sf64, u32 Rd, u32 Rn) { + return (sf64 ? 
0xDAC01000u : 0x5AC01000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); +} +static inline u32 aa64_cnt_8b(u32 Vd, u32 Vn) { + return 0x0E205800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f); +} +static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn) { + return 0x0E31B800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f); +} +static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) { + return 0x2B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | + (Rd & 0x1f); +} +static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) { + return aa64_dp3_pack((AA64DP3){ + .sf = 1, .op31 = 1, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd}); +} +static inline u32 aa64_smull(u32 Rd, u32 Rn, u32 Rm) { + return aa64_smaddl(Rd, Rn, Rm, AA64_ZR); +} +static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm) { + return 0xEB200000u | ((Rm & 0x1f) << 16) | (6u << 13) | ((Rn & 0x1f) << 5) | + (Rd & 0x1f); +} + +static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, + const Operand* args, u32 na) { + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + (void)nd; + + switch (kind) { + case INTRIN_POPCOUNT: { + Operand src = args[0]; + Operand dst = dsts[0]; + u32 sz_in = type_byte_size(src.type); + if (sz_in == 8) + aa64_emit32(mc, aa64_fmov_d_x(AA_FP_TMP0, reg_num(src))); + else + aa64_emit32(mc, aa64_fmov_s_w(AA_FP_TMP0, reg_num(src))); + aa64_emit32(mc, aa64_cnt_8b(AA_FP_TMP0, AA_FP_TMP0)); + aa64_emit32(mc, aa64_addv_b_8b(AA_FP_TMP0, AA_FP_TMP0)); + aa64_emit32(mc, aa64_fmov_w_s(reg_num(dst), AA_FP_TMP0)); + return; + } + case INTRIN_CLZ: { + Operand src = args[0]; + Operand dst = dsts[0]; + u32 sf = type_is_64(src.type) ? 1u : 0u; + aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src))); + return; + } + case INTRIN_CTZ: { + Operand src = args[0]; + Operand dst = dsts[0]; + u32 sf = type_is_64(src.type) ? 
1u : 0u; + aa64_emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src))); + aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(dst))); + return; + } + case INTRIN_BSWAP16: { + aa64_emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0]))); + return; + } + case INTRIN_BSWAP32: { + aa64_emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0]))); + return; + } + case INTRIN_BSWAP64: { + aa64_emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0]))); + return; + } + case INTRIN_MEMCPY: + case INTRIN_MEMMOVE: { + Operand da = args[0], sa = args[1], nb = args[2]; + if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) { + compiler_panic(t->c, a->loc, + "aarch64 intrinsic: %s with non-const n or non-REG ptr", + kind == INTRIN_MEMCPY ? "memcpy" : "memmove"); + } + u32 dr = reg_num(da); + u32 sr = reg_num(sa); + u32 n = (u32)nb.v.imm; + if (kind == INTRIN_MEMCPY) { + u32 i = 0; + while (i + 8 <= n) { + aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); + i += 8; + } + while (i + 4 <= n) { + aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); + i += 4; + } + while (i + 2 <= n) { + aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); + i += 2; + } + while (i < n) { + aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); + i += 1; + } + } else { + u32 i = n; + while (i >= 8) { + i -= 8; + aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); + } + while (i >= 4) { + i -= 4; + aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); + } + while (i >= 2) { + i -= 2; + aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); + } + while (i >= 1) { + i -= 1; + aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, 
(i32)i)); + aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); + } + } + return; + } + case INTRIN_MEMSET: { + Operand da = args[0], bv = args[1], nb = args[2]; + if (da.kind != OPK_REG || nb.kind != OPK_IMM) { + compiler_panic( + t->c, a->loc, + "aarch64 intrinsic: memset with non-const n / non-REG ptr"); + } + u32 dr = reg_num(da); + u32 n = (u32)nb.v.imm; + u32 byte; + u32 src_reg; + if (bv.kind == OPK_IMM) { + byte = (u32)(bv.v.imm & 0xffu); + if (byte == 0) { + src_reg = 31u; + } else { + u64 b64 = byte; + b64 |= b64 << 8; + b64 |= b64 << 16; + b64 |= b64 << 32; + aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)b64); + src_reg = AA_TMP2; + } + } else if (bv.kind == OPK_REG) { + aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)0x0101010101010101ll); + aa64_emit32(mc, aa64_madd(1, AA_TMP2, reg_num(bv), AA_TMP2, AA64_ZR)); + src_reg = AA_TMP2; + } else { + compiler_panic(t->c, a->loc, + "aarch64 intrinsic: memset byte kind %d unsupported", + (int)bv.kind); + } + u32 i = 0; + while (i + 8 <= n) { + aa64_emit32(mc, aa64_stur(3, src_reg, dr, (i32)i)); + i += 8; + } + while (i + 4 <= n) { + aa64_emit32(mc, aa64_stur(2, src_reg, dr, (i32)i)); + i += 4; + } + while (i + 2 <= n) { + aa64_emit32(mc, aa64_stur(1, src_reg, dr, (i32)i)); + i += 2; + } + while (i < n) { + aa64_emit32(mc, aa64_stur(0, src_reg, dr, (i32)i)); + i += 1; + } + return; + } + case INTRIN_PREFETCH: + (void)args; + (void)na; + return; + case INTRIN_ASSUME_ALIGNED: { + Operand src = args[0]; + Operand dst = dsts[0]; + if (reg_num(src) != reg_num(dst)) { + aa64_emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src))); + } + return; + } + case INTRIN_EXPECT: { + Operand val = args[0]; + Operand dst = dsts[0]; + u32 sf = type_is_64(dst.type) ? 
1u : 0u; + if (val.kind == OPK_REG) { + if (reg_num(val) != reg_num(dst)) { + aa64_emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val))); + } + } else if (val.kind == OPK_IMM) { + aa64_emit_load_imm(mc, sf, reg_num(dst), val.v.imm); + } else { + compiler_panic(t->c, a->loc, + "aarch64 intrinsic: expect val kind %d unsupported", + (int)val.kind); + } + return; + } + case INTRIN_UNREACHABLE: + case INTRIN_TRAP: + aa64_emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u)); + return; + case INTRIN_ADD_OVERFLOW: + case INTRIN_SUB_OVERFLOW: { + Operand a_op = args[0], b_op = args[1]; + Operand dval = dsts[0], dovf = dsts[1]; + u32 sf = type_is_64(dval.type) ? 1u : 0u; + u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0); + u32 rb = + aa64_force_reg_int(t, b_op, sf, + (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0); + u32 word = (kind == INTRIN_ADD_OVERFLOW) + ? aa64_adds_reg(sf, reg_num(dval), ra, rb) + : aa64_subs_reg(sf, reg_num(dval), ra, rb); + aa64_emit32(mc, word); + aa64_emit32(mc, aa64_cset(sf, reg_num(dovf), 0x6u /*VS*/)); + return; + } + case INTRIN_MUL_OVERFLOW: { + Operand a_op = args[0], b_op = args[1]; + Operand dval = dsts[0], dovf = dsts[1]; + u32 sf = type_is_64(dval.type) ? 1u : 0u; + if (sf) { + compiler_panic( + t->c, a->loc, + "aarch64 intrinsic: mul_overflow on i64 not yet supported"); + } + u32 ra = aa64_force_reg_int(t, a_op, 0, AA_TMP0); + u32 rb = + aa64_force_reg_int(t, b_op, 0, + (ra == AA_TMP0) ? 
AA_TMP1 : AA_TMP0); + aa64_emit32(mc, aa64_smull(AA_TMP2, ra, rb)); + aa64_emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/ 31u, AA_TMP2, AA_TMP2)); + aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/)); + aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2)); + return; + } + default: + compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported", + (int)kind); + } +} + +/* ============================================================ + * Inline asm block + * ============================================================ */ + +static void aa_asm_block(CGTarget* t, const char* tmpl, + const AsmConstraint* outs, u32 no, Operand* oo, + const AsmConstraint* ins, u32 ni, const Operand* io, + const Sym* clobs, u32 nc) { + AAImpl* a_impl = impl_of(t); + for (u32 i = 0; i < nc; ++i) { + Reg phys; + RegClass cls; + if (t->resolve_reg_name(t, clobs[i], &phys, &cls) != 0) continue; + if (cls == RC_INT) { + if (phys >= 19u && phys <= 28u) + a_impl->used_cs_int_mask |= 1u << phys; + } else if (cls == RC_FP) { + if (phys >= 8u && phys <= 15u) + a_impl->used_cs_fp_mask |= 1u << phys; + } + } + AA64Asm* a = aa64_asm_open(t->c); + aa64_inline_bind(a, outs, no, oo, ins, ni, io, clobs, nc); + aa64_asm_run_template(a, t->mc, tmpl); + aa64_asm_close(a); +} + +/* ============================================================ + * Lifecycle / vtable constructor + * ============================================================ */ + +static void aa_set_loc(CGTarget* t, SrcLoc loc) { + impl_of(t)->loc = loc; + t->mc->set_loc(t->mc, loc); +} + +static void aa_finalize(CGTarget* t) { (void)t; } + +static void aa_destroy(CGTarget* t) { (void)t; } + +static void cgt_cleanup(void* arg) { cgtarget_free((CGTarget*)arg); } + +CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { + AAImpl* a = arena_new(c->tu, AAImpl); + memset(a, 0, sizeof *a); + + CGTarget* t = &a->base; + t->c = c; + t->obj = o; + t->mc = m; + + t->func_begin = aa_func_begin; + t->func_end = aa_func_end; + 
t->frame_slot = aa_frame_slot; + t->param = aa_param; + + t->load_imm = aa_load_imm; + t->load_const = aa_load_const; + t->copy = aa_copy; + t->load = aa_load; + t->store = aa_store; + t->addr_of = aa_addr_of; + t->tls_addr_of = aa_tls_addr_of; + t->copy_bytes = aa_copy_bytes; + t->set_bytes = aa_set_bytes; + t->bitfield_load = aa_bitfield_load; + t->bitfield_store = aa_bitfield_store; + + t->binop = aa_binop; + t->unop = aa_unop; + t->convert = aa_convert; + + t->call = aa_call; + t->ret = aa_ret; + + t->alloca_ = aa_alloca_; + t->va_start_ = aa_va_start_; + t->va_arg_ = aa_va_arg_; + t->va_end_ = aa_va_end_; + t->va_copy_ = aa_va_copy_; + + t->atomic_load = aa_atomic_load; + t->atomic_store = aa_atomic_store; + t->atomic_rmw = aa_atomic_rmw; + t->atomic_cas = aa_atomic_cas; + t->fence = aa_fence; + + t->intrinsic = aa_intrinsic; + t->asm_block = aa_asm_block; + + t->set_loc = aa_set_loc; + t->finalize = aa_finalize; + t->destroy = aa_destroy; + + /* alloc/label/scope vtable entries */ + aa_alloc_vtable_init(t); + aa_coord_vtable_init(t); + + /* Suppress unused warning. */ + (void)type_is_signed; + + compiler_defer(c, cgt_cleanup, t); + return t; +} diff --git a/src/arch/aa64/opt_coord.c b/src/arch/aa64/opt_coord.c @@ -0,0 +1,96 @@ +/* aarch64/opt_coord.c — opt/backend register coordination hooks. + * Static arrays so opt_machinize can query the backend instead of + * hard-coding arch knowledge. */ + +#include "arch/aa64/internal.h" + +/* ============================================================ + * Static register tables reported to caller-owned allocators. 
*/ + +static const Reg aa_int_allocable[] = {19, 20, 21, 22, 23, + 24, 25, 26, 27, 28}; +static const Reg aa_fp_allocable[] = {8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23}; + +static const Reg aa_int_scratch[] = {16, 17}; +static const Reg aa_fp_scratch[] = {24, 25}; + +/* ============================================================ + * Vtable methods */ + +static void aa_get_allocable_regs(CGTarget* t, RegClass cls, + const Reg** out, u32* nregs) { + (void)t; + switch (cls) { + case RC_INT: + *out = aa_int_allocable; + *nregs = sizeof aa_int_allocable / sizeof aa_int_allocable[0]; + break; + case RC_FP: + *out = aa_fp_allocable; + *nregs = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0]; + break; + default: + *out = NULL; + *nregs = 0; + break; + } +} + +static void aa_get_scratch_regs(CGTarget* t, RegClass cls, + const Reg** out, u32* nregs) { + (void)t; + switch (cls) { + case RC_INT: + *out = aa_int_scratch; + *nregs = sizeof aa_int_scratch / sizeof aa_int_scratch[0]; + break; + case RC_FP: + *out = aa_fp_scratch; + *nregs = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0]; + break; + default: + *out = NULL; + *nregs = 0; + break; + } +} + +static int aa_is_caller_saved(CGTarget* t, RegClass cls, Reg reg) { + (void)t; + switch (cls) { + case RC_INT: + /* AAPCS64 caller-saved: x0-x18, x30 */ + return reg <= 18 || reg == 30; + case RC_FP: + /* AAPCS64 caller-saved: v0-v7, v16-v31 */ + return reg <= 7 || reg >= 16; + default: + return 0; + } +} + +static void aa_reserve_hard_regs(CGTarget* t, RegClass cls, + const Reg* regs, u32 n) { + AAImpl* a = impl_of(t); + for (u32 i = 0; i < n; ++i) { + Reg r = regs[i]; + switch (cls) { + case RC_INT: + if (r >= 19u && r <= 28u) a->used_cs_int_mask |= 1u << r; + break; + case RC_FP: + if (r >= 8u && r <= 15u) a->used_cs_fp_mask |= 1u << r; + break; + default: + break; + } + } +} + +void aa_coord_vtable_init(CGTarget* t) { + t->get_allocable_regs = aa_get_allocable_regs; + t->get_scratch_regs = 
/* AArch64 register name table — DWARF index <-> assembler name.
 *
 * DWARF register numbering used by the table:
 *   0..30   x0..x30   (w0..w30 share the index)
 *   31      sp        (wsp shares the index)
 *   32      pc
 *   64..95  v0..v31
 *
 * The canonical spelling stored here is the 64-bit form (xN / vN). */

/* One table row: DWARF register number and its canonical lower-case name. */
typedef struct AA64Reg {
  uint32_t dwarf_idx;
  const char* name;
} AA64Reg;

static const AA64Reg AA64_REGS[] = {
    {0, "x0"},   {1, "x1"},   {2, "x2"},   {3, "x3"},   {4, "x4"},
    {5, "x5"},   {6, "x6"},   {7, "x7"},   {8, "x8"},   {9, "x9"},
    {10, "x10"}, {11, "x11"}, {12, "x12"}, {13, "x13"}, {14, "x14"},
    {15, "x15"}, {16, "x16"}, {17, "x17"}, {18, "x18"}, {19, "x19"},
    {20, "x20"}, {21, "x21"}, {22, "x22"}, {23, "x23"}, {24, "x24"},
    {25, "x25"}, {26, "x26"}, {27, "x27"}, {28, "x28"}, {29, "x29"},
    {30, "x30"}, {31, "sp"},  {32, "pc"},
    {64, "v0"},  {65, "v1"},  {66, "v2"},  {67, "v3"},  {68, "v4"},
    {69, "v5"},  {70, "v6"},  {71, "v7"},  {72, "v8"},  {73, "v9"},
    {74, "v10"}, {75, "v11"}, {76, "v12"}, {77, "v13"}, {78, "v14"},
    {79, "v15"}, {80, "v16"}, {81, "v17"}, {82, "v18"}, {83, "v19"},
    {84, "v20"}, {85, "v21"}, {86, "v22"}, {87, "v23"}, {88, "v24"},
    {89, "v25"}, {90, "v26"}, {91, "v27"}, {92, "v28"}, {93, "v29"},
    {94, "v30"}, {95, "v31"},
};

static const uint32_t AA64_REGS_N =
    (uint32_t)(sizeof AA64_REGS / sizeof AA64_REGS[0]);

/* Exact (case-sensitive) lookup of `name` in AA64_REGS.
 * Returns 0 and stores the DWARF index through idx_out (when non-NULL) on a
 * hit, 1 on a miss. */
static int aa64_reg_lookup(const char* name, uint32_t* idx_out) {
  for (uint32_t i = 0; i < AA64_REGS_N; ++i) {
    if (strcmp(AA64_REGS[i].name, name) == 0) {
      if (idx_out) *idx_out = AA64_REGS[i].dwarf_idx;
      return 0;
    }
  }
  return 1;
}

/* Map a DWARF register number to its canonical name, or NULL when the
 * number is not in the table. */
const char* aa64_register_name(uint32_t dwarf_idx) {
  for (uint32_t i = 0; i < AA64_REGS_N; ++i) {
    if (AA64_REGS[i].dwarf_idx == dwarf_idx) return AA64_REGS[i].name;
  }
  return NULL;
}

/* Map an assembler register name to its DWARF number.
 *
 * Accepted spellings, all lower-case:
 *   - every canonical name in the table (x0..x30, sp, pc, v0..v31)
 *   - wzr / xzr / wsp, which share slot 31 with sp
 *   - wN as an alias for xN
 *
 * idx_out may be NULL when only validity is of interest.
 * Returns 0 on success, 1 when `name` is NULL or unknown. */
int aa64_register_index(const char* name, uint32_t* idx_out) {
  if (!name) return 1;
  if (aa64_reg_lookup(name, idx_out) == 0) return 0;

  /* Zero-register spellings and the 32-bit view of SP have no row of their
   * own; they all resolve to the SP slot. */
  if (!strcmp(name, "wzr") || !strcmp(name, "xzr") || !strcmp(name, "wsp")) {
    if (idx_out) *idx_out = 31u;
    return 0;
  }

  /* wN alias: rewrite the leading 'w' to 'x' and retry the exact lookup. */
  if (name[0] == 'w' && name[1] != '\0') {
    char buf[8];
    size_t n = strlen(name);
    if (n < sizeof buf) {
      buf[0] = 'x';
      /* n bytes from name + 1 == the remaining n - 1 chars plus the NUL. */
      memcpy(buf + 1, name + 1, n);
      return aa64_reg_lookup(buf, idx_out);
    }
  }
  return 1;
}

/* Number of rows available to aa64_register_iter_get. */
uint32_t aa64_register_iter_size(void) {
  return AA64_REGS_N;
}

/* Fetch row `i` of the table. Either out-pointer may be NULL.
 * Returns 0 on success, 1 when `i` is out of range. */
int aa64_register_iter_get(uint32_t i, uint32_t* dwarf_out,
                           const char** name_out) {
  if (i >= AA64_REGS_N) return 1;
  if (dwarf_out) *dwarf_out = AA64_REGS[i].dwarf_idx;
  if (name_out) *name_out = AA64_REGS[i].name;
  return 0;
}
*/ - -#include "arch/aa64_asm.h" - -#include <string.h> - -#include "arch/aa64_isa.h" -#include "arch/aa64_regs.h" -#include "arch/arch.h" -#include "core/arena.h" -#include "core/pool.h" -#include "core/strbuf.h" -#include "asm/asm_lex.h" -#include "obj/obj.h" -#include "asm/asm_helpers.h" - -/* ---- public handle ---- */ - -struct AA64Asm { - ArchAsm base; - Compiler* c; - - /* Inline-asm bound state (set by aa64_inline_bind, cleared otherwise). - * Operand indexing per GCC convention: 0..nout-1 are outputs, then - * nout..nout+nin-1 are inputs. Templates address into this combined - * list via %N / %wN / %xN / %aN. out_ops is mutable (the binder fills - * in result locations); in_ops + constraints + clobbers are read-only - * borrows. */ - const AsmConstraint* outs; - Operand* out_ops; - const AsmConstraint* ins; - const Operand* in_ops; - const Sym* clobbers; - u32 nout; - u32 nin; - u32 nclob; -}; - -static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic); -static void aa64_arch_asm_destroy(ArchAsm* base); - -AA64Asm* aa64_asm_open(Compiler* c) { - AA64Asm* a = arena_new(c->tu, AA64Asm); - memset(a, 0, sizeof *a); - a->base.insn = aa64_arch_asm_insn; - a->base.destroy = aa64_arch_asm_destroy; - a->c = c; - return a; -} - -void aa64_asm_close(AA64Asm* a) { (void)a; } - -ArchAsm* aa64_arch_asm_new(Compiler* c) { - return &aa64_asm_open(c)->base; -} - -static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) { - aa64_asm_insn((AA64Asm*)base, d, mnemonic); -} - -static void aa64_arch_asm_destroy(ArchAsm* base) { - aa64_asm_close((AA64Asm*)base); -} - -void aa64_inline_bind(AA64Asm* a, - const AsmConstraint* outs, u32 nout, Operand* out_ops, - const AsmConstraint* ins, u32 nin, const Operand* in_ops, - const Sym* clobbers, u32 nclob) { - a->outs = outs; - a->out_ops = out_ops; - a->ins = ins; - a->in_ops = in_ops; - a->clobbers = clobbers; - a->nout = nout; - a->nin = nin; - a->nclob = nclob; -} - -/* ---- helpers ---- */ - 
-static int tok_punct(AsmTok t, u32 p) { return asm_driver_tok_is_punct(t, p); } - -static int icase_eq(const char* a, size_t an, const char* b) { - size_t i; - for (i = 0; i < an; ++i) { - char x = a[i], y = b[i]; - if (x >= 'A' && x <= 'Z') x = (char)(x + ('a' - 'A')); - if (y >= 'A' && y <= 'Z') y = (char)(y + ('a' - 'A')); - if (x != y || !y) return 0; - } - return b[an] == '\0'; -} - -/* Parse a register operand. Returns the 5-bit encoded register number - * via *reg_out and the form via *is64_out. Recognized forms (case- - * insensitive): - * w0..w30, wzr → is64=0, reg=0..30 / 31 - * x0..x30, xzr, lr (=x30) → is64=1, reg=0..30 / 31 - * sp → is64=1, reg=31 (sp_means_sp set) - * wsp → is64=0, reg=31 (sp_means_sp set) - * Aliases: - * fp = x29 - * ip0 = x16, ip1 = x17 (PLT scratch — useful for hand-written PLTs) */ -typedef struct AA64Reg { - u32 num; - u8 is64; - u8 is_sp; /* 1 if the spelling was "sp" / "wsp" */ - u8 is_fp; /* 1 for SIMD/FP register spellings accepted in FP forms */ - u8 pad; -} AA64Reg; - -static int parse_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) { - size_t n = 0; - const char* p = pool_str(asm_driver_pool(d), ident, &n); - if (!p || !n) return 0; - /* "sp" */ - if (icase_eq(p, n, "sp")) { - out->num = 31; - out->is64 = 1; - out->is_sp = 1; - out->is_fp = 0; - return 1; - } - if (icase_eq(p, n, "wsp")) { - out->num = 31; - out->is64 = 0; - out->is_sp = 1; - out->is_fp = 0; - return 1; - } - if (icase_eq(p, n, "lr")) { - out->num = 30; - out->is64 = 1; - out->is_sp = 0; - out->is_fp = 0; - return 1; - } - if (icase_eq(p, n, "fp")) { - out->num = 29; - out->is64 = 1; - out->is_sp = 0; - out->is_fp = 0; - return 1; - } - if (icase_eq(p, n, "ip0")) { - out->num = 16; - out->is64 = 1; - out->is_sp = 0; - out->is_fp = 0; - return 1; - } - if (icase_eq(p, n, "ip1")) { - out->num = 17; - out->is64 = 1; - out->is_sp = 0; - out->is_fp = 0; - return 1; - } - if (icase_eq(p, n, "xzr")) { - out->num = 31; - out->is64 = 1; - out->is_sp = 0; - 
out->is_fp = 0; - return 1; - } - if (icase_eq(p, n, "wzr")) { - out->num = 31; - out->is64 = 0; - out->is_sp = 0; - out->is_fp = 0; - return 1; - } - /* W/X<num> */ - if ((p[0] == 'w' || p[0] == 'W' || p[0] == 'x' || p[0] == 'X') && n >= 2) { - u32 r = 0; - size_t i; - for (i = 1; i < n; ++i) { - char c = p[i]; - if (c < '0' || c > '9') return 0; - r = r * 10 + (u32)(c - '0'); - if (r > 31) return 0; - } - out->num = r; - out->is64 = (p[0] == 'x' || p[0] == 'X') ? 1 : 0; - out->is_sp = 0; - out->is_fp = 0; - return 1; - } - return 0; -} - -static int parse_fp_d_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) { - size_t n = 0; - const char* p = pool_str(asm_driver_pool(d), ident, &n); - if (!p || n < 2 || (p[0] != 'd' && p[0] != 'D')) return 0; - u32 r = 0; - for (size_t i = 1; i < n; ++i) { - char c = p[i]; - if (c < '0' || c > '9') return 0; - r = r * 10 + (u32)(c - '0'); - if (r > 31) return 0; - } - out->num = r; - out->is64 = 1; - out->is_sp = 0; - out->is_fp = 1; - return 1; -} - -static AA64Reg parse_reg(AsmDriver* d) { - AsmTok t = asm_driver_next(d); - AA64Reg r; - memset(&r, 0, sizeof r); - if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r)) - asm_driver_panic(d, "asm: expected register"); - return r; -} - -static AA64Reg parse_ldstp_reg(AsmDriver* d) { - AsmTok t = asm_driver_next(d); - AA64Reg r; - memset(&r, 0, sizeof r); - if (t.kind != ASM_TOK_IDENT || - (!parse_reg_from_ident(d, t.v.ident, &r) && - !parse_fp_d_reg_from_ident(d, t.v.ident, &r))) { - asm_driver_panic(d, "asm: expected register"); - } - return r; -} - -static void reject_sp_reg(AsmDriver* d, AA64Reg r, const char* what) { - if (r.is_sp) asm_driver_panic(d, "asm: %s: SP register not allowed", what); -} - -static void require_sp_spelling(AsmDriver* d, AA64Reg r, const char* what) { - if (r.num == 31u && !r.is_sp) - asm_driver_panic(d, "asm: %s: zero register not allowed in SP operand", - what); -} - -/* Parse "#imm" (with optional + / -) or a bare expression — 
GNU as is - * lenient about the leading hash. Returns an i64. */ -static i64 parse_imm_const(AsmDriver* d) { - (void)asm_driver_eat_punct(d, '#'); - return asm_driver_parse_const(d); -} - -/* Parse a possibly-symbolic operand prefixed by '#'. */ -static void parse_imm_sym(AsmDriver* d, ObjSymId* sym_out, i64* val_out) { - (void)asm_driver_eat_punct(d, '#'); - asm_driver_parse_sym_expr(d, sym_out, val_out); -} - -static void emit32(AsmDriver* d, u32 word) { - MCEmitter* mc = asm_driver_mc(d); - (void)asm_driver_cur_section(d); - u8 buf[4]; - buf[0] = (u8)(word & 0xff); - buf[1] = (u8)((word >> 8) & 0xff); - buf[2] = (u8)((word >> 16) & 0xff); - buf[3] = (u8)((word >> 24) & 0xff); - mc->emit_bytes(mc, buf, 4); -} - -static int parse_cond_from_ident(AsmDriver* d, Sym ident, u32* out) { - size_t n = 0; - const char* s = pool_str(asm_driver_pool(d), ident, &n); - if (!s) return 0; - if (icase_eq(s, n, "eq")) *out = 0; - else if (icase_eq(s, n, "ne")) *out = 1; - else if (icase_eq(s, n, "cs") || icase_eq(s, n, "hs")) *out = 2; - else if (icase_eq(s, n, "cc") || icase_eq(s, n, "lo")) *out = 3; - else if (icase_eq(s, n, "mi")) *out = 4; - else if (icase_eq(s, n, "pl")) *out = 5; - else if (icase_eq(s, n, "vs")) *out = 6; - else if (icase_eq(s, n, "vc")) *out = 7; - else if (icase_eq(s, n, "hi")) *out = 8; - else if (icase_eq(s, n, "ls")) *out = 9; - else if (icase_eq(s, n, "ge")) *out = 10; - else if (icase_eq(s, n, "lt")) *out = 11; - else if (icase_eq(s, n, "gt")) *out = 12; - else if (icase_eq(s, n, "le")) *out = 13; - else if (icase_eq(s, n, "al")) *out = 14; - else return 0; - return 1; -} - -static u32 parse_cond(AsmDriver* d, const char* what) { - AsmTok t = asm_driver_next(d); - u32 cond = 0; - if (t.kind != ASM_TOK_IDENT || !parse_cond_from_ident(d, t.v.ident, &cond)) - asm_driver_panic(d, "asm: %s: expected condition code", what); - return cond; -} - -static void expect_comma(AsmDriver* d, const char* what) { - if (!asm_driver_eat_comma(d)) - asm_driver_panic(d, 
"asm: expected ',' (%s)", what); -} - -/* ---- per-mnemonic parsers ---- */ - -/* ret [Xn] — Xn defaults to x30. */ -static void p_ret(AsmDriver* d) { - if (asm_driver_at_eol(d)) { - emit32(d, aa64_ret(30)); - return; - } - AA64Reg r = parse_reg(d); - if (!r.is64) asm_driver_panic(d, "asm: ret: 64-bit register expected"); - emit32(d, aa64_ret(r.num)); -} - -static void p_br(AsmDriver* d) { - AA64Reg r = parse_reg(d); - if (!r.is64) asm_driver_panic(d, "asm: br: 64-bit register expected"); - emit32(d, aa64_br(r.num)); -} - -static void p_blr(AsmDriver* d) { - AA64Reg r = parse_reg(d); - if (!r.is64) asm_driver_panic(d, "asm: blr: 64-bit register expected"); - emit32(d, aa64_blr(r.num)); -} - -static void p_nop(AsmDriver* d) { - (void)d; - emit32(d, aa64_nop()); -} - -/* Memory barriers (DMB / DSB / ISB / CLREX). - * - * dmb <option> ; option in {sy, ish, nsh, osh, ld, st, ishld, - * ishst, nshld, nshst, oshld, oshst} - * dmb #imm4 ; numeric form - * dsb <option> | #imm4 - * isb [<option>] ; option defaults to sy when omitted - * clrex [#imm4] ; option defaults to sy (15) when omitted */ -static u32 parse_barrier_option(AsmDriver* d, int allow_dmb_ld_st) { - if (asm_driver_at_eol(d)) return AA64_BARRIER_OPT_SY; - AsmTok t = asm_driver_peek(d); - if (t.kind == ASM_TOK_IDENT) { - (void)asm_driver_next(d); - size_t n = 0; - const char* s = pool_str(asm_driver_pool(d), t.v.ident, &n); - if (icase_eq(s, n, "sy")) return AA64_BARRIER_OPT_SY; - if (icase_eq(s, n, "ish")) return AA64_BARRIER_OPT_ISH; - if (icase_eq(s, n, "ishld")) return AA64_BARRIER_OPT_ISHLD; - if (icase_eq(s, n, "ishst")) return AA64_BARRIER_OPT_ISHST; - if (icase_eq(s, n, "nsh")) return AA64_BARRIER_OPT_NSH; - if (icase_eq(s, n, "nshld")) return AA64_BARRIER_OPT_NSHLD; - if (icase_eq(s, n, "nshst")) return AA64_BARRIER_OPT_NSHST; - if (icase_eq(s, n, "osh")) return AA64_BARRIER_OPT_OSH; - if (icase_eq(s, n, "oshld")) return AA64_BARRIER_OPT_OSHLD; - if (icase_eq(s, n, "oshst")) return 
AA64_BARRIER_OPT_OSHST; - if (allow_dmb_ld_st) { - if (icase_eq(s, n, "ld")) return AA64_BARRIER_OPT_LD; - if (icase_eq(s, n, "st")) return AA64_BARRIER_OPT_ST; - } - asm_driver_panic(d, "asm: unknown barrier option"); - } - /* Numeric form: '#imm4'. */ - i64 imm = parse_imm_const(d); - if (imm < 0 || imm > 15) - asm_driver_panic(d, "asm: barrier imm out of range"); - return (u32)imm; -} - -static void p_dmb(AsmDriver* d) { - u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/1); - emit32(d, aa64_dmb(opt)); -} -static void p_dsb(AsmDriver* d) { - u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0); - emit32(d, aa64_dsb(opt)); -} -static void p_isb(AsmDriver* d) { - u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0); - emit32(d, aa64_isb(opt)); -} -static void p_clrex(AsmDriver* d) { - u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0); - emit32(d, aa64_clrex(opt)); -} - -/* mov: - * mov Rd, Rm → ORR Rd, ZR, Rm - * mov Rd, #imm → MOVZ (if imm fits in a single halfword unshifted) - * MOVN (if ~imm fits) - * otherwise: panic (multi-step expansion deferred). */ -static void p_mov(AsmDriver* d) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "mov"); - AsmTok t = asm_driver_peek(d); - if (t.kind == ASM_TOK_IDENT) { - AA64Reg src; - memset(&src, 0, sizeof src); - if (parse_reg_from_ident(d, t.v.ident, &src)) { - (void)asm_driver_next(d); - if (src.is64 != rd.is64) - asm_driver_panic(d, "asm: mov: register width mismatch"); - /* mov involving SP encodes as `ADD Rd, Rsp, #0` per AArch64; - * approximate with that exact form. */ - if (rd.is_sp || src.is_sp) { - require_sp_spelling(d, rd, "mov sp"); - require_sp_spelling(d, src, "mov sp"); - emit32(d, aa64_add_imm(rd.is64, rd.num, src.num, 0, 0)); - return; - } - emit32(d, aa64_mov_reg(rd.is64, rd.num, src.num)); - return; - } - /* fall through: identifier that is not a register → treat as - * symbol/equate via expression below. */ - } - /* Immediate. 
*/ - i64 imm = parse_imm_const(d); - if (rd.is_sp) asm_driver_panic(d, "asm: mov: cannot move imm into SP"); - u64 uv = (u64)imm; - u64 mask = rd.is64 ? ~0ull : 0xffffffffull; - uv &= mask; - /* Try MOVZ with one of four halfwords. */ - for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) { - u64 shift = (u64)hw * 16; - u64 hwmask = 0xffffull << shift; - if ((uv & ~hwmask) == 0) { - u32 v = (u32)((uv >> shift) & 0xffff); - emit32(d, aa64_movz(rd.is64, rd.num, v, hw)); - return; - } - } - /* Try MOVN with one halfword (encodes ~imm in that halfword). */ - u64 nv = (~uv) & mask; - for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) { - u64 shift = (u64)hw * 16; - u64 hwmask = 0xffffull << shift; - if ((nv & ~hwmask) == 0) { - u32 v = (u32)((nv >> shift) & 0xffff); - emit32(d, aa64_movn(rd.is64, rd.num, v, hw)); - return; - } - } - asm_driver_panic(d, "asm: mov: immediate cannot be encoded in one insn"); -} - -/* mvn Rd, Rm */ -static void p_mvn(AsmDriver* d) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "mvn"); - AA64Reg rm = parse_reg(d); - if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: mvn: width mismatch"); - emit32(d, aa64_mvn(rd.is64, rd.num, rm.num)); -} - -/* movz / movn / movk Rd, #imm[, lsl #shift] */ -static void p_movwide(AsmDriver* d, u32 opc) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "movz/n/k"); - i64 imm = parse_imm_const(d); - u32 hw = 0; - if (asm_driver_eat_comma(d)) { - /* lsl #N (N is 0/16/32/48). 
*/ - AsmTok lid = asm_driver_next(d); - if (lid.kind != ASM_TOK_IDENT) - asm_driver_panic(d, "asm: expected 'lsl'"); - size_t ln = 0; - const char* lp = pool_str(asm_driver_pool(d), lid.v.ident, &ln); - if (!lp || !icase_eq(lp, ln, "lsl")) - asm_driver_panic(d, "asm: expected 'lsl'"); - i64 sh = parse_imm_const(d); - if (sh % 16 != 0 || sh < 0 || sh > 48) - asm_driver_panic(d, "asm: movz/n/k: bad lsl shift"); - hw = (u32)(sh / 16); - } - u32 word = ((rd.is64 & 1u) << 31) | ((opc & 3u) << 29) | - AA64_MOVEWIDE_FAMILY_MATCH | ((hw & 3u) << 21) | - (((u32)imm & 0xffffu) << 5) | (rd.num & 0x1fu); - emit32(d, word); -} - -/* svc / brk / hlt #imm */ -static void p_except(AsmDriver* d, u32 form) { - i64 imm = parse_imm_const(d); - switch (form) { - case 0: emit32(d, aa64_svc((u32)imm)); break; - case 1: emit32(d, aa64_brk((u32)imm)); break; - case 2: { - /* HLT */ - u32 word = AA64_EXCEPT_FAMILY_MATCH | ((u32)2 << 21) | - (((u32)imm & 0xffffu) << 5); - emit32(d, word); - break; - } - default: asm_driver_panic(d, "asm: bad exception form"); - } -} - -/* Read optional `, lsl|lsr|asr|ror #imm` shift modifier. Returns 1 if - * present. */ -static int parse_shift_mod(AsmDriver* d, u32* shift_out, u32* imm6_out) { - AsmTok t = asm_driver_peek(d); - if (t.kind != ASM_TOK_IDENT) return 0; - size_t n = 0; - const char* p = pool_str(asm_driver_pool(d), t.v.ident, &n); - u32 sh; - if (icase_eq(p, n, "lsl")) sh = 0; - else if (icase_eq(p, n, "lsr")) sh = 1; - else if (icase_eq(p, n, "asr")) sh = 2; - else if (icase_eq(p, n, "ror")) sh = 3; - else return 0; - (void)asm_driver_next(d); - i64 imm = parse_imm_const(d); - if (imm < 0 || imm > 63) - asm_driver_panic(d, "asm: shift amount out of range"); - *shift_out = sh; - *imm6_out = (u32)imm; - return 1; -} - -/* add / sub family. - * Forms: - * add Rd, Rn, Rm[, lsl #s] shifted-register - * add Rd, Rn, #imm immediate - * add Rd, Rn, #imm, lsl #12 immediate w/ shift - * S-suffixed (adds/subs) sets flags. 
*/ -static void p_addsub(AsmDriver* d, int is_sub, int set_flags) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "add/sub"); - AA64Reg rn = parse_reg(d); - expect_comma(d, "add/sub"); - AsmTok t = asm_driver_peek(d); - if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') || - tok_punct(t, '+')) { - /* immediate form */ - if (rd.is64 != rn.is64) - asm_driver_panic(d, "asm: add/sub imm: width mismatch"); - require_sp_spelling(d, rn, "add/sub imm"); - if (set_flags) { - reject_sp_reg(d, rd, "add/sub imm"); - } else { - require_sp_spelling(d, rd, "add/sub imm"); - } - i64 imm = parse_imm_const(d); - u32 sh = 0; - if (asm_driver_eat_comma(d)) { - AsmTok lid = asm_driver_next(d); - if (lid.kind != ASM_TOK_IDENT) - asm_driver_panic(d, "asm: expected 'lsl #12'"); - size_t ln = 0; - const char* lp = pool_str(asm_driver_pool(d), lid.v.ident, &ln); - if (!lp || !icase_eq(lp, ln, "lsl")) - asm_driver_panic(d, "asm: expected 'lsl'"); - i64 s = parse_imm_const(d); - if (s == 12) sh = 1; - else if (s == 0) sh = 0; - else asm_driver_panic(d, "asm: add/sub imm: lsl must be 0 or 12"); - } - if (imm < 0 || imm > 0xfff) - asm_driver_panic(d, "asm: add/sub imm out of range"); - u32 word = aa64_addsubimm_pack((AA64AddSubImm){ - .sf = rd.is64, .op = (u32)is_sub, .S = (u32)set_flags, .sh = sh, - .imm12 = (u32)imm, .Rn = rn.num, .Rd = rd.num}); - emit32(d, word); - return; - } - /* register form */ - AA64Reg rm = parse_reg(d); - reject_sp_reg(d, rd, "add/sub reg"); - reject_sp_reg(d, rn, "add/sub reg"); - reject_sp_reg(d, rm, "add/sub reg"); - if (rd.is64 != rm.is64 || rd.is64 != rn.is64) - asm_driver_panic(d, "asm: add/sub reg: width mismatch"); - u32 shift = 0, imm6 = 0; - if (asm_driver_eat_comma(d)) { - if (!parse_shift_mod(d, &shift, &imm6)) - asm_driver_panic(d, "asm: add/sub reg: expected shift modifier"); - } - u32 word = aa64_addsubsr_pack((AA64AddSubSR){ - .sf = rd.is64, .op = (u32)is_sub, .S = (u32)set_flags, - .shift = shift, .Rm = rm.num, .imm6 = imm6, .Rn = 
rn.num, - .Rd = rd.num}); - emit32(d, word); -} - -/* cmp Rn, Rm | cmp Rn, #imm → SUBS ZR, Rn, ... */ -static void p_cmp(AsmDriver* d, int is_neg /* cmn flips op */) { - AA64Reg rn = parse_reg(d); - expect_comma(d, "cmp"); - AsmTok t = asm_driver_peek(d); - if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') || - tok_punct(t, '+')) { - require_sp_spelling(d, rn, "cmp imm"); - i64 imm = parse_imm_const(d); - u32 sh = 0; - if (asm_driver_eat_comma(d)) { - AsmTok lid = asm_driver_next(d); - size_t ln = 0; - const char* lp = - (lid.kind == ASM_TOK_IDENT) - ? pool_str(asm_driver_pool(d), lid.v.ident, &ln) - : NULL; - if (!lp || !icase_eq(lp, ln, "lsl")) - asm_driver_panic(d, "asm: cmp imm: expected 'lsl'"); - i64 s = parse_imm_const(d); - if (s == 12) sh = 1; - else if (s != 0) - asm_driver_panic(d, "asm: cmp imm: lsl must be 0 or 12"); - } - if (imm < 0 || imm > 0xfff) - asm_driver_panic(d, "asm: cmp imm out of range"); - u32 word = aa64_addsubimm_pack( - (AA64AddSubImm){.sf = rn.is64, .op = (u32)(!is_neg), .S = 1, - .sh = sh, .imm12 = (u32)imm, .Rn = rn.num, - .Rd = AA64_ZR}); - emit32(d, word); - return; - } - AA64Reg rm = parse_reg(d); - reject_sp_reg(d, rn, "cmp reg"); - reject_sp_reg(d, rm, "cmp reg"); - if (rm.is64 != rn.is64) asm_driver_panic(d, "asm: cmp: width mismatch"); - u32 shift = 0, imm6 = 0; - if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6); - u32 word = aa64_addsubsr_pack((AA64AddSubSR){ - .sf = rn.is64, .op = (u32)(!is_neg), .S = 1, .shift = shift, - .Rm = rm.num, .imm6 = imm6, .Rn = rn.num, .Rd = AA64_ZR}); - emit32(d, word); -} - -static void p_csinc(AsmDriver* d) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "csinc"); - AA64Reg rn = parse_reg(d); - expect_comma(d, "csinc"); - AA64Reg rm = parse_reg(d); - expect_comma(d, "csinc"); - u32 cond = parse_cond(d, "csinc"); - if (rd.is_sp || rn.is_sp || rm.is_sp) - asm_driver_panic(d, "asm: csinc: SP register not allowed"); - if (rd.is64 != rn.is64 || rd.is64 != rm.is64) - 
asm_driver_panic(d, "asm: csinc: width mismatch"); - u32 word = 0x1A800400u | ((u32)rd.is64 << 31) | ((rm.num & 0x1fu) << 16) | - ((cond & 0xfu) << 12) | ((rn.num & 0x1fu) << 5) | - (rd.num & 0x1fu); - emit32(d, word); -} - -/* neg / negs Rd, Rm → SUB / SUBS Rd, ZR, Rm */ -static void p_neg(AsmDriver* d, int set_flags) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "neg"); - AA64Reg rm = parse_reg(d); - reject_sp_reg(d, rd, "neg"); - reject_sp_reg(d, rm, "neg"); - if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: neg: width mismatch"); - u32 shift = 0, imm6 = 0; - if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6); - u32 word = aa64_addsubsr_pack((AA64AddSubSR){ - .sf = rd.is64, .op = 1, .S = (u32)set_flags, .shift = shift, - .Rm = rm.num, .imm6 = imm6, .Rn = AA64_ZR, .Rd = rd.num}); - emit32(d, word); -} - -/* Logical shifted-register family. */ -static void p_log_sr(AsmDriver* d, u32 opc, u32 N) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "logical"); - AA64Reg rn = parse_reg(d); - expect_comma(d, "logical"); - AA64Reg rm = parse_reg(d); - if (rd.is64 != rn.is64 || rd.is64 != rm.is64) - asm_driver_panic(d, "asm: logical: width mismatch"); - u32 shift = 0, imm6 = 0; - if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6); - u32 word = aa64_logsr_pack((AA64LogSR){ - .sf = rd.is64, .opc = opc, .shift = shift, .N = N, .Rm = rm.num, - .imm6 = imm6, .Rn = rn.num, .Rd = rd.num}); - emit32(d, word); -} - -/* Data-processing 3-source: madd/msub Rd, Rn, Rm, Ra. 
*/ -static void p_dp3(AsmDriver* d, u32 o0) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "dp3"); - AA64Reg rn = parse_reg(d); - expect_comma(d, "dp3"); - AA64Reg rm = parse_reg(d); - expect_comma(d, "dp3"); - AA64Reg ra = parse_reg(d); - if (rd.is64 != rn.is64 || rd.is64 != rm.is64 || rd.is64 != ra.is64) - asm_driver_panic(d, "asm: dp3: width mismatch"); - u32 word = aa64_dp3_pack((AA64DP3){ - .sf = rd.is64, .op31 = 0, .o0 = o0, .Rm = rm.num, .Ra = ra.num, - .Rn = rn.num, .Rd = rd.num}); - emit32(d, word); -} - -/* mul Rd, Rn, Rm → MADD Rd, Rn, Rm, ZR */ -static void p_mul(AsmDriver* d, u32 o0) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "mul"); - AA64Reg rn = parse_reg(d); - expect_comma(d, "mul"); - AA64Reg rm = parse_reg(d); - if (rd.is64 != rn.is64 || rd.is64 != rm.is64) - asm_driver_panic(d, "asm: mul: width mismatch"); - u32 word = aa64_dp3_pack((AA64DP3){ - .sf = rd.is64, .op31 = 0, .o0 = o0, .Rm = rm.num, .Ra = AA64_ZR, - .Rn = rn.num, .Rd = rd.num}); - emit32(d, word); -} - -/* DP2: udiv/sdiv/lslv/lsrv/asrv/rorv Rd, Rn, Rm. */ -static void p_dp2(AsmDriver* d, u32 opcode) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "dp2"); - AA64Reg rn = parse_reg(d); - expect_comma(d, "dp2"); - AA64Reg rm = parse_reg(d); - if (rd.is64 != rn.is64 || rd.is64 != rm.is64) - asm_driver_panic(d, "asm: dp2: width mismatch"); - u32 word = aa64_dp2_pack((AA64DP2){.sf = rd.is64, .opcode = opcode, - .Rm = rm.num, .Rn = rn.num, - .Rd = rd.num}); - emit32(d, word); -} - -/* Branch immediate / conditional / compare-and-branch. */ - -static void emit_branch_imm(AsmDriver* d, u32 op_bl, ObjSymId target, - i64 addend, i64 const_disp) { - MCEmitter* mc = asm_driver_mc(d); - /* Emit a B/BL with imm26 = 0; record a CALL26/JUMP26 reloc against - * either the symbol or the constant displacement. */ - u32 word = aa64_brimm_pack((AA64BrImm){.op = op_bl, .imm26 = 0}); - emit32(d, word); - u32 ofs = mc->pos(mc) - 4; - RelocKind k = op_bl ? 
R_AARCH64_CALL26 : R_AARCH64_JUMP26; - if (target != OBJ_SYM_NONE) { - mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, target, - addend, 1, 0); - } else { - /* Pure constant displacement is rare in real .s; reject it now. - * The recommended form is to use a label and let the assembler - * compute the displacement. */ - (void)const_disp; - asm_driver_panic(d, "asm: branch with pure constant disp not supported"); - } -} - -static void p_b(AsmDriver* d, u32 op_bl) { - ObjSymId sym = OBJ_SYM_NONE; - i64 off = 0; - /* GNU as accepts `b sym`, `bl sym+8`, etc. */ - parse_imm_sym(d, &sym, &off); - if (sym == OBJ_SYM_NONE) - asm_driver_panic(d, "asm: b/bl: symbolic target required"); - emit_branch_imm(d, op_bl, sym, off, 0); -} - -static void p_b_cond(AsmDriver* d, u32 cond) { - ObjSymId sym = OBJ_SYM_NONE; - i64 off = 0; - parse_imm_sym(d, &sym, &off); - if (sym == OBJ_SYM_NONE) - asm_driver_panic(d, "asm: b.cond: symbolic target required"); - /* Emit the instruction with imm19=0 + R_AARCH64_CONDBR19 reloc. */ - u32 word = aa64_brcond_pack((AA64BrCond){.imm19 = 0, .cond = cond}); - emit32(d, word); - MCEmitter* mc = asm_driver_mc(d); - u32 ofs = mc->pos(mc) - 4; - mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, - R_AARCH64_CONDBR19, sym, off, 1, 0); -} - -static void p_cbz(AsmDriver* d, u32 op) { - AA64Reg rt = parse_reg(d); - expect_comma(d, "cbz"); - ObjSymId sym = OBJ_SYM_NONE; - i64 off = 0; - parse_imm_sym(d, &sym, &off); - if (sym == OBJ_SYM_NONE) - asm_driver_panic(d, "asm: cbz: symbolic target required"); - u32 word = aa64_cb_pack((AA64CB){.sf = rt.is64, .op = op, .imm19 = 0, - .Rt = rt.num}); - emit32(d, word); - MCEmitter* mc = asm_driver_mc(d); - u32 ofs = mc->pos(mc) - 4; - mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, - R_AARCH64_CONDBR19, sym, off, 1, 0); -} - -/* Memory-operand parser for [Xn], [Xn, #imm], [Xn, #imm]!. - * - * pre_index_out is 1 when the closing `]!` appeared (pre-indexed). 
- * imm is the literal byte offset (no scaling). */ -typedef struct AA64Mem { - AA64Reg base; - i64 imm; /* byte offset (literal as written) */ - u8 pre_index; - u8 has_offset; - u8 pad[2]; -} AA64Mem; - -static AA64Mem parse_mem(AsmDriver* d) { - AA64Mem m; - memset(&m, 0, sizeof m); - if (!asm_driver_eat_punct(d, '[')) - asm_driver_panic(d, "asm: expected '['"); - m.base = parse_reg(d); - if (!m.base.is64) - asm_driver_panic(d, "asm: ldr/str: base register must be 64-bit"); - require_sp_spelling(d, m.base, "ldr/str base"); - if (asm_driver_eat_comma(d)) { - m.imm = parse_imm_const(d); - m.has_offset = 1; - } - if (!asm_driver_eat_punct(d, ']')) - asm_driver_panic(d, "asm: expected ']'"); - if (asm_driver_eat_punct(d, '!')) m.pre_index = 1; - return m; -} - -/* ldr/str Rt, [Xn, #imm] — chooses scaled or unscaled form based on - * alignment of imm. */ -static void p_ldr_str(AsmDriver* d, int is_load) { - AA64Reg rt = parse_reg(d); - reject_sp_reg(d, rt, "ldr/str"); - expect_comma(d, "ldr/str"); - AA64Mem m = parse_mem(d); - u32 size = rt.is64 ? 3u : 2u; - u32 opc = is_load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR; - if (!m.pre_index) { - /* Try scaled unsigned-imm12 first. */ - u32 scale = 1u << size; - if (m.imm >= 0 && (i64)((u64)m.imm % scale) == 0 && - (u64)m.imm / scale <= 0xfff) { - u32 imm12 = (u32)((u64)m.imm / scale); - u32 word = aa64_ldst_uimm_pack((AA64LdStUimm){ - .size = size, .V = 0, .opc = opc, .imm12 = imm12, - .Rn = m.base.num, .Rt = rt.num}); - emit32(d, word); - return; - } - /* Fall back to unscaled signed-imm9 (LDUR/STUR). 
*/ - if (m.imm >= -256 && m.imm <= 255) { - u32 imm9 = (u32)((u64)m.imm & 0x1ffu); - u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){ - .size = size, .V = 0, .opc = opc, .imm9 = imm9, - .Rn = m.base.num, .Rt = rt.num}); - emit32(d, word); - return; - } - asm_driver_panic(d, "asm: ldr/str: immediate out of range"); - } - asm_driver_panic(d, "asm: ldr/str: pre-indexed form not yet supported"); -} - -/* ldur/stur — unscaled signed-imm9. */ -static void p_ldur_stur(AsmDriver* d, int is_load) { - AA64Reg rt = parse_reg(d); - reject_sp_reg(d, rt, "ldur/stur"); - expect_comma(d, "ldur/stur"); - AA64Mem m = parse_mem(d); - u32 size = rt.is64 ? 3u : 2u; - if (m.imm < -256 || m.imm > 255) - asm_driver_panic(d, "asm: ldur/stur: imm9 out of range"); - u32 imm9 = (u32)((u64)m.imm & 0x1ffu); - u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){ - .size = size, .V = 0, - .opc = is_load ? AA64_LDST_OPC_LDR : AA64_LDST_OPC_STR, - .imm9 = imm9, .Rn = m.base.num, .Rt = rt.num}); - emit32(d, word); -} - -/* ldp / stp Rt, Rt2, [Xn, #imm] or [Xn, #imm]! */ -static void p_ldp_stp(AsmDriver* d, int is_load) { - AA64Reg rt = parse_ldstp_reg(d); - expect_comma(d, "ldp/stp"); - AA64Reg rt2 = parse_ldstp_reg(d); - expect_comma(d, "ldp/stp"); - reject_sp_reg(d, rt, "ldp/stp"); - reject_sp_reg(d, rt2, "ldp/stp"); - if (rt.is64 != rt2.is64 || rt.is_fp != rt2.is_fp) - asm_driver_panic(d, "asm: ldp/stp: width mismatch"); - AA64Mem m = parse_mem(d); - u32 scale = rt.is64 ? 8u : 4u; - if ((i64)((u64)m.imm % scale) != 0) - asm_driver_panic(d, "asm: ldp/stp: imm not scale-aligned"); - i64 imm7 = m.imm / (i64)scale; - if (imm7 < -64 || imm7 > 63) - asm_driver_panic(d, "asm: ldp/stp: imm7 out of range"); - AA64LdStPPre f = {.opc = rt.is_fp ? 1u : (rt.is64 ? 2u : 0u), - .V = rt.is_fp ? 1u : 0u, - .L = is_load ? 
1u : 0u, - .imm7 = (u32)imm7 & 0x7fu, - .Rt2 = rt2.num, - .Rn = m.base.num, - .Rt = rt.num}; - if (m.pre_index) - emit32(d, aa64_ldstp_pre_pack(f)); - else - emit32(d, aa64_ldstp_soff_pack(f)); -} - -/* adr / adrp Rd, sym */ -static void p_adr(AsmDriver* d, int is_adrp) { - AA64Reg rd = parse_reg(d); - expect_comma(d, "adr"); - ObjSymId sym = OBJ_SYM_NONE; - i64 off = 0; - parse_imm_sym(d, &sym, &off); - if (sym == OBJ_SYM_NONE) - asm_driver_panic(d, "asm: adr/adrp: symbol required"); - AA64PCRelAdr f = {.op = is_adrp ? AA64_ADR_OP_ADRP : AA64_ADR_OP_ADR, - .immlo = 0, .immhi = 0, .Rd = rd.num}; - emit32(d, aa64_pcrel_adr_pack(f)); - MCEmitter* mc = asm_driver_mc(d); - u32 ofs = mc->pos(mc) - 4; - RelocKind k = is_adrp ? R_AARCH64_ADR_PREL_PG_HI21 : R_AARCH64_ADR_PREL_LO21; - mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, sym, off, 1, 0); -} - -/* ---- mnemonic dispatch table ---- */ - -typedef void (*P_Fn)(AsmDriver*); - -typedef struct AA64Mn { - const char* name; - P_Fn fn; - u32 arg; /* per-fn discriminator (alias parameter) */ -} AA64Mn; - -/* Wrapper functions for the discriminator-taking parsers, since the - * table holds a uniform P_Fn pointer. Each wraps a single (fn, arg) - * tuple. 
*/ -static void p_addsub_add(AsmDriver* d) { p_addsub(d, /*is_sub=*/0, 0); } -static void p_addsub_adds(AsmDriver* d) { p_addsub(d, 0, 1); } -static void p_addsub_sub(AsmDriver* d) { p_addsub(d, 1, 0); } -static void p_addsub_subs(AsmDriver* d) { p_addsub(d, 1, 1); } -static void p_cmp_w(AsmDriver* d) { p_cmp(d, 0); } -static void p_cmn_w(AsmDriver* d) { p_cmp(d, 1); } -static void p_csinc_(AsmDriver* d) { p_csinc(d); } -static void p_neg_w(AsmDriver* d) { p_neg(d, 0); } -static void p_negs_w(AsmDriver* d) { p_neg(d, 1); } -static void p_and_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 0); } -static void p_bic_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 1); } -static void p_orr_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 0); } -static void p_orn_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 1); } -static void p_eor_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 0); } -static void p_eon_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 1); } -static void p_ands_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 0); } -static void p_bics_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 1); } -static void p_madd(AsmDriver* d) { p_dp3(d, 0); } -static void p_msub(AsmDriver* d) { p_dp3(d, 1); } -static void p_mul_w(AsmDriver* d) { p_mul(d, 0); } -static void p_mneg_w(AsmDriver* d) { p_mul(d, 1); } -static void p_udiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_UDIV_OP); } -static void p_sdiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_SDIV_OP); } -static void p_lslv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSLV_OP); } -static void p_lsrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSRV_OP); } -static void p_asrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_ASRV_OP); } -static void p_rorv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_RORV_OP); } -static void p_b_(AsmDriver* d) { p_b(d, 0); } -static void p_bl_(AsmDriver* d) { p_b(d, 1); } -static void p_cbz_(AsmDriver* d) { p_cbz(d, 0); } -static void p_cbnz_(AsmDriver* d) { p_cbz(d, 1); } -static void p_movz_(AsmDriver* d) { p_movwide(d, AA64_MOVZ_OPC); } 
-static void p_movn_(AsmDriver* d) { p_movwide(d, AA64_MOVN_OPC); } -static void p_movk_(AsmDriver* d) { p_movwide(d, AA64_MOVK_OPC); } -static void p_svc_(AsmDriver* d) { p_except(d, 0); } -static void p_brk_(AsmDriver* d) { p_except(d, 1); } -static void p_hlt_(AsmDriver* d) { p_except(d, 2); } -static void p_ldr_(AsmDriver* d) { p_ldr_str(d, 1); } -static void p_str_(AsmDriver* d) { p_ldr_str(d, 0); } -static void p_ldur_(AsmDriver* d) { p_ldur_stur(d, 1); } -static void p_stur_(AsmDriver* d) { p_ldur_stur(d, 0); } -static void p_ldp_(AsmDriver* d) { p_ldp_stp(d, 1); } -static void p_stp_(AsmDriver* d) { p_ldp_stp(d, 0); } -static void p_adr_(AsmDriver* d) { p_adr(d, 0); } -static void p_adrp_(AsmDriver* d) { p_adr(d, 1); } - -/* b.cond family. cond codes follow the standard ARMv8 numbering. */ -static void p_b_eq(AsmDriver* d) { p_b_cond(d, 0); } -static void p_b_ne(AsmDriver* d) { p_b_cond(d, 1); } -static void p_b_cs(AsmDriver* d) { p_b_cond(d, 2); } -static void p_b_hs(AsmDriver* d) { p_b_cond(d, 2); } -static void p_b_cc(AsmDriver* d) { p_b_cond(d, 3); } -static void p_b_lo(AsmDriver* d) { p_b_cond(d, 3); } -static void p_b_mi(AsmDriver* d) { p_b_cond(d, 4); } -static void p_b_pl(AsmDriver* d) { p_b_cond(d, 5); } -static void p_b_vs(AsmDriver* d) { p_b_cond(d, 6); } -static void p_b_vc(AsmDriver* d) { p_b_cond(d, 7); } -static void p_b_hi(AsmDriver* d) { p_b_cond(d, 8); } -static void p_b_ls(AsmDriver* d) { p_b_cond(d, 9); } -static void p_b_ge(AsmDriver* d) { p_b_cond(d, 10); } -static void p_b_lt(AsmDriver* d) { p_b_cond(d, 11); } -static void p_b_gt(AsmDriver* d) { p_b_cond(d, 12); } -static void p_b_le(AsmDriver* d) { p_b_cond(d, 13); } -static void p_b_al(AsmDriver* d) { p_b_cond(d, 14); } - -static const AA64Mn kTable[] = { - {"nop", p_nop, 0}, - {"dmb", p_dmb, 0}, - {"dsb", p_dsb, 0}, - {"isb", p_isb, 0}, - {"clrex", p_clrex, 0}, - {"ret", p_ret, 0}, - {"br", p_br, 0}, - {"blr", p_blr, 0}, - {"mov", p_mov, 0}, - {"mvn", p_mvn, 0}, - {"movz", p_movz_, 
0}, - {"movn", p_movn_, 0}, - {"movk", p_movk_, 0}, - {"add", p_addsub_add, 0}, - {"adds", p_addsub_adds, 0}, - {"sub", p_addsub_sub, 0}, - {"subs", p_addsub_subs, 0}, - {"cmp", p_cmp_w, 0}, - {"cmn", p_cmn_w, 0}, - {"csinc", p_csinc_, 0}, - {"neg", p_neg_w, 0}, - {"negs", p_negs_w, 0}, - {"and", p_and_w, 0}, - {"bic", p_bic_w, 0}, - {"orr", p_orr_w, 0}, - {"orn", p_orn_w, 0}, - {"eor", p_eor_w, 0}, - {"eon", p_eon_w, 0}, - {"ands", p_ands_w, 0}, - {"bics", p_bics_w, 0}, - {"madd", p_madd, 0}, - {"msub", p_msub, 0}, - {"mul", p_mul_w, 0}, - {"mneg", p_mneg_w, 0}, - {"udiv", p_udiv_w, 0}, - {"sdiv", p_sdiv_w, 0}, - {"lslv", p_lslv_w, 0}, - {"lsrv", p_lsrv_w, 0}, - {"asrv", p_asrv_w, 0}, - {"rorv", p_rorv_w, 0}, - {"b", p_b_, 0}, - {"bl", p_bl_, 0}, - {"cbz", p_cbz_, 0}, - {"cbnz", p_cbnz_, 0}, - {"svc", p_svc_, 0}, - {"brk", p_brk_, 0}, - {"hlt", p_hlt_, 0}, - {"ldr", p_ldr_, 0}, - {"str", p_str_, 0}, - {"ldur", p_ldur_, 0}, - {"stur", p_stur_, 0}, - {"ldp", p_ldp_, 0}, - {"stp", p_stp_, 0}, - {"adr", p_adr_, 0}, - {"adrp", p_adrp_, 0}, - {"b.eq", p_b_eq, 0}, {"b.ne", p_b_ne, 0}, - {"b.cs", p_b_cs, 0}, {"b.hs", p_b_hs, 0}, - {"b.cc", p_b_cc, 0}, {"b.lo", p_b_lo, 0}, - {"b.mi", p_b_mi, 0}, {"b.pl", p_b_pl, 0}, - {"b.vs", p_b_vs, 0}, {"b.vc", p_b_vc, 0}, - {"b.hi", p_b_hi, 0}, {"b.ls", p_b_ls, 0}, - {"b.ge", p_b_ge, 0}, {"b.lt", p_b_lt, 0}, - {"b.gt", p_b_gt, 0}, {"b.le", p_b_le, 0}, - {"b.al", p_b_al, 0}, - {NULL, NULL, 0}, -}; - -void aa64_asm_insn(AA64Asm* a, AsmDriver* d, Sym mnemonic) { - (void)a; - size_t mn = 0; - const char* mp = pool_str(asm_driver_pool(d), mnemonic, &mn); - for (const AA64Mn* row = kTable; row->name; ++row) { - if (icase_eq(mp, mn, row->name)) { - row->fn(d); - return; - } - } - asm_driver_panic(d, "asm: unknown mnemonic"); -} - -/* ---- inline-asm template walker (Phase 4b Track C) ---- */ - -/* Per-call rendered-line buffer. 
GCC's inline asm rarely emits more - * than a handful of instructions per block; one line of substituted - * text fits comfortably inside this. Truncation panics — the operator - * grammar should never grow a single line beyond this without a - * deliberate reason. */ -#define AA64_INLINE_LINE_CAP 1024 - -/* Render a 5-bit register number into the StrBuf using the requested - * width form. is64 picks x-form vs w-form; SP / ZR encode as - * register #31 and we render them as wzr/xzr or wsp/sp depending on - * caller intent — for inline-asm v1 the bound operand always names a - * GP register, never SP, so we emit wzr/xzr for #31. */ -static void render_reg(StrBuf* sb, u32 reg, int is64) { - if (reg == 31u) { - strbuf_puts(sb, is64 ? "xzr" : "wzr"); - return; - } - strbuf_putc(sb, is64 ? 'x' : 'w'); - if (reg >= 10u) strbuf_putc(sb, (char)('0' + (reg / 10u))); - strbuf_putc(sb, (char)('0' + (reg % 10u))); -} - -/* Render a signed 64-bit integer prefixed with '#'. */ -static void render_imm(StrBuf* sb, i64 v) { - strbuf_putc(sb, '#'); - strbuf_put_i64(sb, v); -} - -/* Render an addressing form `[xN, #ofs]` for OPK_INDIRECT. */ -static void render_indirect(StrBuf* sb, Reg base, i32 ofs) { - strbuf_putc(sb, '['); - render_reg(sb, (u32)base, /*is64=*/1); - if (ofs != 0) { - strbuf_puts(sb, ", "); - render_imm(sb, (i64)ofs); - } - strbuf_putc(sb, ']'); -} - -_Noreturn static void inline_panic(AA64Asm* a, const char* msg) { - SrcLoc loc = {0, 0, 0}; - compiler_panic(a->c, loc, "inline asm: %s", msg); -} - -/* Resolve operand index N → (kind=0 forced default, 1=force-w, 2=force-x, - * 3=address form `%aN`). Renders into sb. */ -static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) { - u32 ntot = a->nout + a->nin; - if (idx >= ntot) inline_panic(a, "operand index out of range"); - const Operand* op = (idx < a->nout) ? 
&a->out_ops[idx] - : &a->in_ops[idx - a->nout]; - switch (form) { - case 1: /* %wN — force 32-bit register form */ - if (op->kind != OPK_REG) - inline_panic(a, "%w on non-register operand"); - render_reg(sb, (u32)op->v.reg, /*is64=*/0); - return; - case 2: /* %xN — force 64-bit register form */ - if (op->kind != OPK_REG) - inline_panic(a, "%x on non-register operand"); - render_reg(sb, (u32)op->v.reg, /*is64=*/1); - return; - case 3: /* %aN — memory addressing form */ - if (op->kind != OPK_INDIRECT) - inline_panic(a, "%a on non-memory operand"); - render_indirect(sb, op->v.ind.base, op->v.ind.ofs); - return; - default: - break; - } - /* Default rendering by operand kind. */ - switch (op->kind) { - case OPK_REG: - render_reg(sb, (u32)op->v.reg, /*is64=*/1); - return; - case OPK_IMM: - render_imm(sb, op->v.imm); - return; - case OPK_INDIRECT: - render_indirect(sb, op->v.ind.base, op->v.ind.ofs); - return; - default: - inline_panic(a, "unsupported operand kind for %N"); - } -} - -/* Lex one line of substituted asm and dispatch via aa64_asm_insn. */ -static void run_one_line(AA64Asm* a, MCEmitter* mc, const char* text, - size_t len) { - /* Skip blank lines. */ - size_t i; - for (i = 0; i < len; ++i) { - if (text[i] != ' ' && text[i] != '\t') break; - } - if (i == len) return; - - AsmLexer* lx = asm_lex_open_mem(a->c, "<inline-asm>", text, len); - AsmDriver* d = asm_driver_open_inline(a->c, mc, lx); - - /* The first non-trivial token must be the mnemonic identifier (or a - * `.directive`, but inline asm doesn't normally use directives — leave - * that path unsupported until needed). */ - AsmTok t = asm_driver_peek(d); - while (t.kind == ASM_TOK_NEWLINE || t.kind == ASM_TOK_HASH) { - (void)asm_driver_next(d); - if (t.kind == ASM_TOK_HASH) { - /* Skip cpp linemarker rest of line. 
*/ - while (!asm_driver_at_eol(d)) (void)asm_driver_next(d); - } - t = asm_driver_peek(d); - } - if (t.kind == ASM_TOK_EOF) { - asm_driver_close_inline(d); - asm_lex_close(lx); - return; - } - if (t.kind != ASM_TOK_IDENT) - inline_panic(a, "expected mnemonic at start of inline asm line"); - (void)asm_driver_next(d); - Sym mn = t.v.ident; - /* Compose `b.eq` etc. — same trick as the standalone driver. */ - AsmTok dot = asm_driver_peek(d); - if (asm_driver_tok_is_punct(dot, '.')) { - (void)asm_driver_next(d); - AsmTok rest = asm_driver_next(d); - if (rest.kind != ASM_TOK_IDENT) - inline_panic(a, "composite mnemonic: expected ident after '.'"); - size_t hn = 0, rn = 0; - const char* hp = pool_str(asm_driver_pool(d), mn, &hn); - const char* rp = pool_str(asm_driver_pool(d), rest.v.ident, &rn); - char buf[64]; - if (hn + 1 + rn >= sizeof buf) - inline_panic(a, "composite mnemonic too long"); - for (size_t k = 0; k < hn; ++k) buf[k] = hp[k]; - buf[hn] = '.'; - for (size_t k = 0; k < rn; ++k) buf[hn + 1 + k] = rp[k]; - mn = pool_intern(asm_driver_pool(d), buf, hn + 1 + rn); - } - aa64_asm_insn(a, d, mn); - asm_driver_close_inline(d); - asm_lex_close(lx); -} - -/* Substitute placeholders into one line's StrBuf, then dispatch. - * - * The input range is [start, end) inside `tmpl`. Updates `*line_idx` - * is not used — the caller resets the StrBuf between lines. */ -static void render_and_run_line(AA64Asm* a, MCEmitter* mc, StrBuf* sb, - const char* start, const char* end) { - strbuf_reset(sb); - for (const char* p = start; p < end; ++p) { - char c = *p; - if (c != '%') { - strbuf_putc(sb, c); - continue; - } - /* Placeholder. */ - if (p + 1 >= end) inline_panic(a, "trailing '%' in template"); - char n = *(p + 1); - if (n == '%') { - strbuf_putc(sb, '%'); - ++p; - continue; - } - if (n == '[') { - /* %[name] — scan to the closing ']' and resolve against - * AsmConstraint.name on the combined outs+ins list. 
Match by - * comparing the named-bracket contents against the interned name - * Sym stored on each constraint. */ - const char* nbeg = p + 2; - const char* nend = nbeg; - while (nend < end && *nend != ']') ++nend; - if (nend == end) inline_panic(a, "unterminated %[name]"); - size_t nlen = (size_t)(nend - nbeg); - Sym needle = pool_intern(a->c->global, nbeg, nlen); - u32 idx = (u32)-1; - for (u32 k = 0; k < a->nout; ++k) { - if (a->outs[k].name == needle) { idx = k; break; } - } - if (idx == (u32)-1) { - for (u32 k = 0; k < a->nin; ++k) { - if (a->ins[k].name == needle) { idx = a->nout + k; break; } - } - } - if (idx == (u32)-1) - inline_panic(a, "%[name] does not match any constraint"); - p = nend; /* loop's ++p steps past the ']' */ - render_operand(a, sb, idx, 0); - continue; - } - int form = 0; /* 0=default, 1=w, 2=x, 3=a */ - if (n == 'w' || n == 'x' || n == 'a') { - form = (n == 'w') ? 1 : (n == 'x') ? 2 : 3; - ++p; - if (p + 1 >= end) inline_panic(a, "trailing '%' modifier in template"); - n = *(p + 1); - } - if (n == '[') { - /* %w[name] / %x[name] / %a[name] — width modifier + symbolic - * operand. Resolves the same way as %[name] but renders with the - * declared form. 
*/ - const char* nbeg = p + 2; - const char* nend = nbeg; - while (nend < end && *nend != ']') ++nend; - if (nend == end) inline_panic(a, "unterminated %[name]"); - size_t nlen = (size_t)(nend - nbeg); - Sym needle = pool_intern(a->c->global, nbeg, nlen); - u32 idx = (u32)-1; - for (u32 k = 0; k < a->nout; ++k) { - if (a->outs[k].name == needle) { idx = k; break; } - } - if (idx == (u32)-1) { - for (u32 k = 0; k < a->nin; ++k) { - if (a->ins[k].name == needle) { idx = a->nout + k; break; } - } - } - if (idx == (u32)-1) - inline_panic(a, "%[name] does not match any constraint"); - p = nend; /* loop's ++p steps past the ']' */ - render_operand(a, sb, idx, form); - continue; - } - if (n < '0' || n > '9') - inline_panic(a, "expected digit after '%'"); - u32 idx = (u32)(n - '0'); - ++p; - /* GCC syntax permits up to two digits (%0..%99). */ - if (p + 1 < end && *(p + 1) >= '0' && *(p + 1) <= '9') { - idx = idx * 10 + (u32)(*(p + 1) - '0'); - ++p; - } - render_operand(a, sb, idx, form); - } - if (sb->truncated) inline_panic(a, "inline asm line buffer overflow"); - run_one_line(a, mc, strbuf_cstr(sb), strbuf_len(sb)); -} - -void aa64_asm_run_template(AA64Asm* a, MCEmitter* mc, const char* tmpl) { - if (!tmpl || !*tmpl) return; - - char buf[AA64_INLINE_LINE_CAP]; - StrBuf sb; - strbuf_init(&sb, buf, sizeof buf); - - /* Walk tmpl, splitting on '\n' and ';' line terminators. Track bracket - * depth and quote state so that a literal ';' inside `[ ... ]` or a - * quoted string is not mistaken for a statement separator. 
*/ - const char* line_start = tmpl; - int bracket = 0; - char quote = 0; - for (const char* p = tmpl;; ++p) { - char c = *p; - if (c == '\0') { - render_and_run_line(a, mc, &sb, line_start, p); - break; - } - if (quote) { - if (c == '\\' && *(p + 1)) { - ++p; - continue; - } - if (c == quote) quote = 0; - continue; - } - if (c == '"' || c == '\'') { - quote = c; - continue; - } - if (c == '[') { - ++bracket; - continue; - } - if (c == ']') { - if (bracket) --bracket; - continue; - } - if (bracket == 0 && (c == '\n' || c == ';')) { - render_and_run_line(a, mc, &sb, line_start, p); - line_start = p + 1; - } - } -} diff --git a/src/arch/aa64_disasm.c b/src/arch/aa64_disasm.c @@ -1,133 +0,0 @@ -/* AArch64 disassembler implementation. - * - * Decodes one 4-byte instruction word per call into a CfreeInsn whose - * string fields point into iterator-owned StrBufs. The decoder shares - * the aa64_isa.{h,c} descriptor table with the encoder: aa64_disasm_find - * matches the word; aa64_print_operands renders operand text via the - * format's unpack + per-format pretty-printer. Mnemonic rewriting (the - * one bit the printer can't own, because b.cond rolls cond into the - * "operand" text) happens here. */ - -#include "arch/aa64_disasm.h" - -#include <string.h> - -#include "arch/aa64_isa.h" -#include "core/heap.h" -#include "core/strbuf.h" - -/* Enough for any aarch64 mnemonic-with-suffix ("b.cond" → "b.le", etc.). */ -#define AA64_DASM_MNEM_CAP 16u -/* Operand text. The widest cases (LDP X, X, [SP, #-imm]!) fit easily. */ -#define AA64_DASM_OPS_CAP 96u -/* Annotation overlay (symbol + addend). 
*/ -#define AA64_DASM_ANN_CAP 96u - -typedef struct AA64Disasm { - ArchDisasm base; - Compiler* c; - Heap* heap; - char mnem_buf[AA64_DASM_MNEM_CAP]; - char ops_buf[AA64_DASM_OPS_CAP]; - char ann_buf[AA64_DASM_ANN_CAP]; - StrBuf mnem; - StrBuf ops; - StrBuf ann; -} AA64Disasm; - -static const char* aa64_cond_names[16] = { - "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", - "hi", "ls", "ge", "lt", "gt", "le", "al", "nv", -}; - -static void aa64_write_mnemonic(AA64Disasm* d, const AA64InsnDesc* desc, - u32 word) { - strbuf_reset(&d->mnem); - if (desc->fmt == AA64_FMT_BR_COND) { - /* Synthesize "b.<cond>" so the operands buffer can hold just the - * target. Matches GNU as / objdump conventions. */ - u32 cond = word & 0xfu; - strbuf_puts(&d->mnem, "b."); - strbuf_puts(&d->mnem, aa64_cond_names[cond]); - return; - } - strbuf_puts(&d->mnem, desc->mnemonic); -} - -static void aa64_write_operands(AA64Disasm* d, const AA64InsnDesc* desc, - u32 word, u64 vaddr) { - strbuf_reset(&d->ops); - if (desc->fmt == AA64_FMT_BR_COND) { - /* aa64_print_operands prints "<cond> <target>"; we already lifted - * the cond into the mnemonic, so skip the dispatcher and inline - * just the target. 
*/ - AA64BrCond f = aa64_brcond_unpack(word); - i64 ofs = (i64)((u64)f.imm19 & 0x7ffffu); - /* sign-extend 19 bits */ - if (ofs & 0x40000) ofs |= ~(i64)0x7ffff; - ofs *= 4; - if (vaddr) { - strbuf_put_hex_u64(&d->ops, vaddr + (u64)ofs); - } else { - strbuf_puts(&d->ops, "#"); - strbuf_put_i64(&d->ops, ofs); - } - return; - } - aa64_print_operands(&d->ops, desc, word, vaddr); -} - -static u32 aa64_read_u32_le(const u8* b) { - return (u32)b[0] | ((u32)b[1] << 8) | ((u32)b[2] << 16) | ((u32)b[3] << 24); -} - -static void aa64_write_unknown(AA64Disasm* d, u32 word) { - strbuf_reset(&d->mnem); - strbuf_puts(&d->mnem, ".inst"); - strbuf_reset(&d->ops); - strbuf_put_hex_u64(&d->ops, (u64)word); -} - -static u32 aa64_decode(ArchDisasm* base, const u8* bytes, size_t len, u64 vaddr, - CfreeInsn* out) { - AA64Disasm* d = (AA64Disasm*)base; - if (len < 4u) return 0; - u32 word = aa64_read_u32_le(bytes); - const AA64InsnDesc* desc = aa64_disasm_find(word); - if (desc) { - aa64_write_mnemonic(d, desc, word); - aa64_write_operands(d, desc, word, vaddr); - } else { - aa64_write_unknown(d, word); - } - /* Annotation overlay is owned by the public iterator (cfree_disasm_iter_*). - * The arch-level decoder leaves it empty. 
*/ - strbuf_reset(&d->ann); - out->vaddr = vaddr; - out->bytes = bytes; - out->nbytes = 4; - out->mnemonic = strbuf_cstr(&d->mnem); - out->operands = strbuf_cstr(&d->ops); - out->annotation = strbuf_cstr(&d->ann); - return 4; -} - -static void aa64_destroy(ArchDisasm* base) { - AA64Disasm* d = (AA64Disasm*)base; - d->heap->free(d->heap, d, sizeof(*d)); -} - -ArchDisasm* aa64_disasm_new(Compiler* c) { - Heap* h = (Heap*)c->env->heap; - AA64Disasm* d = (AA64Disasm*)h->alloc(h, sizeof(*d), _Alignof(AA64Disasm)); - if (!d) return NULL; - memset(d, 0, sizeof(*d)); - d->c = c; - d->heap = h; - d->base.decode = aa64_decode; - d->base.destroy = aa64_destroy; - strbuf_init(&d->mnem, d->mnem_buf, sizeof d->mnem_buf); - strbuf_init(&d->ops, d->ops_buf, sizeof d->ops_buf); - strbuf_init(&d->ann, d->ann_buf, sizeof d->ann_buf); - return &d->base; -} diff --git a/src/arch/aa64_disasm.h b/src/arch/aa64_disasm.h @@ -1,14 +0,0 @@ -#ifndef CFREE_ARCH_AA64_DISASM_H -#define CFREE_ARCH_AA64_DISASM_H - -/* AArch64 disassembler — ArchDisasm implementation. - * - * Wraps aa64_disasm_find + aa64_print_operands (src/arch/aa64_isa.{h,c}). - * The dispatcher in src/arch/disasm.c constructs one of these when the - * compiler target is CFREE_ARCH_ARM_64. */ - -#include "arch/arch.h" - -ArchDisasm* aa64_disasm_new(Compiler*); - -#endif diff --git a/src/arch/aa64_isa.c b/src/arch/aa64_isa.c @@ -1,598 +0,0 @@ -/* AArch64 instruction descriptor table + operand print/parse dispatch. - * - * The table mirrors the inline encoders in aa64_isa.h: each row records - * (mnemonic, match, mask, format, flags) so the disassembler can identify - * a raw 32-bit word with one mask-and-compare and then dispatch on the - * format to extract operand fields via the same unpack functions the - * encoder uses. Encoder and decoder share the bit knowledge — when an - * opcode value or field position changes, both sides update at one site. 
- * - * Mask values include the family mask plus the bits that distinguish a - * specific instruction from its siblings in the same family. sf (bit 31) - * is intentionally a don't-care for formats where both 32- and 64-bit - * forms share one row; the unpacker reads sf separately when printing - * operands. - * - * Row ordering: first-match wins. Aliases (rows with AA64_ASMFL_ALIAS) - * are tighter masks placed BEFORE the canonical row they alias so the - * disassembler renders the alias spelling. The assembler accepts both - * spellings — they map to the same encoded word. */ - -#include "arch/aa64_isa.h" - -#include <stddef.h> - -const AA64InsnDesc aa64_insn_table[] = { - /* ----- Move-wide immediate (MOVN / MOVZ / MOVK) ----- */ - {"movn", 0x12800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}}, - {"movz", 0x52800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}}, - {"movk", 0x72800000u, 0x7F800000u, AA64_FMT_MOVEWIDE, 0, {0, 0}}, - - /* ----- Logical, shifted register ----- - * Alias MOV Rd, Rm is ORR Rd, ZR, Rm with shift=0, imm6=0. The mask - * pins Rn (bits 9:5) to 11111 (ZR) and shift/imm6 to 0 so only the - * MOV spelling matches; broader ORR rows below catch the rest. 
*/ - {"mov", 0x2A0003E0u, 0x7FE0FFE0u, AA64_FMT_LOG_SR, AA64_ASMFL_ALIAS, - {0, 0}}, - /* MVN Rd, Rm ≡ ORN Rd, ZR, Rm (logical N=1, Rn=ZR, no shift) */ - {"mvn", 0x2A2003E0u, 0x7FE0FFE0u, AA64_FMT_LOG_SR, AA64_ASMFL_ALIAS, - {0, 0}}, - {"and", 0x0A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, - {"bic", 0x0A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, - {"orr", 0x2A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, - {"orn", 0x2A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, - {"eor", 0x4A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, - {"eon", 0x4A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, - {"ands", 0x6A000000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, - {"bics", 0x6A200000u, 0x7F200000u, AA64_FMT_LOG_SR, 0, {0, 0}}, - - /* ----- Add/Sub, shifted register ----- - * NEG Rd, Rm ≡ SUB Rd, ZR, Rm (Rn=ZR, shift=0, imm6=0). */ - {"neg", 0x4B0003E0u, 0x7FE0FFE0u, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS, - {0, 0}}, - {"negs", 0x6B0003E0u, 0x7FE0FFE0u, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS, - {0, 0}}, - /* CMP Rn, Rm ≡ SUBS ZR, Rn, Rm. */ - {"cmp", 0x6B00001Fu, 0x7F20001Fu, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS, - {0, 0}}, - /* CMN Rn, Rm ≡ ADDS ZR, Rn, Rm. */ - {"cmn", 0x2B00001Fu, 0x7F20001Fu, AA64_FMT_ADDSUB_SR, AA64_ASMFL_ALIAS, - {0, 0}}, - {"add", 0x0B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}}, - {"adds", 0x2B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}}, - {"sub", 0x4B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}}, - {"subs", 0x6B000000u, 0x7F200000u, AA64_FMT_ADDSUB_SR, 0, {0, 0}}, - - /* ----- Data-processing 3-source ----- - * MUL Rd, Rn, Rm ≡ MADD Rd, Rn, Rm, ZR (Ra=ZR, op31=0, o0=0). */ - {"mul", 0x1B007C00u, 0x7FE0FC00u, AA64_FMT_DP3, AA64_ASMFL_ALIAS, {0, 0}}, - /* MNEG Rd, Rn, Rm ≡ MSUB Rd, Rn, Rm, ZR. 
*/ - {"mneg", 0x1B00FC00u, 0x7FE0FC00u, AA64_FMT_DP3, AA64_ASMFL_ALIAS, {0, 0}}, - {"madd", 0x1B000000u, 0x7FE08000u, AA64_FMT_DP3, 0, {0, 0}}, - {"msub", 0x1B008000u, 0x7FE08000u, AA64_FMT_DP3, 0, {0, 0}}, - - /* ----- Data-processing 2-source ----- */ - {"udiv", 0x1AC00800u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, - {"sdiv", 0x1AC00C00u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, - {"lslv", 0x1AC02000u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, - {"lsrv", 0x1AC02400u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, - {"asrv", 0x1AC02800u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, - {"rorv", 0x1AC02C00u, 0x5FE0FC00u, AA64_FMT_DP2, 0, {0, 0}}, - - /* ----- Unconditional branch (register) ----- - * RET aliases its no-operand spelling to RET X30 (Rn=11110). The - * tighter row matches when Rn=30 and prints "ret" without operands; - * the looser row below catches RET Xn for other Rn. */ - {"ret", 0xD65F03C0u, 0xFFFFFFFFu, AA64_FMT_BR_REG, - AA64_ASMFL_ALIAS | AA64_ASMFL_NORN, {0, 0}}, - {"br", 0xD61F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}}, - {"blr", 0xD63F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}}, - {"ret", 0xD65F0000u, 0xFFFFFC1Fu, AA64_FMT_BR_REG, 0, {0, 0}}, - - /* ----- PC-relative addressing ----- */ - {"adr", 0x10000000u, 0x9F000000u, AA64_FMT_PCREL_ADR, 0, {0, 0}}, - {"adrp", 0x90000000u, 0x9F000000u, AA64_FMT_PCREL_ADR, 0, {0, 0}}, - - /* ----- Add/Sub immediate ----- */ - {"add", 0x11000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}}, - {"adds", 0x31000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}}, - {"sub", 0x51000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}}, - {"subs", 0x71000000u, 0x7F000000u, AA64_FMT_ADDSUB_IMM, 0, {0, 0}}, - - /* ----- Load/store, unsigned 12-bit immediate (scaled) ----- - * Mask: family bits 29:27 + 25:24 + size(31:30) + V(26) + opc(23:22). 
*/ - {"strb", 0x39000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, - {"ldrb", 0x39400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, - {"strh", 0x79000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, - {"ldrh", 0x79400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, - {"str", 0xB9000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* 32 */ - {"ldr", 0xB9400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, - {"str", 0xF9000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1, - {0, 0}}, /* 64 */ - {"ldr", 0xF9400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1, - {0, 0}}, - /* SIMD/FP scaled loads/stores (V=1). size 0..2 select B/H/S; size=3 - * selects D; the 128-bit Q form uses size=00 with opc bit 1 set and - * is not yet emitted by codegen. */ - {"str", 0x3D000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* B */ - {"ldr", 0x3D400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, - {"str", 0x7D000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* H */ - {"ldr", 0x7D400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, - {"str", 0xBD000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, /* S */ - {"ldr", 0xBD400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, 0, {0, 0}}, - {"str", 0xFD000000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1, - {0, 0}}, /* D */ - {"ldr", 0xFD400000u, 0xFFC00000u, AA64_FMT_LDST_UIMM, AA64_ASMFL_SF1, - {0, 0}}, - - /* ----- Load/store, unscaled signed 9-bit immediate (LDUR/STUR) ----- - * V=0 first, V=1 next. Per-row mask narrows size+V+opc; family mask - * pins the high family bits + the SIMM9-vs-other-variant selector. 
*/ - {"sturb", 0x38000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, - {"ldurb", 0x38400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, - {"sturh", 0x78000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, - {"ldurh", 0x78400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, - {"stur", 0xB8000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* 32 */ - {"ldur", 0xB8400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, - {"stur", 0xF8000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1, - {0, 0}}, - {"ldur", 0xF8400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1, - {0, 0}}, - {"stur", 0x3C000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* B */ - {"ldur", 0x3C400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, - {"stur", 0x7C000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* H */ - {"ldur", 0x7C400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, - {"stur", 0xBC000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, /* S */ - {"ldur", 0xBC400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, 0, {0, 0}}, - {"stur", 0xFC000000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1, - {0, 0}}, /* D */ - {"ldur", 0xFC400000u, 0xFFE00C00u, AA64_FMT_LDST_SIMM9, AA64_ASMFL_SF1, - {0, 0}}, - - /* ----- Load/store pair, pre-indexed (opc=10 X / opc=01 D) ----- */ - {"stp", 0xA9800000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, AA64_ASMFL_SF1, - {0, 0}}, - {"ldp", 0xA9C00000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, AA64_ASMFL_SF1, - {0, 0}}, - {"stp", 0x6D800000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, 0, {0, 0}}, /* D */ - {"ldp", 0x6DC00000u, 0xFFC00000u, AA64_FMT_LDSTP_PRE, 0, {0, 0}}, - - /* ----- Load/store pair, signed-offset ----- */ - {"stp", 0xA9000000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, AA64_ASMFL_SF1, - {0, 0}}, - {"ldp", 0xA9400000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, AA64_ASMFL_SF1, - {0, 0}}, - {"stp", 0x6D000000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, 0, {0, 0}}, /* D */ - {"ldp", 0x6D400000u, 0xFFC00000u, AA64_FMT_LDSTP_SOFF, 0, {0, 0}}, - - /* ----- 
Unconditional branch (immediate) ----- */ - {"b", 0x14000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}}, - {"bl", 0x94000000u, 0xFC000000u, AA64_FMT_BR_IMM, 0, {0, 0}}, - - /* ----- Conditional branch (immediate) ----- */ - {"b.cond", 0x54000000u, 0xFF000010u, AA64_FMT_BR_COND, 0, {0, 0}}, - - /* ----- Compare-and-branch ----- */ - {"cbz", 0x34000000u, 0x7F000000u, AA64_FMT_CB, 0, {0, 0}}, - {"cbnz", 0x35000000u, 0x7F000000u, AA64_FMT_CB, 0, {0, 0}}, - - /* ----- Exception generation ----- */ - {"svc", 0xD4000001u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}}, - {"brk", 0xD4200000u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}}, - {"hlt", 0xD4400000u, 0xFFE0001Fu, AA64_FMT_EXCEPT, 0, {0, 0}}, - - /* ----- Hint ----- */ - {"nop", 0xD503201Fu, 0xFFFFFFFFu, AA64_FMT_HINT, 0, {0, 0}}, - - /* ----- Memory barriers (DMB / DSB / ISB / CLREX) ----- - * Mask covers everything but CRm at bits[11:8]. */ - {"dmb", 0xD50330BFu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}}, - {"dsb", 0xD503309Fu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}}, - {"isb", 0xD50330DFu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}}, - {"clrex", 0xD503305Fu, 0xFFFFF0FFu, AA64_FMT_BARRIER, 0, {0, 0}}, -}; - -const u32 aa64_insn_table_n = - (u32)(sizeof aa64_insn_table / sizeof aa64_insn_table[0]); - -const AA64InsnDesc* aa64_disasm_find(u32 word) { - for (u32 i = 0; i < aa64_insn_table_n; ++i) { - const AA64InsnDesc* d = &aa64_insn_table[i]; - if ((word & d->mask) == d->match) return d; - } - return NULL; -} - -/* ===================================================================== - * Operand print — one helper per format. - * - * Format choices for immediates: - * - branch displacements, signed add/sub imm, signed ldur/stur ofs: - * signed decimal. - * - MOVZ/MOVK halfword, logical bitmask, exception generation #imm: - * 0x-prefixed hex. 
- * - * Register naming: ZR alias for x31 in places where the encoding treats - * Rd/Rn=31 as the zero register (logical/arith), SP where it treats 31 - * as the stack pointer (add/sub imm, ldr/str-uimm Rn, ldp/stp Rn). - * - * vaddr is folded into PC-relative branch operands when nonzero. */ - -static void emit_reg(StrBuf* sb, u32 r, int sf, int sp_means_sp) { - if (r == 31u) { - if (sp_means_sp) strbuf_puts(sb, "sp"); - else if (sf) strbuf_puts(sb, "xzr"); - else strbuf_puts(sb, "wzr"); - return; - } - strbuf_putc(sb, sf ? 'x' : 'w'); - strbuf_put_u64(sb, (u64)r); -} - -static void emit_vreg(StrBuf* sb, u32 r, char prefix) { - strbuf_putc(sb, prefix); - strbuf_put_u64(sb, (u64)r); -} - -static void emit_cond(StrBuf* sb, u32 cond) { - static const char* names[16] = {"eq", "ne", "cs", "cc", "mi", "pl", - "vs", "vc", "hi", "ls", "ge", "lt", - "gt", "le", "al", "nv"}; - strbuf_puts(sb, names[cond & 0xfu]); -} - -/* Sign-extend an n-bit value held in the low bits of v to i64. */ -static i64 sext(u64 v, u32 nbits) { - u64 mask = (nbits >= 64u) ? ~0ull : ((1ull << nbits) - 1ull); - v &= mask; - u64 sign = (nbits == 0u) ? 
0ull : (1ull << (nbits - 1u)); - if (v & sign) v |= ~mask; - return (i64)v; -} - -static void print_movewide(StrBuf* sb, u32 w) { - AA64MoveWide f = aa64_movewide_unpack(w); - emit_reg(sb, f.Rd, (int)f.sf, /*sp_means_sp=*/0); - strbuf_puts(sb, ", "); - strbuf_put_hex_u64(sb, (u64)f.imm16); - if (f.hw) { - strbuf_puts(sb, ", lsl "); - strbuf_put_u64(sb, (u64)(f.hw * 16u)); - } -} - -static void print_logsr(StrBuf* sb, u32 w, const AA64InsnDesc* d) { - AA64LogSR f = aa64_logsr_unpack(w); - if (d->flags & AA64_ASMFL_ALIAS) { - /* MOV / MVN: Rd, Rm */ - emit_reg(sb, f.Rd, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rm, (int)f.sf, 0); - return; - } - emit_reg(sb, f.Rd, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rn, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rm, (int)f.sf, 0); - if (f.imm6 || f.shift) { - static const char* sh[4] = {"lsl", "lsr", "asr", "ror"}; - strbuf_puts(sb, ", "); - strbuf_puts(sb, sh[f.shift & 3u]); - strbuf_puts(sb, " #"); - strbuf_put_u64(sb, (u64)f.imm6); - } -} - -static void print_addsubsr(StrBuf* sb, u32 w, const AA64InsnDesc* d) { - AA64AddSubSR f = aa64_addsubsr_unpack(w); - if (d->flags & AA64_ASMFL_ALIAS) { - /* NEG / NEGS / CMP / CMN. 
*/ - if (d->mnemonic[0] == 'c') { - /* CMP / CMN — print Rn, Rm */ - emit_reg(sb, f.Rn, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rm, (int)f.sf, 0); - } else { - /* NEG / NEGS — print Rd, Rm */ - emit_reg(sb, f.Rd, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rm, (int)f.sf, 0); - } - return; - } - emit_reg(sb, f.Rd, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rn, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rm, (int)f.sf, 0); - if (f.imm6 || f.shift) { - static const char* sh[4] = {"lsl", "lsr", "asr", "rsv"}; - strbuf_puts(sb, ", "); - strbuf_puts(sb, sh[f.shift & 3u]); - strbuf_puts(sb, " #"); - strbuf_put_u64(sb, (u64)f.imm6); - } -} - -static void print_dp3(StrBuf* sb, u32 w, const AA64InsnDesc* d) { - AA64DP3 f = aa64_dp3_unpack(w); - /* MUL / MNEG alias drop Ra (which is ZR for the alias). */ - if (d->flags & AA64_ASMFL_ALIAS) { - emit_reg(sb, f.Rd, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rn, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rm, (int)f.sf, 0); - return; - } - emit_reg(sb, f.Rd, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rn, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rm, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Ra, (int)f.sf, 0); -} - -static void print_dp2(StrBuf* sb, u32 w) { - AA64DP2 f = aa64_dp2_unpack(w); - emit_reg(sb, f.Rd, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rn, (int)f.sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rm, (int)f.sf, 0); -} - -static void print_brreg(StrBuf* sb, u32 w, const AA64InsnDesc* d) { - AA64BrReg f = aa64_brreg_unpack(w); - if (d->flags & AA64_ASMFL_NORN) return; /* RET (with implicit X30) */ - emit_reg(sb, f.Rn, /*sf=*/1, 0); -} - -static void print_pcrel(StrBuf* sb, u32 w, u64 vaddr) { - AA64PCRelAdr f = aa64_pcrel_adr_unpack(w); - emit_reg(sb, f.Rd, /*sf=*/1, 0); - strbuf_puts(sb, ", "); - i64 imm = sext(((u64)f.immhi << 2) | (u64)f.immlo, 21); - if (f.op == 
AA64_ADR_OP_ADRP) imm <<= 12; - if (vaddr) { - u64 base = (f.op == AA64_ADR_OP_ADRP) ? (vaddr & ~0xfffull) : vaddr; - strbuf_put_hex_u64(sb, base + (u64)imm); - } else { - strbuf_puts(sb, "#"); - strbuf_put_i64(sb, imm); - } -} - -static void print_addsubimm(StrBuf* sb, u32 w) { - AA64AddSubImm f = aa64_addsubimm_unpack(w); - /* For these encodings, Rd/Rn=31 means SP. */ - emit_reg(sb, f.Rd, (int)f.sf, 1); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rn, (int)f.sf, 1); - strbuf_puts(sb, ", #"); - strbuf_put_u64(sb, (u64)f.imm12); - if (f.sh) strbuf_puts(sb, ", lsl #12"); -} - -static u32 ldst_log2_size(const AA64InsnDesc* d, u32 size_field) { - (void)d; - return size_field & 3u; -} - -static void print_ldst_uimm(StrBuf* sb, u32 w, const AA64InsnDesc* d) { - AA64LdStUimm f = aa64_ldst_uimm_unpack(w); - u32 sz = ldst_log2_size(d, f.size); - /* Pick reg prefix: V=0 picks W/X by size; V=1 picks B/H/S/D by size. */ - if (f.V == 0) { - emit_reg(sb, f.Rt, /*sf=*/(int)(sz == 3u), 0); - } else { - char p = (sz == 0u) ? 'b' : (sz == 1u) ? 'h' : (sz == 2u) ? 's' : 'd'; - emit_vreg(sb, f.Rt, p); - } - strbuf_puts(sb, ", ["); - emit_reg(sb, f.Rn, /*sf=*/1, 1); - u32 byte_off = f.imm12 << sz; - if (byte_off) { - strbuf_puts(sb, ", #"); - strbuf_put_u64(sb, (u64)byte_off); - } - strbuf_putc(sb, ']'); -} - -static void print_ldst_simm9(StrBuf* sb, u32 w, const AA64InsnDesc* d) { - AA64LdStSimm9 f = aa64_ldst_simm9_unpack(w); - u32 sz = f.size & 3u; - (void)d; - if (f.V == 0) { - emit_reg(sb, f.Rt, /*sf=*/(int)(sz == 3u), 0); - } else { - char p = (sz == 0u) ? 'b' : (sz == 1u) ? 'h' : (sz == 2u) ? 
's' : 'd'; - emit_vreg(sb, f.Rt, p); - } - strbuf_puts(sb, ", ["); - emit_reg(sb, f.Rn, /*sf=*/1, 1); - i64 off = sext((u64)f.imm9, 9); - if (off) { - strbuf_puts(sb, ", #"); - strbuf_put_i64(sb, off); - } - strbuf_putc(sb, ']'); -} - -static void print_ldstp_common(StrBuf* sb, AA64LdStPPre f, int pre) { - /* opc=10 → 64-bit X; opc=00 → 32-bit W; opc=01 (V=1) → D (FP); - * opc=00 (V=1) → S; opc=10 (V=1) → Q (not yet emitted). */ - i64 scale; - int is_fp = (f.V == 1); - char fp_prefix = 's'; - int sf = 1; - if (is_fp) { - if (f.opc == 0) { - fp_prefix = 's'; - scale = 4; - } else if (f.opc == 1) { - fp_prefix = 'd'; - scale = 8; - } else { - fp_prefix = 'q'; - scale = 16; - } - } else { - sf = (f.opc == 2); - scale = sf ? 8 : 4; - } - if (is_fp) { - emit_vreg(sb, f.Rt, fp_prefix); - strbuf_puts(sb, ", "); - emit_vreg(sb, f.Rt2, fp_prefix); - } else { - emit_reg(sb, f.Rt, sf, 0); - strbuf_puts(sb, ", "); - emit_reg(sb, f.Rt2, sf, 0); - } - strbuf_puts(sb, ", ["); - emit_reg(sb, f.Rn, /*sf=*/1, 1); - i64 byte_off = sext((u64)f.imm7, 7) * scale; - if (byte_off) { - strbuf_puts(sb, ", #"); - strbuf_put_i64(sb, byte_off); - } - strbuf_putc(sb, ']'); - if (pre) strbuf_putc(sb, '!'); -} - -static void print_ldstp_pre(StrBuf* sb, u32 w) { - print_ldstp_common(sb, aa64_ldstp_pre_unpack(w), /*pre=*/1); -} -static void print_ldstp_soff(StrBuf* sb, u32 w) { - print_ldstp_common(sb, aa64_ldstp_soff_unpack(w), /*pre=*/0); -} - -static void print_br_imm(StrBuf* sb, u32 w, u64 vaddr) { - AA64BrImm f = aa64_brimm_unpack(w); - i64 ofs = sext((u64)f.imm26, 26) * 4; - if (vaddr) { - strbuf_put_hex_u64(sb, vaddr + (u64)ofs); - } else { - strbuf_puts(sb, "#"); - strbuf_put_i64(sb, ofs); - } -} - -static void print_br_cond(StrBuf* sb, u32 w, u64 vaddr, - const AA64InsnDesc* d) { - AA64BrCond f = aa64_brcond_unpack(w); - (void)d; - /* mnemonic is "b.cond"; we'll print cond as a suffix on the target. 
- * The b.cond row keeps a single mnemonic for printing — for the asm - * spelling to be canonical the writer will need to emit b.<cc>, which - * is the printer's job at the dispatcher level (see aa64_print_operands). */ - emit_cond(sb, f.cond); - strbuf_putc(sb, ' '); - i64 ofs = sext((u64)f.imm19, 19) * 4; - if (vaddr) { - strbuf_put_hex_u64(sb, vaddr + (u64)ofs); - } else { - strbuf_puts(sb, "#"); - strbuf_put_i64(sb, ofs); - } -} - -static void print_cb(StrBuf* sb, u32 w, u64 vaddr) { - AA64CB f = aa64_cb_unpack(w); - emit_reg(sb, f.Rt, (int)f.sf, 0); - strbuf_puts(sb, ", "); - i64 ofs = sext((u64)f.imm19, 19) * 4; - if (vaddr) { - strbuf_put_hex_u64(sb, vaddr + (u64)ofs); - } else { - strbuf_puts(sb, "#"); - strbuf_put_i64(sb, ofs); - } -} - -static void print_except(StrBuf* sb, u32 w) { - AA64Except f = aa64_except_unpack(w); - strbuf_puts(sb, "#"); - strbuf_put_hex_u64(sb, (u64)f.imm16); -} - -static void print_barrier(StrBuf* sb, u32 w, const AA64InsnDesc* desc) { - AA64Barrier f = aa64_barrier_unpack(w); - /* ISB and CLREX with the default CRm=SY (15) print without an - * operand. DMB/DSB always carry an option. */ - int is_isb = (f.op2 == AA64_BARRIER_OP2_ISB); - int is_clrex = (f.op2 == AA64_BARRIER_OP2_CLREX); - if ((is_isb || is_clrex) && f.CRm == AA64_BARRIER_OPT_SY) return; - const char* opt = NULL; - switch (f.CRm) { - case AA64_BARRIER_OPT_OSHLD: opt = "oshld"; break; - case AA64_BARRIER_OPT_OSHST: opt = "oshst"; break; - case AA64_BARRIER_OPT_OSH: opt = "osh"; break; - case AA64_BARRIER_OPT_NSHLD: opt = "nshld"; break; - case AA64_BARRIER_OPT_NSHST: opt = "nshst"; break; - case AA64_BARRIER_OPT_NSH: opt = "nsh"; break; - case AA64_BARRIER_OPT_ISHLD: opt = "ishld"; break; - case AA64_BARRIER_OPT_ISHST: opt = "ishst"; break; - case AA64_BARRIER_OPT_ISH: opt = "ish"; break; - case AA64_BARRIER_OPT_LD: opt = (desc && desc->mnemonic && - desc->mnemonic[0] == 'd' && - desc->mnemonic[1] == 'm') - ? 
"ld" - : NULL; break; - case AA64_BARRIER_OPT_ST: opt = (desc && desc->mnemonic && - desc->mnemonic[0] == 'd' && - desc->mnemonic[1] == 'm') - ? "st" - : NULL; break; - case AA64_BARRIER_OPT_SY: opt = "sy"; break; - default: break; - } - strbuf_putc(sb, ' '); - if (opt) { - strbuf_puts(sb, opt); - } else { - strbuf_puts(sb, "#"); - strbuf_put_u64(sb, (u64)f.CRm); - } -} - -void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word, - u64 vaddr) { - switch ((AA64Format)desc->fmt) { - case AA64_FMT_MOVEWIDE: print_movewide(sb, word); break; - case AA64_FMT_LOG_SR: print_logsr(sb, word, desc); break; - case AA64_FMT_ADDSUB_SR: print_addsubsr(sb, word, desc); break; - case AA64_FMT_DP3: print_dp3(sb, word, desc); break; - case AA64_FMT_DP2: print_dp2(sb, word); break; - case AA64_FMT_BR_REG: print_brreg(sb, word, desc); break; - case AA64_FMT_PCREL_ADR: print_pcrel(sb, word, vaddr); break; - case AA64_FMT_ADDSUB_IMM: print_addsubimm(sb, word); break; - case AA64_FMT_LDST_UIMM: print_ldst_uimm(sb, word, desc); break; - case AA64_FMT_LDSTP_PRE: print_ldstp_pre(sb, word); break; - case AA64_FMT_LDSTP_SOFF: print_ldstp_soff(sb, word); break; - case AA64_FMT_LDST_SIMM9: print_ldst_simm9(sb, word, desc); break; - case AA64_FMT_BR_IMM: print_br_imm(sb, word, vaddr); break; - case AA64_FMT_BR_COND: print_br_cond(sb, word, vaddr, desc); break; - case AA64_FMT_CB: print_cb(sb, word, vaddr); break; - case AA64_FMT_EXCEPT: print_except(sb, word); break; - case AA64_FMT_HINT: break; /* no operands for NOP */ - case AA64_FMT_BARRIER: print_barrier(sb, word, desc); break; - } -} - -/* ===================================================================== - * Operand parse — phase-3 wires this up to the asm token stream. Phase - * 2 ships the signature so the assembler bring-up commit doesn't need to - * touch the descriptor table; the body returns 0 for every format until - * the per-format grammar is implemented. 
*/ - -int aa64_parse_operands(struct AA64AsmTok* tok, const AA64InsnDesc* desc, - void* fields_out) { - (void)tok; - (void)desc; - (void)fields_out; - return 0; -} diff --git a/src/arch/aa64_regs.c b/src/arch/aa64_regs.c @@ -1,88 +0,0 @@ -/* AArch64 register name table — DWARF index ↔ assembler name. - * - * DWARF register numbering for AArch64 (per the AAPCS64 ABI supplement): - * 0..30 X0..X30 (also W0..W30; same DWARF index) - * 31 SP (X31 / WSP) - * 32 PC - * 33 ELR (mode dependent; unused here) - * 64..95 V0..V31 (also B/H/S/D forms; same index) - * - * The canonical assembler spelling for v1 is the 64-bit form (Xn / Vn); - * disassembler output picks W/B/H/S/D based on instruction width - * separately. */ - -#include <stdint.h> -#include <string.h> - -#include "arch/aa64_regs.h" -#include "core/core.h" - -typedef struct AA64Reg { - uint32_t dwarf_idx; - const char* name; -} AA64Reg; - -static const AA64Reg AA64_REGS[] = { - {0, "x0"}, {1, "x1"}, {2, "x2"}, {3, "x3"}, {4, "x4"}, - {5, "x5"}, {6, "x6"}, {7, "x7"}, {8, "x8"}, {9, "x9"}, - {10, "x10"}, {11, "x11"}, {12, "x12"}, {13, "x13"}, {14, "x14"}, - {15, "x15"}, {16, "x16"}, {17, "x17"}, {18, "x18"}, {19, "x19"}, - {20, "x20"}, {21, "x21"}, {22, "x22"}, {23, "x23"}, {24, "x24"}, - {25, "x25"}, {26, "x26"}, {27, "x27"}, {28, "x28"}, {29, "x29"}, - {30, "x30"}, {31, "sp"}, {32, "pc"}, - {64, "v0"}, {65, "v1"}, {66, "v2"}, {67, "v3"}, {68, "v4"}, - {69, "v5"}, {70, "v6"}, {71, "v7"}, {72, "v8"}, {73, "v9"}, - {74, "v10"}, {75, "v11"}, {76, "v12"}, {77, "v13"}, {78, "v14"}, - {79, "v15"}, {80, "v16"}, {81, "v17"}, {82, "v18"}, {83, "v19"}, - {84, "v20"}, {85, "v21"}, {86, "v22"}, {87, "v23"}, {88, "v24"}, - {89, "v25"}, {90, "v26"}, {91, "v27"}, {92, "v28"}, {93, "v29"}, - {94, "v30"}, {95, "v31"}, -}; - -static const uint32_t AA64_REGS_N = (uint32_t)(sizeof AA64_REGS / - sizeof AA64_REGS[0]); - -const char* aa64_register_name(uint32_t dwarf_idx) { - uint32_t i; - for (i = 0; i < AA64_REGS_N; ++i) { - if 
(AA64_REGS[i].dwarf_idx == dwarf_idx) return AA64_REGS[i].name; - } - return NULL; -} - -int aa64_register_index(const char* name, uint32_t* idx_out) { - uint32_t i; - if (!name) return 1; - for (i = 0; i < AA64_REGS_N; ++i) { - if (!strcmp(AA64_REGS[i].name, name)) { - if (idx_out) *idx_out = AA64_REGS[i].dwarf_idx; - return 0; - } - } - /* Accept Wn alias for Xn (same DWARF index). */ - if (name[0] == 'w' && name[1] != '\0') { - char buf[8]; - size_t n = strlen(name); - if (n < sizeof buf) { - buf[0] = 'x'; - memcpy(buf + 1, name + 1, n); - return aa64_register_index(buf, idx_out); - } - } - /* wzr / xzr aliases. */ - if (!strcmp(name, "wzr") || !strcmp(name, "xzr")) { - if (idx_out) *idx_out = 31u; /* shares SP encoding slot; v1 picks SP */ - return 0; - } - return 1; -} - -uint32_t aa64_register_iter_size(void) { return AA64_REGS_N; } - -int aa64_register_iter_get(uint32_t i, uint32_t* dwarf_out, - const char** name_out) { - if (i >= AA64_REGS_N) return 1; - if (dwarf_out) *dwarf_out = AA64_REGS[i].dwarf_idx; - if (name_out) *name_out = AA64_REGS[i].name; - return 0; -} diff --git a/src/arch/aarch64/alloc.c b/src/arch/aarch64/alloc.c @@ -1,246 +0,0 @@ -/* aarch64/alloc.c — spill/reload, labels, control flow, structured scopes. 
*/ - -#include "arch/aarch64/internal.h" - -/* ============================================================ - * AAImpl accessor - * ============================================================ */ - -AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; } - -/* ============================================================ - * Slot accessor - * ============================================================ */ - -AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs) { - if (fs == FRAME_SLOT_NONE || fs > a->nslots) return NULL; - return &a->slots[fs - 1]; -} - -static int aa_resolve_reg_name(CGTarget* t, Sym name, Reg* out, - RegClass* cls_out) { - (void)t; - size_t len = 0; - const char* s = pool_str(t->c->global, name, &len); - if (!s || !len) return 1; - char buf[8]; - if (len >= sizeof buf) return 1; - memcpy(buf, s, len); - buf[len] = '\0'; - u32 dwarf; - if (aa64_register_index(buf, &dwarf) != 0) return 1; - if (dwarf <= 30u) { - if (out) *out = (Reg)dwarf; - if (cls_out) *cls_out = RC_INT; - return 0; - } - if (dwarf >= 64u && dwarf <= 95u) { - if (out) *out = (Reg)(dwarf - 64u); - if (cls_out) *cls_out = RC_FP; - return 0; - } - return 1; -} - -static void aa_spill_reg(CGTarget* t, Operand src, FrameSlot slot, - MemAccess ma) { - AAImpl* a = impl_of(t); - if (src.kind != OPK_REG) { - compiler_panic(t->c, a->loc, "aarch64 spill_reg: src is not OPK_REG"); - } - Operand addr; - memset(&addr, 0, sizeof addr); - addr.kind = OPK_LOCAL; - addr.cls = RC_INT; - addr.type = ma.type; - addr.v.frame_slot = slot; - aa_store(t, addr, src, ma); -} - -static void aa_reload_reg(CGTarget* t, Operand dst, FrameSlot slot, - MemAccess ma) { - AAImpl* a = impl_of(t); - if (dst.kind != OPK_REG) { - compiler_panic(t->c, a->loc, "aarch64 reload_reg: dst is not OPK_REG"); - } - Operand addr; - memset(&addr, 0, sizeof addr); - addr.kind = OPK_LOCAL; - addr.cls = RC_INT; - addr.type = ma.type; - addr.v.frame_slot = slot; - aa_load(t, dst, addr, ma); -} - -/* 
============================================================ - * Labels / control flow - * ============================================================ */ - -static Label aa_label_new(CGTarget* t) { - return (Label)t->mc->label_new(t->mc); -} - -static void aa_label_place(CGTarget* t, Label l) { - t->mc->label_place(t->mc, (MCLabel)l); -} - -void aa_jump(CGTarget* t, Label l) { - MCEmitter* mc = t->mc; - aa64_emit32(mc, aa64_b_base()); - mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_JUMP26, 4, 0); -} - -static u32 cmp_to_cond(CmpOp op) { - switch (op) { - case CMP_EQ: return 0x0u; - case CMP_NE: return 0x1u; - case CMP_LT_U: return 0x3u; - case CMP_LE_U: return 0x9u; - case CMP_GT_U: return 0x8u; - case CMP_GE_U: return 0x2u; - case CMP_LT_S: return 0xbu; - case CMP_LE_S: return 0xdu; - case CMP_GT_S: return 0xcu; - case CMP_GE_S: return 0xau; - default: return 0x0u; - } -} - -void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) { - MCEmitter* mc = t->mc; - u32 sf = type_is_64(a_op.type) ? 1u : 0u; - if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) { - u32 imm12, sh; - if (aa64_addsub_imm_fits(b_op.v.imm, &imm12, &sh)) { - u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - aa64_emit32(mc, aa64_subs_imm12(sf, /*Rd=ZR*/ 31u, rn, imm12, sh)); - return; - } - } - u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 rm = - aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0); - aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, rn, rm)); -} - -static void aa_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b, - Label l) { - MCEmitter* mc = t->mc; - emit_cmp_ab(t, a, b); - aa64_emit32(mc, aa64_b_cond(cmp_to_cond(op))); - mc->emit_label_ref(mc, (MCLabel)l, R_AARCH64_CONDBR19, 4, 0); -} - -static void aa_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b) { - emit_cmp_ab(t, a, b); - u32 sf_dst = type_is_64(dst.type) ? 
1u : 0u; - aa64_emit32(t->mc, aa64_cset(sf_dst, reg_num(dst), cmp_to_cond(op))); -} - -/* ============================================================ - * Structured scopes - * ============================================================ */ - -static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) { - AAImpl* a = impl_of(t); - if (a->nscopes == a->scopes_cap) { - u32 ncap = a->scopes_cap ? a->scopes_cap * 2u : 4u; - AAScope* nb = arena_array(t->c->tu, AAScope, ncap); - if (a->scopes) memcpy(nb, a->scopes, sizeof(AAScope) * a->nscopes); - a->scopes = nb; - a->scopes_cap = ncap; - } - AAScope* sc = &a->scopes[a->nscopes]; - sc->kind = (u8)d->kind; - sc->has_else = 0; - sc->else_label = 0; - sc->end_label = 0; - sc->break_label = d->break_label; - sc->continue_label = d->continue_label; - - if (d->kind == SCOPE_IF) { - sc->else_label = t->mc->label_new(t->mc); - sc->end_label = t->mc->label_new(t->mc); - u32 sf = type_is_64(d->cond.type) ? 1u : 0u; - u32 rn = aa64_force_reg_int(t, d->cond, sf, AA_TMP0); - aa64_emit32(t->mc, aa64_subs_imm(sf, /*Rd=ZR*/ 31u, rn, 0)); - aa64_emit32(t->mc, aa64_b_cond(0x0u /*EQ*/)); - t->mc->emit_label_ref(t->mc, sc->else_label, R_AARCH64_CONDBR19, 4, 0); - } else if (d->kind == SCOPE_LOOP || d->kind == SCOPE_BLOCK) { - /* bookkeep only */ - } else { - compiler_panic(t->c, a->loc, - "aarch64 scope_begin: kind %d not yet implemented", - (int)d->kind); - } - - a->nscopes++; - return (CGScope)a->nscopes; -} - -static void aa_scope_else(CGTarget* t, CGScope s) { - AAImpl* a = impl_of(t); - if (s == CG_SCOPE_NONE || s > a->nscopes) { - compiler_panic(t->c, a->loc, "aarch64 scope_else: bad scope %u", - (unsigned)s); - } - AAScope* sc = &a->scopes[s - 1]; - aa64_emit32(t->mc, aa64_b_base()); - t->mc->emit_label_ref(t->mc, sc->end_label, R_AARCH64_JUMP26, 4, 0); - t->mc->label_place(t->mc, sc->else_label); - sc->has_else = 1; -} - -static void aa_scope_end(CGTarget* t, CGScope s) { - AAImpl* a = impl_of(t); - if (s == CG_SCOPE_NONE || s 
> a->nscopes) { - compiler_panic(t->c, a->loc, "aarch64 scope_end: bad scope %u", - (unsigned)s); - } - AAScope* sc = &a->scopes[s - 1]; - if (sc->kind == SCOPE_IF) { - if (!sc->has_else) { - t->mc->label_place(t->mc, sc->else_label); - } - t->mc->label_place(t->mc, sc->end_label); - } -} - -static void aa_break_to(CGTarget* t, CGScope s) { - AAImpl* a = impl_of(t); - if (s == CG_SCOPE_NONE || s > a->nscopes) { - compiler_panic(t->c, a->loc, "aarch64 break_to: bad scope %u", (unsigned)s); - } - AAScope* sc = &a->scopes[s - 1]; - aa_jump(t, sc->break_label); -} - -static void aa_continue_to(CGTarget* t, CGScope s) { - AAImpl* a = impl_of(t); - if (s == CG_SCOPE_NONE || s > a->nscopes) { - compiler_panic(t->c, a->loc, "aarch64 continue_to: bad scope %u", - (unsigned)s); - } - AAScope* sc = &a->scopes[s - 1]; - aa_jump(t, sc->continue_label); -} - -/* Expose vtable entries to ops.c constructor via a registration helper. - * ops.c calls this after the basic ops vtable is populated. */ -void aa_alloc_vtable_init(CGTarget* t) { - t->spill_reg = aa_spill_reg; - t->reload_reg = aa_reload_reg; - t->resolve_reg_name = aa_resolve_reg_name; - - t->label_new = aa_label_new; - t->label_place = aa_label_place; - t->jump = aa_jump; - t->cmp_branch = aa_cmp_branch; - t->cmp = aa_cmp; - - t->scope_begin = aa_scope_begin; - t->scope_else = aa_scope_else; - t->scope_end = aa_scope_end; - t->break_to = aa_break_to; - t->continue_to = aa_continue_to; -} diff --git a/src/arch/aarch64/arch.c b/src/arch/aarch64/arch.c @@ -1,95 +0,0 @@ -#include "arch/arch.h" - -#include "abi/abi_internal.h" -#include "arch/aa64.h" -#include "arch/aa64_asm.h" -#include "arch/aa64_disasm.h" -#include "arch/aa64_regs.h" -#include "core/bytes.h" -#include "link/link_arch.h" -#include "obj/elf.h" -#include "obj/macho.h" -#include "obj/obj.h" - -static const ABIVtable* aa64_abi_vtable(Compiler* c, CfreeOSKind os) { - (void)c; - switch (os) { - case CFREE_OS_MACOS: - return &apple_arm64_vtable; - default: - 
return &aapcs64_vtable; - } -} - -static int aa64_register_at_public(uint32_t idx, CfreeArchReg* out) { - if (!out) return 1; - return aa64_register_iter_get(idx, &out->dwarf_idx, &out->name); -} - -static const ArchElfOps aa64_elf_ops = { - .e_machine = EM_AARCH64, - .e_flags = 0, - .reloc_to = elf_aarch64_reloc_to, - .reloc_from = elf_aarch64_reloc_from, -}; - -static const ArchMachoOps aa64_macho_ops = { - .cputype = CPU_TYPE_ARM64, - .cpusubtype = CPU_SUBTYPE_ARM64_ALL, - .reloc_to = macho_aarch64_reloc_to, - .reloc_pcrel = macho_aarch64_reloc_pcrel, - .reloc_length = macho_aarch64_reloc_length, - .reloc_from = macho_aarch64_reloc_from, -}; - -static int aa64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) { - const Section* s; - u8 cur[4]; - u32 word; - - (void)c; - if (!fx || fx->width != 4) return 1; - s = obj_section_get(fx->obj, fx->sec_id); - if (!s) return 0; - buf_read(&s->bytes, fx->offset, cur, 4); - word = rd_u32_le(cur); - - switch (fx->kind) { - case R_AARCH64_JUMP26: - case R_AARCH64_CALL26: { - i64 idisp = fx->disp >> 2; - u32 imm26 = (u32)(idisp & 0x03ffffffu); - word = (word & ~0x03ffffffu) | imm26; - break; - } - case R_AARCH64_CONDBR19: { - i64 idisp = fx->disp >> 2; - u32 imm19 = (u32)(idisp & 0x7ffffu); - word = (word & ~(0x7ffffu << 5)) | (imm19 << 5); - break; - } - default: - return 1; - } - - wr_u32_le(cur, word); - obj_patch(fx->obj, fx->sec_id, fx->offset, cur, 4); - return 0; -} - -const ArchImpl arch_impl_aa64 = { - .kind = CFREE_ARCH_ARM_64, - .name = "aa64", - .abi_vtable = aa64_abi_vtable, - .cgtarget_new = aa64_cgtarget_new, - .asm_new = aa64_arch_asm_new, - .disasm_new = aa64_disasm_new, - .apply_label_fixup = aa64_apply_label_fixup, - .link = &link_arch_aa64, - .elf = &aa64_elf_ops, - .macho = &aa64_macho_ops, - .register_name = aa64_register_name, - .register_index = aa64_register_index, - .register_count = aa64_register_iter_size, - .register_at = aa64_register_at_public, -}; diff --git a/src/arch/aarch64/emit.c 
b/src/arch/aarch64/emit.c @@ -1,523 +0,0 @@ -/* aarch64/emit.c — instruction encoding helpers, function lifecycle, - * frame layout, parameter ABI, address materialization. */ - -#include "arch/aarch64/internal.h" - -extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc); - -/* ============================================================ - * Shared type / operand helpers - * ============================================================ */ - -int type_is_64(CfreeCgTypeId t) { - return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I64) || - t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64) || - t >= (CfreeCgTypeId)(2u << 6); -} - -int type_is_fp_double(CfreeCgTypeId t) { - return t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F64); -} - -int type_is_signed(CfreeCgTypeId t) { - (void)t; - return 0; -} - -u32 type_byte_size(CfreeCgTypeId t) { - if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I8) || - t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_BOOL)) - return 1; - if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I16)) return 2; - if (t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_I32) || - t == CG_BUILTIN_ID(CFREE_CG_BUILTIN_F32)) - return 4; - return 8; -} - -u32 size_idx_for_bytes(u32 nbytes) { - switch (nbytes) { - case 1: - return 0; - case 2: - return 1; - case 4: - return 2; - case 8: - return 3; - default: - return 3; - } -} - -u32 reg_num(Operand op) { return op.v.reg & 0x1fu; } - -static u32 collect_mask_regs(u32 mask, u32 first, u32 last, u32* out) { - u32 n = 0; - for (u32 r = first; r <= last; ++r) { - if (mask & (1u << r)) out[n++] = r; - } - return n; -} - -/* ============================================================ - * Low-level emission - * ============================================================ */ - -void aa64_emit32(MCEmitter* mc, u32 word) { - u32 ofs = obj_pos(mc->obj, mc->section_id); - u8 b[4]; - b[0] = (u8)(word & 0xff); - b[1] = (u8)((word >> 8) & 0xff); - b[2] = (u8)((word >> 16) & 0xff); - b[3] = (u8)((word >> 24) & 0xff); - mc->emit_bytes(mc, b, 4); - if (mc->debug) { - 
debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); - } -} - -void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word) { - u8 b[4]; - b[0] = (u8)(word & 0xff); - b[1] = (u8)((word >> 8) & 0xff); - b[2] = (u8)((word >> 16) & 0xff); - b[3] = (u8)((word >> 24) & 0xff); - obj_patch(obj, sec_id, ofs, b, 4); -} - -/* ============================================================ - * Immediate encoding helpers - * ============================================================ */ - -void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm) { - const u32 nslots = sf ? 4u : 2u; - u64 v = sf ? (u64)imm : ((u64)imm & 0xffffffffu); - - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((v >> (i * 16)) & 0xffffu); - u64 cleared = v & ~((u64)0xffffu << (i * 16)); - if (slot != 0 && cleared == 0) { - aa64_emit32(mc, aa64_movz(sf, Rd, slot, i)); - return; - } - } - - { - u64 inv = sf ? ~v : ((~v) & 0xffffffffu); - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((inv >> (i * 16)) & 0xffffu); - u64 cleared = inv & ~((u64)0xffffu << (i * 16)); - if (cleared == 0) { - aa64_emit32(mc, aa64_movn(sf, Rd, slot, i)); - return; - } - } - } - - int placed = 0; - for (u32 i = 0; i < nslots; ++i) { - u32 slot = (u32)((v >> (i * 16)) & 0xffffu); - if (!placed) { - if (slot == 0) continue; - aa64_emit32(mc, aa64_movz(sf, Rd, slot, i)); - placed = 1; - } else if (slot != 0) { - aa64_emit32(mc, aa64_movk(sf, Rd, slot, i)); - } - } - if (!placed) aa64_emit32(mc, aa64_movz(sf, Rd, 0, 0)); -} - -void emit_sp_add(MCEmitter* mc, u32 imm) { - if (imm <= 0xfff) { - aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm, 0)); - } else if ((imm & 0xfff) == 0 && (imm >> 12) <= 0xfff) { - aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm >> 12, 1)); - } else { - aa64_emit32(mc, aa64_add_imm(1, 31, 31, (imm >> 12) & 0xfff, 1)); - aa64_emit32(mc, aa64_add_imm(1, 31, 31, imm & 0xfff, 0)); - } -} - -/* ============================================================ - * Function lifecycle - * 
============================================================ */ - -void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - mc->set_section(mc, fd->text_section_id); - mc->emit_align(mc, 4, 0); - - a->fd = fd; - a->func_start = mc->pos(mc); - a->next_param_int = 0; - a->next_param_fp = 0; - a->next_param_stack = 0; - a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0; - a->cum_off = 0; - a->max_outgoing = 0; - a->used_cs_int_mask = 0; - a->used_cs_fp_mask = 0; - a->nslots = 0; - a->nscopes = 0; - a->has_alloca = 0; - a->nadd_patches = 0; - a->sret_ptr_slot = FRAME_SLOT_NONE; - a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0; - a->gp_save_slot = FRAME_SLOT_NONE; - a->fp_save_slot = FRAME_SLOT_NONE; - a->epilogue_label = mc->label_new(mc); - - mc->cfi_startproc(mc); - - a->prologue_pos = mc->pos(mc); - for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) aa64_emit32(mc, AA64_NOP); - - if (a->has_sret) { - FrameSlotDesc fsd = { - .type = CFREE_CG_TYPE_NONE, - .name = 0, - .loc = (SrcLoc){0, 0, 0}, - .size = 8, - .align = 8, - .kind = FS_SPILL, - .flags = 0, - }; - a->sret_ptr_slot = aa_frame_slot(t, &fsd); - } - - if (a->is_variadic) { - FrameSlotDesc gpd = { - .type = CFREE_CG_TYPE_NONE, - .name = 0, - .loc = (SrcLoc){0, 0, 0}, - .size = 64, - .align = 8, - .kind = FS_SPILL, - .flags = 0, - }; - a->gp_save_slot = aa_frame_slot(t, &gpd); - FrameSlotDesc fpd = { - .type = CFREE_CG_TYPE_NONE, - .name = 0, - .loc = (SrcLoc){0, 0, 0}, - .size = 128, - .align = 16, - .kind = FS_SPILL, - .flags = 0, - }; - a->fp_save_slot = aa_frame_slot(t, &fpd); - AASlot* gs = aa64_slot_get(a, a->gp_save_slot); - AASlot* fs = aa64_slot_get(a, a->fp_save_slot); - for (u32 i = 0; i < 8; ++i) { - aa64_emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i * 8)); - } - for (u32 i = 0; i < 8; ++i) { - aa64_emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i * 16)); - } - } -} - -void aa_func_end(CGTarget* t) { - AAImpl* a = impl_of(t); 
- MCEmitter* mc = t->mc; - - u32 int_regs[10]; - u32 fp_regs[8]; - u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs); - u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs); - - u32 outgoing_off = 0; - u32 int_save_off = a->max_outgoing; - u32 fp_save_off = int_save_off + n_int_saves * 8u; - u32 locals_off = fp_save_off + n_fp_saves * 8u; - u32 fp_lr_off = locals_off + a->cum_off; - u32 frame_size = fp_lr_off + 16; - frame_size = (frame_size + 15u) & ~15u; - fp_lr_off = frame_size - 16; - - (void)outgoing_off; - - mc->label_place(mc, a->epilogue_label); - - if (a->has_alloca) { - if (fp_lr_off <= 0xfff) { - aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=*/29, fp_lr_off, 0)); - } else { - compiler_panic(t->c, a->loc, - "aarch64: has_alloca + fp_lr_off %u out of imm12 range", - fp_lr_off); - } - } - - for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) { - u32 r0 = fp_regs[i]; - aa64_emit32(mc, aa64_ldr_fp_uimm(3, r0, 31, - fp_save_off + (u32)i * 8u)); - } - for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) { - u32 r0 = int_regs[i]; - aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31, - int_save_off + (u32)i * 8u)); - } - aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off)); - emit_sp_add(mc, frame_size); - aa64_emit32(mc, aa64_ret(AA64_LR)); - - u32 pos = a->prologue_pos; - ObjBuilder* obj = t->obj; - u32 sec = a->fd->text_section_id; - - u32 words[AA_PROLOGUE_WORDS]; - for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) words[i] = AA64_NOP; - u32 wi = 0; - - if (frame_size <= 0xfff) { - words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0); - } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) { - words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1); - } else { - if (wi + 2 > AA_PROLOGUE_WORDS) { - compiler_panic(t->c, a->loc, - "aarch64: prologue overflow for frame_size %u", - frame_size); - } - words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1); - words[wi++] = aa64_sub_imm(1, 31, 31, frame_size 
& 0xfff, 0); - } - words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off); - words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0); - if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) { - AASlot* s = aa64_slot_get(a, a->sret_ptr_slot); - if (s) { - if (wi >= AA_PROLOGUE_WORDS) goto overflow; - words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off); - } - } - for (u32 i = 0; i < n_int_saves; ++i) { - u32 r0 = int_regs[i]; - if (wi >= AA_PROLOGUE_WORDS) goto overflow; - words[wi++] = aa64_str_uimm(3, r0, 31, int_save_off + i * 8u); - } - for (u32 i = 0; i < n_fp_saves; ++i) { - u32 r0 = fp_regs[i]; - if (wi >= AA_PROLOGUE_WORDS) goto overflow; - words[wi++] = aa64_str_fp_uimm(3, r0, 31, fp_save_off + i * 8u); - } - if (0) { - overflow: - compiler_panic( - t->c, a->loc, - "aarch64: prologue placeholder too small (used %u of %u words)", wi, - AA_PROLOGUE_WORDS); - } - - for (u32 i = 0; i < AA_PROLOGUE_WORDS; ++i) { - aa64_patch32(obj, sec, pos + i * 4u, words[i]); - } - - if (a->max_outgoing > 0xfff) { - compiler_panic( - t->c, a->loc, - "aarch64: max_outgoing %u out of imm12 range for alloca patch", - a->max_outgoing); - } - for (u32 i = 0; i < a->nadd_patches; ++i) { - u32 dr = a->add_patches[i].dst_reg; - u32 word = aa64_add_imm(1, dr, /*Rn=SP*/ 31, a->max_outgoing, 0); - aa64_patch32(obj, sec, a->add_patches[i].pos, word); - } - - u32 end = mc->pos(mc); - obj_symbol_define(obj, a->fd->sym, sec, (u64)a->func_start, - (u64)(end - a->func_start)); - - mc->cfi_endproc(mc); - a->fd = NULL; -} - -/* ============================================================ - * Frame slots - * ============================================================ */ - -FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d) { - AAImpl* a = impl_of(t); - if (a->nslots == a->slots_cap) { - u32 ncap = a->slots_cap ? 
a->slots_cap * 2 : 8; - AASlot* nbuf = arena_array(t->c->tu, AASlot, ncap); - if (a->slots) memcpy(nbuf, a->slots, sizeof(AASlot) * a->nslots); - a->slots = nbuf; - a->slots_cap = ncap; - } - u32 size = d->size ? d->size : 8; - u32 align = d->align ? d->align : 1; - u32 next = a->cum_off + size; - u32 mask = align - 1; - next = (next + mask) & ~mask; - - AASlot* s = &a->slots[a->nslots]; - s->off = next; - s->size = size; - s->align = align; - s->kind = d->kind; - - a->cum_off = next; - a->nslots++; - return (FrameSlot)(a->nslots); -} - -/* ============================================================ - * Parameters - * ============================================================ */ - -void aa_param(CGTarget* t, const CGParamDesc* p) { - AAImpl* a = impl_of(t); - AASlot* s = aa64_slot_get(a, p->slot); - if (!s) { - compiler_panic(t->c, a->loc, "aarch64 param: bad slot"); - } - const ABIArgInfo* ai = p->abi; - - if (ai->kind == ABI_ARG_IGNORE) return; - if (ai->kind == ABI_ARG_INDIRECT) { - u32 ptr_reg; - if (a->next_param_int < 8) { - ptr_reg = a->next_param_int++; - } else { - u32 caller_off = a->next_param_stack; - a->next_param_stack += 8; - aa64_emit32(t->mc, aa64_ldur(3, AA_TMP0, 29, (i32)(16 + caller_off))); - ptr_reg = AA_TMP0; - } - u32 nbytes = s->size; - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(3, AA_TMP1, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(3, AA_TMP1, 29, -(i32)s->off + (i32)i)); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(2, AA_TMP1, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(2, AA_TMP1, 29, -(i32)s->off + (i32)i)); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(1, AA_TMP1, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(1, AA_TMP1, 29, -(i32)s->off + (i32)i)); - i += 2; - } - while (i < nbytes) { - aa64_emit32(t->mc, aa64_ldur(0, AA_TMP1, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(0, AA_TMP1, 29, -(i32)s->off + (i32)i)); - i += 1; - } - 
return; - } - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* pt = &ai->parts[i]; - u32 part_off = pt->src_offset; - u32 sz = pt->size; - u32 sidx = size_idx_for_bytes(sz); - - if (pt->cls == ABI_CLASS_INT) { - if (a->next_param_int < 8) { - u32 reg = a->next_param_int++; - aa64_emit32(t->mc, aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off)); - } else { - u32 caller_off = a->next_param_stack; - a->next_param_stack += 8; - aa64_emit32(t->mc, aa64_ldur(sidx, AA_TMP0, 29, (i32)(16 + caller_off))); - aa64_emit32(t->mc, - aa64_stur(sidx, AA_TMP0, 29, - -(i32)s->off + (i32)part_off)); - } - } else if (pt->cls == ABI_CLASS_FP) { - if (a->next_param_fp < 8) { - u32 reg = a->next_param_fp++; - aa64_emit32(t->mc, - aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off)); - } else { - u32 caller_off = a->next_param_stack; - a->next_param_stack += 8; - aa64_emit32(t->mc, - aa64_ldur_fp(sidx, AA_FP_TMP0, 29, - (i32)(16 + caller_off))); - aa64_emit32(t->mc, - aa64_stur_fp(sidx, AA_FP_TMP0, 29, - -(i32)s->off + (i32)part_off)); - } - } else { - compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl", - (int)pt->cls); - } - } -} - -/* ============================================================ - * Address materialization helpers - * ============================================================ */ - -static int use_got_for_sym(CGTarget* t, ObjSymId sym) { - return obj_symbol_extern_via_got(t->c, t->obj, sym); -} - -void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym) { - MCEmitter* mc = t->mc; - u32 sec = mc->section_id; - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(dst_reg)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_GOT_PAGE, sym, 0, 0, 0); - u32 ldr_pos = mc->pos(mc); - aa64_emit32(mc, aa64_ldr_uimm(/*size=*/3, dst_reg, dst_reg, 0)); - mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_LD64_GOT_LO12_NC, sym, 0, 0, 0); -} - -void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend) { - MCEmitter* 
mc = t->mc; - if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, dst_reg, sym); - if (addend) aa64_emit_addr_adjust(mc, dst_reg, dst_reg, (i32)addend); - return; - } - u32 sec = mc->section_id; - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(dst_reg)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, addend, - 0, 0); - u32 add_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(1, dst_reg, dst_reg, 0, 0)); - mc->emit_reloc_at(mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, addend, 0, - 0); -} - -void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off) { - if (off == 0) { - aa64_emit32(mc, aa64_mov_reg(1, Rd, base)); - return; - } - u32 abs_off = (off < 0) ? (u32)(-off) : (u32)off; - if (abs_off <= 0xfff) { - if (off < 0) - aa64_emit32(mc, aa64_sub_imm(1, Rd, base, abs_off, 0)); - else - aa64_emit32(mc, aa64_add_imm(1, Rd, base, abs_off, 0)); - return; - } - if ((abs_off >> 24) == 0) { - u32 hi = (abs_off >> 12) & 0xfff; - u32 lo = abs_off & 0xfff; - if (off < 0) { - if (hi) aa64_emit32(mc, aa64_sub_imm(1, Rd, base, hi, 1)); - if (lo) aa64_emit32(mc, aa64_sub_imm(1, Rd, hi ? Rd : base, lo, 0)); - } else { - if (hi) aa64_emit32(mc, aa64_add_imm(1, Rd, base, hi, 1)); - if (lo) aa64_emit32(mc, aa64_add_imm(1, Rd, hi ? Rd : base, lo, 0)); - } - return; - } - aa64_emit_load_imm(mc, 1, Rd, off); - aa64_emit32(mc, aa64_add(1, Rd, base, Rd)); -} diff --git a/src/arch/aarch64/internal.h b/src/arch/aarch64/internal.h @@ -1,306 +0,0 @@ -/* aarch64/internal.h — private types and forward decls shared across - * emit.c / alloc.c / ops.c. NOT part of the public API. */ -#pragma once - -#include <string.h> - -#include "arch/aa64_asm.h" -#include "arch/aa64_isa.h" -#include "arch/aa64_regs.h" -#include "arch/arch.h" -#include "core/arena.h" -#include "core/pool.h" -#include "obj/obj.h" - -/* ============================================================ - * Local encoding helpers (kept here, not in aa64_isa.h). 
- * ============================================================ */ - -#define AA64_NOP 0xD503201Fu - -/* Hidden backend temporaries. These must stay outside the allocable pools and - * outside optimizer scratch registers because CGTarget ops may clobber them - * while lowering a single operation. AA_FP_TMP0 names v31, not integer x31. */ -enum { - AA_TMP0 = 9u, - AA_TMP1 = 10u, - AA_TMP2 = 11u, - AA_FP_TMP0 = 31u, -}; -#define CG_BUILTIN_ID(k) ((CfreeCgTypeId)((1u << 6) | (u32)(k))) - -static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { - i32 sc = byte_off >> 3; - return 0xA9000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { - i32 sc = byte_off >> 3; - return 0xA9400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { - i32 sc = byte_off >> 3; - return 0x6D000000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldp_d(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { - i32 sc = byte_off >> 3; - return 0x6D400000u | (((u32)sc & 0x7fu) << 15) | ((Rt2 & 0x1f) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} - -static inline u32 aa64_stur(u32 size, u32 Rt, u32 Rn, i32 simm9) { - return 0x38000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldur(u32 size, u32 Rt, u32 Rn, i32 simm9) { - return 0x38400000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) { - return 0x3C000000u | (size << 30) | (((u32)simm9 & 0x1ffu) << 12) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldur_fp(u32 size, u32 Rt, u32 Rn, i32 simm9) { - return 0x3C400000u | (size << 30) | (((u32)simm9 & 0x1ffu) 
<< 12) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} - -static inline u32 aa64_str_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> size; - return 0x39000000u | (size << 30) | ((sc & 0xfffu) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldr_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> size; - return 0x39400000u | (size << 30) | ((sc & 0xfffu) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_str_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> size; - return 0x3D000000u | (size << 30) | ((sc & 0xfffu) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} - -static inline u32 aa64_mrs_tpidr_el0(u32 Rt) { - return 0xD53BD040u | (Rt & 0x1fu); -} -static inline u32 aa64_b_base(void) { return 0x14000000u; } -static inline u32 aa64_bl_base(void) { return 0x94000000u; } - -static inline u32 aa64_adrp_base(u32 Rd) { return 0x90000000u | (Rd & 0x1f); } - -static inline u32 aa64_ldr_fp_uimm(u32 size, u32 Rt, u32 Rn, u32 byte_off) { - u32 sc = byte_off >> size; - return 0x3D400000u | (size << 30) | ((sc & 0xfffu) << 10) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} - -static inline u32 aa64_fmov_reg(u32 type, u32 Rd, u32 Rn) { - return 0x1E204000u | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_subs_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12) { - return 0x71000000u | (sf << 31) | ((imm12 & 0xfff) << 10) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_cset_eq(u32 sf, u32 Rd) { - return 0x1A800400u | (sf << 31) | (31u << 16) | (0x1u << 12) | (31u << 5) | - (Rd & 0x1f); -} - -static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn) { - return 0x1E380000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} -static inline u32 aa64_fcvtzu(u32 sf, u32 type, u32 Rd, u32 Rn) { - return 0x1E390000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} -static inline u32 aa64_scvtf(u32 
sf, u32 type, u32 Rd, u32 Rn) { - return 0x1E220000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} -static inline u32 aa64_ucvtf(u32 sf, u32 type, u32 Rd, u32 Rn) { - return 0x1E230000u | (sf << 31) | ((type & 3) << 22) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} - -static inline u32 aa64_fcvt_d_s(u32 Rd, u32 Rn) { - return 0x1E22C000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fcvt_s_d(u32 Rd, u32 Rn) { - return 0x1E624000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_fmov_s_w(u32 Rd, u32 Rn) { - return 0x1E270000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fmov_w_s(u32 Rd, u32 Rn) { - return 0x1E260000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn) { - return 0x9E670000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn) { - return 0x9E660000u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_sub_extreg_x_uxtx(u32 Rd, u32 Rn, u32 Rm) { - return 0xCB206000u | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) { - return 0x6B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} - -static inline u32 aa64_b_cond(u32 cond) { return 0x54000000u | (cond & 0xfu); } - -static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) { - return 0x1A800400u | (sf << 31) | ((Rm & 0x1f) << 16) | - ((cond & 0xfu) << 12) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond) { - return aa64_csinc(sf, Rd, 31u, 31u, cond ^ 1u); -} - -static inline u32 aa64_fadd(u32 type, u32 Rd, u32 Rn, u32 Rm) { - return 0x1E202800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fsub(u32 type, u32 Rd, u32 Rn, u32 Rm) { - return 0x1E203800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rd & 
0x1f); -} -static inline u32 aa64_fmul(u32 type, u32 Rd, u32 Rn, u32 Rm) { - return 0x1E200800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_fdiv(u32 type, u32 Rd, u32 Rn, u32 Rm) { - return 0x1E201800u | ((type & 3) << 22) | ((Rm & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -static inline u32 aa64_sbfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { - return 0x13000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) | - ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_ubfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { - return 0x53000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) | - ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { - return 0x33000000u | (sf << 31) | (sf << 22) | ((immr & 0x3fu) << 16) | - ((imms & 0x3fu) << 10) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} - -/* ============================================================ - * AAImpl types - * ============================================================ */ - -#define AA_PROLOGUE_WORDS \ - 22u /* worst case: sub sp + stp/add fp + sret + 5 int + 8 fp saves */ - -typedef struct AASlot { - u32 off; - u32 size; - u32 align; - u8 kind; - u8 pad[3]; -} AASlot; - -typedef struct AAScope { - u8 kind; - u8 has_else; - u8 pad[2]; - MCLabel else_label; - MCLabel end_label; - Label break_label; - Label continue_label; -} AAScope; - -typedef struct AAImpl { - CGTarget base; - SrcLoc loc; - const CGFuncDesc* fd; - - u32 func_start; - u32 prologue_pos; - MCLabel epilogue_label; - - AASlot* slots; - u32 nslots; - u32 slots_cap; - u32 cum_off; - u32 max_outgoing; - - u32 next_param_int; - u32 next_param_fp; - u32 next_param_stack; - u8 has_sret; - FrameSlot sret_ptr_slot; - - u32 used_cs_int_mask; /* bit reg set when x19-x28 must be preserved */ - u32 used_cs_fp_mask; /* bit reg set when d8-d15 must be preserved */ 
- - AAScope* scopes; - u32 nscopes; - u32 scopes_cap; - - u8 has_alloca; - struct AAAllocaPatch { - u32 pos; - u32 dst_reg; - }* add_patches; - u32 nadd_patches; - u32 add_patches_cap; - - u8 is_variadic; - FrameSlot gp_save_slot; - FrameSlot fp_save_slot; -} AAImpl; - -/* ============================================================ - * Cross-file forward declarations - * ============================================================ */ - -/* emit.c helpers used in alloc.c / ops.c */ -void aa64_emit32(MCEmitter* mc, u32 word); -void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word); -void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm); -void emit_sp_add(MCEmitter* mc, u32 imm); -void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off); -void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym); -void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend); - -/* emit.c public surface */ -FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d); -void aa_func_begin(CGTarget* t, const CGFuncDesc* fd); -void aa_func_end(CGTarget* t); -void aa_param(CGTarget* t, const CGParamDesc* p); - -/* alloc.c helpers used in emit.c / ops.c */ -AAImpl* impl_of(CGTarget* t); -AASlot* aa64_slot_get(AAImpl* a, FrameSlot fs); -void aa_jump(CGTarget* t, Label l); - -/* ops.c helpers used in alloc.c */ -void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma); -void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma); -u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch); - -/* alloc.c helpers used in ops.c */ -void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op); -void aa_alloc_vtable_init(CGTarget* t); -void aa_coord_vtable_init(CGTarget* t); - -/* shared type helpers (defined in emit.c, used broadly) */ -int type_is_64(CfreeCgTypeId t); -int type_is_fp_double(CfreeCgTypeId t); -int type_is_signed(CfreeCgTypeId t); -u32 type_byte_size(CfreeCgTypeId t); -u32 size_idx_for_bytes(u32 
nbytes); -u32 reg_num(Operand op); diff --git a/src/arch/aarch64/ops.c b/src/arch/aarch64/ops.c @@ -1,1925 +0,0 @@ -/* aarch64/ops.c — data movement, arithmetic, calls, varargs, atomics, - * intrinsics, asm_block, set_loc, finalize/destroy, vtable constructor. */ - -#include "arch/aarch64/internal.h" - -/* ============================================================ - * Data movement - * ============================================================ */ - -static void aa_load_imm(CGTarget* t, Operand dst, i64 imm) { - u32 sf = type_is_64(dst.type) ? 1u : 0u; - aa64_emit_load_imm(t->mc, sf, reg_num(dst), imm); -} - -static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) { - AAImpl* a = impl_of(t); - if (dst.cls != RC_FP) { - compiler_panic(t->c, a->loc, "aarch64 load_const: only FP supported in v1"); - } - - Sym ro_name = pool_intern_cstr(t->c->global, ".rodata"); - ObjSecId ro = obj_section(t->obj, ro_name, SEC_RODATA, SF_ALLOC, 1u); - - u32 cur_section = t->mc->section_id; - t->mc->set_section(t->mc, ro); - u32 ro_off = obj_align_to(t->obj, ro, cb.align ? cb.align : 4); - t->mc->emit_bytes(t->mc, cb.bytes, cb.size); - - char namebuf[64]; - static u32 lit_seq = 0; - int len = 0; - { - const char* prefix = ".LCFP"; - for (; prefix[len]; ++len) namebuf[len] = prefix[len]; - u32 v = lit_seq++; - char tmp[16]; - int tn = 0; - if (v == 0) - tmp[tn++] = '0'; - else { - while (v) { - tmp[tn++] = '0' + (char)(v % 10); - v /= 10; - } - } - for (int i = tn - 1; i >= 0; --i) namebuf[len++] = tmp[i]; - namebuf[len] = 0; - } - Sym sname = pool_intern_cstr(t->c->global, namebuf); - ObjSymId sym = obj_symbol(t->obj, sname, SB_LOCAL, SK_OBJ, ro, (u64)ro_off, - (u64)cb.size); - - t->mc->set_section(t->mc, cur_section); - - u32 adrp_pos = t->mc->pos(t->mc); - aa64_emit32(t->mc, aa64_adrp_base(AA_TMP0)); - t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, - sym, 0, 0, 0); - - u32 ldr_pos = t->mc->pos(t->mc); - u32 sidx = (cb.size == 8) ? 
3u : 2u; - aa64_emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0)); - RelocKind lo12 = (cb.size == 8) ? R_AARCH64_LDST64_ABS_LO12_NC - : R_AARCH64_LDST32_ABS_LO12_NC; - t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0); -} - -static void aa_copy(CGTarget* t, Operand dst, Operand src) { - if (dst.cls == RC_FP || src.cls == RC_FP) { - u32 type = type_is_fp_double(dst.type) ? 1u : 0u; - aa64_emit32(t->mc, aa64_fmov_reg(type, reg_num(dst), reg_num(src))); - return; - } - u32 sf = type_is_64(dst.type) ? 1u : 0u; - aa64_emit32(t->mc, aa64_mov_reg(sf, reg_num(dst), reg_num(src))); -} - -/* ============================================================ - * Load / store - * ============================================================ */ - -static RelocKind ldst_lo12_reloc_for(u32 nbytes) { - switch (nbytes) { - case 1: return R_AARCH64_LDST8_ABS_LO12_NC; - case 2: return R_AARCH64_LDST16_ABS_LO12_NC; - case 4: return R_AARCH64_LDST32_ABS_LO12_NC; - case 8: return R_AARCH64_LDST64_ABS_LO12_NC; - default: return R_AARCH64_LDST64_ABS_LO12_NC; - } -} - -static int use_got_for_sym(CGTarget* t, ObjSymId sym) { - return obj_symbol_extern_via_got(t->c, t->obj, sym); -} - -static u32 addr_base(CGTarget* t, Operand addr, i32* out_off, u32 tmp_reg) { - AAImpl* a = impl_of(t); - if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_base: bad slot"); - i32 off = -(i32)s->off; - if (off >= -256 && off <= 255) { - *out_off = off; - return 29; - } - aa64_emit_addr_adjust(t->mc, tmp_reg, 29, off); - *out_off = 0; - return tmp_reg; - } - if (addr.kind == OPK_INDIRECT) { - i32 off = addr.v.ind.ofs; - u32 base = addr.v.ind.base & 0x1f; - if (off >= -256 && off <= 255) { - *out_off = off; - return base; - } - aa64_emit_addr_adjust(t->mc, tmp_reg, base, off); - *out_off = 0; - return tmp_reg; - } - if (addr.kind == OPK_GLOBAL) { - emit_global_addr(t, tmp_reg, addr.v.global.sym, 
addr.v.global.addend); - *out_off = 0; - return tmp_reg; - } - compiler_panic(t->c, a->loc, "aarch64 addr_base: unsupported kind %d", - (int)addr.kind); -} - -void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) { - u32 sz = ma.size ? ma.size : type_byte_size(addr.type); - u32 sidx = size_idx_for_bytes(sz); - - if (addr.kind == OPK_GLOBAL) { - MCEmitter* mc = t->mc; - u32 sec = mc->section_id; - ObjSymId sym = addr.v.global.sym; - i64 add = addr.v.global.addend; - if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, AA_TMP0, sym); - if (dst.cls == RC_FP) { - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP0, (i32)add)); - } else { - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP0, (i32)add)); - } - return; - } - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(AA_TMP0)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add, - 0, 0); - u32 ld_pos = mc->pos(mc); - if (dst.cls == RC_FP) { - aa64_emit32(mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0)); - } else { - aa64_emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), AA_TMP0, 0)); - } - mc->emit_reloc_at(mc, sec, ld_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0); - return; - } - - i32 off; - u32 base = addr_base(t, addr, &off, AA_TMP0); - if (dst.cls == RC_FP) { - aa64_emit32(t->mc, aa64_ldur_fp(sidx, reg_num(dst), base, off)); - } else { - aa64_emit32(t->mc, aa64_ldur(sidx, reg_num(dst), base, off)); - } -} - -void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { - u32 sz = ma.size ? ma.size : type_byte_size(addr.type); - u32 sidx = size_idx_for_bytes(sz); - - if (addr.kind == OPK_GLOBAL) { - MCEmitter* mc = t->mc; - u32 sec = mc->section_id; - ObjSymId sym = addr.v.global.sym; - i64 add = addr.v.global.addend; - - u32 src_reg; - u32 src_is_fp = 0; - if (src.kind == OPK_IMM) { - u32 sf = (sz == 8) ? 
1u : 0u; - aa64_emit_load_imm(mc, sf, AA_TMP0, src.v.imm); - src_reg = AA_TMP0; - } else if (src.cls == RC_FP) { - src_reg = reg_num(src); - src_is_fp = 1; - } else { - src_reg = reg_num(src); - } - u32 base = (src.kind == OPK_IMM) ? AA_TMP1 : AA_TMP0; - if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, base, sym); - if (src_is_fp) { - aa64_emit32(mc, aa64_stur_fp(sidx, src_reg, base, (i32)add)); - } else { - aa64_emit32(mc, aa64_stur(sidx, src_reg, base, (i32)add)); - } - return; - } - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(base)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add, - 0, 0); - u32 st_pos = mc->pos(mc); - if (src_is_fp) { - aa64_emit32(mc, aa64_str_fp_uimm(sidx, src_reg, base, 0)); - } else { - aa64_emit32(mc, aa64_str_uimm(sidx, src_reg, base, 0)); - } - mc->emit_reloc_at(mc, sec, st_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0); - return; - } - - i32 off; - u32 addr_tmp = (src.kind == OPK_IMM) ? AA_TMP1 : AA_TMP0; - u32 base = addr_base(t, addr, &off, addr_tmp); - - if (src.kind == OPK_IMM) { - u32 sf = (sz == 8) ? 
1u : 0u; - aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm); - aa64_emit32(t->mc, aa64_stur(sidx, AA_TMP0, base, off)); - return; - } - if (src.cls == RC_FP) { - aa64_emit32(t->mc, aa64_stur_fp(sidx, reg_num(src), base, off)); - } else { - aa64_emit32(t->mc, aa64_stur(sidx, reg_num(src), base, off)); - } -} - -static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) { - AAImpl* a = impl_of(t); - if (lv.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, lv.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 addr_of: bad slot"); - aa64_emit32(t->mc, aa64_sub_imm(1, reg_num(dst), 29, s->off, 0)); - return; - } - if (lv.kind == OPK_INDIRECT) { - i32 ofs = lv.v.ind.ofs; - u32 base = lv.v.ind.base & 0x1f; - if (ofs == 0) { - aa64_emit32(t->mc, aa64_mov_reg(1, reg_num(dst), base)); - } else if (ofs > 0 && ofs <= 0xfff) { - aa64_emit32(t->mc, aa64_add_imm(1, reg_num(dst), base, (u32)ofs, 0)); - } else if (ofs < 0 && -ofs <= 0xfff) { - aa64_emit32(t->mc, aa64_sub_imm(1, reg_num(dst), base, (u32)(-ofs), 0)); - } else { - compiler_panic(t->c, a->loc, - "aarch64 addr_of: indirect offset %d unsupported", ofs); - } - return; - } - if (lv.kind == OPK_GLOBAL) { - u32 rd = reg_num(dst); - ObjSymId sym = lv.v.global.sym; - i64 addend = lv.v.global.addend; - if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, rd, sym); - if (addend) aa64_emit_addr_adjust(t->mc, rd, rd, (i32)addend); - return; - } - u32 sec = t->mc->section_id; - u32 adrp_pos = t->mc->pos(t->mc); - aa64_emit32(t->mc, aa64_adrp_base(rd)); - t->mc->emit_reloc_at(t->mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, - addend, 0, 0); - u32 add_pos = t->mc->pos(t->mc); - aa64_emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0)); - t->mc->emit_reloc_at(t->mc, sec, add_pos, R_AARCH64_ADD_ABS_LO12_NC, sym, - addend, 0, 0); - return; - } - compiler_panic(t->c, impl_of(t)->loc, "aarch64: addr_of not implemented"); -} - -static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) { - 
MCEmitter* mc = t->mc; - u32 sec = mc->section_id; - u32 rd = reg_num(dst); - - if (obj_format_tls_via_descriptor(t->c)) { - /* TLV access via per-variable descriptor (Mach-O TLVP). The thunk's - * ABI is custom — x0 in/out as descriptor → TLV addr, all other - * regs preserved — so we materialize via x0 and copy to `dst` only - * when they differ. x0/x1 are scratch here (the regalloc only hands - * out x19-x28), and x30 was saved at the prologue. - * - * adrp x0, sym@TLVPPAGE ; R_AARCH64_TLVP_LOAD_PAGE21 - * ldr x0, [x0, sym@TLVPPAGEOFF] ; R_AARCH64_TLVP_LOAD_PAGEOFF12 - * ldr x1, [x0] ; descriptor[0] = thunk pointer - * blr x1 ; x0 in/out - * mov xdst, x0 ; only if dst != x0 - * - * TLVP relocs do not carry an addend; nonzero addends are applied - * after the call as a follow-on ADD/SUB on `dst`. */ - u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(/*Rd=*/0)); - mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_TLVP_LOAD_PAGE21, sym, 0, 0, - 0); - u32 ldr_pos = mc->pos(mc); - aa64_emit32(mc, - aa64_ldr_uimm(/*size=*/3, /*Rt=*/0, /*Rn=*/0, /*byte_off=*/0)); - mc->emit_reloc_at(mc, sec, ldr_pos, R_AARCH64_TLVP_LOAD_PAGEOFF12, sym, 0, - 0, 0); - aa64_emit32(mc, - aa64_ldr_uimm(/*size=*/3, /*Rt=*/1, /*Rn=*/0, /*byte_off=*/0)); - aa64_emit32(mc, aa64_blr(/*Rn=*/1)); - if (rd != 0) aa64_emit32(mc, aa64_mov_reg(/*sf=*/1, rd, /*Rm=*/0)); - if (addend) aa64_emit_addr_adjust(mc, rd, rd, (i32)addend); - return; - } - - aa64_emit32(mc, aa64_mrs_tpidr_el0(AA_TMP0)); - - u32 hi_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, AA_TMP0, /*imm12=*/0, /*sh=*/1)); - mc->emit_reloc_at(mc, sec, hi_pos, R_AARCH64_TLSLE_ADD_TPREL_HI12, sym, - addend, 0, 0); - - u32 lo_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/rd, /*imm12=*/0, /*sh=*/0)); - mc->emit_reloc_at(mc, sec, lo_pos, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC, sym, - addend, 0, 0); -} - -/* ============================================================ - * Aggregate helpers - * 
============================================================ */ - -static u32 agg_addr_reg(CGTarget* t, Operand op, u32 scratch) { - if (op.kind == OPK_REG) return reg_num(op); - if (op.kind == OPK_LOCAL) { - AAImpl* a = impl_of(t); - AASlot* s = aa64_slot_get(a, op.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 agg: bad slot"); - aa64_emit32(t->mc, aa64_sub_imm(1, scratch, 29, s->off, 0)); - return scratch; - } - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 agg: address kind %d unsupported", (int)op.kind); -} - -static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr, - AggregateAccess agg) { - MCEmitter* mc = t->mc; - u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0); - u32 sr = agg_addr_reg(t, src_addr, - (dr == AA_TMP1) ? AA_TMP2 : AA_TMP1); - u32 nbytes = agg.size; - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); - i += 2; - } - while (i < nbytes) { - aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); - i += 1; - } -} - -static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value, - AggregateAccess agg) { - MCEmitter* mc = t->mc; - u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0); - - u32 byte; - if (byte_value.kind == OPK_IMM) { - byte = (u32)(byte_value.v.imm & 0xffu); - } else { - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 set_bytes: REG byte not yet supported"); - } - u32 nbytes = agg.size; - - if (byte == 0) { - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_stur(3, 31, dr, (i32)i)); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit32(mc, 
aa64_stur(2, 31, dr, (i32)i)); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_stur(1, 31, dr, (i32)i)); - i += 2; - } - while (i < nbytes) { - aa64_emit32(mc, aa64_stur(0, 31, dr, (i32)i)); - i += 1; - } - return; - } - - u64 b64 = byte; - b64 |= b64 << 8; - b64 |= b64 << 16; - b64 |= b64 << 32; - aa64_emit_load_imm(mc, /*sf=*/1u, AA_TMP1, (i64)b64); - - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_stur(3, AA_TMP1, dr, (i32)i)); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_stur(2, AA_TMP1, dr, (i32)i)); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_stur(1, AA_TMP1, dr, (i32)i)); - i += 2; - } - while (i < nbytes) { - aa64_emit32(mc, aa64_stur(0, AA_TMP1, dr, (i32)i)); - i += 1; - } -} - -/* ============================================================ - * Bitfields - * ============================================================ */ - -static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr, - BitFieldAccess bf) { - MCEmitter* mc = t->mc; - u32 base = agg_addr_reg(t, record_addr, AA_TMP0); - u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; - u32 sf = (storage_bytes == 8u) ? 1u : 0u; - u32 sidx = size_idx_for_bytes(storage_bytes); - u32 rd = reg_num(dst); - - aa64_emit32(mc, aa64_ldur(sidx, rd, base, (i32)bf.storage_offset)); - u32 lsb = bf.bit_offset; - u32 width = bf.bit_width ? bf.bit_width : 1u; - u32 imms = lsb + width - 1u; - if (bf.signed_) { - aa64_emit32(mc, aa64_sbfm(sf, rd, rd, lsb, imms)); - } else { - aa64_emit32(mc, aa64_ubfm(sf, rd, rd, lsb, imms)); - } -} - -static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src, - BitFieldAccess bf) { - MCEmitter* mc = t->mc; - u32 base = agg_addr_reg(t, record_addr, AA_TMP0); - u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; - u32 sf = (storage_bytes == 8u) ? 
1u : 0u; - u32 sidx = size_idx_for_bytes(storage_bytes); - - aa64_emit32(mc, aa64_ldur(sidx, AA_TMP1, base, (i32)bf.storage_offset)); - - u32 src_reg; - if (src.kind == OPK_IMM) { - aa64_emit_load_imm(mc, sf, AA_TMP2, src.v.imm); - src_reg = AA_TMP2; - } else if (src.kind == OPK_REG) { - src_reg = reg_num(src); - } else { - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 bitfield_store: src kind %d unsupported", - (int)src.kind); - } - - u32 reg_size = sf ? 64u : 32u; - u32 lsb = bf.bit_offset; - u32 width = bf.bit_width ? bf.bit_width : 1u; - u32 immr = (reg_size - lsb) % reg_size; - u32 imms = width - 1u; - aa64_emit32(mc, aa64_bfm(sf, AA_TMP1, src_reg, immr, imms)); - - aa64_emit32(mc, aa64_stur(sidx, AA_TMP1, base, (i32)bf.storage_offset)); -} - -/* ============================================================ - * Arithmetic helpers - * ============================================================ */ - -u32 aa64_force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch) { - if (op.kind == OPK_REG) return reg_num(op); - if (op.kind == OPK_IMM) { - aa64_emit_load_imm(t->mc, sf, scratch, op.v.imm); - return scratch; - } - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 binop: operand kind %d unsupported", (int)op.kind); -} - -static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, - Operand b_op) { - MCEmitter* mc = t->mc; - - if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) { - if (a_op.kind != OPK_REG || b_op.kind != OPK_REG || dst.cls != RC_FP) { - compiler_panic(t->c, impl_of(t)->loc, - "aarch64 binop: FP op requires REG operands"); - } - u32 type = type_is_fp_double(dst.type) ? 
1u : 0u; - u32 rd = reg_num(dst); - u32 rn = reg_num(a_op); - u32 rm = reg_num(b_op); - u32 w; - switch (op) { - case BO_FADD: w = aa64_fadd(type, rd, rn, rm); break; - case BO_FSUB: w = aa64_fsub(type, rd, rn, rm); break; - case BO_FMUL: w = aa64_fmul(type, rd, rn, rm); break; - case BO_FDIV: w = aa64_fdiv(type, rd, rn, rm); break; - default: w = 0; break; - } - aa64_emit32(mc, w); - return; - } - - u32 sf = type_is_64(dst.type) ? 1u : 0u; - u32 rd = reg_num(dst); - - switch (op) { - case BO_IADD: - case BO_AND: - case BO_OR: - case BO_XOR: { - if (a_op.kind == OPK_IMM && b_op.kind != OPK_IMM) { - Operand t_op = a_op; a_op = b_op; b_op = t_op; - } - break; - } - default: break; - } - - if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) { - u32 rn_reg = reg_num(a_op); - i64 imm = b_op.v.imm; - u32 imm12, sh, N, immr, imms; - switch (op) { - case BO_IADD: - if (aa64_addsub_imm_fits(imm, &imm12, &sh)) { - aa64_emit32(mc, aa64_add_imm(sf, rd, rn_reg, imm12, sh)); - return; - } - break; - case BO_ISUB: - if (aa64_addsub_imm_fits(imm, &imm12, &sh)) { - aa64_emit32(mc, aa64_sub_imm(sf, rd, rn_reg, imm12, sh)); - return; - } - break; - case BO_AND: - if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) { - aa64_emit32(mc, aa64_and_imm(sf, rd, rn_reg, N, immr, imms)); - return; - } - break; - case BO_OR: - if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) { - aa64_emit32(mc, aa64_orr_imm(sf, rd, rn_reg, N, immr, imms)); - return; - } - break; - case BO_XOR: - if (aa64_logimm_encode((u64)imm, sf, &N, &immr, &imms)) { - aa64_emit32(mc, aa64_eor_imm(sf, rd, rn_reg, N, immr, imms)); - return; - } - break; - case BO_SHL: { - u32 width = sf ? 64u : 32u; - u32 sh_amt = (u32)((u64)imm & (width - 1u)); - if (aa64_lsl_imm_fields(sh_amt, sf, &immr, &imms)) { - aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms)); - return; - } - break; - } - case BO_SHR_U: { - u32 width = sf ? 
64u : 32u; - u32 sh_amt = (u32)((u64)imm & (width - 1u)); - if (aa64_lsr_imm_fields(sh_amt, sf, &immr, &imms)) { - aa64_emit32(mc, aa64_ubfm(sf, rd, rn_reg, immr, imms)); - return; - } - break; - } - case BO_SHR_S: { - u32 width = sf ? 64u : 32u; - u32 sh_amt = (u32)((u64)imm & (width - 1u)); - if (aa64_asr_imm_fields(sh_amt, sf, &immr, &imms)) { - aa64_emit32(mc, aa64_sbfm(sf, rd, rn_reg, immr, imms)); - return; - } - break; - } - default: break; - } - } - - u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 rm = - aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0); - - u32 word; - switch (op) { - case BO_IADD: word = aa64_add(sf, rd, rn, rm); break; - case BO_ISUB: word = aa64_sub(sf, rd, rn, rm); break; - case BO_IMUL: word = aa64_mul(sf, rd, rn, rm); break; - case BO_AND: word = aa64_and(sf, rd, rn, rm); break; - case BO_OR: word = aa64_orr(sf, rd, rn, rm); break; - case BO_XOR: word = aa64_eor(sf, rd, rn, rm); break; - case BO_SHL: word = aa64_lslv(sf, rd, rn, rm); break; - case BO_SHR_U: word = aa64_lsrv(sf, rd, rn, rm); break; - case BO_SHR_S: word = aa64_asrv(sf, rd, rn, rm); break; - case BO_UDIV: word = aa64_udiv(sf, rd, rn, rm); break; - case BO_SDIV: word = aa64_sdiv(sf, rd, rn, rm); break; - case BO_SREM: - aa64_emit32(mc, aa64_sdiv(sf, AA_TMP2, rn, rm)); - word = aa64_msub(sf, rd, AA_TMP2, rm, rn); - break; - case BO_UREM: - aa64_emit32(mc, aa64_udiv(sf, AA_TMP2, rn, rm)); - word = aa64_msub(sf, rd, AA_TMP2, rm, rn); - break; - case BO_FADD: - case BO_FSUB: - case BO_FMUL: - case BO_FDIV: - default: - compiler_panic(t->c, impl_of(t)->loc, "aarch64 binop: op %d unimpl", - (int)op); - } - aa64_emit32(mc, word); -} - -static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) { - MCEmitter* mc = t->mc; - u32 sf = type_is_64(dst.type) ? 
1u : 0u; - u32 rd = reg_num(dst); - u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 word; - - switch (op) { - case UO_NEG: - word = aa64_neg(sf, rd, rn); - break; - case UO_BNOT: - word = aa64_mvn(sf, rd, rn); - break; - case UO_NOT: - aa64_emit32(mc, aa64_subs_imm(sf, /*ZR=*/31, rn, 0)); - word = aa64_cset_eq(sf, rd); - break; - default: - compiler_panic(t->c, impl_of(t)->loc, "aarch64 unop: op %d unimpl", - (int)op); - } - aa64_emit32(mc, word); -} - -static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 rd = reg_num(dst); - u32 rn = reg_num(src); - - switch (k) { - case CV_SEXT: { - if (src.cls != RC_INT || dst.cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 convert SEXT: bad classes"); - } - u32 src_bits = type_byte_size(src.type) * 8u; - u32 sf_dst = type_is_64(dst.type) ? 1u : 0u; - aa64_emit32(mc, aa64_sbfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u)); - return; - } - case CV_ZEXT: { - if (src.cls != RC_INT || dst.cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 convert ZEXT: bad classes"); - } - u32 src_bits = type_byte_size(src.type) * 8u; - if (src_bits == 32u) { - aa64_emit32(mc, aa64_mov_reg(0, rd, rn)); - } else { - aa64_emit32(mc, aa64_ubfm(0, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u)); - } - return; - } - case CV_TRUNC: { - aa64_emit32(mc, aa64_mov_reg(0, rd, rn)); - return; - } - case CV_ITOF_S: { - u32 sf_src = type_is_64(src.type) ? 1u : 0u; - u32 type = type_is_fp_double(dst.type) ? 1u : 0u; - aa64_emit32(mc, aa64_scvtf(sf_src, type, rd, rn)); - return; - } - case CV_ITOF_U: { - u32 sf_src = type_is_64(src.type) ? 1u : 0u; - u32 type = type_is_fp_double(dst.type) ? 1u : 0u; - aa64_emit32(mc, aa64_ucvtf(sf_src, type, rd, rn)); - return; - } - case CV_FTOI_S: { - if (src.cls != RC_FP || dst.cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes"); - } - u32 sf = type_is_64(dst.type) ? 
1u : 0u; - u32 type = type_is_fp_double(src.type) ? 1u : 0u; - aa64_emit32(mc, aa64_fcvtzs(sf, type, rd, rn)); - return; - } - case CV_FTOI_U: { - if (src.cls != RC_FP || dst.cls != RC_INT) { - compiler_panic(t->c, a->loc, "aarch64 convert FTOI_U: bad classes"); - } - u32 sf = type_is_64(dst.type) ? 1u : 0u; - u32 type = type_is_fp_double(src.type) ? 1u : 0u; - aa64_emit32(mc, aa64_fcvtzu(sf, type, rd, rn)); - return; - } - case CV_FEXT: { - aa64_emit32(mc, aa64_fcvt_d_s(rd, rn)); - return; - } - case CV_FTRUNC: { - aa64_emit32(mc, aa64_fcvt_s_d(rd, rn)); - return; - } - case CV_BITCAST: { - if (src.cls == RC_INT && dst.cls == RC_FP) { - u32 sz = type_byte_size(dst.type); - aa64_emit32(mc, sz == 8 ? aa64_fmov_d_x(rd, rn) : aa64_fmov_s_w(rd, rn)); - } else if (src.cls == RC_FP && dst.cls == RC_INT) { - u32 sz = type_byte_size(src.type); - aa64_emit32(mc, sz == 8 ? aa64_fmov_x_d(rd, rn) : aa64_fmov_w_s(rd, rn)); - } else { - compiler_panic(t->c, a->loc, - "aarch64 convert BITCAST: same-class not yet supported"); - } - return; - } - default: - compiler_panic(t->c, a->loc, "aarch64 convert kind %d unimpl", (int)k); - } -} - -/* ============================================================ - * Calls - * ============================================================ */ - -static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, - const CGABIValue* av, u32* next_int, u32* next_fp, - u32* stack_off) { - AAImpl* a = impl_of(t); - ABIArgInfo va_ai; - ABIArgPart va_pt; - const ABIArgInfo* ai = av->abi; - if (!ai) { - u32 sz = type_byte_size(av->type); - memset(&va_ai, 0, sizeof va_ai); - memset(&va_pt, 0, sizeof va_pt); - va_ai.kind = ABI_ARG_DIRECT; - va_ai.parts = &va_pt; - va_ai.nparts = 1; - va_pt.cls = (av->storage.cls == RC_FP) ? 
ABI_CLASS_FP : ABI_CLASS_INT; - va_pt.size = sz; - va_pt.align = sz; - va_pt.src_offset = 0; - ai = &va_ai; - if (fi && fi->vararg_on_stack) { - *next_int = 8; - *next_fp = 8; - } - } - if (ai->kind == ABI_ARG_IGNORE) return; - - if (ai->kind == ABI_ARG_INDIRECT) { - u32 dst_reg; - int to_stack = (*next_int >= 8); - if (!to_stack) - dst_reg = (*next_int)++; - else - dst_reg = AA_TMP0; - if (av->storage.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot"); - aa64_emit32(t->mc, aa64_sub_imm(1, dst_reg, 29, s->off, 0)); - } else if (av->storage.kind == OPK_INDIRECT) { - aa64_emit_addr_adjust(t->mc, dst_reg, av->storage.v.ind.base & 0x1f, - av->storage.v.ind.ofs); - } else { - compiler_panic(t->c, a->loc, - "aarch64 call: INDIRECT arg storage kind %d unsupported", - (int)av->storage.kind); - } - if (to_stack) { - aa64_emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off)); - *stack_off += 8; - } - return; - } - - for (u16 i = 0; i < ai->nparts; ++i) { - const ABIArgPart* pt = &ai->parts[i]; - u32 sz = pt->size; - u32 sidx = size_idx_for_bytes(sz); - - if (pt->cls == ABI_CLASS_INT) { - int to_stack = (*next_int >= 8); - u32 dst_reg = to_stack ? AA_TMP0 : (*next_int)++; - switch (av->storage.kind) { - case OPK_IMM: { - u32 sf = (sz == 8) ? 1u : 0u; - aa64_emit_load_imm(t->mc, sf, dst_reg, av->storage.v.imm); - break; - } - case OPK_REG: { - u32 sf = (sz == 8) ? 
1u : 0u; - aa64_emit32(t->mc, aa64_mov_reg(sf, dst_reg, reg_num(av->storage))); - break; - } - case OPK_LOCAL: { - AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot"); - i32 off = -(i32)s->off + (i32)pt->src_offset; - aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off)); - break; - } - case OPK_INDIRECT: { - Operand src; - memset(&src, 0, sizeof src); - src.kind = OPK_INDIRECT; - src.v.ind.base = av->storage.v.ind.base; - src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; - i32 off; - u32 base = addr_base(t, src, &off, AA_TMP0); - aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, base, off)); - break; - } - default: - compiler_panic(t->c, a->loc, - "aarch64 call: arg storage kind %d unsupported", - (int)av->storage.kind); - } - if (to_stack) { - aa64_emit32(t->mc, aa64_str_uimm(3, dst_reg, 31, *stack_off)); - *stack_off += 8; - } - } else if (pt->cls == ABI_CLASS_FP) { - int to_stack = (*next_fp >= 8); - if (!to_stack) { - u32 dst_reg = (*next_fp)++; - switch (av->storage.kind) { - case OPK_REG: { - u32 type = (sz == 8) ? 
1u : 0u; - aa64_emit32(t->mc, aa64_fmov_reg(type, dst_reg, reg_num(av->storage))); - break; - } - case OPK_INDIRECT: { - Operand src; - memset(&src, 0, sizeof src); - src.kind = OPK_INDIRECT; - src.v.ind.base = av->storage.v.ind.base; - src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; - i32 off; - u32 base = addr_base(t, src, &off, AA_TMP0); - aa64_emit32(t->mc, aa64_ldur_fp(sidx, dst_reg, base, off)); - break; - } - default: - compiler_panic(t->c, a->loc, - "aarch64 call: FP arg storage kind %d unsupported", - (int)av->storage.kind); - } - } else { - switch (av->storage.kind) { - case OPK_REG: - aa64_emit32(t->mc, aa64_stur_fp(sidx, reg_num(av->storage), 31, - (i32)*stack_off)); - break; - case OPK_INDIRECT: { - Operand src; - memset(&src, 0, sizeof src); - src.kind = OPK_INDIRECT; - src.v.ind.base = av->storage.v.ind.base; - src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; - i32 off; - u32 base = addr_base(t, src, &off, AA_TMP0); - aa64_emit32(t->mc, aa64_ldur_fp(sidx, AA_FP_TMP0, base, off)); - aa64_emit32(t->mc, aa64_stur_fp(sidx, AA_FP_TMP0, 31, (i32)*stack_off)); - break; - } - default: - compiler_panic( - t->c, a->loc, - "aarch64 call: FP stack-arg storage kind %d unsupported", - (int)av->storage.kind); - } - *stack_off += 8; - } - } else { - compiler_panic(t->c, a->loc, "aarch64 call: ABI class %d unimpl", - (int)pt->cls); - } - } -} - -static void aa_call(CGTarget* t, const CGCallDesc* d) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - u32 next_int = 0, next_fp = 0, stack_off = 0; - - if (d->abi && d->abi->has_sret) { - if (d->ret.storage.kind != OPK_LOCAL) { - compiler_panic(t->c, a->loc, - "aarch64 call: sret destination must be LOCAL"); - } - AASlot* s = aa64_slot_get(a, d->ret.storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad sret slot"); - aa64_emit32(mc, aa64_sub_imm(1, 8, 29, s->off, 0)); - } - - for (u32 i = 0; i < d->nargs; ++i) { - emit_arg_value(t, d->abi, &d->args[i], &next_int, 
&next_fp, &stack_off); - } - - u32 needed = (stack_off + 15u) & ~15u; - if (needed > a->max_outgoing) a->max_outgoing = needed; - - if (d->callee.kind == OPK_GLOBAL) { - u32 bl_pos = mc->pos(mc); - aa64_emit32(mc, aa64_bl_base()); - mc->emit_reloc_at(mc, mc->section_id, bl_pos, R_AARCH64_CALL26, - d->callee.v.global.sym, d->callee.v.global.addend, 0, 0); - } else if (d->callee.kind == OPK_REG) { - aa64_emit32(mc, aa64_blr(reg_num(d->callee))); - } else { - compiler_panic(t->c, a->loc, "aarch64 call: callee kind %d unsupported", - (int)d->callee.kind); - } - - const ABIArgInfo* ri = &d->abi->ret; - if (ri->kind == ABI_ARG_IGNORE || ri->kind == ABI_ARG_INDIRECT) { - return; - } - if (ri->nparts == 0) return; - - Operand rs = d->ret.storage; - u32 next_int_ret = 0, next_fp_ret = 0; - for (u16 i = 0; i < ri->nparts; ++i) { - const ABIArgPart* p = &ri->parts[i]; - u32 src_reg; - if (p->cls == ABI_CLASS_INT) { - src_reg = next_int_ret++; - } else if (p->cls == ABI_CLASS_FP) { - src_reg = next_fp_ret++; - } else { - compiler_panic(t->c, a->loc, "aarch64 call: ret part cls %d unimpl", - (int)p->cls); - } - - if (rs.kind == OPK_REG) { - if (ri->nparts != 1) { - compiler_panic(t->c, a->loc, - "aarch64 call: REG ret_storage with %u parts", - (unsigned)ri->nparts); - } - if (p->cls == ABI_CLASS_INT) { - u32 sf = (p->size == 8) ? 1u : 0u; - aa64_emit32(mc, aa64_mov_reg(sf, reg_num(rs), src_reg)); - } else { - u32 type = (p->size == 8) ? 
1u : 0u; - aa64_emit32(mc, aa64_fmov_reg(type, reg_num(rs), src_reg)); - } - } else if (rs.kind == OPK_LOCAL || rs.kind == OPK_INDIRECT) { - u32 base_reg; - i32 base_off; - if (rs.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, rs.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot"); - base_reg = 29; - base_off = -(i32)s->off; - } else { - base_reg = rs.v.ind.base & 0x1f; - base_off = rs.v.ind.ofs; - } - u32 sidx = size_idx_for_bytes(p->size); - i32 off = base_off + (i32)p->src_offset; - if (p->cls == ABI_CLASS_INT) { - aa64_emit32(mc, aa64_stur(sidx, src_reg, base_reg, off)); - } else { - aa64_emit32(mc, aa64_stur_fp(sidx, src_reg, base_reg, off)); - } - } else if (rs.kind == OPK_IMM && rs.type == CG_BUILTIN_ID(CFREE_CG_BUILTIN_VOID)) { - /* void return placeholder */ - } else { - compiler_panic(t->c, a->loc, - "aarch64 call: ret_storage kind %d unsupported", - (int)rs.kind); - } - } -} - -static void aa_ret(CGTarget* t, const CGABIValue* val) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - if (val) { - const ABIArgInfo* ri = val->abi; - if (ri && ri->kind == ABI_ARG_INDIRECT) { - if (val->storage.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad sret slot"); - if (a->sret_ptr_slot != FRAME_SLOT_NONE) { - AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot); - if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off)); - } - u32 nbytes = s->size; - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP0, 29, -(i32)s->off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_ldur(2, AA_TMP0, 29, -(i32)s->off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_ldur(1, AA_TMP0, 29, -(i32)s->off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); - i += 2; - } 
- while (i < nbytes) { - aa64_emit32(mc, aa64_ldur(0, AA_TMP0, 29, -(i32)s->off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); - i += 1; - } - } else if (val->storage.kind == OPK_INDIRECT) { - u32 nbytes = val->size; - if (!nbytes) { - compiler_panic(t->c, a->loc, - "aarch64 ret indirect: missing aggregate size"); - } - if (a->sret_ptr_slot != FRAME_SLOT_NONE) { - AASlot* sp = aa64_slot_get(a, a->sret_ptr_slot); - if (sp) aa64_emit32(mc, aa64_ldur(3, 8, 29, -(i32)sp->off)); - } - u32 base_reg = val->storage.v.ind.base & 0x1f; - i32 base_off = val->storage.v.ind.ofs; - u32 i = 0; - while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP0, base_reg, base_off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); - i += 8; - } - while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_ldur(2, AA_TMP0, base_reg, base_off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); - i += 4; - } - while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_ldur(1, AA_TMP0, base_reg, base_off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); - i += 2; - } - while (i < nbytes) { - aa64_emit32(mc, aa64_ldur(0, AA_TMP0, base_reg, base_off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); - i += 1; - } - } else { - compiler_panic(t->c, a->loc, - "aarch64 ret indirect: storage kind %d unsupported", - (int)val->storage.kind); - } - } else if (val->storage.kind == OPK_REG) { - if (val->storage.cls == RC_FP) { - u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u; - aa64_emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage))); - } else { - u32 sf = type_is_64(val->storage.type) ? 1u : 0u; - aa64_emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage))); - } - } else if (val->storage.kind == OPK_IMM) { - u32 sf = type_is_64(val->storage.type) ? 
1u : 0u; - aa64_emit_load_imm(mc, sf, /*Rd=*/0, val->storage.v.imm); - } else if (val->storage.kind == OPK_LOCAL || - val->storage.kind == OPK_INDIRECT) { - u32 base_reg; - i32 base_off; - if (val->storage.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, val->storage.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 ret: bad local slot"); - base_reg = 29; - base_off = -(i32)s->off; - } else { - base_reg = val->storage.v.ind.base & 0x1f; - base_off = val->storage.v.ind.ofs; - } - const ABIArgInfo* ri2 = val->abi; - for (u16 i = 0; i < (ri2 ? ri2->nparts : 0); ++i) { - const ABIArgPart* pt = &ri2->parts[i]; - u32 sidx = size_idx_for_bytes(pt->size); - i32 off = base_off + (i32)pt->src_offset; - if (pt->cls == ABI_CLASS_INT) { - aa64_emit32(mc, aa64_ldur(sidx, /*Rt=*/i, base_reg, off)); - } else if (pt->cls == ABI_CLASS_FP) { - aa64_emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, base_reg, off)); - } else { - compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl", - (int)pt->cls); - } - } - } - } - u32 bpos = mc->pos(mc); - aa64_emit32(mc, aa64_b_base()); - mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0); - (void)bpos; -} - -/* ============================================================ - * alloca - * ============================================================ */ - -static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - - if (d.kind != OPK_REG) { - compiler_panic(t->c, a->loc, "aarch64 alloca: dst must be REG"); - } - if (align > 16) { - compiler_panic(t->c, a->loc, - "aarch64 alloca: align %u > 16 not yet supported", align); - } - - if (sz.kind == OPK_IMM) { - i64 v = sz.v.imm; - if (v < 0) { - compiler_panic(t->c, a->loc, "aarch64 alloca: negative size"); - } - u64 aligned = ((u64)v + 15u) & ~(u64)15u; - if (aligned == 0) aligned = 16; - if (aligned > 0xfffu) { - compiler_panic(t->c, a->loc, - "aarch64 alloca: const size %llu too large for v1", - (unsigned 
long long)aligned); - } - aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=SP*/ 31, (u32)aligned, 0)); - } else if (sz.kind == OPK_REG) { - u32 sz_reg = reg_num(sz); - aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, sz_reg, 15u, 0)); - aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 4, 63)); - aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 60, 59)); - aa64_emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/ 31, /*SP*/ 31, AA_TMP0)); - } else { - compiler_panic(t->c, a->loc, "aarch64 alloca: size kind %d unsupported", - (int)sz.kind); - } - - if (a->nadd_patches == a->add_patches_cap) { - u32 ncap = a->add_patches_cap ? a->add_patches_cap * 2 : 4; - struct AAAllocaPatch* nb = - arena_array(t->c->tu, struct AAAllocaPatch, ncap); - if (a->add_patches) - memcpy(nb, a->add_patches, sizeof(*nb) * a->nadd_patches); - a->add_patches = nb; - a->add_patches_cap = ncap; - } - u32 dst_reg = reg_num(d); - a->add_patches[a->nadd_patches].pos = mc->pos(mc); - a->add_patches[a->nadd_patches].dst_reg = dst_reg; - a->nadd_patches++; - aa64_emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/ 31, 0, 0)); - a->has_alloca = 1; -} - -/* ============================================================ - * Varargs - * ============================================================ */ - -static void emit_fp_off(MCEmitter* mc, u32 dst, i32 ofs) { - if (ofs == 0) - aa64_emit32(mc, aa64_mov_reg(1, dst, 29)); - else if (ofs > 0 && (u32)ofs <= 0xfff) - aa64_emit32(mc, aa64_add_imm(1, dst, 29, (u32)ofs, 0)); - else if (ofs < 0 && (u32)(-ofs) <= 0xfff) - aa64_emit32(mc, aa64_sub_imm(1, dst, 29, (u32)(-ofs), 0)); - else { - aa64_emit_load_imm(mc, 1, dst, ofs); - aa64_emit32(mc, aa64_add(1, dst, 29, dst)); - } -} - -static void aa_va_start_(CGTarget* t, Operand ap_op) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - if (!a->is_variadic) { - compiler_panic(t->c, a->loc, "aarch64 va_start: function not variadic"); - } - u32 ap = reg_num(ap_op); - AASlot* gs = aa64_slot_get(a, a->gp_save_slot); - AASlot* fs = aa64_slot_get(a, 
a->fp_save_slot); - - { - u32 ofs = 16u + a->next_param_stack; - if (ofs <= 0xfff) - aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0)); - else { - aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs); - aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0)); - } - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0)); - } - emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)gs->size); - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 8)); - emit_fp_off(mc, AA_TMP0, -(i32)fs->off + (i32)fs->size); - aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 16)); - aa64_emit_load_imm(mc, 0, AA_TMP0, - (i64)((i32)(a->next_param_int * 8u) - 64)); - aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 24)); - aa64_emit_load_imm(mc, 0, AA_TMP0, - (i64)((i32)(a->next_param_fp * 16u) - 128)); - aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 28)); -} - -static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op, - CfreeCgTypeId ty) { - MCEmitter* mc = t->mc; - u32 ap = reg_num(ap_op); - int is_fp = (dst.cls == RC_FP); - u32 offs_field = is_fp ? 28u : 24u; - u32 top_field = is_fp ? 16u : 8u; - u32 stride_reg = is_fp ? 
16u : 8u; - u32 sz = type_byte_size(ty); - u32 sidx = size_idx_for_bytes(sz); - - MCLabel L_stack = mc->label_new(mc); - MCLabel L_done = mc->label_new(mc); - - aa64_emit32(mc, aa64_ldur(2, AA_TMP0, ap, (i32)offs_field)); - aa64_emit32(mc, aa64_subs_imm(0, 31, AA_TMP0, 0)); - aa64_emit32(mc, aa64_b_cond(0xa /*GE*/)); - mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0); - - aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, (i32)top_field)); - aa64_emit32(mc, aa64_sbfm(1, AA_TMP2, AA_TMP0, 0, 31)); - aa64_emit32(mc, aa64_add(1, AA_TMP2, AA_TMP1, AA_TMP2)); - if (is_fp) - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP2, 0)); - else - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP2, 0)); - aa64_emit32(mc, aa64_add_imm(0, AA_TMP0, AA_TMP0, stride_reg, 0)); - aa64_emit32(mc, aa64_stur(2, AA_TMP0, ap, (i32)offs_field)); - aa64_emit32(mc, aa64_b_base()); - mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); - - mc->label_place(mc, L_stack); - aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0)); - if (is_fp) - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0)); - else - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0)); - aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0)); - aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0)); - - mc->label_place(mc, L_done); -} - -static void aa_va_end_(CGTarget* t, Operand a) { - (void)t; - (void)a; -} - -static void aa_va_copy_(CGTarget* t, Operand d, Operand s) { - MCEmitter* mc = t->mc; - u32 dr = reg_num(d); - u32 sr = reg_num(s); - for (u32 i = 0; i < 32u; i += 8u) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, (i32)i)); - } -} - -/* ============================================================ - * Atomics - * ============================================================ */ - -static inline u32 aa64_ldar(u32 sf64, u32 Rt, u32 Rn) { - return (sf64 ? 
0xC8DFFC00u : 0x88DFFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stlr(u32 sf64, u32 Rt, u32 Rn) { - return (sf64 ? 0xC89FFC00u : 0x889FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldxr(u32 sf64, u32 Rt, u32 Rn) { - return (sf64 ? 0xC85F7C00u : 0x885F7C00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_ldaxr(u32 sf64, u32 Rt, u32 Rn) { - return (sf64 ? 0xC85FFC00u : 0x885FFC00u) | ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) { - return (sf64 ? 0xC8007C00u : 0x88007C00u) | ((Rs & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_stlxr(u32 sf64, u32 Rs, u32 Rt, u32 Rn) { - return (sf64 ? 0xC800FC00u : 0x8800FC00u) | ((Rs & 0x1f) << 16) | - ((Rn & 0x1f) << 5) | (Rt & 0x1f); -} -static inline u32 aa64_cbnz(u32 sf64, u32 Rt) { - return 0x35000000u | (sf64 << 31) | (Rt & 0x1f); -} - -static int mem_order_is_acquire(MemOrder o) { - return o == MO_ACQUIRE || o == MO_ACQ_REL || o == MO_SEQ_CST || - o == MO_CONSUME; -} -static int mem_order_is_release(MemOrder o) { - return o == MO_RELEASE || o == MO_ACQ_REL || o == MO_SEQ_CST; -} - -static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma, - MemOrder ord) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 sf = (ma.size == 8) ? 
1u : 0u; - - u32 base; - if (addr.kind == OPK_REG) { - base = reg_num(addr); - } else if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot"); - base = AA_TMP0; - aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0)); - } else { - compiler_panic(t->c, a->loc, - "aarch64 atomic_load: addr kind %d unsupported", - (int)addr.kind); - } - if (mem_order_is_acquire(ord)) { - aa64_emit32(mc, aa64_ldar(sf, reg_num(dst), base)); - } else { - u32 sidx = size_idx_for_bytes(ma.size); - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), base, 0)); - } -} - -static void aa_atomic_store(CGTarget* t, Operand addr, Operand src, - MemAccess ma, MemOrder ord) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 sf = (ma.size == 8) ? 1u : 0u; - - u32 src_reg; - if (src.kind == OPK_IMM) { - src_reg = AA_TMP1; - aa64_emit_load_imm(mc, sf, src_reg, src.v.imm); - } else if (src.kind == OPK_REG) { - src_reg = reg_num(src); - } else { - compiler_panic(t->c, a->loc, - "aarch64 atomic_store: src kind %d unsupported", - (int)src.kind); - } - u32 base; - if (addr.kind == OPK_REG) { - base = reg_num(addr); - } else if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot"); - base = AA_TMP0; - aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0)); - } else { - compiler_panic(t->c, a->loc, - "aarch64 atomic_store: addr kind %d unsupported", - (int)addr.kind); - } - if (mem_order_is_release(ord)) { - aa64_emit32(mc, aa64_stlr(sf, src_reg, base)); - } else { - u32 sidx = size_idx_for_bytes(ma.size); - aa64_emit32(mc, aa64_stur(sidx, src_reg, base, 0)); - } -} - -static void emit_rmw_combine(MCEmitter* mc, AtomicOp op, u32 sf, u32 dst_new, - u32 prior, u32 val) { - switch (op) { - case AO_XCHG: aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val)); break; - case AO_ADD: aa64_emit32(mc, aa64_add(sf, dst_new, prior, 
val)); break; - case AO_SUB: aa64_emit32(mc, aa64_sub(sf, dst_new, prior, val)); break; - case AO_AND: aa64_emit32(mc, aa64_and(sf, dst_new, prior, val)); break; - case AO_OR: aa64_emit32(mc, aa64_orr(sf, dst_new, prior, val)); break; - case AO_XOR: aa64_emit32(mc, aa64_eor(sf, dst_new, prior, val)); break; - case AO_NAND: - aa64_emit32(mc, aa64_and(sf, dst_new, prior, val)); - aa64_emit32(mc, aa64_mvn(sf, dst_new, dst_new)); - break; - default: - aa64_emit32(mc, aa64_mov_reg(sf, dst_new, val)); - break; - } -} - -static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr, - Operand val, MemAccess ma, MemOrder ord) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 sf = (ma.size == 8) ? 1u : 0u; - - u32 base = AA_TMP0; - if (addr.kind == OPK_REG) { - aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr))); - } else if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot"); - aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0)); - } else { - compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported", - (int)addr.kind); - } - u32 vreg = AA_TMP1; - if (val.kind == OPK_IMM) { - aa64_emit_load_imm(mc, sf, vreg, val.v.imm); - } else if (val.kind == OPK_REG) { - aa64_emit32(mc, aa64_mov_reg(sf, vreg, reg_num(val))); - } else { - compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: val kind %d unsupported", - (int)val.kind); - } - - int do_acq = mem_order_is_acquire(ord); - int do_rel = mem_order_is_release(ord); - - MCLabel L_retry = mc->label_new(mc); - mc->label_place(mc, L_retry); - - if (do_acq) - aa64_emit32(mc, aa64_ldaxr(sf, reg_num(dst), base)); - else - aa64_emit32(mc, aa64_ldxr(sf, reg_num(dst), base)); - - emit_rmw_combine(mc, op, sf, AA_TMP2, reg_num(dst), vreg); - - if (do_rel) - aa64_emit32(mc, aa64_stlxr(sf, vreg, AA_TMP2, base)); - else - aa64_emit32(mc, aa64_stxr(sf, vreg, AA_TMP2, base)); - - u32 cbnz_pos = 
mc->pos(mc); - aa64_emit32(mc, aa64_cbnz(0, vreg)); - mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); - (void)cbnz_pos; -} - -static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr, - Operand expected, Operand desired, MemAccess ma, - MemOrder succ, MemOrder fail) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - u32 sf = (ma.size == 8) ? 1u : 0u; - (void)fail; - - u32 base = AA_TMP0; - if (addr.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr))); - else if (addr.kind == OPK_LOCAL) { - AASlot* s = aa64_slot_get(a, addr.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot"); - aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0)); - } else { - compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported", - (int)addr.kind); - } - if (expected.kind == OPK_IMM) - aa64_emit_load_imm(mc, sf, AA_TMP1, expected.v.imm); - else if (expected.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP1, reg_num(expected))); - else - compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported", - (int)expected.kind); - if (desired.kind == OPK_IMM) - aa64_emit_load_imm(mc, sf, AA_TMP2, desired.v.imm); - else if (desired.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP2, reg_num(desired))); - else - compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported", - (int)desired.kind); - - int do_acq = mem_order_is_acquire(succ); - int do_rel = mem_order_is_release(succ); - - MCLabel L_retry = mc->label_new(mc); - MCLabel L_fail = mc->label_new(mc); - MCLabel L_done = mc->label_new(mc); - - mc->label_place(mc, L_retry); - if (do_acq) - aa64_emit32(mc, aa64_ldaxr(sf, reg_num(prior), base)); - else - aa64_emit32(mc, aa64_ldxr(sf, reg_num(prior), base)); - - aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, reg_num(prior), AA_TMP1)); - aa64_emit32(mc, aa64_b_cond(0x1u /*NE*/)); - mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0); - - 
if (do_rel) - aa64_emit32(mc, aa64_stlxr(sf, AA_TMP1, AA_TMP2, base)); - else - aa64_emit32(mc, aa64_stxr(sf, AA_TMP1, AA_TMP2, base)); - aa64_emit32(mc, aa64_cbnz(0, AA_TMP1)); - mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); - - aa64_emit_load_imm(mc, 0, reg_num(ok), 1); - aa64_emit32(mc, aa64_b_base()); - mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); - - mc->label_place(mc, L_fail); - aa64_emit32(mc, aa64_clrex(AA64_BARRIER_OPT_SY)); - aa64_emit_load_imm(mc, 0, reg_num(ok), 0); - - mc->label_place(mc, L_done); -} - -static void aa_fence(CGTarget* t, MemOrder o) { - (void)o; - if (o == MO_RELAXED) return; - aa64_emit32(t->mc, aa64_dmb(AA64_BARRIER_OPT_ISH)); -} - -/* ============================================================ - * Intrinsics - * ============================================================ */ - -static inline u32 aa64_rev16_w(u32 Rd, u32 Rn) { - return 0x5AC00400u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_rev_w(u32 Rd, u32 Rn) { - return 0x5AC00800u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_rev_x(u32 Rd, u32 Rn) { - return 0xDAC00C00u | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_rbit(u32 sf64, u32 Rd, u32 Rn) { - return (sf64 ? 0xDAC00000u : 0x5AC00000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_clz(u32 sf64, u32 Rd, u32 Rn) { - return (sf64 ? 
0xDAC01000u : 0x5AC01000u) | ((Rn & 0x1f) << 5) | (Rd & 0x1f); -} -static inline u32 aa64_cnt_8b(u32 Vd, u32 Vn) { - return 0x0E205800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f); -} -static inline u32 aa64_addv_b_8b(u32 Vd, u32 Vn) { - return 0x0E31B800u | ((Vn & 0x1f) << 5) | (Vd & 0x1f); -} -static inline u32 aa64_adds_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) { - return 0x2B000000u | (sf << 31) | ((Rm & 0x1f) << 16) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} -static inline u32 aa64_smaddl(u32 Rd, u32 Rn, u32 Rm, u32 Ra) { - return aa64_dp3_pack((AA64DP3){ - .sf = 1, .op31 = 1, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd}); -} -static inline u32 aa64_smull(u32 Rd, u32 Rn, u32 Rm) { - return aa64_smaddl(Rd, Rn, Rm, AA64_ZR); -} -static inline u32 aa64_subs_extreg_x_sxtw(u32 Rd, u32 Rn, u32 Rm) { - return 0xEB200000u | ((Rm & 0x1f) << 16) | (6u << 13) | ((Rn & 0x1f) << 5) | - (Rd & 0x1f); -} - -static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, - const Operand* args, u32 na) { - AAImpl* a = impl_of(t); - MCEmitter* mc = t->mc; - (void)nd; - - switch (kind) { - case INTRIN_POPCOUNT: { - Operand src = args[0]; - Operand dst = dsts[0]; - u32 sz_in = type_byte_size(src.type); - if (sz_in == 8) - aa64_emit32(mc, aa64_fmov_d_x(AA_FP_TMP0, reg_num(src))); - else - aa64_emit32(mc, aa64_fmov_s_w(AA_FP_TMP0, reg_num(src))); - aa64_emit32(mc, aa64_cnt_8b(AA_FP_TMP0, AA_FP_TMP0)); - aa64_emit32(mc, aa64_addv_b_8b(AA_FP_TMP0, AA_FP_TMP0)); - aa64_emit32(mc, aa64_fmov_w_s(reg_num(dst), AA_FP_TMP0)); - return; - } - case INTRIN_CLZ: { - Operand src = args[0]; - Operand dst = dsts[0]; - u32 sf = type_is_64(src.type) ? 1u : 0u; - aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(src))); - return; - } - case INTRIN_CTZ: { - Operand src = args[0]; - Operand dst = dsts[0]; - u32 sf = type_is_64(src.type) ? 
1u : 0u; - aa64_emit32(mc, aa64_rbit(sf, reg_num(dst), reg_num(src))); - aa64_emit32(mc, aa64_clz(sf, reg_num(dst), reg_num(dst))); - return; - } - case INTRIN_BSWAP16: { - aa64_emit32(mc, aa64_rev16_w(reg_num(dsts[0]), reg_num(args[0]))); - return; - } - case INTRIN_BSWAP32: { - aa64_emit32(mc, aa64_rev_w(reg_num(dsts[0]), reg_num(args[0]))); - return; - } - case INTRIN_BSWAP64: { - aa64_emit32(mc, aa64_rev_x(reg_num(dsts[0]), reg_num(args[0]))); - return; - } - case INTRIN_MEMCPY: - case INTRIN_MEMMOVE: { - Operand da = args[0], sa = args[1], nb = args[2]; - if (da.kind != OPK_REG || sa.kind != OPK_REG || nb.kind != OPK_IMM) { - compiler_panic(t->c, a->loc, - "aarch64 intrinsic: %s with non-const n or non-REG ptr", - kind == INTRIN_MEMCPY ? "memcpy" : "memmove"); - } - u32 dr = reg_num(da); - u32 sr = reg_num(sa); - u32 n = (u32)nb.v.imm; - if (kind == INTRIN_MEMCPY) { - u32 i = 0; - while (i + 8 <= n) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); - i += 8; - } - while (i + 4 <= n) { - aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); - i += 4; - } - while (i + 2 <= n) { - aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); - i += 2; - } - while (i < n) { - aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); - i += 1; - } - } else { - u32 i = n; - while (i >= 8) { - i -= 8; - aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); - } - while (i >= 4) { - i -= 4; - aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); - } - while (i >= 2) { - i -= 2; - aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); - } - while (i >= 1) { - i -= 1; - aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, 
(i32)i)); - aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); - } - } - return; - } - case INTRIN_MEMSET: { - Operand da = args[0], bv = args[1], nb = args[2]; - if (da.kind != OPK_REG || nb.kind != OPK_IMM) { - compiler_panic( - t->c, a->loc, - "aarch64 intrinsic: memset with non-const n / non-REG ptr"); - } - u32 dr = reg_num(da); - u32 n = (u32)nb.v.imm; - u32 byte; - u32 src_reg; - if (bv.kind == OPK_IMM) { - byte = (u32)(bv.v.imm & 0xffu); - if (byte == 0) { - src_reg = 31u; - } else { - u64 b64 = byte; - b64 |= b64 << 8; - b64 |= b64 << 16; - b64 |= b64 << 32; - aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)b64); - src_reg = AA_TMP2; - } - } else if (bv.kind == OPK_REG) { - aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)0x0101010101010101ll); - aa64_emit32(mc, aa64_madd(1, AA_TMP2, reg_num(bv), AA_TMP2, AA64_ZR)); - src_reg = AA_TMP2; - } else { - compiler_panic(t->c, a->loc, - "aarch64 intrinsic: memset byte kind %d unsupported", - (int)bv.kind); - } - u32 i = 0; - while (i + 8 <= n) { - aa64_emit32(mc, aa64_stur(3, src_reg, dr, (i32)i)); - i += 8; - } - while (i + 4 <= n) { - aa64_emit32(mc, aa64_stur(2, src_reg, dr, (i32)i)); - i += 4; - } - while (i + 2 <= n) { - aa64_emit32(mc, aa64_stur(1, src_reg, dr, (i32)i)); - i += 2; - } - while (i < n) { - aa64_emit32(mc, aa64_stur(0, src_reg, dr, (i32)i)); - i += 1; - } - return; - } - case INTRIN_PREFETCH: - (void)args; - (void)na; - return; - case INTRIN_ASSUME_ALIGNED: { - Operand src = args[0]; - Operand dst = dsts[0]; - if (reg_num(src) != reg_num(dst)) { - aa64_emit32(mc, aa64_mov_reg(1, reg_num(dst), reg_num(src))); - } - return; - } - case INTRIN_EXPECT: { - Operand val = args[0]; - Operand dst = dsts[0]; - u32 sf = type_is_64(dst.type) ? 
1u : 0u; - if (val.kind == OPK_REG) { - if (reg_num(val) != reg_num(dst)) { - aa64_emit32(mc, aa64_mov_reg(sf, reg_num(dst), reg_num(val))); - } - } else if (val.kind == OPK_IMM) { - aa64_emit_load_imm(mc, sf, reg_num(dst), val.v.imm); - } else { - compiler_panic(t->c, a->loc, - "aarch64 intrinsic: expect val kind %d unsupported", - (int)val.kind); - } - return; - } - case INTRIN_UNREACHABLE: - case INTRIN_TRAP: - aa64_emit32(mc, aa64_brk(kind == INTRIN_TRAP ? 1u : 0u)); - return; - case INTRIN_ADD_OVERFLOW: - case INTRIN_SUB_OVERFLOW: { - Operand a_op = args[0], b_op = args[1]; - Operand dval = dsts[0], dovf = dsts[1]; - u32 sf = type_is_64(dval.type) ? 1u : 0u; - u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0); - u32 rb = - aa64_force_reg_int(t, b_op, sf, - (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0); - u32 word = (kind == INTRIN_ADD_OVERFLOW) - ? aa64_adds_reg(sf, reg_num(dval), ra, rb) - : aa64_subs_reg(sf, reg_num(dval), ra, rb); - aa64_emit32(mc, word); - aa64_emit32(mc, aa64_cset(sf, reg_num(dovf), 0x6u /*VS*/)); - return; - } - case INTRIN_MUL_OVERFLOW: { - Operand a_op = args[0], b_op = args[1]; - Operand dval = dsts[0], dovf = dsts[1]; - u32 sf = type_is_64(dval.type) ? 1u : 0u; - if (sf) { - compiler_panic( - t->c, a->loc, - "aarch64 intrinsic: mul_overflow on i64 not yet supported"); - } - u32 ra = aa64_force_reg_int(t, a_op, 0, AA_TMP0); - u32 rb = - aa64_force_reg_int(t, b_op, 0, - (ra == AA_TMP0) ? 
AA_TMP1 : AA_TMP0); - aa64_emit32(mc, aa64_smull(AA_TMP2, ra, rb)); - aa64_emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/ 31u, AA_TMP2, AA_TMP2)); - aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/)); - aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2)); - return; - } - default: - compiler_panic(t->c, a->loc, "aarch64 intrinsic: kind %d unsupported", - (int)kind); - } -} - -/* ============================================================ - * Inline asm block - * ============================================================ */ - -static void aa_asm_block(CGTarget* t, const char* tmpl, - const AsmConstraint* outs, u32 no, Operand* oo, - const AsmConstraint* ins, u32 ni, const Operand* io, - const Sym* clobs, u32 nc) { - AAImpl* a_impl = impl_of(t); - for (u32 i = 0; i < nc; ++i) { - Reg phys; - RegClass cls; - if (t->resolve_reg_name(t, clobs[i], &phys, &cls) != 0) continue; - if (cls == RC_INT) { - if (phys >= 19u && phys <= 28u) - a_impl->used_cs_int_mask |= 1u << phys; - } else if (cls == RC_FP) { - if (phys >= 8u && phys <= 15u) - a_impl->used_cs_fp_mask |= 1u << phys; - } - } - AA64Asm* a = aa64_asm_open(t->c); - aa64_inline_bind(a, outs, no, oo, ins, ni, io, clobs, nc); - aa64_asm_run_template(a, t->mc, tmpl); - aa64_asm_close(a); -} - -/* ============================================================ - * Lifecycle / vtable constructor - * ============================================================ */ - -static void aa_set_loc(CGTarget* t, SrcLoc loc) { - impl_of(t)->loc = loc; - t->mc->set_loc(t->mc, loc); -} - -static void aa_finalize(CGTarget* t) { (void)t; } - -static void aa_destroy(CGTarget* t) { (void)t; } - -static void cgt_cleanup(void* arg) { cgtarget_free((CGTarget*)arg); } - -CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) { - AAImpl* a = arena_new(c->tu, AAImpl); - memset(a, 0, sizeof *a); - - CGTarget* t = &a->base; - t->c = c; - t->obj = o; - t->mc = m; - - t->func_begin = aa_func_begin; - t->func_end = aa_func_end; - 
t->frame_slot = aa_frame_slot; - t->param = aa_param; - - t->load_imm = aa_load_imm; - t->load_const = aa_load_const; - t->copy = aa_copy; - t->load = aa_load; - t->store = aa_store; - t->addr_of = aa_addr_of; - t->tls_addr_of = aa_tls_addr_of; - t->copy_bytes = aa_copy_bytes; - t->set_bytes = aa_set_bytes; - t->bitfield_load = aa_bitfield_load; - t->bitfield_store = aa_bitfield_store; - - t->binop = aa_binop; - t->unop = aa_unop; - t->convert = aa_convert; - - t->call = aa_call; - t->ret = aa_ret; - - t->alloca_ = aa_alloca_; - t->va_start_ = aa_va_start_; - t->va_arg_ = aa_va_arg_; - t->va_end_ = aa_va_end_; - t->va_copy_ = aa_va_copy_; - - t->atomic_load = aa_atomic_load; - t->atomic_store = aa_atomic_store; - t->atomic_rmw = aa_atomic_rmw; - t->atomic_cas = aa_atomic_cas; - t->fence = aa_fence; - - t->intrinsic = aa_intrinsic; - t->asm_block = aa_asm_block; - - t->set_loc = aa_set_loc; - t->finalize = aa_finalize; - t->destroy = aa_destroy; - - /* alloc/label/scope vtable entries */ - aa_alloc_vtable_init(t); - aa_coord_vtable_init(t); - - /* Suppress unused warning. */ - (void)type_is_signed; - - compiler_defer(c, cgt_cleanup, t); - return t; -} diff --git a/src/arch/aarch64/opt_coord.c b/src/arch/aarch64/opt_coord.c @@ -1,96 +0,0 @@ -/* aarch64/opt_coord.c — opt/backend register coordination hooks. - * Static arrays so opt_machinize can query the backend instead of - * hard-coding arch knowledge. */ - -#include "arch/aarch64/internal.h" - -/* ============================================================ - * Static register tables reported to caller-owned allocators. 
*/ - -static const Reg aa_int_allocable[] = {19, 20, 21, 22, 23, - 24, 25, 26, 27, 28}; -static const Reg aa_fp_allocable[] = {8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23}; - -static const Reg aa_int_scratch[] = {16, 17}; -static const Reg aa_fp_scratch[] = {24, 25}; - -/* ============================================================ - * Vtable methods */ - -static void aa_get_allocable_regs(CGTarget* t, RegClass cls, - const Reg** out, u32* nregs) { - (void)t; - switch (cls) { - case RC_INT: - *out = aa_int_allocable; - *nregs = sizeof aa_int_allocable / sizeof aa_int_allocable[0]; - break; - case RC_FP: - *out = aa_fp_allocable; - *nregs = sizeof aa_fp_allocable / sizeof aa_fp_allocable[0]; - break; - default: - *out = NULL; - *nregs = 0; - break; - } -} - -static void aa_get_scratch_regs(CGTarget* t, RegClass cls, - const Reg** out, u32* nregs) { - (void)t; - switch (cls) { - case RC_INT: - *out = aa_int_scratch; - *nregs = sizeof aa_int_scratch / sizeof aa_int_scratch[0]; - break; - case RC_FP: - *out = aa_fp_scratch; - *nregs = sizeof aa_fp_scratch / sizeof aa_fp_scratch[0]; - break; - default: - *out = NULL; - *nregs = 0; - break; - } -} - -static int aa_is_caller_saved(CGTarget* t, RegClass cls, Reg reg) { - (void)t; - switch (cls) { - case RC_INT: - /* AAPCS64 caller-saved: x0-x18, x30 */ - return reg <= 18 || reg == 30; - case RC_FP: - /* AAPCS64 caller-saved: v0-v7, v16-v31 */ - return reg <= 7 || reg >= 16; - default: - return 0; - } -} - -static void aa_reserve_hard_regs(CGTarget* t, RegClass cls, - const Reg* regs, u32 n) { - AAImpl* a = impl_of(t); - for (u32 i = 0; i < n; ++i) { - Reg r = regs[i]; - switch (cls) { - case RC_INT: - if (r >= 19u && r <= 28u) a->used_cs_int_mask |= 1u << r; - break; - case RC_FP: - if (r >= 8u && r <= 15u) a->used_cs_fp_mask |= 1u << r; - break; - default: - break; - } - } -} - -void aa_coord_vtable_init(CGTarget* t) { - t->get_allocable_regs = aa_get_allocable_regs; - t->get_scratch_regs = 
aa_get_scratch_regs; - t->is_caller_saved = aa_is_caller_saved; - t->reserve_hard_regs = aa_reserve_hard_regs; -} diff --git a/src/arch/rv64/arch.c b/src/arch/rv64/arch.c @@ -1,12 +1,14 @@ #include "arch/arch.h" #include "abi/abi_internal.h" -#include "arch/rv64.h" +#include "arch/rv64/rv64.h" #include "core/bytes.h" #include "link/link_arch.h" #include "obj/elf.h" #include "obj/obj.h" +extern const LinkArchDesc link_arch_rv64; + static const ABIVtable* rv64_abi_vtable(Compiler* c, CfreeOSKind os) { (void)c; (void)os; diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h @@ -5,8 +5,8 @@ #include <string.h> #include "arch/arch.h" -#include "arch/rv64.h" -#include "arch/rv64_isa.h" +#include "arch/rv64/rv64.h" +#include "arch/rv64/isa.h" #include "core/arena.h" #include "core/pool.h" #include "obj/obj.h" diff --git a/src/arch/rv64_isa.h b/src/arch/rv64/isa.h diff --git a/src/arch/rv64/link.c b/src/arch/rv64/link.c @@ -0,0 +1,95 @@ +/* RV64 link-time arch descriptor. See link_arch.h for the contract. + * + * The PLT0/PLT-entry/IPLT-stub byte layouts here mirror what used to + * live inline in link_dyn.c (PLT) and link_layout.c (IPLT) before the + * vtable refactor; comments preserve the WHY (notably the +0x800 bias + * on AUIPC immediates). */ + +#include "arch/rv64/isa.h" +#include "core/bytes.h" +#include "core/core.h" +#include "link/link_arch.h" +#include "obj/elf.h" + +/* PLT0 is 8 canonical NOPs (32 bytes); each PLT entry and IPLT stub is + * 4 instructions (16 bytes) / 3 instructions (12 bytes) respectively. + * Encoded once here so the descriptor and emitters stay in sync. */ +#define RV64_PLT0_SIZE 32u +#define RV64_PLT_ENTRY_SIZE 16u +#define RV64_IPLT_STUB_SIZE 12u + +/* Split a PC-relative displacement into the (hi20, lo12) pair consumed + * by the AUIPC + I-type sequence. 
/* Split a PC-relative displacement into the (hi20, lo12) pair consumed
 * by an AUIPC + I-type instruction sequence.
 *
 * lo12 is the raw low 12 bits of disp. The second instruction
 * sign-extends it, so whenever bit 11 is set it contributes a negative
 * amount and the pair would land 0x1000 short. Adding 0x800 before
 * taking the upper bits rounds hi20 up in exactly those cases.
 *
 * The bias is added in unsigned arithmetic so a displacement near
 * INT64_MAX wraps instead of overflowing a signed add. The result is
 * unchanged for every displacement the signed form could represent. */
static inline void rv64_split_pcrel(i64 disp, u32* hi20_out, u32* lo12_out) {
  u64 bits = (u64)disp;
  *hi20_out = (u32)((bits + 0x800u) >> 12) & 0xfffffu;
  *lo12_out = (u32)(bits & 0xfffu);
}
The stub->slot displacement is invariant under the + * segment-base shift (both addresses live in the same image), so we + * bake it directly into the instructions and report zero apply-time + * relocs — unlike aarch64, which cannot encode a 32-bit pcrel inline. */ +static u32 rv64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, + LinkArchIPltReloc out[2]) { + i64 disp = (i64)slot_vaddr - (i64)stub_vaddr; + u32 hi20; + u32 lo12; + (void)out; + rv64_split_pcrel(disp, &hi20, &lo12); + wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20)); + wr_u32_le(dst + 4, rv_ld(RV_T1, RV_T1, (i32)lo12)); + wr_u32_le(dst + 8, rv_jr(RV_T1)); + return 0u; +} + +const LinkArchDesc link_arch_rv64 = { + .e_machine = EM_RISCV, + .default_musl_interp = "/lib/ld-musl-riscv64.so.1", + /* RISC-V psABI has no dedicated GLOB_DAT — GOT-slot data imports + * use the generic absolute-64 reloc instead. */ + .elf_r_relative = ELF_R_RISCV_RELATIVE, + .elf_r_glob_dat = ELF_R_RISCV_64, + .elf_r_jump_slot = ELF_R_RISCV_JUMP_SLOT, + .plt0_size = RV64_PLT0_SIZE, + .plt_entry_size = RV64_PLT_ENTRY_SIZE, + .iplt_stub_size = RV64_IPLT_STUB_SIZE, + .global_pointer_symbol = "__global_pointer$", + .global_pointer_rw_offset = 0x800u, + .emit_plt0 = rv64_emit_plt0, + .emit_plt_entry = rv64_emit_plt_entry, + .emit_iplt_stub = rv64_emit_iplt_stub, +}; diff --git a/src/arch/rv64.h b/src/arch/rv64/rv64.h diff --git a/src/arch/x64/alloc.c b/src/arch/x64/alloc.c @@ -8,8 +8,8 @@ #include <string.h> #include "arch/arch.h" -#include "arch/x64.h" -#include "arch/x64_isa.h" +#include "arch/x64/x64.h" +#include "arch/x64/isa.h" #include "core/arena.h" #include "core/pool.h" #include "obj/obj.h" diff --git a/src/arch/x64/arch.c b/src/arch/x64/arch.c @@ -1,12 +1,14 @@ #include "arch/arch.h" #include "abi/abi_internal.h" -#include "arch/x64.h" +#include "arch/x64/x64.h" #include "core/bytes.h" #include "link/link_arch.h" #include "obj/elf.h" #include "obj/obj.h" +extern const LinkArchDesc link_arch_x64; + static const ABIVtable* 
x64_abi_vtable(Compiler* c, CfreeOSKind os) { (void)c; (void)os; diff --git a/src/arch/x64/emit.c b/src/arch/x64/emit.c @@ -7,8 +7,8 @@ #include <string.h> #include "arch/arch.h" -#include "arch/x64.h" -#include "arch/x64_isa.h" +#include "arch/x64/x64.h" +#include "arch/x64/isa.h" #include "core/arena.h" #include "core/pool.h" #include "obj/obj.h" diff --git a/src/arch/x64/internal.h b/src/arch/x64/internal.h @@ -6,15 +6,15 @@ * - Small type helpers (static inline) * - Forward declarations of cross-file functions * - * NOT included by external consumers; use arch/x64.h for the public API. */ + * NOT included by external consumers; use arch/x64/x64.h for the public API. */ #pragma once #include <string.h> #include "arch/arch.h" -#include "arch/x64.h" -#include "arch/x64_isa.h" +#include "arch/x64/x64.h" +#include "arch/x64/isa.h" #include "core/arena.h" #include "core/pool.h" #include "obj/obj.h" diff --git a/src/arch/x64/isa.h b/src/arch/x64/isa.h @@ -0,0 +1,128 @@ +/* x86_64 ISA helpers used by arch/x64.c. + * + * Only the constants here. Instruction encoders live in arch/x64.c + * because they're variable length and depend on the MCEmitter byte + * stream (REX prefix, ModR/M, SIB, displacement). The disassembler + * doesn't share these yet; if/when it does, a parallel x64_isa.c will + * host decode tables. */ + +#ifndef CFREE_X64_ISA_H +#define CFREE_X64_ISA_H + +#include "core/bytes.h" +#include "core/core.h" + +/* ---- GPR numbering (DWARF / ABI matches HW encoding 0..15) ---- */ +enum { + X64_RAX = 0, + X64_RCX = 1, + X64_RDX = 2, + X64_RBX = 3, + X64_RSP = 4, + X64_RBP = 5, + X64_RSI = 6, + X64_RDI = 7, + X64_R8 = 8, + X64_R9 = 9, + X64_R10 = 10, + X64_R11 = 11, + X64_R12 = 12, + X64_R13 = 13, + X64_R14 = 14, + X64_R15 = 15, +}; + +/* SSE register numbering — xmm0..xmm15 share encoding with r0..r15. 
*/ +enum { + X64_XMM0 = 0, + X64_XMM1 = 1, + X64_XMM2 = 2, + X64_XMM3 = 3, + X64_XMM4 = 4, + X64_XMM5 = 5, + X64_XMM6 = 6, + X64_XMM7 = 7, + X64_XMM8 = 8, + X64_XMM15 = 15, +}; + +/* Condition codes for Jcc / SETcc / CMOVcc. Encoded in the low nibble. */ +enum { + X64_CC_O = 0x0, + X64_CC_NO = 0x1, + X64_CC_B = 0x2, /* below / CF=1 → CMP_LT_U */ + X64_CC_AE = 0x3, /* above-or-equal / CF=0 → CMP_GE_U */ + X64_CC_E = 0x4, /* equal / ZF=1 → CMP_EQ */ + X64_CC_NE = 0x5, /* → CMP_NE */ + X64_CC_BE = 0x6, /* below-or-equal / CF=1 or ZF=1 → CMP_LE_U */ + X64_CC_A = 0x7, /* above / CF=0 and ZF=0 → CMP_GT_U */ + X64_CC_S = 0x8, + X64_CC_NS = 0x9, + X64_CC_P = 0xA, + X64_CC_NP = 0xB, + X64_CC_L = 0xC, /* less (signed) / SF!=OF → CMP_LT_S */ + X64_CC_GE = 0xD, /* → CMP_GE_S */ + X64_CC_LE = 0xE, /* less-or-equal (signed) → CMP_LE_S */ + X64_CC_G = 0xF, /* greater → CMP_GT_S */ +}; + +/* REX prefix is 0x40 | W<<3 | R<<2 | X<<1 | B. */ +#define X64_REX_BASE 0x40u +#define X64_REX_W 0x08u +#define X64_REX_R 0x04u +#define X64_REX_X 0x02u +#define X64_REX_B 0x01u + +/* ---- Branch / NOP encoding constants ---- + * + * Used by the linker to emit PLT entries and IPLT stubs without + * sprinkling raw hex into src/arch/x64/link.c. The shape is always the + * same RIP-relative indirect JMP plus padding NOPs. */ + +/* JMP r/m64 — opcode FF /4. ModR/M for the RIP+disp32 form is + * mod=00, reg=/4 (JMP m64), r/m=101 → 0x25. */ +#define X64_OP_JMP_RM64 0xFFu +#define X64_MODRM_JMP_RIPREL 0x25u + +/* Single-byte NOP. */ +#define X64_NOP1 0x90u + +/* Intel multi-byte ("long") NOP forms. The 6-byte form is the + * canonical IPLT-stub tail pad (NOPW 0(%rax,%rax,1)). */ +#define X64_NOP6_BYTE0 0x66u +#define X64_NOP6_BYTE1 0x0Fu +#define X64_NOP6_BYTE2 0x1Fu +#define X64_NOP6_BYTE3 0x44u +#define X64_NOP6_BYTE4 0x00u +#define X64_NOP6_BYTE5 0x00u + +/* Sizes of the encoded forms above. 
*/ +#define X64_JMP_RIPREL_SIZE 6u +#define X64_NOP6_SIZE 6u + +/* Write a 6-byte `jmp [rip + disp32]` (FF 25 disp32) at dst. */ +static inline void x64_write_jmp_riprel(u8* dst, i32 disp32) { + dst[0] = X64_OP_JMP_RM64; + dst[1] = X64_MODRM_JMP_RIPREL; + wr_u32_le(dst + 2, (u32)disp32); +} + +/* Fill nbytes at dst with single-byte NOPs (0x90). Matches the + * existing memset-then-patch pattern used to pad PLT entries to 16. */ +static inline void x64_write_nop_pad(u8* dst, u32 nbytes) { + u32 i; + for (i = 0; i < nbytes; ++i) dst[i] = X64_NOP1; +} + +/* Write the canonical 6-byte multi-byte NOP (66 0F 1F 44 00 00) at + * dst. Used to pad the IPLT stub from 6 → 12 bytes. */ +static inline void x64_write_nop6(u8* dst) { + dst[0] = X64_NOP6_BYTE0; + dst[1] = X64_NOP6_BYTE1; + dst[2] = X64_NOP6_BYTE2; + dst[3] = X64_NOP6_BYTE3; + dst[4] = X64_NOP6_BYTE4; + dst[5] = X64_NOP6_BYTE5; +} + +#endif diff --git a/src/arch/x64/link.c b/src/arch/x64/link.c @@ -0,0 +1,77 @@ +/* x86_64 link-time arch descriptor. + * + * Implements the LinkArchDesc contract from link/link_arch.h for + * EM_X86_64. The PLT/IPLT byte sequences here mirror the inline + * encodings previously living in link_dyn.c (PLT0 + per-import entry) + * and link_layout.c (IPLT stub) — kept identical byte-for-byte so the + * descriptor switchover is a pure refactor. All raw byte values come + * from named constants / inline writers in arch/x64/isa.h. */ + +#include "link/link_arch.h" + +#include "arch/x64/isa.h" +#include "core/bytes.h" +#include "core/core.h" +#include "obj/elf.h" + +/* PLT0 layout under DF_1_NOW: never executed (loader pre-binds every + * slot via .rela.plt before user code runs), so we just emit 32 bytes + * of single-byte NOPs. Self-documenting and trivially well-formed for + * disassemblers and unwinders that walk the section. 
*/ +static void x64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) { + (void)plt0_vaddr; + (void)gotplt_vaddr; + x64_write_nop_pad(dst, 32u); +} + +/* Per-import PLT entry (16 B): + * + * ff 25 disp32 ; jmpq *[rip + disp_to_slot] (6 B) + * 90 90 90 90 90 90 90 90 90 90 ; pad to 16 with single-byte NOPs + * + * disp32 is measured from the END of the JMP (entry_vaddr + 6) to the + * .got.plt slot. The 10-byte tail matches link_dyn.c's prior + * memset(0x90)+patch behavior exactly. */ +static void x64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) { + i64 disp = (i64)slot_vaddr - (i64)(entry_vaddr + X64_JMP_RIPREL_SIZE); + i32 disp32 = (i32)(u32)((u64)disp & 0xffffffffu); + x64_write_jmp_riprel(dst, disp32); + x64_write_nop_pad(dst + X64_JMP_RIPREL_SIZE, + 16u - X64_JMP_RIPREL_SIZE); +} + +/* IPLT (ifunc) trampoline stub (12 B): + * + * ff 25 disp32 ; jmpq *[rip + disp_to_slot] (6 B) + * 66 0f 1f 44 00 00 ; 6-byte multibyte NOP (6 B) + * + * Like the PLT entry, disp32 is from the END of the JMP to the + * .igot.plt slot. The displacement is invariant under image-base + * shift (both ends move together), so it's encoded inline and we + * report zero apply-time relocations. 
*/ +static u32 x64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, + LinkArchIPltReloc out[2]) { + (void)out; + i64 disp = (i64)slot_vaddr - (i64)(stub_vaddr + X64_JMP_RIPREL_SIZE); + i32 disp32 = (i32)(u32)((u64)disp & 0xffffffffu); + x64_write_jmp_riprel(dst, disp32); + x64_write_nop6(dst + X64_JMP_RIPREL_SIZE); + return 0; +} + +const LinkArchDesc link_arch_x64 = { + .e_machine = EM_X86_64, + .default_musl_interp = "/lib/ld-musl-x86_64.so.1", + + .elf_r_relative = ELF_R_X86_64_RELATIVE, + .elf_r_glob_dat = ELF_R_X86_64_GLOB_DAT, + .elf_r_jump_slot = ELF_R_X86_64_JUMP_SLOT, + + .plt0_size = 32u, + .plt_entry_size = 16u, + .iplt_stub_size = 12u, + + .emit_plt0 = x64_emit_plt0, + .emit_plt_entry = x64_emit_plt_entry, + .emit_iplt_stub = x64_emit_iplt_stub, +}; diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c @@ -13,8 +13,8 @@ #include <string.h> #include "arch/arch.h" -#include "arch/x64.h" -#include "arch/x64_isa.h" +#include "arch/x64/x64.h" +#include "arch/x64/isa.h" #include "core/arena.h" #include "core/pool.h" #include "obj/obj.h" diff --git a/src/arch/x64.h b/src/arch/x64/x64.h diff --git a/src/arch/x64_isa.h b/src/arch/x64_isa.h @@ -1,128 +0,0 @@ -/* x86_64 ISA helpers used by arch/x64.c. - * - * Only the constants here. Instruction encoders live in arch/x64.c - * because they're variable length and depend on the MCEmitter byte - * stream (REX prefix, ModR/M, SIB, displacement). The disassembler - * doesn't share these yet; if/when it does, a parallel x64_isa.c will - * host decode tables. 
*/ - -#ifndef CFREE_X64_ISA_H -#define CFREE_X64_ISA_H - -#include "core/bytes.h" -#include "core/core.h" - -/* ---- GPR numbering (DWARF / ABI matches HW encoding 0..15) ---- */ -enum { - X64_RAX = 0, - X64_RCX = 1, - X64_RDX = 2, - X64_RBX = 3, - X64_RSP = 4, - X64_RBP = 5, - X64_RSI = 6, - X64_RDI = 7, - X64_R8 = 8, - X64_R9 = 9, - X64_R10 = 10, - X64_R11 = 11, - X64_R12 = 12, - X64_R13 = 13, - X64_R14 = 14, - X64_R15 = 15, -}; - -/* SSE register numbering — xmm0..xmm15 share encoding with r0..r15. */ -enum { - X64_XMM0 = 0, - X64_XMM1 = 1, - X64_XMM2 = 2, - X64_XMM3 = 3, - X64_XMM4 = 4, - X64_XMM5 = 5, - X64_XMM6 = 6, - X64_XMM7 = 7, - X64_XMM8 = 8, - X64_XMM15 = 15, -}; - -/* Condition codes for Jcc / SETcc / CMOVcc. Encoded in the low nibble. */ -enum { - X64_CC_O = 0x0, - X64_CC_NO = 0x1, - X64_CC_B = 0x2, /* below / CF=1 → CMP_LT_U */ - X64_CC_AE = 0x3, /* above-or-equal / CF=0 → CMP_GE_U */ - X64_CC_E = 0x4, /* equal / ZF=1 → CMP_EQ */ - X64_CC_NE = 0x5, /* → CMP_NE */ - X64_CC_BE = 0x6, /* below-or-equal / CF=1 or ZF=1 → CMP_LE_U */ - X64_CC_A = 0x7, /* above / CF=0 and ZF=0 → CMP_GT_U */ - X64_CC_S = 0x8, - X64_CC_NS = 0x9, - X64_CC_P = 0xA, - X64_CC_NP = 0xB, - X64_CC_L = 0xC, /* less (signed) / SF!=OF → CMP_LT_S */ - X64_CC_GE = 0xD, /* → CMP_GE_S */ - X64_CC_LE = 0xE, /* less-or-equal (signed) → CMP_LE_S */ - X64_CC_G = 0xF, /* greater → CMP_GT_S */ -}; - -/* REX prefix is 0x40 | W<<3 | R<<2 | X<<1 | B. */ -#define X64_REX_BASE 0x40u -#define X64_REX_W 0x08u -#define X64_REX_R 0x04u -#define X64_REX_X 0x02u -#define X64_REX_B 0x01u - -/* ---- Branch / NOP encoding constants ---- - * - * Used by the linker to emit PLT entries and IPLT stubs without - * sprinkling raw hex into link_arch_x64.c. The shape is always the - * same RIP-relative indirect JMP plus padding NOPs. */ - -/* JMP r/m64 — opcode FF /4. ModR/M for the RIP+disp32 form is - * mod=00, reg=/4 (JMP m64), r/m=101 → 0x25. 
*/ -#define X64_OP_JMP_RM64 0xFFu -#define X64_MODRM_JMP_RIPREL 0x25u - -/* Single-byte NOP. */ -#define X64_NOP1 0x90u - -/* Intel multi-byte ("long") NOP forms. The 6-byte form is the - * canonical IPLT-stub tail pad (NOPW 0(%rax,%rax,1)). */ -#define X64_NOP6_BYTE0 0x66u -#define X64_NOP6_BYTE1 0x0Fu -#define X64_NOP6_BYTE2 0x1Fu -#define X64_NOP6_BYTE3 0x44u -#define X64_NOP6_BYTE4 0x00u -#define X64_NOP6_BYTE5 0x00u - -/* Sizes of the encoded forms above. */ -#define X64_JMP_RIPREL_SIZE 6u -#define X64_NOP6_SIZE 6u - -/* Write a 6-byte `jmp [rip + disp32]` (FF 25 disp32) at dst. */ -static inline void x64_write_jmp_riprel(u8* dst, i32 disp32) { - dst[0] = X64_OP_JMP_RM64; - dst[1] = X64_MODRM_JMP_RIPREL; - wr_u32_le(dst + 2, (u32)disp32); -} - -/* Fill nbytes at dst with single-byte NOPs (0x90). Matches the - * existing memset-then-patch pattern used to pad PLT entries to 16. */ -static inline void x64_write_nop_pad(u8* dst, u32 nbytes) { - u32 i; - for (i = 0; i < nbytes; ++i) dst[i] = X64_NOP1; -} - -/* Write the canonical 6-byte multi-byte NOP (66 0F 1F 44 00 00) at - * dst. Used to pad the IPLT stub from 6 → 12 bytes. */ -static inline void x64_write_nop6(u8* dst) { - dst[0] = X64_NOP6_BYTE0; - dst[1] = X64_NOP6_BYTE1; - dst[2] = X64_NOP6_BYTE2; - dst[3] = X64_NOP6_BYTE3; - dst[4] = X64_NOP6_BYTE4; - dst[5] = X64_NOP6_BYTE5; -} - -#endif diff --git a/src/dbg/arch_aa64.c b/src/dbg/arch_aa64.c @@ -1,235 +0,0 @@ -/* AArch64 lifter for the displaced-step shim. - * - * Lays out a fixed-up copy of one insn in the session scratch slot - * (DBG_DISPLACED_SLOT_BYTES bytes), followed by a BRK sentinel the - * session arms an internal bp on. 
- * - * Supported families: - * - any insn with no PC-relative operand (copied verbatim); - * - B / BL / B.cond — re-encode the immediate; - * - CBZ / CBNZ / TBZ / TBNZ — always emit a trampoline: - * slot[0] cond-branch +2 words (taken → slot+8) - * slot[4] BRK (not-taken fallthrough) - * slot[8] LDR x16, =target - * slot[12] BR x16 - * slot[16] literal pool (8 bytes, absolute target) - * - ADR / ADRP — replace with LDR Xd, =target: - * slot[0] LDR Xd, =target - * slot[4] BRK - * slot[8] literal pool (8 bytes) - * - LDR (literal), integer/LDRSW — synthesize indirect load: - * slot[0] LDR x16, =literal_addr - * slot[4] LDR Xt/Wt/LDRSW Xt, [x16] - * slot[8] BRK - * slot[12] literal pool (8 bytes, absolute literal addr) - * - BR / BLR / RET — copied verbatim; the BRK after never - * fires because the indirect branch transfers control. The session's - * stale internal_bp is cleared by the next prepare; finalize gates on - * PC == return_pc so it stays a no-op when control left the slot. */ - -#include "dbg/dbg.h" - -#include <string.h> - -#include "arch/aa64_isa.h" - -#define SHIM_X16 16u /* IP0; safe to clobber inside a shim */ - -uint32_t dbg_aa64_brk_word(void) { - return aa64_brk(0); -} - -static int fits_signed(int64_t v, int bits) { - int64_t lim = (int64_t)1 << (bits - 1); - return v >= -lim && v < lim; -} - -/* LDR (literal) for integer Xt: opc=01, V=0, fixed bits 011_0_00. - * 01 011 0 00 imm19 Rt → 0x58000000 | (imm19<<5) | Rt - * imm19 is the signed word offset from the LDR's own PC. */ -static uint32_t enc_ldr_lit_x(uint32_t Rt, int32_t imm19) { - return 0x58000000u | (((uint32_t)imm19 & 0x7ffffu) << 5) | (Rt & 0x1fu); -} -/* LDR Xt, [Xn, #0] / LDR Wt, [Xn, #0] / LDRSW Xt, [Xn, #0]. 
*/ -static uint32_t enc_ldr64_reg(uint32_t Rt, uint32_t Rn) { - return aa64_ldr64_uimm12(Rt, Rn, 0); -} -static uint32_t enc_ldr32_reg(uint32_t Rt, uint32_t Rn) { - return aa64_ldst_uimm_pack((AA64LdStUimm){ - .size = 2, .V = 0, .opc = AA64_LDST_OPC_LDR, .imm12 = 0, .Rn = Rn, - .Rt = Rt}); -} -static uint32_t enc_ldrsw_reg(uint32_t Rt, uint32_t Rn) { - return aa64_ldst_uimm_pack((AA64LdStUimm){ - .size = 2, .V = 0, .opc = 2, .imm12 = 0, .Rn = Rn, .Rt = Rt}); -} - -static void put_u32(uint8_t* w, uint32_t off, uint32_t v) { - memcpy(w + off, &v, sizeof(v)); -} -static void put_u64(uint8_t* w, uint32_t off, uint64_t v) { - memcpy(w + off, &v, sizeof(v)); -} - -/* Sign-extend a `bits`-wide field whose raw value is `v`. */ -static int64_t sign_extend(uint64_t v, int bits) { - uint64_t m = 1ull << (bits - 1); - return (int64_t)((v ^ m) - m); -} - -int dbg_aa64_build_shim(uint32_t orig_insn, uint64_t orig_pc, - void* scratch_write, uint64_t scratch_runtime, - u32* shim_len) { - uint8_t* w = (uint8_t*)scratch_write; - uint32_t brk = aa64_brk(0); - int64_t pc_delta; - if (!shim_len) return 1; - *shim_len = 0; - pc_delta = (int64_t)orig_pc - (int64_t)scratch_runtime; - - /* ---- B / BL (imm26) ------------------------------------------------ */ - if ((orig_insn & 0x7C000000u) == 0x14000000u) { - AA64BrImm f = aa64_brimm_unpack(orig_insn); - int64_t imm = sign_extend(f.imm26, 26); - int64_t new_off = imm * 4 + pc_delta; - if ((new_off & 3) || !fits_signed(new_off / 4, 26)) { - /* Out of B/BL range from scratch: fall back to LDR x30/PC trick is - * messy for BL (need to preserve LR). Decline. 
*/ - return 1; - } - f.imm26 = (uint32_t)((new_off / 4) & 0x3ffffffu); - put_u32(w, 0, aa64_brimm_pack(f)); - put_u32(w, 4, brk); - *shim_len = 4; - return 0; - } - - /* ---- B.cond (imm19) ------------------------------------------------ */ - if ((orig_insn & 0xFF000010u) == 0x54000000u) { - AA64BrCond f = aa64_brcond_unpack(orig_insn); - int64_t imm = sign_extend(f.imm19, 19); - int64_t new_off = imm * 4 + pc_delta; - if ((new_off & 3) || !fits_signed(new_off / 4, 19)) { - /* Synthesize: B.cond +8 (skip BRK) ; BRK ; LDR x16,=tgt ; BR x16 ; - * literal. The "taken" path branches to slot+8, the "not-taken" - * path falls through to BRK at slot+4. */ - uint64_t target = orig_pc + (uint64_t)(imm * 4); - AA64BrCond nf; - nf.cond = f.cond; - nf.imm19 = 2u; /* +8 bytes from slot[0] → slot[8] */ - put_u32(w, 0, aa64_brcond_pack(nf)); - put_u32(w, 4, brk); - put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2)); /* LDR x16, [pc+8] = slot[16] */ - put_u32(w, 12, aa64_br(SHIM_X16)); - put_u64(w, 16, target); - *shim_len = 4; - return 0; - } - f.imm19 = (uint32_t)((new_off / 4) & 0x7ffffu); - put_u32(w, 0, aa64_brcond_pack(f)); - put_u32(w, 4, brk); - *shim_len = 4; - return 0; - } - - /* ---- CBZ / CBNZ (imm19) — always trampoline form ------------------- */ - if ((orig_insn & 0x7E000000u) == 0x34000000u) { - AA64CB f = aa64_cb_unpack(orig_insn); - int64_t imm = sign_extend(f.imm19, 19); - uint64_t target = orig_pc + (uint64_t)(imm * 4); - AA64CB nf = f; - nf.imm19 = 2u; /* +8 → slot[8] */ - put_u32(w, 0, aa64_cb_pack(nf)); - put_u32(w, 4, brk); - put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2)); - put_u32(w, 12, aa64_br(SHIM_X16)); - put_u64(w, 16, target); - *shim_len = 4; - return 0; - } - - /* ---- TBZ / TBNZ (imm14) — always trampoline ------------------------ - * b5 011011 op b40[18:14] imm14[18:5] -- wait, field layout: - * b5(31) 011011(30..25) op(24) b40(23..19) imm14(18..5) Rt(4..0). 
*/ - if ((orig_insn & 0x7E000000u) == 0x36000000u) { - uint32_t b5 = (orig_insn >> 31) & 1u; - uint32_t op = (orig_insn >> 24) & 1u; - uint32_t b40 = (orig_insn >> 19) & 0x1fu; - uint32_t Rt = orig_insn & 0x1fu; - uint32_t imm14_raw = (orig_insn >> 5) & 0x3fffu; - int64_t imm = sign_extend(imm14_raw, 14); - uint64_t target = orig_pc + (uint64_t)(imm * 4); - uint32_t new_imm14 = 2u; /* +8 → slot[8] */ - uint32_t new_word = - (b5 << 31) | 0x36000000u | (op << 24) | (b40 << 19) | - ((new_imm14 & 0x3fffu) << 5) | (Rt & 0x1fu); - put_u32(w, 0, new_word); - put_u32(w, 4, brk); - put_u32(w, 8, enc_ldr_lit_x(SHIM_X16, 2)); - put_u32(w, 12, aa64_br(SHIM_X16)); - put_u64(w, 16, target); - *shim_len = 4; - return 0; - } - - /* ---- ADR / ADRP ---------------------------------------------------- */ - if ((orig_insn & 0x1F000000u) == 0x10000000u) { - AA64PCRelAdr f = aa64_pcrel_adr_unpack(orig_insn); - uint64_t imm_raw = ((uint64_t)f.immhi << 2) | (uint64_t)f.immlo; - int64_t imm21 = sign_extend(imm_raw, 21); - uint64_t target; - if (f.op == AA64_ADR_OP_ADRP) { - target = (orig_pc & ~(uint64_t)0xFFF) + ((uint64_t)imm21 << 12); - } else { - target = orig_pc + (uint64_t)imm21; - } - /* LDR Xd, [pc + 8] — the literal sits at slot[8]. 
*/ - put_u32(w, 0, enc_ldr_lit_x(f.Rd, 2)); - put_u32(w, 4, brk); - put_u64(w, 8, target); - *shim_len = 4; - return 0; - } - - /* ---- LDR (literal) — integer & LDRSW only -------------------------- */ - if ((orig_insn & 0x3B000000u) == 0x18000000u) { - uint32_t opc = (orig_insn >> 30) & 3u; - uint32_t V = (orig_insn >> 26) & 1u; - uint32_t Rt = orig_insn & 0x1fu; - uint32_t imm19_raw = (orig_insn >> 5) & 0x7ffffu; - int64_t imm19 = sign_extend(imm19_raw, 19); - uint64_t literal_addr = orig_pc + (uint64_t)(imm19 * 4); - uint32_t load_insn; - if (V) return 1; /* vector forms (S/D/Q): not supported in v1 */ - switch (opc) { - case 0: load_insn = enc_ldr32_reg(Rt, SHIM_X16); break; /* LDR Wt */ - case 1: load_insn = enc_ldr64_reg(Rt, SHIM_X16); break; /* LDR Xt */ - case 2: load_insn = enc_ldrsw_reg(Rt, SHIM_X16); break; /* LDRSW */ - default: return 1; /* PRFM (literal): not meaningful here */ - } - /* LDR x16, [pc + 12] — literal at slot[12]. */ - put_u32(w, 0, enc_ldr_lit_x(SHIM_X16, 3)); - put_u32(w, 4, load_insn); - put_u32(w, 8, brk); - put_u64(w, 12, literal_addr); - *shim_len = 8; - return 0; - } - - /* ---- BR / BLR / RET (indirect) ------------------------------------- */ - if ((orig_insn & 0xFE1FFC1Fu) == AA64_BR_REG_FAMILY_MATCH) { - /* Copy verbatim; the BRK after will not fire because control - * transfers to the register target. The session clears the stale - * internal bp on the next prepare. */ - put_u32(w, 0, orig_insn); - put_u32(w, 4, brk); - *shim_len = 4; - return 0; - } - - /* ---- default: no PC-relative operand — copy verbatim --------------- */ - put_u32(w, 0, orig_insn); - put_u32(w, 4, brk); - *shim_len = 4; - return 0; -} diff --git a/src/dbg/dbg.h b/src/dbg/dbg.h @@ -3,7 +3,7 @@ /* Internal contracts for src/dbg/. The public CfreeJitSession entries are * defined in session.c on top of these primitives; bp.c, step.c, mem.c, - * displaced.c, and arch_aa64.c each own one slice. */ + * displaced.c, and arch/aa64/dbg.c each own one slice. 
*/ #include <cfree.h> @@ -76,7 +76,7 @@ int dbg_mem_write(struct CfreeJitSession*, uint64_t addr, const void* src, size_t n); /* ---- displaced step ------------------------------------------------- */ -/* The session owns a single executable scratch region. arch_aa64.c writes +/* The session owns a single executable scratch region. arch/aa64/dbg.c writes * a fixed-up copy of the original insn plus a return-shim into it; the * worker is then resumed with PC pointing at the scratch entry. The shim * ends with a BRK that the fault classifier recognizes (via the bp table) diff --git a/src/dbg/displaced.c b/src/dbg/displaced.c @@ -86,7 +86,7 @@ int dbg_displaced_prepare(CfreeJitSession* s, uint64_t insn_pc, return 1; } /* Flush the entire slot — trampoline forms write up to 24 bytes plus a - * literal pool; arch_aa64.c returns the BRK *offset*, not the length. */ + * literal pool; arch/aa64/dbg.c returns the BRK *offset*, not the length. */ if (s->c->env->execmem->flush_icache) { s->c->env->execmem->flush_icache(s->c->env->execmem->user, s->displaced.region.runtime, diff --git a/src/link/link_arch.h b/src/link/link_arch.h @@ -7,9 +7,8 @@ * Compiler.target.arch. Lets link_dyn.c / link_layout.c / link_elf.c * stay arch-agnostic instead of branching on target.arch and hand- * encoding instruction bytes inline. Each backend's descriptor lives - * in its own translation unit (link_arch_aa64.c / _x64.c / _rv64.c) - * and leans on the existing arch/<arch>_isa.h encoders for everything - * but small format-specific constants. + * under src/arch/<arch>/ and leans on that arch's ISA encoders for + * everything but small format-specific constants. * * The struct intentionally collects only fields the LINKER needs. * Code-generation arch dispatch belongs in CGTarget (arch/arch.h); @@ -110,11 +109,6 @@ typedef struct LinkArchDesc { int (*needs_jit_call_stub)(RelocKind); } LinkArchDesc; -/* Per-arch descriptors, defined in link_arch_<arch>.c. 
*/ -extern const LinkArchDesc link_arch_aa64; -extern const LinkArchDesc link_arch_x64; -extern const LinkArchDesc link_arch_rv64; - /* Returns NULL for an unsupported arch. Callers panic with their own * context-rich message rather than this helper picking one. */ const LinkArchDesc* link_arch_desc_for(const Compiler*); diff --git a/src/link/link_arch_aa64.c b/src/link/link_arch_aa64.c @@ -1,208 +0,0 @@ -/* AArch64 link-time descriptor. - * - * Implements the LinkArchDesc contract from link_arch.h for the - * aarch64 ELF psABI: PLT0 + per-import PLT entries (lazy-resolve - * trampolines emitted in canonical form even under DF_1_NOW), and the - * 12-byte IPLT stub used by ifunc resolvers. All instruction bytes - * come from the encoders in arch/aa64_isa.h — no raw hex literals - * here. - * - * The byte layout matches the previous inline encodings in - * link_dyn.c (PLT) and link_layout.c (IPLT) so that switching the - * linker to descriptor dispatch is a no-op on the output image. */ - -#include "arch/aa64_isa.h" -#include "core/bytes.h" -#include "core/core.h" -#include "link/link_arch.h" -#include "obj/elf.h" -#include "obj/macho.h" -#include "obj/obj.h" - -/* Fixed register assignments mandated by the AArch64 PLT ABI. */ -#define AA64_PLT_SCRATCH_X16 16u /* PLT/IPLT scratch (slot address) */ -#define AA64_PLT_SCRATCH_X17 17u /* PLT scratch (loaded function ptr) */ - -/* PLT geometry. Documented in link_arch.h; redeclared here as the - * descriptor table needs them at file scope. */ -#define AA64_PLT0_SIZE 32u -#define AA64_PLT_ENTRY_SIZE 16u -#define AA64_IPLT_STUB_SIZE 12u - -/* PLT0 references .got.plt[2] (the lazy-resolve hook); the per-import - * entries start at .got.plt[3]. */ -#define AA64_GOTPLT_RESOLVER_INDEX 2u - -/* Page mask for ADRP: ADRP encodes (page(target) - page(PC)) >> 12, - * where page(x) clears the low 12 bits. 
*/ -#define AA64_PAGE_MASK ((u64)0xfffu) - -/* Compute the (immlo, immhi) ADRP immediate halves for the page- - * relative displacement from `pc` to `target`. Both addresses are - * post-shift final image vaddrs; ADRP discards the low 12 bits of - * each before subtracting, so the result is invariant under any - * segment-base shift that moves both endpoints by the same delta. */ -static inline void aa64_adrp_imm_halves(u64 pc, u64 target, u32* immlo, - u32* immhi) { - i64 page_disp = (i64)(target & ~AA64_PAGE_MASK) - (i64)(pc & ~AA64_PAGE_MASK); - i64 imm21 = page_disp >> 12; - *immlo = (u32)(imm21 & 0x3); - *immhi = (u32)((imm21 >> 2) & 0x7ffff); -} - -/* Emit one ADRP+LDR+ADD+BR sequence that materializes `slot_vaddr` - * (a .got.plt entry) into x16, loads the resolved function pointer - * into x17, and tail-calls it. Used by both PLT0 (after its STP) and - * each per-import entry — the only thing that varies is `pc`, which - * starts at the ADRP itself. */ -static void aa64_emit_adrp_load_br(u8* dst, u64 pc, u64 slot_vaddr) { - u32 immlo, immhi; - aa64_adrp_imm_halves(pc, slot_vaddr, &immlo, &immhi); - u32 lo12 = (u32)(slot_vaddr & AA64_PAGE_MASK); - /* LDR Xt encodes the byte offset divided by 8. .got.plt slots are - * 8-byte aligned so the low 3 bits of lo12 are always 0. */ - u32 ldr_imm12 = (lo12 >> 3) & 0xfffu; - - wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi)); - wr_u32_le(dst + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X17, - AA64_PLT_SCRATCH_X16, ldr_imm12)); - wr_u32_le(dst + 8, aa64_add_imm(/*sf=*/1, AA64_PLT_SCRATCH_X16, - AA64_PLT_SCRATCH_X16, lo12, /*sh=*/0)); - wr_u32_le(dst + 12, aa64_br(AA64_PLT_SCRATCH_X17)); -} - -static void aa64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) { - /* PLT0: - * stp x16, x30, [sp, #-16]! 
- * adrp x16, page(.got.plt[2]) - * ldr x17, [x16, #lo12(.got.plt[2])] - * add x16, x16, #lo12(.got.plt[2]) - * br x17 - * nop ; nop ; nop - * - * Under DF_1_NOW the loader patches every .got.plt slot from - * .rela.plt before running PLT0, so this trampoline never executes. - * It is still emitted in canonical form so disassemblers and - * unwinders see the layout the psABI specifies. */ - u64 slot2 = gotplt_vaddr + 8u * AA64_GOTPLT_RESOLVER_INDEX; - /* The ADRP sits at plt0+4 (one instruction past the leading STP). */ - u64 adrp_pc = plt0_vaddr + 4u; - - /* `stp x16, x30, [sp, #-16]!` — pre-indexed pair store with imm7 - * scaled by 8, so the encoded field is -16/8 = -2. */ - wr_u32_le(dst + 0, aa64_stp64_pre(AA64_PLT_SCRATCH_X16, AA64_LR, AA64_SP, - /*imm7_scaled=*/-2)); - aa64_emit_adrp_load_br(dst + 4, adrp_pc, slot2); - wr_u32_le(dst + 20, aa64_nop()); - wr_u32_le(dst + 24, aa64_nop()); - wr_u32_le(dst + 28, aa64_nop()); -} - -static void aa64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) { - /* Per-import 16-byte entry: ADRP+LDR+ADD+BR where ADRP's PC is the - * entry's first instruction (no leading STP here — the resolved - * function returns to the original caller, not into PLT0). */ - aa64_emit_adrp_load_br(dst, entry_vaddr, slot_vaddr); -} - -static u32 aa64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, - LinkArchIPltReloc out[2]) { - /* IPLT stub: ADRP x16, %page(slot) ; LDR x16, [x16, :lo12:slot] ; - * BR x16. - * - * We deliberately emit the two address-bearing instructions with - * zero immediates: the linker enqueues an ADR_PREL_PG_HI21 reloc on - * the ADRP and an LDST64_ABS_LO12_NC reloc on the LDR, both - * targeting the slot's synthetic local symbol. Reloc-apply runs - * after final vaddr assignment, which is the only point at which - * both endpoints' page-relative displacement is known. 
*/ - (void)stub_vaddr; - (void)slot_vaddr; - - wr_u32_le(dst + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, /*immlo=*/0, - /*immhi=*/0)); - wr_u32_le(dst + 4, - aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, AA64_PLT_SCRATCH_X16, - /*imm12_scaled=*/0)); - wr_u32_le(dst + 8, aa64_br(AA64_PLT_SCRATCH_X16)); - - out[0].offset_in_stub = 0; - out[0].width = 4; - out[0].kind = R_AARCH64_ADR_PREL_PG_HI21; - out[1].offset_in_stub = 4; - out[1].width = 4; - out[1].kind = R_AARCH64_LDST64_ABS_LO12_NC; - return 2; -} - -static void aa64_emit_macho_stub(u8* out, u64 stub_vaddr, u64 got_slot_vaddr) { - i64 page_s = ((i64)got_slot_vaddr) & ~(i64)0xfff; - i64 page_p = ((i64)stub_vaddr) & ~(i64)0xfff; - i64 imm21 = (page_s - page_p) >> 12; - u32 immlo = (u32)(imm21 & 0x3u); - u32 immhi = (u32)((imm21 >> 2) & 0x7ffffu); - u32 lo12 = (u32)(got_slot_vaddr & 0xfffu); - u32 imm12_ldr = (lo12 >> 3) & 0xfffu; - - wr_u32_le(out + 0, aa64_adrp(AA64_PLT_SCRATCH_X16, immlo, immhi)); - wr_u32_le(out + 4, aa64_ldr64_uimm12(AA64_PLT_SCRATCH_X16, - AA64_PLT_SCRATCH_X16, imm12_ldr)); - wr_u32_le(out + 8, aa64_br(AA64_PLT_SCRATCH_X16)); -} - -static int aa64_is_branch_reloc(RelocKind kind) { - return kind == R_AARCH64_CALL26 || kind == R_AARCH64_JUMP26; -} - -static int aa64_is_got_load_reloc(RelocKind kind) { - return kind == R_AARCH64_ADR_GOT_PAGE || kind == R_AARCH64_LD64_GOT_LO12_NC; -} - -static int aa64_is_tlvp_reloc(RelocKind kind) { - return kind == R_AARCH64_TLVP_LOAD_PAGE21 || - kind == R_AARCH64_TLVP_LOAD_PAGEOFF12; -} - -static int aa64_is_direct_page_reloc(RelocKind kind) { - switch (kind) { - case R_AARCH64_ADR_PREL_PG_HI21: - case R_AARCH64_ADR_PREL_PG_HI21_NC: - case R_AARCH64_ADD_ABS_LO12_NC: - case R_AARCH64_LDST8_ABS_LO12_NC: - case R_AARCH64_LDST16_ABS_LO12_NC: - case R_AARCH64_LDST32_ABS_LO12_NC: - case R_AARCH64_LDST64_ABS_LO12_NC: - case R_AARCH64_LDST128_ABS_LO12_NC: - return 1; - default: - return 0; - } -} - -const LinkArchDesc link_arch_aa64 = { - .e_machine = EM_AARCH64, - 
.default_musl_interp = "/lib/ld-musl-aarch64.so.1", - - .elf_r_relative = ELF_R_AARCH64_RELATIVE, - .elf_r_glob_dat = ELF_R_AARCH64_GLOB_DAT, - .elf_r_jump_slot = ELF_R_AARCH64_JUMP_SLOT, - - .macho_cputype = CPU_TYPE_ARM64, - .macho_cpusubtype = CPU_SUBTYPE_ARM64_ALL, - - .plt0_size = AA64_PLT0_SIZE, - .plt_entry_size = AA64_PLT_ENTRY_SIZE, - .iplt_stub_size = AA64_IPLT_STUB_SIZE, - - .emit_plt0 = aa64_emit_plt0, - .emit_plt_entry = aa64_emit_plt_entry, - .emit_iplt_stub = aa64_emit_iplt_stub, - .macho_stub_size = AA64_IPLT_STUB_SIZE, - .emit_macho_stub = aa64_emit_macho_stub, - - .is_branch_reloc = aa64_is_branch_reloc, - .is_got_load_reloc = aa64_is_got_load_reloc, - .is_tlvp_reloc = aa64_is_tlvp_reloc, - .is_direct_page_reloc = aa64_is_direct_page_reloc, - .needs_jit_call_stub = aa64_is_branch_reloc, -}; diff --git a/src/link/link_arch_rv64.c b/src/link/link_arch_rv64.c @@ -1,95 +0,0 @@ -/* RV64 link-time arch descriptor. See link_arch.h for the contract. - * - * The PLT0/PLT-entry/IPLT-stub byte layouts here mirror what used to - * live inline in link_dyn.c (PLT) and link_layout.c (IPLT) before the - * vtable refactor; comments preserve the WHY (notably the +0x800 bias - * on AUIPC immediates). */ - -#include "arch/rv64_isa.h" -#include "core/bytes.h" -#include "core/core.h" -#include "link/link_arch.h" -#include "obj/elf.h" - -/* PLT0 is 8 canonical NOPs (32 bytes); each PLT entry and IPLT stub is - * 4 instructions (16 bytes) / 3 instructions (12 bytes) respectively. - * Encoded once here so the descriptor and emitters stay in sync. */ -#define RV64_PLT0_SIZE 32u -#define RV64_PLT_ENTRY_SIZE 16u -#define RV64_IPLT_STUB_SIZE 12u - -/* Split a PC-relative displacement into the (hi20, lo12) pair consumed - * by the AUIPC + I-type sequence. The +0x800 bias is the standard - * RISC-V two-instruction PCREL trick: AUIPC adds an upper-20 immediate - * shifted left 12, then the second instruction adds a sign-extended - * 12-bit lo12. 
If we naively split disp into (disp>>12, disp&0xfff) - * the lo12 sign-extends as a *negative* number whenever bit 11 is set, - * which underflows the AUIPC result by 0x1000. Adding 0x800 before - * the shift rounds the high half up in exactly the cases that need it - * so AUIPC + sign-extended-lo12 reconstructs disp correctly. */ -static inline void rv64_split_pcrel(i64 disp, u32* hi20_out, u32* lo12_out) { - *hi20_out = (u32)(((u64)(disp + 0x800)) >> 12) & 0xfffffu; - *lo12_out = (u32)((u64)disp & 0xfffu); -} - -/* PLT0 under DF_1_NOW is never executed — the loader resolves every - * JUMP_SLOT before transferring control — but we still emit it in - * canonical form (8 NOPs) so disassemblers and unwinders see a well- - * formed prologue at the top of .plt. */ -static void rv64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) { - u32 i; - (void)plt0_vaddr; - (void)gotplt_vaddr; - for (i = 0; i < RV64_PLT0_SIZE; i += 4u) wr_u32_le(dst + i, rv_nop()); -} - -/* Per-import PLT entry: load the GOT slot pre-filled by the loader - * (R_RISCV_JUMP_SLOT) and tail-call through it. t1 is the standard - * psABI scratch for the trampoline return-address (clobbered by the - * lazy resolver in the non-BIND_NOW path); t3 holds the slot pointer. */ -static void rv64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) { - i64 disp = (i64)slot_vaddr - (i64)entry_vaddr; - u32 hi20; - u32 lo12; - rv64_split_pcrel(disp, &hi20, &lo12); - wr_u32_le(dst + 0, rv_auipc(RV_T3, hi20)); - wr_u32_le(dst + 4, rv_ld(RV_T3, RV_T3, (i32)lo12)); - wr_u32_le(dst + 8, rv_jalr(RV_T1, RV_T3, 0)); - wr_u32_le(dst + 12, rv_nop()); -} - -/* IPLT stub: load .igot.plt[i] (filled at startup by the resolver) and - * tail-call to it. The stub->slot displacement is invariant under the - * segment-base shift (both addresses live in the same image), so we - * bake it directly into the instructions and report zero apply-time - * relocs — unlike aarch64, which cannot encode a 32-bit pcrel inline. 
*/ -static u32 rv64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, - LinkArchIPltReloc out[2]) { - i64 disp = (i64)slot_vaddr - (i64)stub_vaddr; - u32 hi20; - u32 lo12; - (void)out; - rv64_split_pcrel(disp, &hi20, &lo12); - wr_u32_le(dst + 0, rv_auipc(RV_T1, hi20)); - wr_u32_le(dst + 4, rv_ld(RV_T1, RV_T1, (i32)lo12)); - wr_u32_le(dst + 8, rv_jr(RV_T1)); - return 0u; -} - -const LinkArchDesc link_arch_rv64 = { - .e_machine = EM_RISCV, - .default_musl_interp = "/lib/ld-musl-riscv64.so.1", - /* RISC-V psABI has no dedicated GLOB_DAT — GOT-slot data imports - * use the generic absolute-64 reloc instead. */ - .elf_r_relative = ELF_R_RISCV_RELATIVE, - .elf_r_glob_dat = ELF_R_RISCV_64, - .elf_r_jump_slot = ELF_R_RISCV_JUMP_SLOT, - .plt0_size = RV64_PLT0_SIZE, - .plt_entry_size = RV64_PLT_ENTRY_SIZE, - .iplt_stub_size = RV64_IPLT_STUB_SIZE, - .global_pointer_symbol = "__global_pointer$", - .global_pointer_rw_offset = 0x800u, - .emit_plt0 = rv64_emit_plt0, - .emit_plt_entry = rv64_emit_plt_entry, - .emit_iplt_stub = rv64_emit_iplt_stub, -}; diff --git a/src/link/link_arch_x64.c b/src/link/link_arch_x64.c @@ -1,77 +0,0 @@ -/* x86_64 link-time arch descriptor. - * - * Implements the LinkArchDesc contract from link/link_arch.h for - * EM_X86_64. The PLT/IPLT byte sequences here mirror the inline - * encodings previously living in link_dyn.c (PLT0 + per-import entry) - * and link_layout.c (IPLT stub) — kept identical byte-for-byte so the - * descriptor switchover is a pure refactor. All raw byte values come - * from named constants / inline writers in arch/x64_isa.h. */ - -#include "link/link_arch.h" - -#include "arch/x64_isa.h" -#include "core/bytes.h" -#include "core/core.h" -#include "obj/elf.h" - -/* PLT0 layout under DF_1_NOW: never executed (loader pre-binds every - * slot via .rela.plt before user code runs), so we just emit 32 bytes - * of single-byte NOPs. Self-documenting and trivially well-formed for - * disassemblers and unwinders that walk the section. 
*/ -static void x64_emit_plt0(u8* dst, u64 plt0_vaddr, u64 gotplt_vaddr) { - (void)plt0_vaddr; - (void)gotplt_vaddr; - x64_write_nop_pad(dst, 32u); -} - -/* Per-import PLT entry (16 B): - * - * ff 25 disp32 ; jmpq *[rip + disp_to_slot] (6 B) - * 90 90 90 90 90 90 90 90 90 90 ; pad to 16 with single-byte NOPs - * - * disp32 is measured from the END of the JMP (entry_vaddr + 6) to the - * .got.plt slot. The 10-byte tail matches link_dyn.c's prior - * memset(0x90)+patch behavior exactly. */ -static void x64_emit_plt_entry(u8* dst, u64 entry_vaddr, u64 slot_vaddr) { - i64 disp = (i64)slot_vaddr - (i64)(entry_vaddr + X64_JMP_RIPREL_SIZE); - i32 disp32 = (i32)(u32)((u64)disp & 0xffffffffu); - x64_write_jmp_riprel(dst, disp32); - x64_write_nop_pad(dst + X64_JMP_RIPREL_SIZE, - 16u - X64_JMP_RIPREL_SIZE); -} - -/* IPLT (ifunc) trampoline stub (12 B): - * - * ff 25 disp32 ; jmpq *[rip + disp_to_slot] (6 B) - * 66 0f 1f 44 00 00 ; 6-byte multibyte NOP (6 B) - * - * Like the PLT entry, disp32 is from the END of the JMP to the - * .igot.plt slot. The displacement is invariant under image-base - * shift (both ends move together), so it's encoded inline and we - * report zero apply-time relocations. 
*/ -static u32 x64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, - LinkArchIPltReloc out[2]) { - (void)out; - i64 disp = (i64)slot_vaddr - (i64)(stub_vaddr + X64_JMP_RIPREL_SIZE); - i32 disp32 = (i32)(u32)((u64)disp & 0xffffffffu); - x64_write_jmp_riprel(dst, disp32); - x64_write_nop6(dst + X64_JMP_RIPREL_SIZE); - return 0; -} - -const LinkArchDesc link_arch_x64 = { - .e_machine = EM_X86_64, - .default_musl_interp = "/lib/ld-musl-x86_64.so.1", - - .elf_r_relative = ELF_R_X86_64_RELATIVE, - .elf_r_glob_dat = ELF_R_X86_64_GLOB_DAT, - .elf_r_jump_slot = ELF_R_X86_64_JUMP_SLOT, - - .plt0_size = 32u, - .plt_entry_size = 16u, - .iplt_stub_size = 12u, - - .emit_plt0 = x64_emit_plt0, - .emit_plt_entry = x64_emit_plt_entry, - .emit_iplt_stub = x64_emit_iplt_stub, -}; diff --git a/src/link/link_dyn.c b/src/link/link_dyn.c @@ -514,7 +514,7 @@ void layout_dyn(Linker* l, LinkImage* img) { img->pie = 1; /* PT_INTERP path. Default to the canonical musl loader matching the - * target arch (per-arch table in link_arch_<arch>.c) when the caller + * target arch (per-arch table in src/arch/<arch>/link.c) when the caller * didn't set one. Drivers like cfree-cc always override via * link_set_interp_path; this default is correctness for direct * libcfree consumers. glibc users have to set their interp diff --git a/test/arch/aa64_inline_test.c b/test/arch/aa64_inline_test.c @@ -24,7 +24,7 @@ #include <stdlib.h> #include <string.h> -#include "arch/aa64_asm.h" +#include "arch/aa64/asm.h" #include "arch/arch.h" #include "core/buf.h" #include "core/core.h" diff --git a/test/arch/aa64_isa_test.c b/test/arch/aa64_isa_test.c @@ -7,14 +7,14 @@ * invariant: an alias-bearing word (e.g. ORR Rd, ZR, Rm) resolves to * the alias spelling (MOV) rather than the canonical row. * - * Builds against the internal arch/aa64_isa.h surface (test.mk passes + * Builds against the internal arch/aa64/isa.h surface (test.mk passes * -Isrc). 
No public-API dependency — this is a unit test of the * descriptor table itself. */ #include <stdio.h> #include <string.h> -#include "arch/aa64_isa.h" +#include "arch/aa64/isa.h" #include "core/strbuf.h" static int fails = 0;