kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 5a7642085de670403406e9ad6a3f29cbd73ef3e1
parent c3ab7c37ff8954e1c3a358f844e8871cf30d7b7c
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 14 May 2026 11:40:53 -0700

Factor simple backend register allocator

Diffstat:
Msrc/arch/aarch64/alloc.c | 28++++++++--------------------
Msrc/arch/aarch64/emit.c | 34++++++++++++++++++++--------------
Msrc/arch/aarch64/internal.h | 19++++++++++++-------
Msrc/arch/aarch64/ops.c | 324++++++++++++++++++++++++++++++++++++++++---------------------------------------
Msrc/arch/aarch64/opt_coord.c | 3+--
Asrc/arch/regalloc.c | 92+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/arch/regalloc.h | 37+++++++++++++++++++++++++++++++++++++
Msrc/arch/rv64/internal.h | 28+++++-----------------------
Msrc/arch/rv64/opt_coord.c | 3+--
Msrc/arch/x64/alloc.c | 24++++--------------------
Msrc/arch/x64/emit.c | 10+++++-----
Msrc/arch/x64/internal.h | 16+++++-----------
Msrc/arch/x64/opt_coord.c | 11+++--------
13 files changed, 359 insertions(+), 270 deletions(-)

diff --git a/src/arch/aarch64/alloc.c b/src/arch/aarch64/alloc.c @@ -14,28 +14,15 @@ AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; } * ============================================================ */ void regpool_init(RegPool* p, u8 base, u8 nregs) { - p->base = base; - p->nregs = nregs; - p->hwm = 0; - p->free = (nregs >= 32u) ? 0xFFFFFFFFu : ((1u << nregs) - 1u); + cg_simple_regpool_init_range(p, base, nregs); } Reg regpool_alloc(RegPool* p) { - if (p->free == 0) return (Reg)REG_NONE; - u32 idx = (u32)__builtin_ctz(p->free); - p->free &= ~(1u << idx); - if (idx + 1u > p->hwm) p->hwm = idx + 1u; - return (Reg)(p->base + idx); + return cg_simple_regpool_alloc(p); } int regpool_free(RegPool* p, Reg r) { - u32 rn = (u32)r; - if (rn < p->base || rn >= (u32)(p->base + p->nregs)) return 0; - u32 idx = rn - p->base; - u32 bit = 1u << idx; - if (p->free & bit) return -1; - p->free |= bit; - return 1; + return cg_simple_regpool_free(p, r); } /* ============================================================ @@ -176,13 +163,14 @@ void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op) { if (b_op.kind == OPK_IMM && a_op.kind != OPK_IMM) { u32 imm12, sh; if (aa64_addsub_imm_fits(b_op.v.imm, &imm12, &sh)) { - u32 rn = aa64_force_reg_int(t, a_op, sf, 9); + u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); aa64_emit32(mc, aa64_subs_imm12(sf, /*Rd=ZR*/ 31u, rn, imm12, sh)); return; } } - u32 rn = aa64_force_reg_int(t, a_op, sf, 9); - u32 rm = aa64_force_reg_int(t, b_op, sf, (rn == 9) ? 10u : 9u); + u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); + u32 rm = + aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0); aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, rn, rm)); } @@ -225,7 +213,7 @@ static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) { sc->else_label = t->mc->label_new(t->mc); sc->end_label = t->mc->label_new(t->mc); u32 sf = type_is_64(d->cond.type) ? 1u : 0u; - u32 rn = aa64_force_reg_int(t, d->cond, sf, 9); + u32 rn = aa64_force_reg_int(t, d->cond, sf, AA_TMP0); aa64_emit32(t->mc, aa64_subs_imm(sf, /*Rd=ZR*/ 31u, rn, 0)); aa64_emit32(t->mc, aa64_b_cond(0x0u /*EQ*/)); t->mc->emit_label_ref(t->mc, sc->else_label, R_AARCH64_CONDBR19, 4, 0); diff --git a/src/arch/aarch64/emit.c b/src/arch/aarch64/emit.c @@ -382,29 +382,29 @@ void aa_param(CGTarget* t, const CGParamDesc* p) { } else { u32 caller_off = a->next_param_stack; a->next_param_stack += 8; - aa64_emit32(t->mc, aa64_ldur(3, 9, 29, (i32)(16 + caller_off))); - ptr_reg = 9; + aa64_emit32(t->mc, aa64_ldur(3, AA_TMP0, 29, (i32)(16 + caller_off))); + ptr_reg = AA_TMP0; } u32 nbytes = s->size; u32 i = 0; while (i + 8 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(3, 10, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(3, 10, 29, -(i32)s->off + (i32)i)); + aa64_emit32(t->mc, aa64_ldur(3, AA_TMP1, ptr_reg, (i32)i)); + aa64_emit32(t->mc, aa64_stur(3, AA_TMP1, 29, -(i32)s->off + (i32)i)); i += 8; } while (i + 4 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(2, 10, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(2, 10, 29, -(i32)s->off + (i32)i)); + aa64_emit32(t->mc, aa64_ldur(2, AA_TMP1, ptr_reg, (i32)i)); + aa64_emit32(t->mc, aa64_stur(2, AA_TMP1, 29, -(i32)s->off + (i32)i)); i += 4; } while (i + 2 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(1, 10, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(1, 10, 29, -(i32)s->off + (i32)i)); + aa64_emit32(t->mc, aa64_ldur(1, AA_TMP1, ptr_reg, (i32)i)); + aa64_emit32(t->mc, aa64_stur(1, AA_TMP1, 29, -(i32)s->off + (i32)i)); i += 2; } while (i < nbytes) { - aa64_emit32(t->mc, aa64_ldur(0, 10, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(0, 10, 29, -(i32)s->off + (i32)i)); + aa64_emit32(t->mc, aa64_ldur(0, AA_TMP1, ptr_reg, (i32)i)); + aa64_emit32(t->mc, aa64_stur(0, AA_TMP1, 29, -(i32)s->off + (i32)i)); i += 1; } return; @@ -422,8 +422,10 @@ void aa_param(CGTarget* t, const CGParamDesc* p) { } else { u32 caller_off = a->next_param_stack; a->next_param_stack += 8; - aa64_emit32(t->mc, aa64_ldur(sidx, 9, 29, (i32)(16 + caller_off))); - aa64_emit32(t->mc, aa64_stur(sidx, 9, 29, -(i32)s->off + (i32)part_off)); + aa64_emit32(t->mc, aa64_ldur(sidx, AA_TMP0, 29, (i32)(16 + caller_off))); + aa64_emit32(t->mc, + aa64_stur(sidx, AA_TMP0, 29, + -(i32)s->off + (i32)part_off)); } } else if (pt->cls == ABI_CLASS_FP) { if (a->next_param_fp < 8) { @@ -433,8 +435,12 @@ void aa_param(CGTarget* t, const CGParamDesc* p) { } else { u32 caller_off = a->next_param_stack; a->next_param_stack += 8; - aa64_emit32(t->mc, aa64_ldur_fp(sidx, 0, 29, (i32)(16 + caller_off))); - aa64_emit32(t->mc, aa64_stur_fp(sidx, 0, 29, -(i32)s->off + (i32)part_off)); + aa64_emit32(t->mc, + aa64_ldur_fp(sidx, AA_FP_TMP0, 29, + (i32)(16 + caller_off))); + aa64_emit32(t->mc, + aa64_stur_fp(sidx, AA_FP_TMP0, 29, + -(i32)s->off + (i32)part_off)); } } else { compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl", diff --git a/src/arch/aarch64/internal.h b/src/arch/aarch64/internal.h @@ -8,6 +8,7 @@ #include "arch/aa64_isa.h" #include "arch/aa64_regs.h" #include "arch/arch.h" +#include "arch/regalloc.h" #include "core/arena.h" #include "core/pool.h" #include "obj/obj.h" @@ -17,6 +18,16 @@ * ============================================================ */ #define AA64_NOP 0xD503201Fu + +/* Hidden backend temporaries. These must stay outside the allocable pools and + * outside optimizer scratch registers because CGTarget ops may clobber them + * while lowering a single operation. AA_FP_TMP0 names v31, not integer x31. */ +enum { + AA_TMP0 = 9u, + AA_TMP1 = 10u, + AA_TMP2 = 11u, + AA_FP_TMP0 = 31u, +}; #define CG_BUILTIN_ID(k) ((CfreeCgTypeId)((1u << 6) | (u32)(k))) static inline u32 aa64_stp_x(u32 Rt, u32 Rt2, u32 Rn, i32 byte_off) { @@ -191,13 +202,7 @@ static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { * RegPool * ============================================================ */ -typedef struct RegPool { - u32 free; - u32 hwm; - u8 base; - u8 nregs; - u8 pad[2]; -} RegPool; +typedef CGSimpleRegPool RegPool; /* ============================================================ * AAImpl types diff --git a/src/arch/aarch64/ops.c b/src/arch/aarch64/ops.c @@ -53,13 +53,13 @@ static void aa_load_const(CGTarget* t, Operand dst, ConstBytes cb) { t->mc->set_section(t->mc, cur_section); u32 adrp_pos = t->mc->pos(t->mc); - aa64_emit32(t->mc, aa64_adrp_base(9)); + aa64_emit32(t->mc, aa64_adrp_base(AA_TMP0)); t->mc->emit_reloc_at(t->mc, cur_section, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, 0, 0, 0); u32 ldr_pos = t->mc->pos(t->mc); u32 sidx = (cb.size == 8) ? 3u : 2u; - aa64_emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), 9, 0)); + aa64_emit32(t->mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0)); RelocKind lo12 = (cb.size == 8) ? R_AARCH64_LDST64_ABS_LO12_NC : R_AARCH64_LDST32_ABS_LO12_NC; t->mc->emit_reloc_at(t->mc, cur_section, ldr_pos, lo12, sym, 0, 0, 0); @@ -137,30 +137,30 @@ void aa_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma) { ObjSymId sym = addr.v.global.sym; i64 add = addr.v.global.addend; if (use_got_for_sym(t, sym)) { - aa64_emit_got_load_addr(t, /*dst=*/9, sym); + aa64_emit_got_load_addr(t, AA_TMP0, sym); if (dst.cls == RC_FP) { - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 9, (i32)add)); + aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP0, (i32)add)); } else { - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), 9, (i32)add)); + aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP0, (i32)add)); } return; } u32 adrp_pos = mc->pos(mc); - aa64_emit32(mc, aa64_adrp_base(/*Rd=*/9)); + aa64_emit32(mc, aa64_adrp_base(AA_TMP0)); mc->emit_reloc_at(mc, sec, adrp_pos, R_AARCH64_ADR_PREL_PG_HI21, sym, add, 0, 0); u32 ld_pos = mc->pos(mc); if (dst.cls == RC_FP) { - aa64_emit32(mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), 9, 0)); + aa64_emit32(mc, aa64_ldr_fp_uimm(sidx, reg_num(dst), AA_TMP0, 0)); } else { - aa64_emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), 9, 0)); + aa64_emit32(mc, aa64_ldr_uimm(sidx, reg_num(dst), AA_TMP0, 0)); } mc->emit_reloc_at(mc, sec, ld_pos, ldst_lo12_reloc_for(sz), sym, add, 0, 0); return; } i32 off; - u32 base = addr_base(t, addr, &off, 9); + u32 base = addr_base(t, addr, &off, AA_TMP0); if (dst.cls == RC_FP) { aa64_emit32(t->mc, aa64_ldur_fp(sidx, reg_num(dst), base, off)); } else { @@ -182,15 +182,15 @@ void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { u32 src_is_fp = 0; if (src.kind == OPK_IMM) { u32 sf = (sz == 8) ? 1u : 0u; - aa64_emit_load_imm(mc, sf, /*Rd=*/9, src.v.imm); - src_reg = 9; + aa64_emit_load_imm(mc, sf, AA_TMP0, src.v.imm); + src_reg = AA_TMP0; } else if (src.cls == RC_FP) { src_reg = reg_num(src); src_is_fp = 1; } else { src_reg = reg_num(src); } - u32 base = (src.kind == OPK_IMM) ? 10u : 9u; + u32 base = (src.kind == OPK_IMM) ? AA_TMP1 : AA_TMP0; if (use_got_for_sym(t, sym)) { aa64_emit_got_load_addr(t, base, sym); if (src_is_fp) { @@ -215,13 +215,13 @@ void aa_store(CGTarget* t, Operand addr, Operand src, MemAccess ma) { } i32 off; - u32 addr_tmp = (src.kind == OPK_IMM) ? 10u : 9u; + u32 addr_tmp = (src.kind == OPK_IMM) ? AA_TMP1 : AA_TMP0; u32 base = addr_base(t, addr, &off, addr_tmp); if (src.kind == OPK_IMM) { u32 sf = (sz == 8) ? 1u : 0u; - aa64_emit_load_imm(t->mc, sf, 9, src.v.imm); - aa64_emit32(t->mc, aa64_stur(sidx, 9, base, off)); + aa64_emit_load_imm(t->mc, sf, AA_TMP0, src.v.imm); + aa64_emit32(t->mc, aa64_stur(sidx, AA_TMP0, base, off)); return; } if (src.cls == RC_FP) { @@ -314,10 +314,10 @@ static void aa_tls_addr_of(CGTarget* t, Operand dst, ObjSymId sym, i64 addend) { return; } - aa64_emit32(mc, aa64_mrs_tpidr_el0(/*Rt=*/9)); + aa64_emit32(mc, aa64_mrs_tpidr_el0(AA_TMP0)); u32 hi_pos = mc->pos(mc); - aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, /*Rn=*/9, /*imm12=*/0, /*sh=*/1)); + aa64_emit32(mc, aa64_add_imm(/*sf=*/1, rd, AA_TMP0, /*imm12=*/0, /*sh=*/1)); mc->emit_reloc_at(mc, sec, hi_pos, R_AARCH64_TLSLE_ADD_TPREL_HI12, sym, addend, 0, 0); @@ -347,28 +347,29 @@ static u32 agg_addr_reg(CGTarget* t, Operand op, u32 scratch) { static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr, AggregateAccess agg) { MCEmitter* mc = t->mc; - u32 dr = agg_addr_reg(t, dst_addr, 9); - u32 sr = agg_addr_reg(t, src_addr, (dr == 10) ? 11u : 10u); + u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0); + u32 sr = agg_addr_reg(t, src_addr, + (dr == AA_TMP1) ? AA_TMP2 : AA_TMP1); u32 nbytes = agg.size; u32 i = 0; while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_ldur(3, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); i += 8; } while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_ldur(2, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(2, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); i += 4; } while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_ldur(1, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(1, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); i += 2; } while (i < nbytes) { - aa64_emit32(mc, aa64_ldur(0, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(0, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); i += 1; } } @@ -376,7 +377,7 @@ static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr, static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value, AggregateAccess agg) { MCEmitter* mc = t->mc; - u32 dr = agg_addr_reg(t, dst_addr, 9); + u32 dr = agg_addr_reg(t, dst_addr, AA_TMP0); u32 byte; if (byte_value.kind == OPK_IMM) { @@ -412,23 +413,23 @@ static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value, b64 |= b64 << 8; b64 |= b64 << 16; b64 |= b64 << 32; - aa64_emit_load_imm(mc, /*sf=*/1u, /*Rd=*/12u, (i64)b64); + aa64_emit_load_imm(mc, /*sf=*/1u, AA_TMP1, (i64)b64); u32 i = 0; while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_stur(3, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP1, dr, (i32)i)); i += 8; } while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_stur(2, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_stur(2, AA_TMP1, dr, (i32)i)); i += 4; } while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_stur(1, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_stur(1, AA_TMP1, dr, (i32)i)); i += 2; } while (i < nbytes) { - aa64_emit32(mc, aa64_stur(0, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_stur(0, AA_TMP1, dr, (i32)i)); i += 1; } } @@ -440,7 +441,7 @@ static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value, static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr, BitFieldAccess bf) { MCEmitter* mc = t->mc; - u32 base = agg_addr_reg(t, record_addr, 9); + u32 base = agg_addr_reg(t, record_addr, AA_TMP0); u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; u32 sf = (storage_bytes == 8u) ? 1u : 0u; u32 sidx = size_idx_for_bytes(storage_bytes); @@ -460,17 +461,17 @@ static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr, static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src, BitFieldAccess bf) { MCEmitter* mc = t->mc; - u32 base = agg_addr_reg(t, record_addr, 9); + u32 base = agg_addr_reg(t, record_addr, AA_TMP0); u32 storage_bytes = bf.storage.size ? bf.storage.size : 4u; u32 sf = (storage_bytes == 8u) ? 1u : 0u; u32 sidx = size_idx_for_bytes(storage_bytes); - aa64_emit32(mc, aa64_ldur(sidx, /*Rt=*/10u, base, (i32)bf.storage_offset)); + aa64_emit32(mc, aa64_ldur(sidx, AA_TMP1, base, (i32)bf.storage_offset)); u32 src_reg; if (src.kind == OPK_IMM) { - aa64_emit_load_imm(mc, sf, /*Rd=*/11u, src.v.imm); - src_reg = 11u; + aa64_emit_load_imm(mc, sf, AA_TMP2, src.v.imm); + src_reg = AA_TMP2; } else if (src.kind == OPK_REG) { src_reg = reg_num(src); } else { @@ -484,9 +485,9 @@ static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src, u32 width = bf.bit_width ? bf.bit_width : 1u; u32 immr = (reg_size - lsb) % reg_size; u32 imms = width - 1u; - aa64_emit32(mc, aa64_bfm(sf, /*Rd=*/10u, src_reg, immr, imms)); + aa64_emit32(mc, aa64_bfm(sf, AA_TMP1, src_reg, immr, imms)); - aa64_emit32(mc, aa64_stur(sidx, /*Rt=*/10u, base, (i32)bf.storage_offset)); + aa64_emit32(mc, aa64_stur(sidx, AA_TMP1, base, (i32)bf.storage_offset)); } /* ============================================================ @@ -610,8 +611,9 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, } } - u32 rn = aa64_force_reg_int(t, a_op, sf, 9); - u32 rm = aa64_force_reg_int(t, b_op, sf, (rn == 9) ? 10 : 9); + u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); + u32 rm = + aa64_force_reg_int(t, b_op, sf, (rn == AA_TMP0) ? AA_TMP1 : AA_TMP0); u32 word; switch (op) { @@ -627,12 +629,12 @@ static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, case BO_UDIV: word = aa64_udiv(sf, rd, rn, rm); break; case BO_SDIV: word = aa64_sdiv(sf, rd, rn, rm); break; case BO_SREM: - aa64_emit32(mc, aa64_sdiv(sf, 11, rn, rm)); - word = aa64_msub(sf, rd, 11, rm, rn); + aa64_emit32(mc, aa64_sdiv(sf, AA_TMP2, rn, rm)); + word = aa64_msub(sf, rd, AA_TMP2, rm, rn); break; case BO_UREM: - aa64_emit32(mc, aa64_udiv(sf, 11, rn, rm)); - word = aa64_msub(sf, rd, 11, rm, rn); + aa64_emit32(mc, aa64_udiv(sf, AA_TMP2, rn, rm)); + word = aa64_msub(sf, rd, AA_TMP2, rm, rn); break; case BO_FADD: case BO_FSUB: @@ -649,7 +651,7 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op) { MCEmitter* mc = t->mc; u32 sf = type_is_64(dst.type) ? 1u : 0u; u32 rd = reg_num(dst); - u32 rn = aa64_force_reg_int(t, a_op, sf, 9); + u32 rn = aa64_force_reg_int(t, a_op, sf, AA_TMP0); u32 word; switch (op) { @@ -794,7 +796,7 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, if (!to_stack) dst_reg = (*next_int)++; else - dst_reg = 9; + dst_reg = AA_TMP0; if (av->storage.kind == OPK_LOCAL) { AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot); if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad byval slot"); @@ -821,7 +823,7 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, if (pt->cls == ABI_CLASS_INT) { int to_stack = (*next_int >= 8); - u32 dst_reg = to_stack ? 9u : (*next_int)++; + u32 dst_reg = to_stack ? AA_TMP0 : (*next_int)++; switch (av->storage.kind) { case OPK_IMM: { u32 sf = (sz == 8) ? 1u : 0u; @@ -847,7 +849,7 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, src.v.ind.base = av->storage.v.ind.base; src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; i32 off; - u32 base = addr_base(t, src, &off, /*tmp=*/9); + u32 base = addr_base(t, src, &off, AA_TMP0); aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, base, off)); break; } @@ -877,7 +879,7 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, src.v.ind.base = av->storage.v.ind.base; src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; i32 off; - u32 base = addr_base(t, src, &off, /*tmp=*/9); + u32 base = addr_base(t, src, &off, AA_TMP0); aa64_emit32(t->mc, aa64_ldur_fp(sidx, dst_reg, base, off)); break; } @@ -899,9 +901,9 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, src.v.ind.base = av->storage.v.ind.base; src.v.ind.ofs = av->storage.v.ind.ofs + (i32)pt->src_offset; i32 off; - u32 base = addr_base(t, src, &off, /*tmp=*/9); - aa64_emit32(t->mc, aa64_ldur_fp(sidx, /*Vt=*/16u, base, off)); - aa64_emit32(t->mc, aa64_stur_fp(sidx, /*Vt=*/16u, 31, (i32)*stack_off)); + u32 base = addr_base(t, src, &off, AA_TMP0); + aa64_emit32(t->mc, aa64_ldur_fp(sidx, AA_FP_TMP0, base, off)); + aa64_emit32(t->mc, aa64_stur_fp(sidx, AA_FP_TMP0, 31, (i32)*stack_off)); break; } default: @@ -1033,23 +1035,23 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) { u32 nbytes = s->size; u32 i = 0; while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_ldur(3, 9, 29, -(i32)s->off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(3, 9, 8, i)); + aa64_emit32(mc, aa64_ldur(3, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); i += 8; } while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_ldur(2, 9, 29, -(i32)s->off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(2, 9, 8, i)); + aa64_emit32(mc, aa64_ldur(2, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); i += 4; } while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_ldur(1, 9, 29, -(i32)s->off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(1, 9, 8, i)); + aa64_emit32(mc, aa64_ldur(1, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); i += 2; } while (i < nbytes) { - aa64_emit32(mc, aa64_ldur(0, 9, 29, -(i32)s->off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(0, 9, 8, i)); + aa64_emit32(mc, aa64_ldur(0, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); i += 1; } } else if (val->storage.kind == OPK_INDIRECT) { @@ -1066,23 +1068,23 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) { i32 base_off = val->storage.v.ind.ofs; u32 i = 0; while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_ldur(3, 9, base_reg, base_off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(3, 9, 8, i)); + aa64_emit32(mc, aa64_ldur(3, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); i += 8; } while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_ldur(2, 9, base_reg, base_off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(2, 9, 8, i)); + aa64_emit32(mc, aa64_ldur(2, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); i += 4; } while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_ldur(1, 9, base_reg, base_off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(1, 9, 8, i)); + aa64_emit32(mc, aa64_ldur(1, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); i += 2; } while (i < nbytes) { - aa64_emit32(mc, aa64_ldur(0, 9, base_reg, base_off + (i32)i)); - aa64_emit32(mc, aa64_str_uimm(0, 9, 8, i)); + aa64_emit32(mc, aa64_ldur(0, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); i += 1; } } else { @@ -1167,10 +1169,10 @@ static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) { aa64_emit32(mc, aa64_sub_imm(1, /*Rd=SP*/ 31, /*Rn=SP*/ 31, (u32)aligned, 0)); } else if (sz.kind == OPK_REG) { u32 sz_reg = reg_num(sz); - aa64_emit32(mc, aa64_add_imm(1, 9, sz_reg, 15u, 0)); - aa64_emit32(mc, aa64_ubfm(1, 9, 9, 4, 63)); - aa64_emit32(mc, aa64_ubfm(1, 9, 9, 60, 59)); - aa64_emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/ 31, /*SP*/ 31, 9)); + aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, sz_reg, 15u, 0)); + aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 4, 63)); + aa64_emit32(mc, aa64_ubfm(1, AA_TMP0, AA_TMP0, 60, 59)); + aa64_emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/ 31, /*SP*/ 31, AA_TMP0)); } else { compiler_panic(t->c, a->loc, "aarch64 alloca: size kind %d unsupported", (int)sz.kind); @@ -1223,21 +1225,23 @@ static void aa_va_start_(CGTarget* t, Operand ap_op) { { u32 ofs = 16u + a->next_param_stack; if (ofs <= 0xfff) - aa64_emit32(mc, aa64_add_imm(1, 9, 29, ofs, 0)); + aa64_emit32(mc, aa64_add_imm(1, AA_TMP0, 29, ofs, 0)); else { - aa64_emit_load_imm(mc, 1, 9, (i64)ofs); - aa64_emit32(mc, aa64_add(1, 9, 29, 9)); + aa64_emit_load_imm(mc, 1, AA_TMP0, (i64)ofs); + aa64_emit32(mc, aa64_add(1, AA_TMP0, 29, AA_TMP0)); } - aa64_emit32(mc, aa64_str_uimm(3, 9, ap, 0)); - } - emit_fp_off(mc, 9, -(i32)gs->off + (i32)gs->size); - aa64_emit32(mc, aa64_str_uimm(3, 9, ap, 8)); - emit_fp_off(mc, 9, -(i32)fs->off + (i32)fs->size); - aa64_emit32(mc, aa64_str_uimm(3, 9, ap, 16)); - aa64_emit_load_imm(mc, 0, 9, (i64)((i32)(a->next_param_int * 8u) - 64)); - aa64_emit32(mc, aa64_str_uimm(2, 9, ap, 24)); - aa64_emit_load_imm(mc, 0, 9, (i64)((i32)(a->next_param_fp * 16u) - 128)); - aa64_emit32(mc, aa64_str_uimm(2, 9, ap, 28)); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 0)); + } + emit_fp_off(mc, AA_TMP0, -(i32)gs->off + (i32)gs->size); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 8)); + emit_fp_off(mc, AA_TMP0, -(i32)fs->off + (i32)fs->size); + aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, ap, 16)); + aa64_emit_load_imm(mc, 0, AA_TMP0, + (i64)((i32)(a->next_param_int * 8u) - 64)); + aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 24)); + aa64_emit_load_imm(mc, 0, AA_TMP0, + (i64)((i32)(a->next_param_fp * 16u) - 128)); + aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, ap, 28)); } static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op, @@ -1254,31 +1258,31 @@ static void aa_va_arg_(CGTarget* t, Operand dst, Operand ap_op, MCLabel L_stack = mc->label_new(mc); MCLabel L_done = mc->label_new(mc); - aa64_emit32(mc, aa64_ldur(2, 9, ap, (i32)offs_field)); - aa64_emit32(mc, aa64_subs_imm(0, 31, 9, 0)); + aa64_emit32(mc, aa64_ldur(2, AA_TMP0, ap, (i32)offs_field)); + aa64_emit32(mc, aa64_subs_imm(0, 31, AA_TMP0, 0)); aa64_emit32(mc, aa64_b_cond(0xa /*GE*/)); mc->emit_label_ref(mc, L_stack, R_AARCH64_CONDBR19, 4, 0); - aa64_emit32(mc, aa64_ldur(3, 10, ap, (i32)top_field)); - aa64_emit32(mc, aa64_sbfm(1, 12, 9, 0, 31)); - aa64_emit32(mc, aa64_add(1, 11, 10, 12)); + aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, (i32)top_field)); + aa64_emit32(mc, aa64_sbfm(1, AA_TMP2, AA_TMP0, 0, 31)); + aa64_emit32(mc, aa64_add(1, AA_TMP2, AA_TMP1, AA_TMP2)); if (is_fp) - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 11, 0)); + aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP2, 0)); else - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), 11, 0)); - aa64_emit32(mc, aa64_add_imm(0, 9, 9, stride_reg, 0)); - aa64_emit32(mc, aa64_stur(2, 9, ap, (i32)offs_field)); + aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP2, 0)); + aa64_emit32(mc, aa64_add_imm(0, AA_TMP0, AA_TMP0, stride_reg, 0)); + aa64_emit32(mc, aa64_stur(2, AA_TMP0, ap, (i32)offs_field)); aa64_emit32(mc, aa64_b_base()); mc->emit_label_ref(mc, L_done, R_AARCH64_JUMP26, 4, 0); mc->label_place(mc, L_stack); - aa64_emit32(mc, aa64_ldur(3, 10, ap, 0)); + aa64_emit32(mc, aa64_ldur(3, AA_TMP1, ap, 0)); if (is_fp) - aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), 10, 0)); + aa64_emit32(mc, aa64_ldur_fp(sidx, reg_num(dst), AA_TMP1, 0)); else - aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), 10, 0)); - aa64_emit32(mc, aa64_add_imm(1, 10, 10, 8u, 0)); - aa64_emit32(mc, aa64_stur(3, 10, ap, 0)); + aa64_emit32(mc, aa64_ldur(sidx, reg_num(dst), AA_TMP1, 0)); + aa64_emit32(mc, aa64_add_imm(1, AA_TMP1, AA_TMP1, 8u, 0)); + aa64_emit32(mc, aa64_stur(3, AA_TMP1, ap, 0)); mc->label_place(mc, L_done); } @@ -1293,8 +1297,8 @@ static void aa_va_copy_(CGTarget* t, Operand d, Operand s) { u32 dr = reg_num(d); u32 sr = reg_num(s); for (u32 i = 0; i < 32u; i += 8u) { - aa64_emit32(mc, aa64_ldur(3, 9, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, 9, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(3, AA_TMP0, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP0, dr, (i32)i)); } } @@ -1346,7 +1350,7 @@ static void aa_atomic_load(CGTarget* t, Operand dst, Operand addr, MemAccess ma, } else if (addr.kind == OPK_LOCAL) { AASlot* s = aa64_slot_get(a, addr.v.frame_slot); if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_load: bad slot"); - base = 9u; + base = AA_TMP0; aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0)); } else { compiler_panic(t->c, a->loc, @@ -1369,7 +1373,7 @@ static void aa_atomic_store(CGTarget* t, Operand addr, Operand src, u32 src_reg; if (src.kind == OPK_IMM) { - src_reg = 10u; + src_reg = AA_TMP1; aa64_emit_load_imm(mc, sf, src_reg, src.v.imm); } else if (src.kind == OPK_REG) { src_reg = reg_num(src); @@ -1384,7 +1388,7 @@ static void aa_atomic_store(CGTarget* t, Operand addr, Operand src, } else if (addr.kind == OPK_LOCAL) { AASlot* s = aa64_slot_get(a, addr.v.frame_slot); if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_store: bad slot"); - base = 9u; + base = AA_TMP0; aa64_emit32(mc, aa64_sub_imm(1, base, 29, s->off, 0)); } else { compiler_panic(t->c, a->loc, @@ -1424,18 +1428,18 @@ static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr, MCEmitter* mc = t->mc; u32 sf = (ma.size == 8) ? 1u : 0u; - u32 base = 9u; + u32 base = AA_TMP0; if (addr.kind == OPK_REG) { - aa64_emit32(mc, aa64_mov_reg(1, 9, reg_num(addr))); + aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr))); } else if (addr.kind == OPK_LOCAL) { AASlot* s = aa64_slot_get(a, addr.v.frame_slot); if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: bad slot"); - aa64_emit32(mc, aa64_sub_imm(1, 9, 29, s->off, 0)); + aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0)); } else { compiler_panic(t->c, a->loc, "aarch64 atomic_rmw: addr kind %d unsupported", (int)addr.kind); } - u32 vreg = 10u; + u32 vreg = AA_TMP1; if (val.kind == OPK_IMM) { aa64_emit_load_imm(mc, sf, vreg, val.v.imm); } else if (val.kind == OPK_REG) { @@ -1456,15 +1460,15 @@ static void aa_atomic_rmw(CGTarget* t, AtomicOp op, Operand dst, Operand addr, else aa64_emit32(mc, aa64_ldxr(sf, reg_num(dst), base)); - emit_rmw_combine(mc, op, sf, /*new=*/11u, /*prior=*/reg_num(dst), vreg); + emit_rmw_combine(mc, op, sf, AA_TMP2, reg_num(dst), vreg); if (do_rel) - aa64_emit32(mc, aa64_stlxr(sf, /*Rs=*/12u, /*Rt=*/11u, base)); + aa64_emit32(mc, aa64_stlxr(sf, vreg, AA_TMP2, base)); else - aa64_emit32(mc, aa64_stxr(sf, /*Rs=*/12u, /*Rt=*/11u, base)); + aa64_emit32(mc, aa64_stxr(sf, vreg, AA_TMP2, base)); u32 cbnz_pos = mc->pos(mc); - aa64_emit32(mc, aa64_cbnz(0, /*Rt=*/12u)); + aa64_emit32(mc, aa64_cbnz(0, vreg)); mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); (void)cbnz_pos; } @@ -1477,28 +1481,28 @@ static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr, u32 sf = (ma.size == 8) ? 1u : 0u; (void)fail; - u32 base = 9u; + u32 base = AA_TMP0; if (addr.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(1, 9, reg_num(addr))); + aa64_emit32(mc, aa64_mov_reg(1, AA_TMP0, reg_num(addr))); else if (addr.kind == OPK_LOCAL) { AASlot* s = aa64_slot_get(a, addr.v.frame_slot); if (!s) compiler_panic(t->c, a->loc, "aarch64 atomic_cas: bad slot"); - aa64_emit32(mc, aa64_sub_imm(1, 9, 29, s->off, 0)); + aa64_emit32(mc, aa64_sub_imm(1, AA_TMP0, 29, s->off, 0)); } else { compiler_panic(t->c, a->loc, "aarch64 atomic_cas: addr kind %d unsupported", (int)addr.kind); } if (expected.kind == OPK_IMM) - aa64_emit_load_imm(mc, sf, 10, expected.v.imm); + aa64_emit_load_imm(mc, sf, AA_TMP1, expected.v.imm); else if (expected.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(sf, 10, reg_num(expected))); + aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP1, reg_num(expected))); else compiler_panic(t->c, a->loc, "aarch64 atomic_cas: exp kind %d unsupported", (int)expected.kind); if (desired.kind == OPK_IMM) - aa64_emit_load_imm(mc, sf, 11, desired.v.imm); + aa64_emit_load_imm(mc, sf, AA_TMP2, desired.v.imm); else if (desired.kind == OPK_REG) - aa64_emit32(mc, aa64_mov_reg(sf, 11, reg_num(desired))); + aa64_emit32(mc, aa64_mov_reg(sf, AA_TMP2, reg_num(desired))); else compiler_panic(t->c, a->loc, "aarch64 atomic_cas: des kind %d unsupported", (int)desired.kind); @@ -1516,15 +1520,15 @@ static void aa_atomic_cas(CGTarget* t, Operand prior, Operand ok, Operand addr, else aa64_emit32(mc, aa64_ldxr(sf, reg_num(prior), base)); - aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, reg_num(prior), 10u)); + aa64_emit32(mc, aa64_subs_reg(sf, /*Rd=ZR*/ 31u, reg_num(prior), AA_TMP1)); aa64_emit32(mc, aa64_b_cond(0x1u /*NE*/)); mc->emit_label_ref(mc, L_fail, R_AARCH64_CONDBR19, 4, 0); if (do_rel) - aa64_emit32(mc, aa64_stlxr(sf, 12u, 11u, base)); + aa64_emit32(mc, aa64_stlxr(sf, AA_TMP1, AA_TMP2, base)); else - aa64_emit32(mc, aa64_stxr(sf, 12u, 11u, base)); - aa64_emit32(mc, aa64_cbnz(0, 12u)); + aa64_emit32(mc, aa64_stxr(sf, AA_TMP1, AA_TMP2, base)); + aa64_emit32(mc, aa64_cbnz(0, AA_TMP1)); mc->emit_label_ref(mc, L_retry, R_AARCH64_CONDBR19, 4, 0); aa64_emit_load_imm(mc, 0, reg_num(ok), 1); @@ -1597,12 +1601,12 @@ static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, Operand dst = dsts[0]; u32 sz_in = type_byte_size(src.type); if (sz_in == 8) - aa64_emit32(mc, aa64_fmov_d_x(0, reg_num(src))); + aa64_emit32(mc, aa64_fmov_d_x(AA_FP_TMP0, reg_num(src))); else - aa64_emit32(mc, aa64_fmov_s_w(0, reg_num(src))); - aa64_emit32(mc, aa64_cnt_8b(0, 0)); - aa64_emit32(mc, aa64_addv_b_8b(0, 0)); - aa64_emit32(mc, aa64_fmov_w_s(reg_num(dst), 0)); + aa64_emit32(mc, aa64_fmov_s_w(AA_FP_TMP0, reg_num(src))); + aa64_emit32(mc, aa64_cnt_8b(AA_FP_TMP0, AA_FP_TMP0)); + aa64_emit32(mc, aa64_addv_b_8b(AA_FP_TMP0, AA_FP_TMP0)); + aa64_emit32(mc, aa64_fmov_w_s(reg_num(dst), AA_FP_TMP0)); return; } case INTRIN_CLZ: { @@ -1646,46 +1650,46 @@ static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, if (kind == INTRIN_MEMCPY) { u32 i = 0; while (i + 8 <= n) { - aa64_emit32(mc, aa64_ldur(3, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); i += 8; } while (i + 4 <= n) { - aa64_emit32(mc, aa64_ldur(2, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(2, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); i += 4; } while (i + 2 <= n) { - aa64_emit32(mc, aa64_ldur(1, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(1, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); i += 2; } while (i < n) { - aa64_emit32(mc, aa64_ldur(0, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(0, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); i += 1; } } else { u32 i = n; while (i >= 8) { i -= 8; - aa64_emit32(mc, aa64_ldur(3, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(3, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(3, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(3, AA_TMP2, dr, (i32)i)); } while (i >= 4) { i -= 4; - aa64_emit32(mc, aa64_ldur(2, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(2, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(2, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(2, AA_TMP2, dr, (i32)i)); } while (i >= 2) { i -= 2; - aa64_emit32(mc, aa64_ldur(1, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(1, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(1, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(1, AA_TMP2, dr, (i32)i)); } while (i >= 1) { i -= 1; - aa64_emit32(mc, aa64_ldur(0, 12, sr, (i32)i)); - aa64_emit32(mc, aa64_stur(0, 12, dr, (i32)i)); + aa64_emit32(mc, aa64_ldur(0, AA_TMP2, sr, (i32)i)); + aa64_emit32(mc, aa64_stur(0, AA_TMP2, dr, (i32)i)); } } return; @@ -1710,13 +1714,13 @@ static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, b64 |= b64 << 8; b64 |= b64 << 16; b64 |= b64 << 32; - aa64_emit_load_imm(mc, 1, 12, (i64)b64); - src_reg = 12u; + aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)b64); + src_reg = AA_TMP2; } } else if (bv.kind == OPK_REG) { - aa64_emit_load_imm(mc, 1, 12, (i64)0x0101010101010101ll); - aa64_emit32(mc, aa64_madd(1, 12, reg_num(bv), 12, AA64_ZR)); - src_reg = 12u; + aa64_emit_load_imm(mc, 1, AA_TMP2, (i64)0x0101010101010101ll); + aa64_emit32(mc, aa64_madd(1, AA_TMP2, reg_num(bv), AA_TMP2, AA64_ZR)); + src_reg = AA_TMP2; } else { compiler_panic(t->c, a->loc, "aarch64 intrinsic: memset byte kind %d unsupported", @@ -1779,8 +1783,10 @@ static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, Operand a_op = args[0], b_op = args[1]; Operand dval = dsts[0], dovf = dsts[1]; u32 sf = type_is_64(dval.type) ? 1u : 0u; - u32 ra = aa64_force_reg_int(t, a_op, sf, 9); - u32 rb = aa64_force_reg_int(t, b_op, sf, (ra == 9) ? 10u : 9u); + u32 ra = aa64_force_reg_int(t, a_op, sf, AA_TMP0); + u32 rb = + aa64_force_reg_int(t, b_op, sf, + (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0); u32 word = (kind == INTRIN_ADD_OVERFLOW) ? aa64_adds_reg(sf, reg_num(dval), ra, rb) : aa64_subs_reg(sf, reg_num(dval), ra, rb); @@ -1797,12 +1803,14 @@ static void aa_intrinsic(CGTarget* t, IntrinKind kind, Operand* dsts, u32 nd, t->c, a->loc, "aarch64 intrinsic: mul_overflow on i64 not yet supported"); } - u32 ra = aa64_force_reg_int(t, a_op, 0, 9); - u32 rb = aa64_force_reg_int(t, b_op, 0, (ra == 9) ? 10u : 9u); - aa64_emit32(mc, aa64_smull(/*X*/ 11u, ra, rb)); - aa64_emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/ 31u, /*Xn=*/11u, /*Wm=*/11u)); + u32 ra = aa64_force_reg_int(t, a_op, 0, AA_TMP0); + u32 rb = + aa64_force_reg_int(t, b_op, 0, + (ra == AA_TMP0) ? AA_TMP1 : AA_TMP0); + aa64_emit32(mc, aa64_smull(AA_TMP2, ra, rb)); + aa64_emit32(mc, aa64_subs_extreg_x_sxtw(/*XZR*/ 31u, AA_TMP2, AA_TMP2)); aa64_emit32(mc, aa64_cset(0, reg_num(dovf), 0x1u /*NE*/)); - aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), 11u)); + aa64_emit32(mc, aa64_mov_reg(0, reg_num(dval), AA_TMP2)); return; } default: diff --git a/src/arch/aarch64/opt_coord.c b/src/arch/aarch64/opt_coord.c @@ -79,8 +79,7 @@ static void aa_reserve_hard_regs(CGTarget* t, RegClass cls, default: return; } for (u32 i = 0; i < n; ++i) { - u32 idx = (u32)(regs[i] - p->base); - if (idx < p->nregs && idx + 1u > p->hwm) p->hwm = idx + 1u; + cg_simple_regpool_reserve(p, regs[i]); } } diff --git a/src/arch/regalloc.c b/src/arch/regalloc.c @@ -0,0 +1,92 @@ +#include "arch/regalloc.h" + +#include <string.h> + +static u32 pool_mask(u32 nregs) { + if (nregs > CG_SIMPLE_REGALLOC_MAX_REGS) nregs = CG_SIMPLE_REGALLOC_MAX_REGS; + return (nregs >= 32u) ? 0xFFFFFFFFu : ((1u << nregs) - 1u); +} + +void cg_simple_regpool_init_range(CGSimpleRegPool* p, Reg base, u32 nregs) { + if (nregs > CG_SIMPLE_REGALLOC_MAX_REGS) nregs = CG_SIMPLE_REGALLOC_MAX_REGS; + p->free = pool_mask(nregs); + p->hwm = 0; + p->order = NULL; + p->base = base; + p->nregs = nregs; +} + +void cg_simple_regpool_init_ordered(CGSimpleRegPool* p, const Reg* regs, + u32 nregs) { + if (nregs > CG_SIMPLE_REGALLOC_MAX_REGS) nregs = CG_SIMPLE_REGALLOC_MAX_REGS; + p->free = pool_mask(nregs); + p->hwm = 0; + p->order = regs; + p->base = 0; + p->nregs = nregs; +} + +Reg cg_simple_regpool_reg_at(const CGSimpleRegPool* p, u32 idx) { + if (idx >= p->nregs) return (Reg)REG_NONE; + return p->order ? p->order[idx] : (Reg)(p->base + idx); +} + +Reg cg_simple_regpool_alloc(CGSimpleRegPool* p) { + if (p->free == 0) return (Reg)REG_NONE; + u32 idx = (u32)__builtin_ctz(p->free); + p->free &= ~(1u << idx); + if (idx + 1u > p->hwm) p->hwm = idx + 1u; + return cg_simple_regpool_reg_at(p, idx); +} + +int cg_simple_regpool_free(CGSimpleRegPool* p, Reg r) { + for (u32 i = 0; i < p->nregs; ++i) { + if (cg_simple_regpool_reg_at(p, i) == r) { + u32 bit = 1u << i; + if (p->free & bit) return -1; + p->free |= bit; + return 1; + } + } + return 0; +} + +void cg_simple_regpool_reserve(CGSimpleRegPool* p, Reg r) { + for (u32 i = 0; i < p->nregs; ++i) { + if (cg_simple_regpool_reg_at(p, i) == r) { + if (i + 1u > p->hwm) p->hwm = i + 1u; + return; + } + } +} + +void cg_simple_regalloc_init(CGSimpleRegAlloc* a) { + memset(a, 0, sizeof *a); +} + +void cg_simple_regalloc_set_range(CGSimpleRegAlloc* a, RegClass cls, Reg base, + u32 nregs) { + if ((u32)cls >= 3u) return; + cg_simple_regpool_init_range(&a->pools[cls], base, nregs); +} + +void cg_simple_regalloc_set_ordered(CGSimpleRegAlloc* a, RegClass cls, + const Reg* regs, u32 nregs) { + if ((u32)cls >= 3u) return; + cg_simple_regpool_init_ordered(&a->pools[cls], regs, nregs); +} + +Reg cg_simple_regalloc_alloc(CGSimpleRegAlloc* a, RegClass cls) { + if ((u32)cls >= 3u) return (Reg)REG_NONE; + return cg_simple_regpool_alloc(&a->pools[cls]); +} + +int cg_simple_regalloc_free(CGSimpleRegAlloc* a, RegClass cls, Reg r) { + if ((u32)cls >= 3u) return -2; + return cg_simple_regpool_free(&a->pools[cls], r); +} + +void cg_simple_regalloc_reserve(CGSimpleRegAlloc* a, RegClass cls, Reg r) { + if ((u32)cls >= 3u) return; + cg_simple_regpool_reserve(&a->pools[cls], r); +} diff --git a/src/arch/regalloc.h b/src/arch/regalloc.h @@ -0,0 +1,37 @@ +#ifndef CFREE_ARCH_REGALLOC_H +#define CFREE_ARCH_REGALLOC_H + +#include "arch/arch.h" + +#define CG_SIMPLE_REGALLOC_MAX_REGS 32u + +typedef struct CGSimpleRegPool { + u32 free; /* bit i set iff reg_at(i) is free */ + u32 hwm; /* highest index+1 ever allocated/reserved */ + const Reg* order; /* optional ordered hard-reg table */ + Reg base; /* used when order is NULL: reg_at(i) = base + i */ + u32 nregs; +} CGSimpleRegPool; + +typedef struct CGSimpleRegAlloc { + CGSimpleRegPool pools[3]; /* indexed by RegClass */ +} CGSimpleRegAlloc; + +void cg_simple_regpool_init_range(CGSimpleRegPool* p, Reg base, u32 nregs); +void cg_simple_regpool_init_ordered(CGSimpleRegPool* p, const Reg* regs, + u32 nregs); +Reg cg_simple_regpool_alloc(CGSimpleRegPool* p); +int cg_simple_regpool_free(CGSimpleRegPool* p, Reg r); +void cg_simple_regpool_reserve(CGSimpleRegPool* p, Reg r); +Reg cg_simple_regpool_reg_at(const CGSimpleRegPool* p, u32 idx); + +void cg_simple_regalloc_init(CGSimpleRegAlloc* a); +void cg_simple_regalloc_set_range(CGSimpleRegAlloc* a, RegClass cls, Reg base, + u32 nregs); +void cg_simple_regalloc_set_ordered(CGSimpleRegAlloc* a, RegClass cls, + const Reg* regs, u32 nregs); +Reg cg_simple_regalloc_alloc(CGSimpleRegAlloc* a, RegClass cls); +int cg_simple_regalloc_free(CGSimpleRegAlloc* a, RegClass cls, Reg r); +void cg_simple_regalloc_reserve(CGSimpleRegAlloc* a, RegClass cls, Reg r); + +#endif diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h @@ -5,6 +5,7 @@ #include <string.h> #include "arch/arch.h" +#include "arch/regalloc.h" #include "arch/rv64.h" #include "arch/rv64_isa.h" #include "core/arena.h" @@ -14,13 +15,7 @@ #define RV_PROLOGUE_WORDS 32u /* ---- RegPool ---- */ -typedef struct RegPool { - u32 free; - u32 hwm; - u8 base; - u8 nregs; - u8 pad[2]; -} RegPool; +typedef CGSimpleRegPool RegPool; /* ---- RvSlot / RvScope ---- */ typedef struct RvSlot { @@ -115,26 +110,13 @@ static inline u32 reg_num(Operand op) { return op.v.reg & 0x1fu; } /* ---- RegPool ops (inlined — identical in each caller) ---- */ static inline void regpool_init(RegPool* p, u8 base, u8 nregs) { - p->base = base; - p->nregs = nregs; - p->hwm = 0; - p->free = (nregs >= 32u) ? 0xFFFFFFFFu : ((1u << nregs) - 1u); + cg_simple_regpool_init_range(p, base, nregs); } static inline Reg regpool_alloc(RegPool* p) { - if (p->free == 0) return (Reg)REG_NONE; - u32 idx = (u32)__builtin_ctz(p->free); - p->free &= ~(1u << idx); - if (idx + 1u > p->hwm) p->hwm = idx + 1u; - return (Reg)(p->base + idx); + return cg_simple_regpool_alloc(p); } static inline int regpool_free(RegPool* p, Reg r) { - u32 rn = (u32)r; - if (rn < p->base || rn >= (u32)(p->base + p->nregs)) return 0; - u32 idx = rn - p->base; - u32 bit = 1u << idx; - if (p->free & bit) return -1; - p->free |= bit; - return 1; + return cg_simple_regpool_free(p, r); } /* ---- emit.c: function lifecycle (referenced by ops.c vtable) ---- */ diff --git a/src/arch/rv64/opt_coord.c b/src/arch/rv64/opt_coord.c @@ -78,8 +78,7 @@ static void rv_reserve_hard_regs(CGTarget* t, RegClass cls, default: return; } for (u32 i = 0; i < n; ++i) { - u32 idx = (u32)(regs[i] - p->base); - if (idx < p->nregs && idx + 1u > p->hwm) p->hwm = idx + 1u; + cg_simple_regpool_reserve(p, regs[i]); } } diff --git a/src/arch/x64/alloc.c b/src/arch/x64/alloc.c @@ -19,32 +19,16 @@ /* ============================================================ * XRegPool implementation. */ -void xpool_init(XRegPool* p, const u8* order, u8 nregs, u8 n_cs) { - p->order = order; - p->nregs = nregs; - p->n_cs = n_cs; - p->hwm = 0; - p->free = (nregs >= 32u) ? 0xFFFFFFFFu : ((1u << nregs) - 1u); +void xpool_init(XRegPool* p, const Reg* order, u32 nregs) { + cg_simple_regpool_init_ordered(p, order, nregs); } static Reg xpool_alloc(XRegPool* p) { - if (p->free == 0) return (Reg)REG_NONE; - u32 idx = (u32)__builtin_ctz(p->free); - p->free &= ~(1u << idx); - if (idx + 1u > p->hwm) p->hwm = idx + 1u; - return (Reg)p->order[idx]; + return cg_simple_regpool_alloc(p); } static int xpool_free(XRegPool* p, Reg r) { - for (u8 i = 0; i < p->nregs; ++i) { - if (p->order[i] == (u8)r) { - u32 bit = 1u << i; - if (p->free & bit) return -1; - p->free |= bit; - return 1; - } - } - return 0; + return cg_simple_regpool_free(p, r); } /* ============================================================ diff --git a/src/arch/x64/emit.c b/src/arch/x64/emit.c @@ -18,12 +18,12 @@ /* ============================================================ * Shared constant tables. */ -const u8 g_int_order[6] = { +const Reg g_int_order[6] = { X64_RBX, X64_R12, X64_R13, X64_R14, X64_R15, /* callee-saved (n_cs=5) */ X64_R10, /* caller-saved tail */ }; -const u8 g_fp_order[10] = { +const Reg g_fp_order[10] = { /* All xmm regs are caller-saved on SysV; preference order is xmm6 * upward to keep the low arg/return regs (xmm0..5) clear for calls. */ X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, X64_XMM0 + 10, @@ -478,8 +478,8 @@ void x_func_begin(CGTarget* t, const CGFuncDesc* fd) { a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0; a->cum_off = 0; a->max_outgoing = 0; - xpool_init(&a->int_pool, g_int_order, 6u, 5u); - xpool_init(&a->fp_pool, g_fp_order, 10u, 0u); + xpool_init(&a->int_pool, g_int_order, 6u); + xpool_init(&a->fp_pool, g_fp_order, 10u); a->nslots = 0; a->nscopes = 0; a->nalloca_patches = 0; @@ -538,7 +538,7 @@ void x_func_end(CGTarget* t) { MCEmitter* mc = t->mc; u32 cs_used = a->int_pool.hwm; - if (cs_used > a->int_pool.n_cs) cs_used = a->int_pool.n_cs; + if (cs_used > 5u) cs_used = 5u; u32 cs_size = cs_used * 8u; /* Stack alignment: SysV requires rsp ≡ 0 mod 16 just before a call, diff --git a/src/arch/x64/internal.h b/src/arch/x64/internal.h @@ -13,6 +13,7 @@ #include <string.h> #include "arch/arch.h" +#include "arch/regalloc.h" #include "arch/x64.h" #include "arch/x64_isa.h" #include "core/arena.h" @@ -24,14 +25,7 @@ /* ============================================================ * Custom register pool. */ -typedef struct XRegPool { - u32 free; /* bit i set ⇔ alloc_order[i] is free */ - u32 hwm; /* highest index+1 ever allocated */ - const u8* order; /* alloc_order; first n_cs are callee-saved */ - u8 nregs; - u8 n_cs; - u8 pad[2]; -} XRegPool; +typedef CGSimpleRegPool XRegPool; /* ============================================================ * XImpl and friends. */ @@ -137,8 +131,8 @@ static inline _Noreturn void x_panic(CGTarget* t, const char* what) { /* ============================================================ * Shared constant tables (defined in alloc.c, used in emit.c and ops.c). */ -extern const u8 g_int_order[6]; -extern const u8 g_fp_order[10]; +extern const Reg g_int_order[6]; +extern const Reg g_fp_order[10]; extern const u32 g_int_arg_regs[6]; /* ============================================================ @@ -196,7 +190,7 @@ void emit_sse_rr_w(MCEmitter* mc, u8 prefix, u8 opcode, int w, u32 dst, u32 src); /* --- alloc.c exports (used by emit.c and/or ops.c) --- */ -void xpool_init(XRegPool* p, const u8* order, u8 nregs, u8 n_cs); +void xpool_init(XRegPool* p, const Reg* order, u32 nregs); XSlot* x64_slot_get(XImpl* a, FrameSlot fs); FrameSlot x_frame_slot(CGTarget* t, const FrameSlotDesc* d); Reg x_alloc_reg(CGTarget* t, RegClass cls, CfreeCgTypeId ty); diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c @@ -16,11 +16,11 @@ static void x_get_allocable_regs(CGTarget* t, RegClass cls, XImpl* a = impl_of(t); switch (cls) { case RC_INT: - *out = (const Reg*)a->int_pool.order; + *out = a->int_pool.order; *nregs = a->int_pool.nregs; break; case RC_FP: - *out = (const Reg*)a->fp_pool.order; + *out = a->fp_pool.order; *nregs = a->fp_pool.nregs; break; default: @@ -75,12 +75,7 @@ static void x_reserve_hard_regs(CGTarget* t, RegClass cls, default: return; } for (u32 i = 0; i < n; ++i) { - for (u8 j = 0; j < p->nregs; ++j) { - if (p->order[j] == (u8)regs[i]) { - if (j + 1u > p->hwm) p->hwm = j + 1u; - break; - } - } + cg_simple_regpool_reserve(p, regs[i]); } }