kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 84863072e412579624a0d6c9f304344ebc4b1289
parent 2d37ba7b367e02cf86618cfa9e3a7a92d516e7ea
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  3 Jun 2026 14:53:02 -0700

Support explicit register asm operands

Diffstat:
Minclude/kit/cg.h | 6++++++
Mlang/c/parse/cg_adapter.c | 2++
Mlang/c/parse/cg_adapter.h | 1+
Mlang/c/parse/parse.c | 14+++++++++++---
Mlang/c/parse/parse_priv.h | 6++++++
Mlang/c/parse/parse_stmt.c | 88+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Mlang/c/parse/parse_type.c | 31+++++++++++++++++++++++++------
Msrc/arch/aa64/native.c | 198++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
Msrc/arch/c_target/c_emit.c | 115++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Msrc/arch/native_target.h | 13++++++++++++-
Msrc/arch/rv64/native.c | 176++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Msrc/arch/wasm/emit.c | 8++++++++
Msrc/arch/x64/emit.c | 15+--------------
Msrc/arch/x64/emit.h | 3---
Msrc/arch/x64/native.c | 251++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
Msrc/cg/asm.c | 18+++++++++++++++++-
Msrc/cg/cgtarget.h | 4++++
Msrc/cg/native_asm.c | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/cg/native_asm.h | 25+++++++++++++++++++++++++
Msrc/opt/pass_lower.c | 7++++++-
Msrc/opt/pass_machinize.c | 54++++++++++++++++++++++--------------------------------
Mtest/arch/x64_inline_test.c | 27+++++++++++++++++++++++++++
Atest/parse/cases/asm_03_register_operand.c | 37+++++++++++++++++++++++++++++++++++++
Atest/parse/cases/asm_03_register_operand.expected | 1+
Atest/parse/cases/asm_03_register_operand.wasm.skip | 1+
Atest/parse/cases/asm_04_register_callee_saved.c | 71+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/parse/cases/asm_04_register_callee_saved.expected | 1+
Atest/parse/cases/asm_04_register_callee_saved.wasm.skip | 1+
Atest/parse/cases/asm_05_register_label_scope.c | 10++++++++++
Atest/parse/cases/asm_05_register_label_scope.expected | 1+
Atest/parse/cases_err/asm_register_bad_constraint.c | 13+++++++++++++
Atest/parse/cases_err/asm_register_bad_constraint.errpat | 1+
Atest/parse/cases_err/asm_register_bad_name.c | 5+++++
Atest/parse/cases_err/asm_register_bad_name.errpat | 1+
Atest/parse/cases_err/asm_register_class_mismatch.c | 13+++++++++++++
Atest/parse/cases_err/asm_register_class_mismatch.errpat | 1+
Atest/parse/cases_err/asm_register_forbidden.c | 13+++++++++++++
Atest/parse/cases_err/asm_register_forbidden.errpat | 1+
38 files changed, 1063 insertions(+), 244 deletions(-)

diff --git a/include/kit/cg.h b/include/kit/cg.h @@ -1032,6 +1032,12 @@ typedef struct KitCgAsmOperand { KitSym constraint; /* interned target constraint string */ KitSym name; /* interned symbolic operand name; 0 if absent */ KitCgTypeId type; + /* Explicit hard register this operand must occupy, named by its target + * spelling ("r10", "x8", "a7", ...); 0 when unconstrained. Set by a frontend + * for a GNU local register variable (`register T x __asm__("r10")`) used as + * an operand. The name is opaque to the frontend and CG — only the target's + * register file resolves it to a physical register. */ + KitSym reg; uint8_t dir; /* KitCgAsmDir */ uint8_t pad[3]; } KitCgAsmOperand; diff --git a/lang/c/parse/cg_adapter.c b/lang/c/parse/cg_adapter.c @@ -1158,6 +1158,7 @@ void pcg_inline_asm(Parser* p, const char* tmpl, const AsmConstraint* outs, kit_sym_intern(p->c, kit_slice_cstr(outs[i].str ? outs[i].str : "")); o[i].name = outs[i].name; o[i].type = pcg_tid(p, outs[i].type); + o[i].reg = outs[i].reg; o[i].dir = KIT_CG_ASM_OUT; } } @@ -1168,6 +1169,7 @@ void pcg_inline_asm(Parser* p, const char* tmpl, const AsmConstraint* outs, kit_sym_intern(p->c, kit_slice_cstr(ins[i].str ? ins[i].str : "")); in[i].name = ins[i].name; in[i].type = pcg_tid(p, ins[i].type); + in[i].reg = ins[i].reg; in[i].dir = (ins[i].dir == ASM_INOUT) ? KIT_CG_ASM_INOUT : KIT_CG_ASM_IN; } } diff --git a/lang/c/parse/cg_adapter.h b/lang/c/parse/cg_adapter.h @@ -171,6 +171,7 @@ typedef struct AsmConstraint { const char* str; Sym name; const Type* type; + Sym reg; /* hard-register name for a GNU local register variable; 0 = none */ u8 dir; u8 pad[3]; } AsmConstraint; diff --git a/lang/c/parse/parse.c b/lang/c/parse/parse.c @@ -695,7 +695,9 @@ static SymEntry* declare_function(Parser* p, Sym fname, const Type* fn_ty, static void parse_init_declarator(Parser* p, const DeclSpecs* specs) { SrcLoc loc; Sym name; - const Type* var_ty = parse_declarator(p, specs->type, &name, &loc); + DeclaratorInfo dinfo; + const Type* var_ty = parse_declarator_full_info( + p, specs->type, /*allow_abstract=*/0, &name, &loc, NULL, &dinfo); if ((specs->flags & DF_THREAD) && specs->storage != DS_STATIC && specs->storage != DS_EXTERN) { perr(p, "block-scope _Thread_local requires static or extern"); @@ -876,7 +878,10 @@ static void parse_init_declarator(Parser* p, const DeclSpecs* specs) { s = make_local_aligned(p, name, var_ty, loc, specs->align); if (specs->storage == DS_REGISTER) { SymEntry* e = scope_lookup_current(p, name); - if (e && e->kind == SEK_LOCAL) e->storage = DS_REGISTER; + if (e && e->kind == SEK_LOCAL) { + e->storage = DS_REGISTER; + e->reg_asm_name = dinfo.asm_label; + } } pcg_set_loc(p, loc); init_at(p, s, var_ty, 0, var_ty); @@ -885,7 +890,10 @@ static void parse_init_declarator(Parser* p, const DeclSpecs* specs) { s = make_local_aligned(p, name, var_ty, loc, specs->align); if (specs->storage == DS_REGISTER) { SymEntry* e = scope_lookup_current(p, name); - if (e && e->kind == SEK_LOCAL) e->storage = DS_REGISTER; + if (e && e->kind == SEK_LOCAL) { + e->storage = DS_REGISTER; + e->reg_asm_name = dinfo.asm_label; + } } if (accept_punct(p, '=')) { pcg_set_loc(p, loc); diff --git a/lang/c/parse/parse_priv.h b/lang/c/parse/parse_priv.h @@ -134,6 +134,11 @@ struct SymEntry { FrameSlot vla_byte_slot; VLABound* vla_bounds; struct Attr* attrs; + /* For a `register T x __asm__("reg")` local: the interned hard-register name + * ("r10", "x8", ...) the variable is bound to. Pins x to that register when + * used as an inline-asm operand (GNU explicit register variables). 0 = none. + */ + Sym reg_asm_name; SymEntry* next; }; @@ -473,6 +478,7 @@ typedef struct DeclaratorInfo { ParamInfo* fn_params; u16 fn_nparams; u8 fn_variadic; + Sym asm_label; } DeclaratorInfo; /* ============================================================ diff --git a/lang/c/parse/parse_stmt.c b/lang/c/parse/parse_stmt.c @@ -517,10 +517,28 @@ void parse_static_assert(Parser* p) { * already been consumed by parse_stmt. */ typedef struct AsmOutLValue { FrameSlot addr_slot; + FrameSlot value_slot; const Type* ptr_ty; const Type* val_ty; + u8 direct_local; + u8 pad[3]; } AsmOutLValue; +static void asm_out_lvalue_push(Parser* p, const AsmOutLValue* lv) { + if (lv->direct_local) { + pcg_push_local_typed(p, lv->value_slot, lv->val_ty); + return; + } + pcg_push_local_typed(p, lv->addr_slot, lv->ptr_ty); + pcg_load(p); + pcg_deref(p, lv->val_ty); +} + +static void asm_out_value_push(Parser* p, const AsmOutLValue* lv) { + asm_out_lvalue_push(p, lv); + pcg_load(p); +} + static Sym parse_asm_operand_name(Parser* p) { Sym name = 0; if (!is_punct(&p->cur, '[')) return 0; @@ -554,6 +572,27 @@ static const char* parse_asm_str(Parser* p, const char* what) { return kit_sym_str(p->pool->c, s).s; } +/* GNU local register variables: when an asm operand is exactly a bare reference + * to a `register T x __asm__("reg")` local, return that register name (else 0). + * Called with p->cur positioned at the first token of the operand expression, + * so it only peeks — it must not consume. The operand has to be a lone + * identifier (the canonical idiom); anything more complex is not a + * hard-register operand under GCC's rules either. The name is carried opaquely + * on the constraint's `reg` field; CG/native code validates that the constraint + * is a target register constraint and only the target resolves it to a + * register. */ +static Sym asm_operand_pinned_reg(Parser* p, FrameSlot* slot_out) { + Tok nxt; + SymEntry* e; + if (p->cur.kind != TOK_IDENT) return 0; + nxt = peek1(p); + if (!is_punct(&nxt, ')')) return 0; + e = scope_lookup(p, p->cur.v.ident); + if (!e || e->kind != SEK_LOCAL) return 0; + if (e->reg_asm_name && slot_out) *slot_out = e->v.slot; + return e->reg_asm_name; +} + static void parse_asm_stmt(Parser* p) { const char* tmpl; AsmConstraint* outs = NULL; @@ -592,8 +631,10 @@ static void parse_asm_stmt(Parser* p) { const Type* ptr_ty; FrameSlotDesc fsd; FrameSlot slot; + FrameSlot pinned_slot; memset(&c, 0, sizeof c); memset(&lv, 0, sizeof lv); + pinned_slot = FRAME_SLOT_NONE; c.name = parse_asm_operand_name(p); c.str = parse_asm_str(p, "asm output constraint"); if (c.str && c.str[0] == '+') @@ -601,25 +642,32 @@ static void parse_asm_stmt(Parser* p) { else c.dir = ASM_OUT; expect_punct(p, '(', "'(' before asm output lvalue"); + c.reg = asm_operand_pinned_reg(p, &pinned_slot); parse_assign_expr(p); val_ty = pcg_top_type(p); if (!val_ty) perr(p, "asm output: cannot determine lvalue type"); c.type = val_ty; - pcg_addr(p); - ptr_ty = pcg_top_type(p); - if (!ptr_ty) perr(p, "asm output: cannot take address"); - memset(&fsd, 0, sizeof fsd); - fsd.type = ptr_ty; - fsd.size = 8; - fsd.align = 8; - fsd.kind = FS_LOCAL; - slot = pcg_local(p, &fsd); - pcg_push_local_typed(p, slot, ptr_ty); - pcg_swap(p); - pcg_store(p); - pcg_drop(p); - lv.addr_slot = slot; - lv.ptr_ty = ptr_ty; + if (c.reg && pinned_slot != FRAME_SLOT_NONE) { + pcg_drop(p); + lv.direct_local = 1; + lv.value_slot = pinned_slot; + } else { + pcg_addr(p); + ptr_ty = pcg_top_type(p); + if (!ptr_ty) perr(p, "asm output: cannot take address"); + memset(&fsd, 0, sizeof fsd); + fsd.type = ptr_ty; + fsd.size = 8; + fsd.align = 8; + fsd.kind = FS_LOCAL; + slot = pcg_local(p, &fsd); + pcg_push_local_typed(p, slot, ptr_ty); + pcg_swap(p); + pcg_store(p); + pcg_drop(p); + lv.addr_slot = slot; + lv.ptr_ty = ptr_ty; + } lv.val_ty = val_ty; expect_punct(p, ')', "')' after asm output lvalue"); if (nout == cap_out) { @@ -653,6 +701,7 @@ static void parse_asm_stmt(Parser* p) { c.str = parse_asm_str(p, "asm input constraint"); c.dir = ASM_IN; expect_punct(p, '(', "'(' before asm input expression"); + c.reg = asm_operand_pinned_reg(p, NULL); parse_assign_expr(p); to_rvalue(p); c.type = pcg_top_type(p); @@ -736,10 +785,7 @@ static void parse_asm_stmt(Parser* p) { "matching-digit syntax"); } AsmOutLValue* lv = &out_lvs[i]; - pcg_push_local_typed(p, lv->addr_slot, lv->ptr_ty); - pcg_load(p); - pcg_deref(p, lv->val_ty); - pcg_load(p); + asm_out_value_push(p, lv); AsmConstraint mc; memset(&mc, 0, sizeof mc); mc.str = k_match_strs[i]; @@ -756,9 +802,7 @@ static void parse_asm_stmt(Parser* p) { u32 i; for (i = nout; i-- > 0;) { AsmOutLValue* lv = &out_lvs[i]; - pcg_push_local_typed(p, lv->addr_slot, lv->ptr_ty); - pcg_load(p); - pcg_deref(p, lv->val_ty); + asm_out_lvalue_push(p, lv); pcg_swap(p); pcg_store(p); pcg_drop(p); diff --git a/lang/c/parse/parse_type.c b/lang/c/parse/parse_type.c @@ -109,20 +109,36 @@ static int starts_asm_label(const Parser* p) { return is_kw(p, &p->cur, KW_ASM) || is_kw(p, &p->cur, KW_BUILTIN_ASM); } -static void parse_asm_label(Parser* p) { +static Sym parse_asm_label(Parser* p) { + Sym label = 0; advance(p); /* asm / __asm / __asm__ */ expect_punct(p, '(', "'(' after asm label"); if (p->cur.kind != TOK_STR) { perr(p, "expected string literal in asm label"); } + /* Capture the label string for the declarator currently being parsed. For a + * `register T x __asm__("r10")` local this is the hard register name the + * variable binds to. Other asm labels (symbol renames) are still effectively + * ignored by callers that do not consume DeclaratorInfo.asm_label. */ + { + Tok t = p->cur; + size_t nlen = 0; + u8* bytes = decode_string_literal(p, &t, &nlen); + u32 ilen = (nlen > 0) ? (u32)(nlen - 1) : 0; + label = kit_sym_intern(p->pool->c, + (KitSlice){.s = (const char*)bytes, .len = ilen}); + kit_compiler_context(p->c)->heap->free(kit_compiler_context(p->c)->heap, + bytes, 0); + } do { advance(p); } while (p->cur.kind == TOK_STR); expect_punct(p, ')', "')' after asm label"); + return label; } static void parse_attrs_and_asm_into(Parser* p, Attr** attrs_out, - Attr** local_attrs) { + Attr** local_attrs, Sym* asm_label_out) { for (;;) { if (starts_attr(p)) { if (attrs_out) @@ -132,7 +148,8 @@ static void parse_attrs_and_asm_into(Parser* p, Attr** attrs_out, continue; } if (starts_asm_label(p)) { - parse_asm_label(p); + Sym label = parse_asm_label(p); + if (asm_label_out) *asm_label_out = label; continue; } break; @@ -146,7 +163,7 @@ static void parse_and_discard_attrs_or_asm(Parser* p) { continue; } if (starts_asm_label(p)) { - parse_asm_label(p); + (void)parse_asm_label(p); continue; } break; @@ -1469,6 +1486,7 @@ const Type* parse_declarator_full_info(Parser* p, const Type* base, SrcLoc* loc_out, Attr** attrs_out, DeclaratorInfo* info_out) { Attr* local_attrs = NULL; + Sym asm_label = 0; base = parse_pointer_layer(p, base); Sym name = 0; @@ -1594,7 +1612,7 @@ const Type* parse_declarator_full_info(Parser* p, const Type* base, } } - parse_attrs_and_asm_into(p, attrs_out, &local_attrs); + parse_attrs_and_asm_into(p, attrs_out, &local_attrs, &asm_label); DeclSuffix suffs[8]; int nsuffs = 0; @@ -1603,7 +1621,7 @@ const Type* parse_declarator_full_info(Parser* p, const Type* base, while (nsuffs < 8) { if (!parse_decl_suffix(p, &suffs[nsuffs])) break; ++nsuffs; - parse_attrs_and_asm_into(p, attrs_out, &local_attrs); + parse_attrs_and_asm_into(p, attrs_out, &local_attrs, &asm_label); } base = attrs_apply_type_mode(p, base, attrs_out ? *attrs_out : local_attrs); if (nsuffs == 8 && (is_punct(&p->cur, '[') || is_punct(&p->cur, '('))) { @@ -1642,6 +1660,7 @@ const Type* parse_declarator_full_info(Parser* p, const Type* base, info_out->fn_nparams = final_fn_suff->nparams; info_out->fn_variadic = final_fn_suff->variadic; } + if (info_out) info_out->asm_label = asm_label; if (name_out) *name_out = name; if (loc_out) *loc_out = nloc; return base; diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -774,16 +774,18 @@ static void aa_emit_mem(AANativeTarget* a, int load, NativeLoc reg, mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_LD64_GOT_LO12_NC, addr.base.global.sym, 0, 0, 0); if (addend) aa_emit_add_i64(a, scratch, scratch, addend); - aa_emit32(mc, load ? aa_ldur_v(sz, native_loc_is_fp(reg), rt, scratch, 0) - : aa_stur_v(sz, native_loc_is_fp(reg), rt, scratch, 0)); + aa_emit32(mc, load + ? aa_ldur_v(sz, native_loc_is_fp(reg), rt, scratch, 0) + : aa_stur_v(sz, native_loc_is_fp(reg), rt, scratch, 0)); return; } aa_emit32(mc, aa64_adrp(scratch, 0, 0)); mc->emit_reloc_at(mc, mc->section_id, pos, R_AARCH64_ADR_PREL_PG_HI21, addr.base.global.sym, addend, 0, 0); pos = mc->pos(mc); - aa_emit32(mc, load ? aa_ldr_uimm_v(sz, native_loc_is_fp(reg), rt, scratch, 0) - : aa_str_uimm_v(sz, native_loc_is_fp(reg), rt, scratch, 0)); + aa_emit32(mc, + load ? aa_ldr_uimm_v(sz, native_loc_is_fp(reg), rt, scratch, 0) + : aa_str_uimm_v(sz, native_loc_is_fp(reg), rt, scratch, 0)); mc->emit_reloc_at(mc, mc->section_id, pos, aa_ldst_reloc_for_size(sz), addr.base.global.sym, addend, 0, 0); return; @@ -805,14 +807,16 @@ static void aa_emit_mem(AANativeTarget* a, int load, NativeLoc reg, } else { aa_panic(a, "unsupported memory address scale"); } - aa_emit32(mc, aa_ldst_regoff_v(sz, native_loc_is_fp(reg), load, rt, use_base, - addr.index.reg, scaled)); + aa_emit32(mc, aa_ldst_regoff_v(sz, native_loc_is_fp(reg), load, rt, + use_base, addr.index.reg, scaled)); return; } if (off >= 0 && (((u32)off & ((1u << sz) - 1u)) == 0) && ((u32)off >> sz) <= 0xfffu) { - aa_emit32(mc, load ? aa_ldr_uimm_v(sz, native_loc_is_fp(reg), rt, base, (u32)off) - : aa_str_uimm_v(sz, native_loc_is_fp(reg), rt, base, (u32)off)); + aa_emit32( + mc, load + ? aa_ldr_uimm_v(sz, native_loc_is_fp(reg), rt, base, (u32)off) + : aa_str_uimm_v(sz, native_loc_is_fp(reg), rt, base, (u32)off)); return; } if (off >= -256 && off <= 255) { @@ -962,7 +966,8 @@ static void aa_materialize_frame_index(AANativeTarget* a, NativeAddr* addr, addr->index.reg = reg; } -static NativeLoc native_loc_reg(KitCgTypeId type, NativeAllocClass cls, Reg reg); +static NativeLoc native_loc_reg(KitCgTypeId type, NativeAllocClass cls, + Reg reg); static u32 aa_ldst_q_uimm(int load, u32 rt, u32 rn, u32 byte_off) { return aa64_ldst_uimm_pack((AA64LdStUimm){.size = 0, @@ -2691,7 +2696,8 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, if (plan->callee.kind == NATIVE_LOC_REG && (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT && plan->callee.v.reg < 8u) { - NativeLoc scratch = native_loc_reg(plan->callee.type, NATIVE_REG_INT, AA_TMP0); + NativeLoc scratch = + native_loc_reg(plan->callee.type, NATIVE_REG_INT, AA_TMP0); aa_move(t, scratch, plan->callee); plan->callee = scratch; } @@ -2730,15 +2736,15 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, if (ai->kind == ABI_ARG_INDIRECT) { if (next_int < 8u) { AAArgMove* m = &moves[nmoves++]; - m->dst = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, - next_int++); + m->dst = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), + NATIVE_REG_INT, next_int++); m->src = desc->args[i]; m->src_offset = 0; m->size = 8; m->is_addr = 1; } else { NativeLoc ptr = native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), - NATIVE_REG_INT, AA_TMP0); + NATIVE_REG_INT, AA_TMP0); aa_addr_of_loc(t, ptr, desc->args[i]); aa_store_outgoing_part(t, tail_call, stack, ptr, 8); stack += 8u; @@ -2752,15 +2758,16 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, if ((cls == NATIVE_REG_FP && next_fp < 8u) || (cls == NATIVE_REG_INT && next_int < 8u)) { AAArgMove* m = &moves[nmoves++]; - m->dst = native_loc_reg(desc->args[i].type, cls, - cls == NATIVE_REG_FP ? next_fp++ : next_int++); + m->dst = + native_loc_reg(desc->args[i].type, cls, + cls == NATIVE_REG_FP ? next_fp++ : next_int++); m->src = desc->args[i]; m->src_offset = part->src_offset; m->size = part->size; m->is_addr = 0; } else { - NativeLoc tmpreg = native_loc_reg(desc->args[i].type, cls, - cls == NATIVE_REG_FP ? 16u : AA_TMP0); + NativeLoc tmpreg = native_loc_reg( + desc->args[i].type, cls, cls == NATIVE_REG_FP ? 16u : AA_TMP0); aa_load_part(t, tmpreg, desc->args[i], part->src_offset, part->size); stack = align_up_u32(stack, aa_part_stack_align(part)); aa_store_outgoing_part(t, tail_call, stack, tmpreg, part->size); @@ -2792,11 +2799,12 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, NativeAllocClass cls = part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; KitCgTypeId pty = aa_part_scalar_type(part); - rets[nr].src = native_loc_reg(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++); + rets[nr].src = + native_loc_reg(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++); rets[nr].dst = desc->results[0]; if (rets[nr].dst.kind == NATIVE_LOC_FRAME) - rets[nr].dst = - native_loc_stack(pty, desc->results[0].v.frame, (i32)part->src_offset); + rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame, + (i32)part->src_offset); else if (rets[nr].dst.kind == NATIVE_LOC_STACK) { rets[nr].dst.v.stack.offset += (i32)part->src_offset; rets[nr].dst.type = pty; @@ -2935,7 +2943,8 @@ static void aa_plan_ret(NativeTarget* t, const CGFuncDesc* fd, rets[nr].src.v.addr.offset += (i32)part->src_offset; rets[nr].src.type = pty; } - rets[nr].dst = native_loc_reg(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++); + rets[nr].dst = + native_loc_reg(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++); rets[nr].mem = aa_mem_for_type(t, pty, part->size); nr++; } @@ -3661,9 +3670,54 @@ static const NativeAllocClassInfo aa_classes[] = { .ret_mask = 0x0000000fu}, }; +/* Resolve a register name ("x8", "v3", ...) to its (class, Reg). Powers the + * optimizer's inline-asm clobber masks and explicit hard-register operands + * ("{x8}" from a GNU local register variable). x0..x30 are DWARF 0..30; the + * SIMD/FP bank v0..v31 is DWARF 64..95. Returns non-zero for a non-register + * name (cc/memory/unknown), which the caller skips. */ +static int aa_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out, + NativeAllocClass* cls_out) { + char buf[16]; + uint32_t dwarf; + (void)ri; + if (!name.s || !name.len || name.len >= sizeof buf) return 1; + memcpy(buf, name.s, name.len); + buf[name.len] = '\0'; + if (aa64_register_index(buf, &dwarf) != 0) return 1; + if (dwarf <= 30u) { + *cls_out = NATIVE_REG_INT; + *out = (Reg)dwarf; + return 0; + } + if (dwarf >= 64u && dwarf <= 95u) { + *cls_out = NATIVE_REG_FP; + *out = (Reg)(dwarf - 64u); + return 0; + } + return 1; +} + +static int aa_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, + Reg reg) { + (void)ri; + if (cls == NATIVE_REG_INT) { + if (reg <= 8u) return 1; + if (reg >= 12u && reg <= 15u) return 1; + if (reg >= 19u && reg <= 28u) return 1; + return 0; + } + if (cls == NATIVE_REG_FP) { + if (reg <= 19u) return 1; + if (reg >= 22u && reg <= 31u) return 1; + } + return 0; +} + static const NativeRegInfo aa_reg_info = { .classes = aa_classes, .nclasses = sizeof aa_classes / sizeof aa_classes[0], + .resolve_name = aa_resolve_name, + .asm_operand_reg_ok = aa_asm_operand_reg_ok, }; static void aa_va_start_native(NativeTarget* t, NativeLoc ap_ptr); @@ -3768,7 +3822,7 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, AggregateAccess access; NativeLoc src = native_loc_reg(p->type, NATIVE_REG_INT, - a->next_param_int < 8u ? a->next_param_int++ : AA_TMP0); + a->next_param_int < 8u ? a->next_param_int++ : AA_TMP0); if (src.v.reg == AA_TMP0) { NativeAddr saddr; memset(&saddr, 0, sizeof saddr); @@ -3826,14 +3880,14 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p, /* Unused parameter: only the ABI cursor advances. */ } else if (to_reg) { NativeLoc d = native_loc_reg(dst.type ? dst.type : p->type, - (NativeAllocClass)dst.cls, (Reg)dst.v.reg); + (NativeAllocClass)dst.cls, (Reg)dst.v.reg); if (!(src.kind == NATIVE_LOC_REG && src.v.reg == d.v.reg && (NativeAllocClass)src.cls == (NativeAllocClass)d.cls)) aa_move(t, d, src); } else { - aa_store_part(t, - native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), - src, 0, part->size); + aa_store_part( + t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src, + 0, part->size); } } a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); @@ -4143,7 +4197,7 @@ static void aa_va_arg_(NativeDirectTarget* d, Operand dst_op, Operand ap_addr, AANativeTarget* a = aa_of(d->native); int is_fp = cg_type_is_float(d->base.c, type); NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT, - is_fp ? 16u : 9u); + is_fp ? 16u : 9u); MemAccess val_mem = aa_mem_for_type(d->native, type, type_size32(d->native, type)); NativeAddr dst; @@ -4302,6 +4356,17 @@ AA_UNUSED_FN static NativeAllocClass aa_asm_constraint_class( return NATIVE_REG_INT; } +static int aa_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg, + const char* constraint, + NativeAsmRegPin* pin) { + NativeAsmRegPinStatus st = + native_asm_resolve_pin(d->native, reg, constraint, pin); + if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0; + if (st != NATIVE_ASM_REG_PIN_OK) + aa_asm_panic(d, native_asm_pin_status_message(st)); + return 1; +} + AA_UNUSED_FN static void aa_direct_load_operand_to_reg(NativeDirectTarget* d, Operand op, NativeLoc dst) { @@ -4444,21 +4509,26 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, for (u32 i = 0; i < nout; ++i) { const char* body = native_asm_constraint_body(outs[i].str); - if (body[0] == 'r' || body[0] == 'w') { + NativeAsmRegPin pin; + if (aa_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) { + /* GNU local register variable: pin to the named hard register. */ + KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; + if (pin.cls == NATIVE_REG_FP) { + used_fp |= 1u << pin.reg; + clob_fp |= 1u << pin.reg; + } else { + used_int |= 1u << pin.reg; + clob_int |= 1u << pin.reg; + } + aa_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg); + } else if (body[0] == 'r' || body[0] == 'w') { NativeAllocClass cls = aa_asm_constraint_class(d, body); Reg reg = aa_asm_alloc_reg(d, cls, &used_int, &used_fp); KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; aa_asm_bound_reg(&bound_outs[i], type, cls, reg); - if (outs[i].dir == KIT_CG_ASM_INOUT) { - NativeLoc loc = native_loc_reg(type, cls, reg); - aa_direct_load_operand_to_reg(d, out_ops[i], loc); - } } else if (body[0] == 'm') { Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); - NativeLoc loc = - native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; - aa_direct_load_address_to_reg(d, out_ops[i], loc); aa_asm_bound_mem(&bound_outs[i], type, reg); } else { aa_asm_panic(d, "unsupported output constraint"); @@ -4476,31 +4546,32 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, if (bound_outs[matched].kind != AA64_INLINE_OPK_REG) aa_asm_panic(d, "matching constraint requires register output"); bound_ins[i] = bound_outs[matched]; - aa_direct_load_operand_to_reg( - d, in_ops[i], - native_loc_reg(bound_ins[i].type, - bound_ins[i].pad[0] == AA64_INLINE_OPCLS_FP - ? NATIVE_REG_FP - : NATIVE_REG_INT, - (Reg)bound_ins[i].v.local)); continue; } - if (body[0] == 'r' || body[0] == 'w') { + NativeAsmRegPin pin; + if (aa_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) { + /* GNU local register variable: pin to the named hard register. */ + KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; + if (pin.cls == NATIVE_REG_FP) { + used_fp |= 1u << pin.reg; + clob_fp |= 1u << pin.reg; + } else { + used_int |= 1u << pin.reg; + clob_int |= 1u << pin.reg; + } + aa_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg); + } else if (body[0] == 'r' || body[0] == 'w') { NativeAllocClass cls = aa_asm_constraint_class(d, body); Reg reg = aa_asm_alloc_reg(d, cls, &used_int, &used_fp); KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; aa_asm_bound_reg(&bound_ins[i], type, cls, reg); - aa_direct_load_operand_to_reg(d, in_ops[i], native_loc_reg(type, cls, reg)); } else if (body[0] == 'i') { if (in_ops[i].kind != OPK_IMM) aa_asm_panic(d, "immediate constraint requires immediate operand"); bound_ins[i] = in_ops[i]; } else if (body[0] == 'm') { Reg reg = aa_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); - NativeLoc loc = - native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); KitCgTypeId type = ins[i].type ? ins[i].type : in_ops[i].type; - aa_direct_load_address_to_reg(d, in_ops[i], loc); aa_asm_bound_mem(&bound_ins[i], type, reg); } else { aa_asm_panic(d, "unsupported input constraint"); @@ -4509,6 +4580,39 @@ static void aa_direct_asm_block(NativeDirectTarget* d, const char* tmpl, saved = aa_asm_save_callee_clobbers(aa_of(d->native), clob_int, clob_fp, &nsaved); + for (u32 i = 0; i < nout; ++i) { + if (bound_outs[i].kind == AA64_INLINE_OPK_REG) { + NativeAllocClass cls = bound_outs[i].pad[0] == AA64_INLINE_OPCLS_FP + ? NATIVE_REG_FP + : NATIVE_REG_INT; + if (outs[i].dir == KIT_CG_ASM_INOUT) { + aa_direct_load_operand_to_reg( + d, out_ops[i], + native_loc_reg(bound_outs[i].type, cls, + (Reg)bound_outs[i].v.local)); + } + } else if (bound_outs[i].kind == OPK_INDIRECT) { + NativeLoc loc = + native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, + (Reg)bound_outs[i].v.ind.base); + aa_direct_load_address_to_reg(d, out_ops[i], loc); + } + } + for (u32 i = 0; i < nin; ++i) { + if (bound_ins[i].kind == AA64_INLINE_OPK_REG) { + NativeAllocClass cls = bound_ins[i].pad[0] == AA64_INLINE_OPCLS_FP + ? NATIVE_REG_FP + : NATIVE_REG_INT; + aa_direct_load_operand_to_reg( + d, in_ops[i], + native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local)); + } else if (bound_ins[i].kind == OPK_INDIRECT) { + NativeLoc loc = + native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, + (Reg)bound_ins[i].v.ind.base); + aa_direct_load_address_to_reg(d, in_ops[i], loc); + } + } a = aa64_asm_open(d->base.c); aa64_inline_bind(a, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); diff --git a/src/arch/c_target/c_emit.c b/src/arch/c_target/c_emit.c @@ -3224,12 +3224,92 @@ static void c_emit_c_string_literal(CBuf* b, const char* s) { cbuf_putc(b, '"'); } +/* "__kit_ao<i>" / "__kit_ai<i>": a unique name for the register temporary that + * carries a hard-register-pinned output/input operand. */ +static void c_asm_reg_temp_name(char* out, size_t cap, int is_out, u32 idx) { + const char* pfx = is_out ? "__kit_ao" : "__kit_ai"; + size_t i = 0; + char tmp[16]; + size_t n = 0; + u32 v = idx; + while (*pfx && i + 1 < cap) out[i++] = *pfx++; + if (!v) tmp[n++] = '0'; + while (v) { + tmp[n++] = (char)('0' + v % 10); + v /= 10; + } + while (n && i + 1 < cap) out[i++] = tmp[--n]; + out[i] = '\0'; +} + +/* Emit an asm output operand's lvalue expression (a plain local, or a + * dereferenced address for OPK_INDIRECT). Usable as both lvalue and rvalue. */ +static void c_emit_asm_out_lvalue(CTarget* t, Operand op) { + if (op.kind == OPK_LOCAL) { + char rb[24]; + c_ensure_local(t, op.v.local, op.type); + c_local_name(op.v.local, rb, sizeof rb); + cbuf_puts(&t->body, rb); + } else { + c_emit_addr_deref(t, op, op.type); + } +} + void c_emit_asm_block(CTarget* t, const char* tmpl, const AsmConstraint* outs, u32 no, Operand* oo, const AsmConstraint* ins, u32 ni, const Operand* io, const Sym* clobs, u32 nc) { + char nm[24]; for (u32 i = 0; i < no; ++i) c_assert_no_index(t, oo[i], "asm_block out"); for (u32 i = 0; i < ni; ++i) c_assert_no_index(t, io[i], "asm_block in"); - cbuf_puts(&t->body, " __asm__ __volatile__ ("); + + /* GNU local register variables (AsmConstraint.reg): a target backend resolves + * the pin to a physical register, but the portable C backend has no register + * names to bind — so re-emit each pinned operand as a faithful + * `register T v __asm__("reg")` temporary (scoped in a block) and let the + * host compiler honor the binding. Dormant unless a frontend marks an + * operand; only the C frontend does, for register variables. */ + int any_pin = 0; + for (u32 i = 0; i < no; ++i) + if (outs[i].reg) any_pin = 1; + for (u32 i = 0; i < ni; ++i) + if (ins[i].reg) any_pin = 1; + + if (any_pin) { + cbuf_puts(&t->body, " {\n"); + for (u32 i = 0; i < ni; ++i) { + if (!ins[i].reg) continue; + c_asm_reg_temp_name(nm, sizeof nm, 0, i); + cbuf_puts(&t->body, " register "); + c_emit_type(t, &t->body, io[i].type); + cbuf_puts(&t->body, " "); + cbuf_puts(&t->body, nm); + cbuf_puts(&t->body, " __asm__("); + c_emit_c_string_literal(&t->body, pool_slice(t->c->global, ins[i].reg).s); + cbuf_puts(&t->body, ") = "); + c_emit_operand(t, io[i]); + cbuf_puts(&t->body, ";\n"); + } + for (u32 i = 0; i < no; ++i) { + if (!outs[i].reg) continue; + c_asm_reg_temp_name(nm, sizeof nm, 1, i); + cbuf_puts(&t->body, " register "); + c_emit_type(t, &t->body, oo[i].type); + cbuf_puts(&t->body, " "); + cbuf_puts(&t->body, nm); + cbuf_puts(&t->body, " __asm__("); + c_emit_c_string_literal(&t->body, + pool_slice(t->c->global, outs[i].reg).s); + cbuf_puts(&t->body, ")"); + if (outs[i].dir == KIT_CG_ASM_INOUT) { + cbuf_puts(&t->body, " = "); + c_emit_asm_out_lvalue(t, oo[i]); + } + cbuf_puts(&t->body, ";\n"); + } + } + + cbuf_puts(&t->body, any_pin ? " __asm__ __volatile__ (" + : " __asm__ __volatile__ ("); c_emit_c_string_literal(&t->body, tmpl ? tmpl : ""); /* Outputs. */ cbuf_puts(&t->body, " : "); @@ -3243,14 +3323,13 @@ void c_emit_asm_block(CTarget* t, const char* tmpl, const AsmConstraint* outs, c_emit_c_string_literal(&t->body, outs[i].str ? outs[i].str : ""); cbuf_puts(&t->body, "("); /* Outputs must be an lvalue. OPK_LOCAL is a plain C local; this - * works directly. OPK_LOCAL / OPK_INDIRECT also produce lvalues. */ - if (oo[i].kind == OPK_LOCAL) { - c_ensure_local(t, oo[i].v.local, oo[i].type); - char rb[24]; - c_local_name(oo[i].v.local, rb, sizeof rb); - cbuf_puts(&t->body, rb); + * works directly. OPK_LOCAL / OPK_INDIRECT also produce lvalues. A pinned + * output names its register temporary instead. */ + if (outs[i].reg) { + c_asm_reg_temp_name(nm, sizeof nm, 1, i); + cbuf_puts(&t->body, nm); } else { - c_emit_addr_deref(t, oo[i], oo[i].type); + c_emit_asm_out_lvalue(t, oo[i]); } cbuf_puts(&t->body, ")"); } @@ -3275,7 +3354,12 @@ void c_emit_asm_block(CTarget* t, const char* tmpl, const AsmConstraint* outs, } c_emit_c_string_literal(&t->body, cs); cbuf_puts(&t->body, "("); - c_emit_operand(t, io[i]); + if (ins[i].reg) { + c_asm_reg_temp_name(nm, sizeof nm, 0, i); + cbuf_puts(&t->body, nm); + } else { + c_emit_operand(t, io[i]); + } cbuf_puts(&t->body, ")"); } /* Clobbers. */ @@ -3285,6 +3369,19 @@ void c_emit_asm_block(CTarget* t, const char* tmpl, const AsmConstraint* outs, c_emit_c_string_literal(&t->body, pool_slice(t->c->global, clobs[i]).s); } cbuf_puts(&t->body, ");\n"); + + if (any_pin) { + for (u32 i = 0; i < no; ++i) { + if (!outs[i].reg) continue; + c_asm_reg_temp_name(nm, sizeof nm, 1, i); + cbuf_puts(&t->body, " "); + c_emit_asm_out_lvalue(t, oo[i]); + cbuf_puts(&t->body, " = "); + cbuf_puts(&t->body, nm); + cbuf_puts(&t->body, ";\n"); + } + cbuf_puts(&t->body, " }\n"); + } } /* === load_const === diff --git a/src/arch/native_target.h b/src/arch/native_target.h @@ -7,6 +7,7 @@ #include "cg/cgtarget.h" #include "cg/type.h" #include "core/core.h" +#include "core/slice.h" /* Slice, for resolve_name */ /* NativeTarget is the physical native-emission contract. It is driven after * semantic CG has been either direct-lowered by NativeDirectTarget or recorded, @@ -149,8 +150,18 @@ struct NativeRegInfo { const NativeAllocClassInfo* classes; u32 nclasses; - int (*resolve_name)(const NativeRegInfo*, Sym name, Reg* out, + /* Map a register name to its (Reg, class). `name` is the raw spelling + * ("rax", "x8", "a7"); the caller resolves any Sym to its bytes first so this + * stays pool-free. Returns 0 on success, non-zero for a non-register name. */ + int (*resolve_name)(const NativeRegInfo*, Slice name, Reg* out, NativeAllocClass* cls_out); + /* True when (cls, reg) is a valid hard-register home for an inline-asm value + * operand. This is intentionally separate from allocator availability: + * syscall idioms need ABI registers such as x8/a7, while stack/frame, zero, + * link, platform, and backend scratch registers must stay unavailable even if + * the assembler can name them. */ + int (*asm_operand_reg_ok)(const NativeRegInfo*, NativeAllocClass cls, + Reg reg); const char* (*debug_name)(const NativeRegInfo*, NativeAllocClass, Reg); u32 (*dwarf_reg)(const NativeRegInfo*, NativeAllocClass, Reg); }; diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c @@ -459,9 +459,52 @@ static const NativeAllocClassInfo rv_classes[] = { .reserved_mask = 0x0000000fu /* ft0-ft3 */}, }; +/* Resolve a register name ("a7", "fa0", ...) to its (class, Reg). Powers the + * optimizer's inline-asm clobber masks and explicit hard-register operands + * ("{a7}" from a GNU local register variable). x0..x31 are DWARF 0..31; the + * FP bank f0..f31 is DWARF 32..63. Returns non-zero for a non-register name + * (cc/memory/unknown), which the caller skips. */ +static int rv_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out, + NativeAllocClass* cls_out) { + char buf[16]; + uint32_t dwarf; + (void)ri; + if (!name.s || !name.len || name.len >= sizeof buf) return 1; + memcpy(buf, name.s, name.len); + buf[name.len] = '\0'; + if (rv64_register_index(buf, &dwarf) != 0) return 1; + if (dwarf <= 31u) { + *cls_out = NATIVE_REG_INT; + *out = (Reg)dwarf; + return 0; + } + if (dwarf >= 32u && dwarf <= 63u) { + *cls_out = NATIVE_REG_FP; + *out = (Reg)(dwarf - 32u); + return 0; + } + return 1; +} + +static int rv_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, + Reg reg) { + (void)ri; + if (cls == NATIVE_REG_INT) { + if (reg == 9u) return 1; /* s1 */ + if (reg >= 10u && reg <= 17u) return 1; /* a0..a7 */ + if (reg >= 18u && reg <= 27u) return 1; /* s2..s11 */ + if (reg == 31u) return 1; /* t6 */ + return 0; + } + if (cls == NATIVE_REG_FP) return reg >= 4u && reg <= 31u; + return 0; +} + static const NativeRegInfo rv_reg_info = { .classes = rv_classes, .nclasses = sizeof rv_classes / sizeof rv_classes[0], + .resolve_name = rv_resolve_name, + .asm_operand_reg_ok = rv_asm_operand_reg_ok, }; /* ============================ legality ============================ */ @@ -1714,8 +1757,10 @@ static const ABIArgInfo* rv_param_abi(NativeTarget* t, const ABIFuncInfo* abi, (!variadic && cg_type_is_float(t->c, desc->args[i].type)) ? ABI_CLASS_FP : ABI_CLASS_INT; ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; - ((ABIArgPart*)scratch->parts)[0].size = native_type_size(t, desc->args[i].type); - ((ABIArgPart*)scratch->parts)[0].align = native_type_align(t, desc->args[i].type); + ((ABIArgPart*)scratch->parts)[0].size = + native_type_size(t, desc->args[i].type); + ((ABIArgPart*)scratch->parts)[0].align = + native_type_align(t, desc->args[i].type); return scratch; } @@ -1981,14 +2026,14 @@ static void rv_bind_native_param(NativeTarget* t, const CGParamDesc* p, /* unused parameter; cursors already advanced */ } else if (to_reg) { NativeLoc d = native_loc_reg(dst.type ? dst.type : p->type, - (NativeAllocClass)dst.cls, (Reg)dst.v.reg); + (NativeAllocClass)dst.cls, (Reg)dst.v.reg); if (!(src.kind == NATIVE_LOC_REG && loc_reg(src) == loc_reg(d) && (NativeAllocClass)src.cls == (NativeAllocClass)d.cls)) rv_move(t, d, src); } else { - rv_store_part(t, - native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), - src, 0, part->size); + rv_store_part( + t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src, + 0, part->size); } } a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); @@ -2040,7 +2085,8 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, if (plan->callee.kind == NATIVE_LOC_REG && (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT && plan->callee.v.reg >= RV_A0 && plan->callee.v.reg <= RV_A7) { - NativeLoc scratch = native_loc_reg(plan->callee.type, NATIVE_REG_INT, RV_TMP0); + NativeLoc scratch = + native_loc_reg(plan->callee.type, NATIVE_REG_INT, RV_TMP0); rv_move(t, scratch, plan->callee); plan->callee = scratch; } @@ -2133,8 +2179,8 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc, rets[nr].src = native_loc_reg(pty, cls, rreg); rets[nr].dst = desc->results[0]; if (rets[nr].dst.kind == NATIVE_LOC_FRAME) - rets[nr].dst = - native_loc_stack(pty, desc->results[0].v.frame, (i32)part->src_offset); + rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame, + (i32)part->src_offset); else if (rets[nr].dst.kind == NATIVE_LOC_STACK) { rets[nr].dst.v.stack.offset += (i32)part->src_offset; rets[nr].dst.type = pty; @@ -2423,7 +2469,8 @@ static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, MemAccess mem, KitCgMemOrder mo) { RvNativeTarget* a = rv_of(t); MCEmitter* mc = t->mc; - u32 sf = (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u; + u32 sf = + (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u; u32 base = rv_atomic_addr_reg(a, addr); if (mo == KIT_CG_MO_SEQ_CST) rv64_emit32(mc, rv_fence_rw_rw()); if (rv_order_acquire(mo)) { @@ -2431,9 +2478,9 @@ static void rv_atomic_load(NativeTarget* t, NativeLoc dst, NativeAddr addr, rv64_emit32(mc, sf ? rv_lr_d(loc_reg(dst), base, 1, 0) : rv_lr_w(loc_reg(dst), base, 1, 0)); } else { - rv64_emit32(mc, - enc_int_load(mem.size ? mem.size : native_type_size(t, dst.type), 0, - loc_reg(dst), base, 0)); + rv64_emit32( + mc, enc_int_load(mem.size ? mem.size : native_type_size(t, dst.type), 0, + loc_reg(dst), base, 0)); } } @@ -2454,7 +2501,8 @@ static void rv_atomic_rmw(NativeTarget* t, KitCgAtomicOp op, NativeLoc dst, KitCgMemOrder mo) { RvNativeTarget* a = rv_of(t); MCEmitter* mc = t->mc; - u32 sf = (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u; + u32 sf = + (mem.size ? mem.size : native_type_size(t, dst.type)) == 8u ? 1u : 0u; u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */ u32 vreg = loc_reg(val); u32 rd = loc_reg(dst); @@ -2505,7 +2553,8 @@ static void rv_atomic_cas(NativeTarget* t, NativeLoc prior, NativeLoc ok, KitCgMemOrder success, KitCgMemOrder failure) { RvNativeTarget* a = rv_of(t); MCEmitter* mc = t->mc; - u32 sf = (mem.size ? mem.size : native_type_size(t, prior.type)) == 8u ? 1u : 0u; + u32 sf = + (mem.size ? mem.size : native_type_size(t, prior.type)) == 8u ? 1u : 0u; u32 base = rv_atomic_addr_reg(a, addr); /* RV_TMP0 */ u32 rprior = loc_reg(prior); u32 rexp = loc_reg(expected); @@ -3112,6 +3161,17 @@ static NativeAllocClass rv_asm_constraint_class(NativeDirectTarget* d, return NATIVE_REG_INT; } +static int rv_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg, + const char* constraint, + NativeAsmRegPin* pin) { + NativeAsmRegPinStatus st = + native_asm_resolve_pin(d->native, reg, constraint, pin); + if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0; + if (st != NATIVE_ASM_REG_PIN_OK) + rv_asm_panic(d, native_asm_pin_status_message(st)); + return 1; +} + /* Pick a free register from the arch's caller-saved allocable pools for an * asm operand the direct path must self-allocate. */ static Reg rv_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls, @@ -3620,13 +3680,14 @@ static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, RvNativeTarget* a = rv_of(d->native); int is_fp = cg_type_is_float(d->base.c, type); NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT, - is_fp ? RV_FTMP0 : RV_TMP0); + is_fp ? RV_FTMP0 : RV_TMP0); NativeAddr dst_addr; rv_va_arg_core(a, res, rv_direct_va_base(d, ap_addr, RV_TMP3), type); /* Store the fetched value back into the semantic destination. */ dst_addr = rv_direct_addr(d, dst); if (dst_addr.base_kind == NATIVE_ADDR_BASE_FRAME_VALUE) { - NativeLoc base = native_loc_reg(dst_addr.base_type, NATIVE_REG_INT, RV_TMP1); + NativeLoc base = + native_loc_reg(dst_addr.base_type, NATIVE_REG_INT, RV_TMP1); NativeAddr load; memset(&load, 0, sizeof load); load.base_kind = NATIVE_ADDR_BASE_FRAME; @@ -3637,8 +3698,9 @@ static void rv_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, dst_addr.base_kind = NATIVE_ADDR_BASE_REG; dst_addr.base.reg = RV_TMP1; } - rv_emit_mem(a, 0, res, dst_addr, - native_mem_for_type(d->native, type, native_type_size(d->native, type))); + rv_emit_mem( + a, 0, res, dst_addr, + native_mem_for_type(d->native, type, native_type_size(d->native, type))); } static void rv_va_end_(NativeDirectTarget* d, Operand ap_addr) { (void)d; @@ -3681,18 +3743,23 @@ static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl, for (i = 0; i < nout; ++i) { const char* body = native_asm_constraint_body(outs[i].str); KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; - if (body[0] == 'r' || body[0] == 'f') { + NativeAsmRegPin pin; + if (rv_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) { + /* GNU local register variable: pin to the named hard register. */ + if (pin.cls == NATIVE_REG_FP) { + used_fp |= 1u << pin.reg; + clob_fp |= 1u << pin.reg; + } else { + used_int |= 1u << pin.reg; + clob_int |= 1u << pin.reg; + } + rv_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg); + } else if (body[0] == 'r' || body[0] == 'f') { NativeAllocClass cls = rv_asm_constraint_class(d, body); Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp); rv_asm_bound_reg(&bound_outs[i], type, cls, reg); - if (outs[i].dir == KIT_CG_ASM_INOUT) - rv_direct_load_operand_to_reg(d, out_ops[i], - native_loc_reg(type, cls, reg)); } else if (body[0] == 'm') { Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); - NativeLoc lloc = - native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); - rv_direct_load_address_to_reg(d, out_ops[i], lloc); rv_asm_bound_mem(&bound_outs[i], type, reg); } else { rv_asm_panic(d, "unsupported output constraint"); @@ -3711,29 +3778,29 @@ static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl, if (bound_outs[matched].kind != RV64_INLINE_OPK_REG) rv_asm_panic(d, "matching constraint requires register output"); bound_ins[i] = bound_outs[matched]; - rv_direct_load_operand_to_reg( - d, in_ops[i], - native_loc_reg(bound_ins[i].type, - bound_ins[i].pad[0] == RV64_INLINE_OPCLS_FP - ? NATIVE_REG_FP - : NATIVE_REG_INT, - (Reg)bound_ins[i].v.local)); continue; } - if (body[0] == 'r' || body[0] == 'f') { + NativeAsmRegPin pin; + if (rv_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) { + /* GNU local register variable: pin to the named hard register. */ + if (pin.cls == NATIVE_REG_FP) { + used_fp |= 1u << pin.reg; + clob_fp |= 1u << pin.reg; + } else { + used_int |= 1u << pin.reg; + clob_int |= 1u << pin.reg; + } + rv_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg); + } else if (body[0] == 'r' || body[0] == 'f') { NativeAllocClass cls = rv_asm_constraint_class(d, body); Reg reg = rv_asm_alloc_reg(d, cls, &used_int, &used_fp); rv_asm_bound_reg(&bound_ins[i], type, cls, reg); - rv_direct_load_operand_to_reg(d, in_ops[i], native_loc_reg(type, cls, reg)); } else if (body[0] == 'i') { if (in_ops[i].kind != OPK_IMM) rv_asm_panic(d, "immediate constraint requires immediate operand"); bound_ins[i] = in_ops[i]; } else if (body[0] == 'm') { Reg reg = rv_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); - NativeLoc lloc = - native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); - rv_direct_load_address_to_reg(d, in_ops[i], lloc); rv_asm_bound_mem(&bound_ins[i], type, reg); } else { rv_asm_panic(d, "unsupported input constraint"); @@ -3741,6 +3808,39 @@ static void rv_direct_asm_block(NativeDirectTarget* d, const char* tmpl, } saved = rv_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); + for (i = 0; i < nout; ++i) { + if (bound_outs[i].kind == RV64_INLINE_OPK_REG) { + NativeAllocClass cls = bound_outs[i].pad[0] == RV64_INLINE_OPCLS_FP + ? NATIVE_REG_FP + : NATIVE_REG_INT; + if (outs[i].dir == KIT_CG_ASM_INOUT) { + rv_direct_load_operand_to_reg( + d, out_ops[i], + native_loc_reg(bound_outs[i].type, cls, + (Reg)bound_outs[i].v.local)); + } + } else if (bound_outs[i].kind == OPK_INDIRECT) { + NativeLoc loc = + native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, + (Reg)bound_outs[i].v.ind.base); + rv_direct_load_address_to_reg(d, out_ops[i], loc); + } + } + for (i = 0; i < nin; ++i) { + if (bound_ins[i].kind == RV64_INLINE_OPK_REG) { + NativeAllocClass cls = bound_ins[i].pad[0] == RV64_INLINE_OPCLS_FP + ? NATIVE_REG_FP + : NATIVE_REG_INT; + rv_direct_load_operand_to_reg( + d, in_ops[i], + native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local)); + } else if (bound_ins[i].kind == OPK_INDIRECT) { + NativeLoc loc = + native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, + (Reg)bound_ins[i].v.ind.base); + rv_direct_load_address_to_reg(d, in_ops[i], loc); + } + } asmh = rv64_asm_open(c); rv64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); diff --git a/src/arch/wasm/emit.c b/src/arch/wasm/emit.c @@ -1858,6 +1858,14 @@ void wasm_asm_block(CGTarget* tg, const char* tmpl, const AsmConstraint* outs, if (clob[i] != sym_memory) wfail_at(t, loc, "wasm target: asm register clobbers not yet supported"); } + for (i = 0; i < nout; ++i) { + if (outs[i].reg) + wfail_at(t, loc, "wasm target: asm hard-register operands not supported"); + } + for (i = 0; i < nin; ++i) { + if (ins[i].reg) + wfail_at(t, loc, "wasm target: asm hard-register operands not supported"); + } /* Build a scratch WasmFunc with the synthetic signature. Layout is: * params = input types (indices 0 .. nin-1) diff --git a/src/arch/x64/emit.c b/src/arch/x64/emit.c @@ -1,8 +1,7 @@ /* arch/x64/emit.c — byte-level emit helpers, function prologue/epilogue. * * Covers: REX, ModR/M, SIB, all emit_* primitives, x_func_begin, - * x_func_end, and the shared constant tables (g_int_order, g_fp_order, - * per-ABI int_args tables exposed via X64ABIRegs). */ + * x_func_end, and the per-ABI int_args tables exposed via X64ABIRegs. */ #include "arch/x64/emit.h" @@ -16,18 +15,6 @@ /* ============================================================ * Shared constant tables. */ -const Reg g_int_order[6] = { - X64_RBX, X64_R12, X64_R13, X64_R14, X64_R15, /* callee-saved (n_cs=5) */ - X64_R10, /* caller-saved tail */ -}; - -const Reg g_fp_order[10] = { - /* All xmm regs are caller-saved on SysV; preference order is xmm6 - * upward to keep the low arg/return regs (xmm0..5) clear for calls. */ - X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, X64_XMM0 + 10, - X64_XMM0 + 11, X64_XMM0 + 12, X64_XMM0 + 13, X64_XMM0 + 14, X64_XMM15, -}; - static const u32 g_int_arg_regs_sysv[6] = {X64_RDI, X64_RSI, X64_RDX, X64_RCX, X64_R8, X64_R9}; static const u32 g_int_arg_regs_win64[4] = {X64_RCX, X64_RDX, X64_R8, X64_R9}; diff --git a/src/arch/x64/emit.h b/src/arch/x64/emit.h @@ -42,9 +42,6 @@ typedef struct X64ABIRegs { const X64ABIRegs* x64_abi_for_os(KitOSKind os); -extern const Reg g_int_order[6]; -extern const Reg g_fp_order[10]; - /* Per-instruction debug line rows. Declared here (mc.h only forward-declares * Debug) so emit.c's encoders and native.c's lifecycle can both record rows * without taking a full dependency on debug/debug.h. */ diff --git a/src/arch/x64/native.c b/src/arch/x64/native.c @@ -14,11 +14,14 @@ * The single-pass (-O0) prologue reserves a NOP placeholder patched in func_end * once max_outgoing and callee-saves are known. * - * Register model. INT scratch (never allocable, never driver scratch): RAX and - * R11 — the legacy emit paths' fixed temporaries. FP scratch: XMM14 and XMM15. - * RSP/RBP are reserved (stack/frame pointers). Everything else is allocable. - * The driver scratch pool is RBX/R12 (int) and XMM12/XMM13 (fp), disjoint from - * the emit temps so a hook never clobbers an operand parked there. ABI arg/ret + * Register model. INT scratch (never allocable, never driver scratch): R10 and + * R11 — the emit paths' fixed temporaries. FP scratch: XMM14 and XMM15. RSP/RBP + * are reserved (stack/frame pointers). RAX is reserved too (return value, the + * div/mul implicit operand), but it is NOT an emit temp, so inline asm may pin + * an operand to it (the Linux syscall idiom) — see x64_asm_operand_reg_ok. + * Everything else is allocable. The driver scratch pool is RBX/R12 (int) and + * XMM12/XMM13 (fp), disjoint from the emit temps so a hook never clobbers an + * operand parked there. ABI arg/ret * registers are caller-saved-allocable; callee-saved set is resolved per-OS via * x64_abi_for_os at runtime (the legality masks below are SysV's, the conserva- * tive superset that both ABIs' allocators respect — Win64's extra callee-saves @@ -46,7 +49,7 @@ #include "obj/obj.h" enum { - X64_TMP_INT = X64_RAX, /* emit-internal int scratch (reserved) */ + X64_TMP_INT = X64_R10, /* emit-internal int scratch (reserved) */ X64_TMP_INT2 = X64_R11, /* emit-internal int scratch (reserved) */ X64_TMP_FP = X64_XMM0 + 14, /* emit-internal fp scratch (reserved) */ X64_TMP_FP2 = X64_XMM15, /* emit-internal fp scratch (reserved) */ @@ -206,14 +209,15 @@ static void emit_jcc_rel32(MCEmitter* mc, u32 cc, MCLabel l); .spill_cost = 0u, \ .copy_cost = 0u} -/* Allocable int pool, opt's spill/reload set: caller-saved callee-saves first - * so -O0's local cache prefers regs that don't grow the prologue. RAX/R11 are - * emit scratch (reserved); RBX/R12 are the driver scratch pool. */ -static const Reg x64_int_allocable[] = {X64_R13, X64_R14, X64_R15, X64_R10}; +/* Allocable int pool, opt's spill/reload set: callee-saves first so the direct + * path's local cache prefers regs that don't grow the prologue. R10/R11 are + * emit scratch (reserved); RBX/R12 are the driver scratch pool; RAX is reserved + * (return / div-mul, asm-pinnable). */ +static const Reg x64_int_allocable[] = {X64_R13, X64_R14, X64_R15}; static const Reg x64_int_scratch[] = {X64_RBX, X64_R12}; static const NativePhysRegInfo x64_int_phys[] = { - X64_PHYS_INT_RESERVED(X64_RAX), /* return / emit scratch */ + X64_PHYS_INT_RESERVED(X64_RAX), /* return / div-mul (asm-pinnable) */ X64_PHYS_INT_ARG(X64_RCX), X64_PHYS_INT_RET_ARG(X64_RDX), X64_PHYS_INT_RESERVED(X64_RBX), /* driver scratch */ @@ -223,7 +227,7 @@ static const NativePhysRegInfo x64_int_phys[] = { X64_PHYS_INT_ARG(X64_RDI), X64_PHYS_INT_ARG(X64_R8), X64_PHYS_INT_ARG(X64_R9), - X64_PHYS_INT_CALLER(X64_R10), + X64_PHYS_INT_RESERVED(X64_R10), /* emit scratch */ X64_PHYS_INT_RESERVED(X64_R11), /* emit scratch */ X64_PHYS_INT_RESERVED(X64_R12), /* driver scratch */ X64_PHYS_INT_CALLEE(X64_R13), @@ -299,9 +303,10 @@ static const NativeAllocClassInfo x64_classes[] = { .arg_mask = (1u << X64_RDI) | (1u << X64_RSI) | (1u << X64_RDX) | (1u << X64_RCX) | (1u << X64_R8) | (1u << X64_R9), .ret_mask = (1u << X64_RAX) | (1u << X64_RDX), - /* rax, rsp, rbp, r11 reserved (plus the rbx/r12 driver scratch pool) */ + /* rax, rsp, rbp reserved; r10/r11 emit scratch; rbx/r12 driver scratch */ .reserved_mask = (1u << X64_RAX) | (1u << X64_RSP) | (1u << X64_RBP) | - (1u << X64_R11) | (1u << X64_RBX) | (1u << X64_R12)}, + (1u << X64_R10) | (1u << X64_R11) | (1u << X64_RBX) | + (1u << X64_R12)}, {.cls = NATIVE_REG_FP, .allocable = x64_fp_allocable, .nallocable = sizeof x64_fp_allocable / sizeof x64_fp_allocable[0], @@ -319,9 +324,64 @@ static const NativeAllocClassInfo x64_classes[] = { (1u << (X64_XMM0 + 14)) | (1u << X64_XMM15)}, }; +/* Resolve a register name ("r10", "xmm3", ...) to its (class, Reg). Powers the + * optimizer's inline-asm clobber masks and explicit hard-register operands + * ("{r10}" from a GNU local register variable). GPR names map through the HW + * encoding; xmm names through the DWARF index table. Returns non-zero for a + * non-register name (cc/memory/unknown), which the caller skips. */ +static int x64_resolve_name(const NativeRegInfo* ri, Slice name, Reg* out, + NativeAllocClass* cls_out) { + char buf[16]; + uint32_t idx; + (void)ri; + if (!name.s || !name.len || name.len >= sizeof buf) return 1; + memcpy(buf, name.s, name.len); + buf[name.len] = '\0'; + if (x64_register_hw_index(buf, &idx) == 0 && idx <= 15u) { + *cls_out = NATIVE_REG_INT; + *out = (Reg)idx; + return 0; + } + if (x64_register_index(buf, &idx) == 0 && idx >= 17u && idx <= 32u) { + *cls_out = NATIVE_REG_FP; + *out = (Reg)(idx - 17u); + return 0; + } + return 1; +} + +static int x64_asm_operand_reg_ok(const NativeRegInfo* ri, NativeAllocClass cls, + Reg reg) { + (void)ri; + if (cls == NATIVE_REG_INT) { + switch (reg) { + /* RAX is reserved but not an emit temp, so it is a legal asm pin (the + * Linux syscall number/return register). R10/R11 are emit scratch and + * RBX/R12 the driver scratch pool, so those stay excluded. */ + case X64_RAX: + case X64_RCX: + case X64_RDX: + case X64_RSI: + case X64_RDI: + case X64_R8: + case X64_R9: + case X64_R13: + case X64_R14: + case X64_R15: + return 1; + default: + return 0; + } + } + if (cls == NATIVE_REG_FP) return reg <= X64_XMM0 + 11u; + return 0; +} + static const NativeRegInfo x64_reg_info = { .classes = x64_classes, .nclasses = sizeof x64_classes / sizeof x64_classes[0], + .resolve_name = x64_resolve_name, + .asm_operand_reg_ok = x64_asm_operand_reg_ok, }; /* ============================ legality ============================ */ @@ -1904,7 +1964,8 @@ static const ABIArgInfo* x64_param_abi(NativeTarget* t, const ABIFuncInfo* abi, ((ABIArgPart*)scratch->parts)[0].cls = cg_type_is_float(t->c, desc->args[i].type) ? ABI_CLASS_FP : ABI_CLASS_INT; ((ABIArgPart*)scratch->parts)[0].loc = ABI_LOC_REG; - ((ABIArgPart*)scratch->parts)[0].size = native_type_size(t, desc->args[i].type); + ((ABIArgPart*)scratch->parts)[0].size = + native_type_size(t, desc->args[i].type); ((ABIArgPart*)scratch->parts)[0].align = native_type_align(t, desc->args[i].type); return scratch; @@ -2187,7 +2248,7 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p, x64_defer_reg_bind( a, native_loc_reg(dst.type ? dst.type : p->type, - (NativeAllocClass)dst.cls, (Reg)dst.v.reg), + (NativeAllocClass)dst.cls, (Reg)dst.v.reg), isrc, part->size); } else { /* Frame dst: load to scratch then store (memory dst is never a cycle @@ -2196,8 +2257,8 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p, NativeLoc tloc = native_loc_reg(p->type, cls, tmp); x64_load_part(t, tloc, isrc, 0, part->size); x64_store_part( - t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), tloc, - 0, part->size); + t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), + tloc, 0, part->size); } } return; @@ -2227,22 +2288,23 @@ static void x64_bind_native_param(NativeTarget* t, const CGParamDesc* p, * incoming arg registers, so a per-param move could clobber a register * another bind still needs. x64_bind_params_end resolves them together as * a parallel copy. */ - x64_defer_reg_bind(a, - native_loc_reg(dst.type ? dst.type : p->type, - (NativeAllocClass)dst.cls, (Reg)dst.v.reg), - src, part->size); + x64_defer_reg_bind( + a, + native_loc_reg(dst.type ? dst.type : p->type, + (NativeAllocClass)dst.cls, (Reg)dst.v.reg), + src, part->size); } else if (src.kind == NATIVE_LOC_REG) { - x64_store_part(t, - native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), - src, 0, part->size); + x64_store_part( + t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), src, + 0, part->size); } else { /* Stack source -> frame dst: load to scratch, then store. */ Reg tmp = cls == NATIVE_REG_FP ? X64_TMP_FP : X64_TMP_INT; NativeLoc tloc = native_loc_reg(p->type, cls, tmp); x64_load_part(t, tloc, src, 0, part->size); - x64_store_part(t, - native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), - tloc, 0, part->size); + x64_store_part( + t, native_loc_stack(p->type, dst.v.frame, (i32)part->src_offset), + tloc, 0, part->size); } } a->incoming_stack_size = align_up_u32(a->next_param_stack, 16u); @@ -2352,7 +2414,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, if (plan->callee.kind == NATIVE_LOC_REG && (NativeAllocClass)plan->callee.cls == NATIVE_REG_INT && plan->callee.v.reg != X64_R11) { - NativeLoc scratch = native_loc_reg(plan->callee.type, NATIVE_REG_INT, X64_R11); + NativeLoc scratch = + native_loc_reg(plan->callee.type, NATIVE_REG_INT, X64_R11); x64_move(t, scratch, plan->callee); plan->callee = scratch; } @@ -2408,8 +2471,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, X64ArgMove* m = &moves[nmoves++]; u32 slot = next_fp; memset(m, 0, sizeof *m); - m->dst = - native_loc_reg(desc->args[i].type, cls, (Reg)(X64_XMM0 + next_fp++)); + m->dst = native_loc_reg(desc->args[i].type, cls, + (Reg)(X64_XMM0 + next_fp++)); m->src = desc->args[i]; m->src_offset = part->src_offset; m->size = part->size; @@ -2422,8 +2485,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, } else if (cls == NATIVE_REG_INT && next_int < aregs->n_int_args) { X64ArgMove* m = &moves[nmoves++]; memset(m, 0, sizeof *m); - m->dst = - native_loc_reg(desc->args[i].type, cls, aregs->int_args[next_int++]); + m->dst = native_loc_reg(desc->args[i].type, cls, + aregs->int_args[next_int++]); m->src = desc->args[i]; m->src_offset = part->src_offset; m->size = part->size; @@ -2451,7 +2514,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, * the address of this call's result slot. */ NativeLoc sret = native_loc_reg(i64t, NATIVE_REG_INT, aregs->int_args[0]); if (tail) - x64_load_part(t, sret, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0, 8); + x64_load_part(t, sret, native_loc_stack(i64t, a->sret_ptr_slot, 0), 0, + 8); else x64_addr_of_loc(t, sret, desc->results[0]); } @@ -2474,8 +2538,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, rets[nr].src = native_loc_reg(pty, cls, rreg); rets[nr].dst = desc->results[0]; if (rets[nr].dst.kind == NATIVE_LOC_FRAME) - rets[nr].dst = - native_loc_stack(pty, desc->results[0].v.frame, (i32)part->src_offset); + rets[nr].dst = native_loc_stack(pty, desc->results[0].v.frame, + (i32)part->src_offset); else if (rets[nr].dst.kind == NATIVE_LOC_STACK) { rets[nr].dst.v.stack.offset += (i32)part->src_offset; rets[nr].dst.type = pty; @@ -2487,7 +2551,8 @@ static void x64_plan_call(NativeTarget* t, const NativeCallDesc* desc, } else if (abi && abi->ret.kind == ABI_ARG_IGNORE) { plan->nrets = 0; } else if (!abi && desc->nresults) { - rets[0].src = native_loc_reg(desc->results[0].type, NATIVE_REG_INT, X64_RAX); + rets[0].src = + native_loc_reg(desc->results[0].type, NATIVE_REG_INT, X64_RAX); rets[0].dst = desc->results[0]; rets[0].mem = native_mem_for_type(t, desc->results[0].type, 0); plan->nrets = 1; @@ -2604,7 +2669,8 @@ static void x64_plan_ret(NativeTarget* t, const CGFuncDesc* fd, access.align = native_type_align(t, values[0].type); x64_copy_bytes(t, dst_addr, src_addr, access); /* rax = sret pointer. Reload it (copy_bytes clobbered r11/rax). */ - x64_load_part(t, native_loc_reg(i64t, NATIVE_REG_INT, X64_RAX), saved, 0, 8); + x64_load_part(t, native_loc_reg(i64t, NATIVE_REG_INT, X64_RAX), saved, 0, + 8); *out_rets = NULL; *out_nrets = 0; return; @@ -3583,12 +3649,23 @@ static NativeAllocClass x64_asm_constraint_class(NativeDirectTarget* d, return NATIVE_REG_INT; } +static int x64_asm_resolve_pin_or_panic(NativeDirectTarget* d, Sym reg, + const char* constraint, + NativeAsmRegPin* pin) { + NativeAsmRegPinStatus st = + native_asm_resolve_pin(d->native, reg, constraint, pin); + if (st == NATIVE_ASM_REG_PIN_ABSENT) return 0; + if (st != NATIVE_ASM_REG_PIN_OK) + x64_asm_panic(d, native_asm_pin_status_message(st)); + return 1; +} + /* Pick a free register from caller-saved allocable pools for an asm operand the * direct path self-allocates. */ static Reg x64_asm_alloc_reg(NativeDirectTarget* d, NativeAllocClass cls, u32* used_int, u32* used_fp) { - static const Reg int_pool[] = {X64_RDI, X64_RSI, X64_RDX, X64_RCX, - X64_R8, X64_R9, X64_R10}; + static const Reg int_pool[] = {X64_RDI, X64_RSI, X64_RDX, + X64_RCX, X64_R8, X64_R9}; static const Reg fp_pool[] = { X64_XMM0, X64_XMM1, X64_XMM2, X64_XMM3, X64_XMM4, X64_XMM5, X64_XMM6, X64_XMM7, X64_XMM8, X64_XMM0 + 9, X64_XMM0 + 10, X64_XMM0 + 11}; @@ -3731,9 +3808,9 @@ static void x64_asm_restore_one(X64NativeTarget* a, addr.base_kind = NATIVE_ADDR_BASE_FRAME; addr.base.frame = s->slot; addr.base_type = s->type; - x64_emit_mem( - a, 1, native_loc_reg(s->type, s->cls, s->reg), addr, - native_mem_for_type(&a->base, s->type, s->cls == NATIVE_REG_FP ? 16u : 8u)); + x64_emit_mem(a, 1, native_loc_reg(s->type, s->cls, s->reg), addr, + native_mem_for_type(&a->base, s->type, + s->cls == NATIVE_REG_FP ? 16u : 8u)); } /* SysV callee-saved: int rbx,r12-r15; no fp. Win64 adds rdi,rsi + xmm6-15. */ @@ -3810,12 +3887,13 @@ static Reg x64_asm_native_mem_base(X64NativeTarget* a, SrcLoc loc, Reg dst; if (addr.base_kind == NATIVE_ADDR_BASE_REG && addr.offset == 0 && addr.index_kind == NATIVE_ADDR_INDEX_NONE) { - if ((addr.base.reg & 0xfu) != X64_RAX && (addr.base.reg & 0xfu) != X64_R11) + if ((addr.base.reg & 0xfu) != X64_TMP_INT && + (addr.base.reg & 0xfu) != X64_TMP_INT2) return (Reg)(addr.base.reg & 0xfu); } if (*ntmp >= 2u) x64_asm_panic_at(a->base.c, loc, "too many memory asm operands"); - dst = (*ntmp == 0u) ? (Reg)X64_RAX : (Reg)X64_R11; + dst = (*ntmp == 0u) ? (Reg)X64_TMP_INT : (Reg)X64_TMP_INT2; (*ntmp)++; x64_addr_to_base_reg(a, addr, dst); return dst; @@ -3880,7 +3958,7 @@ static void x64_asm_block_native(NativeTarget* t, const char* tmpl, if ((body[0] == 'r') && inloc.kind != NATIVE_LOC_REG) { Reg r; if (ntmp >= 2u) x64_asm_panic_at(c, loc, "too many memory asm operands"); - r = (ntmp == 0u) ? (Reg)X64_RAX : (Reg)X64_R11; + r = (ntmp == 0u) ? (Reg)X64_TMP_INT : (Reg)X64_TMP_INT2; ntmp++; inloc = native_loc_reg(type, NATIVE_REG_INT, r); x64_emit_mem(a, 1, inloc, x64_asm_loc_to_addr(a, loc, in_locs[i]), @@ -4133,7 +4211,7 @@ static void x64_va_arg_(NativeDirectTarget* d, Operand dst, Operand ap_addr, X64NativeTarget* a = x64_of(d->native); int is_fp = cg_type_is_float(d->base.c, type); NativeLoc res = native_loc_reg(type, is_fp ? NATIVE_REG_FP : NATIVE_REG_INT, - is_fp ? X64_TMP_FP : (Reg)X64_RDX); + is_fp ? X64_TMP_FP : (Reg)X64_RDX); NativeAddr dst_addr; /* Base in R11: the core advances/loads through R11 plus one GPR scratch (the * integer result reg itself, or RAX for FP results), so R11 must not be RAX. @@ -4180,7 +4258,8 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl, native_asm_abi_clobber_masks(d->native, clobber_abi_sets, &abi_int, &abi_fp); clob_int |= abi_int; clob_fp |= abi_fp; - /* Reserve emit scratch (rax,r11), driver scratch, sp/bp, and clobbers. */ + /* Reserve emit scratch (r10,r11), driver scratch (rbx,r12), rax (reserved; + * only self-allocated here when explicitly pinned), sp/bp, and clobbers. */ used_int = clob_int | (1u << X64_RAX) | (1u << X64_R11) | (1u << X64_RSP) | (1u << X64_RBP) | (1u << X64_RBX) | (1u << X64_R12) | (1u << X64_R10); @@ -4190,18 +4269,23 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl, for (i = 0; i < nout; ++i) { const char* body = native_asm_constraint_body(outs[i].str); KitCgTypeId type = outs[i].type ? outs[i].type : out_ops[i].type; - if (body[0] == 'r' || body[0] == 'x') { + NativeAsmRegPin pin; + if (x64_asm_resolve_pin_or_panic(d, outs[i].reg, outs[i].str, &pin)) { + /* GNU local register variable: pin to the named hard register. */ + if (pin.cls == NATIVE_REG_FP) { + used_fp |= 1u << pin.reg; + clob_fp |= 1u << pin.reg; + } else { + used_int |= 1u << pin.reg; + clob_int |= 1u << pin.reg; + } + x64_asm_bound_reg(&bound_outs[i], type, pin.cls, pin.reg); + } else if (body[0] == 'r' || body[0] == 'x') { NativeAllocClass cls = x64_asm_constraint_class(d, body); Reg reg = x64_asm_alloc_reg(d, cls, &used_int, &used_fp); x64_asm_bound_reg(&bound_outs[i], type, cls, reg); - if (outs[i].dir == KIT_CG_ASM_INOUT) - x64_direct_load_operand_to_reg(d, out_ops[i], - native_loc_reg(type, cls, reg)); } else if (body[0] == 'm') { Reg reg = x64_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); - NativeLoc lloc = - native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); - x64_direct_load_address_to_reg(d, out_ops[i], lloc); x64_asm_bound_mem(&bound_outs[i], type, reg); } else { x64_asm_panic(d, "unsupported output constraint"); @@ -4220,29 +4304,29 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl, if (bound_outs[matched].kind != X64_INLINE_OPK_REG) x64_asm_panic(d, "matching constraint requires register output"); bound_ins[i] = bound_outs[matched]; - x64_direct_load_operand_to_reg( - d, in_ops[i], - native_loc_reg(bound_ins[i].type, - bound_ins[i].pad[0] == X64_INLINE_OPCLS_FP - ? NATIVE_REG_FP - : NATIVE_REG_INT, - (Reg)bound_ins[i].v.local)); continue; } - if (body[0] == 'r' || body[0] == 'x') { + NativeAsmRegPin pin; + if (x64_asm_resolve_pin_or_panic(d, ins[i].reg, ins[i].str, &pin)) { + /* GNU local register variable: pin to the named hard register. */ + if (pin.cls == NATIVE_REG_FP) { + used_fp |= 1u << pin.reg; + clob_fp |= 1u << pin.reg; + } else { + used_int |= 1u << pin.reg; + clob_int |= 1u << pin.reg; + } + x64_asm_bound_reg(&bound_ins[i], type, pin.cls, pin.reg); + } else if (body[0] == 'r' || body[0] == 'x') { NativeAllocClass cls = x64_asm_constraint_class(d, body); Reg reg = x64_asm_alloc_reg(d, cls, &used_int, &used_fp); x64_asm_bound_reg(&bound_ins[i], type, cls, reg); - x64_direct_load_operand_to_reg(d, in_ops[i], native_loc_reg(type, cls, reg)); } else if (body[0] == 'i') { if (in_ops[i].kind != OPK_IMM) x64_asm_panic(d, "immediate constraint requires immediate operand"); bound_ins[i] = in_ops[i]; } else if (body[0] == 'm') { Reg reg = x64_asm_alloc_reg(d, NATIVE_REG_INT, &used_int, &used_fp); - NativeLoc lloc = - native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, reg); - x64_direct_load_address_to_reg(d, in_ops[i], lloc); x64_asm_bound_mem(&bound_ins[i], type, reg); } else { x64_asm_panic(d, "unsupported input constraint"); @@ -4250,6 +4334,39 @@ static void x64_direct_asm_block(NativeDirectTarget* d, const char* tmpl, } saved = x64_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); + for (i = 0; i < nout; ++i) { + if (bound_outs[i].kind == X64_INLINE_OPK_REG) { + NativeAllocClass cls = bound_outs[i].pad[0] == X64_INLINE_OPCLS_FP + ? NATIVE_REG_FP + : NATIVE_REG_INT; + if (outs[i].dir == KIT_CG_ASM_INOUT) { + x64_direct_load_operand_to_reg( + d, out_ops[i], + native_loc_reg(bound_outs[i].type, cls, + (Reg)bound_outs[i].v.local)); + } + } else if (bound_outs[i].kind == OPK_INDIRECT) { + NativeLoc loc = + native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, + (Reg)bound_outs[i].v.ind.base); + x64_direct_load_address_to_reg(d, out_ops[i], loc); + } + } + for (i = 0; i < nin; ++i) { + if (bound_ins[i].kind == X64_INLINE_OPK_REG) { + NativeAllocClass cls = bound_ins[i].pad[0] == X64_INLINE_OPCLS_FP + ? NATIVE_REG_FP + : NATIVE_REG_INT; + x64_direct_load_operand_to_reg( + d, in_ops[i], + native_loc_reg(bound_ins[i].type, cls, (Reg)bound_ins[i].v.local)); + } else if (bound_ins[i].kind == OPK_INDIRECT) { + NativeLoc loc = + native_loc_reg(builtin_id(KIT_CG_BUILTIN_I64), NATIVE_REG_INT, + (Reg)bound_ins[i].v.ind.base); + x64_direct_load_address_to_reg(d, in_ops[i], loc); + } + } asmh = x64_asm_open(c); x64_inline_bind(asmh, outs, nout, bound_outs, ins, nin, bound_ins, clobbers, nclob); diff --git a/src/cg/asm.c b/src/cg/asm.c @@ -35,7 +35,9 @@ int api_asm_is_early_clobber(const char* s) { * SIMD/FP) are the per-target FP/vector register classes. The temp local's type * selects the actual NativeAllocClass downstream, and the target's asm hook * rejects a letter that does not apply to it, so listing all three here is safe - * across backends. */ + * across backends. A hard-register pin (AsmConstraint.reg, from a GNU local + * register variable) rides alongside such a register operand and does not + * change this classification — the constraint letter stays "r". */ int api_asm_is_reg_constraint(char c) { return c == 'r' || c == 'f' || c == 'x' || c == 'w'; } @@ -97,8 +99,15 @@ void kit_cg_inline_asm(KitCg* g, KitCgInlineAsm asm_block) { outs[i].str = api_sym_cstr(g, outputs[i].constraint); outs[i].name = (Sym)outputs[i].name; outs[i].type = resolve_type(g->c, outputs[i].type); + outs[i].reg = (Sym)outputs[i].reg; outs[i].dir = (u8)outputs[i].dir; if (!outs[i].type) outs[i].type = fallback_ty; + if (outs[i].reg && + !api_asm_is_reg_constraint(api_asm_constraint_body(outs[i].str)[0])) { + compiler_panic(g->c, g->cur_loc, + "KitCg: asm hard-register output requires a register " + "constraint"); + } if (outs[i].dir == KIT_CG_ASM_INOUT) { if (i >= 10) { compiler_panic(g->c, g->cur_loc, @@ -130,8 +139,15 @@ void kit_cg_inline_asm(KitCg* g, KitCgInlineAsm asm_block) { ins[i].str = api_sym_cstr(g, inputs[i].constraint); ins[i].name = (Sym)inputs[i].name; ins[i].type = resolve_type(g->c, inputs[i].type); + ins[i].reg = (Sym)inputs[i].reg; ins[i].dir = (u8)inputs[i].dir; if (!ins[i].type) ins[i].type = fallback_ty; + if (ins[i].reg && + !api_asm_is_reg_constraint(api_asm_constraint_body(ins[i].str)[0])) { + compiler_panic(g->c, g->cur_loc, + "KitCg: asm hard-register input requires a register " + "constraint"); + } } inout_index = ninputs; for (u32 i = 0; i < noutputs; ++i) { diff --git a/src/cg/cgtarget.h b/src/cg/cgtarget.h @@ -409,6 +409,10 @@ typedef struct AsmConstraint { input rvalue). Drives type width for the binder. NULL only for hand-built test constraints (binder falls back to a 64-bit int default). */ + Sym reg; /* Explicit hard-register name ("r10"/"x8"/...) this operand + must occupy — a GNU local register variable bound as an + operand; 0 = unconstrained. Only the target's register + file resolves the name to a physical register. */ u8 dir; /* KitCgAsmDir */ u8 pad[3]; } AsmConstraint; diff --git a/src/cg/native_asm.c b/src/cg/native_asm.c @@ -3,6 +3,7 @@ #include "arch/mc.h" #include "asm/asm.h" #include "asm/asm_lex.h" +#include "core/pool.h" /* pool_slice for native_asm_resolve_pin */ void native_file_scope_asm(NativeTarget* t, const char* src, size_t len) { AsmLexer* lex = asm_lex_open_mem(t->c, "<file-scope-asm>", src, len); @@ -48,3 +49,76 @@ void native_asm_abi_clobber_masks(NativeTarget* t, u32 abi_sets, u32* int_mask, *fp_mask |= classes[NATIVE_REG_FP].callee_saved_mask; } } + +int native_asm_constraint_reg_class(const char* constraint, + NativeAllocClass* cls_out) { + const char* body = native_asm_constraint_body(constraint); + if (!body || !body[0]) return 0; + if (body[0] == 'r') { + if (cls_out) *cls_out = NATIVE_REG_INT; + return 1; + } + if (body[0] == 'f' || body[0] == 'x' || body[0] == 'w') { + if (cls_out) *cls_out = NATIVE_REG_FP; + return 1; + } + return 0; +} + +static int native_asm_default_operand_reg_ok(const NativeRegInfo* ri, + NativeAllocClass cls, Reg reg) { + if (!ri || cls >= ri->nclasses) return 0; + const NativeAllocClassInfo* ci = &ri->classes[cls]; + for (u32 i = 0; i < ci->nphys; ++i) { + const NativePhysRegInfo* pi = &ci->phys[i]; + if (pi->reg != reg) continue; + return (pi->flags & NATIVE_REG_RESERVED) == 0; + } + return 0; +} + +NativeAsmRegPinStatus native_asm_resolve_pin(NativeTarget* t, Sym reg, + const char* constraint, + NativeAsmRegPin* out) { + Reg r; + NativeAllocClass cls; + NativeAllocClass want; + if (!reg) return NATIVE_ASM_REG_PIN_ABSENT; + if (!t || !t->regs || !t->regs->resolve_name) + return NATIVE_ASM_REG_PIN_UNKNOWN; + if (t->regs->resolve_name(t->regs, pool_slice(t->c->global, reg), &r, &cls) != + 0) + return NATIVE_ASM_REG_PIN_UNKNOWN; + if (t->regs->asm_operand_reg_ok) { + if (!t->regs->asm_operand_reg_ok(t->regs, cls, r)) + return NATIVE_ASM_REG_PIN_FORBIDDEN; + } else if (!native_asm_default_operand_reg_ok(t->regs, cls, r)) { + return NATIVE_ASM_REG_PIN_FORBIDDEN; + } + if (!native_asm_constraint_reg_class(constraint, &want)) + return NATIVE_ASM_REG_PIN_BAD_CONSTRAINT; + if (want != cls) return NATIVE_ASM_REG_PIN_CLASS_MISMATCH; + if (out) { + out->reg = r; + out->cls = cls; + } + return NATIVE_ASM_REG_PIN_OK; +} + +const char* native_asm_pin_status_message(NativeAsmRegPinStatus st) { + switch (st) { + case NATIVE_ASM_REG_PIN_ABSENT: + return "no hard register pin"; + case NATIVE_ASM_REG_PIN_OK: + return "hard register pin resolved"; + case NATIVE_ASM_REG_PIN_UNKNOWN: + return "unknown asm register variable name"; + case NATIVE_ASM_REG_PIN_FORBIDDEN: + return "asm register variable names an unsupported register"; + case NATIVE_ASM_REG_PIN_BAD_CONSTRAINT: + return "asm register variable requires a register constraint"; + case NATIVE_ASM_REG_PIN_CLASS_MISMATCH: + return "asm register variable class does not match its constraint"; + } + return "invalid asm register variable"; +} diff --git a/src/cg/native_asm.h b/src/cg/native_asm.h @@ -46,4 +46,29 @@ int native_asm_match_index(const char* s); void native_asm_abi_clobber_masks(NativeTarget* t, u32 abi_sets, u32* int_mask, u32* fp_mask); +typedef enum NativeAsmRegPinStatus { + NATIVE_ASM_REG_PIN_ABSENT = 0, + NATIVE_ASM_REG_PIN_OK = 1, + NATIVE_ASM_REG_PIN_UNKNOWN = -1, + NATIVE_ASM_REG_PIN_FORBIDDEN = -2, + NATIVE_ASM_REG_PIN_BAD_CONSTRAINT = -3, + NATIVE_ASM_REG_PIN_CLASS_MISMATCH = -4, +} NativeAsmRegPinStatus; + +typedef struct NativeAsmRegPin { + Reg reg; + NativeAllocClass cls; +} NativeAsmRegPin; + +/* Resolve and validate an inline-asm operand's explicit hard-register pin + * (AsmConstraint.reg, from a GNU local register variable). Distinguishes no pin + * from invalid pins, and verifies that the operand uses a register constraint + * of the matching target class. */ +NativeAsmRegPinStatus native_asm_resolve_pin(NativeTarget* t, Sym reg, + const char* constraint, + NativeAsmRegPin* out); +const char* native_asm_pin_status_message(NativeAsmRegPinStatus st); +int native_asm_constraint_reg_class(const char* constraint, + NativeAllocClass* cls_out); + #endif diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c @@ -1042,7 +1042,12 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges, if (gi.tied_hard_reg >= 0) { Reg fixed = (Reg)gi.tied_hard_reg; - if (!hard_available(f, cls, fixed)) { + /* Machinize has already validated inline-asm hard-register pins against + * the target's operand-register policy. Some legal pins are ABI registers + * outside the standard allocable set (aa64 x0, rv64 a7), so the allocator + * accepts validated physical registers here and relies on the + * conflict/clobber checks below for placement correctness. */ + if (!hard_available(f, cls, fixed) && !phys_info_for(f, cls, fixed)) { SrcLoc loc = {0, 0, 0}; compiler_panic( f->c, loc, diff --git a/src/opt/pass_machinize.c b/src/opt/pass_machinize.c @@ -1,18 +1,12 @@ #include <string.h> +#include "cg/native_asm.h" #include "cg/type.h" #include "core/pool.h" #include "core/slice.h" #include "opt/opt_internal.h" -static const char* asm_constraint_body(const char* s) { - if (!s) return ""; - if (s[0] == '=' && s[1] == '&') return s + 2; - if (s[0] == '=' || s[0] == '+' || s[0] == '&') return s + 1; - return s; -} - -static int native_resolve_reg(NativeTarget* target, Sym name, Reg* out, +static int native_resolve_reg(NativeTarget* target, Slice name, Reg* out, RegClass* cls_out) { NativeAllocClass cls; if (!target || !target->regs || !target->regs->resolve_name) return 1; @@ -21,19 +15,6 @@ static int native_resolve_reg(NativeTarget* target, Sym name, Reg* out, return 0; } -static int asm_resolve_fixed_constraint(Func* f, NativeTarget* target, - const char* constraint, Reg* reg_out, - RegClass* cls_out) { - const char* body = asm_constraint_body(constraint); - if (body[0] != '{') return 0; - const char* end = body + 1; - while (*end && *end != '}') ++end; - if (*end != '}' || end == body + 1) return 0; - Sym name = pool_intern_slice( - f->c->global, (Slice){.s = body + 1, .len = (size_t)(end - body - 1)}); - return native_resolve_reg(target, name, reg_out, cls_out) == 0; -} - static void asm_prepare_constraints(Func* f, NativeTarget* target, IRAsmAux* aux) { if (!aux) return; @@ -51,24 +32,33 @@ static void asm_prepare_constraints(Func* f, NativeTarget* target, for (u32 i = 0; i < aux->nclob; ++i) { Reg r; RegClass cls; - if (native_resolve_reg(target, aux->clobbers[i], &r, &cls) != 0) continue; + Slice nm = pool_slice(f->c->global, aux->clobbers[i]); + if (native_resolve_reg(target, nm, &r, &cls) != 0) continue; if ((u32)cls < OPT_REG_CLASSES && r < 32) aux->clobber_mask[cls] |= 1u << r; } for (u32 i = 0; i < aux->nout; ++i) { - Reg r; - RegClass cls; - if (asm_resolve_fixed_constraint(f, target, aux->outs[i].str, &r, &cls)) { - aux->out_fixed_regs[i] = (i32)r; - aux->out_fixed_cls[i] = (u8)cls; + NativeAsmRegPin pin; + NativeAsmRegPinStatus st = native_asm_resolve_pin(target, aux->outs[i].reg, + aux->outs[i].str, &pin); + if (st == NATIVE_ASM_REG_PIN_ABSENT) continue; + if (st != NATIVE_ASM_REG_PIN_OK) { + compiler_panic(f->c, (SrcLoc){0, 0, 0}, "opt asm: %s", + native_asm_pin_status_message(st)); } + aux->out_fixed_regs[i] = (i32)pin.reg; + aux->out_fixed_cls[i] = (u8)pin.cls; } for (u32 i = 0; i < aux->nin; ++i) { - Reg r; - RegClass cls; - if (asm_resolve_fixed_constraint(f, target, aux->ins[i].str, &r, &cls)) { - aux->in_fixed_regs[i] = (i32)r; - aux->in_fixed_cls[i] = (u8)cls; + NativeAsmRegPin pin; + NativeAsmRegPinStatus st = + native_asm_resolve_pin(target, aux->ins[i].reg, aux->ins[i].str, &pin); + if (st == NATIVE_ASM_REG_PIN_ABSENT) continue; + if (st != NATIVE_ASM_REG_PIN_OK) { + compiler_panic(f->c, (SrcLoc){0, 0, 0}, "opt asm: %s", + native_asm_pin_status_message(st)); } + aux->in_fixed_regs[i] = (i32)pin.reg; + aux->in_fixed_cls[i] = (u8)pin.cls; } } diff --git a/test/arch/x64_inline_test.c b/test/arch/x64_inline_test.c @@ -21,6 +21,19 @@ static void x64_bad_operand(KitCompiler* c, KitCg* cg, KitCgTypeId i64_ty) { it_inline_asm(c, cg, "movq %9, %%rax", NULL, 0, NULL, 0, NULL, 0); } +/* A GNU local register variable pinned to %rax — the Linux syscall idiom + * (syscall number in rax). rax is reserved (return / div-mul), but no longer + * an emit-internal scratch register, so an asm operand may pin it while the + * allocator still leaves it alone. Clobbers rcx/r11 as `syscall` does. */ +static void x64_rax_pin(KitCompiler* c, KitCg* cg, KitCgTypeId i64_ty) { + KitCgAsmOperand in; + const char* clob[] = {"rcx", "r11", "memory"}; + in = it_asm_op(c, "r", "n", i64_ty, KIT_CG_ASM_IN); + in.reg = kit_sym_intern(c, kit_slice_cstr("rax")); + kit_cg_push_int(cg, 60, i64_ty); /* SYS_exit */ + it_inline_asm(c, cg, "syscall", NULL, 0, &in, 1, clob, 3); +} + int main(void) { static const uint8_t nops[] = {0x90u, 0x90u}; static const uint8_t movq_rcx_rax[] = {0x48u, 0x89u, 0xc8u}; @@ -51,6 +64,20 @@ int main(void) { x64_bad_operand, "operand index"), "expected out-of-range x64 asm operand to panic"); + { + static const uint8_t syscall_bytes[] = {0x0fu, 0x05u}; + InlineText sc; + IT_EXPECT(&env, + it_emit_text(&env, KIT_ARCH_X86_64, "x64_rax_pin", x64_rax_pin, + &sc), + "failed to emit rax-pinned syscall inline asm"); + if (sc.data) + IT_EXPECT( + &env, it_contains(sc.data, sc.len, syscall_bytes, sizeof syscall_bytes), + "missing syscall encoding for rax-pinned operand"); + it_text_close(&sc); + } + if (env.fails) { fprintf(stderr, "%d failure(s)\n", env.fails); return 1; diff --git a/test/parse/cases/asm_03_register_operand.c b/test/parse/cases/asm_03_register_operand.c @@ -0,0 +1,37 @@ +/* GNU local register variables (`register T x __asm__("reg")`) used as inline- + * asm operands must occupy the named hard register. Each template below + * addresses its registers directly (not via %N operand substitution), so the + * result is correct only when a/b/r really land in the pinned registers — a + * regression guard for register-variable operand pinning through the native + * backends (-O0 direct and the optimizer's tied-hard-reg path). 40 + 2 == 42. + * + * Arch-guarded; the asm is target-specific. The wasm backend has no native + * hard-register file, so this case opts out of W via sidecar. */ + +int test_main(void) { + long out = 0; +#if defined(__aarch64__) + register long a __asm__("x12") = 40; + register long b __asm__("x13") = 2; + register long r __asm__("x14"); + __asm__ volatile("add x14, x12, x13" : "=r"(r) : "r"(a), "r"(b)); + out = r; +#elif defined(__x86_64__) + register long a __asm__("rdi") = 40; + register long b __asm__("rsi") = 2; + register long r __asm__("rdx"); + __asm__ volatile("movq %%rdi, %%rdx\n\taddq %%rsi, %%rdx" + : "=r"(r) + : "r"(a), "r"(b)); + out = r; +#elif defined(__riscv) && __riscv_xlen == 64 + register long a __asm__("a3") = 40; + register long b __asm__("a4") = 2; + register long r __asm__("a5"); + __asm__ volatile("add a5, a3, a4" : "=r"(r) : "r"(a), "r"(b)); + out = r; +#else + out = 42; +#endif + return (int)out; +} diff --git a/test/parse/cases/asm_03_register_operand.expected b/test/parse/cases/asm_03_register_operand.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases/asm_03_register_operand.wasm.skip b/test/parse/cases/asm_03_register_operand.wasm.skip @@ -0,0 +1 @@ +native hard-register operands are not meaningful for the wasm backend diff --git a/test/parse/cases/asm_04_register_callee_saved.c b/test/parse/cases/asm_04_register_callee_saved.c @@ -0,0 +1,71 @@ +/* A hard-register inline-asm operand may name a callee-saved register. The + * callee must preserve the caller's register value even though it loads the asm + * operand through that hard register internally. */ + +#if defined(__aarch64__) +__asm__( + ".text\n" + ".globl write_saved_reg\n" + "write_saved_reg:\n" + "mov x19, x0\n" + "ret\n" + ".globl read_saved_reg\n" + "read_saved_reg:\n" + "mov x0, x19\n" + "ret\n"); +extern void write_saved_reg(long); +extern long read_saved_reg(void); +#elif defined(__x86_64__) +__asm__( + ".text\n" + ".globl write_saved_reg\n" + "write_saved_reg:\n" + "movq %rdi, %r13\n" + "retq\n" + ".globl read_saved_reg\n" + "read_saved_reg:\n" + "movq %r13, %rax\n" + "retq\n"); +extern void write_saved_reg(long); +extern long read_saved_reg(void); +#elif defined(__riscv) && __riscv_xlen == 64 +__asm__( + ".text\n" + ".globl write_saved_reg\n" + "write_saved_reg:\n" + "mv s1, a0\n" + "ret\n" + ".globl read_saved_reg\n" + "read_saved_reg:\n" + "mv a0, s1\n" + "ret\n"); +extern void write_saved_reg(long); +extern long read_saved_reg(void); +#else +static long saved_fallback; +static void write_saved_reg(long v) { saved_fallback = v; } +static long read_saved_reg(void) { return saved_fallback; } +#endif + +__attribute__((noinline)) static void touch_callee_saved(void) { +#if defined(__aarch64__) + register long v __asm__("x19") = 123; + __asm__ volatile("" : "+r"(v)); +#elif defined(__x86_64__) + register long v __asm__("r13") = 123; + __asm__ volatile("" : "+r"(v)); +#elif defined(__riscv) && __riscv_xlen == 64 + register long v __asm__("s1") = 123; + __asm__ volatile("" : "+r"(v)); +#endif +} + +int test_main(void) { + long saved = read_saved_reg(); + long after; + write_saved_reg(77); + touch_callee_saved(); + after = read_saved_reg(); + write_saved_reg(saved); + return (int)after; +} diff --git a/test/parse/cases/asm_04_register_callee_saved.expected b/test/parse/cases/asm_04_register_callee_saved.expected @@ -0,0 +1 @@ +77 diff --git a/test/parse/cases/asm_04_register_callee_saved.wasm.skip b/test/parse/cases/asm_04_register_callee_saved.wasm.skip @@ -0,0 +1 @@ +native hard-register operands are not meaningful for the wasm backend diff --git a/test/parse/cases/asm_05_register_label_scope.c b/test/parse/cases/asm_05_register_label_scope.c @@ -0,0 +1,10 @@ +/* An asm label belongs to the declarator that parsed it. A label on a separate + * non-register local must not leak into a later register variable and become a + * hard-register pin. */ + +int test_main(void) { + long other __asm__("notareg") = 0; + register long v = 42; + __asm__ volatile("" : : "r"(v)); + return (int)(v + other); +} diff --git a/test/parse/cases/asm_05_register_label_scope.expected b/test/parse/cases/asm_05_register_label_scope.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases_err/asm_register_bad_constraint.c b/test/parse/cases_err/asm_register_bad_constraint.c @@ -0,0 +1,13 @@ +int test_main(void) { +#if defined(__aarch64__) + register long v __asm__("x19") = 1; +#elif defined(__x86_64__) + register long v __asm__("r13") = 1; +#elif defined(__riscv) && __riscv_xlen == 64 + register long v __asm__("s1") = 1; +#else + register long v __asm__("r13") = 1; +#endif + __asm__ volatile("" : : "m"(v)); + return 0; +} diff --git a/test/parse/cases_err/asm_register_bad_constraint.errpat b/test/parse/cases_err/asm_register_bad_constraint.errpat @@ -0,0 +1 @@ +asm hard-register input requires a register constraint diff --git a/test/parse/cases_err/asm_register_bad_name.c b/test/parse/cases_err/asm_register_bad_name.c @@ -0,0 +1,5 @@ +int test_main(void) { + register long v __asm__("notareg") = 1; + __asm__ volatile("" : : "r"(v)); + return 0; +} diff --git a/test/parse/cases_err/asm_register_bad_name.errpat b/test/parse/cases_err/asm_register_bad_name.errpat @@ -0,0 +1 @@ +unknown asm register variable name diff --git a/test/parse/cases_err/asm_register_class_mismatch.c b/test/parse/cases_err/asm_register_class_mismatch.c @@ -0,0 +1,13 @@ +int test_main(void) { +#if defined(__aarch64__) + register long v __asm__("v0") = 1; +#elif defined(__x86_64__) + register long v __asm__("xmm0") = 1; +#elif defined(__riscv) && __riscv_xlen == 64 + register long v __asm__("fa0") = 1; +#else + long v = 1; +#endif + __asm__ volatile("" : : "r"(v)); + return 0; +} diff --git a/test/parse/cases_err/asm_register_class_mismatch.errpat b/test/parse/cases_err/asm_register_class_mismatch.errpat @@ -0,0 +1 @@ +asm register variable class does not match its constraint diff --git a/test/parse/cases_err/asm_register_forbidden.c b/test/parse/cases_err/asm_register_forbidden.c @@ -0,0 +1,13 @@ +int test_main(void) { +#if defined(__aarch64__) + register long v __asm__("x9") = 1; +#elif defined(__x86_64__) + register long v __asm__("r10") = 1; +#elif defined(__riscv) && __riscv_xlen == 64 + register long v __asm__("zero") = 1; +#else + long v = 1; +#endif + __asm__ volatile("" : : "r"(v)); + return 0; +} diff --git a/test/parse/cases_err/asm_register_forbidden.errpat b/test/parse/cases_err/asm_register_forbidden.errpat @@ -0,0 +1 @@ +asm register variable names an unsupported register