commit 32f8a7a95aac37696b844bb9530d5e52650cc5a5 parent ba84296701ef636f54258c4a3b2e5850f63254e7 Author: Ryan Sepassi <rsepassi@gmail.com> Date: Mon, 4 May 2026 11:07:48 -0700 tcc 0.9.26: phase-1 aarch64 assembler Adds arm64-asm.c covering the mnemonic surface used by the in-tree .S files (tcc-cc/aarch64/start.S and tcc-libc/aarch64/{start,sys_stubs}.S): mov, add, sub, ldr, str, ldp, stp, b, bl, ret, svc. Wires the file in via three new before/after patches against tcc.h, libtcc.c, tcctok.h. Also restricts the gas-style '#'-line-comment in tcc's asm lexer to start-of-line so '#imm' tokenizes as itself mid-line — matches stock gas semantics and is what the ARM/AArch64 immediate prefix needs. With this, tcc-boot2 self-assembles its own .S inputs: drops the host cross-asm path for ARCH=aarch64 in Makefile and scripts/boot3.sh. Verified end-to-end: - boot3 aarch64 reaches the tcc2==tcc3 fixed point. - tcc-cc suite: 181/0 passed across stages 2 + 3. - tcc-libc suite: 18/0 passed across stages 2 + 3. - .text bytes from tcc-boot2-built start.o / sys_stubs.o are byte-identical to clang -target aarch64-linux-gnu output, with matching R_AARCH64_CALL26 relocations. See docs/TCC-ARM64-ASM.md for the design (operand model, encoder groupings, phase plan). Diffstat:
14 files changed, 946 insertions(+), 37 deletions(-)
diff --git a/Makefile b/Makefile @@ -396,12 +396,20 @@ HOST_CC_TARGET_aarch64 := aarch64-linux-gnu HOST_CC_TARGET_amd64 := x86_64-linux-gnu HOST_CC_TARGET = $(HOST_CC_TARGET_$(ARCH)) -# Cross-assembler for the per-arch .S harness inputs. aarch64 / amd64 -# go through host clang's `-target` (works out of the box on macOS). -# riscv64 routes through alpine-gcc:riscv64 because Apple's clang ships -# without a RISC-V backend; the routing is invisible at the call site — -# both forms consume (out, src) positionals. -ifeq ($(ARCH),riscv64) +# Cross-assembler for the per-arch .S harness inputs. +# aarch64 — tcc-boot2 itself assembles via the in-tree arm64-asm.c +# (phase 1; see docs/TCC-ARM64-ASM.md). Runs inside the +# scratch container since tcc-boot2 is a Linux/aarch64 ELF. +# amd64 — host clang's `-target x86_64-linux-gnu` (works out of the +# box on macOS). +# riscv64 — alpine-gcc:riscv64 because Apple's clang ships without a +# RISC-V backend. +# All three forms consume (out, src) positionals so the call sites are +# arch-agnostic. +ifeq ($(ARCH),aarch64) +TCC_ASM_DEPS := build/$(ARCH)/tcc-boot2/tcc-boot2 build/$(ARCH)/.image +TCC_ASM = $(call PODMAN,$(ARCH)) build/$(ARCH)/tcc-boot2/tcc-boot2 -nostdlib -c -o $(1) $(2) +else ifeq ($(ARCH),riscv64) TCC_ASM_DEPS := build/$(ARCH)/.image-alpine-gcc TCC_ASM = $(call ALPINE_GCC,$(ARCH)) cc -c -o $(1) -x assembler $(2) else @@ -491,8 +499,9 @@ build/amd64/tcc-cc/va_list.o: \ # # start.o threads __libc_init in front of main and exits with main's # return value. sys_stubs.o implements the libp1pp-shaped sys_* -# wrappers via raw syscalls; both are produced by the host -# cross-toolchain (no asm support in tcc-boot2's codegen). +# wrappers via raw syscalls. Both are assembled by TCC_ASM (arch- +# routed in the block above): tcc-boot2 for aarch64, host clang for +# amd64, alpine-gcc for riscv64. 
$(TCC_LIBC_START): tcc-libc/$(ARCH)/start.S $(TCC_ASM_DEPS) mkdir -p $(@D) $(call TCC_ASM,$@,$<) diff --git a/docs/TCC-ARM64-ASM.md b/docs/TCC-ARM64-ASM.md @@ -0,0 +1,278 @@ +# tcc arm64 assembler — design + +Working doc. Adds an `arm64-asm.c` to vendored tcc 0.9.26 so the +ARM64-target build accepts `.S` inputs and `__asm__("…")` blocks. +Lands in two phases: a narrow first cut covering exactly what the +repo's `.S` files need today, then incremental extension to ride +parity with `riscv64-asm.c` and (modulo x86 quirks) `i386-asm.c`. + +Goal of this doc: lock the **internal shape** so phase 1 is a +genuine subset of the final assembler — not a stub we throw away. + +## Why this exists + +`vendor/upstream/tcc-0.9.26.tar.gz` ships per-target asm: + +| arch | file | notes | +|------|------|-------| +| x86_64 / i386 | `i386-asm.c` (1720 LoC) | shared by both targets | +| arm | `arm-asm.c` (94 LoC) | stub: directives only, every opcode → `tcc_error` | +| riscv64 | `riscv64-asm.c` (856 LoC) | real assembler | +| arm64 | — | absent; `CONFIG_TCC_ASM` undefined for `TCC_TARGET_ARM64` | + +Today the boot2 Makefile compensates for the arm64 gap by cross-asm'ing +`tcc-cc/aarch64/start.S` and `tcc-libc/aarch64/{start,sys_stubs}.S` +through host clang `-target aarch64-linux-gnu` (Makefile:386–410). +This doc describes the assembler that lets us delete that. + +## Phase 1 — narrow scope + +Cover exactly the mnemonics the in-tree `.S` fixtures use, plus the +directive surface `tccasm.c` already handles. + +Mnemonics required by `tcc-cc/aarch64/start.S` and +`tcc-libc/aarch64/{start,sys_stubs}.S`: + +| mnemonic | forms used | encoding family | +|----------|------------|-----------------| +| `mov` | `mov xN, #imm` (incl. 
negative); `mov xN, xM` | movz/movn/movk + ORR(reg) alias | +| `add` | `add xN, sp, #imm` | add (immediate, 64-bit) | +| `ldr` | `ldr xN, [xM]`, `ldr xN, [sp]` | LDR (immediate, unsigned offset, 64-bit) | +| `bl` | `bl <symbol>` | BL → `R_AARCH64_CALL26` | +| `b` | `b .` (self-loop) | B → `R_AARCH64_JUMP26` (or in-section fixup) | +| `ret` | `ret` (uses x30) | RET (Xn=30 default) | +| `svc` | `svc #0`, `svc #imm16` | SVC | + +Registers: `x0`-`x30`, `w0`-`w30`, `sp`, `xzr`, `wzr`. (`fp`=`x29`, +`lr`=`x30`, `ip0`=`x16`, `ip1`=`x17` are aliases — defer to phase 2.) + +Directives: anything `tccasm.c` already drives — `.globl`, `.text`, +`.data`, `.byte`, `.word`, `.quad`, `.ascii`, `.asciz`, `.align`, +`.skip`, `.section`, `.previous`, labels, `.set`. Phase 1 does +**not** need to add code here; pulling in `tccasm.c` is automatic +once `arm64-asm.c` defines `CONFIG_TCC_ASM`. + +Inline `__asm__` constraint plumbing (`subst_asm_operand`, +`asm_compute_constraints`, `asm_gen_code`) follows `riscv64-asm.c`'s +"defined but no-op" pattern in phase 1: `.S` files work, full +constraint-based inline asm doesn't yet. Same posture upstream +shipped riscv64 with. + +**Phase 1 acceptance:** the existing `tcc-libc` and `tcc-cc` suites +pass on `ARCH=aarch64` with the host cross-asm path removed from +the Makefile (start/sys_stubs assembled by tcc-boot2 itself). + +## File layout + +``` +arm64-asm.c new — opcode table, parser, encoders (~600 LoC at parity) +arm64-tok.h new — DEF_ASM(...) for regs + mnemonics (~150 LoC at parity) +tcctok.h +3 lines: include arm64-tok.h under TCC_TARGET_ARM64 +tcc.h +1 line: include arm64-asm.c in the per-target block +libtcc.c +1 line: same, in the ONE_SOURCE block +``` + +Patches go in `scripts/simple-patches/tcc-0.9.26/` and apply via +`stage1-flatten.sh`'s `apply_our_patch` mechanism — same shape as +the existing arm64 patches (`arm64-stdarg-array`, +`arm64-va-pointer-operand`, `arm64-va-arg-pointer`). 
New +`arm64-asm.c` and `arm64-tok.h` ship as straight files in +`scripts/simple-patches/tcc-0.9.26/files/` and are copied into +`$SRC` by the flatten script before preprocessing. + +## Internal shape + +The narrow set is small enough to write linearly, but it's worth +ten minutes more to put the real ARM64 ISA encoding model in place +on day one so phase 2 is "add table rows," not "rewrite." + +### Operand model + +```c +enum { + OPT_REG, /* X/W register, 0..31 (sp/zr distinguished by use) */ + OPT_SHIFT_REG, /* Xn[, lsl/lsr/asr/ror #imm] */ + OPT_EXT_REG, /* Xn[, uxtw/sxtw/sxtx #imm] */ + OPT_IMM, /* unparsed signed/unsigned immediate */ + OPT_IMM12, /* add/sub immediate: 12-bit + optional lsl#12 */ + OPT_LOG_IMM, /* and/orr/eor logical immediate (N:imms:immr) */ + OPT_MOV_IMM, /* movz/movk/movn 16-bit + hw shift */ + OPT_MEM, /* [Xn], [Xn,#imm], [Xn,Xm{,ext}], pre/post indexed */ + OPT_LABEL, /* symbol+addend; resolves to PC-rel reloc */ + OPT_COND, /* eq/ne/lt/... (4-bit cond code) */ + OPT_SYS, /* sysreg encoding for mrs/msr — phase 3 */ +}; + +typedef struct Operand { + uint32_t kind; /* OPT_* */ + uint8_t is_w; /* 0=64-bit, 1=32-bit (X vs W) */ + uint8_t is_sp; /* 1 if textual form was sp (vs xzr) */ + union { + struct { uint8_t reg; uint8_t shift_kind; uint8_t shift_amt; } r; + struct { uint8_t base, idx, ext_kind, ext_amt; + int32_t disp; uint8_t mode; /* off|preidx|postidx */ } m; + ExprValue e; /* immediates and label refs */ + uint8_t cond; + }; +} Operand; +``` + +The `kind` enum is the type signature instruction encoders match +against. Phase 1 uses only `OPT_REG`, `OPT_IMM`, `OPT_MEM` +(simple base+offset variant), and `OPT_LABEL`. Adding the rest is +"new enum value + new parse path"; encoders not yet handling them +just `expect("supported operand")` until they do. + +`is_w`, `is_sp` — AArch64-specific. `Wn` and `Xn` share register +numbers; the encoder needs to know the size to set the `sf` bit. 
+`sp` and `xzr` both encode as register 31 but are not +interchangeable per-instruction; track which token the user wrote. + +### Encoder organization + +One static helper per ARM64 instruction format, mirroring +`riscv64-asm.c`'s `asm_emit_i / asm_emit_r / asm_emit_u / asm_emit_s`. +Group by encoding family in C ARM ARM (Section C4): + +| helper | covers | +|-----------------------------|-----------------------------------------| +| `emit_dp_imm_addsub` | add/sub/cmp/cmn (immediate) | +| `emit_dp_imm_log` | and/orr/eor/tst (immediate) | +| `emit_dp_imm_mov` | movz/movk/movn (incl. `mov` aliases) | +| `emit_dp_imm_bitfield` | sbfm/ubfm/bfm + sxtw/uxtb/lsl-imm aliases | +| `emit_dp_reg_addsub` | add/sub/cmp shifted-reg + extended-reg | +| `emit_dp_reg_log` | and/orr/eor/bic shifted-reg | +| `emit_dp_reg_shift` | lslv/lsrv/asrv/rorv + lsl-reg aliases | +| `emit_dp_reg_mul` | madd/msub/mul/mneg/smull/umull | +| `emit_dp_reg_csel` | csel/csinc/csinv/csneg + cset/cinc aliases | +| `emit_ldst_imm` | ldr/str (immediate, unsigned + pre/post) | +| `emit_ldst_reg` | ldr/str (register offset, with extend) | +| `emit_ldst_pair` | ldp/stp (incl. pre/post indexed) | +| `emit_ldst_pseudo_eq` | `ldr Xn, =imm64` / `=sym` — inline lowering, see below | +| `emit_branch_imm` | b/bl + label reloc | +| `emit_branch_cond` | b.cond + label reloc | +| `emit_branch_cmp` | cbz/cbnz/tbz/tbnz | +| `emit_branch_reg` | br/blr/ret | +| `emit_system` | svc/hvc/smc/brk/hint (nop/yield/wfe/wfi) | + +Phase 1 implements only `emit_dp_imm_addsub`, `emit_dp_imm_mov`, +`emit_ldst_imm`, `emit_branch_imm`, `emit_branch_reg` (just `ret`), +and `emit_system` (just `svc`). Each helper has the full ISA-format +encoding from day one — phase 1 just feeds it the narrow operand +shapes. + +### `asm_opcode` dispatch + +Same shape as `riscv64-asm.c`'s tail switch: outer switch on the +TOK groups dispatching to a per-family parser, which parses +operands, validates `kind`s, and calls the matching `emit_*` helper. 
+Adding mnemonics in phase 2 is a new `case TOK_ASM_xxx:` plus, if +needed, a new shared parser routine. + +### Label & relocation interface + +`bl <sym>` / `b <sym>` emit zero into the instruction word and call +`greloca(cur_text_section, sym, ind, R_AARCH64_CALL26 /* or +JUMP26 */, 0)` — both reloc types are already handled by +`arm64-link.c:30-46`. Local backward references (`b .`, numbered +labels) resolve through the symbol table the same way `tccasm.c` +already wires for other arches: `asm_new_label` defines, `asm_expr` +resolves, the relocation collapses at link time. + +Symbol address loads (the `ldr Xn, =sym` pseudo, and any phase-2 +movz/movk-via-`:abs_g0:`/`:abs_g1:`/etc. modifiers) emit the same +4-instruction `movz/movk` chain that `arm64-gen.c:431-440` uses for +compiler-emitted loads: `R_AARCH64_MOVW_UABS_G0_NC` + +`G1_NC` + `G2_NC` + `G3`. `adrp`+`add` is deliberately **not** +used — `arm64-gen.c:425-430` documents that the ±4GB ADR_PREL_PG +range fails on tcc's static layout and the MOVW chain is the +working idiom. The relocs are exercised by every existing +tcc-built ARM64 binary, so this is well-trodden ground. + +### Inline-asm constraint plumbing + +Phase 1 stubs: + +```c +ST_FUNC void subst_asm_operand(CString *s, SValue *sv, int mod) { + tcc_error("ARM64 inline asm operands not implemented yet"); +} +ST_FUNC void asm_compute_constraints(...) { /* no-op */ } +ST_FUNC void asm_gen_code(...) { /* no-op */ } +ST_FUNC void asm_clobber(uint8_t *cr, const char *str) { + /* parse register name, mark cr[reg]=1 — copy from riscv64-asm.c */ +} +ST_FUNC int asm_parse_regvar(int t) { /* x0..xzr, w0..wzr → 0..31 */ } +``` + +This is enough for `.S` files, top-level `__asm__("…")` strings, +and the `__asm__("name")` symbol-rename form. 
Constraint-driven +register allocation (`__asm__("…" : "=r"(out) : "r"(in))`) lights +up in phase 3 once `subst_asm_operand` + `asm_compute_constraints` +are real — straight port from `i386-asm.c`'s template logic +adapted to ARM64 register names; no surprises. + +## Phase plan + +**Phase 1 (this design)** — `mov`/`add`/`ldr`/`bl`/`b`/`ret`/`svc`, +with operand kinds restricted to `OPT_REG`/`OPT_IMM`/`OPT_MEM` +(base+disp)/`OPT_LABEL`. Acceptance: `.S` files in tcc-cc/tcc-libc +assemble through `tcc-boot2`; the Makefile drops the host cross-asm +path for ARCH=aarch64. + +**Phase 2** — broaden mnemonic coverage to riscv64 parity: the rest +of dp-imm / dp-reg / ldp-stp / cbz/cbnz / b.cond, full +shift+extend operand forms, `ldr Xn, =imm64`/`=sym` inline +lowering. Lifts `arm64_encode_bimm64` and `arm64_movimm` from +`arm64-gen.c` to shared `ST_FUNC`s for the logical-imm and +`=imm64` paths. Validates against `tests2/73_arm64.c` (already in +upstream). + +**Phase 3** — full inline-asm constraint surface +(`subst_asm_operand` + `asm_compute_constraints`). Ports the +i386-asm.c template walk; ARM64-specific bits are operand modifier +letters (`%w0` for W-form, `%x0` for X-form) and clobber semantics. + +## Validation + +Unit-level: a new `tests/tcc-asm/` suite with one `.S` (or +`__asm__()` C wrapper) fixture per mnemonic+operand-shape combo, +diffing the encoded bytes against a known-good (host clang or +upstream gas) reference. Same shape as the existing P1 suite — +fixture in, expected hex out, byte diff. + +Integration-level: drop the host cross-asm out of Makefile lines +386–410 for ARCH=aarch64 and let `tcc-boot2` build start.S / +sys_stubs.S directly. The existing `tcc-cc` and `tcc-libc` suites +then exercise the new assembler end-to-end, including the +stage-2/stage-3 fixed-point check. + +Self-host check (phase 2+): compile the patched `tcc.flat.c` itself +with `tcc-tcc` and confirm the `arm64-asm.c` it just compiled is +byte-identical to the one cc.scm produced. 
+ +## Resolved decisions + +- **No literal pool.** Phase 2 lowers `ldr Xn, =imm64` to an inline + `movz/movk` chain (call into the same `arm64_movi`/`arm64_movimm` + logic `arm64-gen.c:155-221` already uses) and `ldr Xn, =sym` to + the 4-instruction MOVW_UABS chain. tcc's own codegen never emits a + pool, so adding pool infrastructure would be net-new for one gas + pseudo nothing in-tree uses; inline lowering matches what + compiler-emitted code already does. Cost: `Xn` is clobbered with + the constant rather than loaded from `.rodata`, and `ldr =0x… ; + .word foo` won't share — neither matters for any in-tree fixture. + +- **Logical-immediate encoder: lift `arm64_encode_bimm64`.** + `arm64-gen.c:106-153` already implements the full N:imms:immr + encoder as a static function, used by gen.c itself for + `orr-immediate`-as-`movi` (line 187) and direct logical-imm + codegen (line 1395). Promote it to an `ST_FUNC` declared in the + arm64 block of `tcc.h` and call it from `arm64-asm.c`. Zero new + code, no port, no licensing question. + +- **`R_AARCH64_MOVW_UABS_G*` is the primary path.** `arm64-gen.c:429` + hardcodes `avoid_adrp = 1`, meaning every symbol address load in + every existing tcc-built ARM64 binary already goes through the + MOVW_UABS_G{0..3}_NC chain. `relocate()` (arm64-link.c:174-189) + implements all four. Use them; don't use `adrp`/`adr` at all. diff --git a/scripts/boot3.sh b/scripts/boot3.sh @@ -59,12 +59,11 @@ ## ## ─── Tools ──────────────────────────────────────────────────────────── ## In container: scratch + busybox (no libc, no /etc, no resolver). -## On host: aarch64 only — cross-assembler for {start.S, -## sys_stubs.S} via $HOST_CC -target aarch64-linux-gnu. -## tcc 0.9.26 has no aarch64 assembler (no arm64-asm.c), -## so .S inputs are pre-compiled host-side. amd64 and -## riscv64 have CONFIG_TCC_ASM in their backends and feed -## .S straight to tcc0 in stages C+D — no host tool. 
+## On host: none — every arch has CONFIG_TCC_ASM and assembles +## .S inputs (start.S, sys_stubs.S) directly inside the +## container in stages B/D/E. The aarch64 assembler is +## the phase-1 arm64-asm.c that flatten patches into +## tcc-0.9.26 (see docs/TCC-ARM64-ASM.md). ## ## ─── Outputs ────────────────────────────────────────────────────────── ## build/$ARCH/boot3/tcc0 — cc.scm-built tcc (compile 1) @@ -182,21 +181,8 @@ cp "$LIBC_FLAT" "$STAGE/in/libc.flat.c" # -I resolves stdarg.h etc. Recursive cp keeps directory layout. cp -R "$TCC_DIR/include/." "$STAGE/in/tcc-include/" -# ── HOST cross-assembly of start.o + sys_stubs.o (aarch64 only) ─────── -# tcc 0.9.26's aarch64 backend has no assembler (no arm64-asm.c), so -# .S inputs are pre-compiled host-side. amd64 and riscv64 backends ship -# CONFIG_TCC_ASM and assemble .S directly inside the container in -# stages C+D. -if [ "$ARCH" = "aarch64" ]; then - echo "[boot3 $ARCH] cross-asm: start.S + sys_stubs.S -> .o (host)" - $HOST_CC -target aarch64-linux-gnu -c \ - -o "$ROOT/$STAGE/in/start.o" -x assembler "$ROOT/$STAGE/in/start.S" - $HOST_CC -target aarch64-linux-gnu -c \ - -o "$ROOT/$STAGE/in/sys_stubs.o" -x assembler "$ROOT/$STAGE/in/sys_stubs.S" - ASM_BUILD_NEEDED=0 -else - ASM_BUILD_NEEDED=1 -fi +# Every arch's tcc-boot2 has CONFIG_TCC_ASM and assembles .S inputs +# itself inside the container — no host cross-asm step. # ── run the full Stage A + B + C + D + E pipeline in one container ─── # Stage A: cc.scm bundle, libc.P1pp + tcc.flat.P1pp via scheme1 + cc.scm, @@ -212,7 +198,6 @@ podman run --rm -i --pull=never --platform "$PLATFORM" \ -e LIB_HELPER_SRC="$LIB_HELPER_SRC" \ -e LIB_HELPER_OBJ="$LIB_HELPER_OBJ" \ -e LIB_HELPER_DEFINES="$LIB_HELPER_DEFINES" \ - -e ASM_BUILD_NEEDED="$ASM_BUILD_NEEDED" \ -v "$ROOT/$STAGE:/work" -w /work "$IMAGE" \ sh -eu -s <<'CONTAINER' IN=/work/in @@ -244,13 +229,8 @@ $IN/hex2pp -B 0x600000 /tmp/linked.hex2pp $OUT/tcc0 # host-built .o is copied through from $IN. 
build_asm() { cc=$1; workdir=$2 - if [ "$ASM_BUILD_NEEDED" = "1" ]; then - "$cc" -nostdlib -c -o "$workdir/start.o" "$IN/start.S" - "$cc" -nostdlib -c -o "$workdir/sys_stubs.o" "$IN/sys_stubs.S" - else - cp "$IN/start.o" "$workdir/start.o" - cp "$IN/sys_stubs.o" "$workdir/sys_stubs.o" - fi + "$cc" -nostdlib -c -o "$workdir/start.o" "$IN/start.S" + "$cc" -nostdlib -c -o "$workdir/sys_stubs.o" "$IN/sys_stubs.S" } build_helpers() { cc=$1; workdir=$2 diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-asm-include-libtcc-c.after b/scripts/simple-patches/tcc-0.9.26/arm64-asm-include-libtcc-c.after @@ -0,0 +1,5 @@ +#ifdef TCC_TARGET_ARM64 +#include "arm64-gen.c" +#include "arm64-link.c" +#include "arm64-asm.c" +#endif diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-asm-include-libtcc-c.before b/scripts/simple-patches/tcc-0.9.26/arm64-asm-include-libtcc-c.before @@ -0,0 +1,4 @@ +#ifdef TCC_TARGET_ARM64 +#include "arm64-gen.c" +#include "arm64-link.c" +#endif diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-asm-include-tcc-h.after b/scripts/simple-patches/tcc-0.9.26/arm64-asm-include-tcc-h.after @@ -0,0 +1,5 @@ +#ifdef TCC_TARGET_ARM64 +# include "arm64-gen.c" +# include "arm64-link.c" +# include "arm64-asm.c" +#endif diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-asm-include-tcc-h.before b/scripts/simple-patches/tcc-0.9.26/arm64-asm-include-tcc-h.before @@ -0,0 +1,4 @@ +#ifdef TCC_TARGET_ARM64 +# include "arm64-gen.c" +# include "arm64-link.c" +#endif diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-tok-include-tcctok-h.after b/scripts/simple-patches/tcc-0.9.26/arm64-tok-include-tcctok-h.after @@ -0,0 +1,7 @@ +#if defined TCC_TARGET_RISCV64 +#include "riscv64-tok.h" +#endif + +#if defined TCC_TARGET_ARM64 +#include "arm64-tok.h" +#endif diff --git a/scripts/simple-patches/tcc-0.9.26/arm64-tok-include-tcctok-h.before b/scripts/simple-patches/tcc-0.9.26/arm64-tok-include-tcctok-h.before @@ -0,0 +1,3 @@ +#if defined TCC_TARGET_RISCV64 +#include 
"riscv64-tok.h" +#endif diff --git a/scripts/simple-patches/tcc-0.9.26/asm-hash-bol-only.after b/scripts/simple-patches/tcc-0.9.26/asm-hash-bol-only.after @@ -0,0 +1,9 @@ + if ((parse_flags & PARSE_FLAG_ASM_FILE) + && (tok_flags & TOK_FLAG_BOL)) { + /* gas-style line comment — only at start of line. + Mid-line '#' is the ARM/AArch64 immediate prefix. */ + p = parse_line_comment(p - 1); + goto redo_no_start; + } else { + tok = '#'; + } diff --git a/scripts/simple-patches/tcc-0.9.26/asm-hash-bol-only.before b/scripts/simple-patches/tcc-0.9.26/asm-hash-bol-only.before @@ -0,0 +1,6 @@ + if (parse_flags & PARSE_FLAG_ASM_FILE) { + p = parse_line_comment(p - 1); + goto redo_no_start; + } else { + tok = '#'; + } diff --git a/scripts/simple-patches/tcc-0.9.26/files/arm64-asm.c b/scripts/simple-patches/tcc-0.9.26/files/arm64-asm.c @@ -0,0 +1,491 @@ +/*************************************************************/ +/* + * ARM64 (AArch64) assembler for TCC — phase 1. + * + * Covers the mnemonic surface needed by the in-tree .S inputs: + * tcc-cc/aarch64/start.S + * tcc-libc/aarch64/start.S + * tcc-libc/aarch64/sys_stubs.S + * + * Mnemonics: mov, add, sub, ldr, str, ldp, stp, b, bl, ret, svc. + * Inline-__asm__ constraint plumbing is stubbed in the same shape + * riscv64-asm.c used at its first cut. See docs/TCC-ARM64-ASM.md. 
+ */ + +#ifdef TARGET_DEFS_ONLY + +#define CONFIG_TCC_ASM +#define NB_ASM_REGS 32 + +ST_FUNC void g(int c); +ST_FUNC void gen_le16(int c); +ST_FUNC void gen_le32(int c); + +/*************************************************************/ +#else +/*************************************************************/ +#define USING_GLOBALS +#include "tcc.h" +/* Store one byte at `ind` in the current text section, growing the section first when needed; a no-op while nocode_wanted is set. */ +ST_FUNC void g(int c) +{ + int ind1; + if (nocode_wanted) + return; + ind1 = ind + 1; + if (ind1 > cur_text_section->data_allocated) + section_realloc(cur_text_section, ind1); + cur_text_section->data[ind] = c; + ind = ind1; +} +/* Emit i and then i >> 8 through g(), i.e. low byte first. */ +ST_FUNC void gen_le16(int i) +{ + g(i); + g(i >> 8); +} +/* Emit the four bytes of i in little-endian order; a no-op while nocode_wanted is set. */ +ST_FUNC void gen_le32(int i) +{ + int ind1; + if (nocode_wanted) + return; + ind1 = ind + 4; + if (ind1 > cur_text_section->data_allocated) + section_realloc(cur_text_section, ind1); + cur_text_section->data[ind++] = i & 0xff; + cur_text_section->data[ind++] = (i >> 8) & 0xff; + cur_text_section->data[ind++] = (i >> 16) & 0xff; + cur_text_section->data[ind++] = (i >> 24) & 0xff; +} +/* Emit pe->v as a 32-bit little-endian word; pe->sym is not consulted. */ +ST_FUNC void gen_expr32(ExprValue *pe) +{ + gen_le32(pe->v); +} + +/* ---- operand model ------------------------------------------------ */ + +#define OP_REG (1 << 0) +#define OP_IMM (1 << 1) +#define OP_MEM (1 << 2) + +#define IDX_OFFSET 0 /* [Xn] or [Xn, #imm] */ +#define IDX_PREIDX 1 /* [Xn, #imm]! */ +#define IDX_POSTIDX 2 /* [Xn], #imm */ + +typedef struct AArch64Op { + uint32_t kind; /* OP_REG | OP_IMM | OP_MEM */ + uint8_t reg; /* register number 0..31 (31 = sp / xzr) */ + uint8_t is_w; /* 1 = W (32-bit), 0 = X (64-bit) */ + uint8_t is_sp; /* 1 if sp/wsp; 0 if zr or general */ + uint8_t base; /* memory base register */ + uint8_t base_is_sp; /* 1 if the memory base was written as sp */ + uint8_t indexing; /* IDX_* */ + ExprValue e; /* immediate value or label expression */ +} AArch64Op; + +/* Recognise a register-name token. Returns 1 if matched. 
*/ +static int arm64_parse_reg(int t, uint8_t *preg, uint8_t *pis_w, uint8_t *pis_sp) +{ + if (t >= TOK_ASM_x0 && t <= TOK_ASM_x30) { + *preg = t - TOK_ASM_x0; *pis_w = 0; *pis_sp = 0; return 1; + } + if (t == TOK_ASM_sp) { *preg = 31; *pis_w = 0; *pis_sp = 1; return 1; } + if (t == TOK_ASM_xzr) { *preg = 31; *pis_w = 0; *pis_sp = 0; return 1; } + if (t >= TOK_ASM_w0 && t <= TOK_ASM_w30) { + *preg = t - TOK_ASM_w0; *pis_w = 1; *pis_sp = 0; return 1; + } + if (t == TOK_ASM_wsp) { *preg = 31; *pis_w = 1; *pis_sp = 1; return 1; } + if (t == TOK_ASM_wzr) { *preg = 31; *pis_w = 1; *pis_sp = 0; return 1; } + return 0; +} +/* Map a register-name token to its number (0..31), or -1 when t is not a register name. Width and the sp/zr distinction are dropped. */ +ST_FUNC int asm_parse_regvar(int t) +{ + uint8_t r, w, sp; + if (arm64_parse_reg(t, &r, &w, &sp)) + return r; + return -1; +} +/* Consume a ',' if it is the current token; otherwise fail through expect(). */ +static void asm_skip_comma(void) +{ + if (tok == ',') next(); + else expect("','"); +} +/* Consume a '#' immediate prefix if one is present; anything else is left alone. */ +static void asm_skip_hash(void) +{ + if (tok == '#') next(); +} + +/* Parse [Xn] / [Xn, #imm] / [Xn, #imm]! / [Xn], #imm. The displacement, in-bracket or post-index, is stored in op->e. */ +static void parse_mem(TCCState *s1, AArch64Op *op) +{ + uint8_t r, w, sp; + skip('['); + if (!arm64_parse_reg(tok, &r, &w, &sp) || w) + expect("64-bit base register"); + op->kind = OP_MEM; + op->base = r; + op->base_is_sp = sp; /* sp and xzr both parse as 31; keep which one was written */ + op->indexing = IDX_OFFSET; + op->e.v = 0; + op->e.sym = NULL; + op->e.pcrel = 0; + next(); + if (tok == ',') { + next(); + asm_skip_hash(); + asm_expr(s1, &op->e); + } + skip(']'); + if (tok == '!') { + next(); + op->indexing = IDX_PREIDX; + } else if (tok == ',' && op->e.v == 0 && op->e.sym == NULL) { + /* post-indexed form: [Xn], #imm. + Only recognise if no in-bracket disp was set. 
*/ + next(); + asm_skip_hash(); + asm_expr(s1, &op->e); + op->indexing = IDX_POSTIDX; + } +} +/* Parse one operand: a register name, a '[' memory reference, or otherwise an expression (optional '#' prefix) stored as OP_IMM. */ +static void parse_operand(TCCState *s1, AArch64Op *op) +{ + uint8_t r, w, sp; + op->kind = 0; + op->e.v = 0; + op->e.sym = NULL; + op->e.pcrel = 0; + + if (arm64_parse_reg(tok, &r, &w, &sp)) { + op->kind = OP_REG; + op->reg = r; + op->is_w = w; + op->is_sp = sp; + next(); + return; + } + if (tok == '[') { + parse_mem(s1, op); + return; + } + asm_skip_hash(); + asm_expr(s1, &op->e); + op->kind = OP_IMM; +} + +/* ---- encoders ----------------------------------------------------- */ + +/* MOVZ/MOVN/MOVK chain to load an X-form 64-bit immediate into Xd. */ +static void emit_movimm_x(int rd, uint64_t x) +{ + int i, z = 0, m = 0, emitted = 0; + uint64_t x1 = x; + uint32_t mov1 = 0xd2800000; /* MOVZ X */ + for (i = 0; i < 64; i += 16) { /* z counts all-zero halfwords of x, m counts all-ones halfwords */ + if (((x >> i) & 0xffff) == 0) z++; + if (((~x >> i) & 0xffff) == 0) m++; + } + if (m > z) { /* more 0xffff than 0x0000 halfwords: start from MOVN on ~x */ + x1 = ~x; + mov1 = 0x92800000; /* MOVN X */ + } + for (i = 0; i < 64; i += 16) { + if (((x1 >> i) & 0xffff) != 0) { + gen_le32(mov1 | rd | ((x1 >> i) & 0xffff) << 5 | (i << 17)); + emitted = 1; + i += 16; + break; + } + } + if (!emitted) { + /* x is all zeros (MOVZ) or all ones (MOVN selected on ~0): emit one insn. */ + gen_le32(mov1 | rd); + return; + } + for (; i < 64; i += 16) { /* remaining halfwords: the inserted bits come from x itself, not x1 */ + if (((x1 >> i) & 0xffff) != 0) + gen_le32(0xf2800000 | rd | ((x >> i) & 0xffff) << 5 | (i << 17)); + } +} + +/* MOV (register): ORR Xd, XZR, Xm (when neither is SP). */ +static void emit_mov_reg_orr(int rd, int rm, int is_w) +{ + uint32_t insn = 0xaa0003e0 | (rm << 16) | rd; + if (is_w) insn &= 0x7fffffff; + gen_le32(insn); +} + +/* MOV (to/from SP): ADD Xd, Xn, #0. */ +static void emit_mov_sp_add(int rd, int rn, int is_w) +{ + uint32_t insn = 0x91000000 | (rn << 5) | rd; + if (is_w) insn &= 0x7fffffff; + gen_le32(insn); +} + +/* ADD/SUB (immediate). is_sub = 1 emits SUB; negative imm flips polarity. 
*/ +static void emit_addsub_imm(int rd, int rn, int64_t imm, int is_w, int is_sub) +{ + uint32_t base; + if (imm < 0) { + imm = -imm; + is_sub = !is_sub; + } + base = is_sub ? 0xd1000000 : 0x91000000; /* X-form */ + if (is_w) base &= 0x7fffffff; + if (imm >= 0 && imm < 4096) { + gen_le32(base | (((uint32_t)imm) << 10) | (rn << 5) | rd); + } else if (imm >= 0 && imm < (4096 << 12) && (imm & 0xfff) == 0) { + /* 12-bit value with the "shifted left by 12" flag (bit 22) set. */ + gen_le32(base | (1u << 22) | (((uint32_t)(imm >> 12)) << 10) | (rn << 5) | rd); + } else { + tcc_error("add/sub immediate out of range"); + } +} + +/* LDR/STR (immediate, unsigned offset, X-form). imm is a byte offset: + a non-negative multiple of 8, at most 8 * 4095. */ +static void emit_ldst_imm_unsigned(int is_load, int rt, int rn, int64_t imm) +{ + uint32_t base = is_load ? 0xf9400000 : 0xf9000000; + int64_t scaled = imm / 8; + if ((imm & 7) || scaled < 0 || scaled > 4095) + tcc_error("ldr/str immediate offset out of range"); + gen_le32(base | (((uint32_t)scaled) << 10) | (rn << 5) | rt); +} + +/* LDP/STP (X-form). indexing = IDX_OFFSET / IDX_PREIDX / IDX_POSTIDX. + imm is a byte offset: a multiple of 8 in [-512, 504]. */ +static void emit_ldst_pair(int is_load, int rt1, int rt2, int rn, + int64_t imm, int indexing) +{ + uint32_t op = is_load ? 0xa8400000 : 0xa8000000; + int64_t scaled = imm / 8; + if ((imm & 7) || scaled < -64 || scaled > 63) + tcc_error("ldp/stp offset out of range"); + if (indexing == IDX_POSTIDX) op |= 0x00800000; + else if (indexing == IDX_PREIDX) op |= 0x01800000; + else op |= 0x01000000; + gen_le32(op | (((uint32_t)(scaled & 0x7f)) << 15) | + (rt2 << 10) | (rn << 5) | rt1); +} + +/* B / BL with a label or in-section offset. is_call = 1 emits BL. */ +static void emit_branch_imm(AArch64Op *op, int is_call) +{ + uint32_t base = is_call ? 0x94000000 : 0x14000000; + Sym *sym = op->e.sym; + if (sym && sym->r == cur_text_section->sh_num + && !(sym->type.t & VT_EXTERN)) { + /* In-section, defined: compute the offset directly. 
*/ + int64_t target = (int64_t)sym->jnext + (int64_t)op->e.v; + int64_t off = target - ind; + if (off & 3) tcc_error("branch target not 4-byte aligned"); + off >>= 2; + if (off < -(1 << 25) || off >= (1 << 25)) + tcc_error("branch target out of range"); + gen_le32(base | (((uint32_t)off) & 0x03ffffffu)); + } else if (sym) { + /* Anything else with a symbol: emit a zero offset and a CALL26/JUMP26 relocation. */ + int reloc = is_call ? R_AARCH64_CALL26 : R_AARCH64_JUMP26; + greloca(cur_text_section, sym, ind, reloc, op->e.v); + gen_le32(base); + } else { + /* Pure immediate offset (rare in source asm, but support `b 0`). */ + int64_t off = (int64_t)op->e.v; + if (off & 3) tcc_error("branch target not 4-byte aligned"); + off >>= 2; + if (off < -(1 << 25) || off >= (1 << 25)) + tcc_error("branch target out of range"); + gen_le32(base | (((uint32_t)off) & 0x03ffffffu)); + } +} + +/* RET Xn. */ +static void emit_ret(int rn) +{ + gen_le32(0xd65f0000u | (rn << 5)); +} + +/* SVC #imm16. */ +static void emit_svc(int64_t imm) +{ + if (imm < 0 || imm > 0xffff) + tcc_error("svc immediate out of range"); + gen_le32(0xd4000001u | (((uint32_t)imm) << 5)); +} + +/* ---- mnemonic dispatch -------------------------------------------- */ + +/* Require a 64-bit general or zero register. sp is rejected: in the + Rt/Rt2 (ldr/str/ldp/stp) and Rn (ret) slots this helper guards, + register number 31 encodes XZR, so accepting sp would silently + assemble the wrong register. */ +static void need_xreg(AArch64Op *op, const char *what) +{ + if (op->kind != OP_REG || op->is_w || op->is_sp) + tcc_error("%s: expected 64-bit register", what); +} + +/* True if a parsed register operand is the zero register (xzr/wzr). */ +static int op_is_zr(const AArch64Op *op) +{ + return op->reg == 31 && !op->is_sp; +} + +/* Reject xzr as a load/store base: base register 31 encodes SP. */ +static void need_mem_base(const AArch64Op *op, const char *what) +{ + if (op->base == 31 && !op->base_is_sp) + tcc_error("%s: zero register cannot be used as base", what); +} + +/* Assemble one instruction. `token` is the mnemonic token; operands are + read from the current token stream. Unsupported mnemonics and operand + shapes are reported as errors rather than encoded approximately. */ +ST_FUNC void asm_opcode(TCCState *s1, int token) +{ + AArch64Op a, b, c; + + switch (token) { + + case TOK_ASM_mov: + parse_operand(s1, &a); + asm_skip_comma(); + parse_operand(s1, &b); + if (a.kind != OP_REG) + tcc_error("mov: destination must be a register"); + if (b.kind == OP_REG) { + if (a.is_w != b.is_w) + tcc_error("mov: register size mismatch"); + if (a.is_sp || b.is_sp) { + /* ADD-immediate alias: 31 means SP in both slots, so the + zero register cannot take part. */ + if (op_is_zr(&a) || op_is_zr(&b)) + tcc_error("mov: cannot move between sp and the zero register"); + emit_mov_sp_add(a.reg, b.reg, a.is_w); + } else { + emit_mov_reg_orr(a.reg, b.reg, a.is_w); + } + } else if (b.kind == OP_IMM) { + if (b.e.sym) + tcc_error("mov: symbol immediate not supported (phase 1)"); + if (a.is_sp) + tcc_error("mov sp, #imm: use add"); + if (a.is_w) { + /* W-form: a single MOVZ or MOVN with sf=0. Kept narrow: + only values whose 32-bit form, or its complement, + fits the low halfword. */ + uint32_t v = (uint32_t)b.e.v; + if (v <= 0xffff) { + gen_le32(0x52800000u | a.reg | (v << 5)); + } else if ((~v) <= 0xffff) { + gen_le32(0x12800000u | a.reg | (((~v) & 0xffff) << 5)); + } else { + tcc_error("mov W#imm: only 16-bit values supported (phase 1)"); + } + } else { + emit_movimm_x(a.reg, b.e.v); + } + } else { + tcc_error("mov: unsupported source operand"); + } + return; + 
+ case TOK_ASM_add: + case TOK_ASM_sub: + parse_operand(s1, &a); + asm_skip_comma(); + parse_operand(s1, &b); + asm_skip_comma(); + parse_operand(s1, &c); + if (a.kind != OP_REG || b.kind != OP_REG) + tcc_error("add/sub: expected register operands"); + if (c.kind == OP_IMM) { + if (c.e.sym) + tcc_error("add/sub: symbol immediate not supported"); + if (a.is_w != b.is_w) + tcc_error("add/sub: register size mismatch"); + /* ADD/SUB (immediate) reads register 31 as SP in both Rd and Rn. */ + if (op_is_zr(&a) || op_is_zr(&b)) + tcc_error("add/sub: zero register not allowed with an immediate"); + emit_addsub_imm(a.reg, b.reg, (int64_t)c.e.v, a.is_w, + token == TOK_ASM_sub); + } else { + tcc_error("add/sub: only immediate form supported in phase 1"); + } + return; + + case TOK_ASM_ldr: + case TOK_ASM_str: + parse_operand(s1, &a); + asm_skip_comma(); + parse_operand(s1, &b); + need_xreg(&a, "ldr/str data register"); + if (b.kind != OP_MEM) + tcc_error("ldr/str: expected memory operand"); + need_mem_base(&b, "ldr/str"); + if (b.indexing != IDX_OFFSET) + tcc_error("ldr/str: pre/post-indexed form not supported in phase 1"); + if (b.e.sym) + tcc_error("ldr/str: symbolic offset not supported"); + emit_ldst_imm_unsigned(token == TOK_ASM_ldr, a.reg, b.base, + (int64_t)b.e.v); + return; + + case TOK_ASM_ldp: + case TOK_ASM_stp: + parse_operand(s1, &a); + asm_skip_comma(); + parse_operand(s1, &b); + asm_skip_comma(); + parse_operand(s1, &c); + need_xreg(&a, "ldp/stp first register"); + need_xreg(&b, "ldp/stp second register"); + if (c.kind != OP_MEM) + tcc_error("ldp/stp: expected memory operand"); + need_mem_base(&c, "ldp/stp"); + if (c.e.sym) + tcc_error("ldp/stp: symbolic offset not supported"); + emit_ldst_pair(token == TOK_ASM_ldp, a.reg, b.reg, c.base, + (int64_t)c.e.v, c.indexing); + return; + 
+ case TOK_ASM_b: + case TOK_ASM_bl: + parse_operand(s1, &a); + if (a.kind != OP_IMM) + tcc_error("b/bl: expected label or immediate"); + emit_branch_imm(&a, token == TOK_ASM_bl); + return; + + case TOK_ASM_ret: + /* Bare `ret` defaults to x30. */ + if (tok != ';' && tok != TOK_LINEFEED && tok != TOK_EOF) { + parse_operand(s1, &a); + need_xreg(&a, "ret"); + emit_ret(a.reg); + } else { + emit_ret(30); + } + return; + + case TOK_ASM_svc: + parse_operand(s1, &a); + if (a.kind != OP_IMM || a.e.sym) + tcc_error("svc: expected immediate"); + emit_svc((int64_t)a.e.v); + return; + + default: + expect("known instruction"); + } +} + +/* ---- inline-asm constraint plumbing — phase-1 stubs --------------- */ + +/* Phase-1 stub: operand substitution always fails with an error. */ +ST_FUNC void subst_asm_operand(CString *add_str, SValue *sv, int modifier) +{ + tcc_error("ARM64 inline asm operands not implemented yet"); +} + +/* Phase-1 stub: intentionally empty. */ +ST_FUNC void asm_gen_code(ASMOperand *operands, int nb_operands, + int nb_outputs, int is_output, + uint8_t *clobber_regs, int out_reg) +{ +} + +/* Phase-1 stub: intentionally empty. */ +ST_FUNC void asm_compute_constraints(ASMOperand *operands, + int nb_operands, int nb_outputs, + const uint8_t *clobber_regs, + int *pout_reg) +{ +} + +/* Mark the register named by str in clobber_regs[]. "memory", "cc" and + "flags" are accepted and ignored; any other non-register name is an + error. */ +ST_FUNC void asm_clobber(uint8_t *clobber_regs, const char *str) +{ + int reg; + TokenSym *ts; + + if (!strcmp(str, "memory") || + !strcmp(str, "cc") || + !strcmp(str, "flags")) + return; + ts = tok_alloc(str, strlen(str)); + reg = asm_parse_regvar(ts->tok); + if (reg == -1) + tcc_error("invalid clobber register '%s'", str); + clobber_regs[reg] = 1; +} + +/*************************************************************/ +#endif /* ndef TARGET_DEFS_ONLY */ diff --git a/scripts/simple-patches/tcc-0.9.26/files/arm64-tok.h b/scripts/simple-patches/tcc-0.9.26/files/arm64-tok.h @@ -0,0 +1,90 @@ +/* ARM64 assembler tokens. + * + * Phase 1 surface — registers and the mnemonic set required by the + * .S inputs in tcc-cc/aarch64/ and tcc-libc/aarch64/. 
Order matters
+ * for the contiguous-range checks in arm64-asm.c (TOK_ASM_x0..xzr,
+ * TOK_ASM_w0..wzr).
+ */
+
+/* X (64-bit) integer registers. Must be contiguous, x0 first. */
+ DEF_ASM(x0)
+ DEF_ASM(x1)
+ DEF_ASM(x2)
+ DEF_ASM(x3)
+ DEF_ASM(x4)
+ DEF_ASM(x5)
+ DEF_ASM(x6)
+ DEF_ASM(x7)
+ DEF_ASM(x8)
+ DEF_ASM(x9)
+ DEF_ASM(x10)
+ DEF_ASM(x11)
+ DEF_ASM(x12)
+ DEF_ASM(x13)
+ DEF_ASM(x14)
+ DEF_ASM(x15)
+ DEF_ASM(x16)
+ DEF_ASM(x17)
+ DEF_ASM(x18)
+ DEF_ASM(x19)
+ DEF_ASM(x20)
+ DEF_ASM(x21)
+ DEF_ASM(x22)
+ DEF_ASM(x23)
+ DEF_ASM(x24)
+ DEF_ASM(x25)
+ DEF_ASM(x26)
+ DEF_ASM(x27)
+ DEF_ASM(x28)
+ DEF_ASM(x29)
+ DEF_ASM(x30)
+ DEF_ASM(sp) /* X-form stack pointer / zero-reg-31 alias */
+ DEF_ASM(xzr) /* X-form zero register; last token of the X range */
+
+/* W (32-bit) integer registers. Must be contiguous, w0 first. */
+ DEF_ASM(w0)
+ DEF_ASM(w1)
+ DEF_ASM(w2)
+ DEF_ASM(w3)
+ DEF_ASM(w4)
+ DEF_ASM(w5)
+ DEF_ASM(w6)
+ DEF_ASM(w7)
+ DEF_ASM(w8)
+ DEF_ASM(w9)
+ DEF_ASM(w10)
+ DEF_ASM(w11)
+ DEF_ASM(w12)
+ DEF_ASM(w13)
+ DEF_ASM(w14)
+ DEF_ASM(w15)
+ DEF_ASM(w16)
+ DEF_ASM(w17)
+ DEF_ASM(w18)
+ DEF_ASM(w19)
+ DEF_ASM(w20)
+ DEF_ASM(w21)
+ DEF_ASM(w22)
+ DEF_ASM(w23)
+ DEF_ASM(w24)
+ DEF_ASM(w25)
+ DEF_ASM(w26)
+ DEF_ASM(w27)
+ DEF_ASM(w28)
+ DEF_ASM(w29)
+ DEF_ASM(w30)
+ DEF_ASM(wsp) /* W-form stack pointer; same position as sp in the X range */
+ DEF_ASM(wzr) /* W-form zero register; last token of the W range */
+
+/* Mnemonics — phase 1 set. */
+ DEF_ASM(mov)
+ DEF_ASM(add)
+ DEF_ASM(sub)
+ DEF_ASM(ldr)
+ DEF_ASM(str)
+ DEF_ASM(ldp)
+ DEF_ASM(stp)
+ DEF_ASM(b)
+ DEF_ASM(bl)
+ DEF_ASM(ret)
+ DEF_ASM(svc)
diff --git a/scripts/stage1-flatten.sh b/scripts/stage1-flatten.sh
@@ -174,6 +174,24 @@ apply_our_patch aarch64-stdarg-array "$SRC/include/stdarg.h"
 apply_our_patch arm64-va-pointer-operand "$SRC/arm64-gen.c"
 apply_our_patch arm64-va-arg-pointer "$SRC/arm64-gen.c"
 
+# AArch64 assembler — phase 1. Drops in arm64-asm.c + arm64-tok.h and
+# wires their includes into tcc.h, libtcc.c, and tcctok.h. Patches are
+# gated by TCC_TARGET_ARM64 in the surrounding source so they no-op on
+# other arches even when applied. See docs/TCC-ARM64-ASM.md.
+cp "$OUR_PATCHES/files/arm64-asm.c" "$SRC/arm64-asm.c"
+cp "$OUR_PATCHES/files/arm64-tok.h" "$SRC/arm64-tok.h"
+apply_our_patch arm64-asm-include-tcc-h "$SRC/tcc.h"
+apply_our_patch arm64-asm-include-libtcc-c "$SRC/libtcc.c"
+apply_our_patch arm64-tok-include-tcctok-h "$SRC/tcctok.h"
+
+# tcc's lexer in ASM_FILE mode swallows mid-line '#' as a line comment,
+# which kills the ARM/AArch64 '#imm' immediate prefix. Restrict the
+# '#'-as-line-comment behavior to start-of-line so '#' tokenizes as
+# itself in operand position. AArch64 gas treats '#' as a comment only
+# at BOL, so this matches it there. NOTE(review): x86 gas allows '#'
+# comments mid-line; confirm no other arch's .S input relies on that.
+apply_our_patch asm-hash-bol-only "$SRC/tccpp.c"
+
 # riscv64 stdarg.h order fix — the upstream `#elif __riscv` branch
 # uses `__builtin_va_list` before it's typedef'd. Stock tcc treats
 # `__builtin_va_list` as a built-in keyword and forgives the forward