boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs

commit 92f3cc91a2b73dc490ae6048309beb6da7382c96
parent 90d6018d786c42cf3a053508647144dceca2167e
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 23 Apr 2026 11:55:22 -0700

move files around

Diffstat:
D README.md | 53 -----------------------------------------------------
R src/lisp.M1 -> lisp/lisp.M1 | 0
R src/prelude.scm -> lisp/prelude.scm | 0
R src/m1pp.M1 -> m1pp/m1pp.M1 | 0
R src/m1pp.c -> m1pp/m1pp.c | 0
D src/elf/ELF-aarch64.hex2 | 63 ---------------------------------------------------------------
D src/elf/ELF-amd64.hex2 | 51 ---------------------------------------------------
D src/elf/ELF-riscv64.hex2 | 51 ---------------------------------------------------
D src/p1_gen.py | 1250 -------------------------------------------------------------------------------
9 files changed, 0 insertions(+), 1468 deletions(-)

diff --git a/README.md b/README.md @@ -1,53 +0,0 @@ -# lispcc - -Experimental alternative bootstrap path: a small Lisp written directly in -M1 assembly + a C compiler written in that Lisp, replacing the -`M2-Planet → mes → MesCC → nyacc` stack between M1 asm and tcc-boot. -Goal is a 4–6× shrink in auditable LOC. See [docs/PLAN.md](docs/PLAN.md). - -## Status - -Stage 0: hello-world in the P1 portable pseudo-ISA (see [docs/P1.md](docs/P1.md)), -assembled and run inside a pristine alpine container on all three target -arches (aarch64, amd64, riscv64). The same `tests/hello.M1` source assembles -for every arch; only the backing `build/<arch>/p1_<arch>.M1` defs file -varies. Toolchain (M1, hex2) builds statically from the upstream mescc-tools -C source. - -## Layout - -``` -docs/ design docs (PLAN, SEED, P1, LISP) -src/ real programs (lisp.M1) + p1_gen.py -tests/ smoke programs (hello.M1, demo.M1) + fixtures - lisp/ lisp test fixtures (*.scm + *.expected) -bootstrap.sh hex0-seed → M0/hex2-0/catm toolchain build -populate-upstream.sh host-side copy of upstream seeds + sources into build/upstream/ -lint.sh M1 undefined-token guard -Makefile podman-driven build, ARCH-parameterized -build/ all derived artifacts (gitignored) - upstream/ mirror of the files bootstrap.sh consumes from live-bootstrap - <arch>/ per-arch outputs - tools/ bootstrapped M0, hex2-0, catm (+ throwaway hex0/hex1) - p1_<arch>.M1 generated P1 defs - <prog> final ELF binary -``` - -## Build & run - -Requires podman. Uses Alpine as the host. Non-native arches run via podman's -binfmt + qemu-user path (works transparently on a default `podman machine` -setup). - -``` -make run-all # build + run on all three arches -make clean # wipe build/ -``` - -## Source layout assumption - -`populate-upstream.sh` runs on the host and mirrors the files bootstrap.sh -needs from `$UPSTREAM/seed/stage0-posix/` into `build/upstream/`; the -default is `../live-bootstrap`. 
Override by invoking `make UPSTREAM=/path -populate-upstream`. Podman itself only ever mounts curdir, so everything -the container sees must live inside the repo. diff --git a/src/lisp.M1 b/lisp/lisp.M1 diff --git a/src/prelude.scm b/lisp/prelude.scm diff --git a/src/m1pp.M1 b/m1pp/m1pp.M1 diff --git a/src/m1pp.c b/m1pp/m1pp.c diff --git a/src/elf/ELF-aarch64.hex2 b/src/elf/ELF-aarch64.hex2 @@ -1,63 +0,0 @@ -## ELF-aarch64.hex2 — BSS-enabled AArch64 ELF header for lispcc program link. -## -## Derived from stage0-posix's ELF-aarch64.hex2 (same single-LOAD-segment -## layout) with one change: p_memsz references :ELF_bss_end while p_filesz -## still references :ELF_end. The on-disk file ends at :ELF_end (after a -## post-link truncate drops the trailing zero bytes hex2 emits); the -## kernel's loader zero-fills the gap up to :ELF_bss_end at runtime. -## -## The stock stage0-posix ELF-*.hex2 files under build/upstream/ stay -## untouched (no :ELF_bss_end label) because bootstrap.sh uses them to -## link the M0 binary itself — a program that has no BSS region to speak -## of and doesn't define :ELF_bss_end. -## -## If you use this header, your program must define BOTH :ELF_end -## (end of real bytes) and :ELF_bss_end (end of zero-fill region). 
- -## ELF Header - -:ELF_base -7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number - -02 # e_ident[EI_CLASS] Indicating 64 bit -01 # e_ident[EI_DATA] Indicating little endianness -01 # e_ident[EI_VERSION] Indicating original elf - -03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict -00 # e_ident[EI_ABIVERSION] Set at 0 because noone cares - -00 00 00 00 00 00 00 # e_ident[EI_PAD] -02 00 # e_type Indicating Executable -B7 00 # e_machine Indicating AArch64 -01 00 00 00 # e_version Indicating original elf - -&_start 00 00 00 00 # e_entry Address of the entry point -%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table -00 00 00 00 00 00 00 00 # e_shoff Address of section header table - -00 00 00 00 # e_flags -40 00 # e_ehsize Indicating our 64 Byte header - -38 00 # e_phentsize size of a program header table -01 00 # e_phnum number of entries in program table - -00 00 # e_shentsize size of a section header table -00 00 # e_shnum number of entries in section table - -00 00 # e_shstrndx index of the section names - -## Program Header -:ELF_program_headers -01 00 00 00 # ph_type: PT-LOAD = 1 -07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7 -00 00 00 00 00 00 00 00 # ph_offset - -&ELF_base 00 00 00 00 # ph_vaddr -&ELF_base 00 00 00 00 # ph_physaddr - -%ELF_end>ELF_base 00 00 00 00 # ph_filesz (end of on-disk image) -%ELF_bss_end>ELF_base 00 00 00 00 # ph_memsz (kernel zero-fills past ph_filesz) - -01 00 00 00 00 00 00 00 # ph_align - -:ELF_text diff --git a/src/elf/ELF-amd64.hex2 b/src/elf/ELF-amd64.hex2 @@ -1,51 +0,0 @@ -## ELF-amd64.hex2 — BSS-enabled AMD64 ELF header for lispcc program link. -## See src/elf/ELF-aarch64.hex2 for why this lives here rather than -## under build/upstream/ (bootstrap.sh needs the stock version to link M0). 
- -## ELF Header - -:ELF_base -7F 45 4C 46 ## e_ident[EI_MAG0-3] ELF's magic number - -02 ## e_ident[EI_CLASS] Indicating 64 bit -01 ## e_ident[EI_DATA] Indicating little endianness -01 ## e_ident[EI_VERSION] Indicating original elf - -03 ## e_ident[EI_OSABI] Set at 3 because FreeBSD is strict -00 ## e_ident[EI_ABIVERSION] Set at 0 because none cares - -00 00 00 00 00 00 00 ## e_ident[EI_PAD] -02 00 ## e_type Indicating Executable -3E 00 ## e_machine Indicating AMD64 -01 00 00 00 ## e_version Indicating original elf - -&_start 00 00 00 00 ## e_entry Address of the entry point -%ELF_program_headers>ELF_base 00 00 00 00 ## e_phoff Address of program header table -00 00 00 00 00 00 00 00 ## e_shoff Address of section header table - -00 00 00 00 ## e_flags -40 00 ## e_ehsize Indicating our 64 Byte header - -38 00 ## e_phentsize size of a program header table -01 00 ## e_phnum number of entries in program table - -00 00 ## e_shentsize size of a section header table -00 00 ## e_shnum number of entries in section table - -00 00 ## e_shstrndx index of the section names - -## Program Header -:ELF_program_headers -01 00 00 00 ## p_type -07 00 00 00 ## ph_flags: PF-X|PF-W|PF-R = 7 -00 00 00 00 00 00 00 00 ## p_offset - -&ELF_base 00 00 00 00 ## p_vaddr -&ELF_base 00 00 00 00 ## p_physaddr - -%ELF_end>ELF_base 00 00 00 00 ## p_filesz (end of on-disk image) -%ELF_bss_end>ELF_base 00 00 00 00 ## p_memsz (kernel zero-fills past p_filesz) - -01 00 00 00 00 00 00 00 ## Required alignment - -:ELF_text diff --git a/src/elf/ELF-riscv64.hex2 b/src/elf/ELF-riscv64.hex2 @@ -1,51 +0,0 @@ -## ELF-riscv64.hex2 — BSS-enabled RISC-V 64 ELF header for lispcc program link. -## See src/elf/ELF-aarch64.hex2 for why this lives here rather than -## under build/upstream/ (bootstrap.sh needs the stock version to link M0). 
- -## ELF Header - -:ELF_base -7F 45 4C 46 ## e_ident[EI_MAG0-3] ELF's magic number - -02 ## e_ident[EI_CLASS] Indicating 64 bit -01 ## e_ident[EI_DATA] Indicating little endianness -01 ## e_ident[EI_VERSION] Indicating original elf - -03 ## e_ident[EI_OSABI] Set at 3 because FreeBSD is strict -00 ## e_ident[EI_ABIVERSION] Set at 0 because none cares - -00 00 00 00 00 00 00 ## e_ident[EI_PAD] -02 00 ## e_type Indicating Executable -F3 00 ## e_machine Indicating RISC-V -01 00 00 00 ## e_version Indicating original elf - -&_start 00 00 00 00 ## e_entry Address of the entry point -%ELF_program_headers>ELF_base 00 00 00 00 ## e_phoff Address of program header table -00 00 00 00 00 00 00 00 ## e_shoff Address of section header table - -00 00 00 00 ## e_flags -40 00 ## e_ehsize Indicating our 64 Byte header - -38 00 ## e_phentsize size of a program header table -01 00 ## e_phnum number of entries in program table - -00 00 ## e_shentsize size of a section header table -00 00 ## e_shnum number of entries in section table - -00 00 ## e_shstrndx index of the section names - -## Program Header -:ELF_program_headers -01 00 00 00 ## p_type -07 00 00 00 ## ph_flags: PF-X|PF-W|PF-R = 7 -00 00 00 00 00 00 00 00 ## p_offset - -&ELF_base 00 00 00 00 ## p_vaddr -&ELF_base 00 00 00 00 ## p_physaddr - -%ELF_end>ELF_base 00 00 00 00 ## p_filesz (end of on-disk image) -%ELF_bss_end>ELF_base 00 00 00 00 ## p_memsz (kernel zero-fills past p_filesz) - -01 00 00 00 00 00 00 00 ## Required alignment - -:ELF_text diff --git a/src/p1_gen.py b/src/p1_gen.py @@ -1,1250 +0,0 @@ -#!/usr/bin/env python3 -"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table. - -Single source of truth for the P1 DEFINE tables across all three target -arches. Running this script writes <build>/aarch64/p1_aarch64.M1 and the -amd64/riscv64 siblings (default <build> = "build"). - -Structure: - * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of - helpers per arch. 
- * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one - method per P1 op category, lowering (op, reg-tuple, imm) into - native hex. Each arch's encoder is a coherent bundle — adding a - new op means one new method on each of the three. - * Op dataclasses — thin rows holding the DEFINE's name + data. - Op.encode(enc) dispatches into enc.<op-method>() with the Op's - fields unpacked. No per-arch branching lives in Op classes. - * rows() — builds the output list. Non-RRR ops are emitted as the - full register product × a curated imm/offset/shamt set. RRR - keeps an explicit table (the full 8³ cube is 5.6k entries per - arch, >99% dead weight). Adding a new RRR triple or a new imm - value is a one-line edit to rows(); a new register combination - for any other op needs no edit at all. - * emit(arch) / main — iterate rows, ask the arch's encoder to - lower each, write out the defs file. - -Running: - $ python3 p1_gen.py [build-root] # rewrite all three files - $ python3 p1_gen.py --check [build-root] # diff against current files -""" - -import os -import sys -from dataclasses import dataclass -from itertools import product -from typing import Optional - -ARCHES = ('aarch64', 'amd64', 'riscv64') - -## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source). -P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7') - -## ---------- Register mappings -------------------------------------------- -## P1 register name → native encoding number. The native numbers are what -## the per-arch encoders insert into instruction fields; the human-facing -## names (rax, x1, a2, …) never appear in this file. - -## 4:4 caller/callee-saved split. r0–r3 caller (native argregs); r4–r7 -## callee (native callee-saved). `br` is the hidden branch-target scratch -## (not a P1 reg) — picked so every op's expansion clobbers only what its -## name declares. 
-NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3, - 'r4': 26, 'r5': 27, 'r6': 19, 'r7': 20, - 'br': 17, # x17 (IP1, caller-saved linker scratch) - 'sp': 31, 'xzr': 31, 'lr': 30, - 'x21': 21, 'x22': 22, 'x23': 23, 'x8': 8} - -## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15 -## setting the REX bit. We store the 4-bit native number directly. -NAT_AMD64 = {'r0': 0, # rax - 'r1': 7, # rdi - 'r2': 6, # rsi - 'r3': 2, # rdx - 'r4': 13, # r13 (callee-saved) - 'r5': 14, # r14 (callee-saved) - 'r6': 3, # rbx - 'r7': 12, # r12 - 'br': 11, # r11 — branch/call target scratch + DIV/REM r0 save - 'sp': 4, # rsp - 'rcx': 1, # shift-count scratch + DIV/REM rdx save (not a P1 reg) - 'r10': 10, # syscall arg4 slot (not a P1 reg) - 'r8': 8, # syscall arg5 slot (not a P1 reg) - 'r9': 9, # syscall arg6 slot (not a P1 reg) - 'r11': 11, # alias for br (some expansions spell it r11 directly) - } - -NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13, - 'r4': 20, 'r5': 21, 'r6': 9, 'r7': 18, - 'br': 30, # t5 (caller-saved temp) - 't0': 5, - 'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17, - 's3': 19, 's6': 22, 's7': 23} - - -## ---------- Low-level encoding helpers ----------------------------------- - -def le32(n: int) -> str: - return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - -def byte(n: int) -> str: - return f'{n & 0xFF:02X}' - - -## ---------- amd64 primitive encoders ------------------------------------ -## amd64 is variable-length. Helpers below emit specific instruction -## shapes used by the P1 expansions. REX prefix bits: W=64b, R=ModRM.reg -## high, B=ModRM.rm high, X=SIB.index high (unused here). 
- -def rex(w, r, x, b): - v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b - return byte(v) - -def modrm(mod, reg, rm): - return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7)) - -def amd_mov_rr(dst, src): - """mov dst, src — REX.W + 89 /r (MOV r/m64, r64).""" - d, s = NAT_AMD64[dst], NAT_AMD64[src] - return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d) - -def amd_alu_rr(op, dst, src): - """op dst, src — 2-operand ALU. op is the opcode byte (01 add, - 29 sub, 21 and, 09 or, 31 xor).""" - d, s = NAT_AMD64[dst], NAT_AMD64[src] - return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d) - -def amd_alu_ri8(ext, dst, imm): - """op dst, imm8 (sign-extended). Opcode 83 /ext ib.""" - d = NAT_AMD64[dst] - return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm) - -def amd_alu_ri32(ext, dst, imm): - """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when - an immediate doesn't fit in the imm8 form (e.g., ADDI with - values outside [-128, 127]).""" - d = NAT_AMD64[dst] - imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le - -def amd_shift_ri8(ext, dst, imm): - """shl/shr/sar dst, imm8. Opcode C1 /ext ib.""" - d = NAT_AMD64[dst] - return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm) - -def amd_shift_cl(ext, dst): - """shl/shr/sar dst, cl. Opcode D3 /ext.""" - d = NAT_AMD64[dst] - return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d) - -def amd_imul_rr(dst, src): - """imul dst, src — 0F AF /r.""" - d, s = NAT_AMD64[dst], NAT_AMD64[src] - return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s) - -def amd_idiv(src): - """idiv src — F7 /7 (signed div of rdx:rax by src).""" - s = NAT_AMD64[src] - return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s) - -def amd_cqo(): - """cqo — sign-extend rax into rdx:rax. 48 99.""" - return '4899' - -def amd_mem_rm(opcode, reg, base, disp): - """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load). 
- disp is signed int; encodes as disp8 if in range, else disp32.""" - r, b = NAT_AMD64[reg], NAT_AMD64[base] - prefix = rex(1, r >> 3, 0, b >> 3) + opcode - if -128 <= disp <= 127: - mod = 1 - d = byte(disp) - else: - mod = 2 - d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - # Any base whose low 3 bits are 100 — rsp (4) or r12 (12) — forces a - # SIB byte when mod != 00; rm=100 without SIB is the [*] / SIB-follows - # special form. REX.B already distinguishes rsp from r12 in the SIB - # base field, so the SIB byte is 24 in both cases. - if (b & 7) == 4: - return prefix + modrm(mod, r, 4) + '24' + d - return prefix + modrm(mod, r, b) + d - -def amd_mov_rm_b(reg, base, disp, store): - """Byte load/store. 88 /r (store), 0F B6 /r (movzx load).""" - r, b = NAT_AMD64[reg], NAT_AMD64[base] - if -128 <= disp <= 127: - mod = 1 - d = byte(disp) - else: - mod = 2 - d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - # rsp (4) and r12 (12) share rm bits 100, which forces a SIB byte when - # mod != 00; REX.B distinguishes them in the SIB base field. See - # amd_mem_rm for the same rule. - need_sib = (b & 7) == 4 - sib = '24' if need_sib else '' - rmv = 4 if need_sib else b - if store: - # MOV r/m8, r8 — 88 /r. Requires REX to address dil/sil/bpl/spl. - prefix = rex(1, r >> 3, 0, b >> 3) + '88' - return prefix + modrm(mod, r, rmv) + sib + d - else: - # MOVZX r64, r/m8 — REX.W 0F B6 /r. - prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6' - return prefix + modrm(mod, r, rmv) + sib + d - - -## ---------- aarch64 primitive encoders ---------------------------------- -## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded. - -def aa_rrr(base, rD, rA, rB): - d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB] - return le32(base | (b << 16) | (a << 5) | d) - -def aa_add_imm(rD, rA, imm12, sub=False): - """ADD/SUB (immediate, shift=0). 
imm12 unsigned 0..4095.""" - d, a = NAT_AA64[rD], NAT_AA64[rA] - base = 0xD1000000 if sub else 0x91000000 - return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d) - -def aa_logical_imm(base, rD, rA, N, immr, imms): - d, a = NAT_AA64[rD], NAT_AA64[rA] - return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d) - -def aa_ubfm(rD, rA, immr, imms): - """UBFM (N=1 for sf=64).""" - d, a = NAT_AA64[rD], NAT_AA64[rA] - return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d) - -def aa_sbfm(rD, rA, immr, imms): - """SBFM (N=1 for sf=64).""" - d, a = NAT_AA64[rD], NAT_AA64[rA] - return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d) - -def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2): - """LDR/STR (unsigned offset). off_bytes must be a multiple of - 2^size_log2 and non-negative. imm12 = off_bytes >> size_log2.""" - assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0 - imm12 = off_bytes >> size_log2 - assert 0 <= imm12 < 4096 - t, n = NAT_AA64[rT], NAT_AA64[rN] - return le32(base | (imm12 << 10) | (n << 5) | t) - -def aa_ldst_unscaled(base, rT, rN, off): - """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small - offsets — negative, or positive-but-not-a-multiple-of-the-access- - size (e.g. LD at offset 7). imm9 range is [-256, 255].""" - assert -256 <= off <= 255 - imm9 = off & 0x1FF - t, n = NAT_AA64[rT], NAT_AA64[rN] - return le32(base | (imm9 << 12) | (n << 5) | t) - -def aa_movz(rD, imm16): - """MOVZ rD, #imm16 — load a 16-bit immediate into the low half of - rD, zeroing the rest. 
Use this instead of `add rD, xzr, #imm`: - aarch64 ADD encodes register 31 as SP (not XZR), so the obvious - `mov xN, #imm` synthesis silently computes sp + imm.""" - d = NAT_AA64[rD] - assert 0 <= imm16 < (1 << 16) - return le32(0xD2800000 | (imm16 << 5) | d) - - -## ---------- riscv64 primitive encoders ---------------------------------- - -def rv_r(base, rD, rA, rB): - d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB] - return le32(base | (b << 20) | (a << 15) | (d << 7)) - -def rv_i(base, rD, rA, imm12): - """I-type: imm12[11:0], rs1, funct3, rd, opcode. imm12 is a signed - int that gets masked to 12 bits.""" - d, a = NAT_RV64[rD], NAT_RV64[rA] - return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7)) - -def rv_s(base, rS, rA, imm12): - """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode.""" - s, a = NAT_RV64[rS], NAT_RV64[rA] - hi = (imm12 >> 5) & 0x7F - lo = imm12 & 0x1F - return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7)) - -def rv_shift_imm(base, rD, rA, shamt): - """Shift-imm: base already has funct7 set; shamt in [0,63].""" - d, a = NAT_RV64[rD], NAT_RV64[rA] - return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7)) - - -## ---------- Per-arch op base tables ------------------------------------- - -AA64_RRR_BASE = { - 'ADD': 0x8B000000, - 'SUB': 0xCB000000, - 'AND': 0x8A000000, - 'OR': 0xAA000000, - 'XOR': 0xCA000000, - 'SHL': 0x9AC02000, - 'SHR': 0x9AC02400, - 'SAR': 0x9AC02800, - 'DIV': 0x9AC00C00, -} -AMD64_RRR_OPC = { - 'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31', -} -RV_RRR = { - 'ADD': 0x00000033, # funct7=0 funct3=0 opcode=0x33 - 'SUB': 0x40000033, - 'XOR': 0x00004033, - 'OR': 0x00006033, - 'AND': 0x00007033, - 'SHL': 0x00001033, - 'SHR': 0x00005033, - 'SAR': 0x40005033, - 'MUL': 0x02000033, - 'DIV': 0x02004033, - 'REM': 0x02006033, -} - - -## aarch64 bitmask-immediate encoding for ANDI/ORI. 
Entries are the -## (N, immr, imms) triples that encode each small imm as an aarch64 -## "logical immediate." Computed by hand because the full encoding -## algorithm (contiguous-run + rotation for element sizes -## 2/4/8/16/32/64) is substantial and we only need a handful of -## values. Extend this table if a new imm shows up in P1 source. -AA64_LOGI_ENC = { - 1: (1, 0, 0), # 0b0001 — single bit at position 0 - 2: (1, 63, 0), # 0b0010 — single bit at position 1 - 3: (1, 0, 1), # 0b0011 — 2 contiguous ones - 4: (1, 62, 0), # 0b0100 — single bit at position 2 - 6: (1, 63, 1), # 0b0110 — 2 ones rotated by 1 - 7: (1, 0, 2), # 0b0111 — 3 contiguous ones - 8: (1, 61, 0), # 0b1000 — single bit at position 3 -} - - -## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame -## bytes stay 16-byte aligned): -## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr) -## [sp + 8] = saved_fp (caller's sp at call/prologue entry) -## [sp + 16] = k (slot count) -## [sp + 24] = slot 1 (callee-private scratch) -## [sp + 32] = slot 2 -## ... -## -## Frame size = round_up_to_16(24 + 8*k). So k=1 → 32, k=2 → 40 → 48, -## k=3 → 48, k=4 → 56 → 64. - -def prologue_frame_bytes(k: int) -> int: - raw = 24 + 8 * k - return (raw + 15) & ~15 - - -## ---------- Encoders ---------------------------------------------------- -## One class per arch. Each provides one method per P1 op category, -## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch -## here via `Op.encode(enc)` → `enc.<method>(fields)`. - -class Encoder: - """Per-arch encoder base. Subclasses implement one method per - op category. `arch` is used by literal() to pick the right - pre-encoded bytes from an arch-keyed dict.""" - arch = '' - - def literal(self, hex_by_arch): - return hex_by_arch[self.arch] - - -class AA64(Encoder): - arch = 'aarch64' - - def rrr(self, op, rD, rA, rB): - if op == 'MUL': - # MUL = MADD with Ra=xzr. 
100 11011 000 mmmmm 0 aaaaa nnnnn ddddd - d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB] - return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d) - if op == 'REM': - # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA. - # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so - # REM does not hidden-clobber P1 r4 — the op modifies rD only. - # MSUB needs bit 15 set (o0=1); without it it decodes as - # MADD and REM returns A + (A/B)*B. - d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB] - SC = 16 - sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC - msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d - return le32(sdiv) + le32(msub) - return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB) - - def addi(self, rD, rA, imm): - if imm >= 0: - return aa_add_imm(rD, rA, imm, sub=False) - return aa_add_imm(rD, rA, -imm, sub=True) - - def logi(self, op, rD, rA, imm): - N, immr, imms = AA64_LOGI_ENC[imm] - base = 0x92000000 if op == 'ANDI' else 0xB2000000 # ORI = orr - return aa_logical_imm(base, rD, rA, N, immr, imms) - - def shifti(self, op, rD, rA, imm): - if op == 'SHLI': - return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm) - if op == 'SHRI': - return aa_ubfm(rD, rA, imm, 63) - if op == 'SARI': - return aa_sbfm(rD, rA, imm, 63) - - def mov(self, rD, rA): - if rD == 'sp': - return aa_add_imm('sp', rA, 0, sub=False) - if rA == 'sp': - return aa_add_imm(rD, 'sp', 0, sub=False) - # MOV xD, xA = ORR xD, xzr, xA - d = NAT_AA64[rD]; a = NAT_AA64[rA] - return le32(0xAA000000 | (a << 16) | (31 << 5) | d) - - def li(self, rD): - # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next) - d = NAT_AA64[rD] - ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8 - b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes) - return le32(ldr_w_lit) + le32(b_plus8) - - def la(self, rD): - return self.li(rD) - - def mem(self, op, rT, rN, off): - # Pick uimm12 (scaled, large range) when the offset is a - # non-negative multiple of the access width; otherwise fall - # back to the 
unscaled signed-imm9 form (covers negative - # offsets and positive-but-misaligned ones like 7). - BASES = { - 'LD': (0xF9400000, 3, 0xF8400000), - 'ST': (0xF9000000, 3, 0xF8000000), - 'LB': (0x39400000, 0, 0x38400000), - 'SB': (0x39000000, 0, 0x38000000), - } - uimm_base, size_log2, unscaled_base = BASES[op] - scale = 1 << size_log2 - if off >= 0 and (off % scale) == 0: - return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2) - return aa_ldst_unscaled(unscaled_base, rT, rN, off) - - def b(self): - return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17 - - def condb(self, op, rA, rB): - # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31). - # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A). - a = NAT_AA64[rA]; b_ = NAT_AA64[rB] - cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31) - cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op] - bcond = le32(0x54000040 | cond) - br = le32(0xD61F0000 | (NAT_AA64['br'] << 5)) - return cmp_ + bcond + br - - def condbz(self, op, rA): - # Branch-if-zero family. BEQZ/BNEZ use CBZ/CBNZ (skip br on - # the inverse cond); BLTZ falls back to CMP+B.GE since there's - # no single signed-less-than-zero insn. 
- a = NAT_AA64[rA] - br = le32(0xD61F0000 | (NAT_AA64['br'] << 5)) - if op == 'BEQZ': - return le32(0xB5000000 | (2 << 5) | a) + br # CBNZ rA, +8 ; BR - if op == 'BNEZ': - return le32(0xB4000000 | (2 << 5) | a) + br # CBZ rA, +8 ; BR - if op == 'BLTZ': - cmp_ = le32(0xEB000000 | (31 << 16) | (a << 5) | 31) # SUBS xzr, rA, xzr - bge = le32(0x54000040 | 10) # b.ge +8 - return cmp_ + bge + br - - def call(self): - return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17 - - def ret(self): - return le32(0xD65F03C0) # RET (= br x30) - - def prologue(self, k): - fb = prologue_frame_bytes(k) - sub = aa_add_imm('sp', 'sp', fb, sub=True) - str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3) - save_fp = aa_add_imm('x8', 'sp', fb, sub=False) - str_fp = aa_ldst_uimm12(0xF9000000, 'x8', 'sp', 8, 3) - # MUST use MOVZ here: ADD imm with Rn=31 reads SP, not XZR, - # so `add x8, xzr, #k` would store sp+k at [sp+16] instead - # of the slot count k. The pre-GC code never read [sp+16] - # so the bug stayed latent until the GC walker showed up. - mov_k = aa_movz('x8', k) - str_k = aa_ldst_uimm12(0xF9000000, 'x8', 'sp', 16, 3) - # Zero each slot. The GC walker reads every slot through - # mark_value, which dereferences any value whose tag bits - # match a heap tag. Stale stack data left here from prior - # frames can otherwise look like a tagged pointer and chase - # the marker into garbage. See LISP-GC.md §"Roots". 
- zero_slots = '' - for i in range(k): - zero_slots += aa_ldst_uimm12(0xF9000000, 'xzr', 'sp', 24 + 8*i, 3) - return sub + str_lr + save_fp + str_fp + mov_k + str_k + zero_slots - - def epilogue(self, k): - ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3) - ldr_fp = aa_ldst_uimm12(0xF9400000, 'x8', 'sp', 8, 3) - mov_sp = aa_add_imm('sp', 'x8', 0, sub=False) - return ldr_lr + ldr_fp + mov_sp - - def tail(self, k): - return self.epilogue(k) + self.b() - - -class AMD64(Encoder): - arch = 'amd64' - - def rrr(self, op, rD, rA, rB): - if op == 'MUL': - # If rD aliases rB, save rB to rcx first (mov rD,rA would - # clobber it before imul reads it). - if rD == rB and rA != rB: - return amd_mov_rr('rcx', rB) + amd_mov_rr(rD, rA) + amd_imul_rr(rD, 'rcx') - return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB) - if op in ('DIV', 'REM'): - # x86 idiv implicitly reads/writes rax (P1 r0) and rdx - # (P1 r3). To keep DIV/REM clobber-free (only rD changes), - # stash r0 into r11 and r3 into rcx — neither is a P1 reg — - # then restore. If rA or rB alias r0/r3, read from the - # saved copy since we've overwritten the originals. - # Skip the final restore for whichever of r0/r3 *is* rD, - # so rD keeps its newly computed value. - seq = amd_mov_rr('r11', 'r0') # save r0 (rax) - seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx) - src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA) - seq += amd_mov_rr('r0', src_a) # rax = rA - seq += amd_cqo() # rdx:rax = sign-ext rax - src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB) - seq += amd_idiv(src_b) - seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3') - if rD != 'r3': - seq += amd_mov_rr('r3', 'rcx') - if rD != 'r0': - seq += amd_mov_rr('r0', 'r11') - return seq - if op in ('SHL', 'SHR', 'SAR'): - ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op] - # Save rB → rcx FIRST so a subsequent mov rD,rA (which may - # alias rB) doesn't clobber the shift count. 
- seq = amd_mov_rr('rcx', rB) - if rD != rA: - seq += amd_mov_rr(rD, rA) - seq += amd_shift_cl(ext, rD) - return seq - # ADD/SUB/AND/OR/XOR. If rD aliases rB (e.g., sub_r3,r0,r3), the - # naive mov rD,rA overwrites rB before the op reads it — funnel - # rB through rcx. - opcode = AMD64_RRR_OPC[op] - if rD == rB and rA != rB: - seq = amd_mov_rr('rcx', rB) - seq += amd_mov_rr(rD, rA) - seq += amd_alu_rr(opcode, rD, 'rcx') - return seq - seq = '' if rD == rA else amd_mov_rr(rD, rA) - seq += amd_alu_rr(opcode, rD, rB) - return seq - - def addi(self, rD, rA, imm): - # mov rD,rA ; add rD,imm. Use imm8 form when it fits - # ([-128, 127]); otherwise emit the imm32 form. - seq = amd_mov_rr(rD, rA) - if -128 <= imm <= 127: - seq += amd_alu_ri8(0, rD, imm) # /0 = ADD - else: - seq += amd_alu_ri32(0, rD, imm) - return seq - - def logi(self, op, rD, rA, imm): - ext = {'ANDI': 4, 'ORI': 1}[op] - seq = amd_mov_rr(rD, rA) - seq += amd_alu_ri8(ext, rD, imm) - return seq - - def shifti(self, op, rD, rA, imm): - ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op] - seq = amd_mov_rr(rD, rA) - seq += amd_shift_ri8(ext, rD, imm) - return seq - - def mov(self, rD, rA): - return amd_mov_rr(rD, rA) - - def li(self, rD): - # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15) - d = NAT_AMD64[rD] - if d >= 8: - return '41' + byte(0xB8 + (d & 7)) - return byte(0xB8 + d) - - def la(self, rD): - return self.li(rD) - - def mem(self, op, rT, rN, off): - if op == 'LD': return amd_mem_rm('8B', rT, rN, off) - if op == 'ST': return amd_mem_rm('89', rT, rN, off) - if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False) - if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True) - - def b(self): - return '41FFE3' # jmp r11 - - def condb(self, op, rA, rB): - a, b_ = NAT_AMD64[rA], NAT_AMD64[rB] - # cmp rA, rB — opcode 39 /r with rA as r/m - cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a) - # jcc rel8 opcode, skip=3 (past jmp r11): - # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03 - jop 
= {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op] - return cmp_ + jop + '03' + '41FFE3' # jmp r11 - - def condbz(self, op, rA): - a = NAT_AMD64[rA] - # test rA, rA — 85 /r with rA in both slots; sets ZF iff rA == 0, - # SF iff rA < 0. Same skip+jmp pattern as condb. - test_ = rex(1, a >> 3, 0, a >> 3) + '85' + modrm(3, a, a) - jop = {'BEQZ': '75', 'BNEZ': '74', 'BLTZ': '7D'}[op] - return test_ + jop + '03' + '41FFE3' # jmp r11 - - def call(self): - return '41FFD3' # call r11 - - def ret(self): - return 'C3' - - def prologue(self, k): - # pop rcx ; mov rax,rsp ; sub rsp,fb ; [rsp]=rcx ; [rsp+8]=rax ; - # xor rax,rax ; add rax,k ; [rsp+16]=rax. - # rcx carries the retaddr, rax carries saved_fp then k. Neither - # is a live incoming P1 argument register at function entry. - # Each slot is then zeroed so a stale stack value can't masquerade - # as a tagged pointer when the GC walks this frame. See - # LISP-GC.md §"Roots". - fb = prologue_frame_bytes(k) - assert fb <= 127 - seq = '59' # pop rcx - seq += amd_mov_rr('r0', 'sp') # mov rax, rsp - seq += '4883EC' + byte(fb) # sub rsp, fb - seq += amd_mem_rm('89', 'rcx', 'sp', 0) # [rsp] = rcx - seq += amd_mem_rm('89', 'r0', 'sp', 8) # [rsp+8] = rax - seq += amd_alu_rr('31', 'r0', 'r0') # xor rax, rax - seq += amd_alu_ri8(0, 'r0', k) # add rax, k - seq += amd_mem_rm('89', 'r0', 'sp', 16) # [rsp+16] = rax - seq += amd_alu_rr('31', 'r0', 'r0') # xor rax, rax - for i in range(k): - seq += amd_mem_rm('89', 'r0', 'sp', 24 + 8*i) - return seq - - def epilogue(self, k): - # r10 = retaddr ; rcx = saved_fp ; rsp = saved_fp ; push r10. - # Keep r0/rax intact so function return values survive EPILOGUE. 
- seq = amd_mem_rm('8B', 'r10', 'sp', 0) # r10 = [rsp] - seq += amd_mem_rm('8B', 'rcx', 'sp', 8) # rcx = [rsp+8] - seq += amd_mov_rr('sp', 'rcx') # rsp = rcx - seq += '4152' # push r10 - return seq - - def tail(self, k): - return self.epilogue(k) + self.b() - - -class RV64(Encoder): - arch = 'riscv64' - - def rrr(self, op, rD, rA, rB): - return rv_r(RV_RRR[op], rD, rA, rB) - - def addi(self, rD, rA, imm): - return rv_i(0x00000013, rD, rA, imm) - - def logi(self, op, rD, rA, imm): - base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op] - return rv_i(base, rD, rA, imm) - - def shifti(self, op, rD, rA, imm): - base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op] - return rv_shift_imm(base, rD, rA, imm) - - def mov(self, rD, rA): - return rv_i(0x00000013, rD, rA, 0) # addi rD, rA, 0 - - def li(self, rD): - # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8 - d = NAT_RV64[rD] - auipc = 0x00000017 | (d << 7) - lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20) - jal_p8 = 0x0080006F - return le32(auipc) + le32(lwu) + le32(jal_p8) - - def la(self, rD): - return self.li(rD) - - def mem(self, op, rT, rN, off): - # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23. - if op == 'LD': return rv_i(0x00003003, rT, rN, off) - if op == 'ST': return rv_s(0x00003023, rT, rN, off) - if op == 'LB': return rv_i(0x00004003, rT, rN, off) # LBU - if op == 'SB': return rv_s(0x00000023, rT, rN, off) - - def b(self): - return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5) - - def condb(self, op, rA, rB): - # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op: - # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5). - a, b_ = NAT_RV64[rA], NAT_RV64[rB] - funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op] - insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7) - jalr = 0x00000067 | (NAT_RV64['br'] << 15) - return le32(insn) + le32(jalr) - - def condbz(self, op, rA): - # Same shape as condb but rs2 = x0 (zero reg). BEQZ/BNEZ/BLTZ - # invert the same way (BNE/BEQ/BGE). 
- a = NAT_RV64[rA] - funct3 = {'BEQZ': 1, 'BNEZ': 0, 'BLTZ': 5}[op] - insn = 0x00000063 | (funct3 << 12) | (a << 15) | (0 << 20) | (8 << 7) - jalr = 0x00000067 | (NAT_RV64['br'] << 15) - return le32(insn) + le32(jalr) - - def call(self): - return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5) - - def ret(self): - return le32(0x00008067) # jalr x0, 0(ra) - - def prologue(self, k): - fb = prologue_frame_bytes(k) - sub = rv_i(0x00000013, 'sp', 'sp', -fb) - sd = rv_s(0x00003023, 'ra', 'sp', 0) - fp = rv_i(0x00000013, 't0', 'sp', fb) - sd_fp = rv_s(0x00003023, 't0', 'sp', 8) - k_imm = rv_i(0x00000013, 't0', 'zero', k) - sd_k = rv_s(0x00003023, 't0', 'sp', 16) - # Zero each slot to keep GC walks safe; see LISP-GC.md §"Roots". - zero_slots = '' - for i in range(k): - zero_slots += rv_s(0x00003023, 'zero', 'sp', 24 + 8*i) - return sub + sd + fp + sd_fp + k_imm + sd_k + zero_slots - - def epilogue(self, k): - ld = rv_i(0x00003003, 'ra', 'sp', 0) - ld_fp = rv_i(0x00003003, 't0', 'sp', 8) - mov_sp = rv_i(0x00000013, 'sp', 't0', 0) - return ld + ld_fp + mov_sp - - def tail(self, k): - return self.epilogue(k) + self.b() - - -ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()} - - -## ---------- Op dataclasses ---------------------------------------------- -## Thin wrappers: each row holds its DEFINE name + the data needed to -## reconstruct the encoding. `encode(enc)` calls the matching method -## on the arch's encoder. 
- -@dataclass -class Op: - name: str - comment: str = '' - - def encode(self, enc: Encoder) -> str: - raise NotImplementedError - -@dataclass -class RRR(Op): - op: str = '' - rD: str = '' - rA: str = '' - rB: str = '' - def encode(self, enc): - return enc.rrr(self.op, self.rD, self.rA, self.rB) - -@dataclass -class AddI(Op): - rD: str = '' - rA: str = '' - imm: int = 0 - def encode(self, enc): - return enc.addi(self.rD, self.rA, self.imm) - -@dataclass -class LogI(Op): - op: str = '' # ANDI / ORI - rD: str = '' - rA: str = '' - imm: int = 0 - def encode(self, enc): - return enc.logi(self.op, self.rD, self.rA, self.imm) - -@dataclass -class ShiftI(Op): - op: str = '' # SHLI / SHRI / SARI - rD: str = '' - rA: str = '' - imm: int = 0 - def encode(self, enc): - return enc.shifti(self.op, self.rD, self.rA, self.imm) - -@dataclass -class Mov(Op): - rD: str = '' - rA: str = '' - def encode(self, enc): - return enc.mov(self.rD, self.rA) - -@dataclass -class Li(Op): - rD: str = '' - def encode(self, enc): - return enc.li(self.rD) - -@dataclass -class La(Op): - rD: str = '' - def encode(self, enc): - return enc.la(self.rD) - -@dataclass -class Mem(Op): - op: str = '' # LD / ST / LB / SB - rT: str = '' - rN: str = '' - off: int = 0 - def encode(self, enc): - return enc.mem(self.op, self.rT, self.rN, self.off) - -@dataclass -class B(Op): - def encode(self, enc): - return enc.b() - -@dataclass -class CondB(Op): - op: str = '' # BEQ / BNE / BLT - rA: str = '' - rB: str = '' - def encode(self, enc): - return enc.condb(self.op, self.rA, self.rB) - -@dataclass -class CondBZ(Op): - op: str = '' # BEQZ / BNEZ / BLTZ - rA: str = '' - def encode(self, enc): - return enc.condbz(self.op, self.rA) - -@dataclass -class Literal(Op): - hex_by_arch: Optional[dict] = None - def encode(self, enc): - return enc.literal(self.hex_by_arch) - -@dataclass -class Prologue(Op): - k: int = 1 - def encode(self, enc): - return enc.prologue(self.k) - -@dataclass -class Epilogue(Op): - k: int = 1 - def 
encode(self, enc): - return enc.epilogue(self.k) - -@dataclass -class Tail(Op): - k: int = 1 - def encode(self, enc): - return enc.tail(self.k) - -@dataclass -class Call(Op): - def encode(self, enc): - return enc.call() - -@dataclass -class Ret(Op): - def encode(self, enc): - return enc.ret() - - -## ---------- SYSCALL pre-encoded sequences ------------------------------- -## The one-shot syscall wrapper. Shuffles P1's r0=num, r1–r6=args into -## each arch's native syscall ABI and clobbers only r0 on return. -## Encoded by hand (per P1.md §"Syscall conventions"). - -SYSCALL_HEX = { - 'aarch64': ( - # r4/r5 now live in callee-saved natives (x26/x27), so the - # kernel preserves them — no save/restore needed. Only r1/r2/r3 - # (in caller-saved x1/x2/x3) must be stashed across the shuffle. - '' .join([ - le32(0xAA0003E8), # mov x8, x0 (syscall number) - le32(0xAA0103F5), # mov x21, x1 (save r1) - le32(0xAA0203F6), # mov x22, x2 (save r2) - le32(0xAA0303F7), # mov x23, x3 (save r3) - le32(0xAA1503E0), # mov x0, x21 (arg1 = r1) - le32(0xAA1603E1), # mov x1, x22 (arg2 = r2) - le32(0xAA1703E2), # mov x2, x23 (arg3 = r3) - le32(0xAA1A03E3), # mov x3, x26 (arg4 = r4) - le32(0xAA1B03E4), # mov x4, x27 (arg5 = r5) - le32(0xAA1303E5), # mov x5, x19 (arg6 = r6) - le32(0xD4000001), # svc #0 - le32(0xAA1503E1), # mov x1, x21 (restore r1) - le32(0xAA1603E2), # mov x2, x22 - le32(0xAA1703E3), # mov x3, x23 - ]) - ), - # r4=r13, r5=r14 are callee-saved natively, but syscall wants args - # 4/5 in r10/r8. r6=rbx, but arg6 lives in r9. Three shuffle moves, - # then syscall. The kernel preserves rdi/rsi/rdx/r12–r15/rbx, so no - # P1 reg is clobbered beyond r0 (syscall return). - 'amd64': '4D89EA' + '4D89F0' + '4989D9' + '0F05', - 'riscv64': ( - # Same story as aarch64: r4/r5 in callee-saved s4/s5 (=x20/x21), - # so we only save/restore a1/a2/a3. Scratch slots: s3, s6, s7. 
- ''.join([ - le32(0x00050893), # mv a7, a0 (syscall number) - le32(0x00058993), # mv s3, a1 (save r1) - le32(0x00060B13), # mv s6, a2 (save r2) - le32(0x00068B93), # mv s7, a3 (save r3) - le32(0x00098513), # mv a0, s3 (arg1 = r1) - le32(0x000B0593), # mv a1, s6 (arg2 = r2) - le32(0x000B8613), # mv a2, s7 (arg3 = r3) - le32(0x000A0693), # mv a3, s4 (arg4 = r4) - le32(0x000A8713), # mv a4, s5 (arg5 = r5) - le32(0x00048793), # mv a5, s1 (arg6 = r6) - le32(0x00000073), # ecall - le32(0x00098593), # mv a1, s3 (restore r1) - le32(0x000B0613), # mv a2, s6 - le32(0x000B8693), # mv a3, s7 - ]) - ), -} - -## Syscall numbers (little-endian 32-bit for LI operand). -## aarch64 and riscv64 share the asm-generic table; amd64 has its own. -## -## Portability notes — every entry below is a syscall that exists on all -## three with the same semantics under the uniform P1 SYSCALL convention -## (r0 = num, r1-r6 = args): -## - `fork` is amd64-only; `wait4` is asm-generic 32-bit compat only. -## Use `clone(SIGCHLD)` and `waitid` instead. -## - `open` is amd64-only (removed from asm-generic). Use `openat` with -## dirfd = AT_FDCWD (-100) as arg1. -## - `clone` arg order differs: amd64 is (flags, stack, ptid, ctid, tls); -## aarch64/riscv64 are (flags, stack, ptid, tls, ctid). Benign when -## ptid/ctid/tls are all zero (the fork-equivalent case). 
-SYS_NUM = { - 'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57, - 'SYS_OPENAT': 56, - 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95}, - 'amd64': {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3, - 'SYS_OPENAT':257, - 'SYS_CLONE': 56, 'SYS_EXECVE': 59, 'SYS_WAITID':247}, - 'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57, - 'SYS_OPENAT': 56, - 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95}, -} - - -## ---------- Canonical imm/offset/shamt sets ----------------------------- -## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex -## bytes verbatim, so every distinct imm value needs its own DEFINE. -## These cover every value used across hello/demo/lisp plus a little -## headroom. Extend when a new value appears in P1 src. - -## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag -## stripping, loop counters, and — at 40 — the primitive-registration -## table's per-record stride. Full reg product × this set = 8²×N. -ADDI_IMMS = tuple(range(-128, 129)) + (256, 320) - -## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks -## (length field extraction; 4096-slot symbol-table index); the small -## values scale-by-N for byte offsets and fixnum encode/decode. -SHIFT_IMMS = (1, 2, 3, 4, 5, 13, 16, 32, 52) - -## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC. -LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8) - -## Memory offsets for LD/ST/LB/SB. 8*(0-40) covers record fields used by -## the bootstrap expander's token, macro, stream, and scratch layouts. -## 6/7 remain for legacy byte tests; -8 reaches one slot below a base. -MEM_OFFS = (-8, 0, 6, 7) + tuple(range(8, 328, 8)) - -CONDB_OPS = ('BEQ', 'BNE', 'BLT') -CONDBZ_OPS = ('BEQZ', 'BNEZ', 'BLTZ') -SHIFT_OPS = ('SHLI', 'SHRI', 'SARI') -LOGI_OPS = ('ANDI', 'ORI') -MEM_OPS = ('LD', 'ST', 'LB', 'SB') - - -## Curated RRR triples. 
The full cube is 11 ops × 8³ regs = 5632 -## entries per arch — >99% would be dead weight. Each tuple below -## is one actually used by hello/demo/lisp. Lint catches missing -## triples on assembly; add a line here and regenerate. -RRR_TABLE = ( - # demo/lisp step-1 arith cube - ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'), - ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'), - ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'), - ('AND','r1','r1','r5'), - ('OR', 'r1','r1','r2'), - ('XOR','r1','r1','r2'), - ('MUL','r1','r1','r2'), - ('DIV','r1','r1','r2'), - ('REM','r1','r1','r5'), - ('SHL','r1','r1','r2'), - ('SHL','r2','r2','r1'), - ('SHR','r1','r1','r2'), - ('SAR','r4','r4','r2'), - # alloc / pointer arithmetic - ('ADD','r2','r0','r1'), - ('ADD','r2','r1','r0'), - ('ADD','r0','r0','r1'), - ('ADD','r0','r0','r2'), - ('ADD','r0','r0','r3'), - ('ADD','r2','r2','r0'), - ('ADD','r2','r2','r1'), - ('SUB','r1','r1','r0'), - ('SUB','r2','r1','r2'), - ('SUB','r3','r3','r0'), - ('AND','r1','r1','r2'), - ('AND','r3','r3','r2'), - ('OR', 'r3','r3','r2'), - # reader / display index+offset fold - ('ADD','r6','r1','r2'), - ('ADD','r6','r6','r0'), - ('ADD','r7','r1','r2'), - # m1m bootstrap file I/O buffer indexing. - ('ADD','r2','r2','r5'), - ('ADD','r2','r2','r7'), - ('SUB','r2','r1','r6'), - ('SUB','r3','r1','r6'), - ('SUB','r2','r2','r1'), # prelude length = end - start - # m1m bootstrap file I/O remaining-byte counts. - ('SUB','r3','r3','r7'), - ('SUB','r3','r7','r5'), - ('REM','r1','r1','r2'), - # bump-pointer + accumulator updates (originally kaem-minimal; - # retained in case lisp uses them — lint catches dead entries) - ('ADD','r1','r1','r0'), - ('ADD','r5','r5','r0'), - ('ADD','r7','r7','r0'), - ('SUB','r3','r3','r2'), - ('SUB','r6','r6','r0'), - # Primitive bodies (LISP.md step 10c). Convention: r1=argc, - # r2=argv (both input), r3=accumulator, r0=scratch/return. - # Variadic folds: (r3 = r3 op r0). Unary negate / bit-not: - # (r3 = r0 - r3) with r0 = 0 or -1. 
arithmetic-shift k-negate: - # (r0 = r1 - r0) with r1 = 0 after argc is consumed. - ('ADD','r3','r3','r0'), - ## ('SUB','r3','r3','r0') — already above - ('SUB','r3','r0','r3'), - ('SUB','r0','r1','r0'), - ('MUL','r3','r3','r0'), - ('DIV','r3','r3','r0'), - ('REM','r3','r3','r0'), - ('AND','r3','r3','r0'), - ('OR', 'r3','r3','r0'), - ('XOR','r3','r3','r0'), - ('SHL','r3','r3','r0'), - ('SAR','r3','r3','r0'), -) - - -## ---------- Row assembly ------------------------------------------------ - -HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand. -## -## Shared op-table lives in p1_gen.py; each arch's encoder lowers -## (op, register-tuple, imm) rows into native bytes. See P1.md for the -## ISA spec and register mapping. -""" - -@dataclass -class Banner: - text: str - - -def _imm_suf(imm): - return f'NEG{-imm}' if imm < 0 else f'{imm}' - - -def rows(): - R = [] - - # --- LI / LA — wide literal and address loads --- - R.append(Banner('LI / LA — load 4-byte zero-extended literal or label addr')) - for rd in P1_REGS: - R.append(Li(name=f'LI_{rd.upper()}', rD=rd)) - # LI_BR loads into the hidden branch-target scratch (x17/r11/t5). - # Every branch/call site is `LI_BR &target ; P1_<BR>`. The scratch - # is *not* a P1 reg. - R.append(Li(name='LI_BR', rD='br')) - for rd in P1_REGS: - R.append(La(name=f'LA_{rd.upper()}', rD=rd)) - - # --- MOV — register-to-register + MOV rD, sp --- - R.append(Banner('MOV — full register product (src may be sp)')) - for rd in P1_REGS: - for ra in P1_REGS: - R.append(Mov(name=f'MOV_{rd.upper()}_{ra.upper()}', rD=rd, rA=ra)) - R.append(Mov(name=f'MOV_{rd.upper()}_SP', rD=rd, rA='sp')) - - # --- RRR — full cube. m1m.M1 is record-heavy enough that keeping this - # complete is simpler and safer than curating one tuple at a time. The - # Makefile still prunes unused DEFINEs before feeding M0. 
- R.append(Banner('RRR — full register product')) - all_rrr = tuple((op, d, a, b) - for op in AA64_RRR_BASE - for d in P1_REGS - for a in P1_REGS - for b in P1_REGS) - for op, d, a, b in all_rrr: - R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}', - op=op, rD=d, rA=a, rB=b)) - - # --- Immediate arith: ADDI × full reg product × imm set --- - R.append(Banner('ADDI — full register product × ADDI_IMMS')) - for d, a, imm in product(P1_REGS, P1_REGS, ADDI_IMMS): - R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{_imm_suf(imm)}', - rD=d, rA=a, imm=imm)) - - # --- ANDI / ORI × full reg product × LOGI_IMMS --- - R.append(Banner('ANDI / ORI — full register product × LOGI_IMMS')) - for op, d, a, imm in product(LOGI_OPS, P1_REGS, P1_REGS, LOGI_IMMS): - R.append(LogI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}', - op=op, rD=d, rA=a, imm=imm)) - - # --- SHLI / SHRI / SARI × full reg product × SHIFT_IMMS --- - R.append(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS')) - for op, d, a, imm in product(SHIFT_OPS, P1_REGS, P1_REGS, SHIFT_IMMS): - R.append(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}', - op=op, rD=d, rA=a, imm=imm)) - - # --- Memory: LD/ST/LB/SB × full reg product × MEM_OFFS --- - R.append(Banner('LD / ST / LB / SB — full register product × MEM_OFFS')) - for op, rt, rn, off in product(MEM_OPS, P1_REGS, P1_REGS, MEM_OFFS): - R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}', - op=op, rT=rt, rN=rn, off=off)) - - # --- SP-relative memory: LD/ST/LB/SB × P1_REGS × MEM_OFFS --- - # Lets callees touch PROLOGUE_N* frame slots directly, skipping the - # MOV_Rk_SP trampoline that previously bridged P1 regs to sp-base. 
- R.append(Banner('LD / ST / LB / SB — SP-base × MEM_OFFS')) - for op, rt, off in product(MEM_OPS, P1_REGS, MEM_OFFS): - R.append(Mem(name=f'{op}_{rt.upper()}_SP_{_imm_suf(off)}', - op=op, rT=rt, rN='sp', off=off)) - - # --- Branches: BEQ/BNE/BLT × full reg product + unconditional B --- - R.append(Banner('Branches — LI_BR-indirect pattern')) - R.append(B(name='B')) - for op, a, b in product(CONDB_OPS, P1_REGS, P1_REGS): - R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}', - op=op, rA=a, rB=b)) - - # --- Zero-compare branches: BEQZ/BNEZ/BLTZ × P1_REGS --- - # Compare-with-zero is the common case (null pointer, fd<0, EOF, - # loop-count exhausted), so we give it a single-insn path that - # doesn't consume a scratch reg for the zero operand. - for op, a in product(CONDBZ_OPS, P1_REGS): - R.append(CondBZ(name=f'{op}_{a.upper()}', op=op, rA=a)) - - # --- Control: CALL / RET / PROLOGUE / EPILOGUE / TAIL (Nk = 1..4) --- - R.append(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL')) - R.append(Prologue(name='PROLOGUE', k=1)) - R.append(Epilogue(name='EPILOGUE', k=1)) - R.append(Ret(name='RET')) - R.append(Call(name='CALL')) - R.append(Tail(name='TAIL', k=1)) - for k in (2, 3, 4): - R.append(Prologue(name=f'PROLOGUE_N{k}', k=k)) - R.append(Epilogue(name=f'EPILOGUE_N{k}', k=k)) - R.append(Tail(name=f'TAIL_N{k}', k=k)) - - # --- SYSCALL — pre-encoded per-arch wrapper --- - R.append(Banner('SYSCALL — uniform "clobbers r0 only" across arches')) - R.append(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX)) - - # --- Syscall numbers (LE-32 immediates) --- - R.append(Banner('Linux syscall numbers (per-arch table). 
LE-32 operands for LI.')) - for name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE', 'SYS_OPENAT', - 'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'): - R.append(Literal(name=name, - hex_by_arch={a: le32(SYS_NUM[a][name]) for a in ARCHES})) - - return R - - -## ---------- File emission ----------------------------------------------- - -def lower_name(name: str) -> str: - """Transform an uppercase row name to the user-facing mnemonic. - First '_' binds op → first operand with underscore; subsequent - '_' become ',' (e.g. MOV_R4_R0 → mov_r4,r0, ADD_R1_R1_R2 → - add_r1,r1,r2). Single-'_' names (PROLOGUE_N3, SYS_WRITE, LI_R0) - keep their lone underscore.""" - low = name.lower() - head, _, rest = low.partition('_') - if not rest: - return head - return f'{head}_{rest.replace("_", ",")}' - - -def emit(arch: str) -> str: - enc = ENCODERS[arch] - out = [HEADER.format(arch=arch).rstrip(), ''] - seen = set() - for row in rows(): - if isinstance(row, Banner): - out.append('') - out.append('## ---- ' + row.text + ' ' + '-' * max(0, 60 - len(row.text))) - continue - name = lower_name(row.name) - if name in seen: - raise RuntimeError(f'duplicate DEFINE: {name}') - seen.add(name) - out.append(f'DEFINE {name} {row.encode(enc)}') - out.append('') - return '\n'.join(out) - - -def main(): - check = '--check' in sys.argv - positional = [a for a in sys.argv[1:] if not a.startswith('--')] - build_root = positional[0] if positional else 'build' - - had_diff = False - for arch in ARCHES: - dest_dir = os.path.join(build_root, arch) - path = os.path.join(dest_dir, f'p1_{arch}.M1') - content = emit(arch) - if check: - try: - with open(path) as f: - existing = f.read() - except FileNotFoundError: - existing = '' - if existing != content: - sys.stderr.write(f'DIFF: {path}\n') - had_diff = True - else: - os.makedirs(dest_dir, exist_ok=True) - with open(path, 'w') as f: - f.write(content) - print(f'wrote {path} ({len(content)} bytes)') - - if check and had_diff: - sys.exit(1) - - -if __name__ == 
'__main__': - main()