commit 92f3cc91a2b73dc490ae6048309beb6da7382c96
parent 90d6018d786c42cf3a053508647144dceca2167e
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 23 Apr 2026 11:55:22 -0700
move files around
Diffstat:
9 files changed, 0 insertions(+), 1468 deletions(-)
diff --git a/README.md b/README.md
@@ -1,53 +0,0 @@
-# lispcc
-
-Experimental alternative bootstrap path: a small Lisp written directly in
-M1 assembly + a C compiler written in that Lisp, replacing the
-`M2-Planet → mes → MesCC → nyacc` stack between M1 asm and tcc-boot.
-Goal is a 4–6× shrink in auditable LOC. See [docs/PLAN.md](docs/PLAN.md).
-
-## Status
-
-Stage 0: hello-world in the P1 portable pseudo-ISA (see [docs/P1.md](docs/P1.md)),
-assembled and run inside a pristine alpine container on all three target
-arches (aarch64, amd64, riscv64). The same `tests/hello.M1` source assembles
-for every arch; only the backing `build/<arch>/p1_<arch>.M1` defs file
-varies. Toolchain (M1, hex2) builds statically from the upstream mescc-tools
-C source.
-
-## Layout
-
-```
-docs/ design docs (PLAN, SEED, P1, LISP)
-src/ real programs (lisp.M1) + p1_gen.py
-tests/ smoke programs (hello.M1, demo.M1) + fixtures
- lisp/ lisp test fixtures (*.scm + *.expected)
-bootstrap.sh hex0-seed → M0/hex2-0/catm toolchain build
-populate-upstream.sh host-side copy of upstream seeds + sources into build/upstream/
-lint.sh M1 undefined-token guard
-Makefile podman-driven build, ARCH-parameterized
-build/ all derived artifacts (gitignored)
- upstream/ mirror of the files bootstrap.sh consumes from live-bootstrap
- <arch>/ per-arch outputs
- tools/ bootstrapped M0, hex2-0, catm (+ throwaway hex0/hex1)
- p1_<arch>.M1 generated P1 defs
- <prog> final ELF binary
-```
-
-## Build & run
-
-Requires podman. Uses Alpine as the host. Non-native arches run via podman's
-binfmt + qemu-user path (works transparently on a default `podman machine`
-setup).
-
-```
-make run-all # build + run on all three arches
-make clean # wipe build/
-```
-
-## Source layout assumption
-
-`populate-upstream.sh` runs on the host and mirrors the files bootstrap.sh
-needs from `$UPSTREAM/seed/stage0-posix/` into `build/upstream/`; the
-default is `../live-bootstrap`. Override by invoking `make UPSTREAM=/path
-populate-upstream`. Podman itself only ever mounts curdir, so everything
-the container sees must live inside the repo.
diff --git a/src/lisp.M1 b/lisp/lisp.M1
diff --git a/src/prelude.scm b/lisp/prelude.scm
diff --git a/src/m1pp.M1 b/m1pp/m1pp.M1
diff --git a/src/m1pp.c b/m1pp/m1pp.c
diff --git a/src/elf/ELF-aarch64.hex2 b/src/elf/ELF-aarch64.hex2
@@ -1,63 +0,0 @@
-## ELF-aarch64.hex2 — BSS-enabled AArch64 ELF header for lispcc program link.
-##
-## Derived from stage0-posix's ELF-aarch64.hex2 (same single-LOAD-segment
-## layout) with one change: p_memsz references :ELF_bss_end while p_filesz
-## still references :ELF_end. The on-disk file ends at :ELF_end (after a
-## post-link truncate drops the trailing zero bytes hex2 emits); the
-## kernel's loader zero-fills the gap up to :ELF_bss_end at runtime.
-##
-## The stock stage0-posix ELF-*.hex2 files under build/upstream/ stay
-## untouched (no :ELF_bss_end label) because bootstrap.sh uses them to
-## link the M0 binary itself — a program that has no BSS region to speak
-## of and doesn't define :ELF_bss_end.
-##
-## If you use this header, your program must define BOTH :ELF_end
-## (end of real bytes) and :ELF_bss_end (end of zero-fill region).
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number
-
-02 # e_ident[EI_CLASS] Indicating 64 bit
-01 # e_ident[EI_DATA] Indicating little endianness
-01 # e_ident[EI_VERSION] Indicating original elf
-
-03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 # e_ident[EI_ABIVERSION] Set at 0 because no one cares
-
-00 00 00 00 00 00 00 # e_ident[EI_PAD]
-02 00 # e_type Indicating Executable
-B7 00 # e_machine Indicating AArch64
-01 00 00 00 # e_version Indicating original elf
-
-&_start 00 00 00 00 # e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table
-00 00 00 00 00 00 00 00 # e_shoff Address of section header table
-
-00 00 00 00 # e_flags
-40 00 # e_ehsize Indicating our 64 Byte header
-
-38 00 # e_phentsize size of a program header table
-01 00 # e_phnum number of entries in program table
-
-00 00 # e_shentsize size of a section header table
-00 00 # e_shnum number of entries in section table
-
-00 00 # e_shstrndx index of the section names
-
-## Program Header
-:ELF_program_headers
-01 00 00 00 # ph_type: PT-LOAD = 1
-07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 # ph_offset
-
-&ELF_base 00 00 00 00 # ph_vaddr
-&ELF_base 00 00 00 00 # ph_physaddr
-
-%ELF_end>ELF_base 00 00 00 00 # ph_filesz (end of on-disk image)
-%ELF_bss_end>ELF_base 00 00 00 00 # ph_memsz (kernel zero-fills past ph_filesz)
-
-01 00 00 00 00 00 00 00 # ph_align
-
-:ELF_text
diff --git a/src/elf/ELF-amd64.hex2 b/src/elf/ELF-amd64.hex2
@@ -1,51 +0,0 @@
-## ELF-amd64.hex2 — BSS-enabled AMD64 ELF header for lispcc program link.
-## See src/elf/ELF-aarch64.hex2 for why this lives here rather than
-## under build/upstream/ (bootstrap.sh needs the stock version to link M0).
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 ## e_ident[EI_MAG0-3] ELF's magic number
-
-02 ## e_ident[EI_CLASS] Indicating 64 bit
-01 ## e_ident[EI_DATA] Indicating little endianness
-01 ## e_ident[EI_VERSION] Indicating original elf
-
-03 ## e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 ## e_ident[EI_ABIVERSION] Set at 0 because no one cares
-
-00 00 00 00 00 00 00 ## e_ident[EI_PAD]
-02 00 ## e_type Indicating Executable
-3E 00 ## e_machine Indicating AMD64
-01 00 00 00 ## e_version Indicating original elf
-
-&_start 00 00 00 00 ## e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 ## e_phoff Address of program header table
-00 00 00 00 00 00 00 00 ## e_shoff Address of section header table
-
-00 00 00 00 ## e_flags
-40 00 ## e_ehsize Indicating our 64 Byte header
-
-38 00 ## e_phentsize size of a program header table
-01 00 ## e_phnum number of entries in program table
-
-00 00 ## e_shentsize size of a section header table
-00 00 ## e_shnum number of entries in section table
-
-00 00 ## e_shstrndx index of the section names
-
-## Program Header
-:ELF_program_headers
-01 00 00 00 ## p_type
-07 00 00 00 ## ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 ## p_offset
-
-&ELF_base 00 00 00 00 ## p_vaddr
-&ELF_base 00 00 00 00 ## p_physaddr
-
-%ELF_end>ELF_base 00 00 00 00 ## p_filesz (end of on-disk image)
-%ELF_bss_end>ELF_base 00 00 00 00 ## p_memsz (kernel zero-fills past p_filesz)
-
-01 00 00 00 00 00 00 00 ## Required alignment
-
-:ELF_text
diff --git a/src/elf/ELF-riscv64.hex2 b/src/elf/ELF-riscv64.hex2
@@ -1,51 +0,0 @@
-## ELF-riscv64.hex2 — BSS-enabled RISC-V 64 ELF header for lispcc program link.
-## See src/elf/ELF-aarch64.hex2 for why this lives here rather than
-## under build/upstream/ (bootstrap.sh needs the stock version to link M0).
-
-## ELF Header
-
-:ELF_base
-7F 45 4C 46 ## e_ident[EI_MAG0-3] ELF's magic number
-
-02 ## e_ident[EI_CLASS] Indicating 64 bit
-01 ## e_ident[EI_DATA] Indicating little endianness
-01 ## e_ident[EI_VERSION] Indicating original elf
-
-03 ## e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
-00 ## e_ident[EI_ABIVERSION] Set at 0 because no one cares
-
-00 00 00 00 00 00 00 ## e_ident[EI_PAD]
-02 00 ## e_type Indicating Executable
-F3 00 ## e_machine Indicating RISC-V
-01 00 00 00 ## e_version Indicating original elf
-
-&_start 00 00 00 00 ## e_entry Address of the entry point
-%ELF_program_headers>ELF_base 00 00 00 00 ## e_phoff Address of program header table
-00 00 00 00 00 00 00 00 ## e_shoff Address of section header table
-
-00 00 00 00 ## e_flags
-40 00 ## e_ehsize Indicating our 64 Byte header
-
-38 00 ## e_phentsize size of a program header table
-01 00 ## e_phnum number of entries in program table
-
-00 00 ## e_shentsize size of a section header table
-00 00 ## e_shnum number of entries in section table
-
-00 00 ## e_shstrndx index of the section names
-
-## Program Header
-:ELF_program_headers
-01 00 00 00 ## p_type
-07 00 00 00 ## ph_flags: PF-X|PF-W|PF-R = 7
-00 00 00 00 00 00 00 00 ## p_offset
-
-&ELF_base 00 00 00 00 ## p_vaddr
-&ELF_base 00 00 00 00 ## p_physaddr
-
-%ELF_end>ELF_base 00 00 00 00 ## p_filesz (end of on-disk image)
-%ELF_bss_end>ELF_base 00 00 00 00 ## p_memsz (kernel zero-fills past p_filesz)
-
-01 00 00 00 00 00 00 00 ## Required alignment
-
-:ELF_text
diff --git a/src/p1_gen.py b/src/p1_gen.py
@@ -1,1250 +0,0 @@
-#!/usr/bin/env python3
-"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table.
-
-Single source of truth for the P1 DEFINE tables across all three target
-arches. Running this script writes <build>/aarch64/p1_aarch64.M1 and the
-amd64/riscv64 siblings (default <build> = "build").
-
-Structure:
- * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of
- helpers per arch.
- * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one
- method per P1 op category, lowering (op, reg-tuple, imm) into
- native hex. Each arch's encoder is a coherent bundle — adding a
- new op means one new method on each of the three.
- * Op dataclasses — thin rows holding the DEFINE's name + data.
- Op.encode(enc) dispatches into enc.<op-method>() with the Op's
- fields unpacked. No per-arch branching lives in Op classes.
- * rows() — builds the output list. Non-RRR ops are emitted as the
- full register product × a curated imm/offset/shamt set. RRR
- keeps an explicit table (the full 8³ cube is 5.6k entries per
- arch, >99% dead weight). Adding a new RRR triple or a new imm
- value is a one-line edit to rows(); a new register combination
- for any other op needs no edit at all.
- * emit(arch) / main — iterate rows, ask the arch's encoder to
- lower each, write out the defs file.
-
-Running:
- $ python3 p1_gen.py [build-root] # rewrite all three files
- $ python3 p1_gen.py --check [build-root] # diff against current files
-"""
-
-import os
-import sys
-from dataclasses import dataclass
-from itertools import product
-from typing import Optional
-
-ARCHES = ('aarch64', 'amd64', 'riscv64')
-
-## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source).
-P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')
-
-## ---------- Register mappings --------------------------------------------
-## P1 register name → native encoding number. The native numbers are what
-## the per-arch encoders insert into instruction fields; the human-facing
-## names (rax, x1, a2, …) never appear in this file.
-
-## 4:4 caller/callee-saved split. r0–r3 caller (native argregs); r4–r7
-## callee (native callee-saved). `br` is the hidden branch-target scratch
-## (not a P1 reg) — picked so every op's expansion clobbers only what its
-## name declares.
-NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3,
- 'r4': 26, 'r5': 27, 'r6': 19, 'r7': 20,
- 'br': 17, # x17 (IP1, caller-saved linker scratch)
- 'sp': 31, 'xzr': 31, 'lr': 30,
- 'x21': 21, 'x22': 22, 'x23': 23, 'x8': 8}
-
-## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15
-## setting the REX bit. We store the 4-bit native number directly.
-NAT_AMD64 = {'r0': 0, # rax
- 'r1': 7, # rdi
- 'r2': 6, # rsi
- 'r3': 2, # rdx
- 'r4': 13, # r13 (callee-saved)
- 'r5': 14, # r14 (callee-saved)
- 'r6': 3, # rbx
- 'r7': 12, # r12
- 'br': 11, # r11 — branch/call target scratch + DIV/REM r0 save
- 'sp': 4, # rsp
- 'rcx': 1, # shift-count scratch + DIV/REM rdx save (not a P1 reg)
- 'r10': 10, # syscall arg4 slot (not a P1 reg)
- 'r8': 8, # syscall arg5 slot (not a P1 reg)
- 'r9': 9, # syscall arg6 slot (not a P1 reg)
- 'r11': 11, # alias for br (some expansions spell it r11 directly)
- }
-
-NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13,
- 'r4': 20, 'r5': 21, 'r6': 9, 'r7': 18,
- 'br': 30, # t5 (caller-saved temp)
- 't0': 5,
- 'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17,
- 's3': 19, 's6': 22, 's7': 23}
-
-
-## ---------- Low-level encoding helpers -----------------------------------
-
-def le32(n: int) -> str:
- return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
-
-def byte(n: int) -> str:
- return f'{n & 0xFF:02X}'
-
-
-## ---------- amd64 primitive encoders ------------------------------------
-## amd64 is variable-length. Helpers below emit specific instruction
-## shapes used by the P1 expansions. REX prefix bits: W=64b, R=ModRM.reg
-## high, B=ModRM.rm high, X=SIB.index high (unused here).
-
-def rex(w, r, x, b):
- v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b
- return byte(v)
-
-def modrm(mod, reg, rm):
- return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7))
-
-def amd_mov_rr(dst, src):
- """mov dst, src — REX.W + 89 /r (MOV r/m64, r64)."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d)
-
-def amd_alu_rr(op, dst, src):
- """op dst, src — 2-operand ALU. op is the opcode byte (01 add,
- 29 sub, 21 and, 09 or, 31 xor)."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d)
-
-def amd_alu_ri8(ext, dst, imm):
- """op dst, imm8 (sign-extended). Opcode 83 /ext ib."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm)
-
-def amd_alu_ri32(ext, dst, imm):
- """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when
- an immediate doesn't fit in the imm8 form (e.g., ADDI with
- values outside [-128, 127])."""
- d = NAT_AMD64[dst]
- imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le
-
-def amd_shift_ri8(ext, dst, imm):
- """shl/shr/sar dst, imm8. Opcode C1 /ext ib."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm)
-
-def amd_shift_cl(ext, dst):
- """shl/shr/sar dst, cl. Opcode D3 /ext."""
- d = NAT_AMD64[dst]
- return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d)
-
-def amd_imul_rr(dst, src):
- """imul dst, src — 0F AF /r."""
- d, s = NAT_AMD64[dst], NAT_AMD64[src]
- return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s)
-
-def amd_idiv(src):
- """idiv src — F7 /7 (signed div of rdx:rax by src)."""
- s = NAT_AMD64[src]
- return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s)
-
-def amd_cqo():
- """cqo — sign-extend rax into rdx:rax. 48 99."""
- return '4899'
-
-def amd_mem_rm(opcode, reg, base, disp):
- """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load).
- disp is signed int; encodes as disp8 if in range, else disp32."""
- r, b = NAT_AMD64[reg], NAT_AMD64[base]
- prefix = rex(1, r >> 3, 0, b >> 3) + opcode
- if -128 <= disp <= 127:
- mod = 1
- d = byte(disp)
- else:
- mod = 2
- d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- # Any base whose low 3 bits are 100 — rsp (4) or r12 (12) — forces a
- # SIB byte when mod != 00; rm=100 without SIB is the [*] / SIB-follows
- # special form. REX.B already distinguishes rsp from r12 in the SIB
- # base field, so the SIB byte is 24 in both cases.
- if (b & 7) == 4:
- return prefix + modrm(mod, r, 4) + '24' + d
- return prefix + modrm(mod, r, b) + d
-
-def amd_mov_rm_b(reg, base, disp, store):
- """Byte load/store. 88 /r (store), 0F B6 /r (movzx load)."""
- r, b = NAT_AMD64[reg], NAT_AMD64[base]
- if -128 <= disp <= 127:
- mod = 1
- d = byte(disp)
- else:
- mod = 2
- d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
- # rsp (4) and r12 (12) share rm bits 100, which forces a SIB byte when
- # mod != 00; REX.B distinguishes them in the SIB base field. See
- # amd_mem_rm for the same rule.
- need_sib = (b & 7) == 4
- sib = '24' if need_sib else ''
- rmv = 4 if need_sib else b
- if store:
- # MOV r/m8, r8 — 88 /r. Requires REX to address dil/sil/bpl/spl.
- prefix = rex(1, r >> 3, 0, b >> 3) + '88'
- return prefix + modrm(mod, r, rmv) + sib + d
- else:
- # MOVZX r64, r/m8 — REX.W 0F B6 /r.
- prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6'
- return prefix + modrm(mod, r, rmv) + sib + d
-
-
-## ---------- aarch64 primitive encoders ----------------------------------
-## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded.
-
-def aa_rrr(base, rD, rA, rB):
- d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB]
- return le32(base | (b << 16) | (a << 5) | d)
-
-def aa_add_imm(rD, rA, imm12, sub=False):
- """ADD/SUB (immediate, shift=0). imm12 unsigned 0..4095."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- base = 0xD1000000 if sub else 0x91000000
- return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
-
-def aa_logical_imm(base, rD, rA, N, immr, imms):
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_ubfm(rD, rA, immr, imms):
- """UBFM (N=1 for sf=64)."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_sbfm(rD, rA, immr, imms):
- """SBFM (N=1 for sf=64)."""
- d, a = NAT_AA64[rD], NAT_AA64[rA]
- return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
-
-def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2):
- """LDR/STR (unsigned offset). off_bytes must be a multiple of
- 2^size_log2 and non-negative. imm12 = off_bytes >> size_log2."""
- assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0
- imm12 = off_bytes >> size_log2
- assert 0 <= imm12 < 4096
- t, n = NAT_AA64[rT], NAT_AA64[rN]
- return le32(base | (imm12 << 10) | (n << 5) | t)
-
-def aa_ldst_unscaled(base, rT, rN, off):
- """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small
- offsets — negative, or positive-but-not-a-multiple-of-the-access-
- size (e.g. LD at offset 7). imm9 range is [-256, 255]."""
- assert -256 <= off <= 255
- imm9 = off & 0x1FF
- t, n = NAT_AA64[rT], NAT_AA64[rN]
- return le32(base | (imm9 << 12) | (n << 5) | t)
-
-def aa_movz(rD, imm16):
- """MOVZ rD, #imm16 — load a 16-bit immediate into the low half of
- rD, zeroing the rest. Use this instead of `add rD, xzr, #imm`:
- aarch64 ADD encodes register 31 as SP (not XZR), so the obvious
- `mov xN, #imm` synthesis silently computes sp + imm."""
- d = NAT_AA64[rD]
- assert 0 <= imm16 < (1 << 16)
- return le32(0xD2800000 | (imm16 << 5) | d)
-
-
-## ---------- riscv64 primitive encoders ----------------------------------
-
-def rv_r(base, rD, rA, rB):
- d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB]
- return le32(base | (b << 20) | (a << 15) | (d << 7))
-
-def rv_i(base, rD, rA, imm12):
- """I-type: imm12[11:0], rs1, funct3, rd, opcode. imm12 is a signed
- int that gets masked to 12 bits."""
- d, a = NAT_RV64[rD], NAT_RV64[rA]
- return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))
-
-def rv_s(base, rS, rA, imm12):
- """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode."""
- s, a = NAT_RV64[rS], NAT_RV64[rA]
- hi = (imm12 >> 5) & 0x7F
- lo = imm12 & 0x1F
- return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))
-
-def rv_shift_imm(base, rD, rA, shamt):
- """Shift-imm: base already has funct7 set; shamt in [0,63]."""
- d, a = NAT_RV64[rD], NAT_RV64[rA]
- return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))
-
-
-## ---------- Per-arch op base tables -------------------------------------
-
-AA64_RRR_BASE = {
- 'ADD': 0x8B000000,
- 'SUB': 0xCB000000,
- 'AND': 0x8A000000,
- 'OR': 0xAA000000,
- 'XOR': 0xCA000000,
- 'SHL': 0x9AC02000,
- 'SHR': 0x9AC02400,
- 'SAR': 0x9AC02800,
- 'DIV': 0x9AC00C00,
-}
-AMD64_RRR_OPC = {
- 'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31',
-}
-RV_RRR = {
- 'ADD': 0x00000033, # funct7=0 funct3=0 opcode=0x33
- 'SUB': 0x40000033,
- 'XOR': 0x00004033,
- 'OR': 0x00006033,
- 'AND': 0x00007033,
- 'SHL': 0x00001033,
- 'SHR': 0x00005033,
- 'SAR': 0x40005033,
- 'MUL': 0x02000033,
- 'DIV': 0x02004033,
- 'REM': 0x02006033,
-}
-
-
-## aarch64 bitmask-immediate encoding for ANDI/ORI. Entries are the
-## (N, immr, imms) triples that encode each small imm as an aarch64
-## "logical immediate." Computed by hand because the full encoding
-## algorithm (contiguous-run + rotation for element sizes
-## 2/4/8/16/32/64) is substantial and we only need a handful of
-## values. Extend this table if a new imm shows up in P1 source.
-AA64_LOGI_ENC = {
- 1: (1, 0, 0), # 0b0001 — single bit at position 0
- 2: (1, 63, 0), # 0b0010 — single bit at position 1
- 3: (1, 0, 1), # 0b0011 — 2 contiguous ones
- 4: (1, 62, 0), # 0b0100 — single bit at position 2
- 6: (1, 63, 1), # 0b0110 — 2 ones rotated by 1
- 7: (1, 0, 2), # 0b0111 — 3 contiguous ones
- 8: (1, 61, 0), # 0b1000 — single bit at position 3
-}
-
-
-## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame
-## bytes stay 16-byte aligned):
-## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
-## [sp + 8] = saved_fp (caller's sp at call/prologue entry)
-## [sp + 16] = k (slot count)
-## [sp + 24] = slot 1 (callee-private scratch)
-## [sp + 32] = slot 2
-## ...
-##
-## Frame size = round_up_to_16(24 + 8*k). So k=1 → 32, k=2 → 40 → 48,
-## k=3 → 48, k=4 → 56 → 64.
-
-def prologue_frame_bytes(k: int) -> int:
- raw = 24 + 8 * k
- return (raw + 15) & ~15
-
-
-## ---------- Encoders ----------------------------------------------------
-## One class per arch. Each provides one method per P1 op category,
-## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch
-## here via `Op.encode(enc)` → `enc.<method>(fields)`.
-
-class Encoder:
- """Per-arch encoder base. Subclasses implement one method per
- op category. `arch` is used by literal() to pick the right
- pre-encoded bytes from an arch-keyed dict."""
- arch = ''
-
- def literal(self, hex_by_arch):
- return hex_by_arch[self.arch]
-
-
-class AA64(Encoder):
- arch = 'aarch64'
-
- def rrr(self, op, rD, rA, rB):
- if op == 'MUL':
- # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
- d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
- return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
- if op == 'REM':
- # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
- # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so
- # REM does not hidden-clobber P1 r4 — the op modifies rD only.
- # MSUB needs bit 15 set (o0=1); without it it decodes as
- # MADD and REM returns A + (A/B)*B.
- d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
- SC = 16
- sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
- msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d
- return le32(sdiv) + le32(msub)
- return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB)
-
- def addi(self, rD, rA, imm):
- if imm >= 0:
- return aa_add_imm(rD, rA, imm, sub=False)
- return aa_add_imm(rD, rA, -imm, sub=True)
-
- def logi(self, op, rD, rA, imm):
- N, immr, imms = AA64_LOGI_ENC[imm]
- base = 0x92000000 if op == 'ANDI' else 0xB2000000 # ORI = orr
- return aa_logical_imm(base, rD, rA, N, immr, imms)
-
- def shifti(self, op, rD, rA, imm):
- if op == 'SHLI':
- return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm)
- if op == 'SHRI':
- return aa_ubfm(rD, rA, imm, 63)
- if op == 'SARI':
- return aa_sbfm(rD, rA, imm, 63)
-
- def mov(self, rD, rA):
- if rD == 'sp':
- return aa_add_imm('sp', rA, 0, sub=False)
- if rA == 'sp':
- return aa_add_imm(rD, 'sp', 0, sub=False)
- # MOV xD, xA = ORR xD, xzr, xA
- d = NAT_AA64[rD]; a = NAT_AA64[rA]
- return le32(0xAA000000 | (a << 16) | (31 << 5) | d)
-
- def li(self, rD):
- # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next)
- d = NAT_AA64[rD]
- ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8
- b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes)
- return le32(ldr_w_lit) + le32(b_plus8)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- # Pick uimm12 (scaled, large range) when the offset is a
- # non-negative multiple of the access width; otherwise fall
- # back to the unscaled signed-imm9 form (covers negative
- # offsets and positive-but-misaligned ones like 7).
- BASES = {
- 'LD': (0xF9400000, 3, 0xF8400000),
- 'ST': (0xF9000000, 3, 0xF8000000),
- 'LB': (0x39400000, 0, 0x38400000),
- 'SB': (0x39000000, 0, 0x38000000),
- }
- uimm_base, size_log2, unscaled_base = BASES[op]
- scale = 1 << size_log2
- if off >= 0 and (off % scale) == 0:
- return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2)
- return aa_ldst_unscaled(unscaled_base, rT, rN, off)
-
- def b(self):
- return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17
-
- def condb(self, op, rA, rB):
- # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31).
- # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A).
- a = NAT_AA64[rA]; b_ = NAT_AA64[rB]
- cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31)
- cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op]
- bcond = le32(0x54000040 | cond)
- br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
- return cmp_ + bcond + br
-
- def condbz(self, op, rA):
- # Branch-if-zero family. BEQZ/BNEZ use CBZ/CBNZ (skip br on
- # the inverse cond); BLTZ falls back to CMP+B.GE since there's
- # no single signed-less-than-zero insn.
- a = NAT_AA64[rA]
- br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
- if op == 'BEQZ':
- return le32(0xB5000000 | (2 << 5) | a) + br # CBNZ rA, +8 ; BR
- if op == 'BNEZ':
- return le32(0xB4000000 | (2 << 5) | a) + br # CBZ rA, +8 ; BR
- if op == 'BLTZ':
- cmp_ = le32(0xEB000000 | (31 << 16) | (a << 5) | 31) # SUBS xzr, rA, xzr
- bge = le32(0x54000040 | 10) # b.ge +8
- return cmp_ + bge + br
-
- def call(self):
- return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17
-
- def ret(self):
- return le32(0xD65F03C0) # RET (= br x30)
-
- def prologue(self, k):
- fb = prologue_frame_bytes(k)
- sub = aa_add_imm('sp', 'sp', fb, sub=True)
- str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
- save_fp = aa_add_imm('x8', 'sp', fb, sub=False)
- str_fp = aa_ldst_uimm12(0xF9000000, 'x8', 'sp', 8, 3)
- # MUST use MOVZ here: ADD imm with Rn=31 reads SP, not XZR,
- # so `add x8, xzr, #k` would store sp+k at [sp+16] instead
- # of the slot count k. The pre-GC code never read [sp+16]
- # so the bug stayed latent until the GC walker showed up.
- mov_k = aa_movz('x8', k)
- str_k = aa_ldst_uimm12(0xF9000000, 'x8', 'sp', 16, 3)
- # Zero each slot. The GC walker reads every slot through
- # mark_value, which dereferences any value whose tag bits
- # match a heap tag. Stale stack data left here from prior
- # frames can otherwise look like a tagged pointer and chase
- # the marker into garbage. See LISP-GC.md §"Roots".
- zero_slots = ''
- for i in range(k):
- zero_slots += aa_ldst_uimm12(0xF9000000, 'xzr', 'sp', 24 + 8*i, 3)
- return sub + str_lr + save_fp + str_fp + mov_k + str_k + zero_slots
-
- def epilogue(self, k):
- ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
- ldr_fp = aa_ldst_uimm12(0xF9400000, 'x8', 'sp', 8, 3)
- mov_sp = aa_add_imm('sp', 'x8', 0, sub=False)
- return ldr_lr + ldr_fp + mov_sp
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-class AMD64(Encoder):
- arch = 'amd64'
-
- def rrr(self, op, rD, rA, rB):
- if op == 'MUL':
- # If rD aliases rB, save rB to rcx first (mov rD,rA would
- # clobber it before imul reads it).
- if rD == rB and rA != rB:
- return amd_mov_rr('rcx', rB) + amd_mov_rr(rD, rA) + amd_imul_rr(rD, 'rcx')
- return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB)
- if op in ('DIV', 'REM'):
- # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
- # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
- # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
- # then restore. If rA or rB alias r0/r3, read from the
- # saved copy since we've overwritten the originals.
- # Skip the final restore for whichever of r0/r3 *is* rD,
- # so rD keeps its newly computed value.
- seq = amd_mov_rr('r11', 'r0') # save r0 (rax)
- seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx)
- src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA)
- seq += amd_mov_rr('r0', src_a) # rax = rA
- seq += amd_cqo() # rdx:rax = sign-ext rax
- src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB)
- seq += amd_idiv(src_b)
- seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3')
- if rD != 'r3':
- seq += amd_mov_rr('r3', 'rcx')
- if rD != 'r0':
- seq += amd_mov_rr('r0', 'r11')
- return seq
- if op in ('SHL', 'SHR', 'SAR'):
- ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op]
- # Save rB → rcx FIRST so a subsequent mov rD,rA (which may
- # alias rB) doesn't clobber the shift count.
- seq = amd_mov_rr('rcx', rB)
- if rD != rA:
- seq += amd_mov_rr(rD, rA)
- seq += amd_shift_cl(ext, rD)
- return seq
- # ADD/SUB/AND/OR/XOR. If rD aliases rB (e.g., sub_r3,r0,r3), the
- # naive mov rD,rA overwrites rB before the op reads it — funnel
- # rB through rcx.
- opcode = AMD64_RRR_OPC[op]
- if rD == rB and rA != rB:
- seq = amd_mov_rr('rcx', rB)
- seq += amd_mov_rr(rD, rA)
- seq += amd_alu_rr(opcode, rD, 'rcx')
- return seq
- seq = '' if rD == rA else amd_mov_rr(rD, rA)
- seq += amd_alu_rr(opcode, rD, rB)
- return seq
-
- def addi(self, rD, rA, imm):
- # mov rD,rA ; add rD,imm. Use imm8 form when it fits
- # ([-128, 127]); otherwise emit the imm32 form.
- seq = amd_mov_rr(rD, rA)
- if -128 <= imm <= 127:
- seq += amd_alu_ri8(0, rD, imm) # /0 = ADD
- else:
- seq += amd_alu_ri32(0, rD, imm)
- return seq
-
- def logi(self, op, rD, rA, imm):
- ext = {'ANDI': 4, 'ORI': 1}[op]
- seq = amd_mov_rr(rD, rA)
- seq += amd_alu_ri8(ext, rD, imm)
- return seq
-
- def shifti(self, op, rD, rA, imm):
- ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op]
- seq = amd_mov_rr(rD, rA)
- seq += amd_shift_ri8(ext, rD, imm)
- return seq
-
- def mov(self, rD, rA):
- return amd_mov_rr(rD, rA)
-
- def li(self, rD):
- # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15)
- d = NAT_AMD64[rD]
- if d >= 8:
- return '41' + byte(0xB8 + (d & 7))
- return byte(0xB8 + d)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- if op == 'LD': return amd_mem_rm('8B', rT, rN, off)
- if op == 'ST': return amd_mem_rm('89', rT, rN, off)
- if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False)
- if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True)
-
- def b(self):
- return '41FFE3' # jmp r11
-
- def condb(self, op, rA, rB):
- a, b_ = NAT_AMD64[rA], NAT_AMD64[rB]
- # cmp rA, rB — opcode 39 /r with rA as r/m
- cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a)
- # jcc rel8 opcode, skip=3 (past jmp r11):
- # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
- jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op]
- return cmp_ + jop + '03' + '41FFE3' # jmp r11
-
- def condbz(self, op, rA):
- a = NAT_AMD64[rA]
- # test rA, rA — 85 /r with rA in both slots; sets ZF iff rA == 0,
- # SF iff rA < 0. Same skip+jmp pattern as condb.
- test_ = rex(1, a >> 3, 0, a >> 3) + '85' + modrm(3, a, a)
- jop = {'BEQZ': '75', 'BNEZ': '74', 'BLTZ': '7D'}[op]
- return test_ + jop + '03' + '41FFE3' # jmp r11
-
- def call(self):
- return '41FFD3' # call r11
-
- def ret(self):
- return 'C3'
-
- def prologue(self, k):
- # pop rcx ; mov rax,rsp ; sub rsp,fb ; [rsp]=rcx ; [rsp+8]=rax ;
- # xor rax,rax ; add rax,k ; [rsp+16]=rax.
- # rcx carries the retaddr, rax carries saved_fp then k. Neither
- # is a live incoming P1 argument register at function entry.
- # Each slot is then zeroed so a stale stack value can't masquerade
- # as a tagged pointer when the GC walks this frame. See
- # LISP-GC.md §"Roots".
- fb = prologue_frame_bytes(k)
- assert fb <= 127
- seq = '59' # pop rcx
- seq += amd_mov_rr('r0', 'sp') # mov rax, rsp
- seq += '4883EC' + byte(fb) # sub rsp, fb
- seq += amd_mem_rm('89', 'rcx', 'sp', 0) # [rsp] = rcx
- seq += amd_mem_rm('89', 'r0', 'sp', 8) # [rsp+8] = rax
- seq += amd_alu_rr('31', 'r0', 'r0') # xor rax, rax
- seq += amd_alu_ri8(0, 'r0', k) # add rax, k
- seq += amd_mem_rm('89', 'r0', 'sp', 16) # [rsp+16] = rax
- seq += amd_alu_rr('31', 'r0', 'r0') # xor rax, rax
- for i in range(k):
- seq += amd_mem_rm('89', 'r0', 'sp', 24 + 8*i)
- return seq
-
- def epilogue(self, k):
- # r10 = retaddr ; rcx = saved_fp ; rsp = saved_fp ; push r10.
- # Keep r0/rax intact so function return values survive EPILOGUE.
- seq = amd_mem_rm('8B', 'r10', 'sp', 0) # r10 = [rsp]
- seq += amd_mem_rm('8B', 'rcx', 'sp', 8) # rcx = [rsp+8]
- seq += amd_mov_rr('sp', 'rcx') # rsp = rcx
- seq += '4152' # push r10
- return seq
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-class RV64(Encoder):
- arch = 'riscv64'
-
- def rrr(self, op, rD, rA, rB):
- return rv_r(RV_RRR[op], rD, rA, rB)
-
- def addi(self, rD, rA, imm):
- return rv_i(0x00000013, rD, rA, imm)
-
- def logi(self, op, rD, rA, imm):
- base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op]
- return rv_i(base, rD, rA, imm)
-
- def shifti(self, op, rD, rA, imm):
- base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op]
- return rv_shift_imm(base, rD, rA, imm)
-
- def mov(self, rD, rA):
- return rv_i(0x00000013, rD, rA, 0) # addi rD, rA, 0
-
- def li(self, rD):
- # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
- d = NAT_RV64[rD]
- auipc = 0x00000017 | (d << 7)
- lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
- jal_p8 = 0x0080006F
- return le32(auipc) + le32(lwu) + le32(jal_p8)
-
- def la(self, rD):
- return self.li(rD)
-
- def mem(self, op, rT, rN, off):
- # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
- if op == 'LD': return rv_i(0x00003003, rT, rN, off)
- if op == 'ST': return rv_s(0x00003023, rT, rN, off)
- if op == 'LB': return rv_i(0x00004003, rT, rN, off) # LBU
- if op == 'SB': return rv_s(0x00000023, rT, rN, off)
-
- def b(self):
- return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5)
-
- def condb(self, op, rA, rB):
- # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op:
- # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
- a, b_ = NAT_RV64[rA], NAT_RV64[rB]
- funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op]
- insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7)
- jalr = 0x00000067 | (NAT_RV64['br'] << 15)
- return le32(insn) + le32(jalr)
-
- def condbz(self, op, rA):
- # Same shape as condb but rs2 = x0 (zero reg). BEQZ/BNEZ/BLTZ
- # invert the same way (BNE/BEQ/BGE).
- a = NAT_RV64[rA]
- funct3 = {'BEQZ': 1, 'BNEZ': 0, 'BLTZ': 5}[op]
- insn = 0x00000063 | (funct3 << 12) | (a << 15) | (0 << 20) | (8 << 7)
- jalr = 0x00000067 | (NAT_RV64['br'] << 15)
- return le32(insn) + le32(jalr)
-
- def call(self):
- return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5)
-
- def ret(self):
- return le32(0x00008067) # jalr x0, 0(ra)
-
- def prologue(self, k):
- fb = prologue_frame_bytes(k)
- sub = rv_i(0x00000013, 'sp', 'sp', -fb)
- sd = rv_s(0x00003023, 'ra', 'sp', 0)
- fp = rv_i(0x00000013, 't0', 'sp', fb)
- sd_fp = rv_s(0x00003023, 't0', 'sp', 8)
- k_imm = rv_i(0x00000013, 't0', 'zero', k)
- sd_k = rv_s(0x00003023, 't0', 'sp', 16)
- # Zero each slot to keep GC walks safe; see LISP-GC.md §"Roots".
- zero_slots = ''
- for i in range(k):
- zero_slots += rv_s(0x00003023, 'zero', 'sp', 24 + 8*i)
- return sub + sd + fp + sd_fp + k_imm + sd_k + zero_slots
-
- def epilogue(self, k):
- ld = rv_i(0x00003003, 'ra', 'sp', 0)
- ld_fp = rv_i(0x00003003, 't0', 'sp', 8)
- mov_sp = rv_i(0x00000013, 'sp', 't0', 0)
- return ld + ld_fp + mov_sp
-
- def tail(self, k):
- return self.epilogue(k) + self.b()
-
-
-ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()}
-
-
-## ---------- Op dataclasses ----------------------------------------------
-## Thin wrappers: each row holds its DEFINE name + the data needed to
-## reconstruct the encoding. `encode(enc)` calls the matching method
-## on the arch's encoder.
-
-@dataclass
-class Op:
- name: str
- comment: str = ''
-
- def encode(self, enc: Encoder) -> str:
- raise NotImplementedError
-
-@dataclass
-class RRR(Op):
- op: str = ''
- rD: str = ''
- rA: str = ''
- rB: str = ''
- def encode(self, enc):
- return enc.rrr(self.op, self.rD, self.rA, self.rB)
-
-@dataclass
-class AddI(Op):
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.addi(self.rD, self.rA, self.imm)
-
-@dataclass
-class LogI(Op):
- op: str = '' # ANDI / ORI
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.logi(self.op, self.rD, self.rA, self.imm)
-
-@dataclass
-class ShiftI(Op):
- op: str = '' # SHLI / SHRI / SARI
- rD: str = ''
- rA: str = ''
- imm: int = 0
- def encode(self, enc):
- return enc.shifti(self.op, self.rD, self.rA, self.imm)
-
-@dataclass
-class Mov(Op):
- rD: str = ''
- rA: str = ''
- def encode(self, enc):
- return enc.mov(self.rD, self.rA)
-
-@dataclass
-class Li(Op):
- rD: str = ''
- def encode(self, enc):
- return enc.li(self.rD)
-
-@dataclass
-class La(Op):
- rD: str = ''
- def encode(self, enc):
- return enc.la(self.rD)
-
-@dataclass
-class Mem(Op):
- op: str = '' # LD / ST / LB / SB
- rT: str = ''
- rN: str = ''
- off: int = 0
- def encode(self, enc):
- return enc.mem(self.op, self.rT, self.rN, self.off)
-
-@dataclass
-class B(Op):
- def encode(self, enc):
- return enc.b()
-
-@dataclass
-class CondB(Op):
- op: str = '' # BEQ / BNE / BLT
- rA: str = ''
- rB: str = ''
- def encode(self, enc):
- return enc.condb(self.op, self.rA, self.rB)
-
-@dataclass
-class CondBZ(Op):
- op: str = '' # BEQZ / BNEZ / BLTZ
- rA: str = ''
- def encode(self, enc):
- return enc.condbz(self.op, self.rA)
-
-@dataclass
-class Literal(Op):
- hex_by_arch: Optional[dict] = None
- def encode(self, enc):
- return enc.literal(self.hex_by_arch)
-
-@dataclass
-class Prologue(Op):
- k: int = 1
- def encode(self, enc):
- return enc.prologue(self.k)
-
-@dataclass
-class Epilogue(Op):
- k: int = 1
- def encode(self, enc):
- return enc.epilogue(self.k)
-
-@dataclass
-class Tail(Op):
- k: int = 1
- def encode(self, enc):
- return enc.tail(self.k)
-
-@dataclass
-class Call(Op):
- def encode(self, enc):
- return enc.call()
-
-@dataclass
-class Ret(Op):
- def encode(self, enc):
- return enc.ret()
-
-
-## ---------- SYSCALL pre-encoded sequences -------------------------------
-## The one-shot syscall wrapper. Shuffles P1's r0=num, r1–r6=args into
-## each arch's native syscall ABI and clobbers only r0 on return.
-## Encoded by hand (per P1.md §"Syscall conventions").
-
-SYSCALL_HEX = {
- 'aarch64': (
- # r4/r5 now live in callee-saved natives (x26/x27), so the
- # kernel preserves them — no save/restore needed. Only r1/r2/r3
- # (in caller-saved x1/x2/x3) must be stashed across the shuffle.
- '' .join([
- le32(0xAA0003E8), # mov x8, x0 (syscall number)
- le32(0xAA0103F5), # mov x21, x1 (save r1)
- le32(0xAA0203F6), # mov x22, x2 (save r2)
- le32(0xAA0303F7), # mov x23, x3 (save r3)
- le32(0xAA1503E0), # mov x0, x21 (arg1 = r1)
- le32(0xAA1603E1), # mov x1, x22 (arg2 = r2)
- le32(0xAA1703E2), # mov x2, x23 (arg3 = r3)
- le32(0xAA1A03E3), # mov x3, x26 (arg4 = r4)
- le32(0xAA1B03E4), # mov x4, x27 (arg5 = r5)
- le32(0xAA1303E5), # mov x5, x19 (arg6 = r6)
- le32(0xD4000001), # svc #0
- le32(0xAA1503E1), # mov x1, x21 (restore r1)
- le32(0xAA1603E2), # mov x2, x22
- le32(0xAA1703E3), # mov x3, x23
- ])
- ),
- # r4=r13, r5=r14 are callee-saved natively, but syscall wants args
- # 4/5 in r10/r8. r6=rbx, but arg6 lives in r9. Three shuffle moves,
- # then syscall. The kernel preserves rdi/rsi/rdx/r12–r15/rbx, so no
- # P1 reg is clobbered beyond r0 (syscall return).
- 'amd64': '4D89EA' + '4D89F0' + '4989D9' + '0F05',
- 'riscv64': (
- # Same story as aarch64: r4/r5 in callee-saved s4/s5 (=x20/x21),
- # so we only save/restore a1/a2/a3. Scratch slots: s3, s6, s7.
- ''.join([
- le32(0x00050893), # mv a7, a0 (syscall number)
- le32(0x00058993), # mv s3, a1 (save r1)
- le32(0x00060B13), # mv s6, a2 (save r2)
- le32(0x00068B93), # mv s7, a3 (save r3)
- le32(0x00098513), # mv a0, s3 (arg1 = r1)
- le32(0x000B0593), # mv a1, s6 (arg2 = r2)
- le32(0x000B8613), # mv a2, s7 (arg3 = r3)
- le32(0x000A0693), # mv a3, s4 (arg4 = r4)
- le32(0x000A8713), # mv a4, s5 (arg5 = r5)
- le32(0x00048793), # mv a5, s1 (arg6 = r6)
- le32(0x00000073), # ecall
- le32(0x00098593), # mv a1, s3 (restore r1)
- le32(0x000B0613), # mv a2, s6
- le32(0x000B8693), # mv a3, s7
- ])
- ),
-}
-
-## Syscall numbers (little-endian 32-bit for LI operand).
-## aarch64 and riscv64 share the asm-generic table; amd64 has its own.
-##
-## Portability notes — every entry below is a syscall that exists on all
-## three with the same semantics under the uniform P1 SYSCALL convention
-## (r0 = num, r1-r6 = args):
-## - `fork` is amd64-only; `wait4` is asm-generic 32-bit compat only.
-## Use `clone(SIGCHLD)` and `waitid` instead.
-## - `open` is amd64-only (removed from asm-generic). Use `openat` with
-## dirfd = AT_FDCWD (-100) as arg1.
-## - `clone` arg order differs: amd64 is (flags, stack, ptid, ctid, tls);
-## aarch64/riscv64 are (flags, stack, ptid, tls, ctid). Benign when
-## ptid/ctid/tls are all zero (the fork-equivalent case).
-SYS_NUM = {
- 'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
- 'SYS_OPENAT': 56,
- 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
- 'amd64': {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3,
- 'SYS_OPENAT':257,
- 'SYS_CLONE': 56, 'SYS_EXECVE': 59, 'SYS_WAITID':247},
- 'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
- 'SYS_OPENAT': 56,
- 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
-}
-
-
-## ---------- Canonical imm/offset/shamt sets -----------------------------
-## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex
-## bytes verbatim, so every distinct imm value needs its own DEFINE.
-## These cover every value used across hello/demo/lisp plus a little
-## headroom. Extend when a new value appears in P1 src.
-
-## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag
-## stripping, loop counters, and — at 40 — the primitive-registration
-## table's per-record stride. Full reg product × this set = 8²×N.
-ADDI_IMMS = tuple(range(-128, 129)) + (256, 320)
-
-## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks
-## (length field extraction; 4096-slot symbol-table index); the small
-## values scale-by-N for byte offsets and fixnum encode/decode.
-SHIFT_IMMS = (1, 2, 3, 4, 5, 13, 16, 32, 52)
-
-## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC.
-LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8)
-
-## Memory offsets for LD/ST/LB/SB. 8*(0-40) covers record fields used by
-## the bootstrap expander's token, macro, stream, and scratch layouts.
-## 6/7 remain for legacy byte tests; -8 reaches one slot below a base.
-MEM_OFFS = (-8, 0, 6, 7) + tuple(range(8, 328, 8))
-
-CONDB_OPS = ('BEQ', 'BNE', 'BLT')
-CONDBZ_OPS = ('BEQZ', 'BNEZ', 'BLTZ')
-SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
-LOGI_OPS = ('ANDI', 'ORI')
-MEM_OPS = ('LD', 'ST', 'LB', 'SB')
-
-
-## Curated RRR triples. The full cube is 11 ops × 8³ regs = 5632
-## entries per arch — >99% would be dead weight. Each tuple below
-## is one actually used by hello/demo/lisp. Lint catches missing
-## triples on assembly; add a line here and regenerate.
-RRR_TABLE = (
- # demo/lisp step-1 arith cube
- ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'),
- ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'),
- ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
- ('AND','r1','r1','r5'),
- ('OR', 'r1','r1','r2'),
- ('XOR','r1','r1','r2'),
- ('MUL','r1','r1','r2'),
- ('DIV','r1','r1','r2'),
- ('REM','r1','r1','r5'),
- ('SHL','r1','r1','r2'),
- ('SHL','r2','r2','r1'),
- ('SHR','r1','r1','r2'),
- ('SAR','r4','r4','r2'),
- # alloc / pointer arithmetic
- ('ADD','r2','r0','r1'),
- ('ADD','r2','r1','r0'),
- ('ADD','r0','r0','r1'),
- ('ADD','r0','r0','r2'),
- ('ADD','r0','r0','r3'),
- ('ADD','r2','r2','r0'),
- ('ADD','r2','r2','r1'),
- ('SUB','r1','r1','r0'),
- ('SUB','r2','r1','r2'),
- ('SUB','r3','r3','r0'),
- ('AND','r1','r1','r2'),
- ('AND','r3','r3','r2'),
- ('OR', 'r3','r3','r2'),
- # reader / display index+offset fold
- ('ADD','r6','r1','r2'),
- ('ADD','r6','r6','r0'),
- ('ADD','r7','r1','r2'),
- # m1m bootstrap file I/O buffer indexing.
- ('ADD','r2','r2','r5'),
- ('ADD','r2','r2','r7'),
- ('SUB','r2','r1','r6'),
- ('SUB','r3','r1','r6'),
- ('SUB','r2','r2','r1'), # prelude length = end - start
- # m1m bootstrap file I/O remaining-byte counts.
- ('SUB','r3','r3','r7'),
- ('SUB','r3','r7','r5'),
- ('REM','r1','r1','r2'),
- # bump-pointer + accumulator updates (originally kaem-minimal;
- # retained in case lisp uses them — lint catches dead entries)
- ('ADD','r1','r1','r0'),
- ('ADD','r5','r5','r0'),
- ('ADD','r7','r7','r0'),
- ('SUB','r3','r3','r2'),
- ('SUB','r6','r6','r0'),
- # Primitive bodies (LISP.md step 10c). Convention: r1=argc,
- # r2=argv (both input), r3=accumulator, r0=scratch/return.
- # Variadic folds: (r3 = r3 op r0). Unary negate / bit-not:
- # (r3 = r0 - r3) with r0 = 0 or -1. arithmetic-shift k-negate:
- # (r0 = r1 - r0) with r1 = 0 after argc is consumed.
- ('ADD','r3','r3','r0'),
- ## ('SUB','r3','r3','r0') — already above
- ('SUB','r3','r0','r3'),
- ('SUB','r0','r1','r0'),
- ('MUL','r3','r3','r0'),
- ('DIV','r3','r3','r0'),
- ('REM','r3','r3','r0'),
- ('AND','r3','r3','r0'),
- ('OR', 'r3','r3','r0'),
- ('XOR','r3','r3','r0'),
- ('SHL','r3','r3','r0'),
- ('SAR','r3','r3','r0'),
-)
-
-
-## ---------- Row assembly ------------------------------------------------
-
-HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
-##
-## Shared op-table lives in p1_gen.py; each arch's encoder lowers
-## (op, register-tuple, imm) rows into native bytes. See P1.md for the
-## ISA spec and register mapping.
-"""
-
-@dataclass
-class Banner:
- text: str
-
-
-def _imm_suf(imm):
- return f'NEG{-imm}' if imm < 0 else f'{imm}'
-
-
-def rows():
- R = []
-
- # --- LI / LA — wide literal and address loads ---
- R.append(Banner('LI / LA — load 4-byte zero-extended literal or label addr'))
- for rd in P1_REGS:
- R.append(Li(name=f'LI_{rd.upper()}', rD=rd))
- # LI_BR loads into the hidden branch-target scratch (x17/r11/t5).
- # Every branch/call site is `LI_BR &target ; P1_<BR>`. The scratch
- # is *not* a P1 reg.
- R.append(Li(name='LI_BR', rD='br'))
- for rd in P1_REGS:
- R.append(La(name=f'LA_{rd.upper()}', rD=rd))
-
- # --- MOV — register-to-register + MOV rD, sp ---
- R.append(Banner('MOV — full register product (src may be sp)'))
- for rd in P1_REGS:
- for ra in P1_REGS:
- R.append(Mov(name=f'MOV_{rd.upper()}_{ra.upper()}', rD=rd, rA=ra))
- R.append(Mov(name=f'MOV_{rd.upper()}_SP', rD=rd, rA='sp'))
-
- # --- RRR — full cube. m1m.M1 is record-heavy enough that keeping this
- # complete is simpler and safer than curating one tuple at a time. The
- # Makefile still prunes unused DEFINEs before feeding M0.
- R.append(Banner('RRR — full register product'))
- all_rrr = tuple((op, d, a, b)
- for op in AA64_RRR_BASE
- for d in P1_REGS
- for a in P1_REGS
- for b in P1_REGS)
- for op, d, a, b in all_rrr:
- R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}',
- op=op, rD=d, rA=a, rB=b))
-
- # --- Immediate arith: ADDI × full reg product × imm set ---
- R.append(Banner('ADDI — full register product × ADDI_IMMS'))
- for d, a, imm in product(P1_REGS, P1_REGS, ADDI_IMMS):
- R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{_imm_suf(imm)}',
- rD=d, rA=a, imm=imm))
-
- # --- ANDI / ORI × full reg product × LOGI_IMMS ---
- R.append(Banner('ANDI / ORI — full register product × LOGI_IMMS'))
- for op, d, a, imm in product(LOGI_OPS, P1_REGS, P1_REGS, LOGI_IMMS):
- R.append(LogI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
- op=op, rD=d, rA=a, imm=imm))
-
- # --- SHLI / SHRI / SARI × full reg product × SHIFT_IMMS ---
- R.append(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS'))
- for op, d, a, imm in product(SHIFT_OPS, P1_REGS, P1_REGS, SHIFT_IMMS):
- R.append(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
- op=op, rD=d, rA=a, imm=imm))
-
- # --- Memory: LD/ST/LB/SB × full reg product × MEM_OFFS ---
- R.append(Banner('LD / ST / LB / SB — full register product × MEM_OFFS'))
- for op, rt, rn, off in product(MEM_OPS, P1_REGS, P1_REGS, MEM_OFFS):
- R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}',
- op=op, rT=rt, rN=rn, off=off))
-
- # --- SP-relative memory: LD/ST/LB/SB × P1_REGS × MEM_OFFS ---
- # Lets callees touch PROLOGUE_N* frame slots directly, skipping the
- # MOV_Rk_SP trampoline that previously bridged P1 regs to sp-base.
- R.append(Banner('LD / ST / LB / SB — SP-base × MEM_OFFS'))
- for op, rt, off in product(MEM_OPS, P1_REGS, MEM_OFFS):
- R.append(Mem(name=f'{op}_{rt.upper()}_SP_{_imm_suf(off)}',
- op=op, rT=rt, rN='sp', off=off))
-
- # --- Branches: BEQ/BNE/BLT × full reg product + unconditional B ---
- R.append(Banner('Branches — LI_BR-indirect pattern'))
- R.append(B(name='B'))
- for op, a, b in product(CONDB_OPS, P1_REGS, P1_REGS):
- R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}',
- op=op, rA=a, rB=b))
-
- # --- Zero-compare branches: BEQZ/BNEZ/BLTZ × P1_REGS ---
- # Compare-with-zero is the common case (null pointer, fd<0, EOF,
- # loop-count exhausted), so we give it a single-insn path that
- # doesn't consume a scratch reg for the zero operand.
- for op, a in product(CONDBZ_OPS, P1_REGS):
- R.append(CondBZ(name=f'{op}_{a.upper()}', op=op, rA=a))
-
- # --- Control: CALL / RET / PROLOGUE / EPILOGUE / TAIL (Nk = 1..4) ---
- R.append(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL'))
- R.append(Prologue(name='PROLOGUE', k=1))
- R.append(Epilogue(name='EPILOGUE', k=1))
- R.append(Ret(name='RET'))
- R.append(Call(name='CALL'))
- R.append(Tail(name='TAIL', k=1))
- for k in (2, 3, 4):
- R.append(Prologue(name=f'PROLOGUE_N{k}', k=k))
- R.append(Epilogue(name=f'EPILOGUE_N{k}', k=k))
- R.append(Tail(name=f'TAIL_N{k}', k=k))
-
- # --- SYSCALL — pre-encoded per-arch wrapper ---
- R.append(Banner('SYSCALL — uniform "clobbers r0 only" across arches'))
- R.append(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))
-
- # --- Syscall numbers (LE-32 immediates) ---
- R.append(Banner('Linux syscall numbers (per-arch table). LE-32 operands for LI.'))
- for name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE', 'SYS_OPENAT',
- 'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'):
- R.append(Literal(name=name,
- hex_by_arch={a: le32(SYS_NUM[a][name]) for a in ARCHES}))
-
- return R
-
-
-## ---------- File emission -----------------------------------------------
-
-def lower_name(name: str) -> str:
- """Transform an uppercase row name to the user-facing mnemonic.
- First '_' binds op → first operand with underscore; subsequent
- '_' become ',' (e.g. MOV_R4_R0 → mov_r4,r0, ADD_R1_R1_R2 →
- add_r1,r1,r2). Single-'_' names (PROLOGUE_N3, SYS_WRITE, LI_R0)
- keep their lone underscore."""
- low = name.lower()
- head, _, rest = low.partition('_')
- if not rest:
- return head
- return f'{head}_{rest.replace("_", ",")}'
-
-
-def emit(arch: str) -> str:
- enc = ENCODERS[arch]
- out = [HEADER.format(arch=arch).rstrip(), '']
- seen = set()
- for row in rows():
- if isinstance(row, Banner):
- out.append('')
- out.append('## ---- ' + row.text + ' ' + '-' * max(0, 60 - len(row.text)))
- continue
- name = lower_name(row.name)
- if name in seen:
- raise RuntimeError(f'duplicate DEFINE: {name}')
- seen.add(name)
- out.append(f'DEFINE {name} {row.encode(enc)}')
- out.append('')
- return '\n'.join(out)
-
-
-def main():
- check = '--check' in sys.argv
- positional = [a for a in sys.argv[1:] if not a.startswith('--')]
- build_root = positional[0] if positional else 'build'
-
- had_diff = False
- for arch in ARCHES:
- dest_dir = os.path.join(build_root, arch)
- path = os.path.join(dest_dir, f'p1_{arch}.M1')
- content = emit(arch)
- if check:
- try:
- with open(path) as f:
- existing = f.read()
- except FileNotFoundError:
- existing = ''
- if existing != content:
- sys.stderr.write(f'DIFF: {path}\n')
- had_diff = True
- else:
- os.makedirs(dest_dir, exist_ok=True)
- with open(path, 'w') as f:
- f.write(content)
- print(f'wrote {path} ({len(content)} bytes)')
-
- if check and had_diff:
- sys.exit(1)
-
-
-if __name__ == '__main__':
- main()