boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs

commit b9b5b0447f9729f86deac54ff643d8097c82636b
parent 49dcd6e7b024c3b4921442ba79db41bb510e5cc3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue, 21 Apr 2026 08:16:46 -0700

build: only curdir is mounted into podman

Podman invocations now mount $(CURDIR):/work only. populate-upstream.sh
runs on the host, mirroring the files bootstrap.sh consumes
(hex0-seeds, hex0/1/2/catm/M0 sources, ELF-<arch>.hex2) from
$UPSTREAM/seed/stage0-posix/ into build/upstream/. PODMAN_BOOTSTRAP
collapses into the unified PODMAN.

arch/ is gone: the vendored (and subtly reformatted) ELF-*.hex2 files
are deleted — the link rule now reads them straight from
build/upstream/. p1_gen.py moves to src/ and writes its three
p1_<arch>.M1 defs files into build/<arch>/. Only original sources
live in curdir; everything derived or copied lands under build/.

Diffstat:
M.gitignore | 3---
MMakefile | 77+++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
MREADME.md | 57++++++++++++++++++++++++++-------------------------------
Darch/ELF-aarch64.hex2 | 75---------------------------------------------------------------------------
Darch/ELF-amd64.hex2 | 74--------------------------------------------------------------------------
Darch/ELF-riscv64.hex2 | 74--------------------------------------------------------------------------
Darch/p1_gen.py | 1066-------------------------------------------------------------------------------
Mbootstrap.sh | 19+++++++++++--------
Apopulate-upstream.sh | 48++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/p1_gen.py | 1089+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
10 files changed, 1225 insertions(+), 1357 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,4 +1 @@ build/ -arch/p1_aarch64.M1 -arch/p1_amd64.M1 -arch/p1_riscv64.M1 diff --git a/Makefile b/Makefile @@ -48,7 +48,11 @@ ifeq ($(PLATFORM),) $(error ARCH '$(ARCH)' not supported — use aarch64, amd64, or riscv64) endif -UPSTREAM := $(abspath $(CURDIR)/../live-bootstrap) +# Default upstream checkout path. Only consumed by populate-upstream.sh, +# which runs on the host to populate build/upstream/ before the container +# ever starts. Podman itself never mounts this — the container only ever +# sees curdir, so all inputs must already live inside it. +UPSTREAM ?= $(abspath $(CURDIR)/../live-bootstrap) # Pinned to a manifest-list digest (not the :latest tag): podman on macOS # only stores one image per tag locally, so cross-arch pulls under :latest @@ -63,16 +67,22 @@ RUNTIME_IMAGE := public.ecr.aws/docker/library/alpine@sha256:5b10f432ef3da1b8d4c OUT_DIR := build/$(ARCH) TOOLS_DIR := $(OUT_DIR)/tools -# Two container views: -# PODMAN_BOOTSTRAP — toolchain build. Needs read-only access to stage0-posix -# under ../live-bootstrap; writes only into build/$(ARCH)/tools. -# PODMAN — assemble / link / run. Sees only the lispcc dir. -PODMAN_BOOTSTRAP := podman run --rm --platform $(PLATFORM) \ - -v $(UPSTREAM):/work/live-bootstrap:ro \ - -v $(CURDIR):/work/lispcc \ - -w /work/lispcc \ - $(RUNTIME_IMAGE) - +# stage0-posix uses mixed-case arch dirs (AArch64, AMD64) that don't match +# our lowercase ARCH. Map them so build/upstream/ mirrors upstream layout. +ARCH_DIR_aarch64 := AArch64 +ARCH_DIR_amd64 := AMD64 +ARCH_DIR_riscv64 := riscv64 +ARCH_DIR := $(ARCH_DIR_$(ARCH)) + +# Host-populated mirror of the upstream files we consume. Everything +# bootstrap.sh needs (seeds, hex0/1/2 sources, catm, M0, ELF headers) +# lands here before any podman work begins. +UPSTREAM_DIR := build/upstream +UPSTREAM_STAMP := $(UPSTREAM_DIR)/.stamp + +# Single podman view: curdir mounted at /work. 
Toolchain build, assembly, +# link, and run all share this view. Keeping it narrow means nothing +# outside the repo is visible to the container. PODMAN := podman run --rm --platform $(PLATFORM) \ -v $(CURDIR):/work \ -w /work \ @@ -80,21 +90,36 @@ PODMAN := podman run --rm --platform $(PLATFORM) \ # --- Targets --------------------------------------------------------------- -.PHONY: all toolchain run run-all test-lisp test-lisp-all clean +.PHONY: all toolchain populate-upstream run run-all test-lisp test-lisp-all clean all: $(OUT_DIR)/$(PROG) toolchain: $(TOOLS_DIR)/M0 +populate-upstream: $(UPSTREAM_STAMP) + $(OUT_DIR) $(TOOLS_DIR): mkdir -p $@ +# Mirror the upstream seed + hex0/1/2/catm/M0/ELF files we need from +# $(UPSTREAM) into build/upstream/. Host-side so the container mount stays +# minimal. The stamp doubles as an order marker and avoids re-copying on +# every toolchain build. +$(UPSTREAM_STAMP): populate-upstream.sh + sh populate-upstream.sh $(UPSTREAM) + @touch $@ + +# Any file anyone asks for under build/upstream/ is produced by the stamp +# rule above. Empty recipe — the file is already on disk once the stamp +# exists, and the stamp's timestamp stands in for every file's freshness. +$(UPSTREAM_DIR)/%: $(UPSTREAM_STAMP) ; + # Bootstrap M0, hex2-0, catm (and the throwaway hex0/hex1) from hex0-seed. # One shot per arch — see bootstrap.sh for the phase-by-phase chain. # # Grouped target (&:) so all five outputs come from a single recipe run. -$(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_DIR)/hex1 &: bootstrap.sh | $(TOOLS_DIR) - $(PODMAN_BOOTSTRAP) sh bootstrap.sh $(ARCH) /work/lispcc/$(TOOLS_DIR) +$(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_DIR)/hex1 &: bootstrap.sh $(UPSTREAM_STAMP) | $(TOOLS_DIR) + $(PODMAN) sh bootstrap.sh $(ARCH) /work/$(TOOLS_DIR) # Assemble: lint first, then combine per-arch defs + program and feed to M0. 
# @@ -105,10 +130,10 @@ $(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_ # M0 takes a single positional input (no -f flag), so we catm the two # sources together first. The intermediate .combined.M1 is kept in OUT_DIR # so it gets cleaned along with everything else. -$(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) arch/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR) - ./lint.sh arch/p1_$(ARCH).M1 $(PROG_SRC) +$(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) $(OUT_DIR)/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR) + ./lint.sh $(OUT_DIR)/p1_$(ARCH).M1 $(PROG_SRC) $(PODMAN) sh -ec ' \ - $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).combined.M1 arch/p1_$(ARCH).M1 $(PROG_SRC) ; \ + $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/p1_$(ARCH).M1 $(PROG_SRC) ; \ $(TOOLS_DIR)/M0 $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/$(PROG).hex2' # Link: prepend the ELF header and feed to hex2-0. @@ -117,9 +142,9 @@ $(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) arch/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 # base address 0x00600000 (no --base-address flag), which is why the ELF # header references `&ELF_base` symbolically rather than baking in a # concrete VA — the header travels to whatever base the linker chose. 
-$(OUT_DIR)/$(PROG): $(OUT_DIR)/$(PROG).hex2 arch/ELF-$(ARCH).hex2 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm +$(OUT_DIR)/$(PROG): $(OUT_DIR)/$(PROG).hex2 $(UPSTREAM_DIR)/$(ARCH_DIR)/ELF-$(ARCH).hex2 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(PODMAN) sh -ec ' \ - $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).linked.hex2 arch/ELF-$(ARCH).hex2 $(OUT_DIR)/$(PROG).hex2 ; \ + $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).linked.hex2 $(UPSTREAM_DIR)/$(ARCH_DIR)/ELF-$(ARCH).hex2 $(OUT_DIR)/$(PROG).hex2 ; \ $(TOOLS_DIR)/hex2-0 $(OUT_DIR)/$(PROG).linked.hex2 $(OUT_DIR)/$(PROG)' run: $(OUT_DIR)/$(PROG) @@ -168,12 +193,12 @@ test-lisp-all: $(MAKE) --no-print-directory ARCH=riscv64 test-lisp clean: - rm -rf build/ arch/p1_aarch64.M1 arch/p1_amd64.M1 arch/p1_riscv64.M1 + rm -rf build/ -# Generate all three per-arch DEFINE tables from arch/p1_gen.py in a +# Generate all three per-arch DEFINE tables from src/p1_gen.py in a # single shot. Grouped target (&:) because p1_gen.py writes all three -# files unconditionally (next to itself, i.e. into arch/). These are -# build artifacts — gitignored; the build regenerates them on any -# p1_gen.py edit so there's no staleness risk. -arch/p1_aarch64.M1 arch/p1_amd64.M1 arch/p1_riscv64.M1 &: arch/p1_gen.py - python3 arch/p1_gen.py +# files unconditionally. Output lands under build/<arch>/ (build/ is +# wiped by clean, so the build regenerates on any p1_gen.py edit with +# no staleness risk). +build/aarch64/p1_aarch64.M1 build/amd64/p1_amd64.M1 build/riscv64/p1_riscv64.M1 &: src/p1_gen.py + python3 src/p1_gen.py build diff --git a/README.md b/README.md @@ -10,50 +10,45 @@ Goal is a 4–6× shrink in auditable LOC. See [docs/PLAN.md](docs/PLAN.md). Stage 0: hello-world in the P1 portable pseudo-ISA (see [docs/P1.md](docs/P1.md)), assembled and run inside a pristine alpine container on all three target arches (aarch64, amd64, riscv64). The same `tests/hello.M1` source assembles -for every arch; only the backing `arch/p1_<arch>.M1` defs file varies. 
-Toolchain (M1, hex2) builds statically from the upstream mescc-tools C -source. +for every arch; only the backing `build/<arch>/p1_<arch>.M1` defs file +varies. Toolchain (M1, hex2) builds statically from the upstream mescc-tools +C source. ## Layout ``` -docs/ design docs (PLAN, SEED, P1, C1, LISP) -src/ real programs (lisp.M1, kaem-minimal.M1) -tests/ smoke programs (hello.M1, demo.M1) + fixtures - lisp/ lisp test fixtures (*.scm + *.expected) - kaem.run smoke input for kaem-minimal -arch/ per-arch defs + ELF headers - p1_gen.py generator for p1_<arch>.M1 - p1_<arch>.M1 per-arch P1 defs (gitignored, generated) - ELF-<arch>.hex2 per-arch ELF header template -bootstrap.sh hex0-seed → M0/hex2-0/catm toolchain build -lint.sh M1 undefined-token guard -Makefile podman-driven build, ARCH-parameterized -build/<arch>/ per-arch outputs + toolchain +docs/ design docs (PLAN, SEED, P1, C1, LISP) +src/ real programs (lisp.M1, kaem-minimal.M1) + p1_gen.py +tests/ smoke programs (hello.M1, demo.M1) + fixtures + lisp/ lisp test fixtures (*.scm + *.expected) + kaem.run smoke input for kaem-minimal +bootstrap.sh hex0-seed → M0/hex2-0/catm toolchain build +populate-upstream.sh host-side copy of upstream seeds + sources into build/upstream/ +lint.sh M1 undefined-token guard +Makefile podman-driven build, ARCH-parameterized +build/ all derived artifacts (gitignored) + upstream/ mirror of the files bootstrap.sh consumes from live-bootstrap + <arch>/ per-arch outputs + tools/ bootstrapped M0, hex2-0, catm (+ throwaway hex0/hex1) + p1_<arch>.M1 generated P1 defs + <prog> final ELF binary ``` ## Build & run -Requires podman. Non-native arches run via podman's binfmt + qemu-user -path (works transparently on a default `podman machine` setup). +Requires podman. Uses Alpine as the host. Non-native arches run via podman's +binfmt + qemu-user path (works transparently on a default `podman machine` +setup). 
``` -make image # one-time: build the alpine+gcc builder image -make # default ARCH=aarch64 → build/aarch64/hello -make ARCH=amd64 # build/amd64/hello -make ARCH=riscv64 # build/riscv64/hello -make run # run build/$(ARCH)/hello in pristine alpine make run-all # build + run on all three arches make clean # wipe build/ ``` -Two images are used: `lispcc-builder` (alpine+gcc, ~184 MB) only compiles -M1/hex2 at host arch; `alpine:latest` pulled per target platform runs -the assembled binary with the static toolchain mounted in. - ## Source layout assumption -The Makefile reaches the upstream mescc-tools C source via the parent dir -mount (`HOST_ROOT := $(abspath $(CURDIR)/..)`), expecting -`../live-bootstrap/seed/stage0-posix/mescc-tools/`. Override `TOOLCHAIN_SRC` -in the Makefile if your layout differs. +`populate-upstream.sh` runs on the host and mirrors the files bootstrap.sh +needs from `$UPSTREAM/seed/stage0-posix/` into `build/upstream/`; the +default is `../live-bootstrap`. Override by invoking `make UPSTREAM=/path +populate-upstream`. Podman itself only ever mounts curdir, so everything +the container sees must live inside the repo. diff --git a/arch/ELF-aarch64.hex2 b/arch/ELF-aarch64.hex2 @@ -1,75 +0,0 @@ -### Copyright (C) 2016 Jeremiah Orians -### Copyright (C) 2017 Jan Nieuwenhuizen <janneke@gnu.org> -### Copyright (C) 2020 deesix <deesix@tuta.io> -### This file is part of M2-Planet. -### -### M2-Planet is free software: you can redistribute it and/or modify -### it under the terms of the GNU General Public License as published by -### the Free Software Foundation, either version 3 of the License, or -### (at your option) any later version. -### -### M2-Planet is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU General Public License for more details. 
-### -### You should have received a copy of the GNU General Public License -### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>. - -### stage0's hex2 format -### !<label> 1 byte relative -### $<label> 2 byte address -### @<label> 2 byte relative -### &<label> 4 byte address -### %<label> 4 byte relative - -### if you wish to use this header, you need to add :ELF_end to the end of your -### M1 or hex2 files. - -## ELF Header - -:ELF_base -7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number - -02 # e_ident[EI_CLASS] Indicating 64 bit -01 # e_ident[EI_DATA] Indicating little endianness -01 # e_ident[EI_VERSION] Indicating original elf - -03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict -00 # e_ident[EI_ABIVERSION] See above - -00 00 00 00 00 00 00 # e_ident[EI_PAD] - -02 00 # e_type Indicating Executable -B7 00 # e_machine Indicating AArch64 -01 00 00 00 # e_version Indicating original elf - -&_start 00 00 00 00 # e_entry Address of the entry point -%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table -00 00 00 00 00 00 00 00 # e_shoff Address of section header table - -00 00 00 00 # e_flags - -40 00 # e_ehsize Indicating our 64 Byte header - -38 00 # e_phentsize size of a program header table -01 00 # e_phnum number of entries in program table - -00 00 # e_shentsize size of a section header table -00 00 # e_shnum number of entries in section table - -00 00 # e_shstrndx index of the section names - - -:ELF_program_headers -:ELF_program_header__text -01 00 00 00 # ph_type: PT-LOAD = 1 -07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7 -00 00 00 00 00 00 00 00 # ph_offset -&ELF_base 00 00 00 00 # ph_vaddr -&ELF_base 00 00 00 00 # ph_physaddr -%ELF_end>ELF_base 00 00 00 00 # ph_filesz -%ELF_end>ELF_base 00 00 00 00 # ph_memsz -01 00 00 00 00 00 00 00 # ph_align - -:ELF_text diff --git a/arch/ELF-amd64.hex2 b/arch/ELF-amd64.hex2 @@ -1,74 +0,0 @@ -### Copyright (C) 2016 Jeremiah Orians -### Copyright (C) 2017 Jan 
Nieuwenhuizen <janneke@gnu.org> -### This file is part of M2-Planet. -### -### M2-Planet is free software: you can redistribute it and/or modify -### it under the terms of the GNU General Public License as published by -### the Free Software Foundation, either version 3 of the License, or -### (at your option) any later version. -### -### M2-Planet is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU General Public License for more details. -### -### You should have received a copy of the GNU General Public License -### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>. - -### stage0's hex2 format -### !<label> 1 byte relative -### $<label> 2 byte address -### @<label> 2 byte relative -### &<label> 4 byte address -### %<label> 4 byte relative - -### if you wish to use this header, you need to add :ELF_end to the end of your -### M1 or hex2 files. 
- -## ELF Header - -:ELF_base -7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number - -02 # e_ident[EI_CLASS] Indicating 64 bit -01 # e_ident[EI_DATA] Indicating little endianness -01 # e_ident[EI_VERSION] Indicating original elf - -03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict -00 # e_ident[EI_ABIVERSION] See above - -00 00 00 00 00 00 00 # e_ident[EI_PAD] - -02 00 # e_type Indicating Executable -3E 00 # e_machine Indicating AMD64 -01 00 00 00 # e_version Indicating original elf - -&_start 00 00 00 00 # e_entry Address of the entry point -%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table -00 00 00 00 00 00 00 00 # e_shoff Address of section header table - -00 00 00 00 # e_flags - -40 00 # e_ehsize Indicating our 64 Byte header - -38 00 # e_phentsize size of a program header table -01 00 # e_phnum number of entries in program table - -00 00 # e_shentsize size of a section header table -00 00 # e_shnum number of entries in section table - -00 00 # e_shstrndx index of the section names - - -:ELF_program_headers -:ELF_program_header__text -01 00 00 00 # ph_type: PT-LOAD = 1 -07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7 -00 00 00 00 00 00 00 00 # ph_offset -&ELF_base 00 00 00 00 # ph_vaddr -&ELF_base 00 00 00 00 # ph_physaddr -%ELF_end>ELF_base 00 00 00 00 # ph_filesz -%ELF_end>ELF_base 00 00 00 00 # ph_memsz -01 00 00 00 00 00 00 00 # ph_align - -:ELF_text diff --git a/arch/ELF-riscv64.hex2 b/arch/ELF-riscv64.hex2 @@ -1,74 +0,0 @@ -### Copyright (C) 2016 Jeremiah Orians -### Copyright (C) 2017 Jan Nieuwenhuizen <janneke@gnu.org> -### This file is part of M2-Planet. -### -### M2-Planet is free software: you can redistribute it and/or modify -### it under the terms of the GNU General Public License as published by -### the Free Software Foundation, either version 3 of the License, or -### (at your option) any later version. 
-### -### M2-Planet is distributed in the hope that it will be useful, -### but WITHOUT ANY WARRANTY; without even the implied warranty of -### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -### GNU General Public License for more details. -### -### You should have received a copy of the GNU General Public License -### along with M2-Planet. If not, see <http://www.gnu.org/licenses/>. - -### stage0's hex2 format -### !<label> 1 byte relative -### $<label> 2 byte address -### @<label> 2 byte relative -### &<label> 4 byte address -### %<label> 4 byte relative - -### if you wish to use this header, you need to add :ELF_end to the end of your -### M1 or hex2 files. - -## ELF Header - -:ELF_base -7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number - -02 # e_ident[EI_CLASS] Indicating 64 bit -01 # e_ident[EI_DATA] Indicating little endianness -01 # e_ident[EI_VERSION] Indicating original elf - -03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict -00 # e_ident[EI_ABIVERSION] See above - -00 00 00 00 00 00 00 # e_ident[EI_PAD] - -02 00 # e_type Indicating Executable -F3 00 # e_machine Indicating RISC-V -01 00 00 00 # e_version Indicating original elf - -&_start 00 00 00 00 # e_entry Address of the entry point -%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff Address of program header table -00 00 00 00 00 00 00 00 # e_shoff Address of section header table - -00 00 00 00 # e_flags - -40 00 # e_ehsize Indicating our 64 Byte header - -38 00 # e_phentsize size of a program header table -01 00 # e_phnum number of entries in program table - -00 00 # e_shentsize size of a section header table -00 00 # e_shnum number of entries in section table - -00 00 # e_shstrndx index of the section names - - -:ELF_program_headers -:ELF_program_header__text -01 00 00 00 # ph_type: PT-LOAD = 1 -07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7 -00 00 00 00 00 00 00 00 # ph_offset -&ELF_base 00 00 00 00 # ph_vaddr -&ELF_base 00 00 00 00 # ph_physaddr -%ELF_end>ELF_base 00 00 00 
00 # ph_filesz -%ELF_end>ELF_base 00 00 00 00 # ph_memsz -01 00 00 00 00 00 00 00 # ph_align - -:ELF_text diff --git a/arch/p1_gen.py b/arch/p1_gen.py @@ -1,1066 +0,0 @@ -#!/usr/bin/env python3 -"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table. - -Single source of truth for the P1 DEFINE tables across all three target -arches. Running this script rewrites p1_aarch64.M1, p1_amd64.M1, and -p1_riscv64.M1 in place. - -Structure: - * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of - helpers per arch. - * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one - method per P1 op category, lowering (op, reg-tuple, imm) into - native hex. Each arch's encoder is a coherent bundle — adding a - new op means one new method on each of the three. - * Op dataclasses — thin rows holding the DEFINE's name + data. - Op.encode(enc) dispatches into enc.<op-method>() with the Op's - fields unpacked. No per-arch branching lives in Op classes. - * rows() — builds the output list. Non-RRR ops are emitted as the - full register product × a curated imm/offset/shamt set. RRR - keeps an explicit table (the full 8³ cube is 5.6k entries per - arch, >99% dead weight). Adding a new RRR triple or a new imm - value is a one-line edit to rows(); a new register combination - for any other op needs no edit at all. - * emit(arch) / main — iterate rows, ask the arch's encoder to - lower each, write out the defs file. - -Running: - $ python3 p1_gen.py # rewrite all three files - $ python3 p1_gen.py --check # diff against current files -""" - -import os -import sys -from dataclasses import dataclass -from itertools import product -from typing import Optional - -ARCHES = ('aarch64', 'amd64', 'riscv64') - -## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source). -P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7') - -## ---------- Register mappings -------------------------------------------- -## P1 register name → native encoding number. 
The native numbers are what -## the per-arch encoders insert into instruction fields; the human-facing -## names (rax, x1, a2, …) never appear in this file. - -## 4:4 caller/callee-saved split. r0–r3 caller (native argregs); r4–r7 -## callee (native callee-saved). `br` is the hidden branch-target scratch -## (not a P1 reg) — picked so every op's expansion clobbers only what its -## name declares. -NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3, - 'r4': 26, 'r5': 27, 'r6': 19, 'r7': 20, - 'br': 17, # x17 (IP1, caller-saved linker scratch) - 'sp': 31, 'xzr': 31, 'lr': 30, - 'x21': 21, 'x22': 22, 'x23': 23, 'x8': 8} - -## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15 -## setting the REX bit. We store the 4-bit native number directly. -NAT_AMD64 = {'r0': 0, # rax - 'r1': 7, # rdi - 'r2': 6, # rsi - 'r3': 2, # rdx - 'r4': 13, # r13 (callee-saved) - 'r5': 14, # r14 (callee-saved) - 'r6': 3, # rbx - 'r7': 12, # r12 - 'br': 11, # r11 — branch/call target scratch + DIV/REM r0 save - 'sp': 4, # rsp - 'rcx': 1, # shift-count scratch + DIV/REM rdx save (not a P1 reg) - 'r10': 10, # syscall arg4 slot (not a P1 reg) - 'r8': 8, # syscall arg5 slot (not a P1 reg) - 'r9': 9, # syscall arg6 slot (not a P1 reg) - 'r11': 11, # alias for br (some expansions spell it r11 directly) - } - -NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13, - 'r4': 20, 'r5': 21, 'r6': 9, 'r7': 18, - 'br': 30, # t5 (caller-saved temp) - 'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17, - 's3': 19, 's6': 22, 's7': 23} - - -## ---------- Low-level encoding helpers ----------------------------------- - -def le32(n: int) -> str: - return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - -def byte(n: int) -> str: - return f'{n & 0xFF:02X}' - - -## ---------- amd64 primitive encoders ------------------------------------ -## amd64 is variable-length. Helpers below emit specific instruction -## shapes used by the P1 expansions. 
REX prefix bits: W=64b, R=ModRM.reg -## high, B=ModRM.rm high, X=SIB.index high (unused here). - -def rex(w, r, x, b): - v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b - return byte(v) - -def modrm(mod, reg, rm): - return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7)) - -def amd_mov_rr(dst, src): - """mov dst, src — REX.W + 89 /r (MOV r/m64, r64).""" - d, s = NAT_AMD64[dst], NAT_AMD64[src] - return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d) - -def amd_alu_rr(op, dst, src): - """op dst, src — 2-operand ALU. op is the opcode byte (01 add, - 29 sub, 21 and, 09 or, 31 xor).""" - d, s = NAT_AMD64[dst], NAT_AMD64[src] - return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d) - -def amd_alu_ri8(ext, dst, imm): - """op dst, imm8 (sign-extended). Opcode 83 /ext ib.""" - d = NAT_AMD64[dst] - return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm) - -def amd_alu_ri32(ext, dst, imm): - """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when - an immediate doesn't fit in the imm8 form (e.g., ADDI with - values outside [-128, 127]).""" - d = NAT_AMD64[dst] - imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le - -def amd_shift_ri8(ext, dst, imm): - """shl/shr/sar dst, imm8. Opcode C1 /ext ib.""" - d = NAT_AMD64[dst] - return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm) - -def amd_shift_cl(ext, dst): - """shl/shr/sar dst, cl. Opcode D3 /ext.""" - d = NAT_AMD64[dst] - return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d) - -def amd_imul_rr(dst, src): - """imul dst, src — 0F AF /r.""" - d, s = NAT_AMD64[dst], NAT_AMD64[src] - return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s) - -def amd_idiv(src): - """idiv src — F7 /7 (signed div of rdx:rax by src).""" - s = NAT_AMD64[src] - return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s) - -def amd_cqo(): - """cqo — sign-extend rax into rdx:rax. 
48 99.""" - return '4899' - -def amd_mem_rm(opcode, reg, base, disp): - """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load). - disp is signed int; encodes as disp8 if in range, else disp32.""" - r, b = NAT_AMD64[reg], NAT_AMD64[base] - prefix = rex(1, r >> 3, 0, b >> 3) + opcode - if -128 <= disp <= 127: - mod = 1 - d = byte(disp) - elif b == 4: # SIB required for rsp - mod = 2 - d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - else: - mod = 2 - d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - # rsp as base requires SIB byte (rm=4 with no SIB is rip-relative). - if b == 4: - return prefix + modrm(mod, r, 4) + '24' + d - return prefix + modrm(mod, r, b) + d - -def amd_mov_rm_b(reg, base, disp, store): - """Byte load/store. 88 /r (store), 0F B6 /r (movzx load).""" - r, b = NAT_AMD64[reg], NAT_AMD64[base] - if -128 <= disp <= 127: - mod = 1 - d = byte(disp) - else: - mod = 2 - d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() - if store: - # MOV r/m8, r8 — 88 /r. Requires REX to address dil/sil/bpl/spl. - prefix = rex(1, r >> 3, 0, b >> 3) + '88' - sib = '24' if b == 4 else '' - rmv = 4 if b == 4 else b - return prefix + modrm(mod, r, rmv) + sib + d - else: - # MOVZX r64, r/m8 — REX.W 0F B6 /r. - prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6' - sib = '24' if b == 4 else '' - rmv = 4 if b == 4 else b - return prefix + modrm(mod, r, rmv) + sib + d - - -## ---------- aarch64 primitive encoders ---------------------------------- -## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded. - -def aa_rrr(base, rD, rA, rB): - d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB] - return le32(base | (b << 16) | (a << 5) | d) - -def aa_add_imm(rD, rA, imm12, sub=False): - """ADD/SUB (immediate, shift=0). 
imm12 unsigned 0..4095.""" - d, a = NAT_AA64[rD], NAT_AA64[rA] - base = 0xD1000000 if sub else 0x91000000 - return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d) - -def aa_logical_imm(base, rD, rA, N, immr, imms): - d, a = NAT_AA64[rD], NAT_AA64[rA] - return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d) - -def aa_ubfm(rD, rA, immr, imms): - """UBFM (N=1 for sf=64).""" - d, a = NAT_AA64[rD], NAT_AA64[rA] - return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d) - -def aa_sbfm(rD, rA, immr, imms): - """SBFM (N=1 for sf=64).""" - d, a = NAT_AA64[rD], NAT_AA64[rA] - return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d) - -def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2): - """LDR/STR (unsigned offset). off_bytes must be a multiple of - 2^size_log2 and non-negative. imm12 = off_bytes >> size_log2.""" - assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0 - imm12 = off_bytes >> size_log2 - assert 0 <= imm12 < 4096 - t, n = NAT_AA64[rT], NAT_AA64[rN] - return le32(base | (imm12 << 10) | (n << 5) | t) - -def aa_ldst_unscaled(base, rT, rN, off): - """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small - offsets — negative, or positive-but-not-a-multiple-of-the-access- - size (e.g. LD at offset 7). imm9 range is [-256, 255].""" - assert -256 <= off <= 255 - imm9 = off & 0x1FF - t, n = NAT_AA64[rT], NAT_AA64[rN] - return le32(base | (imm9 << 12) | (n << 5) | t) - - -## ---------- riscv64 primitive encoders ---------------------------------- - -def rv_r(base, rD, rA, rB): - d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB] - return le32(base | (b << 20) | (a << 15) | (d << 7)) - -def rv_i(base, rD, rA, imm12): - """I-type: imm12[11:0], rs1, funct3, rd, opcode. 
imm12 is a signed - int that gets masked to 12 bits.""" - d, a = NAT_RV64[rD], NAT_RV64[rA] - return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7)) - -def rv_s(base, rS, rA, imm12): - """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode.""" - s, a = NAT_RV64[rS], NAT_RV64[rA] - hi = (imm12 >> 5) & 0x7F - lo = imm12 & 0x1F - return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7)) - -def rv_shift_imm(base, rD, rA, shamt): - """Shift-imm: base already has funct7 set; shamt in [0,63].""" - d, a = NAT_RV64[rD], NAT_RV64[rA] - return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7)) - - -## ---------- Per-arch op base tables ------------------------------------- - -AA64_RRR_BASE = { - 'ADD': 0x8B000000, - 'SUB': 0xCB000000, - 'AND': 0x8A000000, - 'OR': 0xAA000000, - 'XOR': 0xCA000000, - 'SHL': 0x9AC02000, - 'SHR': 0x9AC02400, - 'SAR': 0x9AC02800, - 'DIV': 0x9AC00C00, -} -AMD64_RRR_OPC = { - 'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31', -} -RV_RRR = { - 'ADD': 0x00000033, # funct7=0 funct3=0 opcode=0x33 - 'SUB': 0x40000033, - 'XOR': 0x00004033, - 'OR': 0x00006033, - 'AND': 0x00007033, - 'SHL': 0x00001033, - 'SHR': 0x00005033, - 'SAR': 0x40005033, - 'MUL': 0x02000033, - 'DIV': 0x02004033, - 'REM': 0x02006033, -} - - -## aarch64 bitmask-immediate encoding for ANDI/ORI. Entries are the -## (N, immr, imms) triples that encode each small imm as an aarch64 -## "logical immediate." Computed by hand because the full encoding -## algorithm (contiguous-run + rotation for element sizes -## 2/4/8/16/32/64) is substantial and we only need a handful of -## values. Extend this table if a new imm shows up in P1 source. 
-AA64_LOGI_ENC = { - 1: (1, 0, 0), # 0b0001 — single bit at position 0 - 2: (1, 63, 0), # 0b0010 — single bit at position 1 - 3: (1, 0, 1), # 0b0011 — 2 contiguous ones - 4: (1, 62, 0), # 0b0100 — single bit at position 2 - 6: (1, 63, 1), # 0b0110 — 2 ones rotated by 1 - 7: (1, 0, 2), # 0b0111 — 3 contiguous ones - 8: (1, 61, 0), # 0b1000 — single bit at position 3 -} - - -## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame -## bytes stay 16-byte aligned on aarch64): -## [sp + 0] = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr) -## [sp + 8] = slot 1 (callee-private scratch) -## [sp + 16] = slot 2 -## ... -## [sp + 8*k] = slot k -## -## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32, -## k=3 → 32, k=4 → 40 → 48. - -def prologue_frame_bytes(k: int) -> int: - raw = 8 + 8 * k - return (raw + 15) & ~15 - - -## ---------- Encoders ---------------------------------------------------- -## One class per arch. Each provides one method per P1 op category, -## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch -## here via `Op.encode(enc)` → `enc.<method>(fields)`. - -class Encoder: - """Per-arch encoder base. Subclasses implement one method per - op category. `arch` is used by literal() to pick the right - pre-encoded bytes from an arch-keyed dict.""" - arch = '' - - def literal(self, hex_by_arch): - return hex_by_arch[self.arch] - - -class AA64(Encoder): - arch = 'aarch64' - - def rrr(self, op, rD, rA, rB): - if op == 'MUL': - # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd - d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB] - return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d) - if op == 'REM': - # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA. - # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so - # REM does not hidden-clobber P1 r4 — the op modifies rD only. - # MSUB needs bit 15 set (o0=1); without it it decodes as - # MADD and REM returns A + (A/B)*B. 
- d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB] - SC = 16 - sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC - msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d - return le32(sdiv) + le32(msub) - return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB) - - def addi(self, rD, rA, imm): - if imm >= 0: - return aa_add_imm(rD, rA, imm, sub=False) - return aa_add_imm(rD, rA, -imm, sub=True) - - def logi(self, op, rD, rA, imm): - N, immr, imms = AA64_LOGI_ENC[imm] - base = 0x92000000 if op == 'ANDI' else 0xB2000000 # ORI = orr - return aa_logical_imm(base, rD, rA, N, immr, imms) - - def shifti(self, op, rD, rA, imm): - if op == 'SHLI': - return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm) - if op == 'SHRI': - return aa_ubfm(rD, rA, imm, 63) - if op == 'SARI': - return aa_sbfm(rD, rA, imm, 63) - - def mov(self, rD, rA): - if rA == 'sp': - return aa_add_imm(rD, 'sp', 0, sub=False) - # MOV xD, xA = ORR xD, xzr, xA - d = NAT_AA64[rD]; a = NAT_AA64[rA] - return le32(0xAA000000 | (a << 16) | (31 << 5) | d) - - def li(self, rD): - # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next) - d = NAT_AA64[rD] - ldr_w_lit = 0x18000040 | d # LDR (literal) 32-bit, offset 8 - b_plus8 = 0x14000002 # B offset 8 (imm26 = 2 words = 8 bytes) - return le32(ldr_w_lit) + le32(b_plus8) - - def la(self, rD): - return self.li(rD) - - def mem(self, op, rT, rN, off): - # Pick uimm12 (scaled, large range) when the offset is a - # non-negative multiple of the access width; otherwise fall - # back to the unscaled signed-imm9 form (covers negative - # offsets and positive-but-misaligned ones like 7). 
- BASES = { - 'LD': (0xF9400000, 3, 0xF8400000), - 'ST': (0xF9000000, 3, 0xF8000000), - 'LB': (0x39400000, 0, 0x38400000), - 'SB': (0x39000000, 0, 0x38000000), - } - uimm_base, size_log2, unscaled_base = BASES[op] - scale = 1 << size_log2 - if off >= 0 and (off % scale) == 0: - return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2) - return aa_ldst_unscaled(unscaled_base, rT, rN, off) - - def b(self): - return le32(0xD61F0000 | (NAT_AA64['br'] << 5)) # BR x17 - - def condb(self, op, rA, rB): - # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31). - # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A). - a = NAT_AA64[rA]; b_ = NAT_AA64[rB] - cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31) - cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op] - bcond = le32(0x54000040 | cond) - br = le32(0xD61F0000 | (NAT_AA64['br'] << 5)) - return cmp_ + bcond + br - - def call(self): - return le32(0xD63F0000 | (NAT_AA64['br'] << 5)) # BLR x17 - - def ret(self): - return le32(0xD65F03C0) # RET (= br x30) - - def prologue(self, k): - fb = prologue_frame_bytes(k) - sub = aa_add_imm('sp', 'sp', fb, sub=True) - str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3) - return sub + str_lr - - def epilogue(self, k): - fb = prologue_frame_bytes(k) - ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3) - add = aa_add_imm('sp', 'sp', fb, sub=False) - return ldr_lr + add - - def tail(self, k): - return self.epilogue(k) + self.b() - - -class AMD64(Encoder): - arch = 'amd64' - - def rrr(self, op, rD, rA, rB): - if op == 'MUL': - return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB) - if op in ('DIV', 'REM'): - # x86 idiv implicitly reads/writes rax (P1 r0) and rdx - # (P1 r3). To keep DIV/REM clobber-free (only rD changes), - # stash r0 into r11 and r3 into rcx — neither is a P1 reg — - # then restore. If rA or rB alias r0/r3, read from the - # saved copy since we've overwritten the originals. - # Skip the final restore for whichever of r0/r3 *is* rD, - # so rD keeps its newly computed value. 
- seq = amd_mov_rr('r11', 'r0') # save r0 (rax) - seq += amd_mov_rr('rcx', 'r3') # save r3 (rdx) - src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA) - seq += amd_mov_rr('r0', src_a) # rax = rA - seq += amd_cqo() # rdx:rax = sign-ext rax - src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB) - seq += amd_idiv(src_b) - seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3') - if rD != 'r3': - seq += amd_mov_rr('r3', 'rcx') - if rD != 'r0': - seq += amd_mov_rr('r0', 'r11') - return seq - if op in ('SHL', 'SHR', 'SAR'): - ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op] - seq = amd_mov_rr(rD, rA) - seq += amd_mov_rr('rcx', rB) - seq += amd_shift_cl(ext, rD) - return seq - # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB - seq = amd_mov_rr(rD, rA) - seq += amd_alu_rr(AMD64_RRR_OPC[op], rD, rB) - return seq - - def addi(self, rD, rA, imm): - # mov rD,rA ; add rD,imm. Use imm8 form when it fits - # ([-128, 127]); otherwise emit the imm32 form. - seq = amd_mov_rr(rD, rA) - if -128 <= imm <= 127: - seq += amd_alu_ri8(0, rD, imm) # /0 = ADD - else: - seq += amd_alu_ri32(0, rD, imm) - return seq - - def logi(self, op, rD, rA, imm): - ext = {'ANDI': 4, 'ORI': 1}[op] - seq = amd_mov_rr(rD, rA) - seq += amd_alu_ri8(ext, rD, imm) - return seq - - def shifti(self, op, rD, rA, imm): - ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op] - seq = amd_mov_rr(rD, rA) - seq += amd_shift_ri8(ext, rD, imm) - return seq - - def mov(self, rD, rA): - return amd_mov_rr(rD, rA) - - def li(self, rD): - # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15) - d = NAT_AMD64[rD] - if d >= 8: - return '41' + byte(0xB8 + (d & 7)) - return byte(0xB8 + d) - - def la(self, rD): - return self.li(rD) - - def mem(self, op, rT, rN, off): - if op == 'LD': return amd_mem_rm('8B', rT, rN, off) - if op == 'ST': return amd_mem_rm('89', rT, rN, off) - if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False) - if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True) - - def b(self): - return '41FFE3' 
# jmp r11 - - def condb(self, op, rA, rB): - a, b_ = NAT_AMD64[rA], NAT_AMD64[rB] - # cmp rA, rB — opcode 39 /r with rA as r/m - cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a) - # jcc rel8 opcode, skip=3 (past jmp r11): - # BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03 - jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op] - return cmp_ + jop + '03' + '41FFE3' # jmp r11 - - def call(self): - return '41FFD3' # call r11 - - def ret(self): - return 'C3' - - def prologue(self, k): - # pop rcx ; sub rsp,fb ; push rcx. rcx is the retaddr-carry - # scratch — caller-save, never a P1 reg. r11 (= 'br') is - # off-limits because TAIL = EPILOGUE + `jmp r11`, and using - # r11 here would clobber the LI_BR-loaded tail target. - fb = prologue_frame_bytes(k) - assert fb <= 127 - return '59' + '4883EC' + byte(fb) + '51' - - def epilogue(self, k): - # Mirror of prologue: pop rcx ; add rsp,fb ; push rcx. - fb = prologue_frame_bytes(k) - assert fb <= 127 - return '59' + '4883C4' + byte(fb) + '51' - - def tail(self, k): - return self.epilogue(k) + self.b() - - -class RV64(Encoder): - arch = 'riscv64' - - def rrr(self, op, rD, rA, rB): - return rv_r(RV_RRR[op], rD, rA, rB) - - def addi(self, rD, rA, imm): - return rv_i(0x00000013, rD, rA, imm) - - def logi(self, op, rD, rA, imm): - base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op] - return rv_i(base, rD, rA, imm) - - def shifti(self, op, rD, rA, imm): - base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op] - return rv_shift_imm(base, rD, rA, imm) - - def mov(self, rD, rA): - return rv_i(0x00000013, rD, rA, 0) # addi rD, rA, 0 - - def li(self, rD): - # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8 - d = NAT_RV64[rD] - auipc = 0x00000017 | (d << 7) - lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20) - jal_p8 = 0x0080006F - return le32(auipc) + le32(lwu) + le32(jal_p8) - - def la(self, rD): - return self.li(rD) - - def mem(self, op, rT, rN, off): - # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23. 
- if op == 'LD': return rv_i(0x00003003, rT, rN, off) - if op == 'ST': return rv_s(0x00003023, rT, rN, off) - if op == 'LB': return rv_i(0x00004003, rT, rN, off) # LBU - if op == 'SB': return rv_s(0x00000023, rT, rN, off) - - def b(self): - return le32(0x00000067 | (NAT_RV64['br'] << 15)) # jalr x0, 0(t5) - - def condb(self, op, rA, rB): - # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op: - # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5). - a, b_ = NAT_RV64[rA], NAT_RV64[rB] - funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op] - insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7) - jalr = 0x00000067 | (NAT_RV64['br'] << 15) - return le32(insn) + le32(jalr) - - def call(self): - return le32(0x000000E7 | (NAT_RV64['br'] << 15)) # jalr ra, 0(t5) - - def ret(self): - return le32(0x00008067) # jalr x0, 0(ra) - - def prologue(self, k): - fb = prologue_frame_bytes(k) - sub = rv_i(0x00000013, 'sp', 'sp', -fb) - sd = rv_s(0x00003023, 'ra', 'sp', 0) - return sub + sd - - def epilogue(self, k): - fb = prologue_frame_bytes(k) - ld = rv_i(0x00003003, 'ra', 'sp', 0) - add = rv_i(0x00000013, 'sp', 'sp', fb) - return ld + add - - def tail(self, k): - return self.epilogue(k) + self.b() - - -ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()} - - -## ---------- Op dataclasses ---------------------------------------------- -## Thin wrappers: each row holds its DEFINE name + the data needed to -## reconstruct the encoding. `encode(enc)` calls the matching method -## on the arch's encoder. 
- -@dataclass -class Op: - name: str - comment: str = '' - - def encode(self, enc: Encoder) -> str: - raise NotImplementedError - -@dataclass -class RRR(Op): - op: str = '' - rD: str = '' - rA: str = '' - rB: str = '' - def encode(self, enc): - return enc.rrr(self.op, self.rD, self.rA, self.rB) - -@dataclass -class AddI(Op): - rD: str = '' - rA: str = '' - imm: int = 0 - def encode(self, enc): - return enc.addi(self.rD, self.rA, self.imm) - -@dataclass -class LogI(Op): - op: str = '' # ANDI / ORI - rD: str = '' - rA: str = '' - imm: int = 0 - def encode(self, enc): - return enc.logi(self.op, self.rD, self.rA, self.imm) - -@dataclass -class ShiftI(Op): - op: str = '' # SHLI / SHRI / SARI - rD: str = '' - rA: str = '' - imm: int = 0 - def encode(self, enc): - return enc.shifti(self.op, self.rD, self.rA, self.imm) - -@dataclass -class Mov(Op): - rD: str = '' - rA: str = '' - def encode(self, enc): - return enc.mov(self.rD, self.rA) - -@dataclass -class Li(Op): - rD: str = '' - def encode(self, enc): - return enc.li(self.rD) - -@dataclass -class La(Op): - rD: str = '' - def encode(self, enc): - return enc.la(self.rD) - -@dataclass -class Mem(Op): - op: str = '' # LD / ST / LB / SB - rT: str = '' - rN: str = '' - off: int = 0 - def encode(self, enc): - return enc.mem(self.op, self.rT, self.rN, self.off) - -@dataclass -class B(Op): - def encode(self, enc): - return enc.b() - -@dataclass -class CondB(Op): - op: str = '' # BEQ / BNE / BLT - rA: str = '' - rB: str = '' - def encode(self, enc): - return enc.condb(self.op, self.rA, self.rB) - -@dataclass -class Literal(Op): - hex_by_arch: Optional[dict] = None - def encode(self, enc): - return enc.literal(self.hex_by_arch) - -@dataclass -class Prologue(Op): - k: int = 1 - def encode(self, enc): - return enc.prologue(self.k) - -@dataclass -class Epilogue(Op): - k: int = 1 - def encode(self, enc): - return enc.epilogue(self.k) - -@dataclass -class Tail(Op): - k: int = 1 - def encode(self, enc): - return enc.tail(self.k) - 
-@dataclass -class Call(Op): - def encode(self, enc): - return enc.call() - -@dataclass -class Ret(Op): - def encode(self, enc): - return enc.ret() - - -## ---------- SYSCALL pre-encoded sequences ------------------------------- -## The one-shot syscall wrapper. Shuffles P1's r0=num, r1–r6=args into -## each arch's native syscall ABI and clobbers only r0 on return. -## Encoded by hand (per P1.md §"Syscall conventions"). - -SYSCALL_HEX = { - 'aarch64': ( - # r4/r5 now live in callee-saved natives (x26/x27), so the - # kernel preserves them — no save/restore needed. Only r1/r2/r3 - # (in caller-saved x1/x2/x3) must be stashed across the shuffle. - '' .join([ - le32(0xAA0003E8), # mov x8, x0 (syscall number) - le32(0xAA0103F5), # mov x21, x1 (save r1) - le32(0xAA0203F6), # mov x22, x2 (save r2) - le32(0xAA0303F7), # mov x23, x3 (save r3) - le32(0xAA1503E0), # mov x0, x21 (arg1 = r1) - le32(0xAA1603E1), # mov x1, x22 (arg2 = r2) - le32(0xAA1703E2), # mov x2, x23 (arg3 = r3) - le32(0xAA1A03E3), # mov x3, x26 (arg4 = r4) - le32(0xAA1B03E4), # mov x4, x27 (arg5 = r5) - le32(0xAA1303E5), # mov x5, x19 (arg6 = r6) - le32(0xD4000001), # svc #0 - le32(0xAA1503E1), # mov x1, x21 (restore r1) - le32(0xAA1603E2), # mov x2, x22 - le32(0xAA1703E3), # mov x3, x23 - ]) - ), - # r4=r13, r5=r14 are callee-saved natively, but syscall wants args - # 4/5 in r10/r8. r6=rbx, but arg6 lives in r9. Three shuffle moves, - # then syscall. The kernel preserves rdi/rsi/rdx/r12–r15/rbx, so no - # P1 reg is clobbered beyond r0 (syscall return). - 'amd64': '4D89EA' + '4D89F0' + '4989D9' + '0F05', - 'riscv64': ( - # Same story as aarch64: r4/r5 in callee-saved s4/s5 (=x20/x21), - # so we only save/restore a1/a2/a3. Scratch slots: s3, s6, s7. 
- ''.join([ - le32(0x00050893), # mv a7, a0 (syscall number) - le32(0x00058993), # mv s3, a1 (save r1) - le32(0x00060B13), # mv s6, a2 (save r2) - le32(0x00068B93), # mv s7, a3 (save r3) - le32(0x00098513), # mv a0, s3 (arg1 = r1) - le32(0x000B0593), # mv a1, s6 (arg2 = r2) - le32(0x000B8613), # mv a2, s7 (arg3 = r3) - le32(0x000A0693), # mv a3, s4 (arg4 = r4) - le32(0x000A8713), # mv a4, s5 (arg5 = r5) - le32(0x00048793), # mv a5, s1 (arg6 = r6) - le32(0x00000073), # ecall - le32(0x00098593), # mv a1, s3 (restore r1) - le32(0x000B0613), # mv a2, s6 - le32(0x000B8693), # mv a3, s7 - ]) - ), -} - -## Syscall numbers (little-endian 32-bit for LI operand). -## aarch64 and riscv64 share the asm-generic table; amd64 has its own. -## -## Portability notes — every entry below is a syscall that exists on all -## three with the same semantics under the uniform P1 SYSCALL convention -## (r0 = num, r1-r6 = args): -## - `fork` is amd64-only; `wait4` is asm-generic 32-bit compat only. -## Use `clone(SIGCHLD)` and `waitid` instead. -## - `open` is amd64-only (removed from asm-generic). Use `openat` with -## dirfd = AT_FDCWD (-100) as arg1. -## - `clone` arg order differs: amd64 is (flags, stack, ptid, ctid, tls); -## aarch64/riscv64 are (flags, stack, ptid, tls, ctid). Benign when -## ptid/ctid/tls are all zero (the fork-equivalent case). 
-SYS_NUM = { - 'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57, - 'SYS_OPENAT': 56, - 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95}, - 'amd64': {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3, - 'SYS_OPENAT':257, - 'SYS_CLONE': 56, 'SYS_EXECVE': 59, 'SYS_WAITID':247}, - 'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57, - 'SYS_OPENAT': 56, - 'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95}, -} - - -## ---------- Canonical imm/offset/shamt sets ----------------------------- -## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex -## bytes verbatim, so every distinct imm value needs its own DEFINE. -## These cover every value used across hello/demo/lisp/kaem-minimal -## plus a little headroom. Extend when a new value appears in P1 src. - -## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag -## stripping and loop counters. Full reg product × this set = 8²×N. -ADDI_IMMS = (-48, -8, -7, -6, -5, -4, -3, -2, -1, - 1, 2, 3, 4, 5, 6, 7, 8, 48) - -## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks -## (length field extraction; 4096-slot symbol-table index); the small -## values scale-by-N for byte offsets and fixnum encode/decode. -SHIFT_IMMS = (1, 2, 3, 5, 16, 32, 52) - -## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC. -LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8) - -## Memory offsets for LD/ST/LB/SB. 0/8/16/24/32 cover slot offsets in -## N-slot frames and common struct fields; 7 is the NUL terminator -## position inside an 8-byte zero-padded slot; -8 reaches one slot -## below the current base. -MEM_OFFS = (-8, 0, 7, 8, 16, 24, 32) - -CONDB_OPS = ('BEQ', 'BNE', 'BLT') -SHIFT_OPS = ('SHLI', 'SHRI', 'SARI') -LOGI_OPS = ('ANDI', 'ORI') -MEM_OPS = ('LD', 'ST', 'LB', 'SB') - - -## Curated RRR triples. The full cube is 11 ops × 8³ regs = 5632 -## entries per arch — >99% would be dead weight. 
Each tuple below -## is one actually used by hello/demo/lisp/kaem-minimal. Lint -## catches missing triples on assembly; add a line here and -## regenerate. -RRR_TABLE = ( - # demo/lisp step-1 arith cube - ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'), - ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'), - ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'), - ('AND','r1','r1','r5'), - ('OR', 'r1','r1','r2'), - ('XOR','r1','r1','r2'), - ('MUL','r1','r1','r2'), - ('DIV','r1','r1','r2'), - ('REM','r1','r1','r5'), - ('SHL','r1','r1','r2'), - ('SHR','r1','r1','r2'), - ('SAR','r4','r4','r2'), - # alloc / pointer arithmetic - ('ADD','r2','r0','r1'), - ('ADD','r0','r0','r3'), - ('ADD','r2','r2','r0'), - ('ADD','r2','r2','r1'), - ('SUB','r3','r3','r0'), - # reader / display index+offset fold - ('ADD','r6','r1','r2'), - ('ADD','r6','r6','r0'), - ('ADD','r7','r1','r2'), - ('SUB','r2','r1','r6'), - ('SUB','r3','r1','r6'), - ('REM','r1','r1','r2'), - # kaem-minimal bump-pointer + accumulator updates - ('ADD','r1','r1','r0'), - ('ADD','r5','r5','r0'), - ('ADD','r7','r7','r0'), - ('SUB','r3','r3','r2'), - ('SUB','r6','r6','r0'), -) - - -## ---------- Row assembly ------------------------------------------------ - -HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand. -## -## Shared op-table lives in p1_gen.py; each arch's encoder lowers -## (op, register-tuple, imm) rows into native bytes. See P1.md for the -## ISA spec and register mapping. -""" - -@dataclass -class Banner: - text: str - - -def _imm_suf(imm): - return f'NEG{-imm}' if imm < 0 else f'{imm}' - - -def rows(): - R = [] - - # --- LI / LA — wide literal and address loads --- - R.append(Banner('LI / LA — load 4-byte zero-extended literal or label addr')) - for rd in P1_REGS: - R.append(Li(name=f'LI_{rd.upper()}', rD=rd)) - # LI_BR loads into the hidden branch-target scratch (x17/r11/t5). - # Every branch/call site is `LI_BR &target ; P1_<BR>`. The scratch - # is *not* a P1 reg. 
- R.append(Li(name='LI_BR', rD='br')) - for rd in P1_REGS: - R.append(La(name=f'LA_{rd.upper()}', rD=rd)) - - # --- MOV — register-to-register + MOV rD, sp --- - R.append(Banner('MOV — full register product (src may be sp)')) - for rd in P1_REGS: - for ra in P1_REGS: - R.append(Mov(name=f'MOV_{rd.upper()}_{ra.upper()}', rD=rd, rA=ra)) - R.append(Mov(name=f'MOV_{rd.upper()}_SP', rD=rd, rA='sp')) - - # --- RRR — curated triples (full cube would be 5.6k/arch) --- - R.append(Banner('RRR — curated triples (explicit table in p1_gen.py)')) - for op, d, a, b in RRR_TABLE: - R.append(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}', - op=op, rD=d, rA=a, rB=b)) - - # --- Immediate arith: ADDI × full reg product × imm set --- - R.append(Banner('ADDI — full register product × ADDI_IMMS')) - for d, a, imm in product(P1_REGS, P1_REGS, ADDI_IMMS): - R.append(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{_imm_suf(imm)}', - rD=d, rA=a, imm=imm)) - - # --- ANDI / ORI × full reg product × LOGI_IMMS --- - R.append(Banner('ANDI / ORI — full register product × LOGI_IMMS')) - for op, d, a, imm in product(LOGI_OPS, P1_REGS, P1_REGS, LOGI_IMMS): - R.append(LogI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}', - op=op, rD=d, rA=a, imm=imm)) - - # --- SHLI / SHRI / SARI × full reg product × SHIFT_IMMS --- - R.append(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS')) - for op, d, a, imm in product(SHIFT_OPS, P1_REGS, P1_REGS, SHIFT_IMMS): - R.append(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}', - op=op, rD=d, rA=a, imm=imm)) - - # --- Memory: LD/ST/LB/SB × full reg product × MEM_OFFS --- - R.append(Banner('LD / ST / LB / SB — full register product × MEM_OFFS')) - for op, rt, rn, off in product(MEM_OPS, P1_REGS, P1_REGS, MEM_OFFS): - R.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}', - op=op, rT=rt, rN=rn, off=off)) - - # --- Branches: BEQ/BNE/BLT × full reg product + unconditional B --- - R.append(Banner('Branches — LI_BR-indirect pattern')) - 
R.append(B(name='B')) - for op, a, b in product(CONDB_OPS, P1_REGS, P1_REGS): - R.append(CondB(name=f'{op}_{a.upper()}_{b.upper()}', - op=op, rA=a, rB=b)) - - # --- Control: CALL / RET / PROLOGUE / EPILOGUE / TAIL (Nk = 1..4) --- - R.append(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL')) - R.append(Prologue(name='PROLOGUE', k=1)) - R.append(Epilogue(name='EPILOGUE', k=1)) - R.append(Ret(name='RET')) - R.append(Call(name='CALL')) - R.append(Tail(name='TAIL', k=1)) - for k in (2, 3, 4): - R.append(Prologue(name=f'PROLOGUE_N{k}', k=k)) - R.append(Epilogue(name=f'EPILOGUE_N{k}', k=k)) - R.append(Tail(name=f'TAIL_N{k}', k=k)) - - # --- SYSCALL — pre-encoded per-arch wrapper --- - R.append(Banner('SYSCALL — uniform "clobbers r0 only" across arches')) - R.append(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX)) - - # --- Syscall numbers (LE-32 immediates) --- - R.append(Banner('Linux syscall numbers (per-arch table). LE-32 operands for LI.')) - for name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE', 'SYS_OPENAT', - 'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'): - R.append(Literal(name=name, - hex_by_arch={a: le32(SYS_NUM[a][name]) for a in ARCHES})) - - return R - - -## ---------- File emission ----------------------------------------------- - -def emit(arch: str) -> str: - enc = ENCODERS[arch] - out = [HEADER.format(arch=arch).rstrip(), ''] - seen = set() - for row in rows(): - if isinstance(row, Banner): - out.append('') - out.append('## ---- ' + row.text + ' ' + '-' * max(0, 60 - len(row.text))) - continue - name = 'P1_' + row.name if not row.name.startswith('SYS_') else row.name - if name in seen: - raise RuntimeError(f'duplicate DEFINE: {name}') - seen.add(name) - out.append(f'DEFINE {name} {row.encode(enc)}') - out.append('') - return '\n'.join(out) - - -def main(): - here = os.path.dirname(os.path.abspath(__file__)) - check = '--check' in sys.argv - - had_diff = False - for arch in ARCHES: - path = os.path.join(here, f'p1_{arch}.M1') 
- content = emit(arch) - if check: - with open(path) as f: - existing = f.read() - if existing != content: - sys.stderr.write(f'DIFF: {path}\n') - had_diff = True - else: - with open(path, 'w') as f: - f.write(content) - print(f'wrote {path} ({len(content)} bytes)') - - if check and had_diff: - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/bootstrap.sh b/bootstrap.sh @@ -9,6 +9,10 @@ # bytes, hand-assembled, shipped by stage0-posix). Nothing above M0 is built, # which is the whole point — no C compiler is involved, not even cc_<arch>. # +# Inputs are read from build/upstream/, which populate-upstream.sh mirrors +# from live-bootstrap's stage0-posix on the host. The container mounts only +# curdir, so everything bootstrap.sh needs must already live inside it. +# # Phase map (stage0-posix mescc-tools-{seed,mini}-kaem.kaem phases 0-3): # 0) hex0-seed + hex0_<A>.hex0 -> hex0 # 1) hex0 + hex1_<A>.hex0 -> hex1 @@ -34,9 +38,8 @@ case "$ARCH" in *) echo "bootstrap.sh: unsupported arch '$ARCH'" >&2 ; exit 1 ;; esac -S=/work/live-bootstrap/seed/stage0-posix +S=build/upstream mkdir -p "$OUT" -cd "$S" # qemu-user amd64 workaround: the shipped hex0-seed and the hex0 it produces # both have a program header with p_flags=0x01 (PF_X only, no PF_R). A native @@ -47,7 +50,7 @@ cd "$S" # # This only affects foreign-arch builds on non-amd64 hosts; on a native amd64 # host the patch is a no-op (binary would load fine either way). 
-SEED=./bootstrap-seeds/POSIX/"$A"/hex0-seed +SEED="$S"/bootstrap-seeds/POSIX/"$A"/hex0-seed if [ "$ARCH" = amd64 ]; then cp "$SEED" "$OUT"/hex0-seed printf '\5' | dd of="$OUT"/hex0-seed bs=1 seek=68 count=1 conv=notrunc status=none @@ -55,13 +58,13 @@ if [ "$ARCH" = amd64 ]; then SEED="$OUT"/hex0-seed fi -"$SEED" "$A"/hex0_"$A".hex0 "$OUT"/hex0 +"$SEED" "$S"/"$A"/hex0_"$A".hex0 "$OUT"/hex0 if [ "$ARCH" = amd64 ]; then printf '\5' | dd of="$OUT"/hex0 bs=1 seek=68 count=1 conv=notrunc status=none fi -"$OUT"/hex0 "$A"/hex1_"$A".hex0 "$OUT"/hex1 -"$OUT"/hex1 "$A"/hex2_"$A".hex1 "$OUT"/hex2-0 -"$OUT"/"$CATM_ASM" "$A"/"$CATM_SRC" "$OUT"/catm -"$OUT"/catm "$OUT"/M0.hex2 "$A"/ELF-"$ARCH".hex2 "$A"/M0_"$A".hex2 +"$OUT"/hex0 "$S"/"$A"/hex1_"$A".hex0 "$OUT"/hex1 +"$OUT"/hex1 "$S"/"$A"/hex2_"$A".hex1 "$OUT"/hex2-0 +"$OUT"/"$CATM_ASM" "$S"/"$A"/"$CATM_SRC" "$OUT"/catm +"$OUT"/catm "$OUT"/M0.hex2 "$S"/"$A"/ELF-"$ARCH".hex2 "$S"/"$A"/M0_"$A".hex2 "$OUT"/hex2-0 "$OUT"/M0.hex2 "$OUT"/M0 diff --git a/populate-upstream.sh b/populate-upstream.sh @@ -0,0 +1,48 @@ +#!/bin/sh +# Copy the files bootstrap.sh needs from live-bootstrap's stage0-posix into +# build/upstream/, mirroring the upstream directory layout. Runs on the host: +# the podman invocations in the Makefile only mount curdir, so anything +# bootstrap.sh needs has to land inside curdir first. +# +# Inputs per arch (A in AArch64|AMD64|riscv64): +# bootstrap-seeds/POSIX/$A/hex0-seed +# $A/hex0_$A.hex0 +# $A/hex1_$A.hex0 +# $A/hex2_$A.hex1 +# $A/catm_$A.(hex1|hex2) extension differs across arches +# $A/M0_$A.hex2 +# $A/ELF-<arch>.hex2 used by both bootstrap.sh (M0.hex2 link) and +# the Makefile's final program link +# +# Usage: populate-upstream.sh [UPSTREAM] +# UPSTREAM: path to live-bootstrap checkout (default: ../live-bootstrap) +set -eu + +UPSTREAM=${1:-../live-bootstrap} +S="$UPSTREAM/seed/stage0-posix" +OUT=build/upstream + +if [ ! 
-d "$S" ]; then + echo "populate-upstream.sh: expected '$S' to exist" >&2 + exit 1 +fi + +for A in AArch64 AMD64 riscv64; do + case "$A" in + AArch64) arch=aarch64 ; CATM=catm_AArch64.hex1 ;; + AMD64) arch=amd64 ; CATM=catm_AMD64.hex2 ;; + riscv64) arch=riscv64 ; CATM=catm_riscv64.hex2 ;; + esac + + mkdir -p "$OUT/bootstrap-seeds/POSIX/$A" "$OUT/$A" + + cp "$S/bootstrap-seeds/POSIX/$A/hex0-seed" "$OUT/bootstrap-seeds/POSIX/$A/" + cp "$S/$A/hex0_$A.hex0" "$OUT/$A/" + cp "$S/$A/hex1_$A.hex0" "$OUT/$A/" + cp "$S/$A/hex2_$A.hex1" "$OUT/$A/" + cp "$S/$A/$CATM" "$OUT/$A/" + cp "$S/$A/M0_$A.hex2" "$OUT/$A/" + cp "$S/$A/ELF-$arch.hex2" "$OUT/$A/" +done + +echo "populate-upstream: copied into $OUT from $UPSTREAM" diff --git a/src/p1_gen.py b/src/p1_gen.py @@ -0,0 +1,1089 @@ +#!/usr/bin/env python3 +"""p1_gen.py — generate p1_<arch>.M1 from a per-arch encoder table. + +Single source of truth for the P1 DEFINE tables across all three target +arches. Running this script writes <build>/aarch64/p1_aarch64.M1 and the +amd64/riscv64 siblings (default <build> = "build"). + +Structure: + * Low-level native encoders (amd_*, aa_*, rv_*) — one bank of + helpers per arch. + * Encoder classes AA64/AMD64/RV64 (subclasses of Encoder): one + method per P1 op category, lowering (op, reg-tuple, imm) into + native hex. Each arch's encoder is a coherent bundle — adding a + new op means one new method on each of the three. + * Op dataclasses — thin rows holding the DEFINE's name + data. + Op.encode(enc) dispatches into enc.<op-method>() with the Op's + fields unpacked. No per-arch branching lives in Op classes. + * rows() — builds the output list. Non-RRR ops are emitted as the + full register product × a curated imm/offset/shamt set. RRR + keeps an explicit table (the full 8³ cube is 5.6k entries per + arch, >99% dead weight). Adding a new RRR triple or a new imm + value is a one-line edit to rows(); a new register combination + for any other op needs no edit at all. 
+ * emit(arch) / main — iterate rows, ask the arch's encoder to + lower each, write out the defs file. + +Running: + $ python3 p1_gen.py [build-root] # rewrite all three files + $ python3 p1_gen.py --check [build-root] # diff against current files +""" + +import os +import sys +from dataclasses import dataclass +from itertools import product +from typing import Optional + +ARCHES = ('aarch64', 'amd64', 'riscv64') + +## P1 GPRs (the 8 caller/callee-split registers exposed to P1 source). +P1_REGS = ('r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7') + +## ---------- Register mappings -------------------------------------------- +## P1 register name → native encoding number. The native numbers are what +## the per-arch encoders insert into instruction fields; the human-facing +## names (rax, x1, a2, …) never appear in this file. + +## 4:4 caller/callee-saved split. r0–r3 caller (native argregs); r4–r7 +## callee (native callee-saved). `br` is the hidden branch-target scratch +## (not a P1 reg) — picked so every op's expansion clobbers only what its +## name declares. +NAT_AA64 = {'r0': 0, 'r1': 1, 'r2': 2, 'r3': 3, + 'r4': 26, 'r5': 27, 'r6': 19, 'r7': 20, + 'br': 17, # x17 (IP1, caller-saved linker scratch) + 'sp': 31, 'xzr': 31, 'lr': 30, + 'x21': 21, 'x22': 22, 'x23': 23, 'x8': 8} + +## amd64 ModRM.reg/rm + REX.R/B bit: native regnums 0..15 with r8..r15 +## setting the REX bit. We store the 4-bit native number directly. 
NAT_AMD64 = {'r0': 0,   # rax
             'r1': 7,   # rdi
             'r2': 6,   # rsi
             'r3': 2,   # rdx
             'r4': 13,  # r13 (callee-saved)
             'r5': 14,  # r14 (callee-saved)
             'r6': 3,   # rbx
             'r7': 12,  # r12
             'br': 11,  # r11 — branch/call target scratch + DIV/REM r0 save
             'sp': 4,   # rsp
             'rcx': 1,  # shift-count scratch + DIV/REM rdx save (not a P1 reg)
             'r10': 10, # syscall arg4 slot (not a P1 reg)
             'r8': 8,   # syscall arg5 slot (not a P1 reg)
             'r9': 9,   # syscall arg6 slot (not a P1 reg)
             'r11': 11, # alias for br (some expansions spell it r11 directly)
             }

NAT_RV64 = {'r0': 10, 'r1': 11, 'r2': 12, 'r3': 13,
            'r4': 20, 'r5': 21, 'r6': 9, 'r7': 18,
            'br': 30,  # t5 (caller-saved temp)
            'sp': 2, 'ra': 1, 'zero': 0, 'a7': 17,
            's3': 19, 's6': 22, 's7': 23}


## ---------- Low-level encoding helpers -----------------------------------

def le32(n: int) -> str:
    """Uppercase hex of n as a little-endian 32-bit word."""
    return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()

def byte(n: int) -> str:
    """Uppercase hex of the low byte of n."""
    return f'{n & 0xFF:02X}'


## ---------- amd64 primitive encoders ------------------------------------
## amd64 is variable-length. Helpers below emit specific instruction
## shapes used by the P1 expansions. REX prefix bits: W=64b, R=ModRM.reg
## high, B=ModRM.rm high, X=SIB.index high (unused here).

def rex(w, r, x, b):
    v = 0x40 | (w << 3) | (r << 2) | (x << 1) | b
    return byte(v)

def modrm(mod, reg, rm):
    return byte((mod << 6) | ((reg & 7) << 3) | (rm & 7))

def amd_mov_rr(dst, src):
    """mov dst, src — REX.W + 89 /r (MOV r/m64, r64)."""
    d, s = NAT_AMD64[dst], NAT_AMD64[src]
    return rex(1, s >> 3, 0, d >> 3) + '89' + modrm(3, s, d)

def amd_alu_rr(op, dst, src):
    """op dst, src — 2-operand ALU. op is the opcode byte (01 add,
    29 sub, 21 and, 09 or, 31 xor)."""
    d, s = NAT_AMD64[dst], NAT_AMD64[src]
    return rex(1, s >> 3, 0, d >> 3) + op + modrm(3, s, d)

def amd_alu_ri8(ext, dst, imm):
    """op dst, imm8 (sign-extended). Opcode 83 /ext ib."""
    d = NAT_AMD64[dst]
    return rex(1, 0, 0, d >> 3) + '83' + modrm(3, ext, d) + byte(imm)

def amd_alu_ri32(ext, dst, imm):
    """op dst, imm32 (sign-extended). Opcode 81 /ext id. Used when
    an immediate doesn't fit in the imm8 form (e.g., ADDI with
    values outside [-128, 127])."""
    d = NAT_AMD64[dst]
    imm_le = (imm & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
    return rex(1, 0, 0, d >> 3) + '81' + modrm(3, ext, d) + imm_le

def amd_shift_ri8(ext, dst, imm):
    """shl/shr/sar dst, imm8. Opcode C1 /ext ib."""
    d = NAT_AMD64[dst]
    return rex(1, 0, 0, d >> 3) + 'C1' + modrm(3, ext, d) + byte(imm)

def amd_shift_cl(ext, dst):
    """shl/shr/sar dst, cl. Opcode D3 /ext."""
    d = NAT_AMD64[dst]
    return rex(1, 0, 0, d >> 3) + 'D3' + modrm(3, ext, d)

def amd_imul_rr(dst, src):
    """imul dst, src — 0F AF /r."""
    d, s = NAT_AMD64[dst], NAT_AMD64[src]
    return rex(1, d >> 3, 0, s >> 3) + '0FAF' + modrm(3, d, s)

def amd_idiv(src):
    """idiv src — F7 /7 (signed div of rdx:rax by src)."""
    s = NAT_AMD64[src]
    return rex(1, 0, 0, s >> 3) + 'F7' + modrm(3, 7, s)

def amd_cqo():
    """cqo — sign-extend rax into rdx:rax. 48 99."""
    return '4899'

def amd_mem_rm(opcode, reg, base, disp):
    """[base+disp] <-> reg, for MOV r,r/m or MOV r/m,r (opcode=89 store, 8B load).
    disp is signed int; encodes as disp8 if in range, else disp32."""
    r, b = NAT_AMD64[reg], NAT_AMD64[base]
    prefix = rex(1, r >> 3, 0, b >> 3) + opcode
    if -128 <= disp <= 127:
        mod = 1
        d = byte(disp)
    else:
        mod = 2
        d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
    # rsp as base requires a SIB byte: rm=4 in ModRM means "SIB follows",
    # and SIB 0x24 encodes base=rsp with no index.
    if b == 4:
        return prefix + modrm(mod, r, 4) + '24' + d
    return prefix + modrm(mod, r, b) + d

def amd_mov_rm_b(reg, base, disp, store):
    """Byte load/store. 88 /r (store), 0F B6 /r (movzx load)."""
    r, b = NAT_AMD64[reg], NAT_AMD64[base]
    if -128 <= disp <= 127:
        mod = 1
        d = byte(disp)
    else:
        mod = 2
        d = (disp & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
    sib = '24' if b == 4 else ''       # rsp base needs a SIB byte
    rmv = 4 if b == 4 else b
    if store:
        # MOV r/m8, r8 — 88 /r. Requires REX to address dil/sil/bpl/spl.
        prefix = rex(1, r >> 3, 0, b >> 3) + '88'
        return prefix + modrm(mod, r, rmv) + sib + d
    # MOVZX r64, r/m8 — REX.W 0F B6 /r.
    prefix = rex(1, r >> 3, 0, b >> 3) + '0FB6'
    return prefix + modrm(mod, r, rmv) + sib + d


## ---------- aarch64 primitive encoders ----------------------------------
## aarch64 is fixed 4-byte insns. Helpers return the 4 bytes LE-encoded.

def aa_rrr(base, rD, rA, rB):
    d, a, b = NAT_AA64[rD], NAT_AA64[rA], NAT_AA64[rB]
    return le32(base | (b << 16) | (a << 5) | d)

def aa_add_imm(rD, rA, imm12, sub=False):
    """ADD/SUB (immediate, shift=0). imm12 unsigned 0..4095."""
    d, a = NAT_AA64[rD], NAT_AA64[rA]
    base = 0xD1000000 if sub else 0x91000000
    return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)

def aa_logical_imm(base, rD, rA, N, immr, imms):
    d, a = NAT_AA64[rD], NAT_AA64[rA]
    return le32(base | (N << 22) | (immr << 16) | (imms << 10) | (a << 5) | d)

def aa_ubfm(rD, rA, immr, imms):
    """UBFM (N=1 for sf=64)."""
    d, a = NAT_AA64[rD], NAT_AA64[rA]
    return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)

def aa_sbfm(rD, rA, immr, imms):
    """SBFM (N=1 for sf=64)."""
    d, a = NAT_AA64[rD], NAT_AA64[rA]
    return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)

def aa_ldst_uimm12(base, rT, rN, off_bytes, size_log2):
    """LDR/STR (unsigned offset). off_bytes must be a multiple of
    2^size_log2 and non-negative. imm12 = off_bytes >> size_log2."""
    assert off_bytes >= 0 and (off_bytes % (1 << size_log2)) == 0
    imm12 = off_bytes >> size_log2
    assert 0 <= imm12 < 4096
    t, n = NAT_AA64[rT], NAT_AA64[rN]
    return le32(base | (imm12 << 10) | (n << 5) | t)

def aa_ldst_unscaled(base, rT, rN, off):
    """LDUR/STUR (unscaled, signed imm9). Handles arbitrary small
    offsets — negative, or positive-but-not-a-multiple-of-the-access-
    size (e.g. LD at offset 7). imm9 range is [-256, 255]."""
    assert -256 <= off <= 255
    imm9 = off & 0x1FF
    t, n = NAT_AA64[rT], NAT_AA64[rN]
    return le32(base | (imm9 << 12) | (n << 5) | t)


## ---------- riscv64 primitive encoders -----------------------------------

def rv_r(base, rD, rA, rB):
    d, a, b = NAT_RV64[rD], NAT_RV64[rA], NAT_RV64[rB]
    return le32(base | (b << 20) | (a << 15) | (d << 7))

def rv_i(base, rD, rA, imm12):
    """I-type: imm12[11:0], rs1, funct3, rd, opcode. imm12 is a signed
    int that gets masked to 12 bits."""
    d, a = NAT_RV64[rD], NAT_RV64[rA]
    return le32(base | ((imm12 & 0xFFF) << 20) | (a << 15) | (d << 7))

def rv_s(base, rS, rA, imm12):
    """S-type store: imm12[11:5] rs2 rs1 funct3 imm12[4:0] opcode."""
    s, a = NAT_RV64[rS], NAT_RV64[rA]
    hi = (imm12 >> 5) & 0x7F
    lo = imm12 & 0x1F
    return le32(base | (hi << 25) | (s << 20) | (a << 15) | (lo << 7))

def rv_shift_imm(base, rD, rA, shamt):
    """Shift-imm: base already has funct7 set; shamt in [0,63]."""
    d, a = NAT_RV64[rD], NAT_RV64[rA]
    return le32(base | ((shamt & 0x3F) << 20) | (a << 15) | (d << 7))


## ---------- Per-arch op base tables -------------------------------------

AA64_RRR_BASE = {
    'ADD': 0x8B000000,
    'SUB': 0xCB000000,
    'AND': 0x8A000000,
    'OR':  0xAA000000,
    'XOR': 0xCA000000,
    'SHL': 0x9AC02000,
    'SHR': 0x9AC02400,
    'SAR': 0x9AC02800,
    'DIV': 0x9AC00C00,
}
AMD64_RRR_OPC = {
    'ADD': '01', 'SUB': '29', 'AND': '21', 'OR': '09', 'XOR': '31',
}
RV_RRR = {
    'ADD': 0x00000033,  # funct7=0 funct3=0 opcode=0x33
    'SUB': 0x40000033,
    'XOR': 0x00004033,
    'OR':  0x00006033,
    'AND': 0x00007033,
    'SHL': 0x00001033,
    'SHR': 0x00005033,
    'SAR': 0x40005033,
    'MUL': 0x02000033,
    'DIV': 0x02004033,
    'REM': 0x02006033,
}


## aarch64 bitmask-immediate encoding for ANDI/ORI. Entries are the
## (N, immr, imms) triples that encode each small imm as an aarch64
## "logical immediate." Computed by hand because the full encoding
## algorithm (contiguous-run + rotation for element sizes
## 2/4/8/16/32/64) is substantial and we only need a handful of
## values. Extend this table if a new imm shows up in P1 source.
AA64_LOGI_ENC = {
    1: (1, 0, 0),    # 0b0001 — single bit at position 0
    2: (1, 63, 0),   # 0b0010 — single bit at position 1
    3: (1, 0, 1),    # 0b0011 — 2 contiguous ones
    4: (1, 62, 0),   # 0b0100 — single bit at position 2
    6: (1, 63, 1),   # 0b0110 — 2 ones rotated by 1
    7: (1, 0, 2),    # 0b0111 — 3 contiguous ones
    8: (1, 61, 0),   # 0b1000 — single bit at position 3
}


## Frame layout after PROLOGUE_Nk (k >= 1, rounded up so total frame
## bytes stay 16-byte aligned on aarch64):
##   [sp + 0]   = retaddr (aarch64 lr / riscv64 ra / amd64 retaddr)
##   [sp + 8]   = slot 1 (callee-private scratch)
##   [sp + 16]  = slot 2
##   ...
##   [sp + 8*k] = slot k
##
## Frame size = round_up_to_16(8 + 8*k). So k=1 → 16, k=2 → 24 → 32,
## k=3 → 32, k=4 → 40 → 48.

def prologue_frame_bytes(k: int) -> int:
    raw = 8 + 8 * k
    return (raw + 15) & ~15


## ---------- Encoders ----------------------------------------------------
## One class per arch. Each provides one method per P1 op category,
## mapping (op, reg-tuple, imm) to native bytes. Op classes dispatch
## here via `Op.encode(enc)` → `enc.<method>(fields)`.

class Encoder:
    """Per-arch encoder base. Subclasses implement one method per
    op category. `arch` is used by literal() to pick the right
    pre-encoded bytes from an arch-keyed dict."""
    arch = ''

    def literal(self, hex_by_arch):
        return hex_by_arch[self.arch]


class AA64(Encoder):
    arch = 'aarch64'

    def rrr(self, op, rD, rA, rB):
        if op == 'MUL':
            # MUL = MADD with Ra=xzr. 100 11011 000 mmmmm 0 aaaaa nnnnn ddddd
            d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
            return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
        if op == 'REM':
            # SDIV x16, xA, xB ; MSUB xD, x16, xB, xA.
            # x16 (ARM IP0, caller-saved, not a P1 reg) is scratch so
            # REM does not hidden-clobber P1 r4 — the op modifies rD only.
            # MSUB needs bit 15 set (o0=1); without it it decodes as
            # MADD and REM returns A + (A/B)*B.
            # MSUB reads all sources before writing Rd, so rD aliasing
            # rA or rB is safe here.
            d = NAT_AA64[rD]; a = NAT_AA64[rA]; b = NAT_AA64[rB]
            SC = 16
            sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | SC
            msub = 0x9B008000 | (b << 16) | (a << 10) | (SC << 5) | d
            return le32(sdiv) + le32(msub)
        return aa_rrr(AA64_RRR_BASE[op], rD, rA, rB)

    def addi(self, rD, rA, imm):
        if imm >= 0:
            return aa_add_imm(rD, rA, imm, sub=False)
        return aa_add_imm(rD, rA, -imm, sub=True)

    def logi(self, op, rD, rA, imm):
        N, immr, imms = AA64_LOGI_ENC[imm]
        base = 0x92000000 if op == 'ANDI' else 0xB2000000  # ORI = orr
        return aa_logical_imm(base, rD, rA, N, immr, imms)

    def shifti(self, op, rD, rA, imm):
        if op == 'SHLI':
            return aa_ubfm(rD, rA, (-imm) & 63, 63 - imm)
        if op == 'SHRI':
            return aa_ubfm(rD, rA, imm, 63)
        if op == 'SARI':
            return aa_sbfm(rD, rA, imm, 63)

    def mov(self, rD, rA):
        if rA == 'sp':
            return aa_add_imm(rD, 'sp', 0, sub=False)
        # MOV xD, xA = ORR xD, xzr, xA
        d = NAT_AA64[rD]; a = NAT_AA64[rA]
        return le32(0xAA000000 | (a << 16) | (31 << 5) | d)

    def li(self, rD):
        # ldr wD, [pc+8] ; b +8 (caller emits 4 bytes of data next)
        d = NAT_AA64[rD]
        ldr_w_lit = 0x18000040 | d  # LDR (literal) 32-bit, offset 8
        b_plus8 = 0x14000002        # B offset 8 (imm26 = 2 words = 8 bytes)
        return le32(ldr_w_lit) + le32(b_plus8)

    def la(self, rD):
        return self.li(rD)

    def mem(self, op, rT, rN, off):
        # Pick uimm12 (scaled, large range) when the offset is a
        # non-negative multiple of the access width; otherwise fall
        # back to the unscaled signed-imm9 form (covers negative
        # offsets and positive-but-misaligned ones like 7).
        BASES = {
            'LD': (0xF9400000, 3, 0xF8400000),
            'ST': (0xF9000000, 3, 0xF8000000),
            'LB': (0x39400000, 0, 0x38400000),
            'SB': (0x39000000, 0, 0x38000000),
        }
        uimm_base, size_log2, unscaled_base = BASES[op]
        scale = 1 << size_log2
        if off >= 0 and (off % scale) == 0:
            return aa_ldst_uimm12(uimm_base, rT, rN, off, size_log2)
        return aa_ldst_unscaled(unscaled_base, rT, rN, off)

    def b(self):
        return le32(0xD61F0000 | (NAT_AA64['br'] << 5))  # BR x17

    def condb(self, op, rA, rB):
        # cmp xA, xB = SUBS xzr, xA, xB (0xEB000000 base, rD=31).
        # Skip when NOT cond holds. BEQ→NE(1), BNE→EQ(0), BLT→GE(A).
        a = NAT_AA64[rA]; b_ = NAT_AA64[rB]
        cmp_ = le32(0xEB000000 | (b_ << 16) | (a << 5) | 31)
        cond = {'BEQ': 1, 'BNE': 0, 'BLT': 10}[op]
        bcond = le32(0x54000040 | cond)
        br = le32(0xD61F0000 | (NAT_AA64['br'] << 5))
        return cmp_ + bcond + br

    def call(self):
        return le32(0xD63F0000 | (NAT_AA64['br'] << 5))  # BLR x17

    def ret(self):
        return le32(0xD65F03C0)  # RET (= br x30)

    def prologue(self, k):
        fb = prologue_frame_bytes(k)
        sub = aa_add_imm('sp', 'sp', fb, sub=True)
        str_lr = aa_ldst_uimm12(0xF9000000, 'lr', 'sp', 0, 3)
        return sub + str_lr

    def epilogue(self, k):
        fb = prologue_frame_bytes(k)
        ldr_lr = aa_ldst_uimm12(0xF9400000, 'lr', 'sp', 0, 3)
        add = aa_add_imm('sp', 'sp', fb, sub=False)
        return ldr_lr + add

    def tail(self, k):
        return self.epilogue(k) + self.b()


class AMD64(Encoder):
    arch = 'amd64'

    def rrr(self, op, rD, rA, rB):
        # Two-operand x86 forces a `mov rD,rA ; op rD,rB` lowering.
        # That clobbers rB before it is read whenever rB aliases rD
        # (and rD != rA) — e.g. SUB r3,r0,r3 would compute 0, not
        # r0-r3. In that case stash rB in rcx (documented scratch,
        # never a P1 reg) first. Non-aliased encodings are unchanged.
        if op == 'MUL':
            if rB == rD and rD != rA:
                return (amd_mov_rr('rcx', rB) + amd_mov_rr(rD, rA)
                        + amd_imul_rr(rD, 'rcx'))
            return amd_mov_rr(rD, rA) + amd_imul_rr(rD, rB)
        if op in ('DIV', 'REM'):
            # x86 idiv implicitly reads/writes rax (P1 r0) and rdx
            # (P1 r3). To keep DIV/REM clobber-free (only rD changes),
            # stash r0 into r11 and r3 into rcx — neither is a P1 reg —
            # then restore. If rA or rB alias r0/r3, read from the
            # saved copy since we've overwritten the originals.
            # Skip the final restore for whichever of r0/r3 *is* rD,
            # so rD keeps its newly computed value.
            seq = amd_mov_rr('r11', 'r0')   # save r0 (rax)
            seq += amd_mov_rr('rcx', 'r3')  # save r3 (rdx)
            src_a = 'r11' if rA == 'r0' else ('rcx' if rA == 'r3' else rA)
            seq += amd_mov_rr('r0', src_a)  # rax = rA
            seq += amd_cqo()                # rdx:rax = sign-ext rax
            src_b = 'r11' if rB == 'r0' else ('rcx' if rB == 'r3' else rB)
            seq += amd_idiv(src_b)
            seq += amd_mov_rr(rD, 'r0' if op == 'DIV' else 'r3')
            if rD != 'r3':
                seq += amd_mov_rr('r3', 'rcx')
            if rD != 'r0':
                seq += amd_mov_rr('r0', 'r11')
            return seq
        if op in ('SHL', 'SHR', 'SAR'):
            ext = {'SHL': 4, 'SHR': 5, 'SAR': 7}[op]
            if rB == rD and rD != rA:
                # Load the count into cl before `mov rD,rA` destroys it.
                seq = amd_mov_rr('rcx', rB) + amd_mov_rr(rD, rA)
            else:
                seq = amd_mov_rr(rD, rA) + amd_mov_rr('rcx', rB)
            return seq + amd_shift_cl(ext, rD)
        # ADD/SUB/AND/OR/XOR: mov rD,rA ; op rD,rB (with alias guard).
        if rB == rD and rD != rA:
            return (amd_mov_rr('rcx', rB) + amd_mov_rr(rD, rA)
                    + amd_alu_rr(AMD64_RRR_OPC[op], rD, 'rcx'))
        seq = amd_mov_rr(rD, rA)
        seq += amd_alu_rr(AMD64_RRR_OPC[op], rD, rB)
        return seq

    def addi(self, rD, rA, imm):
        # mov rD,rA ; add rD,imm. Use imm8 form when it fits
        # ([-128, 127]); otherwise emit the imm32 form.
        seq = amd_mov_rr(rD, rA)
        if -128 <= imm <= 127:
            seq += amd_alu_ri8(0, rD, imm)  # /0 = ADD
        else:
            seq += amd_alu_ri32(0, rD, imm)
        return seq

    def logi(self, op, rD, rA, imm):
        ext = {'ANDI': 4, 'ORI': 1}[op]
        seq = amd_mov_rr(rD, rA)
        seq += amd_alu_ri8(ext, rD, imm)
        return seq

    def shifti(self, op, rD, rA, imm):
        ext = {'SHLI': 4, 'SHRI': 5, 'SARI': 7}[op]
        seq = amd_mov_rr(rD, rA)
        seq += amd_shift_ri8(ext, rD, imm)
        return seq

    def mov(self, rD, rA):
        return amd_mov_rr(rD, rA)

    def li(self, rD):
        # mov <rD as r32>, imm32 — opcode B8+r (with REX.B if r8..r15)
        d = NAT_AMD64[rD]
        if d >= 8:
            return '41' + byte(0xB8 + (d & 7))
        return byte(0xB8 + d)

    def la(self, rD):
        return self.li(rD)

    def mem(self, op, rT, rN, off):
        if op == 'LD': return amd_mem_rm('8B', rT, rN, off)
        if op == 'ST': return amd_mem_rm('89', rT, rN, off)
        if op == 'LB': return amd_mov_rm_b(rT, rN, off, store=False)
        if op == 'SB': return amd_mov_rm_b(rT, rN, off, store=True)

    def b(self):
        return '41FFE3'  # jmp r11

    def condb(self, op, rA, rB):
        a, b_ = NAT_AMD64[rA], NAT_AMD64[rB]
        # cmp rA, rB — opcode 39 /r with rA as r/m
        cmp_ = rex(1, b_ >> 3, 0, a >> 3) + '39' + modrm(3, b_, a)
        # jcc rel8 opcode, skip=3 (past jmp r11):
        #   BEQ→JNE 75 03 ; BNE→JE 74 03 ; BLT→JGE 7D 03
        jop = {'BEQ': '75', 'BNE': '74', 'BLT': '7D'}[op]
        return cmp_ + jop + '03' + '41FFE3'  # jmp r11

    def call(self):
        return '41FFD3'  # call r11

    def ret(self):
        return 'C3'

    def prologue(self, k):
        # pop rcx ; sub rsp,fb ; push rcx. rcx is the retaddr-carry
        # scratch — caller-save, never a P1 reg. r11 (= 'br') is
        # off-limits because TAIL = EPILOGUE + `jmp r11`, and using
        # r11 here would clobber the LI_BR-loaded tail target.
        fb = prologue_frame_bytes(k)
        assert fb <= 127
        return '59' + '4883EC' + byte(fb) + '51'

    def epilogue(self, k):
        # Mirror of prologue: pop rcx ; add rsp,fb ; push rcx.
        fb = prologue_frame_bytes(k)
        assert fb <= 127
        return '59' + '4883C4' + byte(fb) + '51'

    def tail(self, k):
        return self.epilogue(k) + self.b()


class RV64(Encoder):
    arch = 'riscv64'

    def rrr(self, op, rD, rA, rB):
        return rv_r(RV_RRR[op], rD, rA, rB)

    def addi(self, rD, rA, imm):
        return rv_i(0x00000013, rD, rA, imm)

    def logi(self, op, rD, rA, imm):
        base = {'ANDI': 0x00007013, 'ORI': 0x00006013}[op]
        return rv_i(base, rD, rA, imm)

    def shifti(self, op, rD, rA, imm):
        base = {'SHLI': 0x00001013, 'SHRI': 0x00005013, 'SARI': 0x40005013}[op]
        return rv_shift_imm(base, rD, rA, imm)

    def mov(self, rD, rA):
        return rv_i(0x00000013, rD, rA, 0)  # addi rD, rA, 0

    def li(self, rD):
        # auipc rD,0 ; lwu rD,12(rD) ; jal x0,+8
        d = NAT_RV64[rD]
        auipc = 0x00000017 | (d << 7)
        lwu = 0x00006003 | (d << 7) | (d << 15) | (12 << 20)
        jal_p8 = 0x0080006F
        return le32(auipc) + le32(lwu) + le32(jal_p8)

    def la(self, rD):
        return self.li(rD)

    def mem(self, op, rT, rN, off):
        # funct3: LD=3, ST=3, LBU=4, SB=0. Opcodes: load=03, store=23.
        if op == 'LD': return rv_i(0x00003003, rT, rN, off)
        if op == 'ST': return rv_s(0x00003023, rT, rN, off)
        if op == 'LB': return rv_i(0x00004003, rT, rN, off)  # LBU
        if op == 'SB': return rv_s(0x00000023, rT, rN, off)

    def b(self):
        return le32(0x00000067 | (NAT_RV64['br'] << 15))  # jalr x0, 0(t5)

    def condb(self, op, rA, rB):
        # B<inv> rA, rB, +8 ; jalr x0, 0(t5). funct3 picks the op:
        # BEQ→BNE(1), BNE→BEQ(0), BLT→BGE(5).
        a, b_ = NAT_RV64[rA], NAT_RV64[rB]
        funct3 = {'BEQ': 1, 'BNE': 0, 'BLT': 5}[op]
        insn = 0x00000063 | (funct3 << 12) | (a << 15) | (b_ << 20) | (8 << 7)
        jalr = 0x00000067 | (NAT_RV64['br'] << 15)
        return le32(insn) + le32(jalr)

    def call(self):
        return le32(0x000000E7 | (NAT_RV64['br'] << 15))  # jalr ra, 0(t5)

    def ret(self):
        return le32(0x00008067)  # jalr x0, 0(ra)

    def prologue(self, k):
        fb = prologue_frame_bytes(k)
        sub = rv_i(0x00000013, 'sp', 'sp', -fb)
        sd = rv_s(0x00003023, 'ra', 'sp', 0)
        return sub + sd

    def epilogue(self, k):
        fb = prologue_frame_bytes(k)
        ld = rv_i(0x00003003, 'ra', 'sp', 0)
        add = rv_i(0x00000013, 'sp', 'sp', fb)
        return ld + add

    def tail(self, k):
        return self.epilogue(k) + self.b()


ENCODERS = {'aarch64': AA64(), 'amd64': AMD64(), 'riscv64': RV64()}


## ---------- Op dataclasses ----------------------------------------------
## Thin wrappers: each row holds its DEFINE name + the data needed to
## reconstruct the encoding. `encode(enc)` calls the matching method
## on the arch's encoder.

@dataclass
class Op:
    name: str
    comment: str = ''

    def encode(self, enc: Encoder) -> str:
        raise NotImplementedError

@dataclass
class RRR(Op):
    op: str = ''
    rD: str = ''
    rA: str = ''
    rB: str = ''
    def encode(self, enc):
        return enc.rrr(self.op, self.rD, self.rA, self.rB)

@dataclass
class AddI(Op):
    rD: str = ''
    rA: str = ''
    imm: int = 0
    def encode(self, enc):
        return enc.addi(self.rD, self.rA, self.imm)

@dataclass
class LogI(Op):
    op: str = ''  # ANDI / ORI
    rD: str = ''
    rA: str = ''
    imm: int = 0
    def encode(self, enc):
        return enc.logi(self.op, self.rD, self.rA, self.imm)

@dataclass
class ShiftI(Op):
    op: str = ''  # SHLI / SHRI / SARI
    rD: str = ''
    rA: str = ''
    imm: int = 0
    def encode(self, enc):
        return enc.shifti(self.op, self.rD, self.rA, self.imm)

@dataclass
class Mov(Op):
    rD: str = ''
    rA: str = ''
    def encode(self, enc):
        return enc.mov(self.rD, self.rA)

@dataclass
class Li(Op):
    rD: str = ''
    def encode(self, enc):
        return enc.li(self.rD)

@dataclass
class La(Op):
    rD: str = ''
    def encode(self, enc):
        return enc.la(self.rD)

@dataclass
class Mem(Op):
    op: str = ''  # LD / ST / LB / SB
    rT: str = ''
    rN: str = ''
    off: int = 0
    def encode(self, enc):
        return enc.mem(self.op, self.rT, self.rN, self.off)

@dataclass
class B(Op):
    def encode(self, enc):
        return enc.b()

@dataclass
class CondB(Op):
    op: str = ''  # BEQ / BNE / BLT
    rA: str = ''
    rB: str = ''
    def encode(self, enc):
        return enc.condb(self.op, self.rA, self.rB)

@dataclass
class Literal(Op):
    hex_by_arch: Optional[dict] = None
    def encode(self, enc):
        return enc.literal(self.hex_by_arch)

@dataclass
class Prologue(Op):
    k: int = 1
    def encode(self, enc):
        return enc.prologue(self.k)

@dataclass
class Epilogue(Op):
    k: int = 1
    def encode(self, enc):
        return enc.epilogue(self.k)

@dataclass
class Tail(Op):
    k: int = 1
    def encode(self, enc):
        return enc.tail(self.k)

@dataclass
class Call(Op):
    """CALL — arch-specific indirect call through the branch scratch."""
    def encode(self, enc):
        return enc.call()

@dataclass
class Ret(Op):
    """RET — return through the saved return address."""
    def encode(self, enc):
        return enc.ret()


## ---------- SYSCALL pre-encoded sequences -------------------------------
## The one-shot syscall wrapper. Shuffles P1's r0=num, r1–r6=args into
## each arch's native syscall ABI and clobbers only r0 on return.
## Encoded by hand (per P1.md §"Syscall conventions").

SYSCALL_HEX = {
    'aarch64': (
        # r4/r5 sit in callee-saved natives (x26/x27), which the kernel
        # preserves across svc — no save/restore for them. Only r1/r2/r3
        # (caller-saved x1/x2/x3) need stashing around the shuffle.
        ''.join([
            le32(0xAA0003E8),  # mov x8, x0   (syscall number)
            le32(0xAA0103F5),  # mov x21, x1  (save r1)
            le32(0xAA0203F6),  # mov x22, x2  (save r2)
            le32(0xAA0303F7),  # mov x23, x3  (save r3)
            le32(0xAA1503E0),  # mov x0, x21  (arg1 = r1)
            le32(0xAA1603E1),  # mov x1, x22  (arg2 = r2)
            le32(0xAA1703E2),  # mov x2, x23  (arg3 = r3)
            le32(0xAA1A03E3),  # mov x3, x26  (arg4 = r4)
            le32(0xAA1B03E4),  # mov x4, x27  (arg5 = r5)
            le32(0xAA1303E5),  # mov x5, x19  (arg6 = r6)
            le32(0xD4000001),  # svc #0
            le32(0xAA1503E1),  # mov x1, x21  (restore r1)
            le32(0xAA1603E2),  # mov x2, x22
            le32(0xAA1703E3),  # mov x3, x23
        ])
    ),
    # r4=r13, r5=r14 are callee-saved natively, but syscall wants args
    # 4/5 in r10/r8. r6=rbx, but arg6 lives in r9. Three shuffle moves,
    # then syscall. The kernel preserves rdi/rsi/rdx/r12–r15/rbx, so no
    # P1 reg is clobbered beyond r0 (syscall return).
    'amd64': '4D89EA' + '4D89F0' + '4989D9' + '0F05',
    'riscv64': (
        # Same story as aarch64: r4/r5 in callee-saved s4/s5 (=x20/x21),
        # so we only save/restore a1/a2/a3. Scratch slots: s3, s6, s7.
        ''.join([
            le32(0x00050893),  # mv a7, a0  (syscall number)
            le32(0x00058993),  # mv s3, a1  (save r1)
            le32(0x00060B13),  # mv s6, a2  (save r2)
            le32(0x00068B93),  # mv s7, a3  (save r3)
            le32(0x00098513),  # mv a0, s3  (arg1 = r1)
            le32(0x000B0593),  # mv a1, s6  (arg2 = r2)
            le32(0x000B8613),  # mv a2, s7  (arg3 = r3)
            le32(0x000A0693),  # mv a3, s4  (arg4 = r4)
            le32(0x000A8713),  # mv a4, s5  (arg5 = r5)
            le32(0x00048793),  # mv a5, s1  (arg6 = r6)
            le32(0x00000073),  # ecall
            le32(0x00098593),  # mv a1, s3  (restore r1)
            le32(0x000B0613),  # mv a2, s6
            le32(0x000B8693),  # mv a3, s7
        ])
    ),
}

## Syscall numbers (little-endian 32-bit for LI operand).
## aarch64 and riscv64 share the asm-generic table; amd64 has its own.
##
## Portability notes — each entry exists on all three arches with the
## same semantics under the uniform P1 SYSCALL convention (r0 = num,
## r1-r6 = args):
##   - `fork` is amd64-only; `wait4` is asm-generic 32-bit compat only.
##     Use `clone(SIGCHLD)` and `waitid` instead.
##   - `open` is amd64-only (removed from asm-generic). Use `openat`
##     with dirfd = AT_FDCWD (-100) as arg1.
##   - `clone` arg order differs: amd64 is (flags, stack, ptid, ctid,
##     tls); aarch64/riscv64 are (flags, stack, ptid, tls, ctid).
##     Benign when ptid/ctid/tls are all zero (the fork-equivalent case).
SYS_NUM = {
    'aarch64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
                'SYS_OPENAT': 56,
                'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
    'amd64':   {'SYS_WRITE': 1, 'SYS_EXIT': 60, 'SYS_READ': 0, 'SYS_CLOSE': 3,
                'SYS_OPENAT': 257,
                'SYS_CLONE': 56, 'SYS_EXECVE': 59, 'SYS_WAITID': 247},
    'riscv64': {'SYS_WRITE': 64, 'SYS_EXIT': 93, 'SYS_READ': 63, 'SYS_CLOSE': 57,
                'SYS_OPENAT': 56,
                'SYS_CLONE': 220, 'SYS_EXECVE': 221, 'SYS_WAITID': 95},
}


## ---------- Canonical imm/offset/shamt sets -----------------------------
## Enumerated instead of sigil-passed: M1's DEFINE substitutes hex
## bytes verbatim, so every distinct imm value needs its own DEFINE.
## These cover every value used across hello/demo/lisp/kaem-minimal
## plus a little headroom. Extend when a new value appears in P1 src.

## ADDI imms. NEG48/48 handle the ASCII '0' bias; the rest cover tag
## stripping and loop counters. Full reg product × this set = 8²×N.
ADDI_IMMS = (-48, -8, -7, -6, -5, -4, -3, -2, -1,
             1, 2, 3, 4, 5, 6, 7, 8, 48)

## Shift amounts (for SHLI/SHRI/SARI). 32/52 implement low-N-bit masks
## (length field extraction; 4096-slot symbol-table index); the small
## values scale-by-N for byte offsets and fixnum encode/decode.
SHIFT_IMMS = (1, 2, 3, 5, 16, 32, 52)

## ANDI/ORI imms. Every entry must appear in AA64_LOGI_ENC.
LOGI_IMMS = (1, 2, 3, 4, 6, 7, 8)

## Memory offsets for LD/ST/LB/SB. 0/8/16/24/32 cover slot offsets in
## N-slot frames and common struct fields; 7 is the NUL terminator
## position inside an 8-byte zero-padded slot; -8 reaches one slot
## below the current base.
MEM_OFFS = (-8, 0, 7, 8, 16, 24, 32)

CONDB_OPS = ('BEQ', 'BNE', 'BLT')
SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
LOGI_OPS = ('ANDI', 'ORI')
MEM_OPS = ('LD', 'ST', 'LB', 'SB')


## Curated RRR triples. The full cube is 11 ops × 8³ regs = 5632
## entries per arch — >99% would be dead weight. Each tuple below
## is one actually used by hello/demo/lisp/kaem-minimal. Lint
## catches missing triples on assembly; add a line here and
## regenerate.
RRR_TABLE = (
    # demo/lisp step-1 arith cube
    ('ADD','r1','r1','r2'), ('ADD','r1','r1','r4'),
    ('ADD','r2','r2','r6'), ('ADD','r2','r3','r1'),
    ('SUB','r1','r1','r2'), ('SUB','r2','r2','r6'),
    ('AND','r1','r1','r5'),
    ('OR', 'r1','r1','r2'),
    ('XOR','r1','r1','r2'),
    ('MUL','r1','r1','r2'),
    ('DIV','r1','r1','r2'),
    ('REM','r1','r1','r5'),
    ('SHL','r1','r1','r2'),
    ('SHR','r1','r1','r2'),
    ('SAR','r4','r4','r2'),
    # alloc / pointer arithmetic
    ('ADD','r2','r0','r1'),
    ('ADD','r0','r0','r3'),
    ('ADD','r2','r2','r0'),
    ('ADD','r2','r2','r1'),
    ('SUB','r3','r3','r0'),
    # reader / display index+offset fold
    ('ADD','r6','r1','r2'),
    ('ADD','r6','r6','r0'),
    ('ADD','r7','r1','r2'),
    ('SUB','r2','r1','r6'),
    ('SUB','r3','r1','r6'),
    ('REM','r1','r1','r2'),
    # kaem-minimal bump-pointer + accumulator updates
    ('ADD','r1','r1','r0'),
    ('ADD','r5','r5','r0'),
    ('ADD','r7','r7','r0'),
    ('SUB','r3','r3','r2'),
    ('SUB','r6','r6','r0'),
    # Primitive bodies (LISP.md step 10c). Convention: r1=argc,
    # r2=argv (both input), r3=accumulator, r0=scratch/return.
    # Variadic folds: (r3 = r3 op r0). Unary negate / bit-not:
    # (r3 = r0 - r3) with r0 = 0 or -1. arithmetic-shift k-negate:
    # (r0 = r1 - r0) with r1 = 0 after argc is consumed.
    ('ADD','r3','r3','r0'),
    ## ('SUB','r3','r3','r0') — already above (kaem-minimal row)
    ('SUB','r3','r0','r3'),
    ('SUB','r0','r1','r0'),
    ('MUL','r3','r3','r0'),
    ('DIV','r3','r3','r0'),
    ('REM','r3','r3','r0'),
    ('AND','r3','r3','r0'),
    ('OR', 'r3','r3','r0'),
    ('XOR','r3','r3','r0'),
    ('SHL','r3','r3','r0'),
    ('SAR','r3','r3','r0'),
)


## ---------- Row assembly ------------------------------------------------

HEADER = """## p1_{arch}.M1 — GENERATED by p1_gen.py. Do not edit by hand.
##
## Shared op-table lives in p1_gen.py; each arch's encoder lowers
## (op, register-tuple, imm) rows into native bytes. See P1.md for the
## ISA spec and register mapping.
"""

@dataclass
class Banner:
    """Section divider in the emitted .M1 file (no DEFINE of its own)."""
    text: str


def _imm_suf(imm):
    """DEFINE-name suffix for a signed immediate: 5 → '5', -5 → 'NEG5'."""
    return f'{imm}' if imm >= 0 else f'NEG{-imm}'


def rows():
    """Build the ordered, arch-independent list of Banner and Op rows.

    Every arch emits exactly this sequence; only the byte encodings
    differ. Order matters — the generated file must be stable."""
    defs = []
    add = defs.append

    # LI / LA — wide literal and address loads.
    add(Banner('LI / LA — load 4-byte zero-extended literal or label addr'))
    for rd in P1_REGS:
        add(Li(name=f'LI_{rd.upper()}', rD=rd))
    # LI_BR targets the hidden branch scratch (x17/r11/t5); every
    # branch/call site is `LI_BR &target ; P1_<BR>`. Not a P1 reg.
    add(Li(name='LI_BR', rD='br'))
    for rd in P1_REGS:
        add(La(name=f'LA_{rd.upper()}', rD=rd))

    # MOV — full register product, plus MOV rD, sp per destination.
    add(Banner('MOV — full register product (src may be sp)'))
    for rd in P1_REGS:
        for src in (*P1_REGS, 'sp'):
            tag = 'SP' if src == 'sp' else src.upper()
            add(Mov(name=f'MOV_{rd.upper()}_{tag}', rD=rd, rA=src))

    # RRR — curated triples only (the full cube would be 5.6k/arch).
    add(Banner('RRR — curated triples (explicit table in p1_gen.py)'))
    for op, d, a, b in RRR_TABLE:
        add(RRR(name=f'{op}_{d.upper()}_{a.upper()}_{b.upper()}',
                op=op, rD=d, rA=a, rB=b))

    # Immediate arith: ADDI over the full register product.
    add(Banner('ADDI — full register product × ADDI_IMMS'))
    for d, a, imm in product(P1_REGS, P1_REGS, ADDI_IMMS):
        add(AddI(name=f'ADDI_{d.upper()}_{a.upper()}_{_imm_suf(imm)}',
                 rD=d, rA=a, imm=imm))

    add(Banner('ANDI / ORI — full register product × LOGI_IMMS'))
    for op, d, a, imm in product(LOGI_OPS, P1_REGS, P1_REGS, LOGI_IMMS):
        add(LogI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
                 op=op, rD=d, rA=a, imm=imm))

    add(Banner('SHLI / SHRI / SARI — full register product × SHIFT_IMMS'))
    for op, d, a, imm in product(SHIFT_OPS, P1_REGS, P1_REGS, SHIFT_IMMS):
        add(ShiftI(name=f'{op}_{d.upper()}_{a.upper()}_{imm}',
                   op=op, rD=d, rA=a, imm=imm))

    add(Banner('LD / ST / LB / SB — full register product × MEM_OFFS'))
    for op, rt, rn, off in product(MEM_OPS, P1_REGS, P1_REGS, MEM_OFFS):
        add(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{_imm_suf(off)}',
                op=op, rT=rt, rN=rn, off=off))

    # Branches: unconditional B plus the conditional product.
    add(Banner('Branches — LI_BR-indirect pattern'))
    add(B(name='B'))
    for op, a, b in product(CONDB_OPS, P1_REGS, P1_REGS):
        add(CondB(name=f'{op}_{a.upper()}_{b.upper()}', op=op, rA=a, rB=b))

    # Control flow: call/ret and the 1..4-slot frame brackets.
    add(Banner('Control: CALL/RET + single-slot and N-slot PROLOGUE/EPILOGUE/TAIL'))
    add(Prologue(name='PROLOGUE', k=1))
    add(Epilogue(name='EPILOGUE', k=1))
    add(Ret(name='RET'))
    add(Call(name='CALL'))
    add(Tail(name='TAIL', k=1))
    for slots in (2, 3, 4):
        add(Prologue(name=f'PROLOGUE_N{slots}', k=slots))
        add(Epilogue(name=f'EPILOGUE_N{slots}', k=slots))
        add(Tail(name=f'TAIL_N{slots}', k=slots))

    # SYSCALL — one pre-encoded wrapper per arch.
    add(Banner('SYSCALL — uniform "clobbers r0 only" across arches'))
    add(Literal(name='SYSCALL', hex_by_arch=SYSCALL_HEX))

    # Syscall numbers as LE-32 immediates for LI.
    add(Banner('Linux syscall numbers (per-arch table). LE-32 operands for LI.'))
    for sys_name in ('SYS_WRITE', 'SYS_EXIT', 'SYS_READ', 'SYS_CLOSE', 'SYS_OPENAT',
                     'SYS_CLONE', 'SYS_EXECVE', 'SYS_WAITID'):
        add(Literal(name=sys_name,
                    hex_by_arch={a: le32(SYS_NUM[a][sys_name]) for a in ARCHES}))

    return defs


## ---------- File emission -----------------------------------------------

def emit(arch: str) -> str:
    """Render the complete p1_<arch>.M1 defs file as one string."""
    enc = ENCODERS[arch]
    lines = [HEADER.format(arch=arch).rstrip(), '']
    names_seen = set()
    for entry in rows():
        if isinstance(entry, Banner):
            dashes = '-' * max(0, 60 - len(entry.text))
            lines.append('')
            lines.append('## ---- ' + entry.text + ' ' + dashes)
            continue
        # SYS_* names are emitted bare; everything else gets the P1_ prefix.
        define = entry.name if entry.name.startswith('SYS_') else 'P1_' + entry.name
        if define in names_seen:
            raise RuntimeError(f'duplicate DEFINE: {define}')
        names_seen.add(define)
        lines.append(f'DEFINE {define} {entry.encode(enc)}')
    lines.append('')
    return '\n'.join(lines)


def main():
    """CLI entry: write build/<arch>/p1_<arch>.M1 for every arch, or with
    --check compare against what's on disk and exit 1 on any mismatch."""
    flags = sys.argv[1:]
    check_mode = '--check' in flags
    args = [a for a in flags if not a.startswith('--')]
    build_root = args[0] if args else 'build'

    mismatched = False
    for arch in ARCHES:
        out_dir = os.path.join(build_root, arch)
        out_path = os.path.join(out_dir, f'p1_{arch}.M1')
        text = emit(arch)
        if not check_mode:
            os.makedirs(out_dir, exist_ok=True)
            with open(out_path, 'w') as fh:
                fh.write(text)
            print(f'wrote {out_path} ({len(text)} bytes)')
            continue
        try:
            with open(out_path) as fh:
                on_disk = fh.read()
        except FileNotFoundError:
            on_disk = ''
        if on_disk != text:
            sys.stderr.write(f'DIFF: {out_path}\n')
            mismatched = True

    if check_mode and mismatched:
        sys.exit(1)


if __name__ == '__main__':
    main()