commit a84e4ca4303eb3dcad5cf34ca83c2f119019ef1e
parent b97e3d36e52f0da1c9260fb3f5c56dcc35e6ee4f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 22 Apr 2026 11:17:03 -0700
build: ~30x faster + 75% smaller lisp binary
Three orthogonal wins stacked into one build pass:
- Prune unused DEFINEs: awk filter keeps only the ~350 of 7,293
p1_<arch>.M1 DEFINEs that lisp.M1 actually references, cutting M0's
input ~50% and its O(macros × tokens) Set_Expression scan ~20x.
- tmpfs staging: M0/hex2-0 issue one read()/write() syscall per byte
(custom unbuffered fgetc/fputc, not libc). Going through the virtiofs
bind mount on Apple Silicon podman costs ~90s per 340KB stream; the
same stream through the container's own overlayfs /tmp costs ~2s. All
per-byte I/O now lives on /tmp; virtiofs only sees bulk cp traffic.
- BSS via the ELF header: new src/elf/ELF-<arch>.hex2 declares
p_memsz = :ELF_bss_end - :ELF_base (kernel-zero-filled) while
p_filesz stays at :ELF_end. lisp.M1 splits accordingly: :ELF_end
lands before the ZERO32-heavy BSS region, :ELF_bss_end sits at the
very end. :mark_stack_next drops its &mark_stack init (gc_mark_all
re-seeds it every pass). A post-link truncate reads p_filesz from
byte offset 0x60 and drops the trailing zero bytes hex2 had to emit
to advance IP. The stock stage0-posix ELF-*.hex2 files under
build/upstream/ stay untouched because bootstrap.sh still uses them
to link M0 itself (no BSS there).
Measured (aarch64, clean build): ~124s -> ~5s; binary 159,293 -> 38,541
bytes. All 28 lisp tests pass on aarch64/amd64/riscv64.
Diffstat:
5 files changed, 234 insertions(+), 13 deletions(-)
diff --git a/Makefile b/Makefile
@@ -152,31 +152,70 @@ $(UPSTREAM_DIR)/%: $(UPSTREAM_STAMP) ;
$(TOOLS_DIR)/M0 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm $(TOOLS_DIR)/hex0 $(TOOLS_DIR)/hex1 &: bootstrap.sh $(UPSTREAM_STAMP) | $(TOOLS_DIR) $(IMAGE_STAMP)
$(PODMAN) sh bootstrap.sh $(ARCH) /work/$(TOOLS_DIR)
-# Assemble: lint first, then combine per-arch defs + program and feed to M0.
+# Assemble: lint, prune p1 DEFINEs to only those referenced by the program,
+# then combine + feed to M0.
#
# Lint catches P1_*/SYS_* tokens with no matching DEFINE — M1 otherwise
# silently emits the literal token text and produces a SIGILL-on-run
# binary. Runs on the host (plain POSIX sh); no podman dependency.
#
-# M0 takes a single positional input (no -f flag), so we catm the two
-# sources together first. The intermediate .combined.M1 is kept in OUT_DIR
-# so it gets cleaned along with everything else.
+# p1_$(ARCH).M1 enumerates every (op, reg-tuple) encoding — ~7,300 DEFINEs,
+# of which only ~350 are referenced by a typical PROG_SRC. Dropping the dead
+# ones cuts M0's input bytes ~50% and its per-macro O(N) Set_Expression scan
+# ~20x. Safe because p1 DEFINE bodies are pure hex with no cross-DEFINE
+# references, so the used set is just the tokens literally in PROG_SRC.
+# The awk pass records every token of PROG_SRC (NR==FNR), then prints a p1
+# DEFINE only if its name (field 2) was recorded; other lines pass through.
+#
+# M0/hex2-0 use unbuffered fgetc/fputc: one read()/write() syscall per byte.
+# Through the virtiofs bind mount that costs microseconds per syscall on
+# Apple Silicon podman (~90s for a 340KB stream); the container's own
+# overlayfs /tmp collapses it to ~2s. So we cp inputs to /tmp, run the tools
+# /tmp → /tmp, and cp outputs back — virtiofs only sees bulk cp traffic.
+#
+# M0 takes a single positional input (no -f flag), so we catm the pruned p1
+# with PROG_SRC first. Intermediates stay in OUT_DIR so clean picks them up.
$(OUT_DIR)/$(PROG).hex2: $(PROG_SRC) $(OUT_DIR)/p1_$(ARCH).M1 lint.sh $(TOOLS_DIR)/M0 $(TOOLS_DIR)/catm | $(OUT_DIR) $(IMAGE_STAMP)
./lint.sh $(OUT_DIR)/p1_$(ARCH).M1 $(PROG_SRC)
+ awk 'NR==FNR{for(i=1;i<=NF;i++)u[$$i]=1;next} /^DEFINE /{if($$2 in u)print;next} {print}' \
+ $(PROG_SRC) $(OUT_DIR)/p1_$(ARCH).M1 > $(OUT_DIR)/p1_$(ARCH).pruned.M1
$(PODMAN) sh -ec ' \
- $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/p1_$(ARCH).M1 $(PROG_SRC) ; \
- $(TOOLS_DIR)/M0 $(OUT_DIR)/$(PROG).combined.M1 $(OUT_DIR)/$(PROG).hex2'
-
-# Link: prepend the ELF header and feed to hex2-0.
+ cp $(OUT_DIR)/p1_$(ARCH).pruned.M1 /tmp/p1.M1 ; \
+ cp $(PROG_SRC) /tmp/prog.M1 ; \
+ $(TOOLS_DIR)/catm /tmp/combined.M1 /tmp/p1.M1 /tmp/prog.M1 ; \
+ $(TOOLS_DIR)/M0 /tmp/combined.M1 /tmp/$(PROG).hex2 ; \
+ cp /tmp/combined.M1 $(OUT_DIR)/$(PROG).combined.M1 ; \
+ cp /tmp/$(PROG).hex2 $(OUT_DIR)/$(PROG).hex2'
+
+# Link: prepend the BSS-enabled ELF header and feed to hex2-0, then truncate
+# the on-disk file to p_filesz bytes. The kernel zero-fills the gap up to
+# p_memsz at load time. See src/elf/ELF-aarch64.hex2 for the scheme.
#
# hex2-0 is strictly positional too, so again catm first. hex2-0 hardcodes
# base address 0x00600000 (no --base-address flag), which is why the ELF
# header references `&ELF_base` symbolically rather than baking in a
# concrete VA — the header travels to whatever base the linker chose.
-$(OUT_DIR)/$(PROG): $(OUT_DIR)/$(PROG).hex2 $(UPSTREAM_DIR)/$(ARCH_DIR)/ELF-$(ARCH).hex2 $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm | $(IMAGE_STAMP)
+#
+# Truncate: p_filesz lives at file offset 0x60 (96): 0x20 into the first
+# program header (which directly follows the 64-byte ELF64 header), right
+# after p_paddr. od prints its low four bytes as an unsigned decimal (the
+# high four are zero for any sane size; od's byte order is the machine's,
+# assumed little-endian), then head -c keeps that many bytes of the raw
+# hex2-0 output — everything past p_filesz is zeros from the trailing BSS.
+# All I/O stages through /tmp for the same unbuffered-fputc reason as M0.
+ELF_HDR_SRC := src/elf/ELF-$(ARCH).hex2
+
+$(OUT_DIR)/$(PROG): $(OUT_DIR)/$(PROG).hex2 $(ELF_HDR_SRC) $(TOOLS_DIR)/hex2-0 $(TOOLS_DIR)/catm | $(IMAGE_STAMP)
$(PODMAN) sh -ec ' \
- $(TOOLS_DIR)/catm $(OUT_DIR)/$(PROG).linked.hex2 $(UPSTREAM_DIR)/$(ARCH_DIR)/ELF-$(ARCH).hex2 $(OUT_DIR)/$(PROG).hex2 ; \
- $(TOOLS_DIR)/hex2-0 $(OUT_DIR)/$(PROG).linked.hex2 $(OUT_DIR)/$(PROG)'
+ cp $(ELF_HDR_SRC) /tmp/elf.hex2 ; \
+ cp $(OUT_DIR)/$(PROG).hex2 /tmp/prog.hex2 ; \
+ $(TOOLS_DIR)/catm /tmp/linked.hex2 /tmp/elf.hex2 /tmp/prog.hex2 ; \
+ $(TOOLS_DIR)/hex2-0 /tmp/linked.hex2 /tmp/$(PROG).raw ; \
+ size=$$(od -An -tu4 -N4 -j96 /tmp/$(PROG).raw | tr -d " ") ; \
+ head -c $$size /tmp/$(PROG).raw > /tmp/$(PROG) ; \
+ chmod 0700 /tmp/$(PROG) ; \
+ cp /tmp/linked.hex2 $(OUT_DIR)/$(PROG).linked.hex2 ; \
+ cp /tmp/$(PROG) $(OUT_DIR)/$(PROG)'
# $(RUN_ARGS) is listed as a prerequisite so that lisp's concat'd fixture
# is materialised before the interpreter runs. Empty for hello/demo and
diff --git a/src/elf/ELF-aarch64.hex2 b/src/elf/ELF-aarch64.hex2
@@ -0,0 +1,63 @@
+## ELF-aarch64.hex2 — BSS-enabled AArch64 ELF header for lispcc program link.
+##
+## Derived from stage0-posix's ELF-aarch64.hex2 (same single-LOAD-segment
+## layout) with one change: p_memsz references :ELF_bss_end while p_filesz
+## still references :ELF_end. The on-disk file ends at :ELF_end (after a
+## post-link truncate drops the trailing zero bytes hex2 emits); the
+## kernel's loader zero-fills the gap up to :ELF_bss_end at runtime.
+##
+## The stock stage0-posix ELF-*.hex2 files under build/upstream/ stay
+## untouched (no :ELF_bss_end label) because bootstrap.sh uses them to
+## link the M0 binary itself — a program that has no BSS region to speak
+## of and doesn't define :ELF_bss_end.
+##
+## If you use this header, your program must define BOTH :ELF_end
+## (end of real bytes) and :ELF_bss_end (end of zero-fill region).
+
+## ELF Header
+
+:ELF_base
+7F 45 4C 46 # e_ident[EI_MAG0-3] ELF's magic number
+
+02 # e_ident[EI_CLASS] Indicating 64 bit
+01 # e_ident[EI_DATA] Indicating little endianness
+01 # e_ident[EI_VERSION] Indicating original elf
+
+03 # e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
+00 # e_ident[EI_ABIVERSION] Set at 0 because no one cares
+
+00 00 00 00 00 00 00 # e_ident[EI_PAD]
+02 00 # e_type Indicating Executable
+B7 00 # e_machine Indicating AArch64
+01 00 00 00 # e_version Indicating original elf
+
+&_start 00 00 00 00 # e_entry Address of the entry point
+%ELF_program_headers>ELF_base 00 00 00 00 # e_phoff File offset of program header table
+00 00 00 00 00 00 00 00 # e_shoff File offset of section header table (none)
+
+00 00 00 00 # e_flags
+40 00 # e_ehsize Indicating our 64 Byte header
+
+38 00 # e_phentsize size of a program header table
+01 00 # e_phnum number of entries in program table
+
+00 00 # e_shentsize size of a section header table
+00 00 # e_shnum number of entries in section table
+
+00 00 # e_shstrndx index of the section names
+
+## Program Header
+:ELF_program_headers
+01 00 00 00 # ph_type: PT-LOAD = 1
+07 00 00 00 # ph_flags: PF-X|PF-W|PF-R = 7
+00 00 00 00 00 00 00 00 # ph_offset
+
+&ELF_base 00 00 00 00 # ph_vaddr
+&ELF_base 00 00 00 00 # ph_physaddr
+
+%ELF_end>ELF_base 00 00 00 00 # ph_filesz (end of on-disk image)
+%ELF_bss_end>ELF_base 00 00 00 00 # ph_memsz (kernel zero-fills past ph_filesz)
+
+01 00 00 00 00 00 00 00 # ph_align
+
+:ELF_text
diff --git a/src/elf/ELF-amd64.hex2 b/src/elf/ELF-amd64.hex2
@@ -0,0 +1,51 @@
+## ELF-amd64.hex2 — BSS-enabled AMD64 ELF header for lispcc program link.
+## See src/elf/ELF-aarch64.hex2 for why this lives here rather than
+## under build/upstream/ (bootstrap.sh needs the stock version to link M0).
+
+## ELF Header
+
+:ELF_base
+7F 45 4C 46 ## e_ident[EI_MAG0-3] ELF's magic number
+
+02 ## e_ident[EI_CLASS] Indicating 64 bit
+01 ## e_ident[EI_DATA] Indicating little endianness
+01 ## e_ident[EI_VERSION] Indicating original elf
+
+03 ## e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
+00 ## e_ident[EI_ABIVERSION] Set at 0 because no one cares
+
+00 00 00 00 00 00 00 ## e_ident[EI_PAD]
+02 00 ## e_type Indicating Executable
+3E 00 ## e_machine Indicating AMD64
+01 00 00 00 ## e_version Indicating original elf
+
+&_start 00 00 00 00 ## e_entry Address of the entry point
+%ELF_program_headers>ELF_base 00 00 00 00 ## e_phoff File offset of program header table
+00 00 00 00 00 00 00 00 ## e_shoff File offset of section header table (none)
+
+00 00 00 00 ## e_flags
+40 00 ## e_ehsize Indicating our 64 Byte header
+
+38 00 ## e_phentsize size of a program header table
+01 00 ## e_phnum number of entries in program table
+
+00 00 ## e_shentsize size of a section header table
+00 00 ## e_shnum number of entries in section table
+
+00 00 ## e_shstrndx index of the section names
+
+## Program Header
+:ELF_program_headers
+01 00 00 00 ## p_type: PT_LOAD = 1
+07 00 00 00 ## p_flags: PF_X|PF_W|PF_R = 7
+00 00 00 00 00 00 00 00 ## p_offset
+
+&ELF_base 00 00 00 00 ## p_vaddr
+&ELF_base 00 00 00 00 ## p_paddr
+
+%ELF_end>ELF_base 00 00 00 00 ## p_filesz (end of on-disk image)
+%ELF_bss_end>ELF_base 00 00 00 00 ## p_memsz (kernel zero-fills past p_filesz)
+
+01 00 00 00 00 00 00 00 ## p_align Required alignment
+
+:ELF_text
diff --git a/src/elf/ELF-riscv64.hex2 b/src/elf/ELF-riscv64.hex2
@@ -0,0 +1,51 @@
+## ELF-riscv64.hex2 — BSS-enabled RISC-V 64 ELF header for lispcc program link.
+## See src/elf/ELF-aarch64.hex2 for why this lives here rather than
+## under build/upstream/ (bootstrap.sh needs the stock version to link M0).
+
+## ELF Header
+
+:ELF_base
+7F 45 4C 46 ## e_ident[EI_MAG0-3] ELF's magic number
+
+02 ## e_ident[EI_CLASS] Indicating 64 bit
+01 ## e_ident[EI_DATA] Indicating little endianness
+01 ## e_ident[EI_VERSION] Indicating original elf
+
+03 ## e_ident[EI_OSABI] Set at 3 because FreeBSD is strict
+00 ## e_ident[EI_ABIVERSION] Set at 0 because no one cares
+
+00 00 00 00 00 00 00 ## e_ident[EI_PAD]
+02 00 ## e_type Indicating Executable
+F3 00 ## e_machine Indicating RISC-V
+01 00 00 00 ## e_version Indicating original elf
+
+&_start 00 00 00 00 ## e_entry Address of the entry point
+%ELF_program_headers>ELF_base 00 00 00 00 ## e_phoff File offset of program header table
+00 00 00 00 00 00 00 00 ## e_shoff File offset of section header table (none)
+
+00 00 00 00 ## e_flags
+40 00 ## e_ehsize Indicating our 64 Byte header
+
+38 00 ## e_phentsize size of a program header table
+01 00 ## e_phnum number of entries in program table
+
+00 00 ## e_shentsize size of a section header table
+00 00 ## e_shnum number of entries in section table
+
+00 00 ## e_shstrndx index of the section names
+
+## Program Header
+:ELF_program_headers
+01 00 00 00 ## p_type: PT_LOAD = 1
+07 00 00 00 ## p_flags: PF_X|PF_W|PF_R = 7
+00 00 00 00 00 00 00 00 ## p_offset
+
+&ELF_base 00 00 00 00 ## p_vaddr
+&ELF_base 00 00 00 00 ## p_paddr
+
+%ELF_end>ELF_base 00 00 00 00 ## p_filesz (end of on-disk image)
+%ELF_bss_end>ELF_base 00 00 00 00 ## p_memsz (kernel zero-fills past p_filesz)
+
+01 00 00 00 00 00 00 00 ## p_align Required alignment
+
+:ELF_text
diff --git a/src/lisp.M1 b/src/lisp.M1
@@ -6528,6 +6528,20 @@ DEFINE ZERO32 '0000000000000000000000000000000000000000000000000000000000000000'
## otherwise).
:global_env_cell NIL %0
+
+## --------------------------------------------------------------------
+## :ELF_end marks the end of the initialised image — everything above
+## is code and data with real bytes, everything below is BSS: labels
+## only, and any non-zero initialiser placed there is silently lost.
+## The ELF header declares p_filesz ending here and p_memsz ending at
+## :ELF_bss_end, so the kernel zero-fills the tail at load time and the
+## on-disk file stops at :ELF_end (a post-link truncate drops the
+## trailing zero bytes). BSS labels still need `%0`/`ZERO32`
+## placeholders so hex2's IP keeps advancing and later label addresses
+## stay correct; those bytes are truncated away and replaced by zeros.
+## --------------------------------------------------------------------
+:ELF_end
+
:stack_bottom_fp %0 %0
:gc_root_fp %0 %0
@@ -6556,7 +6570,10 @@ DEFINE ZERO32 '0000000000000000000000000000000000000000000000000000000000000000'
:free_list_obj96 %0 %0
:free_list_obj128 %0 %0
-:mark_stack_next &mark_stack %0
+## mark_stack_next lives in BSS: gc_mark_all re-seeds it to &mark_stack
+## at the top of every mark pass (see :gc_mark_all), so the initial
+## value is dead — zero is fine.
+:mark_stack_next %0 %0
:pair_mark_bitmap
ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32
@@ -7099,4 +7116,4 @@ ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32
ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32 ZERO32
:src_buf_end
-:ELF_end
+:ELF_bss_end