commit 224a62c7d3f1c05dcdeb1fe9c621a87067c8ce91
parent 58e924d7db1574070253c6e9eb60689d0060f5c2
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 20 Apr 2026 20:31:19 -0700
lisp.M1 step 2 (tagged values); P1 Tranche 8; add P1_TODO.md
Implements LISP.md step 2: fixnum/pair/singleton tags, cons/car/cdr,
with a demo that hand-builds cons(fixnum(42), nil) and exits with the
decoded car. Portable across aarch64/amd64/riscv64.
Boot-aligns heap_next with (x|7)+1 since heap_start's offset from the
ELF base isn't 8-aligned on every arch. Tranche 8 adds the ops needed
for this plus tag manipulation, pair loads/stores, and r7-indirect BNE
variants.
P1_TODO.md captures 5 follow-ups surfaced by the step — undefined-token
lint, alignment doc, DEFINE generator, N-slot PROLOGUE, r7-pattern doc
— all to land before step 3.
Diffstat:
| A | P1_TODO.md | | | 251 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | lisp.M1 | | | 316 | ++++++++++++++++++++++++++++++++++++++++++------------------------------------- |
| M | p1_aarch64.M1 | | | 36 | ++++++++++++++++++++++++++++++++++++ |
| M | p1_amd64.M1 | | | 27 | +++++++++++++++++++++++++++ |
| M | p1_riscv64.M1 | | | 27 | +++++++++++++++++++++++++++ |
5 files changed, 509 insertions(+), 148 deletions(-)
diff --git a/P1_TODO.md b/P1_TODO.md
@@ -0,0 +1,251 @@
+# P1 — follow-up work from `lisp.M1` step 2
+
+Issues surfaced while bringing up step 2 (tagged values: cons/car/cdr),
+with the decision recorded for each. This is a work list, not a design
+doc; `P1.md` is the source of truth for the ISA itself.
+
+Two of these (issue 1, issue 2) were caught the hard way during step 2:
+the first build passed on aarch64 and failed on amd64/riscv64 because
+`heap_start` happened to land at different offsets from the ELF base on
+each arch, and the second build SIGILL'd because a `P1_LD_R0_R4_0` macro
+was missing from the defs files but M1 silently passed the undefined
+token through as literal text. Both are preventable upstream.
+
+## 1. Undefined `P1_*` tokens pass silently through M1
+
+**Symptom.** In step 2, `lisp.M1` used `P1_LD_R0_R4_0` (offset 0) but the
+defs files only had `P1_LD_R0_R4_8` (offset 8). M1 emitted the literal
+string `P1_LD_R0_R4_0` into the output stream rather than expanding it.
+The resulting binary SIGILL'd on aarch64 and segfaulted on amd64. No
+diagnostic from M1, hex2, or the loader — it took `xxd` + grep to find.
+
+This will keep biting us as we add more `(op, register-tuple)` entries
+to the defs: the combinatorial surface (~1200 DEFINEs per arch) is
+exactly where a missing tuple is easy to miss and hard to notice.
+
+**Options considered.**
+
+1. **Makefile lint.** Before invoking M1, grep the combined source for
+ `P1_[A-Z0-9_]*` tokens and assert every one has a matching `DEFINE`
+ in the per-arch defs file. Fails the build with the offending token
+ name. ~10 lines of shell, runs in milliseconds.
+2. **Patch M1** to error on undefined tokens. Correct fix, but M1 is
+ upstream stage0 — we don't want to fork it.
+3. **Exhaustive DEFINE table.** Generate every `(op, rD, rA, rB)` tuple
+ so no lookup can miss. Waits on issue 3 (generator).
+
+**Decision — option 1: Makefile lint.** Add a `lint` target to
+`lispcc/Makefile` that runs before each arch's assemble step. Keeps the
+toolchain unchanged; catches the bug at the exact layer where it's
+introduced (M1 input); works today with no generator. Option 3 also
+helps but doesn't cover typos (`P1_LD_R0_R4_0` where the author meant
+`P1_LD_R0_R4_8` would still silently miss if both existed and the wrong
+one was spelled).
+
+## 2. Data alignment is implicit and arch-dependent
+
+**Symptom.** `heap_start` is a label into a zero-padded byte blob.
+Nothing in hex2 or M1 forces the label's file offset (and therefore its
+runtime address under the fixed ELF base) to be 8-byte aligned. Because
+the code size between the ELF header and `heap_start` differs per arch,
+the three targets landed on different alignments:
+
+- aarch64: `0x00600560` → low 3 bits = `000` (aligned)
+- amd64: `0x006002CB` → low 3 bits = `011` (misaligned)
+- riscv64: `0x00600604` → low 3 bits = `100` (misaligned)
+
+Pair pointers carry a 3-bit tag in the low bits, so the allocator MUST
+return pointers with `ptr & 7 == 0`. If `heap_start` itself is
+misaligned, every tagged pair is corrupt.
+
+**Options considered.**
+
+1. **Application-level: align at boot + make alignment an allocator
+ contract.** `_start` rounds `heap_next` up with `(x | 7) + 1` before
+ any allocation. `alloc()` is documented to require 8-aligned sizes
+ and to return 8-aligned pointers. Document both in `P1.md`. This is
+ what step 2 already does.
+2. **hex2-level alignment directive.** Add something like `.align 8` to
+ hex2 so labels can be padded at assembly time.
+
+Investigated hex2 capabilities (both the High Level Prototype and the
+actual bootstrap AMD64 `hex2_AMD64.M1` in stage0-posix). hex2 supports
+only the pointer sigils `: ! @ $ ~ % &` and the `>` base override — no
+alignment directive, no label arithmetic. Option 2 would require forking
+hex2, which is off the table for the same reason we don't fork M1.
+
+**Decision — option 1: doc + allocator contract.** Add a §"Data
+alignment" subsection to `P1.md`:
+
+- Labels have no inherent alignment. The offset of any label from the
+ ELF base is the cumulative code/data size before it, which varies per
+ arch.
+- Programs that care about alignment (tagged pointers, cache lines,
+ atomic fields — the first being immediate for us) must align at boot
+ with `(x | mask) + 1`.
+- Any allocator that returns tagged-pointer-eligible cells MUST return
+ pointers aligned to at least the tag width (8 bytes, low-3-bit tag).
+- Caller must pass tag-width-aligned sizes to bump allocators for now.
+ When the real allocator lands in step 9, it rounds internally.
+
+## 3. The ~1200-entry DEFINE generator is now a blocker
+
+**Symptom.** Step 2 added ~13 new `(op, register-tuple)` entries per
+arch by hand. Step 3 (strings + symbol interning) and step 4 (reader)
+will need many more — load/store variants across r0–r7 × base-register
+× common offsets, more arithmetic tuples, more branches. Hand-writing
+each tuple × 3 arches is already where most of step 2's wall-clock
+went.
+
+**Options considered.**
+
+1. **Build generator before step 3.** Design a shared op table
+ (`op, rD, rA, rB` plus a template for each arch's native encoding),
+ emit `p1_<arch>.M1` from it. Pause Lisp work until done.
+2. **Defer until step 4 or 5.** Hand-add entries as each step needs
+ them; accept the per-step friction.
+3. **Lazy defs.** Only enumerate tuples actually referenced by the
+ program, via a source-scan + per-arch hand-encoder. More complex
+ than full generation.
+
+**Decision — option 1: build before step 3.** Deferring pushes the
+problem into a worse spot (each new step adds friction, and the defs
+files diverge across arches in hard-to-review ways). Design the op
+table to be appendable — tranches 1–8 already map cleanly onto rows of
+`(op, rD, rA, rB)` plus per-arch encoding templates. The generator
+unblocks step 3 and is a prerequisite for the ~1200-entry target in
+`P1.md`.
+
+This adds one item to the staged plan between steps 2 and 3.
+
+## 4. Root spill across `CALL` needs more than one scratch slot
+
+**Symptom.** `cons(car, cdr)` must keep both args live across the
+`alloc(16)` call. The P1 ABI gives each frame exactly one callee-private
+scratch cell at `[sp+8]`, which fits one value. Step 2 worked around
+this by parking `cdr` in a BSS cell (`:cons_save_cdr`), a single-threaded
+hack that breaks under the step-9 GC (which needs every live tagged
+value to be on the stack so the mark phase can find it).
+
+The same pattern will recur everywhere: anything multi-arg that calls
+into the allocator (pair constructors, list builders, environment frame
+builders, closure constructors) needs per-function root-spill slots
+proportional to its live set, not a fixed one.
+
+**Options considered.**
+
+1. **Expand `PROLOGUE` frame + per-function spill-slot count.** Bump
+ standard frame from 16 B to N B, where each function declares how
+ many callee-private cells it needs at `[sp+8]`, `[sp+16]`,
+ `[sp+24]`, etc. `PROLOGUE`/`EPILOGUE` take the slot count as an
+ immediate. Update all three arch defs files.
+2. **Keep one scratch slot, use BSS for overflow.** What step 2 does.
+ Breaks under GC, doesn't scale past single-threaded.
+3. **Caller-managed save area.** Caller subtracts from `sp` itself,
+ spills, calls, restores. Works but breaks the "`sp` only mutated by
+ `PROLOGUE`/`EPILOGUE`" invariant from `P1.md`.
+
+**Decision — option 1: expand PROLOGUE frame + spill-slot count.**
+Aligns with GC discipline (all roots on stack, walkable), keeps `sp`
+mutation localized to PROLOGUE/EPILOGUE, and scales with live-set size.
+Specifics to pin down when implementing:
+
+- Encoding of the slot count. Probably a suffix on the op name
+ (`PROLOGUE_N2`, `PROLOGUE_N3`, …) so each variant is a distinct
+ `DEFINE` and the generator can enumerate them the same way it
+ enumerates register tuples. Cap at some N (4? 8?) and grow if a
+ function ever needs more.
+- Frame size must stay 16-aligned on aarch64 — round up.
+- The existing single-slot `PROLOGUE`/`EPILOGUE` stay as aliases for
+ the N=1 case.
+
+Replace `cons_save_cdr` in `lisp.M1` with a second frame slot once the
+N-slot prologue lands.
+
+## 5. r7-indirect branch pattern is verbose but fine
+
+**Symptom.** Every branch or call compiles through three source lines:
+
+```
+P1_LI_R7
+&target
+P1_BNE_R1_R2_R7
+```
+
+`lisp.M1` has 8 such sites and will grow roughly linearly with the
+interpreter's control flow. Readers unfamiliar with P1 don't immediately
+see this is "branch if r1 != r2 to `target`."
+
+A conditional in a high-level language:
+
+```
+if x == 0 goto fail;
+```
+
+lands in P1 as:
+
+```
+P1_LI_R7
+&fail
+P1_BEQ_R1_R0_R7 ## (after loading 0 into r0)
+```
+
+**Options considered.**
+
+1. **Document the pattern only.** Add a §"Reading P1 source" subsection
+ to `P1.md` that explains: every branch is
+ `LI_R7 &target ; BXX_rA_rB_R7`; r7 is branch-scratch owned by this
+ pattern; target load + branch are always together. Once seen, it
+ reads fluently.
+2. **Introduce `*_TO` pseudos.** E.g. `P1_BNE_R1_R2_TO &target` expands
+ to the 3-op sequence. Cosmetic — saves one source line per site —
+ but it ripples through the DEFINE table (each pseudo needs per-arch
+ encoding that emits a forward-jump-around-a-literal-pool or similar)
+ and hides the r7 clobber, which is load-bearing.
+3. **Change the ISA to have a 32-bit branch-target field.** A real
+ native-encoded conditional branch would save the literal and the
+ `LI_R7`. Breaks hex2's "no label arithmetic" contract (branch
+ offsets are label_A - label_B), and hex2 is not forkable.
+
+**Decision — option 1: document the pattern only.** `*_TO` pseudos
+would require the generator to emit paired (branch, literal-pool-entry)
+per call site, and hiding the r7 clobber invites bugs when someone
+assumes r7 survives a branch. The pattern is learnable; the workaround
+for hex2's missing branch-offset support is exactly what `P1.md` calls
+out ("unbounded range within 4 GiB at the cost of one r7 clobber per
+branch"). Documenting is sufficient.
+
+Add §"Reading P1 source" under the existing §"Register mapping" in
+`P1.md`: one paragraph on the pattern, one on the r7 clobber rule (some
+of this already lives in the "r7 is not a general-purpose callee-saved
+register" paragraph, which should be cross-referenced).
+
+## Ordering
+
+All five land before step 3. Reasons:
+
+- **Issue 4 blocks step 3 directly.** Strings and symbol interning are
+ allocator-calling constructors — same shape as `cons`, more live
+ values per frame. Deferring issue 4 means reproducing the
+ `cons_save_cdr` BSS hack across every step-3 constructor, then
+ ripping them all out for step 9 (GC). Cheaper to land N-slot
+ PROLOGUE once.
+- **Issue 4 must land with issue 3, not after.** If the generator
+ emits the defs table first and N-slot PROLOGUE lands later, the
+ generator runs twice and the defs files churn. Do them together.
+- **Issues 1, 2, 5 are cheap and compounding.** Makefile lint (1)
+ prevents the exact class of bug that ate an hour in step 2 — every
+ step 3 tuple we add is another chance to mistype. Alignment doc (2)
+ and r7-pattern doc (5) are one sitting each and make step 3 review
+ easier for anyone but the author.
+
+Suggested order within the pre-step-3 block:
+
+1. Issue 1 (Makefile lint) — ~10 lines of shell, unblocks confident
+ iteration on the rest.
+2. Issue 5 (`P1.md` §"Reading P1 source") and issue 2
+ (`P1.md` §"Data alignment") — doc-only, batch into one commit.
+3. Issue 3 (generator) + issue 4 (N-slot PROLOGUE) — together.
+ Design the op table so PROLOGUE/EPILOGUE variants are rows like any
+ other `(op, reg-tuple)`. Rip out `cons_save_cdr` from `lisp.M1`
+ as the validation case.
diff --git a/lisp.M1 b/lisp.M1
@@ -1,35 +1,32 @@
## lisp.M1 — Seed Lisp interpreter (portable across aarch64/amd64/riscv64)
##
-## Step 1: proof of life. Establishes the runtime skeleton documented
-## in LISP.md §"Staged implementation plan" item 1:
+## Step 2: tagged values. Implements LISP.md §"Staged implementation plan"
+## item 2 — fixnum/pair/singleton encoding with cons/car/cdr on top of
+## the step-1 heap. Demo hand-builds cons(fixnum(42), nil), verifies the
+## pair tag, verifies cdr==nil, extracts car, verifies the fixnum tag,
+## and exits with the decoded value (42) as the process status.
##
-## - _start + argv parsing (argc at [sp+0], argv at [sp+8])
-## - syscall wrappers: open, read, close, write, exit
-## - error path: "error: <msg>\n" to fd 2, exit 1
-## - BSS layout: heap_next / heap_end pointer cells + a heap arena
-## - bump allocator (minimal: caller passes 8-byte-aligned size)
+## Tag table (LISP.md §"Tagged cells"):
+## 001 = fixnum (value = n<<3 | 1; decode via arithmetic >> 3)
+## 010 = pair (ptr | 2, 16-byte [car][cdr] block on the heap)
+## 111 = singleton (nil=0x07, #t=0x0F, #f=0x17, eof=0x1F, unspec=0x27)
##
-## Proof: opens argv[1], reads up to 256 bytes into a freshly-allocated
-## buffer, closes, exits with buf[0] as the status byte. argc errors,
-## open errors, and short reads each take the error path (fd 2, exit 1).
-##
-## Later steps build tagged cells (§2), strings + symbol interning (§3),
-## reader (§4), printer (§5), eval (§6), … on top. This file should
-## stay tight — LISP.md §"Settled decisions" item 7 targets a single
-## lisp.M1 source up to ~6k LOC.
-
-## ---- heap-state cells ------------------------------------------------
-## Placed at the very start of ELF_text so their file-offset alignment is
+## Observable success: `echo $?` prints 42. Tag-check failures fall
+## through to the `error` path (writes to fd 2, exits 1). Step 1's
+## file-I/O plumbing (`run_file`, open/read/close wrappers, usage/
+## open/read/internal error pads) is removed — it will come back with
+## the Reader in step 4.
+
+## ---- heap-state + cons scratch cell ----------------------------------
+## Placed at the very start of ELF_text so file-offset alignment is
## predictable. ELF header (64 B) + 1 program header (56 B) = 120 B, so
## :heap_next lands at offset 120 from ELF_base — 8-byte aligned, which
-## keeps aarch64's LDR-X64 happy. Both cells are 4-byte hex2 address
-## labels zero-padded to 8 bytes (ELF base < 4 GiB, so the zero-extension
-## matches the 64-bit load).
+## keeps aarch64's LDR-X64 happy. Both heap cells are 4-byte hex2
+## addresses zero-padded to 8 bytes (ELF base < 4 GiB).
##
## The zero pad is spelled as a quoted hex literal rather than bare
## `00 00 00 00` tokens: stage0-posix M0 on riscv64 rejects bare
-## hex-byte tokens whose first nibble is '0'–'9' (rc=1, no diagnostic),
-## while amd64/aarch64 M0 accept them. `'XXXXXXXX'` is portable.
+## hex-byte tokens whose first nibble is '0'–'9' (rc=1, no diagnostic).
:heap_next
&heap_start
'00000000'
@@ -37,123 +34,158 @@
&heap_tail
'00000000'
+## cons needs both incoming args (car, cdr) to survive an alloc() call.
+## The PROLOGUE scratch slot [sp+8] holds one, so park the other here.
+## Safe because the step-2 interpreter is single-threaded and cons is
+## not reentrant. Step 9 (mark-sweep GC) replaces this with the real
+## root-spill discipline.
+:cons_save_cdr
+'0000000000000000'
+
## ---- _start ----------------------------------------------------------
-## Linux process entry. argc is at [sp+0], argv[0] at [sp+8], argv[1] at
-## [sp+16], ... _start is not P1-callable (no P1_PROLOGUE, no RET); the
-## only exit is through the SYS_EXIT syscall inside run_file or error.
+## Linux process entry. Builds the demo pair and exits with the decoded
+## fixnum car as status. Not P1-callable (no PROLOGUE, no RET) — every
+## exit path goes through SYS_EXIT (success) or `error` (failure).
:_start
- P1_MOV_R6_SP ## r6 = sp (argv base, survives syscalls)
+ ## Align heap_next up to the next 8-byte boundary. heap_start's
+ ## offset from ELF_base is code-size-dependent and not aligned on
+ ## every arch (amd64/riscv64 fall on non-8-aligned offsets). Pair
+ ## pointers need low-3-bits = 000 so that ORing in PAIR_TAG (010)
+    ## produces a clean tag. `(x | 7) + 1` advances to the next 8-byte
+    ## boundary — note it moves an already-aligned pointer forward by a
+    ## full 8 (unlike `(x+7)&~7`); a harmless one-time waste at boot.
+ P1_LI_R4
+ &heap_next
+ P1_LD_R0_R4_0
+ P1_ORI_R0_R0_7
+ P1_ADDI_R0_R0_1
+ P1_MOV_R2_R0
+ P1_ST_R2_R4_0
- ## argc < 2 -> usage error
- P1_LD_R0_R6_0 ## r0 = argc
+ ## Build fixnum(42) in r1: (42 << 3) | 1 = 0x151.
+ P1_LI_R1
+ '2A000000' ## r1 = 42
P1_LI_R2
- '02000000' ## r2 = 2
- P1_LI_R7
- &usage_err
- P1_BLT_R0_R2_R7
+ '03000000' ## r2 = 3 (shift amount for fixnum tag)
+ P1_SHL_R1_R1_R2 ## r1 = 42 << 3
+ P1_ORI_R1_R1_1 ## r1 |= 1 (fixnum tag)
- ## r1 = argv[1] (path to source file)
- P1_LD_R1_R6_16
-
- ## run_file(path). Never returns on the happy path.
+ ## cons(r1=fixnum(42), r2=nil=0x07).
+ P1_LI_R2
+ '07000000' ## r2 = nil
P1_LI_R7
- &run_file
- P1_CALL
+ &cons
+ P1_CALL ## r0 = pair pointer|2
+ P1_MOV_R6_R0 ## r6 = pair (callee-saved across calls)
- ## Defence-in-depth: run_file should have exited via SYS_EXIT. If
- ## control flows back here something is badly wrong — jump to the
- ## error path.
+ ## Check (pair & 7) == 2.
+ P1_MOV_R1_R6
+ P1_ANDI_R1_R1_7
+ P1_LI_R2
+ '02000000'
P1_LI_R7
- &internal_err
- P1_B
+        &not_pair_err
+ P1_BNE_R1_R2_R7
+ ## Check cdr(pair) == nil.
+ P1_MOV_R1_R6
+ P1_LI_R7
+ &cdr
+ P1_CALL ## r0 = cdr
+ P1_LI_R2
+ '07000000'
+ P1_LI_R7
+        &not_nil_err
+ P1_BNE_R0_R2_R7
-## ---- run_file(path) --------------------------------------------------
-## r1 = path. Opens O_RDONLY, allocs a 256-byte buffer, reads a chunk,
-## closes, exits with buf[0] as status. Never returns on the happy path.
-##
-## Register discipline: fd lives in r6 (callee-saved) across everything.
-## buf is spilled to the PROLOGUE scratch slot [sp+8] immediately after
-## alloc; it doesn't need to live in a register because SYSCALL now
-## preserves r1..r6 (see P1.md §"Syscall conventions"), leaving [sp+8]
-## as the only thing we need to survive the BLT and close across.
-:run_file
- P1_PROLOGUE
+ ## x = car(pair); expect tag == fixnum.
+ P1_MOV_R1_R6
+ P1_LI_R7
+ &car
+ P1_CALL ## r0 = car
+ P1_MOV_R6_R0 ## r6 = car (callee-saved)
- ## open(path, O_RDONLY=0, mode=0). Path is already in r1.
+ P1_MOV_R1_R6
+ P1_ANDI_R1_R1_7
P1_LI_R2
- '00000000'
- P1_LI_R3
- '00000000'
- P1_SYSOPEN ## r0 = fd (or -errno)
+ '01000000'
+ P1_LI_R7
+        &not_fixnum_err
+ P1_BNE_R1_R2_R7
- ## fd < 0 -> open error
+ ## exit(car >> 3) — yields 42.
+ P1_MOV_R1_R6
P1_LI_R2
- '00000000'
- P1_LI_R7
- &open_err
- P1_BLT_R0_R2_R7
+ '03000000'
+ P1_SHR_R1_R1_R2 ## r1 = fixnum value
+ P1_LI_R0
+ SYS_EXIT
+ P1_SYSCALL
+ ## No return.
- P1_MOV_R6_R0 ## r6 = fd (callee-saved across calls/syscalls)
- ## alloc(256). 256 is 8-byte aligned, so no rounding needed.
+## ---- cons(car, cdr) -> pair -----------------------------------------
+## r1 = car, r2 = cdr. Allocates 16 bytes, writes the two cells, returns
+## (ptr | PAIR_TAG). Uses [sp+8] for car and :cons_save_cdr for cdr to
+## shepherd both args across the alloc() call (which clobbers r1..r5).
+:cons
+ P1_PROLOGUE
+
+ ## Spill car to [sp+8].
+ P1_MOV_R4_SP
+ P1_ST_R1_R4_8
+
+ ## Spill cdr to :cons_save_cdr.
+ P1_LI_R4
+ &cons_save_cdr
+ P1_ST_R2_R4_0
+
+ ## alloc(16). 16 is already 8-byte aligned.
P1_LI_R1
- '00010000' ## r1 = 256
+ '10000000'
P1_LI_R7
&alloc
- P1_CALL ## r0 = buf
+ P1_CALL ## r0 = raw (untagged) pair ptr
- ## Spill buf to [sp+8] so it survives the read/close syscalls and
- ## their bracketing BLT (which clobbers r7 via the LI_R7 pattern).
+ ## [ptr+0] = car.
P1_MOV_R4_SP
- P1_ST_R0_R4_8 ## *(sp+8) = buf
+ P1_LD_R1_R4_8 ## r1 = car
+ P1_ST_R1_R0_0
- ## read(fd, buf, 256). Copy buf into r2 before the SYS_READ load
- ## overwrites r0.
- P1_MOV_R2_R0 ## r2 = buf
- P1_LI_R0
- SYS_READ
- P1_MOV_R1_R6 ## r1 = fd
- P1_LI_R3
- '00010000' ## r3 = 256
- P1_SYSCALL ## r0 = bytes_read; r1..r6 preserved
+ ## [ptr+8] = cdr.
+ P1_LI_R4
+ &cons_save_cdr
+ P1_LD_R2_R4_0 ## r2 = cdr
+ P1_ST_R2_R0_8
- ## bytes_read < 1 -> read error (covers empty file + syscall errno)
- P1_LI_R2
- '01000000'
- P1_LI_R7
- &read_err
- P1_BLT_R0_R2_R7
+ ## Tag as a pair and return.
+ P1_ORI_R0_R0_2
- ## close(fd). Ignore the return — fd was valid on open.
- P1_LI_R0
- SYS_CLOSE
- P1_MOV_R1_R6
- P1_SYSCALL
+ P1_EPILOGUE
+ P1_RET
- ## exit(buf[0]). Reload buf from the scratch slot first — the load
- ## lands in r0, so we must set the SYS_EXIT number AFTER dereferencing.
- P1_MOV_R4_SP
- P1_LD_R0_R4_8 ## r0 = buf
- P1_LB_R1_R0_0 ## r1 = zext8(buf[0])
- P1_LI_R0
- SYS_EXIT
- P1_SYSCALL
- ## No return.
+
+## ---- car(pair) -> value ---------------------------------------------
+## Leaf function — no PROLOGUE/EPILOGUE (makes no nested calls). Strips
+## the pair tag by subtracting 2, then loads [ptr+0].
+:car
+ P1_ADDI_R1_R1_NEG2
+ P1_LD_R0_R1_0
+ P1_RET
+
+
+## ---- cdr(pair) -> value ---------------------------------------------
+:cdr
+ P1_ADDI_R1_R1_NEG2
+ P1_LD_R0_R1_8
+ P1_RET
## ---- alloc(size) -> ptr ---------------------------------------------
## Bump allocator over [heap_start, heap_end). Caller must pass an
-## 8-byte-aligned size (step 1 skeleton — later steps round). On
-## overflow branches to alloc_oom, which funnels through `error`.
-##
-## Layout of state cells in BSS:
-## heap_next : current bump pointer (64-bit)
-## heap_end : arena limit (64-bit)
+## 8-byte-aligned size (caller-rounds for now). On overflow branches to
+## :alloc_oom, which funnels through `error`.
:alloc
P1_PROLOGUE
- ## r1 = size
P1_LI_R4
&heap_next
@@ -182,9 +214,9 @@
## Callers reach this via LI_R7 &error ; P1_B (branch, not call) and
## arrange r1 = msg_ptr, r2 = msg_len beforehand.
:error
- ## r1 = msg_ptr, r2 = msg_len — spill to callee-saved regs because
- ## the three write() calls clobber r0..r3. We never return, so we
- ## don't preserve the caller's r6/r7.
+ ## Spill msg_ptr/msg_len into callee-saved regs across the write
+ ## calls (which clobber r0..r3). No need to preserve r6/r7 — we
+ ## never return.
P1_MOV_R6_R1
P1_MOV_R7_R2
@@ -228,51 +260,41 @@
## ---- Error landing pads ---------------------------------------------
-## Each pad loads (msg_ptr, msg_len) into r1/r2 and branches to `error`.
-## They are plain branch targets, not P1 functions — no PROLOGUE, no
-## return. Message lengths are spelled as literal imm32s; keep them in
-## sync if you edit the strings below.
-:usage_err
- P1_LI_R1
- &msg_usage
- P1_LI_R2
- '16000000' ## strlen("usage: lisp <file.scm>") = 22
- P1_LI_R7
- &error
- P1_B
-
-:open_err
+## Plain branch targets (no PROLOGUE, no return). Load (msg_ptr, msg_len)
+## into r1/r2, then r7-indirect branch into `error`. Keep message-length
+## imm32s in sync with the strings below.
+:alloc_oom
P1_LI_R1
- &msg_open
+ &msg_oom
P1_LI_R2
- '0B000000' ## strlen("open failed") = 11
+ '0E000000' ## strlen("heap exhausted") = 14
P1_LI_R7
&error
P1_B
-:read_err
+:not_pair_err
P1_LI_R1
- &msg_read
+ &msg_not_pair
P1_LI_R2
- '0B000000' ## strlen("read failed") = 11
+ '0D000000' ## strlen("expected pair") = 13
P1_LI_R7
&error
P1_B
-:alloc_oom
+:not_nil_err
P1_LI_R1
- &msg_oom
+ &msg_not_nil
P1_LI_R2
- '0E000000' ## strlen("heap exhausted") = 14
+ '0C000000' ## strlen("expected nil") = 12
P1_LI_R7
&error
P1_B
-:internal_err
+:not_fixnum_err
P1_LI_R1
- &msg_internal
+ &msg_not_fixnum
P1_LI_R2
- '0E000000' ## strlen("internal error") = 14
+ '0F000000' ## strlen("expected fixnum") = 15
P1_LI_R7
&error
P1_B
@@ -284,23 +306,21 @@
:msg_newline
"
"
-:msg_usage
-"usage: lisp <file.scm>"
-:msg_open
-"open failed"
-:msg_read
-"read failed"
:msg_oom
"heap exhausted"
-:msg_internal
-"internal error"
+:msg_not_pair
+"expected pair"
+:msg_not_nil
+"expected nil"
+:msg_not_fixnum
+"expected fixnum"
## ---- Heap arena -----------------------------------------------------
-## 1 KiB for step 1 — enough for the 256-byte read buffer plus some
-## headroom. Step 9 (LISP.md §"Mark-sweep GC") grows this to 20 MiB
-## via an ELF header change (memsz > filesz) so the arena stops
-## inflating the on-disk binary.
+## 1 KiB for step 2 — a single 16-byte pair plus headroom. Step 9
+## (LISP.md §"Mark-sweep GC") grows this to 20 MiB via an ELF header
+## change (memsz > filesz) so the arena stops inflating the on-disk
+## binary.
:heap_start
'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
diff --git a/p1_aarch64.M1 b/p1_aarch64.M1
@@ -272,3 +272,39 @@ DEFINE P1_MOV_R2_R0 E20300AA ## mov x2, x0
DEFINE P1_ST_R0_R4_8 800400F9 ## str x0, [x4, #8]
DEFINE P1_LD_R0_R4_8 800440F9 ## ldr x0, [x4, #8]
DEFINE P1_LB_R1_R0_0 01004039 ## ldrb w1, [x0, #0] (zero-extend)
+
+
+## ---- Tranche 8: seed-Lisp step 2 (tagged values) ---------------------
+## Ops required by lisp.M1 (LISP.md item 2, tagged values): cons / car /
+## cdr plus tag checks (ANDI #7, BNE) and pair-tag patch (ORI #2).
+## car and cdr untag the pair pointer by subtracting 2 before loading.
+
+## ADDI signed imm (negative): sub x1, x1, #2 (untag pair pointer).
+DEFINE P1_ADDI_R1_R1_NEG2 210800D1
+
+## Pair-cell loads/stores. Base 0xF9400000 (LDR) / 0xF9000000 (STR);
+## imm12 is scaled ×8 for 64-bit accesses.
+DEFINE P1_LD_R0_R1_0 200040F9 ## ldr x0, [x1] (car slot)
+DEFINE P1_LD_R0_R1_8 200440F9 ## ldr x0, [x1, #8] (cdr slot)
+DEFINE P1_ST_R1_R0_0 010000F9 ## str x1, [x0] (write car)
+DEFINE P1_LD_R2_R4_0 820040F9 ## ldr x2, [x4] (reload cdr from BSS)
+DEFINE P1_ST_R2_R0_8 020400F9 ## str x2, [x0, #8] (write cdr)
+
+## Bitwise-imm tag ops. aarch64 logical-imm uses N:immr:imms:
+## orr x0, x0, #2 — N=1, immr=63, imms=0 (1 one rotated to bit 1)
+## and x1, x1, #7 — N=1, immr=0, imms=2 (3 contiguous ones)
+DEFINE P1_ORI_R0_R0_2 00007FB2 ## orr x0, x0, #2 (tag pair)
+DEFINE P1_ANDI_R1_R1_7 21084092 ## and x1, x1, #7 (extract tag)
+
+## BNE reg-pair extensions for tag-dispatch / nil-check branches.
+## cmp xA, xB ; b.eq +8 ; br x20
+DEFINE P1_BNE_R1_R2_R7 3F0002EB4000005480021FD6
+DEFINE P1_BNE_R0_R2_R7 1F0002EB4000005480021FD6
+
+## heap_next boot-alignment: (heap_next | 7) + 1 rounds the initial
+## bump pointer up to the next 8-byte boundary. heap_start's offset
+## from ELF_base is code-size-dependent and not aligned on every arch.
+## orr x0, x0, #7 — N=1, immr=0, imms=2 (3 contiguous ones)
+DEFINE P1_ORI_R0_R0_7 000840B2 ## orr x0, x0, #7
+DEFINE P1_ADDI_R0_R0_1 00040091 ## add x0, x0, #1
+DEFINE P1_LD_R0_R4_0 800040F9 ## ldr x0, [x4]
diff --git a/p1_amd64.M1 b/p1_amd64.M1
@@ -238,3 +238,30 @@ DEFINE P1_MOV_R2_R0 4889C6 ## mov rsi, rax
DEFINE P1_ST_R0_R4_8 49894208 ## mov [r10+8], rax
DEFINE P1_LD_R0_R4_8 498B4208 ## mov rax, [r10+8]
DEFINE P1_LB_R1_R0_0 480FB638 ## movzx rdi, byte [rax]
+
+
+## ---- Tranche 8: seed-Lisp step 2 (tagged values) ---------------------
+## See p1_aarch64.M1 §Tranche 8.
+
+## sub rdi, 2 (via the 2-insn mov+add idiom kept for consistency).
+DEFINE P1_ADDI_R1_R1_NEG2 4889FF4883C7FE ## mov rdi,rdi ; add rdi,-2
+
+## Pair-cell loads/stores. REX.W=48 for 64-bit; r10 base needs REX.B.
+DEFINE P1_LD_R0_R1_0 488B07 ## mov rax, [rdi]
+DEFINE P1_LD_R0_R1_8 488B4708 ## mov rax, [rdi+8]
+DEFINE P1_ST_R1_R0_0 488938 ## mov [rax], rdi
+DEFINE P1_LD_R2_R4_0 498B32 ## mov rsi, [r10]
+DEFINE P1_ST_R2_R0_8 48897008 ## mov [rax+8], rsi
+
+## Tag ops — imm8 sign-extended via opcode 83 (/1 or, /4 and).
+DEFINE P1_ORI_R0_R0_2 4889C04883C802 ## mov rax,rax ; or rax, 2
+DEFINE P1_ANDI_R1_R1_7 4889FF4883E707 ## mov rdi,rdi ; and rdi, 7
+
+## BNE pairs: cmp a,b ; je +3 ; jmp r12
+DEFINE P1_BNE_R1_R2_R7 4839F7740341FFE4 ## cmp rdi,rsi ; je +3 ; jmp r12
+DEFINE P1_BNE_R0_R2_R7 4839F0740341FFE4 ## cmp rax,rsi ; je +3 ; jmp r12
+
+## heap_next boot-alignment helpers (see p1_aarch64.M1 §Tranche 8).
+DEFINE P1_ORI_R0_R0_7 4889C04883C807 ## mov rax,rax ; or rax, 7
+DEFINE P1_ADDI_R0_R0_1 4889C04883C001 ## mov rax,rax ; add rax, 1
+DEFINE P1_LD_R0_R4_0 498B02 ## mov rax, [r10]
diff --git a/p1_riscv64.M1 b/p1_riscv64.M1
@@ -241,3 +241,30 @@ DEFINE P1_MOV_R2_R0 13060500 ## addi a2, a0, 0 (mv a2, a0)
DEFINE P1_ST_R0_R4_8 2334A700 ## sd a0, 8(a4)
DEFINE P1_LD_R0_R4_8 03358700 ## ld a0, 8(a4)
DEFINE P1_LB_R1_R0_0 83450500 ## lbu a1, 0(a0)
+
+
+## ---- Tranche 8: seed-Lisp step 2 (tagged values) ---------------------
+## See p1_aarch64.M1 §Tranche 8.
+
+## addi a1, a1, -2 — sign-extended imm12 = 0xFFE.
+DEFINE P1_ADDI_R1_R1_NEG2 9385E5FF
+
+## Pair-cell loads/stores. Opcode 0x03 (LOAD), 0x23 (STORE), funct3=011 (d).
+DEFINE P1_LD_R0_R1_0 03B50500 ## ld a0, 0(a1)
+DEFINE P1_LD_R0_R1_8 03B58500 ## ld a0, 8(a1)
+DEFINE P1_ST_R1_R0_0 2330B500 ## sd a1, 0(a0)
+DEFINE P1_LD_R2_R4_0 03360700 ## ld a2, 0(a4)
+DEFINE P1_ST_R2_R0_8 2334C500 ## sd a2, 8(a0)
+
+## Tag ops — I-type (funct3=110 ori, 111 andi).
+DEFINE P1_ORI_R0_R0_2 13652500 ## ori a0, a0, 2
+DEFINE P1_ANDI_R1_R1_7 93F57500 ## andi a1, a1, 7
+
+## BNE pairs: beq a,b,+8 ; jalr x0,0(s2) (skip past JALR on equal-false).
+DEFINE P1_BNE_R1_R2_R7 6384C50067000900 ## beq a1,a2,+8 ; jalr x0,0(s2)
+DEFINE P1_BNE_R0_R2_R7 6304C50067000900 ## beq a0,a2,+8 ; jalr x0,0(s2)
+
+## heap_next boot-alignment helpers (see p1_aarch64.M1 §Tranche 8).
+DEFINE P1_ORI_R0_R0_7 13657500 ## ori a0, a0, 7
+DEFINE P1_ADDI_R0_R0_1 13051500 ## addi a0, a0, 1
+DEFINE P1_LD_R0_R4_0 03350700 ## ld a0, 0(a4)