commit 58e924d7db1574070253c6e9eb60689d0060f5c2
parent d189dc2447799f1159087d8a3383382b2de6d49b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 20 Apr 2026 19:12:26 -0700
P1 refine conventions; add lisp.M1 step 1
Observable contract tightened so callers don't need per-arch register
clobber knowledge:
- SYSCALL preserves r1-r6 on all three arches. aarch64/riscv64 gain
10 extra movs using reserved callee-saved natives (x21-x25 / s3-s7)
as the save area; amd64 already did. r0 is the sole clobber.
- r7 reclassified as per-op scratch. The r7-indirect branch pattern
overwrites it at every BLT/B anyway; documenting reality avoids
future bugs like the one in an earlier draft of run_file.
- PROLOGUE reserves [sp+8] as callee-private scratch on all arches.
aarch64/riscv64 already allocated 16 bytes; amd64 now does real
work (pop r11 ; sub rsp,16 ; push r11) to match.
lisp.M1 step 1 is the proof-of-life for the P1-hosted Lisp (LISP.md
item 1): opens argv[1], reads the first byte, exits with it as status.
Exercises the new conventions by spilling buf to [sp+8] across the
read/close syscalls and bracketing BLT instead of the earlier
heap_start-reload hack.
Tested on amd64, aarch64, riscv64 (demo.M1 still exits 5; lisp.M1
exits 127 on its own ELF, 35 on SEED.md).
Diffstat:
| M | P1.md | | | 75 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------- |
| A | lisp.M1 | | | 315 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | p1_aarch64.M1 | | | 103 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- |
| M | p1_amd64.M1 | | | 79 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- |
| M | p1_riscv64.M1 | | | 97 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------- |
5 files changed, 614 insertions(+), 55 deletions(-)
diff --git a/P1.md b/P1.md
@@ -66,13 +66,15 @@ distances inside the DEFINE.
| Registers | 8 GPRs (`r0`–`r7`) + `sp`, `lr`-on-stack | Fits x86-64's usable register budget |
| Narrow imm | Signed 12-bit | riscv I-type width; aarch64 ≤12 also OK |
| Wide imm | Pool-loaded via PC-relative `LI` | Avoids arch-specific immediate synthesis |
-| Calling conv | r0 = return, r1–r6 = args, r6–r7 callee-saved | P1-defined; not platform ABI |
+| Calling conv | r0 = return, r1–r6 = args, r6 callee-saved, r7 per-op scratch | P1-defined; not platform ABI |
| Return address | Always spilled to stack on entry | Hides x86's missing `lr` uniformly |
-| Syscall | `SYSCALL` with num in r0, args r1–r6 | Per-arch wrapper emits native sequence |
+| Syscall | `SYSCALL` with num in r0, args r1–r6; clobbers r0 only | Per-arch wrapper emits native sequence |
+| Spill slot | `[sp + 8]` is callee-private scratch after `PROLOGUE` | Frame already 16 B for alignment; second cell was otherwise unused |
## Register mapping
-All mappings are callee-saved for `r6`/`r7`, caller-saved otherwise.
+`r0`–`r5` are caller-saved. `r6` is callee-saved. `r7` is per-op scratch
+(see below). `sp` is special-purpose — see `PROLOGUE` semantics.
| P1 | amd64 | aarch64 | riscv64 |
|------|-------|---------|---------|
@@ -93,6 +95,21 @@ the stack too, so all three converge on "return address lives in
`[sp + 0]` after prologue." This uniformity is worth the extra store on the
register-rich arches.
+**`r7` is not a general-purpose callee-saved register.** Every conditional
+and unconditional branch in P1 compiles through the r7-indirect pattern
+(`LI_R7 &target ; BLT/B`), which overwrites `r7` at every branch site.
+Treat `r7` as a per-instruction scratch owned by the branch / call
+machinery: its value is meaningful only between the `LI_R7` that loads a
+target and the branch that consumes it. Never carry a live value through a
+branch or call in `r7`.
+
+**Syscall-reserved registers (not available to P1):** on aarch64, `x21`–
+`x25` are used by the `SYSCALL` expansion as save slots for `r1`–`r5`
+around the arg shuffle; on riscv64, `s3`–`s7` serve the same role. The
+kernel preserves them, and `SYSCALL` restores them, so to P1 programs they
+simply don't exist. amd64 has no such reservation (native arg regs already
+align with P1 `r1`–`r5`).
+
## Instruction set (~30 ops)
```
@@ -154,12 +171,19 @@ SYSCALL # num in r0, args r1-r6, ret in r0
- `CALL %label` pushes a return address (via the arch's native mechanism
or the caller-emitted `PROLOGUE`, see below) and jumps. `RET` pops and
jumps.
-- `PROLOGUE` / `EPILOGUE` set up and tear down a 16-byte frame, spilling
- the return address to `[sp + 0]` on entry and reloading it on exit. On
- amd64 the native `CALL` already pushes the retaddr, so both are a NOP;
- on aarch64/riscv64 they do real work spilling `lr`/`ra`. After
- `PROLOGUE`, `[sp + 0]` holds the caller's return address uniformly
- across all three arches.
+- `PROLOGUE` / `EPILOGUE` set up and tear down a 16-byte frame. After
+ `PROLOGUE`, `[sp + 0]` holds the caller's return address and
+ `[sp + 8]` is a **callee-private scratch slot** — one 8-byte cell that
+ each function may spill into across `CALL`s (or across any sequence
+ where the live register set is inconvenient). The slot is private to
+ the current frame: a nested `PROLOGUE` allocates its own pair of
+ cells, so the parent's spill at `[sp + 8]` survives unchanged.
+ Per-arch mechanics differ — aarch64/riscv64 `PROLOGUE` subtracts 16
+ from `sp` and stores `lr`/`ra` at `[sp + 0]`; amd64 pops the retaddr
+  the native `call` already pushed into a non-P1 scratch (`r11`), subtracts
+ 16, then re-pushes it so the final layout matches. Access the scratch
+ slot via `MOV rX, sp` followed by `LD rY, rX, 8` / `ST rY, rX, 8`; `sp`
+ itself isn't a valid base for `LD`/`ST`.
- `TAIL %label` is a tail call — it performs the current function's
standard epilogue (restore `lr` from `[sp+0]`, pop the frame) and then
branches unconditionally to `%label`, reusing the caller's return
@@ -214,14 +238,31 @@ Linux syscall mechanics differ across arches. The `SYSCALL` macro hides this.
| aarch64 | `svc #0` | `x8` | `x0 – x5` |
| riscv64 | `ecall` | `a7` | `a0 – a5` |
-On aarch64 and riscv64 the P1 register mapping already places args in the
-native arg regs; only the number register differs (`r0` → `x8`/`a7`). The
-`SYSCALL` expansion emits a `mov` from `r0` to the arch's num register, then
-`svc 0`/`ecall`.
-
-On amd64 the P1 mapping matches the syscall ABI *except* `r0`/`rax` is the
-num reg (correct) and return reg (correct). Perfect — `SYSCALL` expands to
-a single `syscall` instruction.
+**Observable semantics:** `SYSCALL` takes the number in `r0` and args in
+`r1`–`r6`, traps, and returns the kernel's result in `r0`. **Only `r0` is
+clobbered.** `r1`–`r7` are preserved across `SYSCALL` on every arch. This
+matches the kernel's own register discipline and lets callers thread live
+values through syscalls without per-arch save/restore dances.
+
+The per-arch expansions:
+
+- **amd64** — P1 args already occupy the native arg regs except for arg6
+ (`r9` vs. P1 `r6`/`rbx`). Expansion is `mov r9, rbx ; syscall`. The
+  kernel preserves everything except `rax`, `rcx`, `r11`; neither `rcx`
+  nor `r11` is a P1 register, so the only visible clobber is P1 `r0`.
+- **aarch64** — native arg regs are `x0`–`x5` but P1 puts args in
+ `x1`–`x5`,`x19` (one register higher). The expansion saves P1 `r1`–
+ `r5` into `x21`–`x25` (reserved for SYSCALL, see Register mapping),
+ shuffles the saved values into `x0`–`x4` plus `x19` into `x5`, moves
+ the number into `x8`, `svc #0`s, then restores `r1`–`r5` from the
+  `x21`–`x25` saves. Net cost: 10 additional moves vs. the previous
+  seven-mov expansion (17 movs total around the `svc`).
+- **riscv64** — same shape as aarch64, with `s3`–`s7` as the save slots
+ and `a7` as the number register.
+
+The extra moves on aarch64/riscv64 are a few nanoseconds per syscall.
+Trading them for uniform "clobbers `r0` only" semantics is worth it:
+callers don't need to memorize a per-arch clobber set.
### Syscall numbers
diff --git a/lisp.M1 b/lisp.M1
@@ -0,0 +1,315 @@
+## lisp.M1 — Seed Lisp interpreter (portable across aarch64/amd64/riscv64)
+##
+## Step 1: proof of life. Establishes the runtime skeleton documented
+## in LISP.md §"Staged implementation plan" item 1:
+##
+## - _start + argv parsing (argc at [sp+0], argv at [sp+8])
+## - syscall wrappers: open, read, close, write, exit
+## - error path: "error: <msg>\n" to fd 2, exit 1
+## - BSS layout: heap_next / heap_end pointer cells + a heap arena
+## - bump allocator (minimal: caller passes 8-byte-aligned size)
+##
+## Proof: opens argv[1], reads up to 256 bytes into a freshly-allocated
+## buffer, closes, exits with buf[0] as the status byte. argc errors,
+## open errors, and short reads each take the error path (fd 2, exit 1).
+##
+## Later steps build tagged cells (§2), strings + symbol interning (§3),
+## reader (§4), printer (§5), eval (§6), … on top. This file should
+## stay tight — LISP.md §"Settled decisions" item 7 targets a single
+## lisp.M1 source up to ~6k LOC.
+
+## ---- heap-state cells ------------------------------------------------
+## Placed at the very start of ELF_text so their file-offset alignment is
+## predictable. ELF header (64 B) + 1 program header (56 B) = 120 B, so
+## :heap_next lands at offset 120 from ELF_base — 8-byte aligned, which
+## keeps aarch64's LDR-X64 happy. Both cells are 4-byte hex2 address
+## labels zero-padded to 8 bytes (ELF base < 4 GiB, so the zero-extension
+## matches the 64-bit load).
+##
+## The zero pad is spelled as a quoted hex literal rather than bare
+## `00 00 00 00` tokens: stage0-posix M0 on riscv64 rejects bare
+## hex-byte tokens whose first nibble is '0'–'9' (rc=1, no diagnostic),
+## while amd64/aarch64 M0 accept them. `'XXXXXXXX'` is portable.
+:heap_next
+&heap_start
+'00000000'
+:heap_end
+&heap_tail
+'00000000'
+
+
+## ---- _start ----------------------------------------------------------
+## Linux process entry. argc is at [sp+0], argv[0] at [sp+8], argv[1] at
+## [sp+16], ... _start is not P1-callable (no P1_PROLOGUE, no RET); the
+## only exit is through the SYS_EXIT syscall inside run_file or error.
+:_start
+ P1_MOV_R6_SP ## r6 = sp (argv base, survives syscalls)
+
+ ## argc < 2 -> usage error
+ P1_LD_R0_R6_0 ## r0 = argc
+ P1_LI_R2
+ '02000000' ## r2 = 2
+ P1_LI_R7
+ &usage_err
+ P1_BLT_R0_R2_R7
+
+ ## r1 = argv[1] (path to source file)
+ P1_LD_R1_R6_16
+
+ ## run_file(path). Never returns on the happy path.
+ P1_LI_R7
+ &run_file
+ P1_CALL
+
+ ## Defence-in-depth: run_file should have exited via SYS_EXIT. If
+ ## control flows back here something is badly wrong — jump to the
+ ## error path.
+ P1_LI_R7
+ &internal_err
+ P1_B
+
+
+## ---- run_file(path) --------------------------------------------------
+## r1 = path. Opens O_RDONLY, allocs a 256-byte buffer, reads a chunk,
+## closes, exits with buf[0] as status. Never returns on the happy path.
+##
+## Register discipline: fd lives in r6 (callee-saved) across everything.
+## buf is spilled to the PROLOGUE scratch slot [sp+8] immediately after
+## alloc; it doesn't need to live in a register because SYSCALL now
+## preserves r1..r6 (see P1.md §"Syscall conventions"), leaving [sp+8]
+## as the only thing we need to survive the BLT and close across.
+:run_file
+ P1_PROLOGUE
+
+ ## open(path, O_RDONLY=0, mode=0). Path is already in r1.
+ P1_LI_R2
+ '00000000'
+ P1_LI_R3
+ '00000000'
+ P1_SYSOPEN ## r0 = fd (or -errno)
+
+ ## fd < 0 -> open error
+ P1_LI_R2
+ '00000000'
+ P1_LI_R7
+ &open_err
+ P1_BLT_R0_R2_R7
+
+ P1_MOV_R6_R0 ## r6 = fd (callee-saved across calls/syscalls)
+
+ ## alloc(256). 256 is 8-byte aligned, so no rounding needed.
+ P1_LI_R1
+ '00010000' ## r1 = 256
+ P1_LI_R7
+ &alloc
+ P1_CALL ## r0 = buf
+
+ ## Spill buf to [sp+8] so it survives the read/close syscalls and
+ ## their bracketing BLT (which clobbers r7 via the LI_R7 pattern).
+ P1_MOV_R4_SP
+ P1_ST_R0_R4_8 ## *(sp+8) = buf
+
+ ## read(fd, buf, 256). Copy buf into r2 before the SYS_READ load
+ ## overwrites r0.
+ P1_MOV_R2_R0 ## r2 = buf
+ P1_LI_R0
+ SYS_READ
+ P1_MOV_R1_R6 ## r1 = fd
+ P1_LI_R3
+ '00010000' ## r3 = 256
+ P1_SYSCALL ## r0 = bytes_read; r1..r6 preserved
+
+ ## bytes_read < 1 -> read error (covers empty file + syscall errno)
+ P1_LI_R2
+ '01000000'
+ P1_LI_R7
+ &read_err
+ P1_BLT_R0_R2_R7
+
+ ## close(fd). Ignore the return — fd was valid on open.
+ P1_LI_R0
+ SYS_CLOSE
+ P1_MOV_R1_R6
+ P1_SYSCALL
+
+ ## exit(buf[0]). Reload buf from the scratch slot first — the load
+ ## lands in r0, so we must set the SYS_EXIT number AFTER dereferencing.
+ P1_MOV_R4_SP
+ P1_LD_R0_R4_8 ## r0 = buf
+ P1_LB_R1_R0_0 ## r1 = zext8(buf[0])
+ P1_LI_R0
+ SYS_EXIT
+ P1_SYSCALL
+ ## No return.
+
+
+## ---- alloc(size) -> ptr ---------------------------------------------
+## Bump allocator over [heap_start, heap_end). Caller must pass an
+## 8-byte-aligned size (step 1 skeleton — later steps round). On
+## overflow branches to alloc_oom, which funnels through `error`.
+##
+## Layout of state cells in BSS:
+## heap_next : current bump pointer (64-bit)
+## heap_end : arena limit (64-bit)
+:alloc
+ P1_PROLOGUE
+ ## r1 = size
+
+ P1_LI_R4
+ &heap_next
+ P1_LD_R3_R4_0 ## r3 = *heap_next (old bump ptr)
+
+ P1_ADD_R2_R3_R1 ## r2 = new_next = old + size
+
+ P1_LI_R5
+ &heap_end
+ P1_LD_R0_R5_0 ## r0 = *heap_end
+
+ ## heap_end < new_next -> OOM
+ P1_LI_R7
+ &alloc_oom
+ P1_BLT_R0_R2_R7
+
+ P1_ST_R2_R4_0 ## *heap_next = new_next
+ P1_MOV_R0_R3 ## return old bump ptr
+
+ P1_EPILOGUE
+ P1_RET
+
+
+## ---- error(msg_ptr, msg_len) ----------------------------------------
+## Writes "error: " + msg + "\n" to fd 2, then exit(1). Never returns.
+## Callers reach this via LI_R7 &error ; P1_B (branch, not call) and
+## arrange r1 = msg_ptr, r2 = msg_len beforehand.
+:error
+ ## r1 = msg_ptr, r2 = msg_len — spill to callee-saved regs because
+ ## the three write() calls clobber r0..r3. We never return, so we
+ ## don't preserve the caller's r6/r7.
+ P1_MOV_R6_R1
+ P1_MOV_R7_R2
+
+ ## write(2, "error: ", 7)
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '02000000'
+ P1_LI_R2
+ &msg_error_prefix
+ P1_LI_R3
+ '07000000'
+ P1_SYSCALL
+
+ ## write(2, msg_ptr, msg_len)
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '02000000'
+ P1_MOV_R2_R6
+ P1_MOV_R3_R7
+ P1_SYSCALL
+
+ ## write(2, "\n", 1)
+ P1_LI_R0
+ SYS_WRITE
+ P1_LI_R1
+ '02000000'
+ P1_LI_R2
+ &msg_newline
+ P1_LI_R3
+ '01000000'
+ P1_SYSCALL
+
+ ## exit(1)
+ P1_LI_R0
+ SYS_EXIT
+ P1_LI_R1
+ '01000000'
+ P1_SYSCALL
+
+
+## ---- Error landing pads ---------------------------------------------
+## Each pad loads (msg_ptr, msg_len) into r1/r2 and branches to `error`.
+## They are plain branch targets, not P1 functions — no PROLOGUE, no
+## return. Message lengths are spelled as literal imm32s; keep them in
+## sync if you edit the strings below.
+:usage_err
+ P1_LI_R1
+ &msg_usage
+ P1_LI_R2
+ '16000000' ## strlen("usage: lisp <file.scm>") = 22
+ P1_LI_R7
+ &error
+ P1_B
+
+:open_err
+ P1_LI_R1
+ &msg_open
+ P1_LI_R2
+ '0B000000' ## strlen("open failed") = 11
+ P1_LI_R7
+ &error
+ P1_B
+
+:read_err
+ P1_LI_R1
+ &msg_read
+ P1_LI_R2
+ '0B000000' ## strlen("read failed") = 11
+ P1_LI_R7
+ &error
+ P1_B
+
+:alloc_oom
+ P1_LI_R1
+ &msg_oom
+ P1_LI_R2
+ '0E000000' ## strlen("heap exhausted") = 14
+ P1_LI_R7
+ &error
+ P1_B
+
+:internal_err
+ P1_LI_R1
+ &msg_internal
+ P1_LI_R2
+ '0E000000' ## strlen("internal error") = 14
+ P1_LI_R7
+ &error
+ P1_B
+
+
+## ---- Static strings -------------------------------------------------
+:msg_error_prefix
+"error: "
+:msg_newline
+"
+"
+:msg_usage
+"usage: lisp <file.scm>"
+:msg_open
+"open failed"
+:msg_read
+"read failed"
+:msg_oom
+"heap exhausted"
+:msg_internal
+"internal error"
+
+
+## ---- Heap arena -----------------------------------------------------
+## 1 KiB for step 1 — enough for the 256-byte read buffer plus some
+## headroom. Step 9 (LISP.md §"Mark-sweep GC") grows this to 20 MiB
+## via an ELF header change (memsz > filesz) so the arena stops
+## inflating the on-disk binary.
+:heap_start
+'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+:heap_tail
+
+:ELF_end
diff --git a/p1_aarch64.M1 b/p1_aarch64.M1
@@ -34,27 +34,38 @@ DEFINE P1_LI_R5 4500001802000014
DEFINE P1_LI_R6 5300001802000014
DEFINE P1_LI_R7 5400001802000014
-## SYSCALL — num in r0, args r1..r6, result in r0.
+## SYSCALL — num in r0, args r1..r6, result in r0. Clobbers r0 only.
##
-## aarch64 Linux wants num in x8 and args in x0..x5. P1's mapping puts
-## args one register higher than the native ABI, so SYSCALL shuffles
-## x0..x19 down into x0..x5 and moves the num into x8, then svc #0.
+## aarch64 Linux wants num in x8 and args in x0..x5. P1 puts args one
+## register higher (x1..x5, x19), so we save r1..r5 into x21..x25 before
+## shuffling them down into x0..x4; x5 comes from x19 directly; x8 gets
+## the number. After `svc #0` we restore r1..r5 from the x21..x25 saves
+## so the caller sees only r0 clobbered.
##
-## Unconditional (every P1 ISA expansion is unoptimized). Inputs not
-## used by a given syscall are shuffled through harmlessly — x0..x5 are
-## caller-saved on the aarch64 platform ABI, and the kernel only reads
-## the registers the specific syscall cares about.
+## x21..x25 are reserved for this expansion and invisible to P1 code
+## (see P1.md §"Register mapping"). Using callee-saved native regs means
+## zero memory traffic — the save-and-restore is pure register moves.
##
-## Expansion:
-## mov x8, x0 ; P1 r0 (num) -> native num reg
-## mov x0, x1 ; P1 r1 -> native arg1
-## mov x1, x2 ; P1 r2 -> native arg2
-## mov x2, x3 ; P1 r3 -> native arg3
-## mov x3, x4 ; P1 r4 -> native arg4
-## mov x4, x5 ; P1 r5 -> native arg5
-## mov x5, x19 ; P1 r6 (x19) -> native arg6
+## Expansion (18 insns, 72 bytes):
+## mov x8, x0 ; num
+## mov x21, x1 ; save P1 r1
+## mov x22, x2 ; save P1 r2
+## mov x23, x3 ; save P1 r3
+## mov x24, x4 ; save P1 r4
+## mov x25, x5 ; save P1 r5
+## mov x0, x21 ; arg0 <- saved r1
+## mov x1, x22 ; arg1 <- saved r2
+## mov x2, x23 ; arg2 <- saved r3
+## mov x3, x24 ; arg3 <- saved r4
+## mov x4, x25 ; arg4 <- saved r5
+## mov x5, x19 ; arg5 <- P1 r6
## svc #0
-DEFINE P1_SYSCALL e80300aae00301aae10302aae20303aae30304aae40305aae50313aa010000d4
+## mov x1, x21 ; restore r1
+## mov x2, x22 ; restore r2
+## mov x3, x23 ; restore r3
+## mov x4, x24 ; restore r4
+## mov x5, x25 ; restore r5
+DEFINE P1_SYSCALL E80300AAF50301AAF60302AAF70303AAF80304AAF90305AAE00315AAE10316AAE20317AAE30318AAE40319AAE50313AA010000D4E10315AAE20316AAE30317AAE40318AAE50319AA
## Linux syscall numbers (aarch64 uses the generic table).
@@ -203,3 +214,61 @@ DEFINE P1_EPILOGUE FE0340F9FF430091 ## ldr x30,[sp] ; add sp,#16
DEFINE P1_RET C0035FD6 ## ret (br x30)
DEFINE P1_CALL 80023FD6 ## blr x20
DEFINE P1_TAIL FE0340F9FF43009180021FD6 ## epilogue ; br x20
+
+
+## ---- Tranche 6: seed-Lisp step 1 extensions ---------------------------
+## Ops required by lisp.M1 (LISP.md item 1, proof-of-life): file I/O
+## wrappers, cell loads/stores, and the arg-shuffling MOVs they imply.
+
+## Extra Linux syscall numbers (generic table, same as riscv64).
+DEFINE SYS_READ 3F000000
+DEFINE SYS_CLOSE 39000000
+
+## SYSOPEN — portable open(path, flags, mode). Assumes r1=path, r2=flags,
+## r3=mode. aarch64 has no open(2); it uses openat(2) = syscall 56, which
+## wants (dirfd=AT_FDCWD, path, flags, mode). P1's r1..r3 already line up
+## with native x1..x3, so only x0=AT_FDCWD and x8=56 need loading.
+##
+## Expansion:
+## movn x0, #99 ; x0 = -100 = AT_FDCWD (60 0C 80 92)
+## movz x8, #56 ; x8 = SYS_openat (08 07 80 D2)
+## svc #0 (01 00 00 D4)
+DEFINE P1_SYSOPEN 600C8092080780D2010000D4
+
+## MOV xD, xA — extra register pairings used around syscalls / calls.
+DEFINE P1_MOV_R6_R0 F30300AA ## mov x19, x0
+DEFINE P1_MOV_R0_R3 E00303AA ## mov x0, x3
+DEFINE P1_MOV_R7_R0 F40300AA ## mov x20, x0
+DEFINE P1_MOV_R7_R2 F40302AA ## mov x20, x2
+DEFINE P1_MOV_R2_R6 E20313AA ## mov x2, x19
+DEFINE P1_MOV_R3_R7 E30314AA ## mov x3, x20
+DEFINE P1_MOV_R2_R7 E20314AA ## mov x2, x20
+DEFINE P1_MOV_R4_R7 E40314AA ## mov x4, x20
+
+## LD/ST extras — additional dst/base pairings at scaled imm12.
+DEFINE P1_LD_R0_R6_0 600240F9 ## ldr x0, [x19]
+DEFINE P1_LD_R1_R6_16 610A40F9 ## ldr x1, [x19, #16]
+DEFINE P1_LD_R3_R4_0 830040F9 ## ldr x3, [x4]
+DEFINE P1_LD_R0_R5_0 A00040F9 ## ldr x0, [x5]
+DEFINE P1_LB_R1_R4_0 81004039 ## ldrb w1, [x4]
+DEFINE P1_ST_R2_R4_0 820000F9 ## str x2, [x4]
+
+## ADD r2, r3, r1 — x2 = x3 + x1.
+DEFINE P1_ADD_R2_R3_R1 6200018B ## add x2, x3, x1
+
+## BLT r0, r2, r7 — signed jump-if-less; used for argc/open/read checks.
+## cmp x0, x2 ; b.ge +8 ; br x20
+DEFINE P1_BLT_R0_R2_R7 1F0002EB4A00005480021FD6
+
+
+## ---- Tranche 7: [sp+8] scratch-slot access + assorted MOV/LD/ST -------
+## P1.md §PROLOGUE defines [sp+8] as a callee-private scratch cell.
+## Access pattern: MOV rX, sp then LD/ST rY, [rX, 8]. These defs cover
+## the specific register pairings lisp.M1 needs to spill the `buf`
+## pointer around the read-syscall + BLT + close-syscall sequence.
+
+DEFINE P1_MOV_R4_SP E4030091 ## add x4, sp, #0 (canonical mov x4, sp)
+DEFINE P1_MOV_R2_R0 E20300AA ## mov x2, x0
+DEFINE P1_ST_R0_R4_8 800400F9 ## str x0, [x4, #8]
+DEFINE P1_LD_R0_R4_8 800440F9 ## ldr x0, [x4, #8]
+DEFINE P1_LB_R1_R0_0 01004039 ## ldrb w1, [x0, #0] (zero-extend)
diff --git a/p1_amd64.M1 b/p1_amd64.M1
@@ -161,17 +161,80 @@ DEFINE P1_BLT_R4_R2_R7 4939F27D0341FFE4 ## cmp r10,rsi ; jge +3 ; j
## ---- Tranche 5: CALL / RET / PROLOGUE / EPILOGUE / TAIL -----------------
-## amd64's native CALL already pushes the return address to the stack, and
-## RET pops it. So PROLOGUE/EPILOGUE are no-ops here — a single NOP keeps
-## them non-empty (handy in disasm, and sidesteps any M0 quirks around an
-## empty DEFINE value). On the other two arches PROLOGUE/EPILOGUE do real
-## work spilling/reloading lr.
+## amd64's native CALL already pushes the return address to the stack, but
+## the P1 convention requires a 16-byte frame on entry so that [sp+0] =
+## retaddr and [sp+8] = callee-private scratch (see P1.md §PROLOGUE).
+## The native CALL only supplies [sp+0]; we have to allocate [sp+8]
+## ourselves. The pop/sub/push dance is the smallest way to do that
+## while keeping retaddr at [sp+0]:
+##
+## pop r11 ; r11 = retaddr (scratch — r11 is not a P1 reg)
+## sub rsp, 16 ; reserve 2 cells
+## push r11 ; [sp+0] = retaddr, [sp+8] = scratch
+##
+## r11 is Linux's syscall clobber register and not mapped to any P1 GPR,
+## so using it as the pop scratch is safe. EPILOGUE is the inverse.
##
## CALL expects the target pre-loaded into r7 (= r12); expands to `call r12`.
## TAIL = EPILOGUE + unconditional B (= `jmp r12`).
-DEFINE P1_PROLOGUE 90 ## nop (retaddr already on stack)
-DEFINE P1_EPILOGUE 90 ## nop
+DEFINE P1_PROLOGUE 415B4883EC104153 ## pop r11 ; sub rsp,16 ; push r11
+DEFINE P1_EPILOGUE 415B4883C4104153 ## pop r11 ; add rsp,16 ; push r11
DEFINE P1_RET C3 ## ret
DEFINE P1_CALL 41FFD4 ## call r12
-DEFINE P1_TAIL 9041FFE4 ## epilogue(nop) ; jmp r12
+DEFINE P1_TAIL 415B4883C410415341FFE4 ## epilogue ; jmp r12
+
+
+## ---- Tranche 6: seed-Lisp step 1 extensions ---------------------------
+## Ops required by lisp.M1 (LISP.md item 1, proof-of-life): file I/O
+## wrappers, cell loads/stores, and the arg-shuffling MOVs they imply.
+
+## Extra Linux syscall numbers (amd64-specific table).
+DEFINE SYS_READ 00000000
+DEFINE SYS_CLOSE 03000000
+
+## SYSOPEN — portable open(path, flags, mode). Assumes r1=path, r2=flags,
+## r3=mode. On amd64 maps to open(2) (syscall 2); P1's r1/r2/r3 already
+## sit in rdi/rsi/rdx, so the expansion is just load-num + syscall.
+##
+## Expansion:
+## mov eax, 2 ; B8 02 00 00 00 (zero-extends to rax)
+## syscall ; 0F 05
+DEFINE P1_SYSOPEN B8020000000F05
+
+## MOV rD, rA — extra register pairings used around syscalls / calls.
+DEFINE P1_MOV_R6_R0 4889C3 ## mov rbx, rax
+DEFINE P1_MOV_R0_R3 4889D0 ## mov rax, rdx
+DEFINE P1_MOV_R7_R0 4989C4 ## mov r12, rax
+DEFINE P1_MOV_R7_R2 4989F4 ## mov r12, rsi
+DEFINE P1_MOV_R2_R6 4889DE ## mov rsi, rbx
+DEFINE P1_MOV_R3_R7 4C89E2 ## mov rdx, r12
+DEFINE P1_MOV_R2_R7 4C89E6 ## mov rsi, r12
+DEFINE P1_MOV_R4_R7 4D89E2 ## mov r10, r12
+
+## LD/ST extras — additional dst/base pairings at imm8 offsets.
+DEFINE P1_LD_R0_R6_0 488B03 ## mov rax, [rbx]
+DEFINE P1_LD_R1_R6_16 488B7B10 ## mov rdi, [rbx+16]
+DEFINE P1_LD_R3_R4_0 498B12 ## mov rdx, [r10]
+DEFINE P1_LD_R0_R5_0 498B00 ## mov rax, [r8]
+DEFINE P1_LB_R1_R4_0 490FB63A ## movzx rdi, byte [r10]
+DEFINE P1_ST_R2_R4_0 498932 ## mov [r10], rsi
+
+## ADD r2, r3, r1 — rsi = rdx + rdi. Two-insn form like other ADDs.
+## mov rsi, rdx ; add rsi, rdi
+DEFINE P1_ADD_R2_R3_R1 4889D64801FE
+
+## BLT r0, r2, r7 — signed jump-if-less; used for argc/open/read checks.
+## cmp rax, rsi ; jge +3 ; jmp r12
+DEFINE P1_BLT_R0_R2_R7 4839F07D0341FFE4
+
+
+## ---- Tranche 7: [sp+8] scratch-slot access + assorted MOV/LD/ST -------
+## P1.md §PROLOGUE defines [sp+8] as a callee-private scratch cell.
+## Access pattern: MOV rX, sp then LD/ST rY, [rX, 8].
+
+DEFINE P1_MOV_R4_SP 4989E2 ## mov r10, rsp
+DEFINE P1_MOV_R2_R0 4889C6 ## mov rsi, rax
+DEFINE P1_ST_R0_R4_8 49894208 ## mov [r10+8], rax
+DEFINE P1_LD_R0_R4_8 498B4208 ## mov rax, [r10+8]
+DEFINE P1_LB_R1_R0_0 480FB638 ## movzx rdi, byte [rax]
diff --git a/p1_riscv64.M1 b/p1_riscv64.M1
@@ -36,22 +36,37 @@ DEFINE P1_LI_R5 9707000083E7C7006F008000
DEFINE P1_LI_R6 9704000083E4C4006F008000
DEFINE P1_LI_R7 170900000369C9006F008000
-## SYSCALL — num in r0, args r1..r6, result in r0.
+## SYSCALL — num in r0, args r1..r6, result in r0. Clobbers r0 only.
##
-## riscv64 Linux syscall ABI: num in a7, args in a0..a5. P1's mapping
-## puts args one register higher than native, so SYSCALL shuffles
-## a0..s1 down into a0..a5, moves num (a0) into a7, and traps.
+## riscv64 Linux wants num in a7, args in a0..a5. P1 puts args one
+## register higher (a1..a5, s1), so we save a1..a5 into s3..s7 before
+## shuffling them down into a0..a4; a5 comes from s1; a7 gets the
+## number. After `ecall` we restore a1..a5 from s3..s7 so the caller
+## sees only r0 clobbered.
##
-## Expansion:
-## mv a7, a0 ; P1 r0 (num) -> native num reg
-## mv a0, a1 ; P1 r1 -> native arg1
-## mv a1, a2 ; P1 r2 -> native arg2
-## mv a2, a3 ; P1 r3 -> native arg3
-## mv a3, a4 ; P1 r4 -> native arg4
-## mv a4, a5 ; P1 r5 -> native arg5
-## mv a5, s1 ; P1 r6 -> native arg6
+## s3..s7 are reserved for this expansion and invisible to P1 code (see
+## P1.md §"Register mapping").
+##
+## Expansion (18 insns, 72 bytes):
+## mv a7, a0 ; num
+## mv s3, a1 ; save P1 r1
+## mv s4, a2 ; save P1 r2
+## mv s5, a3 ; save P1 r3
+## mv s6, a4 ; save P1 r4
+## mv s7, a5 ; save P1 r5
+## mv a0, s3 ; arg0 <- saved r1
+## mv a1, s4 ; arg1 <- saved r2
+## mv a2, s5 ; arg2 <- saved r3
+## mv a3, s6 ; arg3 <- saved r4
+## mv a4, s7 ; arg4 <- saved r5
+## mv a5, s1 ; arg5 <- P1 r6
## ecall
-DEFINE P1_SYSCALL 9308050013850500930506001386060093060700138707009387040073000000
+## mv a1, s3 ; restore r1
+## mv a2, s4 ; restore r2
+## mv a3, s5 ; restore r3
+## mv a4, s6 ; restore r4
+## mv a5, s7 ; restore r5
+DEFINE P1_SYSCALL 9308050093890500130A0600938A0600130B0700938B07001385090093050A0013860A0093060B0013870B0093870400730000009385090013060A0093860A0013070B0093870B00
## Linux syscall numbers (riscv64 uses the generic table — same as aarch64).
@@ -170,3 +185,59 @@ DEFINE P1_EPILOGUE 8330010013010101 ## ld ra,0(sp) ; addi sp,sp,16
DEFINE P1_RET 67800000 ## jalr x0, 0(ra)
DEFINE P1_CALL E7000900 ## jalr ra, 0(s2)
DEFINE P1_TAIL 833001001301010167000900 ## epilogue ; jalr x0, 0(s2)
+
+
+## ---- Tranche 6: seed-Lisp step 1 extensions ---------------------------
+## Ops required by lisp.M1 (LISP.md item 1, proof-of-life): file I/O
+## wrappers, cell loads/stores, and the arg-shuffling MOVs they imply.
+
+## Extra Linux syscall numbers (generic table, same as aarch64).
+DEFINE SYS_READ 3F000000
+DEFINE SYS_CLOSE 39000000
+
+## SYSOPEN — portable open(path, flags, mode). Assumes r1=path, r2=flags,
+## r3=mode. riscv64 uses openat(2) = syscall 56; P1's r1..r3 already line
+## up with native a1..a3, so only a0=AT_FDCWD and a7=56 need loading.
+##
+## Expansion:
+## addi a0, x0, -100 ; a0 = AT_FDCWD (13 05 C0 F9)
+## addi a7, x0, 56 ; a7 = SYS_openat (93 08 80 03)
+## ecall (73 00 00 00)
+DEFINE P1_SYSOPEN 1305C0F99308800373000000
+
+## MOV rD, rA — extra register pairings used around syscalls / calls.
+## MOV rD, rA -> addi rD, rA, 0 (the `mv` pseudo).
+DEFINE P1_MOV_R6_R0 93040500 ## mv s1, a0
+DEFINE P1_MOV_R0_R3 13850600 ## mv a0, a3
+DEFINE P1_MOV_R7_R0 13090500 ## mv s2, a0
+DEFINE P1_MOV_R7_R2 13090600 ## mv s2, a2
+DEFINE P1_MOV_R2_R6 13860400 ## mv a2, s1
+DEFINE P1_MOV_R3_R7 93060900 ## mv a3, s2
+DEFINE P1_MOV_R2_R7 13060900 ## mv a2, s2
+DEFINE P1_MOV_R4_R7 13070900 ## mv a4, s2
+
+## LD/ST extras — additional dst/base pairings at signed imm12 offsets.
+DEFINE P1_LD_R0_R6_0 03B50400 ## ld a0, 0(s1)
+DEFINE P1_LD_R1_R6_16 83B50401 ## ld a1, 16(s1)
+DEFINE P1_LD_R3_R4_0 83360700 ## ld a3, 0(a4)
+DEFINE P1_LD_R0_R5_0 03B50700 ## ld a0, 0(a5)
+DEFINE P1_LB_R1_R4_0 83450700 ## lbu a1, 0(a4)
+DEFINE P1_ST_R2_R4_0 2330C700 ## sd a2, 0(a4)
+
+## ADD r2, r3, r1 — a2 = a3 + a1.
+DEFINE P1_ADD_R2_R3_R1 3386B600 ## add a2, a3, a1
+
+## BLT r0, r2, r7 — signed jump-if-less; used for argc/open/read checks.
+## bge a0, a2, +8 ; jalr x0, 0(s2)
+DEFINE P1_BLT_R0_R2_R7 6354C50067000900
+
+
+## ---- Tranche 7: [sp+8] scratch-slot access + assorted MOV/LD/ST -------
+## P1.md §PROLOGUE defines [sp+8] as a callee-private scratch cell.
+## Access pattern: MOV rX, sp then LD/ST rY, [rX, 8].
+
+DEFINE P1_MOV_R4_SP 13070100 ## addi a4, sp, 0 (mv a4, sp)
+DEFINE P1_MOV_R2_R0 13060500 ## addi a2, a0, 0 (mv a2, a0)
+DEFINE P1_ST_R0_R4_8 2334A700 ## sd a0, 8(a4)
+DEFINE P1_LD_R0_R4_8 03358700 ## ld a0, 8(a4)
+DEFINE P1_LB_R1_R0_0 83450500 ## lbu a1, 0(a0)