p1v2 - boot2 - Playing with the boostrap

commit c34eadeb5a6d1e48435c97d722fcc15a677a6a54
parent 1b11e9dca9efcdb09fd2917b96b240e43c8ec91d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 22 Apr 2026 17:51:03 -0700

p1v2

Diffstat:
A docs/P1v2.md  | 503 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A p1/aarch64.py  | 354 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A p1/common.py  | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A p1/llvm_disasm_aarch64.py  | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A p1/p1_gen.py  | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A post.md  | 247 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/m1macro.c  | 886 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

7 files changed, 2464 insertions(+), 0 deletions(-)
diff --git a/docs/P1v2.md b/docs/P1v2.md
@@ -0,0 +1,503 @@
+# P1 v2
+
+## Scope
+
+P1 v2 is a portable pseudo-ISA for standalone executables.
+
+P1 v2 has two width variants:
+
+- **P1v2-64** — one word is one 64-bit integer or pointer value
+- **P1v2-32** — one word is one 32-bit integer or pointer value
+
+Portable source may use any number of word arguments. The first four argument
+registers are explicit, and additional argument words are passed through a
+portable incoming stack-argument area.
+
+Portable source may directly return `0..1` word. Wider results use the
+portable indirect-result convention described below.
+
+## Toolchain envelope
+
+P1 v2 must be assemblable through the existing `M0` + `hex2` path, with
+`catm` as the only composition primitive between source or generated fragments.
+The spec therefore assumes only the following toolchain features:
+
+- `M0`-level `DEFINE name hex_bytes` substitution
+- raw byte emission
+- labels and label references supported by `hex2`
+- file concatenation via `catm`
+
+## Source notation
+
+This document describes instructions using ordinary assembly notation such as
+`ADD rd, ra, rb`, `LD rd, [ra + off]`, or `CALL`.
+
+Because of the toolchain constraints above, portable source does not encode
+most operands as textual instruction arguments. Instead, register choices,
+inline immediate values, and small fixed parameters are fused into opcode
+names, following the generated-table style used by `src/p1_gen.py`.
+
+So the notation in this document is descriptive rather than literal:
+
+- `ADD rd, ra, rb` means a family of fused register-specific opcodes
+- `ADDI rd, ra, imm` means a family of fused register-and-immediate-specific
+  opcodes
+- `ENTER size` means a family of fused byte-count-specific opcodes
+- `LDARG rd, idx` means a family of fused register-and-argument-slot-specific
+  opcodes
+- `BR rs`, `CALLR rs`, and `TAILR rs` mean register-specific control-flow
+  opcodes
+- `LEAVE`, `CALL`, `RET`, `TAIL`, `B`, and `SYSCALL` remain operand-free
+
+Labels still appear in source where the toolchain supports them directly, such
+as `LA rd, %label` and `LA_BR %label`.
+
+## Register Model
+
+### Exposed registers
+
+P1 v2 exposes the following source-level registers:
+
+- `a0`–`a3` — argument registers. Also caller-saved general registers.
+- `t0` — caller-saved temporary.
+- `s0`–`s1` — callee-saved general registers.
+- `sp` — stack pointer.
+
+### Hidden registers
+
+The backend may reserve additional native registers that are never visible in
+P1 source:
+
+- `br` — branch / call target mechanism
+- backend-local scratch used entirely within one instruction expansion
+
+No hidden register may carry a live P1 value across an instruction boundary.
+`br` need not be a native register on every target; it may be implemented by
+another backend-private mechanism.
+
+## Calling Convention
+
+### Arguments and return values
+
+- In the direct-result convention, explicit argument words 0-3 live in
+  `a0-a3`.
+- Additional explicit argument words live in the incoming stack-argument area
+  and are read with `LDARG`.
+- In the direct-result convention, a one-word return value lives in `a0`.
+
+P1 v2 also defines an indirect-result convention for wider returns:
+
+- The caller passes a writable result buffer pointer in `a0`.
+- Explicit argument words 0-2 then live in `a1-a3`.
+- Additional explicit argument words still live in the incoming
+  stack-argument area.
+- On return, `a0` holds the same result buffer pointer value.
+
+In the direct-result convention, incoming stack-argument slot `0` therefore
+corresponds to explicit argument word `4`. In the indirect-result convention,
+incoming stack-argument slot `0` corresponds to explicit argument word `3`.
+
+The indirect-result convention is the portable way to return aggregates or any
+result wider than one word.
+
+### Register preservation
+
+Caller-saved:
+
+- `a0`–`a3`
+- `t0`
+
+Callee-saved:
+
+- `s0`–`s1`
+- `sp`
+
+### Call semantics
+
+A call is valid from any function, including a leaf. Call / return correctness
+does not depend on establishing a frame first.
+
+If a function needs any incoming argument after making a call, it must save it
+before the call. This matters in particular for argument 0, since `a0` is also
+the return-value register.
+
+A call that passes any stack argument words requires the caller to have an
+active standard frame with enough frame-local storage to stage those outgoing
+words.
+
+The return address is hidden machine state. Portable source must not assume
+that it lives in any exposed register.
+
+## Stack Convention
+
+### Call-boundary rule
+
+At every call boundary, the backend must satisfy the native C ABI stack
+alignment rule for the target architecture.
+
+Portable source must therefore treat raw function-entry `sp` as opaque. It may
+not assume that the low bits of `sp` have the same meaning on all targets
+before a frame is established.
+
+### Incoming stack-argument area
+
+P1 v2 defines an abstract incoming stack-argument area for explicit argument
+words that do not fit in registers.
+
+- Slot `0` is the first stack-passed explicit argument word.
+- Slots are word-indexed, not byte-indexed.
+- Portable source may access this area only through `LDARG`.
+
+`LDARG` is valid only when the current function has an active standard frame.
+Therefore, a function that needs any incoming stack argument must establish a
+standard frame before its first `LDARG`.
+
+Portable source must not assume any direct relationship between incoming
+argument slots and raw function-entry `sp`. In particular, source must not try
+to reconstruct stack arguments by manually indexing from `sp`; backend entry
+layouts differ across targets.
+
+For a call with `m` stack-passed explicit argument words, the caller stages
+those words in the first `m` words of its frame-local storage immediately
+before the call:
+
+```
+[sp + 2*WORD + 0*WORD] = outgoing arg word 0
+[sp + 2*WORD + 1*WORD] = outgoing arg word 1
+...
+```
+
+At callee entry, those staged words become incoming argument slots `0..m-1`.
+The backend is responsible for mapping between the caller's frame layout and
+the callee's abstract incoming argument slots.
+
+Portable code that needs both ordinary locals and stack-passed outgoing
+arguments must reserve enough total frame-local storage and keep the low-
+addressed prefix available for outgoing argument staging across the call.
+
+### Standard frame layout
+
+Functions that need local stack storage use a standard frame layout. After
+frame establishment:
+
+```
+[sp + 0*WORD] = saved return address
+[sp + 1*WORD] = saved caller stack pointer
+[sp + 2*WORD ... sp + 2*WORD + local_bytes - 1] = frame-local storage
+...
+```
+
+Frame-local storage is byte-addressed. Portable code may use it for ordinary
+locals, spilled callee-saved registers, and the caller-staged outgoing
+stack-argument words described above.
+
+Total frame size is:
+
+`round_up(STACK_ALIGN, 2*WORD_SIZE + local_bytes)`
+
+Where:
+
+- `WORD_SIZE = 8` in P1v2-64
+- `WORD_SIZE = 4` in P1v2-32
+- `STACK_ALIGN` is target-defined and must satisfy the native call ABI
+
+Leaf functions that need no frame-local storage may omit the frame entirely.
+
+### Frame invariants
+
+- A function that allocates a frame must restore `sp` before returning.
+- Callee-saved registers modified by the function must be restored before
+  returning.
+- The standard frame layout is the only frame shape recognized by P1 v2.
+
+## Op Set Summary
+
+| Category | Operations |
+|----------|------------|
+| Materialization | `LI rd, imm`, `LA rd, %label`, `LA_BR %label` |
+| Moves | `MOV rd, rs`, `MOV rd, sp` |
+| Arithmetic | `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`, `MUL`, `DIV`, `REM` |
+| Immediate arithmetic | `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI` |
+| Memory | `LD`, `ST`, `LB`, `SB` |
+| ABI access | `LDARG` |
+| Branching | `B`, `BR`, `BEQ`, `BNE`, `BLT`, `BEQZ`, `BNEZ`, `BLTZ` |
+| Calls / returns | `CALL`, `CALLR`, `RET`, `TAIL`, `TAILR` |
+| Frame management | `ENTER`, `LEAVE` |
+| System | `SYSCALL` |
+
+## Immediates
+
+Immediate operands appear only in instructions that explicitly admit them.
+Portable source has three immediate classes:
+
+- **Inline integer immediate** — a signed 12-bit assembly-time constant in the
+  range `-2048..2047`
+- **Materialized word value** — a full one-word assembly-time constant loaded
+  with `LI`
+- **Materialized address** — the address of a label loaded with `LA`
+
+P1 v2 also uses two structured assembly-time operands:
+
+- **Frame-local byte count** — a non-negative byte count used by `ENTER`
+- **Argument-slot index** — a non-negative word-slot index used by `LDARG`
+
+`LI rd, imm` loads the one-word integer value `imm`.
+
+`LA rd, %label` loads the address of `%label` as a one-word pointer value.
+
+The backend may realize `LI` and `LA` using native immediates, literal pools,
+multi-instruction sequences, or other backend-private mechanisms.
+
+## Control Flow
+
+### Call / Return / Tail Call
+
+Control-flow targets are materialized with `LA_BR %label`, which loads
+`%label` into the hidden branch-target mechanism `br`. The immediately
+following control-flow op consumes that target.
+
+`CALL` transfers control to the target most recently loaded by `LA_BR` and
+establishes a return continuation such that a subsequent `RET` returns to the
+instruction after the `CALL`. `CALL` is valid whether or not the caller has
+established a standard frame, except that any call using stack-passed argument
+words requires an active standard frame to hold the staged outgoing words.
+
+`CALLR rs` is the register-indirect form of `CALL`. It transfers control to
+the code pointer value held in `rs` and establishes the same return
+continuation semantics as `CALL`.
+
+`RET` returns through the current return continuation. `RET` is valid whether
+or not the current function has established a standard frame, provided any
+frame established by the function has already been torn down.
+
+`TAIL` is a tail call to the target most recently loaded by `LA_BR`. It is
+valid only when the current function has an active standard frame. `TAIL`
+performs the standard epilogue for the current frame and then transfers control
+to the loaded target without creating a new return continuation. The callee
+therefore returns directly to the current function's caller.
+
+`TAILR rs` is the register-indirect form of `TAIL`. It is valid only when the
+current function has an active standard frame.
+
+Because stack-passed outgoing argument words are staged in the caller's own
+frame-local storage, `TAIL` and `TAILR` are portable only when the tail-called
+callee requires no stack-passed argument words. Portable compilers must lower
+other tail-call cases to an ordinary `CALL` / `RET` sequence.
+
+Portable source must treat the return continuation as hidden machine state. It
+must not assume that the return address lives in any exposed register or stack
+location except as defined by the standard frame layout after frame
+establishment.
+
+### Prologue / Epilogue
+
+P1 v2 defines the following frame-establishment and frame-teardown operations:
+
+- `ENTER size`
+- `LEAVE`
+
+`ENTER size` establishes the standard frame layout with `size` bytes of
+frame-local storage:
+
+```
+[sp + 0*WORD] = saved return address
+[sp + 1*WORD] = saved caller stack pointer
+[sp + 2*WORD ... sp + 2*WORD + size - 1] = frame-local storage
+```
+
+The total allocation size is:
+
+`round_up(STACK_ALIGN, 2*WORD_SIZE + size)`
+
+The named frame-local bytes are the usable local storage. Any additional bytes
+introduced by alignment rounding are padding, not extra local bytes.
+
+`LEAVE` tears down the current standard frame and restores the hidden return
+continuation so that a subsequent `RET` returns correctly.
+
+Because every standard frame stores the saved caller stack pointer at
+`[sp + 1*WORD]`, `LEAVE` does not need to know the frame-local byte count used
+by the corresponding `ENTER`.
+
+A function may omit `ENTER` / `LEAVE` entirely if it is a leaf and needs no
+standard frame.
+
+`ENTER` and `LEAVE` do not implicitly save or restore `s0` or `s1`. A
+function that modifies `s0` or `s1` must preserve them explicitly, typically by
+storing them in frame-local storage within its standard frame.
+
+### Branching
+
+P1 v2 branch targets are carried through the hidden branch-target mechanism
+`br`. Portable source may load `br` only through:
+
+- `LA_BR %label` — materialize the address of `%label` as the next branch, call,
+  or tail-call target
+
+No branch, call, or tail opcode takes a label operand directly. Portable source
+must treat `br` as owned by the control-flow machinery. No live value may be
+carried in `br`. Each `LA_BR` must be consumed by the immediately following
+branch, call, or tail op, and portable source must not rely on `br` surviving
+across any other instruction.
+
+The portable branch families are:
+
+- `B` — unconditional branch to the target in `br`
+- `BR rs` — unconditional branch to the code pointer in `rs`
+- `BEQ`, `BNE`, `BLT` — conditional branch to the target in `br`
+- `BEQZ`, `BNEZ`, `BLTZ` — conditional branch to the target in `br` using zero
+  as the second operand
+
+`BLT` and `BLTZ` perform signed comparisons on one-word values.
+
+If a branch condition is true, control transfers to the target currently held in
+`br`. If the condition is false, execution falls through to the next
+instruction.
+
+## Data Ops
+
+### Arithmetic
+
+P1 v2 defines the following arithmetic and bitwise operations on one-word
+values:
+
+- register-register: `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`,
+  `MUL`, `DIV`, `REM`
+- immediate: `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI`
+
+For `ADD`, `SUB`, `MUL`, `AND`, `OR`, and `XOR`, computation is modulo the
+active word size.
+
+`SHL` shifts left and discards high bits. `SHR` is a logical right shift and
+zero-fills. `SAR` is an arithmetic right shift and sign-fills.
+
+For register-count shifts, only the low `5` bits of the shift count are
+observed in `P1v2-32`, and only the low `6` bits are observed in `P1v2-64`.
+
+Immediate-form shifts use inline immediates in the range `0..31` in `P1v2-32`
+and `0..63` in `P1v2-64`.
+
+`DIV` is signed division on one-word two's-complement values and truncates
+toward zero. `REM` is the corresponding signed remainder.
+
+Division by zero is outside the portable contract. The overflow case
+`MIN_INT / -1` is also outside the portable contract, as is the corresponding
+remainder case.
+
+### Moves
+
+P1 v2 defines the following move and materialization operations:
+
+- `MOV` — register-to-register copy
+- `LI` — load one-word integer constant
+- `LA` — load label address
+
+`MOV` may copy from any exposed general register to any exposed general
+register.
+
+Portable source may also read the current stack pointer through `MOV rd, sp`.
+
+Portable source may not write `sp` through `MOV`. Stack-pointer updates are only
+performed by `ENTER`, `LEAVE`, and backend-private call/return machinery.
+
+`LI` materializes an integer bit-pattern. `LA` materializes the address of a
+label. `LA_BR` is a separate control-flow-target materialization form and is not
+part of the general move family.
+
+### Memory
+
+P1 v2 defines the following memory-access operations:
+
+- `LD`, `ST` — one-word load and store
+- `LB`, `SB` — byte load and store
+- `LDARG` — one-word load from the incoming stack-argument area
+
+`LD` and `ST` access one full word: 4 bytes in `P1v2-32` and 8 bytes in
+`P1v2-64`.
+
+`LB` loads one byte and zero-extends it to a full word. `SB` stores the low
+8 bits of the source value.
+
+Memory offsets use signed 12-bit inline immediates.
+
+The base address for a memory access may be any exposed general register or
+`sp`.
+
+`LDARG rd, idx` loads incoming stack-argument slot `idx`, where slot `0` is the
+first stack-passed explicit argument word. `idx` is word-indexed, not
+byte-indexed. `LDARG` is an ABI access, not a general memory operation; it does
+not expose or imply any raw `sp`-relative layout at function entry.
+
+`LDARG` is valid only when the current function has an active standard frame.
+
+Portable source must not assume that labels are aligned beyond what is
+explicitly established by the program itself. Portable code should use
+naturally aligned addresses for `LD` and `ST`. Unaligned word accesses are
+outside the portable contract. Byte accesses have no additional alignment
+requirement.
+
+## System
+
+`SYSCALL` is part of the portable ISA surface.
+
+At the portable level, the syscall convention is:
+
+- `a0` = syscall number on entry, return value on exit
+- `a1`, `a2`, `a3`, `t0`, `s0`, `s1` = syscall arguments 0 through 5
+
+At the portable level, `SYSCALL` clobbers only `a0`. All other exposed
+registers are preserved across the syscall.
+
+The mapping from symbolic syscall names to numeric syscall identifiers is
+target-defined. The set of syscalls available to a given program is likewise
+specified outside the core P1 v2 ISA, for example by a target profile or
+runtime interface document.
+
+## Target notes
+
+- `a0` is both argument 0 and the return-value register in the portable
+  calling convention in the direct-result case, and the indirect-result buffer
+  pointer in the indirect-result case.
+- On aarch64, riscv64, arm32, and rv32, that matches the native integer/pointer
+  ABI directly.
+- On amd64, the backend must translate between portable `a0` and native
+  return register `rax` at call and return boundaries.
+- On i386, the backend must translate between portable argument registers and
+  the native stack-argument ABI at call boundaries.
+- On amd64 and i386, `LDARG` must account for the return address pushed by the
+  native `call` instruction. On aarch64, riscv64, arm32, and rv32, it maps more
+  directly to the entry `sp` plus the backend's standard frame/header policy.
+- On amd64, aarch64, riscv64, arm32, and rv32, `br` may be implemented as a
+  dedicated hidden native register.
+- On i386, `br` is expected to be a backend-private stack convention, not a
+  dedicated hidden register.
+- Frame-pointer use is backend policy, not part of the P1 v2 architectural
+  register set.
+
+### Native register mapping
+
+#### 64-bit targets
+
+| P1   | amd64 | aarch64 | riscv64 |
+|------|-------|---------|---------|
+| `a0` | `rdi` | `x0`    | `a0`    |
+| `a1` | `rsi` | `x1`    | `a1`    |
+| `a2` | `rdx` | `x2`    | `a2`    |
+| `a3` | `rcx` | `x3`    | `a3`    |
+| `t0` | `r10` | `x9`    | `t0`    |
+| `s0` | `rbx` | `x19`   | `s1`    |
+| `s1` | `r12` | `x20`   | `s2`    |
+| `sp` | `rsp` | `sp`    | `sp`    |
+
+#### 32-bit targets
+
+| P1   | arm32 | i386  | rv32  |
+|------|-------|-------|-------|
+| `a0` | `r0`  | `eax` | `a0`  |
+| `a1` | `r1`  | `ecx` | `a1`  |
+| `a2` | `r2`  | `edx` | `a2`  |
+| `a3` | `r3`  | `ebx` | `a3`  |
+| `t0` | `r12` | `esi` | `t0`  |
+| `s0` | `r4`  | `edi` | `s1`  |
+| `s1` | `r5`  | `ebp` | `s2`  |
+| `sp` | `sp`  | `esp` | `sp`  |
diff --git a/p1/aarch64.py b/p1/aarch64.py
@@ -0,0 +1,354 @@
+from common import (
+    AddI,
+    ArchDef,
+    BranchReg,
+    CondB,
+    CondBZ,
+    Enter,
+    La,
+    LaBr,
+    LdArg,
+    Li,
+    LogI,
+    Mem,
+    Mov,
+    Nullary,
+    Rrr,
+    ShiftI,
+    le32,
+    register_arch,
+    round_up,
+)
+
+
+NAT = {
+    'a0': 0,
+    'a1': 1,
+    'a2': 2,
+    'a3': 3,
+    'x4': 4,
+    'x5': 5,
+    't0': 9,
+    's0': 19,
+    's1': 20,
+    'sp': 31,
+    'xzr': 31,
+    'lr': 30,
+    'br': 17,
+    'scratch': 16,
+    'x8': 8,
+    'save0': 21,
+    'save1': 22,
+    'save2': 23,
+}
+
+
+RRR_BASE = {
+    'ADD': 0x8B000000,
+    'SUB': 0xCB000000,
+    'AND': 0x8A000000,
+    'OR': 0xAA000000,
+    'XOR': 0xCA000000,
+    'SHL': 0x9AC02000,
+    'SHR': 0x9AC02400,
+    'SAR': 0x9AC02800,
+    'DIV': 0x9AC00C00,
+}
+
+
+SYSCALL_NUMBERS = {
+    'SYS_READ': 63,
+    'SYS_WRITE': 64,
+    'SYS_CLOSE': 57,
+    'SYS_OPENAT': 56,
+    'SYS_EXIT': 93,
+    'SYS_CLONE': 220,
+    'SYS_EXECVE': 221,
+    'SYS_WAITID': 95,
+}
+
+
+def aa_rrr(base, rd, ra, rb):
+    d = NAT[rd]
+    a = NAT[ra]
+    b = NAT[rb]
+    return le32(base | (b << 16) | (a << 5) | d)
+
+
+def aa_add_imm(rd, ra, imm12, sub=False):
+    d = NAT[rd]
+    a = NAT[ra]
+    base = 0xD1000000 if sub else 0x91000000
+    return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
+
+
+def aa_mov_rr(dst, src):
+    if dst == 'sp':
+        return aa_add_imm('sp', src, 0, sub=False)
+    if src == 'sp':
+        return aa_add_imm(dst, 'sp', 0, sub=False)
+    d = NAT[dst]
+    s = NAT[src]
+    return le32(0xAA000000 | (s << 16) | (31 << 5) | d)
+
+
+def aa_ubfm(rd, ra, immr, imms):
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_sbfm(rd, ra, immr, imms):
+    d = NAT[rd]
+    a = NAT[ra]
+    return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_movz(rd, imm16):
+    d = NAT[rd]
+    return le32(0xD2800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_movn(rd, imm16):
+    d = NAT[rd]
+    return le32(0x92800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_materialize_small_imm(rd, imm):
+    if imm >= 0:
+        return aa_movz(rd, imm)
+    return aa_movn(rd, (~imm) & 0xFFFF)
+
+
+def aa_ldst_uimm12(base, rt, rn, off_bytes, size_log2):
+    imm12 = off_bytes >> size_log2
+    t = NAT[rt]
+    n = NAT[rn]
+    return le32(base | (imm12 << 10) | (n << 5) | t)
+
+
+def aa_ldst_unscaled(base, rt, rn, off):
+    imm9 = off & 0x1FF
+    t = NAT[rt]
+    n = NAT[rn]
+    return le32(base | (imm9 << 12) | (n << 5) | t)
+
+
+def aa_mem(op, rt, rn, off):
+    bases = {
+        'LD': (0xF9400000, 3, 0xF8400000),
+        'ST': (0xF9000000, 3, 0xF8000000),
+        'LB': (0x39400000, 0, 0x38400000),
+        'SB': (0x39000000, 0, 0x38000000),
+    }
+    uimm_base, size_log2, unscaled_base = bases[op]
+    scale = 1 << size_log2
+    if off >= 0 and off % scale == 0 and off < (4096 << size_log2):
+        return aa_ldst_uimm12(uimm_base, rt, rn, off, size_log2)
+    if -256 <= off <= 255:
+        return aa_ldst_unscaled(unscaled_base, rt, rn, off)
+    if -2048 <= off <= 2047:
+        if off >= 0:
+            addr = aa_add_imm('scratch', rn, off, sub=False)
+        else:
+            addr = aa_add_imm('scratch', rn, -off, sub=True)
+        return addr + aa_ldst_uimm12(uimm_base, rt, 'scratch', 0, size_log2)
+    raise ValueError(f'aarch64 offset out of range for {op}: {off}')
+
+
+def aa_cmp_skip(op, ra, rb):
+    a = NAT[ra]
+    b = NAT[rb]
+    cmp_hex = le32(0xEB000000 | (b << 16) | (a << 5) | 31)
+    skip_cond = {
+        'BEQ': 1,
+        'BNE': 0,
+        'BLT': 10,
+    }[op]
+    return cmp_hex + le32(0x54000040 | skip_cond)
+
+
+def aa_br(reg):
+    return le32(0xD61F0000 | (NAT[reg] << 5))
+
+
+def aa_blr(reg):
+    return le32(0xD63F0000 | (NAT[reg] << 5))
+
+
+def aa_ret():
+    return le32(0xD65F03C0)
+
+
+def aa_lit64_prefix(rd):
+    d = NAT[rd]
+    ldr_lit = 0x58000040 | d
+    b_plus8 = 0x14000002
+    return le32(ldr_lit) + le32(b_plus8)
+
+
+def encode_li(_arch, row):
+    return aa_lit64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+    return aa_lit64_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+    return aa_lit64_prefix('br')
+
+
+def encode_mov(_arch, row):
+    return aa_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+    if row.op == 'MUL':
+        d = NAT[row.rd]
+        a = NAT[row.ra]
+        b = NAT[row.rb]
+        return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+    if row.op == 'REM':
+        d = NAT[row.rd]
+        a = NAT[row.ra]
+        b = NAT[row.rb]
+        sc = NAT['scratch']
+        sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | sc
+        msub = 0x9B008000 | (b << 16) | (a << 10) | (sc << 5) | d
+        return le32(sdiv) + le32(msub)
+    return aa_rrr(RRR_BASE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+    if row.imm >= 0:
+        return aa_add_imm(row.rd, row.ra, row.imm, sub=False)
+    return aa_add_imm(row.rd, row.ra, -row.imm, sub=True)
+
+
+def encode_logi(_arch, row):
+    seq = aa_materialize_small_imm('scratch', row.imm)
+    base = {
+        'ANDI': 0x8A000000,
+        'ORI': 0xAA000000,
+    }[row.op]
+    return seq + aa_rrr(base, row.rd, row.ra, 'scratch')
+
+
+def encode_shifti(_arch, row):
+    if row.op == 'SHLI':
+        return aa_ubfm(row.rd, row.ra, (-row.imm) & 63, 63 - row.imm)
+    if row.op == 'SHRI':
+        return aa_ubfm(row.rd, row.ra, row.imm, 63)
+    return aa_sbfm(row.rd, row.ra, row.imm, 63)
+
+
+def encode_mem(_arch, row):
+    return aa_mem(row.op, row.rt, row.rn, row.off)
+
+
+def encode_ldarg(_arch, row):
+    return aa_mem('LD', 'scratch', 'sp', 8) + aa_mem('LD', row.rd, 'scratch', 16 + 8 * row.slot)
+
+
+def encode_branch_reg(_arch, row):
+    if row.kind == 'BR':
+        return aa_br(row.rs)
+    if row.kind == 'CALLR':
+        return aa_blr(row.rs)
+    if row.kind == 'TAILR':
+        leave = encode_nullary(_arch, Nullary('LEAVE', 'LEAVE'))
+        return leave + aa_br(row.rs)
+    raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+def encode_condb(_arch, row):
+    return aa_cmp_skip(row.op, row.ra, row.rb) + aa_br('br')
+
+
+def encode_condbz(_arch, row):
+    a = NAT[row.ra]
+    br_hex = aa_br('br')
+    if row.op == 'BEQZ':
+        return le32(0xB5000000 | (2 << 5) | a) + br_hex
+    if row.op == 'BNEZ':
+        return le32(0xB4000000 | (2 << 5) | a) + br_hex
+    cmp_zero = le32(0xEB1F001F | (a << 5))
+    bge = le32(0x54000040 | 10)
+    return cmp_zero + bge + br_hex
+
+
+def encode_enter(arch, row):
+    frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+    return (
+        aa_add_imm('sp', 'sp', frame_bytes, sub=True)
+        + aa_mem('ST', 'lr', 'sp', 0)
+        + aa_add_imm('x8', 'sp', frame_bytes, sub=False)
+        + aa_mem('ST', 'x8', 'sp', 8)
+    )
+
+
+def encode_nullary(_arch, row):
+    if row.kind == 'B':
+        return aa_br('br')
+    if row.kind == 'CALL':
+        return aa_blr('br')
+    if row.kind == 'RET':
+        return aa_ret()
+    if row.kind == 'LEAVE':
+        return (
+            aa_mem('LD', 'lr', 'sp', 0)
+            + aa_mem('LD', 'x8', 'sp', 8)
+            + aa_mov_rr('sp', 'x8')
+        )
+    if row.kind == 'TAIL':
+        leave = encode_nullary(_arch, Nullary('LEAVE', 'LEAVE'))
+        return leave + aa_br('br')
+    if row.kind == 'SYSCALL':
+        return ''.join([
+            aa_mov_rr('x8', 'a0'),
+            aa_mov_rr('save0', 'a1'),
+            aa_mov_rr('save1', 'a2'),
+            aa_mov_rr('save2', 'a3'),
+            aa_mov_rr('a0', 'save0'),
+            aa_mov_rr('a1', 'save1'),
+            aa_mov_rr('a2', 'save2'),
+            aa_mov_rr('a3', 't0'),
+            aa_mov_rr('x4', 's0'),
+            aa_mov_rr('x5', 's1'),
+            le32(0xD4000001),
+            aa_mov_rr('a1', 'save0'),
+            aa_mov_rr('a2', 'save1'),
+            aa_mov_rr('a3', 'save2'),
+        ])
+    raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+ENCODERS = {
+    Li: encode_li,
+    La: encode_la,
+    LaBr: encode_labr,
+    Mov: encode_mov,
+    Rrr: encode_rrr,
+    AddI: encode_addi,
+    LogI: encode_logi,
+    ShiftI: encode_shifti,
+    Mem: encode_mem,
+    LdArg: encode_ldarg,
+    Nullary: encode_nullary,
+    BranchReg: encode_branch_reg,
+    CondB: encode_condb,
+    CondBZ: encode_condbz,
+    Enter: encode_enter,
+}
+
+
+register_arch(
+    ArchDef(
+        name='aarch64',
+        word_bytes=8,
+        stack_align=16,
+        syscall_numbers=SYSCALL_NUMBERS,
+        encoders=ENCODERS,
+    )
+)
diff --git a/p1/common.py b/p1/common.py
@@ -0,0 +1,66 @@
+from collections import namedtuple
+
+
+ArchDef = namedtuple(
+    'ArchDef',
+    'name word_bytes stack_align syscall_numbers encoders',
+)
+
+Banner = namedtuple('Banner', 'text')
+Literal = namedtuple('Literal', 'name hex_by_arch')
+Nullary = namedtuple('Nullary', 'name kind')
+Li = namedtuple('Li', 'name rd')
+La = namedtuple('La', 'name rd')
+LaBr = namedtuple('LaBr', 'name')
+Mov = namedtuple('Mov', 'name rd rs')
+Rrr = namedtuple('Rrr', 'name op rd ra rb')
+AddI = namedtuple('AddI', 'name rd ra imm')
+LogI = namedtuple('LogI', 'name op rd ra imm')
+ShiftI = namedtuple('ShiftI', 'name op rd ra imm')
+Mem = namedtuple('Mem', 'name op rt rn off')
+LdArg = namedtuple('LdArg', 'name rd slot')
+BranchReg = namedtuple('BranchReg', 'name kind rs')
+CondB = namedtuple('CondB', 'name op ra rb')
+CondBZ = namedtuple('CondBZ', 'name op ra')
+Enter = namedtuple('Enter', 'name size')
+
+
+ARCH_REGISTRY = {}
+
+
+def register_arch(arch):
+    if arch.name in ARCH_REGISTRY:
+        raise RuntimeError(f'duplicate arch registration: {arch.name}')
+    ARCH_REGISTRY[arch.name] = arch
+
+
+def get_arch(name):
+    return ARCH_REGISTRY[name]
+
+
+def registered_arches():
+    return tuple(sorted(ARCH_REGISTRY))
+
+
+def byte(n):
+    return f'{n & 0xFF:02X}'
+
+
+def le32(n):
+    return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+
+
+def le64(n):
+    return (n & 0xFFFFFFFFFFFFFFFF).to_bytes(8, 'little').hex().upper()
+
+
+def word_hex(word_bytes, n):
+    if word_bytes == 4:
+        return le32(n)
+    if word_bytes == 8:
+        return le64(n)
+    raise ValueError(f'unsupported word size: {word_bytes}')
+
+
+def round_up(align, n):
+    return ((n + align - 1) // align) * align
diff --git a/p1/llvm_disasm_aarch64.py b/p1/llvm_disasm_aarch64.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""Annotate p1_aarch64.M1 DEFINE rows with llvm-mc disassembly.
+
+Reads generated DEFINE lines from p1_aarch64.M1, disassembles code-bearing rows
+with llvm-mc, and prints the DEFINE name beside the native aarch64 mnemonic
+sequence. Literal data rows such as syscall-number constants are labeled as data
+instead of being treated as instructions.
+"""
+
+import argparse
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+
+DEFINE_RE = re.compile(r'^DEFINE\s+(\S+)\s+([0-9A-Fa-f]+)\s*$')
+
+
+def repo_root():
+    return Path(__file__).resolve().parent.parent
+
+
+def default_input_path():
+    return repo_root() / 'build' / 'p1v2' / 'aarch64' / 'p1_aarch64.M1'
+
+
+def ensure_generated(path: Path):
+    if path.exists():
+        return
+    gen = repo_root() / 'p1' / 'p1_gen.py'
+    proc = subprocess.run(
+        [sys.executable, str(gen), '--arch', 'aarch64', str(path.parent.parent)],
+        check=True,
+        cwd=repo_root(),
+        capture_output=True,
+        text=True,
+    )
+    if proc.stderr:
+        sys.stderr.write(proc.stderr)
+
+
+def parse_rows(path: Path):
+    rows = []
+    for line in path.read_text().splitlines():
+        match = DEFINE_RE.match(line)
+        if not match:
+            continue
+        name, hex_bytes = match.groups()
+        rows.append((name, hex_bytes.upper()))
+    return rows
+
+
+def is_data_row(name: str):
+    return name.startswith('sys_')
+
+
+def disassemble_code_rows(rows, llvm_mc):
+    code_rows = [(name, hex_bytes) for name, hex_bytes in rows if not is_data_row(name)]
+    if not code_rows:
+        return {}
+
+    payload = '\n'.join(hex_bytes for _, hex_bytes in code_rows) + '\n'
+    proc = subprocess.run(
+        [llvm_mc, '--disassemble', '--hex', '--arch=aarch64'],
+        input=payload,
+        text=True,
+        capture_output=True,
+        check=True,
+    )
+    inst_lines = [line.strip() for line in proc.stdout.splitlines() if line.strip()]
+
+    out = {}
+    index = 0
+    for name, hex_bytes in code_rows:
+        words = len(hex_bytes) // 8
+        out[name] = inst_lines[index:index + words]
+        index += words
+
+    if index != len(inst_lines):
+        raise RuntimeError(
+            f'llvm output row split mismatch: consumed {index}, got {len(inst_lines)}'
+        )
+    return out
+
+
+def format_rows(rows, disasm_by_name, show_bytes):
+    name_width = max(len(name) for name, _ in rows) if rows else 0
+    out = []
+    for name, hex_bytes in rows:
+        if is_data_row(name):
+            rhs = f'data 0x{hex_bytes}'
+            out.append(f'{name:<{name_width}}  {rhs}')
+            continue
+
+        insns = disasm_by_name.get(name, [])
+        if not insns:
+            out.append(f'{name:<{name_width}}  <no disassembly>')
+            continue
+
+        prefix = name.ljust(name_width)
+        byte_col = f'  {hex_bytes}' if show_bytes else ''
+        out.append(f'{prefix}{byte_col}  {insns[0]}')
+        for insn in insns[1:]:
+            spacer = ' ' * name_width
+            if show_bytes:
+                spacer += '  ' + ' ' * len(hex_bytes)
+            out.append(f'{spacer}  {insn}')
+    return '\n'.join(out)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'input',
+        nargs='?',
+        default=str(default_input_path()),
+        help='path to p1_aarch64.M1',
+    )
+    parser.add_argument(
+        '--llvm-mc',
+        default=os.environ.get('LLVM_MC', 'llvm-mc'),
+        help='path to llvm-mc',
+    )
+    parser.add_argument(
+        '--grep',
+        default='',
+        help='only include DEFINE names containing this substring',
+    )
+    parser.add_argument(
+        '--limit',
+        type=int,
+        default=0,
+        help='maximum number of DEFINE rows to print (0 = all)',
+    )
+    parser.add_argument(
+        '--show-bytes',
+        action='store_true',
+        help='include raw DEFINE bytes next to the name',
+    )
+    args = parser.parse_args()
+
+    path = Path(args.input)
+    ensure_generated(path)
+    rows = parse_rows(path)
+    if args.grep:
+        rows = [(name, hex_bytes) for name, hex_bytes in rows if args.grep in name]
+    if args.limit:
+        rows = rows[:args.limit]
+
+    disasm_by_name = disassemble_code_rows(rows, args.llvm_mc)
+    print(format_rows(rows, disasm_by_name, args.show_bytes))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/p1/p1_gen.py b/p1/p1_gen.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""Generate P1 v2 DEFINE tables.
+
+This is a fresh generator for docs/P1v2.md. The ISA surface is described by
+plain namedtuple rows, and each backend registers a simple row-type -> encoder
+mapping. The emitted immediate/offset domains are still curated tables rather
+than the full theoretical spec space, so extending coverage is a one-line data
+edit instead of an architecture rewrite.
+
+Usage:
+    python3 p1/p1_gen.py [--arch ARCH] [build-root]
+    python3 p1/p1_gen.py --check [--arch ARCH] [build-root]
+    python3 p1/p1_gen.py --list-archs
+"""
+
+import os
+import sys
+from itertools import product
+
+from common import (
+    AddI,
+    Banner,
+    BranchReg,
+    CondB,
+    CondBZ,
+    Enter,
+    La,
+    LaBr,
+    LdArg,
+    Li,
+    Literal,
+    LogI,
+    Mem,
+    Mov,
+    Nullary,
+    Rrr,
+    ShiftI,
+    get_arch,
+    registered_arches,
+    word_hex,
+)
+
+import aarch64  # noqa: F401 - imported for arch registration side effects
+
+
+P1_GPRS = ('a0', 'a1', 'a2', 'a3', 't0', 's0', 's1')
+P1_BASES = P1_GPRS + ('sp',)
+
+RRR_OPS = ('ADD', 'SUB', 'AND', 'OR', 'XOR', 'SHL', 'SHR', 'SAR', 'MUL', 'DIV', 'REM')
+LOGI_OPS = ('ANDI', 'ORI')
+SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
+MEM_OPS = ('LD', 'ST', 'LB', 'SB')
+CONDB_OPS = ('BEQ', 'BNE', 'BLT')
+CONDBZ_OPS = ('BEQZ', 'BNEZ', 'BLTZ')
+
+ADDI_IMMS = (
+    -2048, -1024, -256, -128, -64, -48, -32, -24, -16, -12, -8, -7, -6,
+    -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16, 24, 32, 40,
+    48, 63, 64, 127, 128, 255, 256, 512, 1024, 2047,
+)
+
+LOGI_IMMS = (
+    -1, 0, 1, 2, 3, 4, 6, 7, 8, 15, 16, 31, 32, 63, 64, 127, 255, 511, 1023,
+    2047,
+)
+
+SHIFT_IMMS = tuple(range(64))
+
+MEM_OFFS = (
+    -256, -128, -64, -48, -32, -24, -16, -8, -1, 0, 1, 7, 8, 15, 16, 24, 32,
+    40, 48, 56, 64, 128, 255,
+)
+
+LDARG_SLOTS = tuple(range(32))
+ENTER_SIZES = tuple(range(0, 129))
+
+
+HEADER = """## p1_{arch}.M1 — GENERATED by p1/p1_gen.py. Do not edit by hand.
+##
+## This table targets the P1 v2 ISA described in docs/P1v2.md.
+## Row shapes are shared; per-arch lowering lives in p1/<arch>.py.
+"""
+
+
+def imm_suffix(imm):
+    return f'NEG{-imm}' if imm < 0 else str(imm)
+
+
+def rows(arch):
+    out = []
+
+    out.append(Banner('Materialization'))
+    for rd in P1_GPRS:
+        out.append(Li(name=f'LI_{rd.upper()}', rd=rd))
+    for rd in P1_GPRS:
+        out.append(La(name=f'LA_{rd.upper()}', rd=rd))
+    out.append(LaBr(name='LA_BR'))
+
+    out.append(Banner('Moves'))
+    for rd, rs in product(P1_GPRS, P1_GPRS):
+        out.append(Mov(name=f'MOV_{rd.upper()}_{rs.upper()}', rd=rd, rs=rs))
+    for rd in P1_GPRS:
+        out.append(Mov(name=f'MOV_{rd.upper()}_SP', rd=rd, rs='sp'))
+
+    out.append(Banner('Register Arithmetic'))
+    for op, rd, ra, rb in product(RRR_OPS, P1_GPRS, P1_GPRS, P1_GPRS):
+        out.append(Rrr(name=f'{op}_{rd.upper()}_{ra.upper()}_{rb.upper()}',
+                       op=op, rd=rd, ra=ra, rb=rb))
+
+    out.append(Banner('Immediate Arithmetic'))
+    for rd, ra, imm in product(P1_GPRS, P1_GPRS, ADDI_IMMS):
+        out.append(AddI(name=f'ADDI_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
+                        rd=rd, ra=ra, imm=imm))
+    for op, rd, ra, imm in product(LOGI_OPS, P1_GPRS, P1_GPRS, LOGI_IMMS):
+        out.append(LogI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
+                        op=op, rd=rd, ra=ra, imm=imm))
+    for op, rd, ra, imm in product(SHIFT_OPS, P1_GPRS, P1_GPRS, SHIFT_IMMS):
+        out.append(ShiftI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm}',
+                          op=op, rd=rd, ra=ra, imm=imm))
+
+    out.append(Banner('Memory'))
+    for op, rt, rn, off in product(MEM_OPS, P1_GPRS, P1_BASES, MEM_OFFS):
+        out.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{imm_suffix(off)}',
+                       op=op, rt=rt, rn=rn, off=off))
+
+    out.append(Banner('ABI Access'))
+    for rd, slot in product(P1_GPRS, LDARG_SLOTS):
+        out.append(LdArg(name=f'LDARG_{rd.upper()}_{slot}', rd=rd, slot=slot))
+
+    out.append(Banner('Branches'))
+    out.append(Nullary(name='B', kind='B'))
+    for rs in P1_GPRS:
+        out.append(BranchReg(name=f'BR_{rs.upper()}', kind='BR', rs=rs))
+    for op, ra, rb in product(CONDB_OPS, P1_GPRS, P1_GPRS):
+        out.append(CondB(name=f'{op}_{ra.upper()}_{rb.upper()}', op=op, ra=ra, rb=rb))
+    for op, ra in product(CONDBZ_OPS, P1_GPRS):
+        out.append(CondBZ(name=f'{op}_{ra.upper()}', op=op, ra=ra))
+
+    out.append(Banner('Calls And Returns'))
+    out.append(Nullary(name='CALL', kind='CALL'))
+    out.append(Nullary(name='RET', kind='RET'))
+    out.append(Nullary(name='TAIL', kind='TAIL'))
+    for rs in P1_GPRS:
+        out.append(BranchReg(name=f'CALLR_{rs.upper()}', kind='CALLR', rs=rs))
+    for rs in P1_GPRS:
+        out.append(BranchReg(name=f'TAILR_{rs.upper()}', kind='TAILR', rs=rs))
+
+    out.append(Banner('Frame Management'))
+    for size in ENTER_SIZES:
+        out.append(Enter(name=f'ENTER_{size}', size=size))
+    out.append(Nullary(name='LEAVE', kind='LEAVE'))
+
+    out.append(Banner('System'))
+    out.append(Nullary(name='SYSCALL', kind='SYSCALL'))
+    for name, number in sorted(arch.syscall_numbers.items()):
+        out.append(Literal(name=name, hex_by_arch={arch.name: word_hex(arch.word_bytes, number)}))
+
+    return out
+
+
+def lower_name(name):
+    low = name.lower()
+    head, sep, rest = low.partition('_')
+    if not sep:
+        return low
+    if '_' not in rest:
+        return low
+    return f'{head}_{rest.replace("_", ",")}'
+
+
+def encode_row(arch, row):
+    if isinstance(row, Literal):
+        return row.hex_by_arch[arch.name]
+    encoder = arch.encoders[type(row)]
+    return encoder(arch, row)
+
+
+def emit(arch_name):
+    arch = get_arch(arch_name)
+    out = [HEADER.format(arch=arch.name).rstrip(), '']
+    seen = set()
+    for row in rows(arch):
+        if isinstance(row, Banner):
+            out.append('')
+            out.append(f'## ---- {row.text}')
+            continue
+        name = lower_name(row.name)
+        if name in seen:
+            raise RuntimeError(f'duplicate DEFINE: {name}')
+        seen.add(name)
+        out.append(f'DEFINE {name} {encode_row(arch, row)}')
+    out.append('')
+    return '\n'.join(out)
+
+
+def parse_args(argv):
+    check = False
+    archs = []
+    positional = []
+    i = 0
+    while i < len(argv):
+        arg = argv[i]
+        if arg == '--check':
+            check = True
+        elif arg == '--list-archs':
+            print('\n'.join(registered_arches()))
+            sys.exit(0)
+        elif arg == '--arch':
+            i += 1
+            if i >= len(argv):
+                raise SystemExit('--arch requires a value')
+            archs.append(argv[i])
+        else:
+            positional.append(arg)
+        i += 1
+    build_root = positional[0] if positional else os.path.join('build', 'p1v2')
+    if not archs:
+        archs = list(registered_arches())
+    return check, archs, build_root
+
+
+def main(argv=None):
+    check, archs, build_root = parse_args(argv or sys.argv[1:])
+    had_diff = False
+
+    for arch_name in archs:
+        arch = get_arch(arch_name)
+        dest_dir = os.path.join(build_root, arch.name)
+        path = os.path.join(dest_dir, f'p1_{arch.name}.M1')
+        content = emit(arch.name)
+        if check:
+            try:
+                with open(path) as f:
+                    existing = f.read()
+            except FileNotFoundError:
+                existing = ''
+            if existing != content:
+                sys.stderr.write(f'DIFF: {path}\n')
+                had_diff = True
+            continue
+        os.makedirs(dest_dir, exist_ok=True)
+        with open(path, 'w') as f:
+            f.write(content)
+        print(f'wrote {path} ({len(content)} bytes)')
+
+    if check and had_diff:
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/post.md b/post.md
@@ -0,0 +1,247 @@
+# Title
+
+Your compiler compiles your code, but what compiled the compiler? And the one
+before that?
+
+Ken Thompson freaked everyone out in 1983 by describing a nasty
+self-replicating attack on compilers themselves that reminded folks that it
+might be a good idea to have a "full bootstrap" from as small a binary seed as
+could be managed and then audited source from there forward to the toolchains
+we know and (maybe-kinda) love today.
+
+Jeremiah Orians and Jan Nieuwenhuizen delivered exactly that to the world in
+2023, going from `hex0-seed`, a few hundred byte seed binary that is the "hex0"
+compiler, through Fabrice Bellard's Tiny C Compiler (TCC), and then up through
+GCC and the rest of the stack. Truly awesome work.
+
+Between "hex0" and TCC, there's a minimal assembler and linker in M0 and hex2,
+then a "subset C" compiler that builds out the Mes Scheme ecosystem, then
+a Mes C compiler that compiles TCC.
+
+From seed to M0 and hex2 is ~2KLOC. The "middle" stage is ~25KLOC to get to
+Mes Scheme and a collection of build and packaging tools. And the "late" stage
+of the Mes C compiler and TCC sources are another ~25KLOC.
+
+I find M0 and hex2 to be very cute and everything after that to be rather
+goopy. So, let's dive into the cuteness!
+
+Getting there is fairly straightforward:
+
+```
+hex0-seed = given
+
+hex0 = hex0-seed(hex0.hex0)  # hex0 compiles itself from source
+hex1 = hex0(hex1.hex0)
+hex2 = hex1(hex2.hex1)
+catm = hex2(catm.hex2)
+M0 = hex2(catm(ELF.hex2, M0.hex2))
+```
+
+**hex0**: the minimal seed hex compiler; it turns pairs of hexadecimal digits
+into raw bytes. Supports # and ; line comments.
+
+**hex1**: a small extension of hex0 that adds the first notion of symbolic
+control flow. Features everything in hex0, single-character labels, one size
+of relative jump/offset encoding.
+
+**hex2**: the final "hex language" and first real linker-like stage in the
+chain. Features everything in hex1, labels of arbitrary length, relative
+references of multiple sizes (!, @, %), absolute references ($, &), enough
+symbol/address resolution to link later stages.
+
+**catm**: a minimal file concatenation utility, take N files and output 1.
+
+**M0**: the first assembler built on top of hex2. Features DEFINE substitution
+of names to hex bytes, immediate values in various sizes/formats, symbolic
+assembly over hex2.
+
+M0 programs are architecture-specific because the DEFINEd set of mnemonics and
+their byte-level encodings are architecture-specific. So that means that the
+subset C compiler, defined in M0, has to be written once per architecture.
+That's a drag.
+
+I thought it'd be rather fun to play around at this low level and see if we
+could build off M0 in a way that might get us to a functioning C compiler
+through a path that cuts down on the amount of code in that "middle" layer.
+
+So I present to you "P1", a portable pseudo-ISA targetable via M1 DEFINEs, and
+"M1M", a macro expander for M1, written in P1.
+
+P1
+
+Registers:
+
+- `a0`–`a3` — argument registers. Also caller-saved general registers.
+- `t0` — caller-saved temporary.
+- `s0`–`s1` — callee-saved general registers.
+- `sp` — stack pointer.
+
+Ops:
+- Materialization: `LI`, `LA`, `LA_BR`
+- Moves: `MOV`
+- Arithmetic: `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`, `MUL`,
+  `DIV`, `REM`
+- Immediate arithmetic: `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI`
+- Memory: `LD`, `ST`, `LB`, `SB`
+- Branching: `B`, `BR`, `BEQ`, `BNE`, `BLT`, `BEQZ`, `BNEZ`, `BLTZ`
+- Calls / returns: `CALL`, `CALLR`, `RET`, `TAIL`, `TAILR`
+- Frame management: `ENTER`, `LEAVE`
+- ABI access: `LDARG`
+- System: `SYSCALL`
+
+It's a deliberately dumb little RISC: a handful of visible registers, a hidden
+branch/call target register behind `LA_BR`, and a generator that spits out the
+per-arch `DEFINE` tables for 32-bit and 64-bit x86, ARM, and RISC-V.
+
+Calling convention:
+
+- `a0`-`a3` hold the first four argument words.
+- Extra arguments live in an incoming stack-argument area and are read with
+  `LDARG`.
+- `a0` is also the one-word return register.
+- `a0`-`a3` and `t0` are caller-saved; `s0`-`s1` and `sp` are callee-saved.
+- If you need a frame, `ENTER` builds the standard one and `LEAVE` tears it
+  down.
+- Stack-passed outgoing args are staged in frame-local space before `CALL`.
+- Wider-than-one-word returns use the usual hidden pointer trick: caller passes
+  a writable result buffer in `a0`, real args slide over by one, callee returns
+  that same pointer in `a0`.
+
+That's enough to write a portable macro expander, a Lisp, or a tiny C
+compiler once, then retarget by swapping out the generated M1 defs instead of
+rewriting the whole program per architecture. That cuts down auditable code at
+the bottom of the stack quite a bit. And it may give us a shorter path from
+M0 to TCC (but no promises). Mostly it's just a cute base to work on.
+
+```
+;; Bootstrap from seed
+(define hex0 (hex0-seed hex0.hex0))
+(define hex1 (hex0 hex1.hex0))
+(define hex2 (hex1 hex2.hex1))
+(define catm (hex2 catm.hex2))
+(define M0   (hex2 (catm ELF.hex2 M0.hex2)))
+
+(defn link (hex2-src) (hex2 (catm ELF.hex2 hex2-src)))
+
+;; Compile P1 source, no macros
+(defn p1compile0 (src) (M0 (catm P1.M1 src)))
+
+;; Bootstrap the macro expander once without macros.
+(define m1m (link (p1compile0 m1m.M1)))
+
+;; Normal P1 build: expand macros, then assemble/link.
+(defn p1compile (src) (M0 (catm P1.M1 (m1m (catm P1M.M1 src)))))
+(defn p1exe (src) (link (p1compile src)))
+```
+
+So, what does P1 code with the macro expansion base look like?
+
+```
+## copy.M1
+##
+## And symbolic constants:
+##   SYS_open SYS_read SYS_write SYS_close SYS_exit O_RDONLY
+##   WORD
+##   SAVE_S0 SAVE_S1 BUF_OFF COPY_LOCALS BUF_SIZE
+##
+## Entry convention:
+##   a0 = argc
+##   a1 = argv
+
+DEFINE BUF_SIZE %128
+DEFINE SAVE_S0 %16
+DEFINE SAVE_S1 %24
+DEFINE COPY_LOCALS %144
+
+:main
+  %li(t0, 2)
+  %blt(a0, t0, usage)          ; if argc < 2, exit 2
+
+  %ld(t0, a1, WORD)            ; t0 = argv[1]
+
+  %li(a0, SYS_open)
+  %mov(a1, t0)                 ; path
+  %li(a2, O_RDONLY)            ; flags
+  %li(a3, 0)                   ; mode
+  %syscall()
+  %bltz(a0, fail)              ; open failed
+
+  %mov(s0, a0)                 ; s0 = fd
+  %mov(a0, s0)
+  %call(copy_to_stdout)        ; returns 0 or -1 in a0
+  %bltz(a0, close_fail)
+
+  %li(a0, SYS_close)
+  %mov(a1, s0)
+  %syscall()
+
+  %li(a0, SYS_exit)
+  %li(a1, 0)
+  %syscall()
+
+:close_fail
+  %li(a0, SYS_close)
+  %mov(a1, s0)
+  %syscall()
+
+:fail
+  %li(a0, SYS_exit)
+  %li(a1, 1)
+  %syscall()
+
+:usage
+  %li(a0, SYS_exit)
+  %li(a1, 2)
+  %syscall()
+
+
+:copy_to_stdout
+  %enter(COPY_LOCALS)
+  %st(sp, SAVE_S0, s0)
+  %st(sp, SAVE_S1, s1)
+
+  %mov(s0, a0)                 ; s0 = input fd
+
+:read_loop
+  %li(a0, SYS_read)
+  %mov(a1, s0)                 ; fd
+  %mov(a2, sp)
+  %addi(a2, a2, BUF_OFF)       ; buf = sp + BUF_OFF
+  %li(a3, BUF_SIZE)            ; count
+  %syscall()
+
+  %beqz(a0, done)              ; EOF
+  %bltz(a0, io_error)          ; read error
+
+  %mov(s1, a0)                 ; remaining byte count
+  %mov(t0, sp)
+  %addi(t0, t0, BUF_OFF)       ; t0 = current write ptr
+
+:write_loop
+  %li(a0, SYS_write)
+  %li(a1, 1)                   ; stdout
+  %mov(a2, t0)                 ; buf
+  %mov(a3, s1)                 ; count
+  %syscall()
+
+  %bltz(a0, io_error)          ; write error
+
+  %add(t0, t0, a0)             ; buf += nwritten
+  %sub(s1, s1, a0)             ; remaining -= nwritten
+  %bnez(s1, write_loop)
+  %b(read_loop)
+
+:done
+  %li(a0, 0)
+  %ld(s0, sp, SAVE_S0)
+  %ld(s1, sp, SAVE_S1)
+  %leave()
+  %ret()
+
+:io_error
+  %li(a0, -1)
+  %ld(s0, sp, SAVE_S0)
+  %ld(s1, sp, SAVE_S1)
+  %leave()
+  %ret()
+```
diff --git a/src/m1macro.c b/src/m1macro.c
@@ -0,0 +1,886 @@
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Tiny single-pass M1 macro expander.
+ *
+ * Syntax:
+ *   %macro NAME(a, b)
+ *   ... body ...
+ *   %endm
+ *
+ *   %NAME(x, y)      function-like macro call
+ *   ##               token pasting inside macro bodies
+ *   @loop            local token inside a macro body
+ *   :@loop / &@loop  local label / reference shorthand
+ *   :param / &param  prefix a single-token parameter with ':' or '&'
+ *
+ * Notes:
+ *   - Macros are define-before-use. There is no prescan.
+ *   - Expansion rescans by pushing expanded tokens back through the same loop.
+ *   - There is no cycle detection. Recursive macros will loop until a limit.
+ *   - Only recognized %NAME(...) calls expand. Other text passes through.
+ *   - Output formatting is normalized to tokens plus '\n', not preserved.
+ */
+
+#define MAX_INPUT       262144
+#define MAX_OUTPUT      524288
+#define MAX_TEXT        524288
+#define MAX_TOKENS      65536
+#define MAX_MACROS      256
+#define MAX_PARAMS      16
+#define MAX_BODY_TOKENS 4096
+#define MAX_EXPAND      65536
+#define MAX_STACK       64
+
+enum {
+    TOK_WORD,
+    TOK_STRING,
+    TOK_NEWLINE,
+    TOK_LPAREN,
+    TOK_RPAREN,
+    TOK_COMMA,
+    TOK_PASTE
+};
+
+struct Token {
+    int kind;
+    int start;
+    int len;
+    int line;
+};
+
+struct Macro {
+    int name_start;
+    int name_len;
+    int line;
+    int param_count;
+    int param_start[MAX_PARAMS];
+    int param_len[MAX_PARAMS];
+    struct Token body[MAX_BODY_TOKENS];
+    int body_count;
+};
+
+struct Stream {
+    struct Token *toks;
+    int count;
+    int pos;
+    int line_start;
+    int pool_mark;
+};
+
+static char input_buf[MAX_INPUT + 1];
+static char output_buf[MAX_OUTPUT + 1];
+static char text_buf[MAX_TEXT];
+
+static struct Token source_tokens[MAX_TOKENS];
+static struct Token expand_pool[MAX_EXPAND];
+static struct Macro macros[MAX_MACROS];
+static struct Stream streams[MAX_STACK];
+
+static int text_used;
+static int source_count;
+static int pool_used;
+static int macro_count;
+static int local_id_next = 1;
+static int output_used;
+static int output_need_space;
+static int stream_top;
+
+static char error_buf[256];
+
+static int is_space_no_nl(int c)
+{
+    return c == ' ' || c == '\t' || c == '\r' || c == '\f' || c == '\v';
+}
+
+static int append_text_len(const char *s, int len)
+{
+    int start;
+
+    if (text_used + len + 1 > MAX_TEXT) {
+        snprintf(error_buf, sizeof(error_buf), "text buffer overflow");
+        return -1;
+    }
+    start = text_used;
+    memcpy(text_buf + text_used, s, (size_t)len);
+    text_used += len;
+    text_buf[text_used++] = '\0';
+    return start;
+}
+
+static int append_text_str(const char *s)
+{
+    return append_text_len(s, (int)strlen(s));
+}
+
+static int push_token(struct Token *buf, int *count, int max_count,
+                      int kind, int start, int len, int line)
+{
+    if (*count >= max_count) {
+        snprintf(error_buf, sizeof(error_buf), "token buffer overflow");
+        return 0;
+    }
+    buf[*count].kind = kind;
+    buf[*count].start = start;
+    buf[*count].len = len;
+    buf[*count].line = line;
+    *count += 1;
+    return 1;
+}
+
+static int token_text_eq(const struct Token *tok, const char *s)
+{
+    int len = (int)strlen(s);
+
+    return tok->len == len && memcmp(text_buf + tok->start, s, (size_t)len) == 0;
+}
+
+static int span_eq(int start, int len, const struct Token *tok)
+{
+    return len == tok->len && memcmp(text_buf + start, text_buf + tok->start, (size_t)len) == 0;
+}
+
+static int lex_source(const char *src)
+{
+    int i = 0;
+    int line = 1;
+
+    while (src[i] != '\0') {
+        int start;
+        int text_start;
+        int len;
+
+        if (is_space_no_nl((unsigned char)src[i])) {
+            i++;
+            continue;
+        }
+        if (src[i] == '\n') {
+            if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+                            TOK_NEWLINE, -1, 0, line)) {
+                return 0;
+            }
+            i++;
+            line++;
+            continue;
+        }
+        if (src[i] == '"' || src[i] == '\'') {
+            int quote = src[i];
+
+            start = i;
+            i++;
+            while (src[i] != '\0' && src[i] != quote) {
+                i++;
+            }
+            if (src[i] == quote) {
+                i++;
+            }
+            len = i - start;
+            text_start = append_text_len(src + start, len);
+            if (text_start < 0) {
+                return 0;
+            }
+            if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+                            TOK_STRING, text_start, len, line)) {
+                return 0;
+            }
+            continue;
+        }
+        if (src[i] == '#' && src[i + 1] == '#') {
+            text_start = append_text_str("##");
+            if (text_start < 0) {
+                return 0;
+            }
+            if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+                            TOK_PASTE, text_start, 2, line)) {
+                return 0;
+            }
+            i += 2;
+            continue;
+        }
+        if (src[i] == '#' || src[i] == ';') {
+            while (src[i] != '\0' && src[i] != '\n') {
+                i++;
+            }
+            continue;
+        }
+        if (src[i] == '(') {
+            text_start = append_text_str("(");
+            if (text_start < 0) {
+                return 0;
+            }
+            if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+                            TOK_LPAREN, text_start, 1, line)) {
+                return 0;
+            }
+            i++;
+            continue;
+        }
+        if (src[i] == ')') {
+            text_start = append_text_str(")");
+            if (text_start < 0) {
+                return 0;
+            }
+            if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+                            TOK_RPAREN, text_start, 1, line)) {
+                return 0;
+            }
+            i++;
+            continue;
+        }
+        if (src[i] == ',') {
+            text_start = append_text_str(",");
+            if (text_start < 0) {
+                return 0;
+            }
+            if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+                            TOK_COMMA, text_start, 1, line)) {
+                return 0;
+            }
+            i++;
+            continue;
+        }
+
+        start = i;
+        while (src[i] != '\0' &&
+               !is_space_no_nl((unsigned char)src[i]) &&
+               src[i] != '\n' &&
+               src[i] != '#' &&
+               src[i] != ';' &&
+               src[i] != '(' &&
+               src[i] != ')' &&
+               src[i] != ',' &&
+               !(src[i] == '#' && src[i + 1] == '#')) {
+            i++;
+        }
+        len = i - start;
+        text_start = append_text_len(src + start, len);
+        if (text_start < 0) {
+            return 0;
+        }
+        if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+                        TOK_WORD, text_start, len, line)) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+static int find_macro(const struct Token *tok)
+{
+    int i;
+    const char *s;
+
+    if (tok->kind != TOK_WORD) {
+        return -1;
+    }
+    if (tok->len < 2) {
+        return -1;
+    }
+    s = text_buf + tok->start;
+    if (s[0] != '%') {
+        return -1;
+    }
+    for (i = 0; i < macro_count; i++) {
+        if (tok->len - 1 == macros[i].name_len &&
+            memcmp(s + 1, text_buf + macros[i].name_start,
+                   (size_t)macros[i].name_len) == 0) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+static int find_param(const struct Macro *m, const struct Token *tok)
+{
+    int i;
+
+    if (tok->kind != TOK_WORD) {
+        return -1;
+    }
+    for (i = 0; i < m->param_count; i++) {
+        if (span_eq(m->param_start[i], m->param_len[i], tok)) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+static int find_prefixed_param(const struct Macro *m, const struct Token *tok, char *prefix)
+{
+    int i;
+    const char *s;
+
+    if (tok->kind != TOK_WORD || tok->len < 2) {
+        return -1;
+    }
+    s = text_buf + tok->start;
+    if (s[0] != ':' && s[0] != '&') {
+        return -1;
+    }
+    for (i = 0; i < m->param_count; i++) {
+        if (tok->len - 1 == m->param_len[i] &&
+            memcmp(s + 1, text_buf + m->param_start[i], (size_t)m->param_len[i]) == 0) {
+            *prefix = s[0];
+            return i;
+        }
+    }
+    return -1;
+}
+
+static int rewrite_local(const struct Token *in, int local_id, struct Token *out)
+{
+    const char *src = text_buf + in->start;
+    char tmp[256];
+    int prefix_len = 0;
+    int name_start = 0;
+    int n;
+    int text_start;
+
+    if (in->kind != TOK_WORD) {
+        *out = *in;
+        return 1;
+    }
+    if (in->len >= 2 && (src[0] == ':' || src[0] == '&') && src[1] == '@') {
+        prefix_len = 1;
+        name_start = 2;
+    } else if (in->len >= 1 && src[0] == '@') {
+        prefix_len = 0;
+        name_start = 1;
+    } else {
+        *out = *in;
+        return 1;
+    }
+
+    n = snprintf(tmp, sizeof(tmp), "%.*s__mlocal_%d_%.*s",
+                 prefix_len, src, local_id, in->len - name_start, src + name_start);
+    if (n < 0 || n >= (int)sizeof(tmp)) {
+        snprintf(error_buf, sizeof(error_buf), "local label too long at line %d", in->line);
+        return 0;
+    }
+    text_start = append_text_len(tmp, n);
+    if (text_start < 0) {
+        return 0;
+    }
+    out->kind = TOK_WORD;
+    out->start = text_start;
+    out->len = n;
+    out->line = in->line;
+    return 1;
+}
+
+static int push_pool_token(struct Token tok)
+{
+    if (pool_used >= MAX_EXPAND) {
+        snprintf(error_buf, sizeof(error_buf), "expansion pool overflow");
+        return 0;
+    }
+    expand_pool[pool_used++] = tok;
+    return 1;
+}
+
+static int emit_newline(void)
+{
+    if (output_used + 1 >= MAX_OUTPUT) {
+        snprintf(error_buf, sizeof(error_buf), "output buffer overflow");
+        return 0;
+    }
+    output_buf[output_used++] = '\n';
+    output_need_space = 0;
+    return 1;
+}
+
+static int emit_token(const struct Token *tok)
+{
+    if (output_need_space) {
+        if (output_used + 1 >= MAX_OUTPUT) {
+            snprintf(error_buf, sizeof(error_buf), "output buffer overflow");
+            return 0;
+        }
+        output_buf[output_used++] = ' ';
+    }
+    if (output_used + tok->len >= MAX_OUTPUT) {
+        snprintf(error_buf, sizeof(error_buf), "output buffer overflow");
+        return 0;
+    }
+    memcpy(output_buf + output_used, text_buf + tok->start, (size_t)tok->len);
+    output_used += tok->len;
+    output_need_space = 1;
+    return 1;
+}
+
+static int push_stream(struct Token *toks, int count, int pool_mark)
+{
+    struct Stream *s;
+
+    if (stream_top >= MAX_STACK) {
+        snprintf(error_buf, sizeof(error_buf), "stream stack overflow");
+        return 0;
+    }
+    s = &streams[stream_top++];
+    s->toks = toks;
+    s->count = count;
+    s->pos = 0;
+    s->line_start = 1;
+    s->pool_mark = pool_mark;
+    return 1;
+}
+
+static void pop_stream(void)
+{
+    if (stream_top <= 0) {
+        return;
+    }
+    stream_top--;
+    if (streams[stream_top].pool_mark >= 0) {
+        pool_used = streams[stream_top].pool_mark;
+    }
+}
+
+static int define_macro(struct Stream *s)
+{
+    struct Macro *m;
+    int i;
+    int line_start;
+
+    if (macro_count >= MAX_MACROS) {
+        snprintf(error_buf, sizeof(error_buf), "too many macros");
+        return 0;
+    }
+    m = &macros[macro_count];
+    memset(m, 0, sizeof(*m));
+    m->line = s->toks[s->pos].line;
+    s->pos++;
+
+    if (s->pos >= s->count || s->toks[s->pos].kind != TOK_WORD) {
+        snprintf(error_buf, sizeof(error_buf), "expected macro name at line %d", m->line);
+        return 0;
+    }
+    m->name_start = s->toks[s->pos].start;
+    m->name_len = s->toks[s->pos].len;
+    for (i = 0; i < macro_count; i++) {
+        if (macros[i].name_len == m->name_len &&
+            memcmp(text_buf + macros[i].name_start,
+                   text_buf + m->name_start, (size_t)m->name_len) == 0) {
+            snprintf(error_buf, sizeof(error_buf), "duplicate macro at line %d", m->line);
+            return 0;
+        }
+    }
+    s->pos++;
+
+    if (s->pos >= s->count || s->toks[s->pos].kind != TOK_LPAREN) {
+        snprintf(error_buf, sizeof(error_buf), "expected '(' after macro name at line %d", m->line);
+        return 0;
+    }
+    s->pos++;
+
+    if (s->pos < s->count && s->toks[s->pos].kind != TOK_RPAREN) {
+        while (1) {
+            if (m->param_count >= MAX_PARAMS) {
+                snprintf(error_buf, sizeof(error_buf), "too many params on macro at line %d", m->line);
+                return 0;
+            }
+            if (s->pos >= s->count || s->toks[s->pos].kind != TOK_WORD) {
+                snprintf(error_buf, sizeof(error_buf), "expected parameter name at line %d", m->line);
+                return 0;
+            }
+            m->param_start[m->param_count] = s->toks[s->pos].start;
+            m->param_len[m->param_count] = s->toks[s->pos].len;
+            m->param_count++;
+            s->pos++;
+            if (s->pos < s->count && s->toks[s->pos].kind == TOK_COMMA) {
+                s->pos++;
+                continue;
+            }
+            break;
+        }
+    }
+
+    if (s->pos >= s->count || s->toks[s->pos].kind != TOK_RPAREN) {
+        snprintf(error_buf, sizeof(error_buf), "expected ')' in macro header at line %d", m->line);
+        return 0;
+    }
+    s->pos++;
+
+    if (s->pos >= s->count || s->toks[s->pos].kind != TOK_NEWLINE) {
+        snprintf(error_buf, sizeof(error_buf), "expected newline after macro header at line %d", m->line);
+        return 0;
+    }
+    s->pos++;
+
+    line_start = 1;
+    while (s->pos < s->count) {
+        if (line_start &&
+            s->toks[s->pos].kind == TOK_WORD &&
+            token_text_eq(&s->toks[s->pos], "%endm")) {
+            while (s->pos < s->count && s->toks[s->pos].kind != TOK_NEWLINE) {
+                s->pos++;
+            }
+            if (s->pos < s->count && s->toks[s->pos].kind == TOK_NEWLINE) {
+                s->pos++;
+            }
+            s->line_start = 1;
+            macro_count++;
+            return 1;
+        }
+        if (m->body_count >= MAX_BODY_TOKENS) {
+            snprintf(error_buf, sizeof(error_buf), "macro body too large at line %d", m->line);
+            return 0;
+        }
+        m->body[m->body_count++] = s->toks[s->pos];
+        line_start = (s->toks[s->pos].kind == TOK_NEWLINE);
+        s->pos++;
+    }
+
+    snprintf(error_buf, sizeof(error_buf), "unterminated %%macro starting at line %d", m->line);
+    return 0;
+}
+
+static int parse_args(struct Stream *s, int lparen_pos,
+                      int arg_starts[MAX_PARAMS], int arg_ends[MAX_PARAMS],
+                      int *arg_count, int *end_pos)
+{
+    int i = lparen_pos + 1;
+    int depth = 1;
+    int cur = 0;
+
+    memset(arg_starts, 0, sizeof(int) * MAX_PARAMS);
+    memset(arg_ends, 0, sizeof(int) * MAX_PARAMS);
+    arg_starts[0] = i;
+
+    while (i < s->count) {
+        if (s->toks[i].kind == TOK_LPAREN) {
+            depth++;
+            i++;
+            continue;
+        }
+        if (s->toks[i].kind == TOK_RPAREN) {
+            depth--;
+            if (depth == 0) {
+                arg_ends[cur] = i;
+                *arg_count = cur + 1;
+                *end_pos = i + 1;
+                if (arg_starts[0] == arg_ends[0] && cur == 0) {
+                    *arg_count = 0;
+                }
+                return 1;
+            }
+            i++;
+            continue;
+        }
+        if (s->toks[i].kind == TOK_COMMA && depth == 1) {
+            arg_ends[cur] = i;
+            cur++;
+            if (cur >= MAX_PARAMS) {
+                snprintf(error_buf, sizeof(error_buf), "too many macro args at line %d", s->toks[i].line);
+                return 0;
+            }
+            arg_starts[cur] = i + 1;
+            i++;
+            continue;
+        }
+        i++;
+    }
+
+    snprintf(error_buf, sizeof(error_buf), "unterminated macro call at line %d", s->toks[lparen_pos].line);
+    return 0;
+}
+
+static int append_arg(struct Stream *s, int start, int end, int single)
+{
+    int i;
+
+    if (start == end) {
+        snprintf(error_buf, sizeof(error_buf), "empty macro argument");
+        return 0;
+    }
+    if (single && end - start != 1) {
+        snprintf(error_buf, sizeof(error_buf), "pasted or prefixed arg must be one token");
+        return 0;
+    }
+    for (i = start; i < end; i++) {
+        if (!push_pool_token(s->toks[i])) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+static int append_prefixed_arg(struct Stream *s, int start, int end, char prefix)
+{
+    char tmp[512];
+    int n;
+    int text_start;
+    struct Token tok;
+
+    if (end - start != 1) {
+        snprintf(error_buf, sizeof(error_buf), "prefixed arg must be one token");
+        return 0;
+    }
+    n = snprintf(tmp, sizeof(tmp), "%c%.*s",
+                 prefix, s->toks[start].len, text_buf + s->toks[start].start);
+    if (n < 0 || n >= (int)sizeof(tmp)) {
+        snprintf(error_buf, sizeof(error_buf), "prefixed arg too long");
+        return 0;
+    }
+    text_start = append_text_len(tmp, n);
+    if (text_start < 0) {
+        return 0;
+    }
+    tok.kind = TOK_WORD;
+    tok.start = text_start;
+    tok.len = n;
+    tok.line = s->toks[start].line;
+    return push_pool_token(tok);
+}
+
+static int paste_range(int start, int *count)
+{
+    int in = start;
+    int out = start;
+    int end = start + *count;
+
+    while (in < end) {
+        if (expand_pool[in].kind == TOK_PASTE) {
+            char tmp[512];
+            int n;
+            int text_start;
+
+            if (out == start || in + 1 >= end) {
+                snprintf(error_buf, sizeof(error_buf), "misplaced ## at line %d", expand_pool[in].line);
+                return 0;
+            }
+            if (expand_pool[out - 1].kind == TOK_NEWLINE ||
+                expand_pool[in + 1].kind == TOK_NEWLINE ||
+                expand_pool[out - 1].kind == TOK_PASTE ||
+                expand_pool[in + 1].kind == TOK_PASTE) {
+                snprintf(error_buf, sizeof(error_buf), "bad ## operand at line %d", expand_pool[in].line);
+                return 0;
+            }
+
+            n = snprintf(tmp, sizeof(tmp), "%.*s%.*s",
+                         expand_pool[out - 1].len, text_buf + expand_pool[out - 1].start,
+                         expand_pool[in + 1].len, text_buf + expand_pool[in + 1].start);
+            if (n < 0 || n >= (int)sizeof(tmp)) {
+                snprintf(error_buf, sizeof(error_buf), "pasted token too long at line %d", expand_pool[in].line);
+                return 0;
+            }
+            text_start = append_text_len(tmp, n);
+            if (text_start < 0) {
+                return 0;
+            }
+            expand_pool[out - 1].kind = TOK_WORD;
+            expand_pool[out - 1].start = text_start;
+            expand_pool[out - 1].len = n;
+            in += 2;
+            continue;
+        }
+        if (out != in) {
+            expand_pool[out] = expand_pool[in];
+        }
+        out++;
+        in++;
+    }
+
+    *count = out - start;
+    return 1;
+}
+
+static int expand_call(struct Stream *s, int macro_idx)
+{
+    struct Macro *m = &macros[macro_idx];
+    int arg_starts[MAX_PARAMS];
+    int arg_ends[MAX_PARAMS];
+    int arg_count;
+    int end_pos;
+    int mark;
+    int count;
+    int local_id;
+    int i;
+
+    if (s->pos + 1 >= s->count || s->toks[s->pos + 1].kind != TOK_LPAREN) {
+        snprintf(error_buf, sizeof(error_buf), "internal macro call error");
+        return 0;
+    }
+    if (!parse_args(s, s->pos + 1, arg_starts, arg_ends, &arg_count, &end_pos)) {
+        return 0;
+    }
+    if (arg_count != m->param_count) {
+        snprintf(error_buf, sizeof(error_buf),
+                 "macro '%.*s' expects %d args, got %d at line %d",
+                 m->name_len, text_buf + m->name_start,
+                 m->param_count, arg_count, s->toks[s->pos].line);
+        return 0;
+    }
+
+    s->pos = end_pos;
+    s->line_start = 0;
+    mark = pool_used;
+    local_id = local_id_next++;
+
+    for (i = 0; i < m->body_count; i++) {
+        int param_idx = find_param(m, &m->body[i]);
+        int pasted = 0;
+        char prefix = 0;
+        struct Token tok;
+
+        if (param_idx >= 0) {
+            pasted = (i > 0 && m->body[i - 1].kind == TOK_PASTE) ||
+                     (i + 1 < m->body_count && m->body[i + 1].kind == TOK_PASTE);
+            if (!append_arg(s, arg_starts[param_idx], arg_ends[param_idx], pasted)) {
+                pool_used = mark;
+                return 0;
+            }
+            continue;
+        }
+
+        param_idx = find_prefixed_param(m, &m->body[i], &prefix);
+        if (param_idx >= 0) {
+            if (!append_prefixed_arg(s, arg_starts[param_idx], arg_ends[param_idx], prefix)) {
+                pool_used = mark;
+                return 0;
+            }
+            continue;
+        }
+
+        if (m->body[i].kind == TOK_PASTE) {
+            if (!push_pool_token(m->body[i])) {
+                pool_used = mark;
+                return 0;
+            }
+            continue;
+        }
+
+        if (!rewrite_local(&m->body[i], local_id, &tok)) {
+            pool_used = mark;
+            return 0;
+        }
+        if (!push_pool_token(tok)) {
+            pool_used = mark;
+            return 0;
+        }
+    }
+
+    count = pool_used - mark;
+    if (!paste_range(mark, &count)) {
+        pool_used = mark;
+        return 0;
+    }
+    pool_used = mark + count;
+    if (count == 0) {
+        return 1;
+    }
+    return push_stream(expand_pool + mark, count, mark);
+}
+
+static int process_tokens(void)
+{
+    if (!push_stream(source_tokens, source_count, -1)) {
+        return 0;
+    }
+
+    for (;;) {
+        struct Stream *s;
+        struct Token *tok;
+        int macro_idx;
+
+        if (stream_top <= 0) {
+            break;
+        }
+        s = &streams[stream_top - 1];
+        if (s->pos >= s->count) {
+            pop_stream();
+            continue;
+        }
+
+        tok = &s->toks[s->pos];
+
+        if (s->line_start &&
+            tok->kind == TOK_WORD &&
+            token_text_eq(tok, "%macro")) {
+            if (!define_macro(s)) {
+                return 0;
+            }
+            continue;
+        }
+
+        if (tok->kind == TOK_NEWLINE) {
+            s->pos++;
+            s->line_start = 1;
+            if (!emit_newline()) {
+                return 0;
+            }
+            continue;
+        }
+
+        macro_idx = find_macro(tok);
+        if (macro_idx >= 0 && s->pos + 1 < s->count && s->toks[s->pos + 1].kind == TOK_LPAREN) {
+            if (!expand_call(s, macro_idx)) {
+                return 0;
+            }
+            continue;
+        }
+
+        s->pos++;
+        s->line_start = 0;
+        if (!emit_token(tok)) {
+            return 0;
+        }
+    }
+
+    if (output_used >= MAX_OUTPUT) {
+        snprintf(error_buf, sizeof(error_buf), "output buffer overflow");
+        return 0;
+    }
+    output_buf[output_used] = '\0';
+    return 1;
+}
+
+int main(int argc, char **argv)
+{
+    FILE *in;
+    FILE *out;
+    size_t nread;
+
+    if (argc != 3) {
+        fprintf(stderr, "usage: %s input.M1 output.M1\n", argv[0]);
+        return 1;
+    }
+
+    in = fopen(argv[1], "rb");
+    if (in == NULL) {
+        perror(argv[1]);
+        return 1;
+    }
+    nread = fread(input_buf, 1, MAX_INPUT, in);
+    if (ferror(in)) {
+        perror(argv[1]);
+        fclose(in);
+        return 1;
+    }
+    fclose(in);
+    if (nread >= MAX_INPUT) {
+        fprintf(stderr, "input too large\n");
+        return 1;
+    }
+    input_buf[nread] = '\0';
+
+    if (!lex_source(input_buf) || !process_tokens()) {
+        fprintf(stderr, "m1macro: %s\n", error_buf);
+        return 1;
+    }
+
+    out = fopen(argv[2], "wb");
+    if (out == NULL) {
+        perror(argv[2]);
+        return 1;
+    }
+    if (fwrite(output_buf, 1, (size_t)output_used, out) != (size_t)output_used) {
+        perror(argv[2]);
+        fclose(out);
+        return 1;
+    }
+    fclose(out);
+    return 0;
+}

	boot2 Playing with the boostrap
	git clone https://git.ryansepassi.com/git/boot2.git
	Log \| Files \| Refs

A	docs/P1v2.md	\|	503	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	p1/aarch64.py	\|	354	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	p1/common.py	\|	66	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	p1/llvm_disasm_aarch64.py	\|	157	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	p1/p1_gen.py	\|	251	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	post.md	\|	247	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	src/m1macro.c	\|	886	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++