commit c34eadeb5a6d1e48435c97d722fcc15a677a6a54
parent 1b11e9dca9efcdb09fd2917b96b240e43c8ec91d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 22 Apr 2026 17:51:03 -0700
p1v2
Diffstat:
| A | docs/P1v2.md | | | 503 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/aarch64.py | | | 354 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/common.py | | | 66 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/llvm_disasm_aarch64.py | | | 157 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | p1/p1_gen.py | | | 251 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | post.md | | | 247 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/m1macro.c | | | 886 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
7 files changed, 2464 insertions(+), 0 deletions(-)
diff --git a/docs/P1v2.md b/docs/P1v2.md
@@ -0,0 +1,503 @@
+# P1 v2
+
+## Scope
+
+P1 v2 is a portable pseudo-ISA for standalone executables.
+
+P1 v2 has two width variants:
+
+- **P1v2-64** — one word is one 64-bit integer or pointer value
+- **P1v2-32** — one word is one 32-bit integer or pointer value
+
+Portable source may use any number of word arguments. The first four argument
+registers are explicit, and additional argument words are passed through a
+portable incoming stack-argument area.
+
+Portable source may directly return `0..1` word. Wider results use the
+portable indirect-result convention described below.
+
+## Toolchain envelope
+
+P1 v2 must be assemblable through the existing `M0` + `hex2` path, with
+`catm` as the only composition primitive between source or generated fragments.
+The spec therefore assumes only the following toolchain features:
+
+- `M0`-level `DEFINE name hex_bytes` substitution
+- raw byte emission
+- labels and label references supported by `hex2`
+- file concatenation via `catm`
+
+## Source notation
+
+This document describes instructions using ordinary assembly notation such as
+`ADD rd, ra, rb`, `LD rd, [ra + off]`, or `CALL`.
+
+Because of the toolchain constraints above, portable source does not encode
+most operands as textual instruction arguments. Instead, register choices,
+inline immediate values, and small fixed parameters are fused into opcode
+names, following the generated-table style used by `src/p1_gen.py`.
+
+So the notation in this document is descriptive rather than literal:
+
+- `ADD rd, ra, rb` means a family of fused register-specific opcodes
+- `ADDI rd, ra, imm` means a family of fused register-and-immediate-specific
+ opcodes
+- `ENTER size` means a family of fused byte-count-specific opcodes
+- `LDARG rd, idx` means a family of fused register-and-argument-slot-specific
+ opcodes
+- `BR rs`, `CALLR rs`, and `TAILR rs` mean register-specific control-flow
+ opcodes
+- `LEAVE`, `CALL`, `RET`, `TAIL`, `B`, and `SYSCALL` remain operand-free
+
+Labels still appear in source where the toolchain supports them directly, such
+as `LA rd, %label` and `LA_BR %label`.
+
+## Register Model
+
+### Exposed registers
+
+P1 v2 exposes the following source-level registers:
+
+- `a0`–`a3` — argument registers. Also caller-saved general registers.
+- `t0` — caller-saved temporary.
+- `s0`–`s1` — callee-saved general registers.
+- `sp` — stack pointer.
+
+### Hidden registers
+
+The backend may reserve additional native registers that are never visible in
+P1 source:
+
+- `br` — branch / call target mechanism
+- backend-local scratch used entirely within one instruction expansion
+
+No hidden register may carry a live P1 value across an instruction boundary.
+`br` need not be a native register on every target; it may be implemented by
+another backend-private mechanism.
+
+## Calling Convention
+
+### Arguments and return values
+
+- In the direct-result convention, explicit argument words 0-3 live in
+ `a0-a3`.
+- Additional explicit argument words live in the incoming stack-argument area
+ and are read with `LDARG`.
+- In the direct-result convention, a one-word return value lives in `a0`.
+
+P1 v2 also defines an indirect-result convention for wider returns:
+
+- The caller passes a writable result buffer pointer in `a0`.
+- Explicit argument words 0-2 then live in `a1-a3`.
+- Additional explicit argument words still live in the incoming
+ stack-argument area.
+- On return, `a0` holds the same result buffer pointer value.
+
+In the direct-result convention, incoming stack-argument slot `0` therefore
+corresponds to explicit argument word `4`. In the indirect-result convention,
+incoming stack-argument slot `0` corresponds to explicit argument word `3`.
+
+The indirect-result convention is the portable way to return aggregates or any
+result wider than one word.
+
+### Register preservation
+
+Caller-saved:
+
+- `a0`–`a3`
+- `t0`
+
+Callee-saved:
+
+- `s0`–`s1`
+- `sp`
+
+### Call semantics
+
+A call is valid from any function, including a leaf. Call / return correctness
+does not depend on establishing a frame first.
+
+If a function needs any incoming argument after making a call, it must save it
+before the call. This matters in particular for argument 0, since `a0` is also
+the return-value register.
+
+A call that passes any stack argument words requires the caller to have an
+active standard frame with enough frame-local storage to stage those outgoing
+words.
+
+The return address is hidden machine state. Portable source must not assume
+that it lives in any exposed register.
+
+## Stack Convention
+
+### Call-boundary rule
+
+At every call boundary, the backend must satisfy the native C ABI stack
+alignment rule for the target architecture.
+
+Portable source must therefore treat raw function-entry `sp` as opaque. It may
+not assume that the low bits of `sp` have the same meaning on all targets
+before a frame is established.
+
+### Incoming stack-argument area
+
+P1 v2 defines an abstract incoming stack-argument area for explicit argument
+words that do not fit in registers.
+
+- Slot `0` is the first stack-passed explicit argument word.
+- Slots are word-indexed, not byte-indexed.
+- Portable source may access this area only through `LDARG`.
+
+`LDARG` is valid only when the current function has an active standard frame.
+Therefore, a function that needs any incoming stack argument must establish a
+standard frame before its first `LDARG`.
+
+Portable source must not assume any direct relationship between incoming
+argument slots and raw function-entry `sp`. In particular, source must not try
+to reconstruct stack arguments by manually indexing from `sp`; backend entry
+layouts differ across targets.
+
+For a call with `m` stack-passed explicit argument words, the caller stages
+those words in the first `m` words of its frame-local storage immediately
+before the call:
+
+```
+[sp + 2*WORD + 0*WORD] = outgoing arg word 0
+[sp + 2*WORD + 1*WORD] = outgoing arg word 1
+...
+```
+
+At callee entry, those staged words become incoming argument slots `0..m-1`.
+The backend is responsible for mapping between the caller's frame layout and
+the callee's abstract incoming argument slots.
+
+Portable code that needs both ordinary locals and stack-passed outgoing
+arguments must reserve enough total frame-local storage and keep the low-
+addressed prefix available for outgoing argument staging across the call.
+
+### Standard frame layout
+
+Functions that need local stack storage use a standard frame layout. After
+frame establishment:
+
+```
+[sp + 0*WORD] = saved return address
+[sp + 1*WORD] = saved caller stack pointer
+[sp + 2*WORD ... sp + 2*WORD + local_bytes - 1] = frame-local storage
+...
+```
+
+Frame-local storage is byte-addressed. Portable code may use it for ordinary
+locals, spilled callee-saved registers, and the caller-staged outgoing
+stack-argument words described above.
+
+Total frame size is:
+
+`round_up(STACK_ALIGN, 2*WORD_SIZE + local_bytes)`
+
+Where:
+
+- `WORD_SIZE = 8` in P1v2-64
+- `WORD_SIZE = 4` in P1v2-32
+- `STACK_ALIGN` is target-defined and must satisfy the native call ABI
+
+Leaf functions that need no frame-local storage may omit the frame entirely.
+
+### Frame invariants
+
+- A function that allocates a frame must restore `sp` before returning.
+- Callee-saved registers modified by the function must be restored before
+ returning.
+- The standard frame layout is the only frame shape recognized by P1 v2.
+
+## Op Set Summary
+
+| Category | Operations |
+|----------|------------|
+| Materialization | `LI rd, imm`, `LA rd, %label`, `LA_BR %label` |
+| Moves | `MOV rd, rs`, `MOV rd, sp` |
+| Arithmetic | `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`, `MUL`, `DIV`, `REM` |
+| Immediate arithmetic | `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI` |
+| Memory | `LD`, `ST`, `LB`, `SB` |
+| ABI access | `LDARG` |
+| Branching | `B`, `BR`, `BEQ`, `BNE`, `BLT`, `BEQZ`, `BNEZ`, `BLTZ` |
+| Calls / returns | `CALL`, `CALLR`, `RET`, `TAIL`, `TAILR` |
+| Frame management | `ENTER`, `LEAVE` |
+| System | `SYSCALL` |
+
+## Immediates
+
+Immediate operands appear only in instructions that explicitly admit them.
+Portable source has three immediate classes:
+
+- **Inline integer immediate** — a signed 12-bit assembly-time constant in the
+ range `-2048..2047`
+- **Materialized word value** — a full one-word assembly-time constant loaded
+ with `LI`
+- **Materialized address** — the address of a label loaded with `LA`
+
+P1 v2 also uses two structured assembly-time operands:
+
+- **Frame-local byte count** — a non-negative byte count used by `ENTER`
+- **Argument-slot index** — a non-negative word-slot index used by `LDARG`
+
+`LI rd, imm` loads the one-word integer value `imm`.
+
+`LA rd, %label` loads the address of `%label` as a one-word pointer value.
+
+The backend may realize `LI` and `LA` using native immediates, literal pools,
+multi-instruction sequences, or other backend-private mechanisms.
+
+## Control Flow
+
+### Call / Return / Tail Call
+
+Control-flow targets are materialized with `LA_BR %label`, which loads
+`%label` into the hidden branch-target mechanism `br`. The immediately
+following control-flow op consumes that target.
+
+`CALL` transfers control to the target most recently loaded by `LA_BR` and
+establishes a return continuation such that a subsequent `RET` returns to the
+instruction after the `CALL`. `CALL` is valid whether or not the caller has
+established a standard frame, except that any call using stack-passed argument
+words requires an active standard frame to hold the staged outgoing words.
+
+`CALLR rs` is the register-indirect form of `CALL`. It transfers control to
+the code pointer value held in `rs` and establishes the same return
+continuation semantics as `CALL`.
+
+`RET` returns through the current return continuation. `RET` is valid whether
+or not the current function has established a standard frame, provided any
+frame established by the function has already been torn down.
+
+`TAIL` is a tail call to the target most recently loaded by `LA_BR`. It is
+valid only when the current function has an active standard frame. `TAIL`
+performs the standard epilogue for the current frame and then transfers control
+to the loaded target without creating a new return continuation. The callee
+therefore returns directly to the current function's caller.
+
+`TAILR rs` is the register-indirect form of `TAIL`. It is valid only when the
+current function has an active standard frame.
+
+Because stack-passed outgoing argument words are staged in the caller's own
+frame-local storage, `TAIL` and `TAILR` are portable only when the tail-called
+callee requires no stack-passed argument words. Portable compilers must lower
+other tail-call cases to an ordinary `CALL` / `RET` sequence.
+
+Portable source must treat the return continuation as hidden machine state. It
+must not assume that the return address lives in any exposed register or stack
+location except as defined by the standard frame layout after frame
+establishment.
+
+### Prologue / Epilogue
+
+P1 v2 defines the following frame-establishment and frame-teardown operations:
+
+- `ENTER size`
+- `LEAVE`
+
+`ENTER size` establishes the standard frame layout with `size` bytes of
+frame-local storage:
+
+```
+[sp + 0*WORD] = saved return address
+[sp + 1*WORD] = saved caller stack pointer
+[sp + 2*WORD ... sp + 2*WORD + size - 1] = frame-local storage
+```
+
+The total allocation size is:
+
+`round_up(STACK_ALIGN, 2*WORD_SIZE + size)`
+
+The named frame-local bytes are the usable local storage. Any additional bytes
+introduced by alignment rounding are padding, not extra local bytes.
+
+`LEAVE` tears down the current standard frame and restores the hidden return
+continuation so that a subsequent `RET` returns correctly.
+
+Because every standard frame stores the saved caller stack pointer at
+`[sp + 1*WORD]`, `LEAVE` does not need to know the frame-local byte count used
+by the corresponding `ENTER`.
+
+A function may omit `ENTER` / `LEAVE` entirely if it is a leaf and needs no
+standard frame.
+
+`ENTER` and `LEAVE` do not implicitly save or restore `s0` or `s1`. A
+function that modifies `s0` or `s1` must preserve them explicitly, typically by
+storing them in frame-local storage within its standard frame.
+
+### Branching
+
+P1 v2 branch targets are carried through the hidden branch-target mechanism
+`br`. Portable source may load `br` only through:
+
+- `LA_BR %label` — materialize the address of `%label` as the next branch, call,
+ or tail-call target
+
+No branch, call, or tail opcode takes a label operand directly. Portable source
+must treat `br` as owned by the control-flow machinery. No live value may be
+carried in `br`. Each `LA_BR` must be consumed by the immediately following
+branch, call, or tail op, and portable source must not rely on `br` surviving
+across any other instruction.
+
+The portable branch families are:
+
+- `B` — unconditional branch to the target in `br`
+- `BR rs` — unconditional branch to the code pointer in `rs`
+- `BEQ`, `BNE`, `BLT` — conditional branch to the target in `br`
+- `BEQZ`, `BNEZ`, `BLTZ` — conditional branch to the target in `br` using zero
+ as the second operand
+
+`BLT` and `BLTZ` perform signed comparisons on one-word values.
+
+If a branch condition is true, control transfers to the target currently held in
+`br`. If the condition is false, execution falls through to the next
+instruction.
+
+## Data Ops
+
+### Arithmetic
+
+P1 v2 defines the following arithmetic and bitwise operations on one-word
+values:
+
+- register-register: `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`,
+ `MUL`, `DIV`, `REM`
+- immediate: `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI`
+
+For `ADD`, `SUB`, `MUL`, `AND`, `OR`, and `XOR`, computation is modulo the
+active word size.
+
+`SHL` shifts left and discards high bits. `SHR` is a logical right shift and
+zero-fills. `SAR` is an arithmetic right shift and sign-fills.
+
+For register-count shifts, only the low `5` bits of the shift count are
+observed in `P1v2-32`, and only the low `6` bits are observed in `P1v2-64`.
+
+Immediate-form shifts use inline immediates in the range `0..31` in `P1v2-32`
+and `0..63` in `P1v2-64`.
+
+`DIV` is signed division on one-word two's-complement values and truncates
+toward zero. `REM` is the corresponding signed remainder.
+
+Division by zero is outside the portable contract. The overflow case
+`MIN_INT / -1` is also outside the portable contract, as is the corresponding
+remainder case.
+
+### Moves
+
+P1 v2 defines the following move and materialization operations:
+
+- `MOV` — register-to-register copy
+- `LI` — load one-word integer constant
+- `LA` — load label address
+
+`MOV` may copy from any exposed general register to any exposed general
+register.
+
+Portable source may also read the current stack pointer through `MOV rd, sp`.
+
+Portable source may not write `sp` through `MOV`. Stack-pointer updates are only
+performed by `ENTER`, `LEAVE`, and backend-private call/return machinery.
+
+`LI` materializes an integer bit-pattern. `LA` materializes the address of a
+label. `LA_BR` is a separate control-flow-target materialization form and is not
+part of the general move family.
+
+### Memory
+
+P1 v2 defines the following memory-access operations:
+
+- `LD`, `ST` — one-word load and store
+- `LB`, `SB` — byte load and store
+- `LDARG` — one-word load from the incoming stack-argument area
+
+`LD` and `ST` access one full word: 4 bytes in `P1v2-32` and 8 bytes in
+`P1v2-64`.
+
+`LB` loads one byte and zero-extends it to a full word. `SB` stores the low
+8 bits of the source value.
+
+Memory offsets use signed 12-bit inline immediates.
+
+The base address for a memory access may be any exposed general register or
+`sp`.
+
+`LDARG rd, idx` loads incoming stack-argument slot `idx`, where slot `0` is the
+first stack-passed explicit argument word. `idx` is word-indexed, not
+byte-indexed. `LDARG` is an ABI access, not a general memory operation; it does
+not expose or imply any raw `sp`-relative layout at function entry.
+
+`LDARG` is valid only when the current function has an active standard frame.
+
+Portable source must not assume that labels are aligned beyond what is
+explicitly established by the program itself. Portable code should use
+naturally aligned addresses for `LD` and `ST`. Unaligned word accesses are
+outside the portable contract. Byte accesses have no additional alignment
+requirement.
+
+## System
+
+`SYSCALL` is part of the portable ISA surface.
+
+At the portable level, the syscall convention is:
+
+- `a0` = syscall number on entry, return value on exit
+- `a1`, `a2`, `a3`, `t0`, `s0`, `s1` = syscall arguments 0 through 5
+
+At the portable level, `SYSCALL` clobbers only `a0`. All other exposed
+registers are preserved across the syscall.
+
+The mapping from symbolic syscall names to numeric syscall identifiers is
+target-defined. The set of syscalls available to a given program is likewise
+specified outside the core P1 v2 ISA, for example by a target profile or
+runtime interface document.
+
+## Target notes
+
+- `a0` is both argument 0 and the return-value register in the portable
+ calling convention in the direct-result case, and the indirect-result buffer
+ pointer in the indirect-result case.
+- On aarch64, riscv64, arm32, and rv32, that matches the native integer/pointer
+ ABI directly.
+- On amd64, the backend must translate between portable `a0` and native
+ return register `rax` at call and return boundaries.
+- On i386, the backend must translate between portable argument registers and
+ the native stack-argument ABI at call boundaries.
+- On amd64 and i386, `LDARG` must account for the return address pushed by the
+ native `call` instruction. On aarch64, riscv64, arm32, and rv32, it maps more
+ directly to the entry `sp` plus the backend's standard frame/header policy.
+- On amd64, aarch64, riscv64, arm32, and rv32, `br` may be implemented as a
+ dedicated hidden native register.
+- On i386, `br` is expected to be a backend-private stack convention, not a
+ dedicated hidden register.
+- Frame-pointer use is backend policy, not part of the P1 v2 architectural
+ register set.
+
+### Native register mapping
+
+#### 64-bit targets
+
+| P1 | amd64 | aarch64 | riscv64 |
+|------|-------|---------|---------|
+| `a0` | `rdi` | `x0` | `a0` |
+| `a1` | `rsi` | `x1` | `a1` |
+| `a2` | `rdx` | `x2` | `a2` |
+| `a3` | `rcx` | `x3` | `a3` |
+| `t0` | `r10` | `x9` | `t0` |
+| `s0` | `rbx` | `x19` | `s1` |
+| `s1` | `r12` | `x20` | `s2` |
+| `sp` | `rsp` | `sp` | `sp` |
+
+#### 32-bit targets
+
+| P1 | arm32 | i386 | rv32 |
+|------|-------|-------|-------|
+| `a0` | `r0` | `eax` | `a0` |
+| `a1` | `r1` | `ecx` | `a1` |
+| `a2` | `r2` | `edx` | `a2` |
+| `a3` | `r3` | `ebx` | `a3` |
+| `t0` | `r12` | `esi` | `t0` |
+| `s0` | `r4` | `edi` | `s1` |
+| `s1` | `r5` | `ebp` | `s2` |
+| `sp` | `sp` | `esp` | `sp` |
diff --git a/p1/aarch64.py b/p1/aarch64.py
@@ -0,0 +1,354 @@
+from common import (
+ AddI,
+ ArchDef,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ le32,
+ register_arch,
+ round_up,
+)
+
+
+NAT = {
+ 'a0': 0,
+ 'a1': 1,
+ 'a2': 2,
+ 'a3': 3,
+ 'x4': 4,
+ 'x5': 5,
+ 't0': 9,
+ 's0': 19,
+ 's1': 20,
+ 'sp': 31,
+ 'xzr': 31,
+ 'lr': 30,
+ 'br': 17,
+ 'scratch': 16,
+ 'x8': 8,
+ 'save0': 21,
+ 'save1': 22,
+ 'save2': 23,
+}
+
+
+RRR_BASE = {
+ 'ADD': 0x8B000000,
+ 'SUB': 0xCB000000,
+ 'AND': 0x8A000000,
+ 'OR': 0xAA000000,
+ 'XOR': 0xCA000000,
+ 'SHL': 0x9AC02000,
+ 'SHR': 0x9AC02400,
+ 'SAR': 0x9AC02800,
+ 'DIV': 0x9AC00C00,
+}
+
+
+SYSCALL_NUMBERS = {
+ 'SYS_READ': 63,
+ 'SYS_WRITE': 64,
+ 'SYS_CLOSE': 57,
+ 'SYS_OPENAT': 56,
+ 'SYS_EXIT': 93,
+ 'SYS_CLONE': 220,
+ 'SYS_EXECVE': 221,
+ 'SYS_WAITID': 95,
+}
+
+
+def aa_rrr(base, rd, ra, rb):
+ d = NAT[rd]
+ a = NAT[ra]
+ b = NAT[rb]
+ return le32(base | (b << 16) | (a << 5) | d)
+
+
+def aa_add_imm(rd, ra, imm12, sub=False):
+ d = NAT[rd]
+ a = NAT[ra]
+ base = 0xD1000000 if sub else 0x91000000
+ return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d)
+
+
+def aa_mov_rr(dst, src):
+ if dst == 'sp':
+ return aa_add_imm('sp', src, 0, sub=False)
+ if src == 'sp':
+ return aa_add_imm(dst, 'sp', 0, sub=False)
+ d = NAT[dst]
+ s = NAT[src]
+ return le32(0xAA000000 | (s << 16) | (31 << 5) | d)
+
+
+def aa_ubfm(rd, ra, immr, imms):
+ d = NAT[rd]
+ a = NAT[ra]
+ return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_sbfm(rd, ra, immr, imms):
+ d = NAT[rd]
+ a = NAT[ra]
+ return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d)
+
+
+def aa_movz(rd, imm16):
+ d = NAT[rd]
+ return le32(0xD2800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_movn(rd, imm16):
+ d = NAT[rd]
+ return le32(0x92800000 | ((imm16 & 0xFFFF) << 5) | d)
+
+
+def aa_materialize_small_imm(rd, imm):
+ if imm >= 0:
+ return aa_movz(rd, imm)
+ return aa_movn(rd, (~imm) & 0xFFFF)
+
+
+def aa_ldst_uimm12(base, rt, rn, off_bytes, size_log2):
+ imm12 = off_bytes >> size_log2
+ t = NAT[rt]
+ n = NAT[rn]
+ return le32(base | (imm12 << 10) | (n << 5) | t)
+
+
+def aa_ldst_unscaled(base, rt, rn, off):
+ imm9 = off & 0x1FF
+ t = NAT[rt]
+ n = NAT[rn]
+ return le32(base | (imm9 << 12) | (n << 5) | t)
+
+
+def aa_mem(op, rt, rn, off):
+ bases = {
+ 'LD': (0xF9400000, 3, 0xF8400000),
+ 'ST': (0xF9000000, 3, 0xF8000000),
+ 'LB': (0x39400000, 0, 0x38400000),
+ 'SB': (0x39000000, 0, 0x38000000),
+ }
+ uimm_base, size_log2, unscaled_base = bases[op]
+ scale = 1 << size_log2
+ if off >= 0 and off % scale == 0 and off < (4096 << size_log2):
+ return aa_ldst_uimm12(uimm_base, rt, rn, off, size_log2)
+ if -256 <= off <= 255:
+ return aa_ldst_unscaled(unscaled_base, rt, rn, off)
+ if -2048 <= off <= 2047:
+ if off >= 0:
+ addr = aa_add_imm('scratch', rn, off, sub=False)
+ else:
+ addr = aa_add_imm('scratch', rn, -off, sub=True)
+ return addr + aa_ldst_uimm12(uimm_base, rt, 'scratch', 0, size_log2)
+ raise ValueError(f'aarch64 offset out of range for {op}: {off}')
+
+
+def aa_cmp_skip(op, ra, rb):
+ a = NAT[ra]
+ b = NAT[rb]
+ cmp_hex = le32(0xEB000000 | (b << 16) | (a << 5) | 31)
+ skip_cond = {
+ 'BEQ': 1,
+ 'BNE': 0,
+ 'BLT': 10,
+ }[op]
+ return cmp_hex + le32(0x54000040 | skip_cond)
+
+
+def aa_br(reg):
+ return le32(0xD61F0000 | (NAT[reg] << 5))
+
+
+def aa_blr(reg):
+ return le32(0xD63F0000 | (NAT[reg] << 5))
+
+
+def aa_ret():
+ return le32(0xD65F03C0)
+
+
+def aa_lit64_prefix(rd):
+ d = NAT[rd]
+ ldr_lit = 0x58000040 | d
+ b_plus8 = 0x14000002
+ return le32(ldr_lit) + le32(b_plus8)
+
+
+def encode_li(_arch, row):
+ return aa_lit64_prefix(row.rd)
+
+
+def encode_la(_arch, row):
+ return aa_lit64_prefix(row.rd)
+
+
+def encode_labr(_arch, _row):
+ return aa_lit64_prefix('br')
+
+
+def encode_mov(_arch, row):
+ return aa_mov_rr(row.rd, row.rs)
+
+
+def encode_rrr(_arch, row):
+ if row.op == 'MUL':
+ d = NAT[row.rd]
+ a = NAT[row.ra]
+ b = NAT[row.rb]
+ return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d)
+ if row.op == 'REM':
+ d = NAT[row.rd]
+ a = NAT[row.ra]
+ b = NAT[row.rb]
+ sc = NAT['scratch']
+ sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | sc
+ msub = 0x9B008000 | (b << 16) | (a << 10) | (sc << 5) | d
+ return le32(sdiv) + le32(msub)
+ return aa_rrr(RRR_BASE[row.op], row.rd, row.ra, row.rb)
+
+
+def encode_addi(_arch, row):
+ if row.imm >= 0:
+ return aa_add_imm(row.rd, row.ra, row.imm, sub=False)
+ return aa_add_imm(row.rd, row.ra, -row.imm, sub=True)
+
+
+def encode_logi(_arch, row):
+ seq = aa_materialize_small_imm('scratch', row.imm)
+ base = {
+ 'ANDI': 0x8A000000,
+ 'ORI': 0xAA000000,
+ }[row.op]
+ return seq + aa_rrr(base, row.rd, row.ra, 'scratch')
+
+
+def encode_shifti(_arch, row):
+ if row.op == 'SHLI':
+ return aa_ubfm(row.rd, row.ra, (-row.imm) & 63, 63 - row.imm)
+ if row.op == 'SHRI':
+ return aa_ubfm(row.rd, row.ra, row.imm, 63)
+ return aa_sbfm(row.rd, row.ra, row.imm, 63)
+
+
+def encode_mem(_arch, row):
+ return aa_mem(row.op, row.rt, row.rn, row.off)
+
+
+def encode_ldarg(_arch, row):
+ return aa_mem('LD', 'scratch', 'sp', 8) + aa_mem('LD', row.rd, 'scratch', 16 + 8 * row.slot)
+
+
+def encode_branch_reg(_arch, row):
+ if row.kind == 'BR':
+ return aa_br(row.rs)
+ if row.kind == 'CALLR':
+ return aa_blr(row.rs)
+ if row.kind == 'TAILR':
+ leave = encode_nullary(_arch, Nullary('LEAVE', 'LEAVE'))
+ return leave + aa_br(row.rs)
+ raise ValueError(f'unknown branch-reg kind: {row.kind}')
+
+
+def encode_condb(_arch, row):
+ return aa_cmp_skip(row.op, row.ra, row.rb) + aa_br('br')
+
+
+def encode_condbz(_arch, row):
+ a = NAT[row.ra]
+ br_hex = aa_br('br')
+ if row.op == 'BEQZ':
+ return le32(0xB5000000 | (2 << 5) | a) + br_hex
+ if row.op == 'BNEZ':
+ return le32(0xB4000000 | (2 << 5) | a) + br_hex
+ cmp_zero = le32(0xEB1F001F | (a << 5))
+ bge = le32(0x54000040 | 10)
+ return cmp_zero + bge + br_hex
+
+
+def encode_enter(arch, row):
+ frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
+ return (
+ aa_add_imm('sp', 'sp', frame_bytes, sub=True)
+ + aa_mem('ST', 'lr', 'sp', 0)
+ + aa_add_imm('x8', 'sp', frame_bytes, sub=False)
+ + aa_mem('ST', 'x8', 'sp', 8)
+ )
+
+
+def encode_nullary(_arch, row):
+ if row.kind == 'B':
+ return aa_br('br')
+ if row.kind == 'CALL':
+ return aa_blr('br')
+ if row.kind == 'RET':
+ return aa_ret()
+ if row.kind == 'LEAVE':
+ return (
+ aa_mem('LD', 'lr', 'sp', 0)
+ + aa_mem('LD', 'x8', 'sp', 8)
+ + aa_mov_rr('sp', 'x8')
+ )
+ if row.kind == 'TAIL':
+ leave = encode_nullary(_arch, Nullary('LEAVE', 'LEAVE'))
+ return leave + aa_br('br')
+ if row.kind == 'SYSCALL':
+ return ''.join([
+ aa_mov_rr('x8', 'a0'),
+ aa_mov_rr('save0', 'a1'),
+ aa_mov_rr('save1', 'a2'),
+ aa_mov_rr('save2', 'a3'),
+ aa_mov_rr('a0', 'save0'),
+ aa_mov_rr('a1', 'save1'),
+ aa_mov_rr('a2', 'save2'),
+ aa_mov_rr('a3', 't0'),
+ aa_mov_rr('x4', 's0'),
+ aa_mov_rr('x5', 's1'),
+ le32(0xD4000001),
+ aa_mov_rr('a1', 'save0'),
+ aa_mov_rr('a2', 'save1'),
+ aa_mov_rr('a3', 'save2'),
+ ])
+ raise ValueError(f'unknown nullary kind: {row.kind}')
+
+
+ENCODERS = {
+ Li: encode_li,
+ La: encode_la,
+ LaBr: encode_labr,
+ Mov: encode_mov,
+ Rrr: encode_rrr,
+ AddI: encode_addi,
+ LogI: encode_logi,
+ ShiftI: encode_shifti,
+ Mem: encode_mem,
+ LdArg: encode_ldarg,
+ Nullary: encode_nullary,
+ BranchReg: encode_branch_reg,
+ CondB: encode_condb,
+ CondBZ: encode_condbz,
+ Enter: encode_enter,
+}
+
+
+register_arch(
+ ArchDef(
+ name='aarch64',
+ word_bytes=8,
+ stack_align=16,
+ syscall_numbers=SYSCALL_NUMBERS,
+ encoders=ENCODERS,
+ )
+)
diff --git a/p1/common.py b/p1/common.py
@@ -0,0 +1,66 @@
+from collections import namedtuple
+
+
+ArchDef = namedtuple(
+ 'ArchDef',
+ 'name word_bytes stack_align syscall_numbers encoders',
+)
+
+Banner = namedtuple('Banner', 'text')
+Literal = namedtuple('Literal', 'name hex_by_arch')
+Nullary = namedtuple('Nullary', 'name kind')
+Li = namedtuple('Li', 'name rd')
+La = namedtuple('La', 'name rd')
+LaBr = namedtuple('LaBr', 'name')
+Mov = namedtuple('Mov', 'name rd rs')
+Rrr = namedtuple('Rrr', 'name op rd ra rb')
+AddI = namedtuple('AddI', 'name rd ra imm')
+LogI = namedtuple('LogI', 'name op rd ra imm')
+ShiftI = namedtuple('ShiftI', 'name op rd ra imm')
+Mem = namedtuple('Mem', 'name op rt rn off')
+LdArg = namedtuple('LdArg', 'name rd slot')
+BranchReg = namedtuple('BranchReg', 'name kind rs')
+CondB = namedtuple('CondB', 'name op ra rb')
+CondBZ = namedtuple('CondBZ', 'name op ra')
+Enter = namedtuple('Enter', 'name size')
+
+
+ARCH_REGISTRY = {}
+
+
+def register_arch(arch):
+ if arch.name in ARCH_REGISTRY:
+ raise RuntimeError(f'duplicate arch registration: {arch.name}')
+ ARCH_REGISTRY[arch.name] = arch
+
+
+def get_arch(name):
+ return ARCH_REGISTRY[name]
+
+
+def registered_arches():
+ return tuple(sorted(ARCH_REGISTRY))
+
+
+def byte(n):
+ return f'{n & 0xFF:02X}'
+
+
+def le32(n):
+ return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper()
+
+
+def le64(n):
+ return (n & 0xFFFFFFFFFFFFFFFF).to_bytes(8, 'little').hex().upper()
+
+
+def word_hex(word_bytes, n):
+ if word_bytes == 4:
+ return le32(n)
+ if word_bytes == 8:
+ return le64(n)
+ raise ValueError(f'unsupported word size: {word_bytes}')
+
+
+def round_up(align, n):
+ return ((n + align - 1) // align) * align
diff --git a/p1/llvm_disasm_aarch64.py b/p1/llvm_disasm_aarch64.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""Annotate p1_aarch64.M1 DEFINE rows with llvm-mc disassembly.
+
+Reads generated DEFINE lines from p1_aarch64.M1, disassembles code-bearing rows
+with llvm-mc, and prints the DEFINE name beside the native aarch64 mnemonic
+sequence. Literal data rows such as syscall-number constants are labeled as data
+instead of being treated as instructions.
+"""
+
+import argparse
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+
+DEFINE_RE = re.compile(r'^DEFINE\s+(\S+)\s+([0-9A-Fa-f]+)\s*$')
+
+
+def repo_root():
+ return Path(__file__).resolve().parent.parent
+
+
+def default_input_path():
+ return repo_root() / 'build' / 'p1v2' / 'aarch64' / 'p1_aarch64.M1'
+
+
+def ensure_generated(path: Path):
+ if path.exists():
+ return
+ gen = repo_root() / 'p1' / 'p1_gen.py'
+ proc = subprocess.run(
+ [sys.executable, str(gen), '--arch', 'aarch64', str(path.parent.parent)],
+ check=True,
+ cwd=repo_root(),
+ capture_output=True,
+ text=True,
+ )
+ if proc.stderr:
+ sys.stderr.write(proc.stderr)
+
+
+def parse_rows(path: Path):
+ rows = []
+ for line in path.read_text().splitlines():
+ match = DEFINE_RE.match(line)
+ if not match:
+ continue
+ name, hex_bytes = match.groups()
+ rows.append((name, hex_bytes.upper()))
+ return rows
+
+
+def is_data_row(name: str):
+ return name.startswith('sys_')
+
+
+def disassemble_code_rows(rows, llvm_mc):
+ code_rows = [(name, hex_bytes) for name, hex_bytes in rows if not is_data_row(name)]
+ if not code_rows:
+ return {}
+
+ payload = '\n'.join(hex_bytes for _, hex_bytes in code_rows) + '\n'
+ proc = subprocess.run(
+ [llvm_mc, '--disassemble', '--hex', '--arch=aarch64'],
+ input=payload,
+ text=True,
+ capture_output=True,
+ check=True,
+ )
+ inst_lines = [line.strip() for line in proc.stdout.splitlines() if line.strip()]
+
+ out = {}
+ index = 0
+ for name, hex_bytes in code_rows:
+ words = len(hex_bytes) // 8
+ out[name] = inst_lines[index:index + words]
+ index += words
+
+ if index != len(inst_lines):
+ raise RuntimeError(
+ f'llvm output row split mismatch: consumed {index}, got {len(inst_lines)}'
+ )
+ return out
+
+
+def format_rows(rows, disasm_by_name, show_bytes):
+ name_width = max(len(name) for name, _ in rows) if rows else 0
+ out = []
+ for name, hex_bytes in rows:
+ if is_data_row(name):
+ rhs = f'data 0x{hex_bytes}'
+ out.append(f'{name:<{name_width}} {rhs}')
+ continue
+
+ insns = disasm_by_name.get(name, [])
+ if not insns:
+ out.append(f'{name:<{name_width}} <no disassembly>')
+ continue
+
+ prefix = name.ljust(name_width)
+ byte_col = f' {hex_bytes}' if show_bytes else ''
+ out.append(f'{prefix}{byte_col} {insns[0]}')
+ for insn in insns[1:]:
+ spacer = ' ' * name_width
+ if show_bytes:
+ spacer += ' ' + ' ' * len(hex_bytes)
+ out.append(f'{spacer} {insn}')
+ return '\n'.join(out)
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ 'input',
+ nargs='?',
+ default=str(default_input_path()),
+ help='path to p1_aarch64.M1',
+ )
+ parser.add_argument(
+ '--llvm-mc',
+ default=os.environ.get('LLVM_MC', 'llvm-mc'),
+ help='path to llvm-mc',
+ )
+ parser.add_argument(
+ '--grep',
+ default='',
+ help='only include DEFINE names containing this substring',
+ )
+ parser.add_argument(
+ '--limit',
+ type=int,
+ default=0,
+ help='maximum number of DEFINE rows to print (0 = all)',
+ )
+ parser.add_argument(
+ '--show-bytes',
+ action='store_true',
+ help='include raw DEFINE bytes next to the name',
+ )
+ args = parser.parse_args()
+
+ path = Path(args.input)
+ ensure_generated(path)
+ rows = parse_rows(path)
+ if args.grep:
+ rows = [(name, hex_bytes) for name, hex_bytes in rows if args.grep in name]
+ if args.limit:
+ rows = rows[:args.limit]
+
+ disasm_by_name = disassemble_code_rows(rows, args.llvm_mc)
+ print(format_rows(rows, disasm_by_name, args.show_bytes))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/p1/p1_gen.py b/p1/p1_gen.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""Generate P1 v2 DEFINE tables.
+
+This is a fresh generator for docs/P1v2.md. The ISA surface is described by
+plain namedtuple rows, and each backend registers a simple row-type -> encoder
+mapping. The emitted immediate/offset domains are still curated tables rather
+than the full theoretical spec space, so extending coverage is a one-line data
+edit instead of an architecture rewrite.
+
+Usage:
+ python3 p1/p1_gen.py [--arch ARCH] [build-root]
+ python3 p1/p1_gen.py --check [--arch ARCH] [build-root]
+ python3 p1/p1_gen.py --list-archs
+"""
+
+import os
+import sys
+from itertools import product
+
+from common import (
+ AddI,
+ Banner,
+ BranchReg,
+ CondB,
+ CondBZ,
+ Enter,
+ La,
+ LaBr,
+ LdArg,
+ Li,
+ Literal,
+ LogI,
+ Mem,
+ Mov,
+ Nullary,
+ Rrr,
+ ShiftI,
+ get_arch,
+ registered_arches,
+ word_hex,
+)
+
+import aarch64 # noqa: F401 - imported for arch registration side effects
+
+
+P1_GPRS = ('a0', 'a1', 'a2', 'a3', 't0', 's0', 's1')
+P1_BASES = P1_GPRS + ('sp',)
+
+RRR_OPS = ('ADD', 'SUB', 'AND', 'OR', 'XOR', 'SHL', 'SHR', 'SAR', 'MUL', 'DIV', 'REM')
+LOGI_OPS = ('ANDI', 'ORI')
+SHIFT_OPS = ('SHLI', 'SHRI', 'SARI')
+MEM_OPS = ('LD', 'ST', 'LB', 'SB')
+CONDB_OPS = ('BEQ', 'BNE', 'BLT')
+CONDBZ_OPS = ('BEQZ', 'BNEZ', 'BLTZ')
+
+ADDI_IMMS = (
+ -2048, -1024, -256, -128, -64, -48, -32, -24, -16, -12, -8, -7, -6,
+ -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16, 24, 32, 40,
+ 48, 63, 64, 127, 128, 255, 256, 512, 1024, 2047,
+)
+
+LOGI_IMMS = (
+ -1, 0, 1, 2, 3, 4, 6, 7, 8, 15, 16, 31, 32, 63, 64, 127, 255, 511, 1023,
+ 2047,
+)
+
+SHIFT_IMMS = tuple(range(64))
+
+MEM_OFFS = (
+ -256, -128, -64, -48, -32, -24, -16, -8, -1, 0, 1, 7, 8, 15, 16, 24, 32,
+ 40, 48, 56, 64, 128, 255,
+)
+
+LDARG_SLOTS = tuple(range(32))
+ENTER_SIZES = tuple(range(0, 129))
+
+
+HEADER = """## p1_{arch}.M1 — GENERATED by p1/p1_gen.py. Do not edit by hand.
+##
+## This table targets the P1 v2 ISA described in docs/P1v2.md.
+## Row shapes are shared; per-arch lowering lives in p1/<arch>.py.
+"""
+
+
+def imm_suffix(imm):
+ return f'NEG{-imm}' if imm < 0 else str(imm)
+
+
+def rows(arch):
+ out = []
+
+ out.append(Banner('Materialization'))
+ for rd in P1_GPRS:
+ out.append(Li(name=f'LI_{rd.upper()}', rd=rd))
+ for rd in P1_GPRS:
+ out.append(La(name=f'LA_{rd.upper()}', rd=rd))
+ out.append(LaBr(name='LA_BR'))
+
+ out.append(Banner('Moves'))
+ for rd, rs in product(P1_GPRS, P1_GPRS):
+ out.append(Mov(name=f'MOV_{rd.upper()}_{rs.upper()}', rd=rd, rs=rs))
+ for rd in P1_GPRS:
+ out.append(Mov(name=f'MOV_{rd.upper()}_SP', rd=rd, rs='sp'))
+
+ out.append(Banner('Register Arithmetic'))
+ for op, rd, ra, rb in product(RRR_OPS, P1_GPRS, P1_GPRS, P1_GPRS):
+ out.append(Rrr(name=f'{op}_{rd.upper()}_{ra.upper()}_{rb.upper()}',
+ op=op, rd=rd, ra=ra, rb=rb))
+
+ out.append(Banner('Immediate Arithmetic'))
+ for rd, ra, imm in product(P1_GPRS, P1_GPRS, ADDI_IMMS):
+ out.append(AddI(name=f'ADDI_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
+ rd=rd, ra=ra, imm=imm))
+ for op, rd, ra, imm in product(LOGI_OPS, P1_GPRS, P1_GPRS, LOGI_IMMS):
+ out.append(LogI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}',
+ op=op, rd=rd, ra=ra, imm=imm))
+ for op, rd, ra, imm in product(SHIFT_OPS, P1_GPRS, P1_GPRS, SHIFT_IMMS):
+ out.append(ShiftI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm}',
+ op=op, rd=rd, ra=ra, imm=imm))
+
+ out.append(Banner('Memory'))
+ for op, rt, rn, off in product(MEM_OPS, P1_GPRS, P1_BASES, MEM_OFFS):
+ out.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{imm_suffix(off)}',
+ op=op, rt=rt, rn=rn, off=off))
+
+ out.append(Banner('ABI Access'))
+ for rd, slot in product(P1_GPRS, LDARG_SLOTS):
+ out.append(LdArg(name=f'LDARG_{rd.upper()}_{slot}', rd=rd, slot=slot))
+
+ out.append(Banner('Branches'))
+ out.append(Nullary(name='B', kind='B'))
+ for rs in P1_GPRS:
+ out.append(BranchReg(name=f'BR_{rs.upper()}', kind='BR', rs=rs))
+ for op, ra, rb in product(CONDB_OPS, P1_GPRS, P1_GPRS):
+ out.append(CondB(name=f'{op}_{ra.upper()}_{rb.upper()}', op=op, ra=ra, rb=rb))
+ for op, ra in product(CONDBZ_OPS, P1_GPRS):
+ out.append(CondBZ(name=f'{op}_{ra.upper()}', op=op, ra=ra))
+
+ out.append(Banner('Calls And Returns'))
+ out.append(Nullary(name='CALL', kind='CALL'))
+ out.append(Nullary(name='RET', kind='RET'))
+ out.append(Nullary(name='TAIL', kind='TAIL'))
+ for rs in P1_GPRS:
+ out.append(BranchReg(name=f'CALLR_{rs.upper()}', kind='CALLR', rs=rs))
+ for rs in P1_GPRS:
+ out.append(BranchReg(name=f'TAILR_{rs.upper()}', kind='TAILR', rs=rs))
+
+ out.append(Banner('Frame Management'))
+ for size in ENTER_SIZES:
+ out.append(Enter(name=f'ENTER_{size}', size=size))
+ out.append(Nullary(name='LEAVE', kind='LEAVE'))
+
+ out.append(Banner('System'))
+ out.append(Nullary(name='SYSCALL', kind='SYSCALL'))
+ for name, number in sorted(arch.syscall_numbers.items()):
+ out.append(Literal(name=name, hex_by_arch={arch.name: word_hex(arch.word_bytes, number)}))
+
+ return out
+
+
+def lower_name(name):
+ low = name.lower()
+ head, sep, rest = low.partition('_')
+ if not sep:
+ return low
+ if '_' not in rest:
+ return low
+ return f'{head}_{rest.replace("_", ",")}'
+
+
+def encode_row(arch, row):
+ if isinstance(row, Literal):
+ return row.hex_by_arch[arch.name]
+ encoder = arch.encoders[type(row)]
+ return encoder(arch, row)
+
+
+def emit(arch_name):
+ arch = get_arch(arch_name)
+ out = [HEADER.format(arch=arch.name).rstrip(), '']
+ seen = set()
+ for row in rows(arch):
+ if isinstance(row, Banner):
+ out.append('')
+ out.append(f'## ---- {row.text}')
+ continue
+ name = lower_name(row.name)
+ if name in seen:
+ raise RuntimeError(f'duplicate DEFINE: {name}')
+ seen.add(name)
+ out.append(f'DEFINE {name} {encode_row(arch, row)}')
+ out.append('')
+ return '\n'.join(out)
+
+
+def parse_args(argv):
+ check = False
+ archs = []
+ positional = []
+ i = 0
+ while i < len(argv):
+ arg = argv[i]
+ if arg == '--check':
+ check = True
+ elif arg == '--list-archs':
+ print('\n'.join(registered_arches()))
+ sys.exit(0)
+ elif arg == '--arch':
+ i += 1
+ if i >= len(argv):
+ raise SystemExit('--arch requires a value')
+ archs.append(argv[i])
+ else:
+ positional.append(arg)
+ i += 1
+ build_root = positional[0] if positional else os.path.join('build', 'p1v2')
+ if not archs:
+ archs = list(registered_arches())
+ return check, archs, build_root
+
+
+def main(argv=None):
+ check, archs, build_root = parse_args(argv or sys.argv[1:])
+ had_diff = False
+
+ for arch_name in archs:
+ arch = get_arch(arch_name)
+ dest_dir = os.path.join(build_root, arch.name)
+ path = os.path.join(dest_dir, f'p1_{arch.name}.M1')
+ content = emit(arch.name)
+ if check:
+ try:
+ with open(path) as f:
+ existing = f.read()
+ except FileNotFoundError:
+ existing = ''
+ if existing != content:
+ sys.stderr.write(f'DIFF: {path}\n')
+ had_diff = True
+ continue
+ os.makedirs(dest_dir, exist_ok=True)
+ with open(path, 'w') as f:
+ f.write(content)
+ print(f'wrote {path} ({len(content)} bytes)')
+
+ if check and had_diff:
+ sys.exit(1)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/post.md b/post.md
@@ -0,0 +1,247 @@
+# Title
+
+Your compiler compiles your code, but what compiled the compiler? And the one
+before that?
+
+Ken Thompson freaked everyone out in 1983 by describing a nasty
+self-replicating attack on compilers themselves that reminded folks that it
+might be a good idea to have a "full bootstrap" from as small a binary seed as
+could be managed and then audited source from there forward to the toolchains
+we know and (maybe-kinda) love today.
+
+Jeremiah Orians and Jan Nieuwenhuizen delivered exactly that to the world in
+2023, going from `hex0-seed`, a few hundred byte seed binary that is the "hex0"
+compiler, through Fabrice Bellard's Tiny C Compiler (TCC), and then up through
+GCC and the rest of the stack. Truly awesome work.
+
+Between "hex0" and TCC, there's a minimal assembler and linker in M0 and hex2,
+then a "subset C" compiler that builds out the Mes Scheme ecosystem, then
+a Mes C compiler that compiles TCC.
+
+From seed to M0 and hex2 is ~2KLOC. The "middle" stage is ~25KLOC to get to
+Mes Scheme and a collection of build and packaging tools. And the "late" stage
+of the Mes C compiler and TCC sources are another ~25KLOC.
+
+I find M0 and hex2 to be very cute and everything after that to be rather
+goopy. So, let's dive into the cuteness!
+
+Getting there is fairly straightforward:
+
+```
+hex0-seed = given
+
+hex0 = hex0-seed(hex0.hex0) # hex0 compiles itself from source
+hex1 = hex0(hex1.hex0)
+hex2 = hex1(hex2.hex1)
+catm = hex2(catm.hex2)
+M0 = hex2(catm(ELF.hex2, M0.hex2))
+```
+
+**hex0**: the minimal seed hex compiler; it turns pairs of hexadecimal digits
+into raw bytes. Supports # and ; line comments.
+
+**hex1**: a small extension of hex0 that adds the first notion of symbolic
+control flow. Features everything in hex0, single-character labels, one size
+of relative jump/offset encoding.
+
+**hex2**: the final "hex language" and first real linker-like stage in the
+chain. Features everything in hex1, labels of arbitrary length, relative
+references of multiple sizes (!, @, %), absolute references ($, &), enough
+symbol/address resolution to link later stages.
+
+**catm**: a minimal file concatenation utility, take N files and output 1.
+
+**M0**: the first assembler built on top of hex2. Features DEFINE substitution
+of names to hex bytes, immediate values in various sizes/formats, symbolic
+assembly over hex2.
+
+M0 programs are architecture-specific because the DEFINEd set of mnemonics and
+their byte-level encodings are architecture-specific. So that means that the
+subset C compiler, defined in M0, has to be written once per architecture.
+That's a drag.
+
+I thought it'd be rather fun to play around at this low level and see if we
+could build off M0 in a way that might get us to a functioning C compiler
+through a path that cuts down on the amount of code in that "middle" layer.
+
+So I present to you "P1", a portable pseudo-ISA targetable via M1 DEFINEs, and
+"M1M", a macro expander for M1, written in P1.
+
+P1
+
+Registers:
+
+- `a0`–`a3` — argument registers. Also caller-saved general registers.
+- `t0` — caller-saved temporary.
+- `s0`–`s1` — callee-saved general registers.
+- `sp` — stack pointer.
+
+Ops:
+- Materialization: `LI`, `LA`, `LA_BR`
+- Moves: `MOV`
+- Arithmetic: `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`, `MUL`,
+ `DIV`, `REM`
+- Immediate arithmetic: `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI`
+- Memory: `LD`, `ST`, `LB`, `SB`
+- Branching: `B`, `BR`, `BEQ`, `BNE`, `BLT`, `BEQZ`, `BNEZ`, `BLTZ`
+- Calls / returns: `CALL`, `CALLR`, `RET`, `TAIL`, `TAILR`
+- Frame management: `ENTER`, `LEAVE`
+- ABI access: `LDARG`
+- System: `SYSCALL`
+
+It's a deliberately dumb little RISC: a handful of visible registers, a hidden
+branch/call target register behind `LA_BR`, and a generator that spits out the
+per-arch `DEFINE` tables for 32-bit and 64-bit x86, ARM, and RISC-V.
+
+Calling convention:
+
+- `a0`-`a3` hold the first four argument words.
+- Extra arguments live in an incoming stack-argument area and are read with
+ `LDARG`.
+- `a0` is also the one-word return register.
+- `a0`-`a3` and `t0` are caller-saved; `s0`-`s1` and `sp` are callee-saved.
+- If you need a frame, `ENTER` builds the standard one and `LEAVE` tears it
+ down.
+- Stack-passed outgoing args are staged in frame-local space before `CALL`.
+- Wider-than-one-word returns use the usual hidden pointer trick: caller passes
+ a writable result buffer in `a0`, real args slide over by one, callee returns
+ that same pointer in `a0`.
+
+That's enough to write a portable macro expander, a Lisp, or a tiny C
+compiler once, then retarget by swapping out the generated M1 defs instead of
+rewriting the whole program per architecture. That cuts down auditable code at
+the bottom of the stack quite a bit. And it may give us a shorter path from
+M0 to TCC (but no promises). Mostly it's just a cute base to work on.
+
+```
+;; Bootstrap from seed
+(define hex0 (hex0-seed hex0.hex0))
+(define hex1 (hex0 hex1.hex0))
+(define hex2 (hex1 hex2.hex1))
+(define catm (hex2 catm.hex2))
+(define M0 (hex2 (catm ELF.hex2 M0.hex2)))
+
+(defn link (hex2-src) (hex2 (catm ELF.hex2 hex2-src)))
+
+;; Compile P1 source, no macros
+(defn p1compile0 (src) (M0 (catm P1.M1 src)))
+
+;; Bootstrap the macro expander once without macros.
+(define m1m (link (p1compile0 m1m.M1)))
+
+;; Normal P1 build: expand macros, then assemble/link.
+(defn p1compile (src) (M0 (catm P1.M1 (m1m (catm P1M.M1 src)))))
+(defn p1exe (src) (link (p1compile src)))
+```
+
+So, what does P1 code with the macro expansion base look like?
+
+```
+## copy.M1
+##
+## And symbolic constants:
+## SYS_open SYS_read SYS_write SYS_close SYS_exit O_RDONLY
+## WORD
+## SAVE_S0 SAVE_S1 BUF_OFF COPY_LOCALS BUF_SIZE
+##
+## Entry convention:
+## a0 = argc
+## a1 = argv
+
+DEFINE BUF_SIZE %128
+DEFINE SAVE_S0 %16
+DEFINE SAVE_S1 %24
+DEFINE COPY_LOCALS %144
+
+:main
+ %li(t0, 2)
+ %blt(a0, t0, usage) ; if argc < 2, exit 2
+
+ %ld(t0, a1, WORD) ; t0 = argv[1]
+
+ %li(a0, SYS_open)
+ %mov(a1, t0) ; path
+ %li(a2, O_RDONLY) ; flags
+ %li(a3, 0) ; mode
+ %syscall()
+ %bltz(a0, fail) ; open failed
+
+ %mov(s0, a0) ; s0 = fd
+ %mov(a0, s0)
+ %call(copy_to_stdout) ; returns 0 or -1 in a0
+ %bltz(a0, close_fail)
+
+ %li(a0, SYS_close)
+ %mov(a1, s0)
+ %syscall()
+
+ %li(a0, SYS_exit)
+ %li(a1, 0)
+ %syscall()
+
+:close_fail
+ %li(a0, SYS_close)
+ %mov(a1, s0)
+ %syscall()
+
+:fail
+ %li(a0, SYS_exit)
+ %li(a1, 1)
+ %syscall()
+
+:usage
+ %li(a0, SYS_exit)
+ %li(a1, 2)
+ %syscall()
+
+
+:copy_to_stdout
+ %enter(COPY_LOCALS)
+ %st(sp, SAVE_S0, s0)
+ %st(sp, SAVE_S1, s1)
+
+ %mov(s0, a0) ; s0 = input fd
+
+:read_loop
+ %li(a0, SYS_read)
+ %mov(a1, s0) ; fd
+ %mov(a2, sp)
+ %addi(a2, a2, BUF_OFF) ; buf = sp + BUF_OFF
+ %li(a3, BUF_SIZE) ; count
+ %syscall()
+
+ %beqz(a0, done) ; EOF
+ %bltz(a0, io_error) ; read error
+
+ %mov(s1, a0) ; remaining byte count
+ %mov(t0, sp)
+ %addi(t0, t0, BUF_OFF) ; t0 = current write ptr
+
+:write_loop
+ %li(a0, SYS_write)
+ %li(a1, 1) ; stdout
+ %mov(a2, t0) ; buf
+ %mov(a3, s1) ; count
+ %syscall()
+
+ %bltz(a0, io_error) ; write error
+
+ %add(t0, t0, a0) ; buf += nwritten
+ %sub(s1, s1, a0) ; remaining -= nwritten
+ %bnez(s1, write_loop)
+ %b(read_loop)
+
+:done
+ %li(a0, 0)
+ %ld(s0, sp, SAVE_S0)
+ %ld(s1, sp, SAVE_S1)
+ %leave()
+ %ret()
+
+:io_error
+ %li(a0, -1)
+ %ld(s0, sp, SAVE_S0)
+ %ld(s1, sp, SAVE_S1)
+ %leave()
+ %ret()
+```
diff --git a/src/m1macro.c b/src/m1macro.c
@@ -0,0 +1,886 @@
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Tiny single-pass M1 macro expander.
+ *
+ * Syntax:
+ * %macro NAME(a, b)
+ * ... body ...
+ * %endm
+ *
+ * %NAME(x, y) function-like macro call
+ * ## token pasting inside macro bodies
+ * @loop local token inside a macro body
+ * :@loop / &@loop local label / reference shorthand
+ * :param / ¶m prefix a single-token parameter with ':' or '&'
+ *
+ * Notes:
+ * - Macros are define-before-use. There is no prescan.
+ * - Expansion rescans by pushing expanded tokens back through the same loop.
+ * - There is no cycle detection. Recursive macros will loop until a limit.
+ * - Only recognized %NAME(...) calls expand. Other text passes through.
+ * - Output formatting is normalized to tokens plus '\n', not preserved.
+ */
+
+#define MAX_INPUT 262144
+#define MAX_OUTPUT 524288
+#define MAX_TEXT 524288
+#define MAX_TOKENS 65536
+#define MAX_MACROS 256
+#define MAX_PARAMS 16
+#define MAX_BODY_TOKENS 4096
+#define MAX_EXPAND 65536
+#define MAX_STACK 64
+
+enum {
+ TOK_WORD,
+ TOK_STRING,
+ TOK_NEWLINE,
+ TOK_LPAREN,
+ TOK_RPAREN,
+ TOK_COMMA,
+ TOK_PASTE
+};
+
+struct Token {
+ int kind;
+ int start;
+ int len;
+ int line;
+};
+
+struct Macro {
+ int name_start;
+ int name_len;
+ int line;
+ int param_count;
+ int param_start[MAX_PARAMS];
+ int param_len[MAX_PARAMS];
+ struct Token body[MAX_BODY_TOKENS];
+ int body_count;
+};
+
+struct Stream {
+ struct Token *toks;
+ int count;
+ int pos;
+ int line_start;
+ int pool_mark;
+};
+
+static char input_buf[MAX_INPUT + 1];
+static char output_buf[MAX_OUTPUT + 1];
+static char text_buf[MAX_TEXT];
+
+static struct Token source_tokens[MAX_TOKENS];
+static struct Token expand_pool[MAX_EXPAND];
+static struct Macro macros[MAX_MACROS];
+static struct Stream streams[MAX_STACK];
+
+static int text_used;
+static int source_count;
+static int pool_used;
+static int macro_count;
+static int local_id_next = 1;
+static int output_used;
+static int output_need_space;
+static int stream_top;
+
+static char error_buf[256];
+
+static int is_space_no_nl(int c)
+{
+ return c == ' ' || c == '\t' || c == '\r' || c == '\f' || c == '\v';
+}
+
+static int append_text_len(const char *s, int len)
+{
+ int start;
+
+ if (text_used + len + 1 > MAX_TEXT) {
+ snprintf(error_buf, sizeof(error_buf), "text buffer overflow");
+ return -1;
+ }
+ start = text_used;
+ memcpy(text_buf + text_used, s, (size_t)len);
+ text_used += len;
+ text_buf[text_used++] = '\0';
+ return start;
+}
+
+static int append_text_str(const char *s)
+{
+ return append_text_len(s, (int)strlen(s));
+}
+
+static int push_token(struct Token *buf, int *count, int max_count,
+ int kind, int start, int len, int line)
+{
+ if (*count >= max_count) {
+ snprintf(error_buf, sizeof(error_buf), "token buffer overflow");
+ return 0;
+ }
+ buf[*count].kind = kind;
+ buf[*count].start = start;
+ buf[*count].len = len;
+ buf[*count].line = line;
+ *count += 1;
+ return 1;
+}
+
+static int token_text_eq(const struct Token *tok, const char *s)
+{
+ int len = (int)strlen(s);
+
+ return tok->len == len && memcmp(text_buf + tok->start, s, (size_t)len) == 0;
+}
+
+static int span_eq(int start, int len, const struct Token *tok)
+{
+ return len == tok->len && memcmp(text_buf + start, text_buf + tok->start, (size_t)len) == 0;
+}
+
+static int lex_source(const char *src)
+{
+ int i = 0;
+ int line = 1;
+
+ while (src[i] != '\0') {
+ int start;
+ int text_start;
+ int len;
+
+ if (is_space_no_nl((unsigned char)src[i])) {
+ i++;
+ continue;
+ }
+ if (src[i] == '\n') {
+ if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+ TOK_NEWLINE, -1, 0, line)) {
+ return 0;
+ }
+ i++;
+ line++;
+ continue;
+ }
+ if (src[i] == '"' || src[i] == '\'') {
+ int quote = src[i];
+
+ start = i;
+ i++;
+ while (src[i] != '\0' && src[i] != quote) {
+ i++;
+ }
+ if (src[i] == quote) {
+ i++;
+ }
+ len = i - start;
+ text_start = append_text_len(src + start, len);
+ if (text_start < 0) {
+ return 0;
+ }
+ if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+ TOK_STRING, text_start, len, line)) {
+ return 0;
+ }
+ continue;
+ }
+ if (src[i] == '#' && src[i + 1] == '#') {
+ text_start = append_text_str("##");
+ if (text_start < 0) {
+ return 0;
+ }
+ if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+ TOK_PASTE, text_start, 2, line)) {
+ return 0;
+ }
+ i += 2;
+ continue;
+ }
+ if (src[i] == '#' || src[i] == ';') {
+ while (src[i] != '\0' && src[i] != '\n') {
+ i++;
+ }
+ continue;
+ }
+ if (src[i] == '(') {
+ text_start = append_text_str("(");
+ if (text_start < 0) {
+ return 0;
+ }
+ if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+ TOK_LPAREN, text_start, 1, line)) {
+ return 0;
+ }
+ i++;
+ continue;
+ }
+ if (src[i] == ')') {
+ text_start = append_text_str(")");
+ if (text_start < 0) {
+ return 0;
+ }
+ if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+ TOK_RPAREN, text_start, 1, line)) {
+ return 0;
+ }
+ i++;
+ continue;
+ }
+ if (src[i] == ',') {
+ text_start = append_text_str(",");
+ if (text_start < 0) {
+ return 0;
+ }
+ if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+ TOK_COMMA, text_start, 1, line)) {
+ return 0;
+ }
+ i++;
+ continue;
+ }
+
+ start = i;
+ while (src[i] != '\0' &&
+ !is_space_no_nl((unsigned char)src[i]) &&
+ src[i] != '\n' &&
+ src[i] != '#' &&
+ src[i] != ';' &&
+ src[i] != '(' &&
+ src[i] != ')' &&
+ src[i] != ',' &&
+ !(src[i] == '#' && src[i + 1] == '#')) {
+ i++;
+ }
+ len = i - start;
+ text_start = append_text_len(src + start, len);
+ if (text_start < 0) {
+ return 0;
+ }
+ if (!push_token(source_tokens, &source_count, MAX_TOKENS,
+ TOK_WORD, text_start, len, line)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int find_macro(const struct Token *tok)
+{
+ int i;
+ const char *s;
+
+ if (tok->kind != TOK_WORD) {
+ return -1;
+ }
+ if (tok->len < 2) {
+ return -1;
+ }
+ s = text_buf + tok->start;
+ if (s[0] != '%') {
+ return -1;
+ }
+ for (i = 0; i < macro_count; i++) {
+ if (tok->len - 1 == macros[i].name_len &&
+ memcmp(s + 1, text_buf + macros[i].name_start,
+ (size_t)macros[i].name_len) == 0) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+static int find_param(const struct Macro *m, const struct Token *tok)
+{
+ int i;
+
+ if (tok->kind != TOK_WORD) {
+ return -1;
+ }
+ for (i = 0; i < m->param_count; i++) {
+ if (span_eq(m->param_start[i], m->param_len[i], tok)) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+static int find_prefixed_param(const struct Macro *m, const struct Token *tok, char *prefix)
+{
+ int i;
+ const char *s;
+
+ if (tok->kind != TOK_WORD || tok->len < 2) {
+ return -1;
+ }
+ s = text_buf + tok->start;
+ if (s[0] != ':' && s[0] != '&') {
+ return -1;
+ }
+ for (i = 0; i < m->param_count; i++) {
+ if (tok->len - 1 == m->param_len[i] &&
+ memcmp(s + 1, text_buf + m->param_start[i], (size_t)m->param_len[i]) == 0) {
+ *prefix = s[0];
+ return i;
+ }
+ }
+ return -1;
+}
+
+static int rewrite_local(const struct Token *in, int local_id, struct Token *out)
+{
+ const char *src = text_buf + in->start;
+ char tmp[256];
+ int prefix_len = 0;
+ int name_start = 0;
+ int n;
+ int text_start;
+
+ if (in->kind != TOK_WORD) {
+ *out = *in;
+ return 1;
+ }
+ if (in->len >= 2 && (src[0] == ':' || src[0] == '&') && src[1] == '@') {
+ prefix_len = 1;
+ name_start = 2;
+ } else if (in->len >= 1 && src[0] == '@') {
+ prefix_len = 0;
+ name_start = 1;
+ } else {
+ *out = *in;
+ return 1;
+ }
+
+ n = snprintf(tmp, sizeof(tmp), "%.*s__mlocal_%d_%.*s",
+ prefix_len, src, local_id, in->len - name_start, src + name_start);
+ if (n < 0 || n >= (int)sizeof(tmp)) {
+ snprintf(error_buf, sizeof(error_buf), "local label too long at line %d", in->line);
+ return 0;
+ }
+ text_start = append_text_len(tmp, n);
+ if (text_start < 0) {
+ return 0;
+ }
+ out->kind = TOK_WORD;
+ out->start = text_start;
+ out->len = n;
+ out->line = in->line;
+ return 1;
+}
+
+static int push_pool_token(struct Token tok)
+{
+ if (pool_used >= MAX_EXPAND) {
+ snprintf(error_buf, sizeof(error_buf), "expansion pool overflow");
+ return 0;
+ }
+ expand_pool[pool_used++] = tok;
+ return 1;
+}
+
+static int emit_newline(void)
+{
+ if (output_used + 1 >= MAX_OUTPUT) {
+ snprintf(error_buf, sizeof(error_buf), "output buffer overflow");
+ return 0;
+ }
+ output_buf[output_used++] = '\n';
+ output_need_space = 0;
+ return 1;
+}
+
+static int emit_token(const struct Token *tok)
+{
+ if (output_need_space) {
+ if (output_used + 1 >= MAX_OUTPUT) {
+ snprintf(error_buf, sizeof(error_buf), "output buffer overflow");
+ return 0;
+ }
+ output_buf[output_used++] = ' ';
+ }
+ if (output_used + tok->len >= MAX_OUTPUT) {
+ snprintf(error_buf, sizeof(error_buf), "output buffer overflow");
+ return 0;
+ }
+ memcpy(output_buf + output_used, text_buf + tok->start, (size_t)tok->len);
+ output_used += tok->len;
+ output_need_space = 1;
+ return 1;
+}
+
+static int push_stream(struct Token *toks, int count, int pool_mark)
+{
+ struct Stream *s;
+
+ if (stream_top >= MAX_STACK) {
+ snprintf(error_buf, sizeof(error_buf), "stream stack overflow");
+ return 0;
+ }
+ s = &streams[stream_top++];
+ s->toks = toks;
+ s->count = count;
+ s->pos = 0;
+ s->line_start = 1;
+ s->pool_mark = pool_mark;
+ return 1;
+}
+
+static void pop_stream(void)
+{
+ if (stream_top <= 0) {
+ return;
+ }
+ stream_top--;
+ if (streams[stream_top].pool_mark >= 0) {
+ pool_used = streams[stream_top].pool_mark;
+ }
+}
+
+static int define_macro(struct Stream *s)
+{
+ struct Macro *m;
+ int i;
+ int line_start;
+
+ if (macro_count >= MAX_MACROS) {
+ snprintf(error_buf, sizeof(error_buf), "too many macros");
+ return 0;
+ }
+ m = ¯os[macro_count];
+ memset(m, 0, sizeof(*m));
+ m->line = s->toks[s->pos].line;
+ s->pos++;
+
+ if (s->pos >= s->count || s->toks[s->pos].kind != TOK_WORD) {
+ snprintf(error_buf, sizeof(error_buf), "expected macro name at line %d", m->line);
+ return 0;
+ }
+ m->name_start = s->toks[s->pos].start;
+ m->name_len = s->toks[s->pos].len;
+ for (i = 0; i < macro_count; i++) {
+ if (macros[i].name_len == m->name_len &&
+ memcmp(text_buf + macros[i].name_start,
+ text_buf + m->name_start, (size_t)m->name_len) == 0) {
+ snprintf(error_buf, sizeof(error_buf), "duplicate macro at line %d", m->line);
+ return 0;
+ }
+ }
+ s->pos++;
+
+ if (s->pos >= s->count || s->toks[s->pos].kind != TOK_LPAREN) {
+ snprintf(error_buf, sizeof(error_buf), "expected '(' after macro name at line %d", m->line);
+ return 0;
+ }
+ s->pos++;
+
+ if (s->pos < s->count && s->toks[s->pos].kind != TOK_RPAREN) {
+ while (1) {
+ if (m->param_count >= MAX_PARAMS) {
+ snprintf(error_buf, sizeof(error_buf), "too many params on macro at line %d", m->line);
+ return 0;
+ }
+ if (s->pos >= s->count || s->toks[s->pos].kind != TOK_WORD) {
+ snprintf(error_buf, sizeof(error_buf), "expected parameter name at line %d", m->line);
+ return 0;
+ }
+ m->param_start[m->param_count] = s->toks[s->pos].start;
+ m->param_len[m->param_count] = s->toks[s->pos].len;
+ m->param_count++;
+ s->pos++;
+ if (s->pos < s->count && s->toks[s->pos].kind == TOK_COMMA) {
+ s->pos++;
+ continue;
+ }
+ break;
+ }
+ }
+
+ if (s->pos >= s->count || s->toks[s->pos].kind != TOK_RPAREN) {
+ snprintf(error_buf, sizeof(error_buf), "expected ')' in macro header at line %d", m->line);
+ return 0;
+ }
+ s->pos++;
+
+ if (s->pos >= s->count || s->toks[s->pos].kind != TOK_NEWLINE) {
+ snprintf(error_buf, sizeof(error_buf), "expected newline after macro header at line %d", m->line);
+ return 0;
+ }
+ s->pos++;
+
+ line_start = 1;
+ while (s->pos < s->count) {
+ if (line_start &&
+ s->toks[s->pos].kind == TOK_WORD &&
+ token_text_eq(&s->toks[s->pos], "%endm")) {
+ while (s->pos < s->count && s->toks[s->pos].kind != TOK_NEWLINE) {
+ s->pos++;
+ }
+ if (s->pos < s->count && s->toks[s->pos].kind == TOK_NEWLINE) {
+ s->pos++;
+ }
+ s->line_start = 1;
+ macro_count++;
+ return 1;
+ }
+ if (m->body_count >= MAX_BODY_TOKENS) {
+ snprintf(error_buf, sizeof(error_buf), "macro body too large at line %d", m->line);
+ return 0;
+ }
+ m->body[m->body_count++] = s->toks[s->pos];
+ line_start = (s->toks[s->pos].kind == TOK_NEWLINE);
+ s->pos++;
+ }
+
+ snprintf(error_buf, sizeof(error_buf), "unterminated %%macro starting at line %d", m->line);
+ return 0;
+}
+
+static int parse_args(struct Stream *s, int lparen_pos,
+ int arg_starts[MAX_PARAMS], int arg_ends[MAX_PARAMS],
+ int *arg_count, int *end_pos)
+{
+ int i = lparen_pos + 1;
+ int depth = 1;
+ int cur = 0;
+
+ memset(arg_starts, 0, sizeof(int) * MAX_PARAMS);
+ memset(arg_ends, 0, sizeof(int) * MAX_PARAMS);
+ arg_starts[0] = i;
+
+ while (i < s->count) {
+ if (s->toks[i].kind == TOK_LPAREN) {
+ depth++;
+ i++;
+ continue;
+ }
+ if (s->toks[i].kind == TOK_RPAREN) {
+ depth--;
+ if (depth == 0) {
+ arg_ends[cur] = i;
+ *arg_count = cur + 1;
+ *end_pos = i + 1;
+ if (arg_starts[0] == arg_ends[0] && cur == 0) {
+ *arg_count = 0;
+ }
+ return 1;
+ }
+ i++;
+ continue;
+ }
+ if (s->toks[i].kind == TOK_COMMA && depth == 1) {
+ arg_ends[cur] = i;
+ cur++;
+ if (cur >= MAX_PARAMS) {
+ snprintf(error_buf, sizeof(error_buf), "too many macro args at line %d", s->toks[i].line);
+ return 0;
+ }
+ arg_starts[cur] = i + 1;
+ i++;
+ continue;
+ }
+ i++;
+ }
+
+ snprintf(error_buf, sizeof(error_buf), "unterminated macro call at line %d", s->toks[lparen_pos].line);
+ return 0;
+}
+
+static int append_arg(struct Stream *s, int start, int end, int single)
+{
+ int i;
+
+ if (start == end) {
+ snprintf(error_buf, sizeof(error_buf), "empty macro argument");
+ return 0;
+ }
+ if (single && end - start != 1) {
+ snprintf(error_buf, sizeof(error_buf), "pasted or prefixed arg must be one token");
+ return 0;
+ }
+ for (i = start; i < end; i++) {
+ if (!push_pool_token(s->toks[i])) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int append_prefixed_arg(struct Stream *s, int start, int end, char prefix)
+{
+ char tmp[512];
+ int n;
+ int text_start;
+ struct Token tok;
+
+ if (end - start != 1) {
+ snprintf(error_buf, sizeof(error_buf), "prefixed arg must be one token");
+ return 0;
+ }
+ n = snprintf(tmp, sizeof(tmp), "%c%.*s",
+ prefix, s->toks[start].len, text_buf + s->toks[start].start);
+ if (n < 0 || n >= (int)sizeof(tmp)) {
+ snprintf(error_buf, sizeof(error_buf), "prefixed arg too long");
+ return 0;
+ }
+ text_start = append_text_len(tmp, n);
+ if (text_start < 0) {
+ return 0;
+ }
+ tok.kind = TOK_WORD;
+ tok.start = text_start;
+ tok.len = n;
+ tok.line = s->toks[start].line;
+ return push_pool_token(tok);
+}
+
+static int paste_range(int start, int *count)
+{
+ int in = start;
+ int out = start;
+ int end = start + *count;
+
+ while (in < end) {
+ if (expand_pool[in].kind == TOK_PASTE) {
+ char tmp[512];
+ int n;
+ int text_start;
+
+ if (out == start || in + 1 >= end) {
+ snprintf(error_buf, sizeof(error_buf), "misplaced ## at line %d", expand_pool[in].line);
+ return 0;
+ }
+ if (expand_pool[out - 1].kind == TOK_NEWLINE ||
+ expand_pool[in + 1].kind == TOK_NEWLINE ||
+ expand_pool[out - 1].kind == TOK_PASTE ||
+ expand_pool[in + 1].kind == TOK_PASTE) {
+ snprintf(error_buf, sizeof(error_buf), "bad ## operand at line %d", expand_pool[in].line);
+ return 0;
+ }
+
+ n = snprintf(tmp, sizeof(tmp), "%.*s%.*s",
+ expand_pool[out - 1].len, text_buf + expand_pool[out - 1].start,
+ expand_pool[in + 1].len, text_buf + expand_pool[in + 1].start);
+ if (n < 0 || n >= (int)sizeof(tmp)) {
+ snprintf(error_buf, sizeof(error_buf), "pasted token too long at line %d", expand_pool[in].line);
+ return 0;
+ }
+ text_start = append_text_len(tmp, n);
+ if (text_start < 0) {
+ return 0;
+ }
+ expand_pool[out - 1].kind = TOK_WORD;
+ expand_pool[out - 1].start = text_start;
+ expand_pool[out - 1].len = n;
+ in += 2;
+ continue;
+ }
+ if (out != in) {
+ expand_pool[out] = expand_pool[in];
+ }
+ out++;
+ in++;
+ }
+
+ *count = out - start;
+ return 1;
+}
+
+static int expand_call(struct Stream *s, int macro_idx)
+{
+ struct Macro *m = ¯os[macro_idx];
+ int arg_starts[MAX_PARAMS];
+ int arg_ends[MAX_PARAMS];
+ int arg_count;
+ int end_pos;
+ int mark;
+ int count;
+ int local_id;
+ int i;
+
+ if (s->pos + 1 >= s->count || s->toks[s->pos + 1].kind != TOK_LPAREN) {
+ snprintf(error_buf, sizeof(error_buf), "internal macro call error");
+ return 0;
+ }
+ if (!parse_args(s, s->pos + 1, arg_starts, arg_ends, &arg_count, &end_pos)) {
+ return 0;
+ }
+ if (arg_count != m->param_count) {
+ snprintf(error_buf, sizeof(error_buf),
+ "macro '%.*s' expects %d args, got %d at line %d",
+ m->name_len, text_buf + m->name_start,
+ m->param_count, arg_count, s->toks[s->pos].line);
+ return 0;
+ }
+
+ s->pos = end_pos;
+ s->line_start = 0;
+ mark = pool_used;
+ local_id = local_id_next++;
+
+ for (i = 0; i < m->body_count; i++) {
+ int param_idx = find_param(m, &m->body[i]);
+ int pasted = 0;
+ char prefix = 0;
+ struct Token tok;
+
+ if (param_idx >= 0) {
+ pasted = (i > 0 && m->body[i - 1].kind == TOK_PASTE) ||
+ (i + 1 < m->body_count && m->body[i + 1].kind == TOK_PASTE);
+ if (!append_arg(s, arg_starts[param_idx], arg_ends[param_idx], pasted)) {
+ pool_used = mark;
+ return 0;
+ }
+ continue;
+ }
+
+ param_idx = find_prefixed_param(m, &m->body[i], &prefix);
+ if (param_idx >= 0) {
+ if (!append_prefixed_arg(s, arg_starts[param_idx], arg_ends[param_idx], prefix)) {
+ pool_used = mark;
+ return 0;
+ }
+ continue;
+ }
+
+ if (m->body[i].kind == TOK_PASTE) {
+ if (!push_pool_token(m->body[i])) {
+ pool_used = mark;
+ return 0;
+ }
+ continue;
+ }
+
+ if (!rewrite_local(&m->body[i], local_id, &tok)) {
+ pool_used = mark;
+ return 0;
+ }
+ if (!push_pool_token(tok)) {
+ pool_used = mark;
+ return 0;
+ }
+ }
+
+ count = pool_used - mark;
+ if (!paste_range(mark, &count)) {
+ pool_used = mark;
+ return 0;
+ }
+ pool_used = mark + count;
+ if (count == 0) {
+ return 1;
+ }
+ return push_stream(expand_pool + mark, count, mark);
+}
+
+static int process_tokens(void)
+{
+ if (!push_stream(source_tokens, source_count, -1)) {
+ return 0;
+ }
+
+ for (;;) {
+ struct Stream *s;
+ struct Token *tok;
+ int macro_idx;
+
+ if (stream_top <= 0) {
+ break;
+ }
+ s = &streams[stream_top - 1];
+ if (s->pos >= s->count) {
+ pop_stream();
+ continue;
+ }
+
+ tok = &s->toks[s->pos];
+
+ if (s->line_start &&
+ tok->kind == TOK_WORD &&
+ token_text_eq(tok, "%macro")) {
+ if (!define_macro(s)) {
+ return 0;
+ }
+ continue;
+ }
+
+ if (tok->kind == TOK_NEWLINE) {
+ s->pos++;
+ s->line_start = 1;
+ if (!emit_newline()) {
+ return 0;
+ }
+ continue;
+ }
+
+ macro_idx = find_macro(tok);
+ if (macro_idx >= 0 && s->pos + 1 < s->count && s->toks[s->pos + 1].kind == TOK_LPAREN) {
+ if (!expand_call(s, macro_idx)) {
+ return 0;
+ }
+ continue;
+ }
+
+ s->pos++;
+ s->line_start = 0;
+ if (!emit_token(tok)) {
+ return 0;
+ }
+ }
+
+ if (output_used >= MAX_OUTPUT) {
+ snprintf(error_buf, sizeof(error_buf), "output buffer overflow");
+ return 0;
+ }
+ output_buf[output_used] = '\0';
+ return 1;
+}
+
+int main(int argc, char **argv)
+{
+ FILE *in;
+ FILE *out;
+ size_t nread;
+
+ if (argc != 3) {
+ fprintf(stderr, "usage: %s input.M1 output.M1\n", argv[0]);
+ return 1;
+ }
+
+ in = fopen(argv[1], "rb");
+ if (in == NULL) {
+ perror(argv[1]);
+ return 1;
+ }
+ nread = fread(input_buf, 1, MAX_INPUT, in);
+ if (ferror(in)) {
+ perror(argv[1]);
+ fclose(in);
+ return 1;
+ }
+ fclose(in);
+ if (nread >= MAX_INPUT) {
+ fprintf(stderr, "input too large\n");
+ return 1;
+ }
+ input_buf[nread] = '\0';
+
+ if (!lex_source(input_buf) || !process_tokens()) {
+ fprintf(stderr, "m1macro: %s\n", error_buf);
+ return 1;
+ }
+
+ out = fopen(argv[2], "wb");
+ if (out == NULL) {
+ perror(argv[2]);
+ return 1;
+ }
+ if (fwrite(output_buf, 1, (size_t)output_used, out) != (size_t)output_used) {
+ perror(argv[2]);
+ fclose(out);
+ return 1;
+ }
+ fclose(out);
+ return 0;
+}