boot2

Playing with the boostrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs

commit c34eadeb5a6d1e48435c97d722fcc15a677a6a54
parent 1b11e9dca9efcdb09fd2917b96b240e43c8ec91d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 22 Apr 2026 17:51:03 -0700

p1v2

Diffstat:
Adocs/P1v2.md | 503+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ap1/aarch64.py | 354+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ap1/common.py | 66++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ap1/llvm_disasm_aarch64.py | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ap1/p1_gen.py | 251+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apost.md | 247+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/m1macro.c | 886+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 2464 insertions(+), 0 deletions(-)

diff --git a/docs/P1v2.md b/docs/P1v2.md @@ -0,0 +1,503 @@ +# P1 v2 + +## Scope + +P1 v2 is a portable pseudo-ISA for standalone executables. + +P1 v2 has two width variants: + +- **P1v2-64** — one word is one 64-bit integer or pointer value +- **P1v2-32** — one word is one 32-bit integer or pointer value + +Portable source may use any number of word arguments. The first four argument +registers are explicit, and additional argument words are passed through a +portable incoming stack-argument area. + +Portable source may directly return `0..1` word. Wider results use the +portable indirect-result convention described below. + +## Toolchain envelope + +P1 v2 must be assemblable through the existing `M0` + `hex2` path, with +`catm` as the only composition primitive between source or generated fragments. +The spec therefore assumes only the following toolchain features: + +- `M0`-level `DEFINE name hex_bytes` substitution +- raw byte emission +- labels and label references supported by `hex2` +- file concatenation via `catm` + +## Source notation + +This document describes instructions using ordinary assembly notation such as +`ADD rd, ra, rb`, `LD rd, [ra + off]`, or `CALL`. + +Because of the toolchain constraints above, portable source does not encode +most operands as textual instruction arguments. Instead, register choices, +inline immediate values, and small fixed parameters are fused into opcode +names, following the generated-table style used by `src/p1_gen.py`. + +So the notation in this document is descriptive rather than literal: + +- `ADD rd, ra, rb` means a family of fused register-specific opcodes +- `ADDI rd, ra, imm` means a family of fused register-and-immediate-specific + opcodes +- `ENTER size` means a family of fused byte-count-specific opcodes +- `LDARG rd, idx` means a family of fused register-and-argument-slot-specific + opcodes +- `BR rs`, `CALLR rs`, and `TAILR rs` mean register-specific control-flow + opcodes +- `LEAVE`, `CALL`, `RET`, `TAIL`, `B`, and `SYSCALL` remain operand-free + +Labels still appear in source where the toolchain supports them directly, such +as `LA rd, %label` and `LA_BR %label`. + +## Register Model + +### Exposed registers + +P1 v2 exposes the following source-level registers: + +- `a0`–`a3` — argument registers. Also caller-saved general registers. +- `t0` — caller-saved temporary. +- `s0`–`s1` — callee-saved general registers. +- `sp` — stack pointer. + +### Hidden registers + +The backend may reserve additional native registers that are never visible in +P1 source: + +- `br` — branch / call target mechanism +- backend-local scratch used entirely within one instruction expansion + +No hidden register may carry a live P1 value across an instruction boundary. +`br` need not be a native register on every target; it may be implemented by +another backend-private mechanism. + +## Calling Convention + +### Arguments and return values + +- In the direct-result convention, explicit argument words 0-3 live in + `a0-a3`. +- Additional explicit argument words live in the incoming stack-argument area + and are read with `LDARG`. +- In the direct-result convention, a one-word return value lives in `a0`. + +P1 v2 also defines an indirect-result convention for wider returns: + +- The caller passes a writable result buffer pointer in `a0`. +- Explicit argument words 0-2 then live in `a1-a3`. +- Additional explicit argument words still live in the incoming + stack-argument area. +- On return, `a0` holds the same result buffer pointer value. + +In the direct-result convention, incoming stack-argument slot `0` therefore +corresponds to explicit argument word `4`. In the indirect-result convention, +incoming stack-argument slot `0` corresponds to explicit argument word `3`. + +The indirect-result convention is the portable way to return aggregates or any +result wider than one word. + +### Register preservation + +Caller-saved: + +- `a0`–`a3` +- `t0` + +Callee-saved: + +- `s0`–`s1` +- `sp` + +### Call semantics + +A call is valid from any function, including a leaf. Call / return correctness +does not depend on establishing a frame first. + +If a function needs any incoming argument after making a call, it must save it +before the call. This matters in particular for argument 0, since `a0` is also +the return-value register. + +A call that passes any stack argument words requires the caller to have an +active standard frame with enough frame-local storage to stage those outgoing +words. + +The return address is hidden machine state. Portable source must not assume +that it lives in any exposed register. + +## Stack Convention + +### Call-boundary rule + +At every call boundary, the backend must satisfy the native C ABI stack +alignment rule for the target architecture. + +Portable source must therefore treat raw function-entry `sp` as opaque. It may +not assume that the low bits of `sp` have the same meaning on all targets +before a frame is established. + +### Incoming stack-argument area + +P1 v2 defines an abstract incoming stack-argument area for explicit argument +words that do not fit in registers. + +- Slot `0` is the first stack-passed explicit argument word. +- Slots are word-indexed, not byte-indexed. +- Portable source may access this area only through `LDARG`. + +`LDARG` is valid only when the current function has an active standard frame. +Therefore, a function that needs any incoming stack argument must establish a +standard frame before its first `LDARG`. + +Portable source must not assume any direct relationship between incoming +argument slots and raw function-entry `sp`. In particular, source must not try +to reconstruct stack arguments by manually indexing from `sp`; backend entry +layouts differ across targets. + +For a call with `m` stack-passed explicit argument words, the caller stages +those words in the first `m` words of its frame-local storage immediately +before the call: + +``` +[sp + 2*WORD + 0*WORD] = outgoing arg word 0 +[sp + 2*WORD + 1*WORD] = outgoing arg word 1 +... +``` + +At callee entry, those staged words become incoming argument slots `0..m-1`. +The backend is responsible for mapping between the caller's frame layout and +the callee's abstract incoming argument slots. + +Portable code that needs both ordinary locals and stack-passed outgoing +arguments must reserve enough total frame-local storage and keep the low- +addressed prefix available for outgoing argument staging across the call. + +### Standard frame layout + +Functions that need local stack storage use a standard frame layout. After +frame establishment: + +``` +[sp + 0*WORD] = saved return address +[sp + 1*WORD] = saved caller stack pointer +[sp + 2*WORD ... sp + 2*WORD + local_bytes - 1] = frame-local storage +... +``` + +Frame-local storage is byte-addressed. Portable code may use it for ordinary +locals, spilled callee-saved registers, and the caller-staged outgoing +stack-argument words described above. + +Total frame size is: + +`round_up(STACK_ALIGN, 2*WORD_SIZE + local_bytes)` + +Where: + +- `WORD_SIZE = 8` in P1v2-64 +- `WORD_SIZE = 4` in P1v2-32 +- `STACK_ALIGN` is target-defined and must satisfy the native call ABI + +Leaf functions that need no frame-local storage may omit the frame entirely. + +### Frame invariants + +- A function that allocates a frame must restore `sp` before returning. +- Callee-saved registers modified by the function must be restored before + returning. +- The standard frame layout is the only frame shape recognized by P1 v2. + +## Op Set Summary + +| Category | Operations | +|----------|------------| +| Materialization | `LI rd, imm`, `LA rd, %label`, `LA_BR %label` | +| Moves | `MOV rd, rs`, `MOV rd, sp` | +| Arithmetic | `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`, `MUL`, `DIV`, `REM` | +| Immediate arithmetic | `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI` | +| Memory | `LD`, `ST`, `LB`, `SB` | +| ABI access | `LDARG` | +| Branching | `B`, `BR`, `BEQ`, `BNE`, `BLT`, `BEQZ`, `BNEZ`, `BLTZ` | +| Calls / returns | `CALL`, `CALLR`, `RET`, `TAIL`, `TAILR` | +| Frame management | `ENTER`, `LEAVE` | +| System | `SYSCALL` | + +## Immediates + +Immediate operands appear only in instructions that explicitly admit them. +Portable source has three immediate classes: + +- **Inline integer immediate** — a signed 12-bit assembly-time constant in the + range `-2048..2047` +- **Materialized word value** — a full one-word assembly-time constant loaded + with `LI` +- **Materialized address** — the address of a label loaded with `LA` + +P1 v2 also uses two structured assembly-time operands: + +- **Frame-local byte count** — a non-negative byte count used by `ENTER` +- **Argument-slot index** — a non-negative word-slot index used by `LDARG` + +`LI rd, imm` loads the one-word integer value `imm`. + +`LA rd, %label` loads the address of `%label` as a one-word pointer value. + +The backend may realize `LI` and `LA` using native immediates, literal pools, +multi-instruction sequences, or other backend-private mechanisms. + +## Control Flow + +### Call / Return / Tail Call + +Control-flow targets are materialized with `LA_BR %label`, which loads +`%label` into the hidden branch-target mechanism `br`. The immediately +following control-flow op consumes that target. + +`CALL` transfers control to the target most recently loaded by `LA_BR` and +establishes a return continuation such that a subsequent `RET` returns to the +instruction after the `CALL`. `CALL` is valid whether or not the caller has +established a standard frame, except that any call using stack-passed argument +words requires an active standard frame to hold the staged outgoing words. + +`CALLR rs` is the register-indirect form of `CALL`. It transfers control to +the code pointer value held in `rs` and establishes the same return +continuation semantics as `CALL`. + +`RET` returns through the current return continuation. `RET` is valid whether +or not the current function has established a standard frame, provided any +frame established by the function has already been torn down. + +`TAIL` is a tail call to the target most recently loaded by `LA_BR`. It is +valid only when the current function has an active standard frame. `TAIL` +performs the standard epilogue for the current frame and then transfers control +to the loaded target without creating a new return continuation. The callee +therefore returns directly to the current function's caller. + +`TAILR rs` is the register-indirect form of `TAIL`. It is valid only when the +current function has an active standard frame. + +Because stack-passed outgoing argument words are staged in the caller's own +frame-local storage, `TAIL` and `TAILR` are portable only when the tail-called +callee requires no stack-passed argument words. Portable compilers must lower +other tail-call cases to an ordinary `CALL` / `RET` sequence. + +Portable source must treat the return continuation as hidden machine state. It +must not assume that the return address lives in any exposed register or stack +location except as defined by the standard frame layout after frame +establishment. + +### Prologue / Epilogue + +P1 v2 defines the following frame-establishment and frame-teardown operations: + +- `ENTER size` +- `LEAVE` + +`ENTER size` establishes the standard frame layout with `size` bytes of +frame-local storage: + +``` +[sp + 0*WORD] = saved return address +[sp + 1*WORD] = saved caller stack pointer +[sp + 2*WORD ... sp + 2*WORD + size - 1] = frame-local storage +``` + +The total allocation size is: + +`round_up(STACK_ALIGN, 2*WORD_SIZE + size)` + +The named frame-local bytes are the usable local storage. Any additional bytes +introduced by alignment rounding are padding, not extra local bytes. + +`LEAVE` tears down the current standard frame and restores the hidden return +continuation so that a subsequent `RET` returns correctly. + +Because every standard frame stores the saved caller stack pointer at +`[sp + 1*WORD]`, `LEAVE` does not need to know the frame-local byte count used +by the corresponding `ENTER`. + +A function may omit `ENTER` / `LEAVE` entirely if it is a leaf and needs no +standard frame. + +`ENTER` and `LEAVE` do not implicitly save or restore `s0` or `s1`. A +function that modifies `s0` or `s1` must preserve them explicitly, typically by +storing them in frame-local storage within its standard frame. + +### Branching + +P1 v2 branch targets are carried through the hidden branch-target mechanism +`br`. Portable source may load `br` only through: + +- `LA_BR %label` — materialize the address of `%label` as the next branch, call, + or tail-call target + +No branch, call, or tail opcode takes a label operand directly. Portable source +must treat `br` as owned by the control-flow machinery. No live value may be +carried in `br`. Each `LA_BR` must be consumed by the immediately following +branch, call, or tail op, and portable source must not rely on `br` surviving +across any other instruction. + +The portable branch families are: + +- `B` — unconditional branch to the target in `br` +- `BR rs` — unconditional branch to the code pointer in `rs` +- `BEQ`, `BNE`, `BLT` — conditional branch to the target in `br` +- `BEQZ`, `BNEZ`, `BLTZ` — conditional branch to the target in `br` using zero + as the second operand + +`BLT` and `BLTZ` perform signed comparisons on one-word values. + +If a branch condition is true, control transfers to the target currently held in +`br`. If the condition is false, execution falls through to the next +instruction. + +## Data Ops + +### Arithmetic + +P1 v2 defines the following arithmetic and bitwise operations on one-word +values: + +- register-register: `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`, + `MUL`, `DIV`, `REM` +- immediate: `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI` + +For `ADD`, `SUB`, `MUL`, `AND`, `OR`, and `XOR`, computation is modulo the +active word size. + +`SHL` shifts left and discards high bits. `SHR` is a logical right shift and +zero-fills. `SAR` is an arithmetic right shift and sign-fills. + +For register-count shifts, only the low `5` bits of the shift count are +observed in `P1v2-32`, and only the low `6` bits are observed in `P1v2-64`. + +Immediate-form shifts use inline immediates in the range `0..31` in `P1v2-32` +and `0..63` in `P1v2-64`. + +`DIV` is signed division on one-word two's-complement values and truncates +toward zero. `REM` is the corresponding signed remainder. + +Division by zero is outside the portable contract. The overflow case +`MIN_INT / -1` is also outside the portable contract, as is the corresponding +remainder case. + +### Moves + +P1 v2 defines the following move and materialization operations: + +- `MOV` — register-to-register copy +- `LI` — load one-word integer constant +- `LA` — load label address + +`MOV` may copy from any exposed general register to any exposed general +register. + +Portable source may also read the current stack pointer through `MOV rd, sp`. + +Portable source may not write `sp` through `MOV`. Stack-pointer updates are only +performed by `ENTER`, `LEAVE`, and backend-private call/return machinery. + +`LI` materializes an integer bit-pattern. `LA` materializes the address of a +label. `LA_BR` is a separate control-flow-target materialization form and is not +part of the general move family. + +### Memory + +P1 v2 defines the following memory-access operations: + +- `LD`, `ST` — one-word load and store +- `LB`, `SB` — byte load and store +- `LDARG` — one-word load from the incoming stack-argument area + +`LD` and `ST` access one full word: 4 bytes in `P1v2-32` and 8 bytes in +`P1v2-64`. + +`LB` loads one byte and zero-extends it to a full word. `SB` stores the low +8 bits of the source value. + +Memory offsets use signed 12-bit inline immediates. + +The base address for a memory access may be any exposed general register or +`sp`. + +`LDARG rd, idx` loads incoming stack-argument slot `idx`, where slot `0` is the +first stack-passed explicit argument word. `idx` is word-indexed, not +byte-indexed. `LDARG` is an ABI access, not a general memory operation; it does +not expose or imply any raw `sp`-relative layout at function entry. + +`LDARG` is valid only when the current function has an active standard frame. + +Portable source must not assume that labels are aligned beyond what is +explicitly established by the program itself. Portable code should use +naturally aligned addresses for `LD` and `ST`. Unaligned word accesses are +outside the portable contract. Byte accesses have no additional alignment +requirement. + +## System + +`SYSCALL` is part of the portable ISA surface. + +At the portable level, the syscall convention is: + +- `a0` = syscall number on entry, return value on exit +- `a1`, `a2`, `a3`, `t0`, `s0`, `s1` = syscall arguments 0 through 5 + +At the portable level, `SYSCALL` clobbers only `a0`. All other exposed +registers are preserved across the syscall. + +The mapping from symbolic syscall names to numeric syscall identifiers is +target-defined. The set of syscalls available to a given program is likewise +specified outside the core P1 v2 ISA, for example by a target profile or +runtime interface document. + +## Target notes + +- `a0` is both argument 0 and the return-value register in the portable + calling convention in the direct-result case, and the indirect-result buffer + pointer in the indirect-result case. +- On aarch64, riscv64, arm32, and rv32, that matches the native integer/pointer + ABI directly. +- On amd64, the backend must translate between portable `a0` and native + return register `rax` at call and return boundaries. +- On i386, the backend must translate between portable argument registers and + the native stack-argument ABI at call boundaries. +- On amd64 and i386, `LDARG` must account for the return address pushed by the + native `call` instruction. On aarch64, riscv64, arm32, and rv32, it maps more + directly to the entry `sp` plus the backend's standard frame/header policy. +- On amd64, aarch64, riscv64, arm32, and rv32, `br` may be implemented as a + dedicated hidden native register. +- On i386, `br` is expected to be a backend-private stack convention, not a + dedicated hidden register. +- Frame-pointer use is backend policy, not part of the P1 v2 architectural + register set. + +### Native register mapping + +#### 64-bit targets + +| P1 | amd64 | aarch64 | riscv64 | +|------|-------|---------|---------| +| `a0` | `rdi` | `x0` | `a0` | +| `a1` | `rsi` | `x1` | `a1` | +| `a2` | `rdx` | `x2` | `a2` | +| `a3` | `rcx` | `x3` | `a3` | +| `t0` | `r10` | `x9` | `t0` | +| `s0` | `rbx` | `x19` | `s1` | +| `s1` | `r12` | `x20` | `s2` | +| `sp` | `rsp` | `sp` | `sp` | + +#### 32-bit targets + +| P1 | arm32 | i386 | rv32 | +|------|-------|-------|-------| +| `a0` | `r0` | `eax` | `a0` | +| `a1` | `r1` | `ecx` | `a1` | +| `a2` | `r2` | `edx` | `a2` | +| `a3` | `r3` | `ebx` | `a3` | +| `t0` | `r12` | `esi` | `t0` | +| `s0` | `r4` | `edi` | `s1` | +| `s1` | `r5` | `ebp` | `s2` | +| `sp` | `sp` | `esp` | `sp` | diff --git a/p1/aarch64.py b/p1/aarch64.py @@ -0,0 +1,354 @@ +from common import ( + AddI, + ArchDef, + BranchReg, + CondB, + CondBZ, + Enter, + La, + LaBr, + LdArg, + Li, + LogI, + Mem, + Mov, + Nullary, + Rrr, + ShiftI, + le32, + register_arch, + round_up, +) + + +NAT = { + 'a0': 0, + 'a1': 1, + 'a2': 2, + 'a3': 3, + 'x4': 4, + 'x5': 5, + 't0': 9, + 's0': 19, + 's1': 20, + 'sp': 31, + 'xzr': 31, + 'lr': 30, + 'br': 17, + 'scratch': 16, + 'x8': 8, + 'save0': 21, + 'save1': 22, + 'save2': 23, +} + + +RRR_BASE = { + 'ADD': 0x8B000000, + 'SUB': 0xCB000000, + 'AND': 0x8A000000, + 'OR': 0xAA000000, + 'XOR': 0xCA000000, + 'SHL': 0x9AC02000, + 'SHR': 0x9AC02400, + 'SAR': 0x9AC02800, + 'DIV': 0x9AC00C00, +} + + +SYSCALL_NUMBERS = { + 'SYS_READ': 63, + 'SYS_WRITE': 64, + 'SYS_CLOSE': 57, + 'SYS_OPENAT': 56, + 'SYS_EXIT': 93, + 'SYS_CLONE': 220, + 'SYS_EXECVE': 221, + 'SYS_WAITID': 95, +} + + +def aa_rrr(base, rd, ra, rb): + d = NAT[rd] + a = NAT[ra] + b = NAT[rb] + return le32(base | (b << 16) | (a << 5) | d) + + +def aa_add_imm(rd, ra, imm12, sub=False): + d = NAT[rd] + a = NAT[ra] + base = 0xD1000000 if sub else 0x91000000 + return le32(base | ((imm12 & 0xFFF) << 10) | (a << 5) | d) + + +def aa_mov_rr(dst, src): + if dst == 'sp': + return aa_add_imm('sp', src, 0, sub=False) + if src == 'sp': + return aa_add_imm(dst, 'sp', 0, sub=False) + d = NAT[dst] + s = NAT[src] + return le32(0xAA000000 | (s << 16) | (31 << 5) | d) + + +def aa_ubfm(rd, ra, immr, imms): + d = NAT[rd] + a = NAT[ra] + return le32(0xD3400000 | (immr << 16) | (imms << 10) | (a << 5) | d) + + +def aa_sbfm(rd, ra, immr, imms): + d = NAT[rd] + a = NAT[ra] + return le32(0x93400000 | (immr << 16) | (imms << 10) | (a << 5) | d) + + +def aa_movz(rd, imm16): + d = NAT[rd] + return le32(0xD2800000 | ((imm16 & 0xFFFF) << 5) | d) + + +def aa_movn(rd, imm16): + d = NAT[rd] + return le32(0x92800000 | ((imm16 & 0xFFFF) << 5) | d) + + +def aa_materialize_small_imm(rd, imm): + if imm >= 0: + return aa_movz(rd, imm) + return aa_movn(rd, (~imm) & 0xFFFF) + + +def aa_ldst_uimm12(base, rt, rn, off_bytes, size_log2): + imm12 = off_bytes >> size_log2 + t = NAT[rt] + n = NAT[rn] + return le32(base | (imm12 << 10) | (n << 5) | t) + + +def aa_ldst_unscaled(base, rt, rn, off): + imm9 = off & 0x1FF + t = NAT[rt] + n = NAT[rn] + return le32(base | (imm9 << 12) | (n << 5) | t) + + +def aa_mem(op, rt, rn, off): + bases = { + 'LD': (0xF9400000, 3, 0xF8400000), + 'ST': (0xF9000000, 3, 0xF8000000), + 'LB': (0x39400000, 0, 0x38400000), + 'SB': (0x39000000, 0, 0x38000000), + } + uimm_base, size_log2, unscaled_base = bases[op] + scale = 1 << size_log2 + if off >= 0 and off % scale == 0 and off < (4096 << size_log2): + return aa_ldst_uimm12(uimm_base, rt, rn, off, size_log2) + if -256 <= off <= 255: + return aa_ldst_unscaled(unscaled_base, rt, rn, off) + if -2048 <= off <= 2047: + if off >= 0: + addr = aa_add_imm('scratch', rn, off, sub=False) + else: + addr = aa_add_imm('scratch', rn, -off, sub=True) + return addr + aa_ldst_uimm12(uimm_base, rt, 'scratch', 0, size_log2) + raise ValueError(f'aarch64 offset out of range for {op}: {off}') + + +def aa_cmp_skip(op, ra, rb): + a = NAT[ra] + b = NAT[rb] + cmp_hex = le32(0xEB000000 | (b << 16) | (a << 5) | 31) + skip_cond = { + 'BEQ': 1, + 'BNE': 0, + 'BLT': 10, + }[op] + return cmp_hex + le32(0x54000040 | skip_cond) + + +def aa_br(reg): + return le32(0xD61F0000 | (NAT[reg] << 5)) + + +def aa_blr(reg): + return le32(0xD63F0000 | (NAT[reg] << 5)) + + +def aa_ret(): + return le32(0xD65F03C0) + + +def aa_lit64_prefix(rd): + d = NAT[rd] + ldr_lit = 0x58000040 | d + b_plus8 = 0x14000002 + return le32(ldr_lit) + le32(b_plus8) + + +def encode_li(_arch, row): + return aa_lit64_prefix(row.rd) + + +def encode_la(_arch, row): + return aa_lit64_prefix(row.rd) + + +def encode_labr(_arch, _row): + return aa_lit64_prefix('br') + + +def encode_mov(_arch, row): + return aa_mov_rr(row.rd, row.rs) + + +def encode_rrr(_arch, row): + if row.op == 'MUL': + d = NAT[row.rd] + a = NAT[row.ra] + b = NAT[row.rb] + return le32(0x9B000000 | (b << 16) | (31 << 10) | (a << 5) | d) + if row.op == 'REM': + d = NAT[row.rd] + a = NAT[row.ra] + b = NAT[row.rb] + sc = NAT['scratch'] + sdiv = 0x9AC00C00 | (b << 16) | (a << 5) | sc + msub = 0x9B008000 | (b << 16) | (a << 10) | (sc << 5) | d + return le32(sdiv) + le32(msub) + return aa_rrr(RRR_BASE[row.op], row.rd, row.ra, row.rb) + + +def encode_addi(_arch, row): + if row.imm >= 0: + return aa_add_imm(row.rd, row.ra, row.imm, sub=False) + return aa_add_imm(row.rd, row.ra, -row.imm, sub=True) + + +def encode_logi(_arch, row): + seq = aa_materialize_small_imm('scratch', row.imm) + base = { + 'ANDI': 0x8A000000, + 'ORI': 0xAA000000, + }[row.op] + return seq + aa_rrr(base, row.rd, row.ra, 'scratch') + + +def encode_shifti(_arch, row): + if row.op == 'SHLI': + return aa_ubfm(row.rd, row.ra, (-row.imm) & 63, 63 - row.imm) + if row.op == 'SHRI': + return aa_ubfm(row.rd, row.ra, row.imm, 63) + return aa_sbfm(row.rd, row.ra, row.imm, 63) + + +def encode_mem(_arch, row): + return aa_mem(row.op, row.rt, row.rn, row.off) + + +def encode_ldarg(_arch, row): + return aa_mem('LD', 'scratch', 'sp', 8) + aa_mem('LD', row.rd, 'scratch', 16 + 8 * row.slot) + + +def encode_branch_reg(_arch, row): + if row.kind == 'BR': + return aa_br(row.rs) + if row.kind == 'CALLR': + return aa_blr(row.rs) + if row.kind == 'TAILR': + leave = encode_nullary(_arch, Nullary('LEAVE', 'LEAVE')) + return leave + aa_br(row.rs) + raise ValueError(f'unknown branch-reg kind: {row.kind}') + + +def encode_condb(_arch, row): + return aa_cmp_skip(row.op, row.ra, row.rb) + aa_br('br') + + +def encode_condbz(_arch, row): + a = NAT[row.ra] + br_hex = aa_br('br') + if row.op == 'BEQZ': + return le32(0xB5000000 | (2 << 5) | a) + br_hex + if row.op == 'BNEZ': + return le32(0xB4000000 | (2 << 5) | a) + br_hex + cmp_zero = le32(0xEB1F001F | (a << 5)) + bge = le32(0x54000040 | 10) + return cmp_zero + bge + br_hex + + +def encode_enter(arch, row): + frame_bytes = round_up(arch.stack_align, 2 * arch.word_bytes + row.size) + return ( + aa_add_imm('sp', 'sp', frame_bytes, sub=True) + + aa_mem('ST', 'lr', 'sp', 0) + + aa_add_imm('x8', 'sp', frame_bytes, sub=False) + + aa_mem('ST', 'x8', 'sp', 8) + ) + + +def encode_nullary(_arch, row): + if row.kind == 'B': + return aa_br('br') + if row.kind == 'CALL': + return aa_blr('br') + if row.kind == 'RET': + return aa_ret() + if row.kind == 'LEAVE': + return ( + aa_mem('LD', 'lr', 'sp', 0) + + aa_mem('LD', 'x8', 'sp', 8) + + aa_mov_rr('sp', 'x8') + ) + if row.kind == 'TAIL': + leave = encode_nullary(_arch, Nullary('LEAVE', 'LEAVE')) + return leave + aa_br('br') + if row.kind == 'SYSCALL': + return ''.join([ + aa_mov_rr('x8', 'a0'), + aa_mov_rr('save0', 'a1'), + aa_mov_rr('save1', 'a2'), + aa_mov_rr('save2', 'a3'), + aa_mov_rr('a0', 'save0'), + aa_mov_rr('a1', 'save1'), + aa_mov_rr('a2', 'save2'), + aa_mov_rr('a3', 't0'), + aa_mov_rr('x4', 's0'), + aa_mov_rr('x5', 's1'), + le32(0xD4000001), + aa_mov_rr('a1', 'save0'), + aa_mov_rr('a2', 'save1'), + aa_mov_rr('a3', 'save2'), + ]) + raise ValueError(f'unknown nullary kind: {row.kind}') + + +ENCODERS = { + Li: encode_li, + La: encode_la, + LaBr: encode_labr, + Mov: encode_mov, + Rrr: encode_rrr, + AddI: encode_addi, + LogI: encode_logi, + ShiftI: encode_shifti, + Mem: encode_mem, + LdArg: encode_ldarg, + Nullary: encode_nullary, + BranchReg: encode_branch_reg, + CondB: encode_condb, + CondBZ: encode_condbz, + Enter: encode_enter, +} + + +register_arch( + ArchDef( + name='aarch64', + word_bytes=8, + stack_align=16, + syscall_numbers=SYSCALL_NUMBERS, + encoders=ENCODERS, + ) +) diff --git a/p1/common.py b/p1/common.py @@ -0,0 +1,66 @@ +from collections import namedtuple + + +ArchDef = namedtuple( + 'ArchDef', + 'name word_bytes stack_align syscall_numbers encoders', +) + +Banner = namedtuple('Banner', 'text') +Literal = namedtuple('Literal', 'name hex_by_arch') +Nullary = namedtuple('Nullary', 'name kind') +Li = namedtuple('Li', 'name rd') +La = namedtuple('La', 'name rd') +LaBr = namedtuple('LaBr', 'name') +Mov = namedtuple('Mov', 'name rd rs') +Rrr = namedtuple('Rrr', 'name op rd ra rb') +AddI = namedtuple('AddI', 'name rd ra imm') +LogI = namedtuple('LogI', 'name op rd ra imm') +ShiftI = namedtuple('ShiftI', 'name op rd ra imm') +Mem = namedtuple('Mem', 'name op rt rn off') +LdArg = namedtuple('LdArg', 'name rd slot') +BranchReg = namedtuple('BranchReg', 'name kind rs') +CondB = namedtuple('CondB', 'name op ra rb') +CondBZ = namedtuple('CondBZ', 'name op ra') +Enter = namedtuple('Enter', 'name size') + + +ARCH_REGISTRY = {} + + +def register_arch(arch): + if arch.name in ARCH_REGISTRY: + raise RuntimeError(f'duplicate arch registration: {arch.name}') + ARCH_REGISTRY[arch.name] = arch + + +def get_arch(name): + return ARCH_REGISTRY[name] + + +def registered_arches(): + return tuple(sorted(ARCH_REGISTRY)) + + +def byte(n): + return f'{n & 0xFF:02X}' + + +def le32(n): + return (n & 0xFFFFFFFF).to_bytes(4, 'little').hex().upper() + + +def le64(n): + return (n & 0xFFFFFFFFFFFFFFFF).to_bytes(8, 'little').hex().upper() + + +def word_hex(word_bytes, n): + if word_bytes == 4: + return le32(n) + if word_bytes == 8: + return le64(n) + raise ValueError(f'unsupported word size: {word_bytes}') + + +def round_up(align, n): + return ((n + align - 1) // align) * align diff --git a/p1/llvm_disasm_aarch64.py b/p1/llvm_disasm_aarch64.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +"""Annotate p1_aarch64.M1 DEFINE rows with llvm-mc disassembly. + +Reads generated DEFINE lines from p1_aarch64.M1, disassembles code-bearing rows +with llvm-mc, and prints the DEFINE name beside the native aarch64 mnemonic +sequence. Literal data rows such as syscall-number constants are labeled as data +instead of being treated as instructions. +""" + +import argparse +import os +import re +import subprocess +import sys +from pathlib import Path + + +DEFINE_RE = re.compile(r'^DEFINE\s+(\S+)\s+([0-9A-Fa-f]+)\s*$') + + +def repo_root(): + return Path(__file__).resolve().parent.parent + + +def default_input_path(): + return repo_root() / 'build' / 'p1v2' / 'aarch64' / 'p1_aarch64.M1' + + +def ensure_generated(path: Path): + if path.exists(): + return + gen = repo_root() / 'p1' / 'p1_gen.py' + proc = subprocess.run( + [sys.executable, str(gen), '--arch', 'aarch64', str(path.parent.parent)], + check=True, + cwd=repo_root(), + capture_output=True, + text=True, + ) + if proc.stderr: + sys.stderr.write(proc.stderr) + + +def parse_rows(path: Path): + rows = [] + for line in path.read_text().splitlines(): + match = DEFINE_RE.match(line) + if not match: + continue + name, hex_bytes = match.groups() + rows.append((name, hex_bytes.upper())) + return rows + + +def is_data_row(name: str): + return name.startswith('sys_') + + +def disassemble_code_rows(rows, llvm_mc): + code_rows = [(name, hex_bytes) for name, hex_bytes in rows if not is_data_row(name)] + if not code_rows: + return {} + + payload = '\n'.join(hex_bytes for _, hex_bytes in code_rows) + '\n' + proc = subprocess.run( + [llvm_mc, '--disassemble', '--hex', '--arch=aarch64'], + input=payload, + text=True, + capture_output=True, + check=True, + ) + inst_lines = [line.strip() for line in proc.stdout.splitlines() if line.strip()] + + out = {} + index = 0 + for name, hex_bytes in code_rows: + words = len(hex_bytes) // 8 + out[name] = inst_lines[index:index + words] + index += words + + if index != len(inst_lines): + raise RuntimeError( + f'llvm output row split mismatch: consumed {index}, got {len(inst_lines)}' + ) + return out + + +def format_rows(rows, disasm_by_name, show_bytes): + name_width = max(len(name) for name, _ in rows) if rows else 0 + out = [] + for name, hex_bytes in rows: + if is_data_row(name): + rhs = f'data 0x{hex_bytes}' + out.append(f'{name:<{name_width}} {rhs}') + continue + + insns = disasm_by_name.get(name, []) + if not insns: + out.append(f'{name:<{name_width}} <no disassembly>') + continue + + prefix = name.ljust(name_width) + byte_col = f' {hex_bytes}' if show_bytes else '' + out.append(f'{prefix}{byte_col} {insns[0]}') + for insn in insns[1:]: + spacer = ' ' * name_width + if show_bytes: + spacer += ' ' + ' ' * len(hex_bytes) + out.append(f'{spacer} {insn}') + return '\n'.join(out) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + 'input', + nargs='?', + default=str(default_input_path()), + help='path to p1_aarch64.M1', + ) + parser.add_argument( + '--llvm-mc', + default=os.environ.get('LLVM_MC', 'llvm-mc'), + help='path to llvm-mc', + ) + parser.add_argument( + '--grep', + default='', + help='only include DEFINE names containing this substring', + ) + parser.add_argument( + '--limit', + type=int, + default=0, + help='maximum number of DEFINE rows to print (0 = all)', + ) + parser.add_argument( + '--show-bytes', + action='store_true', + help='include raw DEFINE bytes next to the name', + ) + args = parser.parse_args() + + path = Path(args.input) + ensure_generated(path) + rows = parse_rows(path) + if args.grep: + rows = [(name, hex_bytes) for name, hex_bytes in rows if args.grep in name] + if args.limit: + rows = rows[:args.limit] + + disasm_by_name = disassemble_code_rows(rows, args.llvm_mc) + print(format_rows(rows, disasm_by_name, args.show_bytes)) + + +if __name__ == '__main__': + main() diff --git a/p1/p1_gen.py b/p1/p1_gen.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +"""Generate P1 v2 DEFINE tables. + +This is a fresh generator for docs/P1v2.md. The ISA surface is described by +plain namedtuple rows, and each backend registers a simple row-type -> encoder +mapping. The emitted immediate/offset domains are still curated tables rather +than the full theoretical spec space, so extending coverage is a one-line data +edit instead of an architecture rewrite. + +Usage: + python3 p1/p1_gen.py [--arch ARCH] [build-root] + python3 p1/p1_gen.py --check [--arch ARCH] [build-root] + python3 p1/p1_gen.py --list-archs +""" + +import os +import sys +from itertools import product + +from common import ( + AddI, + Banner, + BranchReg, + CondB, + CondBZ, + Enter, + La, + LaBr, + LdArg, + Li, + Literal, + LogI, + Mem, + Mov, + Nullary, + Rrr, + ShiftI, + get_arch, + registered_arches, + word_hex, +) + +import aarch64 # noqa: F401 - imported for arch registration side effects + + +P1_GPRS = ('a0', 'a1', 'a2', 'a3', 't0', 's0', 's1') +P1_BASES = P1_GPRS + ('sp',) + +RRR_OPS = ('ADD', 'SUB', 'AND', 'OR', 'XOR', 'SHL', 'SHR', 'SAR', 'MUL', 'DIV', 'REM') +LOGI_OPS = ('ANDI', 'ORI') +SHIFT_OPS = ('SHLI', 'SHRI', 'SARI') +MEM_OPS = ('LD', 'ST', 'LB', 'SB') +CONDB_OPS = ('BEQ', 'BNE', 'BLT') +CONDBZ_OPS = ('BEQZ', 'BNEZ', 'BLTZ') + +ADDI_IMMS = ( + -2048, -1024, -256, -128, -64, -48, -32, -24, -16, -12, -8, -7, -6, + -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16, 24, 32, 40, + 48, 63, 64, 127, 128, 255, 256, 512, 1024, 2047, +) + +LOGI_IMMS = ( + -1, 0, 1, 2, 3, 4, 6, 7, 8, 15, 16, 31, 32, 63, 64, 127, 255, 511, 1023, + 2047, +) + +SHIFT_IMMS = tuple(range(64)) + +MEM_OFFS = ( + -256, -128, -64, -48, -32, -24, -16, -8, -1, 0, 1, 7, 8, 15, 16, 24, 32, + 40, 48, 56, 64, 128, 255, +) + +LDARG_SLOTS = tuple(range(32)) +ENTER_SIZES = tuple(range(0, 129)) + + +HEADER = """## p1_{arch}.M1 — GENERATED by p1/p1_gen.py. Do not edit by hand. +## +## This table targets the P1 v2 ISA described in docs/P1v2.md. +## Row shapes are shared; per-arch lowering lives in p1/<arch>.py. +""" + + +def imm_suffix(imm): + return f'NEG{-imm}' if imm < 0 else str(imm) + + +def rows(arch): + out = [] + + out.append(Banner('Materialization')) + for rd in P1_GPRS: + out.append(Li(name=f'LI_{rd.upper()}', rd=rd)) + for rd in P1_GPRS: + out.append(La(name=f'LA_{rd.upper()}', rd=rd)) + out.append(LaBr(name='LA_BR')) + + out.append(Banner('Moves')) + for rd, rs in product(P1_GPRS, P1_GPRS): + out.append(Mov(name=f'MOV_{rd.upper()}_{rs.upper()}', rd=rd, rs=rs)) + for rd in P1_GPRS: + out.append(Mov(name=f'MOV_{rd.upper()}_SP', rd=rd, rs='sp')) + + out.append(Banner('Register Arithmetic')) + for op, rd, ra, rb in product(RRR_OPS, P1_GPRS, P1_GPRS, P1_GPRS): + out.append(Rrr(name=f'{op}_{rd.upper()}_{ra.upper()}_{rb.upper()}', + op=op, rd=rd, ra=ra, rb=rb)) + + out.append(Banner('Immediate Arithmetic')) + for rd, ra, imm in product(P1_GPRS, P1_GPRS, ADDI_IMMS): + out.append(AddI(name=f'ADDI_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}', + rd=rd, ra=ra, imm=imm)) + for op, rd, ra, imm in product(LOGI_OPS, P1_GPRS, P1_GPRS, LOGI_IMMS): + out.append(LogI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm_suffix(imm)}', + op=op, rd=rd, ra=ra, imm=imm)) + for op, rd, ra, imm in product(SHIFT_OPS, P1_GPRS, P1_GPRS, SHIFT_IMMS): + out.append(ShiftI(name=f'{op}_{rd.upper()}_{ra.upper()}_{imm}', + op=op, rd=rd, ra=ra, imm=imm)) + + out.append(Banner('Memory')) + for op, rt, rn, off in product(MEM_OPS, P1_GPRS, P1_BASES, MEM_OFFS): + out.append(Mem(name=f'{op}_{rt.upper()}_{rn.upper()}_{imm_suffix(off)}', + op=op, rt=rt, rn=rn, off=off)) + + out.append(Banner('ABI Access')) + for rd, slot in product(P1_GPRS, LDARG_SLOTS): + out.append(LdArg(name=f'LDARG_{rd.upper()}_{slot}', rd=rd, slot=slot)) + + out.append(Banner('Branches')) + out.append(Nullary(name='B', kind='B')) + for rs in P1_GPRS: + out.append(BranchReg(name=f'BR_{rs.upper()}', kind='BR', rs=rs)) + for op, ra, rb in product(CONDB_OPS, P1_GPRS, P1_GPRS): + out.append(CondB(name=f'{op}_{ra.upper()}_{rb.upper()}', op=op, ra=ra, rb=rb)) + for op, ra in product(CONDBZ_OPS, P1_GPRS): + out.append(CondBZ(name=f'{op}_{ra.upper()}', op=op, ra=ra)) + + out.append(Banner('Calls And Returns')) + out.append(Nullary(name='CALL', kind='CALL')) + out.append(Nullary(name='RET', kind='RET')) + out.append(Nullary(name='TAIL', kind='TAIL')) + for rs in P1_GPRS: + out.append(BranchReg(name=f'CALLR_{rs.upper()}', kind='CALLR', rs=rs)) + for rs in P1_GPRS: + out.append(BranchReg(name=f'TAILR_{rs.upper()}', kind='TAILR', rs=rs)) + + out.append(Banner('Frame Management')) + for size in ENTER_SIZES: + out.append(Enter(name=f'ENTER_{size}', size=size)) + out.append(Nullary(name='LEAVE', kind='LEAVE')) + + out.append(Banner('System')) + out.append(Nullary(name='SYSCALL', kind='SYSCALL')) + for name, number in sorted(arch.syscall_numbers.items()): + out.append(Literal(name=name, hex_by_arch={arch.name: word_hex(arch.word_bytes, number)})) + + return out + + +def lower_name(name): + low = name.lower() + head, sep, rest = low.partition('_') + if not sep: + return low + if '_' not in rest: + return low + return f'{head}_{rest.replace("_", ",")}' + + +def encode_row(arch, row): + if isinstance(row, Literal): + return row.hex_by_arch[arch.name] + encoder = arch.encoders[type(row)] + return encoder(arch, row) + + +def emit(arch_name): + arch = get_arch(arch_name) + out = [HEADER.format(arch=arch.name).rstrip(), ''] + seen = set() + for row in rows(arch): + if isinstance(row, Banner): + out.append('') + out.append(f'## ---- {row.text}') + continue + name = lower_name(row.name) + if name in seen: + raise RuntimeError(f'duplicate DEFINE: {name}') + seen.add(name) + out.append(f'DEFINE {name} {encode_row(arch, row)}') + out.append('') + return '\n'.join(out) + + +def parse_args(argv): + check = False + archs = [] + positional = [] + i = 0 + while i < len(argv): + arg = argv[i] + if arg == '--check': + check = True + elif arg == '--list-archs': + print('\n'.join(registered_arches())) + sys.exit(0) + elif arg == '--arch': + i += 1 + if i >= len(argv): + raise SystemExit('--arch requires a value') + archs.append(argv[i]) + else: + positional.append(arg) + i += 1 + build_root = positional[0] if positional else os.path.join('build', 'p1v2') + if not archs: + archs = list(registered_arches()) + return check, archs, build_root + + +def main(argv=None): + check, archs, build_root = parse_args(argv or sys.argv[1:]) + had_diff = False + + for arch_name in archs: + arch = get_arch(arch_name) + dest_dir = os.path.join(build_root, arch.name) + path = os.path.join(dest_dir, f'p1_{arch.name}.M1') + content = emit(arch.name) + if check: + try: + with open(path) as f: + existing = f.read() + except FileNotFoundError: + existing = '' + if existing != content: + sys.stderr.write(f'DIFF: {path}\n') + had_diff = True + continue + os.makedirs(dest_dir, exist_ok=True) + with open(path, 'w') as f: + f.write(content) + print(f'wrote {path} ({len(content)} bytes)') + + if check and had_diff: + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/post.md b/post.md @@ -0,0 +1,247 @@ +# Title + +Your compiler compiles your code, but what compiled the compiler? And the one +before that? + +Ken Thompson freaked everyone out in 1983 by describing a nasty +self-replicating attack on compilers themselves that reminded folks that it +might be a good idea to have a "full bootstrap" from as small a binary seed as +could be managed and then audited source from there forward to the toolchains +we know and (maybe-kinda) love today. + +Jeremiah Orians and Jan Nieuwenhuizen delivered exactly that to the world in +2023, going from `hex0-seed`, a few hundred byte seed binary that is the "hex0" +compiler, through Fabrice Bellard's Tiny C Compiler (TCC), and then up through +GCC and the rest of the stack. Truly awesome work. + +Between "hex0" and TCC, there's a minimal assembler and linker in M0 and hex2, +then a "subset C" compiler that builds out the Mes Scheme ecosystem, then +a Mes C compiler that compiles TCC. + +From seed to M0 and hex2 is ~2KLOC. The "middle" stage is ~25KLOC to get to +Mes Scheme and a collection of build and packaging tools. And the "late" stage +of the Mes C compiler and TCC sources are another ~25KLOC. + +I find M0 and hex2 to be very cute and everything after that to be rather +goopy. So, let's dive into the cuteness! + +Getting there is fairly straightforward: + +``` +hex0-seed = given + +hex0 = hex0-seed(hex0.hex0) # hex0 compiles itself from source +hex1 = hex0(hex1.hex0) +hex2 = hex1(hex2.hex1) +catm = hex2(catm.hex2) +M0 = hex2(catm(ELF.hex2, M0.hex2)) +``` + +**hex0**: the minimal seed hex compiler; it turns pairs of hexadecimal digits +into raw bytes. Supports # and ; line comments. + +**hex1**: a small extension of hex0 that adds the first notion of symbolic +control flow. Features everything in hex0, single-character labels, one size +of relative jump/offset encoding. + +**hex2**: the final "hex language" and first real linker-like stage in the +chain. Features everything in hex1, labels of arbitrary length, relative +references of multiple sizes (!, @, %), absolute references ($, &), enough +symbol/address resolution to link later stages. + +**catm**: a minimal file concatenation utility, take N files and output 1. + +**M0**: the first assembler built on top of hex2. Features DEFINE substitution +of names to hex bytes, immediate values in various sizes/formats, symbolic +assembly over hex2. + +M0 programs are architecture-specific because the DEFINEd set of mnemonics and +their byte-level encodings are architecture-specific. So that means that the +subset C compiler, defined in M0, has to be written once per architecture. +That's a drag. + +I thought it'd be rather fun to play around at this low level and see if we +could build off M0 in a way that might get us to a functioning C compiler +through a path that cuts down on the amount of code in that "middle" layer. + +So I present to you "P1", a portable pseudo-ISA targetable via M1 DEFINEs, and +"M1M", a macro expander for M1, written in P1. + +P1 + +Registers: + +- `a0`–`a3` — argument registers. Also caller-saved general registers. +- `t0` — caller-saved temporary. +- `s0`–`s1` — callee-saved general registers. +- `sp` — stack pointer. + +Ops: +- Materialization: `LI`, `LA`, `LA_BR` +- Moves: `MOV` +- Arithmetic: `ADD`, `SUB`, `AND`, `OR`, `XOR`, `SHL`, `SHR`, `SAR`, `MUL`, + `DIV`, `REM` +- Immediate arithmetic: `ADDI`, `ANDI`, `ORI`, `SHLI`, `SHRI`, `SARI` +- Memory: `LD`, `ST`, `LB`, `SB` +- Branching: `B`, `BR`, `BEQ`, `BNE`, `BLT`, `BEQZ`, `BNEZ`, `BLTZ` +- Calls / returns: `CALL`, `CALLR`, `RET`, `TAIL`, `TAILR` +- Frame management: `ENTER`, `LEAVE` +- ABI access: `LDARG` +- System: `SYSCALL` + +It's a deliberately dumb little RISC: a handful of visible registers, a hidden +branch/call target register behind `LA_BR`, and a generator that spits out the +per-arch `DEFINE` tables for 32-bit and 64-bit x86, ARM, and RISC-V. + +Calling convention: + +- `a0`-`a3` hold the first four argument words. +- Extra arguments live in an incoming stack-argument area and are read with + `LDARG`. +- `a0` is also the one-word return register. +- `a0`-`a3` and `t0` are caller-saved; `s0`-`s1` and `sp` are callee-saved. +- If you need a frame, `ENTER` builds the standard one and `LEAVE` tears it + down. +- Stack-passed outgoing args are staged in frame-local space before `CALL`. +- Wider-than-one-word returns use the usual hidden pointer trick: caller passes + a writable result buffer in `a0`, real args slide over by one, callee returns + that same pointer in `a0`. + +That's enough to write a portable macro expander, a Lisp, or a tiny C +compiler once, then retarget by swapping out the generated M1 defs instead of +rewriting the whole program per architecture. That cuts down auditable code at +the bottom of the stack quite a bit. And it may give us a shorter path from +M0 to TCC (but no promises). Mostly it's just a cute base to work on. + +``` +;; Bootstrap from seed +(define hex0 (hex0-seed hex0.hex0)) +(define hex1 (hex0 hex1.hex0)) +(define hex2 (hex1 hex2.hex1)) +(define catm (hex2 catm.hex2)) +(define M0 (hex2 (catm ELF.hex2 M0.hex2))) + +(defn link (hex2-src) (hex2 (catm ELF.hex2 hex2-src))) + +;; Compile P1 source, no macros +(defn p1compile0 (src) (M0 (catm P1.M1 src))) + +;; Bootstrap the macro expander once without macros. +(define m1m (link (p1compile0 m1m.M1))) + +;; Normal P1 build: expand macros, then assemble/link. +(defn p1compile (src) (M0 (catm P1.M1 (m1m (catm P1M.M1 src))))) +(defn p1exe (src) (link (p1compile src))) +``` + +So, what does P1 code with the macro expansion base look like? + +``` +## copy.M1 +## +## And symbolic constants: +## SYS_open SYS_read SYS_write SYS_close SYS_exit O_RDONLY +## WORD +## SAVE_S0 SAVE_S1 BUF_OFF COPY_LOCALS BUF_SIZE +## +## Entry convention: +## a0 = argc +## a1 = argv + +DEFINE BUF_SIZE %128 +DEFINE SAVE_S0 %16 +DEFINE SAVE_S1 %24 +DEFINE COPY_LOCALS %144 + +:main + %li(t0, 2) + %blt(a0, t0, usage) ; if argc < 2, exit 2 + + %ld(t0, a1, WORD) ; t0 = argv[1] + + %li(a0, SYS_open) + %mov(a1, t0) ; path + %li(a2, O_RDONLY) ; flags + %li(a3, 0) ; mode + %syscall() + %bltz(a0, fail) ; open failed + + %mov(s0, a0) ; s0 = fd + %mov(a0, s0) + %call(copy_to_stdout) ; returns 0 or -1 in a0 + %bltz(a0, close_fail) + + %li(a0, SYS_close) + %mov(a1, s0) + %syscall() + + %li(a0, SYS_exit) + %li(a1, 0) + %syscall() + +:close_fail + %li(a0, SYS_close) + %mov(a1, s0) + %syscall() + +:fail + %li(a0, SYS_exit) + %li(a1, 1) + %syscall() + +:usage + %li(a0, SYS_exit) + %li(a1, 2) + %syscall() + + +:copy_to_stdout + %enter(COPY_LOCALS) + %st(sp, SAVE_S0, s0) + %st(sp, SAVE_S1, s1) + + %mov(s0, a0) ; s0 = input fd + +:read_loop + %li(a0, SYS_read) + %mov(a1, s0) ; fd + %mov(a2, sp) + %addi(a2, a2, BUF_OFF) ; buf = sp + BUF_OFF + %li(a3, BUF_SIZE) ; count + %syscall() + + %beqz(a0, done) ; EOF + %bltz(a0, io_error) ; read error + + %mov(s1, a0) ; remaining byte count + %mov(t0, sp) + %addi(t0, t0, BUF_OFF) ; t0 = current write ptr + +:write_loop + %li(a0, SYS_write) + %li(a1, 1) ; stdout + %mov(a2, t0) ; buf + %mov(a3, s1) ; count + %syscall() + + %bltz(a0, io_error) ; write error + + %add(t0, t0, a0) ; buf += nwritten + %sub(s1, s1, a0) ; remaining -= nwritten + %bnez(s1, write_loop) + %b(read_loop) + +:done + %li(a0, 0) + %ld(s0, sp, SAVE_S0) + %ld(s1, sp, SAVE_S1) + %leave() + %ret() + +:io_error + %li(a0, -1) + %ld(s0, sp, SAVE_S0) + %ld(s1, sp, SAVE_S1) + %leave() + %ret() +``` diff --git a/src/m1macro.c b/src/m1macro.c @@ -0,0 +1,886 @@ +#include <stdio.h> +#include <string.h> + +/* + * Tiny single-pass M1 macro expander. + * + * Syntax: + * %macro NAME(a, b) + * ... body ... + * %endm + * + * %NAME(x, y) function-like macro call + * ## token pasting inside macro bodies + * @loop local token inside a macro body + * :@loop / &@loop local label / reference shorthand + * :param / &param prefix a single-token parameter with ':' or '&' + * + * Notes: + * - Macros are define-before-use. There is no prescan. + * - Expansion rescans by pushing expanded tokens back through the same loop. + * - There is no cycle detection. Recursive macros will loop until a limit. + * - Only recognized %NAME(...) calls expand. Other text passes through. + * - Output formatting is normalized to tokens plus '\n', not preserved. + */ + +#define MAX_INPUT 262144 +#define MAX_OUTPUT 524288 +#define MAX_TEXT 524288 +#define MAX_TOKENS 65536 +#define MAX_MACROS 256 +#define MAX_PARAMS 16 +#define MAX_BODY_TOKENS 4096 +#define MAX_EXPAND 65536 +#define MAX_STACK 64 + +enum { + TOK_WORD, + TOK_STRING, + TOK_NEWLINE, + TOK_LPAREN, + TOK_RPAREN, + TOK_COMMA, + TOK_PASTE +}; + +struct Token { + int kind; + int start; + int len; + int line; +}; + +struct Macro { + int name_start; + int name_len; + int line; + int param_count; + int param_start[MAX_PARAMS]; + int param_len[MAX_PARAMS]; + struct Token body[MAX_BODY_TOKENS]; + int body_count; +}; + +struct Stream { + struct Token *toks; + int count; + int pos; + int line_start; + int pool_mark; +}; + +static char input_buf[MAX_INPUT + 1]; +static char output_buf[MAX_OUTPUT + 1]; +static char text_buf[MAX_TEXT]; + +static struct Token source_tokens[MAX_TOKENS]; +static struct Token expand_pool[MAX_EXPAND]; +static struct Macro macros[MAX_MACROS]; +static struct Stream streams[MAX_STACK]; + +static int text_used; +static int source_count; +static int pool_used; +static int macro_count; +static int local_id_next = 1; +static int output_used; +static int output_need_space; +static int stream_top; + +static char error_buf[256]; + +static int is_space_no_nl(int c) +{ + return c == ' ' || c == '\t' || c == '\r' || c == '\f' || c == '\v'; +} + +static int append_text_len(const char *s, int len) +{ + int start; + + if (text_used + len + 1 > MAX_TEXT) { + snprintf(error_buf, sizeof(error_buf), "text buffer overflow"); + return -1; + } + start = text_used; + memcpy(text_buf + text_used, s, (size_t)len); + text_used += len; + text_buf[text_used++] = '\0'; + return start; +} + +static int append_text_str(const char *s) +{ + return append_text_len(s, (int)strlen(s)); +} + +static int push_token(struct Token *buf, int *count, int max_count, + int kind, int start, int len, int line) +{ + if (*count >= max_count) { + snprintf(error_buf, sizeof(error_buf), "token buffer overflow"); + return 0; + } + buf[*count].kind = kind; + buf[*count].start = start; + buf[*count].len = len; + buf[*count].line = line; + *count += 1; + return 1; +} + +static int token_text_eq(const struct Token *tok, const char *s) +{ + int len = (int)strlen(s); + + return tok->len == len && memcmp(text_buf + tok->start, s, (size_t)len) == 0; +} + +static int span_eq(int start, int len, const struct Token *tok) +{ + return len == tok->len && memcmp(text_buf + start, text_buf + tok->start, (size_t)len) == 0; +} + +static int lex_source(const char *src) +{ + int i = 0; + int line = 1; + + while (src[i] != '\0') { + int start; + int text_start; + int len; + + if (is_space_no_nl((unsigned char)src[i])) { + i++; + continue; + } + if (src[i] == '\n') { + if (!push_token(source_tokens, &source_count, MAX_TOKENS, + TOK_NEWLINE, -1, 0, line)) { + return 0; + } + i++; + line++; + continue; + } + if (src[i] == '"' || src[i] == '\'') { + int quote = src[i]; + + start = i; + i++; + while (src[i] != '\0' && src[i] != quote) { + i++; + } + if (src[i] == quote) { + i++; + } + len = i - start; + text_start = append_text_len(src + start, len); + if (text_start < 0) { + return 0; + } + if (!push_token(source_tokens, &source_count, MAX_TOKENS, + TOK_STRING, text_start, len, line)) { + return 0; + } + continue; + } + if (src[i] == '#' && src[i + 1] == '#') { + text_start = append_text_str("##"); + if (text_start < 0) { + return 0; + } + if (!push_token(source_tokens, &source_count, MAX_TOKENS, + TOK_PASTE, text_start, 2, line)) { + return 0; + } + i += 2; + continue; + } + if (src[i] == '#' || src[i] == ';') { + while (src[i] != '\0' && src[i] != '\n') { + i++; + } + continue; + } + if (src[i] == '(') { + text_start = append_text_str("("); + if (text_start < 0) { + return 0; + } + if (!push_token(source_tokens, &source_count, MAX_TOKENS, + TOK_LPAREN, text_start, 1, line)) { + return 0; + } + i++; + continue; + } + if (src[i] == ')') { + text_start = append_text_str(")"); + if (text_start < 0) { + return 0; + } + if (!push_token(source_tokens, &source_count, MAX_TOKENS, + TOK_RPAREN, text_start, 1, line)) { + return 0; + } + i++; + continue; + } + if (src[i] == ',') { + text_start = append_text_str(","); + if (text_start < 0) { + return 0; + } + if (!push_token(source_tokens, &source_count, MAX_TOKENS, + TOK_COMMA, text_start, 1, line)) { + return 0; + } + i++; + continue; + } + + start = i; + while (src[i] != '\0' && + !is_space_no_nl((unsigned char)src[i]) && + src[i] != '\n' && + src[i] != '#' && + src[i] != ';' && + src[i] != '(' && + src[i] != ')' && + src[i] != ',' && + !(src[i] == '#' && src[i + 1] == '#')) { + i++; + } + len = i - start; + text_start = append_text_len(src + start, len); + if (text_start < 0) { + return 0; + } + if (!push_token(source_tokens, &source_count, MAX_TOKENS, + TOK_WORD, text_start, len, line)) { + return 0; + } + } + + return 1; +} + +static int find_macro(const struct Token *tok) +{ + int i; + const char *s; + + if (tok->kind != TOK_WORD) { + return -1; + } + if (tok->len < 2) { + return -1; + } + s = text_buf + tok->start; + if (s[0] != '%') { + return -1; + } + for (i = 0; i < macro_count; i++) { + if (tok->len - 1 == macros[i].name_len && + memcmp(s + 1, text_buf + macros[i].name_start, + (size_t)macros[i].name_len) == 0) { + return i; + } + } + return -1; +} + +static int find_param(const struct Macro *m, const struct Token *tok) +{ + int i; + + if (tok->kind != TOK_WORD) { + return -1; + } + for (i = 0; i < m->param_count; i++) { + if (span_eq(m->param_start[i], m->param_len[i], tok)) { + return i; + } + } + return -1; +} + +static int find_prefixed_param(const struct Macro *m, const struct Token *tok, char *prefix) +{ + int i; + const char *s; + + if (tok->kind != TOK_WORD || tok->len < 2) { + return -1; + } + s = text_buf + tok->start; + if (s[0] != ':' && s[0] != '&') { + return -1; + } + for (i = 0; i < m->param_count; i++) { + if (tok->len - 1 == m->param_len[i] && + memcmp(s + 1, text_buf + m->param_start[i], (size_t)m->param_len[i]) == 0) { + *prefix = s[0]; + return i; + } + } + return -1; +} + +static int rewrite_local(const struct Token *in, int local_id, struct Token *out) +{ + const char *src = text_buf + in->start; + char tmp[256]; + int prefix_len = 0; + int name_start = 0; + int n; + int text_start; + + if (in->kind != TOK_WORD) { + *out = *in; + return 1; + } + if (in->len >= 2 && (src[0] == ':' || src[0] == '&') && src[1] == '@') { + prefix_len = 1; + name_start = 2; + } else if (in->len >= 1 && src[0] == '@') { + prefix_len = 0; + name_start = 1; + } else { + *out = *in; + return 1; + } + + n = snprintf(tmp, sizeof(tmp), "%.*s__mlocal_%d_%.*s", + prefix_len, src, local_id, in->len - name_start, src + name_start); + if (n < 0 || n >= (int)sizeof(tmp)) { + snprintf(error_buf, sizeof(error_buf), "local label too long at line %d", in->line); + return 0; + } + text_start = append_text_len(tmp, n); + if (text_start < 0) { + return 0; + } + out->kind = TOK_WORD; + out->start = text_start; + out->len = n; + out->line = in->line; + return 1; +} + +static int push_pool_token(struct Token tok) +{ + if (pool_used >= MAX_EXPAND) { + snprintf(error_buf, sizeof(error_buf), "expansion pool overflow"); + return 0; + } + expand_pool[pool_used++] = tok; + return 1; +} + +static int emit_newline(void) +{ + if (output_used + 1 >= MAX_OUTPUT) { + snprintf(error_buf, sizeof(error_buf), "output buffer overflow"); + return 0; + } + output_buf[output_used++] = '\n'; + output_need_space = 0; + return 1; +} + +static int emit_token(const struct Token *tok) +{ + if (output_need_space) { + if (output_used + 1 >= MAX_OUTPUT) { + snprintf(error_buf, sizeof(error_buf), "output buffer overflow"); + return 0; + } + output_buf[output_used++] = ' '; + } + if (output_used + tok->len >= MAX_OUTPUT) { + snprintf(error_buf, sizeof(error_buf), "output buffer overflow"); + return 0; + } + memcpy(output_buf + output_used, text_buf + tok->start, (size_t)tok->len); + output_used += tok->len; + output_need_space = 1; + return 1; +} + +static int push_stream(struct Token *toks, int count, int pool_mark) +{ + struct Stream *s; + + if (stream_top >= MAX_STACK) { + snprintf(error_buf, sizeof(error_buf), "stream stack overflow"); + return 0; + } + s = &streams[stream_top++]; + s->toks = toks; + s->count = count; + s->pos = 0; + s->line_start = 1; + s->pool_mark = pool_mark; + return 1; +} + +static void pop_stream(void) +{ + if (stream_top <= 0) { + return; + } + stream_top--; + if (streams[stream_top].pool_mark >= 0) { + pool_used = streams[stream_top].pool_mark; + } +} + +static int define_macro(struct Stream *s) +{ + struct Macro *m; + int i; + int line_start; + + if (macro_count >= MAX_MACROS) { + snprintf(error_buf, sizeof(error_buf), "too many macros"); + return 0; + } + m = &macros[macro_count]; + memset(m, 0, sizeof(*m)); + m->line = s->toks[s->pos].line; + s->pos++; + + if (s->pos >= s->count || s->toks[s->pos].kind != TOK_WORD) { + snprintf(error_buf, sizeof(error_buf), "expected macro name at line %d", m->line); + return 0; + } + m->name_start = s->toks[s->pos].start; + m->name_len = s->toks[s->pos].len; + for (i = 0; i < macro_count; i++) { + if (macros[i].name_len == m->name_len && + memcmp(text_buf + macros[i].name_start, + text_buf + m->name_start, (size_t)m->name_len) == 0) { + snprintf(error_buf, sizeof(error_buf), "duplicate macro at line %d", m->line); + return 0; + } + } + s->pos++; + + if (s->pos >= s->count || s->toks[s->pos].kind != TOK_LPAREN) { + snprintf(error_buf, sizeof(error_buf), "expected '(' after macro name at line %d", m->line); + return 0; + } + s->pos++; + + if (s->pos < s->count && s->toks[s->pos].kind != TOK_RPAREN) { + while (1) { + if (m->param_count >= MAX_PARAMS) { + snprintf(error_buf, sizeof(error_buf), "too many params on macro at line %d", m->line); + return 0; + } + if (s->pos >= s->count || s->toks[s->pos].kind != TOK_WORD) { + snprintf(error_buf, sizeof(error_buf), "expected parameter name at line %d", m->line); + return 0; + } + m->param_start[m->param_count] = s->toks[s->pos].start; + m->param_len[m->param_count] = s->toks[s->pos].len; + m->param_count++; + s->pos++; + if (s->pos < s->count && s->toks[s->pos].kind == TOK_COMMA) { + s->pos++; + continue; + } + break; + } + } + + if (s->pos >= s->count || s->toks[s->pos].kind != TOK_RPAREN) { + snprintf(error_buf, sizeof(error_buf), "expected ')' in macro header at line %d", m->line); + return 0; + } + s->pos++; + + if (s->pos >= s->count || s->toks[s->pos].kind != TOK_NEWLINE) { + snprintf(error_buf, sizeof(error_buf), "expected newline after macro header at line %d", m->line); + return 0; + } + s->pos++; + + line_start = 1; + while (s->pos < s->count) { + if (line_start && + s->toks[s->pos].kind == TOK_WORD && + token_text_eq(&s->toks[s->pos], "%endm")) { + while (s->pos < s->count && s->toks[s->pos].kind != TOK_NEWLINE) { + s->pos++; + } + if (s->pos < s->count && s->toks[s->pos].kind == TOK_NEWLINE) { + s->pos++; + } + s->line_start = 1; + macro_count++; + return 1; + } + if (m->body_count >= MAX_BODY_TOKENS) { + snprintf(error_buf, sizeof(error_buf), "macro body too large at line %d", m->line); + return 0; + } + m->body[m->body_count++] = s->toks[s->pos]; + line_start = (s->toks[s->pos].kind == TOK_NEWLINE); + s->pos++; + } + + snprintf(error_buf, sizeof(error_buf), "unterminated %%macro starting at line %d", m->line); + return 0; +} + +static int parse_args(struct Stream *s, int lparen_pos, + int arg_starts[MAX_PARAMS], int arg_ends[MAX_PARAMS], + int *arg_count, int *end_pos) +{ + int i = lparen_pos + 1; + int depth = 1; + int cur = 0; + + memset(arg_starts, 0, sizeof(int) * MAX_PARAMS); + memset(arg_ends, 0, sizeof(int) * MAX_PARAMS); + arg_starts[0] = i; + + while (i < s->count) { + if (s->toks[i].kind == TOK_LPAREN) { + depth++; + i++; + continue; + } + if (s->toks[i].kind == TOK_RPAREN) { + depth--; + if (depth == 0) { + arg_ends[cur] = i; + *arg_count = cur + 1; + *end_pos = i + 1; + if (arg_starts[0] == arg_ends[0] && cur == 0) { + *arg_count = 0; + } + return 1; + } + i++; + continue; + } + if (s->toks[i].kind == TOK_COMMA && depth == 1) { + arg_ends[cur] = i; + cur++; + if (cur >= MAX_PARAMS) { + snprintf(error_buf, sizeof(error_buf), "too many macro args at line %d", s->toks[i].line); + return 0; + } + arg_starts[cur] = i + 1; + i++; + continue; + } + i++; + } + + snprintf(error_buf, sizeof(error_buf), "unterminated macro call at line %d", s->toks[lparen_pos].line); + return 0; +} + +static int append_arg(struct Stream *s, int start, int end, int single) +{ + int i; + + if (start == end) { + snprintf(error_buf, sizeof(error_buf), "empty macro argument"); + return 0; + } + if (single && end - start != 1) { + snprintf(error_buf, sizeof(error_buf), "pasted or prefixed arg must be one token"); + return 0; + } + for (i = start; i < end; i++) { + if (!push_pool_token(s->toks[i])) { + return 0; + } + } + return 1; +} + +static int append_prefixed_arg(struct Stream *s, int start, int end, char prefix) +{ + char tmp[512]; + int n; + int text_start; + struct Token tok; + + if (end - start != 1) { + snprintf(error_buf, sizeof(error_buf), "prefixed arg must be one token"); + return 0; + } + n = snprintf(tmp, sizeof(tmp), "%c%.*s", + prefix, s->toks[start].len, text_buf + s->toks[start].start); + if (n < 0 || n >= (int)sizeof(tmp)) { + snprintf(error_buf, sizeof(error_buf), "prefixed arg too long"); + return 0; + } + text_start = append_text_len(tmp, n); + if (text_start < 0) { + return 0; + } + tok.kind = TOK_WORD; + tok.start = text_start; + tok.len = n; + tok.line = s->toks[start].line; + return push_pool_token(tok); +} + +static int paste_range(int start, int *count) +{ + int in = start; + int out = start; + int end = start + *count; + + while (in < end) { + if (expand_pool[in].kind == TOK_PASTE) { + char tmp[512]; + int n; + int text_start; + + if (out == start || in + 1 >= end) { + snprintf(error_buf, sizeof(error_buf), "misplaced ## at line %d", expand_pool[in].line); + return 0; + } + if (expand_pool[out - 1].kind == TOK_NEWLINE || + expand_pool[in + 1].kind == TOK_NEWLINE || + expand_pool[out - 1].kind == TOK_PASTE || + expand_pool[in + 1].kind == TOK_PASTE) { + snprintf(error_buf, sizeof(error_buf), "bad ## operand at line %d", expand_pool[in].line); + return 0; + } + + n = snprintf(tmp, sizeof(tmp), "%.*s%.*s", + expand_pool[out - 1].len, text_buf + expand_pool[out - 1].start, + expand_pool[in + 1].len, text_buf + expand_pool[in + 1].start); + if (n < 0 || n >= (int)sizeof(tmp)) { + snprintf(error_buf, sizeof(error_buf), "pasted token too long at line %d", expand_pool[in].line); + return 0; + } + text_start = append_text_len(tmp, n); + if (text_start < 0) { + return 0; + } + expand_pool[out - 1].kind = TOK_WORD; + expand_pool[out - 1].start = text_start; + expand_pool[out - 1].len = n; + in += 2; + continue; + } + if (out != in) { + expand_pool[out] = expand_pool[in]; + } + out++; + in++; + } + + *count = out - start; + return 1; +} + +static int expand_call(struct Stream *s, int macro_idx) +{ + struct Macro *m = &macros[macro_idx]; + int arg_starts[MAX_PARAMS]; + int arg_ends[MAX_PARAMS]; + int arg_count; + int end_pos; + int mark; + int count; + int local_id; + int i; + + if (s->pos + 1 >= s->count || s->toks[s->pos + 1].kind != TOK_LPAREN) { + snprintf(error_buf, sizeof(error_buf), "internal macro call error"); + return 0; + } + if (!parse_args(s, s->pos + 1, arg_starts, arg_ends, &arg_count, &end_pos)) { + return 0; + } + if (arg_count != m->param_count) { + snprintf(error_buf, sizeof(error_buf), + "macro '%.*s' expects %d args, got %d at line %d", + m->name_len, text_buf + m->name_start, + m->param_count, arg_count, s->toks[s->pos].line); + return 0; + } + + s->pos = end_pos; + s->line_start = 0; + mark = pool_used; + local_id = local_id_next++; + + for (i = 0; i < m->body_count; i++) { + int param_idx = find_param(m, &m->body[i]); + int pasted = 0; + char prefix = 0; + struct Token tok; + + if (param_idx >= 0) { + pasted = (i > 0 && m->body[i - 1].kind == TOK_PASTE) || + (i + 1 < m->body_count && m->body[i + 1].kind == TOK_PASTE); + if (!append_arg(s, arg_starts[param_idx], arg_ends[param_idx], pasted)) { + pool_used = mark; + return 0; + } + continue; + } + + param_idx = find_prefixed_param(m, &m->body[i], &prefix); + if (param_idx >= 0) { + if (!append_prefixed_arg(s, arg_starts[param_idx], arg_ends[param_idx], prefix)) { + pool_used = mark; + return 0; + } + continue; + } + + if (m->body[i].kind == TOK_PASTE) { + if (!push_pool_token(m->body[i])) { + pool_used = mark; + return 0; + } + continue; + } + + if (!rewrite_local(&m->body[i], local_id, &tok)) { + pool_used = mark; + return 0; + } + if (!push_pool_token(tok)) { + pool_used = mark; + return 0; + } + } + + count = pool_used - mark; + if (!paste_range(mark, &count)) { + pool_used = mark; + return 0; + } + pool_used = mark + count; + if (count == 0) { + return 1; + } + return push_stream(expand_pool + mark, count, mark); +} + +static int process_tokens(void) +{ + if (!push_stream(source_tokens, source_count, -1)) { + return 0; + } + + for (;;) { + struct Stream *s; + struct Token *tok; + int macro_idx; + + if (stream_top <= 0) { + break; + } + s = &streams[stream_top - 1]; + if (s->pos >= s->count) { + pop_stream(); + continue; + } + + tok = &s->toks[s->pos]; + + if (s->line_start && + tok->kind == TOK_WORD && + token_text_eq(tok, "%macro")) { + if (!define_macro(s)) { + return 0; + } + continue; + } + + if (tok->kind == TOK_NEWLINE) { + s->pos++; + s->line_start = 1; + if (!emit_newline()) { + return 0; + } + continue; + } + + macro_idx = find_macro(tok); + if (macro_idx >= 0 && s->pos + 1 < s->count && s->toks[s->pos + 1].kind == TOK_LPAREN) { + if (!expand_call(s, macro_idx)) { + return 0; + } + continue; + } + + s->pos++; + s->line_start = 0; + if (!emit_token(tok)) { + return 0; + } + } + + if (output_used >= MAX_OUTPUT) { + snprintf(error_buf, sizeof(error_buf), "output buffer overflow"); + return 0; + } + output_buf[output_used] = '\0'; + return 1; +} + +int main(int argc, char **argv) +{ + FILE *in; + FILE *out; + size_t nread; + + if (argc != 3) { + fprintf(stderr, "usage: %s input.M1 output.M1\n", argv[0]); + return 1; + } + + in = fopen(argv[1], "rb"); + if (in == NULL) { + perror(argv[1]); + return 1; + } + nread = fread(input_buf, 1, MAX_INPUT, in); + if (ferror(in)) { + perror(argv[1]); + fclose(in); + return 1; + } + fclose(in); + if (nread >= MAX_INPUT) { + fprintf(stderr, "input too large\n"); + return 1; + } + input_buf[nread] = '\0'; + + if (!lex_source(input_buf) || !process_tokens()) { + fprintf(stderr, "m1macro: %s\n", error_buf); + return 1; + } + + out = fopen(argv[2], "wb"); + if (out == NULL) { + perror(argv[2]); + return 1; + } + if (fwrite(output_buf, 1, (size_t)output_used, out) != (size_t)output_used) { + perror(argv[2]); + fclose(out); + return 1; + } + fclose(out); + return 0; +}