commit 13564bfed1d3e4f66117aeaf9a24138c72696475
parent 455667f35b962f3ef09f2aa17490c4afc458fc6b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 24 Apr 2026 12:06:51 -0700
Add libp1pp v1 utility library (p1/p1pp.P1pp)
Implements the contract from docs/LIBP1PP.md: control-flow macros
(%if_/ifelse_/while_/do_while_/for_lt/loop plus tagged variants with
%break/%continue), %fn function-definer using M1PP scopes, %assert_
macros, and C-level primitives for bytes/strings/ints, character
predicates, raw syscalls, print helpers, file helpers, bump allocator,
and panic. Targets P1v2-64 only. Verified with smoke tests on
aarch64, amd64, and riscv64.
Diffstat:
| A | p1/p1pp.P1pp | | | 1457 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 1457 insertions(+), 0 deletions(-)
diff --git a/p1/p1pp.P1pp b/p1/p1pp.P1pp
@@ -0,0 +1,1457 @@
+# p1pp.P1pp -- libp1pp v1, portable utility library for P1pp programs.
+#
+# Concatenated after the P1 backend header and frontend, and before user
+# source:
+#
+# catm P1-<arch>.M1pp P1.M1pp p1pp.P1pp usersrc.P1pp > program.M1
+#
+# Targets P1v2-64 only (WORD = 8). All internal labels use the
+# `libp1pp__` prefix; public entry points are unprefixed.
+#
+# See docs/LIBP1PP.md for the public contract.
+
+# =========================================================================
+# Control-flow macros
+# =========================================================================
+#
+# Every conditional block macro uses the same lowering, which works for
+# all seven P1 conditions (including LT, LTU, LTZ which have no inverted
+# branch): load a "take the body" target, branch to it on cc, then load
+# the end target and branch unconditionally past the body. The
+# %ifelse_ forms place the false block on that fall-through path, ahead
+# of the unconditional branch.
+
+# ---- %if_<cc> -----------------------------------------------------------
+
+# %if_eq(ra, rb, body): expand `body` so that it runs only when ra == rb.
+# %beq jumps to @body when the condition holds; otherwise control falls
+# into the unconditional %b() to @end, which sits just past `body`.
+%macro if_eq(ra, rb, body)
+%la_br() &@body
+%beq(ra, rb)
+%la_br() &@end
+%b()
+:@body
+body
+:@end
+%endm
+
+# %if_ne(ra, rb, body): expand `body` so that it runs only when ra != rb.
+# %bne jumps to @body when the condition holds; otherwise the
+# unconditional %b() skips to @end.
+%macro if_ne(ra, rb, body)
+%la_br() &@body
+%bne(ra, rb)
+%la_br() &@end
+%b()
+:@body
+body
+:@end
+%endm
+
+# %if_lt(ra, rb, body): expand `body` so that it runs only when the LT
+# condition holds for (ra, rb), i.e. when %blt(ra, rb) is taken.
+# Otherwise the unconditional %b() skips to @end.
+%macro if_lt(ra, rb, body)
+%la_br() &@body
+%blt(ra, rb)
+%la_br() &@end
+%b()
+:@body
+body
+:@end
+%endm
+
+# %if_ltu(ra, rb, body): expand `body` so that it runs only when the LTU
+# condition holds for (ra, rb), i.e. when %bltu(ra, rb) is taken.
+# Otherwise the unconditional %b() skips to @end.
+%macro if_ltu(ra, rb, body)
+%la_br() &@body
+%bltu(ra, rb)
+%la_br() &@end
+%b()
+:@body
+body
+:@end
+%endm
+
+# %if_eqz(ra, body): expand `body` so that it runs only when ra == 0
+# (%beqz taken). Otherwise the unconditional %b() skips to @end.
+%macro if_eqz(ra, body)
+%la_br() &@body
+%beqz(ra)
+%la_br() &@end
+%b()
+:@body
+body
+:@end
+%endm
+
+# %if_nez(ra, body): expand `body` so that it runs only when ra != 0
+# (%bnez taken). Otherwise the unconditional %b() skips to @end.
+%macro if_nez(ra, body)
+%la_br() &@body
+%bnez(ra)
+%la_br() &@end
+%b()
+:@body
+body
+:@end
+%endm
+
+# %if_ltz(ra, body): expand `body` so that it runs only when the LTZ
+# condition holds for ra (%bltz taken). Otherwise the unconditional
+# %b() skips to @end.
+%macro if_ltz(ra, body)
+%la_br() &@body
+%bltz(ra)
+%la_br() &@end
+%b()
+:@body
+body
+:@end
+%endm
+
+# ---- %ifelse_<cc> -------------------------------------------------------
+
+# %ifelse_eq(ra, rb, tblk, fblk): run `tblk` when ra == rb, else `fblk`.
+# The false block is laid out first, on the fall-through path of %beq,
+# and is followed by a jump to @end; the true block sits at @tblk.
+%macro ifelse_eq(ra, rb, tblk, fblk)
+%la_br() &@tblk
+%beq(ra, rb)
+fblk
+%la_br() &@end
+%b()
+:@tblk
+tblk
+:@end
+%endm
+
+# %ifelse_ne(ra, rb, tblk, fblk): run `tblk` when ra != rb, else `fblk`.
+# `fblk` is on the fall-through path of %bne and ends with a jump to
+# @end; `tblk` sits at @tblk.
+%macro ifelse_ne(ra, rb, tblk, fblk)
+%la_br() &@tblk
+%bne(ra, rb)
+fblk
+%la_br() &@end
+%b()
+:@tblk
+tblk
+:@end
+%endm
+
+# %ifelse_lt(ra, rb, tblk, fblk): run `tblk` when %blt(ra, rb) is taken,
+# else `fblk`. `fblk` is on the fall-through path and ends with a jump
+# to @end; `tblk` sits at @tblk.
+%macro ifelse_lt(ra, rb, tblk, fblk)
+%la_br() &@tblk
+%blt(ra, rb)
+fblk
+%la_br() &@end
+%b()
+:@tblk
+tblk
+:@end
+%endm
+
+# %ifelse_ltu(ra, rb, tblk, fblk): run `tblk` when %bltu(ra, rb) is
+# taken, else `fblk`. `fblk` is on the fall-through path and ends with a
+# jump to @end; `tblk` sits at @tblk.
+%macro ifelse_ltu(ra, rb, tblk, fblk)
+%la_br() &@tblk
+%bltu(ra, rb)
+fblk
+%la_br() &@end
+%b()
+:@tblk
+tblk
+:@end
+%endm
+
+# %ifelse_eqz(ra, tblk, fblk): run `tblk` when ra == 0, else `fblk`.
+# `fblk` is on the fall-through path of %beqz and ends with a jump to
+# @end; `tblk` sits at @tblk.
+%macro ifelse_eqz(ra, tblk, fblk)
+%la_br() &@tblk
+%beqz(ra)
+fblk
+%la_br() &@end
+%b()
+:@tblk
+tblk
+:@end
+%endm
+
+# %ifelse_nez(ra, tblk, fblk): run `tblk` when ra != 0, else `fblk`.
+# `fblk` is on the fall-through path of %bnez and ends with a jump to
+# @end; `tblk` sits at @tblk.
+%macro ifelse_nez(ra, tblk, fblk)
+%la_br() &@tblk
+%bnez(ra)
+fblk
+%la_br() &@end
+%b()
+:@tblk
+tblk
+:@end
+%endm
+
+# %ifelse_ltz(ra, tblk, fblk): run `tblk` when %bltz(ra) is taken, else
+# `fblk`. `fblk` is on the fall-through path and ends with a jump to
+# @end; `tblk` sits at @tblk.
+%macro ifelse_ltz(ra, tblk, fblk)
+%la_br() &@tblk
+%bltz(ra)
+fblk
+%la_br() &@end
+%b()
+:@tblk
+tblk
+:@end
+%endm
+
+# ---- %while_<cc> -------------------------------------------------------
+#
+# Jump-to-test layout: the body runs iff the positive-sense test holds,
+# and the test is compiled below the body so we only emit a forward
+# branch at entry.
+
+# %while_eq(ra, rb, body): repeat `body` while ra == rb, testing before
+# the first iteration. Entry jumps to @test below the body; the %beq
+# there branches back up to @body while the condition holds and falls
+# out of the macro once it does not.
+%macro while_eq(ra, rb, body)
+%la_br() &@test
+%b()
+:@body
+body
+:@test
+%la_br() &@body
+%beq(ra, rb)
+%endm
+
+# %while_ne(ra, rb, body): repeat `body` while ra != rb, testing before
+# the first iteration. Entry jumps to @test; %bne there branches back
+# to @body while the condition holds.
+%macro while_ne(ra, rb, body)
+%la_br() &@test
+%b()
+:@body
+body
+:@test
+%la_br() &@body
+%bne(ra, rb)
+%endm
+
+# %while_lt(ra, rb, body): repeat `body` while %blt(ra, rb) is taken,
+# testing before the first iteration. Entry jumps to @test; the branch
+# there goes back to @body while the condition holds.
+%macro while_lt(ra, rb, body)
+%la_br() &@test
+%b()
+:@body
+body
+:@test
+%la_br() &@body
+%blt(ra, rb)
+%endm
+
+# %while_ltu(ra, rb, body): repeat `body` while %bltu(ra, rb) is taken,
+# testing before the first iteration. Entry jumps to @test; the branch
+# there goes back to @body while the condition holds.
+%macro while_ltu(ra, rb, body)
+%la_br() &@test
+%b()
+:@body
+body
+:@test
+%la_br() &@body
+%bltu(ra, rb)
+%endm
+
+# %while_eqz(ra, body): repeat `body` while ra == 0, testing before the
+# first iteration. Entry jumps to @test; %beqz there branches back to
+# @body while the condition holds.
+%macro while_eqz(ra, body)
+%la_br() &@test
+%b()
+:@body
+body
+:@test
+%la_br() &@body
+%beqz(ra)
+%endm
+
+# %while_nez(ra, body): repeat `body` while ra != 0, testing before the
+# first iteration. Entry jumps to @test; %bnez there branches back to
+# @body while the condition holds.
+%macro while_nez(ra, body)
+%la_br() &@test
+%b()
+:@body
+body
+:@test
+%la_br() &@body
+%bnez(ra)
+%endm
+
+# %while_ltz(ra, body): repeat `body` while %bltz(ra) is taken, testing
+# before the first iteration. Entry jumps to @test; the branch there
+# goes back to @body while the condition holds.
+%macro while_ltz(ra, body)
+%la_br() &@test
+%b()
+:@body
+body
+:@test
+%la_br() &@body
+%bltz(ra)
+%endm
+
+# ---- %do_while_<cc> ----------------------------------------------------
+
+# %do_while_eq(ra, rb, body): run `body` once, then repeat it for as
+# long as ra == rb when tested after each pass.
+%macro do_while_eq(ra, rb, body)
+:@body
+body
+%la_br() &@body
+%beq(ra, rb)
+%endm
+
+# %do_while_ne(ra, rb, body): run `body` once, then repeat it for as
+# long as ra != rb when tested after each pass.
+%macro do_while_ne(ra, rb, body)
+:@body
+body
+%la_br() &@body
+%bne(ra, rb)
+%endm
+
+# %do_while_lt(ra, rb, body): run `body` once, then repeat it for as
+# long as %blt(ra, rb) is taken after each pass.
+%macro do_while_lt(ra, rb, body)
+:@body
+body
+%la_br() &@body
+%blt(ra, rb)
+%endm
+
+# %do_while_ltu(ra, rb, body): run `body` once, then repeat it for as
+# long as %bltu(ra, rb) is taken after each pass.
+%macro do_while_ltu(ra, rb, body)
+:@body
+body
+%la_br() &@body
+%bltu(ra, rb)
+%endm
+
+# %do_while_eqz(ra, body): run `body` once, then repeat it for as long
+# as ra == 0 when tested after each pass.
+%macro do_while_eqz(ra, body)
+:@body
+body
+%la_br() &@body
+%beqz(ra)
+%endm
+
+# %do_while_nez(ra, body): run `body` once, then repeat it for as long
+# as ra != 0 when tested after each pass.
+%macro do_while_nez(ra, body)
+:@body
+body
+%la_br() &@body
+%bnez(ra)
+%endm
+
+# %do_while_ltz(ra, body): run `body` once, then repeat it for as long
+# as %bltz(ra) is taken after each pass.
+%macro do_while_ltz(ra, body)
+:@body
+body
+%la_br() &@body
+%bltz(ra)
+%endm
+
+# ---- %for_lt ------------------------------------------------------------
+
+# %for_lt(i_reg, n_reg, body): counted loop, equivalent to
+#   i_reg = 0; while (%blt(i_reg, n_reg)) { body; i_reg += 1 }
+# i_reg is overwritten. n_reg is read again by %blt on every test, so a
+# `body` that changes n_reg changes the bound. The test runs before the
+# first iteration, so the body is skipped entirely when the first %blt
+# is not taken.
+%macro for_lt(i_reg, n_reg, body)
+%li(i_reg) $(0)
+%la_br() &@test
+%b()
+:@body
+body
+%addi(i_reg, i_reg, 1)
+:@test
+%la_br() &@body
+%blt(i_reg, n_reg)
+%endm
+
+# ---- %loop --------------------------------------------------------------
+
+# %loop(body): repeat `body` forever. The macro emits only the
+# back-branch to @top and no exit label, so leaving the loop is entirely
+# up to `body` (for example by returning or branching elsewhere).
+%macro loop(body)
+:@top
+body
+%la_br() &@top
+%b()
+%endm
+
+# ---- Tagged loops -------------------------------------------------------
+#
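+# Each tagged form emits the global labels `tag_top` and `tag_end`
+# (%for_lt_tag additionally emits `tag_test`), built by `##` paste so
+# references cross every macro boundary cleanly.
+# `%break(tag)` jumps to `tag_end`; `%continue(tag)` jumps to `tag_top`,
+# which marks the body start in %loop_tag, the condition test in the
+# %while_tag_ forms, and the increment in %for_lt_tag.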
+
+# %loop_tag(tag, body): infinite loop with exits. Emits the global label
+# `tag_top` at the start of `body` (target of %continue(tag)) and
+# `tag_end` just past the back-branch (target of %break(tag)). Because
+# the labels are global, each tag may be used for one loop only.
+%macro loop_tag(tag, body)
+: ## tag ## _top
+body
+%la_br() & ## tag ## _top
+%b()
+: ## tag ## _end
+%endm
+
+# %while_tag_eq(tag, ra, rb, body): repeat `body` while ra == rb, tested
+# before the first iteration. `tag_top` labels the test, so
+# %continue(tag) re-evaluates the condition; `tag_end` follows the loop
+# and is the %break(tag) target.
+%macro while_tag_eq(tag, ra, rb, body)
+%la_br() & ## tag ## _top
+%b()
+:@body
+body
+: ## tag ## _top
+%la_br() &@body
+%beq(ra, rb)
+: ## tag ## _end
+%endm
+
+# %while_tag_ne(tag, ra, rb, body): repeat `body` while ra != rb, tested
+# before the first iteration. `tag_top` labels the test (target of
+# %continue(tag)); `tag_end` follows the loop (target of %break(tag)).
+%macro while_tag_ne(tag, ra, rb, body)
+%la_br() & ## tag ## _top
+%b()
+:@body
+body
+: ## tag ## _top
+%la_br() &@body
+%bne(ra, rb)
+: ## tag ## _end
+%endm
+
+# %while_tag_lt(tag, ra, rb, body): repeat `body` while %blt(ra, rb) is
+# taken, tested before the first iteration. `tag_top` labels the test
+# (target of %continue(tag)); `tag_end` follows the loop (target of
+# %break(tag)).
+%macro while_tag_lt(tag, ra, rb, body)
+%la_br() & ## tag ## _top
+%b()
+:@body
+body
+: ## tag ## _top
+%la_br() &@body
+%blt(ra, rb)
+: ## tag ## _end
+%endm
+
+# %while_tag_ltu(tag, ra, rb, body): repeat `body` while %bltu(ra, rb)
+# is taken, tested before the first iteration. `tag_top` labels the
+# test (target of %continue(tag)); `tag_end` follows the loop (target
+# of %break(tag)).
+%macro while_tag_ltu(tag, ra, rb, body)
+%la_br() & ## tag ## _top
+%b()
+:@body
+body
+: ## tag ## _top
+%la_br() &@body
+%bltu(ra, rb)
+: ## tag ## _end
+%endm
+
+# %while_tag_eqz(tag, ra, body): repeat `body` while ra == 0, tested
+# before the first iteration. `tag_top` labels the test (target of
+# %continue(tag)); `tag_end` follows the loop (target of %break(tag)).
+%macro while_tag_eqz(tag, ra, body)
+%la_br() & ## tag ## _top
+%b()
+:@body
+body
+: ## tag ## _top
+%la_br() &@body
+%beqz(ra)
+: ## tag ## _end
+%endm
+
+# %while_tag_nez(tag, ra, body): repeat `body` while ra != 0, tested
+# before the first iteration. `tag_top` labels the test (target of
+# %continue(tag)); `tag_end` follows the loop (target of %break(tag)).
+%macro while_tag_nez(tag, ra, body)
+%la_br() & ## tag ## _top
+%b()
+:@body
+body
+: ## tag ## _top
+%la_br() &@body
+%bnez(ra)
+: ## tag ## _end
+%endm
+
+# %while_tag_ltz(tag, ra, body): repeat `body` while %bltz(ra) is taken,
+# tested before the first iteration. `tag_top` labels the test (target
+# of %continue(tag)); `tag_end` follows the loop (target of
+# %break(tag)).
+%macro while_tag_ltz(tag, ra, body)
+%la_br() & ## tag ## _top
+%b()
+:@body
+body
+: ## tag ## _top
+%la_br() &@body
+%bltz(ra)
+: ## tag ## _end
+%endm
+
+# %for_lt_tag(tag, i_reg, n_reg, body): counted loop from i_reg = 0
+# while %blt(i_reg, n_reg), with exits. Three global labels are
+# emitted:
+#   tag_top  - the increment, so %continue(tag) still advances i_reg
+#              before the next test;
+#   tag_test - the comparison, jumped to on entry so the first test
+#              happens without an increment;
+#   tag_end  - just past the loop, the %break(tag) target.
+%macro for_lt_tag(tag, i_reg, n_reg, body)
+%li(i_reg) $(0)
+%la_br() & ## tag ## _test
+%b()
+:@body
+body
+: ## tag ## _top
+%addi(i_reg, i_reg, 1)
+: ## tag ## _test
+%la_br() &@body
+%blt(i_reg, n_reg)
+: ## tag ## _end
+%endm
+
+# %break(tag): unconditional jump to the global label `tag_end` emitted
+# by the tagged loop macro that was given the same tag.
+%macro break(tag)
+%la_br() & ## tag ## _end
+%b()
+%endm
+
+# %continue(tag): unconditional jump to the global label `tag_top`
+# emitted by the tagged loop macro that was given the same tag.
+%macro continue(tag)
+%la_br() & ## tag ## _top
+%b()
+%endm
+
+# =========================================================================
+# %fn -- scope-introducing function definition
+# =========================================================================
+#
+# Pushes a scope named after the function so `::foo` inside the body
+# mangles to `name__foo`. The body is bracketed by %enter(size) and
+# %eret(), so functions defined with %fn always carry a standard frame.
+
+# %fn(name, size, body): define a function.
+#   name - pasted after `:` to form the global entry label, and also used
+#          as the scope name for the duration of `body`.
+#   size - passed unchanged to %enter() as the frame size.
+#   body - the function's code; control that reaches the end of `body`
+#          runs %eret(). The %fn users in this file restore their saved
+#          registers as the last thing in `body` for that reason.
+%macro fn(name, size, body)
+: ## name
+%scope name
+%enter(size)
+body
+%eret()
+%endscope
+%endm
+
+# =========================================================================
+# %assert_<cc> macros
+# =========================================================================
+#
+# Branch past the panic call when the condition holds; otherwise fall
+# through to `LA a0, msg; LA_BR &panic; CALL`. Each assert requires the
+# enclosing function to have an established frame.
+
+# %assert_eq(ra, rb, msg): continue at @done when ra == rb; otherwise
+# load the address of `msg` into a0 and call panic. `msg` is pasted
+# after `&`, so it must be a label name (panic prints it as a C string).
+%macro assert_eq(ra, rb, msg)
+%la_br() &@done
+%beq(ra, rb)
+%la(a0) & ## msg
+%la_br() &panic
+%call()
+:@done
+%endm
+
+# %assert_ne(ra, rb, msg): continue at @done when ra != rb; otherwise
+# load the address of label `msg` into a0 and call panic.
+%macro assert_ne(ra, rb, msg)
+%la_br() &@done
+%bne(ra, rb)
+%la(a0) & ## msg
+%la_br() &panic
+%call()
+:@done
+%endm
+
+# %assert_lt(ra, rb, msg): continue at @done when %blt(ra, rb) is taken;
+# otherwise load the address of label `msg` into a0 and call panic.
+%macro assert_lt(ra, rb, msg)
+%la_br() &@done
+%blt(ra, rb)
+%la(a0) & ## msg
+%la_br() &panic
+%call()
+:@done
+%endm
+
+# %assert_ltu(ra, rb, msg): continue at @done when %bltu(ra, rb) is
+# taken; otherwise load the address of label `msg` into a0 and call
+# panic.
+%macro assert_ltu(ra, rb, msg)
+%la_br() &@done
+%bltu(ra, rb)
+%la(a0) & ## msg
+%la_br() &panic
+%call()
+:@done
+%endm
+
+# %assert_eqz(ra, msg): continue at @done when ra == 0; otherwise load
+# the address of label `msg` into a0 and call panic.
+%macro assert_eqz(ra, msg)
+%la_br() &@done
+%beqz(ra)
+%la(a0) & ## msg
+%la_br() &panic
+%call()
+:@done
+%endm
+
+# %assert_nez(ra, msg): continue at @done when ra != 0; otherwise load
+# the address of label `msg` into a0 and call panic.
+%macro assert_nez(ra, msg)
+%la_br() &@done
+%bnez(ra)
+%la(a0) & ## msg
+%la_br() &panic
+%call()
+:@done
+%endm
+
+# %assert_ltz(ra, msg): continue at @done when %bltz(ra) is taken;
+# otherwise load the address of label `msg` into a0 and call panic.
+%macro assert_ltz(ra, msg)
+%la_br() &@done
+%bltz(ra)
+%la(a0) & ## msg
+%la_br() &panic
+%call()
+:@done
+%endm
+
+# =========================================================================
+# Memory and strings
+# =========================================================================
+
+# memcpy(dst=a0, src=a1, n=a2) -> dst (a0)
+#
+# Leaf; no frame. Byte-by-byte forward copy of n bytes from src to dst.
+# Because the copy runs from low to high addresses, a destination that
+# starts inside the source range (src < dst < src + n) is not supported.
+# Registers: t0 = byte index, a3 = saved dst, t1/t2 = scratch.
+:memcpy
+%scope memcpy
+%li(t0) $(0)
+%mov(a3, a0)
+::next_byte
+# index reached n: everything has been copied
+%la_br() &::finished
+%beq(t0, a2)
+# t2 = &dst[i], t1 = src[i]
+%add(t2, a3, t0)
+%add(t1, a1, t0)
+%lb(t1, t1, 0)
+%sb(t1, t2, 0)
+%addi(t0, t0, 1)
+%la_br() &::next_byte
+%b()
+::finished
+%mov(a0, a3)
+%ret()
+%endscope
+
+# memset(dst=a0, byte=a1, n=a2) -> dst (a0)
+#
+# Leaf; no frame. Stores a1 with %sb into each of the n bytes starting
+# at dst. Registers: t0 = byte index, a3 = saved dst, t1 = scratch.
+:memset
+%scope memset
+%li(t0) $(0)
+%mov(a3, a0)
+::fill
+# index reached n: the whole range has been written
+%la_br() &::filled
+%beq(t0, a2)
+%add(t1, a3, t0)
+%sb(a1, t1, 0)
+%addi(t0, t0, 1)
+%la_br() &::fill
+%b()
+::filled
+%mov(a0, a3)
+%ret()
+%endscope
+
+# memcmp(a=a0, b=a1, n=a2) -> -1/0/1 (a0)
+#
+# Leaf; no frame. Compares the two ranges byte by byte with %bltu and
+# returns at the first difference: -1 when the byte from a is lower,
+# 1 when it is higher. Returns 0 when all n bytes match (including
+# n == 0). Registers: t0 = byte index, t1 = a[i], t2 = b[i].
+:memcmp
+%scope memcmp
+%li(t0) $(0)
+::scan
+%la_br() &::equal
+%beq(t0, a2)
+%add(t1, a0, t0)
+%add(t2, a1, t0)
+%lb(t1, t1, 0)
+%lb(t2, t2, 0)
+%la_br() &::less
+%bltu(t1, t2)
+%la_br() &::greater
+%bltu(t2, t1)
+%addi(t0, t0, 1)
+%la_br() &::scan
+%b()
+::equal
+%li(a0) $(0)
+%ret()
+::less
+%li(a0) $(-1)
+%ret()
+::greater
+%li(a0) $(1)
+%ret()
+%endscope
+
+# strlen(cstr=a0) -> n (a0)
+#
+# Leaf; no frame. Walks a1 forward from cstr until it points at a zero
+# byte, then returns the distance walked. Clobbers a1 and t0.
+:strlen
+%scope strlen
+%mov(a1, a0)
+::scan
+%lb(t0, a1, 0)
+%la_br() &::at_nul
+%beqz(t0)
+%addi(a1, a1, 1)
+%la_br() &::scan
+%b()
+::at_nul
+# length = address of terminator - start
+%sub(a0, a1, a0)
+%ret()
+%endscope
+
+# streq(a=a0, b=a1) -> 0 or 1
+#
+# Leaf; no frame. Advances both pointers in lockstep. Returns 0 at the
+# first position where the bytes differ, and 1 when a zero byte is
+# reached in a with every byte up to and including it matching b.
+:streq
+%scope streq
+::step
+%lb(t1, a1, 0)
+%lb(t0, a0, 0)
+%la_br() &::differ
+%bne(t0, t1)
+# bytes are equal here, so t0 == 0 means both strings ended together
+%la_br() &::same
+%beqz(t0)
+%addi(a1, a1, 1)
+%addi(a0, a0, 1)
+%la_br() &::step
+%b()
+::same
+%li(a0) $(1)
+%ret()
+::differ
+%li(a0) $(0)
+%ret()
+%endscope
+
+# strcmp(a=a0, b=a1) -> -1/0/1
+#
+# Leaf; no frame. Compares the strings byte by byte with %bltu. Returns
+# -1 or 1 at the first differing byte (a's byte lower / higher), or 0
+# when a zero byte is reached with no difference found.
+:strcmp
+%scope strcmp
+::step
+%lb(t1, a1, 0)
+%lb(t0, a0, 0)
+%la_br() &::less
+%bltu(t0, t1)
+%la_br() &::greater
+%bltu(t1, t0)
+# neither is lower, so the bytes are equal; a zero byte ends both
+%la_br() &::equal
+%beqz(t0)
+%addi(a1, a1, 1)
+%addi(a0, a0, 1)
+%la_br() &::step
+%b()
+::equal
+%li(a0) $(0)
+%ret()
+::less
+%li(a0) $(-1)
+%ret()
+::greater
+%li(a0) $(1)
+%ret()
+%endscope
+
+# =========================================================================
+# Integer parsing and formatting
+# =========================================================================
+
+# parse_dec(buf=a0, len=a1) -> (value=a0, consumed=a1)
+#
+# Parses an optional '-' followed by decimal digits from buf[0..len).
+# Returns the value in a0 and the number of bytes consumed (sign
+# included) in a1. Returns (0, 0) when no digit was found, including the
+# case of a lone '-'. The accumulator is not checked for overflow.
+#
+# Uses an 8-byte frame slot to save buf_start; all hot-loop state lives
+# in caller-saved registers:
+#   a2 = read cursor          a3 = buf + len (one past the end)
+#   a1 = position of the first digit (after any sign)
+#   t0 = 1 if a leading '-' was seen, else 0
+#   t1 = magnitude accumulated so far
+#   t2, a0 = scratch
+:parse_dec
+%scope parse_dec
+%enter(8)
+%st(a0, sp, 0)
+%add(a3, a0, a1)
+%mov(a2, a0)
+%li(t0) $(0)
+%li(t1) $(0)
+
+# Optional sign: consumed only when the input is non-empty and the first
+# byte is 45 ('-').
+%la_br() &::after_sign
+%beq(a2, a3)
+%lb(t2, a2, 0)
+%addi(t2, t2, -45)
+%la_br() &::after_sign
+%bnez(t2)
+%li(t0) $(1)
+%addi(a2, a2, 1)
+
+::after_sign
+%mov(a1, a2)
+
+# Digit loop: t2 = byte - 48; stop at end of input or when t2 falls
+# outside 0..9. Otherwise t1 = t1 * 10 + t2.
+::digit_loop
+%la_br() &::digits_done
+%beq(a2, a3)
+%lb(t2, a2, 0)
+%addi(t2, t2, -48)
+%la_br() &::digits_done
+%bltz(t2)
+%li(a0) $(9)
+%la_br() &::digits_done
+%bltu(a0, t2)
+%li(a0) $(10)
+%mul(t1, t1, a0)
+%add(t1, t1, t2)
+%addi(a2, a2, 1)
+%la_br() &::digit_loop
+%b()
+
+# Cursor still at the first-digit position means nothing was parsed.
+::digits_done
+%la_br() &::no_digits
+%beq(a2, a1)
+
+%la_br() &::apply_sign
+%bnez(t0)
+%la_br() &::compute_return
+%b()
+# Negate: t1 = 0 - t1.
+::apply_sign
+%li(a0) $(0)
+%sub(t1, a0, t1)
+
+# consumed = cursor - saved buf_start; value moves into a0 last because
+# a0 is needed for the reload.
+::compute_return
+%ld(a0, sp, 0)
+%sub(a1, a2, a0)
+%mov(a0, t1)
+%eret()
+
+::no_digits
+%li(a0) $(0)
+%li(a1) $(0)
+%eret()
+%endscope
+
+# parse_hex(buf=a0, len=a1) -> (value=a0, consumed=a1)
+#
+# Parses hexadecimal digits (0-9, a-f, A-F) from buf[0..len), stopping at
+# the first other byte or at the end of input. No sign or prefix is
+# recognised by this code. Returns the value in a0 and the number of
+# bytes consumed in a1, or (0, 0) when the first byte is not a hex
+# digit. The accumulator is shifted left 4 bits per digit with no
+# overflow check.
+#
+# Registers:
+#   a2 = read cursor          a3 = buf + len (one past the end)
+#   a1 = start position (compared against the cursor at ::done)
+#   t1 = value accumulated so far
+#   t2 = current byte, t0 = its digit value, a0 = scratch bound
+:parse_hex
+%scope parse_hex
+%enter(8)
+%st(a0, sp, 0)
+%add(a3, a0, a1)
+%mov(a2, a0)
+%li(t1) $(0)
+%mov(a1, a2)
+
+::loop
+%la_br() &::done
+%beq(a2, a3)
+%lb(t2, a2, 0)
+
+# '0'..'9': t0 = byte - 48 must lie in 0..9.
+%addi(t0, t2, -48)
+%la_br() &::check_lower
+%bltz(t0)
+%li(a0) $(9)
+%la_br() &::check_lower
+%bltu(a0, t0)
+%la_br() &::accept
+%b()
+
+# 'a'..'f': t0 = byte - 97 must lie in 0..5; digit value is t0 + 10.
+::check_lower
+%addi(t0, t2, -97)
+%la_br() &::check_upper
+%bltz(t0)
+%li(a0) $(5)
+%la_br() &::check_upper
+%bltu(a0, t0)
+%addi(t0, t0, 10)
+%la_br() &::accept
+%b()
+
+# 'A'..'F': t0 = byte - 65 must lie in 0..5; anything else ends the
+# parse.
+::check_upper
+%addi(t0, t2, -65)
+%la_br() &::done
+%bltz(t0)
+%li(a0) $(5)
+%la_br() &::done
+%bltu(a0, t0)
+%addi(t0, t0, 10)
+
+# t1 = (t1 << 4) | digit
+::accept
+%shli(t1, t1, 4)
+%or(t1, t1, t0)
+%addi(a2, a2, 1)
+%la_br() &::loop
+%b()
+
+# Cursor still at the start means nothing was parsed.
+::done
+%la_br() &::no_digits
+%beq(a2, a1)
+%ld(a0, sp, 0)
+%sub(a1, a2, a0)
+%mov(a0, t1)
+%eret()
+
+::no_digits
+%li(a0) $(0)
+%li(a1) $(0)
+%eret()
+%endscope
+
+# fmt_dec(buf=a0, value=a1) -> n_bytes (a0)
+#
+# Writes the decimal text of value to buf (a leading '-' for negative
+# values, no terminator) and returns the number of bytes written.
+#
+# Unified signed formatting: digits are written from the per-iteration
+# `value % 10`, negated when value is negative. This avoids the
+# INT_MIN-overflow trap that `value = -value` would hit.
+# NOTE(review): both loops rely on %div rounding toward zero and %rem
+# giving a non-positive result for a negative dividend -- confirm
+# against the P1 specification.
+#
+# Registers:
+#   a0 = write position of the first digit (buf, or buf + 1 after '-')
+#   a2 = digit count          a3 = walks down from a0 + a2
+#   t1 = 10                   t0, t2 = scratch
+# The original buf is kept in the 8-byte frame slot for the final length
+# computation.
+:fmt_dec
+%scope fmt_dec
+%enter(8)
+%st(a0, sp, 0)
+
+# Negative value: emit '-' (45) and move the digit start past it.
+%la_br() &::is_neg
+%bltz(a1)
+%la_br() &::count
+%b()
+::is_neg
+%li(t0) $(45)
+%sb(t0, a0, 0)
+%addi(a0, a0, 1)
+
+# Count digits: a2 starts at 1 and grows once per division by 10 that
+# leaves a non-zero quotient.
+::count
+%mov(t0, a1)
+%li(a2) $(1)
+%li(t1) $(10)
+::count_loop
+%div(t0, t0, t1)
+%la_br() &::count_done
+%beqz(t0)
+%addi(a2, a2, 1)
+%la_br() &::count_loop
+%b()
+::count_done
+
+%add(a3, a0, a2)
+
+# Emit digits least-significant first, filling the field backwards.
+::dig_loop
+%addi(a3, a3, -1)
+%rem(t0, a1, t1)
+%la_br() &::neg_digit
+%bltz(t0)
+%la_br() &::write_digit
+%b()
+# Remainder of a negative value: flip it to its magnitude.
+::neg_digit
+%li(t2) $(0)
+%sub(t0, t2, t0)
+::write_digit
+%addi(t0, t0, 48)
+%sb(t0, a3, 0)
+%div(a1, a1, t1)
+%la_br() &::dig_loop
+%bnez(a1)
+
+# n_bytes = (digit start + digit count) - original buf
+%ld(t2, sp, 0)
+%add(a0, a0, a2)
+%sub(a0, a0, t2)
+%eret()
+%endscope
+
+# fmt_hex(buf=a0, value=a1) -> n_bytes (a0)
+#
+# Writes value to buf as lowercase hexadecimal with no prefix, no
+# leading zeros and no terminator, and returns the number of bytes
+# written. A value of 0 produces the single byte '0'.
+#
+# Registers:
+#   a2 = digit count          a3 = walks down from buf + a2
+#   t0 = scratch / current nibble, t1 = 10, t2 = reloaded buf
+:fmt_hex
+%scope fmt_hex
+%enter(8)
+%st(a0, sp, 0)
+
+# Zero is handled up front: one '0' (48), length 1.
+%la_br() &::has_bits
+%bnez(a1)
+%li(t0) $(48)
+%sb(t0, a0, 0)
+%li(a0) $(1)
+%eret()
+
+# Count nibbles: one per 4-bit shift until nothing is left.
+::has_bits
+%li(a2) $(0)
+%mov(t0, a1)
+::measure
+%addi(a2, a2, 1)
+%shri(t0, t0, 4)
+%la_br() &::measure
+%bnez(t0)
+
+%add(a3, a0, a2)
+
+# Emit nibbles least-significant first, filling the field backwards.
+::emit
+%li(t1) $(10)
+%andi(t0, a1, 15)
+%addi(a3, a3, -1)
+%la_br() &::decimal
+%bltu(t0, t1)
+# 10..15 -> 'a'..'f': nibble - 10 + 97
+%addi(t0, t0, 87)
+%la_br() &::store
+%b()
+# 0..9 -> '0'..'9'
+::decimal
+%addi(t0, t0, 48)
+::store
+%sb(t0, a3, 0)
+%shri(a1, a1, 4)
+%la_br() &::emit
+%bnez(a1)
+
+# n_bytes = (buf + digit count) - original buf
+%ld(t2, sp, 0)
+%add(a0, a0, a2)
+%sub(a0, a0, t2)
+%eret()
+%endscope
+
+# =========================================================================
+# Character predicates
+# =========================================================================
+
+# is_digit(c=a0) -> 0 or 1
+#
+# Leaf; no frame. Returns 1 when c is in 48..57 ('0'..'9'). The range is
+# tested with one %bltu on c - 48 against 10. Clobbers t0 and t1.
+:is_digit
+%scope is_digit
+%li(t1) $(10)
+%addi(t0, a0, -48)
+%li(a0) $(1)
+%la_br() &::in_range
+%bltu(t0, t1)
+%li(a0) $(0)
+::in_range
+%ret()
+%endscope
+
+# is_hex_digit(c=a0) -> 0 or 1
+#
+# Leaf; no frame. Returns 1 when c is '0'..'9', 'a'..'f' or 'A'..'F'.
+# Each range is tested with one %bltu on (c - first) against the range
+# width. The answer is built in t2 and moved to a0 at the end.
+:is_hex_digit
+%scope is_hex_digit
+%addi(t0, a0, -48)
+%li(t1) $(10)
+%li(t2) $(1)
+%la_br() &::answer
+%bltu(t0, t1)
+# both letter ranges are 6 wide, so t1 = 6 serves the next two tests
+%li(t1) $(6)
+%addi(t0, a0, -97)
+%la_br() &::answer
+%bltu(t0, t1)
+%addi(t0, a0, -65)
+%la_br() &::answer
+%bltu(t0, t1)
+%li(t2) $(0)
+::answer
+%mov(a0, t2)
+%ret()
+%endscope
+
+# is_space(c=a0) -> 0 or 1
+#
+# Leaf; no frame. Returns 1 when c is 32 (space) or in 9..13, and 0
+# otherwise. The answer is built in t2 and moved to a0 at the end.
+:is_space
+%scope is_space
+%addi(t0, a0, -32)
+%li(t2) $(1)
+%la_br() &::answer
+%beqz(t0)
+# 9..13: (c - 9) below 5
+%li(t1) $(5)
+%addi(t0, a0, -9)
+%la_br() &::answer
+%bltu(t0, t1)
+%li(t2) $(0)
+::answer
+%mov(a0, t2)
+%ret()
+%endscope
+
+# is_alpha(c=a0) -> 0 or 1
+#
+# Leaf; no frame. Returns 1 when c is 'a'..'z' or 'A'..'Z'. Each range
+# is tested with one %bltu on (c - first) against 26. The answer is
+# built in t2 and moved to a0 at the end.
+:is_alpha
+%scope is_alpha
+%li(t1) $(26)
+%addi(t0, a0, -97)
+%li(t2) $(1)
+%la_br() &::answer
+%bltu(t0, t1)
+%addi(t0, a0, -65)
+%la_br() &::answer
+%bltu(t0, t1)
+%li(t2) $(0)
+::answer
+%mov(a0, t2)
+%ret()
+%endscope
+
+# is_alnum(c=a0) -> 0 or 1
+#
+# Leaf; no frame. Returns 1 when c is '0'..'9', 'a'..'z' or 'A'..'Z'.
+# Each range is tested with one %bltu on (c - first) against the range
+# width. The answer is built in t2 and moved to a0 at the end.
+:is_alnum
+%scope is_alnum
+%addi(t0, a0, -48)
+%li(t1) $(10)
+%li(t2) $(1)
+%la_br() &::answer
+%bltu(t0, t1)
+# both letter ranges are 26 wide, so t1 = 26 serves the next two tests
+%li(t1) $(26)
+%addi(t0, a0, -97)
+%la_br() &::answer
+%bltu(t0, t1)
+%addi(t0, a0, -65)
+%la_br() &::answer
+%bltu(t0, t1)
+%li(t2) $(0)
+::answer
+%mov(a0, t2)
+%ret()
+%endscope
+
+# =========================================================================
+# Raw syscall wrappers
+# =========================================================================
+#
+# Each wrapper shifts arguments into the syscall convention
+# (a0 = number, a1..a3/t0/s0/s1 = args 0..5), emits SYSCALL, and returns
+# the raw kernel result. Syscall clobbers only a0, so t0/s0/s1 do not
+# need saving.
+
+# sys_read(fd=a0, buf=a1, len=a2) -> n (a0)
+#
+# Leaf wrapper. Slides fd/buf/len up into a1/a2/a3, highest register
+# first so no argument is overwritten before it has been moved, then
+# places the syscall number from %p1_sys_read() in a0 and returns
+# whatever SYSCALL leaves in a0.
+:sys_read
+%mov(a3, a2)
+%mov(a2, a1)
+%mov(a1, a0)
+%li(a0) %p1_sys_read()
+%syscall()
+%ret()
+
+# sys_write(fd=a0, buf=a1, len=a2) -> n (a0)
+#
+# Leaf wrapper. Slides fd/buf/len up into a1/a2/a3, highest register
+# first so no argument is overwritten before it has been moved, then
+# places the syscall number from %p1_sys_write() in a0 and returns
+# whatever SYSCALL leaves in a0. A single call may write fewer than len
+# bytes; callers in this file loop on the result.
+:sys_write
+%mov(a3, a2)
+%mov(a2, a1)
+%mov(a1, a0)
+%li(a0) %p1_sys_write()
+%syscall()
+%ret()
+
+# sys_open(path=a0, flags=a1, mode=a2) -> fd (a0)
+# Implemented as openat(AT_FDCWD, path, flags, mode). AT_FDCWD = -100.
+#
+# Leaf wrapper. The four openat arguments go in a1, a2, a3 and t0, so
+# mode is moved to t0 first, then flags and path are slid up before a1
+# is overwritten with AT_FDCWD. Returns whatever SYSCALL leaves in a0.
+# Clobbers t0.
+:sys_open
+%mov(t0, a2)
+%mov(a3, a1)
+%mov(a2, a0)
+%li(a1) $(-100)
+%li(a0) %p1_sys_openat()
+%syscall()
+%ret()
+
+# sys_close(fd=a0) -> r (a0)
+#
+# Leaf wrapper. Moves fd into a1, places the syscall number from
+# %p1_sys_close() in a0 and returns whatever SYSCALL leaves in a0.
+:sys_close
+%mov(a1, a0)
+%li(a0) %p1_sys_close()
+%syscall()
+%ret()
+
+# sys_exit(code=a0) -> never returns
+#
+# Moves code into a1 and issues the syscall numbered by %p1_sys_exit().
+# There is no %ret(): should the syscall ever come back, execution
+# parks in the ::spin self-branch instead of running into whatever
+# follows in memory.
+:sys_exit
+%scope sys_exit
+%mov(a1, a0)
+%li(a0) %p1_sys_exit()
+%syscall()
+::spin
+%la_br() &::spin
+%b()
+%endscope
+
+# =========================================================================
+# Print helpers
+# =========================================================================
+#
+# print(buf, len) and eprint(buf, len) loop on sys_write until all bytes
+# are written or the kernel reports an error. All other print helpers
+# compose on top of those two.
+
+# print(buf=a0, len=a1) -> 0, or the negative sys_write result on error
+#
+# Writes len bytes from buf to fd 1. The retry-on-partial-write loop is
+# the shared libp1pp__write_all helper rather than a private copy of it,
+# so print only rearranges (buf, len) into (fd, buf, len). len is moved
+# first because a1 is about to be overwritten with buf. No callee-saved
+# register is touched, hence the zero-byte frame.
+%fn(print, 0, {
+%mov(a2, a1)
+%mov(a1, a0)
+%li(a0) $(1)
+%la_br() &libp1pp__write_all
+%call()
+})
+
+# eprint(buf=a0, len=a1) -> 0, or the negative sys_write result on error
+#
+# Writes len bytes from buf to fd 2. The retry-on-partial-write loop is
+# the shared libp1pp__write_all helper rather than a private copy of it,
+# so eprint only rearranges (buf, len) into (fd, buf, len). len is moved
+# first because a1 is about to be overwritten with buf. No callee-saved
+# register is touched, hence the zero-byte frame.
+%fn(eprint, 0, {
+%mov(a2, a1)
+%mov(a1, a0)
+%li(a0) $(2)
+%la_br() &libp1pp__write_all
+%call()
+})
+
+# println(buf=a0, len=a1) -> result of the last print call made
+#
+# Passes its arguments straight to print. If that result is negative it
+# is returned as-is and the newline is skipped; otherwise the single
+# byte at libp1pp__newline is printed and that call's result is
+# returned. s0 carries the status across the second call and is saved
+# in frame slot 0; the second 8 bytes of the 16-byte frame are unused.
+%fn(println, 16, {
+%st(s0, sp, 0)
+
+%la_br() &print
+%call()
+%mov(s0, a0)
+%la_br() &::done
+%bltz(s0)
+
+%la(a0) &libp1pp__newline
+%li(a1) $(1)
+%la_br() &print
+%call()
+%mov(s0, a0)
+
+::done
+%mov(a0, s0)
+%ld(s0, sp, 0)
+})
+
+# eprintln(buf=a0, len=a1) -> result of the last eprint call made
+#
+# Passes its arguments straight to eprint. If that result is negative it
+# is returned as-is and the newline is skipped; otherwise the single
+# byte at libp1pp__newline is written with eprint and that call's
+# result is returned. s0 carries the status across the second call and
+# is saved in frame slot 0; the second 8 bytes of the frame are unused.
+%fn(eprintln, 16, {
+%st(s0, sp, 0)
+
+%la_br() &eprint
+%call()
+%mov(s0, a0)
+%la_br() &::done
+%bltz(s0)
+
+%la(a0) &libp1pp__newline
+%li(a1) $(1)
+%la_br() &eprint
+%call()
+%mov(s0, a0)
+
+::done
+%mov(a0, s0)
+%ld(s0, sp, 0)
+})
+
+# print_cstr(cstr=a0) -> print's result (a0)
+#
+# Measures the zero-terminated string with strlen, then calls
+# print(cstr, length). s0 holds the pointer across the strlen call and
+# is saved in frame slot 0.
+%fn(print_cstr, 16, {
+%st(s0, sp, 0)
+%mov(s0, a0)
+%la_br() &strlen
+%call()
+%mov(a1, a0)
+%mov(a0, s0)
+%la_br() &print
+%call()
+%ld(s0, sp, 0)
+})
+
+# eprint_cstr(cstr=a0) -> eprint's result (a0)
+#
+# Measures the zero-terminated string with strlen, then calls
+# eprint(cstr, length). s0 holds the pointer across the strlen call and
+# is saved in frame slot 0.
+%fn(eprint_cstr, 16, {
+%st(s0, sp, 0)
+%mov(s0, a0)
+%la_br() &strlen
+%call()
+%mov(a1, a0)
+%mov(a0, s0)
+%la_br() &eprint
+%call()
+%ld(s0, sp, 0)
+})
+
+# print_int(value=a0) -> print's result (a0)
+#
+# Formats value with fmt_dec into the global scratch buffer
+# libp1pp__num_buf, then prints the bytes fmt_dec reported. The buffer
+# is a single global, so each call overwrites what the previous
+# print_int / print_hex call left there.
+%fn(print_int, 0, {
+%mov(a1, a0)
+%la(a0) &libp1pp__num_buf
+%la_br() &fmt_dec
+%call()
+%mov(a1, a0)
+%la(a0) &libp1pp__num_buf
+%la_br() &print
+%call()
+})
+
+# print_hex(value=a0) -> print's result (a0)
+#
+# Formats value with fmt_hex into the global scratch buffer
+# libp1pp__num_buf, then prints the bytes fmt_hex reported. The buffer
+# is a single global, so each call overwrites what the previous
+# print_int / print_hex call left there.
+%fn(print_hex, 0, {
+%mov(a1, a0)
+%la(a0) &libp1pp__num_buf
+%la_br() &fmt_hex
+%call()
+%mov(a1, a0)
+%la(a0) &libp1pp__num_buf
+%la_br() &print
+%call()
+})
+
+# =========================================================================
+# File helpers
+# =========================================================================
+
+# read_file(path=a0, buf=a1, cap=a2) -> n or -1
+#
+# Opens path via sys_open with flags = 0 and mode = 0 (flags 0 is
+# O_RDONLY on Linux), issues ONE sys_read of up to cap bytes into buf,
+# closes the descriptor, and returns the byte count from that read.
+# Returns -1 when the open or the read reports a negative result. The
+# result of sys_close is ignored. Because only a single read is made, a
+# short read is returned as-is; the file is not read to EOF in a loop.
+#
+# Callee-saved use: s1 = buf, s2 = cap, s3 = fd, s0 = read result (kept
+# across the close call). All four are saved in the 32-byte frame.
+%fn(read_file, 32, {
+%st(s0, sp, 0)
+%st(s1, sp, 8)
+%st(s2, sp, 16)
+%st(s3, sp, 24)
+
+%mov(s1, a1)
+%mov(s2, a2)
+
+%li(a1) $(0)
+%li(a2) $(0)
+%la_br() &sys_open
+%call()
+%la_br() &::open_fail
+%bltz(a0)
+%mov(s3, a0)
+
+%mov(a0, s3)
+%mov(a1, s1)
+%mov(a2, s2)
+%la_br() &sys_read
+%call()
+%mov(s0, a0)
+
+%mov(a0, s3)
+%la_br() &sys_close
+%call()
+
+%mov(a0, s0)
+%la_br() &::read_fail
+%bltz(a0)
+%la_br() &::done
+%b()
+
+::read_fail
+%li(a0) $(-1)
+%la_br() &::done
+%b()
+
+::open_fail
+%li(a0) $(-1)
+
+::done
+%ld(s0, sp, 0)
+%ld(s1, sp, 8)
+%ld(s2, sp, 16)
+%ld(s3, sp, 24)
+})
+
+# libp1pp__write_all(fd=a0, buf=a1, len=a2) -> 0 or <0 on error
+#
+# Loop on sys_write until all bytes are written. This is the same loop
+# print / eprint perform for fds 1 and 2, and write_file calls it
+# directly. Retries partial writes but returns the first negative
+# kernel return unchanged.
+#
+# Callee-saved use: s0 = fd, s1 = next byte to write, s2 = bytes left.
+# After each non-negative result r, s1 advances by r and s2 shrinks by
+# r. A result of 0 while s2 != 0 re-enters the loop with unchanged
+# state, so this routine only terminates if sys_write eventually makes
+# progress or reports an error.
+%fn(libp1pp__write_all, 24, {
+%st(s0, sp, 0)
+%st(s1, sp, 8)
+%st(s2, sp, 16)
+
+%mov(s0, a0)
+%mov(s1, a1)
+%mov(s2, a2)
+
+::loop
+%la_br() &::done_ok
+%beqz(s2)
+%mov(a0, s0)
+%mov(a1, s1)
+%mov(a2, s2)
+%la_br() &sys_write
+%call()
+%la_br() &::done
+%bltz(a0)
+%add(s1, s1, a0)
+%sub(s2, s2, a0)
+%la_br() &::loop
+%b()
+
+::done_ok
+%li(a0) $(0)
+::done
+%ld(s0, sp, 0)
+%ld(s1, sp, 8)
+%ld(s2, sp, 16)
+})
+
+# write_file(path=a0, buf=a1, len=a2) -> 0 or -1
+#
+# Opens (creating or truncating) path, writes len bytes from buf with
+# libp1pp__write_all, closes the descriptor, and returns 0 on success.
+# Returns -1 when the open fails or when the write loop reports a
+# negative result; the specific kernel error is not passed on. The
+# result of sys_close is ignored.
+#
+# Flags: O_WRONLY|O_CREAT|O_TRUNC. On Linux these are 0x1 | 0x40 |
+# 0x200 = 0x241. Mode 0644 octal = 0x1A4.
+#
+# Callee-saved use: s0 = buf until the write, then the write status
+# (kept across the close call); s1 = len; s2 = fd.
+%fn(write_file, 24, {
+%st(s0, sp, 0)
+%st(s1, sp, 8)
+%st(s2, sp, 16)
+
+%mov(s0, a1)
+%mov(s1, a2)
+
+%li(a1) $(0x241)
+%li(a2) $(0x1A4)
+%la_br() &sys_open
+%call()
+%la_br() &::open_fail
+%bltz(a0)
+%mov(s2, a0)
+
+%mov(a0, s2)
+%mov(a1, s0)
+%mov(a2, s1)
+%la_br() &libp1pp__write_all
+%call()
+
+%mov(s0, a0)
+%mov(a0, s2)
+%la_br() &sys_close
+%call()
+
+%mov(a0, s0)
+%la_br() &::fail_ret
+%bltz(a0)
+%li(a0) $(0)
+%la_br() &::done
+%b()
+
+::fail_ret
+%li(a0) $(-1)
+%la_br() &::done
+%b()
+
+::open_fail
+%li(a0) $(-1)
+
+::done
+%ld(s0, sp, 0)
+%ld(s1, sp, 8)
+%ld(s2, sp, 16)
+})
+
+# =========================================================================
+# Bump allocator
+# =========================================================================
+#
+# Single global arena, bytes carved by monotonic cursor with 8-byte
+# alignment. bump_alloc returns 0 when the request would overflow.
+
+# bump_init(base=a0, cap=a1) -> 0
+#
+# Leaf. Installs the arena: records base in libp1pp__bump_base, sets
+# libp1pp__bump_cursor to base (an empty arena), and records cap in
+# libp1pp__bump_cap. base is stored as given; nothing here aligns it.
+# Clobbers t0.
+:bump_init
+%la(t0) &libp1pp__bump_base
+%st(a0, t0, 0)
+%la(t0) &libp1pp__bump_cursor
+%st(a0, t0, 0)
+%la(t0) &libp1pp__bump_cap
+%st(a1, t0, 0)
+%li(a0) $(0)
+%ret()
+
+# bump_alloc(n=a0) -> ptr (0 on exhaustion)
+#
+# Round n up to a multiple of 8, then admit iff the rounded size fits in
+# the space left between the cursor and base + cap. On success, advance
+# the cursor and return the pre-advance value; on failure, leave the
+# cursor untouched and return 0.
+#
+# The fit test is written as `rounded <= limit - cursor` instead of
+# `cursor + rounded <= limit`, and the rounding itself is checked, so a
+# huge n cannot wrap around 2^64 and be admitted as a tiny (or empty)
+# allocation.
+#
+# Leaf. Registers: t0 = rounded size, t1 = cursor, a1 = address of the
+# cursor slot, a3 = limit (base + cap), a2 / t2 = scratch.
+:bump_alloc
+%scope bump_alloc
+# t0 = (n + 7) & -8
+%addi(t0, a0, 7)
+%li(t1) $(-8)
+%and(t0, t0, t1)
+# rounded < n can only happen when n + 7 wrapped past 2^64
+%la_br() &::fail
+%bltu(t0, a0)
+# a3 = limit = base + cap
+%la(a1) &libp1pp__bump_base
+%ld(a2, a1, 0)
+%la(a1) &libp1pp__bump_cap
+%ld(a3, a1, 0)
+%add(a3, a2, a3)
+# t1 = cursor; a1 keeps the slot address for the store below
+%la(a1) &libp1pp__bump_cursor
+%ld(t1, a1, 0)
+# a cursor already past the limit leaves nothing to hand out, and
+# ruling it out keeps the subtraction below from wrapping
+%la_br() &::fail
+%bltu(a3, t1)
+# a2 = bytes remaining; reject when the request is larger
+%sub(a2, a3, t1)
+%la_br() &::fail
+%bltu(a2, t0)
+%add(t2, t1, t0)
+%st(t2, a1, 0)
+%mov(a0, t1)
+%ret()
+::fail
+%li(a0) $(0)
+%ret()
+%endscope
+
+# bump_mark() -> saved
+#
+# Leaf. Returns the current value of libp1pp__bump_cursor so it can be
+# handed back to bump_release later. Clobbers t0.
+:bump_mark
+%la(t0) &libp1pp__bump_cursor
+%ld(a0, t0, 0)
+%ret()
+
+# bump_release(saved=a0) -> 0
+#
+# Leaf. Stores saved into libp1pp__bump_cursor, discarding everything
+# allocated since the matching bump_mark. The value is stored without
+# any validation, so it must be one previously returned by bump_mark
+# for the current arena. Clobbers t0.
+:bump_release
+%la(t0) &libp1pp__bump_cursor
+%st(a0, t0, 0)
+%li(a0) $(0)
+%ret()
+
+# bump_reset() -> 0
+#
+# Leaf. Copies libp1pp__bump_base into libp1pp__bump_cursor, emptying
+# the arena while keeping its base and capacity. Clobbers t0 and t1.
+:bump_reset
+%la(t0) &libp1pp__bump_base
+%ld(t1, t0, 0)
+%la(t0) &libp1pp__bump_cursor
+%st(t1, t0, 0)
+%li(a0) $(0)
+%ret()
+
+# =========================================================================
+# Panic
+# =========================================================================
+
+# panic(msg_cstr=a0) -> never returns
+#
+# Writes the zero-terminated message with eprint_cstr, follows it with
+# the newline byte via eprint, then calls sys_exit with code 1. The
+# results of the two writes are not checked. The ::spin self-branch
+# after the sys_exit call is a backstop so control can never reach the
+# %eret() that %fn appends.
+%fn(panic, 0, {
+%la_br() &eprint_cstr
+%call()
+%la(a0) &libp1pp__newline
+%li(a1) $(1)
+%la_br() &eprint
+%call()
+%li(a0) $(1)
+%la_br() &sys_exit
+%call()
+::spin
+%la_br() &::spin
+%b()
+})
+
+# =========================================================================
+# Internal data
+# =========================================================================
+
+# Single newline byte for println / eprintln / panic, each of which
+# passes it with a length of 1. Emitted as an 8-byte word (0x0A in the
+# low byte, zeros above) so the following buffers and the user source
+# that comes after libp1pp stay 8-byte aligned. sys_write reads only the
+# one byte callers request.
+:libp1pp__newline
+$(10)
+
+# Scratch buffer used by print_int / print_hex. fmt_dec writes at most
+# 20 bytes (a '-' plus 19 digits), fmt_hex at most 16 (one per nibble),
+# so 32 bytes with word alignment is comfortably above both. Laid out
+# as four zero words; shared by every print_int / print_hex call.
+:libp1pp__num_buf
+$(0) $(0) $(0) $(0)
+
+# Bump-allocator state. Zero-initialized so bump_alloc returns 0 until
+# bump_init installs an arena.
+#
+# Start of the arena; written by bump_init, read by bump_alloc and
+# bump_reset.
+:libp1pp__bump_base
+$(0)
+# Address of the next free byte; advanced by bump_alloc, read by
+# bump_mark, overwritten by bump_init / bump_release / bump_reset.
+:libp1pp__bump_cursor
+$(0)
+# Arena size in bytes; written by bump_init, read by bump_alloc.
+:libp1pp__bump_cap
+$(0)