commit 71fc4b9777dfcce9ddeb91fda7bf0f95f3059cdf
parent f831ef7fd9e12863ce06e59378f6e668e7c9cc4b
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 30 Apr 2026 09:30:25 -0700
p1pp: expand libp1pp and use from cc.scm
Diffstat:
18 files changed, 979 insertions(+), 119 deletions(-)
diff --git a/P1/P1pp.P1pp b/P1/P1pp.P1pp
@@ -98,6 +98,281 @@
%endm
# =========================================================================
+# Sub-word memory access
+# =========================================================================
+#
+# P1 has only 1-byte (%lb/%sb) and 8-byte (%ld/%st) memory ops, and the
+# 8-byte ops require natural 8-byte alignment. For struct fields and
+# packed data laid out at narrower widths, sub-word access is byte-
+# decomposed: %lb-gather + shli/or for loads, %sb-scatter + shri for
+# stores. These macros encapsulate that pattern so callers do not have
+# to open-code it (and so a backend can later substitute a single
+# native sub-word load/store when alignment is statically known).
+#
+# Conventions:
+# `rd` is the destination (loads); `rs` is the source (stores).
+# Stores preserve `rs`; loads clobber `rd`. `scratch` is a working
+# register distinct from rd/rs and base. Bytes are little-endian:
+# byte 0 (low) at off+0. The signed-load variants (%ld_sh, %ld_sw)
+# sign-extend the gathered value to the canonical 64-bit form.
+#
+# %ld_h(rd, base, off, scratch) — 2-byte zero-extending load
+# %ld_w(rd, base, off, scratch) — 4-byte zero-extending load
+# %ld_sh(rd, base, off, scratch) — 2-byte sign-extending load
+# %ld_sw(rd, base, off, scratch) — 4-byte sign-extending load
+# %st_h(rs, base, off, scratch) — 2-byte store (writes low 16 bits)
+# %st_w(rs, base, off, scratch) — 4-byte store (writes low 32 bits)
+
+%macro ld_h(rd, base, off, scratch)
+ %lb(scratch, base, (+ off 1))   # byte 1 first, while base is still intact
+ %lb(rd, base, off)              # byte 0; last read of base, so rd may be the same register as base
+ %shli(scratch, scratch, 8)      # byte 1 -> bits 8..15
+ %or(rd, rd, scratch)            # rd = byte0 | (byte1 << 8)
+%endm
+
+%macro ld_w(rd, base, off, scratch)
+ %lb(rd, base, off)              # rd = byte 0; rd must not be base, which is re-read three more times below
+ %lb(scratch, base, (+ off 1))
+ %shli(scratch, scratch, 8)      # byte 1 -> bits 8..15
+ %or(rd, rd, scratch)
+ %lb(scratch, base, (+ off 2))
+ %shli(scratch, scratch, 16)     # byte 2 -> bits 16..23
+ %or(rd, rd, scratch)
+ %lb(scratch, base, (+ off 3))
+ %shli(scratch, scratch, 24)     # byte 3 -> bits 24..31
+ %or(rd, rd, scratch)            # rd = little-endian 32-bit value gathered from four bytes
+%endm
+
+%macro ld_sh(rd, base, off, scratch)
+ %ld_h(rd, base, off, scratch)   # rd = 16-bit value gathered from two bytes
+ %shli(rd, rd, 48)               # move bit 15 up to bit 63
+ %sari(rd, rd, 48)               # shift back down, sign-extending to 64 bits
+%endm
+
+%macro ld_sw(rd, base, off, scratch)
+ %ld_w(rd, base, off, scratch)   # rd = 32-bit value gathered from four bytes
+ %shli(rd, rd, 32)               # move bit 31 up to bit 63
+ %sari(rd, rd, 32)               # shift back down, sign-extending to 64 bits
+%endm
+
+%macro st_h(rs, base, off, scratch)
+ %sb(rs, base, off)              # byte 0 <- low byte of rs
+ %shri(scratch, rs, 8)           # shift into scratch so rs itself is preserved
+ %sb(scratch, base, (+ off 1))   # byte 1 <- bits 8..15 of rs
+%endm
+
+%macro st_w(rs, base, off, scratch)
+ %sb(rs, base, off)              # byte 0 <- low byte of rs
+ %shri(scratch, rs, 8)           # each shift starts from rs again, so rs is never modified
+ %sb(scratch, base, (+ off 1))   # byte 1 <- bits 8..15
+ %shri(scratch, rs, 16)
+ %sb(scratch, base, (+ off 2))   # byte 2 <- bits 16..23
+ %shri(scratch, rs, 24)
+ %sb(scratch, base, (+ off 3))   # byte 3 <- bits 24..31
+%endm
+
+# =========================================================================
+# Sign and zero extension
+# =========================================================================
+#
+# %sextN(rd, ra) truncate ra to N bits and sign-extend to 64.
+# %zextN(rd, ra) truncate ra to N bits and zero-extend to 64.
+# %zext32(rd, ra, scratch)
+# like zextN but needs a scratch register because
+# 0xFFFFFFFF does not fit a 16-bit movz immediate
+# (the path %andi takes when materializing the mask).
+#
+# rd may equal ra. The signed forms use shli/sari at the right amount;
+# zext8/zext16 ride on %andi (the mask fits movz so no caller scratch
+# needed); zext32 materializes the mask explicitly.
+
+%macro sext8(rd, ra)
+ %shli(rd, ra, 56)    # 64 - 8: bit 7 of ra becomes bit 63 of rd
+ %sari(rd, rd, 56)    # shift back down, sign-extending from bit 7
+%endm
+
+%macro sext16(rd, ra)
+ %shli(rd, ra, 48)    # 64 - 16: bit 15 of ra becomes bit 63 of rd
+ %sari(rd, rd, 48)    # shift back down, sign-extending from bit 15
+%endm
+
+%macro sext32(rd, ra)
+ %shli(rd, ra, 32)    # 64 - 32: bit 31 of ra becomes bit 63 of rd
+ %sari(rd, rd, 32)    # shift back down, sign-extending from bit 31
+%endm
+
+%macro zext8(rd, ra)
+ %andi(rd, ra, 255)   # 255 = 0xFF: keep bits 0..7, clear the rest
+%endm
+
+%macro zext16(rd, ra)
+ %andi(rd, ra, 65535)   # 65535 = 0xFFFF: keep bits 0..15, clear the rest
+%endm
+
+%macro zext32(rd, ra, scratch)
+ %li(scratch, 4294967295)   # 0xFFFFFFFF mask; written before ra is read, so scratch must not be ra
+ %and(rd, ra, scratch)      # keep bits 0..31, clear the rest
+%endm
+
+# =========================================================================
+# Frame-slot address
+# =========================================================================
+#
+# %lea_slot(rd, slot) rd = address of the frame slot at byte offset
+# `slot`. Centralizes the "%mov(rd, sp) +
+# %addi(rd, rd, slot)" idiom — the backend folds
+# its hidden frame-header offset into %mov(rd, sp),
+# so callers must not bake a literal 16 into the
+# %addi. `slot` may be any M1pp integer expression
+# (a literal byte offset or a %fn__SO-relative
+# slot-expr).
+
+%macro lea_slot(rd, slot)
+ %mov(rd, sp)           # frame base as the backend exposes it through %mov(rd, sp)
+ %addi(rd, rd, slot)    # plus the slot's byte offset within the frame
+%endm
+
+# =========================================================================
+# Pointer scaling
+# =========================================================================
+#
+# %ptr_add(rd, ptr, idx, sz, scratch) rd = ptr + idx*sz
+# %ptr_sub(rd, ptr, idx, sz, scratch) rd = ptr - idx*sz
+# %ptr_diff(rd, p, q, sz, scratch) rd = (p - q) / sz
+#
+# `sz` is an M1pp-time integer constant (the C pointee size). When
+# sz <= 1 the multiply (or divide) collapses out at expansion time.
+#
+# %ptr_add and %ptr_sub clobber `scratch`. %ptr_diff clobbers `scratch`
+# (only when sz >= 2) and computes through `rd`, so callers must not
+# alias `rd` with `p` or `q` in the sz >= 2 path.
+
+# sz <= 1 takes the byte-stride fast path: char* (sz=1) and void*
+# (cc.scm uses sz=-1 for the void pointee, following GCC's byte-arith
+# extension) both want raw idx with no scaling.
+
+%macro ptr_add(rd, ptr, idx, sz, scratch)
+%select((< sz 2),                  # sz <= 1: byte stride, idx is used unscaled
+ %add(rd, ptr, idx),
+ %li(scratch, sz)                  # sz >= 2: scratch is written before ptr/idx are read
+ %mul(scratch, idx, scratch)       # scratch = idx * sz
+ %add(rd, ptr, scratch))           # rd = ptr + idx * sz
+%endm
+
+%macro ptr_sub(rd, ptr, idx, sz, scratch)
+%select((< sz 2),                  # sz <= 1: byte stride, idx is used unscaled
+ %sub(rd, ptr, idx),
+ %li(scratch, sz)                  # sz >= 2: scratch is written before ptr/idx are read
+ %mul(scratch, idx, scratch)       # scratch = idx * sz
+ %sub(rd, ptr, scratch))           # rd = ptr - idx * sz
+%endm
+
+%macro ptr_diff(rd, p, q, sz, scratch)
+%select((< sz 2),                  # sz <= 1: the byte difference is the result
+ %sub(rd, p, q),
+ %sub(rd, p, q)                    # sz >= 2: byte difference first ...
+ %li(scratch, sz)                  # ... scratch must not be rd, which holds that difference
+ %div(rd, rd, scratch))            # ... then divide by the element size
+%endm
+
+# =========================================================================
+# Memcpy-call shorthand
+# =========================================================================
+#
+# %memcpy_call(dst_reg, src_reg, n_imm)
+# Marshal arguments into the libp1pp memcpy ABI and invoke it. Useful
+# for fixed-size memory copies (e.g. struct copy in a code generator)
+# where the size is known at expansion time. a2 then a1 are written
+# before a0, so src_reg must not be a2 and dst_reg must not be a1 or a2.
+
+%macro memcpy_call(dst_reg, src_reg, n_imm)
+ %li(a2, n_imm)        # a2 = byte count; overwrites a2 before either input is read
+ %mov(a1, src_reg)     # a1 = source address; overwrites a1 before dst_reg is read
+ %mov(a0, dst_reg)     # a0 = destination address; written last, so a0 is safe as an input
+ %call(&memcpy)
+%endm
+
+# =========================================================================
+# Compare-and-set-bool macros
+# =========================================================================
+#
+# %cmpset_<cc>(rd, ra[, rb]) rd = (ra <cc> rb) ? 1 : 0
+#
+# Two-operand: eq, ne, lt, ltu (signed/unsigned).
+# Zero-operand (compare against zero): eqz, nez, ltz.
+#
+# Lower to %ifelse_<cc>(...) which itself works across all P1 backends.
+# A backend that supports a native conditional-set instruction can later
+# specialize these to a single op without touching callers.
+
+%macro cmpset_eq(rd, ra, rb)
+ %ifelse_eq(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })   # rd = 1 when ra == rb, else 0
+%endm
+
+%macro cmpset_ne(rd, ra, rb)
+ %ifelse_ne(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })   # rd = 1 when ra != rb, else 0
+%endm
+
+%macro cmpset_lt(rd, ra, rb)
+ %ifelse_lt(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })   # rd = 1 when ra < rb (signed), else 0
+%endm
+
+%macro cmpset_ltu(rd, ra, rb)
+ %ifelse_ltu(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })   # rd = 1 when ra < rb (unsigned), else 0
+%endm
+
+%macro cmpset_eqz(rd, ra)
+ %ifelse_eqz(ra, { %li(rd, 1) }, { %li(rd, 0) })   # rd = 1 when ra == 0, else 0
+%endm
+
+%macro cmpset_nez(rd, ra)
+ %ifelse_nez(ra, { %li(rd, 1) }, { %li(rd, 0) })   # rd = 1 when ra != 0, else 0
+%endm
+
+%macro cmpset_ltz(rd, ra)
+ %ifelse_ltz(ra, { %li(rd, 1) }, { %li(rd, 0) })   # rd = 1 when ra < 0, else 0
+%endm
+
+# =========================================================================
+# Tiny unops
+# =========================================================================
+#
+# %neg(rd, ra, scratch) rd = -ra (scratch holds the zero literal)
+# %bnot(rd, ra, scratch) rd = ~ra (scratch holds the all-ones literal)
+# %bool(rd, ra) rd = (ra != 0) ? 1 : 0 (alias of cmpset_nez)
+
+%macro neg(rd, ra, scratch)
+ %li(scratch, 0)          # written before ra is read, so scratch must not be ra
+ %sub(rd, scratch, ra)    # rd = 0 - ra
+%endm
+
+%macro bnot(rd, ra, scratch)
+ %li(scratch, -1)         # all-ones mask; written before ra is read, so scratch must not be ra
+ %xor(rd, ra, scratch)    # xor with all ones flips every bit of ra
+%endm
+
+%macro bool(rd, ra)
+ %cmpset_nez(rd, ra)   # rd = 1 when ra != 0, else 0
+%endm
+
+# =========================================================================
+# Switch dispatch
+# =========================================================================
+#
+# %switch_case(ctrl, scratch, key, target)
+# If `ctrl == key`, branch to `target`. `scratch` is used to
+# materialize the key as a register operand. `target` is the full
+# branch target (e.g. `&::case_3`).
+#
+# A code generator emitting a switch dispatcher emits one
+# %switch_case per case, then an unconditional branch to the default.
+
+%macro switch_case(ctrl, scratch, key, target)
+ %li(scratch, key)               # materialize the case constant; scratch must not be ctrl
+ %beq(ctrl, scratch, target)     # branch on a match; otherwise continue with the next emitted case
+%endm
+
+# =========================================================================
# Control-flow macros
# =========================================================================
#
diff --git a/cc/cc.scm b/cc/cc.scm
@@ -2673,100 +2673,91 @@
;; Scratch convention: helpers may clobber t1; callers never pass
;; reg=t1.
-(define (%cg-emit-ldN-bytes cg reg base-bv off-expr-fn n-bytes)
- ;; Emit n-bytes %lb gathers into reg with shift+OR. byte 0 is low.
- ;; off-expr-fn is a procedure: (off-expr-fn k) returns the bv
- ;; expression for offset k.
- (%cg-emit-many cg (list "%lb(" (%cg-reg->bv reg) ", " base-bv ", "
- (off-expr-fn 0) ")\n"))
- (let loop ((k 1))
- (cond
- ((= k n-bytes) 0)
- (else
- (%cg-emit-many cg (list
- "%lb(t1, " base-bv ", " (off-expr-fn k) ")\n"
- "%shli(t1, t1, " (%n (* 8 k)) ")\n"
- "%or(" (%cg-reg->bv reg) ", " (%cg-reg->bv reg) ", t1)\n"))
- (loop (+ k 1))))))
-
-(define (%cg-emit-stN-bytes cg reg base-bv off-expr-fn n-bytes)
- ;; Emit n-bytes %sb scatters from reg via shri-shifted t1.
- (%cg-emit-many cg (list "%sb(" (%cg-reg->bv reg) ", " base-bv ", "
- (off-expr-fn 0) ")\n"))
- (let loop ((k 1))
- (cond
- ((= k n-bytes) 0)
- (else
- (%cg-emit-many cg (list
- "%shri(t1, " (%cg-reg->bv reg) ", " (%n (* 8 k)) ")\n"
- "%sb(t1, " base-bv ", " (off-expr-fn k) ")\n"))
- (loop (+ k 1))))))
-
+;; Sub-word loads/stores defer byte-decomposition to libp1pp's
+;; %ld_h / %ld_w / %ld_sh / %ld_sw / %st_h / %st_w macros (see
+;; P1/P1pp.P1pp). cc.scm just emits one macro call per access; the
+;; macro arranges the byte gather/scatter and (for signed loads) folds
+;; in the sign-extend. t1 is the conventional scratch.
+(define (%cg-emit-ld-sub cg reg base-bv off-bv signed? n-bytes)
+  ;; Emit a single libp1pp sub-word load call for a 2- or 4-byte access.
+  ;; signed? picks the sign-extending macro; t1 is handed over as scratch.
+  ;; Any other width is reported through die.
+  (let* ((opener (if (= n-bytes 2)
+                     (if signed? "%ld_sh(" "%ld_h(")
+                     (if (= n-bytes 4)
+                         (if signed? "%ld_sw(" "%ld_w(")
+                         (die #f "cg-emit-ld-sub: bad width" n-bytes))))
+         (dst-bv (%cg-reg->bv reg))
+         (pieces (list opener dst-bv ", " base-bv ", " off-bv ", t1)\n")))
+    (%cg-emit-many cg pieces)))
+
+(define (%cg-emit-st-sub cg reg base-bv off-bv n-bytes)
+  ;; Emit a single libp1pp sub-word store call for a 2- or 4-byte access.
+  ;; t1 is handed over as the macro's scratch register. Any other width
+  ;; is reported through die.
+  (let* ((opener (if (= n-bytes 2)
+                     "%st_h("
+                     (if (= n-bytes 4)
+                         "%st_w("
+                         (die #f "cg-emit-st-sub: bad width" n-bytes))))
+         (src-bv (%cg-reg->bv reg))
+         (pieces (list opener src-bv ", " base-bv ", " off-bv ", t1)\n")))
+    (%cg-emit-many cg pieces)))
+
+;; "address of frame slot" — defers to libp1pp's %lea_slot, which hides
+;; the backend frame-header offset that %mov(rd, sp) folds in.
+(define (%cg-emit-lea-slot cg reg-bv slot-bv)
+  ;; Emit one %lea_slot call: reg-bv receives the address of slot-bv.
+  (let ((pieces (list "%lea_slot(" reg-bv ", " slot-bv ")\n")))
+    (%cg-emit-many cg pieces)))
+
+;; sext8/16/32 emitted via libp1pp's %sext<N>(rd, ra). shift-amount is
+;; kept as the parameter for call-site clarity (callers think in bit
+;; widths via the same 56/48/32 amounts they always have).
(define (%cg-emit-sext cg reg shift-amount)
- (%cg-emit-many cg (list
- "%shli(" (%cg-reg->bv reg) ", " (%cg-reg->bv reg) ", "
- (%n shift-amount) ")\n"
- "%sari(" (%cg-reg->bv reg) ", " (%cg-reg->bv reg) ", "
- (%n shift-amount) ")\n")))
+ (let ((width (cond ((= shift-amount 56) "8")
+ ((= shift-amount 48) "16")
+ ((= shift-amount 32) "32")
+ (else (die #f "cg-emit-sext: bad shift" shift-amount))))
+ (rb (%cg-reg->bv reg)))
+ (%cg-emit-many cg (list "%sext" width "(" rb ", " rb ")\n"))))
(define (%cg-emit-ld-slot-typed cg reg ctype logical-off)
(%cg-fp-reject! 'ld-slot ctype)
(let* ((sz (ctype-size ctype)) (kind (ctype-kind ctype))
- (off-fn (lambda (k) (%cg-slot-expr cg (+ logical-off k)))))
+ (off-bv (%cg-slot-expr cg logical-off)))
(cond
((= sz 1)
(%cg-emit-many cg (list "%lb(" (%cg-reg->bv reg) ", sp, "
- (off-fn 0) ")\n"))
+ off-bv ")\n"))
(cond ((eq? kind 'i8) (%cg-emit-sext cg reg 56))))
- ((= sz 2)
- (%cg-emit-ldN-bytes cg reg "sp" off-fn 2)
- (cond ((eq? kind 'i16) (%cg-emit-sext cg reg 48))))
- ((= sz 4)
- (%cg-emit-ldN-bytes cg reg "sp" off-fn 4)
- (cond ((eq? kind 'i32) (%cg-emit-sext cg reg 32))))
+ ((= sz 2) (%cg-emit-ld-sub cg reg "sp" off-bv (eq? kind 'i16) 2))
+ ((= sz 4) (%cg-emit-ld-sub cg reg "sp" off-bv (eq? kind 'i32) 4))
(else (%cg-emit-ld-slot cg reg logical-off)))))
(define (%cg-emit-st-slot-typed cg reg ctype logical-off)
(%cg-fp-reject! 'st-slot ctype)
(let* ((sz (ctype-size ctype))
- (off-fn (lambda (k) (%cg-slot-expr cg (+ logical-off k)))))
+ (off-bv (%cg-slot-expr cg logical-off)))
(cond
((= sz 1)
(%cg-emit-many cg (list "%sb(" (%cg-reg->bv reg) ", sp, "
- (off-fn 0) ")\n")))
- ((= sz 2) (%cg-emit-stN-bytes cg reg "sp" off-fn 2))
- ((= sz 4) (%cg-emit-stN-bytes cg reg "sp" off-fn 4))
+ off-bv ")\n")))
+ ((= sz 2) (%cg-emit-st-sub cg reg "sp" off-bv 2))
+ ((= sz 4) (%cg-emit-st-sub cg reg "sp" off-bv 4))
(else (%cg-emit-st-slot cg reg logical-off)))))
(define (%cg-emit-ld-typed cg reg ctype base off)
(%cg-fp-reject! 'ld ctype)
(let* ((sz (ctype-size ctype)) (kind (ctype-kind ctype))
(base-bv (%cg-reg->bv base))
- (off-fn (lambda (k) (%n (+ off k)))))
+ (off-bv (%n off)))
(cond
((= sz 1)
(%cg-emit-many cg (list "%lb(" (%cg-reg->bv reg) ", "
- base-bv ", " (off-fn 0) ")\n"))
+ base-bv ", " off-bv ")\n"))
(cond ((eq? kind 'i8) (%cg-emit-sext cg reg 56))))
- ((= sz 2)
- (%cg-emit-ldN-bytes cg reg base-bv off-fn 2)
- (cond ((eq? kind 'i16) (%cg-emit-sext cg reg 48))))
- ((= sz 4)
- (%cg-emit-ldN-bytes cg reg base-bv off-fn 4)
- (cond ((eq? kind 'i32) (%cg-emit-sext cg reg 32))))
+ ((= sz 2) (%cg-emit-ld-sub cg reg base-bv off-bv (eq? kind 'i16) 2))
+ ((= sz 4) (%cg-emit-ld-sub cg reg base-bv off-bv (eq? kind 'i32) 4))
(else (%cg-emit-ld cg reg base off)))))
(define (%cg-emit-st-typed cg reg ctype base off)
(%cg-fp-reject! 'st ctype)
(let* ((sz (ctype-size ctype))
(base-bv (%cg-reg->bv base))
- (off-fn (lambda (k) (%n (+ off k)))))
+ (off-bv (%n off)))
(cond
((= sz 1)
(%cg-emit-many cg (list "%sb(" (%cg-reg->bv reg) ", "
- base-bv ", " (off-fn 0) ")\n")))
- ((= sz 2) (%cg-emit-stN-bytes cg reg base-bv off-fn 2))
- ((= sz 4) (%cg-emit-stN-bytes cg reg base-bv off-fn 4))
+ base-bv ", " off-bv ")\n")))
+ ((= sz 2) (%cg-emit-st-sub cg reg base-bv off-bv 2))
+ ((= sz 4) (%cg-emit-st-sub cg reg base-bv off-bv 4))
(else (%cg-emit-st cg reg base off)))))
(define (%cg-load-opnd-into cg op reg)
@@ -3292,9 +3283,7 @@
;; direct frame lval: address is sp+off.
(($ opnd? (kind frame) (ext ,off))
(guard (not (%cg-indirect? cg off)))
- (%cg-emit-many cg (list "%mov(t0, sp)\n"
- "%addi(t0, t0, "
- (%cg-slot-expr cg off) ")\n"))
+ (%cg-emit-lea-slot cg "t0" (%cg-slot-expr cg off))
(%cg-spill-reg cg 't0 pty))
;; indirect frame lval (rare for arrays, but support it):
;; the slot holds the address already.
@@ -3327,9 +3316,7 @@
(guard (%cg-indirect? cg off))
(%cg-emit-ld-slot cg reg off))
(($ opnd? (kind frame) (ext ,off))
- (%cg-emit-many cg (list "%mov(" reg-bv ", sp)\n"
- "%addi(" reg-bv ", " reg-bv ", "
- (%cg-slot-expr cg off) ")\n")))
+ (%cg-emit-lea-slot cg reg-bv (%cg-slot-expr cg off)))
(($ opnd? (kind global) (ext ,lbl))
(%cg-emit-la cg reg lbl))
(else (die #f "cg-emit-addr-of: unsupported lval kind"
@@ -3351,20 +3338,18 @@
(%cg-emit-addr-of cg dst 't2)
(%cg-emit-byte-copy cg 't2 't0 't1 sz)))
-;; Per-byte struct copy. dst-reg and src-reg hold addresses; emits
-;; size byte-load/byte-store pairs using tmp-reg as the byte staging
-;; register. All three regs are assumed caller-saved temporaries.
+;; Struct copy: defer to libp1pp memcpy via %memcpy_call. dst-reg and
+;; src-reg hold the addresses; size is the byte count. tmp-reg is no
+;; longer needed by this helper (kept in the signature so existing
+;; callers don't have to thread their scratch allocation differently),
+;; but the macro itself uses a0/a1/a2 around the call. src-reg must not
+;; be a2 and dst-reg must not be a1 or a2 (the macro overwrites those
+;; before reading its inputs); both current callers use t-regs.
(define (%cg-emit-byte-copy cg dst-reg src-reg tmp-reg size)
- (let ((dr (%cg-reg->bv dst-reg))
- (sr (%cg-reg->bv src-reg))
- (tr (%cg-reg->bv tmp-reg)))
- (let loop ((k 0))
- (cond
- ((>= k size) #t)
- (else
- (%cg-emit-many cg (list "%lb(" tr ", " sr ", " (%n k) ")\n"
- "%sb(" tr ", " dr ", " (%n k) ")\n"))
- (loop (+ k 1)))))))
+ (%cg-emit-many cg (list "%memcpy_call("
+ (%cg-reg->bv dst-reg) ", "
+ (%cg-reg->bv src-reg) ", "
+ (%n size) ")\n")))
(define (cg-take-addr cg)
(let* ((p (cg-pop cg))
@@ -3381,12 +3366,10 @@
(guard (%cg-indirect? cg off))
(%cg-emit-ld-slot cg 't0 off)
(%cg-spill-reg cg 't0 pty))
- ;; %mov(rd, sp) gives the portable-sp pointer (the backend
- ;; handles any hidden frame-header offset). Then add slot.
+ ;; %lea_slot wraps the "%mov(rd, sp); %addi(rd, rd, slot)" idiom;
+ ;; the backend hides any frame-header offset inside %mov(rd, sp).
(($ opnd? (kind frame) (ext ,off))
- (%cg-emit-many cg (list "%mov(t0, sp)\n"
- "%addi(t0, t0, "
- (%cg-slot-expr cg off) ")\n"))
+ (%cg-emit-lea-slot cg "t0" (%cg-slot-expr cg off))
(%cg-spill-reg cg 't0 pty))
(($ opnd? (kind global) (ext ,lbl))
(%cg-emit-la cg 't0 lbl)
@@ -3426,8 +3409,7 @@
(cond
((eq? to-kind 'bool)
(%cg-load-opnd-into cg p 't0)
- (%cg-emit-many cg (list
- "%ifelse_eqz(t0, { %li(t0, 0) }, { %li(t0, 1) })\n"))
+ (%cg-emit-many cg (list "%bool(t0, t0)\n"))
(%cg-spill-reg cg 't0 to-type))
((or (eq? to-kind 'ptr)
(and (or (eq? to-kind 'i64) (eq? to-kind 'u64))
@@ -3462,11 +3444,9 @@
((eq? to-kind 'i8) (%cg-emit-sext cg 't0 56))
((eq? to-kind 'i16) (%cg-emit-sext cg 't0 48))
((eq? to-kind 'i32) (%cg-emit-sext cg 't0 32))
- ((= to-sz 1) (%cg-emit-many cg (list "%andi(t0, t0, 255)\n")))
- ((= to-sz 2)
- (%cg-emit-many cg (list "%li(t1, 65535)\n%and(t0, t0, t1)\n")))
- ((= to-sz 4)
- (%cg-emit-many cg (list "%li(t1, 4294967295)\n%and(t0, t0, t1)\n")))
+ ((= to-sz 1) (%cg-emit-many cg (list "%zext8(t0, t0)\n")))
+ ((= to-sz 2) (%cg-emit-many cg (list "%zext16(t0, t0)\n")))
+ ((= to-sz 4) (%cg-emit-many cg (list "%zext32(t0, t0, t1)\n")))
(else 0))
(%cg-spill-reg cg 't0 to-type)))))
@@ -3520,10 +3500,10 @@
(%cg-reg->bv ra) ", " (%cg-reg->bv rb) ")\n")))
(define (%cg-emit-cmp cg cc ra rb rd)
- (%cg-emit-many cg (list "%ifelse_" cc "("
+ (%cg-emit-many cg (list "%cmpset_" cc "("
+ (%cg-reg->bv rd) ", "
(%cg-reg->bv ra) ", " (%cg-reg->bv rb)
- ", { %li(" (%cg-reg->bv rd) ", 1) }, "
- "{ %li(" (%cg-reg->bv rd) ", 0) })\n")))
+ ")\n")))
(define (cg-binop cg op)
(let* ((b (cg-pop cg))
@@ -3546,29 +3526,21 @@
((and a-ptr? (or (eq? op 'add) (eq? op 'sub)) (not b-ptr?))
(%cg-load-opnd-into cg a 'a0)
(%cg-load-opnd-into cg b 'a1)
- (let ((sz (%ctype-size (%ctype-pointee ta))))
- (cond ((> sz 1) (%cg-emit-many cg (list "%li(t0, " (%n sz) ")\n"))
- (%cg-emit-rrr cg "mul" 'a1 'a1 't0))
- (else 0)))
- (%cg-emit-rrr cg (if (eq? op 'add) "add" "sub") 't0 'a0 'a1)
+ (let ((sz (%ctype-size (%ctype-pointee ta)))
+ (mac (if (eq? op 'add) "%ptr_add(" "%ptr_sub(")))
+ (%cg-emit-many cg (list mac "t0, a0, a1, " (%n sz) ", t1)\n")))
(%cg-spill-reg cg 't0 result-ty))
((and b-ptr? (eq? op 'add) (not a-ptr?))
(%cg-load-opnd-into cg a 'a0)
(%cg-load-opnd-into cg b 'a1)
(let ((sz (%ctype-size (%ctype-pointee tb))))
- (cond ((> sz 1) (%cg-emit-many cg (list "%li(t0, " (%n sz) ")\n"))
- (%cg-emit-rrr cg "mul" 'a0 'a0 't0))
- (else 0)))
- (%cg-emit-rrr cg "add" 't0 'a0 'a1)
+ (%cg-emit-many cg (list "%ptr_add(t0, a1, a0, " (%n sz) ", t1)\n")))
(%cg-spill-reg cg 't0 result-ty))
((and a-ptr? b-ptr? (eq? op 'sub))
(%cg-load-opnd-into cg a 'a0)
(%cg-load-opnd-into cg b 'a1)
- (%cg-emit-rrr cg "sub" 't0 'a0 'a1)
(let ((sz (%ctype-size (%ctype-pointee ta))))
- (cond ((> sz 1) (%cg-emit-many cg (list "%li(t1, " (%n sz) ")\n"))
- (%cg-emit-rrr cg "div" 't0 't0 't1))
- (else 0)))
+ (%cg-emit-many cg (list "%ptr_diff(t0, a0, a1, " (%n sz) ", t1)\n")))
(%cg-spill-reg cg 't0 result-ty))
(else
(%cg-load-opnd-into cg a 'a0)
@@ -3624,13 +3596,13 @@
(%cg-load-opnd-into cg p 't0)
(cond
((eq? op 'neg)
- (%cg-emit-many cg (list "%li(t1, 0)\n%sub(t0, t1, t0)\n"))
+ (%cg-emit-many cg (list "%neg(t0, t0, t1)\n"))
(%cg-spill-reg cg 't0 ty))
((eq? op 'bnot)
- (%cg-emit-many cg (list "%li(t1, -1)\n%xor(t0, t0, t1)\n"))
+ (%cg-emit-many cg (list "%bnot(t0, t0, t1)\n"))
(%cg-spill-reg cg 't0 ty))
((eq? op 'lnot)
- (%cg-emit-many cg (list "%ifelse_eqz(t0, { %li(t0, 1) }, { %li(t0, 0) })\n"))
+ (%cg-emit-many cg (list "%cmpset_eqz(t0, t0)\n"))
(%cg-spill-reg cg 't0 %t-i32))
(else (die #f "cg-unop: unknown op" op)))))
@@ -3705,9 +3677,7 @@
(cond ((> sa 0) (%cg-bump-outgoing! cg sa)) (else 0)))
(cond
(sret?
- (%cg-emit-many cg (list "%mov(a0, sp)\n"
- "%addi(a0, a0, "
- (%cg-slot-expr cg recv-slot) ")\n"))))
+ (%cg-emit-lea-slot cg "a0" (%cg-slot-expr cg recv-slot))))
(cond
((and (eq? (opnd-kind fn-op) 'global) (not (opnd-lval? fn-op)))
(%cg-emit-many cg (list "%call(&" (opnd-ext fn-op) ")\n")))
@@ -3753,9 +3723,7 @@
(sret?
(%cg-emit-ld-slot cg 't2 (%cg-fn-get cg '%fn-sret-slot)))
(else
- (%cg-emit-many cg (list "%mov(t2, sp)\n"
- "%addi(t2, t2, "
- (%cg-slot-expr cg ret-slot) ")\n"))))
+ (%cg-emit-lea-slot cg "t2" (%cg-slot-expr cg ret-slot))))
(%cg-emit-byte-copy cg 't2 't0 't1 sz)
(%cg-emit-many cg (list "%b(&::ret)\n"))))
(else
@@ -3862,9 +3830,7 @@
(cond ((not (opnd-lval? ap-lv))
(die #f "cg-va-start: ap not lvalue")))
;; Compute address into a0.
- (%cg-emit-many cg (list "%mov(a0, sp)\n"
- "%addi(a0, a0, "
- (%cg-slot-expr cg vsl) ")\n"))
+ (%cg-emit-lea-slot cg "a0" (%cg-slot-expr cg vsl))
;; Store a0 at ap-lval.
(cond
((eq? (opnd-kind ap-lv) 'frame)
@@ -3982,8 +3948,8 @@
(%cg-slot-expr cg (swctx-ctrl-slot sw)) ")\n"))
(for-each
(lambda (c)
- (%cg-emit-many cg (list "%li(t1, " (%n (car c)) ")\n"
- "%beq(t0, t1, &::" (cdr c) ")\n")))
+ (%cg-emit-many cg (list "%switch_case(t0, t1, "
+ (%n (car c)) ", &::" (cdr c) ")\n")))
cases)
(cond
(default-lbl (%cg-emit-many cg (list "%b(&::" default-lbl ")\n")))
diff --git a/tests/P1/cmpset.P1pp b/tests/P1/cmpset.P1pp
@@ -0,0 +1,98 @@
+# tests/P1/cmpset.P1pp -- libp1pp compare-and-set-bool macros.
+#
+# %cmpset_eq(rd, ra, rb) rd = (ra == rb) ? 1 : 0
+# %cmpset_ne rd = (ra != rb) ? 1 : 0
+# %cmpset_lt signed less-than
+# %cmpset_ltu unsigned less-than
+# %cmpset_eqz(rd, ra) rd = (ra == 0) ? 1 : 0
+# %cmpset_nez rd = (ra != 0) ? 1 : 0
+# %cmpset_ltz rd = (ra < 0) ? 1 : 0
+#
+# Each subtest checks both the true and false case for a comparator.
+# Output: "EQ NE LT LTU EZ NZ LZ\n" on full pass.
+
+%fn(p1_main, 0, {
+ # ---- eq -------------------------------------------------------------
+ %li(s0, 5) %li(s1, 5)
+ %cmpset_eq(t0, s0, s1)
+ %li(t1, 1) %bne(t0, t1, &::fail)            # true case: 5 == 5
+ %li(s1, 6)
+ %cmpset_eq(t0, s0, s1)
+ %li(t1, 0) %bne(t0, t1, &::fail)            # false case: 5 == 6
+ %la(a0, &c_eq) %li(a1, 3) %call(&print)
+
+ # ---- ne -------------------------------------------------------------
+ %li(s0, 5) %li(s1, 6)
+ %cmpset_ne(t0, s0, s1)
+ %li(t1, 1) %bne(t0, t1, &::fail)            # true case: 5 != 6
+ %li(s1, 5)
+ %cmpset_ne(t0, s0, s1)
+ %li(t1, 0) %bne(t0, t1, &::fail)            # false case: 5 != 5
+ %la(a0, &c_ne) %li(a1, 3) %call(&print)
+
+ # ---- lt -------------------------------------------------------------
+ %li(s0, -3) %li(s1, 2)
+ %cmpset_lt(t0, s0, s1)
+ %li(t1, 1) %bne(t0, t1, &::fail)            # true case: -3 < 2 (signed)
+ %cmpset_lt(t0, s1, s0)
+ %li(t1, 0) %bne(t0, t1, &::fail)            # false case: operands swapped
+ %la(a0, &c_lt) %li(a1, 3) %call(&print)
+
+ # ---- ltu (unsigned: -1 is huge) ------------------------------------
+ %li(s0, 5) %li(s1, -1)
+ %cmpset_ltu(t0, s0, s1)
+ %li(t1, 1) %bne(t0, t1, &::fail)            # true case: 5 < all-ones (unsigned)
+ %cmpset_ltu(t0, s1, s0)
+ %li(t1, 0) %bne(t0, t1, &::fail)            # false case: operands swapped
+ %la(a0, &c_ltu) %li(a1, 4) %call(&print)
+
+ # ---- eqz ------------------------------------------------------------
+ %li(s0, 0)
+ %cmpset_eqz(t0, s0)
+ %li(t1, 1) %bne(t0, t1, &::fail)            # true case: 0 == 0
+ %li(s0, 7)
+ %cmpset_eqz(t0, s0)
+ %li(t1, 0) %bne(t0, t1, &::fail)            # false case: 7 == 0
+ %la(a0, &c_ez) %li(a1, 3) %call(&print)
+
+ # ---- nez ------------------------------------------------------------
+ %li(s0, 7)
+ %cmpset_nez(t0, s0)
+ %li(t1, 1) %bne(t0, t1, &::fail)            # true case: 7 != 0
+ %li(s0, 0)
+ %cmpset_nez(t0, s0)
+ %li(t1, 0) %bne(t0, t1, &::fail)            # false case: 0 != 0
+ %la(a0, &c_nz) %li(a1, 3) %call(&print)
+
+ # ---- ltz ------------------------------------------------------------
+ %li(s0, -1)
+ %cmpset_ltz(t0, s0)
+ %li(t1, 1) %bne(t0, t1, &::fail)            # true case: -1 < 0
+ %li(s0, 0)
+ %cmpset_ltz(t0, s0)
+ %li(t1, 0) %bne(t0, t1, &::fail)            # false case: 0 < 0
+ %la(a0, &c_lz) %li(a1, 2) %call(&print)     # "LZ" is 2 bytes (no trailing space); newline is printed below
+
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 0)                                  # success value left in a0
+ %b(&::done)
+
+ ::fail
+ %la(a0, &c_x) %li(a1, 1) %call(&print)      # any mismatch prints "X" then newline
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 1)                                  # failure value left in a0
+ ::done
+})
+
+:c_eq "EQ "
+:c_ne "NE "
+:c_lt "LT "
+:c_ltu "LTU "
+:c_ez "EZ "
+:c_nz "NZ "
+:c_lz "LZ"
+:c_x "X"
+:c_nl "
+"
+
+:ELF_end
diff --git a/tests/P1/cmpset.expected b/tests/P1/cmpset.expected
@@ -0,0 +1 @@
+EQ NE LT LTU EZ NZ LZ
diff --git a/tests/P1/ext-macros.P1pp b/tests/P1/ext-macros.P1pp
@@ -0,0 +1,123 @@
+# tests/P1/ext-macros.P1pp -- libp1pp sign/zero extension macros.
+#
+# %sext8/16/32(rd, ra) truncate to N bits and sign-extend to 64
+# %zext8(rd, ra) truncate to N bits and zero-extend
+# %zext16(rd, ra) same
+# %zext32(rd, ra, scratch) same; needs scratch since 0xFFFFFFFF > movz
+#
+# Each subtest emits one ASCII byte on success, "X" on mismatch.
+# Expected: "ABCDEFGHIJKL\n".
+
+%fn(p1_main, 0, {
+ # ---- A: sext8 positive (0x7F → 0x7F) -------------------------------
+ %li(t0, 0x7F)
+ %sext8(t0, t0)                              # in-place form: rd == ra
+ %li(t1, 127)                                # expected value
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_a) %li(a1, 1) %call(&print)
+
+ # ---- B: sext8 negative (0x80 → -128) -------------------------------
+ %li(t0, 0x80)
+ %sext8(t0, t0)
+ %li(t1, -128)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_b) %li(a1, 1) %call(&print)
+
+ # ---- C: sext16 positive (0x7FFF → 32767) ---------------------------
+ %li(t0, 0x7FFF)
+ %sext16(t0, t0)
+ %li(t1, 32767)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_c) %li(a1, 1) %call(&print)
+
+ # ---- D: sext16 negative (0x8000 → -32768) --------------------------
+ %li(t0, 0x8000)
+ %sext16(t0, t0)
+ %li(t1, -32768)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_d) %li(a1, 1) %call(&print)
+
+ # ---- E: sext32 positive (0x7FFFFFFF → 2147483647) ------------------
+ %li(t0, 0x7FFFFFFF)
+ %sext32(t0, t0)
+ %li(t1, 2147483647)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_e) %li(a1, 1) %call(&print)
+
+ # ---- F: sext32 negative (0x80000000 → -2147483648) -----------------
+ %li(t0, 0x80000000)
+ %sext32(t0, t0)
+ %li(t1, -2147483648)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_f) %li(a1, 1) %call(&print)
+
+ # ---- G: zext8 (-1 → 0xFF) ------------------------------------------
+ %li(t0, -1)
+ %zext8(t0, t0)
+ %li(t1, 0xFF)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_g) %li(a1, 1) %call(&print)
+
+ # ---- H: zext16 (-1 → 0xFFFF) ---------------------------------------
+ %li(t0, -1)
+ %zext16(t0, t0)
+ %li(t1, 0xFFFF)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_h) %li(a1, 1) %call(&print)
+
+ # ---- I: zext32 (-1 → 0xFFFFFFFF) -----------------------------------
+ %li(t0, -1)
+ %zext32(t0, t0, t1)                         # t1 is the scratch; it is reloaded with the expected value next
+ %li(t1, 0xFFFFFFFF)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_i) %li(a1, 1) %call(&print)
+
+ # ---- J: rd != ra split (sext8) -------------------------------------
+ %li(s0, 0x80)
+ %sext8(t0, s0)                              # separate source and destination registers
+ %li(t1, -128)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_j) %li(a1, 1) %call(&print)
+
+ # ---- K: rd != ra split (zext16) ------------------------------------
+ %li(s0, -1)
+ %zext16(t0, s0)
+ %li(t1, 0xFFFF)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_k) %li(a1, 1) %call(&print)
+
+ # ---- L: rd != ra split (zext32) ------------------------------------
+ %li(s0, -1)
+ %zext32(t0, s0, t1)
+ %li(t1, 0xFFFFFFFF)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_l) %li(a1, 1) %call(&print)
+
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 0)                                  # success value left in a0
+ %b(&::done)
+
+ ::fail
+ %la(a0, &c_x) %li(a1, 1) %call(&print)      # any mismatch prints "X" then newline
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 1)                                  # failure value left in a0
+ ::done
+})
+
+:c_a "A"
+:c_b "B"
+:c_c "C"
+:c_d "D"
+:c_e "E"
+:c_f "F"
+:c_g "G"
+:c_h "H"
+:c_i "I"
+:c_j "J"
+:c_k "K"
+:c_l "L"
+:c_x "X"
+:c_nl "
+"
+
+:ELF_end
diff --git a/tests/P1/ext-macros.expected b/tests/P1/ext-macros.expected
@@ -0,0 +1 @@
+ABCDEFGHIJKL
diff --git a/tests/P1/lea-slot.P1pp b/tests/P1/lea-slot.P1pp
@@ -0,0 +1,47 @@
+# tests/P1/lea-slot.P1pp -- exercise libp1pp %lea_slot.
+#
+# %lea_slot(rd, slot_expr) rd = address of frame slot at slot_expr.
+#
+# Equivalent to %mov(rd, sp) + %addi(rd, rd, slot_expr) — centralizes
+# the hidden 16-byte frame header that the backend folds into %mov(rd, sp).
+#
+# Verification: store via sp-relative %st, read via address from
+# %lea_slot, expect equality. Then write via address, read via sp,
+# again expect equality. Two slots so we also verify slot offset != 0.
+# Output: "AB\n".
+
+%fn(p1_main, 16, {
+ # ---- A: write @sp+0, read via lea_slot ------------------------------
+ %li(t0, 0xCAFEBABE)
+ %st(t0, sp, 0)                              # store through sp-relative addressing
+ %lea_slot(s0, 0)                            # s0 = address of slot 0
+ %ld(t1, s0, 0)                              # read the same slot back through the computed address
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_a) %li(a1, 1) %call(&print)
+
+ # ---- B: write via lea_slot @offset 8, read via sp+8 -----------------
+ %lea_slot(s0, 8)                            # s0 = address of the second 8-byte slot
+ %li(t0, 0xDEADBEEF)
+ %st(t0, s0, 0)                              # store through the computed address
+ %ld(t1, sp, 8)                              # read back through sp-relative addressing
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_b) %li(a1, 1) %call(&print)
+
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 0)                                  # success value left in a0
+ %b(&::done)
+
+ ::fail
+ %la(a0, &c_x) %li(a1, 1) %call(&print)      # any mismatch prints "X" then newline
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 1)                                  # failure value left in a0
+ ::done
+})
+
+:c_a "A"
+:c_b "B"
+:c_x "X"
+:c_nl "
+"
+
+:ELF_end
diff --git a/tests/P1/lea-slot.expected b/tests/P1/lea-slot.expected
@@ -0,0 +1 @@
+AB
diff --git a/tests/P1/memcpy-call.P1pp b/tests/P1/memcpy-call.P1pp
@@ -0,0 +1,53 @@
+# tests/P1/memcpy-call.P1pp -- libp1pp %memcpy_call macro.
+#
+# %memcpy_call(dst_reg, src_reg, n_imm)
+# Convenience wrapper around libp1pp's memcpy: marshals dst/src into
+# a0/a1, sets a2=n_imm, and invokes %call(&memcpy). a2 and a1 are
+# written before a0, so src_reg must not be a2 and dst_reg must not
+# be a1 or a2.
+#
+# Verification: copy a 13-byte source buffer into a destination
+# buffer and byte-compare. Output: "OK\n" on pass.
+
+%fn(p1_main, 0, {
+ %la(s0, &dst)
+ %la(s1, &src)
+ %memcpy_call(s0, s1, 13)                    # copy 13 bytes from src to dst
+
+ # Verify dst[0..13] == src[0..13]
+ %li(t2, 0)                                  # t2 = byte index
+ %loop_tag(L0, {
+ %li(t1, 13)                                 # loop bound; reloaded each pass because t1 is reused below
+ %if_eq(t2, t1, { %break(L0) })
+ %la(s0, &dst)
+ %add(s0, s0, t2)
+ %lb(t0, s0, 0)                              # t0 = dst[t2]
+ %la(s1, &src)
+ %add(s1, s1, t2)
+ %lb(t1, s1, 0)                              # t1 = src[t2]
+ %bne(t0, t1, &::fail)
+ %addi(t2, t2, 1)
+ })
+
+ %la(a0, &c_ok) %li(a1, 3) %call(&print)     # "OK" plus newline = 3 bytes
+ %li(a0, 0)                                  # success value left in a0
+ %b(&::done)
+
+ ::fail
+ %la(a0, &c_fail) %li(a1, 5) %call(&print)   # "FAIL" plus newline = 5 bytes
+ %li(a0, 1)                                  # failure value left in a0
+ ::done
+})
+
+:src
+"Hello, World!"
+:dst
+"............."
+:c_ok
+"OK
+"
+:c_fail
+"FAIL
+"
+
+:ELF_end
diff --git a/tests/P1/memcpy-call.expected b/tests/P1/memcpy-call.expected
@@ -0,0 +1 @@
+OK
diff --git a/tests/P1/ptr-arith.P1pp b/tests/P1/ptr-arith.P1pp
@@ -0,0 +1,65 @@
+# tests/P1/ptr-arith.P1pp -- libp1pp pointer scaling macros.
+#
+# %ptr_add(rd, ptr, idx, sz, scratch) rd = ptr + idx*sz
+# %ptr_sub(rd, ptr, idx, sz, scratch) rd = ptr - idx*sz
+# %ptr_diff(rd, p, q, sz, scratch) rd = (p - q) / sz (sz constant)
+
+%fn(p1_main, 0, {
+ # ---- A: ptr_add sz=1 (1000 + 7*1 = 1007) ----------------------------
+ %li(s0, 1000) %li(s1, 7)
+ %ptr_add(t0, s0, s1, 1, t1)
+ %li(t2, 1007) %bne(t0, t2, &::fail)
+ %la(a0, &c_a) %li(a1, 1) %call(&print)
+
+ # ---- B: ptr_add sz=4 (1000 + 5*4 = 1020) ----------------------------
+ %li(s0, 1000) %li(s1, 5)
+ %ptr_add(t0, s0, s1, 4, t1)
+ %li(t2, 1020) %bne(t0, t2, &::fail)
+ %la(a0, &c_b) %li(a1, 1) %call(&print)
+
+ # ---- C: ptr_add sz=8 (1000 + 3*8 = 1024) ----------------------------
+ %li(s0, 1000) %li(s1, 3)
+ %ptr_add(t0, s0, s1, 8, t1)
+ %li(t2, 1024) %bne(t0, t2, &::fail)
+ %la(a0, &c_c) %li(a1, 1) %call(&print)
+
+ # ---- D: ptr_sub sz=4 (1000 - 5*4 = 980) -----------------------------
+ %li(s0, 1000) %li(s1, 5)
+ %ptr_sub(t0, s0, s1, 4, t1)
+ %li(t2, 980) %bne(t0, t2, &::fail)
+ %la(a0, &c_d) %li(a1, 1) %call(&print)
+
+ # ---- E: ptr_diff sz=4 ((1020 - 1000) / 4 = 5) -----------------------
+ %li(s0, 1020) %li(s1, 1000)
+ %ptr_diff(t0, s0, s1, 4, t1)
+ %li(t2, 5) %bne(t0, t2, &::fail)
+ %la(a0, &c_e) %li(a1, 1) %call(&print)
+
+ # ---- F: ptr_diff sz=8 ((1024 - 1000) / 8 = 3) -----------------------
+ %li(s0, 1024) %li(s1, 1000)
+ %ptr_diff(t0, s0, s1, 8, t1)
+ %li(t2, 3) %bne(t0, t2, &::fail)
+ %la(a0, &c_f) %li(a1, 1) %call(&print)
+
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 0)
+ %b(&::done)
+
+ ::fail
+ %la(a0, &c_x) %li(a1, 1) %call(&print)
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 1)
+ ::done
+})
+
+:c_a "A"
+:c_b "B"
+:c_c "C"
+:c_d "D"
+:c_e "E"
+:c_f "F"
+:c_x "X"
+:c_nl "
+"
+
+:ELF_end
diff --git a/tests/P1/ptr-arith.expected b/tests/P1/ptr-arith.expected
@@ -0,0 +1 @@
+ABCDEF
diff --git a/tests/P1/sub-word-mem.P1pp b/tests/P1/sub-word-mem.P1pp
@@ -0,0 +1,96 @@
+# tests/P1/sub-word-mem.P1pp -- exercise libp1pp sub-word memory macros.
+#
+# %ld_h(rd, base, off, scratch) — 2-byte zero-extending load
+# %ld_w(rd, base, off, scratch) — 4-byte zero-extending load
+# %ld_sh(rd, base, off, scratch) — 2-byte sign-extending load
+# %ld_sw(rd, base, off, scratch) — 4-byte sign-extending load
+# %st_h(rs, base, off, scratch) — 2-byte store (low 16 bits)
+# %st_w(rs, base, off, scratch) — 4-byte store (low 32 bits)
+#
+# Each subtest writes one ASCII byte to stdout on success, "X" on
+# any mismatch. Expected: "ABCDEF\n".
+
+%fn(p1_main, 0, {
+ # ---- A: %st_h byte order (little-endian): 0xCAFE -> FE, CA ----------
+ %la(s0, &buf)
+ %li(t0, 0xCAFE)
+ %st_h(t0, s0, 0, t1)
+ %lb(t2, s0, 0)
+ %li(t1, 0xFE)
+ %bne(t2, t1, &::fail)
+ %lb(t2, s0, 1)
+ %li(t1, 0xCA)
+ %bne(t2, t1, &::fail)
+ %la(a0, &c_a) %li(a1, 1) %call(&print)
+
+ # ---- B: %st_w byte order: 0xDEADBEEF -> EF, BE, AD, DE --------------
+ %la(s0, &buf)
+ %li(t0, 0xDEADBEEF)
+ %st_w(t0, s0, 0, t1)
+ %lb(t2, s0, 0) %li(t1, 0xEF) %bne(t2, t1, &::fail)
+ %lb(t2, s0, 1) %li(t1, 0xBE) %bne(t2, t1, &::fail)
+ %lb(t2, s0, 2) %li(t1, 0xAD) %bne(t2, t1, &::fail)
+ %lb(t2, s0, 3) %li(t1, 0xDE) %bne(t2, t1, &::fail)
+ %la(a0, &c_b) %li(a1, 1) %call(&print)
+
+ # ---- C: %ld_h round-trip at off 8 (zero-extend; touches buf+8..9) ---
+ %la(s0, &buf)
+ %li(t0, 0xCAFE)
+ %st_h(t0, s0, 8, t1)
+ %ld_h(t0, s0, 8, t1)
+ %li(t1, 0xCAFE)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_c) %li(a1, 1) %call(&print)
+
+ # ---- D: %ld_w round-trip at off 8 (zero-extend); needs buf >= 12 bytes
+ %la(s0, &buf)
+ %li(t0, 0xDEADBEEF)
+ %st_w(t0, s0, 8, t1)
+ %ld_w(t0, s0, 8, t1)
+ %li(t1, 0xDEADBEEF)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_d) %li(a1, 1) %call(&print)
+
+ # ---- E: %ld_sh sign-extend (0x8000 -> -32768) -----------------------
+ %la(s0, &buf)
+ %li(t0, 0x8000)
+ %st_h(t0, s0, 0, t1)
+ %ld_sh(t0, s0, 0, t1)
+ %li(t1, -32768)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_e) %li(a1, 1) %call(&print)
+
+ # ---- F: %ld_sw sign-extend (0x80000000 -> -2147483648) --------------
+ %la(s0, &buf)
+ %li(t0, 0x80000000)
+ %st_w(t0, s0, 0, t1)
+ %ld_sw(t0, s0, 0, t1)
+ %li(t1, -2147483648)
+ %bne(t0, t1, &::fail)
+ %la(a0, &c_f) %li(a1, 1) %call(&print)
+
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 0)
+ %b(&::done)
+
+ ::fail
+ %la(a0, &c_x) %li(a1, 1) %call(&print)
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 1)
+ ::done
+})
+
+:c_a "A"
+:c_b "B"
+:c_c "C"
+:c_d "D"
+:c_e "E"
+:c_f "F"
+:c_x "X"
+:c_nl "
+"
+# buf: subtests C/D access buf+8..buf+11, so reserve four words (>= 16 bytes).
+:buf
+%(0) %(0) %(0) %(0)
+
+:ELF_end
diff --git a/tests/P1/sub-word-mem.expected b/tests/P1/sub-word-mem.expected
@@ -0,0 +1 @@
+ABCDEF
diff --git a/tests/P1/switch-case.P1pp b/tests/P1/switch-case.P1pp
@@ -0,0 +1,56 @@
+# tests/P1/switch-case.P1pp -- libp1pp %switch_case dispatch macro.
+#
+# %switch_case(ctrl, scratch, key, target)
+# if ctrl == key, branch to target. scratch holds the key literal.
+#
+# A small dispatcher: select_n(n) returns 100, 200, or 300 for n==1/2/3,
+# and 999 for the default. Drive it with four calls (n = 1, 2, 3, 99) and
+# verify; the default-case check prints no letter. Output: "ABC\n".
+# -- select_n: key in a0; leaves 100/200/300 in a0 for key 1/2/3, else 999 (t1 = scratch).
+%fn(select_n, 0, {
+ %switch_case(a0, t1, 1, &::case_1)
+ %switch_case(a0, t1, 2, &::case_2)
+ %switch_case(a0, t1, 3, &::case_3)
+ %li(a0, 999)
+ %b(&::done)
+ ::case_1 %li(a0, 100) %b(&::done)
+ ::case_2 %li(a0, 200) %b(&::done)
+ ::case_3 %li(a0, 300)
+ ::done
+})
+# -- p1_main: call select_n with 1, 2, 3 (print A, B, C on match) and 99 (default).
+%fn(p1_main, 0, {
+ %li(a0, 1) %call(&select_n)
+ %li(t0, 100) %bne(a0, t0, &::fail)
+ %la(a0, &c_a) %li(a1, 1) %call(&print)
+
+ %li(a0, 2) %call(&select_n)
+ %li(t0, 200) %bne(a0, t0, &::fail)
+ %la(a0, &c_b) %li(a1, 1) %call(&print)
+
+ %li(a0, 3) %call(&select_n)
+ %li(t0, 300) %bne(a0, t0, &::fail)
+ %la(a0, &c_c) %li(a1, 1) %call(&print)
+ # Default case: 99 matches no key; the result is checked but no letter is printed.
+ %li(a0, 99) %call(&select_n)
+ %li(t0, 999) %bne(a0, t0, &::fail)
+
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 0)
+ %b(&::done)
+
+ ::fail
+ %la(a0, &c_x) %li(a1, 1) %call(&print)
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 1)
+ ::done
+})
+
+:c_a "A"
+:c_b "B"
+:c_c "C"
+:c_x "X"
+:c_nl "
+"
+
+:ELF_end
diff --git a/tests/P1/switch-case.expected b/tests/P1/switch-case.expected
@@ -0,0 +1 @@
+ABC
diff --git a/tests/P1/unops.P1pp b/tests/P1/unops.P1pp
@@ -0,0 +1,73 @@
+# tests/P1/unops.P1pp -- libp1pp unary helpers.
+#
+# %neg(rd, ra, scratch) rd = -ra (uses scratch for the zero literal)
+# %bnot(rd, ra, scratch) rd = ~ra (uses scratch for the all-ones literal)
+# %bool(rd, ra) rd = (ra != 0) ? 1 : 0
+#
+# Output: "ABCDEF\n".
+
+%fn(p1_main, 0, {
+ # ---- A: neg positive (5 -> -5) -------------------------------------
+ %li(s0, 5)
+ %neg(t0, s0, t1)
+ %li(t2, -5)
+ %bne(t0, t2, &::fail)
+ %la(a0, &c_a) %li(a1, 1) %call(&print)
+
+ # ---- B: neg negative (-7 -> 7) -------------------------------------
+ %li(s0, -7)
+ %neg(t0, s0, t1)
+ %li(t2, 7)
+ %bne(t0, t2, &::fail)
+ %la(a0, &c_b) %li(a1, 1) %call(&print)
+
+ # ---- C: bnot 0 -> -1 -----------------------------------------------
+ %li(s0, 0)
+ %bnot(t0, s0, t1)
+ %li(t2, -1)
+ %bne(t0, t2, &::fail)
+ %la(a0, &c_c) %li(a1, 1) %call(&print)
+
+ # ---- D: bnot 0xA5 -> ~0xA5 (= -0xA6 = -166 decimal) -----------------
+ %li(s0, 0xA5)
+ %bnot(t0, s0, t1)
+ %li(t2, -166)
+ %bne(t0, t2, &::fail)
+ %la(a0, &c_d) %li(a1, 1) %call(&print)
+
+ # ---- E: bool zero -> 0 ---------------------------------------------
+ %li(s0, 0)
+ %bool(t0, s0)
+ %li(t2, 0)
+ %bne(t0, t2, &::fail)
+ %la(a0, &c_e) %li(a1, 1) %call(&print)
+
+ # ---- F: bool nonzero -> 1 ------------------------------------------
+ %li(s0, 42)
+ %bool(t0, s0)
+ %li(t2, 1)
+ %bne(t0, t2, &::fail)
+ %la(a0, &c_f) %li(a1, 1) %call(&print)
+
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 0)
+ %b(&::done)
+
+ ::fail
+ %la(a0, &c_x) %li(a1, 1) %call(&print)
+ %la(a0, &c_nl) %li(a1, 1) %call(&print)
+ %li(a0, 1)
+ ::done
+})
+
+:c_a "A"
+:c_b "B"
+:c_c "C"
+:c_d "D"
+:c_e "E"
+:c_f "F"
+:c_x "X"
+:c_nl "
+"
+
+:ELF_end
diff --git a/tests/P1/unops.expected b/tests/P1/unops.expected
@@ -0,0 +1 @@
+ABCDEF