commit f0972c4527e533f9195944889ae6cb4440d04a36
parent 8a62b039948dd58bcedd08fe1e1c654900c39154
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 22 Apr 2026 07:05:08 -0700
lisp.M1: fix mark_push_recurse self-loop on aarch64/riscv64; add GC stress tests from LISP-GC.md step-5 set
Bug:
mark_push_recurse was `li_br &mark_value ; call ; ret` with no
prologue. On amd64 this works because native CALL pushes the
return address on the stack. On aarch64/riscv64 native CALL
writes the return address to a link register (LR / ra) instead.
The inner CALL to mark_value therefore clobbered mark_push's
incoming LR with a pointer to the RET right below it; when
mark_value returned, the RET branched to itself and the
interpreter hung.
The bug was latent since nothing triggered mark_push_recurse
before — existing GC tests never grew the mark stack past its
512-entry capacity. Diagnosed with stderr probes at gc() entry
and exit and at mark_push_recurse: a 6th GC fired, emitted one
M, then the counter inside mark_value stopped advancing,
pinpointing a self-loop in the overflow path rather than an
infinite mark cycle.
Fix:
`li_br &mark_value ; b`. mark_value's prologue spills the
incoming LR (set by mark_push's caller), executes, and its
epilogue restores and branches through it, so the eventual RET
goes directly to mark_push's caller. Tail-branching from a
prologue-less leaf is the correct shape.
This is a latent bug in lisp.M1, not P1. P1's contract is: if
you want to make nested CALLs, you need a PROLOGUE to spill LR.
mark_push_recurse violated that contract. The failure mode is
unfortunately silent-success on amd64 and silent-hang on the
other two arches; worth calling out in P1.md as a footgun, and
possibly teachable to lint.sh, as follow-ups.
Tests (from LISP-GC.md §"Remaining work" → stress tests):
- 19-gc-vector-churn.scm: allocates/drops vectors whose sizes
straddle the 128-byte threshold so gc_sweep_obj's
>128-coalesce path actually runs; previously zero coverage.
- 21-gc-mixed.scm: interleaves pair/vector/string/closure
allocations so pair and obj GC cycles overlap.
- 19-gc-closure-churn.scm → 20-gc-closure-churn.scm so the
numbering matches the plan in LISP-GC.md §Implementation
sequence.
- 18-gc-deep-list.scm reworked to force the
mark_push → mark_push_recurse → mark_value path. LISP-GC.md
conjectured a deep list would overflow the mark stack; it
doesn't — mark_value_pair pushes cdr first then car, so LIFO
processes the car immediately and a flat list never exceeds
depth ~2. A live vector with >512 elements does overflow
(mark_value_vector pushes every slot in one burst). Test
uses a 490-element global vector plus pair churn so the
vector is re-marked across several GCs — the recursive path
fires ~10x per GC, which is what exposed the self-loop.
All 28 tests pass on aarch64, amd64, and riscv64.
Diffstat:
10 files changed, 127 insertions(+), 21 deletions(-)
diff --git a/Makefile b/Makefile
@@ -225,7 +225,9 @@ LISP_TESTS := \
tests/lisp/29-innerdef.scm \
tests/lisp/17-gc-cons-churn.scm \
tests/lisp/18-gc-deep-list.scm \
- tests/lisp/19-gc-closure-churn.scm
+ tests/lisp/19-gc-vector-churn.scm \
+ tests/lisp/20-gc-closure-churn.scm \
+ tests/lisp/21-gc-mixed.scm
# Build the prelude-prefixed twin of every fixture and feed that to the
# interpreter; .expected still lives next to the original .scm.
diff --git a/src/lisp.M1 b/src/lisp.M1
@@ -1244,10 +1244,15 @@ DEFINE ZERO32 '0000000000000000000000000000000000000000000000000000000000000000'
addi_r3,r3,8
st_r3,r2,0
ret
+## Tail-branch (not CALL) so mark_value returns directly to mark_push's
+## caller. A CALL+RET here would loop on aarch64/riscv64: their native
+## CALL writes retaddr to LR rather than pushing it, so the inner CALL
+## clobbers this routine's own incoming LR and the final RET branches
+## back to itself. mark_push has no prologue and thus no place to spill
+## LR, so tail-branching is the correct shape.
:mark_push_recurse
li_br &mark_value
- call
- ret
+ b
## ---- mark_value(tagged value) ---------------------------------------
diff --git a/tests/lisp/18-gc-deep-list.expected b/tests/lisp/18-gc-deep-list.expected
@@ -1 +1 @@
-45150
+119805
diff --git a/tests/lisp/18-gc-deep-list.scm b/tests/lisp/18-gc-deep-list.scm
@@ -1,23 +1,61 @@
-;; Deep-list stress: build a list whose total allocations exceed the
-;; arena, then sum it. The mark phase must preserve every live pair
-;; on the partial list across the GC cycles that fire mid-build.
-;; A sweep that drops a live pair would corrupt the chain and either
-;; crash sum or yield a wrong total.
+;; Mark-stack overflow stress: force the
+;; `mark_push → mark_push_recurse → mark_value` tail-branch fallback.
;;
-;; build 300 → 300 user pairs alive at the end; ~8 pair allocs per
-;; iter * 300 = ~2400 raw allocations vs the 32 KB / 16 = 2048-pair
-;; arena, so GC must fire mid-build.
+;; `mark_stack` holds 512 entries (see `:mark_stack` in lisp.M1).
+;; Overflow requires a DFS frontier wider than 512 at some point during
+;; the mark phase. The cleanest trigger is a single live vector with
+;; >512 element slots: `mark_value_vector` pushes each slot in one burst,
+;; so the stack fills in-place and every further push routes through
+;; `mark_push_recurse`.
;;
-;; Sum of 1..300 = 300 * 301 / 2 = 45150.
+;; (LISP-GC.md originally conjectured that a deep *list* would overflow;
+;; inspection of the mark order shows it won't — `mark_value_pair`
+;; pushes cdr first and car second, so the DFS immediately consumes
+;; the car side and leaves at most one leftover cdr per level. A flat
+;; list never exceeds stack depth ~2. A large vector does. Keep the
+;; filename for continuity with the step-5 plan in LISP-GC.md.)
+;;
+;; Bug this test pinned down: on aarch64/riscv64, `mark_push_recurse`
+;; originally did `CALL mark_value ; RET`. Those arches' native CALL
+;; writes the return address to a link register rather than pushing it,
+;; so the inner CALL clobbered the incoming LR and the outer RET
+;; branched to itself. The routine now tail-branches via
+;; `li_br &mark_value ; b`, which lets mark_value return directly to
+;; mark_push's caller. This test exercises that path across all three
+;; arches.
+;;
+;; Plan:
+;; 1. Allocate a 490-slot global vector. The mark stack is 512 entries
+;; and root-walking has contributed a handful of pending pair
+;; children before the vector is popped, so pushing 490 more
+;; elements crosses the threshold mid-vector.
+;; 2. Fill with distinct fixnums so a mis-swept vector corrupts the
+;; checksum.
+;; 3. Churn the pair arena to force repeated GCs while the vector
+;; stays globally reachable — each GC re-marks the vector and
+;; re-hits the overflow path.
+;; 4. Sum the vector. Expected: 0 + 1 + … + 489 = 490 * 489 / 2 = 119805.
+
+(define v (make-vector 490 0))
+
+(define fill
+ (lambda (i)
+ (if (= i 490)
+ 0
+ (begin (vector-set! v i i) (fill (+ i 1))))))
+(fill 0)
-(define build
- (lambda (n acc)
- (if (= n 0) acc
- (build (- n 1) (cons n acc)))))
+(define churn
+ (lambda (n)
+ (if (= n 0)
+ 0
+ (begin (cons n n) (churn (- n 1))))))
+(churn 2000)
(define sum
- (lambda (lst acc)
- (if (null? lst) acc
- (sum (cdr lst) (+ acc (car lst))))))
+ (lambda (i acc)
+ (if (= i 490)
+ acc
+ (sum (+ i 1) (+ acc (vector-ref v i))))))
-(sum (build 300 (quote ())) 0)
+(sum 0 0)
diff --git a/tests/lisp/19-gc-closure-churn.expected b/tests/lisp/19-gc-vector-churn.expected
diff --git a/tests/lisp/19-gc-vector-churn.scm b/tests/lisp/19-gc-vector-churn.scm
@@ -0,0 +1,28 @@
+;; Vector churn: every iteration allocates three short-lived vectors
+;; whose sizes straddle the >128-byte threshold. Sub-128 allocations
+;; get recycled through `free_lists[class_of(sz)]`; anything >128
+;; falls into the coalescing path in `gc_sweep_obj`, which has no
+;; coverage from tests 17/18/20. Varying the sizes ensures
+;; neighbouring dead chunks of different classes land adjacent to
+;; each other so the coalesce merges non-trivially.
+;;
+;; Sizes (8-byte header + 8*N slots):
+;; (make-vector 20 0) = 168 bytes (>128, coalesce path)
+;; (make-vector 40 0) = 328 bytes (>128, coalesce path)
+;; (make-vector 16 0) = 136 bytes (>128, coalesce path)
+;;
+;; 300 iterations × ~632 bytes/iter = ~190 KB of obj churn against
+;; a 32 KiB obj arena — forces roughly 6 GC cycles and exercises
+;; the coalesce-then-rewrite-pseudo-header logic on every one.
+
+(define churn
+ (lambda (n)
+ (if (= n 0)
+ 42
+ (begin
+ (make-vector 20 0)
+ (make-vector 40 0)
+ (make-vector 16 0)
+ (churn (- n 1))))))
+
+(churn 300)
diff --git a/tests/lisp/19-gc-closure-churn.expected b/tests/lisp/20-gc-closure-churn.expected
diff --git a/tests/lisp/19-gc-closure-churn.scm b/tests/lisp/20-gc-closure-churn.scm
diff --git a/tests/lisp/19-gc-closure-churn.expected b/tests/lisp/21-gc-mixed.expected
diff --git a/tests/lisp/21-gc-mixed.scm b/tests/lisp/21-gc-mixed.scm
@@ -0,0 +1,33 @@
+;; Mixed-allocator churn: every iteration allocates one value of each
+;; of the four heap shapes so pair-arena and obj-arena GCs overlap
+;; within a single run.
+;;
+;; (cons n n) — pair arena, 16 bytes
+;; (make-vector 6 n) — obj arena, 56 bytes (sub-128
+;; class 56 free list)
+;; (make-vector 24 n) — obj arena, 200 bytes (>128
+;; coalesce path)
+;; (string-append "x" "y") — obj arena, short string
+;; ((lambda (x) x) n) — obj arena, 32-byte closure
+;; allocated then immediately called
+;; and dropped
+;;
+;; 400 iterations × (1 pair + 4 obj) exhausts both arenas multiple
+;; times, interleaving sweep and free-list recycle across every
+;; size class plus the pair bitmap. A missed root or mis-classified
+;; chunk in either arena would surface as a crash, a wrong checksum,
+;; or a stuck loop.
+
+(define churn
+ (lambda (n)
+ (if (= n 0)
+ 42
+ (begin
+ (cons n n)
+ (make-vector 6 n)
+ (make-vector 24 n)
+ (string-append "x" "y")
+ ((lambda (x) x) n)
+ (churn (- n 1))))))
+
+(churn 400)