commit a73a78dfa060ba0a69f92bf8e1fd941ac8dc61dd
parent 8e2de46571802726e1a2f0c3d9d7e6d422bb53be
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sun, 3 May 2026 23:50:08 -0700
cc: split aggregate args across two ABI slots on call + receive
cc.scm was treating every parameter as a single 8-byte ABI slot on
both the call and receive sides. AAPCS hands a 9..16-byte aggregate
in two consecutive arg positions (regs or stack words); cc.scm dropped
the second word, so any callee with a struct-by-value param wider than
8 bytes saw its second half clobbered by the next argument or by
uninitialized stack.
Surfaced earlier while debugging the tcc-tcc self-host: tcc.flat.c's
expr_cond hands CType structs by value through helpers like
gen_op / gen_cast. tcc-boot2 happened not to exercise the broken
path on its own (locals only, not params), so it self-hosted, but
any user code with struct-by-value params miscompiled silently.
Receive side (cg-fn-begin/v): allocate a slot sized to the aggregate,
spill ⌈size/8⌉ consecutive ABI positions into successive 8-byte chunks,
and advance idx by that count.
Call side (cg-call): when the arg is an aggregate >8B, stage its
address once and chunk-load 8 bytes at a time into successive arg
regs / stack words; account for the extra ABI slot in the
outgoing-stack-arg footprint.
>16B aggregates would normally pass by reference per AAPCS — left
unimplemented; cc.scm dies cleanly if user code hits it.
Regression locked by tests/cc/337-struct-by-value-arg.
cc 180/0 -> 181/0; tcc-cc 180/1 unchanged at parity with tcc-gcc;
tcc-libc 17/0; all other suites unchanged.
Diffstat:
2 files changed, 118 insertions(+), 11 deletions(-)
diff --git a/cc/cc.scm b/cc/cc.scm
@@ -3015,12 +3015,44 @@
(let* ((p (car ps))
(nm (car p))
(ty (cdr p))
- (off (cg-alloc-slot cg 8 8))
+ ;; AAPCS: 9..16B aggregates ride two consecutive arg
+ ;; positions (regs or stack slots), wider-than-16B
+ ;; aggregates would normally pass by reference — not
+ ;; supported here yet.
+ (n (%cg-param-reg-count ty))
+ (sz (cond ((%cg-param-aggregate? ty)
+ (align-up (ctype-size ty) 8))
+ (else 8)))
+ (al (cond ((%cg-param-aggregate? ty)
+ (max 8 (ctype-align ty)))
+ (else 8)))
+ (off (cg-alloc-slot cg sz al))
(psym (%sym nm 'param #f ty off #t)))
- (spill (+ idx sret-shift) off)
- (walk (cdr ps) (+ idx 1) (cons (cons nm psym) out)
+ (let chunk ((i 0))
+ (cond ((>= i n) 0)
+ (else
+ (spill (+ idx sret-shift i) (+ off (* i 8)))
+ (chunk (+ i 1)))))
+ (walk (cdr ps) (+ idx n) (cons (cons nm psym) out)
(or first-slot off))))))))
+;; ABI positions (arg regs or stack words) taken by one parameter of type
+;; TY: 2 for a 9..16B aggregate, 1 for anything else; >16B aggregates die.
+(define (%cg-param-reg-count ty)
+  ;; Non-aggregates are modelled as size 0 so they land in the 1-slot case.
+  (let ((bytes (cond ((%cg-param-aggregate? ty) (ctype-size ty))
+                     (else 0))))
+    (cond
+      ((< bytes 9) 1)
+      ((< bytes 17) 2)
+      ;; Only an aggregate can get here: by-value >16B is unsupported.
+      (else
+        (die #f "cg: aggregate arg/param >16B not supported" bytes)))))
+
+(define (%cg-param-aggregate? ty)      ; #t iff TY's kind is struct or union
+  (let ((kind (ctype-kind ty)))
+    (cond ((eq? kind 'struct) #t) (else (eq? kind 'union)))))
+
(define (cg-fn-end cg)
;; Drain prologue-buf and fn-buf directly into cg-text via buf-drain!
;; (memcpy, no allocation). Header/footer pieces go through buf-push!
@@ -3779,16 +3811,51 @@
(cond
((null? xs) 0)
(else
- (let ((abi (+ idx sret-shift)))
+ (let* ((arg (car xs))
+ (aty (opnd-type arg))
+ (n (%cg-param-reg-count aty)))
(cond
- ((< abi 4)
- (%cg-load-opnd-into cg (car xs) (%reg-by-idx abi))
- (stage (cdr xs) (+ idx 1)))
+ ;; Aggregate >8B: load both halves into successive arg
+ ;; regs / stack slots. Stage the struct's address in t0
+ ;; once and chunk-load 8 bytes at a time.
+ ((and (%cg-param-aggregate? aty) (> n 1))
+ (%cg-emit-addr-of cg arg 't0)
+ (let chunk ((i 0))
+ (cond
+ ((>= i n) 0)
+ (else
+ (let ((tabi (+ idx sret-shift i)))
+ (cond
+ ((< tabi 4)
+ (%cg-emit-many cg
+ (list "%ld("
+ (%cg-reg->bv (%reg-by-idx tabi))
+ ", t0, " (%n (* i 8)) ")\n")))
+ (else
+ (%cg-emit-many cg
+ (list "%ld(t1, t0, "
+ (%n (* i 8)) ")\n"))
+ (%cg-emit-st cg 't1 'sp (* 8 (- tabi 4))))))
+ (chunk (+ i 1)))))
+ (stage (cdr xs) (+ idx n)))
(else
- (%cg-load-opnd-into cg (car xs) 't0)
- (%cg-emit-st cg 't0 'sp (* 8 (- abi 4)))
- (stage (cdr xs) (+ idx 1))))))))
- (let ((sa (if sret? (max 0 (- arity 3)) (max 0 (- arity 4)))))
+ (let ((abi (+ idx sret-shift)))
+ (cond
+ ((< abi 4)
+ (%cg-load-opnd-into cg arg (%reg-by-idx abi))
+ (stage (cdr xs) (+ idx 1)))
+ (else
+ (%cg-load-opnd-into cg arg 't0)
+ (%cg-emit-st cg 't0 'sp (* 8 (- abi 4)))
+ (stage (cdr xs) (+ idx 1)))))))))))
+ ;; Stack-arg footprint accounts for the extra ABI slot any
+ ;; >8B-aggregate arg consumed beyond its single-position cousin.
+ (let* ((nabi (let count ((xs args) (n sret-shift))
+ (cond ((null? xs) n)
+ (else (count (cdr xs)
+ (+ n (%cg-param-reg-count
+ (opnd-type (car xs)))))))))
+ (sa (max 0 (- nabi 4))))
(cond ((> sa 0) (%cg-bump-outgoing! cg sa)) (else 0)))
(cond
(sret?
diff --git a/tests/cc/337-struct-by-value-arg.c b/tests/cc/337-struct-by-value-arg.c
@@ -0,0 +1,40 @@
+/* Struct-by-value parameter passing. aarch64 AAPCS hands a 9..16-byte
+ * aggregate in two consecutive arg registers (or two stack slots when
+ * the remaining registers cannot hold it); cc.scm has to mirror that on
+ * both the call and receive sides. Until this works, every callee with
+ * a wider-than-8B struct param sees its second word clobbered, which
+ * silently miscompiles tcc.flat.c's CType-passing helpers and any user
+ * code with similar shapes.
+ */
+
+struct Pair { long a; long b; }; /* two longs: wider than 8 bytes wherever long is 8 bytes, i.e. the multi-slot by-value case under test */
+
+static int probe(struct Pair x, struct Pair y, long ea1, long ea2, /* two by-value Pairs first, then the values their fields must hold */
+ long eb1, long eb2)
+{ /* Returns 0 when every field matches, otherwise 1..4 naming the first field that differs. */
+ if (x.a != ea1) return 1; /* first word of x */
+ if (x.b != ea2) return 2; /* second word of x */
+ if (y.a != eb1) return 3; /* first word of y */
+ if (y.b != eb2) return 4; /* second word of y */
+ return 0;
+}
+
+static int probe_after_int(int prefix, struct Pair p, long ea, long eb) /* a scalar precedes the Pair, so the Pair does not start at the first argument position */
+{ /* Returns 0 when all values match, otherwise 5..7 naming the first value that differs. */
+ if (prefix != 99) return 5; /* the scalar ahead of the struct; 99 is the value main passes */
+ if (p.a != ea) return 6; /* first word of p */
+ if (p.b != eb) return 7; /* second word of p */
+ return 0;
+}
+
+int main(void)
+{ /* Exit status is 0 on success; a non-zero status encodes which probe and which check failed. */
+ struct Pair a; a.a = 10; a.b = 20;
+ struct Pair b; b.a = 30; b.b = 40;
+
+ int r;
+ if ((r = probe(a, b, 10, 20, 30, 40))) return 10 + r; /* probe failure -> 11..14 */
+ if ((r = probe_after_int(99, a, 10, 20))) return 30 + r; /* probe_after_int failure -> 35..37 */
+
+ return 0;
+}