boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

kernel.c (61992B)


      1 /* seed kernel — minimal OS satisfying docs/OS.md Tier 1.
      2  *
      3  * Boots through an arch backend with two virtio-blk-MMIO disks, parses
      4  * the DTB to find virtio_mmio nodes + memory, brings up
      5  * a small polling virtio-blk driver, reads the cpio newc archive from
      6  * blk0 (read-only) into the in-memory tmpfs, loads /init (a static
      7  * target ELF), and enters it through the arch trap-return path. Syscall
      8  * traps land in trap_sync() and dispatch Tier-1/Tier-2 syscalls. On exit,
      9  * the tmpfs is
     10  * serialised to blk1 in a small SEEDFS table for the host extractor.
     11  */
     12 
/* Short integer aliases used throughout this file.
 * NOTE(review): u64/i64 are spelled `unsigned long`/`long`, which are
 * 64 bits wide only on LP64 targets — confirm every arch backend this
 * kernel is built for is LP64. */
typedef unsigned char  u8;
typedef unsigned short u16;
typedef unsigned int   u32;
typedef unsigned long  u64;
typedef long           i64;
typedef int            i32;
     19 
     20 #include "arch.h"
     21 
     22 /* ─── Console ───────────────────────────────────────────────────────────── */
     23 
     24 static void uart_putc(char c) {
     25     arch_console_putc(c);
     26 }
     27 
     28 static void uart_puts(const char *s) {
     29     while (*s) {
     30         if (*s == '\n') uart_putc('\r');
     31         uart_putc(*s++);
     32     }
     33 }
     34 
     35 static void uart_putx(u64 v) {
     36     static const char hex[] = "0123456789abcdef";
     37     uart_puts("0x");
     38     for (int i = 60; i >= 0; i -= 4) uart_putc(hex[(v >> i) & 0xf]);
     39 }
     40 
     41 static void uart_putd(i64 v) {
     42     if (v < 0) { uart_putc('-'); v = -v; }
     43     char buf[24];
     44     int i = 0;
     45     if (v == 0) buf[i++] = '0';
     46     while (v) { buf[i++] = '0' + (v % 10); v /= 10; }
     47     while (i--) uart_putc(buf[i]);
     48 }
     49 
     50 __attribute__((noreturn)) static void hang(void) { for (;;) arch_pause(); }
     51 
     52 /* ─── Tiny libc-ish helpers ─────────────────────────────────────────────── */
     53 
     54 /* memcpy / memset / memmove come from tcc/cc/mem.c, linked alongside.
     55  * Both gcc and tcc emit calls to these for struct copies and bulk
     56  * zero-init past their inline thresholds; centralising them in
     57  * tcc/cc/mem.c keeps the tcc-built and gcc-built kernels in sync. */
     58 void *memcpy(void *dst, const void *src, u64 n);
     59 void *memset(void *dst, int c, u64 n);
     60 void *memmove(void *dst, const void *src, u64 n);
     61 
     62 static int str_eq(const char *a, const char *b) {
     63     while (*a && *a == *b) { a++; b++; }
     64     return *a == 0 && *b == 0;
     65 }
     66 static int str_n(const char *s) { int n = 0; while (s[n]) n++; return n; }
     67 static void mem_cpy(void *d, const void *s, u64 n) {
     68     /* 8-byte fast path when both pointers are 8-aligned and n is a multiple
     69      * of 8. Under TCG this is roughly 8× faster than the byte loop. */
     70     u8 *dd = d; const u8 *ss = s;
     71     if ((((u64)dd | (u64)ss | n) & 7) == 0) {
     72         u64 *dq = (u64 *)dd;
     73         const u64 *sq = (const u64 *)ss;
     74         u64 m = n >> 3;
     75         for (u64 i = 0; i < m; i++) dq[i] = sq[i];
     76         return;
     77     }
     78     for (u64 i = 0; i < n; i++) dd[i] = ss[i];
     79 }
     80 static void mem_set(void *d, int c, u64 n) {
     81     u8 *dd = d;
     82     for (u64 i = 0; i < n; i++) dd[i] = (u8)c;
     83 }
     84 
     85 /* User address-space constants are supplied by the arch backend. */
     86 #define USER_POOL_A_PA ARCH_USER_POOL_A_PA
     87 #define USER_POOL_B_PA ARCH_USER_POOL_B_PA
     88 #define USER_POOL_SIZE ARCH_USER_POOL_SIZE
     89 #define USER_VA_LO     ARCH_USER_VA_LO
     90 #define USER_VA_HI     ARCH_USER_VA_HI
     91 #define USER_POOL_FIRST_SLOT ARCH_USER_POOL_FIRST_SLOT
     92 #define USER_POOL_LAST_SLOT  ARCH_USER_POOL_LAST_SLOT
     93 
     94 /* 0 = pool A is currently mapped at user VAs; 1 = pool B. */
     95 static int current_pool = 0;
     96 
     97 /* ─── Kernel heap (bump allocator) ──────────────────────────────────────── */
     98 
     99 extern char _end[];
    100 static u8 *kheap_ptr;
    101 static u8 *kheap_end;
    102 
    103 static void *kalloc(u64 n) {
    104     n = (n + 15) & ~15UL;
    105     if (kheap_ptr + n > kheap_end) {
    106         uart_puts("kalloc: out of memory\n");
    107         hang();
    108     }
    109     void *r = kheap_ptr;
    110     kheap_ptr += n;
    111     return r;
    112 }
    113 
    114 /* ─── Big-endian readers (DTB is BE) ────────────────────────────────────── */
    115 
    116 static u32 be32(const u8 *p) { return (u32)p[0]<<24 | (u32)p[1]<<16 | (u32)p[2]<<8 | (u32)p[3]; }
    117 static u64 be64(const u8 *p) { return ((u64)be32(p) << 32) | (u64)be32(p + 4); }
    118 
    119 /* ─── Flattened Device Tree walker ──────────────────────────────────────── */
    120 
    121 /* DTB ("flattened device tree blob") header + token codes — devicetree
    122  * spec §5.1 (header) + §5.4.1 (struct block tokens). */
    123 #define FDT_MAGIC      0xd00dfeedu
    124 #define FDT_BEGIN_NODE 1
    125 #define FDT_END_NODE   2
    126 #define FDT_PROP       3
    127 #define FDT_NOP        4
    128 #define FDT_END        9
    129 
    130 /* Xen PVH ABI handoff (see Xen's docs/misc/pvh.pandoc). On PVH boot QEMU
    131  * points EBX at a `struct hvm_start_info`; the amd64 long-mode init in
    132  * arch/amd64/kernel.S preserves EBX through to kmain's `dtb_phys` arg.
    133  * First word is the magic; cmdline_paddr sits at offset 0x18. microvm
    134  * has no DTB, so this is the only path for `qemu -append "..."`. */
    135 #define PVH_HVM_START_MAGIC       0x336ec578u
    136 #define PVH_HVM_START_OFF_CMDLINE 0x18
    137 
    138 /* cpio "newc" (SVR4 portable) format — first 6 bytes of every header
    139  * record are the magic; header is fixed-size (110 bytes) followed by
    140  * the NUL-terminated name and the file data, both 4-byte padded. The
    141  * 13 numeric fields are 8-char ASCII hex starting at offset 6. Spec:
    142  * cpio(5) under "New ASCII Format". */
    143 #define CPIO_NEWC_MAGIC "070701"
    144 #define CPIO_NEWC_HDR_SIZE 110
    145 #define CPIO_NEWC_FIELD(p, n) ((p) + 6 + (n) * 8)
    146 #define CPIO_MODE_TYPE_MASK 0xf000          /* st_mode file-type bits */
    147 #define CPIO_MODE_TYPE_DIR  0x4000
    148 #define CPIO_MODE_TYPE_REG  0x8000
    149 
    150 /* QEMU virt has 32 virtio-mmio slots (0x0a000000..0x0a004000, 0x200 each).
    151  * Most are unpopulated and report MagicValue=0/DeviceID=0 — we capture all
    152  * slots advertised by the DTB and the driver init filters at probe time. */
    153 #define MAX_VIRTIO_MMIO 32
    154 
/* Boot-time hardware description filled in by parse_dtb(). */
struct dtb_info {
    u64 mem_start;                          /* base PA of the first memory region found */
    u64 mem_size;                           /* its size in bytes; 0 means "not found yet" */
    u64 virtio_mmio_pa[MAX_VIRTIO_MMIO];    /* PA of each advertised virtio-mmio slot */
    int virtio_mmio_n;                      /* number of valid entries in virtio_mmio_pa[] */
    char bootargs[256];                     /* NUL-terminated kernel command line (truncated to 255) */
};
    162 
    163 /* str_starts: returns 1 iff `s` begins with `prefix`. */
    164 static int str_starts(const char *s, const char *prefix) {
    165     while (*prefix) { if (*s++ != *prefix++) return 0; }
    166     return 1;
    167 }
    168 
    169 static void parse_dtb(const void *dtb, struct dtb_info *out) {
    170 #ifdef ARCH_STATIC_VIRTIO_MMIO_BASE
    171     /* No DTB on this arch (amd64 microvm). mem + virtio-mmio come from
    172      * arch.h compile-time constants; the kernel cmdline arrives via the
    173      * PVH hvm_start_info struct (see PVH_HVM_START_* above). */
    174     out->mem_start = ARCH_STATIC_MEM_START;
    175     out->mem_size = ARCH_STATIC_MEM_SIZE;
    176     out->virtio_mmio_n = ARCH_STATIC_VIRTIO_MMIO_COUNT;
    177     for (int i = 0; i < out->virtio_mmio_n && i < MAX_VIRTIO_MMIO; i++)
    178         out->virtio_mmio_pa[i] = ARCH_STATIC_VIRTIO_MMIO_BASE +
    179                                  (u64)i * ARCH_STATIC_VIRTIO_MMIO_STRIDE;
    180     if ((u64)dtb != 0) {
    181         const u8 *p = dtb;
    182         u32 magic = (u32)p[0] | ((u32)p[1] << 8) | ((u32)p[2] << 16) | ((u32)p[3] << 24);
    183         if (magic == PVH_HVM_START_MAGIC) {
    184             const u8 *cp = p + PVH_HVM_START_OFF_CMDLINE;
    185             u64 cmdline_paddr = 0;
    186             for (int i = 0; i < 8; i++) cmdline_paddr |= (u64)cp[i] << (i * 8);
    187             if (cmdline_paddr) {
    188                 const char *s = (const char *)cmdline_paddr;
    189                 int i = 0;
    190                 while (s[i] && i < 255) { out->bootargs[i] = s[i]; i++; }
    191                 out->bootargs[i] = 0;
    192             }
    193         }
    194     }
    195     return;
    196 #endif
    197     const u8 *base = dtb;
    198     if (be32(base) != FDT_MAGIC) {
    199         uart_puts("DTB: bad magic\n"); return;
    200     }
    201     u32 off_struct = be32(base + 8);
    202     u32 off_strings = be32(base + 12);
    203     const u8 *strings = base + off_strings;
    204     const u8 *p = base + off_struct;
    205 
    206     char path[4][64] = {{0}};
    207     int depth = -1;
    208 
    209     for (;;) {
    210         u32 tok = be32(p); p += 4;
    211         if (tok == FDT_BEGIN_NODE) {
    212             depth++;
    213             if (depth < 4) {
    214                 int i = 0;
    215                 while (p[i] && i < 63) { path[depth][i] = (char)p[i]; i++; }
    216                 path[depth][i] = 0;
    217             }
    218             while (*p) p++;
    219             p++;
    220             p = (const u8 *)(((u64)p + 3) & ~3UL);
    221         } else if (tok == FDT_END_NODE) {
    222             depth--;
    223         } else if (tok == FDT_PROP) {
    224             u32 len = be32(p); p += 4;
    225             u32 nameoff = be32(p); p += 4;
    226             const char *pn = (const char *)(strings + nameoff);
    227 
    228             if (depth == 1 && str_eq(path[1], "chosen")) {
    229                 if (str_eq(pn, "bootargs")) {
    230                     u32 i = 0;
    231                     while (i < len && i < 255) { out->bootargs[i] = (char)p[i]; i++; }
    232                     out->bootargs[i] = 0;
    233                 }
    234             }
    235             if (depth == 1) {
    236                 /* memory node is named "memory@<addr>" */
    237                 if (str_starts(path[1], "memory") &&
    238                     str_eq(pn, "reg") && len >= 16 && out->mem_size == 0) {
    239                     out->mem_start = be64(p);
    240                     out->mem_size  = be64(p + 8);
    241                 }
    242             }
    243             /* virtio-mmio nodes: capture each slot's PA. Root/soc
    244              * #address-cells/#size-cells are both 2 on QEMU virt, so reg
    245              * is 16 bytes (PA u64, size u64); we only need the PA. */
    246             if (depth >= 1 && str_starts(path[depth], "virtio_mmio@") &&
    247                 str_eq(pn, "reg") && len >= 16 &&
    248                 out->virtio_mmio_n < MAX_VIRTIO_MMIO) {
    249                 out->virtio_mmio_pa[out->virtio_mmio_n++] = be64(p);
    250             }
    251             p += len;
    252             p = (const u8 *)(((u64)p + 3) & ~3UL);
    253         } else if (tok == FDT_NOP) {
    254             /* skip */
    255         } else if (tok == FDT_END) {
    256             break;
    257         } else {
    258             uart_puts("DTB: bad token "); uart_putx(tok); uart_puts("\n");
    259             break;
    260         }
    261     }
    262 }
    263 
    264 /* ─── virtio-blk-MMIO driver (polling, single-VQ) ───────────────────────── */
    265 /*
    266  * Two block devices: blk0 = read-only cpio input, blk1 = read-write output.
    267  * Identification is content-based (sector 0 cpio newc magic "070701" ⇒ blk0)
    268  * so we don't depend on -drive ordering on the qemu command line.
    269  *
    270  * The driver is intentionally small: 8-entry split virtqueue, one in-flight
    271  * request at a time, polling used.idx. No interrupts (DAIF stays masked).
    272  *
    273  * MMIO transport regs are reached via DEVICE_ALIAS_BASE + PA. virtqueue
    274  * memory comes from kernel BSS (identity-mapped Normal, VA == PA).
    275  */
    276 
    277 #define VIRTIO_MMIO_MAGIC          0x000
    278 #define VIRTIO_MMIO_VERSION        0x004
    279 #define VIRTIO_MMIO_DEVICE_ID      0x008
    280 #define VIRTIO_MMIO_DEV_FEATURES   0x010
    281 #define VIRTIO_MMIO_DEV_FEAT_SEL   0x014
    282 #define VIRTIO_MMIO_DRV_FEATURES   0x020
    283 #define VIRTIO_MMIO_DRV_FEAT_SEL   0x024
    284 #define VIRTIO_MMIO_QUEUE_SEL      0x030
    285 #define VIRTIO_MMIO_QUEUE_NUM_MAX  0x034
    286 #define VIRTIO_MMIO_QUEUE_NUM      0x038
    287 #define VIRTIO_MMIO_QUEUE_READY    0x044
    288 #define VIRTIO_MMIO_QUEUE_NOTIFY   0x050
    289 #define VIRTIO_MMIO_INT_STATUS     0x060
    290 #define VIRTIO_MMIO_INT_ACK        0x064
    291 #define VIRTIO_MMIO_STATUS         0x070
    292 #define VIRTIO_MMIO_QUEUE_DESC_LO  0x080
    293 #define VIRTIO_MMIO_QUEUE_DESC_HI  0x084
    294 #define VIRTIO_MMIO_QUEUE_DRIVER_LO 0x090
    295 #define VIRTIO_MMIO_QUEUE_DRIVER_HI 0x094
    296 #define VIRTIO_MMIO_QUEUE_DEVICE_LO 0x0a0
    297 #define VIRTIO_MMIO_QUEUE_DEVICE_HI 0x0a4
    298 #define VIRTIO_MMIO_CONFIG         0x100
    299 
    300 #define VIRTIO_STATUS_ACKNOWLEDGE  1
    301 #define VIRTIO_STATUS_DRIVER       2
    302 #define VIRTIO_STATUS_DRIVER_OK    4
    303 #define VIRTIO_STATUS_FEATURES_OK  8
    304 #define VIRTIO_STATUS_FAILED       128
    305 
    306 #define VIRTIO_F_VERSION_1_BIT     32  /* bit 32 in feature space */
    307 
    308 #define VIRTIO_BLK_T_IN  0
    309 #define VIRTIO_BLK_T_OUT 1
    310 
    311 #define VRING_DESC_F_NEXT  1
    312 #define VRING_DESC_F_WRITE 2
    313 
    314 #define VQ_SIZE 8
    315 
/* One descriptor-table entry of the virtqueue (layout is shared with the
 * device, so field order and widths must not change). */
struct vring_desc {
    u64 addr;     /* address of the buffer (the driver passes VA == PA) */
    u32 len;      /* buffer length in bytes */
    u16 flags;    /* VRING_DESC_F_* bits */
    u16 next;     /* index of the next descriptor when VRING_DESC_F_NEXT is set */
};
    322 
/* Driver-written ring: the driver stores a descriptor-chain head in
 * ring[idx % VQ_SIZE] and then increments idx to publish it. */
struct vring_avail {
    u16 flags;            /* left at 0 by this driver (queue page is zeroed) */
    u16 idx;              /* free-running count of published entries */
    u16 ring[VQ_SIZE];    /* heads of published descriptor chains */
    u16 used_event;       /* not touched by the visible driver code */
};
    329 
/* One entry of the used ring. The visible driver code never reads these
 * fields (it only polls vring_used.idx).
 * NOTE(review): presumably id = head of the completed chain and len =
 * bytes written by the device, per the virtio spec — confirm. */
struct vring_used_elem {
    u32 id;
    u32 len;
};
    334 
/* Completion ring. The driver polls `idx` to detect that its single
 * in-flight request has finished. */
struct vring_used {
    u16 flags;
    u16 idx;                                /* polled by blk_request_one() */
    struct vring_used_elem ring[VQ_SIZE];
    u16 avail_event;                        /* not touched by the visible driver code */
};
    341 
/* Request header placed in the first descriptor of every block request. */
struct virtio_blk_req_hdr {
    u32 type;        /* VIRTIO_BLK_T_IN (read) or VIRTIO_BLK_T_OUT (write) */
    u32 reserved;    /* always written as 0 */
    u64 sector;      /* starting sector; buffers are sized as nsec * 512 bytes */
};
    347 
    348 #define BLK_DEV_MAX 2
    349 
/* Per-device driver state; its index in blk_devs[] also selects the
 * matching virtqueue page. */
struct blk_dev {
    volatile u8 *regs;             /* alias VA pointing at the MMIO region */
    u64 capacity_sectors;          /* capacity read from device config space at init */
    int present;                   /* set to 1 once blk_init_one() succeeds */
};
    355 
    356 static struct blk_dev blk_devs[BLK_DEV_MAX];
    357 static int blk_n_devs = 0;
    358 
    359 /* One vring per device in BSS, 4 KB-aligned. Layout within the page:
    360  *   desc:  offset 0   (128 B for VQ_SIZE=8)
    361  *   avail: offset 128 (24 B with VQ_SIZE=8 + used_event)
    362  *   used:  offset 256 (72 B with VQ_SIZE=8 + avail_event)
    363  * Plenty of slack inside one 4 KB page. */
    364 __attribute__((aligned(4096))) static u8 vq_pages[BLK_DEV_MAX][4096];
    365 
    366 #define VQ_DESC_OFF  0
    367 #define VQ_AVAIL_OFF 128
    368 #define VQ_USED_OFF  256
    369 
    370 static struct vring_desc *vq_desc(int i)
    371     { return (struct vring_desc *)(vq_pages[i] + VQ_DESC_OFF); }
    372 static struct vring_avail *vq_avail(int i)
    373     { return (struct vring_avail *)(vq_pages[i] + VQ_AVAIL_OFF); }
    374 static struct vring_used *vq_used(int i)
    375     { return (struct vring_used *)(vq_pages[i] + VQ_USED_OFF); }
    376 
    377 /* MMIO accessors. Reg offsets are byte offsets per the spec. */
    378 static u32 mmio_r32(struct blk_dev *d, u32 off) {
    379     return *(volatile u32 *)(d->regs + off);
    380 }
    381 static void mmio_w32(struct blk_dev *d, u32 off, u32 val) {
    382     *(volatile u32 *)(d->regs + off) = val;
    383 }
    384 
/* Initialise one device per spec §3.1.1 / §4.2 / §5.2. Returns 1 on
 * success (device is virtio-blk and ready), 0 if slot is empty / not
 * a block device, -1 on error.
 *
 * `d` must be an element of blk_devs[] with d->regs already set: its
 * array index picks the vq_pages[] entry used for the queue. On success
 * d->capacity_sectors and d->present are filled in. The register writes
 * below are order-sensitive; do not reorder them. */
static int blk_init_one(struct blk_dev *d) {
    u32 magic = mmio_r32(d, VIRTIO_MMIO_MAGIC);
    if (magic != 0x74726976) return 0;                  /* not a virtio slot */
    u32 devid = mmio_r32(d, VIRTIO_MMIO_DEVICE_ID);
    if (devid == 0) return 0;                           /* unpopulated slot */
    if (devid != 2) return 0;                           /* not a block device */
    u32 ver   = mmio_r32(d, VIRTIO_MMIO_VERSION);
    if (ver != 2) {
        /* QEMU virt defaults to legacy (version 1) virtio-mmio transports
         * unless the host passes -global virtio-mmio.force-legacy=false.
         * The harness scripts set that flag — reaching here means it was
         * forgotten. */
        uart_puts("[seed] virtio-mmio version != 2 (legacy): ");
        uart_putd((i64)ver);
        uart_puts(" — pass -global virtio-mmio.force-legacy=false\n");
        return -1;
    }

    /* Reset, ack, driver. */
    mmio_w32(d, VIRTIO_MMIO_STATUS, 0);
    mmio_w32(d, VIRTIO_MMIO_STATUS, VIRTIO_STATUS_ACKNOWLEDGE);
    mmio_w32(d, VIRTIO_MMIO_STATUS,
             VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER);

    /* Negotiate VIRTIO_F_VERSION_1 only (bit 32 → feature word 1). */
    mmio_w32(d, VIRTIO_MMIO_DEV_FEAT_SEL, 1);
    u32 dev_feat_hi = mmio_r32(d, VIRTIO_MMIO_DEV_FEATURES);
    if (!(dev_feat_hi & (1u << (VIRTIO_F_VERSION_1_BIT - 32)))) {
        uart_puts("[seed] virtio: device lacks VERSION_1\n");
        return -1;
    }
    /* Driver feature word 0: accept nothing. Word 1: VERSION_1 only. */
    mmio_w32(d, VIRTIO_MMIO_DRV_FEAT_SEL, 0);
    mmio_w32(d, VIRTIO_MMIO_DRV_FEATURES, 0);
    mmio_w32(d, VIRTIO_MMIO_DRV_FEAT_SEL, 1);
    mmio_w32(d, VIRTIO_MMIO_DRV_FEATURES, 1u << (VIRTIO_F_VERSION_1_BIT - 32));

    /* Set FEATURES_OK, then read status back: the bit stays set only if
     * the device accepted the negotiated feature set. */
    mmio_w32(d, VIRTIO_MMIO_STATUS,
             VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER |
             VIRTIO_STATUS_FEATURES_OK);
    u32 st = mmio_r32(d, VIRTIO_MMIO_STATUS);
    if (!(st & VIRTIO_STATUS_FEATURES_OK)) {
        uart_puts("[seed] virtio: FEATURES_OK rejected\n");
        return -1;
    }

    /* Queue 0. */
    mmio_w32(d, VIRTIO_MMIO_QUEUE_SEL, 0);
    u32 qmax = mmio_r32(d, VIRTIO_MMIO_QUEUE_NUM_MAX);
    if (qmax < VQ_SIZE) {
        uart_puts("[seed] virtio: QueueNumMax < VQ_SIZE\n");
        return -1;
    }
    mmio_w32(d, VIRTIO_MMIO_QUEUE_NUM, VQ_SIZE);

    /* Index of `d` within blk_devs[] — selects this device's queue page. */
    int i = (int)(d - blk_devs);
    /* Zero the queue page so all idx/flags start at 0. */
    for (int k = 0; k < 4096; k++) vq_pages[i][k] = 0;

    /* Hand the three ring addresses to the device as lo/hi 32-bit halves
     * (the ring pointers are used directly as physical addresses). */
    u64 desc_pa  = (u64)vq_desc(i);
    u64 avail_pa = (u64)vq_avail(i);
    u64 used_pa  = (u64)vq_used(i);
    mmio_w32(d, VIRTIO_MMIO_QUEUE_DESC_LO,    (u32)desc_pa);
    mmio_w32(d, VIRTIO_MMIO_QUEUE_DESC_HI,    (u32)(desc_pa >> 32));
    mmio_w32(d, VIRTIO_MMIO_QUEUE_DRIVER_LO,  (u32)avail_pa);
    mmio_w32(d, VIRTIO_MMIO_QUEUE_DRIVER_HI,  (u32)(avail_pa >> 32));
    mmio_w32(d, VIRTIO_MMIO_QUEUE_DEVICE_LO,  (u32)used_pa);
    mmio_w32(d, VIRTIO_MMIO_QUEUE_DEVICE_HI,  (u32)(used_pa >> 32));

    /* Mark the queue ready, then declare the driver fully initialised. */
    mmio_w32(d, VIRTIO_MMIO_QUEUE_READY, 1);
    mmio_w32(d, VIRTIO_MMIO_STATUS,
             VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER |
             VIRTIO_STATUS_FEATURES_OK | VIRTIO_STATUS_DRIVER_OK);

    /* virtio-blk config @+0 = capacity (sectors, little-endian u64). */
    u32 cap_lo = *(volatile u32 *)(d->regs + VIRTIO_MMIO_CONFIG + 0);
    u32 cap_hi = *(volatile u32 *)(d->regs + VIRTIO_MMIO_CONFIG + 4);
    d->capacity_sectors = ((u64)cap_hi << 32) | cap_lo;
    d->present = 1;
    return 1;
}
    468 
    469 /* Issue one request (VIRTIO_BLK_T_IN or _OUT) and poll for completion.
    470  * `buf` PA == VA (kheap or kernel BSS / stack). nsec ≤ 2048 (1 MB/req). */
    471 static int blk_request_one(int devi, u32 type, u64 sector, void *buf, u64 nsec) {
    472     struct blk_dev *d = &blk_devs[devi];
    473     /* Per-call hdr/status — both reachable via VA==PA on the kernel stack. */
    474     struct virtio_blk_req_hdr hdr = { .type = type, .reserved = 0, .sector = sector };
    475     volatile u8 status = 0xff;
    476 
    477     struct vring_desc *desc = vq_desc(devi);
    478     struct vring_avail *avail = vq_avail(devi);
    479     struct vring_used *used = vq_used(devi);
    480 
    481     desc[0].addr  = (u64)&hdr;
    482     desc[0].len   = (u32)sizeof(hdr);
    483     desc[0].flags = VRING_DESC_F_NEXT;
    484     desc[0].next  = 1;
    485 
    486     desc[1].addr  = (u64)buf;
    487     desc[1].len   = (u32)(nsec * 512);
    488     /* For READ (T_IN), device writes into our buffer (F_WRITE).
    489      * For WRITE (T_OUT), device reads our buffer (no F_WRITE). */
    490     desc[1].flags = VRING_DESC_F_NEXT | (type == VIRTIO_BLK_T_IN ? VRING_DESC_F_WRITE : 0);
    491     desc[1].next  = 2;
    492 
    493     desc[2].addr  = (u64)&status;
    494     desc[2].len   = 1;
    495     desc[2].flags = VRING_DESC_F_WRITE;
    496     desc[2].next  = 0;
    497 
    498     u16 head = 0;
    499     u16 ai = avail->idx;
    500     avail->ring[ai % VQ_SIZE] = head;
    501     arch_wmb();
    502     avail->idx = ai + 1;
    503     arch_wmb();
    504 
    505     mmio_w32(d, VIRTIO_MMIO_QUEUE_NOTIFY, 0);
    506 
    507     /* Poll used.idx — single in-flight, advances by exactly one. */
    508     while (used->idx == ai) {
    509         arch_pause();
    510     }
    511     arch_rmb();
    512 
    513     /* Acknowledge any pending interrupt status (we don't service IRQs but
    514      * the device sets these bits anyway). */
    515     u32 is = mmio_r32(d, VIRTIO_MMIO_INT_STATUS);
    516     if (is) mmio_w32(d, VIRTIO_MMIO_INT_ACK, is);
    517 
    518     if (status != 0) {
    519         uart_puts("[seed] virtio-blk req failed status="); uart_putd((i64)status);
    520         uart_puts("\n");
    521         return -1;
    522     }
    523     return 0;
    524 }
    525 
    526 /* Multi-chunk read/write. Chunks at 1 MB (2048 sectors) per request. */
    527 #define BLK_CHUNK_SECTORS 2048
    528 
    529 static int blk_io(int devi, u32 type, u64 sector, u8 *buf, u64 nsec) {
    530     while (nsec) {
    531         u64 n = nsec;
    532         if (n > BLK_CHUNK_SECTORS) n = BLK_CHUNK_SECTORS;
    533         if (blk_request_one(devi, type, sector, buf, n) < 0) return -1;
    534         sector += n;
    535         buf    += n * 512;
    536         nsec   -= n;
    537     }
    538     return 0;
    539 }
    540 
    541 static int blk_read(int devi, u64 sector, void *buf, u64 nsec) {
    542     return blk_io(devi, VIRTIO_BLK_T_IN, sector, buf, nsec);
    543 }
    544 static int blk_write(int devi, u64 sector, const void *buf, u64 nsec) {
    545     return blk_io(devi, VIRTIO_BLK_T_OUT, sector, (u8 *)buf, nsec);
    546 }
    547 
    548 /* Probe every populated MMIO slot the DTB advertised; bring up block
    549  * devices; identify blk0 vs blk1 by reading sector 0 — the cpio newc
    550  * magic ("070701") is on blk0, the other is blk1. Panics if exactly one
    551  * of each isn't found. */
    552 static int g_blk_input  = -1;     /* index in blk_devs[] for cpio input */
    553 static int g_blk_output = -1;     /* index for output dump              */
    554 
    555 static void blk_init(struct dtb_info *dt) {
    556     int n_blocks = 0;
    557     for (int i = 0; i < dt->virtio_mmio_n; i++) {
    558         u64 pa = dt->virtio_mmio_pa[i];
    559         if (n_blocks >= BLK_DEV_MAX) break;
    560         /* Stage into blk_devs[n_blocks] so blk_init_one's index-derived
    561          * vq page assignment is correct from the start. */
    562         struct blk_dev *d = &blk_devs[n_blocks];
    563         d->regs = arch_mmio_ptr(pa);
    564         d->capacity_sectors = 0;
    565         d->present = 0;
    566         int r = blk_init_one(d);
    567         if (r > 0) {
    568             n_blocks++;
    569         } else if (r < 0) {
    570             uart_puts("[seed] virtio: init failed at PA="); uart_putx(pa);
    571             uart_puts("\n");
    572             hang();
    573         }
    574     }
    575     blk_n_devs = n_blocks;
    576     if (n_blocks != 2) {
    577         uart_puts("[seed] virtio-blk: expected 2 block devices, got ");
    578         uart_putd((i64)n_blocks); uart_puts("\n");
    579         hang();
    580     }
    581 
    582     /* Identify blk0 (cpio) vs blk1 (output) by reading sector 0. */
    583     __attribute__((aligned(16))) static u8 probe[512];
    584     for (int i = 0; i < n_blocks; i++) {
    585         if (blk_read(i, 0, probe, 1) < 0) {
    586             uart_puts("[seed] virtio-blk: probe read failed dev=");
    587             uart_putd((i64)i); uart_puts("\n");
    588             hang();
    589         }
    590         int is_cpio = str_starts((const char *)probe, CPIO_NEWC_MAGIC);
    591         if (is_cpio) {
    592             if (g_blk_input >= 0) {
    593                 uart_puts("[seed] virtio-blk: multiple cpio disks\n");
    594                 hang();
    595             }
    596             g_blk_input = i;
    597         } else {
    598             if (g_blk_output >= 0) {
    599                 uart_puts("[seed] virtio-blk: multiple non-cpio disks\n");
    600                 hang();
    601             }
    602             g_blk_output = i;
    603         }
    604     }
    605     if (g_blk_input < 0 || g_blk_output < 0) {
    606         uart_puts("[seed] virtio-blk: failed to identify in/out\n");
    607         hang();
    608     }
    609     uart_puts("[seed] virtio-blk: in=dev"); uart_putd((i64)g_blk_input);
    610     uart_puts(" cap=");                     uart_putd((i64)blk_devs[g_blk_input].capacity_sectors);
    611     uart_puts(" sec  out=dev");             uart_putd((i64)g_blk_output);
    612     uart_puts(" cap=");                     uart_putd((i64)blk_devs[g_blk_output].capacity_sectors);
    613     uart_puts(" sec\n");
    614 }
    615 
    616 /* ─── In-memory tmpfs from cpio newc ────────────────────────────────────── */
    617 
    618 /* boot5 stages a full musl tree in the cpio (~1300 .c sources + ~1200
    619  * headers/aux) plus per-TU .o outputs (~1300) — observed cpio entry
    620  * count is ~2600 inputs + ~1300 outputs ≈ 3900. Round up to 4096 to
    621  * leave headroom; the struct is ~120 bytes so this costs ~480 KB of
    622  * kernel BSS — comfortably within the 192 MB kheap. Path length is 96
    623  * to fit "tmp/musl-1.2.5/obj/src/<sub>/<name>.o" (paths stored with
    624  * leading slashes stripped, so these are user-visible as
    625  * /tmp/musl-1.2.5/obj/...). */
    626 #define MAX_FILES 4096
/* One tmpfs entry: a normalized path mapped to a heap-allocated blob. */
struct file {
    int   used;        /* non-zero when this slot holds a file */
    char  path[96];    /* normalized path without leading '/', NUL-terminated (longer paths are truncated) */
    u8   *data;        /* file contents, allocated with kalloc() */
    u64   len;         /* current file length in bytes */
    u64   cap;         /* allocated size of `data` in bytes (at least 1) */
};
    634 static struct file files[MAX_FILES];
    635 
    636 /* Resolve "." and ".." segments in `in` into `out`. Required because real
    637  * filesystems normalize `foo/../bar` → `bar` at lookup time, but our
    638  * tmpfs is a flat path → blob map; without this, tcc's pstrcat-style
    639  * include resolution (e.g. "/tmp/musl-1.2.5/src/include/" +
    640  * "../../include/features.h") produces a literal path that misses the
    641  * real entry "tmp/musl-1.2.5/include/features.h". The buffers are sized
    642  * for our worst case (~96-char paths) plus headroom for the unresolved
    643  * form. Trailing slashes are dropped. */
    644 static void normalize_path(const char *in, char *out, int outsz) {
    645     char buf[256];
    646     int n = 0;
    647     while (in[n] && n < (int)sizeof(buf) - 1) { buf[n] = in[n]; n++; }
    648     buf[n] = 0;
    649 
    650     const char *segs[64];
    651     int seg_lens[64];
    652     int nsegs = 0;
    653     int i = 0;
    654     while (buf[i]) {
    655         int start = i;
    656         while (buf[i] && buf[i] != '/') i++;
    657         int len = i - start;
    658         if (buf[i]) { buf[i] = 0; i++; }
    659         if (len == 0) continue;
    660         if (len == 1 && buf[start] == '.') continue;
    661         if (len == 2 && buf[start] == '.' && buf[start + 1] == '.') {
    662             if (nsegs > 0) nsegs--;
    663             continue;
    664         }
    665         if (nsegs < 64) {
    666             segs[nsegs] = &buf[start];
    667             seg_lens[nsegs] = len;
    668             nsegs++;
    669         }
    670     }
    671 
    672     int o = 0;
    673     for (int k = 0; k < nsegs; k++) {
    674         for (int j = 0; j < seg_lens[k] && o < outsz - 1; j++)
    675             out[o++] = segs[k][j];
    676         if (k < nsegs - 1 && o < outsz - 1) out[o++] = '/';
    677     }
    678     out[o] = 0;
    679 }
    680 
    681 static int find_file(const char *path) {
    682     while (*path == '/') path++;
    683     char norm[128];
    684     normalize_path(path, norm, sizeof(norm));
    685     for (int i = 0; i < MAX_FILES; i++) {
    686         if (files[i].used && str_eq(files[i].path, norm)) return i;
    687     }
    688     return -1;
    689 }
    690 
    691 static int new_file(const char *path) {
    692     while (*path == '/') path++;
    693     /* Normalize at store time so all later lookups match regardless of
    694      * how the caller spelled `..` / `.`. */
    695     char norm[128];
    696     normalize_path(path, norm, sizeof(norm));
    697     for (int i = 0; i < MAX_FILES; i++) {
    698         if (!files[i].used) {
    699             files[i].used = 1;
    700             int j = 0;
    701             while (norm[j] && j < (int)sizeof(files[i].path) - 1) {
    702                 files[i].path[j] = norm[j]; j++;
    703             }
    704             files[i].path[j] = 0;
    705             files[i].data = 0;
    706             files[i].len = 0;
    707             files[i].cap = 0;
    708             return i;
    709         }
    710     }
    711     return -1;
    712 }
    713 
    714 static u64 hex_n(const char *s, int n) {
    715     u64 v = 0;
    716     for (int i = 0; i < n; i++) {
    717         char c = s[i];
    718         v <<= 4;
    719         if (c >= '0' && c <= '9') v |= (u64)(c - '0');
    720         else if (c >= 'a' && c <= 'f') v |= (u64)(c - 'a' + 10);
    721         else if (c >= 'A' && c <= 'F') v |= (u64)(c - 'A' + 10);
    722     }
    723     return v;
    724 }
    725 
    726 static void parse_cpio(const void *cpio, u64 total) {
    727     const u8 *p = cpio;
    728     const u8 *end = p + total;
    729     while (p + CPIO_NEWC_HDR_SIZE <= end) {
    730         if (!str_starts((const char *)p, CPIO_NEWC_MAGIC)) break;
    731         u64 mode    = hex_n((const char *)CPIO_NEWC_FIELD(p, 1),  8);
    732         u64 fsz     = hex_n((const char *)CPIO_NEWC_FIELD(p, 6),  8);
    733         u64 nsz     = hex_n((const char *)CPIO_NEWC_FIELD(p, 11), 8);
    734         const char *name = (const char *)(p + CPIO_NEWC_HDR_SIZE);
    735         if (str_eq(name, "TRAILER!!!")) break;
    736 
    737         u64 hstride = (CPIO_NEWC_HDR_SIZE + nsz + 3) & ~3UL;
    738         u64 fstride = (fsz + 3) & ~3UL;
    739         const u8 *fdata = p + hstride;
    740 
    741         int is_dir = ((mode & CPIO_MODE_TYPE_MASK) == CPIO_MODE_TYPE_DIR);
    742         int is_reg = ((mode & CPIO_MODE_TYPE_MASK) == CPIO_MODE_TYPE_REG);
    743         if (is_reg && !str_eq(name, ".")) {
    744             int idx = new_file(name);
    745             if (idx >= 0) {
    746                 /* Copy out — we'll let the user write back later if needed. */
    747                 files[idx].data = kalloc(fsz ? fsz : 1);
    748                 files[idx].cap  = fsz ? fsz : 1;
    749                 files[idx].len  = fsz;
    750                 if (fsz) mem_cpy(files[idx].data, fdata, fsz);
    751             } else {
    752                 /* Silent drops here are how MAX_FILES being too low
    753                  * masquerades as random "file not found" errors during
    754                  * the build — surface it loudly. */
    755                 uart_puts("[seed] WARN: cpio entry dropped (MAX_FILES "
    756                           "exhausted): "); uart_puts(name); uart_puts("\n");
    757             }
    758         }
    759         (void)is_dir;
    760         p += hstride + fstride;
    761     }
    762 }
    763 
    764 /* ─── ELF64 static loader ───────────────────────────────────────────────── */
    765 
    766 struct ehdr { u8 e_ident[16]; u16 e_type, e_machine; u32 e_version; u64 e_entry, e_phoff, e_shoff; u32 e_flags; u16 e_ehsize, e_phentsize, e_phnum, e_shentsize, e_shnum, e_shstrndx; };
    767 struct phdr { u32 p_type, p_flags; u64 p_offset, p_vaddr, p_paddr, p_filesz, p_memsz, p_align; };
    768 
    769 #define PT_LOAD 1
    770 
    771 /* Highest VA touched by the most recently loaded image's PT_LOAD segments
    772  * (after USER_VA_HI clipping). load_elf updates this; kmain / sys_spawn
    773  * use it to seed brk_base above the user image's BSS. */
    774 static u64 g_user_image_end;
    775 
    776 static u64 load_elf(const u8 *elf) {
    777     const struct ehdr *eh = (const struct ehdr *)elf;
    778     if (!(eh->e_ident[0] == 0x7f && eh->e_ident[1] == 'E' &&
    779           eh->e_ident[2] == 'L'  && eh->e_ident[3] == 'F')) {
    780         uart_puts("ELF: bad magic\n"); return 0;
    781     }
    782     if (eh->e_machine != ARCH_ELF_MACHINE) {
    783         uart_puts("ELF: not "); uart_puts(ARCH_ELF_MACHINE_NAME); uart_puts("\n"); return 0;
    784     }
    785     /* p_flags (R/W/X) are deliberately ignored: the user mapping is one
    786      * giant Normal-memory RWX region (see arch_setup_mmu). OS.md
    787      * §"Memory model" permits this — there's no W^X enforcement in the
    788      * contract, and tcc-boot2 never JITs.
    789      *
    790      * Segments are clipped at USER_VA_HI: a binary may declare a BSS that
    791      * extends past the mapped user window (scheme1 reserves ~256 MB), and
    792      * a naive mem_set would walk into the device-block region above and
    793      * trigger an external abort. The user image gets only the portion of
    794      * its memsz that fits in the user pool; if user code later touches
    795      * the unmapped tail, that's a user-space fault, not a kernel panic. */
    796     u64 hi = 0;
    797     for (int i = 0; i < eh->e_phnum; i++) {
    798         const struct phdr *ph = (const struct phdr *)(elf + eh->e_phoff + (u64)i * eh->e_phentsize);
    799         if (ph->p_type != PT_LOAD) continue;
    800         u64 vaddr = ph->p_vaddr;
    801         u64 filesz = ph->p_filesz;
    802         u64 memsz  = ph->p_memsz;
    803         if (vaddr >= USER_VA_HI) continue;          /* segment fully out of window */
    804         u64 reach = USER_VA_HI - vaddr;
    805         if (filesz > reach) filesz = reach;
    806         if (memsz > reach)  memsz = reach;
    807         u8 *dst = (u8 *)vaddr;
    808         const u8 *src = elf + ph->p_offset;
    809         mem_cpy(dst, src, filesz);
    810         if (memsz > filesz)
    811             mem_set(dst + filesz, 0, memsz - filesz);
    812         u64 end = vaddr + memsz;
    813         if (end > hi) hi = end;
    814     }
    815     /* Round up to 16 bytes so callers can use it directly as brk_base. */
    816     g_user_image_end = (hi + 15) & ~15UL;
    817     /* I-cache sync (cheap insurance even with caches off). */
    818     arch_icache_sync();
    819     return eh->e_entry;
    820 }
    821 
    822 /* ─── Syscall layer (Tier 1) ────────────────────────────────────────────── */
    823 
    824 #define MAX_FD 32
    825 struct fdent { int used; int fidx; u64 pos; int wflag; int append; };
    826 static struct fdent fdtab[MAX_FD];
    827 
    828 /* User program break (single-process). */
    829 static u64 brk_base;
    830 static u64 brk_cur;
    831 static u64 brk_max;
    832 
    833 #define EBADF   9
    834 #define ENOENT  2
    835 #define EINVAL  22
    836 #define EMFILE  24
    837 #define EFAULT  14
    838 #define ENOSPC  28
    839 
    840 #define O_RDONLY  0
    841 #define O_WRONLY  1
    842 #define O_RDWR    2
    843 #define O_CREAT   0100
    844 #define O_TRUNC   01000
    845 #define O_APPEND  02000
    846 
    847 #define AT_FDCWD (-100)
    848 
    849 #define SYS_unlinkat   ARCH_SYS_unlinkat
    850 #define SYS_openat     ARCH_SYS_openat
    851 #ifdef ARCH_SYS_open
    852 #define SYS_open       ARCH_SYS_open
    853 #endif
    854 #define SYS_close      ARCH_SYS_close
    855 #define SYS_lseek      ARCH_SYS_lseek
    856 #define SYS_read       ARCH_SYS_read
    857 #define SYS_write      ARCH_SYS_write
    858 #define SYS_exit_group ARCH_SYS_exit_group
    859 #define SYS_waitid     ARCH_SYS_waitid
    860 #define SYS_brk        ARCH_SYS_brk
    861 /* Private syscall number, deliberately outside the normal Linux range.
    862  * The scheme1 prelude probes (sys-spawn) once at init: on Linux this
    863  * number is unmapped so the probe gets -ENOSYS and the prelude falls
    864  * back to the classic clone+execve path; on the seed kernel the probe
    865  * succeeds (or returns -ENOENT for a missing file) and the prelude uses
    866  * sys-spawn for every (run …) thereafter. */
    867 #define SYS_spawn      ARCH_SYS_spawn
    868 
    869 #define ECHILD     10
    870 #define EAGAIN     11
    871 #define ENOEXEC     8
    872 
    873 static i64 sys_write(int fd, const void *buf, u64 len) {
    874     if (fd == 1 || fd == 2) {
    875         const u8 *s = buf;
    876         for (u64 i = 0; i < len; i++) uart_putc((char)s[i]);
    877         return (i64)len;
    878     }
    879     if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used || !fdtab[fd].wflag) return -EBADF;
    880     struct file *f = &files[fdtab[fd].fidx];
    881     u64 pos = fdtab[fd].append ? f->len : fdtab[fd].pos;
    882     u64 need = pos + len;
    883     if (need > f->cap) {
    884         u64 ncap = f->cap ? f->cap : 64;
    885         while (ncap < need) ncap *= 2;
    886         u8 *nd = kalloc(ncap);
    887         if (f->len) mem_cpy(nd, f->data, f->len);
    888         f->data = nd;
    889         f->cap = ncap;
    890     }
    891     mem_cpy(f->data + pos, buf, len);
    892     if (need > f->len) f->len = need;
    893     fdtab[fd].pos = pos + len;
    894     return (i64)len;
    895 }
    896 
    897 static i64 sys_read(int fd, void *buf, u64 len) {
    898     if (fd == 0) return 0;
    899     if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF;
    900     struct file *f = &files[fdtab[fd].fidx];
    901     u64 pos = fdtab[fd].pos;
    902     if (pos >= f->len) return 0;
    903     u64 n = len;
    904     if (pos + n > f->len) n = f->len - pos;
    905     mem_cpy(buf, f->data + pos, n);
    906     fdtab[fd].pos = pos + n;
    907     return (i64)n;
    908 }
    909 
    910 static i64 sys_openat(int dirfd, const char *path, int flags, int mode) {
    911     (void)dirfd; (void)mode;
    912     int fidx = find_file(path);
    913     int wflag = (flags & 3) != 0;
    914     if (fidx < 0) {
    915         if (!(flags & O_CREAT)) return -ENOENT;
    916         fidx = new_file(path);
    917         if (fidx < 0) return -ENOSPC;
    918     } else if (flags & O_TRUNC) {
    919         files[fidx].len = 0;
    920     }
    921     int fd = -1;
    922     for (int i = 3; i < MAX_FD; i++) {
    923         if (!fdtab[i].used) { fd = i; break; }
    924     }
    925     if (fd < 0) return -EMFILE;
    926     fdtab[fd].used = 1;
    927     fdtab[fd].fidx = fidx;
    928     fdtab[fd].pos = 0;
    929     fdtab[fd].wflag = wflag;
    930     fdtab[fd].append = (flags & O_APPEND) ? 1 : 0;
    931     return fd;
    932 }
    933 
    934 static i64 sys_close(int fd) {
    935     if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF;
    936     fdtab[fd].used = 0;
    937     return 0;
    938 }
    939 
    940 static i64 sys_lseek(int fd, i64 off, int whence) {
    941     if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF;
    942     struct file *f = &files[fdtab[fd].fidx];
    943     i64 base;
    944     if (whence == 0) base = 0;
    945     else if (whence == 1) base = (i64)fdtab[fd].pos;
    946     else if (whence == 2) base = (i64)f->len;
    947     else return -EINVAL;
    948     i64 np = base + off;
    949     if (np < 0) return -EINVAL;
    950     fdtab[fd].pos = (u64)np;
    951     return np;
    952 }
    953 
    954 static i64 sys_brk(u64 addr) {
    955     if (addr == 0) return (i64)brk_cur;
    956     if (addr < brk_base || addr > brk_max) return (i64)brk_cur;
    957     brk_cur = addr;
    958     return (i64)brk_cur;
    959 }
    960 
    961 static i64 sys_unlinkat(int dirfd, const char *path, int flags) {
    962     /* dirfd is ignored: tmpfs is a flat path map, all callers pass
    963      * AT_FDCWD. flags (AT_REMOVEDIR) is ignored too — there are no
    964      * directories in our tmpfs, so the dir-vs-file distinction the
    965      * flag selects has no observable effect. */
    966     (void)dirfd; (void)flags;
    967     int fidx = find_file(path);
    968     if (fidx < 0) return -ENOENT;
    969     files[fidx].used = 0;
    970     return 0;
    971 }
    972 
    973 /* ─── Tier 2: atomic spawn (spawn / waitid / exit_group) ────────────────── */
    974 /*
    975  * The boot2 chain's process-creation shape (scheme1/prelude.scm `(spawn …)`)
    976  * is rigidly synchronous: parent creates a child to run a single program,
    977  * waits for it, reads the exit code. Nothing else runs in the child between
    978  * creation and the new program's entry, and nothing else runs in the
    979  * parent between creation and wait.
    980  *
    981  * We implement that as a single atomic syscall on a single-threaded kernel:
    982  *
    983  *   sys_spawn   → capture path+argv into kernel buffers (still reading from
    984  *                 the parent's pool); push parent state (regs, brk, fd
    985  *                 table, current pool) onto proc_stack; remap user VAs to
    986  *                 the alternate pool with NO COPY (the child won't read any
    987  *                 byte of the parent's pool — load_elf overwrites just the
    988  *                 PT_LOAD ranges and build_user_stack writes the top of the
    989  *                 user VA window); load_elf into the alternate pool, reset
    990  *                 brk, build the user stack, rewrite tf so trap return
    991  *                 enters the new program at its entry with the new stack.
    992  *   sys_exit    → if proc_stack non-empty: stash exit code in last_child,
    993  *                 swap user VAs back to the parent's pool (no copy — the
    994  *                 parent's pool was never written by the child), restore
    995  *                 regs/brk/fds, cache-sync (the user VAs now resolve to
    996  *                 different physical pages), set tf so trap return resumes
    997  *                 the parent's spawn() call with child pid. If proc_stack
    998  *                 empty: real exit (dump tmpfs, arch shutdown).
    999  *   sys_waitid  → return last_child's exit code via the siginfo struct.
   1000  *
   1001  * No actual concurrency. The "parent" is suspended at the moment of spawn
   1002  * and resumed only when the child calls exit_group.
   1003  *
   1004  * Memory cost per spawn: zero copy. User-map rewrite + TLB/cache sync
   1005  * + load_elf (which copies just the new image's PT_LOAD bytes, typically
   1006  * ~1 MB for tcc). This replaces the previous clone/execve design which
   1007  * paid one 768 MB mem_cpy per fork to seed the child's pool with parent
   1008  * state — needed only because scheme1 ran a few interpreter-bytecode
   1009  * cells of user code between clone and execve, which would otherwise
   1010  * mutate the parent's BSS heap-allocator globals (heap_next,
   1011  * current_heap_next_ptr, scratch_next). Folding clone+execve into one
   1012  * syscall closes that window entirely.
   1013  */
   1014 
   1015 /* Forward decls for state defined further down. boot5's per-source
   1016  * compile passes ~25 argv entries / ~750 bytes, but the final
   1017  * `tcc -ar rcs libc.a obj1 … obj1263` call passes ~1300 entries totalling
   1018  * ~65 KB of strings. MAX_ARGV / spawn_argv_pool size for that worst
   1019  * case; both are kept generous so a future taller chain doesn't hit a
   1020  * silent-truncation cliff. */
   1021 #define MAX_ARGV 2048
   1022 static u64 build_user_stack(u64 stack_top, int argc, char **argv);
   1023 static int tokenise(char *src, char **argv, int cap);
   1024 
   1025 #define MAX_PROC_DEPTH 1
   1026 
   1027 struct proc_save {
   1028     int active;
   1029     u64 child_pid;
   1030     /* Saved trap-frame state — enough to resume the parent at the trap
   1031      * instruction following its sys_spawn. The return register is
   1032      * overwritten with child_pid at restore time. */
   1033     struct trapframe tf;
   1034     u64 user_sp;
   1035     /* Per-process state at the moment of spawn. brk_base is saved alongside
   1036      * brk_cur because sys_spawn resets it above the new image's end-of-bss;
   1037      * the parent's value comes back with the parent's pool. */
   1038     u64 brk_base_save;
   1039     u64 brk_cur_save;
   1040     struct fdent fdtab_save[MAX_FD];
   1041     int pool_save;     /* parent's user pool (0=A, 1=B) */
   1042 };
   1043 
   1044 /* Rewrite the user-VA mapping to point at pool `which`, then flush TLB. */
   1045 static void swap_user_pool(int which) {
   1046     arch_swap_user_pool(which);
   1047     current_pool = which;
   1048 }
   1049 
   1050 static struct proc_save proc_stack[MAX_PROC_DEPTH];
   1051 static int proc_depth = 0;
   1052 static u64 g_next_pid = 2;
   1053 
   1054 /* The most recently exited child, for sys_waitid to consume. */
   1055 static int last_child_valid = 0;
   1056 static u64 last_child_pid = 0;
   1057 static int last_child_code = 0;
   1058 
   1059 /* sys_spawn captures path+argv from the parent's pool into kernel buffers
   1060  * BEFORE swapping pools — load_elf will only ever read from the cpio-
   1061  * staged file (kernel state) and write to the alternate pool, but the
   1062  * argv strings the caller passed live in the parent's pool, which we're
   1063  * about to stop mapping. The pool + argv pointer table sit in BSS
   1064  * (rather than on the kernel stack) because MAX_ARGV * 8 = 16 KB is
   1065  * too large to put on the syscall stack. */
   1066 static char  spawn_argv_pool[131072];     /* 128 KB; boot5 ar peaks ~65 KB */
   1067 static char *spawn_argv_ptrs[MAX_ARGV];
   1068 static i64 sys_spawn(struct trapframe *tf, const char *path, char **argv) {
   1069     if (!path) return -EFAULT;
   1070     if (proc_depth >= MAX_PROC_DEPTH) return -EAGAIN;
   1071 
   1072     /* Copy path out of the parent's pool first (find_file uses kernel
   1073      * state, but the caller's `path` pointer is into user memory). */
   1074     char path_buf[128];
   1075     int pn = 0;
   1076     while (path[pn] && pn < 127) { path_buf[pn] = path[pn]; pn++; }
   1077     path_buf[pn] = 0;
   1078     int fidx = find_file(path_buf);
   1079     if (fidx < 0) return -ENOENT;
   1080 
   1081     /* Capture argv strings into spawn_argv_pool (kernel BSS, not the
   1082      * user pool — survives the pool swap below). Truncation here is
   1083      * silent but loud: we panic-warn on the UART so a too-low MAX_ARGV
   1084      * surfaces as a kernel message, not a downstream link failure. */
   1085     int argc = 0;
   1086     int pool_off = 0;
   1087     if (argv) {
   1088         while (argc < MAX_ARGV - 1 && argv[argc]) {
   1089             const char *s = argv[argc];
   1090             int n = 0;
   1091             while (s[n] && pool_off + n < (int)sizeof(spawn_argv_pool) - 1) n++;
   1092             for (int j = 0; j < n; j++) spawn_argv_pool[pool_off + j] = s[j];
   1093             spawn_argv_pool[pool_off + n] = 0;
   1094             spawn_argv_ptrs[argc] = &spawn_argv_pool[pool_off];
   1095             pool_off += n + 1;
   1096             argc++;
   1097         }
   1098         if (argv[argc]) {
   1099             uart_puts("[seed] WARN: sys_spawn argv truncated at MAX_ARGV="
   1100                       ); uart_putd(MAX_ARGV); uart_puts(" for path=");
   1101             uart_puts(path_buf); uart_puts("\n");
   1102         }
   1103     }
   1104     if (argc == 0) {
   1105         /* Synthesise argv[0] from the path so user code that reads argv[0]
   1106          * doesn't crash. */
   1107         int n = 0;
   1108         while (path_buf[n] && pool_off + n < (int)sizeof(spawn_argv_pool) - 1) n++;
   1109         for (int j = 0; j < n; j++) spawn_argv_pool[pool_off + j] = path_buf[j];
   1110         spawn_argv_pool[pool_off + n] = 0;
   1111         spawn_argv_ptrs[0] = &spawn_argv_pool[pool_off];
   1112         pool_off += n + 1;
   1113         argc = 1;
   1114     }
   1115 
   1116     /* Save parent state — regs, brk, fd table, which pool the parent ran
   1117      * in. After sys_exit_or_resume_parent restores from this frame, the
   1118      * parent's spawn() call returns with child_pid. */
   1119     struct proc_save *p = &proc_stack[proc_depth];
   1120     p->active = 1;
   1121     p->child_pid = g_next_pid++;
   1122     p->tf = *tf;
   1123     p->user_sp = arch_read_user_sp();
   1124     p->brk_base_save = brk_base;
   1125     p->brk_cur_save  = brk_cur;
   1126     for (int i = 0; i < MAX_FD; i++) p->fdtab_save[i] = fdtab[i];
   1127     p->pool_save = current_pool;
   1128 
   1129     /* Swap to the alternate pool. NO COPY: the child will only read
   1130      * memory that load_elf writes (its own PT_LOAD segments) and what
   1131      * build_user_stack writes (top of user VA). Stale bytes elsewhere in
   1132      * the alt pool are user-invisible — sbrk pages aren't zeroed but
   1133      * neither were they under the old execve path. */
   1134     int new_pool = current_pool ^ 1;
   1135     swap_user_pool(new_pool);
   1136     proc_depth++;
   1137 
   1138     /* Load new ELF into the (just-swapped) alt pool. files[fidx].data is
   1139      * in kernel heap, not the user pool, so this read is unaffected. */
   1140     u64 entry = load_elf(files[fidx].data);
   1141     if (!entry) {
   1142         /* Roll back: alt pool is in undefined state but parent pool is
   1143          * still pristine. Swap back and pop proc_stack. */
   1144         proc_depth--;
   1145         swap_user_pool(p->pool_save);
   1146         return -ENOEXEC;
   1147     }
   1148 
   1149     /* Reset brk above the new image's end-of-bss, page-aligned up.
   1150      * Some seed binaries (e.g. riscv64 hex2) embed PC-relative scratch
   1151      * buffers in the bytes just past their loaded image and assume brk
   1152      * lives a full page beyond — Linux rounds brk to PAGE_SIZE. If we
   1153      * placed the heap immediately after the image (16-byte aligned), a
   1154      * write through the in-binary scratch overlaps the first heap node
   1155      * and silently corrupts the user's data structures. */
   1156     brk_base = g_user_image_end ? g_user_image_end : USER_VA_LO;
   1157     brk_base = (brk_base + 0xfffUL) & ~0xfffUL;
   1158     brk_cur  = brk_base;
   1159 
   1160     /* Build new user stack at top of user VA window. */
   1161     u64 new_sp = build_user_stack(USER_VA_HI, argc, spawn_argv_ptrs);
   1162 
   1163     /* Rewrite trap frame so eret enters the child at the new image's
   1164      * entry with a clean register state and the new stack. The parent's
   1165      * regs sit on proc_stack until sys_exit_or_resume_parent restores
   1166      * them on child exit. */
   1167     arch_clear_to_user_entry(tf, entry);
   1168     /* Some backends keep the user stack pointer outside the saved
   1169      * trapframe, so set it through the arch hook. */
   1170     arch_write_user_sp(new_sp);
   1171     /* Returning 0; dispatcher writes the arch return register. The child's
   1172      * _start reads argc/argv from the stack, so the return register is
   1173      * don't-care. */
   1174     return 0;
   1175 }
   1176 
   1177 static i64 sys_waitid(struct trapframe *tf, int idtype, u64 id,
   1178                       void *info, int options) {
   1179     (void)tf; (void)idtype; (void)id; (void)options;
   1180     if (!last_child_valid) return -ECHILD;
   1181     /* scheme1/prelude.scm:497-506 reads info[8]=si_code (CLD_EXITED=1) and
   1182      * info[24]=si_status. siginfo_t is sparsely written — zero the rest so
   1183      * the prelude's view is deterministic. */
   1184     if (info) {
   1185         u8 *p = info;
   1186         for (int i = 0; i < 128; i++) p[i] = 0;
   1187         u32 *si_code   = (u32 *)(p + 8);
   1188         u32 *si_status = (u32 *)(p + 24);
   1189         *si_code   = 1;                       /* CLD_EXITED */
   1190         *si_status = (u32)last_child_code;
   1191     }
   1192     last_child_valid = 0;
   1193     return 0;
   1194 }
   1195 
   1196 static int g_exit_code = 0;
   1197 static int g_exited = 0;
   1198 
   1199 /* On-disk dump format on blk1 (SEEDFS, sector-aligned, little-endian):
   1200  *
   1201  *   sector 0:        struct seedfs_hdr {
   1202  *                       char magic[8]    = "SEEDFS\0\0";
   1203  *                       u32  nfiles;
   1204  *                       u32  reserved;
   1205  *                    };  (16 B; rest of sector zero-padded)
   1206  *   sector 1..T:     nfiles directory entries, 4 entries/sector:
   1207  *                    struct seedfs_ent {
   1208  *                       char path[96];
   1209  *                       u32  data_offset_sectors;
   1210  *                       u32  _pad;
   1211  *                       u64  size_bytes;
   1212  *                    };  (112 B; T = ceil(nfiles/4))
   1213  *   sector T+1..:    file data, each file padded up to a 512-byte boundary.
   1214  *
   1215  * The host-side extractor (extract-blk.sh) walks the table and writes
   1216  * each file out by data_offset_sectors / size_bytes.
   1217  *
   1218  * Runs unconditionally on user exit. If the user code never reached exit
   1219  * (kernel panic, hang, etc.) the host extractor sees no SEEDFS magic at
   1220  * sector 0 and reports the missing-exit failure mode. */
   1221 
   1222 #define SEEDFS_ENT_SZ 112
   1223 
   1224 struct seedfs_hdr {
   1225     char magic[8];
   1226     u32  nfiles;
   1227     u32  reserved;
   1228 };
   1229 
   1230 struct seedfs_ent {
   1231     char path[96];
   1232     u32  data_offset_sectors;
   1233     u32  _pad;
   1234     u64  size_bytes;
   1235 };
   1236 
   1237 /* Scratch sector for trailing-byte padding of files whose size isn't a
   1238  * multiple of 512. Single 512-byte buffer is enough — we serialise file
   1239  * writes and rezero before each use. */
   1240 __attribute__((aligned(16))) static u8 dump_tail_sector[512];
   1241 
   1242 static void dump_tmpfs_blk(void) {
   1243     /* Count active files. */
   1244     u32 nfiles = 0;
   1245     for (int i = 0; i < MAX_FILES; i++) if (files[i].used) nfiles++;
   1246 
   1247     u32 table_sectors = (nfiles + 3) / 4;
   1248     u32 hdr_sectors   = 1 + table_sectors;
   1249     u64 hdr_bytes     = (u64)hdr_sectors * 512;
   1250 
   1251     u8 *hdr_buf = kalloc(hdr_bytes);
   1252     for (u64 i = 0; i < hdr_bytes; i++) hdr_buf[i] = 0;
   1253 
   1254     struct seedfs_hdr *hdr = (struct seedfs_hdr *)hdr_buf;
   1255     hdr->magic[0]='S'; hdr->magic[1]='E'; hdr->magic[2]='E';
   1256     hdr->magic[3]='D'; hdr->magic[4]='F'; hdr->magic[5]='S';
   1257     hdr->magic[6]=0;   hdr->magic[7]=0;
   1258     hdr->nfiles   = nfiles;
   1259     hdr->reserved = 0;
   1260 
   1261     /* Walk files: fill table entries, write data sectors, advance cursor. */
   1262     u32 ent_idx = 0;
   1263     u64 cursor  = (u64)hdr_sectors;
   1264     /* Output device capacity guard — we don't grow blk1, the host pre-
   1265      * sized it (256 MB by default). Refuse the dump if it would exceed. */
   1266     u64 out_cap = blk_devs[g_blk_output].capacity_sectors;
   1267 
   1268     for (int i = 0; i < MAX_FILES; i++) {
   1269         if (!files[i].used) continue;
   1270         struct seedfs_ent *e = (struct seedfs_ent *)(hdr_buf + 512 +
   1271                                   (u64)ent_idx * SEEDFS_ENT_SZ);
   1272         int j = 0;
   1273         while (files[i].path[j] && j < (int)sizeof(e->path) - 1) {
   1274             e->path[j] = files[i].path[j]; j++;
   1275         }
   1276         e->path[j] = 0;
   1277         e->data_offset_sectors = (u32)cursor;
   1278         e->size_bytes          = files[i].len;
   1279 
   1280         u64 nsec_full = files[i].len / 512;
   1281         u64 rem       = files[i].len - nsec_full * 512;
   1282         u64 need      = nsec_full + (rem ? 1 : 0);
   1283         if (cursor + need > out_cap) {
   1284             uart_puts("[seed] dump: out.img too small for tmpfs\n");
   1285             return;
   1286         }
   1287         if (nsec_full)
   1288             blk_write(g_blk_output, cursor, files[i].data, nsec_full);
   1289         cursor += nsec_full;
   1290         if (rem) {
   1291             for (int k = 0; k < 512; k++) dump_tail_sector[k] = 0;
   1292             for (u64 k = 0; k < rem; k++)
   1293                 dump_tail_sector[k] = files[i].data[nsec_full * 512 + k];
   1294             blk_write(g_blk_output, cursor, dump_tail_sector, 1);
   1295             cursor++;
   1296         }
   1297         ent_idx++;
   1298     }
   1299 
   1300     blk_write(g_blk_output, 0, hdr_buf, hdr_sectors);
   1301     uart_puts("[seed] dump: nfiles="); uart_putd((i64)nfiles);
   1302     uart_puts(" cursor=");              uart_putd((i64)cursor);
   1303     uart_puts(" sectors\n");
   1304 }
   1305 
   1306 static void sys_exit_final(int code) {
   1307     g_exit_code = code;
   1308     g_exited = 1;
   1309     dump_tmpfs_blk();
   1310     uart_puts("\n[seed] user exit_group("); uart_putd(code); uart_puts(")\n");
   1311     arch_system_off();
   1312     arch_idle_forever();
   1313 }
   1314 
   1315 /* Dispatcher-side exit_group: pops proc_stack and resumes the parent's
   1316  * sys_spawn if there's a saved frame, otherwise falls through to the
   1317  * real shutdown path. Returns 1 if the trap frame was rewritten (resume
   1318  * parent), 0 if the caller should treat it as a normal trap-return path
   1319  * (which will never happen, since sys_exit_final does not return). */
   1320 static int sys_exit_or_resume_parent(struct trapframe *tf, int code) {
   1321     code &= 0xff;
   1322     if (proc_depth > 0) {
   1323         struct proc_save *p = &proc_stack[--proc_depth];
   1324         last_child_pid   = p->child_pid;
   1325         last_child_code  = code;
   1326         last_child_valid = 1;
   1327         /* Swap the user-VA mapping back to the parent's pool. The parent's
   1328          * physical pool was never overwritten — only the child's pool was
   1329          * — so no mem_cpy is needed. */
   1330         if (current_pool != p->pool_save) swap_user_pool(p->pool_save);
   1331         brk_base = p->brk_base_save;
   1332         brk_cur  = p->brk_cur_save;
   1333         for (int i = 0; i < MAX_FD; i++) fdtab[i] = p->fdtab_save[i];
   1334         /* Restore registers; the dispatcher writes the child pid into the
   1335          * arch return register below. */
   1336         *tf = p->tf;
   1337         arch_write_user_sp(p->user_sp);
   1338         /* I-cache invalidation. The parent's pool was never written, so
   1339          * its instruction bytes (in DRAM) are byte-identical to what was
   1340          * originally fetched. But the same user VAs were just used to
   1341          * fetch the child's instructions from the other physical pool;
   1342          * I-caches may hold lines tagged by VA whose translation just
   1343          * changed, so the arch backend invalidates whatever is needed. */
   1344         arch_icache_context_sync();
   1345         return (int)p->child_pid;     /* >0: tells dispatcher to write this as r */
   1346     }
   1347     sys_exit_final(code);
   1348     return 0;                        /* unreachable */
   1349 }
   1350 
   1351 /* ─── Trap dispatch (called from start.S vector handlers) ───────────────── */
   1352 
   1353 i64 trap_sync(u64 esr, struct trapframe *tf);
   1354 void trap_kernel(u64 esr, struct trapframe *tf);
   1355 void trap_unhandled(u64 esr, struct trapframe *tf);
   1356 
   1357 i64 trap_sync(u64 esr, struct trapframe *tf) {
   1358     if (ARCH_IS_SYSCALL(esr)) {
   1359         u64 nr = ARCH_SYSCALL_NR(tf);
   1360         u64 a0 = ARCH_SYSCALL_ARG(tf, 0), a1 = ARCH_SYSCALL_ARG(tf, 1);
   1361         u64 a2 = ARCH_SYSCALL_ARG(tf, 2), a3 = ARCH_SYSCALL_ARG(tf, 3);
   1362         u64 a4 = ARCH_SYSCALL_ARG(tf, 4), a5 = ARCH_SYSCALL_ARG(tf, 5);
   1363         i64 r;
   1364         switch (nr) {
   1365         case SYS_read:       r = sys_read((int)a0, (void *)a1, a2); break;
   1366         case SYS_write:      r = sys_write((int)a0, (const void *)a1, a2); break;
   1367         case SYS_openat:     r = sys_openat((int)a0, (const char *)a1, (int)a2, (int)a3); break;
   1368 #ifdef SYS_open
   1369         /* amd64 hex0/hex1/hex2/M0 seed binaries call legacy `open(path,
   1370          * flags, mode)` directly; alias to openat(AT_FDCWD, ...). */
   1371         case SYS_open:       r = sys_openat(AT_FDCWD, (const char *)a0, (int)a1, (int)a2); break;
   1372 #endif
   1373         case SYS_close:      r = sys_close((int)a0); break;
   1374         case SYS_lseek:      r = sys_lseek((int)a0, (i64)a1, (int)a2); break;
   1375         case SYS_brk:        r = sys_brk(a0); break;
   1376         case SYS_unlinkat:   r = sys_unlinkat((int)a0, (const char *)a1, (int)a2); break;
   1377         case SYS_spawn:      r = sys_spawn(tf, (const char *)a0, (char **)a1); break;
   1378         case SYS_waitid:     r = sys_waitid(tf, (int)a0, a1, (void *)a2, (int)a3); break;
   1379         case SYS_exit_group:
   1380             r = sys_exit_or_resume_parent(tf, (int)a0);
   1381             /* If we resumed the parent, sys_exit_or_resume_parent has
   1382              * rewritten the trapframe; set only the arch return register. */
   1383             if (proc_depth >= 0 && r != 0) {
   1384                 ARCH_SET_RET(tf, r);
   1385                 return 0;
   1386             }
   1387             break;
   1388         default:
   1389             uart_puts("[seed] ENOSYS "); uart_putd((i64)nr); uart_puts("\n");
   1390             r = -38; /* ENOSYS */
   1391         }
   1392         ARCH_SET_RET(tf, r);
   1393         (void)a4; (void)a5;
   1394         return 0;
   1395     }
   1396     uart_puts("[seed] PANIC: user sync, ESR="); uart_putx(esr);
   1397     uart_puts(" ELR=");                 uart_putx(ARCH_TF_PC(tf));
   1398     uart_puts(" FAR=");
   1399     u64 far = arch_fault_addr(); uart_putx(far);
   1400     uart_puts("\n");
   1401     hang();
   1402 }
   1403 
   1404 void trap_kernel(u64 esr, struct trapframe *tf) {
   1405     u64 far = arch_fault_addr();
   1406     uart_puts("[seed] PANIC: kernel sync, ESR="); uart_putx(esr);
   1407     uart_puts(" ELR=");                          uart_putx(ARCH_TF_PC(tf));
   1408     uart_puts(" FAR=");                          uart_putx(far);
   1409     uart_puts("\n");
   1410     hang();
   1411 }
   1412 
   1413 void trap_unhandled(u64 esr, struct trapframe *tf) {
   1414     uart_puts("[seed] PANIC: unhandled exception, ESR="); uart_putx(esr);
   1415     uart_puts(" ELR=");                                   uart_putx(ARCH_TF_PC(tf));
   1416     uart_puts("\n");
   1417     hang();
   1418 }
   1419 
   1420 /* ─── User stack setup + entry ──────────────────────────────────────────── */
   1421 
   1422 /* Tokenise `src` in place (whitespace separators) into argv slots.
   1423  * Writes pointers into argv[0..argc-1] and returns argc. Stops at cap. */
   1424 static int tokenise(char *src, char **argv, int cap) {
   1425     int argc = 0;
   1426     char *p = src;
   1427     while (*p && argc < cap) {
   1428         while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
   1429         if (!*p) break;
   1430         argv[argc++] = p;
   1431         while (*p && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r') p++;
   1432         if (*p) *p++ = 0;
   1433     }
   1434     return argc;
   1435 }
   1436 
   1437 /* Out-of-stack scratch for the per-call user-VA pointer table. With
   1438  * MAX_ARGV=2048, sizeof(strs)=16 KB — too large to put on the syscall
   1439  * stack. */
   1440 static u64 build_user_stack_strs[MAX_ARGV];
   1441 
   1442 static u64 build_user_stack(u64 stack_top, int argc, char **argv) {
   1443     /* SysV layout, low to high at the returned sp:
   1444      *   argc, argv[0..argc-1], NULL (argv term), NULL (envp term).
   1445      * Strings live above the vectors, in a string pool placed just below
   1446      * stack_top so the user image's high-water mark is stable. */
   1447     if (argc < 1) argc = 1;
   1448     if (argc > MAX_ARGV) argc = MAX_ARGV;
   1449 
   1450     /* Lay strings down from stack_top - 16 (16-byte alignment slack). */
   1451     u64 strs_top = stack_top - 16;
   1452     u64 *strs = build_user_stack_strs;
   1453     char *cursor = (char *)strs_top;
   1454     for (int i = argc - 1; i >= 0; i--) {
   1455         int n = str_n(argv[i]) + 1;
   1456         cursor -= n;
   1457         for (int j = 0; j < n; j++) cursor[j] = argv[i][j];
   1458         strs[i] = (u64)cursor;
   1459     }
   1460 
   1461     /* sp must hold: argc + (argc+1)*8 (argv + NULL) + 8 (envp NULL) */
   1462     u64 sp = (u64)cursor - (u64)((argc + 3) * 8);
   1463     sp &= ~15UL;
   1464     u64 *p = (u64 *)sp;
   1465     p[0] = (u64)argc;
   1466     for (int i = 0; i < argc; i++) p[1 + i] = strs[i];
   1467     p[1 + argc] = 0;                /* argv terminator */
   1468     p[2 + argc] = 0;                /* envp terminator */
   1469     return sp;
   1470 }
   1471 
   1472 /* ─── kmain ─────────────────────────────────────────────────────────────── */
   1473 
   1474 void kmain(u64 dtb_phys) {
   1475     arch_setup_mmu();
   1476 
   1477     /* Bring up heap immediately — placed at a 16MB-aligned offset above
   1478      * our image, well clear of BSS/stack. Without -initrd reserving the
   1479      * 0x44000000–0x4b000000 region, the full 176 MB is ours from boot. */
   1480     u64 image_end = (u64)_end;
   1481     kheap_ptr = (u8 *)((image_end + 0xfffful) & ~0xfffful);
   1482     kheap_end = (u8 *)ARCH_KERNEL_HEAP_END;
   1483 
   1484     uart_puts("\n[seed] "); uart_puts(ARCH_NAME); uart_puts(" boot, dtb=");
   1485     uart_putx(dtb_phys); uart_puts("\n");
   1486 
   1487     struct dtb_info dt = {0};
   1488     parse_dtb((const void *)dtb_phys, &dt);
   1489     uart_puts("[seed] mem ");      uart_putx(dt.mem_start);
   1490     uart_puts(" + ");              uart_putx(dt.mem_size); uart_puts("\n");
   1491     uart_puts("[seed] virtio-mmio slots="); uart_putd((i64)dt.virtio_mmio_n);
   1492     uart_puts("\n");
   1493     if (dt.bootargs[0]) { uart_puts("[seed] bootargs: "); uart_puts(dt.bootargs); uart_puts("\n"); }
   1494 
   1495     /* Bring up virtio-blk: identifies blk0 (cpio) and blk1 (output). */
   1496     blk_init(&dt);
   1497 
   1498     /* Reserve the cpio buffer at the top of kheap so we can release it
   1499      * after parse_cpio. parse_cpio kallocs each file's data below cpio_buf;
   1500      * once it returns, the cpio buffer's bytes are dead and we can let
   1501      * subsequent kallocs use that space. */
   1502     u64 in_cap_sec   = blk_devs[g_blk_input].capacity_sectors;
   1503     u64 in_cap_bytes = in_cap_sec * 512;
   1504     u64 in_cap_aln   = (in_cap_bytes + 0xfffUL) & ~0xfffUL;
   1505     u8 *cpio_buf     = (u8 *)((u64)kheap_end - in_cap_aln);
   1506     u8 *kheap_end_full = kheap_end;
   1507     kheap_end = cpio_buf;
   1508 
   1509     if (blk_read(g_blk_input, 0, cpio_buf, in_cap_sec) < 0) {
   1510         uart_puts("[seed] cpio read failed\n");
   1511         hang();
   1512     }
   1513     parse_cpio(cpio_buf, in_cap_bytes);
   1514     uart_puts("[seed] tmpfs:\n");
   1515     for (int i = 0; i < MAX_FILES; i++) {
   1516         if (!files[i].used) continue;
   1517         uart_puts("  /"); uart_puts(files[i].path);
   1518         uart_puts(" ("); uart_putd((i64)files[i].len); uart_puts(" bytes)\n");
   1519     }
   1520 
   1521     int init_idx = find_file("init");
   1522     if (init_idx < 0) { uart_puts("[seed] no /init in initrd, halting\n"); hang(); }
   1523 
   1524     u64 entry = load_elf(files[init_idx].data);
   1525     if (!entry) { uart_puts("[seed] load_elf failed\n"); hang(); }
   1526     uart_puts("[seed] /init e_entry="); uart_putx(entry); uart_puts("\n");
   1527 
   1528     /* parse_cpio + load_elf are done — cpio buffer's bytes are dead.
   1529      * Release the reserved tail of kheap for tmpfs file growth. */
   1530     kheap_end = kheap_end_full;
   1531 
   1532     /* User runs in the L2-mapped low-VA window (USER_VA_LO..USER_VA_HI,
   1533      * physically backed by pool A initially). Stack grows down from the top
   1534      * of the window; brk grows up from above the loaded image's
   1535      * end-of-bss (g_user_image_end, set by load_elf). 16 MB reserved at
   1536      * the top for the user stack. */
   1537     u64 ustack_top = USER_VA_HI;
   1538     /* See sys_spawn for why brk_base is page-rounded above end-of-image. */
   1539     brk_base = g_user_image_end ? g_user_image_end : USER_VA_LO;
   1540     brk_base = (brk_base + 0xfffUL) & ~0xfffUL;
   1541     brk_cur  = brk_base;
   1542     brk_max  = USER_VA_HI - 0x01000000UL;
   1543 
   1544     /* Build argv. Priority:
   1545      *   1. DTB /chosen/bootargs (whitespace-tokenised — qemu -append "...").
   1546      *   2. /init.argv from the initramfs (one arg per line).
   1547      *   3. Fallback: argc=1, argv[0]="init".
   1548      * In all three cases, argv passed to user is exactly what the source
   1549      * provided — no implicit argv[0]="init" prefix. */
   1550     static char argv_pool[512];
   1551     char *uargv[MAX_ARGV];
   1552     int uargc = 0;
   1553 
   1554     if (dt.bootargs[0]) {
   1555         int n = 0;
   1556         while (dt.bootargs[n] && n < (int)sizeof(argv_pool) - 1) {
   1557             argv_pool[n] = dt.bootargs[n]; n++;
   1558         }
   1559         argv_pool[n] = 0;
   1560         uargc = tokenise(argv_pool, uargv, MAX_ARGV);
   1561     }
   1562     if (uargc == 0) {
   1563         int aidx = find_file("init.argv");
   1564         if (aidx >= 0) {
   1565             u64 n = files[aidx].len;
   1566             if (n >= sizeof(argv_pool)) n = sizeof(argv_pool) - 1;
   1567             for (u64 i = 0; i < n; i++) argv_pool[i] = (char)files[aidx].data[i];
   1568             argv_pool[n] = 0;
   1569             uargc = tokenise(argv_pool, uargv, MAX_ARGV);
   1570         }
   1571     }
   1572     if (uargc == 0) {
   1573         argv_pool[0] = 'i'; argv_pool[1] = 'n'; argv_pool[2] = 'i';
   1574         argv_pool[3] = 't'; argv_pool[4] = 0;
   1575         uargv[0] = argv_pool;
   1576         uargc = 1;
   1577     }
   1578     uart_puts("[seed] argv:");
   1579     for (int i = 0; i < uargc; i++) { uart_puts(" "); uart_puts(uargv[i]); }
   1580     uart_puts("\n");
   1581 
   1582     u64 user_sp = build_user_stack(ustack_top, uargc, uargv);
   1583 
   1584     uart_puts("[seed] eret to user, sp="); uart_putx(user_sp); uart_puts("\n");
   1585     eret_to_user(entry, user_sp);
   1586     /* unreachable */
   1587 }