kernel.c (61992B)
1 /* seed kernel — minimal OS satisfying docs/OS.md Tier 1. 2 * 3 * Boots through an arch backend with two virtio-blk-MMIO disks, parses 4 * the DTB to find virtio_mmio nodes + memory, brings up 5 * a small polling virtio-blk driver, reads the cpio newc archive from 6 * blk0 (read-only) into the in-memory tmpfs, loads /init (a static 7 * target ELF), and enters it through the arch trap-return path. Syscall 8 * traps land in trap_sync() and dispatch Tier-1/Tier-2 syscalls. On exit, 9 * the tmpfs is 10 * serialised to blk1 in a small SEEDFS table for the host extractor. 11 */ 12 13 typedef unsigned char u8; 14 typedef unsigned short u16; 15 typedef unsigned int u32; 16 typedef unsigned long u64; 17 typedef long i64; 18 typedef int i32; 19 20 #include "arch.h" 21 22 /* ─── Console ───────────────────────────────────────────────────────────── */ 23 24 static void uart_putc(char c) { 25 arch_console_putc(c); 26 } 27 28 static void uart_puts(const char *s) { 29 while (*s) { 30 if (*s == '\n') uart_putc('\r'); 31 uart_putc(*s++); 32 } 33 } 34 35 static void uart_putx(u64 v) { 36 static const char hex[] = "0123456789abcdef"; 37 uart_puts("0x"); 38 for (int i = 60; i >= 0; i -= 4) uart_putc(hex[(v >> i) & 0xf]); 39 } 40 41 static void uart_putd(i64 v) { 42 if (v < 0) { uart_putc('-'); v = -v; } 43 char buf[24]; 44 int i = 0; 45 if (v == 0) buf[i++] = '0'; 46 while (v) { buf[i++] = '0' + (v % 10); v /= 10; } 47 while (i--) uart_putc(buf[i]); 48 } 49 50 __attribute__((noreturn)) static void hang(void) { for (;;) arch_pause(); } 51 52 /* ─── Tiny libc-ish helpers ─────────────────────────────────────────────── */ 53 54 /* memcpy / memset / memmove come from tcc/cc/mem.c, linked alongside. 55 * Both gcc and tcc emit calls to these for struct copies and bulk 56 * zero-init past their inline thresholds; centralising them in 57 * tcc/cc/mem.c keeps the tcc-built and gcc-built kernels in sync. 
*/ 58 void *memcpy(void *dst, const void *src, u64 n); 59 void *memset(void *dst, int c, u64 n); 60 void *memmove(void *dst, const void *src, u64 n); 61 62 static int str_eq(const char *a, const char *b) { 63 while (*a && *a == *b) { a++; b++; } 64 return *a == 0 && *b == 0; 65 } 66 static int str_n(const char *s) { int n = 0; while (s[n]) n++; return n; } 67 static void mem_cpy(void *d, const void *s, u64 n) { 68 /* 8-byte fast path when both pointers are 8-aligned and n is a multiple 69 * of 8. Under TCG this is roughly 8× faster than the byte loop. */ 70 u8 *dd = d; const u8 *ss = s; 71 if ((((u64)dd | (u64)ss | n) & 7) == 0) { 72 u64 *dq = (u64 *)dd; 73 const u64 *sq = (const u64 *)ss; 74 u64 m = n >> 3; 75 for (u64 i = 0; i < m; i++) dq[i] = sq[i]; 76 return; 77 } 78 for (u64 i = 0; i < n; i++) dd[i] = ss[i]; 79 } 80 static void mem_set(void *d, int c, u64 n) { 81 u8 *dd = d; 82 for (u64 i = 0; i < n; i++) dd[i] = (u8)c; 83 } 84 85 /* User address-space constants are supplied by the arch backend. */ 86 #define USER_POOL_A_PA ARCH_USER_POOL_A_PA 87 #define USER_POOL_B_PA ARCH_USER_POOL_B_PA 88 #define USER_POOL_SIZE ARCH_USER_POOL_SIZE 89 #define USER_VA_LO ARCH_USER_VA_LO 90 #define USER_VA_HI ARCH_USER_VA_HI 91 #define USER_POOL_FIRST_SLOT ARCH_USER_POOL_FIRST_SLOT 92 #define USER_POOL_LAST_SLOT ARCH_USER_POOL_LAST_SLOT 93 94 /* 0 = pool A is currently mapped at user VAs; 1 = pool B. 
*/ 95 static int current_pool = 0; 96 97 /* ─── Kernel heap (bump allocator) ──────────────────────────────────────── */ 98 99 extern char _end[]; 100 static u8 *kheap_ptr; 101 static u8 *kheap_end; 102 103 static void *kalloc(u64 n) { 104 n = (n + 15) & ~15UL; 105 if (kheap_ptr + n > kheap_end) { 106 uart_puts("kalloc: out of memory\n"); 107 hang(); 108 } 109 void *r = kheap_ptr; 110 kheap_ptr += n; 111 return r; 112 } 113 114 /* ─── Big-endian readers (DTB is BE) ────────────────────────────────────── */ 115 116 static u32 be32(const u8 *p) { return (u32)p[0]<<24 | (u32)p[1]<<16 | (u32)p[2]<<8 | (u32)p[3]; } 117 static u64 be64(const u8 *p) { return ((u64)be32(p) << 32) | (u64)be32(p + 4); } 118 119 /* ─── Flattened Device Tree walker ──────────────────────────────────────── */ 120 121 /* DTB ("flattened device tree blob") header + token codes — devicetree 122 * spec §5.1 (header) + §5.4.1 (struct block tokens). */ 123 #define FDT_MAGIC 0xd00dfeedu 124 #define FDT_BEGIN_NODE 1 125 #define FDT_END_NODE 2 126 #define FDT_PROP 3 127 #define FDT_NOP 4 128 #define FDT_END 9 129 130 /* Xen PVH ABI handoff (see Xen's docs/misc/pvh.pandoc). On PVH boot QEMU 131 * points EBX at a `struct hvm_start_info`; the amd64 long-mode init in 132 * arch/amd64/kernel.S preserves EBX through to kmain's `dtb_phys` arg. 133 * First word is the magic; cmdline_paddr sits at offset 0x18. microvm 134 * has no DTB, so this is the only path for `qemu -append "..."`. */ 135 #define PVH_HVM_START_MAGIC 0x336ec578u 136 #define PVH_HVM_START_OFF_CMDLINE 0x18 137 138 /* cpio "newc" (SVR4 portable) format — first 6 bytes of every header 139 * record are the magic; header is fixed-size (110 bytes) followed by 140 * the NUL-terminated name and the file data, both 4-byte padded. The 141 * 13 numeric fields are 8-char ASCII hex starting at offset 6. Spec: 142 * cpio(5) under "New ASCII Format". 
*/ 143 #define CPIO_NEWC_MAGIC "070701" 144 #define CPIO_NEWC_HDR_SIZE 110 145 #define CPIO_NEWC_FIELD(p, n) ((p) + 6 + (n) * 8) 146 #define CPIO_MODE_TYPE_MASK 0xf000 /* st_mode file-type bits */ 147 #define CPIO_MODE_TYPE_DIR 0x4000 148 #define CPIO_MODE_TYPE_REG 0x8000 149 150 /* QEMU virt has 32 virtio-mmio slots (0x0a000000..0x0a004000, 0x200 each). 151 * Most are unpopulated and report MagicValue=0/DeviceID=0 — we capture all 152 * slots advertised by the DTB and the driver init filters at probe time. */ 153 #define MAX_VIRTIO_MMIO 32 154 155 struct dtb_info { 156 u64 mem_start; 157 u64 mem_size; 158 u64 virtio_mmio_pa[MAX_VIRTIO_MMIO]; 159 int virtio_mmio_n; 160 char bootargs[256]; 161 }; 162 163 /* str_starts: returns 1 iff `s` begins with `prefix`. */ 164 static int str_starts(const char *s, const char *prefix) { 165 while (*prefix) { if (*s++ != *prefix++) return 0; } 166 return 1; 167 } 168 169 static void parse_dtb(const void *dtb, struct dtb_info *out) { 170 #ifdef ARCH_STATIC_VIRTIO_MMIO_BASE 171 /* No DTB on this arch (amd64 microvm). mem + virtio-mmio come from 172 * arch.h compile-time constants; the kernel cmdline arrives via the 173 * PVH hvm_start_info struct (see PVH_HVM_START_* above). 
*/ 174 out->mem_start = ARCH_STATIC_MEM_START; 175 out->mem_size = ARCH_STATIC_MEM_SIZE; 176 out->virtio_mmio_n = ARCH_STATIC_VIRTIO_MMIO_COUNT; 177 for (int i = 0; i < out->virtio_mmio_n && i < MAX_VIRTIO_MMIO; i++) 178 out->virtio_mmio_pa[i] = ARCH_STATIC_VIRTIO_MMIO_BASE + 179 (u64)i * ARCH_STATIC_VIRTIO_MMIO_STRIDE; 180 if ((u64)dtb != 0) { 181 const u8 *p = dtb; 182 u32 magic = (u32)p[0] | ((u32)p[1] << 8) | ((u32)p[2] << 16) | ((u32)p[3] << 24); 183 if (magic == PVH_HVM_START_MAGIC) { 184 const u8 *cp = p + PVH_HVM_START_OFF_CMDLINE; 185 u64 cmdline_paddr = 0; 186 for (int i = 0; i < 8; i++) cmdline_paddr |= (u64)cp[i] << (i * 8); 187 if (cmdline_paddr) { 188 const char *s = (const char *)cmdline_paddr; 189 int i = 0; 190 while (s[i] && i < 255) { out->bootargs[i] = s[i]; i++; } 191 out->bootargs[i] = 0; 192 } 193 } 194 } 195 return; 196 #endif 197 const u8 *base = dtb; 198 if (be32(base) != FDT_MAGIC) { 199 uart_puts("DTB: bad magic\n"); return; 200 } 201 u32 off_struct = be32(base + 8); 202 u32 off_strings = be32(base + 12); 203 const u8 *strings = base + off_strings; 204 const u8 *p = base + off_struct; 205 206 char path[4][64] = {{0}}; 207 int depth = -1; 208 209 for (;;) { 210 u32 tok = be32(p); p += 4; 211 if (tok == FDT_BEGIN_NODE) { 212 depth++; 213 if (depth < 4) { 214 int i = 0; 215 while (p[i] && i < 63) { path[depth][i] = (char)p[i]; i++; } 216 path[depth][i] = 0; 217 } 218 while (*p) p++; 219 p++; 220 p = (const u8 *)(((u64)p + 3) & ~3UL); 221 } else if (tok == FDT_END_NODE) { 222 depth--; 223 } else if (tok == FDT_PROP) { 224 u32 len = be32(p); p += 4; 225 u32 nameoff = be32(p); p += 4; 226 const char *pn = (const char *)(strings + nameoff); 227 228 if (depth == 1 && str_eq(path[1], "chosen")) { 229 if (str_eq(pn, "bootargs")) { 230 u32 i = 0; 231 while (i < len && i < 255) { out->bootargs[i] = (char)p[i]; i++; } 232 out->bootargs[i] = 0; 233 } 234 } 235 if (depth == 1) { 236 /* memory node is named "memory@<addr>" */ 237 if (str_starts(path[1], 
"memory") && 238 str_eq(pn, "reg") && len >= 16 && out->mem_size == 0) { 239 out->mem_start = be64(p); 240 out->mem_size = be64(p + 8); 241 } 242 } 243 /* virtio-mmio nodes: capture each slot's PA. Root/soc 244 * #address-cells/#size-cells are both 2 on QEMU virt, so reg 245 * is 16 bytes (PA u64, size u64); we only need the PA. */ 246 if (depth >= 1 && str_starts(path[depth], "virtio_mmio@") && 247 str_eq(pn, "reg") && len >= 16 && 248 out->virtio_mmio_n < MAX_VIRTIO_MMIO) { 249 out->virtio_mmio_pa[out->virtio_mmio_n++] = be64(p); 250 } 251 p += len; 252 p = (const u8 *)(((u64)p + 3) & ~3UL); 253 } else if (tok == FDT_NOP) { 254 /* skip */ 255 } else if (tok == FDT_END) { 256 break; 257 } else { 258 uart_puts("DTB: bad token "); uart_putx(tok); uart_puts("\n"); 259 break; 260 } 261 } 262 } 263 264 /* ─── virtio-blk-MMIO driver (polling, single-VQ) ───────────────────────── */ 265 /* 266 * Two block devices: blk0 = read-only cpio input, blk1 = read-write output. 267 * Identification is content-based (sector 0 cpio newc magic "070701" ⇒ blk0) 268 * so we don't depend on -drive ordering on the qemu command line. 269 * 270 * The driver is intentionally small: 8-entry split virtqueue, one in-flight 271 * request at a time, polling used.idx. No interrupts (DAIF stays masked). 272 * 273 * MMIO transport regs are reached via DEVICE_ALIAS_BASE + PA. virtqueue 274 * memory comes from kernel BSS (identity-mapped Normal, VA == PA). 
275 */ 276 277 #define VIRTIO_MMIO_MAGIC 0x000 278 #define VIRTIO_MMIO_VERSION 0x004 279 #define VIRTIO_MMIO_DEVICE_ID 0x008 280 #define VIRTIO_MMIO_DEV_FEATURES 0x010 281 #define VIRTIO_MMIO_DEV_FEAT_SEL 0x014 282 #define VIRTIO_MMIO_DRV_FEATURES 0x020 283 #define VIRTIO_MMIO_DRV_FEAT_SEL 0x024 284 #define VIRTIO_MMIO_QUEUE_SEL 0x030 285 #define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034 286 #define VIRTIO_MMIO_QUEUE_NUM 0x038 287 #define VIRTIO_MMIO_QUEUE_READY 0x044 288 #define VIRTIO_MMIO_QUEUE_NOTIFY 0x050 289 #define VIRTIO_MMIO_INT_STATUS 0x060 290 #define VIRTIO_MMIO_INT_ACK 0x064 291 #define VIRTIO_MMIO_STATUS 0x070 292 #define VIRTIO_MMIO_QUEUE_DESC_LO 0x080 293 #define VIRTIO_MMIO_QUEUE_DESC_HI 0x084 294 #define VIRTIO_MMIO_QUEUE_DRIVER_LO 0x090 295 #define VIRTIO_MMIO_QUEUE_DRIVER_HI 0x094 296 #define VIRTIO_MMIO_QUEUE_DEVICE_LO 0x0a0 297 #define VIRTIO_MMIO_QUEUE_DEVICE_HI 0x0a4 298 #define VIRTIO_MMIO_CONFIG 0x100 299 300 #define VIRTIO_STATUS_ACKNOWLEDGE 1 301 #define VIRTIO_STATUS_DRIVER 2 302 #define VIRTIO_STATUS_DRIVER_OK 4 303 #define VIRTIO_STATUS_FEATURES_OK 8 304 #define VIRTIO_STATUS_FAILED 128 305 306 #define VIRTIO_F_VERSION_1_BIT 32 /* bit 32 in feature space */ 307 308 #define VIRTIO_BLK_T_IN 0 309 #define VIRTIO_BLK_T_OUT 1 310 311 #define VRING_DESC_F_NEXT 1 312 #define VRING_DESC_F_WRITE 2 313 314 #define VQ_SIZE 8 315 316 struct vring_desc { 317 u64 addr; 318 u32 len; 319 u16 flags; 320 u16 next; 321 }; 322 323 struct vring_avail { 324 u16 flags; 325 u16 idx; 326 u16 ring[VQ_SIZE]; 327 u16 used_event; 328 }; 329 330 struct vring_used_elem { 331 u32 id; 332 u32 len; 333 }; 334 335 struct vring_used { 336 u16 flags; 337 u16 idx; 338 struct vring_used_elem ring[VQ_SIZE]; 339 u16 avail_event; 340 }; 341 342 struct virtio_blk_req_hdr { 343 u32 type; 344 u32 reserved; 345 u64 sector; 346 }; 347 348 #define BLK_DEV_MAX 2 349 350 struct blk_dev { 351 volatile u8 *regs; /* alias VA pointing at the MMIO region */ 352 u64 capacity_sectors; 353 int 
present; 354 }; 355 356 static struct blk_dev blk_devs[BLK_DEV_MAX]; 357 static int blk_n_devs = 0; 358 359 /* One vring per device in BSS, 4 KB-aligned. Layout within the page: 360 * desc: offset 0 (128 B for VQ_SIZE=8) 361 * avail: offset 128 (24 B with VQ_SIZE=8 + used_event) 362 * used: offset 256 (72 B with VQ_SIZE=8 + avail_event) 363 * Plenty of slack inside one 4 KB page. */ 364 __attribute__((aligned(4096))) static u8 vq_pages[BLK_DEV_MAX][4096]; 365 366 #define VQ_DESC_OFF 0 367 #define VQ_AVAIL_OFF 128 368 #define VQ_USED_OFF 256 369 370 static struct vring_desc *vq_desc(int i) 371 { return (struct vring_desc *)(vq_pages[i] + VQ_DESC_OFF); } 372 static struct vring_avail *vq_avail(int i) 373 { return (struct vring_avail *)(vq_pages[i] + VQ_AVAIL_OFF); } 374 static struct vring_used *vq_used(int i) 375 { return (struct vring_used *)(vq_pages[i] + VQ_USED_OFF); } 376 377 /* MMIO accessors. Reg offsets are byte offsets per the spec. */ 378 static u32 mmio_r32(struct blk_dev *d, u32 off) { 379 return *(volatile u32 *)(d->regs + off); 380 } 381 static void mmio_w32(struct blk_dev *d, u32 off, u32 val) { 382 *(volatile u32 *)(d->regs + off) = val; 383 } 384 385 /* Initialise one device per spec §3.1.1 / §4.2 / §5.2. Returns 1 on 386 * success (device is virtio-blk and ready), 0 if slot is empty / not 387 * a block device, -1 on error. */ 388 static int blk_init_one(struct blk_dev *d) { 389 u32 magic = mmio_r32(d, VIRTIO_MMIO_MAGIC); 390 if (magic != 0x74726976) return 0; /* not a virtio slot */ 391 u32 devid = mmio_r32(d, VIRTIO_MMIO_DEVICE_ID); 392 if (devid == 0) return 0; /* unpopulated slot */ 393 if (devid != 2) return 0; /* not a block device */ 394 u32 ver = mmio_r32(d, VIRTIO_MMIO_VERSION); 395 if (ver != 2) { 396 /* QEMU virt defaults to legacy (version 1) virtio-mmio transports 397 * unless the host passes -global virtio-mmio.force-legacy=false. 398 * The harness scripts set that flag — reaching here means it was 399 * forgotten. 
*/ 400 uart_puts("[seed] virtio-mmio version != 2 (legacy): "); 401 uart_putd((i64)ver); 402 uart_puts(" — pass -global virtio-mmio.force-legacy=false\n"); 403 return -1; 404 } 405 406 /* Reset, ack, driver. */ 407 mmio_w32(d, VIRTIO_MMIO_STATUS, 0); 408 mmio_w32(d, VIRTIO_MMIO_STATUS, VIRTIO_STATUS_ACKNOWLEDGE); 409 mmio_w32(d, VIRTIO_MMIO_STATUS, 410 VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER); 411 412 /* Negotiate VIRTIO_F_VERSION_1 only (bit 32 → feature word 1). */ 413 mmio_w32(d, VIRTIO_MMIO_DEV_FEAT_SEL, 1); 414 u32 dev_feat_hi = mmio_r32(d, VIRTIO_MMIO_DEV_FEATURES); 415 if (!(dev_feat_hi & (1u << (VIRTIO_F_VERSION_1_BIT - 32)))) { 416 uart_puts("[seed] virtio: device lacks VERSION_1\n"); 417 return -1; 418 } 419 mmio_w32(d, VIRTIO_MMIO_DRV_FEAT_SEL, 0); 420 mmio_w32(d, VIRTIO_MMIO_DRV_FEATURES, 0); 421 mmio_w32(d, VIRTIO_MMIO_DRV_FEAT_SEL, 1); 422 mmio_w32(d, VIRTIO_MMIO_DRV_FEATURES, 1u << (VIRTIO_F_VERSION_1_BIT - 32)); 423 424 mmio_w32(d, VIRTIO_MMIO_STATUS, 425 VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER | 426 VIRTIO_STATUS_FEATURES_OK); 427 u32 st = mmio_r32(d, VIRTIO_MMIO_STATUS); 428 if (!(st & VIRTIO_STATUS_FEATURES_OK)) { 429 uart_puts("[seed] virtio: FEATURES_OK rejected\n"); 430 return -1; 431 } 432 433 /* Queue 0. */ 434 mmio_w32(d, VIRTIO_MMIO_QUEUE_SEL, 0); 435 u32 qmax = mmio_r32(d, VIRTIO_MMIO_QUEUE_NUM_MAX); 436 if (qmax < VQ_SIZE) { 437 uart_puts("[seed] virtio: QueueNumMax < VQ_SIZE\n"); 438 return -1; 439 } 440 mmio_w32(d, VIRTIO_MMIO_QUEUE_NUM, VQ_SIZE); 441 442 int i = (int)(d - blk_devs); 443 /* Zero the queue page so all idx/flags start at 0. 
*/ 444 for (int k = 0; k < 4096; k++) vq_pages[i][k] = 0; 445 446 u64 desc_pa = (u64)vq_desc(i); 447 u64 avail_pa = (u64)vq_avail(i); 448 u64 used_pa = (u64)vq_used(i); 449 mmio_w32(d, VIRTIO_MMIO_QUEUE_DESC_LO, (u32)desc_pa); 450 mmio_w32(d, VIRTIO_MMIO_QUEUE_DESC_HI, (u32)(desc_pa >> 32)); 451 mmio_w32(d, VIRTIO_MMIO_QUEUE_DRIVER_LO, (u32)avail_pa); 452 mmio_w32(d, VIRTIO_MMIO_QUEUE_DRIVER_HI, (u32)(avail_pa >> 32)); 453 mmio_w32(d, VIRTIO_MMIO_QUEUE_DEVICE_LO, (u32)used_pa); 454 mmio_w32(d, VIRTIO_MMIO_QUEUE_DEVICE_HI, (u32)(used_pa >> 32)); 455 456 mmio_w32(d, VIRTIO_MMIO_QUEUE_READY, 1); 457 mmio_w32(d, VIRTIO_MMIO_STATUS, 458 VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER | 459 VIRTIO_STATUS_FEATURES_OK | VIRTIO_STATUS_DRIVER_OK); 460 461 /* virtio-blk config @+0 = capacity (sectors, little-endian u64). */ 462 u32 cap_lo = *(volatile u32 *)(d->regs + VIRTIO_MMIO_CONFIG + 0); 463 u32 cap_hi = *(volatile u32 *)(d->regs + VIRTIO_MMIO_CONFIG + 4); 464 d->capacity_sectors = ((u64)cap_hi << 32) | cap_lo; 465 d->present = 1; 466 return 1; 467 } 468 469 /* Issue one request (VIRTIO_BLK_T_IN or _OUT) and poll for completion. 470 * `buf` PA == VA (kheap or kernel BSS / stack). nsec ≤ 2048 (1 MB/req). */ 471 static int blk_request_one(int devi, u32 type, u64 sector, void *buf, u64 nsec) { 472 struct blk_dev *d = &blk_devs[devi]; 473 /* Per-call hdr/status — both reachable via VA==PA on the kernel stack. */ 474 struct virtio_blk_req_hdr hdr = { .type = type, .reserved = 0, .sector = sector }; 475 volatile u8 status = 0xff; 476 477 struct vring_desc *desc = vq_desc(devi); 478 struct vring_avail *avail = vq_avail(devi); 479 struct vring_used *used = vq_used(devi); 480 481 desc[0].addr = (u64)&hdr; 482 desc[0].len = (u32)sizeof(hdr); 483 desc[0].flags = VRING_DESC_F_NEXT; 484 desc[0].next = 1; 485 486 desc[1].addr = (u64)buf; 487 desc[1].len = (u32)(nsec * 512); 488 /* For READ (T_IN), device writes into our buffer (F_WRITE). 
489 * For WRITE (T_OUT), device reads our buffer (no F_WRITE). */ 490 desc[1].flags = VRING_DESC_F_NEXT | (type == VIRTIO_BLK_T_IN ? VRING_DESC_F_WRITE : 0); 491 desc[1].next = 2; 492 493 desc[2].addr = (u64)&status; 494 desc[2].len = 1; 495 desc[2].flags = VRING_DESC_F_WRITE; 496 desc[2].next = 0; 497 498 u16 head = 0; 499 u16 ai = avail->idx; 500 avail->ring[ai % VQ_SIZE] = head; 501 arch_wmb(); 502 avail->idx = ai + 1; 503 arch_wmb(); 504 505 mmio_w32(d, VIRTIO_MMIO_QUEUE_NOTIFY, 0); 506 507 /* Poll used.idx — single in-flight, advances by exactly one. */ 508 while (used->idx == ai) { 509 arch_pause(); 510 } 511 arch_rmb(); 512 513 /* Acknowledge any pending interrupt status (we don't service IRQs but 514 * the device sets these bits anyway). */ 515 u32 is = mmio_r32(d, VIRTIO_MMIO_INT_STATUS); 516 if (is) mmio_w32(d, VIRTIO_MMIO_INT_ACK, is); 517 518 if (status != 0) { 519 uart_puts("[seed] virtio-blk req failed status="); uart_putd((i64)status); 520 uart_puts("\n"); 521 return -1; 522 } 523 return 0; 524 } 525 526 /* Multi-chunk read/write. Chunks at 1 MB (2048 sectors) per request. */ 527 #define BLK_CHUNK_SECTORS 2048 528 529 static int blk_io(int devi, u32 type, u64 sector, u8 *buf, u64 nsec) { 530 while (nsec) { 531 u64 n = nsec; 532 if (n > BLK_CHUNK_SECTORS) n = BLK_CHUNK_SECTORS; 533 if (blk_request_one(devi, type, sector, buf, n) < 0) return -1; 534 sector += n; 535 buf += n * 512; 536 nsec -= n; 537 } 538 return 0; 539 } 540 541 static int blk_read(int devi, u64 sector, void *buf, u64 nsec) { 542 return blk_io(devi, VIRTIO_BLK_T_IN, sector, buf, nsec); 543 } 544 static int blk_write(int devi, u64 sector, const void *buf, u64 nsec) { 545 return blk_io(devi, VIRTIO_BLK_T_OUT, sector, (u8 *)buf, nsec); 546 } 547 548 /* Probe every populated MMIO slot the DTB advertised; bring up block 549 * devices; identify blk0 vs blk1 by reading sector 0 — the cpio newc 550 * magic ("070701") is on blk0, the other is blk1. 
Panics if exactly one 551 * of each isn't found. */ 552 static int g_blk_input = -1; /* index in blk_devs[] for cpio input */ 553 static int g_blk_output = -1; /* index for output dump */ 554 555 static void blk_init(struct dtb_info *dt) { 556 int n_blocks = 0; 557 for (int i = 0; i < dt->virtio_mmio_n; i++) { 558 u64 pa = dt->virtio_mmio_pa[i]; 559 if (n_blocks >= BLK_DEV_MAX) break; 560 /* Stage into blk_devs[n_blocks] so blk_init_one's index-derived 561 * vq page assignment is correct from the start. */ 562 struct blk_dev *d = &blk_devs[n_blocks]; 563 d->regs = arch_mmio_ptr(pa); 564 d->capacity_sectors = 0; 565 d->present = 0; 566 int r = blk_init_one(d); 567 if (r > 0) { 568 n_blocks++; 569 } else if (r < 0) { 570 uart_puts("[seed] virtio: init failed at PA="); uart_putx(pa); 571 uart_puts("\n"); 572 hang(); 573 } 574 } 575 blk_n_devs = n_blocks; 576 if (n_blocks != 2) { 577 uart_puts("[seed] virtio-blk: expected 2 block devices, got "); 578 uart_putd((i64)n_blocks); uart_puts("\n"); 579 hang(); 580 } 581 582 /* Identify blk0 (cpio) vs blk1 (output) by reading sector 0. 
*/ 583 __attribute__((aligned(16))) static u8 probe[512]; 584 for (int i = 0; i < n_blocks; i++) { 585 if (blk_read(i, 0, probe, 1) < 0) { 586 uart_puts("[seed] virtio-blk: probe read failed dev="); 587 uart_putd((i64)i); uart_puts("\n"); 588 hang(); 589 } 590 int is_cpio = str_starts((const char *)probe, CPIO_NEWC_MAGIC); 591 if (is_cpio) { 592 if (g_blk_input >= 0) { 593 uart_puts("[seed] virtio-blk: multiple cpio disks\n"); 594 hang(); 595 } 596 g_blk_input = i; 597 } else { 598 if (g_blk_output >= 0) { 599 uart_puts("[seed] virtio-blk: multiple non-cpio disks\n"); 600 hang(); 601 } 602 g_blk_output = i; 603 } 604 } 605 if (g_blk_input < 0 || g_blk_output < 0) { 606 uart_puts("[seed] virtio-blk: failed to identify in/out\n"); 607 hang(); 608 } 609 uart_puts("[seed] virtio-blk: in=dev"); uart_putd((i64)g_blk_input); 610 uart_puts(" cap="); uart_putd((i64)blk_devs[g_blk_input].capacity_sectors); 611 uart_puts(" sec out=dev"); uart_putd((i64)g_blk_output); 612 uart_puts(" cap="); uart_putd((i64)blk_devs[g_blk_output].capacity_sectors); 613 uart_puts(" sec\n"); 614 } 615 616 /* ─── In-memory tmpfs from cpio newc ────────────────────────────────────── */ 617 618 /* boot5 stages a full musl tree in the cpio (~1300 .c sources + ~1200 619 * headers/aux) plus per-TU .o outputs (~1300) — observed cpio entry 620 * count is ~2600 inputs + ~1300 outputs ≈ 3900. Round up to 4096 to 621 * leave headroom; the struct is ~120 bytes so this costs ~480 KB of 622 * kernel BSS — comfortably within the 192 MB kheap. Path length is 96 623 * to fit "tmp/musl-1.2.5/obj/src/<sub>/<name>.o" (paths stored with 624 * leading slashes stripped, so these are user-visible as 625 * /tmp/musl-1.2.5/obj/...). */ 626 #define MAX_FILES 4096 627 struct file { 628 int used; 629 char path[96]; 630 u8 *data; 631 u64 len; 632 u64 cap; 633 }; 634 static struct file files[MAX_FILES]; 635 636 /* Resolve "." and ".." segments in `in` into `out`. 
Required because real 637 * filesystems normalize `foo/../bar` → `bar` at lookup time, but our 638 * tmpfs is a flat path → blob map; without this, tcc's pstrcat-style 639 * include resolution (e.g. "/tmp/musl-1.2.5/src/include/" + 640 * "../../include/features.h") produces a literal path that misses the 641 * real entry "tmp/musl-1.2.5/include/features.h". The buffers are sized 642 * for our worst case (~96-char paths) plus headroom for the unresolved 643 * form. Trailing slashes are dropped. */ 644 static void normalize_path(const char *in, char *out, int outsz) { 645 char buf[256]; 646 int n = 0; 647 while (in[n] && n < (int)sizeof(buf) - 1) { buf[n] = in[n]; n++; } 648 buf[n] = 0; 649 650 const char *segs[64]; 651 int seg_lens[64]; 652 int nsegs = 0; 653 int i = 0; 654 while (buf[i]) { 655 int start = i; 656 while (buf[i] && buf[i] != '/') i++; 657 int len = i - start; 658 if (buf[i]) { buf[i] = 0; i++; } 659 if (len == 0) continue; 660 if (len == 1 && buf[start] == '.') continue; 661 if (len == 2 && buf[start] == '.' && buf[start + 1] == '.') { 662 if (nsegs > 0) nsegs--; 663 continue; 664 } 665 if (nsegs < 64) { 666 segs[nsegs] = &buf[start]; 667 seg_lens[nsegs] = len; 668 nsegs++; 669 } 670 } 671 672 int o = 0; 673 for (int k = 0; k < nsegs; k++) { 674 for (int j = 0; j < seg_lens[k] && o < outsz - 1; j++) 675 out[o++] = segs[k][j]; 676 if (k < nsegs - 1 && o < outsz - 1) out[o++] = '/'; 677 } 678 out[o] = 0; 679 } 680 681 static int find_file(const char *path) { 682 while (*path == '/') path++; 683 char norm[128]; 684 normalize_path(path, norm, sizeof(norm)); 685 for (int i = 0; i < MAX_FILES; i++) { 686 if (files[i].used && str_eq(files[i].path, norm)) return i; 687 } 688 return -1; 689 } 690 691 static int new_file(const char *path) { 692 while (*path == '/') path++; 693 /* Normalize at store time so all later lookups match regardless of 694 * how the caller spelled `..` / `.`. 
*/ 695 char norm[128]; 696 normalize_path(path, norm, sizeof(norm)); 697 for (int i = 0; i < MAX_FILES; i++) { 698 if (!files[i].used) { 699 files[i].used = 1; 700 int j = 0; 701 while (norm[j] && j < (int)sizeof(files[i].path) - 1) { 702 files[i].path[j] = norm[j]; j++; 703 } 704 files[i].path[j] = 0; 705 files[i].data = 0; 706 files[i].len = 0; 707 files[i].cap = 0; 708 return i; 709 } 710 } 711 return -1; 712 } 713 714 static u64 hex_n(const char *s, int n) { 715 u64 v = 0; 716 for (int i = 0; i < n; i++) { 717 char c = s[i]; 718 v <<= 4; 719 if (c >= '0' && c <= '9') v |= (u64)(c - '0'); 720 else if (c >= 'a' && c <= 'f') v |= (u64)(c - 'a' + 10); 721 else if (c >= 'A' && c <= 'F') v |= (u64)(c - 'A' + 10); 722 } 723 return v; 724 } 725 726 static void parse_cpio(const void *cpio, u64 total) { 727 const u8 *p = cpio; 728 const u8 *end = p + total; 729 while (p + CPIO_NEWC_HDR_SIZE <= end) { 730 if (!str_starts((const char *)p, CPIO_NEWC_MAGIC)) break; 731 u64 mode = hex_n((const char *)CPIO_NEWC_FIELD(p, 1), 8); 732 u64 fsz = hex_n((const char *)CPIO_NEWC_FIELD(p, 6), 8); 733 u64 nsz = hex_n((const char *)CPIO_NEWC_FIELD(p, 11), 8); 734 const char *name = (const char *)(p + CPIO_NEWC_HDR_SIZE); 735 if (str_eq(name, "TRAILER!!!")) break; 736 737 u64 hstride = (CPIO_NEWC_HDR_SIZE + nsz + 3) & ~3UL; 738 u64 fstride = (fsz + 3) & ~3UL; 739 const u8 *fdata = p + hstride; 740 741 int is_dir = ((mode & CPIO_MODE_TYPE_MASK) == CPIO_MODE_TYPE_DIR); 742 int is_reg = ((mode & CPIO_MODE_TYPE_MASK) == CPIO_MODE_TYPE_REG); 743 if (is_reg && !str_eq(name, ".")) { 744 int idx = new_file(name); 745 if (idx >= 0) { 746 /* Copy out — we'll let the user write back later if needed. */ 747 files[idx].data = kalloc(fsz ? fsz : 1); 748 files[idx].cap = fsz ? 
fsz : 1; 749 files[idx].len = fsz; 750 if (fsz) mem_cpy(files[idx].data, fdata, fsz); 751 } else { 752 /* Silent drops here are how MAX_FILES being too low 753 * masquerades as random "file not found" errors during 754 * the build — surface it loudly. */ 755 uart_puts("[seed] WARN: cpio entry dropped (MAX_FILES " 756 "exhausted): "); uart_puts(name); uart_puts("\n"); 757 } 758 } 759 (void)is_dir; 760 p += hstride + fstride; 761 } 762 } 763 764 /* ─── ELF64 static loader ───────────────────────────────────────────────── */ 765 766 struct ehdr { u8 e_ident[16]; u16 e_type, e_machine; u32 e_version; u64 e_entry, e_phoff, e_shoff; u32 e_flags; u16 e_ehsize, e_phentsize, e_phnum, e_shentsize, e_shnum, e_shstrndx; }; 767 struct phdr { u32 p_type, p_flags; u64 p_offset, p_vaddr, p_paddr, p_filesz, p_memsz, p_align; }; 768 769 #define PT_LOAD 1 770 771 /* Highest VA touched by the most recently loaded image's PT_LOAD segments 772 * (after USER_VA_HI clipping). load_elf updates this; kmain / sys_spawn 773 * use it to seed brk_base above the user image's BSS. */ 774 static u64 g_user_image_end; 775 776 static u64 load_elf(const u8 *elf) { 777 const struct ehdr *eh = (const struct ehdr *)elf; 778 if (!(eh->e_ident[0] == 0x7f && eh->e_ident[1] == 'E' && 779 eh->e_ident[2] == 'L' && eh->e_ident[3] == 'F')) { 780 uart_puts("ELF: bad magic\n"); return 0; 781 } 782 if (eh->e_machine != ARCH_ELF_MACHINE) { 783 uart_puts("ELF: not "); uart_puts(ARCH_ELF_MACHINE_NAME); uart_puts("\n"); return 0; 784 } 785 /* p_flags (R/W/X) are deliberately ignored: the user mapping is one 786 * giant Normal-memory RWX region (see arch_setup_mmu). OS.md 787 * §"Memory model" permits this — there's no W^X enforcement in the 788 * contract, and tcc-boot2 never JITs. 
789 * 790 * Segments are clipped at USER_VA_HI: a binary may declare a BSS that 791 * extends past the mapped user window (scheme1 reserves ~256 MB), and 792 * a naive mem_set would walk into the device-block region above and 793 * trigger an external abort. The user image gets only the portion of 794 * its memsz that fits in the user pool; if user code later touches 795 * the unmapped tail, that's a user-space fault, not a kernel panic. */ 796 u64 hi = 0; 797 for (int i = 0; i < eh->e_phnum; i++) { 798 const struct phdr *ph = (const struct phdr *)(elf + eh->e_phoff + (u64)i * eh->e_phentsize); 799 if (ph->p_type != PT_LOAD) continue; 800 u64 vaddr = ph->p_vaddr; 801 u64 filesz = ph->p_filesz; 802 u64 memsz = ph->p_memsz; 803 if (vaddr >= USER_VA_HI) continue; /* segment fully out of window */ 804 u64 reach = USER_VA_HI - vaddr; 805 if (filesz > reach) filesz = reach; 806 if (memsz > reach) memsz = reach; 807 u8 *dst = (u8 *)vaddr; 808 const u8 *src = elf + ph->p_offset; 809 mem_cpy(dst, src, filesz); 810 if (memsz > filesz) 811 mem_set(dst + filesz, 0, memsz - filesz); 812 u64 end = vaddr + memsz; 813 if (end > hi) hi = end; 814 } 815 /* Round up to 16 bytes so callers can use it directly as brk_base. */ 816 g_user_image_end = (hi + 15) & ~15UL; 817 /* I-cache sync (cheap insurance even with caches off). */ 818 arch_icache_sync(); 819 return eh->e_entry; 820 } 821 822 /* ─── Syscall layer (Tier 1) ────────────────────────────────────────────── */ 823 824 #define MAX_FD 32 825 struct fdent { int used; int fidx; u64 pos; int wflag; int append; }; 826 static struct fdent fdtab[MAX_FD]; 827 828 /* User program break (single-process). 
*/ 829 static u64 brk_base; 830 static u64 brk_cur; 831 static u64 brk_max; 832 833 #define EBADF 9 834 #define ENOENT 2 835 #define EINVAL 22 836 #define EMFILE 24 837 #define EFAULT 14 838 #define ENOSPC 28 839 840 #define O_RDONLY 0 841 #define O_WRONLY 1 842 #define O_RDWR 2 843 #define O_CREAT 0100 844 #define O_TRUNC 01000 845 #define O_APPEND 02000 846 847 #define AT_FDCWD (-100) 848 849 #define SYS_unlinkat ARCH_SYS_unlinkat 850 #define SYS_openat ARCH_SYS_openat 851 #ifdef ARCH_SYS_open 852 #define SYS_open ARCH_SYS_open 853 #endif 854 #define SYS_close ARCH_SYS_close 855 #define SYS_lseek ARCH_SYS_lseek 856 #define SYS_read ARCH_SYS_read 857 #define SYS_write ARCH_SYS_write 858 #define SYS_exit_group ARCH_SYS_exit_group 859 #define SYS_waitid ARCH_SYS_waitid 860 #define SYS_brk ARCH_SYS_brk 861 /* Private syscall number, deliberately outside the normal Linux range. 862 * The scheme1 prelude probes (sys-spawn) once at init: on Linux this 863 * number is unmapped so the probe gets -ENOSYS and the prelude falls 864 * back to the classic clone+execve path; on the seed kernel the probe 865 * succeeds (or returns -ENOENT for a missing file) and the prelude uses 866 * sys-spawn for every (run …) thereafter. */ 867 #define SYS_spawn ARCH_SYS_spawn 868 869 #define ECHILD 10 870 #define EAGAIN 11 871 #define ENOEXEC 8 872 873 static i64 sys_write(int fd, const void *buf, u64 len) { 874 if (fd == 1 || fd == 2) { 875 const u8 *s = buf; 876 for (u64 i = 0; i < len; i++) uart_putc((char)s[i]); 877 return (i64)len; 878 } 879 if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used || !fdtab[fd].wflag) return -EBADF; 880 struct file *f = &files[fdtab[fd].fidx]; 881 u64 pos = fdtab[fd].append ? f->len : fdtab[fd].pos; 882 u64 need = pos + len; 883 if (need > f->cap) { 884 u64 ncap = f->cap ? 
f->cap : 64; 885 while (ncap < need) ncap *= 2; 886 u8 *nd = kalloc(ncap); 887 if (f->len) mem_cpy(nd, f->data, f->len); 888 f->data = nd; 889 f->cap = ncap; 890 } 891 mem_cpy(f->data + pos, buf, len); 892 if (need > f->len) f->len = need; 893 fdtab[fd].pos = pos + len; 894 return (i64)len; 895 } 896 897 static i64 sys_read(int fd, void *buf, u64 len) { 898 if (fd == 0) return 0; 899 if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF; 900 struct file *f = &files[fdtab[fd].fidx]; 901 u64 pos = fdtab[fd].pos; 902 if (pos >= f->len) return 0; 903 u64 n = len; 904 if (pos + n > f->len) n = f->len - pos; 905 mem_cpy(buf, f->data + pos, n); 906 fdtab[fd].pos = pos + n; 907 return (i64)n; 908 } 909 910 static i64 sys_openat(int dirfd, const char *path, int flags, int mode) { 911 (void)dirfd; (void)mode; 912 int fidx = find_file(path); 913 int wflag = (flags & 3) != 0; 914 if (fidx < 0) { 915 if (!(flags & O_CREAT)) return -ENOENT; 916 fidx = new_file(path); 917 if (fidx < 0) return -ENOSPC; 918 } else if (flags & O_TRUNC) { 919 files[fidx].len = 0; 920 } 921 int fd = -1; 922 for (int i = 3; i < MAX_FD; i++) { 923 if (!fdtab[i].used) { fd = i; break; } 924 } 925 if (fd < 0) return -EMFILE; 926 fdtab[fd].used = 1; 927 fdtab[fd].fidx = fidx; 928 fdtab[fd].pos = 0; 929 fdtab[fd].wflag = wflag; 930 fdtab[fd].append = (flags & O_APPEND) ? 
1 : 0; 931 return fd; 932 } 933 934 static i64 sys_close(int fd) { 935 if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF; 936 fdtab[fd].used = 0; 937 return 0; 938 } 939 940 static i64 sys_lseek(int fd, i64 off, int whence) { 941 if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF; 942 struct file *f = &files[fdtab[fd].fidx]; 943 i64 base; 944 if (whence == 0) base = 0; 945 else if (whence == 1) base = (i64)fdtab[fd].pos; 946 else if (whence == 2) base = (i64)f->len; 947 else return -EINVAL; 948 i64 np = base + off; 949 if (np < 0) return -EINVAL; 950 fdtab[fd].pos = (u64)np; 951 return np; 952 } 953 954 static i64 sys_brk(u64 addr) { 955 if (addr == 0) return (i64)brk_cur; 956 if (addr < brk_base || addr > brk_max) return (i64)brk_cur; 957 brk_cur = addr; 958 return (i64)brk_cur; 959 } 960 961 static i64 sys_unlinkat(int dirfd, const char *path, int flags) { 962 /* dirfd is ignored: tmpfs is a flat path map, all callers pass 963 * AT_FDCWD. flags (AT_REMOVEDIR) is ignored too — there are no 964 * directories in our tmpfs, so the dir-vs-file distinction the 965 * flag selects has no observable effect. */ 966 (void)dirfd; (void)flags; 967 int fidx = find_file(path); 968 if (fidx < 0) return -ENOENT; 969 files[fidx].used = 0; 970 return 0; 971 } 972 973 /* ─── Tier 2: atomic spawn (spawn / waitid / exit_group) ────────────────── */ 974 /* 975 * The boot2 chain's process-creation shape (scheme1/prelude.scm `(spawn …)`) 976 * is rigidly synchronous: parent creates a child to run a single program, 977 * waits for it, reads the exit code. Nothing else runs in the child between 978 * creation and the new program's entry, and nothing else runs in the 979 * parent between creation and wait. 
 *
 * We implement that as a single atomic syscall on a single-threaded kernel:
 *
 *   sys_spawn  → capture path+argv into kernel buffers (still reading from
 *                the parent's pool); push parent state (regs, brk, fd
 *                table, current pool) onto proc_stack; remap user VAs to
 *                the alternate pool with NO COPY (the child won't read any
 *                byte of the parent's pool — load_elf overwrites just the
 *                PT_LOAD ranges and build_user_stack writes the top of the
 *                user VA window); load_elf into the alternate pool, reset
 *                brk, build the user stack, rewrite tf so trap return
 *                enters the new program at its entry with the new stack.
 *   sys_exit   → if proc_stack non-empty: stash exit code in last_child,
 *                swap user VAs back to the parent's pool (no copy — the
 *                parent's pool was never written by the child), restore
 *                regs/brk/fds, cache-sync (the user VAs now resolve to
 *                different physical pages), set tf so trap return resumes
 *                the parent's spawn() call with child pid. If proc_stack
 *                empty: real exit (dump tmpfs, arch shutdown).
 *   sys_waitid → return last_child's exit code via the siginfo struct.
 *
 * No actual concurrency. The "parent" is suspended at the moment of spawn
 * and resumed only when the child calls exit_group.
 *
 * Memory cost per spawn: zero copy. User-map rewrite + TLB/cache sync
 * + load_elf (which copies just the new image's PT_LOAD bytes, typically
 * ~1 MB for tcc). This replaces the previous clone/execve design which
 * paid one 768 MB mem_cpy per fork to seed the child's pool with parent
 * state — needed only because scheme1 ran a few interpreter-bytecode
 * cells of user code between clone and execve, which would otherwise
 * mutate the parent's BSS heap-allocator globals (heap_next,
 * current_heap_next_ptr, scratch_next). Folding clone+execve into one
 * syscall closes that window entirely.
1013 */ 1014 1015 /* Forward decls for state defined further down. boot5's per-source 1016 * compile passes ~25 argv entries / ~750 bytes, but the final 1017 * `tcc -ar rcs libc.a obj1 … obj1263` call passes ~1300 entries totalling 1018 * ~65 KB of strings. MAX_ARGV / spawn_argv_pool size for that worst 1019 * case; both are kept generous so a future taller chain doesn't hit a 1020 * silent-truncation cliff. */ 1021 #define MAX_ARGV 2048 1022 static u64 build_user_stack(u64 stack_top, int argc, char **argv); 1023 static int tokenise(char *src, char **argv, int cap); 1024 1025 #define MAX_PROC_DEPTH 1 1026 1027 struct proc_save { 1028 int active; 1029 u64 child_pid; 1030 /* Saved trap-frame state — enough to resume the parent at the trap 1031 * instruction following its sys_spawn. The return register is 1032 * overwritten with child_pid at restore time. */ 1033 struct trapframe tf; 1034 u64 user_sp; 1035 /* Per-process state at the moment of spawn. brk_base is saved alongside 1036 * brk_cur because sys_spawn resets it above the new image's end-of-bss; 1037 * the parent's value comes back with the parent's pool. */ 1038 u64 brk_base_save; 1039 u64 brk_cur_save; 1040 struct fdent fdtab_save[MAX_FD]; 1041 int pool_save; /* parent's user pool (0=A, 1=B) */ 1042 }; 1043 1044 /* Rewrite the user-VA mapping to point at pool `which`, then flush TLB. */ 1045 static void swap_user_pool(int which) { 1046 arch_swap_user_pool(which); 1047 current_pool = which; 1048 } 1049 1050 static struct proc_save proc_stack[MAX_PROC_DEPTH]; 1051 static int proc_depth = 0; 1052 static u64 g_next_pid = 2; 1053 1054 /* The most recently exited child, for sys_waitid to consume. 
*/ 1055 static int last_child_valid = 0; 1056 static u64 last_child_pid = 0; 1057 static int last_child_code = 0; 1058 1059 /* sys_spawn captures path+argv from the parent's pool into kernel buffers 1060 * BEFORE swapping pools — load_elf will only ever read from the cpio- 1061 * staged file (kernel state) and write to the alternate pool, but the 1062 * argv strings the caller passed live in the parent's pool, which we're 1063 * about to stop mapping. The pool + argv pointer table sit in BSS 1064 * (rather than on the kernel stack) because MAX_ARGV * 8 = 16 KB is 1065 * too large to put on the syscall stack. */ 1066 static char spawn_argv_pool[131072]; /* 128 KB; boot5 ar peaks ~65 KB */ 1067 static char *spawn_argv_ptrs[MAX_ARGV]; 1068 static i64 sys_spawn(struct trapframe *tf, const char *path, char **argv) { 1069 if (!path) return -EFAULT; 1070 if (proc_depth >= MAX_PROC_DEPTH) return -EAGAIN; 1071 1072 /* Copy path out of the parent's pool first (find_file uses kernel 1073 * state, but the caller's `path` pointer is into user memory). */ 1074 char path_buf[128]; 1075 int pn = 0; 1076 while (path[pn] && pn < 127) { path_buf[pn] = path[pn]; pn++; } 1077 path_buf[pn] = 0; 1078 int fidx = find_file(path_buf); 1079 if (fidx < 0) return -ENOENT; 1080 1081 /* Capture argv strings into spawn_argv_pool (kernel BSS, not the 1082 * user pool — survives the pool swap below). Truncation here is 1083 * silent but loud: we panic-warn on the UART so a too-low MAX_ARGV 1084 * surfaces as a kernel message, not a downstream link failure. 
*/ 1085 int argc = 0; 1086 int pool_off = 0; 1087 if (argv) { 1088 while (argc < MAX_ARGV - 1 && argv[argc]) { 1089 const char *s = argv[argc]; 1090 int n = 0; 1091 while (s[n] && pool_off + n < (int)sizeof(spawn_argv_pool) - 1) n++; 1092 for (int j = 0; j < n; j++) spawn_argv_pool[pool_off + j] = s[j]; 1093 spawn_argv_pool[pool_off + n] = 0; 1094 spawn_argv_ptrs[argc] = &spawn_argv_pool[pool_off]; 1095 pool_off += n + 1; 1096 argc++; 1097 } 1098 if (argv[argc]) { 1099 uart_puts("[seed] WARN: sys_spawn argv truncated at MAX_ARGV=" 1100 ); uart_putd(MAX_ARGV); uart_puts(" for path="); 1101 uart_puts(path_buf); uart_puts("\n"); 1102 } 1103 } 1104 if (argc == 0) { 1105 /* Synthesise argv[0] from the path so user code that reads argv[0] 1106 * doesn't crash. */ 1107 int n = 0; 1108 while (path_buf[n] && pool_off + n < (int)sizeof(spawn_argv_pool) - 1) n++; 1109 for (int j = 0; j < n; j++) spawn_argv_pool[pool_off + j] = path_buf[j]; 1110 spawn_argv_pool[pool_off + n] = 0; 1111 spawn_argv_ptrs[0] = &spawn_argv_pool[pool_off]; 1112 pool_off += n + 1; 1113 argc = 1; 1114 } 1115 1116 /* Save parent state — regs, brk, fd table, which pool the parent ran 1117 * in. After sys_exit_or_resume_parent restores from this frame, the 1118 * parent's spawn() call returns with child_pid. */ 1119 struct proc_save *p = &proc_stack[proc_depth]; 1120 p->active = 1; 1121 p->child_pid = g_next_pid++; 1122 p->tf = *tf; 1123 p->user_sp = arch_read_user_sp(); 1124 p->brk_base_save = brk_base; 1125 p->brk_cur_save = brk_cur; 1126 for (int i = 0; i < MAX_FD; i++) p->fdtab_save[i] = fdtab[i]; 1127 p->pool_save = current_pool; 1128 1129 /* Swap to the alternate pool. NO COPY: the child will only read 1130 * memory that load_elf writes (its own PT_LOAD segments) and what 1131 * build_user_stack writes (top of user VA). Stale bytes elsewhere in 1132 * the alt pool are user-invisible — sbrk pages aren't zeroed but 1133 * neither were they under the old execve path. 
*/ 1134 int new_pool = current_pool ^ 1; 1135 swap_user_pool(new_pool); 1136 proc_depth++; 1137 1138 /* Load new ELF into the (just-swapped) alt pool. files[fidx].data is 1139 * in kernel heap, not the user pool, so this read is unaffected. */ 1140 u64 entry = load_elf(files[fidx].data); 1141 if (!entry) { 1142 /* Roll back: alt pool is in undefined state but parent pool is 1143 * still pristine. Swap back and pop proc_stack. */ 1144 proc_depth--; 1145 swap_user_pool(p->pool_save); 1146 return -ENOEXEC; 1147 } 1148 1149 /* Reset brk above the new image's end-of-bss, page-aligned up. 1150 * Some seed binaries (e.g. riscv64 hex2) embed PC-relative scratch 1151 * buffers in the bytes just past their loaded image and assume brk 1152 * lives a full page beyond — Linux rounds brk to PAGE_SIZE. If we 1153 * placed the heap immediately after the image (16-byte aligned), a 1154 * write through the in-binary scratch overlaps the first heap node 1155 * and silently corrupts the user's data structures. */ 1156 brk_base = g_user_image_end ? g_user_image_end : USER_VA_LO; 1157 brk_base = (brk_base + 0xfffUL) & ~0xfffUL; 1158 brk_cur = brk_base; 1159 1160 /* Build new user stack at top of user VA window. */ 1161 u64 new_sp = build_user_stack(USER_VA_HI, argc, spawn_argv_ptrs); 1162 1163 /* Rewrite trap frame so eret enters the child at the new image's 1164 * entry with a clean register state and the new stack. The parent's 1165 * regs sit on proc_stack until sys_exit_or_resume_parent restores 1166 * them on child exit. */ 1167 arch_clear_to_user_entry(tf, entry); 1168 /* Some backends keep the user stack pointer outside the saved 1169 * trapframe, so set it through the arch hook. */ 1170 arch_write_user_sp(new_sp); 1171 /* Returning 0; dispatcher writes the arch return register. The child's 1172 * _start reads argc/argv from the stack, so the return register is 1173 * don't-care. 
*/ 1174 return 0; 1175 } 1176 1177 static i64 sys_waitid(struct trapframe *tf, int idtype, u64 id, 1178 void *info, int options) { 1179 (void)tf; (void)idtype; (void)id; (void)options; 1180 if (!last_child_valid) return -ECHILD; 1181 /* scheme1/prelude.scm:497-506 reads info[8]=si_code (CLD_EXITED=1) and 1182 * info[24]=si_status. siginfo_t is sparsely written — zero the rest so 1183 * the prelude's view is deterministic. */ 1184 if (info) { 1185 u8 *p = info; 1186 for (int i = 0; i < 128; i++) p[i] = 0; 1187 u32 *si_code = (u32 *)(p + 8); 1188 u32 *si_status = (u32 *)(p + 24); 1189 *si_code = 1; /* CLD_EXITED */ 1190 *si_status = (u32)last_child_code; 1191 } 1192 last_child_valid = 0; 1193 return 0; 1194 } 1195 1196 static int g_exit_code = 0; 1197 static int g_exited = 0; 1198 1199 /* On-disk dump format on blk1 (SEEDFS, sector-aligned, little-endian): 1200 * 1201 * sector 0: struct seedfs_hdr { 1202 * char magic[8] = "SEEDFS\0\0"; 1203 * u32 nfiles; 1204 * u32 reserved; 1205 * }; (16 B; rest of sector zero-padded) 1206 * sector 1..T: nfiles directory entries, 4 entries/sector: 1207 * struct seedfs_ent { 1208 * char path[96]; 1209 * u32 data_offset_sectors; 1210 * u32 _pad; 1211 * u64 size_bytes; 1212 * }; (112 B; T = ceil(nfiles/4)) 1213 * sector T+1..: file data, each file padded up to a 512-byte boundary. 1214 * 1215 * The host-side extractor (extract-blk.sh) walks the table and writes 1216 * each file out by data_offset_sectors / size_bytes. 1217 * 1218 * Runs unconditionally on user exit. If the user code never reached exit 1219 * (kernel panic, hang, etc.) the host extractor sees no SEEDFS magic at 1220 * sector 0 and reports the missing-exit failure mode. 
*/ 1221 1222 #define SEEDFS_ENT_SZ 112 1223 1224 struct seedfs_hdr { 1225 char magic[8]; 1226 u32 nfiles; 1227 u32 reserved; 1228 }; 1229 1230 struct seedfs_ent { 1231 char path[96]; 1232 u32 data_offset_sectors; 1233 u32 _pad; 1234 u64 size_bytes; 1235 }; 1236 1237 /* Scratch sector for trailing-byte padding of files whose size isn't a 1238 * multiple of 512. Single 512-byte buffer is enough — we serialise file 1239 * writes and rezero before each use. */ 1240 __attribute__((aligned(16))) static u8 dump_tail_sector[512]; 1241 1242 static void dump_tmpfs_blk(void) { 1243 /* Count active files. */ 1244 u32 nfiles = 0; 1245 for (int i = 0; i < MAX_FILES; i++) if (files[i].used) nfiles++; 1246 1247 u32 table_sectors = (nfiles + 3) / 4; 1248 u32 hdr_sectors = 1 + table_sectors; 1249 u64 hdr_bytes = (u64)hdr_sectors * 512; 1250 1251 u8 *hdr_buf = kalloc(hdr_bytes); 1252 for (u64 i = 0; i < hdr_bytes; i++) hdr_buf[i] = 0; 1253 1254 struct seedfs_hdr *hdr = (struct seedfs_hdr *)hdr_buf; 1255 hdr->magic[0]='S'; hdr->magic[1]='E'; hdr->magic[2]='E'; 1256 hdr->magic[3]='D'; hdr->magic[4]='F'; hdr->magic[5]='S'; 1257 hdr->magic[6]=0; hdr->magic[7]=0; 1258 hdr->nfiles = nfiles; 1259 hdr->reserved = 0; 1260 1261 /* Walk files: fill table entries, write data sectors, advance cursor. */ 1262 u32 ent_idx = 0; 1263 u64 cursor = (u64)hdr_sectors; 1264 /* Output device capacity guard — we don't grow blk1, the host pre- 1265 * sized it (256 MB by default). Refuse the dump if it would exceed. 
*/ 1266 u64 out_cap = blk_devs[g_blk_output].capacity_sectors; 1267 1268 for (int i = 0; i < MAX_FILES; i++) { 1269 if (!files[i].used) continue; 1270 struct seedfs_ent *e = (struct seedfs_ent *)(hdr_buf + 512 + 1271 (u64)ent_idx * SEEDFS_ENT_SZ); 1272 int j = 0; 1273 while (files[i].path[j] && j < (int)sizeof(e->path) - 1) { 1274 e->path[j] = files[i].path[j]; j++; 1275 } 1276 e->path[j] = 0; 1277 e->data_offset_sectors = (u32)cursor; 1278 e->size_bytes = files[i].len; 1279 1280 u64 nsec_full = files[i].len / 512; 1281 u64 rem = files[i].len - nsec_full * 512; 1282 u64 need = nsec_full + (rem ? 1 : 0); 1283 if (cursor + need > out_cap) { 1284 uart_puts("[seed] dump: out.img too small for tmpfs\n"); 1285 return; 1286 } 1287 if (nsec_full) 1288 blk_write(g_blk_output, cursor, files[i].data, nsec_full); 1289 cursor += nsec_full; 1290 if (rem) { 1291 for (int k = 0; k < 512; k++) dump_tail_sector[k] = 0; 1292 for (u64 k = 0; k < rem; k++) 1293 dump_tail_sector[k] = files[i].data[nsec_full * 512 + k]; 1294 blk_write(g_blk_output, cursor, dump_tail_sector, 1); 1295 cursor++; 1296 } 1297 ent_idx++; 1298 } 1299 1300 blk_write(g_blk_output, 0, hdr_buf, hdr_sectors); 1301 uart_puts("[seed] dump: nfiles="); uart_putd((i64)nfiles); 1302 uart_puts(" cursor="); uart_putd((i64)cursor); 1303 uart_puts(" sectors\n"); 1304 } 1305 1306 static void sys_exit_final(int code) { 1307 g_exit_code = code; 1308 g_exited = 1; 1309 dump_tmpfs_blk(); 1310 uart_puts("\n[seed] user exit_group("); uart_putd(code); uart_puts(")\n"); 1311 arch_system_off(); 1312 arch_idle_forever(); 1313 } 1314 1315 /* Dispatcher-side exit_group: pops proc_stack and resumes the parent's 1316 * sys_spawn if there's a saved frame, otherwise falls through to the 1317 * real shutdown path. Returns 1 if the trap frame was rewritten (resume 1318 * parent), 0 if the caller should treat it as a normal trap-return path 1319 * (which will never happen, since sys_exit_final does not return). 
*/ 1320 static int sys_exit_or_resume_parent(struct trapframe *tf, int code) { 1321 code &= 0xff; 1322 if (proc_depth > 0) { 1323 struct proc_save *p = &proc_stack[--proc_depth]; 1324 last_child_pid = p->child_pid; 1325 last_child_code = code; 1326 last_child_valid = 1; 1327 /* Swap the user-VA mapping back to the parent's pool. The parent's 1328 * physical pool was never overwritten — only the child's pool was 1329 * — so no mem_cpy is needed. */ 1330 if (current_pool != p->pool_save) swap_user_pool(p->pool_save); 1331 brk_base = p->brk_base_save; 1332 brk_cur = p->brk_cur_save; 1333 for (int i = 0; i < MAX_FD; i++) fdtab[i] = p->fdtab_save[i]; 1334 /* Restore registers; the dispatcher writes the child pid into the 1335 * arch return register below. */ 1336 *tf = p->tf; 1337 arch_write_user_sp(p->user_sp); 1338 /* I-cache invalidation. The parent's pool was never written, so 1339 * its instruction bytes (in DRAM) are byte-identical to what was 1340 * originally fetched. But the same user VAs were just used to 1341 * fetch the child's instructions from the other physical pool; 1342 * I-caches may hold lines tagged by VA whose translation just 1343 * changed, so the arch backend invalidates whatever is needed. 
*/ 1344 arch_icache_context_sync(); 1345 return (int)p->child_pid; /* >0: tells dispatcher to write this as r */ 1346 } 1347 sys_exit_final(code); 1348 return 0; /* unreachable */ 1349 } 1350 1351 /* ─── Trap dispatch (called from start.S vector handlers) ───────────────── */ 1352 1353 i64 trap_sync(u64 esr, struct trapframe *tf); 1354 void trap_kernel(u64 esr, struct trapframe *tf); 1355 void trap_unhandled(u64 esr, struct trapframe *tf); 1356 1357 i64 trap_sync(u64 esr, struct trapframe *tf) { 1358 if (ARCH_IS_SYSCALL(esr)) { 1359 u64 nr = ARCH_SYSCALL_NR(tf); 1360 u64 a0 = ARCH_SYSCALL_ARG(tf, 0), a1 = ARCH_SYSCALL_ARG(tf, 1); 1361 u64 a2 = ARCH_SYSCALL_ARG(tf, 2), a3 = ARCH_SYSCALL_ARG(tf, 3); 1362 u64 a4 = ARCH_SYSCALL_ARG(tf, 4), a5 = ARCH_SYSCALL_ARG(tf, 5); 1363 i64 r; 1364 switch (nr) { 1365 case SYS_read: r = sys_read((int)a0, (void *)a1, a2); break; 1366 case SYS_write: r = sys_write((int)a0, (const void *)a1, a2); break; 1367 case SYS_openat: r = sys_openat((int)a0, (const char *)a1, (int)a2, (int)a3); break; 1368 #ifdef SYS_open 1369 /* amd64 hex0/hex1/hex2/M0 seed binaries call legacy `open(path, 1370 * flags, mode)` directly; alias to openat(AT_FDCWD, ...). */ 1371 case SYS_open: r = sys_openat(AT_FDCWD, (const char *)a0, (int)a1, (int)a2); break; 1372 #endif 1373 case SYS_close: r = sys_close((int)a0); break; 1374 case SYS_lseek: r = sys_lseek((int)a0, (i64)a1, (int)a2); break; 1375 case SYS_brk: r = sys_brk(a0); break; 1376 case SYS_unlinkat: r = sys_unlinkat((int)a0, (const char *)a1, (int)a2); break; 1377 case SYS_spawn: r = sys_spawn(tf, (const char *)a0, (char **)a1); break; 1378 case SYS_waitid: r = sys_waitid(tf, (int)a0, a1, (void *)a2, (int)a3); break; 1379 case SYS_exit_group: 1380 r = sys_exit_or_resume_parent(tf, (int)a0); 1381 /* If we resumed the parent, sys_exit_or_resume_parent has 1382 * rewritten the trapframe; set only the arch return register. 
*/ 1383 if (proc_depth >= 0 && r != 0) { 1384 ARCH_SET_RET(tf, r); 1385 return 0; 1386 } 1387 break; 1388 default: 1389 uart_puts("[seed] ENOSYS "); uart_putd((i64)nr); uart_puts("\n"); 1390 r = -38; /* ENOSYS */ 1391 } 1392 ARCH_SET_RET(tf, r); 1393 (void)a4; (void)a5; 1394 return 0; 1395 } 1396 uart_puts("[seed] PANIC: user sync, ESR="); uart_putx(esr); 1397 uart_puts(" ELR="); uart_putx(ARCH_TF_PC(tf)); 1398 uart_puts(" FAR="); 1399 u64 far = arch_fault_addr(); uart_putx(far); 1400 uart_puts("\n"); 1401 hang(); 1402 } 1403 1404 void trap_kernel(u64 esr, struct trapframe *tf) { 1405 u64 far = arch_fault_addr(); 1406 uart_puts("[seed] PANIC: kernel sync, ESR="); uart_putx(esr); 1407 uart_puts(" ELR="); uart_putx(ARCH_TF_PC(tf)); 1408 uart_puts(" FAR="); uart_putx(far); 1409 uart_puts("\n"); 1410 hang(); 1411 } 1412 1413 void trap_unhandled(u64 esr, struct trapframe *tf) { 1414 uart_puts("[seed] PANIC: unhandled exception, ESR="); uart_putx(esr); 1415 uart_puts(" ELR="); uart_putx(ARCH_TF_PC(tf)); 1416 uart_puts("\n"); 1417 hang(); 1418 } 1419 1420 /* ─── User stack setup + entry ──────────────────────────────────────────── */ 1421 1422 /* Tokenise `src` in place (whitespace separators) into argv slots. 1423 * Writes pointers into argv[0..argc-1] and returns argc. Stops at cap. */ 1424 static int tokenise(char *src, char **argv, int cap) { 1425 int argc = 0; 1426 char *p = src; 1427 while (*p && argc < cap) { 1428 while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++; 1429 if (!*p) break; 1430 argv[argc++] = p; 1431 while (*p && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r') p++; 1432 if (*p) *p++ = 0; 1433 } 1434 return argc; 1435 } 1436 1437 /* Out-of-stack scratch for the per-call user-VA pointer table. With 1438 * MAX_ARGV=2048, sizeof(strs)=16 KB — too large to put on the syscall 1439 * stack. 
*/ 1440 static u64 build_user_stack_strs[MAX_ARGV]; 1441 1442 static u64 build_user_stack(u64 stack_top, int argc, char **argv) { 1443 /* SysV layout, low to high at the returned sp: 1444 * argc, argv[0..argc-1], NULL (argv term), NULL (envp term). 1445 * Strings live above the vectors, in a string pool placed just below 1446 * stack_top so the user image's high-water mark is stable. */ 1447 if (argc < 1) argc = 1; 1448 if (argc > MAX_ARGV) argc = MAX_ARGV; 1449 1450 /* Lay strings down from stack_top - 16 (16-byte alignment slack). */ 1451 u64 strs_top = stack_top - 16; 1452 u64 *strs = build_user_stack_strs; 1453 char *cursor = (char *)strs_top; 1454 for (int i = argc - 1; i >= 0; i--) { 1455 int n = str_n(argv[i]) + 1; 1456 cursor -= n; 1457 for (int j = 0; j < n; j++) cursor[j] = argv[i][j]; 1458 strs[i] = (u64)cursor; 1459 } 1460 1461 /* sp must hold: argc + (argc+1)*8 (argv + NULL) + 8 (envp NULL) */ 1462 u64 sp = (u64)cursor - (u64)((argc + 3) * 8); 1463 sp &= ~15UL; 1464 u64 *p = (u64 *)sp; 1465 p[0] = (u64)argc; 1466 for (int i = 0; i < argc; i++) p[1 + i] = strs[i]; 1467 p[1 + argc] = 0; /* argv terminator */ 1468 p[2 + argc] = 0; /* envp terminator */ 1469 return sp; 1470 } 1471 1472 /* ─── kmain ─────────────────────────────────────────────────────────────── */ 1473 1474 void kmain(u64 dtb_phys) { 1475 arch_setup_mmu(); 1476 1477 /* Bring up heap immediately — placed at a 16MB-aligned offset above 1478 * our image, well clear of BSS/stack. Without -initrd reserving the 1479 * 0x44000000–0x4b000000 region, the full 176 MB is ours from boot. 
*/ 1480 u64 image_end = (u64)_end; 1481 kheap_ptr = (u8 *)((image_end + 0xfffful) & ~0xfffful); 1482 kheap_end = (u8 *)ARCH_KERNEL_HEAP_END; 1483 1484 uart_puts("\n[seed] "); uart_puts(ARCH_NAME); uart_puts(" boot, dtb="); 1485 uart_putx(dtb_phys); uart_puts("\n"); 1486 1487 struct dtb_info dt = {0}; 1488 parse_dtb((const void *)dtb_phys, &dt); 1489 uart_puts("[seed] mem "); uart_putx(dt.mem_start); 1490 uart_puts(" + "); uart_putx(dt.mem_size); uart_puts("\n"); 1491 uart_puts("[seed] virtio-mmio slots="); uart_putd((i64)dt.virtio_mmio_n); 1492 uart_puts("\n"); 1493 if (dt.bootargs[0]) { uart_puts("[seed] bootargs: "); uart_puts(dt.bootargs); uart_puts("\n"); } 1494 1495 /* Bring up virtio-blk: identifies blk0 (cpio) and blk1 (output). */ 1496 blk_init(&dt); 1497 1498 /* Reserve the cpio buffer at the top of kheap so we can release it 1499 * after parse_cpio. parse_cpio kallocs each file's data below cpio_buf; 1500 * once it returns, the cpio buffer's bytes are dead and we can let 1501 * subsequent kallocs use that space. 
*/ 1502 u64 in_cap_sec = blk_devs[g_blk_input].capacity_sectors; 1503 u64 in_cap_bytes = in_cap_sec * 512; 1504 u64 in_cap_aln = (in_cap_bytes + 0xfffUL) & ~0xfffUL; 1505 u8 *cpio_buf = (u8 *)((u64)kheap_end - in_cap_aln); 1506 u8 *kheap_end_full = kheap_end; 1507 kheap_end = cpio_buf; 1508 1509 if (blk_read(g_blk_input, 0, cpio_buf, in_cap_sec) < 0) { 1510 uart_puts("[seed] cpio read failed\n"); 1511 hang(); 1512 } 1513 parse_cpio(cpio_buf, in_cap_bytes); 1514 uart_puts("[seed] tmpfs:\n"); 1515 for (int i = 0; i < MAX_FILES; i++) { 1516 if (!files[i].used) continue; 1517 uart_puts(" /"); uart_puts(files[i].path); 1518 uart_puts(" ("); uart_putd((i64)files[i].len); uart_puts(" bytes)\n"); 1519 } 1520 1521 int init_idx = find_file("init"); 1522 if (init_idx < 0) { uart_puts("[seed] no /init in initrd, halting\n"); hang(); } 1523 1524 u64 entry = load_elf(files[init_idx].data); 1525 if (!entry) { uart_puts("[seed] load_elf failed\n"); hang(); } 1526 uart_puts("[seed] /init e_entry="); uart_putx(entry); uart_puts("\n"); 1527 1528 /* parse_cpio + load_elf are done — cpio buffer's bytes are dead. 1529 * Release the reserved tail of kheap for tmpfs file growth. */ 1530 kheap_end = kheap_end_full; 1531 1532 /* User runs in the L2-mapped low-VA window (USER_VA_LO..USER_VA_HI, 1533 * physically backed by pool A initially). Stack grows down from the top 1534 * of the window; brk grows up from above the loaded image's 1535 * end-of-bss (g_user_image_end, set by load_elf). 16 MB reserved at 1536 * the top for the user stack. */ 1537 u64 ustack_top = USER_VA_HI; 1538 /* See sys_spawn for why brk_base is page-rounded above end-of-image. */ 1539 brk_base = g_user_image_end ? g_user_image_end : USER_VA_LO; 1540 brk_base = (brk_base + 0xfffUL) & ~0xfffUL; 1541 brk_cur = brk_base; 1542 brk_max = USER_VA_HI - 0x01000000UL; 1543 1544 /* Build argv. Priority: 1545 * 1. DTB /chosen/bootargs (whitespace-tokenised — qemu -append "..."). 1546 * 2. 
/init.argv from the initramfs (one arg per line). 1547 * 3. Fallback: argc=1, argv[0]="init". 1548 * In all three cases, argv passed to user is exactly what the source 1549 * provided — no implicit argv[0]="init" prefix. */ 1550 static char argv_pool[512]; 1551 char *uargv[MAX_ARGV]; 1552 int uargc = 0; 1553 1554 if (dt.bootargs[0]) { 1555 int n = 0; 1556 while (dt.bootargs[n] && n < (int)sizeof(argv_pool) - 1) { 1557 argv_pool[n] = dt.bootargs[n]; n++; 1558 } 1559 argv_pool[n] = 0; 1560 uargc = tokenise(argv_pool, uargv, MAX_ARGV); 1561 } 1562 if (uargc == 0) { 1563 int aidx = find_file("init.argv"); 1564 if (aidx >= 0) { 1565 u64 n = files[aidx].len; 1566 if (n >= sizeof(argv_pool)) n = sizeof(argv_pool) - 1; 1567 for (u64 i = 0; i < n; i++) argv_pool[i] = (char)files[aidx].data[i]; 1568 argv_pool[n] = 0; 1569 uargc = tokenise(argv_pool, uargv, MAX_ARGV); 1570 } 1571 } 1572 if (uargc == 0) { 1573 argv_pool[0] = 'i'; argv_pool[1] = 'n'; argv_pool[2] = 'i'; 1574 argv_pool[3] = 't'; argv_pool[4] = 0; 1575 uargv[0] = argv_pool; 1576 uargc = 1; 1577 } 1578 uart_puts("[seed] argv:"); 1579 for (int i = 0; i < uargc; i++) { uart_puts(" "); uart_puts(uargv[i]); } 1580 uart_puts("\n"); 1581 1582 u64 user_sp = build_user_stack(ustack_top, uargc, uargv); 1583 1584 uart_puts("[seed] eret to user, sp="); uart_putx(user_sp); uart_puts("\n"); 1585 eret_to_user(entry, user_sp); 1586 /* unreachable */ 1587 }