commit d437cb28c8b53b20923897d9aaa74fd7b0244928
parent 0aa867eecc1187ba9b9ef6194511152c877eafc0
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Mon, 4 May 2026 19:54:16 -0700
seed-kernel: minimal arm64 OS satisfying docs/OS.md Tier 1
Boots via the Linux arm64 boot protocol (`-kernel` + `-initrd` on
QEMU virt), parses the DTB for memory + initrd, unpacks the cpio
newc into an in-memory tmpfs, loads /init as a static aarch64 ELF,
and ERETs into it at EL1t. SVC traps land in trap_sync() and dispatch
the eight Tier-1 syscalls (read, write, openat, close, lseek, brk,
unlinkat, exit_group). exit_group issues PSCI SYSTEM_OFF so qemu
exits cleanly.
Bring-up sets up an identity-map MMU (4× 1G L1 blocks: 0-1G Device,
1-4G Normal cacheable) — needed because gcc auto-vectorises the
DTB byte-by-byte BE readers into 64-bit unaligned loads, and with the
MMU off every data access is treated as Device memory, where unaligned
accesses fault.
Tier 2 (clone/execve/waitid pseudo-fork) is not yet wired; the
dispatcher returns ENOSYS for unknown calls.
Build with `make` inside boot2-alpine-gcc:aarch64; run with
./run.sh. End-to-end boot + tmpfs round-trip + exit takes ~100ms.
Diffstat:
7 files changed, 1078 insertions(+), 0 deletions(-)
diff --git a/seed-kernel/Makefile b/seed-kernel/Makefile
@@ -0,0 +1,48 @@
+# seed-kernel — minimal arm64 OS that satisfies docs/OS.md Tier 1.
+#
+# Build runs inside boot2-alpine-gcc:aarch64 (already arm64-native), so
+# everything compiles with the host toolchain — no cross prefixes.
+# Override CC / LD / OBJCOPY on the make command line to use other tools.
+
+CC      := gcc
+LD      := ld
+OBJCOPY := objcopy
+
+OUT       := build
+KOBJS     := $(OUT)/start.o $(OUT)/kernel.o
+KIMAGE    := $(OUT)/kernel.elf
+KBIN      := $(OUT)/Image
+# Deliberately not called USER: that name shadows the login-name environment
+# variable, and make would re-export it (holding this path) to every recipe.
+USER_BIN  := $(OUT)/init
+INITRAMFS := $(OUT)/initramfs.cpio
+
+CFLAGS_COMMON := -nostdlib -nostartfiles -ffreestanding -fno-stack-protector \
+                 -fno-pic -static -Wall -Wextra -O2 -mcmodel=large \
+                 -fno-asynchronous-unwind-tables -fno-unwind-tables
+KCFLAGS := $(CFLAGS_COMMON) -mgeneral-regs-only
+
+# Remove a target whose recipe failed (e.g. a half-written cpio archive) so
+# the next run rebuilds it instead of treating it as up to date.
+.DELETE_ON_ERROR:
+
+.PHONY: all clean kernel user initramfs
+all: $(KBIN) $(INITRAMFS)
+
+$(OUT):
+	mkdir -p $(OUT)
+
+$(OUT)/start.o: start.S | $(OUT)
+	$(CC) $(KCFLAGS) -c -o $@ $<
+
+$(OUT)/kernel.o: kernel.c | $(OUT)
+	$(CC) $(KCFLAGS) -c -o $@ $<
+
+$(KIMAGE): $(KOBJS) kernel.lds
+	$(LD) -nostdlib -static -T kernel.lds -o $@ $(KOBJS)
+
+# Strip ELF down to a flat binary that QEMU's -kernel can load.
+$(KBIN): $(KIMAGE)
+	$(OBJCOPY) -O binary $< $@
+
+$(USER_BIN): user/hello.c user/user.lds | $(OUT)
+	$(CC) $(CFLAGS_COMMON) -mgeneral-regs-only -T user/user.lds -o $@ $<
+
+# The archive holds a single entry named "init" (paths are relative to $(OUT)).
+$(INITRAMFS): $(USER_BIN)
+	cd $(OUT) && printf 'init\n' | cpio -o -H newc > initramfs.cpio
+
+kernel: $(KBIN)
+user: $(USER_BIN)
+initramfs: $(INITRAMFS)
+
+clean:
+	rm -rf $(OUT)
diff --git a/seed-kernel/kernel.c b/seed-kernel/kernel.c
@@ -0,0 +1,628 @@
+/* seed kernel — minimal OS satisfying docs/OS.md Tier 1.
+ *
+ * Boots via Linux arm64 boot protocol (-kernel/-initrd), parses the DTB
+ * to find initrd + memory, unpacks the cpio newc initramfs into an
+ * in-memory tmpfs, loads /init (a static aarch64 ELF), and ERETs into
+ * it at EL1t. SVC traps land in trap_sync() and dispatch the eight
+ * Tier-1 syscalls.
+ */
+
+/* Short fixed-width aliases used throughout this file. The 64-bit ones rely
+ * on `long` being 64 bits wide, which holds for the LP64 aarch64 target the
+ * Makefile builds for. */
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned long u64;
+typedef long i64;
+typedef int i32;
+
+/* ─── PL011 console ─────────────────────────────────────────────────────── */
+
+/* MMIO base of the console UART and the two registers this file touches:
+ * the data register at +0x00 and the flag register at +0x18. Bit 5 of the
+ * flag register is polled by uart_putc() as "transmit FIFO full". */
+#define UART0 0x09000000UL
+#define UART_DR ((volatile u32 *)(UART0 + 0x00))
+#define UART_FR ((volatile u32 *)(UART0 + 0x18))
+#define UART_FR_TXFF (1u << 5)
+
+/* Send one byte to the console: busy-wait until the flag register no longer
+ * reports a full TX FIFO, then store the byte in the data register. */
+static void uart_putc(char c) {
+    for (;;) {
+        if ((*UART_FR & UART_FR_TXFF) == 0) break;
+    }
+    *UART_DR = (u32)(u8)c;
+}
+
+/* Print a NUL-terminated string, emitting '\r' before every '\n' so serial
+ * terminals return to column 0. */
+static void uart_puts(const char *s) {
+    for (; *s != '\0'; s++) {
+        char ch = *s;
+        if (ch == '\n') {
+            uart_putc('\r');
+        }
+        uart_putc(ch);
+    }
+}
+
+/* Print v as "0x" followed by exactly 16 lowercase hex digits (most
+ * significant nibble first, leading zeros kept). */
+static void uart_putx(u64 v) {
+    static const char digits[] = "0123456789abcdef";
+    uart_puts("0x");
+    int shift = 64;
+    while (shift > 0) {
+        shift -= 4;
+        uart_putc(digits[(v >> shift) & 0xf]);
+    }
+}
+
+/* Print v in signed decimal.
+ *
+ * The magnitude is computed in unsigned arithmetic so that the most negative
+ * i64 value prints correctly; negating it as a signed value would overflow. */
+static void uart_putd(i64 v) {
+    u64 mag = (u64)v;
+    if (v < 0) {
+        uart_putc('-');
+        mag = (u64)0 - mag;
+    }
+    char buf[24];               /* 20 digits max for a 64-bit value */
+    int i = 0;
+    do {
+        buf[i++] = (char)('0' + (mag % 10));
+        mag /= 10;
+    } while (mag);
+    while (i--) uart_putc(buf[i]);  /* digits were produced least-significant first */
+}
+
+/* ─── Tiny libc-ish helpers ─────────────────────────────────────────────── */
+
+/* libgcc / freestanding ABI helpers gcc may call implicitly. */
+/* Fill n bytes at d with the low byte of c; returns d. Exported (non-static)
+ * because the compiler may emit calls to it implicitly. */
+void *memset(void *d, int c, u64 n) {
+    u8 *out = d;
+    u8 fill = (u8)c;
+    u64 k = 0;
+    while (k < n) {
+        out[k] = fill;
+        k++;
+    }
+    return d;
+}
+/* Copy n bytes from s to d in ascending address order; returns d. Exported
+ * (non-static) because the compiler may emit calls to it implicitly. */
+void *memcpy(void *d, const void *s, u64 n) {
+    u8 *out = d;
+    const u8 *in = s;
+    u64 k = 0;
+    while (k < n) {
+        out[k] = in[k];
+        k++;
+    }
+    return d;
+}
+
+/* Return 1 when the two NUL-terminated strings are identical, else 0. */
+static int str_eq(const char *a, const char *b) {
+    for (; *a != '\0'; a++, b++) {
+        if (*a != *b) break;
+    }
+    return (*a == '\0') && (*b == '\0');
+}
+/* Length of a NUL-terminated string, not counting the terminator. */
+static int str_n(const char *s) {
+    int len = 0;
+    for (const char *q = s; *q != '\0'; q++) len++;
+    return len;
+}
+/* Byte-wise forward copy of n bytes from s to d (no return value). */
+static void mem_cpy(void *d, const void *s, u64 n) {
+    u8 *out = d;
+    const u8 *in = s;
+    u64 k = 0;
+    while (k < n) {
+        out[k] = in[k];
+        k++;
+    }
+}
+/* Store the low byte of c into each of the n bytes starting at d. */
+static void mem_set(void *d, int c, u64 n) {
+    u8 *out = d;
+    u8 fill = (u8)c;
+    u64 k = 0;
+    while (k < n) {
+        out[k] = fill;
+        k++;
+    }
+}
+
+/* ─── MMU bring-up ──────────────────────────────────────────────────────── */
+/* Identity-map the first 4 GB at L1 (1 GB blocks). One page table — 4 KB.
+ * Entry 0 (0..1G): Device-nGnRnE (UART/GIC/virtio/flash live here)
+ * Entry 1 (1..2G): Normal WB-WA (RAM 0x40000000-)
+ * Entry 2 (2..3G): Normal WB-WA (extra RAM if -m > 1G)
+ * Entry 3 (3..4G): Normal WB-WA (above-RAM PCI on virt; rarely touched)
+ * With MMU on + Normal memory, unaligned loads/stores work — gcc's auto-
+ * vectorised 64-bit load in be64() stops trapping.
+ */
+__attribute__((aligned(4096))) static u64 l1_pt[512];
+
+/* Install the identity map described above and switch the MMU on.
+ *
+ * Fills l1_pt with four 1 GB block descriptors, programs MAIR_EL1, TCR_EL1
+ * and TTBR0_EL1, invalidates the I-cache and TLB, then sets SCTLR_EL1.M
+ * while clearing A (alignment check) and WXN.
+ *
+ * Every asm statement carries a "memory" clobber: without it the compiler is
+ * free to sink the page-table stores below the barriers or even below the
+ * SCTLR write, because nothing else tells it the hardware reads l1_pt. */
+static void setup_mmu(void) {
+    /* AP=00 (RW EL1 only — keep EL0 out for now), SH=ISH, AF=1, AttrIdx=0/1.
+     * Bits: V(0)=1, block(1)=0, AttrIdx[4:2], NS(5)=0, AP[7:6]=00, SH[9:8]=11,
+     * AF(10)=1, nG(11)=0 → 0x701 (Normal) / 0x705 (Device) */
+    const u64 normal = 0x701;
+    const u64 device = 0x705;
+
+    for (int i = 0; i < 512; i++) l1_pt[i] = 0;
+    l1_pt[0] = 0x00000000UL | device;
+    l1_pt[1] = 0x40000000UL | normal;
+    l1_pt[2] = 0x80000000UL | normal;
+    l1_pt[3] = 0xc0000000UL | normal;
+
+    /* MAIR: Attr0 = 0xff (Normal WB-WA), Attr1 = 0x00 (Device-nGnRnE) */
+    u64 mair = 0x00000000000000ffUL;
+    asm volatile("msr mair_el1, %0" :: "r"(mair) : "memory");
+
+    u64 tcr = (u64)25               /* T0SZ: 39-bit VA */
+            | ((u64)1 << 8)         /* IRGN0 = WBWA */
+            | ((u64)1 << 10)        /* ORGN0 = WBWA */
+            | ((u64)3 << 12)        /* SH0 = inner shareable */
+            | ((u64)0 << 14)        /* TG0 = 4KB */
+            | ((u64)1 << 23)        /* EPD1 = disable TTBR1 walks */
+            | ((u64)2 << 32);       /* IPS = 40-bit phys */
+    asm volatile("msr tcr_el1, %0" :: "r"(tcr) : "memory");
+    asm volatile("msr ttbr0_el1, %0" :: "r"((u64)l1_pt) : "memory");
+
+    asm volatile("ic iallu" ::: "memory");      /* invalidate I-cache */
+    asm volatile("dsb ish" ::: "memory");       /* table stores + IC complete */
+    asm volatile("tlbi vmalle1" ::: "memory");
+    asm volatile("dsb ish" ::: "memory");
+    asm volatile("isb" ::: "memory");
+
+    u64 sctlr;
+    asm volatile("mrs %0, sctlr_el1" : "=r"(sctlr) :: "memory");
+    sctlr &= ~(u64)((1 << 1) | (1 << 19));      /* clear A, WXN */
+    sctlr |= (u64)(1 << 0);                     /* M (MMU) only — caches stay off */
+    asm volatile("msr sctlr_el1, %0" :: "r"(sctlr) : "memory");
+    asm volatile("isb" ::: "memory");
+}
+
+/* ─── Kernel heap (bump allocator) ──────────────────────────────────────── */
+
+extern char _end[];
+static u8 *kheap_ptr;
+static u8 *kheap_end;
+
+/* Bump-allocate n bytes (rounded up to a multiple of 16) from the kernel
+ * heap. There is no free(); memory is never returned.
+ *
+ * On exhaustion this prints a message and halts forever — it never returns
+ * NULL. The size check is done on byte counts rather than on
+ * `kheap_ptr + n`, so a huge n cannot wrap the pointer (or the rounding)
+ * and slip past the limit. */
+static void *kalloc(u64 n) {
+    u64 rounded = (n + 15) & ~15UL;
+    u64 avail = (kheap_ptr <= kheap_end) ? (u64)(kheap_end - kheap_ptr) : 0;
+    if (rounded < n || rounded > avail) {   /* rounded < n: n + 15 wrapped */
+        uart_puts("kalloc: out of memory\n");
+        for (;;) asm volatile("wfe");
+    }
+    void *r = kheap_ptr;
+    kheap_ptr += rounded;
+    return r;
+}
+
+/* ─── Big-endian readers (DTB is BE) ────────────────────────────────────── */
+
+/* Assemble a big-endian 32-bit value from four bytes at p (any alignment). */
+static u32 be32(const u8 *p) {
+    u32 v = 0;
+    for (int k = 0; k < 4; k++) {
+        v = (v << 8) | (u32)p[k];
+    }
+    return v;
+}
+/* Assemble a big-endian 64-bit value from eight bytes at p: the first four
+ * bytes form the high word, the next four the low word. */
+static u64 be64(const u8 *p) {
+    u64 hi = (u64)be32(p);
+    u64 lo = (u64)be32(p + 4);
+    return (hi << 32) | lo;
+}
+
+/* ─── Flattened Device Tree walker ──────────────────────────────────────── */
+
+/* Header magic and structure-block token values recognised by parse_dtb().
+ * All are compared against 32-bit big-endian words read with be32(). */
+#define FDT_MAGIC 0xd00dfeedu
+#define FDT_BEGIN_NODE 1
+#define FDT_END_NODE 2
+#define FDT_PROP 3
+#define FDT_NOP 4
+#define FDT_END 9
+
+/* Everything the kernel extracts from the device tree. Fields that
+ * parse_dtb() does not find keep whatever value the caller initialised them
+ * to (kmain zero-initialises the struct). */
+struct dtb_info {
+ u64 initrd_start; /* /chosen "linux,initrd-start" */
+ u64 initrd_end; /* /chosen "linux,initrd-end" */
+ u64 mem_start; /* first two 64-bit cells of the memory node's "reg" */
+ u64 mem_size;
+ char bootargs[256]; /* /chosen "bootargs", truncated to 255 chars + NUL */
+};
+
+/* Walk the flattened device tree at `dtb` and fill `out` with the initrd
+ * range and bootargs from /chosen and the first "reg" pair of the memory
+ * node. Fields that are not found are left untouched.
+ *
+ * Only top-level children of the root are inspected (depth == 1). The memory
+ * "reg" is read as one 64-bit address followed by one 64-bit size.
+ *
+ * The walk is bounded by the header's totalsize, property payloads are
+ * length-checked before being read, and an unbalanced END_NODE cannot drive
+ * `depth` negative. On any malformed input a message is printed and parsing
+ * stops, keeping whatever was collected so far. */
+static void parse_dtb(const void *dtb, struct dtb_info *out) {
+    const u8 *base = dtb;
+    if (be32(base) != FDT_MAGIC) {
+        uart_puts("DTB: bad magic\n"); return;
+    }
+    u32 totalsize   = be32(base + 4);
+    u32 off_struct  = be32(base + 8);
+    u32 off_strings = be32(base + 12);
+    if (off_struct >= totalsize || off_strings >= totalsize) {
+        uart_puts("DTB: bad header\n"); return;
+    }
+    const u8 *end = base + totalsize;
+    const u8 *strings = base + off_strings;
+    u32 strings_len = totalsize - off_strings;
+    const u8 *p = base + off_struct;
+
+    char path[4][64] = {{0}};   /* node name at each of the first 4 levels */
+    int depth = -1;             /* -1 = outside the root node */
+
+    for (;;) {
+        if (p > end || (u64)(end - p) < 4) {
+            uart_puts("DTB: truncated\n");
+            break;
+        }
+        u32 tok = be32(p); p += 4;
+        if (tok == FDT_BEGIN_NODE) {
+            depth++;
+            const u8 *name = p;
+            while (p < end && *p) p++;
+            if (p >= end) {
+                uart_puts("DTB: truncated\n");
+                break;
+            }
+            if (depth >= 0 && depth < 4) {
+                int i = 0;
+                while (name[i] && i < 63) { path[depth][i] = (char)name[i]; i++; }
+                path[depth][i] = 0;
+            }
+            p++;                                        /* skip the NUL */
+            p = (const u8 *)(((u64)p + 3) & ~3UL);      /* tokens are 4-byte aligned */
+        } else if (tok == FDT_END_NODE) {
+            if (depth >= 0) depth--;
+        } else if (tok == FDT_PROP) {
+            if ((u64)(end - p) < 8) {
+                uart_puts("DTB: truncated\n");
+                break;
+            }
+            u32 len = be32(p); p += 4;
+            u32 nameoff = be32(p); p += 4;
+            if (len > (u64)(end - p) || nameoff >= strings_len) {
+                uart_puts("DTB: truncated\n");
+                break;
+            }
+            const char *pn = (const char *)(strings + nameoff);
+
+            if (depth == 1 && str_eq(path[1], "chosen")) {
+                /* initrd bounds may be stored as one or two 32-bit cells */
+                if (str_eq(pn, "linux,initrd-start")) {
+                    if (len >= 4)
+                        out->initrd_start = (len == 8) ? be64(p) : (u64)be32(p);
+                } else if (str_eq(pn, "linux,initrd-end")) {
+                    if (len >= 4)
+                        out->initrd_end = (len == 8) ? be64(p) : (u64)be32(p);
+                } else if (str_eq(pn, "bootargs")) {
+                    u32 i = 0;
+                    while (i < len && i < 255) { out->bootargs[i] = (char)p[i]; i++; }
+                    out->bootargs[i] = 0;
+                }
+            }
+            if (depth == 1) {
+                /* memory node is named "memory" or "memory@<addr>"; require
+                 * the terminator so "memory-controller" etc. do not match. */
+                const char *nm = path[1];
+                int is_mem = nm[0] == 'm' && nm[1] == 'e' && nm[2] == 'm' &&
+                             nm[3] == 'o' && nm[4] == 'r' && nm[5] == 'y' &&
+                             (nm[6] == '@' || nm[6] == 0);
+                if (is_mem && str_eq(pn, "reg") && len >= 16 && out->mem_size == 0) {
+                    out->mem_start = be64(p);
+                    out->mem_size = be64(p + 8);
+                }
+            }
+            p += len;
+            p = (const u8 *)(((u64)p + 3) & ~3UL);
+        } else if (tok == FDT_NOP) {
+            /* skip */
+        } else if (tok == FDT_END) {
+            break;
+        } else {
+            uart_puts("DTB: bad token "); uart_putx(tok); uart_puts("\n");
+            break;
+        }
+    }
+}
+
+/* ─── In-memory tmpfs from cpio newc ────────────────────────────────────── */
+
+/* Flat file table: at most MAX_FILES entries, no directories. Paths are
+ * stored without leading '/' and are limited to 63 characters. */
+#define MAX_FILES 64
+struct file {
+ int used; /* non-zero while the slot holds a live file */
+ char path[64];
+ u8 *data; /* kalloc'd contents; 0 for a file that was created but never written */
+ u64 len; /* current file size in bytes */
+ u64 cap; /* bytes allocated at data */
+};
+static struct file files[MAX_FILES];
+
+/* Look up `path` (leading '/' characters ignored) in the file table.
+ * Returns the slot index, or -1 when no live entry has that exact path. */
+static int find_file(const char *path) {
+    for (; *path == '/'; path++) { }
+    int idx = 0;
+    while (idx < MAX_FILES) {
+        const struct file *f = &files[idx];
+        if (f->used && str_eq(f->path, path)) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/* Claim the first free slot for `path` (leading '/' characters stripped,
+ * name truncated to 63 characters) and reset it to an empty file with no
+ * backing buffer. Returns the slot index, or -1 when the table is full. */
+static int new_file(const char *path) {
+    for (; *path == '/'; path++) { }
+
+    int slot = -1;
+    for (int i = 0; i < MAX_FILES; i++) {
+        if (!files[i].used) { slot = i; break; }
+    }
+    if (slot < 0) return -1;
+
+    struct file *f = &files[slot];
+    f->used = 1;
+    int k;
+    for (k = 0; k < 63 && path[k]; k++) {
+        f->path[k] = path[k];
+    }
+    f->path[k] = 0;
+    f->data = 0;
+    f->len = 0;
+    f->cap = 0;
+    return slot;
+}
+
+/* Parse exactly n ASCII hex digits at s into an integer. A character that is
+ * not a hex digit contributes a zero nibble; no error is reported. */
+static u64 hex_n(const char *s, int n) {
+    u64 acc = 0;
+    for (int k = 0; k < n; k++) {
+        char ch = s[k];
+        u64 nib = 0;
+        if (ch >= '0' && ch <= '9') {
+            nib = (u64)(ch - '0');
+        } else if (ch >= 'a' && ch <= 'f') {
+            nib = (u64)(ch - 'a' + 10);
+        } else if (ch >= 'A' && ch <= 'F') {
+            nib = (u64)(ch - 'A' + 10);
+        }
+        acc = (acc << 4) | nib;
+    }
+    return acc;
+}
+
+/* Unpack a cpio "newc" archive of `total` bytes into the file table.
+ *
+ * Each entry is a 110-byte ASCII header (magic "070701" followed by 8-digit
+ * hex fields), then the NUL-terminated name, then the data, with name and
+ * data each padded to 4 bytes. Only regular files are kept; their contents
+ * are copied into kalloc'd buffers so the archive memory is not referenced
+ * afterwards. Parsing stops at the "TRAILER!!!" entry, at a bad magic, or at
+ * the first entry whose name or data would run past the end of the archive.
+ */
+static void parse_cpio(const void *cpio, u64 total) {
+    const u8 *p = cpio;
+    const u8 *end = p + total;
+    while ((u64)(end - p) >= 110) {
+        if (!(p[0]=='0'&&p[1]=='7'&&p[2]=='0'&&p[3]=='7'&&p[4]=='0'&&p[5]=='1')) break;
+        u64 left = (u64)(end - p);
+        u64 mode = hex_n((const char *)(p + 6 + 1*8), 8);
+        u64 fsz  = hex_n((const char *)(p + 6 + 6*8), 8);
+        u64 nsz  = hex_n((const char *)(p + 6 + 11*8), 8);
+
+        /* fsz and nsz are at most 32 bits wide, so these sums cannot wrap. */
+        u64 hstride = (110 + nsz + 3) & ~3UL;
+        u64 fstride = (fsz + 3) & ~3UL;
+        const char *name = (const char *)(p + 110);
+
+        /* nsz includes the terminating NUL; refuse entries whose name or
+         * data would be read from beyond the archive. */
+        if (nsz == 0 || hstride > left || fsz > left - hstride ||
+            name[nsz - 1] != 0) {
+            uart_puts("cpio: truncated or malformed entry\n");
+            break;
+        }
+        if (str_eq(name, "TRAILER!!!")) break;
+
+        const u8 *fdata = p + hstride;
+
+        int is_reg = ((mode & 0xf000) == 0x8000);
+        if (is_reg && !str_eq(name, ".")) {
+            int idx = new_file(name);
+            if (idx >= 0) {
+                /* Copy out — we'll let the user write back later if needed. */
+                files[idx].data = kalloc(fsz ? fsz : 1);
+                files[idx].cap = fsz ? fsz : 1;
+                files[idx].len = fsz;
+                if (fsz) mem_cpy(files[idx].data, fdata, fsz);
+            } else {
+                uart_puts("cpio: file table full, skipping ");
+                uart_puts(name);
+                uart_puts("\n");
+            }
+        }
+
+        /* The final entry's data may lack trailing padding; stop rather
+         * than step past `end`. */
+        if (fstride > left - hstride) break;
+        p += hstride + fstride;
+    }
+}
+
+/* ─── ELF64 static loader ───────────────────────────────────────────────── */
+
+/* ELF64 file header, fields in on-disk order. */
+struct ehdr {
+    u8  e_ident[16];
+    u16 e_type;
+    u16 e_machine;
+    u32 e_version;
+    u64 e_entry;
+    u64 e_phoff;
+    u64 e_shoff;
+    u32 e_flags;
+    u16 e_ehsize;
+    u16 e_phentsize;
+    u16 e_phnum;
+    u16 e_shentsize;
+    u16 e_shnum;
+    u16 e_shstrndx;
+};
+
+/* ELF64 program header, fields in on-disk order. */
+struct phdr {
+    u32 p_type;
+    u32 p_flags;
+    u64 p_offset;
+    u64 p_vaddr;
+    u64 p_paddr;
+    u64 p_filesz;
+    u64 p_memsz;
+    u64 p_align;
+};
+
+#define PT_LOAD 1
+
+/* Load a static ELF64 little-endian aarch64 executable that is already in
+ * memory at `elf`: copy every PT_LOAD segment to its p_vaddr (memory is
+ * identity-mapped, so that is also the physical address) and zero the
+ * memsz - filesz tail.
+ *
+ * Returns the entry address, or 0 after printing a diagnostic when the image
+ * is rejected. Besides magic and machine, the header is checked for 64-bit
+ * class, little-endian data, a program-header entry size large enough for
+ * struct phdr, and p_filesz <= p_memsz.
+ *
+ * The caller supplies no image length, so offsets inside the file cannot be
+ * bounds-checked here, and segment destinations are not checked against
+ * kernel memory. */
+static u64 load_elf(const u8 *elf) {
+    const struct ehdr *eh = (const struct ehdr *)elf;
+    if (!(eh->e_ident[0] == 0x7f && eh->e_ident[1] == 'E' &&
+          eh->e_ident[2] == 'L' && eh->e_ident[3] == 'F')) {
+        uart_puts("ELF: bad magic\n"); return 0;
+    }
+    if (eh->e_ident[4] != 2 || eh->e_ident[5] != 1) {  /* ELFCLASS64, ELFDATA2LSB */
+        uart_puts("ELF: not 64-bit little-endian\n"); return 0;
+    }
+    if (eh->e_machine != 0xb7) {  /* EM_AARCH64 */
+        uart_puts("ELF: not aarch64\n"); return 0;
+    }
+    if (eh->e_phnum && eh->e_phentsize < sizeof(struct phdr)) {
+        uart_puts("ELF: bad phentsize\n"); return 0;
+    }
+    for (int i = 0; i < eh->e_phnum; i++) {
+        const struct phdr *ph = (const struct phdr *)(elf + eh->e_phoff + (u64)i * eh->e_phentsize);
+        if (ph->p_type != PT_LOAD) continue;
+        if (ph->p_filesz > ph->p_memsz) {
+            uart_puts("ELF: segment filesz > memsz\n"); return 0;
+        }
+        u8 *dst = (u8 *)ph->p_vaddr;
+        const u8 *src = elf + ph->p_offset;
+        mem_cpy(dst, src, ph->p_filesz);
+        if (ph->p_memsz > ph->p_filesz)
+            mem_set(dst + ph->p_filesz, 0, ph->p_memsz - ph->p_filesz);
+    }
+    /* I-cache sync (cheap insurance even with caches off). */
+    asm volatile("dsb sy" ::: "memory");
+    asm volatile("ic iallu" ::: "memory");
+    asm volatile("dsb sy" ::: "memory");
+    asm volatile("isb" ::: "memory");
+    return eh->e_entry;
+}
+
+/* ─── Syscall layer (Tier 1) ────────────────────────────────────────────── */
+
+/* Open-file table. Descriptors 0-2 never use a slot: the syscalls handle
+ * them specially (0 reads as EOF, 1 and 2 go to the UART), and openat only
+ * hands out descriptors from 3 upwards. */
+#define MAX_FD 32
+struct fdent { int used; int fidx; u64 pos; int wflag; int append; };
+static struct fdent fdtab[MAX_FD];
+
+/* User program break (single-process). */
+static u64 brk_base;
+static u64 brk_cur;
+static u64 brk_max;
+
+/* errno values; syscalls return them negated. */
+#define EBADF 9
+#define ENOENT 2
+#define EINVAL 22
+#define EMFILE 24
+#define EFAULT 14
+#define ENOSPC 28
+
+/* openat flag bits (octal). The low two bits select the access mode. */
+#define O_RDONLY 0
+#define O_WRONLY 1
+#define O_RDWR 2
+#define O_CREAT 0100
+#define O_TRUNC 01000
+#define O_APPEND 02000
+
+#define AT_FDCWD (-100)
+
+/* Syscall numbers, as passed by user code in x8 (see trap_sync). */
+#define SYS_unlinkat 35
+#define SYS_openat 56
+#define SYS_close 57
+#define SYS_lseek 62
+#define SYS_read 63
+#define SYS_write 64
+#define SYS_exit_group 93
+#define SYS_brk 214
+
+/* write(fd, buf, len).
+ *
+ * fd 1 and 2 send the bytes to the UART unmodified. Any other descriptor
+ * must be open for writing and refers to a tmpfs file: data is stored at the
+ * descriptor's position (or at end of file for O_APPEND), growing the
+ * backing buffer by doubling when necessary.
+ *
+ * Returns the number of bytes written, or a negative errno:
+ *   -EBADF   bad descriptor or not opened for writing
+ *   -EFAULT  buf is NULL with len > 0
+ *   -ENOSPC  the resulting size overflows or the kernel heap cannot hold it
+ *
+ * A zero-length write is a no-op, a gap left by seeking past end of file is
+ * zero-filled rather than exposing stale heap contents, and running out of
+ * heap is reported to the caller instead of halting inside kalloc(). */
+static i64 sys_write(int fd, const void *buf, u64 len) {
+    if (!buf && len) return -EFAULT;
+    if (fd == 1 || fd == 2) {
+        const u8 *s = buf;
+        for (u64 i = 0; i < len; i++) uart_putc((char)s[i]);
+        return (i64)len;
+    }
+    if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used || !fdtab[fd].wflag) return -EBADF;
+    if (len == 0) return 0;
+
+    struct file *f = &files[fdtab[fd].fidx];
+    u64 pos = fdtab[fd].append ? f->len : fdtab[fd].pos;
+    u64 need = pos + len;
+    if (need < pos) return -ENOSPC;             /* pos + len wrapped */
+
+    if (need > f->cap) {
+        u64 ncap = f->cap ? f->cap : 64;
+        while (ncap < need) {
+            if (ncap > ~(u64)0 / 2) return -ENOSPC;     /* doubling would wrap */
+            ncap *= 2;
+        }
+        /* kalloc rounds up by at most 15 bytes and halts on exhaustion, so
+         * make sure the request fits before calling it. */
+        u64 room = (kheap_ptr <= kheap_end) ? (u64)(kheap_end - kheap_ptr) : 0;
+        if (room < 15 || ncap > room - 15) return -ENOSPC;
+        u8 *nd = kalloc(ncap);
+        if (f->len) mem_cpy(nd, f->data, f->len);
+        f->data = nd;
+        f->cap = ncap;
+    }
+    if (pos > f->len) mem_set(f->data + f->len, 0, pos - f->len);
+    mem_cpy(f->data + pos, buf, len);
+    if (need > f->len) f->len = need;
+    fdtab[fd].pos = need;
+    return (i64)len;
+}
+
+/* read(fd, buf, len).
+ *
+ * fd 0 always reports end of file. Other descriptors read from the tmpfs
+ * file at the descriptor's position and advance it.
+ *
+ * Returns the number of bytes copied (0 at or past end of file), -EBADF for
+ * a bad descriptor, or -EFAULT when buf is NULL with len > 0.
+ *
+ * The request is clamped with `len > f->len - pos` instead of
+ * `pos + len > f->len`, so a huge len cannot wrap the sum and turn into an
+ * enormous copy. */
+static i64 sys_read(int fd, void *buf, u64 len) {
+    if (fd == 0) return 0;
+    if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF;
+    if (!buf && len) return -EFAULT;
+    struct file *f = &files[fdtab[fd].fidx];
+    u64 pos = fdtab[fd].pos;
+    if (pos >= f->len) return 0;
+    u64 remaining = f->len - pos;
+    u64 n = (len > remaining) ? remaining : len;
+    mem_cpy(buf, f->data + pos, n);
+    fdtab[fd].pos = pos + n;
+    return (i64)n;
+}
+
+/* openat(dirfd, path, flags, mode). dirfd and mode are ignored: the tmpfs is
+ * flat and has no permissions.
+ *
+ * Returns a descriptor >= 3, or a negative errno:
+ *   -EFAULT  path is NULL
+ *   -ENOENT  file does not exist and O_CREAT was not given
+ *   -EMFILE  no free descriptor
+ *   -ENOSPC  O_CREAT requested but the file table is full
+ *
+ * A descriptor is reserved before the file is created or truncated, so a
+ * failed open leaves the filesystem unchanged. */
+static i64 sys_openat(int dirfd, const char *path, int flags, int mode) {
+    (void)dirfd; (void)mode;
+    if (!path) return -EFAULT;
+
+    int fidx = find_file(path);
+    if (fidx < 0 && !(flags & O_CREAT)) return -ENOENT;
+
+    int fd = -1;
+    for (int i = 3; i < MAX_FD; i++) {
+        if (!fdtab[i].used) { fd = i; break; }
+    }
+    if (fd < 0) return -EMFILE;
+
+    if (fidx < 0) {
+        fidx = new_file(path);
+        if (fidx < 0) return -ENOSPC;
+    } else if (flags & O_TRUNC) {
+        files[fidx].len = 0;
+    }
+
+    fdtab[fd].used = 1;
+    fdtab[fd].fidx = fidx;
+    fdtab[fd].pos = 0;
+    fdtab[fd].wflag = (flags & 3) != 0;     /* O_WRONLY or O_RDWR */
+    fdtab[fd].append = (flags & O_APPEND) ? 1 : 0;
+    return fd;
+}
+
+/* close(fd): release a descriptor slot. Descriptors 0-2 and slots that are
+ * not open yield -EBADF; success returns 0. */
+static i64 sys_close(int fd) {
+    int in_range = (fd >= 3) && (fd < MAX_FD);
+    if (!in_range || !fdtab[fd].used) {
+        return -EBADF;
+    }
+    fdtab[fd].used = 0;
+    return 0;
+}
+
+/* lseek(fd, off, whence) with whence 0 = SEEK_SET, 1 = SEEK_CUR,
+ * 2 = SEEK_END.
+ *
+ * Returns the new position, -EBADF for a bad descriptor, or -EINVAL for an
+ * unknown whence or a result that is negative or not representable. The
+ * range is checked before `base + off` is computed, because signed overflow
+ * is undefined behaviour. */
+static i64 sys_lseek(int fd, i64 off, int whence) {
+    if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF;
+    struct file *f = &files[fdtab[fd].fidx];
+    i64 base;
+    if (whence == 0) base = 0;
+    else if (whence == 1) base = (i64)fdtab[fd].pos;
+    else if (whence == 2) base = (i64)f->len;
+    else return -EINVAL;
+
+    const i64 i64_max = (i64)(~(u64)0 >> 1);
+    if (base < 0) return -EINVAL;                       /* position beyond i64 range */
+    if (off > 0 && base > i64_max - off) return -EINVAL;
+    i64 np = base + off;        /* cannot overflow: base >= 0 and the sum is bounded above */
+    if (np < 0) return -EINVAL;
+    fdtab[fd].pos = (u64)np;
+    return np;
+}
+
+/* brk(addr): move the program break when addr lies within
+ * [brk_base, brk_max]. The return value is always the break in effect after
+ * the call, so a rejected request (or addr == 0) reports the unchanged
+ * break. */
+static i64 sys_brk(u64 addr) {
+    int acceptable = (addr != 0) && (addr >= brk_base) && (addr <= brk_max);
+    if (acceptable) {
+        brk_cur = addr;
+    }
+    return (i64)brk_cur;
+}
+
+/* unlinkat(dirfd, path, flags): remove a file from the tmpfs. dirfd and
+ * flags are ignored. Returns 0, -ENOENT when the path does not exist, or
+ * -EFAULT for a NULL path.
+ *
+ * Descriptors still open on the file are invalidated as well. File slots
+ * are recycled by new_file(), so a surviving descriptor would otherwise end
+ * up reading and writing whichever file is created in that slot next.
+ * NOTE(review): this departs from POSIX, where an unlinked file stays
+ * usable through descriptors that are already open. */
+static i64 sys_unlinkat(int dirfd, const char *path, int flags) {
+    (void)dirfd; (void)flags;
+    if (!path) return -EFAULT;
+    int fidx = find_file(path);
+    if (fidx < 0) return -ENOENT;
+    for (int fd = 3; fd < MAX_FD; fd++) {
+        if (fdtab[fd].used && fdtab[fd].fidx == fidx) fdtab[fd].used = 0;
+    }
+    files[fidx].used = 0;
+    return 0;
+}
+
+/* Exit status recorded by sys_exit(); nothing else in this file reads them. */
+static int g_exit_code = 0;
+static int g_exited = 0;
+
+/* exit_group(code): record the status, announce it on the console, then ask
+ * firmware to power the machine off. Does not return to the caller: if both
+ * power-off attempts come back, the function spins in wfi forever. */
+static void sys_exit(int code) {
+ g_exit_code = code;
+ g_exited = 1;
+ uart_puts("\n[seed] user exit_group("); uart_putd(code); uart_puts(")\n");
+ /* Try PSCI SYSTEM_OFF so QEMU exits cleanly; fall back to spin. */
+ /* The function ID 0x84000008 goes in x0; first via HVC, then via SMC in
+  * case HVC returned. */
+ register u64 x0 asm("x0") = 0x84000008;
+ asm volatile("hvc #0" : "+r"(x0));
+ register u64 x0s asm("x0") = 0x84000008;
+ asm volatile("smc #0" : "+r"(x0s));
+ for (;;) asm volatile("wfi");
+}
+
+/* ─── Trap dispatch (called from start.S vector handlers) ───────────────── */
+
+/* Register snapshot pushed by SAVE_TF in start.S. The layout must match the
+ * stores there: x0..x30 at byte offsets 0..240, ELR_EL1 at 248, SPSR_EL1 at
+ * 256 (the assembly reserves 272 bytes to keep sp 16-byte aligned). */
+struct trapframe {
+ u64 x[31];
+ u64 elr;
+ u64 spsr;
+};
+
+/* C entry points called from the vector handlers in start.S with
+ * x0 = ESR_EL1 and x1 = pointer to the saved trapframe. */
+i64 trap_sync(u64 esr, struct trapframe *tf);
+void trap_kernel(u64 esr, struct trapframe *tf);
+void trap_unhandled(u64 esr, struct trapframe *tf);
+
+/* Synchronous exception taken from the user program.
+ *
+ * An SVC (exception class 0x15) is treated as a syscall: number in x8,
+ * arguments in x0.., result written back to the saved x0 so the assembly
+ * stub restores it on return. Unknown numbers are logged and yield -38
+ * (ENOSYS). Any other exception class is fatal: the ESR, ELR and FAR are
+ * printed and the CPU parks in wfe forever. */
+i64 trap_sync(u64 esr, struct trapframe *tf) {
+    u32 ec = (u32)((esr >> 26) & 0x3f);
+
+    if (ec != 0x15) {
+        uart_puts("[seed] PANIC: user sync, ESR="); uart_putx(esr);
+        uart_puts(" ELR="); uart_putx(tf->elr);
+        uart_puts(" FAR=");
+        u64 far;
+        asm volatile("mrs %0, far_el1" : "=r"(far));
+        uart_putx(far);
+        uart_puts("\n");
+        for (;;) asm volatile("wfe");
+    }
+
+    /* SVC, AArch64 */
+    const u64 nr = tf->x[8];
+    const u64 arg0 = tf->x[0];
+    const u64 arg1 = tf->x[1];
+    const u64 arg2 = tf->x[2];
+    const u64 arg3 = tf->x[3];
+    i64 result;
+
+    switch (nr) {
+    case SYS_read:
+        result = sys_read((int)arg0, (void *)arg1, arg2);
+        break;
+    case SYS_write:
+        result = sys_write((int)arg0, (const void *)arg1, arg2);
+        break;
+    case SYS_openat:
+        result = sys_openat((int)arg0, (const char *)arg1, (int)arg2, (int)arg3);
+        break;
+    case SYS_close:
+        result = sys_close((int)arg0);
+        break;
+    case SYS_lseek:
+        result = sys_lseek((int)arg0, (i64)arg1, (int)arg2);
+        break;
+    case SYS_brk:
+        result = sys_brk(arg0);
+        break;
+    case SYS_unlinkat:
+        result = sys_unlinkat((int)arg0, (const char *)arg1, (int)arg2);
+        break;
+    case SYS_exit_group:
+        sys_exit((int)arg0);
+        result = 0;
+        break;
+    default:
+        uart_puts("[seed] ENOSYS "); uart_putd((i64)nr); uart_puts("\n");
+        result = -38;  /* ENOSYS */
+        break;
+    }
+
+    tf->x[0] = (u64)result;
+    return 0;
+}
+
+/* Synchronous exception taken while running on the kernel stack: always
+ * fatal. Prints ESR and the faulting ELR, then parks the CPU. */
+void trap_kernel(u64 esr, struct trapframe *tf) {
+    uart_puts("[seed] PANIC: kernel sync, ESR=");
+    uart_putx(esr);
+    uart_puts(" ELR=");
+    uart_putx(tf->elr);
+    uart_puts("\n");
+    while (1) {
+        asm volatile("wfe");
+    }
+}
+
+/* Catch-all for every vector slot that has no dedicated handler: always
+ * fatal. Prints ESR and ELR, then parks the CPU. */
+void trap_unhandled(u64 esr, struct trapframe *tf) {
+    uart_puts("[seed] PANIC: unhandled exception, ESR=");
+    uart_putx(esr);
+    uart_puts(" ELR=");
+    uart_putx(tf->elr);
+    uart_puts("\n");
+    while (1) {
+        asm volatile("wfe");
+    }
+}
+
+/* ─── User stack setup + entry ──────────────────────────────────────────── */
+
+extern void eret_to_user(u64 entry, u64 sp);
+
+/* Build the initial user stack below `stack_top` and return the stack
+ * pointer to start the program with.
+ *
+ * The argv0 string is placed at the very top; below it, from low to high
+ * addresses at the returned sp:
+ *
+ *   argc (=1), argv[0], NULL (argv end), NULL (envp end), AT_NULL, 0
+ *
+ * The trailing pair is an empty auxiliary vector. Start-up code that walks
+ * past envp to find auxv would otherwise read uninitialised memory.
+ *
+ * The space reserved for the string is sized from its length (minimum 32
+ * bytes, multiple of 16), so a long argv0 cannot run past stack_top.
+ * The returned sp is 16-byte aligned. */
+static u64 build_user_stack(u64 stack_top, const char *argv0) {
+    u64 n = (u64)str_n(argv0) + 1;              /* include the NUL */
+    u64 reserve = (n + 15) & ~15UL;
+    if (reserve < 32) reserve = 32;
+    char *str = (char *)(stack_top - reserve);
+    for (u64 i = 0; i < n; i++) str[i] = argv0[i];
+
+    u64 sp = (u64)str - 64;                     /* room for eight 8-byte slots */
+    sp &= ~15UL;
+    u64 *p = (u64 *)sp;
+    p[0] = 1;           /* argc */
+    p[1] = (u64)str;    /* argv[0] */
+    p[2] = 0;           /* argv terminator */
+    p[3] = 0;           /* envp terminator */
+    p[4] = 0;           /* auxv: AT_NULL */
+    p[5] = 0;           /* auxv: value for AT_NULL */
+    return sp;
+}
+
+/* ─── kmain ─────────────────────────────────────────────────────────────── */
+
+/* C entry point, called from start.S with the DTB physical address.
+ *
+ * Sequence: enable the MMU, set up the bump heap, parse the DTB, unpack the
+ * initrd into the tmpfs, load /init, build its stack and jump to it. Any
+ * failure prints a message and parks the CPU; the function does not return.
+ *
+ * Fixed memory layout used here:
+ *   image end (64 KB aligned) .. 0x42000000   kernel heap
+ *   0x42000000 ..                              /init image (user/user.lds)
+ *   .. 0x46000000                              user stack, growing down
+ *   0x46000000 .. 0x4a000000                   user brk region
+ */
+void kmain(u64 dtb_phys) {
+    setup_mmu();
+
+    /* Bring up heap immediately — it starts at the next 64 KB boundary above
+     * our image (clear of BSS/stack) and stops at 0x42000000, the address
+     * user/user.lds links /init at, so heap growth can never overwrite the
+     * loaded user program (and load_elf can never overwrite tmpfs data). */
+    u64 image_end = (u64)_end;
+    kheap_ptr = (u8 *)((image_end + 0xfffful) & ~0xfffful);
+    kheap_end = (u8 *)0x42000000UL;
+
+    uart_puts("\n[seed] arm64 boot, x0/dtb="); uart_putx(dtb_phys); uart_puts("\n");
+
+    struct dtb_info dt = {0};
+    parse_dtb((const void *)dtb_phys, &dt);
+    uart_puts("[seed] mem "); uart_putx(dt.mem_start);
+    uart_puts(" + "); uart_putx(dt.mem_size); uart_puts("\n");
+    uart_puts("[seed] initrd "); uart_putx(dt.initrd_start);
+    uart_puts(" .. "); uart_putx(dt.initrd_end); uart_puts("\n");
+    if (dt.bootargs[0]) { uart_puts("[seed] bootargs: "); uart_puts(dt.bootargs); uart_puts("\n"); }
+
+    if (dt.initrd_start == 0 || dt.initrd_end <= dt.initrd_start) {
+        uart_puts("[seed] no initrd, halting\n");
+        for (;;) asm volatile("wfe");
+    }
+
+    parse_cpio((const void *)dt.initrd_start, dt.initrd_end - dt.initrd_start);
+    uart_puts("[seed] tmpfs:\n");
+    for (int i = 0; i < MAX_FILES; i++) {
+        if (!files[i].used) continue;
+        uart_puts("  /"); uart_puts(files[i].path);
+        uart_puts(" ("); uart_putd((i64)files[i].len); uart_puts(" bytes)\n");
+    }
+
+    int init_idx = find_file("init");
+    if (init_idx < 0) { uart_puts("[seed] no /init in initrd, halting\n"); for(;;) asm volatile("wfe"); }
+
+    u64 entry = load_elf(files[init_idx].data);
+    if (!entry) { uart_puts("[seed] load_elf failed\n"); for(;;) asm volatile("wfe"); }
+    uart_puts("[seed] /init e_entry="); uart_putx(entry); uart_puts("\n");
+
+    /* User stack at top of a reserved high region. brk above that. */
+    u64 ustack_top = 0x46000000UL;
+    brk_base = 0x46000000UL;
+    brk_cur = brk_base;
+    brk_max = 0x4a000000UL;
+
+    u64 user_sp = build_user_stack(ustack_top, "init");
+
+    uart_puts("[seed] eret to user, sp="); uart_putx(user_sp); uart_puts("\n");
+    eret_to_user(entry, user_sp);
+    /* unreachable */
+}
diff --git a/seed-kernel/kernel.lds b/seed-kernel/kernel.lds
@@ -0,0 +1,51 @@
+/* arm64 seed kernel link layout.
+ *
+ * QEMU `-machine virt` puts RAM at 0x40000000. With `-kernel` and the arm64
+ * Image header, QEMU loads us at RAM_BASE + text_offset = 0x40080000.
+ * The entry stub in start.S uses PC-relative adrp, but the Makefile builds
+ * kernel.c with -mcmodel=large -fno-pic, so C code may embed absolute symbol
+ * addresses. NOTE(review): treat the link base below as required to equal
+ * the load address unless that has been verified otherwise.
+ */
+
+ENTRY(_head)
+
+SECTIONS {
+ . = 0x40080000;
+
+ /* Image header + entry stub; must be first so the header sits at offset 0
+  * of the flat binary. */
+ .head.text : {
+ KEEP(*(.head.text))
+ }
+
+ .text : ALIGN(8) {
+ *(.text .text.*)
+ }
+
+ .rodata : ALIGN(8) {
+ *(.rodata .rodata.*)
+ }
+
+ .data : ALIGN(8) {
+ *(.data .data.*)
+ }
+
+ /* __bss_start/__bss_end bound the region start.S zeroes with 8-byte
+  * stores; both are 16-byte aligned. */
+ .bss : ALIGN(16) {
+ __bss_start = .;
+ *(.bss .bss.*)
+ *(COMMON)
+ . = ALIGN(16);
+ __bss_end = .;
+ }
+
+ /* 64KB kernel stack */
+ /* Placed after __bss_end, so the BSS-zeroing loop does not touch it. */
+ .stack : ALIGN(16) {
+ kstack_bottom = .;
+ . += 0x10000;
+ kstack_top = .;
+ }
+
+ /* _end: start of memory free for the kernel heap (kernel.c).
+  * _image_end: used by start.S to compute the header's image_size. */
+ _end = .;
+ _image_end = .;
+
+ /DISCARD/ : {
+ *(.note.*) *(.comment) *(.eh_frame)
+ }
+}
diff --git a/seed-kernel/run.sh b/seed-kernel/run.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+# Boot the seed kernel + initramfs in QEMU.
+#
+# Usage: ./run.sh [extra qemu args...]
+
+set -eu
+cd "$(dirname "$0")"
+
+KERNEL=build/Image
+INITRD=build/initramfs.cpio
+
+[ -f "$KERNEL" ] || { echo "missing $KERNEL — run 'make' first"; exit 1; }
+[ -f "$INITRD" ] || { echo "missing $INITRD — run 'make' first"; exit 1; }
+
+exec qemu-system-aarch64 \
+ -machine virt \
+ -cpu cortex-a72 \
+ -m 512M \
+ -nographic \
+ -no-reboot \
+ -kernel "$KERNEL" \
+ -initrd "$INITRD" \
+ "$@"
diff --git a/seed-kernel/start.S b/seed-kernel/start.S
@@ -0,0 +1,204 @@
+/* seed kernel — arm64 boot protocol entry, vector table, SVC handler. */
+
+/* 64-byte arm64 Image header. It is the first thing in .head.text, which
+ * kernel.lds places at the start of the image. */
+.section .head.text, "ax"
+.globl _head
+_head:
+ /* arm64 Image header (Documentation/arm64/booting.rst).
+ * code0 must be a valid instruction (a branch, in our case). */
+ b stext
+ .long 0
+ .quad 0x80000 /* text_offset (preferred load offset within RAM) */
+ .quad _image_end - _head /* image_size */
+ .quad 0xa /* flags: 4K pages, anywhere in physmem, LE */
+ .quad 0
+ .quad 0
+ .quad 0
+ .ascii "ARM\x64" /* magic */
+ .long 0 /* PE COFF offset (none) */
+
+/* First code executed. Masks all interrupts and, when entered at EL2, drops
+ * to EL1 via eret. Only x9 is used as scratch, so x0 (the DTB pointer) is
+ * still intact when in_el1 calls kmain. */
+stext:
+ /* Entry contract: x0 = DTB phys, MMU off, caches off, EL2 or EL1. */
+ msr daifset, #0xf
+
+ /* If we entered at EL2, drop to EL1. Otherwise we're already at EL1. */
+ mrs x9, CurrentEL
+ lsr x9, x9, #2
+ cmp x9, #2
+ b.ne in_el1
+
+ /* EL2 → EL1: set HCR_EL2.RW=1 (EL1 is AArch64), CNTHCTL/CNTVOFF defaults,
+ * SPSR=EL1h with DAIF masked, ELR=in_el1, eret. */
+ mov x9, #(1 << 31)
+ msr hcr_el2, x9
+ mov x9, #0x3c5 /* EL1h, DAIF=1111 */
+ msr spsr_el2, x9
+ adr x9, in_el1
+ msr elr_el2, x9
+ /* Make sure SP_EL1 is set before we eret to EL1 (else we land with
+ * an undefined SP). Use the same kernel stack we're about to install. */
+ adrp x9, kstack_top
+ add x9, x9, :lo12:kstack_top
+ msr sp_el1, x9
+ eret
+
+/* EL1 bring-up: install the kernel stack and vector table, zero BSS, then
+ * call kmain(x0 = DTB phys). Scratch registers are x1, x2 and x9 only, so
+ * x0 is passed through untouched. */
+in_el1:
+    /* Stack. */
+    adrp    x9, kstack_top
+    add     x9, x9, :lo12:kstack_top
+    mov     sp, x9
+
+    /* Vector table. */
+    adrp    x9, vector_table
+    add     x9, x9, :lo12:vector_table
+    msr     vbar_el1, x9
+    isb
+
+    /* Zero BSS, 8 bytes per store (kernel.lds keeps both bounds 16-byte
+     * aligned). Addresses are unsigned, so the loop exit uses b.hs rather
+     * than the signed b.ge. */
+    adrp    x1, __bss_start
+    add     x1, x1, :lo12:__bss_start
+    adrp    x2, __bss_end
+    add     x2, x2, :lo12:__bss_end
+1:  cmp     x1, x2
+    b.hs    2f
+    str     xzr, [x1], #8
+    b       1b
+2:
+    /* Hand control to C. x0 still = DTB phys (not clobbered above). */
+    bl      kmain
+
+    /* kmain shouldn't return. */
+hang:
+    wfe
+    b       hang
+
+
+/* ─── Exception vector table ──────────────────────────────────────────── */
+
+/* One vector slot: align to 0x80 and branch to the real handler. */
+.macro VENTRY label
+ .balign 0x80
+ b \label
+.endm
+
+/* 16 slots of 0x80 bytes each, table aligned to 0x800. in_el1 loads its
+ * address into VBAR_EL1. Within each group of four, only the first
+ * (synchronous) slot has a dedicated handler in the first three groups;
+ * everything else goes to `unhandled`. */
+.section .text, "ax"
+.balign 0x800
+.globl vector_table
+vector_table:
+ /* Current EL with SP_EL0 (we never run kernel like this — only user). */
+ VENTRY el1_sp0_sync /* 0x000: SVC from EL1t (our "user") */
+ VENTRY unhandled /* 0x080 */
+ VENTRY unhandled /* 0x100 */
+ VENTRY unhandled /* 0x180 */
+ /* Current EL with SP_ELx (kernel internal). */
+ VENTRY el1_spx_sync /* 0x200: panic on kernel sync fault */
+ VENTRY unhandled /* 0x280 */
+ VENTRY unhandled /* 0x300 */
+ VENTRY unhandled /* 0x380 */
+ /* Lower EL using AArch64 (EL0). Unused in this design but wired. */
+ VENTRY el1_sp0_sync /* 0x400 */
+ VENTRY unhandled /* 0x480 */
+ VENTRY unhandled /* 0x500 */
+ VENTRY unhandled /* 0x580 */
+ /* Lower EL using AArch32 (unused). */
+ VENTRY unhandled /* 0x600 */
+ VENTRY unhandled /* 0x680 */
+ VENTRY unhandled /* 0x700 */
+ VENTRY unhandled /* 0x780 */
+
+
+/* ─── Trap entry/exit ─────────────────────────────────────────────────────
+ * Save x0..x30 + ELR_EL1 + SPSR_EL1 onto the kernel stack as a trapframe,
+ * call C trap_sync(esr, &tf), restore, eret. The C handler reads/writes
+ * tf->x[0..7] for syscall args and return value, plus tf->x[8] for the
+ * syscall number.
+ */
+
+/* Push a trapframe on the current stack: x0..x30 at offsets 0..240, then
+ * ELR_EL1 at 248 and SPSR_EL1 at 256. 272 bytes are reserved (264 used) so
+ * sp stays 16-byte aligned. x10/x11 are reused as scratch for the system
+ * registers only after their own values have been stored. */
+.macro SAVE_TF
+ sub sp, sp, #272
+ stp x0, x1, [sp, #0]
+ stp x2, x3, [sp, #16]
+ stp x4, x5, [sp, #32]
+ stp x6, x7, [sp, #48]
+ stp x8, x9, [sp, #64]
+ stp x10, x11, [sp, #80]
+ stp x12, x13, [sp, #96]
+ stp x14, x15, [sp, #112]
+ stp x16, x17, [sp, #128]
+ stp x18, x19, [sp, #144]
+ stp x20, x21, [sp, #160]
+ stp x22, x23, [sp, #176]
+ stp x24, x25, [sp, #192]
+ stp x26, x27, [sp, #208]
+ stp x28, x29, [sp, #224]
+ str x30, [sp, #240]
+ mrs x10, elr_el1
+ mrs x11, spsr_el1
+ stp x10, x11, [sp, #248]
+.endm
+
+/* Inverse of SAVE_TF. ELR_EL1/SPSR_EL1 are reloaded first, using x10/x11 as
+ * scratch, and the general registers afterwards, so x10/x11 end up with
+ * their saved values. Any change the C handler made to the frame (e.g. the
+ * syscall result in x0) takes effect here. */
+.macro RESTORE_TF
+ ldp x10, x11, [sp, #248]
+ msr elr_el1, x10
+ msr spsr_el1, x11
+ ldr x30, [sp, #240]
+ ldp x28, x29, [sp, #224]
+ ldp x26, x27, [sp, #208]
+ ldp x24, x25, [sp, #192]
+ ldp x22, x23, [sp, #176]
+ ldp x20, x21, [sp, #160]
+ ldp x18, x19, [sp, #144]
+ ldp x16, x17, [sp, #128]
+ ldp x14, x15, [sp, #112]
+ ldp x12, x13, [sp, #96]
+ ldp x10, x11, [sp, #80]
+ ldp x8, x9, [sp, #64]
+ ldp x6, x7, [sp, #48]
+ ldp x4, x5, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #0]
+ add sp, sp, #272
+.endm
+
+/* Synchronous exception from the user program (syscall path): save state,
+ * call trap_sync(ESR_EL1, &trapframe), restore and return to the caller. */
+el1_sp0_sync:
+ SAVE_TF
+ mrs x0, esr_el1
+ mov x1, sp
+ bl trap_sync
+ RESTORE_TF
+ eret
+
+/* Synchronous exception taken while on the kernel stack. trap_kernel() in
+ * kernel.c prints a panic and never returns, so the restore/eret below is
+ * not reached with the current C handler. */
+el1_spx_sync:
+ /* Same shape as user sync — let C distinguish via SPSR/ESR if needed. */
+ SAVE_TF
+ mrs x0, esr_el1
+ mov x1, sp
+ bl trap_kernel
+ RESTORE_TF
+ eret
+
+/* Shared handler for every vector slot without a dedicated entry.
+ * trap_unhandled() in kernel.c prints a panic and never returns, so the
+ * restore/eret below is not reached with the current C handler. */
+unhandled:
+ SAVE_TF
+ mrs x0, esr_el1
+ mov x1, sp
+ bl trap_unhandled
+ RESTORE_TF
+ eret
+
+
+/* ─── eret_to_user(entry, sp) ─────────────────────────────────────────────
+ * Drop into the loaded user program. Runs at EL1t (same EL as kernel,
+ * but uses SP_EL0 — gives us a separate user stack; user and kernel share
+ * the kernel's identity-mapped address space). DAIF stays masked since we
+ * don't service interrupts. Does not return.
+ *
+ *   x0 = entry address (→ ELR_EL1)
+ *   x1 = initial user stack pointer (→ SP_EL0)
+ */
+.globl eret_to_user
+eret_to_user:
+    msr     sp_el0, x1
+    msr     elr_el1, x0
+    mov     x9, #0x3c4              /* EL1t, DAIF=1111 */
+    msr     spsr_el1, x9
+    /* Zero every general-purpose register so no kernel value leaks into the
+     * user program; argc/argv are read from the SysV stack layout, not from
+     * registers. */
+    mov     x0, xzr
+    mov     x1, xzr
+    mov     x2, xzr
+    mov     x3, xzr
+    mov     x4, xzr
+    mov     x5, xzr
+    mov     x6, xzr
+    mov     x7, xzr
+    mov     x8, xzr
+    mov     x9, xzr
+    mov     x10, xzr
+    mov     x11, xzr
+    mov     x12, xzr
+    mov     x13, xzr
+    mov     x14, xzr
+    mov     x15, xzr
+    mov     x16, xzr
+    mov     x17, xzr
+    mov     x18, xzr
+    mov     x19, xzr
+    mov     x20, xzr
+    mov     x21, xzr
+    mov     x22, xzr
+    mov     x23, xzr
+    mov     x24, xzr
+    mov     x25, xzr
+    mov     x26, xzr
+    mov     x27, xzr
+    mov     x28, xzr
+    mov     x29, xzr
+    mov     x30, xzr
+    eret
diff --git a/seed-kernel/user/hello.c b/seed-kernel/user/hello.c
@@ -0,0 +1,109 @@
+/* User-space "hello" for the seed kernel. Static aarch64 ELF, no libc.
+ * Speaks the Linux aarch64 syscall ABI directly via SVC. */
+
+/* 64-bit integer aliases (long is 64 bits on the aarch64 target). */
+typedef long i64;
+typedef unsigned long u64;
+
+/* Syscall numbers placed in x8 by sysc(); they must match the SYS_* values
+ * the kernel dispatches on in kernel.c. */
+#define SYS_write 64
+#define SYS_openat 56
+#define SYS_close 57
+#define SYS_read 63
+#define SYS_lseek 62
+#define SYS_brk 214
+#define SYS_exit_group 93
+
+/* Issue one syscall: number in x8, up to six arguments in x0..x5, result
+ * returned in x0. The variables are pinned to those registers so the `svc`
+ * instruction sees them in place; the "memory" clobber stops the compiler
+ * from caching memory contents across the call. */
+static i64 sysc(u64 nr, u64 a, u64 b, u64 c, u64 d, u64 e, u64 f) {
+ register u64 x8 asm("x8") = nr;
+ register u64 x0 asm("x0") = a;
+ register u64 x1 asm("x1") = b;
+ register u64 x2 asm("x2") = c;
+ register u64 x3 asm("x3") = d;
+ register u64 x4 asm("x4") = e;
+ register u64 x5 asm("x5") = f;
+ asm volatile("svc #0"
+ : "+r"(x0)
+ : "r"(x8), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5)
+ : "memory", "cc");
+ return (i64)x0;
+}
+
+/* Thin typed wrappers around sysc(); unused argument slots are passed as 0. */
+
+static i64 sys_write(int fd, const void *buf, u64 n) {
+    return sysc(SYS_write, (u64)fd, (u64)buf, n, 0, 0, 0);
+}
+
+/* Never returns: spins if the kernel ever comes back from exit_group. */
+static void sys_exit(int c) {
+    sysc(SYS_exit_group, (u64)c, 0, 0, 0, 0, 0);
+    while (1) { }
+}
+
+static i64 sys_openat(int dfd, const char *p, int fl, int mo) {
+    return sysc(SYS_openat, (u64)dfd, (u64)p, (u64)fl, (u64)mo, 0, 0);
+}
+
+static i64 sys_read(int fd, void *b, u64 n) {
+    return sysc(SYS_read, (u64)fd, (u64)b, n, 0, 0, 0);
+}
+
+static i64 sys_close(int fd) {
+    return sysc(SYS_close, (u64)fd, 0, 0, 0, 0, 0);
+}
+
+static i64 sys_brk(u64 a) {
+    return sysc(SYS_brk, a, 0, 0, 0, 0, 0);
+}
+
+/* gcc may emit calls to memset/memcpy for stack zeroing or struct copies. */
+/* Fill n bytes at d with the low byte of c; returns d. */
+void *memset(void *d, int c, u64 n) {
+    unsigned char *out = d;
+    unsigned char fill = (unsigned char)c;
+    u64 k = 0;
+    while (k < n) {
+        out[k] = fill;
+        k++;
+    }
+    return d;
+}
+/* Copy n bytes from s to d in ascending address order; returns d. */
+void *memcpy(void *d, const void *s, u64 n) {
+    unsigned char *out = d;
+    const unsigned char *in = s;
+    u64 k = 0;
+    while (k < n) {
+        out[k] = in[k];
+        k++;
+    }
+    return d;
+}
+
+/* Length of a NUL-terminated string, not counting the terminator. */
+static u64 strlen_(const char *s) {
+    const char *q = s;
+    while (*q != '\0') q++;
+    return (u64)(q - s);
+}
+/* Write a NUL-terminated string to fd 1 in a single syscall (no newline is
+ * appended). */
+static void puts_(const char *s) {
+    u64 len = strlen_(s);
+    sys_write(1, s, len);
+}
+
+/* Print v in signed decimal on fd 1, one syscall per character.
+ *
+ * The magnitude is computed in unsigned arithmetic so the most negative i64
+ * value prints correctly; negating it as a signed value would overflow. */
+static void put_d(i64 v) {
+    char buf[24];               /* 20 digits max for a 64-bit value */
+    int i = 0;
+    u64 mag = (u64)v;
+    if (v < 0) {
+        sys_write(1, "-", 1);
+        mag = (u64)0 - mag;
+    }
+    do {
+        buf[i++] = (char)('0' + (mag % 10));
+        mag /= 10;
+    } while (mag);
+    while (i--) sys_write(1, &buf[i], 1);   /* digits were produced in reverse */
+}
+
+/* Print v on fd 1 as "0x" plus exactly 16 lowercase hex digits, most
+ * significant nibble first, one syscall per digit. */
+static void put_x(u64 v) {
+    static const char digits[] = "0123456789abcdef";
+    sys_write(1, "0x", 2);
+    int shift = 64;
+    while (shift > 0) {
+        shift -= 4;
+        char c = digits[(v >> shift) & 0xf];
+        sys_write(1, &c, 1);
+    }
+}
+
+/* Program entry: exercises brk, tmpfs read and tmpfs write/read-back, then
+ * exits. Each syscall result is checked; a failing step prints a line and
+ * makes the program exit with status 1 instead of claiming success. In
+ * particular a failed open or read is never turned into a write with a
+ * negative (i.e. huge) length. */
+void _start(void) {
+    int failed = 0;
+
+    puts_("hello from user space (EL1t, identity-map MMU)\n");
+
+    /* Exercise brk: ask current break, push it up by 1 MiB, write+read. */
+    u64 b0 = (u64)sys_brk(0);
+    puts_("brk(0) = "); put_x(b0); puts_("\n");
+    u64 b1 = (u64)sys_brk(b0 + 0x100000);
+    puts_("brk(+1MB) = "); put_x(b1); puts_("\n");
+    if (b1 == b0 + 0x100000) {
+        char *p = (char *)b0;
+        for (int i = 0; i < 16; i++) p[i] = (char)('A' + i);
+        puts_("first 16 bytes of new heap: ");
+        sys_write(1, p, 16);
+        puts_("\n");
+    } else {
+        puts_("[user] FAIL: brk did not move\n");
+        failed = 1;
+    }
+
+    /* Exercise tmpfs: open /init (us), read first 4 bytes (ELF magic). */
+    int fd = (int)sys_openat(-100, "init", 0 /*O_RDONLY*/, 0);
+    puts_("openat(\"init\") = "); put_d(fd); puts_("\n");
+    if (fd >= 0) {
+        unsigned char m[4] = {0};
+        i64 n = sys_read(fd, m, 4);
+        puts_("read("); put_d(n); puts_(") magic = ");
+        for (int i = 0; i < 4; i++) {
+            char c[3] = { "0123456789abcdef"[m[i]>>4], "0123456789abcdef"[m[i]&0xf], ' ' };
+            sys_write(1, c, 3);
+        }
+        puts_("\n");
+        if (n != 4) failed = 1;
+        sys_close(fd);
+    } else {
+        failed = 1;
+    }
+
+    /* Write a new file and read it back. */
+    int wfd = (int)sys_openat(-100, "scratch", 0101 /*O_WRONLY|O_CREAT*/, 0644);
+    if (wfd >= 0) {
+        const char *msg = "boot2 says hi\n";
+        u64 msg_len = strlen_(msg);
+        if (sys_write(wfd, msg, msg_len) != (i64)msg_len) failed = 1;
+        sys_close(wfd);
+
+        int rfd = (int)sys_openat(-100, "scratch", 0, 0);
+        if (rfd >= 0) {
+            char buf[64] = {0};
+            i64 n = sys_read(rfd, buf, sizeof buf);
+            puts_("scratch readback ("); put_d(n); puts_(" bytes): ");
+            if (n > 0) sys_write(1, buf, (u64)n);
+            if (n != (i64)msg_len) failed = 1;
+            sys_close(rfd);
+        } else {
+            puts_("[user] FAIL: cannot reopen scratch\n");
+            failed = 1;
+        }
+    } else {
+        puts_("[user] FAIL: cannot create scratch\n");
+        failed = 1;
+    }
+
+    if (failed) {
+        puts_("[user] some checks FAILED, exiting 1\n");
+        sys_exit(1);
+    }
+    puts_("[user] all checks passed, exiting 0\n");
+    sys_exit(0);
+}
diff --git a/seed-kernel/user/user.lds b/seed-kernel/user/user.lds
@@ -0,0 +1,15 @@
+/* Link the user binary high enough to be clear of the kernel image
+ * (which sits at 0x40080000) and the initrd (placed by QEMU).
+ *
+ * The kernel copies each PT_LOAD segment to its link address without
+ * relocation, so the base chosen here must also stay outside the kernel's
+ * bump heap and below the user stack top set up in kernel.c's kmain(). */
+
+ENTRY(_start)
+
+SECTIONS {
+ . = 0x42000000;
+
+ .text : { *(.text .text.*) }
+ .rodata : ALIGN(8) { *(.rodata .rodata.*) }
+ .data : ALIGN(8) { *(.data .data.*) }
+ .bss : ALIGN(16) { *(.bss .bss.*) *(COMMON) }
+
+ /DISCARD/ : { *(.note.*) *(.comment) *(.eh_frame) }
+}