boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit d437cb28c8b53b20923897d9aaa74fd7b0244928
parent 0aa867eecc1187ba9b9ef6194511152c877eafc0
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Mon,  4 May 2026 19:54:16 -0700

seed-kernel: minimal arm64 OS satisfying docs/OS.md Tier 1

Boots via the Linux arm64 boot protocol (`-kernel` + `-initrd` on
QEMU virt), parses the DTB for memory + initrd, unpacks the cpio
newc into an in-memory tmpfs, loads /init as a static aarch64 ELF,
and ERETs into it at EL1t. SVC traps land in trap_sync() and dispatch
the eight Tier-1 syscalls (read, write, openat, close, lseek, brk,
unlinkat, exit_group). exit_group issues PSCI SYSTEM_OFF so qemu
exits cleanly.

Bring-up sets up an identity-map MMU (4× 1G L1 blocks: 0-1G Device,
1-4G Normal cacheable) — needed because gcc auto-vectorises the
DTB byte-by-byte BE readers into 64-bit unaligned loads, which fault
on Device memory with the MMU off.

Tier 2 (clone/execve/waitid pseudo-fork) is not yet wired; the
dispatcher returns ENOSYS for unknown calls.

Build with `make` inside boot2-alpine-gcc:aarch64; run with
./run.sh. End-to-end boot + tmpfs round-trip + exit takes ~100ms.

Diffstat:
Aseed-kernel/Makefile | 48++++++++++++++++++++++++++++++++++++++++++++++++
Aseed-kernel/kernel.c | 628+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aseed-kernel/kernel.lds | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
Aseed-kernel/run.sh | 23+++++++++++++++++++++++
Aseed-kernel/start.S | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aseed-kernel/user/hello.c | 109+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aseed-kernel/user/user.lds | 15+++++++++++++++
7 files changed, 1078 insertions(+), 0 deletions(-)

diff --git a/seed-kernel/Makefile b/seed-kernel/Makefile
@@ -0,0 +1,48 @@
+# seed-kernel — minimal arm64 OS that satisfies docs/OS.md Tier 1.
+#
+# Build runs inside boot2-alpine-gcc:aarch64 (already arm64-native), so
+# everything compiles with the host toolchain — no cross prefixes.
+
+# Every build product lands under $(OUT); `make clean` removes the directory.
+OUT := build
+KOBJS := $(OUT)/start.o $(OUT)/kernel.o
+KIMAGE := $(OUT)/kernel.elf
+KBIN := $(OUT)/Image
+USER := $(OUT)/init
+INITRAMFS := $(OUT)/initramfs.cpio
+
+# Flags shared by the kernel and the user program: freestanding, static,
+# non-PIC, large code model, no unwind tables.
+CFLAGS_COMMON := -nostdlib -nostartfiles -ffreestanding -fno-stack-protector \
+                 -fno-pic -static -Wall -Wextra -O2 -mcmodel=large \
+                 -fno-asynchronous-unwind-tables -fno-unwind-tables
+# -mgeneral-regs-only restricts generated code to the general-purpose registers.
+KCFLAGS := $(CFLAGS_COMMON) -mgeneral-regs-only
+
+.PHONY: all clean kernel user initramfs
+all: $(KBIN) $(INITRAMFS)
+
+$(OUT):
+	mkdir -p $(OUT)
+
+# `| $(OUT)` is an order-only prerequisite: the directory must exist, but its
+# timestamp never forces a rebuild.
+$(OUT)/start.o: start.S | $(OUT)
+	gcc $(KCFLAGS) -c -o $@ $<
+
+$(OUT)/kernel.o: kernel.c | $(OUT)
+	gcc $(KCFLAGS) -c -o $@ $<
+
+# Link the kernel ELF with the layout described in kernel.lds.
+$(KIMAGE): $(KOBJS) kernel.lds
+	ld -nostdlib -static -T kernel.lds -o $@ $(KOBJS)
+
+# Strip ELF down to a flat binary that QEMU's -kernel can load.
+$(KBIN): $(KIMAGE)
+	objcopy -O binary $< $@
+
+# The user program is compiled and linked in one step against user/user.lds.
+$(USER): user/hello.c user/user.lds | $(OUT)
+	gcc $(CFLAGS_COMMON) -mgeneral-regs-only -T user/user.lds -o $@ $<
+
+# The initramfs is a cpio "newc" archive holding the single entry `init`.
+$(INITRAMFS): $(USER)
+	cd $(OUT) && printf 'init\n' | cpio -o -H newc > initramfs.cpio
+
+# Convenience aliases for the individual artifacts.
+kernel: $(KBIN)
+user: $(USER)
+initramfs: $(INITRAMFS)
+
+clean:
+	rm -rf $(OUT)
diff --git a/seed-kernel/kernel.c b/seed-kernel/kernel.c
@@ -0,0 +1,628 @@
+/* seed kernel — minimal OS satisfying docs/OS.md Tier 1.
+ *
+ * Boots via Linux arm64 boot protocol (-kernel/-initrd), parses the DTB
+ * to find initrd + memory, unpacks the cpio newc initramfs into an
+ * in-memory tmpfs, loads /init (a static aarch64 ELF), and ERETs into
+ * it at EL1t. SVC traps land in trap_sync() and dispatch the eight
+ * Tier-1 syscalls. 
+ */
+
+/* Fixed-width aliases used throughout the kernel (LP64: long is 64-bit). */
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned long u64;
+typedef long i64;
+typedef int i32;
+
+/* ─── PL011 console ─────────────────────────────────────────────────────── */
+
+#define UART0 0x09000000UL
+#define UART_DR ((volatile u32 *)(UART0 + 0x00))
+#define UART_FR ((volatile u32 *)(UART0 + 0x18))
+#define UART_FR_TXFF (1u << 5)
+
+/* Busy-wait while the TXFF flag is set, then write one byte to the data
+ * register. */
+static void uart_putc(char c) {
+    while (*UART_FR & UART_FR_TXFF) { }
+    *UART_DR = (u32)(u8)c;
+}
+
+/* Emit a NUL-terminated string, sending '\r' before every '\n'. */
+static void uart_puts(const char *s) {
+    while (*s) {
+        if (*s == '\n') uart_putc('\r');
+        uart_putc(*s++);
+    }
+}
+
+/* Print v as "0x" followed by exactly 16 lowercase hex digits. */
+static void uart_putx(u64 v) {
+    static const char hex[] = "0123456789abcdef";
+    uart_puts("0x");
+    for (int i = 60; i >= 0; i -= 4) uart_putc(hex[(v >> i) & 0xf]);
+}
+
+/* Print v in signed decimal.
+ *
+ * The digits are produced from the unsigned magnitude: negating the most
+ * negative i64 in signed arithmetic overflows (undefined behaviour), while
+ * the unsigned negation below is well defined for every input. */
+static void uart_putd(i64 v) {
+    u64 mag = (u64)v;
+    if (v < 0) { uart_putc('-'); mag = 0 - mag; }
+    char buf[24]; /* a 64-bit value needs at most 20 decimal digits */
+    int i = 0;
+    if (mag == 0) buf[i++] = '0';
+    while (mag) { buf[i++] = (char)('0' + (mag % 10)); mag /= 10; }
+    while (i--) uart_putc(buf[i]); /* digits were generated least-significant first */
+}
+
+/* ─── Tiny libc-ish helpers ─────────────────────────────────────────────── */
+
+/* libgcc / freestanding ABI helpers gcc may call implicitly. */
+void *memset(void *d, int c, u64 n) {
+    u8 *dd = d; for (u64 i = 0; i < n; i++) dd[i] = (u8)c; return d;
+}
+void *memcpy(void *d, const void *s, u64 n) {
+    u8 *dd = d; const u8 *ss = s;
+    for (u64 i = 0; i < n; i++) dd[i] = ss[i];
+    return d;
+}
+
+/* Return 1 when the two NUL-terminated strings are identical, else 0. */
+static int str_eq(const char *a, const char *b) {
+    while (*a && *a == *b) { a++; b++; }
+    return *a == 0 && *b == 0;
+}
+/* Length of s, not counting the terminating NUL. */
+static int str_n(const char *s) { int n = 0; while (s[n]) n++; return n; }
+/* Forward byte copy of n bytes from s to d. */
+static void mem_cpy(void *d, const void *s, u64 n) {
+    u8 *dd = d; const u8 *ss = s;
+    for (u64 i = 0; i < n; i++) dd[i] = ss[i];
+}
+/* Fill n bytes at d with the low byte of c. */
+static void mem_set(void *d, int c, u64 n) {
+    u8 *dd = d;
+    for (u64 i = 0; i < n; i++) dd[i] = (u8)c;
+}
+
+/* ─── MMU bring-up ──────────────────────────────────────────────────────── */
+/* Identity-map the first 4 GB at L1 (1 GB blocks). 
One page table — 4 KB.
+ * Entry 0 (0..1G): Device-nGnRnE (UART/GIC/virtio/flash live here)
+ * Entry 1 (1..2G): Normal WB-WA (RAM 0x40000000-)
+ * Entry 2 (2..3G): Normal WB-WA (extra RAM if -m > 1G)
+ * Entry 3 (3..4G): Normal WB-WA (above-RAM PCI on virt; rarely touched)
+ * With MMU on + Normal memory, unaligned loads/stores work — gcc's auto-
+ * vectorised 64-bit load in be64() stops trapping.
+ */
+__attribute__((aligned(4096))) static u64 l1_pt[512];
+
+/* Build the four-entry identity map in l1_pt, program MAIR/TCR/TTBR0, and
+ * set SCTLR_EL1.M.
+ *
+ * The barrier and enable statements carry a "memory" clobber. Without it the
+ * compiler is free to sink the l1_pt stores below the point where the MMU is
+ * switched on, or to hoist later loads above it, because a plain
+ * `asm volatile` is not treated as touching memory. */
+static void setup_mmu(void) {
+    /* AP=00 (RW EL1 only — keep EL0 out for now), SH=ISH, AF=1, AttrIdx=0/1.
+     * Bits: V(0)=1, block(1)=0, AttrIdx[4:2], NS(5)=0, AP[7:6]=00, SH[9:8]=11,
+     * AF(10)=1, nG(11)=0 → 0x701 (Normal) / 0x705 (Device) */
+    u64 normal = 0x701;
+    u64 device = 0x705;
+
+    for (int i = 0; i < 512; i++) l1_pt[i] = 0;
+    l1_pt[0] = 0x00000000UL | device;
+    l1_pt[1] = 0x40000000UL | normal;
+    l1_pt[2] = 0x80000000UL | normal;
+    l1_pt[3] = 0xc0000000UL | normal;
+
+    /* MAIR: Attr0 = 0xff (Normal WB-WA), Attr1 = 0x00 (Device-nGnRnE) */
+    u64 mair = 0x00000000000000ffUL;
+    asm volatile("msr mair_el1, %0" :: "r"(mair));
+
+    u64 tcr = (u64)25             /* T0SZ: 39-bit VA */
+            | ((u64)1 << 8)       /* IRGN0 = WBWA */
+            | ((u64)1 << 10)      /* ORGN0 = WBWA */
+            | ((u64)3 << 12)      /* SH0 = inner shareable */
+            | ((u64)0 << 14)      /* TG0 = 4KB */
+            | ((u64)1 << 23)      /* EPD1 = disable TTBR1 walks */
+            | ((u64)2 << 32);     /* IPS = 40-bit phys */
+    asm volatile("msr tcr_el1, %0" :: "r"(tcr));
+    asm volatile("msr ttbr0_el1, %0" :: "r"((u64)l1_pt));
+
+    asm volatile("ic iallu" ::: "memory"); /* invalidate I-cache */
+    asm volatile("dsb ish" ::: "memory");  /* table stores must be complete before this barrier */
+    asm volatile("tlbi vmalle1" ::: "memory");
+    asm volatile("dsb ish" ::: "memory");
+    asm volatile("isb" ::: "memory");
+
+    u64 sctlr;
+    asm volatile("mrs %0, sctlr_el1" : "=r"(sctlr));
+    sctlr &= ~(u64)((1 << 1) | (1 << 19)); /* clear A, WXN */
+    sctlr |= (u64)(1 << 0); /* M (MMU) only — caches stay off */
+    asm volatile("msr sctlr_el1, %0" :: "r"(sctlr) : "memory");
+    asm volatile("isb" ::: "memory"); /* nothing after this may be hoisted above the enable */
+}
+
+/* ─── Kernel heap (bump allocator) 
──────────────────────────────────────── */
+
+extern char _end[];
+static u8 *kheap_ptr;
+static u8 *kheap_end;
+
+/* Bump-allocate n bytes, rounded up to a multiple of 16. Memory is never
+ * freed. On exhaustion a message is printed and the CPU parks in a wfe loop,
+ * so a returned pointer is always usable.
+ *
+ * The check compares sizes, not pointers: `kheap_ptr + n` can wrap for a
+ * huge n, and so can the round-up itself. */
+static void *kalloc(u64 n) {
+    u64 avail = (u64)(kheap_end - kheap_ptr);
+    u64 need = (n + 15) & ~15UL;
+    if (need < n || need > avail) { /* need < n: the round-up wrapped */
+        uart_puts("kalloc: out of memory\n");
+        for (;;) asm volatile("wfe");
+    }
+    void *r = kheap_ptr;
+    kheap_ptr += need;
+    return r;
+}
+
+/* ─── Big-endian readers (DTB is BE) ────────────────────────────────────── */
+
+/* Assemble a big-endian value one byte at a time; p needs no alignment. */
+static u32 be32(const u8 *p) { return (u32)p[0]<<24 | (u32)p[1]<<16 | (u32)p[2]<<8 | (u32)p[3]; }
+static u64 be64(const u8 *p) { return ((u64)be32(p) << 32) | (u64)be32(p + 4); }
+
+/* ─── Flattened Device Tree walker ──────────────────────────────────────── */
+
+#define FDT_MAGIC 0xd00dfeedu
+#define FDT_BEGIN_NODE 1
+#define FDT_END_NODE 2
+#define FDT_PROP 3
+#define FDT_NOP 4
+#define FDT_END 9
+
+/* What parse_dtb() extracts; the caller zero-initialises it first. */
+struct dtb_info {
+    u64 initrd_start;
+    u64 initrd_end;
+    u64 mem_start;
+    u64 mem_size;
+    char bootargs[256];
+};
+
+/* Walk the DTB structure block and fill *out from the /chosen properties
+ * (initrd bounds, bootargs) and the first top-level memory node's "reg".
+ * Prints a message and returns early on a bad magic or unknown token.
+ * No bounds checking is done against the blob size. */
+static void parse_dtb(const void *dtb, struct dtb_info *out) {
+    const u8 *base = dtb;
+    if (be32(base) != FDT_MAGIC) {
+        uart_puts("DTB: bad magic\n"); return;
+    }
+    u32 off_struct = be32(base + 8);   /* header word at byte 8 */
+    u32 off_strings = be32(base + 12); /* header word at byte 12 */
+    const u8 *strings = base + off_strings;
+    const u8 *p = base + off_struct;
+
+    /* Node names for the first four nesting levels; deeper names are not kept. */
+    char path[4][64] = {{0}};
+    int depth = -1;
+
+    for (;;) {
+        u32 tok = be32(p); p += 4;
+        if (tok == FDT_BEGIN_NODE) {
+            depth++;
+            if (depth < 4) {
+                int i = 0;
+                while (p[i] && i < 63) { path[depth][i] = (char)p[i]; i++; }
+                path[depth][i] = 0;
+            }
+            /* Skip the NUL-terminated name, then realign to 4 bytes. */
+            while (*p) p++;
+            p++;
+            p = (const u8 *)(((u64)p + 3) & ~3UL);
+        } else if (tok == FDT_END_NODE) {
+            depth--;
+        } else if (tok == FDT_PROP) {
+            u32 len = be32(p); p += 4;
+            u32 nameoff = be32(p); p += 4;
+            const char *pn = (const char *)(strings + nameoff);
+
+            if (depth == 1 && str_eq(path[1], "chosen")) {
+                if (str_eq(pn, "linux,initrd-start")) {
+                    out->initrd_start = (len == 8) ? 
be64(p) : (u64)be32(p);
+                } else if (str_eq(pn, "linux,initrd-end")) {
+                    /* The value is read as 64-bit when len is 8, else as 32-bit. */
+                    out->initrd_end = (len == 8) ? be64(p) : (u64)be32(p);
+                } else if (str_eq(pn, "bootargs")) {
+                    /* Copy at most 255 bytes and always NUL-terminate. */
+                    u32 i = 0;
+                    while (i < len && i < 255) { out->bootargs[i] = (char)p[i]; i++; }
+                    out->bootargs[i] = 0;
+                }
+            }
+            if (depth == 1) {
+                /* memory node is named "memory@<addr>" */
+                /* Only the first such node is used (mem_size is still 0), and
+                 * "reg" is read as two 64-bit cells: base, then size. */
+                if ((path[1][0] == 'm' && path[1][1] == 'e' && path[1][2] == 'm' &&
+                     path[1][3] == 'o' && path[1][4] == 'r' && path[1][5] == 'y') &&
+                    str_eq(pn, "reg") && len >= 16 && out->mem_size == 0) {
+                    out->mem_start = be64(p);
+                    out->mem_size = be64(p + 8);
+                }
+            }
+            /* Step over the property value and realign to 4 bytes. */
+            p += len;
+            p = (const u8 *)(((u64)p + 3) & ~3UL);
+        } else if (tok == FDT_NOP) {
+            /* skip */
+        } else if (tok == FDT_END) {
+            break;
+        } else {
+            uart_puts("DTB: bad token "); uart_putx(tok); uart_puts("\n");
+            break;
+        }
+    }
+}
+
+/* ─── In-memory tmpfs from cpio newc ────────────────────────────────────── */
+
+#define MAX_FILES 64
+/* One tmpfs entry: a flat path (stored without leading '/') and its bytes.
+ * len is the current size, cap the allocated size of data. */
+struct file {
+    int used;
+    char path[64];
+    u8 *data;
+    u64 len;
+    u64 cap;
+};
+static struct file files[MAX_FILES];
+
+/* Return the index of the in-use entry whose path equals `path` (leading
+ * slashes ignored), or -1 when there is none. */
+static int find_file(const char *path) {
+    while (*path == '/') path++;
+    for (int i = 0; i < MAX_FILES; i++) {
+        if (files[i].used && str_eq(files[i].path, path)) return i;
+    }
+    return -1;
+}
+
+/* Claim the first free slot for `path` (leading slashes dropped, name
+ * truncated to 63 characters) with no data attached. Returns the slot index,
+ * or -1 when all MAX_FILES slots are in use. */
+static int new_file(const char *path) {
+    while (*path == '/') path++;
+    for (int i = 0; i < MAX_FILES; i++) {
+        if (!files[i].used) {
+            files[i].used = 1;
+            int j = 0;
+            while (path[j] && j < 63) { files[i].path[j] = path[j]; j++; }
+            files[i].path[j] = 0;
+            files[i].data = 0;
+            files[i].len = 0;
+            files[i].cap = 0;
+            return i;
+        }
+    }
+    return -1;
+}
+
+/* Parse n ASCII hex digits (either case) into a number. A character that is
+ * not a hex digit contributes a zero nibble. */
+static u64 hex_n(const char *s, int n) {
+    u64 v = 0;
+    for (int i = 0; i < n; i++) {
+        char c = s[i];
+        v <<= 4;
+        if (c >= '0' && c <= '9') v |= (u64)(c - '0');
+        else if (c >= 'a' && c <= 'f') v |= (u64)(c - 'a' + 10);
+        else if (c >= 'A' && c <= 'F') v |= (u64)(c - 'A' + 10);
+    }
+    return v;
+}
+
+/* Unpack the regular files of a cpio "newc" archive of `total` bytes into
+ * the tmpfs table. */
+static void parse_cpio(const void *cpio, u64 total) {
+    const u8 *p = 
cpio;
+    const u8 *end = p + total;
+    /* Each entry is a 110-byte ASCII header ("070701" magic + 13 eight-digit
+     * hex fields), the name, then the file data; name and data are each
+     * padded to a 4-byte boundary. */
+    while (p + 110 <= end) {
+        if (!(p[0]=='0'&&p[1]=='7'&&p[2]=='0'&&p[3]=='7'&&p[4]=='0'&&p[5]=='1')) break;
+        u64 mode = hex_n((const char *)(p + 6 + 1*8), 8);   /* header field 1 */
+        u64 fsz = hex_n((const char *)(p + 6 + 6*8), 8);    /* header field 6 */
+        u64 nsz = hex_n((const char *)(p + 6 + 11*8), 8);   /* header field 11 */
+
+        u64 hstride = (110 + nsz + 3) & ~3UL;
+        u64 fstride = (fsz + 3) & ~3UL;
+
+        /* Refuse an entry whose name or data would run past the end of the
+         * archive instead of reading or copying out of bounds. All three
+         * sizes come from 8 hex digits, so the sums cannot wrap. */
+        u64 remain = (u64)(end - p);
+        if (hstride > remain || fsz > remain - hstride) {
+            uart_puts("cpio: truncated entry\n");
+            break;
+        }
+
+        const char *name = (const char *)(p + 110);
+        if (str_eq(name, "TRAILER!!!")) break;
+        const u8 *fdata = p + hstride;
+
+        int is_dir = ((mode & 0xf000) == 0x4000);
+        int is_reg = ((mode & 0xf000) == 0x8000);
+        if (is_reg && !str_eq(name, ".")) {
+            int idx = new_file(name);
+            if (idx >= 0) {
+                /* Copy out — we'll let the user write back later if needed. */
+                /* An empty file still gets a 1-byte buffer, so cap is never 0. */
+                files[idx].data = kalloc(fsz ? fsz : 1);
+                files[idx].cap = fsz ? fsz : 1;
+                files[idx].len = fsz;
+                if (fsz) mem_cpy(files[idx].data, fdata, fsz);
+            }
+        }
+        (void)is_dir; /* directories are recognised but not stored */
+        p += hstride + fstride;
+    }
+}
+
+/* ─── ELF64 static loader ───────────────────────────────────────────────── */
+
+/* ELF64 file header and program header, fields in on-disk order. */
+struct ehdr { u8 e_ident[16]; u16 e_type, e_machine; u32 e_version; u64 e_entry, e_phoff, e_shoff; u32 e_flags; u16 e_ehsize, e_phentsize, e_phnum, e_shentsize, e_shnum, e_shstrndx; };
+struct phdr { u32 p_type, p_flags; u64 p_offset, p_vaddr, p_paddr, p_filesz, p_memsz, p_align; };
+
+#define PT_LOAD 1
+
+/* Copy every PT_LOAD segment of the in-memory ELF image to its p_vaddr,
+ * zero the memsz-filesz tail, and return e_entry. Returns 0 (after printing
+ * a message) on a bad magic or a non-AArch64 machine type. Segment addresses
+ * and sizes are trusted as-is. */
+static u64 load_elf(const u8 *elf) {
+    const struct ehdr *eh = (const struct ehdr *)elf;
+    if (!(eh->e_ident[0] == 0x7f && eh->e_ident[1] == 'E' &&
+          eh->e_ident[2] == 'L' && eh->e_ident[3] == 'F')) {
+        uart_puts("ELF: bad magic\n"); return 0;
+    }
+    if (eh->e_machine != 0xb7) { /* EM_AARCH64 */
+        uart_puts("ELF: not aarch64\n"); return 0;
+    }
+    for (int i = 0; i < eh->e_phnum; i++) {
+        const struct phdr *ph = (const struct phdr *)(elf + eh->e_phoff + (u64)i * eh->e_phentsize);
+        if (ph->p_type != PT_LOAD) continue;
+        u8 *dst = (u8 *)ph->p_vaddr;
+        const u8 *src = elf + ph->p_offset;
+        mem_cpy(dst, src, ph->p_filesz);
+        if (ph->p_memsz > ph->p_filesz)
+            
mem_set(dst + ph->p_filesz, 0, ph->p_memsz - ph->p_filesz);
+    }
+    /* I-cache sync (cheap insurance even with caches off). */
+    asm volatile("dsb sy" ::: "memory");
+    asm volatile("ic iallu" ::: "memory");
+    asm volatile("dsb sy" ::: "memory");
+    asm volatile("isb");
+    return eh->e_entry;
+}
+
+/* ─── Syscall layer (Tier 1) ────────────────────────────────────────────── */
+
+#define MAX_FD 32
+/* One open descriptor: index into files[], current offset, and whether the
+ * descriptor was opened writable / in append mode. */
+struct fdent { int used; int fidx; u64 pos; int wflag; int append; };
+static struct fdent fdtab[MAX_FD];
+
+/* User program break (single-process). */
+static u64 brk_base;
+static u64 brk_cur;
+static u64 brk_max;
+
+/* Error numbers; the syscalls return them negated. */
+#define EBADF 9
+#define ENOENT 2
+#define EINVAL 22
+#define EMFILE 24
+#define EFAULT 14
+#define ENOSPC 28
+
+/* openat() flag bits (octal). */
+#define O_RDONLY 0
+#define O_WRONLY 1
+#define O_RDWR 2
+#define O_CREAT 0100
+#define O_TRUNC 01000
+#define O_APPEND 02000
+
+#define AT_FDCWD (-100)
+
+/* Syscall numbers dispatched by trap_sync(). */
+#define SYS_unlinkat 35
+#define SYS_openat 56
+#define SYS_close 57
+#define SYS_lseek 62
+#define SYS_read 63
+#define SYS_write 64
+#define SYS_exit_group 93
+#define SYS_brk 214
+
+/* write(2): fds 1 and 2 go byte-for-byte to the UART; fds >= 3 write into
+ * the tmpfs file behind the descriptor, growing its buffer as needed.
+ * Returns len, or -EBADF for an invalid or read-only descriptor. buf is
+ * used without validation. */
+static i64 sys_write(int fd, const void *buf, u64 len) {
+    if (fd == 1 || fd == 2) {
+        const u8 *s = buf;
+        for (u64 i = 0; i < len; i++) uart_putc((char)s[i]);
+        return (i64)len;
+    }
+    if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used || !fdtab[fd].wflag) return -EBADF;
+    struct file *f = &files[fdtab[fd].fidx];
+    /* Append-mode descriptors always write at the current end of file. */
+    u64 pos = fdtab[fd].append ? f->len : fdtab[fd].pos;
+    u64 need = pos + len;
+    if (need > f->cap) {
+        /* Grow by doubling, starting from 64 bytes for a file with no buffer. */
+        u64 ncap = f->cap ? 
f->cap : 64;
+        while (ncap < need) ncap *= 2;
+        /* The old buffer is abandoned: the bump allocator cannot free. */
+        u8 *nd = kalloc(ncap);
+        if (f->len) mem_cpy(nd, f->data, f->len);
+        f->data = nd;
+        f->cap = ncap;
+    }
+    /* A write that starts beyond the current end (after an lseek past EOF)
+     * leaves a gap. Zero it, so a later read returns zeros rather than
+     * whatever the heap or a previously truncated file left there. */
+    if (pos > f->len) mem_set(f->data + f->len, 0, pos - f->len);
+    mem_cpy(f->data + pos, buf, len);
+    if (need > f->len) f->len = need;
+    fdtab[fd].pos = pos + len;
+    return (i64)len;
+}
+
+/* read(2): fd 0 always reports end-of-file (returns 0); fds >= 3 copy up to
+ * len bytes from the file at the descriptor's offset and advance it.
+ * Returns the byte count, 0 at or past EOF, or -EBADF. */
+static i64 sys_read(int fd, void *buf, u64 len) {
+    if (fd == 0) return 0;
+    if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF;
+    struct file *f = &files[fdtab[fd].fidx];
+    u64 pos = fdtab[fd].pos;
+    if (pos >= f->len) return 0;
+    u64 n = len;
+    if (pos + n > f->len) n = f->len - pos; /* clamp to the bytes that remain */
+    mem_cpy(buf, f->data + pos, n);
+    fdtab[fd].pos = pos + n;
+    return (i64)n;
+}
+
+/* openat(2) on the flat tmpfs; dirfd and mode are ignored. O_CREAT creates a
+ * missing file, O_TRUNC resets an existing file's length to 0. Returns the
+ * lowest free descriptor >= 3, or -ENOENT / -ENOSPC / -EMFILE. */
+static i64 sys_openat(int dirfd, const char *path, int flags, int mode) {
+    (void)dirfd; (void)mode;
+    int fidx = find_file(path);
+    int wflag = (flags & 3) != 0; /* any access mode other than O_RDONLY is writable */
+    if (fidx < 0) {
+        if (!(flags & O_CREAT)) return -ENOENT;
+        fidx = new_file(path);
+        if (fidx < 0) return -ENOSPC;
+    } else if (flags & O_TRUNC) {
+        files[fidx].len = 0;
+    }
+    int fd = -1;
+    for (int i = 3; i < MAX_FD; i++) {
+        if (!fdtab[i].used) { fd = i; break; }
+    }
+    if (fd < 0) return -EMFILE;
+    fdtab[fd].used = 1;
+    fdtab[fd].fidx = fidx;
+    fdtab[fd].pos = 0;
+    fdtab[fd].wflag = wflag;
+    fdtab[fd].append = (flags & O_APPEND) ? 
1 : 0;
+    return fd;
+}
+
+/* close(2): release a descriptor >= 3. Returns 0 or -EBADF. */
+static i64 sys_close(int fd) {
+    if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF;
+    fdtab[fd].used = 0;
+    return 0;
+}
+
+/* lseek(2): whence 0 is relative to the start of the file, 1 to the current
+ * offset, 2 to the file length. Returns the new offset, -EINVAL for an
+ * unknown whence or a negative result, or -EBADF. Seeking past the end is
+ * allowed. */
+static i64 sys_lseek(int fd, i64 off, int whence) {
+    if (fd < 3 || fd >= MAX_FD || !fdtab[fd].used) return -EBADF;
+    struct file *f = &files[fdtab[fd].fidx];
+    i64 base;
+    if (whence == 0) base = 0;
+    else if (whence == 1) base = (i64)fdtab[fd].pos;
+    else if (whence == 2) base = (i64)f->len;
+    else return -EINVAL;
+    i64 np = base + off;
+    if (np < 0) return -EINVAL;
+    fdtab[fd].pos = (u64)np;
+    return np;
+}
+
+/* brk(2): addr 0 or an address outside [brk_base, brk_max] leaves the break
+ * unchanged; otherwise the break moves to addr. The (possibly unchanged)
+ * break is always the return value. */
+static i64 sys_brk(u64 addr) {
+    if (addr == 0) return (i64)brk_cur;
+    if (addr < brk_base || addr > brk_max) return (i64)brk_cur;
+    brk_cur = addr;
+    return (i64)brk_cur;
+}
+
+/* unlinkat(2): mark the file's slot free; dirfd and flags are ignored.
+ * Returns 0 or -ENOENT.
+ * NOTE(review): descriptors still open on the file keep their slot index,
+ * so they would refer to whatever file reuses the slot — confirm that is
+ * acceptable for Tier 1. */
+static i64 sys_unlinkat(int dirfd, const char *path, int flags) {
+    (void)dirfd; (void)flags;
+    int fidx = find_file(path);
+    if (fidx < 0) return -ENOENT;
+    files[fidx].used = 0;
+    return 0;
+}
+
+static int g_exit_code = 0;
+static int g_exited = 0;
+
+/* exit_group(2): record and print the exit code, then power the machine off.
+ * Does not return. */
+static void sys_exit(int code) {
+    g_exit_code = code;
+    g_exited = 1;
+    uart_puts("\n[seed] user exit_group("); uart_putd(code); uart_puts(")\n");
+    /* Try PSCI SYSTEM_OFF so QEMU exits cleanly; fall back to spin. 
*/
+    /* 0x84000008 is the function ID passed in x0 for both conduits. The
+     * SMC calling convention returns results in x0–x3, so x1–x3 are
+     * declared clobbered, and "memory" keeps the compiler from moving
+     * memory accesses across the call. */
+    register u64 x0 asm("x0") = 0x84000008;
+    asm volatile("hvc #0" : "+r"(x0) : : "x1", "x2", "x3", "memory");
+    register u64 x0s asm("x0") = 0x84000008;
+    asm volatile("smc #0" : "+r"(x0s) : : "x1", "x2", "x3", "memory");
+    for (;;) asm volatile("wfi");
+}
+
+/* ─── Trap dispatch (called from start.S vector handlers) ───────────────── */
+
+/* Register snapshot built by SAVE_TF in start.S: x0..x30, then ELR_EL1 and
+ * SPSR_EL1. */
+struct trapframe {
+    u64 x[31];
+    u64 elr;
+    u64 spsr;
+};
+
+i64 trap_sync(u64 esr, struct trapframe *tf);
+void trap_kernel(u64 esr, struct trapframe *tf);
+void trap_unhandled(u64 esr, struct trapframe *tf);
+
+/* Synchronous exception from the user program. For an SVC (exception class
+ * 0x15) the syscall number is taken from x8 and the arguments from x0..x3;
+ * the result is stored in the saved x0 and 0 is returned so the stub can
+ * eret. Any other exception class prints ESR/ELR/FAR and halts. */
+i64 trap_sync(u64 esr, struct trapframe *tf) {
+    u32 ec = (u32)((esr >> 26) & 0x3f);
+    if (ec == 0x15) { /* SVC, AArch64 */
+        u64 nr = tf->x[8];
+        u64 a0 = tf->x[0], a1 = tf->x[1], a2 = tf->x[2];
+        u64 a3 = tf->x[3], a4 = tf->x[4], a5 = tf->x[5];
+        i64 r;
+        switch (nr) {
+        case SYS_read: r = sys_read((int)a0, (void *)a1, a2); break;
+        case SYS_write: r = sys_write((int)a0, (const void *)a1, a2); break;
+        case SYS_openat: r = sys_openat((int)a0, (const char *)a1, (int)a2, (int)a3); break;
+        case SYS_close: r = sys_close((int)a0); break;
+        case SYS_lseek: r = sys_lseek((int)a0, (i64)a1, (int)a2); break;
+        case SYS_brk: r = sys_brk(a0); break;
+        case SYS_unlinkat: r = sys_unlinkat((int)a0, (const char *)a1, (int)a2); break;
+        case SYS_exit_group: sys_exit((int)a0); r = 0; break; /* sys_exit() never returns */
+        default:
+            uart_puts("[seed] ENOSYS "); uart_putd((i64)nr); uart_puts("\n");
+            r = -38; /* ENOSYS */
+        }
+        tf->x[0] = (u64)r;
+        (void)a4; (void)a5; /* no Tier-1 syscall takes more than four arguments */
+        return 0;
+    }
+    uart_puts("[seed] PANIC: user sync, ESR="); uart_putx(esr);
+    uart_puts(" ELR="); uart_putx(tf->elr);
+    uart_puts(" FAR=");
+    u64 far; asm volatile("mrs %0, far_el1" : "=r"(far)); uart_putx(far);
+    uart_puts("\n");
+    for (;;) asm volatile("wfe");
+}
+
+/* Synchronous exception taken while the kernel itself was running: print
+ * ESR/ELR and halt. */
+void trap_kernel(u64 esr, struct trapframe *tf) {
+    uart_puts("[seed] PANIC: kernel sync, ESR="); uart_putx(esr);
+    uart_puts(" ELR="); uart_putx(tf->elr);
+    uart_puts("\n");
+    for (;;) asm volatile("wfe");
+}
+
+/* Any vector the kernel does not handle: print ESR/ELR and halt. */
+void trap_unhandled(u64 esr, struct trapframe *tf) {
+    uart_puts("[seed] PANIC: 
unhandled exception, ESR="); uart_putx(esr);
+    uart_puts(" ELR="); uart_putx(tf->elr);
+    uart_puts("\n");
+    for (;;) asm volatile("wfe");
+}
+
+/* ─── User stack setup + entry ──────────────────────────────────────────── */
+
+extern void eret_to_user(u64 entry, u64 sp);
+
+/* Build the initial user stack just below stack_top and return the sp to
+ * start the program with. */
+static u64 build_user_stack(u64 stack_top, const char *argv0) {
+    /* Place argv0 string at top, then argc/argv/envp/auxv below it.
+     *
+     * SysV layout from low to high at sp:
+     *   argc, argv[0], NULL, NULL (envp term), AT_NULL pair (auxv term)
+     */
+    int n = str_n(argv0) + 1;
+    /* Only 32 bytes are reserved for the string; truncate rather than
+     * write past stack_top. */
+    if (n > 32) n = 32;
+    char *str = (char *)(stack_top - 32);
+    for (int i = 0; i < n - 1; i++) str[i] = argv0[i];
+    str[n - 1] = 0;
+
+    /* 64 bytes below the string hold the six words written here; sp is
+     * then rounded down to a 16-byte boundary. */
+    u64 sp = (u64)str - 64;
+    sp &= ~15UL;
+    u64 *p = (u64 *)sp;
+    p[0] = 1; /* argc */
+    p[1] = (u64)str; /* argv[0] */
+    p[2] = 0; /* argv terminator */
+    p[3] = 0; /* envp terminator */
+    p[4] = 0; /* auxv terminator: a_type = AT_NULL ... */
+    p[5] = 0; /* ... and its value, so a libc startup that walks auxv stops here */
+    return sp;
+}
+
+/* ─── kmain ─────────────────────────────────────────────────────────────── */
+
+/* C entry point, called from start.S with the DTB physical address. Brings
+ * up the MMU and heap, unpacks the initramfs, loads /init and enters it.
+ * Does not return. */
+void kmain(u64 dtb_phys) {
+    setup_mmu();
+
+    /* Bring up heap immediately — it starts at the first 64 KB boundary at
+     * or above the end of our image (_end, i.e. past BSS and the kernel
+     * stack) and runs up to 0x44000000.
+     * NOTE(review): user/user.lds links /init at 0x42000000, which lies
+     * inside this range, so load_elf() would overwrite heap data once the
+     * bump pointer has passed that address — confirm the initramfs stays
+     * small, or lower kheap_end. The original comment also assumed QEMU's
+     * initrd placement is clear of this range. */
+    u64 image_end = (u64)_end;
+    kheap_ptr = (u8 *)((image_end + 0xfffful) & ~0xfffful);
+    kheap_end = (u8 *)0x44000000UL; /* heap limit: 64 MB above the 0x40000000 RAM base */
+
+    uart_puts("\n[seed] arm64 boot, x0/dtb="); uart_putx(dtb_phys); uart_puts("\n");
+
+    struct dtb_info dt = {0};
+    parse_dtb((const void *)dtb_phys, &dt);
+    uart_puts("[seed] mem "); uart_putx(dt.mem_start);
+    uart_puts(" + "); uart_putx(dt.mem_size); uart_puts("\n");
+    uart_puts("[seed] initrd "); uart_putx(dt.initrd_start);
+    uart_puts(" .. 
"); uart_putx(dt.initrd_end); uart_puts("\n");
+    if (dt.bootargs[0]) { uart_puts("[seed] bootargs: "); uart_puts(dt.bootargs); uart_puts("\n"); }
+
+    if (dt.initrd_start == 0 || dt.initrd_end <= dt.initrd_start) {
+        uart_puts("[seed] no initrd, halting\n");
+        for (;;) asm volatile("wfe");
+    }
+
+    /* Copy the initramfs into the tmpfs table and list what was found. */
+    parse_cpio((const void *)dt.initrd_start, dt.initrd_end - dt.initrd_start);
+    uart_puts("[seed] tmpfs:\n");
+    for (int i = 0; i < MAX_FILES; i++) {
+        if (!files[i].used) continue;
+        uart_puts(" /"); uart_puts(files[i].path);
+        uart_puts(" ("); uart_putd((i64)files[i].len); uart_puts(" bytes)\n");
+    }
+
+    int init_idx = find_file("init");
+    if (init_idx < 0) { uart_puts("[seed] no /init in initrd, halting\n"); for(;;) asm volatile("wfe"); }
+
+    u64 entry = load_elf(files[init_idx].data);
+    if (!entry) { uart_puts("[seed] load_elf failed\n"); for(;;) asm volatile("wfe"); }
+    uart_puts("[seed] /init e_entry="); uart_putx(entry); uart_puts("\n");
+
+    /* User stack at top of a reserved high region. brk above that. */
+    /* The stack grows down from 0x46000000; the break may grow up from the
+     * same address to brk_max. */
+    u64 ustack_top = 0x46000000UL;
+    brk_base = 0x46000000UL;
+    brk_cur = brk_base;
+    brk_max = 0x4a000000UL;
+
+    u64 user_sp = build_user_stack(ustack_top, "init");
+
+    uart_puts("[seed] eret to user, sp="); uart_putx(user_sp); uart_puts("\n");
+    eret_to_user(entry, user_sp);
+    /* unreachable */
+}
diff --git a/seed-kernel/kernel.lds b/seed-kernel/kernel.lds
@@ -0,0 +1,51 @@
+/* arm64 seed kernel link layout.
+ *
+ * QEMU `-machine virt` puts RAM at 0x40000000. With `-kernel` and the arm64
+ * Image header, QEMU loads us at RAM_BASE + text_offset = 0x40080000.
+ * We don't reference absolute addresses in code (the entry stub uses adrp
+ * which is PC-relative), so the link base mostly affects symbol values.
+ *
+ * NOTE(review): the Makefile builds the C objects with -fno-pic
+ * -mcmodel=large, which presumably makes compiled C code use absolute
+ * symbol addresses; if so the image must run at this link address —
+ * confirm.
+ */
+
+ENTRY(_head)
+
+SECTIONS {
+    . 
= 0x40080000;
+
+    /* The Image header and entry stub must be the first bytes of the image. */
+    .head.text : {
+        KEEP(*(.head.text))
+    }
+
+    .text : ALIGN(8) {
+        *(.text .text.*)
+    }
+
+    .rodata : ALIGN(8) {
+        *(.rodata .rodata.*)
+    }
+
+    .data : ALIGN(8) {
+        *(.data .data.*)
+    }
+
+    /* __bss_start/__bss_end bound the region start.S zeroes; both are
+     * 16-byte aligned. */
+    .bss : ALIGN(16) {
+        __bss_start = .;
+        *(.bss .bss.*)
+        *(COMMON)
+        . = ALIGN(16);
+        __bss_end = .;
+    }
+
+    /* 64KB kernel stack */
+    .stack : ALIGN(16) {
+        kstack_bottom = .;
+        . += 0x10000;
+        kstack_top = .;
+    }
+
+    /* _end is where kernel.c starts its heap; _image_end feeds the
+     * image_size field of the header in start.S. */
+    _end = .;
+    _image_end = .;
+
+    /DISCARD/ : {
+        *(.note.*) *(.comment) *(.eh_frame)
+    }
+}
diff --git a/seed-kernel/run.sh b/seed-kernel/run.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+# Boot the seed kernel + initramfs in QEMU.
+#
+# Usage: ./run.sh [extra qemu args...]
+
+# Abort on any error or unset variable; run from the script's own directory
+# so the relative build/ paths resolve.
+set -eu
+cd "$(dirname "$0")"
+
+KERNEL=build/Image
+INITRD=build/initramfs.cpio
+
+# Both artifacts must exist before QEMU is started.
+[ -f "$KERNEL" ] || { echo "missing $KERNEL — run 'make' first"; exit 1; }
+[ -f "$INITRD" ] || { echo "missing $INITRD — run 'make' first"; exit 1; }
+
+# Extra command-line arguments are passed through to QEMU after the defaults.
+exec qemu-system-aarch64 \
+    -machine virt \
+    -cpu cortex-a72 \
+    -m 512M \
+    -nographic \
+    -no-reboot \
+    -kernel "$KERNEL" \
+    -initrd "$INITRD" \
+    "$@"
diff --git a/seed-kernel/start.S b/seed-kernel/start.S
@@ -0,0 +1,204 @@
+/* seed kernel — arm64 boot protocol entry, vector table, SVC handler. */
+
+.section .head.text, "ax"
+.globl _head
+_head:
+    /* arm64 Image header (Documentation/arm64/booting.rst).
+     * code0 must be a valid instruction (a branch, in our case). */
+    b stext
+    .long 0
+    .quad 0x80000 /* text_offset (preferred load offset within RAM) */
+    .quad _image_end - _head /* image_size */
+    .quad 0xa /* flags: 4K pages, anywhere in physmem, LE */
+    .quad 0
+    .quad 0
+    .quad 0
+    .ascii "ARM\x64" /* magic */
+    .long 0 /* PE COFF offset (none) */
+
+stext:
+    /* Entry contract: x0 = DTB phys, MMU off, caches off, EL2 or EL1. */
+    /* Mask D, A, I and F before touching anything else. */
+    msr daifset, #0xf
+
+    /* If we entered at EL2, drop to EL1. Otherwise we're already at EL1. 
*/
+    /* CurrentEL holds the exception level in bits [3:2]. */
+    mrs x9, CurrentEL
+    lsr x9, x9, #2
+    cmp x9, #2
+    b.ne in_el1
+
+    /* EL2 → EL1: set HCR_EL2.RW=1 (EL1 is AArch64), CNTHCTL/CNTVOFF defaults,
+     * SPSR=EL1h with DAIF masked, ELR=in_el1, eret. */
+    mov x9, #(1 << 31)
+    msr hcr_el2, x9
+    mov x9, #0x3c5 /* EL1h, DAIF=1111 */
+    msr spsr_el2, x9
+    adr x9, in_el1
+    msr elr_el2, x9
+    /* Make sure SP_EL1 is set before we eret to EL1 (else we land with
+     * an undefined SP). Use the same kernel stack we're about to install. */
+    adrp x9, kstack_top
+    add x9, x9, :lo12:kstack_top
+    msr sp_el1, x9
+    eret
+
+in_el1:
+    /* Stack. */
+    adrp x9, kstack_top
+    add x9, x9, :lo12:kstack_top
+    mov sp, x9
+
+    /* Vector table. */
+    adrp x9, vector_table
+    add x9, x9, :lo12:vector_table
+    msr vbar_el1, x9
+    isb
+
+    /* Zero BSS. */
+    /* Eight bytes per iteration; kernel.lds aligns both bounds to 16. The
+     * loop exit uses the unsigned condition (hs) because x1/x2 are
+     * addresses, which must not be compared as signed values. */
+    adrp x1, __bss_start
+    add x1, x1, :lo12:__bss_start
+    adrp x2, __bss_end
+    add x2, x2, :lo12:__bss_end
+1:  cmp x1, x2
+    b.hs 2f
+    str xzr, [x1], #8
+    b 1b
+2:
+    /* Hand control to C. x0 still = DTB phys (not clobbered above). */
+    bl kmain
+
+    /* kmain shouldn't return. */
+hang:
+    wfe
+    b hang
+
+
+/* ─── Exception vector table ──────────────────────────────────────────── */
+
+/* Each vector slot is 0x80 bytes and holds a single branch to its handler. */
+.macro VENTRY label
+    .balign 0x80
+    b \label
+.endm
+
+.section .text, "ax"
+.balign 0x800
+.globl vector_table
+vector_table:
+    /* Current EL with SP_EL0 (we never run kernel like this — only user). */
+    VENTRY el1_sp0_sync /* 0x000: SVC from EL1t (our "user") */
+    VENTRY unhandled /* 0x080 */
+    VENTRY unhandled /* 0x100 */
+    VENTRY unhandled /* 0x180 */
+    /* Current EL with SP_ELx (kernel internal). */
+    VENTRY el1_spx_sync /* 0x200: panic on kernel sync fault */
+    VENTRY unhandled /* 0x280 */
+    VENTRY unhandled /* 0x300 */
+    VENTRY unhandled /* 0x380 */
+    /* Lower EL using AArch64 (EL0). Unused in this design but wired. */
+    VENTRY el1_sp0_sync /* 0x400 */
+    VENTRY unhandled /* 0x480 */
+    VENTRY unhandled /* 0x500 */
+    VENTRY unhandled /* 0x580 */
+    /* Lower EL using AArch32 (unused). 
*/
+    VENTRY unhandled /* 0x600 */
+    VENTRY unhandled /* 0x680 */
+    VENTRY unhandled /* 0x700 */
+    VENTRY unhandled /* 0x780 */
+
+
+/* ─── Trap entry/exit ─────────────────────────────────────────────────────
+ * Save x0..x30 + ELR_EL1 + SPSR_EL1 onto the kernel stack as a trapframe,
+ * call C trap_sync(esr, &tf), restore, eret. The C handler reads/writes
+ * tf->x[0..7] for syscall args and return value, plus tf->x[8] for the
+ * syscall number.
+ */
+
+/* Frame layout: x0..x30 at offsets 0..240, ELR at 248, SPSR at 256. That is
+ * 264 bytes; 272 is reserved so sp stays a multiple of 16. x10/x11 are used
+ * as scratch only after their own values have been stored. */
+.macro SAVE_TF
+    sub sp, sp, #272
+    stp x0, x1, [sp, #0]
+    stp x2, x3, [sp, #16]
+    stp x4, x5, [sp, #32]
+    stp x6, x7, [sp, #48]
+    stp x8, x9, [sp, #64]
+    stp x10, x11, [sp, #80]
+    stp x12, x13, [sp, #96]
+    stp x14, x15, [sp, #112]
+    stp x16, x17, [sp, #128]
+    stp x18, x19, [sp, #144]
+    stp x20, x21, [sp, #160]
+    stp x22, x23, [sp, #176]
+    stp x24, x25, [sp, #192]
+    stp x26, x27, [sp, #208]
+    stp x28, x29, [sp, #224]
+    str x30, [sp, #240]
+    mrs x10, elr_el1
+    mrs x11, spsr_el1
+    stp x10, x11, [sp, #248]
+.endm
+
+/* Mirror of SAVE_TF: ELR/SPSR are reloaded first (through x10/x11), then the
+ * general registers in reverse order, so x10/x11 end up with their saved
+ * values. */
+.macro RESTORE_TF
+    ldp x10, x11, [sp, #248]
+    msr elr_el1, x10
+    msr spsr_el1, x11
+    ldr x30, [sp, #240]
+    ldp x28, x29, [sp, #224]
+    ldp x26, x27, [sp, #208]
+    ldp x24, x25, [sp, #192]
+    ldp x22, x23, [sp, #176]
+    ldp x20, x21, [sp, #160]
+    ldp x18, x19, [sp, #144]
+    ldp x16, x17, [sp, #128]
+    ldp x14, x15, [sp, #112]
+    ldp x12, x13, [sp, #96]
+    ldp x10, x11, [sp, #80]
+    ldp x8, x9, [sp, #64]
+    ldp x6, x7, [sp, #48]
+    ldp x4, x5, [sp, #32]
+    ldp x2, x3, [sp, #16]
+    ldp x0, x1, [sp, #0]
+    add sp, sp, #272
+.endm
+
+/* Synchronous exception from the user program: x0 = ESR_EL1, x1 = trapframe. */
+el1_sp0_sync:
+    SAVE_TF
+    mrs x0, esr_el1
+    mov x1, sp
+    bl trap_sync
+    RESTORE_TF
+    eret
+
+el1_spx_sync:
+    /* Same shape as user sync — let C distinguish via SPSR/ESR if needed. */
+    SAVE_TF
+    mrs x0, esr_el1
+    mov x1, sp
+    bl trap_kernel
+    RESTORE_TF
+    eret
+
+/* Every other vector ends up in trap_unhandled(). */
+unhandled:
+    SAVE_TF
+    mrs x0, esr_el1
+    mov x1, sp
+    bl trap_unhandled
+    RESTORE_TF
+    eret
+
+
+/* ─── eret_to_user(entry, sp) ─────────────────────────────────────────────
+ * Drop into the loaded user program. 
Runs at EL1t (same EL as kernel,
+ * but uses SP_EL0 — gives us a separate user stack without setting up
+ * an MMU). DAIF stays masked since we don't service interrupts.
+ *
+ * NOTE(review): kernel.c's setup_mmu() does enable an identity-map MMU, so
+ * "without setting up an MMU" presumably means "without separate user page
+ * tables" — confirm.
+ *
+ * In: x0 = entry address, x1 = initial user stack pointer.
+ */
+.globl eret_to_user
+eret_to_user:
+    msr sp_el0, x1
+    msr elr_el1, x0
+    mov x9, #0x3c4 /* EL1t, DAIF=1111 */
+    msr spsr_el1, x9
+    /* Clear x0..x3 so the user program does not start with the kernel's
+     * argument values; argc/argv are read from the SysV stack layout
+     * instead. The remaining registers (x4..x30) are left as they are. */
+    mov x0, xzr
+    mov x1, xzr
+    mov x2, xzr
+    mov x3, xzr
+    eret
diff --git a/seed-kernel/user/hello.c b/seed-kernel/user/hello.c
@@ -0,0 +1,109 @@
+/* User-space "hello" for the seed kernel. Static aarch64 ELF, no libc.
+ * Speaks the Linux aarch64 syscall ABI directly via SVC. */
+
+typedef long i64;
+typedef unsigned long u64;
+
+/* Syscall numbers; they match the SYS_* values in kernel.c. */
+#define SYS_write 64
+#define SYS_openat 56
+#define SYS_close 57
+#define SYS_read 63
+#define SYS_lseek 62
+#define SYS_brk 214
+#define SYS_exit_group 93
+
+/* Raw syscall: number in x8, arguments in x0..x5, result back in x0. */
+static i64 sysc(u64 nr, u64 a, u64 b, u64 c, u64 d, u64 e, u64 f) {
+    register u64 x8 asm("x8") = nr;
+    register u64 x0 asm("x0") = a;
+    register u64 x1 asm("x1") = b;
+    register u64 x2 asm("x2") = c;
+    register u64 x3 asm("x3") = d;
+    register u64 x4 asm("x4") = e;
+    register u64 x5 asm("x5") = f;
+    asm volatile("svc #0"
+                 : "+r"(x0)
+                 : "r"(x8), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5)
+                 : "memory", "cc");
+    return (i64)x0;
+}
+
+/* Thin typed wrappers; unused argument slots are passed as 0. */
+static i64 sys_write(int fd, const void *buf, u64 n) { return sysc(SYS_write, (u64)fd, (u64)buf, n, 0,0,0); }
+static void sys_exit(int c) { sysc(SYS_exit_group, (u64)c, 0,0,0,0,0); for(;;); }
+static i64 sys_openat(int dfd, const char *p, int fl, int mo) { return sysc(SYS_openat, (u64)dfd, (u64)p, (u64)fl, (u64)mo, 0,0); }
+static i64 sys_read(int fd, void *b, u64 n) { return sysc(SYS_read, (u64)fd, (u64)b, n, 0,0,0); }
+static i64 sys_close(int fd) { return sysc(SYS_close, (u64)fd, 0,0,0,0,0); }
+static i64 sys_brk(u64 a) { return sysc(SYS_brk, a, 0,0,0,0,0); }
+
+/* gcc may emit calls to memset/memcpy for stack 
zeroing or struct copies. */
+void *memset(void *d, int c, u64 n) {
+    unsigned char *dd = d; for (u64 i = 0; i < n; i++) dd[i] = (unsigned char)c; return d;
+}
+void *memcpy(void *d, const void *s, u64 n) {
+    unsigned char *dd = d; const unsigned char *ss = s;
+    for (u64 i = 0; i < n; i++) dd[i] = ss[i];
+    return d;
+}
+
+/* Minimal string output on fd 1. */
+static u64 strlen_(const char *s) { u64 n = 0; while (s[n]) n++; return n; }
+static void puts_(const char *s) { sys_write(1, s, strlen_(s)); }
+
+/* Print v in signed decimal, one write per digit. */
+static void put_d(i64 v) {
+    char buf[24]; int i = 0;
+    if (v < 0) { sys_write(1, "-", 1); v = -v; }
+    if (v == 0) buf[i++] = '0';
+    while (v) { buf[i++] = '0' + (char)(v % 10); v /= 10; }
+    while (i--) sys_write(1, &buf[i], 1); /* digits were produced least-significant first */
+}
+
+/* Print v as "0x" followed by 16 lowercase hex digits. */
+static void put_x(u64 v) {
+    static const char hex[] = "0123456789abcdef";
+    sys_write(1, "0x", 2);
+    for (int i = 60; i >= 0; i -= 4) { char c = hex[(v >> i) & 0xf]; sys_write(1, &c, 1); }
+}
+
+/* Program entry (ENTRY in user.lds): exercises brk, reading /init from the
+ * tmpfs, and a write/read round-trip on a new file, then exits with 0. */
+void _start(void) {
+    puts_("hello from user space (EL1t, identity-map MMU)\n");
+
+    /* Exercise brk: ask current break, push it up by 1 MiB, write+read. */
+    u64 b0 = (u64)sys_brk(0);
+    puts_("brk(0) = "); put_x(b0); puts_("\n");
+    u64 b1 = (u64)sys_brk(b0 + 0x100000);
+    puts_("brk(+1MB) = "); put_x(b1); puts_("\n");
+    char *p = (char *)b0;
+    for (int i = 0; i < 16; i++) p[i] = (char)('A' + i);
+    sys_write(1, "first 16 bytes of new heap: ", 28);
+    sys_write(1, p, 16);
+    puts_("\n");
+
+    /* Exercise tmpfs: open /init (us), read first 4 bytes (ELF magic). */
+    int fd = (int)sys_openat(-100, "init", 0 /*O_RDONLY*/, 0); /* -100 is AT_FDCWD */
+    puts_("openat(\"init\") = "); put_d(fd); puts_("\n");
+    if (fd >= 0) {
+        unsigned char m[4] = {0};
+        i64 n = sys_read(fd, m, 4);
+        puts_("read("); put_d(n); puts_(") magic = ");
+        for (int i = 0; i < 4; i++) {
+            /* Two hex digits and a space per byte. */
+            char c[3] = { "0123456789abcdef"[m[i]>>4], "0123456789abcdef"[m[i]&0xf], ' ' };
+            sys_write(1, c, 3);
+        }
+        puts_("\n");
+        sys_close(fd);
+    }
+
+    /* Write a new file and read it back. 
*/
+    int wfd = (int)sys_openat(-100, "scratch", 0101 /*O_WRONLY|O_CREAT*/, 0644);
+    if (wfd >= 0) {
+        const char *msg = "boot2 says hi\n";
+        sys_write(wfd, msg, strlen_(msg));
+        sys_close(wfd);
+        int rfd = (int)sys_openat(-100, "scratch", 0, 0);
+        char buf[64] = {0};
+        /* Do not read from a descriptor that failed to open; report the
+         * error code as the byte count instead. */
+        i64 n = (rfd >= 0) ? sys_read(rfd, buf, sizeof buf) : (i64)rfd;
+        puts_("scratch readback ("); put_d(n); puts_(" bytes): ");
+        /* A negative n cast to u64 would ask the kernel to print an
+         * enormous buffer, so echo the data only when something was read. */
+        if (n > 0) sys_write(1, buf, (u64)n);
+        if (rfd >= 0) sys_close(rfd);
+    }
+
+    puts_("[user] all checks passed, exiting 0\n");
+    sys_exit(0);
+}
diff --git a/seed-kernel/user/user.lds b/seed-kernel/user/user.lds
@@ -0,0 +1,15 @@
+/* Link the user binary high enough to be clear of the kernel image
+ * (which sits at 0x40080000) and the initrd (placed by QEMU). */
+
+ENTRY(_start)
+
+SECTIONS {
+    /* kernel.c's load_elf() copies each segment to its link address, so the
+     * program runs exactly where it is linked. */
+    . = 0x42000000;
+
+    .text : { *(.text .text.*) }
+    .rodata : ALIGN(8) { *(.rodata .rodata.*) }
+    .data : ALIGN(8) { *(.data .data.*) }
+    .bss : ALIGN(16) { *(.bss .bss.*) *(COMMON) }
+
+    /DISCARD/ : { *(.note.*) *(.comment) *(.eh_frame) }
+}