commit 169712546ddf9aaaa6d2e1e0c71d9299cef534d3
parent 7aab93aa0fdde3db4b9fd0ed9362b80479a7cba3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 5 May 2026 12:03:07 -0700
seed-kernel: virtio-blk in/out transport replaces UART tmpfs dump
Drop the sentinel-framed UART hex dump in favor of two virtio-blk-device
disks: hd0 (read-only) carries the cpio rootfs, hd1 receives a SEEDFS
image of the post-run tmpfs at exit. Inputs are padded to 512-byte
multiples, and legacy mmio is force-disabled in QEMU.
Renames start.S to kernel.S now that it carries virtio helpers + the
syscall6 entry next to boot/exception code. extract-blk.sh decodes the
SEEDFS dump on the host; tier1/tier2 gates point at it.
Diffstat:
10 files changed, 1468 insertions(+), 435 deletions(-)
diff --git a/docs/SEED-VIRTIO-BLK.md b/docs/SEED-VIRTIO-BLK.md
@@ -0,0 +1,358 @@
+# Seed kernel: virtio-blk for I/O
+
+Plan to replace the cpio-initrd-in / UART-hex-out I/O shape of
+`seed-kernel/` with a pair of virtio-blk-MMIO devices: one read-only
+disk carrying the boot inputs (cpio newc, byte-identical to today's
+`-initrd`) and one read-write disk for outputs. UART stays the console.
+
+## Motivation
+
+Current shape (`seed-kernel/run.sh`, `lib-seed-runscm.sh:98-103`):
+
+- **In**: QEMU `-initrd` loads cpio newc; kernel finds it via DTB
+ `/chosen/linux,initrd-{start,end}` (kernel.c:286-296), unpacks via
+ `parse_cpio` (kernel.c:429-465) into the in-memory tmpfs.
+- **Out**: PL011 UART is the only egress. On exit (`dumpfs` bootarg),
+ `dump_tmpfs` (kernel.c:926-939) hex-encodes every tmpfs file framed by
+ `=== DUMP-BEGIN ===` / `=== FILE … ===` / `=== DUMP-END ===` sentinels.
+ `seed-kernel/scripts/extract-dump.sh` reassembles host-side.
+
+This works but has real costs:
+
+1. **Output throughput.** Hex doubles size; PL011 is byte-at-a-time MMIO.
+ For boot5 (full musl + libc.a + crt*.o, tens of MB), UART dump is the
+ dominant wall-time cost on TCG and a non-trivial slice under HVF.
+2. **No mid-run egress.** Output exists only at exit. A crash mid-build
+ loses everything in tmpfs.
+3. **Symmetry.** Inputs ride a structured device (initrd memory region);
+ outputs ride a debug device (UART). Two separate framings.
+4. **Optionality.** Once the kernel can talk to virtio-blk, mounting a
+ real on-disk artifact cache (across multiple runs) becomes trivial.
+
+## Non-goals
+
+- Generalised block layer, partitions, ext2/FAT, write-back cache,
+ buffer cache, multi-queue, MSI-X, IRQ delivery.
+- virtio-net, virtio-9p, virtio-console, PCI transport.
+- Replacing the in-memory tmpfs semantics user code sees (`sys_openat`,
+ `sys_read`, `sys_write` against `files[]` stays exactly as-is).
+
+The seed kernel keeps its single-process, polling-only,
+no-interrupts shape. virtio-blk just swaps the boot-time *transport*
+and the exit-time *dump format*.
+
+## Design
+
+Two virtio-blk-MMIO devices on QEMU virt:
+
+```
+-drive file=in.img,if=none,format=raw,id=hd0,readonly=on \
+-device virtio-blk-device,drive=hd0 \
+-drive file=out.img,if=none,format=raw,id=hd1 \
+-device virtio-blk-device,drive=hd1
+```
+
+`in.img` is the cpio newc archive (today's `initramfs.cpio`), padded to
+a 512-byte multiple. `out.img` is a pre-allocated zero file sized to an
+upper bound (≈256 MB covers boot5 worst case).
+
+Boot flow:
+
+1. `kmain` brings up MMU as today.
+2. New `virtio_blk_init()` walks DTB nodes named `virtio_mmio@*`
+ (compatible `"virtio,mmio"`); for each, probe MagicValue / Version /
+ DeviceID. ID 2 = block; finish device init for each, then read
+ sector 0 to classify (cpio magic → blk0, otherwise → blk1). Panic
+ if the count of either is not exactly 1.
+3. `parse_cpio` is fed by `blk_read_all(blk0)` instead of the initrd
+ memory region. cpio bytes land in the existing kheap-backed buffer;
+ `parse_cpio` is unchanged.
+4. On exit, kernel writes a serialised tmpfs to blk1 in the flat
+ format described below, then PSCI off. No bootarg gating — the
+ write is unconditional (a no-op if `files[]` is empty).
+5. Host reads blk1 with a small extractor (`extract-blk.sh`).
+
+UART stays the console; `uart_puts` everywhere is untouched. The
+`dumpfs` bootarg, `dump_tmpfs`, the `=== DUMP-{BEGIN,END} ===` /
+`=== FILE … ===` sentinels, and `scripts/extract-dump.sh` are all
+deleted in the same change.
+
+### On-disk layout for outputs
+
+Tiny custom format — no FS. Sector-aligned (512 B), little-endian, all
+offsets in sectors:
+
+```
+sector 0 magic "SEEDFS\0\0" (8B) | nfiles u32 | reserved u32
+ followed by nfiles directory entries:
+ path[96] | data_offset_sectors u32 | size_bytes u64 | _pad
+ (entry size 112 B → 4 entries/sector → sector 1.. for table)
+sector N.. file data, each file padded up to 512-byte boundary
+```
+
+Reusing the existing `path[96]` and `MAX_FILES=4096` from `struct file`
+keeps the table at ≤512 KB (4096 entries × 112 B, packed 4 per sector = 1024 sectors).
+The host extractor walks the table and writes each file out.
+
+This is roughly "cpio-without-the-headers-per-file." Could equally
+write cpio newc back; the flat table is just smaller code in the
+kernel (no hex name length, no per-entry headers, no parse loop).
+
+## Memory / DMA
+
+virtio-mmio descriptors carry **physical** addresses. Kernel-side
+buffers must therefore have known PAs.
+
+The current MMU (kernel.c:144-213) gives us this for free in two
+regions:
+
+- **`L1[1..3]`** identity-maps VA 1..4 GB to PA 1..4 GB as Normal
+ memory. Kernel image (0x40080000) and kheap (0x40xxxxxx..0x4b000000)
+ live here, so any `kalloc()`'d buffer has VA == PA. `mem_cpy` etc.
+ work directly.
+- **`L1[4]`** is a 1 GB Device block aliasing PA 0..1 GB at VA 4..5 GB,
+ which is how we already reach UART; we'll reach the virtio-mmio
+ control regs at `0x0a000000..0x0a004000` through the same alias
+ (`DEVICE_ALIAS_BASE + 0x0a000000`).
+
+So DMA buffers come from `kalloc()` (Normal, identity-mapped, VA==PA)
+and the device regs from the existing high alias. No MMU changes.
+
+**Cache coherency.** virtio-mmio in QEMU is `dma-coherent` per the DTB
+(`/virtio_mmio@…/dma-coherent`); virtio-mmio v2 + the modern feature
+bits assume coherent DMA. Inner-shareable WBWA (already programmed in
+TCR/MAIR) plus DMB before NotifyQueue and DMB after reading the used
+ring is sufficient. No explicit cache maintenance ops.
+
+**Reservation.** virtio queue memory must be 4 KB-aligned. The cpio
+read buffer must be sized to the cpio length (fetched from blk0 capacity
+and trimmed at parse). Sizes of interest today:
+
+- boot5 cpio ≈ 30-80 MB. Already fits in current kheap (~176 MB, up to `0x4b000000`).
+- Output blob: bound by tmpfs total bytes. Current `kheap_end =
+ 0x4b000000` allows ~176 MB heap; sufficient for boot5's output
+ (≈10s of MB). Sizing is unchanged from today's cpio-in-RAM design.
+
+Conclusion: no memory layout changes required. Only one new fixed
+allocation: a small (single 4 KB page) virtqueue area per device.
+
+## virtio-blk-MMIO driver shape
+
+A polling, single-virtqueue, one-request-at-a-time driver. Spec ref:
+virtio 1.2 §4.2 (MMIO transport) and §5.2 (block device).
+
+**Layout (one struct `virtio_mmio` per device):**
+
+```c
+volatile u32 *regs; // VA in DEVICE_ALIAS_BASE+phy
+struct vring_desc desc[8]; // 8 descriptors plenty (we issue 1 at a time, 3 chained)
+struct vring_avail avail;
+struct vring_used used;
+u16 next_desc;
+u16 last_used;
+u64 capacity_sectors;
+```
+
+**Registers used (offsets from §4.2):**
+
+`MagicValue (0x000)`, `Version (0x004)`, `DeviceID (0x008)`,
+`DeviceFeatures (0x010)` / `DeviceFeaturesSel (0x014)`,
+`DriverFeatures (0x020)` / `DriverFeaturesSel (0x024)`,
+`QueueSel (0x030)`, `QueueNumMax (0x034)`, `QueueNum (0x038)`,
+`QueueReady (0x044)`, `QueueNotify (0x050)`,
+`InterruptStatus (0x060)`, `InterruptACK (0x064)`,
+`Status (0x070)`, `QueueDescLow/High (0x080/084)`,
+`QueueDriverLow/High (0x090/094)`, `QueueDeviceLow/High (0x0a0/0a4)`,
+`Config (0x100)` (block: 8-byte capacity at +0).
+
+**Init sequence (§3.1.1):**
+
+1. `MagicValue == 0x74726976` ("virt"), `Version == 2`, `DeviceID == 2`.
+2. `Status = 0` (reset), then `|= ACKNOWLEDGE`, then `|= DRIVER`.
+3. Read `DeviceFeatures` (sel 0 and 1); negotiate `VIRTIO_F_VERSION_1`
+ only (bit 32). Refuse `VIRTIO_BLK_F_RO` if mismatched with intent
+ (we set `readonly=on` on blk0 so the device offers RO; the driver
+ doesn't need to *negotiate* RO since we just won't issue writes).
+4. `Status |= FEATURES_OK`; reread Status to confirm.
+5. `QueueSel = 0`; `QueueNumMax` (≥8 always on QEMU); `QueueNum = 8`.
+6. Allocate 4 KB-aligned 4 KB page; lay out desc[8] / avail / used per
+ §2.7, write `QueueDesc{Low,High}`, `QueueDriver{Low,High}`,
+ `QueueDevice{Low,High}` to PAs.
+7. `QueueReady = 1`; `Status |= DRIVER_OK`.
+8. Read `Config + 0` for capacity (sectors).
+
+**Request shape (§5.2.6):** chain of three descriptors:
+
+```
+desc[0]: read-only, points to struct virtio_blk_req_hdr {u32 type, u32 reserved, u64 sector}
+desc[1]: write-only (for read req) / read-only (for write req), points to data buffer (multi-sector OK with one descriptor per spec; in practice we use 1 desc per ≤4 MB chunk and loop)
+desc[2]: write-only, 1 byte status
+```
+
+Add head index to `avail.ring[avail.idx % qsz]`, `dmb ishst`,
+`avail.idx++`, `dmb ishst`, `regs[QueueNotify] = 0`, then poll
+`used.idx` until it advances past `last_used`. Read status byte;
+0 = OK, else fail.
+
+Chunk size: pick 1 MB per request (2048 sectors). Cpio fetch loops
+until `capacity_sectors` sectors are read or the cpio TRAILER is seen
+(we can also just read all of `capacity_sectors` since `in.img` is
+sized to the cpio).
+
+### Public API
+
+```c
+int blk_init(void); // probes DTB, finds blk0/blk1
+u64 blk_capacity(int dev); // sectors
+int blk_read (int dev, u64 sector, void *buf, u64 nsectors);
+int blk_write(int dev, u64 sector, const void *buf, u64 nsectors);
+```
+
+Used by:
+
+- `kmain`: `blk_init()`; `blk_read(0, 0, cpio_buf, blk_capacity(0))`,
+ then `parse_cpio(cpio_buf, capacity*512)`.
+- `dump_tmpfs_blk()`: serialise `files[]` into the SEEDFS layout
+ described above and `blk_write(1, …)`.
+
+## DTB walking
+
+`parse_dtb` (kernel.c:254-317) currently records only `chosen.initrd*`
+and the `memory@…` reg. Extend with a callback that, when entering a
+node whose name starts with `"virtio_mmio@"`, captures up to N reg
+tuples into a `dtb_info::virtio_mmio[]` array (PA + size). The MMU
+device alias already covers all of these.
+
+Subtlety: per QEMU virt, only some of the 32 virtio-mmio slots are
+populated — unpopulated slots return `MagicValue==0` / `DeviceID==0`.
+The driver init must skip those.
+
+## Build-system changes
+
+`seed-kernel/Makefile`:
+
+- Add `kernel.c` dep on a new `virtio_blk.c` (or inline in `kernel.c`
+ to keep the single-TU shape — leaning toward inline; the driver is
+ ≤300 lines).
+- Add `$(OUT)/in.img` rule: copy `initramfs.cpio`, pad to a
+ 512-byte multiple with `truncate -s %512`.
+- Add `$(OUT)/out.img` rule: `truncate -s 256M`.
+- Update `run.sh`: drop `-initrd "$INITRD"`; add the two
+ `-drive`/`-device` pairs above. `INITRD` becomes `IN_IMG`,
+ `OUT_IMG` is created fresh per run.
+
+`scripts/lib-seed-runscm.sh`:
+
+- Replace `-initrd "$INITRAMFS"` with the two-disk variant.
+- Drop `dumpfs` from the `-append` line (no longer recognised).
+- Replace `"$EXTRACT" … "$TRANSCRIPT"` with
+ `extract-blk.sh "$S_OUT_DIR" "$OUT_IMG"`. The DUMP-END grep guard
+ is replaced by checking that `extract-blk.sh` finds the SEEDFS
+ magic at sector 0; absence means the kernel didn't reach exit.
+
+`seed-kernel/scripts/extract-blk.sh`: reads sector 0 magic, walks
+the table, writes files. Output contract matches what
+`extract-dump.sh` produced (same filenames in the same dump dir),
+so `seed_runscm_export` and downstream acceptance scripts don't
+need to change.
+
+`seed-kernel/scripts/extract-dump.sh` is deleted; `tier1-gate.sh`
+and `tier2-gate.sh` switch their `EXTRACT` envvar / direct calls
+to `extract-blk.sh`.
+
+## Implementation order
+
+Single branch, single landing. Internal checkpoints in a sensible
+order; no dual paths in the tree at any commit boundary.
+
+1. **Add `virtio_blk` driver and DTB enumeration.** Extend
+ `parse_dtb` to record `virtio_mmio@…` reg tuples. Add the driver
+ (init, `blk_read`, `blk_write`). Not yet wired into `kmain`.
+ Sanity check: a unit-test `kmain` that probes and prints
+ capacity for both disks boots cleanly under a hand-built
+ `run.sh` with two `-drive`/`-device` pairs.
+2. **Cut over input path.** Replace the initrd-region read in
+ `kmain` with `blk_read(0, 0, cpio_buf, blk_capacity(0))`. Delete
+ the `chosen.initrd-{start,end}` handling from `parse_dtb` and
+ the "no initrd" panic. Update `Makefile` to produce `in.img`
+ from `initramfs.cpio`.
+3. **Cut over output path.** Add `dump_tmpfs_blk` and
+ `extract-blk.sh`. Delete `dump_tmpfs`, `dump_tmpfs`'s sentinels,
+ the `dumpfs` bootarg parser, `g_dumpfs`, and
+ `scripts/extract-dump.sh`. `dump_tmpfs_blk` runs unconditionally
+ from `sys_exit_final` before PSCI off.
+4. **Acceptance.** Run `tier1-gate.sh`, `tier2-gate.sh`,
+ `seed-accept.sh`, `seed-accept-boot34.sh`, `seed-accept-boot5.sh`.
+ All must produce byte-identical artifacts to the prior
+ (cpio+dumpfs) tree at `HEAD~1`. Expect boot5 to surface any
+ off-by-one in the directory table fastest (≈3900 tmpfs entries).
+
+## Decisions (resolved)
+
+- **Console.** PL011 stays. `uart_putc/_puts/_putd/_putx` and user
+ `write(1, …)` are unchanged. Only the file dump moves to virtio.
+- **virtio version.** Pin MMIO Version == 2 (MagicValue == 0x74726976,
+ Version regs read in init). Anything else: `uart_puts` a panic line
+ and `wfe`. QEMU 10 (current host) and any QEMU ≥4.0 ship v2; the
+ build harness has a single QEMU floor and we don't support pre-v2.
+- **Identifying blk0 vs blk1.** Slot order in the DTB does not depend
+ on `-drive` attachment (verified with `dumpdtb`: all 32
+ `virtio_mmio@…` nodes are present unconditionally), and QEMU's
+ command-line-to-slot mapping is not contractual across versions.
+ Use **content-based** identification: after enumerating all
+ populated DeviceID==2 devices, read sector 0 of each and call the
+ one whose first 6 bytes are `"070701"` (cpio newc magic) `blk0`;
+ the other is `blk1`. If neither matches or both match, panic. This
+ removes the dependency on `-drive` ordering on the qemu command
+ line entirely.
+- **Output image size.** Host pre-allocates `out.img` as a 256 MB
+ sparse file (`truncate -s 256M out.img`). Header at sector 0
+ records total used bytes; `extract-blk.sh` reads only that many.
+ No truncation of `out.img` is needed — sparse + bounded read is
+ free on APFS / ext4.
+- **Initial kheap sizing.** Today `kheap_end` starts at `0x44000000`
+ (64 MB) and bumps to `0x4b000000` after `parse_cpio` finishes,
+ because the initrd region was reserved up to `0x4b000000`. Without
+ `-initrd`, that region is free from boot, so set the initial
+ `kheap_end = 0x4b000000` (176 MB). The cpio read buffer
+ (`kalloc(blk_capacity(0) * 512)`) lands in this range. Boot5 cpio
+ ≈ 80 MB; comfortably fits.
+- **Persistence across runs.** Out of scope. `out.img` is
+ re-created (truncated to 256 MB of zeros) before each run by the
+ harness; the kernel always writes a fresh header at sector 0.
+- **Per-request chunking.** 1 MB chunks (2048 sectors) per virtio
+ request, single data descriptor per chunk (3-descriptor chain:
+ hdr / data / status). 8-entry virtqueue, one in-flight request at
+ a time, polling `used.idx`. No interrupts (`InterruptACK` written
+ once per used entry to clear the device-side bit, but no IRQ
+ handler — DAIF stays masked as today).
+- **Coherency.** Inner-shareable WBWA (already programmed); `dmb
+ ishst` before `QueueNotify`, `dmb ish` after observing
+ `used.idx` advance. No `dc civac` / `ic` ops — virtio-mmio is
+ `dma-coherent` per DTB and the device DMAs into the same
+ inner-shareable domain the kernel reads.
+
+## Risks (residual)
+
+- **Empty / mis-sized `in.img`.** If the harness fails to stage the
+ cpio onto blk0, `parse_cpio` walks zero bytes and `find_file("init")`
+ fails — exactly the same failure mode as a missing `-initrd` today
+ (`kernel.c:1136-1139`). No new risk.
+- **Boot5 file count growth.** `MAX_FILES = 4096` and `path[96]`
+ remain the binding limits, unchanged from today. The on-disk
+ directory table is sized off these constants; bumping either
+ requires a same-commit bump to `extract-blk.sh`'s parser.
+
+## Estimated effort
+
+- DTB walk extension + virtio_blk driver + integration in `kmain`:
+ ~300 lines C, one work session.
+- Output serialiser + extractor: ~80 lines C + ~40 lines shell.
+- Build/run wiring + acceptance plumbing: ~50 lines shell across
+ Makefile, run.sh, lib-seed-runscm.sh.
+- Stabilising acceptance against existing fixtures: a couple sessions
+ to chase any byte-divergence (most likely culprit is dump ordering
+ or padding, both fixable in extractor).
+
+Total: ~1-2 days of focused work, gated by byte-identical
+acceptance vs the cpio+dumpfs tree at the pre-cutover commit.
diff --git a/seed-kernel/Makefile b/seed-kernel/Makefile
@@ -3,8 +3,10 @@
# Build runs inside boot2-alpine-gcc:aarch64 (already arm64-native), so
# everything compiles with the host toolchain — no cross prefixes.
+CC := gcc
+LD := ld
OUT := build
-KOBJS := $(OUT)/start.o $(OUT)/kernel.o
+KOBJS := $(OUT)/kasm.o $(OUT)/kernel.o
KIMAGE := $(OUT)/kernel.elf
KBIN := $(OUT)/Image
USER := $(OUT)/init
@@ -12,39 +14,45 @@ USER_FORK := $(OUT)/forktest
USER_CHILD := $(OUT)/child
INITRAMFS := $(OUT)/initramfs.cpio
INITRAMFS_FORK := $(OUT)/initramfs-fork.cpio
+# Block-device images for the seed kernel's virtio-blk transport.
+# in*.img: cpio newc archive padded to a 512-byte multiple (read-only).
+# out.img: pre-allocated 256 MiB sparse file the kernel writes SEEDFS into.
+IN_IMG := $(OUT)/in.img
+IN_IMG_FORK := $(OUT)/in-fork.img
+OUT_IMG_SIZE := 268435456
-CFLAGS_COMMON := -nostdlib -nostartfiles -ffreestanding -fno-stack-protector \
+CFLAGS_COMMON := -nostdlib -ffreestanding -fno-stack-protector \
-fno-pic -static -Wall -Wextra -O2 -mcmodel=large \
-fno-asynchronous-unwind-tables -fno-unwind-tables
KCFLAGS := $(CFLAGS_COMMON) -mgeneral-regs-only
.PHONY: all clean kernel user initramfs
-all: $(KBIN) $(INITRAMFS) $(INITRAMFS_FORK)
+all: $(KBIN) $(INITRAMFS) $(INITRAMFS_FORK) $(IN_IMG) $(IN_IMG_FORK)
$(OUT):
mkdir -p $(OUT)
-$(OUT)/start.o: start.S | $(OUT)
- gcc $(KCFLAGS) -c -o $@ $<
+$(OUT)/kasm.o: kernel.S | $(OUT)
+ $(CC) $(KCFLAGS) -c -o $@ $<
$(OUT)/kernel.o: kernel.c | $(OUT)
- gcc $(KCFLAGS) -c -o $@ $<
+ $(CC) $(KCFLAGS) -c -o $@ $<
$(KIMAGE): $(KOBJS) kernel.lds
- ld -nostdlib -static -T kernel.lds -o $@ $(KOBJS)
+ $(LD) -nostdlib -static -T kernel.lds -o $@ $(KOBJS)
# Strip ELF down to a flat binary that QEMU's -kernel can load.
$(KBIN): $(KIMAGE)
objcopy -O binary $< $@
$(USER): user/hello.c user/user.lds | $(OUT)
- gcc $(CFLAGS_COMMON) -mgeneral-regs-only -T user/user.lds -o $@ $<
+ $(CC) $(CFLAGS_COMMON) -mgeneral-regs-only -T user/user.lds -o $@ $<
$(USER_FORK): user/forktest.c user/user.lds | $(OUT)
- gcc $(CFLAGS_COMMON) -mgeneral-regs-only -T user/user.lds -o $@ $<
+ $(CC) $(CFLAGS_COMMON) -mgeneral-regs-only -T user/user.lds -o $@ $<
$(USER_CHILD): user/child.c user/user.lds | $(OUT)
- gcc $(CFLAGS_COMMON) -mgeneral-regs-only -T user/user.lds -o $@ $<
+ $(CC) $(CFLAGS_COMMON) -mgeneral-regs-only -T user/user.lds -o $@ $<
$(INITRAMFS): $(USER)
cd $(OUT) && printf 'init\n' | cpio -o -H newc > initramfs.cpio
@@ -56,6 +64,26 @@ $(INITRAMFS_FORK): $(USER_FORK) $(USER_CHILD)
cp $(USER_CHILD) $(OUT)/fork-stage/child
cd $(OUT)/fork-stage && printf 'init\nchild\n' | cpio -o -H newc > ../initramfs-fork.cpio
+# Pad an arbitrary cpio archive up to the next 512-byte multiple so QEMU's
+# virtio-blk transport sees a whole-sector device.
+$(IN_IMG): $(INITRAMFS)
+ cp $(INITRAMFS) $@.tmp
+ sz=$$(wc -c < $@.tmp); \
+ pad=$$(( (512 - sz % 512) % 512 )); \
+ if [ $$pad -gt 0 ]; then \
+ head -c $$pad /dev/zero >> $@.tmp; \
+ fi
+ mv $@.tmp $@
+
+$(IN_IMG_FORK): $(INITRAMFS_FORK)
+ cp $(INITRAMFS_FORK) $@.tmp
+ sz=$$(wc -c < $@.tmp); \
+ pad=$$(( (512 - sz % 512) % 512 )); \
+ if [ $$pad -gt 0 ]; then \
+ head -c $$pad /dev/zero >> $@.tmp; \
+ fi
+ mv $@.tmp $@
+
kernel: $(KBIN)
user: $(USER)
initramfs: $(INITRAMFS)
diff --git a/seed-kernel/kernel.S b/seed-kernel/kernel.S
@@ -0,0 +1,355 @@
+/* seed kernel — arm64 boot protocol entry, vector table, SVC handler,
+ * plus C-callable thunks for ops that can't be expressed in plain C
+ * (sysreg msr/mrs, barriers, cache/TLB ops, PSCI conduits, cpu pause). */
+
+.section .head.text, "ax"
+.globl _head
+_head:
+ /* arm64 Image header (Documentation/arm64/booting.rst).
+ * code0 must be a valid instruction (a branch, in our case). */
+ b stext /* code0: jump over the rest of the header */
+ .long 0 /* code1 (unused) */
+ .quad 0x80000 /* text_offset (preferred load offset within RAM) */
+ .quad _image_end - _head /* image_size */
+ .quad 0xa /* flags: 4K pages, anywhere in physmem, LE */
+ .quad 0 /* res2 (reserved) */
+ .quad 0 /* res3 (reserved) */
+ .quad 0 /* res4 (reserved) */
+ .ascii "ARM\x64" /* magic */
+ .long 0 /* PE COFF offset (none) */
+
+stext:
+ /* Entry contract: x0 = DTB phys, MMU off, caches off, EL2 or EL1. */
+ msr daifset, #0xf /* mask D, A, I and F; they stay masked from here on */
+
+ /* If we entered at EL2, drop to EL1. Any other EL goes straight to in_el1. */
+ mrs x9, CurrentEL
+ lsr x9, x9, #2 /* shift the EL field down so it compares against 2 */
+ cmp x9, #2
+ b.ne in_el1
+
+ /* EL2 → EL1: set HCR_EL2.RW=1 (EL1 is AArch64); CNTHCTL/CNTVOFF are not
+ * written here. SPSR=EL1h with DAIF masked, ELR=in_el1, eret. */
+ mov x9, #(1 << 31) /* HCR_EL2.RW */
+ msr hcr_el2, x9
+ mov x9, #0x3c5 /* EL1h, DAIF=1111 */
+ msr spsr_el2, x9
+ adr x9, in_el1
+ msr elr_el2, x9
+ /* Make sure SP_EL1 is set before we eret to EL1 (else we land with
+ * an undefined SP). Use the same kernel stack we're about to install. */
+ adrp x9, kstack_top
+ add x9, x9, :lo12:kstack_top
+ msr sp_el1, x9
+ eret /* resumes at in_el1, now in EL1h */
+
+in_el1:
+ /* Stack. */
+ adrp x9, kstack_top
+ add x9, x9, :lo12:kstack_top
+ mov sp, x9
+
+ /* Vector table. */
+ adrp x9, vector_table
+ add x9, x9, :lo12:vector_table
+ msr vbar_el1, x9
+ isb
+
+ /* Zero BSS in 8-byte steps (assumes __bss_end is 8-aligned — TODO confirm in kernel.lds). */
+ adrp x1, __bss_start
+ add x1, x1, :lo12:__bss_start
+ adrp x2, __bss_end
+ add x2, x2, :lo12:__bss_end
+1: cmp x1, x2
+ b.hs 2f /* unsigned compare: x1/x2 are addresses, not signed ints */
+ str xzr, [x1], #8
+ b 1b
+2:
+ /* Hand control to C. x0 still = DTB phys (not clobbered above). */
+ bl kmain
+
+ /* kmain shouldn't return. */
+hang:
+ wfe
+ b hang
+
+
+/* ─── Exception vector table ──────────────────────────────────────────── */
+
+.macro VENTRY label
+ .balign 0x80 /* pad to the next 0x80-byte vector slot */
+ b \label /* the slot holds only a branch to the real handler */
+.endm
+
+.section .text, "ax"
+.balign 0x800 /* table start alignment; the 16 slots below span 0x000..0x780 */
+.globl vector_table
+vector_table:
+ /* Current EL with SP_EL0 (we never run kernel like this — only user). */
+ VENTRY el1_sp0_sync /* 0x000: SVC from EL1t (our "user") */
+ VENTRY unhandled /* 0x080 */
+ VENTRY unhandled /* 0x100 */
+ VENTRY unhandled /* 0x180 */
+ /* Current EL with SP_ELx (kernel internal). */
+ VENTRY el1_spx_sync /* 0x200: panic on kernel sync fault */
+ VENTRY unhandled /* 0x280 */
+ VENTRY unhandled /* 0x300 */
+ VENTRY unhandled /* 0x380 */
+ /* Lower EL using AArch64 (EL0). Unused in this design but wired. */
+ VENTRY el1_sp0_sync /* 0x400 */
+ VENTRY unhandled /* 0x480 */
+ VENTRY unhandled /* 0x500 */
+ VENTRY unhandled /* 0x580 */
+ /* Lower EL using AArch32 (unused). */
+ VENTRY unhandled /* 0x600 */
+ VENTRY unhandled /* 0x680 */
+ VENTRY unhandled /* 0x700 */
+ VENTRY unhandled /* 0x780 */
+
+
+/* ─── Trap entry/exit ─────────────────────────────────────────────────────
+ * Save x0..x30 + ELR_EL1 + SPSR_EL1 onto the kernel stack as a trapframe,
+ * call C trap_sync(esr, &tf), restore, eret. The C handler reads/writes
+ * tf->x[0..7] for syscall args and return value, plus tf->x[8] for the
+ * syscall number.
+ */
+
+.macro SAVE_TF
+ sub sp, sp, #272 /* 31 GPRs (248 B) + ELR + SPSR (16 B) + 8 B pad to keep sp 16-aligned */
+ stp x0, x1, [sp, #0] /* tf->x[n] lives at sp + 8*n */
+ stp x2, x3, [sp, #16]
+ stp x4, x5, [sp, #32]
+ stp x6, x7, [sp, #48]
+ stp x8, x9, [sp, #64]
+ stp x10, x11, [sp, #80]
+ stp x12, x13, [sp, #96]
+ stp x14, x15, [sp, #112]
+ stp x16, x17, [sp, #128]
+ stp x18, x19, [sp, #144]
+ stp x20, x21, [sp, #160]
+ stp x22, x23, [sp, #176]
+ stp x24, x25, [sp, #192]
+ stp x26, x27, [sp, #208]
+ stp x28, x29, [sp, #224]
+ str x30, [sp, #240]
+ mrs x10, elr_el1 /* x10/x11 are free as scratch: already saved above */
+ mrs x11, spsr_el1
+ stp x10, x11, [sp, #248] /* sp+248 = ELR_EL1, sp+256 = SPSR_EL1 */
+.endm
+
+.macro RESTORE_TF
+ ldp x10, x11, [sp, #248] /* saved ELR_EL1 / SPSR_EL1; x10/x11 are scratch until reloaded below */
+ msr elr_el1, x10
+ msr spsr_el1, x11
+ ldr x30, [sp, #240]
+ ldp x28, x29, [sp, #224]
+ ldp x26, x27, [sp, #208]
+ ldp x24, x25, [sp, #192]
+ ldp x22, x23, [sp, #176]
+ ldp x20, x21, [sp, #160]
+ ldp x18, x19, [sp, #144]
+ ldp x16, x17, [sp, #128]
+ ldp x14, x15, [sp, #112]
+ ldp x12, x13, [sp, #96]
+ ldp x10, x11, [sp, #80] /* real x10/x11 values replace the scratch copies */
+ ldp x8, x9, [sp, #64]
+ ldp x6, x7, [sp, #48]
+ ldp x4, x5, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #0]
+ add sp, sp, #272 /* pop the frame pushed by SAVE_TF */
+.endm
+
+el1_sp0_sync: /* sync exception from "user" code: vector slots 0x000 and 0x400 */
+ SAVE_TF
+ mrs x0, esr_el1 /* arg0 = exception syndrome */
+ mov x1, sp /* arg1 = pointer to the trapframe just pushed */
+ bl trap_sync
+ RESTORE_TF /* picks up any tf fields the C handler rewrote */
+ eret
+
+el1_spx_sync:
+ /* Same shape as user sync — let C distinguish via SPSR/ESR if needed. */
+ SAVE_TF
+ mrs x0, esr_el1 /* arg0 = exception syndrome */
+ mov x1, sp /* arg1 = pointer to the trapframe just pushed */
+ bl trap_kernel
+ RESTORE_TF /* only reached if trap_kernel returns */
+ eret
+
+unhandled: /* shared target for every vector slot without a dedicated handler */
+ SAVE_TF
+ mrs x0, esr_el1 /* arg0 = exception syndrome */
+ mov x1, sp /* arg1 = pointer to the trapframe just pushed */
+ bl trap_unhandled
+ RESTORE_TF /* only reached if trap_unhandled returns */
+ eret
+
+
+/* ─── eret_to_user(entry, sp) ─────────────────────────────────────────────
+ * Drop into the loaded user program. Runs at EL1t (same EL as kernel,
+ * but uses SP_EL0 — gives us a separate user stack without setting up
+ * an MMU). DAIF stays masked since we don't service interrupts.
+ */
+.globl eret_to_user
+eret_to_user:
+ msr sp_el0, x1 /* x1 = sp argument: becomes the user stack pointer */
+ msr elr_el1, x0 /* x0 = entry argument: eret resumes here */
+ mov x9, #0x3c4 /* EL1t, DAIF=1111 */
+ msr spsr_el1, x9
+ /* Clear all GP regs so user starts clean. argc/argv come in via the
+ * SysV stack layout, which the user reads directly off SP_EL0. Some
+ * boot0/1 seed-stage binaries (notably M0) read xN before any write,
+ * so leaking kernel register state past the eret would fault them. */
+ mov x0, xzr
+ mov x1, xzr
+ mov x2, xzr
+ mov x3, xzr
+ mov x4, xzr
+ mov x5, xzr
+ mov x6, xzr
+ mov x7, xzr
+ mov x8, xzr
+ mov x9, xzr
+ mov x10, xzr
+ mov x11, xzr
+ mov x12, xzr
+ mov x13, xzr
+ mov x14, xzr
+ mov x15, xzr
+ mov x16, xzr
+ mov x17, xzr
+ mov x18, xzr
+ mov x19, xzr
+ mov x20, xzr
+ mov x21, xzr
+ mov x22, xzr
+ mov x23, xzr
+ mov x24, xzr
+ mov x25, xzr
+ mov x26, xzr
+ mov x27, xzr
+ mov x28, xzr
+ mov x29, xzr
+ mov x30, xzr /* LR is cleared too, so this routine never returns to its caller */
+ eret
+
+
+/* ─── C-callable thunks ───────────────────────────────────────────────────
+ * The arm64 sysreg name is encoded in the msr/mrs opcode itself, so each
+ * register needs its own emit site — sysreg_read/sysreg_write dispatch on
+ * an integer id that must match the SR_* enum in kernel.c. Likewise for
+ * arm64_barrier (BAR_*) and cpu_pause (PAUSE_*).
+ */
+
+/* SR_* — matches kernel.c enum, declaration order. */
+#define SR_MAIR_EL1 0
+#define SR_TCR_EL1 1
+#define SR_TTBR0_EL1 2
+#define SR_SCTLR_EL1 3
+#define SR_CPACR_EL1 4
+#define SR_SP_EL0 5
+#define SR_FAR_EL1 6
+
+.globl sysreg_read /* C: u64 sysreg_read(int id); id is one of SR_* */
+sysreg_read:
+ cmp w0, #SR_SCTLR_EL1 /* id is a C int: only w0 is defined, x0[63:32] is unspecified */
+ b.eq .Lrd_sctlr_el1
+ cmp w0, #SR_SP_EL0
+ b.eq .Lrd_sp_el0
+ cmp w0, #SR_FAR_EL1
+ b.eq .Lrd_far_el1
+ mov x0, xzr /* any other id (including the write-only SR_* ids) reads as 0 */
+ ret
+.Lrd_sctlr_el1: mrs x0, sctlr_el1; ret
+.Lrd_sp_el0: mrs x0, sp_el0; ret
+.Lrd_far_el1: mrs x0, far_el1; ret
+
+.globl sysreg_write /* C: void sysreg_write(int id, u64 v); w0 = SR_* id, x1 = value */
+sysreg_write:
+ cmp w0, #SR_MAIR_EL1 /* id is a C int: only w0 is defined, x0[63:32] is unspecified */
+ b.eq .Lwr_mair_el1
+ cmp w0, #SR_TCR_EL1
+ b.eq .Lwr_tcr_el1
+ cmp w0, #SR_TTBR0_EL1
+ b.eq .Lwr_ttbr0_el1
+ cmp w0, #SR_SCTLR_EL1
+ b.eq .Lwr_sctlr_el1
+ cmp w0, #SR_CPACR_EL1
+ b.eq .Lwr_cpacr_el1
+ cmp w0, #SR_SP_EL0
+ b.eq .Lwr_sp_el0
+ ret /* any other id (e.g. SR_FAR_EL1) is silently ignored; no isb is issued here */
+.Lwr_mair_el1: msr mair_el1, x1; ret
+.Lwr_tcr_el1: msr tcr_el1, x1; ret
+.Lwr_ttbr0_el1: msr ttbr0_el1, x1; ret
+.Lwr_sctlr_el1: msr sctlr_el1, x1; ret
+.Lwr_cpacr_el1: msr cpacr_el1, x1; ret
+.Lwr_sp_el0: msr sp_el0, x1; ret
+
+/* BAR_* — matches kernel.c enum. */
+#define BAR_DSB_SY 0
+#define BAR_DSB_ISH 1
+#define BAR_DMB_ISH 2
+#define BAR_DMB_ISHST 3
+#define BAR_ISB 4
+
+.globl arm64_barrier /* C: void arm64_barrier(int kind); kind is one of BAR_* */
+arm64_barrier:
+ cmp w0, #BAR_DSB_SY /* kind is a C int: only w0 is defined, x0[63:32] is unspecified */
+ b.eq .Lbar_dsb_sy
+ cmp w0, #BAR_DSB_ISH
+ b.eq .Lbar_dsb_ish
+ cmp w0, #BAR_DMB_ISH
+ b.eq .Lbar_dmb_ish
+ cmp w0, #BAR_DMB_ISHST
+ b.eq .Lbar_dmb_ishst
+ isb /* BAR_ISB, and also the fallback for any unrecognised kind */
+ ret
+.Lbar_dsb_sy: dsb sy; ret
+.Lbar_dsb_ish: dsb ish; ret
+.Lbar_dmb_ish: dmb ish; ret
+.Lbar_dmb_ishst: dmb ishst; ret
+
+/* Bare cache/TLB primitives — kernel.c brackets them with arm64_barrier
+ * calls so the dsb scope at each call site stays explicit (the kernel
+ * uses both `dsb ish` and `dsb sy` patterns, not a single canonical
+ * sequence). */
+.globl arm64_ic_iallu /* C: void arm64_ic_iallu(void) */
+arm64_ic_iallu:
+ ic iallu /* invalidate the I-cache; no barrier here, the caller issues dsb/isb */
+ ret
+
+.globl arm64_tlbi_vmalle1 /* C: void arm64_tlbi_vmalle1(void) */
+arm64_tlbi_vmalle1:
+ tlbi vmalle1 /* drop all EL1 stage-1 TLB entries on this PE; the caller issues dsb/isb */
+ ret
+
+/* PAUSE_* — matches kernel.c enum. */
+#define PAUSE_WFE 0
+#define PAUSE_WFI 1
+#define PAUSE_YIELD 2
+
+.globl cpu_pause /* C: void cpu_pause(int kind); kind is one of PAUSE_* */
+cpu_pause:
+ cmp w0, #PAUSE_WFI /* kind is a C int: only w0 is defined, x0[63:32] is unspecified */
+ b.eq .Lp_wfi
+ cmp w0, #PAUSE_YIELD
+ b.eq .Lp_yield
+ wfe /* PAUSE_WFE, and also the fallback for any unrecognised kind */
+ ret
+.Lp_wfi: wfi; ret
+.Lp_yield: yield; ret
+
+/* PSCI / SMCCC: w0 = conduit (0=HVC, nonzero=SMC), x1 = function id.
+ * Returns the call's x0. C: u64 arm64_psci_call(int conduit, u64 fnid). */
+.globl arm64_psci_call
+arm64_psci_call:
+ mov w9, w0 /* conduit is a C int: copy only the defined low 32 bits */
+ mov x0, x1 /* the function id travels in x0 */
+ cbnz w9, .Lpsci_smc
+ hvc #0
+ ret
+.Lpsci_smc:
+ smc #0
+ ret
diff --git a/seed-kernel/kernel.c b/seed-kernel/kernel.c
@@ -1,10 +1,12 @@
/* seed kernel — minimal OS satisfying docs/OS.md Tier 1.
*
- * Boots via Linux arm64 boot protocol (-kernel/-initrd), parses the DTB
- * to find initrd + memory, unpacks the cpio newc initramfs into an
- * in-memory tmpfs, loads /init (a static aarch64 ELF), and ERETs into
- * it at EL1t. SVC traps land in trap_sync() and dispatch the eight
- * Tier-1 syscalls.
+ * Boots via Linux arm64 boot protocol (-kernel + two virtio-blk-MMIO
+ * disks), parses the DTB to find virtio_mmio nodes + memory, brings up
+ * a small polling virtio-blk driver, reads the cpio newc archive from
+ * blk0 (read-only) into the in-memory tmpfs, loads /init (a static
+ * aarch64 ELF), and ERETs into it at EL1t. SVC traps land in trap_sync()
+ * and dispatch the eight Tier-1 syscalls. On exit, the tmpfs is
+ * serialised to blk1 in a small SEEDFS table for the host extractor.
*/
typedef unsigned char u8;
@@ -54,6 +56,24 @@ static void uart_putd(i64 v) {
while (i--) uart_putc(buf[i]);
}
+/* ─── kernel.S thunk API ────────────────────────────────────────────────
+ * SR_*, BAR_*, PAUSE_* ids must match the #defines at the bottom of
+ * kernel.S in declaration order. */
+enum { SR_MAIR_EL1, SR_TCR_EL1, SR_TTBR0_EL1, SR_SCTLR_EL1,
+ SR_CPACR_EL1, SR_SP_EL0, SR_FAR_EL1 };
+enum { BAR_DSB_SY, BAR_DSB_ISH, BAR_DMB_ISH, BAR_DMB_ISHST, BAR_ISB };
+enum { PAUSE_WFE, PAUSE_WFI, PAUSE_YIELD };
+
+extern u64 sysreg_read(int id);
+extern void sysreg_write(int id, u64 v);
+extern void arm64_barrier(int kind);
+extern void arm64_ic_iallu(void);
+extern void arm64_tlbi_vmalle1(void);
+extern void cpu_pause(int kind);
+extern u64 arm64_psci_call(int conduit, u64 fnid);
+
+__attribute__((noreturn)) static void hang(void) { for (;;) cpu_pause(PAUSE_WFE); }
+
/* ─── Tiny libc-ish helpers ─────────────────────────────────────────────── */
/* libgcc / freestanding ABI helpers gcc may call implicitly. */
@@ -176,7 +196,7 @@ static void setup_mmu(void) {
/* MAIR: Attr0 = 0xff (Normal WB-WA), Attr1 = 0x00 (Device-nGnRnE) */
u64 mair = 0x00000000000000ffUL;
- asm volatile("msr mair_el1, %0" :: "r"(mair));
+ sysreg_write(SR_MAIR_EL1, mair);
u64 tcr = (u64)25 /* T0SZ: 39-bit VA */
| ((u64)1 << 8) /* IRGN0 = WBWA */
@@ -185,30 +205,29 @@ static void setup_mmu(void) {
| ((u64)0 << 14) /* TG0 = 4KB */
| ((u64)1 << 23) /* EPD1 = disable TTBR1 walks */
| ((u64)2 << 32); /* IPS = 40-bit phys */
- asm volatile("msr tcr_el1, %0" :: "r"(tcr));
- asm volatile("msr ttbr0_el1, %0" :: "r"((u64)l1_pt));
+ sysreg_write(SR_TCR_EL1, tcr);
+ sysreg_write(SR_TTBR0_EL1, (u64)l1_pt);
- asm volatile("ic iallu"); /* invalidate I-cache */
- asm volatile("dsb ish");
- asm volatile("tlbi vmalle1");
- asm volatile("dsb ish");
- asm volatile("isb");
+ arm64_ic_iallu();
+ arm64_barrier(BAR_DSB_ISH);
+ arm64_tlbi_vmalle1();
+ arm64_barrier(BAR_DSB_ISH);
+ arm64_barrier(BAR_ISB);
- u64 sctlr;
- asm volatile("mrs %0, sctlr_el1" : "=r"(sctlr));
+ u64 sctlr = sysreg_read(SR_SCTLR_EL1);
sctlr &= ~(u64)((1 << 1) | (1 << 19)); /* clear A (alignment), WXN */
sctlr |= (u64)((1 << 0) /* M — MMU on */
| (1 << 2) /* C — D-cache on */
| (1 << 12)); /* I — I-cache on */
- asm volatile("msr sctlr_el1, %0" :: "r"(sctlr));
- asm volatile("isb");
+ sysreg_write(SR_SCTLR_EL1, sctlr);
+ arm64_barrier(BAR_ISB);
/* CPACR_EL1.FPEN = 0b11: don't trap FP/ASIMD from EL0 or EL1.
* tcc-built user binaries (notably the self-rebuilt tcc1) emit FP
* register saves in their start glue; default FPEN=00 traps those
* to EL1 with EC=0x07. */
- asm volatile("msr cpacr_el1, %0" :: "r"((u64)3 << 20));
- asm volatile("isb");
+ sysreg_write(SR_CPACR_EL1, (u64)3 << 20);
+ arm64_barrier(BAR_ISB);
}
/* ─── Kernel heap (bump allocator) ──────────────────────────────────────── */
@@ -221,7 +240,7 @@ static void *kalloc(u64 n) {
n = (n + 15) & ~15UL;
if (kheap_ptr + n > kheap_end) {
uart_puts("kalloc: out of memory\n");
- for (;;) asm volatile("wfe");
+ hang();
}
void *r = kheap_ptr;
kheap_ptr += n;
@@ -242,14 +261,25 @@ static u64 be64(const u8 *p) { return ((u64)be32(p) << 32) | (u64)be32(p + 4); }
#define FDT_NOP 4
#define FDT_END 9
+/* QEMU virt has 32 virtio-mmio slots (0x0a000000..0x0a004000, 0x200 each).
+ * Most are unpopulated and report MagicValue=0/DeviceID=0 — we capture all
+ * slots advertised by the DTB and the driver init filters at probe time. */
+#define MAX_VIRTIO_MMIO 32
+
struct dtb_info {
- u64 initrd_start;
- u64 initrd_end;
u64 mem_start;
u64 mem_size;
+ u64 virtio_mmio_pa[MAX_VIRTIO_MMIO];
+ int virtio_mmio_n;
char bootargs[256];
};
+/* str_starts: prefix test on NUL-terminated strings.
+ * Returns 1 when every character of `prefix` matches the corresponding
+ * leading character of `s` (an empty prefix always matches), otherwise 0.
+ * If `s` is shorter than `prefix`, the compare fails on the terminating NUL
+ * of `s`, so `s` is never read past its end. */
+static int str_starts(const char *s, const char *prefix) {
+    for (; *prefix != '\0'; s++, prefix++) {
+        if (*s != *prefix) return 0;
+    }
+    return 1;
+}
+
static void parse_dtb(const void *dtb, struct dtb_info *out) {
const u8 *base = dtb;
if (be32(base) != FDT_MAGIC) {
@@ -283,11 +313,7 @@ static void parse_dtb(const void *dtb, struct dtb_info *out) {
const char *pn = (const char *)(strings + nameoff);
if (depth == 1 && str_eq(path[1], "chosen")) {
- if (str_eq(pn, "linux,initrd-start")) {
- out->initrd_start = (len == 8) ? be64(p) : (u64)be32(p);
- } else if (str_eq(pn, "linux,initrd-end")) {
- out->initrd_end = (len == 8) ? be64(p) : (u64)be32(p);
- } else if (str_eq(pn, "bootargs")) {
+ if (str_eq(pn, "bootargs")) {
u32 i = 0;
while (i < len && i < 255) { out->bootargs[i] = (char)p[i]; i++; }
out->bootargs[i] = 0;
@@ -295,12 +321,19 @@ static void parse_dtb(const void *dtb, struct dtb_info *out) {
}
if (depth == 1) {
/* memory node is named "memory@<addr>" */
- if ((path[1][0] == 'm' && path[1][1] == 'e' && path[1][2] == 'm' &&
- path[1][3] == 'o' && path[1][4] == 'r' && path[1][5] == 'y') &&
+ if (str_starts(path[1], "memory") &&
str_eq(pn, "reg") && len >= 16 && out->mem_size == 0) {
out->mem_start = be64(p);
out->mem_size = be64(p + 8);
}
+ /* virtio-mmio nodes: capture each slot's PA. Root #address-
+ * cells/#size-cells are both 2 on QEMU virt → reg is 16 bytes
+ * (PA u64, size u64); we only need the PA. */
+ if (str_starts(path[1], "virtio_mmio@") &&
+ str_eq(pn, "reg") && len >= 16 &&
+ out->virtio_mmio_n < MAX_VIRTIO_MMIO) {
+ out->virtio_mmio_pa[out->virtio_mmio_n++] = be64(p);
+ }
}
p += len;
p = (const u8 *)(((u64)p + 3) & ~3UL);
@@ -315,6 +348,359 @@ static void parse_dtb(const void *dtb, struct dtb_info *out) {
}
}
+/* ─── virtio-blk-MMIO driver (polling, single-VQ) ───────────────────────── */
+/*
+ * Two block devices: blk0 = read-only cpio input, blk1 = read-write output.
+ * Identification is content-based (sector 0 cpio newc magic "070701" ⇒ blk0)
+ * so we don't depend on -drive ordering on the qemu command line.
+ *
+ * The driver is intentionally small: 8-entry split virtqueue, one in-flight
+ * request at a time, polling used.idx. No interrupts (DAIF stays masked).
+ *
+ * MMIO transport regs are reached via DEVICE_ALIAS_BASE + PA. virtqueue
+ * memory comes from kernel BSS (identity-mapped Normal, VA == PA).
+ */
+
+#define VIRTIO_MMIO_MAGIC 0x000
+#define VIRTIO_MMIO_VERSION 0x004
+#define VIRTIO_MMIO_DEVICE_ID 0x008
+#define VIRTIO_MMIO_DEV_FEATURES 0x010
+#define VIRTIO_MMIO_DEV_FEAT_SEL 0x014
+#define VIRTIO_MMIO_DRV_FEATURES 0x020
+#define VIRTIO_MMIO_DRV_FEAT_SEL 0x024
+#define VIRTIO_MMIO_QUEUE_SEL 0x030
+#define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034
+#define VIRTIO_MMIO_QUEUE_NUM 0x038
+#define VIRTIO_MMIO_QUEUE_READY 0x044
+#define VIRTIO_MMIO_QUEUE_NOTIFY 0x050
+#define VIRTIO_MMIO_INT_STATUS 0x060
+#define VIRTIO_MMIO_INT_ACK 0x064
+#define VIRTIO_MMIO_STATUS 0x070
+#define VIRTIO_MMIO_QUEUE_DESC_LO 0x080
+#define VIRTIO_MMIO_QUEUE_DESC_HI 0x084
+#define VIRTIO_MMIO_QUEUE_DRIVER_LO 0x090
+#define VIRTIO_MMIO_QUEUE_DRIVER_HI 0x094
+#define VIRTIO_MMIO_QUEUE_DEVICE_LO 0x0a0
+#define VIRTIO_MMIO_QUEUE_DEVICE_HI 0x0a4
+#define VIRTIO_MMIO_CONFIG 0x100
+
+#define VIRTIO_STATUS_ACKNOWLEDGE 1
+#define VIRTIO_STATUS_DRIVER 2
+#define VIRTIO_STATUS_DRIVER_OK 4
+#define VIRTIO_STATUS_FEATURES_OK 8
+#define VIRTIO_STATUS_FAILED 128
+
+#define VIRTIO_F_VERSION_1_BIT 32 /* bit 32 in feature space */
+
+#define VIRTIO_BLK_T_IN 0
+#define VIRTIO_BLK_T_OUT 1
+
+#define VRING_DESC_F_NEXT 1
+#define VRING_DESC_F_WRITE 2
+
+#define VQ_SIZE 8
+
+/* One descriptor-table entry of the split virtqueue. blk_request_one()
+ * fills three of these per request: header, data buffer, status byte. */
+struct vring_desc {
+ u64 addr; /* address of the buffer this descriptor points at */
+ u32 len; /* buffer length in bytes */
+ u16 flags; /* VRING_DESC_F_NEXT / VRING_DESC_F_WRITE bits */
+ u16 next; /* index of the following descriptor when F_NEXT is set */
+};
+
+/* Driver-written ring: blk_request_one() stores a chain's head descriptor
+ * index in ring[idx % VQ_SIZE] and then increments idx to publish it. */
+struct vring_avail {
+ u16 flags; /* left at 0 by the page-zeroing in blk_init_one() */
+ u16 idx; /* running count of published entries (wraps as u16) */
+ u16 ring[VQ_SIZE]; /* head descriptor indices */
+ u16 used_event; /* not referenced by the code in view */
+};
+
+/* Element type of vring_used.ring[]. The code in view never reads these
+ * fields; completion is detected purely via vring_used.idx. */
+struct vring_used_elem {
+ u32 id;
+ u32 len;
+};
+
+/* Ring at VQ_USED_OFF in the queue page, whose address is handed to the
+ * device through the QUEUE_DEVICE_LO/HI registers. blk_request_one() polls
+ * idx to detect that its single in-flight request has completed. */
+struct vring_used {
+ u16 flags;
+ u16 idx; /* polled by blk_request_one() until it moves past its old value */
+ struct vring_used_elem ring[VQ_SIZE];
+ u16 avail_event; /* not referenced by the code in view */
+};
+
+/* Request header placed in the first descriptor of every block request. */
+struct virtio_blk_req_hdr {
+ u32 type; /* VIRTIO_BLK_T_IN (read) or VIRTIO_BLK_T_OUT (write) */
+ u32 reserved; /* always written as 0 by blk_request_one() */
+ u64 sector; /* starting sector of the transfer */
+};
+
+#define BLK_DEV_MAX 2
+
+/* Per-device driver state; one slot in blk_devs[] per block device found. */
+struct blk_dev {
+ volatile u8 *regs; /* alias VA pointing at the MMIO region */
+ u64 capacity_sectors; /* read from the device config space in blk_init_one() */
+ int present; /* set to 1 once blk_init_one() has finished bring-up */
+};
+
+static struct blk_dev blk_devs[BLK_DEV_MAX];
+static int blk_n_devs = 0;
+
+/* One vring per device in BSS, 4 KB-aligned. Layout within the page:
+ * desc: offset 0 (128 B for VQ_SIZE=8)
+ * avail: offset 128 (22 B with VQ_SIZE=8 + used_event)
+ * used: offset 256 (72 B with VQ_SIZE=8 + avail_event)
+ * Plenty of slack inside one 4 KB page. */
+__attribute__((aligned(4096))) static u8 vq_pages[BLK_DEV_MAX][4096];
+
+#define VQ_DESC_OFF 0
+#define VQ_AVAIL_OFF 128
+#define VQ_USED_OFF 256
+
+/* Descriptor table of device i's queue page (located at VQ_DESC_OFF). */
+static struct vring_desc *vq_desc(int i) {
+    u8 *page = vq_pages[i];
+    return (struct vring_desc *)(page + VQ_DESC_OFF);
+}
+/* Available ring of device i's queue page (located at VQ_AVAIL_OFF). */
+static struct vring_avail *vq_avail(int i) {
+    u8 *page = vq_pages[i];
+    return (struct vring_avail *)(page + VQ_AVAIL_OFF);
+}
+/* Used ring of device i's queue page (located at VQ_USED_OFF). */
+static struct vring_used *vq_used(int i) {
+    u8 *page = vq_pages[i];
+    return (struct vring_used *)(page + VQ_USED_OFF);
+}
+
+/* MMIO accessors. Reg offsets are byte offsets per the spec. */
+/* Read the 32-bit transport register at byte offset `off` from d->regs. */
+static u32 mmio_r32(struct blk_dev *d, u32 off) {
+    volatile u32 *reg = (volatile u32 *)(d->regs + off);
+    return *reg;
+}
+/* Write `val` to the 32-bit transport register at byte offset `off`. */
+static void mmio_w32(struct blk_dev *d, u32 off, u32 val) {
+    volatile u32 *reg = (volatile u32 *)(d->regs + off);
+    *reg = val;
+}
+
+/* Set the FAILED bit in the device status register, keeping the bits that
+ * are already there, so the device is told the driver has given up on it. */
+static void blk_mark_failed(struct blk_dev *d) {
+    u32 cur = mmio_r32(d, VIRTIO_MMIO_STATUS);
+    mmio_w32(d, VIRTIO_MMIO_STATUS, cur | VIRTIO_STATUS_FAILED);
+}
+
+/* Initialise one device per spec §3.1.1 / §4.2 / §5.2.
+ *
+ * `d` must be an element of blk_devs[] with d->regs already pointing at the
+ * slot's MMIO window; its index selects the vq_pages[] page used for queue 0.
+ *
+ * Returns 1 on success (device is virtio-blk and ready; capacity_sectors
+ * and present are filled in), 0 if the slot is empty / not a block device,
+ * -1 on error. Errors detected after the driver has claimed the device
+ * (feature negotiation, queue sizing) also set the FAILED status bit. */
+static int blk_init_one(struct blk_dev *d) {
+    u32 magic = mmio_r32(d, VIRTIO_MMIO_MAGIC);
+    if (magic != 0x74726976) return 0;          /* not a virtio slot ("virt") */
+    u32 devid = mmio_r32(d, VIRTIO_MMIO_DEVICE_ID);
+    if (devid == 0) return 0;                   /* unpopulated slot */
+    if (devid != 2) return 0;                   /* not a block device */
+    u32 ver = mmio_r32(d, VIRTIO_MMIO_VERSION);
+    if (ver != 2) {
+        /* QEMU virt defaults to legacy (version 1) virtio-mmio transports
+         * unless the host passes -global virtio-mmio.force-legacy=false.
+         * The harness scripts set that flag — reaching here means it was
+         * forgotten. The device has not been touched yet, so no FAILED. */
+        uart_puts("[seed] virtio-mmio version != 2 (legacy): ");
+        uart_putd((i64)ver);
+        uart_puts(" — pass -global virtio-mmio.force-legacy=false\n");
+        return -1;
+    }
+
+    /* Reset, ack, driver. */
+    mmio_w32(d, VIRTIO_MMIO_STATUS, 0);
+    mmio_w32(d, VIRTIO_MMIO_STATUS, VIRTIO_STATUS_ACKNOWLEDGE);
+    mmio_w32(d, VIRTIO_MMIO_STATUS,
+             VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER);
+
+    /* Negotiate VIRTIO_F_VERSION_1 only (bit 32 → feature word 1). */
+    mmio_w32(d, VIRTIO_MMIO_DEV_FEAT_SEL, 1);
+    u32 dev_feat_hi = mmio_r32(d, VIRTIO_MMIO_DEV_FEATURES);
+    if (!(dev_feat_hi & (1u << (VIRTIO_F_VERSION_1_BIT - 32)))) {
+        uart_puts("[seed] virtio: device lacks VERSION_1\n");
+        blk_mark_failed(d);
+        return -1;
+    }
+    mmio_w32(d, VIRTIO_MMIO_DRV_FEAT_SEL, 0);
+    mmio_w32(d, VIRTIO_MMIO_DRV_FEATURES, 0);
+    mmio_w32(d, VIRTIO_MMIO_DRV_FEAT_SEL, 1);
+    mmio_w32(d, VIRTIO_MMIO_DRV_FEATURES, 1u << (VIRTIO_F_VERSION_1_BIT - 32));
+
+    /* The device signals acceptance by leaving FEATURES_OK set on read-back. */
+    mmio_w32(d, VIRTIO_MMIO_STATUS,
+             VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER |
+             VIRTIO_STATUS_FEATURES_OK);
+    u32 st = mmio_r32(d, VIRTIO_MMIO_STATUS);
+    if (!(st & VIRTIO_STATUS_FEATURES_OK)) {
+        uart_puts("[seed] virtio: FEATURES_OK rejected\n");
+        blk_mark_failed(d);
+        return -1;
+    }
+
+    /* Queue 0. */
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_SEL, 0);
+    u32 qmax = mmio_r32(d, VIRTIO_MMIO_QUEUE_NUM_MAX);
+    if (qmax < VQ_SIZE) {
+        uart_puts("[seed] virtio: QueueNumMax < VQ_SIZE\n");
+        blk_mark_failed(d);
+        return -1;
+    }
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_NUM, VQ_SIZE);
+
+    int i = (int)(d - blk_devs);
+    /* Zero the queue page so all idx/flags start at 0. */
+    for (int k = 0; k < 4096; k++) vq_pages[i][k] = 0;
+
+    /* The ring pointers double as physical addresses (VA == PA for BSS). */
+    u64 desc_pa  = (u64)vq_desc(i);
+    u64 avail_pa = (u64)vq_avail(i);
+    u64 used_pa  = (u64)vq_used(i);
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_DESC_LO,   (u32)desc_pa);
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_DESC_HI,   (u32)(desc_pa >> 32));
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_DRIVER_LO, (u32)avail_pa);
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_DRIVER_HI, (u32)(avail_pa >> 32));
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_DEVICE_LO, (u32)used_pa);
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_DEVICE_HI, (u32)(used_pa >> 32));
+
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_READY, 1);
+    mmio_w32(d, VIRTIO_MMIO_STATUS,
+             VIRTIO_STATUS_ACKNOWLEDGE | VIRTIO_STATUS_DRIVER |
+             VIRTIO_STATUS_FEATURES_OK | VIRTIO_STATUS_DRIVER_OK);
+
+    /* virtio-blk config @+0 = capacity (sectors, little-endian u64). */
+    u32 cap_lo = *(volatile u32 *)(d->regs + VIRTIO_MMIO_CONFIG + 0);
+    u32 cap_hi = *(volatile u32 *)(d->regs + VIRTIO_MMIO_CONFIG + 4);
+    d->capacity_sectors = ((u64)cap_hi << 32) | cap_lo;
+    d->present = 1;
+    return 1;
+}
+
+/* Issue one request (VIRTIO_BLK_T_IN or _OUT) and poll for completion.
+ *
+ *   devi    index into blk_devs[] / vq_pages[]
+ *   type    VIRTIO_BLK_T_IN (device fills buf) or VIRTIO_BLK_T_OUT
+ *   sector  first sector of the transfer
+ *   buf     data buffer; PA == VA (kheap or kernel BSS / stack)
+ *   nsec    sector count; callers keep it ≤ 2048 (1 MB/req)
+ *
+ * Returns 0 when the device reports status 0, -1 otherwise (including a
+ * request too large for the 32-bit descriptor length). Uses descriptors
+ * 0..2 of the device's queue, so only one request may be in flight. */
+static int blk_request_one(int devi, u32 type, u64 sector, void *buf, u64 nsec) {
+    struct blk_dev *d = &blk_devs[devi];
+
+    /* desc.len is 32 bits wide: refuse a byte count the cast below would
+     * silently truncate instead of transferring less than was asked for. */
+    if (nsec > 0xffffffffUL / 512) {
+        uart_puts("[seed] virtio-blk req too large\n");
+        return -1;
+    }
+
+    /* Per-call hdr/status — both reachable via VA==PA on the kernel stack. */
+    struct virtio_blk_req_hdr hdr = { .type = type, .reserved = 0, .sector = sector };
+    volatile u8 status = 0xff;   /* device overwrites this on completion */
+
+    struct vring_desc  *desc  = vq_desc(devi);
+    struct vring_avail *avail = vq_avail(devi);
+    struct vring_used  *used  = vq_used(devi);
+
+    desc[0].addr  = (u64)&hdr;
+    desc[0].len   = (u32)sizeof(hdr);
+    desc[0].flags = VRING_DESC_F_NEXT;
+    desc[0].next  = 1;
+
+    desc[1].addr  = (u64)buf;
+    desc[1].len   = (u32)(nsec * 512);
+    /* For READ (T_IN), device writes into our buffer (F_WRITE).
+     * For WRITE (T_OUT), device reads our buffer (no F_WRITE). */
+    desc[1].flags = VRING_DESC_F_NEXT | (type == VIRTIO_BLK_T_IN ? VRING_DESC_F_WRITE : 0);
+    desc[1].next  = 2;
+
+    desc[2].addr  = (u64)&status;
+    desc[2].len   = 1;
+    desc[2].flags = VRING_DESC_F_WRITE;
+    desc[2].next  = 0;
+
+    /* Publish the chain head, then bump avail.idx; the barriers keep the
+     * ring entry visible before the index and the index before the notify. */
+    u16 head = 0;
+    u16 ai = avail->idx;
+    avail->ring[ai % VQ_SIZE] = head;
+    arm64_barrier(BAR_DMB_ISHST);
+    avail->idx = ai + 1;
+    arm64_barrier(BAR_DMB_ISHST);
+
+    mmio_w32(d, VIRTIO_MMIO_QUEUE_NOTIFY, 0);
+
+    /* Poll used.idx — single in-flight, advances by exactly one. The field
+     * is written by the device behind the compiler's back, so read it
+     * through a volatile pointer to force a fresh load every iteration. */
+    volatile u16 *used_idx = &used->idx;
+    while (*used_idx == ai) {
+        cpu_pause(PAUSE_YIELD);
+    }
+    arm64_barrier(BAR_DMB_ISH);
+
+    /* Acknowledge any pending interrupt status (we don't service IRQs but
+     * the device sets these bits anyway). */
+    u32 is = mmio_r32(d, VIRTIO_MMIO_INT_STATUS);
+    if (is) mmio_w32(d, VIRTIO_MMIO_INT_ACK, is);
+
+    if (status != 0) {
+        uart_puts("[seed] virtio-blk req failed status="); uart_putd((i64)status);
+        uart_puts("\n");
+        return -1;
+    }
+    return 0;
+}
+
+/* Multi-chunk read/write. Chunks at 1 MB (2048 sectors) per request. */
+#define BLK_CHUNK_SECTORS 2048
+
+/* Transfer `nsec` sectors starting at `sector`, split into requests of at
+ * most BLK_CHUNK_SECTORS each. Returns 0 when every request succeeds, -1 as
+ * soon as one fails (chunks already transferred are not undone). */
+static int blk_io(int devi, u32 type, u64 sector, u8 *buf, u64 nsec) {
+    u64 left = nsec;
+    while (left != 0) {
+        u64 chunk = (left > BLK_CHUNK_SECTORS) ? BLK_CHUNK_SECTORS : left;
+        if (blk_request_one(devi, type, sector, buf, chunk) < 0) return -1;
+        sector += chunk;
+        buf    += chunk * 512;
+        left   -= chunk;
+    }
+    return 0;
+}
+
+static int blk_read(int devi, u64 sector, void *buf, u64 nsec) {
+ return blk_io(devi, VIRTIO_BLK_T_IN, sector, buf, nsec);
+}
+static int blk_write(int devi, u64 sector, const void *buf, u64 nsec) {
+ return blk_io(devi, VIRTIO_BLK_T_OUT, sector, (u8 *)buf, nsec);
+}
+
+/* Probe every populated MMIO slot the DTB advertised; bring up block
+ * devices; identify blk0 vs blk1 by reading sector 0 — the cpio newc
+ * magic ("070701") is on blk0, the other is blk1. Panics if exactly one
+ * of each isn't found. */
+static int g_blk_input = -1; /* index in blk_devs[] for cpio input */
+static int g_blk_output = -1; /* index for output dump */
+
+/* Bring up the virtio-blk devices among the MMIO slots listed in `dt` and
+ * decide which one is the cpio input (g_blk_input) and which the output
+ * (g_blk_output). Any failure prints a diagnostic and hangs. */
+static void blk_init(struct dtb_info *dt) {
+    int found = 0;
+    for (int slot = 0; slot < dt->virtio_mmio_n && found < BLK_DEV_MAX; slot++) {
+        u64 pa = dt->virtio_mmio_pa[slot];
+        /* Stage into blk_devs[found] so blk_init_one's index-derived
+         * vq page assignment is correct from the start. */
+        struct blk_dev *d = &blk_devs[found];
+        d->regs = (volatile u8 *)(DEVICE_ALIAS_BASE + pa);
+        d->capacity_sectors = 0;
+        d->present = 0;
+
+        int rc = blk_init_one(d);
+        if (rc < 0) {
+            uart_puts("[seed] virtio: init failed at PA="); uart_putx(pa);
+            uart_puts("\n");
+            hang();
+        }
+        if (rc > 0) found++;   /* rc == 0: not a block device, reuse the slot */
+    }
+    blk_n_devs = found;
+    if (found != 2) {
+        uart_puts("[seed] virtio-blk: expected 2 block devices, got ");
+        uart_putd((i64)found); uart_puts("\n");
+        hang();
+    }
+
+    /* Identify blk0 (cpio) vs blk1 (output) by reading sector 0. */
+    __attribute__((aligned(16))) static u8 probe[512];
+    static const char cpio_magic[6] = { '0', '7', '0', '7', '0', '1' };
+    for (int dev = 0; dev < found; dev++) {
+        if (blk_read(dev, 0, probe, 1) < 0) {
+            uart_puts("[seed] virtio-blk: probe read failed dev=");
+            uart_putd((i64)dev); uart_puts("\n");
+            hang();
+        }
+
+        int is_cpio = 1;
+        for (int k = 0; k < 6; k++) {
+            if (probe[k] != (u8)cpio_magic[k]) { is_cpio = 0; break; }
+        }
+
+        /* Each role may be claimed by exactly one device. */
+        int *role = is_cpio ? &g_blk_input : &g_blk_output;
+        if (*role >= 0) {
+            uart_puts(is_cpio ? "[seed] virtio-blk: multiple cpio disks\n"
+                              : "[seed] virtio-blk: multiple non-cpio disks\n");
+            hang();
+        }
+        *role = dev;
+    }
+    if (g_blk_input < 0 || g_blk_output < 0) {
+        uart_puts("[seed] virtio-blk: failed to identify in/out\n");
+        hang();
+    }
+
+    uart_puts("[seed] virtio-blk: in=dev");
+    uart_putd((i64)g_blk_input);
+    uart_puts(" cap=");
+    uart_putd((i64)blk_devs[g_blk_input].capacity_sectors);
+    uart_puts(" sec out=dev");
+    uart_putd((i64)g_blk_output);
+    uart_puts(" cap=");
+    uart_putd((i64)blk_devs[g_blk_output].capacity_sectors);
+    uart_puts(" sec\n");
+}
+
/* ─── In-memory tmpfs from cpio newc ────────────────────────────────────── */
/* boot5 stages a full musl tree in the cpio (~1300 .c sources + ~1200
@@ -517,10 +903,10 @@ static u64 load_elf(const u8 *elf) {
/* Round up to 16 bytes so callers can use it directly as brk_base. */
g_user_image_end = (hi + 15) & ~15UL;
/* I-cache sync (cheap insurance even with caches off). */
- asm volatile("dsb sy" ::: "memory");
- asm volatile("ic iallu" ::: "memory");
- asm volatile("dsb sy" ::: "memory");
- asm volatile("isb");
+ arm64_barrier(BAR_DSB_SY);
+ arm64_ic_iallu();
+ arm64_barrier(BAR_DSB_SY);
+ arm64_barrier(BAR_ISB);
return eh->e_entry;
}
@@ -757,10 +1143,10 @@ static void swap_user_pool(int which) {
for (int i = USER_POOL_FIRST_SLOT; i <= USER_POOL_LAST_SLOT; i++) {
l2_user[i] = (base + (u64)(i - USER_POOL_FIRST_SLOT) * 0x200000UL) | normal;
}
- asm volatile("dsb ish" ::: "memory");
- asm volatile("tlbi vmalle1");
- asm volatile("dsb ish" ::: "memory");
- asm volatile("isb");
+ arm64_barrier(BAR_DSB_ISH);
+ arm64_tlbi_vmalle1();
+ arm64_barrier(BAR_DSB_ISH);
+ arm64_barrier(BAR_ISB);
current_pool = which;
}
@@ -839,7 +1225,7 @@ static i64 sys_spawn(struct trapframe *tf, const char *path, char **argv) {
for (int i = 0; i < 31; i++) p->regs[i] = tf->x[i];
p->elr = tf->elr;
p->spsr = tf->spsr;
- asm volatile("mrs %0, sp_el0" : "=r"(p->sp_el0));
+ p->sp_el0 = sysreg_read(SR_SP_EL0);
p->brk_base_save = brk_base;
p->brk_cur_save = brk_cur;
for (int i = 0; i < MAX_FD; i++) p->fdtab_save[i] = fdtab[i];
@@ -880,7 +1266,7 @@ static i64 sys_spawn(struct trapframe *tf, const char *path, char **argv) {
tf->elr = entry;
/* sp_el0 isn't in the trap frame — set it directly; it survives
* until eret since the kernel uses SP_ELx while in trap_sync. */
- asm volatile("msr sp_el0, %0" :: "r"(new_sp));
+ sysreg_write(SR_SP_EL0, new_sp);
/* Returning 0; dispatcher writes tf->x[0] = 0. The child's _start
* reads argc/argv from the stack, so x[0] is don't-care. */
return 0;
@@ -908,46 +1294,122 @@ static i64 sys_waitid(struct trapframe *tf, int idtype, u64 id,
static int g_exit_code = 0;
static int g_exited = 0;
-/* Dump every file in the tmpfs to UART, hex-encoded, framed by sentinels
- * a host-side extractor can scan for. The chain's verification harness
- * (qemu-host wrapper) parses this to recover output ELFs etc. without
- * needing virtio-9p — flat tmpfs over UART is enough for boot2's
- * file-only IPC. Dump only happens when a "dumpfs" token is present in
- * /chosen/bootargs; the hello.c demo runs without it and stays quiet. */
-static int g_dumpfs = 0;
+/* On-disk dump format on blk1 (SEEDFS, sector-aligned, little-endian):
+ *
+ * sector 0: struct seedfs_hdr {
+ * char magic[8] = "SEEDFS\0\0";
+ * u32 nfiles;
+ * u32 reserved;
+ * }; (16 B; rest of sector zero-padded)
+ * sector 1..T: nfiles directory entries packed back-to-back at a 112-byte
+ * stride (an entry may straddle a sector boundary; at most 4 start per sector):
+ * struct seedfs_ent {
+ * char path[96];
+ * u32 data_offset_sectors;
+ * u32 _pad;
+ * u64 size_bytes;
+ * }; (112 B; T = ceil(nfiles/4))
+ * sector T+1..: file data, each file padded up to a 512-byte boundary.
+ *
+ * The host-side extractor (extract-blk.sh) walks the table and writes
+ * each file out by data_offset_sectors / size_bytes.
+ *
+ * Runs unconditionally on user exit. If the user code never reached exit
+ * (kernel panic, hang, etc.) the host extractor sees no SEEDFS magic at
+ * sector 0 and reports the missing-exit failure mode. */
-static void uart_putc_hex(u8 b) {
- static const char hex[] = "0123456789abcdef";
- uart_putc(hex[b >> 4]);
- uart_putc(hex[b & 0xf]);
-}
+#define SEEDFS_ENT_SZ 112
+
+/* First 16 bytes of sector 0 of the output image. dump_tmpfs_blk() writes
+ * it last, after all file data, together with the directory table. */
+struct seedfs_hdr {
+ char magic[8]; /* "SEEDFS" followed by two NUL bytes */
+ u32 nfiles; /* number of seedfs_ent records that follow from sector 1 */
+ u32 reserved; /* written as 0 */
+};
+
+/* One directory record (SEEDFS_ENT_SZ = 112 bytes) describing a dumped file. */
+struct seedfs_ent {
+ char path[96]; /* NUL-terminated; dump_tmpfs_blk() truncates to 95 chars */
+ u32 data_offset_sectors; /* sector index where the file's data starts */
+ u32 _pad; /* left zero by the buffer clear in dump_tmpfs_blk() */
+ u64 size_bytes; /* exact file length; data is padded to a sector */
+};
+
+/* Scratch sector for trailing-byte padding of files whose size isn't a
+ * multiple of 512. Single 512-byte buffer is enough — we serialise file
+ * writes and rezero before each use. */
+__attribute__((aligned(16))) static u8 dump_tail_sector[512];
+
+/* Serialise every used tmpfs file to the output block device in SEEDFS
+ * layout: file data first (from sector 1 + table size onward), then the
+ * header + directory table at sector 0.
+ *
+ * The header is written last and only if every data write succeeded, so an
+ * image carrying the SEEDFS magic always describes data that reached the
+ * device. On a capacity shortfall or a write error the dump is abandoned
+ * with a UART diagnostic and sector 0 is left untouched. */
+static void dump_tmpfs_blk(void) {
+    /* Count active files. */
+    u32 nfiles = 0;
+    for (int i = 0; i < MAX_FILES; i++) if (files[i].used) nfiles++;
+
+    u32 table_sectors = (nfiles + 3) / 4;
+    u32 hdr_sectors   = 1 + table_sectors;
+    u64 hdr_bytes     = (u64)hdr_sectors * 512;
+
+    /* Output device capacity guard — we don't grow blk1, the host pre-
+     * sized it (256 MB by default). Refuse the dump if it would exceed.
+     * Checked for the header/table here and per file below. */
+    u64 out_cap = blk_devs[g_blk_output].capacity_sectors;
+    if ((u64)hdr_sectors > out_cap) {
+        uart_puts("[seed] dump: out.img too small for tmpfs\n");
+        return;
+    }
+
+    u8 *hdr_buf = kalloc(hdr_bytes);
+    for (u64 i = 0; i < hdr_bytes; i++) hdr_buf[i] = 0;
+
+    struct seedfs_hdr *hdr = (struct seedfs_hdr *)hdr_buf;
+    hdr->magic[0]='S'; hdr->magic[1]='E'; hdr->magic[2]='E';
+    hdr->magic[3]='D'; hdr->magic[4]='F'; hdr->magic[5]='S';
+    hdr->magic[6]=0;   hdr->magic[7]=0;
+    hdr->nfiles   = nfiles;
+    hdr->reserved = 0;
+
+    /* Walk files: fill table entries, write data sectors, advance cursor. */
+    u32 ent_idx = 0;
+    u64 cursor  = (u64)hdr_sectors;
+
+    for (int i = 0; i < MAX_FILES; i++) {
+        if (!files[i].used) continue;
+        /* Entries are packed back-to-back right after sector 0. */
+        struct seedfs_ent *e = (struct seedfs_ent *)(hdr_buf + 512 +
+                                (u64)ent_idx * SEEDFS_ENT_SZ);
+        int j = 0;
+        while (files[i].path[j] && j < (int)sizeof(e->path) - 1) {
+            e->path[j] = files[i].path[j]; j++;
+        }
+        e->path[j] = 0;
+        e->data_offset_sectors = (u32)cursor;
+        e->size_bytes = files[i].len;
+
+        u64 nsec_full = files[i].len / 512;
+        u64 rem  = files[i].len - nsec_full * 512;
+        u64 need = nsec_full + (rem ? 1 : 0);
+        if (cursor + need > out_cap) {
+            uart_puts("[seed] dump: out.img too small for tmpfs\n");
+            return;
+        }
+        /* Whole sectors go straight from the file's own buffer. */
+        if (nsec_full &&
+            blk_write(g_blk_output, cursor, files[i].data, nsec_full) < 0) {
+            uart_puts("[seed] dump: data write failed, header not written\n");
+            return;
+        }
+        cursor += nsec_full;
+        if (rem) {
+            /* Trailing partial sector: stage it zero-padded in the scratch
+             * buffer so no bytes past the end of the file reach the disk. */
+            for (int k = 0; k < 512; k++) dump_tail_sector[k] = 0;
+            for (u64 k = 0; k < rem; k++)
+                dump_tail_sector[k] = files[i].data[nsec_full * 512 + k];
+            if (blk_write(g_blk_output, cursor, dump_tail_sector, 1) < 0) {
+                uart_puts("[seed] dump: data write failed, header not written\n");
+                return;
+            }
+            cursor++;
+        }
+        ent_idx++;
+    }
+
+    if (blk_write(g_blk_output, 0, hdr_buf, hdr_sectors) < 0) {
+        uart_puts("[seed] dump: header write failed\n");
+        return;
+    }
+    uart_puts("[seed] dump: nfiles="); uart_putd((i64)nfiles);
+    uart_puts(" cursor="); uart_putd((i64)cursor);
+    uart_puts(" sectors\n");
+}
static void sys_exit_final(int code) {
g_exit_code = code;
g_exited = 1;
- if (g_dumpfs) dump_tmpfs();
+ dump_tmpfs_blk();
uart_puts("\n[seed] user exit_group("); uart_putd(code); uart_puts(")\n");
/* Try PSCI SYSTEM_OFF so QEMU exits cleanly; fall back to spin. */
- register u64 x0 asm("x0") = 0x84000008;
- asm volatile("hvc #0" : "+r"(x0));
- register u64 x0s asm("x0") = 0x84000008;
- asm volatile("smc #0" : "+r"(x0s));
- for (;;) asm volatile("wfi");
+ arm64_psci_call(0 /*HVC*/, 0x84000008);
+ arm64_psci_call(1 /*SMC*/, 0x84000008);
+ for (;;) cpu_pause(PAUSE_WFI);
}
/* Dispatcher-side exit_group: pops proc_stack and resumes the parent's
@@ -975,7 +1437,7 @@ static int sys_exit_or_resume_parent(struct trapframe *tf, int code) {
for (int i = 0; i < 31; i++) tf->x[i] = p->regs[i];
tf->elr = p->elr;
tf->spsr = p->spsr;
- asm volatile("msr sp_el0, %0" :: "r"(p->sp_el0));
+ sysreg_write(SR_SP_EL0, p->sp_el0);
/* I-cache invalidation. The parent's pool was never written, so
* its instruction bytes (in DRAM) are byte-identical to what was
* originally fetched. But the same user VAs were just used to
@@ -983,9 +1445,9 @@ static int sys_exit_or_resume_parent(struct trapframe *tf, int code) {
* aarch64 I-caches may hold lines tagged by VA whose translation
* just changed. `ic iallu` invalidates by VA so subsequent fetches
* miss and re-walk through the freshly-swapped L2. */
- asm volatile("ic iallu" ::: "memory");
- asm volatile("dsb ish" ::: "memory");
- asm volatile("isb");
+ arm64_ic_iallu();
+ arm64_barrier(BAR_DSB_ISH);
+ arm64_barrier(BAR_ISB);
return (int)p->child_pid; /* >0: tells dispatcher to write this as r */
}
sys_exit_final(code);
@@ -1036,25 +1498,25 @@ i64 trap_sync(u64 esr, struct trapframe *tf) {
uart_puts("[seed] PANIC: user sync, ESR="); uart_putx(esr);
uart_puts(" ELR="); uart_putx(tf->elr);
uart_puts(" FAR=");
- u64 far; asm volatile("mrs %0, far_el1" : "=r"(far)); uart_putx(far);
+ u64 far = sysreg_read(SR_FAR_EL1); uart_putx(far);
uart_puts("\n");
- for (;;) asm volatile("wfe");
+ hang();
}
void trap_kernel(u64 esr, struct trapframe *tf) {
- u64 far; asm volatile("mrs %0, far_el1" : "=r"(far));
+ u64 far = sysreg_read(SR_FAR_EL1);
uart_puts("[seed] PANIC: kernel sync, ESR="); uart_putx(esr);
uart_puts(" ELR="); uart_putx(tf->elr);
uart_puts(" FAR="); uart_putx(far);
uart_puts("\n");
- for (;;) asm volatile("wfe");
+ hang();
}
void trap_unhandled(u64 esr, struct trapframe *tf) {
uart_puts("[seed] PANIC: unhandled exception, ESR="); uart_putx(esr);
uart_puts(" ELR="); uart_putx(tf->elr);
uart_puts("\n");
- for (;;) asm volatile("wfe");
+ hang();
}
/* ─── User stack setup + entry ──────────────────────────────────────────── */
@@ -1117,10 +1579,11 @@ void kmain(u64 dtb_phys) {
setup_mmu();
/* Bring up heap immediately — placed at a 16MB-aligned offset above
- * our image, well clear of BSS/stack and of QEMU's initrd placement. */
+ * our image, well clear of BSS/stack. Without -initrd reserving the
+ * 0x44000000–0x4b000000 region, the full 176 MB is ours from boot. */
u64 image_end = (u64)_end;
kheap_ptr = (u8 *)((image_end + 0xfffful) & ~0xfffful);
- kheap_end = (u8 *)0x44000000UL; /* 64MB of heap, plenty */
+ kheap_end = (u8 *)0x4b000000UL; /* 176MB of heap */
uart_puts("\n[seed] arm64 boot, x0/dtb="); uart_putx(dtb_phys); uart_puts("\n");
@@ -1128,16 +1591,29 @@ void kmain(u64 dtb_phys) {
parse_dtb((const void *)dtb_phys, &dt);
uart_puts("[seed] mem "); uart_putx(dt.mem_start);
uart_puts(" + "); uart_putx(dt.mem_size); uart_puts("\n");
- uart_puts("[seed] initrd "); uart_putx(dt.initrd_start);
- uart_puts(" .. "); uart_putx(dt.initrd_end); uart_puts("\n");
+ uart_puts("[seed] virtio-mmio slots="); uart_putd((i64)dt.virtio_mmio_n);
+ uart_puts("\n");
if (dt.bootargs[0]) { uart_puts("[seed] bootargs: "); uart_puts(dt.bootargs); uart_puts("\n"); }
- if (dt.initrd_start == 0 || dt.initrd_end <= dt.initrd_start) {
- uart_puts("[seed] no initrd, halting\n");
- for (;;) asm volatile("wfe");
+ /* Bring up virtio-blk: identifies blk0 (cpio) and blk1 (output). */
+ blk_init(&dt);
+
+ /* Reserve the cpio buffer at the top of kheap so we can release it
+ * after parse_cpio. parse_cpio kallocs each file's data below cpio_buf;
+ * once it returns, the cpio buffer's bytes are dead and we can let
+ * subsequent kallocs use that space. */
+ u64 in_cap_sec = blk_devs[g_blk_input].capacity_sectors;
+ u64 in_cap_bytes = in_cap_sec * 512;
+ u64 in_cap_aln = (in_cap_bytes + 0xfffUL) & ~0xfffUL;
+ u8 *cpio_buf = (u8 *)((u64)kheap_end - in_cap_aln);
+ u8 *kheap_end_full = kheap_end;
+ kheap_end = cpio_buf;
+
+ if (blk_read(g_blk_input, 0, cpio_buf, in_cap_sec) < 0) {
+ uart_puts("[seed] cpio read failed\n");
+ hang();
}
-
- parse_cpio((const void *)dt.initrd_start, dt.initrd_end - dt.initrd_start);
+ parse_cpio(cpio_buf, in_cap_bytes);
uart_puts("[seed] tmpfs:\n");
for (int i = 0; i < MAX_FILES; i++) {
if (!files[i].used) continue;
@@ -1146,15 +1622,15 @@ void kmain(u64 dtb_phys) {
}
int init_idx = find_file("init");
- if (init_idx < 0) { uart_puts("[seed] no /init in initrd, halting\n"); for(;;) asm volatile("wfe"); }
+ if (init_idx < 0) { uart_puts("[seed] no /init in initrd, halting\n"); hang(); }
u64 entry = load_elf(files[init_idx].data);
- if (!entry) { uart_puts("[seed] load_elf failed\n"); for(;;) asm volatile("wfe"); }
+ if (!entry) { uart_puts("[seed] load_elf failed\n"); hang(); }
uart_puts("[seed] /init e_entry="); uart_putx(entry); uart_puts("\n");
- /* parse_cpio + load_elf are done — original initrd memory is dead.
- * Bump kheap_end to reclaim it for tmpfs file growth via sys_write. */
- kheap_end = (u8 *)0x4b000000UL;
+ /* parse_cpio + load_elf are done — cpio buffer's bytes are dead.
+ * Release the reserved tail of kheap for tmpfs file growth. */
+ kheap_end = kheap_end_full;
/* User runs in the L2-mapped low-VA window (USER_VA_LO..USER_VA_HI,
* physically backed by pool A initially). Stack grows down from the top
@@ -1171,11 +1647,7 @@ void kmain(u64 dtb_phys) {
* 2. /init.argv from the initramfs (one arg per line).
* 3. Fallback: argc=1, argv[0]="init".
* In all three cases, argv passed to user is exactly what the source
- * provided — no implicit argv[0]="init" prefix.
- *
- * The seed kernel reserves one bootargs token: "dumpfs". When present,
- * it is stripped from argv and triggers a hex-encoded dump of the
- * full tmpfs over UART on exit (sentinel-framed for host extraction). */
+ * provided — no implicit argv[0]="init" prefix. */
static char argv_pool[512];
char *uargv[MAX_ARGV];
int uargc = 0;
@@ -1186,12 +1658,7 @@ void kmain(u64 dtb_phys) {
argv_pool[n] = dt.bootargs[n]; n++;
}
argv_pool[n] = 0;
- char *raw[MAX_ARGV];
- int rawc = tokenise(argv_pool, raw, MAX_ARGV);
- for (int i = 0; i < rawc; i++) {
- if (str_eq(raw[i], "dumpfs")) { g_dumpfs = 1; continue; }
- uargv[uargc++] = raw[i];
- }
+ uargc = tokenise(argv_pool, uargv, MAX_ARGV);
}
if (uargc == 0) {
int aidx = find_file("init.argv");
diff --git a/seed-kernel/run.sh b/seed-kernel/run.sh
@@ -1,5 +1,5 @@
#!/bin/sh
-# Boot the seed kernel + initramfs in QEMU.
+# Boot the seed kernel + virtio-blk input/output disks in QEMU.
#
# Usage: ./run.sh [extra qemu args...]
@@ -7,10 +7,16 @@ set -eu
cd "$(dirname "$0")"
KERNEL=build/Image
-INITRD=build/initramfs.cpio
+IN_IMG=build/in.img
+OUT_IMG=build/out.img
[ -f "$KERNEL" ] || { echo "missing $KERNEL — run 'make' first"; exit 1; }
-[ -f "$INITRD" ] || { echo "missing $INITRD — run 'make' first"; exit 1; }
+[ -f "$IN_IMG" ] || { echo "missing $IN_IMG — run 'make' first"; exit 1; }
+
+# Pre-allocate a fresh 256 MiB output image (sparse) for each run; the
+# kernel writes a SEEDFS header at sector 0 unconditionally on exit.
+rm -f "$OUT_IMG"
+truncate -s 256M "$OUT_IMG"
exec qemu-system-aarch64 \
-machine virt,gic-version=3,accel=hvf \
@@ -18,6 +24,10 @@ exec qemu-system-aarch64 \
-m 2048M \
-nographic \
-no-reboot \
+ -global virtio-mmio.force-legacy=false \
-kernel "$KERNEL" \
- -initrd "$INITRD" \
+ -drive file="$IN_IMG",if=none,format=raw,id=hd0,readonly=on \
+ -device virtio-blk-device,drive=hd0 \
+ -drive file="$OUT_IMG",if=none,format=raw,id=hd1 \
+ -device virtio-blk-device,drive=hd1 \
"$@"
diff --git a/seed-kernel/scripts/extract-blk.sh b/seed-kernel/scripts/extract-blk.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""extract-blk — extract files from a SEEDFS image written to blk1 by the
+seed kernel.
+
+Layout (sector-aligned, little-endian):
+ sector 0: char magic[8] = "SEEDFS\0\0"; u32 nfiles; u32 reserved;
+ sector 1..T: nfiles * { char path[96]; u32 data_offset_sectors;
+ u32 _pad; u64 size_bytes; }
+ (entry size 112 B, 4 entries/sector → T=ceil(nfiles/4))
+ sector T+1..: file data, each padded up to a 512-byte boundary.
+
+Usage: extract-blk.sh <outdir> <out.img>
+"""
+
+import os
+import struct
+import sys
+
+ENT_SIZE = 112
+HDR_FMT = "<8sII" # magic[8], nfiles u32, reserved u32
+ENT_FMT = "<96sIIQ" # path[96], data_offset u32, _pad u32, size u64
+
+
+def fail(msg: str) -> None:
+ print(f"extract-blk: {msg}", file=sys.stderr)
+ sys.exit(3)
+
+
+def main() -> int:
+ if len(sys.argv) != 3:
+ print("usage: extract-blk.sh <outdir> <out.img>", file=sys.stderr)
+ return 2
+
+ outdir, img_path = sys.argv[1], sys.argv[2]
+ if not os.path.isfile(img_path):
+ fail(f"missing {img_path}")
+ os.makedirs(outdir, exist_ok=True)
+
+ with open(img_path, "rb") as f:
+ hdr = f.read(16)
+ if len(hdr) < 16:
+ fail("image too small for header")
+ magic, nfiles, _reserved = struct.unpack(HDR_FMT, hdr)
+ if magic != b"SEEDFS\0\0":
+ fail(f"bad magic at sector 0 (got {magic!r}) — kernel didn't reach exit")
+ if nfiles < 0:
+ fail(f"bad nfiles={nfiles}")
+
+ table_sectors = (nfiles + 3) // 4
+ f.seek(512)
+ table = f.read(table_sectors * 512)
+ if len(table) < nfiles * ENT_SIZE:
+ fail("image truncated mid-table")
+
+ for i in range(nfiles):
+ path_bytes, off_sectors, _pad, size_bytes = struct.unpack_from(
+ ENT_FMT, table, i * ENT_SIZE
+ )
+ path = path_bytes.split(b"\0", 1)[0].decode("utf-8", "replace")
+ if not path:
+ fail(f"empty path at entry {i}")
+
+ out = os.path.join(outdir, path)
+ os.makedirs(os.path.dirname(out) or ".", exist_ok=True)
+
+ if size_bytes == 0:
+ open(out, "wb").close()
+ continue
+
+ f.seek(off_sectors * 512)
+ remaining = size_bytes
+ with open(out, "wb") as g:
+ while remaining > 0:
+ chunk = f.read(min(remaining, 1 << 20))
+ if not chunk:
+ fail(f"image truncated reading {path}")
+ g.write(chunk)
+ remaining -= len(chunk)
+
+ print(f"extract-blk: wrote {nfiles} file(s) to {outdir}", file=sys.stderr)
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/seed-kernel/scripts/extract-dump.sh b/seed-kernel/scripts/extract-dump.sh
@@ -1,57 +0,0 @@
-#!/bin/sh
-# Extract files from a seed-kernel UART transcript that was produced with
-# the "dumpfs" bootargs token. Reads transcript from stdin (or $1), writes
-# each dumped file to <outdir>/<path>. Header format emitted by kernel.c
-# dump_tmpfs():
-#
-# === DUMP-BEGIN ===
-# === FILE path=<name> size=<N> ===
-# <2*N hex chars><LF>
-# ... repeat ...
-# === DUMP-END ===
-#
-# Anything before DUMP-BEGIN or after DUMP-END is ignored.
-#
-# Usage: extract-dump.sh <outdir> [transcript]
-
-set -eu
-
-[ $# -ge 1 ] || { echo "usage: $0 <outdir> [transcript]"; exit 2; }
-
-outdir=$1
-shift
-mkdir -p "$outdir"
-
-if [ $# -ge 1 ]; then
- src=$(cat "$1")
-else
- src=$(cat)
-fi
-
-# Strip CRs that QEMU's nographic UART likes to emit.
-src=$(printf '%s' "$src" | tr -d '\r')
-
-awk -v outdir="$outdir" '
-/^=== DUMP-BEGIN ===$/ { in_dump = 1; next }
-/^=== DUMP-END ===$/ { in_dump = 0; next }
-in_dump && /^=== FILE path=/ {
- sub(/^=== FILE path=/, "")
- sub(/ ===$/, "")
- n = split($0, kv, " size=")
- path = kv[1]
- size = kv[2]+0
- out = outdir "/" path
- # Make any parent dirs (tmpfs is flat but be safe).
- cmd = "mkdir -p \"$(dirname \"" out "\")\""; system(cmd); close(cmd)
- print "extract: " path " (" size " bytes) -> " out > "/dev/stderr"
- # Hex payload is one (potentially many-MB) line; pipe it directly
- # into xxd. Avoids ARG_MAX limits that bite for files >~500 KB.
- getline hex
- decode_cmd = "xxd -r -p > \"" out "\""
- print hex | decode_cmd
- close(decode_cmd)
- next
-}
-' <<EOF
-$src
-EOF
diff --git a/seed-kernel/scripts/tier1-gate.sh b/seed-kernel/scripts/tier1-gate.sh
@@ -7,11 +7,11 @@
# Usage:
# tier1-gate.sh <stage-binary> <output-dir> -- <argv...> -- <input-files...>
#
-# Builds an initramfs containing the stage binary as /init plus every
-# file from <input-files...> at its basename, runs qemu, and extracts
-# every file in the post-run tmpfs into <output-dir>/. The driver
-# passes <argv...> verbatim through /chosen/bootargs (with a "dumpfs"
-# token appended to trigger the UART dump on exit).
+# Builds an in.img cpio archive containing the stage binary as /init plus
+# every file from <input-files...> at its basename, runs qemu, and
+# extracts every file from the post-run SEEDFS image (out.img) into
+# <output-dir>/. The driver passes <argv...> verbatim through
+# /chosen/bootargs.
#
# Example: run boot0 catm to concatenate a + b into out.
# tier1-gate.sh build/aarch64/boot0/catm /tmp/out \
@@ -41,14 +41,14 @@ shift
HERE=$(cd "$(dirname "$0")" && pwd)
SEED_DIR=$(cd "$HERE/.." && pwd)
KERNEL=$SEED_DIR/build/Image
-EXTRACT=$HERE/extract-dump.sh
+EXTRACT=$HERE/extract-blk.sh
[ -f "$KERNEL" ] || { echo "missing $KERNEL — run 'make' in $SEED_DIR first" >&2; exit 1; }
[ -x "$EXTRACT" ] || { echo "missing $EXTRACT" >&2; exit 1; }
mkdir -p "$OUTDIR"
-# Stage initramfs.
+# Stage cpio + in.img + out.img.
STAGE=$(mktemp -d -t tier1-stage.XXXXXX)
trap 'rm -rf "$STAGE"' EXIT
@@ -61,17 +61,29 @@ for inp in "$@"; do
NAMES="$NAMES
$base"
done
-INITRAMFS=$STAGE/initramfs.cpio
( cd "$STAGE" && printf '%s\n' "$NAMES" | cpio -o -H newc 2>/dev/null > initramfs.cpio )
+# Pad cpio up to a 512-byte multiple so virtio-blk sees whole sectors.
+sz=$(wc -c < "$STAGE/initramfs.cpio")
+pad=$(( (512 - sz % 512) % 512 ))
+if [ "$pad" -gt 0 ]; then
+ head -c "$pad" /dev/zero >> "$STAGE/initramfs.cpio"
+fi
+mv "$STAGE/initramfs.cpio" "$STAGE/in.img"
+truncate -s 256M "$STAGE/out.img"
# Run qemu, capture transcript, extract.
TRANSCRIPT=$STAGE/transcript.txt
-echo "[gate] running stage with argv: $ARGV dumpfs" >&2
+echo "[gate] running stage with argv: $ARGV" >&2
qemu-system-aarch64 \
-machine virt,gic-version=3,accel=hvf -cpu host -m 2048M \
-nographic -no-reboot \
- -kernel "$KERNEL" -initrd "$INITRAMFS" \
- -append "$ARGV dumpfs" \
+ -global virtio-mmio.force-legacy=false \
+ -kernel "$KERNEL" \
+ -drive file="$STAGE/in.img",if=none,format=raw,id=hd0,readonly=on \
+ -device virtio-blk-device,drive=hd0 \
+ -drive file="$STAGE/out.img",if=none,format=raw,id=hd1 \
+ -device virtio-blk-device,drive=hd1 \
+ -append "$ARGV" \
> "$TRANSCRIPT" 2>&1 &
QPID=$!
# Bound the run; the seed kernel ends with PSCI SYSTEM_OFF on exit,
@@ -81,11 +93,9 @@ WATCHER=$!
wait $QPID 2>/dev/null || true
kill $WATCHER 2>/dev/null || true
-if ! grep -q '=== DUMP-END ===' "$TRANSCRIPT"; then
- echo "[gate] FAIL: no DUMP-END in transcript" >&2
+"$EXTRACT" "$OUTDIR" "$STAGE/out.img" >/dev/null || {  # stderr kept: extract-blk's diagnostic names the failure
+ echo "[gate] FAIL: extract-blk failed (kernel didn't reach exit?)" >&2
tail -40 "$TRANSCRIPT" >&2
exit 3
-fi
-
-"$EXTRACT" "$OUTDIR" "$TRANSCRIPT"
+}
echo "[gate] extracted to $OUTDIR" >&2
diff --git a/seed-kernel/scripts/tier2-gate.sh b/seed-kernel/scripts/tier2-gate.sh
@@ -8,14 +8,14 @@
# <child-bin> <output-dir> -- <input-files...>
#
# Stages: combines prelude.scm + driver.scm into combined.scm via host
-# `cat`, packs an initramfs containing /init=scheme1, /combined.scm,
-# /child-prog=<child-bin>, plus every input file at its basename, then
-# boots qemu with bootargs "init combined.scm dumpfs". driver.scm is
-# expected to use prelude's (run "child-prog" ...) wrapper.
+# `cat`, packs an in.img cpio archive containing /init=scheme1,
+# /combined.scm, /child-prog=<child-bin>, plus every input file at its
+# basename, then boots qemu with bootargs "init combined.scm". driver.scm
+# is expected to use prelude's (run "child-prog" ...) wrapper.
#
-# After qemu exits, every file in the post-run tmpfs is extracted into
-# <output-dir>/. The driver's exit status is reflected in this script's
-# exit status (0 = scheme1 driver said success).
+# After qemu exits, every file from the post-run SEEDFS dump (out.img) is
+# extracted into <output-dir>/. The driver's exit status is reflected in
+# this script's exit status (0 = scheme1 driver said success).
set -eu
@@ -32,7 +32,7 @@ shift
HERE=$(cd "$(dirname "$0")" && pwd)
SEED_DIR=$(cd "$HERE/.." && pwd)
KERNEL=$SEED_DIR/build/Image
-EXTRACT=$HERE/extract-dump.sh
+EXTRACT=$HERE/extract-blk.sh
[ -f "$KERNEL" ] || { echo "missing $KERNEL — run 'make' in $SEED_DIR first" >&2; exit 1; }
mkdir -p "$OUTDIR"
@@ -52,14 +52,26 @@ for inp in "$@"; do
$base"
done
( cd "$STAGE" && printf '%s\n' "$NAMES" | cpio -o -H newc 2>/dev/null > initramfs.cpio )
+sz=$(wc -c < "$STAGE/initramfs.cpio")
+pad=$(( (512 - sz % 512) % 512 ))
+if [ "$pad" -gt 0 ]; then
+ head -c "$pad" /dev/zero >> "$STAGE/initramfs.cpio"
+fi
+mv "$STAGE/initramfs.cpio" "$STAGE/in.img"
+truncate -s 256M "$STAGE/out.img"
TRANSCRIPT=$STAGE/transcript.txt
echo "[gate] running scheme1 driver" >&2
qemu-system-aarch64 \
-machine virt,gic-version=3,accel=hvf -cpu host -m 2048M \
-nographic -no-reboot \
- -kernel "$KERNEL" -initrd "$STAGE/initramfs.cpio" \
- -append "init combined.scm dumpfs" \
+ -global virtio-mmio.force-legacy=false \
+ -kernel "$KERNEL" \
+ -drive file="$STAGE/in.img",if=none,format=raw,id=hd0,readonly=on \
+ -device virtio-blk-device,drive=hd0 \
+ -drive file="$STAGE/out.img",if=none,format=raw,id=hd1 \
+ -device virtio-blk-device,drive=hd1 \
+ -append "init combined.scm" \
> "$TRANSCRIPT" 2>&1 &
QPID=$!
( sleep 240; kill -9 $QPID 2>/dev/null ) </dev/null >/dev/null 2>&1 &
@@ -67,16 +79,14 @@ WATCHER=$!
wait $QPID 2>/dev/null || true
kill $WATCHER 2>/dev/null || true
-if ! grep -q '=== DUMP-END ===' "$TRANSCRIPT"; then
- echo "[gate] FAIL: no DUMP-END in transcript" >&2
+if ! "$EXTRACT" "$OUTDIR" "$STAGE/out.img" >/dev/null; then  # stderr kept: extract-blk's diagnostic names the failure
+ echo "[gate] FAIL: extract-blk failed (kernel didn't reach exit?)" >&2
tail -40 "$TRANSCRIPT" >&2
exit 3
fi
# Capture the driver's exit code from the kernel's parting message.
EXIT_LINE=$(grep -E "user exit_group" "$TRANSCRIPT" | tail -1 || true)
-"$EXTRACT" "$OUTDIR" "$TRANSCRIPT"
-
case "$EXIT_LINE" in
*"exit_group(0)"*)
echo "[gate] PASS — driver exit 0; outputs in $OUTDIR" >&2
diff --git a/seed-kernel/start.S b/seed-kernel/start.S
@@ -1,233 +0,0 @@
-/* seed kernel — arm64 boot protocol entry, vector table, SVC handler. */
-
-.section .head.text, "ax"
-.globl _head
-_head:
- /* arm64 Image header (Documentation/arm64/booting.rst).
- * code0 must be a valid instruction (a branch, in our case). */
- b stext
- .long 0
- .quad 0x80000 /* text_offset (preferred load offset within RAM) */
- .quad _image_end - _head /* image_size */
- .quad 0xa /* flags: 4K pages, anywhere in physmem, LE */
- .quad 0
- .quad 0
- .quad 0
- .ascii "ARM\x64" /* magic */
- .long 0 /* PE COFF offset (none) */
-
-stext:
- /* Entry contract: x0 = DTB phys, MMU off, caches off, EL2 or EL1. */
- msr daifset, #0xf
-
- /* If we entered at EL2, drop to EL1. Otherwise we're already at EL1. */
- mrs x9, CurrentEL
- lsr x9, x9, #2
- cmp x9, #2
- b.ne in_el1
-
- /* EL2 → EL1: set HCR_EL2.RW=1 (EL1 is AArch64), CNTHCTL/CNTVOFF defaults,
- * SPSR=EL1h with DAIF masked, ELR=in_el1, eret. */
- mov x9, #(1 << 31)
- msr hcr_el2, x9
- mov x9, #0x3c5 /* EL1h, DAIF=1111 */
- msr spsr_el2, x9
- adr x9, in_el1
- msr elr_el2, x9
- /* Make sure SP_EL1 is set before we eret to EL1 (else we land with
- * an undefined SP). Use the same kernel stack we're about to install. */
- adrp x9, kstack_top
- add x9, x9, :lo12:kstack_top
- msr sp_el1, x9
- eret
-
-in_el1:
- /* Stack. */
- adrp x9, kstack_top
- add x9, x9, :lo12:kstack_top
- mov sp, x9
-
- /* Vector table. */
- adrp x9, vector_table
- add x9, x9, :lo12:vector_table
- msr vbar_el1, x9
- isb
-
- /* Zero BSS. */
- adrp x1, __bss_start
- add x1, x1, :lo12:__bss_start
- adrp x2, __bss_end
- add x2, x2, :lo12:__bss_end
-1: cmp x1, x2
- b.ge 2f
- str xzr, [x1], #8
- b 1b
-2:
- /* Hand control to C. x0 still = DTB phys (not clobbered above). */
- bl kmain
-
- /* kmain shouldn't return. */
-hang:
- wfe
- b hang
-
-
-/* ─── Exception vector table ──────────────────────────────────────────── */
-
-.macro VENTRY label
- .balign 0x80
- b \label
-.endm
-
-.section .text, "ax"
-.balign 0x800
-.globl vector_table
-vector_table:
- /* Current EL with SP_EL0 (we never run kernel like this — only user). */
- VENTRY el1_sp0_sync /* 0x000: SVC from EL1t (our "user") */
- VENTRY unhandled /* 0x080 */
- VENTRY unhandled /* 0x100 */
- VENTRY unhandled /* 0x180 */
- /* Current EL with SP_ELx (kernel internal). */
- VENTRY el1_spx_sync /* 0x200: panic on kernel sync fault */
- VENTRY unhandled /* 0x280 */
- VENTRY unhandled /* 0x300 */
- VENTRY unhandled /* 0x380 */
- /* Lower EL using AArch64 (EL0). Unused in this design but wired. */
- VENTRY el1_sp0_sync /* 0x400 */
- VENTRY unhandled /* 0x480 */
- VENTRY unhandled /* 0x500 */
- VENTRY unhandled /* 0x580 */
- /* Lower EL using AArch32 (unused). */
- VENTRY unhandled /* 0x600 */
- VENTRY unhandled /* 0x680 */
- VENTRY unhandled /* 0x700 */
- VENTRY unhandled /* 0x780 */
-
-
-/* ─── Trap entry/exit ─────────────────────────────────────────────────────
- * Save x0..x30 + ELR_EL1 + SPSR_EL1 onto the kernel stack as a trapframe,
- * call C trap_sync(esr, &tf), restore, eret. The C handler reads/writes
- * tf->x[0..7] for syscall args and return value, plus tf->x[8] for the
- * syscall number.
- */
-
-.macro SAVE_TF
- sub sp, sp, #272
- stp x0, x1, [sp, #0]
- stp x2, x3, [sp, #16]
- stp x4, x5, [sp, #32]
- stp x6, x7, [sp, #48]
- stp x8, x9, [sp, #64]
- stp x10, x11, [sp, #80]
- stp x12, x13, [sp, #96]
- stp x14, x15, [sp, #112]
- stp x16, x17, [sp, #128]
- stp x18, x19, [sp, #144]
- stp x20, x21, [sp, #160]
- stp x22, x23, [sp, #176]
- stp x24, x25, [sp, #192]
- stp x26, x27, [sp, #208]
- stp x28, x29, [sp, #224]
- str x30, [sp, #240]
- mrs x10, elr_el1
- mrs x11, spsr_el1
- stp x10, x11, [sp, #248]
-.endm
-
-.macro RESTORE_TF
- ldp x10, x11, [sp, #248]
- msr elr_el1, x10
- msr spsr_el1, x11
- ldr x30, [sp, #240]
- ldp x28, x29, [sp, #224]
- ldp x26, x27, [sp, #208]
- ldp x24, x25, [sp, #192]
- ldp x22, x23, [sp, #176]
- ldp x20, x21, [sp, #160]
- ldp x18, x19, [sp, #144]
- ldp x16, x17, [sp, #128]
- ldp x14, x15, [sp, #112]
- ldp x12, x13, [sp, #96]
- ldp x10, x11, [sp, #80]
- ldp x8, x9, [sp, #64]
- ldp x6, x7, [sp, #48]
- ldp x4, x5, [sp, #32]
- ldp x2, x3, [sp, #16]
- ldp x0, x1, [sp, #0]
- add sp, sp, #272
-.endm
-
-el1_sp0_sync:
- SAVE_TF
- mrs x0, esr_el1
- mov x1, sp
- bl trap_sync
- RESTORE_TF
- eret
-
-el1_spx_sync:
- /* Same shape as user sync — let C distinguish via SPSR/ESR if needed. */
- SAVE_TF
- mrs x0, esr_el1
- mov x1, sp
- bl trap_kernel
- RESTORE_TF
- eret
-
-unhandled:
- SAVE_TF
- mrs x0, esr_el1
- mov x1, sp
- bl trap_unhandled
- RESTORE_TF
- eret
-
-
-/* ─── eret_to_user(entry, sp) ─────────────────────────────────────────────
- * Drop into the loaded user program. Runs at EL1t (same EL as kernel,
- * but uses SP_EL0 — gives us a separate user stack without setting up
- * an MMU). DAIF stays masked since we don't service interrupts.
- */
-.globl eret_to_user
-eret_to_user:
- msr sp_el0, x1
- msr elr_el1, x0
- mov x9, #0x3c4 /* EL1t, DAIF=1111 */
- msr spsr_el1, x9
- /* Clear all GP regs so user starts clean. argc/argv come in via the
- * SysV stack layout, which the user reads directly off SP_EL0. Some
- * boot0/1 seed-stage binaries (notably M0) read xN before any write,
- * so leaking kernel register state past the eret would fault them. */
- mov x0, xzr
- mov x1, xzr
- mov x2, xzr
- mov x3, xzr
- mov x4, xzr
- mov x5, xzr
- mov x6, xzr
- mov x7, xzr
- mov x8, xzr
- mov x9, xzr
- mov x10, xzr
- mov x11, xzr
- mov x12, xzr
- mov x13, xzr
- mov x14, xzr
- mov x15, xzr
- mov x16, xzr
- mov x17, xzr
- mov x18, xzr
- mov x19, xzr
- mov x20, xzr
- mov x21, xzr
- mov x22, xzr
- mov x23, xzr
- mov x24, xzr
- mov x25, xzr
- mov x26, xzr
- mov x27, xzr
- mov x28, xzr
- mov x29, xzr
- mov x30, xzr
- eret