boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit 6077ca5c6ba45b34582013d0569f8869f5d34b63
parent 4a1e722026a5a30bd68d2975cb39e93833df4867
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Tue,  5 May 2026 23:31:33 -0700

Implement amd64 and riscv64 seed kernels

Diffstat:
M seed-kernel/Makefile | 60 +++++++++++++++++++++++++++++++++++++++++++++---------------
M seed-kernel/arch/amd64/arch.h | 13 +++++++++++--
A seed-kernel/arch/amd64/kernel.S | 355 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A seed-kernel/arch/amd64/kernel.lds | 37 +++++++++++++++++++++++++++++++++++++
A seed-kernel/arch/amd64/mmu.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A seed-kernel/arch/riscv64/kernel.S | 252 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A seed-kernel/arch/riscv64/kernel.lds | 32 ++++++++++++++++++++++++++++++++
A seed-kernel/arch/riscv64/mmu.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M seed-kernel/kernel.c | 27 +++++++++++++++++++--------
M seed-kernel/run.sh | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M seed-kernel/user/child.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
M seed-kernel/user/forktest.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
M seed-kernel/user/hello.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
13 files changed, 1205 insertions(+), 50 deletions(-)

diff --git a/seed-kernel/Makefile b/seed-kernel/Makefile @@ -1,9 +1,18 @@ # seed-kernel — minimal OS that satisfies docs/OS.md Tier 1/2. ARCH ?= aarch64 -CC ?= gcc -LD ?= ld -OBJCOPY ?= objcopy +ifeq ($(origin CC),default) +CC := clang +endif +ifeq ($(origin LD),default) +LD := ld.lld +endif +ifeq ($(origin OBJCOPY),default) +OBJCOPY := llvm-objcopy +endif +ifeq ($(strip $(OBJCOPY)),) +OBJCOPY := llvm-objcopy +endif OUT := build ARCHDIR := arch/$(ARCH) KOBJS := $(OUT)/kasm.o $(OUT)/kernel.o $(OUT)/mmu.o $(OUT)/mem.o @@ -22,19 +31,31 @@ IN_IMG_FORK := $(OUT)/in-fork.img OUT_IMG_SIZE := 268435456 CFLAGS_COMMON := -nostdlib -ffreestanding -fno-stack-protector \ - -fno-pic -static -Wall -Wextra -O2 -mcmodel=large \ + -fno-pic -static -Wall -Wextra -O2 \ -fno-asynchronous-unwind-tables -fno-unwind-tables KCFLAGS := $(CFLAGS_COMMON) -I$(ARCHDIR) ifeq ($(ARCH),aarch64) -KCFLAGS += -mgeneral-regs-only -USER_ARCH_CFLAGS := -mgeneral-regs-only +KCFLAGS += -target aarch64-unknown-elf -mcmodel=large -mgeneral-regs-only +USER_ARCH_CFLAGS := -target aarch64-unknown-elf -mcmodel=large -mgeneral-regs-only +LDFLAGS_ARCH := -m aarch64elf +EXTRA_ARCH_TARGETS := +else ifeq ($(ARCH),amd64) +KCFLAGS += -target x86_64-unknown-elf -mcmodel=large -mno-red-zone -mno-sse -mno-mmx +USER_ARCH_CFLAGS := -target x86_64-unknown-elf -mcmodel=large -mno-red-zone -mno-sse -mno-mmx +LDFLAGS_ARCH := -m elf_x86_64 +EXTRA_ARCH_TARGETS := +else ifeq ($(ARCH),riscv64) +KCFLAGS += -target riscv64-unknown-elf -march=rv64imac_zicsr_zifencei -mabi=lp64 -mcmodel=medany -mno-relax -msmall-data-limit=0 +USER_ARCH_CFLAGS := -target riscv64-unknown-elf -march=rv64imac_zicsr_zifencei -mabi=lp64 -mcmodel=medany -mno-relax -msmall-data-limit=0 +LDFLAGS_ARCH := -m elf64lriscv +EXTRA_ARCH_TARGETS := else -$(error seed-kernel backend '$(ARCH)' is staged but not boot-wired yet; use ARCH=aarch64) +$(error seed-kernel backend '$(ARCH)' is unknown; use ARCH=aarch64, amd64, or riscv64) endif .PHONY: all clean kernel user 
initramfs -all: $(KBIN) $(INITRAMFS) $(INITRAMFS_FORK) $(IN_IMG) $(IN_IMG_FORK) +all: $(KBIN) $(EXTRA_ARCH_TARGETS) $(INITRAMFS) $(INITRAMFS_FORK) $(IN_IMG) $(IN_IMG_FORK) $(OUT): mkdir -p $(OUT) @@ -56,20 +77,29 @@ $(OUT)/mem.o: ../tcc-cc/mem.c | $(OUT) $(CC) $(KCFLAGS) -c -o $@ $< $(KIMAGE): $(KOBJS) $(ARCHDIR)/kernel.lds - $(LD) -nostdlib -static -T $(ARCHDIR)/kernel.lds -o $@ $(KOBJS) + $(LD) $(LDFLAGS_ARCH) -nostdlib -static -T $(ARCHDIR)/kernel.lds -o $@ $(KOBJS) # Strip ELF down to a flat binary that QEMU's -kernel can load. $(KBIN): $(KIMAGE) $(OBJCOPY) -O binary $< $@ -$(USER): user/hello.c user/user.lds | $(OUT) - $(CC) $(CFLAGS_COMMON) $(USER_ARCH_CFLAGS) -T user/user.lds -o $@ $< +$(OUT)/init.o: user/hello.c | $(OUT) + $(CC) $(CFLAGS_COMMON) $(USER_ARCH_CFLAGS) -c -o $@ $< + +$(OUT)/forktest.o: user/forktest.c | $(OUT) + $(CC) $(CFLAGS_COMMON) $(USER_ARCH_CFLAGS) -c -o $@ $< + +$(OUT)/child.o: user/child.c | $(OUT) + $(CC) $(CFLAGS_COMMON) $(USER_ARCH_CFLAGS) -c -o $@ $< + +$(USER): $(OUT)/init.o user/user.lds + $(LD) $(LDFLAGS_ARCH) -nostdlib -static -T user/user.lds -o $@ $(OUT)/init.o -$(USER_FORK): user/forktest.c user/user.lds | $(OUT) - $(CC) $(CFLAGS_COMMON) $(USER_ARCH_CFLAGS) -T user/user.lds -o $@ $< +$(USER_FORK): $(OUT)/forktest.o user/user.lds + $(LD) $(LDFLAGS_ARCH) -nostdlib -static -T user/user.lds -o $@ $(OUT)/forktest.o -$(USER_CHILD): user/child.c user/user.lds | $(OUT) - $(CC) $(CFLAGS_COMMON) $(USER_ARCH_CFLAGS) -T user/user.lds -o $@ $< +$(USER_CHILD): $(OUT)/child.o user/user.lds + $(LD) $(LDFLAGS_ARCH) -nostdlib -static -T user/user.lds -o $@ $(OUT)/child.o $(INITRAMFS): $(USER) cd $(OUT) && printf 'init\n' | cpio -o -H newc > initramfs.cpio diff --git a/seed-kernel/arch/amd64/arch.h b/seed-kernel/arch/amd64/arch.h @@ -9,7 +9,13 @@ #define ARCH_DEVICE_ALIAS_BASE 0xffff800000000000UL #define ARCH_UART0_PA 0x000003f8UL -#define ARCH_KERNEL_HEAP_END 0x04b00000UL +#define ARCH_KERNEL_HEAP_END 0x4b000000UL + +#define 
ARCH_STATIC_MEM_START 0x00100000UL +#define ARCH_STATIC_MEM_SIZE 0x7ff00000UL +#define ARCH_STATIC_VIRTIO_MMIO_BASE 0xfeb00000UL +#define ARCH_STATIC_VIRTIO_MMIO_STRIDE 0x200UL +#define ARCH_STATIC_VIRTIO_MMIO_COUNT 8 #define ARCH_USER_POOL_A_PA 0x04c00000UL #define ARCH_USER_POOL_B_PA 0x34c00000UL @@ -62,6 +68,8 @@ extern void arch_icache_sync(void); extern void arch_icache_context_sync(void); extern void arch_system_off(void); extern void eret_to_user(u64 entry, u64 sp); +extern void amd64_outb(u16 port, u8 val); +extern u8 amd64_inb(u16 port); static inline void arch_clear_to_user_entry(struct trapframe *tf, u64 entry) { for (int i = 0; i < ARCH_TRAPFRAME_NREGS; i++) tf->x[i] = 0; @@ -69,7 +77,8 @@ static inline void arch_clear_to_user_entry(struct trapframe *tf, u64 entry) { } static inline void arch_console_putc(char c) { - (void)c; + while ((amd64_inb((u16)(ARCH_UART0_PA + 5)) & 0x20) == 0) { } + amd64_outb((u16)ARCH_UART0_PA, (u8)c); } #endif diff --git a/seed-kernel/arch/amd64/kernel.S b/seed-kernel/arch/amd64/kernel.S @@ -0,0 +1,355 @@ +.section .note.Xen, "a" +.align 4 + .long 4 + .long 4 + .long 18 + .ascii "Xen\0" + .long _start + +.section .text, "ax" +.code32 +.globl _start +_start: + cli + movl $boot_stack_top, %esp + lgdt boot_gdt64_ptr + + movl %cr4, %eax + orl $0x20, %eax + movl %eax, %cr4 + + movl $boot_pml4, %eax + movl %eax, %cr3 + + movl $0xc0000080, %ecx + rdmsr + orl $0x100, %eax + wrmsr + + movl %cr0, %eax + orl $0x80000001, %eax + movl %eax, %cr0 + + ljmp $0x08, $long_mode + +.code64 +long_mode: + movw $0x10, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %ss + movabsq $kstack_top, %rsp + + call amd64_serial_init + + movabsq $__bss_start, %rdi + movabsq $_end, %rsi + xorl %eax, %eax +1: + cmpq %rsi, %rdi + jae 2f + movq %rax, (%rdi) + addq $8, %rdi + jmp 1b +2: + xorl %edi, %edi + call kmain + +3: + hlt + jmp 3b + +.globl amd64_int80 +amd64_int80: + subq $216, %rsp + movq %rax, 48(%rsp) + movq 216(%rsp), %r11 + movq %r11, 192(%rsp) + movq 
232(%rsp), %r11 + movq %r11, 200(%rsp) + movq 240(%rsp), %r11 + movq %r11, saved_user_sp(%rip) + + movq %rdi, 0(%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %r10, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movq %rbx, 56(%rsp) + movq %rcx, 64(%rsp) + movq %r11, 72(%rsp) + movq %r12, 80(%rsp) + movq %r13, 88(%rsp) + movq %r14, 96(%rsp) + movq %r15, 104(%rsp) + movq %rbp, 112(%rsp) + + xorl %edi, %edi + movq %rsp, %rsi + call trap_sync + + movq 192(%rsp), %rax + movq %rax, 216(%rsp) + movq 200(%rsp), %rax + movq %rax, 232(%rsp) + movq saved_user_sp(%rip), %rax + movq %rax, 240(%rsp) + + movq 0(%rsp), %rdi + movq 8(%rsp), %rsi + movq 16(%rsp), %rdx + movq 24(%rsp), %r10 + movq 32(%rsp), %r8 + movq 40(%rsp), %r9 + movq 48(%rsp), %rax + movq 56(%rsp), %rbx + movq 64(%rsp), %rcx + movq 72(%rsp), %r11 + movq 80(%rsp), %r12 + movq 88(%rsp), %r13 + movq 96(%rsp), %r14 + movq 104(%rsp), %r15 + movq 112(%rsp), %rbp + addq $216, %rsp + iretq + +.globl amd64_unhandled +amd64_unhandled: + subq $216, %rsp + movq 216(%rsp), %rax + movq %rax, 192(%rsp) + movq 232(%rsp), %rax + movq %rax, 200(%rsp) + movq $0xdead, %rdi + movq %rsp, %rsi + call trap_unhandled +1: + hlt + jmp 1b + +.globl eret_to_user +eret_to_user: + movq %rsi, saved_user_sp(%rip) + movabsq $kstack_top, %rsp + pushq $0x1b + pushq %rsi + pushq $0x202 + pushq $0x23 + pushq %rdi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + iretq + +.globl arch_read_user_sp +arch_read_user_sp: + movq saved_user_sp(%rip), %rax + ret + +.globl arch_write_user_sp +arch_write_user_sp: + movq %rdi, saved_user_sp(%rip) + ret + +.globl arch_fault_addr +arch_fault_addr: + movq %cr2, %rax + ret + +.globl arch_pause +arch_pause: + pause + ret + +.globl arch_idle_forever +arch_idle_forever: +1: + hlt + jmp 1b 
+ +.globl arch_mmio_ptr +arch_mmio_ptr: + movabsq $0xffff800000000000, %rax + addq %rdi, %rax + ret + +.globl arch_wmb +arch_wmb: + sfence + ret + +.globl arch_rmb +arch_rmb: + lfence + ret + +.globl arch_icache_sync +arch_icache_sync: + ret + +.globl arch_icache_context_sync +arch_icache_context_sync: + ret + +.globl arch_system_off +arch_system_off: + movw $0x501, %dx + movw $0x31, %ax + outw %ax, %dx +1: + hlt + jmp 1b + +.globl amd64_outb +amd64_outb: + movw %di, %dx + movb %sil, %al + outb %al, %dx + ret + +.globl amd64_inb +amd64_inb: + movw %di, %dx + xorl %eax, %eax + inb %dx, %al + ret + +.globl amd64_load_cr3 +amd64_load_cr3: + movq %rdi, %cr3 + ret + +.globl amd64_lgdt +amd64_lgdt: + lgdt (%rdi) + movw $0x10, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %ss + pushq $0x08 + leaq 1f(%rip), %rax + pushq %rax + lretq +1: + ret + +.globl amd64_lidt +amd64_lidt: + lidt (%rdi) + ret + +.globl amd64_ltr +amd64_ltr: + ltr %di + ret + +amd64_serial_init: + movw $0x3f9, %dx + xorb %al, %al + outb %al, %dx + movw $0x3fb, %dx + movb $0x80, %al + outb %al, %dx + movw $0x3f8, %dx + movb $0x01, %al + outb %al, %dx + movw $0x3f9, %dx + xorb %al, %al + outb %al, %dx + movw $0x3fb, %dx + movb $0x03, %al + outb %al, %dx + movw $0x3fa, %dx + movb $0xc7, %al + outb %al, %dx + movw $0x3fc, %dx + movb $0x0b, %al + outb %al, %dx + ret + +.section .rodata, "a" +.align 8 +boot_gdt64: + .quad 0 + .quad 0x00af9a000000ffff + .quad 0x00af92000000ffff +boot_gdt64_ptr: + .word boot_gdt64_ptr - boot_gdt64 - 1 + .long boot_gdt64 + +.section .data, "aw" +.align 4096 +boot_pml4: + .quad boot_pdpt + 0x003 + .rept 255 + .quad 0 + .endr + .quad boot_pdpt + 0x003 + .rept 255 + .quad 0 + .endr + +.align 4096 +boot_pdpt: + .quad boot_pd0 + 0x003 + .quad boot_pd1 + 0x003 + .quad boot_pd2 + 0x003 + .quad boot_pd3 + 0x003 + .rept 508 + .quad 0 + .endr + +.align 4096 +boot_pd0: + .set i, 0 + .rept 512 + .quad (i * 0x200000) + 0x083 + .set i, i + 1 + .endr +.align 4096 +boot_pd1: + .set i, 512 + 
.rept 512 + .quad (i * 0x200000) + 0x083 + .set i, i + 1 + .endr +.align 4096 +boot_pd2: + .set i, 1024 + .rept 512 + .quad (i * 0x200000) + 0x083 + .set i, i + 1 + .endr +.align 4096 +boot_pd3: + .set i, 1536 + .rept 512 + .quad (i * 0x200000) + 0x083 + .set i, i + 1 + .endr + +.section .bss, "aw", @nobits +.align 16 +.globl saved_user_sp +saved_user_sp: + .skip 8 +.align 16 +.skip 0x10000 +.globl kstack_top +kstack_top: +.align 16 +.skip 0x4000 +boot_stack_top: diff --git a/seed-kernel/arch/amd64/kernel.lds b/seed-kernel/arch/amd64/kernel.lds @@ -0,0 +1,37 @@ +/* amd64 seed kernel: QEMU microvm loads this ELF through -kernel. */ + +ENTRY(_start) + +SECTIONS { + . = 0x40000000; + + .note.Xen : ALIGN(4) { + KEEP(*(.note.Xen)) + } + + .text : ALIGN(4096) { + build/kasm.o(.text .text.*) + *(.text .text.* .ltext .ltext.*) + } + + .rodata : ALIGN(4096) { + *(.rodata .rodata.* .lrodata .lrodata.*) + } + + .data : ALIGN(4096) { + *(.data .data.* .ldata .ldata.*) + } + + .bss : ALIGN(4096) { + __bss_start = .; + *(.bss .bss.* .lbss .lbss.*) + *(COMMON) + . 
= ALIGN(16); + } + + _end = .; + + /DISCARD/ : { + *(.note.*) *(.comment) *(.eh_frame) + } +} diff --git a/seed-kernel/arch/amd64/mmu.c b/seed-kernel/arch/amd64/mmu.c @@ -0,0 +1,171 @@ +typedef unsigned long u64; +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#include "arch.h" + +#define PTE_P 0x001UL +#define PTE_W 0x002UL +#define PTE_U 0x004UL +#define PTE_PCD 0x010UL +#define PTE_PS 0x080UL + +#define KSEG (PTE_P | PTE_W | PTE_PS) +#define USEG (PTE_P | PTE_W | PTE_U | PTE_PS) +#define DSEG (PTE_P | PTE_W | PTE_PCD | PTE_PS) + +__attribute__((aligned(4096))) static u64 pml4[512]; +__attribute__((aligned(4096))) static u64 pdpt_low[512]; +__attribute__((aligned(4096))) static u64 pdpt_alias[512]; +__attribute__((aligned(4096))) static u64 pd0[512]; +__attribute__((aligned(4096))) static u64 pd1[512]; +__attribute__((aligned(4096))) static u64 pd2[512]; +__attribute__((aligned(4096))) static u64 pd3[512]; +__attribute__((aligned(4096))) static u64 pda0[512]; +__attribute__((aligned(4096))) static u64 pda1[512]; +__attribute__((aligned(4096))) static u64 pda2[512]; +__attribute__((aligned(4096))) static u64 pda3[512]; + +struct gdtr { + u16 limit; + u64 base; +} __attribute__((packed)); + +struct idtr { + u16 limit; + u64 base; +} __attribute__((packed)); + +struct idt_gate { + u16 off0; + u16 sel; + u8 ist; + u8 type; + u16 off1; + u32 off2; + u32 zero; +} __attribute__((packed)); + +struct tss64 { + u32 reserved0; + u64 rsp0; + u64 rsp1; + u64 rsp2; + u64 reserved1; + u64 ist[7]; + u64 reserved2; + u16 reserved3; + u16 iopb; +} __attribute__((packed)); + +static u64 gdt[7]; +static struct idt_gate idt[256]; +static struct tss64 tss; + +extern void amd64_load_cr3(u64 pml4_pa); +extern void amd64_lgdt(const struct gdtr *g); +extern void amd64_lidt(const struct idtr *i); +extern void amd64_ltr(u16 sel); +extern void amd64_int80(void); +extern void amd64_unhandled(void); +extern char kstack_top[]; + +static void set_gate(int 
vec, void (*fn)(void), int dpl) { + u64 a = (u64)fn; + idt[vec].off0 = (u16)a; + idt[vec].sel = 0x08; + idt[vec].ist = 0; + idt[vec].type = (u8)(0x8e | (dpl << 5)); + idt[vec].off1 = (u16)(a >> 16); + idt[vec].off2 = (u32)(a >> 32); + idt[vec].zero = 0; +} + +static void setup_cpu_tables(void) { + gdt[0] = 0; + gdt[1] = 0x00af9a000000ffffUL; /* kernel code */ + gdt[2] = 0x00af92000000ffffUL; /* kernel data */ + gdt[3] = 0x00aff2000000ffffUL; /* user data */ + gdt[4] = 0x00affa000000ffffUL; /* user code */ + + u64 base = (u64)&tss; + u64 limit = sizeof(tss) - 1; + gdt[5] = (limit & 0xffff) + | ((base & 0xffffff) << 16) + | (0x89UL << 40) + | (((limit >> 16) & 0xf) << 48) + | (((base >> 24) & 0xff) << 56); + gdt[6] = base >> 32; + + tss.rsp0 = (u64)kstack_top; + tss.iopb = sizeof(tss); + + struct gdtr gdtr = { (u16)(sizeof(gdt) - 1), (u64)gdt }; + amd64_lgdt(&gdtr); + amd64_ltr(0x28); + + for (int i = 0; i < 256; i++) set_gate(i, amd64_unhandled, 0); + set_gate(0x80, amd64_int80, 3); + struct idtr idtr = { (u16)(sizeof(idt) - 1), (u64)idt }; + amd64_lidt(&idtr); +} + +static u64 pool_pa(int which) { + return which ? 
ARCH_USER_POOL_B_PA : ARCH_USER_POOL_A_PA; +} + +static void fill_pd(u64 *pd, u64 first_slot, u64 flags) { + for (int i = 0; i < 512; i++) { + u64 pa = (first_slot + (u64)i) * 0x200000UL; + pd[i] = pa | flags; + } +} + +static void fill_user_pd0(int which) { + fill_pd(pd0, 0, KSEG); + u64 base = pool_pa(which); + for (int i = ARCH_USER_POOL_FIRST_SLOT; i <= ARCH_USER_POOL_LAST_SLOT; i++) { + u64 pa = base + (u64)(i - ARCH_USER_POOL_FIRST_SLOT) * 0x200000UL; + pd0[i] = pa | USEG; + } +} + +void arch_setup_mmu(void) { + for (int i = 0; i < 512; i++) { + pml4[i] = 0; + pdpt_low[i] = 0; + pdpt_alias[i] = 0; + } + + fill_user_pd0(0); + fill_pd(pd1, 512, KSEG); + fill_pd(pd2, 1024, KSEG); + fill_pd(pd3, 1536, KSEG); + + fill_pd(pda0, 0, DSEG); + fill_pd(pda1, 512, DSEG); + fill_pd(pda2, 1024, DSEG); + fill_pd(pda3, 1536, DSEG); + + pdpt_low[0] = (u64)pd0 | PTE_P | PTE_W | PTE_U; + pdpt_low[1] = (u64)pd1 | PTE_P | PTE_W; + pdpt_low[2] = (u64)pd2 | PTE_P | PTE_W; + pdpt_low[3] = (u64)pd3 | PTE_P | PTE_W; + + pdpt_alias[0] = (u64)pda0 | PTE_P | PTE_W; + pdpt_alias[1] = (u64)pda1 | PTE_P | PTE_W; + pdpt_alias[2] = (u64)pda2 | PTE_P | PTE_W; + pdpt_alias[3] = (u64)pda3 | PTE_P | PTE_W; + + pml4[0] = (u64)pdpt_low | PTE_P | PTE_W | PTE_U; + pml4[256] = (u64)pdpt_alias | PTE_P | PTE_W; + + setup_cpu_tables(); + amd64_load_cr3((u64)pml4); +} + +void arch_swap_user_pool(int which) { + fill_user_pd0(which); + amd64_load_cr3((u64)pml4); +} diff --git a/seed-kernel/arch/riscv64/kernel.S b/seed-kernel/arch/riscv64/kernel.S @@ -0,0 +1,252 @@ +.section .text, "ax" +.globl _start +_start: + mv s0, a1 + la sp, kstack_top + la t0, trap_entry + csrw stvec, t0 + + la t0, __bss_start + la t1, _end +1: + bgeu t0, t1, 2f + sd zero, 0(t0) + addi t0, t0, 8 + j 1b +2: + mv a0, s0 + call kmain +3: + wfi + j 3b + +.align 2 +.globl trap_entry +trap_entry: + csrrw sp, sscratch, sp + addi sp, sp, -272 + + sd x1, 8(sp) + csrr t0, sscratch + sd t0, 16(sp) + la t1, saved_user_sp + sd t0, 0(t1) + sd x3, 
24(sp) + sd x4, 32(sp) + sd x5, 40(sp) + sd x6, 48(sp) + sd x7, 56(sp) + sd x8, 64(sp) + sd x9, 72(sp) + sd x10, 80(sp) + sd x11, 88(sp) + sd x12, 96(sp) + sd x13, 104(sp) + sd x14, 112(sp) + sd x15, 120(sp) + sd x16, 128(sp) + sd x17, 136(sp) + sd x18, 144(sp) + sd x19, 152(sp) + sd x20, 160(sp) + sd x21, 168(sp) + sd x22, 176(sp) + sd x23, 184(sp) + sd x24, 192(sp) + sd x25, 200(sp) + sd x26, 208(sp) + sd x27, 216(sp) + sd x28, 224(sp) + sd x29, 232(sp) + sd x30, 240(sp) + sd x31, 248(sp) + + csrr t0, sepc + csrr a0, scause + li t1, 8 + bne a0, t1, 4f + addi t0, t0, 4 +4: + sd t0, 256(sp) + csrr t0, sstatus + sd t0, 264(sp) + + mv a1, sp + call trap_sync + + ld t0, 256(sp) + csrw sepc, t0 + ld t0, 264(sp) + csrw sstatus, t0 + + ld x1, 8(sp) + ld x3, 24(sp) + ld x4, 32(sp) + ld x7, 56(sp) + ld x8, 64(sp) + ld x9, 72(sp) + ld x10, 80(sp) + ld x11, 88(sp) + ld x12, 96(sp) + ld x13, 104(sp) + ld x14, 112(sp) + ld x15, 120(sp) + ld x16, 128(sp) + ld x17, 136(sp) + ld x18, 144(sp) + ld x19, 152(sp) + ld x20, 160(sp) + ld x21, 168(sp) + ld x22, 176(sp) + ld x23, 184(sp) + ld x24, 192(sp) + ld x25, 200(sp) + ld x26, 208(sp) + ld x27, 216(sp) + ld x28, 224(sp) + ld x29, 232(sp) + ld x30, 240(sp) + ld x31, 248(sp) + + la t0, saved_user_sp + ld t1, 0(t0) + csrw sscratch, t1 + ld x5, 40(sp) + ld x6, 48(sp) + addi sp, sp, 272 + csrrw sp, sscratch, sp + sret + +.globl eret_to_user +eret_to_user: + la t0, saved_user_sp + sd a1, 0(t0) + la t0, kstack_top + csrw sscratch, t0 + csrw sepc, a0 + csrr t0, sstatus + li t1, ~(1 << 8) + and t0, t0, t1 + li t1, (1 << 5) | (1 << 18) + or t0, t0, t1 + csrw sstatus, t0 + mv sp, a1 + li ra, 0 + li gp, 0 + li tp, 0 + li t0, 0 + li t1, 0 + li t2, 0 + li s0, 0 + li s1, 0 + li a0, 0 + li a1, 0 + li a2, 0 + li a3, 0 + li a4, 0 + li a5, 0 + li a6, 0 + li a7, 0 + li s2, 0 + li s3, 0 + li s4, 0 + li s5, 0 + li s6, 0 + li s7, 0 + li s8, 0 + li s9, 0 + li s10, 0 + li s11, 0 + li t3, 0 + li t4, 0 + li t5, 0 + li t6, 0 + sret + +.globl arch_read_user_sp 
+arch_read_user_sp: + la t0, saved_user_sp + ld a0, 0(t0) + ret + +.globl arch_write_user_sp +arch_write_user_sp: + la t0, saved_user_sp + sd a0, 0(t0) + ret + +.globl arch_fault_addr +arch_fault_addr: + csrr a0, stval + ret + +.globl arch_pause +arch_pause: + nop + ret + +.globl arch_idle_forever +arch_idle_forever: +1: + wfi + j 1b + +.globl arch_mmio_ptr +arch_mmio_ptr: + li t0, 0x100000000 + add a0, a0, t0 + ret + +.globl arch_wmb +arch_wmb: + fence w, w + ret + +.globl arch_rmb +arch_rmb: + fence r, r + ret + +.globl arch_icache_sync +arch_icache_sync: + fence.i + ret + +.globl arch_icache_context_sync +arch_icache_context_sync: + sfence.vma + fence.i + ret + +.globl riscv_write_satp +riscv_write_satp: + csrw satp, a0 + sfence.vma + ret + +.globl riscv_set_sum +riscv_set_sum: + csrr t0, sstatus + li t1, (1 << 18) + or t0, t0, t1 + csrw sstatus, t0 + ret + +.globl arch_system_off +arch_system_off: + li t0, 0x100000 + li t1, 0x5555 + sw t1, 0(t0) +1: + wfi + j 1b + +.section .bss, "aw", @nobits +.align 3 +.globl saved_user_sp +saved_user_sp: + .skip 8 +.align 4 +.skip 0x10000 +.globl kstack_top +kstack_top: diff --git a/seed-kernel/arch/riscv64/kernel.lds b/seed-kernel/arch/riscv64/kernel.lds @@ -0,0 +1,32 @@ +/* riscv64 seed kernel for QEMU virt. OpenSBI loads -kernel at 0x80200000. */ + +ENTRY(_start) + +SECTIONS { + . = 0x80200000; + + .text : ALIGN(16) { + *(.text .text.*) + } + + .rodata : ALIGN(16) { + *(.rodata .rodata.*) + } + + .data : ALIGN(16) { + *(.data .data.*) + } + + .bss : ALIGN(16) { + __bss_start = .; + *(.bss .bss.*) + *(COMMON) + . 
= ALIGN(16); + } + + _end = .; + + /DISCARD/ : { + *(.note.*) *(.comment) *(.eh_frame) + } +} diff --git a/seed-kernel/arch/riscv64/mmu.c b/seed-kernel/arch/riscv64/mmu.c @@ -0,0 +1,63 @@ +typedef unsigned long u64; +typedef unsigned int u32; +typedef unsigned char u8; + +#include "arch.h" + +#define PTE_V 0x001UL +#define PTE_R 0x002UL +#define PTE_W 0x004UL +#define PTE_X 0x008UL +#define PTE_U 0x010UL +#define PTE_G 0x020UL +#define PTE_A 0x040UL +#define PTE_D 0x080UL + +#define KFLAGS (PTE_V | PTE_R | PTE_W | PTE_X | PTE_A | PTE_D) +#define DFLAGS (PTE_V | PTE_R | PTE_W | PTE_A | PTE_D) +#define UFLAGS (PTE_V | PTE_R | PTE_W | PTE_X | PTE_U | PTE_A | PTE_D) + +__attribute__((aligned(4096))) static u64 l2_root[512]; +__attribute__((aligned(4096))) static u64 l1_user[512]; + +extern void riscv_write_satp(u64 v); +extern void riscv_set_sum(void); + +static u64 pte(u64 pa, u64 flags) { + return (pa >> 2) | flags; +} + +static u64 pool_pa(int which) { + return which ? ARCH_USER_POOL_B_PA : ARCH_USER_POOL_A_PA; +} + +static void fill_user_l1(int which) { + for (int i = 0; i < 512; i++) { + u64 pa = (u64)i * 0x200000UL; + l1_user[i] = pte(pa, DFLAGS); + } + u64 base = pool_pa(which); + for (int i = ARCH_USER_POOL_FIRST_SLOT; i <= ARCH_USER_POOL_LAST_SLOT; i++) { + u64 pa = base + (u64)(i - ARCH_USER_POOL_FIRST_SLOT) * 0x200000UL; + l1_user[i] = pte(pa, UFLAGS); + } +} + +void arch_setup_mmu(void) { + for (int i = 0; i < 512; i++) l2_root[i] = 0; + fill_user_l1(0); + + l2_root[0] = pte((u64)l1_user, PTE_V); + l2_root[1] = pte(0x40000000UL, DFLAGS); + l2_root[2] = pte(0x80000000UL, KFLAGS); + l2_root[3] = pte(0xc0000000UL, KFLAGS); + l2_root[4] = pte(0x00000000UL, DFLAGS); + + riscv_set_sum(); + riscv_write_satp(((u64)8 << 60) | ((u64)l2_root >> 12)); +} + +void arch_swap_user_pool(int which) { + fill_user_l1(which); + riscv_write_satp(((u64)8 << 60) | ((u64)l2_root >> 12)); +} diff --git a/seed-kernel/kernel.c b/seed-kernel/kernel.c @@ -145,6 +145,17 @@ static int 
str_starts(const char *s, const char *prefix) { } static void parse_dtb(const void *dtb, struct dtb_info *out) { +#ifdef ARCH_STATIC_VIRTIO_MMIO_BASE + if ((u64)dtb == 0) { + out->mem_start = ARCH_STATIC_MEM_START; + out->mem_size = ARCH_STATIC_MEM_SIZE; + out->virtio_mmio_n = ARCH_STATIC_VIRTIO_MMIO_COUNT; + for (int i = 0; i < out->virtio_mmio_n && i < MAX_VIRTIO_MMIO; i++) + out->virtio_mmio_pa[i] = ARCH_STATIC_VIRTIO_MMIO_BASE + + (u64)i * ARCH_STATIC_VIRTIO_MMIO_STRIDE; + return; + } +#endif const u8 *base = dtb; if (be32(base) != FDT_MAGIC) { uart_puts("DTB: bad magic\n"); return; @@ -190,14 +201,14 @@ static void parse_dtb(const void *dtb, struct dtb_info *out) { out->mem_start = be64(p); out->mem_size = be64(p + 8); } - /* virtio-mmio nodes: capture each slot's PA. Root #address- - * cells/#size-cells are both 2 on QEMU virt → reg is 16 bytes - * (PA u64, size u64); we only need the PA. */ - if (str_starts(path[1], "virtio_mmio@") && - str_eq(pn, "reg") && len >= 16 && - out->virtio_mmio_n < MAX_VIRTIO_MMIO) { - out->virtio_mmio_pa[out->virtio_mmio_n++] = be64(p); - } + } + /* virtio-mmio nodes: capture each slot's PA. Root/soc + * #address-cells/#size-cells are both 2 on QEMU virt, so reg + * is 16 bytes (PA u64, size u64); we only need the PA. */ + if (depth >= 1 && str_starts(path[depth], "virtio_mmio@") && + str_eq(pn, "reg") && len >= 16 && + out->virtio_mmio_n < MAX_VIRTIO_MMIO) { + out->virtio_mmio_pa[out->virtio_mmio_n++] = be64(p); } p += len; p = (const u8 *)(((u64)p + 3) & ~3UL); diff --git a/seed-kernel/run.sh b/seed-kernel/run.sh @@ -1,12 +1,18 @@ #!/bin/sh # Boot the seed kernel + virtio-blk input/output disks in QEMU. # -# Usage: ./run.sh [extra qemu args...] +# Usage: ARCH=<aarch64|amd64|riscv64> ./run.sh [extra qemu args...] 
set -eu cd "$(dirname "$0")" -KERNEL=build/Image +ARCH=${ARCH:-aarch64} +case "$ARCH" in + aarch64) KERNEL=build/Image ;; + amd64) KERNEL=build/kernel.elf ;; + riscv64) KERNEL=build/kernel.elf ;; + *) echo "unsupported ARCH=$ARCH" >&2; exit 2 ;; +esac IN_IMG=build/in.img OUT_IMG=build/out.img @@ -18,16 +24,52 @@ OUT_IMG=build/out.img rm -f "$OUT_IMG" truncate -s 256M "$OUT_IMG" -exec qemu-system-aarch64 \ - -machine virt,gic-version=3,accel=hvf \ - -cpu host \ - -m 2048M \ - -nographic \ - -no-reboot \ - -global virtio-mmio.force-legacy=false \ - -kernel "$KERNEL" \ - -drive file="$IN_IMG",if=none,format=raw,id=hd0,readonly=on \ - -device virtio-blk-device,drive=hd0 \ - -drive file="$OUT_IMG",if=none,format=raw,id=hd1 \ - -device virtio-blk-device,drive=hd1 \ - "$@" +case "$ARCH" in + aarch64) + exec qemu-system-aarch64 \ + -machine virt,gic-version=3,accel=hvf \ + -cpu host \ + -m 2048M \ + -nographic \ + -no-reboot \ + -global virtio-mmio.force-legacy=false \ + -kernel "$KERNEL" \ + -drive file="$IN_IMG",if=none,format=raw,id=hd0,readonly=on \ + -device virtio-blk-device,drive=hd0 \ + -drive file="$OUT_IMG",if=none,format=raw,id=hd1 \ + -device virtio-blk-device,drive=hd1 \ + "$@" + ;; + amd64) + exec qemu-system-x86_64 \ + -machine microvm,acpi=off,pic=off,pit=off,rtc=off,isa-serial=on,auto-kernel-cmdline=off \ + -cpu max \ + -m 2048M \ + -nodefaults \ + -display none \ + -serial stdio \ + -no-reboot \ + -global virtio-mmio.force-legacy=false \ + -device isa-debug-exit,iobase=0x501,iosize=2 \ + -kernel "$KERNEL" \ + -drive file="$IN_IMG",if=none,format=raw,id=hd0,readonly=on \ + -device virtio-blk-device,drive=hd0 \ + -drive file="$OUT_IMG",if=none,format=raw,id=hd1 \ + -device virtio-blk-device,drive=hd1 \ + "$@" + ;; + riscv64) + exec qemu-system-riscv64 \ + -machine virt \ + -m 2048M \ + -nographic \ + -no-reboot \ + -global virtio-mmio.force-legacy=false \ + -kernel "$KERNEL" \ + -drive file="$IN_IMG",if=none,format=raw,id=hd0,readonly=on \ + -device 
virtio-blk-device,drive=hd0 \ + -drive file="$OUT_IMG",if=none,format=raw,id=hd1 \ + -device virtio-blk-device,drive=hd1 \ + "$@" + ;; +esac diff --git a/seed-kernel/user/child.c b/seed-kernel/user/child.c @@ -4,8 +4,13 @@ typedef long i64; typedef unsigned long u64; +#if defined(__x86_64__) +#define SYS_write 1 +#define SYS_exit_group 60 +#else #define SYS_write 64 #define SYS_exit_group 93 +#endif extern i64 syscall6(u64 nr, u64 a, u64 b, u64 c, u64 d, u64 e, u64 f); @@ -36,6 +41,47 @@ void _start_c(long argc, char **argv) { } /* See forktest.c for the .globl-after-label tcc 0.9.26 quirk. */ +#if defined(__x86_64__) +asm( + "_start: mov (%rsp), %rdi\n" + ".globl _start\n" + ".type _start, @function\n" + " lea 8(%rsp), %rsi\n" + " call _start_c\n" + "\n" + "syscall6:\n" + ".globl syscall6\n" + ".type syscall6, @function\n" + " mov %rdi, %rax\n" + " mov %rsi, %rdi\n" + " mov %rdx, %rsi\n" + " mov %rcx, %rdx\n" + " mov %r8, %r10\n" + " mov %r9, %r8\n" + " mov 8(%rsp), %r9\n" + " int $0x80\n" + " ret\n"); +#elif defined(__riscv) +asm( + "_start: ld a0, 0(sp)\n" + ".globl _start\n" + ".type _start, @function\n" + " addi a1, sp, 8\n" + " tail _start_c\n" + "\n" + "syscall6:\n" + ".globl syscall6\n" + ".type syscall6, @function\n" + " mv a7, a0\n" + " mv a0, a1\n" + " mv a1, a2\n" + " mv a2, a3\n" + " mv a3, a4\n" + " mv a4, a5\n" + " mv a5, a6\n" + " ecall\n" + " ret\n"); +#else asm( "_start: ldr x0, [sp]\n" ".globl _start\n" @@ -55,3 +101,4 @@ asm( " mov x5, x6\n" " svc #0\n" " ret\n"); +#endif diff --git a/seed-kernel/user/forktest.c b/seed-kernel/user/forktest.c @@ -7,14 +7,19 @@ typedef long i64; typedef unsigned long u64; typedef int i32; +#if defined(__x86_64__) +#define SYS_write 1 +#define SYS_exit_group 60 +#define SYS_waitid 247 +#elif defined(__riscv) #define SYS_write 64 -#define SYS_openat 56 -#define SYS_close 57 -#define SYS_read 63 -#define SYS_lseek 62 -#define SYS_brk 214 #define SYS_exit_group 93 -#define SYS_waitid 95 +#define SYS_waitid 95 +#else 
+#define SYS_write 64 +#define SYS_exit_group 93 +#define SYS_waitid 95 +#endif #define SYS_spawn 1024 extern i64 syscall6(u64 nr, u64 a, u64 b, u64 c, u64 d, u64 e, u64 f); @@ -61,6 +66,47 @@ void _start_c(long argc, char **argv) { * `name:` leaves the symbol UND in the .o symtab. Putting the label * first and `.globl name` after makes tcc register it as defined. gcc * accepts both orderings. */ +#if defined(__x86_64__) +asm( + "_start: mov (%rsp), %rdi\n" + ".globl _start\n" + ".type _start, @function\n" + " lea 8(%rsp), %rsi\n" + " call _start_c\n" + "\n" + "syscall6:\n" + ".globl syscall6\n" + ".type syscall6, @function\n" + " mov %rdi, %rax\n" + " mov %rsi, %rdi\n" + " mov %rdx, %rsi\n" + " mov %rcx, %rdx\n" + " mov %r8, %r10\n" + " mov %r9, %r8\n" + " mov 8(%rsp), %r9\n" + " int $0x80\n" + " ret\n"); +#elif defined(__riscv) +asm( + "_start: ld a0, 0(sp)\n" + ".globl _start\n" + ".type _start, @function\n" + " addi a1, sp, 8\n" + " tail _start_c\n" + "\n" + "syscall6:\n" + ".globl syscall6\n" + ".type syscall6, @function\n" + " mv a7, a0\n" + " mv a0, a1\n" + " mv a1, a2\n" + " mv a2, a3\n" + " mv a3, a4\n" + " mv a4, a5\n" + " mv a5, a6\n" + " ecall\n" + " ret\n"); +#else asm( "_start: ldr x0, [sp]\n" ".globl _start\n" @@ -82,3 +128,4 @@ asm( " mov x5, x6\n" " svc #0\n" " ret\n"); +#endif diff --git a/seed-kernel/user/hello.c b/seed-kernel/user/hello.c @@ -1,16 +1,33 @@ -/* User-space "hello" for the seed kernel. Static aarch64 ELF, no libc. - * Speaks the Linux aarch64 syscall ABI directly via SVC. */ +/* User-space "hello" for the seed kernel. Static ELF, no libc. 
*/ typedef long i64; typedef unsigned long u64; -#define SYS_write 64 +#if defined(__x86_64__) +#define SYS_read 0 +#define SYS_write 1 +#define SYS_close 3 +#define SYS_lseek 8 +#define SYS_brk 12 +#define SYS_exit_group 60 +#define SYS_openat 257 +#elif defined(__riscv) #define SYS_openat 56 #define SYS_close 57 -#define SYS_read 63 #define SYS_lseek 62 +#define SYS_read 63 +#define SYS_write 64 +#define SYS_exit_group 93 #define SYS_brk 214 +#else +#define SYS_openat 56 +#define SYS_close 57 +#define SYS_lseek 62 +#define SYS_read 63 +#define SYS_write 64 #define SYS_exit_group 93 +#define SYS_brk 214 +#endif extern i64 syscall6(u64 nr, u64 a, u64 b, u64 c, u64 d, u64 e, u64 f); @@ -113,6 +130,47 @@ void _start_c(long argc, char **argv) { * Emitted as a plain global symbol with raw asm — no C-compiler-generated * prologue, since gcc would clobber sp before we read argc. */ /* See forktest.c for the .globl-after-label tcc 0.9.26 quirk. */ +#if defined(__x86_64__) +asm( + "_start: mov (%rsp), %rdi\n" + ".globl _start\n" + ".type _start, @function\n" + " lea 8(%rsp), %rsi\n" + " call _start_c\n" + "\n" + "syscall6:\n" + ".globl syscall6\n" + ".type syscall6, @function\n" + " mov %rdi, %rax\n" + " mov %rsi, %rdi\n" + " mov %rdx, %rsi\n" + " mov %rcx, %rdx\n" + " mov %r8, %r10\n" + " mov %r9, %r8\n" + " mov 8(%rsp), %r9\n" + " int $0x80\n" + " ret\n"); +#elif defined(__riscv) +asm( + "_start: ld a0, 0(sp)\n" + ".globl _start\n" + ".type _start, @function\n" + " addi a1, sp, 8\n" + " tail _start_c\n" + "\n" + "syscall6:\n" + ".globl syscall6\n" + ".type syscall6, @function\n" + " mv a7, a0\n" + " mv a0, a1\n" + " mv a1, a2\n" + " mv a2, a3\n" + " mv a3, a4\n" + " mv a4, a5\n" + " mv a5, a6\n" + " ecall\n" + " ret\n"); +#else asm( "_start: ldr x0, [sp]\n" ".globl _start\n" @@ -132,3 +190,4 @@ asm( " mov x5, x6\n" " svc #0\n" " ret\n"); +#endif