boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

commit c612ee4b1ae6980f51ef14c0a4d990c6580c59aa
parent 97c83d00c094c070d8dae05e5a06ec7cd3968dcc
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed,  6 May 2026 12:39:55 -0700

AT.1 + AT.4: inline mes-libc abtol fix; arch.h primitive set + macros

AT.1: widen mes-libc abtol's accumulator to `long long` directly in
vendor/mes-libc/mes/abtol.c (the existing libc-flatten patch already
did this as `long`; inlining drops the patch and the libc-flatten
stanza). Revert the .long lo,hi pair workaround for boot_gdt64 in
seed-kernel/arch/amd64/kernel.S now that 64-bit `.quad` literals parse
correctly through tcc3.

AT.4: rewrite seed-kernel/arch/{amd64,riscv64}/arch.h to match
aarch64's pattern — small primitive set in kernel.S, arch_*() API
synthesized as macros. New primitives: cpu_pause(kind),
amd64_fence(kind), amd64_read_cr2() / riscv_fence(kind),
riscv_read_stval(). Drops 11 amd64 and 9 riscv64 dedicated extern
functions in favor of macro composition. Pure-C macros for
arch_read_user_sp / arch_write_user_sp / arch_mmio_ptr (the AT.3
side-wins) on both amd64 and riscv64.

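Validated: aarch64 podman boot6 byte-identical after re-flatten,
confirming that replacing the previously patched `long` accumulator with
the inlined `long long` one is a no-op on LP64 hosts (the effective change
from upstream's `int` accumulator is unchanged).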
amd64/riscv64 byte output changes intentionally (workarounds gone).

Diffstat:
M scripts/libc-flatten.sh | 11 -----------
M seed-kernel/arch/amd64/arch.h | 28 ++++++++++++++++++----------
M seed-kernel/arch/amd64/kernel.S | 68 ++++++++++++++++++++++++++------------------------------------
M seed-kernel/arch/riscv64/arch.h | 30 ++++++++++++++++++++----------
M seed-kernel/arch/riscv64/kernel.S | 82 ++++++++++++++++++++++++++++++++++++++++----------------------------------------
M vendor/mes-libc/mes/abtol.c | 7 ++++++-
D vendor/mes-libc/patches/abtol-long-accumulator.after | 9 ---------
D vendor/mes-libc/patches/abtol-long-accumulator.before | 3 ---
8 files changed, 111 insertions(+), 127 deletions(-)

diff --git a/scripts/libc-flatten.sh b/scripts/libc-flatten.sh @@ -158,17 +158,6 @@ apply_simple_patch \ "$STAGE/stdio/vsnprintf.c" \ "$PATCHES/vsnprintf-int-promo.before" \ "$PATCHES/vsnprintf-int-promo.after" -# abtol uses an `int` accumulator, which overflows for values that don't -# fit in 32-bit signed (e.g. 0x80200000 — riscv64 OpenSBI kernel base). -# strtol/strtoul/strtoull all bottom out here, so the overflow propagates -# through everywhere mes-libc's number parsers are called. Concretely, -# tcc3 (linked against mes-libc) mishandles `-Wl,-Ttext=0x80200000` and -# emits an ELF with sign-extended vaddr=0xffffffff80200000 that QEMU -# rejects. Switching the accumulator to `long` fixes the parse. -apply_simple_patch \ - "$STAGE/mes/abtol.c" \ - "$PATCHES/abtol-long-accumulator.before" \ - "$PATCHES/abtol-long-accumulator.after" # --- (3) flatten via host preprocessor -------------------------------- HOST_CC=${HOST_CC:-cc} diff --git a/seed-kernel/arch/amd64/arch.h b/seed-kernel/arch/amd64/arch.h @@ -61,23 +61,31 @@ struct trapframe { #define ARCH_TF_PC(tf) ((tf)->pc) #define ARCH_IS_SYSCALL(cause) ((cause) == 0) +enum { BAR_WMB, BAR_RMB }; +enum { PAUSE_PAUSE, PAUSE_HLT }; + +extern u64 saved_user_sp; +extern void cpu_pause(int kind); +extern void amd64_fence(int kind); +extern u64 amd64_read_cr2(void); extern void arch_setup_mmu(void); extern void arch_swap_user_pool(int which); -extern u64 arch_read_user_sp(void); -extern void arch_write_user_sp(u64 v); -extern u64 arch_fault_addr(void); -extern void arch_pause(void); -extern void arch_idle_forever(void); -extern volatile u8 *arch_mmio_ptr(u64 pa); -extern void arch_wmb(void); -extern void arch_rmb(void); -extern void arch_icache_sync(void); -extern void arch_icache_context_sync(void); extern void arch_system_off(void); extern void eret_to_user(u64 entry, u64 sp); extern void amd64_outb(u16 port, u8 val); extern u8 amd64_inb(u16 port); +#define arch_read_user_sp() (saved_user_sp) +#define 
arch_write_user_sp(v) (saved_user_sp = (v)) +#define arch_fault_addr() amd64_read_cr2() +#define arch_pause() cpu_pause(PAUSE_PAUSE) +#define arch_idle_forever() do { for (;;) cpu_pause(PAUSE_HLT); } while (0) +#define arch_mmio_ptr(pa) ((volatile u8 *)(ARCH_DEVICE_ALIAS_BASE + (u64)(pa))) +#define arch_wmb() amd64_fence(BAR_WMB) +#define arch_rmb() amd64_fence(BAR_RMB) +#define arch_icache_sync() do {} while (0) +#define arch_icache_context_sync() do {} while (0) + static inline void arch_clear_to_user_entry(struct trapframe *tf, u64 entry) { for (int i = 0; i < ARCH_TRAPFRAME_NREGS; i++) tf->x[i] = 0; tf->pc = entry; diff --git a/seed-kernel/arch/amd64/kernel.S b/seed-kernel/arch/amd64/kernel.S @@ -275,54 +275,40 @@ eret_to_user: xorq %r15, %r15 .byte 0x48,0xcf /* iretq */ -.globl arch_read_user_sp -arch_read_user_sp: - movq saved_user_sp(%rip), %rax - ret - -.globl arch_write_user_sp -arch_write_user_sp: - movq %rdi, saved_user_sp(%rip) - ret +/* C-callable thunks. arch.h synthesizes the arch_*() API on top of these + * (mirrors aarch64's sysreg_read/arm64_barrier/cpu_pause primitive set). */ -.globl arch_fault_addr -arch_fault_addr: - movq %cr2, %rax - ret +/* PAUSE_* — matches arch.h enum. */ +#define PAUSE_PAUSE 0 +#define PAUSE_HLT 1 -.globl arch_pause -arch_pause: +.globl cpu_pause +cpu_pause: + cmpl $PAUSE_HLT, %edi + je .Lp_hlt pause ret - -.globl arch_idle_forever -arch_idle_forever: -1: +.Lp_hlt: hlt - jmp 1b - -.globl arch_mmio_ptr -arch_mmio_ptr: - movq $0xffff800000000000, %rax - addq %rdi, %rax ret -.globl arch_wmb -arch_wmb: +/* BAR_* — matches arch.h enum. 
*/ +#define BAR_WMB 0 +#define BAR_RMB 1 + +.globl amd64_fence +amd64_fence: + cmpl $BAR_RMB, %edi + je .Lf_rmb sfence ret - -.globl arch_rmb -arch_rmb: +.Lf_rmb: lfence ret -.globl arch_icache_sync -arch_icache_sync: - ret - -.globl arch_icache_context_sync -arch_icache_context_sync: +.globl amd64_read_cr2 +amd64_read_cr2: + movq %cr2, %rax ret .globl arch_system_off @@ -405,12 +391,10 @@ amd64_serial_init: .align 8 boot_gdt64: /* null, 64-bit code (P=1,DPL=0,S=1,type=A; G=1,L=1; limit=0xfffff), - * 64-bit data (P=1,DPL=0,S=1,type=2; G=1,limit=0xfffff). Encoded as - * pairs of .long because tcc 0.9.26's assembler truncates a single - * `.quad` literal to 32 bits when the high half is non-zero. */ - .long 0, 0 - .long 0x0000ffff, 0x00af9a00 - .long 0x0000ffff, 0x00af9200 + * 64-bit data (P=1,DPL=0,S=1,type=2; G=1,limit=0xfffff). */ + .quad 0 + .quad 0x00af9a000000ffff + .quad 0x00af92000000ffff boot_gdt64_ptr: .word boot_gdt64_ptr - boot_gdt64 - 1 .long boot_gdt64 diff --git a/seed-kernel/arch/riscv64/arch.h b/seed-kernel/arch/riscv64/arch.h @@ -45,21 +45,31 @@ struct trapframe { #define ARCH_TF_PC(tf) ((tf)->pc) #define ARCH_IS_SYSCALL(cause) ((cause) == 8) +enum { BAR_WMB, BAR_RMB, BAR_ICACHE, BAR_ICACHE_CTX }; +enum { PAUSE_NOP, PAUSE_WFI }; + +extern u64 saved_user_sp; +extern void cpu_pause(int kind); +extern void riscv_fence(int kind); +extern u64 riscv_read_stval(void); +extern void riscv_write_satp(u64 v); +extern void riscv_set_sum(void); extern void arch_setup_mmu(void); extern void arch_swap_user_pool(int which); -extern u64 arch_read_user_sp(void); -extern void arch_write_user_sp(u64 v); -extern u64 arch_fault_addr(void); -extern void arch_pause(void); -extern void arch_idle_forever(void); -extern volatile u8 *arch_mmio_ptr(u64 pa); -extern void arch_wmb(void); -extern void arch_rmb(void); -extern void arch_icache_sync(void); -extern void arch_icache_context_sync(void); extern void arch_system_off(void); extern void eret_to_user(u64 entry, u64 sp); 
+#define arch_read_user_sp() (saved_user_sp) +#define arch_write_user_sp(v) (saved_user_sp = (v)) +#define arch_fault_addr() riscv_read_stval() +#define arch_pause() cpu_pause(PAUSE_NOP) +#define arch_idle_forever() do { for (;;) cpu_pause(PAUSE_WFI); } while (0) +#define arch_mmio_ptr(pa) ((volatile u8 *)(ARCH_DEVICE_ALIAS_BASE + (u64)(pa))) +#define arch_wmb() riscv_fence(BAR_WMB) +#define arch_rmb() riscv_fence(BAR_RMB) +#define arch_icache_sync() riscv_fence(BAR_ICACHE) +#define arch_icache_context_sync() riscv_fence(BAR_ICACHE_CTX) + static inline void arch_clear_to_user_entry(struct trapframe *tf, u64 entry) { for (int i = 0; i < ARCH_TRAPFRAME_NREGS; i++) tf->x[i] = 0; tf->pc = entry; diff --git a/seed-kernel/arch/riscv64/kernel.S b/seed-kernel/arch/riscv64/kernel.S @@ -238,64 +238,64 @@ eret_to_user: li t6, 0 SRET -.globl arch_read_user_sp -arch_read_user_sp: - LA(t0, saved_user_sp) - LD(a0, t0, 0) - RET - -.globl arch_write_user_sp -arch_write_user_sp: - LA(t0, saved_user_sp) - SD(a0, t0, 0) - RET - -.globl arch_fault_addr -arch_fault_addr: - CSRR_A0_STVAL - RET +/* C-callable thunks. arch.h synthesizes the arch_*() API on top of these + * (mirrors aarch64's sysreg_read/arm64_barrier/cpu_pause primitive set). */ -.globl arch_pause -arch_pause: +/* PAUSE_* — matches arch.h enum. */ +.globl cpu_pause +cpu_pause: + li t0, 1 /* PAUSE_WFI */ +#ifdef __TINYC__ + beq a0, t0, 12 +#else + beq a0, t0, 1f +#endif NOP RET - -.globl arch_idle_forever -arch_idle_forever: 1: wfi - J(1b) - -.globl arch_mmio_ptr -arch_mmio_ptr: - /* Device alias offset = ARCH_DEVICE_ALIAS_BASE = 1 << 33. - * Must match the L2 slot picked in arch/riscv64/mmu.c. */ - li t0, 1 - slli t0, t0, 33 - add a0, a0, t0 RET -.globl arch_wmb -arch_wmb: +/* BAR_* — matches arch.h enum. 
*/ +.globl riscv_fence +riscv_fence: + li t0, 1 /* BAR_RMB */ +#ifdef __TINYC__ + beq a0, t0, 28 +#else + beq a0, t0, 1f +#endif + li t0, 2 /* BAR_ICACHE */ +#ifdef __TINYC__ + beq a0, t0, 28 +#else + beq a0, t0, 2f +#endif + li t0, 3 /* BAR_ICACHE_CTX */ +#ifdef __TINYC__ + beq a0, t0, 28 +#else + beq a0, t0, 3f +#endif + /* default: BAR_WMB */ FENCE_W_W RET - -.globl arch_rmb -arch_rmb: +1: FENCE_R_R RET - -.globl arch_icache_sync -arch_icache_sync: +2: FENCE_I RET - -.globl arch_icache_context_sync -arch_icache_context_sync: +3: SFENCE_VMA FENCE_I RET +.globl riscv_read_stval +riscv_read_stval: + CSRR_A0_STVAL + RET + .globl riscv_write_satp riscv_write_satp: CSRW_SATP_A0 diff --git a/vendor/mes-libc/mes/abtol.c b/vendor/mes-libc/mes/abtol.c @@ -26,7 +26,12 @@ long abtol (char const **p, int base) { char const *s = p[0]; - int i = 0; + /* `long long` (≥64-bit) accumulator so values that don't fit in 32-bit + * signed don't sign-extend through the parse. Affects every mes-libc + * number parser (strtol/strtoul/strtoull, vfprintf field widths, …). + * Without it, tcc3 mishandles `-Wl,-Ttext=0x80200000` (riscv64 OpenSBI + * kernel base) and `.quad 0x00af9a000000ffff` in amd64 kernel.S. */ + long long i = 0; int sign_p = 0; int m = '0'; if (base == 0) diff --git a/vendor/mes-libc/patches/abtol-long-accumulator.after b/vendor/mes-libc/patches/abtol-long-accumulator.after @@ -1,9 +0,0 @@ - char const *s = p[0]; - /* Use a `long` accumulator so values that don't fit in 32-bit signed - * (e.g. 0x80200000 — riscv64's OpenSBI kernel base) don't overflow - * to a sign-extended negative. Affects strtol/strtoul/strtoull, - * which all bottom out in this routine. Without this, tcc3 mishandles - * `-Wl,-Ttext=0x80200000` and emits an ELF with vaddr=0xffffffff80200000. 
- */ - long i = 0; - int sign_p = 0; diff --git a/vendor/mes-libc/patches/abtol-long-accumulator.before b/vendor/mes-libc/patches/abtol-long-accumulator.before @@ -1,3 +0,0 @@ - char const *s = p[0]; - int i = 0; - int sign_p = 0;