commit 799eba0226f49f5dc353cef2359a0f5311a4fc87
parent e4bfcde162d44394a1bbd0ee55becafb57087502
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 6 May 2026 09:36:52 -0700
seed-kernel: amd64 boot6 + DRIVER=seed wiring; bring kernel up to scheme1+tcc3
boot6 amd64 (DRIVER=podman) now self-builds a clean PVH ELF and
boots scheme1 + tcc3 cleanly under qemu-system-x86_64. DRIVER=seed
amd64 self-rebuilds the kernel inside itself (closing the
bootstrap loop) and runs boot{0,1,2} cleanly; boot3+ are wired but
left compute-bound on TCG.
Kernel additions (none of which the gcc-built kernel needed
because gcc-built userland used `-mno-sse` + `int 0x80`):
* MSR-based `syscall` entry stub + EFER.SCE / STAR / LSTAR /
SFMASK setup, so scheme1 (and any tcc-emitted user binary) can
syscall instead of `int 0x80`.
* CR0.MP / CR4.OSFXSR | OSXMMEXCPT in long-mode init so user
xmm-spill prologues don't trap with #UD.
* PVH `hvm_start_info` cmdline parsing (preserve EBX through
long-mode init, decode magic 0x336ec578 + cmdline_paddr@+0x18).
* SYS_open(2) alias to openat(AT_FDCWD,...) for the amd64
hex0-seed/hex0/hex1/hex2/M0 stage0 binaries.
tcc 0.9.26 amd64 workarounds (see docs/SEED-AMD64-TODO.md):
* GDT entries split into `.long lo, hi` pairs because tcc's
assembler silently truncates `.quad` literals when bits 32-63
are nonzero.
* New seed-kernel/scripts/elf-pvh-note.c: tcc-built post-link
tool that retypes `.note.Xen` to SHT_NOTE and writes a fresh
PT_NOTE phdr at phoff+phnum*phentsize, since tcc's linker
emits no PT_NOTE phdrs and QEMU's PVH `-kernel` path requires
one. Wired into boot6-gen-runscm.sh + boot6.sh for amd64.
Driver glue:
* boot.sh, boot{0..6}.sh, lib-runscm.sh, lib-pipeline.sh: add
amd64 to the DRIVER=seed gate everywhere; both seed harnesses
gain a `qemu-system-x86_64 -machine microvm ...` arm matching
seed-kernel/run.sh.
* boot3.sh / boot4.sh runscm timeouts are now env-overridable
(BOOT3_TIMEOUT, BOOT4_TIMEOUT) — boot3+ on TCG-emulated x86_64
needs much longer than the aarch64+hvf default.
Diffstat:
17 files changed, 483 insertions(+), 37 deletions(-)
diff --git a/docs/SEED-AMD64-TODO.md b/docs/SEED-AMD64-TODO.md
@@ -0,0 +1,122 @@
+# amd64 seed-kernel TODO
+
+Working doc. Captures the tcc 0.9.26 limitations we worked around
+to bring up `boot6 amd64` and `DRIVER=seed ./scripts/boot.sh amd64`,
+plus what's still unvalidated. Pairs with `docs/OS.md` (kernel
+contract), `docs/TCC.md` (compiler), and `docs/SEED-RISCV64-TODO.md`
+(parallel write-up for the riscv64 path).
+
+## Goal
+
+`DRIVER=seed ./scripts/boot.sh amd64` should run the full
+boot0→boot6 chain entirely inside the tcc-built amd64 seed kernel
+(the kernel is its own build driver, podman only mints the first
+kernel image). This validates every kernel path the chain depends
+on under real workloads.
+
+## What works (May 2026)
+
+- `scripts/boot6.sh amd64` (DRIVER=podman) builds a clean PVH ELF at
+ `build/amd64/boot6/kernel.elf`. Boots under
+ `qemu-system-x86_64 -machine microvm -kernel ...`; runs scheme1 +
+ tcc3 + the user smoke tests under `seed-kernel/run.sh ARCH=amd64`.
+- `DRIVER=seed scripts/boot6.sh amd64` self-rebuilds the kernel
+ inside itself (closes the bootstrap loop).
+- `DRIVER=seed scripts/boot{0,1,2}.sh amd64` complete cleanly on the
+ tcc-built kernel (boot0 ≈1 s, boot1 ≈14 s, boot2 ≈43 s under TCG).
+ Outputs are `build/amd64/boot{0,1,2}/...`.
+
+## tcc 0.9.26 limitations worked around for amd64
+
+These are amd64-specific gotchas; the existing `docs/TCC.md` and
+the simple-patches in `scripts/simple-patches/tcc-0.9.26/` cover the
+shared/aarch64/riscv64 issues.
+
+1. **`.quad` literal truncation.** tcc's assembler silently
+ truncates a `.quad` value to its low 32 bits when the high half
+ is non-zero (by the time `gen_le64(int64_t c)` runs, the parser
+ path has already dropped the upper 32 bits). The amd64 GDT entries
+ (`0x00af9a000000ffff`, `0x00af92000000ffff`) are the natural
+ trip wire — without the workaround, the boot code's `lgdt` loads
+ descriptors whose P and L bits are zero, the long-mode `ljmp`
+ raises #GP, and the kernel never prints anything.
+ - **Workaround:** `seed-kernel/arch/amd64/kernel.S` encodes each
+ descriptor as `.long lo, hi` so the high half is parsed as a
+ fresh 32-bit literal. Comment in-place explains why.
+ - **TODO:** add a `simple-patch` to fix `.quad` parsing in
+ tccasm.c so the source can use the natural form. Suspected
+ site is the parser path that intermediates 64-bit values
+ through a 32-bit `int` before reaching `gen_le64`.
+
+2. **No PT_NOTE program header / SHT_NOTE section type.** tcc's
+ linker emits exactly two PT_LOAD phdrs for static EXEs (no
+ PT_NOTE), and `find_section` defaults every assembler-created
+ section to SHT_PROGBITS regardless of name. QEMU's PVH `-kernel`
+ path scans PT_NOTE phdrs for the Xen ELF note of type 18
+ (`XEN_ELFNOTE_PHYS32_ENTRY`), which names the 32-bit entry point;
+ without one it fails with "Error loading uncompressed kernel without PVH ELF Note".
+ - **Workaround:** `seed-kernel/scripts/elf-pvh-note.c` is a
+ tiny tcc-built post-link tool that locates `.note.Xen` via
+ the section header table, retypes the section as SHT_NOTE,
+ and writes a fresh PT_NOTE phdr at `phoff + phnum*phentsize`
+ (the gap between Ehdr/Phdrs and the first PT_LOAD's content
+ at offset `s_align` = 0x200000 has plenty of room). Wired
+ into `boot6-gen-runscm.sh` and `boot6.sh` for amd64 only.
+ - **TODO:** patch tcc to (a) detect ".note.*" section names and
+ create them as SHT_NOTE, and (b) bump phnum + emit PT_NOTE
+ phdrs for each SHT_NOTE alloc section after `layout_sections`.
+ That removes the need for the post-link tool.
+
+## amd64 kernel additions (not tcc workarounds)
+
+These are real kernel features that aarch64/riscv64 already had in
+some form and amd64 was missing. Listing them so the riscv64-style
+TODO has full context, not because they're broken.
+
+- **MSR-based `syscall` entry path.** scheme1 (and any tcc-built
+ user binary that follows the SysV/Linux amd64 ABI) emits the
+ `syscall` instruction, which goes through MSR_LSTAR — not the
+ IDT. The previous kernel only handled `int $0x80`. Added
+ `amd64_syscall_entry` in `kernel.S`, plus setup of `MSR_EFER.SCE`,
+ `MSR_STAR`, `MSR_LSTAR`, and `MSR_SFMASK` in `mmu.c::setup_cpu_tables`.
+- **CR0.MP / CR4.OSFXSR / CR4.OSXMMEXCPT.** tcc emits xmm-spill
+ prologues (`movq %xmm7, -0x20(%rbp)`) in user binaries; without
+ these CR bits set, the first such instruction raises #UD/#NM.
+ The gcc-built kernel + `-mno-sse` user (user/hello.c) avoided
+ this; tcc has no `-mno-sse` equivalent so we enable SSE in
+ long-mode init.
+- **PVH `hvm_start_info` cmdline parsing.** microvm has no DTB.
+ PVH passes the `hvm_start_info` phys addr in EBX; we preserve
+ it through long-mode init and pass to `kmain`. `parse_dtb` in
+ `kernel.c` recognises magic 0x336ec578 and reads
+ `cmdline_paddr` at +0x18 to populate `dt.bootargs`.
+- **Legacy `SYS_open(2)` alias.** amd64 stage0 `hex0-seed` issues
+ `syscall 2` (Linux's legacy open) directly. Added
+ `ARCH_SYS_open` in amd64 arch.h and a switch case that aliases
+ to `sys_openat(AT_FDCWD, ...)`.
+
+## What's not yet validated
+
+1. **boot3 → boot6 under DRIVER=seed amd64.** boot3 (scheme1
+ driving cc.scm to compile tcc.flat.c + libc.flat.c → tcc0)
+ is compute-bound under TCG-emulated x86_64 on Apple Silicon
+ (no hvf for amd64). The aarch64 path uses hvf and runs at
+ near-native speed; amd64 is 5–20× slower. Concretely: boot3
+ timed out at the previous 1800 s default. We bumped the
+ timeout knob (`BOOT3_TIMEOUT`, `BOOT4_TIMEOUT`) and a 4 h
+ boot3 run is in progress — see `build/amd64/.boot3-stage/
+ transcript.txt` for the live log.
+2. **Fixed-point check.** Once boot3/4/5 complete, the artifacts
+ (catm/scheme1/tcc0/tcc1/tcc2/tcc3/libc.a/...) should byte-match
+ the DRIVER=podman amd64 outputs. Untested.
+3. **`scripts/run-tests.sh amd64`** under DRIVER=seed. Untested.
+4. **boot4's tcc fixed-point assertion (`tcc2 == tcc3`)** under
+ DRIVER=seed amd64. Untested.
+
+## Cost / acceleration note
+
+The amd64 seed driver runs under `qemu-system-x86_64 -cpu max`
+with TCG (no hvf for x86_64 on Apple Silicon). A full
+`DRIVER=seed scripts/boot.sh amd64` is realistically a multi-hour
+operation. The aarch64 path is the fast/CI iterating loop; the
+amd64 seed path is mostly there for parity validation.
diff --git a/scripts/boot.sh b/scripts/boot.sh
@@ -2,15 +2,15 @@
## boot.sh — drive boot0 → boot5 end-to-end under one driver.
##
## Usage: scripts/boot.sh <arch>
-## DRIVER=seed scripts/boot.sh aarch64
+## DRIVER=seed scripts/boot.sh <amd64|aarch64|riscv64>
## DRIVER=podman scripts/boot.sh <amd64|aarch64|riscv64>
##
## DRIVER (default podman) is exported and consumed by each bootN.sh.
-## DRIVER=seed is aarch64-only and runs on build/$ARCH/boot6/Image —
-## the tcc3-built seed kernel produced by boot6. First-time setup
+## DRIVER=seed runs the build pipeline on build/$ARCH/boot6/{Image,kernel.elf}
+## — the tcc3-built seed kernel produced by boot6. First-time setup
## therefore requires one prior podman pass to produce that kernel:
-## ./scripts/boot.sh aarch64 # default DRIVER=podman
-## DRIVER=seed ./scripts/boot.sh aarch64 # re-run on tcc-built kernel
+## ./scripts/boot.sh <arch> # default DRIVER=podman
+## DRIVER=seed ./scripts/boot.sh <arch> # re-run on tcc-built kernel
## Subsequent DRIVER=seed runs reuse the Image from the prior boot6
## (stashed across the build/$ARCH wipe below).
@@ -23,8 +23,9 @@ case "$DRIVER" in
seed)
case "$ARCH" in
aarch64) KERNEL_NAME=Image ;;
+ amd64) KERNEL_NAME=kernel.elf ;;
riscv64) KERNEL_NAME=kernel.elf ;;
- *) echo "[boot] DRIVER=seed: aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
+ *) echo "[boot] DRIVER=seed: amd64|aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
esac
KERNEL=build/$ARCH/boot6/$KERNEL_NAME
if [ ! -f "$KERNEL" ]; then
@@ -69,8 +70,6 @@ stage boot3 ./scripts/boot3.sh $ARCH
stage boot4 ./scripts/boot4.sh $ARCH
stage boot5 ./scripts/boot5.sh $ARCH
-# boot6 builds the seed-kernel ELF with boot4's tcc3 (no `ld -T`,
-# no objcopy). Currently aarch64 + riscv64.
-case "$ARCH" in
- aarch64|riscv64) stage boot6 ./scripts/boot6.sh $ARCH ;;
-esac
+# boot6 builds the seed-kernel ELF/Image with boot4's tcc3 (no `ld -T`,
+# no objcopy).
+stage boot6 ./scripts/boot6.sh $ARCH
diff --git a/scripts/boot0.sh b/scripts/boot0.sh
@@ -49,8 +49,9 @@ case "$DRIVER" in
seed)
case "$ARCH" in
aarch64) KIMG=Image ;;
+ amd64) KIMG=kernel.elf ;;
riscv64) KIMG=kernel.elf ;;
- *) echo "[boot0] DRIVER=seed: aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
+ *) echo "[boot0] DRIVER=seed: amd64|aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
esac
KERNEL_IMAGE=$ROOT/build/$ARCH/boot6/$KIMG
EXTRACT=$ROOT/seed-kernel/scripts/extract-blk.sh
diff --git a/scripts/boot1.sh b/scripts/boot1.sh
@@ -59,8 +59,9 @@ case "$DRIVER" in
seed)
case "$ARCH" in
aarch64) KIMG=Image ;;
+ amd64) KIMG=kernel.elf ;;
riscv64) KIMG=kernel.elf ;;
- *) echo "[boot1] DRIVER=seed: aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
+ *) echo "[boot1] DRIVER=seed: amd64|aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
esac
KERNEL_IMAGE=$ROOT/build/$ARCH/boot6/$KIMG
EXTRACT=$ROOT/seed-kernel/scripts/extract-blk.sh
diff --git a/scripts/boot2.sh b/scripts/boot2.sh
@@ -67,8 +67,9 @@ case "$DRIVER" in
seed)
case "$ARCH" in
aarch64) KIMG=Image ;;
+ amd64) KIMG=kernel.elf ;;
riscv64) KIMG=kernel.elf ;;
- *) echo "[boot2] DRIVER=seed: aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
+ *) echo "[boot2] DRIVER=seed: amd64|aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
esac
KERNEL_IMAGE=$ROOT/build/$ARCH/boot6/$KIMG
EXTRACT=$ROOT/seed-kernel/scripts/extract-blk.sh
diff --git a/scripts/boot3.sh b/scripts/boot3.sh
@@ -86,8 +86,9 @@ fi
if [ "$DRIVER" = seed ]; then
case "$ARCH" in
aarch64) KIMG=Image ;;
+ amd64) KIMG=kernel.elf ;;
riscv64) KIMG=kernel.elf ;;
- *) echo "[boot3] DRIVER=seed: aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
+ *) echo "[boot3] DRIVER=seed: amd64|aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
esac
KERNEL_IMAGE=$ROOT/build/$ARCH/boot6/$KIMG
EXTRACT=$ROOT/seed-kernel/scripts/extract-blk.sh
@@ -150,7 +151,7 @@ runscm_input tcc.flat.c "$TCC_FLAT"
runscm_input libc.flat.c "$LIBC_FLAT"
runscm_export tcc0
-runscm_run 1800
+runscm_run "${BOOT3_TIMEOUT:-1800}"
echo "[boot3 $ARCH/$DRIVER] sizes: tcc0=$(wc -c <"$OUT/tcc0")"
echo "[boot3 $ARCH/$DRIVER] OK -> $OUT/tcc0"
diff --git a/scripts/boot4.sh b/scripts/boot4.sh
@@ -122,8 +122,9 @@ fi
if [ "$DRIVER" = seed ]; then
case "$ARCH" in
aarch64) KIMG=Image ;;
+ amd64) KIMG=kernel.elf ;;
riscv64) KIMG=kernel.elf ;;
- *) echo "[boot4] DRIVER=seed: aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
+ *) echo "[boot4] DRIVER=seed: amd64|aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
esac
KERNEL_IMAGE=$ROOT/build/$ARCH/boot6/$KIMG
EXTRACT=$ROOT/seed-kernel/scripts/extract-blk.sh
@@ -186,7 +187,7 @@ runscm_export s3-crt1.o
runscm_export s3-libc.a
runscm_export s3-libtcc1.a
runscm_export hello
-runscm_run 5400
+runscm_run "${BOOT4_TIMEOUT:-5400}"
# ── fixed-point check (host-side) ─────────────────────────────────────
# After a codegen-altering tcc patch, tcc1 (built by tcc0 = pre-fix) and
diff --git a/scripts/boot5.sh b/scripts/boot5.sh
@@ -71,8 +71,8 @@ cd "$ROOT"
DRIVER=${DRIVER:-podman}
if [ "$DRIVER" = seed ]; then
case "$ARCH" in
- aarch64|riscv64) ;;
- *) echo "[boot5] DRIVER=seed: aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
+ amd64|aarch64|riscv64) ;;
+ *) echo "[boot5] DRIVER=seed: amd64|aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
esac
fi
@@ -108,6 +108,7 @@ fi
if [ "$DRIVER" = seed ]; then
case "$ARCH" in
aarch64) KIMG=Image ;;
+ amd64) KIMG=kernel.elf ;;
riscv64) KIMG=kernel.elf ;;
esac
KERNEL_IMAGE=$ROOT/build/$ARCH/boot6/$KIMG
diff --git a/scripts/boot6-gen-runscm.sh b/scripts/boot6-gen-runscm.sh
@@ -94,6 +94,29 @@ cat > "$OUT" <<EOF
"-o" "out/$OUT_FILE"
"out/kernel-asm.o" "out/kernel.o" "out/mmu.o" "out/mem.o")
"link $OUT_FILE")
+EOF
+
+if [ "$ARCH" = amd64 ]; then
+ cat >> "$OUT" <<'EOF'
+
+;; amd64: tcc3 doesn't emit PT_NOTE phdrs, so QEMU's PVH `-kernel`
+;; path doesn't find the Xen 18 note we placed in `.note.Xen`.
+;; Build the fixup tool with tcc3 + boot4's libc/crt1/libtcc1 and
+;; rewrite the ELF in place.
+(write-string stdout "boot6: tcc3 build elf-pvh-note\n")
+(must (run "in/tcc3" "-nostdlib"
+ "in/crt1.o" "in/elf-pvh-note.c"
+ "in/libc.a" "in/libtcc1.a" "in/libc.a"
+ "-o" "out/elf-pvh-note")
+ "build elf-pvh-note")
+
+(write-string stdout "boot6: amd64 PVH note fixup\n")
+(must (run "out/elf-pvh-note" "out/kernel.elf")
+ "elf-pvh-note kernel.elf")
+EOF
+fi
+
+cat >> "$OUT" <<EOF
(write-string stdout "boot6: ALL-OK\n")
(exit 0)
diff --git a/scripts/boot6.sh b/scripts/boot6.sh
@@ -37,9 +37,7 @@
## before jumping to _start.
##
## Usage: scripts/boot6.sh <arch>
-## <arch> ∈ {amd64,aarch64,riscv64} for DRIVER=podman (default).
-## DRIVER=seed remains aarch64-only because the seed transport boots the
-## aarch64 seed kernel.
+## <arch> ∈ {amd64,aarch64,riscv64} for either DRIVER (default podman).
set -eu
@@ -59,8 +57,8 @@ cd "$ROOT"
DRIVER=${DRIVER:-podman}
case "$DRIVER:$ARCH" in
- seed:aarch64|seed:riscv64) ;;
- seed:*) echo "[boot6] DRIVER=seed: aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
+ seed:amd64|seed:aarch64|seed:riscv64) ;;
+ seed:*) echo "[boot6] DRIVER=seed: amd64|aarch64|riscv64 only (got $ARCH)" >&2; exit 2 ;;
esac
IMAGE=boot2-empty:$ARCH
BOOT2=build/$ARCH/boot2
@@ -108,6 +106,17 @@ runscm_input arch.h seed-kernel/arch/$ARCHDIR/arch.h
runscm_input mmu.c seed-kernel/arch/$ARCHDIR/mmu.c
runscm_input mem.c tcc-cc/mem.c
+# amd64 needs a post-link fixup — tcc3 doesn't emit PT_NOTE phdrs, so
+# QEMU's PVH `-kernel` path can't find the Xen 18 note that names the
+# 32-bit entry. The fixup is a hosted C tool we build inside the same
+# run.scm with tcc3 + boot4's libc/crt1/libtcc1, then run on kernel.elf.
+if [ "$ARCH" = amd64 ]; then
+ runscm_input elf-pvh-note.c seed-kernel/scripts/elf-pvh-note.c
+ runscm_input crt1.o "$BOOT4/crt1.o"
+ runscm_input libc.a "$BOOT4/libc.a"
+ runscm_input libtcc1.a "$BOOT4/libtcc1.a"
+fi
+
runscm_export "$OUT_FILE"
runscm_run 1200
diff --git a/scripts/lib-pipeline.sh b/scripts/lib-pipeline.sh
@@ -205,6 +205,21 @@ in/$inp"
-append "$APPEND" \
> "$TRANSCRIPT" 2>&1 &
;;
+ amd64)
+ qemu-system-x86_64 \
+ -machine microvm,acpi=off,pic=off,pit=off,rtc=off,isa-serial=on,auto-kernel-cmdline=off \
+ -cpu max -m 2048M \
+ -nodefaults -display none -serial stdio -no-reboot \
+ -global virtio-mmio.force-legacy=false \
+ -device isa-debug-exit,iobase=0x501,iosize=2 \
+ -kernel "$KERNEL_IMAGE" \
+ -drive file="$cpio_dir/in.img",if=none,format=raw,id=hd0,readonly=on \
+ -device virtio-blk-device,drive=hd0 \
+ -drive file="$cpio_dir/out.img",if=none,format=raw,id=hd1 \
+ -device virtio-blk-device,drive=hd1 \
+ -append "$APPEND" \
+ > "$TRANSCRIPT" 2>&1 &
+ ;;
*) echo "[lib-pipeline:seed] unsupported SEED_ARCH=$seed_arch" >&2; exit 2 ;;
esac
QPID=$!
diff --git a/scripts/lib-runscm.sh b/scripts/lib-runscm.sh
@@ -173,6 +173,24 @@ _runscm_run_seed() {
-append "init in/combined.scm" \
> "$TRANSCRIPT" 2>&1 &
;;
+ amd64)
+ # microvm + isa-debug-exit mirrors seed-kernel/run.sh: the
+ # kernel writes to port 0x501 on user exit_group(0) so QEMU
+ # exits cleanly (no `-no-reboot` triple-fault gymnastics).
+ qemu-system-x86_64 \
+ -machine microvm,acpi=off,pic=off,pit=off,rtc=off,isa-serial=on,auto-kernel-cmdline=off \
+ -cpu max -m "$mem" \
+ -nodefaults -display none -serial stdio -no-reboot \
+ -global virtio-mmio.force-legacy=false \
+ -device isa-debug-exit,iobase=0x501,iosize=2 \
+ -kernel "$KERNEL_IMAGE" \
+ -drive file="$S_STAGE_DIR/in.img",if=none,format=raw,id=hd0,readonly=on \
+ -device virtio-blk-device,drive=hd0 \
+ -drive file="$S_STAGE_DIR/out.img",if=none,format=raw,id=hd1 \
+ -device virtio-blk-device,drive=hd1 \
+ -append "init in/combined.scm" \
+ > "$TRANSCRIPT" 2>&1 &
+ ;;
*)
echo "[runscm/seed] unsupported SEED_ARCH=$seed_arch" >&2
exit 2
diff --git a/seed-kernel/arch/amd64/arch.h b/seed-kernel/arch/amd64/arch.h
@@ -27,6 +27,7 @@
#define ARCH_SYS_read 0
#define ARCH_SYS_write 1
+#define ARCH_SYS_open 2
#define ARCH_SYS_close 3
#define ARCH_SYS_lseek 8
#define ARCH_SYS_brk 12
diff --git a/seed-kernel/arch/amd64/kernel.S b/seed-kernel/arch/amd64/kernel.S
@@ -61,12 +61,31 @@ _start:
.byte 0xea; .long long_mode; .word 0x08
/* ljmp $0x08,$long_mode */
long_mode:
+ /* PVH entry hands us the hvm_start_info phys addr in ebx; preserve
+ * it through long-mode init (zero-extend to rbx) so kmain can pull
+ * `cmdline_paddr` out of it. The boot code above never touches ebx,
+ * so the value survives all the way down here. */
+ movl %ebx, %ebx
movw $0x10, %ax
movw %ax, %ds
movw %ax, %es
movw %ax, %ss
movq $kstack_top, %rsp
+ /* Enable SSE/SSE2 for user mode. tcc-emitted user binaries (tcc3,
+ * scheme1) save callee-saved xmm regs in every function prologue,
+ * so the first user `movq %xmm7, ...` would otherwise trap with
+ * #UD or #NM. CR0.MP=1, EM=0; CR4.OSFXSR=1, OSXMMEXCPT=1. The
+ * gcc-built kernel + `-mno-sse` user (user/hello.c) avoided this
+ * because gcc never emitted SSE; tcc has no equivalent flag. */
+ .byte 0x0f,0x20,0xc0 /* movq %cr0, %rax */
+ andq $~4, %rax
+ orq $2, %rax
+ .byte 0x0f,0x22,0xc0 /* movq %rax, %cr0 */
+ .byte 0x0f,0x20,0xe0 /* movq %cr4, %rax */
+ orq $0x600, %rax
+ .byte 0x0f,0x22,0xe0 /* movq %rax, %cr4 */
+
call amd64_serial_init
movq $__bss_start, %rdi
@@ -79,7 +98,7 @@ long_mode:
addq $8, %rdi
jmp 1b
2:
- xorl %edi, %edi
+ movq %rbx, %rdi
call kmain
3:
@@ -141,6 +160,81 @@ amd64_int80:
addq $216, %rsp
.byte 0x48,0xcf /* iretq */
+/* `syscall` entry point. Reached via MSR_LSTAR after EFER.SCE is set
+ * by mmu.c's setup_cpu_tables(). On entry: rcx=user RIP, r11=user RFLAGS,
+ * CS/SS reloaded from STAR, but RSP is still the user stack — we save
+ * it to saved_user_sp and switch to kstack manually. The trapframe
+ * mirrors amd64_int80's so trap_sync sees a uniform shape regardless
+ * of whether userland used `int $0x80` or `syscall`. */
+.globl amd64_syscall_entry
+amd64_syscall_entry:
+ movq %rsp, saved_user_sp(%rip) /* stash user RSP; restored just before sysretq */
+ movq $kstack_top, %rsp /* every entry starts from the top of kstack */
+ /* Reserve the 216-byte frame whose address is passed to trap_sync. */
+ subq $216, %rsp
+ movq %rcx, 192(%rsp) /* user RIP (reloaded into rcx on the way out) */
+ movq %r11, 200(%rsp) /* user RFLAGS (reloaded into r11 on the way out) */
+ /* Spill the user's registers into the frame. */
+ movq %rdi, 0(%rsp)
+ movq %rsi, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %r10, 24(%rsp)
+ movq %r8, 32(%rsp)
+ movq %r9, 40(%rsp)
+ movq %rax, 48(%rsp)
+ movq %rbx, 56(%rsp)
+ movq %rcx, 64(%rsp) /* rcx/r11 slots are written but never reloaded; */
+ movq %r11, 72(%rsp) /* the exit path takes them from 192/200 instead */
+ movq %r12, 80(%rsp)
+ movq %r13, 88(%rsp)
+ movq %r14, 96(%rsp)
+ movq %r15, 104(%rsp)
+ movq %rbp, 112(%rsp)
+ /* trap_sync(esr = 0, tf = %rsp). NOTE(review): 216 is 8 mod 16, so %rsp here is 16-byte aligned only if kstack_top is 8 mod 16 — confirm. */
+ xorl %edi, %edi
+ movq %rsp, %rsi
+ call trap_sync
+ /* Reload user registers from the frame; rax comes from slot 48, not from trap_sync's return value. */
+ movq 0(%rsp), %rdi
+ movq 8(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 24(%rsp), %r10
+ movq 32(%rsp), %r8
+ movq 40(%rsp), %r9
+ movq 48(%rsp), %rax
+ movq 56(%rsp), %rbx
+ movq 80(%rsp), %r12
+ movq 88(%rsp), %r13
+ movq 96(%rsp), %r14
+ movq 104(%rsp), %r15
+ movq 112(%rsp), %rbp
+ /* sysretq returns to the RIP in rcx and loads RFLAGS from r11. */
+ movq 192(%rsp), %rcx
+ movq 200(%rsp), %r11
+ addq $216, %rsp
+ /* Back onto the user stack saved at entry. */
+ movq saved_user_sp(%rip), %rsp
+ .byte 0x48,0x0f,0x07 /* sysretq */
+
+.globl amd64_wrmsr
+amd64_wrmsr:
+ /* amd64_wrmsr(u32 msr, u64 val): write `val` to model-specific register `msr`. */
+ movl %edi, %ecx /* ecx = MSR index */
+ movq %rsi, %rax /* eax = low 32 bits of val */
+ movq %rsi, %rdx
+ shrq $32, %rdx /* edx = high 32 bits of val */
+ .byte 0x0f,0x30 /* wrmsr */
+ ret
+
+.globl amd64_rdmsr
+amd64_rdmsr:
+ /* u64 amd64_rdmsr(u32 msr): return the value of model-specific register `msr`. */
+ movl %edi, %ecx /* ecx = MSR index */
+ .byte 0x0f,0x32 /* rdmsr */
+ shlq $32, %rdx /* rdmsr leaves the high half in edx, the low half in eax */
+ orq %rdx, %rax /* rax = (edx << 32) | eax */
+ ret
+
.globl amd64_unhandled
amd64_unhandled:
subq $216, %rsp
@@ -310,9 +404,13 @@ amd64_serial_init:
.section .rodata, "a"
.align 8
boot_gdt64:
- .quad 0
- .quad 0x00af9a000000ffff
- .quad 0x00af92000000ffff
+ /* null, 64-bit code (P=1,DPL=0,S=1,type=A; G=1,L=1; limit=0xfffff),
+ * 64-bit data (P=1,DPL=0,S=1,type=2; G=1,limit=0xfffff). Encoded as
+ * pairs of .long because tcc 0.9.26's assembler truncates a single
+ * `.quad` literal to 32 bits when the high half is non-zero. */
+ .long 0, 0
+ .long 0x0000ffff, 0x00af9a00
+ .long 0x0000ffff, 0x00af9200
boot_gdt64_ptr:
.word boot_gdt64_ptr - boot_gdt64 - 1
.long boot_gdt64
diff --git a/seed-kernel/arch/amd64/mmu.c b/seed-kernel/arch/amd64/mmu.c
@@ -69,8 +69,16 @@ extern void amd64_lidt(const struct idtr *i);
extern void amd64_ltr(u16 sel);
extern void amd64_int80(void);
extern void amd64_unhandled(void);
+extern void amd64_syscall_entry(void);
+extern void amd64_wrmsr(u32 msr, u64 val);
+extern u64 amd64_rdmsr(u32 msr);
extern char kstack_top[];
+#define MSR_EFER 0xc0000080U
+#define MSR_STAR 0xc0000081U
+#define MSR_LSTAR 0xc0000082U
+#define MSR_SFMASK 0xc0000084U
+
static void set_gate(int vec, void (*fn)(void), int dpl) {
u64 a = (u64)fn;
idt[vec].off0 = (u16)a;
@@ -109,6 +117,24 @@ static void setup_cpu_tables(void) {
set_gate(0x80, amd64_int80, 3);
struct idtr idtr = { (u16)(sizeof(idt) - 1), (u64)idt };
amd64_lidt(&idtr);
+
+ /* Modern x86_64 fast-syscall path: scheme1 (and any tcc-built user
+ * binary that follows the standard SysV/Linux amd64 ABI) emits the
+ * `syscall` instruction, which routes through MSR_LSTAR rather than
+ * the IDT. Without this block the first `syscall` raises #UD because
+ * EFER.SCE is clear, manifesting as the unhandled-vector PANIC that
+ * scheme1 hits ~0xa09 bytes into its prelude.
+ *
+ * STAR layout for sysret with REX.W=1: the CPU computes user CS as
+ * STAR[63:48]+16 and user SS as STAR[63:48]+8 (RPL forced to 3).
+ * Our user CS sel is 0x23 (gdt[4] | 3) and user SS sel is 0x1b
+ * (gdt[3] | 3), so STAR[63:48] = 0x10. Kernel side: STAR[47:32] = 8
+ * yields kernel CS=0x08 and kernel SS=0x10 on syscall entry, which
+ * matches our gdt[1]/gdt[2]. */
+ amd64_wrmsr(MSR_EFER, amd64_rdmsr(MSR_EFER) | 1UL);
+ amd64_wrmsr(MSR_STAR, ((u64)0x10UL << 48) | ((u64)0x08UL << 32));
+ amd64_wrmsr(MSR_LSTAR, (u64)amd64_syscall_entry);
+ amd64_wrmsr(MSR_SFMASK, 0x200UL); /* clear IF on syscall entry */
}
static u64 pool_pa(int which) {
diff --git a/seed-kernel/kernel.c b/seed-kernel/kernel.c
@@ -146,15 +146,33 @@ static int str_starts(const char *s, const char *prefix) {
static void parse_dtb(const void *dtb, struct dtb_info *out) {
#ifdef ARCH_STATIC_VIRTIO_MMIO_BASE
- if ((u64)dtb == 0) {
- out->mem_start = ARCH_STATIC_MEM_START;
- out->mem_size = ARCH_STATIC_MEM_SIZE;
- out->virtio_mmio_n = ARCH_STATIC_VIRTIO_MMIO_COUNT;
- for (int i = 0; i < out->virtio_mmio_n && i < MAX_VIRTIO_MMIO; i++)
- out->virtio_mmio_pa[i] = ARCH_STATIC_VIRTIO_MMIO_BASE +
- (u64)i * ARCH_STATIC_VIRTIO_MMIO_STRIDE;
- return;
+ /* amd64 (microvm) has no DTB. mem + virtio-mmio come from arch.h
+ * defines, and the kernel cmdline (qemu -append "...") arrives via
+ * the PVH `hvm_start_info` struct that QEMU points EBX at. The
+ * Xen-defined magic is XEN_HVM_START_MAGIC_VALUE = 0x336ec578;
+ * cmdline_paddr lives at offset 0x18. */
+ out->mem_start = ARCH_STATIC_MEM_START;
+ out->mem_size = ARCH_STATIC_MEM_SIZE;
+ out->virtio_mmio_n = ARCH_STATIC_VIRTIO_MMIO_COUNT;
+ for (int i = 0; i < out->virtio_mmio_n && i < MAX_VIRTIO_MMIO; i++)
+ out->virtio_mmio_pa[i] = ARCH_STATIC_VIRTIO_MMIO_BASE +
+ (u64)i * ARCH_STATIC_VIRTIO_MMIO_STRIDE;
+ if ((u64)dtb != 0) {
+ const u8 *p = dtb;
+ u32 magic = (u32)p[0] | ((u32)p[1] << 8) | ((u32)p[2] << 16) | ((u32)p[3] << 24);
+ if (magic == 0x336ec578U) {
+ const u8 *cp = p + 0x18;
+ u64 cmdline_paddr = 0;
+ for (int i = 0; i < 8; i++) cmdline_paddr |= (u64)cp[i] << (i * 8);
+ if (cmdline_paddr) {
+ const char *s = (const char *)cmdline_paddr;
+ int i = 0;
+ while (s[i] && i < 255) { out->bootargs[i] = s[i]; i++; }
+ out->bootargs[i] = 0;
+ }
+ }
}
+ return;
#endif
const u8 *base = dtb;
if (be32(base) != FDT_MAGIC) {
@@ -811,6 +829,9 @@ static u64 brk_max;
#define SYS_unlinkat ARCH_SYS_unlinkat
#define SYS_openat ARCH_SYS_openat
+#ifdef ARCH_SYS_open
+#define SYS_open ARCH_SYS_open
+#endif
#define SYS_close ARCH_SYS_close
#define SYS_lseek ARCH_SYS_lseek
#define SYS_read ARCH_SYS_read
@@ -1314,6 +1335,11 @@ i64 trap_sync(u64 esr, struct trapframe *tf) {
case SYS_read: r = sys_read((int)a0, (void *)a1, a2); break;
case SYS_write: r = sys_write((int)a0, (const void *)a1, a2); break;
case SYS_openat: r = sys_openat((int)a0, (const char *)a1, (int)a2, (int)a3); break;
+#ifdef SYS_open
+ /* amd64 hex0/hex1/hex2/M0 seed binaries call legacy `open(path,
+ * flags, mode)` directly; alias to openat(AT_FDCWD, ...). */
+ case SYS_open: r = sys_openat(AT_FDCWD, (const char *)a0, (int)a1, (int)a2); break;
+#endif
case SYS_close: r = sys_close((int)a0); break;
case SYS_lseek: r = sys_lseek((int)a0, (i64)a1, (int)a2); break;
case SYS_brk: r = sys_brk(a0); break;
diff --git a/seed-kernel/scripts/elf-pvh-note.c b/seed-kernel/scripts/elf-pvh-note.c
@@ -0,0 +1,103 @@
+/* elf-pvh-note — append a PT_NOTE program header pointing at the
+ * existing `.note.Xen` section, and retype that section as SHT_NOTE.
+ *
+ * Why: tcc 0.9.26's linker emits only PT_LOAD program headers and
+ * marks every assembler-created section as SHT_PROGBITS, so QEMU's
+ * PVH `-kernel` path (which scans PT_NOTE phdrs for the Xen 18 note
+ * to find the 32-bit entry) refuses the kernel with "Error loading
+ * uncompressed kernel without PVH ELF Note". Patching the linker is
+ * a much bigger change than rewriting six fields after the fact;
+ * tcc reserves the post-Ehdr program-header area only large enough
+ * for its declared phnum, but the next section content lives at
+ * 0x200000 (s_align), so writing one extra Phdr at phoff+phnum*phentsize
+ * fits in the gap.
+ *
+ * Usage: elf-pvh-note <elf-path>
+ */
+
+extern int open(const char *, int, ...);
+extern long lseek(int, long, int);
+extern long read(int, void *, unsigned long);
+extern long write(int, const void *, unsigned long);
+extern int close(int);
+extern void *malloc(unsigned long);
+extern int strcmp(const char *, const char *);
+extern int printf(const char *, ...);
+
+#define O_RDWR 2
+#define SEEK_SET 0
+#define SEEK_END 2
+
+#define PT_NOTE 4
+#define PF_R 4
+#define SHT_NOTE 7
+
+typedef unsigned long u64;
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+static u16 r16(u8 *p) { return (u16)p[0] | ((u16)p[1] << 8); }
+static u32 r32(u8 *p) {
+ return (u32)p[0] | ((u32)p[1] << 8) | ((u32)p[2] << 16) | ((u32)p[3] << 24);
+}
+static u64 r64(u8 *p) {
+ return (u64)r32(p) | ((u64)r32(p + 4) << 32);
+}
+static void w16(u8 *p, u16 v) { p[0] = v; p[1] = v >> 8; }
+static void w32(u8 *p, u32 v) {
+ p[0] = v; p[1] = v >> 8; p[2] = v >> 16; p[3] = v >> 24;
+}
+static void w64(u8 *p, u64 v) { w32(p, (u32)v); w32(p + 4, (u32)(v >> 32)); }
+
+int main(int argc, char **argv) {
+ if (argc != 2) { printf("usage: elf-pvh-note <elf>\n"); return 2; }
+ int fd = open(argv[1], O_RDWR);
+ if (fd < 0) { printf("elf-pvh-note: open failed\n"); return 1; }
+ long size = lseek(fd, 0, SEEK_END);
+ lseek(fd, 0, SEEK_SET);
+ u8 *buf = (u8 *)malloc(size);
+ if (read(fd, buf, size) != size) { printf("elf-pvh-note: short read\n"); return 1; }
+
+ u64 phoff = r64(buf + 0x20);
+ u16 phentsize = r16(buf + 0x36);
+ u16 phnum = r16(buf + 0x38);
+ u64 shoff = r64(buf + 0x28);
+ u16 shentsize = r16(buf + 0x3a);
+ u16 shnum = r16(buf + 0x3c);
+ u16 shstrndx = r16(buf + 0x3e);
+
+ u64 shstrtab_off = r64(buf + shoff + shstrndx * shentsize + 0x18);
+
+ u64 note_off = 0, note_size = 0, note_addr = 0;
+ int found = 0;
+ for (int i = 0; i < shnum; i++) {
+ u8 *sh = buf + shoff + i * shentsize;
+ u32 name_off = r32(sh);
+ if (strcmp((char *)(buf + shstrtab_off + name_off), ".note.Xen") == 0) {
+ note_addr = r64(sh + 0x10);
+ note_off = r64(sh + 0x18);
+ note_size = r64(sh + 0x20);
+ w32(sh + 4, SHT_NOTE);
+ found = 1;
+ break;
+ }
+ }
+ if (!found) { printf("elf-pvh-note: no .note.Xen section\n"); return 1; }
+
+ u8 *ph = buf + phoff + phnum * phentsize;
+ w32(ph + 0, PT_NOTE);
+ w32(ph + 4, PF_R);
+ w64(ph + 8, note_off);
+ w64(ph + 16, note_addr);
+ w64(ph + 24, note_addr);
+ w64(ph + 32, note_size);
+ w64(ph + 40, note_size);
+ w64(ph + 48, 4);
+ w16(buf + 0x38, phnum + 1);
+
+ lseek(fd, 0, SEEK_SET);
+ if (write(fd, buf, size) != size) { printf("elf-pvh-note: short write\n"); return 1; }
+ close(fd);
+ return 0;
+}