commit 29e05d332a0b41d5af518091260aeead4c1d3fca
parent 981f61e8f53007812c4d6cb9cdac887f26e4f529
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 5 May 2026 16:50:59 -0700
seed-kernel: refactor kernel.S to drop features tcc3's assembler lacks; refresh OS-TODO
Rewrites kernel.S to use only features tcc3's arm64 assembler already
supports, narrowing the gap to self-host the seed kernel:
- adrp/add :lo12: and adr label loads → ldr Xn, =sym (lowers to the
same MOVW_UABS_G{0..3} chain arm64-gen.c emits today)
- .macro VENTRY (16 invocations) → unrolled .balign 0x80 + b
- .macro SAVE_TF / RESTORE_TF → unrolled at all three trap entries
(linkage-register factoring rejected: would clobber user x18 before
saving it)
Verified: gcc + kernel.lds build still produces a working Image; the
hello and fork initramfs both boot and exit cleanly under qemu+hvf.
Updates docs/OS-TODO.md with the empirical tcc3 blocker list, taken
by running build/aarch64/boot4/tcc3 against the post-refactor sources
and stubbing each error until the next surfaced. Drops the items
the refactor solved (.macro, adrp/:lo12:); adds the ones it
exposed (b.cond/cbz forward-ref symbol-table bug; tcc-emitted
memmove call kernel.c doesn't define).
Diffstat:
| M | docs/OS-TODO.md | | | 150 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- |
| M | seed-kernel/kernel.S | | | 192 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- |
2 files changed, 252 insertions(+), 90 deletions(-)
diff --git a/docs/OS-TODO.md b/docs/OS-TODO.md
@@ -15,50 +15,122 @@ This file tracks remaining polish.
## tcc3 self-host of `seed-kernel/`
-The C side already compiles and runs cleanly under
+The C side compiles cleanly under
[`build/aarch64/boot4/tcc3`](../scripts/boot4.sh) — `kernel.c` and
`user/{forktest,child,hello}.c` have no inline asm, all sysreg / barrier
/ cache / TLB / PSCI ops route through the C-callable thunks at the
bottom of [`kernel.S`](../seed-kernel/kernel.S), and per-syscall
wrappers delegate to a single `syscall6` toplevel-asm thunk.
-
-Remaining blockers are all in tcc 0.9.26's arm64 assembler /
-linker (see [`docs/TCC-ARM64-ASM.md`](TCC-ARM64-ASM.md) for the
-phase-1/2/3 trajectory of
-`scripts/simple-patches/tcc-0.9.26/files/arm64-asm.c`).
-
-### `kernel.S` mnemonics not yet in phase-2
-
-- `msr` / `mrs` to/from named system registers: `mair_el1`,
- `tcr_el1`, `ttbr0_el1`, `sctlr_el1`, `cpacr_el1`, `sp_el0`,
- `esr_el1`, `far_el1`, `vbar_el1`, `hcr_el2`, `spsr_el2`,
- `elr_el2`, `sp_el1`. Plus the immediate / pseudo forms
- `msr daifset, #imm` and `mrs CurrentEL`.
-- `eret`.
-- `ic iallu`, `tlbi vmalle1`.
-- `dsb sy` / `dsb ish` / `dmb ish` / `dmb ishst` by name — the
- current assembler wants `dsb #imm` and rejects the named scope.
-- `adrp` / `adr` + `:lo12:` relocations for label addresses (the
- EL1 boot path uses `adrp Xn, sym; add Xn, Xn, :lo12:sym`).
-- `.macro` / `.endm` (VENTRY pattern in the exception vector table).
-- `.quad sym1 - sym2` (the arm64 Image header's `image_size` field).
-
-### `kernel.lds` — no `-T` in tcc3
-
-tcc3's linker accepts `-Wl,-Ttext=`, `-Wl,-image-base=`,
-`-Wl,-section-alignment=` only; there is no `-T script`. The seed
-kernel's link layout needs:
-
-- `KEEP(*(.head.text))` placed first (boot header at `0x40080000`).
-- Link-time symbol assignments: `__bss_start`, `__bss_end`,
- `kstack_top`, `_end`, `_image_end`.
-- A custom `.stack` section sized to 64 KB.
-
-Two paths: tcc gains a small ld-script subset, or `kernel.S`
-self-defines the symbols and reserves stack space inline (with
-`-Wl,-Ttext=0x40080000` replacing the base). The former is bigger
-but reusable across kernel-style targets; the latter ties section
-ordering to input file order and is brittle.
+[`kernel.S`](../seed-kernel/kernel.S) was refactored to drop the
+features the doc previously listed as "tcc-side fixable":
+
+- All `adrp/add :lo12:` and `adr` label loads rewritten to `ldr Xn,
+ =sym`, which already lowers to the same MOVW_UABS_G{0..3} chain
+ `arm64-gen.c` uses for compiler-emitted symbol loads. No tcc work
+ needed for label addressing.
+- The 16 `.macro VENTRY` invocations unrolled.
+- `.macro SAVE_TF / RESTORE_TF` unrolled at all three trap entries.
+
+What remains is listed below, as observed by running tcc3 against
+the post-refactor sources (each blocker was stubbed out and tcc3
+re-run until the next one surfaced).
+
+### `kernel.S` blockers — assembler
+
+Failures from `tcc3 -c kernel.S` (line numbers as of this writing):
+
+1. **`.quad sym1 - sym2`** (line 13: `.quad _image_end - _head` in
+ the Image header). `tccasm.c`'s `.quad` handler on non-x86_64
+ reads tokens through `strtoll` and rejects everything but a bare
+ integer. `asm_expr_sum` already supports same-section symbol
+ subtraction (lines 286–294 of tccasm.c) — extending `.quad` to go
+ through it is ~10 lines.
+2. **`msr` / `mrs` named system registers** (line 23: `msr daifset,
+ #0xf`; line 26: `mrs x9, CurrentEL`; many more). Both mnemonics
+ absent from `arm64-tok.h` / `arm64-asm.c`. Needs:
+ - A small named-sysreg table covering the kernel's set:
+ `mair_el1`, `tcr_el1`, `ttbr0_el1`, `sctlr_el1`, `cpacr_el1`,
+ `sp_el0`, `sp_el1`, `esr_el1`, `far_el1`, `vbar_el1`, `elr_el1`,
+ `spsr_el1`, `hcr_el2`, `spsr_el2`, `elr_el2`, plus read-only
+ `CurrentEL`.
+ - The MSR-immediate form for ProcState fields: `msr daifset,
+ #imm` / `msr daifclr, #imm`.
+3. **`eret`** (line 48). Single 32-bit fixed encoding (`0xd69f03e0`).
+4. **`ic iallu`** (one site), **`tlbi vmalle1`** (one site). Both
+ are SYS-instruction encodings; one generic `emit_sys` helper plus
+ a small named-op table covers them and leaves room to grow.
+5. **`dsb sy` / `dsb ish` / `dmb ish` / `dmb ishst` by name.** The
+ current `do_barrier` already accepts `#imm`; adding the named
+ tokens is one extra `tok_to_barrier_crm()` lookup. Workaround
+ for now: write `dsb #0xf` (sy), `dsb #0xb` (ish), `dmb #0xb`
+ (ish), `dmb #0xa` (ishst). No tcc work needed if we accept the
+ numeric form.
+6. **`b.cond` to a forward in-section label** (line 29: `b.ne
+ in_el1`, plus several others to `1f` / `2f`). `sec_local_offset`
+ in `arm64-asm.c` rejects `sym->r != cur_text_section->sh_num`,
+ but on a forward reference the symbol is freshly created with
+ `sym->r == 0` (SHN_UNDEF) — so even legitimate same-section
+ forward jumps fall through to the "extern target needs CONDBR19
+ reloc (unsupported)" error path. Two fixes:
+ - Add a forward-fixup list (record use site, patch when label is
+ defined). Same shape `i386-asm.c` uses.
+ - Implement `R_AARCH64_CONDBR19` in `arm64-link.c` and emit a
+ reloc when the symbol is undefined; the linker resolves it.
+ Both fix `cbz`/`cbnz` (#7) too. The reloc path is also needed if
+ a future kernel ever wants `b.cond <extern>`.
+7. **`cbz` / `cbnz` to a forward in-section label** (line 440:
+ `cbnz x9, .Lpsci_smc`). Same root cause as #6 — same emitter
+ helper (`sec_local_offset`) — same fix unlocks both.
+
+The `.macro`/`.endm` and `adrp`/`:lo12:` items from the previous
+draft of this list are gone: the first by unrolling, the second by
+switching to `ldr Xn, =sym`. Both are pure-`kernel.S` choices that
+made the assembler's job substantially smaller.
+
+### `kernel.c` blockers — runtime / codegen
+
+8. **`memmove` undefined at link.** `kernel.c` defines `memset` and
+ `memcpy` but not `memmove`. gcc never emits a `memmove` call
+ from this source, but tcc does (likely a struct-copy lowering;
+ the offending site has not been chased down). Add a small
+ `memmove` next to `memset`/`memcpy` in `kernel.c` — overlap
+ logic is ~6 lines.
+
+### `kernel.lds` blockers — linker
+
+9. **No `-T script` in tcc3.** `tcc -Wl,-T,kernel.lds` errors out:
+ `unsupported linker option`. Tcc3's `-Wl,` accepts `-Ttext=`,
+ `-image-base=`, `-section-alignment=` only. The seed kernel's
+ layout needs:
+ - `KEEP(*(.head.text))` first (boot header at `0x40080000`).
+ - Link-time symbol assignments: `__bss_start`, `__bss_end`,
+ `kstack_top`, `_end`, `_image_end`.
+ - A custom `.stack` section sized to 64 KB.
+
+10. **Linker-script symbols undefined** (`kstack_top`,
+ `__bss_start`, `__bss_end`, `_image_end`). Surfaces at link time
+ once the script is dropped — the entry stub's stack setup and
+ BSS-zeroing loop reference the first three, and the Image-header
+ `image_size` expression references `_image_end`.
+
+Two paths for the layout problem (#9 + #10):
+- **Inline.** Define the symbols and reserve a 64 KB stack inside
+ `kernel.S` using `.skip` / `.balign`. Brackets the kernel's `.bss`
+ by putting `__bss_start:` in a `.bss.0_start` section and
+ `__bss_end:` in `.bss.9_end` — but tcc/ld won't merge these into
+ the main `.bss` without a script, so each becomes its own output
+ section. Workable if BSS is replaced with explicit storage in
+ `kernel.S`; brittle if we keep relying on the C compiler's `.bss`
+ emission.
+- **Tcc gains a small ld-script subset.** `KEEP(*(...))`, simple
+ symbol assignment (`sym = .;`), and a single explicit-storage
+ section block. ~150 LoC, reusable across kernel-style targets,
+ doesn't restructure `kernel.S` further.
+
+The post-refactor balance probably tips toward the script subset —
+the inline-bracket trick for BSS is the wart the previous draft of
+this doc called out as "brittle," and the cost gap to a small ld
+implementation has narrowed now that everything else is solved.
### Toplevel `asm()` `.globl` ordering — worked around
diff --git a/seed-kernel/kernel.S b/seed-kernel/kernel.S
@@ -29,37 +29,37 @@ stext:
b.ne in_el1
/* EL2 → EL1: set HCR_EL2.RW=1 (EL1 is AArch64), CNTHCTL/CNTVOFF defaults,
- * SPSR=EL1h with DAIF masked, ELR=in_el1, eret. */
+ * SPSR=EL1h with DAIF masked, ELR=in_el1, eret.
+ *
+ * Address loads use `ldr Xn, =sym` rather than `adrp/add :lo12:` so this
+ * file assembles under tcc3 (the MOVW_UABS_G{0..3} reloc chain that
+ * `ldr =sym` lowers to is the same chain `arm64-gen.c` uses for every
+ * compiler-emitted symbol load — see docs/TCC-ARM64-ASM.md). */
mov x9, #(1 << 31)
msr hcr_el2, x9
mov x9, #0x3c5 /* EL1h, DAIF=1111 */
msr spsr_el2, x9
- adr x9, in_el1
+ ldr x9, =in_el1
msr elr_el2, x9
/* Make sure SP_EL1 is set before we eret to EL1 (else we land with
* an undefined SP). Use the same kernel stack we're about to install. */
- adrp x9, kstack_top
- add x9, x9, :lo12:kstack_top
+ ldr x9, =kstack_top
msr sp_el1, x9
eret
in_el1:
/* Stack. */
- adrp x9, kstack_top
- add x9, x9, :lo12:kstack_top
+ ldr x9, =kstack_top
mov sp, x9
/* Vector table. */
- adrp x9, vector_table
- add x9, x9, :lo12:vector_table
+ ldr x9, =vector_table
msr vbar_el1, x9
isb
/* Zero BSS. */
- adrp x1, __bss_start
- add x1, x1, :lo12:__bss_start
- adrp x2, __bss_end
- add x2, x2, :lo12:__bss_end
+ ldr x1, =__bss_start
+ ldr x2, =__bss_end
1: cmp x1, x2
b.ge 2f
str xzr, [x1], #8
@@ -76,45 +76,67 @@ hang:
/* ─── Exception vector table ──────────────────────────────────────────── */
-.macro VENTRY label
- .balign 0x80
- b \label
-.endm
+/* The 16 entries are unrolled (rather than emitted via `.macro VENTRY` +
+ * 16 invocations) because tcc 0.9.26's `tccasm.c` does not implement
+ * `.macro` / `.endm`. Each entry is `.balign 0x80` (puts the entry on its
+ * 128-byte slot) followed by a single `b <handler>`. */
.section .text, "ax"
.balign 0x800
.globl vector_table
vector_table:
/* Current EL with SP_EL0 (we never run kernel like this — only user). */
- VENTRY el1_sp0_sync /* 0x000: SVC from EL1t (our "user") */
- VENTRY unhandled /* 0x080 */
- VENTRY unhandled /* 0x100 */
- VENTRY unhandled /* 0x180 */
+ .balign 0x80 /* 0x000: SVC from EL1t (our "user") */
+ b el1_sp0_sync
+ .balign 0x80 /* 0x080 */
+ b unhandled
+ .balign 0x80 /* 0x100 */
+ b unhandled
+ .balign 0x80 /* 0x180 */
+ b unhandled
/* Current EL with SP_ELx (kernel internal). */
- VENTRY el1_spx_sync /* 0x200: panic on kernel sync fault */
- VENTRY unhandled /* 0x280 */
- VENTRY unhandled /* 0x300 */
- VENTRY unhandled /* 0x380 */
+ .balign 0x80 /* 0x200: panic on kernel sync fault */
+ b el1_spx_sync
+ .balign 0x80 /* 0x280 */
+ b unhandled
+ .balign 0x80 /* 0x300 */
+ b unhandled
+ .balign 0x80 /* 0x380 */
+ b unhandled
/* Lower EL using AArch64 (EL0). Unused in this design but wired. */
- VENTRY el1_sp0_sync /* 0x400 */
- VENTRY unhandled /* 0x480 */
- VENTRY unhandled /* 0x500 */
- VENTRY unhandled /* 0x580 */
+ .balign 0x80 /* 0x400 */
+ b el1_sp0_sync
+ .balign 0x80 /* 0x480 */
+ b unhandled
+ .balign 0x80 /* 0x500 */
+ b unhandled
+ .balign 0x80 /* 0x580 */
+ b unhandled
/* Lower EL using AArch32 (unused). */
- VENTRY unhandled /* 0x600 */
- VENTRY unhandled /* 0x680 */
- VENTRY unhandled /* 0x700 */
- VENTRY unhandled /* 0x780 */
+ .balign 0x80 /* 0x600 */
+ b unhandled
+ .balign 0x80 /* 0x680 */
+ b unhandled
+ .balign 0x80 /* 0x700 */
+ b unhandled
+ .balign 0x80 /* 0x780 */
+ b unhandled
/* ─── Trap entry/exit ─────────────────────────────────────────────────────
* Save x0..x30 + ELR_EL1 + SPSR_EL1 onto the kernel stack as a trapframe,
- * call C trap_sync(esr, &tf), restore, eret. The C handler reads/writes
- * tf->x[0..7] for syscall args and return value, plus tf->x[8] for the
- * syscall number.
+ * call a C handler with (esr, &tf), restore, eret. The C handler reads
+ * and writes tf->x[0..7] for syscall args and return value, plus tf->x[8]
+ * for the syscall number.
+ *
+ * Save/restore are unrolled at each of the three entries (sp0_sync,
+ * spx_sync, unhandled) rather than being factored through `.macro` —
+ * tcc 0.9.26's tccasm.c does not implement .macro, and the obvious
+ * shared-subroutine factoring (`b save_tf; ...; b restore_tf` with a
+ * linkage register) would clobber the user's x18 before saving it.
*/
-.macro SAVE_TF
+el1_sp0_sync:
sub sp, sp, #272
stp x0, x1, [sp, #0]
stp x2, x3, [sp, #16]
@@ -135,9 +157,9 @@ vector_table:
mrs x10, elr_el1
mrs x11, spsr_el1
stp x10, x11, [sp, #248]
-.endm
-
-.macro RESTORE_TF
+ mrs x0, esr_el1
+ mov x1, sp
+ bl trap_sync
ldp x10, x11, [sp, #248]
msr elr_el1, x10
msr spsr_el1, x11
@@ -158,31 +180,99 @@ vector_table:
ldp x2, x3, [sp, #16]
ldp x0, x1, [sp, #0]
add sp, sp, #272
-.endm
-
-el1_sp0_sync:
- SAVE_TF
- mrs x0, esr_el1
- mov x1, sp
- bl trap_sync
- RESTORE_TF
eret
el1_spx_sync:
/* Same shape as user sync — let C distinguish via SPSR/ESR if needed. */
- SAVE_TF
+ sub sp, sp, #272
+ stp x0, x1, [sp, #0]
+ stp x2, x3, [sp, #16]
+ stp x4, x5, [sp, #32]
+ stp x6, x7, [sp, #48]
+ stp x8, x9, [sp, #64]
+ stp x10, x11, [sp, #80]
+ stp x12, x13, [sp, #96]
+ stp x14, x15, [sp, #112]
+ stp x16, x17, [sp, #128]
+ stp x18, x19, [sp, #144]
+ stp x20, x21, [sp, #160]
+ stp x22, x23, [sp, #176]
+ stp x24, x25, [sp, #192]
+ stp x26, x27, [sp, #208]
+ stp x28, x29, [sp, #224]
+ str x30, [sp, #240]
+ mrs x10, elr_el1
+ mrs x11, spsr_el1
+ stp x10, x11, [sp, #248]
mrs x0, esr_el1
mov x1, sp
bl trap_kernel
- RESTORE_TF
+ ldp x10, x11, [sp, #248]
+ msr elr_el1, x10
+ msr spsr_el1, x11
+ ldr x30, [sp, #240]
+ ldp x28, x29, [sp, #224]
+ ldp x26, x27, [sp, #208]
+ ldp x24, x25, [sp, #192]
+ ldp x22, x23, [sp, #176]
+ ldp x20, x21, [sp, #160]
+ ldp x18, x19, [sp, #144]
+ ldp x16, x17, [sp, #128]
+ ldp x14, x15, [sp, #112]
+ ldp x12, x13, [sp, #96]
+ ldp x10, x11, [sp, #80]
+ ldp x8, x9, [sp, #64]
+ ldp x6, x7, [sp, #48]
+ ldp x4, x5, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #0]
+ add sp, sp, #272
eret
unhandled:
- SAVE_TF
+ sub sp, sp, #272
+ stp x0, x1, [sp, #0]
+ stp x2, x3, [sp, #16]
+ stp x4, x5, [sp, #32]
+ stp x6, x7, [sp, #48]
+ stp x8, x9, [sp, #64]
+ stp x10, x11, [sp, #80]
+ stp x12, x13, [sp, #96]
+ stp x14, x15, [sp, #112]
+ stp x16, x17, [sp, #128]
+ stp x18, x19, [sp, #144]
+ stp x20, x21, [sp, #160]
+ stp x22, x23, [sp, #176]
+ stp x24, x25, [sp, #192]
+ stp x26, x27, [sp, #208]
+ stp x28, x29, [sp, #224]
+ str x30, [sp, #240]
+ mrs x10, elr_el1
+ mrs x11, spsr_el1
+ stp x10, x11, [sp, #248]
mrs x0, esr_el1
mov x1, sp
bl trap_unhandled
- RESTORE_TF
+ ldp x10, x11, [sp, #248]
+ msr elr_el1, x10
+ msr spsr_el1, x11
+ ldr x30, [sp, #240]
+ ldp x28, x29, [sp, #224]
+ ldp x26, x27, [sp, #208]
+ ldp x24, x25, [sp, #192]
+ ldp x22, x23, [sp, #176]
+ ldp x20, x21, [sp, #160]
+ ldp x18, x19, [sp, #144]
+ ldp x16, x17, [sp, #128]
+ ldp x14, x15, [sp, #112]
+ ldp x12, x13, [sp, #96]
+ ldp x10, x11, [sp, #80]
+ ldp x8, x9, [sp, #64]
+ ldp x6, x7, [sp, #48]
+ ldp x4, x5, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #0]
+ add sp, sp, #272
eret