boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

stage1-flatten.sh (17163B)


      1 #!/bin/sh
      2 ## bootprep/stage1-flatten.sh — flatten upstream tcc-0.9.26 into a single
      3 ## C bytestream (tcc.flat.c) using only the host preprocessor.
      4 ##
      5 ## This is the first of three stages building tcc-0.9.26 without
      6 ## M2-Planet, MesCC, or Mes Scheme. See docs/TCC.md.
      7 ##
      8 ## Steps performed by this stage:
      9 ##   1. unpack tcc-0.9.26-1147-gee75a10c.tar.gz
     10 ##   2. apply live-bootstrap simple-patches (tcctools.c file-open reorder)
     11 ##   3. host cc -E -nostdinc with mes-bundled headers + tcc-mes defines
     12 ##   4. emit build/<arch>/vendor/tcc/tcc.flat.c
     13 ##   5. (--verify) compile tcc.flat.c with host cc to confirm well-formedness
     14 ##
     15 ## Stage 1 deliberately stays on the host: it is just text manipulation
     16 ## (preprocess + concat) and the resulting tcc.flat.c is a portable
     17 ## artifact downstream stages consume. No container needed.
     18 ##
     19 ## Usage:
     20 ##   bootprep/stage1-flatten.sh [--arch <X86_64|I386|RISCV64|ARM64|AARCH64>] [--verify]
     21 
     22 set -eu
     23 
     24 # --- arg parse --------------------------------------------------------
     25 ARCH=X86_64
     26 VERIFY=0
     27 while [ $# -gt 0 ]; do
     28     case "$1" in
     29         --arch)    ARCH=$2; shift 2 ;;
     30         --verify)  VERIFY=1; shift ;;
     31         -h|--help) awk '/^##/ { sub(/^## ?/, ""); print }' "$0"; exit 0 ;;
     32         *) echo "unknown arg: $1" >&2; exit 2 ;;
     33     esac
     34 done
     35 
     36 case "$ARCH" in
     37     X86_64|x86_64|amd64)    BOOT_ARCH=amd64;   MES_ARCH=x86_64;  HAVE_LL=1; TCC_TARGET_DEFINE=X86_64;  CPP_ARCH=x86_64  ;;
     38     I386|i386)              BOOT_ARCH=i386;    MES_ARCH=x86;     HAVE_LL=0; TCC_TARGET_DEFINE=I386;    CPP_ARCH=x86     ;;
     39     RISCV64|riscv64)        BOOT_ARCH=riscv64; MES_ARCH=riscv64; HAVE_LL=1; TCC_TARGET_DEFINE=RISCV64; CPP_ARCH=riscv64 ;;
     40     ARM64|arm64|AARCH64|aarch64)
     41                             BOOT_ARCH=aarch64; MES_ARCH=riscv64; HAVE_LL=1; TCC_TARGET_DEFINE=ARM64;   CPP_ARCH=aarch64 ;;
     42     *) echo "unknown ARCH: $ARCH" >&2; exit 2 ;;
     43 esac
     44 
     45 # --- paths ------------------------------------------------------------
     46 # Everything used by this script is in-tree under $ROOT. No reach into
     47 # sibling repos.
     48 #
     49 #   vendor/tcc/0.9.26.tar.gz      — pristine upstream tarball
     50 #   vendor/tcc/patches-lb/  — live-bootstrap's tcc
     51 #                                            simple-patches, copied in
     52 #                                            for in-tree builds
     53 #   vendor/tcc/patches/     — our own tcc patches
     54 #   bootprep/headers/       — our hand-rolled libc headers used at
     55 #                             flatten time by host `cc -E -nostdinc`.
     56 #                             stdarg.h here routes va_* through
     57 #                             __builtin_va_*; the rest are minimal
     58 #                             stubs so tcc.c parses through the host
     59 #                             preprocessor.
     60 ROOT=$(cd "$(dirname "$0")/.." && pwd)
     61 WORK=$ROOT/build/$BOOT_ARCH/vendor/tcc
     62 DISTFILES=$ROOT/vendor/tcc
     63 LB_PATCHES=$ROOT/vendor/tcc/patches-lb
     64 OUR_PATCHES=$ROOT/vendor/tcc/patches
     65 SYS_INCLUDE=$ROOT/bootprep/headers
     66 
     67 TCC_TAR=$DISTFILES/0.9.26.tar.gz
     68 TCC_PKG=tcc-0.9.26-1147-gee75a10c
     69 
     70 [ -r "$TCC_TAR"           ] || { echo "missing $TCC_TAR" >&2; exit 1; }
     71 [ -d "$LB_PATCHES"        ] || { echo "missing $LB_PATCHES" >&2; exit 1; }
     72 [ -d "$OUR_PATCHES"       ] || { echo "missing $OUR_PATCHES" >&2; exit 1; }
     73 [ -d "$SYS_INCLUDE"       ] || { echo "missing $SYS_INCLUDE" >&2; exit 1; }
     74 
     75 # --- (1) unpack -------------------------------------------------------
     76 mkdir -p "$WORK"
     77 rm -rf "$WORK/$TCC_PKG"
     78 tar -xzf "$TCC_TAR" -C "$WORK"
     79 
     80 SRC=$WORK/$TCC_PKG
     81 
     82 # --- (2) simple-patches ----------------------------------------------
     83 # Both patches edit tcctools.c. The pair (remove-fileopen, addback-fileopen)
     84 # moves a fopen() block earlier in the function. We implement live-bootstrap's
     85 # simple-patch as an awk literal-block replacer; no binary dep.
     86 apply_simple_patch() {
     87     target=$1; before=$2; after=$3
     88     [ -r "$target" ] || { echo "patch target missing: $target" >&2; exit 1; }
     89     [ -r "$before" ] || { echo "patch before missing: $before" >&2; exit 1; }
     90     [ -r "$after"  ] || { echo "patch after missing: $after"   >&2; exit 1; }
     91     awk -v BFILE="$before" -v AFILE="$after" '
     92         BEGIN {
     93             while ((getline line < BFILE) > 0) bef = bef line "\n";
     94             close(BFILE);
     95             while ((getline line < AFILE) > 0) aft = aft line "\n";
     96             close(AFILE);
     97         }
     98         { src = src $0 "\n" }
     99         END {
    100             i = index(src, bef);
    101             if (i == 0) { print "patch did not match" > "/dev/stderr"; exit 1 }
    102             printf "%s%s%s",
    103                 substr(src, 1, i - 1),
    104                 aft,
    105                 substr(src, i + length(bef));
    106         }
    107     ' "$target" > "$target.new"
    108     mv "$target.new" "$target"
    109 }
    110 
    111 apply_simple_patch \
    112     "$SRC/tcctools.c" \
    113     "$LB_PATCHES/remove-fileopen.before" \
    114     "$LB_PATCHES/remove-fileopen.after"
    115 
    116 apply_simple_patch \
    117     "$SRC/tcctools.c" \
    118     "$LB_PATCHES/addback-fileopen.before" \
    119     "$LB_PATCHES/addback-fileopen.after"
    120 
    121 # Bootstrap stub patches — eliminate libc symbols not provided by mes-mini-libc
    122 # (mprotect, getcwd, getenv, gettimeofday, ldexp, time, localtime, sscanf) by
    123 # gating call sites on the existing BOOTSTRAP CPP define.
    124 apply_our_patch() {
    125     name=$1; target=$2
    126     apply_simple_patch \
    127         "$target" \
    128         "$OUR_PATCHES/$name.before" \
    129         "$OUR_PATCHES/$name.after"
    130 }
    131 
    132 apply_our_patch tcc-is-native-stub   "$SRC/tcc.h"
    133 apply_our_patch tccrun-include       "$SRC/libtcc.c"
    134 apply_our_patch tinyc-define         "$SRC/libtcc.c"
    135 apply_our_patch longjmp-stub         "$SRC/libtcc.c"
    136 apply_our_patch set-environment-stub "$SRC/tcc.c"
    137 apply_our_patch getclock-ms-stub     "$SRC/tcc.c"
    138 apply_our_patch getcwd-stub          "$SRC/tccgen.c"
    139 apply_our_patch strip-file-prefix    "$SRC/tccgen.c"
    140 apply_our_patch ldexp-stub           "$SRC/tccpp.c"
    141 apply_our_patch date-time-stub       "$SRC/tccpp.c"
    142 apply_our_patch lex-char-unsigned    "$SRC/tccpp.c"
    143 
    144 # LP64 constants: upstream's parser treats one `L` suffix as 64-bit
    145 # only on x86_64. ARM64/RISCV64 are LP64 too; without this, `-4096UL`
    146 # is zero-extended from 32 bits and musl's __syscall_ret rejects valid
    147 # high mmap addresses as errors.
    148 apply_our_patch lp64-long-constant   "$SRC/tccpp.c"
    149 apply_our_patch elfinterp-stub       "$SRC/tccelf.c"
    150 
    151 # Auto-define `__bss_start` alongside tcc's existing `_end` symbol so a
    152 # freestanding image (kernel.S) can zero its .bss with start/end anchors
    153 # without an ld script.  Mirrors the live-bootstrap convention.
    154 apply_our_patch bss-start-symbol     "$SRC/tccelf.c"
    155 
    156 # x86_64 static-link PLT32 collapse: under BOOTSTRAP we force
    157 # static_link=1 with no .dynamic / no PT_INTERP, so the runtime linker
    158 # never fills the PLT's GOT slots. Upstream tcc 0.9.26 only collapses
    159 # PLT32→PC32 for hidden-visibility or LOCAL symbols, leaving global
    160 # defined symbols going through unfilled PLT entries. The patch widens
    161 # the condition to any symbol defined in this binary (st_shndx !=
    162 # SHN_UNDEF), which matches the aarch64 path's behavior. Harmless on
    163 # other arches: the block is gated `#ifdef TCC_TARGET_X86_64`.
    164 apply_our_patch x86_64-static-plt32  "$SRC/tccelf.c"
    165 
    166 # AT.2: native PT_NOTE for PVH boot. Stock tcc tags `.note.*` sections
    167 # as SHT_PROGBITS and never emits a PT_NOTE phdr, so QEMU's PVH
    168 # `-kernel` path on amd64 (which scans PT_NOTE for the Xen 18 entry)
    169 # rejects the kernel. Three patches: (1) retype implicitly-created
    170 # `.note*` sections to SHT_NOTE; (2) allocate a PT_NOTE phdr covering
    171 # every SHT_NOTE+SHF_ALLOC section; (3) accept SHT_NOTE in the .o
    172 # loader so kernel-asm.o's .note.Xen merges into the link output (else
    173 # the subsequent .rela.note.Xen merge derefs sm_table[].s == NULL).
    174 # The phnum bump in (2) is gated on actual presence so aarch64/riscv64
    175 # (no .note sections) keep their existing phdr count and stay
    176 # byte-identical to pre-patch output.
    177 apply_our_patch note-section-sht-note    "$SRC/tccelf.c"
    178 apply_our_patch pt-note-phdr             "$SRC/tccelf.c"
    179 apply_our_patch load-obj-accept-sht-note "$SRC/tccelf.c"
    180 
    181 # x86_64 va_list runtime: tcc's lib/va_list.c declares `extern void
    182 # abort(void)` and calls it in an unreachable default branch of the
    183 # arg-type switch. Under -nostdlib that abort() symbol is unresolved
    184 # and the link fails. Replace with an inline spin — same effect, no
    185 # libc dependency. Unconditional patch: lib/va_list.c is only
    186 # compiled on amd64, but the .before block is gated by the file's
    187 # `#if defined TCC_TARGET_X86_64` so other arches see the patch
    188 # inert.
    189 apply_our_patch va_list-no-abort     "$SRC/lib/va_list.c"
    190 
    191 # Const-expr short-circuit: gen_opic/gen_opif must respect nocode_wanted
    192 # so 1 || (1/0), 0 && (1/0), 1 ? 2 : 1/0 etc. don't abort with "division
    193 # by zero in constant" in their unevaluated arms (C11 §6.6¶3).
    194 apply_our_patch const-divzero-shortcircuit-int   "$SRC/tccgen.c"
    195 apply_our_patch const-divzero-shortcircuit-float "$SRC/tccgen.c"
    196 
    197 # AArch64 vararg fixes — only relevant when targeting ARM64; harmless
    198 # to apply unconditionally since neither file is read on other arches.
    199 apply_our_patch aarch64-stdarg-array     "$SRC/include/stdarg.h"
    200 apply_our_patch arm64-va-pointer-operand "$SRC/arm64-gen.c"
    201 apply_our_patch arm64-va-arg-pointer     "$SRC/arm64-gen.c"
    202 
    203 # AArch64 codegen: store/load through a literal integer address
    204 # (VT_CONST | VT_LVAL without VT_SYM). Stock arm64-gen.c only handles
    205 # the |VT_SYM case; bare integer addresses fall through to the
    206 # `printf + assert(0)` tail. Hits in musl when tcc folds weak-hidden
    207 # refs in __libc_start_main/mallocng. Patch is gated by the
    208 # surrounding store/load functions which exist only under
    209 # TCC_TARGET_ARM64.
    210 apply_our_patch arm64-store-const-lvalue "$SRC/arm64-gen.c"
    211 apply_our_patch arm64-load-const-lvalue  "$SRC/arm64-gen.c"
    212 
    213 # Stock arm64-gen.c truncates SValue::c.i to uint32_t at the top of
    214 # both load() and store(). Fine for struct-field offsets, fatal for
    215 # pointer-sized constant addresses (e.g., the seed-kernel writing to
    216 # the device alias VA 0x109000000 for the PL011 UART). Drop the
    217 # truncation; signed 9-bit ldur/stur offsets fit regardless.
    218 apply_our_patch arm64-svcul-no-truncate         "$SRC/arm64-gen.c"
    219 apply_our_patch arm64-svcul-no-truncate-store   "$SRC/arm64-gen.c"
    220 
    221 # AArch64 assembler — phase 1. Drops in arm64-asm.c + arm64-tok.h and
    222 # wires their includes into tcc.h, libtcc.c, and tcctok.h. Patches are
    223 # gated by TCC_TARGET_ARM64 in the surrounding source so they no-op on
    224 # other arches even when applied.
    225 cp "$OUR_PATCHES/files/arm64-asm.c" "$SRC/arm64-asm.c"
    226 cp "$OUR_PATCHES/files/arm64-tok.h" "$SRC/arm64-tok.h"
    227 apply_our_patch arm64-asm-include-tcc-h     "$SRC/tcc.h"
    228 apply_our_patch arm64-asm-include-libtcc-c  "$SRC/libtcc.c"
    229 apply_our_patch arm64-tok-include-tcctok-h  "$SRC/tcctok.h"
    230 
    231 # arm64-asm.c emits gen_expr64 for `.quad sym - sym2`; declare it for
    232 # arm64 too (was x86_64-only).
    233 apply_our_patch tcc-h-gen-expr64-arm64      "$SRC/tcc.h"
    234 
    235 # Route .quad through asm_data on arm64 so symbol-difference expressions
    236 # emit a relocation (R_AARCH64_PREL64) instead of failing to parse.
    237 apply_our_patch tccasm-arm64-quad           "$SRC/tccasm.c"
    238 apply_our_patch tccasm-arm64-quad-asm-data  "$SRC/tccasm.c"
    239 
    240 # Enable the relocations the assembler now emits: PREL64 (data symbol
    241 # difference), CONDBR19 + TSTBR14 (forward conditional branch / tbz).
    242 apply_our_patch arm64-link-asm-relocs       "$SRC/arm64-link.c"
    243 apply_our_patch arm64-link-prel64-condbr    "$SRC/arm64-link.c"
    244 
    245 # tcc's lexer in ASM_FILE mode swallows mid-line '#' as a line comment,
    246 # which kills the ARM/AArch64 '#imm' immediate prefix. Restrict the
    247 # '#'-as-line-comment behavior to start-of-line so '#' tokenizes as
    248 # itself in operand position. gas's own '#' line-comment rule is BOL
    249 # only, so this matches stock gas semantics. Other arches' assemblers
    250 # don't use '#' as an immediate prefix, so they're unaffected.
    251 apply_our_patch asm-hash-bol-only           "$SRC/tccpp.c"
    252 
    253 # Side effect of the patch above: alloca86_64-bt.S has two tab-prefixed
    254 # tail comments (`mov %rax,%rsi	# size, a second parm…`) that the
    255 # x86_64 assembler now rejects with "end of line expected". They are
    256 # inert documentation; strip them. The file is only compiled when
    257 # building the amd64 libtcc1.a (LIBTCC1_ASM_SRCS in boot4.sh), so this
    258 # rewrite is a no-op on aarch64/riscv64 builds.
    259 awk '{ sub(/\t#.*$/, ""); print }' "$SRC/lib/alloca86_64-bt.S" \
    260     > "$SRC/lib/alloca86_64-bt.S.tmp"
    261 mv "$SRC/lib/alloca86_64-bt.S.tmp" "$SRC/lib/alloca86_64-bt.S"
    262 
    263 # riscv64 int->llong cast: stock tcc 0.9.26 leaves unsigned int values
    264 # in their native register width, but RV64 32-bit ops sign-extend bits
    265 # 63:32, so widening an `unsigned int` to `unsigned long` reads garbage
    266 # upper bits.  Make gen_cvt_sxtw do the right thing for both signs, and
    267 # always invoke it on riscv64.  Hits be64() in the seed kernel's DTB
    268 # parser; without the fix the kernel sees mem_start sign-extended to
    269 # 0xffffffff80000000 and the boot panics during MMU bring-up.  Patch is
    270 # gated by the call-site / function name so it no-ops on other arches.
    271 apply_our_patch riscv64-cvt-int-zext     "$SRC/tccgen.c"
    272 apply_our_patch riscv64-gen-cvt-sxtw     "$SRC/riscv64-gen.c"
    273 apply_our_patch riscv64-load-ptr-zext    "$SRC/riscv64-gen.c"
    274 
    275 # riscv64 ELF default load address — stock tcc lands binaries at
    276 # 0x10000, below the seed kernel's USER_VA_LO=0x200000. Move the
    277 # default to 0x600000 so tcc-emitted ELFs slot into the user pool
    278 # without per-link `-Wl,-Ttext=` overrides. Patch is gated by the
    279 # stock literal in the before-block, so it no-ops elsewhere.
    280 apply_our_patch riscv64-elf-start-addr   "$SRC/riscv64-link.c"
    281 
    282 # riscv64 stdarg.h order fix — the upstream `#elif __riscv` branch
    283 # uses `__builtin_va_list` before it's typedef'd. Stock tcc treats
    284 # `__builtin_va_list` as a built-in keyword and forgives the forward
    285 # reference; tcc-boot2's frontend does not. Swap the two typedefs so
    286 # the base `char *__builtin_va_list` is in scope before va_list claims
    287 # it. Affects only the riscv branch — the patch is gated by the
    288 # `#elif __riscv` line in the before-block, so it's a no-op when that
    289 # branch is absent (other tcc trees).
    290 apply_our_patch riscv-stdarg-fix         "$SRC/include/stdarg.h"
    291 
    292 # gcc/clang __builtin_va_* spelling bridge — append aliases at the end
    293 # of tcc's <stdarg.h> so the same flat.c (which uses the gcc spelling
    294 # because that's what cc.scm recognizes) also compiles back through
    295 # tcc on amd64/aarch64. Gated `#ifndef __riscv` inside .after — the
    296 # __riscv branch already maps these names natively. See the .after
    297 # block for the full rationale.
    298 apply_our_patch stdarg-builtin-aliases   "$SRC/include/stdarg.h"
    299 
    300 # Empty config.h shims — pass1.kaem creates these via `catm <out>` (line 27-28).
    301 : > "$SRC/config.h"
    302 mkdir -p "$WORK/mes-overlay/mes"
    303 : > "$WORK/mes-overlay/mes/config.h"
    304 
    305 # --- (3) flatten via host preprocessor --------------------------------
    306 HOST_CC=${HOST_CC:-cc}
    307 FLAT=$WORK/tcc.flat.c
    308 
    309 "$HOST_CC" -E -P \
    310     -nostdinc \
    311     -I "$SRC" \
    312     -I "$WORK/mes-overlay" \
    313     -I "$SYS_INCLUDE" \
    314     -D __linux__=1 \
    315     -D __${CPP_ARCH}__=1 \
    316     -D BOOTSTRAP=1 \
    317     -D HAVE_LONG_LONG=$HAVE_LL \
    318     -D inline= \
    319     -D "CONFIG_TCCDIR=\"/lib/tcc\"" \
    320     -D "CONFIG_SYSROOT=\"/\"" \
    321     -D "CONFIG_TCC_CRTPREFIX=\"/lib\"" \
    322     -D "CONFIG_TCC_ELFINTERP=\"/mes/loader\"" \
    323     -D "CONFIG_TCC_SYSINCLUDEPATHS=\"/include/mes\"" \
    324     -D "TCC_LIBGCC=\"/lib/libc.a\"" \
    325     -D CONFIG_TCC_LIBTCC1_MES=0 \
    326     -D CONFIG_TCCBOOT=1 \
    327     -D CONFIG_TCC_STATIC=1 \
    328     -D CONFIG_USE_LIBGCC=1 \
    329     -D "TCC_VERSION=\"0.9.26\"" \
    330     -D ONE_SOURCE=1 \
    331     -D TCC_TARGET_${TCC_TARGET_DEFINE}=1 \
    332     "$SRC/tcc.c" > "$FLAT.body"
    333 
    334 # Publish the post-patch tcc <stdarg.h> as a per-arch bridge file
    335 # alongside tcc.flat.c. libc-flatten.sh prepends the same bridge to
    336 # libc.flat.c, so the boot3/boot4 container compiles no longer need
    337 # `-I /work/in/tcc-include -include /work/in/tcc-include/stdarg.h`.
    338 # The patched stdarg.h is byte-identical across X86_64 / ARM64 / RISCV64
    339 # (per-arch logic lives inside its #ifdefs); we still write a per-arch
    340 # copy so every artifact under build/<arch>/ comes from a single
    341 # `boot.sh <arch>` invocation, with nothing shared across arches.
    342 BRIDGE=$WORK/stdarg-bridge.h
    343 cp "$SRC/include/stdarg.h" "$BRIDGE"
    344 
    345 # Prepend the bridge into tcc.flat.c, guarded by !CCSCM so cc.scm
    346 # (which has __builtin_va_list / __builtin_va_* as native frontend
    347 # keywords and predefines CCSCM) skips the whole block. Under tcc,
    348 # the per-arch #ifdef branches inside the bridge resolve and define
    349 # the va_list typedef + __builtin_va_* → tcc native __va_* macros
    350 # that flat.c needs.
    351 {
    352     echo '#ifndef CCSCM'
    353     cat "$BRIDGE"
    354     echo '#endif'
    355     cat "$FLAT.body"
    356 } > "$FLAT"
    357 rm -f "$FLAT.body"
    358 
    359 BYTES=$(wc -c < "$FLAT")
    360 echo "produced $FLAT  ($BYTES bytes)"
    361 
    362 # --- (4) optional verify ---------------------------------------------
    363 if [ "$VERIFY" -eq 1 ]; then
    364     HOST_OBJ=$WORK/tcc.flat.o
    365     if "$HOST_CC" -c -w -o "$HOST_OBJ" "$FLAT" 2>"$WORK/host-cc.log"; then
    366         echo "host cc: tcc.flat.c compiles cleanly to $HOST_OBJ"
    367     else
    368         echo "host cc: tcc.flat.c FAILED to compile; see $WORK/host-cc.log" >&2
    369         exit 1
    370     fi
    371 fi