stage1-flatten.sh (17163B)
#!/bin/sh
## bootprep/stage1-flatten.sh — flatten upstream tcc-0.9.26 into a single
## C bytestream (tcc.flat.c) using only the host preprocessor.
##
## This is the first of three stages building tcc-0.9.26 without
## M2-Planet, MesCC, or Mes Scheme. See docs/TCC.md.
##
## Steps:
##   1. unpack tcc-0.9.26-1147-gee75a10c.tar.gz
##   2. apply live-bootstrap simple-patches (tcctools.c file-open reorder),
##      then our own patches from vendor/tcc/patches
##   3. host cc -E -nostdinc with the in-tree bootprep/headers + tcc-mes
##      defines, emitting build/<arch>/vendor/tcc/tcc.flat.c
##   4. (--verify) compile tcc.flat.c with host cc to confirm well-formedness
##
## Stage 1 deliberately stays on the host: it is just text manipulation
## (preprocess + concat) and the resulting tcc.flat.c is a portable
## artifact downstream stages consume. No container needed.
##
## Usage:
##   bootprep/stage1-flatten.sh [--arch <X86_64|I386|RISCV64|ARM64|AARCH64>] [--verify]

set -eu

# --- arg parse --------------------------------------------------------
ARCH=X86_64
VERIFY=0
while [ $# -gt 0 ]; do
  case "$1" in
    --arch)
      # Without this check a trailing `--arch` trips `set -u` on $2 and
      # the shell aborts with a bare "parameter not set"; make it a
      # proper usage error instead.
      [ $# -ge 2 ] || { echo "--arch requires a value" >&2; exit 2; }
      ARCH=$2
      shift 2
      ;;
    --verify)
      VERIFY=1
      shift
      ;;
    -h|--help)
      # The help text is the `##` header of this very file.
      awk '/^##/ { sub(/^## ?/, ""); print }' "$0"
      exit 0
      ;;
    *)
      echo "unknown arg: $1" >&2
      exit 2
      ;;
  esac
done

# Map the user-facing arch name onto the per-arch knobs used below:
#   BOOT_ARCH          build/<BOOT_ARCH>/ output directory
#   HAVE_LL            value passed as -D HAVE_LONG_LONG
#   TCC_TARGET_DEFINE  suffix of the -D TCC_TARGET_<X> define
#   CPP_ARCH           middle of the -D __<CPP_ARCH>__ define
#   MES_ARCH           mirrors CPP_ARCH in every arm.
# NOTE(review): MES_ARCH is not read anywhere in this script. The ARM64
# arm used to set it to riscv64, which looks like a copy-paste from the
# RISCV64 arm; it now follows the same pattern as the other arms.
# Confirm no external consumer relied on the old value.
case "$ARCH" in
  X86_64|x86_64|amd64)
    BOOT_ARCH=amd64; MES_ARCH=x86_64; HAVE_LL=1; TCC_TARGET_DEFINE=X86_64; CPP_ARCH=x86_64 ;;
  I386|i386)
    BOOT_ARCH=i386; MES_ARCH=x86; HAVE_LL=0; TCC_TARGET_DEFINE=I386; CPP_ARCH=x86 ;;
  RISCV64|riscv64)
    BOOT_ARCH=riscv64; MES_ARCH=riscv64; HAVE_LL=1; TCC_TARGET_DEFINE=RISCV64; CPP_ARCH=riscv64 ;;
  ARM64|arm64|AARCH64|aarch64)
    BOOT_ARCH=aarch64; MES_ARCH=aarch64; HAVE_LL=1; TCC_TARGET_DEFINE=ARM64; CPP_ARCH=aarch64 ;;
  *)
    echo "unknown ARCH: $ARCH" >&2
    exit 2
    ;;
esac

# --- paths ------------------------------------------------------------
# Everything used by this script is in-tree under $ROOT. No reach into
# sibling repos.
#
#   vendor/tcc/0.9.26.tar.gz  — pristine upstream tarball
#   vendor/tcc/patches-lb/    — live-bootstrap's tcc simple-patches,
#                               copied in for in-tree builds
#   vendor/tcc/patches/       — our own tcc patches
#   bootprep/headers/         — our hand-rolled libc headers used at
#                               flatten time by host `cc -E -nostdinc`.
#                               stdarg.h here routes va_* through
#                               __builtin_va_*; the rest are minimal
#                               stubs so tcc.c parses through the host
#                               preprocessor.
ROOT=$(cd "$(dirname "$0")/.." && pwd)
WORK=$ROOT/build/$BOOT_ARCH/vendor/tcc
DISTFILES=$ROOT/vendor/tcc
LB_PATCHES=$ROOT/vendor/tcc/patches-lb
OUR_PATCHES=$ROOT/vendor/tcc/patches
SYS_INCLUDE=$ROOT/bootprep/headers

TCC_TAR=$DISTFILES/0.9.26.tar.gz
TCC_PKG=tcc-0.9.26-1147-gee75a10c

# Fail early, naming the missing input, rather than midway through patching.
[ -r "$TCC_TAR" ]     || { echo "missing $TCC_TAR" >&2; exit 1; }
[ -d "$LB_PATCHES" ]  || { echo "missing $LB_PATCHES" >&2; exit 1; }
[ -d "$OUR_PATCHES" ] || { echo "missing $OUR_PATCHES" >&2; exit 1; }
[ -d "$SYS_INCLUDE" ] || { echo "missing $SYS_INCLUDE" >&2; exit 1; }

# --- (1) unpack -------------------------------------------------------
# Always start from a pristine tree: the patches below are literal-block
# replacements and would not match a second time on an already-patched tree.
mkdir -p "$WORK"
rm -rf "$WORK/$TCC_PKG"
tar -xzf "$TCC_TAR" -C "$WORK"

SRC=$WORK/$TCC_PKG
# The tarball's top-level directory name is assumed to be $TCC_PKG;
# say so explicitly if that ever stops being true.
[ -d "$SRC" ] || { echo "unpacking $TCC_TAR did not produce $SRC" >&2; exit 1; }

# --- (2) simple-patches ----------------------------------------------
# Both patches edit tcctools.c. The pair (remove-fileopen, addback-fileopen)
# moves a fopen() block earlier in the function. We implement live-bootstrap's
# simple-patch as an awk literal-block replacer; no binary dep.
# apply_simple_patch TARGET BEFORE AFTER
#
# Rewrite TARGET in place, replacing the first literal occurrence of the
# contents of file BEFORE with the contents of file AFTER. Matching is
# plain substring search (awk index()), not regex. Every line read from
# the three files is re-terminated with "\n", so the rewritten TARGET
# always ends in a newline.
#
# Exits the script with status 1 if any of the three files is unreadable,
# if BEFORE is empty, or if BEFORE does not occur in TARGET. On failure
# TARGET is left untouched and the temporary TARGET.new is removed.
#
# POSIX sh has no `local`; target/before/after are plain globals.
apply_simple_patch() {
  target=$1; before=$2; after=$3
  [ -r "$target" ] || { echo "patch target missing: $target" >&2; exit 1; }
  [ -r "$before" ] || { echo "patch before missing: $before" >&2; exit 1; }
  [ -r "$after" ]  || { echo "patch after missing: $after" >&2; exit 1; }
  if ! awk -v BFILE="$before" -v AFILE="$after" '
    BEGIN {
      # Slurp the before/after blocks up front.
      while ((getline line < BFILE) > 0) bef = bef line "\n";
      close(BFILE);
      while ((getline line < AFILE) > 0) aft = aft line "\n";
      close(AFILE);
    }
    # Slurp the whole target so the block can span multiple lines.
    { src = src $0 "\n" }
    END {
      # Do not rely on how index(s, "") behaves; an empty before-block
      # can never be a meaningful patch.
      if (bef == "") { print "patch before-block is empty" > "/dev/stderr"; exit 1 }
      i = index(src, bef);
      if (i == 0) { print "patch did not match" > "/dev/stderr"; exit 1 }
      printf "%s%s%s",
        substr(src, 1, i - 1),
        aft,
        substr(src, i + length(bef));
    }
  ' "$target" > "$target.new"; then
    # Say which patch and which file: dozens of patches go through here
    # and the awk message alone does not identify the culprit.
    echo "apply_simple_patch: failed applying $before to $target" >&2
    rm -f "$target.new"
    exit 1
  fi
  mv "$target.new" "$target"
}

apply_simple_patch \
  "$SRC/tcctools.c" \
  "$LB_PATCHES/remove-fileopen.before" \
  "$LB_PATCHES/remove-fileopen.after"

apply_simple_patch \
  "$SRC/tcctools.c" \
  "$LB_PATCHES/addback-fileopen.before" \
  "$LB_PATCHES/addback-fileopen.after"

# Bootstrap stub patches — eliminate libc symbols not provided by mes-mini-libc
# (mprotect, getcwd, getenv, gettimeofday, ldexp, time, localtime, sscanf) by
# gating call sites on the existing BOOTSTRAP CPP define.
# apply_our_patch NAME TARGET
#
# Apply the in-tree patch pair $OUR_PATCHES/NAME.before / NAME.after to
# TARGET by delegating to apply_simple_patch.
apply_our_patch() {
  apply_simple_patch "$2" "$OUR_PATCHES/$1.before" "$OUR_PATCHES/$1.after"
}

# Patches are applied strictly in the order listed; consecutive patches
# against the same file are grouped into one loop.
apply_our_patch tcc-is-native-stub "$SRC/tcc.h"

for patch in tccrun-include tinyc-define longjmp-stub; do
  apply_our_patch "$patch" "$SRC/libtcc.c"
done

for patch in set-environment-stub getclock-ms-stub; do
  apply_our_patch "$patch" "$SRC/tcc.c"
done

for patch in getcwd-stub strip-file-prefix; do
  apply_our_patch "$patch" "$SRC/tccgen.c"
done

for patch in ldexp-stub date-time-stub lex-char-unsigned; do
  apply_our_patch "$patch" "$SRC/tccpp.c"
done

# LP64 constants. Upstream's parser only treats a single `L` suffix as
# 64-bit on x86_64, yet ARM64 and RISCV64 are LP64 as well. Without the
# patch `-4096UL` gets zero-extended from 32 bits, and musl's
# __syscall_ret then rejects valid high mmap addresses as errors.
apply_our_patch lp64-long-constant "$SRC/tccpp.c"
apply_our_patch elfinterp-stub "$SRC/tccelf.c"

# Define `__bss_start` automatically next to tcc's existing `_end`
# symbol. A freestanding image (kernel.S) can then clear its .bss using
# start/end anchors, no ld script required. Same convention as
# live-bootstrap.
apply_our_patch bss-start-symbol "$SRC/tccelf.c"

# x86_64 static-link PLT32 collapse. Under BOOTSTRAP we force
# static_link=1 and emit neither .dynamic nor PT_INTERP, so no runtime
# linker ever fills the PLT's GOT slots. Upstream tcc 0.9.26 collapses
# PLT32→PC32 only for hidden-visibility or LOCAL symbols, which leaves
# global defined symbols routed through unfilled PLT entries. The patch
# widens the condition to every symbol defined in this binary
# (st_shndx != SHN_UNDEF), matching what the aarch64 path does. It is
# harmless on other arches because the block sits under
# `#ifdef TCC_TARGET_X86_64`.
apply_our_patch x86_64-static-plt32 "$SRC/tccelf.c"

# AT.2 — native PT_NOTE for PVH boot.
# Stock tcc marks `.note.*` sections SHT_PROGBITS and never emits a
# PT_NOTE phdr. QEMU's PVH `-kernel` path on amd64 scans PT_NOTE for the
# Xen 18 entry and therefore rejects such a kernel. Three patches fix it:
#   (1) retype implicitly-created `.note*` sections to SHT_NOTE;
#   (2) allocate a PT_NOTE phdr covering every SHT_NOTE+SHF_ALLOC section;
#   (3) accept SHT_NOTE in the .o loader so kernel-asm.o's .note.Xen is
#       merged into the link output (otherwise the later .rela.note.Xen
#       merge dereferences sm_table[].s == NULL).
# The phnum bump in (2) only happens when such sections are actually
# present, so aarch64/riscv64 (no .note sections) keep their phdr count
# and remain byte-identical to pre-patch output.
for patch in note-section-sht-note pt-note-phdr load-obj-accept-sht-note; do
  apply_our_patch "$patch" "$SRC/tccelf.c"
done

# x86_64 va_list runtime. tcc's lib/va_list.c declares
# `extern void abort(void)` and calls it from an unreachable default
# branch of the arg-type switch. With -nostdlib the abort() symbol stays
# unresolved and the link fails, so the call is replaced by an inline
# spin: same effect, no libc dependency. The patch is applied
# unconditionally: lib/va_list.c is only compiled on amd64, and the
# .before block lives under the file's `#if defined TCC_TARGET_X86_64`,
# so on other arches the patch is inert.
apply_our_patch va_list-no-abort "$SRC/lib/va_list.c"

# Const-expr short-circuit. gen_opic/gen_opif have to honour
# nocode_wanted so that 1 || (1/0), 0 && (1/0), 1 ? 2 : 1/0 and friends
# do not abort with "division by zero in constant" inside their
# unevaluated arms (C11 §6.6¶3).
for patch in const-divzero-shortcircuit-int const-divzero-shortcircuit-float; do
  apply_our_patch "$patch" "$SRC/tccgen.c"
done

# AArch64 vararg fixes. They only matter for an ARM64 target, but
# applying them everywhere is harmless because neither file is read on
# other arches.
apply_our_patch aarch64-stdarg-array "$SRC/include/stdarg.h"
for patch in arm64-va-pointer-operand arm64-va-arg-pointer; do
  apply_our_patch "$patch" "$SRC/arm64-gen.c"
done

# AArch64 codegen: store/load through a literal integer address, i.e.
# VT_CONST | VT_LVAL without VT_SYM. Stock arm64-gen.c handles only the
# |VT_SYM case, so a bare integer address drops into the
# `printf + assert(0)` tail. This is hit in musl when tcc folds
# weak-hidden refs in __libc_start_main/mallocng. The patch is gated by
# the surrounding store/load functions, which exist only under
# TCC_TARGET_ARM64.
for patch in arm64-store-const-lvalue arm64-load-const-lvalue; do
  apply_our_patch "$patch" "$SRC/arm64-gen.c"
done

# Stock arm64-gen.c truncates SValue::c.i to uint32_t at the top of both
# load() and store(). That is fine for struct-field offsets but fatal
# for pointer-sized constant addresses (e.g. the seed-kernel writing to
# the device alias VA 0x109000000 for the PL011 UART). The truncation is
# dropped; signed 9-bit ldur/stur offsets fit regardless.
for patch in arm64-svcul-no-truncate arm64-svcul-no-truncate-store; do
  apply_our_patch "$patch" "$SRC/arm64-gen.c"
done

# AArch64 assembler — phase 1. Drop arm64-asm.c + arm64-tok.h into the
# tree and wire their includes into tcc.h, libtcc.c and tcctok.h. The
# surrounding source gates the patched spots on TCC_TARGET_ARM64, so
# they are no-ops on other arches even when applied.
for added in arm64-asm.c arm64-tok.h; do
  cp "$OUR_PATCHES/files/$added" "$SRC/$added"
done
apply_our_patch arm64-asm-include-tcc-h    "$SRC/tcc.h"
apply_our_patch arm64-asm-include-libtcc-c "$SRC/libtcc.c"
apply_our_patch arm64-tok-include-tcctok-h "$SRC/tcctok.h"

# arm64-asm.c emits gen_expr64 for `.quad sym - sym2`, so the
# declaration (previously x86_64-only) is extended to arm64.
apply_our_patch tcc-h-gen-expr64-arm64 "$SRC/tcc.h"

# On arm64, send .quad through asm_data so that symbol-difference
# expressions produce a relocation (R_AARCH64_PREL64) rather than
# failing to parse.
for patch in tccasm-arm64-quad tccasm-arm64-quad-asm-data; do
  apply_our_patch "$patch" "$SRC/tccasm.c"
done

# Turn on the relocations the assembler now emits: PREL64 (data symbol
# difference) plus CONDBR19 and TSTBR14 (forward conditional branch /
# tbz).
for patch in arm64-link-asm-relocs arm64-link-prel64-condbr; do
  apply_our_patch "$patch" "$SRC/arm64-link.c"
done

# In ASM_FILE mode tcc's lexer treats a mid-line '#' as the start of a
# line comment, which destroys the ARM/AArch64 '#imm' immediate prefix.
# The patch limits '#'-as-line-comment to start-of-line, so '#' in
# operand position tokenizes as itself. gas's own '#' line-comment rule
# is BOL-only too, so this matches stock gas semantics. Assemblers for
# the other arches do not use '#' as an immediate prefix and are
# unaffected.
apply_our_patch asm-hash-bol-only "$SRC/tccpp.c"

# Fallout from the patch above: alloca86_64-bt.S carries two
# tab-prefixed tail comments (`mov %rax,%rsi # size, a second parm…`)
# which the x86_64 assembler now rejects with "end of line expected".
# They are inert documentation, so they get stripped. The file is only
# compiled when building the amd64 libtcc1.a (LIBTCC1_ASM_SRCS in
# boot4.sh), making this rewrite a no-op for aarch64/riscv64 builds.
# Drop everything from a tab-preceded '#' to end of line, then swap the
# result in over the original file.
bt_asm=$SRC/lib/alloca86_64-bt.S
awk '{ sub(/\t#.*$/, "") } 1' "$bt_asm" > "$bt_asm.tmp"
mv "$bt_asm.tmp" "$bt_asm"

# riscv64 int->llong cast. Stock tcc 0.9.26 keeps unsigned int values at
# their native register width, but RV64 32-bit ops sign-extend into bits
# 63:32, so widening an `unsigned int` to `unsigned long` picks up
# garbage upper bits. The patches make gen_cvt_sxtw correct for both
# signs and invoke it unconditionally on riscv64. The bug bites be64()
# in the seed kernel's DTB parser: unpatched, the kernel sees mem_start
# sign-extended to 0xffffffff80000000 and panics during MMU bring-up.
# The patches key on the call-site / function name and therefore no-op
# on other arches.
apply_our_patch riscv64-cvt-int-zext "$SRC/tccgen.c"
for patch in riscv64-gen-cvt-sxtw riscv64-load-ptr-zext; do
  apply_our_patch "$patch" "$SRC/riscv64-gen.c"
done

# riscv64 ELF default load address. Stock tcc places binaries at
# 0x10000, which is below the seed kernel's USER_VA_LO=0x200000. The
# default moves to 0x600000 so tcc-emitted ELFs land in the user pool
# without a per-link `-Wl,-Ttext=` override. The before-block matches on
# the stock literal, so the patch no-ops elsewhere.
apply_our_patch riscv64-elf-start-addr "$SRC/riscv64-link.c"

# riscv64 stdarg.h ordering fix. The upstream `#elif __riscv` branch
# uses `__builtin_va_list` before it has been typedef'd. Stock tcc
# treats `__builtin_va_list` as a built-in keyword and tolerates the
# forward reference; tcc-boot2's frontend does not. The two typedefs are
# swapped so the base `char *__builtin_va_list` is in scope before
# va_list claims it. Only the riscv branch is touched — the before-block
# is anchored on the `#elif __riscv` line, making the patch a no-op when
# that branch is absent (other tcc trees).
apply_our_patch riscv-stdarg-fix "$SRC/include/stdarg.h"

# gcc/clang __builtin_va_* spelling bridge. Aliases are appended to the
# end of tcc's <stdarg.h> so that the same flat.c — which uses the gcc
# spelling because that is what cc.scm recognizes — also compiles back
# through tcc on amd64/aarch64. The .after block wraps them in
# `#ifndef __riscv`, since the __riscv branch already maps these names
# natively. The full rationale is in the .after block.
apply_our_patch stdarg-builtin-aliases "$SRC/include/stdarg.h"

# Empty config.h shims — pass1.kaem creates these via `catm <out>` (line 27-28).
mkdir -p "$WORK/mes-overlay/mes"
for shim in "$SRC/config.h" "$WORK/mes-overlay/mes/config.h"; do
  : > "$shim"
done

# --- (3) flatten via host preprocessor --------------------------------
HOST_CC=${HOST_CC:-cc}
FLAT=$WORK/tcc.flat.c

# Assemble the preprocessor command line in the positional parameters
# (argument parsing is long finished, so they are free to reuse).
# Preprocess only, no linemarkers, no host system include directories.
set -- -E -P -nostdinc
# Include search order: tcc tree, then the config.h overlay, then our headers.
set -- "$@" -I "$SRC" -I "$WORK/mes-overlay" -I "$SYS_INCLUDE"
# Platform / bootstrap switches.
set -- "$@" \
  -D __linux__=1 \
  -D "__${CPP_ARCH}__=1" \
  -D BOOTSTRAP=1 \
  -D "HAVE_LONG_LONG=$HAVE_LL" \
  -D inline=
# String-valued tcc configuration (the double quotes are part of the value).
set -- "$@" \
  -D 'CONFIG_TCCDIR="/lib/tcc"' \
  -D 'CONFIG_SYSROOT="/"' \
  -D 'CONFIG_TCC_CRTPREFIX="/lib"' \
  -D 'CONFIG_TCC_ELFINTERP="/mes/loader"' \
  -D 'CONFIG_TCC_SYSINCLUDEPATHS="/include/mes"' \
  -D 'TCC_LIBGCC="/lib/libc.a"'
# Remaining feature toggles, version, and target selection.
set -- "$@" \
  -D CONFIG_TCC_LIBTCC1_MES=0 \
  -D CONFIG_TCCBOOT=1 \
  -D CONFIG_TCC_STATIC=1 \
  -D CONFIG_USE_LIBGCC=1 \
  -D 'TCC_VERSION="0.9.26"' \
  -D ONE_SOURCE=1 \
  -D "TCC_TARGET_${TCC_TARGET_DEFINE}=1"

"$HOST_CC" "$@" "$SRC/tcc.c" > "$FLAT.body"

# The post-patch tcc <stdarg.h> is published as a per-arch bridge file
# next to tcc.flat.c. libc-flatten.sh prepends the same bridge to
# libc.flat.c, which is why the boot3/boot4 container compiles no longer
# need `-I /work/in/tcc-include -include /work/in/tcc-include/stdarg.h`.
# The patched stdarg.h is byte-identical across X86_64 / ARM64 / RISCV64
# (per-arch logic lives inside its #ifdefs); we still write a per-arch
# copy so every artifact under build/<arch>/ comes from a single
# `boot.sh <arch>` invocation, with nothing shared across arches.
BRIDGE=$WORK/stdarg-bridge.h
cp "$SRC/include/stdarg.h" "$BRIDGE"

# Prepend the bridge into tcc.flat.c, guarded by !CCSCM so cc.scm
# (which has __builtin_va_list / __builtin_va_* as native frontend
# keywords and predefines CCSCM) skips the whole block. Under tcc,
# the per-arch #ifdef branches inside the bridge resolve and define
# the va_list typedef + __builtin_va_* → tcc native __va_* macros
# that flat.c needs.
{
  echo '#ifndef CCSCM'
  cat "$BRIDGE"
  echo '#endif'
  cat "$FLAT.body"
} > "$FLAT"
# The preprocessor output was only an intermediate; keep the tree tidy.
rm -f "$FLAT.body"

# BSD/macOS `wc -c` pads the count with leading blanks; run it through
# arithmetic expansion so the message reads the same on every host.
BYTES=$(wc -c < "$FLAT")
BYTES=$((BYTES + 0))
echo "produced $FLAT ($BYTES bytes)"

# --- (4) optional verify ---------------------------------------------
# With --verify, compile the flattened file with the host compiler
# (object only, warnings silenced). Compiler diagnostics go to
# host-cc.log; the script exits 1 if the compile fails.
if [ "$VERIFY" -eq 1 ]; then
  HOST_OBJ=$WORK/tcc.flat.o
  if "$HOST_CC" -c -w -o "$HOST_OBJ" "$FLAT" 2>"$WORK/host-cc.log"; then
    echo "host cc: tcc.flat.c compiles cleanly to $HOST_OBJ"
  else
    echo "host cc: tcc.flat.c FAILED to compile; see $WORK/host-cc.log" >&2
    exit 1
  fi
fi