stage1-flatten.sh (17433B)
#!/bin/sh
## scripts/stage1-flatten.sh — flatten upstream tcc-0.9.26 into a single
## C bytestream (tcc.flat.c) using only the host preprocessor.
##
## This is the first of three stages building tcc-0.9.26 without
## M2-Planet, MesCC, or Mes Scheme. See docs/TCC.md.
##
## Stages:
##   1. unpack tcc-0.9.26-1147-gee75a10c.tar.gz
##   2. apply live-bootstrap simple-patches (tcctools.c file-open reorder)
##   3. host cc -E -nostdinc with mes-bundled headers + tcc-mes defines
##   4. emit build/<arch>/vendor/tcc/tcc.flat.c
##   5. (--verify) compile tcc.flat.c with host cc to confirm well-formedness
##
## Stage 1 deliberately stays on the host: it is just text manipulation
## (preprocess + concat) and the resulting tcc.flat.c is a portable
## artifact downstream stages consume. No container needed.
##
## Usage:
##   scripts/stage1-flatten.sh [--arch <X86_64|I386|RISCV64|ARM64|AARCH64>] [--verify]

# NOTE: every `##` line above is runtime text — `--help` prints them
# verbatim (minus the `## ` prefix) via the awk one-liner below.

set -eu

# --- arg parse --------------------------------------------------------
# Defaults: target x86_64, skip the host-cc verification step.
ARCH=X86_64
VERIFY=0
while [ $# -gt 0 ]; do
  case "$1" in
    --arch)
      # Check arity explicitly: under `set -u` a bare trailing `--arch`
      # would otherwise abort on the unset $2 with a cryptic shell
      # diagnostic instead of a usage error (exit 2, like unknown args).
      [ $# -ge 2 ] || { echo "missing value for --arch" >&2; exit 2; }
      ARCH=$2
      shift 2
      ;;
    --verify)
      VERIFY=1
      shift
      ;;
    -h|--help)
      awk '/^##/ { sub(/^## ?/, ""); print }' "$0"
      exit 0
      ;;
    *)
      echo "unknown arg: $1" >&2
      exit 2
      ;;
  esac
done

# Map the user-facing ARCH spelling onto the per-arch knobs used below:
#   BOOT_ARCH          build/<BOOT_ARCH>/ output directory name
#   MES_ARCH           subdirectory of vendor/mes-libc/include/linux/
#   HAVE_LL            value passed as -D HAVE_LONG_LONG=
#   TCC_TARGET_DEFINE  suffix of the -D TCC_TARGET_<X>=1 define
#   CPP_ARCH           middle of the -D __<CPP_ARCH>__=1 define
case "$ARCH" in
  X86_64|x86_64|amd64)
    BOOT_ARCH=amd64; MES_ARCH=x86_64; HAVE_LL=1; TCC_TARGET_DEFINE=X86_64; CPP_ARCH=x86_64 ;;
  I386|i386)
    BOOT_ARCH=i386; MES_ARCH=x86; HAVE_LL=0; TCC_TARGET_DEFINE=I386; CPP_ARCH=x86 ;;
  RISCV64|riscv64)
    BOOT_ARCH=riscv64; MES_ARCH=riscv64; HAVE_LL=1; TCC_TARGET_DEFINE=RISCV64; CPP_ARCH=riscv64 ;;
  ARM64|arm64|AARCH64|aarch64)
    # NOTE(review): MES_ARCH=riscv64 on an aarch64 target looks
    # deliberate (presumably the vendored mes headers ship no aarch64
    # directory and the riscv64 set is the closest match) — confirm.
    BOOT_ARCH=aarch64; MES_ARCH=riscv64; HAVE_LL=1; TCC_TARGET_DEFINE=ARM64; CPP_ARCH=aarch64 ;;
  *)
    echo "unknown ARCH: $ARCH" >&2
    exit 2
    ;;
esac

# --- paths ------------------------------------------------------------
# Everything used by this script is in-tree under $ROOT.
# No reach into sibling repos.
#
#   vendor/upstream/tcc-0.9.26.tar.gz      — pristine upstream tarball
#   scripts/simple-patches/tcc-0.9.26-lb/  — live-bootstrap's tcc
#                                            simple-patches, copied in
#                                            for in-tree builds
#   scripts/simple-patches/tcc-0.9.26/     — our own tcc patches
#   vendor/mes-libc/include/               — vendored mes-libc headers
#                                            (byte-identical to upstream
#                                            mes/include)
#   vendor/boot2-include/                  — our own header shim, wins
#                                            -I priority for stdarg.h

# Repository root = parent of the directory holding this script.
script_dir=$(dirname "$0")
ROOT=$(cd "$script_dir/.." && pwd)

# Per-arch work area and the in-tree inputs.
WORK=$ROOT/build/$BOOT_ARCH/vendor/tcc
DISTFILES=$ROOT/vendor/upstream
LB_PATCHES=$ROOT/scripts/simple-patches/tcc-0.9.26-lb
OUR_PATCHES=$ROOT/scripts/simple-patches/tcc-0.9.26
MES_INCLUDE=$ROOT/vendor/mes-libc/include
MES_INCLUDE_LINUX=$MES_INCLUDE/linux/$MES_ARCH

# The tarball is named after the release; the directory it unpacks to
# carries the full git-describe suffix.
TCC_TAR=$DISTFILES/tcc-0.9.26.tar.gz
TCC_PKG=tcc-0.9.26-1147-gee75a10c

# Fail fast, naming the first missing prerequisite.
if [ ! -r "$TCC_TAR" ]; then
  echo "missing $TCC_TAR" >&2
  exit 1
fi
for required_dir in "$LB_PATCHES" "$OUR_PATCHES" "$MES_INCLUDE" "$MES_INCLUDE_LINUX"; do
  if [ ! -d "$required_dir" ]; then
    echo "missing $required_dir" >&2
    exit 1
  fi
done

# --- (1) unpack -------------------------------------------------------
# Always start from a pristine tree: discard any previous unpack so the
# patches below never see an already-patched file.
SRC=$WORK/$TCC_PKG
mkdir -p "$WORK"
rm -rf "$SRC"
tar -xzf "$TCC_TAR" -C "$WORK"

# --- (2) simple-patches ----------------------------------------------
# Both patches edit tcctools.c. The pair (remove-fileopen, addback-fileopen)
# moves a fopen() block earlier in the function. We implement live-bootstrap's
# simple-patch as an awk literal-block replacer; no binary dep.
# apply_simple_patch TARGET BEFORE AFTER
#
# Replace the first literal occurrence of the contents of file BEFORE
# inside file TARGET with the contents of file AFTER, rewriting TARGET
# in place (via TARGET.new + mv). Matching is an exact substring match
# on the whole file, so blocks may span multiple lines.
#
# Exits the script with status 1 (after a diagnostic on stderr) when any
# of the three files is unreadable, when BEFORE is empty, or when BEFORE
# does not occur in TARGET. On failure TARGET is left untouched and the
# TARGET.new scratch file is removed.
apply_simple_patch() {
  target=$1; before=$2; after=$3
  [ -r "$target" ] || { echo "patch target missing: $target" >&2; exit 1; }
  [ -r "$before" ] || { echo "patch before missing: $before" >&2; exit 1; }
  [ -r "$after" ]  || { echo "patch after missing: $after" >&2; exit 1; }

  # Test awk's status explicitly instead of leaning on `set -e`: errexit
  # is suppressed whenever the function is called from a tested context
  # (`if`, `||`, `&&`), and the `mv` below would then replace TARGET
  # with an empty file.
  if ! awk -v BFILE="$before" -v AFILE="$after" '
    BEGIN {
      # Slurp both patch halves, normalising each line to end in "\n".
      while ((getline line < BFILE) > 0) bef = bef line "\n";
      close(BFILE);
      while ((getline line < AFILE) > 0) aft = aft line "\n";
      close(AFILE);
    }
    { src = src $0 "\n" }
    END {
      # index() with an empty needle is implementation-dependent;
      # treat an empty before-block as a patch authoring error.
      if (bef == "") { print "patch before-block is empty" > "/dev/stderr"; exit 1 }
      i = index(src, bef);
      if (i == 0) { print "patch did not match" > "/dev/stderr"; exit 1 }
      printf "%s%s%s",
        substr(src, 1, i - 1),
        aft,
        substr(src, i + length(bef));
    }
  ' "$target" > "$target.new"; then
    rm -f "$target.new"
    # Name the patch and the file: dozens of patches are applied, and
    # the bare awk message alone does not say which one failed.
    echo "patch failed: $before -> $target" >&2
    exit 1
  fi
  mv "$target.new" "$target"
}

apply_simple_patch \
  "$SRC/tcctools.c" \
  "$LB_PATCHES/remove-fileopen.before" \
  "$LB_PATCHES/remove-fileopen.after"

apply_simple_patch \
  "$SRC/tcctools.c" \
  "$LB_PATCHES/addback-fileopen.before" \
  "$LB_PATCHES/addback-fileopen.after"

# Bootstrap stub patches — eliminate libc symbols not provided by mes-mini-libc
# (mprotect, getcwd, getenv, gettimeofday, ldexp, time, localtime, sscanf) by
# gating call sites on the existing BOOTSTRAP CPP define.
# apply_our_patch NAME TARGET
#
# Apply one of our own simple-patches: the pair
# $OUR_PATCHES/NAME.before / $OUR_PATCHES/NAME.after is handed to
# apply_simple_patch with TARGET as the file to rewrite.
apply_our_patch() {
  patch_name=$1
  patch_target=$2
  apply_simple_patch "$patch_target" \
    "$OUR_PATCHES/$patch_name.before" \
    "$OUR_PATCHES/$patch_name.after"
}

# Each entry is <patch-name>:<file under $SRC>; applied in list order.
for stub in \
  tcc-is-native-stub:tcc.h \
  tccrun-include:libtcc.c \
  tinyc-define:libtcc.c \
  longjmp-stub:libtcc.c \
  set-environment-stub:tcc.c \
  getclock-ms-stub:tcc.c \
  getcwd-stub:tccgen.c \
  strip-file-prefix:tccgen.c \
  ldexp-stub:tccpp.c \
  date-time-stub:tccpp.c \
  lex-char-unsigned:tccpp.c
do
  apply_our_patch "${stub%%:*}" "$SRC/${stub#*:}"
done

# LP64 constants: upstream's parser treats one `L` suffix as 64-bit
# only on x86_64. ARM64/RISCV64 are LP64 too; without this, `-4096UL`
# is zero-extended from 32 bits and musl's __syscall_ret rejects valid
# high mmap addresses as errors.
apply_our_patch lp64-long-constant "$SRC/tccpp.c"
apply_our_patch elfinterp-stub "$SRC/tccelf.c"

# Auto-define `__bss_start` alongside tcc's existing `_end` symbol so a
# freestanding image (kernel.S) can zero its .bss with start/end anchors
# without an ld script. Mirrors the live-bootstrap convention.
apply_our_patch bss-start-symbol "$SRC/tccelf.c"

# x86_64 static-link PLT32 collapse: under BOOTSTRAP we force
# static_link=1 with no .dynamic / no PT_INTERP, so the runtime linker
# never fills the PLT's GOT slots. Upstream tcc 0.9.26 only collapses
# PLT32→PC32 for hidden-visibility or LOCAL symbols, leaving global
# defined symbols going through unfilled PLT entries. The patch widens
# the condition to any symbol defined in this binary (st_shndx !=
# SHN_UNDEF), which matches the aarch64 path's behavior. Harmless on
# other arches: the block is gated `#ifdef TCC_TARGET_X86_64`.
tccelf_src=$SRC/tccelf.c
apply_our_patch x86_64-static-plt32 "$tccelf_src"

# AT.2: native PT_NOTE for PVH boot. Stock tcc tags `.note.*` sections
# as SHT_PROGBITS and never emits a PT_NOTE phdr, so QEMU's PVH
# `-kernel` path on amd64 (which scans PT_NOTE for the Xen 18 entry)
# rejects the kernel. Three patches, applied in this order:
#   (1) retype implicitly-created `.note*` sections to SHT_NOTE;
#   (2) allocate a PT_NOTE phdr covering every SHT_NOTE+SHF_ALLOC
#       section;
#   (3) accept SHT_NOTE in the .o loader so kernel-asm.o's .note.Xen
#       merges into the link output (else the subsequent
#       .rela.note.Xen merge derefs sm_table[].s == NULL).
# The phnum bump in (2) is gated on actual presence so aarch64/riscv64
# (no .note sections) keep their existing phdr count and stay
# byte-identical to pre-patch output.
for note_patch in note-section-sht-note pt-note-phdr load-obj-accept-sht-note; do
  apply_our_patch "$note_patch" "$tccelf_src"
done

# x86_64 va_list runtime: tcc's lib/va_list.c declares `extern void
# abort(void)` and calls it in an unreachable default branch of the
# arg-type switch. Under -nostdlib that abort() symbol is unresolved
# and the link fails. Replace with an inline spin — same effect, no
# libc dependency. Unconditional patch: lib/va_list.c is only
# compiled on amd64, but the .before block is gated by the file's
# `#if defined TCC_TARGET_X86_64` so other arches see the patch
# inert.
apply_our_patch va_list-no-abort "$SRC/lib/va_list.c"

# Const-expr short-circuit: gen_opic/gen_opif must respect nocode_wanted
# so 1 || (1/0), 0 && (1/0), 1 ? 2 : 1/0 etc. don't abort with "division
# by zero in constant" in their unevaluated arms (C11 §6.6¶3).
for divzero_patch in const-divzero-shortcircuit-int const-divzero-shortcircuit-float; do
  apply_our_patch "$divzero_patch" "$SRC/tccgen.c"
done

arm64_gen=$SRC/arm64-gen.c

# AArch64 vararg fixes — only relevant when targeting ARM64; harmless
# to apply unconditionally since neither file is read on other arches.
# NOTE(review): include/stdarg.h is later copied out as the stdarg
# bridge for every arch, so "not read on other arches" may only hold
# for arm64-gen.c — confirm the stdarg.h hunk is #ifdef-gated.
apply_our_patch aarch64-stdarg-array "$SRC/include/stdarg.h"
apply_our_patch arm64-va-pointer-operand "$arm64_gen"
apply_our_patch arm64-va-arg-pointer "$arm64_gen"

# AArch64 codegen: store/load through a literal integer address
# (VT_CONST | VT_LVAL without VT_SYM). Stock arm64-gen.c only handles
# the |VT_SYM case; bare integer addresses fall through to the
# `printf + assert(0)` tail. Hits in musl when tcc folds weak-hidden
# refs in __libc_start_main/mallocng. Patch is gated by the
# surrounding store/load functions which exist only under
# TCC_TARGET_ARM64.
for lvalue_patch in arm64-store-const-lvalue arm64-load-const-lvalue; do
  apply_our_patch "$lvalue_patch" "$arm64_gen"
done

# Stock arm64-gen.c truncates SValue::c.i to uint32_t at the top of
# both load() and store(). Fine for struct-field offsets, fatal for
# pointer-sized constant addresses (e.g., the seed-kernel writing to
# the device alias VA 0x109000000 for the PL011 UART). Drop the
# truncation; signed 9-bit ldur/stur offsets fit regardless.
for trunc_patch in arm64-svcul-no-truncate arm64-svcul-no-truncate-store; do
  apply_our_patch "$trunc_patch" "$arm64_gen"
done

# AArch64 assembler — phase 1. Drops in arm64-asm.c + arm64-tok.h and
# wires their includes into tcc.h, libtcc.c, and tcctok.h. Patches are
# gated by TCC_TARGET_ARM64 in the surrounding source so they no-op on
# other arches even when applied. See docs/TCC-ARM64-ASM.md.
# Drop the two new assembler sources into the unpacked tree under
# their own names.
for asm_file in arm64-asm.c arm64-tok.h; do
  cp "$OUR_PATCHES/files/$asm_file" "$SRC/$asm_file"
done

# Hook the new files into the build; <patch-name>:<file under $SRC>.
for hook in \
  arm64-asm-include-tcc-h:tcc.h \
  arm64-asm-include-libtcc-c:libtcc.c \
  arm64-tok-include-tcctok-h:tcctok.h
do
  apply_our_patch "${hook%%:*}" "$SRC/${hook#*:}"
done

# arm64-asm.c emits gen_expr64 for `.quad sym - sym2`; declare it for
# arm64 too (was x86_64-only).
apply_our_patch tcc-h-gen-expr64-arm64 "$SRC/tcc.h"

# Route .quad through asm_data on arm64 so symbol-difference expressions
# emit a relocation (R_AARCH64_PREL64) instead of failing to parse.
for quad_patch in tccasm-arm64-quad tccasm-arm64-quad-asm-data; do
  apply_our_patch "$quad_patch" "$SRC/tccasm.c"
done

# Enable the relocations the assembler now emits: PREL64 (data symbol
# difference), CONDBR19 + TSTBR14 (forward conditional branch / tbz).
for reloc_patch in arm64-link-asm-relocs arm64-link-prel64-condbr; do
  apply_our_patch "$reloc_patch" "$SRC/arm64-link.c"
done

# tcc's lexer in ASM_FILE mode swallows mid-line '#' as a line comment,
# which kills the ARM/AArch64 '#imm' immediate prefix. Restrict the
# '#'-as-line-comment behavior to start-of-line so '#' tokenizes as
# itself in operand position. gas's own '#' line-comment rule is BOL
# only, so this matches stock gas semantics. Other arches' assemblers
# don't use '#' as an immediate prefix, so they're unaffected.
apply_our_patch asm-hash-bol-only "$SRC/tccpp.c"

# Side effect of the patch above: alloca86_64-bt.S has two tab-prefixed
# tail comments (`mov %rax,%rsi # size, a second parm…`) that the
# x86_64 assembler now rejects with "end of line expected". They are
# inert documentation; strip them. The file is only compiled when
# building the amd64 libtcc1.a (LIBTCC1_ASM_SRCS in boot4.sh), so this
# rewrite is a no-op on aarch64/riscv64 builds.
# Cut everything from the first "<TAB>#" to end of line, on every line,
# then swap the filtered copy into place.
bt_asm=$SRC/lib/alloca86_64-bt.S
awk '{ sub(/\t#.*$/, ""); print }' "$bt_asm" > "$bt_asm.tmp"
mv "$bt_asm.tmp" "$bt_asm"

# riscv64 int->llong cast: stock tcc 0.9.26 leaves unsigned int values
# in their native register width, but RV64 32-bit ops sign-extend bits
# 63:32, so widening an `unsigned int` to `unsigned long` reads garbage
# upper bits. Make gen_cvt_sxtw do the right thing for both signs, and
# always invoke it on riscv64. Hits be64() in the seed kernel's DTB
# parser; without the fix the kernel sees mem_start sign-extended to
# 0xffffffff80000000 and the boot panics during MMU bring-up. Patch is
# gated by the call-site / function name so it no-ops on other arches.
apply_our_patch riscv64-cvt-int-zext "$SRC/tccgen.c"
for rv_gen_patch in riscv64-gen-cvt-sxtw riscv64-load-ptr-zext; do
  apply_our_patch "$rv_gen_patch" "$SRC/riscv64-gen.c"
done

# riscv64 ELF default load address — stock tcc lands binaries at
# 0x10000, below the seed kernel's USER_VA_LO=0x200000. Move the
# default to 0x600000 so tcc-emitted ELFs slot into the user pool
# without per-link `-Wl,-Ttext=` overrides. Patch is gated by the
# stock literal in the before-block, so it no-ops elsewhere.
apply_our_patch riscv64-elf-start-addr "$SRC/riscv64-link.c"

# riscv64 stdarg.h order fix — the upstream `#elif __riscv` branch
# uses `__builtin_va_list` before it's typedef'd. Stock tcc treats
# `__builtin_va_list` as a built-in keyword and forgives the forward
# reference; tcc-boot2's frontend does not. Swap the two typedefs so
# the base `char *__builtin_va_list` is in scope before va_list claims
# it. Affects only the riscv branch — the patch is gated by the
# `#elif __riscv` line in the before-block, so it's a no-op when that
# branch is absent (other tcc trees).
tcc_stdarg=$SRC/include/stdarg.h
apply_our_patch riscv-stdarg-fix "$tcc_stdarg"

# gcc/clang __builtin_va_* spelling bridge — append aliases at the end
# of tcc's <stdarg.h> so the same flat.c (which uses the gcc spelling
# because that's what cc.scm recognizes) also compiles back through
# tcc on amd64/aarch64. Gated `#ifndef __riscv` inside .after — the
# __riscv branch already maps these names natively. See the .after
# block for the full rationale.
apply_our_patch stdarg-builtin-aliases "$tcc_stdarg"

# Empty config.h shims — pass1.kaem creates these via `catm <out>` (line 27-28).
# One sits next to the sources, one under an overlay directory that is
# put on the include path below.
overlay_mes=$WORK/mes-overlay/mes
: > "$SRC/config.h"
mkdir -p "$overlay_mes"
: > "$overlay_mes/config.h"

# --- (3) flatten via host preprocessor --------------------------------
# HOST_CC may be overridden from the environment; defaults to `cc`.
HOST_CC=${HOST_CC:-cc}
FLAT=$WORK/tcc.flat.c

# Assemble the preprocessor command line in the positional parameters
# (the script's own arguments were fully consumed by the arg parser).
# Order matters for -I: earlier directories win.
set -- -E -P -nostdinc

# Include path: tcc sources, config.h overlay, our header shim, then
# the arch-specific and generic mes headers.
set -- "$@" \
  -I "$SRC" \
  -I "$WORK/mes-overlay" \
  -I "$ROOT/vendor/boot2-include" \
  -I "$MES_INCLUDE_LINUX" \
  -I "$MES_INCLUDE"

# Platform / bootstrap defines.
set -- "$@" \
  -D __linux__=1 \
  -D "__${CPP_ARCH}__=1" \
  -D BOOTSTRAP=1 \
  -D "HAVE_LONG_LONG=$HAVE_LL" \
  -D inline=

# tcc configuration (string-valued macros keep their inner quotes).
set -- "$@" \
  -D 'CONFIG_TCCDIR="/lib/tcc"' \
  -D 'CONFIG_SYSROOT="/"' \
  -D 'CONFIG_TCC_CRTPREFIX="/lib"' \
  -D 'CONFIG_TCC_ELFINTERP="/mes/loader"' \
  -D 'CONFIG_TCC_SYSINCLUDEPATHS="/include/mes"' \
  -D 'TCC_LIBGCC="/lib/libc.a"' \
  -D CONFIG_TCC_LIBTCC1_MES=0 \
  -D CONFIG_TCCBOOT=1 \
  -D CONFIG_TCC_STATIC=1 \
  -D CONFIG_USE_LIBGCC=1 \
  -D 'TCC_VERSION="0.9.26"' \
  -D ONE_SOURCE=1 \
  -D "TCC_TARGET_${TCC_TARGET_DEFINE}=1"

# Preprocess the ONE_SOURCE entry point into a scratch body file; the
# final tcc.flat.c is assembled from it further down.
"$HOST_CC" "$@" "$SRC/tcc.c" > "$FLAT.body"

# Publish the post-patch tcc <stdarg.h> as a per-arch bridge file
# alongside tcc.flat.c. libc-flatten.sh prepends the same bridge to
# libc.flat.c, so the boot3/boot4 container compiles no longer need
# `-I /work/in/tcc-include -include /work/in/tcc-include/stdarg.h`.
# The patched stdarg.h is byte-identical across X86_64 / ARM64 / RISCV64
# (per-arch logic lives inside its #ifdefs); we still write a per-arch
# copy so every artifact under build/<arch>/ comes from a single
# `boot.sh <arch>` invocation, with nothing shared across arches.
BRIDGE=$WORK/stdarg-bridge.h
cp "$SRC/include/stdarg.h" "$BRIDGE"

# Prepend the bridge into tcc.flat.c, guarded by !CCSCM so cc.scm
# (which has __builtin_va_list / __builtin_va_* as native frontend
# keywords and predefines CCSCM) skips the whole block. Under tcc,
# the per-arch #ifdef branches inside the bridge resolve and define
# the va_list typedef + __builtin_va_* → tcc native __va_* macros
# that flat.c needs.
#
# Layout of the final file: guard open, bridge, guard close, body.
echo '#ifndef CCSCM' > "$FLAT"
cat "$BRIDGE" >> "$FLAT"
echo '#endif' >> "$FLAT"
cat "$FLAT.body" >> "$FLAT"
rm -f "$FLAT.body"

BYTES=$(wc -c < "$FLAT")
echo "produced $FLAT ($BYTES bytes)"

# --- (4) optional verify ---------------------------------------------
# With --verify, compile the flattened file with the host compiler
# (warnings suppressed, diagnostics captured in host-cc.log) purely as
# a well-formedness check.
if [ "$VERIFY" -eq 1 ]; then
  HOST_OBJ=$WORK/tcc.flat.o
  if ! "$HOST_CC" -c -w -o "$HOST_OBJ" "$FLAT" 2>"$WORK/host-cc.log"; then
    echo "host cc: tcc.flat.c FAILED to compile; see $WORK/host-cc.log" >&2
    exit 1
  fi
  echo "host cc: tcc.flat.c compiles cleanly to $HOST_OBJ"
fi