commit 578721239fdeb810d6b419713fad35b6f90a283f
parent 0d2d302c47e1bb326de5dc23136b33bd37de160d
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 3 Jun 2026 08:29:41 -0700
doc: add CODE_SIZE.md with per-component cloc counts + regen script
scripts/code_size.sh regenerates the doc; `make code-size` wraps it.
Linked from the DESIGN.md doc index.
Diffstat:
4 files changed, 286 insertions(+), 1 deletion(-)
diff --git a/doc/CODE_SIZE.md b/doc/CODE_SIZE.md
@@ -0,0 +1,122 @@
+# kit Code Size
+
+Line counts per functional component, broken down so per-format and per-target
+code is separated from shared core. Counts are **code lines only** (C +
+headers, excluding comments and blanks), via `cloc`.
+
+> Snapshot: 2026-06-03. Regenerate with `make code-size` (or
+> `scripts/code_size.sh`); these numbers drift as the tree changes.
+
+## Arch backends — `src/arch/` (37,920)
+
+| Component | Lines |
+|---|---:|
+| aarch64 (`aa64`, reference backend) | 9,112 |
+| x86-64 (`x64`) | 8,355 |
+| riscv64 (`rv64`) | 8,345 |
+| WebAssembly (`wasm`) | 6,214 |
+| C-source backend (`c_target`) | 3,882 |
+| shared core (ArchImpl, MCEmitter, disasm, dwarf, registry) | 2,012 |
+
+## Object model — `src/obj/` (14,142)
+
+| Component | Lines |
+|---|---:|
+| ELF | 4,289 |
+| Mach-O | 3,780 |
+| COFF/PE | 3,266 |
+| Wasm object | 191 |
+| core (format-neutral model, registry, reloc apply, TLS) | 2,616 |
+
+## Linker — `src/link/` (6,958)
+
+Entirely **format-neutral** — no per-format files; emits through the `obj/`
+format writers above. Covers resolve, layout, relocation, linker scripts,
+incremental linking, and the JIT image mapper.
+
+## Frontends — `lang/`
+
+| Frontend | Lines |
+|---|---:|
+| C compiler (`lang/c`: parse/type/decl/ABI-lower) | 12,348 |
+| toy (CG-API exercise frontend) | 9,210 |
+| cpp (C lexer + preprocessor; shared by C frontend & `cpp`) | 3,726 |
+| wasm/WAT | 3,711 |
+
+The full C compiler ≈ `lang/c` + `lang/cpp`, driving the shared
+`cg`/`abi`/`opt` infrastructure below.
+
+## Other subsystems — `src/`
+
+| Subsystem | Lines |
+|---|---:|
+| optimizer (`opt`, -O1 SSA/regalloc) | 16,680 |
+| codegen (`cg`, public CG API + IR) | 11,349 |
+| api (composition layer) | 6,800 |
+| debug/DWARF (`debug`) | 6,094 |
+| dist (CAS + `.kpkg`) | 5,147 |
+| interp (bytecode interpreter) | 2,547 |
+| emu (guest-ELF emulator) | 2,401 |
+| asm (standalone + inline assembler) | 1,962 |
+| core (arenas/maps/bufs/diag/hash) | 1,377 |
+| abi (calling conventions) | 1,185 |
+| dbg (debugger) | 1,098 |
+| os (emu syscall personality) | 806 |
+| jit (stub; real mapper lives in `link`) | 12 |
+
+### dist split — `src/dist/` (5,147)
+
+| Component | Lines |
+|---|---:|
+| packaging (manifest/kpkg/tar/trust) | 2,032 |
+| vendored compression (deflate/lz4) | 1,976 |
+| CAS store (blob/tree/cas) | 749 |
+| vendored crypto (blake2b/ed25519/b64/minisig) | 390 |
+
+## Driver — `driver/` (23,079)
+
+| Area | Lines |
+|---|---:|
+| cmd (per-subcommand) | 16,334 |
+| env (host adapters) | 3,834 |
+| lib (shared support) | 2,411 |
+| `*.c` (main + dispatch) | 500 |
+
+### driver/cmd per-subcommand
+
+| Subcommand | Lines |
+|---|---:|
+| dbg.c | 2,991 |
+| cc.c | 2,634 |
+| objdump.c | 1,710 |
+| ld.c | 1,161 |
+| run.c | 889 |
+| objcopy.c | 669 |
+| strip.c | 579 |
+| pkg.c | 575 |
+| ar.c | 546 |
+| compile.c | 501 |
+| xxd.c | 428 |
+| nm.c | 341 |
+| size.c | 327 |
+| cas.c | 289 |
+| emu.c | 283 |
+| mc.c | 262 |
+| disas.c | 249 |
+| cmp.c | 237 |
+| compress.c | 232 |
+| strings.c | 226 |
+| as.c | 220 |
+| ranlib.c | 203 |
+| install.c | 202 |
+| addr2line.c | 199 |
+| hash.c | 192 |
+| cpp.c | 189 |
+
+## Outside `src/`
+
+| Area | Lines |
+|---|---:|
+| rt/ (freestanding runtime) | 5,167 |
+| vendor/ (monocypher, lz4) | 9,077 |
+| include/kit (public headers) | 2,935 |
diff --git a/doc/DESIGN.md b/doc/DESIGN.md
@@ -222,5 +222,6 @@ unless an API states otherwise.
| [RUNTIME.md](RUNTIME.md) | The freestanding headers and compiler-rt/libc-style support in `rt/`. |
| [BUILD.md](BUILD.md) | The build system and `KIT_*_ENABLED` component gating. |
| [TESTING.md](TESTING.md) | The test suites and harnesses under `test/`. |
+| [CODE_SIZE.md](CODE_SIZE.md) | Line counts per component (per-format/per-target split from core). |
Planned work and roadmaps live under `doc/plan/`.
diff --git a/mk/maint.mk b/mk/maint.mk
@@ -3,7 +3,7 @@
# Developer maintenance targets: source formatting, the clangd compilation
# database, optimizer benchmarking, and clean.
-.PHONY: format compile-commands bench-opt clean
+.PHONY: format compile-commands bench-opt code-size clean
# Format only the .c/.h files changed in the working tree (staged, unstaged, or
# new/untracked), restricted to the formatted roots and excluding test/pp. When
@@ -32,5 +32,9 @@ bench-opt:
$(MAKE) RELEASE=1 bin
@KIT='$(abspath build/release/kit)' bash scripts/opt_bench.sh
+# Regenerate doc/CODE_SIZE.md in place (per-component cloc line counts); requires cloc on PATH.
+code-size:
+ bash scripts/code_size.sh
+
clean:
rm -rf $(BUILD_DIR)
diff --git a/scripts/code_size.sh b/scripts/code_size.sh
@@ -0,0 +1,158 @@
+#!/usr/bin/env bash
+# Regenerate doc/CODE_SIZE.md: line counts per functional component, with
+# per-format/per-target code separated from shared core. Counts are code lines
+# only (C + headers, excluding comments and blanks) via cloc.
+#
+# Usage: scripts/code_size.sh # rewrite doc/CODE_SIZE.md
+# scripts/code_size.sh --stdout # print to stdout instead
+set -uo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+command -v cloc >/dev/null 2>&1 || { echo "code_size.sh: cloc not found" >&2; exit 1; }
+
+# code-line count (cloc SUM row, 'code' column) for the given paths/globs; prints 0 when cloc reports nothing
+loc() {
+  cloc --quiet --csv --include-lang=C,"C/C++ Header" "$@" 2>/dev/null \
+    | awk -F, '$2 == "SUM" { n = $5 } END { print n + 0 }'
+}
+
+# format an integer with thousands separators
+fmt() { printf "%s" "$1" | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta'; }
+
+# table row: "| <label> | <comma-formatted loc of paths> |"
+trow() { local label="$1"; shift; printf "| %s | %s |\n" "$label" "$(fmt "$(loc "$@")")"; }
+
+OUT="doc/CODE_SIZE.md"
+[ "${1:-}" = "--stdout" ] && OUT=/dev/stdout
+
+DATE="$(date +%Y-%m-%d)"
+
+{  # group all output so the single redirect on the closing brace captures it
+cat <<EOF  # doc header + arch section intro; unquoted delimiter, so expansions run and literal backticks are escaped
+# kit Code Size
+
+Line counts per functional component, broken down so per-format and per-target
+code is separated from shared core. Counts are **code lines only** (C +
+headers, excluding comments and blanks), via \`cloc\`.
+
+> Snapshot: $DATE. Regenerate with \`make code-size\` (or
+> \`scripts/code_size.sh\`); these numbers drift as the tree changes.
+
+## Arch backends — \`src/arch/\` ($(fmt "$(loc src/arch)"))
+
+| Component | Lines |
+|---|---:|
+EOF
+trow "aarch64 (\`aa64\`, reference backend)" src/arch/aa64
+trow "x86-64 (\`x64\`)" src/arch/x64
+trow "riscv64 (\`rv64\`)" src/arch/rv64
+trow "WebAssembly (\`wasm\`)" src/arch/wasm
+trow "C-source backend (\`c_target\`)" src/arch/c_target
+trow "shared core (ArchImpl, MCEmitter, disasm, dwarf, registry)" src/arch/*.c src/arch/*.h  # files directly under src/arch only
+
+cat <<EOF  # object-model section intro with the src/obj total
+
+## Object model — \`src/obj/\` ($(fmt "$(loc src/obj)"))
+
+| Component | Lines |
+|---|---:|
+EOF
+trow "ELF" src/obj/elf
+trow "Mach-O" src/obj/macho
+trow "COFF/PE" src/obj/coff
+trow "Wasm object" src/obj/wasm
+trow "core (format-neutral model, registry, reloc apply, TLS)" src/obj/*.c src/obj/*.h  # files directly under src/obj only
+
+cat <<EOF  # linker total (no per-component table), then the frontends table header
+
+## Linker — \`src/link/\` ($(fmt "$(loc src/link)"))
+
+Entirely **format-neutral** — no per-format files; emits through the \`obj/\`
+format writers above. Covers resolve, layout, relocation, linker scripts,
+incremental linking, and the JIT image mapper.
+
+## Frontends — \`lang/\`
+
+| Frontend | Lines |
+|---|---:|
+EOF
+trow "C compiler (\`lang/c\`: parse/type/decl/ABI-lower)" lang/c
+trow "toy (CG-API exercise frontend)" lang/toy
+trow "cpp (C lexer + preprocessor; shared by C frontend & \`cpp\`)" lang/cpp
+trow "wasm/WAT" lang/wasm
+
+cat <<EOF  # note on the full C compiler, then the remaining src/ subsystems table header
+
+The full C compiler ≈ \`lang/c\` + \`lang/cpp\`, driving the shared
+\`cg\`/\`abi\`/\`opt\` infrastructure below.
+
+## Other subsystems — \`src/\`
+
+| Subsystem | Lines |
+|---|---:|
+EOF
+trow "optimizer (\`opt\`, -O1 SSA/regalloc)" src/opt
+trow "codegen (\`cg\`, public CG API + IR)" src/cg
+trow "api (composition layer)" src/api
+trow "debug/DWARF (\`debug\`)" src/debug
+trow "dist (CAS + \`.kpkg\`)" src/dist
+trow "interp (bytecode interpreter)" src/interp
+trow "emu (guest-ELF emulator)" src/emu
+trow "asm (standalone + inline assembler)" src/asm
+trow "core (arenas/maps/bufs/diag/hash)" src/core
+trow "abi (calling conventions)" src/abi
+trow "dbg (debugger)" src/dbg
+trow "os (emu syscall personality)" src/os
+trow "jit (stub; real mapper lives in \`link\`)" src/jit
+
+cat <<EOF  # dist split: src/dist total, then one row per explicit group of file-name globs
+
+### dist split — \`src/dist/\` ($(fmt "$(loc src/dist)"))
+
+| Component | Lines |
+|---|---:|
+EOF
+trow "packaging (manifest/kpkg/tar/trust)" src/dist/manifest.* src/dist/kpkg.* src/dist/tar.* src/dist/trust.* src/dist/dist.*
+trow "vendored compression (deflate/lz4)" src/dist/deflate.* src/dist/lz4.* src/dist/lz4frame.*
+trow "CAS store (blob/tree/cas)" src/dist/blob.* src/dist/tree.* src/dist/cas.*
+trow "vendored crypto (blake2b/ed25519/b64/minisig)" src/dist/blake2b.* src/dist/ed25519.* src/dist/b64.* src/dist/minisig.*
+
+cat <<EOF  # driver section intro with the driver/ total
+
+## Driver — \`driver/\` ($(fmt "$(loc driver)"))
+
+| Area | Lines |
+|---|---:|
+EOF
+trow "cmd (per-subcommand)" driver/cmd
+trow "env (host adapters)" driver/env
+trow "lib (shared support)" driver/lib
+trow "\`*.c\` (main + dispatch)" driver/*.c driver/*.h  # files directly under driver/ only
+
+cat <<EOF  # per-subcommand table header
+
+### driver/cmd per-subcommand
+
+| Subcommand | Lines |
+|---|---:|
+EOF
+cloc --quiet --by-file --csv --include-lang=C,"C/C++ Header" driver/cmd 2>/dev/null \
+ | awk -F, 'NF>=5 && $5 ~ /^[0-9]+$/ && $2 ~ /cmd/ {n=$2; sub(/.*\//,"",n); printf "%d\t%s\n", $5, n}' \
+ | sort -rn \
+ | while IFS=$'\t' read -r n name; do printf "| %s | %s |\n" "$name" "$(fmt "$n")"; done  # awk keeps per-file rows and strips the directory; sort -rn lists the largest first
+
+cat <<EOF  # trees outside src/ (runtime, vendored code, public headers)
+
+## Outside \`src/\`
+
+| Area | Lines |
+|---|---:|
+EOF
+trow "rt/ (freestanding runtime)" rt
+trow "vendor/ (monocypher, lz4)" vendor
+trow "include/kit (public headers)" include  # NOTE(review): label says include/kit but the whole include/ tree is counted — confirm
+} > "$OUT"  # doc/CODE_SIZE.md by default, /dev/stdout with --stdout
+
+[ "$OUT" != /dev/stdout ] && echo "wrote $OUT"