commit 20712035bb5304ff0edae2d9285f1a6ff1890c34
parent 5bed955eaba50144dbbccebc8ba6d17728ed7cfa
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Tue, 12 May 2026 14:54:45 -0700
LANGS.md plan
Diffstat:
| A | doc/LANGS.md | | | 456 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 456 insertions(+), 0 deletions(-)
diff --git a/doc/LANGS.md b/doc/LANGS.md
@@ -0,0 +1,456 @@
+# Language Frontend Architecture Plan
+
+## Overview
+
+libcfree currently hard-codes two source consumers inside `src/api/pipeline.c`:
+
+- **C** (`CFREE_LANG_C`) — preprocessor → C parser → `CG` → `CGTarget` → `MCEmitter`
+- **Assembly** (`CFREE_LANG_ASM`) — lexer → assembler → `MCEmitter`
+
+The C path is privileged because `CG` (`src/cg/cg.h`) is glued to the internal C
+type system (`src/type/type.h`). To enable *alternative* language frontends we
+will introduce a **new public codegen seam** (`include/cfree/cg.h`) that speaks
+in language-neutral layout descriptors rather than C `Type*`. Frontends live
+under `lang/` and consume only public headers (`include/cfree*.h`); the driver
+registers them at startup.
+
+This plan defines Phase 1 (public ObjBuilder registration) and Phase 2 (the new
+`CfreeCg` API), then scopes a **toy language** (`lang/toy/`) to prove the seam
+before touching the C frontend.
+
+## Directory layout
+
+```
+lang/
+ toy/
+ lex.c — tokenizer (names, numbers, punctuation, keywords)
+ parse.c — recursive-descent parser → CfreeCg calls
+ type.c — toy type system: int, record, array, pointer
+ type.h
+ toy.h — public frontend entry: cfree_toy_compile()
+ Makefile — produces libcfree_toy.a
+```
+
+`lang/` is a sibling of `driver/` and `src/`. It only includes `<cfree.h>` and
+`<cfree/cg.h>`. No internal `src/` headers.
+
+## Public API: `include/cfree/cg.h`
+
+This is the **language-neutral codegen surface**. For frontends under `lang/`
+it stands in for the internal `CG` + `CGTarget` vtable as a stable, typed C
+API. The implementation lives in `src/api/cg.c` (or alongside `pipeline.c`)
+and adapts calls to the existing `CGTarget` machinery.
+
+### Handles
+
+```c
+typedef struct CfreeCg CfreeCg;
+typedef struct CfreeCgType CfreeCgType;
+typedef struct CfreeCgValue CfreeCgValue; /* opaque stack/SSA handle */
+typedef uint32_t CfreeCgLabel;
+#define CFREE_CG_LABEL_NONE 0u
+```
+
+### Type factory
+
+Types are **layout descriptors**, not semantic C types. They carry `(size,
+align, scalar_kind)` and, for aggregates, a flat field list. The backend ABI
+classification derives layout from these descriptors without knowing about C
+qualifiers, bitfields, or tag identity.
+
+```c
+CfreeCgType* cfree_cg_type_i32(CfreeCompiler*);
+CfreeCgType* cfree_cg_type_i64(CfreeCompiler*);
+CfreeCgType* cfree_cg_type_u32(CfreeCompiler*);
+CfreeCgType* cfree_cg_type_u64(CfreeCompiler*);
+CfreeCgType* cfree_cg_type_f32(CfreeCompiler*);
+CfreeCgType* cfree_cg_type_f64(CfreeCompiler*);
+
+/* Pointer: element type + count (0 = single pointer or unknown, >0 array) */
+CfreeCgType* cfree_cg_type_ptr(CfreeCompiler*, CfreeCgType* pointee, uint32_t count);
+
+/* Records (structs, tuples, tagged unions). The caller describes fields in
+ declaration order; the backend computes offsets/alignment/padding. */
+typedef struct CfreeCgField {
+ CfreeSym name; /* may be 0 for anonymous/tuples */
+ CfreeCgType* type;
+ uint32_t align_override; /* 0 = natural, 1 = packed */
+} CfreeCgField;
+CfreeCgType* cfree_cg_type_record(CfreeCompiler*,
+ CfreeSym tag,
+ const CfreeCgField* fields,
+ uint32_t nfields);
+
+/* Function type for indirect calls and type-checking. */
+CfreeCgType* cfree_cg_type_func(CfreeCompiler*,
+ CfreeCgType* ret,
+ CfreeCgType** params,
+ uint32_t nparams,
+ int variadic);
+```
+
+### CG lifecycle
+
+```c
+/* Construct a CG context bound to an ObjBuilder. */
+CfreeCg* cfree_cg_new(CfreeCompiler*, CfreeObjBuilder* out);
+void cfree_cg_free(CfreeCg*);
+
+/* Function boundaries. `name` is the source-level symbol; the backend applies
+ the active object format's C-symbol mangling (e.g. leading `_` on Mach-O). */
+void cfree_cg_func_begin(CfreeCg*, const char* name, CfreeCgType* fn_type);
+void cfree_cg_func_end(CfreeCg*);
+
+/* Source location tracking (sticky until next call). */
+void cfree_cg_set_loc(CfreeCg*, CfreeSrcLoc);
+```
+
+### Value stack
+
+`CfreeCg` owns a TCC-style stack. Every push produces a value; every operation
+consumes and produces values. The stack discipline is the frontend's
+responsibility; the backend manages register allocation and spills.
+
+```c
+/* Literal materialization */
+void cfree_cg_push_int(CfreeCg*, int64_t value, CfreeCgType* type);
+void cfree_cg_push_float(CfreeCg*, double value, CfreeCgType* type);
+
+/* String literals → rodata pointer */
+void cfree_cg_push_bytes(CfreeCg*, const uint8_t* str, size_t len);
+
+/* Addressable storage */
+void cfree_cg_push_local(CfreeCg*, uint32_t slot_id, CfreeCgType* type);
+void cfree_cg_push_global(CfreeCg*, CfreeSym name, CfreeCgType* type);
+
+/* Lvalue/rvalue conversion */
+void cfree_cg_load(CfreeCg*); /* lvalue → rvalue */
+void cfree_cg_addr(CfreeCg*); /* lvalue → pointer rvalue */
+void cfree_cg_store(CfreeCg*); /* pop [addr_or_lvalue, rvalue] */
+
+/* Stack manipulation */
+void cfree_cg_dup(CfreeCg*);
+void cfree_cg_swap(CfreeCg*);
+void cfree_cg_drop(CfreeCg*);
+void cfree_cg_rot3(CfreeCg*);
+```
+
+### Arithmetic, compare, convert
+
+```c
+typedef enum CfreeCgBinOp {
+ CFREE_CG_ADD, CFREE_CG_SUB, CFREE_CG_MUL,
+ CFREE_CG_SDIV, CFREE_CG_UDIV, CFREE_CG_SREM, CFREE_CG_UREM,
+ CFREE_CG_AND, CFREE_CG_OR, CFREE_CG_XOR,
+ CFREE_CG_SHL, CFREE_CG_SHR_S, CFREE_CG_SHR_U,
+} CfreeCgBinOp;
+
+typedef enum CfreeCgCmpOp {
+ CFREE_CG_EQ, CFREE_CG_NE,
+ CFREE_CG_LT_S, CFREE_CG_LE_S, CFREE_CG_GT_S, CFREE_CG_GE_S,
+ CFREE_CG_LT_U, CFREE_CG_LE_U, CFREE_CG_GT_U, CFREE_CG_GE_U,
+} CfreeCgCmpOp;
+
+void cfree_cg_binop(CfreeCg*, CfreeCgBinOp);
+void cfree_cg_cmp(CfreeCg*, CfreeCgCmpOp);
+void cfree_cg_convert(CfreeCg*, CfreeCgType* dst);
+```
+
+### Control flow
+
+Labels are numeric handles. The backend maps them to per-arch branch targets or
+SSA blocks.
+
+```c
+CfreeCgLabel cfree_cg_label_new(CfreeCg*);
+void cfree_cg_label_place(CfreeCg*, CfreeCgLabel);
+void cfree_cg_jump(CfreeCg*, CfreeCgLabel);
+void cfree_cg_branch_true(CfreeCg*, CfreeCgLabel); /* pop i1 */
+void cfree_cg_branch_false(CfreeCg*, CfreeCgLabel); /* pop i1 */
+
+/* Structured control flow (optional but recommended). Backends that don't
+ consume structure directly (all real ISAs except WASM) lower to labels. */
+typedef uint32_t CfreeCgScope;
+CfreeCgScope cfree_cg_scope_begin(CfreeCg*, const CfreeCgType* result);
+void cfree_cg_scope_end(CfreeCg*, CfreeCgScope);
+void cfree_cg_break(CfreeCg*, CfreeCgScope);
+void cfree_cg_continue(CfreeCg*, CfreeCgScope);
+```
+
+### Aggregate and memory operations
+
+```c
+/* Copy `size` bytes from src_addr to dst_addr. Pops [dst, src]. */
+void cfree_cg_memcpy(CfreeCg*, uint32_t size, uint32_t align);
+
+/* Fill `size` bytes at addr with the byte `val`. Pops addr. */
+void cfree_cg_memset(CfreeCg*, uint8_t val, uint32_t size, uint32_t align);
+
+/* Element access for arrays and records.
+ Pops base_addr, pushes addr_of_element. */
+void cfree_cg_index(CfreeCg*, uint32_t elem_size, uint32_t index);
+void cfree_cg_field_addr(CfreeCg*, uint32_t offset);
+```
+
+### Calls and returns
+
+```c
+/* `nargs` values must be on the stack (left-to-right or right-to-left
+ depending on the frontend's calling convention choice). `fn_type` is the
+ callee's function type; the backend uses it for ABI classification.
+ The callee value itself must be the deepest value on the stack, below args. */
+void cfree_cg_call(CfreeCg*, uint32_t nargs, CfreeCgType* fn_type);
+void cfree_cg_tail_call(CfreeCg*, uint32_t nargs, CfreeCgType* fn_type);
+void cfree_cg_ret(CfreeCg*, int has_value); /* has_value=0 for void */
+```
+
+### Inline assembly (future)
+
+```c
+/* TBD: define after the toy language proves the seam. Same constraint model
+ as the internal AsmConstraint, but using CfreeCgValue handles instead of
+ internal SValues. */
+```
+
+### Frame slots (locals and parameters)
+
+Frontends can allocate frame slots explicitly, or let the backend infer them
+from `cfree_cg_push_local` usage. The explicit API is useful when the frontend
+wants deterministic slot IDs (e.g. for debug variable location):
+
+```c
+uint32_t cfree_cg_local_slot(CfreeCg*, CfreeCgType* type, CfreeSym name);
+uint32_t cfree_cg_param_slot(CfreeCg*, uint32_t index, CfreeCgType* type,
+ CfreeSym name);
+```
+
+### Debug info hooks
+
+The toy language will skip debug info in v1, but the API surface must reserve
+room so frontends can emit DWARF later without growing the vtable.
+
+```c
+/* TBD: debug_func_begin, debug_local, debug_param, debug_line.
+ For v1 the driver passes debug_info=0 and the CG skips it. */
+```
+
+## Toy language (`lang/toy/`)
+
+### Grammar (v1)
+
+```
+decl ::= fn_decl | global_decl | type_decl
+fn_decl ::= "fn" name "(" param_list ")" (":" type)? block
+param_list ::= (name ":" type ("," name ":" type)*)?
+block ::= "{" stmt* "}"
+stmt ::= let_stmt
+ | assign_stmt
+ | if_stmt
+ | while_stmt
+ | break_stmt
+ | continue_stmt
+ | return_stmt
+ | expr_stmt
+let_stmt ::= "let" name ":" type ("=" expr)? ";"
+assign_stmt ::= lvalue "=" expr ";"
+if_stmt ::= "if" expr block ("else" block)?
+while_stmt ::= "while" expr block
+break_stmt ::= "break" ";"
+continue_stmt ::= "continue" ";"
+return_stmt ::= "return" expr? ";"
+expr_stmt ::= expr ";"
+lvalue ::= name (lvalue_r)*
+lvalue_r ::= "[" expr "]"
+ | "." name
+ | ".*"
+
+global_decl ::= "let" name ":" type "=" expr ";"
+
+type_decl ::= "type" name "=" type ";"
+type ::= "int" | "*" type | "[" number "]" type | record_type
+record_type ::= "{" field_decl ("," field_decl)* "}"
+field_decl ::= name ":" type
+
+expr ::= or_expr
+or_expr ::= and_expr ("||" and_expr)*
+and_expr ::= cmp_expr ("&&" cmp_expr)*
+cmp_expr ::= add_expr (("<" | ">" | "<=" | ">=" | "==" | "!=") add_expr)?
+add_expr ::= mul_expr (("+" | "-") mul_expr)*
+mul_expr ::= unary_expr (("*" | "/" | "%") unary_expr)*
+unary_expr ::= ("-" | "!" | "&") unary_expr | primary
+primary ::= number | string | name | lvalue | "(" expr ")"
+```
+
+### Semantics
+
+- **One integer type**: `int` is a signed integer whose width equals the target
+ pointer width (32-bit on ILP32, 64-bit on LP64). The frontend selects
+ `cfree_cg_type_i32` or `cfree_cg_type_i64` per `cfree_target_ptr_size(compiler)`.
+- **No implicit conversions**: the parser rejects `int + ptr`.
+- **Records** are value types (like C structs). Assignment copies the whole
+ record. Parameter passing follows the target ABI (the backend decides
+ direct/indirect/split).
+- **Arrays** are fixed-size and decay to pointers only in subscript and field
+ contexts. Array assignment is not allowed.
+- **Pointers**: `ptr` is an untyped pointer (like `void*`). Dereference is
+ `expr.*` (sugar for `expr[0]`). Typed pointers are a future extension.
+- **Functions**: no forward declarations needed; a single-pass parser resolves
+ all function names into globals. Recursion is allowed.
+
+### Frontend pipeline
+
+1. **Lex** (`lex.c`) — hand-written scanner with 1-char lookahead.
+2. **Type check** (`type.c`) — minimal bidirectional inference:
+ - `let` requires an explicit type (or an initializer from which to infer).
+ - Every expression node carries a `ToyType*`.
+ - Subscript and field access check bounds/field names at parse time.
+3. **Codegen** (`parse.c` → `CfreeCg`) — single-pass lowering:
+ - Globals → `cfree_cg_push_global` + `cfree_cg_store`.
+ - Locals → `cfree_cg_local_slot` + `cfree_cg_push_local`.
+ - Records/arrays → `cfree_cg_memcpy` or `cfree_cg_memset`.
+ - Control flow → scopes and `cfree_cg_branch_true` / `cfree_cg_jump`.
+ - Function calls → push callee global, push args, `cfree_cg_call`.
+
+### Entry point
+
+```c
+/* lang/toy/toy.h */
+#include <cfree.h>
+
+int cfree_toy_compile(CfreeCompiler*, const CfreeCompileOptions*,
+ const CfreeBytesInput* input, CfreeObjBuilder* out);
+```
+
+This matches `CfreeCompileFn`, the callback type `cfree_register_frontend` accepts.
+
+## Driver registration (Phase 1.5)
+
+### New public entry
+
+```c
+/* include/cfree.h, near the CfreeLanguage enum */
+typedef int (*CfreeCompileFn)(CfreeCompiler*, const CfreeCompileOptions*,
+ const CfreeBytesInput*, CfreeObjBuilder* out);
+
+/* Register a frontend for a language tag. Overwrites any prior registration
+ for that tag. Returns 0 on success, 1 on OOM or bad args. */
+int cfree_register_frontend(CfreeCompiler*, CfreeLanguage, CfreeCompileFn);
+```
+
+`CfreeLanguage` grows:
+
+```c
+typedef enum CfreeLanguage {
+ CFREE_LANG_C = 0,
+ CFREE_LANG_ASM = 1,
+ CFREE_LANG_TOY = 2, /* new */
+} CfreeLanguage;
+```
+
+### Driver changes
+
+`driver/main.c` gains a registration hook invoked at tool startup:
+
+```c
+static void driver_register_frontends(CfreeCompiler* c) {
+ cfree_register_frontend(c, CFREE_LANG_ASM, internal_compile_asm);
+ cfree_register_frontend(c, CFREE_LANG_C, internal_compile_c);
+ cfree_register_frontend(c, CFREE_LANG_TOY, cfree_toy_compile);
+}
+```
+
+`cfree_compile_obj` in `src/api/pipeline.c` changes from a hard-coded switch
+to a table dispatch:
+
+```c
+static void compile_into(Compiler* c, const CfreeCompileOptions* opts,
+ const CfreeBytesInput* input, ObjBuilder* ob) {
+ CfreeCompileFn fn = compiler_get_frontend(c, input->lang);
+ if (fn) {
+ CfreeObjBuilder* pub_ob = ob;
+ int rc = fn(c, opts, input, pub_ob);
+ if (rc != 0) panic(...);
+ return;
+ }
+ /* fallback for unknown language */
+ panic(...);
+}
+```
+
+`cfree_language_for_path` learns `.toy` → `CFREE_LANG_TOY`.
+
+### Build integration
+
+The Makefile grows a `lang/` target:
+
+```makefile
+# lang/toy/Makefile
+libcfree_toy.a: toy.o lex.o parse.o type.o
+ $(AR) rcs $@ $^
+
+toy.o: toy.c $(CFREE_INCLUDES)
+ $(CC) $(CFLAGS) -I../../include -c $< -o $@
+```
+
+The top-level `Makefile` adds `lang/toy/libcfree_toy.a` to `LIBCFREE_OBJS` (or
+links it into `libcfree.a` if we decide frontends are part of the core library).
+Initially we keep it as a separate static archive that the driver links against.
+
+## Migration path for C (Phase 3, after toy proves the seam)
+
+Once `lang/toy/` compiles and links end-to-end, we can migrate the C frontend:
+
+1. Move `src/parse/`, `src/pp/`, `src/lex/`, `src/decl/` into `lang/c/`.
+2. Rename internal `parse_c` to `cfree_c_compile` with the `CfreeCompileFn`
+ signature.
+3. Build an **internal adapter layer** that translates C `Type*` into
+ `CfreeCgType*` before calling the public `CfreeCg` methods. Initially this
+ adapter can live in `lang/c/cg_adapter.c`.
+4. The internal `CG` layer (`src/cg/cg.h`) is either:
+ - retired and replaced by the public `CfreeCg` implementation, or
+ - kept as a private fast-path for the C frontend if the adapter overhead is
+ unacceptable.
+5. Assembly remains in core (`src/api/pipeline.c` or a thin `lang/asm/`
+ wrapper) because it bypasses CG entirely and talks to `MCEmitter`.
+
+## Inline assembly
+
+Inline assembly stays internal to the C frontend for now. The public `CfreeCg`
+API can grow an `asm_block` method later, but it is not required for the toy
+language. When it arrives, the signature will mirror the internal
+`CGTarget.asm_block` but use `CfreeCgValue` handles and `const char*` strings
+instead of internal `Operand` / `Sym`.
+
+## Open questions / TBD
+
+1. **String interning (`Sym`)**: The API above takes `CfreeSym` in most places
+ but `const char*` in `cfree_cg_func_begin`; settle on one. The adapter must map
+ names to `Sym` ids efficiently (a temporary `CfreeCg` hash map is fine for v1).
+2. **Panic boundary**: Every public `cfree_cg_*` call is a thin wrapper that
+ saves/restores `c->panic` around the internal work, exactly like
+ `cfree_compile_obj` today.
+3. **Optimization wrapper**: How does `opt_level` reach the public CG? The
+ `CfreeCompileOptions` already carries `opt_level`; the `CfreeCg` constructor
+ can wrap the underlying `CGTarget` with `opt_cgtarget_new` internally.
+4. **Debug info**: Toy v1 skips debug info. When we add it, the public API will
+ expose `CfreeDebugBuilder` handles that frontends populate with
+ language-neutral type and location records.
+5. **Memory ownership of `CfreeCgType`**: Types should be arena-allocated from
+ the compiler's scratch arena and valid until `cfree_cg_free`. The public
+ surface should not require the frontend to free them.
+6. **Variadics / atomics**: These stay in the `CfreeCg` API but are
+ optional for the toy language. They are needed when C is lifted later.
+7. **ELF symbol visibility**: The toy frontend emits globals as
+ `SB_GLOBAL`/`SK_OBJ` by default. No linkage modifiers in v1.
+
+## Acceptance criteria for toy v1
+
+- [ ] `include/cfree/cg.h` exists and compiles.
+- [ ] `src/api/cg.c` implements the public API over existing `CGTarget`.
+- [ ] `lang/toy/` compiles to `libcfree_toy.a` using only public headers.
+- [ ] `driver/cc.c` (or a new `driver/toyc.c`) can compile `.toy` files:
+ `cfree toyc -c hello.toy -o hello.o`
+- [ ] `cfree run` can JIT a `.toy` file and execute `main()`.
+- [ ] No changes to the C frontend or internal `src/cg/cg.h` API surface.
+- [ ] Existing test suite (`make test-asm test-lex test-parse test-pp test-cg
+ test-link test-elf`) passes unchanged.