kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 4f4f5f3154004c2d4041df033a93a896d88d8228
parent df346d81fc94705392812053c1a029bbf7e0ed62
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sun, 10 May 2026 10:52:20 -0700

parse: Phase 6 — initializers (designators, string init, compound literals)

Lands the §6.7.9 initializer surface on top of Phase 4's static-storage
machinery and Phase 3's aggregates. Adds a token replay buffer so we
can two-pass scan a brace list to size `T[]` declarators / compound
literals before slot allocation; designator chains (`[i]`, `.field`,
nested `[i][j]`) navigate from any aggregate type and reset the cursor
in both the local and static paths, with locals zero-filling gaps.
String literals initialize char-arrays at any nesting level. Compound
literals lower to a hidden `cg_local` slot whose lvalue is pushed for
postfix/cast machinery to decay. `parse_type_name` now accepts a full
abstract declarator so `(int[])` and `(int (*)[3])` parse.

Flips `6_5_29_compound_literal` and `6_7_9_02..10` from · to ★.

Diffstat:
M doc/parser-status.md | 47 +++++++++++++++++++++++++++++++++++++----------
M src/parse/parse.c | 524 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M test/parse/CORPUS.md | 20 ++++++++++----------
3 files changed, 538 insertions(+), 53 deletions(-)

diff --git a/doc/parser-status.md b/doc/parser-status.md @@ -228,20 +228,47 @@ because it's the §6.8.6 spec slot). --- -## Phase 6 — Initializers ⬜ +## Phase 6 — Initializers ✅ Full §6.7.9 surface. Requires aggregates (Phase 3) and globals (Phase 4) to be fully useful. -- [ ] Brace initializer for arrays -- [ ] Brace initializer for structs -- [ ] Designated initializers (`[i] = ...`, `.field = ...`) -- [ ] Nested designators (`[i][j] = ...`) -- [ ] Partial init with zero-fill -- [ ] String literal init for `char[]` -- [ ] Compound literals (`(int[]){1, 2}`) - -Unlocks: `6_5_29`, `6_7_9_02–10`, `6_9_08–09`. +- [x] Brace initializer for arrays +- [x] Brace initializer for structs +- [x] Designated initializers (`[i] = ...`, `.field = ...`) +- [x] Nested designators (`[i][j] = ...`) +- [x] Partial init with zero-fill +- [x] String literal init for `char[]` +- [x] Compound literals (`(int[]){1, 2}`) + +Phase 6 also added: + - A token replay buffer on `Parser` so we can two-pass scan a braced + initializer: record tokens through the matching `}`, count top-level + items, then `replay_rewind` to re-parse. Used by + `complete_incomplete_array` to size `T[]` declarators / compound + literals (`(int[]){...}` and `char s[] = "hi"`) before the slot is + allocated. + - `parse_designator_chain` walks `[const]` / `.ident` chains starting + from any aggregate type, returning the leaf sub-object's offset/type + plus the top-level cursor index for the parent loop. Both + `init_at` (local) and `parse_static_init_at` (file scope / static + locals) consume designators; locals zero-fill gaps between the + cursor and a forward designator, while statics rely on the + pre-zeroed buffer. + - String-literal element initialization for char arrays at any nesting + level: `init_string_at` / `parse_static_string_at`. With or without + surrounding braces; truncation rules match §6.7.9 ¶14. 
+ - Compound literals (`(T){...}`) lower as a hidden `cg_local` slot in + `parse_unary` immediately after the type-name; the slot's lvalue is + pushed and outer postfix/cast machinery handles array→pointer decay. + - `parse_type_name` now accepts a full abstract declarator (pointer + prefix + array/function suffixes) so casts like `(int (*)[3])` and + compound literals like `(int[]){...}` parse cleanly. The Phase 1 + docstring noting "abstract declarators are pointer-prefix only" is + obsolete. + +Unlocks (status as landed): `6_5_29` ★, `6_7_9_02–10` ★. `6_9_08–09` +were already ★ from Phase 4 (no compound-literal dependency). --- diff --git a/src/parse/parse.c b/src/parse/parse.c @@ -252,6 +252,26 @@ typedef struct Parser { /* Counter used to mint unique linker-visible names for static locals so * that two functions can each have their own `static int s = ...`. */ u32 static_local_counter; + + /* Counter used to mint anonymous local names for compound literals + * (`(T){...}`). Each compound literal becomes a hidden frame slot whose + * name is reserved here purely for diagnostics; the symbol is never + * visible to user code. */ + u32 compound_literal_counter; + + /* Replay buffer for two-pass scans of brace-enclosed initializers. + * Used when a compound literal or initializer needs to size an + * incomplete array (`(int[]){10, 32}`): we record tokens through the + * matching `}`, count items, then rewind to re-parse. While + * `replay_active`, advance()/peek1() pull from `replay` instead of pp; + * once exhausted, they fall back to the regular pp source so the + * post-brace token is fetched fresh. The buffer lives in arena + * storage. */ + Tok* replay; + u32 replay_cap; + u32 replay_len; + u32 replay_pos; + u8 replay_active; } Parser; /* ============================================================ @@ -272,6 +292,16 @@ static _Noreturn void perr(Parser* p, const char* fmt, ...) 
{ * ============================================================ */ static void advance(Parser* p) { + if (p->replay_active) { + if (p->replay_pos < p->replay_len) { + p->cur = p->replay[p->replay_pos++]; + return; + } + /* Replay exhausted; fall back to the underlying source. The pp stream + * sits exactly past the recorded `}` (record_braced_block left it + * there), so fetching the next token resumes parsing after the brace. */ + p->replay_active = 0; + } if (p->has_next) { p->cur = p->next; p->has_next = 0; @@ -282,6 +312,9 @@ static void advance(Parser* p) { /* One-token lookahead beyond p->cur. Lazily populated. */ static Tok peek1(Parser* p) { + if (p->replay_active && p->replay_pos < p->replay_len) { + return p->replay[p->replay_pos]; + } if (!p->has_next) { p->next = pp_next(p->pp); p->has_next = 1; @@ -330,6 +363,76 @@ static void expect_punct(Parser* p, u32 punct, const char* what) { } } +/* Record tokens from the current `{` through the matching `}` into the + * parser's replay buffer. Pre: p->cur is `{`. Post: p->cur is the closing + * `}` (not yet advanced past); replay buffer holds [`{`, ..., `}`]. The + * caller must subsequently call replay_rewind() to re-scan, or simply + * advance() to skip past the brace. */ +static void record_braced_block(Parser* p) { + int depth = 0; + if (!is_punct(&p->cur, '{')) perr(p, "internal: record on non-'{'"); + p->replay_len = 0; + for (;;) { + if (p->replay_len == p->replay_cap) { + u32 new_cap = p->replay_cap ? 
p->replay_cap * 2 : 32; + Tok* nv = arena_array(p->c->tu, Tok, new_cap); + if (!nv) perr(p, "out of memory in record_braced_block"); + if (p->replay && p->replay_len) { + memcpy(nv, p->replay, p->replay_len * sizeof(Tok)); + } + p->replay = nv; + p->replay_cap = new_cap; + } + p->replay[p->replay_len++] = p->cur; + if (is_punct(&p->cur, '{')) { + ++depth; + } else if (is_punct(&p->cur, '}')) { + --depth; + if (depth == 0) break; + } else if (p->cur.kind == TOK_EOF) { + perr(p, "unexpected end of file in initializer"); + } + advance(p); + } + /* cur is the recorded closing `}`. Caller decides what to do next. */ +} + +/* After record_braced_block, rewind so subsequent advance()/peek1() pull + * tokens from the replay buffer starting at index 0. Discards any + * lazily-buffered `next` since tokens within the recorded range are now + * served from the buffer. The post-`}` token will be fetched via pp_next + * once the replay finishes draining. */ +static void replay_rewind(Parser* p) { + if (p->replay_len == 0) perr(p, "internal: replay_rewind with empty buffer"); + p->cur = p->replay[0]; + p->replay_pos = 1; + p->replay_active = 1; + p->has_next = 0; +} + +/* Count top-level items in a recorded brace list (positional or designator- + * led). The recording starts with `{` at index 0 and ends with the matching + * `}` at len-1. Top-level commas separate items; a trailing comma before + * the closing `}` does not introduce an extra item. Used to size incomplete + * arrays initialized with `{...}`. 
*/ +static u32 count_recorded_top_level_items(const Tok* vec, u32 len) { + u32 count; + u32 i; + int depth = 0; + if (len < 2) return 0; + if (len == 2) return 0; /* `{}` */ + count = 1; + for (i = 1; i < len - 1; ++i) { + const Tok* t = &vec[i]; + if (is_punct(t, '{') || is_punct(t, '(') || is_punct(t, '[')) ++depth; + else if (is_punct(t, '}') || is_punct(t, ')') || is_punct(t, ']')) --depth; + else if (depth == 0 && is_punct(t, ',')) ++count; + } + /* If the last interior token is `,` it was a trailing separator; back off. */ + if (is_punct(&vec[len - 2], ',')) --count; + return count; +} + /* expect_kw is wired up but unused at this slice — `void` consumption * goes through accept_kw already. Kept commented as a documentation hook * for the next slice that needs it (e.g. `_Static_assert`). @@ -1040,10 +1143,16 @@ static const Type* parse_pointer_layer(Parser* p, const Type* base) { * function suffixes land in Phase 2. Used by sizeof / _Alignof / cast. */ static const Type* parse_type_name(Parser* p) { DeclSpecs specs; + Sym dummy_name = 0; + SrcLoc dummy_loc = {0, 0, 0}; if (!parse_decl_specs(p, &specs)) { perr(p, "expected type-name"); } - return parse_pointer_layer(p, specs.type); + /* Type-name accepts a full abstract declarator (pointer prefix + array + * and/or function suffixes); compound literals like `(int[]){...}` and + * casts like `(int (*)[3])` rely on this. */ + return parse_declarator_full(p, specs.type, /*allow_abstract=*/1, + &dummy_name, &dummy_loc); } /* ============================================================ @@ -1123,6 +1232,14 @@ static void parse_assign_expr(Parser* p); static void parse_unary(Parser* p); static void parse_postfix(Parser* p); +/* Initializer entry points used by compound-literal lowering in parse_unary; + * the bodies live next to the rest of the initializer machinery further + * down. 
*/ +typedef struct DeclSpecs DeclSpecs; +static const Type* complete_incomplete_array(Parser* p, const Type* ty); +static void init_at(Parser* p, FrameSlot slot, const Type* arr_ty, u32 offset, + const Type* ty); + /* Produce an rvalue on the stack. Three cases beyond the trivial scalar: * - array lvalue: §6.3.2.1 array-to-pointer decay → take address, retag the * resulting `T(*)[N]` as `T*` so subsequent ops see a pointer. @@ -1583,6 +1700,36 @@ static void parse_unary(Parser* p) { advance(p); /* '(' */ dst = parse_type_name(p); expect_punct(p, ')', "')' after type-name"); + /* Compound literal `(type-name) { init-list }` per §6.5.2.5. The + * literal has automatic storage in the enclosing block (function + * scope here — same lifetime as a local). Allocate a hidden frame + * slot, parse the brace initializer into it, and push the slot's + * lvalue. Outer postfix/cast machinery handles array-to-pointer + * decay if the consumer expects an rvalue. */ + if (is_punct(&p->cur, '{')) { + FrameSlotDesc fsd; + FrameSlot slot; + const Type* lit_ty = dst; + if (lit_ty && lit_ty->kind == TY_ARRAY && lit_ty->arr.incomplete) { + lit_ty = complete_incomplete_array(p, lit_ty); + } + memset(&fsd, 0, sizeof fsd); + fsd.type = lit_ty; + fsd.size = abi_sizeof(p->abi, lit_ty); + fsd.align = abi_alignof(p->abi, lit_ty); + fsd.kind = FS_LOCAL; + fsd.flags = FSF_NONE; + slot = cg_local(p->cg, &fsd); + if (lit_ty && (lit_ty->kind == TY_ARRAY || lit_ty->kind == TY_STRUCT || + lit_ty->kind == TY_UNION)) { + init_at(p, slot, lit_ty, 0, lit_ty); + } else { + /* Scalar compound literal `(int){42}`. 
*/ + init_at(p, slot, lit_ty, 0, lit_ty); + } + cg_push_local_typed(p->cg, slot, lit_ty); + return; + } parse_unary(p); /* cast-expression */ to_rvalue(p); /* `(void) expr` is the C idiom for "discard the value"; we must not @@ -2513,6 +2660,24 @@ static const Type* parse_declarator(Parser* p, const Type* base, Sym* name_out, return parse_declarator_full(p, base, /*allow_abstract=*/0, name_out, loc_out); } +/* True if `ty` is char/signed char/unsigned char (the three element types + * permitted as the target of a string-literal initializer per §6.7.9 ¶14). */ +static int is_char_kind(const Type* ty) { + if (!ty) return 0; + return ty->kind == TY_CHAR || ty->kind == TY_SCHAR || ty->kind == TY_UCHAR; +} + +/* Decode the string token at p->cur (must be TOK_STR) without advancing. + * Returns a heap-allocated byte buffer (caller frees) and writes the + * length (including the trailing NUL) to *nlen_out. Convenience wrapper + * around decode_string_literal, kept here so initializer code doesn't + * need to reach into the literal-parsing section. */ +static u8* peek_string_bytes(Parser* p, size_t* nlen_out) { + Tok t = p->cur; + if (t.kind != TOK_STR) perr(p, "internal: peek_string_bytes on non-string"); + return decode_string_literal(p, &t, nlen_out); +} + /* Push the lvalue of a sub-object at byte offset `offset` within the array * local `slot` (whose type is `arr_ty`), with element type `elem_ty`. The * value stack ends with an OPK_INDIRECT lvalue ready for cg_store. */ @@ -2569,18 +2734,142 @@ static void zero_init_at(Parser* p, FrameSlot slot, const Type* arr_ty, /* Parse the initializer for the sub-object at `offset` of type `ty`. * - * Aggregates (`{...}`) follow §6.7.9 with two simplifications: - * - No designated initializers (Phase 6). - * - Brace elision is supported on entry: a sub-aggregate without its own - * `{` consumes scalars from the parent's initializer stream until its - * first scalar slot is filled. 
This matches the corpus rows that nest - * anonymous structs inside outer braced inits. + * Aggregates (`{...}`) follow §6.7.9: + * - Designated initializers (`[i] = ...`, `.field = ...`, and chains + * such as `[i][j] = ...` or `.a.b = ...`) reset the cursor before + * each item; subsequent positional items continue from there. Gaps + * between the previous cursor and a forward designator are + * zero-filled. + * - Brace elision: a sub-aggregate without its own `{` consumes + * scalars from the parent's stream until its first scalar slot is + * filled. + * - String literals initialize char-arrays directly per §6.7.9 ¶14 + * (with or without surrounding braces). * * Scalars take a single assignment-expression, optionally wrapped in * `{ x }` per §6.7.9 ¶11. */ static void init_at(Parser* p, FrameSlot slot, const Type* arr_ty, u32 offset, const Type* ty); +/* Emit byte stores for a string literal initializing a char-array sub- + * object at `offset` whose declared element count is `count`. Bytes + * beyond the literal are zero-filled. Per §6.7.9 ¶14 it is well-formed + * to drop the terminating NUL when `count == strlen(s)`; longer arrays + * keep the NUL and zero-pad. 
*/ +static void init_string_at(Parser* p, FrameSlot slot, const Type* arr_ty, + u32 offset, const Type* elem_ty, u32 count) { + size_t n = 0; + u8* bytes = peek_string_bytes(p, &n); + size_t copy = n; + size_t i; + if (copy > count) copy = count; /* §6.7.9 ¶14 truncation */ + for (i = 0; i < copy; ++i) { + push_subobject_lv(p, slot, arr_ty, offset + (u32)i, elem_ty); + cg_push_int(p->cg, (i64)bytes[i], elem_ty); + cg_store(p->cg); + cg_drop(p->cg); + } + for (; i < count; ++i) { + push_subobject_lv(p, slot, arr_ty, offset + (u32)i, elem_ty); + cg_push_int(p->cg, 0, elem_ty); + cg_store(p->cg); + cg_drop(p->cg); + } + p->c->env->heap->free(p->c->env->heap, bytes, 0); + advance(p); /* consume TOK_STR */ +} + +/* Parse a designator chain (`[const]` and `.ident` repeats) starting at + * the current token and ending at `=`. The chain navigates from the outer + * type `outer_ty` (offset_in `outer_offset`) down to a sub-object; + * returns the sub-object's type via *sub_ty_out and absolute byte offset + * via *sub_offset_out. Also writes the index of the FIRST designator + * (which selects the cursor position in the immediately-enclosing brace + * list): for an array that's the [i] index, for a struct that's the + * field index of the named member. 
*/ +static void parse_designator_chain(Parser* p, const Type* outer_ty, + u32 outer_offset, const Type** sub_ty_out, + u32* sub_offset_out, u32* top_index_out) { + const Type* cur_ty = outer_ty; + u32 cur_off = outer_offset; + int first = 1; + for (;;) { + if (is_punct(&p->cur, '[')) { + i64 idx; + u32 esz; + SrcLoc cloc = tok_loc(&p->cur); + advance(p); + idx = eval_const_int(p, cloc); + expect_punct(p, ']', "']' after designator index"); + if (!cur_ty || cur_ty->kind != TY_ARRAY) { + perr(p, "array designator on non-array"); + } + if (idx < 0 || (u32)idx >= cur_ty->arr.count) { + perr(p, "array designator index out of range"); + } + esz = abi_sizeof(p->abi, cur_ty->arr.elem); + cur_off += (u32)idx * esz; + cur_ty = cur_ty->arr.elem; + if (first) *top_index_out = (u32)idx; + first = 0; + } else if (is_punct(&p->cur, '.')) { + Sym fname; + const Type* fty; + u32 foff; + const Field* ff; + u16 fi; + advance(p); + if (p->cur.kind != TOK_IDENT || ident_kw(p, p->cur.v.ident) != KW_NONE) { + perr(p, "expected field name after '.'"); + } + fname = p->cur.v.ident; + advance(p); + if (!cur_ty || + (cur_ty->kind != TY_STRUCT && cur_ty->kind != TY_UNION)) { + perr(p, "field designator on non-record type"); + } + if (!find_field(p->abi, cur_ty, fname, &fty, &foff, &ff)) { + perr(p, "no such field in designator"); + } + cur_off += foff; + if (first) { + /* Find the field index for cursor advance in the parent loop. + * find_field returns the offset/type but not the index, so do a + * second linear scan here. Anonymous-member transparency: an + * IDENT inside a nested anonymous member belongs to the outer + * record's NTH visible position; we use the outer slot for + * cursor advance, scanning the outer record. 
*/ + for (fi = 0; fi < cur_ty->rec.nfields; ++fi) { + const Field* g = &cur_ty->rec.fields[fi]; + if (g->name == fname && fname != 0) { + *top_index_out = fi; + break; + } + if ((g->flags & FIELD_ANON) && + (g->type->kind == TY_STRUCT || g->type->kind == TY_UNION)) { + const Type* tmp_ty; + u32 tmp_off; + const Field* tmp_f; + if (find_field(p->abi, g->type, fname, &tmp_ty, &tmp_off, + &tmp_f)) { + *top_index_out = fi; + break; + } + } + } + } + cur_ty = fty; + first = 0; + } else { + break; + } + } + if (first) perr(p, "internal: empty designator chain"); + expect_punct(p, '=', "'=' after designator"); + *sub_ty_out = cur_ty; + *sub_offset_out = cur_off; +} + /* Parse a brace-elided sequence of scalars filling sub-objects of `ty` * starting at `offset`. `count_out` is set to the number of scalars * consumed; the function returns when the parent's initializer stream @@ -2596,21 +2885,47 @@ static u32 init_struct_fields(Parser* p, FrameSlot slot, const Type* arr_ty, * stream. With `braced=1`, we are inside this struct's own `{ ... }` and * stop on `}`; with `braced=0`, we are eliding into the parent's stream * and return as soon as the first scalar slot is filled (caller manages - * outer field index). Returns the number of fields consumed. */ + * outer field index). Returns the number of fields consumed. + * + * In braced mode, designated initializers (`.field = ...`) reset `i`; + * gaps between the previous cursor and the designator are zero-filled. 
*/ const ABIRecordLayout* L = abi_record_layout(p->abi, ty); u32 i = start_field; + u32 zero_lo = start_field; /* first not-yet-zero-filled field index */ for (; i < ty->rec.nfields; ++i) { const Field* f = &ty->rec.fields[i]; u32 foff = offset + L->fields[i].offset; if (f->flags & FIELD_BITFIELD) continue; if (braced && (is_punct(&p->cur, '}') || p->cur.kind == TOK_EOF)) break; + if (braced && is_punct(&p->cur, '.')) { + const Type* sub_ty; + u32 sub_off; + u32 top_idx = 0; + parse_designator_chain(p, ty, offset, &sub_ty, &sub_off, &top_idx); + /* Zero-fill any fields the designator skipped over (or back-tracked + * past — duplicate inits are allowed but we just overwrite). */ + while (zero_lo < top_idx) { + const Field* zf = &ty->rec.fields[zero_lo]; + u32 zoff = offset + L->fields[zero_lo].offset; + if (!(zf->flags & FIELD_BITFIELD)) { + zero_init_at(p, slot, arr_ty, zoff, zf->type); + } + ++zero_lo; + } + init_at(p, slot, arr_ty, sub_off, sub_ty); + i = top_idx; /* loop ++ advances past it */ + if (zero_lo <= top_idx) zero_lo = top_idx + 1; + goto next_item_struct; + } init_at(p, slot, arr_ty, foff, f->type); + if (zero_lo <= i) zero_lo = i + 1; if (!braced) { /* Caller (parent's elision) only wanted us to consume one scalar's * worth into our first non-bitfield slot. */ ++i; break; } + next_item_struct: if (!accept_punct(p, ',')) { ++i; break; @@ -2622,9 +2937,10 @@ static u32 init_struct_fields(Parser* p, FrameSlot slot, const Type* arr_ty, } /* Zero-fill any unconsumed fields in braced mode. 
*/ if (braced) { - for (; i < ty->rec.nfields; ++i) { - const Field* f = &ty->rec.fields[i]; - u32 foff = offset + L->fields[i].offset; + u32 j; + for (j = zero_lo; j < ty->rec.nfields; ++j) { + const Field* f = &ty->rec.fields[j]; + u32 foff = offset + L->fields[j].offset; if (f->flags & FIELD_BITFIELD) continue; zero_init_at(p, slot, arr_ty, foff, f->type); } @@ -2661,31 +2977,68 @@ static u32 init_elided(Parser* p, FrameSlot slot, const Type* arr_ty, static void init_at(Parser* p, FrameSlot slot, const Type* arr_ty, u32 offset, const Type* ty) { if (ty->kind == TY_ARRAY) { + const Type* elem_ty = ty->arr.elem; + u32 esz = abi_sizeof(p->abi, elem_ty); + /* String literal initializing a char-array (with or without braces) per + * §6.7.9 ¶14. Wide character types are deferred (Phase 7). */ + if (is_char_kind(elem_ty)) { + if (p->cur.kind == TOK_STR) { + init_string_at(p, slot, arr_ty, offset, elem_ty, ty->arr.count); + return; + } + if (is_punct(&p->cur, '{') && peek1(p).kind == TOK_STR) { + advance(p); + init_string_at(p, slot, arr_ty, offset, elem_ty, ty->arr.count); + accept_punct(p, ','); + expect_punct(p, '}', "'}' after string initializer"); + return; + } + } if (!is_punct(&p->cur, '{')) { /* Brace elision: the array consumes scalars from the parent stream. * A bare assignment-expression on entry only fills one scalar slot * worth, then returns. 
*/ - init_elided(p, slot, arr_ty, offset, ty->arr.elem); + init_elided(p, slot, arr_ty, offset, elem_ty); return; } advance(p); /* '{' */ - const Type* elem_ty = ty->arr.elem; - u32 esz = abi_sizeof(p->abi, elem_ty); - u32 i = 0; - if (!is_punct(&p->cur, '}')) { - for (;;) { - if (i >= ty->arr.count) { - perr(p, "too many initializers for array"); + { + u32 i = 0; + u32 zero_lo = 0; /* first index not yet zero-filled (after explicit init) */ + if (!is_punct(&p->cur, '}')) { + for (;;) { + if (is_punct(&p->cur, '[')) { + const Type* sub_ty; + u32 sub_off; + u32 top_idx = 0; + parse_designator_chain(p, ty, offset, &sub_ty, &sub_off, + &top_idx); + while (zero_lo < top_idx) { + zero_init_at(p, slot, arr_ty, offset + zero_lo * esz, elem_ty); + ++zero_lo; + } + init_at(p, slot, arr_ty, sub_off, sub_ty); + i = top_idx + 1; + if (zero_lo < i) zero_lo = i; + } else { + if (i >= ty->arr.count) { + perr(p, "too many initializers for array"); + } + init_at(p, slot, arr_ty, offset + i * esz, elem_ty); + ++i; + if (zero_lo < i) zero_lo = i; + } + if (!accept_punct(p, ',')) break; + if (is_punct(&p->cur, '}')) break; + } + } + expect_punct(p, '}', "'}' after array initializer"); + { + u32 j; + for (j = zero_lo; j < ty->arr.count; ++j) { + zero_init_at(p, slot, arr_ty, offset + j * esz, elem_ty); } - init_at(p, slot, arr_ty, offset + i * esz, elem_ty); - ++i; - if (!accept_punct(p, ',')) break; - if (is_punct(&p->cur, '}')) break; } - } - expect_punct(p, '}', "'}' after array initializer"); - for (; i < ty->arr.count; ++i) { - zero_init_at(p, slot, arr_ty, offset + i * esz, elem_ty); } return; } @@ -2757,23 +3110,63 @@ static void encode_int_le(u8* dst, u32 size, i64 v) { } } +/* Encode a string literal at *buf+offset for a char-array sub-object of + * declared element count `count`. Bytes beyond the literal stay zero + * (buf is pre-zeroed by define_static_object). Truncation rules match + * §6.7.9 ¶14. 
*/ +static void parse_static_string_at(Parser* p, u8* buf, u32 buflen, u32 offset, + u32 count) { + size_t n = 0; + u8* bytes = peek_string_bytes(p, &n); + size_t copy = n; + if (copy > count) copy = count; + if (offset + (u32)copy > buflen) perr(p, "string initializer overflows object"); + memcpy(buf + offset, bytes, copy); + p->c->env->heap->free(p->c->env->heap, bytes, 0); + advance(p); +} + static void parse_static_init_at(Parser* p, u8* buf, u32 buflen, u32 offset, const Type* ty) { if (ty->kind == TY_ARRAY) { const Type* elem = ty->arr.elem; u32 esz = abi_sizeof(p->abi, elem); u32 i = 0; - int had_brace = accept_punct(p, '{'); + int had_brace; + /* String literal initializer for char-arrays (with or without braces). */ + if (is_char_kind(elem)) { + if (p->cur.kind == TOK_STR) { + parse_static_string_at(p, buf, buflen, offset, ty->arr.count); + return; + } + if (is_punct(&p->cur, '{') && peek1(p).kind == TOK_STR) { + advance(p); + parse_static_string_at(p, buf, buflen, offset, ty->arr.count); + accept_punct(p, ','); + expect_punct(p, '}', "'}' after string initializer"); + return; + } + } + had_brace = accept_punct(p, '{'); if (!had_brace) { perr(p, "expected '{' for static-storage array initializer"); } if (!is_punct(&p->cur, '}')) { for (;;) { - if (i >= ty->arr.count) { - perr(p, "too many initializers for array"); + if (is_punct(&p->cur, '[')) { + const Type* sub_ty; + u32 sub_off; + u32 top_idx = 0; + parse_designator_chain(p, ty, offset, &sub_ty, &sub_off, &top_idx); + parse_static_init_at(p, buf, buflen, sub_off, sub_ty); + i = top_idx + 1; + } else { + if (i >= ty->arr.count) { + perr(p, "too many initializers for array"); + } + parse_static_init_at(p, buf, buflen, offset + i * esz, elem); + ++i; } - parse_static_init_at(p, buf, buflen, offset + i * esz, elem); - ++i; if (!accept_punct(p, ',')) break; if (is_punct(&p->cur, '}')) break; } @@ -2790,6 +3183,16 @@ static void parse_static_init_at(Parser* p, u8* buf, u32 buflen, u32 offset, } while (i < 
ty->rec.nfields && !is_punct(&p->cur, '}')) { const Field* f = &ty->rec.fields[i]; + if (is_punct(&p->cur, '.')) { + const Type* sub_ty; + u32 sub_off; + u32 top_idx = 0; + parse_designator_chain(p, ty, offset, &sub_ty, &sub_off, &top_idx); + parse_static_init_at(p, buf, buflen, sub_off, sub_ty); + i = top_idx + 1; + if (!accept_punct(p, ',')) break; + continue; + } if (f->flags & FIELD_BITFIELD) { ++i; continue; } parse_static_init_at(p, buf, buflen, offset + L->fields[i].offset, f->type); @@ -2920,6 +3323,48 @@ static Sym mint_static_local_sym(Parser* p, Sym orig) { return pool_intern(p->pool, buf, wlen); } +/* If `ty` is an incomplete array (`T[]`), peek the initializer at p->cur + * and complete the type by counting the items it provides. Three cases: + * - `T` is a char-kind and the initializer is a string literal: count = + * decoded length (including NUL). + * - `{...}` initializer: record the braced range and count top-level + * items; positional only, no designators (sufficient for the corpus). + * After completion the parser is rewound to the recorded `{`. + * - Otherwise: panic (incomplete array with non-list init). + * Returns the completed array type. The caller should use this as the + * declared variable type going forward. */ +static const Type* complete_incomplete_array(Parser* p, const Type* ty) { + const Type* elem; + if (!ty || ty->kind != TY_ARRAY || !ty->arr.incomplete) return ty; + elem = ty->arr.elem; + if (is_char_kind(elem) && p->cur.kind == TOK_STR) { + Tok t = p->cur; + size_t n = 0; + u8* bytes = decode_string_literal(p, &t, &n); + p->c->env->heap->free(p->c->env->heap, bytes, 0); + return type_array(p->pool, elem, (u32)n, /*incomplete=*/0); + } + if (is_punct(&p->cur, '{')) { + u32 cnt; + record_braced_block(p); + cnt = count_recorded_top_level_items(p->replay, p->replay_len); + /* String literal as the sole brace contents is also valid: `char s[] = + * {"hi"}`. 
Detect by replay[1] being TOK_STR; recompute count from the + * decoded length. */ + if (cnt == 1 && p->replay_len >= 3 && p->replay[1].kind == TOK_STR && + is_char_kind(elem)) { + Tok t = p->replay[1]; + size_t n = 0; + u8* bytes = decode_string_literal(p, &t, &n); + p->c->env->heap->free(p->c->env->heap, bytes, 0); + cnt = (u32)n; + } + replay_rewind(p); + return type_array(p->pool, elem, cnt, /*incomplete=*/0); + } + perr(p, "initializer cannot complete incomplete array type"); +} + /* Parse a single init-declarator after the decl-specs have been consumed. * Grammar: declarator = (`*` qual*)* (IDENT | `(` declarator `)`) suffix* * init = `=` (assign_expr | brace_init) */ @@ -3012,7 +3457,20 @@ static void parse_init_declarator(Parser* p, const DeclSpecs* specs) { } /* Non-VLA local. */ { - FrameSlot s = make_local(p, name, var_ty, loc); + int has_init = is_punct(&p->cur, '='); + FrameSlot s; + if (has_init && var_ty && var_ty->kind == TY_ARRAY && var_ty->arr.incomplete) { + /* `T name[] = ...`: peek the initializer to deduce the count, then + * allocate the slot with the now-complete type. The slot allocation + * has to wait until after sizing, so move it inside this branch. */ + advance(p); /* '=' */ + var_ty = complete_incomplete_array(p, var_ty); + s = make_local(p, name, var_ty, loc); + cg_set_loc(p->cg, loc); + init_at(p, s, var_ty, 0, var_ty); + return; + } + s = make_local(p, name, var_ty, loc); if (accept_punct(p, '=')) { cg_set_loc(p->cg, loc); if (var_ty->kind == TY_ARRAY || var_ty->kind == TY_STRUCT || diff --git a/test/parse/CORPUS.md b/test/parse/CORPUS.md @@ -139,7 +139,7 @@ here for completeness once they're real cases. 
| `6_5_26_pre_dec` | ★ | `int x = 43; return --x;` | 42 | | `6_5_27_post_dec` | ★ | `int x = 43; x--; return x;` | 42 | | `6_5_28_arrow` | ★ | `struct S{int v;} s={42}; struct S *p=&s; return p->v;` | 42 | -| `6_5_29_compound_literal` | · | `int *p = (int[]){10, 32}; return p[0]+p[1];` | 42 | +| `6_5_29_compound_literal` | ★ | `int *p = (int[]){10, 32}; return p[0]+p[1];` | 42 | | `6_5_30_generic_selection`| ★ | `int x=42; return _Generic((x), int: x, default: 0);` | 42 | | `6_5_31_subscript_commute`| ★ | `int a[5]={0,0,42,0,0}; return 2[a];` | 42 | | `6_5_32_string_subscript` | ★ | `return "*"[0];` | 42 | @@ -263,15 +263,15 @@ cover compound typedef targets. | Case | Status | Body | Expected | |---|---|---|---| | `6_7_9_01_scalar_init` | ★ | `int x = 42; return x;` | 42 | -| `6_7_9_02_array_brace` | · | `int a[3] = {10, 20, 12}; return a[0]+a[1]+a[2];` | 42 | -| `6_7_9_03_partial_zero` | · | `int a[5] = {42}; return a[0] + a[4];` | 42 | -| `6_7_9_04_designated` | · | `int a[5] = {[2] = 42}; return a[2];` | 42 | -| `6_7_9_05_struct_init` | · | `struct S {int a,b;} s={40,2}; return s.a+s.b;` | 42 | -| `6_7_9_06_string_init` | · | `char s[] = "hi"; return s[0]+s[1]+s[2];` | 'h'+'i' | -| `6_7_9_07_designated_struct` | · | `struct S{int a,b,c;} s={.b=42}; return s.b;` | 42 | -| `6_7_9_08_nested_designated` | · | `int a[2][3] = {[1][2] = 42}; return a[1][2];` | 42 | -| `6_7_9_09_struct_in_array` | · | `struct P{int x,y;} a[2] = {{0,0},{0,42}}; return a[1].y;` | 42 | -| `6_7_9_10_zero_init_static` | · | full TU: `static int g[3]; int test_main(void){return g[0]+g[1]+g[2]+42;}` | 42 | +| `6_7_9_02_array_brace` | ★ | `int a[3] = {10, 20, 12}; return a[0]+a[1]+a[2];` | 42 | +| `6_7_9_03_partial_zero` | ★ | `int a[5] = {42}; return a[0] + a[4];` | 42 | +| `6_7_9_04_designated` | ★ | `int a[5] = {[2] = 42}; return a[2];` | 42 | +| `6_7_9_05_struct_init` | ★ | `struct S {int a,b;} s={40,2}; return s.a+s.b;` | 42 | +| `6_7_9_06_string_init` | ★ | `char s[] = "hi"; return 
s[0]+s[1]+s[2];` | 'h'+'i' | +| `6_7_9_07_designated_struct` | ★ | `struct S{int a,b,c;} s={.b=42}; return s.b;` | 42 | +| `6_7_9_08_nested_designated` | ★ | `int a[2][3] = {[1][2] = 42}; return a[1][2];` | 42 | +| `6_7_9_09_struct_in_array` | ★ | `struct P{int x,y;} a[2] = {{0,0},{0,42}}; return a[1].y;` | 42 | +| `6_7_9_10_zero_init_static` | ★ | full TU: `static int g[3]; int test_main(void){return g[0]+g[1]+g[2]+42;}` | 42 | ## §6.7.10 Static assertions