Fix AArch64 large frame offset emission - kit

commit a7442212be4d2e3a72cad198038e1f75103ac0bb
parent e3cc40502d041ed5d803683ac0afc4edf7b641ef
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 15 May 2026 14:51:25 -0700

Fix AArch64 large frame offset emission

Diffstat:
M doc/OPT1.md  | 24 +++++++++---------------
M src/arch/aa64/emit.c  | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M src/arch/aa64/internal.h  | 13 +++++++++++--
M src/arch/aa64/ops.c  | 30 +++++++++++++++++++-----------
M test/opt/phase0_guardrails.sh  | 40 ++++++++++++++++++++++++++++++++++++++++

5 files changed, 183 insertions(+), 53 deletions(-)
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -287,18 +287,13 @@ mov w19, w21
 MIR's O1 path suggests these high-value local cleanups that still fit cfree's
 fast tier:
 
-1. Reduce and fix the spill-heavy JIT/runtime crash.
-   The nonconstant wide-local spill probe returned correctly for many sizes but
-   segfaulted at nearby large sizes. Isolate this to a small parse or CG API
-   testcase before doing more spill-pressure perf work.
-
-2. Clean up local branch layout artifacts.
+1. Clean up local branch layout artifacts.
    MIR's full jump optimizer is O2-only, but its cheap pieces are appropriate
    for O1: delete branches to immediate fallthrough blocks, forward
    branch-to-branch targets, and invert a branch when it removes an
    unconditional jump. Avoid full CFG layout work.
 
-3. Promote remaining scalar entry slots before backend allocation.
+2. Promote remaining scalar entry slots before backend allocation.
    MIR's C frontend represents normal scalar block locals as MIR registers and
    leaves stack slots for aggregates, forced-stack cases, and address-taken
    values. O1 now keeps simple loop locals in registers in the probe, but still
@@ -306,33 +301,32 @@ fast tier:
    pass should promote remaining integer/pointer scalars whose address does not
    escape, starting with parameters and single-entry structured control flow.
 
-4. Avoid unnecessary callee-save traffic.
+3. Avoid unnecessary callee-save traffic.
    Reserve and preserve only hard registers that survive final post-rewrite
    cleanup, and consider caller-saved registers for values that are not live
    across calls. This would make small leaf functions much closer to expected
    O1 output without requiring global optimization.
 
-5. Continue tightening post-rewrite DCE and copy cleanup.
+4. Continue tightening post-rewrite DCE and copy cleanup.
    Model hard-register call arguments and call clobbers precisely enough to
    delete dead caller-saved defs before calls without removing required ABI
    traffic. Also fold single-use arithmetic temporaries into their destination
    when target constraints allow it.
 
-6. Keep compare-branch fusion covered by tests.
+5. Keep compare-branch fusion covered by tests.
    The current probes show direct `cmp` plus branch shapes for branch-consuming
    compares on AArch64. Add focused regression coverage so the old
    `cmp; cset; cmp #0; b.cond` bridge does not return.
 
-7. Keep tiny local constant simplification bounded.
+6. Keep tiny local constant simplification bounded.
    The vstack constfold path now removes obvious immediate and straight-line
    scalar-local code before allocation. Keep it basic-block-local; broader
    propagation belongs in the O2 SSA/value optimizer.
 
-8. Watch spill-pressure regalloc slope.
+7. Watch spill-pressure regalloc slope.
    Normal-path scaling is linear, but heavy spill pressure still bends slightly
-   in regalloc time. After the correctness crash is fixed, rerun the
-   nonconstant wide-local ladder and decide whether interval probing or stack
-   slot assignment needs another narrow cleanup.
+   in regalloc time. Rerun the nonconstant wide-local ladder and decide whether
+   interval probing or stack slot assignment needs another narrow cleanup.
 
 - Keep `opt_combine` legality target-aware.
   Existing one-use copy/immediate/convert folds should stay conservative. New
diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c
@@ -287,9 +287,34 @@ static u32 aa_build_prologue(CGTarget *t, u32 *words, u32 cap, u32 frame_size,
     words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1);
     words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0);
   }
-  if (wi + 2 > cap) goto overflow;
-  words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
-  words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
+  if (fp_lr_off <= 504u) {
+    if (wi >= cap) goto overflow;
+    words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
+  } else {
+    if (wi + 2 > cap) goto overflow;
+    words[wi++] = aa64_str_uimm(3, 29, 31, fp_lr_off);
+    words[wi++] = aa64_str_uimm(3, 30, 31, fp_lr_off + 8u);
+  }
+  if (wi >= cap) goto overflow;
+  if (fp_lr_off <= 0xfffu) {
+    words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
+  } else if ((fp_lr_off >> 24) == 0) {
+    u32 hi = (fp_lr_off >> 12) & 0xfffu;
+    u32 lo = fp_lr_off & 0xfffu;
+    if (hi) {
+      words[wi++] = aa64_add_imm(1, 29, 31, hi, 1);
+      if (lo) {
+        if (wi >= cap) goto overflow;
+        words[wi++] = aa64_add_imm(1, 29, 29, lo, 0);
+      }
+    } else if (lo) {
+      words[wi++] = aa64_add_imm(1, 29, 31, lo, 0);
+    }
+  } else {
+    compiler_panic(t->c, a->loc,
+                   "aarch64: fp/lr offset %u out of prologue range",
+                   fp_lr_off);
+  }
   if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
     AASlot *s = aa64_slot_get(a, a->sret_ptr_slot);
     if (s) {
@@ -395,7 +420,12 @@ void aa_func_end(CGTarget *t) {
     u32 r0 = int_regs[i];
     aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31, int_save_off + (u32)i * 8u));
   }
-  aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
+  if (fp_lr_off <= 504u) {
+    aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
+  } else {
+    aa64_emit32(mc, aa64_ldr_uimm(3, 29, 31, fp_lr_off));
+    aa64_emit32(mc, aa64_ldr_uimm(3, 30, 31, fp_lr_off + 8u));
+  }
   emit_sp_add(mc, frame_size);
   aa64_emit32(mc, aa64_ret(AA64_LR));
 
@@ -487,29 +517,34 @@ void aa_param(CGTarget *t, const CGParamDesc *p) {
     } else {
       u32 caller_off = a->next_param_stack;
       a->next_param_stack += 8;
-      aa64_emit32(t->mc, aa64_ldur(3, AA_TMP0, 29, (i32)(16 + caller_off)));
+      aa64_emit_ldur_off(t->mc, 3, AA_TMP0, 29, (i32)(16 + caller_off),
+                         AA_TMP0);
       ptr_reg = AA_TMP0;
     }
     u32 nbytes = s->size;
     u32 i = 0;
     while (i + 8 <= nbytes) {
-      aa64_emit32(t->mc, aa64_ldur(3, AA_TMP1, ptr_reg, (i32)i));
-      aa64_emit32(t->mc, aa64_stur(3, AA_TMP1, 29, -(i32)s->off + (i32)i));
+      aa64_emit_ldur_off(t->mc, 3, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
+      aa64_emit_stur_off(t->mc, 3, AA_TMP1, 29, -(i32)s->off + (i32)i,
+                         AA_TMP2);
       i += 8;
     }
     while (i + 4 <= nbytes) {
-      aa64_emit32(t->mc, aa64_ldur(2, AA_TMP1, ptr_reg, (i32)i));
-      aa64_emit32(t->mc, aa64_stur(2, AA_TMP1, 29, -(i32)s->off + (i32)i));
+      aa64_emit_ldur_off(t->mc, 2, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
+      aa64_emit_stur_off(t->mc, 2, AA_TMP1, 29, -(i32)s->off + (i32)i,
+                         AA_TMP2);
       i += 4;
     }
     while (i + 2 <= nbytes) {
-      aa64_emit32(t->mc, aa64_ldur(1, AA_TMP1, ptr_reg, (i32)i));
-      aa64_emit32(t->mc, aa64_stur(1, AA_TMP1, 29, -(i32)s->off + (i32)i));
+      aa64_emit_ldur_off(t->mc, 1, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
+      aa64_emit_stur_off(t->mc, 1, AA_TMP1, 29, -(i32)s->off + (i32)i,
+                         AA_TMP2);
       i += 2;
     }
     while (i < nbytes) {
-      aa64_emit32(t->mc, aa64_ldur(0, AA_TMP1, ptr_reg, (i32)i));
-      aa64_emit32(t->mc, aa64_stur(0, AA_TMP1, 29, -(i32)s->off + (i32)i));
+      aa64_emit_ldur_off(t->mc, 0, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
+      aa64_emit_stur_off(t->mc, 0, AA_TMP1, 29, -(i32)s->off + (i32)i,
+                         AA_TMP2);
       i += 1;
     }
     return;
@@ -523,28 +558,28 @@ void aa_param(CGTarget *t, const CGParamDesc *p) {
     if (pt->cls == ABI_CLASS_INT) {
       if (a->next_param_int < 8) {
         u32 reg = a->next_param_int++;
-        aa64_emit32(t->mc,
-                    aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off));
+        aa64_emit_stur_off(t->mc, sidx, reg, 29,
+                           -(i32)s->off + (i32)part_off, AA_TMP0);
       } else {
         u32 caller_off = a->next_param_stack;
         a->next_param_stack += 8;
-        aa64_emit32(t->mc,
-                    aa64_ldur(sidx, AA_TMP0, 29, (i32)(16 + caller_off)));
-        aa64_emit32(t->mc,
-                    aa64_stur(sidx, AA_TMP0, 29, -(i32)s->off + (i32)part_off));
+        aa64_emit_ldur_off(t->mc, sidx, AA_TMP0, 29,
+                           (i32)(16 + caller_off), AA_TMP0);
+        aa64_emit_stur_off(t->mc, sidx, AA_TMP0, 29,
+                           -(i32)s->off + (i32)part_off, AA_TMP1);
       }
     } else if (pt->cls == ABI_CLASS_FP) {
       if (a->next_param_fp < 8) {
         u32 reg = a->next_param_fp++;
-        aa64_emit32(t->mc,
-                    aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off));
+        aa64_emit_stur_fp_off(t->mc, sidx, reg, 29,
+                              -(i32)s->off + (i32)part_off, AA_TMP0);
       } else {
         u32 caller_off = a->next_param_stack;
         a->next_param_stack += 8;
-        aa64_emit32(t->mc,
-                    aa64_ldur_fp(sidx, AA_FP_TMP0, 29, (i32)(16 + caller_off)));
-        aa64_emit32(t->mc, aa64_stur_fp(sidx, AA_FP_TMP0, 29,
-                                        -(i32)s->off + (i32)part_off));
+        aa64_emit_ldur_fp_off(t->mc, sidx, AA_FP_TMP0, 29,
+                              (i32)(16 + caller_off), AA_TMP0);
+        aa64_emit_stur_fp_off(t->mc, sidx, AA_FP_TMP0, 29,
+                              -(i32)s->off + (i32)part_off, AA_TMP0);
       }
     } else {
       compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl",
@@ -623,3 +658,47 @@ void aa64_emit_addr_adjust(MCEmitter *mc, u32 Rd, u32 base, i32 off) {
   aa64_emit_load_imm(mc, 1, Rd, off);
   aa64_emit32(mc, aa64_add(1, Rd, base, Rd));
 }
+
+static int aa64_simm9_fits(i32 off) { /* nonzero iff off can be held in a signed 9-bit field */
+  return off >= -256 && off <= 255; /* [-256, 255] is the two's-complement range of 9 bits */
+}
+
+void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, /* emit an aa64_ldur load of Rt from Rn + off into mc */
+                        u32 tmp) { /* tmp: register overwritten with the address when off is outside [-256, 255] */
+  if (aa64_simm9_fits(off)) { /* small offset: hand it to aa64_ldur unchanged, tmp is not touched */
+    aa64_emit32(mc, aa64_ldur(size, Rt, Rn, off));
+    return;
+  }
+  aa64_emit_addr_adjust(mc, tmp, Rn, off); /* NOTE(review): presumably leaves Rn + off in tmp; tmp likely must not alias Rn -- confirm */
+  aa64_emit32(mc, aa64_ldur(size, Rt, tmp, 0)); /* load through tmp with offset 0; some callers pass the same register for tmp and Rt */
+}
+
+void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, /* emit an aa64_stur store of Rt to Rn + off into mc */
+                        u32 tmp) { /* tmp: register overwritten with the address when off is outside [-256, 255] */
+  if (aa64_simm9_fits(off)) { /* small offset: hand it to aa64_stur unchanged, tmp is not touched */
+    aa64_emit32(mc, aa64_stur(size, Rt, Rn, off));
+    return;
+  }
+  aa64_emit_addr_adjust(mc, tmp, Rn, off); /* NOTE(review): tmp is written before the store reads Rt, so tmp presumably must differ from Rt and Rn -- confirm */
+  aa64_emit32(mc, aa64_stur(size, Rt, tmp, 0)); /* store through tmp with offset 0 */
+}
+
+void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, /* emit an aa64_ldur_fp load of Rt from Rn + off into mc */
+                           u32 tmp) { /* tmp: register overwritten with the address when off is outside [-256, 255] */
+  if (aa64_simm9_fits(off)) { /* small offset: hand it to aa64_ldur_fp unchanged, tmp is not touched */
+    aa64_emit32(mc, aa64_ldur_fp(size, Rt, Rn, off));
+    return;
+  }
+  aa64_emit_addr_adjust(mc, tmp, Rn, off); /* NOTE(review): presumably leaves Rn + off in tmp; tmp likely must not alias Rn -- confirm */
+  aa64_emit32(mc, aa64_ldur_fp(size, Rt, tmp, 0)); /* load through tmp with offset 0 */
+}
+
+void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, /* emit an aa64_stur_fp store of Rt to Rn + off into mc */
+                           u32 tmp) { /* tmp: register overwritten with the address when off is outside [-256, 255] */
+  if (aa64_simm9_fits(off)) { /* small offset: hand it to aa64_stur_fp unchanged, tmp is not touched */
+    aa64_emit32(mc, aa64_stur_fp(size, Rt, Rn, off));
+    return;
+  }
+  aa64_emit_addr_adjust(mc, tmp, Rn, off); /* NOTE(review): presumably leaves Rn + off in tmp; tmp likely must not alias Rn -- confirm */
+  aa64_emit32(mc, aa64_stur_fp(size, Rt, tmp, 0)); /* store through tmp with offset 0 */
+}
diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h
@@ -202,8 +202,9 @@ static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
  * ============================================================ */
 
 #define AA_PROLOGUE_WORDS \
-  23u /* worst case: sub sp + stp/add fp + sret + 10 int + 8 fp saves */
-#define AA_PROLOGUE_FRAME_WORDS 4u /* worst-case frame adjust + stp/add fp */
+  25u /* worst case: sub sp + str/str/add-add fp + sret + 10 int + 8 fp */
+#define AA_PROLOGUE_FRAME_WORDS \
+  6u /* worst-case frame adjust + split fp/lr saves + add-add fp */
 
 typedef struct AASlot {
   u32 off;
@@ -281,6 +282,14 @@ void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word);
 void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm);
 void emit_sp_add(MCEmitter* mc, u32 imm);
 void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off);
+void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+                        u32 tmp);
+void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+                        u32 tmp);
+void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+                           u32 tmp);
+void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+                           u32 tmp);
 void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym);
 void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend);
 
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -839,7 +839,7 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi,
           AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
           if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot");
           i32 off = -(i32)s->off + (i32)pt->src_offset;
-          aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off));
+          aa64_emit_ldur_off(t->mc, sidx, dst_reg, 29, off, dst_reg);
           break;
         }
         case OPK_INDIRECT: {
@@ -1095,22 +1095,26 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) {
         u32 nbytes = s->size;
         u32 i = 0;
         while (i + 8 <= nbytes) {
-          aa64_emit32(mc, aa64_ldur(3, AA_TMP0, 29, -(i32)s->off + (i32)i));
+          aa64_emit_ldur_off(mc, 3, AA_TMP0, 29, -(i32)s->off + (i32)i,
+                             AA_TMP0);
           aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i));
           i += 8;
         }
         while (i + 4 <= nbytes) {
-          aa64_emit32(mc, aa64_ldur(2, AA_TMP0, 29, -(i32)s->off + (i32)i));
+          aa64_emit_ldur_off(mc, 2, AA_TMP0, 29, -(i32)s->off + (i32)i,
+                             AA_TMP0);
           aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i));
           i += 4;
         }
         while (i + 2 <= nbytes) {
-          aa64_emit32(mc, aa64_ldur(1, AA_TMP0, 29, -(i32)s->off + (i32)i));
+          aa64_emit_ldur_off(mc, 1, AA_TMP0, 29, -(i32)s->off + (i32)i,
+                             AA_TMP0);
           aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i));
           i += 2;
         }
         while (i < nbytes) {
-          aa64_emit32(mc, aa64_ldur(0, AA_TMP0, 29, -(i32)s->off + (i32)i));
+          aa64_emit_ldur_off(mc, 0, AA_TMP0, 29, -(i32)s->off + (i32)i,
+                             AA_TMP0);
           aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i));
           i += 1;
         }
@@ -1128,22 +1132,26 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) {
         i32 base_off = val->storage.v.ind.ofs;
         u32 i = 0;
         while (i + 8 <= nbytes) {
-          aa64_emit32(mc, aa64_ldur(3, AA_TMP0, base_reg, base_off + (i32)i));
+          aa64_emit_ldur_off(mc, 3, AA_TMP0, base_reg, base_off + (i32)i,
+                             AA_TMP0);
           aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i));
           i += 8;
         }
         while (i + 4 <= nbytes) {
-          aa64_emit32(mc, aa64_ldur(2, AA_TMP0, base_reg, base_off + (i32)i));
+          aa64_emit_ldur_off(mc, 2, AA_TMP0, base_reg, base_off + (i32)i,
+                             AA_TMP0);
           aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i));
           i += 4;
         }
         while (i + 2 <= nbytes) {
-          aa64_emit32(mc, aa64_ldur(1, AA_TMP0, base_reg, base_off + (i32)i));
+          aa64_emit_ldur_off(mc, 1, AA_TMP0, base_reg, base_off + (i32)i,
+                             AA_TMP0);
           aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i));
           i += 2;
         }
         while (i < nbytes) {
-          aa64_emit32(mc, aa64_ldur(0, AA_TMP0, base_reg, base_off + (i32)i));
+          aa64_emit_ldur_off(mc, 0, AA_TMP0, base_reg, base_off + (i32)i,
+                             AA_TMP0);
           aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i));
           i += 1;
         }
@@ -1182,9 +1190,9 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) {
         u32 sidx = size_idx_for_bytes(pt->size);
         i32 off = base_off + (i32)pt->src_offset;
         if (pt->cls == ABI_CLASS_INT) {
-          aa64_emit32(mc, aa64_ldur(sidx, /*Rt=*/i, base_reg, off));
+          aa64_emit_ldur_off(mc, sidx, /*Rt=*/i, base_reg, off, AA_TMP0);
         } else if (pt->cls == ABI_CLASS_FP) {
-          aa64_emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, base_reg, off));
+          aa64_emit_ldur_fp_off(mc, sidx, /*Rt=*/i, base_reg, off, AA_TMP0);
         } else {
           compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl",
                          (int)pt->cls);
diff --git a/test/opt/phase0_guardrails.sh b/test/opt/phase0_guardrails.sh
@@ -100,6 +100,37 @@ write_large_straight_line() {
   } >"$TMP/large_straight_line.c"
 }
 
+write_many_stack_args_o1() {  # writes many_stack_args_o1.c under TMP: one call with nargs int arguments plus a self-check
+  local nargs=96  # number of int parameters declared on sink_many
+  local expected=0  # value the generated sink_many should return, accumulated below
+  {
+    printf 'static int sink_many('
+    for i in $(seq 0 $((nargs - 1))); do  # parameter list: int a0, int a1, ...
+      if [ "$i" -gt 0 ]; then printf ','; fi
+      printf 'int a%d' "$i"
+    done
+    printf ') {\n  int s = 0;\n'
+    for i in $(seq 0 $((nargs - 1))); do  # one accumulate statement per parameter
+      local p=$(( (i * 17 + 3) % 101 + 1 ))  # per-argument multiplier, always in 1..101
+      expected=$(( (expected + ((5 + i) & 255) * p) & 255 ))  # same update as the emitted C line, using the value main passes for argument i
+      printf '  s = (s + a%d * %d) & 255;\n' "$i" "$p"
+    done
+    printf '  return s;\n}\n'
+    printf 'int main() {\n'
+    for i in $(seq 0 $((nargs - 1))); do  # one local per argument, initialised to (5 + i) & 255
+      printf '  int v%d = (5 + %d) & 255;\n' "$i" "$i"
+    done
+    printf '  int s = sink_many('
+    for i in $(seq 0 $((nargs - 1))); do  # argument list: v0, v1, ...
+      if [ "$i" -gt 0 ]; then printf ','; fi
+      printf 'v%d' "$i"
+    done
+    printf ');\n'
+    printf '  return s == %d ? 0 : 1;\n' "$expected"  # generated main returns 0 only when the sum matches
+    printf '}\n'
+  } >"$TMP/many_stack_args_o1.c"
+}
+
 run_case() {
   local name="$1"
   local src="$2"
@@ -109,6 +140,13 @@ run_case() {
   printf 'phase0 %-24s O0/O1 OK\n' "$name"
 }
 
+run_o1_case() {  # runs one source through BIN with -O1 only, then prints a status line
+  local name="$1"  # label shown in the status line
+  local src="$2"  # path of the source file handed to BIN
+  "$BIN" run -O1 "$src" >/dev/null  # stdout of the run is discarded; NOTE(review): a failing run presumably aborts the script via shell options set elsewhere -- confirm
+  printf 'phase0 %-24s O1 OK\n' "$name"
+}
+
 check_metrics() {
   local src="$TMP/branch_liveness.c"
   local err="$TMP/metrics.err"
@@ -160,6 +198,7 @@ write_late_addrof_join
 write_spills
 write_many_small_functions
 write_large_straight_line
+write_many_stack_args_o1
 
 run_case branch_liveness "$TMP/branch_liveness.c"
 run_case call_clobber "$TMP/call_clobber.c"
@@ -168,6 +207,7 @@ run_case late_addrof_join "$TMP/late_addrof_join.c"
 run_case spills "$TMP/spills.c"
 run_case many_small_functions "$TMP/many_small_functions.c"
 run_case large_straight_line "$TMP/large_straight_line.c"
+run_o1_case many_stack_args "$TMP/many_stack_args_o1.c"
 check_metrics
 
 printf 'phase0 identified inline-asm stress: test/parse/cases/asm_01_grammar.c\n'

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	doc/OPT1.md	\|	24	+++++++++---------------
M	src/arch/aa64/emit.c	\|	129	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M	src/arch/aa64/internal.h	\|	13	+++++++++++--
M	src/arch/aa64/ops.c	\|	30	+++++++++++++++++++-----------
M	test/opt/phase0_guardrails.sh	\|	40	++++++++++++++++++++++++++++++++++++++++