commit 3a249786f7441806f9ba1eb92ce1dd40ba07da04
parent b998131d3551a570801f5028d6f55683a4cdac69
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 21 May 2026 14:12:41 -0700
Implement O2 allocation coalescing and splitting
Diffstat:
10 files changed, 941 insertions(+), 85 deletions(-)
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -471,6 +471,8 @@ The optimizer is no longer just a stub:
- `src/opt/pass_ssa.c` implements pseudo-register SSA construction, mem2reg
SSA construction, conventional SSA edge-copy lowering, SSA undo, and stable
SSA dumps for pass tests.
+- `src/opt/pass_coalesce.c` implements O2 move collection, move-related
+ conflict analysis, and conservative coalescing.
- `src/opt/pass_analysis.c` owns shared CFG order, dominators, dominance
frontiers, dominator children, def-use rebuilding, and verifier guardrails.
- `cfree cc` forwards `-O0`/`-O1`/`-O2` into `CfreeCodeOptions`, so native
@@ -483,14 +485,26 @@ Current behavior:
- Level `1` records IR, builds CFG, machinizes, builds loop frequencies,
computes PReg liveness and live ranges, runs the current priority
allocator/rewrite path over mutable PRegs, combines, runs post-RA DCE,
- cleans jumps/layout, and emits through the wrapped target.
+ cleans jumps/layout, and emits through the wrapped target. This path keeps
+ whole-PReg allocation and does not run coalescing or splitting.
- Level `2` has its own scheduler entry point. It now converts mutable
pseudo-registers to SSA values, builds real mem2reg SSA, verifies phi/use-def
invariants, runs the first small O2 transforms, lowers phis through
- conventional SSA, undoes SSA, and then routes through the same proven
- lowering path as `-O1`.
-- `opt_regalloc(..., allow_live_range_split)` accepts the O2 policy bit, but
- live-range splitting is not implemented yet.
+ conventional SSA, undoes SSA, and then routes through the shared lowering
+ path with O2 allocation policy enabled.
+- `opt_regalloc(..., allow_live_range_split)` now uses the O2 policy bit to
+ enable move coalescing and singleton live-range splitting. Coalescing only
+ considers valid same-class, same-type `IR_COPY` pairs, weights moves by block
+ frequency, builds a bounded conflict matrix for move-related PRegs, and
+ rejects incompatible tied-register or forbidden-register constraints.
+- O2 allocation assigns coalesced groups as a unit. Every member receives the
+ same hard register or spill slot, so coalesced copies rewrite to hard-reg
+ self copies and are removed by post-RA combine/DCE.
+- O2 live-range splitting is intentionally conservative in this slice: only
+ singleton allocation groups with multiple live ranges can split. Split PRegs
+ get a canonical spill home plus per-range segment records. Hard segments
+ insert boundary reload/store traffic through the existing rewrite buffers;
+ coalesced groups, call-crossing values, and O1 allocations remain whole.
- `opt_build_ssa` is no longer just a shape check: promoted loads/stores are
rewritten to SSA values, phis have real `Val` defs and predecessor inputs,
and def-use chains are rebuilt for downstream SSA passes.
@@ -627,28 +641,41 @@ Exit criteria:
### Phase C - Full Allocation Infrastructure
-Status: pending quality work. The current priority allocator/rewrite path is
-shared by O1 and O2 lowering and accepts the O2 live-range-splitting policy
-bit, but coalescing and splitting are not implemented yet. Several small O2
-SSA cleanup passes landed first; this phase remains the allocation-quality
-milestone before heavier value/memory optimization.
+Status: implemented as the first O2 allocation-quality slice. O1 keeps the
+existing whole-PReg priority allocator. O2 enables conservative move
+coalescing, coalesced-group allocation, and singleton live-range splitting
+without changing public `libcfree` APIs or adding CFG edge splitting.
Goal: bring `-O2` allocation quality up to MIR's model before value passes
start changing code aggressively.
Implement:
-- Move collection.
-- Move-only liveness.
-- Conflict matrix for move-related regs.
-- Aggressive coalescing.
-- Live-range splitting and edge/block spill placement.
+- [x] Move collection.
+- [x] Bounded conflict matrix for move-related regs using existing live-range
+ overlap data.
+- [x] Conservative coalescing for same-class, same-type copies with compatible
+ fixed-register and forbidden-register constraints.
+- [x] Coalesced-group allocation before singleton allocation.
+- [x] O2-only split allocation representation with per-PReg segment lists and
+ canonical spill homes.
+- [x] Conservative singleton live-range splitting when whole hard allocation
+ fails.
+- [x] Boundary reload/store insertion for hard split segments through rewrite.
+- [ ] Critical-edge CFG mutation for spill placement. Current placement stays
+ at unambiguous block/range boundaries.
Exit criteria:
-- `-O1` remains green.
-- `-O2` can use the same lowering path with coalescing/splitting enabled, even
+- [x] `-O1` remains green.
+- [x] `-O2` can use the same lowering path with coalescing/splitting enabled,
while preserving the current small SSA cleanup passes.
+- [x] Focused `make test-opt` coverage for coalescing, conflict refusal,
+ incompatible constraints, O1 preservation, and singleton splitting.
+- [x] Toy pressure fixture:
+ `CFREE_TEST_FILTER=129_o2_pressure_coalesce CFREE_TOY_OPT_LEVELS="0 1 2"
+ make test-toy`
+- [x] Full `CFREE_TOY_OPT_LEVELS="0 1 2" make test-toy`
### Phase D - SSA-Backed Value and Memory Passes
@@ -786,7 +813,7 @@ src/opt/
pass_machinize.c target ABI and machine-form lowering
pass_live.c liveness, pressure, live ranges
pass_lower.c current assignment/rewrite implementation
- pass_coalesce.c move collection and coalescing (split out when implemented)
+ pass_coalesce.c move collection and coalescing
pass_regalloc.c assignment, rewrite, splitting (split out when warranted)
pass_combine.c target-aware code selection
pass_emit.c drive wrapped CGTarget
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -325,8 +325,24 @@ typedef enum OptAllocKind {
OPT_ALLOC_NONE,
OPT_ALLOC_HARD,
OPT_ALLOC_SPILL,
+ OPT_ALLOC_SPLIT,
} OptAllocKind;
+typedef struct OptAllocSegment {
+ u32 start;
+ u32 end;
+ u32 block;
+ u8 loc_kind;
+ u8 cls;
+ Reg hard_reg;
+ FrameSlot spill_slot;
+ FrameSlot spill_home;
+ u8 reload_at_start;
+ u8 store_at_end;
+ u8 pad[2];
+ u32 next;
+} OptAllocSegment;
+
typedef struct OptPRegInfo {
u32 first_pos;
u32 last_pos;
@@ -435,8 +451,19 @@ typedef struct Func {
u64 opt_rewrite_inserted_insts;
u64 opt_rewrite_live_words_touched;
u64 opt_dde_live_words_touched;
+ u64 opt_coalesce_moves_seen;
+ u64 opt_coalesce_candidates;
+ u64 opt_coalesce_conflicts;
+ u64 opt_coalesce_merge_attempts;
+ u64 opt_coalesce_merges;
InstId next_inst_id;
OptPRegInfo* preg_info; /* indexed by the current allocation reg namespace */
+ u32* opt_coalesce_parent;
+ u32* opt_coalesce_size;
+ OptAllocSegment* opt_alloc_segments;
+ u32 opt_nalloc_segments;
+ u32 opt_alloc_segments_cap;
+ u32* opt_first_segment_by_preg;
OptUse* opt_uses;
u32 opt_nuses, opt_uses_cap;
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -107,6 +107,8 @@ typedef struct OptLiveRange {
PReg preg;
u32 start;
u32 end;
+ u32 raw_start;
+ u32 raw_end;
u32 next;
u32 block;
u8 whole_block;
diff --git a/src/opt/opt_internal.h b/src/opt/opt_internal.h
@@ -111,6 +111,7 @@ OptHardBlockLive* opt_maybe_build_hard_live(Func*);
OptHardRegSet opt_hard_live_out_for_block(const OptHardBlockLive*);
int opt_block_live_out_has_phys_reg(Func*, const OptHardBlockLive*, u32 block,
const Operand*);
+void opt_coalesce_ranges(Func*, const OptLiveRangeSet*);
void opt_replay(Compiler*, Func*, CGTarget* target);
diff --git a/src/opt/pass_coalesce.c b/src/opt/pass_coalesce.c
@@ -0,0 +1,292 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "core/arena.h"
+#include "core/metrics.h"
+#include "opt/opt_internal.h"
+
+typedef struct CoalesceMove {
+ PReg dst;
+ PReg src;
+ u32 weight;
+} CoalesceMove;
+
+typedef struct CoalesceCtx {
+ Func* f;
+ const OptLiveRangeSet* ranges;
+ PReg* related;
+ u32 nrelated;
+ u32 related_cap;
+ u32* related_index;
+ u64* conflicts;
+ u64* unit_conflicts;
+ u32 conflict_words;
+} CoalesceCtx;
+
+static PReg coalesce_find(Func* f, PReg v) {
+ if (!f->opt_coalesce_parent || v == PREG_NONE || v >= opt_reg_count(f))
+ return v;
+ PReg p = (PReg)f->opt_coalesce_parent[v];
+ while (p != f->opt_coalesce_parent[p]) p = (PReg)f->opt_coalesce_parent[p];
+ while (v != p) {
+ PReg n = (PReg)f->opt_coalesce_parent[v];
+ f->opt_coalesce_parent[v] = p;
+ v = n;
+ }
+ return p;
+}
+
+static void coalesce_add_related(CoalesceCtx* c, PReg v) {
+ Func* f = c->f;
+ if (!opt_reg_valid(f, v)) return;
+ if (c->related_index[v] != OPT_RANGE_NONE) return;
+ if (c->nrelated == c->related_cap) {
+ u32 ncap = c->related_cap ? c->related_cap * 2u : 32u;
+ PReg* nr = arena_array(f->arena, PReg, ncap);
+ if (c->related) memcpy(nr, c->related, sizeof(c->related[0]) * c->nrelated);
+ c->related = nr;
+ c->related_cap = ncap;
+ }
+ c->related_index[v] = c->nrelated;
+ c->related[c->nrelated++] = v;
+}
+
+static int ranges_overlap_kind(const OptLiveRangeSet* ranges, PReg a, PReg b) {
+ int unit_only = 0;
+ for (u32 ar = ranges->first_range_by_preg[a]; ar != OPT_RANGE_NONE;
+ ar = ranges->ranges[ar].next) {
+ const OptLiveRange* ra = &ranges->ranges[ar];
+ for (u32 br = ranges->first_range_by_preg[b]; br != OPT_RANGE_NONE;
+ br = ranges->ranges[br].next) {
+ const OptLiveRange* rb = &ranges->ranges[br];
+ if (ra->start < rb->end && rb->start < ra->end) {
+ u32 start = ra->start > rb->start ? ra->start : rb->start;
+ u32 end = ra->end < rb->end ? ra->end : rb->end;
+ if (end > start + 1u) return 2;
+ unit_only = 1;
+ }
+ }
+ }
+ return unit_only ? 1 : 0;
+}
+
+static void coalesce_set_conflict_bit(u64* bits, u32 nrelated, u32 a, u32 b) {
+ if (!bits) return;
+ if (a == b) return;
+ u32 x = a < b ? a : b;
+ u32 y = a < b ? b : a;
+ u32 bit = x * nrelated + y;
+ bits[bit / 64u] |= 1ull << (bit % 64u);
+}
+
+static void coalesce_set_conflict(CoalesceCtx* c, u32 a, u32 b, int unit) {
+ coalesce_set_conflict_bit(c->conflicts, c->nrelated, a, b);
+ if (unit) coalesce_set_conflict_bit(c->unit_conflicts, c->nrelated, a, b);
+}
+
+static int coalesce_has_bit(const CoalesceCtx* c, const u64* bits, PReg a,
+ PReg b) {
+ if (!bits || a >= opt_reg_count(c->f) || b >= opt_reg_count(c->f)) return 1;
+ u32 ai = c->related_index[a];
+ u32 bi = c->related_index[b];
+ if (ai == OPT_RANGE_NONE || bi == OPT_RANGE_NONE) return 0;
+ if (ai == bi) return 0;
+ u32 x = ai < bi ? ai : bi;
+ u32 y = ai < bi ? bi : ai;
+ u32 bit = x * c->nrelated + y;
+ return (bits[bit / 64u] & (1ull << (bit % 64u))) != 0;
+}
+
+static int coalesce_has_conflict(const CoalesceCtx* c, PReg a, PReg b) {
+ return coalesce_has_bit(c, c->conflicts, a, b);
+}
+
+static int coalesce_has_unit_conflict(const CoalesceCtx* c, PReg a, PReg b) {
+ return coalesce_has_bit(c, c->unit_conflicts, a, b);
+}
+
+static int group_conflicts(const CoalesceCtx* c, PReg ra, PReg rb, PReg allow_a,
+ PReg allow_b) {
+ for (u32 i = 0; i < c->nrelated; ++i) {
+ PReg a = c->related[i];
+ if (coalesce_find(c->f, a) != ra) continue;
+ for (u32 j = 0; j < c->nrelated; ++j) {
+ PReg b = c->related[j];
+ if (coalesce_find(c->f, b) != rb) continue;
+ if (((a == allow_a && b == allow_b) || (a == allow_b && b == allow_a)) &&
+ coalesce_has_unit_conflict(c, a, b))
+ continue;
+ if (coalesce_has_conflict(c, a, b)) return 1;
+ }
+ }
+ return 0;
+}
+
+static int hard_reg_possible(Func* f, u8 cls, u32 forbidden) {
+ for (u32 i = 0; i < f->opt_hard_reg_count[cls]; ++i) {
+ Reg r = f->opt_hard_regs[cls][i];
+ if (r < 32 && (forbidden & (1u << r)) == 0) return 1;
+ }
+ return f->opt_hard_reg_count[cls] == 0;
+}
+
+static int group_constraints_compatible(const CoalesceCtx* c, PReg ra,
+ PReg rb) {
+ Func* f = c->f;
+ u8 cls = opt_reg_cls(f, ra);
+ CfreeCgTypeId type = opt_reg_type(f, ra);
+ i32 tied = -1;
+ u32 forbidden = 0;
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ PReg r = coalesce_find(f, v);
+ if (r != ra && r != rb) continue;
+ if (opt_reg_cls(f, v) != cls || opt_reg_type(f, v) != type) return 0;
+ const OptPRegInfo* vi = &f->preg_info[v];
+ forbidden |= vi->forbidden_hard_regs;
+ if (vi->tied_hard_reg >= 0) {
+ if (tied >= 0 && tied != vi->tied_hard_reg) return 0;
+ tied = vi->tied_hard_reg;
+ }
+ }
+ if (tied >= 0 && tied < 32 && (forbidden & (1u << (Reg)tied))) return 0;
+ return hard_reg_possible(f, cls, forbidden);
+}
+
+static void coalesce_union(Func* f, PReg a, PReg b) {
+ PReg ra = coalesce_find(f, a);
+ PReg rb = coalesce_find(f, b);
+ if (ra == rb) return;
+ u32 as = f->opt_coalesce_size[ra];
+ u32 bs = f->opt_coalesce_size[rb];
+ if (as < bs || (as == bs && rb < ra)) {
+ PReg t = ra;
+ ra = rb;
+ rb = t;
+ }
+ f->opt_coalesce_parent[rb] = ra;
+ f->opt_coalesce_size[ra] += f->opt_coalesce_size[rb];
+}
+
+static int move_higher(const CoalesceMove* a, const CoalesceMove* b) {
+ if (a->weight != b->weight) return a->weight > b->weight;
+ if (a->dst != b->dst) return a->dst < b->dst;
+ return a->src < b->src;
+}
+
+static int move_cmp(const void* va, const void* vb) {
+ const CoalesceMove* a = (const CoalesceMove*)va;
+ const CoalesceMove* b = (const CoalesceMove*)vb;
+ if (move_higher(a, b)) return -1;
+ if (move_higher(b, a)) return 1;
+ return 0;
+}
+
+static int collect_move(Func* f, const OptLiveRangeSet* ranges, Inst* in,
+ u32 block, CoalesceMove* out) {
+ if ((IROp)in->op != IR_COPY || in->nopnds < 2) return 0;
+ if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0;
+ PReg dst = (PReg)in->opnds[0].v.reg;
+ PReg src = (PReg)in->opnds[1].v.reg;
+ if (!opt_reg_valid(f, dst) || !opt_reg_valid(f, src) || dst == src) return 0;
+ if (ranges->first_range_by_preg[dst] == OPT_RANGE_NONE ||
+ ranges->first_range_by_preg[src] == OPT_RANGE_NONE)
+ return 0;
+ if (opt_reg_cls(f, dst) != opt_reg_cls(f, src)) return 0;
+ if (opt_reg_type(f, dst) != opt_reg_type(f, src)) return 0;
+ out->dst = dst;
+ out->src = src;
+ out->weight = f->blocks[block].frequency ? f->blocks[block].frequency : 1u;
+ return 1;
+}
+
+void opt_coalesce_ranges(Func* f, const OptLiveRangeSet* ranges) {
+ if (!f || !ranges || !f->preg_info) return;
+ u32 nregs = opt_reg_count(f);
+ f->opt_coalesce_moves_seen = 0;
+ f->opt_coalesce_candidates = 0;
+ f->opt_coalesce_conflicts = 0;
+ f->opt_coalesce_merge_attempts = 0;
+ f->opt_coalesce_merges = 0;
+ f->opt_coalesce_parent = arena_array(f->arena, u32, nregs ? nregs : 1u);
+ f->opt_coalesce_size = arena_array(f->arena, u32, nregs ? nregs : 1u);
+ for (PReg v = 0; v < nregs; ++v) {
+ f->opt_coalesce_parent[v] = v;
+ f->opt_coalesce_size[v] = 1;
+ }
+
+ CoalesceMove* moves = NULL;
+ u32 nmoves = 0;
+ u32 move_cap = 0;
+ CoalesceCtx ctx;
+ memset(&ctx, 0, sizeof ctx);
+ ctx.f = f;
+ ctx.ranges = ranges;
+ ctx.related_index = arena_array(f->arena, u32, nregs ? nregs : 1u);
+ memset(ctx.related_index, 0xff,
+ sizeof(ctx.related_index[0]) * (nregs ? nregs : 1u));
+
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ Block* bl = &f->blocks[b];
+ for (u32 i = 0; i < bl->ninsts; ++i) {
+ if ((IROp)bl->insts[i].op == IR_COPY) ++f->opt_coalesce_moves_seen;
+ CoalesceMove m;
+ if (!collect_move(f, ranges, &bl->insts[i], b, &m)) continue;
+ if (nmoves == move_cap) {
+ u32 ncap = move_cap ? move_cap * 2u : 32u;
+ CoalesceMove* nv = arena_array(f->arena, CoalesceMove, ncap);
+ if (moves) memcpy(nv, moves, sizeof(moves[0]) * nmoves);
+ moves = nv;
+ move_cap = ncap;
+ }
+ moves[nmoves++] = m;
+ coalesce_add_related(&ctx, m.dst);
+ coalesce_add_related(&ctx, m.src);
+ }
+ }
+ f->opt_coalesce_candidates = nmoves;
+ if (!nmoves || ctx.nrelated < 2) goto metrics;
+
+ ctx.conflict_words = (ctx.nrelated * ctx.nrelated + 63u) / 64u;
+ ctx.conflicts =
+ arena_zarray(f->arena, u64, ctx.conflict_words ? ctx.conflict_words : 1u);
+ ctx.unit_conflicts =
+ arena_zarray(f->arena, u64, ctx.conflict_words ? ctx.conflict_words : 1u);
+ for (u32 i = 0; i < ctx.nrelated; ++i) {
+ for (u32 j = i + 1u; j < ctx.nrelated; ++j) {
+ int kind = ranges_overlap_kind(ranges, ctx.related[i], ctx.related[j]);
+ if (kind) {
+ coalesce_set_conflict(&ctx, i, j, kind == 1);
+ ++f->opt_coalesce_conflicts;
+ }
+ }
+ }
+
+ qsort(moves, nmoves, sizeof(moves[0]), move_cmp);
+ for (u32 i = 0; i < nmoves; ++i) {
+ PReg ra = coalesce_find(f, moves[i].dst);
+ PReg rb = coalesce_find(f, moves[i].src);
+ if (ra == rb) continue;
+ ++f->opt_coalesce_merge_attempts;
+ if (group_conflicts(&ctx, ra, rb, moves[i].dst, moves[i].src)) continue;
+ if (!group_constraints_compatible(&ctx, ra, rb)) continue;
+ coalesce_union(f, ra, rb);
+ ++f->opt_coalesce_merges;
+ }
+
+metrics:
+ metrics_count(f->c, "opt.coalesce.moves_seen", f->opt_coalesce_moves_seen);
+ metrics_count(f->c, "opt.coalesce.candidates", f->opt_coalesce_candidates);
+ metrics_count(f->c, "opt.coalesce.conflicts", f->opt_coalesce_conflicts);
+ metrics_count(f->c, "opt.coalesce.merge_attempts",
+ f->opt_coalesce_merge_attempts);
+ metrics_count(f->c, "opt.coalesce.merges", f->opt_coalesce_merges);
+}
+
+void opt_coalesce(Func* f) {
+ if (!f) return;
+ OptLiveInfo live;
+ opt_live_blocks(f, &live);
+ OptLiveRangeSet ranges;
+ opt_live_ranges_build(f, &live, &ranges);
+ opt_coalesce_ranges(f, &ranges);
+}
diff --git a/src/opt/pass_live.c b/src/opt/pass_live.c
@@ -520,6 +520,8 @@ static void range_append(RangeBuildCtx* c, PReg r, u32 start, u32 end,
lr->preg = r;
lr->start = start;
lr->end = end;
+ lr->raw_start = start;
+ lr->raw_end = end;
lr->next = OPT_RANGE_NONE;
lr->block = block;
lr->whole_block = whole_block ? 1u : 0u;
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -328,6 +328,18 @@ typedef struct OptSpillCandidate {
u32 last;
} OptSpillCandidate;
+typedef struct OptAllocGroupInfo {
+ PReg root;
+ u32 spill_cost;
+ u32 live_length;
+ u32 first;
+ u32 last;
+ i32 tied_hard_reg;
+ u32 forbidden_hard_regs;
+ u8 cls;
+ u8 pad[3];
+} OptAllocGroupInfo;
+
typedef struct AllocInterval {
u32 start;
u32 end;
@@ -415,6 +427,45 @@ static void alloc_sort_spills(OptSpillCandidate* spills, u32 n) {
if (n > 1) qsort(spills, n, sizeof(spills[0]), spill_candidate_cmp);
}
+static PReg alloc_coalesce_root(Func* f, PReg v) {
+ if (!f->opt_coalesce_parent || v == PREG_NONE || v >= opt_reg_count(f))
+ return v;
+ PReg p = (PReg)f->opt_coalesce_parent[v];
+ while (p != f->opt_coalesce_parent[p]) p = (PReg)f->opt_coalesce_parent[p];
+ while (v != p) {
+ PReg n = (PReg)f->opt_coalesce_parent[v];
+ f->opt_coalesce_parent[v] = p;
+ v = n;
+ }
+ return p;
+}
+
+static int alloc_group_member(Func* f, PReg root, PReg v) {
+ return alloc_coalesce_root(f, v) == root;
+}
+
+static void alloc_group_info(Func* f, const OptLiveRangeSet* ranges, PReg root,
+ OptAllocGroupInfo* out) {
+ memset(out, 0, sizeof *out);
+ out->root = root;
+ out->first = (u32)~0u;
+ out->tied_hard_reg = -1;
+ out->cls = f->preg_info[root].cls;
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
+ if (!alloc_group_member(f, root, v)) continue;
+ OptPRegInfo* vi = &f->preg_info[v];
+ out->spill_cost += vi->frequency ? vi->frequency : vi->spill_cost;
+ out->live_length += vi->live_length;
+ u32 first = vi->first_pos ? vi->first_pos - 1u : 0;
+ if (first < out->first) out->first = first;
+ if (vi->last_pos > out->last) out->last = vi->last_pos;
+ out->forbidden_hard_regs |= vi->forbidden_hard_regs;
+ if (vi->tied_hard_reg >= 0) out->tied_hard_reg = vi->tied_hard_reg;
+ }
+ if (out->first == (u32)~0u) out->first = 0;
+}
+
static u32 alloc_interval_lower_bound(const AllocIntervalVec* v, u32 start) {
u32 lo = 0;
u32 hi = v ? v->n : 0;
@@ -636,18 +687,30 @@ static int spill_slot_compatible(Func* f, FrameSlot fs, PReg v) {
return 1;
}
-static int alloc_hard_conflicts(OptAllocator* a, const OptLiveRangeSet* ranges,
- PReg v, u32 bit) {
+static int alloc_group_hard_conflicts(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, PReg root,
+ u32 bit) {
if (bit >= a->hard_loc_bits) return 1;
- return alloc_ranges_overlap_vec(a, ranges, v, &a->hard_used_locs[bit],
- &a->hard_point_visits);
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
+ if (!alloc_group_member(f, root, v)) continue;
+ if (alloc_ranges_overlap_vec(a, ranges, v, &a->hard_used_locs[bit],
+ &a->hard_point_visits))
+ return 1;
+ }
+ return 0;
}
-static void alloc_mark_hard_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
- PReg v, u32 bit, Func* f) {
+static void alloc_mark_group_hard_loc(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, PReg root,
+ u32 bit) {
if (bit >= a->hard_loc_bits) return;
- alloc_mark_vec(f, a, ranges, v, &a->hard_used_locs[bit],
- &a->hard_mark_points);
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
+ if (!alloc_group_member(f, root, v)) continue;
+ alloc_mark_vec(f, a, ranges, v, &a->hard_used_locs[bit],
+ &a->hard_mark_points);
+ }
}
static void alloc_grow_stack_locs(Func* f, OptAllocator* a, u32 need_slots) {
@@ -670,11 +733,16 @@ static void alloc_grow_stack_locs(Func* f, OptAllocator* a, u32 need_slots) {
a->stack_slot_cap = ncap;
}
-static void alloc_mark_stack_loc(OptAllocator* a, const OptLiveRangeSet* ranges,
- PReg v, u32 stack_idx, Func* f) {
+static void alloc_mark_group_stack_loc(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, PReg root,
+ u32 stack_idx) {
if (stack_idx >= a->stack_slot_count) return;
- alloc_mark_vec(f, a, ranges, v, &a->stack_used_locs[stack_idx],
- &a->stack_mark_points);
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
+ if (!alloc_group_member(f, root, v)) continue;
+ alloc_mark_vec(f, a, ranges, v, &a->stack_used_locs[stack_idx],
+ &a->stack_mark_points);
+ }
}
static int stack_slot_heap_less(const OptAllocator* a, u32 lhs, u32 rhs) {
@@ -776,57 +844,179 @@ static int stack_take_free_compatible(Func* f, OptAllocator* a, PReg v,
return 0;
}
-static void alloc_assign_hard(Func* f, OptAllocator* a,
- const OptLiveRangeSet* ranges, PReg v, Reg r) {
- OptPRegInfo* vi = &f->preg_info[v];
- vi->alloc_kind = OPT_ALLOC_HARD;
- vi->hard_reg = r;
- a->locs[v].kind = OPT_LOC_HARD;
- a->locs[v].cls = vi->cls;
- a->locs[v].hard_reg = r;
- a->locs[v].spill_slot = FRAME_SLOT_NONE;
- alloc_mark_hard_loc(a, ranges, v, hard_loc_bit(vi->cls, r), f);
+static void alloc_assign_group_hard(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, PReg root,
+ Reg r) {
+ u32 bit = hard_loc_bit(f->preg_info[root].cls, r);
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
+ if (!alloc_group_member(f, root, v)) continue;
+ OptPRegInfo* vi = &f->preg_info[v];
+ vi->alloc_kind = OPT_ALLOC_HARD;
+ vi->hard_reg = r;
+ a->locs[v].kind = OPT_LOC_HARD;
+ a->locs[v].cls = vi->cls;
+ a->locs[v].hard_reg = r;
+ a->locs[v].spill_slot = FRAME_SLOT_NONE;
+ }
+ alloc_mark_group_hard_loc(f, a, ranges, root, bit);
}
-static void alloc_assign_stack_slot(Func* f, OptAllocator* a,
- const OptLiveRangeSet* ranges, PReg v,
- u32 stack_idx) {
- OptPRegInfo* vi = &f->preg_info[v];
+static void alloc_assign_group_stack_slot(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges,
+ PReg root, u32 stack_idx,
+ u32 group_last) {
FrameSlot slot = a->stack_slots[stack_idx];
- vi->spill_slot = slot;
- vi->alloc_kind = OPT_ALLOC_SPILL;
- a->locs[v].kind = OPT_LOC_STACK;
- a->locs[v].cls = vi->cls;
- a->locs[v].hard_reg = REG_NONE;
- a->locs[v].spill_slot = slot;
- alloc_mark_stack_loc(a, ranges, v, stack_idx, f);
- a->stack_slot_end[stack_idx] = vi->last_pos;
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
+ if (!alloc_group_member(f, root, v)) continue;
+ OptPRegInfo* vi = &f->preg_info[v];
+ vi->spill_slot = slot;
+ vi->alloc_kind = OPT_ALLOC_SPILL;
+ a->locs[v].kind = OPT_LOC_STACK;
+ a->locs[v].cls = vi->cls;
+ a->locs[v].hard_reg = REG_NONE;
+ a->locs[v].spill_slot = slot;
+ }
+ alloc_mark_group_stack_loc(f, a, ranges, root, stack_idx);
+ a->stack_slot_end[stack_idx] = group_last;
stack_active_push(f, a, stack_idx);
}
-static void alloc_assign_stack(Func* f, OptAllocator* a,
- const OptLiveRangeSet* ranges, PReg v) {
- OptPRegInfo* vi = &f->preg_info[v];
- u32 first = vi->first_pos ? vi->first_pos - 1u : 0;
+static void alloc_assign_group_stack(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, PReg root,
+ const OptAllocGroupInfo* gi) {
u32 stack_idx = 0;
- stack_expire_slots(f, a, first);
- if (!stack_take_free_compatible(f, a, v, &stack_idx)) {
- FrameSlot slot = spill_slot_for(f, v);
+ stack_expire_slots(f, a, gi->first);
+ if (!stack_take_free_compatible(f, a, root, &stack_idx)) {
+ FrameSlot slot = spill_slot_for(f, root);
alloc_grow_stack_locs(f, a, a->stack_slot_count + 1u);
stack_idx = a->stack_slot_count++;
a->stack_slots[stack_idx] = slot;
}
- alloc_assign_stack_slot(f, a, ranges, v, stack_idx);
+ alloc_assign_group_stack_slot(f, a, ranges, root, stack_idx, gi->last);
+}
+
+static void split_segment_grow(Func* f) {
+ if (f->opt_nalloc_segments < f->opt_alloc_segments_cap) return;
+ u32 ncap = f->opt_alloc_segments_cap ? f->opt_alloc_segments_cap * 2u : 32u;
+ OptAllocSegment* ns = arena_array(f->arena, OptAllocSegment, ncap);
+ if (f->opt_alloc_segments)
+ memcpy(ns, f->opt_alloc_segments,
+ sizeof(f->opt_alloc_segments[0]) * f->opt_nalloc_segments);
+ f->opt_alloc_segments = ns;
+ f->opt_alloc_segments_cap = ncap;
+}
+
+static void split_segment_push(Func* f, PReg v, const OptLiveRange* lr,
+ u8 loc_kind, Reg hard_reg,
+ FrameSlot spill_home) {
+ split_segment_grow(f);
+ u32 idx = f->opt_nalloc_segments++;
+ OptAllocSegment* s = &f->opt_alloc_segments[idx];
+ memset(s, 0, sizeof *s);
+ s->start = lr->raw_start;
+ s->end = lr->raw_end;
+ s->block = lr->block;
+ s->loc_kind = loc_kind;
+ s->cls = f->preg_info[v].cls;
+ s->hard_reg = hard_reg;
+ s->spill_slot = loc_kind == OPT_LOC_STACK ? spill_home : FRAME_SLOT_NONE;
+ s->spill_home = spill_home;
+ s->reload_at_start = loc_kind == OPT_LOC_HARD ? 1u : 0u;
+ s->store_at_end = loc_kind == OPT_LOC_HARD ? 1u : 0u;
+ s->next = f->opt_first_segment_by_preg[v];
+ f->opt_first_segment_by_preg[v] = idx;
+}
+
+static int alloc_range_hard_conflicts(OptAllocator* a, const OptLiveRange* lr,
+ u32 bit) {
+ if (bit >= a->hard_loc_bits) return 1;
+ u32 end = lr->end < a->point_count ? lr->end : a->point_count;
+ ++a->hard_point_visits;
+ if (lr->start >= end) return 0;
+ return alloc_interval_overlaps(&a->hard_used_locs[bit], lr->start, end);
+}
+
+static void alloc_mark_range_hard(Func* f, OptAllocator* a,
+ const OptLiveRange* lr, u32 bit) {
+ if (bit >= a->hard_loc_bits) return;
+ u32 end = lr->end < a->point_count ? lr->end : a->point_count;
+ if (lr->start >= end) return;
+ ++a->hard_mark_points;
+ alloc_interval_insert(f, &a->hard_used_locs[bit], lr->start, end);
+}
+
+static int alloc_try_split_singleton(Func* f, OptAllocator* a,
+ const OptLiveRangeSet* ranges, PReg v) {
+ if (f->opt_coalesce_parent &&
+ f->opt_coalesce_size[alloc_coalesce_root(f, v)] > 1)
+ return 0;
+ if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) return 0;
+ if (f->preg_info[v].live_across_call_freq) return 0;
+ u32 nranges = 0;
+ for (u32 ri = ranges->first_range_by_preg[v]; ri != OPT_RANGE_NONE;
+ ri = ranges->ranges[ri].next)
+ ++nranges;
+ if (nranges < 2) return 0;
+ if (!f->opt_first_segment_by_preg) {
+ f->opt_first_segment_by_preg =
+ arena_array(f->arena, u32, opt_reg_count(f) ? opt_reg_count(f) : 1u);
+ memset(f->opt_first_segment_by_preg, 0xff,
+ sizeof(f->opt_first_segment_by_preg[0]) *
+ (opt_reg_count(f) ? opt_reg_count(f) : 1u));
+ }
+
+ OptPRegInfo* vi = &f->preg_info[v];
+ FrameSlot home = spill_slot_for(f, v);
+ vi->alloc_kind = OPT_ALLOC_SPLIT;
+ vi->hard_reg = REG_NONE;
+ vi->spill_slot = home;
+ a->locs[v].kind = OPT_LOC_STACK;
+ a->locs[v].cls = vi->cls;
+ a->locs[v].hard_reg = REG_NONE;
+ a->locs[v].spill_slot = home;
+
+ for (u32 ri = ranges->first_range_by_preg[v]; ri != OPT_RANGE_NONE;
+ ri = ranges->ranges[ri].next) {
+ const OptLiveRange* lr = &ranges->ranges[ri];
+ int found = 0;
+ Reg best = REG_NONE;
+ u32 best_score = 0xffffffffu;
+ for (u32 r = 0; r < f->opt_hard_reg_count[vi->cls]; ++r) {
+ Reg hr = f->opt_hard_regs[vi->cls][r];
+ if (hr >= 32) continue;
+ if (vi->forbidden_hard_regs & (1u << hr)) continue;
+ u32 bit = hard_loc_bit(vi->cls, hr);
+ if (alloc_range_hard_conflicts(a, lr, bit)) continue;
+ u32 score = hard_reg_alloc_score(f, a, vi, hr);
+ if (!found || score < best_score) {
+ found = 1;
+ best = hr;
+ best_score = score;
+ }
+ }
+ if (found) {
+ alloc_mark_range_hard(f, a, lr, hard_loc_bit(vi->cls, best));
+ split_segment_push(f, v, lr, OPT_LOC_HARD, best, home);
+ } else {
+ split_segment_push(f, v, lr, OPT_LOC_STACK, REG_NONE, home);
+ }
+ }
+ return 1;
}
static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
- OptAllocator* a) {
+ OptAllocator* a, int allow_live_range_split) {
memset(a, 0, sizeof *a);
a->point_count = ranges->point_count ? ranges->point_count : 1u;
a->hard_loc_bits = OPT_REG_CLASSES * 32u;
u32 ncands = 0;
- for (PReg v = 1; v < opt_reg_count(f); ++v)
- if (ranges->first_range_by_preg[v] != OPT_RANGE_NONE) ++ncands;
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
+ if (alloc_coalesce_root(f, v) != v) continue;
+ ++ncands;
+ }
a->locs =
arena_zarray(f->arena, OptLoc, opt_reg_count(f) ? opt_reg_count(f) : 1u);
a->hard_used_locs = arena_zarray(f->arena, AllocIntervalVec,
@@ -842,11 +1032,14 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
u32 n = 0;
for (PReg v = 1; v < opt_reg_count(f); ++v) {
if (ranges->first_range_by_preg[v] == OPT_RANGE_NONE) continue;
- OptPRegInfo* vi = &f->preg_info[v];
+ PReg root = alloc_coalesce_root(f, v);
+ if (root != v) continue;
+ OptAllocGroupInfo gi;
+ alloc_group_info(f, ranges, root, &gi);
cands[n].v = v;
- cands[n].spill_cost = vi->frequency ? vi->frequency : vi->spill_cost;
- cands[n].live_length = vi->live_length;
- cands[n].tied = vi->tied_hard_reg >= 0;
+ cands[n].spill_cost = gi.spill_cost;
+ cands[n].live_length = gi.live_length;
+ cands[n].tied = gi.tied_hard_reg >= 0;
memset(cands[n].pad, 0, sizeof cands[n].pad);
++n;
}
@@ -855,10 +1048,12 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
u32 nspills = 0;
for (u32 i = 0; i < n; ++i) {
PReg v = cands[i].v;
+ OptAllocGroupInfo gi;
+ alloc_group_info(f, ranges, v, &gi);
OptPRegInfo* vi = &f->preg_info[v];
- u8 cls = vi->cls;
- if (vi->tied_hard_reg >= 0) {
- Reg fixed = (Reg)vi->tied_hard_reg;
+ u8 cls = gi.cls;
+ if (gi.tied_hard_reg >= 0) {
+ Reg fixed = (Reg)gi.tied_hard_reg;
if (!hard_available(f, cls, fixed)) {
SrcLoc loc = {0, 0, 0};
compiler_panic(
@@ -866,19 +1061,19 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
"opt regalloc: fixed hard reg %u unavailable in class %u",
(unsigned)fixed, (unsigned)cls);
}
- if (fixed < 32 && (vi->forbidden_hard_regs & (1u << fixed))) {
+ if (fixed < 32 && (gi.forbidden_hard_regs & (1u << fixed))) {
SrcLoc loc = {0, 0, 0};
compiler_panic(f->c, loc,
"opt regalloc: fixed hard reg %u is clobbered",
(unsigned)fixed);
}
- if (fixed >= 32 ||
- alloc_hard_conflicts(a, ranges, v, hard_loc_bit(cls, fixed))) {
+ if (fixed >= 32 || alloc_group_hard_conflicts(f, a, ranges, v,
+ hard_loc_bit(cls, fixed))) {
SrcLoc loc = {0, 0, 0};
compiler_panic(f->c, loc, "opt regalloc: conflicting fixed hard reg %u",
(unsigned)fixed);
}
- alloc_assign_hard(f, a, ranges, v, fixed);
+ alloc_assign_group_hard(f, a, ranges, v, fixed);
continue;
}
@@ -888,8 +1083,9 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {
Reg hr = f->opt_hard_regs[cls][r];
if (hr >= 32) continue;
- if (vi->forbidden_hard_regs & (1u << hr)) continue;
- if (alloc_hard_conflicts(a, ranges, v, hard_loc_bit(cls, hr))) continue;
+ if (gi.forbidden_hard_regs & (1u << hr)) continue;
+ if (alloc_group_hard_conflicts(f, a, ranges, v, hard_loc_bit(cls, hr)))
+ continue;
u32 score = hard_reg_alloc_score(f, a, vi, hr);
if (!found || score < best_score) {
found = 1;
@@ -898,18 +1094,23 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
}
}
if (found) {
- alloc_assign_hard(f, a, ranges, v, best);
+ alloc_assign_group_hard(f, a, ranges, v, best);
}
if (!found) {
+ if (allow_live_range_split && alloc_try_split_singleton(f, a, ranges, v))
+ continue;
spills[nspills].v = v;
- spills[nspills].first = vi->first_pos ? vi->first_pos - 1u : 0;
- spills[nspills].last = vi->last_pos;
+ spills[nspills].first = gi.first;
+ spills[nspills].last = gi.last;
++nspills;
}
}
alloc_sort_spills(spills, nspills);
- for (u32 i = 0; i < nspills; ++i)
- alloc_assign_stack(f, a, ranges, spills[i].v);
+ for (u32 i = 0; i < nspills; ++i) {
+ OptAllocGroupInfo gi;
+ alloc_group_info(f, ranges, spills[i].v, &gi);
+ alloc_assign_group_stack(f, a, ranges, spills[i].v, &gi);
+ }
a->hard_loc_words =
alloc_interval_storage_words(a->hard_used_locs, a->hard_loc_bits);
a->stack_loc_words =
@@ -1027,6 +1228,16 @@ static Operand hard_operand(Func* f, PReg v) {
return o;
}
+static Operand hard_operand_reg(Func* f, PReg v, Reg hard_reg) {
+ Operand o;
+ memset(&o, 0, sizeof o);
+ o.kind = OPK_REG;
+ o.cls = f->preg_info[v].cls;
+ o.type = opt_reg_type(f, v);
+ o.v.reg = hard_reg;
+ return o;
+}
+
static void append_store_preg(Func* f, RewriteList* out, PReg v) {
Inst* st = list_push(f, out, IR_STORE);
st->opnds = arena_array(f->arena, Operand, 2);
@@ -1045,6 +1256,26 @@ static void append_load_preg(Func* f, RewriteList* out, PReg v) {
ld->extra.mem = spill_mem(f, v);
}
+static void append_store_preg_hard(Func* f, RewriteList* out, PReg v,
+ Reg hard_reg) {
+ Inst* st = list_push(f, out, IR_STORE);
+ st->opnds = arena_array(f->arena, Operand, 2);
+ st->opnds[0] = spill_addr(f, v);
+ st->opnds[1] = hard_operand_reg(f, v, hard_reg);
+ st->nopnds = 2;
+ st->extra.mem = spill_mem(f, v);
+}
+
+static void append_load_preg_hard(Func* f, RewriteList* out, PReg v,
+ Reg hard_reg) {
+ Inst* ld = list_push(f, out, IR_LOAD);
+ ld->opnds = arena_array(f->arena, Operand, 2);
+ ld->opnds[0] = hard_operand_reg(f, v, hard_reg);
+ ld->opnds[1] = spill_addr(f, v);
+ ld->nopnds = 2;
+ ld->extra.mem = spill_mem(f, v);
+}
+
static Reg scratch_for(Func* f, u8 cls, u32* next) {
u32 n = f->opt_scratch_reg_count[cls];
if (!n) return REG_NONE;
@@ -1057,8 +1288,19 @@ typedef struct RewriteCtx {
RewriteList* before;
RewriteList* after;
u32 next_scratch[OPT_REG_CLASSES];
+ u32 raw_point;
} RewriteCtx;
+static const OptAllocSegment* split_segment_at(Func* f, PReg v, u32 raw_point) {
+ if (!f->opt_first_segment_by_preg || v >= opt_reg_count(f)) return NULL;
+ for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
+ si = f->opt_alloc_segments[si].next) {
+ const OptAllocSegment* s = &f->opt_alloc_segments[si];
+ if (s->start <= raw_point && raw_point < s->end) return s;
+ }
+ return NULL;
+}
+
static void rewrite_one_operand(Func* f, Inst* owner, Operand* op, int is_def,
void* arg) {
RewriteCtx* c = (RewriteCtx*)arg;
@@ -1070,7 +1312,15 @@ static void rewrite_one_operand(Func* f, Inst* owner, Operand* op, int is_def,
op->v.reg = vi->hard_reg;
return;
}
- if (vi->alloc_kind != OPT_ALLOC_SPILL) return;
+ if (vi->alloc_kind == OPT_ALLOC_SPLIT) {
+ const OptAllocSegment* seg = split_segment_at(f, v, c->raw_point);
+ if (seg && seg->loc_kind == OPT_LOC_HARD) {
+ op->v.reg = seg->hard_reg;
+ return;
+ }
+ } else if (vi->alloc_kind != OPT_ALLOC_SPILL) {
+ return;
+ }
u8 cls = vi->cls;
Reg scratch = scratch_for(f, cls, &c->next_scratch[cls]);
if (scratch == (Reg)REG_NONE) {
@@ -1104,7 +1354,8 @@ static void rewrite_call_arg_operand(Func* f, Operand* op) {
OptPRegInfo* vi = &f->preg_info[v];
if (vi->alloc_kind == OPT_ALLOC_HARD) {
op->v.reg = vi->hard_reg;
- } else if (vi->alloc_kind == OPT_ALLOC_SPILL) {
+ } else if (vi->alloc_kind == OPT_ALLOC_SPILL ||
+ vi->alloc_kind == OPT_ALLOC_SPLIT) {
*op = spill_addr(f, v);
}
}
@@ -1192,6 +1443,38 @@ static PReg* rewrite_collect_call_save_pregs(Func* f, u32* count_out) {
return pregs;
}
+static void append_split_reloads_at(Func* f, RewriteList* out, u32 block,
+ u32 raw_point) {
+ if (!f->opt_first_segment_by_preg) return;
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (f->preg_info[v].alloc_kind != OPT_ALLOC_SPLIT) continue;
+ for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
+ si = f->opt_alloc_segments[si].next) {
+ const OptAllocSegment* s = &f->opt_alloc_segments[si];
+ if (s->block == block && s->start == raw_point && s->reload_at_start &&
+ s->loc_kind == OPT_LOC_HARD) {
+ append_load_preg_hard(f, out, v, s->hard_reg);
+ }
+ }
+ }
+}
+
+static void append_split_stores_at(Func* f, RewriteList* out, u32 block,
+ u32 raw_point) {
+ if (!f->opt_first_segment_by_preg) return;
+ for (PReg v = 1; v < opt_reg_count(f); ++v) {
+ if (f->preg_info[v].alloc_kind != OPT_ALLOC_SPLIT) continue;
+ for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
+ si = f->opt_alloc_segments[si].next) {
+ const OptAllocSegment* s = &f->opt_alloc_segments[si];
+ if (s->block == block && s->end == raw_point && s->store_at_end &&
+ s->loc_kind == OPT_LOC_HARD) {
+ append_store_preg_hard(f, out, v, s->hard_reg);
+ }
+ }
+ }
+}
+
static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
u32 words = live_info ? live_info->words : f->opt_live_words;
if (!words) words = bit_words(opt_reg_count(f));
@@ -1205,6 +1488,12 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
InstRefs refs;
memset(&refs, 0, sizeof refs);
u32 live_active_words = 0;
+ u32* block_base = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
+ u32 raw = 0;
+ for (u32 b = 0; b < f->nblocks; ++b) {
+ block_base[b] = raw;
+ raw += f->blocks[b].ninsts ? f->blocks[b].ninsts : 1u;
+ }
for (u32 b = 0; b < f->nblocks; ++b) {
Block* bl = &f->blocks[b];
RewriteOut out;
@@ -1235,6 +1524,7 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
memset(&ctx, 0, sizeof ctx);
ctx.before = &before;
ctx.after = &after;
+ ctx.raw_point = block_base[b] + i;
if ((IROp)in.op == IR_CALL) {
IRCallAux* aux = (IRCallAux*)in.extra.aux;
if (aux) {
@@ -1265,6 +1555,8 @@ static void rewrite_func(Func* f, const OptLiveInfo* live_info) {
append_live_call_saves(f, &call_restores, &in, live, live_active_words,
&refs, call_save_pregs, ncall_save_pregs, 1);
}
+ append_split_reloads_at(f, &before, b, block_base[b] + i);
+ append_split_stores_at(f, &after, b, block_base[b] + i + 1u);
out_prepend_list_reverse(f, &out, &after);
out_prepend_list_reverse(f, &out, &call_restores);
@@ -1367,7 +1659,6 @@ void opt_dead_def_elim(Func* f) {
}
void opt_regalloc(Func* f, int allow_live_range_split) {
- (void)allow_live_range_split;
metrics_scope_begin(f->c, "opt.live_ranges.regalloc");
OptLiveInfo live;
opt_live_blocks(f, &live);
@@ -1376,6 +1667,7 @@ void opt_regalloc(Func* f, int allow_live_range_split) {
opt_init_preg_info_from_ranges(f, &ranges);
opt_apply_asm_constraints_from_live(f, &live);
apply_param_incoming_register_hazards(f);
+ if (allow_live_range_split) opt_coalesce_ranges(f, &ranges);
metrics_count(f->c, "opt.live_words", f->opt_live_words);
metrics_count(f->c, "opt.ranges", ranges.nranges);
metrics_count(f->c, "opt.range_points", ranges.point_count);
@@ -1396,6 +1688,6 @@ void opt_regalloc(Func* f, int allow_live_range_split) {
metrics_scope_end(f->c, "opt.live_ranges.regalloc");
OptAllocator alloc;
- opt_assign_ranges(f, &ranges, &alloc);
+ opt_assign_ranges(f, &ranges, &alloc, allow_live_range_split);
rewrite_func(f, &live);
}
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -2911,6 +2911,181 @@ static void opt_regalloc_priority(void) {
tc_fini(&tc);
}
+static void opt_o2_coalesces_nonconflicting_copy(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+ static const Reg pool[] = {19};
+ static const Reg scratch[] = {9, 10};
+ mock_set_pool(&mock, RC_INT, pool, 1, scratch, 2, 0x4007FFFFu);
+
+ Func* f = new_func(&tc);
+ opt_machinize(f, &mock.base);
+ Val a = add_val(f, tc.i32);
+ Val b = add_val(f, tc.i32);
+ emit_load_imm(f, f->entry, a, tc.i32, 1);
+ emit_copy(f, f->entry, b, a, tc.i32);
+ emit_ret_val(f, f->entry, b, tc.i32);
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+ opt_regalloc(f, 1);
+
+ EXPECT(f->opt_coalesce_candidates == 1,
+ "copy should be a coalesce candidate");
+ EXPECT(f->opt_coalesce_merges == 1, "non-overlapping copy should coalesce");
+ EXPECT(f->preg_info[a].alloc_kind == OPT_ALLOC_HARD &&
+ f->preg_info[b].alloc_kind == OPT_ALLOC_HARD,
+ "coalesced values should allocate hard under one-reg pressure");
+ EXPECT(f->preg_info[a].hard_reg == f->preg_info[b].hard_reg,
+ "coalesced values should share a hard register");
+ opt_combine(f);
+ opt_dce(f);
+ EXPECT(count_op(f, IR_COPY) == 0,
+ "coalesced hard-reg self copy should be removed");
+ tc_fini(&tc);
+}
+
+static void opt_o1_ignores_coalescing(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+ static const Reg pool[] = {19};
+ static const Reg scratch[] = {9, 10};
+ mock_set_pool(&mock, RC_INT, pool, 1, scratch, 2, 0x4007FFFFu);
+
+ Func* f = new_func(&tc);
+ opt_machinize(f, &mock.base);
+ Val a = add_val(f, tc.i32);
+ Val b = add_val(f, tc.i32);
+ emit_load_imm(f, f->entry, a, tc.i32, 1);
+ emit_copy(f, f->entry, b, a, tc.i32);
+ emit_ret_val(f, f->entry, b, tc.i32);
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+ opt_regalloc(f, 0);
+
+ EXPECT(f->opt_coalesce_candidates == 0 && f->opt_coalesce_merges == 0,
+ "O1 regalloc should not run coalescing");
+ tc_fini(&tc);
+}
+
+static void opt_o2_refuses_overlapping_copy_coalesce(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+ static const Reg pool[] = {19, 20};
+ static const Reg scratch[] = {9, 10};
+ mock_set_pool(&mock, RC_INT, pool, 2, scratch, 2, 0x4007FFFFu);
+
+ Func* f = new_func(&tc);
+ opt_machinize(f, &mock.base);
+ Val a = add_val(f, tc.i32);
+ Val b = add_val(f, tc.i32);
+ Val out = add_val(f, tc.i32);
+ emit_load_imm(f, f->entry, a, tc.i32, 1);
+ emit_copy(f, f->entry, b, a, tc.i32);
+ emit_binop(f, f->entry, out, a, b, tc.i32);
+ emit_ret_val(f, f->entry, out, tc.i32);
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+ opt_regalloc(f, 1);
+
+ EXPECT(f->opt_coalesce_candidates == 1,
+ "overlapping copy should still be collected as a candidate");
+ EXPECT(f->opt_coalesce_conflicts >= 1,
+ "overlapping copy values should be recorded as conflicting");
+ EXPECT(f->opt_coalesce_merges == 0, "overlapping copy should not coalesce");
+ tc_fini(&tc);
+}
+
+static void opt_o2_refuses_incompatible_copy_coalesce(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+ static const Reg pool[] = {19, 20};
+ static const Reg scratch[] = {9, 10};
+ mock_set_pool(&mock, RC_INT, pool, 2, scratch, 2, 0x4007FFFFu);
+
+ Func* diff_type = new_func(&tc);
+ opt_machinize(diff_type, &mock.base);
+ Val a = add_val(diff_type, tc.i32);
+ Val b = add_val(diff_type, tc.i64);
+ emit_load_imm(diff_type, diff_type->entry, a, tc.i32, 1);
+ emit_copy(diff_type, diff_type->entry, b, a, tc.i64);
+ emit_ret_val(diff_type, diff_type->entry, b, tc.i64);
+ opt_build_cfg(diff_type);
+ opt_build_loop_tree(diff_type);
+ opt_regalloc(diff_type, 1);
+ EXPECT(diff_type->opt_coalesce_candidates == 0,
+ "different value types should not become coalesce candidates");
+
+ Func* tied = new_func(&tc);
+ opt_machinize(tied, &mock.base);
+ a = add_val(tied, tc.i32);
+ b = add_val(tied, tc.i32);
+ emit_load_imm(tied, tied->entry, a, tc.i32, 1);
+ emit_copy(tied, tied->entry, b, a, tc.i32);
+ emit_ret_val(tied, tied->entry, b, tc.i32);
+ opt_build_cfg(tied);
+ opt_build_loop_tree(tied);
+ ensure_test_preg_info(tied);
+ tied->preg_info[a].tied_hard_reg = 19;
+ tied->preg_info[b].tied_hard_reg = 20;
+ opt_regalloc(tied, 1);
+ EXPECT(tied->opt_coalesce_candidates == 1,
+ "same-shape tied copy should be collected");
+ EXPECT(tied->opt_coalesce_merges == 0,
+ "different fixed hard regs should block coalescing");
+ tc_fini(&tc);
+}
+
+static void opt_o2_splits_singleton_when_whole_alloc_fails(void) {
+ TestCtx tc;
+ tc_init(&tc);
+ MockCGTarget mock;
+ mock_init(&mock, tc.c);
+ static const Reg pool[] = {19};
+ static const Reg scratch[] = {9, 10};
+ mock_set_pool(&mock, RC_INT, pool, 1, scratch, 2, 0x4007FFFFu);
+
+ Func* f = new_func(&tc);
+ opt_machinize(f, &mock.base);
+ Val pinned = add_val(f, tc.i32);
+ Val v = add_val(f, tc.i32);
+ Val tmp = add_val(f, tc.i32);
+ emit_load_imm(f, f->entry, pinned, tc.i32, 1);
+ emit_load_imm(f, f->entry, v, tc.i32, 2);
+ emit_binop(f, f->entry, tmp, pinned, v, tc.i32);
+ emit_load_imm(f, f->entry, v, tc.i32, 3);
+ emit_ret_val(f, f->entry, v, tc.i32);
+ opt_build_cfg(f);
+ opt_build_loop_tree(f);
+ ensure_test_preg_info(f);
+ f->preg_info[pinned].tied_hard_reg = 19;
+ opt_regalloc(f, 1);
+
+ EXPECT(f->preg_info[pinned].alloc_kind == OPT_ALLOC_HARD,
+ "pinned value should keep the only hard register");
+ EXPECT(f->preg_info[v].alloc_kind == OPT_ALLOC_SPLIT,
+ "singleton value should split instead of whole spilling");
+ int saw_hard = 0;
+ int saw_stack = 0;
+ for (u32 si = f->opt_first_segment_by_preg[v]; si != OPT_RANGE_NONE;
+ si = f->opt_alloc_segments[si].next) {
+ if (f->opt_alloc_segments[si].loc_kind == OPT_LOC_HARD) saw_hard = 1;
+ if (f->opt_alloc_segments[si].loc_kind == OPT_LOC_STACK) saw_stack = 1;
+ }
+ EXPECT(saw_hard && saw_stack,
+ "split value should have both hard and spill-home segments");
+ EXPECT(f->preg_info[v].spill_slot != FRAME_SLOT_NONE,
+ "split value should have a canonical spill home");
+ tc_fini(&tc);
+}
+
static void opt_range_regalloc_no_conflicts_and_stack_reuse(void) {
TestCtx tc;
tc_init(&tc);
@@ -4711,6 +4886,11 @@ int main(void) {
opt_live_across_call_frequency();
opt_range_overlap_class();
opt_regalloc_priority();
+ opt_o2_coalesces_nonconflicting_copy();
+ opt_o1_ignores_coalescing();
+ opt_o2_refuses_overlapping_copy_coalesce();
+ opt_o2_refuses_incompatible_copy_coalesce();
+ opt_o2_splits_singleton_when_whole_alloc_fails();
opt_range_regalloc_no_conflicts_and_stack_reuse();
opt_stack_spill_assignment_avoids_quadratic_probe();
opt_rewrite_spill_use_def();
diff --git a/test/toy/cases/129_o2_pressure_coalesce.expected b/test/toy/cases/129_o2_pressure_coalesce.expected
@@ -0,0 +1 @@
+40
diff --git a/test/toy/cases/129_o2_pressure_coalesce.toy b/test/toy/cases/129_o2_pressure_coalesce.toy
@@ -0,0 +1,32 @@
+fn pressure(n0: i64): i64 {
+ var n: i64 = n0;
+ let a0: i64 = n0 + 1;
+ let a1: i64 = n0 + 2;
+ let a2: i64 = n0 + 3;
+ let a3: i64 = n0 + 4;
+ let a4: i64 = n0 + 5;
+ let a5: i64 = n0 + 6;
+ let a6: i64 = n0 + 7;
+ let a7: i64 = n0 + 8;
+ let a8: i64 = n0 + 9;
+ let a9: i64 = n0 + 10;
+ let a10: i64 = n0 + 11;
+ let a11: i64 = n0 + 12;
+ let a12: i64 = n0 + 13;
+ let a13: i64 = n0 + 14;
+ let a14: i64 = n0 + 15;
+ let a15: i64 = n0 + 16;
+ var s: i64 = 0;
+ while n > 0 {
+ s = s + a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 +
+ a8 + a9 + a10 + a11 + a12 + a13 + a14 + a15;
+ n = n - 1;
+ }
+ return s;
+}
+
+fn __user_main(): i64 {
+ return pressure(3);
+}
+
+fn main(): i32 { return __user_main() as i32; }