pass_lower_loop_imm.c (6223B)
1 /* O1 pre-hoist lowering for loop-body immediate operands. 2 * 3 * `opt_hoist_loop_consts` consolidates duplicate IR_LOAD_IMM defs of the same 4 * constant out of a loop. But many constants never become IR_LOAD_IMM defs: 5 * they arrive at the emitter as inline `imm:` operands on IR_BINOP / IR_CMP / 6 * IR_CMP_BRANCH / IR_STORE. If the backend can fold the immediate into the 7 * instruction (e.g. `add x, y, #1`), the emitter keeps it inline and there is 8 * nothing to hoist. If it cannot (e.g. `mul x, y, #5` on aarch64), the emitter 9 * materializes the constant into a scratch register inside the loop every 10 * iteration. 11 * 12 * This pass walks loop-body instructions and rewrites those non-foldable inline 13 * imm operands to a fresh IR_LOAD_IMM + OPK_REG use so the existing hoister 14 * picks them up. Stores of non-zero imms are also lowered (the emitter has a 15 * zero-source fast path but materializes every other inline imm). 16 * 17 * Guards: 18 * - Only loop bodies (loop_depth > 0) — extending live ranges outside loops 19 * is unprofitable. 20 * - Skip zero imms — the emitter has zero-reg fast paths (e.g. `strb wzr`) 21 * that beat holding 0 in an allocated register. 22 * - Skip imms the target says it can fold (`imm_legal`) — those produce no 23 * materialize in the loop and lifting them would just lengthen live ranges. 24 * 25 * Must run after opt_machinize_native and opt_build_loop_tree (reads block 26 * loop_depth and target->imm_legal), and before opt_hoist_loop_consts (which 27 * does the hoisting). */ 28 #include <kit/cg.h> 29 #include <string.h> 30 31 #include "opt/opt_internal.h" 32 33 typedef enum LowerImmKind { 34 LOWER_IMM_NONE, 35 LOWER_IMM_BINOP, /* foldable iff imm_legal(NATIVE_IMM_BINOP, ...) */ 36 LOWER_IMM_CMP, /* foldable iff imm_legal(NATIVE_IMM_CMP, ...) */ 37 LOWER_IMM_STORE, /* never foldable as inline imm; emit materializes always */ 38 } LowerImmKind; 39 40 /* For each instruction shape we care about, return which operand index holds 41 * the imm-bearing source, and what use-kind/sub-op to pass to imm_legal. */ 42 static u32 inst_imm_slot(const Inst* in, LowerImmKind* kind_out, u32* sub_out) { 43 switch ((IROp)in->op) { 44 case IR_BINOP: 45 *kind_out = LOWER_IMM_BINOP; 46 *sub_out = (u32)in->extra.imm; 47 return 2u; 48 case IR_CMP: 49 *kind_out = LOWER_IMM_CMP; 50 *sub_out = (u32)in->extra.imm; 51 return 2u; 52 case IR_CMP_BRANCH: 53 *kind_out = LOWER_IMM_CMP; 54 *sub_out = (u32)in->extra.imm; 55 return 1u; 56 case IR_STORE: 57 *kind_out = LOWER_IMM_STORE; 58 *sub_out = 0u; 59 return 1u; 60 default: 61 *kind_out = LOWER_IMM_NONE; 62 *sub_out = 0u; 63 return 0u; 64 } 65 } 66 67 static int operand_should_lift(NativeTarget* target, const Operand* op, 68 LowerImmKind kind, u32 sub) { 69 if (op->kind != OPK_IMM) return 0; 70 if (op->v.imm == 0) return 0; 71 switch (kind) { 72 case LOWER_IMM_BINOP: 73 case LOWER_IMM_CMP: { 74 NativeImmUse use = 75 (kind == LOWER_IMM_BINOP) ? NATIVE_IMM_BINOP : NATIVE_IMM_CMP; 76 if (target->imm_legal && 77 target->imm_legal(target, use, sub, op->type, op->v.imm)) 78 return 0; 79 return 1; 80 } 81 case LOWER_IMM_STORE: 82 return 1; /* emitter has no fold path for store value imms */ 83 case LOWER_IMM_NONE: 84 default: 85 return 0; 86 } 87 } 88 89 /* Count how many imm operands in `bl` are lift candidates, so we can grow the 90 * block's inst array once rather than per-lift. */ 91 static u32 count_lifts_in_block(Func* f, Block* bl, NativeTarget* target) { 92 u32 n = 0; 93 (void)f; 94 for (u32 i = 0; i < bl->ninsts; ++i) { 95 Inst* in = &bl->insts[i]; 96 LowerImmKind kind; 97 u32 sub; 98 u32 slot = inst_imm_slot(in, &kind, &sub); 99 if (kind == LOWER_IMM_NONE) continue; 100 if (slot >= in->nopnds) continue; 101 if (operand_should_lift(target, &in->opnds[slot], kind, sub)) ++n; 102 } 103 return n; 104 } 105 106 void opt_lower_loop_imm_operands(Func* f, NativeTarget* target) { 107 if (!f || f->opt_reg_ssa || f->opt_rewritten) return; 108 if (!target) return; 109 if (f->nblocks == 0) return; 110 111 /* Process blocks one at a time, inserting per-block in a single pre-grown 112 * batch. Each lift produces one IR_LOAD_IMM directly before its consumer. */ 113 for (u32 b = 0; b < f->nblocks; ++b) { 114 Block* bl = &f->blocks[b]; 115 if (bl->loop_depth == 0) continue; 116 u32 nlifts = count_lifts_in_block(f, bl, target); 117 if (!nlifts) continue; 118 119 /* Walk forward; for each lift, insert one IR_LOAD_IMM before the consumer 120 * and rewrite the operand. After insert_at the consumer slides down by 1, 121 * so re-fetch via index. We use opt_block_insert_at one slot at a time; 122 * cap-doubling keeps total work O(ninsts + nlifts). */ 123 for (u32 i = 0; i < bl->ninsts;) { 124 Inst* in = &bl->insts[i]; 125 LowerImmKind kind; 126 u32 sub; 127 u32 slot = inst_imm_slot(in, &kind, &sub); 128 if (kind == LOWER_IMM_NONE || slot >= in->nopnds) { 129 ++i; 130 continue; 131 } 132 Operand* op = &in->opnds[slot]; 133 if (!operand_should_lift(target, op, kind, sub)) { 134 ++i; 135 continue; 136 } 137 /* Snapshot the imm operand before the insert moves memory around. */ 138 KitCgTypeId type = op->type; 139 u8 cls = op->cls; 140 i64 imm = op->v.imm; 141 142 PReg p = ir_alloc_preg(f, type, cls); 143 Inst* slot_in = opt_block_insert_at(f, bl, i, 1u); 144 slot_in->op = (u16)IR_LOAD_IMM; 145 slot_in->def = (Val)p; 146 slot_in->type = type; 147 slot_in->extra.imm = imm; 148 slot_in->loc = bl->insts[i + 1u].loc; 149 slot_in->nopnds = 1u; 150 slot_in->opnds = arena_array(f->arena, Operand, 1); 151 memset(&slot_in->opnds[0], 0, sizeof(Operand)); 152 slot_in->opnds[0].kind = OPK_REG; 153 slot_in->opnds[0].cls = cls; 154 slot_in->opnds[0].type = type; 155 slot_in->opnds[0].v.reg = p; 156 ir_assign_inst_id(f, slot_in); 157 158 /* The consumer is now at index i+1; rewrite its imm operand to OPK_REG. 159 */ 160 Inst* user = &bl->insts[i + 1u]; 161 Operand* uop = &user->opnds[slot]; 162 memset(uop, 0, sizeof(Operand)); 163 uop->kind = OPK_REG; 164 uop->cls = cls; 165 uop->type = type; 166 uop->v.reg = p; 167 168 i += 2u; 169 } 170 } 171 172 opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE); 173 }