pass_lower_loop_imm.c - kit

pass_lower_loop_imm.c (6223B)
      1 /* O1 pre-hoist lowering for loop-body immediate operands.
      2  *
      3  * `opt_hoist_loop_consts` consolidates duplicate IR_LOAD_IMM defs of the same
      4  * constant out of a loop. But many constants never become IR_LOAD_IMM defs:
      5  * they arrive at the emitter as inline `imm:` operands on IR_BINOP / IR_CMP /
      6  * IR_CMP_BRANCH / IR_STORE. If the backend can fold the immediate into the
      7  * instruction (e.g. `add x, y, #1`), the emitter keeps it inline and there is
      8  * nothing to hoist. If it cannot (e.g. `mul x, y, #5` on aarch64), the emitter
      9  * materializes the constant into a scratch register inside the loop every
     10  * iteration.
     11  *
     12  * This pass walks loop-body instructions and rewrites those non-foldable inline
     13  * imm operands to a fresh IR_LOAD_IMM + OPK_REG use so the existing hoister
     14  * picks them up. Stores of non-zero imms are also lowered (the emitter has a
     15  * zero-source fast path but materializes every other inline imm).
     16  *
     17  * Guards:
     18  *   - Only loop bodies (loop_depth > 0) — extending live ranges outside loops
     19  *     is unprofitable.
     20  *   - Skip zero imms — the emitter has zero-reg fast paths (e.g. `strb wzr`)
     21  *     that beat holding 0 in an allocated register.
     22  *   - Skip imms the target says it can fold (`imm_legal`) — those produce no
     23  *     materialize in the loop and lifting them would just lengthen live ranges.
     24  *
     25  * Must run after opt_machinize_native and opt_build_loop_tree (reads block
     26  * loop_depth and target->imm_legal), and before opt_hoist_loop_consts (which
     27  * does the hoisting). */
     28 #include <kit/cg.h>
     29 #include <string.h>
     30 
     31 #include "opt/opt_internal.h"
     32 
     33 typedef enum LowerImmKind {
     34   LOWER_IMM_NONE,
     35   LOWER_IMM_BINOP, /* foldable iff imm_legal(NATIVE_IMM_BINOP, ...) */
     36   LOWER_IMM_CMP,   /* foldable iff imm_legal(NATIVE_IMM_CMP, ...) */
     37   LOWER_IMM_STORE, /* never foldable as inline imm; emit materializes always */
     38 } LowerImmKind;
     39 
     40 /* For each instruction shape we care about, return which operand index holds
     41  * the imm-bearing source, and what use-kind/sub-op to pass to imm_legal. */
     42 static u32 inst_imm_slot(const Inst* in, LowerImmKind* kind_out, u32* sub_out) {
     43   switch ((IROp)in->op) {
     44     case IR_BINOP:
     45       *kind_out = LOWER_IMM_BINOP;
     46       *sub_out = (u32)in->extra.imm;
     47       return 2u;
     48     case IR_CMP:
     49       *kind_out = LOWER_IMM_CMP;
     50       *sub_out = (u32)in->extra.imm;
     51       return 2u;
     52     case IR_CMP_BRANCH:
     53       *kind_out = LOWER_IMM_CMP;
     54       *sub_out = (u32)in->extra.imm;
     55       return 1u;
     56     case IR_STORE:
     57       *kind_out = LOWER_IMM_STORE;
     58       *sub_out = 0u;
     59       return 1u;
     60     default:
     61       *kind_out = LOWER_IMM_NONE;
     62       *sub_out = 0u;
     63       return 0u;
     64   }
     65 }
     66 
     67 static int operand_should_lift(NativeTarget* target, const Operand* op,
     68                                LowerImmKind kind, u32 sub) {
     69   if (op->kind != OPK_IMM) return 0;
     70   if (op->v.imm == 0) return 0;
     71   switch (kind) {
     72     case LOWER_IMM_BINOP:
     73     case LOWER_IMM_CMP: {
     74       NativeImmUse use =
     75           (kind == LOWER_IMM_BINOP) ? NATIVE_IMM_BINOP : NATIVE_IMM_CMP;
     76       if (target->imm_legal &&
     77           target->imm_legal(target, use, sub, op->type, op->v.imm))
     78         return 0;
     79       return 1;
     80     }
     81     case LOWER_IMM_STORE:
     82       return 1; /* emitter has no fold path for store value imms */
     83     case LOWER_IMM_NONE:
     84     default:
     85       return 0;
     86   }
     87 }
     88 
     89 /* Count how many imm operands in `bl` are lift candidates, so we can grow the
     90  * block's inst array once rather than per-lift. */
     91 static u32 count_lifts_in_block(Func* f, Block* bl, NativeTarget* target) {
     92   u32 n = 0;
     93   (void)f;
     94   for (u32 i = 0; i < bl->ninsts; ++i) {
     95     Inst* in = &bl->insts[i];
     96     LowerImmKind kind;
     97     u32 sub;
     98     u32 slot = inst_imm_slot(in, &kind, &sub);
     99     if (kind == LOWER_IMM_NONE) continue;
    100     if (slot >= in->nopnds) continue;
    101     if (operand_should_lift(target, &in->opnds[slot], kind, sub)) ++n;
    102   }
    103   return n;
    104 }
    105 
    106 void opt_lower_loop_imm_operands(Func* f, NativeTarget* target) {
    107   if (!f || f->opt_reg_ssa || f->opt_rewritten) return;
    108   if (!target) return;
    109   if (f->nblocks == 0) return;
    110 
    111   /* Process blocks one at a time, inserting per-block in a single pre-grown
    112    * batch. Each lift produces one IR_LOAD_IMM directly before its consumer. */
    113   for (u32 b = 0; b < f->nblocks; ++b) {
    114     Block* bl = &f->blocks[b];
    115     if (bl->loop_depth == 0) continue;
    116     u32 nlifts = count_lifts_in_block(f, bl, target);
    117     if (!nlifts) continue;
    118 
    119     /* Walk forward; for each lift, insert one IR_LOAD_IMM before the consumer
    120      * and rewrite the operand. After insert_at the consumer slides down by 1,
    121      * so re-fetch via index. We use opt_block_insert_at one slot at a time;
    122      * cap-doubling keeps total work O(ninsts + nlifts). */
    123     for (u32 i = 0; i < bl->ninsts;) {
    124       Inst* in = &bl->insts[i];
    125       LowerImmKind kind;
    126       u32 sub;
    127       u32 slot = inst_imm_slot(in, &kind, &sub);
    128       if (kind == LOWER_IMM_NONE || slot >= in->nopnds) {
    129         ++i;
    130         continue;
    131       }
    132       Operand* op = &in->opnds[slot];
    133       if (!operand_should_lift(target, op, kind, sub)) {
    134         ++i;
    135         continue;
    136       }
    137       /* Snapshot the imm operand before the insert moves memory around. */
    138       KitCgTypeId type = op->type;
    139       u8 cls = op->cls;
    140       i64 imm = op->v.imm;
    141 
    142       PReg p = ir_alloc_preg(f, type, cls);
    143       Inst* slot_in = opt_block_insert_at(f, bl, i, 1u);
    144       slot_in->op = (u16)IR_LOAD_IMM;
    145       slot_in->def = (Val)p;
    146       slot_in->type = type;
    147       slot_in->extra.imm = imm;
    148       slot_in->loc = bl->insts[i + 1u].loc;
    149       slot_in->nopnds = 1u;
    150       slot_in->opnds = arena_array(f->arena, Operand, 1);
    151       memset(&slot_in->opnds[0], 0, sizeof(Operand));
    152       slot_in->opnds[0].kind = OPK_REG;
    153       slot_in->opnds[0].cls = cls;
    154       slot_in->opnds[0].type = type;
    155       slot_in->opnds[0].v.reg = p;
    156       ir_assign_inst_id(f, slot_in);
    157 
    158       /* The consumer is now at index i+1; rewrite its imm operand to OPK_REG.
    159        */
    160       Inst* user = &bl->insts[i + 1u];
    161       Operand* uop = &user->opnds[slot];
    162       memset(uop, 0, sizeof(Operand));
    163       uop->kind = OPK_REG;
    164       uop->cls = cls;
    165       uop->type = type;
    166       uop->v.reg = p;
    167 
    168       i += 2u;
    169     }
    170   }
    171 
    172   opt_analysis_invalidate(f, OPT_ANALYSIS_DEF_USE);
    173 }
	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README