# P1-amd64.M1pp -- P1v2 amd64 backend expressed in m1macro.
#
# Mirrors p1/P1-aarch64.M1pp. Native register mapping is backend-private;
# see the amd_reg_* table below. amd64 is variable-length, so every op
# emits its prefix bytes (REX / opcode) directly via the m1pp `!(…)`
# single-byte builtin; 4-byte immediates still go through `%(…)`.
#
# Hidden backend regs:
#   br      = r15 -- branch-target mechanism
#   scratch = r9  -- per-expansion scratch (e.g. rcx save slot for SHIFT)
#   rax           -- syscall number / return slot + retaddr spill
#   rbp           -- spill slot for rcx / rdx when SHIFT and DIV/REM need
#                    to preserve a3 / a2

# ---- Native register numbers --------------------------------------------
#
# Macros emit the 4-bit native regnum 0..15. Callers use `(& N 7)` for the
# ModRM/SIB low 3 bits and `(>> N 3)` for the REX high bit.

%macro amd_reg_a0()
7
%endm
%macro amd_reg_a1()
6
%endm
%macro amd_reg_a2()
2
%endm
%macro amd_reg_a3()
1
%endm
%macro amd_reg_t0()
10
%endm
%macro amd_reg_t1()
11
%endm
%macro amd_reg_t2()
8
%endm
%macro amd_reg_s0()
3
%endm
%macro amd_reg_s1()
12
%endm
%macro amd_reg_s2()
13
%endm
%macro amd_reg_s3()
14
%endm
%macro amd_reg_sp()
4
%endm
%macro amd_reg_rax()
0
%endm
%macro amd_reg_rcx()
1
%endm
%macro amd_reg_rdx()
2
%endm
%macro amd_reg_rbx()
3
%endm
%macro amd_reg_rsp()
4
%endm
%macro amd_reg_rbp()
5
%endm
%macro amd_reg_rsi()
6
%endm
%macro amd_reg_rdi()
7
%endm
%macro amd_reg_r8()
8
%endm
%macro amd_reg_r9()
9
%endm
%macro amd_reg_r10()
10
%endm
%macro amd_reg_r11()
11
%endm
%macro amd_reg_r12()
12
%endm
%macro amd_reg_r13()
13
%endm
%macro amd_reg_r14()
14
%endm
%macro amd_reg_r15()
15
%endm
%macro amd_reg_br()
15
%endm
%macro amd_reg_scratch()
9
%endm

%macro amd_reg(r)
%amd_reg_##r()
%endm

# Per-P1-name `is this sp?` predicate. Used by p1_mem to decide whether
# the supplied offset needs the +16 frame-header adjustment.

%macro amd_is_sp_a0()
0
%endm
%macro amd_is_sp_a1()
0
%endm
%macro amd_is_sp_a2()
0
%endm
%macro amd_is_sp_a3()
0
%endm
%macro amd_is_sp_t0()
0
%endm
%macro amd_is_sp_t1()
0
%endm
%macro amd_is_sp_t2()
0
%endm
%macro amd_is_sp_s0()
0
%endm
%macro amd_is_sp_s1()
0
%endm
%macro amd_is_sp_s2()
0
%endm
%macro amd_is_sp_s3()
0
%endm
%macro amd_is_sp_sp()
1
%endm

%macro amd_is_sp(r)
%amd_is_sp_##r()
%endm

# ---- REX / ModRM helpers ------------------------------------------------

# Short one-byte REX.B prefix (no W). Used by opcodes that don't need 64-bit
# width — push/pop/jmp r/call r/mov r,imm32 — when the target reg is r8-r15.
%macro amd_rex_b_short()
!(0x41)
%endm

# No-op sentinel for %select branches that shouldn't emit anything.
%macro amd_nobytes()
%endm

# Emit REX.B (0x41) iff r is r8-r15. Used by the short-prefix opcodes above.
%macro amd_maybe_rex_b(r)
%select((>= %amd_reg(r) 8),
        %amd_rex_b_short(),
        %amd_nobytes())
%endm

# REX.WB: W=1 for 64-bit, B=(r>>3) to extend ModRM.rm / SIB.base.
%macro amd_rex_wb(r)
!((| 0x48 (& (>> %amd_reg(r) 3) 1)))
%endm

# REX.WRB: W=1, R=(rg>>3), B=(rm>>3). Used whenever a ModRM.reg field is
# in use together with a ModRM.rm field.
%macro amd_rex_wrb(rg, rm)
!((| 0x48 (| (<< (& (>> %amd_reg(rg) 3) 1) 2) (& (>> %amd_reg(rm) 3) 1))))
%endm

# ModRM byte for register/register: mod=3, reg=rg low3, rm=rm low3.
%macro amd_modrm_rr(rg, rm)
!((| 0xC0 (| (<< (& %amd_reg(rg) 7) 3) (& %amd_reg(rm) 7))))
%endm

# ModRM /ext, rm: mod=3, reg=ext, rm=low3(rm). ext is 0..7.
%macro amd_modrm_ext_r(ext, rm)
!((| 0xC0 (| (<< ext 3) (& %amd_reg(rm) 7))))
%endm

# ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
#
# [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 —
# rsp and r12 — must go through a SIB byte; all others use the plain
# encoding. disp selects mod=1 (disp8) when it fits in [-128,127], else
# mod=2 (disp32). We never emit mod=0 / no-disp; the extra byte is fine.

%macro amd_modrm_disp8_plain(reg, base, disp)
!((| 0x40 (| (<< (& %amd_reg(reg) 7) 3) (& %amd_reg(base) 7))))
!((& disp 0xFF))
%endm

%macro amd_modrm_disp32_plain(reg, base, disp)
!((| 0x80 (| (<< (& %amd_reg(reg) 7) 3) (& %amd_reg(base) 7))))
%((& disp 0xFFFFFFFF))
%endm

%macro amd_modrm_disp8_sib(reg, disp)
!((| 0x44 (<< (& %amd_reg(reg) 7) 3)))
!(0x24)
!((& disp 0xFF))
%endm

%macro amd_modrm_disp32_sib(reg, disp)
!((| 0x84 (<< (& %amd_reg(reg) 7) 3)))
!(0x24)
%((& disp 0xFFFFFFFF))
%endm

%macro amd_modrm_disp_plain(reg, base, disp)
%select((>= disp -128),
        %select((<= disp 127),
                %amd_modrm_disp8_plain(reg, base, disp),
                %amd_modrm_disp32_plain(reg, base, disp)),
        %amd_modrm_disp32_plain(reg, base, disp))
%endm

%macro amd_modrm_disp_sib(reg, disp)
%select((>= disp -128),
        %select((<= disp 127),
                %amd_modrm_disp8_sib(reg, disp),
                %amd_modrm_disp32_sib(reg, disp)),
        %amd_modrm_disp32_sib(reg, disp))
%endm

%macro amd_modrm_disp(reg, base, disp)
%select((= (& %amd_reg(base) 7) 4),
        %amd_modrm_disp_sib(reg, disp),
        %amd_modrm_disp_plain(reg, base, disp))
%endm

# ---- Register / arithmetic primitives ----------------------------------

# mov dst, src -- 48 89 /r (modrm form: source in reg, dest in rm).
%macro amd_mov_rr(dst, src)
%amd_rex_wrb(src, dst)
!(0x89)
%amd_modrm_rr(src, dst)
%endm

# op dst, src for ADD/SUB/AND/OR/XOR (same shape, different opcode byte).
%macro amd_alu_rr(opcode, dst, src)
%amd_rex_wrb(src, dst)
!(opcode)
%amd_modrm_rr(src, dst)
%endm

# op dst, imm8 -- 48 83 /ext ib.
%macro amd_alu_ri8(ext, dst, imm)
%amd_rex_wb(dst)
!(0x83)
%amd_modrm_ext_r(ext, dst)
!((& imm 0xFF))
%endm

# op dst, imm32 -- 48 81 /ext id.
%macro amd_alu_ri32(ext, dst, imm)
%amd_rex_wb(dst)
!(0x81)
%amd_modrm_ext_r(ext, dst)
%((& imm 0xFFFFFFFF))
%endm

# shift dst, imm8 -- 48 C1 /ext ib. (ext: SHL=4, SHR=5, SAR=7)
%macro amd_shift_ri8(ext, dst, imm)
%amd_rex_wb(dst)
!(0xC1)
%amd_modrm_ext_r(ext, dst)
!((& imm 0x3F))
%endm

# shift dst, cl -- 48 D3 /ext.
%macro amd_shift_cl(ext, dst)
%amd_rex_wb(dst)
!(0xD3)
%amd_modrm_ext_r(ext, dst)
%endm

# imul dst, src -- 48 0F AF /r. The canonical form is IMUL r64, r/m64:
# dest in ModRM.reg, source in ModRM.rm — the OPPOSITE operand placement
# of the 89-family ops above, hence the swapped wrb/modrm argument order.
%macro amd_imul_rr(dst, src)
%amd_rex_wrb(dst, src)
!(0x0F)
!(0xAF)
%amd_modrm_rr(dst, src)
%endm

# idiv src -- 48 F7 /7.
%macro amd_idiv_r(src)
%amd_rex_wb(src)
!(0xF7)
%amd_modrm_ext_r(7, src)
%endm

# cqo -- 48 99 (sign-extend rax into rdx:rax).
%macro amd_cqo()
!(0x48)
!(0x99)
%endm

# push / pop r64. 50+r / 58+r; REX.B=0x41 if r8-r15.
%macro amd_push(r)
%amd_maybe_rex_b(r)
!((| 0x50 (& %amd_reg(r) 7)))
%endm

%macro amd_pop(r)
%amd_maybe_rex_b(r)
!((| 0x58 (& %amd_reg(r) 7)))
%endm

# mov r32, imm32 -- B8+r id. Low-register form skips REX; r8-r15 need
# REX.B=0x41. The 4-byte literal the caller emits is zero-extended into
# the full 64-bit register, matching the LA / LA_BR literal-pool contract.
%macro amd_mov_imm32_prefix(rd)
%amd_maybe_rex_b(rd)
!((| 0xB8 (& %amd_reg(rd) 7)))
%endm

# mov r64, imm64 -- REX.W [+ REX.B] B8+r followed by 8 bytes of literal.
%macro amd_mov_imm64_prefix(rd)
%amd_rex_wb(rd)
!((| 0xB8 (& %amd_reg(rd) 7)))
%endm

# ---- Memory ops ---------------------------------------------------------

# mov rT, [rN + off]    48 8B /r  modrm-with-disp
%macro amd_mem_LD(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x8B)
%amd_modrm_disp(rt, rn, off)
%endm

# mov [rN + off], rT    48 89 /r
%macro amd_mem_ST(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x89)
%amd_modrm_disp(rt, rn, off)
%endm

# mov [rN + off], rT8   48 88 /r  (the REX prefix — any REX, W is ignored
# by 8-bit opcodes — selects the dil/sil/bpl/spl encoding when the byte
# view of those regs is needed; we reuse the wrb helper for uniformity.)
%macro amd_mem_SB(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x88)
%amd_modrm_disp(rt, rn, off)
%endm

# movzx rT, byte ptr [rN + off] -- 48 0F B6 /r
%macro amd_mem_LB(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x0F)
!(0xB6)
%amd_modrm_disp(rt, rn, off)
%endm

# ---- Control flow primitives -------------------------------------------

# jmp r/m64   -- FF /4
# call r/m64  -- FF /2
# ret         -- C3
# syscall     -- 0F 05
# cmp rA, rB  -- 48 39 /r (modrm: rB in reg, rA in rm)
# test rA, rA -- 48 85 /r
# Jcc rel8    -- 7x ib

%macro amd_jmp_r(r)
%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xE0 (& %amd_reg(r) 7)))
%endm

%macro amd_call_r(r)
%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xD0 (& %amd_reg(r) 7)))
%endm

%macro amd_ret()
!(0xC3)
%endm

%macro amd_syscall()
!(0x0F)
!(0x05)
%endm

# cmp rA, rB -- 48 39 /r (modrm: rB in reg, rA in rm).
%macro amd_cmp_rr(ra, rb)
%amd_rex_wrb(rb, ra)
!(0x39)
%amd_modrm_rr(rb, ra)
%endm

%macro amd_test_rr(ra, rb)
%amd_rex_wrb(rb, ra)
!(0x85)
%amd_modrm_rr(rb, ra)
%endm

# ---- P1 register-register op lowering ----------------------------------
#
# For ADD/SUB/AND/OR/XOR we honor rD=rB aliasing — the naive `mov rD,rA ;
# op rD,rB` would clobber rB before the op reads it. Route rB through the
# scratch reg when that aliasing shows up.

%macro amd_rrr_simple_ADD(rd, ra, rb)
%amd_rrr_simple(0x01, rd, ra, rb)
%endm
%macro amd_rrr_simple_SUB(rd, ra, rb)
%amd_rrr_simple(0x29, rd, ra, rb)
%endm
%macro amd_rrr_simple_AND(rd, ra, rb)
%amd_rrr_simple(0x21, rd, ra, rb)
%endm
%macro amd_rrr_simple_OR(rd, ra, rb)
%amd_rrr_simple(0x09, rd, ra, rb)
%endm
%macro amd_rrr_simple_XOR(rd, ra, rb)
%amd_rrr_simple(0x31, rd, ra, rb)
%endm

%macro amd_rrr_simple(opcode, rd, ra, rb)
%select((= %amd_reg(rd) %amd_reg(rb)),
        %amd_rrr_simple_via_scratch(opcode, rd, ra, rb),
        %amd_rrr_simple_direct(opcode, rd, ra, rb))
%endm

%macro amd_rrr_simple_direct(opcode, rd, ra, rb)
%amd_mov_rr(rd, ra)
%amd_alu_rr(opcode, rd, rb)
%endm

%macro amd_rrr_simple_via_scratch(opcode, rd, ra, rb)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rd, ra)
%amd_alu_rr(opcode, rd, scratch)
%endm

%macro amd_rrr_MUL(rd, ra, rb)
%select((= %amd_reg(rd) %amd_reg(rb)),
        %amd_rrr_MUL_via_scratch(rd, ra, rb),
        %amd_rrr_MUL_direct(rd, ra, rb))
%endm
%macro amd_rrr_MUL_direct(rd, ra, rb)
%amd_mov_rr(rd, ra)
%amd_imul_rr(rd, rb)
%endm
%macro amd_rrr_MUL_via_scratch(rd, ra, rb)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rd, ra)
%amd_imul_rr(rd, scratch)
%endm

# DIV / REM clobber rax and rdx natively. rax is not a P1 register, so
# we clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside
# the P1 mapping) for the lifetime of the op.
#
# Aliasing-safety plan, same for DIV and REM:
#   1. rbp = rdx                  -- saved a2, also serves as "original rb
#                                    if rb == a2" via the scratch copy
#   2. scratch = rb               -- read rb while rdx still holds its
#                                    original value (in case rb == a2)
#   3. rax = ra                   -- ra == a2 reads original rdx for the
#                                    same reason; cqo hasn't run yet
#   4. cqo ; idiv scratch         -- divide
#   5. rdx = rbp (restore) BEFORE -- so `mov rd, rax/rdx` below can
#      writing rd                    legitimately overwrite rdx when
#                                    rd == a2 without losing the result
#   6. mov rd, rax                -- DIV quotient
#      or capture rdx -> rax first,
#      then rd = rax              -- REM remainder (capture dodges the
#                                    restore overwriting the remainder)

%macro amd_rrr_DIV(rd, ra, rb)
%amd_mov_rr(rbp, rdx)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rax, ra)
%amd_cqo()
%amd_idiv_r(scratch)
%amd_mov_rr(rdx, rbp)
%amd_mov_rr(rd, rax)
%endm

%macro amd_rrr_REM(rd, ra, rb)
%amd_mov_rr(rbp, rdx)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rax, ra)
%amd_cqo()
%amd_idiv_r(scratch)
%amd_mov_rr(rax, rdx)
%amd_mov_rr(rdx, rbp)
%amd_mov_rr(rd, rax)
%endm

# SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
# staging goes through rcx — which IS P1 a3. Save rcx to rbp for the
# duration.
#
# Ordering is load-bearing:
#   1. rbp = rcx                  -- save a3
#   2. scratch = ra               -- read ra BEFORE we overwrite rcx;
#                                    otherwise `ra == a3` reads the count
#                                    we just staged
#   3. rcx = rb                   -- count into cl
#   4. shift scratch, cl          -- do the work
#   5. rcx = rbp (restore) BEFORE -- so `mov rd, scratch` below can
#      writing rd                    legitimately overwrite rcx when
#                                    rd == a3 without losing the result
#   6. mov rd, scratch

%macro amd_rrr_SHL(rd, ra, rb)
%amd_rrr_shift(4, rd, ra, rb)
%endm
%macro amd_rrr_SHR(rd, ra, rb)
%amd_rrr_shift(5, rd, ra, rb)
%endm
%macro amd_rrr_SAR(rd, ra, rb)
%amd_rrr_shift(7, rd, ra, rb)
%endm

%macro amd_rrr_shift(ext, rd, ra, rb)
%amd_mov_rr(rbp, rcx)
%amd_mov_rr(scratch, ra)
%amd_mov_rr(rcx, rb)
%amd_shift_cl(ext, scratch)
%amd_mov_rr(rcx, rbp)
%amd_mov_rr(rd, scratch)
%endm

%macro amd_rrr_op(op, rd, ra, rb)
%amd_rrr_##op(rd, ra, rb)
%endm

%macro amd_rrr_ADD(rd, ra, rb)
%amd_rrr_simple_ADD(rd, ra, rb)
%endm
%macro amd_rrr_SUB(rd, ra, rb)
%amd_rrr_simple_SUB(rd, ra, rb)
%endm
%macro amd_rrr_AND(rd, ra, rb)
%amd_rrr_simple_AND(rd, ra, rb)
%endm
%macro amd_rrr_OR(rd, ra, rb)
%amd_rrr_simple_OR(rd, ra, rb)
%endm
%macro amd_rrr_XOR(rd, ra, rb)
%amd_rrr_simple_XOR(rd, ra, rb)
%endm

# ---- P1 operation lowering ---------------------------------------------

%macro p1_li(rd)
%amd_mov_imm64_prefix(rd)
%endm

%macro p1_la(rd)
%amd_mov_imm32_prefix(rd)
%endm

%macro p1_labr()
%amd_mov_imm32_prefix(br)
%endm

%macro p1_mov(rd, rs)
%p1_mov_##rs(rd)
%endm

# All non-sp sources: plain register copy.
%macro p1_mov_a0(rd)
%amd_mov_rr(rd, a0)
%endm
%macro p1_mov_a1(rd)
%amd_mov_rr(rd, a1)
%endm
%macro p1_mov_a2(rd)
%amd_mov_rr(rd, a2)
%endm
%macro p1_mov_a3(rd)
%amd_mov_rr(rd, a3)
%endm
%macro p1_mov_t0(rd)
%amd_mov_rr(rd, t0)
%endm
%macro p1_mov_t1(rd)
%amd_mov_rr(rd, t1)
%endm
%macro p1_mov_t2(rd)
%amd_mov_rr(rd, t2)
%endm
%macro p1_mov_s0(rd)
%amd_mov_rr(rd, s0)
%endm
%macro p1_mov_s1(rd)
%amd_mov_rr(rd, s1)
%endm
%macro p1_mov_s2(rd)
%amd_mov_rr(rd, s2)
%endm
%macro p1_mov_s3(rd)
%amd_mov_rr(rd, s3)
%endm

# sp-source: portable sp is the frame-local base, which is native rsp + 16
# (the 16-byte backend-private frame header sits at [rsp+0..rsp+15]).
# Emit `mov rd, rsp ; add rd, 16`.
%macro p1_mov_sp(rd)
%amd_mov_rr(rd, sp)
%amd_alu_ri8(0, rd, 16)
%endm

%macro p1_rrr(op, rd, ra, rb)
%amd_rrr_op(op, rd, ra, rb)
%endm

%macro p1_addi(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(0, rd, imm),
                %amd_alu_ri32(0, rd, imm)),
        %amd_alu_ri32(0, rd, imm))
%endm

# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
# for positive imms >= 128 — ANDI with 255 would become AND with
# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
%macro p1_logi_ANDI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(4, rd, imm),
                %amd_alu_ri32(4, rd, imm)),
        %amd_alu_ri32(4, rd, imm))
%endm
%macro p1_logi_ORI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(1, rd, imm),
                %amd_alu_ri32(1, rd, imm)),
        %amd_alu_ri32(1, rd, imm))
%endm
%macro p1_logi(op, rd, ra, imm)
%p1_logi_##op(rd, ra, imm)
%endm

%macro p1_shifti_SHLI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(4, rd, imm)
%endm
%macro p1_shifti_SHRI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(5, rd, imm)
%endm
%macro p1_shifti_SARI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(7, rd, imm)
%endm
%macro p1_shifti(op, rd, ra, imm)
%p1_shifti_##op(rd, ra, imm)
%endm

# p1_mem -- portable-offset memory access. When the base is sp, portable
# sp is the frame-local base (16 bytes above native rsp), so the physical
# access needs the supplied offset plus 16. For any other base, portable
# and native offsets coincide. Internal backend callers that need raw
# native-rsp access (p1_enter, p1_eret, _start stub, p1_ldarg, p1_syscall)
# use amd_mem_LD/amd_mem_ST directly and bypass this translation.

%macro p1_mem(op, rt, rn, off)
%select((= %amd_is_sp(rn) 1),
        %amd_mem_##op(rt, rn, (+ off 16)),
        %amd_mem_##op(rt, rn, off))
%endm

%macro p1_ldarg(rd, slot)
%amd_mem_LD(scratch, sp, 8)
%amd_mem_LD(rd, scratch, (+ 16 (* 8 slot)))
%endm

%macro p1_b()
%amd_jmp_r(br)
%endm

%macro p1_br(rs)
%amd_jmp_r(rs)
%endm

%macro p1_call()
%amd_call_r(br)
%endm

%macro p1_callr(rs)
%amd_call_r(rs)
%endm

%macro p1_ret()
%amd_ret()
%endm

# ERET -- atomic frame epilogue + return from a framed function.
#   r9  = [rsp + 0] -- retaddr into scratch (native rsp; backend-private)
#   rax = [rsp + 8] -- saved caller sp into rax (an unused native reg)
#   rsp = rax       -- unwind to caller sp
#   push r9         -- reinstall retaddr so the trailing ret returns
#                      correctly
#   ret             -- pop reinstated retaddr into rip
%macro p1_eret()
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_ret()
%endm

# TAIL / TAILR -- frame epilogue followed by an unconditional jump to the
# target. The epilogue is the same sequence as the first four steps of
# p1_eret (we omit the trailing ret because we jmp to a fresh target
# instead).
%macro p1_tail()
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_jmp_r(br)
%endm

%macro p1_tailr(rs)
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_jmp_r(rs)
%endm

# Conditional-branch lowering:
#   compare / test
#   Jcc_inverse +3   skip the 3-byte `jmp r15`
#   jmp r15          P1 branch-taken path
#
# Invert codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D), BLTU->JAE(73),
# BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74).

%macro p1_condb_BEQ(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x75)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BNE(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x74)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BLT(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x7D)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BLTU(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x73)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb(op, ra, rb)
%p1_condb_##op(ra, rb)
%endm

%macro p1_condbz_BEQZ(ra)
%amd_test_rr(ra, ra)
!(0x75)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz_BNEZ(ra)
%amd_test_rr(ra, ra)
!(0x74)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz_BLTZ(ra)
%amd_test_rr(ra, ra)
!(0x7D)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz(op, ra)
%p1_condbz_##op(ra)
%endm

# ENTER size
#
# CALL on amd64 pushed the retaddr, so on entry:
#   rsp   = caller_sp - 8
#   [rsp] = retaddr
#
# We want the standard frame:
#   [sp + 0]                   = retaddr
#   [sp + 8]                   = saved caller_sp
#   [sp + 16 .. 16 + size - 1] = locals
#   total frame = round_up(16 + size, 16)
#
# Pop retaddr into scratch, save caller_sp into rax (unused by P1),
# allocate frame, restore retaddr at [sp], store caller_sp at [sp+8].
%macro p1_enter(size)
%amd_pop(scratch)
%amd_mov_rr(rax, sp)
%amd_alu_ri32(5, sp, (& (+ (+ 16 size) 15) -16))
%amd_mem_ST(scratch, sp, 0)
%amd_mem_ST(rax, sp, 8)
%endm

%macro p1_entry()
# :_start stub per the P1v2 program-entry model. Linux amd64 puts argc
# at [rsp] and argv starting at [rsp+8]. Load argc into a0 (rdi),
# compute &argv[0] into a1 (rsi), call p1_main under the one-word
# direct-result convention, then issue sys_exit with p1_main's return
# value in a0.
:_start
%amd_mem_LD(a0, sp, 0)
%amd_mov_rr(a1, sp)
%amd_alu_ri8(0, a1, 8)
%amd_mov_imm32_prefix(br)
&p1_main
%amd_call_r(br)
# mov eax, 60 (sys_exit); syscall. P1 a0 (native rdi) already holds
# p1_main's return value.
!(0xB8)
%(60)
!(0x0F)
!(0x05)
%endm

%macro p1_syscall()
# P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
# rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
# clobbers rcx and r11.
#
# Plan: push the P1 registers whose native slots get overwritten or
# syscall-clobbered — rsi (a1), rdx (a2), rcx (a3), r11 (t1), r8 (t2) —
# then shuffle into the native slots, issue syscall, restore, and move
# the return value (rax) into a0 (rdi).
%amd_push(rsi)
%amd_push(rdx)
%amd_push(rcx)
%amd_push(r11)
%amd_push(r8)

%amd_mov_rr(rax, rdi)
%amd_mem_LD(rdi, sp, 32)
%amd_mem_LD(rsi, sp, 24)
%amd_mem_LD(rdx, sp, 16)
%amd_mov_rr(r8, rbx)
%amd_mov_rr(r9, r12)

!(0x0F)
!(0x05)

%amd_pop(r8)
%amd_pop(r11)
%amd_pop(rcx)
%amd_pop(rdx)
%amd_pop(rsi)

%amd_mov_rr(rdi, rax)
%endm

# ---- Linux amd64 syscall number data words ------------------------------

%macro p1_sys_read()
$(0)
%endm
%macro p1_sys_write()
$(1)
%endm
%macro p1_sys_close()
$(3)
%endm
%macro p1_sys_openat()
$(257)
%endm
%macro p1_sys_exit()
$(60)
%endm
%macro p1_sys_clone()
$(56)
%endm
%macro p1_sys_execve()
$(59)
%endm
%macro p1_sys_waitid()
$(247)
%endm