# P1-amd64.M1pp -- P1v2 amd64 backend expressed in m1macro.
#
# Mirrors p1/P1-aarch64.M1pp. Native register mapping is backend-private;
# see the amd_reg_* table below. amd64 is variable-length, so every op
# emits its prefix bytes (REX / opcode) directly via the m1pp `!(…)`
# single-byte builtin; 4-byte immediates still go through `%(…)`.
#
# Hidden backend regs:
#   br      = r15 -- branch-target mechanism
#   scratch = r9  -- per-expansion scratch (e.g. rcx save slot for SHIFT)
#   rax           -- syscall number / return slot + retaddr spill
#   rbp           -- spill slot for rcx / rdx when SHIFT and DIV/REM need
#                    to preserve a3 / a2

# ---- Native register numbers --------------------------------------------
#
# Macros emit the 4-bit native regnum 0..15. Callers use `(& N 7)` for the
# ModRM/SIB low 3 bits and `(>> N 3)` for the REX high bit.

%macro amd_reg_a0()
7
%endm
%macro amd_reg_a1()
6
%endm
%macro amd_reg_a2()
2
%endm
%macro amd_reg_a3()
1
%endm
%macro amd_reg_t0()
10
%endm
%macro amd_reg_t1()
11
%endm
%macro amd_reg_t2()
8
%endm
%macro amd_reg_s0()
3
%endm
%macro amd_reg_s1()
12
%endm
%macro amd_reg_s2()
13
%endm
%macro amd_reg_s3()
14
%endm
%macro amd_reg_sp()
4
%endm
%macro amd_reg_rax()
0
%endm
%macro amd_reg_rcx()
1
%endm
%macro amd_reg_rdx()
2
%endm
%macro amd_reg_rbx()
3
%endm
%macro amd_reg_rsp()
4
%endm
%macro amd_reg_rbp()
5
%endm
%macro amd_reg_rsi()
6
%endm
%macro amd_reg_rdi()
7
%endm
%macro amd_reg_r8()
8
%endm
%macro amd_reg_r9()
9
%endm
%macro amd_reg_r10()
10
%endm
%macro amd_reg_r11()
11
%endm
%macro amd_reg_r12()
12
%endm
%macro amd_reg_r13()
13
%endm
%macro amd_reg_r14()
14
%endm
%macro amd_reg_r15()
15
%endm
%macro amd_reg_br()
15
%endm
%macro amd_reg_scratch()
9
%endm

%macro amd_reg(r)
%amd_reg_##r()
%endm

# Per-P1-name `is this sp?` predicate. Used by p1_mem to decide whether
# the supplied offset needs the +16 frame-header adjustment.

%macro amd_is_sp_a0()
0
%endm
%macro amd_is_sp_a1()
0
%endm
%macro amd_is_sp_a2()
0
%endm
%macro amd_is_sp_a3()
0
%endm
%macro amd_is_sp_t0()
0
%endm
%macro amd_is_sp_t1()
0
%endm
%macro amd_is_sp_t2()
0
%endm
%macro amd_is_sp_s0()
0
%endm
%macro amd_is_sp_s1()
0
%endm
%macro amd_is_sp_s2()
0
%endm
%macro amd_is_sp_s3()
0
%endm
%macro amd_is_sp_sp()
1
%endm

%macro amd_is_sp(r)
%amd_is_sp_##r()
%endm

# ---- REX / ModRM helpers ------------------------------------------------

# Short one-byte REX.B prefix (no W). Used by opcodes that don't need 64-bit
# width — push/pop/jmp r/call r/mov r,imm32 — when the target reg is r8-r15.
%macro amd_rex_b_short()
!(0x41)
%endm

# No-op sentinel for %select branches that shouldn't emit anything.
%macro amd_nobytes()
%endm

# Emit REX.B (0x41) iff r is r8-r15. Used by the short-prefix opcodes above.
%macro amd_maybe_rex_b(r)
%select((>= %amd_reg(r) 8),
        %amd_rex_b_short(),
        %amd_nobytes())
%endm

# REX.WB: W=1 for 64-bit, B=(r>>3) to extend ModRM.rm / SIB.base.
%macro amd_rex_wb(r)
!((| 0x48 (& (>> %amd_reg(r) 3) 1)))
%endm

# REX.WRB: W=1, R=(rg>>3), B=(rm>>3). Used whenever a ModRM.reg field is
# in use together with a ModRM.rm field.
%macro amd_rex_wrb(rg, rm)
!((| 0x48 (| (<< (& (>> %amd_reg(rg) 3) 1) 2) (& (>> %amd_reg(rm) 3) 1))))
%endm

# ModRM byte for register/register: mod=3, reg=rg low3, rm=rm low3.
%macro amd_modrm_rr(rg, rm)
!((| 0xC0 (| (<< (& %amd_reg(rg) 7) 3) (& %amd_reg(rm) 7))))
%endm

# ModRM /ext, rm: mod=3, reg=ext, rm=low3(rm). ext is 0..7.
%macro amd_modrm_ext_r(ext, rm)
!((| 0xC0 (| (<< ext 3) (& %amd_reg(rm) 7))))
%endm

# ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
#
# [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 —
# rsp and r12 — must go through a SIB byte; all others use the plain
# encoding. disp selects mod=1 (disp8) when it fits in [-128,127], else
# mod=2 (disp32). We never emit mod=0 / no-disp; the extra byte is fine.

%macro amd_modrm_disp8_plain(reg, base, disp)
!((| 0x40 (| (<< (& %amd_reg(reg) 7) 3) (& %amd_reg(base) 7))))
!((& disp 0xFF))
%endm

%macro amd_modrm_disp32_plain(reg, base, disp)
!((| 0x80 (| (<< (& %amd_reg(reg) 7) 3) (& %amd_reg(base) 7))))
%((& disp 0xFFFFFFFF))
%endm

%macro amd_modrm_disp8_sib(reg, disp)
!((| 0x44 (<< (& %amd_reg(reg) 7) 3)))
!(0x24)
!((& disp 0xFF))
%endm

%macro amd_modrm_disp32_sib(reg, disp)
!((| 0x84 (<< (& %amd_reg(reg) 7) 3)))
!(0x24)
%((& disp 0xFFFFFFFF))
%endm

%macro amd_modrm_disp_plain(reg, base, disp)
%select((>= disp -128),
        %select((<= disp 127),
                %amd_modrm_disp8_plain(reg, base, disp),
                %amd_modrm_disp32_plain(reg, base, disp)),
        %amd_modrm_disp32_plain(reg, base, disp))
%endm

%macro amd_modrm_disp_sib(reg, disp)
%select((>= disp -128),
        %select((<= disp 127),
                %amd_modrm_disp8_sib(reg, disp),
                %amd_modrm_disp32_sib(reg, disp)),
        %amd_modrm_disp32_sib(reg, disp))
%endm

%macro amd_modrm_disp(reg, base, disp)
%select((= (& %amd_reg(base) 7) 4),
        %amd_modrm_disp_sib(reg, disp),
        %amd_modrm_disp_plain(reg, base, disp))
%endm

# ---- Register / arithmetic primitives ----------------------------------

# mov dst, src -- 48 89 /r (modrm form: source in reg, dest in rm).
%macro amd_mov_rr(dst, src)
%amd_rex_wrb(src, dst)
!(0x89)
%amd_modrm_rr(src, dst)
%endm

# op dst, src for ADD/SUB/AND/OR/XOR (same shape, different opcode byte).
%macro amd_alu_rr(opcode, dst, src)
%amd_rex_wrb(src, dst)
!(opcode)
%amd_modrm_rr(src, dst)
%endm

# op dst, imm8 -- 48 83 /ext ib.
%macro amd_alu_ri8(ext, dst, imm)
%amd_rex_wb(dst)
!(0x83)
%amd_modrm_ext_r(ext, dst)
!((& imm 0xFF))
%endm

# op dst, imm32 -- 48 81 /ext id.
%macro amd_alu_ri32(ext, dst, imm)
%amd_rex_wb(dst)
!(0x81)
%amd_modrm_ext_r(ext, dst)
%((& imm 0xFFFFFFFF))
%endm

# shift dst, imm8 -- 48 C1 /ext ib. (ext: SHL=4, SHR=5, SAR=7)
%macro amd_shift_ri8(ext, dst, imm)
%amd_rex_wb(dst)
!(0xC1)
%amd_modrm_ext_r(ext, dst)
!((& imm 0x3F))
%endm

# shift dst, cl -- 48 D3 /ext.
%macro amd_shift_cl(ext, dst)
%amd_rex_wb(dst)
!(0xD3)
%amd_modrm_ext_r(ext, dst)
%endm

# imul dst, src -- 48 0F AF /r. The canonical form is IMUL r64, r/m64:
# dest in ModRM.reg, source in ModRM.rm — the OPPOSITE operand placement
# of the 89-family ops above, hence the swapped wrb/modrm argument order.
%macro amd_imul_rr(dst, src)
%amd_rex_wrb(dst, src)
!(0x0F)
!(0xAF)
%amd_modrm_rr(dst, src)
%endm

# idiv src -- 48 F7 /7.
%macro amd_idiv_r(src)
%amd_rex_wb(src)
!(0xF7)
%amd_modrm_ext_r(7, src)
%endm

# cqo -- 48 99 (sign-extend rax into rdx:rax).
%macro amd_cqo()
!(0x48)
!(0x99)
%endm

# push / pop r64. 50+r / 58+r; REX.B=0x41 if r8-r15.
%macro amd_push(r)
%amd_maybe_rex_b(r)
!((| 0x50 (& %amd_reg(r) 7)))
%endm

%macro amd_pop(r)
%amd_maybe_rex_b(r)
!((| 0x58 (& %amd_reg(r) 7)))
%endm

# mov r32, imm32 -- B8+r id. Low-register form skips REX; r8-r15 need
# REX.B=0x41. The 4-byte literal the caller emits is zero-extended into
# the full 64-bit register, matching the LA / LA_BR literal-pool contract.
%macro amd_mov_imm32_prefix(rd)
%amd_maybe_rex_b(rd)
!((| 0xB8 (& %amd_reg(rd) 7)))
%endm

# mov r64, imm64 -- REX.W [+ REX.B] B8+r followed by 8 bytes of literal.
%macro amd_mov_imm64_prefix(rd)
%amd_rex_wb(rd)
!((| 0xB8 (& %amd_reg(rd) 7)))
%endm

# ---- Memory ops ---------------------------------------------------------

# mov rT, [rN + off]    48 8B /r  modrm-with-disp
%macro amd_mem_LD(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x8B)
%amd_modrm_disp(rt, rn, off)
%endm

# mov [rN + off], rT    48 89 /r
%macro amd_mem_ST(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x89)
%amd_modrm_disp(rt, rn, off)
%endm

# mov [rN + off], rT8   48 88 /r  (the REX prefix — any REX, W is ignored
# by 8-bit opcodes — selects the dil/sil/bpl/spl encoding when the byte
# view of those regs is needed; we reuse the wrb helper for uniformity.)
%macro amd_mem_SB(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x88)
%amd_modrm_disp(rt, rn, off)
%endm

# movzx rT, byte ptr [rN + off] -- 48 0F B6 /r
%macro amd_mem_LB(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x0F)
!(0xB6)
%amd_modrm_disp(rt, rn, off)
%endm

# ---- Control flow primitives -------------------------------------------

# jmp r/m64   -- FF /4
# call r/m64  -- FF /2
# ret         -- C3
# syscall     -- 0F 05
# cmp rA, rB  -- 48 39 /r (modrm: rB in reg, rA in rm)
# test rA, rA -- 48 85 /r
# Jcc rel8    -- 7x ib

%macro amd_jmp_r(r)
%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xE0 (& %amd_reg(r) 7)))
%endm

%macro amd_call_r(r)
%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xD0 (& %amd_reg(r) 7)))
%endm

%macro amd_ret()
!(0xC3)
%endm

%macro amd_syscall()
!(0x0F)
!(0x05)
%endm

# cmp rA, rB -- 48 39 /r (modrm: rB in reg, rA in rm).
%macro amd_cmp_rr(ra, rb)
%amd_rex_wrb(rb, ra)
!(0x39)
%amd_modrm_rr(rb, ra)
%endm

%macro amd_test_rr(ra, rb)
%amd_rex_wrb(rb, ra)
!(0x85)
%amd_modrm_rr(rb, ra)
%endm

# ---- P1 register-register op lowering ----------------------------------
#
# For ADD/SUB/AND/OR/XOR we honor rD=rB aliasing — the naive `mov rD,rA ;
# op rD,rB` would clobber rB before the op reads it. Route rB through the
# scratch reg when that aliasing shows up.

%macro amd_rrr_simple_ADD(rd, ra, rb)
%amd_rrr_simple(0x01, rd, ra, rb)
%endm
%macro amd_rrr_simple_SUB(rd, ra, rb)
%amd_rrr_simple(0x29, rd, ra, rb)
%endm
%macro amd_rrr_simple_AND(rd, ra, rb)
%amd_rrr_simple(0x21, rd, ra, rb)
%endm
%macro amd_rrr_simple_OR(rd, ra, rb)
%amd_rrr_simple(0x09, rd, ra, rb)
%endm
%macro amd_rrr_simple_XOR(rd, ra, rb)
%amd_rrr_simple(0x31, rd, ra, rb)
%endm

%macro amd_rrr_simple(opcode, rd, ra, rb)
%select((= %amd_reg(rd) %amd_reg(rb)),
        %amd_rrr_simple_via_scratch(opcode, rd, ra, rb),
        %amd_rrr_simple_direct(opcode, rd, ra, rb))
%endm

%macro amd_rrr_simple_direct(opcode, rd, ra, rb)
%amd_mov_rr(rd, ra)
%amd_alu_rr(opcode, rd, rb)
%endm

%macro amd_rrr_simple_via_scratch(opcode, rd, ra, rb)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rd, ra)
%amd_alu_rr(opcode, rd, scratch)
%endm

%macro amd_rrr_MUL(rd, ra, rb)
%select((= %amd_reg(rd) %amd_reg(rb)),
        %amd_rrr_MUL_via_scratch(rd, ra, rb),
        %amd_rrr_MUL_direct(rd, ra, rb))
%endm
%macro amd_rrr_MUL_direct(rd, ra, rb)
%amd_mov_rr(rd, ra)
%amd_imul_rr(rd, rb)
%endm
%macro amd_rrr_MUL_via_scratch(rd, ra, rb)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rd, ra)
%amd_imul_rr(rd, scratch)
%endm

# DIV / REM clobber rax and rdx natively. rax is not a P1 register, so
# we clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside
# the P1 mapping) for the lifetime of the op.
#
# Aliasing-safety plan, same for DIV and REM:
#   1. rbp = rdx                  -- saved a2, also serves as "original rb
#                                    if rb == a2" via the scratch copy
#   2. scratch = rb               -- read rb while rdx still holds its
#                                    original value (in case rb == a2)
#   3. rax = ra                   -- ra == a2 reads original rdx for the
#                                    same reason; cqo hasn't run yet
#   4. cqo ; idiv scratch         -- divide
#   5. rdx = rbp (restore) BEFORE -- so `mov rd, rax/rdx` below can
#      writing rd                    legitimately overwrite rdx when
#                                    rd == a2 without losing the result
#   6. mov rd, rax                -- DIV quotient
#      or capture rdx -> rax first,
#      then rd = rax              -- REM remainder (capture dodges the
#                                    restore overwriting the remainder)

%macro amd_rrr_DIV(rd, ra, rb)
%amd_mov_rr(rbp, rdx)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rax, ra)
%amd_cqo()
%amd_idiv_r(scratch)
%amd_mov_rr(rdx, rbp)
%amd_mov_rr(rd, rax)
%endm

%macro amd_rrr_REM(rd, ra, rb)
%amd_mov_rr(rbp, rdx)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rax, ra)
%amd_cqo()
%amd_idiv_r(scratch)
%amd_mov_rr(rax, rdx)
%amd_mov_rr(rdx, rbp)
%amd_mov_rr(rd, rax)
%endm

# SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
# staging goes through rcx — which IS P1 a3. Save rcx to rbp for the
# duration.
#
# Ordering is load-bearing:
#   1. rbp = rcx                  -- save a3
#   2. scratch = ra               -- read ra BEFORE we overwrite rcx;
#                                    otherwise `ra == a3` reads the count
#                                    we just staged
#   3. rcx = rb                   -- count into cl
#   4. shift scratch, cl          -- do the work
#   5. rcx = rbp (restore) BEFORE -- so `mov rd, scratch` below can
#      writing rd                    legitimately overwrite rcx when
#                                    rd == a3 without losing the result
#   6. mov rd, scratch

%macro amd_rrr_SHL(rd, ra, rb)
%amd_rrr_shift(4, rd, ra, rb)
%endm
%macro amd_rrr_SHR(rd, ra, rb)
%amd_rrr_shift(5, rd, ra, rb)
%endm
%macro amd_rrr_SAR(rd, ra, rb)
%amd_rrr_shift(7, rd, ra, rb)
%endm

%macro amd_rrr_shift(ext, rd, ra, rb)
%amd_mov_rr(rbp, rcx)
%amd_mov_rr(scratch, ra)
%amd_mov_rr(rcx, rb)
%amd_shift_cl(ext, scratch)
%amd_mov_rr(rcx, rbp)
%amd_mov_rr(rd, scratch)
%endm

%macro amd_rrr_op(op, rd, ra, rb)
%amd_rrr_##op(rd, ra, rb)
%endm

%macro amd_rrr_ADD(rd, ra, rb)
%amd_rrr_simple_ADD(rd, ra, rb)
%endm
%macro amd_rrr_SUB(rd, ra, rb)
%amd_rrr_simple_SUB(rd, ra, rb)
%endm
%macro amd_rrr_AND(rd, ra, rb)
%amd_rrr_simple_AND(rd, ra, rb)
%endm
%macro amd_rrr_OR(rd, ra, rb)
%amd_rrr_simple_OR(rd, ra, rb)
%endm
%macro amd_rrr_XOR(rd, ra, rb)
%amd_rrr_simple_XOR(rd, ra, rb)
%endm

# ---- P1 operation lowering ---------------------------------------------

%macro p1_li(rd)
%amd_mov_imm64_prefix(rd)
%endm

%macro p1_la(rd)
%amd_mov_imm32_prefix(rd)
%endm

%macro p1_labr()
%amd_mov_imm32_prefix(br)
%endm

%macro p1_mov(rd, rs)
%p1_mov_##rs(rd)
%endm

# All non-sp sources: plain register copy.
%macro p1_mov_a0(rd)
%amd_mov_rr(rd, a0)
%endm
%macro p1_mov_a1(rd)
%amd_mov_rr(rd, a1)
%endm
%macro p1_mov_a2(rd)
%amd_mov_rr(rd, a2)
%endm
%macro p1_mov_a3(rd)
%amd_mov_rr(rd, a3)
%endm
%macro p1_mov_t0(rd)
%amd_mov_rr(rd, t0)
%endm
%macro p1_mov_t1(rd)
%amd_mov_rr(rd, t1)
%endm
%macro p1_mov_t2(rd)
%amd_mov_rr(rd, t2)
%endm
%macro p1_mov_s0(rd)
%amd_mov_rr(rd, s0)
%endm
%macro p1_mov_s1(rd)
%amd_mov_rr(rd, s1)
%endm
%macro p1_mov_s2(rd)
%amd_mov_rr(rd, s2)
%endm
%macro p1_mov_s3(rd)
%amd_mov_rr(rd, s3)
%endm

# sp-source: portable sp is the frame-local base, which is native rsp + 16
# (the 16-byte backend-private frame header sits at [rsp+0..rsp+15]).
# Emit `mov rd, rsp ; add rd, 16`.
%macro p1_mov_sp(rd)
%amd_mov_rr(rd, sp)
%amd_alu_ri8(0, rd, 16)
%endm

%macro p1_rrr(op, rd, ra, rb)
%amd_rrr_op(op, rd, ra, rb)
%endm

%macro p1_addi(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(0, rd, imm),
                %amd_alu_ri32(0, rd, imm)),
        %amd_alu_ri32(0, rd, imm))
%endm

# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
# for positive imms >= 128 — ANDI with 255 would become AND with
# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
%macro p1_logi_ANDI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(4, rd, imm),
                %amd_alu_ri32(4, rd, imm)),
        %amd_alu_ri32(4, rd, imm))
%endm
%macro p1_logi_ORI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(1, rd, imm),
                %amd_alu_ri32(1, rd, imm)),
        %amd_alu_ri32(1, rd, imm))
%endm
%macro p1_logi(op, rd, ra, imm)
%p1_logi_##op(rd, ra, imm)
%endm

%macro p1_shifti_SHLI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(4, rd, imm)
%endm
%macro p1_shifti_SHRI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(5, rd, imm)
%endm
%macro p1_shifti_SARI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(7, rd, imm)
%endm
%macro p1_shifti(op, rd, ra, imm)
%p1_shifti_##op(rd, ra, imm)
%endm

# p1_mem -- portable-offset memory access. When the base is sp, portable
# sp is the frame-local base (16 bytes above native rsp), so the physical
# access needs the supplied offset plus 16. For any other base, portable
# and native offsets coincide. Internal backend callers that need raw
# native-rsp access (p1_enter, p1_eret, _start stub, p1_ldarg, p1_syscall)
# use amd_mem_LD/amd_mem_ST directly and bypass this translation.

%macro p1_mem(op, rt, rn, off)
%select((= %amd_is_sp(rn) 1),
        %amd_mem_##op(rt, rn, (+ off 16)),
        %amd_mem_##op(rt, rn, off))
%endm

%macro p1_ldarg(rd, slot)
%amd_mem_LD(scratch, sp, 8)
%amd_mem_LD(rd, scratch, (+ 16 (* 8 slot)))
%endm

%macro p1_b()
%amd_jmp_r(br)
%endm

%macro p1_br(rs)
%amd_jmp_r(rs)
%endm

%macro p1_call()
%amd_call_r(br)
%endm

%macro p1_callr(rs)
%amd_call_r(rs)
%endm

%macro p1_ret()
%amd_ret()
%endm

# ERET -- atomic frame epilogue + return from a framed function.
#   r9  = [rsp + 0] -- retaddr into scratch (native rsp; backend-private)
#   rax = [rsp + 8] -- saved caller sp into rax (an unused native reg)
#   rsp = rax       -- unwind to caller sp
#   push r9         -- reinstall retaddr so the trailing ret returns
#                      correctly
#   ret             -- pop reinstated retaddr into rip
%macro p1_eret()
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_ret()
%endm

# TAIL / TAILR -- frame epilogue followed by an unconditional jump to the
# target. The epilogue is the same sequence as the first four steps of
# p1_eret (we omit the trailing ret because we jmp to a fresh target
# instead).
%macro p1_tail()
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_jmp_r(br)
%endm

%macro p1_tailr(rs)
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_jmp_r(rs)
%endm

# Conditional-branch lowering:
#   compare / test
#   Jcc_inverse +3   skip the 3-byte `jmp r15`
#   jmp r15          P1 branch-taken path
#
# Invert codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D), BLTU->JAE(73),
# BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74).

%macro p1_condb_BEQ(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x75)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BNE(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x74)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BLT(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x7D)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BLTU(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x73)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb(op, ra, rb)
%p1_condb_##op(ra, rb)
%endm

%macro p1_condbz_BEQZ(ra)
%amd_test_rr(ra, ra)
!(0x75)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz_BNEZ(ra)
%amd_test_rr(ra, ra)
!(0x74)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz_BLTZ(ra)
%amd_test_rr(ra, ra)
!(0x7D)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz(op, ra)
%p1_condbz_##op(ra)
%endm

# ENTER size
#
# CALL on amd64 pushed the retaddr, so on entry:
#   rsp   = caller_sp - 8
#   [rsp] = retaddr
#
# We want the standard frame:
#   [sp + 0]                   = retaddr
#   [sp + 8]                   = saved caller_sp
#   [sp + 16 .. 16 + size - 1] = locals
#   total frame = round_up(16 + size, 16)
#
# Pop retaddr into scratch, save caller_sp into rax (unused by P1),
# allocate frame, restore retaddr at [sp], store caller_sp at [sp+8].
%macro p1_enter(size)
%amd_pop(scratch)
%amd_mov_rr(rax, sp)
%amd_alu_ri32(5, sp, (& (+ (+ 16 size) 15) -16))
%amd_mem_ST(scratch, sp, 0)
%amd_mem_ST(rax, sp, 8)
%endm

%macro p1_entry()
# :_start stub per the P1v2 program-entry model. Linux amd64 puts argc
# at [rsp] and argv starting at [rsp+8]. Load argc into a0 (rdi),
# compute &argv[0] into a1 (rsi), call p1_main under the one-word
# direct-result convention, then issue sys_exit with p1_main's return
# value in a0.
:_start
%amd_mem_LD(a0, sp, 0)
%amd_mov_rr(a1, sp)
%amd_alu_ri8(0, a1, 8)
%amd_mov_imm32_prefix(br)
&p1_main
%amd_call_r(br)
# mov eax, 60 (sys_exit); syscall. P1 a0 (native rdi) already holds
# p1_main's return value.
!(0xB8)
%(60)
!(0x0F)
!(0x05)
%endm

%macro p1_syscall()
# P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
# rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
# clobbers rcx and r11.
#
# Plan: push the P1 registers whose native slots get overwritten or
# syscall-clobbered — rsi (a1), rdx (a2), rcx (a3), r11 (t1), r8 (t2) —
# then shuffle into the native slots, issue syscall, restore, and move
# the return value (rax) into a0 (rdi).
%amd_push(rsi)
%amd_push(rdx)
%amd_push(rcx)
%amd_push(r11)
%amd_push(r8)

%amd_mov_rr(rax, rdi)
%amd_mem_LD(rdi, sp, 32)
%amd_mem_LD(rsi, sp, 24)
%amd_mem_LD(rdx, sp, 16)
%amd_mov_rr(r8, rbx)
%amd_mov_rr(r9, r12)

!(0x0F)
!(0x05)

%amd_pop(r8)
%amd_pop(r11)
%amd_pop(rcx)
%amd_pop(rdx)
%amd_pop(rsi)

%amd_mov_rr(rdi, rax)
%endm

# ---- Linux amd64 syscall number data words ------------------------------

%macro p1_sys_read()
$(0)
%endm
%macro p1_sys_write()
$(1)
%endm
%macro p1_sys_close()
$(3)
%endm
%macro p1_sys_openat()
$(257)
%endm
%macro p1_sys_exit()
$(60)
%endm
%macro p1_sys_clone()
$(56)
%endm
%macro p1_sys_execve()
$(59)
%endm
%macro p1_sys_waitid()
$(247)
%endm