P1-amd64.M1pp (21013B)
# P1-amd64.M1pp -- P1 amd64 backend expressed in m1macro.
#
# Mirrors p1/P1-aarch64.M1pp. Native register mapping is backend-private;
# see the amd_reg_* table below. amd64 is variable-length, so every op
# emits its prefix bytes (REX / opcode) directly via the m1pp `!(…)`
# single-byte builtin; 4-byte immediates still go through `%(…)`.
#
# Hidden backend regs:
#   br      = r15 -- branch-target mechanism
#   scratch = r9  -- per-expansion scratch (e.g. rcx save slot for SHIFT)
#   rax           -- syscall number / return slot + retaddr spill
#   rbp           -- spill slot for rcx / rdx when SHIFT and DIV/REM need
#                    to preserve a3 / a2

# ---- Native register numbers --------------------------------------------
#
# Macros emit the 4-bit native regnum 0..15. Callers use `(& N 7)` for the
# ModRM/SIB low 3 bits and `(>> N 3)` for the REX high bit.
#
# P1-visible mapping (per the table below): a0=rdi a1=rsi a2=rdx a3=rcx
# t0=r10 t1=r11 t2=r8 s0=rbx s1=r12 s2=r13 s3=r14 sp=rsp.

%macro amd_reg_a0()
7
%endm
%macro amd_reg_a1()
6
%endm
%macro amd_reg_a2()
2
%endm
%macro amd_reg_a3()
1
%endm
%macro amd_reg_t0()
10
%endm
%macro amd_reg_t1()
11
%endm
%macro amd_reg_t2()
8
%endm
%macro amd_reg_s0()
3
%endm
%macro amd_reg_s1()
12
%endm
%macro amd_reg_s2()
13
%endm
%macro amd_reg_s3()
14
%endm
%macro amd_reg_sp()
4
%endm
%macro amd_reg_rax()
0
%endm
%macro amd_reg_rcx()
1
%endm
%macro amd_reg_rdx()
2
%endm
%macro amd_reg_rbx()
3
%endm
%macro amd_reg_rsp()
4
%endm
%macro amd_reg_rbp()
5
%endm
%macro amd_reg_rsi()
6
%endm
%macro amd_reg_rdi()
7
%endm
%macro amd_reg_r8()
8
%endm
%macro amd_reg_r9()
9
%endm
%macro amd_reg_r10()
10
%endm
%macro amd_reg_r11()
11
%endm
%macro amd_reg_r12()
12
%endm
%macro amd_reg_r13()
13
%endm
%macro amd_reg_r14()
14
%endm
%macro amd_reg_r15()
15
%endm
%macro amd_reg_br()
15
%endm
%macro amd_reg_scratch()
9
%endm

# Name -> native regnum dispatch via token paste into the table above.
%macro amd_reg(r)
%amd_reg_##r
%endm

# Per-P1-name `is this sp?` predicate. Used by p1_mem to decide whether
# the supplied offset needs the +16 frame-header adjustment.

%macro amd_is_sp_a0()
0
%endm
%macro amd_is_sp_a1()
0
%endm
%macro amd_is_sp_a2()
0
%endm
%macro amd_is_sp_a3()
0
%endm
%macro amd_is_sp_t0()
0
%endm
%macro amd_is_sp_t1()
0
%endm
%macro amd_is_sp_t2()
0
%endm
%macro amd_is_sp_s0()
0
%endm
%macro amd_is_sp_s1()
0
%endm
%macro amd_is_sp_s2()
0
%endm
%macro amd_is_sp_s3()
0
%endm
%macro amd_is_sp_sp()
1
%endm

%macro amd_is_sp(r)
%amd_is_sp_##r
%endm

# ---- REX / ModRM helpers ------------------------------------------------

# Short one-byte REX.B prefix (no W). Used by opcodes that don't need 64-bit
# width — push/pop/jmp r/call r/mov r,imm32 — when the target reg is r8-r15.
%macro amd_rex_b_short()
!(0x41)
%endm

# No-op sentinel for %select branches that shouldn't emit anything.
%macro amd_nobytes()
%endm

# Emit REX.B (0x41) iff r is r8-r15. Used by the short-prefix opcodes above.
%macro amd_maybe_rex_b(r)
%select((>= %amd_reg(r) 8),
        %amd_rex_b_short,
        %amd_nobytes)
%endm

# REX.WB: W=1 for 64-bit, B=(r>>3) to extend ModRM.rm / SIB.base.
%macro amd_rex_wb(r)
!((| 0x48 (& (>> %amd_reg(r) 3) 1)))
%endm

# REX.WRB: W=1, R=(rg>>3), B=(rm>>3). Used whenever a ModRM.reg field is
# in use together with a ModRM.rm field.
%macro amd_rex_wrb(rg, rm)
!((| 0x48 (| (<< (& (>> %amd_reg(rg) 3) 1) 2) (& (>> %amd_reg(rm) 3) 1))))
%endm

# ModRM byte for register/register: mod=3, reg=rg low3, rm=rm low3.
%macro amd_modrm_rr(rg, rm)
!((| 0xC0 (| (<< (& %amd_reg(rg) 7) 3) (& %amd_reg(rm) 7))))
%endm

# ModRM /ext, rm: mod=3, reg=ext, rm=low3(rm). ext is 0..7.
%macro amd_modrm_ext_r(ext, rm)
!((| 0xC0 (| (<< ext 3) (& %amd_reg(rm) 7))))
%endm

# ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
#
# [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 —
# rsp and r12 — must go through a SIB byte; all others use the plain
# encoding. disp selects mod=1 (disp8) when it fits in [-128,127], else
# mod=2 (disp32). We never emit mod=0 / no-disp; the extra byte is fine.
# (Always emitting a disp also sidesteps the rbp/r13 mod=0 special case.)

%macro amd_modrm_disp8_plain(reg, base, disp)
!((| 0x40 (| (<< (& %amd_reg(reg) 7) 3) (& %amd_reg(base) 7))))
!((& disp 0xFF))
%endm

%macro amd_modrm_disp32_plain(reg, base, disp)
!((| 0x80 (| (<< (& %amd_reg(reg) 7) 3) (& %amd_reg(base) 7))))
%((& disp 0xFFFFFFFF))
%endm

# SIB forms hardcode SIB=0x24: scale=1, index=100 (none), base=100
# (the low3 of both rsp and r12; REX.B distinguishes the two).
%macro amd_modrm_disp8_sib(reg, disp)
!((| 0x44 (<< (& %amd_reg(reg) 7) 3)))
!(0x24)
!((& disp 0xFF))
%endm

%macro amd_modrm_disp32_sib(reg, disp)
!((| 0x84 (<< (& %amd_reg(reg) 7) 3)))
!(0x24)
%((& disp 0xFFFFFFFF))
%endm

%macro amd_modrm_disp_plain(reg, base, disp)
%select((>= disp -128),
        %select((<= disp 127),
                %amd_modrm_disp8_plain(reg, base, disp),
                %amd_modrm_disp32_plain(reg, base, disp)),
        %amd_modrm_disp32_plain(reg, base, disp))
%endm

%macro amd_modrm_disp_sib(reg, disp)
%select((>= disp -128),
        %select((<= disp 127),
                %amd_modrm_disp8_sib(reg, disp),
                %amd_modrm_disp32_sib(reg, disp)),
        %amd_modrm_disp32_sib(reg, disp))
%endm

%macro amd_modrm_disp(reg, base, disp)
%select((= (& %amd_reg(base) 7) 4),
        %amd_modrm_disp_sib(reg, disp),
        %amd_modrm_disp_plain(reg, base, disp))
%endm

# ---- Register / arithmetic primitives ----------------------------------

# mov dst, src -- 48 89 /r (modrm form: source in reg, dest in rm).
%macro amd_mov_rr(dst, src)
%amd_rex_wrb(src, dst)
!(0x89)
%amd_modrm_rr(src, dst)
%endm

# op dst, src for ADD/SUB/AND/OR/XOR (same shape, different opcode byte).
%macro amd_alu_rr(opcode, dst, src)
%amd_rex_wrb(src, dst)
!(opcode)
%amd_modrm_rr(src, dst)
%endm

# op dst, imm8 -- 48 83 /ext ib. imm8 is sign-extended to 64 bits by the
# CPU; see the p1_logi comment for where that matters.
%macro amd_alu_ri8(ext, dst, imm)
%amd_rex_wb(dst)
!(0x83)
%amd_modrm_ext_r(ext, dst)
!((& imm 0xFF))
%endm

# op dst, imm32 -- 48 81 /ext id. imm32 is sign-extended to 64 bits.
%macro amd_alu_ri32(ext, dst, imm)
%amd_rex_wb(dst)
!(0x81)
%amd_modrm_ext_r(ext, dst)
%((& imm 0xFFFFFFFF))
%endm

# shift dst, imm8 -- 48 C1 /ext ib. (ext: SHL=4, SHR=5, SAR=7)
%macro amd_shift_ri8(ext, dst, imm)
%amd_rex_wb(dst)
!(0xC1)
%amd_modrm_ext_r(ext, dst)
!((& imm 0x3F))
%endm

# shift dst, cl -- 48 D3 /ext.
%macro amd_shift_cl(ext, dst)
%amd_rex_wb(dst)
!(0xD3)
%amd_modrm_ext_r(ext, dst)
%endm

# imul dst, src -- 48 0F AF /r. The canonical form is IMUL r64, r/m64:
# dest in ModRM.reg, source in ModRM.rm — reversed relative to the
# store-form ALU ops above, hence the (dst, src) operand order here.
%macro amd_imul_rr(dst, src)
%amd_rex_wrb(dst, src)
!(0x0F)
!(0xAF)
%amd_modrm_rr(dst, src)
%endm

# idiv src -- 48 F7 /7 (signed divide of rdx:rax by src).
%macro amd_idiv_r(src)
%amd_rex_wb(src)
!(0xF7)
%amd_modrm_ext_r(7, src)
%endm

# cqo -- 48 99 (sign-extend rax into rdx:rax).
%macro amd_cqo()
!(0x48)
!(0x99)
%endm

# push / pop r64. 50+r / 58+r; REX.B=0x41 if r8-r15.
%macro amd_push(r)
%amd_maybe_rex_b(r)
!((| 0x50 (& %amd_reg(r) 7)))
%endm

%macro amd_pop(r)
%amd_maybe_rex_b(r)
!((| 0x58 (& %amd_reg(r) 7)))
%endm

# mov r32, imm32 -- B8+r id. Low-register form skips REX; r8-r15 need
# REX.B=0x41. The 4-byte literal the caller emits is zero-extended into
# the full 64-bit register, matching the LA / LA_BR literal-pool contract.
%macro amd_mov_imm32_prefix(rd)
%amd_maybe_rex_b(rd)
!((| 0xB8 (& %amd_reg(rd) 7)))
%endm

# mov r64, imm64 -- REX.W [+ REX.B] B8+r followed by 8 bytes of literal.
%macro amd_mov_imm64_prefix(rd)
%amd_rex_wb(rd)
!((| 0xB8 (& %amd_reg(rd) 7)))
%endm

# ---- Memory ops ---------------------------------------------------------

# mov rT, [rN + off]            48 8B /r  modrm-with-disp
%macro amd_mem_LD(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x8B)
%amd_modrm_disp(rt, rn, off)
%endm

# mov [rN + off], rT            48 89 /r
%macro amd_mem_ST(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x89)
%amd_modrm_disp(rt, rn, off)
%endm

# mov [rN + off], rT8           48 88 /r  (the presence of any REX prefix —
# here REX.W+R+B from amd_rex_wrb — selects the dil/sil/bpl/spl byte view
# of those regs instead of ah/ch/dh/bh; W itself is ignored by byte ops.)
%macro amd_mem_SB(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x88)
%amd_modrm_disp(rt, rn, off)
%endm

# movzx rT, byte ptr [rN + off] -- 48 0F B6 /r
%macro amd_mem_LB(rt, rn, off)
%amd_rex_wrb(rt, rn)
!(0x0F)
!(0xB6)
%amd_modrm_disp(rt, rn, off)
%endm

# ---- Control flow primitives -------------------------------------------

# jmp r/m64     -- FF /4
# call r/m64    -- FF /2
# ret           -- C3
# syscall       -- 0F 05
# cmp rA, rB    -- 48 39 /r (modrm: rB in reg, rA in rm)
# test rA, rA   -- 48 85 /r
# Jcc rel8      -- 7x ib

%macro amd_jmp_r(r)
%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xE0 (& %amd_reg(r) 7)))
%endm

%macro amd_call_r(r)
%amd_maybe_rex_b(r)
!(0xFF)
!((| 0xD0 (& %amd_reg(r) 7)))
%endm

%macro amd_ret()
!(0xC3)
%endm

%macro amd_syscall()
!(0x0F)
!(0x05)
%endm

# cmp rA, rB -- 48 39 /r (modrm: rB in reg, rA in rm).
%macro amd_cmp_rr(ra, rb)
%amd_rex_wrb(rb, ra)
!(0x39)
%amd_modrm_rr(rb, ra)
%endm

# test rA, rB -- 48 85 /r (same operand placement as cmp; callers pass the
# same reg twice for the `test ra, ra` zero/sign probe).
%macro amd_test_rr(ra, rb)
%amd_rex_wrb(rb, ra)
!(0x85)
%amd_modrm_rr(rb, ra)
%endm

# ---- P1 register-register op lowering ----------------------------------
#
# For ADD/SUB/AND/OR/XOR we honor rD=rB aliasing — the naive `mov rD,rA ;
# op rD,rB` would clobber rB before the op reads it. Route rB through the
# scratch reg when that aliasing shows up. (rD=rA is harmless: the mov is
# then a self-copy.)

%macro amd_rrr_simple_ADD(rd, ra, rb)
%amd_rrr_simple(0x01, rd, ra, rb)
%endm
%macro amd_rrr_simple_SUB(rd, ra, rb)
%amd_rrr_simple(0x29, rd, ra, rb)
%endm
%macro amd_rrr_simple_AND(rd, ra, rb)
%amd_rrr_simple(0x21, rd, ra, rb)
%endm
%macro amd_rrr_simple_OR(rd, ra, rb)
%amd_rrr_simple(0x09, rd, ra, rb)
%endm
%macro amd_rrr_simple_XOR(rd, ra, rb)
%amd_rrr_simple(0x31, rd, ra, rb)
%endm

%macro amd_rrr_simple(opcode, rd, ra, rb)
%select((= %amd_reg(rd) %amd_reg(rb)),
        %amd_rrr_simple_via_scratch(opcode, rd, ra, rb),
        %amd_rrr_simple_direct(opcode, rd, ra, rb))
%endm

%macro amd_rrr_simple_direct(opcode, rd, ra, rb)
%amd_mov_rr(rd, ra)
%amd_alu_rr(opcode, rd, rb)
%endm

%macro amd_rrr_simple_via_scratch(opcode, rd, ra, rb)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rd, ra)
%amd_alu_rr(opcode, rd, scratch)
%endm

# MUL gets the same rD=rB aliasing treatment as the simple ALU ops.
%macro amd_rrr_MUL(rd, ra, rb)
%select((= %amd_reg(rd) %amd_reg(rb)),
        %amd_rrr_MUL_via_scratch(rd, ra, rb),
        %amd_rrr_MUL_direct(rd, ra, rb))
%endm
%macro amd_rrr_MUL_direct(rd, ra, rb)
%amd_mov_rr(rd, ra)
%amd_imul_rr(rd, rb)
%endm
%macro amd_rrr_MUL_via_scratch(rd, ra, rb)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rd, ra)
%amd_imul_rr(rd, scratch)
%endm

# DIV / REM clobber rax and rdx natively. rax is not a P1 register, so
# we clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside
# the P1 mapping) for the lifetime of the op.
#
# Aliasing-safety plan, same for DIV and REM:
#   1. rbp = rdx                  -- saved a2, also serves as "original rb
#                                    if rb == a2" via the scratch copy
#   2. scratch = rb               -- read rb while rdx still holds its
#                                    original value (in case rb == a2)
#   3. rax = ra                   -- ra == a2 reads original rdx for the
#                                    same reason; cqo hasn't run yet
#   4. cqo ; idiv scratch         -- divide
#   5. rdx = rbp (restore) BEFORE -- so `mov rd, rax/rdx` below can
#      writing rd                    legitimately overwrite rdx when
#                                    rd == a2 without losing the result
#   6. mov rd, rax                -- DIV quotient
#      or capture rdx -> rax first,
#      then rd = rax              -- REM remainder (capture dodges the
#                                    restore overwriting the remainder)

%macro amd_rrr_DIV(rd, ra, rb)
%amd_mov_rr(rbp, rdx)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rax, ra)
%amd_cqo
%amd_idiv_r(scratch)
%amd_mov_rr(rdx, rbp)
%amd_mov_rr(rd, rax)
%endm

%macro amd_rrr_REM(rd, ra, rb)
%amd_mov_rr(rbp, rdx)
%amd_mov_rr(scratch, rb)
%amd_mov_rr(rax, ra)
%amd_cqo
%amd_idiv_r(scratch)
%amd_mov_rr(rax, rdx)
%amd_mov_rr(rdx, rbp)
%amd_mov_rr(rd, rax)
%endm

# SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
# staging goes through rcx — which IS P1 a3. Save rcx to rbp for the
# duration.
#
# Ordering is load-bearing:
#   1. rbp = rcx                  -- save a3
#   2. scratch = ra               -- read ra BEFORE we overwrite rcx;
#                                    otherwise `ra == a3` reads the count
#                                    we just staged
#   3. rcx = rb                   -- count into cl
#   4. shift scratch, cl          -- do the work
#   5. rcx = rbp (restore) BEFORE -- so `mov rd, scratch` below can
#      writing rd                    legitimately overwrite rcx when
#                                    rd == a3 without losing the result
#   6. mov rd, scratch
# (step 6 of the SHIFT plan above: copy the shifted value into rd)

%macro amd_rrr_SHL(rd, ra, rb)
%amd_rrr_shift(4, rd, ra, rb)
%endm
%macro amd_rrr_SHR(rd, ra, rb)
%amd_rrr_shift(5, rd, ra, rb)
%endm
%macro amd_rrr_SAR(rd, ra, rb)
%amd_rrr_shift(7, rd, ra, rb)
%endm

%macro amd_rrr_shift(ext, rd, ra, rb)
%amd_mov_rr(rbp, rcx)
%amd_mov_rr(scratch, ra)
%amd_mov_rr(rcx, rb)
%amd_shift_cl(ext, scratch)
%amd_mov_rr(rcx, rbp)
%amd_mov_rr(rd, scratch)
%endm

# Dispatch `op` to its amd_rrr_<OP> lowering via token paste.
%macro amd_rrr_op(op, rd, ra, rb)
%amd_rrr_##op(rd, ra, rb)
%endm

%macro amd_rrr_ADD(rd, ra, rb)
%amd_rrr_simple_ADD(rd, ra, rb)
%endm
%macro amd_rrr_SUB(rd, ra, rb)
%amd_rrr_simple_SUB(rd, ra, rb)
%endm
%macro amd_rrr_AND(rd, ra, rb)
%amd_rrr_simple_AND(rd, ra, rb)
%endm
%macro amd_rrr_OR(rd, ra, rb)
%amd_rrr_simple_OR(rd, ra, rb)
%endm
%macro amd_rrr_XOR(rd, ra, rb)
%amd_rrr_simple_XOR(rd, ra, rb)
%endm

# ---- P1 operation lowering ---------------------------------------------

# LI rd, imm -- mov r64, imm64; the 8-byte literal is emitted inline via
# the `$(…)` builtin.
%macro p1_li(rd, imm)
%amd_mov_imm64_prefix(rd)
$(imm)
%endm

# LA rd -- opcode prefix only; the caller emits the 4-byte address literal
# right after (see the amd_mov_imm32_prefix contract).
%macro p1_la(rd)
%amd_mov_imm32_prefix(rd)
%endm

%macro p1_labr()
%amd_mov_imm32_prefix(br)
%endm

%macro p1_mov(rd, rs)
%p1_mov_##rs(rd)
%endm

# All non-sp sources: plain register copy.
%macro p1_mov_a0(rd)
%amd_mov_rr(rd, a0)
%endm
%macro p1_mov_a1(rd)
%amd_mov_rr(rd, a1)
%endm
%macro p1_mov_a2(rd)
%amd_mov_rr(rd, a2)
%endm
%macro p1_mov_a3(rd)
%amd_mov_rr(rd, a3)
%endm
%macro p1_mov_t0(rd)
%amd_mov_rr(rd, t0)
%endm
%macro p1_mov_t1(rd)
%amd_mov_rr(rd, t1)
%endm
%macro p1_mov_t2(rd)
%amd_mov_rr(rd, t2)
%endm
%macro p1_mov_s0(rd)
%amd_mov_rr(rd, s0)
%endm
%macro p1_mov_s1(rd)
%amd_mov_rr(rd, s1)
%endm
%macro p1_mov_s2(rd)
%amd_mov_rr(rd, s2)
%endm
%macro p1_mov_s3(rd)
%amd_mov_rr(rd, s3)
%endm

# sp-source: portable sp is the frame-local base, which is native rsp + 16
# (the 16-byte backend-private frame header sits at [rsp+0..rsp+15]).
# Emit `mov rd, rsp ; add rd, 16`.
%macro p1_mov_sp(rd)
%amd_mov_rr(rd, sp)
%amd_alu_ri8(0, rd, 16)
%endm

%macro p1_rrr(op, rd, ra, rb)
%amd_rrr_op(op, rd, ra, rb)
%endm

# ADDI -- copy then add; use the short imm8 form when the value fits.
%macro p1_addi(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(0, rd, imm),
                %amd_alu_ri32(0, rd, imm)),
        %amd_alu_ri32(0, rd, imm))
%endm

# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
# for positive imms >= 128 — ANDI with 255 would become AND with
# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
%macro p1_logi_ANDI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(4, rd, imm),
                %amd_alu_ri32(4, rd, imm)),
        %amd_alu_ri32(4, rd, imm))
%endm
%macro p1_logi_ORI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%select((>= imm -128),
        %select((<= imm 127),
                %amd_alu_ri8(1, rd, imm),
                %amd_alu_ri32(1, rd, imm)),
        %amd_alu_ri32(1, rd, imm))
%endm
%macro p1_logi(op, rd, ra, imm)
%p1_logi_##op(rd, ra, imm)
%endm

# Shift-by-immediate: copy then 48 C1 /ext ib (ext: SHL=4, SHR=5, SAR=7).
%macro p1_shifti_SHLI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(4, rd, imm)
%endm
%macro p1_shifti_SHRI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(5, rd, imm)
%endm
%macro p1_shifti_SARI(rd, ra, imm)
%amd_mov_rr(rd, ra)
%amd_shift_ri8(7, rd, imm)
%endm
%macro p1_shifti(op, rd, ra, imm)
%p1_shifti_##op(rd, ra, imm)
%endm

# p1_mem -- portable-offset memory access. When the base is sp, portable
# sp is the frame-local base (16 bytes above native rsp), so the physical
# access needs the supplied offset plus 16. For any other base, portable
# and native offsets coincide. Internal backend callers that need raw
# native-rsp access (p1_enter, p1_eret, _start stub, p1_ldarg, p1_syscall)
# use amd_mem_LD/amd_mem_ST directly and bypass this translation.

%macro p1_mem(op, rt, rn, off)
%select((= %amd_is_sp(rn) 1),
        %amd_mem_##op(rt, rn, (+ off 16)),
        %amd_mem_##op(rt, rn, off))
%endm

# LDARG rd, slot -- fetch a caller-frame argument. [rsp+8] is the saved
# caller sp (stored by p1_enter); the arg then sits 16 + 8*slot bytes
# above that base.
%macro p1_ldarg(rd, slot)
%amd_mem_LD(scratch, sp, 8)
%amd_mem_LD(rd, scratch, (+ 16 (* 8 slot)))
%endm

%macro p1_b()
%amd_jmp_r(br)
%endm

%macro p1_br(rs)
%amd_jmp_r(rs)
%endm

%macro p1_call()
%amd_call_r(br)
%endm

%macro p1_callr(rs)
%amd_call_r(rs)
%endm

%macro p1_ret()
%amd_ret
%endm

# ERET -- atomic frame epilogue + return from a framed function.
#   r9  = [rsp + 0]  -- retaddr into scratch (native rsp; backend-private)
#   rax = [rsp + 8]  -- saved caller sp into rax (an unused native reg)
#   rsp = rax        -- unwind to caller sp
#   push r9          -- reinstall retaddr so the trailing ret returns
#                       correctly
#   ret              -- pop reinstated retaddr into rip
%macro p1_eret()
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_ret
%endm

# TAIL / TAILR -- frame epilogue followed by an unconditional jump to the
# target. The epilogue is the same sequence as the first four steps of
# p1_eret (we omit the trailing ret because we jmp to a fresh target
# instead).
%macro p1_tail()
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_jmp_r(br)
%endm

%macro p1_tailr(rs)
%amd_mem_LD(scratch, sp, 0)
%amd_mem_LD(rax, sp, 8)
%amd_mov_rr(sp, rax)
%amd_push(scratch)
%amd_jmp_r(rs)
%endm

# Conditional-branch lowering:
#   compare / test
#   Jcc_inverse +3    skip the 3-byte `jmp r15` (41 FF E7)
#   jmp r15           P1 branch-taken path
#
# Invert codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D), BLTU->JAE(73),
# BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74).
%macro p1_condb_BEQ(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x75)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BNE(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x74)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BLT(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x7D)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb_BLTU(ra, rb)
%amd_cmp_rr(ra, rb)
!(0x73)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condb(op, ra, rb)
%p1_condb_##op(ra, rb)
%endm

# Zero-compare branches: `test ra, ra` then the same inverted-Jcc skip.
%macro p1_condbz_BEQZ(ra)
%amd_test_rr(ra, ra)
!(0x75)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz_BNEZ(ra)
%amd_test_rr(ra, ra)
!(0x74)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz_BLTZ(ra)
%amd_test_rr(ra, ra)
!(0x7D)
!(0x03)
%amd_jmp_r(br)
%endm
%macro p1_condbz(op, ra)
%p1_condbz_##op(ra)
%endm

# ENTER size
#
# CALL on amd64 pushed the retaddr, so on entry:
#   rsp   = caller_sp - 8
#   [rsp] = retaddr
#
# We want the standard frame:
#   [sp + 0]                   = retaddr
#   [sp + 8]                   = saved caller_sp
#   [sp + 16 .. 16 + size - 1] = locals
#   total frame = round_up(16, 16 + size)
#
# Pop retaddr into scratch, save caller_sp into rax (unused by P1),
# allocate frame (sub rsp, rounded size; ext 5 = SUB), restore retaddr
# at [sp], store caller_sp at [sp+8].
%macro p1_enter(size)
%amd_pop(scratch)
%amd_mov_rr(rax, sp)
%amd_alu_ri32(5, sp, (& (+ (+ 16 size) 15) -16))
%amd_mem_ST(scratch, sp, 0)
%amd_mem_ST(rax, sp, 8)
%endm

%macro p1_entry()
# :_start stub per the P1 program-entry model. Linux amd64 puts argc
# at [rsp] and argv starting at [rsp+8]. Load argc into a0 (rdi),
# compute &argv[0] into a1 (rsi), call p1_main under the one-word
# direct-result convention, then issue sys_exit with p1_main's return
# value in a0.
:_start
%amd_mem_LD(a0, sp, 0)
%amd_mov_rr(a1, sp)
%amd_alu_ri8(0, a1, 8)
%amd_mov_imm32_prefix(br)
&p1_main
%amd_call_r(br)
# mov eax, 60 (sys_exit); syscall. P1 a0 (native rdi) already holds
# p1_main's return value.
!(0xB8)
%(60)
!(0x0F)
!(0x05)
%endm

%macro p1_syscall()
# P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
# rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
# clobbers rcx and r11.
#
# Plan: push the P1 registers whose native slots get overwritten or
# syscall-clobbered — rsi (a1), rdx (a2), rcx (a3), r11 (t1), r8 (t2) —
# then shuffle into the native slots, issue syscall, restore, and move
# the return value (rax) into a0 (rdi).
#
# arg3 needs no shuffle: P1 t0 is already native r10. The loads below
# index the five pushed slots off the post-push rsp: [rsp+32]=saved rsi
# (a1), [rsp+24]=saved rdx (a2), [rsp+16]=saved rcx (a3). arg4/arg5 come
# straight from rbx (s0) / r12 (s1); r9 is backend scratch, safe to
# overwrite here.
%amd_push(rsi)
%amd_push(rdx)
%amd_push(rcx)
%amd_push(r11)
%amd_push(r8)

%amd_mov_rr(rax, rdi)
%amd_mem_LD(rdi, sp, 32)
%amd_mem_LD(rsi, sp, 24)
%amd_mem_LD(rdx, sp, 16)
%amd_mov_rr(r8, rbx)
%amd_mov_rr(r9, r12)

!(0x0F)
!(0x05)

%amd_pop(r8)
%amd_pop(r11)
%amd_pop(rcx)
%amd_pop(rdx)
%amd_pop(rsi)

%amd_mov_rr(rdi, rax)
%endm

# ---- Linux amd64 syscall numbers ----------------------------------------
# Each macro returns the syscall number as an integer atom so callers can
# use it inside expressions (e.g. `%li(a0, %sys_write)`).

%macro p1_sys_read()
0
%endm
%macro p1_sys_write()
1
%endm
%macro p1_sys_close()
3
%endm
%macro p1_sys_openat()
257
%endm
%macro p1_sys_exit()
60
%endm
%macro p1_sys_clone()
56
%endm
%macro p1_sys_execve()
59
%endm
%macro p1_sys_waitid()
247
%endm