P1-aarch64.M1pp (13745B)
# aarch64.M1M -- P1 aarch64 backend expressed in m1macro.
#
# Mirrors p1/aarch64.py. Instruction words are computed with the m1macro
# integer builtins: %(sexpr), $(sexpr), and %select(cond, then, else).

# ---- Native register numbers --------------------------------------------
#
# One zero-argument macro per register name, expanding to the register
# number placed in instruction encodings. %aa64_reg(r) selects the entry
# for `r` by token pasting. Entries are listed in ascending number order.

# a0..a3 -> x0..x3
%macro aa64_reg_a0()
    0
%endm
%macro aa64_reg_a1()
    1
%endm
%macro aa64_reg_a2()
    2
%endm
%macro aa64_reg_a3()
    3
%endm

# x4, x5, x8 keep their native names.
%macro aa64_reg_x4()
    4
%endm
%macro aa64_reg_x5()
    5
%endm
%macro aa64_reg_x8()
    8
%endm

# t0..t2 -> x9..x11
%macro aa64_reg_t0()
    9
%endm
%macro aa64_reg_t1()
    10
%endm
%macro aa64_reg_t2()
    11
%endm

# scratch -> x16, br -> x17
%macro aa64_reg_scratch()
    16
%endm
%macro aa64_reg_br()
    17
%endm

# s0..s3 -> x19..x22
%macro aa64_reg_s0()
    19
%endm
%macro aa64_reg_s1()
    20
%endm
%macro aa64_reg_s2()
    21
%endm
%macro aa64_reg_s3()
    22
%endm

# save0..save2 -> x23..x25
%macro aa64_reg_save0()
    23
%endm
%macro aa64_reg_save1()
    24
%endm
%macro aa64_reg_save2()
    25
%endm

# lr -> x30. sp and xzr share number 31; which one an instruction sees
# depends on the instruction form, see %aa64_is_sp.
%macro aa64_reg_lr()
    30
%endm
%macro aa64_reg_sp()
    31
%endm
%macro aa64_reg_xzr()
    31
%endm

# Register name -> native register number.
%macro aa64_reg(r)
    %aa64_reg_##r
%endm

# ---- "Is this name the stack pointer?" table ----------------------------
#
# 1 for `sp`, 0 for every other name (including `xzr`, which has the same
# register number). Lets callers tell sp apart from xzr by name.

%macro aa64_is_sp_a0()
    0
%endm
%macro aa64_is_sp_a1()
    0
%endm
%macro aa64_is_sp_a2()
    0
%endm
%macro aa64_is_sp_a3()
    0
%endm
%macro aa64_is_sp_x4()
    0
%endm
%macro aa64_is_sp_x5()
    0
%endm
%macro aa64_is_sp_x8()
    0
%endm
%macro aa64_is_sp_t0()
    0
%endm
%macro aa64_is_sp_t1()
    0
%endm
%macro aa64_is_sp_t2()
    0
%endm
%macro aa64_is_sp_scratch()
    0
%endm
%macro aa64_is_sp_br()
    0
%endm
%macro aa64_is_sp_s0()
    0
%endm
%macro aa64_is_sp_s1()
    0
%endm
%macro aa64_is_sp_s2()
    0
%endm
%macro aa64_is_sp_s3()
    0
%endm
%macro aa64_is_sp_save0()
    0
%endm
%macro aa64_is_sp_save1()
    0
%endm
%macro aa64_is_sp_save2()
    0
%endm
%macro aa64_is_sp_lr()
    0
%endm
%macro aa64_is_sp_sp()
    1
%endm
%macro aa64_is_sp_xzr()
    0
%endm

# Register name -> 1 if it is `sp`, else 0.
%macro aa64_is_sp(r)
    %aa64_is_sp_##r
%endm

# ---- Low-level instruction encoders --------------------------------------

# Three-register word: `base` with rd in bits 4:0, ra in bits 9:5 and
# rb in bits 20:16.
%macro aa64_rrr(base, rd, ra, rb)
    %((| base %aa64_reg(rd) (<< %aa64_reg(ra) 5) (<< %aa64_reg(rb) 16)))
%endm

# ADD rd, ra, #imm12. The immediate is masked to 12 bits.
%macro aa64_add_imm(rd, ra, imm12)
    %((| 0x91000000 %aa64_reg(rd) (<< %aa64_reg(ra) 5) (<< (& imm12 0xFFF) 10)))
%endm

# SUB rd, ra, #imm12. The immediate is masked to 12 bits.
%macro aa64_sub_imm(rd, ra, imm12)
    %((| 0xD1000000 %aa64_reg(rd) (<< %aa64_reg(ra) 5) (<< (& imm12 0xFFF) 10)))
%endm

# ADD rd, ra, #imm12, LSL #12 (adds imm12 * 4096).
%macro aa64_add_imm_lsl12(rd, ra, imm12)
    %((| 0x91400000 %aa64_reg(rd) (<< %aa64_reg(ra) 5) (<< (& imm12 0xFFF) 10)))
%endm

# SUB rd, ra, #imm12, LSL #12 (subtracts imm12 * 4096).
%macro aa64_sub_imm_lsl12(rd, ra, imm12)
    %((| 0xD1400000 %aa64_reg(rd) (<< %aa64_reg(ra) 5) (<< (& imm12 0xFFF) 10)))
%endm

# ADD/SUB immediate with arbitrary unsigned magnitude. The native imm12
# form covers [0, 4095]; the imm12<<12 form (optionally followed by a
# second imm12 instruction for the low bits) covers [4096, 0xFFFFFF].
# Past 24 bits, the constant is materialized in `scratch` and a
# register-operand ADD/SUB is emitted.
# Callers must not pass `scratch` as `ra`: materializing the constant
# would clobber it before the register-operand instruction reads it.
# rd = ra + imm, for any non-negative imm.
#   imm <= 4095      : one ADD #imm12.
#   imm <= 0xFFFFFF  : ADD #hi, LSL #12, plus ADD #lo when the low 12 bits
#                      are non-zero (the second ADD reads rd, so rd == ra
#                      is fine).
#   otherwise        : %p1_li(scratch, imm), then ADD rd, ra, scratch.
# The last case uses the extended-register encoding (UXTX #0, base
# 0x8B206000) rather than the shifted-register one (0x8B000000). In the
# immediate forms register number 31 means SP, and callers rely on that
# (p1_enter passes `sp`). The shifted-register form would treat 31 as XZR
# and silently drop the adjustment; the extended form keeps 31 = SP and is
# otherwise identical for every other register.
# `ra` must not be `scratch` in the last case (clobbered by %p1_li).
%macro aa64_add_imm_any(rd, ra, imm)
    %select((<= imm 4095),
        %aa64_add_imm(rd, ra, imm),
        %select((<= imm 0xFFFFFF),
            %select((= (& imm 0xFFF) 0),
                %aa64_add_imm_lsl12(rd, ra, (>> imm 12)),
                %aa64_add_imm_lsl12(rd, ra, (>> imm 12))
                %aa64_add_imm(rd, rd, (& imm 0xFFF))),
            %p1_li(scratch, imm)
            %aa64_rrr(0x8B206000, rd, ra, scratch)))
%endm

# rd = ra - imm, for any non-negative imm. Same three-way split as
# aa64_add_imm_any; the large case uses SUB (extended register, UXTX #0,
# base 0xCB206000) so that register number 31 is SP there as well.
# `ra` must not be `scratch` in the large case.
%macro aa64_sub_imm_any(rd, ra, imm)
    %select((<= imm 4095),
        %aa64_sub_imm(rd, ra, imm),
        %select((<= imm 0xFFFFFF),
            %select((= (& imm 0xFFF) 0),
                %aa64_sub_imm_lsl12(rd, ra, (>> imm 12)),
                %aa64_sub_imm_lsl12(rd, ra, (>> imm 12))
                %aa64_sub_imm(rd, rd, (& imm 0xFFF))),
            %p1_li(scratch, imm)
            %aa64_rrr(0xCB206000, rd, ra, scratch)))
%endm

# Register-to-register move.
#   dst is sp : ADD sp, src, #0.
#   src is sp : ADD dst, sp, #16 -- dst receives sp + 16, not sp. p1_mem
#               applies the same +16 bias to sp-relative offsets.
#   otherwise : ORR dst, xzr, src.
# NOTE(review): the two sp cases are asymmetric (reading sp adds 16,
# writing sp stores src unchanged). p1_eret/p1_tail depend on the unbiased
# write; confirm this is also what p1_mov(sp, rs) is meant to do.
%macro aa64_mov_rr(dst, src)
    %select((= %aa64_is_sp(dst) 1),
        %aa64_add_imm(sp, src, 0),
        %select((= %aa64_is_sp(src) 1),
            %aa64_add_imm(dst, sp, 16),
            %((| 0xAA000000 (<< %aa64_reg(src) 16) (<< 31 5) %aa64_reg(dst)))))
%endm

# UBFM rd, ra, #immr, #imms (64-bit). immr and imms are masked to their
# 6-bit fields, as the other encoders in this file mask their immediates,
# so an out-of-range or negative value cannot spill into neighbouring
# bits of the instruction word.
%macro aa64_ubfm(rd, ra, immr, imms)
    %((| 0xD3400000 (<< (& immr 63) 16) (<< (& imms 63) 10) (<< %aa64_reg(ra) 5) %aa64_reg(rd)))
%endm

# SBFM rd, ra, #immr, #imms (64-bit). Fields masked as in aa64_ubfm.
%macro aa64_sbfm(rd, ra, immr, imms)
    %((| 0x93400000 (<< (& immr 63) 16) (<< (& imms 63) 10) (<< %aa64_reg(ra) 5) %aa64_reg(rd)))
%endm

# MOVZ rd, #imm16 (no shift). imm16 is masked to 16 bits.
%macro aa64_movz(rd, imm16)
    %((| 0xD2800000 (<< (& imm16 0xFFFF) 5) %aa64_reg(rd)))
%endm

# MOVN rd, #imm16 (no shift). imm16 is masked to 16 bits.
%macro aa64_movn(rd, imm16)
    %((| 0x92800000 (<< (& imm16 0xFFFF) 5) %aa64_reg(rd)))
%endm

# One-instruction materialize: MOVZ for imm >= 0, MOVN of the complement
# for imm < 0. Only the low 16 bits of the value (or of its complement)
# are encoded, so anything outside that window is silently truncated; use
# aa64_materialize_imm_any when the range is not known to fit.
%macro aa64_materialize_small_imm(rd, imm)
    %select((>= imm 0),
        %aa64_movz(rd, imm),
        %aa64_movn(rd, (& (~ imm) 0xFFFF)))
%endm

# Materialize an arbitrary 64-bit signed immediate into `rd`. Picks the
# 1-insn MOVZ / MOVN form when the value (or its complement, for
# negatives) fits 16 bits; otherwise emits the 4-insn MOVZ + 3*MOVK
# chain of %p1_li. Used by the ANDI/ORI lowerings below to avoid
# silently truncating to the small-imm window.
# rd = imm for any 64-bit signed imm.
#   -65536 <= imm < 0 : single MOVN of the complemented low 16 bits.
#   0 <= imm <= 0xFFFF: single MOVZ.
#   anything else     : the %p1_li sequence.
%macro aa64_materialize_imm_any(rd, imm)
    %select((< imm 0),
        %select((< imm -65536),
            %p1_li(rd, imm),
            %aa64_movn(rd, (& (~ imm) 0xFFFF))),
        %select((<= imm 0xFFFF),
            %aa64_movz(rd, imm),
            %p1_li(rd, imm)))
%endm

# Load/store with scaled unsigned offset: the byte offset is shifted right
# by size_log2 and placed at bit 10. The field is not masked; callers are
# expected to pass an aligned, in-range offset.
%macro aa64_ldst_uimm12(base, rt, rn, off_bytes, size_log2)
    %((| base %aa64_reg(rt) (<< %aa64_reg(rn) 5) (<< (>> off_bytes size_log2) 10)))
%endm

# Load/store with unscaled signed offset: low 9 bits of `off` at bit 12.
%macro aa64_ldst_unscaled(base, rt, rn, off)
    %((| base %aa64_reg(rt) (<< %aa64_reg(rn) 5) (<< (& off 0x1FF) 12)))
%endm

# ---- Per-op tables for the memory ops LD / ST / LB / SB -------------------

# Base opcode of the scaled-unsigned-offset form.
%macro aa64_mem_uimm_base_LD()
    0xF9400000
%endm
%macro aa64_mem_uimm_base_ST()
    0xF9000000
%endm
%macro aa64_mem_uimm_base_LB()
    0x39400000
%endm
%macro aa64_mem_uimm_base_SB()
    0x39000000
%endm

# Base opcode of the unscaled-signed-offset form.
%macro aa64_mem_unscaled_base_LD()
    0xF8400000
%endm
%macro aa64_mem_unscaled_base_ST()
    0xF8000000
%endm
%macro aa64_mem_unscaled_base_LB()
    0x38400000
%endm
%macro aa64_mem_unscaled_base_SB()
    0x38000000
%endm

# log2 of the access size in bytes (3 for LD/ST, 0 for LB/SB).
%macro aa64_mem_size_LD()
    3
%endm
%macro aa64_mem_size_ST()
    3
%endm
%macro aa64_mem_size_LB()
    0
%endm
%macro aa64_mem_size_SB()
    0
%endm

# Table lookups by op name (token pasting).
%macro aa64_mem_uimm_base(op)
    %aa64_mem_uimm_base_##op
%endm

%macro aa64_mem_unscaled_base(op)
    %aa64_mem_unscaled_base_##op
%endm

%macro aa64_mem_size(op)
    %aa64_mem_size_##op
%endm

# Last resort for offsets no addressing mode covers: compute rn + off into
# `scratch` (SUB of the magnitude when off is negative), then access
# [scratch, #0] with the scaled form.
%macro aa64_mem_fallback(op, rt, rn, off)
    %select((< off 0),
        %aa64_sub_imm_any(scratch, rn, (- 0 off)),
        %aa64_add_imm_any(scratch, rn, off))
    %aa64_ldst_uimm12(%aa64_mem_uimm_base(op), rt, scratch, 0, %aa64_mem_size(op))
%endm

# Offset did not qualify for the scaled form: use the unscaled form when
# off is within [-256, 255], otherwise the scratch fallback.
%macro aa64_mem_after_uimm(op, rt, rn, off)
    %select((< off -256),
        %aa64_mem_fallback(op, rt, rn, off),
        %select((<= off 255),
            %aa64_ldst_unscaled(%aa64_mem_unscaled_base(op), rt, rn, off),
            %aa64_mem_fallback(op, rt, rn, off)))
%endm

# Non-negative offset: the scaled form applies when off is below
# 4096 << size and a multiple of the access size; otherwise fall through
# to aa64_mem_after_uimm.
%macro aa64_mem_after_nonneg(op, rt, rn, off)
    %select((< off (<< 4096 %aa64_mem_size(op))),
        %select((= (& off (- (<< 1 %aa64_mem_size(op)) 1)) 0),
            %aa64_ldst_uimm12(%aa64_mem_uimm_base(op), rt, rn, off, %aa64_mem_size(op)),
            %aa64_mem_after_uimm(op, rt, rn, off)),
        %aa64_mem_after_uimm(op, rt, rn, off))
%endm

# Memory access `op` on rt at [rn + off], picking the shortest encoding
# that can express `off`. No sp bias is applied here; see p1_mem.
%macro aa64_mem(op, rt, rn, off)
    %select((< off 0),
        %aa64_mem_after_uimm(op, rt, rn, off),
        %aa64_mem_after_nonneg(op, rt, rn, off))
%endm

# Compare ra with rb (flag-setting subtract, result discarded into
# register 31), then a conditional branch of +8 bytes on `cond`, which
# skips exactly the one instruction that follows this macro.
%macro aa64_cmp_skip(cond, ra, rb)
    %((| 0xEB00001F (<< %aa64_reg(ra) 5) (<< %aa64_reg(rb) 16)))
    %((| 0x54000040 cond))
%endm

# BR reg: indirect jump.
%macro aa64_br(reg)
    %((| 0xD61F0000 (<< %aa64_reg(reg) 5)))
%endm

# BLR reg: indirect call.
%macro aa64_blr(reg)
    %((| 0xD63F0000 (<< %aa64_reg(reg) 5)))
%endm

# RET (fixed word 0xD65F03C0).
%macro aa64_ret()
    %(0xD65F03C0)
%endm

%macro aa64_lit64_prefix(rd)
    # 64-bit literal-pool prefix for LI: ldr xN, [pc,#8]; b PC+12.
    # The source supplies the 8 literal bytes right after this macro;
    # the branch jumps over them.
    %((| 0x58000040 %aa64_reg(rd)))
    %(0x14000003)
%endm

%macro aa64_lit32_prefix(rd)
    # 32-bit literal-pool prefix for LA / LA_BR: ldr wN, [pc,#8]; b PC+8.
    # ldr w zero-extends into the full 64-bit register, so a 4-byte
    # literal is enough for any address in the stage0 layout, and source
    # can write `&label` directly without padding it to 8 bytes.
    %((| 0x18000040 %aa64_reg(rd)))
    %(0x14000002)
%endm

# ---- P1 register-register op lowering -----------------------------------
#
# One macro per P1 three-register op; %aa64_rrr_op(op, ...) dispatches on
# the op name.

%macro aa64_rrr_ADD(rd, ra, rb)
    %aa64_rrr(0x8B000000, rd, ra, rb)
%endm
%macro aa64_rrr_SUB(rd, ra, rb)
    %aa64_rrr(0xCB000000, rd, ra, rb)
%endm
%macro aa64_rrr_AND(rd, ra, rb)
    %aa64_rrr(0x8A000000, rd, ra, rb)
%endm
%macro aa64_rrr_OR(rd, ra, rb)
    %aa64_rrr(0xAA000000, rd, ra, rb)
%endm
%macro aa64_rrr_XOR(rd, ra, rb)
    %aa64_rrr(0xCA000000, rd, ra, rb)
%endm
%macro aa64_rrr_SHL(rd, ra, rb)
    %aa64_rrr(0x9AC02000, rd, ra, rb)
%endm
%macro aa64_rrr_SHR(rd, ra, rb)
    %aa64_rrr(0x9AC02400, rd, ra, rb)
%endm
%macro aa64_rrr_SAR(rd, ra, rb)
    %aa64_rrr(0x9AC02800, rd, ra, rb)
%endm
%macro aa64_rrr_DIV(rd, ra, rb)
    %aa64_rrr(0x9AC00C00, rd, ra, rb)
%endm

# MUL: multiply-add with register 31 in the addend field (bits 14:10),
# folded into the base as 0x7C00.
%macro aa64_rrr_MUL(rd, ra, rb)
    %aa64_rrr(0x9B007C00, rd, ra, rb)
%endm

# REM: scratch = ra / rb (same opcode as DIV), then
# rd = ra - scratch * rb via multiply-subtract. Clobbers `scratch`.
%macro aa64_rrr_REM(rd, ra, rb)
    %aa64_rrr(0x9AC00C00, scratch, ra, rb)
    %((| 0x9B008000 %aa64_reg(rd) (<< %aa64_reg(scratch) 5) (<< %aa64_reg(ra) 10) (<< %aa64_reg(rb) 16)))
%endm

%macro aa64_rrr_op(op, rd, ra, rb)
    %aa64_rrr_##op(rd, ra, rb)
%endm

# ---- P1 operation lowering -----------------------------------------------

# 64-bit immediates are loaded with a MOVZ + 3 MOVK chain: 4 instructions,
# 16 bytes, the same size as the earlier LDR-literal-pool lowering. The
# chain is pure instructions with no inline data, which is the standard
# aarch64 codegen for materializing constants.
# MOVK rd, #imm16, LSL #16 / #32 / #48: replace one 16-bit lane of rd and
# keep the others. imm16 is masked to 16 bits by each encoder.
%macro aa64_movk_lsl16(rd, imm16)
    %((| 0xF2A00000 %aa64_reg(rd) (<< (& imm16 0xFFFF) 5)))
%endm
%macro aa64_movk_lsl32(rd, imm16)
    %((| 0xF2C00000 %aa64_reg(rd) (<< (& imm16 0xFFFF) 5)))
%endm
%macro aa64_movk_lsl48(rd, imm16)
    %((| 0xF2E00000 %aa64_reg(rd) (<< (& imm16 0xFFFF) 5)))
%endm

# LI rd, imm: always four instructions (MOVZ for bits 15:0, then one MOVK
# per remaining 16-bit lane). The encoders mask each lane to 16 bits, so
# the shifted value is passed straight through.
%macro p1_li(rd, imm)
    %aa64_movz(rd, imm)
    %aa64_movk_lsl16(rd, (>> imm 16))
    %aa64_movk_lsl32(rd, (>> imm 32))
    %aa64_movk_lsl48(rd, (>> imm 48))
%endm

# LA rd: emits only the 32-bit literal-load prefix; the 4-byte address
# literal is supplied by the source right after the macro.
%macro p1_la(rd)
    %aa64_lit32_prefix(rd)
%endm

# LA_BR: same as p1_la, with the branch register `br` as the target.
%macro p1_labr()
    %aa64_lit32_prefix(br)
%endm

# MOV rd, rs. The sp special cases live in aa64_mov_rr.
%macro p1_mov(rd, rs)
    %aa64_mov_rr(rd, rs)
%endm

# Three-register op: dispatch on the op name to the matching aa64_rrr_* macro.
%macro p1_rrr(op, rd, ra, rb)
    %aa64_rrr_##op(rd, ra, rb)
%endm

# ADDI rd, ra, imm with signed imm: a negative imm becomes a SUB of its
# magnitude, so the helpers only ever see a non-negative value.
%macro p1_addi(rd, ra, imm)
    %select((< imm 0),
        %aa64_sub_imm_any(rd, ra, (- 0 imm)),
        %aa64_add_imm_any(rd, ra, imm))
%endm

# ANDI / ORI: materialize the immediate into `scratch`, then apply the
# register-register op. Clobbers `scratch`.
%macro p1_logi_ANDI(rd, ra, imm)
    %aa64_materialize_imm_any(scratch, imm)
    %aa64_rrr_AND(rd, ra, scratch)
%endm
%macro p1_logi_ORI(rd, ra, imm)
    %aa64_materialize_imm_any(scratch, imm)
    %aa64_rrr_OR(rd, ra, scratch)
%endm
%macro p1_logi(op, rd, ra, imm)
    %p1_logi_##op(rd, ra, imm)
%endm

# Immediate shifts, expressed as bitfield moves.
#   SHLI: UBFM with immr = (-imm) mod 64 and imms = 63 - imm.
#   SHRI: UBFM with immr = imm, imms = 63.
#   SARI: SBFM with immr = imm, imms = 63.
%macro p1_shifti_SHLI(rd, ra, imm)
    %aa64_ubfm(rd, ra, (& (- 0 imm) 63), (- 63 imm))
%endm
%macro p1_shifti_SHRI(rd, ra, imm)
    %aa64_ubfm(rd, ra, imm, 63)
%endm
%macro p1_shifti_SARI(rd, ra, imm)
    %aa64_sbfm(rd, ra, imm, 63)
%endm
%macro p1_shifti(op, rd, ra, imm)
    %p1_shifti_##op(rd, ra, imm)
%endm

# Memory op at [rn + off]. When the base is `sp`, the offset is biased by
# +16, stepping over the two words p1_enter stores at [sp, #0] and
# [sp, #8].
%macro p1_mem(op, rt, rn, off)
    %select((= %aa64_is_sp(rn) 0),
        %aa64_mem(op, rt, rn, off),
        %aa64_mem(op, rt, rn, (+ off 16)))
%endm

# LDARG rd, slot: load the word stored at [sp, #8] into `scratch`, then
# load rd from [scratch + 16 + 8*slot]. Clobbers `scratch`.
%macro p1_ldarg(rd, slot)
    %aa64_mem(LD, scratch, sp, 8)
    %aa64_mem(LD, rd, scratch, (+ 16 (* 8 slot)))
%endm

# B: jump to the address held in `br`.
%macro p1_b()
    %aa64_br(br)
%endm
# BR rs: jump to the address in rs.
%macro p1_br(rs)
    %aa64_br(rs)
%endm

# CALL: call the address held in `br`.
%macro p1_call()
    %aa64_blr(br)
%endm

# CALLR rs: call the address in rs.
%macro p1_callr(rs)
    %aa64_blr(rs)
%endm

# RET: plain return; touches neither sp nor the frame.
%macro p1_ret()
    %aa64_ret
%endm

# Shared frame teardown: reload lr from [sp, #0], load the word at
# [sp, #8] into x8 and move it into sp. Undoes p1_enter. Clobbers x8.
%macro aa64_frame_restore()
    %aa64_mem(LD, lr, sp, 0)
    %aa64_mem(LD, x8, sp, 8)
    %aa64_mov_rr(sp, x8)
%endm

# ERET: tear down the frame, then return.
%macro p1_eret()
    %aa64_frame_restore
    %aa64_ret
%endm

# TAIL: tear down the frame, then jump to the address held in `br`.
%macro p1_tail()
    %aa64_frame_restore
    %aa64_br(br)
%endm

# TAILR rs: tear down the frame, then jump to the address in rs.
%macro p1_tailr(rs)
    %aa64_frame_restore
    %aa64_br(rs)
%endm

# Conditional branches to `br`. Each one compares ra with rb and passes
# the INVERSE condition to aa64_cmp_skip, which hops over the BR when the
# P1 condition does not hold.
#   BEQ  skips on cond 1,  BNE  skips on cond 0,
#   BLT  skips on cond 10, BLTU skips on cond 2.
%macro p1_condb_BEQ(ra, rb)
    %aa64_cmp_skip(1, ra, rb)
    %aa64_br(br)
%endm
%macro p1_condb_BNE(ra, rb)
    %aa64_cmp_skip(0, ra, rb)
    %aa64_br(br)
%endm
%macro p1_condb_BLT(ra, rb)
    %aa64_cmp_skip(10, ra, rb)
    %aa64_br(br)
%endm
%macro p1_condb_BLTU(ra, rb)
    %aa64_cmp_skip(2, ra, rb)
    %aa64_br(br)
%endm
%macro p1_condb(op, ra, rb)
    %p1_condb_##op(ra, rb)
%endm

# Compare-with-zero branches to `br`, built the same way.
#   BEQZ: word 0xB5000040 | ra hops over the BR (offset field 2 = +8 bytes).
#   BNEZ: word 0xB4000040 | ra, same offset.
#   BLTZ: compare ra against xzr, skip on cond 10.
%macro p1_condbz_BEQZ(ra)
    %((| 0xB5000040 %aa64_reg(ra)))
    %aa64_br(br)
%endm
%macro p1_condbz_BNEZ(ra)
    %((| 0xB4000040 %aa64_reg(ra)))
    %aa64_br(br)
%endm
%macro p1_condbz_BLTZ(ra)
    %aa64_cmp_skip(10, ra, xzr)
    %aa64_br(br)
%endm
%macro p1_condbz(op, ra)
    %p1_condbz_##op(ra)
%endm

# ENTER size: open a frame of (size + 16) bytes rounded up to a multiple
# of 16. Afterwards [sp, #0] holds lr and [sp, #8] holds the value sp had
# before the frame was opened. Clobbers x8.
%macro p1_enter(size)
    %aa64_sub_imm_any(sp, sp, (& (+ size 31) -16))
    %aa64_mem(ST, lr, sp, 0)
    %aa64_add_imm_any(x8, sp, (& (+ size 31) -16))
    %aa64_mem(ST, x8, sp, 8)
%endm

%macro p1_entry()
    # :_start stub emitted by the aarch64 backend per the P1 program-entry
    # model. Loads argc from [sp] into a0 and argv = sp + 8 into a1, calls
    # p1_main under the one-word direct-result convention, then issues a
    # native Linux sys_exit (x8 = 93, the value of %p1_sys_exit) with
    # p1_main's return value as the exit status.
    :_start
    %aa64_mem(LD, a0, sp, 0)
    %aa64_add_imm(a1, sp, 8)
    %p1_labr
    &p1_main
    %aa64_blr(br)
    %aa64_movz(x8, 93)
    %(0xD4000001)
%endm

# SYSCALL: the syscall number arrives in a0 and the arguments in a1, a2,
# a3, t0, s0, s1. They are shuffled into x8 and x0..x5, the SVC word
# 0xD4000001 is emitted, and a1..a3 are restored afterwards from
# save0..save2. a0 is left as the SVC left it. Clobbers x8, x4, x5 and
# save0..save2.
%macro p1_syscall()
    %aa64_mov_rr(x8, a0)
    %aa64_mov_rr(save0, a1)
    %aa64_mov_rr(save1, a2)
    %aa64_mov_rr(save2, a3)
    %aa64_mov_rr(a0, save0)
    %aa64_mov_rr(a1, save1)
    %aa64_mov_rr(a2, save2)
    %aa64_mov_rr(a3, t0)
    %aa64_mov_rr(x4, s0)
    %aa64_mov_rr(x5, s1)
    %(0xD4000001)
    %aa64_mov_rr(a1, save0)
    %aa64_mov_rr(a2, save1)
    %aa64_mov_rr(a3, save2)
%endm

# ---- Linux aarch64 syscall numbers ---------------------------------------
# Each macro expands to the syscall number as an integer atom, so it can
# be used inside expressions (e.g. `%p1_li(a0, %p1_sys_write)`).
# Listed in ascending numeric order.

%macro p1_sys_unlinkat()
    35
%endm
%macro p1_sys_openat()
    56
%endm
%macro p1_sys_close()
    57
%endm
%macro p1_sys_lseek()
    62
%endm
%macro p1_sys_read()
    63
%endm
%macro p1_sys_write()
    64
%endm
%macro p1_sys_exit()
    93
%endm
%macro p1_sys_waitid()
    95
%endm
%macro p1_sys_brk()
    214
%endm
%macro p1_sys_clone()
    220
%endm
%macro p1_sys_execve()
    221
%endm
# NOTE(review): 1024 is presumably a project-specific number rather than a
# native Linux syscall -- confirm against the host that services it.
%macro p1_sys_spawn()
    1024
%endm