# P1-riscv64.M1pp (14436B)
# P1-riscv64.M1pp -- P1 riscv64 backend expressed in m1macro.
#
# Mirrors p1/P1-aarch64.M1pp; same macro surface, different encodings.
# Native register picks follow docs/P1.md's 64-bit mapping table.
#
# Hidden backend regs:
#   br       = t6 (x31) -- dedicated branch-target mechanism
#   scratch  = t5 (x30) -- per-expansion scratch, never live across ops
#   save0    = t4 (x29) -- transient across SYSCALL only
#   save1    = t3 (x28)
#   save2    = a6 (x16)
#   saved_fp = fp (x8)  -- used by ENTER/ERET to capture caller sp
#   a7       = x17      -- Linux riscv64 syscall-number slot
#   a4       = x14      -- syscall arg4 slot
#   a5       = x15      -- syscall arg5 slot

# ---- Native register numbers --------------------------------------------
#
# Each rv_reg_<name> macro expands to the native x-register index used for
# that name. Note that the names on the left are the names used by this
# backend, not the RISC-V ABI names: s0..s3 here are x9, x18, x19, x20
# (x8 is reserved for fp below), and save2 shares x16 with a6.

%macro rv_reg_a0()
    10
%endm
%macro rv_reg_a1()
    11
%endm
%macro rv_reg_a2()
    12
%endm
%macro rv_reg_a3()
    13
%endm
%macro rv_reg_a4()
    14
%endm
%macro rv_reg_a5()
    15
%endm
%macro rv_reg_a6()
    16
%endm
%macro rv_reg_a7()
    17
%endm
%macro rv_reg_t0()
    5
%endm
%macro rv_reg_t1()
    6
%endm
%macro rv_reg_t2()
    7
%endm
%macro rv_reg_s0()
    9
%endm
%macro rv_reg_s1()
    18
%endm
%macro rv_reg_s2()
    19
%endm
%macro rv_reg_s3()
    20
%endm
%macro rv_reg_sp()
    2
%endm
%macro rv_reg_zero()
    0
%endm
%macro rv_reg_ra()
    1
%endm
%macro rv_reg_fp()
    8
%endm
%macro rv_reg_br()
    31
%endm
%macro rv_reg_scratch()
    30
%endm
%macro rv_reg_save0()
    29
%endm
%macro rv_reg_save1()
    28
%endm
%macro rv_reg_save2()
    16
%endm

# Dispatch: rv_reg(r) pastes the register name onto `rv_reg_` and expands
# to that register's native index.
%macro rv_reg(r)
    %rv_reg_##r
%endm

# rv_is_sp_<name> expands to 1 for sp and 0 for every other register name.
# Used by p1_mov / p1_mem to special-case sp-relative operands.

%macro rv_is_sp_a0()
    0
%endm
%macro rv_is_sp_a1()
    0
%endm
%macro rv_is_sp_a2()
    0
%endm
%macro rv_is_sp_a3()
    0
%endm
%macro rv_is_sp_a4()
    0
%endm
%macro rv_is_sp_a5()
    0
%endm
%macro rv_is_sp_a6()
    0
%endm
%macro rv_is_sp_a7()
    0
%endm
%macro rv_is_sp_t0()
    0
%endm
%macro rv_is_sp_t1()
    0
%endm
%macro rv_is_sp_t2()
    0
%endm
%macro rv_is_sp_s0()
    0
%endm
%macro rv_is_sp_s1()
    0
%endm
%macro rv_is_sp_s2()
    0
%endm
%macro rv_is_sp_s3()
    0
%endm
%macro rv_is_sp_sp()
    1
%endm
%macro rv_is_sp_zero()
    0
%endm
%macro rv_is_sp_ra()
    0
%endm
%macro rv_is_sp_fp()
    0
%endm
%macro rv_is_sp_br()
    0
%endm
%macro rv_is_sp_scratch()
    0
%endm
%macro rv_is_sp_save0()
    0
%endm
%macro rv_is_sp_save1()
    0
%endm
%macro rv_is_sp_save2()
    0
%endm

# Dispatch: rv_is_sp(r) pastes the register name onto `rv_is_sp_`.
%macro rv_is_sp(r)
    %rv_is_sp_##r
%endm

# ---- Low-level instruction encoders --------------------------------------
#
# In every encoder `base` carries the fixed opcode/funct3/funct7 bits; the
# macro ORs the register and immediate fields into it.

# R-type: funct7[31:25] rs2[24:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0]
%macro rv_r_type(base, rd, ra, rb)
    %((| base (<< %rv_reg(rb) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

# I-type: imm[31:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0]
# imm12 is masked to its low 12 bits, so negative immediates encode as
# two's complement.
%macro rv_i_type(base, rd, ra, imm12)
    %((| base (<< (& imm12 0xFFF) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

# S-type: imm[31:25] rs2[24:20] rs1[19:15] funct3[14:12] imm[11:7] opcode[6:0]
# The immediate is split: imm12[11:5] goes to bits 31:25, imm12[4:0] to
# bits 11:7.
%macro rv_s_type(base, rs, ra, imm12)
    %((| base (<< (& (>> imm12 5) 0x7F) 25) (<< %rv_reg(rs) 20) (<< %rv_reg(ra) 15) (<< (& imm12 0x1F) 7)))
%endm

# B-type: imm[12|10:5] rs2 rs1 funct3 imm[4:1|11] opcode. 12-bit signed,
# imm[0] always 0. For the hardcoded skip-over-jalr we only need a fixed
# positive offset (8 bytes = 2 insns), so inline the resulting bit pattern.
%macro rv_b_type_skip8(base, ra, rb)
    # Byte offset 8 = 0b1000, so imm[4:1] = 0100 (=4) and
    # imm[12], imm[11], imm[10:5] are all 0.
    # encoded bits: [31:25]=0, [11:7]= (imm[4:1] << 1) | imm[11] = (4<<1)|0 = 8.
    %((| base (<< %rv_reg(rb) 20) (<< %rv_reg(ra) 15) (<< 8 7)))
%endm

# addi rd, ra, imm12 (imm12 must fit the signed 12-bit field).
%macro rv_addi(rd, ra, imm12)
    %rv_i_type(0x00000013, rd, ra, imm12)
%endm

# rv_addi with arbitrary 64-bit signed immediate. Falls back to a
# 64-bit literal load into `scratch` followed by an R-type ADD when the
# immediate doesn't fit in ADDI's 12-bit signed field. `scratch` (t5/x30)
# is per-expansion and never live across ops, so clobbering it is safe.
# Do not pass scratch as `ra`: on the fallback path the literal load
# overwrites it before the ADD reads it.
# Both out-of-range branches (imm < -2048 and imm > 2047) emit the same
# sequence.
%macro rv_addi_any(rd, ra, imm)
    %select((>= imm -2048),
        %select((<= imm 2047),
            %rv_addi(rd, ra, imm),
            %rv_lit64_prefix(scratch)
            $(imm)
            %rv_r_type(0x00000033, rd, ra, scratch)),
        %rv_lit64_prefix(scratch)
        $(imm)
        %rv_r_type(0x00000033, rd, ra, scratch))
%endm

# ld rd, imm12(ra) -- 64-bit load.
%macro rv_ld(rd, ra, imm12)
    %rv_i_type(0x00003003, rd, ra, imm12)
%endm

# sd rs, imm12(ra) -- 64-bit store.
%macro rv_sd(rs, ra, imm12)
    %rv_s_type(0x00003023, rs, ra, imm12)
%endm

# lbu rd, imm12(ra) -- zero-extending byte load.
%macro rv_lbu(rd, ra, imm12)
    %rv_i_type(0x00004003, rd, ra, imm12)
%endm

# sb rs, imm12(ra) -- byte store.
%macro rv_sb(rs, ra, imm12)
    %rv_s_type(0x00000023, rs, ra, imm12)
%endm

# lwu rd, imm12(ra) -- zero-extending 32-bit load.
%macro rv_lwu(rd, ra, imm12)
    %rv_i_type(0x00006003, rd, ra, imm12)
%endm

# Load/store with arbitrary signed offset. The native I-type/S-type
# imm12 covers [-2048, 2047]; past that, materialize the offset in
# scratch (t5/x30), compute scratch = ra + scratch via R-type ADD, and
# issue the load/store with offset 0. Callers must not pass scratch as
# `ra` or `rs` — the materialize would clobber it before the address
# computation reads it.
%macro rv_ld_any(rd, ra, off)
    %select((>= off -2048),
        %select((<= off 2047),
            %rv_ld(rd, ra, off),
            %rv_lit64_prefix(scratch)
            $(off)
            %rv_r_type(0x00000033, scratch, ra, scratch)
            %rv_ld(rd, scratch, 0)),
        %rv_lit64_prefix(scratch)
        $(off)
        %rv_r_type(0x00000033, scratch, ra, scratch)
        %rv_ld(rd, scratch, 0))
%endm

# 64-bit store counterpart of rv_ld_any; same in-range / out-of-range split.
%macro rv_sd_any(rs, ra, off)
    %select((>= off -2048),
        %select((<= off 2047),
            %rv_sd(rs, ra, off),
            %rv_lit64_prefix(scratch)
            $(off)
            %rv_r_type(0x00000033, scratch, ra, scratch)
            %rv_sd(rs, scratch, 0)),
        %rv_lit64_prefix(scratch)
        $(off)
        %rv_r_type(0x00000033, scratch, ra, scratch)
        %rv_sd(rs, scratch, 0))
%endm

# Zero-extending byte-load counterpart of rv_ld_any.
%macro rv_lbu_any(rd, ra, off)
    %select((>= off -2048),
        %select((<= off 2047),
            %rv_lbu(rd, ra, off),
            %rv_lit64_prefix(scratch)
            $(off)
            %rv_r_type(0x00000033, scratch, ra, scratch)
            %rv_lbu(rd, scratch, 0)),
        %rv_lit64_prefix(scratch)
        $(off)
        %rv_r_type(0x00000033, scratch, ra, scratch)
        %rv_lbu(rd, scratch, 0))
%endm

# Byte-store counterpart of rv_ld_any.
%macro rv_sb_any(rs, ra, off)
    %select((>= off -2048),
        %select((<= off 2047),
            %rv_sb(rs, ra, off),
            %rv_lit64_prefix(scratch)
            $(off)
            %rv_r_type(0x00000033, scratch, ra, scratch)
            %rv_sb(rs, scratch, 0)),
        %rv_lit64_prefix(scratch)
        $(off)
        %rv_r_type(0x00000033, scratch, ra, scratch)
        %rv_sb(rs, scratch, 0))
%endm

# Register copy, encoded as addi dst, src, 0.
%macro rv_mov_rr(dst, src)
    %rv_addi(dst, src, 0)
%endm

# Shift-by-immediate encoders. shamt is masked to 6 bits (0..63) and
# placed at bit 20.

# slli rd, ra, shamt
%macro rv_slli(rd, ra, shamt)
    %((| 0x00001013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

# srli rd, ra, shamt (logical right shift)
%macro rv_srli(rd, ra, shamt)
    %((| 0x00005013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

# srai rd, ra, shamt (arithmetic right shift; bit 30 set in the base)
%macro rv_srai(rd, ra, shamt)
    %((| 0x40005013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

# jalr rd, imm12(rs): jump to rs + imm12, writing the link address to rd.
# Pass rd = zero for a plain jump.
%macro rv_jalr(rd, rs, imm12)
    %((| 0x00000067 (<< (& imm12 0xFFF) 20) (<< %rv_reg(rs) 15) (<< %rv_reg(rd) 7)))
%endm

# ecall
%macro rv_ecall()
    %(0x00000073)
%endm

# 64-bit literal-pool prefix for LI (base = address of the auipc):
#   auipc rd, 0        rd = base (pc-relative base)
#   ld    rd, 12(rd)   load the 8-byte literal that starts at base+12
#   jal   x0, 12       jump from base+8 to base+20, i.e. over the literal
#                      (12 = 4 bytes for the jal itself + 8 literal bytes)
# The 8 bytes that follow in source become the literal.
# NOTE(review): base+12 is only 8-byte aligned when base is 4 mod 8, so
# this ld can be a misaligned access -- confirm the targets tolerate that.
%macro rv_lit64_prefix(rd)
    %((| 0x00000017 (<< %rv_reg(rd) 7)))
    %((| 0x00C03003 (<< %rv_reg(rd) 15) (<< %rv_reg(rd) 7)))
    %(0x00C0006F)
%endm

# 32-bit literal-pool prefix for LA / LA_BR (base = address of the auipc):
#   auipc rd, 0
#   lwu   rd, 12(rd)   zero-extend the 4-byte literal at base+12
#   jal   x0, 8        jump from base+8 to base+16, i.e. over the literal
#                      (8 = 4 bytes for the jal itself + 4 literal bytes)
# lwu zero-extends into the full 64-bit register, so 4 bytes is enough for
# any address in the stage0 layout. Lets source use `&label` directly
# without padding to 8 bytes.
%macro rv_lit32_prefix(rd)
    %((| 0x00000017 (<< %rv_reg(rd) 7)))
    %((| 0x00C06003 (<< %rv_reg(rd) 15) (<< %rv_reg(rd) 7)))
    %(0x0080006F)
%endm

# Memory ops whose offset falls outside the signed 12-bit range are
# handled by the rv_*_any macros above: they load the offset into
# `scratch` with rv_lit64_prefix (not a LUI+ADDI pair), add the base
# register, and issue the access with offset 0. No assembly-time failure
# is emitted for large offsets.
# An earlier note here said the curated stage0 offsets (the LD/ST off
# values in p1_gen.py) stay inside -2048..2047, so the slow path is
# presumably rarely exercised -- confirm before relying on it.

# ---- P1 register-register op lowering -----------------------------------
#
# One macro per P1 three-register op; each is a single native R-type
# instruction computing rd = ra <op> rb. The hex constant is the R-type
# base (opcode 0x33 plus funct3/funct7) for the instruction named in the
# trailing comment.

%macro rv_rrr_ADD(rd, ra, rb)
    %rv_r_type(0x00000033, rd, ra, rb)
%endm
%macro rv_rrr_SUB(rd, ra, rb)
    %rv_r_type(0x40000033, rd, ra, rb)
%endm
%macro rv_rrr_AND(rd, ra, rb)
    %rv_r_type(0x00007033, rd, ra, rb)
%endm
%macro rv_rrr_OR(rd, ra, rb)
    %rv_r_type(0x00006033, rd, ra, rb)
%endm
%macro rv_rrr_XOR(rd, ra, rb)
    %rv_r_type(0x00004033, rd, ra, rb)
%endm
# SHL -> sll
%macro rv_rrr_SHL(rd, ra, rb)
    %rv_r_type(0x00001033, rd, ra, rb)
%endm
# SHR -> srl (logical)
%macro rv_rrr_SHR(rd, ra, rb)
    %rv_r_type(0x00005033, rd, ra, rb)
%endm
# SAR -> sra (arithmetic)
%macro rv_rrr_SAR(rd, ra, rb)
    %rv_r_type(0x40005033, rd, ra, rb)
%endm
# MUL / DIV / REM use funct7 = 1 (bit 25 set in the base).
%macro rv_rrr_MUL(rd, ra, rb)
    %rv_r_type(0x02000033, rd, ra, rb)
%endm
%macro rv_rrr_DIV(rd, ra, rb)
    %rv_r_type(0x02004033, rd, ra, rb)
%endm
%macro rv_rrr_REM(rd, ra, rb)
    %rv_r_type(0x02006033, rd, ra, rb)
%endm

# Dispatch: paste the P1 op name onto `rv_rrr_`.
%macro rv_rrr_op(op, rd, ra, rb)
    %rv_rrr_##op(rd, ra, rb)
%endm

# ---- P1 operation lowering -----------------------------------------------

# LI: load a 64-bit immediate. Emits the literal-pool prefix followed by
# the literal itself via $(imm).
%macro p1_li(rd, imm)
    %rv_lit64_prefix(rd)
    $(imm)
%endm

# LA: emits only the 32-bit literal-pool prefix; the literal is not
# emitted here, so the caller supplies it right after the expansion
# (p1_entry, for example, follows the prefix with `&p1_main`).
%macro p1_la(rd)
    %rv_lit32_prefix(rd)
%endm

# LA_BR: same as p1_la but always targets the hidden `br` register.
%macro p1_labr()
    %rv_lit32_prefix(br)
%endm

# MOV: plain register copy, except when the source is sp: then rd receives
# native sp + 16 instead. p1_mem applies the same +16 bias to sp-based
# offsets.
# NOTE(review): presumably this hides the 16-byte ra / saved-sp header
# that p1_enter stores at [sp+0] and [sp+8] -- confirm against docs/P1.md.
%macro p1_mov(rd, rs)
    %select((= %rv_is_sp(rs) 1),
        %rv_addi(rd, sp, 16),
        %rv_mov_rr(rd, rs))
%endm

# Three-register op: forwards to the rv_rrr_<op> table.
%macro p1_rrr(op, rd, ra, rb)
    %rv_rrr_op(op, rd, ra, rb)
%endm

# ADDI with any 64-bit immediate; may clobber scratch (see rv_addi_any).
%macro p1_addi(rd, ra, imm)
    %rv_addi_any(rd, ra, imm)
%endm

# Logical-immediate fallback: when imm fits the I-type's 12-bit signed
# field, emit the native ANDI/ORI; otherwise materialize the immediate
# in scratch (t5/x30) and use the R-type AND/OR. funct3=7 (AND) or 6
# (OR) is shared between the I-type (opcode 0x13) and R-type
# (opcode 0x33) encodings.
# Args: base_i = I-type base for the in-range form, base_r = R-type base
# for the fallback. Clobbers scratch only when imm is outside [-2048, 2047].
%macro rv_logi_any(rd, ra, imm, base_i, base_r)
    %select((>= imm -2048),
        %select((<= imm 2047),
            %rv_i_type(base_i, rd, ra, imm),
            %rv_lit64_prefix(scratch)
            $(imm)
            %rv_r_type(base_r, rd, ra, scratch)),
        %rv_lit64_prefix(scratch)
        $(imm)
        %rv_r_type(base_r, rd, ra, scratch))
%endm

# ANDI: andi (0x7013) when imm fits, else literal + and (0x7033).
%macro p1_logi_ANDI(rd, ra, imm)
    %rv_logi_any(rd, ra, imm, 0x00007013, 0x00007033)
%endm
# ORI: ori (0x6013) when imm fits, else literal + or (0x6033).
%macro p1_logi_ORI(rd, ra, imm)
    %rv_logi_any(rd, ra, imm, 0x00006013, 0x00006033)
%endm
# Dispatch: paste the op name (ANDI / ORI) onto `p1_logi_`.
%macro p1_logi(op, rd, ra, imm)
    %p1_logi_##op(rd, ra, imm)
%endm

# Shift-by-immediate ops; the encoders mask imm to 6 bits.
%macro p1_shifti_SHLI(rd, ra, imm)
    %rv_slli(rd, ra, imm)
%endm
%macro p1_shifti_SHRI(rd, ra, imm)
    %rv_srli(rd, ra, imm)
%endm
%macro p1_shifti_SARI(rd, ra, imm)
    %rv_srai(rd, ra, imm)
%endm
# Dispatch: paste the op name (SHLI / SHRI / SARI) onto `p1_shifti_`.
%macro p1_shifti(op, rd, ra, imm)
    %p1_shifti_##op(rd, ra, imm)
%endm

# Memory ops. LD/ST are 64-bit (ld/sd); LB is a zero-extending byte load
# (lbu); SB is a byte store (sb). All accept any signed offset and may
# clobber scratch when the offset is outside [-2048, 2047].
%macro p1_mem_LD(rt, rn, off)
    %rv_ld_any(rt, rn, off)
%endm
%macro p1_mem_ST(rt, rn, off)
    %rv_sd_any(rt, rn, off)
%endm
%macro p1_mem_LB(rt, rn, off)
    %rv_lbu_any(rt, rn, off)
%endm
%macro p1_mem_SB(rt, rn, off)
    %rv_sb_any(rt, rn, off)
%endm
# Dispatch on op. When the base register is sp the offset is biased by
# +16, which steps over the two 8-byte words p1_enter stores at [sp+0]
# (ra) and [sp+8] (caller sp).
%macro p1_mem(op, rt, rn, off)
    %select((= %rv_is_sp(rn) 1),
        %p1_mem_##op(rt, rn, (+ off 16)),
        %p1_mem_##op(rt, rn, off))
%endm

# LDARG: load the caller's sp that p1_enter saved at [sp+8] into rd, then
# load the 8-byte word at that address + 16 + 8*slot into rd.
%macro p1_ldarg(rd, slot)
    %rv_ld(rd, sp, 8)
    %rv_ld_any(rd, rd, (+ 16 (* 8 slot)))
%endm

# B: unconditional jump through the hidden br register (no link).
%macro p1_b()
    %rv_jalr(zero, br, 0)
%endm

# BR: unconditional jump through rs (no link).
%macro p1_br(rs)
    %rv_jalr(zero, rs, 0)
%endm

# CALL: jump through br, return address written to ra.
%macro p1_call()
    %rv_jalr(ra, br, 0)
%endm

# CALLR: jump through rs, return address written to ra.
%macro p1_callr(rs)
    %rv_jalr(ra, rs, 0)
%endm

# RET: jump to ra without touching the frame.
%macro p1_ret()
    %rv_jalr(zero, ra, 0)
%endm

# ERET: undo p1_enter, then return. Reloads ra from [sp+0] and the
# caller's sp from [sp+8] (via fp), restores sp, and jumps to ra.
%macro p1_eret()
    %rv_ld(ra, sp, 0)
    %rv_ld(fp, sp, 8)
    %rv_mov_rr(sp, fp)
    %rv_jalr(zero, ra, 0)
%endm

# TAIL: same frame teardown as p1_eret, then jump through br instead of
# returning, so the callee returns to this function's caller.
%macro p1_tail()
    %rv_ld(ra, sp, 0)
    %rv_ld(fp, sp, 8)
    %rv_mov_rr(sp, fp)
    %rv_jalr(zero, br, 0)
%endm

# TAILR: as p1_tail but jumps through rs.
%macro p1_tailr(rs)
    %rv_ld(ra, sp, 0)
    %rv_ld(fp, sp, 8)
    %rv_mov_rr(sp, fp)
    %rv_jalr(zero, rs, 0)
%endm

# Conditional branch: emit a native B-type with the INVERTED condition and
# a fixed +8 offset, followed by `%p1_b` (jalr through br). When the P1
# condition holds, the native branch is not taken and execution falls
# into the jalr, which performs the P1 branch. When it does not hold, the
# native branch skips over the jalr to the instruction after it.
# Bases: 0x1063 = bne, 0x0063 = beq, 0x5063 = bge, 0x7063 = bgeu.
%macro p1_condb_BEQ(ra, rb)
    %rv_b_type_skip8(0x00001063, ra, rb)
    %p1_b
%endm
%macro p1_condb_BNE(ra, rb)
    %rv_b_type_skip8(0x00000063, ra, rb)
    %p1_b
%endm
%macro p1_condb_BLT(ra, rb)
    %rv_b_type_skip8(0x00005063, ra, rb)
    %p1_b
%endm
%macro p1_condb_BLTU(ra, rb)
    %rv_b_type_skip8(0x00007063, ra, rb)
    %p1_b
%endm
# Dispatch: paste the op name (BEQ / BNE / BLT / BLTU) onto `p1_condb_`.
%macro p1_condb(op, ra, rb)
    %p1_condb_##op(ra, rb)
%endm

# Compare-with-zero variants: same scheme with the zero register as the
# second operand.
%macro p1_condbz_BEQZ(ra)
    %rv_b_type_skip8(0x00001063, ra, zero)
    %p1_b
%endm
%macro p1_condbz_BNEZ(ra)
    %rv_b_type_skip8(0x00000063, ra, zero)
    %p1_b
%endm
%macro p1_condbz_BLTZ(ra)
    %rv_b_type_skip8(0x00005063, ra, zero)
    %p1_b
%endm
# Dispatch: paste the op name (BEQZ / BNEZ / BLTZ) onto `p1_condbz_`.
%macro p1_condbz(op, ra)
    %p1_condbz_##op(ra)
%endm

# ENTER: allocate a frame of (16 + size) bytes rounded up to a multiple
# of 16. After expansion:
#   [sp+0] = ra
#   [sp+8] = caller's sp (computed into fp as sp + frame)
# Uses rv_addi_any, so frames too large for a 12-bit immediate clobber
# scratch.
%macro p1_enter(size)
    %rv_addi_any(sp, sp, (- 0 (& (+ (+ 16 size) 15) -16)))
    %rv_sd(ra, sp, 0)
    %rv_addi_any(fp, sp, (& (+ (+ 16 size) 15) -16))
    %rv_sd(fp, sp, 8)
%endm

%macro p1_entry()
    # :_start stub per the P1 program-entry model. Linux riscv64 puts argc
    # at [sp] and argv starting at [sp+8], matching the generic SysV entry
    # stack. Load argc into a0, compute &argv[0] into a1, call p1_main under
    # the one-word direct-result convention, then issue sys_exit with the
    # returned status.
    :_start
    %rv_ld(a0, sp, 0)
    %rv_addi(a1, sp, 8)
    %rv_lit32_prefix(br)
    &p1_main
    %rv_jalr(ra, br, 0)
    # 93 is the exit syscall number (same value as p1_sys_exit); a0 still
    # holds p1_main's result and becomes the exit status.
    %rv_addi(a7, zero, 93)
    %rv_ecall
%endm

%macro p1_syscall()
    # P1: a0=number, a1,a2,a3,t0,s0,s1 = args 0..5.
    # Linux riscv64: a7=number, a0..a5 = args 0..5, return in a0.
    # SYSCALL clobbers only P1 a0; restore a1/a2/a3 after ecall.
    # Native a4/a5 (x14/x15) aren't P1-exposed; we use them as syscall arg
    # slots and don't need to save them.
    %rv_mov_rr(save0, a1)
    %rv_mov_rr(save1, a2)
    %rv_mov_rr(save2, a3)
    %rv_mov_rr(a7, a0)
    %rv_mov_rr(a0, save0)
    %rv_mov_rr(a1, save1)
    %rv_mov_rr(a2, save2)
    %rv_mov_rr(a3, t0)
    %rv_mov_rr(a4, s0)
    %rv_mov_rr(a5, s1)
    %rv_ecall
    %rv_mov_rr(a1, save0)
    %rv_mov_rr(a2, save1)
    %rv_mov_rr(a3, save2)
%endm

# ---- Linux riscv64 syscall numbers ---------------------------------------
# Each macro returns the syscall number as an integer atom so callers can
# use it inside expressions (e.g. `%li(a0, %sys_write)`).

%macro p1_sys_read()
    63
%endm
%macro p1_sys_write()
    64
%endm
%macro p1_sys_close()
    57
%endm
%macro p1_sys_openat()
    56
%endm
%macro p1_sys_exit()
    93
%endm
%macro p1_sys_clone()
    220
%endm
%macro p1_sys_execve()
    221
%endm
# NOTE(review): 1024 does not look like a generic Linux syscall number --
# presumably a project-specific pseudo-syscall; confirm against docs/P1.md.
%macro p1_sys_spawn()
    1024
%endm
%macro p1_sys_waitid()
    95
%endm
%macro p1_sys_lseek()
    62
%endm
%macro p1_sys_brk()
    214
%endm
%macro p1_sys_unlinkat()
    35
%endm