# P1-riscv64.M1pp -- P1 riscv64 backend expressed in m1macro.
#
# Mirrors p1/P1-aarch64.M1pp; same macro surface, different encodings.
# Native register picks follow docs/P1.md's 64-bit mapping table.
#
# Hidden backend regs:
#   br       = t6 (x31) -- dedicated branch-target mechanism
#   scratch  = t5 (x30) -- per-expansion scratch, never live across ops
#   save0    = t4 (x29) -- transient across SYSCALL only
#   save1    = t3 (x28)
#   save2    = a6 (x16)
#   saved_fp = fp (x8)  -- used by ENTER/ERET to capture caller sp
#   a7       = x17      -- Linux riscv64 syscall-number slot
#   a4       = x14      -- syscall arg4 slot
#   a5       = x15      -- syscall arg5 slot

# ---- Native register numbers --------------------------------------------

%macro rv_reg_a0()
10
%endm
%macro rv_reg_a1()
11
%endm
%macro rv_reg_a2()
12
%endm
%macro rv_reg_a3()
13
%endm
%macro rv_reg_a4()
14
%endm
%macro rv_reg_a5()
15
%endm
%macro rv_reg_a6()
16
%endm
%macro rv_reg_a7()
17
%endm
%macro rv_reg_t0()
5
%endm
%macro rv_reg_t1()
6
%endm
%macro rv_reg_t2()
7
%endm
%macro rv_reg_s0()
9
%endm
%macro rv_reg_s1()
18
%endm
%macro rv_reg_s2()
19
%endm
%macro rv_reg_s3()
20
%endm
%macro rv_reg_sp()
2
%endm
%macro rv_reg_zero()
0
%endm
%macro rv_reg_ra()
1
%endm
%macro rv_reg_fp()
8
%endm
%macro rv_reg_br()
31
%endm
%macro rv_reg_scratch()
30
%endm
%macro rv_reg_save0()
29
%endm
%macro rv_reg_save1()
28
%endm
%macro rv_reg_save2()
16
%endm

%macro rv_reg(r)
%rv_reg_##r
%endm

%macro rv_is_sp_a0()
0
%endm
%macro rv_is_sp_a1()
0
%endm
%macro rv_is_sp_a2()
0
%endm
%macro rv_is_sp_a3()
0
%endm
%macro rv_is_sp_a4()
0
%endm
%macro rv_is_sp_a5()
0
%endm
%macro rv_is_sp_a6()
0
%endm
%macro rv_is_sp_a7()
0
%endm
%macro rv_is_sp_t0()
0
%endm
%macro rv_is_sp_t1()
0
%endm
%macro rv_is_sp_t2()
0
%endm
%macro rv_is_sp_s0()
0
%endm
%macro rv_is_sp_s1()
0
%endm
%macro rv_is_sp_s2()
0
%endm
%macro rv_is_sp_s3()
0
%endm
%macro rv_is_sp_sp()
1
%endm
%macro rv_is_sp_zero()
0
%endm
%macro rv_is_sp_ra()
0
%endm
%macro rv_is_sp_fp()
0
%endm
%macro rv_is_sp_br()
0
%endm
%macro rv_is_sp_scratch()
0
%endm
%macro rv_is_sp_save0()
0
%endm
%macro rv_is_sp_save1()
0
%endm
%macro rv_is_sp_save2()
0
%endm

%macro rv_is_sp(r)
%rv_is_sp_##r
%endm

# ---- Low-level instruction encoders --------------------------------------

# R-type: funct7[31:25] rs2[24:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0]
%macro rv_r_type(base, rd, ra, rb)
%((| base (<< %rv_reg(rb) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

# I-type: imm[31:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0]
%macro rv_i_type(base, rd, ra, imm12)
%((| base (<< (& imm12 0xFFF) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

# S-type: imm[31:25] rs2[24:20] rs1[19:15] funct3[14:12] imm[11:7] opcode[6:0]
%macro rv_s_type(base, rs, ra, imm12)
%((| base (<< (& (>> imm12 5) 0x7F) 25) (<< %rv_reg(rs) 20) (<< %rv_reg(ra) 15) (<< (& imm12 0x1F) 7)))
%endm
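
# Worked example (illustrative only, nothing is emitted here): the I-type
# and S-type helpers above reduce encoding to plain shifts and ORs. With
# the addi and sd bases defined below:
#   %rv_i_type(0x00000013, a0, sp, 16)
#     = 0x00000013 | (16 << 20) | (2 << 15) | (10 << 7)
#     = 0x01010513                                addi a0, sp, 16
#   %rv_s_type(0x00003023, ra, sp, 0)
#     = 0x00003023 | (0 << 25) | (1 << 20) | (2 << 15) | (0 << 7)
#     = 0x00113023                                sd ra, 0(sp)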
# B-type: imm[12|10:5] rs2 rs1 funct3 imm[4:1|11] opcode. 13-bit signed
# offset with imm[0] always 0 (12 encoded bits). For the hardcoded
# skip-over-jalr we only need one fixed positive offset (8 bytes = 2 insns),
# so inline the resulting bit pattern.
%macro rv_b_type_skip8(base, ra, rb)
# imm value 8 -> encoded imm[12:1] = 0000_0000_0100 (imm[0] is implicit 0).
# Bits of the encoded imm: imm[12]=0, imm[10:5]=0, imm[4:1]=0100 (=4), imm[11]=0.
# encoded bits: [31:25]=0, [11:7] = (imm[4:1] << 1) | imm[11] = (4<<1)|0 = 8.
%((| base (<< %rv_reg(rb) 20) (<< %rv_reg(ra) 15) (<< 8 7)))
%endm

%macro rv_addi(rd, ra, imm12)
%rv_i_type(0x00000013, rd, ra, imm12)
%endm

%macro rv_ld(rd, ra, imm12)
%rv_i_type(0x00003003, rd, ra, imm12)
%endm

%macro rv_sd(rs, ra, imm12)
%rv_s_type(0x00003023, rs, ra, imm12)
%endm

%macro rv_lbu(rd, ra, imm12)
%rv_i_type(0x00004003, rd, ra, imm12)
%endm

%macro rv_sb(rs, ra, imm12)
%rv_s_type(0x00000023, rs, ra, imm12)
%endm

%macro rv_lwu(rd, ra, imm12)
%rv_i_type(0x00006003, rd, ra, imm12)
%endm

%macro rv_mov_rr(dst, src)
%rv_addi(dst, src, 0)
%endm

%macro rv_slli(rd, ra, shamt)
%((| 0x00001013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

%macro rv_srli(rd, ra, shamt)
%((| 0x00005013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

%macro rv_srai(rd, ra, shamt)
%((| 0x40005013 (<< (& shamt 0x3F) 20) (<< %rv_reg(ra) 15) (<< %rv_reg(rd) 7)))
%endm

%macro rv_jalr(rd, rs, imm12)
%((| 0x00000067 (<< (& imm12 0xFFF) 20) (<< %rv_reg(rs) 15) (<< %rv_reg(rd) 7)))
%endm

%macro rv_ecall()
%(0x00000073)
%endm

# 64-bit literal-pool prefix for LI:
#   auipc rd, 0      pc-relative base
#   ld rd, 12(rd)    load 8-byte literal from pc+12
#   jal x0, 12       jump 12 bytes past the jal (4 for the jal itself
#                    plus the 8-byte literal)
# The 8 bytes that follow in source become the literal.
%macro rv_lit64_prefix(rd)
%((| 0x00000017 (<< %rv_reg(rd) 7)))
%((| 0x00C03003 (<< %rv_reg(rd) 15) (<< %rv_reg(rd) 7)))
%(0x00C0006F)
%endm

# 32-bit literal-pool prefix for LA / LA_BR:
#   auipc rd, 0
#   lwu rd, 12(rd)   zero-extend 4-byte literal from pc+12
#   jal x0, 8        jump 8 bytes past the jal (4 for the jal itself
#                    plus the 4-byte literal)
# lwu zero-extends into the full 64-bit register, so 4 bytes is enough for
# any address in the stage0 layout. Lets source use `&label` directly
# without padding to 8 bytes.
%macro rv_lit32_prefix(rd)
%((| 0x00000017 (<< %rv_reg(rd) 7)))
%((| 0x00C06003 (<< %rv_reg(rd) 15) (<< %rv_reg(rd) 7)))
%(0x0080006F)
%endm

# Memory-op fallback: an offset outside the signed 12-bit range would need
# to be materialized into `scratch` via a LUI+ADDI sequence. For stage0
# programs the curated offsets stay inside -2048..2047 (none of the LD/ST
# off values in p1_gen.py exceed that range), so no fallback path is wired
# in here; note the encoders' 12-bit masks silently truncate anything
# larger, so future offsets must stay inside the range.
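
# Worked expansion (illustrative only): %rv_lit64_prefix(a0) emits
#   0x00000517   auipc a0, 0
#   0x00C53503   ld a0, 12(a0)
#   0x00C0006F   jal x0, 12
# followed in source by the 8 literal bytes, so a0 ends up holding the
# 64-bit value parked at pc+12.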
# ---- P1 register-register op lowering -----------------------------------

%macro rv_rrr_ADD(rd, ra, rb)
%rv_r_type(0x00000033, rd, ra, rb)
%endm
%macro rv_rrr_SUB(rd, ra, rb)
%rv_r_type(0x40000033, rd, ra, rb)
%endm
%macro rv_rrr_AND(rd, ra, rb)
%rv_r_type(0x00007033, rd, ra, rb)
%endm
%macro rv_rrr_OR(rd, ra, rb)
%rv_r_type(0x00006033, rd, ra, rb)
%endm
%macro rv_rrr_XOR(rd, ra, rb)
%rv_r_type(0x00004033, rd, ra, rb)
%endm
%macro rv_rrr_SHL(rd, ra, rb)
%rv_r_type(0x00001033, rd, ra, rb)
%endm
%macro rv_rrr_SHR(rd, ra, rb)
%rv_r_type(0x00005033, rd, ra, rb)
%endm
%macro rv_rrr_SAR(rd, ra, rb)
%rv_r_type(0x40005033, rd, ra, rb)
%endm
%macro rv_rrr_MUL(rd, ra, rb)
%rv_r_type(0x02000033, rd, ra, rb)
%endm
%macro rv_rrr_DIV(rd, ra, rb)
%rv_r_type(0x02004033, rd, ra, rb)
%endm
%macro rv_rrr_REM(rd, ra, rb)
%rv_r_type(0x02006033, rd, ra, rb)
%endm

%macro rv_rrr_op(op, rd, ra, rb)
%rv_rrr_##op(rd, ra, rb)
%endm

# ---- P1 operation lowering -----------------------------------------------

%macro p1_li(rd, imm)
%rv_lit64_prefix(rd)
$(imm)
%endm

%macro p1_la(rd)
%rv_lit32_prefix(rd)
%endm

%macro p1_labr()
%rv_lit32_prefix(br)
%endm

%macro p1_mov(rd, rs)
%select((= %rv_is_sp(rs) 1),
        %rv_addi(rd, sp, 16),
        %rv_mov_rr(rd, rs))
%endm

%macro p1_rrr(op, rd, ra, rb)
%rv_rrr_op(op, rd, ra, rb)
%endm

%macro p1_addi(rd, ra, imm)
%rv_addi(rd, ra, imm)
%endm

%macro p1_logi_ANDI(rd, ra, imm)
%rv_i_type(0x00007013, rd, ra, imm)
%endm
%macro p1_logi_ORI(rd, ra, imm)
%rv_i_type(0x00006013, rd, ra, imm)
%endm
%macro p1_logi(op, rd, ra, imm)
%p1_logi_##op(rd, ra, imm)
%endm

%macro p1_shifti_SHLI(rd, ra, imm)
%rv_slli(rd, ra, imm)
%endm
%macro p1_shifti_SHRI(rd, ra, imm)
%rv_srli(rd, ra, imm)
%endm
%macro p1_shifti_SARI(rd, ra, imm)
%rv_srai(rd, ra, imm)
%endm
%macro p1_shifti(op, rd, ra, imm)
%p1_shifti_##op(rd, ra, imm)
%endm

%macro p1_mem_LD(rt, rn, off)
%rv_ld(rt, rn, off)
%endm
%macro p1_mem_ST(rt, rn, off)
%rv_sd(rt, rn, off)
%endm
%macro p1_mem_LB(rt, rn, off)
%rv_lbu(rt, rn, off)
%endm
%macro p1_mem_SB(rt, rn, off)
%rv_sb(rt, rn, off)
%endm
%macro p1_mem(op, rt, rn, off)
%select((= %rv_is_sp(rn) 1),
        %p1_mem_##op(rt, rn, (+ off 16)),
        %p1_mem_##op(rt, rn, off))
%endm

%macro p1_ldarg(rd, slot)
%rv_ld(scratch, sp, 8)
%rv_ld(rd, scratch, (+ 16 (* 8 slot)))
%endm

%macro p1_b()
%rv_jalr(zero, br, 0)
%endm

%macro p1_br(rs)
%rv_jalr(zero, rs, 0)
%endm

%macro p1_call()
%rv_jalr(ra, br, 0)
%endm

%macro p1_callr(rs)
%rv_jalr(ra, rs, 0)
%endm

%macro p1_ret()
%rv_jalr(zero, ra, 0)
%endm

%macro p1_eret()
%rv_ld(ra, sp, 0)
%rv_ld(fp, sp, 8)
%rv_mov_rr(sp, fp)
%rv_jalr(zero, ra, 0)
%endm

%macro p1_tail()
%rv_ld(ra, sp, 0)
%rv_ld(fp, sp, 8)
%rv_mov_rr(sp, fp)
%rv_jalr(zero, br, 0)
%endm

%macro p1_tailr(rs)
%rv_ld(ra, sp, 0)
%rv_ld(fp, sp, 8)
%rv_mov_rr(sp, fp)
%rv_jalr(zero, rs, 0)
%endm
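
# Worked example (illustrative only): the frame below native sp holds ra
# at [sp] and the caller sp at [sp+8], so p1_mem rebases sp-relative P1
# accesses by +16:
#   %p1_mem(LD, a0, sp, 0)
#     -> %rv_ld(a0, sp, 16)
#     = 0x00003003 | (16 << 20) | (2 << 15) | (10 << 7)
#     = 0x01013503                                ld a0, 16(sp)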
# Conditional branch: emit a native branch on the inverted condition that
# hops over the jalr(br) taking the P1 branch. The +8 offset lands just
# past the jalr one insn below, so when the P1 condition is false the
# inverted branch skips the `%p1_b` and execution falls through; when
# the P1 condition holds, the jalr takes the branch.
%macro p1_condb_BEQ(ra, rb)
%rv_b_type_skip8(0x00001063, ra, rb)
%p1_b
%endm
%macro p1_condb_BNE(ra, rb)
%rv_b_type_skip8(0x00000063, ra, rb)
%p1_b
%endm
%macro p1_condb_BLT(ra, rb)
%rv_b_type_skip8(0x00005063, ra, rb)
%p1_b
%endm
%macro p1_condb_BLTU(ra, rb)
%rv_b_type_skip8(0x00007063, ra, rb)
%p1_b
%endm
%macro p1_condb(op, ra, rb)
%p1_condb_##op(ra, rb)
%endm

%macro p1_condbz_BEQZ(ra)
%rv_b_type_skip8(0x00001063, ra, zero)
%p1_b
%endm
%macro p1_condbz_BNEZ(ra)
%rv_b_type_skip8(0x00000063, ra, zero)
%p1_b
%endm
%macro p1_condbz_BLTZ(ra)
%rv_b_type_skip8(0x00005063, ra, zero)
%p1_b
%endm
%macro p1_condbz(op, ra)
%p1_condbz_##op(ra)
%endm

%macro p1_enter(size)
%rv_addi(sp, sp, (- 0 (& (+ (+ 16 size) 15) -16)))
%rv_sd(ra, sp, 0)
%rv_addi(fp, sp, (& (+ (+ 16 size) 15) -16))
%rv_sd(fp, sp, 8)
%endm

%macro p1_entry()
# :_start stub per the P1 program-entry model. Linux riscv64 puts argc
# at [sp] and argv starting at [sp+8], matching the generic SysV entry
# stack. Load argc into a0, compute &argv[0] into a1, call p1_main under
# the one-word direct-result convention, then issue sys_exit with the
# returned status.
:_start
%rv_ld(a0, sp, 0)
%rv_addi(a1, sp, 8)
%rv_lit32_prefix(br)
&p1_main
%rv_jalr(ra, br, 0)
%rv_addi(a7, zero, 93)
%rv_ecall
%endm

%macro p1_syscall()
# P1: a0=number, a1,a2,a3,t0,s0,s1 = args 0..5.
# Linux riscv64: a7=number, a0..a5 = args 0..5, return in a0.
# SYSCALL clobbers only P1 a0; restore a1/a2/a3 after the ecall.
# Native a4/a5 (x14/x15) aren't P1-exposed; we use them as syscall arg
# slots and don't need to save them.
%rv_mov_rr(save0, a1)
%rv_mov_rr(save1, a2)
%rv_mov_rr(save2, a3)
%rv_mov_rr(a7, a0)
%rv_mov_rr(a0, save0)
%rv_mov_rr(a1, save1)
%rv_mov_rr(a2, save2)
%rv_mov_rr(a3, t0)
%rv_mov_rr(a4, s0)
%rv_mov_rr(a5, s1)
%rv_ecall
%rv_mov_rr(a1, save0)
%rv_mov_rr(a2, save1)
%rv_mov_rr(a3, save2)
%endm

# ---- Linux riscv64 syscall numbers ---------------------------------------
# Each macro returns the syscall number as an integer atom so callers can
# use it inside expressions (e.g. `%p1_li(a0, %p1_sys_write)`).

%macro p1_sys_read()
63
%endm
%macro p1_sys_write()
64
%endm
%macro p1_sys_close()
57
%endm
%macro p1_sys_openat()
56
%endm
%macro p1_sys_exit()
93
%endm
%macro p1_sys_clone()
220
%endm
%macro p1_sys_execve()
221
%endm
%macro p1_sys_waitid()
95
%endm
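
# Illustrative usage sketch (kept as a comment; not part of the backend;
# `msg` is a hypothetical label and 13 a made-up length). A
# write(1, msg, 13) under the P1 convention above would look like:
#   %p1_li(a0, %p1_sys_write)    a0 = syscall number (64)
#   %p1_li(a1, 1)                arg0: fd = stdout
#   %p1_la(a2)                   arg1: buffer address
#   &msg
#   %p1_li(a3, 13)               arg2: length
#   %p1_syscall                  shuffle into a7/a0..a2, ecall, restore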