P1pp.P1pp (44172B)
# p1pp.P1pp -- libp1pp v1, portable utility library for P1pp programs.
#
# Concatenated after the P1 backend header and frontend, and before user
# source:
#
#     catm P1-<arch>.M1pp P1.M1pp p1pp.P1pp usersrc.P1pp > program.M1
#
# Targets P1-64 only (WORD = 8). All internal labels use the
# `libp1pp__` prefix; public entry points are unprefixed.
#
# See docs/LIBP1PP.md for the public contract.

# =========================================================================
# Compile-time helpers
# =========================================================================

# %alignup(rd, rs, align, scratch)
#   rd = rs rounded up to the next multiple of `align`.
#
#   `align` must be a power of two and a constant integer expression.
#   `scratch` is clobbered (it is left holding -align). It must not be the
#   same register as `rd`: the mask is loaded after `rd` has already been
#   written, so sharing the register would destroy the partial result.
#
#   Expansion (two ALU instructions plus an %li for the mask):
#
#       rd      = rs + (align - 1)
#       scratch = -align          (i.e. ~(align-1) for power-of-two align)
#       rd      = rd & scratch
%macro alignup(rd, rs, align, scratch)
    %addi(rd, rs, (- align 1))
    %li(scratch, (- 0 align))
    %and(rd, rd, scratch)
%endm

# =========================================================================
# Global memory access
# =========================================================================
#
# Shorthand for the la + ld(_,0) / la + st(_,0) idiom that dereferences a
# pointer slot at a labeled global. `name` is a label expression in the
# `&foo` form that %la accepts.
#
#   %ld_global  -- rd = *name. rd doubles as the address scratch.
#   %st_global  -- *name = rs. Needs a separate `scratch` since rs holds the
#                  value being written.
#   %lda_global -- rd = *name AND raddr = &name. Use when the address is
#                  also needed afterward (e.g. load then store back).
# rd = *name. The address is first materialized in rd itself and then
# replaced by the loaded value, so no extra register is needed.
%macro ld_global(rd, name)
    %la(rd, name)
    %ld(rd, rd, 0)
%endm

# *name = rs. `scratch` receives &name and still holds it afterwards.
%macro st_global(rs, name, scratch)
    %la(scratch, name)
    %st(rs, scratch, 0)
%endm

# rd = *name and raddr = &name. Pass distinct registers for rd and raddr if
# the address is to survive the load.
%macro lda_global(rd, raddr, name)
    %la(raddr, name)
    %ld(rd, raddr, 0)
%endm

# =========================================================================
# Array indexing
# =========================================================================
#
# Compute *(base + idx * stride + off) for register `base`, register
# `idx`, and constant `stride` and `off`. `stride` is materialized via
# %li and folded into idx via %mul, so it does not need to be a power of
# two. `off` is a constant byte offset within the element (use 0 for
# plain element access; non-zero for field access).
#
# All three macros need a separate scratch to hold the computed address,
# because both `base` and `idx` are register inputs that must survive the
# multiply. `lda_array` reuses its `raddr` output as that scratch.
#
#   %ld_array  -- rd = *(base + idx*stride + off).
#   %st_array  -- *(base + idx*stride + off) = rs.
#   %lda_array -- rd = *addr AND raddr = base + idx*stride; ld is at
#                 offset `off`. Use when subsequent code also needs the
#                 computed address (e.g. for additional field accesses).
# rd = *(base + idx*stride + off).
# `scratch` is overwritten first (with the stride), so it must not be the
# same register as `idx` or `base`; it ends up holding base + idx*stride.
%macro ld_array(rd, base, stride, idx, off, scratch)
    %li(scratch, stride)
    %mul(scratch, idx, scratch)
    %add(scratch, base, scratch)
    %ld(rd, scratch, off)
%endm

# *(base + idx*stride + off) = rs.
# Same address computation as %ld_array; `scratch` must additionally be
# distinct from `rs`, which is read only after the address is built.
%macro st_array(rs, base, stride, idx, off, scratch)
    %li(scratch, stride)
    %mul(scratch, idx, scratch)
    %add(scratch, base, scratch)
    %st(rs, scratch, off)
%endm

# rd = *(base + idx*stride + off), raddr = base + idx*stride.
# `raddr` plays the scratch role and is therefore subject to the same
# "distinct from idx and base" requirement.
%macro lda_array(rd, raddr, base, stride, idx, off)
    %li(raddr, stride)
    %mul(raddr, idx, raddr)
    %add(raddr, base, raddr)
    %ld(rd, raddr, off)
%endm

# =========================================================================
# Sub-word memory access
# =========================================================================
#
# P1 has only 1-byte (%lb/%sb) and 8-byte (%ld/%st) memory ops, and the
# 8-byte ops require natural 8-byte alignment. For struct fields and
# packed data laid out at narrower widths, sub-word access is byte-
# decomposed: %lb-gather + shli/or for loads, %sb-scatter + shri for
# stores. These macros encapsulate that pattern so callers do not have
# to open-code it (and so a backend can later substitute a single
# native sub-word load/store when alignment is statically known).
#
# Conventions:
#   `rd` is the destination (loads); `rs` is the source (stores).
#   Stores preserve `rs`; loads clobber `rd`. `scratch` is a working
#   register distinct from rd/rs and base. For loads, `rd` must also be
#   distinct from `base`: rd receives byte 0 while base is still needed
#   to fetch the remaining bytes. Bytes are little-endian:
#   byte 0 (low) at off+0. The signed-load variants (%ld_sh, %ld_sw)
#   sign-extend the gathered value to the canonical 64-bit form.
#
#   %ld_h(rd, base, off, scratch)   -- 2-byte zero-extending load
#   %ld_w(rd, base, off, scratch)   -- 4-byte zero-extending load
#   %ld_sh(rd, base, off, scratch)  -- 2-byte sign-extending load
#   %ld_sw(rd, base, off, scratch)  -- 4-byte sign-extending load
#   %st_h(rs, base, off, scratch)   -- 2-byte store (writes low 16 bits)
#   %st_w(rs, base, off, scratch)   -- 4-byte store (writes low 32 bits)

# rd = byte[off] | byte[off+1] << 8.
# rd is written by the first %lb, so it must not be the same register as
# `base` (the second %lb still addresses through base).
%macro ld_h(rd, base, off, scratch)
    %lb(rd, base, off)
    %lb(scratch, base, (+ off 1))
    %shli(scratch, scratch, 8)
    %or(rd, rd, scratch)
%endm

# rd = the four bytes at off..off+3 assembled little-endian.
# Each higher byte is fetched into `scratch`, shifted into position and
# OR-ed into rd. Same rd != base requirement as %ld_h.
%macro ld_w(rd, base, off, scratch)
    %lb(rd, base, off)
    %lb(scratch, base, (+ off 1))
    %shli(scratch, scratch, 8)
    %or(rd, rd, scratch)
    %lb(scratch, base, (+ off 2))
    %shli(scratch, scratch, 16)
    %or(rd, rd, scratch)
    %lb(scratch, base, (+ off 3))
    %shli(scratch, scratch, 24)
    %or(rd, rd, scratch)
%endm

# Signed 16-bit load: gather with %ld_h, then shift the value to the top
# of the register and arithmetic-shift it back to replicate bit 15.
%macro ld_sh(rd, base, off, scratch)
    %ld_h(rd, base, off, scratch)
    %shli(rd, rd, 48)
    %sari(rd, rd, 48)
%endm

# Signed 32-bit load: gather with %ld_w, then replicate bit 31 upward.
%macro ld_sw(rd, base, off, scratch)
    %ld_w(rd, base, off, scratch)
    %shli(rd, rd, 32)
    %sari(rd, rd, 32)
%endm

# Store the low 16 bits of rs at base+off, low byte first. rs is only
# read; the shifted copy lives in `scratch`.
%macro st_h(rs, base, off, scratch)
    %sb(rs, base, off)
    %shri(scratch, rs, 8)
    %sb(scratch, base, (+ off 1))
%endm

# Store the low 32 bits of rs at base+off, low byte first. Every byte is
# derived from the untouched rs, so the shifts are independent of each other.
%macro st_w(rs, base, off, scratch)
    %sb(rs, base, off)
    %shri(scratch, rs, 8)
    %sb(scratch, base, (+ off 1))
    %shri(scratch, rs, 16)
    %sb(scratch, base, (+ off 2))
    %shri(scratch, rs, 24)
    %sb(scratch, base, (+ off 3))
%endm

# =========================================================================
# Sign and zero extension
# =========================================================================
#
#   %sextN(rd, ra)   truncate ra to N bits and sign-extend to 64.
#   %zextN(rd, ra)   truncate ra to N bits and zero-extend to 64.
#   %zext32(rd, ra, scratch)
#                    like zextN but needs a scratch register because
#                    0xFFFFFFFF does not fit a 16-bit movz immediate
#                    (the path %andi takes when materializing the mask).
#
# rd may equal ra. The signed forms use shli/sari at the right amount;
# zext8/zext16 ride on %andi (the mask fits movz so no caller scratch
# needed); zext32 materializes the mask explicitly, so its `scratch` must
# not be the same register as `ra`.

# Keep the low 8 bits of ra and replicate bit 7 through bit 63.
%macro sext8(rd, ra)
    %shli(rd, ra, 56)
    %sari(rd, rd, 56)
%endm

# Keep the low 16 bits of ra and replicate bit 15 through bit 63.
%macro sext16(rd, ra)
    %shli(rd, ra, 48)
    %sari(rd, rd, 48)
%endm

# Keep the low 32 bits of ra and replicate bit 31 through bit 63.
%macro sext32(rd, ra)
    %shli(rd, ra, 32)
    %sari(rd, rd, 32)
%endm

# rd = ra & 0xFF.
%macro zext8(rd, ra)
    %andi(rd, ra, 255)
%endm

# rd = ra & 0xFFFF.
%macro zext16(rd, ra)
    %andi(rd, ra, 65535)
%endm

# rd = ra & 0xFFFFFFFF. `scratch` is left holding the mask.
%macro zext32(rd, ra, scratch)
    %li(scratch, 4294967295)
    %and(rd, ra, scratch)
%endm

# =========================================================================
# Frame-slot address
# =========================================================================
#
#   %lea_slot(rd, slot)   rd = address of the frame slot at byte offset
#                         `slot`. Centralizes the "%mov(rd, sp) +
#                         %addi(rd, rd, slot)" idiom -- the backend folds
#                         its hidden frame-header offset into %mov(rd, sp),
#                         so callers must not bake a literal 16 into the
#                         %addi. `slot` may be any M1pp integer expression
#                         (a literal byte offset or a %fn__SO-relative
#                         slot-expr).
# rd = sp-relative address of the frame slot at byte offset `slot`.
%macro lea_slot(rd, slot)
    %mov(rd, sp)
    %addi(rd, rd, slot)
%endm

# =========================================================================
# Pointer scaling
# =========================================================================
#
#   %ptr_add(rd, ptr, idx, sz, scratch)    rd = ptr + idx*sz
#   %ptr_sub(rd, ptr, idx, sz, scratch)    rd = ptr - idx*sz
#   %ptr_diff(rd, p, q, sz, scratch)       rd = (p - q) / sz
#
# `sz` is an M1pp-time integer constant (the C pointee size). The macros
# test `sz < 2` at expansion time: when sz <= 1 the multiply (or divide)
# collapses out and `scratch` is not touched. That byte-stride fast path
# covers both char* (sz=1) and void* (cc.scm uses sz=-1 for the void
# pointee, following GCC's byte-arith extension); both want raw idx with
# no scaling.
#
# In the scaled path (sz >= 2):
#   - %ptr_add and %ptr_sub clobber `scratch`, which is loaded with sz
#     before idx is read and therefore must not be the same register as
#     `idx` or `ptr`.
#   - %ptr_diff clobbers `scratch` and computes through `rd`, so callers
#     must not alias `rd` with `p` or `q`, and `scratch` must not be `rd`
#     (the divisor is loaded after rd already holds p - q).

%macro ptr_add(rd, ptr, idx, sz, scratch)
    %select((< sz 2),
        %add(rd, ptr, idx),
        %li(scratch, sz)
        %mul(scratch, idx, scratch)
        %add(rd, ptr, scratch))
%endm

%macro ptr_sub(rd, ptr, idx, sz, scratch)
    %select((< sz 2),
        %sub(rd, ptr, idx),
        %li(scratch, sz)
        %mul(scratch, idx, scratch)
        %sub(rd, ptr, scratch))
%endm

%macro ptr_diff(rd, p, q, sz, scratch)
    %select((< sz 2),
        %sub(rd, p, q),
        %sub(rd, p, q)
        %li(scratch, sz)
        %div(rd, rd, scratch))
%endm

# =========================================================================
# Memcpy-call shorthand
# =========================================================================
#
# %memcpy_call(dst_reg, src_reg, n_imm)
#   Marshal arguments into the libp1pp memcpy ABI and invoke it. Useful
#   for fixed-size memory copies (e.g. struct copy in a code generator)
#   where the size is known at expansion time.
#
#   The argument registers are written in the order a2, a1, a0, so an
#   input that lives in a register written earlier is lost before it is
#   read:
#     - src_reg must not be a2 (overwritten by the size load);
#     - dst_reg must not be a1 or a2 (overwritten by the size load and
#       the src move).
#   dst_reg == a0 and src_reg == a0 or a1 are safe with this ordering.
#   The result of the call (memcpy returns dst) is left in a0.

%macro memcpy_call(dst_reg, src_reg, n_imm)
    %li(a2, n_imm)
    %mov(a1, src_reg)
    %mov(a0, dst_reg)
    %call(&memcpy)
%endm

# =========================================================================
# Compare-and-set-bool macros
# =========================================================================
#
#   %cmpset_<cc>(rd, ra[, rb])   rd = (ra <cc> rb) ? 1 : 0
#
# Two-operand: eq, ne, lt, ltu, le, leu, ge, geu (signed/unsigned).
# Zero-operand (compare against zero): eqz, nez, ltz.
#
# le/ge/leu/geu have no branch of their own, so they reuse the lt/ltu
# ifelse with the 0/1 arms swapped:
#
#     a >= b  iff  !(a < b)    ->  (a < b) ? 0 : 1
#     a <= b  iff  !(b < a)    ->  (b < a) ? 0 : 1   (operands swapped too)
#
# ge/le go through ifelse_lt (signed); geu/leu through ifelse_ltu.
#
# Lower to %ifelse_<cc>(...) which itself works across all P1 backends.
# A backend that supports a native conditional-set instruction can later
# specialize these to a single op without touching callers.
# In every cmpset macro the comparison branch executes before either %li,
# so the inputs are read before rd is written.

# rd = (ra == rb)
%macro cmpset_eq(rd, ra, rb)
    %ifelse_eq(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })
%endm

# rd = (ra != rb)
%macro cmpset_ne(rd, ra, rb)
    %ifelse_ne(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })
%endm

# rd = (ra < rb), signed
%macro cmpset_lt(rd, ra, rb)
    %ifelse_lt(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })
%endm

# rd = (ra < rb), unsigned
%macro cmpset_ltu(rd, ra, rb)
    %ifelse_ltu(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })
%endm

# rd = (ra <= rb), signed: 0 when rb < ra, else 1
%macro cmpset_le(rd, ra, rb)
    %ifelse_lt(rb, ra, { %li(rd, 0) }, { %li(rd, 1) })
%endm

# rd = (ra <= rb), unsigned: 0 when rb < ra, else 1
%macro cmpset_leu(rd, ra, rb)
    %ifelse_ltu(rb, ra, { %li(rd, 0) }, { %li(rd, 1) })
%endm

# rd = (ra >= rb), signed: 0 when ra < rb, else 1
%macro cmpset_ge(rd, ra, rb)
    %ifelse_lt(ra, rb, { %li(rd, 0) }, { %li(rd, 1) })
%endm

# rd = (ra >= rb), unsigned: 0 when ra < rb, else 1
%macro cmpset_geu(rd, ra, rb)
    %ifelse_ltu(ra, rb, { %li(rd, 0) }, { %li(rd, 1) })
%endm

# rd = (ra == 0)
%macro cmpset_eqz(rd, ra)
    %ifelse_eqz(ra, { %li(rd, 1) }, { %li(rd, 0) })
%endm

# rd = (ra != 0)
%macro cmpset_nez(rd, ra)
    %ifelse_nez(ra, { %li(rd, 1) }, { %li(rd, 0) })
%endm

# rd = (ra < 0)
%macro cmpset_ltz(rd, ra)
    %ifelse_ltz(ra, { %li(rd, 1) }, { %li(rd, 0) })
%endm

# =========================================================================
# Tiny unops
# =========================================================================
#
#   %neg(rd, ra, scratch)    rd = -ra   (scratch holds the zero literal)
#   %bnot(rd, ra, scratch)   rd = ~ra   (scratch holds the all-ones literal)
#   %bool(rd, ra)            rd = (ra != 0) ? 1 : 0   (alias of cmpset_nez)
#
# For %neg and %bnot the literal is loaded into `scratch` before ra is
# read, so `scratch` must not be the same register as `ra`.

%macro neg(rd, ra, scratch)
    %li(scratch, 0)
    %sub(rd, scratch, ra)
%endm

%macro bnot(rd, ra, scratch)
    %li(scratch, -1)
    %xor(rd, ra, scratch)
%endm

%macro bool(rd, ra)
    %cmpset_nez(rd, ra)
%endm

# =========================================================================
# Switch dispatch
# =========================================================================
#
# %switch_case(ctrl, scratch, key, target)
#   If `ctrl == key`, branch to `target`. `scratch` is used to
#   materialize the key as a register operand (so it must not be `ctrl`).
#   `target` is the full branch target (e.g. `&.case_3`).
#
# A code generator emitting a switch dispatcher emits one
# %switch_case per case, then an unconditional branch to the default.

%macro switch_case(ctrl, scratch, key, target)
    %li(scratch, key)
    %beq(ctrl, scratch, target)
%endm

# =========================================================================
# Control-flow macros
# =========================================================================
#
# Every conditional block macro uses a uniform three-branch lowering that
# works for all seven P1 conditions (including LT, LTU, LTZ which have no
# inverted branch): load a "take the body" target, branch on cc, then
# unconditionally skip past the body.
# ---- %if_<cc> -----------------------------------------------------------
#
# %if_<cc>(ra[, rb], body): run `body` only when the condition holds.
# The conditional branch jumps INTO the body; the fall-through path is an
# unconditional branch around it. `@body` / `@end` are per-expansion labels.

%macro if_eq(ra, rb, body)
    %beq(ra, rb, &@body)
    %b(&@end)
:@body
    body
:@end
%endm

%macro if_ne(ra, rb, body)
    %bne(ra, rb, &@body)
    %b(&@end)
:@body
    body
:@end
%endm

%macro if_lt(ra, rb, body)
    %blt(ra, rb, &@body)
    %b(&@end)
:@body
    body
:@end
%endm

%macro if_ltu(ra, rb, body)
    %bltu(ra, rb, &@body)
    %b(&@end)
:@body
    body
:@end
%endm

%macro if_eqz(ra, body)
    %beqz(ra, &@body)
    %b(&@end)
:@body
    body
:@end
%endm

%macro if_nez(ra, body)
    %bnez(ra, &@body)
    %b(&@end)
:@body
    body
:@end
%endm

%macro if_ltz(ra, body)
    %bltz(ra, &@body)
    %b(&@end)
:@body
    body
:@end
%endm

# ---- %ifelse_<cc> -------------------------------------------------------
#
# %ifelse_<cc>(ra[, rb], tblk, fblk): run `tblk` when the condition holds,
# otherwise `fblk`. The false block is emitted first on the fall-through
# path and ends with a jump over the true block.

%macro ifelse_eq(ra, rb, tblk, fblk)
    %beq(ra, rb, &@tblk)
    fblk
    %b(&@end)
:@tblk
    tblk
:@end
%endm

%macro ifelse_ne(ra, rb, tblk, fblk)
    %bne(ra, rb, &@tblk)
    fblk
    %b(&@end)
:@tblk
    tblk
:@end
%endm

%macro ifelse_lt(ra, rb, tblk, fblk)
    %blt(ra, rb, &@tblk)
    fblk
    %b(&@end)
:@tblk
    tblk
:@end
%endm

%macro ifelse_ltu(ra, rb, tblk, fblk)
    %bltu(ra, rb, &@tblk)
    fblk
    %b(&@end)
:@tblk
    tblk
:@end
%endm

%macro ifelse_eqz(ra, tblk, fblk)
    %beqz(ra, &@tblk)
    fblk
    %b(&@end)
:@tblk
    tblk
:@end
%endm

%macro ifelse_nez(ra, tblk, fblk)
    %bnez(ra, &@tblk)
    fblk
    %b(&@end)
:@tblk
    tblk
:@end
%endm

%macro ifelse_ltz(ra, tblk, fblk)
    %bltz(ra, &@tblk)
    fblk
    %b(&@end)
:@tblk
    tblk
:@end
%endm

# ---- %while_<cc> -------------------------------------------------------
#
# Jump-to-test layout: the body runs iff the positive-sense test holds,
# and the test is compiled below the body so we only emit a forward
# branch at entry. The condition is evaluated before the first iteration,
# so the body may run zero times.

%macro while_eq(ra, rb, body)
    %b(&@test)
:@body
    body
:@test
    %beq(ra, rb, &@body)
%endm

%macro while_ne(ra, rb, body)
    %b(&@test)
:@body
    body
:@test
    %bne(ra, rb, &@body)
%endm

%macro while_lt(ra, rb, body)
    %b(&@test)
:@body
    body
:@test
    %blt(ra, rb, &@body)
%endm

%macro while_ltu(ra, rb, body)
    %b(&@test)
:@body
    body
:@test
    %bltu(ra, rb, &@body)
%endm

%macro while_eqz(ra, body)
    %b(&@test)
:@body
    body
:@test
    %beqz(ra, &@body)
%endm

%macro while_nez(ra, body)
    %b(&@test)
:@body
    body
:@test
    %bnez(ra, &@body)
%endm

%macro while_ltz(ra, body)
    %b(&@test)
:@body
    body
:@test
    %bltz(ra, &@body)
%endm

# ---- %do_while_<cc> ----------------------------------------------------
#
# Body first, test afterwards: the body always runs at least once and
# repeats while the condition holds.

%macro do_while_eq(ra, rb, body)
:@body
    body
    %beq(ra, rb, &@body)
%endm

%macro do_while_ne(ra, rb, body)
:@body
    body
    %bne(ra, rb, &@body)
%endm

%macro do_while_lt(ra, rb, body)
:@body
    body
    %blt(ra, rb, &@body)
%endm

%macro do_while_ltu(ra, rb, body)
:@body
    body
    %bltu(ra, rb, &@body)
%endm

%macro do_while_eqz(ra, body)
:@body
    body
    %beqz(ra, &@body)
%endm

%macro do_while_nez(ra, body)
:@body
    body
    %bnez(ra, &@body)
%endm

%macro do_while_ltz(ra, body)
:@body
    body
    %bltz(ra, &@body)
%endm

# ---- %for_lt ------------------------------------------------------------
#
# %for_lt(i_reg, n_reg, body): i_reg counts 0, 1, 2, ... and `body` runs
# while i_reg < n_reg (signed compare via %blt). i_reg is reset to 0 on
# entry and incremented after each pass through the body.

%macro for_lt(i_reg, n_reg, body)
    %li(i_reg, 0)
    %b(&@test)
:@body
    body
    %addi(i_reg, i_reg, 1)
:@test
    %blt(i_reg, n_reg, &@body)
%endm

# ---- %loop --------------------------------------------------------------
#
# Unconditional loop; the body has to branch out on its own.

%macro loop(body)
:@top
    body
    %b(&@top)
%endm

# ---- Scoped loops -------------------------------------------------------
#
# Each scoped form opens a hex2++ `.scope` and defines two dotted labels
# inside it: `.top` (where `%continue` should land) and `.end`
# (immediately after the loop, where `%break` should land). The generic
# `%break` and `%continue` macros below emit branches to `&.end` /
# `&.top`; hex2++'s innermost-out scope walk binds those references to
# the nearest enclosing scoped loop.
#
# Nested scoped loops shadow each other: a `%break` inside an inner loop
# targets the inner loop's `.end`. Non-loop control-flow macros
# (`%if_<cc>`, `%ifelse_<cc>`) do not open a `.scope`, so `%break` /
# `%continue` inside them passes through to the enclosing scoped loop.
#
# In the while forms `.top` is the condition test; in %for_lt_scoped it
# is the increment, so a `%continue` still advances the counter.

%macro loop_scoped(body)
.scope
:.top
    body
    %b(&.top)
:.end
.endscope
%endm

%macro while_scoped_eq(ra, rb, body)
.scope
    %b(&.top)
:.body
    body
:.top
    %beq(ra, rb, &.body)
:.end
.endscope
%endm

%macro while_scoped_ne(ra, rb, body)
.scope
    %b(&.top)
:.body
    body
:.top
    %bne(ra, rb, &.body)
:.end
.endscope
%endm

%macro while_scoped_lt(ra, rb, body)
.scope
    %b(&.top)
:.body
    body
:.top
    %blt(ra, rb, &.body)
:.end
.endscope
%endm

%macro while_scoped_ltu(ra, rb, body)
.scope
    %b(&.top)
:.body
    body
:.top
    %bltu(ra, rb, &.body)
:.end
.endscope
%endm

%macro while_scoped_eqz(ra, body)
.scope
    %b(&.top)
:.body
    body
:.top
    %beqz(ra, &.body)
:.end
.endscope
%endm

%macro while_scoped_nez(ra, body)
.scope
    %b(&.top)
:.body
    body
:.top
    %bnez(ra, &.body)
:.end
.endscope
%endm

%macro while_scoped_ltz(ra, body)
.scope
    %b(&.top)
:.body
    body
:.top
    %bltz(ra, &.body)
:.end
.endscope
%endm

%macro for_lt_scoped(i_reg, n_reg, body)
.scope
    %li(i_reg, 0)
    %b(&.test)
:.body
    body
:.top
    %addi(i_reg, i_reg, 1)
:.test
    %blt(i_reg, n_reg, &.body)
:.end
.endscope
%endm

# Leave the nearest enclosing scoped loop.
%macro break()
    %b(&.end)
%endm

# Jump to the `.top` label of the nearest enclosing scoped loop.
%macro continue()
    %b(&.top)
%endm

# =========================================================================
# %fn -- scope-introducing function definition
# =========================================================================
#
# Opens a hex2++ `.scope` around the body so dotted local labels (`:.foo`,
# `&.foo`) are private to this function. The body is bracketed by
# %enter(size) and %eret, so functions defined with %fn always carry a
# standard frame. The function label itself is emitted before the scope
# opens.

%macro fn(name, size, body)
: ## name
.scope
    %enter(size)
    body
    %eret
.endscope
%endm

# =========================================================================
# %fn2 -- function with named locals
# =========================================================================
#
# Like %fn, but the second argument is a braced list of local names
# instead of a byte frame size. Synthesizes a `name_FRAME` %struct
# (one 8-byte slot per local), opens both a hex2++ `.scope` and an
# m1pp `%frame` named after the function, and sizes the stack frame
# from %name_FRAME.SIZE.
#
# Inside the body these helpers resolve against the enclosing frame:
#   %local(slot)       byte offset of local `slot`
#   %stl(reg, slot)    store reg into local `slot`
#   %ldl(reg, slot)    load local `slot` into reg
#
# m1pp tracks the active frame in a single slot independent of hex2++
# scope nesting, so %local / %stl / %ldl keep resolving against the
# function even when the body opens nested `.scope` blocks (e.g. from
# a scoped control-flow macro).
#
# Locals follow the same braces convention as `body`: a multi-local
# list must be braced (`{a, b, c}`); a zero-local function uses `{}`.

%macro fn2(name, locals, body)
%struct name ## _FRAME { locals }
: ## name
.scope
%frame name
    %enter(% ## name ## _FRAME.SIZE)
    body
    %eret
%endframe
.endscope
%endm

# sp-relative store / load of a named local of the active %frame.
%macro stl(reg, slot) %st(reg, sp, %local(slot)) %endm
%macro ldl(reg, slot) %ld(reg, sp, %local(slot)) %endm

# =========================================================================
# %assert_<cc> macros
# =========================================================================
#
# Branch past the panic call when the condition holds; otherwise fall
# through to `LA a0, msg; LA_BR &panic; CALL`. Each assert requires the
# enclosing function to have an established frame.
# `msg` is a bare label name; the macro pastes the `&` in front of it.
# On failure a0 is overwritten with the message address before `panic`
# is called.

%macro assert_eq(ra, rb, msg)
    %beq(ra, rb, &@done)
    %la(a0, & ## msg)
    %call(&panic)
:@done
%endm

%macro assert_ne(ra, rb, msg)
    %bne(ra, rb, &@done)
    %la(a0, & ## msg)
    %call(&panic)
:@done
%endm

%macro assert_lt(ra, rb, msg)
    %blt(ra, rb, &@done)
    %la(a0, & ## msg)
    %call(&panic)
:@done
%endm

%macro assert_ltu(ra, rb, msg)
    %bltu(ra, rb, &@done)
    %la(a0, & ## msg)
    %call(&panic)
:@done
%endm

%macro assert_eqz(ra, msg)
    %beqz(ra, &@done)
    %la(a0, & ## msg)
    %call(&panic)
:@done
%endm

%macro assert_nez(ra, msg)
    %bnez(ra, &@done)
    %la(a0, & ## msg)
    %call(&panic)
:@done
%endm

%macro assert_ltz(ra, msg)
    %bltz(ra, &@done)
    %la(a0, & ## msg)
    %call(&panic)
:@done
%endm

# =========================================================================
# Memory and strings
# =========================================================================

# memcpy(dst=a0, src=a1, n=a2) -> dst (a0)
#   Leaf. Copies n bytes from src to dst. No overlap support where
#   dst > src && dst < src + n; use memmove for that case. These mem*
#   entries are the canonical compiler-builtin runtime -- every build
#   process in this tree (cc.scm + libp1pp + libc, tcc-cc, tcc-gcc)
#   resolves bare `extern memcpy` against this implementation. The
#   vendored mes-libc is flattened with its own memcpy/memmove/memset/
#   memcmp omitted so the symbols are not duplicated at hex2++ time.
#
#   Registers: a3 = saved dst, t0 = byte index, t1 = source address then
#   the byte being copied, t2 = destination address.
:memcpy
.scope
    %mov(a3, a0)
    %li(t0, 0)
:.next_byte
    %beq(t0, a2, &.finished)
    %add(t1, a1, t0)
    %add(t2, a3, t0)
    %lb(t1, t1, 0)
    %sb(t1, t2, 0)
    %addi(t0, t0, 1)
    %b(&.next_byte)
:.finished
    %mov(a0, a3)
    %ret
.endscope

# memmove(dst=a0, src=a1, n=a2) -> dst (a0)
#   Leaf. Like memcpy but tolerates overlap by picking the safe direction.
#   Registers: a3 = saved dst, t0 = byte index, t1 = source address then
#   the byte in flight, t2 = destination address.
:memmove
.scope
    %mov(a3, a0)
    %beq(a0, a1, &.done)
    %beqz(a2, &.done)
    %bltu(a0, a1, &.copy_up_init)
    # dst > src: copy from the high end down so an overlap that would
    # clobber a yet-unread src byte is harmless. n != 0 here, so the
    # pre-decrement never wraps.
    %mov(t0, a2)
:.copy_down
    %addi(t0, t0, -1)
    %add(t1, a1, t0)
    %add(t2, a3, t0)
    %lb(t1, t1, 0)
    %sb(t1, t2, 0)
    %bnez(t0, &.copy_down)
    %b(&.done)
:.copy_up_init
    # dst < src: forward copy is safe.
    %li(t0, 0)
:.copy_up
    %beq(t0, a2, &.done)
    %add(t1, a1, t0)
    %add(t2, a3, t0)
    %lb(t1, t1, 0)
    %sb(t1, t2, 0)
    %addi(t0, t0, 1)
    %b(&.copy_up)
:.done
    %mov(a0, a3)
    %ret
.endscope

# memset(dst=a0, byte=a1, n=a2) -> dst (a0)
#   Leaf. Stores the low byte of a1 (via %sb) into each of the n bytes at
#   dst. a3 = saved dst, t0 = index, t1 = address being written.
:memset
.scope
    %mov(a3, a0)
    %li(t0, 0)
:.fill
    %beq(t0, a2, &.done)
    %add(t1, a3, t0)
    %sb(a1, t1, 0)
    %addi(t0, t0, 1)
    %b(&.fill)
:.done
    %mov(a0, a3)
    %ret
.endscope

# memcmp(a=a0, b=a1, n=a2) -> -1/0/1 (a0)
#   Leaf. Compares byte by byte with unsigned ordering (%bltu) and stops
#   at the first difference; n == 0 yields 0.
:memcmp
.scope
    %li(t0, 0)
:.next
    %beq(t0, a2, &.equal)
    %add(t1, a0, t0)
    %add(t2, a1, t0)
    %lb(t1, t1, 0)
    %lb(t2, t2, 0)
    %bltu(t1, t2, &.less)
    %bltu(t2, t1, &.greater)
    %addi(t0, t0, 1)
    %b(&.next)
:.less
    %li(a0, -1)
    %ret
:.greater
    %li(a0, 1)
    %ret
:.equal
    %li(a0, 0)
    %ret
.endscope

# libp1pp__strlen(cstr=a0) -> n (a0)
#   Leaf. Walks a1 forward to the terminating zero byte and returns the
#   distance from the start.
:libp1pp__strlen
.scope
    %mov(a1, a0)
:.scan
    %lb(t0, a1, 0)
    %beqz(t0, &.found_nul)
    %addi(a1, a1, 1)
    %b(&.scan)
:.found_nul
    %sub(a0, a1, a0)
    %ret
.endscope

# libp1pp__streq(a=a0, b=a1) -> 0 or 1
#   Leaf. 1 when both NUL-terminated strings are byte-identical. The
#   inequality test comes first, so reaching a zero byte in `a` implies
#   `b` ended at the same position.
:libp1pp__streq
.scope
:.next
    %lb(t0, a0, 0)
    %lb(t1, a1, 0)
    %bne(t0, t1, &.differ)
    %beqz(t0, &.same)
    %addi(a0, a0, 1)
    %addi(a1, a1, 1)
    %b(&.next)
:.differ
    %li(a0, 0)
    %ret
:.same
    %li(a0, 1)
    %ret
.endscope

# libp1pp__strcmp(a=a0, b=a1) -> -1/0/1
#   Leaf. Unsigned byte-wise ordering of two NUL-terminated strings.
:libp1pp__strcmp
.scope
:.next
    %lb(t0, a0, 0)
    %lb(t1, a1, 0)
    %bltu(t0, t1, &.less)
    %bltu(t1, t0, &.greater)
    %beqz(t0, &.equal)
    %addi(a0, a0, 1)
    %addi(a1, a1, 1)
    %b(&.next)
:.less
    %li(a0, -1)
    %ret
:.greater
    %li(a0, 1)
    %ret
:.equal
    %li(a0, 0)
    %ret
.endscope

# =========================================================================
# Integer parsing and formatting
# =========================================================================

# parse_dec(buf=a0, len=a1) -> (value=a0, consumed=a1)
#   Uses an 8-byte frame slot to save buf_start; all hot-loop state lives
#   in caller-saved registers.
#
#   Accepts an optional leading '-' followed by decimal digits and stops
#   at the first non-digit or at buf+len. `consumed` counts the sign as
#   well. When no digit is found the result is (0, 0), even if a '-' was
#   seen. The accumulator is a plain multiply-add; no overflow check is
#   performed.
#
#   Registers: a2 = cursor, a3 = end pointer, a1 = position of the first
#   digit, t0 = negative flag, t1 = accumulator, t2 = current byte/digit,
#   a0 = scratch for the constants 9 and 10 (hence the saved buf_start).
:parse_dec
.scope
    %enter(8)
    %st(a0, sp, 0)
    %add(a3, a0, a1)
    %mov(a2, a0)
    %li(t0, 0)
    %li(t1, 0)

    # Optional sign: '-' is 45.
    %beq(a2, a3, &.after_sign)
    %lb(t2, a2, 0)
    %addi(t2, t2, -45)
    %bnez(t2, &.after_sign)
    %li(t0, 1)
    %addi(a2, a2, 1)

:.after_sign
    %mov(a1, a2)

:.digit_loop
    %beq(a2, a3, &.digits_done)
    %lb(t2, a2, 0)
    %addi(t2, t2, -48)
    %bltz(t2, &.digits_done)
    %li(a0, 9)
    %bltu(a0, t2, &.digits_done)
    %li(a0, 10)
    %mul(t1, t1, a0)
    %add(t1, t1, t2)
    %addi(a2, a2, 1)
    %b(&.digit_loop)

:.digits_done
    %beq(a2, a1, &.no_digits)

    # Negate only when a sign was consumed.
    %beqz(t0, &.compute_return)
    %li(a0, 0)
    %sub(t1, a0, t1)

:.compute_return
    %ld(a0, sp, 0)
    %sub(a1, a2, a0)
    %mov(a0, t1)
    %eret

:.no_digits
    %li(a0, 0)
    %li(a1, 0)
    %eret
.endscope

# parse_hex(buf=a0, len=a1) -> (value=a0, consumed=a1)
#   Accepts 0-9, a-f and A-F (no prefix, no sign) and stops at the first
#   other byte or at buf+len. Returns (0, 0) when no hex digit is found.
#   Each digit shifts the accumulator left by four; no overflow check.
#
#   Registers: a2 = cursor, a3 = end pointer, a1 = start position,
#   t1 = accumulator, t2 = current byte, t0 = digit value,
#   a0 = scratch for range limits.
:parse_hex
.scope
    %enter(8)
    %st(a0, sp, 0)
    %add(a3, a0, a1)
    %mov(a2, a0)
    %li(t1, 0)
    %mov(a1, a2)

:.loop
    %beq(a2, a3, &.done)
    %lb(t2, a2, 0)

    # '0'..'9' -> 0..9
    %addi(t0, t2, -48)
    %bltz(t0, &.check_lower)
    %li(a0, 9)
    %bltu(a0, t0, &.check_lower)
    %b(&.accept)

:.check_lower
    # 'a'..'f' -> 10..15
    %addi(t0, t2, -97)
    %bltz(t0, &.check_upper)
    %li(a0, 5)
    %bltu(a0, t0, &.check_upper)
    %addi(t0, t0, 10)
    %b(&.accept)

:.check_upper
    # 'A'..'F' -> 10..15
    %addi(t0, t2, -65)
    %bltz(t0, &.done)
    %li(a0, 5)
    %bltu(a0, t0, &.done)
    %addi(t0, t0, 10)

:.accept
    %shli(t1, t1, 4)
    %or(t1, t1, t0)
    %addi(a2, a2, 1)
    %b(&.loop)

:.done
    %beq(a2, a1, &.no_digits)
    %ld(a0, sp, 0)
    %sub(a1, a2, a0)
    %mov(a0, t1)
    %eret

:.no_digits
    %li(a0, 0)
    %li(a1, 0)
    %eret
.endscope

# fmt_dec(buf=a0, value=a1) -> n_bytes (a0)
#
# Unified signed formatting: digits are written from the per-iteration
# `value % 10`, negated when value is negative. This avoids the
# INT_MIN-overflow trap that `value = -value` would hit.
#
# Writes an optional '-' followed by the digits; only those bytes are
# stored (no terminator). The digit count is computed first so the
# digits can be written right to left into their final positions.
#
# Registers: a0 = first digit position, a2 = digit count, a3 = write
# cursor, t1 = 10, t0/t2 = temporaries.
:fmt_dec
.scope
    %enter(8)
    %st(a0, sp, 0)

    %bltz(a1, &.is_neg)
    %b(&.count)
:.is_neg
    # Emit '-' (45) and move the digit area one byte to the right.
    %li(t0, 45)
    %sb(t0, a0, 0)
    %addi(a0, a0, 1)

:.count
    %mov(t0, a1)
    %li(a2, 1)
    %li(t1, 10)
:.count_loop
    %div(t0, t0, t1)
    %beqz(t0, &.count_done)
    %addi(a2, a2, 1)
    %b(&.count_loop)
:.count_done

    %add(a3, a0, a2)

:.dig_loop
    %addi(a3, a3, -1)
    %rem(t0, a1, t1)
    %bltz(t0, &.neg_digit)
    %b(&.write_digit)
:.neg_digit
    # The remainder came out negative: flip it to its magnitude.
    %li(t2, 0)
    %sub(t0, t2, t0)
:.write_digit
    %addi(t0, t0, 48)
    %sb(t0, a3, 0)
    %div(a1, a1, t1)
    %bnez(a1, &.dig_loop)

    # n_bytes = (digit start + digit count) - original buf.
    %ld(t2, sp, 0)
    %add(a0, a0, a2)
    %sub(a0, a0, t2)
    %eret
.endscope

# fmt_hex(buf=a0, value=a1) -> n_bytes (a0)
#   Writes value as lowercase hex with no prefix, no padding and no
#   terminator; zero is written as the single byte '0'. The value is
#   consumed four bits at a time with logical right shifts.
#
#   Registers: a2 = digit count, a3 = write cursor, t0 = nibble/char,
#   t1 = 10.
:fmt_hex
.scope
    %enter(8)
    %st(a0, sp, 0)

    %bnez(a1, &.nonzero)
    %li(t0, 48)
    %sb(t0, a0, 0)
    %li(a0, 1)
    %eret

:.nonzero
    %mov(t0, a1)
    %li(a2, 0)
:.count_loop
    %addi(a2, a2, 1)
    %shri(t0, t0, 4)
    %bnez(t0, &.count_loop)

    %add(a3, a0, a2)

:.dig_loop
    %addi(a3, a3, -1)
    %andi(t0, a1, 15)
    %li(t1, 10)
    %bltu(t0, t1, &.is_digit)
    # 10..15 -> 'a'..'f'  (97 - 10 = 87)
    %addi(t0, t0, 87)
    %b(&.write_digit)
:.is_digit
    # 0..9 -> '0'..'9'
    %addi(t0, t0, 48)
:.write_digit
    %sb(t0, a3, 0)
    %shri(a1, a1, 4)
    %bnez(a1, &.dig_loop)

    %ld(t2, sp, 0)
    %add(a0, a0, a2)
    %sub(a0, a0, t2)
    %eret
.endscope

# =========================================================================
# Character predicates
# =========================================================================
#
# All predicates are leaf routines. Each range test subtracts the lower
# bound and does one unsigned compare against the range width, so values
# below the range wrap around and fail the test.

# is_digit(c=a0) -> 0 or 1      ('0'..'9')
:is_digit
.scope
    %addi(t0, a0, -48)
    %li(t1, 10)
    %li(a0, 1)
    %bltu(t0, t1, &.done)
    %li(a0, 0)
:.done
    %ret
.endscope

# is_hex_digit(c=a0) -> 0 or 1  ('0'..'9', 'a'..'f', 'A'..'F')
:is_hex_digit
.scope
    %li(t2, 1)
    %addi(t0, a0, -48)
    %li(t1, 10)
    %bltu(t0, t1, &.done)
    %addi(t0, a0, -97)
    %li(t1, 6)
    %bltu(t0, t1, &.done)
    %addi(t0, a0, -65)
    %bltu(t0, t1, &.done)
    %li(t2, 0)
:.done
    %mov(a0, t2)
    %ret
.endscope

# is_space(c=a0) -> 0 or 1      (32, or 9..13)
:is_space
.scope
    %li(t2, 1)
    %addi(t0, a0, -32)
    %beqz(t0, &.done)
    %addi(t0, a0, -9)
    %li(t1, 5)
    %bltu(t0, t1, &.done)
    %li(t2, 0)
:.done
    %mov(a0, t2)
    %ret
.endscope

# is_alpha(c=a0) -> 0 or 1      ('a'..'z', 'A'..'Z')
:is_alpha
.scope
    %li(t2, 1)
    %addi(t0, a0, -97)
    %li(t1, 26)
    %bltu(t0, t1, &.done)
    %addi(t0, a0, -65)
    %bltu(t0, t1, &.done)
    %li(t2, 0)
:.done
    %mov(a0, t2)
    %ret
.endscope

# is_alnum(c=a0) -> 0 or 1      ('0'..'9', 'a'..'z', 'A'..'Z')
:is_alnum
.scope
    %li(t2, 1)
    %addi(t0, a0, -48)
    %li(t1, 10)
    %bltu(t0, t1, &.done)
    %addi(t0, a0, -97)
    %li(t1, 26)
    %bltu(t0, t1, &.done)
    %addi(t0, a0, -65)
    %bltu(t0, t1, &.done)
    %li(t2, 0)
:.done
    %mov(a0, t2)
    %ret
.endscope

# =========================================================================
# Raw syscall wrappers
# =========================================================================
#
# Each wrapper shifts arguments into the syscall convention
# (a0 = number, a1..a3/t0/s0/s1 = args 0..5), emits SYSCALL, and returns
# the raw kernel result. Syscall clobbers only a0, so t0/s0/s1 do not
# need saving.
#
# Arguments are moved highest-first so that no source register is
# overwritten before it has been read.

# sys_read(fd=a0, buf=a1, len=a2) -> n (a0)
:sys_read
    %mov(a3, a2)
    %mov(a2, a1)
    %mov(a1, a0)
    %li(a0, %p1_sys_read)
    %syscall
    %ret

# sys_write(fd=a0, buf=a1, len=a2) -> n (a0)
:sys_write
    %mov(a3, a2)
    %mov(a2, a1)
    %mov(a1, a0)
    %li(a0, %p1_sys_write)
    %syscall
    %ret

# sys_open(path=a0, flags=a1, mode=a2) -> fd (a0)
#   Implemented as openat(AT_FDCWD, path, flags, mode). AT_FDCWD = -100.
#   t0 carries the fourth syscall argument (mode).
:sys_open
    %mov(t0, a2)
    %mov(a3, a1)
    %mov(a2, a0)
    %li(a1, -100)
    %li(a0, %p1_sys_openat)
    %syscall
    %ret

# sys_close(fd=a0) -> r (a0)
:sys_close
    %mov(a1, a0)
    %li(a0, %p1_sys_close)
    %syscall
    %ret

# sys_lseek(fd=a0, off=a1, whence=a2) -> off (a0)
:sys_lseek
    %mov(a3, a2)
    %mov(a2, a1)
    %mov(a1, a0)
    %li(a0, %p1_sys_lseek)
    %syscall
    %ret

# sys_brk(addr=a0) -> new_break (a0). addr=0 returns the current break.
:sys_brk
    %mov(a1, a0)
    %li(a0, %p1_sys_brk)
    %syscall
    %ret

# sys_unlink(path=a0) -> 0 / -errno (a0).
#   Implemented as unlinkat(AT_FDCWD, path, 0). AT_FDCWD = -100.
:sys_unlink
    %li(a3, 0)
    %mov(a2, a0)
    %li(a1, -100)
    %li(a0, %p1_sys_unlinkat)
    %syscall
    %ret

# sys_exit(code=a0) -> never returns
#
# The self-branch after the syscall is a backstop: control must never
# fall through into whatever follows this routine.
:sys_exit
.scope
    %mov(a1, a0)
    %li(a0, %p1_sys_exit)
    %syscall
:.spin
    %b(&.spin)
.endscope

# =========================================================================
# Print helpers
# =========================================================================
#
# print(buf, len) and eprint(buf, len) write the whole buffer to fd 1 /
# fd 2 by delegating to libp1pp__write_all, which loops on sys_write
# until all bytes are written or an error is reported. All other print
# helpers compose on top of those two.

# print(buf=a0, len=a1) -> 0, or the negative result of libp1pp__write_all
#
# Shifts (buf, len) into the (fd, buf, len) argument order with fd = 1
# and returns the helper's result unchanged.
%fn(print, 0, {
    %mov(a2, a1)
    %mov(a1, a0)
    %li(a0, 1)
    %call(&libp1pp__write_all)
})

# eprint(buf=a0, len=a1) -> 0, or the negative result of libp1pp__write_all
#
# Same as print, but writes to fd 2.
%fn(eprint, 0, {
    %mov(a2, a1)
    %mov(a1, a0)
    %li(a0, 2)
    %call(&libp1pp__write_all)
})

# println(buf=a0, len=a1) -> 0, or the negative result of the failing print
#
# Prints the buffer, then the single byte at libp1pp__newline. The
# newline is skipped when the first print fails. s0 carries the result
# across the second call.
%fn(println, 16, {
    %st(s0, sp, 0)

    %call(&print)
    %mov(s0, a0)
    %bltz(s0, &.done)

    %la(a0, &libp1pp__newline)
    %li(a1, 1)
    %call(&print)
    %mov(s0, a0)

:.done
    %mov(a0, s0)
    %ld(s0, sp, 0)
})

# eprintln(buf=a0, len=a1) -> 0, or the negative result of the failing eprint
#
# Same as println, but built on eprint.
%fn(eprintln, 16, {
    %st(s0, sp, 0)

    %call(&eprint)
    %mov(s0, a0)
    %bltz(s0, &.done)

    %la(a0, &libp1pp__newline)
    %li(a1, 1)
    %call(&eprint)
    %mov(s0, a0)

:.done
    %mov(a0, s0)
    %ld(s0, sp, 0)
})

# print_cstr(s=a0) -> result of print
#
# Measures the string with libp1pp__strlen, then prints that many bytes.
# s0 keeps the string pointer alive across the strlen call.
%fn(print_cstr, 16, {
    %st(s0, sp, 0)
    %mov(s0, a0)
    %call(&libp1pp__strlen)
    %mov(a1, a0)
    %mov(a0, s0)
    %call(&print)
    %ld(s0, sp, 0)
})

# eprint_cstr(s=a0) -> result of eprint
#
# Same as print_cstr, but built on eprint.
%fn(eprint_cstr, 16, {
    %st(s0, sp, 0)
    %mov(s0, a0)
    %call(&libp1pp__strlen)
    %mov(a1, a0)
    %mov(a0, s0)
    %call(&eprint)
    %ld(s0, sp, 0)
})

# print_int(value=a0) -> result of print
#
# Calls fmt_dec with a0 = libp1pp__num_buf and a1 = value, then prints
# the buffer using fmt_dec's return value as the byte count. The buffer
# is a single shared global, so its contents are overwritten by the
# next print_int / print_hex call.
%fn(print_int, 0, {
    %mov(a1, a0)
    %la(a0, &libp1pp__num_buf)
    %call(&fmt_dec)
    %mov(a1, a0)
    %la(a0, &libp1pp__num_buf)
    %call(&print)
})

# print_hex(value=a0) -> result of print
#
# Same as print_int, but formats with fmt_hex.
%fn(print_hex, 0, {
    %mov(a1, a0)
    %la(a0, &libp1pp__num_buf)
    %call(&fmt_hex)
    %mov(a1, a0)
    %la(a0, &libp1pp__num_buf)
    %call(&print)
})

# =========================================================================
# File helpers
# =========================================================================

# read_file(path=a0, buf=a1, cap=a2) -> n or -1
#
# Opens `path` with flags 0 and mode 0, issues ONE sys_read of up to
# `cap` bytes into `buf`, closes the descriptor, and returns the byte
# count from that read. Returns -1 when either the open or the read
# fails. Because only a single read is issued, n can be smaller than
# the file; the result of sys_close is ignored.
#
# Register use: s1 = buf, s2 = cap, s3 = fd, s0 = read result (held
# across the close).
%fn(read_file, 32, {
    %st(s0, sp, 0)
    %st(s1, sp, 8)
    %st(s2, sp, 16)
    %st(s3, sp, 24)

    %mov(s1, a1)
    %mov(s2, a2)

    %li(a1, 0)
    %li(a2, 0)
    %call(&sys_open)
    %bltz(a0, &.fail)
    %mov(s3, a0)

    %mov(a1, s1)
    %mov(a2, s2)
    %call(&sys_read)
    %mov(s0, a0)

    %mov(a0, s3)
    %call(&sys_close)

    %mov(a0, s0)
    %bltz(a0, &.fail)
    %b(&.done)

:.fail
    %li(a0, -1)

:.done
    %ld(s0, sp, 0)
    %ld(s1, sp, 8)
    %ld(s2, sp, 16)
    %ld(s3, sp, 24)
})

# libp1pp__write_all(fd=a0, buf=a1, len=a2) -> 0 or <0 on error
#
# Loop on sys_write until all bytes are written. Used by print / eprint
# / write_file. Retries partial writes but returns the first negative
# kernel return unchanged.
#
# A sys_write that returns 0 while bytes remain makes no progress, so
# retrying it would spin forever; that case is reported as -5 (-EIO on
# Linux) instead of being retried.
#
# Register use: s0 = fd, s1 = next byte to write, s2 = bytes remaining.
%fn(libp1pp__write_all, 24, {
    %st(s0, sp, 0)
    %st(s1, sp, 8)
    %st(s2, sp, 16)

    %mov(s0, a0)
    %mov(s1, a1)
    %mov(s2, a2)

:.loop
    %beqz(s2, &.done_ok)
    %mov(a0, s0)
    %mov(a1, s1)
    %mov(a2, s2)
    %call(&sys_write)
    %bltz(a0, &.done)
    %beqz(a0, &.stalled)
    %add(s1, s1, a0)
    %sub(s2, s2, a0)
    %b(&.loop)

:.stalled
    %li(a0, -5)
    %b(&.done)

:.done_ok
    %li(a0, 0)
:.done
    %ld(s0, sp, 0)
    %ld(s1, sp, 8)
    %ld(s2, sp, 16)
})

# write_file(path=a0, buf=a1, len=a2) -> 0 or -1
#
# Flags: O_WRONLY|O_CREAT|O_TRUNC. On Linux these are 0x1 | 0x40 |
# 0x200 = 0x241. Mode 0644 octal = 0x1A4.
#
# Opens the file, writes the whole buffer with libp1pp__write_all, and
# closes the descriptor whether or not the write succeeded. Any failure
# of the open or the write is reported as -1; the result of sys_close
# is ignored.
#
# Register use: s0 = buf, then the write result; s1 = len; s2 = fd.
%fn(write_file, 24, {
    %st(s0, sp, 0)
    %st(s1, sp, 8)
    %st(s2, sp, 16)

    %mov(s0, a1)
    %mov(s1, a2)

    %li(a1, 0x241)
    %li(a2, 0x1A4)
    %call(&sys_open)
    %bltz(a0, &.fail)
    %mov(s2, a0)

    %mov(a1, s0)
    %mov(a2, s1)
    %call(&libp1pp__write_all)

    %mov(s0, a0)
    %mov(a0, s2)
    %call(&sys_close)

    %bltz(s0, &.fail)
    %li(a0, 0)
    %b(&.done)

:.fail
    %li(a0, -1)

:.done
    %ld(s0, sp, 0)
    %ld(s1, sp, 8)
    %ld(s2, sp, 16)
})

# =========================================================================
# BSS arena pointer-init table
# =========================================================================
#
# Pattern: a program reserves a stretch of memory past :ELF_end (or any
# base) and wants to carve it into N fixed-size arenas, each anchored
# by a pointer slot in the data section. The table emits one
# (slot, size) row per arena via %arena_entry; init_arenas walks the
# table once at startup and writes base + sum of prior sizes into each
# slot, so arena[k] starts where arena[k-1] ended.
# %arena_entry(slot, size) -- one 16-byte row: 4-byte label ref + 4
# bytes zero pad + 8-byte size. `slot` is passed as a label ref (`&foo`).
# init_arenas reads the first 8 bytes of the row (label ref plus pad)
# as the slot address and the second 8 bytes as the size.
%macro arena_entry(slot, size) slot %(0) $(size) %endm

# init_arenas(base=a0, tbl=a1, tbl_end=a2) -> 0
#
# Walks the (slot, size) rows in [tbl, tbl_end), keeping a running
# offset that starts at 0. For each row: *slot = base + offset, then
# offset += size. Leaf.
#
# The loop ends only when the row pointer equals tbl_end exactly, so
# tbl_end - tbl must be a multiple of the 16-byte row size.
#
# Register use: t0 = running offset, t1 = slot address, t2 = row size,
# a3 = arena start written to the slot.
:init_arenas
.scope
    %li(t0, 0)
:.next_row
    %beq(a1, a2, &.finish)
    %ld(t2, a1, 8)
    %ld(t1, a1, 0)
    %add(a3, a0, t0)
    %st(a3, t1, 0)
    %addi(a1, a1, 16)
    %add(t0, t0, t2)
    %b(&.next_row)
:.finish
    %li(a0, 0)
    %ret
.endscope

# =========================================================================
# Bump allocator
# =========================================================================
#
# Single global arena, bytes carved by monotonic cursor with 8-byte
# alignment. bump_alloc returns 0 when the request would overflow.

# bump_init(base=a0, cap=a1) -> 0
#
# Installs the arena: records `base` as both the arena start and the
# initial cursor, and `cap` as the capacity in bytes. t0 is the address
# scratch for each store.
:bump_init
    %st_global(a0, &libp1pp__bump_base, t0)
    %st_global(a0, &libp1pp__bump_cursor, t0)
    %st_global(a1, &libp1pp__bump_cap, t0)
    %li(a0, 0)
    %ret

# bump_alloc(n=a0) -> ptr (0 on exhaustion)
#
# Round n up to a multiple of 8, then admit iff cursor + n_rounded does
# not exceed base + cap. On success, advance the cursor and return the
# pre-advance value; on failure, leave the cursor untouched and return 0.
:bump_alloc
.scope
    # Round n up to a multiple of 8. If n + 7 wraps past 2^64 the
    # request can never fit, so reject it instead of silently turning it
    # into a tiny allocation.
    %addi(t0, a0, 7)
    %bltu(t0, a0, &.fail)
    %li(t1, -8)
    %and(a0, t0, t1)

    # a3 = limit = base + cap
    %la(a1, &libp1pp__bump_base)
    %ld(a2, a1, 0)
    %la(a1, &libp1pp__bump_cap)
    %ld(a3, a1, 0)
    %add(a3, a2, a3)

    # t0 = &cursor, t1 = cursor
    %la(t0, &libp1pp__bump_cursor)
    %ld(t1, t0, 0)

    # Admit iff cursor <= limit and n_rounded <= limit - cursor. This is
    # the same condition as cursor + n_rounded <= limit, but comparing
    # against the remaining space means the sum can never wrap around
    # and slip past the check.
    %bltu(a3, t1, &.fail)
    %sub(a3, a3, t1)
    %bltu(a3, a0, &.fail)

    %add(t2, t1, a0)
    %st(t2, t0, 0)
    %mov(a0, t1)
    %ret
:.fail
    %li(a0, 0)
    %ret
.endscope

# bump_mark() -> saved
#
# Returns the current cursor so a later bump_release can rewind to it.
:bump_mark
    %ld_global(a0, &libp1pp__bump_cursor)
    %ret

# bump_release(saved=a0) -> 0
#
# Sets the cursor to `saved` without validating it; pass a value
# obtained from bump_mark.
:bump_release
    %st_global(a0, &libp1pp__bump_cursor, t0)
    %li(a0, 0)
    %ret

# bump_reset() -> 0
#
# Rewinds the cursor to the arena base, releasing every allocation.
:bump_reset
    %ld_global(t1, &libp1pp__bump_base)
    %st_global(t1, &libp1pp__bump_cursor, t0)
    %li(a0, 0)
    %ret

# =========================================================================
# Panic
# =========================================================================

# panic(msg_cstr=a0) -> never returns
#
# Writes the NUL-terminated message and a newline through eprint_cstr /
# eprint, then calls sys_exit with status 1. The trailing self-branch
# is a backstop in case sys_exit ever returns.
%fn(panic, 0, {
    %call(&eprint_cstr)
    %la(a0, &libp1pp__newline)
    %li(a1, 1)
    %call(&eprint)
    %li(a0, 1)
    %call(&sys_exit)
:.spin
    %b(&.spin)
})

# =========================================================================
# Tracepoint
# =========================================================================
#
# %trace(tag_addr, tag_len) — emit a runtime stderr probe at the call
# site. Prints `[trace @0xHEX TAG]\n` to stderr, where 0xHEX is the
# runtime address of this trace site (the address of `:@here` in this
# site's expansion) and TAG is the byte string at
# [tag_addr..tag_addr+tag_len).
#
# `tag_addr` is a label reference token (e.g. `&cc__str_3`) — the
# caller is responsible for emitting the bytes at that label. cc.scm's
# --cc-trace-emit interns the mangled function name through the
# regular string pool, which already pads each entry to an 8-byte
# multiple, so the next item past the tag stays aligned. `tag_len` is
# the *logical* byte count to print (without trailing NUL or pad).
#
# To map a printed address back to source, disassemble the ELF
# (`scripts/disasm-elf.sh`) and locate the printed address. cc.scm
# guarantees that each function's first instruction *is* a trace call,
# so the printed address falls on a known function-entry boundary.
#
# Preserves all exposed P1 registers (a0..a3, t0..t2, s0..s3) by
# borrowing 112 bytes below the current stack pointer: 16 bytes for the
# backend frame prefix plus 88 bytes for the eleven saved registers
# (104 in total; the reservation is 112, presumably to keep sp 16-byte
# aligned). Use only inside an active %fn body, after %enter and before
# %eret.
#
# The save and restore sequences must list the registers at matching
# offsets; a0..a2 are saved before they are reloaded with the call's
# arguments.
%macro trace(tag_addr, tag_len)
:@here
%addi(sp, sp, -112)
%st(a0, sp, 0)
%st(a1, sp, 8)
%st(a2, sp, 16)
%st(a3, sp, 24)
%st(t0, sp, 32)
%st(t1, sp, 40)
%st(t2, sp, 48)
%st(s0, sp, 56)
%st(s1, sp, 64)
%st(s2, sp, 72)
%st(s3, sp, 80)
%la(a0, &@here)
%la(a1, tag_addr)
%li(a2, tag_len)
%call(&libp1pp__trace)
%ld(a0, sp, 0)
%ld(a1, sp, 8)
%ld(a2, sp, 16)
%ld(a3, sp, 24)
%ld(t0, sp, 32)
%ld(t1, sp, 40)
%ld(t2, sp, 48)
%ld(s0, sp, 56)
%ld(s1, sp, 64)
%ld(s2, sp, 72)
%ld(s3, sp, 80)
%addi(sp, sp, 112)
%endm

# libp1pp__trace(addr=a0, tag_addr=a1, tag_len=a2) — print
# "[trace @0xHEX TAG]\n" to stderr.
%fn(libp1pp__trace, 24, {
    %st(s0, sp, 0)
    %st(s1, sp, 8)
    %st(s2, sp, 16)
    %mov(s2, a2)
    %mov(s1, a1)
    %mov(s0, a0)

    %la(a0, &libp1pp__trace_pre)
    %li(a1, 8)
    %call(&eprint)

    %mov(a1, s0)
    %la(a0, &libp1pp__num_buf)
    %call(&fmt_hex)
    %mov(a1, a0)
    %la(a0, &libp1pp__num_buf)
    %call(&eprint)

    %la(a0, &libp1pp__trace_sep)
    %li(a1, 1)
    %call(&eprint)

    %mov(a1, s2)
    %mov(a0, s1)
    %call(&eprint)

    %la(a0, &libp1pp__trace_post)
    %li(a1, 2)
    %call(&eprint)

    %ld(s2, sp, 16)
    %ld(s1, sp, 8)
    %ld(s0, sp, 0)
})

# Notes on libp1pp__trace above: it emits five eprint calls in order --
# the "[trace @" prefix, the address formatted by fmt_hex into
# libp1pp__num_buf, a one-byte separator, the caller's tag bytes, and
# the "]\n" suffix. s0 / s1 / s2 hold addr / tag_addr / tag_len across
# the calls. The results of the eprint calls are not checked.

# Tracepoint message fragments. eprint is passed only the visible byte
# count of each fragment (8, 1, 2); .align 8 keeps each fragment and the
# data labels that follow 8-byte aligned (aarch64 LDR / 4-byte
# inline-data loads fault otherwise).
:libp1pp__trace_pre "[trace @"
.align 8
:libp1pp__trace_sep " "
.align 8
:libp1pp__trace_post "]\n"
.align 8

# =========================================================================
# Internal data
# =========================================================================

# Single newline byte for println / eprintln / panic. Emitted as an
# 8-byte word (0x0A in the low byte, zeros above) so the following
# buffers and the user source that comes after libp1pp stay 8-byte
# aligned. Callers pass a length of 1, so only that one byte is written.
:libp1pp__newline $(10)

# Scratch buffer used by print_int / print_hex and by libp1pp__trace.
# fmt_dec writes at most 20 bytes, fmt_hex at most 16, so four 8-byte
# words (32 bytes) are comfortably above both.
:libp1pp__num_buf $(0) $(0) $(0) $(0)

# Bump-allocator state. Zero-initialized so bump_alloc returns 0 until
# bump_init installs an arena.
:libp1pp__bump_base $(0)
# The word above holds the arena start address: bump_init stores `base`
# there, bump_reset copies it back into the cursor, and bump_alloc adds
# the capacity to it to form the upper limit.
#
# Cursor: bump_init sets it to `base`; bump_alloc returns its current
# value and advances it; bump_mark reads it and bump_release overwrites
# it.
:libp1pp__bump_cursor $(0)
# Capacity: the `cap` argument of bump_init; bump_alloc admits requests
# only up to base + cap.
:libp1pp__bump_cap $(0)