boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

P1pp.P1pp (44172B)


      1 # p1pp.P1pp -- libp1pp v1, portable utility library for P1pp programs.
      2 #
      3 # Concatenated after the P1 backend header and frontend, and before user
      4 # source:
      5 #
      6 #     catm P1-<arch>.M1pp P1.M1pp p1pp.P1pp usersrc.P1pp > program.M1
      7 #
      8 # Targets P1-64 only (WORD = 8). All internal labels use the
      9 # `libp1pp__` prefix; public entry points are unprefixed.
     10 #
     11 # See docs/LIBP1PP.md for the public contract.
     12 
     13 # =========================================================================
     14 # Compile-time helpers
     15 # =========================================================================
     16 
     17 # %alignup -- Round `rs` up to a multiple of `align` (a power of two) and
     18 # place the result in `rd`. `align` must be a constant integer expression.
     19 # `scratch` is clobbered (it holds the mask) and must not be `rd`:
     20 #
     21 #     rd      = rs + (align - 1)
     22 #     scratch = -align          (i.e. ~(align-1) for power-of-two align)
     23 #     rd      = rd & scratch
     24 %macro alignup(rd, rs, align, scratch)
     25     %addi(rd, rs, (- align 1))
     26     %li(scratch, (- 0 align))
     27     %and(rd, rd, scratch)
     28 %endm
     29 
     30 # =========================================================================
     31 # Global memory access
     32 # =========================================================================
     33 #
     34 # Shorthand for the la + ld(_,0) / la + st(_,0) idiom that dereferences a
     35 # pointer slot at a labeled global. `name` is a label expression in the
     36 # `&foo` form that %la accepts.
     37 #
     38 # %ld_global  -- rd = *name. rd doubles as the address scratch.
     39 # %st_global  -- *name = rs. Needs a separate `scratch` since rs holds the
     40 #                value being written.
     41 # %lda_global -- rd = *name AND raddr = &name. Use when the address is
     42 #                also needed afterward (e.g. load then store back).
     43 
     44 %macro ld_global(rd, name)
     45     %la(rd, name)
     46     %ld(rd, rd, 0)
     47 %endm
     48 
     49 %macro st_global(rs, name, scratch)
     50     %la(scratch, name)
     51     %st(rs, scratch, 0)
     52 %endm
     53 
     54 %macro lda_global(rd, raddr, name)
     55     %la(raddr, name)
     56     %ld(rd, raddr, 0)
     57 %endm
     58 
     59 # =========================================================================
     60 # Array indexing
     61 # =========================================================================
     62 #
     63 # Compute *(base + idx * stride + off) for register `base`, register
     64 # `idx`, and constant `stride` and `off`. `stride` is materialized via
     65 # %li and folded into idx via %mul, so it does not need to be a power of
     66 # two. `off` is a constant byte offset within the element (use 0 for
     67 # plain element access; non-zero for field access).
     68 #
     69 # All three macros need a separate scratch to hold the computed address,
     70 # because both `base` and `idx` are register inputs that must survive the
     71 # multiply. `lda_array` reuses its `raddr` output as that scratch.
     72 #
     73 # %ld_array  -- rd = *(base + idx*stride + off).
     74 # %st_array  -- *(base + idx*stride + off) = rs.
     75 # %lda_array -- rd = *(base + idx*stride + off) AND raddr = base +
     76 #               idx*stride (element address; `off` is not added). Use when
     77 #               later code needs the address (e.g. more field accesses).
     78 
     79 %macro ld_array(rd, base, stride, idx, off, scratch)
     80     %li(scratch, stride)
     81     %mul(scratch, idx, scratch)
     82     %add(scratch, base, scratch)
     83     %ld(rd, scratch, off)
     84 %endm
     85 
     86 %macro st_array(rs, base, stride, idx, off, scratch)
     87     %li(scratch, stride)
     88     %mul(scratch, idx, scratch)
     89     %add(scratch, base, scratch)
     90     %st(rs, scratch, off)
     91 %endm
     92 
     93 %macro lda_array(rd, raddr, base, stride, idx, off)
     94     %li(raddr, stride)
     95     %mul(raddr, idx, raddr)
     96     %add(raddr, base, raddr)
     97     %ld(rd, raddr, off)
     98 %endm
     99 
    100 # =========================================================================
    101 # Sub-word memory access
    102 # =========================================================================
    103 #
    104 # P1 has only 1-byte (%lb/%sb) and 8-byte (%ld/%st) memory ops, and the
    105 # 8-byte ops require natural 8-byte alignment. For struct fields and
    106 # packed data laid out at narrower widths, sub-word access is byte-
    107 # decomposed: %lb-gather + shli/or for loads, %sb-scatter + shri for
    108 # stores. These macros encapsulate that pattern so callers do not have
    109 # to open-code it (and so a backend can later substitute a single
    110 # native sub-word load/store when alignment is statically known).
    111 #
    112 # Conventions:
    113 #   `rd` is the destination (loads); `rs` is the source (stores).
    114 #   Stores preserve `rs`; loads write `rd` first, so `rd` must not be
    115 #   `base`. `scratch` is a working register distinct from rd/rs and
    116 #   base. Bytes are little-endian: byte 0 (low) at off+0. %ld_sh and
    117 #   %ld_sw sign-extend the gathered value to the canonical 64-bit form.
    118 #
    119 # %ld_h(rd, base, off, scratch)   — 2-byte zero-extending load
    120 # %ld_w(rd, base, off, scratch)   — 4-byte zero-extending load
    121 # %ld_sh(rd, base, off, scratch)  — 2-byte sign-extending load
    122 # %ld_sw(rd, base, off, scratch)  — 4-byte sign-extending load
    123 # %st_h(rs, base, off, scratch)   — 2-byte store (writes low 16 bits)
    124 # %st_w(rs, base, off, scratch)   — 4-byte store (writes low 32 bits)
    125 
    126 %macro ld_h(rd, base, off, scratch)
    127     %lb(rd, base, off)
    128     %lb(scratch, base, (+ off 1))
    129     %shli(scratch, scratch, 8)
    130     %or(rd, rd, scratch)
    131 %endm
    132 
    133 %macro ld_w(rd, base, off, scratch)
    134     %lb(rd, base, off)
    135     %lb(scratch, base, (+ off 1))
    136     %shli(scratch, scratch, 8)
    137     %or(rd, rd, scratch)
    138     %lb(scratch, base, (+ off 2))
    139     %shli(scratch, scratch, 16)
    140     %or(rd, rd, scratch)
    141     %lb(scratch, base, (+ off 3))
    142     %shli(scratch, scratch, 24)
    143     %or(rd, rd, scratch)
    144 %endm
    145 
    146 %macro ld_sh(rd, base, off, scratch)
    147     %ld_h(rd, base, off, scratch)
    148     %shli(rd, rd, 48)
    149     %sari(rd, rd, 48)
    150 %endm
    151 
    152 %macro ld_sw(rd, base, off, scratch)
    153     %ld_w(rd, base, off, scratch)
    154     %shli(rd, rd, 32)
    155     %sari(rd, rd, 32)
    156 %endm
    157 
    158 %macro st_h(rs, base, off, scratch)
    159     %sb(rs, base, off)
    160     %shri(scratch, rs, 8)
    161     %sb(scratch, base, (+ off 1))
    162 %endm
    163 
    164 %macro st_w(rs, base, off, scratch)
    165     %sb(rs, base, off)
    166     %shri(scratch, rs, 8)
    167     %sb(scratch, base, (+ off 1))
    168     %shri(scratch, rs, 16)
    169     %sb(scratch, base, (+ off 2))
    170     %shri(scratch, rs, 24)
    171     %sb(scratch, base, (+ off 3))
    172 %endm
    173 
    174 # =========================================================================
    175 # Sign and zero extension
    176 # =========================================================================
    177 #
    178 # %sextN(rd, ra)        truncate ra to N bits and sign-extend to 64.
    179 # %zextN(rd, ra)        truncate ra to N bits and zero-extend to 64.
    180 # %zext32(rd, ra, scratch)
    181 #                       like zextN but needs a scratch register because
    182 #                       0xFFFFFFFF does not fit a 16-bit movz immediate
    183 #                       (the path %andi takes when materializing the mask).
    184 #
    185 # rd may equal ra; zext32's scratch must not be ra (it is loaded with
    186 # the mask first). The signed forms use shli/sari by 64-N; zext8/zext16
    187 # ride on %andi (mask fits movz, no scratch); zext32 uses %li + %and.
    188 
    189 %macro sext8(rd, ra)
    190     %shli(rd, ra, 56)
    191     %sari(rd, rd, 56)
    192 %endm
    193 
    194 %macro sext16(rd, ra)
    195     %shli(rd, ra, 48)
    196     %sari(rd, rd, 48)
    197 %endm
    198 
    199 %macro sext32(rd, ra)
    200     %shli(rd, ra, 32)
    201     %sari(rd, rd, 32)
    202 %endm
    203 
    204 %macro zext8(rd, ra)
    205     %andi(rd, ra, 255)
    206 %endm
    207 
    208 %macro zext16(rd, ra)
    209     %andi(rd, ra, 65535)
    210 %endm
    211 
    212 %macro zext32(rd, ra, scratch)
    213     %li(scratch, 4294967295)
    214     %and(rd, ra, scratch)
    215 %endm
    216 
    217 # =========================================================================
    218 # Frame-slot address
    219 # =========================================================================
    220 #
    221 # %lea_slot(rd, slot)   rd = address of the frame slot at byte offset
    222 #                       `slot`. Centralizes the "%mov(rd, sp) +
    223 #                       %addi(rd, rd, slot)" idiom — the backend folds
    224 #                       its hidden frame-header offset into %mov(rd, sp),
    225 #                       so callers must not bake a literal 16 into the
    226 #                       %addi. `slot` may be any M1pp integer expression
    227 #                       (a literal byte offset or a %fn__SO-relative
    228 #                       slot-expr).
    229 
    230 %macro lea_slot(rd, slot)
    231     %mov(rd, sp)
    232     %addi(rd, rd, slot)
    233 %endm
    234 
    235 # =========================================================================
    236 # Pointer scaling
    237 # =========================================================================
    238 #
    239 # %ptr_add(rd, ptr, idx, sz, scratch)   rd = ptr + idx*sz
    240 # %ptr_sub(rd, ptr, idx, sz, scratch)   rd = ptr - idx*sz
    241 # %ptr_diff(rd, p, q, sz, scratch)      rd = (p - q) / sz
    242 #
    243 # `sz` is an M1pp-time integer constant (the C pointee size). When
    244 # sz < 2 the multiply (or divide) collapses out at expansion time.
    245 #
    246 # %ptr_add and %ptr_sub clobber `scratch`. %ptr_diff clobbers `scratch`
    247 # (only when sz >= 2) and computes through `rd`, so callers must not
    248 # alias `rd` with `p` or `q` in the sz >= 2 path.
    249 
    250 # sz <= 1 takes the byte-stride fast path: char* (sz=1) and void*
    251 # (cc.scm uses sz=-1 for the void pointee, following GCC's byte-arith
    252 # extension) both want raw idx with no scaling.
    253 
    254 %macro ptr_add(rd, ptr, idx, sz, scratch)
    255 %select((< sz 2),
    256     %add(rd, ptr, idx),
    257     %li(scratch, sz)
    258     %mul(scratch, idx, scratch)
    259     %add(rd, ptr, scratch))
    260 %endm
    261 
    262 %macro ptr_sub(rd, ptr, idx, sz, scratch)
    263 %select((< sz 2),
    264     %sub(rd, ptr, idx),
    265     %li(scratch, sz)
    266     %mul(scratch, idx, scratch)
    267     %sub(rd, ptr, scratch))
    268 %endm
    269 
    270 %macro ptr_diff(rd, p, q, sz, scratch)
    271 %select((< sz 2),
    272     %sub(rd, p, q),
    273     %sub(rd, p, q)
    274     %li(scratch, sz)
    275     %div(rd, rd, scratch))
    276 %endm
    277 
    278 # =========================================================================
    279 # Memcpy-call shorthand
    280 # =========================================================================
    281 #
    282 # %memcpy_call(dst_reg, src_reg, n_imm)
    283 #   Marshal arguments into the libp1pp memcpy ABI and invoke it, for
    284 #   fixed-size copies (e.g. struct copy in a code generator) whose size
    285 #   is known at expansion time. Moves run a2, a1, a0 in that order, so
    286 #   src_reg must not be a2 and dst_reg must not be a1 or a2.
    287 
    288 %macro memcpy_call(dst_reg, src_reg, n_imm)
    289     %li(a2, n_imm)
    290     %mov(a1, src_reg)
    291     %mov(a0, dst_reg)
    292     %call(&memcpy)
    293 %endm
    294 
    295 # =========================================================================
    296 # Compare-and-set-bool macros
    297 # =========================================================================
    298 #
    299 # %cmpset_<cc>(rd, ra[, rb])  rd = (ra <cc> rb) ? 1 : 0
    300 #
    301 # Two-operand: eq, ne, lt, ltu, le, leu, ge, geu (signed/unsigned).
    302 # One-operand (compare against zero): eqz, nez, ltz.
    303 #
    304 # le/ge/leu/geu lower through the same ifelse machinery with operands
    305 # swapped or arms flipped: a >= b iff !(a < b), reached as
    306 # (a < b) ? 0 : 1; a <= b iff !(b < a), reached as (b < a) ? 0 : 1.
    307 # The unsigned forms pair with ifelse_ltu, the signed with ifelse_lt.
    308 #
    309 # Lower to %ifelse_<cc>(...) which itself works across all P1 backends.
    310 # A backend that supports a native conditional-set instruction can later
    311 # specialize these to a single op without touching callers.
    312 
    313 %macro cmpset_eq(rd, ra, rb)
    314     %ifelse_eq(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })
    315 %endm
    316 
    317 %macro cmpset_ne(rd, ra, rb)
    318     %ifelse_ne(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })
    319 %endm
    320 
    321 %macro cmpset_lt(rd, ra, rb)
    322     %ifelse_lt(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })
    323 %endm
    324 
    325 %macro cmpset_ltu(rd, ra, rb)
    326     %ifelse_ltu(ra, rb, { %li(rd, 1) }, { %li(rd, 0) })
    327 %endm
    328 
    329 %macro cmpset_le(rd, ra, rb)
    330     %ifelse_lt(rb, ra, { %li(rd, 0) }, { %li(rd, 1) })
    331 %endm
    332 
    333 %macro cmpset_leu(rd, ra, rb)
    334     %ifelse_ltu(rb, ra, { %li(rd, 0) }, { %li(rd, 1) })
    335 %endm
    336 
    337 %macro cmpset_ge(rd, ra, rb)
    338     %ifelse_lt(ra, rb, { %li(rd, 0) }, { %li(rd, 1) })
    339 %endm
    340 
    341 %macro cmpset_geu(rd, ra, rb)
    342     %ifelse_ltu(ra, rb, { %li(rd, 0) }, { %li(rd, 1) })
    343 %endm
    344 
    345 %macro cmpset_eqz(rd, ra)
    346     %ifelse_eqz(ra, { %li(rd, 1) }, { %li(rd, 0) })
    347 %endm
    348 
    349 %macro cmpset_nez(rd, ra)
    350     %ifelse_nez(ra, { %li(rd, 1) }, { %li(rd, 0) })
    351 %endm
    352 
    353 %macro cmpset_ltz(rd, ra)
    354     %ifelse_ltz(ra, { %li(rd, 1) }, { %li(rd, 0) })
    355 %endm
    356 
    357 # =========================================================================
    358 # Tiny unops
    359 # =========================================================================
    360 #
    361 # %neg(rd, ra, scratch)   rd = -ra        (scratch holds the zero literal)
    362 # %bnot(rd, ra, scratch)  rd = ~ra        (scratch holds the all-ones literal)
    363 # %bool(rd, ra)           rd = (ra != 0) ? 1 : 0   (alias of cmpset_nez)
    364 
    365 %macro neg(rd, ra, scratch)
    366     %li(scratch, 0)
    367     %sub(rd, scratch, ra)
    368 %endm
    369 
    370 %macro bnot(rd, ra, scratch)
    371     %li(scratch, -1)
    372     %xor(rd, ra, scratch)
    373 %endm
    374 
    375 %macro bool(rd, ra)
    376     %cmpset_nez(rd, ra)
    377 %endm
    378 
    379 # =========================================================================
    380 # Switch dispatch
    381 # =========================================================================
    382 #
    383 # %switch_case(ctrl, scratch, key, target)
    384 #   If `ctrl == key`, branch to `target`. `scratch` is used to
    385 #   materialize the key as a register operand. `target` is the full
    386 #   branch target (e.g. `&.case_3`).
    387 #
    388 # A code generator emitting a switch dispatcher emits one
    389 # %switch_case per case, then an unconditional branch to the default.
    390 
    391 %macro switch_case(ctrl, scratch, key, target)
    392     %li(scratch, key)
    393     %beq(ctrl, scratch, target)
    394 %endm
    395 
    396 # =========================================================================
    397 # Control-flow macros
    398 # =========================================================================
    399 #
    400 # Every conditional block macro uses a uniform three-branch lowering that
    401 # works for all seven P1 conditions (including LT, LTU, LTZ which have no
    402 # inverted branch): load a "take the body" target, branch on cc, then
    403 # unconditionally skip past the body.
    404 
    405 # ---- %if_<cc> -----------------------------------------------------------
    406 
    407 %macro if_eq(ra, rb, body)
    408     %beq(ra, rb, &@body)
    409     %b(&@end)
    410     :@body
    411     body
    412     :@end
    413 %endm
    414 
    415 %macro if_ne(ra, rb, body)
    416     %bne(ra, rb, &@body)
    417     %b(&@end)
    418     :@body
    419     body
    420     :@end
    421 %endm
    422 
    423 %macro if_lt(ra, rb, body)
    424     %blt(ra, rb, &@body)
    425     %b(&@end)
    426     :@body
    427     body
    428     :@end
    429 %endm
    430 
    431 %macro if_ltu(ra, rb, body)
    432     %bltu(ra, rb, &@body)
    433     %b(&@end)
    434     :@body
    435     body
    436     :@end
    437 %endm
    438 
    439 %macro if_eqz(ra, body)
    440     %beqz(ra, &@body)
    441     %b(&@end)
    442     :@body
    443     body
    444     :@end
    445 %endm
    446 
    447 %macro if_nez(ra, body)
    448     %bnez(ra, &@body)
    449     %b(&@end)
    450     :@body
    451     body
    452     :@end
    453 %endm
    454 
    455 %macro if_ltz(ra, body)
    456     %bltz(ra, &@body)
    457     %b(&@end)
    458     :@body
    459     body
    460     :@end
    461 %endm
    462 
    463 # ---- %ifelse_<cc> -------------------------------------------------------
    464 
    465 %macro ifelse_eq(ra, rb, tblk, fblk)
    466     %beq(ra, rb, &@tblk)
    467     fblk
    468     %b(&@end)
    469     :@tblk
    470     tblk
    471     :@end
    472 %endm
    473 
    474 %macro ifelse_ne(ra, rb, tblk, fblk)
    475     %bne(ra, rb, &@tblk)
    476     fblk
    477     %b(&@end)
    478     :@tblk
    479     tblk
    480     :@end
    481 %endm
    482 
    483 %macro ifelse_lt(ra, rb, tblk, fblk)
    484     %blt(ra, rb, &@tblk)
    485     fblk
    486     %b(&@end)
    487     :@tblk
    488     tblk
    489     :@end
    490 %endm
    491 
    492 %macro ifelse_ltu(ra, rb, tblk, fblk)
    493     %bltu(ra, rb, &@tblk)
    494     fblk
    495     %b(&@end)
    496     :@tblk
    497     tblk
    498     :@end
    499 %endm
    500 
    501 %macro ifelse_eqz(ra, tblk, fblk)
    502     %beqz(ra, &@tblk)
    503     fblk
    504     %b(&@end)
    505     :@tblk
    506     tblk
    507     :@end
    508 %endm
    509 
    510 %macro ifelse_nez(ra, tblk, fblk)
    511     %bnez(ra, &@tblk)
    512     fblk
    513     %b(&@end)
    514     :@tblk
    515     tblk
    516     :@end
    517 %endm
    518 
    519 %macro ifelse_ltz(ra, tblk, fblk)
    520     %bltz(ra, &@tblk)
    521     fblk
    522     %b(&@end)
    523     :@tblk
    524     tblk
    525     :@end
    526 %endm
    527 
    528 # ---- %while_<cc> -------------------------------------------------------
    529 #
    530 # Jump-to-test layout: the body runs iff the positive-sense test holds,
    531 # and the test is compiled below the body so we only emit a forward
    532 # branch at entry.
    533 
    534 %macro while_eq(ra, rb, body)
    535     %b(&@test)
    536     :@body
    537     body
    538     :@test
    539     %beq(ra, rb, &@body)
    540 %endm
    541 
    542 %macro while_ne(ra, rb, body)
    543     %b(&@test)
    544     :@body
    545     body
    546     :@test
    547     %bne(ra, rb, &@body)
    548 %endm
    549 
    550 %macro while_lt(ra, rb, body)
    551     %b(&@test)
    552     :@body
    553     body
    554     :@test
    555     %blt(ra, rb, &@body)
    556 %endm
    557 
    558 %macro while_ltu(ra, rb, body)
    559     %b(&@test)
    560     :@body
    561     body
    562     :@test
    563     %bltu(ra, rb, &@body)
    564 %endm
    565 
    566 %macro while_eqz(ra, body)
    567     %b(&@test)
    568     :@body
    569     body
    570     :@test
    571     %beqz(ra, &@body)
    572 %endm
    573 
    574 %macro while_nez(ra, body)
    575     %b(&@test)
    576     :@body
    577     body
    578     :@test
    579     %bnez(ra, &@body)
    580 %endm
    581 
    582 %macro while_ltz(ra, body)
    583     %b(&@test)
    584     :@body
    585     body
    586     :@test
    587     %bltz(ra, &@body)
    588 %endm
    589 
    590 # ---- %do_while_<cc> ----------------------------------------------------
    591 
    592 %macro do_while_eq(ra, rb, body)
    593     :@body
    594     body
    595     %beq(ra, rb, &@body)
    596 %endm
    597 
    598 %macro do_while_ne(ra, rb, body)
    599     :@body
    600     body
    601     %bne(ra, rb, &@body)
    602 %endm
    603 
    604 %macro do_while_lt(ra, rb, body)
    605     :@body
    606     body
    607     %blt(ra, rb, &@body)
    608 %endm
    609 
    610 %macro do_while_ltu(ra, rb, body)
    611     :@body
    612     body
    613     %bltu(ra, rb, &@body)
    614 %endm
    615 
    616 %macro do_while_eqz(ra, body)
    617     :@body
    618     body
    619     %beqz(ra, &@body)
    620 %endm
    621 
    622 %macro do_while_nez(ra, body)
    623     :@body
    624     body
    625     %bnez(ra, &@body)
    626 %endm
    627 
    628 %macro do_while_ltz(ra, body)
    629     :@body
    630     body
    631     %bltz(ra, &@body)
    632 %endm
    633 
    634 # ---- %for_lt ------------------------------------------------------------
    635 
    636 %macro for_lt(i_reg, n_reg, body)
    637     %li(i_reg, 0)
    638     %b(&@test)
    639     :@body
    640     body
    641     %addi(i_reg, i_reg, 1)
    642     :@test
    643     %blt(i_reg, n_reg, &@body)
    644 %endm
    645 
    646 # ---- %loop --------------------------------------------------------------
    647 
    648 %macro loop(body)
    649     :@top
    650     body
    651     %b(&@top)
    652 %endm
    653 
    654 # ---- Scoped loops -------------------------------------------------------
    655 #
    656 # Each scoped form opens a hex2++ `.scope` and defines `.top` (where
    657 # `%continue` lands: the loop head, the re-test for %while_scoped_<cc>,
    658 # the increment for %for_lt_scoped) and `.end` (just past the loop,
    659 # where `%break` lands). The generic `%break` and `%continue` macros
    660 # below branch to `&.end` / `&.top`; hex2++'s innermost-out scope walk
    661 # binds those references to the nearest enclosing scoped loop.
    662 #
    663 # Nested scoped loops shadow each other: a `%break` inside an inner loop
    664 # targets the inner loop's `.end`. Non-loop control-flow macros
    665 # (`%if_<cc>`, `%ifelse_<cc>`) do not open a `.scope`, so `%break` /
    666 # `%continue` inside them passes through to the enclosing scoped loop.
    667 
    668 %macro loop_scoped(body)
    669     .scope
    670     :.top
    671     body
    672     %b(&.top)
    673     :.end
    674     .endscope
    675 %endm
    676 
    677 %macro while_scoped_eq(ra, rb, body)
    678     .scope
    679     %b(&.top)
    680     :.body
    681     body
    682     :.top
    683     %beq(ra, rb, &.body)
    684     :.end
    685     .endscope
    686 %endm
    687 
    688 %macro while_scoped_ne(ra, rb, body)
    689     .scope
    690     %b(&.top)
    691     :.body
    692     body
    693     :.top
    694     %bne(ra, rb, &.body)
    695     :.end
    696     .endscope
    697 %endm
    698 
    699 %macro while_scoped_lt(ra, rb, body)
    700     .scope
    701     %b(&.top)
    702     :.body
    703     body
    704     :.top
    705     %blt(ra, rb, &.body)
    706     :.end
    707     .endscope
    708 %endm
    709 
    710 %macro while_scoped_ltu(ra, rb, body)
    711     .scope
    712     %b(&.top)
    713     :.body
    714     body
    715     :.top
    716     %bltu(ra, rb, &.body)
    717     :.end
    718     .endscope
    719 %endm
    720 
    721 %macro while_scoped_eqz(ra, body)
    722     .scope
    723     %b(&.top)
    724     :.body
    725     body
    726     :.top
    727     %beqz(ra, &.body)
    728     :.end
    729     .endscope
    730 %endm
    731 
    732 %macro while_scoped_nez(ra, body)
    733     .scope
    734     %b(&.top)
    735     :.body
    736     body
    737     :.top
    738     %bnez(ra, &.body)
    739     :.end
    740     .endscope
    741 %endm
    742 
    743 %macro while_scoped_ltz(ra, body)
    744     .scope
    745     %b(&.top)
    746     :.body
    747     body
    748     :.top
    749     %bltz(ra, &.body)
    750     :.end
    751     .endscope
    752 %endm
    753 
    754 %macro for_lt_scoped(i_reg, n_reg, body)
    755     .scope
    756     %li(i_reg, 0)
    757     %b(&.test)
    758     :.body
    759     body
    760     :.top
    761     %addi(i_reg, i_reg, 1)
    762     :.test
    763     %blt(i_reg, n_reg, &.body)
    764     :.end
    765     .endscope
    766 %endm
    767 
    768 %macro break()
    769     %b(&.end)
    770 %endm
    771 
    772 %macro continue()
    773     %b(&.top)
    774 %endm
    775 
    776 # =========================================================================
    777 # %fn -- scope-introducing function definition
    778 # =========================================================================
    779 #
    780 # Opens a hex2++ `.scope` around the body so dotted local labels (`:.foo`,
    781 # `&.foo`) are private to this function. The body is bracketed by
    782 # %enter(size) and %eret, so functions defined with %fn always carry a
    783 # standard frame.
    784 
    785 %macro fn(name, size, body)
    786     : ## name
    787     .scope
    788     %enter(size)
    789     body
    790     %eret
    791     .endscope
    792 %endm
    793 
    794 # =========================================================================
    795 # %fn2 -- function with named locals
    796 # =========================================================================
    797 #
    798 # Like %fn, but the second argument is a braced list of local names
    799 # instead of a byte frame size. Synthesizes a `name_FRAME` %struct
    800 # (one 8-byte slot per local), opens both a hex2++ `.scope` and an
    801 # m1pp `%frame` named after the function, and sizes the stack frame
    802 # from %name_FRAME.SIZE.
    803 #
    804 # Inside the body these helpers resolve against the enclosing frame:
    805 #   %local(slot)     byte offset of local `slot`
    806 #   %stl(reg, slot)  store reg into local `slot`
    807 #   %ldl(reg, slot)  load local `slot` into reg
    808 #
    809 # m1pp tracks the active frame in a single slot independent of hex2++
    810 # scope nesting, so %local / %stl / %ldl keep resolving against the
    811 # function even when the body opens nested `.scope` blocks (e.g. from
    812 # a scoped control-flow macro).
    813 #
    814 # Locals follow the same braces convention as `body`: a multi-local
    815 # list must be braced (`{a, b, c}`); a zero-local function uses `{}`.
    816 
    817 %macro fn2(name, locals, body)
    818     %struct name ## _FRAME { locals }
    819     : ## name
    820     .scope
    821     %frame name
    822     %enter(% ## name ## _FRAME.SIZE)
    823     body
    824     %eret
    825     %endframe
    826     .endscope
    827 %endm
    828 
    829 %macro stl(reg, slot) %st(reg, sp, %local(slot)) %endm
    830 %macro ldl(reg, slot) %ld(reg, sp, %local(slot)) %endm
    831 
    832 # =========================================================================
    833 # %assert_<cc> macros
    834 # =========================================================================
    835 #
    836 # Branch past the panic call when the condition holds; otherwise fall
    837 # through to `LA a0, msg; LA_BR &panic; CALL`. `msg` is a bare label name
    838 # (the macro pastes the `&`). The enclosing function must have a frame.
    839 
    840 %macro assert_eq(ra, rb, msg)
    841     %beq(ra, rb, &@done)
    842     %la(a0, & ## msg)
    843     %call(&panic)
    844     :@done
    845 %endm
    846 
    847 %macro assert_ne(ra, rb, msg)
    848     %bne(ra, rb, &@done)
    849     %la(a0, & ## msg)
    850     %call(&panic)
    851     :@done
    852 %endm
    853 
    854 %macro assert_lt(ra, rb, msg)
    855     %blt(ra, rb, &@done)
    856     %la(a0, & ## msg)
    857     %call(&panic)
    858     :@done
    859 %endm
    860 
    861 %macro assert_ltu(ra, rb, msg)
    862     %bltu(ra, rb, &@done)
    863     %la(a0, & ## msg)
    864     %call(&panic)
    865     :@done
    866 %endm
    867 
    868 %macro assert_eqz(ra, msg)
    869     %beqz(ra, &@done)
    870     %la(a0, & ## msg)
    871     %call(&panic)
    872     :@done
    873 %endm
    874 
    875 %macro assert_nez(ra, msg)
    876     %bnez(ra, &@done)
    877     %la(a0, & ## msg)
    878     %call(&panic)
    879     :@done
    880 %endm
    881 
    882 %macro assert_ltz(ra, msg)
    883     %bltz(ra, &@done)
    884     %la(a0, & ## msg)
    885     %call(&panic)
    886     :@done
    887 %endm
    888 
    889 # =========================================================================
    890 # Memory and strings
    891 # =========================================================================
    892 
    893 # memcpy(dst=a0, src=a1, n=a2) -> dst (a0)
    894 # Leaf. Copies n bytes forward from src to dst; uses a3, t0-t2. No
    895 # overlap support where dst > src && dst < src + n; use memmove there.
    896 # These mem* entries are the canonical compiler-builtin runtime — every
    897 # build process in this tree (cc.scm + libp1pp + libc, tcc-cc, tcc-gcc)
    898 # resolves bare `extern memcpy` against this implementation. The
    899 # vendored mes-libc is flattened with its own memcpy/memmove/memset/
    900 # memcmp omitted so the symbols are not duplicated at hex2++ time.
    901 :memcpy
    902 .scope
    903     %mov(a3, a0)
    904     %li(t0, 0)
    905     :.loop
    906     %beq(t0, a2, &.done)
    907     %add(t1, a1, t0)
    908     %lb(t1, t1, 0)
    909     %add(t2, a3, t0)
    910     %sb(t1, t2, 0)
    911     %addi(t0, t0, 1)
    912     %b(&.loop)
    913     :.done
    914     %mov(a0, a3)
    915     %ret
    916 .endscope
    917 
    918 # memmove(dst=a0, src=a1, n=a2) -> dst (a0)
    919 # Leaf. Overlap-safe memcpy: copies backward when dst > src, else forward.
    920 :memmove
    921 .scope
    922     %mov(a3, a0)
    923     %beq(a0, a1, &.done)
    924     %beqz(a2, &.done)
    925     %bltu(a0, a1, &.fwd)
    926     # dst > src: copy from the high end down so overlap cannot clobber a
    927     # yet-unread src byte. n != 0 here, so the pre-decrement is safe.
    928     %mov(t0, a2)
    929     :.bwd_loop
    930     %addi(t0, t0, -1)
    931     %add(t1, a1, t0)
    932     %lb(t1, t1, 0)
    933     %add(t2, a3, t0)
    934     %sb(t1, t2, 0)
    935     %bnez(t0, &.bwd_loop)
    936     %b(&.done)
    937     :.fwd
    938     # dst < src: a forward (low-to-high) copy is safe.
    939     %li(t0, 0)
    940     :.fwd_loop
    941     %beq(t0, a2, &.done)
    942     %add(t1, a1, t0)
    943     %lb(t1, t1, 0)
    944     %add(t2, a3, t0)
    945     %sb(t1, t2, 0)
    946     %addi(t0, t0, 1)
    947     %b(&.fwd_loop)
    948     :.done
    949     %mov(a0, a3)
    950     %ret
    951 .endscope
    952 
    953 # memset(dst=a0, byte=a1, n=a2) -> dst (a0). Leaf; uses a3, t0, t1.
    954 :memset
    955 .scope
    956     %mov(a3, a0)
    957     %li(t0, 0)
    958     :.loop
    959     %beq(t0, a2, &.done)
    960     %add(t1, a3, t0)
    961     %sb(a1, t1, 0)
    962     %addi(t0, t0, 1)
    963     %b(&.loop)
    964     :.done
    965     %mov(a0, a3)
    966     %ret
    967 .endscope
    968 
    969 # memcmp(a=a0, b=a1, n=a2) -> -1/0/1 (a0)
    970 :memcmp
    971 .scope
    972     %li(t0, 0)
    973     :.loop
    974     %beq(t0, a2, &.eq)
    975     %add(t1, a0, t0)
    976     %lb(t1, t1, 0)
    977     %add(t2, a1, t0)
    978     %lb(t2, t2, 0)
    979     %bltu(t1, t2, &.lt)
    980     %bltu(t2, t1, &.gt)
    981     %addi(t0, t0, 1)
    982     %b(&.loop)
    983     :.lt
    984     %li(a0, -1)
    985     %ret
    986     :.gt
    987     %li(a0, 1)
    988     %ret
    989     :.eq
    990     %li(a0, 0)
    991     %ret
    992 .endscope
    993 
    994 # libp1pp__strlen(cstr=a0) -> n (a0)
    995 :libp1pp__strlen
    996 .scope
    997     %mov(a1, a0)
    998     :.loop
    999     %lb(t0, a1, 0)
   1000     %beqz(t0, &.done)
   1001     %addi(a1, a1, 1)
   1002     %b(&.loop)
   1003     :.done
   1004     %sub(a0, a1, a0)
   1005     %ret
   1006 .endscope
   1007 
   1008 # libp1pp__streq(a=a0, b=a1) -> 0 or 1
   1009 :libp1pp__streq
   1010 .scope
   1011     :.loop
   1012     %lb(t0, a0, 0)
   1013     %lb(t1, a1, 0)
   1014     %bne(t0, t1, &.ne)
   1015     %beqz(t0, &.eq)
   1016     %addi(a0, a0, 1)
   1017     %addi(a1, a1, 1)
   1018     %b(&.loop)
   1019     :.ne
   1020     %li(a0, 0)
   1021     %ret
   1022     :.eq
   1023     %li(a0, 1)
   1024     %ret
   1025 .endscope
   1026 
   1027 # libp1pp__strcmp(a=a0, b=a1) -> -1/0/1
   1028 :libp1pp__strcmp
   1029 .scope
   1030     :.loop
   1031     %lb(t0, a0, 0)
   1032     %lb(t1, a1, 0)
   1033     %bltu(t0, t1, &.lt)
   1034     %bltu(t1, t0, &.gt)
   1035     %beqz(t0, &.eq)
   1036     %addi(a0, a0, 1)
   1037     %addi(a1, a1, 1)
   1038     %b(&.loop)
   1039     :.lt
   1040     %li(a0, -1)
   1041     %ret
   1042     :.gt
   1043     %li(a0, 1)
   1044     %ret
   1045     :.eq
   1046     %li(a0, 0)
   1047     %ret
   1048 .endscope
   1049 
   1050 # =========================================================================
   1051 # Integer parsing and formatting
   1052 # =========================================================================
   1053 
   1054 # parse_dec(buf=a0, len=a1) -> (value=a0, consumed=a1)
   1055 # Uses an 8-byte frame slot to save buf_start; all hot-loop state lives
   1056 # in caller-saved registers.
   1057 :parse_dec
   1058 .scope
   1059     %enter(8)
   1060     %st(a0, sp, 0)
   1061     %add(a3, a0, a1)
   1062     %mov(a2, a0)
   1063     %li(t0, 0)
   1064     %li(t1, 0)
   1065 
   1066     %beq(a2, a3, &.after_sign)
   1067     %lb(t2, a2, 0)
   1068     %addi(t2, t2, -45)
   1069     %bnez(t2, &.after_sign)
   1070     %li(t0, 1)
   1071     %addi(a2, a2, 1)
   1072 
   1073     :.after_sign
   1074     %mov(a1, a2)
   1075 
   1076     :.digit_loop
   1077     %beq(a2, a3, &.digits_done)
   1078     %lb(t2, a2, 0)
   1079     %addi(t2, t2, -48)
   1080     %bltz(t2, &.digits_done)
   1081     %li(a0, 9)
   1082     %bltu(a0, t2, &.digits_done)
   1083     %li(a0, 10)
   1084     %mul(t1, t1, a0)
   1085     %add(t1, t1, t2)
   1086     %addi(a2, a2, 1)
   1087     %b(&.digit_loop)
   1088 
   1089     :.digits_done
   1090     %beq(a2, a1, &.no_digits)
   1091 
   1092     %bnez(t0, &.apply_sign)
   1093     %b(&.compute_return)
   1094     :.apply_sign
   1095     %li(a0, 0)
   1096     %sub(t1, a0, t1)
   1097 
   1098     :.compute_return
   1099     %ld(a0, sp, 0)
   1100     %sub(a1, a2, a0)
   1101     %mov(a0, t1)
   1102     %eret
   1103 
   1104     :.no_digits
   1105     %li(a0, 0)
   1106     %li(a1, 0)
   1107     %eret
   1108 .endscope
   1109 
   1110 # parse_hex(buf=a0, len=a1) -> (value=a0, consumed=a1)
   1111 :parse_hex
   1112 .scope
   1113     %enter(8)
   1114     %st(a0, sp, 0)
   1115     %add(a3, a0, a1)
   1116     %mov(a2, a0)
   1117     %li(t1, 0)
   1118     %mov(a1, a2)
   1119 
   1120     :.loop
   1121     %beq(a2, a3, &.done)
   1122     %lb(t2, a2, 0)
   1123 
   1124     %addi(t0, t2, -48)
   1125     %bltz(t0, &.check_lower)
   1126     %li(a0, 9)
   1127     %bltu(a0, t0, &.check_lower)
   1128     %b(&.accept)
   1129 
   1130     :.check_lower
   1131     %addi(t0, t2, -97)
   1132     %bltz(t0, &.check_upper)
   1133     %li(a0, 5)
   1134     %bltu(a0, t0, &.check_upper)
   1135     %addi(t0, t0, 10)
   1136     %b(&.accept)
   1137 
   1138     :.check_upper
   1139     %addi(t0, t2, -65)
   1140     %bltz(t0, &.done)
   1141     %li(a0, 5)
   1142     %bltu(a0, t0, &.done)
   1143     %addi(t0, t0, 10)
   1144 
   1145     :.accept
   1146     %shli(t1, t1, 4)
   1147     %or(t1, t1, t0)
   1148     %addi(a2, a2, 1)
   1149     %b(&.loop)
   1150 
   1151     :.done
   1152     %beq(a2, a1, &.no_digits)
   1153     %ld(a0, sp, 0)
   1154     %sub(a1, a2, a0)
   1155     %mov(a0, t1)
   1156     %eret
   1157 
   1158     :.no_digits
   1159     %li(a0, 0)
   1160     %li(a1, 0)
   1161     %eret
   1162 .endscope
   1163 
   1164 # fmt_dec(buf=a0, value=a1) -> n_bytes (a0)
   1165 #
   1166 # Unified signed formatting: digits are written from the per-iteration
   1167 # `value % 10`, negated when value is negative. This avoids the
   1168 # INT_MIN-overflow trap that `value = -value` would hit.
   1169 :fmt_dec
   1170 .scope
   1171     %enter(8)
   1172     %st(a0, sp, 0)
   1173 
   1174     %bltz(a1, &.is_neg)
   1175     %b(&.count)
   1176     :.is_neg
   1177     %li(t0, 45)
   1178     %sb(t0, a0, 0)
   1179     %addi(a0, a0, 1)
   1180 
   1181     :.count
   1182     %mov(t0, a1)
   1183     %li(a2, 1)
   1184     %li(t1, 10)
   1185     :.count_loop
   1186     %div(t0, t0, t1)
   1187     %beqz(t0, &.count_done)
   1188     %addi(a2, a2, 1)
   1189     %b(&.count_loop)
   1190     :.count_done
   1191 
   1192     %add(a3, a0, a2)
   1193 
   1194     :.dig_loop
   1195     %addi(a3, a3, -1)
   1196     %rem(t0, a1, t1)
   1197     %bltz(t0, &.neg_digit)
   1198     %b(&.write_digit)
   1199     :.neg_digit
   1200     %li(t2, 0)
   1201     %sub(t0, t2, t0)
   1202     :.write_digit
   1203     %addi(t0, t0, 48)
   1204     %sb(t0, a3, 0)
   1205     %div(a1, a1, t1)
   1206     %bnez(a1, &.dig_loop)
   1207 
   1208     %ld(t2, sp, 0)
   1209     %add(a0, a0, a2)
   1210     %sub(a0, a0, t2)
   1211     %eret
   1212 .endscope
   1213 
   1214 # fmt_hex(buf=a0, value=a1) -> n_bytes (a0)
   1215 :fmt_hex
   1216 .scope
   1217     %enter(8)
   1218     %st(a0, sp, 0)
   1219 
   1220     %bnez(a1, &.nonzero)
   1221     %li(t0, 48)
   1222     %sb(t0, a0, 0)
   1223     %li(a0, 1)
   1224     %eret
   1225 
   1226     :.nonzero
   1227     %mov(t0, a1)
   1228     %li(a2, 0)
   1229     :.count_loop
   1230     %addi(a2, a2, 1)
   1231     %shri(t0, t0, 4)
   1232     %bnez(t0, &.count_loop)
   1233 
   1234     %add(a3, a0, a2)
   1235 
   1236     :.dig_loop
   1237     %addi(a3, a3, -1)
   1238     %andi(t0, a1, 15)
   1239     %li(t1, 10)
   1240     %bltu(t0, t1, &.is_letter)
   1241     %addi(t0, t0, -10)
   1242     %addi(t0, t0, 97)
   1243     %b(&.write_digit)
   1244     :.is_letter
   1245     %addi(t0, t0, 48)
   1246     :.write_digit
   1247     %sb(t0, a3, 0)
   1248     %shri(a1, a1, 4)
   1249     %bnez(a1, &.dig_loop)
   1250 
   1251     %ld(t2, sp, 0)
   1252     %add(a0, a0, a2)
   1253     %sub(a0, a0, t2)
   1254     %eret
   1255 .endscope
   1256 
   1257 # =========================================================================
   1258 # Character predicates
   1259 # =========================================================================
   1260 
   1261 # is_digit(c=a0) -> 0 or 1
   1262 :is_digit
   1263 .scope
   1264     %addi(t0, a0, -48)
   1265     %li(t1, 10)
   1266     %li(a0, 1)
   1267     %bltu(t0, t1, &.done)
   1268     %li(a0, 0)
   1269     :.done
   1270     %ret
   1271 .endscope
   1272 
   1273 # is_hex_digit(c=a0) -> 0 or 1
   1274 :is_hex_digit
   1275 .scope
   1276     %li(t2, 1)
   1277     %addi(t0, a0, -48)
   1278     %li(t1, 10)
   1279     %bltu(t0, t1, &.done)
   1280     %addi(t0, a0, -97)
   1281     %li(t1, 6)
   1282     %bltu(t0, t1, &.done)
   1283     %addi(t0, a0, -65)
   1284     %bltu(t0, t1, &.done)
   1285     %li(t2, 0)
   1286     :.done
   1287     %mov(a0, t2)
   1288     %ret
   1289 .endscope
   1290 
   1291 # is_space(c=a0) -> 0 or 1
   1292 :is_space
   1293 .scope
   1294     %li(t2, 1)
   1295     %addi(t0, a0, -32)
   1296     %beqz(t0, &.done)
   1297     %addi(t0, a0, -9)
   1298     %li(t1, 5)
   1299     %bltu(t0, t1, &.done)
   1300     %li(t2, 0)
   1301     :.done
   1302     %mov(a0, t2)
   1303     %ret
   1304 .endscope
   1305 
   1306 # is_alpha(c=a0) -> 0 or 1
   1307 :is_alpha
   1308 .scope
   1309     %li(t2, 1)
   1310     %addi(t0, a0, -97)
   1311     %li(t1, 26)
   1312     %bltu(t0, t1, &.done)
   1313     %addi(t0, a0, -65)
   1314     %bltu(t0, t1, &.done)
   1315     %li(t2, 0)
   1316     :.done
   1317     %mov(a0, t2)
   1318     %ret
   1319 .endscope
   1320 
   1321 # is_alnum(c=a0) -> 0 or 1
   1322 :is_alnum
   1323 .scope
   1324     %li(t2, 1)
   1325     %addi(t0, a0, -48)
   1326     %li(t1, 10)
   1327     %bltu(t0, t1, &.done)
   1328     %addi(t0, a0, -97)
   1329     %li(t1, 26)
   1330     %bltu(t0, t1, &.done)
   1331     %addi(t0, a0, -65)
   1332     %bltu(t0, t1, &.done)
   1333     %li(t2, 0)
   1334     :.done
   1335     %mov(a0, t2)
   1336     %ret
   1337 .endscope
   1338 
   1339 # =========================================================================
   1340 # Raw syscall wrappers
   1341 # =========================================================================
   1342 #
   1343 # Each wrapper shifts arguments into the syscall convention
   1344 # (a0 = number, a1..a3/t0/s0/s1 = args 0..5), emits SYSCALL, and returns
   1345 # the raw kernel result. Syscall clobbers only a0, so t0/s0/s1 do not
   1346 # need saving.
   1347 
    1348 # sys_read(fd=a0, buf=a1, len=a2) -> n (a0)
    1349 :sys_read
    1350     %mov(a3, a2)                # syscall arg 2 = len
    1351     %mov(a2, a1)                # syscall arg 1 = buf
    1352     %mov(a1, a0)                # syscall arg 0 = fd; moved last, a0 is overwritten next
    1353     %li(a0, %p1_sys_read)       # a0 = syscall number
    1354     %syscall
    1355     %ret                        # a0 holds the raw kernel result
   1356 
    1357 # sys_write(fd=a0, buf=a1, len=a2) -> n (a0)
    1358 :sys_write
    1359     %mov(a3, a2)                # syscall arg 2 = len
    1360     %mov(a2, a1)                # syscall arg 1 = buf
    1361     %mov(a1, a0)                # syscall arg 0 = fd; moved last, a0 is overwritten next
    1362     %li(a0, %p1_sys_write)      # a0 = syscall number
    1363     %syscall
    1364     %ret                        # a0 holds the raw kernel result (may be a partial count)
   1365 
    1366 # sys_open(path=a0, flags=a1, mode=a2) -> fd (a0)
    1367 # Implemented as openat(AT_FDCWD, path, flags, mode). AT_FDCWD = -100.
    1368 :sys_open
    1369     %mov(t0, a2)                # syscall arg 3 = mode
    1370     %mov(a3, a1)                # syscall arg 2 = flags
    1371     %mov(a2, a0)                # syscall arg 1 = path
    1372     %li(a1, -100)               # syscall arg 0 = AT_FDCWD
    1373     %li(a0, %p1_sys_openat)     # a0 = syscall number
    1374     %syscall
    1375     %ret                        # a0 holds the raw kernel result
   1376 
    1377 # sys_close(fd=a0) -> r (a0)
    1378 :sys_close
    1379     %mov(a1, a0)                # syscall arg 0 = fd
    1380     %li(a0, %p1_sys_close)      # a0 = syscall number
    1381     %syscall
    1382     %ret                        # a0 holds the raw kernel result
   1383 
    1384 # sys_lseek(fd=a0, off=a1, whence=a2) -> off (a0)
    1385 :sys_lseek
    1386     %mov(a3, a2)                # syscall arg 2 = whence
    1387     %mov(a2, a1)                # syscall arg 1 = off
    1388     %mov(a1, a0)                # syscall arg 0 = fd; moved last, a0 is overwritten next
    1389     %li(a0, %p1_sys_lseek)      # a0 = syscall number
    1390     %syscall
    1391     %ret                        # a0 holds the raw kernel result
   1392 
    1393 # sys_brk(addr=a0) -> new_break (a0). addr=0 returns the current break.
    1394 :sys_brk
    1395     %mov(a1, a0)                # syscall arg 0 = addr
    1396     %li(a0, %p1_sys_brk)        # a0 = syscall number
    1397     %syscall
    1398     %ret                        # a0 holds the raw kernel result
   1399 
    1400 # sys_unlink(path=a0) -> 0 / -errno (a0).
    1401 # Implemented as unlinkat(AT_FDCWD, path, 0). AT_FDCWD = -100.
    1402 :sys_unlink
    1403     %li(a3, 0)                  # syscall arg 2 = flags (0)
    1404     %mov(a2, a0)                # syscall arg 1 = path
    1405     %li(a1, -100)               # syscall arg 0 = AT_FDCWD
    1406     %li(a0, %p1_sys_unlinkat)   # a0 = syscall number
    1407     %syscall
    1408     %ret                        # a0 holds the raw kernel result
   1409 
    1410 # sys_exit(code=a0) -> never returns
    1411 :sys_exit
    1412 .scope
    1413     %mov(a1, a0)                # syscall arg 0 = exit code
    1414     %li(a0, %p1_sys_exit)       # a0 = syscall number
    1415     %syscall
    1416     :.spin
    1417     %b(&.spin)                  # no %ret: loops forever if the syscall ever comes back
    1418 .endscope
   1419 
   1420 # =========================================================================
   1421 # Print helpers
   1422 # =========================================================================
   1423 #
   1424 # print(buf, len) and eprint(buf, len) loop on sys_write until all bytes
   1425 # are written or the kernel reports an error. All other print helpers
   1426 # compose on top of those two.
   1427 
   1428 %fn(print, 16, {
   1429     %st(s0, sp, 0)
   1430     %st(s1, sp, 8)
   1431     %mov(s0, a0)
   1432     %mov(s1, a1)
   1433 
   1434     :.loop
   1435     %beqz(s1, &.done_ok)
   1436     %li(a0, 1)
   1437     %mov(a1, s0)
   1438     %mov(a2, s1)
   1439     %call(&sys_write)
   1440     %bltz(a0, &.done)
   1441     %add(s0, s0, a0)
   1442     %sub(s1, s1, a0)
   1443     %b(&.loop)
   1444 
   1445     :.done_ok
   1446     %li(a0, 0)
   1447     :.done
   1448     %ld(s0, sp, 0)
   1449     %ld(s1, sp, 8)
   1450 })
   1451 
   1452 %fn(eprint, 16, {
   1453     %st(s0, sp, 0)
   1454     %st(s1, sp, 8)
   1455     %mov(s0, a0)
   1456     %mov(s1, a1)
   1457 
   1458     :.loop
   1459     %beqz(s1, &.done_ok)
   1460     %li(a0, 2)
   1461     %mov(a1, s0)
   1462     %mov(a2, s1)
   1463     %call(&sys_write)
   1464     %bltz(a0, &.done)
   1465     %add(s0, s0, a0)
   1466     %sub(s1, s1, a0)
   1467     %b(&.loop)
   1468 
   1469     :.done_ok
   1470     %li(a0, 0)
   1471     :.done
   1472     %ld(s0, sp, 0)
   1473     %ld(s1, sp, 8)
   1474 })
   1475 
   1476 %fn(println, 16, {
   1477     %st(s0, sp, 0)
   1478 
   1479     %call(&print)
   1480     %mov(s0, a0)
   1481     %bltz(s0, &.done)
   1482 
   1483     %la(a0, &libp1pp__newline)
   1484     %li(a1, 1)
   1485     %call(&print)
   1486     %mov(s0, a0)
   1487 
   1488     :.done
   1489     %mov(a0, s0)
   1490     %ld(s0, sp, 0)
   1491 })
   1492 
   1493 %fn(eprintln, 16, {
   1494     %st(s0, sp, 0)
   1495 
   1496     %call(&eprint)
   1497     %mov(s0, a0)
   1498     %bltz(s0, &.done)
   1499 
   1500     %la(a0, &libp1pp__newline)
   1501     %li(a1, 1)
   1502     %call(&eprint)
   1503     %mov(s0, a0)
   1504 
   1505     :.done
   1506     %mov(a0, s0)
   1507     %ld(s0, sp, 0)
   1508 })
   1509 
    1510 %fn(print_cstr, 16, {           # print_cstr(cstr=a0): print a NUL-terminated string
    1511     %st(s0, sp, 0)
    1512     %mov(s0, a0)                # s0 = cstr, kept across the strlen call
    1513     %call(&libp1pp__strlen)
    1514     %mov(a1, a0)                # a1 = length
    1515     %mov(a0, s0)                # a0 = cstr
    1516     %call(&print)               # print's result stays in a0 as the return value
    1517     %ld(s0, sp, 0)
    1518 })
   1519 
    1520 %fn(eprint_cstr, 16, {          # eprint_cstr(cstr=a0): eprint a NUL-terminated string
    1521     %st(s0, sp, 0)
    1522     %mov(s0, a0)                # s0 = cstr, kept across the strlen call
    1523     %call(&libp1pp__strlen)
    1524     %mov(a1, a0)                # a1 = length
    1525     %mov(a0, s0)                # a0 = cstr
    1526     %call(&eprint)              # eprint's result stays in a0 as the return value
    1527     %ld(s0, sp, 0)
    1528 })
   1529 
    1530 %fn(print_int, 0, {             # print_int(value=a0): format as decimal, then print
    1531     %mov(a1, a0)                # fmt_dec arg: value
    1532     %la(a0, &libp1pp__num_buf)  # fmt_dec arg: buf = shared scratch buffer
    1533     %call(&fmt_dec)
    1534     %mov(a1, a0)                # a1 = byte count returned by fmt_dec
    1535     %la(a0, &libp1pp__num_buf)
    1536     %call(&print)               # print's result stays in a0 as the return value
    1537 })
   1538 
    1539 %fn(print_hex, 0, {             # print_hex(value=a0): format as hex, then print
    1540     %mov(a1, a0)                # fmt_hex arg: value
    1541     %la(a0, &libp1pp__num_buf)  # fmt_hex arg: buf = shared scratch buffer
    1542     %call(&fmt_hex)
    1543     %mov(a1, a0)                # a1 = byte count returned by fmt_hex
    1544     %la(a0, &libp1pp__num_buf)
    1545     %call(&print)               # print's result stays in a0 as the return value
    1546 })
   1547 
   1548 # =========================================================================
   1549 # File helpers
   1550 # =========================================================================
   1551 
   1552 # read_file(path=a0, buf=a1, cap=a2) -> n or -1
   1553 %fn(read_file, 32, {
   1554     %st(s0, sp, 0)
   1555     %st(s1, sp, 8)
   1556     %st(s2, sp, 16)
   1557     %st(s3, sp, 24)
   1558 
   1559     %mov(s1, a1)
   1560     %mov(s2, a2)
   1561 
   1562     %li(a1, 0)
   1563     %li(a2, 0)
   1564     %call(&sys_open)
   1565     %bltz(a0, &.open_fail)
   1566     %mov(s3, a0)
   1567 
   1568     %mov(a0, s3)
   1569     %mov(a1, s1)
   1570     %mov(a2, s2)
   1571     %call(&sys_read)
   1572     %mov(s0, a0)
   1573 
   1574     %mov(a0, s3)
   1575     %call(&sys_close)
   1576 
   1577     %mov(a0, s0)
   1578     %bltz(a0, &.read_fail)
   1579     %b(&.done)
   1580 
   1581     :.read_fail
   1582     %li(a0, -1)
   1583     %b(&.done)
   1584 
   1585     :.open_fail
   1586     %li(a0, -1)
   1587 
   1588     :.done
   1589     %ld(s0, sp, 0)
   1590     %ld(s1, sp, 8)
   1591     %ld(s2, sp, 16)
   1592     %ld(s3, sp, 24)
   1593 })
   1594 
   1595 # libp1pp__write_all(fd=a0, buf=a1, len=a2) -> 0 or <0 on error
   1596 #
   1597 # Loop on sys_write until all bytes are written. Used by print / eprint
   1598 # / write_file. Retries partial writes but returns the first negative
   1599 # kernel return unchanged.
   1600 %fn(libp1pp__write_all, 24, {
   1601     %st(s0, sp, 0)
   1602     %st(s1, sp, 8)
   1603     %st(s2, sp, 16)
   1604 
   1605     %mov(s0, a0)
   1606     %mov(s1, a1)
   1607     %mov(s2, a2)
   1608 
   1609     :.loop
   1610     %beqz(s2, &.done_ok)
   1611     %mov(a0, s0)
   1612     %mov(a1, s1)
   1613     %mov(a2, s2)
   1614     %call(&sys_write)
   1615     %bltz(a0, &.done)
   1616     %add(s1, s1, a0)
   1617     %sub(s2, s2, a0)
   1618     %b(&.loop)
   1619 
   1620     :.done_ok
   1621     %li(a0, 0)
   1622     :.done
   1623     %ld(s0, sp, 0)
   1624     %ld(s1, sp, 8)
   1625     %ld(s2, sp, 16)
   1626 })
   1627 
   1628 # write_file(path=a0, buf=a1, len=a2) -> 0 or -1
   1629 #
   1630 # Flags: O_WRONLY|O_CREAT|O_TRUNC. On Linux these are 0x1 | 0x40 |
   1631 # 0x200 = 0x241. Mode 0644 octal = 0x1A4.
   1632 %fn(write_file, 24, {
   1633     %st(s0, sp, 0)
   1634     %st(s1, sp, 8)
   1635     %st(s2, sp, 16)
   1636 
   1637     %mov(s0, a1)
   1638     %mov(s1, a2)
   1639 
   1640     %li(a1, 0x241)
   1641     %li(a2, 0x1A4)
   1642     %call(&sys_open)
   1643     %bltz(a0, &.open_fail)
   1644     %mov(s2, a0)
   1645 
   1646     %mov(a0, s2)
   1647     %mov(a1, s0)
   1648     %mov(a2, s1)
   1649     %call(&libp1pp__write_all)
   1650 
   1651     %mov(s0, a0)
   1652     %mov(a0, s2)
   1653     %call(&sys_close)
   1654 
   1655     %mov(a0, s0)
   1656     %bltz(a0, &.fail_ret)
   1657     %li(a0, 0)
   1658     %b(&.done)
   1659 
   1660     :.fail_ret
   1661     %li(a0, -1)
   1662     %b(&.done)
   1663 
   1664     :.open_fail
   1665     %li(a0, -1)
   1666 
   1667     :.done
   1668     %ld(s0, sp, 0)
   1669     %ld(s1, sp, 8)
   1670     %ld(s2, sp, 16)
   1671 })
   1672 
   1673 # =========================================================================
   1674 # BSS arena pointer-init table
   1675 # =========================================================================
   1676 #
   1677 # Pattern: a program reserves a stretch of memory past :ELF_end (or any
   1678 # base) and wants to carve it into N fixed-size arenas, each anchored
   1679 # by a pointer slot in the data section. The table emits one
   1680 # (slot, size) row per arena via %arena_entry; init_arenas walks the
   1681 # table once at startup and writes base + sum of prior sizes into each
   1682 # slot, so arena[k] starts where arena[k-1] ended.
   1683 
    1684 # %arena_entry(slot, size) -- emit one 16-byte table row for init_arenas:
    1685 # 4-byte label ref + 4 zero pad bytes, then the 8-byte size. Pass `slot` as `&foo`.
    1686 %macro arena_entry(slot, size) slot %(0) $(size) %endm
   1687 
   1688 # init_arenas(base=a0, tbl=a1, tbl_end=a2) -> 0
   1689 #
   1690 # Walks (slot, size) pairs from `tbl` to `tbl_end`, threading a running
   1691 # offset starting at 0. For each entry: *slot = base + offset, then
   1692 # offset += size. Leaf.
   1693 :init_arenas
   1694 .scope
   1695     %li(t0, 0)
   1696     :.loop
   1697         %beq(a1, a2, &.done)
   1698         %ld(t1, a1, 0)
   1699         %ld(t2, a1, 8)
   1700         %add(a3, a0, t0)
   1701         %st(a3, t1, 0)
   1702         %add(t0, t0, t2)
   1703         %addi(a1, a1, 16)
   1704         %b(&.loop)
   1705     :.done
   1706     %li(a0, 0)
   1707     %ret
   1708 .endscope
   1709 
   1710 # =========================================================================
   1711 # Bump allocator
   1712 # =========================================================================
   1713 #
   1714 # Single global arena, bytes carved by monotonic cursor with 8-byte
   1715 # alignment. bump_alloc returns 0 when the request would overflow.
   1716 
   1717 # bump_init(base=a0, cap=a1) -> 0
   1718 :bump_init
   1719     %la(t0, &libp1pp__bump_base)
   1720     %st(a0, t0, 0)
   1721     %la(t0, &libp1pp__bump_cursor)
   1722     %st(a0, t0, 0)
   1723     %la(t0, &libp1pp__bump_cap)
   1724     %st(a1, t0, 0)
   1725     %li(a0, 0)
   1726     %ret
   1727 
   1728 # bump_alloc(n=a0) -> ptr (0 on exhaustion)
   1729 #
   1730 # Round n up to a multiple of 8, then admit iff cursor + n_rounded does
   1731 # not exceed base + cap. On success, advance the cursor and return the
   1732 # pre-advance value; on failure, leave the cursor untouched and return 0.
   1733 :bump_alloc
   1734 .scope
   1735     %addi(a0, a0, 7)
   1736     %li(t0, -8)
   1737     %and(a0, a0, t0)
   1738     %la(t0, &libp1pp__bump_cursor)
   1739     %ld(t1, t0, 0)
   1740     %add(t2, t1, a0)
   1741     %la(a1, &libp1pp__bump_base)
   1742     %ld(a2, a1, 0)
   1743     %la(a1, &libp1pp__bump_cap)
   1744     %ld(a3, a1, 0)
   1745     %add(a3, a2, a3)
   1746     %bltu(a3, t2, &.fail)
   1747     %st(t2, t0, 0)
   1748     %mov(a0, t1)
   1749     %ret
   1750     :.fail
   1751     %li(a0, 0)
   1752     %ret
   1753 .endscope
   1754 
    1755 # bump_mark() -> saved
    1756 :bump_mark
    1757     %la(t0, &libp1pp__bump_cursor)
    1758     %ld(a0, t0, 0)              # saved = current cursor; pass it to bump_release later
    1759     %ret
   1760 
    1761 # bump_release(saved=a0) -> 0
    1762 :bump_release
    1763     %la(t0, &libp1pp__bump_cursor)
    1764     %st(a0, t0, 0)              # cursor = saved; the value is stored without validation
    1765     %li(a0, 0)
    1766     %ret
   1767 
    1768 # bump_reset() -> 0
    1769 :bump_reset
    1770     %la(t0, &libp1pp__bump_base)
    1771     %ld(t1, t0, 0)              # t1 = base
    1772     %la(t0, &libp1pp__bump_cursor)
    1773     %st(t1, t0, 0)              # cursor = base: the whole arena is free again
    1774     %li(a0, 0)
    1775     %ret
   1776 
   1777 # =========================================================================
   1778 # Panic
   1779 # =========================================================================
   1780 
    1781 # panic(msg_cstr=a0) -> never returns
    1782 %fn(panic, 0, {
    1783     %call(&eprint_cstr)         # message goes out through eprint
    1784     %la(a0, &libp1pp__newline)
    1785     %li(a1, 1)
    1786     %call(&eprint)              # then one newline byte
    1787     %li(a0, 1)
    1788     %call(&sys_exit)            # exit code 1
    1789     :.spin
    1790     %b(&.spin)                  # loops forever if sys_exit ever comes back
    1791 })
   1792 
   1793 # =========================================================================
   1794 # Tracepoint
   1795 # =========================================================================
   1796 #
   1797 # %trace(tag_addr, tag_len) — emit a runtime stderr probe at the call
    1798 # site. Prints `[trace @HEX TAG]\n` to stderr, where HEX is the runtime
    1799 # address of this trace site (the address of `:@here` in this site's
    1800 # expansion) as bare fmt_hex digits with no `0x` prefix, and TAG is the
    1801 # byte string at [tag_addr..tag_addr+tag_len).
   1802 #
   1803 # `tag_addr` is a label reference token (e.g. `&cc__str_3`) — the
   1804 # caller is responsible for emitting the bytes at that label. cc.scm's
   1805 # --cc-trace-emit interns the mangled function name through the
   1806 # regular string pool, which already pads each entry to an 8-byte
   1807 # multiple, so the next item past the tag stays aligned. `tag_len` is
   1808 # the *logical* byte count to print (without trailing NUL or pad).
   1809 #
   1810 # To map a printed address back to source, disassemble the ELF
   1811 # (`scripts/disasm-elf.sh`) and locate the printed address. cc.scm
   1812 # guarantees that each function's first instruction *is* a trace call,
   1813 # so the printed address falls on a known function-entry boundary.
   1814 #
    1815 # Preserves all exposed P1 registers (a0..a3, t0..t2, s0..s3) by
    1816 # borrowing 112 bytes below the current stack pointer: a 16-byte backend
    1817 # frame prefix plus 88 bytes (11 registers x 8) of saves = 104, padded to 112.
    1818 # Use only inside an active %fn body, after %enter and before %eret.
    1819 %macro trace(tag_addr, tag_len)
    1820     :@here
    1821     %addi(sp, sp, -112)         # open the 112-byte scratch area
    1822     %st(a0, sp, 0)              # save all 11 registers, 8 bytes apart
    1823     %st(a1, sp, 8)
    1824     %st(a2, sp, 16)
    1825     %st(a3, sp, 24)
    1826     %st(t0, sp, 32)
    1827     %st(t1, sp, 40)
    1828     %st(t2, sp, 48)
    1829     %st(s0, sp, 56)
    1830     %st(s1, sp, 64)
    1831     %st(s2, sp, 72)
    1832     %st(s3, sp, 80)
    1833     %la(a0, &@here)             # arg 0 = address of this trace site
    1834     %la(a1, tag_addr)           # arg 1 = tag bytes
    1835     %li(a2, tag_len)            # arg 2 = number of tag bytes to print
    1836     %call(&libp1pp__trace)
    1837     %ld(a0, sp, 0)              # restore in the same order as the saves
    1838     %ld(a1, sp, 8)
    1839     %ld(a2, sp, 16)
    1840     %ld(a3, sp, 24)
    1841     %ld(t0, sp, 32)
    1842     %ld(t1, sp, 40)
    1843     %ld(t2, sp, 48)
    1844     %ld(s0, sp, 56)
    1845     %ld(s1, sp, 64)
    1846     %ld(s2, sp, 72)
    1847     %ld(s3, sp, 80)
    1848     %addi(sp, sp, 112)          # close the scratch area
    1849 %endm
   1850 
   1851 # libp1pp__trace(addr=a0, tag_addr=a1, tag_len=a2) — print
   1852 # "[trace @0xHEX TAG]\n" to stderr.
   1853 %fn(libp1pp__trace, 32, {
   1854     %st(s0, sp, 0)
   1855     %st(s1, sp, 8)
   1856     %st(s2, sp, 16)
   1857     %st(s3, sp, 24)
   1858     %mov(s0, a0)
   1859     %mov(s1, a1)
   1860     %mov(s2, a2)
   1861 
   1862     %la(a0, &libp1pp__trace_pre)
   1863     %li(a1, 8)
   1864     %call(&eprint)
   1865 
   1866     %la(a0, &libp1pp__num_buf)
   1867     %mov(a1, s0)
   1868     %call(&fmt_hex)
   1869     %mov(s3, a0)
   1870     %la(a0, &libp1pp__num_buf)
   1871     %mov(a1, s3)
   1872     %call(&eprint)
   1873 
   1874     %la(a0, &libp1pp__trace_sep)
   1875     %li(a1, 1)
   1876     %call(&eprint)
   1877 
   1878     %mov(a0, s1)
   1879     %mov(a1, s2)
   1880     %call(&eprint)
   1881 
   1882     %la(a0, &libp1pp__trace_post)
   1883     %li(a1, 2)
   1884     %call(&eprint)
   1885 
   1886     %ld(s0, sp, 0)
   1887     %ld(s1, sp, 8)
   1888     %ld(s2, sp, 16)
   1889     %ld(s3, sp, 24)
   1890 })
   1891 
   1892 # Tracepoint message fragments. eprint reads only the leading
   1893 # visible-byte count (8, 1, 2); .align 8 keeps each fragment and the
   1894 # data labels that follow 8-byte aligned (aarch64 LDR / 4-byte
   1895 # inline-data loads fault otherwise).
    1896 :libp1pp__trace_pre  "[trace @"    # libp1pp__trace prints the first 8 bytes
    1897 .align 8
    1898 :libp1pp__trace_sep  " "           # libp1pp__trace prints the first 1 byte
    1899 .align 8
    1900 :libp1pp__trace_post "]\n"         # libp1pp__trace prints the first 2 bytes
    1901 .align 8
   1902 
   1903 # =========================================================================
   1904 # Internal data
   1905 # =========================================================================
   1906 
   1907 # Single newline byte for println / eprintln / panic. Emitted as an
   1908 # 8-byte word (0x0A in the low byte, zeros above) so the following
   1909 # buffers and the user source that comes after libp1pp stay 8-byte
   1910 # aligned. sys_write reads only the one byte callers request.
    1911 :libp1pp__newline $(10)         # 10 = newline; callers pass a length of 1
   1912 
   1913 # Scratch buffer used by print_int / print_hex. fmt_dec writes at most
   1914 # 20 bytes, fmt_hex at most 16, so 32 bytes with word alignment is
   1915 # comfortably above both.
    1916 :libp1pp__num_buf $(0) $(0) $(0) $(0)    # four zero words = 32 bytes
   1917 
   1918 # Bump-allocator state. Zero-initialized so bump_alloc returns 0 until
   1919 # bump_init installs an arena.
    1920 :libp1pp__bump_base $(0)        # start of the arena; set by bump_init
    1921 :libp1pp__bump_cursor $(0)      # next free byte; moved by bump_alloc / bump_release / bump_reset
    1922 :libp1pp__bump_cap $(0)         # arena size in bytes; limit = base + cap