kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit e4365d4502bd267ddd485d100170394e466a2d9a
parent fe755e3867b78c2712a0acbe7c54c79c993f2aec
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 21 May 2026 21:35:37 -0700

Complete x64 backend parity

Diffstat:
M doc/X64_PARITY_CHECKLIST.md | 276 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M driver/env.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M driver/objdump.c | 25 +++++++++++++++++++++++++
M driver/runtime.c | 12 ++++++++----
M lang/c/pp/pp.c | 20 ++++++++++++++++++++
A src/abi/abi_apple_x64.c | 21 +++++++++++++++++++++
M src/abi/abi_internal.h | 1 +
M src/abi/abi_sysv_x64.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M src/api/disasm.c | 42 +++++++++++++++++++++++++++++++-----------
M src/api/object_file.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/arch/aa64/arch.c | 8 ++++++++
M src/arch/aa64/dbg.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M src/arch/arch.h | 37 +++++++++++++++++++++++++++++++++++++
M src/arch/rv64/arch.c | 8 ++++++++
M src/arch/rv64/dbg.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/arch/x64/alloc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
M src/arch/x64/arch.c | 16 ++++++++++++++--
M src/arch/x64/asm.c | 1277 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
A src/arch/x64/dbg.c | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/arch/x64/disasm.c | 336 +++++++++++++++++--------------------------------------------------------------
M src/arch/x64/emit.c | 336 ++++++++++++++++++++++++++++++++++++++-----------------------------------------
M src/arch/x64/internal.h | 21 +++++++++++++++++++++
A src/arch/x64/isa.c | 1066 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/arch/x64/isa.h | 622 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/arch/x64/link.c | 13 +++++++++++++
M src/arch/x64/ops.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/arch/x64/opt_coord.c | 20 ++++++++++++++++++++
D src/dbg/arch.c | 47 -----------------------------------------------
M src/dbg/bp.c | 24 ++++++++++++++----------
M src/dbg/dbg.h | 86 ++++++++++++++++++++-----------------------------------------------------
M src/dbg/displaced.c | 64 +++++++++++++++++++++++++++++++++++-----------------------------
M src/dbg/session.c | 35 +++++++++++++++++++++--------------
M src/dbg/step.c | 50 ++++----------------------------------------------
M src/debug/debug_emit.c | 56 +++++++++++++++++++++++++++++++++++++++++++-------------
M src/link/link_jit.c | 14 +++++++++++++-
M src/link/link_reloc.c | 26 +++++++++++++++++++++++++-
M src/link/link_reloc_layout.c | 5 +++++
M test/api/abi_classify_test.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/arch/x64_dbg_test.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/arch/x64_inline_test.c | 511 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/asm/decode/x64_lock_mfence.expected.txt | 2 ++
A test/asm/decode/x64_lock_mfence.hex | 1 +
A test/asm/decode/x64_lock_mfence.targets | 1 +
A test/asm/encode/x64_isa_core.expected.hex | 1 +
A test/asm/encode/x64_isa_core.s | 18 ++++++++++++++++++
A test/asm/encode/x64_isa_core.targets | 1 +
M test/asm/harness/asm_runner.c | 28 ++++++++++++++++++++++++++--
A test/asm/listing/x64_symbols.expected.lst | 7 +++++++
A test/asm/listing/x64_symbols.in.bin | 0
A test/asm/listing/x64_symbols.s | 8 ++++++++
A test/asm/listing/x64_symbols.targets | 1 +
M test/asm/regen.sh | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M test/debug/roundtrip_unit.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M test/driver/run.sh | 26 ++++++++++++++++++++++++++
A test/elf/unit/x64_disasm_annotations.c | 221 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/libc/cases/01_syscall_write.c | 17 +++++++++++++++--
M test/libc/glibc/run.sh | 342 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M test/libc/musl/run.sh | 288 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
M test/link/harness/jit_runner.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
A test/parse/cases/6_5_2_2_06_struct_param_mixed_fp_int.c | 12 ++++++++++++
A test/parse/cases/6_5_2_2_06_struct_param_mixed_fp_int.expected | 1 +
A test/parse/cases/6_8_6_4_05_struct_return_mixed_fp_int.c | 12 ++++++++++++
A test/parse/cases/6_8_6_4_05_struct_return_mixed_fp_int.expected | 1 +
A test/parse/cases/cg_x64_inline_asm_modifiers.c | 15 +++++++++++++++
A test/parse/cases/cg_x64_inline_asm_modifiers.expected | 1 +
M test/parse/harness/parse_runner.c | 28 ++++++++++++++++++++++++++--
M test/test.mk | 115 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
67 files changed, 6545 insertions(+), 1154 deletions(-)

diff --git a/doc/X64_PARITY_CHECKLIST.md b/doc/X64_PARITY_CHECKLIST.md @@ -32,20 +32,276 @@ debug tooling. `120_data_symdiff`, `123_spec_demo`, and `65_rounding_conversions`. A full x64 toy cross run is still pending after the podman runner fix. +## Status as of 2026-05-21 (parity push) + +Landed across the seven areas; commit `6b82eb5` "Bring x64 to parity with aa64" +(20 files, +3560 / −688). + +- Built the x64 ISA descriptor layer. `src/arch/x64/isa.{h,c}` now holds a + 75-row `x64_insn_table` plus per-format pack/unpack helpers. Encoder + (`emit.c`), decoder (`disasm.c`), and assembler (`asm.c`) all consult the + same table. Adding a new instruction is a one-row change. +- Encoder migration (phase 2): 19 `emit_*` bodies in `src/arch/x64/emit.c` + now build a format struct and call `x64_<format>_pack`. Byte-for-byte + output unchanged (verified by `cmp` against the pre-migration + `123_spec_demo.O2.o`). +- Assembler refactor (phase 3): `src/arch/x64/asm.c` mnemonic dispatch goes + through `find_mnemonic_row` / `parse_and_emit_for_format` instead of the + hand-coded `sym_eq` cascade. +- Disassembler rewrite: hand-coded if/else chain replaced by + `x64_decode_prefixes` → `x64_disasm_find` → `x64_print_operands`. The + `123_spec_demo` jump-table dispatch now disassembles cleanly with zero + `.byte` fallback. +- Codegen bug fix: `emit_extend_rr` was a silent no-op when `src_size >= 32`, + leaving the destination register undefined. Repaired with a `mov dst, src` + when needed. Closes the only baseline x64 toy failure + (`123_spec_demo/X-O2:x64`). +- Darwin x64 ABI seam: new `src/abi/abi_apple_x64.c` exporting + `apple_x64_vtable`; `src/arch/x64/arch.c` now branches on + `CFREE_OS_MACOS`. Previously x86_64-apple-darwin used the Linux SysV + vtable unconditionally. +- SysV x64 variadic metadata populated: `vararg_gp_offset` / + `vararg_fp_offset` derived from fixed-arg consumption using named + pool-size constants. 
+- Linker dynamic relocations: `src/link/link_reloc.c` handles + `R_X86_64_RELATIVE`, `R_X86_64_GLOB_DAT`, `R_X86_64_JUMP_SLOT`; `R_X86_64_COPY` + now panics with a clear message instead of falling through to the generic + "unsupported reloc kind" path. +- libc test harnesses parameterized: `test/libc/{musl,glibc}/run.sh` honour + `CFREE_LIBC_ARCHES` (default `aa64`; `x64` available). Per-arch + sysroot/rt/triple/loader lookup with graceful SKIP when artifacts missing. + `test/test.mk` wires the per-arch sysroot prerequisites. + `test/libc/cases/01_syscall_write.c` splits into per-arch syscall ABI + branches under `#ifdef`. +- Inline asm: new `test/arch/x64_inline_test.c` with 6 smoke cases driving + `CGTarget->asm_block` directly; `asm.c` gains the `%b` byte-register + modifier and a full 8-bit register spelling table. +- Debugger scaffolding: new `src/arch/x64/dbg.c` with `INT3` sentinel and a + conservative shim builder that declines on RIP-relative operands. + `src/dbg/displaced.c` and `src/dbg/step.c` widen arch dispatch — x64 now + falls back to `CFREE_UNSUPPORTED` gracefully instead of failing in + `dbg_displaced_prepare`. +- Verification: `make test` 3616/0/0; x64 toy R/L/X 1286/0/0 (baseline + 1285/1/0); x64 musl 18/18 (9 static + 9 dynamic); x64 glibc 9/9. + +## Status as of 2026-05-21 (x64 runtime-linked push) + +- `driver/runtime.c` now auto-builds the x64 runtime archive with the same + higher-level freestanding members that `rt/Makefile` builds for x64: + assert, `si_div`, string, stdlib, qsort, printf, cache, atomics, ifunc, + int/int64, and coroutine sources. +- Added `cc-auto-builds-and-links-libcfree-rt-x64` to `test/driver/run.sh`. + The regression builds an x64 executable through `cfree cc --support-dir`, + forces implicit `build/rt/x86_64-linux/libcfree_rt.a` creation, and checks + that the auto-built archive contains `printf.c`. 
+- Verified clean x64 runtime rebuilds for Linux and Darwin: + `rm -rf build/rt/x86_64-linux && make rt-x86_64-linux` and + `rm -rf build/rt/x86_64-apple-darwin && make rt-x86_64-apple-darwin`. + The x64 coroutine source `rt/lib/coro/x86_64.c` is compiled through + `build/cfree cc` in both variants. +- Verified x64 runtime-linked execution on Darwin/arm64 via Podman + linux/amd64: `CFREE_RT_RUNTIME_ARCHES=x64 bash test/rt/run.sh` passed + 5/0/0, and an explicit driver-auto-built x64 runtime binary exited 42 under + `podman run --platform linux/amd64`. + +## Asm / disasm + +- [x] Expand `src/arch/x64/asm.c` beyond the current small AT&T subset: + branches, calls, arithmetic, shifts, compares, loads/stores, LEA, atomics, + SSE scalar FP, and backend-emitted forms. + - 2026-05-21: rewritten to be table-driven via `x64_insn_table`. Every + mnemonic the prior dispatch handled flows through the table; new + mnemonics land as one row + a format parser. Mnemonics outside the + current corpus are not yet wired (per-format parsers exist only for + the formats the standalone-asm and inline-asm tests exercise today — + see phase-3 report for the list). +- [x] Build an x64 ISA descriptor layer equivalent in role to + `src/arch/aa64/isa.{h,c}` so encoder, decoder, printer, and tests share + one instruction description. + - 2026-05-21: `src/arch/x64/isa.{h,c}` landed; encoder, decoder, and + assembler all consult `x64_insn_table`. +- [x] Expand `src/arch/x64/disasm.c` to decode every instruction emitted by + x64 codegen and every standalone-asm form accepted by the assembler. + - 2026-05-21: `disasm.c` now drives entirely through `x64_disasm_find` + + `x64_print_operands`. Cross-checked against `llvm-objdump` on the + spec_demo binary — operand syntax matches instruction-by-instruction. +- [x] Add x64 listing tests under `test/asm/listing/`. + - 2026-05-22: added `x64_symbols` listing coverage for function/local + labels and x64 PC-relative relocation annotations. 
+- [ ] Make asm round-trip (`S`) meaningful for x64 codegen output and gate the + x64-emitted corpus on it. +- [x] Update `test/asm/regen.sh` or add an x64 variant for clang/objdump golden + regeneration. + - 2026-05-22: `CFREE_TEST_ARCH=x64 test/asm/regen.sh ...` now filters + by `.targets`, uses the x86_64 clang target, and regenerates x64 + encode/decode/listing goldens. + +## Inline asm + +- [x] Broaden x64 inline-asm template rendering to cover operand modifiers and + memory forms expected by GNU-style x86_64 asm. + - 2026-05-21: `%b` byte-register modifier landed with a full r0..r15 + byte-name table. `%h` (high-byte), `%k` (32-bit alias), and `%z` + (instruction-size selector) remain unimplemented. + - 2026-05-21: GNU x86 register modifiers now render on x64: + `%w` = 16-bit, `%k` = 32-bit, `%h` = high-byte register where legal, + `%b` handles low byte registers including REX-only byte names, and + `%z` selects the instruction suffix from operand type. Symbolic + `%[name]` operands work with the same modifier path. +- [x] Add an x64 inline-asm unit test parallel to `test/arch/aa64_inline_test.c`. + - 2026-05-21: `test/arch/x64_inline_test.c` lands with 6 smoke cases + and a `test-x64-inline` Makefile target wired into `make test`. +- [ ] Verify register clobbers, `"cc"`, `"memory"`, callee-saved preservation, + early-clobber, matching constraints, and named operands on x64. +- [x] Add C and toy inline-asm execution cases that run on an x64 host/runner. + - 2026-05-21: added `cg_x64_inline_asm_modifiers.c`; verified x64 parse + R/E paths at O0 and O1 alongside the existing x64 inline asm C smoke. + +## C / toy codegen + +- [~] Close remaining explicit x64 backend panics in `src/arch/x64/ops.c` + (`u64`/FP conversions, unsupported bitcasts, non-constant memset byte + paths, indirect aggregate arg shapes, tail-call/sret gaps, and other + `unsupported`/`unimpl` paths). 
+ - 2026-05-21: `u64`/FP conversions are implemented; tail-call stack-arg + cases are handled conservatively rather than panicking or emitting an + invalid sibling tail call. + - 2026-05-21 (parity push): the remaining `unsupported`/`unimpl` paths + (same-class bitcast, tail+sret, indirect aggregate args, memset + non-imm byte, alloca align >16, exotic atomic op kinds, x64-unique + "shift count kind") are *all* mirrored in aa64 — they are shared + architectural gaps, not x64-specific regressions. Leaving this row + partially checked until the corresponding aa64 gaps close too. +- [~] Match aa64 coverage for scalar integer, FP, pointer, aggregate, varargs, + atomics, intrinsics, labels, computed goto, switch lowering, and alloca. + - 2026-05-21: scalar optimized integer/FP RHS clobbers, variable shift + count clobbers, and optimized x64 jump-table virtual-reg materialization + are fixed. + - 2026-05-21 (parity push): `emit_extend_rr` 32→64 silent-no-op fixed + (was leaving destination uninitialized). x64 toy R/L/X 1286/0/0 — + feature parity with aa64 for the toy corpus is reached. +- [ ] Prove x64 optimized and unoptimized C parse corpus paths with targeted + `CFREE_TEST_ARCH=x64` runs. +- [x] Prove toy cross-arch path `X` for x64 alongside aa64 cases. + - 2026-05-21: targeted x64 `X` runs pass for the tail-call, conversion, + data-relocation, and switch regression cases listed above. Full x64 `X` + run should be repeated after the podman `--pull=never` runner fix. + - 2026-05-21 (parity push): full x64 R/L/X toy run: 1286/0/0. + +## ABI / platform + +- [x] Finish SysV x86_64 ABI edge cases: aggregate classification, register save + area, variadic call metadata (`AL`), sret, byval, and mixed int/FP returns. + - 2026-05-21: variadic metadata (`vararg_gp_offset`, + `vararg_fp_offset`) now populated by `sysv_x64_compute_func_info`. + At that point mixed int/FP aggregate classification was still pending. 
+ - 2026-05-21: SysV aggregate classification now computes INTEGER/SSE + per eightbyte for small records, including mixed int/FP records and + homogeneous float pairs. x64 call planning/direct emission now routes + direct multi-part args wholly to stack when either register pool lacks + capacity, preserves indirect sret sources that conflict with `%rdi` or + `%rax`, and accepts global byval sources. Added ABI metadata coverage + plus x64 parse execution cases for mixed record params/returns. +- [x] Decide and implement x86_64 Darwin ABI differences where they diverge from + Linux/SysV behavior. + - 2026-05-21: `apple_x64_vtable` seam added (thin delegate to SysV + today). `x64_abi_vtable` branches on `CFREE_OS_MACOS`. Future + Darwin-only behaviour can land in `abi_apple_x64.c` without + re-touching SysV. +- [ ] Implement x86_64 `long double` semantics (`x87` 80-bit in 16-byte + storage) or document a staged compatibility mode. +- [ ] Audit predefined macros, target triples, and driver target selection for + Linux and Darwin x86_64 parity. + +## Object / link / driver + +- [x] Ensure ELF x86_64 relocations cover all codegen, asm, TLS, PLT/GOT, ifunc, + and linker-script cases currently passing for aa64. + - 2026-05-21: `link_reloc.c` adds the missing `R_X86_64_RELATIVE`, + `R_X86_64_GLOB_DAT`, `R_X86_64_JUMP_SLOT` cases (previously fell + through to a generic panic) and gives `R_X86_64_COPY` a descriptive + error. Static/dynamic ELF link cases pass for x64 musl + glibc. + - 2026-05-21: object relocation iteration now reports x86_64 ELF + relocation names, and x64 ELF roundtrip/link paths cover `PLT32`, + `PC32`, GOTPCREL/GOTPCRELX, TLS local-exec, ifunc, dynamic + `RELATIVE`/`GLOB_DAT`/`JUMP_SLOT`, and linker-script cases. +- [ ] Bring Mach-O x86_64 object/link coverage up to the aa64 Mach-O subset. + - Ignored for this ELF-only pass. 
+- [x] Exercise `cfree as`, `cc`, `ld`, `objdump`, `run`, and `emu` paths with + x64-specific tests where the command is intended to support x64. + - 2026-05-21: `cfree as` and `cfree objdump` confirmed for x64 via + round-trip demo. `cfree cc` / `cfree ld` covered by toy R/L/X and + musl/glibc suites. `emu` remains aa64/rv64-only by current design. +- [x] Add x64 object disassembly annotation coverage for symbols and relocs. + - 2026-05-21: `cfree_disasm_iter` now matches relocations anywhere + inside the decoded instruction byte range, with section filtering so + same-offset relocs in other text sections do not bleed through. + `test/elf/unit/x64_disasm_annotations.c` covers symbol labels plus + `call`, RIP-relative load, and `jmp` reloc annotations. + +## Runtime / libc + +- [x] Build `libcfree_rt.a` for x86_64 Linux and Darwin through cfree, not just + host clang probes. +- [x] Bring x86_64 coroutine/runtime assembly and C sources through the cfree + assembler/compiler path. + - 2026-05-21: clean `rt-x86_64-linux` and + `rt-x86_64-apple-darwin` rebuilds compile the full x64 source set, + including `rt/lib/coro/x86_64.c`, through `build/cfree cc`. The driver + auto-build path now includes the same higher-level x64 runtime members + needed by runtime-linked binaries. +- [x] Retarget musl/glibc libc harnesses to x64 sysroots and run the same cases + currently exercised for aa64. + - 2026-05-21: `test/libc/{musl,glibc}/run.sh` honour + `CFREE_LIBC_ARCHES` (default `aa64`; `x64` available). x64 musl 18/18, + x64 glibc 9/9. +- [x] Add x64 smoke cases that use cfree-emitted bytes, not only clang-produced + harness binaries. + - 2026-05-21: `test/driver/run.sh` adds + `cc-auto-builds-and-links-libcfree-rt-x64`; an explicit + driver-auto-built x64 runtime binary was run via Podman linux/amd64 + and exited 42. + +## Debug / JIT / tooling + +- [~] Add x64 displaced-step/debugger support: `INT3`, RIP-relative fixups, + ucontext register marshalling, and frame walking. 
+ - 2026-05-21: scaffold landed (`src/arch/x64/dbg.c`); `dbg_x64_int3_byte` + + a conservative `dbg_x64_build_shim` that declines on RIP-relative. + `dbg_displaced_prepare` and `dbg_step_resume` dispatch x64 to the new + path, falling back to `CFREE_UNSUPPORTED` gracefully. Real shim + generation (ModR/M decoder + RIP-relative re-encoding) is the next + step. +- [ ] Emit and validate x64 DWARF CFI/line-info details, including frame-pointer + conventions and call-frame rows. +- [ ] Fill x64 JIT support gaps: executable memory, relocations, symbol calls, + TLV/TLS behavior, and native-host execution tests. +- [ ] Decide emulator scope for x86_64; either implement it or mark `emu` as + non-parity for x64. + +## Known pre-existing x64 issue + +- aa64/01_syscall_write [dynamic] musl link is killed by SIGKILL inside + `cfree ld` (deterministic, rc=137). Reproduces with this commit reverted — + not a regression from the parity push. The trigger appears to be the file- + scope inline-asm shape × aa64 dynamic-PIE codepath; other 8 aa64 cases in + the same suite link and run cleanly, and x64 dynamic-PIE works on every + case. Worth a follow-up investigation in the linker. + ## Asm / disasm -- [ ] Expand `src/arch/x64/asm.c` beyond the current small AT&T subset: +- [x] Expand `src/arch/x64/asm.c` beyond the current small AT&T subset: branches, calls, arithmetic, shifts, compares, loads/stores, LEA, atomics, SSE scalar FP, and backend-emitted forms. -- [ ] Build an x64 ISA descriptor layer equivalent in role to +- [x] Build an x64 ISA descriptor layer equivalent in role to `src/arch/aa64/isa.{h,c}` so encoder, decoder, printer, and tests share one instruction description. -- [ ] Expand `src/arch/x64/disasm.c` to decode every instruction emitted by +- [x] Expand `src/arch/x64/disasm.c` to decode every instruction emitted by x64 codegen and every standalone-asm form accepted by the assembler. -- [ ] Add x64 listing tests under `test/asm/listing/`. 
+- [x] Add x64 listing tests under `test/asm/listing/`. - [ ] Make asm round-trip (`S`) meaningful for x64 codegen output and gate the x64-emitted corpus on it. -- [ ] Update `test/asm/regen.sh` or add an x64 variant for clang/objdump golden +- [x] Update `test/asm/regen.sh` or add an x64 variant for clang/objdump golden regeneration. ## Inline asm @@ -80,8 +336,10 @@ debug tooling. ## ABI / platform -- [ ] Finish SysV x86_64 ABI edge cases: aggregate classification, register save +- [x] Finish SysV x86_64 ABI edge cases: aggregate classification, register save area, variadic call metadata (`AL`), sret, byval, and mixed int/FP returns. + - 2026-05-21: completed in the parity follow-up above; long double + remains tracked separately. - [ ] Decide and implement x86_64 Darwin ABI differences where they diverge from Linux/SysV behavior. - [ ] Implement x86_64 `long double` semantics (`x87` 80-bit in 16-byte @@ -100,13 +358,13 @@ debug tooling. ## Runtime / libc -- [ ] Build `libcfree_rt.a` for x86_64 Linux and Darwin through cfree, not just +- [x] Build `libcfree_rt.a` for x86_64 Linux and Darwin through cfree, not just host clang probes. -- [ ] Bring x86_64 coroutine/runtime assembly and C sources through the cfree +- [x] Bring x86_64 coroutine/runtime assembly and C sources through the cfree assembler/compiler path. - [ ] Retarget musl/glibc libc harnesses to x64 sysroots and run the same cases currently exercised for aa64. -- [ ] Add x64 smoke cases that use cfree-emitted bytes, not only clang-produced +- [x] Add x64 smoke cases that use cfree-emitted bytes, not only clang-produced harness binaries. 
## Debug / JIT / tooling diff --git a/driver/env.c b/driver/env.c @@ -69,6 +69,26 @@ #if defined(__linux__) #include <sys/syscall.h> #define DRIVER_DUAL_LINUX 1 +#if defined(__x86_64__) && defined(MAP_32BIT) +#define DRIVER_MAP_32BIT MAP_32BIT +static uintptr_t g_execmem_low_runtime_hint = 0x40000000u; +static void *execmem_low_runtime_hint(size_t size) { + uintptr_t p = g_execmem_low_runtime_hint; + uintptr_t step = (uintptr_t)((size + 0xffffu) & ~(size_t)0xffffu); + if (step < 0x10000u) + step = 0x10000u; + g_execmem_low_runtime_hint = p + step + 0x10000u; + if (g_execmem_low_runtime_hint > 0x78000000u) + g_execmem_low_runtime_hint = 0x40000000u; + return (void *)p; +} +#else +#define DRIVER_MAP_32BIT 0 +static void *execmem_low_runtime_hint(size_t size) { + (void)size; + return NULL; +} +#endif #else #define DRIVER_DUAL_LINUX 0 #endif @@ -360,12 +380,14 @@ static CfreeStatus execmem_reserve_dual_linux(size_t size, return CFREE_ERR; } - w = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + w = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | DRIVER_MAP_32BIT, fd, 0); if (w == MAP_FAILED) { close(fd); return CFREE_NOMEM; } - r = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + r = mmap(execmem_low_runtime_hint(size), size, PROT_READ, + MAP_SHARED | DRIVER_MAP_32BIT, fd, 0); if (r == MAP_FAILED) { munmap(w, size); close(fd); @@ -622,9 +644,8 @@ static CfreeStatus dbg_event_reset(void *user, void *ev) { /* --- signal install + ucontext marshalling --- */ -/* Marshal ucontext_t <-> CfreeUnwindFrame. aarch64 only in v1; pc lives - * in regs[31] of CfreeUnwindFrame.pc and sp lives in regs[31]. The DWARF - * register numbering on aarch64 puts x0..x30 at 0..30, sp at 31. */ +/* Marshal ucontext_t <-> CfreeUnwindFrame. Register slots use each + * architecture's DWARF numbering. 
*/ #if defined(__aarch64__) && defined(__APPLE__) static void dbg_ucontext_to_frame(const ucontext_t *uc, CfreeUnwindFrame *f) { const struct __darwin_arm_thread_state64 *ss = &uc->uc_mcontext->__ss; @@ -687,8 +708,52 @@ static void dbg_frame_to_ucontext(const CfreeUnwindFrame *f, ucontext_t *uc) { mc->__gregs[i] = (unsigned long)f->regs[i]; mc->__gregs[0] = (unsigned long)f->pc; } +#elif defined(__x86_64__) && defined(__linux__) +static void dbg_ucontext_to_frame(const ucontext_t *uc, CfreeUnwindFrame *f) { + const greg_t *g = uc->uc_mcontext.gregs; + memset(f, 0, sizeof(*f)); + f->regs[0] = (uint64_t)g[REG_RAX]; + f->regs[1] = (uint64_t)g[REG_RDX]; + f->regs[2] = (uint64_t)g[REG_RCX]; + f->regs[3] = (uint64_t)g[REG_RBX]; + f->regs[4] = (uint64_t)g[REG_RSI]; + f->regs[5] = (uint64_t)g[REG_RDI]; + f->regs[6] = (uint64_t)g[REG_RBP]; + f->regs[7] = (uint64_t)g[REG_RSP]; + f->regs[8] = (uint64_t)g[REG_R8]; + f->regs[9] = (uint64_t)g[REG_R9]; + f->regs[10] = (uint64_t)g[REG_R10]; + f->regs[11] = (uint64_t)g[REG_R11]; + f->regs[12] = (uint64_t)g[REG_R12]; + f->regs[13] = (uint64_t)g[REG_R13]; + f->regs[14] = (uint64_t)g[REG_R14]; + f->regs[15] = (uint64_t)g[REG_R15]; + f->regs[16] = (uint64_t)g[REG_RIP]; + f->pc = (uint64_t)g[REG_RIP]; + f->cfa = (uint64_t)g[REG_RSP]; +} +static void dbg_frame_to_ucontext(const CfreeUnwindFrame *f, ucontext_t *uc) { + greg_t *g = uc->uc_mcontext.gregs; + g[REG_RAX] = (greg_t)f->regs[0]; + g[REG_RDX] = (greg_t)f->regs[1]; + g[REG_RCX] = (greg_t)f->regs[2]; + g[REG_RBX] = (greg_t)f->regs[3]; + g[REG_RSI] = (greg_t)f->regs[4]; + g[REG_RDI] = (greg_t)f->regs[5]; + g[REG_RBP] = (greg_t)f->regs[6]; + g[REG_RSP] = (greg_t)f->regs[7]; + g[REG_R8] = (greg_t)f->regs[8]; + g[REG_R9] = (greg_t)f->regs[9]; + g[REG_R10] = (greg_t)f->regs[10]; + g[REG_R11] = (greg_t)f->regs[11]; + g[REG_R12] = (greg_t)f->regs[12]; + g[REG_R13] = (greg_t)f->regs[13]; + g[REG_R14] = (greg_t)f->regs[14]; + g[REG_R15] = (greg_t)f->regs[15]; + g[REG_RIP] = (greg_t)f->pc; 
+} #else -#error "cfree dbg v1 supports only aarch64 on macOS/Linux or riscv64 on Linux" +#error "cfree dbg v1 supports only aarch64 on macOS/Linux, riscv64 on Linux, or x86_64 on Linux" #endif static void dbg_signal_handler(int signo, siginfo_t *si, void *ucv) { diff --git a/driver/objdump.c b/driver/objdump.c @@ -335,6 +335,26 @@ static void dump_relocs(CfreeObjFile* f, const ObjdumpOpts* opts) { if (emitted_any) driver_printf("\n"); } +static const char* objdump_sym_at(CfreeObjFile* f, uint32_t section_idx, + uint64_t value) { + CfreeObjSymIter* it = NULL; + CfreeObjSymInfo sym; + const char* best = NULL; + + if (cfree_obj_symiter_new(f, &it) != CFREE_OK) return NULL; + for (;;) { + CfreeIterResult r = cfree_obj_symiter_next(it, &sym); + if (r != CFREE_ITER_ITEM) break; + if (sym.section != section_idx || sym.value != value) continue; + if (!sym.name || !sym.name[0]) continue; + if (sym.kind == CFREE_SK_SECTION) continue; + best = sym.name; + if (sym.kind == CFREE_SK_FUNC || sym.bind != CFREE_SB_LOCAL) break; + } + cfree_obj_symiter_free(it); + return best; +} + static void dump_disasm(const CfreeDisasmContext* dctx, CfreeObjFile* f, const ObjdumpOpts* opts) { uint32_t nsec = cfree_obj_nsections(f); @@ -369,7 +389,12 @@ static void dump_disasm(const CfreeDisasmContext* dctx, CfreeObjFile* f, for (;;) { CfreeIterResult r = cfree_disasm_iter_next(dis, &insn); uint32_t b; + const char* label; if (r != CFREE_ITER_ITEM) break; + label = objdump_sym_at(f, i, insn.vaddr); + if (label) + driver_printf("%016llx <%s>:\n", (unsigned long long)insn.vaddr, + label); driver_printf("%8llx:\t", (unsigned long long)insn.vaddr); for (b = 0; b < insn.nbytes; ++b) driver_printf("%02x ", insn.bytes[b]); for (b = insn.nbytes; b < 8; ++b) driver_printf(" "); diff --git a/driver/runtime.c b/driver/runtime.c @@ -25,10 +25,14 @@ typedef struct RuntimeVariant { } RuntimeVariant; static const char* const kRtSrcX64[] = { - "int/int.c", "fp/fp.c", - "mem/mem.c", 
"atomic/atomic_freestanding.c", - "cfree/ifunc_init.c", "int64/int64.c", - "coro/x86_64.c", "coro/coro.c", + "assert/assert.c", "int/int.c", + "int/si_div.c", "fp/fp.c", + "mem/mem.c", "string/string.c", + "stdlib/stdlib.c", "stdlib/qsort.c", + "stdio/printf.c", "atomic/atomic_freestanding.c", + "cache/clear_cache.c", "cfree/ifunc_init.c", + "int64/int64.c", "coro/x86_64.c", + "coro/coro.c", }; static const char* const kRtSrcAarch64Linux[] = { diff --git a/lang/c/pp/pp.c b/lang/c/pp/pp.c @@ -349,6 +349,26 @@ static void pp_register_target_predefined(Pp* pp) { pp_define(pp, "__USER_LABEL_PREFIX__", target.obj == CFREE_OBJ_MACHO ? "_" : ""); + /* Byte / type sizes. cfree uses a single LP64 (or ILP32) model across + * every supported target: int=4, short=2, long-long=8, float=4, double=8, + * long-double=8 (sharing the double representation — see the + * __LDBL_* block below). long and pointer-derived types track ptr_size. + * These macros let portable C code probe widths without first pulling in + * <limits.h> / <stddef.h>. */ + pp_define(pp, "__CHAR_BIT__", "8"); + pp_define(pp, "__SIZEOF_SHORT__", "2"); + pp_define(pp, "__SIZEOF_INT__", "4"); + pp_define(pp, "__SIZEOF_LONG__", lp64 ? "8" : "4"); + pp_define(pp, "__SIZEOF_LONG_LONG__", "8"); + pp_define(pp, "__SIZEOF_POINTER__", lp64 ? "8" : "4"); + pp_define(pp, "__SIZEOF_SIZE_T__", lp64 ? "8" : "4"); + pp_define(pp, "__SIZEOF_PTRDIFF_T__", lp64 ? "8" : "4"); + pp_define(pp, "__SIZEOF_WCHAR_T__", "4"); + pp_define(pp, "__SIZEOF_WINT_T__", "4"); + pp_define(pp, "__SIZEOF_FLOAT__", "4"); + pp_define(pp, "__SIZEOF_DOUBLE__", "8"); + pp_define(pp, "__SIZEOF_LONG_DOUBLE__", "8"); + /* stddef.h base aliases */ pp_define(pp, "__SIZE_TYPE__", lp64 ? "unsigned long" : "unsigned int"); pp_define(pp, "__PTRDIFF_TYPE__", lp64 ? "long" : "int"); diff --git a/src/abi/abi_apple_x64.c b/src/abi/abi_apple_x64.c @@ -0,0 +1,21 @@ +/* Apple x86_64 (Darwin) ABI dispatch. 
+ * + * Darwin x86_64 ABI is identical to Linux SysV x86_64; this seam exists + * so future divergences (alignment, struct passing edge cases) have a + * place to live. Today the vtable thinly delegates to the SysV + * classifier and reuses its va_list_info. */ + +#include "abi/abi_internal.h" +#include "core/core.h" + +extern const ABIVtable sysv_x64_vtable; + +static ABIFuncInfo* apple_x64_compute_func_info(TargetABI* a, + CfreeCgTypeId fn) { + return sysv_x64_vtable.compute_func_info(a, fn); +} + +const ABIVtable apple_x64_vtable = { + .compute_func_info = apple_x64_compute_func_info, + .va_list_info = {24, 8, ABI_SC_VOID, 0, 0, 0}, +}; diff --git a/src/abi/abi_internal.h b/src/abi/abi_internal.h @@ -23,6 +23,7 @@ extern const ABIVtable rv64_vtable; /* Apple Darwin variants — selected when (arch, os) matches. See * abi.c::select_vtable. */ extern const ABIVtable apple_arm64_vtable; +extern const ABIVtable apple_x64_vtable; /* Shared TargetABI internals. The struct definition is here so each ABI * TU can reach into the per-TU caches via TargetABI*. abi.c owns the diff --git a/src/abi/abi_sysv_x64.c b/src/abi/abi_sysv_x64.c @@ -1,17 +1,8 @@ -/* SysV AMD64 ABI — minimal classifier. +/* SysV AMD64 ABI classifier. * - * Covers the subset the cg test harness needs through the spine: - * void -> IGNORE - * integer ≤ 8B -> DIRECT, one INT part (rdi..r9 for args; rax for return) - * pointer -> DIRECT, one INT part - * float/double -> DIRECT, one FP part (xmm0..xmm7 for args; xmm0 return) - * small struct -> DIRECT, INT parts up to 16B (passed in up to 2 GPRs) - * large struct -> INDIRECT (sret for return; byval for args) - * - * The full SysV INTEGER/SSE eight-byte classification (with X87/COMPLEX_X87/ - * NO_CLASS rules and the MEMORY-pulls-down rule) is deferred — for the - * cg corpus this approximation is enough and matches what the rv64 ABI - * does today. */ + * Implements the INTEGER/SSE/MEMORY subset used by cfree's scalar and record + * types. 
x87 long double still routes through memory because the backend does + * not have x87 codegen yet. */ #include <string.h> @@ -76,6 +67,95 @@ static void classify_scalar(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out, out->nparts = 1; } +typedef enum SysVClass { + SYSV_NO_CLASS, + SYSV_INTEGER, + SYSV_SSE, + SYSV_MEMORY, +} SysVClass; + +static SysVClass merge_class(SysVClass a, SysVClass b) { + if (a == b) return a; + if (a == SYSV_NO_CLASS) return b; + if (b == SYSV_NO_CLASS) return a; + if (a == SYSV_MEMORY || b == SYSV_MEMORY) return SYSV_MEMORY; + if (a == SYSV_INTEGER || b == SYSV_INTEGER) return SYSV_INTEGER; + return SYSV_SSE; +} + +static int mark_eightbytes(SysVClass cls[2], u32 offset, u32 size, + SysVClass k) { + if (!size) return 1; + if (offset >= 16u || offset + size > 16u) return 0; + u32 first = offset / 8u; + u32 last = (offset + size - 1u) / 8u; + for (u32 i = first; i <= last; ++i) { + cls[i] = merge_class(cls[i], k); + if (cls[i] == SYSV_MEMORY) return 0; + } + return 1; +} + +static int classify_range(TargetABI* a, CfreeCgTypeId t, u32 base, + SysVClass cls[2]) { + const CgType* ty = cg_type_get(a->c, t); + ABITypeInfo ti; + if (!ty) return 0; + if (ty->kind == CFREE_CG_TYPE_ALIAS) { + return classify_range(a, ty->alias.base, base, cls); + } + if (ty->kind == CFREE_CG_TYPE_ENUM) { + return classify_range(a, ty->enum_.base, base, cls); + } + ti = abi_internal_type_info(a, t); + switch (ty->kind) { + case CFREE_CG_TYPE_BOOL: + case CFREE_CG_TYPE_INT: + case CFREE_CG_TYPE_PTR: + return mark_eightbytes(cls, base, ti.size, SYSV_INTEGER); + case CFREE_CG_TYPE_FLOAT: + if (ti.size == 4u || ti.size == 8u) + return mark_eightbytes(cls, base, ti.size, SYSV_SSE); + return 0; + case CFREE_CG_TYPE_ARRAY: { + ABITypeInfo ei = abi_internal_type_info(a, ty->array.elem); + for (u64 i = 0; i < ty->array.count; ++i) { + if (i > UINT32_MAX || ei.size > UINT32_MAX || + base > UINT32_MAX - (u32)(i * ei.size)) + return 0; + if (!classify_range(a, ty->array.elem, base 
+ (u32)(i * ei.size), + cls)) + return 0; + } + return 1; + } + case CFREE_CG_TYPE_RECORD: { + const ABIRecordLayout* L = abi_cg_record_layout(a, t); + if (!L || L->size > 16u) return 0; + for (u32 i = 0; i < ty->record.nfields; ++i) { + const CgTypeField* f = &ty->record.fields[i]; + const ABIFieldLayout* fl = &L->fields[i]; + ABITypeInfo fi = abi_internal_type_info(a, f->type); + if ((f->flags & CFREE_CG_FIELD_BITFIELD) != 0) { + if (fl->bit_width == 0) continue; + if (!mark_eightbytes(cls, base + fl->offset, fl->storage_size, + SYSV_INTEGER)) + return 0; + continue; + } + if (fi.size && fi.align && ((base + fl->offset) % fi.align) != 0) + return 0; + if (!classify_range(a, f->type, base + fl->offset, cls)) return 0; + } + return 1; + } + case CFREE_CG_TYPE_VOID: + return 1; + default: + return mark_eightbytes(cls, base, ti.size, SYSV_INTEGER); + } +} + static void classify_aggregate(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out, int is_return) { ABITypeInfo ti = abi_internal_type_info(a, t); @@ -84,13 +164,22 @@ static void classify_aggregate(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out, return; } if (ti.size <= 16) { + SysVClass cls[2] = {SYSV_NO_CLASS, SYSV_NO_CLASS}; + if (!classify_range(a, t, 0, cls)) { + out->kind = ABI_ARG_INDIRECT; + out->flags = is_return ? ABI_AF_SRET : ABI_AF_BYVAL; + out->indirect_align = ti.align ? ti.align : 8; + out->parts = NULL; + out->nparts = 0; + return; + } u32 nparts = (ti.size + 7) / 8; ABIArgPart* parts = arena_array(a->c->tu, ABIArgPart, nparts); memset(parts, 0, sizeof(ABIArgPart) * nparts); u32 off = 0; for (u32 i = 0; i < nparts; ++i) { u32 chunk = (ti.size - off > 8) ? 8 : (ti.size - off); - parts[i].cls = ABI_CLASS_INT; + parts[i].cls = (cls[i] == SYSV_SSE) ? 
ABI_CLASS_FP : ABI_CLASS_INT; parts[i].loc = ABI_LOC_REG; parts[i].size = chunk; parts[i].align = 8; @@ -98,7 +187,7 @@ static void classify_aggregate(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out, off += chunk; } out->kind = ABI_ARG_DIRECT; - out->flags = ABI_AF_NONE; + out->flags = nparts > 1 ? ABI_AF_SPLIT : ABI_AF_NONE; out->parts = parts; out->nparts = (u16)nparts; out->indirect_align = 0; @@ -131,6 +220,20 @@ static void classify_one(TargetABI* a, CfreeCgTypeId t, ABIArgInfo* out, } } +/* SysV x86_64 register-pool sizes for the variadic reg-save area. + * GP: rdi, rsi, rdx, rcx, r8, r9 — 6 slots * 8 bytes = 48 + * FP: xmm0..xmm7 — 8 slots * 16 bytes = 128 + * Total reg-save area is 48 + 128 = 176 bytes; the fp_offset field starts + * at 48 (right after the GP block) and ranges up to 176. */ +#define SYSV_X64_GP_REG_COUNT 6u +#define SYSV_X64_FP_REG_COUNT 8u +#define SYSV_X64_GP_SLOT_SIZE 8u +#define SYSV_X64_FP_SLOT_SIZE 16u +#define SYSV_X64_GP_MAX_OFFSET (SYSV_X64_GP_REG_COUNT * SYSV_X64_GP_SLOT_SIZE) +#define SYSV_X64_FP_BASE_OFFSET SYSV_X64_GP_MAX_OFFSET +#define SYSV_X64_FP_MAX_OFFSET \ + (SYSV_X64_FP_BASE_OFFSET + SYSV_X64_FP_REG_COUNT * SYSV_X64_FP_SLOT_SIZE) + static ABIFuncInfo* sysv_x64_compute_func_info(TargetABI* a, CfreeCgTypeId fn) { ABIFuncInfo* info = arena_new(a->c->tu, ABIFuncInfo); @@ -152,6 +255,43 @@ static ABIFuncInfo* sysv_x64_compute_func_info(TargetABI* a, } else { info->params = NULL; } + + /* Variadic register-save-area offsets at function entry. Counts the + * GP/FP register slots consumed by the fixed (named) parameters; va_start + * uses these as the initial gp_offset / fp_offset in the __va_list_tag + * struct so the va_arg fetch path skips over the already-consumed slots + * in the reg_save_area before falling through to overflow_arg_area. 
+ * + * overflow_arg_area is computed at the call site from rbp + 16 + stack + * args at va_start time (see x_va_start_ in src/arch/x64/ops.c), so the + * vararg_overflow_offset metadata is left at 0 here. */ + if (info->variadic) { + u32 gp_used = info->has_sret ? 1u : 0u; + u32 fp_used = 0u; + for (u32 i = 0; i < info->nparams; ++i) { + const ABIArgInfo* ai = &info->params[i]; + if (ai->kind == ABI_ARG_INDIRECT) { + if (gp_used < SYSV_X64_GP_REG_COUNT) ++gp_used; + continue; + } + if (ai->kind != ABI_ARG_DIRECT) continue; + for (u32 p = 0; p < ai->nparts; ++p) { + if (ai->parts[p].cls == ABI_CLASS_FP) { + if (fp_used < SYSV_X64_FP_REG_COUNT) ++fp_used; + } else if (ai->parts[p].cls == ABI_CLASS_INT) { + if (gp_used < SYSV_X64_GP_REG_COUNT) ++gp_used; + } + } + } + if (gp_used > SYSV_X64_GP_REG_COUNT) gp_used = SYSV_X64_GP_REG_COUNT; + if (fp_used > SYSV_X64_FP_REG_COUNT) fp_used = SYSV_X64_FP_REG_COUNT; + info->vararg_gp_offset = gp_used * SYSV_X64_GP_SLOT_SIZE; + info->vararg_fp_offset = + SYSV_X64_FP_BASE_OFFSET + fp_used * SYSV_X64_FP_SLOT_SIZE; + info->vararg_overflow_offset = 0; + (void)SYSV_X64_GP_MAX_OFFSET; + (void)SYSV_X64_FP_MAX_OFFSET; + } return info; } diff --git a/src/api/disasm.c b/src/api/disasm.c @@ -29,21 +29,27 @@ struct CfreeDisasmIter { uint64_t vaddr0; const CfreeObjFile* annot; ObjBuilder* annot_ob; + uint32_t annot_section; char ann_buf[DASM_ANN_CAP]; StrBuf ann; }; -static const char* dasm_overlay(CfreeDisasmIter* it, uint64_t vaddr) { +static const char* dasm_overlay(CfreeDisasmIter* it, uint64_t vaddr, + uint32_t nbytes) { ObjBuilder* obj = it->annot_ob; if (!obj) return ""; strbuf_reset(&it->ann); u64 want = vaddr - it->vaddr0; + u64 end = want + nbytes; u32 nrel = obj_reloc_total(obj); for (u32 i = 0; i < nrel; ++i) { const Reloc* r = obj_reloc_at(obj, i); if (!r) continue; - if ((u64)r->offset != want) continue; + if (it->annot_section != CFREE_SECTION_NONE && + r->section_id != (ObjSecId)(it->annot_section + 1)) + continue; + if 
((u64)r->offset < want || (u64)r->offset >= end) continue; const ObjSym* sym = (r->sym != OBJ_SYM_NONE) ? obj_symbol_get(obj, r->sym) : NULL; @@ -78,6 +84,20 @@ static const char* dasm_overlay(CfreeDisasmIter* it, uint64_t vaddr) { return strbuf_cstr(&it->ann); } +static uint32_t dasm_find_section(const CfreeObjFile* f, const uint8_t* bytes, + size_t len) { + uint32_t nsec, i; + if (!f || !bytes) return CFREE_SECTION_NONE; + nsec = cfree_obj_nsections(f); + for (i = 0; i < nsec; ++i) { + const uint8_t* data = NULL; + size_t n = 0; + if (cfree_obj_section_data(f, i, &data, &n) != CFREE_OK) continue; + if (data == bytes && n == len) return i; + } + return CFREE_SECTION_NONE; +} + CfreeStatus cfree_disasm_iter_new(const CfreeDisasmContext* dctx, const uint8_t* bytes, size_t len, uint64_t vaddr, const CfreeObjFile* annot, @@ -107,6 +127,7 @@ CfreeStatus cfree_disasm_iter_new(const CfreeDisasmContext* dctx, it->vaddr0 = vaddr; it->annot = annot; it->annot_ob = annot ? cfree_objfile_builder(annot) : NULL; + it->annot_section = dasm_find_section(annot, bytes, len); strbuf_init(&it->ann, it->ann_buf, sizeof it->ann_buf); *out = it; return CFREE_OK; @@ -131,7 +152,7 @@ CfreeIterResult cfree_disasm_iter_next(CfreeDisasmIter* it, CfreeInsn* out) { it->off += n; return CFREE_ITER_ITEM; } - out->annotation = dasm_overlay(it, vaddr); + out->annotation = dasm_overlay(it, vaddr, n); it->off += n; return CFREE_ITER_ITEM; } @@ -229,20 +250,19 @@ CfreeStatus cfree_disasm_obj(const CfreeContext* ctx, const CfreeObjFile* f, w_str(out, s.name ? 
s.name : ".text"); w_str(out, ":\n\n"); - head = dasm_sym_at(f, i, 0); - if (head) { - w_hex(out, 0, 16); - w_str(out, " <"); - w_str(out, head); - w_str(out, ">:\n"); - } - st = cfree_disasm_iter_new(&dctx, data, n, 0, f, &it); if (st != CFREE_OK) return st; for (;;) { CfreeInsn ins; CfreeIterResult r = cfree_disasm_iter_next(it, &ins); if (r != CFREE_ITER_ITEM) break; + head = dasm_sym_at(f, i, ins.vaddr); + if (head) { + w_hex(out, ins.vaddr, 16); + w_str(out, " <"); + w_str(out, head); + w_str(out, ">:\n"); + } w_hex_padded(out, ins.vaddr, 8); w_str(out, ":\t"); if (ins.nbytes == 4) { diff --git a/src/api/object_file.c b/src/api/object_file.c @@ -291,6 +291,70 @@ CfreeStatus cfree_obj_reliter_new(CfreeObjFile* f, CfreeObjRelocIter** out) { return CFREE_OK; } +static const char* cfree_obj_reloc_kind_name(CfreeArchKind arch, + CfreeObjFmt fmt, u32 kind) { + if (fmt == CFREE_OBJ_ELF && arch == CFREE_ARCH_X86_64) { + switch ((RelocKind)kind) { + case R_NONE: + return "R_X86_64_NONE"; + case R_ABS64: + return "R_X86_64_64"; + case R_PC32: + return "R_X86_64_PC32"; + case R_GOT32: + return "R_X86_64_GOT32"; + case R_PLT32: + case R_X64_PLT32: + return "R_X86_64_PLT32"; + case R_X64_COPY: + return "R_X86_64_COPY"; + case R_X64_GLOB_DAT: + return "R_X86_64_GLOB_DAT"; + case R_X64_JUMP_SLOT: + return "R_X86_64_JUMP_SLOT"; + case R_X64_RELATIVE: + return "R_X86_64_RELATIVE"; + case R_X64_GOTPCREL: + return "R_X86_64_GOTPCREL"; + case R_ABS32: + return "R_X86_64_32"; + case R_X64_32S: + return "R_X86_64_32S"; + case R_X64_PC8: + return "R_X86_64_PC8"; + case R_X64_DTPMOD64: + return "R_X86_64_DTPMOD64"; + case R_X64_DTPOFF64: + return "R_X86_64_DTPOFF64"; + case R_X64_TPOFF64: + return "R_X86_64_TPOFF64"; + case R_X64_TLSGD: + return "R_X86_64_TLSGD"; + case R_X64_TLSLD: + return "R_X86_64_TLSLD"; + case R_X64_DTPOFF32: + return "R_X86_64_DTPOFF32"; + case R_X64_GOTTPOFF: + return "R_X86_64_GOTTPOFF"; + case R_X64_TPOFF32: + return "R_X86_64_TPOFF32"; + case R_PC64: + 
return "R_X86_64_PC64"; + case R_X64_GOTOFF64: + return "R_X86_64_GOTOFF64"; + case R_X64_GOTPC32: + return "R_X86_64_GOTPC32"; + case R_X64_GOTPCRELX: + return "R_X86_64_GOTPCRELX"; + case R_X64_REX_GOTPCRELX: + return "R_X86_64_REX_GOTPCRELX"; + default: + break; + } + } + return NULL; +} + CfreeIterResult cfree_obj_reliter_next(CfreeObjRelocIter* it, CfreeObjReloc* out) { const Reloc* r; @@ -306,7 +370,8 @@ CfreeIterResult cfree_obj_reliter_next(CfreeObjRelocIter* it, out->kind.arch = it->file->target.arch; out->kind.obj_fmt = it->file->fmt; out->kind.code = (uint32_t)r->kind; - out->kind_name = reloc_kind_name(r->kind); + out->kind_name = cfree_obj_reloc_kind_name(it->file->target.arch, + it->file->fmt, r->kind); if (r->sym == OBJ_SYM_NONE) { out->sym = CFREE_OBJ_SYMBOL_NONE; diff --git a/src/arch/aa64/arch.c b/src/arch/aa64/arch.c @@ -13,6 +13,7 @@ #include "obj/obj.h" extern const LinkArchDesc link_arch_aa64; +extern const ArchDbgOps aa64_dbg_ops; static const ABIVtable* aa64_abi_vtable(Compiler* c, CfreeOSKind os) { (void)c; @@ -53,6 +54,11 @@ static const ArchMachoOps aa64_macho_ops = { .reloc_from = macho_aarch64_reloc_from, }; +static const ArchDwarfOps aa64_dwarf_ops = { + .min_inst_len = 4u, + .max_ops_per_inst = 1u, +}; + static int aa64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) { const Section* s; u8 cur[4]; @@ -153,6 +159,8 @@ const ArchImpl arch_impl_aa64 = { .link = &link_arch_aa64, .elf = &aa64_elf_ops, .macho = &aa64_macho_ops, + .dwarf = &aa64_dwarf_ops, + .dbg = &aa64_dbg_ops, .predefined_macros = aa64_predefined_macros, .npredefined_macros = (u32)(sizeof aa64_predefined_macros / sizeof aa64_predefined_macros[0]), diff --git a/src/arch/aa64/dbg.c b/src/arch/aa64/dbg.c @@ -27,15 +27,18 @@ * stale internal_bp is cleared by the next prepare; finalize gates on * PC == return_pc so it stays a no-op when control left the slot. 
*/ -#include "dbg/dbg.h" +#include "arch/arch.h" #include <string.h> #include "arch/aa64/isa.h" #define SHIM_X16 16u /* IP0; safe to clobber inside a shim */ +#define AA64_DBG_INSN_LEN 4u +#define AA64_DBG_BL_MASK 0xFC000000u +#define AA64_DBG_BL_OP 0x94000000u -uint32_t dbg_aa64_brk_word(void) { +static uint32_t aa64_dbg_brk_word(void) { return aa64_brk(0); } @@ -77,9 +80,9 @@ static int64_t sign_extend(uint64_t v, int bits) { return (int64_t)((v ^ m) - m); } -int dbg_aa64_build_shim(uint32_t orig_insn, uint64_t orig_pc, - void* scratch_write, uint64_t scratch_runtime, - u32* shim_len) { +static int aa64_dbg_build_shim_word(uint32_t orig_insn, uint64_t orig_pc, + void* scratch_write, + uint64_t scratch_runtime, u32* shim_len) { uint8_t* w = (uint8_t*)scratch_write; uint32_t brk = aa64_brk(0); int64_t pc_delta; @@ -233,3 +236,64 @@ int dbg_aa64_build_shim(uint32_t orig_insn, uint64_t orig_pc, *shim_len = 4; return 0; } + +static CfreeStatus aa64_dbg_breakpoint_patch(u8* out, u32 cap, u32* len_out) { + uint32_t brk = aa64_dbg_brk_word(); + if (!out || !len_out) return CFREE_INVALID; + if (cap < AA64_DBG_INSN_LEN) return CFREE_INVALID; + memcpy(out, &brk, sizeof(brk)); + *len_out = AA64_DBG_INSN_LEN; + return CFREE_OK; +} + +static u64 aa64_dbg_breakpoint_addr_from_fault_pc(u64 fault_pc) { + return fault_pc; +} + +static CfreeStatus aa64_dbg_decode_insn(const u8* bytes, u32 len, u64 pc, + ArchDbgInsn* out) { + if (!bytes || !out) return CFREE_INVALID; + if (len < AA64_DBG_INSN_LEN) return CFREE_UNSUPPORTED; + memset(out, 0, sizeof(*out)); + out->pc = pc; + out->len = AA64_DBG_INSN_LEN; + memcpy(out->bytes, bytes, AA64_DBG_INSN_LEN); + return CFREE_OK; +} + +static CfreeStatus aa64_dbg_build_displaced_shim(const ArchDbgInsn* insn, + void* scratch_write, + u64 scratch_runtime, + u32 scratch_cap, + u32* sentinel_off, + u64* fallthrough_pc) { + uint32_t word = 0; + if (!insn || !scratch_write || !sentinel_off || !fallthrough_pc) + return CFREE_INVALID; + if (insn->len != 
AA64_DBG_INSN_LEN) return CFREE_UNSUPPORTED; + if (scratch_cap < 24u) return CFREE_INVALID; + memcpy(&word, insn->bytes, sizeof(word)); + if (aa64_dbg_build_shim_word(word, insn->pc, scratch_write, scratch_runtime, + sentinel_off) != 0) { + return CFREE_UNSUPPORTED; + } + *fallthrough_pc = insn->pc + AA64_DBG_INSN_LEN; + return CFREE_OK; +} + +static int aa64_dbg_is_call(const ArchDbgInsn* insn) { + uint32_t word = 0; + if (!insn || insn->len != AA64_DBG_INSN_LEN) return 0; + memcpy(&word, insn->bytes, sizeof(word)); + return (word & AA64_DBG_BL_MASK) == AA64_DBG_BL_OP; +} + +const ArchDbgOps aa64_dbg_ops = { + .min_insn_len = AA64_DBG_INSN_LEN, + .max_insn_len = AA64_DBG_INSN_LEN, + .breakpoint_patch = aa64_dbg_breakpoint_patch, + .breakpoint_addr_from_fault_pc = aa64_dbg_breakpoint_addr_from_fault_pc, + .decode_insn = aa64_dbg_decode_insn, + .build_displaced_shim = aa64_dbg_build_displaced_shim, + .is_call = aa64_dbg_is_call, +}; diff --git a/src/arch/arch.h b/src/arch/arch.h @@ -1049,6 +1049,41 @@ typedef struct ArchCoffOps { u32 (*reloc_from)(u32 wire_type); } ArchCoffOps; +typedef struct ArchDwarfOps { + /* DWARF .debug_line minimum instruction length and maximum operations per + * instruction. Fixed-width ISAs normally use their instruction width; x86_64 + * uses 1 because line-program PC advances are byte granular. 
*/ + u8 min_inst_len; + u8 max_ops_per_inst; + u8 pad[2]; +} ArchDwarfOps; + +#define ARCH_DBG_MAX_TRAP_BYTES 8u +#define ARCH_DBG_MAX_INSN_BYTES 15u + +typedef struct ArchDbgInsn { + u64 pc; + u8 bytes[ARCH_DBG_MAX_INSN_BYTES]; + u32 len; +} ArchDbgInsn; + +typedef struct ArchDbgOps { + u32 min_insn_len; + u32 max_insn_len; + + CfreeStatus (*breakpoint_patch)(u8* out, u32 cap, u32* len_out); + u64 (*breakpoint_addr_from_fault_pc)(u64 fault_pc); + + CfreeStatus (*decode_insn)(const u8* bytes, u32 len, u64 pc, + ArchDbgInsn* out); + CfreeStatus (*build_displaced_shim)(const ArchDbgInsn* insn, + void* scratch_write, + u64 scratch_runtime, u32 scratch_cap, + u32* sentinel_off, + u64* fallthrough_pc); + int (*is_call)(const ArchDbgInsn* insn); +} ArchDbgOps; + typedef struct ArchImpl { CfreeArchKind kind; const char* name; @@ -1063,6 +1098,8 @@ typedef struct ArchImpl { const ArchElfOps* elf; const ArchMachoOps* macho; const ArchCoffOps* coff; + const ArchDwarfOps* dwarf; + const ArchDbgOps* dbg; const CfreePredefinedMacro* predefined_macros; u32 npredefined_macros; diff --git a/src/arch/rv64/arch.c b/src/arch/rv64/arch.c @@ -11,6 +11,7 @@ #include "obj/obj.h" extern const LinkArchDesc link_arch_rv64; +extern const ArchDbgOps rv64_dbg_ops; static const ABIVtable* rv64_abi_vtable(Compiler* c, CfreeOSKind os) { (void)c; @@ -25,6 +26,11 @@ static const ArchElfOps rv64_elf_ops = { .reloc_from = elf_riscv64_reloc_from, }; +static const ArchDwarfOps rv64_dwarf_ops = { + .min_inst_len = 4u, + .max_ops_per_inst = 1u, +}; + static int rv64_register_at_public(uint32_t idx, CfreeArchReg* out) { if (!out) return 1; return rv64_register_iter_get(idx, &out->dwarf_idx, &out->name); @@ -139,6 +145,8 @@ const ArchImpl arch_impl_rv64 = { .link = &link_arch_rv64, .elf = &rv64_elf_ops, .macho = NULL, + .dwarf = &rv64_dwarf_ops, + .dbg = &rv64_dbg_ops, .predefined_macros = rv64_predefined_macros, .npredefined_macros = (u32)(sizeof rv64_predefined_macros / sizeof rv64_predefined_macros[0]), 
diff --git a/src/arch/rv64/dbg.c b/src/arch/rv64/dbg.c @@ -329,3 +329,67 @@ int dbg_rv64_build_shim(uint32_t orig_insn, uint64_t orig_pc, *brk_offset = 4; return 0; } + +static CfreeStatus rv64_dbg_breakpoint_patch(u8* out, u32 cap, u32* len_out) { + uint32_t brk = dbg_rv64_brk_word(); + if (!out || !len_out) return CFREE_INVALID; + if (cap < 4u) return CFREE_INVALID; + memcpy(out, &brk, sizeof(brk)); + *len_out = 4u; + return CFREE_OK; +} + +static u64 rv64_dbg_breakpoint_addr_from_fault_pc(u64 fault_pc) { + return fault_pc; +} + +static CfreeStatus rv64_dbg_decode_insn(const u8* bytes, u32 len, u64 pc, + ArchDbgInsn* out) { + if (!bytes || !out) return CFREE_INVALID; + if (len < 4u) return CFREE_UNSUPPORTED; + memset(out, 0, sizeof(*out)); + out->pc = pc; + out->len = 4u; + memcpy(out->bytes, bytes, 4u); + return CFREE_OK; +} + +static CfreeStatus rv64_dbg_build_displaced_shim(const ArchDbgInsn* insn, + void* scratch_write, + u64 scratch_runtime, + u32 scratch_cap, + u32* sentinel_off, + u64* fallthrough_pc) { + uint32_t word = 0; + if (!insn || !scratch_write || !sentinel_off || !fallthrough_pc) + return CFREE_INVALID; + if (insn->len != 4u) return CFREE_UNSUPPORTED; + if (scratch_cap < 28u) return CFREE_INVALID; + memcpy(&word, insn->bytes, sizeof(word)); + if (dbg_rv64_build_shim(word, insn->pc, scratch_write, scratch_runtime, + sentinel_off) != 0) { + return CFREE_UNSUPPORTED; + } + *fallthrough_pc = insn->pc + 4u; + return CFREE_OK; +} + +static int rv64_dbg_is_call(const ArchDbgInsn* insn) { + uint32_t word = 0; + uint32_t op; + if (!insn || insn->len != 4u) return 0; + memcpy(&word, insn->bytes, sizeof(word)); + op = rv_opcode(word); + if (op != RV_JAL && op != RV_JALR) return 0; + return rv_rd(word) != RV_ZERO; +} + +const ArchDbgOps rv64_dbg_ops = { + .min_insn_len = 4u, + .max_insn_len = 4u, + .breakpoint_patch = rv64_dbg_breakpoint_patch, + .breakpoint_addr_from_fault_pc = rv64_dbg_breakpoint_addr_from_fault_pc, + .decode_insn = rv64_dbg_decode_insn, + 
.build_displaced_shim = rv64_dbg_build_displaced_shim, + .is_call = rv64_dbg_is_call, +}; diff --git a/src/arch/x64/alloc.c b/src/arch/x64/alloc.c @@ -27,6 +27,26 @@ int x_resolve_reg_name(CGTarget* t, Sym name, Reg* out, RegClass* cls_out) { if (!s || !len || len >= sizeof buf) return 1; memcpy(buf, s, len); buf[len] = '\0'; + if (!strcmp(buf, "ah")) { + if (out) *out = X64_RAX; + if (cls_out) *cls_out = RC_INT; + return 0; + } + if (!strcmp(buf, "ch")) { + if (out) *out = X64_RCX; + if (cls_out) *cls_out = RC_INT; + return 0; + } + if (!strcmp(buf, "dh")) { + if (out) *out = X64_RDX; + if (cls_out) *cls_out = RC_INT; + return 0; + } + if (!strcmp(buf, "bh")) { + if (out) *out = X64_RBX; + if (cls_out) *cls_out = RC_INT; + return 0; + } if (x64_register_hw_index(buf, &idx) == 0) { if (out) *out = (Reg)idx; if (cls_out) *cls_out = RC_INT; @@ -80,6 +100,12 @@ static void x_consume_param_location(XImpl* a, const ABIArgInfo* ai) { a->next_param_stack += 8; return; } + if (ai->kind == ABI_ARG_DIRECT && x64_abi_direct_to_stack( + ai, a->next_param_int, + a->next_param_fp)) { + a->next_param_stack += (u32)ai->nparts * 8u; + return; + } for (u16 i = 0; i < ai->nparts; ++i) { const ABIArgPart* pt = &ai->parts[i]; if (pt->cls == ABI_CLASS_INT) { @@ -198,6 +224,27 @@ CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) { return st; } /* DIRECT */ + if (x64_abi_direct_to_stack(ai, a->next_param_int, a->next_param_fp)) { + for (u16 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* pt = &ai->parts[i]; + u32 caller_off = a->next_param_stack; + u32 sz = pt->size; + a->next_param_stack += 8; + if (pt->cls == ABI_CLASS_FP) { + u8 prefix = (sz == 8) ? 
0xF2 : 0xF3; + emit_sse_load(t->mc, prefix, 0x10, X64_XMM0, incoming_stack_base, + incoming_stack_bias + (i32)caller_off); + emit_sse_store(t->mc, prefix, 0x11, X64_XMM0, X64_RBP, + -(i32)s->off + (i32)pt->src_offset); + } else { + emit_mov_load(t->mc, sz, 0, X64_RAX, incoming_stack_base, + incoming_stack_bias + (i32)caller_off); + emit_mov_store(t->mc, sz, X64_RAX, X64_RBP, + -(i32)s->off + (i32)pt->src_offset); + } + } + return st; + } for (u16 i = 0; i < ai->nparts; ++i) { const ABIArgPart* pt = &ai->parts[i]; u32 part_off = pt->src_offset; diff --git a/src/arch/x64/arch.c b/src/arch/x64/arch.c @@ -12,11 +12,16 @@ #include "obj/obj.h" extern const LinkArchDesc link_arch_x64; +extern const ArchDbgOps x64_dbg_ops; static const ABIVtable* x64_abi_vtable(Compiler* c, CfreeOSKind os) { (void)c; - (void)os; - return &sysv_x64_vtable; + switch (os) { + case CFREE_OS_MACOS: + return &apple_x64_vtable; + default: + return &sysv_x64_vtable; + } } static const ArchElfOps x64_elf_ops = { @@ -35,6 +40,11 @@ static const ArchMachoOps x64_macho_ops = { .reloc_from = macho_x86_64_reloc_from, }; +static const ArchDwarfOps x64_dwarf_ops = { + .min_inst_len = 1u, + .max_ops_per_inst = 1u, +}; + static int x64_apply_label_fixup(Compiler* c, const ArchLabelFixup* fx) { (void)c; if (!fx || fx->kind != R_PC32 || fx->width != 4) return 1; @@ -74,6 +84,8 @@ const ArchImpl arch_impl_x64 = { .link = &link_arch_x64, .elf = &x64_elf_ops, .macho = &x64_macho_ops, + .dwarf = &x64_dwarf_ops, + .dbg = &x64_dbg_ops, .predefined_macros = x64_predefined_macros, .npredefined_macros = (u32)(sizeof x64_predefined_macros / sizeof x64_predefined_macros[0]), diff --git a/src/arch/x64/asm.c b/src/arch/x64/asm.c @@ -25,6 +25,7 @@ struct X64Asm { typedef enum X64AsmOperandKind { X64_ASM_OP_REG, + X64_ASM_OP_XMM, X64_ASM_OP_IMM, X64_ASM_OP_MEM, X64_ASM_OP_IND_REG, @@ -35,18 +36,14 @@ typedef struct X64AsmOperand { u8 width; u8 reg; u8 base; + u8 high8; + u8 pad[3]; i64 imm; i32 disp; } X64AsmOperand; 
-static int sym_eq(AsmDriver* d, Sym s, const char* lit) { - size_t n = 0; - const char* p = pool_str(asm_driver_pool(d), s, &n); - return p && strlen(lit) == n && memcmp(p, lit, n) == 0; -} - static int x64_reg_from_name(AsmDriver* d, Sym s, u32* reg_out, - u32* width_out) { + u32* width_out, u32* high8_out) { size_t n = 0; const char* p = pool_str(asm_driver_pool(d), s, &n); char buf[16]; @@ -55,22 +52,63 @@ static int x64_reg_from_name(AsmDriver* d, Sym s, u32* reg_out, if (!p || n < 2 || n >= sizeof buf) return 0; memcpy(buf, p, n); buf[n] = '\0'; - if (buf[n - 1] == 'd' || buf[0] == 'e') width = 4; + if (!strcmp(buf, "ah") || !strcmp(buf, "ch") || !strcmp(buf, "dh") || + !strcmp(buf, "bh")) { + static const u32 high_map[4] = {4u, 5u, 6u, 7u}; + const char* names = "acdb"; + for (u32 i = 0; i < 4u; ++i) { + if (buf[0] == names[i]) { + if (reg_out) *reg_out = high_map[i]; + if (width_out) *width_out = 1; + if (high8_out) *high8_out = 1; + return 1; + } + } + } if (x64_register_hw_index(buf, &reg) != 0) return 0; if (reg > 15u) return 0; + if (!strcmp(buf, "al") || !strcmp(buf, "cl") || !strcmp(buf, "dl") || + !strcmp(buf, "bl") || !strcmp(buf, "spl") || !strcmp(buf, "bpl") || + !strcmp(buf, "sil") || !strcmp(buf, "dil") || buf[n - 1] == 'b') { + width = 1; + } else if (!strcmp(buf, "ax") || !strcmp(buf, "cx") || + !strcmp(buf, "dx") || !strcmp(buf, "bx") || + !strcmp(buf, "sp") || !strcmp(buf, "bp") || + !strcmp(buf, "si") || !strcmp(buf, "di") || + buf[n - 1] == 'w') { + width = 2; + } else if (buf[n - 1] == 'd' || buf[0] == 'e') { + width = 4; + } if (reg_out) *reg_out = reg; if (width_out) *width_out = width; + if (high8_out) *high8_out = 0; return 1; } -static u32 parse_reg(AsmDriver* d, u32* width_out) { +static int x64_xmm_from_name(AsmDriver* d, Sym s, u32* reg_out) { + size_t n = 0; + const char* p = pool_str(asm_driver_pool(d), s, &n); + u32 reg = 0; + if (!p || n < 4 || n > 5) return 0; + if (p[0] != 'x' || p[1] != 'm' || p[2] != 'm') return 0; + for 
(size_t i = 3; i < n; ++i) { + if (p[i] < '0' || p[i] > '9') return 0; + reg = reg * 10u + (u32)(p[i] - '0'); + } + if (reg > 15u) return 0; + if (reg_out) *reg_out = reg; + return 1; +} + +static u32 parse_reg(AsmDriver* d, u32* width_out, u32* high8_out) { AsmTok t; u32 reg; if (!asm_driver_eat_punct(d, '%')) asm_driver_panic(d, "x64 asm: expected register"); t = asm_driver_next(d); if (t.kind != ASM_TOK_IDENT || - !x64_reg_from_name(d, t.v.ident, &reg, width_out)) { + !x64_reg_from_name(d, t.v.ident, &reg, width_out, high8_out)) { asm_driver_panic(d, "x64 asm: bad register"); } return reg; @@ -83,7 +121,7 @@ static X64AsmOperand parse_operand(AsmDriver* d) { t = asm_driver_peek(d); if (asm_driver_eat_punct(d, '*')) { op.kind = X64_ASM_OP_IND_REG; - op.reg = (u8)parse_reg(d, NULL); + op.reg = (u8)parse_reg(d, NULL, NULL); return op; } if (asm_driver_eat_punct(d, '$')) { @@ -93,9 +131,26 @@ static X64AsmOperand parse_operand(AsmDriver* d) { } if (asm_driver_tok_is_punct(t, '%')) { u32 width = 8; - op.kind = X64_ASM_OP_REG; - op.reg = (u8)parse_reg(d, &width); + u32 high8 = 0; + AsmTok ident; + (void)asm_driver_next(d); + ident = asm_driver_next(d); + if (ident.kind != ASM_TOK_IDENT) asm_driver_panic(d, "x64 asm: bad register"); + if (x64_xmm_from_name(d, ident.v.ident, &width)) { + op.kind = X64_ASM_OP_XMM; + op.reg = (u8)width; + op.width = 16; + return op; + } + { + u32 reg = 0; + if (!x64_reg_from_name(d, ident.v.ident, &reg, &width, &high8)) + asm_driver_panic(d, "x64 asm: bad register"); + op.kind = X64_ASM_OP_REG; + op.reg = (u8)reg; + } op.width = (u8)width; + op.high8 = (u8)high8; return op; } op.kind = X64_ASM_OP_MEM; @@ -104,7 +159,7 @@ static X64AsmOperand parse_operand(AsmDriver* d) { op.disp = (i32)asm_driver_parse_const(d); } asm_driver_expect_punct(d, '(', "'(' in x64 memory operand"); - op.base = (u8)parse_reg(d, NULL); + op.base = (u8)parse_reg(d, NULL, NULL); asm_driver_expect_punct(d, ')', "')' in x64 memory operand"); return op; } @@ -113,16 
+168,6 @@ static void expect_comma(AsmDriver* d) { if (!asm_driver_eat_comma(d)) asm_driver_panic(d, "x64 asm: expected ','"); } -static void emit_cmov_eq(MCEmitter* mc, u32 dst, u32 src) { - u8 op[2] = {0x0f, 0x44}; - emit_rex(mc, 1, dst, 0, src); - mc->emit_bytes(mc, op, 2); - { - u8 mr = modrm(3u, dst, src); - mc->emit_bytes(mc, &mr, 1); - } -} - static void emit_indirect_branch(MCEmitter* mc, u32 sub, u32 reg) { u8 op = 0xff; emit_rex(mc, 0, 0, 0, reg); @@ -133,106 +178,1048 @@ static void emit_indirect_branch(MCEmitter* mc, u32 sub, u32 reg) { } } -static void emit_ud2(MCEmitter* mc) { - u8 op[2] = {0x0f, 0x0b}; - mc->emit_bytes(mc, op, 2); +static void emit_packed(MCEmitter* mc, const u8* bytes, u32 n) { + mc->emit_bytes(mc, bytes, n); } -static void x64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) { - X64Asm* a = (X64Asm*)base; - MCEmitter* mc = asm_driver_mc(d); +static int byte_reg_needs_rex(const X64AsmOperand* op) { + return op && !op->high8 && op->reg >= 4u; +} + +static void reject_high8_with_rex(AsmDriver* d, const X64AsmOperand* a, + const X64AsmOperand* b) { + if ((a && a->high8 && byte_reg_needs_rex(b)) || + (b && b->high8 && byte_reg_needs_rex(a))) { + asm_driver_panic(d, "x64 asm: high-byte register cannot use REX"); + } +} + +static __attribute__((unused)) void emit_movb_rr_operand( + AsmDriver* d, MCEmitter* mc, X64AsmOperand dst, X64AsmOperand src) { + u8 ob = 0x88; + reject_high8_with_rex(d, &dst, &src); + if (byte_reg_needs_rex(&dst) || byte_reg_needs_rex(&src)) + emit_rex_force(mc, 0, src.reg, 0, dst.reg); + else + emit_rex(mc, 0, src.reg, 0, dst.reg); + mc->emit_bytes(mc, &ob, 1); + { + u8 mr = modrm(3u, src.reg, dst.reg); + mc->emit_bytes(mc, &mr, 1); + } +} + +static __attribute__((unused)) void emit_movb_store_operand( + AsmDriver* d, MCEmitter* mc, X64AsmOperand src, X64AsmOperand dst) { + if (src.high8) { + u8 ob = 0x88; + if (dst.base >= 8u) asm_driver_panic(d, "x64 asm: high-byte register cannot use REX"); + 
mc->emit_bytes(mc, &ob, 1); + emit_mem_operand(mc, src.reg, dst.base, dst.disp); + return; + } + emit_mov_store(mc, 1, src.reg, dst.base, dst.disp); +} + +static __attribute__((unused)) void emit_rm_imm(AsmDriver* d, MCEmitter* mc, + u32 width, u8 opc, u32 sub, + X64AsmOperand dst, i32 imm, + int imm32) { + u8 buf[16]; + u32 n = 0; + if (dst.kind != X64_ASM_OP_REG && dst.kind != X64_ASM_OP_MEM) + asm_driver_panic(d, "x64 asm: expected register or memory destination"); + if (width == 2u) buf[n++] = X64_OPSIZE_PFX; + if (dst.kind == X64_ASM_OP_REG) { + n += x64_pack_rex(buf + n, width == 8u, 0, 0, dst.reg); + buf[n++] = opc; + buf[n++] = x64_modrm(3u, sub, dst.reg); + } else { + n += x64_pack_rex(buf + n, width == 8u, 0, 0, dst.base); + buf[n++] = opc; + n += x64_pack_mem(buf + n, sub, dst.base, dst.disp); + } + if (imm32) + n += x64_put_u32le(buf + n, (u32)imm); + else + buf[n++] = (u8)(i8)imm; + emit_packed(mc, buf, n); +} + +static __attribute__((unused)) void emit_rm_op(AsmDriver* d, MCEmitter* mc, + u32 width, u8 opc, u32 sub, + X64AsmOperand dst) { + u8 buf[16]; + u32 n = 0; + if (dst.kind != X64_ASM_OP_REG && dst.kind != X64_ASM_OP_MEM) + asm_driver_panic(d, "x64 asm: expected register or memory operand"); + if (width == 2u) buf[n++] = X64_OPSIZE_PFX; + if (dst.kind == X64_ASM_OP_REG) { + n += x64_pack_rex(buf + n, width == 8u, 0, 0, dst.reg); + buf[n++] = opc; + buf[n++] = x64_modrm(3u, sub, dst.reg); + } else { + n += x64_pack_rex(buf + n, width == 8u, 0, 0, dst.base); + buf[n++] = opc; + n += x64_pack_mem(buf + n, sub, dst.base, dst.disp); + } + emit_packed(mc, buf, n); +} + +static __attribute__((unused)) void emit_reg_rm_twobyte( + AsmDriver* d, MCEmitter* mc, u32 width, u8 opcode2, u32 dst, + X64AsmOperand src, int force_rex, u8 prefix) { + u8 buf[16]; + u32 n = 0; + if (src.kind != X64_ASM_OP_REG && src.kind != X64_ASM_OP_MEM) + asm_driver_panic(d, "x64 asm: expected register or memory source"); + if (prefix) buf[n++] = prefix; + if (src.kind == 
X64_ASM_OP_REG) { + if (force_rex) + n += x64_pack_rex_force(buf + n, width == 8u, dst, 0, src.reg); + else + n += x64_pack_rex(buf + n, width == 8u, dst, 0, src.reg); + buf[n++] = X64_OPC_TWOBYTE; + buf[n++] = opcode2; + buf[n++] = x64_modrm(3u, dst, src.reg); + } else { + n += x64_pack_rex(buf + n, width == 8u, dst, 0, src.base); + buf[n++] = X64_OPC_TWOBYTE; + buf[n++] = opcode2; + n += x64_pack_mem(buf + n, dst, src.base, src.disp); + } + emit_packed(mc, buf, n); +} + +/* ==================================================================== + * Descriptor-driven mnemonic dispatch. + * + * The disassembler's `x64_insn_table` (src/arch/x64/isa.c) lists every + * encoding cfree emits with its X64Format. We reuse the SAME table for + * the assembler: linear-scan to find the row whose mnemonic matches the + * user's AT&T spelling (after stripping the size suffix b/w/l/q), then + * dispatch to a per-format parser that consumes the operands and calls + * the existing `emit_*` helpers in emit.c. + * + * The width comes from the suffix (or the row's width flags); per-format + * parsers receive it via a small X64ParseCtx so they can pick the right + * emit overload (e.g., MOV r,r at 32 vs 64 bits). + * + * Note: a single mnemonic ("mov") has multiple table rows for different + * formats (MOV_RI, ALU_RR, MOV_RM_LOAD). We return the FIRST row that + * matches the mnemonic + width filter; per-format parsers that need a + * different row (e.g., MOV imm→reg uses MOV_RI but our scan may have + * returned ALU_RR first) fall through to operand-kind dispatch and + * select the correct emit helper directly. Phase 1 of this refactor + * only exercises the mnemonics asm.c handled before; richer disambiguation + * lands in follow-ups. 
*/ + +#define X64_SFX_NONE 0u +#define X64_SFX_B 1u +#define X64_SFX_W 2u +#define X64_SFX_L 4u +#define X64_SFX_Q 8u + +typedef struct X64MnInfo { + char base[16]; /* stripped mnemonic (table-spelling) */ + u32 base_len; + u32 width; /* X64_SFX_* — 0 if mnemonic carries no size letter */ + u32 cc; /* condition nibble for jcc/cmovcc/setcc, or 16 if none */ +} X64MnInfo; + +/* Parse the user-supplied mnemonic into (root, width, cc). Handles: + * - trailing size letter (b/w/l/q) when the table mnemonic has none + * - jXX → ("j", cc) + * - cmovXX[q|l|w|b] → ("cmov", cc, width) + * - setXX → ("set", cc) + * - exact-match mnemonics carried verbatim (movslq, movzbl, ud2, ...) */ +static int parse_mnemonic(const char* s, size_t n, X64MnInfo* out) { + static const struct { const char* name; u8 cc; } kCC[] = { + /* Two-letter codes first so e.g. "ne" beats "n" if we ever add it. */ + {"ae", X64_CC_AE}, {"be", X64_CC_BE}, {"ge", X64_CC_GE}, + {"le", X64_CC_LE}, {"ne", X64_CC_NE}, {"no", X64_CC_NO}, + {"np", X64_CC_NP}, {"ns", X64_CC_NS}, + {"a", X64_CC_A}, {"b", X64_CC_B}, {"e", X64_CC_E}, + {"g", X64_CC_G}, {"l", X64_CC_L}, {"o", X64_CC_O}, + {"p", X64_CC_P}, {"s", X64_CC_S}, + }; + out->base_len = 0; + out->width = X64_SFX_NONE; + out->cc = 16u; + if (n == 0 || n >= sizeof out->base) return 0; + memcpy(out->base, s, n); + out->base[n] = '\0'; + + /* Exact-match mnemonics that carry their own width letters or are + * already canonical table spellings. 
*/ + if (n >= 6 && memcmp(s, "movslq", 6) == 0) { + memcpy(out->base, "movslq", 6); out->base_len = 6; out->width = X64_SFX_Q; + return 1; + } + if (n >= 6 && + (memcmp(s, "movzbl", 6) == 0 || memcmp(s, "movzwl", 6) == 0 || + memcmp(s, "movsbl", 6) == 0 || memcmp(s, "movswl", 6) == 0)) { + memcpy(out->base, s, 6); out->base_len = 6; + return 1; + } + if (n == 3 && memcmp(s, "ud2", 3) == 0) { + out->base_len = 3; return 1; + } + if (n == 3 && memcmp(s, "nop", 3) == 0) { + out->base_len = 3; return 1; + } + if (n == 3 && memcmp(s, "ret", 3) == 0) { + out->base_len = 3; return 1; + } + + /* Indirect-branch spellings carry an explicit 'q' suffix that must be + * preserved — the BR_RM rows in the table are keyed on "jmpq"/"callq". */ + if (n == 4 && memcmp(s, "call", 4) == 0) { + memcpy(out->base, "callq", 5); + out->base[5] = '\0'; + out->base_len = 5; + out->width = X64_SFX_Q; + return 1; + } + if (n == 4 && memcmp(s, "jmpq", 4) == 0) { + out->base_len = 4; out->width = X64_SFX_Q; return 1; + } + if (n == 5 && memcmp(s, "callq", 5) == 0) { + out->base_len = 5; out->width = X64_SFX_Q; return 1; + } + + /* CMOVcc: cmov<cc>[suffix]. Strip optional trailing q/l/w/b first. */ + if (n >= 5 && memcmp(s, "cmov", 4) == 0) { + size_t after = 4; + size_t tail = n; + char last = s[n - 1]; + if (last == 'b' || last == 'w' || last == 'l' || last == 'q') { + out->width = (last == 'b') ? X64_SFX_B + : (last == 'w') ? X64_SFX_W + : (last == 'l') ? X64_SFX_L + : X64_SFX_Q; + tail = n - 1; + } + if (tail > after) { + size_t cc_n = tail - after; + for (size_t i = 0; i < sizeof kCC / sizeof kCC[0]; ++i) { + size_t kn = strlen(kCC[i].name); + if (cc_n == kn && memcmp(s + after, kCC[i].name, kn) == 0) { + out->cc = kCC[i].cc; + memcpy(out->base, "cmov", 4); + out->base[4] = '\0'; + out->base_len = 4; + return 1; + } + } + } + } + + /* SETcc: set<cc>. 
*/ + if (n > 3 && memcmp(s, "set", 3) == 0) { + size_t cc_n = n - 3; + for (size_t i = 0; i < sizeof kCC / sizeof kCC[0]; ++i) { + size_t kn = strlen(kCC[i].name); + if (cc_n == kn && memcmp(s + 3, kCC[i].name, kn) == 0) { + out->cc = kCC[i].cc; + memcpy(out->base, "set", 3); + out->base[3] = '\0'; + out->base_len = 3; + return 1; + } + } + } + + /* Jcc: j<cc> — but NOT "jmp" / "jmpq" (handled above). */ + if (n > 1 && s[0] == 'j' && !(n >= 3 && s[1] == 'm' && s[2] == 'p')) { + size_t cc_n = n - 1; + for (size_t i = 0; i < sizeof kCC / sizeof kCC[0]; ++i) { + size_t kn = strlen(kCC[i].name); + if (cc_n == kn && memcmp(s + 1, kCC[i].name, kn) == 0) { + out->cc = kCC[i].cc; + out->base[0] = 'j'; + out->base[1] = '\0'; + out->base_len = 1; + return 1; + } + } + } + + /* Generic: strip trailing size letter b/w/l/q. */ + { + char last = s[n - 1]; + if (last == 'b' || last == 'w' || last == 'l' || last == 'q') { + out->width = (last == 'b') ? X64_SFX_B + : (last == 'w') ? X64_SFX_W + : (last == 'l') ? X64_SFX_L + : X64_SFX_Q; + out->base_len = (u32)(n - 1); + out->base[out->base_len] = '\0'; + return 1; + } + } + + out->base_len = (u32)n; + return 1; +} + +/* Width implied by a descriptor row, given the mnemonic's parsed width. */ +static u32 row_implied_width(const X64InsnDesc* d) { + if (d->flags & X64_ASMFL_FORCE_W64) return 8u; + if (d->flags & X64_ASMFL_BYTE) return 1u; + if (d->flags & X64_ASMFL_W16) return 2u; + if (d->flags & X64_ASMFL_W_FROM_REX) return 0u; /* any */ + if (d->leg_pfx == X64_PFX_66) return 2u; + return 0u; /* any */ +} + +/* Linear scan for the first table row whose mnemonic matches `info->base` + * AND whose width filter is compatible. Returns NULL on miss. 
*/ +static const X64InsnDesc* find_mnemonic_row(const X64MnInfo* info) { + u32 want_w = info->width; /* 0 = any */ + for (u32 i = 0; i < x64_insn_table_n; ++i) { + const X64InsnDesc* d = &x64_insn_table[i]; + size_t mn = strlen(d->mnemonic); + if (mn != info->base_len) continue; + if (memcmp(d->mnemonic, info->base, mn) != 0) continue; + if (want_w != 0) { + u32 rw = row_implied_width(d); + if (rw != 0 && rw != want_w) continue; + } + return d; + } + return NULL; +} + +/* Per-format parse context. */ +typedef struct X64ParseCtx { + AsmDriver* d; + MCEmitter* mc; + const X64InsnDesc* desc; + u32 width; /* 1/2/4/8 — derived from suffix or row */ + u32 cc; /* condition nibble (jcc/cmovcc/setcc); 16 if unused */ +} X64ParseCtx; + +/* w-bit for emit_rex / emit_alu_rr / emit_mov_rr etc. */ +static int width_to_w(u32 w) { return w == 8u ? 1 : 0; } + +/* ---- per-format parsers ---- */ + +static void parse_nullary(X64ParseCtx* p) { + /* nop / ret / ud2 / leave / cltd / cqto. */ + u8 buf[4]; + u32 n = 0; + if (p->desc->leg_pfx) buf[n++] = p->desc->leg_pfx; + if (p->desc->rex_w_req == X64_W_REQ_1) buf[n++] = X64_REX_BASE | X64_REX_W; + for (u32 i = 0; i < p->desc->opc_len; ++i) buf[n++] = p->desc->opc[i]; + if (p->desc->opc_len >= 1u) { + p->mc->emit_bytes(p->mc, buf, n); + return; + } + asm_driver_panic(p->d, "x64 asm: nullary form not implemented"); +} + +static void parse_br_rm(X64ParseCtx* p) { + /* jmpq *%reg or callq *%reg. /digit picks sub (2 = call, 4 = jmp). */ + X64AsmOperand op = parse_operand(p->d); + if (op.kind != X64_ASM_OP_IND_REG) + asm_driver_panic(p->d, "x64 asm: indirect branch form"); + emit_indirect_branch(p->mc, p->desc->modrm_reg, op.reg); +} + +/* Look up the ALU_RM_IMM8 row for a given mnemonic root; the /digit + * picks the operation (0=add, 1=or, 4=and, 5=sub, 6=xor, 7=cmp). 
*/ +static const X64InsnDesc* find_alu_imm_row(const char* root, u32 root_len) { + for (u32 i = 0; i < x64_insn_table_n; ++i) { + const X64InsnDesc* d = &x64_insn_table[i]; + if (d->fmt != X64_FMT_ALU_RM_IMM8) continue; + if (strlen(d->mnemonic) != root_len) continue; + if (memcmp(d->mnemonic, root, root_len) != 0) continue; + return d; + } + return NULL; +} + +static void parse_alu_rr(X64ParseCtx* p) { + /* op src, dst in AT&T. Row's opc[0] is the ALU opcode (0x01/0x09/... + * 0x31/0x85/0x89). The byte/16-bit forms are handled by the + * existing emit.c helpers for w=0/w=1 + size suffix; here phase-1 + * supports only the regular 32/64 forms used by the prior asm.c. */ X64AsmOperand src; X64AsmOperand dst; - (void)a; - (void)asm_driver_cur_section(d); + src = parse_operand(p->d); + expect_comma(p->d); + dst = parse_operand(p->d); + + /* Immediate source → not an ALU_RR encoding. Redirect to the + * ALU_RM_IMM row for this mnemonic. */ + if (src.kind == X64_ASM_OP_IMM && dst.kind == X64_ASM_OP_REG) { + const X64InsnDesc* imm_row = + find_alu_imm_row(p->desc->mnemonic, + (u32)strlen(p->desc->mnemonic)); + if (!imm_row) asm_driver_panic(p->d, "x64 asm: no alu-imm row"); + if (imm_fits_i8(src.imm)) + emit_alu_imm8(p->mc, width_to_w(p->width), imm_row->modrm_reg, dst.reg, + (i8)src.imm); + else if (imm_fits_i32(src.imm)) + emit_alu_imm32(p->mc, width_to_w(p->width), imm_row->modrm_reg, dst.reg, + (i32)src.imm); + else + asm_driver_panic(p->d, "x64 asm: alu-imm out of range"); + return; + } - if (sym_eq(d, mnemonic, "nop")) { - u8 op = X64_NOP1; - mc->emit_bytes(mc, &op, 1); + if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_REG) { + u8 op = p->desc->opc[0]; + if (p->width == 2u) { + u8 pfx = X64_OPSIZE_PFX; + p->mc->emit_bytes(p->mc, &pfx, 1); + } + if (op == 0x89u) { + /* MOV r/m, r — phase-1 keeps the existing helper. 
*/ + emit_mov_rr(p->mc, width_to_w(p->width), dst.reg, src.reg); + return; + } + if (op == 0x88u) { + emit_movb_rr_operand(p->d, p->mc, dst, src); + return; + } + if (op == 0x88u) { + /* MOV r/m8, r8 — byte form (preserved from prior asm.c). */ + u8 ob = 0x88; + emit_rex(p->mc, 0, src.reg, 0, dst.reg); + p->mc->emit_bytes(p->mc, &ob, 1); + { + u8 mr = modrm(3u, src.reg, dst.reg); + p->mc->emit_bytes(p->mc, &mr, 1); + } + return; + } + /* xor/test/and/... — emit_alu_rr handles the generic shape. */ + emit_alu_rr(p->mc, width_to_w(p->width), op, dst.reg, src.reg); return; } - if (sym_eq(d, mnemonic, "ret")) { - emit_ret(mc); + /* MOV r, r/m goes through MOV_RM_LOAD; MOV r, m goes through + * MOV_RM_LOAD (load) or ALU_RR with mem dst (store). We handle the + * store side here only when the mnemonic is "mov" (opc 0x89). */ + if (p->desc->opc[0] == 0x89u && + src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_MEM) { + if (p->width == 1u) + emit_movb_store_operand(p->d, p->mc, src, dst); + else + emit_mov_store(p->mc, p->width, src.reg, dst.base, dst.disp); return; } - if (sym_eq(d, mnemonic, "ud2")) { - emit_ud2(mc); + if (p->desc->opc[0] == 0x89u && + src.kind == X64_ASM_OP_MEM && dst.kind == X64_ASM_OP_REG) { + emit_mov_load(p->mc, p->width, 0, dst.reg, src.base, src.disp); return; } + asm_driver_panic(p->d, "x64 asm: unsupported alu_rr form"); +} + +static void parse_mov_ri(X64ParseCtx* p) { + X64AsmOperand src; + X64AsmOperand dst; + src = parse_operand(p->d); + expect_comma(p->d); + dst = parse_operand(p->d); + if (src.kind != X64_ASM_OP_IMM || dst.kind != X64_ASM_OP_REG) + asm_driver_panic(p->d, "x64 asm: mov-imm form"); + if (p->width != 4u && p->width != 8u) + asm_driver_panic(p->d, "x64 asm: mov imm only supports l/q forms"); + x64_emit_load_imm(p->mc, p->width == 8u ? 
1 : 0, dst.reg, src.imm); +} - src = parse_operand(d); - if (sym_eq(d, mnemonic, "jmpq")) { - if (src.kind != X64_ASM_OP_IND_REG) - asm_driver_panic(d, "x64 asm: jmpq form"); - emit_indirect_branch(mc, 4u, src.reg); +static void parse_mov_rm_load(X64ParseCtx* p) { + /* MOV r, r/m (0x8B) or LEA r, m (0x8D). AT&T order is src, dst. + * Phase-1 covers reg-reg, reg←mem (load) and lea. */ + X64AsmOperand src; + X64AsmOperand dst; + src = parse_operand(p->d); + expect_comma(p->d); + dst = parse_operand(p->d); + if (p->desc->opc[0] == 0x8Du) { + if (src.kind != X64_ASM_OP_MEM || dst.kind != X64_ASM_OP_REG) + asm_driver_panic(p->d, "x64 asm: lea form"); + emit_lea(p->mc, dst.reg, src.base, src.disp); + return; + } + if (src.kind == X64_ASM_OP_MEM && dst.kind == X64_ASM_OP_REG) { + if (p->width == 2u) { + u8 buf[16]; + u32 n = x64_mov_rm_load_pack( + (X64MovRMLoad){.w = 0, .opc0 = X64_OPC_MOV_R_RM, .dst = dst.reg, + .base = src.base, .disp = src.disp}, + buf + 1); + buf[0] = X64_OPSIZE_PFX; + emit_packed(p->mc, buf, n + 1u); + } else { + emit_mov_load(p->mc, p->width, 0, dst.reg, src.base, src.disp); + } return; } - if (sym_eq(d, mnemonic, "callq")) { - if (src.kind != X64_ASM_OP_IND_REG) - asm_driver_panic(d, "x64 asm: callq form"); - emit_indirect_branch(mc, 2u, src.reg); + if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_REG) { + if (p->width == 2u) { + u8 pfx = X64_OPSIZE_PFX; + p->mc->emit_bytes(p->mc, &pfx, 1); + } + emit_mov_rr(p->mc, width_to_w(p->width), dst.reg, src.reg); return; } + asm_driver_panic(p->d, "x64 asm: mov-load form"); +} + +static void parse_movsxd(X64ParseCtx* p) { + X64AsmOperand src; + X64AsmOperand dst; + src = parse_operand(p->d); + expect_comma(p->d); + dst = parse_operand(p->d); + if (dst.kind != X64_ASM_OP_REG) + asm_driver_panic(p->d, "x64 asm: movslq form"); + if (src.kind == X64_ASM_OP_REG) { + emit_extend_rr(p->mc, 1, 1, 4, dst.reg, src.reg); + } else if (src.kind == X64_ASM_OP_MEM) { + u8 buf[16]; + u32 n = x64_mov_rm_load_pack( 
+ (X64MovRMLoad){.w = 1, .opc0 = X64_OPC_MOVSXD, .dst = dst.reg, + .base = src.base, .disp = src.disp}, + buf); + emit_packed(p->mc, buf, n); + } else { + asm_driver_panic(p->d, "x64 asm: movslq source"); + } +} - expect_comma(d); - dst = parse_operand(d); +static void parse_alu_rm_imm(X64ParseCtx* p) { + X64AsmOperand src; + X64AsmOperand dst; + src = parse_operand(p->d); + expect_comma(p->d); + dst = parse_operand(p->d); + if (src.kind != X64_ASM_OP_IMM || dst.kind != X64_ASM_OP_REG) + asm_driver_panic(p->d, "x64 asm: alu-imm form"); + if (imm_fits_i8(src.imm)) + emit_alu_imm8(p->mc, width_to_w(p->width), p->desc->modrm_reg, dst.reg, + (i8)src.imm); + else if (imm_fits_i32(src.imm)) + emit_alu_imm32(p->mc, width_to_w(p->width), p->desc->modrm_reg, dst.reg, + (i32)src.imm); + else + asm_driver_panic(p->d, "x64 asm: alu-imm out of range"); +} - if (sym_eq(d, mnemonic, "movq")) { - if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_MEM) { - emit_mov_store(mc, 8, src.reg, dst.base, dst.disp); - return; +static void parse_cmovcc(X64ParseCtx* p) { + X64AsmOperand src; + X64AsmOperand dst; + src = parse_operand(p->d); + expect_comma(p->d); + dst = parse_operand(p->d); + if (src.kind != X64_ASM_OP_REG || dst.kind != X64_ASM_OP_REG) + asm_driver_panic(p->d, "x64 asm: cmovcc form"); + { + u8 op[2] = {0x0f, (u8)(0x40u | (p->cc & 0xfu))}; + if (p->width == 2u) { + u8 pfx = X64_OPSIZE_PFX; + p->mc->emit_bytes(p->mc, &pfx, 1); } - if (src.kind == X64_ASM_OP_MEM && dst.kind == X64_ASM_OP_REG) { - emit_mov_load(mc, 8, 0, dst.reg, src.base, src.disp); + emit_rex(p->mc, width_to_w(p->width), dst.reg, 0, src.reg); + p->mc->emit_bytes(p->mc, op, 2); + emit_rm_reg(p->mc, dst.reg, src.reg); + } +} + +static void parse_push_pop(X64ParseCtx* p) { + X64AsmOperand op = parse_operand(p->d); + u8 base = p->desc->opc[0]; + u8 ob; + if (op.kind != X64_ASM_OP_REG) asm_driver_panic(p->d, "x64 asm: push/pop register"); + emit_rex(p->mc, 0, 0, 0, op.reg); + ob = (u8)(base | (op.reg & 7u)); 
+ p->mc->emit_bytes(p->mc, &ob, 1); +} + +static void parse_movzx_movsx(X64ParseCtx* p) { + X64AsmOperand src = parse_operand(p->d); + X64AsmOperand dst; + expect_comma(p->d); + dst = parse_operand(p->d); + if (dst.kind != X64_ASM_OP_REG) asm_driver_panic(p->d, "x64 asm: movx dst register"); + emit_reg_rm_twobyte(p->d, p->mc, 4u, p->desc->opc[1], dst.reg, src, + p->desc->opc[1] == X64_OPC_MOVZX_B || + p->desc->opc[1] == X64_OPC_MOVSX_B, + 0); +} + +static void parse_imul_rr(X64ParseCtx* p) { + X64AsmOperand src = parse_operand(p->d); + X64AsmOperand dst; + if (src.kind == X64_ASM_OP_IMM) { + X64AsmOperand real_src; + expect_comma(p->d); + real_src = parse_operand(p->d); + expect_comma(p->d); + dst = parse_operand(p->d); + if (dst.kind != X64_ASM_OP_REG) asm_driver_panic(p->d, "x64 asm: imul dst register"); + if (real_src.kind == X64_ASM_OP_REG) { + if (imm_fits_i8(src.imm)) + emit_imul_imm8(p->mc, width_to_w(p->width), dst.reg, real_src.reg, + (i8)src.imm); + else if (imm_fits_i32(src.imm)) + emit_imul_imm32(p->mc, width_to_w(p->width), dst.reg, real_src.reg, + (i32)src.imm); + else + asm_driver_panic(p->d, "x64 asm: imul imm out of range"); return; } - if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_REG) { - emit_mov_rr(mc, 1, dst.reg, src.reg); + if (real_src.kind == X64_ASM_OP_MEM) { + u8 buf[16]; + u32 n = 0; + int imm32 = !imm_fits_i8(src.imm); + if (imm32 && !imm_fits_i32(src.imm)) + asm_driver_panic(p->d, "x64 asm: imul imm out of range"); + n += x64_pack_rex(buf + n, width_to_w(p->width), dst.reg, 0, + real_src.base); + buf[n++] = imm32 ? 
X64_OPC_IMUL_IMM32 : X64_OPC_IMUL_IMM8; + n += x64_pack_mem(buf + n, dst.reg, real_src.base, real_src.disp); + if (imm32) + n += x64_put_u32le(buf + n, (u32)(i32)src.imm); + else + buf[n++] = (u8)(i8)src.imm; + emit_packed(p->mc, buf, n); return; } - } else if (sym_eq(d, mnemonic, "movl")) { - if (src.kind == X64_ASM_OP_IMM && dst.kind == X64_ASM_OP_REG) { - x64_emit_load_imm(mc, 0, dst.reg, src.imm); - return; + asm_driver_panic(p->d, "x64 asm: imul source"); + } + expect_comma(p->d); + dst = parse_operand(p->d); + if (dst.kind != X64_ASM_OP_REG) asm_driver_panic(p->d, "x64 asm: imul dst register"); + emit_reg_rm_twobyte(p->d, p->mc, p->width, X64_OPC_IMUL_2B, dst.reg, src, + 0, 0); +} + +static void parse_imul_rri(X64ParseCtx* p) { + X64AsmOperand imm = parse_operand(p->d); + X64AsmOperand src; + X64AsmOperand dst; + if (imm.kind != X64_ASM_OP_IMM) asm_driver_panic(p->d, "x64 asm: imul imm"); + expect_comma(p->d); + src = parse_operand(p->d); + expect_comma(p->d); + dst = parse_operand(p->d); + if (dst.kind != X64_ASM_OP_REG) asm_driver_panic(p->d, "x64 asm: imul dst register"); + if (src.kind == X64_ASM_OP_REG) { + if (p->desc->opc[0] == X64_OPC_IMUL_IMM8 || imm_fits_i8(imm.imm)) + emit_imul_imm8(p->mc, width_to_w(p->width), dst.reg, src.reg, + (i8)imm.imm); + else if (imm_fits_i32(imm.imm)) + emit_imul_imm32(p->mc, width_to_w(p->width), dst.reg, src.reg, + (i32)imm.imm); + else + asm_driver_panic(p->d, "x64 asm: imul imm out of range"); + return; + } + if (src.kind == X64_ASM_OP_MEM) { + u8 buf[16]; + u32 n = 0; + int imm32 = !(p->desc->opc[0] == X64_OPC_IMUL_IMM8 || imm_fits_i8(imm.imm)); + if (imm32 && !imm_fits_i32(imm.imm)) + asm_driver_panic(p->d, "x64 asm: imul imm out of range"); + n += x64_pack_rex(buf + n, width_to_w(p->width), dst.reg, 0, src.base); + buf[n++] = imm32 ? 
X64_OPC_IMUL_IMM32 : X64_OPC_IMUL_IMM8; + n += x64_pack_mem(buf + n, dst.reg, src.base, src.disp); + if (imm32) + n += x64_put_u32le(buf + n, (u32)(i32)imm.imm); + else + buf[n++] = (u8)(i8)imm.imm; + emit_packed(p->mc, buf, n); + return; + } + asm_driver_panic(p->d, "x64 asm: imul source"); +} + +static void parse_f7_rm(X64ParseCtx* p) { + X64AsmOperand op = parse_operand(p->d); + emit_rm_op(p->d, p->mc, p->width, X64_OPC_F7, p->desc->modrm_reg, op); +} + +static void parse_shift(X64ParseCtx* p) { + X64AsmOperand src = parse_operand(p->d); + X64AsmOperand dst; + expect_comma(p->d); + dst = parse_operand(p->d); + if (src.kind == X64_ASM_OP_REG && src.reg == X64_RCX && src.width == 1u) { + emit_rm_op(p->d, p->mc, p->width, X64_OPC_SHIFT_CL, p->desc->modrm_reg, + dst); + return; + } + if (src.kind != X64_ASM_OP_IMM) asm_driver_panic(p->d, "x64 asm: shift imm"); + emit_rm_imm(p->d, p->mc, p->width, X64_OPC_SHIFT_IMM, p->desc->modrm_reg, + dst, (i32)src.imm, 0); +} + +static void parse_rel32_branch(X64ParseCtx* p) { + ObjSymId sym = OBJ_SYM_NONE; + i64 off = 0; + u32 disp_pos; + if (p->desc->fmt == X64_FMT_JCC_REL32) { + u8 op[2] = {0x0f, (u8)(0x80u | (p->cc & 0xfu))}; + p->mc->emit_bytes(p->mc, op, 2); + } else { + u8 op = (p->desc->fmt == X64_FMT_CALL_REL32) ? X64_OPC_CALL_REL32 + : X64_OPC_JMP_REL32; + p->mc->emit_bytes(p->mc, &op, 1); + } + disp_pos = p->mc->pos(p->mc); + emit_u32le(p->mc, 0); + asm_driver_parse_sym_expr(p->d, &sym, &off); + if (sym == OBJ_SYM_NONE) asm_driver_panic(p->d, "x64 asm: symbolic branch target required"); + p->mc->emit_reloc_at(p->mc, asm_driver_cur_section(p->d), disp_pos, + p->desc->fmt == X64_FMT_CALL_REL32 ? 
R_X64_PLT32 + : R_PC32, + sym, off - 4, 1, 0); +} + +static void parse_setcc(X64ParseCtx* p) { + X64AsmOperand dst = parse_operand(p->d); + if (dst.kind == X64_ASM_OP_REG) { + if (dst.high8) { + u8 op[2] = {0x0f, (u8)(0x90u | (p->cc & 0xfu))}; + p->mc->emit_bytes(p->mc, op, 2); + emit_rm_reg(p->mc, 0, dst.reg); + } else { + emit_setcc(p->mc, p->cc, dst.reg); } - } else if (sym_eq(d, mnemonic, "movslq")) { - if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_REG) { - emit_extend_rr(mc, 1, 1, 4, dst.reg, src.reg); + return; + } + if (dst.kind == X64_ASM_OP_MEM) { + u8 buf[16]; + u32 n = x64_pack_rex(buf, 0, 0, 0, dst.base); + buf[n++] = X64_OPC_TWOBYTE; + buf[n++] = (u8)(X64_OPC_SETCC_BASE | (p->cc & 0xfu)); + n += x64_pack_mem(buf + n, 0, dst.base, dst.disp); + emit_packed(p->mc, buf, n); + return; + } + asm_driver_panic(p->d, "x64 asm: setcc destination"); +} + +static void parse_sse_rr(X64ParseCtx* p) { + X64AsmOperand src = parse_operand(p->d); + X64AsmOperand dst; + int cvt_to_int = p->desc->opc[1] == 0x2cu; + int cvt_from_int = p->desc->opc[1] == 0x2au; + expect_comma(p->d); + dst = parse_operand(p->d); + if (cvt_to_int) { + if (dst.kind != X64_ASM_OP_REG) + asm_driver_panic(p->d, "x64 asm: cvtt dst register"); + if (src.kind == X64_ASM_OP_XMM) + emit_sse_rr_w(p->mc, p->desc->leg_pfx, p->desc->opc[1], + width_to_w(p->width), dst.reg, src.reg); + else if (src.kind == X64_ASM_OP_MEM) + emit_reg_rm_twobyte(p->d, p->mc, p->width, p->desc->opc[1], dst.reg, + src, 0, p->desc->leg_pfx); + else + asm_driver_panic(p->d, "x64 asm: cvtt source"); + return; + } + if (cvt_from_int) { + if (dst.kind != X64_ASM_OP_XMM) + asm_driver_panic(p->d, "x64 asm: cvtsi dst xmm"); + if (src.kind == X64_ASM_OP_REG) + emit_sse_rr_w(p->mc, p->desc->leg_pfx, p->desc->opc[1], + width_to_w(p->width), dst.reg, src.reg); + else if (src.kind == X64_ASM_OP_MEM) + emit_sse_load(p->mc, p->desc->leg_pfx, p->desc->opc[1], dst.reg, + src.base, src.disp); + else + asm_driver_panic(p->d, "x64 asm: 
cvtsi source"); + return; + } + if (dst.kind == X64_ASM_OP_MEM && src.kind == X64_ASM_OP_XMM && + p->desc->opc[1] == 0x10u && + (!strcmp(p->desc->mnemonic, "movsd") || + !strcmp(p->desc->mnemonic, "movss"))) { + emit_sse_store(p->mc, p->desc->leg_pfx, 0x11, src.reg, dst.base, + dst.disp); + return; + } + if (dst.kind != X64_ASM_OP_XMM) asm_driver_panic(p->d, "x64 asm: sse dst xmm"); + if (src.kind == X64_ASM_OP_XMM) + emit_sse_rr(p->mc, p->desc->leg_pfx, p->desc->opc[1], dst.reg, src.reg); + else if (src.kind == X64_ASM_OP_MEM) + emit_sse_load(p->mc, p->desc->leg_pfx, p->desc->opc[1], dst.reg, src.base, + src.disp); + else + asm_driver_panic(p->d, "x64 asm: sse source"); +} + +static void parse_bswap(X64ParseCtx* p) { + X64AsmOperand reg = parse_operand(p->d); + u8 op[2]; + if (reg.kind != X64_ASM_OP_REG) asm_driver_panic(p->d, "x64 asm: bswap reg"); + emit_rex(p->mc, width_to_w(p->width), 0, 0, reg.reg); + op[0] = 0x0f; + op[1] = (u8)(0xc8u | (reg.reg & 7u)); + p->mc->emit_bytes(p->mc, op, 2); +} + +static void parse_bs_popcnt(X64ParseCtx* p) { + X64AsmOperand src = parse_operand(p->d); + X64AsmOperand dst; + expect_comma(p->d); + dst = parse_operand(p->d); + if (dst.kind != X64_ASM_OP_REG) asm_driver_panic(p->d, "x64 asm: bit-scan dst register"); + emit_reg_rm_twobyte(p->d, p->mc, p->width, p->desc->opc[1], dst.reg, src, 0, + p->desc->leg_pfx); +} + +static void parse_atomic(X64ParseCtx* p) { + X64AsmOperand src = parse_operand(p->d); + X64AsmOperand dst; + u8 buf[16]; + u32 n = 0; + expect_comma(p->d); + dst = parse_operand(p->d); + if (src.kind != X64_ASM_OP_REG || + (dst.kind != X64_ASM_OP_REG && dst.kind != X64_ASM_OP_MEM)) + asm_driver_panic(p->d, "x64 asm: atomic form"); + n += x64_pack_rex(buf + n, width_to_w(p->width), src.reg, 0, + dst.kind == X64_ASM_OP_REG ? 
dst.reg : dst.base); + if (p->desc->opc_len == 2) { + buf[n++] = X64_OPC_TWOBYTE; + buf[n++] = p->desc->opc[1]; + } else { + buf[n++] = p->desc->opc[0]; + } + if (dst.kind == X64_ASM_OP_REG) + buf[n++] = x64_modrm(3u, src.reg, dst.reg); + else + n += x64_pack_mem(buf + n, src.reg, dst.base, dst.disp); + emit_packed(p->mc, buf, n); +} + +static void parse_nop_multi(X64ParseCtx* p) { + u8 nop6[6] = {X64_NOP6_BYTE0, X64_NOP6_BYTE1, X64_NOP6_BYTE2, + X64_NOP6_BYTE3, X64_NOP6_BYTE4, X64_NOP6_BYTE5}; + p->mc->emit_bytes(p->mc, nop6, sizeof nop6); +} + +static void parse_and_emit_for_format(X64ParseCtx* p) { + switch ((X64Format)p->desc->fmt) { + case X64_FMT_NULLARY: + parse_nullary(p); return; - } - } else if (sym_eq(d, mnemonic, "leaq")) { - if (src.kind == X64_ASM_OP_MEM && dst.kind == X64_ASM_OP_REG) { - emit_lea(mc, dst.reg, src.base, src.disp); + case X64_FMT_NOP_MULTI: + parse_nop_multi(p); return; - } - } else if (sym_eq(d, mnemonic, "xorl")) { - if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_REG) { - emit_alu_rr(mc, 0, 0x31, dst.reg, src.reg); + case X64_FMT_PUSH_POP: + parse_push_pop(p); return; - } - } else if (sym_eq(d, mnemonic, "testq")) { - if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_REG) { - emit_alu_rr(mc, 1, 0x85, dst.reg, src.reg); + case X64_FMT_BR_RM: + parse_br_rm(p); return; - } - } else if (sym_eq(d, mnemonic, "cmoveq")) { - if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_REG) { - emit_cmov_eq(mc, dst.reg, src.reg); + case X64_FMT_ALU_RR: + parse_alu_rr(p); return; - } - } else if (sym_eq(d, mnemonic, "andq")) { - if (src.kind == X64_ASM_OP_IMM && dst.kind == X64_ASM_OP_REG) { - if (imm_fits_i8(src.imm)) - emit_alu_imm8(mc, 1, 4u, dst.reg, (i8)src.imm); - else if (imm_fits_i32(src.imm)) - emit_alu_imm32(mc, 1, 4u, dst.reg, (i32)src.imm); - else - asm_driver_panic(d, "x64 asm: andq immediate out of range"); + case X64_FMT_MOV_RI: + parse_mov_ri(p); + return; + case X64_FMT_MOV_RM_LOAD: + parse_mov_rm_load(p); + 
return; + case X64_FMT_MOVZX_MOVSX: + parse_movzx_movsx(p); + return; + case X64_FMT_MOVSXD: + parse_movsxd(p); + return; + case X64_FMT_ALU_RM_IMM8: + case X64_FMT_ALU_RM_IMM32: + parse_alu_rm_imm(p); + return; + case X64_FMT_CMOVCC_RR: + parse_cmovcc(p); + return; + case X64_FMT_IMUL_RR: + parse_imul_rr(p); + return; + case X64_FMT_IMUL_RRI: + parse_imul_rri(p); return; + case X64_FMT_F7_RM: + parse_f7_rm(p); + return; + case X64_FMT_SHIFT_IMM: + case X64_FMT_SHIFT_CL: + parse_shift(p); + return; + case X64_FMT_JCC_REL32: + case X64_FMT_JMP_REL32: + case X64_FMT_CALL_REL32: + parse_rel32_branch(p); + return; + case X64_FMT_SETCC_RM: + parse_setcc(p); + return; + case X64_FMT_SSE_RR: + case X64_FMT_SSE_LOAD: + case X64_FMT_SSE_STORE: + parse_sse_rr(p); + return; + case X64_FMT_BSWAP: + parse_bswap(p); + return; + case X64_FMT_BS: + case X64_FMT_POPCNT: + parse_bs_popcnt(p); + return; + case X64_FMT_XADD_MEM: + case X64_FMT_XCHG_MEM: + case X64_FMT_CMPXCHG_MEM: + parse_atomic(p); + return; + default: + asm_driver_panic(p->d, "x64 asm: format not implemented"); + } +} + +/* Width letter (b/w/l/q) → width in bytes. Falls back to row-implied + * width if the suffix is absent. */ +static u32 width_from_info(const X64MnInfo* info, const X64InsnDesc* d) { + if (info->width != 0) return info->width; + { + u32 rw = row_implied_width(d); + return rw ? 
rw : 4u; + } +} + +static void x64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) { + X64Asm* a = (X64Asm*)base; + MCEmitter* mc = asm_driver_mc(d); + size_t n = 0; + const char* p = pool_str(asm_driver_pool(d), mnemonic, &n); + X64MnInfo info; + const X64InsnDesc* desc; + X64ParseCtx ctx; + (void)a; + (void)asm_driver_cur_section(d); + + if (!p || !parse_mnemonic(p, n, &info)) + asm_driver_panic(d, "x64 asm: bad mnemonic"); + + if (n == 4 && memcmp(p, "lock", 4) == 0) { + AsmTok next; + u8 pfx = 0xf0; + mc->emit_bytes(mc, &pfx, 1); + next = asm_driver_next(d); + if (next.kind != ASM_TOK_IDENT) + asm_driver_panic(d, "x64 asm: lock requires an instruction"); + x64_arch_asm_insn(base, d, next.v.ident); + return; + } + + /* Special case: imm→reg "mov" still spelled "movl"/"movq" but the + * generic scan returns ALU_RR (0x89) first. When we see a "$" + * immediate as the first operand, we want MOV_RI instead. Defer this + * disambiguation to parse_alu_rr would force pre-parsing operands; + * simpler is to special-case MOV here. */ + if (info.base_len == 3 && memcmp(info.base, "mov", 3) == 0) { + /* Peek for leading '$' → immediate form. */ + AsmTok t = asm_driver_peek(d); + if (asm_driver_tok_is_punct(t, '$')) { + /* Find the MOV_RI row. */ + for (u32 i = 0; i < x64_insn_table_n; ++i) { + const X64InsnDesc* dr = &x64_insn_table[i]; + if (dr->fmt == X64_FMT_MOV_RI && + strlen(dr->mnemonic) == 3 && + memcmp(dr->mnemonic, "mov", 3) == 0) { + ctx.d = d; + ctx.mc = mc; + ctx.desc = dr; + ctx.width = info.width ? info.width : 4u; + ctx.cc = info.cc; + parse_mov_ri(&ctx); + return; + } + } + } + /* For mov reg,mem and mov mem,reg we need MOV_RM_LOAD (0x8B) for + * the load side. Easiest: pre-parse src; if mem and dst is reg → + * MOV_RM_LOAD. Doing so re-uses the AT&T parser cleanly. */ + { + X64AsmOperand src = parse_operand(d); + expect_comma(d); + { + X64AsmOperand dst = parse_operand(d); + u32 w = info.width ? 
info.width : 4u; + if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_REG) { + if (w == 1u) { + /* MOV r/m8, r8 — opcode 0x88. */ + emit_movb_rr_operand(d, mc, dst, src); + return; + } + if (w == 2u) { + u8 pfx = X64_OPSIZE_PFX; + mc->emit_bytes(mc, &pfx, 1); + } + emit_mov_rr(mc, width_to_w(w), dst.reg, src.reg); + return; + } + if (src.kind == X64_ASM_OP_REG && dst.kind == X64_ASM_OP_MEM) { + if (w == 1u) + emit_movb_store_operand(d, mc, src, dst); + else + emit_mov_store(mc, w, src.reg, dst.base, dst.disp); + return; + } + if (src.kind == X64_ASM_OP_MEM && dst.kind == X64_ASM_OP_REG) { + if (w == 2u) { + u8 buf[16]; + u32 nn = x64_mov_rm_load_pack( + (X64MovRMLoad){.w = 0, .opc0 = X64_OPC_MOV_R_RM, + .dst = dst.reg, .base = src.base, + .disp = src.disp}, + buf + 1); + buf[0] = X64_OPSIZE_PFX; + emit_packed(mc, buf, nn + 1u); + } else { + emit_mov_load(mc, w, 0, dst.reg, src.base, src.disp); + } + return; + } + asm_driver_panic(d, "x64 asm: mov form"); + } } } - asm_driver_panic(d, "x64 asm: unsupported instruction form"); + desc = find_mnemonic_row(&info); + if (!desc) asm_driver_panic(d, "x64 asm: unknown mnemonic"); + + /* If the user wrote an indirect branch (`*%reg`), prefer the BR_RM row + * over the rel32 row that may sort first in the table. 
*/ + if (desc->fmt == X64_FMT_CALL_REL32 || desc->fmt == X64_FMT_JMP_REL32) { + AsmTok t = asm_driver_peek(d); + if (asm_driver_tok_is_punct(t, '*')) { + for (u32 i = 0; i < x64_insn_table_n; ++i) { + const X64InsnDesc* dr = &x64_insn_table[i]; + if (dr->fmt != X64_FMT_BR_RM) continue; + if (strlen(dr->mnemonic) != info.base_len) continue; + if (memcmp(dr->mnemonic, info.base, info.base_len) != 0) continue; + desc = dr; + break; + } + } + } + + ctx.d = d; + ctx.mc = mc; + ctx.desc = desc; + ctx.width = width_from_info(&info, desc); + ctx.cc = info.cc; + parse_and_emit_for_format(&ctx); } static void x64_arch_asm_destroy(ArchAsm* base) { @@ -272,7 +1259,15 @@ _Noreturn static void inline_panic(X64Asm* a, const char* msg) { compiler_panic(a->c, loc, "x64 inline asm: %s", msg); } -static const char* x64_reg_spelling(u32 reg, int width32) { +/* Width selector for x64_reg_spelling: matches the operand-modifier + * forms recognised by the template walker. */ +#define X64_REG_WIDTH_64 0 +#define X64_REG_WIDTH_32 1 +#define X64_REG_WIDTH_8 2 +#define X64_REG_WIDTH_16 3 +#define X64_REG_WIDTH_H8 4 + +static const char* x64_reg_spelling(u32 reg, int width) { static const char* r64[16] = { "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", @@ -281,7 +1276,20 @@ static const char* x64_reg_spelling(u32 reg, int width32) { "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d", }; - return width32 ? r32[reg & 15u] : r64[reg & 15u]; + static const char* r8[16] = { + "al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", + "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b", + }; + static const char* r16[16] = { + "ax", "cx", "dx", "bx", "sp", "bp", "si", "di", + "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w", + }; + static const char* rh8[4] = {"ah", "ch", "dh", "bh"}; + if (width == X64_REG_WIDTH_H8) return reg < 4u ? 
rh8[reg] : NULL; + if (width == X64_REG_WIDTH_16) return r16[reg & 15u]; + if (width == X64_REG_WIDTH_8) return r8[reg & 15u]; + if (width == X64_REG_WIDTH_32) return r32[reg & 15u]; + return r64[reg & 15u]; } static int x64_type_prefers_32(CfreeCgTypeId type) { @@ -289,9 +1297,10 @@ static int x64_type_prefers_32(CfreeCgTypeId type) { return !type_is_64(type); } -static void render_reg(StrBuf* sb, u32 reg, int width32) { +static void render_reg(StrBuf* sb, u32 reg, int width) { + const char* name = x64_reg_spelling(reg, width); strbuf_putc(sb, '%'); - strbuf_puts(sb, x64_reg_spelling(reg, width32)); + if (name) strbuf_puts(sb, name); } static void render_imm(StrBuf* sb, i64 v) { @@ -302,23 +1311,72 @@ static void render_imm(StrBuf* sb, i64 v) { static void render_indirect(StrBuf* sb, Reg base, i32 ofs) { if (ofs) strbuf_put_i64(sb, (i64)ofs); strbuf_putc(sb, '('); - render_reg(sb, (u32)base, 0); + render_reg(sb, (u32)base, X64_REG_WIDTH_64); strbuf_putc(sb, ')'); } +/* Operand-modifier forms used by the template walker. */ +#define X64_FORM_DEFAULT 0 +#define X64_FORM_W 1 /* %w — 16-bit */ +#define X64_FORM_X 2 /* %x — 64-bit */ +#define X64_FORM_A 3 /* %a — address / memory */ +#define X64_FORM_B 4 /* %b — 8-bit (byte) register */ +#define X64_FORM_K 5 /* %k — 32-bit */ +#define X64_FORM_H 6 /* %h — high 8-bit register (a/c/d/b only) */ + +static char x64_size_suffix_for_operand(X64Asm* a, u32 idx) { + u32 ntot = a->nout + a->nin; + const Operand* op; + u32 size; + if (idx >= ntot) inline_panic(a, "operand index out of range"); + op = (idx < a->nout) ? 
&a->out_ops[idx] : &a->in_ops[idx - a->nout]; + if (op->type) + size = type_byte_size(op->type); + else if (op->kind == OPK_IMM) + size = 4; + else + size = 8; + switch (size) { + case 1: + return 'b'; + case 2: + return 'w'; + case 4: + return 'l'; + case 8: + return 'q'; + default: + inline_panic(a, "%z requires a scalar 1/2/4/8-byte operand"); + } +} + static void render_operand(X64Asm* a, StrBuf* sb, u32 idx, int form) { u32 ntot = a->nout + a->nin; const Operand* op; if (idx >= ntot) inline_panic(a, "operand index out of range"); op = (idx < a->nout) ? &a->out_ops[idx] : &a->in_ops[idx - a->nout]; - if (form == 3) { + if (form == X64_FORM_A) { if (op->kind != OPK_INDIRECT) inline_panic(a, "%a on non-memory operand"); render_indirect(sb, op->v.ind.base, op->v.ind.ofs); return; } + if ((form == X64_FORM_B || form == X64_FORM_H) && op->kind != OPK_REG) { + inline_panic(a, "byte-register modifier requires a register operand"); + } if (op->kind == OPK_REG) { - int width32 = form == 1 ? 1 : form == 2 ? 0 : x64_type_prefers_32(op->type); - render_reg(sb, (u32)op->v.reg, width32); + int width; + if (form == X64_FORM_B) width = X64_REG_WIDTH_8; + else if (form == X64_FORM_H) { + if (op->v.reg > X64_RBX) { + inline_panic(a, "%h modifier requires ax/cx/dx/bx register"); + } + width = X64_REG_WIDTH_H8; + } else if (form == X64_FORM_W) width = X64_REG_WIDTH_16; + else if (form == X64_FORM_K) width = X64_REG_WIDTH_32; + else if (form == X64_FORM_X) width = X64_REG_WIDTH_64; + else width = x64_type_prefers_32(op->type) ? X64_REG_WIDTH_32 + : X64_REG_WIDTH_64; + render_reg(sb, (u32)op->v.reg, width); return; } if (op->kind == OPK_IMM) { @@ -394,8 +1452,15 @@ static void render_and_run_line(X64Asm* a, MCEmitter* mc, StrBuf* sb, ++p; continue; } - if (n == 'w' || n == 'x' || n == 'a') { - form = (n == 'w') ? 1 : (n == 'x') ? 2 : 3; + if (n == 'w' || n == 'x' || n == 'a' || n == 'b' || n == 'k' || + n == 'h' || n == 'z') { + form = (n == 'w') ? X64_FORM_W + : (n == 'x') ? 
X64_FORM_X + : (n == 'a') ? X64_FORM_A + : (n == 'b') ? X64_FORM_B + : (n == 'k') ? X64_FORM_K + : (n == 'h') ? X64_FORM_H + : -1; ++p; if (p + 1 >= end) inline_panic(a, "trailing '%' modifier"); n = *(p + 1); @@ -408,7 +1473,10 @@ static void render_and_run_line(X64Asm* a, MCEmitter* mc, StrBuf* sb, if (nend == end) inline_panic(a, "unterminated %[name]"); idx = find_named_operand(a, nbeg, (size_t)(nend - nbeg)); p = nend; - render_operand(a, sb, idx, form); + if (form == -1) + strbuf_putc(sb, x64_size_suffix_for_operand(a, idx)); + else + render_operand(a, sb, idx, form); continue; } if (n < '0' || n > '9') inline_panic(a, "expected digit after '%'"); @@ -419,7 +1487,10 @@ static void render_and_run_line(X64Asm* a, MCEmitter* mc, StrBuf* sb, idx = idx * 10u + (u32)(*(p + 1) - '0'); ++p; } - render_operand(a, sb, idx, form); + if (form == -1) + strbuf_putc(sb, x64_size_suffix_for_operand(a, idx)); + else + render_operand(a, sb, idx, form); } } if (sb->truncated) inline_panic(a, "inline asm line buffer overflow"); diff --git a/src/arch/x64/dbg.c b/src/arch/x64/dbg.c @@ -0,0 +1,411 @@ +/* x86_64 debug support for software breakpoints and displaced stepping. + * + * The decoder here is intentionally small: it covers the encodings cfree's + * x64 backend emits plus common branch forms. It measures one instruction, + * finds any RIP-relative disp32 operand, and identifies rel8/rel32 control + * transfers that need their displacement re-based for the scratch slot. 
*/ + +#include <string.h> + +#include "arch/arch.h" +#include "core/bytes.h" + +#define X64_INT3_BYTE 0xCCu +#define X64_JMP_REL32_BYTE 0xE9u +#define X64_CALL_REL32_BYTE 0xE8u +#define X64_JCC_SHORT_BASE 0x70u +#define X64_JCC_NEAR_BASE0 0x0Fu +#define X64_JCC_NEAR_BASE1 0x80u + +typedef enum X64DbgPcRelKind { + X64_DBG_PCREL_NONE, + X64_DBG_PCREL_RIP_DISP32, + X64_DBG_PCREL_REL8, + X64_DBG_PCREL_REL32, +} X64DbgPcRelKind; + +typedef struct X64DbgDecode { + u32 len; + u32 opc_off; + u32 disp_off; + u8 disp_size; + u8 pc_rel_kind; + u8 is_call; + u8 pad; + i64 disp; +} X64DbgDecode; + +static int fits_i32(i64 v) { + return v >= (i64)INT32_MIN && v <= (i64)INT32_MAX; +} +static int fits_i8(i64 v) { return v >= -128 && v <= 127; } + +static int is_legacy_prefix(u8 b) { + switch (b) { + case 0x26: + case 0x2e: + case 0x36: + case 0x3e: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0xf0: + case 0xf2: + case 0xf3: + return 1; + default: + return 0; + } +} + +static u32 x64_dbg_prefix_len(const u8* bytes, u32 len) { + u32 off = 0; + while (off < len && is_legacy_prefix(bytes[off])) ++off; + if (off < len && bytes[off] >= 0x40u && bytes[off] <= 0x4fu) ++off; + return off; +} + +static int read_i8(const u8* bytes, u32 len, u32 off, i64* out) { + if (off >= len) return 0; + *out = (i64)(i8)bytes[off]; + return 1; +} + +static int read_i32(const u8* bytes, u32 len, u32 off, i64* out) { + if (off + 4u > len) return 0; + *out = (i64)(i32)rd_u32_le(bytes + off); + return 1; +} + +static int x64_dbg_modrm_len(const u8* bytes, u32 len, u32 modrm_off, + u32* total_out, u32* rip_disp_off) { + u8 mr; + u32 mod; + u32 rm; + u32 off; + if (modrm_off >= len) return 0; + mr = bytes[modrm_off]; + mod = (mr >> 6) & 3u; + rm = mr & 7u; + off = modrm_off + 1u; + *rip_disp_off = 0; + + if (mod != 3u && rm == 4u) { + u8 sib; + if (off >= len) return 0; + sib = bytes[off++]; + if (mod == 0u && (sib & 7u) == 5u) { + if (off + 4u > len) return 0; + off += 4u; + } + } else if (mod == 
0u && rm == 5u) { + if (off + 4u > len) return 0; + *rip_disp_off = off; + off += 4u; + } else if (mod == 1u) { + if (off + 1u > len) return 0; + off += 1u; + } else if (mod == 2u) { + if (off + 4u > len) return 0; + off += 4u; + } + + *total_out = off; + return 1; +} + +static int onebyte_has_modrm(u8 op) { + switch (op) { + case 0x01: + case 0x09: + case 0x21: + case 0x29: + case 0x31: + case 0x39: + case 0x85: + case 0x87: + case 0x88: + case 0x89: + case 0x8b: + case 0x8d: + case 0x63: + case 0x81: + case 0x83: + case 0xc1: + case 0xd3: + case 0xf7: + case 0xff: + case 0x69: + case 0x6b: + return 1; + default: + return 0; + } +} + +static u32 onebyte_imm_len(u8 op, const u8* bytes, u32 len, u32 modrm_off) { + (void)bytes; + (void)len; + (void)modrm_off; + switch (op) { + case 0x83: + case 0xc1: + case 0x6b: + return 1; + case 0x81: + case 0x69: + return 4; + default: + return 0; + } +} + +static int twobyte_has_modrm(u8 op) { + if (op >= 0x40u && op <= 0x4fu) return 1; /* CMOVcc */ + if (op >= 0x90u && op <= 0x9fu) return 1; /* SETcc */ + switch (op) { + case 0x10: + case 0x11: + case 0x1f: + case 0x2a: + case 0x2c: + case 0x2e: + case 0x58: + case 0x59: + case 0x5a: + case 0x5c: + case 0x5e: + case 0xaf: + case 0xb1: + case 0xb6: + case 0xb7: + case 0xb8: + case 0xbc: + case 0xbd: + case 0xbe: + case 0xbf: + case 0xc1: + return 1; + default: + return 0; + } +} + +static CfreeStatus x64_dbg_measure(const u8* bytes, u32 len, u64 pc, + X64DbgDecode* out) { + u32 off; + u8 op; + if (!bytes || !out) return CFREE_INVALID; + memset(out, 0, sizeof(*out)); + off = x64_dbg_prefix_len(bytes, len); + if (off >= len) return CFREE_UNSUPPORTED; + out->opc_off = off; + op = bytes[off]; + + if (op == X64_CALL_REL32_BYTE || op == X64_JMP_REL32_BYTE) { + if (!read_i32(bytes, len, off + 1u, &out->disp)) return CFREE_UNSUPPORTED; + out->len = off + 5u; + out->disp_off = off + 1u; + out->disp_size = 4u; + out->pc_rel_kind = X64_DBG_PCREL_REL32; + out->is_call = (op == 
X64_CALL_REL32_BYTE); + return CFREE_OK; + } + if (op == 0xebu || (op >= 0x70u && op <= 0x7fu) || + (op >= 0xe0u && op <= 0xe3u)) { + if (!read_i8(bytes, len, off + 1u, &out->disp)) return CFREE_UNSUPPORTED; + out->len = off + 2u; + out->disp_off = off + 1u; + out->disp_size = 1u; + out->pc_rel_kind = X64_DBG_PCREL_REL8; + return CFREE_OK; + } + if (op >= 0x50u && op <= 0x5fu) { + out->len = off + 1u; + return CFREE_OK; + } + if (op >= 0xb8u && op <= 0xbfu) { + u32 imm = 4u; + for (u32 i = 0; i < off; ++i) { + if (bytes[i] >= 0x48u && bytes[i] <= 0x4fu) imm = 8u; + } + if (off + 1u + imm > len) return CFREE_UNSUPPORTED; + out->len = off + 1u + imm; + return CFREE_OK; + } + switch (op) { + case 0x90: + case 0xc3: + case 0xc9: + case 0x99: + out->len = off + 1u; + return CFREE_OK; + default: + break; + } + + if (op == 0x0fu) { + u8 op2; + if (off + 1u >= len) return CFREE_UNSUPPORTED; + op2 = bytes[off + 1u]; + if (op2 >= 0x80u && op2 <= 0x8fu) { + if (!read_i32(bytes, len, off + 2u, &out->disp)) return CFREE_UNSUPPORTED; + out->len = off + 6u; + out->disp_off = off + 2u; + out->disp_size = 4u; + out->pc_rel_kind = X64_DBG_PCREL_REL32; + return CFREE_OK; + } + if (op2 == 0x0bu || (op2 >= 0xc8u && op2 <= 0xcfu)) { + out->len = off + 2u; + return CFREE_OK; + } + if (op2 == 0xaeu && off + 2u < len && bytes[off + 2u] == 0xf0u) { + out->len = off + 3u; + return CFREE_OK; + } + if (twobyte_has_modrm(op2)) { + u32 end = 0; + u32 rip = 0; + if (!x64_dbg_modrm_len(bytes, len, off + 2u, &end, &rip)) + return CFREE_UNSUPPORTED; + out->len = end; + if (rip) { + if (!read_i32(bytes, len, rip, &out->disp)) return CFREE_UNSUPPORTED; + out->disp_off = rip; + out->disp_size = 4u; + out->pc_rel_kind = X64_DBG_PCREL_RIP_DISP32; + } + return CFREE_OK; + } + return CFREE_UNSUPPORTED; + } + + if (onebyte_has_modrm(op)) { + u32 end = 0; + u32 rip = 0; + u32 imm; + if (!x64_dbg_modrm_len(bytes, len, off + 1u, &end, &rip)) + return CFREE_UNSUPPORTED; + imm = onebyte_imm_len(op, bytes, len, 
off + 1u); + if (end + imm > len) return CFREE_UNSUPPORTED; + out->len = end + imm; + if (rip) { + if (!read_i32(bytes, len, rip, &out->disp)) return CFREE_UNSUPPORTED; + out->disp_off = rip; + out->disp_size = 4u; + out->pc_rel_kind = X64_DBG_PCREL_RIP_DISP32; + } + if (op == 0xffu && off + 1u < len) { + u8 sub = (bytes[off + 1u] >> 3) & 7u; + out->is_call = (sub == 2u); + } + (void)pc; + return CFREE_OK; + } + + return CFREE_UNSUPPORTED; +} + +static CfreeStatus x64_dbg_breakpoint_patch(u8* out, u32 cap, u32* len_out) { + if (!out || !len_out) return CFREE_INVALID; + if (cap < 1u) return CFREE_INVALID; + out[0] = X64_INT3_BYTE; + *len_out = 1u; + return CFREE_OK; +} + +static u64 x64_dbg_breakpoint_addr_from_fault_pc(u64 fault_pc) { + return fault_pc ? fault_pc - 1u : 0u; +} + +static CfreeStatus x64_dbg_decode_insn(const u8* bytes, u32 len, u64 pc, + ArchDbgInsn* out) { + X64DbgDecode d; + CfreeStatus st = x64_dbg_measure(bytes, len, pc, &d); + if (st != CFREE_OK) return st; + if (d.len == 0 || d.len > ARCH_DBG_MAX_INSN_BYTES) return CFREE_UNSUPPORTED; + memset(out, 0, sizeof(*out)); + out->pc = pc; + out->len = d.len; + memcpy(out->bytes, bytes, d.len); + return CFREE_OK; +} + +static CfreeStatus x64_dbg_build_displaced_shim( + const ArchDbgInsn* insn, void* scratch_write, u64 scratch_runtime, + u32 scratch_cap, u32* sentinel_off, u64* fallthrough_pc) { + X64DbgDecode d; + u8* w = (u8*)scratch_write; + CfreeStatus st; + u8 op; + if (!insn || !scratch_write || !sentinel_off || !fallthrough_pc) + return CFREE_INVALID; + st = x64_dbg_measure(insn->bytes, insn->len, insn->pc, &d); + if (st != CFREE_OK) return st; + if (d.len != insn->len) return CFREE_UNSUPPORTED; + if (insn->len + 1u > scratch_cap) return CFREE_UNSUPPORTED; + + op = insn->bytes[d.opc_off]; + if ((op == 0xe9u || op == 0xebu) && (d.pc_rel_kind == X64_DBG_PCREL_REL32 || + d.pc_rel_kind == X64_DBG_PCREL_REL8)) { + *fallthrough_pc = (u64)((i64)(insn->pc + insn->len) + d.disp); + w[0] = X64_INT3_BYTE; + 
*sentinel_off = 0; + return CFREE_OK; + } + + memcpy(w, insn->bytes, insn->len); + *fallthrough_pc = insn->pc + insn->len; + + if (d.pc_rel_kind == X64_DBG_PCREL_RIP_DISP32) { + i64 target = (i64)(insn->pc + insn->len) + d.disp; + i64 nd = target - (i64)(scratch_runtime + insn->len); + if (!fits_i32(nd)) return CFREE_UNSUPPORTED; + wr_u32_le(w + d.disp_off, (u32)(i32)nd); + } else if (d.pc_rel_kind == X64_DBG_PCREL_REL32) { + i64 target = (i64)(insn->pc + insn->len) + d.disp; + i64 nd = target - (i64)(scratch_runtime + insn->len); + if (!fits_i32(nd)) return CFREE_UNSUPPORTED; + wr_u32_le(w + d.disp_off, (u32)(i32)nd); + } else if (d.pc_rel_kind == X64_DBG_PCREL_REL8) { + i64 target = (i64)(insn->pc + insn->len) + d.disp; + i64 nd = target - (i64)(scratch_runtime + insn->len); + if (!fits_i8(nd) && op >= 0x70u && op <= 0x7fu) { + if (scratch_cap < 7u) return CFREE_UNSUPPORTED; + nd = target - (i64)(scratch_runtime + 6u); + if (!fits_i32(nd)) return CFREE_UNSUPPORTED; + w[0] = 0x0fu; + w[1] = (u8)(0x80u | (op & 0x0fu)); + wr_u32_le(w + 2u, (u32)(i32)nd); + w[6] = X64_INT3_BYTE; + *sentinel_off = 6u; + return CFREE_OK; + } + if (!fits_i8(nd)) return CFREE_UNSUPPORTED; + w[d.disp_off] = (u8)(i8)nd; + } + + w[insn->len] = X64_INT3_BYTE; + *sentinel_off = insn->len; + return CFREE_OK; +} + +static int x64_dbg_is_call(const ArchDbgInsn* insn) { + X64DbgDecode d; + if (!insn) return 0; + if (x64_dbg_measure(insn->bytes, insn->len, insn->pc, &d) != CFREE_OK) + return 0; + return d.is_call != 0; +} + +const ArchDbgOps x64_dbg_ops = { + .min_insn_len = 1u, + .max_insn_len = ARCH_DBG_MAX_INSN_BYTES, + .breakpoint_patch = x64_dbg_breakpoint_patch, + .breakpoint_addr_from_fault_pc = x64_dbg_breakpoint_addr_from_fault_pc, + .decode_insn = x64_dbg_decode_insn, + .build_displaced_shim = x64_dbg_build_displaced_shim, + .is_call = x64_dbg_is_call, +}; diff --git a/src/arch/x64/disasm.c b/src/arch/x64/disasm.c @@ -1,4 +1,10 @@ -/* Small x86-64 disassembler for the instruction subset 
cfree can assemble. */ +/* x86_64 disassembler. + * + * Walks legacy prefixes + REX, then asks `arch/x64/isa.c`'s descriptor + * table to identify the opcode. The matched row's format drives operand + * rendering. Everything cfree's emit.c produces is in the table; anything + * else falls back to a `.byte 0xNN` rendering so cfree objdump never + * crashes on unknown bytes. */ #include "arch/x64/disasm.h" @@ -12,7 +18,6 @@ #define X64_DASM_MNEM_CAP 16u #define X64_DASM_OPS_CAP 128u #define X64_DASM_ANN_CAP 96u -#define X64_REG_RIP 16u typedef struct X64Disasm { ArchDisasm base; @@ -26,290 +31,95 @@ typedef struct X64Disasm { StrBuf ann; } X64Disasm; -typedef struct X64Rex { - u8 present; - u8 w; - u8 r; - u8 x; - u8 b; -} X64Rex; - -static const char* x64_reg_name(u32 reg, u32 width) { - static const char* r64[16] = { - "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - }; - static const char* r32[16] = { - "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", - "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d", - }; - return width == 4u ? 
r32[reg & 15u] : r64[reg & 15u]; -} - -static void put_reg(StrBuf* sb, u32 reg, u32 width) { - strbuf_putc(sb, '%'); - strbuf_puts(sb, x64_reg_name(reg, width)); -} - -static void put_imm(StrBuf* sb, i64 imm) { - strbuf_putc(sb, '$'); - strbuf_put_i64(sb, imm); -} - -static u32 parse_rex(const u8* bytes, size_t len, X64Rex* rex) { - memset(rex, 0, sizeof *rex); - if (len && bytes[0] >= 0x40u && bytes[0] <= 0x4fu) { - u8 b = bytes[0]; - rex->present = 1; - rex->w = (u8)((b >> 3) & 1u); - rex->r = (u8)((b >> 2) & 1u); - rex->x = (u8)((b >> 1) & 1u); - rex->b = (u8)(b & 1u); - return 1; - } - return 0; -} - -static void put_mem(StrBuf* sb, u32 base, i32 disp, int has_base) { - if (disp != 0 || !has_base) strbuf_put_i64(sb, (i64)disp); - if (has_base) { - strbuf_putc(sb, '('); - if (base == X64_REG_RIP) { - strbuf_puts(sb, "%rip"); - } else { - put_reg(sb, base, 8); - } - strbuf_putc(sb, ')'); - } -} - -static u32 read_disp(const u8* bytes, size_t len, u32 off, u32 n, i32* out) { - if (off + n > len) return 0; - if (n == 1u) { - *out = (i32)(i8)bytes[off]; - } else if (n == 4u) { - *out = (i32)rd_u32_le(bytes + off); - } else { - *out = 0; - } - return n; -} - -static u32 put_rm_operand(StrBuf* sb, const u8* bytes, size_t len, u32 off, - X64Rex rex, u32 rm, u32 mod, u32 width) { - if (mod == 3u) { - put_reg(sb, rm | ((u32)rex.b << 3), width); - return 0; - } - - if ((rm & 7u) == 4u) { - u8 sib; - u32 base; - i32 disp = 0; - u32 used = 1; - if (off >= len) return (u32)-1; - sib = bytes[off]; - base = (sib & 7u) | ((u32)rex.b << 3); - if (mod == 0u && (sib & 7u) == 5u) { - if (!read_disp(bytes, len, off + used, 4, &disp)) return (u32)-1; - used += 4; - put_mem(sb, X64_REG_RIP, disp, 0); - return used; - } - if (mod == 1u) { - if (!read_disp(bytes, len, off + used, 1, &disp)) return (u32)-1; - used += 1; - } else if (mod == 2u) { - if (!read_disp(bytes, len, off + used, 4, &disp)) return (u32)-1; - used += 4; - } - put_mem(sb, base, disp, 1); - return used; - } - - { - 
u32 base = rm | ((u32)rex.b << 3); - i32 disp = 0; - u32 used = 0; - if (mod == 0u && (rm & 7u) == 5u) { - if (!read_disp(bytes, len, off, 4, &disp)) return (u32)-1; - put_mem(sb, X64_REG_RIP, disp, 1); - return 4; +/* Render the mnemonic with any per-format suffix (size letter, condition + * code, etc.) baked in. */ +static void emit_mnemonic(StrBuf* sb, const X64InsnDesc* d, + const X64DecodeCtx* ctx, const u8* bytes) { + if (ctx->has_lock) strbuf_puts(sb, "lock "); + strbuf_puts(sb, d->mnemonic); + /* Jcc / SETcc / CMOVcc: the table stores the bare prefix ("j", "set", + * "cmov") and we append the condition suffix from the opcode byte. */ + if (d->fmt == X64_FMT_JCC_REL32 || d->fmt == X64_FMT_SETCC_RM || + d->fmt == X64_FMT_CMOVCC_RR) { + u8 cc = bytes[ctx->opc_off + d->opc_len - 1u] & 0xFu; + strbuf_puts(sb, x64_cc_name(cc)); + if (d->fmt == X64_FMT_SETCC_RM || d->fmt == X64_FMT_CMOVCC_RR) { + /* SETcc operates on r/m8; CMOVcc width comes from REX.W. */ + char s = x64_size_suffix_for(d, ctx); + if (s) strbuf_putc(sb, s); } - if (mod == 1u) { - if (!read_disp(bytes, len, off, 1, &disp)) return (u32)-1; - used = 1; - } else if (mod == 2u) { - if (!read_disp(bytes, len, off, 4, &disp)) return (u32)-1; - used = 4; - } - put_mem(sb, base, disp, 1); - return used; + return; } + /* Generic width suffix. 
*/ + char s = x64_size_suffix_for(d, ctx); + if (s) strbuf_putc(sb, s); } -static void x64_unknown(X64Disasm* d, u8 byte) { +static void render_byte_fallback(X64Disasm* d, u8 byte) { strbuf_reset(&d->mnem); strbuf_puts(&d->mnem, ".byte"); strbuf_reset(&d->ops); strbuf_put_hex_u64(&d->ops, byte); } -static void set_mnemonic(X64Disasm* d, const char* s) { - strbuf_reset(&d->mnem); - strbuf_puts(&d->mnem, s); - strbuf_reset(&d->ops); -} - -static u32 decode_modrm_two_operand(X64Disasm* d, const u8* bytes, size_t len, - u32 off, X64Rex rex, const char* mnem, - u32 width, int reg_is_src) { - u8 mr; - u32 mod; - u32 reg; - u32 rm; - u32 used; - if (off >= len) return 0; - mr = bytes[off++]; - mod = (mr >> 6) & 3u; - reg = ((mr >> 3) & 7u) | ((u32)rex.r << 3); - rm = mr & 7u; - set_mnemonic(d, mnem); - if (reg_is_src) { - put_reg(&d->ops, reg, width); - strbuf_puts(&d->ops, ", "); - used = put_rm_operand(&d->ops, bytes, len, off, rex, rm, mod, width); - } else { - used = put_rm_operand(&d->ops, bytes, len, off, rex, rm, mod, width); - strbuf_puts(&d->ops, ", "); - put_reg(&d->ops, reg, width); - } - if (used == (u32)-1) return 0; - return 1u + used; -} - static u32 x64_decode(ArchDisasm* base, const u8* bytes, size_t len, u64 vaddr, CfreeInsn* out) { X64Disasm* d = (X64Disasm*)base; - X64Rex rex; - u32 off; - u8 op; - u32 consumed = 1; + X64DecodeCtx ctx; + const X64InsnDesc* desc; + u32 total; if (!len) return 0; - off = parse_rex(bytes, len, &rex); - if (off >= len) return 0; - op = bytes[off++]; - if (op == 0x90u) { - set_mnemonic(d, "nop"); - } else if (op == 0xc3u) { - set_mnemonic(d, "ret"); - } else if (op >= 0xb8u && op <= 0xbfu) { - u32 reg = (op & 7u) | ((u32)rex.b << 3); - set_mnemonic(d, rex.w ? 
"movq" : "movl"); - if (rex.w) { - if (off + 8u > len) return 0; - put_imm(&d->ops, (i64)rd_u64_le(bytes + off)); - off += 8u; - } else { - if (off + 4u > len) return 0; - put_imm(&d->ops, (i64)(i32)rd_u32_le(bytes + off)); - off += 4u; - } - strbuf_puts(&d->ops, ", "); - put_reg(&d->ops, reg, rex.w ? 8u : 4u); - } else if (op == 0x89u) { - consumed = decode_modrm_two_operand( - d, bytes, len, off, rex, rex.w ? "movq" : "movl", rex.w ? 8u : 4u, 1); - if (!consumed) return 0; - off += consumed; - } else if (op == 0x8bu) { - consumed = decode_modrm_two_operand( - d, bytes, len, off, rex, rex.w ? "movq" : "movl", rex.w ? 8u : 4u, 0); - if (!consumed) return 0; - off += consumed; - } else if (op == 0x8du) { - consumed = decode_modrm_two_operand(d, bytes, len, off, rex, "leaq", 8u, 0); - if (!consumed) return 0; - off += consumed; - } else if (op == 0x31u) { - consumed = decode_modrm_two_operand(d, bytes, len, off, rex, "xorl", 4u, 1); - if (!consumed) return 0; - off += consumed; - } else if (op == 0x85u) { - consumed = decode_modrm_two_operand( - d, bytes, len, off, rex, rex.w ? "testq" : "testl", rex.w ? 8u : 4u, 1); - if (!consumed) return 0; - off += consumed; - } else if (op == 0x63u && rex.w) { - consumed = - decode_modrm_two_operand(d, bytes, len, off, rex, "movslq", 4u, 0); - if (!consumed) return 0; - off += consumed; - } else if (op == 0x0fu) { - u8 op2; - if (off >= len) return 0; - op2 = bytes[off++]; - if (op2 == 0x0bu) { - set_mnemonic(d, "ud2"); - } else if (op2 == 0x44u) { - consumed = decode_modrm_two_operand(d, bytes, len, off, rex, "cmoveq", - rex.w ? 8u : 4u, 0); - if (!consumed) return 0; - off += consumed; - } else { - x64_unknown(d, op2); - } - } else if (op == 0xffu) { - u8 mr; - u32 sub; - u32 rm; - if (off >= len) return 0; - mr = bytes[off++]; - sub = (mr >> 3) & 7u; - rm = (mr & 7u) | ((u32)rex.b << 3); - if (((mr >> 6) & 3u) == 3u && (sub == 2u || sub == 4u)) { - set_mnemonic(d, sub == 2u ? 
"callq" : "jmpq"); - strbuf_putc(&d->ops, '*'); - put_reg(&d->ops, rm, 8); - } else { - x64_unknown(d, op); - } - } else if ((op == 0x81u || op == 0x83u) && off < len) { - u8 mr = bytes[off++]; - u32 mod = (mr >> 6) & 3u; - u32 sub = (mr >> 3) & 7u; - u32 rm = (mr & 7u) | ((u32)rex.b << 3); - if (mod == 3u && sub == 4u) { - i64 imm; - set_mnemonic(d, rex.w ? "andq" : "andl"); - if (op == 0x83u) { - if (off >= len) return 0; - imm = (i64)(i8)bytes[off++]; - } else { - if (off + 4u > len) return 0; - imm = (i64)(i32)rd_u32_le(bytes + off); - off += 4u; - } - put_imm(&d->ops, imm); - strbuf_puts(&d->ops, ", "); - put_reg(&d->ops, rm, rex.w ? 8u : 4u); - } else { - x64_unknown(d, op); - } - } else { - x64_unknown(d, op); + strbuf_reset(&d->mnem); + strbuf_reset(&d->ops); + strbuf_reset(&d->ann); + + (void)x64_decode_prefixes(bytes, (u32)len, &ctx); + if (ctx.opc_off >= (u32)len) { + render_byte_fallback(d, bytes[0]); + out->vaddr = vaddr; + out->bytes = bytes; + out->nbytes = 1; + out->mnemonic = strbuf_cstr(&d->mnem); + out->operands = strbuf_cstr(&d->ops); + out->annotation = strbuf_cstr(&d->ann); + return 1; + } + + desc = x64_disasm_find(bytes, (u32)len, &ctx); + if (!desc) { + render_byte_fallback(d, bytes[0]); + out->vaddr = vaddr; + out->bytes = bytes; + out->nbytes = 1; + out->mnemonic = strbuf_cstr(&d->mnem); + out->operands = strbuf_cstr(&d->ops); + out->annotation = strbuf_cstr(&d->ann); + return 1; + } + + emit_mnemonic(&d->mnem, desc, &ctx, bytes); + total = x64_print_operands(&d->ops, desc, bytes, (u32)len, &ctx, vaddr); + if (total == 0) { + /* Truncated encoding — fall back to .byte so callers can step past. 
*/ + render_byte_fallback(d, bytes[0]); + out->vaddr = vaddr; + out->bytes = bytes; + out->nbytes = 1; + out->mnemonic = strbuf_cstr(&d->mnem); + out->operands = strbuf_cstr(&d->ops); + out->annotation = strbuf_cstr(&d->ann); + return 1; } - strbuf_reset(&d->ann); out->vaddr = vaddr; out->bytes = bytes; - out->nbytes = off; + out->nbytes = total; out->mnemonic = strbuf_cstr(&d->mnem); out->operands = strbuf_cstr(&d->ops); out->annotation = strbuf_cstr(&d->ann); - return off; + return total; } static void x64_destroy(ArchDisasm* base) { diff --git a/src/arch/x64/emit.c b/src/arch/x64/emit.c @@ -55,13 +55,6 @@ void emit_u32le(MCEmitter *mc, u32 v) { b[3] = (u8)(v >> 24); mc->emit_bytes(mc, b, 4); } -static void emit_u64le(MCEmitter *mc, u64 v) { - u8 b[8]; - for (int i = 0; i < 8; ++i) - b[i] = (u8)(v >> (i * 8)); - mc->emit_bytes(mc, b, 8); -} - static u8 make_rex(int w, u32 reg, u32 index, u32 rm) { u8 r = 0; if (w) @@ -131,10 +124,10 @@ void emit_rm_reg(MCEmitter *mc, u32 reg, u32 rm) { /* mov rd, rs (64-bit if w, else 32-bit). 
*/ void emit_mov_rr(MCEmitter *mc, int w, u32 dst, u32 src) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, src, 0, dst); - u8 op = 0x89; /* MOV r/m, r */ - mc->emit_bytes(mc, &op, 1); - emit_rm_reg(mc, src, dst); + u8 buf[16]; + u32 n = x64_alu_rr_pack( + (X64AluRR){.w = w, .op = X64_OPC_MOV_RM_R, .dst = dst, .src = src}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -143,27 +136,32 @@ void emit_mov_rr(MCEmitter *mc, int w, u32 dst, u32 src) { void emit_mov_load(MCEmitter *mc, u32 size, int signed_ext, u32 dst, u32 base, i32 disp) { u32 ofs = obj_pos(mc->obj, mc->section_id); + u8 buf[16]; + u32 n = 0; if (size == 8) { - emit_rex(mc, 1, dst, 0, base); - u8 op = 0x8B; - mc->emit_bytes(mc, &op, 1); - emit_mem_operand(mc, dst, base, disp); + n = x64_mov_rm_load_pack( + (X64MovRMLoad){.w = 1, .opc0 = X64_OPC_MOV_R_RM, .dst = dst, + .base = base, .disp = disp}, + buf); } else if (size == 4) { - emit_rex(mc, 0, dst, 0, base); - u8 op = 0x8B; - mc->emit_bytes(mc, &op, 1); - emit_mem_operand(mc, dst, base, disp); + n = x64_mov_rm_load_pack( + (X64MovRMLoad){.w = 0, .opc0 = X64_OPC_MOV_R_RM, .dst = dst, + .base = base, .disp = disp}, + buf); } else if (size == 2) { - emit_rex(mc, 0, dst, 0, base); - u8 op[2] = {0x0F, signed_ext ? 0xBF : 0xB7}; - mc->emit_bytes(mc, op, 2); - emit_mem_operand(mc, dst, base, disp); + n = x64_mov_rm_load_pack( + (X64MovRMLoad){.w = 0, + .opc1 = signed_ext ? X64_OPC_MOVSX_W : X64_OPC_MOVZX_W, + .dst = dst, .base = base, .disp = disp}, + buf); } else if (size == 1) { - emit_rex(mc, 0, dst, 0, base); - u8 op[2] = {0x0F, signed_ext ? 0xBE : 0xB6}; - mc->emit_bytes(mc, op, 2); - emit_mem_operand(mc, dst, base, disp); + n = x64_mov_rm_load_pack( + (X64MovRMLoad){.w = 0, + .opc1 = signed_ext ? 
X64_OPC_MOVSX_B : X64_OPC_MOVZX_B, + .dst = dst, .base = base, .disp = disp}, + buf); } + if (n) mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -171,40 +169,43 @@ void emit_mov_load(MCEmitter *mc, u32 size, int signed_ext, u32 dst, u32 base, /* mov [base + disp], src; size 1/2/4/8. */ void emit_mov_store(MCEmitter *mc, u32 size, u32 src, u32 base, i32 disp) { u32 ofs = obj_pos(mc->obj, mc->section_id); + u8 buf[16]; + u32 n = 0; if (size == 8) { - emit_rex(mc, 1, src, 0, base); - u8 op = 0x89; - mc->emit_bytes(mc, &op, 1); - emit_mem_operand(mc, src, base, disp); + n = x64_alu_rm_pack( + (X64AluRM){.w = 1, .op = X64_OPC_MOV_RM_R, .src = src, .base = base, + .disp = disp}, + buf); } else if (size == 4) { - emit_rex(mc, 0, src, 0, base); - u8 op = 0x89; - mc->emit_bytes(mc, &op, 1); - emit_mem_operand(mc, src, base, disp); + n = x64_alu_rm_pack( + (X64AluRM){.w = 0, .op = X64_OPC_MOV_RM_R, .src = src, .base = base, + .disp = disp}, + buf); } else if (size == 2) { - u8 p = 0x66; - mc->emit_bytes(mc, &p, 1); - emit_rex(mc, 0, src, 0, base); - u8 op = 0x89; - mc->emit_bytes(mc, &op, 1); - emit_mem_operand(mc, src, base, disp); + n = x64_alu_rm_pack( + (X64AluRM){.prefix = X64_OPSIZE_PFX, .w = 0, .op = X64_OPC_MOV_RM_R, + .src = src, .base = base, .disp = disp}, + buf); } else if (size == 1) { /* Force REX so SIL/DIL/etc are addressable as byte regs. 
*/ - emit_rex_force(mc, 0, src, 0, base); - u8 op = 0x88; - mc->emit_bytes(mc, &op, 1); - emit_mem_operand(mc, src, base, disp); + n = x64_alu_rm_pack( + (X64AluRM){.w = 0, .op = X64_OPC_MOV_RM_R8, .force_rex = 1, .src = src, + .base = base, .disp = disp}, + buf); } + if (n) mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_lea(MCEmitter *mc, u32 dst, u32 base, i32 disp) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, 1, dst, 0, base); - u8 op = 0x8D; - mc->emit_bytes(mc, &op, 1); - emit_mem_operand(mc, dst, base, disp); + u8 buf[16]; + u32 n = x64_mov_rm_load_pack( + (X64MovRMLoad){.w = 1, .opc0 = X64_OPC_LEA, .dst = dst, .base = base, + .disp = disp}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -213,17 +214,10 @@ void emit_lea(MCEmitter *mc, u32 dst, u32 base, i32 disp) { * imm32) for !is64. Both 10/5 bytes. */ void x64_emit_load_imm(MCEmitter *mc, int is64, u32 dst, i64 imm) { u32 ofs = obj_pos(mc->obj, mc->section_id); - if (is64) { - emit_rex(mc, 1, 0, 0, dst); - u8 op = (u8)(0xB8 | (dst & 7)); - mc->emit_bytes(mc, &op, 1); - emit_u64le(mc, (u64)imm); - } else { - emit_rex(mc, 0, 0, 0, dst); - u8 op = (u8)(0xB8 | (dst & 7)); - mc->emit_bytes(mc, &op, 1); - emit_u32le(mc, (u32)imm); - } + u8 buf[16]; + u32 n = x64_mov_ri_pack( + (X64MovRI){.is64 = is64, .dst = dst, .imm = imm}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -232,39 +226,40 @@ void x64_emit_load_imm(MCEmitter *mc, int is64, u32 dst, i64 imm) { * CMP(39)/MOV(89)/TEST(85). 
*/ void emit_alu_rr(MCEmitter *mc, int w, u8 op, u32 dst, u32 src) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, src, 0, dst); - mc->emit_bytes(mc, &op, 1); - emit_rm_reg(mc, src, dst); + u8 buf[16]; + u32 n = x64_alu_rr_pack( + (X64AluRR){.w = w, .op = op, .dst = dst, .src = src}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_imul_rr(MCEmitter *mc, int w, u32 dst, u32 src) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, dst, 0, src); - u8 op[2] = {0x0F, 0xAF}; - mc->emit_bytes(mc, op, 2); - emit_rm_reg(mc, dst, src); + u8 buf[16]; + u32 n = x64_imul_rr_pack( + (X64ImulRR){.w = w, .dst = dst, .src = src}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_f7_rm(MCEmitter *mc, int w, u32 sub, u32 reg) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, 0, 0, reg); - u8 op = 0xF7; - mc->emit_bytes(mc, &op, 1); - emit_rm_reg(mc, sub, reg); + u8 buf[16]; + u32 n = x64_f7_rm_pack( + (X64F7RM){.w = w, .sub = sub, .reg = reg}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_shift_cl(MCEmitter *mc, int w, u32 sub, u32 reg) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, 0, 0, reg); - u8 op = 0xD3; - mc->emit_bytes(mc, &op, 1); - emit_rm_reg(mc, sub, reg); + u8 buf[16]; + u32 n = x64_shift_cl_pack( + (X64ShiftCL){.w = w, .sub = sub, .reg = reg}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -272,39 +267,33 @@ void emit_shift_cl(MCEmitter *mc, int w, u32 sub, u32 reg) { /* Shift r/m by imm8: opcode C1 /sub ib. sub: SHL=4, SHR=5, SAR=7. 
*/ void emit_shift_imm(MCEmitter *mc, int w, u32 sub, u32 reg, u8 imm) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, 0, 0, reg); - u8 buf[3]; - buf[0] = 0xC1; - buf[1] = modrm(3u, sub, reg); - buf[2] = imm; - mc->emit_bytes(mc, buf, 3); + u8 buf[16]; + u32 n = x64_shift_imm_pack( + (X64ShiftImm){.w = w, .sub = sub, .reg = reg, .imm = imm}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_cqo_or_cdq(MCEmitter *mc, int w) { - if (w) { - u8 buf[2] = {X64_REX_BASE | X64_REX_W, 0x99}; - mc->emit_bytes(mc, buf, 2); - } else { - u8 op = 0x99; - mc->emit_bytes(mc, &op, 1); - } + u8 buf[16]; + u32 n = x64_nullary_pack( + (X64Nullary){.w = w, .opc0 = X64_OPC_CDQ_CQO}, buf); + mc->emit_bytes(mc, buf, n); } void emit_xor_self(MCEmitter *mc, int w, u32 r) { - emit_alu_rr(mc, w, 0x31, r, r); + emit_alu_rr(mc, w, X64_OPC_ALU_XOR, r, r); } /* cmp r/m, imm8 (0x83 /7). */ void emit_cmp_imm8(MCEmitter *mc, int w, u32 reg, i8 imm) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, 0, 0, reg); - u8 buf[3]; - buf[0] = 0x83; - buf[1] = modrm(3u, 7u, reg); - buf[2] = (u8)imm; - mc->emit_bytes(mc, buf, 3); + u8 buf[16]; + u32 n = x64_alu_imm8_pack( + (X64AluRmImm8){.w = w, .sub = X64_ALU_SUB_CMP, .reg = reg, .imm = imm}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -313,12 +302,10 @@ void emit_cmp_imm8(MCEmitter *mc, int w, u32 reg, i8 imm) { * OR=1, ADC=2, SBB=3, AND=4, SUB=5, XOR=6, CMP=7. 
*/ void emit_alu_imm8(MCEmitter *mc, int w, u32 sub, u32 reg, i8 imm) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, 0, 0, reg); - u8 buf[3]; - buf[0] = 0x83; - buf[1] = modrm(3u, sub, reg); - buf[2] = (u8)imm; - mc->emit_bytes(mc, buf, 3); + u8 buf[16]; + u32 n = x64_alu_imm8_pack( + (X64AluRmImm8){.w = w, .sub = sub, .reg = reg, .imm = imm}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -326,15 +313,10 @@ void emit_alu_imm8(MCEmitter *mc, int w, u32 sub, u32 reg, i8 imm) { /* ALU r/m, imm32: opcode 0x81 /sub id (sign-extended for w=1). */ void emit_alu_imm32(MCEmitter *mc, int w, u32 sub, u32 reg, i32 imm) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, 0, 0, reg); - u8 buf[6]; - buf[0] = 0x81; - buf[1] = modrm(3u, sub, reg); - buf[2] = (u8)(imm & 0xFF); - buf[3] = (u8)((imm >> 8) & 0xFF); - buf[4] = (u8)((imm >> 16) & 0xFF); - buf[5] = (u8)((imm >> 24) & 0xFF); - mc->emit_bytes(mc, buf, 6); + u8 buf[16]; + u32 n = x64_alu_imm32_pack( + (X64AluRmImm32){.w = w, .sub = sub, .reg = reg, .imm = imm}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -345,26 +327,21 @@ void emit_alu_imm32(MCEmitter *mc, int w, u32 sub, u32 reg, i32 imm) { * forms which read-modify-write a single operand. 
*/ void emit_imul_imm8(MCEmitter *mc, int w, u32 dst, u32 src, i8 imm) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, dst, 0, src); - u8 buf[3]; - buf[0] = 0x6B; - buf[1] = modrm(3u, dst, src); - buf[2] = (u8)imm; - mc->emit_bytes(mc, buf, 3); + u8 buf[16]; + u32 n = x64_imul_rri_pack( + (X64ImulRRI){.w = w, .imm32 = 0, .dst = dst, .src = src, .imm = imm}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_imul_imm32(MCEmitter *mc, int w, u32 dst, u32 src, i32 imm) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex(mc, w, dst, 0, src); - u8 buf[6]; - buf[0] = 0x69; - buf[1] = modrm(3u, dst, src); - buf[2] = (u8)(imm & 0xFF); - buf[3] = (u8)((imm >> 8) & 0xFF); - buf[4] = (u8)((imm >> 16) & 0xFF); - buf[5] = (u8)((imm >> 24) & 0xFF); - mc->emit_bytes(mc, buf, 6); + u8 buf[16]; + u32 n = x64_imul_rri_pack( + (X64ImulRRI){.w = w, .imm32 = 1, .dst = dst, .src = src, .imm = imm}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -381,25 +358,26 @@ int imm_fits_i32(i64 imm) { } void emit_test_self(MCEmitter *mc, int w, u32 reg) { - emit_alu_rr(mc, w, 0x85, reg, reg); + emit_alu_rr(mc, w, X64_OPC_ALU_TEST, reg, reg); } void emit_setcc(MCEmitter *mc, u32 cc, u32 reg) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex_force(mc, 0, 0, 0, reg); - u8 op[2] = {0x0F, (u8)(0x90 | (cc & 0xF))}; - mc->emit_bytes(mc, op, 2); - emit_rm_reg(mc, 0u, reg); + u8 buf[16]; + u32 n = x64_setcc_pack((X64Setcc){.cc = cc, .reg = reg}, buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_movzx_r32_r8(MCEmitter *mc, u32 dst, u32 src) { u32 ofs = obj_pos(mc->obj, mc->section_id); - emit_rex_force(mc, 0, dst, 0, src); - u8 op[2] = {0x0F, 0xB6}; - mc->emit_bytes(mc, op, 2); - emit_rm_reg(mc, dst, src); + u8 buf[16]; + u32 n = x64_movzx_rr_pack( + (X64MovzxRR){.w = 0, 
.opc1 = X64_OPC_MOVZX_B, .force_rex = 1, + .dst = dst, .src = src}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } @@ -408,87 +386,93 @@ void emit_movzx_r32_r8(MCEmitter *mc, u32 dst, u32 src) { void emit_extend_rr(MCEmitter *mc, int w, int signed_ext, u32 src_size, u32 dst, u32 src) { u32 ofs = obj_pos(mc->obj, mc->section_id); + u8 buf[16]; + u32 n = 0; if (src_size == 4 && signed_ext) { /* movsxd r64, r32: REX.W 0x63 ModRM */ - emit_rex(mc, 1, dst, 0, src); - u8 op = 0x63; - mc->emit_bytes(mc, &op, 1); - emit_rm_reg(mc, dst, src); + n = x64_movsxd_pack((X64Movsxd){.dst = dst, .src = src}, buf); } else if (src_size == 4 && !signed_ext) { /* zext 32→64 is `mov r32, r32` (clears high 32). */ - emit_rex(mc, 0, src, 0, dst); - u8 op = 0x89; - mc->emit_bytes(mc, &op, 1); - emit_rm_reg(mc, src, dst); + n = x64_alu_rr_pack( + (X64AluRR){.w = 0, .op = X64_OPC_MOV_RM_R, .dst = dst, .src = src}, + buf); } else if (src_size == 1) { - emit_rex_force(mc, w, dst, 0, src); - u8 op[2] = {0x0F, signed_ext ? 0xBE : 0xB6}; - mc->emit_bytes(mc, op, 2); - emit_rm_reg(mc, dst, src); + n = x64_movzx_rr_pack( + (X64MovzxRR){.w = w, + .opc1 = signed_ext ? X64_OPC_MOVSX_B : X64_OPC_MOVZX_B, + .force_rex = 1, .dst = dst, .src = src}, + buf); } else if (src_size == 2) { - emit_rex(mc, w, dst, 0, src); - u8 op[2] = {0x0F, signed_ext ? 0xBF : 0xB7}; - mc->emit_bytes(mc, op, 2); - emit_rm_reg(mc, dst, src); + n = x64_movzx_rr_pack( + (X64MovzxRR){.w = w, + .opc1 = signed_ext ? X64_OPC_MOVSX_W : X64_OPC_MOVZX_W, + .force_rex = 0, .dst = dst, .src = src}, + buf); + } else { + /* No extension to perform (src already at least as wide as dst, e.g. + * 64→64 zext/sext). Still need a reg-to-reg move when dst != src so the + * destination holds the value. 
*/ + if (dst != src) emit_mov_rr(mc, w, dst, src); } + if (n) mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_ret(MCEmitter *mc) { - u8 op = 0xC3; + u8 op = X64_OPC_RET; mc->emit_bytes(mc, &op, 1); } static void emit_leave(MCEmitter *mc) { - u8 op = 0xC9; + u8 op = X64_OPC_LEAVE; mc->emit_bytes(mc, &op, 1); } /* ---- SSE scalar FP encoders ---- */ void emit_sse_rr(MCEmitter *mc, u8 prefix, u8 opcode, u32 dst, u32 src) { u32 ofs = obj_pos(mc->obj, mc->section_id); - if (prefix) - mc->emit_bytes(mc, &prefix, 1); - emit_rex(mc, 0, dst, 0, src); - u8 op[2] = {0x0F, opcode}; - mc->emit_bytes(mc, op, 2); - emit_rm_reg(mc, dst, src); + u8 buf[16]; + u32 n = x64_sse_rr_pack( + (X64SseRR){.prefix = prefix, .opcode = opcode, .w = 0, .dst = dst, + .src = src}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_sse_load(MCEmitter *mc, u8 prefix, u8 opcode, u32 dst, u32 base, i32 disp) { u32 ofs = obj_pos(mc->obj, mc->section_id); - if (prefix) - mc->emit_bytes(mc, &prefix, 1); - emit_rex(mc, 0, dst, 0, base); - u8 op[2] = {0x0F, opcode}; - mc->emit_bytes(mc, op, 2); - emit_mem_operand(mc, dst, base, disp); + u8 buf[16]; + u32 n = x64_sse_mem_pack( + (X64SseMem){.prefix = prefix, .opcode = opcode, .reg = dst, + .base = base, .disp = disp}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_sse_store(MCEmitter *mc, u8 prefix, u8 opcode, u32 src, u32 base, i32 disp) { u32 ofs = obj_pos(mc->obj, mc->section_id); - if (prefix) - mc->emit_bytes(mc, &prefix, 1); - emit_rex(mc, 0, src, 0, base); - u8 op[2] = {0x0F, opcode}; - mc->emit_bytes(mc, op, 2); - emit_mem_operand(mc, src, base, disp); + u8 buf[16]; + u32 n = x64_sse_mem_pack( + (X64SseMem){.prefix = prefix, .opcode = opcode, .reg = src, + .base = base, .disp = disp}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) 
debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } void emit_sse_rr_w(MCEmitter *mc, u8 prefix, u8 opcode, int w, u32 dst, u32 src) { u32 ofs = obj_pos(mc->obj, mc->section_id); - if (prefix) - mc->emit_bytes(mc, &prefix, 1); - emit_rex(mc, w, dst, 0, src); - u8 op[2] = {0x0F, opcode}; - mc->emit_bytes(mc, op, 2); - emit_rm_reg(mc, dst, src); + u8 buf[16]; + u32 n = x64_sse_rr_pack( + (X64SseRR){.prefix = prefix, .opcode = opcode, .w = w, .dst = dst, + .src = src}, + buf); + mc->emit_bytes(mc, buf, n); if (mc->debug) debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc); } diff --git a/src/arch/x64/internal.h b/src/arch/x64/internal.h @@ -142,6 +142,27 @@ extern const Reg g_int_order[6]; extern const Reg g_fp_order[10]; extern const u32 g_int_arg_regs[6]; +static inline void x64_abi_direct_reg_need(const ABIArgInfo* ai, + u32* need_int, u32* need_fp) { + *need_int = 0; + *need_fp = 0; + if (!ai || ai->kind != ABI_ARG_DIRECT) return; + for (u16 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* p = &ai->parts[i]; + if (p->cls == ABI_CLASS_FP) + ++*need_fp; + else if (p->cls == ABI_CLASS_INT) + ++*need_int; + } +} + +static inline int x64_abi_direct_to_stack(const ABIArgInfo* ai, u32 next_int, + u32 next_fp) { + u32 need_int, need_fp; + x64_abi_direct_reg_need(ai, &need_int, &need_fp); + return next_int + need_int > 6u || next_fp + need_fp > 8u; +} + /* ============================================================ * Cross-file function declarations. * diff --git a/src/arch/x64/isa.c b/src/arch/x64/isa.c @@ -0,0 +1,1066 @@ +/* x86_64 instruction descriptor table + operand print/decode dispatch. + * + * The table mirrors every encoding `src/arch/x64/emit.c` produces, plus a + * handful that show up via direct byte writes in arch/x64/{alloc,link,ops}.c + * (CALL/JMP rel32, PUSH/POP r64, multi-byte NOP, atomic prefixes). 
Each + * row pins down (leg_pfx, opcode bytes, /digit) so the disassembler can + * identify a raw byte stream with one linear pass and then dispatch on + * the format to render operands. + * + * Row ordering: first-match wins. Aliases (rows with X64_ASMFL_ALIAS) + * sit BEFORE the canonical row they alias so the disassembler prefers + * the alias spelling on output. We keep aliases narrow today (e.g., + * SSE-prefixed forms naturally precede their no-prefix neighbours) — we + * can add `xor %eax,%eax` zeroing-idiom aliases later if disasm output + * needs them. */ + +#include "arch/x64/isa.h" + +#include <stddef.h> +#include <string.h> + +#include "core/bytes.h" + +/* ==================================================================== + * Table. Mnemonics are AT&T-style, lower-case, no size suffix; the + * printer derives the size letter (b/w/l/q) from the fmt + REX.W where + * appropriate. + * ==================================================================== */ + +#define ROW(mn, lp, ol, b0, b1, b2, lm, mr, wr, f, fl) \ + {mn, lp, ol, {b0, b1, b2}, lm, mr, wr, f, fl} +#define NO_MODRM 0xFFu + +const X64InsnDesc x64_insn_table[] = { + /* ---- single-byte nullary ---- */ + ROW("nop", X64_PFX_NONE, 1, 0x90, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_NULLARY, 0), + ROW("ret", X64_PFX_NONE, 1, 0xC3, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_NULLARY, 0), + ROW("leave", X64_PFX_NONE, 1, 0xC9, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_NULLARY, 0), + ROW("cltd", X64_PFX_NONE, 1, 0x99, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_0, + X64_FMT_NULLARY, 0), + ROW("cqto", X64_PFX_NONE, 1, 0x99, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_1, + X64_FMT_NULLARY, 0), + + /* ---- two-byte UD2 ---- */ + ROW("ud2", X64_PFX_NONE, 2, 0x0F, 0x0B, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_NULLARY, 0), + ROW("mfence", X64_PFX_NONE, 3, 0x0F, 0xAE, 0xF0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_NULLARY, 0), + + /* ---- multi-byte NOP: 66 0F 1F /0 ---- + * Matches the 6-byte canonical "NOPW 
0(%rax,%rax,1)" cfree emits to pad + * the IPLT stub. The mod/rm bytes (and any disp) are consumed by the + * NOP_MULTI printer. */ + ROW("nopw", X64_PFX_66, 2, 0x0F, 0x1F, 0, 0xFF, 0, X64_W_REQ_ANY, + X64_FMT_NOP_MULTI, 0), + ROW("nopl", X64_PFX_NONE, 2, 0x0F, 0x1F, 0, 0xFF, 0, X64_W_REQ_ANY, + X64_FMT_NOP_MULTI, 0), + + /* ---- PUSH/POP r64 (embed-reg in low 3 bits) ---- */ + ROW("push", X64_PFX_NONE, 1, 0x50, 0, 0, 0xF8, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_PUSH_POP, X64_ASMFL_FORCE_W64), + ROW("pop", X64_PFX_NONE, 1, 0x58, 0, 0, 0xF8, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_PUSH_POP, X64_ASMFL_FORCE_W64), + + /* ---- MOV r, imm — B8+rd; width via REX.W ---- + * imm32 form (no REX.W) and imm64 movabs form (REX.W=1) share the + * same row; the printer reads ctx->rex_w to pick the imm width. */ + ROW("mov", X64_PFX_NONE, 1, 0xB8, 0, 0, 0xF8, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_MOV_RI, X64_ASMFL_W_FROM_REX), + + /* ---- ALU r/m, r — opcode picks op ---- */ + ROW("add", X64_PFX_NONE, 1, 0x01, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX), + ROW("or", X64_PFX_NONE, 1, 0x09, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX), + ROW("and", X64_PFX_NONE, 1, 0x21, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX), + ROW("sub", X64_PFX_NONE, 1, 0x29, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX), + ROW("xor", X64_PFX_NONE, 1, 0x31, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX), + ROW("cmp", X64_PFX_NONE, 1, 0x39, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX), + ROW("test", X64_PFX_NONE, 1, 0x85, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX), + ROW("mov", X64_PFX_NONE, 1, 0x89, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W_FROM_REX), + /* Byte form: MOV r/m8, r8 — opcode 88 forces 1-byte operands. 
*/ + ROW("mov", X64_PFX_NONE, 1, 0x88, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_BYTE), + /* 16-bit form: 0x66 prefix forces 2-byte operands. */ + ROW("mov", X64_PFX_66, 1, 0x89, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_ALU_RR, X64_ASMFL_W16), + + /* ---- MOV r, r/m (load and reg-reg share opcode 8B) ---- + * 8B /r matches both r,r and r,[base+disp]; the printer dispatches on + * ModR/M.mod. LEA is 8D /r — register-only ModR/M.mod=11 is illegal, + * so we use a separate row keyed on the opcode. */ + ROW("mov", X64_PFX_NONE, 1, 0x8B, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_MOV_RM_LOAD, X64_ASMFL_W_FROM_REX), + /* 16-bit r←r/m via 0x66 prefix. */ + ROW("mov", X64_PFX_66, 1, 0x8B, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_MOV_RM_LOAD, X64_ASMFL_W16), + ROW("lea", X64_PFX_NONE, 1, 0x8D, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_MOV_RM_LOAD, X64_ASMFL_W_FROM_REX), + + /* ---- MOVZX / MOVSX r32, r/m{8,16} ---- */ + ROW("movzbl", X64_PFX_NONE, 2, 0x0F, 0xB6, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_MOVZX_MOVSX, 0), + ROW("movzwl", X64_PFX_NONE, 2, 0x0F, 0xB7, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_MOVZX_MOVSX, 0), + ROW("movsbl", X64_PFX_NONE, 2, 0x0F, 0xBE, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_MOVZX_MOVSX, 0), + ROW("movswl", X64_PFX_NONE, 2, 0x0F, 0xBF, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_MOVZX_MOVSX, 0), + + /* ---- MOVSXD r64, r/m32 ---- */ + ROW("movslq", X64_PFX_NONE, 1, 0x63, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_1, + X64_FMT_MOVSXD, 0), + + /* ---- ALU r/m, imm — /digit picks operation ---- + * 83 (imm8 sign-extended), 81 (imm32 sign-extended). One row per + * (opcode, /digit) pair. + * /0 ADD /1 OR /4 AND /5 SUB /6 XOR /7 CMP + * (/2 ADC and /3 SBB are also valid in the Intel manual but cfree + * doesn't emit them; they can land later as additional rows.) 
*/ + ROW("add", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 0, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX), + ROW("or", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 1, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX), + ROW("and", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 4, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX), + ROW("sub", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 5, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX), + ROW("xor", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 6, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX), + ROW("cmp", X64_PFX_NONE, 1, 0x83, 0, 0, 0xFF, 7, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM8, X64_ASMFL_W_FROM_REX), + ROW("add", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 0, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX), + ROW("or", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 1, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX), + ROW("and", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 4, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX), + ROW("sub", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 5, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX), + ROW("xor", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 6, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX), + ROW("cmp", X64_PFX_NONE, 1, 0x81, 0, 0, 0xFF, 7, X64_W_REQ_ANY, + X64_FMT_ALU_RM_IMM32, X64_ASMFL_W_FROM_REX), + + /* ---- IMUL r, r/m (0F AF) / IMUL r, r/m, imm (69 / 6B) ---- */ + ROW("imul", X64_PFX_NONE, 2, 0x0F, 0xAF, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_IMUL_RR, X64_ASMFL_W_FROM_REX), + ROW("imul", X64_PFX_NONE, 1, 0x6B, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_IMUL_RRI, X64_ASMFL_W_FROM_REX), + ROW("imul", X64_PFX_NONE, 1, 0x69, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_IMUL_RRI, X64_ASMFL_W_FROM_REX | 0x80u /* imm32 */), + + /* ---- F7 /sub family (no immediate read except for /0 /1 which we + * don't emit) ---- */ + ROW("not", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 2, X64_W_REQ_ANY, + X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX), + 
ROW("neg", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 3, X64_W_REQ_ANY, + X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX), + ROW("mul", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 4, X64_W_REQ_ANY, + X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX), + ROW("imul", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 5, X64_W_REQ_ANY, + X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX), + ROW("div", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 6, X64_W_REQ_ANY, + X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX), + ROW("idiv", X64_PFX_NONE, 1, 0xF7, 0, 0, 0xFF, 7, X64_W_REQ_ANY, + X64_FMT_F7_RM, X64_ASMFL_W_FROM_REX), + + /* ---- Shifts ---- */ + ROW("rol", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 0, X64_W_REQ_ANY, + X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX), + ROW("ror", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 1, X64_W_REQ_ANY, + X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX), + ROW("shl", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 4, X64_W_REQ_ANY, + X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX), + ROW("shr", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 5, X64_W_REQ_ANY, + X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX), + ROW("sar", X64_PFX_NONE, 1, 0xC1, 0, 0, 0xFF, 7, X64_W_REQ_ANY, + X64_FMT_SHIFT_IMM, X64_ASMFL_W_FROM_REX), + /* 16-bit ROL imm8 via 0x66 + C1 /0 — used by emit_rol16_imm8. */ + ROW("rol", X64_PFX_66, 1, 0xC1, 0, 0, 0xFF, 0, X64_W_REQ_ANY, + X64_FMT_SHIFT_IMM, X64_ASMFL_W16), + ROW("shl", X64_PFX_NONE, 1, 0xD3, 0, 0, 0xFF, 4, X64_W_REQ_ANY, + X64_FMT_SHIFT_CL, X64_ASMFL_W_FROM_REX), + ROW("shr", X64_PFX_NONE, 1, 0xD3, 0, 0, 0xFF, 5, X64_W_REQ_ANY, + X64_FMT_SHIFT_CL, X64_ASMFL_W_FROM_REX), + ROW("sar", X64_PFX_NONE, 1, 0xD3, 0, 0, 0xFF, 7, X64_W_REQ_ANY, + X64_FMT_SHIFT_CL, X64_ASMFL_W_FROM_REX), + + /* ---- Branches ---- */ + /* Jcc near: 0F 80..8F rel32; condition in low 4 bits. The printer + * picks the mnemonic from a per-condition table. 
*/ + ROW("j", X64_PFX_NONE, 2, 0x0F, 0x80, 0, 0xF0, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_JCC_REL32, 0), + ROW("jmp", X64_PFX_NONE, 1, 0xE9, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_JMP_REL32, 0), + ROW("callq", X64_PFX_NONE, 1, 0xE8, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_CALL_REL32, 0), + /* Indirect jmp / call via FF /4 or /2. */ + ROW("callq", X64_PFX_NONE, 1, 0xFF, 0, 0, 0xFF, 2, X64_W_REQ_ANY, + X64_FMT_BR_RM, 0), + ROW("jmpq", X64_PFX_NONE, 1, 0xFF, 0, 0, 0xFF, 4, X64_W_REQ_ANY, + X64_FMT_BR_RM, 0), + + /* ---- SETcc / CMOVcc ---- + * SETcc condition in low 4 bits of 2nd opcode byte (0F 90..9F). + * CMOVcc same encoding around 0F 40..4F. */ + ROW("set", X64_PFX_NONE, 2, 0x0F, 0x90, 0, 0xF0, 0, X64_W_REQ_ANY, + X64_FMT_SETCC_RM, 0), + ROW("cmov", X64_PFX_NONE, 2, 0x0F, 0x40, 0, 0xF0, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_CMOVCC_RR, X64_ASMFL_W_FROM_REX), + + /* ---- BSWAP r — 0F C8+rd ---- */ + ROW("bswap", X64_PFX_NONE, 2, 0x0F, 0xC8, 0, 0xF8, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_BSWAP, X64_ASMFL_W_FROM_REX), + + /* ---- Bit scan: BSF / BSR ---- */ + ROW("bsf", X64_PFX_NONE, 2, 0x0F, 0xBC, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_BS, X64_ASMFL_W_FROM_REX), + ROW("bsr", X64_PFX_NONE, 2, 0x0F, 0xBD, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_BS, X64_ASMFL_W_FROM_REX), + + /* ---- POPCNT — F3 0F B8 /r (note: F3 prefix is REQUIRED) ---- */ + ROW("popcnt", X64_PFX_F3, 2, 0x0F, 0xB8, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_POPCNT, X64_ASMFL_W_FROM_REX), + + /* ---- Atomic primitives ---- */ + /* XADD m, r — 0F C1 /r (LOCK prefix is decoded separately) */ + ROW("xadd", X64_PFX_NONE, 2, 0x0F, 0xC1, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_XADD_MEM, X64_ASMFL_W_FROM_REX), + /* XCHG r, r/m — 0x87 /r */ + ROW("xchg", X64_PFX_NONE, 1, 0x87, 0, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_XCHG_MEM, X64_ASMFL_W_FROM_REX), + /* CMPXCHG m, r — 0F B1 /r */ + ROW("cmpxchg", X64_PFX_NONE, 2, 0x0F, 0xB1, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_CMPXCHG_MEM, 
X64_ASMFL_W_FROM_REX), + + /* ---- SSE scalar FP — F2/F3 0F xx /r ---- + * Three opcodes per (sd, ss) pair: arith / mov / cmp. Each row pairs + * the legacy prefix (selects sd vs ss) with the 0F xx /r opcode. */ + /* MOVSS / MOVSD */ + ROW("movsd", X64_PFX_F2, 2, 0x0F, 0x10, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("movsd", X64_PFX_F2, 2, 0x0F, 0x11, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, X64_ASMFL_ALIAS), + ROW("movss", X64_PFX_F3, 2, 0x0F, 0x10, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("movss", X64_PFX_F3, 2, 0x0F, 0x11, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, X64_ASMFL_ALIAS), + /* ADD/SUB/MUL/DIV — opcodes 58/5C/59/5E (same byte for ss and sd; + * prefix picks). */ + ROW("addsd", X64_PFX_F2, 2, 0x0F, 0x58, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("addss", X64_PFX_F3, 2, 0x0F, 0x58, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("mulsd", X64_PFX_F2, 2, 0x0F, 0x59, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("mulss", X64_PFX_F3, 2, 0x0F, 0x59, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("subsd", X64_PFX_F2, 2, 0x0F, 0x5C, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("subss", X64_PFX_F3, 2, 0x0F, 0x5C, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("divsd", X64_PFX_F2, 2, 0x0F, 0x5E, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + ROW("divss", X64_PFX_F3, 2, 0x0F, 0x5E, 0, 0xFF, NO_MODRM, X64_W_REQ_ANY, + X64_FMT_SSE_RR, 0), + /* Compare scalar (UCOMISS / UCOMISD) */ + ROW("ucomisd", X64_PFX_66, 2, 0x0F, 0x2E, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_SSE_RR, 0), + ROW("ucomiss", X64_PFX_NONE, 2, 0x0F, 0x2E, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_SSE_RR, 0), + /* Conversions touched by FP↔int paths: CVTSI2SS/SD, CVTTSS/SD2SI. 
*/ + ROW("cvtsi2sd", X64_PFX_F2, 2, 0x0F, 0x2A, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_SSE_RR, X64_ASMFL_W_FROM_REX), + ROW("cvtsi2ss", X64_PFX_F3, 2, 0x0F, 0x2A, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_SSE_RR, X64_ASMFL_W_FROM_REX), + ROW("cvttsd2si", X64_PFX_F2, 2, 0x0F, 0x2C, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_SSE_RR, X64_ASMFL_W_FROM_REX), + ROW("cvttss2si", X64_PFX_F3, 2, 0x0F, 0x2C, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_SSE_RR, X64_ASMFL_W_FROM_REX), + ROW("cvtsd2ss", X64_PFX_F2, 2, 0x0F, 0x5A, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_SSE_RR, 0), + ROW("cvtss2sd", X64_PFX_F3, 2, 0x0F, 0x5A, 0, 0xFF, NO_MODRM, + X64_W_REQ_ANY, X64_FMT_SSE_RR, 0), +}; + +const u32 x64_insn_table_n = (u32)(sizeof x64_insn_table / + sizeof x64_insn_table[0]); + +/* ==================================================================== + * Prefix decode. + * ==================================================================== */ + +u32 x64_decode_prefixes(const u8* bytes, u32 len, X64DecodeCtx* ctx) { + u32 off = 0; + memset(ctx, 0, sizeof *ctx); + while (off < len) { + u8 b = bytes[off]; + if (b == 0x66u || b == 0xF2u || b == 0xF3u) { + ctx->leg_pfx = b; + ++off; + continue; + } + if (b == 0xF0u) { + /* LOCK — ignored for opcode lookup but consumed so the + * subsequent opcode aligns. The printer adds a "lock " prefix + * separately when annotating, but cfree's emit.c currently emits + * LOCK only before XADD / XCHG / CMPXCHG. */ + ctx->has_lock = 1; + ++off; + continue; + } + break; + } + if (off < len && bytes[off] >= 0x40u && bytes[off] <= 0x4Fu) { + u8 r = bytes[off]; + ctx->has_rex = 1; + ctx->rex_w = (r >> 3) & 1u; + ctx->rex_r = (r >> 2) & 1u; + ctx->rex_x = (r >> 1) & 1u; + ctx->rex_b = r & 1u; + ++off; + } + ctx->opc_off = off; + return off; +} + +/* ==================================================================== + * Disassembler row lookup. 
+ * ==================================================================== */ + +const X64InsnDesc* x64_disasm_find(const u8* bytes, u32 len, + X64DecodeCtx* ctx) { + if (ctx->opc_off >= len) return NULL; + for (u32 i = 0; i < x64_insn_table_n; ++i) { + const X64InsnDesc* d = &x64_insn_table[i]; + if (d->leg_pfx != ctx->leg_pfx) continue; + if (d->rex_w_req == X64_W_REQ_1 && !ctx->rex_w) continue; + if (d->rex_w_req == X64_W_REQ_0 && ctx->rex_w) continue; + if (ctx->opc_off + d->opc_len > len) continue; + /* Opcode bytes match exactly except the LAST byte, which may use + * a low-bit mask (embed-reg or condition nibble). */ + int ok = 1; + for (u32 j = 0; j + 1u < d->opc_len; ++j) { + if (bytes[ctx->opc_off + j] != d->opc[j]) { + ok = 0; + break; + } + } + if (!ok) continue; + { + u8 last_act = bytes[ctx->opc_off + d->opc_len - 1u] & d->opc_last_mask; + u8 last_exp = d->opc[d->opc_len - 1u] & d->opc_last_mask; + if (last_act != last_exp) continue; + } + /* /digit constraint reads ModR/M.reg. */ + if (d->modrm_reg != NO_MODRM) { + u32 mrm_off = ctx->opc_off + d->opc_len; + if (mrm_off >= len) continue; + u8 mrm = bytes[mrm_off]; + if (((mrm >> 3) & 7u) != d->modrm_reg) continue; + } + return d; + } + return NULL; +} + +/* ==================================================================== + * Operand printers. + * ==================================================================== */ + +#define X64_REG_RIP 16u + +static const char* g_cc_name[16] = { + "o", "no", "b", "ae", "e", "ne", "be", "a", + "s", "ns", "p", "np", "l", "ge", "le", "g", +}; + +/* AT&T register names by width. Index 0..15 covers RAX..R15. 
*/ +static const char* reg_name(u32 reg, u32 width_bytes, int has_rex) { + static const char* r64[16] = { + "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + }; + static const char* r32[16] = { + "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", + "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d", + }; + static const char* r16[16] = { + "ax", "cx", "dx", "bx", "sp", "bp", "si", "di", + "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w", + }; + static const char* r8[16] = { + "al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil", + "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b", + }; + static const char* rh8[4] = {"ah", "ch", "dh", "bh"}; + reg &= 15u; + if (width_bytes == 8) return r64[reg]; + if (width_bytes == 4) return r32[reg]; + if (width_bytes == 2) return r16[reg]; + if (!has_rex && reg >= 4u && reg <= 7u) return rh8[reg - 4u]; + return r8[reg]; +} + +static const char* xmm_name(u32 reg) { + static const char* x[16] = { + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + }; + return x[reg & 15u]; +} + +static void put_reg(StrBuf* sb, u32 reg, u32 width) { + strbuf_putc(sb, '%'); + strbuf_puts(sb, reg_name(reg, width, 1)); +} +static void put_reg_ctx(StrBuf* sb, u32 reg, u32 width, int has_rex) { + strbuf_putc(sb, '%'); + strbuf_puts(sb, reg_name(reg, width, has_rex)); +} +static void put_xmm(StrBuf* sb, u32 reg) { + strbuf_putc(sb, '%'); + strbuf_puts(sb, xmm_name(reg)); +} +static void put_imm(StrBuf* sb, i64 imm) { + strbuf_putc(sb, '$'); + strbuf_put_i64(sb, imm); +} + +/* Read a signed displacement of n bytes (1 or 4). Returns 1 on success. 
*/ +static int read_disp(const u8* bytes, u32 len, u32 off, u32 n, i32* out) { + if (off + n > len) return 0; + if (n == 1u) { + *out = (i32)(i8)bytes[off]; + } else if (n == 4u) { + *out = (i32)rd_u32_le(bytes + off); + } else { + *out = 0; + } + return 1; +} + +/* Decode the SIB/disp bytes that follow a ModR/M byte: bytes[off] is the + * first byte AFTER ModR/M, whose mod/rm the caller passes as `mod`/`rm_low`. + * Returns the SIB + disp byte count (0 for mod=3 or a bare [reg]), or + * (u32)-1 on truncation. `out` describes what to print. */ +typedef struct DecodedMem { + u32 base; + i32 disp; + int has_base; + int rip_relative; + u32 bytes_used; +} DecodedMem; + +static u32 decode_mem(const u8* bytes, u32 len, u32 off, X64DecodeCtx ctx, + u32 mod, u32 rm_low, DecodedMem* out) { + out->base = 0; + out->disp = 0; + out->has_base = 1; + out->rip_relative = 0; + out->bytes_used = 0; + if (mod == 3u) return 0; /* caller handles reg-form */ + /* SIB form (r/m=100): only the base field is decoded, not scale/index. */ + if (rm_low == 4u) { + if (off >= len) return (u32)-1; + u8 s = bytes[off]; + u32 sib_base = (s & 7u) | ((u32)ctx.rex_b << 3); + u32 used = 1; + if (mod == 0u && (s & 7u) == 5u) { + /* mod=00, base=101: disp32 with no base — flagged has_base=0, not + * rip_relative (cfree uses this for label-table addressing). */ + i32 d = 0; + if (!read_disp(bytes, len, off + used, 4, &d)) return (u32)-1; + used += 4; + out->disp = d; + out->has_base = 0; + out->bytes_used = used; + return used; + } + if (mod == 1u) { + i32 d = 0; + if (!read_disp(bytes, len, off + used, 1, &d)) return (u32)-1; + used += 1; + out->disp = d; + } else if (mod == 2u) { + i32 d = 0; + if (!read_disp(bytes, len, off + used, 4, &d)) return (u32)-1; + used += 4; + out->disp = d; + } + out->base = sib_base; + out->bytes_used = used; + return used; + } + /* Non-SIB form. */ + if (mod == 0u && rm_low == 5u) { + /* RIP-relative disp32. 
*/ + i32 d = 0; + if (!read_disp(bytes, len, off, 4, &d)) return (u32)-1; + out->disp = d; + out->rip_relative = 1; + out->bytes_used = 4; + return 4; + } + u32 base = rm_low | ((u32)ctx.rex_b << 3); + out->base = base; + if (mod == 1u) { + i32 d = 0; + if (!read_disp(bytes, len, off, 1, &d)) return (u32)-1; + out->disp = d; + out->bytes_used = 1; + return 1; + } + if (mod == 2u) { + i32 d = 0; + if (!read_disp(bytes, len, off, 4, &d)) return (u32)-1; + out->disp = d; + out->bytes_used = 4; + return 4; + } + /* mod == 0u with rm != 5,4 → [reg], no disp. */ + return 0; +} + +static void put_mem(StrBuf* sb, const DecodedMem* m) { + if (m->disp != 0 || (!m->has_base && !m->rip_relative)) { + strbuf_put_i64(sb, (i64)m->disp); + } + if (m->rip_relative) { + strbuf_puts(sb, "(%rip)"); + } else if (m->has_base) { + strbuf_putc(sb, '('); + put_reg(sb, m->base, 8); + strbuf_putc(sb, ')'); + } +} + +/* ==================================================================== + * Width derivation. + * ==================================================================== */ + +static u32 width_for(const X64InsnDesc* d, const X64DecodeCtx* ctx) { + if (d->flags & X64_ASMFL_FORCE_W64) return 8u; + if (d->flags & X64_ASMFL_BYTE) return 1u; + if (d->flags & X64_ASMFL_W16) return 2u; + if (d->flags & X64_ASMFL_W_FROM_REX) return ctx->rex_w ? 
8u : 4u; + if (d->leg_pfx == X64_PFX_66) return 2u; + return 4u; +} + +char x64_size_suffix_for(const X64InsnDesc* desc, const X64DecodeCtx* ctx) { + switch ((X64Format)desc->fmt) { + case X64_FMT_ALU_RR: + case X64_FMT_MOV_RM_LOAD: + case X64_FMT_ALU_RM_IMM8: + case X64_FMT_ALU_RM_IMM32: + case X64_FMT_IMUL_RR: + case X64_FMT_IMUL_RRI: + case X64_FMT_F7_RM: + case X64_FMT_SHIFT_IMM: + case X64_FMT_SHIFT_CL: + case X64_FMT_BSWAP: + case X64_FMT_BS: + case X64_FMT_POPCNT: + case X64_FMT_XADD_MEM: + case X64_FMT_XCHG_MEM: + case X64_FMT_CMPXCHG_MEM: + case X64_FMT_MOV_RI: + switch (width_for(desc, ctx)) { + case 1: + return 'b'; + case 2: + return 'w'; + case 4: + return 'l'; + case 8: + return 'q'; + } + return 0; + default: + return 0; + } +} + +/* ==================================================================== + * Per-format printers. + * ==================================================================== */ + +/* Decoded ModR/M (reg + r/m). read_modrm returns 1 on success, 0 on + * truncation; the SIB/disp length is stored in bytes_after_modrm. */ +typedef struct RegRm { + u32 reg; /* high bit from REX.R */ + u32 rm_low; /* low 3 bits */ + u32 mod; + u32 bytes_after_modrm; /* SIB/disp bytes */ + DecodedMem mem; /* valid iff mod != 3 */ +} RegRm; + +static int read_modrm(const u8* bytes, u32 len, u32 off, X64DecodeCtx ctx, + RegRm* rr) { + if (off >= len) return 0; + u8 mr = bytes[off]; + rr->mod = (mr >> 6) & 3u; + rr->reg = ((mr >> 3) & 7u) | ((u32)ctx.rex_r << 3); + rr->rm_low = mr & 7u; + if (rr->mod == 3u) { + rr->bytes_after_modrm = 0; + memset(&rr->mem, 0, sizeof rr->mem); + return 1; + } + u32 used = decode_mem(bytes, len, off + 1u, ctx, rr->mod, rr->rm_low, + &rr->mem); + if (used == (u32)-1) return 0; + rr->bytes_after_modrm = used; + return 1; +} + +/* Print a ModR/M r/m operand at width `w`. 
*/ +static void put_rm(StrBuf* sb, const RegRm* rr, X64DecodeCtx ctx, u32 w) { + if (rr->mod == 3u) { + u32 rm = rr->rm_low | ((u32)ctx.rex_b << 3); + put_reg_ctx(sb, rm, w, ctx.has_rex); + } else { + put_mem(sb, &rr->mem); + } +} +static void put_rm_xmm(StrBuf* sb, const RegRm* rr, X64DecodeCtx ctx) { /* Like put_rm, but a register-direct r/m (mod == 3) is printed through put_xmm; memory operands still go through put_mem. */ + if (rr->mod == 3u) { + u32 rm = rr->rm_low | ((u32)ctx.rex_b << 3); + put_xmm(sb, rm); + } else { + put_mem(sb, &rr->mem); + } +} + /* Operand-less format: writes nothing to sb and returns the offset just past the opcode bytes (opc_off + opc_len). */ +static u32 print_nullary(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + (void)sb; + (void)d; + (void)bytes; + (void)len; + return ctx->opc_off + d->opc_len; +} + /* Register-in-opcode format: the register number is the low 3 bits of the opcode byte extended by REX.B, printed via put_reg at width 8. Always consumes exactly one opcode byte. */ +static u32 print_push_pop(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + (void)len; + u32 reg = (bytes[ctx->opc_off] & 7u) | ((u32)ctx->rex_b << 3); + put_reg(sb, reg, 8); + (void)d; + return ctx->opc_off + 1u; +} + /* Register-in-opcode move-immediate, printed as "imm, reg". With REX.W the immediate is 8 bytes and the register prints at width 8; otherwise the immediate is 4 bytes (sign-extended for printing) and the register prints at width 4. Returns 0 when the immediate would run past len. */ +static u32 print_mov_ri(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + (void)d; + u32 reg = (bytes[ctx->opc_off] & 7u) | ((u32)ctx->rex_b << 3); + u32 off = ctx->opc_off + 1u; + if (ctx->rex_w) { + if (off + 8u > len) return 0; + put_imm(sb, (i64)rd_u64_le(bytes + off)); + off += 8u; + strbuf_puts(sb, ", "); + put_reg(sb, reg, 8); + } else { + if (off + 4u > len) return 0; + put_imm(sb, (i64)(i32)rd_u32_le(bytes + off)); + off += 4u; + strbuf_puts(sb, ", "); + put_reg(sb, reg, 4); + } + return off; +} + +static u32 print_alu_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + /* op r/m, r (reg is the source). Width comes from width_for, which + * honours the BYTE / W16 / W_FROM_REX flags on the descriptor. 
*/ + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + strbuf_puts(sb, ", "); + put_rm(sb, &rr, *ctx, w); + return off + 1u + rr.bytes_after_modrm; +} + /* Load-direction printer: writes the r/m operand, then the ModR/M reg operand. Returns the offset just past ModR/M/SIB/disp, or 0 when read_modrm fails. */ +static u32 print_mov_rm_load(StrBuf* sb, const X64InsnDesc* d, + const u8* bytes, u32 len, + const X64DecodeCtx* ctx) { + /* op r, r/m. */ + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + if (d->opc[0] == 0x8Du) w = 8u; /* LEA always loads a 64-bit address */ + put_rm(sb, &rr, *ctx, w); + strbuf_puts(sb, ", "); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + return off + 1u + rr.bytes_after_modrm; +} + /* Width-extending moves: r/m prints at the source width and the reg operand at the destination width, so the two operands may differ in size. Returns 0 when read_modrm fails. */ +static u32 print_movzx_movsx(StrBuf* sb, const X64InsnDesc* d, + const u8* bytes, u32 len, + const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + /* Source width = 1 for B6/BE, 2 for B7/BF. Destination width = 4 unless + * REX.W (then 8). */ + u32 src_w = (d->opc[1] == 0xB7u || d->opc[1] == 0xBFu) ? 2u : 1u; + u32 dst_w = ctx->rex_w ? 
8u : 4u; + put_rm(sb, &rr, *ctx, src_w); + strbuf_puts(sb, ", "); + put_reg_ctx(sb, rr.reg, dst_w, ctx->has_rex); + return off + 1u + rr.bytes_after_modrm; +} + +/* MOVSXD: the r/m source always prints at width 4 and the reg destination at + * width 8. Returns 0 when read_modrm fails. */ +static u32 print_movsxd(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + put_rm(sb, &rr, *ctx, 4u); + strbuf_puts(sb, ", "); + put_reg_ctx(sb, rr.reg, 8u, ctx->has_rex); + return off + 1u + rr.bytes_after_modrm; +} + +/* ALU r/m, imm — printed as "imm, r/m". + * + * The immediate that follows ModR/M/SIB/disp is: + * - 1 byte (sign-extended) for X64_FMT_ALU_RM_IMM8; + * - 2 bytes (sign-extended) for the imm32 format when the descriptor + * selects a 16-bit operand size, because the operand-size override + * shrinks the `id` immediate of opcode 81 to `iw`; + * - 4 bytes (sign-extended) otherwise. + * + * Returns the total instruction length, or 0 if ModR/M or the immediate + * is truncated. */ +static u32 print_alu_rm_imm(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 used = 1u + rr.bytes_after_modrm; + u32 w = width_for(d, ctx); + i64 imm = 0; + if (d->fmt == X64_FMT_ALU_RM_IMM8) { + if (off + used >= len) return 0; + imm = (i64)(i8)bytes[off + used]; + used += 1u; + } else if (w == 2u) { + /* 16-bit operand size: the immediate is imm16, not imm32. */ + if (off + used + 1u >= len) return 0; + u32 lo16 = (u32)bytes[off + used] | ((u32)bytes[off + used + 1u] << 8); + imm = (i64)(lo16 ^ 0x8000u) - (i64)0x8000; /* sign-extend 16 -> 64 */ + used += 2u; + } else { + if (off + used + 3u >= len) return 0; + imm = (i64)(i32)rd_u32_le(bytes + off + used); + used += 4u; + } + put_imm(sb, imm); + strbuf_puts(sb, ", "); + put_rm(sb, &rr, *ctx, w); + return off + used; +} + +/* Two-operand IMUL: prints r/m (source) then reg (destination), both at + * width_for(d, ctx). Returns 0 when read_modrm fails. */ +static u32 print_imul_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + put_rm(sb, &rr, *ctx, w); + strbuf_puts(sb, ", "); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + return off + 1u + rr.bytes_after_modrm; +} + +static u32 print_imul_rri(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + /* 69 /r imm32 (full) or 6B /r imm8 (sign-extended). 
*/ + /* Printed as "imm, r/m, reg". When the descriptor selects a 16-bit operand + * size, opcode 69 carries imm16 rather than imm32, so only 2 bytes are + * consumed. Returns the total instruction length, or 0 if truncated. */ + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 used = 1u + rr.bytes_after_modrm; + u32 w = width_for(d, ctx); + i64 imm = 0; + u8 op = d->opc[0]; + if (op == 0x6Bu) { + if (off + used >= len) return 0; + imm = (i64)(i8)bytes[off + used]; + used += 1u; + } else if (w == 2u) { + if (off + used + 1u >= len) return 0; + u32 lo16 = (u32)bytes[off + used] | ((u32)bytes[off + used + 1u] << 8); + imm = (i64)(lo16 ^ 0x8000u) - (i64)0x8000; /* sign-extend 16 -> 64 */ + used += 2u; + } else { + if (off + used + 3u >= len) return 0; + imm = (i64)(i32)rd_u32_le(bytes + off + used); + used += 4u; + } + put_imm(sb, imm); + strbuf_puts(sb, ", "); + put_rm(sb, &rr, *ctx, w); + strbuf_puts(sb, ", "); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + return off + used; +} + +/* Single-operand group: prints only the r/m operand at width_for(d, ctx). + * Returns 0 when read_modrm fails. */ +static u32 print_f7_rm(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + put_rm(sb, &rr, *ctx, w); + return off + 1u + rr.bytes_after_modrm; +} + +/* Shift by immediate: printed as "imm8, r/m"; the count byte is printed + * unsigned. Returns 0 if ModR/M or the count byte is truncated. */ +static u32 print_shift_imm(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 used = 1u + rr.bytes_after_modrm; + if (off + used >= len) return 0; + u8 imm = bytes[off + used]; + ++used; + u32 w = width_for(d, ctx); + put_imm(sb, (i64)imm); + strbuf_puts(sb, ", "); + put_rm(sb, &rr, *ctx, w); + return off + used; +} + +/* Shift by %cl: printed as "%cl, r/m". Returns 0 when read_modrm fails. */ +static u32 print_shift_cl(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + strbuf_puts(sb, "%cl, "); + put_rm(sb, &rr, *ctx, w); + return off + 1u + rr.bytes_after_modrm; +} + +static u32 print_jcc_rel32(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx, u64 vaddr) { + u32 off = ctx->opc_off + d->opc_len; + if (off + 4u > len) return 0; + i32 rel = (i32)rd_u32_le(bytes + 
off); + u64 tgt = vaddr + (u64)(off + 4u) + (u64)rel; + /* Only a separating space and the hex target are written here; no condition suffix is appended by this function. NOTE(review): the suffix is presumably spelled by the caller (x64_cc_name exists for that) - confirm. The target is vaddr plus the instruction length (off + 4) plus the signed rel32. */ + strbuf_putc(sb, ' '); + strbuf_put_hex_u64(sb, tgt); + return off + 4u; +} + /* Direct JMP/CALL: same target computation as print_jcc_rel32 (vaddr + instruction length + rel32), but only the hex target is written, with no leading space. Returns 0 if the rel32 is truncated. */ +static u32 print_jmp_call_rel32(StrBuf* sb, const X64InsnDesc* d, + const u8* bytes, u32 len, + const X64DecodeCtx* ctx, u64 vaddr) { + u32 off = ctx->opc_off + d->opc_len; + if (off + 4u > len) return 0; + i32 rel = (i32)rd_u32_le(bytes + off); + u64 tgt = vaddr + (u64)(off + 4u) + (u64)rel; + strbuf_put_hex_u64(sb, tgt); + return off + 4u; +} + /* Indirect branch operand: a literal '*' followed by the r/m operand at width 8. Returns 0 when read_modrm fails. */ +static u32 print_br_rm(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + strbuf_putc(sb, '*'); + put_rm(sb, &rr, *ctx, 8u); + return off + 1u + rr.bytes_after_modrm; +} + /* SETcc operand: prints only the r/m operand, at width 1; the ModR/M reg field is not printed. */ +static u32 print_setcc(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + put_rm(sb, &rr, *ctx, 1u); + (void)d; + return off + 1u + rr.bytes_after_modrm; +} + /* CMOVcc operands: r/m (source) then reg (destination), both at width_for(d, ctx). */ +static u32 print_cmovcc_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + put_rm(sb, &rr, *ctx, w); + strbuf_puts(sb, ", "); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + return off + 1u + rr.bytes_after_modrm; +} + /* BSWAP operand: the register number is the low 3 bits of the second opcode byte extended by REX.B; no ModR/M byte is read. */ +static u32 print_bswap(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + (void)d; + (void)len; + u32 reg = (bytes[ctx->opc_off + 1u] & 7u) | ((u32)ctx->rex_b << 3); + u32 w = ctx->rex_w ? 
8u : 4u; + put_reg_ctx(sb, reg, w, ctx->has_rex); + return ctx->opc_off + 2u; +} + +static u32 print_bs(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, u32 len, + const X64DecodeCtx* ctx) { + /* dst = bsr/bsf(src). Operand order in AT&T is "src, dst". */ + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + put_rm(sb, &rr, *ctx, w); + strbuf_puts(sb, ", "); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + return off + 1u + rr.bytes_after_modrm; +} + +/* Print the two operands of an SSE scalar instruction in AT&T "src, dst" + * order. Shared by X64_FMT_SSE_RR, X64_FMT_SSE_LOAD and X64_FMT_SSE_STORE. + * + * For the store format the ModR/M reg field is the source and r/m is the + * destination, so reg prints first; for the other formats r/m is the + * source and reg the destination. + * + * Returns the offset just past ModR/M/SIB/disp, or 0 when read_modrm + * fails. */ +static u32 print_xmm_rr(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + if (d->fmt == X64_FMT_SSE_STORE) { + /* Store direction (r/m <- xmm): the reg field is the source. */ + put_xmm(sb, rr.reg); + strbuf_puts(sb, ", "); + put_rm_xmm(sb, &rr, *ctx); + return off + 1u + rr.bytes_after_modrm; + } + /* CVTSI2*: source is GP reg (size from REX.W), dst is xmm. + * CVTT*2SI: source is xmm, dst is GP reg. + * Other SSE arith / mov / cmp: both xmm. */ + u8 op = d->opc[1]; + int dst_is_gp = (op == 0x2Cu); /* CVTTSD/SS2SI */ + int src_is_gp = (op == 0x2Au); /* CVTSI2SD/SS */ + u32 gp_w = ctx->rex_w ? 8u : 4u; + if (src_is_gp) { + put_rm(sb, &rr, *ctx, gp_w); + } else { + put_rm_xmm(sb, &rr, *ctx); + } + strbuf_puts(sb, ", "); + if (dst_is_gp) { + put_reg_ctx(sb, rr.reg, gp_w, ctx->has_rex); + } else { + put_xmm(sb, rr.reg); + } + return off + 1u + rr.bytes_after_modrm; +} + +static u32 print_xadd_mem(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + /* XADD r/m, r — source is the reg, destination is r/m. 
*/ + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + strbuf_puts(sb, ", "); + put_rm(sb, &rr, *ctx, w); + return off + 1u + rr.bytes_after_modrm; +} + /* XCHG operands: the ModR/M reg operand, then r/m, both at width_for(d, ctx). Returns 0 when read_modrm fails. */ +static u32 print_xchg_mem(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + strbuf_puts(sb, ", "); + put_rm(sb, &rr, *ctx, w); + return off + 1u + rr.bytes_after_modrm; +} + +static u32 print_cmpxchg_mem(StrBuf* sb, const X64InsnDesc* d, + const u8* bytes, u32 len, + const X64DecodeCtx* ctx) { + /* CMPXCHG r/m, r — implicit RAX is the comparand; not shown. */ + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + u32 w = width_for(d, ctx); + put_reg_ctx(sb, rr.reg, w, ctx->has_rex); + strbuf_puts(sb, ", "); + put_rm(sb, &rr, *ctx, w); + return off + 1u + rr.bytes_after_modrm; +} + /* Multi-byte NOP: writes no operand text; the ModR/M operand is decoded only to compute the instruction length. Returns 0 when read_modrm fails. */ +static u32 print_nop_multi(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx) { + (void)sb; + u32 off = ctx->opc_off + d->opc_len; + RegRm rr; + if (!read_modrm(bytes, len, off, *ctx, &rr)) return 0; + return off + 1u + rr.bytes_after_modrm; +} + +/* ==================================================================== + * Dispatch. 
+ * ==================================================================== */ + +u32 x64_print_operands(StrBuf* sb, const X64InsnDesc* d, const u8* bytes, + u32 len, const X64DecodeCtx* ctx, u64 vaddr) { + switch ((X64Format)d->fmt) { + case X64_FMT_NULLARY: + return print_nullary(sb, d, bytes, len, ctx); + case X64_FMT_NOP_MULTI: + return print_nop_multi(sb, d, bytes, len, ctx); + case X64_FMT_PUSH_POP: + return print_push_pop(sb, d, bytes, len, ctx); + case X64_FMT_MOV_RI: + return print_mov_ri(sb, d, bytes, len, ctx); + case X64_FMT_ALU_RR: + return print_alu_rr(sb, d, bytes, len, ctx); + case X64_FMT_MOV_RM_LOAD: + return print_mov_rm_load(sb, d, bytes, len, ctx); + case X64_FMT_MOVZX_MOVSX: + return print_movzx_movsx(sb, d, bytes, len, ctx); + case X64_FMT_MOVSXD: + return print_movsxd(sb, d, bytes, len, ctx); + case X64_FMT_ALU_RM_IMM8: + case X64_FMT_ALU_RM_IMM32: + return print_alu_rm_imm(sb, d, bytes, len, ctx); + case X64_FMT_IMUL_RR: + return print_imul_rr(sb, d, bytes, len, ctx); + case X64_FMT_IMUL_RRI: + return print_imul_rri(sb, d, bytes, len, ctx); + case X64_FMT_F7_RM: + return print_f7_rm(sb, d, bytes, len, ctx); + case X64_FMT_SHIFT_IMM: + return print_shift_imm(sb, d, bytes, len, ctx); + case X64_FMT_SHIFT_CL: + return print_shift_cl(sb, d, bytes, len, ctx); + case X64_FMT_JCC_REL32: + return print_jcc_rel32(sb, d, bytes, len, ctx, vaddr); + case X64_FMT_JMP_REL32: + case X64_FMT_CALL_REL32: + return print_jmp_call_rel32(sb, d, bytes, len, ctx, vaddr); + case X64_FMT_BR_RM: + return print_br_rm(sb, d, bytes, len, ctx); + case X64_FMT_SETCC_RM: + return print_setcc(sb, d, bytes, len, ctx); + case X64_FMT_CMOVCC_RR: + return print_cmovcc_rr(sb, d, bytes, len, ctx); + case X64_FMT_BSWAP: + return print_bswap(sb, d, bytes, len, ctx); + case X64_FMT_BS: + return print_bs(sb, d, bytes, len, ctx); + case X64_FMT_POPCNT: + return print_bs(sb, d, bytes, len, ctx); /* same shape */ + case X64_FMT_SSE_RR: + case X64_FMT_SSE_LOAD: + case X64_FMT_SSE_STORE: + 
return print_xmm_rr(sb, d, bytes, len, ctx); + case X64_FMT_XADD_MEM: + return print_xadd_mem(sb, d, bytes, len, ctx); + case X64_FMT_XCHG_MEM: + return print_xchg_mem(sb, d, bytes, len, ctx); + case X64_FMT_CMPXCHG_MEM: + return print_cmpxchg_mem(sb, d, bytes, len, ctx); + case X64_FMT_RAW_BYTE: + return 0; + } + return 0; +} + +/* Resolve the condition nibble for Jcc/SETcc/CMOVcc to its AT&T mnemonic + * suffix. Used by the disassembler to spell j → "je", set → "sete", etc. */ +const char* x64_cc_name(u8 cc) { + return g_cc_name[cc & 0xFu]; +} diff --git a/src/arch/x64/isa.h b/src/arch/x64/isa.h @@ -1,16 +1,36 @@ -/* x86_64 ISA helpers used by arch/x64.c. +/* x86_64 ISA descriptors — single source of truth for every instruction + * the encoder and decoder need to agree on. * - * Only the constants here. Instruction encoders live in arch/x64.c - * because they're variable length and depend on the MCEmitter byte - * stream (REX prefix, ModR/M, SIB, displacement). The disassembler - * doesn't share these yet; if/when it does, a parallel x64_isa.c will - * host decode tables. */ + * Unlike aarch64 (fixed 32-bit insns, identified by `(word & mask) == match`), + * x86_64 instructions are variable-length 1..15 bytes: + * + * [ legacy_pfx? ][ REX? ][ opcode 1..3B ][ ModR/M? ][ SIB? ] + * [ disp 0/1/4B ][ imm 0/1/2/4/8B ] + * + * So the table is keyed piecewise: + * - leg_pfx one of {none, 0x66, 0xF2, 0xF3} + * - opc[1..3] opcode bytes (the last byte may have a low-bit mask + * for embed-reg forms like PUSH r64 = 50+rd) + * - modrm_reg 0..7 for `/digit` opcode extension, 0xFF for `/r` + * or no-ModR/M + * - rex_w_req WIDTH_ANY / WIDTH_1 / WIDTH_0 + * - fmt X64Format enum (operand shape) + * + * Disasm flow: + * X64DecodeCtx ctx; x64_decode_prefixes(bytes, len, &ctx); + * const X64InsnDesc* d = x64_disasm_find(bytes, len, &ctx); + * x64_print_operands(sb, d, bytes, len, &ctx, vaddr) → total byte length. 
+ * + * Encoder migration is staged separately: phase 1 ships the descriptors and + * decode side; phase 2 swaps each emit_* function body to use the per-format + * pack helpers below; phase 3 refactors asm.c around the same table. */ #ifndef CFREE_X64_ISA_H #define CFREE_X64_ISA_H #include "core/bytes.h" #include "core/core.h" +#include "core/strbuf.h" /* ---- GPR numbering (DWARF / ABI matches HW encoding 0..15) ---- */ enum { @@ -125,4 +145,594 @@ static inline void x64_write_nop6(u8* dst) { dst[5] = X64_NOP6_BYTE5; } +/* ==================================================================== + * Decode context — fills as we walk the prefix bytes. Disasm and + * (eventually) inline-asm parsers share this so prefix accounting lives + * in one place. + * ==================================================================== */ + +#define X64_PFX_NONE 0u +#define X64_PFX_66 0x66u /* operand-size override (16-bit) */ +#define X64_PFX_F2 0xF2u /* SSE scalar double / REPNE */ +#define X64_PFX_F3 0xF3u /* SSE scalar single / REP */ + +#define X64_W_REQ_ANY 0u /* row matches either REX.W value */ +#define X64_W_REQ_1 1u /* row requires REX.W = 1 (64-bit form) */ +#define X64_W_REQ_0 2u /* row requires REX.W = 0 (force 32-bit form) */ + +typedef struct X64DecodeCtx { + u8 leg_pfx; /* 0 / 0x66 / 0xF2 / 0xF3 (last seen wins) */ + u8 has_lock; + u8 has_rex; + u8 rex_w, rex_r, rex_x, rex_b; + u32 opc_off; /* offset of first opcode byte inside the instruction */ +} X64DecodeCtx; + +/* Walk legacy prefix bytes (0x66 / 0xF2 / 0xF3 / 0xF0 LOCK) followed by an + * optional REX byte (0x40..0x4F). Fills `ctx` and returns the offset of the + * first non-prefix byte. */ +u32 x64_decode_prefixes(const u8* bytes, u32 len, X64DecodeCtx* ctx); + +/* ==================================================================== + * Opcode constants used by both the descriptor table and pack helpers. + * + * Naming: X64_OPC_<class>_<mnemonic>. 
We promote the bytes the encoder + * emits (not every byte the decoder might see — alias rows in isa.c + * still spell their own opcode bytes inline). */ + +/* ALU r/m, r — opcode picks the operation. */ +#define X64_OPC_ALU_ADD 0x01u +#define X64_OPC_ALU_OR 0x09u +#define X64_OPC_ALU_AND 0x21u +#define X64_OPC_ALU_SUB 0x29u +#define X64_OPC_ALU_XOR 0x31u +#define X64_OPC_ALU_CMP 0x39u +#define X64_OPC_ALU_TEST 0x85u +#define X64_OPC_MOV_RM_R 0x89u /* MOV r/m, r */ +#define X64_OPC_MOV_RM_R8 0x88u /* MOV r/m8, r8 */ +#define X64_OPC_MOV_R_RM 0x8Bu /* MOV r, r/m */ +#define X64_OPC_LEA 0x8Du +#define X64_OPC_MOVSXD 0x63u + +/* ALU r/m, imm — /sub picks op. */ +#define X64_OPC_ALU_IMM8 0x83u +#define X64_OPC_ALU_IMM32 0x81u +#define X64_ALU_SUB_ADD 0u +#define X64_ALU_SUB_OR 1u +#define X64_ALU_SUB_AND 4u +#define X64_ALU_SUB_SUB 5u +#define X64_ALU_SUB_XOR 6u +#define X64_ALU_SUB_CMP 7u + +/* MOV r, imm — B8+rd. */ +#define X64_OPC_MOV_RI 0xB8u + +/* IMUL r, r/m (two-byte) and IMUL r, r/m, imm. */ +#define X64_OPC_IMUL_2B 0xAFu /* preceded by 0x0F */ +#define X64_OPC_IMUL_IMM8 0x6Bu +#define X64_OPC_IMUL_IMM32 0x69u + +/* F7 /sub family. */ +#define X64_OPC_F7 0xF7u +#define X64_F7_SUB_NOT 2u +#define X64_F7_SUB_NEG 3u +#define X64_F7_SUB_MUL 4u +#define X64_F7_SUB_IMUL 5u +#define X64_F7_SUB_DIV 6u +#define X64_F7_SUB_IDIV 7u + +/* Shifts. */ +#define X64_OPC_SHIFT_IMM 0xC1u +#define X64_OPC_SHIFT_CL 0xD3u +#define X64_SHIFT_SUB_SHL 4u +#define X64_SHIFT_SUB_SHR 5u +#define X64_SHIFT_SUB_SAR 7u + +/* MOVZX / MOVSX (preceded by 0x0F). */ +#define X64_OPC_MOVZX_B 0xB6u +#define X64_OPC_MOVZX_W 0xB7u +#define X64_OPC_MOVSX_B 0xBEu +#define X64_OPC_MOVSX_W 0xBFu + +/* SETcc base, CMOVcc base (preceded by 0x0F, low nibble = cc). */ +#define X64_OPC_SETCC_BASE 0x90u +#define X64_OPC_CMOVCC_BASE 0x40u + +/* Branches. */ +#define X64_OPC_JMP_REL32 0xE9u +#define X64_OPC_CALL_REL32 0xE8u +#define X64_OPC_JCC_BASE 0x80u /* preceded by 0x0F, low nibble = cc */ + +/* Stack. 
*/ +#define X64_OPC_PUSH_R 0x50u +#define X64_OPC_POP_R 0x58u + +/* Misc. */ +#define X64_OPC_RET 0xC3u +#define X64_OPC_LEAVE 0xC9u +#define X64_OPC_CDQ_CQO 0x99u +#define X64_OPC_TWOBYTE 0x0Fu + +/* 0x66 operand-size override, used to force 16-bit forms. */ +#define X64_OPSIZE_PFX 0x66u + +/* ==================================================================== + * Format kinds — one per "encoding shape" cfree's emit.c produces. + * The format determines how operands are recovered from the byte stream + * after the opcode bytes and how they print in AT&T syntax. + * ==================================================================== */ + +typedef enum X64Format { + X64_FMT_NULLARY, /* no operands: RET, NOP, UD2, LEAVE, CDQ/CQO */ + X64_FMT_NOP_MULTI, /* multi-byte NOP family (66 0F 1F ...) */ + X64_FMT_PUSH_POP, /* 50+rd / 58+rd — register in low 3 bits */ + X64_FMT_MOV_RI, /* B8+rd imm{32,64} — width via REX.W */ + X64_FMT_ALU_RR, /* op r/m, r — ADD/OR/AND/SUB/XOR/CMP/MOV/TEST */ + X64_FMT_MOV_RM_LOAD, /* 8B /r — MOV r, r/m (also LEA via 8D /r) */ + X64_FMT_MOVZX_MOVSX, /* 0F B6/B7/BE/BF /r — width-extending loads */ + X64_FMT_MOVSXD, /* REX.W 63 /r — MOVSXD r64, r/m32 */ + X64_FMT_ALU_RM_IMM8, /* 83 /sub ib — ADD/OR/AND/SUB/XOR/CMP r/m, imm8 */ + X64_FMT_ALU_RM_IMM32, /* 81 /sub id — same family, imm32 */ + X64_FMT_IMUL_RR, /* 0F AF /r — IMUL r, r/m */ + X64_FMT_IMUL_RRI, /* 69/6B /r i{8,32} — IMUL r, r/m, imm */ + X64_FMT_F7_RM, /* F7 /sub — NOT/NEG/MUL/IMUL/DIV/IDIV */ + X64_FMT_SHIFT_IMM, /* C1 /sub ib — SHL/SHR/SAR r/m, imm8 */ + X64_FMT_SHIFT_CL, /* D3 /sub — SHL/SHR/SAR r/m, %cl */ + X64_FMT_JCC_REL32, /* 0F 8x rel32 — Jcc near */ + X64_FMT_JMP_REL32, /* E9 rel32 */ + X64_FMT_CALL_REL32, /* E8 rel32 */ + X64_FMT_BR_RM, /* FF /2 or /4 — call/jmp indirect r/m */ + X64_FMT_SETCC_RM, /* 0F 9x /0 r/m8 — SETcc */ + X64_FMT_CMOVCC_RR, /* 0F 4x /r — CMOVcc r, r/m */ + X64_FMT_SSE_RR, /* {F2|F3|66}? 
0F xx /r — scalar FP reg-reg */ + X64_FMT_SSE_LOAD, /* same, dst <- [base+disp] */ + X64_FMT_SSE_STORE, /* same, [base+disp] <- src */ + X64_FMT_BSWAP, /* 0F C8+rd */ + X64_FMT_BS, /* 0F BC/BD /r — BSF/BSR */ + X64_FMT_POPCNT, /* F3 0F B8 /r */ + X64_FMT_XADD_MEM, /* LOCK 0F C1 /r — XADD m, r */ + X64_FMT_XCHG_MEM, /* 87 /r — XCHG r, m (LOCK implicit on mem dst) */ + X64_FMT_CMPXCHG_MEM, /* LOCK 0F B1 /r — CMPXCHG m, r */ + X64_FMT_RAW_BYTE, /* sentinel: render as `.byte 0xNN` (no match) */ +} X64Format; + +#define X64_ASMFL_ALIAS 0x01u /* row is an alias spelling (prefer-on-decode) */ +#define X64_ASMFL_W_FROM_REX 0x02u /* fmt picks width from ctx->rex_w */ +#define X64_ASMFL_FORCE_W64 0x04u /* fmt always 64-bit regardless of REX.W */ +#define X64_ASMFL_BYTE 0x08u /* fixed-byte operand (movb, setcc) */ +#define X64_ASMFL_W16 0x10u /* fixed 16-bit (via 0x66 prefix override) */ + +/* ==================================================================== + * Descriptor table row. + * ==================================================================== */ + +typedef struct X64InsnDesc { + const char* mnemonic; /* AT&T mnemonic without size suffix; printer adds + a size letter (b/w/l/q) based on fmt + ctx. */ + u8 leg_pfx; /* X64_PFX_NONE / 0x66 / 0xF2 / 0xF3 */ + u8 opc_len; /* 1..3 */ + u8 opc[3]; /* opcode bytes */ + u8 opc_last_mask; /* 0xFF for exact match on opc[opc_len-1]; + 0xF8 for embed-reg in low 3 bits; + 0xF0 for Jcc / SETcc / CMOVcc condition nibble */ + u8 modrm_reg; /* 0..7 if /digit, 0xFF otherwise */ + u8 rex_w_req; /* X64_W_REQ_* */ + u8 fmt; /* X64Format */ + u8 flags; /* X64_ASMFL_* */ +} X64InsnDesc; + +extern const X64InsnDesc x64_insn_table[]; +extern const u32 x64_insn_table_n; + +/* Linear scan after prefix decode. Sets `ctx->opc_off` to where opcode + * starts. Returns the matching descriptor, or NULL on no match (caller + * should emit a `.byte` fallback). 
On success, opc_off is unchanged; + * the caller can derive opc_end as opc_off + desc->opc_len. */ +const X64InsnDesc* x64_disasm_find(const u8* bytes, u32 len, + X64DecodeCtx* ctx); + +/* Render operand text for a matched descriptor into `sb` and return the + * total instruction length in bytes (from bytes[0], including any + * prefixes/ModR/M/SIB/disp/imm). Returns 0 if the encoding is truncated + * (caller falls back to a single-byte `.byte` rendering). `vaddr` is the + * instruction's virtual address for PC-relative formats; pass 0 if not + * known. The mnemonic itself is *not* written — caller emits desc->mnemonic + * (plus any size suffix it derives via x64_size_suffix_for). */ +u32 x64_print_operands(StrBuf* sb, const X64InsnDesc* desc, const u8* bytes, + u32 len, const X64DecodeCtx* ctx, u64 vaddr); + +/* Returns the AT&T size suffix character ('b','w','l','q') the printer + * appends to mnemonics that depend on operand width. Returns 0 if the + * mnemonic carries its own width (Jcc, SETcc, MOVZX/MOVSX, SSE, etc.). */ +char x64_size_suffix_for(const X64InsnDesc* desc, const X64DecodeCtx* ctx); + +/* Translate a condition nibble (low 4 bits of the second opcode byte for + * Jcc/SETcc/CMOVcc) to its AT&T suffix: "e", "ne", "ge", ... */ +const char* x64_cc_name(u8 cc); + +/* ==================================================================== + * Pack helpers — encode-side counterpart of the decode dispatch above. + * + * Each helper builds one instruction into a caller-provided buffer and + * returns the number of bytes written. Callers must reserve at least + * 16 bytes; no single x86_64 instruction we emit exceeds 15. + * + * REX rules (shared by every reg/mem helper): + * - Emitted only when needed: W=1 or any of R/X/B nonzero. + * - Force-REX variants (suffix `_force_rex`) always emit a REX byte — + * required for byte-reg encodings that promote SIL/DIL/etc. 
+ * + * ModR/M memory rules (handled by x64_pack_mem): + * - mod=0 for disp=0 unless (base & 7) == 5 (RBP/R13 — needs disp8=0). + * - mod=1 for disp in [-128,127]. + * - mod=2 for full disp32. + * - SIB byte required when (base & 7) == 4 (RSP/R12); index=4 (none). + * ==================================================================== */ + +/* REX prefix byte builder. Returns 0 if no REX needed. */ +static inline u8 x64_make_rex(int w, u32 reg, u32 index, u32 rm) { + u8 r = 0; + if (w) r |= X64_REX_W; + if (reg & 8u) r |= X64_REX_R; + if (index & 8u) r |= X64_REX_X; + if (rm & 8u) r |= X64_REX_B; + return r ? (u8)(X64_REX_BASE | r) : 0u; +} + +/* ModR/M byte builder. */ +static inline u8 x64_modrm(u32 mod, u32 reg, u32 rm) { + return (u8)(((mod & 3u) << 6) | ((reg & 7u) << 3) | (rm & 7u)); +} + +/* SIB byte builder. */ +static inline u8 x64_sib(u32 scale, u32 index, u32 base) { + return (u8)(((scale & 3u) << 6) | ((index & 7u) << 3) | (base & 7u)); +} + +/* Pick ModR/M.mod from a (base,disp) memory operand: + * 0 → [base] (only if disp==0 and (base&7)!=5) + * 1 → [base + disp8] + * 2 → [base + disp32] */ +static inline u32 x64_disp_mod(u32 base, i32 disp) { + if (disp == 0 && (base & 7u) != 5u) return 0u; + if (disp >= -128 && disp <= 127) return 1u; + return 2u; +} + +/* Store `v` at out as little-endian bytes: 4 for the u32 variant, 8 for the u64 variant. Each returns the number of bytes it wrote. */ +static inline u32 x64_put_u32le(u8* out, u32 v) { + out[0] = (u8)v; + out[1] = (u8)(v >> 8); + out[2] = (u8)(v >> 16); + out[3] = (u8)(v >> 24); + return 4u; +} +static inline u32 x64_put_u64le(u8* out, u64 v) { + for (u32 i = 0; i < 8u; ++i) out[i] = (u8)(v >> (i * 8u)); + return 8u; +} + +/* Pack a memory operand (ModR/M + optional SIB + optional disp) for the + * `reg` operand and `[base + disp]` r/m operand. Returns bytes written. 
*/ +static inline u32 x64_pack_mem(u8* out, u32 reg, u32 base, i32 disp) { + u32 m = x64_disp_mod(base, disp); + u32 n = 0; + if ((base & 7u) == 4u) { + out[n++] = x64_modrm(m, reg, 4u); + out[n++] = x64_sib(0u, 4u, base); + } else { + out[n++] = x64_modrm(m, reg, base); + } + if (m == 1u) { + out[n++] = (u8)(i8)disp; + } else if (m == 2u) { + n += x64_put_u32le(out + n, (u32)disp); + } + return n; +} + +/* Pack a reg-form ModR/M (mod=3) — one byte. */ +static inline u32 x64_pack_rm_reg(u8* out, u32 reg, u32 rm) { + out[0] = x64_modrm(3u, reg, rm); + return 1u; +} + +/* Emit an optional REX (only if needed) and return bytes written (0 or 1). */ +static inline u32 x64_pack_rex(u8* out, int w, u32 reg, u32 index, u32 rm) { + u8 r = x64_make_rex(w, reg, index, rm); + if (!r) return 0u; + out[0] = r; + return 1u; +} +/* Always emit a REX byte (force form). */ +static inline u32 x64_pack_rex_force(u8* out, int w, u32 reg, u32 index, + u32 rm) { + out[0] = (u8)(X64_REX_BASE | (w ? X64_REX_W : 0u) | + ((reg & 8u) ? X64_REX_R : 0u) | + ((index & 8u) ? X64_REX_X : 0u) | + ((rm & 8u) ? X64_REX_B : 0u)); + return 1u; +} + +/* ---- X64_FMT_NULLARY: one or two opcode bytes, no operands. ---- */ +typedef struct X64Nullary { + u8 prefix; /* legacy prefix or 0 */ + int w; /* if nonzero, force REX.W (used by CQO) */ + u8 opc0; + u8 opc1; /* 0 if unused */ +} X64Nullary; +static inline u32 x64_nullary_pack(X64Nullary f, u8* out) { + u32 n = 0; + if (f.prefix) out[n++] = f.prefix; + if (f.w) out[n++] = (u8)(X64_REX_BASE | X64_REX_W); + out[n++] = f.opc0; + if (f.opc1) out[n++] = f.opc1; + return n; +} + +/* ---- X64_FMT_ALU_RR: op r/m, r (reg-reg form). ---- + * REX(w, src, 0, dst) | op | ModR/M(3, src, dst) + * + * `op` selects the operation (ADD/OR/AND/SUB/XOR/CMP/MOV/TEST). 
*/ +typedef struct X64AluRR { + int w; + u8 op; + u32 dst; /* r/m */ + u32 src; /* reg */ +} X64AluRR; +static inline u32 x64_alu_rr_pack(X64AluRR f, u8* out) { + u32 n = x64_pack_rex(out, f.w, f.src, 0, f.dst); + out[n++] = f.op; + n += x64_pack_rm_reg(out + n, f.src, f.dst); + return n; +} + +/* ---- X64_FMT_ALU_RR memory form: op [base+disp], r ---- + * Optional 0x66 | REX(w, src, 0, base) | op | mem(src, base, disp) + * `force_rex` matches emit_mov_store size=1 (byte-reg promotion). */ +typedef struct X64AluRM { + u8 prefix; /* 0 or 0x66 */ + int w; + u8 op; + int force_rex; /* 1 → always emit REX (byte-reg form) */ + u32 src; /* reg operand */ + u32 base; /* memory base */ + i32 disp; +} X64AluRM; +static inline u32 x64_alu_rm_pack(X64AluRM f, u8* out) { + u32 n = 0; + if (f.prefix) out[n++] = f.prefix; + if (f.force_rex) + n += x64_pack_rex_force(out + n, f.w, f.src, 0, f.base); + else + n += x64_pack_rex(out + n, f.w, f.src, 0, f.base); + out[n++] = f.op; + n += x64_pack_mem(out + n, f.src, f.base, f.disp); + return n; +} + +/* ---- X64_FMT_MOV_RI: MOV r, imm — opcode B8+rd ---- */ +typedef struct X64MovRI { + int is64; + u32 dst; + i64 imm; +} X64MovRI; +static inline u32 x64_mov_ri_pack(X64MovRI f, u8* out) { + u32 n = x64_pack_rex(out, f.is64 ? 1 : 0, 0, 0, f.dst); + out[n++] = (u8)(X64_OPC_MOV_RI | (f.dst & 7u)); + if (f.is64) + n += x64_put_u64le(out + n, (u64)f.imm); + else + n += x64_put_u32le(out + n, (u32)f.imm); + return n; +} + +/* ---- X64_FMT_MOV_RM_LOAD (8B /r) and LEA (8D /r) — register dst, memory src. + * Also covers MOVZX/MOVSX with memory source (two-byte opcode). 
---- */ +typedef struct X64MovRMLoad { + int w; + u8 opc0; /* primary opcode byte */ + u8 opc1; /* 0 for one-byte opcode; nonzero = 0F xx form */ + u32 dst; /* reg */ + u32 base; /* mem base */ + i32 disp; +} X64MovRMLoad; +static inline u32 x64_mov_rm_load_pack(X64MovRMLoad f, u8* out) { + u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.base); + if (f.opc1) { + out[n++] = X64_OPC_TWOBYTE; + out[n++] = f.opc1; + } else { + out[n++] = f.opc0; + } + n += x64_pack_mem(out + n, f.dst, f.base, f.disp); + return n; +} + +/* ---- X64_FMT_MOVZX_MOVSX reg-reg form: 0F xx /r ---- + * `force_rex` distinguishes byte-source forms (need REX even when no high + * regs) from the word-source form. */ +typedef struct X64MovzxRR { + int w; + u8 opc1; /* B6 / B7 / BE / BF */ + int force_rex; + u32 dst; + u32 src; +} X64MovzxRR; +static inline u32 x64_movzx_rr_pack(X64MovzxRR f, u8* out) { + u32 n; + if (f.force_rex) + n = x64_pack_rex_force(out, f.w, f.dst, 0, f.src); + else + n = x64_pack_rex(out, f.w, f.dst, 0, f.src); + out[n++] = X64_OPC_TWOBYTE; + out[n++] = f.opc1; + n += x64_pack_rm_reg(out + n, f.dst, f.src); + return n; +} + +/* ---- X64_FMT_MOVSXD: REX.W 63 /r — MOVSXD r64, r32 ---- */ +typedef struct X64Movsxd { + u32 dst; + u32 src; +} X64Movsxd; +static inline u32 x64_movsxd_pack(X64Movsxd f, u8* out) { + u32 n = x64_pack_rex(out, 1, f.dst, 0, f.src); + out[n++] = X64_OPC_MOVSXD; + n += x64_pack_rm_reg(out + n, f.dst, f.src); + return n; +} + +/* ---- X64_FMT_ALU_RM_IMM8: 83 /sub ib (sign-extended) — reg-form. ---- */ +typedef struct X64AluRmImm8 { + int w; + u32 sub; + u32 reg; + i8 imm; +} X64AluRmImm8; +static inline u32 x64_alu_imm8_pack(X64AluRmImm8 f, u8* out) { + u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); + out[n++] = X64_OPC_ALU_IMM8; + out[n++] = x64_modrm(3u, f.sub, f.reg); + out[n++] = (u8)f.imm; + return n; +} + +/* ---- X64_FMT_ALU_RM_IMM32: 81 /sub id (sign-extended for w=1). 
---- */ +typedef struct X64AluRmImm32 { + int w; + u32 sub; + u32 reg; + i32 imm; +} X64AluRmImm32; +static inline u32 x64_alu_imm32_pack(X64AluRmImm32 f, u8* out) { + u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); + out[n++] = X64_OPC_ALU_IMM32; + out[n++] = x64_modrm(3u, f.sub, f.reg); + n += x64_put_u32le(out + n, (u32)f.imm); + return n; +} + +/* ---- X64_FMT_IMUL_RR: 0F AF /r — IMUL r, r/m ---- */ +typedef struct X64ImulRR { + int w; + u32 dst; + u32 src; +} X64ImulRR; +static inline u32 x64_imul_rr_pack(X64ImulRR f, u8* out) { + u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.src); + out[n++] = X64_OPC_TWOBYTE; + out[n++] = X64_OPC_IMUL_2B; + n += x64_pack_rm_reg(out + n, f.dst, f.src); + return n; +} + +/* ---- X64_FMT_IMUL_RRI: 6B /r ib (imm8) or 69 /r id (imm32). ---- */ +typedef struct X64ImulRRI { + int w; + int imm32; /* 1 → 0x69 with imm32; 0 → 0x6B with imm8 */ + u32 dst; + u32 src; + i32 imm; /* sign-extended; for imm32=0, only low byte used */ +} X64ImulRRI; +static inline u32 x64_imul_rri_pack(X64ImulRRI f, u8* out) { + u32 n = x64_pack_rex(out, f.w, f.dst, 0, f.src); + out[n++] = f.imm32 ? X64_OPC_IMUL_IMM32 : X64_OPC_IMUL_IMM8; + out[n++] = x64_modrm(3u, f.dst, f.src); + if (f.imm32) + n += x64_put_u32le(out + n, (u32)f.imm); + else + out[n++] = (u8)(i8)f.imm; + return n; +} + +/* ---- X64_FMT_F7_RM: F7 /sub — NOT/NEG/MUL/IMUL/DIV/IDIV (reg). ---- */ +typedef struct X64F7RM { + int w; + u32 sub; + u32 reg; +} X64F7RM; +static inline u32 x64_f7_rm_pack(X64F7RM f, u8* out) { + u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); + out[n++] = X64_OPC_F7; + n += x64_pack_rm_reg(out + n, f.sub, f.reg); + return n; +} + +/* ---- X64_FMT_SHIFT_IMM: C1 /sub ib (reg). 
---- */ +typedef struct X64ShiftImm { + int w; + u32 sub; + u32 reg; + u8 imm; +} X64ShiftImm; +static inline u32 x64_shift_imm_pack(X64ShiftImm f, u8* out) { + u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); + out[n++] = X64_OPC_SHIFT_IMM; + out[n++] = x64_modrm(3u, f.sub, f.reg); + out[n++] = f.imm; + return n; +} + +/* ---- X64_FMT_SHIFT_CL: D3 /sub (reg, %cl). ---- */ +typedef struct X64ShiftCL { + int w; + u32 sub; + u32 reg; +} X64ShiftCL; +static inline u32 x64_shift_cl_pack(X64ShiftCL f, u8* out) { + u32 n = x64_pack_rex(out, f.w, 0, 0, f.reg); + out[n++] = X64_OPC_SHIFT_CL; + n += x64_pack_rm_reg(out + n, f.sub, f.reg); + return n; +} + +/* ---- X64_FMT_SETCC_RM: 0F 9x /0 r/m8 — force REX so byte-reg works. ---- */ +typedef struct X64Setcc { + u32 cc; + u32 reg; +} X64Setcc; +static inline u32 x64_setcc_pack(X64Setcc f, u8* out) { + u32 n = x64_pack_rex_force(out, 0, 0, 0, f.reg); + out[n++] = X64_OPC_TWOBYTE; + out[n++] = (u8)(X64_OPC_SETCC_BASE | (f.cc & 0xFu)); + n += x64_pack_rm_reg(out + n, 0u, f.reg); + return n; +} + +/* ---- SSE scalar reg-reg / load / store: {pfx?} 0F xx /r. 
---- */ +typedef struct X64SseRR { + u8 prefix; /* 0 / 0x66 / 0xF2 / 0xF3 */ + u8 opcode; + int w; /* REX.W for 64-bit CVTSI2 / CVTT2SI forms */ + u32 dst; + u32 src; +} X64SseRR; +static inline u32 x64_sse_rr_pack(X64SseRR f, u8* out) { + u32 n = 0; + if (f.prefix) out[n++] = f.prefix; + n += x64_pack_rex(out + n, f.w, f.dst, 0, f.src); + out[n++] = X64_OPC_TWOBYTE; + out[n++] = f.opcode; + n += x64_pack_rm_reg(out + n, f.dst, f.src); + return n; +} + +typedef struct X64SseMem { + u8 prefix; + u8 opcode; + u32 reg; + u32 base; + i32 disp; +} X64SseMem; +static inline u32 x64_sse_mem_pack(X64SseMem f, u8* out) { + u32 n = 0; + if (f.prefix) out[n++] = f.prefix; + n += x64_pack_rex(out + n, 0, f.reg, 0, f.base); + out[n++] = X64_OPC_TWOBYTE; + out[n++] = f.opcode; + n += x64_pack_mem(out + n, f.reg, f.base, f.disp); + return n; +} + #endif diff --git a/src/arch/x64/link.c b/src/arch/x64/link.c @@ -59,6 +59,15 @@ static u32 x64_emit_iplt_stub(u8* dst, u64 stub_vaddr, u64 slot_vaddr, return 0; } +static int x64_is_branch_reloc(RelocKind kind) { + return kind == R_X64_PLT32 || kind == R_PLT32; +} + +static int x64_is_got_load_reloc(RelocKind kind) { + return kind == R_X64_GOTPCREL || kind == R_X64_GOTPCRELX || + kind == R_X64_REX_GOTPCRELX; +} + const LinkArchDesc link_arch_x64 = { .e_machine = EM_X86_64, .default_musl_interp = "/lib/ld-musl-x86_64.so.1", @@ -74,4 +83,8 @@ const LinkArchDesc link_arch_x64 = { .emit_plt0 = x64_emit_plt0, .emit_plt_entry = x64_emit_plt_entry, .emit_iplt_stub = x64_emit_iplt_stub, + + .is_branch_reloc = x64_is_branch_reloc, + .is_got_load_reloc = x64_is_got_load_reloc, + .needs_jit_call_stub = x64_is_branch_reloc, }; diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c @@ -1045,6 +1045,9 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, } else if (av->storage.kind == OPK_INDIRECT) { emit_lea(t->mc, dst_reg, av->storage.v.ind.base & 0xFu, av->storage.v.ind.ofs); + } else if (av->storage.kind == OPK_GLOBAL) { + 
emit_global_lea(t, dst_reg, av->storage.v.global.sym, + av->storage.v.global.addend); } else { compiler_panic(t->c, a->loc, "x64 call: INDIRECT arg storage kind %d unsupported", @@ -1058,6 +1061,76 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int, return; } + if (ai->kind == ABI_ARG_DIRECT && + x64_abi_direct_to_stack(ai, *next_int, *next_fp)) { + for (u16 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* pt = &ai->parts[i]; + u32 sz = pt->size; + Operand addr = x_call_stack_arg_addr(t, *stack_off, tail); + if (pt->cls == ABI_CLASS_FP) { + u8 prefix2 = (sz == 8) ? 0xF2 : 0xF3; + if (av->storage.kind == OPK_REG) { + emit_sse_store(t->mc, prefix2, 0x11, av->storage.v.reg & 0xFu, + addr.v.ind.base & 0xFu, addr.v.ind.ofs); + } else if (av->storage.kind == OPK_LOCAL) { + XSlot* s = x64_slot_get(a, av->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "x64 call: bad FP arg slot"); + emit_sse_load(t->mc, prefix2, 0x10, X64_XMM15, X64_RBP, + -(i32)s->off + (i32)pt->src_offset); + emit_sse_store(t->mc, prefix2, 0x11, X64_XMM15, + addr.v.ind.base & 0xFu, addr.v.ind.ofs); + } else if (av->storage.kind == OPK_INDIRECT) { + emit_sse_load(t->mc, prefix2, 0x10, X64_XMM15, + av->storage.v.ind.base & 0xFu, + av->storage.v.ind.ofs + (i32)pt->src_offset); + emit_sse_store(t->mc, prefix2, 0x11, X64_XMM15, + addr.v.ind.base & 0xFu, addr.v.ind.ofs); + } else { + compiler_panic(t->c, a->loc, + "x64 call: FP stack-arg storage kind %d unsupported", + (int)av->storage.kind); + } + } else if (pt->cls == ABI_CLASS_INT) { + switch (av->storage.kind) { + case OPK_IMM: { + int w = (sz == 8) ? 1 : 0; + x64_emit_load_imm(t->mc, w, X64_RAX, av->storage.v.imm); + break; + } + case OPK_REG: { + int w = (sz == 8) ? 
1 : 0; + u32 sr = av->storage.v.reg & 0xFu; + if (sr != X64_RAX) emit_mov_rr(t->mc, w, X64_RAX, sr); + break; + } + case OPK_LOCAL: { + XSlot* s = x64_slot_get(a, av->storage.v.frame_slot); + if (!s) compiler_panic(t->c, a->loc, "x64 call: bad arg slot"); + emit_mov_load(t->mc, sz, 0, X64_RAX, X64_RBP, + -(i32)s->off + (i32)pt->src_offset); + break; + } + case OPK_INDIRECT: + emit_mov_load(t->mc, sz, 0, X64_RAX, + av->storage.v.ind.base & 0xFu, + av->storage.v.ind.ofs + (i32)pt->src_offset); + break; + default: + compiler_panic(t->c, a->loc, + "x64 call: arg storage kind %d unsupported", + (int)av->storage.kind); + } + emit_mov_store(t->mc, sz, X64_RAX, addr.v.ind.base & 0xFu, + addr.v.ind.ofs); + } else { + compiler_panic(t->c, a->loc, "x64 call: ABI class %d unimpl", + (int)pt->cls); + } + *stack_off += 8; + } + return; + } + for (u16 i = 0; i < ai->nparts; ++i) { const ABIArgPart* pt = &ai->parts[i]; u32 sz = pt->size; @@ -1186,6 +1259,11 @@ static void count_arg_stack(const CGABIValue* av, u32* next_int, u32* next_fp, *stack_off += 8; return; } + if (ai->kind == ABI_ARG_DIRECT && + x64_abi_direct_to_stack(ai, *next_int, *next_fp)) { + *stack_off += (u32)ai->nparts * 8u; + return; + } for (u16 i = 0; i < ai->nparts; ++i) { const ABIArgPart* pt = &ai->parts[i]; if (pt->cls == ABI_CLASS_INT) { @@ -1545,6 +1623,12 @@ static void x_ret(CGTarget* t, const CGABIValue* val) { "x64 ret indirect: storage kind %d unsupported", (int)val->storage.kind); } + if (val->storage.kind == OPK_INDIRECT && + (src_base == X64_RAX || + (src_base == X64_RDI && a->sret_ptr_slot != FRAME_SLOT_NONE))) { + emit_mov_rr(mc, 1, X64_R11, src_base); + src_base = X64_R11; + } if (a->sret_ptr_slot != FRAME_SLOT_NONE) { XSlot* sp = x64_slot_get(a, a->sret_ptr_slot); if (sp) emit_mov_load(mc, 8, 0, X64_RDI, X64_RBP, -(i32)sp->off); diff --git a/src/arch/x64/opt_coord.c b/src/arch/x64/opt_coord.c @@ -241,6 +241,26 @@ static void x_plan_call(CGTarget* t, const CGCallDesc* d, CGCallPlan* out) { 
m->mem.align = 8; continue; } + if (ai->kind == ABI_ARG_DIRECT && + x64_abi_direct_to_stack(ai, next_int, next_fp)) { + for (u16 i = 0; i < ai->nparts; ++i) { + const ABIArgPart* p = &ai->parts[i]; + CGCallPlanMove* m = &out->args[out->nargs++]; + m->src = av->nparts ? av->parts[i].op : av->storage; + m->src_offset = av->nparts ? av->parts[i].src_offset : p->src_offset; + m->dst_kind = CG_CALL_PLAN_STACK; + m->stack_offset = stack; + m->mem.type = av->type; + m->mem.size = p->size; + m->mem.align = p->align ? p->align : p->size; + if (p->cls == ABI_CLASS_FP) + m->cls = RC_FP; + else + m->cls = RC_INT; + stack += 8; + } + continue; + } for (u16 i = 0; i < ai->nparts; ++i) { const ABIArgPart* p = &ai->parts[i]; CGCallPlanMove* m = &out->args[out->nargs++]; diff --git a/src/dbg/arch.c b/src/dbg/arch.c @@ -1,47 +0,0 @@ -/* Per-arch dispatch for the JIT debugger primitives. - * - * Keeps src/dbg/{bp,displaced,step}.c arch-neutral. Anything that needs - * to choose between aa64 and rv64 (trap word, displaced-step lifter) - * funnels through the helpers here. 
*/ - -#include "dbg/dbg.h" - -uint32_t dbg_arch_brk_word(CfreeArchKind arch, u32* len_out) { - switch (arch) { - case CFREE_ARCH_ARM_64: - if (len_out) *len_out = DBG_AA64_INSN_LEN; - return dbg_aa64_brk_word(); - case CFREE_ARCH_RV64: - if (len_out) *len_out = DBG_RV64_INSN_LEN; - return dbg_rv64_brk_word(); - default: - if (len_out) *len_out = 0; - return 0; - } -} - -u32 dbg_arch_insn_len(CfreeArchKind arch) { - switch (arch) { - case CFREE_ARCH_ARM_64: - return DBG_AA64_INSN_LEN; - case CFREE_ARCH_RV64: - return DBG_RV64_INSN_LEN; - default: - return 0; - } -} - -int dbg_arch_build_shim(CfreeArchKind arch, uint32_t orig_insn, - uint64_t orig_pc, void* scratch_write, - uint64_t scratch_runtime, u32* brk_offset) { - switch (arch) { - case CFREE_ARCH_ARM_64: - return dbg_aa64_build_shim(orig_insn, orig_pc, scratch_write, - scratch_runtime, brk_offset); - case CFREE_ARCH_RV64: - return dbg_rv64_build_shim(orig_insn, orig_pc, scratch_write, - scratch_runtime, brk_offset); - default: - return 1; - } -} diff --git a/src/dbg/bp.c b/src/dbg/bp.c @@ -1,9 +1,10 @@ /* Breakpoint table for the JIT debugger session. * - * Keyed by runtime address. Each slot owns the bytes overwritten by the BRK - * patch, a refcount (so step.c can drop temporaries without disturbing user - * bps at the same PC), and a monotonic user-visible id. Reads of the - * patched range substitute the saved bytes so `x` and disasm stay honest. */ + * Keyed by runtime address. Each slot owns the bytes overwritten by the + * per-arch trap patch, a refcount (so step.c can drop temporaries without + * disturbing user bps at the same PC), and a monotonic user-visible id. Reads + * of the patched range substitute the saved bytes so `x` and disasm stay + * honest. 
*/ #include "dbg/dbg.h" @@ -76,19 +77,22 @@ void dbg_bp_fini(CfreeJitSession* s) { static CfreeStatus bp_install_patch(CfreeJitSession* s, DbgBp* b) { void* write_addr = NULL; - uint32_t brk; - u32 insn_len = 0; + uint8_t patch[ARCH_DBG_MAX_TRAP_BYTES]; + u32 patch_len = 0; CfreeStatus st; - brk = dbg_arch_brk_word(s->arch, &insn_len); - if (insn_len == 0) return CFREE_UNSUPPORTED; - b->saved_len = insn_len; + if (!s->arch_dbg || !s->arch_dbg->breakpoint_patch) + return CFREE_UNSUPPORTED; + st = s->arch_dbg->breakpoint_patch(patch, (u32)sizeof(patch), &patch_len); + if (st != CFREE_OK) return st; + if (patch_len == 0 || patch_len > sizeof(b->saved)) return CFREE_ERR; + b->saved_len = patch_len; st = s->os->code_write_begin(s->os->user, (void*)(uintptr_t)b->addr, b->saved_len, &write_addr); if (st != CFREE_OK || !write_addr) { return st != CFREE_OK ? st : CFREE_ERR; } memcpy(b->saved, write_addr, b->saved_len); - memcpy(write_addr, &brk, sizeof(brk)); + memcpy(write_addr, patch, b->saved_len); s->os->code_write_end(s->os->user, (void*)(uintptr_t)b->addr, b->saved_len); if (s->os->flush_icache) s->os->flush_icache(s->os->user, (void*)(uintptr_t)b->addr, b->saved_len); diff --git a/src/dbg/dbg.h b/src/dbg/dbg.h @@ -2,20 +2,20 @@ #define CFREE_DBG_INTERNAL_H /* Internal contracts for src/dbg/. The public CfreeJitSession entries are - * defined in session.c on top of these primitives; bp.c, step.c, mem.c, - * displaced.c, and arch/aa64/dbg.c each own one slice. */ + * defined in session.c on top of these primitives; bp.c, step.c, mem.c, and + * displaced.c own the target-independent session machinery. Per-arch debug + * behavior is reached through ArchImpl.dbg. 
*/ #include <cfree/dbg.h> #include <cfree/dwarf.h> #include <cfree/jit.h> +#include "arch/arch.h" #include "core/core.h" #define DBG_SCRATCH_PAGE_SIZE 4096u -#define DBG_BP_MAX_INSN_LEN 8u +#define DBG_BP_MAX_INSN_LEN ARCH_DBG_MAX_TRAP_BYTES #define DBG_BP_ID_INTERNAL_BASE 0x80000000u -#define DBG_AA64_INSN_LEN 4u -#define DBG_RV64_INSN_LEN 4u #define DBG_DISPLACED_SLOT_BYTES 64u /* Bridge into link_jit.c so the session can validate addresses and pick the @@ -85,16 +85,17 @@ CfreeStatus dbg_mem_write(struct CfreeJitSession*, uint64_t addr, const void* src, size_t n); /* ---- displaced step ------------------------------------------------- */ -/* The session owns a single executable scratch region. arch/aa64/dbg.c writes - * a fixed-up copy of the original insn plus a return-shim into it; the - * worker is then resumed with PC pointing at the scratch entry. The shim - * ends with a BRK that the fault classifier recognizes (via the bp table) - * and translates back into MODE_DONE. */ +/* The session owns a single executable scratch region. The per-arch debug + * vtable writes a fixed-up copy of the original insn plus a trap sentinel + * into it; the worker is then resumed with PC pointing at the scratch entry. + * The sentinel is patched into the bp table so the fault classifier recognizes + * it and translates the stop back into MODE_DONE. */ typedef struct DbgDisplaced { CfreeExecMemRegion region; int valid; uint64_t orig_pc; /* original user PC of the insn being stepped */ - uint64_t return_pc; /* PC the shim's BRK lives at (= scratch + N) */ + uint64_t return_pc; /* PC the shim's trap sentinel lives at */ + uint64_t fallthrough_pc; uint32_t internal_bp; /* id of the one-shot bp at return_pc */ } DbgDisplaced; @@ -103,62 +104,16 @@ void dbg_displaced_fini(struct CfreeJitSession*); /* Prepare an out-of-line single-step at `insn_pc`. Sets *new_pc to the * scratch entry the worker should branch to; arms an internal bp on the - * shim's BRK. 
Returns CFREE_OK on success, CFREE_UNSUPPORTED if the insn - * family is not supported. */ + * shim's trap sentinel. Returns CFREE_OK on success, CFREE_UNSUPPORTED if the + * insn family is not supported. */ CfreeStatus dbg_displaced_prepare(struct CfreeJitSession*, uint64_t insn_pc, uint64_t* new_pc); -/* After the shim BRK fires, finalize: clear the internal bp, restore the - * user-visible PC to insn_pc + 4 (or branch target captured by the shim). */ +/* After the shim trap fires, finalize: clear the internal bp, restore the + * user-visible PC to the decoded fallthrough PC (or leave a branch target + * captured by the shim alone). */ void dbg_displaced_finalize(struct CfreeJitSession*); - -/* ---- arch-aa64 ------------------------------------------------------ */ -uint32_t dbg_aa64_brk_word(void); -/* Lay down a displaced-step shim for the 4-byte AArch64 insn `orig_insn` - * (originally at `orig_pc`) into the scratch buffer beginning at - * `scratch_runtime`. Writes bytes through `scratch_write` (write alias). - * On success returns 0 and sets *brk_offset to the byte offset of the BRK - * sentinel from `scratch_runtime`; the caller arms an internal bp at - * `scratch_runtime + *brk_offset` and flushes the whole slot. Returns 1 - * for unsupported instruction families. Kept as int so arch/aa64/dbg.c - * does not have to learn the dbg-internal status type. */ -int dbg_aa64_build_shim(uint32_t orig_insn, uint64_t orig_pc, - void* scratch_write, uint64_t scratch_runtime, - u32* brk_offset); - -/* ---- arch-rv64 ------------------------------------------------------ - * Mirrors the aa64 contract for RISC-V 64. The trap instruction is - * EBREAK (0x00100073). The shim handles RV64I PC-relative insns: - * - JAL: rewrites to a materialize-target + JALR through t0. - * - JALR: copies verbatim (target is in register). - * - BEQ/BNE/BLT/BGE/BLTU/BGEU: emits a conditional-branch-then-JALR - * trampoline with the absolute target sitting in a literal pool. 
- * - AUIPC: rewrites as `lui` of the absolute high-20 of (orig_pc + imm). - * - Everything else (LUI, integer ALU, loads/stores, system, ...): - * copies verbatim followed by an ebreak sentinel. - * - * The shim must NOT clobber a0..a7 or s0..s11. It is free to use t0/t1 - * (x5/x6) as scratch. - * - * The arch-neutral dbg_arch_brk_word / dbg_arch_build_shim entry points - * below dispatch on session->arch. */ -uint32_t dbg_rv64_brk_word(void); -int dbg_rv64_build_shim(uint32_t orig_insn, uint64_t orig_pc, - void* scratch_write, uint64_t scratch_runtime, - u32* brk_offset); - -/* ---- arch dispatch -------------------------------------------------- */ -/* Returns the architecture's software-trap word, or 0 if the arch is - * not supported. `len_out`, when non-NULL, receives the trap insn's - * byte length (4 for both aa64 and rv64). */ -uint32_t dbg_arch_brk_word(CfreeArchKind arch, u32* len_out); -/* Returns the fixed instruction length used by the displaced-step shim - * for `arch`, or 0 if unsupported. */ -u32 dbg_arch_insn_len(CfreeArchKind arch); -/* Dispatches to the per-arch displaced-step lifter. Returns 1 for an - * unsupported arch or for an unsupported instruction family. */ -int dbg_arch_build_shim(CfreeArchKind arch, uint32_t orig_insn, - uint64_t orig_pc, void* scratch_write, - uint64_t scratch_runtime, u32* brk_offset); +CfreeStatus dbg_arch_decode_insn(struct CfreeJitSession*, uint64_t pc, + ArchDbgInsn* out); /* ---- step state machine --------------------------------------------- */ CfreeStatus dbg_step_resume(struct CfreeJitSession*, CfreeResumeMode mode); @@ -180,7 +135,8 @@ struct CfreeJitSession { * (and only that — the JIT image itself was already mapped at link * time). NULL if the host's JitHost did not supply an execmem. 
*/ const CfreeExecMem* execmem; - CfreeArchKind arch; + const ArchImpl* arch_impl; + const ArchDbgOps* arch_dbg; /* worker thread + event handshake */ void* worker; diff --git a/src/dbg/displaced.c b/src/dbg/displaced.c @@ -3,8 +3,8 @@ * Reserves a single executable page (W^X dual-mapped via the JitHost * execmem) the first time STEP_INSN is requested. The per-arch lifter * copies a fixed-up version of the instruction at insn_pc into that page, - * followed by a BRK sentinel; the session arms an internal breakpoint on - * the sentinel and resumes with PC = scratch_runtime. On the BRK fault, + * followed by a trap sentinel; the session arms an internal breakpoint on + * the sentinel and resumes with PC = scratch_runtime. On the trap fault, * the fault classifier sees the internal bp and the session uses * dbg_displaced_finalize to restore the user-visible PC. */ @@ -43,16 +43,18 @@ void dbg_displaced_fini(CfreeJitSession* s) { CfreeStatus dbg_displaced_prepare(CfreeJitSession* s, uint64_t insn_pc, uint64_t* new_pc) { - uint32_t orig_word = 0; + ArchDbgInsn insn; u32 brk_off = 0; uint64_t scratch_runtime; + uint64_t fallthrough_pc = 0; uint8_t* scratch_write; u32 bp_id = 0; CfreeStatus st; const CfreeExecMem* mem; - u32 insn_len = dbg_arch_insn_len(s->arch); - if (insn_len == 0) return CFREE_UNSUPPORTED; + if (!s->arch_dbg || !s->arch_dbg->build_displaced_shim) { + return CFREE_UNSUPPORTED; + } st = dbg_displaced_init(s); if (st != CFREE_OK) return st; @@ -64,44 +66,34 @@ CfreeStatus dbg_displaced_prepare(CfreeJitSession* s, uint64_t insn_pc, dbg_bp_clear(s, s->displaced.internal_bp); s->displaced.internal_bp = 0; s->displaced.return_pc = 0; + s->displaced.fallthrough_pc = 0; s->displaced.orig_pc = 0; } - /* Read the original 4 bytes via the bp table (so we get the saved - * insn, not the BRK if one is patched here). 
*/ - { - u32 idx = dbg_bp_lookup_index(s, insn_pc); - if (idx) { - DbgBp* b = dbg_bp_at_index(s, idx); - memcpy(&orig_word, b->saved, sizeof(orig_word)); - } else { - st = s->os->guarded_copy(s->os->user, &orig_word, - (const void*)(uintptr_t)insn_pc, - sizeof(orig_word)); - if (st != CFREE_OK) return st; - } - } + st = dbg_arch_decode_insn(s, insn_pc, &insn); + if (st != CFREE_OK) return st; scratch_runtime = (uint64_t)(uintptr_t)s->displaced.region.runtime; scratch_write = (uint8_t*)s->displaced.region.write; - if (dbg_arch_build_shim(s->arch, orig_word, insn_pc, scratch_write, - scratch_runtime, &brk_off) != 0) { - return CFREE_UNSUPPORTED; - } - /* Flush the entire slot — trampoline forms write up to 24 bytes plus a - * literal pool; arch/aa64/dbg.c returns the BRK *offset*, not the length. */ + st = s->arch_dbg->build_displaced_shim(&insn, scratch_write, scratch_runtime, + DBG_DISPLACED_SLOT_BYTES, &brk_off, + &fallthrough_pc); + if (st != CFREE_OK) return st; + + /* Flush the entire slot; trampoline forms may write past the sentinel. */ mem = s->execmem; if (mem && mem->flush_icache) { mem->flush_icache(mem->user, s->displaced.region.runtime, DBG_DISPLACED_SLOT_BYTES); } - /* Arm an internal breakpoint on the shim's BRK sentinel so the fault + /* Arm an internal breakpoint on the shim's trap sentinel so the fault * classifier identifies it as a displaced-step completion. */ st = dbg_bp_set_internal(s, scratch_runtime + brk_off, &bp_id); if (st != CFREE_OK) return st; s->displaced.orig_pc = insn_pc; s->displaced.return_pc = scratch_runtime + brk_off; + s->displaced.fallthrough_pc = fallthrough_pc; s->displaced.internal_bp = bp_id; if (new_pc) *new_pc = scratch_runtime; return CFREE_OK; @@ -116,10 +108,24 @@ void dbg_displaced_finalize(CfreeJitSession* s) { * fixed-up branch took (in which case PC will already be elsewhere * and we leave it alone). 
*/ if (s->stop.regs.pc == s->displaced.return_pc) { - u32 ilen = dbg_arch_insn_len(s->arch); - if (ilen == 0) ilen = DBG_AA64_INSN_LEN; - s->stop.regs.pc = s->displaced.orig_pc + ilen; + s->stop.regs.pc = s->displaced.fallthrough_pc; } s->displaced.orig_pc = 0; s->displaced.return_pc = 0; + s->displaced.fallthrough_pc = 0; +} + +CfreeStatus dbg_arch_decode_insn(CfreeJitSession* s, uint64_t pc, + ArchDbgInsn* out) { + uint8_t buf[ARCH_DBG_MAX_INSN_BYTES]; + CfreeStatus st; + u32 max_len; + if (!s || !out || !s->arch_dbg || !s->arch_dbg->decode_insn) + return CFREE_UNSUPPORTED; + max_len = s->arch_dbg->max_insn_len; + if (max_len == 0 || max_len > ARCH_DBG_MAX_INSN_BYTES) + return CFREE_UNSUPPORTED; + st = dbg_mem_read(s, pc, buf, max_len); + if (st != CFREE_OK) return st; + return s->arch_dbg->decode_insn(buf, max_len, pc, out); } diff --git a/src/dbg/session.c b/src/dbg/session.c @@ -3,7 +3,7 @@ * The session owns a single worker thread that runs the JIT'd entry. The * REPL thread and worker thread coordinate through two events (resume, * stop) and one shared CfreeStopInfo slot. Every fault on the worker - * (BRK / SIGSEGV / SIGBUS / SIGILL / SIGFPE / interrupt_signo) drops into + * (trap / SIGSEGV / SIGBUS / SIGILL / SIGFPE / interrupt_signo) drops into * on_fault here; this TU is also the only place that touches the public * CfreeJitSession entries. */ @@ -14,9 +14,9 @@ /* ---- fault classification ------------------------------------------- */ static int signo_is_trap(int signo) { - /* Trap = software breakpoint (BRK on aa64). The host typically maps + /* Trap = software breakpoint. The host typically maps * this to SIGTRAP; we don't include the platform header to assert the - * value but BRK-induced faults always carry the trap signo for the + * value but debugger trap faults always carry the trap signo for the * host that installed the handler. The POSIX impl in driver/env.c * passes the host signo straight through. 
We treat any signo whose * fault PC is patched as a trap, so the actual signo here is mostly @@ -28,6 +28,7 @@ static int signo_is_trap(int signo) { static CfreeStatus on_fault(void* session_v, int signo, CfreeUnwindFrame* regs) { CfreeJitSession* s = (CfreeJitSession*)session_v; + uint64_t bp_addr; u32 idx; DbgBp* bp; @@ -45,15 +46,21 @@ static CfreeStatus on_fault(void* session_v, int signo, goto park; } - /* Breakpoint? */ - idx = dbg_bp_lookup_index(s, regs->pc); + /* Breakpoint? The arch layer owns trap-PC normalization (e.g. x86 INT3 + * reports the PC after the trap byte). Keep non-breakpoint signals at the + * raw PC if no patch is found. */ + bp_addr = regs->pc; + if (s->arch_dbg && s->arch_dbg->breakpoint_addr_from_fault_pc) + bp_addr = s->arch_dbg->breakpoint_addr_from_fault_pc(regs->pc); + idx = dbg_bp_lookup_index(s, bp_addr); if (idx) { bp = dbg_bp_at_index(s, idx); + s->stop.regs.pc = bp_addr; /* Displaced-step sentinel: complete the step and either resume * silently (auto_continue path inside dbg_step_resume) or surface * a generic stop for STEP_INSN. */ - if (bp && bp->internal && s->displaced.return_pc == regs->pc) { + if (bp && bp->internal && s->displaced.return_pc == bp_addr) { dbg_displaced_finalize(s); /* Sync the on-stack regs with the corrected PC so the OS layer * writes it back into ucontext on return. */ @@ -131,7 +138,7 @@ static CfreeStatus on_fault(void* session_v, int signo, } /* Not a patched address — pass through as SIGNAL (covers SEGV, BUS, - * ILL, FPE, and any SIGTRAP from a program-emitted BRK). */ + * ILL, FPE, and any SIGTRAP from a program-emitted trap). 
*/ (void)signo_is_trap; s->stop.kind = CFREE_STOP_SIGNAL; @@ -276,6 +283,7 @@ CfreeStatus cfree_jit_session_new(CfreeJit* jit, const CfreeDbgHost* host, Compiler* c; Heap* heap; const CfreeDbgOs* os; + const ArchImpl* arch; CfreeDbgSignalOps ops; CfreeStatus st; @@ -290,11 +298,9 @@ CfreeStatus cfree_jit_session_new(CfreeJit* jit, const CfreeDbgHost* host, !os->code_write_begin || !os->code_write_end || !os->guarded_copy) { return CFREE_INVALID; } - /* v1 supports aarch64 and rv64 lifters; refuse other targets early so - * we don't end up with patched bytes we can't roll back. */ - { - CfreeArchKind arch = cfree_jit_image_arch(jit); - if (dbg_arch_insn_len(arch) == 0) return CFREE_UNSUPPORTED; + arch = arch_lookup(cfree_jit_image_arch(jit)); + if (!arch || !arch->dbg || !arch->dbg->breakpoint_patch) { + return CFREE_UNSUPPORTED; } heap = c->ctx->heap; @@ -309,7 +315,8 @@ CfreeStatus cfree_jit_session_new(CfreeJit* jit, const CfreeDbgHost* host, * consumer. May be NULL if the JIT was constructed without one, in * which case STEP_INSN paths will surface CFREE_UNSUPPORTED. */ s->execmem = cfree_jit_image_execmem(jit); - s->arch = cfree_jit_image_arch(jit); + s->arch_impl = arch; + s->arch_dbg = arch->dbg; s->state = DBG_STATE_IDLE; st = os->event_new(os->user, &s->ev_resume); @@ -446,7 +453,7 @@ CfreeStatus cfree_jit_session_resume(CfreeJitSession* s, CfreeResumeMode mode, /* For CONTINUE-over-bp we use displaced step to skip the patched insn * and rely on the on_fault handler's pending_step_pending fast-path to - * not surface that step's BRK to the REPL. */ + * not surface that step's trap to the REPL. 
*/ if (mode == CFREE_RESUME_CONTINUE && dbg_bp_lookup_index(s, s->stop.regs.pc) != 0) { s->pending_step_pending = 1; diff --git a/src/dbg/step.c b/src/dbg/step.c @@ -10,15 +10,6 @@ #include <string.h> #define DBG_STEP_LINE_INSN_CAP 1024u -#define DBG_AA64_BL_MASK 0xFC000000u -#define DBG_AA64_BL_OP 0x94000000u -/* RV64: JAL with rd != x0, or JALR with rd != x0, is a "call" for the - * purposes of NEXT_LINE (step over). The opcodes are 0x6F (JAL) and - * 0x67 (JALR); rd is bits 11:7. */ -#define DBG_RV64_OP_MASK 0x0000007fu -#define DBG_RV64_OP_JAL 0x0000006fu -#define DBG_RV64_OP_JALR 0x00000067u -#define DBG_RV64_RD_MASK 0x00000f80u /* DWARF line/CFI tables are authored in image-relative vaddrs (cfree's * debug emitter writes them, the JIT view applies relocs against final @@ -125,38 +116,6 @@ static CfreeStatus run_step_line_loop(CfreeJitSession* s) { return CFREE_OK; } -static CfreeStatus read_insn_word(CfreeJitSession* s, uint64_t pc, - uint32_t* out) { - uint8_t buf[4]; - CfreeStatus st = dbg_mem_read(s, pc, buf, 4); - if (st != CFREE_OK) return st; - *out = ((uint32_t)buf[0]) | ((uint32_t)buf[1] << 8) | - ((uint32_t)buf[2] << 16) | ((uint32_t)buf[3] << 24); - return CFREE_OK; -} - -static int aa64_is_bl(uint32_t insn) { - return (insn & DBG_AA64_BL_MASK) == DBG_AA64_BL_OP; -} - -static int rv64_is_call(uint32_t insn) { - uint32_t op = insn & DBG_RV64_OP_MASK; - if (op != DBG_RV64_OP_JAL && op != DBG_RV64_OP_JALR) return 0; - /* rd != x0 means the link register is being written -> treat as a call. 
*/ - return (insn & DBG_RV64_RD_MASK) != 0; -} - -static int arch_insn_is_call(CfreeArchKind arch, uint32_t insn) { - switch (arch) { - case CFREE_ARCH_ARM_64: - return aa64_is_bl(insn); - case CFREE_ARCH_RV64: - return rv64_is_call(insn); - default: - return 0; - } -} - static CfreeStatus run_step_out(CfreeJitSession* s) { CfreeUnwindFrame frame; u32 bp_id = 0; @@ -177,13 +136,13 @@ static CfreeStatus run_step_out(CfreeJitSession* s) { } static CfreeStatus run_next_line(CfreeJitSession* s) { - uint32_t insn = 0; - if (dbg_arch_insn_len(s->arch) == 0) return CFREE_UNSUPPORTED; + ArchDbgInsn insn; - if (read_insn_word(s, s->stop.regs.pc, &insn) != CFREE_OK) { + if (!s->arch_dbg || !s->arch_dbg->is_call || + dbg_arch_decode_insn(s, s->stop.regs.pc, &insn) != CFREE_OK) { return run_step_line_loop(s); } - if (!arch_insn_is_call(s->arch, insn)) { + if (!s->arch_dbg->is_call(&insn)) { return run_step_line_loop(s); } @@ -239,7 +198,6 @@ CfreeStatus dbg_step_resume(CfreeJitSession* s, CfreeResumeMode mode) { case CFREE_RESUME_NEXT_LINE: { CfreeStatus st; if (!s->dwarf) return CFREE_INVALID; - if (dbg_arch_insn_len(s->arch) == 0) return CFREE_UNSUPPORTED; st = run_next_line(s); if (st != CFREE_OK) return st; s->pending_done = 1; diff --git a/src/debug/debug_emit.c b/src/debug/debug_emit.c @@ -782,6 +782,40 @@ static void emit_section_abbrev(EmitCtx *e) { buf_fini(&b); } +static ArchDwarfOps debug_dwarf_ops(const Debug *d) { + const ArchImpl *arch = arch_for_compiler(d ? 
d->c : NULL); + ArchDwarfOps ops; + ops.min_inst_len = 1u; + ops.max_ops_per_inst = 1u; + ops.pad[0] = 0; + ops.pad[1] = 0; + if (arch && arch->dwarf) { + if (arch->dwarf->min_inst_len) + ops.min_inst_len = arch->dwarf->min_inst_len; + if (arch->dwarf->max_ops_per_inst) + ops.max_ops_per_inst = arch->dwarf->max_ops_per_inst; + } + return ops; +} + +static void line_advance_pc(Buf *prog, u32 byte_delta, u32 min_inst_len) { + if (byte_delta == 0) return; + if (min_inst_len == 0) min_inst_len = 1; + if ((byte_delta % min_inst_len) == 0) { + form_u8(prog, DW_LNS_advance_pc); + form_uleb(prog, byte_delta / min_inst_len); + return; + } + + while (byte_delta > 0xffffu) { + form_u8(prog, DW_LNS_fixed_advance_pc); + form_u16(prog, 0xffffu); + byte_delta -= 0xffffu; + } + form_u8(prog, DW_LNS_fixed_advance_pc); + form_u16(prog, (u16)byte_delta); +} + /* .debug_line program emission. * * Header layout (32-bit DWARF5): @@ -803,6 +837,11 @@ static void emit_section_line(EmitCtx *e) { u32 dir_count; Sym *dirs = NULL; u32 ndirs = 0, dirs_cap = 0; + ArchDwarfOps dwarf_ops = debug_dwarf_ops(e->d); + const u32 min_inst_len = + dwarf_ops.min_inst_len ? dwarf_ops.min_inst_len : 1u; + const u32 max_ops_per_inst = + dwarf_ops.max_ops_per_inst ? dwarf_ops.max_ops_per_inst : 1u; /* Pending line_strp relocs. Each slot is a u32 in hdr_body at * `slot[k].at` with addend `slot[k].ofs` (the resolved .debug_line_str * offset). Translated to section offsets and turned into R_ABS32 @@ -813,13 +852,6 @@ static void emit_section_line(EmitCtx *e) { u32 ofs; } *lsp_slots = NULL; u32 nlsp = 0, lsp_cap = 0; - /* aarch64 and rv64 (RV64I, no C-extension produced by the backend): - * instructions are 4-byte aligned and exactly 4 bytes wide. - * DW_LNS_advance_pc takes the advance in *operations*, which the - * consumer multiplies by min_inst_length (DWARF5 §6.2.5.2). Keep this - * in sync with the value emitted into the header below. 
x64 producers - * override at the call site if/when they grow .debug_line emission. */ - const u32 min_inst_len = 4; buf_init(&prog, e->heap); buf_init(&hdr_body, e->heap); @@ -859,8 +891,7 @@ static void emit_section_line(EmitCtx *e) { form_uleb(&prog, r->loc.col); } if (pc_delta != 0) { - form_u8(&prog, DW_LNS_advance_pc); - form_uleb(&prog, pc_delta / min_inst_len); + line_advance_pc(&prog, pc_delta, min_inst_len); } line_delta = (i64)r->loc.line - prev_line; if (line_delta != 0) { @@ -875,8 +906,7 @@ static void emit_section_line(EmitCtx *e) { u32 last = prev ? prev->offset : f->begin_ofs; u32 delta = f->end_ofs - last; if (delta != 0) { - form_u8(&prog, DW_LNS_advance_pc); - form_uleb(&prog, delta / min_inst_len); + line_advance_pc(&prog, delta, min_inst_len); } } form_u8(&prog, 0); @@ -885,8 +915,8 @@ static void emit_section_line(EmitCtx *e) { } /* Build header body (from min_inst_length onward). */ - form_u8(&hdr_body, (u8)min_inst_len); /* min_inst_length (aa64/rv64) */ - form_u8(&hdr_body, 1); /* max_ops_per_inst */ + form_u8(&hdr_body, (u8)min_inst_len); /* min_inst_length */ + form_u8(&hdr_body, (u8)max_ops_per_inst); /* max_ops_per_inst */ form_u8(&hdr_body, 1); /* default_is_stmt = 1 */ form_u8(&hdr_body, (u8)(i8)-5); /* line_base */ form_u8(&hdr_body, 14); /* line_range */ diff --git a/src/link/link_jit.c b/src/link/link_jit.c @@ -120,7 +120,6 @@ struct CfreeJit { * TCB ahead of .tdata and biases TP accordingly so a single TPREL * convention works for both arches. Mirrors src/link/link_elf.c's * TLS_TCB_SIZE comment. */ -#define AARCH64_TCB_SIZE 16ull #define JIT_TLS_TCB_SIZE 16ull static int reloc_is_tlsle(RelocKind k) { @@ -136,6 +135,13 @@ static int reloc_is_tlsle(RelocKind k) { static i64 jit_rv_pcrel_lo12_disp(LinkImage* img, CfreeExecMemRegion* segs, u64 auipc_image_vaddr); +/* x86_64 SysV TLS variant II: %fs points at the TCB immediately after + * the static TLS image, so a symbol at image offset X is addressed as + * X - tls_memsz. 
*/ +static int reloc_is_x64_tlsle(RelocKind k) { + return k == R_X64_TPOFF32 || k == R_X64_TPOFF64; +} + static int perms_for(u32 secflags) { int p = CFREE_PROT_READ; if (secflags & SF_EXEC) p |= CFREE_PROT_EXEC; @@ -508,6 +514,9 @@ CfreeJit* cfree_jit_from_image(LinkImage* img) { link_reloc_apply(c, alias, P_bytes, (u64)disp, 0, (u64)vaddr_to_runtime(img, segs, r->write_vaddr)); continue; + } else if (reloc_is_x64_tlsle(r->kind)) { + i64 off = (i64)(tgt->vaddr - img->tls_vaddr) - (i64)img->tls_memsz; + S = (u64)off; } else if (tgt->kind == SK_ABS) { /* extern resolver result OR true absolute symbol — vaddr * already holds the runtime address. */ @@ -865,6 +874,9 @@ static void jit_apply_one_reloc(CfreeJit* jit, const LinkRelocApply* r) { link_reloc_apply(jit->c, alias, P_bytes, (u64)disp, 0, (u64)vaddr_to_runtime(img, jit->segs, r->write_vaddr)); return; + } else if (reloc_is_x64_tlsle(r->kind)) { + i64 off = (i64)(tgt->vaddr - img->tls_vaddr) - (i64)img->tls_memsz; + S = (u64)off; } else if (tgt->kind == SK_ABS) { S = tgt->vaddr; } else { diff --git a/src/link/link_reloc.c b/src/link/link_reloc.c @@ -39,11 +39,28 @@ void link_reloc_apply(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A, return; } case R_ABS64: - case R_X64_TPOFF64: { + case R_X64_TPOFF64: + case R_X64_RELATIVE: { + /* R_X64_RELATIVE: (S + A) — for static-with-relocs paths the + * linker writes the relocated value directly; the dynamic + * loader would otherwise do the same fixup at load time. */ u64 v = S + (u64)A; wr_u64_le(P_bytes, v); return; } + case R_X64_GLOB_DAT: + case R_X64_JUMP_SLOT: { + /* Dynamic linker normally applies these; for static-with-relocs + * paths we write the resolved symbol value (S) into the GOT/PLT + * slot. Addend is unused per the x86_64 psABI. 
*/ + wr_u64_le(P_bytes, S); + return; + } + case R_X64_COPY: + compiler_panic(c, no_loc(), + "link: R_X64_COPY belongs in dynamic loader, " + "not static link"); + return; case R_REL32: case R_PC32: case R_X64_PLT32: @@ -62,6 +79,13 @@ void link_reloc_apply(Compiler* c, RelocKind k, u8* P_bytes, u64 S, i64 A, wr_u32_le(P_bytes, (u32)((u64)v & 0xffffffffu)); return; } + case R_X64_PC8: { + i64 v = (i64)S + A - (i64)P; + if (v < -128 || v > 127) + compiler_panic(c, no_loc(), "link: X64_PC8 out of range"); + P_bytes[0] = (u8)((u64)v & 0xffu); + return; + } case R_REL64: case R_PC64: { /* 64-bit PC-relative; AArch64 R_AARCH64_PREL64. Used by diff --git a/src/link/link_reloc_layout.c b/src/link/link_reloc_layout.c @@ -261,10 +261,15 @@ static u8 reloc_width(RelocKind k) { case R_REL64: case R_PC64: case R_X64_TPOFF64: + case R_X64_GLOB_DAT: + case R_X64_JUMP_SLOT: + case R_X64_RELATIVE: return 8; case R_AARCH64_ABS16: case R_AARCH64_PREL16: return 2; + case R_X64_PC8: + return 1; case R_AARCH64_JUMP26: case R_AARCH64_CALL26: case R_AARCH64_CONDBR19: diff --git a/test/api/abi_classify_test.c b/test/api/abi_classify_test.c @@ -160,6 +160,35 @@ static void expect_direct_1x_fp(const char* tag, const ABIArgInfo* ai, (unsigned)ai->parts[0].src_offset); } +static void expect_direct_2(const char* tag, const ABIArgInfo* ai, u8 c0, + u8 c1, u32 s0, u32 s1) { + EXPECT(ai->kind == ABI_ARG_DIRECT, "%s: kind=%d want DIRECT", tag, + (int)ai->kind); + EXPECT(ai->nparts == 2, "%s: nparts=%u want 2", tag, (unsigned)ai->nparts); + if (ai->nparts != 2 || !ai->parts) return; + EXPECT(ai->parts[0].cls == c0, "%s: parts[0].cls=%d want %d", tag, + (int)ai->parts[0].cls, (int)c0); + EXPECT(ai->parts[1].cls == c1, "%s: parts[1].cls=%d want %d", tag, + (int)ai->parts[1].cls, (int)c1); + EXPECT(ai->parts[0].size == s0, "%s: parts[0].size=%u want %u", tag, + (unsigned)ai->parts[0].size, s0); + EXPECT(ai->parts[1].size == s1, "%s: parts[1].size=%u want %u", tag, + (unsigned)ai->parts[1].size, s1); + 
EXPECT(ai->parts[0].src_offset == 0, "%s: parts[0].src_offset=%u want 0", + tag, (unsigned)ai->parts[0].src_offset); + EXPECT(ai->parts[1].src_offset == 8, "%s: parts[1].src_offset=%u want 8", + tag, (unsigned)ai->parts[1].src_offset); +} + +static CfreeCgTypeId record2(CfreeCompiler* c, CfreeCgTypeId a, + CfreeCgTypeId b) { + CfreeCgField f[2]; + memset(f, 0, sizeof f); + f[0].type = a; + f[1].type = b; + return cfree_cg_type_record(c, 0, f, 2); +} + static void check_target(CfreeArchKind arch, CfreeOSKind os, CfreeObjFmt obj) { CfreeCompiler* c = new_compiler(arch, os, obj); CfreeCgBuiltinTypes bi = cfree_cg_builtin_types(c); @@ -217,6 +246,44 @@ static void check_target(CfreeArchKind arch, CfreeOSKind os, CfreeObjFmt obj) { } } + if (arch == CFREE_ARCH_X86_64) { + CfreeCgTypeId f64_i64 = record2(c, bi.id[CFREE_CG_BUILTIN_F64], + bi.id[CFREE_CG_BUILTIN_I64]); + CfreeCgTypeId i64_f64 = record2(c, bi.id[CFREE_CG_BUILTIN_I64], + bi.id[CFREE_CG_BUILTIN_F64]); + CfreeCgTypeId f32x2 = record2(c, bi.id[CFREE_CG_BUILTIN_F32], + bi.id[CFREE_CG_BUILTIN_F32]); + { + const ABIFuncInfo* fi = classify_fn(c, f64_i64, f64_i64); + snprintf(tag, sizeof tag, "%s/%s {double,long} arg", + arch_name(arch), os_name(os)); + expect_direct_2(tag, &fi->params[0], ABI_CLASS_FP, ABI_CLASS_INT, 8, 8); + snprintf(tag, sizeof tag, "%s/%s {double,long} ret", + arch_name(arch), os_name(os)); + expect_direct_2(tag, &fi->ret, ABI_CLASS_FP, ABI_CLASS_INT, 8, 8); + EXPECT(fi->has_sret == 0, "%s/%s: mixed record should not use sret", + arch_name(arch), os_name(os)); + } + { + const ABIFuncInfo* fi = classify_fn(c, i64_f64, i64_f64); + snprintf(tag, sizeof tag, "%s/%s {long,double} arg", + arch_name(arch), os_name(os)); + expect_direct_2(tag, &fi->params[0], ABI_CLASS_INT, ABI_CLASS_FP, 8, 8); + snprintf(tag, sizeof tag, "%s/%s {long,double} ret", + arch_name(arch), os_name(os)); + expect_direct_2(tag, &fi->ret, ABI_CLASS_INT, ABI_CLASS_FP, 8, 8); + } + { + const ABIFuncInfo* fi = classify_fn(c, f32x2, 
f32x2); + snprintf(tag, sizeof tag, "%s/%s {float,float} arg", + arch_name(arch), os_name(os)); + expect_direct_1x_fp(tag, &fi->params[0], 8); + snprintf(tag, sizeof tag, "%s/%s {float,float} ret", + arch_name(arch), os_name(os)); + expect_direct_1x_fp(tag, &fi->ret, 8); + } + } + cfree_compiler_free(c); } diff --git a/test/arch/x64_dbg_test.c b/test/arch/x64_dbg_test.c @@ -0,0 +1,149 @@ +/* x86_64 ArchDbgOps unit test. + * + * Exercises the small debugger decoder/lifter used by displaced stepping: + * instruction length, RIP-relative displacement rebasing, FS-segment TLS + * addressing, branch rebasing, breakpoint patching, and call detection. */ + +#include <cfree/core.h> +#include <stdio.h> +#include <string.h> + +#include "arch/arch.h" +#include "core/bytes.h" + +static int g_fail = 0; + +#define EXPECT(cond, ...) \ + do { \ + if (!(cond)) { \ + g_fail++; \ + fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) + +static const ArchDbgOps* x64_dbg(void) { + const ArchImpl* a = arch_lookup(CFREE_ARCH_X86_64); + return a ? 
a->dbg : NULL; +} + +static ArchDbgInsn decode_one(const ArchDbgOps* dbg, const u8* bytes, u32 n, + u64 pc) { + ArchDbgInsn insn; + memset(&insn, 0, sizeof insn); + EXPECT(dbg->decode_insn(bytes, n, pc, &insn) == CFREE_OK, + "decode_insn failed"); + return insn; +} + +static void check_breakpoint(const ArchDbgOps* dbg) { + u8 patch[ARCH_DBG_MAX_TRAP_BYTES]; + u32 n = 0; + EXPECT(dbg->breakpoint_patch(patch, sizeof patch, &n) == CFREE_OK, + "breakpoint_patch failed"); + EXPECT(n == 1u && patch[0] == 0xccu, "bad breakpoint patch"); + EXPECT(dbg->breakpoint_addr_from_fault_pc(0x1235u) == 0x1234u, + "bad x64 breakpoint pc normalization"); +} + +static void check_rip_relative(const ArchDbgOps* dbg) { + u8 code[] = {0x48, 0x8b, 0x05, 0x34, 0x12, 0x00, 0x00}; + u8 shim[64]; + ArchDbgInsn insn = decode_one(dbg, code, sizeof code, 0x1000u); + u32 sentinel = 0; + u64 fallthrough = 0; + EXPECT(insn.len == sizeof code, "rip-rel len got %u", insn.len); + memset(shim, 0, sizeof shim); + EXPECT(dbg->build_displaced_shim(&insn, shim, 0x2000u, sizeof shim, &sentinel, + &fallthrough) == CFREE_OK, + "rip-rel shim failed"); + EXPECT(sentinel == sizeof code, "rip-rel sentinel got %u", sentinel); + EXPECT(fallthrough == 0x1000u + sizeof code, "rip-rel fallthrough bad"); + EXPECT(rd_u32_le(shim + 3) == 0x234u, "rip-rel disp got 0x%x", + rd_u32_le(shim + 3)); + EXPECT(shim[sizeof code] == 0xccu, "rip-rel missing sentinel"); +} + +static void check_fs_tls_copy(const ArchDbgOps* dbg) { + u8 code[] = {0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0}; + u8 shim[64]; + ArchDbgInsn insn = decode_one(dbg, code, sizeof code, 0x1000u); + u32 sentinel = 0; + u64 fallthrough = 0; + EXPECT(insn.len == sizeof code, "fs tls len got %u", insn.len); + memset(shim, 0xa5, sizeof shim); + EXPECT(dbg->build_displaced_shim(&insn, shim, 0x3000u, sizeof shim, &sentinel, + &fallthrough) == CFREE_OK, + "fs tls shim failed"); + EXPECT(memcmp(shim, code, sizeof code) == 0, + "fs absolute disp32 form should copy 
unchanged"); + EXPECT(sentinel == sizeof code && shim[sentinel] == 0xccu, + "fs tls sentinel bad"); + EXPECT(fallthrough == 0x1000u + sizeof code, "fs tls fallthrough bad"); +} + +static void check_branches_and_calls(const ArchDbgOps* dbg) { + u8 call[] = {0xe8, 0x78, 0x56, 0x34, 0x12}; + u8 jcc[] = {0x0f, 0x85, 0x04, 0x00, 0x00, 0x00}; + u8 short_jcc[] = {0x75, 0x7f}; + u8 jmp[] = {0xe9, 0x20, 0x00, 0x00, 0x00}; + u8 indirect_call[] = {0xff, 0xd0}; /* call *%rax */ + u8 shim[64]; + ArchDbgInsn insn = decode_one(dbg, call, sizeof call, 0x1000u); + u32 sentinel = 0; + u64 fallthrough = 0; + i64 target = (i64)(0x1000u + sizeof call) + (i64)0x12345678; + i64 nd = target - (i64)(0x2000u + sizeof call); + + EXPECT(insn.len == sizeof call, "call len got %u", insn.len); + EXPECT(dbg->is_call(&insn), "near call not detected"); + EXPECT(dbg->build_displaced_shim(&insn, shim, 0x2000u, sizeof shim, &sentinel, + &fallthrough) == CFREE_OK, + "call shim failed"); + EXPECT((i32)rd_u32_le(shim + 1) == (i32)nd, "call disp not rebased"); + + insn = decode_one(dbg, jcc, sizeof jcc, 0x4000u); + EXPECT(!dbg->is_call(&insn), "jcc detected as call"); + EXPECT(dbg->build_displaced_shim(&insn, shim, 0x5000u, sizeof shim, &sentinel, + &fallthrough) == CFREE_OK, + "jcc shim failed"); + EXPECT((i32)rd_u32_le(shim + 2) == (i32)(0x400au - 0x5006u), + "jcc disp not rebased"); + + insn = decode_one(dbg, short_jcc, sizeof short_jcc, 0x1000u); + EXPECT(dbg->build_displaced_shim(&insn, shim, 0x2000u, sizeof shim, &sentinel, + &fallthrough) == CFREE_OK, + "short jcc promotion failed"); + EXPECT(sentinel == 6u && shim[0] == 0x0fu && shim[1] == 0x85u, + "short jcc was not promoted"); + + insn = decode_one(dbg, jmp, sizeof jmp, 0x7000u); + EXPECT(dbg->build_displaced_shim(&insn, shim, 0x8000u, sizeof shim, &sentinel, + &fallthrough) == CFREE_OK, + "jmp shim failed"); + EXPECT(sentinel == 0u && shim[0] == 0xccu, "jmp should trap immediately"); + EXPECT(fallthrough == 0x7025u, "jmp target got 0x%llx", + 
(unsigned long long)fallthrough); + + insn = decode_one(dbg, indirect_call, sizeof indirect_call, 0x6000u); + EXPECT(insn.len == sizeof indirect_call, "indirect call len got %u", + insn.len); + EXPECT(dbg->is_call(&insn), "indirect call not detected"); +} + +int main(void) { + const ArchDbgOps* dbg = x64_dbg(); + EXPECT(dbg != NULL, "x64 dbg ops missing"); + if (!dbg) return 1; + check_breakpoint(dbg); + check_rip_relative(dbg); + check_fs_tls_copy(dbg); + check_branches_and_calls(dbg); + if (g_fail) { + fprintf(stderr, "%d FAILED\n", g_fail); + return 1; + } + printf("x64 dbg test: OK\n"); + return 0; +} diff --git a/test/arch/x64_inline_test.c b/test/arch/x64_inline_test.c @@ -0,0 +1,511 @@ +/* x86_64 inline-asm backend unit test (peer of aa64_inline_test.c). + * + * Drives x_asm_block (via the CGTarget vtable) directly: builds an + * Operand array by hand, calls the entry point against an in-process + * MCEmitter, and asserts the emitted .text bytes match the expected + * machine encoding. No parser or cg involvement — this isolates the + * x64 template walker (render_operand + run_one_line) and the + * per-mnemonic dispatch in x_arch_asm_insn. + * + * Smoke cases exercise: + * - basic movq reg-to-reg via "%0" / "%1" + * - %k modifier (32-bit reg form) via movl + * - %x modifier (64-bit reg form) + * - %b modifier (8-bit reg form) via movb + * - GNU x86 %w/%h/%z modifiers and symbolic %[name] operands + * - "%%" literal escape (via "nop ; nop") + * - %a address-form rendering for an OPK_INDIRECT operand + * + * Builds against the internal arch/ + obj/ surface (test.mk passes + * -Isrc). Mirrors aa64_inline_test.c byte-for-byte where possible. 
*/ + +#include <cfree/core.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "arch/arch.h" +#include "arch/x64/asm.h" +#include "arch/x64/isa.h" +#include "core/buf.h" +#include "core/core.h" +#include "core/pool.h" +#include "obj/obj.h" + +/* ---- env ---- */ + +static void* h_alloc(CfreeHeap* h, size_t n, size_t a) { + (void)h; + (void)a; + return n ? malloc(n) : NULL; +} +static void* h_realloc(CfreeHeap* h, void* p, size_t o, size_t n, size_t a) { + (void)h; + (void)o; + (void)a; + return realloc(p, n); +} +static void h_free(CfreeHeap* h, void* p, size_t n) { + (void)h; + (void)n; + free(p); +} +static CfreeHeap g_heap = {h_alloc, h_realloc, h_free, NULL}; + +static void diag_emit(CfreeDiagSink* s, CfreeDiagKind k, CfreeSrcLoc loc, + const char* fmt, va_list ap) { + (void)s; + (void)loc; + fprintf(stderr, "[%s] ", + k == CFREE_DIAG_ERROR ? "error" + : k == CFREE_DIAG_WARN ? "warning" + : "note"); + vfprintf(stderr, fmt, ap); + fputc('\n', stderr); +} +static CfreeDiagSink g_sink = {diag_emit, 0, 0, 0}; +static CfreeContext g_ctx = {.heap = &g_heap, .diag = &g_sink, .now = -1}; + +static int g_fail = 0; +#define EXPECT(cond, ...) \ + do { \ + if (!(cond)) { \ + g_fail++; \ + fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) + +/* Architecture-defined opcode constants used by the expected-encoding + * table. Promoted to named constants per the project convention + * (no bare hex literals as load-bearing values). 
*/ +#define X64_NOP_BYTE 0x90u +#define EXPECTED_MOVQ_RAX_RCX_0 0x48u +#define EXPECTED_MOVQ_RAX_RCX_1 0x89u +#define EXPECTED_MOVQ_RAX_RCX_2 0xC8u +#define EXPECTED_MOVL_EAX_ECX_0 0x89u +#define EXPECTED_MOVL_EAX_ECX_1 0xC8u +#define EXPECTED_MOVB_AL_CL_0 0x88u +#define EXPECTED_MOVB_AL_CL_1 0xC8u + +static u8 read_byte(const Section* s, u32 ofs) { + u8 b; + buf_read(&s->bytes, ofs, &b, 1); + return b; +} + +/* External constructors we need from the internal arch surface — these + * are the same entry points cg_runner uses to spin up a backend without + * dragging in opt or the JIT. */ +MCEmitter* mc_new(Compiler*, ObjBuilder*); +CGTarget* cgtarget_new(Compiler*, ObjBuilder*, MCEmitter*); + +int main(void) { + CfreeTarget t; + memset(&t, 0, sizeof t); + t.arch = CFREE_ARCH_X86_64; + t.os = CFREE_OS_LINUX; + t.obj = CFREE_OBJ_ELF; + t.ptr_size = 8; + t.ptr_align = 8; + + CfreeCompiler* cc = NULL; + if (cfree_compiler_new(t, &g_ctx, &cc) != CFREE_OK || !cc) { + fprintf(stderr, "compiler_new failed\n"); + return 2; + } + Compiler* c = (Compiler*)cc; + CfreeCgBuiltinTypes bi = cfree_cg_builtin_types(cc); + + if (setjmp(c->panic)) { + fprintf(stderr, "FAIL: compiler panic\n"); + cfree_compiler_free(cc); + return 1; + } + + ObjBuilder* ob = obj_new(c); + Pool* pool = c->global; + ObjSecId text_sec = obj_section(ob, pool_intern_cstr(pool, ".text"), + SEC_TEXT, SF_EXEC | SF_ALLOC, 16); + MCEmitter* mc = mc_new(c, ob); + mc->set_section(mc, text_sec); + CGTarget* target = cgtarget_new(c, ob, mc); + + /* ---- smoke 1: movq reg-to-reg via "%0" / "%1" (default form) ---- + * + * outs = "=r" → rax, ins = "r" → rcx. AT&T spelling: "movq %1, %0" + * means MOV from %1 into %0, i.e. MOV rax <- rcx. + * Expected encoding: 48 89 C8. 
*/ + { + AsmConstraint outs[1] = {{0}}; + outs[0].str = "=r"; + outs[0].dir = ASM_OUT; + Operand out_ops[1]; + memset(out_ops, 0, sizeof out_ops); + out_ops[0].kind = OPK_REG; + out_ops[0].cls = RC_INT; + out_ops[0].v.reg = X64_RAX; + + AsmConstraint ins[1] = {{0}}; + ins[0].str = "r"; + ins[0].dir = ASM_IN; + Operand in_ops[1]; + memset(in_ops, 0, sizeof in_ops); + in_ops[0].kind = OPK_REG; + in_ops[0].cls = RC_INT; + in_ops[0].v.reg = X64_RCX; + + u32 start = mc->pos(mc); + target->asm_block(target, "movq %1, %0", + outs, 1, out_ops, ins, 1, in_ops, NULL, 0); + u32 end = mc->pos(mc); + + EXPECT(end - start == 3u, "smoke1: expected 3 bytes, got %u", + (end - start)); + if (end - start == 3u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == EXPECTED_MOVQ_RAX_RCX_0, + "smoke1: byte 0 0x%02x, want 0x48", + read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == EXPECTED_MOVQ_RAX_RCX_1, + "smoke1: byte 1 0x%02x, want 0x89", + read_byte(sec, start + 1)); + EXPECT(read_byte(sec, start + 2) == EXPECTED_MOVQ_RAX_RCX_2, + "smoke1: byte 2 0x%02x, want 0xC8", + read_byte(sec, start + 2)); + } + } + + /* ---- smoke 2: %k modifier picks 32-bit reg form ---- + * + * "movl %k1, %k0" with out=rax in=rcx -> "movl %ecx, %eax" -> 89 C8. 
*/ + { + AsmConstraint outs[1] = {{0}}; + outs[0].str = "=r"; + outs[0].dir = ASM_OUT; + Operand out_ops[1]; + memset(out_ops, 0, sizeof out_ops); + out_ops[0].kind = OPK_REG; + out_ops[0].cls = RC_INT; + out_ops[0].v.reg = X64_RAX; + + AsmConstraint ins[1] = {{0}}; + ins[0].str = "r"; + ins[0].dir = ASM_IN; + Operand in_ops[1]; + memset(in_ops, 0, sizeof in_ops); + in_ops[0].kind = OPK_REG; + in_ops[0].cls = RC_INT; + in_ops[0].v.reg = X64_RCX; + + u32 start = mc->pos(mc); + target->asm_block(target, "movl %k1, %k0", + outs, 1, out_ops, ins, 1, in_ops, NULL, 0); + u32 end = mc->pos(mc); + + EXPECT(end - start == 2u, "smoke2: expected 2 bytes, got %u", + (end - start)); + if (end - start == 2u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == EXPECTED_MOVL_EAX_ECX_0, + "smoke2: byte 0 0x%02x, want 0x89", + read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == EXPECTED_MOVL_EAX_ECX_1, + "smoke2: byte 1 0x%02x, want 0xC8", + read_byte(sec, start + 1)); + } + } + + /* ---- smoke 3: %x modifier picks 64-bit reg form ---- + * + * "movq %x1, %x0" with out=rax in=rcx → "movq %rcx, %rax" → 48 89 C8. 
*/ + { + AsmConstraint outs[1] = {{0}}; + outs[0].str = "=r"; + outs[0].dir = ASM_OUT; + Operand out_ops[1]; + memset(out_ops, 0, sizeof out_ops); + out_ops[0].kind = OPK_REG; + out_ops[0].cls = RC_INT; + out_ops[0].v.reg = X64_RAX; + + AsmConstraint ins[1] = {{0}}; + ins[0].str = "r"; + ins[0].dir = ASM_IN; + Operand in_ops[1]; + memset(in_ops, 0, sizeof in_ops); + in_ops[0].kind = OPK_REG; + in_ops[0].cls = RC_INT; + in_ops[0].v.reg = X64_RCX; + + u32 start = mc->pos(mc); + target->asm_block(target, "movq %x1, %x0", + outs, 1, out_ops, ins, 1, in_ops, NULL, 0); + u32 end = mc->pos(mc); + + EXPECT(end - start == 3u, "smoke3: expected 3 bytes, got %u", + (end - start)); + if (end - start == 3u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == EXPECTED_MOVQ_RAX_RCX_0, + "smoke3: byte 0 0x%02x, want 0x48", + read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == EXPECTED_MOVQ_RAX_RCX_1, + "smoke3: byte 1 0x%02x, want 0x89", + read_byte(sec, start + 1)); + EXPECT(read_byte(sec, start + 2) == EXPECTED_MOVQ_RAX_RCX_2, + "smoke3: byte 2 0x%02x, want 0xC8", + read_byte(sec, start + 2)); + } + } + + /* ---- smoke 4: %b modifier picks 8-bit reg form via movb ---- + * + * "movb %b1, %b0" with out=rax in=rcx → "movb %cl, %al" → 88 C8. 
*/ + { + AsmConstraint outs[1] = {{0}}; + outs[0].str = "=r"; + outs[0].dir = ASM_OUT; + Operand out_ops[1]; + memset(out_ops, 0, sizeof out_ops); + out_ops[0].kind = OPK_REG; + out_ops[0].cls = RC_INT; + out_ops[0].v.reg = X64_RAX; + + AsmConstraint ins[1] = {{0}}; + ins[0].str = "r"; + ins[0].dir = ASM_IN; + Operand in_ops[1]; + memset(in_ops, 0, sizeof in_ops); + in_ops[0].kind = OPK_REG; + in_ops[0].cls = RC_INT; + in_ops[0].v.reg = X64_RCX; + + u32 start = mc->pos(mc); + target->asm_block(target, "movb %b1, %b0", + outs, 1, out_ops, ins, 1, in_ops, NULL, 0); + u32 end = mc->pos(mc); + + EXPECT(end - start == 2u, "smoke4: expected 2 bytes, got %u", + (end - start)); + if (end - start == 2u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == EXPECTED_MOVB_AL_CL_0, + "smoke4: byte 0 0x%02x, want 0x88", + read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == EXPECTED_MOVB_AL_CL_1, + "smoke4: byte 1 0x%02x, want 0xC8", + read_byte(sec, start + 1)); + } + } + + /* ---- smoke 5: "%%" literal escape ---- + * + * The template walker collapses "%%" to a single '%' before handing + * the rendered line to the per-mnemonic dispatch. There is no clean + * way to land a stray '%' in a no-operand mnemonic on x64 (every + * register reference starts with '%'), so we use the same trick as + * the aa64 peer: confirm two no-operand instructions on one line + * tokenize correctly through the ';' line splitter. Then drive a + * separate template that contains "%%" inside what would otherwise + * be a literal block — the line must still parse as "nop". 
*/ + { + u32 start = mc->pos(mc); + target->asm_block(target, "nop ; nop", + NULL, 0, NULL, NULL, 0, NULL, NULL, 0); + u32 end = mc->pos(mc); + EXPECT(end - start == 2u, "smoke5a: expected 2 bytes, got %u", + (end - start)); + if (end - start == 2u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == X64_NOP_BYTE, + "smoke5a: nop[0] = 0x%02x", read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == X64_NOP_BYTE, + "smoke5a: nop[1] = 0x%02x", read_byte(sec, start + 1)); + } + } + /* The "%%" → "%" collapse is exercised directly: a template + * containing "%%" followed by a register name must produce the + * literal "%<name>" in the rendered line and assemble cleanly. + * "movq %%rcx, %%rax" is the AT&T spelling of MOV rax <- rcx + * once the "%%" pairs collapse. */ + { + u32 start = mc->pos(mc); + target->asm_block(target, "movq %%rcx, %%rax", + NULL, 0, NULL, NULL, 0, NULL, NULL, 0); + u32 end = mc->pos(mc); + EXPECT(end - start == 3u, "smoke5b: expected 3 bytes, got %u", + (end - start)); + if (end - start == 3u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == EXPECTED_MOVQ_RAX_RCX_0, + "smoke5b: byte 0 0x%02x", read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == EXPECTED_MOVQ_RAX_RCX_1, + "smoke5b: byte 1 0x%02x", read_byte(sec, start + 1)); + EXPECT(read_byte(sec, start + 2) == EXPECTED_MOVQ_RAX_RCX_2, + "smoke5b: byte 2 0x%02x", read_byte(sec, start + 2)); + } + } + + /* ---- smoke 6: %a renders an OPK_INDIRECT as a memory operand ---- + * + * Output is INDIRECT [rcx + 0]; input is rax. Template + * "movq %1, %a0" should render as "movq %rax, (%rcx)" and dispatch + * to the REG, MEM branch of movq. + * Expected: REX.W=0x48, 0x89, modrm(0, reg=rax=0, rm=rcx=1)=0x01. 
*/ + { + AsmConstraint outs[1] = {{0}}; + outs[0].str = "=m"; + outs[0].dir = ASM_OUT; + Operand out_ops[1]; + memset(out_ops, 0, sizeof out_ops); + out_ops[0].kind = OPK_INDIRECT; + out_ops[0].cls = RC_INT; + out_ops[0].v.ind.base = X64_RCX; + out_ops[0].v.ind.ofs = 0; + + AsmConstraint ins[1] = {{0}}; + ins[0].str = "r"; + ins[0].dir = ASM_IN; + Operand in_ops[1]; + memset(in_ops, 0, sizeof in_ops); + in_ops[0].kind = OPK_REG; + in_ops[0].cls = RC_INT; + in_ops[0].v.reg = X64_RAX; + + u32 start = mc->pos(mc); + target->asm_block(target, "movq %1, %a0", + outs, 1, out_ops, ins, 1, in_ops, NULL, 0); + u32 end = mc->pos(mc); + + EXPECT(end - start == 3u, "smoke6: expected 3 bytes, got %u", + (end - start)); + if (end - start == 3u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == 0x48u, + "smoke6: byte 0 0x%02x, want 0x48", read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == 0x89u, + "smoke6: byte 1 0x%02x, want 0x89", read_byte(sec, start + 1)); + EXPECT(read_byte(sec, start + 2) == 0x01u, + "smoke6: byte 2 0x%02x, want 0x01", read_byte(sec, start + 2)); + } + } + + /* ---- smoke 7: GNU x86 modifiers %k/%w/%h/%z and symbolic names ---- */ + { + AsmConstraint outs[1] = {{0}}; + outs[0].str = "=r"; + outs[0].dir = ASM_OUT; + outs[0].name = pool_intern_cstr(pool, "dst"); + Operand out_ops[1]; + memset(out_ops, 0, sizeof out_ops); + out_ops[0].kind = OPK_REG; + out_ops[0].cls = RC_INT; + out_ops[0].type = bi.id[CFREE_CG_BUILTIN_I32]; + out_ops[0].v.reg = X64_RAX; + + AsmConstraint ins[1] = {{0}}; + ins[0].str = "r"; + ins[0].dir = ASM_IN; + ins[0].name = pool_intern_cstr(pool, "src"); + Operand in_ops[1]; + memset(in_ops, 0, sizeof in_ops); + in_ops[0].kind = OPK_REG; + in_ops[0].cls = RC_INT; + in_ops[0].type = bi.id[CFREE_CG_BUILTIN_I32]; + in_ops[0].v.reg = X64_RCX; + + u32 start = mc->pos(mc); + target->asm_block(target, "mov%z[dst] %k[src], %k[dst]", + outs, 1, out_ops, ins, 1, in_ops, NULL, 0); + u32 end = 
mc->pos(mc); + EXPECT(end - start == 2u, "smoke7a: expected 2 bytes, got %u", + (end - start)); + if (end - start == 2u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == EXPECTED_MOVL_EAX_ECX_0, + "smoke7a: byte 0 0x%02x, want 0x89", read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == EXPECTED_MOVL_EAX_ECX_1, + "smoke7a: byte 1 0x%02x, want 0xC8", read_byte(sec, start + 1)); + } + } + { + AsmConstraint outs[1] = {{0}}; + outs[0].str = "=r"; + outs[0].dir = ASM_OUT; + Operand out_ops[1]; + memset(out_ops, 0, sizeof out_ops); + out_ops[0].kind = OPK_REG; + out_ops[0].cls = RC_INT; + out_ops[0].v.reg = X64_RAX; + + AsmConstraint ins[1] = {{0}}; + ins[0].str = "r"; + ins[0].dir = ASM_IN; + Operand in_ops[1]; + memset(in_ops, 0, sizeof in_ops); + in_ops[0].kind = OPK_REG; + in_ops[0].cls = RC_INT; + in_ops[0].v.reg = X64_RCX; + + u32 start = mc->pos(mc); + target->asm_block(target, "movw %w1, %w0", + outs, 1, out_ops, ins, 1, in_ops, NULL, 0); + u32 end = mc->pos(mc); + EXPECT(end - start == 3u, "smoke7b: expected 3 bytes, got %u", + (end - start)); + if (end - start == 3u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == 0x66u, + "smoke7b: byte 0 0x%02x, want 0x66", read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == 0x89u, + "smoke7b: byte 1 0x%02x, want 0x89", read_byte(sec, start + 1)); + EXPECT(read_byte(sec, start + 2) == 0xC8u, + "smoke7b: byte 2 0x%02x, want 0xC8", read_byte(sec, start + 2)); + } + } + { + AsmConstraint outs[1] = {{0}}; + outs[0].str = "=r"; + outs[0].dir = ASM_OUT; + Operand out_ops[1]; + memset(out_ops, 0, sizeof out_ops); + out_ops[0].kind = OPK_REG; + out_ops[0].cls = RC_INT; + out_ops[0].v.reg = X64_RDX; + + AsmConstraint ins[1] = {{0}}; + ins[0].str = "r"; + ins[0].dir = ASM_IN; + Operand in_ops[1]; + memset(in_ops, 0, sizeof in_ops); + in_ops[0].kind = OPK_REG; + in_ops[0].cls = RC_INT; + in_ops[0].v.reg = X64_RCX; + + u32 start = 
mc->pos(mc); + target->asm_block(target, "movb %h1, %h0", + outs, 1, out_ops, ins, 1, in_ops, NULL, 0); + u32 end = mc->pos(mc); + EXPECT(end - start == 2u, "smoke7c: expected 2 bytes, got %u", + (end - start)); + if (end - start == 2u) { + const Section* sec = obj_section_get(ob, text_sec); + EXPECT(read_byte(sec, start) == 0x88u, + "smoke7c: byte 0 0x%02x, want 0x88", read_byte(sec, start)); + EXPECT(read_byte(sec, start + 1) == 0xEEu, + "smoke7c: byte 1 0x%02x, want 0xEE", read_byte(sec, start + 1)); + } + } + + cfree_compiler_free(cc); + + if (g_fail) { + fprintf(stderr, "%d failure(s)\n", g_fail); + return 1; + } + printf("x64_inline_test: ok\n"); + return 0; +} diff --git a/test/asm/decode/x64_lock_mfence.expected.txt b/test/asm/decode/x64_lock_mfence.expected.txt @@ -0,0 +1,2 @@ +0: lock xaddq %rcx, (%rax) +5: mfence diff --git a/test/asm/decode/x64_lock_mfence.hex b/test/asm/decode/x64_lock_mfence.hex @@ -0,0 +1 @@ +f0480fc1080faef0 diff --git a/test/asm/decode/x64_lock_mfence.targets b/test/asm/decode/x64_lock_mfence.targets @@ -0,0 +1 @@ +x64 diff --git a/test/asm/encode/x64_isa_core.expected.hex b/test/asm/encode/x64_isa_core.expected.hex @@ -0,0 +1 @@ +554889e548b80000000001000000488945f8488b4df8488d55f84801c84883e801486bc0034839c8400f94c6400fb6c65dc3 diff --git a/test/asm/encode/x64_isa_core.s b/test/asm/encode/x64_isa_core.s @@ -0,0 +1,18 @@ +// x64 descriptor-backed assembler coverage. +.text +.globl test_main +test_main: + pushq %rbp + movq %rsp, %rbp + movq $4294967296, %rax + movq %rax, -8(%rbp) + movq -8(%rbp), %rcx + leaq -8(%rbp), %rdx + addq %rcx, %rax + subq $1, %rax + imulq $3, %rax, %rax + cmpq %rcx, %rax + sete %sil + movzbl %sil, %eax + popq %rbp + ret diff --git a/test/asm/encode/x64_isa_core.targets b/test/asm/encode/x64_isa_core.targets @@ -0,0 +1 @@ +x64 diff --git a/test/asm/harness/asm_runner.c b/test/asm/harness/asm_runner.c @@ -19,6 +19,10 @@ * — strict dual-mapping on Apple/Linux, single mapping elsewhere. 
Only * --jit exercises it. */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include <cfree/compile.h> #include <cfree/core.h> #include <cfree/disasm.h> @@ -93,6 +97,24 @@ static int xm_to_posix(int p) { if (p & CFREE_PROT_EXEC) q |= PROT_EXEC; return q; } +#if XM_DUAL_LINUX && defined(__x86_64__) && defined(MAP_32BIT) +#define XM_MAP_32BIT MAP_32BIT +static uintptr_t g_xm_low_runtime_hint = 0x40000000u; +static void* xm_low_runtime_hint(size_t n) { + uintptr_t p = g_xm_low_runtime_hint; + uintptr_t step = (uintptr_t)((n + 0xffffu) & ~(size_t)0xffffu); + if (step < 0x10000u) step = 0x10000u; + g_xm_low_runtime_hint = p + step + 0x10000u; + if (g_xm_low_runtime_hint > 0x78000000u) g_xm_low_runtime_hint = 0x40000000u; + return (void*)p; +} +#elif XM_DUAL_LINUX +#define XM_MAP_32BIT 0 +static void* xm_low_runtime_hint(size_t n) { + (void)n; + return NULL; +} +#endif typedef struct XmTok { void* w; void* r; @@ -157,12 +179,14 @@ static CfreeStatus xm_reserve(void* u, size_t n, int p, close(fd); return CFREE_NOMEM; } - w = mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + w = mmap(NULL, n, PROT_READ | PROT_WRITE, + MAP_SHARED | XM_MAP_32BIT, fd, 0); if (w == MAP_FAILED) { close(fd); return CFREE_NOMEM; } - r = mmap(NULL, n, PROT_READ, MAP_SHARED, fd, 0); + r = mmap(xm_low_runtime_hint(n), n, PROT_READ, + MAP_SHARED | XM_MAP_32BIT, fd, 0); close(fd); if (r == MAP_FAILED) { munmap(w, n); diff --git a/test/asm/listing/x64_symbols.expected.lst b/test/asm/listing/x64_symbols.expected.lst @@ -0,0 +1,7 @@ +Disassembly of section .text: + +0000000000000000 <test_main>: + 0: e800000000 callq 0x5 ; ext_fn-4 + 5: e900000000 jmp 0xa ; local_done-4 +000000000000000a <local_done>: + a: c3 ret diff --git a/test/asm/listing/x64_symbols.in.bin b/test/asm/listing/x64_symbols.in.bin Binary files differ. diff --git a/test/asm/listing/x64_symbols.s b/test/asm/listing/x64_symbols.s @@ -0,0 +1,8 @@ +// Source for x64_symbols.in.bin. 
+.text +.globl test_main +test_main: + callq ext_fn + jmp local_done +local_done: + ret diff --git a/test/asm/listing/x64_symbols.targets b/test/asm/listing/x64_symbols.targets @@ -0,0 +1 @@ +x64 diff --git a/test/asm/regen.sh b/test/asm/regen.sh @@ -8,8 +8,8 @@ # ./regen.sh <name> regenerate just one case (substring match) # # Requires: -# clang --target=aarch64-linux-gnu (the system clang on macOS is fine) -# llvm-objdump or aarch64-linux-gnu-objdump +# clang --target=aarch64-linux-gnu or --target=x86_64-linux-gnu +# llvm-objdump or a target objdump # xxd (for hex dumps) set -eu @@ -17,21 +17,55 @@ set -eu ROOT="$(cd "$(dirname "$0")/../.." && pwd)" TEST_DIR="$ROOT/test/asm" FILTER="${1:-}" +CFREE_TEST_ARCH="${CFREE_TEST_ARCH:-aa64}" +ASM_RUNNER="$ROOT/build/test/asm-runner" -CLANG_TARGET="--target=aarch64-linux-gnu" -OBJDUMP="$(command -v llvm-objdump 2>/dev/null || command -v aarch64-linux-gnu-objdump 2>/dev/null || true)" +case "$CFREE_TEST_ARCH" in + aa64|aarch64|arm64) + TEST_ARCH=aa64 + CLANG_TARGET="--target=aarch64-linux-gnu" + OBJDUMP_MACHINE="aarch64" + TARGET_OBJDUMP="aarch64-linux-gnu-objdump" + ;; + x64|x86_64|amd64) + TEST_ARCH=x64 + CLANG_TARGET="--target=x86_64-linux-gnu" + OBJDUMP_MACHINE="i386:x86-64" + TARGET_OBJDUMP="x86_64-linux-gnu-objdump" + ;; + *) + printf 'regen.sh: unknown CFREE_TEST_ARCH=%s (want aa64|x64)\n' "$CFREE_TEST_ARCH" >&2 + exit 2 + ;; +esac + +OBJDUMP="$(command -v llvm-objdump 2>/dev/null || command -v "$TARGET_OBJDUMP" 2>/dev/null || true)" if [ -z "$OBJDUMP" ]; then - printf 'regen.sh: no llvm-objdump / aarch64-linux-gnu-objdump on PATH\n' >&2 + printf 'regen.sh: no llvm-objdump / %s on PATH\n' "$TARGET_OBJDUMP" >&2 exit 1 fi tmp="$(mktemp -d)" trap 'rm -rf "$tmp"' EXIT +case_applies() { + local dir="$1" name="$2" targets tuple + targets="$dir/$name.targets" + [ -f "$targets" ] || return 0 + for tuple in $(cat "$targets"); do + case "$tuple:$TEST_ARCH" in + aa64:aa64|aarch64:aa64|arm64:aa64) return 0 ;; + 
x64:x64|x86_64:x64|amd64:x64) return 0 ;; + esac + done + return 1 +} + regen_encode() { local src="$1" name out_obj out_hex name="$(basename "$src" .s)" [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && return 0 + case_applies "$TEST_DIR/encode" "$name" || return 0 out_obj="$tmp/$name.o" out_hex="$TEST_DIR/encode/$name.expected.hex" clang $CLANG_TARGET -c "$src" -o "$out_obj" @@ -51,22 +85,42 @@ regen_decode() { local hexfile="$1" name out_txt name="$(basename "$hexfile" .hex)" [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && return 0 + case_applies "$TEST_DIR/decode" "$name" || return 0 out_txt="$TEST_DIR/decode/$name.expected.txt" # Mirror asm-runner --decode output exactly: vaddr:\tmnemonic\toperands. # objdump's listing format differs (it interleaves addresses + raw hex); # rebuild a minimal line per insn via awk so the goldens match the # runner's exact-match expectation. - local raw="$tmp/$name.bin" + local raw="$tmp/$name.bin" bytes_s="$tmp/$name.bytes.s" bytes_o="$tmp/$name.bytes.o" xxd -r -p "$hexfile" "$raw" - "$OBJDUMP" -b binary -m aarch64 -D "$raw" \ - | awk '/^[ ]+[0-9a-f]+:/ { + { + printf '.text\n' + od -An -tx1 -v "$raw" \ + | awk '{ for (i=1; i<=NF; i++) printf ".byte 0x%s\n", $i }' + } >"$bytes_s" + clang $CLANG_TARGET -c "$bytes_s" -o "$bytes_o" + "$OBJDUMP" -d --no-show-raw-insn "$bytes_o" \ + | awk '/^[ ]*[0-9a-f]+:/ { sub(/:/, "", $1); addr = $1; - # fields: addr raw-hex mnemonic operands... - mnem = $3; + # fields: addr mnemonic operands... + mnem = $2; ops = ""; - for (i=4; i<=NF; i++) ops = (ops=="" ? $i : ops " " $i); - printf "%s:\t%s\t%s\n", addr, mnem, ops; + for (i=3; i<=NF; i++) ops = (ops=="" ? 
$i : ops " " $i); + if (mnem == "lock") { + lock_addr = addr; + pending_lock = 1; + next; + } + if (pending_lock) { + addr = lock_addr; + mnem = "lock " mnem; + pending_lock = 0; + } + if (ops == "") + printf "%s:\t%s\n", addr, mnem; + else + printf "%s:\t%s\t%s\n", addr, mnem, ops; }' >"$out_txt" printf ' regen decode/%s\n' "$name" } @@ -75,10 +129,15 @@ regen_listing() { local bin="$1" name out_lst name="$(basename "$bin" .in.bin)" [ -n "$FILTER" ] && [[ "$name" != *"$FILTER"* ]] && return 0 + case_applies "$TEST_DIR/listing" "$name" || return 0 out_lst="$TEST_DIR/listing/$name.expected.lst" - "$OBJDUMP" -d "$bin" \ - | awk '/^Disassembly of section/ || /^[0-9a-f]+ </ || /^[ ]+[0-9a-f]+:/ || /^$/' \ - >"$out_lst" + if [ -x "$ASM_RUNNER" ]; then + CFREE_TEST_ARCH="$TEST_ARCH" "$ASM_RUNNER" --listing "$bin" "$out_lst" + else + "$OBJDUMP" -d "$bin" \ + | awk '/^Disassembly of section/ || /^[0-9a-f]+ </ || /^[ ]+[0-9a-f]+:/ || /^$/' \ + >"$out_lst" + fi printf ' regen listing/%s\n' "$name" } diff --git a/test/debug/roundtrip_unit.c b/test/debug/roundtrip_unit.c @@ -2,8 +2,10 @@ * assert the resulting section bytes match a known-good encoding for one * tiny case. * - * The case: one CU with one subprogram named "f" at .text+0, size 4 + * The primary case: one CU with one subprogram named "f" at .text+0, size 4 * (one aarch64 instruction), one line row mapping (.text+0, line 10). + * A second x64 case checks that .debug_line uses byte-granular PC advances + * instead of the aarch64 minimum instruction length. * * This is a producer-side encoder check: we deliberately don't go through * cfree_dwarf_open so an encoding bug doesn't get masked by a matching @@ -14,6 +16,7 @@ * test/cg path W. 
*/ #include <cfree/core.h> +#include <cfree/arch.h> #include <stdarg.h> #include <stdio.h> #include <stdlib.h> @@ -285,8 +288,6 @@ static int run_one(CfreeArchKind arch, uint32_t nop_word, const char* tag) { /* Per-arch register-name spot checks: confirm rv64 DWARF numbers match * the psABI (x1=ra=1, x2=sp=2, x8=s0/fp=8, x10=a0=10, f0=ft0=32, * f8=fs0=40) and aa64 still resolves x0..x30/sp by their DWARF indices. */ -#include <cfree/arch.h> - static void check_reg(CfreeArchKind arch, const char* tag, uint32_t expect_idx, const char* expect_name) { const char* nm = cfree_arch_register_name(arch, expect_idx); @@ -327,10 +328,79 @@ static void run_arch_register_checks(void) { } } +static int run_x64_debug_line_check(void) { + CfreeTarget xt; + Compiler* xc; + ObjBuilder* xob; + Debug* xd; + ObjSecId xtext_sec; + ObjSymId xfsym; + Pool* xpool; + + memset(&xt, 0, sizeof(xt)); + xt.arch = CFREE_ARCH_X86_64; + xt.os = CFREE_OS_LINUX; + xt.obj = CFREE_OBJ_ELF; + xt.ptr_size = 8; + xt.ptr_align = 8; + + if (cfree_compiler_new(xt, &g_ctx, &xc) != CFREE_OK || !xc) { + fprintf(stderr, "x64 compiler_new failed\n"); + return 2; + } + xob = obj_new(xc); + xpool = xc->global; + + xtext_sec = obj_section(xob, pool_intern_cstr(xpool, ".text"), SEC_TEXT, + SF_EXEC | SF_ALLOC, 1); + { + u8 code[2] = {0x90, 0xc3}; /* nop; ret */ + obj_write(xob, xtext_sec, code, sizeof(code)); + } + xfsym = obj_symbol(xob, pool_intern_cstr(xpool, "xf"), SB_GLOBAL, SK_FUNC, + xtext_sec, 0, 2); + + xd = debug_new(xc, xob); + EXPECT(xd != NULL, "x64 debug_new returned NULL"); + if (xd) { + u32 fid = 0; + (void)source_add_memory(xc->sources, "x64.c", &fid); + SrcLoc decl = {fid, 1, 0}; + SrcLoc l10 = {fid, 10, 0}; + SrcLoc l11 = {fid, 11, 0}; + DebugTypeId int_tid = debug_type_base( + xd, pool_intern_cstr(xpool, "int"), DEBUG_BE_SIGNED, 4); + DebugTypeId fn_tid = debug_type_func(xd, int_tid, NULL, 0, 0); + (void)debug_file(xd, fid); + + debug_func_begin(xd, xfsym, fn_tid, decl); + debug_line(xd, xtext_sec, 
0, l10, 1); + debug_line(xd, xtext_sec, 1, l11, 1); + debug_func_pc_range(xd, xtext_sec, 0, 2); + debug_func_end(xd); + debug_emit(xd); + + { + const Section* xline = sec_by_name(xob, xpool, ".debug_line"); + EXPECT(xline != NULL, "x64 .debug_line missing"); + if (xline) { + EXPECT(byte_at(xline, 12) == 1, + "x64 .debug_line min_inst_length != 1"); + } + } + + debug_free(xd); + } + obj_free(xob); + cfree_compiler_free(xc); + return 0; +} + int main(void) { int rc = 0; rc |= run_one(CFREE_ARCH_ARM_64, ARCH_NOP_AA64, "aa64"); rc |= run_one(CFREE_ARCH_RV64, ARCH_NOP_RV64, "rv64"); + rc |= run_x64_debug_line_check(); run_arch_register_checks(); if (g_fail || rc) { diff --git a/test/driver/run.sh b/test/driver/run.sh @@ -289,6 +289,32 @@ else fail=$((fail + 1)) fi +cat > "$work/rt-x64-start.c" <<'SRC' +extern int test_main(void); +void _start(void) { + volatile int rc = test_main(); + (void)rc; + for (;;) {} +} +SRC +if "$CFREE" cc --support-dir "$work/rt-support" -target x86_64-linux \ + -e _start "$repo_root/test/rt/cases/freestanding_lib.c" \ + "$work/rt-x64-start.c" \ + -o "$work/rt-x64" > "$work/rt-x64.out" 2> "$work/rt-x64.err" && + [ -f "$work/rt-support/build/rt/x86_64-linux/libcfree_rt.a" ] && + "$CFREE" ar t "$work/rt-support/build/rt/x86_64-linux/libcfree_rt.a" \ + > "$work/rt-x64.ar" 2> "$work/rt-x64-ar.err" && + grep -q '^printf\.c$' "$work/rt-x64.ar"; then + printf 'PASS %s\n' "cc-auto-builds-and-links-libcfree-rt-x64" + pass=$((pass + 1)) +else + printf 'FAIL %s (cfree cc failed)\n' \ + "cc-auto-builds-and-links-libcfree-rt-x64" + sed 's/^/ | /' "$work/rt-x64.err" + sed 's/^/ | /' "$work/rt-x64-ar.err" 2>/dev/null || true + fail=$((fail + 1)) +fi + if "$CFREE" ld "$work/main.o" --output="$work/ld-long-output" \ > "$work/ld-long-output.out" 2> "$work/ld-long-output.err" && [ -f "$work/ld-long-output" ]; then diff --git a/test/elf/unit/x64_disasm_annotations.c b/test/elf/unit/x64_disasm_annotations.c @@ -0,0 +1,221 @@ +/* x86_64 ELF relocation/disassembly 
annotation coverage. + * + * x86_64 relocation offsets usually point at an instruction's disp/imm field + * rather than at byte 0 of the instruction. This test builds a tiny ELF object + * whose relocations sit inside call, RIP-relative load, and jump encodings, + * then checks that object disassembly annotates the owning instruction. */ + +#include <cfree/core.h> +#include <cfree/disasm.h> +#include <cfree/object.h> +#include <setjmp.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "core/core.h" +#include "core/pool.h" +#include "obj/obj.h" + +static void* heap_alloc(CfreeHeap* h, size_t n, size_t a) { + (void)h; + (void)a; + return n ? malloc(n) : NULL; +} +static void* heap_realloc(CfreeHeap* h, void* p, size_t o, size_t n, size_t a) { + (void)h; + (void)o; + (void)a; + return realloc(p, n); +} +static void heap_free(CfreeHeap* h, void* p, size_t n) { + (void)h; + (void)n; + free(p); +} +static CfreeHeap g_heap = {heap_alloc, heap_realloc, heap_free, NULL}; + +static void diag_emit(CfreeDiagSink* s, CfreeDiagKind k, CfreeSrcLoc loc, + const char* fmt, va_list ap) { + static const char* names[] = {"note", "warning", "error", "fatal"}; + (void)s; + (void)loc; + fprintf(stderr, "%s: ", names[k]); + vfprintf(stderr, fmt, ap); + fputc('\n', stderr); +} +static CfreeDiagSink g_diag = {diag_emit, NULL, 0, 0}; + +static int g_failures; +#define CHECK(cond, ...) 
\ + do { \ + if (!(cond)) { \ + fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fputc('\n', stderr); \ + g_failures++; \ + } \ + } while (0) + +static CfreeTarget x64_elf_target(void) { + CfreeTarget t; + memset(&t, 0, sizeof t); + t.arch = CFREE_ARCH_X86_64; + t.os = CFREE_OS_LINUX; + t.obj = CFREE_OBJ_ELF; + t.ptr_size = 8; + t.ptr_align = 8; + return t; +} + +static ObjBuilder* build_input(Compiler* c) { + ObjBuilder* ob = obj_new(c); + Pool* p = c->global; + Sym text = pool_intern_cstr(p, ".text"); + Sym start = pool_intern_cstr(p, "_start"); + Sym load_data = pool_intern_cstr(p, "load_data"); + Sym foo = pool_intern_cstr(p, "foo"); + Sym bar = pool_intern_cstr(p, "bar"); + Sym global_data = pool_intern_cstr(p, "global_data"); + ObjSecId sec_text = obj_section(ob, text, SEC_TEXT, SF_ALLOC | SF_EXEC, 16); + + static const uint8_t text_bytes[] = { + 0xe8, 0x00, 0x00, 0x00, 0x00, /* call rel32 */ + 0x48, 0x8b, 0x05, 0x00, 0x00, 0x00, 0x00, /* mov disp32(%rip), %rax */ + 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp rel32 */ + 0xc3, /* ret */ + }; + obj_write(ob, sec_text, text_bytes, sizeof text_bytes); + + obj_symbol(ob, start, SB_GLOBAL, SK_FUNC, sec_text, 0, sizeof text_bytes); + obj_symbol(ob, load_data, SB_LOCAL, SK_NOTYPE, sec_text, 5, 0); + ObjSymId sym_foo = + obj_symbol(ob, foo, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0); + ObjSymId sym_bar = + obj_symbol(ob, bar, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0); + ObjSymId sym_data = + obj_symbol(ob, global_data, SB_GLOBAL, SK_UNDEF, OBJ_SEC_NONE, 0, 0); + + obj_reloc(ob, sec_text, 1, R_X64_PLT32, sym_foo, -4); + obj_reloc(ob, sec_text, 8, R_PC32, sym_data, -4); + obj_reloc(ob, sec_text, 13, R_X64_PLT32, sym_bar, -4); + + obj_finalize(ob); + return ob; +} + +static void check_reloc_names(const CfreeBytes* bytes, const CfreeContext* ctx) { + CfreeObjFile* f = NULL; + CfreeObjRelocIter* it = NULL; + CfreeObjReloc r; + int saw_plt32 = 0; + int saw_pc32 = 0; + + 
CHECK(cfree_obj_open(ctx, bytes, &f) == CFREE_OK && f, "cfree_obj_open"); + if (!f) return; + CHECK(cfree_obj_reliter_new(f, &it) == CFREE_OK && it, "reliter_new"); + if (!it) { + cfree_obj_free(f); + return; + } + while (cfree_obj_reliter_next(it, &r) == CFREE_ITER_ITEM) { + if (r.offset == 1 || r.offset == 13) { + CHECK(r.kind_name && strcmp(r.kind_name, "R_X86_64_PLT32") == 0, + "PLT32 kind name at offset %llu is %s", + (unsigned long long)r.offset, r.kind_name ? r.kind_name : "(null)"); + saw_plt32++; + } + if (r.offset == 8) { + CHECK(r.kind_name && strcmp(r.kind_name, "R_X86_64_PC32") == 0, + "PC32 kind name is %s", r.kind_name ? r.kind_name : "(null)"); + saw_pc32 = 1; + } + } + CHECK(saw_plt32 == 2, "saw %d PLT32 relocs", saw_plt32); + CHECK(saw_pc32, "missing PC32 reloc"); + cfree_obj_reliter_free(it); + cfree_obj_free(f); +} + +static void check_disasm_annotations(const CfreeBytes* bytes, + const CfreeContext* ctx) { + CfreeWriter* w = NULL; + const uint8_t* out; + size_t len = 0; + char* text; + + CHECK(cfree_writer_mem(&g_heap, &w) == CFREE_OK && w, "writer_mem"); + if (!w) return; + CHECK(cfree_disasm_obj_bytes(ctx, bytes, w) == CFREE_OK, "disasm_obj_bytes"); + out = cfree_writer_mem_bytes(w, &len); + text = (char*)malloc(len + 1); + CHECK(text != NULL, "malloc disasm copy"); + if (text) { + memcpy(text, out, len); + text[len] = '\0'; + CHECK(strstr(text, "0000000000000000 <_start>:") != NULL, + "missing _start label\n%s", text); + CHECK(strstr(text, "0000000000000005 <load_data>:") != NULL, + "missing load_data label\n%s", text); + CHECK(strstr(text, "call") && strstr(text, "foo-4"), + "missing call reloc annotation\n%s", text); + CHECK(strstr(text, "mov") && strstr(text, "global_data-4"), + "missing RIP-relative reloc annotation\n%s", text); + CHECK(strstr(text, "jmp") && strstr(text, "bar-4"), + "missing jmp reloc annotation\n%s", text); + free(text); + } + cfree_writer_close(w); +} + +int main(void) { + CfreeTarget target = x64_elf_target(); + 
CfreeContext ctx; + CfreeCompiler* cc = NULL; + Compiler* c; + ObjBuilder* ob; + CfreeWriter* w = NULL; + const uint8_t* obj_data; + size_t obj_len = 0; + CfreeBytes bytes; + + memset(&ctx, 0, sizeof ctx); + ctx.heap = &g_heap; + ctx.diag = &g_diag; + ctx.now = -1; + + if (cfree_compiler_new(target, &ctx, &cc) != CFREE_OK || !cc) { + fprintf(stderr, "FAIL: cfree_compiler_new\n"); + return 1; + } + c = (Compiler*)cc; + if (setjmp(c->panic)) { + compiler_run_cleanups(c); + cfree_compiler_free(cc); + fprintf(stderr, "FAIL: compiler_panic\n"); + return 1; + } + + ob = build_input(c); + CHECK(cfree_writer_mem(&g_heap, &w) == CFREE_OK && w, "writer_mem object"); + emit_elf(c, ob, w); + obj_data = cfree_writer_mem_bytes(w, &obj_len); + bytes.name = "x64_disasm_annotations.o"; + bytes.data = obj_data; + bytes.len = obj_len; + + check_reloc_names(&bytes, &ctx); + check_disasm_annotations(&bytes, &ctx); + + cfree_writer_close(w); + obj_free(ob); + cfree_compiler_free(cc); + + if (g_failures) { + fprintf(stderr, "%d failure(s)\n", g_failures); + return 1; + } + fputs("x64_disasm_annotations: OK\n", stderr); + return 0; +} diff --git a/test/libc/cases/01_syscall_write.c b/test/libc/cases/01_syscall_write.c @@ -8,7 +8,9 @@ static const char msg[] = "hello-syscall\n"; int main(void) { - /* sys_write(1, msg, sizeof(msg) - 1) via raw syscall. */ + /* sys_write(1, msg, sizeof(msg) - 1) via the raw kernel-entry insn. + * Per-arch because the syscall ABI (register set + entry insn + the + * SYS_write number itself) is wholly arch-specific. 
*/ #if defined(__aarch64__) register long x8 __asm__("x8") = 64; /* SYS_write */ register long x0 __asm__("x0") = 1; /* fd */ @@ -21,8 +23,19 @@ int main(void) { register long a1 __asm__("a1") = (long)msg; register long a2 __asm__("a2") = sizeof(msg) - 1; __asm__ volatile("ecall" : "+r"(a0) : "r"(a7), "r"(a1), "r"(a2) : "memory"); +#elif defined(__x86_64__) + /* SysV x86_64 syscall ABI: nr in %rax, args in %rdi/%rsi/%rdx, + * %rcx and %r11 clobbered by the `syscall` instruction itself. */ + register long rax __asm__("rax") = 1; /* SYS_write */ + register long rdi __asm__("rdi") = 1; /* fd */ + register long rsi __asm__("rsi") = (long)msg; + register long rdx __asm__("rdx") = sizeof(msg) - 1; + __asm__ volatile("syscall" + : "+r"(rax) + : "r"(rdi), "r"(rsi), "r"(rdx) + : "rcx", "r11", "memory"); #else -#error "01_syscall_write: unsupported target" +#error "01_syscall_write: no syscall sequence for this arch" #endif return 0; } diff --git a/test/libc/glibc/run.sh b/test/libc/glibc/run.sh @@ -7,7 +7,7 @@ # # dynamic — PIE object + libc.so.6, with explicit dynamic linker # cfree ld -pie \ -# -dynamic-linker /lib/<loader> \ +# -dynamic-linker <loader> \ # -o case.exe \ # $SYSROOT/lib/Scrt1.o $SYSROOT/lib/crti.o \ # case.o \ @@ -16,84 +16,46 @@ # # Unlike musl, where ld-musl-<arch>.so.1 is the same file as libc, # glibc's loader is a separate ELF — cfree ld's default interp is musl, -# so we override via -dynamic-linker. libc.so.6 carries -# SONAME=libc.so.6 so DT_NEEDED is correct without a linker-script -# intermediary (the on-disk libc.so is a GROUP script that cfree ld -# doesn't parse — we hand the SO directly). libc_nonshared.a -# contributes the handful of non-shared callbacks every glibc dyn-exe -# pulls in — atexit, __stack_chk_fail_local, __libc_csu_init/fini on -# older glibc, etc. — and must follow libc.so.6 in the demand chain. 
-# -# Usage: -# run.sh # default aarch64 -# run.sh -a aarch64 # same as default -# run.sh -a rv64 # riscv64 +# so we override via -dynamic-linker. The per-arch loader path is: +# aa64 -> /lib/ld-linux-aarch64.so.1 +# x64 -> /lib64/ld-linux-x86-64.so.2 +# libc.so.6 carries SONAME=libc.so.6 so DT_NEEDED is correct without a +# linker-script intermediary (the on-disk libc.so is a GROUP script +# that cfree ld doesn't parse — we hand the SO directly). +# libc_nonshared.a contributes the handful of non-shared callbacks +# every glibc dyn-exe pulls in — atexit, __stack_chk_fail_local, +# __libc_csu_init/fini on older glibc, etc. — and must follow +# libc.so.6 in the demand chain. # # Each case file may carry an `expected` companion (default 0) and an # optional `expected_stdout` file checked with substring match. # # Designed to fail fast and clearly: the *first* failure surface (compile # / link / run / output) is the gap to fix next. Run with -# CFREE_GLIBC_KEEP=1 to leave intermediates in build/glibc/<case>/. +# CFREE_GLIBC_KEEP=1 to leave intermediates in build/glibc/<arch>/<case>/. +# +# Arch selection: +# CFREE_LIBC_ARCHES (default "aa64") — space-separated list. Valid +# values: aa64, x64, rv64. Each arch maps to: +# aa64 -> build/glibc-sysroot/ + build/rt/aarch64-linux/libcfree_rt.a +# + --target=aarch64-linux-gnu +# x64 -> build/glibc-sysroot-x64/ + build/rt/x86_64-linux/libcfree_rt.a +# + --target=x86_64-linux-gnu +# rv64 -> build/glibc-sysroot-rv64/ + build/rt/riscv64-linux/libcfree_rt.a +# + --target=riscv64-linux-gnu +# Missing sysroot / rt for an enabled arch is reported as SKIP +# (non-fatal); only test failures cause a nonzero exit. set -u ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" -ARCH=aarch64 - -while [ $# -gt 0 ]; do - case "$1" in - -a) ARCH="$2"; shift 2 ;; - --arch=*) ARCH="${1#--arch=}"; shift ;; - *) echo "unknown arg: $1" >&2; exit 2 ;; - esac -done - -# Per-arch tokens. 
Keep the aarch64 lane on the bare paths it has always -# used so existing wiring/test-glibc is unchanged. -case "$ARCH" in - aarch64) - SYSROOT="$ROOT/build/glibc-sysroot" - BUILD_DIR="$ROOT/build/glibc" - CFREE_RT="$ROOT/build/rt/aarch64-linux/libcfree_rt.a" - RT_TARGET="rt-aarch64-linux" - CLANG_TRIPLE="aarch64-linux-gnu" - QEMU_NAME="qemu-aarch64" - PODMAN_IMAGE="docker.io/arm64v8/debian:bookworm-slim" - DYNAMIC_LINKER="/lib/ld-linux-aarch64.so.1" - MULTIARCH_DIR="aarch64-linux-gnu" - ;; - rv64) - SYSROOT="$ROOT/build/glibc-sysroot-rv64" - BUILD_DIR="$ROOT/build/glibc-rv64" - CFREE_RT="$ROOT/build/rt/riscv64-linux/libcfree_rt.a" - RT_TARGET="rt-riscv64-linux" - CLANG_TRIPLE="riscv64-linux-gnu" - QEMU_NAME="qemu-riscv64" - PODMAN_IMAGE="docker.io/riscv64/debian:trixie-slim" - DYNAMIC_LINKER="/lib/ld-linux-riscv64-lp64d.so.1" - MULTIARCH_DIR="riscv64-linux-gnu" - ;; - *) - echo "run.sh: unknown arch '$ARCH' (want aarch64|rv64)" >&2 - exit 2 - ;; -esac - CASES_DIR="$ROOT/test/libc/cases" +BUILD_DIR="$ROOT/build/glibc" CFREE="$ROOT/build/cfree" -if [ ! -d "$SYSROOT" ]; then - echo "glibc sysroot missing at $SYSROOT — run test/libc/glibc/extract.sh -a $ARCH first" >&2 - exit 2 -fi if [ ! -x "$CFREE" ]; then echo "cfree driver missing at $CFREE — run 'make' first" >&2 exit 2 fi -if [ ! -f "$CFREE_RT" ]; then - echo "cfree rt missing at $CFREE_RT — run 'make $RT_TARGET'" >&2 - exit 2 -fi mkdir -p "$BUILD_DIR" @@ -102,79 +64,154 @@ color_grn() { printf '\033[32m%s\033[0m' "$1"; } color_yel() { printf '\033[33m%s\033[0m' "$1"; } PASS=0; FAIL=0; FAIL_NAMES=() +SKIP_ARCHES=() -# Pick a runner. Native hosts of the target arch can run ELFs directly -# under podman without binfmt; otherwise we want qemu-<arch>-static. 
-arch_raw="$(uname -m 2>/dev/null || true)" -is_native=0 -case "$ARCH" in - aarch64) - { [ "$arch_raw" = "aarch64" ] || [ "$arch_raw" = "arm64" ]; } && is_native=1 - ;; - rv64) - [ "$arch_raw" = "riscv64" ] && is_native=1 - ;; -esac - -QEMU_BIN="$(command -v "${QEMU_NAME}-static" 2>/dev/null || command -v "$QEMU_NAME" 2>/dev/null || true)" -have_qemu=0; [ -n "$QEMU_BIN" ] && have_qemu=1 -have_podman=0; command -v podman >/dev/null 2>&1 && have_podman=1 +# ---- arch lookup tables --------------------------------------------------- -# clang must understand --target=<triple>. Every system path is -# overridden via --sysroot / -isystem so the host's headers / libraries -# are not consulted. -if ! clang --target=$CLANG_TRIPLE -c -x c - -o /dev/null < /dev/null 2>/dev/null; then - echo "clang does not accept --target=$CLANG_TRIPLE" >&2 - exit 2 -fi +arch_sysroot() { + case "$1" in + aa64) echo "$ROOT/build/glibc-sysroot" ;; + x64) echo "$ROOT/build/glibc-sysroot-x64" ;; + rv64) echo "$ROOT/build/glibc-sysroot-rv64" ;; + *) echo "" ;; + esac +} + +arch_rt() { + case "$1" in + aa64) echo "$ROOT/build/rt/aarch64-linux/libcfree_rt.a" ;; + x64) echo "$ROOT/build/rt/x86_64-linux/libcfree_rt.a" ;; + rv64) echo "$ROOT/build/rt/riscv64-linux/libcfree_rt.a" ;; + *) echo "" ;; + esac +} -# Dynamic-variant exes need the loader + libc.so.6 to load. qemu-user -# resolves them relative to QEMU_LD_PREFIX or -L; the podman fallback -# uses an arch-specific debian image which ships them at the expected -# paths. -QEMU_LD_PREFIX_OVERRIDE="$SYSROOT" +arch_target() { + case "$1" in + aa64) echo "aarch64-linux-gnu" ;; + x64) echo "x86_64-linux-gnu" ;; + rv64) echo "riscv64-linux-gnu" ;; + *) echo "" ;; + esac +} +arch_triple_include() { + # The per-arch multi-arch include subdir under sysroot/include/. 
+ case "$1" in + aa64) echo "aarch64-linux-gnu" ;; + x64) echo "x86_64-linux-gnu" ;; + rv64) echo "riscv64-linux-gnu" ;; + *) echo "" ;; + esac +} + +# Spelling extract.sh accepts for `-a`: aa64 -> aarch64; x64 -> x64. +arch_extract_name() { + case "$1" in + aa64) echo "aarch64" ;; + *) echo "$1" ;; + esac +} + +arch_loader() { + # Dynamic-linker path baked into PT_INTERP. Both paths are the + # canonical Linux glibc loader locations and match the layout the + # extracted sysroots ship. + case "$1" in + aa64) echo "/lib/ld-linux-aarch64.so.1" ;; + x64) echo "/lib64/ld-linux-x86-64.so.2" ;; + rv64) echo "/lib/ld-linux-riscv64-lp64d.so.1" ;; + *) echo "" ;; + esac +} + +# ---- per-arch runners ------------------------------------------------------ +# +# Native linux/<arch> hosts can exec ELFs directly under podman without +# binfmt; otherwise we fall back to qemu-<arch>-static. + +arch_raw="$(uname -m 2>/dev/null || true)" +is_aarch64=0 +{ [ "$arch_raw" = "aarch64" ] || [ "$arch_raw" = "arm64" ]; } && is_aarch64=1 +is_x86_64=0 +{ [ "$arch_raw" = "x86_64" ] || [ "$arch_raw" = "amd64" ]; } && is_x86_64=1 + +QEMU_AA64="$(command -v qemu-aarch64-static 2>/dev/null || command -v qemu-aarch64 2>/dev/null || true)" +QEMU_X64="$(command -v qemu-x86_64-static 2>/dev/null || command -v qemu-x86_64 2>/dev/null || true)" +QEMU_RV64="$(command -v qemu-riscv64-static 2>/dev/null || command -v qemu-riscv64 2>/dev/null || true)" +have_podman=0; command -v podman >/dev/null 2>&1 && have_podman=1 + +# run_target <arch> <sysroot> <exe> <out> <err> -> sets RUN_RC +# +# Dynamic-variant exes need /lib/ld-linux-<arch>.so.{1,2} + libc.so.6 +# to load. qemu-user resolves them relative to QEMU_LD_PREFIX; the +# podman fallback uses a debian:bookworm image which ships them at the +# expected paths. 
run_target() { - local exe="$1" out="$2" err="$3" - if [ $have_qemu -eq 1 ]; then + local arch="$1" sysroot="$2" exe="$3" out="$4" err="$5" + local qemu="" image="" + case "$arch" in + aa64) + qemu="$QEMU_AA64" + # Pin the image name to the arm64-specific repo + # (docker.io/arm64v8/...) instead of the multi-arch + # debian:bookworm-slim. Two reasons: + # 1. Avoids the cached-amd64-manifest trap that + # debian:bookworm-slim hits on arm64 hosts where an + # amd64 pull happened earlier — podman silently uses + # the wrong arch and the dyn-exe fails to load. + # 2. Avoids passing --platform, which forces podman to + # hit the registry on every run to verify the + # manifest matches. Pinning the repo + relying on the + # local cache keeps subsequent runs offline + fast. + # arm64v8/debian:bookworm-slim ships the matching glibc + # loader, so the dynamic variant resolves PT_INTERP + # without extra mounts. + image="docker.io/arm64v8/debian:bookworm-slim" + ;; + x64) + qemu="$QEMU_X64" + # amd64-pinned debian peer of the arm64v8 image above. + # Same rationale: avoid --platform + multi-arch tag + # cache traps. Ships /lib64/ld-linux-x86-64.so.2 + libc. + image="docker.io/amd64/debian:bookworm-slim" + ;; + rv64) + qemu="$QEMU_RV64" + # riscv64-pinned Debian image. The trixie image ships the + # riscv64 glibc loader at /lib/ld-linux-riscv64-lp64d.so.1. + image="docker.io/riscv64/debian:trixie-slim" + ;; + esac + if [ -n "$qemu" ]; then # Point qemu-user at our extracted sysroot so the loader # search resolves to the SYSROOT copy rather than the # (possibly-absent) host one. - QEMU_LD_PREFIX="$QEMU_LD_PREFIX_OVERRIDE" \ - "$QEMU_BIN" "$exe" >"$out" 2>"$err" + QEMU_LD_PREFIX="$sysroot" \ + "$qemu" "$exe" >"$out" 2>"$err" RUN_RC=$?; return fi if [ $have_podman -eq 1 ]; then local dir base dir="$(cd "$(dirname "$exe")" && pwd)"; base="$(basename "$exe")" - # Pin the image name to an arch-specific repo - # (docker.io/arm64v8/..., docker.io/riscv64/...) 
instead of - # the multi-arch debian:bookworm-slim / trixie-slim. Two - # reasons: - # 1. Avoids the cached-wrong-arch-manifest trap that - # bare debian images hit when an unrelated pull - # cached a different arch — podman silently uses - # the wrong arch and the dyn-exe fails to load. - # 2. Avoids passing --platform, which forces podman to - # hit the registry on every run to verify the - # manifest matches. Pinning the repo + relying on the - # local cache keeps subsequent runs offline + fast. - # The arch-pinned image ships the matching glibc loader, so - # the dynamic variant resolves PT_INTERP without extra mounts. podman run --rm --pull=never --net=none \ -v "$dir":/work:Z -w /work \ - "$PODMAN_IMAGE" "./$base" \ + "$image" "./$base" \ >"$out" 2>"$err" RUN_RC=$?; return fi RUN_RC=127 } +# ---- case driver ----------------------------------------------------------- + +# run_case <arch> <sysroot> <rt> <target> <loader> <triple_inc> <src> run_case() { - local src="$1" + local arch="$1" sysroot="$2" rt="$3" target="$4" loader="$5" + local triple_inc="$6" src="$7" local name="$(basename "$src" .c)" - local work="$BUILD_DIR/$name" - local label="$name" + local work="$BUILD_DIR/$arch/$name" + local label="$arch/$name" mkdir -p "$work" local expected=0 @@ -190,8 +227,8 @@ run_case() { # Three -isystem layers, in order of precedence: # sysroot/include/ — glibc + linux-libc-dev # headers (top-level uapi). - # sysroot/include/<multiarch> — glibc multi-arch (bits/*, - # gnu/stubs-*.h, ...); + # sysroot/include/<triple> — glibc multi-arch (bits/*, + # gnu/stubs-lp64.h, ...); # <features.h> reaches in. # rt/include/ — cfree's freestanding overlay # (stddef.h, stdarg.h, stdint.h). @@ -201,10 +238,10 @@ run_case() { # so rt/include must be reachable. # -nostdinc strips clang's default include path so cross targets # don't accidentally pick up the host's compiler headers. 
- local cc_flags=(--target=$CLANG_TRIPLE --sysroot="$SYSROOT" + local cc_flags=(--target="$target" --sysroot="$sysroot" -nostdinc - -isystem "$SYSROOT/include" - -isystem "$SYSROOT/include/$MULTIARCH_DIR" + -isystem "$sysroot/include" + -isystem "$sysroot/include/$triple_inc" -isystem "$ROOT/rt/include" -fPIE -fpic -O0) @@ -223,7 +260,7 @@ run_case() { # SO directly), with -dynamic-linker overriding the musl default. # Expects cfree ld to: # - accept ET_DYN ELF objects as input, - # - emit PT_INTERP $DYNAMIC_LINKER, + # - emit PT_INTERP "$loader", # - emit PT_DYNAMIC with DT_NEEDED libc.so.6, # - emit a .dynsym/.dynstr/.gnu.hash + .rela.plt/.got.plt # so the loader can bind imported symbols at runtime. @@ -232,13 +269,13 @@ run_case() { # crti/crtn are unchanged. local exe="$work/${name}.exe" local link_cmd=("$CFREE" "ld" -pie - -dynamic-linker "$DYNAMIC_LINKER" + -dynamic-linker "$loader" -o "$exe" - "$SYSROOT/lib/Scrt1.o" "$SYSROOT/lib/crti.o" + "$sysroot/lib/Scrt1.o" "$sysroot/lib/crti.o" "$obj" - "$SYSROOT/lib/libc.so.6" "$SYSROOT/lib/libc_nonshared.a" - "$CFREE_RT" - "$SYSROOT/lib/crtn.o") + "$sysroot/lib/libc.so.6" "$sysroot/lib/libc_nonshared.a" + "$rt" + "$sysroot/lib/crtn.o") if ! "${link_cmd[@]}" >"$work/link.out" 2>"$work/link.err"; then FAIL=$((FAIL+1)) @@ -249,7 +286,7 @@ run_case() { fi # ---- run ---- - run_target "$exe" "$work/run.out" "$work/run.err" + run_target "$arch" "$sysroot" "$exe" "$work/run.out" "$work/run.err" if [ "$RUN_RC" -ne "$expected" ]; then FAIL=$((FAIL+1)) FAIL_NAMES+=("$label (run rc=$RUN_RC, want $expected)") @@ -275,11 +312,55 @@ run_case() { printf ' %s %s\n' "$(color_grn PASS)" "$label" } +# run_arch_cases <arch> <sysroot> <rt> <target> +run_arch_cases() { + local arch="$1" sysroot="$2" rt="$3" target="$4" + local loader triple_inc + loader="$(arch_loader "$arch")" + triple_inc="$(arch_triple_include "$arch")" + + # clang must understand --target=<target>. 
Every system path is + # overridden via --sysroot / -isystem so the host's headers / + # libraries are not consulted. + if ! clang --target="$target" -c -x c - -o /dev/null < /dev/null 2>/dev/null; then + printf ' %s %s (clang does not accept --target=%s)\n' \ + "$(color_yel SKIP)" "$arch" "$target" + SKIP_ARCHES+=("$arch (no clang --target=$target)") + return + fi + + printf 'Running glibc dynamic-link cases [%s]...\n' "$arch" + for src in "$CASES_DIR"/*.c; do + run_case "$arch" "$sysroot" "$rt" "$target" "$loader" "$triple_inc" "$src" + done + printf '\n' +} + shopt -s nullglob -printf 'Running glibc dynamic-link cases [arch=%s]...\n' "$ARCH" -for src in "$CASES_DIR"/*.c; do - run_case "$src" +ARCHES="${CFREE_LIBC_ARCHES:-aa64}" +for arch in $ARCHES; do + sysroot="$(arch_sysroot "$arch")" + rt="$(arch_rt "$arch")" + target="$(arch_target "$arch")" + if [ -z "$sysroot" ] || [ -z "$rt" ] || [ -z "$target" ]; then + printf ' %s %s (unknown arch)\n' "$(color_yel SKIP)" "$arch" + SKIP_ARCHES+=("$arch (unknown)") + continue + fi + if [ ! -d "$sysroot" ]; then + printf ' %s %s (glibc sysroot missing at %s — run test/libc/glibc/extract.sh -a %s)\n' \ + "$(color_yel SKIP)" "$arch" "$sysroot" "$(arch_extract_name "$arch")" + SKIP_ARCHES+=("$arch (sysroot)") + continue + fi + if [ ! 
-f "$rt" ]; then + printf ' %s %s (cfree rt missing at %s)\n' \ + "$(color_yel SKIP)" "$arch" "$rt" + SKIP_ARCHES+=("$arch (rt)") + continue + fi + run_arch_cases "$arch" "$sysroot" "$rt" "$target" done if [ ${#FAIL_NAMES[@]} -gt 0 ]; then @@ -287,7 +368,10 @@ if [ ${#FAIL_NAMES[@]} -gt 0 ]; then for n in "${FAIL_NAMES[@]}"; do printf ' %s\n' "$n"; done fi -printf '\nResults [%s]: %s pass, %s fail\n' "$ARCH" "$PASS" "$FAIL" +printf '\nResults: %s pass, %s fail\n' "$PASS" "$FAIL" +if [ ${#SKIP_ARCHES[@]} -gt 0 ]; then + printf ' skipped: %s\n' "${SKIP_ARCHES[*]}" +fi if [ ${#FAIL_NAMES[@]} -gt 0 ]; then exit 1; fi exit 0 diff --git a/test/libc/musl/run.sh b/test/libc/musl/run.sh @@ -9,7 +9,7 @@ # $SYSROOT/lib/libc.a $CFREE_RT \ # $SYSROOT/lib/crtn.o # -# dynamic — PIE object + libc.so, expects PT_INTERP ld-musl-<arch>.so.1 +# dynamic — PIE object + libc.so, expects PT_INTERP /lib/ld-musl-<arch>.so.1 # cfree ld -pie -o case.exe \ # $SYSROOT/lib/Scrt1.o $SYSROOT/lib/crti.o \ # case.o \ @@ -20,74 +20,35 @@ # cfree ld currently doesn't accept one; this is one of the gaps # we expect the dynamic variant to surface.) # -# Usage: -# run.sh # default aarch64 -# run.sh -a aarch64 # same as default -# run.sh -a rv64 # riscv64 -# # Each case file may carry an `expected` companion (default 0) and an # optional `expected_stdout` file checked with substring match. # # Designed to fail fast and clearly: the *first* failure surface (compile # / link / run / output) is the gap to fix next. Run with -# CFREE_MUSL_KEEP=1 to leave intermediates in build/musl/<case>/. +# CFREE_MUSL_KEEP=1 to leave intermediates in build/musl/<arch>/<case>/. +# +# Arch selection: +# CFREE_LIBC_ARCHES (default "aa64") — space-separated list. Valid +# values: aa64, x64, rv64. 
Each arch maps to: +# aa64 -> build/musl-sysroot/ + build/rt/aarch64-linux/libcfree_rt.a +# + --target=aarch64-linux-musl +# x64 -> build/musl-sysroot-x64/ + build/rt/x86_64-linux/libcfree_rt.a +# + --target=x86_64-linux-musl +# rv64 -> build/musl-sysroot-rv64/ + build/rt/riscv64-linux/libcfree_rt.a +# + --target=riscv64-linux-musl +# Missing sysroot / rt for an enabled arch is reported as SKIP +# (non-fatal); only test failures cause a nonzero exit. set -u ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" -ARCH=aarch64 - -while [ $# -gt 0 ]; do - case "$1" in - -a) ARCH="$2"; shift 2 ;; - --arch=*) ARCH="${1#--arch=}"; shift ;; - *) echo "unknown arg: $1" >&2; exit 2 ;; - esac -done - -# Per-arch tokens. Keep the aarch64 lane on the bare paths it has always -# used so existing wiring/test-musl is unchanged. -case "$ARCH" in - aarch64) - SYSROOT="$ROOT/build/musl-sysroot" - BUILD_DIR="$ROOT/build/musl" - CFREE_RT="$ROOT/build/rt/aarch64-linux/libcfree_rt.a" - RT_TARGET="rt-aarch64-linux" - CLANG_TRIPLE="aarch64-linux-musl" - QEMU_NAME="qemu-aarch64" - PODMAN_IMAGE="docker.io/arm64v8/alpine:latest" - LOADER_BASENAME="ld-musl-aarch64.so.1" - ;; - rv64) - SYSROOT="$ROOT/build/musl-sysroot-rv64" - BUILD_DIR="$ROOT/build/musl-rv64" - CFREE_RT="$ROOT/build/rt/riscv64-linux/libcfree_rt.a" - RT_TARGET="rt-riscv64-linux" - CLANG_TRIPLE="riscv64-linux-musl" - QEMU_NAME="qemu-riscv64" - PODMAN_IMAGE="docker.io/riscv64/alpine:edge" - LOADER_BASENAME="ld-musl-riscv64.so.1" - ;; - *) - echo "run.sh: unknown arch '$ARCH' (want aarch64|rv64)" >&2 - exit 2 - ;; -esac - CASES_DIR="$ROOT/test/libc/cases" +BUILD_DIR="$ROOT/build/musl" CFREE="$ROOT/build/cfree" -if [ ! -d "$SYSROOT" ]; then - echo "musl sysroot missing at $SYSROOT — run test/libc/musl/extract.sh -a $ARCH first" >&2 - exit 2 -fi if [ ! -x "$CFREE" ]; then echo "cfree driver missing at $CFREE — run 'make' first" >&2 exit 2 fi -if [ ! 
-f "$CFREE_RT" ]; then - echo "cfree rt missing at $CFREE_RT — run 'make $RT_TARGET'" >&2 - exit 2 -fi mkdir -p "$BUILD_DIR" @@ -96,67 +57,124 @@ color_grn() { printf '\033[32m%s\033[0m' "$1"; } color_yel() { printf '\033[33m%s\033[0m' "$1"; } # Per-variant counters so the dynamic-link surface is visible in its own -# right rather than being averaged into one total. +# right rather than being averaged into one total. Counters are global +# across arches; the arch tag is baked into each case label so failures +# remain disambiguated in the summary. PASS_static=0; FAIL_static=0; FAIL_NAMES_static=() PASS_dynamic=0; FAIL_dynamic=0; FAIL_NAMES_dynamic=() +SKIP_ARCHES=() + +# ---- arch lookup tables --------------------------------------------------- + +arch_sysroot() { + case "$1" in + aa64) echo "$ROOT/build/musl-sysroot" ;; + x64) echo "$ROOT/build/musl-sysroot-x64" ;; + rv64) echo "$ROOT/build/musl-sysroot-rv64" ;; + *) echo "" ;; + esac +} + +arch_rt() { + case "$1" in + aa64) echo "$ROOT/build/rt/aarch64-linux/libcfree_rt.a" ;; + x64) echo "$ROOT/build/rt/x86_64-linux/libcfree_rt.a" ;; + rv64) echo "$ROOT/build/rt/riscv64-linux/libcfree_rt.a" ;; + *) echo "" ;; + esac +} + +arch_target() { + case "$1" in + aa64) echo "aarch64-linux-musl" ;; + x64) echo "x86_64-linux-musl" ;; + rv64) echo "riscv64-linux-musl" ;; + *) echo "" ;; + esac +} + +# Spelling extract.sh accepts for `-a`: aa64 -> aarch64; x64 -> x64. +arch_extract_name() { + case "$1" in + aa64) echo "aarch64" ;; + *) echo "$1" ;; + esac +} + +# ---- per-arch runners ------------------------------------------------------ +# +# Native linux/<arch> hosts can exec ELFs directly under podman without +# binfmt; otherwise we fall back to qemu-<arch>-static. -# Pick a runner. Native hosts of the target arch can run ELFs directly -# under podman without binfmt; otherwise we want qemu-<arch>-static. 
arch_raw="$(uname -m 2>/dev/null || true)" -is_native=0 -case "$ARCH" in - aarch64) - { [ "$arch_raw" = "aarch64" ] || [ "$arch_raw" = "arm64" ]; } && is_native=1 - ;; - rv64) - [ "$arch_raw" = "riscv64" ] && is_native=1 - ;; -esac - -QEMU_BIN="$(command -v "${QEMU_NAME}-static" 2>/dev/null || command -v "$QEMU_NAME" 2>/dev/null || true)" -have_qemu=0; [ -n "$QEMU_BIN" ] && have_qemu=1 -have_podman=0; command -v podman >/dev/null 2>&1 && have_podman=1 +is_aarch64=0 +{ [ "$arch_raw" = "aarch64" ] || [ "$arch_raw" = "arm64" ]; } && is_aarch64=1 +is_x86_64=0 +{ [ "$arch_raw" = "x86_64" ] || [ "$arch_raw" = "amd64" ]; } && is_x86_64=1 -# clang must understand --target=<triple>. Recent clang ships -# linux-musl as a target alias of linux-gnu for our purposes (we override -# every system path via --sysroot). -if ! clang --target=$CLANG_TRIPLE -c -x c - -o /dev/null < /dev/null 2>/dev/null; then - echo "clang does not accept --target=$CLANG_TRIPLE" >&2 - exit 2 -fi +QEMU_AA64="$(command -v qemu-aarch64-static 2>/dev/null || command -v qemu-aarch64 2>/dev/null || true)" +QEMU_X64="$(command -v qemu-x86_64-static 2>/dev/null || command -v qemu-x86_64 2>/dev/null || true)" +QEMU_RV64="$(command -v qemu-riscv64-static 2>/dev/null || command -v qemu-riscv64 2>/dev/null || true)" +have_podman=0; command -v podman >/dev/null 2>&1 && have_podman=1 +# run_target <arch> <exe> <out> <err> -> sets RUN_RC run_target() { - local exe="$1" out="$2" err="$3" - if [ $have_qemu -eq 1 ]; then - "$QEMU_BIN" "$exe" >"$out" 2>"$err"; RUN_RC=$?; return + local arch="$1" exe="$2" out="$3" err="$4" + local qemu="" image="" platform="" + case "$arch" in + aa64) + qemu="$QEMU_AA64" + # Pin the image name to the arm64-specific repo + # (docker.io/arm64v8/...) instead of the multi-arch + # alpine:latest. 
Avoids the cached-wrong-arch-manifest + # trap that bare alpine:latest hits when an unrelated + # pull cached a different arch; also avoids --platform, + # which would force a registry manifest lookup on every + # run. arm64v8/alpine ships the musl loader at + # /lib/ld-musl-aarch64.so.1 so the dynamic variant + # resolves PT_INTERP without extra mounts. + image="docker.io/arm64v8/alpine:latest" + ;; + x64) + qemu="$QEMU_X64" + # amd64v2/alpine isn't a thing on Docker Hub; the + # canonical arch-pinned alpine for amd64 is amd64/alpine. + # Same rationale as arm64v8/alpine above: pin the repo, + # skip --platform, rely on the local cache. Ships the + # musl loader at /lib/ld-musl-x86_64.so.1. + image="docker.io/amd64/alpine:latest" + ;; + rv64) + qemu="$QEMU_RV64" + # riscv64-pinned Alpine image. alpine:edge currently carries + # the riscv64 musl loader used by this sysroot. + image="docker.io/riscv64/alpine:edge" + ;; + esac + if [ -n "$qemu" ]; then + "$qemu" "$exe" >"$out" 2>"$err"; RUN_RC=$?; return fi if [ $have_podman -eq 1 ]; then local dir base dir="$(cd "$(dirname "$exe")" && pwd)"; base="$(basename "$exe")" - # Pin the image name to an arch-specific repo (e.g. - # docker.io/arm64v8/..., docker.io/riscv64/...) instead of the - # multi-arch alpine:latest. Avoids the cached-wrong-arch-manifest - # trap that bare alpine:latest hits when an unrelated pull cached - # a different arch; also avoids --platform, which would force a - # registry manifest lookup on every run. The image ships the - # musl loader at /lib/$LOADER_BASENAME so the dynamic variant - # resolves PT_INTERP without extra mounts. 
podman run --rm --pull=never --net=none \ -v "$dir":/work:Z -w /work \ - "$PODMAN_IMAGE" "./$base" \ + "$image" "./$base" \ >"$out" 2>"$err" RUN_RC=$?; return fi RUN_RC=127 } -# run_case <variant> <src> +# ---- case driver ----------------------------------------------------------- +# +# run_case <arch> <sysroot> <rt> <target> <variant> <src> # variant ∈ {static, dynamic} run_case() { - local variant="$1" src="$2" + local arch="$1" sysroot="$2" rt="$3" target="$4" variant="$5" src="$6" local name="$(basename "$src" .c)" - local work="$BUILD_DIR/$name/$variant" - local label="$name [$variant]" + local work="$BUILD_DIR/$arch/$name/$variant" + local label="$arch/$name [$variant]" mkdir -p "$work" local expected=0 @@ -171,10 +189,10 @@ run_case() { # ---- compile ---- # -nostdinc strips clang's default include path (resource dir + # /usr/include) so the sysroot's musl + linux-headers tree is the - # sole source. -isystem $SYSROOT/include picks it up. - local cc_flags=(--target=$CLANG_TRIPLE --sysroot="$SYSROOT" + # sole source. -isystem $sysroot/include picks it up. + local cc_flags=(--target="$target" --sysroot="$sysroot" -nostdinc - -isystem "$SYSROOT/include" + -isystem "$sysroot/include" -O0) case "$variant" in static) cc_flags+=(-fno-PIC -fno-pie) ;; @@ -203,26 +221,26 @@ run_case() { # iterates demand-load to a fixed point so one trailing # libcfree_rt.a is enough. 
link_cmd=("$CFREE" "ld" -static -o "$exe" - "$SYSROOT/lib/crt1.o" "$SYSROOT/lib/crti.o" + "$sysroot/lib/crt1.o" "$sysroot/lib/crti.o" "$obj" - "$SYSROOT/lib/libc.a" "$CFREE_RT" - "$SYSROOT/lib/crtn.o") + "$sysroot/lib/libc.a" "$rt" + "$sysroot/lib/crtn.o") ;; dynamic) # Dynamic-exe link: PIE start file, libc.so as a *shared* # input (not an archive), expects cfree ld to: # - accept ET_DYN ELF objects as input, - # - emit PT_INTERP "/lib/$LOADER_BASENAME", + # - emit PT_INTERP "/lib/ld-musl-<arch>.so.1", # - emit PT_DYNAMIC with DT_NEEDED libc.so, # - emit a .dynsym/.dynstr/.gnu.hash + .rela.plt/.got.plt # so the loader can bind imported symbols at runtime. # libcfree_rt.a stays — soft-float TF helpers are still # static-bound from our side. crti/crtn are unchanged. link_cmd=("$CFREE" "ld" -pie -o "$exe" - "$SYSROOT/lib/Scrt1.o" "$SYSROOT/lib/crti.o" + "$sysroot/lib/Scrt1.o" "$sysroot/lib/crti.o" "$obj" - "$SYSROOT/lib/libc.so" "$CFREE_RT" - "$SYSROOT/lib/crtn.o") + "$sysroot/lib/libc.so" "$rt" + "$sysroot/lib/crtn.o") ;; esac @@ -235,7 +253,7 @@ run_case() { fi # ---- run ---- - run_target "$exe" "$work/run.out" "$work/run.err" + run_target "$arch" "$exe" "$work/run.out" "$work/run.err" if [ "$RUN_RC" -ne "$expected" ]; then eval "FAIL_${variant}=\$((FAIL_${variant}+1))" eval "FAIL_NAMES_${variant}+=(\"\$label (run rc=\$RUN_RC, want \$expected)\")" @@ -261,16 +279,57 @@ run_case() { printf ' %s %s\n' "$(color_grn PASS)" "$label" } -shopt -s nullglob +# run_arch_cases <arch> <sysroot> <rt> <target> +run_arch_cases() { + local arch="$1" sysroot="$2" rt="$3" target="$4" -printf 'Running musl static-link cases [arch=%s]...\n' "$ARCH" -for src in "$CASES_DIR"/*.c; do - run_case static "$src" -done + # clang must understand --target=<target>. Recent clang ships + # linux-musl as a target alias of linux-gnu for our purposes (we + # override every system path via --sysroot). + if ! 
clang --target="$target" -c -x c - -o /dev/null < /dev/null 2>/dev/null; then + printf ' %s %s (clang does not accept --target=%s)\n' \ + "$(color_yel SKIP)" "$arch" "$target" + SKIP_ARCHES+=("$arch (no clang --target=$target)") + return + fi + + printf 'Running musl static-link cases [%s]...\n' "$arch" + for src in "$CASES_DIR"/*.c; do + run_case "$arch" "$sysroot" "$rt" "$target" static "$src" + done + + printf '\nRunning musl dynamic-link cases [%s]...\n' "$arch" + for src in "$CASES_DIR"/*.c; do + run_case "$arch" "$sysroot" "$rt" "$target" dynamic "$src" + done + printf '\n' +} -printf '\nRunning musl dynamic-link cases [arch=%s]...\n' "$ARCH" -for src in "$CASES_DIR"/*.c; do - run_case dynamic "$src" +shopt -s nullglob + +ARCHES="${CFREE_LIBC_ARCHES:-aa64}" +for arch in $ARCHES; do + sysroot="$(arch_sysroot "$arch")" + rt="$(arch_rt "$arch")" + target="$(arch_target "$arch")" + if [ -z "$sysroot" ] || [ -z "$rt" ] || [ -z "$target" ]; then + printf ' %s %s (unknown arch)\n' "$(color_yel SKIP)" "$arch" + SKIP_ARCHES+=("$arch (unknown)") + continue + fi + if [ ! -d "$sysroot" ]; then + printf ' %s %s (musl sysroot missing at %s — run test/libc/musl/extract.sh -a %s)\n' \ + "$(color_yel SKIP)" "$arch" "$sysroot" "$(arch_extract_name "$arch")" + SKIP_ARCHES+=("$arch (sysroot)") + continue + fi + if [ ! 
-f "$rt" ]; then + printf ' %s %s (cfree rt missing at %s)\n' \ + "$(color_yel SKIP)" "$arch" "$rt" + SKIP_ARCHES+=("$arch (rt)") + continue + fi + run_arch_cases "$arch" "$sysroot" "$rt" "$target" done if [ ${#FAIL_NAMES_static[@]} -gt 0 ]; then @@ -282,9 +341,12 @@ if [ ${#FAIL_NAMES_dynamic[@]} -gt 0 ]; then for n in "${FAIL_NAMES_dynamic[@]}"; do printf ' %s\n' "$n"; done fi -printf '\nResults [%s]:\n' "$ARCH" +printf '\nResults:\n' printf ' static : %s pass, %s fail\n' "$PASS_static" "$FAIL_static" printf ' dynamic: %s pass, %s fail\n' "$PASS_dynamic" "$FAIL_dynamic" +if [ ${#SKIP_ARCHES[@]} -gt 0 ]; then + printf ' skipped: %s\n' "${SKIP_ARCHES[*]}" +fi total_fail=$((FAIL_static + FAIL_dynamic)) if [ $total_fail -gt 0 ]; then exit 1; fi diff --git a/test/link/harness/jit_runner.c b/test/link/harness/jit_runner.c @@ -18,14 +18,18 @@ * cfree_jit_lookup(jit, "__cfree_test_sentinel__") returns NULL (tests * the lookup-miss path, covering case 29). * - * Runs only on aarch64 hosts. Compiled against libcfree.a with - * -I$(ROOT)/include. */ + * Compiled against libcfree.a with -I$(ROOT)/include. 
*/ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #include <cfree/core.h> #include <cfree/jit.h> #include <cfree/link.h> #include <fcntl.h> #include <stdarg.h> +#include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -86,6 +90,24 @@ static int xm_to_posix(int p) { if (p & CFREE_PROT_EXEC) q |= PROT_EXEC; return q; } +#if XM_DUAL_LINUX && defined(__x86_64__) && defined(MAP_32BIT) +#define XM_MAP_32BIT MAP_32BIT +static uintptr_t g_xm_low_runtime_hint = 0x40000000u; +static void* xm_low_runtime_hint(size_t n) { + uintptr_t p = g_xm_low_runtime_hint; + uintptr_t step = (uintptr_t)((n + 0xffffu) & ~(size_t)0xffffu); + if (step < 0x10000u) step = 0x10000u; + g_xm_low_runtime_hint = p + step + 0x10000u; + if (g_xm_low_runtime_hint > 0x78000000u) g_xm_low_runtime_hint = 0x40000000u; + return (void*)p; +} +#elif XM_DUAL_LINUX +#define XM_MAP_32BIT 0 +static void* xm_low_runtime_hint(size_t n) { + (void)n; + return NULL; +} +#endif typedef struct XmTok { void* w; void* r; @@ -150,12 +172,14 @@ static CfreeStatus xm_reserve(void* u, size_t n, int p, close(fd); return CFREE_NOMEM; } - w = mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + w = mmap(NULL, n, PROT_READ | PROT_WRITE, + MAP_SHARED | XM_MAP_32BIT, fd, 0); if (w == MAP_FAILED) { close(fd); return CFREE_NOMEM; } - r = mmap(NULL, n, PROT_READ, MAP_SHARED, fd, 0); + r = mmap(xm_low_runtime_hint(n), n, PROT_READ, + MAP_SHARED | XM_MAP_32BIT, fd, 0); close(fd); if (r == MAP_FAILED) { munmap(w, n); @@ -249,10 +273,26 @@ static int slurp(const char* path, uint8_t** out, size_t* len) { * Used by case 28 (extern_resolver); harmless for any case without * unresolved symbols since the resolver is never invoked. 
*/ static int g_extern_default_value = 42; +static void* low_extern_default_value(void) { +#if defined(__linux__) && defined(__x86_64__) && defined(MAP_32BIT) + static int* p; + if (!p) { + void* mem = mmap((void*)0x30000000u, 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_32BIT, -1, 0); + if (mem != MAP_FAILED) { + p = (int*)mem; + *p = 42; + } + } + return p ? (void*)p : (void*)&g_extern_default_value; +#else + return &g_extern_default_value; +#endif +} static void* extern_resolver(void* user, const char* name) { (void)user; (void)name; - return &g_extern_default_value; + return low_extern_default_value(); } /* ---- jit_tls (pthread-key backed) ---- @@ -338,6 +378,19 @@ static void jit_tls_ctx_destroy(void* user, void* ctx_v) { } static CfreeJitTls g_jit_tls = {jit_tls_ctx_new, jit_tls_ctx_destroy, NULL}; +#if defined(__x86_64__) && defined(__linux__) +static long x64_arch_prctl_raw(long code, unsigned long addr) { + register long rax __asm__("rax") = 158; /* SYS_arch_prctl */ + register long rdi __asm__("rdi") = code; + register long rsi __asm__("rsi") = (long)addr; + __asm__ volatile("syscall" + : "+r"(rax) + : "r"(rdi), "r"(rsi) + : "rcx", "r11", "memory"); + return rax; +} +#endif + typedef struct WasmRunnerMemoryPrefix { uint8_t* data; uint64_t pages; @@ -523,13 +576,15 @@ int main(int argc, char** argv) { void* entry = cfree_jit_lookup(jit, "test_main"); int (*fn)(void) = entry; - /* AArch64 TLS local-exec setup. Build a thread-local image — - * 16-byte TCB + .tdata copy + .tbss zero-fill — and point - * TPIDR_EL0 at it. On Darwin, libc functions clobber TPIDR_EL0 - * (probably via dyld stub binding / locale TSD), so msr → call() - * must be back-to-back with NO libc invocations between. */ -#if defined(__aarch64__) || defined(__arm64__) + /* TLS local-exec setup. Build the static TLS block and install the + * target thread pointer immediately before entering JITed code. 
*/ +#if defined(__aarch64__) || defined(__arm64__) || \ + (defined(__x86_64__) && defined(__linux__)) static char tls_block[8192] __attribute__((aligned(16))); +#if defined(__x86_64__) && defined(__linux__) + unsigned long old_fs = 0; + int restore_fs = 0; +#endif { char* td_start = (char*)cfree_jit_lookup(jit, "__tdata_start"); char* td_end = (char*)cfree_jit_lookup(jit, "__tdata_end"); @@ -538,12 +593,26 @@ int main(int argc, char** argv) { if (td_start && td_end) { unsigned long td_n = (unsigned long)(td_end - td_start); unsigned long i; + if (td_n == 0 && bs_n == 0) goto no_static_tls; /* Plain loops at -O0 stay loops; do NOT use memcpy/memset * here — those go through dyld's stub binder on first * call and clobber TPIDR_EL0. */ +#if defined(__x86_64__) && defined(__linux__) + char* tcb = tls_block + sizeof(tls_block) - 64; + char* tls = tcb - (td_n + bs_n); + *(void**)tcb = tcb; + for (i = 0; i < td_n; ++i) tls[i] = td_start[i]; + for (i = 0; i < bs_n; ++i) tls[td_n + i] = 0; + if (x64_arch_prctl_raw(0x1003, (unsigned long)&old_fs) == 0 && + x64_arch_prctl_raw(0x1002, (unsigned long)tcb) == 0) { + restore_fs = 1; + } +#else for (i = 0; i < td_n; ++i) tls_block[16 + i] = td_start[i]; for (i = 0; i < bs_n; ++i) tls_block[16 + td_n + i] = 0; +#endif } + no_static_tls:; } #endif @@ -575,11 +644,22 @@ int main(int argc, char** argv) { __asm__ volatile("msr tpidr_el0, %0" ::"r"(tls_block) : "memory"); #endif result = fn(); +#if defined(__x86_64__) && defined(__linux__) + if (restore_fs) (void)x64_arch_prctl_raw(0x1002, old_fs); +#endif } else { result = 1; } +#if defined(__x86_64__) && defined(__linux__) + if (restore_fs) + (void)x64_arch_prctl_raw( + 0x1002, (unsigned long)(tls_block + sizeof(tls_block) - 64)); +#endif cfree_jit_run_dtors(jit); +#if defined(__x86_64__) && defined(__linux__) + if (restore_fs) (void)x64_arch_prctl_raw(0x1002, old_fs); +#endif if (result == 0) { int (*post)(void) = cfree_jit_lookup(jit, "test_post_fini"); diff --git 
a/test/parse/cases/6_5_2_2_06_struct_param_mixed_fp_int.c b/test/parse/cases/6_5_2_2_06_struct_param_mixed_fp_int.c @@ -0,0 +1,12 @@ +/* SysV-x64 classifies this 16-byte record as SSE + INTEGER, so the + * aggregate consumes one XMM arg slot and one GPR arg slot. */ +struct S { double d; long i; }; + +int take(int pre, struct S s, double post) { + return pre + (int)s.d + (int)s.i + (int)post; +} + +int test_main(void) { + struct S s = {11.0, 17L}; + return take(5, s, 9.0); +} diff --git a/test/parse/cases/6_5_2_2_06_struct_param_mixed_fp_int.expected b/test/parse/cases/6_5_2_2_06_struct_param_mixed_fp_int.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases/6_8_6_4_05_struct_return_mixed_fp_int.c b/test/parse/cases/6_8_6_4_05_struct_return_mixed_fp_int.c @@ -0,0 +1,12 @@ +/* SysV-x64 mixed direct return: double comes back in xmm0, long in rax. */ +struct S { double d; long i; }; + +struct S mk(void) { + struct S s = {20.0, 22L}; + return s; +} + +int test_main(void) { + struct S r = mk(); + return (int)r.d + (int)r.i; +} diff --git a/test/parse/cases/6_8_6_4_05_struct_return_mixed_fp_int.expected b/test/parse/cases/6_8_6_4_05_struct_return_mixed_fp_int.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases/cg_x64_inline_asm_modifiers.c b/test/parse/cases/cg_x64_inline_asm_modifiers.c @@ -0,0 +1,15 @@ +int test_main(void) { +#if defined(__x86_64__) + long acc = 0; + int named = 0; + + __asm__ volatile("movq %1, %0" : "=r"(acc) : "r"(42)); + __asm__ volatile("mov%z0 %k1, %k0" + : [out] "=r"(named) + : [in] "r"(11)); + + return (int)(acc + named); +#else + return 53; +#endif +} diff --git a/test/parse/cases/cg_x64_inline_asm_modifiers.expected b/test/parse/cases/cg_x64_inline_asm_modifiers.expected @@ -0,0 +1 @@ +53 diff --git a/test/parse/harness/parse_runner.c b/test/parse/harness/parse_runner.c @@ -19,6 +19,10 @@ * The execmem boilerplate mirrors test/cg/harness/cg_runner.c — strict * W^X dual mapping on Apple/Linux, single mapping elsewhere. 
*/ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include <cfree/compile.h> #include <cfree/core.h> #include <cfree/jit.h> @@ -89,6 +93,24 @@ static int xm_to_posix(int p) { if (p & CFREE_PROT_EXEC) q |= PROT_EXEC; return q; } +#if XM_DUAL_LINUX && defined(__x86_64__) && defined(MAP_32BIT) +#define XM_MAP_32BIT MAP_32BIT +static uintptr_t g_xm_low_runtime_hint = 0x40000000u; +static void* xm_low_runtime_hint(size_t n) { + uintptr_t p = g_xm_low_runtime_hint; + uintptr_t step = (uintptr_t)((n + 0xffffu) & ~(size_t)0xffffu); + if (step < 0x10000u) step = 0x10000u; + g_xm_low_runtime_hint = p + step + 0x10000u; + if (g_xm_low_runtime_hint > 0x78000000u) g_xm_low_runtime_hint = 0x40000000u; + return (void*)p; +} +#elif XM_DUAL_LINUX +#define XM_MAP_32BIT 0 +static void* xm_low_runtime_hint(size_t n) { + (void)n; + return NULL; +} +#endif typedef struct XmTok { void* w; void* r; @@ -153,12 +175,14 @@ static CfreeStatus xm_reserve(void* u, size_t n, int p, close(fd); return CFREE_NOMEM; } - w = mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + w = mmap(NULL, n, PROT_READ | PROT_WRITE, + MAP_SHARED | XM_MAP_32BIT, fd, 0); if (w == MAP_FAILED) { close(fd); return CFREE_NOMEM; } - r = mmap(NULL, n, PROT_READ, MAP_SHARED, fd, 0); + r = mmap(xm_low_runtime_hint(n), n, PROT_READ, + MAP_SHARED | XM_MAP_32BIT, fd, 0); close(fd); if (r == MAP_FAILED) { munmap(w, n); diff --git a/test/test.mk b/test/test.mk @@ -27,9 +27,9 @@ # asm_parse / cfree_disasm_iter_* are still stubs; the harness builds # and runs end-to-end so the wiring stays exercised. See doc/ASM.md. 
-.PHONY: test test-driver test-lex test-pp test-pp-err test-elf test-ar test-ar-driver test-strip-driver test-objcopy-driver test-objdump-driver test-link test-cg-api test-toy test-opt test-dwarf test-debug test-parse test-parse-err test-asm test-wasm-front test-isa test-aa64-inline test-rv64-inline test-rv64-jit test-emu test-rt-headers test-rt-runtime test-musl test-musl-rv64 test-glibc test-glibc-rv64 test-lib-deps test-smoke-x64 test-smoke-rv64 test-cbackend rv64-doctor +.PHONY: test test-driver test-lex test-pp test-pp-err test-elf test-ar test-ar-driver test-strip-driver test-objcopy-driver test-objdump-driver test-link test-cg-api test-toy test-opt test-dwarf test-debug test-parse test-parse-err test-asm test-wasm-front test-isa test-aa64-inline test-rv64-inline test-rv64-jit test-emu test-x64-inline test-x64-dbg test-rt-headers test-rt-runtime test-musl test-musl-rv64 test-glibc test-glibc-rv64 test-lib-deps test-smoke-x64 test-smoke-rv64 test-cbackend rv64-doctor -test: test-driver test-lex test-pp test-pp-err test-elf test-ar test-ar-driver test-strip-driver test-objcopy-driver test-objdump-driver test-link test-toy test-dwarf test-debug test-parse test-parse-err test-asm test-isa test-aa64-inline test-rv64-inline test-rv64-jit test-emu test-rt-headers test-lib-deps +test: test-driver test-lex test-pp test-pp-err test-elf test-ar test-ar-driver test-strip-driver test-objcopy-driver test-objdump-driver test-link test-toy test-dwarf test-debug test-parse test-parse-err test-asm test-isa test-aa64-inline test-rv64-inline test-rv64-jit test-emu test-x64-inline test-x64-dbg test-rt-headers test-lib-deps # `test-cbackend` is intentionally not in the default `test` target: the # Phase 1 C backend skips most fixtures pending later phases, which would # add noise to the default summary. Run it explicitly to gate progress. @@ -105,24 +105,14 @@ $(DWARF_TEST_BIN): test/dwarf/dwarf_test.c $(LIB_AR) # function symbol). 
Deliberately bypasses the consumer (cfree_dwarf_open) # so encoder bugs aren't masked by matching decoder bugs. DEBUG_TEST_BIN = build/test/debug_roundtrip_unit -DEBUG_CFI_TEST_BIN = build/test/debug_cfi_unit -test-debug: $(DEBUG_TEST_BIN) $(DEBUG_CFI_TEST_BIN) +test-debug: $(DEBUG_TEST_BIN) $(DEBUG_TEST_BIN) - $(DEBUG_CFI_TEST_BIN) $(DEBUG_TEST_BIN): test/debug/roundtrip_unit.c $(LIB_AR) @mkdir -p $(dir $@) $(CC) $(DRIVER_CFLAGS) -Isrc test/debug/roundtrip_unit.c $(LIB_AR) -o $@ -# CFI .eh_frame producer unit test. Drives MCEmitter directly, opens an -# FDE per arch, asserts the buffered CIE/FDE bytes match the locked -# per-arch psABI defaults (return-addr reg, code/data align factors, -# CFA at entry) and the FDE program byte encoding. -$(DEBUG_CFI_TEST_BIN): test/debug/cfi_unit.c $(LIB_AR) - @mkdir -p $(dir $@) - $(CC) $(DRIVER_CFLAGS) -Isrc test/debug/cfi_unit.c $(LIB_AR) -o $@ - # aa64 ISA descriptor-table unit test (doc/ASM.md phase 2). Covers # every AA64Format the table maps and the alias-precedence invariant # (first-match disasm picks the alias spelling over the canonical @@ -136,11 +126,9 @@ $(AA64_ISA_TEST_BIN): test/arch/aa64_isa_test.c $(LIB_AR) @mkdir -p $(dir $@) $(CC) $(DRIVER_CFLAGS) -Isrc test/arch/aa64_isa_test.c $(LIB_AR) -o $@ -# test-emu: emulator unit tests. The rv64 lane builds a tiny in-memory -# rv64 ELF, runs it through emu_load_elf + emu_decode_block + -# emu_cpu_interp_block, and asserts the guest exits with the expected -# code via the SYS_exit_group syscall handler. Internal arch/emu -# surface — needs -Isrc. +# test-emu: emulator unit tests. The rv64 lane builds tiny in-memory rv64 +# ELFs and asserts the interpreter exits through the syscall handler with +# the expected code. Internal arch/emu surface — needs -Isrc. 
EMU_RV64_TEST_BIN = build/test/emu_rv64_test EMU_RV64_EXTRAS_TEST_BIN = build/test/emu_rv64_extras_test @@ -206,16 +194,9 @@ $(RV64_INLINE_TEST_BIN): test/arch/rv64_inline_test.c $(LIB_AR) @mkdir -p $(dir $@) $(CC) $(DRIVER_CFLAGS) -Isrc test/arch/rv64_inline_test.c $(LIB_AR) -o $@ -# rv64 JIT smoke test. Builds a tiny rv64 ELF .o in memory, runs it -# through cfree_link_session in JIT-output mode, and (on a rv64 host) -# calls the resulting function. On non-rv64 hosts the test still -# exercises every JIT path (execmem reserve+protect, reloc apply, -# symbol lookup, icache flush) and then exits 77 — "skipped" by the -# autotools convention — which the shell wrapper below translates to -# a printed SKIP without failing the suite. This is the only place -# in the parity work where a green default-target on aa64/x64 hosts -# is the "still wired" signal; the native-execution leg only fires -# on a riscv64 Linux box. +# rv64 JIT smoke test. Builds a tiny rv64 ELF .o in memory, runs it +# through cfree_link_session in JIT-output mode, and skips native execution +# on non-riscv64 hosts after exercising the JIT mapping/reloc path. RV64_JIT_TEST_BIN = build/test/rv64_jit_test test-rv64-jit: $(RV64_JIT_TEST_BIN) @@ -231,6 +212,28 @@ $(RV64_JIT_TEST_BIN): test/link/rv64_jit_test.c $(LIB_AR) @mkdir -p $(dir $@) $(CC) $(DRIVER_CFLAGS) test/link/rv64_jit_test.c $(LIB_AR) -o $@ +# x86_64 peer of test-aa64-inline (doc/INLINEASM.md). Drives x_asm_block +# (CGTarget vtable) directly with hand-rolled Operand arrays and asserts +# the emitted .text bytes match the expected machine encoding. Also +# pins the current %w/%x/%a behavior and exercises the new %b modifier. 
+X64_INLINE_TEST_BIN = build/test/x64_inline_test + +test-x64-inline: $(X64_INLINE_TEST_BIN) + $(X64_INLINE_TEST_BIN) + +$(X64_INLINE_TEST_BIN): test/arch/x64_inline_test.c $(LIB_AR) + @mkdir -p $(dir $@) + $(CC) $(DRIVER_CFLAGS) -Isrc test/arch/x64_inline_test.c $(LIB_AR) -o $@ + +X64_DBG_TEST_BIN = build/test/x64_dbg_test + +test-x64-dbg: $(X64_DBG_TEST_BIN) + $(X64_DBG_TEST_BIN) + +$(X64_DBG_TEST_BIN): test/arch/x64_dbg_test.c $(LIB_AR) + @mkdir -p $(dir $@) + $(CC) $(DRIVER_CFLAGS) -Isrc test/arch/x64_dbg_test.c $(LIB_AR) -o $@ + RT_HEADER_TEST_TARGETS = \ aarch64-linux-gnu \ x86_64-linux-gnu \ @@ -342,14 +345,6 @@ test-smoke-x64: test-smoke-rv64: bash test/smoke/rv64.sh -# rv64-doctor: standalone prereq check for the rv64 lane (clang -# RISC-V target, ld.lld, qemu-riscv64, podman, native host). Prints -# one line per probe with install hints, exits 0 only when at least -# one runner *and* the cross-compile toolchain are usable. Safe to -# run anywhere — no build artifacts required. -rv64-doctor: - bash test/lib/check_rv64_env.sh - # test-musl / test-glibc: end-to-end static + dynamic libc link/run on # aarch64. Each variant pulls its own pinned sysroot (podman, ~30s on # first run) and shares the same case files under test/libc/cases/: @@ -391,20 +386,40 @@ $(GLIBC_SYSROOT_X64_MARKER): test/libc/glibc/extract.sh test/libc/glibc/Containe $(GLIBC_SYSROOT_RV64_MARKER): test/libc/glibc/extract.sh test/libc/glibc/Containerfile.rv64 @bash test/libc/glibc/extract.sh -a rv64 -test-musl: bin rt-aarch64-linux $(MUSL_SYSROOT_MARKER) - @bash test/libc/musl/run.sh - -# rv64 counterpart of test-musl. Excluded from the default `test` -# target for the same reason as test-musl: needs podman + qemu. -test-musl-rv64: bin rt-riscv64-linux $(MUSL_SYSROOT_RV64_MARKER) - @bash test/libc/musl/run.sh -a rv64 - -test-glibc: bin rt-aarch64-linux $(GLIBC_SYSROOT_MARKER) - @bash test/libc/glibc/run.sh - -# rv64 counterpart of test-glibc. Same opt-in convention as test-glibc. 
-test-glibc-rv64: bin rt-riscv64-linux $(GLIBC_SYSROOT_RV64_MARKER) - @bash test/libc/glibc/run.sh -a rv64 +# test-musl / test-glibc honor CFREE_LIBC_ARCHES (default "aa64"; +# values: aa64, x64, rv64). Each enabled arch contributes its sysroot +# PROVENANCE marker and its rt archive to the prerequisite list, so +# `CFREE_LIBC_ARCHES="aa64 x64" make test-musl` builds both sysroots +# + both rt archives before the runner script picks them up. The +# default keeps the aa64-only prerequisite set so `make test-musl` is +# backwards-compatible. +CFREE_LIBC_ARCHES ?= aa64 + +# Map an arch token to its musl/glibc sysroot marker and rt target. +_LIBC_MUSL_SYSROOT_aa64 = $(MUSL_SYSROOT_MARKER) +_LIBC_MUSL_SYSROOT_x64 = $(MUSL_SYSROOT_X64_MARKER) +_LIBC_MUSL_SYSROOT_rv64 = $(MUSL_SYSROOT_RV64_MARKER) +_LIBC_GLIBC_SYSROOT_aa64 = $(GLIBC_SYSROOT_MARKER) +_LIBC_GLIBC_SYSROOT_x64 = $(GLIBC_SYSROOT_X64_MARKER) +_LIBC_GLIBC_SYSROOT_rv64 = $(GLIBC_SYSROOT_RV64_MARKER) +_LIBC_RT_aa64 = rt-aarch64-linux +_LIBC_RT_x64 = rt-x86_64-linux +_LIBC_RT_rv64 = rt-riscv64-linux + +LIBC_MUSL_DEPS = $(foreach a,$(CFREE_LIBC_ARCHES),$(_LIBC_MUSL_SYSROOT_$(a)) $(_LIBC_RT_$(a))) +LIBC_GLIBC_DEPS = $(foreach a,$(CFREE_LIBC_ARCHES),$(_LIBC_GLIBC_SYSROOT_$(a)) $(_LIBC_RT_$(a))) + +test-musl: bin $(LIBC_MUSL_DEPS) + @CFREE_LIBC_ARCHES="$(CFREE_LIBC_ARCHES)" bash test/libc/musl/run.sh + +test-glibc: bin $(LIBC_GLIBC_DEPS) + @CFREE_LIBC_ARCHES="$(CFREE_LIBC_ARCHES)" bash test/libc/glibc/run.sh + +test-musl-rv64: + @$(MAKE) test-musl CFREE_LIBC_ARCHES=rv64 + +test-glibc-rv64: + @$(MAKE) test-glibc CFREE_LIBC_ARCHES=rv64 # Fail if libcfree.a depends on any external symbol not in the allowlist. # Drift in either direction (new dep, or stale entry) is a failure.