# amd64.py (17779B)
from common import (
    AddI,
    ArchDef,
    BranchReg,
    CondB,
    CondBZ,
    Enter,
    La,
    LaBr,
    LdArg,
    Li,
    LogI,
    Mem,
    Mov,
    Nullary,
    Rrr,
    ShiftI,
    byte,
    le32,
    round_up,
)


# ---- Native register numbers --------------------------------------------
#
# Backend-private map from P1 register names to native amd64 register
# numbers. `br` is the hidden branch-target register (r15) and `scratch`
# the per-expansion scratch register (r9). rax and rbp are also used
# internally (retaddr spill; rcx / rdx save slots) but are never visible
# at the P1 level.

NAT = {
    'a0': 7,   # rdi
    'a1': 6,   # rsi
    'a2': 2,   # rdx
    'a3': 1,   # rcx
    't0': 10,  # r10
    't1': 11,  # r11
    't2': 8,   # r8
    's0': 3,   # rbx
    's1': 12,  # r12
    's2': 13,  # r13
    's3': 14,  # r14
    'sp': 4,   # rsp
    'br': 15,  # r15
    'scratch': 9,  # r9
    'rax': 0,
    'rcx': 1,
    'rdx': 2,
    'rbx': 3,
    'rsp': 4,
    'rbp': 5,
    'rsi': 6,
    'rdi': 7,
    'r8': 8,
    'r9': 9,
    'r10': 10,
    'r11': 11,
    'r12': 12,
    'r13': 13,
    'r14': 14,
    'r15': 15,
}


SYSCALL_NUMBERS = {
    'SYS_READ': 0,
    'SYS_WRITE': 1,
    'SYS_CLOSE': 3,
    'SYS_OPENAT': 257,
    'SYS_EXIT': 60,
    'SYS_CLONE': 56,
    'SYS_EXECVE': 59,
    'SYS_WAITID': 247,
}


# ---- REX / ModRM helpers ------------------------------------------------

def amd_rex_b_short(r):
    # Optional one-byte REX.B (no W) prefix, needed by push/pop/jmp r/
    # call r/mov r,imm32 whenever the target register is r8-r15. Low
    # registers take no prefix, hence the empty string.
    return byte(0x41) if NAT[r] >= 8 else ''


def amd_rex_wb(r):
    # REX with W=1 and B = bit 3 of the register number, extending
    # ModRM.rm / SIB.base into the r8-r15 range.
    return byte(0x48 | ((NAT[r] >> 3) & 1))


def amd_rex_wrb(rg, rm):
    # REX with W=1, R = bit 3 of `rg`, B = bit 3 of `rm`. Emitted whenever
    # the ModRM.reg field and the ModRM.rm field are in use together.
    r_bit = (NAT[rg] >> 3) & 1
    b_bit = (NAT[rm] >> 3) & 1
    return byte(0x48 | (r_bit << 2) | b_bit)


def amd_modrm_rr(rg, rm):
    # mod=11 register-direct ModRM byte.
    return byte(0xC0 | ((NAT[rg] & 7) << 3) | (NAT[rm] & 7))


def amd_modrm_ext_r(ext, rm):
    # mod=11 ModRM byte carrying an opcode-extension digit in the reg
    # field (the /N notation in the opcode maps).
    return byte(0xC0 | ((ext & 7) << 3) | (NAT[rm] & 7))


# ---- Memory-addressing ModRM (+ SIB + disp) ----------------------------
#
# [base + disp] with `reg` in ModRM.reg. Bases whose low 3 bits are 100 -
# rsp and r12 - must be encoded through a SIB byte; every other base takes
# the plain form. disp selects mod=1 (disp8) when it fits in [-128,127]
# and mod=2 (disp32) otherwise. mod=0 / no-disp is never emitted; the
# extra displacement byte is harmless.

def amd_modrm_disp(reg, base, disp):
    reg_lo = NAT[reg] & 7
    base_lo = NAT[base] & 7
    small = -128 <= disp <= 127
    if base_lo == 4:
        # SIB-mandatory base (rsp / r12); SIB byte 0x24 means "no index".
        if small:
            return byte(0x44 | (reg_lo << 3)) + byte(0x24) + byte(disp)
        return byte(0x84 | (reg_lo << 3)) + byte(0x24) + le32(disp)
    if small:
        return byte(0x40 | (reg_lo << 3) | base_lo) + byte(disp)
    return byte(0x80 | (reg_lo << 3) | base_lo) + le32(disp)


# ---- Register / arithmetic primitives ----------------------------------

def amd_mov_rr(dst, src):
    # mov dst, src -- REX.WRB 89 /r (source in ModRM.reg, dest in rm)
    return amd_rex_wrb(src, dst) + byte(0x89) + amd_modrm_rr(src, dst)


def amd_alu_rr(opcode, dst, src):
    # ADD/SUB/AND/OR/XOR dst, src -- REX.WRB <op> /r (src in reg, dst in rm)
    return amd_rex_wrb(src, dst) + byte(opcode) + amd_modrm_rr(src, dst)


def amd_alu_ri8(ext, dst, imm):
    # op dst, imm8 -- REX.WB 83 /ext ib (imm8 is sign-extended to 64 bits)
    return amd_rex_wb(dst) + byte(0x83) + amd_modrm_ext_r(ext, dst) + byte(imm)


def amd_alu_ri32(ext, dst, imm):
    # op dst, imm32 -- REX.WB 81 /ext id
    return amd_rex_wb(dst) + byte(0x81) + amd_modrm_ext_r(ext, dst) + le32(imm)
def amd_shift_ri8(ext, dst, imm):
    # shift dst, imm8 -- REX.WB C1 /ext ib (SHL=4, SHR=5, SAR=7). The
    # count is masked to 6 bits, matching 64-bit hardware behavior.
    count = byte(imm & 0x3F)
    return amd_rex_wb(dst) + byte(0xC1) + amd_modrm_ext_r(ext, dst) + count


def amd_shift_cl(ext, dst):
    # shift dst, cl -- REX.WB D3 /ext
    return amd_rex_wb(dst) + byte(0xD3) + amd_modrm_ext_r(ext, dst)


def amd_imul_rr(dst, src):
    # imul dst, src -- REX.WRB 0F AF /r (dst in reg, src in rm)
    return ''.join([
        amd_rex_wrb(dst, src),
        byte(0x0F),
        byte(0xAF),
        amd_modrm_rr(dst, src),
    ])


def amd_idiv_r(src):
    # idiv src -- REX.WB F7 /7
    return amd_rex_wb(src) + byte(0xF7) + amd_modrm_ext_r(7, src)


def amd_cqo():
    # cqo -- 48 99 (sign-extends rax into rdx:rax ahead of idiv)
    return byte(0x48) + byte(0x99)


def amd_push(r):
    # push r64 -- [REX.B] 50+r
    return amd_rex_b_short(r) + byte(0x50 | (NAT[r] & 7))


def amd_pop(r):
    # pop r64 -- [REX.B] 58+r
    return amd_rex_b_short(r) + byte(0x58 | (NAT[r] & 7))


def amd_mov_imm32_prefix(rd):
    # mov r32, imm32 -- [REX.B] B8+r; the caller appends the 4-byte
    # literal. The value is zero-extended into the full 64-bit register.
    return amd_rex_b_short(rd) + byte(0xB8 | (NAT[rd] & 7))


def amd_mov_imm64_prefix(rd):
    # mov r64, imm64 -- REX.W[.B] B8+r; the caller appends the 8-byte
    # literal.
    return amd_rex_wb(rd) + byte(0xB8 | (NAT[rd] & 7))


# ---- Memory ops --------------------------------------------------------

def amd_mem_LD(rt, rn, off):
    # mov rT, [rN + off] -- REX.WRB 8B /r with ModRM+disp addressing
    return amd_rex_wrb(rt, rn) + byte(0x8B) + amd_modrm_disp(rt, rn, off)


def amd_mem_ST(rt, rn, off):
    # mov [rN + off], rT -- REX.WRB 89 /r
    return amd_rex_wrb(rt, rn) + byte(0x89) + amd_modrm_disp(rt, rn, off)


def amd_mem_SB(rt, rn, off):
    # mov [rN + off], rT8 -- REX.WRB 88 /r. The REX prefix forces the
    # dil/sil/bpl/spl byte-register view when the low byte of those regs
    # is addressed.
    return amd_rex_wrb(rt, rn) + byte(0x88) + amd_modrm_disp(rt, rn, off)


def amd_mem_LB(rt, rn, off):
    # movzx rT, byte ptr [rN + off] -- REX.WRB 0F B6 /r
    return ''.join([
        amd_rex_wrb(rt, rn),
        byte(0x0F),
        byte(0xB6),
        amd_modrm_disp(rt, rn, off),
    ])


# ---- Control-flow primitives -------------------------------------------

def amd_jmp_r(r):
    # jmp r/m64 -- [REX.B] FF /4: 2 bytes for low regs, 3 for r8-r15.
    return amd_rex_b_short(r) + byte(0xFF) + byte(0xE0 | (NAT[r] & 7))


def amd_call_r(r):
    # call r/m64 -- [REX.B] FF /2.
    return amd_rex_b_short(r) + byte(0xFF) + byte(0xD0 | (NAT[r] & 7))


def amd_ret():
    # ret -- C3
    return byte(0xC3)


def amd_syscall():
    # syscall -- 0F 05
    return byte(0x0F) + byte(0x05)


def amd_cmp_rr(ra, rb):
    # cmp rA, rB -- REX.WRB 39 /r (rB in ModRM.reg, rA in rm).
    return amd_rex_wrb(rb, ra) + byte(0x39) + amd_modrm_rr(rb, ra)


def amd_test_rr(ra, rb):
    # test rA, rB -- REX.WRB 85 /r (same operand placement as cmp).
    return amd_rex_wrb(rb, ra) + byte(0x85) + amd_modrm_rr(rb, ra)


# ---- P1 register-register op lowering ----------------------------------
#
# ADD/SUB/AND/OR/XOR must honor rD==rB aliasing: the naive
# `mov rD,rA ; op rD,rB` would overwrite rB before the op could read it.
# When that aliasing occurs, rB is routed through the scratch register.

ALU_OPCODE = {
    'ADD': 0x01,
    'SUB': 0x29,
    'AND': 0x21,
    'OR': 0x09,
    'XOR': 0x31,
}


def amd_rrr_simple(opcode, rd, ra, rb):
    if NAT[rd] != NAT[rb]:
        return amd_mov_rr(rd, ra) + amd_alu_rr(opcode, rd, rb)
    # rd aliases rb: preserve rb's value in scratch before rd is written.
    return (amd_mov_rr('scratch', rb)
            + amd_mov_rr(rd, ra)
            + amd_alu_rr(opcode, rd, 'scratch'))


def amd_rrr_MUL(rd, ra, rb):
    if NAT[rd] != NAT[rb]:
        return amd_mov_rr(rd, ra) + amd_imul_rr(rd, rb)
    # rd aliases rb: preserve rb's value in scratch before rd is written.
    return (amd_mov_rr('scratch', rb)
            + amd_mov_rr(rd, ra)
            + amd_imul_rr(rd, 'scratch'))
# DIV / REM clobber rax and rdx natively. rax is not a P1 register, so we
# clobber it freely; rdx IS P1 a2, so we stash it to rbp (also outside the
# P1 mapping) for the lifetime of the op. Aliasing-safety plan mirrors the
# M1pp comments verbatim.

def amd_rrr_DIV(rd, ra, rb):
    # rD = rA / rB (signed). rb is copied into scratch before rax/rdx are
    # touched, so the sequence is safe even when rb aliases rax or rdx;
    # the final write to rd comes after the rdx restore, so rd == rdx
    # still receives the quotient.
    return ''.join([
        amd_mov_rr('rbp', 'rdx'),    # save P1 a2 (rdx)
        amd_mov_rr('scratch', rb),   # divisor -> scratch, pre-clobber
        amd_mov_rr('rax', ra),       # dividend -> rax
        amd_cqo(),                   # sign-extend into rdx:rax
        amd_idiv_r('scratch'),       # quotient -> rax, remainder -> rdx
        amd_mov_rr('rdx', 'rbp'),    # restore a2
        amd_mov_rr(rd, 'rax'),       # quotient -> rD
    ])


def amd_rrr_REM(rd, ra, rb):
    # rD = rA % rB (signed). Same save/restore plan as DIV; the remainder
    # is parked in rax so rdx can be restored before rd is written.
    return ''.join([
        amd_mov_rr('rbp', 'rdx'),    # save P1 a2 (rdx)
        amd_mov_rr('scratch', rb),   # divisor -> scratch, pre-clobber
        amd_mov_rr('rax', ra),       # dividend -> rax
        amd_cqo(),                   # sign-extend into rdx:rax
        amd_idiv_r('scratch'),       # remainder lands in rdx
        amd_mov_rr('rax', 'rdx'),    # park remainder in rax
        amd_mov_rr('rdx', 'rbp'),    # restore a2
        amd_mov_rr(rd, 'rax'),       # remainder -> rD
    ])


# SHL / SHR / SAR with reg count. x86 reads the count from CL only, so
# staging goes through rcx -- which IS P1 a3. Save rcx to rbp for the
# duration. Ordering matches the M1pp comments.

def amd_rrr_shift(ext, rd, ra, rb):
    # rD = rA <shift> rB. ra is staged in scratch before rcx changes, so
    # ra == rcx is safe; rb is read before rcx is overwritten, so
    # rb == rcx is safe too. rd is written last, after the rcx restore.
    return ''.join([
        amd_mov_rr('rbp', 'rcx'),    # save P1 a3 (rcx)
        amd_mov_rr('scratch', ra),   # value to shift -> scratch
        amd_mov_rr('rcx', rb),       # count -> cl
        amd_shift_cl(ext, 'scratch'),
        amd_mov_rr('rcx', 'rbp'),    # restore a3
        amd_mov_rr(rd, 'scratch'),   # result -> rD
    ])


# ---- Encoders ----------------------------------------------------------
#
# Each encoder takes the arch definition and one parsed instruction row
# and returns the native byte string for it.

def encode_li(_arch, row):
    # LI rd, imm64 -- assembler appends the 8-byte literal after the prefix.
    return amd_mov_imm64_prefix(row.rd)


def encode_la(_arch, row):
    # LA rd, label -- assembler appends the 4-byte address literal.
    return amd_mov_imm32_prefix(row.rd)


def encode_labr(_arch, _row):
    # LA into the hidden branch-target register (r15).
    return amd_mov_imm32_prefix('br')


def encode_mov(_arch, row):
    # Portable sp is the frame-local base, which is 16 bytes above native
    # rsp. Reading sp into a register yields native_rsp + 16, so emit
    # `mov rd, rsp ; add rd, 16` for the sp-source case.
    if row.rs == 'sp':
        return amd_mov_rr(row.rd, 'sp') + amd_alu_ri8(0, row.rd, 16)
    return amd_mov_rr(row.rd, row.rs)


def encode_rrr(_arch, row):
    # Three-register ops: MUL/DIV/REM and the CL-count shifts need their
    # dedicated sequences; everything else is a plain two-op ALU pattern.
    if row.op == 'MUL':
        return amd_rrr_MUL(row.rd, row.ra, row.rb)
    if row.op == 'DIV':
        return amd_rrr_DIV(row.rd, row.ra, row.rb)
    if row.op == 'REM':
        return amd_rrr_REM(row.rd, row.ra, row.rb)
    if row.op == 'SHL':
        return amd_rrr_shift(4, row.rd, row.ra, row.rb)
    if row.op == 'SHR':
        return amd_rrr_shift(5, row.rd, row.ra, row.rb)
    if row.op == 'SAR':
        return amd_rrr_shift(7, row.rd, row.ra, row.rb)
    return amd_rrr_simple(ALU_OPCODE[row.op], row.rd, row.ra, row.rb)


def encode_addi(_arch, row):
    # ADDI rd, ra, imm -- `mov rd, ra ; add rd, imm`, picking the short
    # sign-extended imm8 form when the constant fits.
    head = amd_mov_rr(row.rd, row.ra)
    if -128 <= row.imm <= 127:
        return head + amd_alu_ri8(0, row.rd, row.imm)
    return head + amd_alu_ri32(0, row.rd, row.imm)


# AND/OR with imm: 83 /ext ib sign-extends imm8 to 64 bits. That works for
# imm in [-128, 127] (and for -1 as a convenient all-ones mask), but breaks
# for positive imms >= 128 -- ANDI with 255 would become AND with
# 0xFFFFFFFFFFFFFFFF. Widen to the imm32 form when imm8 would misencode.
LOGI_EXT = {
    'ANDI': 4,  # /4 = AND
    'ORI': 1,   # /1 = OR
}


def encode_logi(_arch, row):
    # ANDI/ORI rd, ra, imm -- `mov rd, ra ; and/or rd, imm`.
    head = amd_mov_rr(row.rd, row.ra)
    ext = LOGI_EXT[row.op]
    if -128 <= row.imm <= 127:
        return head + amd_alu_ri8(ext, row.rd, row.imm)
    return head + amd_alu_ri32(ext, row.rd, row.imm)


# Opcode-extension digits for the C1 /ext immediate-count shifts.
SHIFTI_EXT = {
    'SHLI': 4,
    'SHRI': 5,
    'SARI': 7,
}


def encode_shifti(_arch, row):
    # SHLI/SHRI/SARI rd, ra, imm -- `mov rd, ra ; shift rd, imm`.
    return (amd_mov_rr(row.rd, row.ra)
            + amd_shift_ri8(SHIFTI_EXT[row.op], row.rd, row.imm))
def encode_mem(_arch, row):
    # Portable sp points at the frame-local base; the hidden 16-byte frame
    # header occupies native_rsp+0..15 and is not portable-addressable, so
    # sp-relative offsets are shifted past the header.
    off = row.off + (16 if row.rn == 'sp' else 0)
    emitters = {
        'LD': amd_mem_LD,
        'ST': amd_mem_ST,
        'LB': amd_mem_LB,
        'SB': amd_mem_SB,
    }
    emit = emitters.get(row.op)
    if emit is None:
        raise ValueError(f'unknown mem op: {row.op}')
    return emit(row.rt, row.rn, off)


def encode_ldarg(_arch, row):
    # Internal callers skip the +16 sp-base translation: native rsp+8
    # holds the caller-sp pointer stored by p1_enter, and the first
    # incoming stack-arg word sits 16 bytes past that pointer.
    load_caller_sp = amd_mem_LD('scratch', 'sp', 8)
    load_arg = amd_mem_LD(row.rd, 'scratch', 16 + 8 * row.slot)
    return load_caller_sp + load_arg


def amd_epilogue_prefix():
    # Frame-teardown prefix shared by ERET, TAIL and TAILR: load retaddr
    # into scratch (r9) and the saved caller sp into rax, unwind rsp, and
    # re-push the retaddr so the trailing `ret` or `jmp` finds the
    # expected top-of-stack layout. (TAIL/TAILR end in a jmp, but the
    # retaddr must be back on the stack so the eventual callee `ret`
    # returns to the original caller.)
    return (amd_mem_LD('scratch', 'sp', 0)
            + amd_mem_LD('rax', 'sp', 8)
            + amd_mov_rr('sp', 'rax')
            + amd_push('scratch'))


def encode_branch_reg(_arch, row):
    kind = row.kind
    if kind == 'BR':
        return amd_jmp_r(row.rs)
    if kind == 'CALLR':
        return amd_call_r(row.rs)
    if kind == 'TAILR':
        # Tear the frame down, then jump through the target register.
        return amd_epilogue_prefix() + amd_jmp_r(row.rs)
    raise ValueError(f'unknown branch-reg kind: {row.kind}')


# Conditional-branch lowering:
#     cmp / test
#     Jcc_inverse +3   -- hop over the 3-byte `jmp r15`
#     jmp r15          -- P1 branch-taken path
#
# Inverted condition codes: BEQ->JNE(75), BNE->JE(74), BLT->JGE(7D),
# BLTU->JAE(73), BLTZ->JGE(7D), BEQZ->JNE(75), BNEZ->JE(74). The 0x03
# rel8 skips `amd_jmp_r(br)`, which is 3 bytes (REX.B 41 + FF + E7).
CONDB_INVERT = {
    'BEQ': 0x75,   # JNE
    'BNE': 0x74,   # JE
    'BLT': 0x7D,   # JGE
    'BLTU': 0x73,  # JAE
}

CONDBZ_INVERT = {
    'BEQZ': 0x75,  # JNE
    'BNEZ': 0x74,  # JE
    'BLTZ': 0x7D,  # JGE
}


def encode_condb(_arch, row):
    # Two-register compare-and-branch through the hidden br register.
    skip_taken_jmp = byte(CONDB_INVERT[row.op]) + byte(0x03)
    return amd_cmp_rr(row.ra, row.rb) + skip_taken_jmp + amd_jmp_r('br')


def encode_condbz(_arch, row):
    # Compare-against-zero branch: `test ra, ra` sets the same flags.
    skip_taken_jmp = byte(CONDBZ_INVERT[row.op]) + byte(0x03)
    return amd_test_rr(row.ra, row.ra) + skip_taken_jmp + amd_jmp_r('br')


def encode_enter(arch, row):
    # CALL on amd64 pushed the retaddr, so on entry:
    #     rsp   = caller_sp - 8
    #     [rsp] = retaddr
    #
    # Standard frame after ENTER:
    #     [sp + 0]                = retaddr
    #     [sp + 8]                = saved caller_sp
    #     [sp + 16 .. 16+size-1]  = portable locals
    #     total frame = round_up(stack_align, 16 + size)
    frame = round_up(arch.stack_align, 2 * arch.word_bytes + row.size)
    return ''.join([
        amd_pop('scratch'),              # retaddr -> scratch; rsp = caller_sp
        amd_mov_rr('rax', 'sp'),         # rax = caller_sp
        amd_alu_ri32(5, 'sp', frame),    # sub rsp, frame (/5 = SUB)
        amd_mem_ST('scratch', 'sp', 0),  # [sp+0] = retaddr
        amd_mem_ST('rax', 'sp', 8),      # [sp+8] = caller_sp
    ])
def encode_nullary(_arch, row):
    # Operand-less P1 instructions: unconditional flow through the hidden
    # br register, returns, and the full Linux syscall shuffle.
    if row.kind == 'B':
        return amd_jmp_r('br')
    if row.kind == 'CALL':
        return amd_call_r('br')
    if row.kind == 'RET':
        return amd_ret()
    if row.kind == 'ERET':
        return amd_epilogue_prefix() + amd_ret()
    if row.kind == 'TAIL':
        return amd_epilogue_prefix() + amd_jmp_r('br')
    if row.kind == 'SYSCALL':
        # P1: a0=num, a1..a3,t0,s0,s1 = args 0..5. Linux amd64: rax=num,
        # rdi/rsi/rdx/r10/r8/r9 = args 0..5, return in rax; syscall also
        # clobbers rcx and r11.
        #
        # Push the P1 registers whose native slots get overwritten or
        # syscall-clobbered -- rsi (a1), rdx (a2), rcx (a3), r11 (t1),
        # r8 (t2) -- then shuffle into the native arg slots, issue
        # syscall, restore, and move the return value (rax) into a0
        # (rdi). Stack offsets after the 5 pushes: [rsp+0]=r8,
        # [rsp+8]=r11, [rsp+16]=rcx (a3), [rsp+24]=rdx (a2),
        # [rsp+32]=rsi (a1).
        #
        # Note t0 already lives in r10 natively (native arg slot 3), so
        # no move is needed for arg 3.
        return ''.join([
            amd_push('rsi'),
            amd_push('rdx'),
            amd_push('rcx'),
            amd_push('r11'),
            amd_push('r8'),
            amd_mov_rr('rax', 'rdi'),      # syscall number (a0) -> rax
            amd_mem_LD('rdi', 'sp', 32),   # arg0 = saved a1
            amd_mem_LD('rsi', 'sp', 24),   # arg1 = saved a2
            amd_mem_LD('rdx', 'sp', 16),   # arg2 = saved a3
            amd_mov_rr('r8', 'rbx'),       # arg4 = s0
            amd_mov_rr('r9', 'r12'),       # arg5 = s1
            amd_syscall(),
            amd_pop('r8'),
            amd_pop('r11'),
            amd_pop('rcx'),
            amd_pop('rdx'),
            amd_pop('rsi'),
            amd_mov_rr('rdi', 'rax'),      # return value -> a0
        ])
    raise ValueError(f'unknown nullary kind: {row.kind}')


def amd_start_stub():
    # Backend-owned :_start stub per docs/P1.md §Program Entry. Linux amd64
    # puts argc at [rsp] and argv starting at [rsp+8]. Load argc into a0
    # (rdi), compute &argv[0] into a1 (rsi), call p1_main under the
    # one-word direct-result convention, then issue sys_exit with
    # p1_main's return value in a0 (== rdi). Mirrors the `%p1_entry`
    # macro in p1/P1-amd64.M1pp.
    #
    # Raw hex outside DEFINE bodies must be single-quoted so bootstrap M0
    # treats it as a literal byte run. The bootstrap amd64 M0 has a 256B
    # token buffer, so each quoted run must stay <= 128 hex chars; we
    # split into multiple short lines defensively.
    def q(hex_bytes):
        # Wrap a hex run in single quotes for the bootstrap assembler.
        return f"'{hex_bytes}'"

    load_argc = amd_mem_LD('a0', 'sp', 0)
    compute_argv = amd_mov_rr('a1', 'sp') + amd_alu_ri8(0, 'a1', 8)
    labr_prefix = amd_mov_imm32_prefix('br')
    call_main = amd_call_r('br')
    # mov eax, 60 ; syscall. P1 a0 (rdi) already holds p1_main's return.
    sys_exit = byte(0xB8) + le32(60) + amd_syscall()

    # The '&p1_main' line supplies the 4-byte address literal that
    # completes the labr_prefix mov.
    return [
        ':_start',
        q(load_argc),
        q(compute_argv),
        q(labr_prefix),
        '&p1_main',
        q(call_main),
        q(sys_exit),
    ]


# Row-type -> encoder dispatch table consumed by the shared driver.
ENCODERS = {
    Li: encode_li,
    La: encode_la,
    LaBr: encode_labr,
    Mov: encode_mov,
    Rrr: encode_rrr,
    AddI: encode_addi,
    LogI: encode_logi,
    ShiftI: encode_shifti,
    Mem: encode_mem,
    LdArg: encode_ldarg,
    Nullary: encode_nullary,
    BranchReg: encode_branch_reg,
    CondB: encode_condb,
    CondBZ: encode_condbz,
    Enter: encode_enter,
}


# Public backend descriptor: 64-bit words, 16-byte stack alignment.
ARCH = ArchDef(
    name='amd64',
    word_bytes=8,
    stack_align=16,
    syscall_numbers=SYSCALL_NUMBERS,
    encoders=ENCODERS,
    start_stub=amd_start_stub,
)