mmu.c (5860B)
1 typedef unsigned long u64; 2 typedef unsigned int u32; 3 typedef unsigned short u16; 4 typedef unsigned char u8; 5 6 #include "arch.h" 7 8 #define PTE_P 0x001UL 9 #define PTE_W 0x002UL 10 #define PTE_U 0x004UL 11 #define PTE_PCD 0x010UL 12 #define PTE_PS 0x080UL 13 14 #define KSEG (PTE_P | PTE_W | PTE_PS) 15 #define USEG (PTE_P | PTE_W | PTE_U | PTE_PS) 16 #define DSEG (PTE_P | PTE_W | PTE_PCD | PTE_PS) 17 18 __attribute__((aligned(4096))) static u64 pml4[512]; 19 __attribute__((aligned(4096))) static u64 pdpt_low[512]; 20 __attribute__((aligned(4096))) static u64 pdpt_alias[512]; 21 __attribute__((aligned(4096))) static u64 pd0[512]; 22 __attribute__((aligned(4096))) static u64 pd1[512]; 23 __attribute__((aligned(4096))) static u64 pd2[512]; 24 __attribute__((aligned(4096))) static u64 pd3[512]; 25 __attribute__((aligned(4096))) static u64 pda0[512]; 26 __attribute__((aligned(4096))) static u64 pda1[512]; 27 __attribute__((aligned(4096))) static u64 pda2[512]; 28 __attribute__((aligned(4096))) static u64 pda3[512]; 29 30 struct gdtr { 31 u16 limit; 32 u64 base; 33 } __attribute__((packed)); 34 35 struct idtr { 36 u16 limit; 37 u64 base; 38 } __attribute__((packed)); 39 40 struct idt_gate { 41 u16 off0; 42 u16 sel; 43 u8 ist; 44 u8 type; 45 u16 off1; 46 u32 off2; 47 u32 zero; 48 } __attribute__((packed)); 49 50 struct tss64 { 51 u32 reserved0; 52 u64 rsp0; 53 u64 rsp1; 54 u64 rsp2; 55 u64 reserved1; 56 u64 ist[7]; 57 u64 reserved2; 58 u16 reserved3; 59 u16 iopb; 60 } __attribute__((packed)); 61 62 static u64 gdt[7]; 63 static struct idt_gate idt[256]; 64 static struct tss64 tss; 65 66 extern void amd64_load_cr3(u64 pml4_pa); 67 extern void amd64_lgdt(const struct gdtr *g); 68 extern void amd64_lidt(const struct idtr *i); 69 extern void amd64_ltr(u16 sel); 70 extern void amd64_int80(void); 71 extern void amd64_unhandled(void); 72 extern void amd64_syscall_entry(void); 73 extern void amd64_wrmsr(u32 msr, u64 val); 74 extern u64 amd64_rdmsr(u32 msr); 75 extern char kstack_top[]; 76 77 #define MSR_EFER 0xc0000080U 78 #define MSR_STAR 0xc0000081U 79 #define MSR_LSTAR 0xc0000082U 80 #define MSR_SFMASK 0xc0000084U 81 82 static void set_gate(int vec, void (*fn)(void), int dpl) { 83 u64 a = (u64)fn; 84 idt[vec].off0 = (u16)a; 85 idt[vec].sel = 0x08; 86 idt[vec].ist = 0; 87 idt[vec].type = (u8)(0x8e | (dpl << 5)); 88 idt[vec].off1 = (u16)(a >> 16); 89 idt[vec].off2 = (u32)(a >> 32); 90 idt[vec].zero = 0; 91 } 92 93 static void setup_cpu_tables(void) { 94 gdt[0] = 0; 95 gdt[1] = 0x00af9a000000ffffUL; /* kernel code */ 96 gdt[2] = 0x00af92000000ffffUL; /* kernel data */ 97 gdt[3] = 0x00aff2000000ffffUL; /* user data */ 98 gdt[4] = 0x00affa000000ffffUL; /* user code */ 99 100 u64 base = (u64)&tss; 101 u64 limit = sizeof(tss) - 1; 102 gdt[5] = (limit & 0xffff) 103 | ((base & 0xffffff) << 16) 104 | (0x89UL << 40) 105 | (((limit >> 16) & 0xf) << 48) 106 | (((base >> 24) & 0xff) << 56); 107 gdt[6] = base >> 32; 108 109 tss.rsp0 = (u64)kstack_top; 110 tss.iopb = sizeof(tss); 111 112 struct gdtr gdtr = { (u16)(sizeof(gdt) - 1), (u64)gdt }; 113 amd64_lgdt(&gdtr); 114 amd64_ltr(0x28); 115 116 for (int i = 0; i < 256; i++) set_gate(i, amd64_unhandled, 0); 117 set_gate(0x80, amd64_int80, 3); 118 struct idtr idtr = { (u16)(sizeof(idt) - 1), (u64)idt }; 119 amd64_lidt(&idtr); 120 121 /* Modern x86_64 fast-syscall path: scheme1 (and any tcc-built user 122 * binary that follows the standard SysV/Linux amd64 ABI) emits the 123 * `syscall` instruction, which routes through MSR_LSTAR rather than 124 * the IDT. Without this block the first `syscall` raises #UD because 125 * EFER.SCE is clear, manifesting as the unhandled-vector PANIC that 126 * scheme1 hits ~0xa09 bytes into its prelude. 127 * 128 * STAR layout for sysret with REX.W=1: the CPU computes user CS as 129 * STAR[63:48]+16 and user SS as STAR[63:48]+8 (RPL forced to 3). 130 * Our user CS sel is 0x23 (gdt[4] | 3) and user SS sel is 0x1b 131 * (gdt[3] | 3), so STAR[63:48] = 0x10. Kernel side: STAR[47:32] = 8 132 * yields kernel CS=0x08 and kernel SS=0x10 on syscall entry, which 133 * matches our gdt[1]/gdt[2]. */ 134 amd64_wrmsr(MSR_EFER, amd64_rdmsr(MSR_EFER) | 1UL); 135 amd64_wrmsr(MSR_STAR, ((u64)0x10UL << 48) | ((u64)0x08UL << 32)); 136 amd64_wrmsr(MSR_LSTAR, (u64)amd64_syscall_entry); 137 amd64_wrmsr(MSR_SFMASK, 0x200UL); /* clear IF on syscall entry */ 138 } 139 140 static u64 pool_pa(int which) { 141 return which ? ARCH_USER_POOL_B_PA : ARCH_USER_POOL_A_PA; 142 } 143 144 static void fill_pd(u64 *pd, u64 first_slot, u64 flags) { 145 for (int i = 0; i < 512; i++) { 146 u64 pa = (first_slot + (u64)i) * 0x200000UL; 147 pd[i] = pa | flags; 148 } 149 } 150 151 static void fill_user_pd0(int which) { 152 fill_pd(pd0, 0, KSEG); 153 u64 base = pool_pa(which); 154 for (int i = ARCH_USER_POOL_FIRST_SLOT; i <= ARCH_USER_POOL_LAST_SLOT; i++) { 155 u64 pa = base + (u64)(i - ARCH_USER_POOL_FIRST_SLOT) * 0x200000UL; 156 pd0[i] = pa | USEG; 157 } 158 } 159 160 void arch_setup_mmu(void) { 161 for (int i = 0; i < 512; i++) { 162 pml4[i] = 0; 163 pdpt_low[i] = 0; 164 pdpt_alias[i] = 0; 165 } 166 167 fill_user_pd0(0); 168 fill_pd(pd1, 512, KSEG); 169 fill_pd(pd2, 1024, KSEG); 170 fill_pd(pd3, 1536, KSEG); 171 172 fill_pd(pda0, 0, DSEG); 173 fill_pd(pda1, 512, DSEG); 174 fill_pd(pda2, 1024, DSEG); 175 fill_pd(pda3, 1536, DSEG); 176 177 pdpt_low[0] = (u64)pd0 | PTE_P | PTE_W | PTE_U; 178 pdpt_low[1] = (u64)pd1 | PTE_P | PTE_W; 179 pdpt_low[2] = (u64)pd2 | PTE_P | PTE_W; 180 pdpt_low[3] = (u64)pd3 | PTE_P | PTE_W; 181 182 pdpt_alias[0] = (u64)pda0 | PTE_P | PTE_W; 183 pdpt_alias[1] = (u64)pda1 | PTE_P | PTE_W; 184 pdpt_alias[2] = (u64)pda2 | PTE_P | PTE_W; 185 pdpt_alias[3] = (u64)pda3 | PTE_P | PTE_W; 186 187 pml4[0] = (u64)pdpt_low | PTE_P | PTE_W | PTE_U; 188 pml4[256] = (u64)pdpt_alias | PTE_P | PTE_W; 189 190 setup_cpu_tables(); 191 amd64_load_cr3((u64)pml4); 192 } 193 194 void arch_swap_user_pool(int which) { 195 fill_user_pd0(which); 196 amd64_load_cr3((u64)pml4); 197 }