Diffstat (limited to 'sys')
-rw-r--r-- | sys/arch/amd64/amd64/cpu.c | 83
-rw-r--r-- | sys/arch/amd64/amd64/gdt.c | 29
-rw-r--r-- | sys/arch/amd64/amd64/genassym.cf | 17
-rw-r--r-- | sys/arch/amd64/amd64/identcpu.c | 7
-rw-r--r-- | sys/arch/amd64/amd64/lapic.c | 22
-rw-r--r-- | sys/arch/amd64/amd64/locore.S | 190
-rw-r--r-- | sys/arch/amd64/amd64/locore0.S | 47
-rw-r--r-- | sys/arch/amd64/amd64/machdep.c | 95
-rw-r--r-- | sys/arch/amd64/amd64/pmap.c | 230
-rw-r--r-- | sys/arch/amd64/amd64/spl.S | 12
-rw-r--r-- | sys/arch/amd64/amd64/trap.c | 32
-rw-r--r-- | sys/arch/amd64/amd64/vector.S | 237
-rw-r--r-- | sys/arch/amd64/conf/ld.script | 22
-rw-r--r-- | sys/arch/amd64/include/asm.h | 19
-rw-r--r-- | sys/arch/amd64/include/cpu.h | 29
-rw-r--r-- | sys/arch/amd64/include/cpu_full.h | 66
-rw-r--r-- | sys/arch/amd64/include/cpufunc.h | 5
-rw-r--r-- | sys/arch/amd64/include/frame.h | 16
-rw-r--r-- | sys/arch/amd64/include/frameasm.h | 76
-rw-r--r-- | sys/arch/amd64/include/gdt.h | 3
-rw-r--r-- | sys/arch/amd64/include/pmap.h | 18
-rw-r--r-- | sys/arch/amd64/include/specialreg.h | 5
22 files changed, 1020 insertions, 240 deletions
diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c index c8727f583b0..7e14b3709de 100644 --- a/sys/arch/amd64/amd64/cpu.c +++ b/sys/arch/amd64/amd64/cpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.c,v 1.111 2018/02/06 01:09:17 patrick Exp $ */ +/* $OpenBSD: cpu.c,v 1.112 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */ /*- @@ -81,7 +81,7 @@ #include <uvm/uvm_extern.h> #include <machine/codepatch.h> -#include <machine/cpu.h> +#include <machine/cpu_full.h> #include <machine/cpufunc.h> #include <machine/cpuvar.h> #include <machine/pmap.h> @@ -116,6 +116,14 @@ #include <machine/hibernate.h> #endif /* HIBERNATE */ +/* #define CPU_DEBUG */ + +#ifdef CPU_DEBUG +#define DPRINTF(x...) do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* CPU_DEBUG */ + int cpu_match(struct device *, void *, void *); void cpu_attach(struct device *, struct device *, void *); int cpu_activate(struct device *, int); @@ -172,7 +180,7 @@ struct cfdriver cpu_cd = { * CPU, on uniprocessors). The CPU info list is initialized to * point at it. */ -struct cpu_info cpu_info_primary = { 0, &cpu_info_primary }; +struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } }; struct cpu_info *cpu_info_list = &cpu_info_primary; @@ -338,8 +346,15 @@ cpu_attach(struct device *parent, struct device *self, void *aux) * structure, otherwise use the primary's. */ if (caa->cpu_role == CPU_ROLE_AP) { - ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK|M_ZERO); + struct cpu_info_full *cif; + + cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok); + ci = &cif->cif_cpu; #if defined(MULTIPROCESSOR) + ci->ci_tss = &cif->cif_tss; + ci->ci_gdt = (void *)(ci->ci_tss + 1); + memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE); + cpu_enter_pages(cif); if (cpu_info[cpunum] != NULL) panic("cpu at apic id %d already attached?", cpunum); cpu_info[cpunum] = ci; @@ -451,7 +466,6 @@ cpu_attach(struct device *parent, struct device *self, void *aux) #if defined(MULTIPROCESSOR) cpu_intr_init(ci); - gdt_alloc_cpu(ci); sched_init_cpu(ci); cpu_start_secondary(ci); ncpus++; @@ -938,3 +952,62 @@ cpu_activate(struct device *self, int act) return (0); } + +/* + * cpu_enter_pages + * + * Requests mapping of various special pages required in the Intel Meltdown + * case (to be entered into the U-K page table): + * + * 1 tss+gdt page for each CPU + * 1 trampoline stack page for each CPU + * + * The cpu_info_full struct for each CPU straddles these pages. The offset into + * 'cif' is calculated below, for each page. For more information, consult + * the definition of struct cpu_info_full in cpu_full.h + * + * On CPUs unaffected by Meltdown, this function still configures 'cif' but + * the calls to pmap_enter_special become no-ops. + * + * Parameters: + * cif : the cpu_info_full structure describing a CPU whose pages are to be + * entered into the special meltdown U-K page table. 
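For orientation, the page layout this function relies on, as implied by struct cpu_info_full (added in cpu_full.h further down) and the cpu_info_primary macro; this is an editorial sketch, not part of the patch:

	page 0   mapped kRO in U-K: struct x86_64_tss, followed by the GDT
	page 1   mapped kRW in U-K: trampoline stack, double-fault stack,
	         NMI stack, and the first 16 bytes of struct cpu_info
	         (ci_kern_cr3 and ci_scratch)
	page 2+  never mapped in U-K: the remainder of struct cpu_info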
+ */ +void +cpu_enter_pages(struct cpu_info_full *cif) +{ + vaddr_t va; + paddr_t pa; + + /* The TSS+GDT need to be readable */ + va = (vaddr_t)cif; + pmap_extract(pmap_kernel(), va, &pa); + pmap_enter_special(va, pa, PROT_READ); + DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)pa); + + /* The trampoline stack page needs to be read/write */ + va = (vaddr_t)&cif->cif_tramp_stack; + pmap_extract(pmap_kernel(), va, &pa); + pmap_enter_special(va, pa, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)pa); + + cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16; + DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__, + (uint64_t)cif->cif_tss.tss_rsp0); + cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 - + sizeof(struct iretq_frame); + +#define SETUP_IST_SPECIAL_STACK(ist, cif, member) do { \ + (cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member + \ + sizeof((cif)->member) - 16; \ + (cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \ +} while (0) + + SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack); + SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack); + + /* an empty iomap, by setting its offset to the TSS limit */ + cif->cif_tss.tss_iobase = sizeof(cif->cif_tss); +} diff --git a/sys/arch/amd64/amd64/gdt.c b/sys/arch/amd64/amd64/gdt.c index 8aa28a098f9..1372ebd083e 100644 --- a/sys/arch/amd64/amd64/gdt.c +++ b/sys/arch/amd64/amd64/gdt.c @@ -1,4 +1,4 @@ -/* $OpenBSD: gdt.c,v 1.25 2018/01/07 05:36:47 guenther Exp $ */ +/* $OpenBSD: gdt.c,v 1.26 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: gdt.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */ /*- @@ -40,33 +40,6 @@ #include <machine/pcb.h> /* - * Allocate shadow GDT for a slave cpu. 
- */ -void -gdt_alloc_cpu(struct cpu_info *ci) -{ - struct vm_page *pg; - vaddr_t va; - - ci->ci_gdt = (char *)uvm_km_valloc(kernel_map, - GDT_SIZE + sizeof(*ci->ci_tss)); - ci->ci_tss = (void *)(ci->ci_gdt + GDT_SIZE); - uvm_map_pageable(kernel_map, (vaddr_t)ci->ci_gdt, - (vaddr_t)ci->ci_gdt + GDT_SIZE, FALSE, FALSE); - for (va = (vaddr_t)ci->ci_gdt; - va < (vaddr_t)ci->ci_gdt + GDT_SIZE + sizeof(*ci->ci_tss); - va += PAGE_SIZE) { - pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); - if (pg == NULL) - panic("gdt_init: no pages"); - pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE); - } - memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE); - bzero(ci->ci_tss, sizeof(*ci->ci_tss)); -} - - -/* * Load appropriate gdt descriptor; we better be running on *ci */ void diff --git a/sys/arch/amd64/amd64/genassym.cf b/sys/arch/amd64/amd64/genassym.cf index f72dd494c9b..4d65a3a56a2 100644 --- a/sys/arch/amd64/amd64/genassym.cf +++ b/sys/arch/amd64/amd64/genassym.cf @@ -1,4 +1,4 @@ -# $OpenBSD: genassym.cf,v 1.33 2018/02/10 09:21:12 mpi Exp $ +# $OpenBSD: genassym.cf,v 1.34 2018/02/21 19:24:15 guenther Exp $ # Written by Artur Grabowski art@openbsd.org, Public Domain include <sys/param.h> @@ -78,6 +78,15 @@ member tf_ss define FRAMESIZE sizeof(struct trapframe) +struct iretq_frame +member IRETQ_CS iretq_cs +member IRETQ_RIP iretq_rip +member IRETQ_RFLAGS iretq_rflags +member IRETQ_RSP iretq_rsp +member IRETQ_SS iretq_ss + +define IRETQ_SIZE sizeof(struct iretq_frame) + struct pcb member pcb_cr3 member pcb_rsp @@ -91,6 +100,8 @@ member pcb_cr0 struct pmap member pm_cpus +member pm_pdirpa +member pm_pdirpa_intel struct x86_64_tss member tss_rsp0 @@ -115,6 +126,10 @@ endif member CPU_INFO_GDT ci_gdt member CPU_INFO_TSS ci_tss member CPU_INFO_FLAGS ci_flags +member CPU_INFO_KERN_CR3 ci_kern_cr3 +member CPU_INFO_USER_CR3 ci_user_cr3 +member CPU_INFO_KERN_RSP ci_kern_rsp +member CPU_INFO_INTR_RSP ci_intr_rsp export CPUF_USERSEGS_BIT diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c index 046fde6855a..4bc2e6d10d5 100644 --- a/sys/arch/amd64/amd64/identcpu.c +++ b/sys/arch/amd64/amd64/identcpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: identcpu.c,v 1.94 2018/02/10 09:46:58 jsg Exp $ */ +/* $OpenBSD: identcpu.c,v 1.95 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: identcpu.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */ /* @@ -208,6 +208,7 @@ const struct { { SEFF0EDX_AVX512_4FMAPS, "AVX512FMAPS" }, { SEFF0EDX_IBRS, "IBRS,IBPB" }, { SEFF0EDX_STIBP, "STIBP" }, + /* SEFF0EDX_ARCH_CAP (not printed) */ }, cpu_tpm_eaxfeatures[] = { { TPM_SENSOR, "SENSOR" }, { TPM_ARAT, "ARAT" }, @@ -455,6 +456,7 @@ identifycpu(struct cpu_info *ci) int i; char *brandstr_from, *brandstr_to; int skipspace; + extern uint32_t cpu_meltdown; CPUID(1, ci->ci_signature, val, dummy, ci->ci_feature_flags); CPUID(0x80000000, ci->ci_pnfeatset, dummy, dummy, dummy); @@ -612,6 +614,9 @@ identifycpu(struct cpu_info *ci) } } + if (cpu_meltdown) + printf(",MELTDOWN"); + printf("\n"); x86_print_cacheinfo(ci); diff --git a/sys/arch/amd64/amd64/lapic.c b/sys/arch/amd64/amd64/lapic.c index 6a1086c2f62..83ee4472d9f 100644 --- a/sys/arch/amd64/amd64/lapic.c +++ b/sys/arch/amd64/amd64/lapic.c @@ -1,4 +1,4 @@ -/* $OpenBSD: lapic.c,v 1.49 2017/10/14 04:44:43 jsg Exp $ */ +/* $OpenBSD: lapic.c,v 1.50 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: lapic.c,v 1.2 2003/05/08 01:04:35 fvdl Exp $ */ /*- @@ -59,6 +59,14 @@ #include <machine/i82093var.h> #endif +/* #define LAPIC_DEBUG */ + +#ifdef LAPIC_DEBUG +#define DPRINTF(x...) 
do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* LAPIC_DEBUG */ + struct evcount clk_count; #ifdef MULTIPROCESSOR struct evcount ipi_count; @@ -201,6 +209,7 @@ lapic_map(paddr_t lapic_base) codepatch_call(CPTAG_EOI, &x2apic_eoi); lapic_writereg(LAPIC_TPRI, s); + va = (vaddr_t)&local_apic; } else { /* * Map local apic. If we have a local apic, it's safe to @@ -220,6 +229,17 @@ lapic_map(paddr_t lapic_base) lapic_tpr = s; } + /* + * Enter the LAPIC MMIO page in the U-K page table for handling + * Meltdown (needed in the interrupt stub to acknowledge the + * incoming interrupt). On CPUs unaffected by Meltdown, + * pmap_enter_special is a no-op. + * XXX - need to map this PG_N + */ + pmap_enter_special(va, lapic_base, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered lapic page va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)lapic_base); + enable_intr(); } diff --git a/sys/arch/amd64/amd64/locore.S b/sys/arch/amd64/amd64/locore.S index 6e00ce3dddf..282a25310c6 100644 --- a/sys/arch/amd64/amd64/locore.S +++ b/sys/arch/amd64/amd64/locore.S @@ -1,4 +1,4 @@ -/* $OpenBSD: locore.S,v 1.93 2018/01/07 19:56:19 mlarkin Exp $ */ +/* $OpenBSD: locore.S,v 1.94 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $ */ /* @@ -113,6 +113,7 @@ #include <sys/syscall.h> #include <machine/param.h> +#include <machine/psl.h> #include <machine/segments.h> #include <machine/specialreg.h> #include <machine/frameasm.h> @@ -176,6 +177,7 @@ _C_LABEL(lapic_isr): .globl _C_LABEL(bootapiver) .globl _C_LABEL(pg_nx) .globl _C_LABEL(pg_g_kern) + .globl _C_LABEL(cpu_meltdown) _C_LABEL(cpu_id): .long 0 # saved from `cpuid' instruction _C_LABEL(cpu_feature): .long 0 # feature flags from 'cpuid' # instruction @@ -210,7 +212,8 @@ _C_LABEL(biosextmem): .long REALEXTMEM _C_LABEL(pg_nx): .quad 0 # NX PTE bit (if CPU supports) _C_LABEL(pg_g_kern): .quad 0 # 0x100 if global pages should be used # in kernel mappings, 0 otherwise (for - # Intel) + # insecure CPUs) +_C_LABEL(cpu_meltdown): .long 0 # 1 if this CPU has Meltdown #define _RELOC(x) ((x) - KERNBASE) #define RELOC(x) _RELOC(_C_LABEL(x)) @@ -236,7 +239,7 @@ gdt64_end: /*****************************************************************************/ /* - * Signal trampoline; copied to top of user stack. + * Signal trampoline; copied to a page mapped into userspace. * gdb's backtrace logic matches against the instructions in this. */ .section .rodata @@ -401,20 +404,34 @@ restore_saved: movq PCB_RSP(%r13),%rsp movq PCB_RBP(%r13),%rbp - movq CPUVAR(TSS),%rcx - movq PCB_KSTACK(%r13),%rdx - movq %rdx,TSS_RSP0(%rcx) - movq PCB_CR3(%r13),%rax - movq %rax,%cr3 + movq %rax,%cr3 /* %rax used below too */ /* Don't bother with the rest if switching to a system process. */ testl $P_SYSTEM,P_FLAG(%r12) jnz switch_restored + /* record the bits needed for future U-->K transition */ + movq PCB_KSTACK(%r13),%rdx + subq $FRAMESIZE,%rdx + movq %rdx,CPUVAR(KERN_RSP) + movq PCB_PMAP(%r13),%rcx + + /* + * Meltdown: iff we're doing separate U+K and U-K page tables, + * then record them in cpu_info for easy access in syscall and + * interrupt trampolines. 
XXX code patch this + */ + + movq PM_PDIRPA_INTEL(%rcx),%rdx + testq %rdx,%rdx + jz 0f /* yay, no intel suckiness */ + movq %rax,CPUVAR(KERN_CR3) + movq %rdx,CPUVAR(USER_CR3) +0: + /* set the new pmap's bit for the cpu */ movl CPUVAR(CPUID),%edi - movq PCB_PMAP(%r13),%rcx lock btsq %rdi,PM_CPUS(%rcx) #ifdef DIAGNOSTIC @@ -503,8 +520,7 @@ IDTVEC(syscall32) sysret /* go away please */ /* - * syscall insn entry. This currently isn't much faster, but - * it can be made faster in the future. + * syscall insn entry. */ IDTVEC(syscall) /* @@ -514,13 +530,20 @@ IDTVEC(syscall) * the user-space value. * First order of business is to swap to the kernel gs.base so that * we can access our struct cpu_info and use the scratch space there - * to switch to our kernel stack. Once that's in place we can + * to switch to the kernel page tables (thank you, Intel), then + * switch to our kernel stack. Once that's in place we can * unblock interrupts and save the rest of the syscall frame. */ swapgs movq %r15,CPUVAR(SCRATCH) - movq CPUVAR(CURPCB),%r15 - movq PCB_KSTACK(%r15),%r15 + movq CPUVAR(KERN_CR3),%r15 + testq %r15,%r15 + jz Xsyscall_untramp + movq %r15,%cr3 + jmp Xsyscall_untramp + +NENTRY(Xsyscall_untramp) + movq CPUVAR(KERN_RSP),%r15 xchgq %r15,%rsp sti @@ -531,12 +554,11 @@ IDTVEC(syscall) * ss:rsp, etc, so that all GP registers can be * saved. Then, fill in the rest. */ - pushq $(GSEL(GUDATA_SEL, SEL_UPL)) - pushq %r15 - subq $(TF_RSP-TF_TRAPNO),%rsp + movq $(GSEL(GUDATA_SEL, SEL_UPL)),TF_SS(%rsp) + movq %r15,TF_RSP(%rsp) movq CPUVAR(SCRATCH),%r15 - subq $32,%rsp - INTR_SAVE_GPRS + INTR_SAVE_MOST_GPRS_NO_ADJ + movq %rcx,TF_RCX(%rsp) movq %r11, TF_RFLAGS(%rsp) /* old rflags from syscall insn */ movq $(GSEL(GUCODE_SEL, SEL_UPL)), TF_CS(%rsp) movq %rcx,TF_RIP(%rsp) @@ -581,16 +603,45 @@ IDTVEC(syscall) movq TF_RBP(%rsp),%rbp movq TF_RBX(%rsp),%rbx - INTR_RESTORE_SELECTORS + /* Restore FS.base if it's not already in the CPU */ + btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS) + jc 99f + movq CPUVAR(CURPCB),%rdx + movq PCB_FSBASE(%rdx),%rax + movq %rax,%rdx + shrq $32,%rdx + movl $MSR_FSBASE,%ecx + wrmsr +99: + /* + * We need to finish reading from the trapframe, then switch + * to the user page tables, swapgs, and return. We need + * to get the final value for the register that was used + * for the mov to %cr3 from somewhere accessible on the + * user page tables, so save it in CPUVAR(SCRATCH) across + * the switch. + */ movq TF_RDX(%rsp),%rdx movq TF_RAX(%rsp),%rax + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(USER_CR3),%rax movq TF_RIP(%rsp),%rcx movq TF_RFLAGS(%rsp),%r11 movq TF_RSP(%rsp),%rsp + testq %rax,%rax + jz 1f + jmp syscall_trampback + +KUENTRY(syscall_trampback) + movq %rax,%cr3 +1: movq CPUVAR(SCRATCH),%rax + swapgs sysretq + .text + #ifdef DIAGNOSTIC .Lsyscall_spl_not_lowered: movabsq $spl_lowered, %rdi @@ -627,6 +678,12 @@ NENTRY(proc_trampoline) * Return via iretq, for real interrupts and signal returns */ NENTRY(intr_fast_exit) +#ifdef DIAGNOSTIC + pushfq + popq %rdx + testq $PSL_I,%rdx + jnz .Lintr_exit_not_blocked +#endif /* DIAGNOSTIC */ movq TF_RDI(%rsp),%rdi movq TF_RSI(%rsp),%rsi movq TF_R8(%rsp),%r8 @@ -640,11 +697,68 @@ NENTRY(intr_fast_exit) movq TF_RBX(%rsp),%rbx testq $SEL_RPL,TF_CS(%rsp) - je 5f + je intr_exit_recurse /* returning back to kernel? */ + + /* returning to userspace. 
XXX fix up iret frame here */ + + /* restore FS.base if it's not already in the CPU */ + btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS) + jc 99f + movq CPUVAR(CURPCB),%rdx /* for below */ + movq PCB_FSBASE(%rdx),%rax + movq %rax,%rdx + shrq $32,%rdx + movl $MSR_FSBASE,%ecx + wrmsr +99: + /* + * Returning to userspace. We need to go things in this order: + * - update the iret frame from the trapframe + * - finish reading from the trapframe + * - switch to the trampoline stack + * - jump to the .kutext segment + * - switch to the user page tables + * - swapgs + * - iretq + * To get the final value for the register that was used + * for the mov to %cr3, we need access to somewhere accessible + * on the user page tables, so we save it in CPUVAR(SCRATCH) + * across the switch. + */ + /* update iret frame */ + movq CPUVAR(INTR_RSP),%rdx + movq $(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx) + movq TF_RIP(%rsp),%rax + movq %rax,IRETQ_RIP(%rdx) + movq TF_RFLAGS(%rsp),%rax + movq %rax,IRETQ_RFLAGS(%rdx) + movq TF_RSP(%rsp),%rax + movq %rax,IRETQ_RSP(%rdx) + movq $(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx) + /* finish with the trap frame */ + movq TF_RAX(%rsp),%rax + movq %rax,CPUVAR(SCRATCH) + movq TF_RCX(%rsp),%rcx + movq TF_R11(%rsp),%r11 + /* switch to the trampoline stack */ + xchgq %rdx,%rsp + movq TF_RDX(%rdx),%rdx + movq CPUVAR(USER_CR3),%rax + testq %rax,%rax + jz 1f + jmp iretq_tramp - INTR_RESTORE_SELECTORS +KUENTRY(iretq_tramp) + movq %rax,%cr3 +1: movq CPUVAR(SCRATCH),%rax + swapgs -5: movq TF_RDX(%rsp),%rdx + .globl _C_LABEL(doreti_iret) +_C_LABEL(doreti_iret): + iretq + +NENTRY(intr_exit_recurse) + movq TF_RDX(%rsp),%rdx movq TF_RCX(%rsp),%rcx movq TF_R11(%rsp),%r11 movq TF_RAX(%rsp),%rax @@ -662,9 +776,6 @@ NENTRY(intr_fast_exit) #endif /* !defined(GPROF) && defined(DDBPROF) */ addq $TF_RIP,%rsp - - .globl _C_LABEL(doreti_iret) -_C_LABEL(doreti_iret): iretq @@ -697,6 +808,33 @@ _C_LABEL(doreti_iret): addq $TF_RIP,%rsp iretq #endif /* !defined(GPROF) && defined(DDBPROF) */ + .text + +#ifdef DIAGNOSTIC +.Lintr_exit_not_blocked: + xchgw %bx, %bx + movl warn_once(%rip),%edi + testl %edi,%edi + jnz 1f + incl %edi + movl %edi,warn_once(%rip) + leaq .Lnot_blocked(%rip),%rdi + call _C_LABEL(printf) +#ifdef DDB + int $3 +#endif /* DDB */ +1: cli + jmp intr_fast_exit + + .data +.global warn_once +warn_once: + .long 0 + .section .rodata +.Lnot_blocked: + .asciz "WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n" + .text +#endif ENTRY(xrstor_user) movq %rsi, %rdx diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S index 50f0c7ecd82..53ef3672be5 100644 --- a/sys/arch/amd64/amd64/locore0.S +++ b/sys/arch/amd64/amd64/locore0.S @@ -1,4 +1,4 @@ -/* $OpenBSD: locore0.S,v 1.6 2018/01/07 19:56:19 mlarkin Exp $ */ +/* $OpenBSD: locore0.S,v 1.7 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $ */ /* @@ -205,26 +205,47 @@ bi_size_ok: movl $0, 12(%ebp) /* - * Determine if CPU is Intel. Intel CPUs cannot use PG_G (global - * pages) in kernel mappings. If CPU is not Intel, this is safe. - * Cache the result in pg_g_kern - 0 if not supported or PG_G (0x100) - * if supported. - * - * This treatment is required for the meltdown CVE mitigation. + * Determine if CPU has meltdown. Certain Intel CPUs do not properly + * respect page permissions when speculatively loading data into + * the cache ("Meltdown" CVE). 
These CPUs must utilize a secondary + * sanitized page table lacking kernel mappings when executing user + * processes, and may not use PG_G global PTEs for kernel VAs. */ + movl $0x1, RELOC(cpu_meltdown) /* assume insecure at first */ + movl $0x0, RELOC(pg_g_kern) + cmpl $0x756e6547, %ebx # "Genu" - jne not_intel + jne .Lcpu_secure cmpl $0x6c65746e, %ecx # "ntel" - jne not_intel + jne .Lcpu_secure cmpl $0x49656e69, %edx # "ineI" - jne not_intel + jne .Lcpu_secure - jmp pg_g_check_finished + /* + * Intel CPU, now check if IA32_ARCH_CAPABILITIES is supported and + * if it says this CPU is safe. + */ + movl $0x0, %eax + cpuid + cmpl $0x7, %eax + jl .Lcpu_check_finished + + movl $0x7, %eax + cpuid + testl $SEFF0EDX_ARCH_CAP, %edx + jz .Lcpu_check_finished + + /* IA32_ARCH_CAPABILITIES MSR avaialble, use it to check CPU security */ + movl $MSR_ARCH_CAPABILITIES, %ecx + rdmsr + testl $ARCH_CAPABILITIES_RDCL_NO, %eax + jz .Lcpu_check_finished -not_intel: +.Lcpu_secure: + movl $0x0, RELOC(cpu_meltdown) movl $PG_G, RELOC(pg_g_kern) -pg_g_check_finished: +.Lcpu_check_finished: movl $1,%eax cpuid movl %eax,RELOC(cpu_id) diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c index a1d5e02f340..dd0623b15c4 100644 --- a/sys/arch/amd64/amd64/machdep.c +++ b/sys/arch/amd64/amd64/machdep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: machdep.c,v 1.238 2018/02/06 01:09:17 patrick Exp $ */ +/* $OpenBSD: machdep.c,v 1.239 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $ */ /*- @@ -90,7 +90,7 @@ #include <sys/sysctl.h> -#include <machine/cpu.h> +#include <machine/cpu_full.h> #include <machine/cpufunc.h> #include <machine/pio.h> #include <machine/psl.h> @@ -141,6 +141,14 @@ extern int db_console; #include <dev/ic/pckbcvar.h> #endif +/* #define MACHDEP_DEBUG */ + +#ifdef MACHDEP_DEBUG +#define DPRINTF(x...) do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* MACHDEP_DEBUG */ + /* the following is used externally (sysctl_hw) */ char machine[] = MACHINE; @@ -257,6 +265,7 @@ void cpu_init_extents(void); void map_tramps(void); void init_x86_64(paddr_t); void (*cpuresetfn)(void); +void enter_shared_special_pages(void); #ifdef APERTURE int allowaperture = 0; @@ -313,6 +322,65 @@ cpu_startup(void) #ifndef SMALL_KERNEL cpu_ucode_setup(); #endif + /* enter the IDT and trampoline code in the u-k maps */ + enter_shared_special_pages(); + + /* initialize CPU0's TSS and GDT and put them in the u-k maps */ + cpu_enter_pages(&cpu_info_full_primary); +} + +/* + * enter_shared_special_pages + * + * Requests mapping of various special pages required in the Intel Meltdown + * case (to be entered into the U-K page table): + * + * 1 IDT page + * Various number of pages covering the U-K ".kutext" section. This section + * contains code needed during trampoline operation + * Various number of pages covering the U-K ".kudata" section. This section + * contains data accessed by the trampoline, before switching to U+K + * (for example, various shared global variables used by IPIs, etc) + * + * The linker script places the required symbols in the sections above. + * + * On CPUs not affected by Meltdown, the calls to pmap_enter_special below + * become no-ops. 
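The cpu_meltdown flag consulted by these functions is computed by the locore0.S hunk above; the same decision procedure is easier to follow in C. The following is an editorial sketch, not part of the commit; detect_meltdown is a hypothetical name, while the CPUID/CPUID_LEAF/rdmsr helpers and the specialreg.h constants are the ones this commit uses elsewhere:

	int
	detect_meltdown(void)
	{
		uint32_t eax, ebx, ecx, edx;

		CPUID(0, eax, ebx, ecx, edx);
		if (ebx != 0x756e6547 || edx != 0x49656e69 ||
		    ecx != 0x6c65746e)
			return 0;	/* not "GenuineIntel": treated as safe */
		if (eax < 7)
			return 1;	/* no SEFF leaf: assume vulnerable */
		CPUID_LEAF(7, 0, eax, ebx, ecx, edx);
		if ((edx & SEFF0EDX_ARCH_CAP) == 0)
			return 1;	/* can't prove safety: assume vulnerable */
		return (rdmsr(MSR_ARCH_CAPABILITIES) &
		    ARCH_CAPABILITIES_RDCL_NO) ? 0 : 1;
	}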
+ */ +void +enter_shared_special_pages(void) +{ + extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[]; + extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[]; + vaddr_t va; + paddr_t pa; + + /* idt */ + pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ); + DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)idt_vaddr, (uint64_t)idt_paddr); + + /* .kutext section */ + va = (vaddr_t)__kutext_start; + pa = (paddr_t)__kernel_kutext_phys; + while (va < (vaddr_t)__kutext_end) { + pmap_enter_special(va, pa, PROT_READ | PROT_EXEC); + DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n", + __func__, (uint64_t)va, (uint64_t)pa); + va += PAGE_SIZE; + pa += PAGE_SIZE; + } + + /* .kudata section */ + va = (vaddr_t)__kudata_start; + pa = (paddr_t)__kernel_kudata_phys; + while (va < (vaddr_t)__kudata_end) { + pmap_enter_special(va, pa, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n", + __func__, (uint64_t)va, (uint64_t)pa); + va += PAGE_SIZE; + pa += PAGE_SIZE; + } } /* @@ -329,12 +397,6 @@ x86_64_proc0_tss_ldt_init(void) pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16; proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1; - /* an empty iomap, by setting its offset to the TSS limit */ - cpu_info_primary.ci_tss->tss_iobase = sizeof(struct x86_64_tss); - cpu_info_primary.ci_tss->tss_rsp0 = pcb->pcb_kstack; - cpu_info_primary.ci_tss->tss_ist[0] = - (u_int64_t)proc0.p_addr + PAGE_SIZE - 16; - ltr(GSYSSEL(GPROC0_SEL, SEL_KPL)); lldt(0); } @@ -346,15 +408,11 @@ x86_64_proc0_tss_ldt_init(void) #ifdef MULTIPROCESSOR void x86_64_init_pcb_tss_ldt(struct cpu_info *ci) -{ +{ struct pcb *pcb = ci->ci_idle_pcb; - ci->ci_tss->tss_iobase = sizeof(*ci->ci_tss); - ci->ci_tss->tss_rsp0 = pcb->pcb_kstack; - ci->ci_tss->tss_ist[0] = pcb->pcb_kstack - USPACE + PAGE_SIZE; - pcb->pcb_cr0 = rcr0(); -} +} #endif /* MULTIPROCESSOR */ bios_diskinfo_t * @@ -1551,8 +1609,6 @@ init_x86_64(paddr_t first_avail) pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE); - pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE, - PROT_READ | PROT_WRITE); #if defined(MULTIPROCESSOR) || \ (NACPI > 0 && !defined(SMALL_KERNEL)) @@ -1560,7 +1616,7 @@ init_x86_64(paddr_t first_avail) #endif idt = (struct gate_descriptor *)idt_vaddr; - cpu_info_primary.ci_tss = (void *)(idt + NIDT); + cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss; cpu_info_primary.ci_gdt = (void *)(cpu_info_primary.ci_tss + 1); /* make gdt gates and memory segments */ @@ -1585,9 +1641,10 @@ init_x86_64(paddr_t first_avail) /* exceptions */ for (x = 0; x < 32; x++) { - ist = (x == 8) ? 1 : 0; + /* trap2 == NMI, trap8 == double fault */ + ist = (x == 2) ? 2 : (x == 8) ? 1 : 0; setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT, - (x == 3 || x == 4) ? SEL_UPL : SEL_KPL, + (x == 3) ? SEL_UPL : SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); idt_allocmap[x] = 1; } diff --git a/sys/arch/amd64/amd64/pmap.c b/sys/arch/amd64/amd64/pmap.c index bb7ba397bbe..3e559206608 100644 --- a/sys/arch/amd64/amd64/pmap.c +++ b/sys/arch/amd64/amd64/pmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.c,v 1.108 2018/01/07 19:56:19 mlarkin Exp $ */ +/* $OpenBSD: pmap.c,v 1.109 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */ /* @@ -119,6 +119,15 @@ #include "acpi.h" +/* #define PMAP_DEBUG */ + +#ifdef PMAP_DEBUG +#define DPRINTF(x...) 
do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* PMAP_DEBUG */ + + /* * general info: * @@ -255,6 +264,7 @@ TAILQ_HEAD(pg_to_free, vm_page); struct pool pmap_pdp_pool; void pmap_pdp_ctor(pd_entry_t *); +void pmap_pdp_ctor_intel(pd_entry_t *); extern vaddr_t msgbuf_vaddr; extern paddr_t msgbuf_paddr; @@ -268,6 +278,8 @@ extern vaddr_t lo32_paddr; vaddr_t virtual_avail; extern int end; +extern uint32_t cpu_meltdown; + /* * local prototypes */ @@ -309,7 +321,6 @@ void pmap_tlb_shootwait(void); #define pmap_tlb_shootwait() #endif - /* * p m a p i n l i n e h e l p e r f u n c t i o n s */ @@ -323,7 +334,8 @@ static __inline boolean_t pmap_is_curpmap(struct pmap *pmap) { return((pmap == pmap_kernel()) || - (pmap->pm_pdirpa == (paddr_t) rcr3())); + (pmap->pm_pdirpa == (paddr_t) rcr3()) || + (pmap->pm_pdirpa_intel == (paddr_t) rcr3())); } /* @@ -484,7 +496,6 @@ pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs) return (0); } - /* * p m a p k e n t e r f u n c t i o n s * @@ -586,12 +597,12 @@ pmap_kremove(vaddr_t sva, vsize_t len) paddr_t pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) { - vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS; + vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS; struct pmap *kpm; int i; - unsigned long p1i; long ndmpdp; paddr_t dmpd, dmpdp; + vaddr_t kva, kva_end; /* * define the boundaries of the managed kernel virtual address @@ -643,9 +654,14 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) curpcb->pcb_pmap = kpm; /* proc0's pcb */ /* - * enable global TLB entries. + * Add PG_G attribute to already mapped kernel pages. pg_g_kern + * is calculated in locore0.S and may be set to: + * + * 0 if this CPU does not safely support global pages in the kernel + * (Intel/Meltdown) + * PG_G if this CPU does safely support global pages in the kernel + * (AMD) */ - /* add PG_G attribute to already mapped kernel pages */ #if KERNBASE == VM_MIN_KERNEL_ADDRESS for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ; #else @@ -653,7 +669,7 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) for (kva = KERNBASE; kva < kva_end ; #endif kva += PAGE_SIZE) { - p1i = pl1_i(kva); + unsigned long p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) PTE_BASE[p1i] |= pg_g_kern; } @@ -726,7 +742,7 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) LIST_INIT(&pmaps); /* - * initialize the pmap pool. + * initialize the pmap pools. */ pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_NONE, 0, @@ -742,6 +758,9 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_NONE, PR_WAITOK, "pdppl", NULL); + kpm->pm_pdir_intel = 0; + kpm->pm_pdirpa_intel = 0; + /* * ensure the TLB is sync'd with reality by flushing it... */ @@ -894,13 +913,21 @@ pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, unsigned long index; int level; vaddr_t invaladdr; - pd_entry_t opde; + pd_entry_t opde, *mdpml4es; level = 1; do { pmap_freepage(pmap, ptp, level, pagelist); index = pl_i(va, level + 1); opde = pmap_pte_set(&pdes[level - 1][index], 0); + if (level == 3 && pmap->pm_pdir_intel) { + /* Zap special meltdown PML4e */ + mdpml4es = (pd_entry_t *)pmap->pm_pdir_intel; + opde = pmap_pte_set(&mdpml4es[index], 0); + DPRINTF("%s: cleared meltdown PML4e @ index %lu " + "(va range start 0x%llx)\n", __func__, index, + (uint64_t)(index << L4_SHIFT)); + } invaladdr = level == 1 ? 
(vaddr_t)ptes : (vaddr_t)pdes[level - 2]; pmap_tlb_shootpage(curpcb->pcb_pmap, @@ -934,7 +961,7 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes) struct vm_page *ptp, *pptp; int i; unsigned long index; - pd_entry_t *pva; + pd_entry_t *pva, *pva_intel; paddr_t ppa, pa; struct uvm_object *obj; @@ -973,6 +1000,20 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes) pmap->pm_ptphint[i - 2] = ptp; pa = VM_PAGE_TO_PHYS(ptp); pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V); + + /* + * Meltdown Special case - if we are adding a new PML4e for + * usermode addresses, just copy the PML4e to the U-K page + * table. + */ + if (pmap->pm_pdir_intel && i == 4 && va < VM_MAXUSER_ADDRESS) { + pva_intel = (pd_entry_t *)pmap->pm_pdir_intel; + pva_intel[index] = pva[index]; + DPRINTF("%s: copying usermode PML4e (content=0x%llx) " + "from 0x%llx -> 0x%llx\n", __func__, pva[index], + (uint64_t)&pva[index], (uint64_t)&pva_intel[index]); + } + pmap->pm_stats.resident_count++; /* * If we're not in the top level, increase the @@ -1048,6 +1089,15 @@ pmap_pdp_ctor(pd_entry_t *pdir) #endif } +void +pmap_pdp_ctor_intel(pd_entry_t *pdir) +{ + struct pmap *kpm = pmap_kernel(); + + /* Copy PML4es from pmap_kernel's U-K view */ + memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE); +} + /* * pmap_create: create a pmap * @@ -1088,6 +1138,22 @@ pmap_create(void) pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME; + /* + * Intel CPUs need a special page table to be used during usermode + * execution, one that lacks all kernel mappings. + */ + if (cpu_meltdown) { + pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK); + pmap_pdp_ctor_intel(pmap->pm_pdir_intel); + if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel, + &pmap->pm_pdirpa_intel)) + panic("%s: unknown PA mapping for meltdown PML4\n", + __func__); + } else { + pmap->pm_pdir_intel = 0; + pmap->pm_pdirpa_intel = 0; + } + LIST_INSERT_HEAD(&pmaps, pmap, pm_list); return (pmap); } @@ -1145,6 +1211,9 @@ pmap_destroy(struct pmap *pmap) /* XXX: need to flush it out of other processor's space? */ pool_put(&pmap_pdp_pool, pmap->pm_pdir); + if (pmap->pm_pdir_intel) + pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel); + pool_put(&pmap_pmap_pool, pmap); } @@ -1959,6 +2028,137 @@ pmap_collect(struct pmap *pmap) * defined as macro in pmap.h */ +void +pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot) +{ + uint64_t l4idx, l3idx, l2idx, l1idx; + pd_entry_t *pd, *ptp; + paddr_t npa; + struct pmap *pmap = pmap_kernel(); + + /* If CPU is secure, no need to do anything */ + if (!cpu_meltdown) + return; + + /* Must be kernel VA */ + if (va < VM_MIN_KERNEL_ADDRESS) + panic("%s: invalid special mapping va 0x%lx requested", + __func__, va); + + if (!pmap->pm_pdir_intel) + pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, + PR_WAITOK | PR_ZERO); + + l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */ + l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ + l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */ + l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */ + + DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld " + "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va, + (uint64_t)pa, l4idx, l3idx, l2idx, l1idx); + + /* Start at PML4 / top level */ + pd = (pd_entry_t *)pmap->pm_pdir_intel; + + if (!pd) + panic("%s: PML4 not initialized for pmap @ %p\n", __func__, + pmap); + + /* npa = physaddr of PDPT */ + npa = pd[l4idx] & PMAP_PA_MASK; + + /* Valid PML4e for the 512GB region containing va? 
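A worked example may help here (editorial, using a hypothetical address): for va = 0xffffffff80000000, KERNBASE on amd64, the index computation above gives

	l4idx = (va & L4_MASK) >> L4_SHIFT = 511	/* bits 47-39 */
	l3idx = (va & L3_MASK) >> L3_SHIFT = 510	/* bits 38-30 */
	l2idx = (va & L2_MASK) >> L2_SHIFT = 0		/* bits 29-21 */
	l1idx = (va & L1_MASK) >> L1_SHIFT = 0		/* bits 20-12 */

so the first such call allocates a PDPT, a PD and a PT under PML4 slot 511 of the U-K table and fills in PTE 0 of that PT; later calls in the same 2MB region find the intermediate levels already present and only set the PTE.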
*/ + if (!npa) { + /* No valid PML4E - allocate PDPT page and set PML4E */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PDPT page\n", __func__); + + pd[l4idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PDPT page at phys 0x%llx, " + "setting PML4e[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l4idx, pd[l4idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PDPT @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + /* npa = physaddr of PD page */ + npa = pd[l3idx] & PMAP_PA_MASK; + + /* Valid PDPTe for the 1GB region containing va? */ + if (!npa) { + /* No valid PDPTe - allocate PD page and set PDPTe */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PD page\n", __func__); + + pd[l3idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PD page at phys 0x%llx, " + "setting PDPTe[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l3idx, pd[l3idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PD page @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + /* npa = physaddr of PT page */ + npa = pd[l2idx] & PMAP_PA_MASK; + + /* Valid PDE for the 2MB region containing va? */ + if (!npa) { + /* No valid PDE - allocate PT page and set PDE */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PT page\n", __func__); + + pd[l2idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PT page at phys 0x%llx, " + "setting PDE[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l2idx, pd[l2idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PT page @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot " + "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd, (uint64_t)prot, (uint64_t)pd[l1idx]); + + pd[l1idx] = pa | protection_codes[prot] | PG_V | pg_g_kern | PG_W; + DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]); + + if (pg_g_kern) { + /* now set the PG_G flag on the corresponding U+K entry */ + pt_entry_t *ptes; + int level, offs; + + level = pmap_find_pte_direct(pmap, va, &ptes, &offs); + if (__predict_true(level == 0 && + pmap_valid_entry(ptes[offs]))) { + ptes[offs] |= pg_g_kern; + } else { + DPRINTF("%s: no U+K mapping for special mapping?\n", + __func__); + } + } +} + /* * pmap_enter: enter a mapping into a pmap * @@ -2439,10 +2639,10 @@ pmap_convert(struct pmap *pmap, int mode) * release the lock if we get an interrupt in a bad moment. 
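The __attribute__((section(".kudata"))) annotations just below are the data-side counterpart of .kutext; a minimal sketch of the pattern, assuming the .kudata output section added to ld.script further down:

	/* Variables that interrupt stubs may touch while a CPU is still
	 * (or stays) on the U-K page tables must live in .kudata, which
	 * enter_shared_special_pages() maps at boot.  The name below is
	 * made up; tlb_shoot_wait and friends are the real instances. */
	volatile long example_ipi_flag __attribute__((section(".kudata")));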
*/ -volatile long tlb_shoot_wait; +volatile long tlb_shoot_wait __attribute__((section(".kudata"))); -volatile vaddr_t tlb_shoot_addr1; -volatile vaddr_t tlb_shoot_addr2; +volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata"))); +volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata"))); void pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself) diff --git a/sys/arch/amd64/amd64/spl.S b/sys/arch/amd64/amd64/spl.S index c4b6fe697b6..2ea315f2fb5 100644 --- a/sys/arch/amd64/amd64/spl.S +++ b/sys/arch/amd64/amd64/spl.S @@ -1,4 +1,4 @@ -/* $OpenBSD: spl.S,v 1.11 2016/05/20 14:37:53 deraadt Exp $ */ +/* $OpenBSD: spl.S,v 1.12 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: spl.S,v 1.3 2004/06/28 09:13:11 fvdl Exp $ */ /* @@ -114,7 +114,7 @@ _C_LABEL(splx): * a lower-prio one first, which needs to take the kernel lock --> * the sending CPU will never see the that CPU accept the IPI */ -IDTVEC(spllower) +KIDTVEC(spllower) _PROF_PROLOGUE pushq %rbx pushq %r13 @@ -143,7 +143,7 @@ IDTVEC(spllower) * ebx - cpl to restore * r13 - address to resume loop at */ -IDTVEC(doreti) +KIDTVEC(doreti) popq %rbx # get previous priority decl CPUVAR(IDEPTH) leaq 1f(%rip),%r13 @@ -168,4 +168,8 @@ IDTVEC(doreti) call _C_LABEL(ast) cli jmp 5b -3: INTRFASTEXIT +3: +#ifdef DIAGNOSTIC + movl $254,%esi +#endif /* DIAGNOSTIC */ + INTRFASTEXIT diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c index 47b3bee5128..dc2d115c207 100644 --- a/sys/arch/amd64/amd64/trap.c +++ b/sys/arch/amd64/amd64/trap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: trap.c,v 1.63 2018/01/05 11:10:25 pirofti Exp $ */ +/* $OpenBSD: trap.c,v 1.64 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: trap.c,v 1.2 2003/05/04 23:51:56 fvdl Exp $ */ /*- @@ -212,6 +212,18 @@ trap(struct trapframe *frame) frame->tf_rip = (u_int64_t)xrstor_resume; return; } + + /* + * Check for failure during return to user mode. + * We do this by looking at the address of the + * instruction that faulted. + */ + if (frame->tf_rip == (u_int64_t)doreti_iret) { + frame->tf_rip = (u_int64_t)resume_iret; + return; + } + /* FALLTHROUGH */ + case T_SEGNPFLT: case T_ALIGNFLT: case T_TSSFLT: @@ -223,16 +235,6 @@ copyfault: frame->tf_rip = (u_int64_t)pcb->pcb_onfault; return; } - - /* - * Check for failure during return to user mode. - * We do this by looking at the address of the - * instruction that faulted. 
- */ - if (frame->tf_rip == (u_int64_t)doreti_iret) { - frame->tf_rip = (u_int64_t)resume_iret; - return; - } goto we_re_toast; case T_PROTFLT|T_USER: /* protection fault */ @@ -459,8 +461,12 @@ out: static void frame_dump(struct trapframe *tf) { - printf("rip %p rsp %p rfl %p\n", - (void *)tf->tf_rip, (void *)tf->tf_rsp, (void *)tf->tf_rflags); + printf("rip %p cs 0x%x rfl %p rsp %p ss 0x%x\n", + (void *)tf->tf_rip, (unsigned)tf->tf_cs & 0xffff, + (void *)tf->tf_rflags, + (void *)tf->tf_rsp, (unsigned)tf->tf_ss & 0xffff); + printf("err 0x%llx trapno 0x%llx\n", + tf->tf_err, tf->tf_trapno); printf("rdi %p rsi %p rdx %p\n", (void *)tf->tf_rdi, (void *)tf->tf_rsi, (void *)tf->tf_rdx); printf("rcx %p r8 %p r9 %p\n", diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S index 730220af132..5de23fe67ab 100644 --- a/sys/arch/amd64/amd64/vector.S +++ b/sys/arch/amd64/amd64/vector.S @@ -1,4 +1,4 @@ -/* $OpenBSD: vector.S,v 1.51 2017/10/04 02:10:33 guenther Exp $ */ +/* $OpenBSD: vector.S,v 1.52 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: vector.S,v 1.5 2004/06/28 09:13:11 fvdl Exp $ */ /* @@ -104,36 +104,97 @@ #define TRAP(a) pushq $(a) ; jmp _C_LABEL(alltraps) #define ZTRAP(a) pushq $0 ; TRAP(a) - .text IDTVEC(trap00) ZTRAP(T_DIVIDE) IDTVEC(trap01) ZTRAP(T_TRCTRAP) + +/* + * NMIs can happen at any time, so there's no simple way to tell + * which GS.base is in place at the time of the interrupt. Instead, + * borrow a couple ideas from FreeBSD and put the CPU's kernel + * GS.base in the memory right above the stack, storing the current + * one in a pair of callee-saved registers (%r12/13). We save the + * current %cr3 in a callee-saved register too (%r15). + * Note: we don't unblock interrupts because a nested normal interrupt + * would also reenable NMIs. + */ IDTVEC(trap02) - ZTRAP(T_NMI) + pushq $0 + pushq $T_NMI +calltrap_specstk: # special stack path + INTR_REENTRY + movl $MSR_FSBASE,%ecx # save current GS.base... 
+ rdmsr + movq %rax,%r12 # ...in %r12 and %r13 + movq %rdx,%r13 + movq FRAMESIZE(%rsp),%rax # get kernel GS.base + movq %rax,%rdx + shrq $32,%rdx + wrmsr # switch to it + movq %cr3,%r15 # save current %cr3 in %r15 + movq CPUVAR(KERN_CR3),%rax # switch to kernel page tables + testq %rax,%rax + jz INTRENTRY_LABEL(calltrap_specstk) + movq %rax,%cr3 + jmp INTRENTRY_LABEL(calltrap_specstk) + .text + .globl INTRENTRY_LABEL(calltrap_specstk) +INTRENTRY_LABEL(calltrap_specstk): + cld + SMAP_CLAC + movq %rsp,%rdi + call trap + movl $MSR_FSBASE,%ecx # restore GS.base + movq %r12,%rax + movq %r13,%rdx + wrmsr + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %r8 + popq %r9 + popq %r10 + popq %r11 + popq %r12 + popq %r13 + popq %r14 + jmp calltrap_specstk_tramp +KUENTRY(calltrap_specstk_tramp) + movq %r15,%cr3 # restore %cr3 + popq %r15 + popq %rbp + popq %rbx + popq %rax + addq $48,%rsp # ignored TF_[DEFG]S + iretq + IDTVEC(trap03) ZTRAP(T_BPTFLT) IDTVEC(trap04) - ZTRAP(T_OFLOW) + ZTRAP(T_OFLOW) # impossible: INTO instruction invalid in amd64 IDTVEC(trap05) - ZTRAP(T_BOUND) + ZTRAP(T_BOUND) # impossible: BOUND instruction invalid in amd64 IDTVEC(trap06) ZTRAP(T_PRIVINFLT) IDTVEC(trap07) pushq $0 # dummy error code pushq $T_DNA - INTRENTRY + INTRENTRY(trap07) sti cld SMAP_CLAC movq CPUVAR(SELF),%rdi movq %rsp, %rsi call _C_LABEL(fpudna) + cli INTRFASTEXIT IDTVEC(trap08) - TRAP(T_DOUBLEFLT) + pushq $T_DOUBLEFLT + jmp calltrap_specstk IDTVEC(trap09) - ZTRAP(T_FPOPFLT) + ZTRAP(T_FPOPFLT) # impossible: not generated on amd64 IDTVEC(trap0a) TRAP(T_TSSFLT) IDTVEC(trap0b) @@ -149,30 +210,49 @@ IDTVEC(trap0c) * so that we can do the necessary swapgs in that case. */ IDTVEC(trap0d) - subq $TF_ERR,%rsp - movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rdi,TF_RDI(%rsp) - leaq _C_LABEL(doreti_iret)(%rip),%rdi - cmpq %rdi,TF_RIP(%rsp) + pushq %rcx + leaq _C_LABEL(doreti_iret)(%rip),%rcx + cmpq %rcx,16(%rsp) /* over %rcx and err to %rip */ + popq %rcx je 1f - testq $SEL_RPL,TF_CS(%rsp) - jz 2f + testq $SEL_RPL,16(%rsp) /* over err and %rip to %cs */ + je INTRENTRY_LABEL(trap0d) 1: swapgs -2: movq %r15,TF_R15(%rsp) - movq %r14,TF_R14(%rsp) - movq %r13,TF_R13(%rsp) - movq %r12,TF_R12(%rsp) - movq %r11,TF_R11(%rsp) - movq %r10,TF_R10(%rsp) - movq %r9,TF_R9(%rsp) - movq %r8,TF_R8(%rsp) - /*movq %rdi,TF_RDI(%rsp) done above */ - movq %rsi,TF_RSI(%rsp) - movq %rbp,TF_RBP(%rsp) - movq %rbx,TF_RBX(%rsp) - movq %rdx,TF_RDX(%rsp) + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(KERN_CR3),%rax + testq %rax,%rax + jz 98f + movq %rax,%cr3 + jmp 98f + .text + .globl INTRENTRY_LABEL(trap0d) +INTRENTRY_LABEL(trap0d): /* from kernel */ + pushq $T_PROTFLT + subq $152,%rsp movq %rcx,TF_RCX(%rsp) - movq %rax,TF_RAX(%rsp) + jmp 99f +98: /* from userspace */ + movq CPUVAR(KERN_RSP),%rax + xchgq %rax,%rsp + movq %rcx,TF_RCX(%rsp) + /* set trapno in the trap frame */ + movq $T_PROTFLT,TF_TRAPNO(%rsp) + /* copy err and iretq frame to the trap frame */ + movq 0(%rax),%rcx + movq %rcx,TF_ERR(%rsp) + add $8,%rax + movq IRETQ_RIP(%rax),%rcx + movq %rcx,TF_RIP(%rsp) + movq IRETQ_CS(%rax),%rcx + movq %rcx,TF_CS(%rsp) + movq IRETQ_RFLAGS(%rax),%rcx + movq %rcx,TF_RFLAGS(%rsp) + movq IRETQ_RSP(%rax),%rcx + movq %rcx,TF_RSP(%rsp) + movq IRETQ_SS(%rax),%rcx + movq %rcx,TF_SS(%rsp) + movq CPUVAR(SCRATCH),%rax +99: INTR_SAVE_MOST_GPRS_NO_ADJ sti jmp calltrap @@ -204,7 +284,9 @@ IDTVEC(trap1f) /* 20 - 31 reserved for future exp */ ZTRAP(T_RESERVED) -IDTVEC(exceptions) + .section .rodata + .globl Xexceptions +Xexceptions: .quad _C_LABEL(Xtrap00), 
_C_LABEL(Xtrap01) .quad _C_LABEL(Xtrap02), _C_LABEL(Xtrap03) .quad _C_LABEL(Xtrap04), _C_LABEL(Xtrap05) @@ -232,19 +314,44 @@ IDTVEC(exceptions) * protection fault. This will cause the process to get a SIGBUS. */ NENTRY(resume_iret) - pushq $0 - pushq $T_PROTFLT - subq $32,%rsp - INTR_SAVE_GPRS + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(KERN_CR3),%rax + testq %rax,%rax + jz INTRENTRY_LABEL(iret) + movq %rax,%cr3 + jmp INTRENTRY_LABEL(iret) + .text + .globl INTRENTRY_LABEL(iret) +INTRENTRY_LABEL(iret): /* from kernel */ + movq CPUVAR(KERN_RSP),%rax + xchgq %rax,%rsp + movq %rcx,TF_RCX(%rsp) + /* set trapno+err in the trap frame */ + movq $T_PROTFLT,TF_TRAPNO(%rsp) + movq $0,TF_ERR(%rsp) + /* copy iretq frame to the trap frame */ + movq IRETQ_RIP(%rax),%rcx + movq %rcx,TF_RIP(%rsp) + movq IRETQ_CS(%rax),%rcx + movq %rcx,TF_CS(%rsp) + movq IRETQ_RFLAGS(%rax),%rcx + movq %rcx,TF_RFLAGS(%rsp) + movq IRETQ_RSP(%rax),%rcx + movq %rcx,TF_RSP(%rsp) + movq IRETQ_SS(%rax),%rcx + movq %rcx,TF_SS(%rsp) + movq CPUVAR(SCRATCH),%rax + INTR_SAVE_MOST_GPRS_NO_ADJ sti jmp calltrap + /* * All traps go through here. Call the generic trap handler, and * check for ASTs afterwards. */ -NENTRY(alltraps) - INTRENTRY +KUENTRY(alltraps) + INTRENTRY(alltraps) sti calltrap: cld @@ -329,6 +436,7 @@ spl_lowered: /* XXX See comment in locore.s */ #define XINTR(name,num) Xintr_##name##num + KUTEXT .globl _C_LABEL(x2apic_eoi) _C_LABEL(x2apic_eoi): pushq %rax @@ -345,23 +453,23 @@ _C_LABEL(x2apic_eoi): #if NLAPIC > 0 #ifdef MULTIPROCESSOR -IDTVEC(recurse_lapic_ipi) +KIDTVEC(recurse_lapic_ipi) INTR_RECURSE_HWFRAME - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_lapic_ipi) - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_lapic_ipi) CODEPATCH_START movl $0,_C_LABEL(local_apic)+LAPIC_EOI CODEPATCH_END(CPTAG_EOI) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_IPI,%ebx jae 2f -IDTVEC(resume_lapic_ipi) +KIDTVEC(resume_lapic_ipi) 1: incl CPUVAR(IDEPTH) movl $IPL_IPI,CPUVAR(ILEVEL) @@ -425,27 +533,27 @@ IDTVEC(ipi_invlrange) iretq #endif /* MULTIPROCESSOR */ - + /* * Interrupt from the local APIC timer. */ -IDTVEC(recurse_lapic_ltimer) +KIDTVEC(recurse_lapic_ltimer) INTR_RECURSE_HWFRAME - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_lapic_ltimer) - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_lapic_ltimer) CODEPATCH_START movl $0,_C_LABEL(local_apic)+LAPIC_EOI CODEPATCH_END(CPTAG_EOI) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_CLOCK,%ebx jae 2f -IDTVEC(resume_lapic_ltimer) +KIDTVEC(resume_lapic_ltimer) 1: incl CPUVAR(IDEPTH) movl $IPL_CLOCK,CPUVAR(ILEVEL) @@ -466,21 +574,21 @@ IDTVEC(resume_lapic_ltimer) * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. */ -IDTVEC(recurse_xen_upcall) +KIDTVEC(recurse_xen_upcall) INTR_RECURSE_HWFRAME pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_xen_upcall) pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_xen_upcall) call _C_LABEL(xen_intr_ack) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_NET,%ebx jae 2f -IDTVEC(resume_xen_upcall) +KIDTVEC(resume_xen_upcall) 1: incl CPUVAR(IDEPTH) movl $IPL_NET,CPUVAR(ILEVEL) @@ -502,20 +610,20 @@ IDTVEC(resume_xen_upcall) * Hyperv event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. 
*/ -IDTVEC(recurse_hyperv_upcall) +KIDTVEC(recurse_hyperv_upcall) INTR_RECURSE_HWFRAME pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_hyperv_upcall) pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_hyperv_upcall) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_NET,%ebx jae 2f -IDTVEC(resume_hyperv_upcall) +KIDTVEC(resume_hyperv_upcall) 1: incl CPUVAR(IDEPTH) movl $IPL_NET,CPUVAR(ILEVEL) @@ -542,11 +650,11 @@ IDTVEC(resume_hyperv_upcall) */ #define INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \ -IDTVEC(recurse_##name##num) ;\ +KIDTVEC(recurse_##name##num) ;\ INTR_RECURSE_HWFRAME ;\ subq $16,%rsp /* space for __if_{trapno,err} */;\ - INTRENTRY ;\ -IDTVEC(resume_##name##num) \ + INTR_REENTRY ;\ +KIDTVEC(resume_##name##num) \ movq $IREENT_MAGIC,TF_ERR(%rsp) ;\ movl %ebx,%r13d ;\ movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\ @@ -555,7 +663,7 @@ IDTVEC(resume_##name##num) \ IDTVEC(intr_##name##num) ;\ pushq $0 /* dummy error code */ ;\ subq $8,%rsp /* unused __if_trapno */ ;\ - INTRENTRY ;\ + INTRENTRY(intr_##name##num) ;\ movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\ mask(num) /* mask it in hardware */ ;\ early_ack(num) /* and allow other intrs */ ;\ @@ -1094,8 +1202,7 @@ _C_LABEL(ioapic_level_stubs): /* * Soft interrupt handlers */ - .text -IDTVEC(softtty) +KIDTVEC(softtty) movl $IPL_SOFTTTY, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) @@ -1104,7 +1211,7 @@ IDTVEC(softtty) decl CPUVAR(IDEPTH) jmp *%r13 -IDTVEC(softnet) +KIDTVEC(softnet) movl $IPL_SOFTNET, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) @@ -1113,7 +1220,7 @@ IDTVEC(softnet) decl CPUVAR(IDEPTH) jmp *%r13 -IDTVEC(softclock) +KIDTVEC(softclock) movl $IPL_SOFTCLOCK, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) diff --git a/sys/arch/amd64/conf/ld.script b/sys/arch/amd64/conf/ld.script index 4d74b3eb8e3..9c60d69f2c8 100644 --- a/sys/arch/amd64/conf/ld.script +++ b/sys/arch/amd64/conf/ld.script @@ -1,4 +1,4 @@ -/* $OpenBSD: ld.script,v 1.8 2017/10/24 20:06:54 guenther Exp $ */ +/* $OpenBSD: ld.script,v 1.9 2018/02/21 19:24:15 guenther Exp $ */ /* * Copyright (c) 2009 Tobias Weingartner <weingart@tepid.org> @@ -52,6 +52,15 @@ SECTIONS *(.text .text.*) } :text =0xcccccccc + . = ALIGN(__ALIGN_SIZE); + __kernel_kutext_phys = (. - __kernel_virt_base) + 0x1000000; + .kutext : AT (__kernel_kutext_phys) + { + __kutext_start = ABSOLUTE(.); + *(.kutext) + __kutext_end = ABSOLUTE(.); + } :text =0xcccccccc + PROVIDE (etext = .); _etext = .; @@ -85,6 +94,17 @@ SECTIONS *(.data .data.*) } :data =0xcccccccc . = ALIGN(0x1000); + + . = ALIGN(__ALIGN_SIZE); + __kernel_kudata_phys = (. - __kernel_virt_base) + 0x1000000; + .kudata : AT (__kernel_kudata_phys) + { + __kudata_start = ABSOLUTE(.); + *(.kudata) + __kudata_end = ABSOLUTE(.); + } :data =0xcccccccc + + . = ALIGN(0x1000); PROVIDE (edata = .); _edata = .; diff --git a/sys/arch/amd64/include/asm.h b/sys/arch/amd64/include/asm.h index f64e5338f07..cd3922b4b26 100644 --- a/sys/arch/amd64/include/asm.h +++ b/sys/arch/amd64/include/asm.h @@ -1,4 +1,4 @@ -/* $OpenBSD: asm.h,v 1.8 2017/06/29 17:36:16 deraadt Exp $ */ +/* $OpenBSD: asm.h,v 1.9 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: asm.h,v 1.2 2003/05/02 18:05:47 yamt Exp $ */ /*- @@ -68,14 +68,19 @@ .text; _ALIGN_TEXT; .globl x; .type x,@function; x: #ifdef _KERNEL +#define KUTEXT .section .kutext, "ax" +/*#define KUTEXT .text */ + /* XXX Can't use __CONCAT() here, as it would be evaluated incorrectly. 
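The macro split introduced below sorts entry points by which page tables can be live when they run; a usage summary, inferred from how this commit applies them in vector.S and locore.S:

	/*
	 * IDTVEC(name)  - IDT stubs that can be reached while the U-K
	 *                 page tables are active (now placed in .kutext)
	 * KIDTVEC(name) - stubs only entered from kernel context, such as
	 *                 the recurse/resume spl stubs (remain in .text)
	 * KUENTRY(name) - other code that must run on the U-K tables,
	 *                 e.g. iretq_tramp and syscall_trampback
	 */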
*/ -#ifdef __STDC__ #define IDTVEC(name) \ - .text; ALIGN_TEXT; .globl X ## name; .type X ## name,@function; X ## name: -#else -#define IDTVEC(name) \ - .text; ALIGN_TEXT; .globl X/**/name; .type X/**/name,@function; X/**/name: -#endif /* __STDC__ */ + KUTEXT; ALIGN_TEXT; \ + .globl X ## name; .type X ## name,@function; X ## name: +#define KIDTVEC(name) \ + .text; ALIGN_TEXT; \ + .globl X ## name; .type X ## name,@function; X ## name: +#define KUENTRY(x) \ + KUTEXT; _ALIGN_TEXT; .globl x; .type x,@function; x: + #endif /* _KERNEL */ #ifdef __STDC__ diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h index 59f99ebdc8a..8f973ba1423 100644 --- a/sys/arch/amd64/include/cpu.h +++ b/sys/arch/amd64/include/cpu.h @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.h,v 1.118 2018/01/07 01:08:20 mlarkin Exp $ */ +/* $OpenBSD: cpu.h,v 1.119 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: cpu.h,v 1.1 2003/04/26 18:39:39 fvdl Exp $ */ /*- @@ -43,7 +43,7 @@ */ #ifdef _KERNEL #include <machine/frame.h> -#include <machine/segments.h> +#include <machine/segments.h> /* USERMODE */ #include <machine/cacheinfo.h> #include <machine/intrdefs.h> #endif /* _KERNEL */ @@ -89,6 +89,17 @@ union vmm_cpu_cap { struct x86_64_tss; struct cpu_info { + /* + * The beginning of this structure in mapped in the userspace "u-k" + * page tables, so that these first couple members can be accessed + * from the trampoline code. The ci_PAGEALIGN member defines where + * the part that is *not* visible begins, so don't put anything + * above it that must be kept hidden from userspace! + */ + u_int64_t ci_kern_cr3; /* U+K page table */ + u_int64_t ci_scratch; /* for U<-->K transition */ + +#define ci_PAGEALIGN ci_dev struct device *ci_dev; struct cpu_info *ci_self; struct schedstate_percpu ci_schedstate; /* scheduler state */ @@ -100,7 +111,9 @@ struct cpu_info { u_int ci_acpi_proc_id; u_int32_t ci_randseed; - u_int64_t ci_scratch; + u_int64_t ci_kern_rsp; /* kernel-only stack */ + u_int64_t ci_intr_rsp; /* U<-->K trampoline stack */ + u_int64_t ci_user_cr3; /* U-K page table */ struct proc *ci_fpcurproc; struct proc *ci_fpsaveproc; @@ -216,7 +229,10 @@ struct cpu_info { #define PROC_PC(p) ((p)->p_md.md_regs->tf_rip) #define PROC_STACK(p) ((p)->p_md.md_regs->tf_rsp) -extern struct cpu_info cpu_info_primary; +struct cpu_info_full; +extern struct cpu_info_full cpu_info_full_primary; +#define cpu_info_primary (*(struct cpu_info *)((char *)&cpu_info_full_primary + 4096*2 - offsetof(struct cpu_info, ci_PAGEALIGN))) + extern struct cpu_info *cpu_info_list; #define CPU_INFO_ITERATOR int @@ -241,7 +257,8 @@ extern void need_resched(struct cpu_info *); #define CPU_START_CLEANUP(_ci) ((_ci)->ci_func->cleanup(_ci)) #define curcpu() ({struct cpu_info *__ci; \ - asm volatile("movq %%gs:8,%0" : "=r" (__ci)); \ + asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) \ + :"n" (offsetof(struct cpu_info, ci_self))); \ __ci;}) #define cpu_number() (curcpu()->ci_cpuid) @@ -262,8 +279,6 @@ void cpu_unidle(struct cpu_info *); #define MAXCPUS 1 #ifdef _KERNEL -extern struct cpu_info cpu_info_primary; - #define curcpu() (&cpu_info_primary) #define cpu_kick(ci) diff --git a/sys/arch/amd64/include/cpu_full.h b/sys/arch/amd64/include/cpu_full.h new file mode 100644 index 00000000000..995cab087cf --- /dev/null +++ b/sys/arch/amd64/include/cpu_full.h @@ -0,0 +1,66 @@ +/* $OpenBSD: cpu_full.h,v 1.1 2018/02/21 19:24:15 guenther Exp $ */ +/* + * Copyright (c) Philip Guenther <guenther@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software 
for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _MACHINE_CPU_FULL_H_ +#define _MACHINE_CPU_FULL_H_ + +#include <sys/param.h> /* offsetof, PAGE_SIZE */ +#include <machine/segments.h> +#include <machine/tss.h> + +/* + * The layout of the full per-CPU information, including TSS, GDT, + * trampoline stacks, and cpu_info described in <machine/cpu.h> + */ +struct cpu_info_full { + /* page mapped kRO in u-k */ + union { + struct x86_64_tss u_tss; /* followed by gdt */ + char u_align[PAGE_SIZE]; + } cif_RO; +#define cif_tss cif_RO.u_tss + + /* start of page mapped kRW in u-k */ + uint64_t cif_tramp_stack[(PAGE_SIZE / 4 + - offsetof(struct cpu_info, ci_PAGEALIGN)) / sizeof(uint64_t)]; + uint64_t cif_dblflt_stack[(PAGE_SIZE / 4) / sizeof(uint64_t)]; + uint64_t cif_nmi_stack[(2 * PAGE_SIZE / 4) / sizeof(uint64_t)]; + + /* + * Beginning of this hangs over into the kRW page; rest is + * unmapped in u-k + */ + struct cpu_info cif_cpu; +} __aligned(PAGE_SIZE); + +/* tss, align shim, and gdt must fit in a page */ +CTASSERT(_ALIGN(sizeof(struct x86_64_tss)) + + sizeof(struct mem_segment_descriptor) * (NGDT_MEM + 2*NGDT_SYS) + < PAGE_SIZE); + +/* verify expected alignment */ +CTASSERT(offsetof(struct cpu_info_full, cif_cpu.ci_PAGEALIGN) % PAGE_SIZE == 0); + +/* verify total size is multiple of page size */ +CTASSERT(sizeof(struct cpu_info_full) % PAGE_SIZE == 0); + +extern struct cpu_info_full cpu_info_full_primary; + +/* Now make sure the cpu_info_primary macro is correct */ +CTASSERT(&cpu_info_primary == &cpu_info_full_primary.cif_cpu); + +#endif /* _MACHINE_CPU_FULL_H_ */ diff --git a/sys/arch/amd64/include/cpufunc.h b/sys/arch/amd64/include/cpufunc.h index b52e4b3d2ae..ed8c6ba8905 100644 --- a/sys/arch/amd64/include/cpufunc.h +++ b/sys/arch/amd64/include/cpufunc.h @@ -1,4 +1,4 @@ -/* $OpenBSD: cpufunc.h,v 1.23 2018/02/06 01:09:17 patrick Exp $ */ +/* $OpenBSD: cpufunc.h,v 1.24 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: cpufunc.h,v 1.3 2003/05/08 10:27:43 fvdl Exp $ */ /*- @@ -317,6 +317,9 @@ void amd64_errata(struct cpu_info *); void cpu_ucode_setup(void); void cpu_ucode_apply(struct cpu_info *); +struct cpu_info_full; +void cpu_enter_pages(struct cpu_info_full *); + #endif /* _KERNEL */ #endif /* !_MACHINE_CPUFUNC_H_ */ diff --git a/sys/arch/amd64/include/frame.h b/sys/arch/amd64/include/frame.h index e71d4093274..997adbf570c 100644 --- a/sys/arch/amd64/include/frame.h +++ b/sys/arch/amd64/include/frame.h @@ -1,4 +1,4 @@ -/* $OpenBSD: frame.h,v 1.6 2016/02/26 09:29:20 mpi Exp $ */ +/* $OpenBSD: frame.h,v 1.7 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: frame.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $ */ /*- @@ -147,6 +147,20 @@ struct intrframe { int64_t if_ss; }; + +/* + * The trampoline frame used on the kernel stack page which is present + * but kernel-only, in the page tables used when in userspace. This is + * the minimum for iretq operation. 
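An editorial note on the struct that follows: its member order is exactly the stack image the iretq instruction consumes, lowest address first:

	/* %rsp -> iretq_rip	popped into %rip
	 *         iretq_cs	popped into %cs
	 *         iretq_rflags	popped into %rflags
	 *         iretq_rsp	popped into %rsp
	 *         iretq_ss	popped into %ss */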
+ */ +struct iretq_frame { + int64_t iretq_rip; + int64_t iretq_cs; + int64_t iretq_rflags; + int64_t iretq_rsp; + int64_t iretq_ss; +}; + /* * Stack frame inside cpu_switch() */ diff --git a/sys/arch/amd64/include/frameasm.h b/sys/arch/amd64/include/frameasm.h index 88309d1dd4f..5e384acb9dc 100644 --- a/sys/arch/amd64/include/frameasm.h +++ b/sys/arch/amd64/include/frameasm.h @@ -1,4 +1,4 @@ -/* $OpenBSD: frameasm.h,v 1.11 2018/01/06 22:03:12 guenther Exp $ */ +/* $OpenBSD: frameasm.h,v 1.12 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: frameasm.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $ */ #ifndef _AMD64_MACHINE_FRAMEASM_H @@ -13,7 +13,10 @@ * These are used on interrupt or trap entry or exit. */ #define INTR_SAVE_GPRS \ - subq $120,%rsp ; \ + subq $120,%rsp ; \ + INTR_SAVE_MOST_GPRS_NO_ADJ ; \ + movq %rcx,TF_RCX(%rsp) +#define INTR_SAVE_MOST_GPRS_NO_ADJ \ movq %r15,TF_R15(%rsp) ; \ movq %r14,TF_R14(%rsp) ; \ movq %r13,TF_R13(%rsp) ; \ @@ -27,15 +30,54 @@ movq %rbp,TF_RBP(%rsp) ; \ movq %rbx,TF_RBX(%rsp) ; \ movq %rdx,TF_RDX(%rsp) ; \ - movq %rcx,TF_RCX(%rsp) ; \ movq %rax,TF_RAX(%rsp) -#define INTRENTRY \ - subq $32,%rsp ; \ - testq $SEL_RPL,56(%rsp) ; \ - je 98f ; \ +/* For real interrupt code paths, where we can come from userspace */ +#define INTRENTRY_LABEL(label) X##label##_untramp +#define INTRENTRY(label) \ + testq $SEL_RPL,24(%rsp) ; \ + je INTRENTRY_LABEL(label) ; \ swapgs ; \ -98: INTR_SAVE_GPRS + movq %rax,CPUVAR(SCRATCH) ; \ + movq CPUVAR(KERN_CR3),%rax ; \ + testq %rax,%rax ; \ + jz 98f ; \ + movq %rax,%cr3 ; \ + jmp 98f ; \ + .text ; \ + .global INTRENTRY_LABEL(label) ; \ +INTRENTRY_LABEL(label): /* from kernel */ \ + subq $152,%rsp ; \ + movq %rcx,TF_RCX(%rsp) ; \ + jmp 99f ; \ +98: /* from userspace */ \ + movq CPUVAR(KERN_RSP),%rax ; \ + xchgq %rax,%rsp ; \ + movq %rcx,TF_RCX(%rsp) ; \ + /* copy trapno+err to the trap frame */ \ + movq 0(%rax),%rcx ; \ + movq %rcx,TF_TRAPNO(%rsp) ; \ + movq 8(%rax),%rcx ; \ + movq %rcx,TF_ERR(%rsp) ; \ + addq $16,%rax ; \ + /* copy iretq frame to the trap frame */ \ + movq IRETQ_RIP(%rax),%rcx ; \ + movq %rcx,TF_RIP(%rsp) ; \ + movq IRETQ_CS(%rax),%rcx ; \ + movq %rcx,TF_CS(%rsp) ; \ + movq IRETQ_RFLAGS(%rax),%rcx ; \ + movq %rcx,TF_RFLAGS(%rsp) ; \ + movq IRETQ_RSP(%rax),%rcx ; \ + movq %rcx,TF_RSP(%rsp) ; \ + movq IRETQ_SS(%rax),%rcx ; \ + movq %rcx,TF_SS(%rsp) ; \ + movq CPUVAR(SCRATCH),%rax ; \ +99: INTR_SAVE_MOST_GPRS_NO_ADJ + +/* For faking up an interrupt frame when we're already in the kernel */ +#define INTR_REENTRY \ + subq $32,%rsp ; \ + INTR_SAVE_GPRS #define INTRFASTEXIT \ jmp intr_fast_exit @@ -50,24 +92,6 @@ pushq %r11 ; \ pushq %r13 ; -/* - * Restore FS.base if it's not already in the CPU, and do the cli/swapgs. 
- * Uses %rax, %rcx, and %rdx - */ -#define INTR_RESTORE_SELECTORS \ - btsl $CPUF_USERSEGS_BIT, CPUVAR(FLAGS) ; \ - jc 99f ; \ - movq CPUVAR(CURPCB),%rdx /* for below */ ; \ - movq PCB_FSBASE(%rdx),%rax ; \ - cmpq $0,%rax ; \ - je 99f /* setting %fs has zeroed FS.base */ ; \ - movq %rax,%rdx ; \ - shrq $32,%rdx ; \ - movl $MSR_FSBASE,%ecx ; \ - wrmsr ; \ -99: cli ; \ - swapgs - #define INTR_FAKE_TRAP 0xbadabada #define CHECK_ASTPENDING(reg) movq CPUVAR(CURPROC),reg ; \ diff --git a/sys/arch/amd64/include/gdt.h b/sys/arch/amd64/include/gdt.h index 65a116e8bc1..bfdc521d6c2 100644 --- a/sys/arch/amd64/include/gdt.h +++ b/sys/arch/amd64/include/gdt.h @@ -1,4 +1,4 @@ -/* $OpenBSD: gdt.h,v 1.5 2010/11/13 04:16:42 guenther Exp $ */ +/* $OpenBSD: gdt.h,v 1.6 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: gdt.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $ */ /*- @@ -31,4 +31,3 @@ */ void gdt_init_cpu(struct cpu_info *); -void gdt_alloc_cpu(struct cpu_info *); diff --git a/sys/arch/amd64/include/pmap.h b/sys/arch/amd64/include/pmap.h index ef776eb959f..c316521f6f3 100644 --- a/sys/arch/amd64/include/pmap.h +++ b/sys/arch/amd64/include/pmap.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.h,v 1.63 2018/01/07 21:43:25 mlarkin Exp $ */ +/* $OpenBSD: pmap.h,v 1.64 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: pmap.h,v 1.1 2003/04/26 18:39:46 fvdl Exp $ */ /* @@ -280,8 +280,19 @@ struct pmap { struct mutex pm_mtx; struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */ LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */ - pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */ - paddr_t pm_pdirpa; /* PA of PD (read-only after create) */ + /* + * pm_pdir : VA of page table to be used when executing in + * privileged mode + * pm_pdirpa : PA of page table to be used when executing in + * privileged mode + * pm_pdir_intel : VA of special page table to be used when executing + * on an Intel CPU in usermode (no kernel mappings) + * pm_pdirpa_intel : PA of special page table to be used when executing + * on an Intel CPU in usermode (no kernel mappings) + */ + pd_entry_t *pm_pdir, *pm_pdir_intel; + paddr_t pm_pdirpa, pm_pdirpa_intel; + struct vm_page *pm_ptphint[PTP_LEVELS-1]; /* pointer to a PTP in our pmap */ struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */ @@ -375,6 +386,7 @@ paddr_t pmap_prealloc_lowmem_ptps(paddr_t); void pagezero(vaddr_t); int pmap_convert(struct pmap *, int); +void pmap_enter_special(vaddr_t, paddr_t, vm_prot_t); /* * functions for flushing the cache for vaddrs and pages. 
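The pmap.h comment above is the heart of the workaround at the pmap level: each pmap now carries two page tables, and only the U+K one (pm_pdir/pm_pdirpa) maps the kernel. As a rough illustration of the selection this enables, here is a hedged sketch in plain C; every name ending in _example is invented for this note, and nothing below is code from the diff itself:

#include <stdint.h>

typedef uint64_t paddr_example_t;	/* stand-in for the kernel's paddr_t */

struct pmap_example {
	paddr_example_t pm_pdirpa;	 /* U+K table: kernel mapped */
	paddr_example_t pm_pdirpa_intel; /* U-K table: kernel unmapped */
};

/*
 * Pick the physical address to load into %cr3 before returning to
 * userspace.  On an affected Intel CPU the process must run under the
 * table with no kernel mappings; the entry trampoline then switches
 * back to the U+K table (ci_kern_cr3) on the way into the kernel.
 */
static paddr_example_t
pmap_user_cr3_example(const struct pmap_example *pm, int cpu_meltdown)
{
	if (cpu_meltdown && pm->pm_pdirpa_intel != 0)
		return pm->pm_pdirpa_intel;	/* hide the kernel */
	return pm->pm_pdirpa;	/* unaffected CPU: one shared table */
}

On an unaffected CPU the pm_pdirpa_intel table is simply never loaded, so both user and kernel mode keep running under the single U+K table.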
diff --git a/sys/arch/amd64/include/specialreg.h b/sys/arch/amd64/include/specialreg.h
index ae81a593f9a..b7aa6e7a4d6 100644
--- a/sys/arch/amd64/include/specialreg.h
+++ b/sys/arch/amd64/include/specialreg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: specialreg.h,v 1.67 2018/02/10 09:46:58 jsg Exp $ */
+/* $OpenBSD: specialreg.h,v 1.68 2018/02/21 19:24:15 guenther Exp $ */
 /* $NetBSD: specialreg.h,v 1.1 2003/04/26 18:39:48 fvdl Exp $ */
 /* $NetBSD: x86/specialreg.h,v 1.2 2003/04/25 21:54:30 fvdl Exp $ */
 
@@ -219,6 +219,7 @@
 #define SEFF0EDX_AVX512_4FMAPS 0x00000008 /* AVX-512 mult accum single prec */
 #define SEFF0EDX_IBRS 0x04000000 /* IBRS / IBPB Speculation Control */
 #define SEFF0EDX_STIBP 0x08000000 /* STIBP Speculation Control */
+#define SEFF0EDX_ARCH_CAP 0x20000000 /* Has IA32_ARCH_CAPABILITIES MSR */
 
 /*
  * Thermal and Power Management (CPUID function 0x6) EAX bits
@@ -351,6 +352,8 @@
 #define MTRRcap_FIXED 0x100 /* bit 8 - fixed MTRRs supported */
 #define MTRRcap_WC 0x400 /* bit 10 - WC type supported */
 #define MTRRcap_SMRR 0x800 /* bit 11 - SMM range reg supported */
+#define MSR_ARCH_CAPABILITIES 0x10a
+#define ARCH_CAPABILITIES_RDCL_NO (1ULL << 0) /* Meltdown safe */
 #define MSR_BBL_CR_ADDR 0x116 /* PII+ only */
 #define MSR_BBL_CR_DECC 0x118 /* PII+ only */
 #define MSR_BBL_CR_CTL 0x119 /* PII+ only */
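The two specialreg.h additions above work as a pair: SEFF0EDX_ARCH_CAP advertises that the IA32_ARCH_CAPABILITIES MSR exists, and the RDCL_NO bit in that MSR reports that the CPU is not vulnerable to rogue data cache load (Meltdown), so the page table split can be skipped. A hedged sketch of such a probe follows; rdmsr_example() stands in for the kernel's MSR read primitive, and this is not the actual identcpu.c logic (which, for instance, also considers the CPU vendor):

#include <stdint.h>

#define SEFF0EDX_ARCH_CAP	0x20000000	/* CPUID 7.0:EDX bit 29 */
#define MSR_ARCH_CAPABILITIES	0x10a
#define ARCH_CAPABILITIES_RDCL_NO	(1ULL << 0)	/* Meltdown safe */

/* Assumed stand-in for the kernel's rdmsr(); not defined here. */
extern uint64_t rdmsr_example(uint32_t msr);

/* Nonzero if the U-K page table split is needed on this CPU. */
static int
cpu_needs_meltdown_mitigation_example(uint32_t seff0_edx)
{
	if ((seff0_edx & SEFF0EDX_ARCH_CAP) == 0)
		return 1;	/* no MSR: must assume vulnerable */
	return (rdmsr_example(MSR_ARCH_CAPABILITIES) &
	    ARCH_CAPABILITIES_RDCL_NO) == 0;
}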