summary | refs | log | tree | commit | diff
path: root/sys
diff options
context:
space:
mode:
Diffstat (limited to 'sys')
-rw-r--r-- sys/arch/amd64/amd64/cpu.c 83
-rw-r--r-- sys/arch/amd64/amd64/gdt.c 29
-rw-r--r-- sys/arch/amd64/amd64/genassym.cf 17
-rw-r--r-- sys/arch/amd64/amd64/identcpu.c 7
-rw-r--r-- sys/arch/amd64/amd64/lapic.c 22
-rw-r--r-- sys/arch/amd64/amd64/locore.S 190
-rw-r--r-- sys/arch/amd64/amd64/locore0.S 47
-rw-r--r-- sys/arch/amd64/amd64/machdep.c 95
-rw-r--r-- sys/arch/amd64/amd64/pmap.c 230
-rw-r--r-- sys/arch/amd64/amd64/spl.S 12
-rw-r--r-- sys/arch/amd64/amd64/trap.c 32
-rw-r--r-- sys/arch/amd64/amd64/vector.S 237
-rw-r--r-- sys/arch/amd64/conf/ld.script 22
-rw-r--r-- sys/arch/amd64/include/asm.h 19
-rw-r--r-- sys/arch/amd64/include/cpu.h 29
-rw-r--r-- sys/arch/amd64/include/cpu_full.h 66
-rw-r--r-- sys/arch/amd64/include/cpufunc.h 5
-rw-r--r-- sys/arch/amd64/include/frame.h 16
-rw-r--r-- sys/arch/amd64/include/frameasm.h 76
-rw-r--r-- sys/arch/amd64/include/gdt.h 3
-rw-r--r-- sys/arch/amd64/include/pmap.h 18
-rw-r--r-- sys/arch/amd64/include/specialreg.h 5
22 files changed, 1020 insertions, 240 deletions
diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c
index c8727f583b0..7e14b3709de 100644
--- a/sys/arch/amd64/amd64/cpu.c
+++ b/sys/arch/amd64/amd64/cpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.c,v 1.111 2018/02/06 01:09:17 patrick Exp $ */
+/* $OpenBSD: cpu.c,v 1.112 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */
/*-
@@ -81,7 +81,7 @@
#include <uvm/uvm_extern.h>
#include <machine/codepatch.h>
-#include <machine/cpu.h>
+#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/pmap.h>
@@ -116,6 +116,14 @@
#include <machine/hibernate.h>
#endif /* HIBERNATE */
+/* #define CPU_DEBUG */
+
+#ifdef CPU_DEBUG
+#define DPRINTF(x...) do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* CPU_DEBUG */
+
int cpu_match(struct device *, void *, void *);
void cpu_attach(struct device *, struct device *, void *);
int cpu_activate(struct device *, int);
@@ -172,7 +180,7 @@ struct cfdriver cpu_cd = {
* CPU, on uniprocessors). The CPU info list is initialized to
* point at it.
*/
-struct cpu_info cpu_info_primary = { 0, &cpu_info_primary };
+struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } };
struct cpu_info *cpu_info_list = &cpu_info_primary;
@@ -338,8 +346,15 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
* structure, otherwise use the primary's.
*/
if (caa->cpu_role == CPU_ROLE_AP) {
- ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK|M_ZERO);
+ struct cpu_info_full *cif;
+
+ cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok);
+ ci = &cif->cif_cpu;
#if defined(MULTIPROCESSOR)
+ ci->ci_tss = &cif->cif_tss;
+ ci->ci_gdt = (void *)(ci->ci_tss + 1);
+ memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
+ cpu_enter_pages(cif);
if (cpu_info[cpunum] != NULL)
panic("cpu at apic id %d already attached?", cpunum);
cpu_info[cpunum] = ci;
@@ -451,7 +466,6 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
#if defined(MULTIPROCESSOR)
cpu_intr_init(ci);
- gdt_alloc_cpu(ci);
sched_init_cpu(ci);
cpu_start_secondary(ci);
ncpus++;
@@ -938,3 +952,62 @@ cpu_activate(struct device *self, int act)
return (0);
}
+
+/*
+ * cpu_enter_pages
+ *
+ * Requests mapping of various special pages required in the Intel Meltdown
+ * case (to be entered into the U-K page table):
+ *
+ * 1 tss+gdt page for each CPU
+ * 1 trampoline stack page for each CPU
+ *
+ * The cpu_info_full struct for each CPU straddles these pages. The offset into
+ * 'cif' is calculated below, for each page. For more information, consult
+ * the definition of struct cpu_info_full in cpu_full.h
+ *
+ * On CPUs unaffected by Meltdown, this function still configures 'cif' but
+ * the calls to pmap_enter_special become no-ops.
+ *
+ * Parameters:
+ * cif : the cpu_info_full structure describing a CPU whose pages are to be
+ * entered into the special meltdown U-K page table.
+ */
+void
+cpu_enter_pages(struct cpu_info_full *cif)
+{
+ vaddr_t va;
+ paddr_t pa;
+
+ /* The TSS+GDT need to be readable */
+ va = (vaddr_t)cif;
+ pmap_extract(pmap_kernel(), va, &pa);
+ pmap_enter_special(va, pa, PROT_READ);
+ DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__,
+ (uint64_t)va, (uint64_t)pa);
+
+ /* The trampoline stack page needs to be read/write */
+ va = (vaddr_t)&cif->cif_tramp_stack;
+ pmap_extract(pmap_kernel(), va, &pa);
+ pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
+ DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__,
+ (uint64_t)va, (uint64_t)pa);
+
+ cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16;
+ DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__,
+ (uint64_t)cif->cif_tss.tss_rsp0);
+ cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 -
+ sizeof(struct iretq_frame);
+
+#define SETUP_IST_SPECIAL_STACK(ist, cif, member) do { \
+ (cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member + \
+ sizeof((cif)->member) - 16; \
+ (cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \
+} while (0)
+
+ SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack);
+ SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack);
+
+ /* an empty iomap, by setting its offset to the TSS limit */
+ cif->cif_tss.tss_iobase = sizeof(cif->cif_tss);
+}
diff --git a/sys/arch/amd64/amd64/gdt.c b/sys/arch/amd64/amd64/gdt.c
index 8aa28a098f9..1372ebd083e 100644
--- a/sys/arch/amd64/amd64/gdt.c
+++ b/sys/arch/amd64/amd64/gdt.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: gdt.c,v 1.25 2018/01/07 05:36:47 guenther Exp $ */
+/* $OpenBSD: gdt.c,v 1.26 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: gdt.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */
/*-
@@ -40,33 +40,6 @@
#include <machine/pcb.h>
/*
- * Allocate shadow GDT for a slave cpu.
- */
-void
-gdt_alloc_cpu(struct cpu_info *ci)
-{
- struct vm_page *pg;
- vaddr_t va;
-
- ci->ci_gdt = (char *)uvm_km_valloc(kernel_map,
- GDT_SIZE + sizeof(*ci->ci_tss));
- ci->ci_tss = (void *)(ci->ci_gdt + GDT_SIZE);
- uvm_map_pageable(kernel_map, (vaddr_t)ci->ci_gdt,
- (vaddr_t)ci->ci_gdt + GDT_SIZE, FALSE, FALSE);
- for (va = (vaddr_t)ci->ci_gdt;
- va < (vaddr_t)ci->ci_gdt + GDT_SIZE + sizeof(*ci->ci_tss);
- va += PAGE_SIZE) {
- pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
- if (pg == NULL)
- panic("gdt_init: no pages");
- pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE);
- }
- memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
- bzero(ci->ci_tss, sizeof(*ci->ci_tss));
-}
-
-
-/*
* Load appropriate gdt descriptor; we better be running on *ci
*/
void
diff --git a/sys/arch/amd64/amd64/genassym.cf b/sys/arch/amd64/amd64/genassym.cf
index f72dd494c9b..4d65a3a56a2 100644
--- a/sys/arch/amd64/amd64/genassym.cf
+++ b/sys/arch/amd64/amd64/genassym.cf
@@ -1,4 +1,4 @@
-# $OpenBSD: genassym.cf,v 1.33 2018/02/10 09:21:12 mpi Exp $
+# $OpenBSD: genassym.cf,v 1.34 2018/02/21 19:24:15 guenther Exp $
# Written by Artur Grabowski art@openbsd.org, Public Domain
include <sys/param.h>
@@ -78,6 +78,15 @@ member tf_ss
define FRAMESIZE sizeof(struct trapframe)
+struct iretq_frame
+member IRETQ_CS iretq_cs
+member IRETQ_RIP iretq_rip
+member IRETQ_RFLAGS iretq_rflags
+member IRETQ_RSP iretq_rsp
+member IRETQ_SS iretq_ss
+
+define IRETQ_SIZE sizeof(struct iretq_frame)
+
struct pcb
member pcb_cr3
member pcb_rsp
@@ -91,6 +100,8 @@ member pcb_cr0
struct pmap
member pm_cpus
+member pm_pdirpa
+member pm_pdirpa_intel
struct x86_64_tss
member tss_rsp0
@@ -115,6 +126,10 @@ endif
member CPU_INFO_GDT ci_gdt
member CPU_INFO_TSS ci_tss
member CPU_INFO_FLAGS ci_flags
+member CPU_INFO_KERN_CR3 ci_kern_cr3
+member CPU_INFO_USER_CR3 ci_user_cr3
+member CPU_INFO_KERN_RSP ci_kern_rsp
+member CPU_INFO_INTR_RSP ci_intr_rsp
export CPUF_USERSEGS_BIT
diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c
index 046fde6855a..4bc2e6d10d5 100644
--- a/sys/arch/amd64/amd64/identcpu.c
+++ b/sys/arch/amd64/amd64/identcpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: identcpu.c,v 1.94 2018/02/10 09:46:58 jsg Exp $ */
+/* $OpenBSD: identcpu.c,v 1.95 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: identcpu.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */
/*
@@ -208,6 +208,7 @@ const struct {
{ SEFF0EDX_AVX512_4FMAPS, "AVX512FMAPS" },
{ SEFF0EDX_IBRS, "IBRS,IBPB" },
{ SEFF0EDX_STIBP, "STIBP" },
+ /* SEFF0EDX_ARCH_CAP (not printed) */
}, cpu_tpm_eaxfeatures[] = {
{ TPM_SENSOR, "SENSOR" },
{ TPM_ARAT, "ARAT" },
@@ -455,6 +456,7 @@ identifycpu(struct cpu_info *ci)
int i;
char *brandstr_from, *brandstr_to;
int skipspace;
+ extern uint32_t cpu_meltdown;
CPUID(1, ci->ci_signature, val, dummy, ci->ci_feature_flags);
CPUID(0x80000000, ci->ci_pnfeatset, dummy, dummy, dummy);
@@ -612,6 +614,9 @@ identifycpu(struct cpu_info *ci)
}
}
+ if (cpu_meltdown)
+ printf(",MELTDOWN");
+
printf("\n");
x86_print_cacheinfo(ci);
diff --git a/sys/arch/amd64/amd64/lapic.c b/sys/arch/amd64/amd64/lapic.c
index 6a1086c2f62..83ee4472d9f 100644
--- a/sys/arch/amd64/amd64/lapic.c
+++ b/sys/arch/amd64/amd64/lapic.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: lapic.c,v 1.49 2017/10/14 04:44:43 jsg Exp $ */
+/* $OpenBSD: lapic.c,v 1.50 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: lapic.c,v 1.2 2003/05/08 01:04:35 fvdl Exp $ */
/*-
@@ -59,6 +59,14 @@
#include <machine/i82093var.h>
#endif
+/* #define LAPIC_DEBUG */
+
+#ifdef LAPIC_DEBUG
+#define DPRINTF(x...) do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* LAPIC_DEBUG */
+
struct evcount clk_count;
#ifdef MULTIPROCESSOR
struct evcount ipi_count;
@@ -201,6 +209,7 @@ lapic_map(paddr_t lapic_base)
codepatch_call(CPTAG_EOI, &x2apic_eoi);
lapic_writereg(LAPIC_TPRI, s);
+ va = (vaddr_t)&local_apic;
} else {
/*
* Map local apic. If we have a local apic, it's safe to
@@ -220,6 +229,17 @@ lapic_map(paddr_t lapic_base)
lapic_tpr = s;
}
+ /*
+ * Enter the LAPIC MMIO page in the U-K page table for handling
+ * Meltdown (needed in the interrupt stub to acknowledge the
+ * incoming interrupt). On CPUs unaffected by Meltdown,
+ * pmap_enter_special is a no-op.
+ * XXX - need to map this PG_N
+ */
+ pmap_enter_special(va, lapic_base, PROT_READ | PROT_WRITE);
+ DPRINTF("%s: entered lapic page va 0x%llx pa 0x%llx\n", __func__,
+ (uint64_t)va, (uint64_t)lapic_base);
+
enable_intr();
}
diff --git a/sys/arch/amd64/amd64/locore.S b/sys/arch/amd64/amd64/locore.S
index 6e00ce3dddf..282a25310c6 100644
--- a/sys/arch/amd64/amd64/locore.S
+++ b/sys/arch/amd64/amd64/locore.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: locore.S,v 1.93 2018/01/07 19:56:19 mlarkin Exp $ */
+/* $OpenBSD: locore.S,v 1.94 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $ */
/*
@@ -113,6 +113,7 @@
#include <sys/syscall.h>
#include <machine/param.h>
+#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/frameasm.h>
@@ -176,6 +177,7 @@ _C_LABEL(lapic_isr):
.globl _C_LABEL(bootapiver)
.globl _C_LABEL(pg_nx)
.globl _C_LABEL(pg_g_kern)
+ .globl _C_LABEL(cpu_meltdown)
_C_LABEL(cpu_id): .long 0 # saved from `cpuid' instruction
_C_LABEL(cpu_feature): .long 0 # feature flags from 'cpuid'
# instruction
@@ -210,7 +212,8 @@ _C_LABEL(biosextmem): .long REALEXTMEM
_C_LABEL(pg_nx): .quad 0 # NX PTE bit (if CPU supports)
_C_LABEL(pg_g_kern): .quad 0 # 0x100 if global pages should be used
# in kernel mappings, 0 otherwise (for
- # Intel)
+ # insecure CPUs)
+_C_LABEL(cpu_meltdown): .long 0 # 1 if this CPU has Meltdown
#define _RELOC(x) ((x) - KERNBASE)
#define RELOC(x) _RELOC(_C_LABEL(x))
@@ -236,7 +239,7 @@ gdt64_end:
/*****************************************************************************/
/*
- * Signal trampoline; copied to top of user stack.
+ * Signal trampoline; copied to a page mapped into userspace.
* gdb's backtrace logic matches against the instructions in this.
*/
.section .rodata
@@ -401,20 +404,34 @@ restore_saved:
movq PCB_RSP(%r13),%rsp
movq PCB_RBP(%r13),%rbp
- movq CPUVAR(TSS),%rcx
- movq PCB_KSTACK(%r13),%rdx
- movq %rdx,TSS_RSP0(%rcx)
-
movq PCB_CR3(%r13),%rax
- movq %rax,%cr3
+ movq %rax,%cr3 /* %rax used below too */
/* Don't bother with the rest if switching to a system process. */
testl $P_SYSTEM,P_FLAG(%r12)
jnz switch_restored
+ /* record the bits needed for future U-->K transition */
+ movq PCB_KSTACK(%r13),%rdx
+ subq $FRAMESIZE,%rdx
+ movq %rdx,CPUVAR(KERN_RSP)
+ movq PCB_PMAP(%r13),%rcx
+
+ /*
+ * Meltdown: iff we're doing separate U+K and U-K page tables,
+ * then record them in cpu_info for easy access in syscall and
+ * interrupt trampolines. XXX code patch this
+ */
+
+ movq PM_PDIRPA_INTEL(%rcx),%rdx
+ testq %rdx,%rdx
+ jz 0f /* yay, no intel suckiness */
+ movq %rax,CPUVAR(KERN_CR3)
+ movq %rdx,CPUVAR(USER_CR3)
+0:
+
/* set the new pmap's bit for the cpu */
movl CPUVAR(CPUID),%edi
- movq PCB_PMAP(%r13),%rcx
lock
btsq %rdi,PM_CPUS(%rcx)
#ifdef DIAGNOSTIC
@@ -503,8 +520,7 @@ IDTVEC(syscall32)
sysret /* go away please */
/*
- * syscall insn entry. This currently isn't much faster, but
- * it can be made faster in the future.
+ * syscall insn entry.
*/
IDTVEC(syscall)
/*
@@ -514,13 +530,20 @@ IDTVEC(syscall)
* the user-space value.
* First order of business is to swap to the kernel gs.base so that
* we can access our struct cpu_info and use the scratch space there
- * to switch to our kernel stack. Once that's in place we can
+ * to switch to the kernel page tables (thank you, Intel), then
+ * switch to our kernel stack. Once that's in place we can
* unblock interrupts and save the rest of the syscall frame.
*/
swapgs
movq %r15,CPUVAR(SCRATCH)
- movq CPUVAR(CURPCB),%r15
- movq PCB_KSTACK(%r15),%r15
+ movq CPUVAR(KERN_CR3),%r15
+ testq %r15,%r15
+ jz Xsyscall_untramp
+ movq %r15,%cr3
+ jmp Xsyscall_untramp
+
+NENTRY(Xsyscall_untramp)
+ movq CPUVAR(KERN_RSP),%r15
xchgq %r15,%rsp
sti
@@ -531,12 +554,11 @@ IDTVEC(syscall)
* ss:rsp, etc, so that all GP registers can be
* saved. Then, fill in the rest.
*/
- pushq $(GSEL(GUDATA_SEL, SEL_UPL))
- pushq %r15
- subq $(TF_RSP-TF_TRAPNO),%rsp
+ movq $(GSEL(GUDATA_SEL, SEL_UPL)),TF_SS(%rsp)
+ movq %r15,TF_RSP(%rsp)
movq CPUVAR(SCRATCH),%r15
- subq $32,%rsp
- INTR_SAVE_GPRS
+ INTR_SAVE_MOST_GPRS_NO_ADJ
+ movq %rcx,TF_RCX(%rsp)
movq %r11, TF_RFLAGS(%rsp) /* old rflags from syscall insn */
movq $(GSEL(GUCODE_SEL, SEL_UPL)), TF_CS(%rsp)
movq %rcx,TF_RIP(%rsp)
@@ -581,16 +603,45 @@ IDTVEC(syscall)
movq TF_RBP(%rsp),%rbp
movq TF_RBX(%rsp),%rbx
- INTR_RESTORE_SELECTORS
+ /* Restore FS.base if it's not already in the CPU */
+ btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS)
+ jc 99f
+ movq CPUVAR(CURPCB),%rdx
+ movq PCB_FSBASE(%rdx),%rax
+ movq %rax,%rdx
+ shrq $32,%rdx
+ movl $MSR_FSBASE,%ecx
+ wrmsr
+99:
+ /*
+ * We need to finish reading from the trapframe, then switch
+ * to the user page tables, swapgs, and return. We need
+ * to get the final value for the register that was used
+ * for the mov to %cr3 from somewhere accessible on the
+ * user page tables, so save it in CPUVAR(SCRATCH) across
+ * the switch.
+ */
movq TF_RDX(%rsp),%rdx
movq TF_RAX(%rsp),%rax
+ movq %rax,CPUVAR(SCRATCH)
+ movq CPUVAR(USER_CR3),%rax
movq TF_RIP(%rsp),%rcx
movq TF_RFLAGS(%rsp),%r11
movq TF_RSP(%rsp),%rsp
+ testq %rax,%rax
+ jz 1f
+ jmp syscall_trampback
+
+KUENTRY(syscall_trampback)
+ movq %rax,%cr3
+1: movq CPUVAR(SCRATCH),%rax
+ swapgs
sysretq
+ .text
+
#ifdef DIAGNOSTIC
.Lsyscall_spl_not_lowered:
movabsq $spl_lowered, %rdi
@@ -627,6 +678,12 @@ NENTRY(proc_trampoline)
* Return via iretq, for real interrupts and signal returns
*/
NENTRY(intr_fast_exit)
+#ifdef DIAGNOSTIC
+ pushfq
+ popq %rdx
+ testq $PSL_I,%rdx
+ jnz .Lintr_exit_not_blocked
+#endif /* DIAGNOSTIC */
movq TF_RDI(%rsp),%rdi
movq TF_RSI(%rsp),%rsi
movq TF_R8(%rsp),%r8
@@ -640,11 +697,68 @@ NENTRY(intr_fast_exit)
movq TF_RBX(%rsp),%rbx
testq $SEL_RPL,TF_CS(%rsp)
- je 5f
+ je intr_exit_recurse /* returning back to kernel? */
+
+ /* returning to userspace. XXX fix up iret frame here */
+
+ /* restore FS.base if it's not already in the CPU */
+ btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS)
+ jc 99f
+ movq CPUVAR(CURPCB),%rdx /* for below */
+ movq PCB_FSBASE(%rdx),%rax
+ movq %rax,%rdx
+ shrq $32,%rdx
+ movl $MSR_FSBASE,%ecx
+ wrmsr
+99:
+ /*
+ * Returning to userspace. We need to do things in this order:
+ * - update the iret frame from the trapframe
+ * - finish reading from the trapframe
+ * - switch to the trampoline stack
+ * - jump to the .kutext segment
+ * - switch to the user page tables
+ * - swapgs
+ * - iretq
+ * To get the final value for the register that was used
+ * for the mov to %cr3, we need access to somewhere accessible
+ * on the user page tables, so we save it in CPUVAR(SCRATCH)
+ * across the switch.
+ */
+ /* update iret frame */
+ movq CPUVAR(INTR_RSP),%rdx
+ movq $(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx)
+ movq TF_RIP(%rsp),%rax
+ movq %rax,IRETQ_RIP(%rdx)
+ movq TF_RFLAGS(%rsp),%rax
+ movq %rax,IRETQ_RFLAGS(%rdx)
+ movq TF_RSP(%rsp),%rax
+ movq %rax,IRETQ_RSP(%rdx)
+ movq $(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx)
+ /* finish with the trap frame */
+ movq TF_RAX(%rsp),%rax
+ movq %rax,CPUVAR(SCRATCH)
+ movq TF_RCX(%rsp),%rcx
+ movq TF_R11(%rsp),%r11
+ /* switch to the trampoline stack */
+ xchgq %rdx,%rsp
+ movq TF_RDX(%rdx),%rdx
+ movq CPUVAR(USER_CR3),%rax
+ testq %rax,%rax
+ jz 1f
+ jmp iretq_tramp
- INTR_RESTORE_SELECTORS
+KUENTRY(iretq_tramp)
+ movq %rax,%cr3
+1: movq CPUVAR(SCRATCH),%rax
+ swapgs
-5: movq TF_RDX(%rsp),%rdx
+ .globl _C_LABEL(doreti_iret)
+_C_LABEL(doreti_iret):
+ iretq
+
+NENTRY(intr_exit_recurse)
+ movq TF_RDX(%rsp),%rdx
movq TF_RCX(%rsp),%rcx
movq TF_R11(%rsp),%r11
movq TF_RAX(%rsp),%rax
@@ -662,9 +776,6 @@ NENTRY(intr_fast_exit)
#endif /* !defined(GPROF) && defined(DDBPROF) */
addq $TF_RIP,%rsp
-
- .globl _C_LABEL(doreti_iret)
-_C_LABEL(doreti_iret):
iretq
@@ -697,6 +808,33 @@ _C_LABEL(doreti_iret):
addq $TF_RIP,%rsp
iretq
#endif /* !defined(GPROF) && defined(DDBPROF) */
+ .text
+
+#ifdef DIAGNOSTIC
+.Lintr_exit_not_blocked:
+ xchgw %bx, %bx
+ movl warn_once(%rip),%edi
+ testl %edi,%edi
+ jnz 1f
+ incl %edi
+ movl %edi,warn_once(%rip)
+ leaq .Lnot_blocked(%rip),%rdi
+ call _C_LABEL(printf)
+#ifdef DDB
+ int $3
+#endif /* DDB */
+1: cli
+ jmp intr_fast_exit
+
+ .data
+.global warn_once
+warn_once:
+ .long 0
+ .section .rodata
+.Lnot_blocked:
+ .asciz "WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n"
+ .text
+#endif
ENTRY(xrstor_user)
movq %rsi, %rdx
diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
index 50f0c7ecd82..53ef3672be5 100644
--- a/sys/arch/amd64/amd64/locore0.S
+++ b/sys/arch/amd64/amd64/locore0.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: locore0.S,v 1.6 2018/01/07 19:56:19 mlarkin Exp $ */
+/* $OpenBSD: locore0.S,v 1.7 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $ */
/*
@@ -205,26 +205,47 @@ bi_size_ok:
movl $0, 12(%ebp)
/*
- * Determine if CPU is Intel. Intel CPUs cannot use PG_G (global
- * pages) in kernel mappings. If CPU is not Intel, this is safe.
- * Cache the result in pg_g_kern - 0 if not supported or PG_G (0x100)
- * if supported.
- *
- * This treatment is required for the meltdown CVE mitigation.
+ * Determine if CPU has meltdown. Certain Intel CPUs do not properly
+ * respect page permissions when speculatively loading data into
+ * the cache ("Meltdown" CVE). These CPUs must utilize a secondary
+ * sanitized page table lacking kernel mappings when executing user
+ * processes, and may not use PG_G global PTEs for kernel VAs.
*/
+ movl $0x1, RELOC(cpu_meltdown) /* assume insecure at first */
+ movl $0x0, RELOC(pg_g_kern)
+
cmpl $0x756e6547, %ebx # "Genu"
- jne not_intel
+ jne .Lcpu_secure
cmpl $0x6c65746e, %ecx # "ntel"
- jne not_intel
+ jne .Lcpu_secure
cmpl $0x49656e69, %edx # "ineI"
- jne not_intel
+ jne .Lcpu_secure
- jmp pg_g_check_finished
+ /*
+ * Intel CPU, now check if IA32_ARCH_CAPABILITIES is supported and
+ * if it says this CPU is safe.
+ */
+ movl $0x0, %eax
+ cpuid
+ cmpl $0x7, %eax
+ jl .Lcpu_check_finished
+
+ movl $0x7, %eax
+ cpuid
+ testl $SEFF0EDX_ARCH_CAP, %edx
+ jz .Lcpu_check_finished
+
+ /* IA32_ARCH_CAPABILITIES MSR available, use it to check CPU security */
+ movl $MSR_ARCH_CAPABILITIES, %ecx
+ rdmsr
+ testl $ARCH_CAPABILITIES_RDCL_NO, %eax
+ jz .Lcpu_check_finished
-not_intel:
+.Lcpu_secure:
+ movl $0x0, RELOC(cpu_meltdown)
movl $PG_G, RELOC(pg_g_kern)
-pg_g_check_finished:
+.Lcpu_check_finished:
movl $1,%eax
cpuid
movl %eax,RELOC(cpu_id)
diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
index a1d5e02f340..dd0623b15c4 100644
--- a/sys/arch/amd64/amd64/machdep.c
+++ b/sys/arch/amd64/amd64/machdep.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: machdep.c,v 1.238 2018/02/06 01:09:17 patrick Exp $ */
+/* $OpenBSD: machdep.c,v 1.239 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $ */
/*-
@@ -90,7 +90,7 @@
#include <sys/sysctl.h>
-#include <machine/cpu.h>
+#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
@@ -141,6 +141,14 @@ extern int db_console;
#include <dev/ic/pckbcvar.h>
#endif
+/* #define MACHDEP_DEBUG */
+
+#ifdef MACHDEP_DEBUG
+#define DPRINTF(x...) do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* MACHDEP_DEBUG */
+
/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;
@@ -257,6 +265,7 @@ void cpu_init_extents(void);
void map_tramps(void);
void init_x86_64(paddr_t);
void (*cpuresetfn)(void);
+void enter_shared_special_pages(void);
#ifdef APERTURE
int allowaperture = 0;
@@ -313,6 +322,65 @@ cpu_startup(void)
#ifndef SMALL_KERNEL
cpu_ucode_setup();
#endif
+ /* enter the IDT and trampoline code in the u-k maps */
+ enter_shared_special_pages();
+
+ /* initialize CPU0's TSS and GDT and put them in the u-k maps */
+ cpu_enter_pages(&cpu_info_full_primary);
+}
+
+/*
+ * enter_shared_special_pages
+ *
+ * Requests mapping of various special pages required in the Intel Meltdown
+ * case (to be entered into the U-K page table):
+ *
+ * 1 IDT page
+ * A variable number of pages covering the U-K ".kutext" section. This section
+ * contains code needed during trampoline operation.
+ * A variable number of pages covering the U-K ".kudata" section. This section
+ * contains data accessed by the trampoline before switching to U+K
+ * (for example, various shared global variables used by IPIs, etc.).
+ *
+ * The linker script places the required symbols in the sections above.
+ *
+ * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
+ * become no-ops.
+ */
+void
+enter_shared_special_pages(void)
+{
+ extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
+ extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
+ vaddr_t va;
+ paddr_t pa;
+
+ /* idt */
+ pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
+ DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
+ (uint64_t)idt_vaddr, (uint64_t)idt_paddr);
+
+ /* .kutext section */
+ va = (vaddr_t)__kutext_start;
+ pa = (paddr_t)__kernel_kutext_phys;
+ while (va < (vaddr_t)__kutext_end) {
+ pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
+ DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
+ __func__, (uint64_t)va, (uint64_t)pa);
+ va += PAGE_SIZE;
+ pa += PAGE_SIZE;
+ }
+
+ /* .kudata section */
+ va = (vaddr_t)__kudata_start;
+ pa = (paddr_t)__kernel_kudata_phys;
+ while (va < (vaddr_t)__kudata_end) {
+ pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
+ DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
+ __func__, (uint64_t)va, (uint64_t)pa);
+ va += PAGE_SIZE;
+ pa += PAGE_SIZE;
+ }
}
/*
@@ -329,12 +397,6 @@ x86_64_proc0_tss_ldt_init(void)
pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;
- /* an empty iomap, by setting its offset to the TSS limit */
- cpu_info_primary.ci_tss->tss_iobase = sizeof(struct x86_64_tss);
- cpu_info_primary.ci_tss->tss_rsp0 = pcb->pcb_kstack;
- cpu_info_primary.ci_tss->tss_ist[0] =
- (u_int64_t)proc0.p_addr + PAGE_SIZE - 16;
-
ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
lldt(0);
}
@@ -346,15 +408,11 @@ x86_64_proc0_tss_ldt_init(void)
#ifdef MULTIPROCESSOR
void
x86_64_init_pcb_tss_ldt(struct cpu_info *ci)
-{
+{
struct pcb *pcb = ci->ci_idle_pcb;
- ci->ci_tss->tss_iobase = sizeof(*ci->ci_tss);
- ci->ci_tss->tss_rsp0 = pcb->pcb_kstack;
- ci->ci_tss->tss_ist[0] = pcb->pcb_kstack - USPACE + PAGE_SIZE;
-
pcb->pcb_cr0 = rcr0();
-}
+}
#endif /* MULTIPROCESSOR */
bios_diskinfo_t *
@@ -1551,8 +1609,6 @@ init_x86_64(paddr_t first_avail)
pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);
- pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE,
- PROT_READ | PROT_WRITE);
#if defined(MULTIPROCESSOR) || \
(NACPI > 0 && !defined(SMALL_KERNEL))
@@ -1560,7 +1616,7 @@ init_x86_64(paddr_t first_avail)
#endif
idt = (struct gate_descriptor *)idt_vaddr;
- cpu_info_primary.ci_tss = (void *)(idt + NIDT);
+ cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
cpu_info_primary.ci_gdt = (void *)(cpu_info_primary.ci_tss + 1);
/* make gdt gates and memory segments */
@@ -1585,9 +1641,10 @@ init_x86_64(paddr_t first_avail)
/* exceptions */
for (x = 0; x < 32; x++) {
- ist = (x == 8) ? 1 : 0;
+ /* trap2 == NMI, trap8 == double fault */
+ ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
- (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
+ (x == 3) ? SEL_UPL : SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
idt_allocmap[x] = 1;
}
diff --git a/sys/arch/amd64/amd64/pmap.c b/sys/arch/amd64/amd64/pmap.c
index bb7ba397bbe..3e559206608 100644
--- a/sys/arch/amd64/amd64/pmap.c
+++ b/sys/arch/amd64/amd64/pmap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.c,v 1.108 2018/01/07 19:56:19 mlarkin Exp $ */
+/* $OpenBSD: pmap.c,v 1.109 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */
/*
@@ -119,6 +119,15 @@
#include "acpi.h"
+/* #define PMAP_DEBUG */
+
+#ifdef PMAP_DEBUG
+#define DPRINTF(x...) do { printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* PMAP_DEBUG */
+
+
/*
* general info:
*
@@ -255,6 +264,7 @@ TAILQ_HEAD(pg_to_free, vm_page);
struct pool pmap_pdp_pool;
void pmap_pdp_ctor(pd_entry_t *);
+void pmap_pdp_ctor_intel(pd_entry_t *);
extern vaddr_t msgbuf_vaddr;
extern paddr_t msgbuf_paddr;
@@ -268,6 +278,8 @@ extern vaddr_t lo32_paddr;
vaddr_t virtual_avail;
extern int end;
+extern uint32_t cpu_meltdown;
+
/*
* local prototypes
*/
@@ -309,7 +321,6 @@ void pmap_tlb_shootwait(void);
#define pmap_tlb_shootwait()
#endif
-
/*
* p m a p i n l i n e h e l p e r f u n c t i o n s
*/
@@ -323,7 +334,8 @@ static __inline boolean_t
pmap_is_curpmap(struct pmap *pmap)
{
return((pmap == pmap_kernel()) ||
- (pmap->pm_pdirpa == (paddr_t) rcr3()));
+ (pmap->pm_pdirpa == (paddr_t) rcr3()) ||
+ (pmap->pm_pdirpa_intel == (paddr_t) rcr3()));
}
/*
@@ -484,7 +496,6 @@ pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs)
return (0);
}
-
/*
* p m a p k e n t e r f u n c t i o n s
*
@@ -586,12 +597,12 @@ pmap_kremove(vaddr_t sva, vsize_t len)
paddr_t
pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
{
- vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS;
+ vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS;
struct pmap *kpm;
int i;
- unsigned long p1i;
long ndmpdp;
paddr_t dmpd, dmpdp;
+ vaddr_t kva, kva_end;
/*
* define the boundaries of the managed kernel virtual address
@@ -643,9 +654,14 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
curpcb->pcb_pmap = kpm; /* proc0's pcb */
/*
- * enable global TLB entries.
+ * Add PG_G attribute to already mapped kernel pages. pg_g_kern
+ * is calculated in locore0.S and may be set to:
+ *
+ * 0 if this CPU does not safely support global pages in the kernel
+ * (Intel/Meltdown)
+ * PG_G if this CPU does safely support global pages in the kernel
+ * (AMD)
*/
- /* add PG_G attribute to already mapped kernel pages */
#if KERNBASE == VM_MIN_KERNEL_ADDRESS
for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
#else
@@ -653,7 +669,7 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
for (kva = KERNBASE; kva < kva_end ;
#endif
kva += PAGE_SIZE) {
- p1i = pl1_i(kva);
+ unsigned long p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pg_g_kern;
}
@@ -726,7 +742,7 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
LIST_INIT(&pmaps);
/*
- * initialize the pmap pool.
+ * initialize the pmap pools.
*/
pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_NONE, 0,
@@ -742,6 +758,9 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_NONE, PR_WAITOK,
"pdppl", NULL);
+ kpm->pm_pdir_intel = 0;
+ kpm->pm_pdirpa_intel = 0;
+
/*
* ensure the TLB is sync'd with reality by flushing it...
*/
@@ -894,13 +913,21 @@ pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
unsigned long index;
int level;
vaddr_t invaladdr;
- pd_entry_t opde;
+ pd_entry_t opde, *mdpml4es;
level = 1;
do {
pmap_freepage(pmap, ptp, level, pagelist);
index = pl_i(va, level + 1);
opde = pmap_pte_set(&pdes[level - 1][index], 0);
+ if (level == 3 && pmap->pm_pdir_intel) {
+ /* Zap special meltdown PML4e */
+ mdpml4es = (pd_entry_t *)pmap->pm_pdir_intel;
+ opde = pmap_pte_set(&mdpml4es[index], 0);
+ DPRINTF("%s: cleared meltdown PML4e @ index %lu "
+ "(va range start 0x%llx)\n", __func__, index,
+ (uint64_t)(index << L4_SHIFT));
+ }
invaladdr = level == 1 ? (vaddr_t)ptes :
(vaddr_t)pdes[level - 2];
pmap_tlb_shootpage(curpcb->pcb_pmap,
@@ -934,7 +961,7 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes)
struct vm_page *ptp, *pptp;
int i;
unsigned long index;
- pd_entry_t *pva;
+ pd_entry_t *pva, *pva_intel;
paddr_t ppa, pa;
struct uvm_object *obj;
@@ -973,6 +1000,20 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes)
pmap->pm_ptphint[i - 2] = ptp;
pa = VM_PAGE_TO_PHYS(ptp);
pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
+
+ /*
+ * Meltdown Special case - if we are adding a new PML4e for
+ * usermode addresses, just copy the PML4e to the U-K page
+ * table.
+ */
+ if (pmap->pm_pdir_intel && i == 4 && va < VM_MAXUSER_ADDRESS) {
+ pva_intel = (pd_entry_t *)pmap->pm_pdir_intel;
+ pva_intel[index] = pva[index];
+ DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
+ "from 0x%llx -> 0x%llx\n", __func__, pva[index],
+ (uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
+ }
+
pmap->pm_stats.resident_count++;
/*
* If we're not in the top level, increase the
@@ -1048,6 +1089,15 @@ pmap_pdp_ctor(pd_entry_t *pdir)
#endif
}
+void
+pmap_pdp_ctor_intel(pd_entry_t *pdir)
+{
+ struct pmap *kpm = pmap_kernel();
+
+ /* Copy PML4es from pmap_kernel's U-K view */
+ memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
+}
+
/*
* pmap_create: create a pmap
*
@@ -1088,6 +1138,22 @@ pmap_create(void)
pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;
+ /*
+ * Intel CPUs need a special page table to be used during usermode
+ * execution, one that lacks all kernel mappings.
+ */
+ if (cpu_meltdown) {
+ pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK);
+ pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
+ if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
+ &pmap->pm_pdirpa_intel))
+ panic("%s: unknown PA mapping for meltdown PML4\n",
+ __func__);
+ } else {
+ pmap->pm_pdir_intel = 0;
+ pmap->pm_pdirpa_intel = 0;
+ }
+
LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
return (pmap);
}
@@ -1145,6 +1211,9 @@ pmap_destroy(struct pmap *pmap)
/* XXX: need to flush it out of other processor's space? */
pool_put(&pmap_pdp_pool, pmap->pm_pdir);
+ if (pmap->pm_pdir_intel)
+ pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
+
pool_put(&pmap_pmap_pool, pmap);
}
@@ -1959,6 +2028,137 @@ pmap_collect(struct pmap *pmap)
* defined as macro in pmap.h
*/
+/*
+ * pmap_enter_special: enter a single PTE-level kernel mapping va -> pa
+ * into the kernel pmap's U-K page table (pm_pdir_intel), i.e. the
+ * table used while executing in usermode on Meltdown-affected CPUs.
+ *
+ * => does nothing when cpu_meltdown is not set
+ * => va must be >= VM_MIN_KERNEL_ADDRESS, otherwise we panic
+ * => missing PDPT/PD/PT pages are allocated zeroed from pmap_pdp_pool
+ *    with PR_WAITOK and hooked into the level above
+ * => when pg_g_kern is non-zero, the matching PTE in the regular (U+K)
+ *    kernel page table also gets pg_g_kern or'ed in
+ */
+void
+pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
+{
+ uint64_t l4idx, l3idx, l2idx, l1idx;
+ pd_entry_t *pd, *ptp;
+ paddr_t npa;
+ struct pmap *pmap = pmap_kernel();
+
+ /* If CPU is secure, no need to do anything */
+ if (!cpu_meltdown)
+ return;
+
+ /* Must be kernel VA */
+ if (va < VM_MIN_KERNEL_ADDRESS)
+ panic("%s: invalid special mapping va 0x%lx requested",
+ __func__, va);
+
+ /*
+  * First call: lazily allocate a zeroed top-level U-K directory for
+  * the kernel pmap.
+  * NOTE(review): pm_pdirpa_intel is not updated here; presumably it
+  * is filled in elsewhere - confirm.
+  */
+ if (!pmap->pm_pdir_intel)
+ pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
+ PR_WAITOK | PR_ZERO);
+
+ l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */
+ l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
+ l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */
+ l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */
+
+ DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
+ "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
+ (uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
+
+ /* Start at PML4 / top level */
+ pd = (pd_entry_t *)pmap->pm_pdir_intel;
+
+ if (!pd)
+ panic("%s: PML4 not initialized for pmap @ %p\n", __func__,
+ pmap);
+
+ /* npa = physaddr of PDPT */
+ npa = pd[l4idx] & PMAP_PA_MASK;
+
+ /* Valid PML4e for the 512GB region containing va? */
+ if (!npa) {
+ /* No valid PML4E - allocate PDPT page and set PML4E */
+
+ ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+ if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+ panic("%s: can't locate PDPT page\n", __func__);
+
+ pd[l4idx] = (npa | PG_u | PG_RW | PG_V);
+
+ DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
+ "setting PML4e[%lld] = 0x%llx\n", __func__,
+ (uint64_t)npa, l4idx, pd[l4idx]);
+ }
+
+ /* Descend: reach the PDPT page through the direct map */
+ pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+ if (!pd)
+ panic("%s: can't locate PDPT @ pa=0x%llx\n", __func__,
+ (uint64_t)npa);
+
+ /* npa = physaddr of PD page */
+ npa = pd[l3idx] & PMAP_PA_MASK;
+
+ /* Valid PDPTe for the 1GB region containing va? */
+ if (!npa) {
+ /* No valid PDPTe - allocate PD page and set PDPTe */
+
+ ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+ if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+ panic("%s: can't locate PD page\n", __func__);
+
+ pd[l3idx] = (npa | PG_u | PG_RW | PG_V);
+
+ DPRINTF("%s: allocated new PD page at phys 0x%llx, "
+ "setting PDPTe[%lld] = 0x%llx\n", __func__,
+ (uint64_t)npa, l3idx, pd[l3idx]);
+ }
+
+ /* Descend: reach the PD page through the direct map */
+ pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+ if (!pd)
+ panic("%s: can't locate PD page @ pa=0x%llx\n", __func__,
+ (uint64_t)npa);
+
+ /* npa = physaddr of PT page */
+ npa = pd[l2idx] & PMAP_PA_MASK;
+
+ /* Valid PDE for the 2MB region containing va? */
+ if (!npa) {
+ /* No valid PDE - allocate PT page and set PDE */
+
+ ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
+
+ if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
+ panic("%s: can't locate PT page\n", __func__);
+
+ pd[l2idx] = (npa | PG_u | PG_RW | PG_V);
+
+ DPRINTF("%s: allocated new PT page at phys 0x%llx, "
+ "setting PDE[%lld] = 0x%llx\n", __func__,
+ (uint64_t)npa, l2idx, pd[l2idx]);
+ }
+
+ /* Descend: reach the PT page through the direct map */
+ pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
+ if (!pd)
+ panic("%s: can't locate PT page @ pa=0x%llx\n", __func__,
+ (uint64_t)npa);
+
+ DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
+ "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd, (uint64_t)prot, (uint64_t)pd[l1idx]);
+
+ /* Install the leaf PTE; any previous entry is overwritten */
+ pd[l1idx] = pa | protection_codes[prot] | PG_V | pg_g_kern | PG_W;
+ DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
+
+ if (pg_g_kern) {
+ /* now set the PG_G flag on the corresponding U+K entry */
+ pt_entry_t *ptes;
+ int level, offs;
+
+ level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
+ if (__predict_true(level == 0 &&
+ pmap_valid_entry(ptes[offs]))) {
+ ptes[offs] |= pg_g_kern;
+ } else {
+ DPRINTF("%s: no U+K mapping for special mapping?\n",
+ __func__);
+ }
+ }
+}
+
/*
* pmap_enter: enter a mapping into a pmap
*
@@ -2439,10 +2639,10 @@ pmap_convert(struct pmap *pmap, int mode)
* release the lock if we get an interrupt in a bad moment.
*/
-volatile long tlb_shoot_wait;
+volatile long tlb_shoot_wait __attribute__((section(".kudata")));
-volatile vaddr_t tlb_shoot_addr1;
-volatile vaddr_t tlb_shoot_addr2;
+volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
+volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
void
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
diff --git a/sys/arch/amd64/amd64/spl.S b/sys/arch/amd64/amd64/spl.S
index c4b6fe697b6..2ea315f2fb5 100644
--- a/sys/arch/amd64/amd64/spl.S
+++ b/sys/arch/amd64/amd64/spl.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: spl.S,v 1.11 2016/05/20 14:37:53 deraadt Exp $ */
+/* $OpenBSD: spl.S,v 1.12 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: spl.S,v 1.3 2004/06/28 09:13:11 fvdl Exp $ */
/*
@@ -114,7 +114,7 @@ _C_LABEL(splx):
* a lower-prio one first, which needs to take the kernel lock -->
* the sending CPU will never see that CPU accept the IPI
*/
-IDTVEC(spllower)
+KIDTVEC(spllower)
_PROF_PROLOGUE
pushq %rbx
pushq %r13
@@ -143,7 +143,7 @@ IDTVEC(spllower)
* ebx - cpl to restore
* r13 - address to resume loop at
*/
-IDTVEC(doreti)
+KIDTVEC(doreti)
popq %rbx # get previous priority
decl CPUVAR(IDEPTH)
leaq 1f(%rip),%r13
@@ -168,4 +168,8 @@ IDTVEC(doreti)
call _C_LABEL(ast)
cli
jmp 5b
-3: INTRFASTEXIT
+3:
+#ifdef DIAGNOSTIC
+ movl $254,%esi
+#endif /* DIAGNOSTIC */
+ INTRFASTEXIT
diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c
index 47b3bee5128..dc2d115c207 100644
--- a/sys/arch/amd64/amd64/trap.c
+++ b/sys/arch/amd64/amd64/trap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: trap.c,v 1.63 2018/01/05 11:10:25 pirofti Exp $ */
+/* $OpenBSD: trap.c,v 1.64 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: trap.c,v 1.2 2003/05/04 23:51:56 fvdl Exp $ */
/*-
@@ -212,6 +212,18 @@ trap(struct trapframe *frame)
frame->tf_rip = (u_int64_t)xrstor_resume;
return;
}
+
+ /*
+ * Check for failure during return to user mode.
+ * We do this by looking at the address of the
+ * instruction that faulted.
+ */
+ if (frame->tf_rip == (u_int64_t)doreti_iret) {
+ frame->tf_rip = (u_int64_t)resume_iret;
+ return;
+ }
+ /* FALLTHROUGH */
+
case T_SEGNPFLT:
case T_ALIGNFLT:
case T_TSSFLT:
@@ -223,16 +235,6 @@ copyfault:
frame->tf_rip = (u_int64_t)pcb->pcb_onfault;
return;
}
-
- /*
- * Check for failure during return to user mode.
- * We do this by looking at the address of the
- * instruction that faulted.
- */
- if (frame->tf_rip == (u_int64_t)doreti_iret) {
- frame->tf_rip = (u_int64_t)resume_iret;
- return;
- }
goto we_re_toast;
case T_PROTFLT|T_USER: /* protection fault */
@@ -459,8 +461,12 @@ out:
static void
frame_dump(struct trapframe *tf)
{
- printf("rip %p rsp %p rfl %p\n",
- (void *)tf->tf_rip, (void *)tf->tf_rsp, (void *)tf->tf_rflags);
+ printf("rip %p cs 0x%x rfl %p rsp %p ss 0x%x\n",
+ (void *)tf->tf_rip, (unsigned)tf->tf_cs & 0xffff,
+ (void *)tf->tf_rflags,
+ (void *)tf->tf_rsp, (unsigned)tf->tf_ss & 0xffff);
+ printf("err 0x%llx trapno 0x%llx\n",
+ tf->tf_err, tf->tf_trapno);
printf("rdi %p rsi %p rdx %p\n",
(void *)tf->tf_rdi, (void *)tf->tf_rsi, (void *)tf->tf_rdx);
printf("rcx %p r8 %p r9 %p\n",
diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
index 730220af132..5de23fe67ab 100644
--- a/sys/arch/amd64/amd64/vector.S
+++ b/sys/arch/amd64/amd64/vector.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: vector.S,v 1.51 2017/10/04 02:10:33 guenther Exp $ */
+/* $OpenBSD: vector.S,v 1.52 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: vector.S,v 1.5 2004/06/28 09:13:11 fvdl Exp $ */
/*
@@ -104,36 +104,97 @@
#define TRAP(a) pushq $(a) ; jmp _C_LABEL(alltraps)
#define ZTRAP(a) pushq $0 ; TRAP(a)
- .text
IDTVEC(trap00)
ZTRAP(T_DIVIDE)
IDTVEC(trap01)
ZTRAP(T_TRCTRAP)
+
+/*
+ * NMIs can happen at any time, so there's no simple way to tell
+ * which GS.base is in place at the time of the interrupt. Instead,
+ * borrow a couple ideas from FreeBSD and put the CPU's kernel
+ * GS.base in the memory right above the stack, storing the current
+ * one in a pair of callee-saved registers (%r12/13). We save the
+ * current %cr3 in a callee-saved register too (%r15).
+ * Note: we don't unblock interrupts because a nested normal interrupt
+ * would also reenable NMIs.
+ */
IDTVEC(trap02)
- ZTRAP(T_NMI)
+ pushq $0
+ pushq $T_NMI
+calltrap_specstk: # special stack path
+ INTR_REENTRY
+ movl $MSR_GSBASE,%ecx # save current GS.base...
+ rdmsr
+ movq %rax,%r12 # ...in %r12 and %r13
+ movq %rdx,%r13
+ movq FRAMESIZE(%rsp),%rax # get kernel GS.base
+ movq %rax,%rdx
+ shrq $32,%rdx
+ wrmsr # switch to it (%ecx still MSR_GSBASE)
+ movq %cr3,%r15 # save current %cr3 in %r15
+ movq CPUVAR(KERN_CR3),%rax # switch to kernel page tables
+ testq %rax,%rax
+ jz INTRENTRY_LABEL(calltrap_specstk) # zero: no %cr3 switch
+ movq %rax,%cr3
+ jmp INTRENTRY_LABEL(calltrap_specstk)
+ .text
+ .globl INTRENTRY_LABEL(calltrap_specstk)
+INTRENTRY_LABEL(calltrap_specstk):
+ cld
+ SMAP_CLAC
+ movq %rsp,%rdi # trap frame is the argument to trap()
+ call trap
+ movl $MSR_GSBASE,%ecx # restore GS.base
+ movq %r12,%rax
+ movq %r13,%rdx
+ wrmsr
+ popq %rdi # unwind the GPRs saved by INTR_REENTRY
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %r8
+ popq %r9
+ popq %r10
+ popq %r11
+ popq %r12
+ popq %r13
+ popq %r14
+ jmp calltrap_specstk_tramp
+KUENTRY(calltrap_specstk_tramp)
+ movq %r15,%cr3 # restore %cr3
+ popq %r15
+ popq %rbp
+ popq %rbx
+ popq %rax
+ addq $48,%rsp # ignored TF_[DEFG]S
+ iretq
+
+
IDTVEC(trap03)
ZTRAP(T_BPTFLT)
IDTVEC(trap04)
- ZTRAP(T_OFLOW)
+ ZTRAP(T_OFLOW) # impossible: INTO instruction invalid in amd64
IDTVEC(trap05)
- ZTRAP(T_BOUND)
+ ZTRAP(T_BOUND) # impossible: BOUND instruction invalid in amd64
IDTVEC(trap06)
ZTRAP(T_PRIVINFLT)
IDTVEC(trap07)
pushq $0 # dummy error code
pushq $T_DNA
- INTRENTRY
+ INTRENTRY(trap07)
sti
cld
SMAP_CLAC
movq CPUVAR(SELF),%rdi
movq %rsp, %rsi
call _C_LABEL(fpudna)
+ cli
INTRFASTEXIT
IDTVEC(trap08)
- TRAP(T_DOUBLEFLT)
+ pushq $T_DOUBLEFLT
+ jmp calltrap_specstk
IDTVEC(trap09)
- ZTRAP(T_FPOPFLT)
+ ZTRAP(T_FPOPFLT) # impossible: not generated on amd64
IDTVEC(trap0a)
TRAP(T_TSSFLT)
IDTVEC(trap0b)
@@ -149,30 +210,49 @@ IDTVEC(trap0c)
* so that we can do the necessary swapgs in that case.
*/
IDTVEC(trap0d)
- subq $TF_ERR,%rsp
- movl $T_PROTFLT,TF_TRAPNO(%rsp)
- movq %rdi,TF_RDI(%rsp)
- leaq _C_LABEL(doreti_iret)(%rip),%rdi
- cmpq %rdi,TF_RIP(%rsp)
+ pushq %rcx
+ leaq _C_LABEL(doreti_iret)(%rip),%rcx
+ cmpq %rcx,16(%rsp) /* over %rcx and err to %rip */
+ popq %rcx
je 1f
- testq $SEL_RPL,TF_CS(%rsp)
- jz 2f
+ testq $SEL_RPL,16(%rsp) /* over err and %rip to %cs */
+ je INTRENTRY_LABEL(trap0d)
1: swapgs
-2: movq %r15,TF_R15(%rsp)
- movq %r14,TF_R14(%rsp)
- movq %r13,TF_R13(%rsp)
- movq %r12,TF_R12(%rsp)
- movq %r11,TF_R11(%rsp)
- movq %r10,TF_R10(%rsp)
- movq %r9,TF_R9(%rsp)
- movq %r8,TF_R8(%rsp)
- /*movq %rdi,TF_RDI(%rsp) done above */
- movq %rsi,TF_RSI(%rsp)
- movq %rbp,TF_RBP(%rsp)
- movq %rbx,TF_RBX(%rsp)
- movq %rdx,TF_RDX(%rsp)
+ movq %rax,CPUVAR(SCRATCH)
+ movq CPUVAR(KERN_CR3),%rax
+ testq %rax,%rax
+ jz 98f
+ movq %rax,%cr3
+ jmp 98f
+ .text
+ .globl INTRENTRY_LABEL(trap0d)
+INTRENTRY_LABEL(trap0d): /* from kernel */
+ pushq $T_PROTFLT
+ subq $152,%rsp
movq %rcx,TF_RCX(%rsp)
- movq %rax,TF_RAX(%rsp)
+ jmp 99f
+98: /* from userspace */
+ movq CPUVAR(KERN_RSP),%rax
+ xchgq %rax,%rsp
+ movq %rcx,TF_RCX(%rsp)
+ /* set trapno in the trap frame */
+ movq $T_PROTFLT,TF_TRAPNO(%rsp)
+ /* copy err and iretq frame to the trap frame */
+ movq 0(%rax),%rcx
+ movq %rcx,TF_ERR(%rsp)
+ add $8,%rax
+ movq IRETQ_RIP(%rax),%rcx
+ movq %rcx,TF_RIP(%rsp)
+ movq IRETQ_CS(%rax),%rcx
+ movq %rcx,TF_CS(%rsp)
+ movq IRETQ_RFLAGS(%rax),%rcx
+ movq %rcx,TF_RFLAGS(%rsp)
+ movq IRETQ_RSP(%rax),%rcx
+ movq %rcx,TF_RSP(%rsp)
+ movq IRETQ_SS(%rax),%rcx
+ movq %rcx,TF_SS(%rsp)
+ movq CPUVAR(SCRATCH),%rax
+99: INTR_SAVE_MOST_GPRS_NO_ADJ
sti
jmp calltrap
@@ -204,7 +284,9 @@ IDTVEC(trap1f)
/* 20 - 31 reserved for future exp */
ZTRAP(T_RESERVED)
-IDTVEC(exceptions)
+ .section .rodata
+ .globl Xexceptions
+Xexceptions:
.quad _C_LABEL(Xtrap00), _C_LABEL(Xtrap01)
.quad _C_LABEL(Xtrap02), _C_LABEL(Xtrap03)
.quad _C_LABEL(Xtrap04), _C_LABEL(Xtrap05)
@@ -232,19 +314,44 @@ IDTVEC(exceptions)
* protection fault. This will cause the process to get a SIGBUS.
*/
NENTRY(resume_iret)
- pushq $0
- pushq $T_PROTFLT
- subq $32,%rsp
- INTR_SAVE_GPRS
+ movq %rax,CPUVAR(SCRATCH)
+ movq CPUVAR(KERN_CR3),%rax
+ testq %rax,%rax
+ jz INTRENTRY_LABEL(iret)
+ movq %rax,%cr3
+ jmp INTRENTRY_LABEL(iret)
+ .text
+ .globl INTRENTRY_LABEL(iret)
+INTRENTRY_LABEL(iret): /* from kernel */
+ movq CPUVAR(KERN_RSP),%rax
+ xchgq %rax,%rsp
+ movq %rcx,TF_RCX(%rsp)
+ /* set trapno+err in the trap frame */
+ movq $T_PROTFLT,TF_TRAPNO(%rsp)
+ movq $0,TF_ERR(%rsp)
+ /* copy iretq frame to the trap frame */
+ movq IRETQ_RIP(%rax),%rcx
+ movq %rcx,TF_RIP(%rsp)
+ movq IRETQ_CS(%rax),%rcx
+ movq %rcx,TF_CS(%rsp)
+ movq IRETQ_RFLAGS(%rax),%rcx
+ movq %rcx,TF_RFLAGS(%rsp)
+ movq IRETQ_RSP(%rax),%rcx
+ movq %rcx,TF_RSP(%rsp)
+ movq IRETQ_SS(%rax),%rcx
+ movq %rcx,TF_SS(%rsp)
+ movq CPUVAR(SCRATCH),%rax
+ INTR_SAVE_MOST_GPRS_NO_ADJ
sti
jmp calltrap
+
/*
* All traps go through here. Call the generic trap handler, and
* check for ASTs afterwards.
*/
-NENTRY(alltraps)
- INTRENTRY
+KUENTRY(alltraps)
+ INTRENTRY(alltraps)
sti
calltrap:
cld
@@ -329,6 +436,7 @@ spl_lowered:
/* XXX See comment in locore.s */
#define XINTR(name,num) Xintr_##name##num
+ KUTEXT
.globl _C_LABEL(x2apic_eoi)
_C_LABEL(x2apic_eoi):
pushq %rax
@@ -345,23 +453,23 @@ _C_LABEL(x2apic_eoi):
#if NLAPIC > 0
#ifdef MULTIPROCESSOR
-IDTVEC(recurse_lapic_ipi)
+KIDTVEC(recurse_lapic_ipi)
INTR_RECURSE_HWFRAME
- pushq $0
+ pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTR_REENTRY
jmp 1f
IDTVEC(intr_lapic_ipi)
- pushq $0
+ pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTRENTRY(intr_lapic_ipi)
CODEPATCH_START
movl $0,_C_LABEL(local_apic)+LAPIC_EOI
CODEPATCH_END(CPTAG_EOI)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_IPI,%ebx
jae 2f
-IDTVEC(resume_lapic_ipi)
+KIDTVEC(resume_lapic_ipi)
1:
incl CPUVAR(IDEPTH)
movl $IPL_IPI,CPUVAR(ILEVEL)
@@ -425,27 +533,27 @@ IDTVEC(ipi_invlrange)
iretq
#endif /* MULTIPROCESSOR */
-
+
/*
* Interrupt from the local APIC timer.
*/
-IDTVEC(recurse_lapic_ltimer)
+KIDTVEC(recurse_lapic_ltimer)
INTR_RECURSE_HWFRAME
- pushq $0
+ pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTR_REENTRY
jmp 1f
IDTVEC(intr_lapic_ltimer)
- pushq $0
+ pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTRENTRY(intr_lapic_ltimer)
CODEPATCH_START
movl $0,_C_LABEL(local_apic)+LAPIC_EOI
CODEPATCH_END(CPTAG_EOI)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_CLOCK,%ebx
jae 2f
-IDTVEC(resume_lapic_ltimer)
+KIDTVEC(resume_lapic_ltimer)
1:
incl CPUVAR(IDEPTH)
movl $IPL_CLOCK,CPUVAR(ILEVEL)
@@ -466,21 +574,21 @@ IDTVEC(resume_lapic_ltimer)
* Xen event channel upcall interrupt handler.
* Only used when the hypervisor supports direct vector callbacks.
*/
-IDTVEC(recurse_xen_upcall)
+KIDTVEC(recurse_xen_upcall)
INTR_RECURSE_HWFRAME
pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTR_REENTRY
jmp 1f
IDTVEC(intr_xen_upcall)
pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTRENTRY(intr_xen_upcall)
call _C_LABEL(xen_intr_ack)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_NET,%ebx
jae 2f
-IDTVEC(resume_xen_upcall)
+KIDTVEC(resume_xen_upcall)
1:
incl CPUVAR(IDEPTH)
movl $IPL_NET,CPUVAR(ILEVEL)
@@ -502,20 +610,20 @@ IDTVEC(resume_xen_upcall)
* Hyperv event channel upcall interrupt handler.
* Only used when the hypervisor supports direct vector callbacks.
*/
-IDTVEC(recurse_hyperv_upcall)
+KIDTVEC(recurse_hyperv_upcall)
INTR_RECURSE_HWFRAME
pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTR_REENTRY
jmp 1f
IDTVEC(intr_hyperv_upcall)
pushq $0
subq $8,%rsp /* unused __if_trapno */
- INTRENTRY
+ INTRENTRY(intr_hyperv_upcall)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_NET,%ebx
jae 2f
-IDTVEC(resume_hyperv_upcall)
+KIDTVEC(resume_hyperv_upcall)
1:
incl CPUVAR(IDEPTH)
movl $IPL_NET,CPUVAR(ILEVEL)
@@ -542,11 +650,11 @@ IDTVEC(resume_hyperv_upcall)
*/
#define INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
-IDTVEC(recurse_##name##num) ;\
+KIDTVEC(recurse_##name##num) ;\
INTR_RECURSE_HWFRAME ;\
subq $16,%rsp /* space for __if_{trapno,err} */;\
- INTRENTRY ;\
-IDTVEC(resume_##name##num) \
+ INTR_REENTRY ;\
+KIDTVEC(resume_##name##num) \
movq $IREENT_MAGIC,TF_ERR(%rsp) ;\
movl %ebx,%r13d ;\
movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\
@@ -555,7 +663,7 @@ IDTVEC(resume_##name##num) \
IDTVEC(intr_##name##num) ;\
pushq $0 /* dummy error code */ ;\
subq $8,%rsp /* unused __if_trapno */ ;\
- INTRENTRY ;\
+ INTRENTRY(intr_##name##num) ;\
movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\
mask(num) /* mask it in hardware */ ;\
early_ack(num) /* and allow other intrs */ ;\
@@ -1094,8 +1202,7 @@ _C_LABEL(ioapic_level_stubs):
/*
* Soft interrupt handlers
*/
- .text
-IDTVEC(softtty)
+KIDTVEC(softtty)
movl $IPL_SOFTTTY, CPUVAR(ILEVEL)
sti
incl CPUVAR(IDEPTH)
@@ -1104,7 +1211,7 @@ IDTVEC(softtty)
decl CPUVAR(IDEPTH)
jmp *%r13
-IDTVEC(softnet)
+KIDTVEC(softnet)
movl $IPL_SOFTNET, CPUVAR(ILEVEL)
sti
incl CPUVAR(IDEPTH)
@@ -1113,7 +1220,7 @@ IDTVEC(softnet)
decl CPUVAR(IDEPTH)
jmp *%r13
-IDTVEC(softclock)
+KIDTVEC(softclock)
movl $IPL_SOFTCLOCK, CPUVAR(ILEVEL)
sti
incl CPUVAR(IDEPTH)
diff --git a/sys/arch/amd64/conf/ld.script b/sys/arch/amd64/conf/ld.script
index 4d74b3eb8e3..9c60d69f2c8 100644
--- a/sys/arch/amd64/conf/ld.script
+++ b/sys/arch/amd64/conf/ld.script
@@ -1,4 +1,4 @@
-/* $OpenBSD: ld.script,v 1.8 2017/10/24 20:06:54 guenther Exp $ */
+/* $OpenBSD: ld.script,v 1.9 2018/02/21 19:24:15 guenther Exp $ */
/*
* Copyright (c) 2009 Tobias Weingartner <weingart@tepid.org>
@@ -52,6 +52,15 @@ SECTIONS
*(.text .text.*)
} :text =0xcccccccc
+ . = ALIGN(__ALIGN_SIZE);
+ __kernel_kutext_phys = (. - __kernel_virt_base) + 0x1000000;
+ .kutext : AT (__kernel_kutext_phys)
+ {
+ __kutext_start = ABSOLUTE(.);
+ *(.kutext)
+ __kutext_end = ABSOLUTE(.);
+ } :text =0xcccccccc
+
PROVIDE (etext = .);
_etext = .;
@@ -85,6 +94,17 @@ SECTIONS
*(.data .data.*)
} :data =0xcccccccc
. = ALIGN(0x1000);
+
+ . = ALIGN(__ALIGN_SIZE);
+ __kernel_kudata_phys = (. - __kernel_virt_base) + 0x1000000;
+ .kudata : AT (__kernel_kudata_phys)
+ {
+ __kudata_start = ABSOLUTE(.);
+ *(.kudata)
+ __kudata_end = ABSOLUTE(.);
+ } :data =0xcccccccc
+
+ . = ALIGN(0x1000);
PROVIDE (edata = .);
_edata = .;
diff --git a/sys/arch/amd64/include/asm.h b/sys/arch/amd64/include/asm.h
index f64e5338f07..cd3922b4b26 100644
--- a/sys/arch/amd64/include/asm.h
+++ b/sys/arch/amd64/include/asm.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: asm.h,v 1.8 2017/06/29 17:36:16 deraadt Exp $ */
+/* $OpenBSD: asm.h,v 1.9 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: asm.h,v 1.2 2003/05/02 18:05:47 yamt Exp $ */
/*-
@@ -68,14 +68,19 @@
.text; _ALIGN_TEXT; .globl x; .type x,@function; x:
#ifdef _KERNEL
+#define KUTEXT .section .kutext, "ax"
+/*#define KUTEXT .text */
+
/* XXX Can't use __CONCAT() here, as it would be evaluated incorrectly. */
-#ifdef __STDC__
#define IDTVEC(name) \
- .text; ALIGN_TEXT; .globl X ## name; .type X ## name,@function; X ## name:
-#else
-#define IDTVEC(name) \
- .text; ALIGN_TEXT; .globl X/**/name; .type X/**/name,@function; X/**/name:
-#endif /* __STDC__ */
+ KUTEXT; ALIGN_TEXT; \
+ .globl X ## name; .type X ## name,@function; X ## name:
+#define KIDTVEC(name) \
+ .text; ALIGN_TEXT; \
+ .globl X ## name; .type X ## name,@function; X ## name:
+#define KUENTRY(x) \
+ KUTEXT; _ALIGN_TEXT; .globl x; .type x,@function; x:
+
#endif /* _KERNEL */
#ifdef __STDC__
diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h
index 59f99ebdc8a..8f973ba1423 100644
--- a/sys/arch/amd64/include/cpu.h
+++ b/sys/arch/amd64/include/cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.h,v 1.118 2018/01/07 01:08:20 mlarkin Exp $ */
+/* $OpenBSD: cpu.h,v 1.119 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: cpu.h,v 1.1 2003/04/26 18:39:39 fvdl Exp $ */
/*-
@@ -43,7 +43,7 @@
*/
#ifdef _KERNEL
#include <machine/frame.h>
-#include <machine/segments.h>
+#include <machine/segments.h> /* USERMODE */
#include <machine/cacheinfo.h>
#include <machine/intrdefs.h>
#endif /* _KERNEL */
@@ -89,6 +89,17 @@ union vmm_cpu_cap {
struct x86_64_tss;
struct cpu_info {
+ /*
+ * The beginning of this structure is mapped in the userspace "u-k"
+ * page tables, so that these first couple members can be accessed
+ * from the trampoline code. The ci_PAGEALIGN member defines where
+ * the part that is *not* visible begins, so don't put anything
+ * above it that must be kept hidden from userspace!
+ */
+ u_int64_t ci_kern_cr3; /* U+K page table */
+ u_int64_t ci_scratch; /* for U<-->K transition */
+
+#define ci_PAGEALIGN ci_dev
struct device *ci_dev;
struct cpu_info *ci_self;
struct schedstate_percpu ci_schedstate; /* scheduler state */
@@ -100,7 +111,9 @@ struct cpu_info {
u_int ci_acpi_proc_id;
u_int32_t ci_randseed;
- u_int64_t ci_scratch;
+ u_int64_t ci_kern_rsp; /* kernel-only stack */
+ u_int64_t ci_intr_rsp; /* U<-->K trampoline stack */
+ u_int64_t ci_user_cr3; /* U-K page table */
struct proc *ci_fpcurproc;
struct proc *ci_fpsaveproc;
@@ -216,7 +229,10 @@ struct cpu_info {
#define PROC_PC(p) ((p)->p_md.md_regs->tf_rip)
#define PROC_STACK(p) ((p)->p_md.md_regs->tf_rsp)
-extern struct cpu_info cpu_info_primary;
+struct cpu_info_full;
+extern struct cpu_info_full cpu_info_full_primary;
+#define cpu_info_primary (*(struct cpu_info *)((char *)&cpu_info_full_primary + 4096*2 - offsetof(struct cpu_info, ci_PAGEALIGN)))
+
extern struct cpu_info *cpu_info_list;
#define CPU_INFO_ITERATOR int
@@ -241,7 +257,8 @@ extern void need_resched(struct cpu_info *);
#define CPU_START_CLEANUP(_ci) ((_ci)->ci_func->cleanup(_ci))
#define curcpu() ({struct cpu_info *__ci; \
- asm volatile("movq %%gs:8,%0" : "=r" (__ci)); \
+ asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) \
+ :"n" (offsetof(struct cpu_info, ci_self))); \
__ci;})
#define cpu_number() (curcpu()->ci_cpuid)
@@ -262,8 +279,6 @@ void cpu_unidle(struct cpu_info *);
#define MAXCPUS 1
#ifdef _KERNEL
-extern struct cpu_info cpu_info_primary;
-
#define curcpu() (&cpu_info_primary)
#define cpu_kick(ci)
diff --git a/sys/arch/amd64/include/cpu_full.h b/sys/arch/amd64/include/cpu_full.h
new file mode 100644
index 00000000000..995cab087cf
--- /dev/null
+++ b/sys/arch/amd64/include/cpu_full.h
@@ -0,0 +1,66 @@
+/* $OpenBSD: cpu_full.h,v 1.1 2018/02/21 19:24:15 guenther Exp $ */
+/*
+ * Copyright (c) Philip Guenther <guenther@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _MACHINE_CPU_FULL_H_
+#define _MACHINE_CPU_FULL_H_
+
+#include <sys/param.h> /* offsetof, PAGE_SIZE */
+#include <machine/segments.h>
+#include <machine/tss.h>
+
+/*
+ * The layout of the full per-CPU information, including TSS, GDT,
+ * trampoline stacks, and cpu_info described in <machine/cpu.h>
+ */
+struct cpu_info_full {
+ /* page mapped kRO in u-k */
+ union {
+ struct x86_64_tss u_tss; /* followed by gdt */
+ char u_align[PAGE_SIZE];
+ } cif_RO;
+#define cif_tss cif_RO.u_tss
+
+ /*
+  * start of page mapped kRW in u-k
+  *
+  * The three stacks below add up to PAGE_SIZE minus
+  * offsetof(struct cpu_info, ci_PAGEALIGN), so the members of
+  * cif_cpu that precede ci_PAGEALIGN fill out the rest of this
+  * page and ci_PAGEALIGN itself starts on the next page boundary.
+  */
+ uint64_t cif_tramp_stack[(PAGE_SIZE / 4
+ - offsetof(struct cpu_info, ci_PAGEALIGN)) / sizeof(uint64_t)];
+ uint64_t cif_dblflt_stack[(PAGE_SIZE / 4) / sizeof(uint64_t)];
+ uint64_t cif_nmi_stack[(2 * PAGE_SIZE / 4) / sizeof(uint64_t)];
+
+ /*
+ * Beginning of this hangs over into the kRW page; rest is
+ * unmapped in u-k
+ */
+ struct cpu_info cif_cpu;
+} __aligned(PAGE_SIZE);
+
+/* tss, align shim, and gdt must fit in a page */
+CTASSERT(_ALIGN(sizeof(struct x86_64_tss)) +
+ sizeof(struct mem_segment_descriptor) * (NGDT_MEM + 2*NGDT_SYS)
+ < PAGE_SIZE);
+
+/* verify expected alignment */
+CTASSERT(offsetof(struct cpu_info_full, cif_cpu.ci_PAGEALIGN) % PAGE_SIZE == 0);
+
+/* verify total size is multiple of page size */
+CTASSERT(sizeof(struct cpu_info_full) % PAGE_SIZE == 0);
+
+extern struct cpu_info_full cpu_info_full_primary;
+
+/* Now make sure the cpu_info_primary macro is correct */
+CTASSERT(&cpu_info_primary == &cpu_info_full_primary.cif_cpu);
+
+#endif /* _MACHINE_CPU_FULL_H_ */
diff --git a/sys/arch/amd64/include/cpufunc.h b/sys/arch/amd64/include/cpufunc.h
index b52e4b3d2ae..ed8c6ba8905 100644
--- a/sys/arch/amd64/include/cpufunc.h
+++ b/sys/arch/amd64/include/cpufunc.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpufunc.h,v 1.23 2018/02/06 01:09:17 patrick Exp $ */
+/* $OpenBSD: cpufunc.h,v 1.24 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: cpufunc.h,v 1.3 2003/05/08 10:27:43 fvdl Exp $ */
/*-
@@ -317,6 +317,9 @@ void amd64_errata(struct cpu_info *);
void cpu_ucode_setup(void);
void cpu_ucode_apply(struct cpu_info *);
+struct cpu_info_full;
+void cpu_enter_pages(struct cpu_info_full *);
+
#endif /* _KERNEL */
#endif /* !_MACHINE_CPUFUNC_H_ */
diff --git a/sys/arch/amd64/include/frame.h b/sys/arch/amd64/include/frame.h
index e71d4093274..997adbf570c 100644
--- a/sys/arch/amd64/include/frame.h
+++ b/sys/arch/amd64/include/frame.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: frame.h,v 1.6 2016/02/26 09:29:20 mpi Exp $ */
+/* $OpenBSD: frame.h,v 1.7 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: frame.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $ */
/*-
@@ -147,6 +147,20 @@ struct intrframe {
int64_t if_ss;
};
+
+/*
+ * The trampoline frame used on the kernel stack page which is present
+ * but kernel-only, in the page tables used when in userspace. This is
+ * the minimum for iretq operation.
+ */
+struct iretq_frame {
+ int64_t iretq_rip; /* saved %rip */
+ int64_t iretq_cs; /* saved %cs selector */
+ int64_t iretq_rflags; /* saved %rflags */
+ int64_t iretq_rsp; /* saved %rsp */
+ int64_t iretq_ss; /* saved %ss selector */
+};
+
/*
* Stack frame inside cpu_switch()
*/
diff --git a/sys/arch/amd64/include/frameasm.h b/sys/arch/amd64/include/frameasm.h
index 88309d1dd4f..5e384acb9dc 100644
--- a/sys/arch/amd64/include/frameasm.h
+++ b/sys/arch/amd64/include/frameasm.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: frameasm.h,v 1.11 2018/01/06 22:03:12 guenther Exp $ */
+/* $OpenBSD: frameasm.h,v 1.12 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: frameasm.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $ */
#ifndef _AMD64_MACHINE_FRAMEASM_H
@@ -13,7 +13,10 @@
* These are used on interrupt or trap entry or exit.
*/
#define INTR_SAVE_GPRS \
- subq $120,%rsp ; \
+ subq $120,%rsp ; \
+ INTR_SAVE_MOST_GPRS_NO_ADJ ; \
+ movq %rcx,TF_RCX(%rsp)
+#define INTR_SAVE_MOST_GPRS_NO_ADJ \
movq %r15,TF_R15(%rsp) ; \
movq %r14,TF_R14(%rsp) ; \
movq %r13,TF_R13(%rsp) ; \
@@ -27,15 +30,54 @@
movq %rbp,TF_RBP(%rsp) ; \
movq %rbx,TF_RBX(%rsp) ; \
movq %rdx,TF_RDX(%rsp) ; \
- movq %rcx,TF_RCX(%rsp) ; \
movq %rax,TF_RAX(%rsp)
-#define INTRENTRY \
- subq $32,%rsp ; \
- testq $SEL_RPL,56(%rsp) ; \
- je 98f ; \
+/* For real interrupt code paths, where we can come from userspace */
+#define INTRENTRY_LABEL(label) X##label##_untramp
+#define INTRENTRY(label) \
+ testq $SEL_RPL,24(%rsp) ; \
+ je INTRENTRY_LABEL(label) ; \
swapgs ; \
-98: INTR_SAVE_GPRS
+ movq %rax,CPUVAR(SCRATCH) ; \
+ movq CPUVAR(KERN_CR3),%rax ; \
+ testq %rax,%rax ; \
+ jz 98f ; \
+ movq %rax,%cr3 ; \
+ jmp 98f ; \
+ .text ; \
+ .global INTRENTRY_LABEL(label) ; \
+INTRENTRY_LABEL(label): /* from kernel */ \
+ subq $152,%rsp ; \
+ movq %rcx,TF_RCX(%rsp) ; \
+ jmp 99f ; \
+98: /* from userspace */ \
+ movq CPUVAR(KERN_RSP),%rax ; \
+ xchgq %rax,%rsp ; \
+ movq %rcx,TF_RCX(%rsp) ; \
+ /* copy trapno+err to the trap frame */ \
+ movq 0(%rax),%rcx ; \
+ movq %rcx,TF_TRAPNO(%rsp) ; \
+ movq 8(%rax),%rcx ; \
+ movq %rcx,TF_ERR(%rsp) ; \
+ addq $16,%rax ; \
+ /* copy iretq frame to the trap frame */ \
+ movq IRETQ_RIP(%rax),%rcx ; \
+ movq %rcx,TF_RIP(%rsp) ; \
+ movq IRETQ_CS(%rax),%rcx ; \
+ movq %rcx,TF_CS(%rsp) ; \
+ movq IRETQ_RFLAGS(%rax),%rcx ; \
+ movq %rcx,TF_RFLAGS(%rsp) ; \
+ movq IRETQ_RSP(%rax),%rcx ; \
+ movq %rcx,TF_RSP(%rsp) ; \
+ movq IRETQ_SS(%rax),%rcx ; \
+ movq %rcx,TF_SS(%rsp) ; \
+ movq CPUVAR(SCRATCH),%rax ; \
+99: INTR_SAVE_MOST_GPRS_NO_ADJ
+
+/* For faking up an interrupt frame when we're already in the kernel */
+#define INTR_REENTRY \
+ subq $32,%rsp ; \
+ INTR_SAVE_GPRS
#define INTRFASTEXIT \
jmp intr_fast_exit
@@ -50,24 +92,6 @@
pushq %r11 ; \
pushq %r13 ;
-/*
- * Restore FS.base if it's not already in the CPU, and do the cli/swapgs.
- * Uses %rax, %rcx, and %rdx
- */
-#define INTR_RESTORE_SELECTORS \
- btsl $CPUF_USERSEGS_BIT, CPUVAR(FLAGS) ; \
- jc 99f ; \
- movq CPUVAR(CURPCB),%rdx /* for below */ ; \
- movq PCB_FSBASE(%rdx),%rax ; \
- cmpq $0,%rax ; \
- je 99f /* setting %fs has zeroed FS.base */ ; \
- movq %rax,%rdx ; \
- shrq $32,%rdx ; \
- movl $MSR_FSBASE,%ecx ; \
- wrmsr ; \
-99: cli ; \
- swapgs
-
#define INTR_FAKE_TRAP 0xbadabada
#define CHECK_ASTPENDING(reg) movq CPUVAR(CURPROC),reg ; \
diff --git a/sys/arch/amd64/include/gdt.h b/sys/arch/amd64/include/gdt.h
index 65a116e8bc1..bfdc521d6c2 100644
--- a/sys/arch/amd64/include/gdt.h
+++ b/sys/arch/amd64/include/gdt.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: gdt.h,v 1.5 2010/11/13 04:16:42 guenther Exp $ */
+/* $OpenBSD: gdt.h,v 1.6 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: gdt.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $ */
/*-
@@ -31,4 +31,3 @@
*/
void gdt_init_cpu(struct cpu_info *);
-void gdt_alloc_cpu(struct cpu_info *);
diff --git a/sys/arch/amd64/include/pmap.h b/sys/arch/amd64/include/pmap.h
index ef776eb959f..c316521f6f3 100644
--- a/sys/arch/amd64/include/pmap.h
+++ b/sys/arch/amd64/include/pmap.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.h,v 1.63 2018/01/07 21:43:25 mlarkin Exp $ */
+/* $OpenBSD: pmap.h,v 1.64 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: pmap.h,v 1.1 2003/04/26 18:39:46 fvdl Exp $ */
/*
@@ -280,8 +280,19 @@ struct pmap {
struct mutex pm_mtx;
struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */
- pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */
- paddr_t pm_pdirpa; /* PA of PD (read-only after create) */
+ /*
+ * pm_pdir : VA of page table to be used when executing in
+ * privileged mode
+ * pm_pdirpa : PA of page table to be used when executing in
+ * privileged mode
+ * pm_pdir_intel : VA of special page table to be used when executing
+ * on an Intel CPU in usermode (no kernel mappings)
+ * pm_pdirpa_intel : PA of special page table to be used when executing
+ * on an Intel CPU in usermode (no kernel mappings)
+ */
+ pd_entry_t *pm_pdir, *pm_pdir_intel;
+ paddr_t pm_pdirpa, pm_pdirpa_intel;
+
struct vm_page *pm_ptphint[PTP_LEVELS-1];
/* pointer to a PTP in our pmap */
struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */
@@ -375,6 +386,7 @@ paddr_t pmap_prealloc_lowmem_ptps(paddr_t);
void pagezero(vaddr_t);
int pmap_convert(struct pmap *, int);
+void pmap_enter_special(vaddr_t, paddr_t, vm_prot_t);
/*
* functions for flushing the cache for vaddrs and pages.
diff --git a/sys/arch/amd64/include/specialreg.h b/sys/arch/amd64/include/specialreg.h
index ae81a593f9a..b7aa6e7a4d6 100644
--- a/sys/arch/amd64/include/specialreg.h
+++ b/sys/arch/amd64/include/specialreg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: specialreg.h,v 1.67 2018/02/10 09:46:58 jsg Exp $ */
+/* $OpenBSD: specialreg.h,v 1.68 2018/02/21 19:24:15 guenther Exp $ */
/* $NetBSD: specialreg.h,v 1.1 2003/04/26 18:39:48 fvdl Exp $ */
/* $NetBSD: x86/specialreg.h,v 1.2 2003/04/25 21:54:30 fvdl Exp $ */
@@ -219,6 +219,7 @@
#define SEFF0EDX_AVX512_4FMAPS 0x00000008 /* AVX-512 mult accum single prec */
#define SEFF0EDX_IBRS 0x04000000 /* IBRS / IBPB Speculation Control */
#define SEFF0EDX_STIBP 0x08000000 /* STIBP Speculation Control */
+#define SEFF0EDX_ARCH_CAP 0x20000000 /* Has IA32_ARCH_CAPABILITIES MSR */
/*
* Thermal and Power Management (CPUID function 0x6) EAX bits
@@ -351,6 +352,8 @@
#define MTRRcap_FIXED 0x100 /* bit 8 - fixed MTRRs supported */
#define MTRRcap_WC 0x400 /* bit 10 - WC type supported */
#define MTRRcap_SMRR 0x800 /* bit 11 - SMM range reg supported */
+#define MSR_ARCH_CAPABILITIES 0x10a
+#define ARCH_CAPABILITIES_RDCL_NO (1ULL << 0) /* Meltdown safe */
#define MSR_BBL_CR_ADDR 0x116 /* PII+ only */
#define MSR_BBL_CR_DECC 0x118 /* PII+ only */
#define MSR_BBL_CR_CTL 0x119 /* PII+ only */