summaryrefslogtreecommitdiff
path: root/sys/arch/amd64
diff options
context:
space:
mode:
author Mike Larkin <mlarkin@cvs.openbsd.org> 2015-11-13 07:52:21 +0000
committer Mike Larkin <mlarkin@cvs.openbsd.org> 2015-11-13 07:52:21 +0000
commit 207e48e84cdbc60695417e94cc230d340d5f2028 (patch)
tree f012cc1efae9cdaa15a35bd7e4310a53c4cbde57 /sys/arch/amd64
parent e61de7fde46443b42ab120089f15aa14219d1c30 (diff)
vmm(4) kernel code
circulated on hackers@, no objections. Disabled by default.
Diffstat (limited to 'sys/arch/amd64')
-rw-r--r-- sys/arch/amd64/amd64/cacheinfo.c 10
-rw-r--r-- sys/arch/amd64/amd64/conf.c 15
-rw-r--r-- sys/arch/amd64/amd64/cpu.c 46
-rw-r--r-- sys/arch/amd64/amd64/identcpu.c 150
-rw-r--r-- sys/arch/amd64/amd64/ipifuncs.c 33
-rw-r--r-- sys/arch/amd64/amd64/mainbus.c 8
-rw-r--r-- sys/arch/amd64/amd64/vmm.c 3414
-rw-r--r-- sys/arch/amd64/amd64/vmm_support.S 384
-rw-r--r-- sys/arch/amd64/conf/GENERIC 4
-rw-r--r-- sys/arch/amd64/conf/Makefile.amd64 4
-rw-r--r-- sys/arch/amd64/conf/files.amd64 10
-rw-r--r-- sys/arch/amd64/include/cpu.h 53
-rw-r--r-- sys/arch/amd64/include/intrdefs.h 9
-rw-r--r-- sys/arch/amd64/include/pmap.h 3
-rw-r--r-- sys/arch/amd64/include/specialreg.h 231
-rw-r--r-- sys/arch/amd64/include/vmmvar.h 387
16 files changed, 4713 insertions, 48 deletions
diff --git a/sys/arch/amd64/amd64/cacheinfo.c b/sys/arch/amd64/amd64/cacheinfo.c
index 8926949af6d..eb319b909ec 100644
--- a/sys/arch/amd64/amd64/cacheinfo.c
+++ b/sys/arch/amd64/amd64/cacheinfo.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cacheinfo.c,v 1.6 2012/03/16 01:53:00 haesbaert Exp $ */
+/* $OpenBSD: cacheinfo.c,v 1.7 2015/11/13 07:52:20 mlarkin Exp $ */
/*-
* Copyright (c) 2000 The NetBSD Foundation, Inc.
@@ -185,6 +185,10 @@ amd_cpu_cacheinfo(struct cpu_info *ci)
}
CPUID(0x80000005, descs[0], descs[1], descs[2], descs[3]);
+ ci->ci_amdcacheinfo[0] = descs[0];
+ ci->ci_amdcacheinfo[1] = descs[1];
+ ci->ci_amdcacheinfo[2] = descs[2];
+ ci->ci_amdcacheinfo[3] = descs[3];
/*
* K6-III and higher have large page TLBs.
@@ -230,6 +234,10 @@ amd_cpu_cacheinfo(struct cpu_info *ci)
}
CPUID(0x80000006, descs[0], descs[1], descs[2], descs[3]);
+ ci->ci_extcacheinfo[0] = descs[0];
+ ci->ci_extcacheinfo[1] = descs[1];
+ ci->ci_extcacheinfo[2] = descs[2];
+ ci->ci_extcacheinfo[3] = descs[3];
cai = &ci->ci_cinfo[CAI_L2CACHE];
cai->cai_totalsize = AMD_L2_ECX_C_SIZE(descs[2]);
diff --git a/sys/arch/amd64/amd64/conf.c b/sys/arch/amd64/amd64/conf.c
index 0bf95c223e4..0c9c20d898e 100644
--- a/sys/arch/amd64/amd64/conf.c
+++ b/sys/arch/amd64/amd64/conf.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: conf.c,v 1.51 2015/10/23 15:10:52 claudio Exp $ */
+/* $OpenBSD: conf.c,v 1.52 2015/11/13 07:52:20 mlarkin Exp $ */
/*
* Copyright (c) 1994, 1995 Charles M. Hannum. All rights reserved.
@@ -103,6 +103,15 @@ int nblkdev = nitems(bdevsw);
(dev_type_stop((*))) enodev, 0, seltrue, \
(dev_type_mmap((*))) enodev, 0 }
+/*
+ * cdevsw initializer for vmm: open, close and ioctl are routed to the
+ * driver's entry points through dev_init(); read, write, stop and mmap are
+ * stubbed out with enodev, and the select slot uses seltrue.
+ */
+#define cdev_vmm_init(c,n) { \
+ dev_init(c,n,open), dev_init(c,n,close), \
+ (dev_type_read((*))) enodev, \
+ (dev_type_write((*))) enodev, \
+ dev_init(c,n,ioctl), \
+ (dev_type_stop((*))) enodev, 0, seltrue, \
+ (dev_type_mmap((*))) enodev }
+
#define mmread mmrw
#define mmwrite mmrw
@@ -154,6 +163,8 @@ cdev_decl(cztty);
cdev_decl(nvram);
#include "drm.h"
cdev_decl(drm);
+#include "vmm.h"
+cdev_decl(vmm);
#include "wsdisplay.h"
#include "wskbd.h"
@@ -184,7 +195,7 @@ struct cdevsw cdevsw[] =
cdev_log_init(1,log), /* 7: /dev/klog */
cdev_tty_init(NCOM,com), /* 8: serial port */
cdev_disk_init(NFD,fd), /* 9: floppy disk */
- cdev_notdef(), /* 10 */
+ cdev_vmm_init(NVMM,vmm), /* 10 vmm */
cdev_notdef(), /* 11: Sony CD-ROM */
cdev_wsdisplay_init(NWSDISPLAY, /* 12: frame buffers, etc. */
wsdisplay),
diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c
index 588090c0157..97013c18cc9 100644
--- a/sys/arch/amd64/amd64/cpu.c
+++ b/sys/arch/amd64/amd64/cpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.c,v 1.88 2015/07/18 19:21:02 sf Exp $ */
+/* $OpenBSD: cpu.c,v 1.89 2015/11/13 07:52:20 mlarkin Exp $ */
/* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */
/*-
@@ -66,6 +66,7 @@
#include "lapic.h"
#include "ioapic.h"
+#include "vmm.h"
#include <sys/param.h>
#include <sys/timeout.h>
@@ -114,6 +115,9 @@ int cpu_match(struct device *, void *, void *);
void cpu_attach(struct device *, struct device *, void *);
int cpu_activate(struct device *, int);
void patinit(struct cpu_info *ci);
+#ifdef VMM
+void cpu_init_vmm(struct cpu_info *ci);
+#endif /* VMM */
struct cpu_softc {
struct device sc_dev; /* device tree glue */
@@ -463,6 +467,9 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
sc->sc_dev.dv_xname, pcb, pcb->pcb_rsp);
}
#endif
+#ifdef VMM
+ cpu_init_vmm(ci);
+#endif /* VMM */
}
/*
@@ -485,12 +492,12 @@ cpu_init(struct cpu_info *ci)
lcr0(rcr0() | CR0_WP);
cr4 = rcr4() | CR4_DEFAULT;
- if (ci->ci_feature_sefflags & SEFF0EBX_SMEP)
+ if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMEP)
cr4 |= CR4_SMEP;
#ifndef SMALL_KERNEL
- if (ci->ci_feature_sefflags & SEFF0EBX_SMAP)
+ if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)
cr4 |= CR4_SMAP;
- if (ci->ci_feature_sefflags & SEFF0EBX_FSGSBASE)
+ if (ci->ci_feature_sefflags_ebx & SEFF0EBX_FSGSBASE)
cr4 |= CR4_FSGSBASE;
#endif
if (cpu_ecxfeature & CPUIDECX_XSAVE)
@@ -515,6 +522,30 @@ cpu_init(struct cpu_info *ci)
#endif
}
+#ifdef VMM
+/*
+ * cpu_init_vmm
+ *
+ * Prepares the per-cpu VMM state of 'ci'. For a CPU flagged CI_VMM_VMX this
+ * allocates one zeroed page to serve as the VMXON region and records both
+ * its kernel virtual address (ci_vmxon_region) and its physical address
+ * (ci_vmxon_region_pa). CPUs without CI_VMM_VMX are left untouched.
+ *
+ * Panics if the physical address of the new region cannot be resolved.
+ *
+ * Parameters:
+ *  ci: the cpu for which state is being initialized
+ */
+void
+cpu_init_vmm(struct cpu_info *ci)
+{
+	vaddr_t region_va;
+
+	/* Only VMX CPUs need a VMXON region */
+	if ((ci->ci_vmm_flags & CI_VMM_VMX) == 0)
+		return;
+
+	ci->ci_vmxon_region = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
+	region_va = (vaddr_t)ci->ci_vmxon_region;
+
+	/* Remember where the region lives in physical memory */
+	if (pmap_extract(pmap_kernel(), region_va,
+	    &ci->ci_vmxon_region_pa) == 0)
+		panic("Can't locate VMXON region in phys mem\n");
+}
+#endif /* VMM */
#ifdef MULTIPROCESSOR
void
@@ -813,13 +844,6 @@ patinit(struct cpu_info *ci)
if ((ci->ci_feature_flags & CPUID_PAT) == 0)
return;
-#define PATENTRY(n, type) (type << ((n) * 8))
-#define PAT_UC 0x0UL
-#define PAT_WC 0x1UL
-#define PAT_WT 0x4UL
-#define PAT_WP 0x5UL
-#define PAT_WB 0x6UL
-#define PAT_UCMINUS 0x7UL
/*
* Set up PAT bits.
* The default pat table is the following:
diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c
index 352c3f39beb..6183e7743b0 100644
--- a/sys/arch/amd64/amd64/identcpu.c
+++ b/sys/arch/amd64/amd64/identcpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: identcpu.c,v 1.65 2015/11/07 01:37:26 naddy Exp $ */
+/* $OpenBSD: identcpu.c,v 1.66 2015/11/13 07:52:20 mlarkin Exp $ */
/* $NetBSD: identcpu.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */
/*
@@ -39,12 +39,18 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
+
+#include "vmm.h"
+
#include <machine/cpu.h>
#include <machine/cpufunc.h>
void replacesmap(void);
u_int64_t cpu_tsc_freq(struct cpu_info *);
u_int64_t cpu_tsc_freq_ctr(struct cpu_info *);
+#ifdef VMM
+void cpu_check_vmm_cap(struct cpu_info *);
+#endif /* VMM */
/* sysctl wants this. */
char cpu_model[48];
@@ -167,6 +173,9 @@ const struct {
{ SEFF0EBX_RDSEED, "RDSEED" },
{ SEFF0EBX_ADX, "ADX" },
{ SEFF0EBX_SMAP, "SMAP" },
+}, cpu_seff0_ecxfeatures[] = {
+ { SEFF0ECX_PREFETCHWT1, "PREFETCHWT1" },
+ { SEFF0ECX_PKU, "PKU" },
}, cpu_tpm_eaxfeatures[] = {
{ TPM_SENSOR, "SENSOR" },
{ TPM_ARAT, "ARAT" },
@@ -406,32 +415,32 @@ cpu_tsc_freq(struct cpu_info *ci)
void
identifycpu(struct cpu_info *ci)
{
- u_int32_t dummy, val, pnfeatset;
- u_int32_t brand[12];
+ u_int32_t dummy, val;
char mycpu_model[48];
int i;
char *brandstr_from, *brandstr_to;
int skipspace;
CPUID(1, ci->ci_signature, val, dummy, ci->ci_feature_flags);
- CPUID(0x80000000, pnfeatset, dummy, dummy, dummy);
- if (pnfeatset >= 0x80000001) {
- u_int32_t ecx;
-
- CPUID(0x80000001, dummy, dummy,
- ecx, ci->ci_feature_eflags);
+ CPUID(0x80000000, ci->ci_pnfeatset, dummy, dummy, dummy);
+ if (ci->ci_pnfeatset >= 0x80000001) {
+ CPUID(0x80000001, ci->ci_efeature_eax, dummy,
+ ci->ci_efeature_ecx, ci->ci_feature_eflags);
/* Other bits may clash */
ci->ci_feature_flags |= (ci->ci_feature_eflags & CPUID_NXE);
if (ci->ci_flags & CPUF_PRIMARY)
- ecpu_ecxfeature = ecx;
+ ecpu_ecxfeature = ci->ci_efeature_ecx;
/* Let cpu_feature be the common bits */
cpu_feature &= ci->ci_feature_flags;
}
- CPUID(0x80000002, brand[0], brand[1], brand[2], brand[3]);
- CPUID(0x80000003, brand[4], brand[5], brand[6], brand[7]);
- CPUID(0x80000004, brand[8], brand[9], brand[10], brand[11]);
- strlcpy(mycpu_model, (char *)brand, sizeof(mycpu_model));
+ CPUID(0x80000002, ci->ci_brand[0],
+ ci->ci_brand[1], ci->ci_brand[2], ci->ci_brand[3]);
+ CPUID(0x80000003, ci->ci_brand[4],
+ ci->ci_brand[5], ci->ci_brand[6], ci->ci_brand[7]);
+ CPUID(0x80000004, ci->ci_brand[8],
+ ci->ci_brand[9], ci->ci_brand[10], ci->ci_brand[11]);
+ strlcpy(mycpu_model, (char *)ci->ci_brand, sizeof(mycpu_model));
/* Remove leading, trailing and duplicated spaces from mycpu_model */
brandstr_from = brandstr_to = mycpu_model;
@@ -524,11 +533,16 @@ identifycpu(struct cpu_info *ci)
if (cpuid_level >= 0x07) {
/* "Structured Extended Feature Flags" */
- CPUID_LEAF(0x7, 0, dummy, ci->ci_feature_sefflags, dummy, dummy);
+ CPUID_LEAF(0x7, 0, dummy, ci->ci_feature_sefflags_ebx,
+ ci->ci_feature_sefflags_ecx, dummy);
for (i = 0; i < nitems(cpu_seff0_ebxfeatures); i++)
- if (ci->ci_feature_sefflags &
+ if (ci->ci_feature_sefflags_ebx &
cpu_seff0_ebxfeatures[i].bit)
printf(",%s", cpu_seff0_ebxfeatures[i].str);
+ for (i = 0; i < nitems(cpu_seff0_ecxfeatures); i++)
+ if (ci->ci_feature_sefflags_ecx &
+ cpu_seff0_ecxfeatures[i].bit)
+ printf(",%s", cpu_seff0_ecxfeatures[i].str);
}
if (!strcmp(cpu_vendor, "GenuineIntel") && cpuid_level >= 0x06 ) {
@@ -546,10 +560,10 @@ identifycpu(struct cpu_info *ci)
#ifndef SMALL_KERNEL
if (ci->ci_flags & CPUF_PRIMARY) {
if (!strcmp(cpu_vendor, "AuthenticAMD") &&
- pnfeatset >= 0x80000007) {
- CPUID(0x80000007, dummy, dummy, dummy, pnfeatset);
+ ci->ci_pnfeatset >= 0x80000007) {
+ CPUID(0x80000007, dummy, dummy, dummy, val);
- if (pnfeatset & 0x06) {
+ if (val & 0x06) {
if ((ci->ci_signature & 0xF00) == 0xF00)
setperf_setup = k8_powernow_init;
}
@@ -576,7 +590,7 @@ identifycpu(struct cpu_info *ci)
has_hv_cpuid = 1;
#endif
- if (ci->ci_feature_sefflags & SEFF0EBX_SMAP)
+ if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)
replacesmap();
}
if (!strncmp(mycpu_model, "Intel", 5)) {
@@ -614,6 +628,9 @@ identifycpu(struct cpu_info *ci)
}
cpu_topology(ci);
+#ifdef VMM
+ cpu_check_vmm_cap(ci);
+#endif /* VMM */
}
#ifndef SMALL_KERNEL
@@ -736,3 +753,96 @@ no_topology:
ci->ci_core_id = ci->ci_cpuid;
ci->ci_pkg_id = 0;
}
+
+#ifdef VMM
+/*
+ * cpu_check_vmm_cap
+ *
+ * Probes for hardware virtualization support and records the result in
+ * ci->ci_vmm_flags:
+ *  CI_VMM_VMX: cpu_ecxfeature advertises VMX and the feature control MSR is
+ *   either not locked, or locked with VMX enabled
+ *  CI_VMM_EPT: VMX, secondary controls and EPT are all reported available
+ *  CI_VMM_SVM: ecpu_ecxfeature advertises SVM and AMD_SVMDIS is clear
+ *  CI_VMM_RVI: SVM plus the nested paging capability bit
+ *
+ * For VMX CPUs the CR0/CR4 fixed-bit MSRs, the VMXON revision id and the
+ * MSR save/load table size are also cached in ci->ci_vmm_cap.vcc_vmx.
+ *
+ * Parameters:
+ *  ci: the cpu being checked
+ */
+void
+cpu_check_vmm_cap(struct cpu_info *ci)
+{
+	uint64_t msr;
+	uint32_t cap, dummy;
+
+	/*
+	 * Check for workable VMX
+	 */
+	if (cpu_ecxfeature & CPUIDECX_VMX) {
+		msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+
+		/* Usable if still unlocked, or locked with VMX turned on */
+		if (!(msr & IA32_FEATURE_CONTROL_LOCK) ||
+		    (msr & IA32_FEATURE_CONTROL_VMX_EN))
+			ci->ci_vmm_flags |= CI_VMM_VMX;
+	}
+
+	if (ci->ci_vmm_flags & CI_VMM_VMX) {
+		/*
+		 * Check for EPT (Intel Nested Paging). Each control is
+		 * tested in the upper 32 bits of its capability MSR, hence
+		 * the shift by 32.
+		 */
+		/* XXX should we check true procbased ctls here if avail? */
+		msr = rdmsr(IA32_VMX_PROCBASED_CTLS);
+		if (msr & ((IA32_VMX_ACTIVATE_SECONDARY_CONTROLS) << 32)) {
+			/* Secondary controls exist; is EPT among them? */
+			msr = rdmsr(IA32_VMX_PROCBASED2_CTLS);
+			if (msr & ((IA32_VMX_ENABLE_EPT) << 32))
+				ci->ci_vmm_flags |= CI_VMM_EPT;
+		}
+
+		/*
+		 * Check startup config (VMX)
+		 */
+
+		/* CR0 fixed and flexible bits */
+		ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0 =
+		    rdmsr(IA32_VMX_CR0_FIXED0);
+		ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1 =
+		    rdmsr(IA32_VMX_CR0_FIXED1);
+
+		/* CR4 fixed and flexible bits */
+		ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0 =
+		    rdmsr(IA32_VMX_CR4_FIXED0);
+		ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1 =
+		    rdmsr(IA32_VMX_CR4_FIXED1);
+
+		/* VMXON region revision ID (bits 30:0 of IA32_VMX_BASIC) */
+		msr = rdmsr(IA32_VMX_BASIC);
+		ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision =
+		    (uint32_t)(msr & 0x7FFFFFFF);
+
+		/* MSR save / load table size */
+		msr = rdmsr(IA32_VMX_MISC);
+		ci->ci_vmm_cap.vcc_vmx.vmx_msr_table_size =
+		    (uint32_t)(msr & IA32_VMX_MSR_LIST_SIZE_MASK) >> 25;
+	}
+
+	/*
+	 * Check for workable SVM
+	 */
+	if (ecpu_ecxfeature & CPUIDECX_SVM) {
+		msr = rdmsr(MSR_AMD_VM_CR);
+
+		if ((msr & AMD_SVMDIS) == 0)
+			ci->ci_vmm_flags |= CI_VMM_SVM;
+	}
+
+	/*
+	 * Check for SVM Nested Paging
+	 */
+	if (ci->ci_vmm_flags & CI_VMM_SVM) {
+		CPUID(CPUID_AMD_SVM_CAP, dummy, dummy, dummy, cap);
+		if (cap & AMD_SVM_NESTED_PAGING_CAP)
+			ci->ci_vmm_flags |= CI_VMM_RVI;
+	}
+}
+#endif /* VMM */
diff --git a/sys/arch/amd64/amd64/ipifuncs.c b/sys/arch/amd64/amd64/ipifuncs.c
index 0279d446ea5..03f4fa827b6 100644
--- a/sys/arch/amd64/amd64/ipifuncs.c
+++ b/sys/arch/amd64/amd64/ipifuncs.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ipifuncs.c,v 1.26 2015/03/14 03:38:46 jsg Exp $ */
+/* $OpenBSD: ipifuncs.c,v 1.27 2015/11/13 07:52:20 mlarkin Exp $ */
/* $NetBSD: ipifuncs.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */
/*-
@@ -54,12 +54,22 @@
#include <machine/db_machdep.h>
+#include "vmm.h"
+#ifdef VMM
+#include <machine/vmmvar.h>
+#endif /* VMM */
+
void x86_64_ipi_nop(struct cpu_info *);
void x86_64_ipi_halt(struct cpu_info *);
void x86_64_ipi_synch_fpu(struct cpu_info *);
void x86_64_ipi_flush_fpu(struct cpu_info *);
+#ifdef VMM
+void x86_64_ipi_start_vmm(struct cpu_info *);
+void x86_64_ipi_stop_vmm(struct cpu_info *);
+#endif /* VMM */
+
#ifdef HIBERNATE
void x86_64_ipi_halt_realmode(struct cpu_info *);
extern void hibernate_drop_to_real_mode(void);
@@ -85,6 +95,13 @@ void (*ipifunc[X86_NIPI])(struct cpu_info *) =
#else
NULL,
#endif
+#ifdef VMM
+ x86_64_ipi_start_vmm,
+ x86_64_ipi_stop_vmm,
+#else
+ NULL,
+ NULL,
+#endif /* VMM */
};
void
@@ -132,3 +149,17 @@ x86_64_ipi_reload_mtrr(struct cpu_info *ci)
mem_range_softc.mr_op->reload(&mem_range_softc);
}
#endif
+
+#ifdef VMM
+/*
+ * x86_64_ipi_start_vmm
+ *
+ * IPI handler installed in the ipifunc[] table: forwards to
+ * start_vmm_on_cpu() for the cpu 'ci' handed to the handler.
+ */
+void
+x86_64_ipi_start_vmm(struct cpu_info *ci)
+{
+ start_vmm_on_cpu(ci);
+}
+
+/*
+ * x86_64_ipi_stop_vmm
+ *
+ * IPI handler installed in the ipifunc[] table: forwards to
+ * stop_vmm_on_cpu() for the cpu 'ci' handed to the handler.
+ */
+void
+x86_64_ipi_stop_vmm(struct cpu_info *ci)
+{
+ stop_vmm_on_cpu(ci);
+}
+#endif /* VMM */
diff --git a/sys/arch/amd64/amd64/mainbus.c b/sys/arch/amd64/amd64/mainbus.c
index b236fed7ade..8baf89862fa 100644
--- a/sys/arch/amd64/amd64/mainbus.c
+++ b/sys/arch/amd64/amd64/mainbus.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mainbus.c,v 1.33 2015/08/31 19:56:32 kettenis Exp $ */
+/* $OpenBSD: mainbus.c,v 1.34 2015/11/13 07:52:20 mlarkin Exp $ */
/* $NetBSD: mainbus.c,v 1.1 2003/04/26 18:39:29 fvdl Exp $ */
/*
@@ -48,6 +48,7 @@
#include "ipmi.h"
#include "bios.h"
#include "mpbios.h"
+#include "vmm.h"
#include "pvbus.h"
#include "efifb.h"
@@ -239,6 +240,11 @@ mainbus_attach(struct device *parent, struct device *self, void *aux)
config_found(self, &mba_iba, mainbus_print);
#endif
+#ifdef VMM
+ mba.mba_busname = "vmm";
+ config_found(self, &mba.mba_busname, mainbus_print);
+#endif /* VMM */
+
#if NEFIFB > 0
if (bios_efiinfo != NULL) {
mba.mba_eaa.eaa_name = "efifb";
diff --git a/sys/arch/amd64/amd64/vmm.c b/sys/arch/amd64/amd64/vmm.c
new file mode 100644
index 00000000000..df72910cfdb
--- /dev/null
+++ b/sys/arch/amd64/amd64/vmm.c
@@ -0,0 +1,3414 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/device.h>
+#include <sys/pool.h>
+#include <sys/proc.h>
+#include <sys/ioctl.h>
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <uvm/uvm.h>
+#include <machine/pmap.h>
+#include <machine/biosvar.h>
+#include <machine/segments.h>
+#include <machine/cpufunc.h>
+#include <machine/vmmvar.h>
+#include <machine/i82489reg.h>
+#include <dev/isa/isareg.h>
+
+/* Name of the device a softc belongs to, as stored in its struct device */
+#define DEVNAME(s) ((s)->sc_dev.dv_xname)
+
+/*
+ * CTRL_DUMP(x, y, z)
+ *
+ * Prints, for vcpu 'x', whether the control IA32_VMX_<z> of capability MSR
+ * IA32_VMX_<y>_CTLS can be set and whether it can be cleared, as reported
+ * by vcpu_vmx_check_cap(). 'z' is also stringified as the label.
+ *
+ * NOTE(review): the expansion already ends in a semicolon, so the macro is
+ * only safe as a stand-alone statement (not inside an unbraced if/else).
+ */
+#define CTRL_DUMP(x,y,z) printf(" %s: Can set:%s Can clear:%s\n", #z , \
+ vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \
+ IA32_VMX_##z, 1) ? "Yes" : "No", \
+ vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \
+ IA32_VMX_##z, 0) ? "Yes" : "No");
+
+/* Head type for the singly-linked list of VMs */
+SLIST_HEAD(vmlist_head, vm);
+
+/*
+ * Driver state for vmm. vmm_attach() fills it in and publishes it through
+ * the global 'vmm_softc' pointer.
+ */
+struct vmm_softc {
+ struct device sc_dev;
+
+ /* Capabilities */
+ /* Number of CPUs with each CI_VMM_* flag set, counted in vmm_attach() */
+ uint32_t nr_vmx_cpus;
+ uint32_t nr_svm_cpus;
+ uint32_t nr_rvi_cpus;
+ uint32_t nr_ept_cpus;
+
+ /* Managed VMs */
+ struct vmlist_head vm_list;
+
+ /* One of the VMM_MODE_* values, selected in vmm_attach() */
+ int mode;
+
+ /* Held (read or write) by the code that walks or changes vm_list */
+ struct rwlock vm_lock;
+ /*
+  * Both start at 0 in vmm_attach(); presumably the number of VMs and the
+  * next VM id to hand out -- TODO confirm against vm_create().
+  */
+ size_t vm_ct;
+ size_t vm_idx;
+};
+
+int vmm_probe(struct device *, void *, void *);
+void vmm_attach(struct device *, struct device *, void *);
+int vmm_activate(struct device *, int);
+int vmmopen(dev_t, int, int, struct proc *);
+int vmmioctl(dev_t, u_long, caddr_t, int, struct proc *);
+int vmmclose(dev_t, int, int, struct proc *);
+int vmm_start(void);
+int vmm_stop(void);
+int vm_create(struct vm_create_params *, struct proc *);
+int vm_run(struct vm_run_params *);
+int vm_terminate(struct vm_terminate_params *);
+int vm_get_info(struct vm_info_params *);
+int vm_writepage(struct vm_writepage_params *);
+int vm_readpage(struct vm_readpage_params *);
+int vcpu_init(struct vcpu *);
+int vcpu_init_vmx(struct vcpu *);
+int vcpu_init_svm(struct vcpu *);
+int vcpu_run_vmx(struct vcpu *, uint8_t, int16_t *);
+int vcpu_run_svm(struct vcpu *, uint8_t);
+void vcpu_deinit(struct vcpu *);
+void vcpu_deinit_vmx(struct vcpu *);
+void vcpu_deinit_svm(struct vcpu *);
+int vm_impl_init(struct vm *);
+int vm_impl_init_vmx(struct vm *);
+int vm_impl_init_svm(struct vm *);
+void vm_impl_deinit(struct vm *);
+void vm_impl_deinit_vmx(struct vm *);
+void vm_impl_deinit_svm(struct vm *);
+void vm_teardown(struct vm *);
+int vcpu_vmx_check_cap(struct vcpu *, uint32_t, uint32_t, int);
+int vcpu_vmx_compute_ctrl(struct vcpu *, uint64_t, uint16_t, uint32_t,
+ uint32_t, uint32_t *);
+int vmx_handle_exit(struct vcpu *, int *);
+int vmx_handle_cpuid(struct vcpu *);
+int vmx_handle_cr(struct vcpu *);
+int vmx_handle_inout(struct vcpu *);
+int vmx_handle_hlt(struct vcpu *);
+void vmx_handle_intr(struct vcpu *);
+void vmx_handle_intwin(struct vcpu *);
+int vmm_get_guest_memtype(struct vm *, paddr_t);
+int vmm_get_guest_faulttype(void);
+int vmx_get_guest_faulttype(void);
+int svm_get_guest_faulttype(void);
+int vmx_get_exit_qualification(uint64_t *);
+int vmx_fault_page(struct vcpu *, paddr_t);
+int vmx_handle_np_fault(struct vcpu *);
+int vmx_fix_ept_pte(struct pmap *, vaddr_t);
+const char *vmx_exit_reason_decode(uint32_t);
+const char *vmx_instruction_error_decode(uint32_t);
+void dump_vcpu(struct vcpu *);
+
+const char *vmm_hv_signature = VMM_HV_SIGNATURE;
+
+struct cfdriver vmm_cd = {
+ NULL, "vmm", DV_DULL
+};
+
+struct cfattach vmm_ca = {
+ sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL,
+ vmm_activate
+};
+
+/* Pools for VMs and VCPUs */
+struct pool vm_pool;
+struct pool vcpu_pool;
+
+struct vmm_softc *vmm_softc;
+
+/* IDT information used when populating host state area */
+extern vaddr_t idt_vaddr;
+extern struct gate_descriptor *idt;
+
+/* XXX Temporary hack for the PIT clock */
+#define CLOCK_BIAS 8192
+uint64_t vmmclk = 0;
+
+/* Constants used in "CR access exit" */
+#define CR_WRITE 0
+#define CR_READ 1
+#define CR_CLTS 2
+#define CR_LMSW 3
+
+/*
+ * vmm_probe
+ *
+ * Autoconf match routine. 'aux' points at the bus name being probed; the
+ * probe is ignored unless that name equals vmm_cd.cd_name.
+ *
+ * Every CPU is inspected for CI_VMM_VMX and CI_VMM_SVM. Returns 1 when
+ * exactly one of the two technologies is present in the system, and 0 when
+ * neither is, or when both are (mixing VMX and SVM is not supported).
+ */
+int
+vmm_probe(struct device *parent, void *match, void *aux)
+{
+	const char **busname = (const char **)aux;
+	struct cpu_info *ci;
+	CPU_INFO_ITERATOR cii;
+	int have_vmx = 0, have_svm = 0;
+
+	/* Check if this probe is for us */
+	if (strcmp(*busname, vmm_cd.cd_name) != 0)
+		return (0);
+
+	/* Note which technologies appear on at least one CPU */
+	CPU_INFO_FOREACH(cii, ci) {
+		have_vmx |= ((ci->ci_vmm_flags & CI_VMM_VMX) != 0);
+		have_svm |= ((ci->ci_vmm_flags & CI_VMM_SVM) != 0);
+	}
+
+	/* Match only if exactly one of VMX / SVM was seen */
+	return (have_vmx != have_svm);
+}
+
+/*
+ * vmm_attach
+ *
+ * Counts how many CPUs offer VMX, SVM, RVI and EPT and prints the totals
+ * into dmesg. Initializes the VM list, its lock and the vm/vcpu pools, then
+ * selects the operating mode and publishes the softc through the global
+ * 'vmm_softc'.
+ *
+ * The mode is picked in order of preference EPT, VMX, RVI, SVM. If no CPU
+ * offers any of them the mode stays VMM_MODE_UNKNOWN (instead of being
+ * forced to SVM), which is the value vmmioctl tests to refuse requests.
+ */
+void
+vmm_attach(struct device *parent, struct device *self, void *aux)
+{
+	struct vmm_softc *sc;
+	struct cpu_info *ci;
+	CPU_INFO_ITERATOR cii;
+
+	sc = (struct vmm_softc *)self;
+	sc->nr_vmx_cpus = 0;
+	sc->nr_svm_cpus = 0;
+	sc->nr_rvi_cpus = 0;
+	sc->nr_ept_cpus = 0;
+	sc->vm_ct = 0;
+	sc->vm_idx = 0;
+
+	/* Calculate CPU features */
+	CPU_INFO_FOREACH(cii, ci) {
+		if (ci->ci_vmm_flags & CI_VMM_VMX)
+			sc->nr_vmx_cpus++;
+		if (ci->ci_vmm_flags & CI_VMM_SVM)
+			sc->nr_svm_cpus++;
+		if (ci->ci_vmm_flags & CI_VMM_RVI)
+			sc->nr_rvi_cpus++;
+		if (ci->ci_vmm_flags & CI_VMM_EPT)
+			sc->nr_ept_cpus++;
+	}
+
+	SLIST_INIT(&sc->vm_list);
+	rw_init(&sc->vm_lock, "vmlistlock");
+
+	printf(": initialized\n");
+
+	if (sc->nr_vmx_cpus)
+		printf("%s: %u VMX capable CPU(s), %u are EPT capable\n",
+		    DEVNAME(sc), sc->nr_vmx_cpus, sc->nr_ept_cpus);
+	if (sc->nr_svm_cpus)
+		printf("%s: %u SVM capable CPU(s), %u are RVI capable\n",
+		    DEVNAME(sc), sc->nr_svm_cpus, sc->nr_rvi_cpus);
+
+	pool_init(&vm_pool, sizeof(struct vm), 0, 0, PR_WAITOK, "vmpool",
+	    NULL);
+	pool_init(&vcpu_pool, sizeof(struct vcpu), 0, 0, PR_WAITOK, "vcpupl",
+	    NULL);
+
+	/* Only claim a mode that at least one CPU actually supports */
+	sc->mode = VMM_MODE_UNKNOWN;
+	if (sc->nr_ept_cpus > 0)
+		sc->mode = VMM_MODE_EPT;
+	else if (sc->nr_vmx_cpus > 0)
+		sc->mode = VMM_MODE_VMX;
+	else if (sc->nr_rvi_cpus > 0)
+		sc->mode = VMM_MODE_RVI;
+	else if (sc->nr_svm_cpus > 0)
+		sc->mode = VMM_MODE_SVM;
+
+	vmm_softc = sc;
+}
+
+/*
+ * vmm_activate
+ *
+ * Autoconf routine used during activate/deactivate. Currently a stub: both
+ * 'self' and 'act' are ignored and 0 is always returned.
+ *
+ * XXX need this for suspend/resume
+ */
+int
+vmm_activate(struct device *self, int act)
+{
+ return 0;
+}
+
+/*
+ * vmmopen
+ *
+ * Called during open of /dev/vmm.
+ *
+ * Returns ENODEV if the vmm device never attached (vmm_softc is only set
+ * at the end of vmm_attach), so that later ioctls cannot be issued against
+ * a missing softc. Returns 0 otherwise.
+ */
+int
+vmmopen(dev_t dev, int flag, int mode, struct proc *p)
+{
+	/* Don't allow open if we didn't attach */
+	if (vmm_softc == NULL)
+		return (ENODEV);
+
+	return (0);
+}
+
+/*
+ * vmmioctl
+ *
+ * Main ioctl dispatch routine for /dev/vmm. Parses ioctl type and calls
+ * appropriate lower level handler routine. Returns result to ioctl caller.
+ *
+ * Return Values:
+ *  ENODEV: the vmm device never attached (no softc to operate on)
+ *  ENOTTY: no supported CPUs were found, or 'cmd' is not a vmm ioctl
+ *  otherwise the value returned by the handler for 'cmd'
+ */
+int
+vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+	int ret;
+
+	/* vmm_softc stays NULL unless vmm_attach ran; never dereference it */
+	if (vmm_softc == NULL)
+		return (ENODEV);
+
+	/* Don't allow ioctls if we have no supported CPUs */
+	if (vmm_softc->mode == VMM_MODE_UNKNOWN)
+		return (ENOTTY);
+
+	switch (cmd) {
+	case VMM_IOC_START:
+		ret = vmm_start();
+		break;
+	case VMM_IOC_STOP:
+		ret = vmm_stop();
+		break;
+	case VMM_IOC_CREATE:
+		ret = vm_create((struct vm_create_params *)data, p);
+		break;
+	case VMM_IOC_RUN:
+		ret = vm_run((struct vm_run_params *)data);
+		break;
+	case VMM_IOC_INFO:
+		ret = vm_get_info((struct vm_info_params *)data);
+		break;
+	case VMM_IOC_TERM:
+		ret = vm_terminate((struct vm_terminate_params *)data);
+		break;
+	case VMM_IOC_WRITEPAGE:
+		ret = vm_writepage((struct vm_writepage_params *)data);
+		break;
+	case VMM_IOC_READPAGE:
+		ret = vm_readpage((struct vm_readpage_params *)data);
+		break;
+	default:
+		ret = ENOTTY;
+		break;
+	}
+
+	return (ret);
+}
+
+/*
+ * vmmclose
+ *
+ * Called when /dev/vmm is closed. Presently unused: all arguments are
+ * ignored and 0 is always returned.
+ */
+int
+vmmclose(dev_t dev, int flag, int mode, struct proc *p)
+{
+ return 0;
+}
+
+/*
+ * vm_readpage
+ *
+ * Reads a region (PAGE_SIZE max) of guest physical memory using the parameters
+ * defined in 'vrp'. The region must lie entirely within one guest page.
+ *
+ * The VM list lock is held (read) for the duration of the lookup and copy
+ * and is released on every return path.
+ *
+ * Returns 0 if successful, or various error codes on failure:
+ *  ENOENT if the VM id contained in 'vrp' refers to an unknown VM
+ *  EINVAL if the memory region described by vrp is not regular memory, or
+ *      if it would extend past the end of the page containing vrp_paddr
+ *  EFAULT if the memory region described by vrp has not yet been faulted in
+ *      by the guest, if temporary KVA could not be allocated, or if the
+ *      copy to the caller's buffer failed
+ */
+int
+vm_readpage(struct vm_readpage_params *vrp)
+{
+	struct vm *vm;
+	paddr_t host_pa;
+	void *kva;
+	int found, ret = 0;
+	vaddr_t vr_page, pgoff;
+
+	/*
+	 * Only a single page gets mapped below, so the caller-supplied
+	 * length must not run past the end of that page. Without this check
+	 * copyout would read kernel memory beyond the temporary mapping.
+	 */
+	pgoff = (vaddr_t)vrp->vrp_paddr & PAGE_MASK;
+	if (vrp->vrp_len > PAGE_SIZE - pgoff)
+		return (EINVAL);
+
+	/* Find the desired VM */
+	rw_enter_read(&vmm_softc->vm_lock);
+	found = 0;
+	SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+		if (vm->vm_id == vrp->vrp_vm_id) {
+			found = 1;
+			break;
+		}
+	}
+
+	/* Not found? exit. */
+	if (!found) {
+		ret = ENOENT;
+		goto out;
+	}
+
+	/* Calculate page containing vrp->vrp_paddr */
+	vr_page = vrp->vrp_paddr & ~PAGE_MASK;
+
+	/* If not regular memory, exit. */
+	if (vmm_get_guest_memtype(vm, vr_page) != VMM_MEM_TYPE_REGULAR) {
+		ret = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Find the phys page where this guest page exists in real memory.
+	 * This path must drop vm_lock like every other exit.
+	 */
+	if (!pmap_extract(vm->vm_map->pmap, vr_page, &host_pa)) {
+		ret = EFAULT;
+		goto out;
+	}
+
+	/* Allocate temporary KVA for the guest page */
+	kva = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
+	if (kva == NULL) {
+		dprintf("vm_readpage: can't alloc kva\n");
+		ret = EFAULT;
+		goto out;
+	}
+
+	/* Enter the mapping in the kernel pmap and copyout */
+	pmap_kenter_pa((vaddr_t)kva, host_pa, PROT_READ);
+
+	if (copyout((char *)kva + pgoff, vrp->vrp_data, vrp->vrp_len) != 0) {
+		dprintf("vm_readpage: can't copyout\n");
+		ret = EFAULT;
+	}
+
+	/* Cleanup and exit */
+	pmap_kremove((vaddr_t)kva, PAGE_SIZE);
+	km_free(kva, PAGE_SIZE, &kv_any, &kp_none);
+
+out:
+	rw_exit_read(&vmm_softc->vm_lock);
+
+	return (ret);
+}
+
+/*
+ * vm_writepage
+ *
+ * Writes a region (PAGE_SIZE max) of guest physical memory using the parameters
+ * defined in 'vwp'. The region must lie entirely within one guest page. If
+ * the guest page is not yet present it is faulted in first.
+ *
+ * The VM list lock is held (read) for the duration of the operation and is
+ * released on every return path.
+ *
+ * Returns 0 if successful, or various error codes on failure:
+ *  ENOENT if the VM id contained in 'vwp' refers to an unknown VM
+ *  EINVAL if the memory region described by vwp is not regular memory, or
+ *      if it would extend past the end of the page containing vwp_paddr
+ *  EFAULT if the source data in vwp contains an invalid address, if the
+ *      guest page could not be faulted in or mapped, or if the EPT entry
+ *      for the page could not be fixed up
+ *  ENOMEM if a memory allocation error occurs
+ */
+int
+vm_writepage(struct vm_writepage_params *vwp)
+{
+	char *pagedata = NULL;
+	struct vm *vm;
+	paddr_t host_pa;
+	void *kva;
+	int found, ret = 0;
+	vaddr_t vw_page, pgoff, dst;
+
+	/*
+	 * The bounce buffer and the temporary mapping below are each one
+	 * page long. Reject any request that would run past the end of the
+	 * target page; otherwise copyin overflows 'pagedata' and memcpy
+	 * writes beyond the mapped guest page.
+	 */
+	pgoff = (vaddr_t)vwp->vwp_paddr & PAGE_MASK;
+	if (vwp->vwp_len > PAGE_SIZE - pgoff)
+		return (EINVAL);
+
+	/* Find the desired VM */
+	rw_enter_read(&vmm_softc->vm_lock);
+	found = 0;
+	SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+		if (vm->vm_id == vwp->vwp_vm_id) {
+			found = 1;
+			break;
+		}
+	}
+
+	/* Not found? exit. */
+	if (!found) {
+		ret = ENOENT;
+		goto out;
+	}
+
+	/* Calculate page containing vwp->vwp_paddr */
+	vw_page = vwp->vwp_paddr & ~PAGE_MASK;
+
+	/* If not regular memory, exit. */
+	if (vmm_get_guest_memtype(vm, vw_page) != VMM_MEM_TYPE_REGULAR) {
+		ret = EINVAL;
+		goto out;
+	}
+
+	/* Allocate temporary region to copyin into */
+	pagedata = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
+	if (pagedata == NULL) {
+		ret = ENOMEM;
+		goto out;
+	}
+
+	/* Copy supplied data to kernel */
+	if (copyin(vwp->vwp_data, pagedata, vwp->vwp_len) != 0) {
+		ret = EFAULT;
+		goto out;
+	}
+
+	/* Find the phys page where this guest page exists in real memory */
+	if (!pmap_extract(vm->vm_map->pmap, vw_page, &host_pa)) {
+		/* page not present */
+		if (uvm_fault(vm->vm_map, vw_page,
+		    PROT_WRITE, PROT_READ | PROT_WRITE | PROT_EXEC)) {
+			ret = EFAULT;
+			goto out;
+		}
+
+		if (!pmap_extract(vm->vm_map->pmap, vw_page, &host_pa)) {
+			panic("vm_writepage: still not mapped GPA 0x%llx\n",
+			    (uint64_t)vwp->vwp_paddr);
+		}
+	}
+
+	/* Allocate kva for guest page */
+	kva = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
+	if (kva == NULL) {
+		dprintf("vm_writepage: can't alloc kva\n");
+		ret = EFAULT;
+		goto out;
+	}
+
+	/* Enter mapping and copy data */
+	pmap_kenter_pa((vaddr_t)kva, host_pa, PROT_READ | PROT_WRITE);
+	dst = (vaddr_t)kva + pgoff;
+	memcpy((void *)dst, pagedata, vwp->vwp_len);
+
+	/* Cleanup */
+	pmap_kremove((vaddr_t)kva, PAGE_SIZE);
+	km_free(kva, PAGE_SIZE, &kv_any, &kp_none);
+
+	/* Fixup the EPT map for this page */
+	if (vmx_fix_ept_pte(vm->vm_map->pmap, vw_page)) {
+		dprintf("vm_writepage: cant fixup ept pte for gpa 0x%llx\n",
+		    (uint64_t)vwp->vwp_paddr);
+		ret = EFAULT;
+	}
+
+out:
+	if (pagedata != NULL)
+		free(pagedata, M_DEVBUF, PAGE_SIZE);
+	rw_exit_read(&vmm_softc->vm_lock);
+
+	return (ret);
+}
+
+/*
+ * vmm_start
+ *
+ * Starts VMM mode on the system.
+ *
+ * On MULTIPROCESSOR kernels a start IPI is broadcast first and each other
+ * CPU is polled (up to 100000 times, with delay(10) between polls) until it
+ * shows CPUF_VMM. The calling CPU is then started directly. The outcome for
+ * every CPU is printed.
+ *
+ * Returns 0 if every CPU entered VMM mode, EIO if at least one did not.
+ */
+int
+vmm_start(void)
+{
+	struct cpu_info *self = curcpu();
+	int error = 0;
+#ifdef MULTIPROCESSOR
+	struct cpu_info *ci;
+	CPU_INFO_ITERATOR cii;
+	int spins;
+
+	/* Broadcast start VMM IPI */
+	x86_broadcast_ipi(X86_IPI_START_VMM);
+
+	CPU_INFO_FOREACH(cii, ci) {
+		if (ci == self)
+			continue;
+
+		/* Give the remote CPU a bounded amount of time to react */
+		spins = 100000;
+		while (spins > 0 && !(ci->ci_flags & CPUF_VMM)) {
+			delay(10);
+			spins--;
+		}
+
+		if (ci->ci_flags & CPUF_VMM)
+			printf("%s: entered VMM mode\n", ci->ci_dev->dv_xname);
+		else {
+			printf("%s: failed to enter VMM mode\n",
+			    ci->ci_dev->dv_xname);
+			error = EIO;
+		}
+	}
+#endif /* MULTIPROCESSOR */
+
+	/* Start VMM on this CPU */
+	start_vmm_on_cpu(self);
+	if (self->ci_flags & CPUF_VMM)
+		printf("%s: entered VMM mode\n", self->ci_dev->dv_xname);
+	else {
+		printf("%s: failed to enter VMM mode\n",
+		    self->ci_dev->dv_xname);
+		error = EIO;
+	}
+
+	return (error);
+}
+
+/*
+ * vmm_stop
+ *
+ * Stops VMM mode on the system.
+ *
+ * On MULTIPROCESSOR kernels a stop IPI is broadcast first and each other
+ * CPU is polled (up to 100000 times, with delay(10) between polls) until
+ * its CPUF_VMM flag clears. The calling CPU is then stopped directly. The
+ * outcome for every CPU is printed.
+ *
+ * Returns 0 if every CPU left VMM mode, EIO if at least one did not.
+ *
+ * XXX should restrict this function to not stop VMM mode while VMs are running
+ */
+int
+vmm_stop(void)
+{
+	struct cpu_info *self = curcpu();
+	int error = 0;
+#ifdef MULTIPROCESSOR
+	struct cpu_info *ci;
+	CPU_INFO_ITERATOR cii;
+	int spins;
+
+	/* Stop VMM on other CPUs */
+	x86_broadcast_ipi(X86_IPI_STOP_VMM);
+
+	CPU_INFO_FOREACH(cii, ci) {
+		if (ci == self)
+			continue;
+
+		/* Give the remote CPU a bounded amount of time to react */
+		spins = 100000;
+		while (spins > 0 && (ci->ci_flags & CPUF_VMM)) {
+			delay(10);
+			spins--;
+		}
+
+		if (!(ci->ci_flags & CPUF_VMM))
+			printf("%s: exited VMM mode\n", ci->ci_dev->dv_xname);
+		else {
+			printf("%s: failed to exit VMM mode\n",
+			    ci->ci_dev->dv_xname);
+			error = EIO;
+		}
+	}
+#endif /* MULTIPROCESSOR */
+
+	/* Stop VMM on this CPU */
+	stop_vmm_on_cpu(self);
+	if (!(self->ci_flags & CPUF_VMM))
+		printf("%s: exited VMM mode\n", self->ci_dev->dv_xname);
+	else {
+		printf("%s: failed to exit VMM mode\n",
+		    self->ci_dev->dv_xname);
+		error = EIO;
+	}
+
+	return (error);
+}
+
+/*
+ * start_vmm_on_cpu
+ *
+ * Starts VMM mode on 'ci' by executing the appropriate CPU-specific insn
+ * sequence to enter VMM mode (eg, VMXON). On success CPUF_VMM is set in
+ * ci->ci_flags; callers test that flag to learn whether the CPU made it.
+ *
+ * Does nothing if 'ci' is already in VMM mode. For VMX, if the feature
+ * control MSR is locked with VMX disabled the function returns without
+ * changing any CPU state (in particular CR4.VMXE is left alone) and
+ * without setting CPUF_VMM.
+ *
+ * Panics if a VMX CPU has no VMXON region or if VMXON fails.
+ */
+void
+start_vmm_on_cpu(struct cpu_info *ci)
+{
+	uint64_t msr;
+	uint32_t cr4;
+
+	/* Already in VMM mode? exit. */
+	if (ci->ci_flags & CPUF_VMM)
+		return;
+
+	/*
+	 * AMD SVM
+	 */
+	if (ci->ci_vmm_flags & CI_VMM_SVM) {
+		msr = rdmsr(MSR_EFER);
+		msr |= EFER_SVME;
+		wrmsr(MSR_EFER, msr);
+	}
+
+	/*
+	 * Intel VMX
+	 */
+	if (ci->ci_vmm_flags & CI_VMM_VMX) {
+		if (ci->ci_vmxon_region == NULL)
+			panic("NULL vmxon region specified\n");
+
+		/*
+		 * Enable VMX. This is checked before CR4 is touched so that
+		 * a CPU whose firmware locked VMX off is left exactly as it
+		 * was: stop_vmm_on_cpu() only clears CR4.VMXE on CPUs that
+		 * have CPUF_VMM set.
+		 */
+		msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+		if (msr & IA32_FEATURE_CONTROL_LOCK) {
+			if (!(msr & IA32_FEATURE_CONTROL_VMX_EN))
+				return;
+		} else {
+			msr |= IA32_FEATURE_CONTROL_VMX_EN |
+			    IA32_FEATURE_CONTROL_LOCK;
+			wrmsr(MSR_IA32_FEATURE_CONTROL, msr);
+		}
+
+		/* Prepare the VMXON region: zeroed, revision id first */
+		bzero(ci->ci_vmxon_region, PAGE_SIZE);
+		ci->ci_vmxon_region->vr_revision =
+		    ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision;
+
+		/* Set CR4.VMXE */
+		cr4 = rcr4();
+		cr4 |= CR4_VMXE;
+		lcr4(cr4);
+
+		/* Enter VMX mode */
+		if (vmxon((uint64_t *)&ci->ci_vmxon_region_pa))
+			panic("VMXON failed\n");
+	}
+
+	ci->ci_flags |= CPUF_VMM;
+}
+
+/*
+ * stop_vmm_on_cpu
+ *
+ * Takes 'ci' out of VMM mode using the CPU-specific sequence: clearing
+ * EFER.SVME for AMD SVM, or VMXOFF followed by clearing CR4.VMXE for
+ * Intel VMX. CPUF_VMM is cleared in ci->ci_flags on the way out.
+ *
+ * Does nothing if 'ci' is not currently flagged CPUF_VMM. Panics if VMXOFF
+ * reports failure.
+ */
+void
+stop_vmm_on_cpu(struct cpu_info *ci)
+{
+	uint64_t efer;
+	uint32_t cr4;
+
+	/* Not in VMM mode, so there is nothing to undo */
+	if ((ci->ci_flags & CPUF_VMM) == 0)
+		return;
+
+	/* AMD SVM: drop the SVM enable bit in EFER */
+	if (ci->ci_vmm_flags & CI_VMM_SVM) {
+		efer = rdmsr(MSR_EFER);
+		efer &= ~EFER_SVME;
+		wrmsr(MSR_EFER, efer);
+	}
+
+	/* Intel VMX: leave VMX operation, then clear CR4.VMXE */
+	if (ci->ci_vmm_flags & CI_VMM_VMX) {
+		if (vmxoff())
+			panic("VMXOFF failed\n");
+
+		cr4 = rcr4();
+		cr4 &= ~CR4_VMXE;
+		lcr4(cr4);
+	}
+
+	ci->ci_flags &= ~CPUF_VMM;
+}
+
+/*
+ * vm_create
+ *
+ * Creates the in-memory VMM structures for the VM defined by 'vcp'. The
+ * parent of this VM shall be the process defined by 'p'.
+ * This function does not start the VCPU(s) - see vm_start.
+ *
+ * Parameters:
+ *  vcp: the creation parameters (name, memory size, number of VCPUs). On
+ *      success vcp->vcp_id is filled in with the new VM's id.
+ *  p: the creating process; its pid is recorded in the VM.
+ *
+ * Return Values:
+ *  0: the create operation was successful
+ *  ENOMEM: out of memory (also returned for any vm_impl_init failure)
+ *  various other errors from vcpu_init
+ *
+ * On failure everything allocated here is released again.
+ */
+int
+vm_create(struct vm_create_params *vcp, struct proc *p)
+{
+	int i, ret;
+	struct vm *vm;
+	struct vcpu *vcpu;
+
+	vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
+	SLIST_INIT(&vm->vm_vcpu_list);
+	rw_init(&vm->vm_vcpu_lock, "vcpulock");
+
+	vm->vm_creator_pid = p->p_p->ps_pid;
+	vm->vm_memory_size = vcp->vcp_memory_size;
+
+	/*
+	 * strncpy does not terminate the destination when the source is
+	 * truncated. Copy one byte less than the limit; 'vm' was zeroed by
+	 * PR_ZERO above, so the final byte is left as the terminating NUL.
+	 */
+	strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
+
+	if (vm_impl_init(vm)) {
+		printf("failed to init arch-specific features for vm 0x%p\n",
+		    vm);
+		vm_teardown(vm);
+		return ENOMEM;
+	}
+
+	rw_enter_write(&vmm_softc->vm_lock);
+	vmm_softc->vm_ct++;
+	vmm_softc->vm_idx++;
+
+	/*
+	 * XXX we use the vm_id for the VPID/ASID, so we need to prevent
+	 * wrapping around 65536/4096 entries here
+	 */
+	vm->vm_id = vmm_softc->vm_idx;
+	vm->vm_vcpu_ct = 0;
+
+	/* Initialize each VCPU defined in 'vcp' */
+	for (i = 0; i < vcp->vcp_ncpus; i++) {
+		vcpu = pool_get(&vcpu_pool, PR_WAITOK | PR_ZERO);
+		vcpu->vc_parent = vm;
+		if ((ret = vcpu_init(vcpu)) != 0) {
+			printf("failed to init vcpu %d for vm 0x%p\n", i, vm);
+			/*
+			 * This vcpu was never linked onto vm_vcpu_list, so
+			 * vm_teardown will not find it; give it back to the
+			 * pool here.
+			 */
+			pool_put(&vcpu_pool, vcpu);
+			vm_teardown(vm);
+			vmm_softc->vm_ct--;
+			vmm_softc->vm_idx--;
+			rw_exit_write(&vmm_softc->vm_lock);
+			return (ret);
+		}
+		rw_enter_write(&vm->vm_vcpu_lock);
+		vcpu->vc_id = vm->vm_vcpu_ct;
+		vm->vm_vcpu_ct++;
+		SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link);
+		rw_exit_write(&vm->vm_vcpu_lock);
+	}
+
+	/* XXX init various other hardware parts (vlapic, vioapic, etc) */
+
+	SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link);
+	rw_exit_write(&vmm_softc->vm_lock);
+
+	vcp->vcp_id = vm->vm_id;
+
+	return (0);
+}
+
+/*
+ * vm_impl_init_vmx
+ *
+ * Intel VMX specific VM initialization routine.
+ *
+ * Outside of EPT mode there is nothing to set up. In EPT mode this creates
+ * a pmap and a UVM map for the guest, backs the range [0, vm_memory_size MB)
+ * with an anonymous mapping, and converts the pmap to EPT format.
+ *
+ * Return Values:
+ *  0: success (or nothing to do)
+ *  ENOMEM: any of the steps above failed; vm->vm_map is not left pointing
+ *      at a live map in that case
+ */
+int
+vm_impl_init_vmx(struct vm *vm)
+{
+	struct pmap *guest_pmap;
+	size_t nbytes;
+	vaddr_t base;
+	int error;
+
+	/* Only EPT needs per-VM setup here */
+	if (vmm_softc->mode != VMM_MODE_EPT)
+		return (0);
+
+	/* The guest gets a pmap of its own */
+	guest_pmap = pmap_create();
+	if (guest_pmap == NULL) {
+		printf("vm_impl_init_vmx: pmap_create failed\n");
+		return (ENOMEM);
+	}
+
+	/* vm_memory_size is scaled by 1024 * 1024 to get a byte count */
+	base = 0;
+	nbytes = vm->vm_memory_size * 1024 * 1024;
+
+	/* Build a UVM map on top of the new pmap */
+	vm->vm_map = uvm_map_create(guest_pmap, 0, nbytes,
+	    VM_MAP_ISVMSPACE | VM_MAP_PAGEABLE);
+	if (vm->vm_map == NULL) {
+		printf("vm_impl_init_vmx: uvm_map_create failed\n");
+		pmap_destroy(guest_pmap);
+		return (ENOMEM);
+	}
+
+	/* Back the whole range with an anon mapping */
+	dprintf(("vm_impl_init_vmx: created vm_map @ %p\n", vm->vm_map));
+	error = uvm_mapanon(vm->vm_map, &base, nbytes, 0,
+	    UVM_MAPFLAG(PROT_READ | PROT_WRITE | PROT_EXEC,
+	    PROT_READ | PROT_WRITE | PROT_EXEC,
+	    MAP_INHERIT_NONE,
+	    MADV_NORMAL,
+	    UVM_FLAG_FIXED | UVM_FLAG_OVERLAY));
+	if (error) {
+		printf("vm_impl_init_vmx: uvm_mapanon failed (%d)\n", error);
+		goto drop_map;
+	}
+
+	/* Convert the low 512GB of the pmap to EPT */
+	error = pmap_convert(guest_pmap, PMAP_TYPE_EPT);
+	if (error) {
+		printf("vm_impl_init_vmx: pmap_convert failed\n");
+		goto drop_map;
+	}
+
+	return (0);
+
+drop_map:
+	/* uvm_map_deallocate calls pmap_destroy for us */
+	uvm_map_deallocate(vm->vm_map);
+	vm->vm_map = NULL;
+	return (ENOMEM);
+}
+
+/*
+ * vm_impl_init_svm
+ *
+ * AMD SVM specific VM initialization routine
+ *
+ * Currently a stub: the SVM implementation has been removed, so nothing is
+ * done with 'vm' and 0 is always returned.
+ */
+int
+vm_impl_init_svm(struct vm *vm)
+{
+ /* XXX removed due to rot */
+ return (0);
+}
+
+/*
+ * vm_impl_init
+ *
+ * Dispatches to the VM init routine matching the active VMM mode:
+ * vm_impl_init_vmx for VMX/EPT, vm_impl_init_svm for SVM/RVI.
+ *
+ * Returns whatever the selected routine returns. Panics if the mode is
+ * none of the above.
+ */
+int
+vm_impl_init(struct vm *vm)
+{
+	switch (vmm_softc->mode) {
+	case VMM_MODE_VMX:
+	case VMM_MODE_EPT:
+		return vm_impl_init_vmx(vm);
+	case VMM_MODE_SVM:
+	case VMM_MODE_RVI:
+		return vm_impl_init_svm(vm);
+	default:
+		panic("unknown vmm mode\n");
+	}
+}
+
+/*
+ * vm_impl_deinit_vmx
+ *
+ * Intel VMX specific VM deinitialization routine
+ *
+ * Currently an empty placeholder: nothing is done with 'vm'.
+ */
+void
+vm_impl_deinit_vmx(struct vm *vm)
+{
+ /* Unused */
+}
+
+/*
+ * vm_impl_deinit_svm
+ *
+ * AMD SVM specific VM deinitialization routine
+ *
+ * Currently an empty placeholder: nothing is done with 'vm'.
+ */
+void
+vm_impl_deinit_svm(struct vm *vm)
+{
+ /* Unused */
+}
+
+/*
+ * vm_impl_deinit
+ *
+ * Dispatches to the VM deinit routine matching the active VMM mode:
+ * vm_impl_deinit_vmx for VMX/EPT, vm_impl_deinit_svm for SVM/RVI.
+ *
+ * Panics if the mode is none of the above.
+ */
+void
+vm_impl_deinit(struct vm *vm)
+{
+	if (vmm_softc->mode == VMM_MODE_VMX ||
+	    vmm_softc->mode == VMM_MODE_EPT) {
+		vm_impl_deinit_vmx(vm);
+		return;
+	}
+
+	if (vmm_softc->mode == VMM_MODE_SVM ||
+	    vmm_softc->mode == VMM_MODE_RVI) {
+		vm_impl_deinit_svm(vm);
+		return;
+	}
+
+	panic("unknown vmm mode\n");
+}
+
+/*
+ * vmx_vmcs_field_init
+ *
+ * One entry of a VMCS initialization table: a VMCS field encoding and the
+ * value to vmwrite into it.
+ */
+struct vmx_vmcs_field_init {
+	uint64_t	vfi_field;
+	uint64_t	vfi_value;
+};
+
+/*
+ * vcpu_vmx_write_fields
+ *
+ * vmwrites the 'n' entries of 'tbl', in table order, into the VMCS that is
+ * currently loaded on this CPU.
+ *
+ * Returns 0 if every vmwrite succeeded, or EINVAL at the first failure.
+ */
+static int
+vcpu_vmx_write_fields(const struct vmx_vmcs_field_init *tbl, size_t n)
+{
+	size_t i;
+
+	for (i = 0; i < n; i++) {
+		if (vmwrite(tbl[i].vfi_field, tbl[i].vfi_value))
+			return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * vcpu_vmx_alloc_page
+ *
+ * Allocates one page with km_alloc (kv_page/kp_zero/kd_waitok) and looks up
+ * its physical address in the kernel pmap.
+ *
+ * Returns the page's VA with '*pa' filled in, or 0 if the allocation or the
+ * PA lookup failed. Nothing is left allocated in the failure case.
+ */
+static vaddr_t
+vcpu_vmx_alloc_page(paddr_t *pa)
+{
+	vaddr_t va;
+
+	va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_waitok);
+	if (!va)
+		return (0);
+
+	if (!pmap_extract(pmap_kernel(), va, pa)) {
+		km_free((void *)va, PAGE_SIZE, &kv_page, &kp_zero);
+		return (0);
+	}
+
+	return (va);
+}
+
+/*
+ * vcpu_init_vmx
+ *
+ * Intel VMX specific VCPU initialization routine.
+ *
+ * This function allocates various per-VCPU memory regions (VMCS, MSR
+ * bitmap, MSR exit load / exit save / entry load areas), sets up initial
+ * VCPU VMCS controls, and sets initial register values.
+ *
+ * Parameters:
+ *  vcpu: the VCPU to initialize. Its region pointers are expected to be
+ *      zero on entry (vm_create obtains the vcpu with PR_ZERO) and
+ *      vcpu->vc_parent must already be set.
+ *
+ * Return Values:
+ *  0: success; the VMCS has been written and vmclear'ed back to memory
+ *  ENOMEM: a page could not be allocated or its PA could not be determined
+ *  EINVAL: a VMX instruction failed, or a required control setting is not
+ *      supported by this CPU
+ *
+ * On failure every page allocated here is released again.
+ */
+int
+vcpu_init_vmx(struct vcpu *vcpu)
+{
+	struct vmcs *vmcs;
+	uint16_t ctrl;
+	uint64_t pat_default, msr, ctrlval, eptp;
+	uint32_t pinbased, procbased, procbased2, exit, entry;
+	uint32_t want1, want0;
+	uint32_t cr0, cr4;
+	paddr_t control_pa;
+	int ret, vmcs_loaded;
+	int have_procbased2, have_vpid, have_unrestricted;
+	size_t i;
+	struct vmx_msr_store *msr_store;
+
+	/* MSRs carried in the exit save / exit load / entry load lists */
+	const uint32_t guest_msrs[] = {
+		MSR_EFER, MSR_CR_PAT, MSR_STAR, MSR_LSTAR, MSR_CSTAR,
+		MSR_SFMASK, MSR_KERNELGSBASE
+	};
+	const size_t nmsrs = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+
+	/* Host segment selectors */
+	const struct vmx_vmcs_field_init host_sel[] = {
+		{ VMCS_HOST_IA32_CS_SEL, GSEL(GCODE_SEL, SEL_KPL) },
+		{ VMCS_HOST_IA32_DS_SEL, GSEL(GDATA_SEL, SEL_KPL) },
+		{ VMCS_HOST_IA32_ES_SEL, GSEL(GDATA_SEL, SEL_KPL) },
+		{ VMCS_HOST_IA32_FS_SEL, GSEL(GDATA_SEL, SEL_KPL) },
+		{ VMCS_HOST_IA32_GS_SEL, GSEL(GDATA_SEL, SEL_KPL) },
+		{ VMCS_HOST_IA32_SS_SEL, GSEL(GDATA_SEL, SEL_KPL) },
+		{ VMCS_HOST_IA32_TR_SEL, GSYSSEL(GPROC0_SEL, SEL_KPL) }
+	};
+
+	/*
+	 * Initial guest stack pointer, segment and descriptor table state.
+	 * The stack is set for 0x10000 - sizeof(bootloader stack setup).
+	 */
+	const struct vmx_vmcs_field_init guest_regs[] = {
+		{ VMCS_GUEST_IA32_RSP, 0xFFDC },
+		{ VMCS_GUEST_IA32_SS_SEL, 0x10 },
+		{ VMCS_GUEST_IA32_SS_LIMIT, 0xFFFFFFFF },
+		{ VMCS_GUEST_IA32_SS_AR, 0xC093 },
+		{ VMCS_GUEST_IA32_SS_BASE, 0x0 },
+		{ VMCS_GUEST_IA32_DS_SEL, 0x10 },
+		{ VMCS_GUEST_IA32_DS_LIMIT, 0xFFFFFFFF },
+		{ VMCS_GUEST_IA32_DS_AR, 0xC093 },
+		{ VMCS_GUEST_IA32_DS_BASE, 0x0 },
+		{ VMCS_GUEST_IA32_ES_SEL, 0x10 },
+		{ VMCS_GUEST_IA32_ES_LIMIT, 0xFFFFFFFF },
+		{ VMCS_GUEST_IA32_ES_AR, 0xC093 },
+		{ VMCS_GUEST_IA32_ES_BASE, 0x0 },
+		{ VMCS_GUEST_IA32_FS_SEL, 0x10 },
+		{ VMCS_GUEST_IA32_FS_LIMIT, 0xFFFFFFFF },
+		{ VMCS_GUEST_IA32_FS_AR, 0xC093 },
+		{ VMCS_GUEST_IA32_FS_BASE, 0x0 },
+		{ VMCS_GUEST_IA32_GS_SEL, 0x10 },
+		{ VMCS_GUEST_IA32_GS_LIMIT, 0xFFFFFFFF },
+		{ VMCS_GUEST_IA32_GS_AR, 0xC093 },
+		{ VMCS_GUEST_IA32_GS_BASE, 0x0 },
+		{ VMCS_GUEST_IA32_CS_SEL, 0x8 },
+		{ VMCS_GUEST_IA32_CS_LIMIT, 0xFFFFFFFF },
+		{ VMCS_GUEST_IA32_CS_AR, 0xC09F },
+		{ VMCS_GUEST_IA32_CS_BASE, 0x0 },
+		{ VMCS_GUEST_IA32_GDTR_LIMIT, 0xFFFF },
+		{ VMCS_GUEST_IA32_GDTR_BASE, 0x10000 },
+		{ VMCS_GUEST_IA32_IDTR_LIMIT, 0xFFFF },
+		{ VMCS_GUEST_IA32_IDTR_BASE, 0x0 },
+		{ VMCS_GUEST_IA32_LDTR_SEL, 0x0 },
+		{ VMCS_GUEST_IA32_LDTR_LIMIT, 0xFFFF },
+		{ VMCS_GUEST_IA32_LDTR_AR, 0x0082 },
+		{ VMCS_GUEST_IA32_LDTR_BASE, 0x0 },
+		{ VMCS_GUEST_IA32_TR_SEL, 0x0 },
+		{ VMCS_GUEST_IA32_TR_LIMIT, 0xFFFF },
+		{ VMCS_GUEST_IA32_TR_AR, 0x008B },
+		{ VMCS_GUEST_IA32_TR_BASE, 0x0 }
+	};
+
+	ret = 0;
+	vmcs_loaded = 0;
+
+	/*
+	 * vcpu_vmx_compute_ctrl builds its result with |= and &= on the
+	 * output word, so start every control word from a known value.
+	 */
+	pinbased = procbased = procbased2 = exit = entry = 0;
+
+	/*
+	 * Default guest PAT, used as the initial MSR_CR_PAT value in the
+	 * entry load list below. (Guest/host PAT go through the MSR lists
+	 * rather than the dedicated VMCS PAT fields.)
+	 */
+	pat_default = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WT) |
+	    PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
+	    PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WT) |
+	    PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
+
+	/* Allocate VMCS VA and compute VMCS PA */
+	vcpu->vc_control_va = vcpu_vmx_alloc_page(&control_pa);
+	if (!vcpu->vc_control_va)
+		return (ENOMEM);
+
+	vcpu->vc_control_pa = (uint64_t)control_pa;
+
+	/* Allocate MSR bitmap VA and compute MSR bitmap PA */
+	/* XXX dont need this if no msr bitmap support */
+	vcpu->vc_msr_bitmap_va = vcpu_vmx_alloc_page(&control_pa);
+	if (!vcpu->vc_msr_bitmap_va) {
+		ret = ENOMEM;
+		goto exit;
+	}
+
+	vcpu->vc_msr_bitmap_pa = (uint64_t)control_pa;
+
+	/* Allocate MSR exit load area VA and compute its PA */
+	/* XXX may not need this with MSR bitmaps */
+	vcpu->vc_vmx_msr_exit_load_va =
+	    vcpu_vmx_alloc_page(&vcpu->vc_vmx_msr_exit_load_pa);
+	if (!vcpu->vc_vmx_msr_exit_load_va) {
+		ret = ENOMEM;
+		goto exit;
+	}
+
+	/* Allocate MSR exit save area VA and compute its PA */
+	/* XXX may not need this with MSR bitmaps */
+	vcpu->vc_vmx_msr_exit_save_va =
+	    vcpu_vmx_alloc_page(&vcpu->vc_vmx_msr_exit_save_pa);
+	if (!vcpu->vc_vmx_msr_exit_save_va) {
+		ret = ENOMEM;
+		goto exit;
+	}
+
+	/* Allocate MSR entry load area VA and compute its PA */
+	/* XXX may not need this with MSR bitmaps */
+	vcpu->vc_vmx_msr_entry_load_va =
+	    vcpu_vmx_alloc_page(&vcpu->vc_vmx_msr_entry_load_pa);
+	if (!vcpu->vc_vmx_msr_entry_load_va) {
+		ret = ENOMEM;
+		goto exit;
+	}
+
+	dprintf(("exit save va/pa 0x%llx 0x%llx\n",
+	    (uint64_t)vcpu->vc_vmx_msr_exit_save_va,
+	    (uint64_t)vcpu->vc_vmx_msr_exit_save_pa));
+	dprintf(("exit load va/pa 0x%llx 0x%llx\n",
+	    (uint64_t)vcpu->vc_vmx_msr_exit_load_va,
+	    (uint64_t)vcpu->vc_vmx_msr_exit_load_pa));
+	dprintf(("entry load va/pa 0x%llx 0x%llx\n",
+	    (uint64_t)vcpu->vc_vmx_msr_entry_load_va,
+	    (uint64_t)vcpu->vc_vmx_msr_entry_load_pa));
+	dprintf(("vlapic va/pa 0x%llx 0x%llx\n",
+	    (uint64_t)vcpu->vc_vlapic_va,
+	    (uint64_t)vcpu->vc_vlapic_pa));
+	dprintf(("msr bitmap va/pa 0x%llx 0x%llx\n",
+	    (uint64_t)vcpu->vc_msr_bitmap_va,
+	    (uint64_t)vcpu->vc_msr_bitmap_pa));
+
+	vmcs = (struct vmcs *)vcpu->vc_control_va;
+	vmcs->vmcs_revision = curcpu()->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision;
+
+	/* Clear the VMCS */
+	if (vmclear(&vcpu->vc_control_pa)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Load the VMCS onto this PCPU so we can write registers and controls
+	 */
+	if (vmptrld(&vcpu->vc_control_pa)) {
+		ret = EINVAL;
+		goto exit;
+	}
+	vmcs_loaded = 1;
+
+	/* Compute Basic Entry / Exit Controls */
+	vcpu->vc_vmx_basic = rdmsr(IA32_VMX_BASIC);
+	vcpu->vc_vmx_entry_ctls = rdmsr(IA32_VMX_ENTRY_CTLS);
+	vcpu->vc_vmx_exit_ctls = rdmsr(IA32_VMX_EXIT_CTLS);
+	vcpu->vc_vmx_pinbased_ctls = rdmsr(IA32_VMX_PINBASED_CTLS);
+	vcpu->vc_vmx_procbased_ctls = rdmsr(IA32_VMX_PROCBASED_CTLS);
+
+	/* Compute True Entry / Exit Controls (if applicable) */
+	if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+		vcpu->vc_vmx_true_entry_ctls = rdmsr(IA32_VMX_TRUE_ENTRY_CTLS);
+		vcpu->vc_vmx_true_exit_ctls = rdmsr(IA32_VMX_TRUE_EXIT_CTLS);
+		vcpu->vc_vmx_true_pinbased_ctls =
+		    rdmsr(IA32_VMX_TRUE_PINBASED_CTLS);
+		vcpu->vc_vmx_true_procbased_ctls =
+		    rdmsr(IA32_VMX_TRUE_PROCBASED_CTLS);
+	}
+
+	/* Compute Secondary Procbased Controls (if applicable) */
+	have_procbased2 = vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+	    IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1);
+	if (have_procbased2)
+		vcpu->vc_vmx_procbased2_ctls = rdmsr(IA32_VMX_PROCBASED2_CTLS);
+
+	/* Optional secondary features, each probed once and reused below */
+	have_vpid = have_procbased2 && vcpu_vmx_check_cap(vcpu,
+	    IA32_VMX_PROCBASED2_CTLS, IA32_VMX_ENABLE_VPID, 1);
+	have_unrestricted = have_procbased2 && vcpu_vmx_check_cap(vcpu,
+	    IA32_VMX_PROCBASED2_CTLS, IA32_VMX_UNRESTRICTED_GUEST, 1);
+
+	/* Host CR0 */
+	cr0 = rcr0();
+	if (vmwrite(VMCS_HOST_IA32_CR0, cr0)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/* Host CR4 */
+	cr4 = rcr4();
+	if (vmwrite(VMCS_HOST_IA32_CR4, cr4)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/* Host Segment Selectors */
+	ret = vcpu_vmx_write_fields(host_sel,
+	    sizeof(host_sel) / sizeof(host_sel[0]));
+	if (ret)
+		goto exit;
+
+	/* Host IDTR base */
+	if (vmwrite(VMCS_HOST_IA32_IDTR_BASE, idt_vaddr)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/* VMCS link */
+	if (vmwrite(VMCS_LINK_POINTER, 0xFFFFFFFFFFFFFFFF)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Pinbased ctrls
+	 *
+	 * We must be able to set the following:
+	 * IA32_VMX_EXTERNAL_INT_EXITING - exit on host interrupt
+	 * IA32_VMX_NMI_EXITING - exit on host NMI
+	 */
+	want1 = IA32_VMX_EXTERNAL_INT_EXITING |
+	    IA32_VMX_NMI_EXITING;
+	want0 = 0;
+
+	if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+		ctrl = IA32_VMX_TRUE_PINBASED_CTLS;
+		ctrlval = vcpu->vc_vmx_true_pinbased_ctls;
+	} else {
+		ctrl = IA32_VMX_PINBASED_CTLS;
+		ctrlval = vcpu->vc_vmx_pinbased_ctls;
+	}
+
+	if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0,
+	    &pinbased)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_PINBASED_CTLS, pinbased)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Procbased ctrls
+	 *
+	 * We must be able to set the following:
+	 * IA32_VMX_HLT_EXITING - exit on HLT instruction
+	 * IA32_VMX_MWAIT_EXITING - exit on MWAIT instruction
+	 * IA32_VMX_UNCONDITIONAL_IO_EXITING - exit on I/O instructions
+	 * IA32_VMX_USE_MSR_BITMAPS - exit on various MSR accesses
+	 * IA32_VMX_CR8_LOAD_EXITING - guest TPR access
+	 * IA32_VMX_CR8_STORE_EXITING - guest TPR access
+	 * IA32_VMX_USE_TPR_SHADOW - guest TPR access (shadow)
+	 *
+	 * If we have EPT, we must be able to clear the following
+	 * IA32_VMX_CR3_LOAD_EXITING - don't care about guest CR3 accesses
+	 * IA32_VMX_CR3_STORE_EXITING - don't care about guest CR3 accesses
+	 */
+	want1 = IA32_VMX_HLT_EXITING |
+	    IA32_VMX_MWAIT_EXITING |
+	    IA32_VMX_UNCONDITIONAL_IO_EXITING |
+	    IA32_VMX_USE_MSR_BITMAPS |
+	    IA32_VMX_CR8_LOAD_EXITING |
+	    IA32_VMX_CR8_STORE_EXITING |
+	    IA32_VMX_USE_TPR_SHADOW;
+	want0 = 0;
+
+	if (vmm_softc->mode == VMM_MODE_EPT) {
+		want1 |= IA32_VMX_ACTIVATE_SECONDARY_CONTROLS;
+		want0 |= IA32_VMX_CR3_LOAD_EXITING |
+		    IA32_VMX_CR3_STORE_EXITING;
+	}
+
+	if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+		ctrl = IA32_VMX_TRUE_PROCBASED_CTLS;
+		ctrlval = vcpu->vc_vmx_true_procbased_ctls;
+	} else {
+		ctrl = IA32_VMX_PROCBASED_CTLS;
+		ctrlval = vcpu->vc_vmx_procbased_ctls;
+	}
+
+	if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0,
+	    &procbased)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Secondary Procbased ctrls
+	 *
+	 * We want to be able to set the following, if available:
+	 * IA32_VMX_ENABLE_VPID - use VPIDs where available
+	 *
+	 * If we have EPT, we must be able to set the following:
+	 * IA32_VMX_ENABLE_EPT - enable EPT
+	 *
+	 * If we have unrestricted guest capability, we must be able to set
+	 * the following:
+	 * IA32_VMX_UNRESTRICTED_GUEST - enable unrestricted guest
+	 */
+	want1 = 0;
+
+	if (have_vpid)
+		want1 |= IA32_VMX_ENABLE_VPID;
+
+	if (vmm_softc->mode == VMM_MODE_EPT)
+		want1 |= IA32_VMX_ENABLE_EPT;
+
+	if (have_unrestricted)
+		want1 |= IA32_VMX_UNRESTRICTED_GUEST;
+
+	want0 = ~want1;
+	ctrlval = vcpu->vc_vmx_procbased2_ctls;
+	ctrl = IA32_VMX_PROCBASED2_CTLS;
+
+	if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0,
+	    &procbased2)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_PROCBASED2_CTLS, procbased2)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Exit ctrls
+	 *
+	 * We must be able to set the following:
+	 * IA32_VMX_HOST_SPACE_ADDRESS_SIZE - exit to long mode
+	 * IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT - ack interrupt on exit
+	 * XXX clear save_debug_ctrls on exit ?
+	 */
+	want1 = IA32_VMX_HOST_SPACE_ADDRESS_SIZE |
+	    IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT;
+	want0 = 0;
+
+	if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+		ctrl = IA32_VMX_TRUE_EXIT_CTLS;
+		ctrlval = vcpu->vc_vmx_true_exit_ctls;
+	} else {
+		ctrl = IA32_VMX_EXIT_CTLS;
+		ctrlval = vcpu->vc_vmx_exit_ctls;
+	}
+
+	if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0, &exit)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_EXIT_CTLS, exit)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Entry ctrls
+	 *
+	 * We must be able to clear the following:
+	 * IA32_VMX_ENTRY_TO_SMM - enter to SMM
+	 * IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT
+	 * XXX clear load debug_ctrls on entry ?
+	 */
+	want1 = 0;
+	want0 = IA32_VMX_ENTRY_TO_SMM |
+	    IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT;
+
+	if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+		ctrl = IA32_VMX_TRUE_ENTRY_CTLS;
+		ctrlval = vcpu->vc_vmx_true_entry_ctls;
+	} else {
+		ctrl = IA32_VMX_ENTRY_CTLS;
+		ctrlval = vcpu->vc_vmx_entry_ctls;
+	}
+
+	if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0, &entry)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_ENTRY_CTLS, entry)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmm_softc->mode == VMM_MODE_EPT) {
+		eptp = vcpu->vc_parent->vm_map->pmap->pm_pdirpa;
+		msr = rdmsr(IA32_VMX_EPT_VPID_CAP);
+		if (msr & IA32_EPT_VPID_CAP_PAGE_WALK_4) {
+			/* Page walk length 4 supported */
+			eptp |= ((IA32_EPT_PAGE_WALK_LENGTH - 1) << 3);
+		}
+
+		if (msr & IA32_EPT_VPID_CAP_WB) {
+			/* WB cache type supported */
+			eptp |= IA32_EPT_PAGING_CACHE_TYPE_WB;
+		}
+
+		dprintf(("guest eptp = 0x%llx\n", eptp));
+		if (vmwrite(VMCS_GUEST_IA32_EPTP, eptp)) {
+			ret = EINVAL;
+			goto exit;
+		}
+	}
+
+	/* The VM id doubles as the VPID when VPIDs are available */
+	if (have_vpid) {
+		if (vmwrite(VMCS_GUEST_VPID,
+		    (uint16_t)vcpu->vc_parent->vm_id)) {
+			ret = EINVAL;
+			goto exit;
+		}
+	}
+
+	/*
+	 * The next portion of code sets up the VMCS for the register state
+	 * we want during VCPU start. This matches what the CPU state would
+	 * be after a bootloader transition to 'start'.
+	 */
+	if (vmwrite(VMCS_GUEST_IA32_RFLAGS, 0x2)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * XXX -
+	 * vg_rip gets special treatment here since we will rewrite
+	 * it just before vmx_enter_guest, so it needs to match.
+	 * we could just set vg_rip here and be done with (no vmwrite
+	 * here) but that would require us to have proper resume
+	 * handling (resume=1) in the exit handler, so for now we
+	 * will just end up doing an extra vmwrite here.
+	 *
+	 * This can now change from the hardcoded value of 0x1000160
+	 * to the marks[start] from vmd's bootloader. That needs to
+	 * be hoisted up into vcpu create parameters via vm create params.
+	 */
+	vcpu->vc_gueststate.vg_rip = 0x01000160;
+	if (vmwrite(VMCS_GUEST_IA32_RIP, 0x01000160)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Determine default CR0 as per Intel SDM A.7
+	 * All flexible bits are set to 0
+	 */
+	cr0 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0) &
+	    (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1);
+	cr0 |= (CR0_CD | CR0_NW | CR0_ET);
+
+	/* An unrestricted guest may start with paging and protection off */
+	if (have_unrestricted)
+		cr0 &= ~(CR0_PG | CR0_PE);
+
+	if (vmwrite(VMCS_GUEST_IA32_CR0, cr0)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_GUEST_IA32_CR3, 0x0)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Determine default CR4 as per Intel SDM A.8
+	 * All flexible bits are set to 0
+	 */
+	cr4 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0) &
+	    (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1);
+
+	if (vmwrite(VMCS_GUEST_IA32_CR4, cr4)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/* Guest stack pointer, segment registers and descriptor tables */
+	ret = vcpu_vmx_write_fields(guest_regs,
+	    sizeof(guest_regs) / sizeof(guest_regs[0]));
+	if (ret)
+		goto exit;
+
+	/*
+	 * Select MSRs to be saved on exit
+	 */
+	msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va;
+	for (i = 0; i < nmsrs; i++)
+		msr_store[i].vms_index = guest_msrs[i];
+
+	/*
+	 * Select MSRs to be loaded on exit (the host's current values)
+	 */
+	msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_load_va;
+	for (i = 0; i < nmsrs; i++) {
+		msr_store[i].vms_index = guest_msrs[i];
+		msr_store[i].vms_data = rdmsr(guest_msrs[i]);
+	}
+
+	/*
+	 * Select MSRs to be loaded on entry. Initial values are all zero
+	 * except for the PAT, which starts out at pat_default.
+	 */
+	msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_entry_load_va;
+	for (i = 0; i < nmsrs; i++) {
+		msr_store[i].vms_index = guest_msrs[i];
+		msr_store[i].vms_data =
+		    (guest_msrs[i] == MSR_CR_PAT) ? pat_default : 0ULL;
+	}
+
+	if (vmwrite(VMCS_EXIT_MSR_STORE_COUNT, nmsrs)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, nmsrs)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, nmsrs)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS,
+	    vcpu->vc_vmx_msr_exit_save_pa)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS,
+	    vcpu->vc_vmx_msr_exit_load_pa)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * NOTE(review): the entry load address is the exit *save* area, not
+	 * vc_vmx_msr_entry_load_pa filled in above - presumably so that the
+	 * MSR values saved at exit are what gets reloaded at the next
+	 * entry. Left as found; confirm this is intended.
+	 */
+	if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS,
+	    vcpu->vc_vmx_msr_exit_save_pa)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	if (vmwrite(VMCS_MSR_BITMAP_ADDRESS,
+	    vcpu->vc_msr_bitmap_pa)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+	/* XXX msr bitmap - set restrictions */
+	/* XXX CR0 shadow */
+	/* XXX CR4 shadow */
+
+	/* Flush content of VMCS to memory */
+	if (vmclear(&vcpu->vc_control_pa)) {
+		ret = EINVAL;
+		goto exit;
+	}
+
+exit:
+	if (ret) {
+		/*
+		 * Don't leave the VMCS loaded on this CPU while its backing
+		 * page is handed back; a failure here changes nothing, so
+		 * the result is ignored.
+		 */
+		if (vmcs_loaded)
+			(void)vmclear(&vcpu->vc_control_pa);
+
+		if (vcpu->vc_control_va)
+			km_free((void *)vcpu->vc_control_va, PAGE_SIZE,
+			    &kv_page, &kp_zero);
+		if (vcpu->vc_msr_bitmap_va)
+			km_free((void *)vcpu->vc_msr_bitmap_va, PAGE_SIZE,
+			    &kv_page, &kp_zero);
+		if (vcpu->vc_vmx_msr_exit_save_va)
+			km_free((void *)vcpu->vc_vmx_msr_exit_save_va,
+			    PAGE_SIZE, &kv_page, &kp_zero);
+		if (vcpu->vc_vmx_msr_exit_load_va)
+			km_free((void *)vcpu->vc_vmx_msr_exit_load_va,
+			    PAGE_SIZE, &kv_page, &kp_zero);
+		if (vcpu->vc_vmx_msr_entry_load_va)
+			km_free((void *)vcpu->vc_vmx_msr_entry_load_va,
+			    PAGE_SIZE, &kv_page, &kp_zero);
+	}
+
+	return (ret);
+}
+
+/*
+ * vcpu_init_svm
+ *
+ * AMD SVM specific VCPU initialization routine.
+ *
+ * Currently a stub: the SVM implementation has been removed, so nothing is
+ * done with 'vcpu' and 0 is always returned.
+ */
+int
+vcpu_init_svm(struct vcpu *vcpu)
+{
+ /* XXX removed due to rot */
+ return (0);
+}
+
+/*
+ * vcpu_init
+ *
+ * Allocates the VCPU's host save area / stack page, records the active VMM
+ * mode in the VCPU, and then runs the VCPU init routine for that mode
+ * (vcpu_init_vmx for VMX/EPT, vcpu_init_svm for SVM/RVI).
+ *
+ * Return Values:
+ *  0: success
+ *  ENOMEM: the host save area / stack page could not be allocated
+ *  otherwise the error from the mode-specific routine, in which case the
+ *  page allocated here has been freed again
+ *
+ * Panics if the VMM mode is none of the above.
+ */
+int
+vcpu_init(struct vcpu *vcpu)
+{
+	int error = 0;
+
+	vcpu->vc_hsa_stack_va = (vaddr_t)malloc(PAGE_SIZE,
+	    M_DEVBUF, M_WAITOK | M_ZERO);
+	if (vcpu->vc_hsa_stack_va == 0)
+		return (ENOMEM);
+
+	vcpu->vc_virt_mode = vmm_softc->mode;
+
+	switch (vmm_softc->mode) {
+	case VMM_MODE_VMX:
+	case VMM_MODE_EPT:
+		error = vcpu_init_vmx(vcpu);
+		break;
+	case VMM_MODE_SVM:
+	case VMM_MODE_RVI:
+		error = vcpu_init_svm(vcpu);
+		break;
+	default:
+		panic("unknown vmm mode\n");
+	}
+
+	/* Mode-specific init failed: give the page back */
+	if (error)
+		free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE);
+
+	return (error);
+}
+
+/*
+ * vcpu_deinit_vmx
+ *
+ * Deinitializes the vcpu described by 'vcpu': releases each per-VCPU page
+ * that is still recorded in it (VMCS, MSR bitmap, MSR exit save / exit load
+ * / entry load areas) as well as the host save area / stack page.
+ *
+ * Regions whose VA is zero are skipped.
+ */
+void
+vcpu_deinit_vmx(struct vcpu *vcpu)
+{
+	if (vcpu->vc_control_va)
+		km_free((void *)vcpu->vc_control_va, PAGE_SIZE,
+		    &kv_page, &kp_zero);
+	/* Allocated by vcpu_init_vmx alongside the VMCS; must go too */
+	if (vcpu->vc_msr_bitmap_va)
+		km_free((void *)vcpu->vc_msr_bitmap_va, PAGE_SIZE,
+		    &kv_page, &kp_zero);
+	if (vcpu->vc_vmx_msr_exit_save_va)
+		km_free((void *)vcpu->vc_vmx_msr_exit_save_va,
+		    PAGE_SIZE, &kv_page, &kp_zero);
+	if (vcpu->vc_vmx_msr_exit_load_va)
+		km_free((void *)vcpu->vc_vmx_msr_exit_load_va,
+		    PAGE_SIZE, &kv_page, &kp_zero);
+	if (vcpu->vc_vmx_msr_entry_load_va)
+		km_free((void *)vcpu->vc_vmx_msr_entry_load_va,
+		    PAGE_SIZE, &kv_page, &kp_zero);
+	if (vcpu->vc_hsa_stack_va)
+		free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE);
+}
+
+/*
+ * vcpu_deinit_svm
+ *
+ * Deinitializes the vcpu described by 'vcpu'
+ *
+ * Currently an empty placeholder: nothing is done with 'vcpu'.
+ */
+void
+vcpu_deinit_svm(struct vcpu *vcpu)
+{
+ /* Unused */
+}
+
+/*
+ * vcpu_deinit
+ *
+ * Dispatches to the VCPU deinit routine matching the active VMM mode:
+ * vcpu_deinit_vmx for VMX/EPT, vcpu_deinit_svm for SVM/RVI.
+ *
+ * Panics if the mode is none of the above.
+ */
+void
+vcpu_deinit(struct vcpu *vcpu)
+{
+	if (vmm_softc->mode == VMM_MODE_VMX ||
+	    vmm_softc->mode == VMM_MODE_EPT) {
+		vcpu_deinit_vmx(vcpu);
+		return;
+	}
+
+	if (vmm_softc->mode == VMM_MODE_SVM ||
+	    vmm_softc->mode == VMM_MODE_RVI) {
+		vcpu_deinit_svm(vcpu);
+		return;
+	}
+
+	panic("unknown vmm mode\n");
+}
+
+/*
+ * vm_teardown
+ *
+ * Tears down (destroys) the vm indicated by 'vm': every VCPU on its list is
+ * unlinked, deinitialized and returned to vcpu_pool, the mode-specific VM
+ * deinit routine is run, and finally 'vm' itself is returned to vm_pool.
+ * 'vm' must not be used after this returns.
+ */
+void
+vm_teardown(struct vm *vm)
+{
+	struct vcpu *vcpu;
+
+	/* XXX coordinate a stop of all VCPUs first */
+
+	/* Drain the VCPU list from the head, freeing as we go */
+	rw_enter_write(&vm->vm_vcpu_lock);
+	while ((vcpu = SLIST_FIRST(&vm->vm_vcpu_list)) != NULL) {
+		SLIST_REMOVE_HEAD(&vm->vm_vcpu_list, vc_vcpu_link);
+		vcpu_deinit(vcpu);
+		pool_put(&vcpu_pool, vcpu);
+	}
+	rw_exit_write(&vm->vm_vcpu_lock);
+
+	vm_impl_deinit(vm);
+
+	/* XXX teardown guest vmspace, free pages */
+
+	pool_put(&vm_pool, vm);
+}
+
+/*
+ * vcpu_vmx_check_cap
+ *
+ * Checks if the 'cap' bit in the 'msr' MSR can be set or cleared (set = 1
+ * or set = 0, respectively).
+ *
+ * The capability value is taken from the copy cached in 'vcpu'. If true
+ * controls are available (per vc_vmx_basic) the "true" variant is
+ * consulted for the pinbased, procbased, entry and exit controls; the
+ * secondary procbased controls have a single variant.
+ *
+ * A bit may be set if it is 1 in the upper 32 bits of the capability value,
+ * and may be cleared if it is 0 in the lower 32 bits.
+ *
+ * Returns 1 if 'cap' can be set/cleared as requested, 0 otherwise
+ * (including when 'msr' is not one of the controls handled here).
+ */
+int
+vcpu_vmx_check_cap(struct vcpu *vcpu, uint32_t msr, uint32_t cap, int set)
+{
+	uint64_t ctl;
+	int use_true;
+
+	use_true = (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) != 0;
+
+	switch (msr) {
+	case IA32_VMX_PINBASED_CTLS:
+		ctl = use_true ? vcpu->vc_vmx_true_pinbased_ctls :
+		    vcpu->vc_vmx_pinbased_ctls;
+		break;
+	case IA32_VMX_PROCBASED_CTLS:
+		ctl = use_true ? vcpu->vc_vmx_true_procbased_ctls :
+		    vcpu->vc_vmx_procbased_ctls;
+		break;
+	case IA32_VMX_PROCBASED2_CTLS:
+		ctl = vcpu->vc_vmx_procbased2_ctls;
+		break;
+	case IA32_VMX_ENTRY_CTLS:
+		ctl = use_true ? vcpu->vc_vmx_true_entry_ctls :
+		    vcpu->vc_vmx_entry_ctls;
+		break;
+	case IA32_VMX_EXIT_CTLS:
+		ctl = use_true ? vcpu->vc_vmx_true_exit_ctls :
+		    vcpu->vc_vmx_exit_ctls;
+		break;
+	default:
+		return (0);
+	}
+
+	/* Settable: bit 'cap << 32' must be !0 */
+	if (set)
+		return ((ctl & ((uint64_t)cap << 32)) != 0);
+
+	/* Clearable: bit 'cap' must be 0 */
+	return ((ctl & cap) == 0);
+}
+
+/*
+ * vcpu_vmx_compute_ctrl
+ *
+ * Computes the appropriate control value, given the supplied parameters
+ * and CPU capabilities.
+ *
+ * Intel has made somewhat of a mess of this computation - it is described
+ * using no fewer than three different approaches, spread across many
+ * pages of the SDM. Further compounding the problem is the fact that now
+ * we have "true controls" for each type of "control", and each needs to
+ * be examined to get the calculation right, but only if "true" controls
+ * are present on the CPU we're on.
+ *
+ * Parameters:
+ *  vcpu: the vcpu for which controls are to be computed. (XXX now unused)
+ *  ctrlval: the control value, as read from the CPU MSR. The low 32 bits
+ *      report which controls may be 0 (bit clear == may be 0), the high
+ *      32 bits report which controls may be 1 (bit set == may be 1).
+ *  ctrl: which control is being set (eg, pinbased, procbased, etc)
+ *  want1: the set of desired 1 bits
+ *  want0: the set of desired 0 bits
+ *  out: (in/out) the value to write into the VMCS for this VCPU, for the
+ *      'ctrl' desired. Bits are set or cleared in place; if 'ctrl' is not
+ *      one of the control MSRs handled below, flexible bits requested by
+ *      neither want1 nor want0 keep whatever value the caller supplied.
+ *
+ * Returns 0 if successful, or EINVAL if the supplied parameters define
+ * an unworkable control setup.
+ */
+int
+vcpu_vmx_compute_ctrl(struct vcpu *vcpu, uint64_t ctrlval, uint16_t ctrl,
+    uint32_t want1, uint32_t want0, uint32_t *out)
+{
+	int i, set, clear;
+	uint32_t bit;
+
+	/*
+	 * The Intel SDM gives three formulae for determining which bits to
+	 * set/clear for a given control and desired functionality. Formula
+	 * 1 is the simplest but disallows use of newer features that are
+	 * enabled by functionality in later CPUs.
+	 *
+	 * Formulas 2 and 3 allow such extra functionality. We use formula
+	 * 2 - this requires us to know the identity of controls in the
+	 * "default1" class for each control register, but allows us to not
+	 * have to pass along and/or query both sets of capability MSRs for
+	 * each control lookup. This makes the code slightly longer,
+	 * however.
+	 */
+	for (i = 0; i < 32; i++) {
+		/*
+		 * Use an unsigned mask throughout; shifting a signed 1 by 31
+		 * would move a bit into the sign position.
+		 */
+		bit = (uint32_t)1 << i;
+
+		/* Figure out if we can set and / or clear this bit */
+		set = (ctrlval & ((uint64_t)bit << 32)) != 0;
+		clear = (ctrlval & (uint64_t)bit) == 0;
+
+		/* If the bit can't be set nor cleared, something's wrong */
+		if (!set && !clear)
+			return (EINVAL);
+
+		/*
+		 * Formula 2.c.i - "If the relevant VMX capability MSR
+		 * reports that a control has a single setting, use that
+		 * setting."
+		 */
+		if (set && !clear) {
+			if (want0 & bit)
+				return (EINVAL);
+			else
+				*out |= bit;
+		} else if (clear && !set) {
+			if (want1 & bit)
+				return (EINVAL);
+			else
+				*out &= ~bit;
+		} else {
+			/*
+			 * 2.c.ii - "If the relevant VMX capability MSR
+			 * reports that a control can be set to 0 or 1
+			 * and that control's meaning is known to the VMM,
+			 * set the control based on the functionality desired."
+			 */
+			if (want1 & bit)
+				*out |= bit;
+			else if (want0 & bit)
+				*out &= ~bit;
+			else {
+				/*
+				 * ... assuming the control's meaning is not
+				 * known to the VMM ...
+				 *
+				 * 2.c.iii - "If the relevant VMX capability
+				 * MSR reports that a control can be set to 0
+				 * or 1 and the control is not in the default1
+				 * class, set the control to 0."
+				 *
+				 * 2.c.iv - "If the relevant VMX capability
+				 * MSR reports that a control can be set to 0
+				 * or 1 and the control is in the default1
+				 * class, set the control to 1."
+				 */
+				switch (ctrl) {
+				case IA32_VMX_PINBASED_CTLS:
+				case IA32_VMX_TRUE_PINBASED_CTLS:
+					/*
+					 * A.3.1 - default1 class of pinbased
+					 * controls comprises bits 1,2,4
+					 */
+					switch (i) {
+					case 1:
+					case 2:
+					case 4:
+						*out |= bit;
+						break;
+					default:
+						*out &= ~bit;
+						break;
+					}
+					break;
+				case IA32_VMX_PROCBASED_CTLS:
+				case IA32_VMX_TRUE_PROCBASED_CTLS:
+					/*
+					 * A.3.2 - default1 class of procbased
+					 * controls comprises bits 1, 4-6, 8,
+					 * 13-16, 26
+					 */
+					switch (i) {
+					case 1:
+					case 4 ... 6:
+					case 8:
+					case 13 ... 16:
+					case 26:
+						*out |= bit;
+						break;
+					default:
+						*out &= ~bit;
+						break;
+					}
+					break;
+					/*
+					 * Unknown secondary procbased controls
+					 * can always be set to 0
+					 */
+				case IA32_VMX_PROCBASED2_CTLS:
+					*out &= ~bit;
+					break;
+				case IA32_VMX_EXIT_CTLS:
+				case IA32_VMX_TRUE_EXIT_CTLS:
+					/*
+					 * A.4 - default1 class of exit
+					 * controls comprises bits 0-8, 10,
+					 * 11, 13, 14, 16, 17
+					 */
+					switch (i) {
+					case 0 ... 8:
+					case 10 ... 11:
+					case 13 ... 14:
+					case 16 ... 17:
+						*out |= bit;
+						break;
+					default:
+						*out &= ~bit;
+						break;
+					}
+					break;
+				case IA32_VMX_ENTRY_CTLS:
+				case IA32_VMX_TRUE_ENTRY_CTLS:
+					/*
+					 * A.5 - default1 class of entry
+					 * controls comprises bits 0-8, 12
+					 */
+					switch (i) {
+					case 0 ... 8:
+					case 12:
+						*out |= bit;
+						break;
+					default:
+						*out &= ~bit;
+						break;
+					}
+					break;
+				}
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * vm_get_info
+ *
+ * Returns information about all VMs currently on the vmm_softc VM list.
+ *
+ * Parameters:
+ *  vip: request/response block.
+ *      vip_size: (in) size of the buffer at vip_info; (out) the number of
+ *          bytes required, when the supplied buffer was too small.
+ *      vip_info: (in) destination for the array of vm_info_result records
+ *          (passed to copyout).
+ *      vip_info_ct: (out) number of records produced, 0 if none.
+ *
+ * Return values:
+ *  0: success, or the buffer was too small (vip_info_ct is 0 and vip_size
+ *      holds the needed size - the caller is expected to retry)
+ *  ENOMEM: the temporary result buffer could not be allocated
+ *  EFAULT: the results could not be copied out to vip_info
+ */
+int
+vm_get_info(struct vm_info_params *vip)
+{
+	struct vm_info_result *out;
+	struct vm *vm;
+	struct vcpu *vcpu;
+	int i, j;
+	size_t need;
+
+	rw_enter_read(&vmm_softc->vm_lock);
+	need = vmm_softc->vm_ct * sizeof(struct vm_info_result);
+	if (vip->vip_size < need) {
+		vip->vip_info_ct = 0;
+		vip->vip_size = need;
+		rw_exit_read(&vmm_softc->vm_lock);
+		return (0);
+	}
+
+	/* Nothing to report; avoid a zero-length allocation and copyout */
+	if (need == 0) {
+		vip->vip_info_ct = 0;
+		rw_exit_read(&vmm_softc->vm_lock);
+		return (0);
+	}
+
+	/*
+	 * The whole buffer is handed to userland below, so it must be
+	 * zeroed: structure padding and the vir_vcpu_state slots we do not
+	 * fill would otherwise leak stale kernel memory.
+	 */
+	out = malloc(need, M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (out == NULL) {
+		vip->vip_info_ct = 0;
+		rw_exit_read(&vmm_softc->vm_lock);
+		return (ENOMEM);
+	}
+
+	i = 0;
+	vip->vip_info_ct = vmm_softc->vm_ct;
+	SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+		out[i].vir_memory_size = vm->vm_memory_size;
+		out[i].vir_ncpus = vm->vm_vcpu_ct;
+		out[i].vir_id = vm->vm_id;
+		out[i].vir_creator_pid = vm->vm_creator_pid;
+		/*
+		 * Copy at most VMM_MAX_NAME_LEN - 1 bytes; the buffer was
+		 * zeroed above so the name is always NUL terminated.
+		 */
+		strncpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN - 1);
+		rw_enter_read(&vm->vm_vcpu_lock);
+		for (j = 0; j < vm->vm_vcpu_ct; j++) {
+			/* Report UNKNOWN unless a vcpu with id 'j' exists */
+			out[i].vir_vcpu_state[j] = VCPU_STATE_UNKNOWN;
+			SLIST_FOREACH(vcpu, &vm->vm_vcpu_list,
+			    vc_vcpu_link) {
+				if (vcpu->vc_id == j)
+					out[i].vir_vcpu_state[j] =
+					    vcpu->vc_state;
+			}
+		}
+		rw_exit_read(&vm->vm_vcpu_lock);
+		i++;
+	}
+	rw_exit_read(&vmm_softc->vm_lock);
+
+	/* copyout may fault, so it is done after dropping the VM list lock */
+	if (copyout(out, vip->vip_info, need) == EFAULT) {
+		free(out, M_DEVBUF, need);
+		return (EFAULT);
+	}
+
+	free(out, M_DEVBUF, need);
+	return (0);
+}
+
+/*
+ * vm_terminate
+ *
+ * Terminates the VM indicated by 'vtp'.
+ *
+ * The lookup, the stop request to each of the VM's vcpus and the removal
+ * from the VM list are all performed under a single hold of the VM list
+ * write lock. This way two threads terminating the same VM cannot both
+ * find it: the loser sees ENOENT instead of removing and tearing down the
+ * VM a second time.
+ *
+ * Parameters:
+ *  vtp: vtp_vm_id names the VM to terminate
+ *
+ * Return values:
+ *  0: the VM was removed from the VM list and torn down
+ *  ENOENT: no VM with the requested id exists
+ */
+int
+vm_terminate(struct vm_terminate_params *vtp)
+{
+	struct vm *vm, *found_vm;
+	struct vcpu *vcpu;
+
+	found_vm = NULL;
+
+	rw_enter_write(&vmm_softc->vm_lock);
+
+	/*
+	 * Find desired VM
+	 */
+	SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+		if (vm->vm_id == vtp->vtp_vm_id)
+			found_vm = vm;
+	}
+
+	if (found_vm == NULL) {
+		rw_exit_write(&vmm_softc->vm_lock);
+		return (ENOENT);
+	}
+
+	/* Ask every vcpu of this VM to stop */
+	rw_enter_read(&found_vm->vm_vcpu_lock);
+	SLIST_FOREACH(vcpu, &found_vm->vm_vcpu_list, vc_vcpu_link) {
+		vcpu->vc_state = VCPU_STATE_REQSTOP;
+	}
+	rw_exit_read(&found_vm->vm_vcpu_lock);
+
+	/* Unlink while still holding the lock the lookup was done under */
+	vmm_softc->vm_ct--;
+	SLIST_REMOVE(&vmm_softc->vm_list, found_vm, vm, vm_link);
+	rw_exit_write(&vmm_softc->vm_lock);
+
+	/* The VM is no longer reachable via the list; release it */
+	vm_teardown(found_vm);
+
+	return (0);
+}
+
+/*
+ * vm_run
+ *
+ * Run the vm / vcpu specified by 'vrp'
+ *
+ * Parameters:
+ *  vrp: run request.
+ *      vrp_vm_id / vrp_vcpu_id: (in) select the vcpu to run
+ *      vrp_continue: (in) nonzero if vmd is returning from helping with
+ *          the previous exit, in which case vrp_exit is copied in first
+ *      vrp_injint: (in/out) passed to vcpu_run_vmx
+ *      vrp_exit: (in/out) user buffer holding a union vm_exit
+ *      vrp_exit_reason: (out) exit reason if help from vmd is needed,
+ *          VM_EXIT_NONE otherwise
+ *
+ * Return values:
+ *  0: the run completed; check vrp_exit_reason to see if help is needed
+ *  ENOENT: the vm or vcpu does not exist
+ *  EBUSY: the vcpu is not in the stopped state
+ *  EFAULT: exit data could not be copied from / to vmd
+ *  other: error returned by the run loop
+ */
+int
+vm_run(struct vm_run_params *vrp)
+{
+	struct vm *vm, *found_vm;
+	struct vcpu *vcpu, *found_vcpu;
+	int ret;
+
+	found_vm = NULL;
+	found_vcpu = NULL;
+	ret = 0;
+
+	/*
+	 * Find desired VM
+	 */
+	rw_enter_read(&vmm_softc->vm_lock);
+
+	SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+		if (vm->vm_id == vrp->vrp_vm_id)
+			found_vm = vm;
+	}
+
+	if (found_vm) {
+		rw_enter_read(&found_vm->vm_vcpu_lock);
+		SLIST_FOREACH(vcpu, &found_vm->vm_vcpu_list, vc_vcpu_link) {
+			if (vcpu->vc_id == vrp->vrp_vcpu_id) {
+				found_vcpu = vcpu;
+				/* Claim the vcpu only if nobody else has it */
+				if (found_vcpu->vc_state != VCPU_STATE_STOPPED)
+					ret = EBUSY;
+				else
+					found_vcpu->vc_state =
+					    VCPU_STATE_RUNNING;
+			}
+		}
+		rw_exit_read(&found_vm->vm_vcpu_lock);
+
+		if (!found_vcpu)
+			ret = ENOENT;
+	}
+
+	rw_exit_read(&vmm_softc->vm_lock);
+
+	if (!found_vm)
+		ret = ENOENT;
+
+	if (ret)
+		return (ret);
+
+	/*
+	 * We may be returning from userland helping us from the last exit.
+	 * If so (vrp_continue == 1), copy in the exit data from vmd.
+	 */
+	if (vrp->vrp_continue) {
+		if (copyin(vrp->vrp_exit, &found_vcpu->vc_exit,
+		    sizeof(union vm_exit)) == EFAULT) {
+			/*
+			 * The vcpu was marked running above but never
+			 * entered the run loop; release it, otherwise every
+			 * later vm_run on this vcpu would fail with EBUSY.
+			 */
+			found_vcpu->vc_state = VCPU_STATE_STOPPED;
+			return (EFAULT);
+		}
+	}
+
+	/* Run the VCPU specified in vrp */
+	if (found_vcpu->vc_virt_mode == VMM_MODE_VMX ||
+	    found_vcpu->vc_virt_mode == VMM_MODE_EPT) {
+		ret = vcpu_run_vmx(found_vcpu, vrp->vrp_continue, &vrp->vrp_injint);
+	} else if (found_vcpu->vc_virt_mode == VMM_MODE_SVM ||
+	    found_vcpu->vc_virt_mode == VMM_MODE_RVI) {
+		ret = vcpu_run_svm(found_vcpu,
+		    vrp->vrp_continue);
+	}
+
+	/* If we are exiting, populate exit data so vmd can help */
+	if (ret == EAGAIN) {
+		vrp->vrp_exit_reason =
+		    found_vcpu->vc_gueststate.vg_exit_reason;
+
+		if (copyout(&found_vcpu->vc_exit,
+		    vrp->vrp_exit, sizeof(union vm_exit)) == EFAULT) {
+			ret = EFAULT;
+		} else
+			ret = 0;
+	} else
+		vrp->vrp_exit_reason = VM_EXIT_NONE;
+
+	return (ret);
+}
+
+/*
+ * vcpu_run_vmx
+ *
+ * VMM main loop used to run a VCPU.
+ *
+ * Parameters:
+ *  vcpu: The VCPU to run
+ *  from_exit: 1 if returning directly from an exit to vmd during the
+ *      previous run, or 0 if we exited last time without needing to
+ *      exit to vmd.
+ *  injint: Interrupt that should be injected during this run, or -1 if
+ *      no interrupt should be injected. Reset to -1 once the interrupt
+ *      has been written to the VMCS for injection.
+ *
+ * On every return path the vcpu is put back into VCPU_STATE_STOPPED so
+ * that it can be run again.
+ *
+ * Return values:
+ *  0: The run loop exited and no help is needed from vmd
+ *  EAGAIN: The run loop exited and help from vmd is needed
+ *  EINVAL: an error occurred
+ */
+int
+vcpu_run_vmx(struct vcpu *vcpu, uint8_t from_exit, int16_t *injint)
+{
+	int ret, resume, exit_handled;
+	struct region_descriptor gdt;
+	struct cpu_info *ci;
+	uint64_t exit_reason, cr3, vmcs_ptr;
+	struct schedstate_percpu *spc;
+	struct vmx_invvpid_descriptor vid;
+	uint64_t rflags, eii;
+
+	exit_handled = 1;
+	resume = 0;
+
+	while (exit_handled) {
+		if (!resume) {
+			/*
+			 * We are launching for the first time, or we are
+			 * resuming from a different pcpu, so we need to
+			 * reset certain pcpu-specific values.
+			 */
+			ci = curcpu();
+			setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1);
+
+			if (vmptrld(&vcpu->vc_control_pa)) {
+				ret = EINVAL;
+				goto exit;
+			}
+
+			if (gdt.rd_base == 0) {
+				ret = EINVAL;
+				goto exit;
+			}
+
+			/* Host GDTR base */
+			if (vmwrite(VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base)) {
+				ret = EINVAL;
+				goto exit;
+			}
+
+			/* Host TR base */
+			if (vmwrite(VMCS_HOST_IA32_TR_BASE,
+			    (uint64_t)curcpu()->ci_tss)) {
+				ret = EINVAL;
+				goto exit;
+			}
+
+			/* Host CR3 */
+			cr3 = rcr3();
+			if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
+				ret = EINVAL;
+				goto exit;
+			}
+		}
+
+		/*
+		 * If we are returning from userspace (vmd) because we exited
+		 * last time, fix up any needed vcpu state first.
+		 */
+		if (from_exit) {
+			from_exit = 0;
+			switch (vcpu->vc_gueststate.vg_exit_reason) {
+			case VMX_EXIT_IO:
+				vcpu->vc_gueststate.vg_rax =
+				    vcpu->vc_exit.vei.vei_data;
+				break;
+			case VMX_EXIT_HLT:
+				break;
+			default:
+				printf("vmx_enter_guest: returning from exit "
+				    "with unknown reason %d\n",
+				    vcpu->vc_gueststate.vg_exit_reason);
+				break;
+			}
+		}
+
+		/*
+		 * XXX - clock hack. We don't track host clocks while not
+		 * running inside a VM, and thus we lose many clocks while
+		 * the OS is running other processes. For now, approximate
+		 * when a clock should be injected by injecting one clock
+		 * per CLOCK_BIAS exits.
+		 *
+		 * This should be changed to track host clocks to know if
+		 * a clock tick was missed, and "catch up" clock interrupt
+		 * injections later as needed.
+		 *
+		 * Note that checking injint here and not injecting the
+		 * clock interrupt if injint is set also violates interrupt
+		 * priority, until this hack is fixed.
+		 */
+		vmmclk++;
+		eii = 0xFFFFFFFFFFFFFFFFULL;	/* "nothing to inject" marker */
+
+		if (vmmclk % CLOCK_BIAS == 0)
+			eii = 0x20;
+
+		if (*injint != -1)
+			eii = *injint + 0x20;
+
+		if (eii != 0xFFFFFFFFFFFFFFFFULL) {
+			if (vmread(VMCS_GUEST_IA32_RFLAGS, &rflags)) {
+				printf("intr: can't read guest rflags\n");
+				rflags = 0;
+			}
+
+			/* Only inject if the guest has interrupts enabled */
+			if (rflags & PSL_I) {
+				eii |= (1ULL << 31);	/* Valid */
+				eii |= (0ULL << 8);	/* Hardware Interrupt */
+				if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) {
+					printf("intr: can't vector clock "
+					    "interrupt to guest\n");
+				}
+				if (*injint != -1)
+					*injint = -1;
+			}
+		}
+
+		/* XXX end clock hack */
+
+		/* Invalidate old TLB mappings */
+		vid.vid_vpid = vcpu->vc_parent->vm_id;
+		vid.vid_addr = 0;
+		invvpid(IA32_VMX_INVVPID_SINGLE_CTX_GLB, &vid);
+
+		/* Start / resume the VM / VCPU */
+		/* XXX unlock the biglock here */
+		ret = vmx_enter_guest(&vcpu->vc_control_pa,
+		    &vcpu->vc_gueststate, resume);
+		/* XXX lock the biglock here */
+
+		/* If we exited successfully ... */
+		if (ret == 0) {
+			resume = 1;
+			vcpu->vc_last_pcpu = ci;
+			if (vmread(VMCS_GUEST_IA32_RIP,
+			    &vcpu->vc_gueststate.vg_rip)) {
+				printf("vcpu_run_vmx: cannot read guest rip\n");
+				ret = EINVAL;
+				exit_handled = 0;
+				goto exit;
+			}
+
+			if (vmread(VMCS_EXIT_REASON, &exit_reason)) {
+				printf("vcpu_run_vmx: cant read exit reason\n");
+				ret = EINVAL;
+				exit_handled = 0;
+				goto exit;
+			}
+
+			/*
+			 * Handle the exit. This will alter "ret" to EAGAIN if
+			 * the exit handler determines help from vmd is needed.
+			 */
+			vcpu->vc_gueststate.vg_exit_reason = exit_reason;
+			exit_handled = vmx_handle_exit(vcpu, &ret);
+
+			/* Check if we should yield - don't hog the cpu */
+			spc = &ci->ci_schedstate;
+			if (spc->spc_schedflags & SPCF_SHOULDYIELD) {
+				/*
+				 * We may come back on another pcpu, so
+				 * force the pcpu-specific setup again.
+				 */
+				resume = 0;
+				if (vmclear(&vcpu->vc_control_pa)) {
+					ret = EINVAL;
+					goto exit;
+				}
+				yield();
+			}
+		} else if (ret == VMX_FAIL_LAUNCH_INVALID_VMCS) {
+			printf("vmx_enter_guest: failed launch with invalid "
+			    "vmcs\n");
+			ret = EINVAL;
+			exit_handled = 0;
+		} else if (ret == VMX_FAIL_LAUNCH_VALID_VMCS) {
+			exit_reason = vcpu->vc_gueststate.vg_exit_reason;
+			printf("vmx_enter_guest: failed launch with valid "
+			    "vmcs, code=%lld (%s)\n", exit_reason,
+			    vmx_instruction_error_decode(exit_reason));
+			ret = EINVAL;
+			exit_handled = 0;
+		} else {
+			printf("vmx_enter_guest: failed launch for unknown "
+			    "reason\n");
+			ret = EINVAL;
+			exit_handled = 0;
+		}
+
+	}
+
+exit:
+	/*
+	 * Mark the vcpu stopped on every path out of this function,
+	 * including the error paths that jump here directly. Leaving it
+	 * marked running would make it impossible to run again.
+	 */
+	vcpu->vc_state = VCPU_STATE_STOPPED;
+
+	/*
+	 * We are heading back to userspace (vmd), either because we need help
+	 * handling an exit, or we failed in some way to enter the guest.
+	 * Clear any current VMCS pointer as we may end up coming back on
+	 * a different CPU.
+	 */
+	if (!vmptrst(&vmcs_ptr)) {
+		if (vmcs_ptr != 0xFFFFFFFFFFFFFFFFULL)
+			if (vmclear(&vcpu->vc_control_pa))
+				ret = EINVAL;
+	} else
+		ret = EINVAL;
+
+	return (ret);
+}
+
+/*
+ * vmx_handle_intr
+ *
+ * Services a host (external) interrupt that caused a VM exit. The vector
+ * is taken from the low 8 bits of the VMCS exit interruption information,
+ * the matching host IDT gate is looked up, and the handler address found
+ * there is passed to vmm_dispatch_intr.
+ *
+ * If the interruption information cannot be read, a message is printed
+ * and nothing is dispatched.
+ */
+void
+vmx_handle_intr(struct vcpu *vcpu)
+{
+	struct gate_descriptor *gate;
+	uint64_t intr_info;
+	vaddr_t target;
+	uint8_t vector;
+
+	if (vmread(VMCS_EXIT_INTERRUPTION_INFO, &intr_info) != 0) {
+		printf("vmx_handle_intr: can't obtain intr info\n");
+		return;
+	}
+
+	/* XXX check "error valid" code in eii, abort if 0 */
+	vector = intr_info & 0xFF;
+	gate = &idt[vector];
+
+	/* Reassemble the handler address from the split gate offset */
+	target = ((uint64_t)gate->gd_hioffset << 16) + gate->gd_looffset;
+
+	vmm_dispatch_intr(target);
+}
+
+/*
+ * vmx_handle_hlt
+ *
+ * Exit handler for the HLT instruction. Advances the saved guest rip past
+ * the instruction that caused the exit.
+ *
+ * Returns 0 on success, or 1 if the instruction length could not be read
+ * from the VMCS (rip is left untouched in that case).
+ */
+int
+vmx_handle_hlt(struct vcpu *vcpu)
+{
+	uint64_t len;
+
+	if (vmread(VMCS_INSTRUCTION_LENGTH, &len) == 0) {
+		vcpu->vc_gueststate.vg_rip += len;
+		return (0);
+	}
+
+	printf("vmx_handle_hlt: can't obtain instruction length\n");
+	return (1);
+}
+
+/*
+ * vmx_handle_exit
+ *
+ * Handle exits from the VM by decoding the exit reason and calling various
+ * subhandlers as needed.
+ *
+ * Parameters:
+ *  vcpu: the vcpu that exited; the exit reason is taken from
+ *      vc_gueststate.vg_exit_reason
+ *  result: (out) the subhandler's return value, or EINVAL for an
+ *      unhandled exit reason or if the guest rip could not be written
+ *      back. Left untouched for external interrupt exits.
+ *
+ * Return values:
+ *  1: the exit was handled here and the guest can be re-entered
+ *  0: the run loop must stop (help from vmd is needed, or an error
+ *      occurred - see *result)
+ */
+int
+vmx_handle_exit(struct vcpu *vcpu, int *result)
+{
+	uint64_t exit_reason;
+	int update_rip, handled;
+
+	update_rip = 0;
+	handled = 1;
+	exit_reason = vcpu->vc_gueststate.vg_exit_reason;
+
+	switch (exit_reason) {
+	case VMX_EXIT_EPT_VIOLATION:
+		*result = vmx_handle_np_fault(vcpu);
+		/*
+		 * If the fault could not be resolved, re-entering the guest
+		 * would only take the same fault again, forever. Stop the
+		 * run loop and report the error instead.
+		 */
+		if (*result)
+			handled = 0;
+		break;
+	case VMX_EXIT_CPUID:
+		*result = vmx_handle_cpuid(vcpu);
+		update_rip = 1;
+		break;
+	case VMX_EXIT_IO:
+		*result = vmx_handle_inout(vcpu);
+		update_rip = 1;
+		/* Nonzero means vmd must complete the I/O (or it failed) */
+		if (*result)
+			handled = 0;
+		break;
+	case VMX_EXIT_EXTINT:
+		/* The guest instruction did not execute; rip stays put */
+		vmx_handle_intr(vcpu);
+		update_rip = 0;
+		break;
+	case VMX_EXIT_CR_ACCESS:
+		*result = vmx_handle_cr(vcpu);
+		update_rip = 1;
+		break;
+	case VMX_EXIT_HLT:
+		/* HLT always leaves the run loop */
+		*result = vmx_handle_hlt(vcpu);
+		update_rip = 1;
+		handled = 0;
+		break;
+	default:
+		dprintf(("vmx_handle_exit: unhandled exit %lld (%s)\n",
+		    exit_reason, vmx_exit_reason_decode(exit_reason)));
+		*result = EINVAL;
+		return (0);
+	}
+
+	/* Subhandlers advance the saved rip; push it back into the VMCS */
+	if (update_rip) {
+		if (vmwrite(VMCS_GUEST_IA32_RIP,
+		    vcpu->vc_gueststate.vg_rip)) {
+			printf("vmx_handle_exit: can't advance rip\n");
+			*result = EINVAL;
+			return (0);
+		}
+	}
+
+	return (handled);
+}
+
+/*
+ * vmm_get_guest_memtype
+ *
+ * Classifies guest physical address 'gpa' for vm 'vm'.
+ *
+ * Addresses inside the PCI MMIO BAR window, and addresses below the VM's
+ * memory size (vm_memory_size, scaled by 1024 * 1024), are reported as
+ * VMM_MEM_TYPE_REGULAR. Everything else is VMM_MEM_TYPE_UNKNOWN.
+ */
+int
+vmm_get_guest_memtype(struct vm *vm, paddr_t gpa)
+{
+
+	if (gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) {
+		dprintf(("guest mmio access @ 0x%llx\n", (uint64_t)gpa));
+		return (VMM_MEM_TYPE_REGULAR);
+	}
+
+	if (gpa >= vm->vm_memory_size * (1024 * 1024)) {
+		dprintf(("guest memtype @ 0x%llx unknown\n", (uint64_t)gpa));
+		return (VMM_MEM_TYPE_UNKNOWN);
+	}
+
+	return (VMM_MEM_TYPE_REGULAR);
+}
+
+/*
+ * vmm_get_guest_faulttype
+ *
+ * Determines the type (R/W/X) of the last fault on the VCPU last run on
+ * this PCPU. Calls the appropriate architecture-specific subroutine:
+ * vmx_get_guest_faulttype in EPT mode, svm_get_guest_faulttype in RVI
+ * mode.
+ *
+ * Returns whatever the selected subroutine returns. Panics if the vmm is
+ * in any other mode.
+ */
+int
+vmm_get_guest_faulttype(void)
+{
+	if (vmm_softc->mode == VMM_MODE_EPT)
+		return vmx_get_guest_faulttype();
+	else if (vmm_softc->mode == VMM_MODE_RVI)
+		/* RVI is the SVM flavor of nested paging; don't ask VMX */
+		return svm_get_guest_faulttype();
+	else
+		panic("unknown vmm mode\n");
+
+}
+
+/*
+ * vmx_get_exit_qualification
+ *
+ * Reads the exit qualification field of the current VMCS into
+ * '*exit_qualification'.
+ *
+ * Returns 0 on success, or EINVAL (after printing a message) if the
+ * field could not be read.
+ */
+int
+vmx_get_exit_qualification(uint64_t *exit_qualification)
+{
+	if (vmread(VMCS_GUEST_EXIT_QUALIFICATION, exit_qualification) == 0)
+		return (0);
+
+	printf("vmm_get_exit_qualification: cant extract exit qual\n");
+	return (EINVAL);
+}
+
+/*
+ * vmx_get_guest_faulttype
+ *
+ * Determines the type (R/W/X) of the last fault on the VCPU last run on
+ * this PCPU, by inspecting the exit qualification of the current VMCS.
+ *
+ * The fault bits are tested in the order write, read, exec; the first one
+ * found set decides the result.
+ *
+ * Returns PROT_WRITE, PROT_READ or PROT_EXEC, or EINVAL if the exit
+ * qualification could not be read or none of the fault bits is set.
+ */
+int
+vmx_get_guest_faulttype(void)
+{
+	uint64_t qual;
+
+	if (vmx_get_exit_qualification(&qual) != 0)
+		return (EINVAL);
+
+	if (qual & IA32_VMX_EPT_FAULT_WRITE)
+		return (PROT_WRITE);
+
+	if (qual & IA32_VMX_EPT_FAULT_READ)
+		return (PROT_READ);
+
+	if (qual & IA32_VMX_EPT_FAULT_EXEC)
+		return (PROT_EXEC);
+
+	return (EINVAL);
+}
+
+/*
+ * svm_get_guest_faulttype
+ *
+ * SVM counterpart of vmx_get_guest_faulttype: meant to determine the type
+ * (R/W/X) of the last fault on the VCPU last run on this PCPU.
+ *
+ * Currently a placeholder that always reports EINVAL.
+ */
+int
+svm_get_guest_faulttype(void)
+{
+	/* XXX removed due to rot */
+
+	return (EINVAL);
+}
+
+/*
+ * vmx_fault_page
+ *
+ * Request a new page to be faulted into the UVM map of the VM owning 'vcpu'
+ * at address 'gpa'. After a successful uvm_fault the pmap entry is fixed
+ * up for EPT and the backing host page is zeroed through a temporary
+ * kernel mapping.
+ *
+ * Return values:
+ *  0: the page was faulted in, fixed up and zeroed
+ *  EINVAL: the fault type could not be determined, or the EPT fixup failed
+ *  EFAULT: no host physical address could be extracted for 'gpa'
+ *  ENOMEM: no kernel virtual address was available for the temporary
+ *      mapping
+ *  other: error returned by uvm_fault
+ */
+int
+vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
+{
+	struct pmap *guest_pmap;
+	paddr_t hpa;
+	vaddr_t tmpva;
+	int prot, error;
+
+	prot = vmx_get_guest_faulttype();
+	if (prot == EINVAL) {
+		printf("vmx_fault_page: invalid fault type\n");
+		return (EINVAL);
+	}
+
+	error = uvm_fault(vcpu->vc_parent->vm_map, gpa, prot,
+	    PROT_READ | PROT_WRITE | PROT_EXEC);
+	if (error) {
+		printf("vmx_fault_page: uvm_fault returns %d\n", error);
+		return (error);
+	}
+
+	guest_pmap = vcpu->vc_parent->vm_map->pmap;
+
+	if (vmx_fix_ept_pte(guest_pmap, gpa)) {
+		printf("vmx_fault_page: ept fixup failure\n");
+		return (EINVAL);
+	}
+
+	if (!pmap_extract(guest_pmap, (vaddr_t)gpa, &hpa)) {
+		printf("vmx_fault_page: extract failure\n");
+		return (EFAULT);
+	}
+
+	/* Reserve a kernel va (no backing page) to map the host page at */
+	tmpva = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
+	if (!tmpva) {
+		printf("vmx_fault_page: kva failure\n");
+		return (ENOMEM);
+	}
+
+	/* Map the host page, zero it, and drop the mapping again */
+	pmap_kenter_pa(tmpva, hpa, PROT_READ | PROT_WRITE);
+	bzero((void *)tmpva, PAGE_SIZE);
+	pmap_kremove(tmpva, PAGE_SIZE);
+	km_free((void *)tmpva, PAGE_SIZE, &kv_any, &kp_none);
+
+	return (0);
+}
+
+/*
+ * vmx_handle_np_fault
+ *
+ * High level nested paging handler for VMX. Verifies that a fault is for a
+ * valid memory region, then faults a page, or aborts otherwise.
+ *
+ * Return values:
+ *  0: the faulting guest physical address is now backed by a page
+ *  EINVAL: the faulting address could not be read from the VMCS, or it
+ *      does not fall in a memory region known for this VM
+ *  other: error returned by vmx_fault_page
+ */
+int
+vmx_handle_np_fault(struct vcpu *vcpu)
+{
+	uint64_t gpa;
+	int gpa_memtype, ret;
+
+	ret = 0;
+	if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &gpa)) {
+		printf("vmm_handle_np_fault: cannot extract faulting pa\n");
+		return (EINVAL);
+	}
+
+	gpa_memtype = vmm_get_guest_memtype(vcpu->vc_parent, gpa);
+	switch(gpa_memtype) {
+	case VMM_MEM_TYPE_REGULAR:
+		ret = vmx_fault_page(vcpu, gpa);
+		break;
+	default:
+		/*
+		 * Nothing was mapped for this address, so the fault has not
+		 * been resolved; don't claim success.
+		 */
+		printf("unknown memory type %d for GPA 0x%llx\n",
+		    gpa_memtype, gpa);
+		ret = EINVAL;
+		break;
+	}
+
+	return (ret);
+}
+
+/*
+ * vmx_handle_inout
+ *
+ * Exit handler for IN/OUT instructions.
+ *
+ * Decodes the exit qualification into vcpu->vc_exit.vei, records the low
+ * 32 bits of the guest's rax as the data, and advances the saved guest
+ * rip past the instruction. A few ports are then handled directly; the
+ * ones that usually belong to devices owned by vmd are passed to
+ * userspace.
+ *
+ * Return values:
+ *  0: the access was handled here (or the port is ignored)
+ *  EAGAIN: help from userspace (vmd) is needed to complete the access
+ *  1: the instruction length or exit qualification could not be read
+ */
+int
+vmx_handle_inout(struct vcpu *vcpu)
+{
+	uint64_t len, qual;
+	int rv;
+
+	if (vmread(VMCS_INSTRUCTION_LENGTH, &len)) {
+		printf("vmx_handle_inout: can't obtain instruction length\n");
+		return (1);
+	}
+
+	if (vmx_get_exit_qualification(&qual)) {
+		printf("vmx_handle_inout: can't get exit qual\n");
+		return (1);
+	}
+
+	/*
+	 * Exit qualification layout used here:
+	 *  bits 0:2   - size of access, minus one
+	 *  bit 3      - direction (1 is treated as a read from the port)
+	 *  bit 4      - string instruction?
+	 *  bit 5      - REP prefix?
+	 *  bit 6      - operand encoding
+	 *  bits 16:31 - port
+	 */
+	vcpu->vc_exit.vei.vei_size = (qual & 0x7) + 1;
+	vcpu->vc_exit.vei.vei_dir = (qual >> 3) & 0x1;
+	vcpu->vc_exit.vei.vei_string = (qual >> 4) & 0x1;
+	vcpu->vc_exit.vei.vei_rep = (qual >> 5) & 0x1;
+	vcpu->vc_exit.vei.vei_encoding = (qual >> 6) & 0x1;
+	vcpu->vc_exit.vei.vei_port = (qual >> 16) & 0xFFFF;
+	vcpu->vc_exit.vei.vei_data = (uint32_t)vcpu->vc_gueststate.vg_rax;
+
+	/* Skip over the IN/OUT instruction */
+	vcpu->vc_gueststate.vg_rip += len;
+
+	/*
+	 * XXX something better than a hardcoded list here, maybe
+	 * configure via vmd via the device list in vm create params?
+	 *
+	 * XXX handle not eax target
+	 */
+	rv = 0;
+	switch(vcpu->vc_exit.vei.vei_port) {
+	case IO_RTC ... IO_RTC + 1:
+		/* We can directly read the RTC on behalf of the guest */
+		if (vcpu->vc_exit.vei.vei_dir == 1) {
+			vcpu->vc_gueststate.vg_rax =
+			    inb(vcpu->vc_exit.vei.vei_port);
+		}
+		break;
+	case 0x40 ... 0x43:
+	case 0x3f8 ... 0x3ff:
+	case 0xcf8:
+	case 0xcfc:
+	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
+		/* Ports vmd emulates: ask userspace for help */
+		rv = EAGAIN;
+		break;
+	default:
+		/* Read from unsupported ports returns FFs */
+		if (vcpu->vc_exit.vei.vei_dir == 1)
+			vcpu->vc_gueststate.vg_rax = 0xFFFFFFFF;
+	}
+
+	return (rv);
+}
+
+/*
+ * vmx_handle_cr
+ *
+ * Handle reads/writes to control registers (except CR3).
+ *
+ * The access is only decoded and logged (via dprintf); nothing is
+ * emulated. The saved guest rip is then advanced past the instruction.
+ *
+ * Returns 0 on success, or 1 if the instruction length or the exit
+ * qualification could not be read.
+ */
+int
+vmx_handle_cr(struct vcpu *vcpu)
+{
+	uint64_t len, qual;
+	uint8_t reg, access;
+
+	if (vmread(VMCS_INSTRUCTION_LENGTH, &len)) {
+		printf("vmx_handle_cr: can't obtain instruction length\n");
+		return (1);
+	}
+
+	if (vmx_get_exit_qualification(&qual)) {
+		printf("vmx_handle_cr: can't get exit qual\n");
+		return (1);
+	}
+
+	/* Bits 0:3 hold the CR number, bits 4:5 the kind of access */
+	reg = qual & 0xf;
+	access = (qual >> 4) & 0x3;
+
+	switch (access) {
+	case CR_WRITE:
+		dprintf(("vmx_handle_cr: mov to cr%d @ %llx\n",
+		    reg, vcpu->vc_gueststate.vg_rip));
+		break;
+	case CR_READ:
+		dprintf(("vmx_handle_cr: mov from cr%d @ %llx\n",
+		    reg, vcpu->vc_gueststate.vg_rip));
+		break;
+	case CR_CLTS:
+		dprintf(("vmx_handle_cr: clts instruction @ %llx\n",
+		    vcpu->vc_gueststate.vg_rip));
+		break;
+	case CR_LMSW:
+		dprintf(("vmx_handle_cr: lmsw instruction @ %llx\n",
+		    vcpu->vc_gueststate.vg_rip));
+		break;
+	default:
+		dprintf(("vmx_handle_cr: unknown cr access @ %llx\n",
+		    vcpu->vc_gueststate.vg_rip));
+	}
+
+	/* Step over the instruction that caused the exit */
+	vcpu->vc_gueststate.vg_rip += len;
+
+	return (0);
+}
+
+/*
+ * vmx_handle_cpuid
+ *
+ * Exit handler for CPUID instruction.
+ *
+ * The requested function is taken from the guest's rax (and rcx for the
+ * subleaf of function 0x07). Results are written to the guest's saved
+ * rax/rbx/rcx/rdx. Functions reported as "not supported" below leave the
+ * guest's registers unchanged. The saved guest rip is advanced past the
+ * instruction.
+ *
+ * Returns 0 on success, or 1 if the instruction length could not be read.
+ */
+int
+vmx_handle_cpuid(struct vcpu *vcpu)
+{
+	uint64_t insn_length;
+	uint64_t *rax, *rbx, *rcx, *rdx;
+
+	if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+		printf("vmx_handle_cpuid: can't obtain instruction length\n");
+		return (1);
+	}
+
+	/*
+	 * All CPUID instructions are 0x0F 0xA2
+	 *
+	 * NOTE(review): insn_length comes from the guest's instruction
+	 * stream; confirm a guest cannot produce a longer encoding (eg, with
+	 * redundant prefixes) and trip this assertion.
+	 */
+	KASSERT(insn_length == 2);
+
+	rax = &vcpu->vc_gueststate.vg_rax;
+	rbx = &vcpu->vc_gueststate.vg_rbx;
+	rcx = &vcpu->vc_gueststate.vg_rcx;
+	rdx = &vcpu->vc_gueststate.vg_rdx;
+
+	switch (*rax) {
+	case 0x00:	/* Max level and vendor ID */
+		/*
+		 * NOTE(review): CPUID leaf 0 returns the vendor string in
+		 * EBX, EDX, ECX order - verify the layout of cpu_vendor
+		 * matches the register assignment below.
+		 */
+		*rax = 0x07; /* cpuid_level */
+		*rbx = *((uint32_t *)&cpu_vendor);
+		*rcx = *((uint32_t *)&cpu_vendor + 1);
+		*rdx = *((uint32_t *)&cpu_vendor + 2);
+		break;
+	case 0x01:	/* Version, brand, feature info */
+		*rax = cpu_id;
+		/*
+		 * mask off host's APIC ID, reset to vcpu id. The id must be
+		 * OR'ed in: the top byte was just cleared, so AND'ing would
+		 * always leave ebx as 0.
+		 */
+		*rbx = cpu_ebxfeature & 0x00FFFFFF;
+		*rbx |= (uint64_t)(vcpu->vc_id & 0xFF) << 24;
+		/*
+		 * clone host capabilities minus:
+		 *  speedstep (CPUIDECX_EST)
+		 *  vmx (CPUIDECX_VMX)
+		 *  xsave (CPUIDECX_XSAVE)
+		 *  thermal (CPUIDECX_TM2, CPUID_ACPI, CPUID_TM)
+		 *  XXX - timestamp (CPUID_TSC)
+		 *  monitor/mwait (CPUIDECX_MWAIT)
+		 *  performance monitoring (CPUIDECX_PDCM)
+		 * plus:
+		 *  hypervisor (CPUIDECX_HV)
+		 */
+		*rcx = (cpu_ecxfeature | CPUIDECX_HV) &
+		    ~(CPUIDECX_EST | CPUIDECX_TM2 |
+		    CPUIDECX_MWAIT | CPUIDECX_PDCM |
+		    CPUIDECX_VMX | CPUIDECX_XSAVE);
+		*rdx = curcpu()->ci_feature_flags &
+		    ~(CPUID_ACPI | CPUID_TM | CPUID_TSC);
+		break;
+	case 0x02:	/* Cache and TLB information */
+		dprintf(("vmx_handle_cpuid: function 0x02 (cache/TLB) not"
+		    " supported\n"));
+		break;
+	case 0x03:	/* Processor serial number (not supported) */
+		*rax = 0;
+		*rbx = 0;
+		*rcx = 0;
+		*rdx = 0;
+		break;
+	case 0x04:
+		dprintf(("vmx_handle_cpuid: function 0x04 (deterministic "
+		    "cache info) not supported\n"));
+		break;
+	case 0x05:	/* MONITOR/MWAIT (not supported) */
+		*rax = 0;
+		*rbx = 0;
+		*rcx = 0;
+		*rdx = 0;
+		break;
+	case 0x06:	/* Thermal / Power management */
+		/* Only ARAT is exposed in function 0x06 */
+		*rax = TPM_ARAT;
+		*rbx = 0;
+		*rcx = 0;
+		*rdx = 0;
+		break;
+	case 0x07:	/* SEFF */
+		if (*rcx == 0) {
+			*rax = 0;	/* Highest subleaf supported */
+			*rbx = curcpu()->ci_feature_sefflags_ebx;
+			*rcx = curcpu()->ci_feature_sefflags_ecx;
+			*rdx = 0;
+		} else {
+			/* Unsupported subleaf */
+			*rax = 0;
+			*rbx = 0;
+			*rcx = 0;
+			*rdx = 0;
+		}
+		break;
+	case 0x09:	/* Direct Cache Access (not supported) */
+		dprintf(("vmx_handle_cpuid: function 0x09 (direct cache access)"
+		    " not supported\n"));
+		break;
+	case 0x0a:	/* Architectural performance monitoring */
+		*rax = 0;
+		*rbx = 0;
+		*rcx = 0;
+		*rdx = 0;
+		break;
+	case 0x0b:	/* Extended topology enumeration (not supported) */
+		dprintf(("vmx_handle_cpuid: function 0x0b (topology enumeration)"
+		    " not supported\n"));
+		break;
+	case 0x0d:	/* Processor ext. state information (not supported) */
+		dprintf(("vmx_handle_cpuid: function 0x0d (ext. state info)"
+		    " not supported\n"));
+		break;
+	case 0x0f:	/* QoS info (not supported) */
+		dprintf(("vmx_handle_cpuid: function 0x0f (QoS info)"
+		    " not supported\n"));
+		break;
+	case 0x14:	/* Processor Trace info (not supported) */
+		dprintf(("vmx_handle_cpuid: function 0x14 (processor trace info)"
+		    " not supported\n"));
+		break;
+	case 0x15:	/* TSC / Core Crystal Clock info (not supported) */
+		dprintf(("vmx_handle_cpuid: function 0x15 (TSC / CCC info)"
+		    " not supported\n"));
+		break;
+	case 0x16:	/* Processor frequency info (not supported) */
+		dprintf(("vmx_handle_cpuid: function 0x16 (frequency info)"
+		    " not supported\n"));
+		break;
+	case 0x40000000:	/* Hypervisor information */
+		*rax = 0;
+		*rbx = *((uint32_t *)&vmm_hv_signature[0]);
+		*rcx = *((uint32_t *)&vmm_hv_signature[4]);
+		*rdx = *((uint32_t *)&vmm_hv_signature[8]);
+		break;
+	case 0x80000000:	/* Extended function level */
+		*rax = 0x80000007; /* curcpu()->ci_pnfeatset */
+		*rbx = 0;
+		*rcx = 0;
+		*rdx = 0;
+		/* Must not fall into 0x80000001, which overwrites rax */
+		break;
+	case 0x80000001: 	/* Extended function info */
+		*rax = curcpu()->ci_efeature_eax;
+		*rbx = 0;	/* Reserved */
+		*rcx = curcpu()->ci_efeature_ecx;
+		*rdx = curcpu()->ci_feature_eflags;
+		break;
+	case 0x80000002:	/* Brand string */
+		*rax = curcpu()->ci_brand[0];
+		*rbx = curcpu()->ci_brand[1];
+		*rcx = curcpu()->ci_brand[2];
+		*rdx = curcpu()->ci_brand[3];
+		break;
+	case 0x80000003:	/* Brand string */
+		*rax = curcpu()->ci_brand[4];
+		*rbx = curcpu()->ci_brand[5];
+		*rcx = curcpu()->ci_brand[6];
+		*rdx = curcpu()->ci_brand[7];
+		break;
+	case 0x80000004:	/* Brand string */
+		*rax = curcpu()->ci_brand[8];
+		*rbx = curcpu()->ci_brand[9];
+		*rcx = curcpu()->ci_brand[10];
+		*rdx = curcpu()->ci_brand[11];
+		break;
+	case 0x80000005:	/* Reserved (Intel), cacheinfo (AMD) */
+		*rax = curcpu()->ci_amdcacheinfo[0];
+		*rbx = curcpu()->ci_amdcacheinfo[1];
+		*rcx = curcpu()->ci_amdcacheinfo[2];
+		*rdx = curcpu()->ci_amdcacheinfo[3];
+		break;
+	case 0x80000006:	/* ext. cache info */
+		*rax = curcpu()->ci_extcacheinfo[0];
+		*rbx = curcpu()->ci_extcacheinfo[1];
+		*rcx = curcpu()->ci_extcacheinfo[2];
+		*rdx = curcpu()->ci_extcacheinfo[3];
+		break;
+	case 0x80000007:	/* apmi */
+		*rax = 0;	/* Reserved */
+		*rbx = 0;	/* Reserved */
+		*rcx = 0;	/* Reserved */
+		*rdx = cpu_apmi_edx;
+		break;
+	case 0x80000008:	/* Phys bits info and topology (AMD) */
+		dprintf(("vmx_handle_cpuid: function 0x80000008 (phys bits info)"
+		    " not supported\n"));
+		break;
+	default:
+		dprintf(("vmx_handle_cpuid: unsupported rax=0x%llx\n", *rax));
+	}
+
+	vcpu->vc_gueststate.vg_rip += insn_length;
+
+	return (0);
+}
+
+/*
+ * vcpu_run_svm
+ *
+ * SVM counterpart of vcpu_run_vmx: the VMM main loop used to run a VCPU.
+ *
+ * Currently a placeholder that does nothing and returns 0.
+ */
+int
+vcpu_run_svm(struct vcpu *vcpu, uint8_t from_exit)
+{
+	/* XXX removed due to rot */
+
+	return (0);
+}
+
+/*
+ * vmx_fix_ept_pte
+ *
+ * Fixes up the pmap PTE entry for 'addr' to reflect proper EPT format.
+ * The work is done by pmap_fix_ept, which is asserted to report level 0.
+ *
+ * Always returns 0.
+ */
+int
+vmx_fix_ept_pte(struct pmap *pmap, vaddr_t addr)
+{
+	int slot, lvl;
+
+	lvl = pmap_fix_ept(pmap, addr, &slot);
+	KASSERT(lvl == 0);
+
+	return (0);
+}
+
+/*
+ * vmx_exit_reason_decode
+ *
+ * Maps VMX exit reason 'code' to a human readable description, for use
+ * in diagnostics. Codes without an entry in the table below decode as
+ * "unknown".
+ */
+const char *
+vmx_exit_reason_decode(uint32_t code)
+{
+	static const struct {
+		uint32_t reason;
+		const char *desc;
+	} reasons[] = {
+		{ VMX_EXIT_NMI, "NMI" },
+		{ VMX_EXIT_EXTINT, "external interrupt" },
+		{ VMX_EXIT_TRIPLE_FAULT, "triple fault" },
+		{ VMX_EXIT_INIT, "INIT signal" },
+		{ VMX_EXIT_SIPI, "SIPI signal" },
+		{ VMX_EXIT_IO_SMI, "I/O SMI" },
+		{ VMX_EXIT_OTHER_SMI, "other SMI" },
+		{ VMX_EXIT_INT_WINDOW, "interrupt window" },
+		{ VMX_EXIT_NMI_WINDOW, "NMI window" },
+		{ VMX_EXIT_TASK_SWITCH, "task switch" },
+		{ VMX_EXIT_CPUID, "CPUID instruction" },
+		{ VMX_EXIT_GETSEC, "GETSEC instruction" },
+		{ VMX_EXIT_HLT, "HLT instruction" },
+		{ VMX_EXIT_INVD, "INVD instruction" },
+		{ VMX_EXIT_INVLPG, "INVLPG instruction" },
+		{ VMX_EXIT_RDPMC, "RDPMC instruction" },
+		{ VMX_EXIT_RDTSC, "RDTSC instruction" },
+		{ VMX_EXIT_RSM, "RSM instruction" },
+		{ VMX_EXIT_VMCALL, "VMCALL instruction" },
+		{ VMX_EXIT_VMCLEAR, "VMCLEAR instruction" },
+		{ VMX_EXIT_VMLAUNCH, "VMLAUNCH instruction" },
+		{ VMX_EXIT_VMPTRLD, "VMPTRLD instruction" },
+		{ VMX_EXIT_VMPTRST, "VMPTRST instruction" },
+		{ VMX_EXIT_VMREAD, "VMREAD instruction" },
+		{ VMX_EXIT_VMRESUME, "VMRESUME instruction" },
+		{ VMX_EXIT_VMWRITE, "VMWRITE instruction" },
+		{ VMX_EXIT_VMXOFF, "VMXOFF instruction" },
+		{ VMX_EXIT_VMXON, "VMXON instruction" },
+		{ VMX_EXIT_CR_ACCESS, "CR access" },
+		{ VMX_EXIT_MOV_DR, "MOV DR instruction" },
+		{ VMX_EXIT_IO, "I/O instruction" },
+		{ VMX_EXIT_RDMSR, "RDMSR instruction" },
+		{ VMX_EXIT_WRMSR, "WRMSR instruction" },
+		{ VMX_EXIT_ENTRY_FAILED_GUEST_STATE, "guest state invalid" },
+		{ VMX_EXIT_ENTRY_FAILED_MSR_LOAD, "MSR load failed" },
+		{ VMX_EXIT_MWAIT, "MWAIT instruction" },
+		{ VMX_EXIT_MTF, "monitor trap flag" },
+		{ VMX_EXIT_MONITOR, "MONITOR instruction" },
+		{ VMX_EXIT_PAUSE, "PAUSE instruction" },
+		{ VMX_EXIT_ENTRY_FAILED_MCE, "MCE during entry" },
+		{ VMX_EXIT_TPR_BELOW_THRESHOLD, "TPR below threshold" },
+		{ VMX_EXIT_APIC_ACCESS, "APIC access" },
+		{ VMX_EXIT_VIRTUALIZED_EOI, "virtualized EOI" },
+		{ VMX_EXIT_GDTR_IDTR, "GDTR/IDTR access" },
+		{ VMX_EXIT_LDTR_TR, "LDTR/TR access" },
+		{ VMX_EXIT_EPT_VIOLATION, "EPT violation" },
+		{ VMX_EXIT_EPT_MISCONFIGURATION, "EPT misconfiguration" },
+		{ VMX_EXIT_INVEPT, "INVEPT instruction" },
+		{ VMX_EXIT_RDTSCP, "RDTSCP instruction" },
+		{ VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED,
+		    "preemption timer expired" },
+		{ VMX_EXIT_INVVPID, "INVVPID instruction" },
+		{ VMX_EXIT_WBINVD, "WBINVD instruction" },
+		{ VMX_EXIT_XSETBV, "XSETBV instruction" },
+		{ VMX_EXIT_APIC_WRITE, "APIC write" },
+		{ VMX_EXIT_RDRAND, "RDRAND instruction" },
+		{ VMX_EXIT_INVPCID, "INVPCID instruction" },
+		{ VMX_EXIT_VMFUNC, "VMFUNC instruction" },
+	};
+	size_t k;
+
+	for (k = 0; k < sizeof(reasons) / sizeof(reasons[0]); k++) {
+		if (reasons[k].reason == code)
+			return reasons[k].desc;
+	}
+
+	return "unknown";
+}
+
+/*
+ * vmx_instruction_error_decode
+ *
+ * Returns a human readable string describing the instruction error in 'code'
+ *
+ * Parameters:
+ *  code: VM-instruction error number (the value of the VMCS
+ *   VM-instruction error field)
+ *
+ * Return value:
+ *  Pointer to a constant string; "unknown" for any number that has no
+ *  entry below. The caller must not modify or free the string.
+ */
+const char *
+vmx_instruction_error_decode(uint32_t code)
+{
+ switch(code) {
+ case 1: return "VMCALL: unsupported in VMX root";
+ case 2: return "VMCLEAR: invalid paddr";
+ case 3: return "VMCLEAR: VMXON pointer";
+ case 4: return "VMLAUNCH: non-clear VMCS";
+ case 5: return "VMRESUME: non-launched VMCS";
+ case 6: return "VMRESUME: executed after VMXOFF";
+ case 7: return "VM entry: invalid control field(s)";
+ case 8: return "VM entry: invalid host state field(s)";
+ case 9: return "VMPTRLD: invalid paddr";
+ case 10: return "VMPTRLD: VMXON pointer";
+ case 11: return "VMPTRLD: incorrect VMCS revid";
+ case 12: return "VMREAD/VMWRITE: unsupported VMCS field";
+ case 13: return "VMWRITE: RO VMCS field";
+ case 15: return "VMXON: unsupported in VMX root";
+ /* 16-19 and 22-25 relate to the executive VMCS / SMM monitor */
+ case 16: return "VM entry: invalid executive VMCS pointer";
+ case 17: return "VM entry: non-launched executive VMCS";
+ case 18: return "VM entry: executive VMCS pointer != VMXON pointer";
+ case 19: return "VMCALL: non-clear VMCS";
+ case 20: return "VMCALL: invalid VM exit control fields";
+ case 22: return "VMCALL: incorrect MSEG revid";
+ case 23: return "VMXOFF: under dual-monitor treatment";
+ case 24: return "VMCALL: invalid SMM-monitor features";
+ case 25: return "VM entry: invalid controls in executive VMCS";
+ case 26: return "VM entry: blocked by MOV SS";
+ case 28: return "Invalid operand to INVEPT/INVVPID";
+ default: return "unknown";
+ }
+}
+
+/*
+ * dump_vcpu
+ *
+ * Dumps the VMX capabilities of vcpu 'vcpu'
+ *
+ * Prints the address of 'vcpu' and of its parent VM and, when the vcpu is
+ * in VMX or EPT mode, the raw value of each vc_vmx_*_ctls field followed by
+ * one CTRL_DUMP() invocation per individual control.
+ *
+ * For any other vc_virt_mode nothing is printed after "mode: ", so that
+ * line is left without a trailing newline.
+ *
+ * Parameters:
+ *  vcpu: the vcpu to describe
+ */
+void
+dump_vcpu(struct vcpu *vcpu)
+{
+ printf("vcpu @ 0x%llx\n", (uint64_t)vcpu);
+ printf(" parent vm @ 0x%llx\n", (uint64_t)vcpu->vc_parent);
+ printf(" mode: ");
+ if (vcpu->vc_virt_mode == VMM_MODE_VMX ||
+ vcpu->vc_virt_mode == VMM_MODE_EPT) {
+ printf("VMX\n");
+ /* Pin-based controls */
+ printf(" pinbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_pinbased_ctls);
+ printf(" true pinbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_pinbased_ctls);
+ CTRL_DUMP(vcpu, PINBASED, EXTERNAL_INT_EXITING);
+ CTRL_DUMP(vcpu, PINBASED, NMI_EXITING);
+ CTRL_DUMP(vcpu, PINBASED, VIRTUAL_NMIS);
+ CTRL_DUMP(vcpu, PINBASED, ACTIVATE_VMX_PREEMPTION_TIMER);
+ CTRL_DUMP(vcpu, PINBASED, PROCESS_POSTED_INTERRUPTS);
+ /* Primary processor-based controls */
+ printf(" procbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_procbased_ctls);
+ printf(" true procbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_procbased_ctls);
+ CTRL_DUMP(vcpu, PROCBASED, INTERRUPT_WINDOW_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_TSC_OFFSETTING);
+ CTRL_DUMP(vcpu, PROCBASED, HLT_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, INVLPG_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, MWAIT_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, RDPMC_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, RDTSC_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR3_LOAD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR3_STORE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR8_LOAD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR8_STORE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_TPR_SHADOW);
+ CTRL_DUMP(vcpu, PROCBASED, NMI_WINDOW_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, MOV_DR_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, UNCONDITIONAL_IO_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_IO_BITMAPS);
+ /*
+ * Secondary controls are dumped only when the check for
+ * IA32_VMX_ACTIVATE_SECONDARY_CONTROLS succeeds.
+ */
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ printf(" procbased2 ctls: 0x%llx\n",
+ vcpu->vc_vmx_procbased2_ctls);
+ CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_APIC);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_EPT);
+ CTRL_DUMP(vcpu, PROCBASED2, DESCRIPTOR_TABLE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_RDTSCP);
+ CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_X2APIC_MODE);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VPID);
+ CTRL_DUMP(vcpu, PROCBASED2, WBINVD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, UNRESTRICTED_GUEST);
+ CTRL_DUMP(vcpu, PROCBASED2,
+ APIC_REGISTER_VIRTUALIZATION);
+ CTRL_DUMP(vcpu, PROCBASED2,
+ VIRTUAL_INTERRUPT_DELIVERY);
+ CTRL_DUMP(vcpu, PROCBASED2, PAUSE_LOOP_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, RDRAND_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_INVPCID);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VM_FUNCTIONS);
+ CTRL_DUMP(vcpu, PROCBASED2, VMCS_SHADOWING);
+ CTRL_DUMP(vcpu, PROCBASED2, EPT_VIOLATION_VE);
+ }
+ /* VM entry controls */
+ printf(" entry ctls: 0x%llx\n",
+ vcpu->vc_vmx_entry_ctls);
+ /* Print the true *entry* controls, not the procbased ones */
+ printf(" true entry ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_entry_ctls);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_DEBUG_CONTROLS);
+ CTRL_DUMP(vcpu, ENTRY, IA32E_MODE_GUEST);
+ CTRL_DUMP(vcpu, ENTRY, ENTRY_TO_SMM);
+ CTRL_DUMP(vcpu, ENTRY, DEACTIVATE_DUAL_MONITOR_TREATMENT);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PAT_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_EFER_ON_ENTRY);
+ /* VM exit controls */
+ printf(" exit ctls: 0x%llx\n",
+ vcpu->vc_vmx_exit_ctls);
+ printf(" true exit ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_exit_ctls);
+ CTRL_DUMP(vcpu, EXIT, SAVE_DEBUG_CONTROLS);
+ CTRL_DUMP(vcpu, EXIT, HOST_SPACE_ADDRESS_SIZE);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, ACKNOWLEDGE_INTERRUPT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_IA32_PAT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PAT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_IA32_EFER_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_EFER_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_VMX_PREEMPTION_TIMER);
+ }
+}
diff --git a/sys/arch/amd64/amd64/vmm_support.S b/sys/arch/amd64/amd64/vmm_support.S
new file mode 100644
index 00000000000..c783a2fe5c2
--- /dev/null
+++ b/sys/arch/amd64/amd64/vmm_support.S
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "assym.h"
+#include <machine/asm.h>
+#include <machine/specialreg.h>
+
+/*
+ * XXX duplicated in vmmvar.h due to song-and-dance with sys/rwlock.h inclusion
+ * here
+ */
+#define VMX_FAIL_LAUNCH_UNKNOWN 1
+#define VMX_FAIL_LAUNCH_INVALID_VMCS 2
+#define VMX_FAIL_LAUNCH_VALID_VMCS 3
+
+ .text
+ .code64
+ .align 16
+ .global _C_LABEL(vmxon)
+ .global _C_LABEL(vmxoff)
+ .global _C_LABEL(vmclear)
+ .global _C_LABEL(vmptrld)
+ .global _C_LABEL(vmptrst)
+ .global _C_LABEL(vmwrite)
+ .global _C_LABEL(vmread)
+ .global _C_LABEL(invvpid)
+ .global _C_LABEL(invept)
+ .global _C_LABEL(vmx_enter_guest)
+ .global _C_LABEL(vmm_dispatch_intr)
+_C_LABEL(vmm_dispatch_intr): /* call the routine whose address is in %rdi on a hand-built interrupt-style frame */
+ movq %rsp, %r11 /* r11 = temporary register */
+ andq $0xFFFFFFFFFFFFFFF0, %rsp /* round %rsp down to a 16-byte boundary */
+ movw %ss, %ax
+ pushq %ax /* frame: stack segment selector */
+ pushq %r11 /* frame: %rsp as it was on entry */
+ pushfq /* frame: flags, captured before the cli below */
+ movw %cs, %ax
+ pushq %ax /* frame: code segment selector */
+ cli
+ callq *%rdi /* pushes the return address, then enters the routine */
+ ret /* NOTE(review): only correct if the callee unwound the frame (e.g. via iretq) -- confirm */
+
+_C_LABEL(vmxon): /* execute VMXON with the memory operand at (%rdi); returns 0 on success, 1 on failure */
+ vmxon (%rdi)
+ jz failed_on /* ZF set: instruction failed */
+ jc failed_on /* CF set: instruction failed */
+ xorq %rax, %rax /* success: return 0 */
+ ret
+failed_on:
+ movq $0x01, %rax /* failure: return 1 */
+ ret
+
+_C_LABEL(vmxoff): /* execute VMXOFF; returns 0 on success, 1 on failure */
+ vmxoff
+ jz failed_off /* ZF set: instruction failed */
+ jc failed_off /* CF set: instruction failed */
+ xorq %rax, %rax /* success: return 0 */
+ ret
+failed_off:
+ movq $0x01, %rax /* failure: return 1 */
+ ret
+
+_C_LABEL(vmclear): /* execute VMCLEAR with the memory operand at (%rdi); returns 0 on success, 1 on failure */
+ vmclear (%rdi)
+ jz failed_clear /* ZF set: instruction failed */
+ jc failed_clear /* CF set: instruction failed */
+ xorq %rax, %rax /* success: return 0 */
+ ret
+failed_clear:
+ movq $0x01, %rax /* failure: return 1 */
+ ret
+
+_C_LABEL(vmptrld): /* execute VMPTRLD with the memory operand at (%rdi); returns 0 on success, 1 on failure */
+ vmptrld (%rdi)
+ jz failed_ptrld /* ZF set: instruction failed */
+ jc failed_ptrld /* CF set: instruction failed */
+ xorq %rax, %rax /* success: return 0 */
+ ret
+failed_ptrld:
+ movq $0x01, %rax /* failure: return 1 */
+ ret
+
+_C_LABEL(vmptrst): /* execute VMPTRST, storing its result to memory at (%rdi); returns 0 on success, 1 on failure */
+ vmptrst (%rdi)
+ jz failed_ptrst /* ZF set: instruction failed */
+ jc failed_ptrst /* CF set: instruction failed */
+ xorq %rax, %rax /* success: return 0 */
+ ret
+failed_ptrst:
+ movq $0x01, %rax /* failure: return 1 */
+ ret
+
+_C_LABEL(vmwrite): /* write the value in %rsi to the VMCS field encoded in %rdi; returns 0 on success, 1 on failure */
+ vmwrite %rsi, %rdi
+ jz failed_write /* ZF set: instruction failed */
+ jc failed_write /* CF set: instruction failed */
+ xorq %rax, %rax /* success: return 0 */
+ ret
+failed_write:
+ movq $0x01, %rax /* failure: return 1 */
+ ret
+
+_C_LABEL(vmread): /* read the VMCS field encoded in %rdi into memory at (%rsi); returns 0 on success, 1 on failure */
+ vmread %rdi, (%rsi)
+ jz failed_read /* ZF set: instruction failed */
+ jc failed_read /* CF set: instruction failed */
+ xorq %rax, %rax /* success: return 0 */
+ ret
+failed_read:
+ movq $0x01, %rax /* failure: return 1 */
+ ret
+
+_C_LABEL(invvpid): /* execute INVVPID: type in %rdi, descriptor in memory at (%rsi) */
+ invvpid (%rsi), %rdi
+ ret /* flags are not checked and %rax is not set, so no status is returned */
+
+_C_LABEL(invept): /* execute INVEPT: type in %rdi, descriptor in memory at (%rsi) */
+ invept (%rsi), %rdi
+ ret /* flags are not checked and %rax is not set, so no status is returned */
+
+_C_LABEL(vmx_enter_guest): /* enter the guest with VMLAUNCH/VMRESUME; %rsi = guest register save area, %rdx = resume flag */
+ movq %rdx, %r8 /* resume flag */
+ testq %r8, %r8
+ jnz skip_init /* resuming: host RIP is not rewritten */
+
+ /*
+ * XXX make vmx_exit_handler a global and put this in the per-vcpu
+ * init code
+ */
+ movq $VMCS_HOST_IA32_RIP, %rdi /* incoming %rdi is overwritten here without having been read */
+ movq $vmx_exit_handler_asm, %rax
+ vmwrite %rax, %rdi /* Host RIP */
+
+skip_init:
+ /*
+ * XXX use msr list here for restore instead of all this
+ * stack jiggery-pokery
+ */
+
+ pushfq /* restore_host pops everything saved from here on in reverse order */
+
+ /*
+ * Save (possibly) lazy-switched selectors
+ */
+ movw %es, %ax
+ pushw %ax /* 16-bit push, matched by popw in restore_host */
+ movw %ds, %ax
+ pushw %ax
+ movw %ss, %ax
+ pushw %ax
+
+ movq $MSR_FSBASE, %rcx
+ rdmsr
+ pushq %rax /* MSR low half first ... */
+ pushq %rdx /* ... then high half; restore_host pops %rdx then %rax */
+ pushw %fs
+ movq $MSR_GSBASE, %rcx
+ rdmsr
+ pushq %rax
+ pushq %rdx
+ pushw %gs
+ movq $MSR_KERNELGSBASE, %rcx
+ rdmsr
+ pushq %rax
+ pushq %rdx
+
+ /*
+ * Save various MSRs
+ */
+ movq $MSR_STAR, %rcx
+ rdmsr
+ pushq %rax
+ pushq %rdx
+
+ movq $MSR_LSTAR, %rcx
+ rdmsr
+ pushq %rax
+ pushq %rdx
+
+ /* XXX - unused? */
+ movq $MSR_CSTAR, %rcx
+ rdmsr
+ pushq %rax
+ pushq %rdx
+
+ movq $MSR_SFMASK, %rcx
+ rdmsr
+ pushq %rax
+ pushq %rdx
+
+ /* Preserve callee-preserved registers as per AMD64 ABI */
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbp
+ pushq %rbx
+ pushq %rsi /* Guest Regs Pointer */
+
+ movq $VMCS_HOST_IA32_RSP, %rdi
+ movq %rsp, %rax /* top of stack is now the guest regs pointer */
+ vmwrite %rax, %rdi /* Host RSP */
+
+ testq %r8, %r8
+ jnz do_resume /* nonzero resume flag: VMRESUME instead of VMLAUNCH */
+
+ /* Restore guest registers */
+ movq 0x78(%rsi), %rax
+ movq %rax, %cr2 /* guest %cr2 is kept at offset 0x78 */
+ movq 0x70(%rsi), %r15
+ movq 0x68(%rsi), %r14
+ movq 0x60(%rsi), %r13
+ movq 0x58(%rsi), %r12
+ movq 0x50(%rsi), %r11
+ movq 0x48(%rsi), %r10
+ movq 0x40(%rsi), %r9
+ movq 0x38(%rsi), %r8
+ movq 0x30(%rsi), %rbp
+ movq 0x28(%rsi), %rdi
+ movq 0x20(%rsi), %rdx
+ movq 0x18(%rsi), %rcx
+ movq 0x10(%rsi), %rbx
+ movq 0x08(%rsi), %rax
+ movq 0x00(%rsi), %rsi /* loaded last: %rsi is the base pointer for all loads above */
+
+ vmlaunch
+ jmp fail_launch_or_resume /* reached only if vmlaunch fell through; jmp leaves the flags intact */
+do_resume:
+ /* Restore guest registers */
+ movq 0x78(%rsi), %rax
+ movq %rax, %cr2 /* guest %cr2 is kept at offset 0x78 */
+ movq 0x70(%rsi), %r15
+ movq 0x68(%rsi), %r14
+ movq 0x60(%rsi), %r13
+ movq 0x58(%rsi), %r12
+ movq 0x50(%rsi), %r11
+ movq 0x48(%rsi), %r10
+ movq 0x40(%rsi), %r9
+ movq 0x38(%rsi), %r8
+ movq 0x30(%rsi), %rbp
+ movq 0x28(%rsi), %rdi
+ movq 0x20(%rsi), %rdx
+ movq 0x18(%rsi), %rcx
+ movq 0x10(%rsi), %rbx
+ movq 0x08(%rsi), %rax
+ movq 0x00(%rsi), %rsi /* loaded last: %rsi is the base pointer for all loads above */
+ vmresume
+fail_launch_or_resume:
+ /* Failed launch/resume (fell through) */
+ jc fail_launch_invalid_vmcs /* Invalid VMCS */
+ jz fail_launch_valid_vmcs /* Valid VMCS, failed launch/resume */
+
+ /* Unknown failure mode (not documented as per Intel SDM) */
+fail_launch_unknown:
+ movq $VMX_FAIL_LAUNCH_UNKNOWN, %rdi /* %rdi carries the status that restore_host returns */
+ popq %rsi /* drop the guest regs pointer pushed before entry */
+ jmp restore_host
+
+fail_launch_invalid_vmcs:
+ movq $VMX_FAIL_LAUNCH_INVALID_VMCS, %rdi
+ popq %rsi /* drop the guest regs pointer pushed before entry */
+ jmp restore_host
+
+fail_launch_valid_vmcs:
+ movq $VMCS_INSTRUCTION_ERROR, %rdi
+ popq %rsi /* recover the guest regs pointer */
+ vmread %rdi, %rax
+ /* XXX check failure of vmread */
+ movl %eax, 0x80(%rsi) /* store the 32-bit instruction error at offset 0x80 of the save area */
+ movq $VMX_FAIL_LAUNCH_VALID_VMCS, %rdi
+ jmp restore_host
+
+vmx_exit_handler_asm: /* written to VMCS_HOST_IA32_RIP by vmx_enter_guest; saves guest registers, then falls into restore_host */
+ /* Preserve guest registers not saved in VMCS */
+ pushq %rsi /* guest %rsi */
+ pushq %rdi /* guest %rdi */
+ movq 0x10(%rsp), %rdi /* guest regs pointer pushed by vmx_enter_guest */
+ movq 0x8(%rsp), %rsi /* guest %rsi pushed two instructions above */
+ movq %rsi, (%rdi) /* save guest %rsi at offset 0x00 */
+ popq %rdi /* guest %rdi back in place */
+ popq %rsi /* discard */
+
+ popq %rsi /* guest regs pointer */
+ movq %rax, 0x8(%rsi)
+ movq %rbx, 0x10(%rsi)
+ movq %rcx, 0x18(%rsi)
+ movq %rdx, 0x20(%rsi)
+ movq %rdi, 0x28(%rsi)
+ movq %rbp, 0x30(%rsi)
+ movq %r8, 0x38(%rsi)
+ movq %r9, 0x40(%rsi)
+ movq %r10, 0x48(%rsi)
+ movq %r11, 0x50(%rsi)
+ movq %r12, 0x58(%rsi)
+ movq %r13, 0x60(%rsi)
+ movq %r14, 0x68(%rsi)
+ movq %r15, 0x70(%rsi)
+ movq %cr2, %rax /* guest %rax was already saved above, so %rax is free */
+ movq %rax, 0x78(%rsi)
+
+ /* %rdi = 0 means we took an exit */
+ xorq %rdi, %rdi
+
+
+restore_host: /* undo the pushes made in vmx_enter_guest, in reverse order, and return the status held in %rdi */
+ popq %rbx
+ popq %rbp
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+
+ /*
+ * Restore saved MSRs
+ */
+ popq %rdx /* high half was pushed last, so it comes off first */
+ popq %rax
+ movq $MSR_SFMASK, %rcx
+ wrmsr
+
+ /* XXX - unused? */
+ popq %rdx
+ popq %rax
+ movq $MSR_CSTAR, %rcx
+ wrmsr
+
+ popq %rdx
+ popq %rax
+ movq $MSR_LSTAR, %rcx
+ wrmsr
+
+ popq %rdx
+ popq %rax
+ movq $MSR_STAR, %rcx
+ wrmsr
+
+ /*
+ * popw %gs will reset gsbase to 0, so preserve it
+ * first. This is to accommodate possibly lazy-switched
+ * selectors from above
+ */
+ cli /* interrupts stay off until the sti below */
+ popq %rdx
+ popq %rax
+ movq $MSR_KERNELGSBASE, %rcx
+ wrmsr
+
+ popw %gs
+ popq %rdx
+ popq %rax
+ movq $MSR_GSBASE, %rcx
+ wrmsr /* GS base is rewritten after the selector reload above */
+
+ popw %fs
+ popq %rdx
+ popq %rax
+ movq $MSR_FSBASE, %rcx
+ wrmsr /* FS base is rewritten after the selector reload above */
+ sti /* unconditional; the saved flags are restored by popfq below */
+
+ popw %ax
+ movw %ax, %ss
+ popw %ax
+ movw %ax, %ds
+ popw %ax
+ movw %ax, %es
+
+ popfq
+
+ movq %rdi, %rax /* return value: 0 after a VM exit, VMX_FAIL_LAUNCH_* after a failed entry */
+ ret
+
diff --git a/sys/arch/amd64/conf/GENERIC b/sys/arch/amd64/conf/GENERIC
index 5150f3ee287..a77f7dae17b 100644
--- a/sys/arch/amd64/conf/GENERIC
+++ b/sys/arch/amd64/conf/GENERIC
@@ -1,4 +1,4 @@
-# $OpenBSD: GENERIC,v 1.399 2015/10/29 07:47:02 kettenis Exp $
+# $OpenBSD: GENERIC,v 1.400 2015/11/13 07:52:20 mlarkin Exp $
#
# For further information on compiling OpenBSD kernels, see the config(8)
# man page.
@@ -24,6 +24,7 @@ option MTRR # CPU memory range attributes control
option NTFS # NTFS support
option HIBERNATE # Hibernate support
+#option VMM # VMM support
config bsd swap generic
@@ -37,6 +38,7 @@ isa0 at pcib?
isa0 at amdpcib?
isa0 at tcpcib?
pci* at mainbus0
+#vmm0 at mainbus0
pvbus0 at mainbus0
acpi0 at bios0
diff --git a/sys/arch/amd64/conf/Makefile.amd64 b/sys/arch/amd64/conf/Makefile.amd64
index c9dcb318834..c0a84a967a7 100644
--- a/sys/arch/amd64/conf/Makefile.amd64
+++ b/sys/arch/amd64/conf/Makefile.amd64
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.amd64,v 1.65 2015/01/13 01:12:49 deraadt Exp $
+# $OpenBSD: Makefile.amd64,v 1.66 2015/11/13 07:52:20 mlarkin Exp $
# For instructions on building kernels consult the config(8) and options(4)
# manual pages.
@@ -142,7 +142,7 @@ db_structinfo.h: $S/ddb/db_structinfo.c $S/ddb/parse_structinfo.pl
rm -f db_structinfo.o
locore.o: ${_machdir}/${_mach}/locore.S assym.h
-mutex.o vector.o copy.o spl.o mptramp.o acpi_wakecode.o: assym.h
+mutex.o vector.o copy.o spl.o mptramp.o acpi_wakecode.o vmm_support.o: assym.h
# The install target can be redefined by putting a
# install-kernel-${MACHINE_NAME} target into /etc/mk.conf
diff --git a/sys/arch/amd64/conf/files.amd64 b/sys/arch/amd64/conf/files.amd64
index 7a082997b2c..cca555c839b 100644
--- a/sys/arch/amd64/conf/files.amd64
+++ b/sys/arch/amd64/conf/files.amd64
@@ -1,4 +1,4 @@
-# $OpenBSD: files.amd64,v 1.82 2015/10/29 07:47:02 kettenis Exp $
+# $OpenBSD: files.amd64,v 1.83 2015/11/13 07:52:20 mlarkin Exp $
maxpartitions 16
maxusers 2 16 128
@@ -227,6 +227,14 @@ file arch/amd64/amd64/acpi_machdep.c acpi
file arch/amd64/amd64/acpi_wakecode.S acpi & !small_kernel
#
+# VMM
+#
+device vmm {}
+attach vmm at mainbus
+file arch/amd64/amd64/vmm.c vmm & !small_kernel needs-flag
+file arch/amd64/amd64/vmm_support.S vmm & !small_kernel
+
+#
# Machine-independent SD/MMC drivers
#
include "dev/sdmmc/files.sdmmc"
diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h
index 678a2b1c8f3..e772200f8d4 100644
--- a/sys/arch/amd64/include/cpu.h
+++ b/sys/arch/amd64/include/cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.h,v 1.97 2015/07/02 01:33:59 dlg Exp $ */
+/* $OpenBSD: cpu.h,v 1.98 2015/11/13 07:52:20 mlarkin Exp $ */
/* $NetBSD: cpu.h,v 1.1 2003/04/26 18:39:39 fvdl Exp $ */
/*-
@@ -54,6 +54,36 @@
#ifdef _KERNEL
+#ifdef VMM
+/* VMXON region (Intel) */
+struct vmxon_region {
+ uint32_t vr_revision;
+};
+
+/*
+ * VMX for Intel CPUs
+ */
+struct vmx {
+ uint64_t vmx_cr0_fixed0; /* presumably the IA32_VMX_CR0_FIXED0 MSR value -- confirm in vmm.c */
+ uint64_t vmx_cr0_fixed1; /* presumably the IA32_VMX_CR0_FIXED1 MSR value -- confirm in vmm.c */
+ uint64_t vmx_cr4_fixed0; /* presumably the IA32_VMX_CR4_FIXED0 MSR value -- confirm in vmm.c */
+ uint64_t vmx_cr4_fixed1; /* presumably the IA32_VMX_CR4_FIXED1 MSR value -- confirm in vmm.c */
+ uint32_t vmx_vmxon_revision; /* NOTE(review): looks like the revision id for struct vmxon_region's vr_revision -- confirm */
+ uint32_t vmx_msr_table_size; /* NOTE(review): likely derived via IA32_VMX_MSR_LIST_SIZE_MASK -- confirm */
+};
+
+/*
+ * SVM for AMD CPUs
+ */
+struct svm {
+};
+
+union vmm_cpu_cap {
+ struct vmx vcc_vmx;
+ struct svm vcc_svm;
+};
+#endif /* VMM */
+
struct x86_64_tss;
struct cpu_info {
struct device *ci_dev;
@@ -91,8 +121,15 @@ struct cpu_info {
u_int32_t ci_feature_flags;
u_int32_t ci_feature_eflags;
- u_int32_t ci_feature_sefflags;
+ u_int32_t ci_feature_sefflags_ebx;
+ u_int32_t ci_feature_sefflags_ecx;
u_int32_t ci_feature_tpmflags;
+ u_int32_t ci_pnfeatset;
+ u_int32_t ci_efeature_eax;
+ u_int32_t ci_efeature_ecx;
+ u_int32_t ci_brand[12];
+ u_int32_t ci_amdcacheinfo[4];
+ u_int32_t ci_extcacheinfo[4];
u_int32_t ci_signature;
u_int32_t ci_family;
u_int32_t ci_model;
@@ -140,6 +177,16 @@ struct cpu_info {
#ifdef GPROF
struct gmonparam *ci_gmon;
#endif
+#ifdef VMM
+ u_int32_t ci_vmm_flags;
+#define CI_VMM_VMX (1 << 0)
+#define CI_VMM_SVM (1 << 1)
+#define CI_VMM_RVI (1 << 2)
+#define CI_VMM_EPT (1 << 3)
+ union vmm_cpu_cap ci_vmm_cap;
+ paddr_t ci_vmxon_region_pa;
+ struct vmxon_region *ci_vmxon_region;
+#endif /* VMM */
};
#define CPUF_BSP 0x0001 /* CPU is the original BSP */
@@ -159,6 +206,7 @@ struct cpu_info {
#define CPUF_PAUSE 0x4000 /* CPU is paused in DDB */
#define CPUF_GO 0x8000 /* CPU should start running */
#define CPUF_PARK 0x10000 /* CPU should self-park in real mode */
+#define CPUF_VMM 0x20000 /* CPU is executing in VMM mode */
#define PROC_PC(p) ((p)->p_md.md_regs->tf_rip)
#define PROC_STACK(p) ((p)->p_md.md_regs->tf_rsp)
@@ -282,6 +330,7 @@ extern int biosbasemem;
extern int biosextmem;
extern int cpu;
extern int cpu_feature;
+extern int cpu_ebxfeature;
extern int cpu_ecxfeature;
extern int cpu_perf_eax;
extern int cpu_perf_ebx;
diff --git a/sys/arch/amd64/include/intrdefs.h b/sys/arch/amd64/include/intrdefs.h
index 650d40ab5da..0d152869613 100644
--- a/sys/arch/amd64/include/intrdefs.h
+++ b/sys/arch/amd64/include/intrdefs.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: intrdefs.h,v 1.13 2015/02/10 05:35:19 mlarkin Exp $ */
+/* $OpenBSD: intrdefs.h,v 1.14 2015/11/13 07:52:20 mlarkin Exp $ */
/* $NetBSD: intrdefs.h,v 1.2 2003/05/04 22:01:56 fvdl Exp $ */
#ifndef _AMD64_INTRDEFS_H
@@ -79,12 +79,15 @@
#define X86_IPI_MTRR 0x00000020
#define X86_IPI_SETPERF 0x00000040
#define X86_IPI_DDB 0x00000080
+#define X86_IPI_START_VMM 0x00000100
+#define X86_IPI_STOP_VMM 0x00000200
-#define X86_NIPI 8
+#define X86_NIPI 10
#define X86_IPI_NAMES { "halt IPI", "nop IPI", "FPU flush IPI", \
"FPU synch IPI", "TLB shootdown IPI", \
- "MTRR update IPI", "setperf IPI", "ddb IPI" }
+ "MTRR update IPI", "setperf IPI", "ddb IPI", \
+ "VMM start IPI", "VMM stop IPI" }
#define IREENT_MAGIC 0x18041969
diff --git a/sys/arch/amd64/include/pmap.h b/sys/arch/amd64/include/pmap.h
index aaa9638984f..2a87c431671 100644
--- a/sys/arch/amd64/include/pmap.h
+++ b/sys/arch/amd64/include/pmap.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.h,v 1.60 2015/11/10 08:57:39 mlarkin Exp $ */
+/* $OpenBSD: pmap.h,v 1.61 2015/11/13 07:52:20 mlarkin Exp $ */
/* $NetBSD: pmap.h,v 1.1 2003/04/26 18:39:46 fvdl Exp $ */
/*
@@ -369,6 +369,7 @@ static void pmap_update_pg(vaddr_t);
static void pmap_update_2pg(vaddr_t,vaddr_t);
void pmap_write_protect(struct pmap *, vaddr_t,
vaddr_t, vm_prot_t);
+int pmap_fix_ept(struct pmap *, vaddr_t, int *);
vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
diff --git a/sys/arch/amd64/include/specialreg.h b/sys/arch/amd64/include/specialreg.h
index 3899a140e06..f6bed264874 100644
--- a/sys/arch/amd64/include/specialreg.h
+++ b/sys/arch/amd64/include/specialreg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: specialreg.h,v 1.37 2015/06/07 08:11:50 guenther Exp $ */
+/* $OpenBSD: specialreg.h,v 1.38 2015/11/13 07:52:20 mlarkin Exp $ */
/* $NetBSD: specialreg.h,v 1.1 2003/04/26 18:39:48 fvdl Exp $ */
/* $NetBSD: x86/specialreg.h,v 1.2 2003/04/25 21:54:30 fvdl Exp $ */
@@ -172,7 +172,6 @@
* "Structured Extended Feature Flags Parameters" (CPUID function 0x7, leaf 0)
* EBX bits
*/
-
#define SEFF0EBX_FSGSBASE 0x00000001 /* {RD,WR}[FG]SBASE instructions */
#define SEFF0EBX_BMI1 0x00000008 /* advanced bit manipulation */
#define SEFF0EBX_HLE 0x00000010 /* Hardware Lock Elision */
@@ -185,6 +184,9 @@
#define SEFF0EBX_RDSEED 0x00040000 /* RDSEED instruction */
#define SEFF0EBX_ADX 0x00080000 /* ADCX/ADOX instructions */
#define SEFF0EBX_SMAP 0x00100000 /* Supervisor mode access prevent */
+/* SEFF ECX bits */
+#define SEFF0ECX_PREFETCHWT1 0x00000001 /* PREFETCHWT1 instruction */
+#define SEFF0ECX_PKU 0x00000008 /* Page prot keys for user mode */
/*
* Thermal and Power Management (CPUID function 0x6) EAX bits
@@ -286,6 +288,7 @@
#define MSR_EBL_CR_POWERON 0x02a
#define MSR_EBC_FREQUENCY_ID 0x02c /* Pentium 4 only */
#define MSR_TEST_CTL 0x033
+#define MSR_IA32_FEATURE_CONTROL 0x03a
#define MSR_BIOS_UPDT_TRIG 0x079
#define MSR_BBL_CR_D0 0x088 /* PII+ only */
#define MSR_BBL_CR_D1 0x089 /* PII+ only */
@@ -403,6 +406,7 @@
#define EFER_LME 0x00000100 /* Long Mode Enabled */
#define EFER_LMA 0x00000400 /* Long Mode Active */
#define EFER_NXE 0x00000800 /* No-Execute Enabled */
+#define EFER_SVME 0x00001000 /* SVM Enabled */
#define MSR_STAR 0xc0000081 /* 32 bit syscall gate addr */
#define MSR_LSTAR 0xc0000082 /* 64 bit syscall gate addr */
@@ -797,3 +801,226 @@
#define C3_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */
#define C3_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */
#define C3_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */
+
+/*
+ * VMX
+ */
+#define IA32_FEATURE_CONTROL_LOCK 0x01
+#define IA32_FEATURE_CONTROL_SMX_EN 0x02
+#define IA32_FEATURE_CONTROL_VMX_EN 0x04
+#define IA32_VMX_BASIC 0x480
+#define IA32_VMX_PINBASED_CTLS 0x481
+#define IA32_VMX_PROCBASED_CTLS 0x482
+#define IA32_VMX_EXIT_CTLS 0x483
+#define IA32_VMX_ENTRY_CTLS 0x484
+#define IA32_VMX_MISC 0x485
+#define IA32_VMX_CR0_FIXED0 0x486
+#define IA32_VMX_CR0_FIXED1 0x487
+#define IA32_VMX_CR4_FIXED0 0x488
+#define IA32_VMX_CR4_FIXED1 0x489
+#define IA32_VMX_PROCBASED2_CTLS 0x48B
+#define IA32_VMX_EPT_VPID_CAP 0x48C
+#define IA32_VMX_TRUE_PINBASED_CTLS 0x48D
+#define IA32_VMX_TRUE_PROCBASED_CTLS 0x48E
+#define IA32_VMX_TRUE_EXIT_CTLS 0x48F
+#define IA32_VMX_TRUE_ENTRY_CTLS 0x490
+
+#define IA32_EPT_VPID_CAP_PAGE_WALK_4 (1ULL << 6)
+#define IA32_EPT_VPID_CAP_WB (1ULL << 14)
+#define IA32_EPT_VPID_CAP_AD_BITS (1ULL << 21)
+
+#define IA32_EPT_PAGING_CACHE_TYPE_UC 0x0
+#define IA32_EPT_PAGING_CACHE_TYPE_WB 0x6
+#define IA32_EPT_AD_BITS_ENABLE (1ULL << 6)
+#define IA32_EPT_PAGE_WALK_LENGTH 0x4
+
+/* VMX : IA32_VMX_BASIC bits */
+#define IA32_VMX_TRUE_CTLS_AVAIL (1ULL << 55)
+
+/* VMX : IA32_VMX_PINBASED_CTLS bits */
+#define IA32_VMX_EXTERNAL_INT_EXITING (1ULL << 0)
+#define IA32_VMX_NMI_EXITING (1ULL << 3)
+#define IA32_VMX_VIRTUAL_NMIS (1ULL << 5)
+#define IA32_VMX_ACTIVATE_VMX_PREEMPTION_TIMER (1ULL << 6)
+#define IA32_VMX_PROCESS_POSTED_INTERRUPTS (1ULL << 7)
+
+/* VMX : IA32_VMX_PROCBASED_CTLS bits */
+#define IA32_VMX_INTERRUPT_WINDOW_EXITING (1ULL << 2)
+#define IA32_VMX_USE_TSC_OFFSETTING (1ULL << 3)
+#define IA32_VMX_HLT_EXITING (1ULL << 7)
+#define IA32_VMX_INVLPG_EXITING (1ULL << 9)
+#define IA32_VMX_MWAIT_EXITING (1ULL << 10)
+#define IA32_VMX_RDPMC_EXITING (1ULL << 11)
+#define IA32_VMX_RDTSC_EXITING (1ULL << 12)
+#define IA32_VMX_CR3_LOAD_EXITING (1ULL << 15)
+#define IA32_VMX_CR3_STORE_EXITING (1ULL << 16)
+#define IA32_VMX_CR8_LOAD_EXITING (1ULL << 19)
+#define IA32_VMX_CR8_STORE_EXITING (1ULL << 20)
+#define IA32_VMX_USE_TPR_SHADOW (1ULL << 21)
+#define IA32_VMX_NMI_WINDOW_EXITING (1ULL << 22)
+#define IA32_VMX_MOV_DR_EXITING (1ULL << 23)
+#define IA32_VMX_UNCONDITIONAL_IO_EXITING (1ULL << 24)
+#define IA32_VMX_USE_IO_BITMAPS (1ULL << 25)
+#define IA32_VMX_MONITOR_TRAP_FLAG (1ULL << 27)
+#define IA32_VMX_USE_MSR_BITMAPS (1ULL << 28)
+#define IA32_VMX_MONITOR_EXITING (1ULL << 29)
+#define IA32_VMX_PAUSE_EXITING (1ULL << 30)
+#define IA32_VMX_ACTIVATE_SECONDARY_CONTROLS (1ULL << 31)
+
+/* VMX : IA32_VMX_PROCBASED2_CTLS bits */
+#define IA32_VMX_VIRTUALIZE_APIC (1ULL << 0)
+#define IA32_VMX_ENABLE_EPT (1ULL << 1)
+#define IA32_VMX_DESCRIPTOR_TABLE_EXITING (1ULL << 2)
+#define IA32_VMX_ENABLE_RDTSCP (1ULL << 3)
+#define IA32_VMX_VIRTUALIZE_X2APIC_MODE (1ULL << 4)
+#define IA32_VMX_ENABLE_VPID (1ULL << 5)
+#define IA32_VMX_WBINVD_EXITING (1ULL << 6)
+#define IA32_VMX_UNRESTRICTED_GUEST (1ULL << 7)
+#define IA32_VMX_APIC_REGISTER_VIRTUALIZATION (1ULL << 8)
+#define IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY (1ULL << 9)
+#define IA32_VMX_PAUSE_LOOP_EXITING (1ULL << 10)
+#define IA32_VMX_RDRAND_EXITING (1ULL << 11)
+#define IA32_VMX_ENABLE_INVPCID (1ULL << 12)
+#define IA32_VMX_ENABLE_VM_FUNCTIONS (1ULL << 13)
+#define IA32_VMX_VMCS_SHADOWING (1ULL << 14)
+#define IA32_VMX_EPT_VIOLATION_VE (1ULL << 18)
+
+/* VMX : IA32_VMX_EXIT_CTLS bits */
+#define IA32_VMX_SAVE_DEBUG_CONTROLS (1ULL << 2)
+#define IA32_VMX_HOST_SPACE_ADDRESS_SIZE (1ULL << 9)
+#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT (1ULL << 12)
+#define IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT (1ULL << 15)
+#define IA32_VMX_SAVE_IA32_PAT_ON_EXIT (1ULL << 18)
+#define IA32_VMX_LOAD_IA32_PAT_ON_EXIT (1ULL << 19)
+#define IA32_VMX_SAVE_IA32_EFER_ON_EXIT (1ULL << 20)
+#define IA32_VMX_LOAD_IA32_EFER_ON_EXIT (1ULL << 21)
+#define IA32_VMX_SAVE_VMX_PREEMPTION_TIMER (1ULL << 22)
+
+/* VMX: IA32_VMX_ENTRY_CTLS bits */
+#define IA32_VMX_LOAD_DEBUG_CONTROLS (1ULL << 2)
+#define IA32_VMX_IA32E_MODE_GUEST (1ULL << 9)
+#define IA32_VMX_ENTRY_TO_SMM (1ULL << 10)
+#define IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT (1ULL << 11)
+#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY (1ULL << 13)
+#define IA32_VMX_LOAD_IA32_PAT_ON_ENTRY (1ULL << 14)
+#define IA32_VMX_LOAD_IA32_EFER_ON_ENTRY (1ULL << 15)
+
+/* VMX : VMCS Fields */
+#define VMCS_GUEST_VPID 0x0000
+#define VMCS_GUEST_IA32_ES_SEL 0x0800
+#define VMCS_GUEST_IA32_CS_SEL 0x0802
+#define VMCS_GUEST_IA32_SS_SEL 0x0804
+#define VMCS_GUEST_IA32_DS_SEL 0x0806
+#define VMCS_GUEST_IA32_FS_SEL 0x0808
+#define VMCS_GUEST_IA32_GS_SEL 0x080A
+#define VMCS_GUEST_IA32_LDTR_SEL 0x080C
+#define VMCS_GUEST_IA32_TR_SEL 0x080E
+#define VMCS_HOST_IA32_ES_SEL 0x0C00
+#define VMCS_HOST_IA32_CS_SEL 0x0C02
+#define VMCS_HOST_IA32_SS_SEL 0x0C04
+#define VMCS_HOST_IA32_DS_SEL 0x0C06
+#define VMCS_HOST_IA32_FS_SEL 0x0C08
+#define VMCS_HOST_IA32_GS_SEL 0x0C0A
+#define VMCS_HOST_IA32_TR_SEL 0x0C0C
+#define VMCS_MSR_BITMAP_ADDRESS 0x2004
+#define VMCS_EXIT_STORE_MSR_ADDRESS 0x2006
+#define VMCS_EXIT_LOAD_MSR_ADDRESS 0x2008
+#define VMCS_ENTRY_LOAD_MSR_ADDRESS 0x200A
+#define VMCS_APIC_ACCESS_ADDRESS 0x2014
+#define VMCS_GUEST_IA32_EPTP 0x201A
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x2400
+#define VMCS_LINK_POINTER 0x2800
+#define VMCS_GUEST_IA32_PAT 0x2804
+#define VMCS_HOST_IA32_PAT 0x2C00
+#define VMCS_HOST_IA32_EFER 0x2C02
+#define VMCS_PINBASED_CTLS 0x4000
+#define VMCS_PROCBASED_CTLS 0x4002
+#define VMCS_EXIT_CTLS 0x400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x4010
+#define VMCS_ENTRY_CTLS 0x4012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x4014
+#define VMCS_ENTRY_INTERRUPTION_INFO 0x4016
+#define VMCS_PROCBASED2_CTLS 0x401E
+#define VMCS_INSTRUCTION_ERROR 0x4400
+#define VMCS_EXIT_REASON 0x4402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x4404
+#define VMCS_INSTRUCTION_LENGTH 0x440C
+#define VMCS_GUEST_IA32_ES_LIMIT 0x4800
+#define VMCS_GUEST_IA32_CS_LIMIT 0x4802
+#define VMCS_GUEST_IA32_SS_LIMIT 0x4804
+#define VMCS_GUEST_IA32_DS_LIMIT 0x4806
+#define VMCS_GUEST_IA32_FS_LIMIT 0x4808
+#define VMCS_GUEST_IA32_GS_LIMIT 0x480A
+#define VMCS_GUEST_IA32_LDTR_LIMIT 0x480C
+#define VMCS_GUEST_IA32_TR_LIMIT 0x480E
+#define VMCS_GUEST_IA32_GDTR_LIMIT 0x4810
+#define VMCS_GUEST_IA32_IDTR_LIMIT 0x4812
+#define VMCS_GUEST_IA32_ES_AR 0x4814
+#define VMCS_GUEST_IA32_CS_AR 0x4816
+#define VMCS_GUEST_IA32_SS_AR 0x4818
+#define VMCS_GUEST_IA32_DS_AR 0x481A
+#define VMCS_GUEST_IA32_FS_AR 0x481C
+#define VMCS_GUEST_IA32_GS_AR 0x481E
+#define VMCS_GUEST_IA32_LDTR_AR 0x4820
+#define VMCS_GUEST_IA32_TR_AR 0x4822
+#define VMCS_GUEST_EXIT_QUALIFICATION 0x6400
+#define VMCS_GUEST_IA32_CR0 0x6800
+#define VMCS_GUEST_IA32_CR3 0x6802
+#define VMCS_GUEST_IA32_CR4 0x6804
+#define VMCS_GUEST_IA32_ES_BASE 0x6806
+#define VMCS_GUEST_IA32_CS_BASE 0x6808
+#define VMCS_GUEST_IA32_SS_BASE 0x680A
+#define VMCS_GUEST_IA32_DS_BASE 0x680C
+#define VMCS_GUEST_IA32_FS_BASE 0x680E
+#define VMCS_GUEST_IA32_GS_BASE 0x6810
+#define VMCS_GUEST_IA32_LDTR_BASE 0x6812
+#define VMCS_GUEST_IA32_TR_BASE 0x6814
+#define VMCS_GUEST_IA32_GDTR_BASE 0x6816
+#define VMCS_GUEST_IA32_IDTR_BASE 0x6818
+#define VMCS_GUEST_IA32_RSP 0x681C
+#define VMCS_GUEST_IA32_RIP 0x681E
+#define VMCS_GUEST_IA32_RFLAGS 0x6820
+#define VMCS_HOST_IA32_CR0 0x6C00
+#define VMCS_HOST_IA32_CR3 0x6C02
+#define VMCS_HOST_IA32_CR4 0x6C04
+#define VMCS_HOST_IA32_FS_BASE 0x6C06
+#define VMCS_HOST_IA32_TR_BASE 0x6C0A
+#define VMCS_HOST_IA32_GDTR_BASE 0x6C0C
+#define VMCS_HOST_IA32_IDTR_BASE 0x6C0E
+#define VMCS_HOST_IA32_RSP 0x6C14
+#define VMCS_HOST_IA32_RIP 0x6C16
+
+#define IA32_VMX_INVVPID_INDIV_ADDR_CTX 0x0
+#define IA32_VMX_INVVPID_SINGLE_CTX 0x1
+#define IA32_VMX_INVVPID_ALL_CTX 0x2
+#define IA32_VMX_INVVPID_SINGLE_CTX_GLB 0x3
+
+#define IA32_VMX_INVEPT_SINGLE_CTX 0x1
+#define IA32_VMX_INVEPT_GLOBAL_CTX 0x2
+
+#define IA32_VMX_EPT_FAULT_READ (1ULL << 0)
+#define IA32_VMX_EPT_FAULT_WRITE (1ULL << 1)
+#define IA32_VMX_EPT_FAULT_EXEC (1ULL << 2)
+
+#define IA32_VMX_MSR_LIST_SIZE_MASK (7ULL << 25) /* IA32_VMX_MISC bits 27:25 (3-bit field) */
+
+/*
+ * SVM
+ */
+#define MSR_AMD_VM_CR 0xc0010114
+#define CPUID_AMD_SVM_CAP 0x8000000A
+#define AMD_SVMDIS 0x10
+#define AMD_SVM_NESTED_PAGING_CAP (1 << 0)
+
+/*
+ * PAT
+ *
+ * PATENTRY(n, type) places memory type 'type' in byte 'n' of a PAT value
+ * (one byte per entry). Both arguments are parenthesized so that expression
+ * arguments expand with the intended precedence.
+ */
+#define PATENTRY(n, type) ((type) << ((n) * 8))
+#define PAT_UC 0x0UL
+#define PAT_WC 0x1UL
+#define PAT_WT 0x4UL
+#define PAT_WP 0x5UL
+#define PAT_WB 0x6UL
+#define PAT_UCMINUS 0x7UL
+
diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
new file mode 100644
index 00000000000..576115dfb3f
--- /dev/null
+++ b/sys/arch/amd64/include/vmmvar.h
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * CPU capabilities for VMM operation
+ */
+#ifndef _MACHINE_VMMVAR_H_
+#define _MACHINE_VMMVAR_H_
+
+#include <sys/rwlock.h>
+
+#define VMM_HV_SIGNATURE "OpenBSDVMM58"
+
+#define VMM_MAX_DISKS_PER_VM 2
+#define VMM_MAX_PATH_DISK 128
+#define VMM_MAX_NAME_LEN 32
+#define VMM_MAX_KERNEL_PATH 128
+#define VMM_MAX_VCPUS_PER_VM 64
+#define VMM_MAX_VM_MEM_SIZE (512 * 1024)
+#define VMM_MAX_NICS_PER_VM 2
+
+#define VMM_PCI_MMIO_BAR_BASE 0xF0000000
+#define VMM_PCI_MMIO_BAR_END 0xF0FFFFFF
+#define VMM_PCI_MMIO_BAR_SIZE 0x00010000
+#define VMM_PCI_IO_BAR_BASE 0x1000
+#define VMM_PCI_IO_BAR_END 0xFFFF
+#define VMM_PCI_IO_BAR_SIZE 0x1000
+
+/* VMX: Basic Exit Reasons */
+#define VMX_EXIT_NMI 0
+#define VMX_EXIT_EXTINT 1
+#define VMX_EXIT_TRIPLE_FAULT 2
+#define VMX_EXIT_INIT 3
+#define VMX_EXIT_SIPI 4
+#define VMX_EXIT_IO_SMI 5
+#define VMX_EXIT_OTHER_SMI 6
+#define VMX_EXIT_INT_WINDOW 7
+#define VMX_EXIT_NMI_WINDOW 8
+#define VMX_EXIT_TASK_SWITCH 9
+#define VMX_EXIT_CPUID 10
+#define VMX_EXIT_GETSEC 11
+#define VMX_EXIT_HLT 12
+#define VMX_EXIT_INVD 13
+#define VMX_EXIT_INVLPG 14
+#define VMX_EXIT_RDPMC 15
+#define VMX_EXIT_RDTSC 16
+#define VMX_EXIT_RSM 17
+#define VMX_EXIT_VMCALL 18
+#define VMX_EXIT_VMCLEAR 19
+#define VMX_EXIT_VMLAUNCH 20
+#define VMX_EXIT_VMPTRLD 21
+#define VMX_EXIT_VMPTRST 22
+#define VMX_EXIT_VMREAD 23
+#define VMX_EXIT_VMRESUME 24
+#define VMX_EXIT_VMWRITE 25
+#define VMX_EXIT_VMXOFF 26
+#define VMX_EXIT_VMXON 27
+#define VMX_EXIT_CR_ACCESS 28
+#define VMX_EXIT_MOV_DR 29
+#define VMX_EXIT_IO 30
+#define VMX_EXIT_RDMSR 31
+#define VMX_EXIT_WRMSR 32
+#define VMX_EXIT_ENTRY_FAILED_GUEST_STATE 33
+#define VMX_EXIT_ENTRY_FAILED_MSR_LOAD 34
+#define VMX_EXIT_MWAIT 36
+#define VMX_EXIT_MTF 37
+#define VMX_EXIT_MONITOR 39
+#define VMX_EXIT_PAUSE 40
+#define VMX_EXIT_ENTRY_FAILED_MCE 41
+#define VMX_EXIT_TPR_BELOW_THRESHOLD 43
+#define VMX_EXIT_APIC_ACCESS 44
+#define VMX_EXIT_VIRTUALIZED_EOI 45
+#define VMX_EXIT_GDTR_IDTR 46
+#define VMX_EXIT_LDTR_TR 47
+#define VMX_EXIT_EPT_VIOLATION 48
+#define VMX_EXIT_EPT_MISCONFIGURATION 49
+#define VMX_EXIT_INVEPT 50
+#define VMX_EXIT_RDTSCP 51
+#define VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED 52
+#define VMX_EXIT_INVVPID 53
+#define VMX_EXIT_WBINVD 54
+#define VMX_EXIT_XSETBV 55
+#define VMX_EXIT_APIC_WRITE 56
+#define VMX_EXIT_RDRAND 57
+#define VMX_EXIT_INVPCID 58
+#define VMX_EXIT_VMFUNC 59
+
+#define VM_EXIT_TERMINATED 0xFFFE
+#define VM_EXIT_NONE 0xFFFF
+
+enum {
+ VCPU_STATE_STOPPED,
+ VCPU_STATE_RUNNING,
+ VCPU_STATE_REQSTOP,
+ VCPU_STATE_UNKNOWN
+};
+
+enum {
+ VEI_DIR_OUT,
+ VEI_DIR_IN
+};
+
+
+/*
+ * vm exit data
+ * vm_exit_inout : describes an IN/OUT exit
+ */
+struct vm_exit_inout {
+ uint8_t vei_size; /* Size of access */
+ uint8_t vei_dir; /* Direction */
+ uint8_t vei_rep; /* REP prefix? */
+ uint8_t vei_string; /* string variety? */
+ uint8_t vei_encoding; /* operand encoding */
+ uint16_t vei_port; /* port */
+ uint32_t vei_data; /* data (for IN insns) */
+};
+
+union vm_exit {
+ struct vm_exit_inout vei; /* IN/OUT exit */
+};
+
+
+struct vm_create_params {
+ /* Input parameters to VMM_IOC_CREATE */
+ size_t vcp_memory_size;
+ size_t vcp_ncpus;
+ size_t vcp_ndisks;
+ size_t vcp_nnics;
+ char vcp_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_PATH_DISK];
+ char vcp_name[VMM_MAX_NAME_LEN];
+ char vcp_kernel[VMM_MAX_KERNEL_PATH];
+ uint8_t vcp_macs[VMM_MAX_NICS_PER_VM][6];
+
+ /* Output parameter from VMM_IOC_CREATE */
+ uint32_t vcp_id;
+};
+
+struct vm_run_params {
+ /* Input parameters to VMM_IOC_RUN */
+ uint32_t vrp_vm_id;
+ uint32_t vrp_vcpu_id;
+ uint8_t vrp_continue; /* Continuing from an exit */
+ int16_t vrp_injint; /* Injected interrupt vector */
+
+ /* Input/output parameter to VMM_IOC_RUN */
+ union vm_exit *vrp_exit; /* updated exit data */
+
+ /* Output parameter from VMM_IOC_RUN */
+ uint16_t vrp_exit_reason; /* exit reason */
+};
+
+struct vm_info_result {
+ /* Output parameters from VMM_IOC_INFO */
+ size_t vir_memory_size;
+ size_t vir_ncpus;
+ uint8_t vir_vcpu_state[VMM_MAX_VCPUS_PER_VM];
+ pid_t vir_creator_pid;
+ uint32_t vir_id;
+ char vir_name[VMM_MAX_NAME_LEN];
+};
+
+struct vm_info_params {
+ /* Input parameters to VMM_IOC_INFO */
+ size_t vip_size; /* Output buffer size */
+
+ /* Output Parameters from VMM_IOC_INFO */
+ size_t vip_info_ct; /* # of entries returned */
+ struct vm_info_result *vip_info; /* Output buffer */
+};
+
+struct vm_terminate_params {
+ /* Input parameters to VMM_IOC_TERM */
+ uint32_t vtp_vm_id;
+};
+
+struct vm_writepage_params {
+ /* Input parameters to VMM_IOC_WRITEPAGE */
+ uint32_t vwp_vm_id; /* VM ID */
+ paddr_t vwp_paddr; /* Phys Addr */
+ char *vwp_data; /* Page Data */
+ uint32_t vwp_len; /* Length */
+};
+
+struct vm_readpage_params {
+ /* Input parameters to VMM_IOC_READPAGE */
+ uint32_t vrp_vm_id; /* VM ID */
+ paddr_t vrp_paddr; /* Phys Addr */
+ uint32_t vrp_len; /* Length */
+
+ /* Output parameters from VMM_IOC_READPAGE */
+ char *vrp_data; /* Page Data */
+};
+
+/* IOCTL definitions */
+#define VMM_IOC_START _IO('V', 1) /* Start virtualization */
+#define VMM_IOC_STOP _IO('V', 2) /* Stop virtualization */
+#define VMM_IOC_CREATE _IOWR('V', 3, struct vm_create_params) /* Create VM */
+#define VMM_IOC_RUN _IOWR('V', 4, struct vm_run_params) /* Run VCPU */
+#define VMM_IOC_INFO _IOWR('V', 5, struct vm_info_params) /* Get VM Info */
+#define VMM_IOC_TERM _IOW('V', 6, struct vm_terminate_params) /* Terminate VM */
+#define VMM_IOC_WRITEPAGE _IOW('V', 7, struct vm_writepage_params) /* Write page */
+#define VMM_IOC_READPAGE _IOW('V', 8, struct vm_readpage_params) /* Read page */
+
+#ifdef _KERNEL
+
+#include <uvm/uvm_extern.h>
+
+#define VMX_FAIL_LAUNCH_UNKNOWN 1
+#define VMX_FAIL_LAUNCH_INVALID_VMCS 2
+#define VMX_FAIL_LAUNCH_VALID_VMCS 3
+
+#ifndef VMM_DEBUG
+#define dprintf(...) /* debug output compiled out; arguments are discarded */
+#else
+#define dprintf(...) do { if (vmm_debug) printf(__VA_ARGS__); } while(0)
+#endif /* VMM_DEBUG */
+
+enum {
+ VMM_MODE_UNKNOWN,
+ VMM_MODE_VMX,
+ VMM_MODE_EPT,
+ VMM_MODE_SVM,
+ VMM_MODE_RVI
+};
+
+enum {
+ VMM_MEM_TYPE_REGULAR,
+ VMM_MEM_TYPE_UNKNOWN
+};
+
+/* Forward declarations */
+struct vm;
+
+/*
+ * Implementation-specific cpu state
+ */
+struct vmcb {
+};
+
+struct vmcs {
+ uint32_t vmcs_revision;
+};
+
+struct vmx_invvpid_descriptor
+{
+ uint64_t vid_vpid; /* NOTE(review): original note ": 16" - presumably only the low 16 bits carry the VPID; confirm */
+ uint64_t vid_addr;
+};
+
+struct vmx_invept_descriptor
+{
+ uint64_t vid_eptp;
+ uint64_t vid_reserved;
+};
+
+struct vmx_msr_store
+{
+ uint64_t vms_index : 32; /* 32-bit bit-field; NOTE(review): the other 32 bits of this 64-bit unit are unnamed - confirm they are zeroed */
+ uint64_t vms_data; /* full 64-bit value paired with vms_index */
+};
+
+/*
+ * Storage for guest registers not preserved in VMCS and various exit
+ * information.
+ *
+ * Note that vmx_enter_guest depends on the layout of this struct for
+ * field access.
+ */
+struct vmx_gueststate
+{
+ /* %rsi should be first */
+ uint64_t vg_rsi; /* 0x00 */
+ uint64_t vg_rax; /* 0x08 */
+ uint64_t vg_rbx; /* 0x10 */
+ uint64_t vg_rcx; /* 0x18 */
+ uint64_t vg_rdx; /* 0x20 */
+ uint64_t vg_rdi; /* 0x28 */
+ uint64_t vg_rbp; /* 0x30 */
+ uint64_t vg_r8; /* 0x38 */
+ uint64_t vg_r9; /* 0x40 */
+ uint64_t vg_r10; /* 0x48 */
+ uint64_t vg_r11; /* 0x50 */
+ uint64_t vg_r12; /* 0x58 */
+ uint64_t vg_r13; /* 0x60 */
+ uint64_t vg_r14; /* 0x68 */
+ uint64_t vg_r15; /* 0x70 */
+ uint64_t vg_cr2; /* 0x78 */
+ uint64_t vg_rip; /* 0x80 */
+ uint32_t vg_exit_reason; /* 0x88 */
+};
+
+/*
+ * Virtual CPU
+ */
+struct vcpu {
+ /* VMCS / VMCB pointer */
+ vaddr_t vc_control_va;
+ uint64_t vc_control_pa;
+
+ /* VLAPIC pointer */
+ vaddr_t vc_vlapic_va;
+ uint64_t vc_vlapic_pa;
+
+ /* MSR bitmap address */
+ vaddr_t vc_msr_bitmap_va;
+ uint64_t vc_msr_bitmap_pa;
+
+ struct vm *vc_parent;
+ uint32_t vc_id;
+ SLIST_ENTRY(vcpu) vc_vcpu_link;
+ vaddr_t vc_hsa_stack_va;
+
+ uint8_t vc_virt_mode;
+ uint8_t vc_state;
+
+ struct cpu_info *vc_last_pcpu;
+ union vm_exit vc_exit;
+
+ /* VMX only */
+ uint64_t vc_vmx_basic;
+ uint64_t vc_vmx_entry_ctls;
+ uint64_t vc_vmx_true_entry_ctls;
+ uint64_t vc_vmx_exit_ctls;
+ uint64_t vc_vmx_true_exit_ctls;
+ uint64_t vc_vmx_pinbased_ctls;
+ uint64_t vc_vmx_true_pinbased_ctls;
+ uint64_t vc_vmx_procbased_ctls;
+ uint64_t vc_vmx_true_procbased_ctls;
+ uint64_t vc_vmx_procbased2_ctls;
+ struct vmx_gueststate vc_gueststate;
+ vaddr_t vc_vmx_msr_exit_save_va;
+ paddr_t vc_vmx_msr_exit_save_pa;
+ vaddr_t vc_vmx_msr_exit_load_va;
+ paddr_t vc_vmx_msr_exit_load_pa;
+ vaddr_t vc_vmx_msr_entry_load_va;
+ paddr_t vc_vmx_msr_entry_load_pa;
+};
+
+SLIST_HEAD(vcpu_head, vcpu);
+
+/*
+ * Virtual Machine
+ */
+struct vm {
+ vm_map_t vm_map;
+ uint32_t vm_id;
+ pid_t vm_creator_pid;
+ uint32_t vm_memory_size;
+ char vm_name[VMM_MAX_NAME_LEN];
+
+ struct vcpu_head vm_vcpu_list;
+ uint32_t vm_vcpu_ct;
+ struct rwlock vm_vcpu_lock;
+
+ SLIST_ENTRY(vm) vm_link;
+};
+
+void vmm_dispatch_intr(vaddr_t);
+int vmxon(uint64_t *);
+int vmxoff(void);
+int vmclear(uint64_t *);
+int vmptrld(uint64_t *);
+int vmptrst(uint64_t *);
+int vmwrite(uint64_t, uint64_t);
+int vmread(uint64_t, uint64_t *);
+void invvpid(uint64_t, struct vmx_invvpid_descriptor *);
+void invept(uint64_t, struct vmx_invept_descriptor *);
+int vmx_enter_guest(uint64_t *, struct vmx_gueststate *, int);
+void start_vmm_on_cpu(struct cpu_info *);
+void stop_vmm_on_cpu(struct cpu_info *);
+
+#endif /* _KERNEL */
+
+#endif /* ! _MACHINE_VMMVAR_H_ */