author     Mike Larkin <mlarkin@cvs.openbsd.org>  2016-10-21 06:21:00 +0000
committer  Mike Larkin <mlarkin@cvs.openbsd.org>  2016-10-21 06:21:00 +0000
commit     cae887b67e44c408266479082c32423bcc360289 (patch)
tree       f56b330a85d48f2e7e9325252035d3c87a847b20 /sys/arch
parent     d66165444be3ed84f18a8d5a2fe5afc088beb166 (diff)
vmm(4) for i386. Userland changes forthcoming.

Note that for the time being, i386 hosts are limited to running only i386
guests, even if the underlying hardware supports amd64. This is a restriction
I hope to lift moving forward, but for now please don't report problems
running amd64 guests on i386 hosts.

This was a straightforward port of the in-tree amd64 code plus the old rotted
tree I had from last year for i386 support. Changes included converting 64-bit
VMREAD/VMWRITE ops to 2x32-bit ops, and fixing treatment of the TSS, which
differs on i386.

ok deraadt@
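The "2x32-bit ops" conversion mentioned above can be sketched as follows. This
is an illustrative example only, not code from the commit: the vmread64()/
vmwrite64() helper names are hypothetical, and the 32-bit vmread()/vmwrite()
prototypes (returning nonzero on failure) are assumed to match the i386 port.
On i386, VMREAD/VMWRITE move 32-bit operands, so each 64-bit VMCS field is
accessed twice, with the upper half going through the field's companion "high"
encoding (field + 1) defined by the Intel SDM.

/* Hypothetical i386 helpers splitting a 64-bit VMCS access into 2x32-bit ops */
static inline int
vmwrite64(uint32_t field, uint64_t val)
{
	/* Low 32 bits via the base field encoding */
	if (vmwrite(field, (uint32_t)val))
		return (EINVAL);
	/* High 32 bits via the companion "high" encoding (field + 1) */
	if (vmwrite(field + 1, (uint32_t)(val >> 32)))
		return (EINVAL);
	return (0);
}

static inline int
vmread64(uint32_t field, uint64_t *val)
{
	uint32_t lo, hi;

	if (vmread(field, &lo) || vmread(field + 1, &hi))
		return (EINVAL);
	*val = ((uint64_t)hi << 32) | lo;
	return (0);
}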
Diffstat (limited to 'sys/arch')
-rw-r--r--  sys/arch/i386/conf/GENERIC          |    3
-rw-r--r--  sys/arch/i386/conf/Makefile.i386    |    4
-rw-r--r--  sys/arch/i386/conf/files.i386       |   10
-rw-r--r--  sys/arch/i386/i386/conf.c           |   14
-rw-r--r--  sys/arch/i386/i386/cpu.c            |   34
-rw-r--r--  sys/arch/i386/i386/ipifuncs.c       |   30
-rw-r--r--  sys/arch/i386/i386/machdep.c        |  114
-rw-r--r--  sys/arch/i386/i386/mainbus.c        |    8
-rw-r--r--  sys/arch/i386/i386/pmap.c           |   24
-rw-r--r--  sys/arch/i386/i386/pmapae.c         |   64
-rw-r--r--  sys/arch/i386/i386/vmm.c            | 5433
-rw-r--r--  sys/arch/i386/i386/vmm_support.S    |  291
-rw-r--r--  sys/arch/i386/include/cpu.h         |   41
-rw-r--r--  sys/arch/i386/include/intrdefs.h    |    9
-rw-r--r--  sys/arch/i386/include/pmap.h        |   12
-rw-r--r--  sys/arch/i386/include/pte.h         |    9
-rw-r--r--  sys/arch/i386/include/specialreg.h  |  367
-rw-r--r--  sys/arch/i386/include/vmmvar.h      |  446
18 files changed, 6887 insertions, 26 deletions
diff --git a/sys/arch/i386/conf/GENERIC b/sys/arch/i386/conf/GENERIC
index 808917af519..9e3d2a265be 100644
--- a/sys/arch/i386/conf/GENERIC
+++ b/sys/arch/i386/conf/GENERIC
@@ -1,4 +1,4 @@
-# $OpenBSD: GENERIC,v 1.823 2016/09/12 08:28:44 mpi Exp $
+# $OpenBSD: GENERIC,v 1.824 2016/10/21 06:20:58 mlarkin Exp $
#
# For further information on compiling OpenBSD kernels, see the config(8)
# man page.
@@ -79,6 +79,7 @@ isa0 at gscpcib?
isa0 at glxpcib?
eisa0 at mainbus0
pci* at mainbus0
+vmm0 at mainbus0
pchb* at pci? # PCI-Host bridges
ppb* at pci? # PCI-PCI bridges
diff --git a/sys/arch/i386/conf/Makefile.i386 b/sys/arch/i386/conf/Makefile.i386
index 18f05560470..13b1b7cf8a2 100644
--- a/sys/arch/i386/conf/Makefile.i386
+++ b/sys/arch/i386/conf/Makefile.i386
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.i386,v 1.97 2016/10/15 13:45:08 deraadt Exp $
+# $OpenBSD: Makefile.i386,v 1.98 2016/10/21 06:20:58 mlarkin Exp $
# For instructions on building kernels consult the config(8) and options(4)
# manual pages.
@@ -149,7 +149,7 @@ db_structinfo.h: $S/ddb/db_structinfo.c $S/ddb/parse_structinfo.pl
rm -f db_structinfo.o
locore.o: ${_machdir}/${_mach}/locore.s assym.h
-in_cksum.o mptramp.o kvm86call.o acpi_wakecode.o: assym.h
+in_cksum.o mptramp.o kvm86call.o acpi_wakecode.o vmm_support.o: assym.h
# The install target can be redefined by putting a
# install-kernel-${MACHINE_NAME} target into /etc/mk.conf
diff --git a/sys/arch/i386/conf/files.i386 b/sys/arch/i386/conf/files.i386
index 7f1ef1eb725..efb759667b0 100644
--- a/sys/arch/i386/conf/files.i386
+++ b/sys/arch/i386/conf/files.i386
@@ -1,4 +1,4 @@
-# $OpenBSD: files.i386,v 1.229 2016/02/28 15:46:18 naddy Exp $
+# $OpenBSD: files.i386,v 1.230 2016/10/21 06:20:58 mlarkin Exp $
#
# new style config file for i386 architecture
#
@@ -389,6 +389,14 @@ file arch/i386/i386/acpi_machdep.c acpi
file arch/i386/i386/acpi_wakecode.S acpi & !small_kernel
#
+# VMM
+#
+device vmm {}
+attach vmm at mainbus
+file arch/i386/i386/vmm.c vmm needs-flag
+file arch/i386/i386/vmm_support.S vmm
+
+#
# IPMI
#
attach ipmi at mainbus
diff --git a/sys/arch/i386/i386/conf.c b/sys/arch/i386/i386/conf.c
index 812d82d8550..1622e6a90eb 100644
--- a/sys/arch/i386/i386/conf.c
+++ b/sys/arch/i386/i386/conf.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: conf.c,v 1.157 2016/09/04 10:51:23 naddy Exp $ */
+/* $OpenBSD: conf.c,v 1.158 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: conf.c,v 1.75 1996/05/03 19:40:20 christos Exp $ */
/*
@@ -105,6 +105,14 @@ int nblkdev = nitems(bdevsw);
(dev_type_stop((*))) enodev, 0, seltrue, \
(dev_type_mmap((*))) enodev, 0 }
+/* open, close, ioctl */
+#define cdev_vmm_init(c,n) { \
+ dev_init(c,n,open), dev_init(c,n,close), \
+ (dev_type_read((*))) enodev, \
+ (dev_type_write((*))) enodev, \
+ dev_init(c,n,ioctl), \
+ (dev_type_stop((*))) enodev, 0, seltrue, \
+ (dev_type_mmap((*))) enodev }
#define mmread mmrw
#define mmwrite mmrw
@@ -178,6 +186,8 @@ cdev_decl(pci);
#include "pvbus.h"
#include "ipmi.h"
#include "switch.h"
+#include "vmm.h"
+cdev_decl(vmm);
struct cdevsw cdevsw[] =
{
@@ -191,7 +201,7 @@ struct cdevsw cdevsw[] =
cdev_log_init(1,log), /* 7: /dev/klog */
cdev_tty_init(NCOM,com), /* 8: serial port */
cdev_disk_init(NFD,fd), /* 9: floppy disk */
- cdev_notdef(), /* 10 */
+ cdev_vmm_init(NVMM,vmm), /* 10: vmm */
cdev_notdef(), /* 11 */
cdev_wsdisplay_init(NWSDISPLAY, /* 12: frame buffers, etc. */
wsdisplay),
diff --git a/sys/arch/i386/i386/cpu.c b/sys/arch/i386/i386/cpu.c
index babc4f56b76..3ce489a5531 100644
--- a/sys/arch/i386/i386/cpu.c
+++ b/sys/arch/i386/i386/cpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.c,v 1.79 2016/07/28 21:57:56 kettenis Exp $ */
+/* $OpenBSD: cpu.c,v 1.80 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: cpu.c,v 1.1.2.7 2000/06/26 02:04:05 sommerfeld Exp $ */
/*-
@@ -66,6 +66,7 @@
#include "lapic.h"
#include "ioapic.h"
+#include "vmm.h"
#include <sys/param.h>
#include <sys/timeout.h>
@@ -113,6 +114,9 @@ int cpu_activate(struct device *, int);
void patinit(struct cpu_info *ci);
void cpu_idle_mwait_cycle(void);
void cpu_init_mwait(struct device *);
+#if NVMM > 0
+void cpu_init_vmm(struct cpu_info *ci);
+#endif /* NVMM > 0 */
u_int cpu_mwait_size, cpu_mwait_states;
@@ -345,6 +349,10 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
ci->ci_dev.dv_xname, pcb, pcb->pcb_esp);
}
#endif
+
+#if NVMM > 0
+ cpu_init_vmm(ci);
+#endif /* NVMM > 0 */
}
/*
@@ -407,6 +415,23 @@ cpu_init(struct cpu_info *ci)
}
void
+cpu_init_vmm(struct cpu_info *ci)
+{
+ /*
+ * Allocate a per-cpu VMXON region
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ ci->ci_vmxon_region_pa = 0;
+ ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE,
+ M_DEVBUF, M_WAITOK|M_ZERO);
+ if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region,
+ (paddr_t *)&ci->ci_vmxon_region_pa))
+ panic("Can't locate VMXON region in phys mem\n");
+ }
+}
+
+
+void
patinit(struct cpu_info *ci)
{
extern int pmap_pg_wc;
@@ -415,13 +440,6 @@ patinit(struct cpu_info *ci)
if ((ci->ci_feature_flags & CPUID_PAT) == 0)
return;
-#define PATENTRY(n, type) ((u_int64_t)type << ((n) * 8))
-#define PAT_UC 0x0UL
-#define PAT_WC 0x1UL
-#define PAT_WT 0x4UL
-#define PAT_WP 0x5UL
-#define PAT_WB 0x6UL
-#define PAT_UCMINUS 0x7UL
/*
* Set up PAT bits.
* The default pat table is the following:
diff --git a/sys/arch/i386/i386/ipifuncs.c b/sys/arch/i386/i386/ipifuncs.c
index b313879b852..e1b820fd77c 100644
--- a/sys/arch/i386/i386/ipifuncs.c
+++ b/sys/arch/i386/i386/ipifuncs.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ipifuncs.c,v 1.27 2015/07/19 18:53:49 sf Exp $ */
+/* $OpenBSD: ipifuncs.c,v 1.28 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: ipifuncs.c,v 1.1.2.3 2000/06/26 02:04:06 sommerfeld Exp $ */
/*-
@@ -37,6 +37,7 @@
*/
#include "npx.h"
+#include "vmm.h"
#include <sys/param.h>
#include <sys/device.h>
@@ -70,6 +71,11 @@ void i386_ipi_reload_mtrr(struct cpu_info *);
#define i386_ipi_reload_mtrr 0
#endif
+#if NVMM > 0
+void i386_ipi_start_vmm(struct cpu_info *);
+void i386_ipi_stop_vmm(struct cpu_info *);
+#endif /* NVMM > 0 */
+
void (*ipifunc[I386_NIPI])(struct cpu_info *) =
{
i386_ipi_halt,
@@ -88,6 +94,13 @@ void (*ipifunc[I386_NIPI])(struct cpu_info *) =
NULL,
#endif
i386_setperf_ipi,
+#if NVMM > 0
+ i386_ipi_start_vmm,
+ i386_ipi_stop_vmm,
+#else
+ NULL,
+ NULL,
+#endif /* NVMM > 0 */
};
void
@@ -208,3 +221,18 @@ i386_ipi_handler(void)
}
}
}
+
+#if NVMM > 0
+void
+i386_ipi_start_vmm(struct cpu_info *ci)
+{
+ start_vmm_on_cpu(ci);
+}
+
+void
+i386_ipi_stop_vmm(struct cpu_info *ci)
+{
+ stop_vmm_on_cpu(ci);
+}
+#endif /* NVMM > 0 */
+
diff --git a/sys/arch/i386/i386/machdep.c b/sys/arch/i386/i386/machdep.c
index d6af51e1d80..d2ca55c98d8 100644
--- a/sys/arch/i386/i386/machdep.c
+++ b/sys/arch/i386/i386/machdep.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: machdep.c,v 1.592 2016/10/14 04:53:26 mlarkin Exp $ */
+/* $OpenBSD: machdep.c,v 1.593 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: machdep.c,v 1.214 1996/11/10 03:16:17 thorpej Exp $ */
/*-
@@ -168,6 +168,7 @@ extern struct proc *npxproc;
#include <machine/hibernate_var.h>
#endif /* HIBERNATE */
+#include "vmm.h"
void replacesmap(void);
int intr_handler(struct intrframe *, struct intrhand *);
@@ -339,6 +340,9 @@ void p3_get_bus_clock(struct cpu_info *);
void p4_update_cpuspeed(void);
void p3_update_cpuspeed(void);
int pentium_cpuspeed(int *);
+#if NVMM > 0
+void cpu_check_vmm_cap(struct cpu_info *);
+#endif /* NVMM > 0 */
static __inline u_char
cyrix_read_reg(u_char reg)
@@ -2077,6 +2081,10 @@ identifycpu(struct cpu_info *ci)
} else
i386_use_fxsave = 0;
+#if NVMM > 0
+ cpu_check_vmm_cap(ci);
+#endif /* NVMM > 0 */
+
}
char *
@@ -3967,3 +3975,107 @@ intr_barrier(void *ih)
{
sched_barrier(NULL);
}
+
+#if NVMM > 0
+/*
+ * cpu_check_vmm_cap
+ *
+ * Checks for VMM capabilities for 'ci'. Initializes certain per-cpu VMM
+ * state in 'ci' if virtualization extensions are found.
+ *
+ * Parameters:
+ * ci: the cpu being checked
+ */
+void
+cpu_check_vmm_cap(struct cpu_info *ci)
+{
+ uint64_t msr;
+ uint32_t cap, dummy;
+
+ /*
+ * Check for workable VMX
+ */
+ if (cpu_ecxfeature & CPUIDECX_VMX) {
+ msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+
+ if (!(msr & IA32_FEATURE_CONTROL_LOCK))
+ ci->ci_vmm_flags |= CI_VMM_VMX;
+ else {
+ if (msr & IA32_FEATURE_CONTROL_VMX_EN)
+ ci->ci_vmm_flags |= CI_VMM_VMX;
+ }
+ }
+
+ /*
+ * Check for EPT (Intel Nested Paging) and other secondary
+ * controls
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ /* Secondary controls available? */
+ /* XXX should we check true procbased ctls here if avail? */
+ msr = rdmsr(IA32_VMX_PROCBASED_CTLS);
+ if (msr & (IA32_VMX_ACTIVATE_SECONDARY_CONTROLS) << 32) {
+ msr = rdmsr(IA32_VMX_PROCBASED2_CTLS);
+ /* EPT available? */
+ if (msr & (IA32_VMX_ENABLE_EPT) << 32)
+ ci->ci_vmm_flags |= CI_VMM_EPT;
+ /* VM Functions available? */
+ if (msr & (IA32_VMX_ENABLE_VM_FUNCTIONS) << 32) {
+ ci->ci_vmm_cap.vcc_vmx.vmx_vm_func =
+ rdmsr(IA32_VMX_VMFUNC);
+ }
+ }
+ }
+
+ /*
+ * Check startup config (VMX)
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ /* CR0 fixed and flexible bits */
+ msr = rdmsr(IA32_VMX_CR0_FIXED0);
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0 = msr;
+ msr = rdmsr(IA32_VMX_CR0_FIXED1);
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1 = msr;
+
+ /* CR4 fixed and flexible bits */
+ msr = rdmsr(IA32_VMX_CR4_FIXED0);
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0 = msr;
+ msr = rdmsr(IA32_VMX_CR4_FIXED1);
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1 = msr;
+
+ /* VMXON region revision ID (bits 30:0 of IA32_VMX_BASIC) */
+ msr = rdmsr(IA32_VMX_BASIC);
+ ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision =
+ (uint32_t)(msr & 0x7FFFFFFF);
+
+ /* MSR save / load table size */
+ msr = rdmsr(IA32_VMX_MISC);
+ ci->ci_vmm_cap.vcc_vmx.vmx_msr_table_size =
+ (uint32_t)(msr & IA32_VMX_MSR_LIST_SIZE_MASK) >> 25;
+
+ /* CR3 target count size */
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr3_tgt_count =
+ (uint32_t)(msr & IA32_VMX_CR3_TGT_SIZE_MASK) >> 16;
+ }
+
+ /*
+ * Check for workable SVM
+ */
+ if (ecpu_ecxfeature & CPUIDECX_SVM) {
+ msr = rdmsr(MSR_AMD_VM_CR);
+
+ if (!(msr & AMD_SVMDIS))
+ ci->ci_vmm_flags |= CI_VMM_SVM;
+ }
+
+ /*
+ * Check for SVM Nested Paging
+ */
+ if (ci->ci_vmm_flags & CI_VMM_SVM) {
+ CPUID(CPUID_AMD_SVM_CAP, dummy, dummy, dummy, cap);
+ if (cap & AMD_SVM_NESTED_PAGING_CAP)
+ ci->ci_vmm_flags |= CI_VMM_RVI;
+ }
+}
+#endif /* NVMM > 0 */
+
diff --git a/sys/arch/i386/i386/mainbus.c b/sys/arch/i386/i386/mainbus.c
index d44a0f1c695..56acb1f57d6 100644
--- a/sys/arch/i386/i386/mainbus.c
+++ b/sys/arch/i386/i386/mainbus.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mainbus.c,v 1.55 2016/07/28 21:57:56 kettenis Exp $ */
+/* $OpenBSD: mainbus.c,v 1.56 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: mainbus.c,v 1.21 1997/06/06 23:14:20 thorpej Exp $ */
/*
@@ -54,6 +54,7 @@
#include "ipmi.h"
#include "esm.h"
#include "amdmsr.h"
+#include "vmm.h"
#include "pvbus.h"
#include <machine/cpuvar.h>
@@ -269,6 +270,11 @@ mainbus_attach(struct device *parent, struct device *self, void *aux)
#endif
config_found(self, &mba.mba_iba, mainbus_print);
}
+
+#if NVMM > 0
+ mba.mba_busname = "vmm";
+ config_found(self, &mba.mba_busname, mainbus_print);
+#endif /* NVMM > 0 */
}
int
diff --git a/sys/arch/i386/i386/pmap.c b/sys/arch/i386/i386/pmap.c
index 81337e8f24b..04248baa30d 100644
--- a/sys/arch/i386/i386/pmap.c
+++ b/sys/arch/i386/i386/pmap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.c,v 1.194 2016/09/17 07:37:57 mlarkin Exp $ */
+/* $OpenBSD: pmap.c,v 1.195 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: pmap.c,v 1.91 2000/06/02 17:46:37 thorpej Exp $ */
/*
@@ -74,6 +74,8 @@
#include <sys/msgbuf.h>
#include <stand/boot/bootarg.h>
+#include "vmm.h"
+
/*
* this file contains the code for the "pmap module." the module's
* job is to manage the hardware's virtual to physical address mappings.
@@ -931,6 +933,11 @@ pmap_bootstrap(vaddr_t kva_start)
kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
atop(kva_start - VM_MIN_KERNEL_ADDRESS);
+ kpm->pm_type = PMAP_TYPE_NORMAL;
+#if NVMM > 0
+ kpm->pm_npt_pml4 = 0;
+ kpm->pm_npt_pdpt = 0;
+#endif /* NVMM > 0 */
/*
* the above is just a rough estimate and not critical to the proper
@@ -1289,6 +1296,12 @@ pmap_create(void)
setsegment(&pmap->pm_codeseg, 0, atop(I386_MAX_EXE_ADDR) - 1,
SDT_MEMERA, SEL_UPL, 1, 1);
+ pmap->pm_type = PMAP_TYPE_NORMAL;
+#if NVMM > 0
+ pmap->pm_npt_pml4 = 0;
+ pmap->pm_npt_pdpt = 0;
+#endif /* NVMM > 0 */
+
pmap_pinit_pd(pmap);
return (pmap);
}
@@ -1356,6 +1369,15 @@ pmap_destroy(struct pmap *pmap)
uvm_km_free(kernel_map, pmap->pm_pdir, pmap->pm_pdirsize);
pmap->pm_pdir = 0;
+#if NVMM > 0
+ if (pmap->pm_npt_pml4)
+ km_free((void *)pmap->pm_npt_pml4, PAGE_SIZE, &kv_any,
+ &kp_zero);
+ if (pmap->pm_npt_pdpt)
+ km_free((void *)pmap->pm_npt_pdpt, PAGE_SIZE, &kv_any,
+ &kp_zero);
+#endif /* NVMM > 0 */
+
pool_put(&pmap_pmap_pool, pmap);
}
diff --git a/sys/arch/i386/i386/pmapae.c b/sys/arch/i386/i386/pmapae.c
index 46b366b0360..e4ffa837c9d 100644
--- a/sys/arch/i386/i386/pmapae.c
+++ b/sys/arch/i386/i386/pmapae.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmapae.c,v 1.51 2016/09/17 07:37:57 mlarkin Exp $ */
+/* $OpenBSD: pmapae.c,v 1.52 2016/10/21 06:20:58 mlarkin Exp $ */
/*
* Copyright (c) 2006-2008 Michael Shalayeff
@@ -1915,3 +1915,65 @@ pmap_flush_page_pae(paddr_t pa)
*pte = 0;
pmap_update_pg(va);
}
+
+int
+pmap_convert(struct pmap *pmap, int mode)
+{
+ int ret;
+ pt_entry_t *pte;
+ paddr_t pml4_pa, pdpt_pa;
+
+ pmap->pm_type = mode;
+
+ ret = 0;
+ if (mode == PMAP_TYPE_EPT) {
+ pmap->pm_npt_pml4 = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
+ &kp_zero, &kd_nowait);
+ if (!pmap->pm_npt_pml4) {
+ ret = ENOMEM;
+ goto error;
+ }
+
+ pmap->pm_npt_pdpt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
+ &kp_zero, &kd_nowait);
+ if (!pmap->pm_npt_pdpt) {
+ ret = ENOMEM;
+ goto error;
+ }
+
+ if (!pmap_extract(pmap_kernel(), pmap->pm_npt_pml4,
+ &pml4_pa)) {
+ ret = ENOMEM;
+ goto error;
+ }
+ pmap->pm_npt_pa = pml4_pa;
+
+ if (!pmap_extract(pmap_kernel(), pmap->pm_npt_pdpt,
+ &pdpt_pa)) {
+ ret = ENOMEM;
+ goto error;
+ }
+
+ pte = (pt_entry_t *)pmap->pm_npt_pml4;
+ pte[0] = (pdpt_pa & PG_FRAME) | EPT_R | EPT_W | EPT_X;
+ pte = (pt_entry_t *)pmap->pm_npt_pdpt;
+ pte[0] = (pmap->pm_pdidx[0] & PG_FRAME) |
+ EPT_R | EPT_W | EPT_X;
+ pte[1] = (pmap->pm_pdidx[1] & PG_FRAME) |
+ EPT_R | EPT_W | EPT_X;
+ pte[2] = (pmap->pm_pdidx[2] & PG_FRAME) |
+ EPT_R | EPT_W | EPT_X;
+ pte[3] = (pmap->pm_pdidx[3] & PG_FRAME) |
+ EPT_R | EPT_W | EPT_X;
+ }
+
+ return (ret);
+
+error:
+ if (pmap->pm_npt_pml4)
+ km_free((void *)pmap->pm_npt_pml4, PAGE_SIZE, &kv_any, &kp_zero);
+ if (pmap->pm_npt_pdpt)
+ km_free((void *)pmap->pm_npt_pdpt, PAGE_SIZE, &kv_any, &kp_zero);
+
+ return (ret);
+}
diff --git a/sys/arch/i386/i386/vmm.c b/sys/arch/i386/i386/vmm.c
new file mode 100644
index 00000000000..cea820e3bf4
--- /dev/null
+++ b/sys/arch/i386/i386/vmm.c
@@ -0,0 +1,5433 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/device.h>
+#include <sys/pool.h>
+#include <sys/proc.h>
+#include <sys/ioctl.h>
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <sys/pledge.h>
+#include <sys/memrange.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <machine/pmap.h>
+#include <machine/biosvar.h>
+#include <machine/segments.h>
+#include <machine/cpufunc.h>
+#include <machine/vmmvar.h>
+#include <machine/i82489reg.h>
+
+#include <dev/isa/isareg.h>
+
+#define VMM_DEBUG
+
+#ifdef VMM_DEBUG
+int vmm_debug = 1;
+#define DPRINTF(x...) do { if (vmm_debug) printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* VMM_DEBUG */
+
+#define DEVNAME(s) ((s)->sc_dev.dv_xname)
+
+#define CTRL_DUMP(x,y,z) printf(" %s: Can set:%s Can clear:%s\n", #z , \
+ vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \
+ IA32_VMX_##z, 1) ? "Yes" : "No", \
+ vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \
+ IA32_VMX_##z, 0) ? "Yes" : "No");
+
+#define VMX_EXIT_INFO_HAVE_RIP 0x1
+#define VMX_EXIT_INFO_HAVE_REASON 0x2
+#define VMX_EXIT_INFO_COMPLETE \
+ (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON)
+
+struct vm {
+ vm_map_t vm_map;
+ uint32_t vm_id;
+ pid_t vm_creator_pid;
+ size_t vm_nmemranges;
+ size_t vm_memory_size;
+ char vm_name[VMM_MAX_NAME_LEN];
+ struct vm_mem_range vm_memranges[VMM_MAX_MEM_RANGES];
+
+ struct vcpu_head vm_vcpu_list;
+ uint32_t vm_vcpu_ct;
+ u_int vm_vcpus_running;
+ struct rwlock vm_vcpu_lock;
+
+ SLIST_ENTRY(vm) vm_link;
+};
+
+SLIST_HEAD(vmlist_head, vm);
+
+struct vmm_softc {
+ struct device sc_dev;
+
+ /* Capabilities */
+ uint32_t nr_vmx_cpus;
+ uint32_t nr_svm_cpus;
+ uint32_t nr_rvi_cpus;
+ uint32_t nr_ept_cpus;
+
+ /* Managed VMs */
+ struct vmlist_head vm_list;
+
+ int mode;
+
+ struct rwlock vm_lock;
+ size_t vm_ct; /* number of in-memory VMs */
+ size_t vm_idx; /* next unique VM index */
+};
+
+int vmm_probe(struct device *, void *, void *);
+void vmm_attach(struct device *, struct device *, void *);
+int vmmopen(dev_t, int, int, struct proc *);
+int vmmioctl(dev_t, u_long, caddr_t, int, struct proc *);
+int vmmclose(dev_t, int, int, struct proc *);
+int vmm_start(void);
+int vmm_stop(void);
+size_t vm_create_check_mem_ranges(struct vm_create_params *);
+int vm_create(struct vm_create_params *, struct proc *);
+int vm_run(struct vm_run_params *);
+int vm_terminate(struct vm_terminate_params *);
+int vm_get_info(struct vm_info_params *);
+int vm_resetcpu(struct vm_resetcpu_params *);
+int vm_intr_pending(struct vm_intr_params *);
+int vm_rwregs(struct vm_rwregs_params *, int);
+int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
+int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
+int vcpu_writeregs_vmx(struct vcpu *, uint64_t, int, struct vcpu_reg_state *);
+int vcpu_writeregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
+int vcpu_reset_regs(struct vcpu *, struct vcpu_reg_state *);
+int vcpu_reset_regs_vmx(struct vcpu *, struct vcpu_reg_state *);
+int vcpu_reset_regs_svm(struct vcpu *, struct vcpu_reg_state *);
+int vcpu_reload_vmcs_vmx(uint64_t *);
+int vcpu_init(struct vcpu *);
+int vcpu_init_vmx(struct vcpu *);
+int vcpu_init_svm(struct vcpu *);
+int vcpu_must_stop(struct vcpu *);
+int vcpu_run_vmx(struct vcpu *, struct vm_run_params *);
+int vcpu_run_svm(struct vcpu *, struct vm_run_params *);
+void vcpu_deinit(struct vcpu *);
+void vcpu_deinit_vmx(struct vcpu *);
+void vcpu_deinit_svm(struct vcpu *);
+int vm_impl_init(struct vm *, struct proc *);
+int vm_impl_init_vmx(struct vm *, struct proc *);
+int vm_impl_init_svm(struct vm *, struct proc *);
+void vm_impl_deinit(struct vm *);
+void vm_impl_deinit_vmx(struct vm *);
+void vm_impl_deinit_svm(struct vm *);
+void vm_teardown(struct vm *);
+int vcpu_vmx_check_cap(struct vcpu *, uint32_t, uint32_t, int);
+int vcpu_vmx_compute_ctrl(uint64_t, uint16_t, uint32_t, uint32_t, uint32_t *);
+int vmx_get_exit_info(uint32_t *, uint32_t *);
+int vmx_handle_exit(struct vcpu *);
+int vmx_handle_cpuid(struct vcpu *);
+int vmx_handle_rdmsr(struct vcpu *);
+int vmx_handle_wrmsr(struct vcpu *);
+int vmx_handle_cr(struct vcpu *);
+int vmx_handle_inout(struct vcpu *);
+int vmx_handle_hlt(struct vcpu *);
+void vmx_handle_intr(struct vcpu *);
+void vmx_handle_intwin(struct vcpu *);
+int vmm_get_guest_memtype(struct vm *, paddr_t);
+int vmm_get_guest_faulttype(void);
+int vmx_get_guest_faulttype(void);
+int svm_get_guest_faulttype(void);
+int vmx_get_exit_qualification(uint32_t *);
+int vmx_fault_page(struct vcpu *, paddr_t);
+int vmx_handle_np_fault(struct vcpu *);
+const char *vcpu_state_decode(u_int);
+const char *vmx_exit_reason_decode(uint32_t);
+const char *vmx_instruction_error_decode(uint32_t);
+void vmx_setmsrbr(struct vcpu *, uint32_t);
+void vmx_setmsrbw(struct vcpu *, uint32_t);
+void vmx_setmsrbrw(struct vcpu *, uint32_t);
+
+#ifdef VMM_DEBUG
+void dump_vcpu(struct vcpu *);
+void vmx_vcpu_dump_regs(struct vcpu *);
+void vmx_dump_vmcs(struct vcpu *);
+const char *msr_name_decode(uint32_t);
+void vmm_segment_desc_decode(uint32_t);
+void vmm_decode_cr0(uint32_t);
+void vmm_decode_cr4(uint32_t);
+void vmm_decode_msr_value(uint64_t, uint64_t);
+void vmm_decode_apicbase_msr_value(uint64_t);
+void vmm_decode_ia32_fc_value(uint64_t);
+void vmm_decode_mtrrcap_value(uint64_t);
+void vmm_decode_perf_status_value(uint64_t);
+void vmm_decode_perf_ctl_value(uint64_t);
+void vmm_decode_mtrrdeftype_value(uint64_t);
+void vmm_decode_efer_value(uint64_t);
+
+extern int mtrr2mrt(int);
+
+struct vmm_reg_debug_info {
+ uint64_t vrdi_bit;
+ const char *vrdi_present;
+ const char *vrdi_absent;
+};
+#endif /* VMM_DEBUG */
+
+const char *vmm_hv_signature = VMM_HV_SIGNATURE;
+
+struct cfdriver vmm_cd = {
+ NULL, "vmm", DV_DULL
+};
+
+const struct cfattach vmm_ca = {
+ sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, NULL
+};
+
+/*
+ * Helper struct to easily get the VMCS field IDs needed in vmread/vmwrite
+ * to access the individual fields of the guest segment registers. This
+ * struct is indexed by VCPU_REGS_* id.
+ */
+const struct {
+ uint64_t selid;
+ uint64_t limitid;
+ uint64_t arid;
+ uint64_t baseid;
+} vmm_vmx_sreg_vmcs_fields[] = {
+ { VMCS_GUEST_IA32_CS_SEL, VMCS_GUEST_IA32_CS_LIMIT,
+ VMCS_GUEST_IA32_CS_AR, VMCS_GUEST_IA32_CS_BASE },
+ { VMCS_GUEST_IA32_DS_SEL, VMCS_GUEST_IA32_DS_LIMIT,
+ VMCS_GUEST_IA32_DS_AR, VMCS_GUEST_IA32_DS_BASE },
+ { VMCS_GUEST_IA32_ES_SEL, VMCS_GUEST_IA32_ES_LIMIT,
+ VMCS_GUEST_IA32_ES_AR, VMCS_GUEST_IA32_ES_BASE },
+ { VMCS_GUEST_IA32_FS_SEL, VMCS_GUEST_IA32_FS_LIMIT,
+ VMCS_GUEST_IA32_FS_AR, VMCS_GUEST_IA32_FS_BASE },
+ { VMCS_GUEST_IA32_GS_SEL, VMCS_GUEST_IA32_GS_LIMIT,
+ VMCS_GUEST_IA32_GS_AR, VMCS_GUEST_IA32_GS_BASE },
+ { VMCS_GUEST_IA32_SS_SEL, VMCS_GUEST_IA32_SS_LIMIT,
+ VMCS_GUEST_IA32_SS_AR, VMCS_GUEST_IA32_SS_BASE },
+ { VMCS_GUEST_IA32_LDTR_SEL, VMCS_GUEST_IA32_LDTR_LIMIT,
+ VMCS_GUEST_IA32_LDTR_AR, VMCS_GUEST_IA32_LDTR_BASE },
+ { VMCS_GUEST_IA32_TR_SEL, VMCS_GUEST_IA32_TR_LIMIT,
+ VMCS_GUEST_IA32_TR_AR, VMCS_GUEST_IA32_TR_BASE }
+};
+
+/* Pools for VMs and VCPUs */
+struct pool vm_pool;
+struct pool vcpu_pool;
+
+struct vmm_softc *vmm_softc;
+
+/* IDT information used when populating host state area */
+extern vaddr_t idt_vaddr;
+extern struct gate_descriptor *idt;
+
+/* CPU info (i386) */
+extern char cpu_brandstr[];
+extern uint32_t ecpu_eaxfeature;
+
+/* Constants used in "CR access exit" */
+#define CR_WRITE 0
+#define CR_READ 1
+#define CR_CLTS 2
+#define CR_LMSW 3
+
+/*
+ * vmm_probe
+ *
+ * Checks if we have at least one CPU with either VMX or SVM.
+ * Returns 1 if we have at least one of either type, but not both, 0 otherwise.
+ */
+int
+vmm_probe(struct device *parent, void *match, void *aux)
+{
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+ const char **busname = (const char **)aux;
+ int found_vmx, found_svm;
+
+ /* Check if this probe is for us */
+ if (strcmp(*busname, vmm_cd.cd_name) != 0)
+ return (0);
+
+ found_vmx = 0;
+ found_svm = 0;
+
+ /* Check if we have at least one CPU with either VMX or SVM */
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci->ci_vmm_flags & CI_VMM_VMX)
+ found_vmx = 1;
+ if (ci->ci_vmm_flags & CI_VMM_SVM)
+ found_svm = 1;
+ }
+
+ /* Don't support both SVM and VMX at the same time */
+ if (found_vmx && found_svm)
+ return (0);
+
+ return (found_vmx || found_svm);
+}
+
+/*
+ * vmm_attach
+ *
+ * Calculates how many of each type of CPU we have, prints this into dmesg
+ * during attach. Initializes various locks, pools, and list structures for the
+ * VMM.
+ */
+void
+vmm_attach(struct device *parent, struct device *self, void *aux)
+{
+ struct vmm_softc *sc = (struct vmm_softc *)self;
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+
+ sc->nr_vmx_cpus = 0;
+ sc->nr_svm_cpus = 0;
+ sc->nr_rvi_cpus = 0;
+ sc->nr_ept_cpus = 0;
+ sc->vm_ct = 0;
+ sc->vm_idx = 0;
+
+ /* Calculate CPU features */
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci->ci_vmm_flags & CI_VMM_VMX)
+ sc->nr_vmx_cpus++;
+ if (ci->ci_vmm_flags & CI_VMM_SVM)
+ sc->nr_svm_cpus++;
+ if (ci->ci_vmm_flags & CI_VMM_RVI)
+ sc->nr_rvi_cpus++;
+ if (ci->ci_vmm_flags & CI_VMM_EPT)
+ sc->nr_ept_cpus++;
+ }
+
+ SLIST_INIT(&sc->vm_list);
+ rw_init(&sc->vm_lock, "vmlistlock");
+
+ if (sc->nr_ept_cpus) {
+ printf(": VMX/EPT\n");
+ sc->mode = VMM_MODE_EPT;
+ } else if (sc->nr_vmx_cpus) {
+ printf(": VMX\n");
+ sc->mode = VMM_MODE_VMX;
+ } else if (sc->nr_rvi_cpus) {
+ printf(": SVM/RVI\n");
+ sc->mode = VMM_MODE_RVI;
+ } else if (sc->nr_svm_cpus) {
+ printf(": SVM\n");
+ sc->mode = VMM_MODE_SVM;
+ } else {
+ printf(": unknown\n");
+ sc->mode = VMM_MODE_UNKNOWN;
+ }
+
+ pool_init(&vm_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK,
+ "vmpool", NULL);
+ pool_init(&vcpu_pool, sizeof(struct vcpu), 0, IPL_NONE, PR_WAITOK,
+ "vcpupl", NULL);
+
+ vmm_softc = sc;
+}
+
+/*
+ * vmmopen
+ *
+ * Called during open of /dev/vmm. Presently unused.
+ */
+int
+vmmopen(dev_t dev, int flag, int mode, struct proc *p)
+{
+ /* Don't allow open if we didn't attach */
+ if (vmm_softc == NULL)
+ return (ENODEV);
+
+ /* Don't allow open if we didn't detect any supported CPUs */
+ /* XXX presently this means EPT until SP and SVM are back */
+ if (vmm_softc->mode != VMM_MODE_EPT)
+ return (ENODEV);
+
+ return 0;
+}
+
+/*
+ * vmmioctl
+ *
+ * Main ioctl dispatch routine for /dev/vmm. Parses ioctl type and calls
+ * appropriate lower level handler routine. Returns result to ioctl caller.
+ */
+int
+vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+ int ret;
+
+ switch (cmd) {
+ case VMM_IOC_CREATE:
+ if ((ret = vmm_start()) != 0) {
+ vmm_stop();
+ break;
+ }
+ ret = vm_create((struct vm_create_params *)data, p);
+ break;
+ case VMM_IOC_RUN:
+ ret = vm_run((struct vm_run_params *)data);
+ break;
+ case VMM_IOC_INFO:
+ ret = vm_get_info((struct vm_info_params *)data);
+ break;
+ case VMM_IOC_TERM:
+ ret = vm_terminate((struct vm_terminate_params *)data);
+ break;
+ case VMM_IOC_RESETCPU:
+ ret = vm_resetcpu((struct vm_resetcpu_params *)data);
+ break;
+ case VMM_IOC_INTR:
+ ret = vm_intr_pending((struct vm_intr_params *)data);
+ break;
+ case VMM_IOC_READREGS:
+ ret = vm_rwregs((struct vm_rwregs_params *)data, 0);
+ break;
+ case VMM_IOC_WRITEREGS:
+ ret = vm_rwregs((struct vm_rwregs_params *)data, 1);
+ break;
+ default:
+ DPRINTF("vmmioctl: unknown ioctl code 0x%lx\n", cmd);
+ ret = ENOTTY;
+ }
+
+ return (ret);
+}
+
+/*
+ * pledge_ioctl_vmm
+ *
+ * Restrict the allowed ioctls in a pledged process context.
+ * Is called from pledge_ioctl().
+ */
+int
+pledge_ioctl_vmm(struct proc *p, long com)
+{
+ switch (com) {
+ case VMM_IOC_CREATE:
+ case VMM_IOC_INFO:
+ /* The "parent" process in vmd forks and manages VMs */
+ if (p->p_p->ps_pledge & PLEDGE_PROC)
+ return (0);
+ break;
+ case VMM_IOC_TERM:
+ /* XXX VM processes should only terminate themselves */
+ case VMM_IOC_RUN:
+ case VMM_IOC_RESETCPU:
+ return (0);
+ }
+
+ return (EPERM);
+}
+
+/*
+ * vmmclose
+ *
+ * Called when /dev/vmm is closed. Presently unused.
+ */
+int
+vmmclose(dev_t dev, int flag, int mode, struct proc *p)
+{
+ return 0;
+}
+
+/*
+ * vm_resetcpu
+ *
+ * Resets the vcpu defined in 'vrp' to power-on-init register state
+ *
+ * Parameters:
+ * vrp: ioctl structure defining the vcpu to reset (see vmmvar.h)
+ *
+ * Returns 0 if successful, or various error codes on failure:
+ * ENOENT if the VM id contained in 'vrp' refers to an unknown VM or
+ * if vrp describes an unknown vcpu for this VM
+ * EBUSY if the indicated VCPU is not stopped
+ * EIO if the indicated VCPU failed to reset
+ */
+int
+vm_resetcpu(struct vm_resetcpu_params *vrp)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+
+ /* Find the desired VM */
+ rw_enter_read(&vmm_softc->vm_lock);
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vrp->vrp_vm_id)
+ break;
+ }
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ /* Not found? exit. */
+ if (vm == NULL) {
+ DPRINTF("vm_resetcpu: vm id %u not found\n",
+ vrp->vrp_vm_id);
+ return (ENOENT);
+ }
+
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ if (vcpu->vc_id == vrp->vrp_vcpu_id)
+ break;
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+
+ if (vcpu == NULL) {
+ DPRINTF("vm_resetcpu: vcpu id %u of vm %u not found\n",
+ vrp->vrp_vcpu_id, vrp->vrp_vm_id);
+ return (ENOENT);
+ }
+
+ if (vcpu->vc_state != VCPU_STATE_STOPPED) {
+ DPRINTF("vm_resetcpu: reset of vcpu %u on vm %u attempted "
+ "while vcpu was in state %u (%s)\n", vrp->vrp_vcpu_id,
+ vrp->vrp_vm_id, vcpu->vc_state,
+ vcpu_state_decode(vcpu->vc_state));
+
+ return (EBUSY);
+ }
+
+ DPRINTF("vm_resetcpu: resetting vm %d vcpu %d to power on defaults\n",
+ vm->vm_id, vcpu->vc_id);
+
+ if (vcpu_reset_regs(vcpu, &vrp->vrp_init_state)) {
+ printf("vm_resetcpu: failed\n");
+#ifdef VMM_DEBUG
+ dump_vcpu(vcpu);
+#endif /* VMM_DEBUG */
+ return (EIO);
+ }
+
+ return (0);
+}
+
+/*
+ * vm_intr_pending
+ *
+ * IOCTL handler routine for VMM_IOC_INTR messages, sent from vmd when an
+ * interrupt is pending and needs acknowledgment
+ *
+ * Parameters:
+ * vip: Describes the vm/vcpu for which the interrupt is pending
+ *
+ * Return values:
+ * 0: if successful
+ * ENOENT: if the VM/VCPU defined by 'vip' cannot be found
+ */
+int
+vm_intr_pending(struct vm_intr_params *vip)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+
+ /* Find the desired VM */
+ rw_enter_read(&vmm_softc->vm_lock);
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vip->vip_vm_id)
+ break;
+ }
+
+ /* Not found? exit. */
+ if (vm == NULL) {
+ rw_exit_read(&vmm_softc->vm_lock);
+ return (ENOENT);
+ }
+
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ if (vcpu->vc_id == vip->vip_vcpu_id)
+ break;
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ if (vcpu == NULL)
+ return (ENOENT);
+
+ vcpu->vc_intr = vip->vip_intr;
+
+#ifdef MULTIPROCESSOR
+ /*
+ * If the vcpu is running on another PCPU, attempt to force it
+ * to exit to process the pending interrupt. This could race as
+ * it could be running when we do the check but be stopped by the
+ * time we send the IPI. In this case, there is a small extra
+ * overhead to process the IPI but no other side effects.
+ *
+ * There is also a chance that the vcpu may have interrupts blocked.
+ * That's ok as that condition will be checked on exit, and we will
+ * simply re-enter the guest. This "fast notification" is done only
+ * as an optimization.
+ */
+ if (vcpu->vc_state == VCPU_STATE_RUNNING &&
+ vip->vip_intr == 1)
+ x86_send_ipi(vcpu->vc_last_pcpu, X86_IPI_NOP);
+#endif /* MULTIPROCESSOR */
+
+ return (0);
+}
+
+/*
+ * vm_rwregs
+ *
+ * IOCTL handler to read/write the current register values of a guest VCPU.
+ * The VCPU must not be running.
+ *
+ * Parameters:
+ * vrwp: Describes the VM and VCPU to get/set the registers from. The
+ * register values are returned here as well.
+ * dir: 0 for reading, 1 for writing
+ *
+ * Return values:
+ * 0: if successful
+ * ENOENT: if the VM/VCPU defined by 'vrwp' cannot be found
+ * EINVAL: if an error occurred reading the registers of the guest
+ */
+int
+vm_rwregs(struct vm_rwregs_params *vrwp, int dir)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ struct vcpu_reg_state *vrs = &vrwp->vrwp_regs;
+
+ /* Find the desired VM */
+ rw_enter_read(&vmm_softc->vm_lock);
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vrwp->vrwp_vm_id)
+ break;
+ }
+
+ /* Not found? exit. */
+ if (vm == NULL) {
+ rw_exit_read(&vmm_softc->vm_lock);
+ return (ENOENT);
+ }
+
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ if (vcpu->vc_id == vrwp->vrwp_vcpu_id)
+ break;
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ if (vcpu == NULL)
+ return (ENOENT);
+
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ return (dir == 0) ?
+ vcpu_readregs_vmx(vcpu, vrwp->vrwp_mask, vrs) :
+ vcpu_writeregs_vmx(vcpu, vrwp->vrwp_mask, 1, vrs);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ return (dir == 0) ?
+ vcpu_readregs_svm(vcpu, vrwp->vrwp_mask, vrs) :
+ vcpu_writeregs_svm(vcpu, vrwp->vrwp_mask, vrs);
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vmm_start
+ *
+ * Starts VMM mode on the system
+ */
+int
+vmm_start(void)
+{
+ struct cpu_info *self = curcpu();
+ int ret = 0;
+#ifdef MULTIPROCESSOR
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+ int i;
+#endif
+
+ /* VMM is already running */
+ if (self->ci_flags & CPUF_VMM)
+ return (0);
+
+#ifdef MULTIPROCESSOR
+ /* Broadcast start VMM IPI */
+ x86_broadcast_ipi(X86_IPI_START_VMM);
+
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci == self)
+ continue;
+ for (i = 100000; (!(ci->ci_flags & CPUF_VMM)) && i>0;i--)
+ delay(10);
+ if (!(ci->ci_flags & CPUF_VMM)) {
+ printf("%s: failed to enter VMM mode\n",
+ ci->ci_dev->dv_xname);
+ ret = EIO;
+ }
+ }
+#endif /* MULTIPROCESSOR */
+
+ /* Start VMM on this CPU */
+ start_vmm_on_cpu(self);
+ if (!(self->ci_flags & CPUF_VMM)) {
+ printf("%s: failed to enter VMM mode\n",
+ self->ci_dev.dv_xname);
+ ret = EIO;
+ }
+
+ return (ret);
+}
+
+/*
+ * vmm_stop
+ *
+ * Stops VMM mode on the system
+ */
+int
+vmm_stop(void)
+{
+ struct cpu_info *self = curcpu();
+ int ret = 0;
+#ifdef MULTIPROCESSOR
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+ int i;
+#endif
+
+ /* VMM is not running */
+ if (!(self->ci_flags & CPUF_VMM))
+ return (0);
+
+#ifdef MULTIPROCESSOR
+ /* Stop VMM on other CPUs */
+ x86_broadcast_ipi(X86_IPI_STOP_VMM);
+
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci == self)
+ continue;
+ for (i = 100000; (ci->ci_flags & CPUF_VMM) && i>0 ;i--)
+ delay(10);
+ if (ci->ci_flags & CPUF_VMM) {
+ printf("%s: failed to exit VMM mode\n",
+ ci->ci_dev->dv_xname);
+ ret = EIO;
+ }
+ }
+#endif /* MULTIPROCESSOR */
+
+ /* Stop VMM on this CPU */
+ stop_vmm_on_cpu(self);
+ if (self->ci_flags & CPUF_VMM) {
+ printf("%s: failed to exit VMM mode\n",
+ self->ci_dev.dv_xname);
+ ret = EIO;
+ }
+
+ return (ret);
+}
+
+/*
+ * start_vmm_on_cpu
+ *
+ * Starts VMM mode on 'ci' by executing the appropriate CPU-specific insn
+ * sequence to enter VMM mode (eg, VMXON)
+ */
+void
+start_vmm_on_cpu(struct cpu_info *ci)
+{
+ uint64_t msr;
+ uint32_t cr4;
+
+ /* No VMM mode? exit. */
+ if ((ci->ci_vmm_flags & CI_VMM_VMX) == 0 &&
+ (ci->ci_vmm_flags & CI_VMM_SVM) == 0)
+ return;
+
+ /*
+ * AMD SVM
+ */
+ if (ci->ci_vmm_flags & CI_VMM_SVM) {
+ msr = rdmsr(MSR_EFER);
+ msr |= EFER_SVME;
+ wrmsr(MSR_EFER, msr);
+ }
+
+ /*
+ * Intel VMX
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ if (ci->ci_vmxon_region == 0)
+ return;
+ else {
+ bzero(ci->ci_vmxon_region, PAGE_SIZE);
+ ci->ci_vmxon_region->vr_revision =
+ ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision;
+
+ /* Set CR4.VMXE */
+ cr4 = rcr4();
+ cr4 |= CR4_VMXE;
+ lcr4(cr4);
+
+ /* Enable VMX */
+ msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if (msr & IA32_FEATURE_CONTROL_LOCK) {
+ if (!(msr & IA32_FEATURE_CONTROL_VMX_EN))
+ return;
+ } else {
+ msr |= IA32_FEATURE_CONTROL_VMX_EN |
+ IA32_FEATURE_CONTROL_LOCK;
+ wrmsr(MSR_IA32_FEATURE_CONTROL, msr);
+ }
+
+ /* Enter VMX mode */
+ if (vmxon(&ci->ci_vmxon_region_pa))
+ return;
+ }
+ }
+
+ ci->ci_flags |= CPUF_VMM;
+}
+
+/*
+ * stop_vmm_on_cpu
+ *
+ * Stops VMM mode on 'ci' by executing the appropriate CPU-specific insn
+ * sequence to exit VMM mode (eg, VMXOFF)
+ */
+void
+stop_vmm_on_cpu(struct cpu_info *ci)
+{
+ uint64_t msr;
+ uint32_t cr4;
+
+ if (!(ci->ci_flags & CPUF_VMM))
+ return;
+
+ /*
+ * AMD SVM
+ */
+ if (ci->ci_vmm_flags & CI_VMM_SVM) {
+ msr = rdmsr(MSR_EFER);
+ msr &= ~EFER_SVME;
+ wrmsr(MSR_EFER, msr);
+ }
+
+ /*
+ * Intel VMX
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ if (vmxoff())
+ panic("VMXOFF failed\n");
+
+ cr4 = rcr4();
+ cr4 &= ~CR4_VMXE;
+ lcr4(cr4);
+ }
+
+ ci->ci_flags &= ~CPUF_VMM;
+}
+
+/*
+ * vm_create_check_mem_ranges:
+ *
+ * Make sure that the guest physical memory ranges given by the user process
+ * do not overlap and are in ascending order.
+ *
+ * The last physical address may not exceed VMM_MAX_VM_MEM_SIZE.
+ *
+ * Return Values:
+ * The total memory size in MB if the checks were successful
+ * 0: One of the memory ranges was invalid, or VMM_MAX_VM_MEM_SIZE was
+ * exceeded
+ */
+size_t
+vm_create_check_mem_ranges(struct vm_create_params *vcp)
+{
+ int disjunct_range;
+ size_t i, memsize = 0;
+ struct vm_mem_range *vmr, *pvmr;
+ const paddr_t maxgpa = (uint32_t)VMM_MAX_VM_MEM_SIZE * 1024 * 1024;
+
+ if (vcp->vcp_nmemranges == 0 ||
+ vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
+ return (0);
+
+ for (i = 0; i < vcp->vcp_nmemranges; i++) {
+ vmr = &vcp->vcp_memranges[i];
+
+ /* Only page-aligned addresses and sizes are permitted */
+ if ((vmr->vmr_gpa & PAGE_MASK) || (vmr->vmr_va & PAGE_MASK) ||
+ (vmr->vmr_size & PAGE_MASK) || vmr->vmr_size == 0)
+ return (0);
+
+ /* Make sure that VMM_MAX_VM_MEM_SIZE is not exceeded */
+ if (vmr->vmr_gpa >= maxgpa ||
+ vmr->vmr_size > maxgpa - vmr->vmr_gpa)
+ return (0);
+
+ /*
+ * Make sure that all virtual addresses are within the address
+ * space of the process and that they do not wrap around.
+ * Calling uvm_share() when creating the VM will take care of
+ * further checks.
+ */
+ if (vmr->vmr_va < VM_MIN_ADDRESS ||
+ vmr->vmr_va >= VM_MAXUSER_ADDRESS ||
+ vmr->vmr_size >= VM_MAXUSER_ADDRESS - vmr->vmr_va)
+ return (0);
+
+ /* Specifying ranges within the PCI MMIO space is forbidden */
+ disjunct_range = (vmr->vmr_gpa > VMM_PCI_MMIO_BAR_END) ||
+ (vmr->vmr_gpa + vmr->vmr_size <= VMM_PCI_MMIO_BAR_BASE);
+ if (!disjunct_range)
+ return (0);
+
+ /*
+		 * Make sure that guest physical memory ranges do not overlap
+ * and that they are ascending.
+ */
+ if (i > 0 && pvmr->vmr_gpa + pvmr->vmr_size > vmr->vmr_gpa)
+ return (0);
+
+ memsize += vmr->vmr_size;
+ pvmr = vmr;
+ }
+
+ if (memsize % (1024 * 1024) != 0)
+ return (0);
+ memsize /= 1024 * 1024;
+ return (memsize);
+}
+
+/*
+ * vm_create
+ *
+ * Creates the in-memory VMM structures for the VM defined by 'vcp'. The
+ * parent of this VM shall be the process defined by 'p'.
+ * This function does not start the VCPU(s) - see vm_start.
+ *
+ * Return Values:
+ * 0: the create operation was successful
+ * ENOMEM: out of memory
+ * various other errors from vcpu_init/vm_impl_init
+ */
+int
+vm_create(struct vm_create_params *vcp, struct proc *p)
+{
+ int i, ret;
+ size_t memsize;
+ struct vm *vm;
+ struct vcpu *vcpu;
+
+ if (!(curcpu()->ci_flags & CPUF_VMM))
+ return (EINVAL);
+
+ memsize = vm_create_check_mem_ranges(vcp);
+ if (memsize == 0)
+ return (EINVAL);
+
+ /* XXX - support UP only (for now) */
+ if (vcp->vcp_ncpus != 1)
+ return (EINVAL);
+
+ vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
+ SLIST_INIT(&vm->vm_vcpu_list);
+ rw_init(&vm->vm_vcpu_lock, "vcpulock");
+
+ vm->vm_creator_pid = p->p_p->ps_pid;
+ vm->vm_nmemranges = vcp->vcp_nmemranges;
+ memcpy(vm->vm_memranges, vcp->vcp_memranges,
+ vm->vm_nmemranges * sizeof(vm->vm_memranges[0]));
+ vm->vm_memory_size = memsize;
+ strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN);
+
+ if (vm_impl_init(vm, p)) {
+ printf("failed to init arch-specific features for vm 0x%p\n",
+ vm);
+ vm_teardown(vm);
+ return (ENOMEM);
+ }
+
+ rw_enter_write(&vmm_softc->vm_lock);
+ vmm_softc->vm_ct++;
+ vmm_softc->vm_idx++;
+
+ /*
+ * XXX we use the vm_id for the VPID/ASID, so we need to prevent
+ * wrapping around 65536/4096 entries here
+ */
+ vm->vm_id = vmm_softc->vm_idx;
+ vm->vm_vcpu_ct = 0;
+ vm->vm_vcpus_running = 0;
+
+ /* Initialize each VCPU defined in 'vcp' */
+ for (i = 0; i < vcp->vcp_ncpus; i++) {
+ vcpu = pool_get(&vcpu_pool, PR_WAITOK | PR_ZERO);
+ vcpu->vc_parent = vm;
+ if ((ret = vcpu_init(vcpu)) != 0) {
+ printf("failed to init vcpu %d for vm 0x%p\n", i, vm);
+ vm_teardown(vm);
+ vmm_softc->vm_ct--;
+ vmm_softc->vm_idx--;
+ rw_exit_write(&vmm_softc->vm_lock);
+ return (ret);
+ }
+ rw_enter_write(&vm->vm_vcpu_lock);
+ vcpu->vc_id = vm->vm_vcpu_ct;
+ vm->vm_vcpu_ct++;
+ SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link);
+ rw_exit_write(&vm->vm_vcpu_lock);
+ }
+
+ /* XXX init various other hardware parts (vlapic, vioapic, etc) */
+
+ SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link);
+ rw_exit_write(&vmm_softc->vm_lock);
+
+ vcp->vcp_id = vm->vm_id;
+
+ return (0);
+}
+
+/*
+ * vm_impl_init_vmx
+ *
+ * Intel VMX specific VM initialization routine
+ */
+int
+vm_impl_init_vmx(struct vm *vm, struct proc *p)
+{
+ int i, ret;
+ vaddr_t mingpa, maxgpa;
+ struct pmap *pmap;
+ struct vm_mem_range *vmr;
+
+ /* If not EPT, nothing to do here */
+ if (vmm_softc->mode != VMM_MODE_EPT)
+ return (0);
+
+ /* Create a new pmap for this VM */
+ pmap = pmap_create();
+ if (!pmap) {
+ printf("vm_impl_init_vmx: pmap_create failed\n");
+ return (ENOMEM);
+ }
+
+ /*
+ * Create a new UVM map for this VM, and assign it the pmap just
+ * created.
+ */
+ vmr = &vm->vm_memranges[0];
+ mingpa = vmr->vmr_gpa;
+ vmr = &vm->vm_memranges[vm->vm_nmemranges - 1];
+ maxgpa = vmr->vmr_gpa + vmr->vmr_size;
+ vm->vm_map = uvm_map_create(pmap, mingpa, maxgpa,
+ VM_MAP_ISVMSPACE | VM_MAP_PAGEABLE);
+
+ if (!vm->vm_map) {
+ printf("vm_impl_init_vmx: uvm_map_create failed\n");
+ pmap_destroy(pmap);
+ return (ENOMEM);
+ }
+
+ /* Map the new map with an anon */
+ DPRINTF("vm_impl_init_vmx: created vm_map @ %p\n", vm->vm_map);
+ for (i = 0; i < vm->vm_nmemranges; i++) {
+ vmr = &vm->vm_memranges[i];
+ ret = uvm_share(vm->vm_map, vmr->vmr_gpa,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ &p->p_vmspace->vm_map, vmr->vmr_va, vmr->vmr_size);
+ if (ret) {
+ printf("vm_impl_init_vmx: uvm_share failed (%d)\n",
+ ret);
+ /* uvm_map_deallocate calls pmap_destroy for us */
+ uvm_map_deallocate(vm->vm_map);
+ vm->vm_map = NULL;
+ return (ENOMEM);
+ }
+ }
+
+ /* Convert the low 512GB of the pmap to EPT */
+ ret = pmap_convert(pmap, PMAP_TYPE_EPT);
+ if (ret) {
+ printf("vm_impl_init_vmx: pmap_convert failed\n");
+ /* uvm_map_deallocate calls pmap_destroy for us */
+ uvm_map_deallocate(vm->vm_map);
+ vm->vm_map = NULL;
+ return (ENOMEM);
+ }
+
+ return (0);
+}
+
+/*
+ * vm_impl_init_svm
+ *
+ * AMD SVM specific VM initialization routine
+ */
+int
+vm_impl_init_svm(struct vm *vm, struct proc *p)
+{
+ /* XXX removed due to rot */
+ return (-1);
+}
+
+/*
+ * vm_impl_init
+ *
+ * Calls the architecture-specific VM init routine
+ */
+int
+vm_impl_init(struct vm *vm, struct proc *p)
+{
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ return vm_impl_init_vmx(vm, p);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ return vm_impl_init_svm(vm, p);
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vm_impl_deinit_vmx
+ *
+ * Intel VMX specific VM deinitialization routine
+ */
+void
+vm_impl_deinit_vmx(struct vm *vm)
+{
+ /* Unused */
+}
+
+/*
+ * vm_impl_deinit_svm
+ *
+ * AMD SVM specific VM deinitialization routine
+ */
+void
+vm_impl_deinit_svm(struct vm *vm)
+{
+ /* Unused */
+}
+
+/*
+ * vm_impl_deinit
+ *
+ * Calls the architecture-specific VM deinit routine
+ */
+void
+vm_impl_deinit(struct vm *vm)
+{
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ vm_impl_deinit_vmx(vm);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ vm_impl_deinit_svm(vm);
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vcpu_reload_vmcs_vmx
+ *
+ * Loads 'vmcs' on the current CPU, possibly flushing any old vmcs state
+ * of the previous occupant.
+ *
+ * Parameters:
+ * vmcs: Pointer to uint64_t containing the PA of the vmcs to load
+ *
+ * Return values:
+ * 0: if successful
+ * EINVAL: an error occurred during flush or reload
+ */
+int
+vcpu_reload_vmcs_vmx(uint64_t *vmcs)
+{
+ uint64_t old;
+
+ /* Flush any old state */
+ if (!vmptrst(&old)) {
+ if (old != 0xFFFFFFFFFFFFFFFFULL) {
+ if (vmclear(&old))
+ return (EINVAL);
+ }
+ } else
+ return (EINVAL);
+
+ /*
+ * Load the VMCS onto this PCPU
+ */
+ if (vmptrld(vmcs))
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * vcpu_readregs_vmx
+ *
+ * Reads 'vcpu's registers
+ *
+ * Parameters:
+ * vcpu: the vcpu to read register values from
+ * regmask: the types of registers to read
+ * vrs: output parameter where register values are stored
+ *
+ * Return values:
+ * 0: if successful
+ * EINVAL: an error reading registers occurred
+ */
+int
+vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask,
+ struct vcpu_reg_state *vrs)
+{
+ int i, ret = 0;
+ uint32_t ar, sel;
+ uint32_t limit;
+ uint32_t *gprs = vrs->vrs_gprs;
+ uint32_t *crs = vrs->vrs_crs;
+ struct vcpu_segment_info *sregs = vrs->vrs_sregs;
+
+ if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa))
+ return (EINVAL);
+
+ if (regmask & VM_RWREGS_GPRS) {
+ gprs[VCPU_REGS_EAX] = vcpu->vc_gueststate.vg_eax;
+ gprs[VCPU_REGS_EBX] = vcpu->vc_gueststate.vg_ebx;
+ gprs[VCPU_REGS_ECX] = vcpu->vc_gueststate.vg_ecx;
+ gprs[VCPU_REGS_EDX] = vcpu->vc_gueststate.vg_edx;
+ gprs[VCPU_REGS_ESI] = vcpu->vc_gueststate.vg_esi;
+ gprs[VCPU_REGS_EDI] = vcpu->vc_gueststate.vg_edi;
+ gprs[VCPU_REGS_EBP] = vcpu->vc_gueststate.vg_ebp;
+ gprs[VCPU_REGS_EIP] = vcpu->vc_gueststate.vg_eip;
+ if (vmread(VMCS_GUEST_IA32_RSP, &gprs[VCPU_REGS_ESP]))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_RFLAGS, &gprs[VCPU_REGS_EFLAGS]))
+ goto errout;
+ }
+ if (regmask & VM_RWREGS_SREGS) {
+ for (i = 0; i < nitems(vmm_vmx_sreg_vmcs_fields); i++) {
+ if (vmread(vmm_vmx_sreg_vmcs_fields[i].selid, &sel))
+ goto errout;
+ if (vmread(vmm_vmx_sreg_vmcs_fields[i].limitid, &limit))
+ goto errout;
+ if (vmread(vmm_vmx_sreg_vmcs_fields[i].arid, &ar))
+ goto errout;
+ if (vmread(vmm_vmx_sreg_vmcs_fields[i].baseid,
+ &sregs[i].vsi_base))
+ goto errout;
+
+ sregs[i].vsi_sel = sel;
+ sregs[i].vsi_limit = limit;
+ sregs[i].vsi_ar = ar;
+ }
+
+ if (vmread(VMCS_GUEST_IA32_GDTR_LIMIT, &limit))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_GDTR_BASE,
+ &vrs->vrs_gdtr.vsi_base))
+ goto errout;
+ vrs->vrs_gdtr.vsi_limit = limit;
+
+ if (vmread(VMCS_GUEST_IA32_IDTR_LIMIT, &limit))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_IDTR_BASE,
+ &vrs->vrs_idtr.vsi_base))
+ goto errout;
+ vrs->vrs_idtr.vsi_limit = limit;
+ }
+ if (regmask & VM_RWREGS_CRS) {
+ crs[VCPU_REGS_CR2] = vcpu->vc_gueststate.vg_cr2;
+ if (vmread(VMCS_GUEST_IA32_CR0, &crs[VCPU_REGS_CR0]))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_CR3, &crs[VCPU_REGS_CR3]))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_CR4, &crs[VCPU_REGS_CR4]))
+ goto errout;
+ }
+
+ goto out;
+
+errout:
+ ret = EINVAL;
+out:
+ if (vmclear(&vcpu->vc_control_pa))
+ ret = EINVAL;
+ return (ret);
+}
+
+/*
+ * vcpu_readregs_svm
+ *
+ * XXX - unimplemented
+ */
+int
+vcpu_readregs_svm(struct vcpu *vcpu, uint64_t regmask,
+ struct vcpu_reg_state *regs)
+{
+ return (0);
+}
+
+/*
+ * vcpu_writeregs_vmx
+ *
+ * Writes 'vcpu's registers
+ *
+ * Parameters:
+ * vcpu: the vcpu that has to get its registers written to
+ * regmask: the types of registers to write
+ * loadvmcs: bit to indicate whether the VMCS has to be loaded first
+ * vrs: the register values to write
+ *
+ * Return values:
+ * 0: if successful
+ * EINVAL: an error writing registers occurred
+ */
+int
+vcpu_writeregs_vmx(struct vcpu *vcpu, uint64_t regmask, int loadvmcs,
+ struct vcpu_reg_state *vrs)
+{
+ int i, ret = 0;
+ uint16_t sel;
+ uint32_t limit, ar;
+ uint32_t *gprs = vrs->vrs_gprs;
+ uint32_t *crs = vrs->vrs_crs;
+ struct vcpu_segment_info *sregs = vrs->vrs_sregs;
+
+ if (loadvmcs) {
+ if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa))
+ return (EINVAL);
+ }
+
+ if (regmask & VM_RWREGS_GPRS) {
+ vcpu->vc_gueststate.vg_eax = gprs[VCPU_REGS_EAX];
+ vcpu->vc_gueststate.vg_ebx = gprs[VCPU_REGS_EBX];
+ vcpu->vc_gueststate.vg_ecx = gprs[VCPU_REGS_ECX];
+ vcpu->vc_gueststate.vg_edx = gprs[VCPU_REGS_EDX];
+ vcpu->vc_gueststate.vg_esi = gprs[VCPU_REGS_ESI];
+ vcpu->vc_gueststate.vg_edi = gprs[VCPU_REGS_EDI];
+ vcpu->vc_gueststate.vg_ebp = gprs[VCPU_REGS_EBP];
+ vcpu->vc_gueststate.vg_eip = gprs[VCPU_REGS_EIP];
+ if (vmwrite(VMCS_GUEST_IA32_RIP, gprs[VCPU_REGS_EIP]))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_RSP, gprs[VCPU_REGS_ESP]))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_RFLAGS, gprs[VCPU_REGS_EFLAGS]))
+ goto errout;
+ }
+ if (regmask & VM_RWREGS_SREGS) {
+ for (i = 0; i < nitems(vmm_vmx_sreg_vmcs_fields); i++) {
+ sel = sregs[i].vsi_sel;
+ limit = sregs[i].vsi_limit;
+ ar = sregs[i].vsi_ar;
+
+ if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].selid, sel))
+ goto errout;
+ if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].limitid, limit))
+ goto errout;
+ if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].arid, ar))
+ goto errout;
+ if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].baseid,
+ sregs[i].vsi_base))
+ goto errout;
+ }
+
+ if (vmwrite(VMCS_GUEST_IA32_GDTR_LIMIT,
+ vrs->vrs_gdtr.vsi_limit))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_GDTR_BASE,
+ vrs->vrs_gdtr.vsi_base))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_IDTR_LIMIT,
+ vrs->vrs_idtr.vsi_limit))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_IDTR_BASE,
+ vrs->vrs_idtr.vsi_base))
+ goto errout;
+ }
+ if (regmask & VM_RWREGS_CRS) {
+ if (vmwrite(VMCS_GUEST_IA32_CR0, crs[VCPU_REGS_CR0]))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_CR3, crs[VCPU_REGS_CR3]))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_CR4, crs[VCPU_REGS_CR4]))
+ goto errout;
+ }
+
+ goto out;
+
+errout:
+ ret = EINVAL;
+out:
+ if (loadvmcs) {
+ if (vmclear(&vcpu->vc_control_pa))
+ ret = EINVAL;
+ }
+ return (ret);
+}
+
+/*
+ * vcpu_writeregs_svm
+ *
+ * XXX - unimplemented
+ */
+int
+vcpu_writeregs_svm(struct vcpu *vcpu, uint64_t regmask,
+ struct vcpu_reg_state *vrs)
+{
+ return (0);
+}
+
+/*
+ * vcpu_reset_regs_svm
+ *
+ * XXX - unimplemented
+ */
+int
+vcpu_reset_regs_svm(struct vcpu *vcpu, struct vcpu_reg_state *vrs)
+{
+ return (0);
+}
+
+/*
+ * vmx_setmsrbr
+ *
+ * Allow read access to the specified msr on the supplied vcpu.
+ *
+ * Parameters:
+ * vcpu: the VCPU to allow access
+ * msr: the MSR number to allow access to
+ */
+void
+vmx_setmsrbr(struct vcpu *vcpu, uint32_t msr)
+{
+ uint8_t *msrs;
+ uint16_t idx;
+
+ msrs = (uint8_t *)vcpu->vc_msr_bitmap_va;
+
+ /*
+ * MSR Read bitmap layout:
+ * "Low" MSRs (0x0 - 0x1fff) @ 0x0
+ * "High" MSRs (0xc0000000 - 0xc0001fff) @ 0x400
+ */
+ if (msr <= 0x1fff) {
+ idx = MSRIDX(msr);
+ msrs[idx] &= ~(MSRBIT(msr));
+ } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
+ idx = MSRIDX(msr - 0xc0000000) + 0x400;
+ msrs[idx] &= ~(MSRBIT(msr - 0xc0000000));
+ } else
+ printf("%s: invalid msr 0x%x\n", __func__, msr);
+}
+
+/*
+ * vmx_setmsrbw
+ *
+ * Allow write access to the specified msr on the supplied vcpu
+ *
+ * Parameters:
+ * vcpu: the VCPU to allow access
+ * msr: the MSR number to allow access to
+ */
+void
+vmx_setmsrbw(struct vcpu *vcpu, uint32_t msr)
+{
+ uint8_t *msrs;
+ uint16_t idx;
+
+ msrs = (uint8_t *)vcpu->vc_msr_bitmap_va;
+
+ /*
+ * MSR Write bitmap layout:
+ * "Low" MSRs (0x0 - 0x1fff) @ 0x800
+ * "High" MSRs (0xc0000000 - 0xc0001fff) @ 0xc00
+ */
+ if (msr <= 0x1fff) {
+ idx = MSRIDX(msr) + 0x800;
+ msrs[idx] &= ~(MSRBIT(msr));
+ } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
+ idx = MSRIDX(msr - 0xc0000000) + 0xc00;
+ msrs[idx] &= ~(MSRBIT(msr - 0xc0000000));
+ } else
+ printf("%s: invalid msr 0x%x\n", __func__, msr);
+}
+
+/*
+ * vmx_setmsrbrw
+ *
+ * Allow read/write access to the specified msr on the supplied vcpu
+ *
+ * Parameters:
+ * vcpu: the VCPU to allow access
+ * msr: the MSR number to allow access to
+ */
+void
+vmx_setmsrbrw(struct vcpu *vcpu, uint32_t msr)
+{
+ vmx_setmsrbr(vcpu, msr);
+ vmx_setmsrbw(vcpu, msr);
+}
+
+/*
+ * vcpu_reset_regs_vmx
+ *
+ * Initializes 'vcpu's registers to supplied state
+ *
+ * Parameters:
+ * vcpu: the vcpu whose register state is to be initialized
+ * vrs: the register state to set
+ *
+ * Return values:
+ * 0: registers init'ed successfully
+ * EINVAL: an error occurred setting register state
+ */
+int
+vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs)
+{
+ int ret, ug;
+ uint32_t cr0, cr4;
+ uint32_t pinbased, procbased, procbased2, exit, entry;
+ uint32_t want1, want0;
+ uint64_t msr, ctrlval, eptp, cr3;
+ uint16_t ctrl;
+ struct vmx_msr_store *msr_store;
+
+ ret = 0;
+ ug = 0;
+
+ if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa))
+ return (EINVAL);
+
+ /* Compute Basic Entry / Exit Controls */
+ vcpu->vc_vmx_basic = rdmsr(IA32_VMX_BASIC);
+ vcpu->vc_vmx_entry_ctls = rdmsr(IA32_VMX_ENTRY_CTLS);
+ vcpu->vc_vmx_exit_ctls = rdmsr(IA32_VMX_EXIT_CTLS);
+ vcpu->vc_vmx_pinbased_ctls = rdmsr(IA32_VMX_PINBASED_CTLS);
+ vcpu->vc_vmx_procbased_ctls = rdmsr(IA32_VMX_PROCBASED_CTLS);
+
+ /* Compute True Entry / Exit Controls (if applicable) */
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ vcpu->vc_vmx_true_entry_ctls = rdmsr(IA32_VMX_TRUE_ENTRY_CTLS);
+ vcpu->vc_vmx_true_exit_ctls = rdmsr(IA32_VMX_TRUE_EXIT_CTLS);
+ vcpu->vc_vmx_true_pinbased_ctls =
+ rdmsr(IA32_VMX_TRUE_PINBASED_CTLS);
+ vcpu->vc_vmx_true_procbased_ctls =
+ rdmsr(IA32_VMX_TRUE_PROCBASED_CTLS);
+ }
+
+ /* Compute Secondary Procbased Controls (if applicable) */
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1))
+ vcpu->vc_vmx_procbased2_ctls = rdmsr(IA32_VMX_PROCBASED2_CTLS);
+
+ /*
+ * Pinbased ctrls
+ *
+ * We must be able to set the following:
+ * IA32_VMX_EXTERNAL_INT_EXITING - exit on host interrupt
+ * IA32_VMX_NMI_EXITING - exit on host NMI
+ */
+ want1 = IA32_VMX_EXTERNAL_INT_EXITING |
+ IA32_VMX_NMI_EXITING;
+ want0 = 0;
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ ctrl = IA32_VMX_TRUE_PINBASED_CTLS;
+ ctrlval = vcpu->vc_vmx_true_pinbased_ctls;
+ } else {
+ ctrl = IA32_VMX_PINBASED_CTLS;
+ ctrlval = vcpu->vc_vmx_pinbased_ctls;
+ }
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &pinbased)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_PINBASED_CTLS, pinbased)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Procbased ctrls
+ *
+ * We must be able to set the following:
+ * IA32_VMX_HLT_EXITING - exit on HLT instruction
+ * IA32_VMX_MWAIT_EXITING - exit on MWAIT instruction
+ * IA32_VMX_UNCONDITIONAL_IO_EXITING - exit on I/O instructions
+ * IA32_VMX_USE_MSR_BITMAPS - exit on various MSR accesses
+ * IA32_VMX_CR8_LOAD_EXITING - guest TPR access
+ * IA32_VMX_CR8_STORE_EXITING - guest TPR access
+ * IA32_VMX_USE_TPR_SHADOW - guest TPR access (shadow)
+ *
+ * If we have EPT, we must be able to clear the following
+ * IA32_VMX_CR3_LOAD_EXITING - don't care about guest CR3 accesses
+ * IA32_VMX_CR3_STORE_EXITING - don't care about guest CR3 accesses
+ */
+ want1 = IA32_VMX_HLT_EXITING |
+ IA32_VMX_MWAIT_EXITING |
+ IA32_VMX_UNCONDITIONAL_IO_EXITING |
+ IA32_VMX_USE_MSR_BITMAPS |
+ IA32_VMX_CR8_LOAD_EXITING |
+ IA32_VMX_CR8_STORE_EXITING |
+ IA32_VMX_USE_TPR_SHADOW;
+ want0 = 0;
+
+ if (vmm_softc->mode == VMM_MODE_EPT) {
+ want1 |= IA32_VMX_ACTIVATE_SECONDARY_CONTROLS;
+ want0 |= IA32_VMX_CR3_LOAD_EXITING |
+ IA32_VMX_CR3_STORE_EXITING;
+ }
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ ctrl = IA32_VMX_TRUE_PROCBASED_CTLS;
+ ctrlval = vcpu->vc_vmx_true_procbased_ctls;
+ } else {
+ ctrl = IA32_VMX_PROCBASED_CTLS;
+ ctrlval = vcpu->vc_vmx_procbased_ctls;
+ }
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &procbased)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Secondary Procbased ctrls
+ *
+ * We want to be able to set the following, if available:
+ * IA32_VMX_ENABLE_VPID - use VPIDs where available
+ *
+ * If we have EPT, we must be able to set the following:
+ * IA32_VMX_ENABLE_EPT - enable EPT
+ *
+ * If we have unrestricted guest capability, we must be able to set
+ * the following:
+ * IA32_VMX_UNRESTRICTED_GUEST - enable unrestricted guest
+ */
+ want1 = 0;
+
+ /* XXX checking for 2ndary controls can be combined here */
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VPID, 1))
+ want1 |= IA32_VMX_ENABLE_VPID;
+ }
+
+ if (vmm_softc->mode == VMM_MODE_EPT)
+ want1 |= IA32_VMX_ENABLE_EPT;
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_UNRESTRICTED_GUEST, 1)) {
+ want1 |= IA32_VMX_UNRESTRICTED_GUEST;
+ ug = 1;
+ }
+ }
+
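+	/*
+	 * Unlike the other control groups above, every secondary control
+	 * we did not explicitly request is asked to be clear.
+	 */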
+ want0 = ~want1;
+ ctrlval = vcpu->vc_vmx_procbased2_ctls;
+ ctrl = IA32_VMX_PROCBASED2_CTLS;
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &procbased2)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_PROCBASED2_CTLS, procbased2)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Exit ctrls
+ *
+ * We must be able to set the following:
+ * IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT - ack interrupt on exit
+ * XXX clear save_debug_ctrls on exit ?
+ */
+ want1 = IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT;
+ want0 = 0;
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ ctrl = IA32_VMX_TRUE_EXIT_CTLS;
+ ctrlval = vcpu->vc_vmx_true_exit_ctls;
+ } else {
+ ctrl = IA32_VMX_EXIT_CTLS;
+ ctrlval = vcpu->vc_vmx_exit_ctls;
+ }
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &exit)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_CTLS, exit)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Entry ctrls
+ *
+ * We must be able to set the following:
+ * IA32_VMX_IA32E_MODE_GUEST (if no unrestricted guest)
+ * We must be able to clear the following:
+ * IA32_VMX_ENTRY_TO_SMM - enter to SMM
+ * IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT
+ * IA32_VMX_LOAD_DEBUG_CONTROLS
+ * IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY
+ */
+ if (ug == 1)
+ want1 = 0;
+ else
+ want1 = IA32_VMX_IA32E_MODE_GUEST;
+
+ want0 = IA32_VMX_ENTRY_TO_SMM |
+ IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT |
+ IA32_VMX_LOAD_DEBUG_CONTROLS |
+ IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY;
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ ctrl = IA32_VMX_TRUE_ENTRY_CTLS;
+ ctrlval = vcpu->vc_vmx_true_entry_ctls;
+ } else {
+ ctrl = IA32_VMX_ENTRY_CTLS;
+ ctrlval = vcpu->vc_vmx_entry_ctls;
+ }
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &entry)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_ENTRY_CTLS, entry)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmm_softc->mode == VMM_MODE_EPT) {
+ eptp = vcpu->vc_parent->vm_map->pmap->pm_npt_pa;
+ msr = rdmsr(IA32_VMX_EPT_VPID_CAP);
+ if (msr & IA32_EPT_VPID_CAP_PAGE_WALK_4) {
+ /* Page walk length 4 supported */
+ eptp |= ((IA32_EPT_PAGE_WALK_LENGTH - 1) << 3);
+ }
+
+ if (msr & IA32_EPT_VPID_CAP_WB) {
+ /* WB cache type supported */
+ eptp |= IA32_EPT_PAGING_CACHE_TYPE_WB;
+ }
+
+ if (msr & IA32_EPT_VPID_CAP_AD_BITS) {
+ /* EPT A/D bits supported */
+ eptp |= IA32_EPT_AD_BITS_ENABLE;
+ }
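+		/*
+		 * The EPTP bits set above follow the layout in the Intel
+		 * SDM: bits 2:0 select the paging cache type, bits 5:3
+		 * hold the page walk length minus one, and bit 6 enables
+		 * the EPT accessed/dirty flags.
+		 */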
+
+ DPRINTF("guest eptp = 0x%llx\n", eptp);
+ DPRINTF("write 0x%x to EPT_LO\n", (uint32_t)(eptp & 0xFFFFFFFFUL));
+ if (vmwrite(VMCS_GUEST_IA32_EPTP, (uint32_t)(eptp & 0xFFFFFFFFUL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_GUEST_IA32_EPTP_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VPID, 1))
+ if (vmwrite(VMCS_GUEST_VPID,
+ (uint16_t)vcpu->vc_parent->vm_id)) {
+ ret = EINVAL;
+ goto exit;
+ }
+ }
+
+ /*
+ * Determine which bits in CR0 have to be set to a fixed
+ * value as per Intel SDM A.7.
+ * CR0 bits in the vrs parameter must match these.
+ */
+
+ want1 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0) &
+ (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1);
+ want0 = ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0) &
+ ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1);
+
+ /*
+ * CR0_FIXED0 and CR0_FIXED1 may report the CR0_PG and CR0_PE bits as
+ * fixed to 1 even if the CPU supports the unrestricted guest
+ * feature. Update want1 and want0 accordingly to allow
+ * any value for CR0_PG and CR0_PE in vrs->vrs_crs[VCPU_REGS_CR0] if
+ * the CPU has the unrestricted guest capability.
+ */
+ cr0 = vrs->vrs_crs[VCPU_REGS_CR0];
+
+ if (ug) {
+ want1 &= ~(CR0_PG | CR0_PE);
+ want0 &= ~(CR0_PG | CR0_PE);
+ cr0 &= ~(CR0_PG | CR0_PE);
+ }
+
+ /*
+ * VMX may require some bits to be set that userland should not have
+ * to care about. Set those here.
+ */
+ if (want1 & CR0_NE)
+ cr0 |= CR0_NE;
+
+ if ((cr0 & want1) != want1) {
+ ret = EINVAL;
+ goto exit;
+ }
+ if ((~cr0 & want0) != want0) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (ug)
+ cr3 = 0;
+ else
+ cr3 = vrs->vrs_crs[VCPU_REGS_CR3];
+
+ /*
+ * Determine default CR4 as per Intel SDM A.8
+ * All flexible bits are set to 0
+ */
+ cr4 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0) &
+ (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1);
+
+ /*
+ * If we are starting in restricted guest mode, enable PAE
+ */
+ if (ug == 0)
+ cr4 |= CR4_PAE;
+
+ vrs->vrs_crs[VCPU_REGS_CR0] = cr0;
+ vrs->vrs_crs[VCPU_REGS_CR3] = cr3;
+ vrs->vrs_crs[VCPU_REGS_CR4] = cr4;
+
+ /*
+ * Select MSRs to be loaded on exit
+ */
+ msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_load_va;
+ msr_store[0].vms_index = MSR_EFER;
+ msr_store[0].vms_data = rdmsr(MSR_EFER);
+
+ /*
+ * Select MSRs to be loaded on entry / saved on exit
+ */
+ msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va;
+
+ msr_store[0].vms_index = MSR_EFER;
+ msr_store[0].vms_data = 0ULL; /* Initial value */
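+	/*
+	 * The exit save area doubles as the entry load area (both
+	 * VMCS_EXIT_STORE_MSR_ADDRESS and VMCS_ENTRY_LOAD_MSR_ADDRESS
+	 * below point at this page), so the guest EFER saved at each exit
+	 * is reloaded on the next entry, starting from 0 here.
+	 */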
+
+ /*
+ * Currently we use the same count for entry/exit MSR loads/stores,
+ * but this is not an architectural requirement.
+ */
+ if (vmwrite(VMCS_EXIT_MSR_STORE_COUNT, VMX_NUM_MSR_STORE)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, VMX_NUM_MSR_STORE)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, VMX_NUM_MSR_STORE)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS,
+ vcpu->vc_vmx_msr_exit_save_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS,
+ vcpu->vc_vmx_msr_exit_load_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS,
+ vcpu->vc_vmx_msr_exit_save_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_MSR_BITMAP_ADDRESS,
+ vcpu->vc_msr_bitmap_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_MSR_BITMAP_ADDRESS_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Set up the VMCS for the register state we want during VCPU start.
+ * This matches what the CPU state would be after a bootloader
+ * transition to 'start'.
+ */
+ ret = vcpu_writeregs_vmx(vcpu, VM_RWREGS_ALL, 0, vrs);
+
+ /*
+ * Set up the MSR bitmap
+ */
+ memset((uint8_t *)vcpu->vc_msr_bitmap_va, 0xFF, PAGE_SIZE);
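+	/*
+	 * A set bit in the bitmap causes the corresponding MSR access to
+	 * exit; starting from all ones, only the MSRs whitelisted below
+	 * get direct guest read/write access.
+	 */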
+ vmx_setmsrbrw(vcpu, MSR_IA32_FEATURE_CONTROL);
+ vmx_setmsrbrw(vcpu, MSR_MTRRcap);
+ vmx_setmsrbrw(vcpu, MSR_SYSENTER_CS);
+ vmx_setmsrbrw(vcpu, MSR_SYSENTER_ESP);
+ vmx_setmsrbrw(vcpu, MSR_SYSENTER_EIP);
+ vmx_setmsrbrw(vcpu, MSR_MTRRvarBase);
+ vmx_setmsrbrw(vcpu, MSR_CR_PAT);
+ vmx_setmsrbrw(vcpu, MSR_MTRRdefType);
+ vmx_setmsrbrw(vcpu, MSR_EFER);
+ vmx_setmsrbrw(vcpu, MSR_STAR);
+ vmx_setmsrbrw(vcpu, MSR_LSTAR);
+ vmx_setmsrbrw(vcpu, MSR_CSTAR);
+ vmx_setmsrbrw(vcpu, MSR_SFMASK);
+ vmx_setmsrbrw(vcpu, MSR_FSBASE);
+ vmx_setmsrbrw(vcpu, MSR_GSBASE);
+ vmx_setmsrbrw(vcpu, MSR_KERNELGSBASE);
+
+ /* XXX CR0 shadow */
+ /* XXX CR4 shadow */
+
+ /* Flush the VMCS */
+ if (vmclear(&vcpu->vc_control_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+exit:
+ return (ret);
+}
+
+/*
+ * vcpu_init_vmx
+ *
+ * Intel VMX specific VCPU initialization routine.
+ *
+ * This function allocates various per-VCPU memory regions, sets up initial
+ * VCPU VMCS controls, and sets initial register values.
+ */
+int
+vcpu_init_vmx(struct vcpu *vcpu)
+{
+ struct vmcs *vmcs;
+ uint32_t cr0, cr4;
+ int ret;
+
+ ret = 0;
+
+ /* Allocate VMCS VA */
+ vcpu->vc_control_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero,
+ &kd_waitok);
+
+ if (!vcpu->vc_control_va)
+ return (ENOMEM);
+
+ /* Compute VMCS PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_control_va,
+ (paddr_t *)&vcpu->vc_control_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Allocate MSR bitmap VA */
+ vcpu->vc_msr_bitmap_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero,
+ &kd_waitok);
+
+ if (!vcpu->vc_msr_bitmap_va) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Compute MSR bitmap PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_msr_bitmap_va,
+ (paddr_t *)&vcpu->vc_msr_bitmap_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Allocate MSR exit load area VA */
+ vcpu->vc_vmx_msr_exit_load_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page,
+ &kp_zero, &kd_waitok);
+
+ if (!vcpu->vc_vmx_msr_exit_load_va) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Compute MSR exit load area PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_exit_load_va,
+ &vcpu->vc_vmx_msr_exit_load_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Allocate MSR exit save area VA */
+ vcpu->vc_vmx_msr_exit_save_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page,
+ &kp_zero, &kd_waitok);
+
+ if (!vcpu->vc_vmx_msr_exit_save_va) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Compute MSR exit save area PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_exit_save_va,
+ &vcpu->vc_vmx_msr_exit_save_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Allocate MSR entry load area VA */
+ vcpu->vc_vmx_msr_entry_load_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page,
+ &kp_zero, &kd_waitok);
+
+ if (!vcpu->vc_vmx_msr_entry_load_va) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Compute MSR entry load area PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_entry_load_va,
+ &vcpu->vc_vmx_msr_entry_load_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ vmcs = (struct vmcs *)vcpu->vc_control_va;
+ vmcs->vmcs_revision = curcpu()->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision;
+
+ /*
+ * Load the VMCS onto this PCPU so we can write registers
+ */
+ if (vmptrld(&vcpu->vc_control_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* Host CR0 */
+ cr0 = rcr0();
+ if (vmwrite(VMCS_HOST_IA32_CR0, cr0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* Host CR4 */
+ cr4 = rcr4();
+ if (vmwrite(VMCS_HOST_IA32_CR4, cr4)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* Host Segment Selectors */
+ if (vmwrite(VMCS_HOST_IA32_CS_SEL, GSEL(GCODE_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_DS_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_ES_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_FS_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_GS_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_SS_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_TR_SEL, proc0.p_md.md_tss_sel)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* Host IDTR base */
+ if (vmwrite(VMCS_HOST_IA32_IDTR_BASE, (uint32_t)idt)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* VMCS link */
+ if (vmwrite(VMCS_LINK_POINTER, 0xFFFFFFFF)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_LINK_POINTER_HI, 0xFFFFFFFF)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+exit:
+ if (ret) {
+ if (vcpu->vc_control_va)
+ km_free((void *)vcpu->vc_control_va, PAGE_SIZE,
+ &kv_page, &kp_zero);
+ if (vcpu->vc_msr_bitmap_va)
+ km_free((void *)vcpu->vc_msr_bitmap_va, PAGE_SIZE,
+ &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_exit_save_va)
+ km_free((void *)vcpu->vc_vmx_msr_exit_save_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_exit_load_va)
+ km_free((void *)vcpu->vc_vmx_msr_exit_load_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_entry_load_va)
+ km_free((void *)vcpu->vc_vmx_msr_entry_load_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ }
+
+ return (ret);
+}
+
+/*
+ * vcpu_reset_regs
+ *
+ * Resets a vcpu's registers to the provided state
+ *
+ * Parameters:
+ * vcpu: the vcpu whose registers shall be reset
+ * vrs: the desired register state
+ *
+ * Return values:
+ * 0: the vcpu's registers were successfully reset
+ * !0: the vcpu's registers could not be reset (see arch-specific reset
+ * function for various values that can be returned here)
+ */
+int
+vcpu_reset_regs(struct vcpu *vcpu, struct vcpu_reg_state *vrs)
+{
+ int ret;
+
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ ret = vcpu_reset_regs_vmx(vcpu, vrs);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ ret = vcpu_reset_regs_svm(vcpu, vrs);
+ else
+ panic("unknown vmm mode\n");
+
+ return (ret);
+}
+
+/*
+ * vcpu_init_svm
+ *
+ * AMD SVM specific VCPU initialization routine.
+ */
+int
+vcpu_init_svm(struct vcpu *vcpu)
+{
+ /* XXX removed due to rot */
+ return (0);
+}
+
+/*
+ * vcpu_init
+ *
+ * Calls the architecture-specific VCPU init routine
+ */
+int
+vcpu_init(struct vcpu *vcpu)
+{
+ int ret = 0;
+
+ vcpu->vc_hsa_stack_va = (vaddr_t)malloc(PAGE_SIZE, M_DEVBUF,
+ M_NOWAIT|M_ZERO);
+ if (!vcpu->vc_hsa_stack_va)
+ return (ENOMEM);
+
+ vcpu->vc_virt_mode = vmm_softc->mode;
+ vcpu->vc_state = VCPU_STATE_STOPPED;
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ ret = vcpu_init_vmx(vcpu);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ ret = vcpu_init_svm(vcpu);
+ else
+ panic("unknown vmm mode\n");
+
+ if (ret)
+ free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE);
+
+ return (ret);
+}
+
+/*
+ * vcpu_deinit_vmx
+ *
+ * Deinitializes the vcpu described by 'vcpu'
+ */
+void
+vcpu_deinit_vmx(struct vcpu *vcpu)
+{
+ if (vcpu->vc_control_va)
+ km_free((void *)vcpu->vc_control_va, PAGE_SIZE,
+ &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_exit_save_va)
+ km_free((void *)vcpu->vc_vmx_msr_exit_save_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_exit_load_va)
+ km_free((void *)vcpu->vc_vmx_msr_exit_load_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_entry_load_va)
+ km_free((void *)vcpu->vc_vmx_msr_entry_load_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_hsa_stack_va)
+ free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE);
+}
+
+/*
+ * vcpu_deinit_svm
+ *
+ * Deinitializes the vcpu described by 'vcpu'
+ */
+void
+vcpu_deinit_svm(struct vcpu *vcpu)
+{
+ /* Unused */
+}
+
+/*
+ * vcpu_deinit
+ *
+ * Calls the architecture-specific VCPU deinit routine
+ */
+void
+vcpu_deinit(struct vcpu *vcpu)
+{
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ vcpu_deinit_vmx(vcpu);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ vcpu_deinit_svm(vcpu);
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vm_teardown
+ *
+ * Tears down (destroys) the vm indicated by 'vm'.
+ */
+void
+vm_teardown(struct vm *vm)
+{
+ struct vcpu *vcpu, *tmp;
+
+ /* Free VCPUs */
+ rw_enter_write(&vm->vm_vcpu_lock);
+ SLIST_FOREACH_SAFE(vcpu, &vm->vm_vcpu_list, vc_vcpu_link, tmp) {
+ SLIST_REMOVE(&vm->vm_vcpu_list, vcpu, vcpu, vc_vcpu_link);
+ vcpu_deinit(vcpu);
+ pool_put(&vcpu_pool, vcpu);
+ }
+
+ vm_impl_deinit(vm);
+
+ /* teardown guest vmspace */
+ if (vm->vm_map != NULL)
+ uvm_map_deallocate(vm->vm_map);
+
+ vmm_softc->vm_ct--;
+ if (vmm_softc->vm_ct < 1)
+ vmm_stop();
+ rw_exit_write(&vm->vm_vcpu_lock);
+ pool_put(&vm_pool, vm);
+}
+
+/*
+ * vcpu_vmx_check_cap
+ *
+ * Checks if the 'cap' bit in the 'msr' MSR can be set or cleared (set = 1
+ * or set = 0, respectively).
+ *
+ * When considering 'msr', we check to see if true controls are available,
+ * and use those if so.
+ *
+ * Returns 1 if 'cap' can be set/cleared as requested, 0 otherwise.
+ */
+int
+vcpu_vmx_check_cap(struct vcpu *vcpu, uint32_t msr, uint32_t cap, int set)
+{
+ uint64_t ctl;
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ switch (msr) {
+ case IA32_VMX_PINBASED_CTLS:
+ ctl = vcpu->vc_vmx_true_pinbased_ctls;
+ break;
+ case IA32_VMX_PROCBASED_CTLS:
+ ctl = vcpu->vc_vmx_true_procbased_ctls;
+ break;
+ case IA32_VMX_PROCBASED2_CTLS:
+ ctl = vcpu->vc_vmx_procbased2_ctls;
+ break;
+ case IA32_VMX_ENTRY_CTLS:
+ ctl = vcpu->vc_vmx_true_entry_ctls;
+ break;
+ case IA32_VMX_EXIT_CTLS:
+ ctl = vcpu->vc_vmx_true_exit_ctls;
+ break;
+ default:
+ return (0);
+ }
+ } else {
+ switch (msr) {
+ case IA32_VMX_PINBASED_CTLS:
+ ctl = vcpu->vc_vmx_pinbased_ctls;
+ break;
+ case IA32_VMX_PROCBASED_CTLS:
+ ctl = vcpu->vc_vmx_procbased_ctls;
+ break;
+ case IA32_VMX_PROCBASED2_CTLS:
+ ctl = vcpu->vc_vmx_procbased2_ctls;
+ break;
+ case IA32_VMX_ENTRY_CTLS:
+ ctl = vcpu->vc_vmx_entry_ctls;
+ break;
+ case IA32_VMX_EXIT_CTLS:
+ ctl = vcpu->vc_vmx_exit_ctls;
+ break;
+ default:
+ return (0);
+ }
+ }
+
+ if (set) {
+ /* Check bit 'cap << 32', must be !0 */
+ return (ctl & ((uint64_t)cap << 32)) != 0;
+ } else {
+ /* Check bit 'cap', must be 0 */
+ return (ctl & cap) == 0;
+ }
+}
+
+/*
+ * vcpu_vmx_compute_ctrl
+ *
+ * Computes the appropriate control value, given the supplied parameters
+ * and CPU capabilities.
+ *
+ * Intel has made somewhat of a mess of this computation - it is described
+ * using no fewer than three different approaches, spread across many
+ * pages of the SDM. Further compounding the problem is the fact that now
+ * we have "true controls" for each type of "control", and each needs to
+ * be examined to get the calculation right, but only if "true" controls
+ * are present on the CPU we're on.
+ *
+ * Parameters:
+ *  ctrlval: the control value, as read from the CPU MSR
+ *  ctrl: which control is being set (e.g. pinbased, procbased, etc)
+ *  want1: the set of desired 1 bits
+ *  want0: the set of desired 0 bits
+ *  out: (out) the correct value to write into the VMCS for this VCPU,
+ *      for the 'ctrl' desired.
+ *
+ * Returns 0 if successful, or EINVAL if the supplied parameters define
+ * an unworkable control setup.
+ */
+int
+vcpu_vmx_compute_ctrl(uint64_t ctrlval, uint16_t ctrl, uint32_t want1,
+ uint32_t want0, uint32_t *out)
+{
+ int i, set, clear;
+
+ /*
+ * The Intel SDM gives three formulae for determining which bits to
+ * set/clear for a given control and desired functionality. Formula
+ * 1 is the simplest but disallows use of newer features that are
+ * enabled by functionality in later CPUs.
+ *
+ * Formulas 2 and 3 allow such extra functionality. We use formula
+ * 2 - this requires us to know the identity of controls in the
+ * "default1" class for each control register, but allows us to not
+ * have to pass along and/or query both sets of capability MSRs for
+ * each control lookup. This makes the code slightly longer,
+ * however.
+ */
+ for (i = 0; i < 32; i++) {
+ /* Figure out if we can set and / or clear this bit */
+ set = (ctrlval & (1ULL << (i + 32))) != 0;
+ clear = ((1ULL << i) & ((uint64_t)ctrlval)) == 0;
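+		/*
+		 * "set" is allowed if the allowed-1 half (bits 63:32) of
+		 * the capability MSR has the bit on; "clear" is allowed if
+		 * the allowed-0 half (bits 31:0) has the bit off.
+		 */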
+
+ /* If the bit can't be set nor cleared, something's wrong */
+ if (!set && !clear)
+ return (EINVAL);
+
+ /*
+ * Formula 2.c.i - "If the relevant VMX capability MSR
+ * reports that a control has a single setting, use that
+ * setting."
+ */
+ if (set && !clear) {
+ if (want0 & (1ULL << i))
+ return (EINVAL);
+ else
+ *out |= (1ULL << i);
+ } else if (clear && !set) {
+ if (want1 & (1ULL << i))
+ return (EINVAL);
+ else
+ *out &= ~(1ULL << i);
+ } else {
+ /*
+ * 2.c.ii - "If the relevant VMX capability MSR
+ * reports that a control can be set to 0 or 1
+ * and that control's meaning is known to the VMM,
+ * set the control based on the functionality desired."
+ */
+ if (want1 & (1ULL << i))
+ *out |= (1ULL << i);
+			else if (want0 & (1ULL << i))
+ *out &= ~(1ULL << i);
+ else {
+ /*
+ * ... assuming the control's meaning is not
+ * known to the VMM ...
+ *
+ * 2.c.iii - "If the relevant VMX capability
+ * MSR reports that a control can be set to 0
+ * or 1 and the control is not in the default1
+ * class, set the control to 0."
+ *
+ * 2.c.iv - "If the relevant VMX capability
+ * MSR reports that a control can be set to 0
+ * or 1 and the control is in the default1
+ * class, set the control to 1."
+ */
+ switch (ctrl) {
+ case IA32_VMX_PINBASED_CTLS:
+ case IA32_VMX_TRUE_PINBASED_CTLS:
+ /*
+ * A.3.1 - default1 class of pinbased
+ * controls comprises bits 1,2,4
+ */
+ switch (i) {
+ case 1:
+ case 2:
+ case 4:
+ *out |= (1ULL << i);
+ break;
+ default:
+ *out &= ~(1ULL << i);
+ break;
+ }
+ break;
+ case IA32_VMX_PROCBASED_CTLS:
+ case IA32_VMX_TRUE_PROCBASED_CTLS:
+ /*
+ * A.3.2 - default1 class of procbased
+ * controls comprises bits 1, 4-6, 8,
+ * 13-16, 26
+ */
+ switch (i) {
+ case 1:
+ case 4 ... 6:
+ case 8:
+ case 13 ... 16:
+ case 26:
+ *out |= (1ULL << i);
+ break;
+ default:
+ *out &= ~(1ULL << i);
+ break;
+ }
+ break;
+ /*
+ * Unknown secondary procbased controls
+ * can always be set to 0
+ */
+ case IA32_VMX_PROCBASED2_CTLS:
+ *out &= ~(1ULL << i);
+ break;
+ case IA32_VMX_EXIT_CTLS:
+ case IA32_VMX_TRUE_EXIT_CTLS:
+ /*
+ * A.4 - default1 class of exit
+ * controls comprises bits 0-8, 10,
+ * 11, 13, 14, 16, 17
+ */
+ switch (i) {
+ case 0 ... 8:
+ case 10 ... 11:
+ case 13 ... 14:
+ case 16 ... 17:
+ *out |= (1ULL << i);
+ break;
+ default:
+ *out &= ~(1ULL << i);
+ break;
+ }
+ break;
+ case IA32_VMX_ENTRY_CTLS:
+ case IA32_VMX_TRUE_ENTRY_CTLS:
+ /*
+ * A.5 - default1 class of entry
+ * controls comprises bits 0-8, 12
+ */
+ switch (i) {
+ case 0 ... 8:
+ case 12:
+ *out |= (1ULL << i);
+ break;
+ default:
+ *out &= ~(1ULL << i);
+ break;
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * vm_get_info
+ *
+ * Returns information about the VM indicated by 'vip'.
+ */
+int
+vm_get_info(struct vm_info_params *vip)
+{
+ struct vm_info_result *out;
+ struct vm *vm;
+ struct vcpu *vcpu;
+ int i, j;
+ size_t need;
+
+ rw_enter_read(&vmm_softc->vm_lock);
+ need = vmm_softc->vm_ct * sizeof(struct vm_info_result);
+ if (vip->vip_size < need) {
+ vip->vip_info_ct = 0;
+ vip->vip_size = need;
+ rw_exit_read(&vmm_softc->vm_lock);
+ return (0);
+ }
+
+ out = malloc(need, M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (out == NULL) {
+ vip->vip_info_ct = 0;
+ rw_exit_read(&vmm_softc->vm_lock);
+ return (ENOMEM);
+ }
+
+ i = 0;
+ vip->vip_info_ct = vmm_softc->vm_ct;
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ out[i].vir_memory_size = vm->vm_memory_size;
+ out[i].vir_used_size =
+ pmap_resident_count(vm->vm_map->pmap) * PAGE_SIZE;
+ out[i].vir_ncpus = vm->vm_vcpu_ct;
+ out[i].vir_id = vm->vm_id;
+ out[i].vir_creator_pid = vm->vm_creator_pid;
+ strncpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN);
+ rw_enter_read(&vm->vm_vcpu_lock);
+ for (j = 0; j < vm->vm_vcpu_ct; j++) {
+ out[i].vir_vcpu_state[j] = VCPU_STATE_UNKNOWN;
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list,
+ vc_vcpu_link) {
+ if (vcpu->vc_id == j)
+ out[i].vir_vcpu_state[j] =
+ vcpu->vc_state;
+ }
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+ i++;
+ }
+ rw_exit_read(&vmm_softc->vm_lock);
+ if (copyout(out, vip->vip_info, need) == EFAULT) {
+ free(out, M_DEVBUF, need);
+ return (EFAULT);
+ }
+
+ free(out, M_DEVBUF, need);
+ return (0);
+}
+
+/*
+ * vm_terminate
+ *
+ * Terminates the VM indicated by 'vtp'.
+ */
+int
+vm_terminate(struct vm_terminate_params *vtp)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ u_int old, next;
+
+ /*
+ * Find desired VM
+ */
+ rw_enter_read(&vmm_softc->vm_lock);
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vtp->vtp_vm_id)
+ break;
+ }
+
+ if (vm != NULL) {
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ do {
+ old = vcpu->vc_state;
+ if (old == VCPU_STATE_RUNNING)
+ next = VCPU_STATE_REQTERM;
+ else if (old == VCPU_STATE_STOPPED)
+ next = VCPU_STATE_TERMINATED;
+ else /* must be REQTERM or TERMINATED */
+ break;
+ } while (old != atomic_cas_uint(&vcpu->vc_state,
+ old, next));
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+ }
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ if (vm == NULL)
+ return (ENOENT);
+
+ /* XXX possible race here two threads terminating the same vm? */
+ rw_enter_write(&vmm_softc->vm_lock);
+ SLIST_REMOVE(&vmm_softc->vm_list, vm, vm, vm_link);
+ rw_exit_write(&vmm_softc->vm_lock);
+ if (vm->vm_vcpus_running == 0)
+ vm_teardown(vm);
+
+ return (0);
+}
+
+/*
+ * vm_run
+ *
+ * Run the vm / vcpu specified by 'vrp'
+ */
+int
+vm_run(struct vm_run_params *vrp)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ int ret = 0;
+ u_int old, next;
+
+ /*
+ * Find desired VM
+ */
+ rw_enter_read(&vmm_softc->vm_lock);
+
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vrp->vrp_vm_id)
+ break;
+ }
+
+ /*
+ * Attempt to locate the requested VCPU. If found, attempt to
+	 * transition from VCPU_STATE_STOPPED -> VCPU_STATE_RUNNING.
+ * Failure to make the transition indicates the VCPU is busy.
+ */
+ if (vm != NULL) {
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ if (vcpu->vc_id == vrp->vrp_vcpu_id)
+ break;
+ }
+
+ if (vcpu != NULL) {
+ old = VCPU_STATE_STOPPED;
+ next = VCPU_STATE_RUNNING;
+
+ if (atomic_cas_uint(&vcpu->vc_state, old, next) != old)
+ ret = EBUSY;
+ else
+ atomic_inc_int(&vm->vm_vcpus_running);
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+
+ if (vcpu == NULL)
+ ret = ENOENT;
+ }
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ if (vm == NULL)
+ ret = ENOENT;
+
+ /* Bail if errors detected in the previous steps */
+ if (ret)
+ return (ret);
+
+ /*
+	 * We may be returning from userland (vmd) after it helped us handle
+	 * the last exit. If so (vrp_continue == 1), copy in the exit data
+	 * from vmd. The exit data will be consumed before the next entry
+	 * (it typically comprises VCPU register changes resulting from
+	 * vmd(8)'s actions).
+ */
+ if (vrp->vrp_continue) {
+ if (copyin(vrp->vrp_exit, &vcpu->vc_exit,
+ sizeof(union vm_exit)) == EFAULT) {
+ return (EFAULT);
+ }
+ }
+
+ /* Run the VCPU specified in vrp */
+ if (vcpu->vc_virt_mode == VMM_MODE_VMX ||
+ vcpu->vc_virt_mode == VMM_MODE_EPT) {
+ ret = vcpu_run_vmx(vcpu, vrp);
+ } else if (vcpu->vc_virt_mode == VMM_MODE_SVM ||
+ vcpu->vc_virt_mode == VMM_MODE_RVI) {
+ ret = vcpu_run_svm(vcpu, vrp);
+ }
+
+ /*
+ * We can set the VCPU states here without CAS because once
+ * a VCPU is in state RUNNING or REQTERM, only the VCPU itself
+ * can switch the state.
+ */
+ atomic_dec_int(&vm->vm_vcpus_running);
+ if (vcpu->vc_state == VCPU_STATE_REQTERM) {
+ vrp->vrp_exit_reason = VM_EXIT_TERMINATED;
+ vcpu->vc_state = VCPU_STATE_TERMINATED;
+ if (vm->vm_vcpus_running == 0)
+ vm_teardown(vm);
+ ret = 0;
+ } else if (ret == EAGAIN) {
+ /* If we are exiting, populate exit data so vmd can help. */
+ vrp->vrp_exit_reason = vcpu->vc_gueststate.vg_exit_reason;
+ vrp->vrp_irqready = vcpu->vc_irqready;
+ vcpu->vc_state = VCPU_STATE_STOPPED;
+
+ if (copyout(&vcpu->vc_exit, vrp->vrp_exit,
+ sizeof(union vm_exit)) == EFAULT) {
+ ret = EFAULT;
+ } else
+ ret = 0;
+ } else if (ret == 0) {
+ vrp->vrp_exit_reason = VM_EXIT_NONE;
+ vcpu->vc_state = VCPU_STATE_STOPPED;
+ } else {
+ vrp->vrp_exit_reason = VM_EXIT_TERMINATED;
+ vcpu->vc_state = VCPU_STATE_TERMINATED;
+ }
+
+ return (ret);
+}
+
+/*
+ * vcpu_must_stop
+ *
+ * Check if we need to (temporarily) stop running the VCPU for some reason,
+ * such as:
+ * - the VM was requested to terminate
+ * - the proc running this VCPU has pending signals
+ */
+int
+vcpu_must_stop(struct vcpu *vcpu)
+{
+ struct proc *p = curproc;
+
+ if (vcpu->vc_state == VCPU_STATE_REQTERM)
+ return (1);
+ if (CURSIG(p) != 0)
+ return (1);
+ return (0);
+}
+
+/*
+ * vcpu_run_vmx
+ *
+ * VMM main loop used to run a VCPU.
+ *
+ * Parameters:
+ * vcpu: The VCPU to run
+ * vrp: run parameters
+ *
+ * Return values:
+ * 0: The run loop exited and no help is needed from vmd
+ * EAGAIN: The run loop exited and help from vmd is needed
+ *  EINVAL: an error occurred
+ */
+int
+vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *vrp)
+{
+ int ret = 0, resume, locked, exitinfo;
+ struct region_descriptor gdt;
+ struct cpu_info *ci;
+ uint64_t cr3, vmcs_ptr;
+ uint32_t insn_error, exit_reason;
+ struct schedstate_percpu *spc;
+ struct vmx_invvpid_descriptor vid;
+ uint32_t eii;
+ uint32_t procbased;
+ uint16_t irq;
+
+ resume = 0;
+ irq = vrp->vrp_irq;
+
+ /*
+ * If we are returning from userspace (vmd) because we exited
+ * last time, fix up any needed vcpu state first. Which state
+ * needs to be fixed up depends on what vmd populated in the
+ * exit data structure.
+ */
+ if (vrp->vrp_continue) {
+ switch (vcpu->vc_gueststate.vg_exit_reason) {
+ case VMX_EXIT_IO:
+ vcpu->vc_gueststate.vg_eax =
+ vcpu->vc_exit.vei.vei_data;
+ break;
+ case VMX_EXIT_HLT:
+ break;
+ case VMX_EXIT_INT_WINDOW:
+ break;
+ case VMX_EXIT_EXTINT:
+ break;
+ case VMX_EXIT_EPT_VIOLATION:
+ break;
+#ifdef VMM_DEBUG
+ case VMX_EXIT_TRIPLE_FAULT:
+ DPRINTF("%s: vm %d vcpu %d triple fault\n",
+ __func__, vcpu->vc_parent->vm_id,
+ vcpu->vc_id);
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+ vmx_dump_vmcs(vcpu);
+ break;
+ case VMX_EXIT_ENTRY_FAILED_GUEST_STATE:
+ DPRINTF("%s: vm %d vcpu %d failed entry "
+ "due to invalid guest state\n",
+ __func__, vcpu->vc_parent->vm_id,
+ vcpu->vc_id);
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+			return (EINVAL);
+ default:
+ DPRINTF("%s: unimplemented exit type %d (%s)\n",
+ __func__,
+ vcpu->vc_gueststate.vg_exit_reason,
+ vmx_exit_reason_decode(
+ vcpu->vc_gueststate.vg_exit_reason));
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+ break;
+#endif /* VMM_DEBUG */
+ }
+ }
+
+ while (ret == 0) {
+ if (!resume) {
+ /*
+ * We are launching for the first time, or we are
+ * resuming from a different pcpu, so we need to
+ * reset certain pcpu-specific values.
+ */
+ ci = curcpu();
+ setregion(&gdt, ci->ci_gdt, NGDT * sizeof(union descriptor) - 1);
+
+ vcpu->vc_last_pcpu = ci;
+
+ if (vmptrld(&vcpu->vc_control_pa)) {
+ ret = EINVAL;
+ break;
+ }
+
+ if (gdt.rd_base == 0) {
+ ret = EINVAL;
+ break;
+ }
+
+ /* Host GDTR base */
+ if (vmwrite(VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base)) {
+ ret = EINVAL;
+ break;
+ }
+
+ /* Host TR base */
+ if (vmwrite(VMCS_HOST_IA32_TR_BASE,
+ proc0.p_md.md_tss_sel)) {
+ ret = EINVAL;
+ break;
+ }
+
+ /* Host CR3 */
+ cr3 = rcr3();
+ if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
+ ret = EINVAL;
+ break;
+ }
+ }
+
+ /* Handle vmd(8) injected interrupts */
+ /* XXX - 0x20 should be changed to PIC's vector base */
+
+		/* Is there a pending interrupt to inject? */
+ if (irq != 0xFFFF) {
+ if (!vcpu->vc_irqready) {
+ printf("vcpu_run_vmx: error - irq injected"
+ " while not ready\n");
+ ret = EINVAL;
+ break;
+ }
+
+ eii = (irq & 0xFF) + 0x20;
+ eii |= (1ULL << 31); /* Valid */
+ eii |= (0ULL << 8); /* Hardware Interrupt */
+ if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) {
+ printf("vcpu_run_vmx: can't vector "
+ "interrupt to guest\n");
+ ret = EINVAL;
+ break;
+ }
+
+ irq = 0xFFFF;
+ } else if (!vcpu->vc_intr) {
+ /*
+ * Disable window exiting
+ */
+ if (vmread(VMCS_PROCBASED_CTLS, &procbased)) {
+				printf("vcpu_run_vmx: can't read "
+				    "procbased ctls on exit\n");
+ ret = EINVAL;
+ break;
+ } else {
+ procbased &= ~IA32_VMX_INTERRUPT_WINDOW_EXITING;
+ if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) {
+ printf("vcpu_run_vmx: can't write"
+ " procbased ctls on exit\n");
+ ret = EINVAL;
+ break;
+ }
+ }
+ }
+
+ /* Invalidate old TLB mappings */
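+		/*
+		 * vid_vpid matches the VPID programmed into the VMCS at
+		 * reset time (the VM id), so this single-context
+		 * invalidation flushes only this guest's cached
+		 * translations.
+		 */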
+ vid.vid_vpid = (uint64_t)vcpu->vc_parent->vm_id;
+ vid.vid_addr = 0ULL;
+ invvpid(IA32_VMX_INVVPID_SINGLE_CTX_GLB, &vid);
+
+ /* Start / resume the VCPU */
+ KERNEL_ASSERT_LOCKED();
+ KERNEL_UNLOCK();
+ ret = vmx_enter_guest(&vcpu->vc_control_pa,
+ &vcpu->vc_gueststate, resume, gdt.rd_base);
+
+ /* XXX */
+ tlbflushg();
+
+ exit_reason = VM_EXIT_NONE;
+ if (ret == 0) {
+ /*
+ * ret == 0 implies we entered the guest, and later
+ * exited for some valid reason
+ */
+ exitinfo = vmx_get_exit_info(
+ &vcpu->vc_gueststate.vg_eip, &exit_reason);
+ if (vmread(VMCS_GUEST_IA32_RFLAGS,
+ &vcpu->vc_gueststate.vg_eflags)) {
+				printf("vcpu_run_vmx: can't read guest eflags"
+ " during exit\n");
+ ret = EINVAL;
+ break;
+ }
+ }
+
+ if (ret || exitinfo != VMX_EXIT_INFO_COMPLETE ||
+ exit_reason != VMX_EXIT_EXTINT) {
+ KERNEL_LOCK();
+ locked = 1;
+ } else
+ locked = 0;
+
+ /* If we exited successfully ... */
+ if (ret == 0) {
+ resume = 1;
+ if (!(exitinfo & VMX_EXIT_INFO_HAVE_RIP)) {
+				printf("vcpu_run_vmx: cannot read guest eip\n");
+ ret = EINVAL;
+ break;
+ }
+
+ if (!(exitinfo & VMX_EXIT_INFO_HAVE_REASON)) {
+				printf("vcpu_run_vmx: can't read exit reason\n");
+ ret = EINVAL;
+ break;
+ }
+
+ /*
+ * Handle the exit. This will alter "ret" to EAGAIN if
+ * the exit handler determines help from vmd is needed.
+ */
+ vcpu->vc_gueststate.vg_exit_reason = exit_reason;
+ ret = vmx_handle_exit(vcpu);
+
+ /*
+ * When the guest exited due to an external interrupt,
+ * we do not yet hold the kernel lock: we need to
+ * handle interrupts first before grabbing the lock:
+ * the interrupt handler might do work that
+ * another CPU holding the kernel lock waits for.
+ *
+ * Example: the TLB shootdown code in the pmap module
+ * sends an IPI to all other CPUs and busy-waits for
+ * them to decrement tlb_shoot_wait to zero. While
+ * busy-waiting, the kernel lock is held.
+ *
+ * If this code here attempted to grab the kernel lock
+ * before handling the interrupt, it would block
+ * forever.
+ */
+ if (!locked)
+ KERNEL_LOCK();
+
+ if (vcpu->vc_gueststate.vg_eflags & PSL_I)
+ vcpu->vc_irqready = 1;
+ else
+ vcpu->vc_irqready = 0;
+
+ /*
+ * If not ready for interrupts, but interrupts pending,
+ * enable interrupt window exiting.
+ */
+ if (vcpu->vc_irqready == 0 && vcpu->vc_intr) {
+ if (vmread(VMCS_PROCBASED_CTLS, &procbased)) {
+ printf("vcpu_run_vmx: can't read"
+ " procbased ctls on intwin exit\n");
+ ret = EINVAL;
+ break;
+ }
+
+ procbased |= IA32_VMX_INTERRUPT_WINDOW_EXITING;
+ if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) {
+ printf("vcpu_run_vmx: can't write"
+ " procbased ctls on intwin exit\n");
+ ret = EINVAL;
+ break;
+ }
+ }
+
+ /*
+ * Exit to vmd if we are terminating, failed to enter,
+ * or need help (device I/O)
+ */
+ if (ret || vcpu_must_stop(vcpu))
+ break;
+
+ if (vcpu->vc_intr && vcpu->vc_irqready) {
+ ret = EAGAIN;
+ break;
+ }
+
+ /* Check if we should yield - don't hog the cpu */
+ spc = &ci->ci_schedstate;
+ if (spc->spc_schedflags & SPCF_SHOULDYIELD) {
+ resume = 0;
+ if (vmclear(&vcpu->vc_control_pa)) {
+ ret = EINVAL;
+ break;
+ }
+ yield();
+ }
+ } else if (ret == VMX_FAIL_LAUNCH_INVALID_VMCS) {
+ printf("vcpu_run_vmx: failed launch with invalid "
+ "vmcs\n");
+#ifdef VMM_DEBUG
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+#endif /* VMM_DEBUG */
+ ret = EINVAL;
+ } else if (ret == VMX_FAIL_LAUNCH_VALID_VMCS) {
+ exit_reason = vcpu->vc_gueststate.vg_exit_reason;
+ printf("vcpu_run_vmx: failed launch with valid "
+ "vmcs, code=%d (%s)\n", exit_reason,
+ vmx_instruction_error_decode(exit_reason));
+ if (vmread(VMCS_INSTRUCTION_ERROR, &insn_error)) {
+ printf("vcpu_run_vmx: can't read"
+ " insn error field\n");
+ } else
+ printf("vcpu_run_vmx: insn error code = "
+ "%d\n", insn_error);
+#ifdef VMM_DEBUG
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+#endif /* VMM_DEBUG */
+ ret = EINVAL;
+ } else {
+ printf("vcpu_run_vmx: failed launch for unknown "
+ "reason %d\n", ret);
+#ifdef VMM_DEBUG
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+#endif /* VMM_DEBUG */
+ ret = EINVAL;
+ }
+ }
+
+ /*
+ * We are heading back to userspace (vmd), either because we need help
+ * handling an exit, a guest interrupt is pending, or we failed in some
+ * way to enter the guest. Clear any current VMCS pointer as we may end
+ * up coming back on a different CPU.
+ */
+ if (!vmptrst(&vmcs_ptr)) {
+ if (vmcs_ptr != 0xFFFFFFFFFFFFFFFFULL)
+ if (vmclear(&vcpu->vc_control_pa))
+ ret = EINVAL;
+ } else
+ ret = EINVAL;
+
+ return (ret);
+}
+
+/*
+ * vmx_handle_intr
+ *
+ * Handle host (external) interrupts. We read which interrupt fired by
+ * extracting the vector from the VMCS and dispatch the interrupt directly
+ * to the host using vmm_dispatch_intr.
+ */
+void
+vmx_handle_intr(struct vcpu *vcpu)
+{
+ uint8_t vec;
+ uint32_t eii;
+ struct gate_descriptor *idte;
+ vaddr_t handler;
+
+ if (vmread(VMCS_EXIT_INTERRUPTION_INFO, &eii)) {
+ printf("vmx_handle_intr: can't obtain intr info\n");
+ return;
+ }
+
+ vec = eii & 0xFF;
+
+ /* XXX check "error valid" code in eii, abort if 0 */
+	idte = &idt[vec];
+ handler = idte->gd_looffset + ((uint64_t)idte->gd_hioffset << 16);
+ vmm_dispatch_intr(handler);
+}
+
+/*
+ * vmx_handle_hlt
+ *
+ * Handle HLT exits
+ */
+int
+vmx_handle_hlt(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("vmx_handle_hlt: can't obtain instruction length\n");
+ return (EINVAL);
+ }
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+ return (EAGAIN);
+}
+
+/*
+ * vmx_get_exit_info
+ *
+ * Returns exit information containing the current guest EIP and exit reason
+ * in 'eip' and 'exit_reason'. The return value is a bitmask indicating whether
+ * reading the EIP and exit reason was successful.
+ */
+int
+vmx_get_exit_info(uint32_t *eip, uint32_t *exit_reason)
+{
+ int rv = 0;
+
+ if (vmread(VMCS_GUEST_IA32_RIP, eip) == 0) {
+ rv |= VMX_EXIT_INFO_HAVE_RIP;
+ if (vmread(VMCS_EXIT_REASON, exit_reason) == 0)
+ rv |= VMX_EXIT_INFO_HAVE_REASON;
+ }
+ return (rv);
+}
+
+/*
+ * vmx_handle_exit
+ *
+ * Handle exits from the VM by decoding the exit reason and calling various
+ * subhandlers as needed.
+ */
+int
+vmx_handle_exit(struct vcpu *vcpu)
+{
+ uint64_t exit_reason;
+ uint32_t eflags;
+ int update_rip, ret = 0;
+
+ update_rip = 0;
+ exit_reason = vcpu->vc_gueststate.vg_exit_reason;
+ eflags = vcpu->vc_gueststate.vg_eflags;
+
+ switch (exit_reason) {
+ case VMX_EXIT_INT_WINDOW:
+ if (!(eflags & PSL_I)) {
+ DPRINTF("vmx_handle_exit: impossible interrupt window"
+ " exit config\n");
+ ret = EINVAL;
+ break;
+ }
+
+ ret = EAGAIN;
+ update_rip = 0;
+ break;
+ case VMX_EXIT_EPT_VIOLATION:
+ ret = vmx_handle_np_fault(vcpu);
+ break;
+ case VMX_EXIT_CPUID:
+ ret = vmx_handle_cpuid(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_IO:
+ ret = vmx_handle_inout(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_EXTINT:
+ vmx_handle_intr(vcpu);
+ update_rip = 0;
+ break;
+ case VMX_EXIT_CR_ACCESS:
+ ret = vmx_handle_cr(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_HLT:
+ ret = vmx_handle_hlt(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_RDMSR:
+ ret = vmx_handle_rdmsr(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_WRMSR:
+ ret = vmx_handle_wrmsr(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_TRIPLE_FAULT:
+#ifdef VMM_DEBUG
+ DPRINTF("vmx_handle_exit: vm %d vcpu %d triple fault\n",
+ vcpu->vc_parent->vm_id, vcpu->vc_id);
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+ vmx_dump_vmcs(vcpu);
+#endif /* VMM_DEBUG */
+ ret = EAGAIN;
+ update_rip = 0;
+ break;
+ default:
+		DPRINTF("vmx_handle_exit: unhandled exit %llu (%s)\n",
+ exit_reason, vmx_exit_reason_decode(exit_reason));
+ return (EINVAL);
+ }
+
+ if (update_rip) {
+ if (vmwrite(VMCS_GUEST_IA32_RIP,
+ vcpu->vc_gueststate.vg_eip)) {
+ printf("vmx_handle_exit: can't advance rip\n");
+ return (EINVAL);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * vmm_get_guest_memtype
+ *
+ * Returns the type of memory 'gpa' refers to in the context of vm 'vm'
+ */
+int
+vmm_get_guest_memtype(struct vm *vm, paddr_t gpa)
+{
+ int i;
+ struct vm_mem_range *vmr;
+
+ if (gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) {
+ DPRINTF("guest mmio access @ 0x%llx\n", (uint64_t)gpa);
+ return (VMM_MEM_TYPE_REGULAR);
+ }
+
+ /* XXX Use binary search? */
+ for (i = 0; i < vm->vm_nmemranges; i++) {
+ vmr = &vm->vm_memranges[i];
+
+ /*
+		 * vm_memranges are in ascending order, so if gpa is below
+		 * the start of this range it cannot be in this or any
+		 * later memrange.
+ */
+ if (gpa < vmr->vmr_gpa)
+ break;
+
+ if (gpa < vmr->vmr_gpa + vmr->vmr_size)
+ return (VMM_MEM_TYPE_REGULAR);
+ }
+
+ DPRINTF("guest memtype @ 0x%llx unknown\n", (uint64_t)gpa);
+ return (VMM_MEM_TYPE_UNKNOWN);
+}
+
+/*
+ * vmm_get_guest_faulttype
+ *
+ * Determines the type (R/W/X) of the last fault on the VCPU last run on
+ * this PCPU. Calls the appropriate architecture-specific subroutine.
+ */
+int
+vmm_get_guest_faulttype(void)
+{
+ if (vmm_softc->mode == VMM_MODE_EPT)
+ return vmx_get_guest_faulttype();
+ else if (vmm_softc->mode == VMM_MODE_RVI)
+		return svm_get_guest_faulttype();
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vmx_get_exit_qualification
+ *
+ * Return the current VMCS' exit qualification information
+ */
+int
+vmx_get_exit_qualification(uint32_t *exit_qualification)
+{
+ if (vmread(VMCS_GUEST_EXIT_QUALIFICATION, exit_qualification)) {
+		printf("vmx_get_exit_qualification: can't extract exit qual\n");
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * vmx_get_guest_faulttype
+ *
+ * Determines the type (R/W/X) of the last fault on the VCPU last run on
+ * this PCPU.
+ */
+int
+vmx_get_guest_faulttype(void)
+{
+ uint32_t exit_qualification;
+ uint64_t presentmask = IA32_VMX_EPT_FAULT_WAS_READABLE |
+ IA32_VMX_EPT_FAULT_WAS_WRITABLE | IA32_VMX_EPT_FAULT_WAS_EXECABLE;
+ uint64_t protmask = IA32_VMX_EPT_FAULT_READ |
+ IA32_VMX_EPT_FAULT_WRITE | IA32_VMX_EPT_FAULT_EXEC;
+
+ if (vmx_get_exit_qualification(&exit_qualification))
+ return (-1);
+
+ if ((exit_qualification & presentmask) == 0)
+ return VM_FAULT_INVALID;
+ if (exit_qualification & protmask)
+ return VM_FAULT_PROTECT;
+ return (-1);
+}
+
+/*
+ * svm_get_guest_faulttype
+ *
+ * Determines the type (R/W/X) of the last fault on the VCPU last run on
+ * this PCPU.
+ */
+int
+svm_get_guest_faulttype(void)
+{
+ /* XXX removed due to rot */
+ return (-1);
+}
+
+/*
+ * vmx_fault_page
+ *
+ * Request a new page to be faulted into the UVM map of the VM owning 'vcpu'
+ * at address 'gpa'.
+ */
+int
+vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
+{
+ int fault_type, ret;
+
+ fault_type = vmx_get_guest_faulttype();
+ if (fault_type == -1) {
+ printf("vmx_fault_page: invalid fault type\n");
+ return (EINVAL);
+ }
+
+ ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, fault_type,
+ PROT_READ | PROT_WRITE | PROT_EXEC);
+ if (ret)
+ printf("vmx_fault_page: uvm_fault returns %d\n", ret);
+
+ return (ret);
+}
+
+/*
+ * vmx_handle_np_fault
+ *
+ * High level nested paging handler for VMX. Verifies that a fault is for a
+ * valid memory region, then faults a page, or aborts otherwise.
+ */
+int
+vmx_handle_np_fault(struct vcpu *vcpu)
+{
+ uint64_t gpa;
+ uint32_t gpa_lo, gpa_hi;
+ int gpa_memtype, ret;
+
+ ret = 0;
+ if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &gpa_lo)) {
+		printf("vmx_handle_np_fault: cannot extract faulting pa lo\n");
+ return (EINVAL);
+ }
+
+ if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS_HI, &gpa_hi)) {
+		printf("vmx_handle_np_fault: cannot extract faulting pa hi\n");
+ return (EINVAL);
+ }
+
+ gpa = (uint64_t)gpa_lo | (uint64_t)gpa_hi << 32ULL;
+
+ gpa_memtype = vmm_get_guest_memtype(vcpu->vc_parent, gpa);
+ switch (gpa_memtype) {
+ case VMM_MEM_TYPE_REGULAR:
+ ret = vmx_fault_page(vcpu, gpa);
+ break;
+ default:
+ printf("unknown memory type %d for GPA 0x%llx\n",
+ gpa_memtype, gpa);
+ return (EINVAL);
+ }
+
+ return (ret);
+}
+
+/*
+ * vmx_handle_inout
+ *
+ * Exit handler for IN/OUT instructions.
+ *
+ * The vmm can handle certain IN/OUTS without exiting to vmd, but most of these
+ * will be passed to vmd for completion.
+ */
+int
+vmx_handle_inout(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint32_t exit_qual;
+ int ret;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("vmx_handle_inout: can't obtain instruction length\n");
+ return (EINVAL);
+ }
+
+ if (vmx_get_exit_qualification(&exit_qual)) {
+ printf("vmx_handle_inout: can't get exit qual\n");
+ return (EINVAL);
+ }
+
+ /* Bits 0:2 - size of exit */
+ vcpu->vc_exit.vei.vei_size = (exit_qual & 0x7) + 1;
+ /* Bit 3 - direction */
+ vcpu->vc_exit.vei.vei_dir = (exit_qual & 0x8) >> 3;
+ /* Bit 4 - string instruction? */
+ vcpu->vc_exit.vei.vei_string = (exit_qual & 0x10) >> 4;
+ /* Bit 5 - REP prefix? */
+ vcpu->vc_exit.vei.vei_rep = (exit_qual & 0x20) >> 5;
+ /* Bit 6 - Operand encoding */
+ vcpu->vc_exit.vei.vei_encoding = (exit_qual & 0x40) >> 6;
+ /* Bit 16:31 - port */
+ vcpu->vc_exit.vei.vei_port = (exit_qual & 0xFFFF0000) >> 16;
+ /* Data */
+ vcpu->vc_exit.vei.vei_data = vcpu->vc_gueststate.vg_eax;
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ /*
+ * The following ports usually belong to devices owned by vmd.
+ * Return EAGAIN to signal help needed from userspace (vmd).
+ * Return 0 to indicate we don't care about this port.
+ *
+ * XXX something better than a hardcoded list here, maybe
+ * configure via vmd via the device list in vm create params?
+ *
+ * XXX handle not eax target
+ */
+ switch (vcpu->vc_exit.vei.vei_port) {
+ case IO_ICU1 ... IO_ICU1 + 1:
+ case 0x40 ... 0x43:
+ case IO_RTC ... IO_RTC + 1:
+ case IO_ICU2 ... IO_ICU2 + 1:
+ case 0x3f8 ... 0x3ff:
+ case 0xcf8:
+ case 0xcfc:
+ case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
+ ret = EAGAIN;
+ break;
+ default:
+		/* Reads from unsupported ports return FFs */
+ if (vcpu->vc_exit.vei.vei_dir == 1)
+ vcpu->vc_gueststate.vg_eax = 0xFFFFFFFF;
+ ret = 0;
+ }
+
+ return (ret);
+}
+
+/*
+ * vmx_handle_cr
+ *
+ * Handle reads/writes to control registers (except CR3)
+ */
+int
+vmx_handle_cr(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint32_t exit_qual;
+ uint8_t crnum, dir;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("vmx_handle_cr: can't obtain instruction length\n");
+ return (EINVAL);
+ }
+
+ if (vmx_get_exit_qualification(&exit_qual)) {
+ printf("vmx_handle_cr: can't get exit qual\n");
+ return (EINVAL);
+ }
+
+ /* Low 4 bits of exit_qual represent the CR number */
+ crnum = exit_qual & 0xf;
+
+ dir = (exit_qual & 0x30) >> 4;
+
+ switch (dir) {
+ case CR_WRITE:
+ DPRINTF("vmx_handle_cr: mov to cr%d @ %x\n",
+ crnum, vcpu->vc_gueststate.vg_eip);
+ break;
+ case CR_READ:
+ DPRINTF("vmx_handle_cr: mov from cr%d @ %x\n",
+ crnum, vcpu->vc_gueststate.vg_eip);
+ break;
+ case CR_CLTS:
+ DPRINTF("vmx_handle_cr: clts instruction @ %x\n",
+ vcpu->vc_gueststate.vg_eip);
+ break;
+ case CR_LMSW:
+ DPRINTF("vmx_handle_cr: lmsw instruction @ %x\n",
+ vcpu->vc_gueststate.vg_eip);
+ break;
+ default:
+ DPRINTF("vmx_handle_cr: unknown cr access @ %x\n",
+ vcpu->vc_gueststate.vg_eip);
+ }
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ return (0);
+}
+
+/*
+ * vmx_handle_rdmsr
+ *
+ * Handler for rdmsr instructions. Bitmap MSRs are allowed implicit access
+ * and won't end up here. This handler is primarily intended to catch otherwise
+ * unknown MSR access for possible later inclusion in the bitmap list. For
+ * each MSR access that ends up here, we log the access.
+ *
+ * Parameters:
+ * vcpu: vcpu structure containing instruction info causing the exit
+ *
+ * Return value:
+ * 0: The operation was successful
+ *  EINVAL: An error occurred
+ */
+int
+vmx_handle_rdmsr(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint64_t msr;
+ uint32_t *eax, *ecx, *edx;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("%s: can't obtain instruction length\n", __func__);
+ return (EINVAL);
+ }
+
+ /* All RDMSR instructions are 0x0F 0x32 */
+ KASSERT(insn_length == 2);
+
+ eax = &vcpu->vc_gueststate.vg_eax;
+ ecx = &vcpu->vc_gueststate.vg_ecx;
+ edx = &vcpu->vc_gueststate.vg_edx;
+
+ msr = rdmsr(*ecx);
+ *eax = msr & 0xFFFFFFFFULL;
+ *edx = msr >> 32;
+
+ /* XXX log the access for now, to be able to identify unknown MSRs */
+ printf("%s: rdmsr exit, msr=0x%x, data returned to "
+ "guest=0x%x:0x%x\n", __func__, *ecx, *edx, *eax);
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ return (0);
+}
+
+/*
+ * vmx_handle_wrmsr
+ *
+ * Handler for wrmsr instructions. This handler logs the access, and discards
+ * the written data. Writes to MSRs that are whitelisted in the MSR bitmap
+ * do not cause an exit and will not end up here.
+ *
+ * Parameters:
+ * vcpu: vcpu structure containing instruction info causing the exit
+ *
+ * Return value:
+ * 0: The operation was successful
+ *  EINVAL: An error occurred
+ */
+int
+vmx_handle_wrmsr(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint32_t *eax, *ecx, *edx;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("%s: can't obtain instruction length\n", __func__);
+ return (EINVAL);
+ }
+
+ /* All WRMSR instructions are 0x0F 0x30 */
+ KASSERT(insn_length == 2);
+
+ eax = &vcpu->vc_gueststate.vg_eax;
+ ecx = &vcpu->vc_gueststate.vg_ecx;
+ edx = &vcpu->vc_gueststate.vg_edx;
+
+ /* XXX log the access for now, to be able to identify unknown MSRs */
+ printf("%s: wrmsr exit, msr=0x%x, discarding data written from "
+ "guest=0x%x:0x%x\n", __func__, *ecx, *edx, *eax);
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ return (0);
+}
+
+/*
+ * vmx_handle_cpuid
+ *
+ * Exit handler for CPUID instruction
+ */
+int
+vmx_handle_cpuid(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint32_t *eax, *ebx, *ecx, *edx;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("vmx_handle_cpuid: can't obtain instruction length\n");
+ return (EINVAL);
+ }
+
+ /* All CPUID instructions are 0x0F 0xA2 */
+ KASSERT(insn_length == 2);
+
+ eax = &vcpu->vc_gueststate.vg_eax;
+ ebx = &vcpu->vc_gueststate.vg_ebx;
+ ecx = &vcpu->vc_gueststate.vg_ecx;
+ edx = &vcpu->vc_gueststate.vg_edx;
+
+ switch (*eax) {
+ case 0x00: /* Max level and vendor ID */
+ *eax = 0x07; /* cpuid_level */
+ *ebx = *((uint32_t *)&cpu_vendor);
+ *edx = *((uint32_t *)&cpu_vendor + 1);
+ *ecx = *((uint32_t *)&cpu_vendor + 2);
+ break;
+ case 0x01: /* Version, brand, feature info */
+ *eax = cpu_id;
+ /* mask off host's APIC ID, reset to vcpu id */
+ *ebx = cpu_miscinfo & 0x00FFFFFF;
+		*ebx |= (vcpu->vc_id & 0xFF) << 24;
+ /*
+ * clone host capabilities minus:
+ * debug store (CPUIDECX_DTES64, CPUIDECX_DSCPL, CPUID_DS)
+ * monitor/mwait (CPUIDECX_MWAIT)
+ * vmx (CPUIDECX_VMX)
+ * smx (CPUIDECX_SMX)
+ * speedstep (CPUIDECX_EST)
+ * thermal (CPUIDECX_TM2, CPUID_ACPI, CPUID_TM)
+ * context id (CPUIDECX_CNXTID)
+ * silicon debug (CPUIDECX_SDBG)
+ * xTPR (CPUIDECX_XTPR)
+ * perf/debug (CPUIDECX_PDCM)
+ * pcid (CPUIDECX_PCID)
+ * direct cache access (CPUIDECX_DCA)
+ * x2APIC (CPUIDECX_X2APIC)
+ * apic deadline (CPUIDECX_DEADLINE)
+ * timestamp (CPUID_TSC)
+ * apic (CPUID_APIC)
+ * psn (CPUID_PSN)
+ * self snoop (CPUID_SS)
+ * hyperthreading (CPUID_HTT)
+ * pending break enabled (CPUID_PBE)
+ * MTRR (CPUID_MTRR)
+ * plus:
+ * hypervisor (CPUIDECX_HV)
+ */
+ *ecx = (cpu_ecxfeature | CPUIDECX_HV) &
+ ~(CPUIDECX_EST | CPUIDECX_TM2 |
+ CPUIDECX_MWAIT | CPUIDECX_PDCM |
+ CPUIDECX_VMX | CPUIDECX_DTES64 |
+ CPUIDECX_DSCPL | CPUIDECX_SMX |
+ CPUIDECX_CNXTID | CPUIDECX_SDBG |
+ CPUIDECX_XTPR |
+ CPUIDECX_PCID | CPUIDECX_DCA |
+ CPUIDECX_X2APIC | CPUIDECX_DEADLINE);
+ *edx = curcpu()->ci_feature_flags &
+ ~(CPUID_ACPI | CPUID_TM | CPUID_TSC |
+ CPUID_HTT | CPUID_DS | CPUID_APIC |
+ CPUID_PSN | CPUID_SS | CPUID_PBE |
+ CPUID_MTRR);
+ break;
+ case 0x02: /* Cache and TLB information */
+ DPRINTF("vmx_handle_cpuid: function 0x02 (cache/TLB) not"
+ " supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x03: /* Processor serial number (not supported) */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x04:
+ DPRINTF("vmx_handle_cpuid: function 0x04 (deterministic "
+ "cache info) not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x05: /* MONITOR/MWAIT (not supported) */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x06: /* Thermal / Power management */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x07: /* SEFF */
+ if (*ecx == 0) {
+ /*
+ * SEFF flags - copy from host minus:
+ * SGX (SEFF0EBX_SGX)
+ * HLE (SEFF0EBX_HLE)
+ * INVPCID (SEFF0EBX_INVPCID)
+ * RTM (SEFF0EBX_RTM)
+ * PQM (SEFF0EBX_PQM)
+ * MPX (SEFF0EBX_MPX)
+ * PCOMMIT (SEFF0EBX_PCOMMIT)
+ * PT (SEFF0EBX_PT)
+ */
+ *eax = 0; /* Highest subleaf supported */
+ *ebx = curcpu()->ci_feature_sefflags_ebx &
+ ~(SEFF0EBX_SGX | SEFF0EBX_HLE | SEFF0EBX_INVPCID |
+ SEFF0EBX_RTM | SEFF0EBX_PQM | SEFF0EBX_MPX |
+ SEFF0EBX_PCOMMIT | SEFF0EBX_PT);
+ *ecx = curcpu()->ci_feature_sefflags_ecx;
+ *edx = 0;
+ } else {
+ /* Unsupported subleaf */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ }
+ break;
+ case 0x09: /* Direct Cache Access (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x09 (direct cache access)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x0a: /* Architectural performance monitoring */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x0b: /* Extended topology enumeration (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x0b (topology enumeration)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x0d: /* Processor ext. state information (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x0d (ext. state info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x0f: /* QoS info (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x0f (QoS info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x14: /* Processor Trace info (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x14 (processor trace info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x15: /* TSC / Core Crystal Clock info (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x15 (TSC / CCC info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x16: /* Processor frequency info (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x16 (frequency info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x40000000: /* Hypervisor information */
+ *eax = 0;
+ *ebx = *((uint32_t *)&vmm_hv_signature[0]);
+ *ecx = *((uint32_t *)&vmm_hv_signature[4]);
+ *edx = *((uint32_t *)&vmm_hv_signature[8]);
+ break;
+ case 0x80000000: /* Extended function level */
+ *eax = 0x80000007; /* curcpu()->ci_pnfeatset */
+ *ebx = 0;
+ *ecx = 0;
+		*edx = 0;
+		break;
+ case 0x80000001: /* Extended function info */
+ *eax = ecpu_eaxfeature;
+ *ebx = 0; /* Reserved */
+ *ecx = ecpu_ecxfeature;
+ *edx = ecpu_feature;
+ break;
+ case 0x80000002: /* Brand string */
+ *eax = cpu_brandstr[0];
+ *ebx = cpu_brandstr[1];
+ *ecx = cpu_brandstr[2];
+ *edx = cpu_brandstr[3];
+ break;
+ case 0x80000003: /* Brand string */
+ *eax = cpu_brandstr[4];
+ *ebx = cpu_brandstr[5];
+ *ecx = cpu_brandstr[6];
+ *edx = cpu_brandstr[7];
+ break;
+ case 0x80000004: /* Brand string */
+ *eax = cpu_brandstr[8];
+ *ebx = cpu_brandstr[9];
+ *ecx = cpu_brandstr[10];
+ *edx = cpu_brandstr[11];
+ break;
+ case 0x80000005: /* Reserved (Intel), cacheinfo (AMD) */
+ *eax = curcpu()->ci_amdcacheinfo[0];
+ *ebx = curcpu()->ci_amdcacheinfo[1];
+ *ecx = curcpu()->ci_amdcacheinfo[2];
+ *edx = curcpu()->ci_amdcacheinfo[3];
+ break;
+ case 0x80000006: /* ext. cache info */
+ *eax = curcpu()->ci_extcacheinfo[0];
+ *ebx = curcpu()->ci_extcacheinfo[1];
+ *ecx = curcpu()->ci_extcacheinfo[2];
+ *edx = curcpu()->ci_extcacheinfo[3];
+ break;
+ case 0x80000007: /* apmi */
+ *eax = 0; /* Reserved */
+ *ebx = 0; /* Reserved */
+ *ecx = 0; /* Reserved */
+ *edx = 0; /* unsupported ITSC */
+ break;
+ case 0x80000008: /* Phys bits info and topology (AMD) */
+ DPRINTF("vmx_handle_cpuid: function 0x80000008 (phys bits info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ default:
+ DPRINTF("vmx_handle_cpuid: unsupported eax=0x%x\n", *eax);
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ }
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ return (0);
+}
+
+/*
+ * vcpu_run_svm
+ *
+ * VMM main loop used to run a VCPU.
+ */
+int
+vcpu_run_svm(struct vcpu *vcpu, struct vm_run_params *vrp)
+{
+ /* XXX removed due to rot */
+ return (0);
+}
+
+/*
+ * vmx_exit_reason_decode
+ *
+ * Returns a human readable string describing exit type 'code'
+ */
+const char *
+vmx_exit_reason_decode(uint32_t code)
+{
+ switch (code) {
+ case VMX_EXIT_NMI: return "NMI";
+ case VMX_EXIT_EXTINT: return "external interrupt";
+ case VMX_EXIT_TRIPLE_FAULT: return "triple fault";
+ case VMX_EXIT_INIT: return "INIT signal";
+ case VMX_EXIT_SIPI: return "SIPI signal";
+ case VMX_EXIT_IO_SMI: return "I/O SMI";
+ case VMX_EXIT_OTHER_SMI: return "other SMI";
+ case VMX_EXIT_INT_WINDOW: return "interrupt window";
+ case VMX_EXIT_NMI_WINDOW: return "NMI window";
+ case VMX_EXIT_TASK_SWITCH: return "task switch";
+ case VMX_EXIT_CPUID: return "CPUID instruction";
+ case VMX_EXIT_GETSEC: return "GETSEC instruction";
+ case VMX_EXIT_HLT: return "HLT instruction";
+ case VMX_EXIT_INVD: return "INVD instruction";
+ case VMX_EXIT_INVLPG: return "INVLPG instruction";
+ case VMX_EXIT_RDPMC: return "RDPMC instruction";
+ case VMX_EXIT_RDTSC: return "RDTSC instruction";
+ case VMX_EXIT_RSM: return "RSM instruction";
+ case VMX_EXIT_VMCALL: return "VMCALL instruction";
+ case VMX_EXIT_VMCLEAR: return "VMCLEAR instruction";
+ case VMX_EXIT_VMLAUNCH: return "VMLAUNCH instruction";
+ case VMX_EXIT_VMPTRLD: return "VMPTRLD instruction";
+ case VMX_EXIT_VMPTRST: return "VMPTRST instruction";
+ case VMX_EXIT_VMREAD: return "VMREAD instruction";
+ case VMX_EXIT_VMRESUME: return "VMRESUME instruction";
+ case VMX_EXIT_VMWRITE: return "VMWRITE instruction";
+ case VMX_EXIT_VMXOFF: return "VMXOFF instruction";
+ case VMX_EXIT_VMXON: return "VMXON instruction";
+ case VMX_EXIT_CR_ACCESS: return "CR access";
+ case VMX_EXIT_MOV_DR: return "MOV DR instruction";
+ case VMX_EXIT_IO: return "I/O instruction";
+ case VMX_EXIT_RDMSR: return "RDMSR instruction";
+ case VMX_EXIT_WRMSR: return "WRMSR instruction";
+ case VMX_EXIT_ENTRY_FAILED_GUEST_STATE: return "guest state invalid";
+ case VMX_EXIT_ENTRY_FAILED_MSR_LOAD: return "MSR load failed";
+ case VMX_EXIT_MWAIT: return "MWAIT instruction";
+ case VMX_EXIT_MTF: return "monitor trap flag";
+ case VMX_EXIT_MONITOR: return "MONITOR instruction";
+ case VMX_EXIT_PAUSE: return "PAUSE instruction";
+ case VMX_EXIT_ENTRY_FAILED_MCE: return "MCE during entry";
+ case VMX_EXIT_TPR_BELOW_THRESHOLD: return "TPR below threshold";
+ case VMX_EXIT_APIC_ACCESS: return "APIC access";
+ case VMX_EXIT_VIRTUALIZED_EOI: return "virtualized EOI";
+ case VMX_EXIT_GDTR_IDTR: return "GDTR/IDTR access";
+ case VMX_EXIT_LDTR_TR: return "LDTR/TR access";
+ case VMX_EXIT_EPT_VIOLATION: return "EPT violation";
+ case VMX_EXIT_EPT_MISCONFIGURATION: return "EPT misconfiguration";
+ case VMX_EXIT_INVEPT: return "INVEPT instruction";
+ case VMX_EXIT_RDTSCP: return "RDTSCP instruction";
+ case VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED:
+ return "preemption timer expired";
+ case VMX_EXIT_INVVPID: return "INVVPID instruction";
+ case VMX_EXIT_WBINVD: return "WBINVD instruction";
+ case VMX_EXIT_XSETBV: return "XSETBV instruction";
+ case VMX_EXIT_APIC_WRITE: return "APIC write";
+ case VMX_EXIT_RDRAND: return "RDRAND instruction";
+ case VMX_EXIT_INVPCID: return "INVPCID instruction";
+ case VMX_EXIT_VMFUNC: return "VMFUNC instruction";
+ case VMX_EXIT_RDSEED: return "RDSEED instruction";
+ case VMX_EXIT_XSAVES: return "XSAVES instruction";
+ case VMX_EXIT_XRSTORS: return "XRSTORS instruction";
+ default: return "unknown";
+ }
+}
+
+/*
+ * vmx_instruction_error_decode
+ *
+ * Returns a human readable string describing the instruction error in 'code'
+ */
+const char *
+vmx_instruction_error_decode(uint32_t code)
+{
+ switch (code) {
+ case 1: return "VMCALL: unsupported in VMX root";
+ case 2: return "VMCLEAR: invalid paddr";
+ case 3: return "VMCLEAR: VMXON pointer";
+ case 4: return "VMLAUNCH: non-clear VMCS";
+ case 5: return "VMRESUME: non-launched VMCS";
+ case 6: return "VMRESUME: executed after VMXOFF";
+ case 7: return "VM entry: invalid control field(s)";
+ case 8: return "VM entry: invalid host state field(s)";
+ case 9: return "VMPTRLD: invalid paddr";
+ case 10: return "VMPTRLD: VMXON pointer";
+ case 11: return "VMPTRLD: incorrect VMCS revid";
+ case 12: return "VMREAD/VMWRITE: unsupported VMCS field";
+ case 13: return "VMWRITE: RO VMCS field";
+ case 15: return "VMXON: unsupported in VMX root";
+ case 20: return "VMCALL: invalid VM exit control fields";
+ case 26: return "VM entry: blocked by MOV SS";
+ case 28: return "Invalid operand to INVEPT/INVVPID";
+ default: return "unknown";
+ }
+}
+
+/*
+ * vcpu_state_decode
+ *
+ * Returns a human readable string describing the vcpu state in 'state'.
+ */
+const char *
+vcpu_state_decode(u_int state)
+{
+ switch (state) {
+ case VCPU_STATE_STOPPED: return "stopped";
+ case VCPU_STATE_RUNNING: return "running";
+ case VCPU_STATE_REQTERM: return "requesting termination";
+ case VCPU_STATE_TERMINATED: return "terminated";
+ case VCPU_STATE_UNKNOWN: return "unknown";
+ default: return "invalid";
+ }
+}
+
+#ifdef VMM_DEBUG
+/*
+ * dump_vcpu
+ *
+ * Dumps the VMX capabilities of vcpu 'vcpu'
+ */
+void
+dump_vcpu(struct vcpu *vcpu)
+{
+ printf("vcpu @ %p\n", vcpu);
+ printf(" parent vm @ %p\n", vcpu->vc_parent);
+ printf(" mode: ");
+ if (vcpu->vc_virt_mode == VMM_MODE_VMX ||
+ vcpu->vc_virt_mode == VMM_MODE_EPT) {
+ printf("VMX\n");
+ printf(" pinbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_pinbased_ctls);
+ printf(" true pinbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_pinbased_ctls);
+ CTRL_DUMP(vcpu, PINBASED, EXTERNAL_INT_EXITING);
+ CTRL_DUMP(vcpu, PINBASED, NMI_EXITING);
+ CTRL_DUMP(vcpu, PINBASED, VIRTUAL_NMIS);
+ CTRL_DUMP(vcpu, PINBASED, ACTIVATE_VMX_PREEMPTION_TIMER);
+ CTRL_DUMP(vcpu, PINBASED, PROCESS_POSTED_INTERRUPTS);
+ printf(" procbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_procbased_ctls);
+ printf(" true procbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_procbased_ctls);
+ CTRL_DUMP(vcpu, PROCBASED, INTERRUPT_WINDOW_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_TSC_OFFSETTING);
+ CTRL_DUMP(vcpu, PROCBASED, HLT_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, INVLPG_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, MWAIT_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, RDPMC_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, RDTSC_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR3_LOAD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR3_STORE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR8_LOAD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR8_STORE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_TPR_SHADOW);
+ CTRL_DUMP(vcpu, PROCBASED, NMI_WINDOW_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, MOV_DR_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, UNCONDITIONAL_IO_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_IO_BITMAPS);
+ CTRL_DUMP(vcpu, PROCBASED, MONITOR_TRAP_FLAG);
+ CTRL_DUMP(vcpu, PROCBASED, USE_MSR_BITMAPS);
+ CTRL_DUMP(vcpu, PROCBASED, MONITOR_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, PAUSE_EXITING);
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ printf(" procbased2 ctls: 0x%llx\n",
+ vcpu->vc_vmx_procbased2_ctls);
+ CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_APIC);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_EPT);
+ CTRL_DUMP(vcpu, PROCBASED2, DESCRIPTOR_TABLE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_RDTSCP);
+ CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_X2APIC_MODE);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VPID);
+ CTRL_DUMP(vcpu, PROCBASED2, WBINVD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, UNRESTRICTED_GUEST);
+ CTRL_DUMP(vcpu, PROCBASED2,
+ APIC_REGISTER_VIRTUALIZATION);
+ CTRL_DUMP(vcpu, PROCBASED2,
+ VIRTUAL_INTERRUPT_DELIVERY);
+ CTRL_DUMP(vcpu, PROCBASED2, PAUSE_LOOP_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, RDRAND_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_INVPCID);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VM_FUNCTIONS);
+ CTRL_DUMP(vcpu, PROCBASED2, VMCS_SHADOWING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_ENCLS_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, RDSEED_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_PML);
+ CTRL_DUMP(vcpu, PROCBASED2, EPT_VIOLATION_VE);
+ CTRL_DUMP(vcpu, PROCBASED2, CONCEAL_VMX_FROM_PT);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_XSAVES_XRSTORS);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_TSC_SCALING);
+ }
+ printf(" entry ctls: 0x%llx\n",
+ vcpu->vc_vmx_entry_ctls);
+ printf(" true entry ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_entry_ctls);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_DEBUG_CONTROLS);
+ CTRL_DUMP(vcpu, ENTRY, IA32E_MODE_GUEST);
+ CTRL_DUMP(vcpu, ENTRY, ENTRY_TO_SMM);
+ CTRL_DUMP(vcpu, ENTRY, DEACTIVATE_DUAL_MONITOR_TREATMENT);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PAT_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_EFER_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_BNDCFGS_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, CONCEAL_VM_ENTRIES_FROM_PT);
+ printf(" exit ctls: 0x%llx\n",
+ vcpu->vc_vmx_exit_ctls);
+ printf(" true exit ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_exit_ctls);
+ CTRL_DUMP(vcpu, EXIT, SAVE_DEBUG_CONTROLS);
+ CTRL_DUMP(vcpu, EXIT, HOST_SPACE_ADDRESS_SIZE);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, ACKNOWLEDGE_INTERRUPT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_IA32_PAT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PAT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_IA32_EFER_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_EFER_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_VMX_PREEMPTION_TIMER);
+ CTRL_DUMP(vcpu, EXIT, CLEAR_IA32_BNDCFGS_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, CONCEAL_VM_EXITS_FROM_PT);
+ }
+}
+
+/*
+ * vmx_dump_vmcs_field
+ *
+ * Debug function to dump the contents of a single VMCS field
+ *
+ * Parameters:
+ * fieldid: VMCS Field ID
+ * msg: string to display
+ */
+void
+vmx_dump_vmcs_field(uint16_t fieldid, const char *msg)
+{
+ uint8_t width;
+ uint64_t val;
+ uint32_t val_lo, val_hi;
+
+ DPRINTF("%s (0x%04x): ", msg, fieldid);
+ width = (fieldid >> 13) & 0x3;
+
+ if (width == 1) {
+ if (vmread(fieldid, &val_lo)) {
+ DPRINTF("???? ");
+ return;
+ }
+ if (vmread(fieldid + 1, &val_hi)) {
+ DPRINTF("???? ");
+ return;
+ }
+
+		val = (uint64_t)val_lo | (uint64_t)val_hi << 32ULL;
+	} else {
+		/* 16-bit, 32-bit and natural-width fields fit in one vmread */
+		if (vmread(fieldid, &val_lo)) {
+			DPRINTF("???? ");
+			return;
+		}
+		val = (uint64_t)val_lo;
+	}
+
+ /*
+ * Field width encoding : bits 13:14
+ *
+ * 0: 16-bit
+ * 1: 64-bit
+ * 2: 32-bit
+ * 3: natural width
+ */
+ switch (width) {
+ case 0: DPRINTF("0x%04llx ", val); break;
+ case 1:
+ case 3: DPRINTF("0x%016llx ", val); break;
+ case 2: DPRINTF("0x%08llx ", val);
+ }
+}
+
+/*
+ * vmx_dump_vmcs
+ *
+ * Debug function to dump the contents of the current VMCS.
+ */
+void
+vmx_dump_vmcs(struct vcpu *vcpu)
+{
+ int has_sec, i;
+ uint32_t cr3_tgt_ct;
+
+ /* XXX save and load new vmcs, restore at end */
+
+ DPRINTF("--CURRENT VMCS STATE--\n");
+ DPRINTF("VMXON revision : 0x%x\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision);
+ DPRINTF("CR0 fixed0: 0x%llx\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0);
+ DPRINTF("CR0 fixed1: 0x%llx\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1);
+ DPRINTF("CR4 fixed0: 0x%llx\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0);
+ DPRINTF("CR4 fixed1: 0x%llx\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1);
+ DPRINTF("MSR table size: 0x%x\n",
+ 512 * (curcpu()->ci_vmm_cap.vcc_vmx.vmx_msr_table_size + 1));
+
+ has_sec = vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1);
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VPID, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_VPID, "VPID");
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS,
+ IA32_VMX_PROCESS_POSTED_INTERRUPTS, 1)) {
+ vmx_dump_vmcs_field(VMCS_POSTED_INT_NOTIF_VECTOR,
+ "Posted Int Notif Vec");
+ }
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_EPT_VIOLATION_VE, 1)) {
+ vmx_dump_vmcs_field(VMCS_EPTP_INDEX, "EPTP idx");
+ }
+ }
+
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_SEL, "G.ES");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_SEL, "G.CS");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_SEL, "G.SS");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_SEL, "G.DS");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_SEL, "G.FS");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_SEL, "G.GS");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_SEL, "LDTR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_SEL, "G.TR");
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_INTERRUPT_STATUS,
+ "Int sts");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_PML, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_PML_INDEX, "PML Idx");
+ }
+ }
+
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_ES_SEL, "H.ES");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_CS_SEL, "H.CS");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_SS_SEL, "H.SS");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_DS_SEL, "H.DS");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_FS_SEL, "H.FS");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_GS_SEL, "H.GS");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_IO_BITMAP_A, "I/O Bitmap A");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_BITMAP_B, "I/O Bitmap B");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_USE_MSR_BITMAPS, 1)) {
+ vmx_dump_vmcs_field(VMCS_MSR_BITMAP_ADDRESS, "MSR Bitmap");
+ DPRINTF("\n");
+ }
+
+ vmx_dump_vmcs_field(VMCS_EXIT_STORE_MSR_ADDRESS, "Exit Store MSRs");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXIT_LOAD_MSR_ADDRESS, "Exit Load MSRs");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_ENTRY_LOAD_MSR_ADDRESS, "Entry Load MSRs");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXECUTIVE_VMCS_POINTER, "Exec VMCS Ptr");
+ DPRINTF("\n");
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_PML, 1)) {
+ vmx_dump_vmcs_field(VMCS_PML_ADDRESS, "PML Addr");
+ DPRINTF("\n");
+ }
+ }
+
+ vmx_dump_vmcs_field(VMCS_TSC_OFFSET, "TSC Offset");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_USE_TPR_SHADOW, 1)) {
+ vmx_dump_vmcs_field(VMCS_VIRTUAL_APIC_ADDRESS,
+ "Virtual APIC Addr");
+ DPRINTF("\n");
+ }
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_VIRTUALIZE_APIC, 1)) {
+ vmx_dump_vmcs_field(VMCS_APIC_ACCESS_ADDRESS,
+ "APIC Access Addr");
+ DPRINTF("\n");
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS,
+ IA32_VMX_PROCESS_POSTED_INTERRUPTS, 1)) {
+ vmx_dump_vmcs_field(VMCS_POSTED_INTERRUPT_DESC,
+ "Posted Int Desc Addr");
+ DPRINTF("\n");
+ }
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VM_FUNCTIONS, 1)) {
+ vmx_dump_vmcs_field(VMCS_VM_FUNCTION_CONTROLS,
+ "VM Function Controls");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_EPT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_EPTP,
+ "EPT Pointer");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY, 1)) {
+ vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_0,
+ "EOI Exit Bitmap 0");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_1,
+ "EOI Exit Bitmap 1");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_2,
+ "EOI Exit Bitmap 2");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_3,
+ "EOI Exit Bitmap 3");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VM_FUNCTIONS, 1)) {
+ /* We assume all CPUs have the same VMFUNC caps */
+ if (curcpu()->ci_vmm_cap.vcc_vmx.vmx_vm_func & 0x1) {
+ vmx_dump_vmcs_field(VMCS_EPTP_LIST_ADDRESS,
+ "EPTP List Addr");
+ DPRINTF("\n");
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_VMCS_SHADOWING, 1)) {
+ vmx_dump_vmcs_field(VMCS_VMREAD_BITMAP_ADDRESS,
+ "VMREAD Bitmap Addr");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_VMWRITE_BITMAP_ADDRESS,
+ "VMWRITE Bitmap Addr");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_EPT_VIOLATION_VE, 1)) {
+ vmx_dump_vmcs_field(VMCS_VIRTUALIZATION_EXC_ADDRESS,
+ "#VE Addr");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_XSAVES_XRSTORS, 1)) {
+ vmx_dump_vmcs_field(VMCS_XSS_EXITING_BITMAP,
+ "XSS exiting bitmap addr");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_ENCLS_EXITING, 1)) {
+ vmx_dump_vmcs_field(VMCS_ENCLS_EXITING_BITMAP,
+ "Encls exiting bitmap addr");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_TSC_SCALING, 1)) {
+ vmx_dump_vmcs_field(VMCS_TSC_MULTIPLIER,
+ "TSC scaling factor");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_EPT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_PHYSICAL_ADDRESS,
+ "Guest PA");
+ DPRINTF("\n");
+ }
+ }
+
+ vmx_dump_vmcs_field(VMCS_LINK_POINTER, "VMCS Link Pointer");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DEBUGCTL, "Guest DEBUGCTL");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS,
+ IA32_VMX_LOAD_IA32_PAT_ON_ENTRY, 1) ||
+ vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_SAVE_IA32_PAT_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_PAT,
+ "Guest PAT");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS,
+ IA32_VMX_LOAD_IA32_EFER_ON_ENTRY, 1) ||
+ vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_SAVE_IA32_EFER_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_EFER,
+ "Guest EFER");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS,
+ IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_PERF_GBL_CTRL,
+ "Guest Perf Global Ctrl");
+ DPRINTF("\n");
+ }
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_EPT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_PDPTE0, "Guest PDPTE0");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_PDPTE1, "Guest PDPTE1");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_PDPTE2, "Guest PDPTE2");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_PDPTE3, "Guest PDPTE3");
+ DPRINTF("\n");
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS,
+ IA32_VMX_LOAD_IA32_BNDCFGS_ON_ENTRY, 1) ||
+ vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_CLEAR_IA32_BNDCFGS_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_BNDCFGS,
+ "Guest BNDCFGS");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_LOAD_IA32_PAT_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_PAT,
+ "Host PAT");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_LOAD_IA32_EFER_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_EFER,
+ "Host EFER");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_PERF_GBL_CTRL,
+ "Host Perf Global Ctrl");
+ DPRINTF("\n");
+ }
+
+ vmx_dump_vmcs_field(VMCS_PINBASED_CTLS, "Pinbased Ctrls");
+ vmx_dump_vmcs_field(VMCS_PROCBASED_CTLS, "Procbased Ctrls");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXCEPTION_BITMAP, "Exception Bitmap");
+ vmx_dump_vmcs_field(VMCS_PF_ERROR_CODE_MASK, "#PF Err Code Mask");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_PF_ERROR_CODE_MATCH, "#PF Err Code Match");
+ vmx_dump_vmcs_field(VMCS_CR3_TARGET_COUNT, "CR3 Tgt Count");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXIT_CTLS, "Exit Ctrls");
+ vmx_dump_vmcs_field(VMCS_EXIT_MSR_STORE_COUNT, "Exit MSR Store Ct");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXIT_MSR_LOAD_COUNT, "Exit MSR Load Ct");
+ vmx_dump_vmcs_field(VMCS_ENTRY_CTLS, "Entry Ctrls");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_ENTRY_MSR_LOAD_COUNT, "Entry MSR Load Ct");
+ vmx_dump_vmcs_field(VMCS_ENTRY_INTERRUPTION_INFO, "Entry Int. Info");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_ENTRY_EXCEPTION_ERROR_CODE,
+ "Entry Ex. Err Code");
+ vmx_dump_vmcs_field(VMCS_ENTRY_INSTRUCTION_LENGTH, "Entry Insn Len");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_USE_TPR_SHADOW, 1)) {
+ vmx_dump_vmcs_field(VMCS_TPR_THRESHOLD, "TPR Threshold");
+ DPRINTF("\n");
+ }
+
+ if (has_sec) {
+ vmx_dump_vmcs_field(VMCS_PROCBASED2_CTLS, "2ndary Ctrls");
+ DPRINTF("\n");
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_PAUSE_LOOP_EXITING, 1)) {
+ vmx_dump_vmcs_field(VMCS_PLE_GAP, "PLE Gap");
+ vmx_dump_vmcs_field(VMCS_PLE_WINDOW, "PLE Window");
+ }
+ DPRINTF("\n");
+ }
+
+ vmx_dump_vmcs_field(VMCS_INSTRUCTION_ERROR, "Insn Error");
+ vmx_dump_vmcs_field(VMCS_EXIT_REASON, "Exit Reason");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_EXIT_INTERRUPTION_INFO, "Exit Int. Info");
+ vmx_dump_vmcs_field(VMCS_EXIT_INTERRUPTION_ERR_CODE,
+ "Exit Int. Err Code");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_IDT_VECTORING_INFO, "IDT vect info");
+ vmx_dump_vmcs_field(VMCS_IDT_VECTORING_ERROR_CODE,
+ "IDT vect err code");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_INSTRUCTION_LENGTH, "Insn Len");
+ vmx_dump_vmcs_field(VMCS_EXIT_INSTRUCTION_INFO, "Exit Insn Info");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_LIMIT, "G. ES Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_LIMIT, "G. CS Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_LIMIT, "G. SS Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_LIMIT, "G. DS Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_LIMIT, "G. FS Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_LIMIT, "G. GS Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_LIMIT, "G. LDTR Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_LIMIT, "G. TR Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GDTR_LIMIT, "G. GDTR Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_IDTR_LIMIT, "G. IDTR Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_AR, "G. ES AR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_AR, "G. CS AR");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_AR, "G. SS AR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_AR, "G. DS AR");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_AR, "G. FS AR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_AR, "G. GS AR");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_AR, "G. LDTR AR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_AR, "G. TR AR");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_INTERRUPTIBILITY_ST, "G. Int St.");
+ vmx_dump_vmcs_field(VMCS_GUEST_ACTIVITY_STATE, "G. Act St.");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_SMBASE, "G. SMBASE");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_CS, "G. SYSENTER CS");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS,
+ IA32_VMX_ACTIVATE_VMX_PREEMPTION_TIMER, 1)) {
+ vmx_dump_vmcs_field(VMCS_VMX_PREEMPTION_TIMER_VAL,
+ "VMX Preempt Timer");
+ DPRINTF("\n");
+ }
+
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_CS, "H. SYSENTER CS");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_CR0_MASK, "CR0 Mask");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_CR4_MASK, "CR4 Mask");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_CR0_READ_SHADOW, "CR0 RD Shadow");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_CR4_READ_SHADOW, "CR4 RD Shadow");
+ DPRINTF("\n");
+
+ /* We assume all CPUs have the same max CR3 target ct */
+ cr3_tgt_ct = curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr3_tgt_count;
+ DPRINTF("Max CR3 target count: 0x%x\n", cr3_tgt_ct);
+ if (cr3_tgt_ct <= VMX_MAX_CR3_TARGETS) {
+ for (i = 0 ; i < cr3_tgt_ct; i++) {
+ vmx_dump_vmcs_field(VMCS_CR3_TARGET_0 + (2 * i),
+ "CR3 Target");
+ DPRINTF("\n");
+ }
+ } else {
+		DPRINTF("(Bogus CR3 Target Count > %d)\n", VMX_MAX_CR3_TARGETS);
+ }
+
+ vmx_dump_vmcs_field(VMCS_GUEST_EXIT_QUALIFICATION, "G. Exit Qual");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_RCX, "I/O RCX");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_RSI, "I/O RSI");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_RDI, "I/O RDI");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_RIP, "I/O RIP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_LINEAR_ADDRESS, "G. Lin Addr");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR0, "G. CR0");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR3, "G. CR3");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR4, "G. CR4");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_BASE, "G. ES Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_BASE, "G. CS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_BASE, "G. SS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_BASE, "G. DS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_BASE, "G. FS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_BASE, "G. GS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_BASE, "G. LDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_BASE, "G. TR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GDTR_BASE, "G. GDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_IDTR_BASE, "G. IDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DR7, "G. DR7");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_RSP, "G. RSP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_RIP, "G. RIP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_RFLAGS, "G. RFLAGS");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_PENDING_DBG_EXC, "G. Pend Dbg Exc");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_ESP, "G. SYSENTER ESP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_EIP, "G. SYSENTER EIP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_CR0, "H. CR0");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_CR3, "H. CR3");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_CR4, "H. CR4");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_FS_BASE, "H. FS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_GS_BASE, "H. GS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_TR_BASE, "H. TR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_GDTR_BASE, "H. GDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_IDTR_BASE, "H. IDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_ESP, "H. SYSENTER ESP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_EIP, "H. SYSENTER EIP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_RSP, "H. RSP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_RIP, "H. RIP");
+ DPRINTF("\n");
+}
+
+/*
+ * vmx_vcpu_dump_regs
+ *
+ * Debug function to print vcpu regs from the current vcpu
+ * note - vmcs for 'vcpu' must be on this pcpu.
+ *
+ * Parameters:
+ * vcpu - vcpu whose registers should be dumped
+ */
+void
+vmx_vcpu_dump_regs(struct vcpu *vcpu)
+{
+ uint32_t r;
+ int i;
+ struct vmx_msr_store *msr_store;
+
+ DPRINTF("vcpu @ %p\n", vcpu);
+ DPRINTF(" eax=0x%08x ebx=0x%08x ecx=0x%08x\n",
+ vcpu->vc_gueststate.vg_eax, vcpu->vc_gueststate.vg_ebx,
+ vcpu->vc_gueststate.vg_ecx);
+ DPRINTF(" edx=0x%08x ebp=0x%08x edi=0x%08x\n",
+ vcpu->vc_gueststate.vg_edx, vcpu->vc_gueststate.vg_ebp,
+ vcpu->vc_gueststate.vg_edi);
+ DPRINTF(" esi=0x%08x\n", vcpu->vc_gueststate.vg_esi);
+
+ DPRINTF(" eip=0x%08x rsp=", vcpu->vc_gueststate.vg_eip);
+ if (vmread(VMCS_GUEST_IA32_RSP, &r))
+ DPRINTF("(error reading)\n");
+ else
+ DPRINTF("0x%08x\n", r);
+
+ DPRINTF(" cr0=");
+ if (vmread(VMCS_GUEST_IA32_CR0, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%08x ", r);
+ vmm_decode_cr0(r);
+ }
+
+ DPRINTF(" cr2=0x%08x\n", vcpu->vc_gueststate.vg_cr2);
+
+ DPRINTF(" cr3=");
+ if (vmread(VMCS_GUEST_IA32_CR3, &r))
+ DPRINTF("(error reading)\n");
+ else
+ DPRINTF("0x%08x ", r);
+
+ DPRINTF(" cr4=");
+ if (vmread(VMCS_GUEST_IA32_CR4, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%08x ", r);
+ vmm_decode_cr4(r);
+ }
+
+ DPRINTF(" --Guest Segment Info--\n");
+
+ DPRINTF(" cs=");
+ if (vmread(VMCS_GUEST_IA32_CS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_CS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_CS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_CS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" ds=");
+ if (vmread(VMCS_GUEST_IA32_DS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_DS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_DS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_DS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" es=");
+ if (vmread(VMCS_GUEST_IA32_ES_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_ES_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_ES_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_ES_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" fs=");
+ if (vmread(VMCS_GUEST_IA32_FS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_FS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_FS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_FS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" gs=");
+ if (vmread(VMCS_GUEST_IA32_GS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_GS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_GS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_GS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" ss=");
+ if (vmread(VMCS_GUEST_IA32_SS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_SS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_SS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_SS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" tr=");
+ if (vmread(VMCS_GUEST_IA32_TR_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x", r);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_TR_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_TR_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_TR_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" gdtr base=");
+ if (vmread(VMCS_GUEST_IA32_GDTR_BASE, &r))
+ DPRINTF("(error reading) ");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_GDTR_LIMIT, &r))
+ DPRINTF("(error reading)\n");
+ else
+ DPRINTF("0x%08x\n", r);
+
+ DPRINTF(" idtr base=");
+ if (vmread(VMCS_GUEST_IA32_IDTR_BASE, &r))
+ DPRINTF("(error reading) ");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_IDTR_LIMIT, &r))
+ DPRINTF("(error reading)\n");
+ else
+ DPRINTF("0x%08x\n", r);
+
+ DPRINTF(" ldtr=");
+ if (vmread(VMCS_GUEST_IA32_LDTR_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x", r);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_LDTR_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_LDTR_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_LDTR_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" --Guest MSRs @ 0x%08x (paddr: 0x%08x)--\n",
+ (uint32_t)vcpu->vc_vmx_msr_exit_save_va,
+ (uint32_t)vcpu->vc_vmx_msr_exit_save_pa);
+
+ msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va;
+
+ for (i = 0; i < VMX_NUM_MSR_STORE; i++) {
+ DPRINTF(" MSR %d @ %p : 0x%08x (%s), "
+ "value=0x%016llx ",
+ i, &msr_store[i], msr_store[i].vms_index,
+ msr_name_decode(msr_store[i].vms_index),
+ msr_store[i].vms_data);
+ vmm_decode_msr_value(msr_store[i].vms_index,
+ msr_store[i].vms_data);
+ }
+
+ DPRINTF(" last PIC irq=%d\n", vcpu->vc_intr);
+}
+
+/*
+ * msr_name_decode
+ *
+ * Returns a human-readable name for the MSR supplied in 'msr'.
+ *
+ * Parameters:
+ * msr - The MSR to decode
+ *
+ * Return value:
+ * NULL-terminated character string containing the name of the MSR requested
+ */
+const char *
+msr_name_decode(uint32_t msr)
+{
+ /*
+ * Add as needed. Also consider adding a decode function when
+ * adding to this table.
+ */
+
+ switch (msr) {
+ case MSR_TSC: return "TSC";
+ case MSR_APICBASE: return "APIC base";
+ case MSR_IA32_FEATURE_CONTROL: return "IA32 feature control";
+ case MSR_PERFCTR0: return "perf counter 0";
+ case MSR_PERFCTR1: return "perf counter 1";
+ case MSR_TEMPERATURE_TARGET: return "temperature target";
+ case MSR_MTRRcap: return "MTRR cap";
+ case MSR_PERF_STATUS: return "perf status";
+ case MSR_PERF_CTL: return "perf control";
+ case MSR_MTRRvarBase: return "MTRR variable base";
+ case MSR_MTRRfix64K_00000: return "MTRR fixed 64K";
+ case MSR_MTRRfix16K_80000: return "MTRR fixed 16K";
+ case MSR_MTRRfix4K_C0000: return "MTRR fixed 4K";
+ case MSR_CR_PAT: return "PAT";
+ case MSR_MTRRdefType: return "MTRR default type";
+ case MSR_EFER: return "EFER";
+ case MSR_STAR: return "STAR";
+ case MSR_LSTAR: return "LSTAR";
+ case MSR_CSTAR: return "CSTAR";
+ case MSR_SFMASK: return "SFMASK";
+ case MSR_FSBASE: return "FSBASE";
+ case MSR_GSBASE: return "GSBASE";
+ case MSR_KERNELGSBASE: return "KGSBASE";
+ default: return "Unknown MSR";
+ }
+}
+
+/*
+ * vmm_segment_desc_decode
+ *
+ * Debug function to print segment information for supplied descriptor
+ *
+ * Parameters:
+ * val - The A/R bytes for the segment descriptor to decode
+ */
+void
+vmm_segment_desc_decode(uint32_t val)
+{
+ uint16_t ar;
+ uint8_t g, type, s, dpl, p, dib, l;
+ uint32_t unusable;
+
+ /* Exit early on unusable descriptors */
+ unusable = val & 0x10000;
+ if (unusable) {
+ DPRINTF("(unusable)\n");
+ return;
+ }
+
+ ar = (uint16_t)val;
+
+ g = (ar & 0x8000) >> 15;
+ dib = (ar & 0x4000) >> 14;
+ l = (ar & 0x2000) >> 13;
+ p = (ar & 0x80) >> 7;
+ dpl = (ar & 0x60) >> 5;
+ s = (ar & 0x10) >> 4;
+ type = (ar & 0xf);
+
+	DPRINTF("granularity=%d dib=%d l(64 bit)=%d present=%d dpl=%d sys=%d ",
+	    g, dib, l, p, dpl, s);
+
+ DPRINTF("type=");
+ if (!s) {
+ switch (type) {
+ case SDT_SYSLDT: DPRINTF("ldt\n"); break;
+ case SDT_SYS386TSS: DPRINTF("tss (available)\n"); break;
+ case SDT_SYS386BSY: DPRINTF("tss (busy)\n"); break;
+ case SDT_SYS386CGT: DPRINTF("call gate\n"); break;
+ case SDT_SYS386IGT: DPRINTF("interrupt gate\n"); break;
+ case SDT_SYS386TGT: DPRINTF("trap gate\n"); break;
+ /* XXX handle 32 bit segment types by inspecting mode */
+ default: DPRINTF("unknown");
+ }
+ } else {
+ switch (type + 16) {
+ case SDT_MEMRO: DPRINTF("data, r/o\n"); break;
+ case SDT_MEMROA: DPRINTF("data, r/o, accessed\n"); break;
+ case SDT_MEMRW: DPRINTF("data, r/w\n"); break;
+ case SDT_MEMRWA: DPRINTF("data, r/w, accessed\n"); break;
+ case SDT_MEMROD: DPRINTF("data, r/o, expand down\n"); break;
+ case SDT_MEMRODA: DPRINTF("data, r/o, expand down, "
+ "accessed\n");
+ break;
+ case SDT_MEMRWD: DPRINTF("data, r/w, expand down\n"); break;
+ case SDT_MEMRWDA: DPRINTF("data, r/w, expand down, "
+ "accessed\n");
+ break;
+ case SDT_MEME: DPRINTF("code, x only\n"); break;
+		case SDT_MEMEA: DPRINTF("code, x only, accessed\n"); break;
+ case SDT_MEMER: DPRINTF("code, r/x\n"); break;
+ case SDT_MEMERA: DPRINTF("code, r/x, accessed\n"); break;
+ case SDT_MEMEC: DPRINTF("code, x only, conforming\n"); break;
+ case SDT_MEMEAC: DPRINTF("code, x only, conforming, "
+ "accessed\n");
+ break;
+ case SDT_MEMERC: DPRINTF("code, r/x, conforming\n"); break;
+ case SDT_MEMERAC: DPRINTF("code, r/x, conforming, accessed\n");
+ break;
+ }
+ }
+}
+
+void
+vmm_decode_cr0(uint32_t cr0)
+{
+ struct vmm_reg_debug_info cr0_info[11] = {
+ { CR0_PG, "PG ", "pg " },
+ { CR0_CD, "CD ", "cd " },
+ { CR0_NW, "NW ", "nw " },
+ { CR0_AM, "AM ", "am " },
+ { CR0_WP, "WP ", "wp " },
+ { CR0_NE, "NE ", "ne " },
+ { CR0_ET, "ET ", "et " },
+ { CR0_TS, "TS ", "ts " },
+ { CR0_EM, "EM ", "em " },
+ { CR0_MP, "MP ", "mp " },
+ { CR0_PE, "PE", "pe" }
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 11; i++)
+ if (cr0 & cr0_info[i].vrdi_bit)
+ DPRINTF(cr0_info[i].vrdi_present);
+ else
+ DPRINTF(cr0_info[i].vrdi_absent);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_cr4(uint32_t cr4)
+{
+ struct vmm_reg_debug_info cr4_info[19] = {
+ { CR4_PKE, "PKE ", "pke "},
+ { CR4_SMAP, "SMAP ", "smap "},
+ { CR4_SMEP, "SMEP ", "smep "},
+ { CR4_OSXSAVE, "OSXSAVE ", "osxsave "},
+ { CR4_PCIDE, "PCIDE ", "pcide "},
+ { CR4_FSGSBASE, "FSGSBASE ", "fsgsbase "},
+ { CR4_SMXE, "SMXE ", "smxe "},
+ { CR4_VMXE, "VMXE ", "vmxe "},
+ { CR4_OSXMMEXCPT, "OSXMMEXCPT ", "osxmmexcpt "},
+ { CR4_OSFXSR, "OSFXSR ", "osfxsr "},
+ { CR4_PCE, "PCE ", "pce "},
+ { CR4_PGE, "PGE ", "pge "},
+ { CR4_MCE, "MCE ", "mce "},
+ { CR4_PAE, "PAE ", "pae "},
+ { CR4_PSE, "PSE ", "pse "},
+ { CR4_DE, "DE ", "de "},
+ { CR4_TSD, "TSD ", "tsd "},
+ { CR4_PVI, "PVI ", "pvi "},
+ { CR4_VME, "VME", "vme"}
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 19; i++)
+ if (cr4 & cr4_info[i].vrdi_bit)
+ DPRINTF(cr4_info[i].vrdi_present);
+ else
+ DPRINTF(cr4_info[i].vrdi_absent);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_apicbase_msr_value(uint64_t apicbase)
+{
+ struct vmm_reg_debug_info apicbase_info[3] = {
+ { APICBASE_BSP, "BSP ", "bsp "},
+ { APICBASE_ENABLE_X2APIC, "X2APIC ", "x2apic "},
+ { APICBASE_GLOBAL_ENABLE, "GLB_EN", "glb_en"}
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 3; i++)
+ if (apicbase & apicbase_info[i].vrdi_bit)
+ DPRINTF(apicbase_info[i].vrdi_present);
+ else
+ DPRINTF(apicbase_info[i].vrdi_absent);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_ia32_fc_value(uint64_t fcr)
+{
+ struct vmm_reg_debug_info fcr_info[4] = {
+ { IA32_FEATURE_CONTROL_LOCK, "LOCK ", "lock "},
+ { IA32_FEATURE_CONTROL_SMX_EN, "SMX ", "smx "},
+ { IA32_FEATURE_CONTROL_VMX_EN, "VMX ", "vmx "},
+ { IA32_FEATURE_CONTROL_SENTER_EN, "SENTER ", "senter "}
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 4; i++)
+ if (fcr & fcr_info[i].vrdi_bit)
+ DPRINTF(fcr_info[i].vrdi_present);
+ else
+ DPRINTF(fcr_info[i].vrdi_absent);
+
+ if (fcr & IA32_FEATURE_CONTROL_SENTER_EN)
+ DPRINTF(" [SENTER param = 0x%llx]",
+ (fcr & IA32_FEATURE_CONTROL_SENTER_PARAM_MASK) >> 8);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_mtrrcap_value(uint64_t val)
+{
+ struct vmm_reg_debug_info mtrrcap_info[3] = {
+ { MTRRcap_FIXED, "FIXED ", "fixed "},
+ { MTRRcap_WC, "WC ", "wc "},
+ { MTRRcap_SMRR, "SMRR ", "smrr "}
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 3; i++)
+ if (val & mtrrcap_info[i].vrdi_bit)
+ DPRINTF(mtrrcap_info[i].vrdi_present);
+ else
+ DPRINTF(mtrrcap_info[i].vrdi_absent);
+
+ if (val & MTRRcap_FIXED)
+ DPRINTF(" [nr fixed ranges = 0x%llx]",
+ (val & 0xff));
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_perf_status_value(uint64_t val)
+{
+ DPRINTF("(pstate ratio = 0x%llx)\n", (val & 0xffff));
+}
+
+void
+vmm_decode_perf_ctl_value(uint64_t val)
+{
+	DPRINTF("(%s ", (val & PERF_CTL_TURBO) ? "TURBO" : "turbo");
+	DPRINTF("pstate req = 0x%llx)\n", (val & 0xffff));
+}
+
+void
+vmm_decode_mtrrdeftype_value(uint64_t mtrrdeftype)
+{
+ struct vmm_reg_debug_info mtrrdeftype_info[2] = {
+ { MTRRdefType_FIXED_ENABLE, "FIXED ", "fixed "},
+ { MTRRdefType_ENABLE, "ENABLED ", "enabled "},
+ };
+
+ uint8_t i;
+ int type;
+
+ DPRINTF("(");
+ for (i = 0; i < 2; i++)
+ if (mtrrdeftype & mtrrdeftype_info[i].vrdi_bit)
+ DPRINTF(mtrrdeftype_info[i].vrdi_present);
+ else
+ DPRINTF(mtrrdeftype_info[i].vrdi_absent);
+
+ DPRINTF("type = ");
+ type = mtrr2mrt(mtrrdeftype & 0xff);
+ switch (type) {
+ case MDF_UNCACHEABLE: DPRINTF("UC"); break;
+ case MDF_WRITECOMBINE: DPRINTF("WC"); break;
+ case MDF_WRITETHROUGH: DPRINTF("WT"); break;
+ case MDF_WRITEPROTECT: DPRINTF("RO"); break;
+ case MDF_WRITEBACK: DPRINTF("WB"); break;
+ case MDF_UNKNOWN:
+ default:
+ DPRINTF("??");
+ break;
+ }
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_efer_value(uint64_t efer)
+{
+ struct vmm_reg_debug_info efer_info[4] = {
+ { EFER_SCE, "SCE ", "sce "},
+ { EFER_LME, "LME ", "lme "},
+ { EFER_LMA, "LMA ", "lma "},
+ { EFER_NXE, "NXE", "nxe"},
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 4; i++)
+ if (efer & efer_info[i].vrdi_bit)
+ DPRINTF(efer_info[i].vrdi_present);
+ else
+ DPRINTF(efer_info[i].vrdi_absent);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_msr_value(uint64_t msr, uint64_t val)
+{
+ switch (msr) {
+ case MSR_APICBASE: vmm_decode_apicbase_msr_value(val); break;
+ case MSR_IA32_FEATURE_CONTROL: vmm_decode_ia32_fc_value(val); break;
+ case MSR_MTRRcap: vmm_decode_mtrrcap_value(val); break;
+ case MSR_PERF_STATUS: vmm_decode_perf_status_value(val); break;
+ case MSR_PERF_CTL: vmm_decode_perf_ctl_value(val); break;
+ case MSR_MTRRdefType: vmm_decode_mtrrdeftype_value(val); break;
+ case MSR_EFER: vmm_decode_efer_value(val); break;
+ default: DPRINTF("\n");
+ }
+}
+#endif /* VMM_DEBUG */
diff --git a/sys/arch/i386/i386/vmm_support.S b/sys/arch/i386/i386/vmm_support.S
new file mode 100644
index 00000000000..54d41349586
--- /dev/null
+++ b/sys/arch/i386/i386/vmm_support.S
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "assym.h"
+#include <machine/asm.h>
+#include <machine/specialreg.h>
+
+/*
+ * XXX duplicated in vmmvar.h due to song-and-dance with sys/rwlock.h inclusion
+ * here
+ */
+#define VMX_FAIL_LAUNCH_UNKNOWN 1
+#define VMX_FAIL_LAUNCH_INVALID_VMCS 2
+#define VMX_FAIL_LAUNCH_VALID_VMCS 3
+
+ .text
+ .code32
+ .align 16
+ .global _C_LABEL(vmxon)
+ .global _C_LABEL(vmxoff)
+ .global _C_LABEL(vmclear)
+ .global _C_LABEL(vmptrld)
+ .global _C_LABEL(vmptrst)
+ .global _C_LABEL(vmwrite)
+ .global _C_LABEL(vmread)
+ .global _C_LABEL(invvpid)
+ .global _C_LABEL(invept)
+ .global _C_LABEL(vmx_enter_guest)
+ .global _C_LABEL(vmm_dispatch_intr)
+
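+/*
+ * vmm_dispatch_intr
+ *
+ * Synthesizes an interrupt frame (SS, ESP, EFLAGS, CS; the indirect call
+ * below supplies EIP) on an aligned stack and calls the host interrupt
+ * handler whose address is passed as the first argument, so that the
+ * handler's iret returns here.
+ */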
+_C_LABEL(vmm_dispatch_intr):
+ movl %esp, %eax
+ andl $0xFFFFFFF0, %esp
+ pushl %ss
+ pushl %eax
+ pushfl
+ pushl %cs
+ cli
+ movl 4(%eax), %eax
+ calll *%eax
+ addl $0x8, %esp
+ ret
+
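+/*
+ * The stubs below (vmxon through vmread) each wrap a single VMX
+ * instruction and return 0 on success, or 1 if the instruction failed
+ * (ZF or CF set afterwards).
+ */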
+_C_LABEL(vmxon):
+ movl 4(%esp), %eax
+ vmxon (%eax)
+ jz failed_on
+ jc failed_on
+ xorl %eax, %eax
+ ret
+failed_on:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmxoff):
+ vmxoff
+ jz failed_off
+ jc failed_off
+ xorl %eax, %eax
+ ret
+failed_off:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmclear):
+ movl 0x04(%esp), %eax
+ vmclear (%eax)
+ jz failed_clear
+ jc failed_clear
+ xorl %eax, %eax
+ ret
+failed_clear:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmptrld):
+ movl 4(%esp), %eax
+ vmptrld (%eax)
+ jz failed_ptrld
+ jc failed_ptrld
+ xorl %eax, %eax
+ ret
+failed_ptrld:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmptrst):
+ movl 0x04(%esp), %eax
+ vmptrst (%eax)
+ jz failed_ptrst
+ jc failed_ptrst
+ xorl %eax, %eax
+ ret
+failed_ptrst:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmwrite):
+ movl 0x04(%esp), %eax
+ vmwrite 0x08(%esp), %eax
+ jz failed_write
+ jc failed_write
+ xorl %eax, %eax
+ ret
+failed_write:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmread):
+ pushl %ebx
+ movl 0x08(%esp), %ebx
+ movl 0x0c(%esp), %eax
+ vmread %ebx, (%eax)
+ jz failed_read
+ jc failed_read
+ popl %ebx
+ xorl %eax, %eax
+ ret
+failed_read:
+ popl %ebx
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(invvpid):
+ pushl %ebx
+ movl 0x08(%esp), %eax
+ movl 0x0c(%esp), %ebx
+ invvpid (%ebx), %eax
+ popl %ebx
+ ret
+
+_C_LABEL(invept):
+ movl 0x04(%esp), %eax
+ invept 0x08(%esp), %eax
+ ret
+
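+/*
+ * vmx_enter_guest
+ *
+ * Saves host state, loads the guest registers from the supplied register
+ * area and enters the guest with VMLAUNCH (initial entry) or VMRESUME.
+ * Returns 0 after a normal VM exit, or a VMX_FAIL_LAUNCH_* code if the
+ * entry itself failed.
+ */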
+_C_LABEL(vmx_enter_guest):
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 0x14(%esp), %edx /* Guest Regs Pointer */
+ movl 0x18(%esp), %ebx /* resume flag */
+ testl %ebx, %ebx
+ jnz skip_init
+
+ /*
+ * XXX make vmx_exit_handler a global and put this in the per-vcpu
+ * init code
+ */
+ movl $VMCS_HOST_IA32_RIP, %eax
+ movl $vmx_exit_handler_asm, %ecx
+ vmwrite %ecx, %eax
+
+skip_init:
+ pushfl
+
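+	/* Save the host TR selector and segment registers for restore_host */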
+ strw %ax
+ pushw %ax
+ movw %es, %ax
+ pushw %ax
+ movw %ds, %ax
+ pushw %ax
+ movw %ss, %ax
+ pushw %ax
+ pushw %fs
+ pushw %gs
+
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+ pushl %edx /* Guest Regs Pointer */
+
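+	/* Record the current stack pointer as the host RSP in the VMCS */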
+ movl $VMCS_HOST_IA32_RSP, %edi
+ movl %esp, %eax
+ vmwrite %eax, %edi
+
+ testl %ebx, %ebx
+ jnz do_resume
+
+ /* Restore guest registers */
+ movl 0x1c(%edx), %eax
+ movl %eax, %cr2
+ movl 0x18(%edx), %ebp
+ movl 0x14(%edx), %edi
+ movl 0x0c(%edx), %ecx
+ movl 0x08(%edx), %ebx
+ movl 0x04(%edx), %eax
+ movl (%edx), %esi
+ movl 0x10(%edx), %edx
+
+ vmlaunch
+ jmp fail_launch_or_resume
+do_resume:
+ /* Restore guest registers */
+ movl 0x1c(%edx), %eax
+ movl %eax, %cr2
+ movl 0x18(%edx), %ebp
+ movl 0x14(%edx), %edi
+ movl 0x0c(%edx), %ecx
+ movl 0x08(%edx), %ebx
+ movl 0x04(%edx), %eax
+ movl (%edx), %esi
+ movl 0x10(%edx), %edx
+ vmresume
+fail_launch_or_resume:
+ /* Failed launch/resume (fell through) */
+ jc fail_launch_invalid_vmcs /* Invalid VMCS */
+ jz fail_launch_valid_vmcs /* Valid VMCS, failed launch/resume */
+
+ /* Unknown failure mode (not documented as per Intel SDM) */
+ movl $VMX_FAIL_LAUNCH_UNKNOWN, %eax
+ popl %edx
+ jmp restore_host
+
+fail_launch_invalid_vmcs:
+ movl $VMX_FAIL_LAUNCH_INVALID_VMCS, %eax
+ popl %edx
+ jmp restore_host
+
+fail_launch_valid_vmcs:
+ movl $VMCS_INSTRUCTION_ERROR, %edi
+ popl %edx
+ vmread %edi, %eax
+ /* XXX check failure of vmread */
+ movl %eax, 0x20(%edx)
+ movl $VMX_FAIL_LAUNCH_VALID_VMCS, %eax
+ jmp restore_host
+
+vmx_exit_handler_asm:
+ /* Preserve guest registers not saved in VMCS */
+ pushl %esi
+ pushl %edi
+ movl 0x8(%esp), %edi
+ movl 0x4(%esp), %esi
+ movl %esi, (%edi)
+ popl %edi
+ popl %esi /* discard */
+
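+	/* Recover the guest register area pointer, save remaining GPRs + CR2 */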
+ popl %esi
+ movl %eax, 0x4(%esi)
+ movl %ebx, 0x8(%esi)
+ movl %ecx, 0xc(%esi)
+ movl %edx, 0x10(%esi)
+ movl %edi, 0x14(%esi)
+ movl %ebp, 0x18(%esi)
+ movl %cr2, %eax
+ movl %eax, 0x1c(%esi)
+
+restore_host:
+ popl %edi
+ popl %esi
+ popl %ebp
+
+ popw %gs
+ popw %fs
+ popw %ax
+ movw %ax, %ss
+ popw %ax
+ movw %ax, %ds
+ popw %ax
+ movw %ax, %es
+ xorl %ecx, %ecx
+ popw %cx
+
+ popfl
+
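+	/*
+	 * Reload the host TR: clear the busy bit in the TSS descriptor's
+	 * type byte (offset 5) first, since ltr faults on a busy TSS.
+	 */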
+ movl 0x1c(%esp), %ebx
+ leal (%ebx, %ecx), %eax
+ andb $0xF9, 5(%eax)
+ ltr %cx
+
+ popl %edx
+ popl %ecx
+ popl %ebx
+
+ xorl %eax, %eax
+
+ ret
diff --git a/sys/arch/i386/include/cpu.h b/sys/arch/i386/include/cpu.h
index 58b823d64ab..3c140f26cd3 100644
--- a/sys/arch/i386/include/cpu.h
+++ b/sys/arch/i386/include/cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.h,v 1.149 2016/10/14 04:53:26 mlarkin Exp $ */
+/* $OpenBSD: cpu.h,v 1.150 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: cpu.h,v 1.35 1996/05/05 19:29:26 christos Exp $ */
/*-
@@ -69,6 +69,36 @@
struct intrsource;
+/* VMXON region (Intel) */
+struct vmxon_region {
+ uint32_t vr_revision;
+};
+
+/*
+ * VMX for Intel CPUs
+ */
+struct vmx {
+ uint64_t vmx_cr0_fixed0;
+ uint64_t vmx_cr0_fixed1;
+ uint64_t vmx_cr4_fixed0;
+ uint64_t vmx_cr4_fixed1;
+ uint32_t vmx_vmxon_revision;
+ uint32_t vmx_msr_table_size;
+ uint32_t vmx_cr3_tgt_count;
+ uint64_t vmx_vm_func;
+};
+
+/*
+ * SVM for AMD CPUs
+ */
+struct svm {
+};
+
+union vmm_cpu_cap {
+ struct vmx vcc_vmx;
+ struct svm vcc_svm;
+};
+
#ifdef _KERNEL
/* XXX stuff to move to cpuvar.h later */
struct cpu_info {
@@ -158,6 +188,14 @@ struct cpu_info {
#ifdef GPROF
struct gmonparam *ci_gmon;
#endif
+ u_int32_t ci_vmm_flags;
+#define CI_VMM_VMX (1 << 0)
+#define CI_VMM_SVM (1 << 1)
+#define CI_VMM_RVI (1 << 2)
+#define CI_VMM_EPT (1 << 3)
+ union vmm_cpu_cap ci_vmm_cap;
+ uint64_t ci_vmxon_region_pa; /* Must be 64 bit */
+ struct vmxon_region *ci_vmxon_region;
};
/*
@@ -177,6 +215,7 @@ struct cpu_info {
#define CPUF_PRESENT 0x1000 /* CPU is present */
#define CPUF_RUNNING 0x2000 /* CPU is running */
+#define CPUF_VMM 0x4000 /* CPU is executing in VMM mode */
/*
* We statically allocate the CPU info for the primary CPU (or,
diff --git a/sys/arch/i386/include/intrdefs.h b/sys/arch/i386/include/intrdefs.h
index 0384febd3f8..fba06ef79e9 100644
--- a/sys/arch/i386/include/intrdefs.h
+++ b/sys/arch/i386/include/intrdefs.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: intrdefs.h,v 1.14 2013/05/16 19:26:04 kettenis Exp $ */
+/* $OpenBSD: intrdefs.h,v 1.15 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: intrdefs.h,v 1.2 2003/05/04 22:01:56 fvdl Exp $ */
#ifndef _I386_INTRDEFS_H
@@ -115,13 +115,16 @@
#define I386_IPI_GDT 0x00000020
#define I386_IPI_DDB 0x00000040 /* synchronize while in ddb */
#define I386_IPI_SETPERF 0x00000080
+#define I386_IPI_START_VMM 0x00000100
+#define I386_IPI_STOP_VMM 0x00000200
-#define I386_NIPI 8
+#define I386_NIPI 10
#define I386_IPI_NAMES { "halt IPI", "nop IPI", "FPU flush IPI", \
"FPU synch IPI", \
"MTRR update IPI", "GDT update IPI", \
- "DDB IPI", "setperf IPI" }
+ "DDB IPI", "setperf IPI", "VMM start IPI", \
+ "VMM stop IPI" }
#define IREENT_MAGIC 0x18041969
diff --git a/sys/arch/i386/include/pmap.h b/sys/arch/i386/include/pmap.h
index 1614b117cab..8751e11be56 100644
--- a/sys/arch/i386/include/pmap.h
+++ b/sys/arch/i386/include/pmap.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.h,v 1.82 2016/03/15 03:17:51 guenther Exp $ */
+/* $OpenBSD: pmap.h,v 1.83 2016/10/21 06:20:59 mlarkin Exp $ */
/* $NetBSD: pmap.h,v 1.44 2000/04/24 17:18:18 thorpej Exp $ */
/*
@@ -88,6 +88,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */
* page list, and number of PTPs within the pmap.
*/
+#define PMAP_TYPE_NORMAL 1
+#define PMAP_TYPE_EPT 2
+#define PMAP_TYPE_RVI 3
+#define pmap_nested(pm) ((pm)->pm_type != PMAP_TYPE_NORMAL)
+
struct pmap {
uint64_t pm_pdidx[4]; /* PDIEs for PAE mode */
@@ -106,6 +111,10 @@ struct pmap {
int pm_flags; /* see below */
struct segment_descriptor pm_codeseg; /* cs descriptor for process */
+ int pm_type; /* Type of pmap this is (PMAP_TYPE_x) */
+ vaddr_t pm_npt_pml4; /* Nested paging PML4 VA */
+ paddr_t pm_npt_pa; /* Nested paging PML4 PA */
+ vaddr_t pm_npt_pdpt; /* Nested paging PDPT */
};
/*
@@ -246,6 +255,7 @@ void pmap_switch(struct proc *, struct proc *);
vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
paddr_t vtophys(vaddr_t va);
paddr_t vtophys_pae(vaddr_t va);
+int pmap_convert(struct pmap *, int);
extern u_int32_t (*pmap_pte_set_p)(vaddr_t, paddr_t, u_int32_t);
extern u_int32_t (*pmap_pte_setbits_p)(vaddr_t, u_int32_t, u_int32_t);
diff --git a/sys/arch/i386/include/pte.h b/sys/arch/i386/include/pte.h
index c0e1ccfb83d..aa9b62341d6 100644
--- a/sys/arch/i386/include/pte.h
+++ b/sys/arch/i386/include/pte.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: pte.h,v 1.21 2015/04/12 18:37:54 mlarkin Exp $ */
+/* $OpenBSD: pte.h,v 1.22 2016/10/21 06:20:59 mlarkin Exp $ */
/* $NetBSD: pte.h,v 1.11 1998/02/06 21:58:05 thorpej Exp $ */
/*
@@ -67,6 +67,13 @@
#define PG_AVAIL3 0x00000800 /* ignored by hardware */
#define PG_PATLG 0x00001000 /* PAT on large pages */
+/* EPT PTE bits */
+#define EPT_R (1ULL << 0)
+#define EPT_W (1ULL << 1)
+#define EPT_X (1ULL << 2)
+#define EPT_WB (6ULL << 3)
+#define EPT_PS (1ULL << 7)
+
/* Cacheability bits when we are using PAT */
#define PG_WB (0) /* The default */
#define PG_WC (PG_WT) /* WT and CD is WC */
diff --git a/sys/arch/i386/include/specialreg.h b/sys/arch/i386/include/specialreg.h
index 8bfd61b766e..aa02392022b 100644
--- a/sys/arch/i386/include/specialreg.h
+++ b/sys/arch/i386/include/specialreg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: specialreg.h,v 1.57 2016/09/03 13:35:03 mlarkin Exp $ */
+/* $OpenBSD: specialreg.h,v 1.58 2016/10/21 06:20:59 mlarkin Exp $ */
/* $NetBSD: specialreg.h,v 1.7 1994/10/27 04:16:26 cgd Exp $ */
/*-
@@ -69,6 +69,12 @@
/* the remaining 7 bits of this register are reserved */
/*
+ * bits in CR3
+ */
+#define CR3_PWT (1ULL << 3)
+#define CR3_PCD (1ULL << 4)
+
+/*
* bits in the pentiums %cr4 register:
*/
@@ -91,6 +97,7 @@
#define CR4_OSXSAVE 0x00040000 /* enable XSAVE and extended states */
#define CR4_SMEP 0x00100000 /* supervisor mode exec protection */
#define CR4_SMAP 0x00200000 /* supervisor mode access prevention */
+#define CR4_PKE 0x00400000 /* protection key enable */
/*
* CPUID "features" bits (CPUID function 0x1):
@@ -296,14 +303,20 @@
#define P5MSR_CTR0 0x012 /* P5 only (trap on P6) */
#define P5MSR_CTR1 0x013 /* P5 only (trap on P6) */
#define MSR_APICBASE 0x01b
+#define APICBASE_BSP 0x100
+#define APICBASE_ENABLE_X2APIC 0x400
+#define APICBASE_GLOBAL_ENABLE 0x800
#define MSR_EBL_CR_POWERON 0x02a
#define MSR_EBC_FREQUENCY_ID 0x02c /* Pentium 4 only */
#define MSR_TEST_CTL 0x033
+#define MSR_IA32_FEATURE_CONTROL 0x03a
#define MSR_BIOS_UPDT_TRIG 0x079
#define MSR_BBL_CR_D0 0x088 /* PII+ only */
#define MSR_BBL_CR_D1 0x089 /* PII+ only */
#define MSR_BBL_CR_D2 0x08a /* PII+ only */
#define MSR_BIOS_SIGN 0x08b
+#define MSR_PERFCTR0 0x0c1
+#define MSR_PERFCTR1 0x0c2
#define P6MSR_CTR0 0x0c1
#define P6MSR_CTR1 0x0c2
#define MSR_FSB_FREQ 0x0cd /* Core Duo/Solo only */
@@ -422,6 +435,7 @@
#define EFER_LME 0x00000100 /* Long Mode Active */
#define EFER_LMA 0x00000400 /* Long Mode Enabled */
#define EFER_NXE 0x00000800 /* No-Execute Enabled */
+#define EFER_SVME 0x00001000 /* SVM Enabled */
#define MSR_STAR 0xc0000081 /* 32 bit syscall gate addr */
#define MSR_LSTAR 0xc0000082 /* 64 bit syscall gate addr */
@@ -688,3 +702,354 @@
#define C3_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */
#define C3_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */
#define C3_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */
+
+/*
+ * VMX
+ */
+#define IA32_FEATURE_CONTROL_LOCK 0x01
+#define IA32_FEATURE_CONTROL_SMX_EN 0x02
+#define IA32_FEATURE_CONTROL_VMX_EN 0x04
+#define IA32_FEATURE_CONTROL_SENTER_EN (1ULL << 15)
+#define IA32_FEATURE_CONTROL_SENTER_PARAM_MASK 0x7f00
+#define IA32_VMX_BASIC 0x480
+#define IA32_VMX_PINBASED_CTLS 0x481
+#define IA32_VMX_PROCBASED_CTLS 0x482
+#define IA32_VMX_EXIT_CTLS 0x483
+#define IA32_VMX_ENTRY_CTLS 0x484
+#define IA32_VMX_MISC 0x485
+#define IA32_VMX_CR0_FIXED0 0x486
+#define IA32_VMX_CR0_FIXED1 0x487
+#define IA32_VMX_CR4_FIXED0 0x488
+#define IA32_VMX_CR4_FIXED1 0x489
+#define IA32_VMX_PROCBASED2_CTLS 0x48B
+#define IA32_VMX_EPT_VPID_CAP 0x48C
+#define IA32_VMX_TRUE_PINBASED_CTLS 0x48D
+#define IA32_VMX_TRUE_PROCBASED_CTLS 0x48E
+#define IA32_VMX_TRUE_EXIT_CTLS 0x48F
+#define IA32_VMX_TRUE_ENTRY_CTLS 0x490
+#define IA32_VMX_VMFUNC 0x491
+
+#define IA32_EPT_VPID_CAP_PAGE_WALK_4 (1ULL << 6)
+#define IA32_EPT_VPID_CAP_WB (1ULL << 14)
+#define IA32_EPT_VPID_CAP_AD_BITS (1ULL << 21)
+
+#define IA32_EPT_PAGING_CACHE_TYPE_UC 0x0
+#define IA32_EPT_PAGING_CACHE_TYPE_WB 0x6
+#define IA32_EPT_AD_BITS_ENABLE (1ULL << 6)
+#define IA32_EPT_PAGE_WALK_LENGTH 0x4
+
+/* VMX : IA32_VMX_BASIC bits */
+#define IA32_VMX_TRUE_CTLS_AVAIL (1ULL << 55)
+
+/* VMX : IA32_VMX_PINBASED_CTLS bits */
+#define IA32_VMX_EXTERNAL_INT_EXITING (1ULL << 0)
+#define IA32_VMX_NMI_EXITING (1ULL << 3)
+#define IA32_VMX_VIRTUAL_NMIS (1ULL << 5)
+#define IA32_VMX_ACTIVATE_VMX_PREEMPTION_TIMER (1ULL << 6)
+#define IA32_VMX_PROCESS_POSTED_INTERRUPTS (1ULL << 7)
+
+/* VMX : IA32_VMX_PROCBASED_CTLS bits */
+#define IA32_VMX_INTERRUPT_WINDOW_EXITING (1ULL << 2)
+#define IA32_VMX_USE_TSC_OFFSETTING (1ULL << 3)
+#define IA32_VMX_HLT_EXITING (1ULL << 7)
+#define IA32_VMX_INVLPG_EXITING (1ULL << 9)
+#define IA32_VMX_MWAIT_EXITING (1ULL << 10)
+#define IA32_VMX_RDPMC_EXITING (1ULL << 11)
+#define IA32_VMX_RDTSC_EXITING (1ULL << 12)
+#define IA32_VMX_CR3_LOAD_EXITING (1ULL << 15)
+#define IA32_VMX_CR3_STORE_EXITING (1ULL << 16)
+#define IA32_VMX_CR8_LOAD_EXITING (1ULL << 19)
+#define IA32_VMX_CR8_STORE_EXITING (1ULL << 20)
+#define IA32_VMX_USE_TPR_SHADOW (1ULL << 21)
+#define IA32_VMX_NMI_WINDOW_EXITING (1ULL << 22)
+#define IA32_VMX_MOV_DR_EXITING (1ULL << 23)
+#define IA32_VMX_UNCONDITIONAL_IO_EXITING (1ULL << 24)
+#define IA32_VMX_USE_IO_BITMAPS (1ULL << 25)
+#define IA32_VMX_MONITOR_TRAP_FLAG (1ULL << 27)
+#define IA32_VMX_USE_MSR_BITMAPS (1ULL << 28)
+#define IA32_VMX_MONITOR_EXITING (1ULL << 29)
+#define IA32_VMX_PAUSE_EXITING (1ULL << 30)
+#define IA32_VMX_ACTIVATE_SECONDARY_CONTROLS (1ULL << 31)
+
+/* VMX : IA32_VMX_PROCBASED2_CTLS bits */
+#define IA32_VMX_VIRTUALIZE_APIC (1ULL << 0)
+#define IA32_VMX_ENABLE_EPT (1ULL << 1)
+#define IA32_VMX_DESCRIPTOR_TABLE_EXITING (1ULL << 2)
+#define IA32_VMX_ENABLE_RDTSCP (1ULL << 3)
+#define IA32_VMX_VIRTUALIZE_X2APIC_MODE (1ULL << 4)
+#define IA32_VMX_ENABLE_VPID (1ULL << 5)
+#define IA32_VMX_WBINVD_EXITING (1ULL << 6)
+#define IA32_VMX_UNRESTRICTED_GUEST (1ULL << 7)
+#define IA32_VMX_APIC_REGISTER_VIRTUALIZATION (1ULL << 8)
+#define IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY (1ULL << 9)
+#define IA32_VMX_PAUSE_LOOP_EXITING (1ULL << 10)
+#define IA32_VMX_RDRAND_EXITING (1ULL << 11)
+#define IA32_VMX_ENABLE_INVPCID (1ULL << 12)
+#define IA32_VMX_ENABLE_VM_FUNCTIONS (1ULL << 13)
+#define IA32_VMX_VMCS_SHADOWING (1ULL << 14)
+#define IA32_VMX_ENABLE_ENCLS_EXITING (1ULL << 15)
+#define IA32_VMX_RDSEED_EXITING (1ULL << 16)
+#define IA32_VMX_ENABLE_PML (1ULL << 17)
+#define IA32_VMX_EPT_VIOLATION_VE (1ULL << 18)
+#define IA32_VMX_CONCEAL_VMX_FROM_PT (1ULL << 19)
+#define IA32_VMX_ENABLE_XSAVES_XRSTORS (1ULL << 20)
+#define IA32_VMX_ENABLE_TSC_SCALING (1ULL << 25)
+
+/* VMX : IA32_VMX_EXIT_CTLS bits */
+#define IA32_VMX_SAVE_DEBUG_CONTROLS (1ULL << 2)
+#define IA32_VMX_HOST_SPACE_ADDRESS_SIZE (1ULL << 9)
+#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT (1ULL << 12)
+#define IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT (1ULL << 15)
+#define IA32_VMX_SAVE_IA32_PAT_ON_EXIT (1ULL << 18)
+#define IA32_VMX_LOAD_IA32_PAT_ON_EXIT (1ULL << 19)
+#define IA32_VMX_SAVE_IA32_EFER_ON_EXIT (1ULL << 20)
+#define IA32_VMX_LOAD_IA32_EFER_ON_EXIT (1ULL << 21)
+#define IA32_VMX_SAVE_VMX_PREEMPTION_TIMER (1ULL << 22)
+#define IA32_VMX_CLEAR_IA32_BNDCFGS_ON_EXIT (1ULL << 23)
+#define IA32_VMX_CONCEAL_VM_EXITS_FROM_PT (1ULL << 24)
+
+/* VMX: IA32_VMX_ENTRY_CTLS bits */
+#define IA32_VMX_LOAD_DEBUG_CONTROLS (1ULL << 2)
+#define IA32_VMX_IA32E_MODE_GUEST (1ULL << 9)
+#define IA32_VMX_ENTRY_TO_SMM (1ULL << 10)
+#define IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT (1ULL << 11)
+#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY (1ULL << 13)
+#define IA32_VMX_LOAD_IA32_PAT_ON_ENTRY (1ULL << 14)
+#define IA32_VMX_LOAD_IA32_EFER_ON_ENTRY (1ULL << 15)
+#define IA32_VMX_LOAD_IA32_BNDCFGS_ON_ENTRY (1ULL << 16)
+#define IA32_VMX_CONCEAL_VM_ENTRIES_FROM_PT (1ULL << 17)
+
+/*
+ * VMX : VMCS Fields
+ */
+
+/* 16-bit control fields */
+#define VMCS_GUEST_VPID 0x0000
+#define VMCS_POSTED_INT_NOTIF_VECTOR 0x0002
+#define VMCS_EPTP_INDEX 0x0004
+
+/* 16-bit guest state fields */
+#define VMCS_GUEST_IA32_ES_SEL 0x0800
+#define VMCS_GUEST_IA32_CS_SEL 0x0802
+#define VMCS_GUEST_IA32_SS_SEL 0x0804
+#define VMCS_GUEST_IA32_DS_SEL 0x0806
+#define VMCS_GUEST_IA32_FS_SEL 0x0808
+#define VMCS_GUEST_IA32_GS_SEL 0x080A
+#define VMCS_GUEST_IA32_LDTR_SEL 0x080C
+#define VMCS_GUEST_IA32_TR_SEL 0x080E
+#define VMCS_GUEST_INTERRUPT_STATUS 0x0810
+#define VMCS_GUEST_PML_INDEX 0x0812
+
+/* 16-bit host state fields */
+#define VMCS_HOST_IA32_ES_SEL 0x0C00
+#define VMCS_HOST_IA32_CS_SEL 0x0C02
+#define VMCS_HOST_IA32_SS_SEL 0x0C04
+#define VMCS_HOST_IA32_DS_SEL 0x0C06
+#define VMCS_HOST_IA32_FS_SEL 0x0C08
+#define VMCS_HOST_IA32_GS_SEL 0x0C0A
+#define VMCS_HOST_IA32_TR_SEL 0x0C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x2000
+#define VMCS_IO_BITMAP_B 0x2002
+#define VMCS_MSR_BITMAP_ADDRESS 0x2004
+#define VMCS_MSR_BITMAP_ADDRESS_HI 0x2005
+#define VMCS_EXIT_STORE_MSR_ADDRESS 0x2006
+#define VMCS_EXIT_STORE_MSR_ADDRESS_HI 0x2007
+#define VMCS_EXIT_LOAD_MSR_ADDRESS 0x2008
+#define VMCS_EXIT_LOAD_MSR_ADDRESS_HI 0x2009
+#define VMCS_ENTRY_LOAD_MSR_ADDRESS 0x200A
+#define VMCS_ENTRY_LOAD_MSR_ADDRESS_HI 0x200B
+#define VMCS_EXECUTIVE_VMCS_POINTER 0x200C
+#define VMCS_PML_ADDRESS 0x200E
+#define VMCS_TSC_OFFSET 0x2010
+#define VMCS_VIRTUAL_APIC_ADDRESS 0x2012
+#define VMCS_APIC_ACCESS_ADDRESS 0x2014
+#define VMCS_POSTED_INTERRUPT_DESC 0x2016
+#define VMCS_VM_FUNCTION_CONTROLS 0x2018
+#define VMCS_GUEST_IA32_EPTP 0x201A
+#define VMCS_GUEST_IA32_EPTP_HI 0x201B
+#define VMCS_EOI_EXIT_BITMAP_0 0x201C
+#define VMCS_EOI_EXIT_BITMAP_1 0x201E
+#define VMCS_EOI_EXIT_BITMAP_2 0x2020
+#define VMCS_EOI_EXIT_BITMAP_3 0x2022
+#define VMCS_EPTP_LIST_ADDRESS 0x2024
+#define VMCS_VMREAD_BITMAP_ADDRESS 0x2026
+#define VMCS_VMWRITE_BITMAP_ADDRESS 0x2028
+#define VMCS_VIRTUALIZATION_EXC_ADDRESS 0x202A
+#define VMCS_XSS_EXITING_BITMAP 0x202C
+#define VMCS_ENCLS_EXITING_BITMAP 0x202E
+#define VMCS_TSC_MULTIPLIER 0x2032
+
+/* 64-bit RO data field */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x2400
+#define VMCS_GUEST_PHYSICAL_ADDRESS_HI 0x2401
+
+/* 64-bit guest state fields */
+#define VMCS_LINK_POINTER 0x2800
+#define VMCS_LINK_POINTER_HI 0x2801
+#define VMCS_GUEST_IA32_DEBUGCTL 0x2802
+#define VMCS_GUEST_IA32_PAT 0x2804
+#define VMCS_GUEST_IA32_EFER 0x2806
+#define VMCS_GUEST_IA32_PERF_GBL_CTRL 0x2808
+#define VMCS_GUEST_PDPTE0 0x280A
+#define VMCS_GUEST_PDPTE1 0x280C
+#define VMCS_GUEST_PDPTE2 0x280E
+#define VMCS_GUEST_PDPTE3 0x2810
+#define VMCS_GUEST_IA32_BNDCFGS 0x2812
+
+/* 64-bit host state fields */
+#define VMCS_HOST_IA32_PAT 0x2C00
+#define VMCS_HOST_IA32_EFER 0x2C02
+#define VMCS_HOST_IA32_PERF_GBL_CTRL 0x2C04
+
+/* 32-bit control fields */
+#define VMCS_PINBASED_CTLS 0x4000
+#define VMCS_PROCBASED_CTLS 0x4002
+#define VMCS_EXCEPTION_BITMAP 0x4004
+#define VMCS_PF_ERROR_CODE_MASK 0x4006
+#define VMCS_PF_ERROR_CODE_MATCH 0x4008
+#define VMCS_CR3_TARGET_COUNT 0x400A
+#define VMCS_EXIT_CTLS 0x400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x4010
+#define VMCS_ENTRY_CTLS 0x4012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x4014
+#define VMCS_ENTRY_INTERRUPTION_INFO 0x4016
+#define VMCS_ENTRY_EXCEPTION_ERROR_CODE 0x4018
+#define VMCS_ENTRY_INSTRUCTION_LENGTH 0x401A
+#define VMCS_TPR_THRESHOLD 0x401C
+#define VMCS_PROCBASED2_CTLS 0x401E
+#define VMCS_PLE_GAP 0x4020
+#define VMCS_PLE_WINDOW 0x4022
+
+/* 32-bit RO data fields */
+#define VMCS_INSTRUCTION_ERROR 0x4400
+#define VMCS_EXIT_REASON 0x4402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x4404
+#define VMCS_EXIT_INTERRUPTION_ERR_CODE 0x4406
+#define VMCS_IDT_VECTORING_INFO 0x4408
+#define VMCS_IDT_VECTORING_ERROR_CODE 0x440A
+#define VMCS_INSTRUCTION_LENGTH 0x440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x440E
+
+/* 32-bit guest state fields */
+#define VMCS_GUEST_IA32_ES_LIMIT 0x4800
+#define VMCS_GUEST_IA32_CS_LIMIT 0x4802
+#define VMCS_GUEST_IA32_SS_LIMIT 0x4804
+#define VMCS_GUEST_IA32_DS_LIMIT 0x4806
+#define VMCS_GUEST_IA32_FS_LIMIT 0x4808
+#define VMCS_GUEST_IA32_GS_LIMIT 0x480A
+#define VMCS_GUEST_IA32_LDTR_LIMIT 0x480C
+#define VMCS_GUEST_IA32_TR_LIMIT 0x480E
+#define VMCS_GUEST_IA32_GDTR_LIMIT 0x4810
+#define VMCS_GUEST_IA32_IDTR_LIMIT 0x4812
+#define VMCS_GUEST_IA32_ES_AR 0x4814
+#define VMCS_GUEST_IA32_CS_AR 0x4816
+#define VMCS_GUEST_IA32_SS_AR 0x4818
+#define VMCS_GUEST_IA32_DS_AR 0x481A
+#define VMCS_GUEST_IA32_FS_AR 0x481C
+#define VMCS_GUEST_IA32_GS_AR 0x481E
+#define VMCS_GUEST_IA32_LDTR_AR 0x4820
+#define VMCS_GUEST_IA32_TR_AR 0x4822
+#define VMCS_GUEST_INTERRUPTIBILITY_ST 0x4824
+#define VMCS_GUEST_ACTIVITY_STATE 0x4826
+#define VMCS_GUEST_SMBASE 0x4828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x482A
+#define VMCS_VMX_PREEMPTION_TIMER_VAL 0x482E
+
+/* 32-bit host state field */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x4C00
+
+/* Natural-width control fields */
+#define VMCS_CR0_MASK 0x6000
+#define VMCS_CR4_MASK 0x6002
+#define VMCS_CR0_READ_SHADOW 0x6004
+#define VMCS_CR4_READ_SHADOW 0x6006
+#define VMCS_CR3_TARGET_0 0x6008
+#define VMCS_CR3_TARGET_1 0x600A
+#define VMCS_CR3_TARGET_2 0x600C
+#define VMCS_CR3_TARGET_3 0x600E
+
+/* Natural-width RO fields */
+#define VMCS_GUEST_EXIT_QUALIFICATION 0x6400
+#define VMCS_IO_RCX 0x6402
+#define VMCS_IO_RSI 0x6404
+#define VMCS_IO_RDI 0x6406
+#define VMCS_IO_RIP 0x6408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x640A
+
+/* Natural-width guest state fields */
+#define VMCS_GUEST_IA32_CR0 0x6800
+#define VMCS_GUEST_IA32_CR3 0x6802
+#define VMCS_GUEST_IA32_CR4 0x6804
+#define VMCS_GUEST_IA32_ES_BASE 0x6806
+#define VMCS_GUEST_IA32_CS_BASE 0x6808
+#define VMCS_GUEST_IA32_SS_BASE 0x680A
+#define VMCS_GUEST_IA32_DS_BASE 0x680C
+#define VMCS_GUEST_IA32_FS_BASE 0x680E
+#define VMCS_GUEST_IA32_GS_BASE 0x6810
+#define VMCS_GUEST_IA32_LDTR_BASE 0x6812
+#define VMCS_GUEST_IA32_TR_BASE 0x6814
+#define VMCS_GUEST_IA32_GDTR_BASE 0x6816
+#define VMCS_GUEST_IA32_IDTR_BASE 0x6818
+#define VMCS_GUEST_IA32_DR7 0x681A
+#define VMCS_GUEST_IA32_RSP 0x681C
+#define VMCS_GUEST_IA32_RIP 0x681E
+#define VMCS_GUEST_IA32_RFLAGS 0x6820
+#define VMCS_GUEST_PENDING_DBG_EXC 0x6822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x6824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x6826
+
+/* Natural-width host state fields */
+#define VMCS_HOST_IA32_CR0 0x6C00
+#define VMCS_HOST_IA32_CR3 0x6C02
+#define VMCS_HOST_IA32_CR4 0x6C04
+#define VMCS_HOST_IA32_FS_BASE 0x6C06
+#define VMCS_HOST_IA32_GS_BASE 0x6C08
+#define VMCS_HOST_IA32_TR_BASE 0x6C0A
+#define VMCS_HOST_IA32_GDTR_BASE 0x6C0C
+#define VMCS_HOST_IA32_IDTR_BASE 0x6C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x6C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x6C12
+#define VMCS_HOST_IA32_RSP 0x6C14
+#define VMCS_HOST_IA32_RIP 0x6C16
+
+#define IA32_VMX_INVVPID_INDIV_ADDR_CTX 0x0
+#define IA32_VMX_INVVPID_SINGLE_CTX 0x1
+#define IA32_VMX_INVVPID_ALL_CTX 0x2
+#define IA32_VMX_INVVPID_SINGLE_CTX_GLB 0x3
+
+#define IA32_VMX_INVEPT_SINGLE_CTX 0x1
+#define IA32_VMX_INVEPT_GLOBAL_CTX 0x2
+
+#define IA32_VMX_EPT_FAULT_READ (1ULL << 0)
+#define IA32_VMX_EPT_FAULT_WRITE (1ULL << 1)
+#define IA32_VMX_EPT_FAULT_EXEC (1ULL << 2)
+
+#define IA32_VMX_EPT_FAULT_WAS_READABLE (1ULL << 3)
+#define IA32_VMX_EPT_FAULT_WAS_WRITABLE (1ULL << 4)
+#define IA32_VMX_EPT_FAULT_WAS_EXECABLE (1ULL << 5)
+
+#define IA32_VMX_MSR_LIST_SIZE_MASK (7ULL << 25)
+#define IA32_VMX_CR3_TGT_SIZE_MASK (0x1FFULL << 16)
+
+/*
+ * SVM
+ */
+#define MSR_AMD_VM_CR 0xc0010114
+#define CPUID_AMD_SVM_CAP 0x8000000A
+#define AMD_SVMDIS 0x10
+#define AMD_SVM_NESTED_PAGING_CAP (1 << 0)
+
+/*
+ * PAT
+ */
+#define PATENTRY(n, type) ((uint64_t)type << ((n) * 8))
+#define PAT_UC 0x0UL
+#define PAT_WC 0x1UL
+#define PAT_WT 0x4UL
+#define PAT_WP 0x5UL
+#define PAT_WB 0x6UL
+#define PAT_UCMINUS 0x7UL
+
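(Illustrative sketch, not part of this diff: the IA32_FEATURE_CONTROL bits defined above are normally checked before attempting VMXON. The function below is an assumption about the shape of that test, not the actual logic in vmm.c.)

/*
 * Hypothetical example: firmware must either leave the feature-control MSR
 * unlocked or lock it with VMX-outside-SMX enabled, otherwise VMXON faults.
 */
#include <machine/cpufunc.h>	/* rdmsr() */
#include <machine/specialreg.h>

static int
vmx_allowed_by_feature_control(void)
{
	uint64_t msr = rdmsr(MSR_IA32_FEATURE_CONTROL);

	if (msr & IA32_FEATURE_CONTROL_LOCK)
		return ((msr & IA32_FEATURE_CONTROL_VMX_EN) != 0);

	/* Unlocked: the kernel may set and lock the bits itself. */
	return (1);
}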
diff --git a/sys/arch/i386/include/vmmvar.h b/sys/arch/i386/include/vmmvar.h
new file mode 100644
index 00000000000..4b8edf7756b
--- /dev/null
+++ b/sys/arch/i386/include/vmmvar.h
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * CPU capabilities for VMM operation
+ */
+#ifndef _MACHINE_VMMVAR_H_
+#define _MACHINE_VMMVAR_H_
+
+#define VMM_HV_SIGNATURE "OpenBSDVMM58"
+
+#define VMM_MAX_MEM_RANGES 16
+#define VMM_MAX_DISKS_PER_VM 2
+#define VMM_MAX_PATH_DISK 128
+#define VMM_MAX_NAME_LEN 32
+#define VMM_MAX_KERNEL_PATH 128
+#define VMM_MAX_VCPUS_PER_VM 64
+#define VMM_MAX_VM_MEM_SIZE 2048
+#define VMM_MAX_NICS_PER_VM 2
+
+#define VMM_PCI_MMIO_BAR_BASE 0xF0000000
+#define VMM_PCI_MMIO_BAR_END 0xF0FFFFFF
+#define VMM_PCI_MMIO_BAR_SIZE 0x00010000
+#define VMM_PCI_IO_BAR_BASE 0x1000
+#define VMM_PCI_IO_BAR_END 0xFFFF
+#define VMM_PCI_IO_BAR_SIZE 0x1000
+
+/* VMX: Basic Exit Reasons */
+#define VMX_EXIT_NMI 0
+#define VMX_EXIT_EXTINT 1
+#define VMX_EXIT_TRIPLE_FAULT 2
+#define VMX_EXIT_INIT 3
+#define VMX_EXIT_SIPI 4
+#define VMX_EXIT_IO_SMI 5
+#define VMX_EXIT_OTHER_SMI 6
+#define VMX_EXIT_INT_WINDOW 7
+#define VMX_EXIT_NMI_WINDOW 8
+#define VMX_EXIT_TASK_SWITCH 9
+#define VMX_EXIT_CPUID 10
+#define VMX_EXIT_GETSEC 11
+#define VMX_EXIT_HLT 12
+#define VMX_EXIT_INVD 13
+#define VMX_EXIT_INVLPG 14
+#define VMX_EXIT_RDPMC 15
+#define VMX_EXIT_RDTSC 16
+#define VMX_EXIT_RSM 17
+#define VMX_EXIT_VMCALL 18
+#define VMX_EXIT_VMCLEAR 19
+#define VMX_EXIT_VMLAUNCH 20
+#define VMX_EXIT_VMPTRLD 21
+#define VMX_EXIT_VMPTRST 22
+#define VMX_EXIT_VMREAD 23
+#define VMX_EXIT_VMRESUME 24
+#define VMX_EXIT_VMWRITE 25
+#define VMX_EXIT_VMXOFF 26
+#define VMX_EXIT_VMXON 27
+#define VMX_EXIT_CR_ACCESS 28
+#define VMX_EXIT_MOV_DR 29
+#define VMX_EXIT_IO 30
+#define VMX_EXIT_RDMSR 31
+#define VMX_EXIT_WRMSR 32
+#define VMX_EXIT_ENTRY_FAILED_GUEST_STATE 33
+#define VMX_EXIT_ENTRY_FAILED_MSR_LOAD 34
+#define VMX_EXIT_MWAIT 36
+#define VMX_EXIT_MTF 37
+#define VMX_EXIT_MONITOR 39
+#define VMX_EXIT_PAUSE 40
+#define VMX_EXIT_ENTRY_FAILED_MCE 41
+#define VMX_EXIT_TPR_BELOW_THRESHOLD 43
+#define VMX_EXIT_APIC_ACCESS 44
+#define VMX_EXIT_VIRTUALIZED_EOI 45
+#define VMX_EXIT_GDTR_IDTR 46
+#define VMX_EXIT_LDTR_TR 47
+#define VMX_EXIT_EPT_VIOLATION 48
+#define VMX_EXIT_EPT_MISCONFIGURATION 49
+#define VMX_EXIT_INVEPT 50
+#define VMX_EXIT_RDTSCP 51
+#define VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED 52
+#define VMX_EXIT_INVVPID 53
+#define VMX_EXIT_WBINVD 54
+#define VMX_EXIT_XSETBV 55
+#define VMX_EXIT_APIC_WRITE 56
+#define VMX_EXIT_RDRAND 57
+#define VMX_EXIT_INVPCID 58
+#define VMX_EXIT_VMFUNC 59
+#define VMX_EXIT_RDSEED 61
+#define VMX_EXIT_XSAVES 63
+#define VMX_EXIT_XRSTORS 64
+
+/*
+ * VMX: Misc defines
+ */
+#define VMX_MAX_CR3_TARGETS 256
+
+#define VM_EXIT_TERMINATED 0xFFFE
+#define VM_EXIT_NONE 0xFFFF
+
+/*
+ * VCPU state values. Note that there is a conversion function in vmm.c
+ * (vcpu_state_decode) that converts these to human readable strings,
+ * so this enum and vcpu_state_decode should be kept in sync.
+ */
+enum {
+ VCPU_STATE_STOPPED,
+ VCPU_STATE_RUNNING,
+ VCPU_STATE_REQTERM,
+ VCPU_STATE_TERMINATED,
+ VCPU_STATE_UNKNOWN,
+};
+
+enum {
+ VEI_DIR_OUT,
+ VEI_DIR_IN
+};
+
+/*
+ * vm exit data
+ * vm_exit_inout : describes an IN/OUT exit
+ */
+struct vm_exit_inout {
+ uint8_t vei_size; /* Size of access */
+ uint8_t vei_dir; /* Direction */
+ uint8_t vei_rep; /* REP prefix? */
+ uint8_t vei_string; /* string variety? */
+ uint8_t vei_encoding; /* operand encoding */
+ uint16_t vei_port; /* port */
+ uint32_t vei_data; /* data (for IN insns) */
+};
+
+union vm_exit {
+ struct vm_exit_inout vei; /* IN/OUT exit */
+};
+
+/*
+ * struct vcpu_segment_info describes a segment + selector set, used
+ * in constructing the initial vcpu register content
+ */
+struct vcpu_segment_info {
+ uint16_t vsi_sel;
+ uint32_t vsi_limit;
+ uint32_t vsi_ar;
+ uint32_t vsi_base;
+};
+
+#define VCPU_REGS_EAX 0
+#define VCPU_REGS_EBX 1
+#define VCPU_REGS_ECX 2
+#define VCPU_REGS_EDX 3
+#define VCPU_REGS_ESI 4
+#define VCPU_REGS_EDI 5
+#define VCPU_REGS_ESP 6
+#define VCPU_REGS_EBP 7
+#define VCPU_REGS_EIP 8
+#define VCPU_REGS_EFLAGS 9
+#define VCPU_REGS_NGPRS (VCPU_REGS_EFLAGS + 1)
+
+#define VCPU_REGS_CR0 0
+#define VCPU_REGS_CR2 1
+#define VCPU_REGS_CR3 2
+#define VCPU_REGS_CR4 3
+#define VCPU_REGS_CR8 4
+#define VCPU_REGS_NCRS (VCPU_REGS_CR8 + 1)
+
+#define VCPU_REGS_CS 0
+#define VCPU_REGS_DS 1
+#define VCPU_REGS_ES 2
+#define VCPU_REGS_FS 3
+#define VCPU_REGS_GS 4
+#define VCPU_REGS_SS 5
+#define VCPU_REGS_LDTR 6
+#define VCPU_REGS_TR 7
+#define VCPU_REGS_NSREGS (VCPU_REGS_TR + 1)
+
+struct vcpu_reg_state {
+ uint32_t vrs_gprs[VCPU_REGS_NGPRS];
+ uint32_t vrs_crs[VCPU_REGS_NCRS];
+ struct vcpu_segment_info vrs_sregs[VCPU_REGS_NSREGS];
+ struct vcpu_segment_info vrs_gdtr;
+ struct vcpu_segment_info vrs_idtr;
+};
+
+struct vm_mem_range {
+ paddr_t vmr_gpa;
+ vaddr_t vmr_va;
+ size_t vmr_size;
+};
+
+struct vm_create_params {
+ /* Input parameters to VMM_IOC_CREATE */
+ size_t vcp_nmemranges;
+ size_t vcp_ncpus;
+ size_t vcp_ndisks;
+ size_t vcp_nnics;
+ struct vm_mem_range vcp_memranges[VMM_MAX_MEM_RANGES];
+ char vcp_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_PATH_DISK];
+ char vcp_name[VMM_MAX_NAME_LEN];
+ char vcp_kernel[VMM_MAX_KERNEL_PATH];
+ uint8_t vcp_macs[VMM_MAX_NICS_PER_VM][6];
+
+ /* Output parameter from VMM_IOC_CREATE */
+ uint32_t vcp_id;
+};
+
+struct vm_run_params {
+ /* Input parameters to VMM_IOC_RUN */
+ uint32_t vrp_vm_id;
+ uint32_t vrp_vcpu_id;
+ uint8_t vrp_continue; /* Continuing from an exit */
+ uint16_t vrp_irq; /* IRQ to inject */
+
+ /* Input/output parameter to VMM_IOC_RUN */
+ union vm_exit *vrp_exit; /* updated exit data */
+
+ /* Output parameter from VMM_IOC_RUN */
+ uint16_t vrp_exit_reason; /* exit reason */
+ uint8_t vrp_irqready; /* ready for IRQ on entry */
+};
+
+struct vm_info_result {
+ /* Output parameters from VMM_IOC_INFO */
+ size_t vir_memory_size;
+ size_t vir_used_size;
+ size_t vir_ncpus;
+ uint8_t vir_vcpu_state[VMM_MAX_VCPUS_PER_VM];
+ pid_t vir_creator_pid;
+ uint32_t vir_id;
+ char vir_name[VMM_MAX_NAME_LEN];
+};
+
+struct vm_info_params {
+ /* Input parameters to VMM_IOC_INFO */
+ size_t vip_size; /* Output buffer size */
+
+ /* Output Parameters from VMM_IOC_INFO */
+ size_t vip_info_ct; /* # of entries returned */
+ struct vm_info_result *vip_info; /* Output buffer */
+};
+
+struct vm_terminate_params {
+ /* Input parameters to VMM_IOC_TERM */
+ uint32_t vtp_vm_id;
+};
+
+struct vm_resetcpu_params {
+ /* Input parameters to VMM_IOC_RESETCPU */
+ uint32_t vrp_vm_id;
+ uint32_t vrp_vcpu_id;
+ struct vcpu_reg_state vrp_init_state;
+};
+
+struct vm_intr_params {
+ /* Input parameters to VMM_IOC_INTR */
+ uint32_t vip_vm_id;
+ uint32_t vip_vcpu_id;
+ uint16_t vip_intr;
+};
+
+#define VM_RWREGS_GPRS 0x1 /* read/write GPRs */
+#define VM_RWREGS_SREGS 0x2 /* read/write segment registers */
+#define VM_RWREGS_CRS 0x4 /* read/write CRs */
+#define VM_RWREGS_ALL (VM_RWREGS_GPRS | VM_RWREGS_SREGS | VM_RWREGS_CRS)
+
+struct vm_rwregs_params {
+ uint32_t vrwp_vm_id;
+ uint32_t vrwp_vcpu_id;
+ uint64_t vrwp_mask;
+ struct vcpu_reg_state vrwp_regs;
+};
+
+/* IOCTL definitions */
+#define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
+#define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
+#define VMM_IOC_INFO _IOWR('V', 3, struct vm_info_params) /* Get VM Info */
+#define VMM_IOC_TERM _IOW('V', 4, struct vm_terminate_params) /* Terminate VM */
+#define VMM_IOC_RESETCPU _IOW('V', 5, struct vm_resetcpu_params) /* Reset */
+#define VMM_IOC_INTR _IOW('V', 6, struct vm_intr_params) /* Intr pending */
+#define VMM_IOC_READREGS _IOWR('V', 7, struct vm_rwregs_params) /* Get registers */
+#define VMM_IOC_WRITEREGS _IOW('V', 8, struct vm_rwregs_params) /* Set registers */
+
+#ifdef _KERNEL
+
+#define VMX_FAIL_LAUNCH_UNKNOWN 1
+#define VMX_FAIL_LAUNCH_INVALID_VMCS 2
+#define VMX_FAIL_LAUNCH_VALID_VMCS 3
+
+#define VMX_NUM_MSR_STORE 0
+// #define VMX_NUM_MSR_STORE 1
+
+/* MSR bitmap manipulation macros */
+#define MSRIDX(m) ((m) / 8)
+#define MSRBIT(m) (1 << (m) % 8)
+
+enum {
+ VMM_MODE_UNKNOWN,
+ VMM_MODE_VMX,
+ VMM_MODE_EPT,
+ VMM_MODE_SVM,
+ VMM_MODE_RVI
+};
+
+enum {
+ VMM_MEM_TYPE_REGULAR,
+ VMM_MEM_TYPE_UNKNOWN
+};
+
+/* Forward declarations */
+struct vm;
+
+/*
+ * Implementation-specific cpu state
+ */
+struct vmcb {
+};
+
+struct vmcs {
+ uint32_t vmcs_revision;
+};
+
+struct vmx_invvpid_descriptor
+{
+ uint64_t vid_vpid; // : 16;
+ uint64_t vid_addr;
+};
+
+struct vmx_invept_descriptor
+{
+ uint64_t vid_eptp;
+ uint64_t vid_reserved;
+};
+
+struct vmx_msr_store
+{
+ uint64_t vms_index : 32;
+ uint64_t vms_data;
+};
+
+/*
+ * Storage for guest registers not preserved in VMCS and various exit
+ * information.
+ *
+ * Note that vmx_enter_guest depends on the layout of this struct for
+ * field access.
+ */
+struct vmx_gueststate
+{
+ /* %esi should be first */
+ uint32_t vg_esi; /* 0x00 */
+ uint32_t vg_eax; /* 0x04 */
+ uint32_t vg_ebx; /* 0x08 */
+ uint32_t vg_ecx; /* 0x0c */
+ uint32_t vg_edx; /* 0x10 */
+ uint32_t vg_edi; /* 0x14 */
+ uint32_t vg_ebp; /* 0x18 */
+ uint32_t vg_cr2; /* 0x1c */
+ uint32_t vg_eip; /* 0x20 */
+ uint32_t vg_exit_reason; /* 0x24 */
+ uint32_t vg_eflags; /* 0x28 */
+};
+
+/*
+ * Virtual Machine
+ */
+struct vm;
+
+/*
+ * Virtual CPU
+ */
+struct vcpu {
+ /* VMCS / VMCB pointer */
+ vaddr_t vc_control_va;
+ uint64_t vc_control_pa;
+
+ /* VLAPIC pointer */
+ vaddr_t vc_vlapic_va;
+ uint64_t vc_vlapic_pa;
+
+ /* MSR bitmap address */
+ vaddr_t vc_msr_bitmap_va;
+ uint64_t vc_msr_bitmap_pa;
+
+ struct vm *vc_parent;
+ uint32_t vc_id;
+ u_int vc_state;
+ SLIST_ENTRY(vcpu) vc_vcpu_link;
+ vaddr_t vc_hsa_stack_va;
+
+ uint8_t vc_virt_mode;
+
+ struct cpu_info *vc_last_pcpu;
+ union vm_exit vc_exit;
+
+ uint16_t vc_intr;
+ uint8_t vc_irqready;
+
+ /* VMX only */
+ uint64_t vc_vmx_basic;
+ uint64_t vc_vmx_entry_ctls;
+ uint64_t vc_vmx_true_entry_ctls;
+ uint64_t vc_vmx_exit_ctls;
+ uint64_t vc_vmx_true_exit_ctls;
+ uint64_t vc_vmx_pinbased_ctls;
+ uint64_t vc_vmx_true_pinbased_ctls;
+ uint64_t vc_vmx_procbased_ctls;
+ uint64_t vc_vmx_true_procbased_ctls;
+ uint64_t vc_vmx_procbased2_ctls;
+ struct vmx_gueststate vc_gueststate;
+ vaddr_t vc_vmx_msr_exit_save_va;
+ paddr_t vc_vmx_msr_exit_save_pa;
+ vaddr_t vc_vmx_msr_exit_load_va;
+ paddr_t vc_vmx_msr_exit_load_pa;
+ vaddr_t vc_vmx_msr_entry_load_va;
+ paddr_t vc_vmx_msr_entry_load_pa;
+};
+
+SLIST_HEAD(vcpu_head, vcpu);
+
+void vmm_dispatch_intr(vaddr_t);
+int vmxon(uint64_t *);
+int vmxoff(void);
+int vmclear(uint64_t *);
+int vmptrld(uint64_t *);
+int vmptrst(uint64_t *);
+int vmwrite(uint32_t, uint32_t);
+int vmread(uint32_t, uint32_t *);
+void invvpid(uint32_t, struct vmx_invvpid_descriptor *);
+void invept(uint32_t, struct vmx_invept_descriptor *);
+int vmx_enter_guest(uint64_t *, struct vmx_gueststate *, int, vaddr_t);
+void start_vmm_on_cpu(struct cpu_info *);
+void stop_vmm_on_cpu(struct cpu_info *);
+
+#endif /* _KERNEL */
+
+#endif /* ! _MACHINE_VMMVAR_H_ */
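(Illustrative sketch, not part of this diff: a minimal userland caller of the ioctl interface declared above, assuming the device node is /dev/vmm as on amd64; the forthcoming userland changes may differ.)

/*
 * Hypothetical example: report how many VMs are running via VMM_IOC_INFO.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmmvar.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct vm_info_result res[4];
	struct vm_info_params vip;
	int fd;

	if ((fd = open("/dev/vmm", O_RDWR)) == -1)
		err(1, "open");

	vip.vip_size = sizeof(res);
	vip.vip_info = res;
	vip.vip_info_ct = 0;

	if (ioctl(fd, VMM_IOC_INFO, &vip) == -1)
		err(1, "VMM_IOC_INFO");

	printf("%zu VM(s) running\n", vip.vip_info_ct);
	return (0);
}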