author     Mike Larkin <mlarkin@cvs.openbsd.org>    2016-10-21 06:21:00 +0000
committer  Mike Larkin <mlarkin@cvs.openbsd.org>    2016-10-21 06:21:00 +0000
commit     cae887b67e44c408266479082c32423bcc360289
tree       f56b330a85d48f2e7e9325252035d3c87a847b20 /sys/arch
parent     d66165444be3ed84f18a8d5a2fe5afc088beb166
vmm(4) for i386. Userland changes forthcoming. Note that for the time being,
i386 hosts are limited to running only i386 guests, even if the underlying
hardware supports amd64. This is a restriction I hope to lift moving forward,
but for now please don't report problems running amd64 guests on i386 hosts.
This was a straightforward port of the in-tree amd64 code plus the old rotted
tree I had from last year for i386 support. Changes included converting 64-bit
VMREAD/VMWRITE ops to 2x32-bit ops, and fixing treatment of the TSS, which
differs on i386.
ok deraadt@
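For a concrete picture of the "2x32-bit ops" conversion mentioned above: on amd64 a 64-bit VMCS field such as the EPT pointer is read or written with a single VMREAD/VMWRITE, while on i386 the low and high 32-bit field encodings have to be accessed separately, as the new vcpu_reset_regs_vmx() in the diff does with VMCS_GUEST_IA32_EPTP and VMCS_GUEST_IA32_EPTP_HI. A minimal sketch of that pattern, assuming the 32-bit vmwrite() prototype used throughout this diff (nonzero return on failure); the helper name write_vmcs64() is hypothetical and not part of the commit:

/*
 * Illustrative sketch only: split a 64-bit VMCS field write into two
 * 32-bit vmwrite() calls against the "low" and "high" field encodings,
 * the same pattern used inline for VMCS_GUEST_IA32_EPTP /
 * VMCS_GUEST_IA32_EPTP_HI in vcpu_reset_regs_vmx() below.
 */
static int
write_vmcs64(uint32_t field_lo, uint32_t field_hi, uint64_t val)
{
	if (vmwrite(field_lo, (uint32_t)(val & 0xFFFFFFFFUL)))
		return (EINVAL);
	if (vmwrite(field_hi, (uint32_t)(val >> 32)))
		return (EINVAL);
	return (0);
}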
Diffstat (limited to 'sys/arch')
-rw-r--r-- | sys/arch/i386/conf/GENERIC | 3
-rw-r--r-- | sys/arch/i386/conf/Makefile.i386 | 4
-rw-r--r-- | sys/arch/i386/conf/files.i386 | 10
-rw-r--r-- | sys/arch/i386/i386/conf.c | 14
-rw-r--r-- | sys/arch/i386/i386/cpu.c | 34
-rw-r--r-- | sys/arch/i386/i386/ipifuncs.c | 30
-rw-r--r-- | sys/arch/i386/i386/machdep.c | 114
-rw-r--r-- | sys/arch/i386/i386/mainbus.c | 8
-rw-r--r-- | sys/arch/i386/i386/pmap.c | 24
-rw-r--r-- | sys/arch/i386/i386/pmapae.c | 64
-rw-r--r-- | sys/arch/i386/i386/vmm.c | 5433
-rw-r--r-- | sys/arch/i386/i386/vmm_support.S | 291
-rw-r--r-- | sys/arch/i386/include/cpu.h | 41
-rw-r--r-- | sys/arch/i386/include/intrdefs.h | 9
-rw-r--r-- | sys/arch/i386/include/pmap.h | 12
-rw-r--r-- | sys/arch/i386/include/pte.h | 9
-rw-r--r-- | sys/arch/i386/include/specialreg.h | 367
-rw-r--r-- | sys/arch/i386/include/vmmvar.h | 446
18 files changed, 6887 insertions, 26 deletions
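Most of the new code is in vmm.c below, which exposes /dev/vmm as a character device driven entirely by ioctls (VMM_IOC_CREATE, VMM_IOC_RUN, VMM_IOC_TERM, and so on). Since the matching userland changes are still forthcoming, here is a hypothetical sketch of how a caller might create a VM using the structures defined in the new vmmvar.h; the device node, ioctl name and vcp_* field names are taken from this diff, while everything else (sizes, buffer setup, error handling) is illustrative only.

/*
 * Hypothetical userland sketch; the real consumer (vmd) is not part of
 * this commit. Creates one single-vcpu VM backed by 16 MB of anonymous
 * memory.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#include <machine/vmmvar.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define GUEST_MEMSIZE	(16 * 1024 * 1024)	/* page-aligned, 1 MB multiple */

int
main(void)
{
	struct vm_create_params vcp;
	void *guestmem;
	int fd;

	if ((fd = open("/dev/vmm", O_RDWR)) == -1)
		err(1, "open /dev/vmm");

	/* Page-aligned backing memory in our own address space */
	guestmem = mmap(NULL, GUEST_MEMSIZE, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (guestmem == MAP_FAILED)
		err(1, "mmap");

	memset(&vcp, 0, sizeof(vcp));
	vcp.vcp_ncpus = 1;		/* vm_create() is UP-only for now */
	strlcpy(vcp.vcp_name, "testvm", sizeof(vcp.vcp_name));
	vcp.vcp_nmemranges = 1;
	vcp.vcp_memranges[0].vmr_gpa = 0;
	vcp.vcp_memranges[0].vmr_va = (unsigned long)guestmem;
	vcp.vcp_memranges[0].vmr_size = GUEST_MEMSIZE;

	if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1)
		err(1, "VMM_IOC_CREATE");

	printf("created vm %u\n", vcp.vcp_id);
	close(fd);
	return (0);
}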
diff --git a/sys/arch/i386/conf/GENERIC b/sys/arch/i386/conf/GENERIC index 808917af519..9e3d2a265be 100644 --- a/sys/arch/i386/conf/GENERIC +++ b/sys/arch/i386/conf/GENERIC @@ -1,4 +1,4 @@ -# $OpenBSD: GENERIC,v 1.823 2016/09/12 08:28:44 mpi Exp $ +# $OpenBSD: GENERIC,v 1.824 2016/10/21 06:20:58 mlarkin Exp $ # # For further information on compiling OpenBSD kernels, see the config(8) # man page. @@ -79,6 +79,7 @@ isa0 at gscpcib? isa0 at glxpcib? eisa0 at mainbus0 pci* at mainbus0 +vmm0 at mainbus0 pchb* at pci? # PCI-Host bridges ppb* at pci? # PCI-PCI bridges diff --git a/sys/arch/i386/conf/Makefile.i386 b/sys/arch/i386/conf/Makefile.i386 index 18f05560470..13b1b7cf8a2 100644 --- a/sys/arch/i386/conf/Makefile.i386 +++ b/sys/arch/i386/conf/Makefile.i386 @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile.i386,v 1.97 2016/10/15 13:45:08 deraadt Exp $ +# $OpenBSD: Makefile.i386,v 1.98 2016/10/21 06:20:58 mlarkin Exp $ # For instructions on building kernels consult the config(8) and options(4) # manual pages. @@ -149,7 +149,7 @@ db_structinfo.h: $S/ddb/db_structinfo.c $S/ddb/parse_structinfo.pl rm -f db_structinfo.o locore.o: ${_machdir}/${_mach}/locore.s assym.h -in_cksum.o mptramp.o kvm86call.o acpi_wakecode.o: assym.h +in_cksum.o mptramp.o kvm86call.o acpi_wakecode.o vmm_support.o: assym.h # The install target can be redefined by putting a # install-kernel-${MACHINE_NAME} target into /etc/mk.conf diff --git a/sys/arch/i386/conf/files.i386 b/sys/arch/i386/conf/files.i386 index 7f1ef1eb725..efb759667b0 100644 --- a/sys/arch/i386/conf/files.i386 +++ b/sys/arch/i386/conf/files.i386 @@ -1,4 +1,4 @@ -# $OpenBSD: files.i386,v 1.229 2016/02/28 15:46:18 naddy Exp $ +# $OpenBSD: files.i386,v 1.230 2016/10/21 06:20:58 mlarkin Exp $ # # new style config file for i386 architecture # @@ -389,6 +389,14 @@ file arch/i386/i386/acpi_machdep.c acpi file arch/i386/i386/acpi_wakecode.S acpi & !small_kernel # +# VMM +# +device vmm {} +attach vmm at mainbus +file arch/i386/i386/vmm.c vmm needs-flag +file arch/i386/i386/vmm_support.S vmm + +# # IPMI # attach ipmi at mainbus diff --git a/sys/arch/i386/i386/conf.c b/sys/arch/i386/i386/conf.c index 812d82d8550..1622e6a90eb 100644 --- a/sys/arch/i386/i386/conf.c +++ b/sys/arch/i386/i386/conf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: conf.c,v 1.157 2016/09/04 10:51:23 naddy Exp $ */ +/* $OpenBSD: conf.c,v 1.158 2016/10/21 06:20:58 mlarkin Exp $ */ /* $NetBSD: conf.c,v 1.75 1996/05/03 19:40:20 christos Exp $ */ /* @@ -105,6 +105,14 @@ int nblkdev = nitems(bdevsw); (dev_type_stop((*))) enodev, 0, seltrue, \ (dev_type_mmap((*))) enodev, 0 } +/* open, close, ioctl */ +#define cdev_vmm_init(c,n) { \ + dev_init(c,n,open), dev_init(c,n,close), \ + (dev_type_read((*))) enodev, \ + (dev_type_write((*))) enodev, \ + dev_init(c,n,ioctl), \ + (dev_type_stop((*))) enodev, 0, seltrue, \ + (dev_type_mmap((*))) enodev } #define mmread mmrw #define mmwrite mmrw @@ -178,6 +186,8 @@ cdev_decl(pci); #include "pvbus.h" #include "ipmi.h" #include "switch.h" +#include "vmm.h" +cdev_decl(vmm); struct cdevsw cdevsw[] = { @@ -191,7 +201,7 @@ struct cdevsw cdevsw[] = cdev_log_init(1,log), /* 7: /dev/klog */ cdev_tty_init(NCOM,com), /* 8: serial port */ cdev_disk_init(NFD,fd), /* 9: floppy disk */ - cdev_notdef(), /* 10 */ + cdev_vmm_init(NVMM,vmm), /* 10: vmm */ cdev_notdef(), /* 11 */ cdev_wsdisplay_init(NWSDISPLAY, /* 12: frame buffers, etc. 
*/ wsdisplay), diff --git a/sys/arch/i386/i386/cpu.c b/sys/arch/i386/i386/cpu.c index babc4f56b76..3ce489a5531 100644 --- a/sys/arch/i386/i386/cpu.c +++ b/sys/arch/i386/i386/cpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.c,v 1.79 2016/07/28 21:57:56 kettenis Exp $ */ +/* $OpenBSD: cpu.c,v 1.80 2016/10/21 06:20:58 mlarkin Exp $ */ /* $NetBSD: cpu.c,v 1.1.2.7 2000/06/26 02:04:05 sommerfeld Exp $ */ /*- @@ -66,6 +66,7 @@ #include "lapic.h" #include "ioapic.h" +#include "vmm.h" #include <sys/param.h> #include <sys/timeout.h> @@ -113,6 +114,9 @@ int cpu_activate(struct device *, int); void patinit(struct cpu_info *ci); void cpu_idle_mwait_cycle(void); void cpu_init_mwait(struct device *); +#if NVMM > 0 +void cpu_init_vmm(struct cpu_info *ci); +#endif /* NVMM > 0 */ u_int cpu_mwait_size, cpu_mwait_states; @@ -345,6 +349,10 @@ cpu_attach(struct device *parent, struct device *self, void *aux) ci->ci_dev.dv_xname, pcb, pcb->pcb_esp); } #endif + +#if NVMM > 0 + cpu_init_vmm(ci); +#endif /* NVMM > 0 */ } /* @@ -407,6 +415,23 @@ cpu_init(struct cpu_info *ci) } void +cpu_init_vmm(struct cpu_info *ci) +{ + /* + * Allocate a per-cpu VMXON region + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + ci->ci_vmxon_region_pa = 0; + ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE, + M_DEVBUF, M_WAITOK|M_ZERO); + if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region, + (paddr_t *)&ci->ci_vmxon_region_pa)) + panic("Can't locate VMXON region in phys mem\n"); + } +} + + +void patinit(struct cpu_info *ci) { extern int pmap_pg_wc; @@ -415,13 +440,6 @@ patinit(struct cpu_info *ci) if ((ci->ci_feature_flags & CPUID_PAT) == 0) return; -#define PATENTRY(n, type) ((u_int64_t)type << ((n) * 8)) -#define PAT_UC 0x0UL -#define PAT_WC 0x1UL -#define PAT_WT 0x4UL -#define PAT_WP 0x5UL -#define PAT_WB 0x6UL -#define PAT_UCMINUS 0x7UL /* * Set up PAT bits. 
* The default pat table is the following: diff --git a/sys/arch/i386/i386/ipifuncs.c b/sys/arch/i386/i386/ipifuncs.c index b313879b852..e1b820fd77c 100644 --- a/sys/arch/i386/i386/ipifuncs.c +++ b/sys/arch/i386/i386/ipifuncs.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ipifuncs.c,v 1.27 2015/07/19 18:53:49 sf Exp $ */ +/* $OpenBSD: ipifuncs.c,v 1.28 2016/10/21 06:20:58 mlarkin Exp $ */ /* $NetBSD: ipifuncs.c,v 1.1.2.3 2000/06/26 02:04:06 sommerfeld Exp $ */ /*- @@ -37,6 +37,7 @@ */ #include "npx.h" +#include "vmm.h" #include <sys/param.h> #include <sys/device.h> @@ -70,6 +71,11 @@ void i386_ipi_reload_mtrr(struct cpu_info *); #define i386_ipi_reload_mtrr 0 #endif +#if NVMM > 0 +void i386_ipi_start_vmm(struct cpu_info *); +void i386_ipi_stop_vmm(struct cpu_info *); +#endif /* NVMM > 0 */ + void (*ipifunc[I386_NIPI])(struct cpu_info *) = { i386_ipi_halt, @@ -88,6 +94,13 @@ void (*ipifunc[I386_NIPI])(struct cpu_info *) = NULL, #endif i386_setperf_ipi, +#if NVMM > 0 + i386_ipi_start_vmm, + i386_ipi_stop_vmm, +#else + NULL, + NULL, +#endif /* NVMM > 0 */ }; void @@ -208,3 +221,18 @@ i386_ipi_handler(void) } } } + +#if NVMM > 0 +void +i386_ipi_start_vmm(struct cpu_info *ci) +{ + start_vmm_on_cpu(ci); +} + +void +i386_ipi_stop_vmm(struct cpu_info *ci) +{ + stop_vmm_on_cpu(ci); +} +#endif /* NVMM > 0 */ + diff --git a/sys/arch/i386/i386/machdep.c b/sys/arch/i386/i386/machdep.c index d6af51e1d80..d2ca55c98d8 100644 --- a/sys/arch/i386/i386/machdep.c +++ b/sys/arch/i386/i386/machdep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: machdep.c,v 1.592 2016/10/14 04:53:26 mlarkin Exp $ */ +/* $OpenBSD: machdep.c,v 1.593 2016/10/21 06:20:58 mlarkin Exp $ */ /* $NetBSD: machdep.c,v 1.214 1996/11/10 03:16:17 thorpej Exp $ */ /*- @@ -168,6 +168,7 @@ extern struct proc *npxproc; #include <machine/hibernate_var.h> #endif /* HIBERNATE */ +#include "vmm.h" void replacesmap(void); int intr_handler(struct intrframe *, struct intrhand *); @@ -339,6 +340,9 @@ void p3_get_bus_clock(struct cpu_info *); void p4_update_cpuspeed(void); void p3_update_cpuspeed(void); int pentium_cpuspeed(int *); +#if NVMM > 0 +void cpu_check_vmm_cap(struct cpu_info *); +#endif /* NVMM > 0 */ static __inline u_char cyrix_read_reg(u_char reg) @@ -2077,6 +2081,10 @@ identifycpu(struct cpu_info *ci) } else i386_use_fxsave = 0; +#if NVMM > 0 + cpu_check_vmm_cap(ci); +#endif /* NVMM > 0 */ + } char * @@ -3967,3 +3975,107 @@ intr_barrier(void *ih) { sched_barrier(NULL); } + +#if NVMM > 0 +/* + * cpu_check_vmm_cap + * + * Checks for VMM capabilities for 'ci'. Initializes certain per-cpu VMM + * state in 'ci' if virtualization extensions are found. + * + * Parameters: + * ci: the cpu being checked + */ +void +cpu_check_vmm_cap(struct cpu_info *ci) +{ + uint64_t msr; + uint32_t cap, dummy; + + /* + * Check for workable VMX + */ + if (cpu_ecxfeature & CPUIDECX_VMX) { + msr = rdmsr(MSR_IA32_FEATURE_CONTROL); + + if (!(msr & IA32_FEATURE_CONTROL_LOCK)) + ci->ci_vmm_flags |= CI_VMM_VMX; + else { + if (msr & IA32_FEATURE_CONTROL_VMX_EN) + ci->ci_vmm_flags |= CI_VMM_VMX; + } + } + + /* + * Check for EPT (Intel Nested Paging) and other secondary + * controls + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + /* Secondary controls available? */ + /* XXX should we check true procbased ctls here if avail? */ + msr = rdmsr(IA32_VMX_PROCBASED_CTLS); + if (msr & (IA32_VMX_ACTIVATE_SECONDARY_CONTROLS) << 32) { + msr = rdmsr(IA32_VMX_PROCBASED2_CTLS); + /* EPT available? */ + if (msr & (IA32_VMX_ENABLE_EPT) << 32) + ci->ci_vmm_flags |= CI_VMM_EPT; + /* VM Functions available? 
*/ + if (msr & (IA32_VMX_ENABLE_VM_FUNCTIONS) << 32) { + ci->ci_vmm_cap.vcc_vmx.vmx_vm_func = + rdmsr(IA32_VMX_VMFUNC); + } + } + } + + /* + * Check startup config (VMX) + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + /* CR0 fixed and flexible bits */ + msr = rdmsr(IA32_VMX_CR0_FIXED0); + ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0 = msr; + msr = rdmsr(IA32_VMX_CR0_FIXED1); + ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1 = msr; + + /* CR4 fixed and flexible bits */ + msr = rdmsr(IA32_VMX_CR4_FIXED0); + ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0 = msr; + msr = rdmsr(IA32_VMX_CR4_FIXED1); + ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1 = msr; + + /* VMXON region revision ID (bits 30:0 of IA32_VMX_BASIC) */ + msr = rdmsr(IA32_VMX_BASIC); + ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision = + (uint32_t)(msr & 0x7FFFFFFF); + + /* MSR save / load table size */ + msr = rdmsr(IA32_VMX_MISC); + ci->ci_vmm_cap.vcc_vmx.vmx_msr_table_size = + (uint32_t)(msr & IA32_VMX_MSR_LIST_SIZE_MASK) >> 25; + + /* CR3 target count size */ + ci->ci_vmm_cap.vcc_vmx.vmx_cr3_tgt_count = + (uint32_t)(msr & IA32_VMX_CR3_TGT_SIZE_MASK) >> 16; + } + + /* + * Check for workable SVM + */ + if (ecpu_ecxfeature & CPUIDECX_SVM) { + msr = rdmsr(MSR_AMD_VM_CR); + + if (!(msr & AMD_SVMDIS)) + ci->ci_vmm_flags |= CI_VMM_SVM; + } + + /* + * Check for SVM Nested Paging + */ + if (ci->ci_vmm_flags & CI_VMM_SVM) { + CPUID(CPUID_AMD_SVM_CAP, dummy, dummy, dummy, cap); + if (cap & AMD_SVM_NESTED_PAGING_CAP) + ci->ci_vmm_flags |= CI_VMM_RVI; + } +} +#endif /* NVMM > 0 */ + diff --git a/sys/arch/i386/i386/mainbus.c b/sys/arch/i386/i386/mainbus.c index d44a0f1c695..56acb1f57d6 100644 --- a/sys/arch/i386/i386/mainbus.c +++ b/sys/arch/i386/i386/mainbus.c @@ -1,4 +1,4 @@ -/* $OpenBSD: mainbus.c,v 1.55 2016/07/28 21:57:56 kettenis Exp $ */ +/* $OpenBSD: mainbus.c,v 1.56 2016/10/21 06:20:58 mlarkin Exp $ */ /* $NetBSD: mainbus.c,v 1.21 1997/06/06 23:14:20 thorpej Exp $ */ /* @@ -54,6 +54,7 @@ #include "ipmi.h" #include "esm.h" #include "amdmsr.h" +#include "vmm.h" #include "pvbus.h" #include <machine/cpuvar.h> @@ -269,6 +270,11 @@ mainbus_attach(struct device *parent, struct device *self, void *aux) #endif config_found(self, &mba.mba_iba, mainbus_print); } + +#if NVMM > 0 + mba.mba_busname = "vmm"; + config_found(self, &mba.mba_busname, mainbus_print); +#endif /* NVMM > 0 */ } int diff --git a/sys/arch/i386/i386/pmap.c b/sys/arch/i386/i386/pmap.c index 81337e8f24b..04248baa30d 100644 --- a/sys/arch/i386/i386/pmap.c +++ b/sys/arch/i386/i386/pmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.c,v 1.194 2016/09/17 07:37:57 mlarkin Exp $ */ +/* $OpenBSD: pmap.c,v 1.195 2016/10/21 06:20:58 mlarkin Exp $ */ /* $NetBSD: pmap.c,v 1.91 2000/06/02 17:46:37 thorpej Exp $ */ /* @@ -74,6 +74,8 @@ #include <sys/msgbuf.h> #include <stand/boot/bootarg.h> +#include "vmm.h" + /* * this file contains the code for the "pmap module." the module's * job is to manage the hardware's virtual to physical address mappings. 
@@ -931,6 +933,11 @@ pmap_bootstrap(vaddr_t kva_start) kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3; kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = atop(kva_start - VM_MIN_KERNEL_ADDRESS); + kpm->pm_type = PMAP_TYPE_NORMAL; +#if NVMM > 0 + kpm->pm_npt_pml4 = 0; + kpm->pm_npt_pdpt = 0; +#endif /* NVMM > 0 */ /* * the above is just a rough estimate and not critical to the proper @@ -1289,6 +1296,12 @@ pmap_create(void) setsegment(&pmap->pm_codeseg, 0, atop(I386_MAX_EXE_ADDR) - 1, SDT_MEMERA, SEL_UPL, 1, 1); + pmap->pm_type = PMAP_TYPE_NORMAL; +#if NVMM > 0 + pmap->pm_npt_pml4 = 0; + pmap->pm_npt_pdpt = 0; +#endif /* NVMM > 0 */ + pmap_pinit_pd(pmap); return (pmap); } @@ -1356,6 +1369,15 @@ pmap_destroy(struct pmap *pmap) uvm_km_free(kernel_map, pmap->pm_pdir, pmap->pm_pdirsize); pmap->pm_pdir = 0; +#if NVMM > 0 + if (pmap->pm_npt_pml4) + km_free((void *)pmap->pm_npt_pml4, PAGE_SIZE, &kv_any, + &kp_zero); + if (pmap->pm_npt_pdpt) + km_free((void *)pmap->pm_npt_pdpt, PAGE_SIZE, &kv_any, + &kp_zero); +#endif /* NVMM > 0 */ + pool_put(&pmap_pmap_pool, pmap); } diff --git a/sys/arch/i386/i386/pmapae.c b/sys/arch/i386/i386/pmapae.c index 46b366b0360..e4ffa837c9d 100644 --- a/sys/arch/i386/i386/pmapae.c +++ b/sys/arch/i386/i386/pmapae.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmapae.c,v 1.51 2016/09/17 07:37:57 mlarkin Exp $ */ +/* $OpenBSD: pmapae.c,v 1.52 2016/10/21 06:20:58 mlarkin Exp $ */ /* * Copyright (c) 2006-2008 Michael Shalayeff @@ -1915,3 +1915,65 @@ pmap_flush_page_pae(paddr_t pa) *pte = 0; pmap_update_pg(va); } + +int +pmap_convert(struct pmap *pmap, int mode) +{ + int ret; + pt_entry_t *pte; + paddr_t pml4_pa, pdpt_pa; + + pmap->pm_type = mode; + + ret = 0; + if (mode == PMAP_TYPE_EPT) { + pmap->pm_npt_pml4 = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, + &kp_zero, &kd_nowait); + if (!pmap->pm_npt_pml4) { + ret = ENOMEM; + goto error; + } + + pmap->pm_npt_pdpt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, + &kp_zero, &kd_nowait); + if (!pmap->pm_npt_pdpt) { + ret = ENOMEM; + goto error; + } + + if (!pmap_extract(pmap_kernel(), pmap->pm_npt_pml4, + &pml4_pa)) { + ret = ENOMEM; + goto error; + } + pmap->pm_npt_pa = pml4_pa; + + if (!pmap_extract(pmap_kernel(), pmap->pm_npt_pdpt, + &pdpt_pa)) { + ret = ENOMEM; + goto error; + } + + pte = (pt_entry_t *)pmap->pm_npt_pml4; + pte[0] = (pdpt_pa & PG_FRAME) | EPT_R | EPT_W | EPT_X; + pte = (pt_entry_t *)pmap->pm_npt_pdpt; + pte[0] = (pmap->pm_pdidx[0] & PG_FRAME) | + EPT_R | EPT_W | EPT_X; + pte[1] = (pmap->pm_pdidx[1] & PG_FRAME) | + EPT_R | EPT_W | EPT_X; + pte[2] = (pmap->pm_pdidx[2] & PG_FRAME) | + EPT_R | EPT_W | EPT_X; + pte[3] = (pmap->pm_pdidx[3] & PG_FRAME) | + EPT_R | EPT_W | EPT_X; + } + + return (ret); + +error: + if (pmap->pm_npt_pml4) + km_free((void *)pmap->pm_npt_pml4, PAGE_SIZE, &kv_any, &kp_zero); + if (pmap->pm_npt_pdpt) + km_free((void *)pmap->pm_npt_pdpt, PAGE_SIZE, &kv_any, &kp_zero); + + return (ret); +} diff --git a/sys/arch/i386/i386/vmm.c b/sys/arch/i386/i386/vmm.c new file mode 100644 index 00000000000..cea820e3bf4 --- /dev/null +++ b/sys/arch/i386/i386/vmm.c @@ -0,0 +1,5433 @@ +/* + * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/signalvar.h> +#include <sys/malloc.h> +#include <sys/device.h> +#include <sys/pool.h> +#include <sys/proc.h> +#include <sys/ioctl.h> +#include <sys/queue.h> +#include <sys/rwlock.h> +#include <sys/pledge.h> +#include <sys/memrange.h> + +#include <uvm/uvm_extern.h> + +#include <machine/pmap.h> +#include <machine/biosvar.h> +#include <machine/segments.h> +#include <machine/cpufunc.h> +#include <machine/vmmvar.h> +#include <machine/i82489reg.h> + +#include <dev/isa/isareg.h> + +#define VMM_DEBUG + +#ifdef VMM_DEBUG +int vmm_debug = 1; +#define DPRINTF(x...) do { if (vmm_debug) printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* VMM_DEBUG */ + +#define DEVNAME(s) ((s)->sc_dev.dv_xname) + +#define CTRL_DUMP(x,y,z) printf(" %s: Can set:%s Can clear:%s\n", #z , \ + vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \ + IA32_VMX_##z, 1) ? "Yes" : "No", \ + vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \ + IA32_VMX_##z, 0) ? "Yes" : "No"); + +#define VMX_EXIT_INFO_HAVE_RIP 0x1 +#define VMX_EXIT_INFO_HAVE_REASON 0x2 +#define VMX_EXIT_INFO_COMPLETE \ + (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON) + +struct vm { + vm_map_t vm_map; + uint32_t vm_id; + pid_t vm_creator_pid; + size_t vm_nmemranges; + size_t vm_memory_size; + char vm_name[VMM_MAX_NAME_LEN]; + struct vm_mem_range vm_memranges[VMM_MAX_MEM_RANGES]; + + struct vcpu_head vm_vcpu_list; + uint32_t vm_vcpu_ct; + u_int vm_vcpus_running; + struct rwlock vm_vcpu_lock; + + SLIST_ENTRY(vm) vm_link; +}; + +SLIST_HEAD(vmlist_head, vm); + +struct vmm_softc { + struct device sc_dev; + + /* Capabilities */ + uint32_t nr_vmx_cpus; + uint32_t nr_svm_cpus; + uint32_t nr_rvi_cpus; + uint32_t nr_ept_cpus; + + /* Managed VMs */ + struct vmlist_head vm_list; + + int mode; + + struct rwlock vm_lock; + size_t vm_ct; /* number of in-memory VMs */ + size_t vm_idx; /* next unique VM index */ +}; + +int vmm_probe(struct device *, void *, void *); +void vmm_attach(struct device *, struct device *, void *); +int vmmopen(dev_t, int, int, struct proc *); +int vmmioctl(dev_t, u_long, caddr_t, int, struct proc *); +int vmmclose(dev_t, int, int, struct proc *); +int vmm_start(void); +int vmm_stop(void); +size_t vm_create_check_mem_ranges(struct vm_create_params *); +int vm_create(struct vm_create_params *, struct proc *); +int vm_run(struct vm_run_params *); +int vm_terminate(struct vm_terminate_params *); +int vm_get_info(struct vm_info_params *); +int vm_resetcpu(struct vm_resetcpu_params *); +int vm_intr_pending(struct vm_intr_params *); +int vm_rwregs(struct vm_rwregs_params *, int); +int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *); +int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *); +int vcpu_writeregs_vmx(struct vcpu *, uint64_t, int, struct vcpu_reg_state *); +int vcpu_writeregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *); +int vcpu_reset_regs(struct vcpu *, struct vcpu_reg_state *); +int 
vcpu_reset_regs_vmx(struct vcpu *, struct vcpu_reg_state *); +int vcpu_reset_regs_svm(struct vcpu *, struct vcpu_reg_state *); +int vcpu_reload_vmcs_vmx(uint64_t *); +int vcpu_init(struct vcpu *); +int vcpu_init_vmx(struct vcpu *); +int vcpu_init_svm(struct vcpu *); +int vcpu_must_stop(struct vcpu *); +int vcpu_run_vmx(struct vcpu *, struct vm_run_params *); +int vcpu_run_svm(struct vcpu *, struct vm_run_params *); +void vcpu_deinit(struct vcpu *); +void vcpu_deinit_vmx(struct vcpu *); +void vcpu_deinit_svm(struct vcpu *); +int vm_impl_init(struct vm *, struct proc *); +int vm_impl_init_vmx(struct vm *, struct proc *); +int vm_impl_init_svm(struct vm *, struct proc *); +void vm_impl_deinit(struct vm *); +void vm_impl_deinit_vmx(struct vm *); +void vm_impl_deinit_svm(struct vm *); +void vm_teardown(struct vm *); +int vcpu_vmx_check_cap(struct vcpu *, uint32_t, uint32_t, int); +int vcpu_vmx_compute_ctrl(uint64_t, uint16_t, uint32_t, uint32_t, uint32_t *); +int vmx_get_exit_info(uint32_t *, uint32_t *); +int vmx_handle_exit(struct vcpu *); +int vmx_handle_cpuid(struct vcpu *); +int vmx_handle_rdmsr(struct vcpu *); +int vmx_handle_wrmsr(struct vcpu *); +int vmx_handle_cr(struct vcpu *); +int vmx_handle_inout(struct vcpu *); +int vmx_handle_hlt(struct vcpu *); +void vmx_handle_intr(struct vcpu *); +void vmx_handle_intwin(struct vcpu *); +int vmm_get_guest_memtype(struct vm *, paddr_t); +int vmm_get_guest_faulttype(void); +int vmx_get_guest_faulttype(void); +int svm_get_guest_faulttype(void); +int vmx_get_exit_qualification(uint32_t *); +int vmx_fault_page(struct vcpu *, paddr_t); +int vmx_handle_np_fault(struct vcpu *); +const char *vcpu_state_decode(u_int); +const char *vmx_exit_reason_decode(uint32_t); +const char *vmx_instruction_error_decode(uint32_t); +void vmx_setmsrbr(struct vcpu *, uint32_t); +void vmx_setmsrbw(struct vcpu *, uint32_t); +void vmx_setmsrbrw(struct vcpu *, uint32_t); + +#ifdef VMM_DEBUG +void dump_vcpu(struct vcpu *); +void vmx_vcpu_dump_regs(struct vcpu *); +void vmx_dump_vmcs(struct vcpu *); +const char *msr_name_decode(uint32_t); +void vmm_segment_desc_decode(uint32_t); +void vmm_decode_cr0(uint32_t); +void vmm_decode_cr4(uint32_t); +void vmm_decode_msr_value(uint64_t, uint64_t); +void vmm_decode_apicbase_msr_value(uint64_t); +void vmm_decode_ia32_fc_value(uint64_t); +void vmm_decode_mtrrcap_value(uint64_t); +void vmm_decode_perf_status_value(uint64_t); +void vmm_decode_perf_ctl_value(uint64_t); +void vmm_decode_mtrrdeftype_value(uint64_t); +void vmm_decode_efer_value(uint64_t); + +extern int mtrr2mrt(int); + +struct vmm_reg_debug_info { + uint64_t vrdi_bit; + const char *vrdi_present; + const char *vrdi_absent; +}; +#endif /* VMM_DEBUG */ + +const char *vmm_hv_signature = VMM_HV_SIGNATURE; + +struct cfdriver vmm_cd = { + NULL, "vmm", DV_DULL +}; + +const struct cfattach vmm_ca = { + sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, NULL +}; + +/* + * Helper struct to easily get the VMCS field IDs needed in vmread/vmwrite + * to access the individual fields of the guest segment registers. This + * struct is indexed by VCPU_REGS_* id. 
+ */ +const struct { + uint64_t selid; + uint64_t limitid; + uint64_t arid; + uint64_t baseid; +} vmm_vmx_sreg_vmcs_fields[] = { + { VMCS_GUEST_IA32_CS_SEL, VMCS_GUEST_IA32_CS_LIMIT, + VMCS_GUEST_IA32_CS_AR, VMCS_GUEST_IA32_CS_BASE }, + { VMCS_GUEST_IA32_DS_SEL, VMCS_GUEST_IA32_DS_LIMIT, + VMCS_GUEST_IA32_DS_AR, VMCS_GUEST_IA32_DS_BASE }, + { VMCS_GUEST_IA32_ES_SEL, VMCS_GUEST_IA32_ES_LIMIT, + VMCS_GUEST_IA32_ES_AR, VMCS_GUEST_IA32_ES_BASE }, + { VMCS_GUEST_IA32_FS_SEL, VMCS_GUEST_IA32_FS_LIMIT, + VMCS_GUEST_IA32_FS_AR, VMCS_GUEST_IA32_FS_BASE }, + { VMCS_GUEST_IA32_GS_SEL, VMCS_GUEST_IA32_GS_LIMIT, + VMCS_GUEST_IA32_GS_AR, VMCS_GUEST_IA32_GS_BASE }, + { VMCS_GUEST_IA32_SS_SEL, VMCS_GUEST_IA32_SS_LIMIT, + VMCS_GUEST_IA32_SS_AR, VMCS_GUEST_IA32_SS_BASE }, + { VMCS_GUEST_IA32_LDTR_SEL, VMCS_GUEST_IA32_LDTR_LIMIT, + VMCS_GUEST_IA32_LDTR_AR, VMCS_GUEST_IA32_LDTR_BASE }, + { VMCS_GUEST_IA32_TR_SEL, VMCS_GUEST_IA32_TR_LIMIT, + VMCS_GUEST_IA32_TR_AR, VMCS_GUEST_IA32_TR_BASE } +}; + +/* Pools for VMs and VCPUs */ +struct pool vm_pool; +struct pool vcpu_pool; + +struct vmm_softc *vmm_softc; + +/* IDT information used when populating host state area */ +extern vaddr_t idt_vaddr; +extern struct gate_descriptor *idt; + +/* CPU info (i386) */ +extern char cpu_brandstr[]; +extern uint32_t ecpu_eaxfeature; + +/* Constants used in "CR access exit" */ +#define CR_WRITE 0 +#define CR_READ 1 +#define CR_CLTS 2 +#define CR_LMSW 3 + +/* + * vmm_probe + * + * Checks if we have at least one CPU with either VMX or SVM. + * Returns 1 if we have at least one of either type, but not both, 0 otherwise. + */ +int +vmm_probe(struct device *parent, void *match, void *aux) +{ + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + const char **busname = (const char **)aux; + int found_vmx, found_svm; + + /* Check if this probe is for us */ + if (strcmp(*busname, vmm_cd.cd_name) != 0) + return (0); + + found_vmx = 0; + found_svm = 0; + + /* Check if we have at least one CPU with either VMX or SVM */ + CPU_INFO_FOREACH(cii, ci) { + if (ci->ci_vmm_flags & CI_VMM_VMX) + found_vmx = 1; + if (ci->ci_vmm_flags & CI_VMM_SVM) + found_svm = 1; + } + + /* Don't support both SVM and VMX at the same time */ + if (found_vmx && found_svm) + return (0); + + return (found_vmx || found_svm); +} + +/* + * vmm_attach + * + * Calculates how many of each type of CPU we have, prints this into dmesg + * during attach. Initializes various locks, pools, and list structures for the + * VMM. 
+ */ +void +vmm_attach(struct device *parent, struct device *self, void *aux) +{ + struct vmm_softc *sc = (struct vmm_softc *)self; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + + sc->nr_vmx_cpus = 0; + sc->nr_svm_cpus = 0; + sc->nr_rvi_cpus = 0; + sc->nr_ept_cpus = 0; + sc->vm_ct = 0; + sc->vm_idx = 0; + + /* Calculate CPU features */ + CPU_INFO_FOREACH(cii, ci) { + if (ci->ci_vmm_flags & CI_VMM_VMX) + sc->nr_vmx_cpus++; + if (ci->ci_vmm_flags & CI_VMM_SVM) + sc->nr_svm_cpus++; + if (ci->ci_vmm_flags & CI_VMM_RVI) + sc->nr_rvi_cpus++; + if (ci->ci_vmm_flags & CI_VMM_EPT) + sc->nr_ept_cpus++; + } + + SLIST_INIT(&sc->vm_list); + rw_init(&sc->vm_lock, "vmlistlock"); + + if (sc->nr_ept_cpus) { + printf(": VMX/EPT\n"); + sc->mode = VMM_MODE_EPT; + } else if (sc->nr_vmx_cpus) { + printf(": VMX\n"); + sc->mode = VMM_MODE_VMX; + } else if (sc->nr_rvi_cpus) { + printf(": SVM/RVI\n"); + sc->mode = VMM_MODE_RVI; + } else if (sc->nr_svm_cpus) { + printf(": SVM\n"); + sc->mode = VMM_MODE_SVM; + } else { + printf(": unknown\n"); + sc->mode = VMM_MODE_UNKNOWN; + } + + pool_init(&vm_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK, + "vmpool", NULL); + pool_init(&vcpu_pool, sizeof(struct vcpu), 0, IPL_NONE, PR_WAITOK, + "vcpupl", NULL); + + vmm_softc = sc; +} + +/* + * vmmopen + * + * Called during open of /dev/vmm. Presently unused. + */ +int +vmmopen(dev_t dev, int flag, int mode, struct proc *p) +{ + /* Don't allow open if we didn't attach */ + if (vmm_softc == NULL) + return (ENODEV); + + /* Don't allow open if we didn't detect any supported CPUs */ + /* XXX presently this means EPT until SP and SVM are back */ + if (vmm_softc->mode != VMM_MODE_EPT) + return (ENODEV); + + return 0; +} + +/* + * vmmioctl + * + * Main ioctl dispatch routine for /dev/vmm. Parses ioctl type and calls + * appropriate lower level handler routine. Returns result to ioctl caller. + */ +int +vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + int ret; + + switch (cmd) { + case VMM_IOC_CREATE: + if ((ret = vmm_start()) != 0) { + vmm_stop(); + break; + } + ret = vm_create((struct vm_create_params *)data, p); + break; + case VMM_IOC_RUN: + ret = vm_run((struct vm_run_params *)data); + break; + case VMM_IOC_INFO: + ret = vm_get_info((struct vm_info_params *)data); + break; + case VMM_IOC_TERM: + ret = vm_terminate((struct vm_terminate_params *)data); + break; + case VMM_IOC_RESETCPU: + ret = vm_resetcpu((struct vm_resetcpu_params *)data); + break; + case VMM_IOC_INTR: + ret = vm_intr_pending((struct vm_intr_params *)data); + break; + case VMM_IOC_READREGS: + ret = vm_rwregs((struct vm_rwregs_params *)data, 0); + break; + case VMM_IOC_WRITEREGS: + ret = vm_rwregs((struct vm_rwregs_params *)data, 1); + break; + default: + DPRINTF("vmmioctl: unknown ioctl code 0x%lx\n", cmd); + ret = ENOTTY; + } + + return (ret); +} + +/* + * pledge_ioctl_vmm + * + * Restrict the allowed ioctls in a pledged process context. + * Is called from pledge_ioctl(). + */ +int +pledge_ioctl_vmm(struct proc *p, long com) +{ + switch (com) { + case VMM_IOC_CREATE: + case VMM_IOC_INFO: + /* The "parent" process in vmd forks and manages VMs */ + if (p->p_p->ps_pledge & PLEDGE_PROC) + return (0); + break; + case VMM_IOC_TERM: + /* XXX VM processes should only terminate themselves */ + case VMM_IOC_RUN: + case VMM_IOC_RESETCPU: + return (0); + } + + return (EPERM); +} + +/* + * vmmclose + * + * Called when /dev/vmm is closed. Presently unused. 
+ */ +int +vmmclose(dev_t dev, int flag, int mode, struct proc *p) +{ + return 0; +} + +/* + * vm_resetcpu + * + * Resets the vcpu defined in 'vrp' to power-on-init register state + * + * Parameters: + * vrp: ioctl structure defining the vcpu to reset (see vmmvar.h) + * + * Returns 0 if successful, or various error codes on failure: + * ENOENT if the VM id contained in 'vrp' refers to an unknown VM or + * if vrp describes an unknown vcpu for this VM + * EBUSY if the indicated VCPU is not stopped + * EIO if the indicated VCPU failed to reset + */ +int +vm_resetcpu(struct vm_resetcpu_params *vrp) +{ + struct vm *vm; + struct vcpu *vcpu; + + /* Find the desired VM */ + rw_enter_read(&vmm_softc->vm_lock); + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vrp->vrp_vm_id) + break; + } + rw_exit_read(&vmm_softc->vm_lock); + + /* Not found? exit. */ + if (vm == NULL) { + DPRINTF("vm_resetcpu: vm id %u not found\n", + vrp->vrp_vm_id); + return (ENOENT); + } + + rw_enter_read(&vm->vm_vcpu_lock); + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) { + if (vcpu->vc_id == vrp->vrp_vcpu_id) + break; + } + rw_exit_read(&vm->vm_vcpu_lock); + + if (vcpu == NULL) { + DPRINTF("vm_resetcpu: vcpu id %u of vm %u not found\n", + vrp->vrp_vcpu_id, vrp->vrp_vm_id); + return (ENOENT); + } + + if (vcpu->vc_state != VCPU_STATE_STOPPED) { + DPRINTF("vm_resetcpu: reset of vcpu %u on vm %u attempted " + "while vcpu was in state %u (%s)\n", vrp->vrp_vcpu_id, + vrp->vrp_vm_id, vcpu->vc_state, + vcpu_state_decode(vcpu->vc_state)); + + return (EBUSY); + } + + DPRINTF("vm_resetcpu: resetting vm %d vcpu %d to power on defaults\n", + vm->vm_id, vcpu->vc_id); + + if (vcpu_reset_regs(vcpu, &vrp->vrp_init_state)) { + printf("vm_resetcpu: failed\n"); +#ifdef VMM_DEBUG + dump_vcpu(vcpu); +#endif /* VMM_DEBUG */ + return (EIO); + } + + return (0); +} + +/* + * vm_intr_pending + * + * IOCTL handler routine for VMM_IOC_INTR messages, sent from vmd when an + * interrupt is pending and needs acknowledgment + * + * Parameters: + * vip: Describes the vm/vcpu for which the interrupt is pending + * + * Return values: + * 0: if successful + * ENOENT: if the VM/VCPU defined by 'vip' cannot be found + */ +int +vm_intr_pending(struct vm_intr_params *vip) +{ + struct vm *vm; + struct vcpu *vcpu; + + /* Find the desired VM */ + rw_enter_read(&vmm_softc->vm_lock); + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vip->vip_vm_id) + break; + } + + /* Not found? exit. */ + if (vm == NULL) { + rw_exit_read(&vmm_softc->vm_lock); + return (ENOENT); + } + + rw_enter_read(&vm->vm_vcpu_lock); + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) { + if (vcpu->vc_id == vip->vip_vcpu_id) + break; + } + rw_exit_read(&vm->vm_vcpu_lock); + rw_exit_read(&vmm_softc->vm_lock); + + if (vcpu == NULL) + return (ENOENT); + + vcpu->vc_intr = vip->vip_intr; + +#ifdef MULTIPROCESSOR + /* + * If the vcpu is running on another PCPU, attempt to force it + * to exit to process the pending interrupt. This could race as + * it could be running when we do the check but be stopped by the + * time we send the IPI. In this case, there is a small extra + * overhead to process the IPI but no other side effects. + * + * There is also a chance that the vcpu may have interrupts blocked. + * That's ok as that condition will be checked on exit, and we will + * simply re-enter the guest. This "fast notification" is done only + * as an optimization. 
+ */ + if (vcpu->vc_state == VCPU_STATE_RUNNING && + vip->vip_intr == 1) + x86_send_ipi(vcpu->vc_last_pcpu, X86_IPI_NOP); +#endif /* MULTIPROCESSOR */ + + return (0); +} + +/* + * vm_readregs + * + * IOCTL handler to read/write the current register values of a guest VCPU. + * The VCPU must not be running. + * + * Parameters: + * vrwp: Describes the VM and VCPU to get/set the registers from. The + * register values are returned here as well. + * dir: 0 for reading, 1 for writing + * + * Return values: + * 0: if successful + * ENOENT: if the VM/VCPU defined by 'vgp' cannot be found + * EINVAL: if an error occured reading the registers of the guest + */ +int +vm_rwregs(struct vm_rwregs_params *vrwp, int dir) +{ + struct vm *vm; + struct vcpu *vcpu; + struct vcpu_reg_state *vrs = &vrwp->vrwp_regs; + + /* Find the desired VM */ + rw_enter_read(&vmm_softc->vm_lock); + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vrwp->vrwp_vm_id) + break; + } + + /* Not found? exit. */ + if (vm == NULL) { + rw_exit_read(&vmm_softc->vm_lock); + return (ENOENT); + } + + rw_enter_read(&vm->vm_vcpu_lock); + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) { + if (vcpu->vc_id == vrwp->vrwp_vcpu_id) + break; + } + rw_exit_read(&vm->vm_vcpu_lock); + rw_exit_read(&vmm_softc->vm_lock); + + if (vcpu == NULL) + return (ENOENT); + + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + return (dir == 0) ? + vcpu_readregs_vmx(vcpu, vrwp->vrwp_mask, vrs) : + vcpu_writeregs_vmx(vcpu, vrwp->vrwp_mask, 1, vrs); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + return (dir == 0) ? + vcpu_readregs_svm(vcpu, vrwp->vrwp_mask, vrs) : + vcpu_writeregs_svm(vcpu, vrwp->vrwp_mask, vrs); + else + panic("unknown vmm mode\n"); +} + +/* + * vmm_start + * + * Starts VMM mode on the system + */ +int +vmm_start(void) +{ + struct cpu_info *self = curcpu(); + int ret = 0; +#ifdef MULTIPROCESSOR + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + int i; +#endif + + /* VMM is already running */ + if (self->ci_flags & CPUF_VMM) + return (0); + +#ifdef MULTIPROCESSOR + /* Broadcast start VMM IPI */ + x86_broadcast_ipi(X86_IPI_START_VMM); + + CPU_INFO_FOREACH(cii, ci) { + if (ci == self) + continue; + for (i = 100000; (!(ci->ci_flags & CPUF_VMM)) && i>0;i--) + delay(10); + if (!(ci->ci_flags & CPUF_VMM)) { + printf("%s: failed to enter VMM mode\n", + ci->ci_dev->dv_xname); + ret = EIO; + } + } +#endif /* MULTIPROCESSOR */ + + /* Start VMM on this CPU */ + start_vmm_on_cpu(self); + if (!(self->ci_flags & CPUF_VMM)) { + printf("%s: failed to enter VMM mode\n", + self->ci_dev.dv_xname); + ret = EIO; + } + + return (ret); +} + +/* + * vmm_stop + * + * Stops VMM mode on the system + */ +int +vmm_stop(void) +{ + struct cpu_info *self = curcpu(); + int ret = 0; +#ifdef MULTIPROCESSOR + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + int i; +#endif + + /* VMM is not running */ + if (!(self->ci_flags & CPUF_VMM)) + return (0); + +#ifdef MULTIPROCESSOR + /* Stop VMM on other CPUs */ + x86_broadcast_ipi(X86_IPI_STOP_VMM); + + CPU_INFO_FOREACH(cii, ci) { + if (ci == self) + continue; + for (i = 100000; (ci->ci_flags & CPUF_VMM) && i>0 ;i--) + delay(10); + if (ci->ci_flags & CPUF_VMM) { + printf("%s: failed to exit VMM mode\n", + ci->ci_dev->dv_xname); + ret = EIO; + } + } +#endif /* MULTIPROCESSOR */ + + /* Stop VMM on this CPU */ + stop_vmm_on_cpu(self); + if (self->ci_flags & CPUF_VMM) { + printf("%s: failed to exit VMM mode\n", + self->ci_dev.dv_xname); + ret = EIO; + } + + return 
(ret); +} + +/* + * start_vmm_on_cpu + * + * Starts VMM mode on 'ci' by executing the appropriate CPU-specific insn + * sequence to enter VMM mode (eg, VMXON) + */ +void +start_vmm_on_cpu(struct cpu_info *ci) +{ + uint64_t msr; + uint32_t cr4; + + /* No VMM mode? exit. */ + if ((ci->ci_vmm_flags & CI_VMM_VMX) == 0 && + (ci->ci_vmm_flags & CI_VMM_SVM) == 0) + return; + + /* + * AMD SVM + */ + if (ci->ci_vmm_flags & CI_VMM_SVM) { + msr = rdmsr(MSR_EFER); + msr |= EFER_SVME; + wrmsr(MSR_EFER, msr); + } + + /* + * Intel VMX + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + if (ci->ci_vmxon_region == 0) + return; + else { + bzero(ci->ci_vmxon_region, PAGE_SIZE); + ci->ci_vmxon_region->vr_revision = + ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision; + + /* Set CR4.VMXE */ + cr4 = rcr4(); + cr4 |= CR4_VMXE; + lcr4(cr4); + + /* Enable VMX */ + msr = rdmsr(MSR_IA32_FEATURE_CONTROL); + if (msr & IA32_FEATURE_CONTROL_LOCK) { + if (!(msr & IA32_FEATURE_CONTROL_VMX_EN)) + return; + } else { + msr |= IA32_FEATURE_CONTROL_VMX_EN | + IA32_FEATURE_CONTROL_LOCK; + wrmsr(MSR_IA32_FEATURE_CONTROL, msr); + } + + /* Enter VMX mode */ + if (vmxon(&ci->ci_vmxon_region_pa)) + return; + } + } + + ci->ci_flags |= CPUF_VMM; +} + +/* + * stop_vmm_on_cpu + * + * Stops VMM mode on 'ci' by executing the appropriate CPU-specific insn + * sequence to exit VMM mode (eg, VMXOFF) + */ +void +stop_vmm_on_cpu(struct cpu_info *ci) +{ + uint64_t msr; + uint32_t cr4; + + if (!(ci->ci_flags & CPUF_VMM)) + return; + + /* + * AMD SVM + */ + if (ci->ci_vmm_flags & CI_VMM_SVM) { + msr = rdmsr(MSR_EFER); + msr &= ~EFER_SVME; + wrmsr(MSR_EFER, msr); + } + + /* + * Intel VMX + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + if (vmxoff()) + panic("VMXOFF failed\n"); + + cr4 = rcr4(); + cr4 &= ~CR4_VMXE; + lcr4(cr4); + } + + ci->ci_flags &= ~CPUF_VMM; +} + +/* + * vm_create_check_mem_ranges: + * + * Make sure that the guest physical memory ranges given by the user process + * do not overlap and are in ascending order. + * + * The last physical address may not exceed VMM_MAX_VM_MEM_SIZE. + * + * Return Values: + * The total memory size in MB if the checks were successful + * 0: One of the memory ranges was invalid, or VMM_MAX_VM_MEM_SIZE was + * exceeded + */ +size_t +vm_create_check_mem_ranges(struct vm_create_params *vcp) +{ + int disjunct_range; + size_t i, memsize = 0; + struct vm_mem_range *vmr, *pvmr; + const paddr_t maxgpa = (uint32_t)VMM_MAX_VM_MEM_SIZE * 1024 * 1024; + + if (vcp->vcp_nmemranges == 0 || + vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) + return (0); + + for (i = 0; i < vcp->vcp_nmemranges; i++) { + vmr = &vcp->vcp_memranges[i]; + + /* Only page-aligned addresses and sizes are permitted */ + if ((vmr->vmr_gpa & PAGE_MASK) || (vmr->vmr_va & PAGE_MASK) || + (vmr->vmr_size & PAGE_MASK) || vmr->vmr_size == 0) + return (0); + + /* Make sure that VMM_MAX_VM_MEM_SIZE is not exceeded */ + if (vmr->vmr_gpa >= maxgpa || + vmr->vmr_size > maxgpa - vmr->vmr_gpa) + return (0); + + /* + * Make sure that all virtual addresses are within the address + * space of the process and that they do not wrap around. + * Calling uvm_share() when creating the VM will take care of + * further checks. 
+ */ + if (vmr->vmr_va < VM_MIN_ADDRESS || + vmr->vmr_va >= VM_MAXUSER_ADDRESS || + vmr->vmr_size >= VM_MAXUSER_ADDRESS - vmr->vmr_va) + return (0); + + /* Specifying ranges within the PCI MMIO space is forbidden */ + disjunct_range = (vmr->vmr_gpa > VMM_PCI_MMIO_BAR_END) || + (vmr->vmr_gpa + vmr->vmr_size <= VMM_PCI_MMIO_BAR_BASE); + if (!disjunct_range) + return (0); + + /* + * Make sure that guest physcal memory ranges do not overlap + * and that they are ascending. + */ + if (i > 0 && pvmr->vmr_gpa + pvmr->vmr_size > vmr->vmr_gpa) + return (0); + + memsize += vmr->vmr_size; + pvmr = vmr; + } + + if (memsize % (1024 * 1024) != 0) + return (0); + memsize /= 1024 * 1024; + return (memsize); +} + +/* + * vm_create + * + * Creates the in-memory VMM structures for the VM defined by 'vcp'. The + * parent of this VM shall be the process defined by 'p'. + * This function does not start the VCPU(s) - see vm_start. + * + * Return Values: + * 0: the create operation was successful + * ENOMEM: out of memory + * various other errors from vcpu_init/vm_impl_init + */ +int +vm_create(struct vm_create_params *vcp, struct proc *p) +{ + int i, ret; + size_t memsize; + struct vm *vm; + struct vcpu *vcpu; + + if (!(curcpu()->ci_flags & CPUF_VMM)) + return (EINVAL); + + memsize = vm_create_check_mem_ranges(vcp); + if (memsize == 0) + return (EINVAL); + + /* XXX - support UP only (for now) */ + if (vcp->vcp_ncpus != 1) + return (EINVAL); + + vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO); + SLIST_INIT(&vm->vm_vcpu_list); + rw_init(&vm->vm_vcpu_lock, "vcpulock"); + + vm->vm_creator_pid = p->p_p->ps_pid; + vm->vm_nmemranges = vcp->vcp_nmemranges; + memcpy(vm->vm_memranges, vcp->vcp_memranges, + vm->vm_nmemranges * sizeof(vm->vm_memranges[0])); + vm->vm_memory_size = memsize; + strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN); + + if (vm_impl_init(vm, p)) { + printf("failed to init arch-specific features for vm 0x%p\n", + vm); + vm_teardown(vm); + return (ENOMEM); + } + + rw_enter_write(&vmm_softc->vm_lock); + vmm_softc->vm_ct++; + vmm_softc->vm_idx++; + + /* + * XXX we use the vm_id for the VPID/ASID, so we need to prevent + * wrapping around 65536/4096 entries here + */ + vm->vm_id = vmm_softc->vm_idx; + vm->vm_vcpu_ct = 0; + vm->vm_vcpus_running = 0; + + /* Initialize each VCPU defined in 'vcp' */ + for (i = 0; i < vcp->vcp_ncpus; i++) { + vcpu = pool_get(&vcpu_pool, PR_WAITOK | PR_ZERO); + vcpu->vc_parent = vm; + if ((ret = vcpu_init(vcpu)) != 0) { + printf("failed to init vcpu %d for vm 0x%p\n", i, vm); + vm_teardown(vm); + vmm_softc->vm_ct--; + vmm_softc->vm_idx--; + rw_exit_write(&vmm_softc->vm_lock); + return (ret); + } + rw_enter_write(&vm->vm_vcpu_lock); + vcpu->vc_id = vm->vm_vcpu_ct; + vm->vm_vcpu_ct++; + SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link); + rw_exit_write(&vm->vm_vcpu_lock); + } + + /* XXX init various other hardware parts (vlapic, vioapic, etc) */ + + SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link); + rw_exit_write(&vmm_softc->vm_lock); + + vcp->vcp_id = vm->vm_id; + + return (0); +} + +/* + * vm_impl_init_vmx + * + * Intel VMX specific VM initialization routine + */ +int +vm_impl_init_vmx(struct vm *vm, struct proc *p) +{ + int i, ret; + vaddr_t mingpa, maxgpa; + struct pmap *pmap; + struct vm_mem_range *vmr; + + /* If not EPT, nothing to do here */ + if (vmm_softc->mode != VMM_MODE_EPT) + return (0); + + /* Create a new pmap for this VM */ + pmap = pmap_create(); + if (!pmap) { + printf("vm_impl_init_vmx: pmap_create failed\n"); + return (ENOMEM); + } + + /* + * 
Create a new UVM map for this VM, and assign it the pmap just + * created. + */ + vmr = &vm->vm_memranges[0]; + mingpa = vmr->vmr_gpa; + vmr = &vm->vm_memranges[vm->vm_nmemranges - 1]; + maxgpa = vmr->vmr_gpa + vmr->vmr_size; + vm->vm_map = uvm_map_create(pmap, mingpa, maxgpa, + VM_MAP_ISVMSPACE | VM_MAP_PAGEABLE); + + if (!vm->vm_map) { + printf("vm_impl_init_vmx: uvm_map_create failed\n"); + pmap_destroy(pmap); + return (ENOMEM); + } + + /* Map the new map with an anon */ + DPRINTF("vm_impl_init_vmx: created vm_map @ %p\n", vm->vm_map); + for (i = 0; i < vm->vm_nmemranges; i++) { + vmr = &vm->vm_memranges[i]; + ret = uvm_share(vm->vm_map, vmr->vmr_gpa, + PROT_READ | PROT_WRITE | PROT_EXEC, + &p->p_vmspace->vm_map, vmr->vmr_va, vmr->vmr_size); + if (ret) { + printf("vm_impl_init_vmx: uvm_share failed (%d)\n", + ret); + /* uvm_map_deallocate calls pmap_destroy for us */ + uvm_map_deallocate(vm->vm_map); + vm->vm_map = NULL; + return (ENOMEM); + } + } + + /* Convert the low 512GB of the pmap to EPT */ + ret = pmap_convert(pmap, PMAP_TYPE_EPT); + if (ret) { + printf("vm_impl_init_vmx: pmap_convert failed\n"); + /* uvm_map_deallocate calls pmap_destroy for us */ + uvm_map_deallocate(vm->vm_map); + vm->vm_map = NULL; + return (ENOMEM); + } + + return (0); +} + +/* + * vm_impl_init_svm + * + * AMD SVM specific VM initialization routine + */ +int +vm_impl_init_svm(struct vm *vm, struct proc *p) +{ + /* XXX removed due to rot */ + return (-1); +} + +/* + * vm_impl_init + * + * Calls the architecture-specific VM init routine + */ +int +vm_impl_init(struct vm *vm, struct proc *p) +{ + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + return vm_impl_init_vmx(vm, p); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + return vm_impl_init_svm(vm, p); + else + panic("unknown vmm mode\n"); +} + +/* + * vm_impl_deinit_vmx + * + * Intel VMX specific VM initialization routine + */ +void +vm_impl_deinit_vmx(struct vm *vm) +{ + /* Unused */ +} + +/* + * vm_impl_deinit_svm + * + * AMD SVM specific VM initialization routine + */ +void +vm_impl_deinit_svm(struct vm *vm) +{ + /* Unused */ +} + +/* + * vm_impl_deinit + * + * Calls the architecture-specific VM init routine + */ +void +vm_impl_deinit(struct vm *vm) +{ + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + vm_impl_deinit_vmx(vm); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + vm_impl_deinit_svm(vm); + else + panic("unknown vmm mode\n"); +} + +/* + * vcpu_reload_vmcs_vmx + * + * Loads 'vmcs' on the current CPU, possibly flushing any old vmcs state + * of the previous occupant. 
+ * + * Parameters: + * vmcs: Pointer to uint64_t containing the PA of the vmcs to load + * + * Return values: + * 0: if successful + * EINVAL: an error occurred during flush or reload + */ +int +vcpu_reload_vmcs_vmx(uint64_t *vmcs) +{ + uint64_t old; + + /* Flush any old state */ + if (!vmptrst(&old)) { + if (old != 0xFFFFFFFFFFFFFFFFULL) { + if (vmclear(&old)) + return (EINVAL); + } + } else + return (EINVAL); + + /* + * Load the VMCS onto this PCPU + */ + if (vmptrld(vmcs)) + return (EINVAL); + + return (0); +} + +/* + * vcpu_readregs_vmx + * + * Reads 'vcpu's registers + * + * Parameters: + * vcpu: the vcpu to read register values from + * regmask: the types of registers to read + * vrs: output parameter where register values are stored + * + * Return values: + * 0: if successful + * EINVAL: an error reading registers occured + */ +int +vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask, + struct vcpu_reg_state *vrs) +{ + int i, ret = 0; + uint32_t ar, sel; + uint32_t limit; + uint32_t *gprs = vrs->vrs_gprs; + uint32_t *crs = vrs->vrs_crs; + struct vcpu_segment_info *sregs = vrs->vrs_sregs; + + if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa)) + return (EINVAL); + + if (regmask & VM_RWREGS_GPRS) { + gprs[VCPU_REGS_EAX] = vcpu->vc_gueststate.vg_eax; + gprs[VCPU_REGS_EBX] = vcpu->vc_gueststate.vg_ebx; + gprs[VCPU_REGS_ECX] = vcpu->vc_gueststate.vg_ecx; + gprs[VCPU_REGS_EDX] = vcpu->vc_gueststate.vg_edx; + gprs[VCPU_REGS_ESI] = vcpu->vc_gueststate.vg_esi; + gprs[VCPU_REGS_EDI] = vcpu->vc_gueststate.vg_edi; + gprs[VCPU_REGS_EBP] = vcpu->vc_gueststate.vg_ebp; + gprs[VCPU_REGS_EIP] = vcpu->vc_gueststate.vg_eip; + if (vmread(VMCS_GUEST_IA32_RSP, &gprs[VCPU_REGS_ESP])) + goto errout; + if (vmread(VMCS_GUEST_IA32_RFLAGS, &gprs[VCPU_REGS_EFLAGS])) + goto errout; + } + if (regmask & VM_RWREGS_SREGS) { + for (i = 0; i < nitems(vmm_vmx_sreg_vmcs_fields); i++) { + if (vmread(vmm_vmx_sreg_vmcs_fields[i].selid, &sel)) + goto errout; + if (vmread(vmm_vmx_sreg_vmcs_fields[i].limitid, &limit)) + goto errout; + if (vmread(vmm_vmx_sreg_vmcs_fields[i].arid, &ar)) + goto errout; + if (vmread(vmm_vmx_sreg_vmcs_fields[i].baseid, + &sregs[i].vsi_base)) + goto errout; + + sregs[i].vsi_sel = sel; + sregs[i].vsi_limit = limit; + sregs[i].vsi_ar = ar; + } + + if (vmread(VMCS_GUEST_IA32_GDTR_LIMIT, &limit)) + goto errout; + if (vmread(VMCS_GUEST_IA32_GDTR_BASE, + &vrs->vrs_gdtr.vsi_base)) + goto errout; + vrs->vrs_gdtr.vsi_limit = limit; + + if (vmread(VMCS_GUEST_IA32_IDTR_LIMIT, &limit)) + goto errout; + if (vmread(VMCS_GUEST_IA32_IDTR_BASE, + &vrs->vrs_idtr.vsi_base)) + goto errout; + vrs->vrs_idtr.vsi_limit = limit; + } + if (regmask & VM_RWREGS_CRS) { + crs[VCPU_REGS_CR2] = vcpu->vc_gueststate.vg_cr2; + if (vmread(VMCS_GUEST_IA32_CR0, &crs[VCPU_REGS_CR0])) + goto errout; + if (vmread(VMCS_GUEST_IA32_CR3, &crs[VCPU_REGS_CR3])) + goto errout; + if (vmread(VMCS_GUEST_IA32_CR4, &crs[VCPU_REGS_CR4])) + goto errout; + } + + goto out; + +errout: + ret = EINVAL; +out: + if (vmclear(&vcpu->vc_control_pa)) + ret = EINVAL; + return (ret); +} + +/* + * vcpu_readregs_svm + * + * XXX - unimplemented + */ +int +vcpu_readregs_svm(struct vcpu *vcpu, uint64_t regmask, + struct vcpu_reg_state *regs) +{ + return (0); +} + +/* + * vcpu_writeregs_vmx + * + * Writes 'vcpu's registers + * + * Parameters: + * vcpu: the vcpu that has to get its registers written to + * regmask: the types of registers to write + * loadvmcs: bit to indicate whether the VMCS has to be loaded first + * vrs: the register values to write + * + * Return 
values: + * 0: if successful + * EINVAL an error writing registers occured + */ +int +vcpu_writeregs_vmx(struct vcpu *vcpu, uint64_t regmask, int loadvmcs, + struct vcpu_reg_state *vrs) +{ + int i, ret = 0; + uint16_t sel; + uint32_t limit, ar; + uint32_t *gprs = vrs->vrs_gprs; + uint32_t *crs = vrs->vrs_crs; + struct vcpu_segment_info *sregs = vrs->vrs_sregs; + + if (loadvmcs) { + if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa)) + return (EINVAL); + } + + if (regmask & VM_RWREGS_GPRS) { + vcpu->vc_gueststate.vg_eax = gprs[VCPU_REGS_EAX]; + vcpu->vc_gueststate.vg_ebx = gprs[VCPU_REGS_EBX]; + vcpu->vc_gueststate.vg_ecx = gprs[VCPU_REGS_ECX]; + vcpu->vc_gueststate.vg_edx = gprs[VCPU_REGS_EDX]; + vcpu->vc_gueststate.vg_esi = gprs[VCPU_REGS_ESI]; + vcpu->vc_gueststate.vg_edi = gprs[VCPU_REGS_EDI]; + vcpu->vc_gueststate.vg_ebp = gprs[VCPU_REGS_EBP]; + vcpu->vc_gueststate.vg_eip = gprs[VCPU_REGS_EIP]; + if (vmwrite(VMCS_GUEST_IA32_RIP, gprs[VCPU_REGS_EIP])) + goto errout; + if (vmwrite(VMCS_GUEST_IA32_RSP, gprs[VCPU_REGS_ESP])) + goto errout; + if (vmwrite(VMCS_GUEST_IA32_RFLAGS, gprs[VCPU_REGS_EFLAGS])) + goto errout; + } + if (regmask & VM_RWREGS_SREGS) { + for (i = 0; i < nitems(vmm_vmx_sreg_vmcs_fields); i++) { + sel = sregs[i].vsi_sel; + limit = sregs[i].vsi_limit; + ar = sregs[i].vsi_ar; + + if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].selid, sel)) + goto errout; + if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].limitid, limit)) + goto errout; + if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].arid, ar)) + goto errout; + if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].baseid, + sregs[i].vsi_base)) + goto errout; + } + + if (vmwrite(VMCS_GUEST_IA32_GDTR_LIMIT, + vrs->vrs_gdtr.vsi_limit)) + goto errout; + if (vmwrite(VMCS_GUEST_IA32_GDTR_BASE, + vrs->vrs_gdtr.vsi_base)) + goto errout; + if (vmwrite(VMCS_GUEST_IA32_IDTR_LIMIT, + vrs->vrs_idtr.vsi_limit)) + goto errout; + if (vmwrite(VMCS_GUEST_IA32_IDTR_BASE, + vrs->vrs_idtr.vsi_base)) + goto errout; + } + if (regmask & VM_RWREGS_CRS) { + if (vmwrite(VMCS_GUEST_IA32_CR0, crs[VCPU_REGS_CR0])) + goto errout; + if (vmwrite(VMCS_GUEST_IA32_CR3, crs[VCPU_REGS_CR3])) + goto errout; + if (vmwrite(VMCS_GUEST_IA32_CR4, crs[VCPU_REGS_CR4])) + goto errout; + } + + goto out; + +errout: + ret = EINVAL; +out: + if (loadvmcs) { + if (vmclear(&vcpu->vc_control_pa)) + ret = EINVAL; + } + return (ret); +} + +/* + * vcpu_writeregs_svm + * + * XXX - unimplemented + */ +int +vcpu_writeregs_svm(struct vcpu *vcpu, uint64_t regmask, + struct vcpu_reg_state *vrs) +{ + return (0); +} + +/* + * vcpu_reset_regs_svm + * + * XXX - unimplemented + */ +int +vcpu_reset_regs_svm(struct vcpu *vcpu, struct vcpu_reg_state *vrs) +{ + return (0); +} + +/* + * vmx_setmsrbr + * + * Allow read access to the specified msr on the supplied vcpu. 
+ * + * Parameters: + * vcpu: the VCPU to allow access + * msr: the MSR number to allow access to + */ +void +vmx_setmsrbr(struct vcpu *vcpu, uint32_t msr) +{ + uint8_t *msrs; + uint16_t idx; + + msrs = (uint8_t *)vcpu->vc_msr_bitmap_va; + + /* + * MSR Read bitmap layout: + * "Low" MSRs (0x0 - 0x1fff) @ 0x0 + * "High" MSRs (0xc0000000 - 0xc0001fff) @ 0x400 + */ + if (msr <= 0x1fff) { + idx = MSRIDX(msr); + msrs[idx] &= ~(MSRBIT(msr)); + } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) { + idx = MSRIDX(msr - 0xc0000000) + 0x400; + msrs[idx] &= ~(MSRBIT(msr - 0xc0000000)); + } else + printf("%s: invalid msr 0x%x\n", __func__, msr); +} + +/* + * vmx_setmsrbw + * + * Allow write access to the specified msr on the supplied vcpu + * + * Parameters: + * vcpu: the VCPU to allow access + * msr: the MSR number to allow access to + */ +void +vmx_setmsrbw(struct vcpu *vcpu, uint32_t msr) +{ + uint8_t *msrs; + uint16_t idx; + + msrs = (uint8_t *)vcpu->vc_msr_bitmap_va; + + /* + * MSR Write bitmap layout: + * "Low" MSRs (0x0 - 0x1fff) @ 0x800 + * "High" MSRs (0xc0000000 - 0xc0001fff) @ 0xc00 + */ + if (msr <= 0x1fff) { + idx = MSRIDX(msr) + 0x800; + msrs[idx] &= ~(MSRBIT(msr)); + } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) { + idx = MSRIDX(msr - 0xc0000000) + 0xc00; + msrs[idx] &= ~(MSRBIT(msr - 0xc0000000)); + } else + printf("%s: invalid msr 0x%x\n", __func__, msr); +} + +/* + * vmx_setmsrbrw + * + * Allow read/write access to the specified msr on the supplied vcpu + * + * Parameters: + * vcpu: the VCPU to allow access + * msr: the MSR number to allow access to + */ +void +vmx_setmsrbrw(struct vcpu *vcpu, uint32_t msr) +{ + vmx_setmsrbr(vcpu, msr); + vmx_setmsrbw(vcpu, msr); +} + +/* + * vcpu_reset_regs_vmx + * + * Initializes 'vcpu's registers to supplied state + * + * Parameters: + * vcpu: the vcpu whose register state is to be initialized + * vrs: the register state to set + * + * Return values: + * 0: registers init'ed successfully + * EINVAL: an error occurred setting register state + */ +int +vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs) +{ + int ret, ug; + uint32_t cr0, cr4; + uint32_t pinbased, procbased, procbased2, exit, entry; + uint32_t want1, want0; + uint64_t msr, ctrlval, eptp, cr3; + uint16_t ctrl; + struct vmx_msr_store *msr_store; + + ret = 0; + ug = 0; + + if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa)) + return (EINVAL); + + /* Compute Basic Entry / Exit Controls */ + vcpu->vc_vmx_basic = rdmsr(IA32_VMX_BASIC); + vcpu->vc_vmx_entry_ctls = rdmsr(IA32_VMX_ENTRY_CTLS); + vcpu->vc_vmx_exit_ctls = rdmsr(IA32_VMX_EXIT_CTLS); + vcpu->vc_vmx_pinbased_ctls = rdmsr(IA32_VMX_PINBASED_CTLS); + vcpu->vc_vmx_procbased_ctls = rdmsr(IA32_VMX_PROCBASED_CTLS); + + /* Compute True Entry / Exit Controls (if applicable) */ + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + vcpu->vc_vmx_true_entry_ctls = rdmsr(IA32_VMX_TRUE_ENTRY_CTLS); + vcpu->vc_vmx_true_exit_ctls = rdmsr(IA32_VMX_TRUE_EXIT_CTLS); + vcpu->vc_vmx_true_pinbased_ctls = + rdmsr(IA32_VMX_TRUE_PINBASED_CTLS); + vcpu->vc_vmx_true_procbased_ctls = + rdmsr(IA32_VMX_TRUE_PROCBASED_CTLS); + } + + /* Compute Secondary Procbased Controls (if applicable) */ + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) + vcpu->vc_vmx_procbased2_ctls = rdmsr(IA32_VMX_PROCBASED2_CTLS); + + /* + * Pinbased ctrls + * + * We must be able to set the following: + * IA32_VMX_EXTERNAL_INT_EXITING - exit on host interrupt + * IA32_VMX_NMI_EXITING - exit on host NMI + */ + want1 = 
IA32_VMX_EXTERNAL_INT_EXITING | + IA32_VMX_NMI_EXITING; + want0 = 0; + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + ctrl = IA32_VMX_TRUE_PINBASED_CTLS; + ctrlval = vcpu->vc_vmx_true_pinbased_ctls; + } else { + ctrl = IA32_VMX_PINBASED_CTLS; + ctrlval = vcpu->vc_vmx_pinbased_ctls; + } + + if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &pinbased)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_PINBASED_CTLS, pinbased)) { + ret = EINVAL; + goto exit; + } + + /* + * Procbased ctrls + * + * We must be able to set the following: + * IA32_VMX_HLT_EXITING - exit on HLT instruction + * IA32_VMX_MWAIT_EXITING - exit on MWAIT instruction + * IA32_VMX_UNCONDITIONAL_IO_EXITING - exit on I/O instructions + * IA32_VMX_USE_MSR_BITMAPS - exit on various MSR accesses + * IA32_VMX_CR8_LOAD_EXITING - guest TPR access + * IA32_VMX_CR8_STORE_EXITING - guest TPR access + * IA32_VMX_USE_TPR_SHADOW - guest TPR access (shadow) + * + * If we have EPT, we must be able to clear the following + * IA32_VMX_CR3_LOAD_EXITING - don't care about guest CR3 accesses + * IA32_VMX_CR3_STORE_EXITING - don't care about guest CR3 accesses + */ + want1 = IA32_VMX_HLT_EXITING | + IA32_VMX_MWAIT_EXITING | + IA32_VMX_UNCONDITIONAL_IO_EXITING | + IA32_VMX_USE_MSR_BITMAPS | + IA32_VMX_CR8_LOAD_EXITING | + IA32_VMX_CR8_STORE_EXITING | + IA32_VMX_USE_TPR_SHADOW; + want0 = 0; + + if (vmm_softc->mode == VMM_MODE_EPT) { + want1 |= IA32_VMX_ACTIVATE_SECONDARY_CONTROLS; + want0 |= IA32_VMX_CR3_LOAD_EXITING | + IA32_VMX_CR3_STORE_EXITING; + } + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + ctrl = IA32_VMX_TRUE_PROCBASED_CTLS; + ctrlval = vcpu->vc_vmx_true_procbased_ctls; + } else { + ctrl = IA32_VMX_PROCBASED_CTLS; + ctrlval = vcpu->vc_vmx_procbased_ctls; + } + + if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &procbased)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) { + ret = EINVAL; + goto exit; + } + + /* + * Secondary Procbased ctrls + * + * We want to be able to set the following, if available: + * IA32_VMX_ENABLE_VPID - use VPIDs where available + * + * If we have EPT, we must be able to set the following: + * IA32_VMX_ENABLE_EPT - enable EPT + * + * If we have unrestricted guest capability, we must be able to set + * the following: + * IA32_VMX_UNRESTRICTED_GUEST - enable unrestricted guest + */ + want1 = 0; + + /* XXX checking for 2ndary controls can be combined here */ + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_VPID, 1)) + want1 |= IA32_VMX_ENABLE_VPID; + } + + if (vmm_softc->mode == VMM_MODE_EPT) + want1 |= IA32_VMX_ENABLE_EPT; + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_UNRESTRICTED_GUEST, 1)) { + want1 |= IA32_VMX_UNRESTRICTED_GUEST; + ug = 1; + } + } + + want0 = ~want1; + ctrlval = vcpu->vc_vmx_procbased2_ctls; + ctrl = IA32_VMX_PROCBASED2_CTLS; + + if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &procbased2)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_PROCBASED2_CTLS, procbased2)) { + ret = EINVAL; + goto exit; + } + + /* + * Exit ctrls + * + * We must be able to set the following: + * IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT - ack interrupt on exit + * XXX clear save_debug_ctrls on exit ? 
+ */ + want1 = IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT; + want0 = 0; + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + ctrl = IA32_VMX_TRUE_EXIT_CTLS; + ctrlval = vcpu->vc_vmx_true_exit_ctls; + } else { + ctrl = IA32_VMX_EXIT_CTLS; + ctrlval = vcpu->vc_vmx_exit_ctls; + } + + if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &exit)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_CTLS, exit)) { + ret = EINVAL; + goto exit; + } + + /* + * Entry ctrls + * + * We must be able to set the following: + * IA32_VMX_IA32E_MODE_GUEST (if no unrestricted guest) + * We must be able to clear the following: + * IA32_VMX_ENTRY_TO_SMM - enter to SMM + * IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT + * IA32_VMX_LOAD_DEBUG_CONTROLS + * IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY + */ + if (ug == 1) + want1 = 0; + else + want1 = IA32_VMX_IA32E_MODE_GUEST; + + want0 = IA32_VMX_ENTRY_TO_SMM | + IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT | + IA32_VMX_LOAD_DEBUG_CONTROLS | + IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY; + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + ctrl = IA32_VMX_TRUE_ENTRY_CTLS; + ctrlval = vcpu->vc_vmx_true_entry_ctls; + } else { + ctrl = IA32_VMX_ENTRY_CTLS; + ctrlval = vcpu->vc_vmx_entry_ctls; + } + + if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &entry)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_ENTRY_CTLS, entry)) { + ret = EINVAL; + goto exit; + } + + if (vmm_softc->mode == VMM_MODE_EPT) { + eptp = vcpu->vc_parent->vm_map->pmap->pm_npt_pa; + msr = rdmsr(IA32_VMX_EPT_VPID_CAP); + if (msr & IA32_EPT_VPID_CAP_PAGE_WALK_4) { + /* Page walk length 4 supported */ + eptp |= ((IA32_EPT_PAGE_WALK_LENGTH - 1) << 3); + } + + if (msr & IA32_EPT_VPID_CAP_WB) { + /* WB cache type supported */ + eptp |= IA32_EPT_PAGING_CACHE_TYPE_WB; + } + + if (msr & IA32_EPT_VPID_CAP_AD_BITS) { + /* EPT A/D bits supported */ + eptp |= IA32_EPT_AD_BITS_ENABLE; + } + + DPRINTF("guest eptp = 0x%llx\n", eptp); + DPRINTF("write 0x%x to EPT_LO\n", (uint32_t)(eptp & 0xFFFFFFFFUL)); + if (vmwrite(VMCS_GUEST_IA32_EPTP, (uint32_t)(eptp & 0xFFFFFFFFUL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_EPTP_HI, 0)) { + ret = EINVAL; + goto exit; + } + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_VPID, 1)) + if (vmwrite(VMCS_GUEST_VPID, + (uint16_t)vcpu->vc_parent->vm_id)) { + ret = EINVAL; + goto exit; + } + } + + /* + * Determine which bits in CR0 have to be set to a fixed + * value as per Intel SDM A.7. + * CR0 bits in the vrs parameter must match these. + */ + + want1 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0) & + (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1); + want0 = ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0) & + ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1); + + /* + * CR0_FIXED0 and CR0_FIXED1 may report the CR0_PG and CR0_PE bits as + * fixed to 1 even if the CPU supports the unrestricted guest + * feature. Update want1 and want0 accordingly to allow + * any value for CR0_PG and CR0_PE in vrs->vrs_crs[VCPU_REGS_CR0] if + * the CPU has the unrestricted guest capability. + */ + cr0 = vrs->vrs_crs[VCPU_REGS_CR0]; + + if (ug) { + want1 &= ~(CR0_PG | CR0_PE); + want0 &= ~(CR0_PG | CR0_PE); + cr0 &= ~(CR0_PG | CR0_PE); + } + + /* + * VMX may require some bits to be set that userland should not have + * to care about. Set those here. 
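+	 * (CR0_NE, for example, is fixed to 1 in VMX operation, so it is
+	 * forced on below rather than rejecting the supplied register state.)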
+ */ + if (want1 & CR0_NE) + cr0 |= CR0_NE; + + if ((cr0 & want1) != want1) { + ret = EINVAL; + goto exit; + } + if ((~cr0 & want0) != want0) { + ret = EINVAL; + goto exit; + } + + if (ug) + cr3 = 0; + else + cr3 = vrs->vrs_crs[VCPU_REGS_CR3]; + + /* + * Determine default CR4 as per Intel SDM A.8 + * All flexible bits are set to 0 + */ + cr4 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0) & + (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1); + + /* + * If we are starting in restricted guest mode, enable PAE + */ + if (ug == 0) + cr4 |= CR4_PAE; + + vrs->vrs_crs[VCPU_REGS_CR0] = cr0; + vrs->vrs_crs[VCPU_REGS_CR3] = cr3; + vrs->vrs_crs[VCPU_REGS_CR4] = cr4; + + /* + * Select MSRs to be loaded on exit + */ + msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_load_va; + msr_store[0].vms_index = MSR_EFER; + msr_store[0].vms_data = rdmsr(MSR_EFER); + + /* + * Select MSRs to be loaded on entry / saved on exit + */ + msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; + + msr_store[0].vms_index = MSR_EFER; + msr_store[0].vms_data = 0ULL; /* Initial value */ + + /* + * Currently we have the same count of entry/exit MSRs loads/stores + * but this is not an architectural requirement. + */ + if (vmwrite(VMCS_EXIT_MSR_STORE_COUNT, VMX_NUM_MSR_STORE)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, VMX_NUM_MSR_STORE)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, VMX_NUM_MSR_STORE)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS, + vcpu->vc_vmx_msr_exit_save_pa)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS_HI, 0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS, + vcpu->vc_vmx_msr_exit_load_pa)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS_HI, 0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS, + vcpu->vc_vmx_msr_exit_save_pa)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS_HI, 0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_MSR_BITMAP_ADDRESS, + vcpu->vc_msr_bitmap_pa)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_MSR_BITMAP_ADDRESS_HI, 0)) { + ret = EINVAL; + goto exit; + } + + /* + * Set up the VMCS for the register state we want during VCPU start. + * This matches what the CPU state would be after a bootloader + * transition to 'start'. + */ + ret = vcpu_writeregs_vmx(vcpu, VM_RWREGS_ALL, 0, vrs); + + /* + * Set up the MSR bitmap + */ + memset((uint8_t *)vcpu->vc_msr_bitmap_va, 0xFF, PAGE_SIZE); + vmx_setmsrbrw(vcpu, MSR_IA32_FEATURE_CONTROL); + vmx_setmsrbrw(vcpu, MSR_MTRRcap); + vmx_setmsrbrw(vcpu, MSR_SYSENTER_CS); + vmx_setmsrbrw(vcpu, MSR_SYSENTER_ESP); + vmx_setmsrbrw(vcpu, MSR_SYSENTER_EIP); + vmx_setmsrbrw(vcpu, MSR_MTRRvarBase); + vmx_setmsrbrw(vcpu, MSR_CR_PAT); + vmx_setmsrbrw(vcpu, MSR_MTRRdefType); + vmx_setmsrbrw(vcpu, MSR_EFER); + vmx_setmsrbrw(vcpu, MSR_STAR); + vmx_setmsrbrw(vcpu, MSR_LSTAR); + vmx_setmsrbrw(vcpu, MSR_CSTAR); + vmx_setmsrbrw(vcpu, MSR_SFMASK); + vmx_setmsrbrw(vcpu, MSR_FSBASE); + vmx_setmsrbrw(vcpu, MSR_GSBASE); + vmx_setmsrbrw(vcpu, MSR_KERNELGSBASE); + + + /* XXX CR0 shadow */ + /* XXX CR4 shadow */ + + /* Flush the VMCS */ + if (vmclear(&vcpu->vc_control_pa)) { + ret = EINVAL; + goto exit; + } + +exit: + return (ret); +} + +/* + * vcpu_init_vmx + * + * Intel VMX specific VCPU initialization routine. 
+ * + * This function allocates various per-VCPU memory regions, sets up initial + * VCPU VMCS controls, and sets initial register values. + */ +int +vcpu_init_vmx(struct vcpu *vcpu) +{ + struct vmcs *vmcs; + uint32_t cr0, cr4; + int ret; + + ret = 0; + + /* Allocate VMCS VA */ + vcpu->vc_control_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero, + &kd_waitok); + + if (!vcpu->vc_control_va) + return (ENOMEM); + + /* Compute VMCS PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_control_va, + (paddr_t *)&vcpu->vc_control_pa)) { + ret = ENOMEM; + goto exit; + } + + /* Allocate MSR bitmap VA */ + vcpu->vc_msr_bitmap_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero, + &kd_waitok); + + if (!vcpu->vc_msr_bitmap_va) { + ret = ENOMEM; + goto exit; + } + + /* Compute MSR bitmap PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_msr_bitmap_va, + (paddr_t *)&vcpu->vc_msr_bitmap_pa)) { + ret = ENOMEM; + goto exit; + } + + /* Allocate MSR exit load area VA */ + vcpu->vc_vmx_msr_exit_load_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, + &kp_zero, &kd_waitok); + + if (!vcpu->vc_vmx_msr_exit_load_va) { + ret = ENOMEM; + goto exit; + } + + /* Compute MSR exit load area PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_exit_load_va, + &vcpu->vc_vmx_msr_exit_load_pa)) { + ret = ENOMEM; + goto exit; + } + + /* Allocate MSR exit save area VA */ + vcpu->vc_vmx_msr_exit_save_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, + &kp_zero, &kd_waitok); + + if (!vcpu->vc_vmx_msr_exit_save_va) { + ret = ENOMEM; + goto exit; + } + + /* Compute MSR exit save area PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_exit_save_va, + &vcpu->vc_vmx_msr_exit_save_pa)) { + ret = ENOMEM; + goto exit; + } + + /* Allocate MSR entry load area VA */ + vcpu->vc_vmx_msr_entry_load_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, + &kp_zero, &kd_waitok); + + if (!vcpu->vc_vmx_msr_entry_load_va) { + ret = ENOMEM; + goto exit; + } + + /* Compute MSR entry load area PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_entry_load_va, + &vcpu->vc_vmx_msr_entry_load_pa)) { + ret = ENOMEM; + goto exit; + } + + vmcs = (struct vmcs *)vcpu->vc_control_va; + vmcs->vmcs_revision = curcpu()->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision; + + /* + * Load the VMCS onto this PCPU so we can write registers + */ + if (vmptrld(&vcpu->vc_control_pa)) { + ret = EINVAL; + goto exit; + } + + /* Host CR0 */ + cr0 = rcr0(); + if (vmwrite(VMCS_HOST_IA32_CR0, cr0)) { + ret = EINVAL; + goto exit; + } + + /* Host CR4 */ + cr4 = rcr4(); + if (vmwrite(VMCS_HOST_IA32_CR4, cr4)) { + ret = EINVAL; + goto exit; + } + + /* Host Segment Selectors */ + if (vmwrite(VMCS_HOST_IA32_CS_SEL, GSEL(GCODE_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_DS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_ES_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_FS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_GS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_SS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_TR_SEL, proc0.p_md.md_tss_sel)) { + ret = EINVAL; + goto exit; + } + + /* Host IDTR base */ + if (vmwrite(VMCS_HOST_IA32_IDTR_BASE, (uint32_t)idt)) { + ret = EINVAL; + goto exit; + } + + /* VMCS link */ + if (vmwrite(VMCS_LINK_POINTER, 0xFFFFFFFF)) { + ret = EINVAL; + goto exit; + } + + if 
(vmwrite(VMCS_LINK_POINTER_HI, 0xFFFFFFFF)) { + ret = EINVAL; + goto exit; + } + +exit: + if (ret) { + if (vcpu->vc_control_va) + km_free((void *)vcpu->vc_control_va, PAGE_SIZE, + &kv_page, &kp_zero); + if (vcpu->vc_msr_bitmap_va) + km_free((void *)vcpu->vc_msr_bitmap_va, PAGE_SIZE, + &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_exit_save_va) + km_free((void *)vcpu->vc_vmx_msr_exit_save_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_exit_load_va) + km_free((void *)vcpu->vc_vmx_msr_exit_load_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_entry_load_va) + km_free((void *)vcpu->vc_vmx_msr_entry_load_va, + PAGE_SIZE, &kv_page, &kp_zero); + } + + return (ret); +} + +/* + * vcpu_reset_regs + * + * Resets a vcpu's registers to the provided state + * + * Parameters: + * vcpu: the vcpu whose registers shall be reset + * vrs: the desired register state + * + * Return values: + * 0: the vcpu's registers were successfully reset + * !0: the vcpu's registers could not be reset (see arch-specific reset + * function for various values that can be returned here) + */ +int +vcpu_reset_regs(struct vcpu *vcpu, struct vcpu_reg_state *vrs) +{ + int ret; + + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + ret = vcpu_reset_regs_vmx(vcpu, vrs); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + ret = vcpu_reset_regs_svm(vcpu, vrs); + else + panic("unknown vmm mode\n"); + + return (ret); +} + +/* + * vcpu_init_svm + * + * AMD SVM specific VCPU initialization routine. + */ +int +vcpu_init_svm(struct vcpu *vcpu) +{ + /* XXX removed due to rot */ + return (0); +} + +/* + * vcpu_init + * + * Calls the architecture-specific VCPU init routine + */ +int +vcpu_init(struct vcpu *vcpu) +{ + int ret = 0; + + vcpu->vc_hsa_stack_va = (vaddr_t)malloc(PAGE_SIZE, M_DEVBUF, + M_NOWAIT|M_ZERO); + if (!vcpu->vc_hsa_stack_va) + return (ENOMEM); + + vcpu->vc_virt_mode = vmm_softc->mode; + vcpu->vc_state = VCPU_STATE_STOPPED; + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + ret = vcpu_init_vmx(vcpu); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + ret = vcpu_init_svm(vcpu); + else + panic("unknown vmm mode\n"); + + if (ret) + free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE); + + return (ret); +} + +/* + * vcpu_deinit_vmx + * + * Deinitializes the vcpu described by 'vcpu' + */ +void +vcpu_deinit_vmx(struct vcpu *vcpu) +{ + if (vcpu->vc_control_va) + km_free((void *)vcpu->vc_control_va, PAGE_SIZE, + &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_exit_save_va) + km_free((void *)vcpu->vc_vmx_msr_exit_save_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_exit_load_va) + km_free((void *)vcpu->vc_vmx_msr_exit_load_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_entry_load_va) + km_free((void *)vcpu->vc_vmx_msr_entry_load_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_hsa_stack_va) + free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE); +} + +/* + * vcpu_deinit_svm + * + * Deinitializes the vcpu described by 'vcpu' + */ +void +vcpu_deinit_svm(struct vcpu *vcpu) +{ + /* Unused */ +} + +/* + * vcpu_deinit + * + * Calls the architecture-specific VCPU deinit routine + */ +void +vcpu_deinit(struct vcpu *vcpu) +{ + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + vcpu_deinit_vmx(vcpu); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + vcpu_deinit_svm(vcpu); + else + panic("unknown vmm mode\n"); +} + +/* 
+ * vm_teardown + * + * Tears down (destroys) the vm indicated by 'vm'. + */ +void +vm_teardown(struct vm *vm) +{ + struct vcpu *vcpu, *tmp; + + /* Free VCPUs */ + rw_enter_write(&vm->vm_vcpu_lock); + SLIST_FOREACH_SAFE(vcpu, &vm->vm_vcpu_list, vc_vcpu_link, tmp) { + SLIST_REMOVE(&vm->vm_vcpu_list, vcpu, vcpu, vc_vcpu_link); + vcpu_deinit(vcpu); + pool_put(&vcpu_pool, vcpu); + } + + vm_impl_deinit(vm); + + /* teardown guest vmspace */ + if (vm->vm_map != NULL) + uvm_map_deallocate(vm->vm_map); + + vmm_softc->vm_ct--; + if (vmm_softc->vm_ct < 1) + vmm_stop(); + rw_exit_write(&vm->vm_vcpu_lock); + pool_put(&vm_pool, vm); +} + +/* + * vcpu_vmx_check_cap + * + * Checks if the 'cap' bit in the 'msr' MSR can be set or cleared (set = 1 + * or set = 0, respectively). + * + * When considering 'msr', we check to see if true controls are available, + * and use those if so. + * + * Returns 1 of 'cap' can be set/cleared as requested, 0 otherwise. + */ +int +vcpu_vmx_check_cap(struct vcpu *vcpu, uint32_t msr, uint32_t cap, int set) +{ + uint64_t ctl; + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + switch (msr) { + case IA32_VMX_PINBASED_CTLS: + ctl = vcpu->vc_vmx_true_pinbased_ctls; + break; + case IA32_VMX_PROCBASED_CTLS: + ctl = vcpu->vc_vmx_true_procbased_ctls; + break; + case IA32_VMX_PROCBASED2_CTLS: + ctl = vcpu->vc_vmx_procbased2_ctls; + break; + case IA32_VMX_ENTRY_CTLS: + ctl = vcpu->vc_vmx_true_entry_ctls; + break; + case IA32_VMX_EXIT_CTLS: + ctl = vcpu->vc_vmx_true_exit_ctls; + break; + default: + return (0); + } + } else { + switch (msr) { + case IA32_VMX_PINBASED_CTLS: + ctl = vcpu->vc_vmx_pinbased_ctls; + break; + case IA32_VMX_PROCBASED_CTLS: + ctl = vcpu->vc_vmx_procbased_ctls; + break; + case IA32_VMX_PROCBASED2_CTLS: + ctl = vcpu->vc_vmx_procbased2_ctls; + break; + case IA32_VMX_ENTRY_CTLS: + ctl = vcpu->vc_vmx_entry_ctls; + break; + case IA32_VMX_EXIT_CTLS: + ctl = vcpu->vc_vmx_exit_ctls; + break; + default: + return (0); + } + } + + if (set) { + /* Check bit 'cap << 32', must be !0 */ + return (ctl & ((uint64_t)cap << 32)) != 0; + } else { + /* Check bit 'cap', must be 0 */ + return (ctl & cap) == 0; + } +} + +/* + * vcpu_vmx_compute_ctrl + * + * Computes the appropriate control value, given the supplied parameters + * and CPU capabilities. + * + * Intel has made somewhat of a mess of this computation - it is described + * using no fewer than three different approaches, spread across many + * pages of the SDM. Further compounding the problem is the fact that now + * we have "true controls" for each type of "control", and each needs to + * be examined to get the calculation right, but only if "true" controls + * are present on the CPU we're on. + * + * Parameters: + * ctrlval: the control value, as read from the CPU MSR + * ctrl: which control is being set (eg, pinbased, procbased, etc) + * want0: the set of desired 0 bits + * want1: the set of desired 1 bits + * out: (out) the correct value to write into the VMCS for this VCPU, + * for the 'ctrl' desired. + * + * Returns 0 if successful, or EINVAL if the supplied parameters define + * an unworkable control setup. + */ +int +vcpu_vmx_compute_ctrl(uint64_t ctrlval, uint16_t ctrl, uint32_t want1, + uint32_t want0, uint32_t *out) +{ + int i, set, clear; + + /* + * The Intel SDM gives three formulae for determining which bits to + * set/clear for a given control and desired functionality. Formula + * 1 is the simplest but disallows use of newer features that are + * enabled by functionality in later CPUs. 
+ * + * Formulas 2 and 3 allow such extra functionality. We use formula + * 2 - this requires us to know the identity of controls in the + * "default1" class for each control register, but allows us to not + * have to pass along and/or query both sets of capability MSRs for + * each control lookup. This makes the code slightly longer, + * however. + */ + for (i = 0; i < 32; i++) { + /* Figure out if we can set and / or clear this bit */ + set = (ctrlval & (1ULL << (i + 32))) != 0; + clear = ((1ULL << i) & ((uint64_t)ctrlval)) == 0; + + /* If the bit can't be set nor cleared, something's wrong */ + if (!set && !clear) + return (EINVAL); + + /* + * Formula 2.c.i - "If the relevant VMX capability MSR + * reports that a control has a single setting, use that + * setting." + */ + if (set && !clear) { + if (want0 & (1ULL << i)) + return (EINVAL); + else + *out |= (1ULL << i); + } else if (clear && !set) { + if (want1 & (1ULL << i)) + return (EINVAL); + else + *out &= ~(1ULL << i); + } else { + /* + * 2.c.ii - "If the relevant VMX capability MSR + * reports that a control can be set to 0 or 1 + * and that control's meaning is known to the VMM, + * set the control based on the functionality desired." + */ + if (want1 & (1ULL << i)) + *out |= (1ULL << i); + else if (want0 & (1 << i)) + *out &= ~(1ULL << i); + else { + /* + * ... assuming the control's meaning is not + * known to the VMM ... + * + * 2.c.iii - "If the relevant VMX capability + * MSR reports that a control can be set to 0 + * or 1 and the control is not in the default1 + * class, set the control to 0." + * + * 2.c.iv - "If the relevant VMX capability + * MSR reports that a control can be set to 0 + * or 1 and the control is in the default1 + * class, set the control to 1." + */ + switch (ctrl) { + case IA32_VMX_PINBASED_CTLS: + case IA32_VMX_TRUE_PINBASED_CTLS: + /* + * A.3.1 - default1 class of pinbased + * controls comprises bits 1,2,4 + */ + switch (i) { + case 1: + case 2: + case 4: + *out |= (1ULL << i); + break; + default: + *out &= ~(1ULL << i); + break; + } + break; + case IA32_VMX_PROCBASED_CTLS: + case IA32_VMX_TRUE_PROCBASED_CTLS: + /* + * A.3.2 - default1 class of procbased + * controls comprises bits 1, 4-6, 8, + * 13-16, 26 + */ + switch (i) { + case 1: + case 4 ... 6: + case 8: + case 13 ... 16: + case 26: + *out |= (1ULL << i); + break; + default: + *out &= ~(1ULL << i); + break; + } + break; + /* + * Unknown secondary procbased controls + * can always be set to 0 + */ + case IA32_VMX_PROCBASED2_CTLS: + *out &= ~(1ULL << i); + break; + case IA32_VMX_EXIT_CTLS: + case IA32_VMX_TRUE_EXIT_CTLS: + /* + * A.4 - default1 class of exit + * controls comprises bits 0-8, 10, + * 11, 13, 14, 16, 17 + */ + switch (i) { + case 0 ... 8: + case 10 ... 11: + case 13 ... 14: + case 16 ... 17: + *out |= (1ULL << i); + break; + default: + *out &= ~(1ULL << i); + break; + } + break; + case IA32_VMX_ENTRY_CTLS: + case IA32_VMX_TRUE_ENTRY_CTLS: + /* + * A.5 - default1 class of entry + * controls comprises bits 0-8, 12 + */ + switch (i) { + case 0 ... 8: + case 12: + *out |= (1ULL << i); + break; + default: + *out &= ~(1ULL << i); + break; + } + break; + } + } + } + } + + return (0); +} + +/* + * vm_get_info + * + * Returns information about the VM indicated by 'vip'. 
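+ *
+ * If the buffer described by 'vip' is too small to hold one result per
+ * VM, nothing is copied out; vip_info_ct is set to 0 and vip_size is
+ * updated to the required size so the caller can retry.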
+ */ +int +vm_get_info(struct vm_info_params *vip) +{ + struct vm_info_result *out; + struct vm *vm; + struct vcpu *vcpu; + int i, j; + size_t need; + + rw_enter_read(&vmm_softc->vm_lock); + need = vmm_softc->vm_ct * sizeof(struct vm_info_result); + if (vip->vip_size < need) { + vip->vip_info_ct = 0; + vip->vip_size = need; + rw_exit_read(&vmm_softc->vm_lock); + return (0); + } + + out = malloc(need, M_DEVBUF, M_NOWAIT|M_ZERO); + if (out == NULL) { + vip->vip_info_ct = 0; + rw_exit_read(&vmm_softc->vm_lock); + return (ENOMEM); + } + + i = 0; + vip->vip_info_ct = vmm_softc->vm_ct; + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + out[i].vir_memory_size = vm->vm_memory_size; + out[i].vir_used_size = + pmap_resident_count(vm->vm_map->pmap) * PAGE_SIZE; + out[i].vir_ncpus = vm->vm_vcpu_ct; + out[i].vir_id = vm->vm_id; + out[i].vir_creator_pid = vm->vm_creator_pid; + strncpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN); + rw_enter_read(&vm->vm_vcpu_lock); + for (j = 0; j < vm->vm_vcpu_ct; j++) { + out[i].vir_vcpu_state[j] = VCPU_STATE_UNKNOWN; + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, + vc_vcpu_link) { + if (vcpu->vc_id == j) + out[i].vir_vcpu_state[j] = + vcpu->vc_state; + } + } + rw_exit_read(&vm->vm_vcpu_lock); + i++; + } + rw_exit_read(&vmm_softc->vm_lock); + if (copyout(out, vip->vip_info, need) == EFAULT) { + free(out, M_DEVBUF, need); + return (EFAULT); + } + + free(out, M_DEVBUF, need); + return (0); +} + +/* + * vm_terminate + * + * Terminates the VM indicated by 'vtp'. + */ +int +vm_terminate(struct vm_terminate_params *vtp) +{ + struct vm *vm; + struct vcpu *vcpu; + u_int old, next; + + /* + * Find desired VM + */ + rw_enter_read(&vmm_softc->vm_lock); + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vtp->vtp_vm_id) + break; + } + + if (vm != NULL) { + rw_enter_read(&vm->vm_vcpu_lock); + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) { + do { + old = vcpu->vc_state; + if (old == VCPU_STATE_RUNNING) + next = VCPU_STATE_REQTERM; + else if (old == VCPU_STATE_STOPPED) + next = VCPU_STATE_TERMINATED; + else /* must be REQTERM or TERMINATED */ + break; + } while (old != atomic_cas_uint(&vcpu->vc_state, + old, next)); + } + rw_exit_read(&vm->vm_vcpu_lock); + } + rw_exit_read(&vmm_softc->vm_lock); + + if (vm == NULL) + return (ENOENT); + + /* XXX possible race here two threads terminating the same vm? */ + rw_enter_write(&vmm_softc->vm_lock); + SLIST_REMOVE(&vmm_softc->vm_list, vm, vm, vm_link); + rw_exit_write(&vmm_softc->vm_lock); + if (vm->vm_vcpus_running == 0) + vm_teardown(vm); + + return (0); +} + +/* + * vm_run + * + * Run the vm / vcpu specified by 'vrp' + */ +int +vm_run(struct vm_run_params *vrp) +{ + struct vm *vm; + struct vcpu *vcpu; + int ret = 0; + u_int old, next; + + /* + * Find desired VM + */ + rw_enter_read(&vmm_softc->vm_lock); + + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vrp->vrp_vm_id) + break; + } + + /* + * Attempt to locate the requested VCPU. If found, attempt to + * to transition from VCPU_STATE_STOPPED -> VCPU_STATE_RUNNING. + * Failure to make the transition indicates the VCPU is busy. 
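+	 * The transition is done with atomic_cas_uint so that two concurrent
+	 * callers cannot both start the same VCPU; the loser gets EBUSY.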
+ */ + if (vm != NULL) { + rw_enter_read(&vm->vm_vcpu_lock); + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) { + if (vcpu->vc_id == vrp->vrp_vcpu_id) + break; + } + + if (vcpu != NULL) { + old = VCPU_STATE_STOPPED; + next = VCPU_STATE_RUNNING; + + if (atomic_cas_uint(&vcpu->vc_state, old, next) != old) + ret = EBUSY; + else + atomic_inc_int(&vm->vm_vcpus_running); + } + rw_exit_read(&vm->vm_vcpu_lock); + + if (vcpu == NULL) + ret = ENOENT; + } + rw_exit_read(&vmm_softc->vm_lock); + + if (vm == NULL) + ret = ENOENT; + + /* Bail if errors detected in the previous steps */ + if (ret) + return (ret); + + /* + * We may be returning from userland helping us from the last exit. + * If so (vrp_continue == 1), copy in the exit data from vmd. The + * exit data will be consumed before the next entry (this typically + * comprises VCPU register changes as the result of vmd(8)'s actions). + */ + if (vrp->vrp_continue) { + if (copyin(vrp->vrp_exit, &vcpu->vc_exit, + sizeof(union vm_exit)) == EFAULT) { + return (EFAULT); + } + } + + /* Run the VCPU specified in vrp */ + if (vcpu->vc_virt_mode == VMM_MODE_VMX || + vcpu->vc_virt_mode == VMM_MODE_EPT) { + ret = vcpu_run_vmx(vcpu, vrp); + } else if (vcpu->vc_virt_mode == VMM_MODE_SVM || + vcpu->vc_virt_mode == VMM_MODE_RVI) { + ret = vcpu_run_svm(vcpu, vrp); + } + + /* + * We can set the VCPU states here without CAS because once + * a VCPU is in state RUNNING or REQTERM, only the VCPU itself + * can switch the state. + */ + atomic_dec_int(&vm->vm_vcpus_running); + if (vcpu->vc_state == VCPU_STATE_REQTERM) { + vrp->vrp_exit_reason = VM_EXIT_TERMINATED; + vcpu->vc_state = VCPU_STATE_TERMINATED; + if (vm->vm_vcpus_running == 0) + vm_teardown(vm); + ret = 0; + } else if (ret == EAGAIN) { + /* If we are exiting, populate exit data so vmd can help. */ + vrp->vrp_exit_reason = vcpu->vc_gueststate.vg_exit_reason; + vrp->vrp_irqready = vcpu->vc_irqready; + vcpu->vc_state = VCPU_STATE_STOPPED; + + if (copyout(&vcpu->vc_exit, vrp->vrp_exit, + sizeof(union vm_exit)) == EFAULT) { + ret = EFAULT; + } else + ret = 0; + } else if (ret == 0) { + vrp->vrp_exit_reason = VM_EXIT_NONE; + vcpu->vc_state = VCPU_STATE_STOPPED; + } else { + vrp->vrp_exit_reason = VM_EXIT_TERMINATED; + vcpu->vc_state = VCPU_STATE_TERMINATED; + } + + return (ret); +} + +/* + * vcpu_must_stop + * + * Check if we need to (temporarily) stop running the VCPU for some reason, + * such as: + * - the VM was requested to terminate + * - the proc running this VCPU has pending signals + */ +int +vcpu_must_stop(struct vcpu *vcpu) +{ + struct proc *p = curproc; + + if (vcpu->vc_state == VCPU_STATE_REQTERM) + return (1); + if (CURSIG(p) != 0) + return (1); + return (0); +} + +/* + * vcpu_run_vmx + * + * VMM main loop used to run a VCPU. + * + * Parameters: + * vcpu: The VCPU to run + * vrp: run parameters + * + * Return values: + * 0: The run loop exited and no help is needed from vmd + * EAGAIN: The run loop exited and help from vmd is needed + * EINVAL: an error occured + */ +int +vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *vrp) +{ + int ret = 0, resume, locked, exitinfo; + struct region_descriptor gdt; + struct cpu_info *ci; + uint64_t cr3, vmcs_ptr; + uint32_t insn_error, exit_reason; + struct schedstate_percpu *spc; + struct vmx_invvpid_descriptor vid; + uint32_t eii; + uint32_t procbased; + uint16_t irq; + + resume = 0; + irq = vrp->vrp_irq; + + /* + * If we are returning from userspace (vmd) because we exited + * last time, fix up any needed vcpu state first. 
Which state + * needs to be fixed up depends on what vmd populated in the + * exit data structure. + */ + if (vrp->vrp_continue) { + switch (vcpu->vc_gueststate.vg_exit_reason) { + case VMX_EXIT_IO: + vcpu->vc_gueststate.vg_eax = + vcpu->vc_exit.vei.vei_data; + break; + case VMX_EXIT_HLT: + break; + case VMX_EXIT_INT_WINDOW: + break; + case VMX_EXIT_EXTINT: + break; + case VMX_EXIT_EPT_VIOLATION: + break; +#ifdef VMM_DEBUG + case VMX_EXIT_TRIPLE_FAULT: + DPRINTF("%s: vm %d vcpu %d triple fault\n", + __func__, vcpu->vc_parent->vm_id, + vcpu->vc_id); + vmx_vcpu_dump_regs(vcpu); + dump_vcpu(vcpu); + vmx_dump_vmcs(vcpu); + break; + case VMX_EXIT_ENTRY_FAILED_GUEST_STATE: + DPRINTF("%s: vm %d vcpu %d failed entry " + "due to invalid guest state\n", + __func__, vcpu->vc_parent->vm_id, + vcpu->vc_id); + vmx_vcpu_dump_regs(vcpu); + dump_vcpu(vcpu); + return EINVAL; + default: + DPRINTF("%s: unimplemented exit type %d (%s)\n", + __func__, + vcpu->vc_gueststate.vg_exit_reason, + vmx_exit_reason_decode( + vcpu->vc_gueststate.vg_exit_reason)); + vmx_vcpu_dump_regs(vcpu); + dump_vcpu(vcpu); + break; +#endif /* VMM_DEBUG */ + } + } + + while (ret == 0) { + if (!resume) { + /* + * We are launching for the first time, or we are + * resuming from a different pcpu, so we need to + * reset certain pcpu-specific values. + */ + ci = curcpu(); + setregion(&gdt, ci->ci_gdt, NGDT * sizeof(union descriptor) - 1); + + vcpu->vc_last_pcpu = ci; + + if (vmptrld(&vcpu->vc_control_pa)) { + ret = EINVAL; + break; + } + + if (gdt.rd_base == 0) { + ret = EINVAL; + break; + } + + /* Host GDTR base */ + if (vmwrite(VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base)) { + ret = EINVAL; + break; + } + + /* Host TR base */ + if (vmwrite(VMCS_HOST_IA32_TR_BASE, + proc0.p_md.md_tss_sel)) { + ret = EINVAL; + break; + } + + /* Host CR3 */ + cr3 = rcr3(); + if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) { + ret = EINVAL; + break; + } + } + + /* Handle vmd(8) injected interrupts */ + /* XXX - 0x20 should be changed to PIC's vector base */ + + /* Is there an interrupt pending injection? 
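+		 * (vrp_irq is 0xFFFF when vmd has no interrupt to inject.)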
*/ + if (irq != 0xFFFF) { + if (!vcpu->vc_irqready) { + printf("vcpu_run_vmx: error - irq injected" + " while not ready\n"); + ret = EINVAL; + break; + } + + eii = (irq & 0xFF) + 0x20; + eii |= (1ULL << 31); /* Valid */ + eii |= (0ULL << 8); /* Hardware Interrupt */ + if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) { + printf("vcpu_run_vmx: can't vector " + "interrupt to guest\n"); + ret = EINVAL; + break; + } + + irq = 0xFFFF; + } else if (!vcpu->vc_intr) { + /* + * Disable window exiting + */ + if (vmread(VMCS_PROCBASED_CTLS, &procbased)) { + printf("vcpu_run_vmx: can't read" + "procbased ctls on exit\n"); + ret = EINVAL; + break; + } else { + procbased &= ~IA32_VMX_INTERRUPT_WINDOW_EXITING; + if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) { + printf("vcpu_run_vmx: can't write" + " procbased ctls on exit\n"); + ret = EINVAL; + break; + } + } + } + + /* Invalidate old TLB mappings */ + vid.vid_vpid = (uint64_t)vcpu->vc_parent->vm_id; + vid.vid_addr = 0ULL; + invvpid(IA32_VMX_INVVPID_SINGLE_CTX_GLB, &vid); + + /* Start / resume the VCPU */ + KERNEL_ASSERT_LOCKED(); + KERNEL_UNLOCK(); + ret = vmx_enter_guest(&vcpu->vc_control_pa, + &vcpu->vc_gueststate, resume, gdt.rd_base); + + /* XXX */ + tlbflushg(); + + exit_reason = VM_EXIT_NONE; + if (ret == 0) { + /* + * ret == 0 implies we entered the guest, and later + * exited for some valid reason + */ + exitinfo = vmx_get_exit_info( + &vcpu->vc_gueststate.vg_eip, &exit_reason); + if (vmread(VMCS_GUEST_IA32_RFLAGS, + &vcpu->vc_gueststate.vg_eflags)) { + printf("vcpu_run_vmx: can't read guest rflags" + " during exit\n"); + ret = EINVAL; + break; + } + } + + if (ret || exitinfo != VMX_EXIT_INFO_COMPLETE || + exit_reason != VMX_EXIT_EXTINT) { + KERNEL_LOCK(); + locked = 1; + } else + locked = 0; + + /* If we exited successfully ... */ + if (ret == 0) { + resume = 1; + if (!(exitinfo & VMX_EXIT_INFO_HAVE_RIP)) { + printf("vcpu_run_vmx: cannot read guest rip\n"); + ret = EINVAL; + break; + } + + if (!(exitinfo & VMX_EXIT_INFO_HAVE_REASON)) { + printf("vcpu_run_vmx: cant read exit reason\n"); + ret = EINVAL; + break; + } + + /* + * Handle the exit. This will alter "ret" to EAGAIN if + * the exit handler determines help from vmd is needed. + */ + vcpu->vc_gueststate.vg_exit_reason = exit_reason; + ret = vmx_handle_exit(vcpu); + + /* + * When the guest exited due to an external interrupt, + * we do not yet hold the kernel lock: we need to + * handle interrupts first before grabbing the lock: + * the interrupt handler might do work that + * another CPU holding the kernel lock waits for. + * + * Example: the TLB shootdown code in the pmap module + * sends an IPI to all other CPUs and busy-waits for + * them to decrement tlb_shoot_wait to zero. While + * busy-waiting, the kernel lock is held. + * + * If this code here attempted to grab the kernel lock + * before handling the interrupt, it would block + * forever. + */ + if (!locked) + KERNEL_LOCK(); + + if (vcpu->vc_gueststate.vg_eflags & PSL_I) + vcpu->vc_irqready = 1; + else + vcpu->vc_irqready = 0; + + /* + * If not ready for interrupts, but interrupts pending, + * enable interrupt window exiting. 
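+			 *
+			 * Interrupt window exiting forces a VM exit as soon as
+			 * the guest can accept interrupts (eg, once it sets
+			 * EFLAGS.IF), giving us another chance to inject the
+			 * pending vector.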
+ */ + if (vcpu->vc_irqready == 0 && vcpu->vc_intr) { + if (vmread(VMCS_PROCBASED_CTLS, &procbased)) { + printf("vcpu_run_vmx: can't read" + " procbased ctls on intwin exit\n"); + ret = EINVAL; + break; + } + + procbased |= IA32_VMX_INTERRUPT_WINDOW_EXITING; + if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) { + printf("vcpu_run_vmx: can't write" + " procbased ctls on intwin exit\n"); + ret = EINVAL; + break; + } + } + + /* + * Exit to vmd if we are terminating, failed to enter, + * or need help (device I/O) + */ + if (ret || vcpu_must_stop(vcpu)) + break; + + if (vcpu->vc_intr && vcpu->vc_irqready) { + ret = EAGAIN; + break; + } + + /* Check if we should yield - don't hog the cpu */ + spc = &ci->ci_schedstate; + if (spc->spc_schedflags & SPCF_SHOULDYIELD) { + resume = 0; + if (vmclear(&vcpu->vc_control_pa)) { + ret = EINVAL; + break; + } + yield(); + } + } else if (ret == VMX_FAIL_LAUNCH_INVALID_VMCS) { + printf("vcpu_run_vmx: failed launch with invalid " + "vmcs\n"); +#ifdef VMM_DEBUG + vmx_vcpu_dump_regs(vcpu); + dump_vcpu(vcpu); +#endif /* VMM_DEBUG */ + ret = EINVAL; + } else if (ret == VMX_FAIL_LAUNCH_VALID_VMCS) { + exit_reason = vcpu->vc_gueststate.vg_exit_reason; + printf("vcpu_run_vmx: failed launch with valid " + "vmcs, code=%d (%s)\n", exit_reason, + vmx_instruction_error_decode(exit_reason)); + if (vmread(VMCS_INSTRUCTION_ERROR, &insn_error)) { + printf("vcpu_run_vmx: can't read" + " insn error field\n"); + } else + printf("vcpu_run_vmx: insn error code = " + "%d\n", insn_error); +#ifdef VMM_DEBUG + vmx_vcpu_dump_regs(vcpu); + dump_vcpu(vcpu); +#endif /* VMM_DEBUG */ + ret = EINVAL; + } else { + printf("vcpu_run_vmx: failed launch for unknown " + "reason %d\n", ret); +#ifdef VMM_DEBUG + vmx_vcpu_dump_regs(vcpu); + dump_vcpu(vcpu); +#endif /* VMM_DEBUG */ + ret = EINVAL; + } + } + + /* + * We are heading back to userspace (vmd), either because we need help + * handling an exit, a guest interrupt is pending, or we failed in some + * way to enter the guest. Clear any current VMCS pointer as we may end + * up coming back on a different CPU. + */ + if (!vmptrst(&vmcs_ptr)) { + if (vmcs_ptr != 0xFFFFFFFFFFFFFFFFULL) + if (vmclear(&vcpu->vc_control_pa)) + ret = EINVAL; + } else + ret = EINVAL; + + return (ret); +} + +/* + * vmx_handle_intr + * + * Handle host (external) interrupts. We read which interrupt fired by + * extracting the vector from the VMCS and dispatch the interrupt directly + * to the host using vmm_dispatch_intr. + */ +void +vmx_handle_intr(struct vcpu *vcpu) +{ + uint8_t vec; + uint32_t eii; + struct gate_descriptor *idte; + vaddr_t handler; + + if (vmread(VMCS_EXIT_INTERRUPTION_INFO, &eii)) { + printf("vmx_handle_intr: can't obtain intr info\n"); + return; + } + + vec = eii & 0xFF; + + /* XXX check "error valid" code in eii, abort if 0 */ + idte=&idt[vec]; + handler = idte->gd_looffset + ((uint64_t)idte->gd_hioffset << 16); + vmm_dispatch_intr(handler); +} + +/* + * vmx_handle_hlt + * + * Handle HLT exits + */ +int +vmx_handle_hlt(struct vcpu *vcpu) +{ + uint32_t insn_length; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("vmx_handle_hlt: can't obtain instruction length\n"); + return (EINVAL); + } + + vcpu->vc_gueststate.vg_eip += insn_length; + return (EAGAIN); +} + +/* + * vmx_get_exit_info + * + * Returns exit information containing the current guest RIP and exit reason + * in rip and exit_reason. The return value is a bitmask indicating whether + * reading the RIP and exit reason was successful. 
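+ * (VMX_EXIT_INFO_HAVE_RIP and VMX_EXIT_INFO_HAVE_REASON are set in the
+ * returned mask when the corresponding vmread succeeded.)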
+ */ +int +vmx_get_exit_info(uint32_t *eip, uint32_t *exit_reason) +{ + int rv = 0; + + if (vmread(VMCS_GUEST_IA32_RIP, eip) == 0) { + rv |= VMX_EXIT_INFO_HAVE_RIP; + if (vmread(VMCS_EXIT_REASON, exit_reason) == 0) + rv |= VMX_EXIT_INFO_HAVE_REASON; + } + return (rv); +} + +/* + * vmx_handle_exit + * + * Handle exits from the VM by decoding the exit reason and calling various + * subhandlers as needed. + */ +int +vmx_handle_exit(struct vcpu *vcpu) +{ + uint64_t exit_reason; + uint32_t eflags; + int update_rip, ret = 0; + + update_rip = 0; + exit_reason = vcpu->vc_gueststate.vg_exit_reason; + eflags = vcpu->vc_gueststate.vg_eflags; + + switch (exit_reason) { + case VMX_EXIT_INT_WINDOW: + if (!(eflags & PSL_I)) { + DPRINTF("vmx_handle_exit: impossible interrupt window" + " exit config\n"); + ret = EINVAL; + break; + } + + ret = EAGAIN; + update_rip = 0; + break; + case VMX_EXIT_EPT_VIOLATION: + ret = vmx_handle_np_fault(vcpu); + break; + case VMX_EXIT_CPUID: + ret = vmx_handle_cpuid(vcpu); + update_rip = 1; + break; + case VMX_EXIT_IO: + ret = vmx_handle_inout(vcpu); + update_rip = 1; + break; + case VMX_EXIT_EXTINT: + vmx_handle_intr(vcpu); + update_rip = 0; + break; + case VMX_EXIT_CR_ACCESS: + ret = vmx_handle_cr(vcpu); + update_rip = 1; + break; + case VMX_EXIT_HLT: + ret = vmx_handle_hlt(vcpu); + update_rip = 1; + break; + case VMX_EXIT_RDMSR: + ret = vmx_handle_rdmsr(vcpu); + update_rip = 1; + break; + case VMX_EXIT_WRMSR: + ret = vmx_handle_wrmsr(vcpu); + update_rip = 1; + break; + case VMX_EXIT_TRIPLE_FAULT: +#ifdef VMM_DEBUG + DPRINTF("vmx_handle_exit: vm %d vcpu %d triple fault\n", + vcpu->vc_parent->vm_id, vcpu->vc_id); + vmx_vcpu_dump_regs(vcpu); + dump_vcpu(vcpu); + vmx_dump_vmcs(vcpu); +#endif /* VMM_DEBUG */ + ret = EAGAIN; + update_rip = 0; + break; + default: + DPRINTF("vmx_handle_exit: unhandled exit %lld (%s)\n", + exit_reason, vmx_exit_reason_decode(exit_reason)); + return (EINVAL); + } + + if (update_rip) { + if (vmwrite(VMCS_GUEST_IA32_RIP, + vcpu->vc_gueststate.vg_eip)) { + printf("vmx_handle_exit: can't advance rip\n"); + return (EINVAL); + } + } + + return (ret); +} + +/* + * vmm_get_guest_memtype + * + * Returns the type of memory 'gpa' refers to in the context of vm 'vm' + */ +int +vmm_get_guest_memtype(struct vm *vm, paddr_t gpa) +{ + int i; + struct vm_mem_range *vmr; + + if (gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) { + DPRINTF("guest mmio access @ 0x%llx\n", (uint64_t)gpa); + return (VMM_MEM_TYPE_REGULAR); + } + + /* XXX Use binary search? */ + for (i = 0; i < vm->vm_nmemranges; i++) { + vmr = &vm->vm_memranges[i]; + + /* + * vm_memranges are ascending. gpa can no longer be in one of + * the memranges + */ + if (gpa < vmr->vmr_gpa) + break; + + if (gpa < vmr->vmr_gpa + vmr->vmr_size) + return (VMM_MEM_TYPE_REGULAR); + } + + DPRINTF("guest memtype @ 0x%llx unknown\n", (uint64_t)gpa); + return (VMM_MEM_TYPE_UNKNOWN); +} + +/* + * vmm_get_guest_faulttype + * + * Determines the type (R/W/X) of the last fault on the VCPU last run on + * this PCPU. Calls the appropriate architecture-specific subroutine. 
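+ *
+ * XXX the RVI case currently falls back to the VMX variant as well;
+ * svm_get_guest_faulttype() below is an unused stub.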
+ */ +int +vmm_get_guest_faulttype(void) +{ + if (vmm_softc->mode == VMM_MODE_EPT) + return vmx_get_guest_faulttype(); + else if (vmm_softc->mode == VMM_MODE_RVI) + return vmx_get_guest_faulttype(); + else + panic("unknown vmm mode\n"); +} + +/* + * vmx_get_exit_qualification + * + * Return the current VMCS' exit qualification information + */ +int +vmx_get_exit_qualification(uint32_t *exit_qualification) +{ + if (vmread(VMCS_GUEST_EXIT_QUALIFICATION, exit_qualification)) { + printf("vmm_get_exit_qualification: cant extract exit qual\n"); + return (EINVAL); + } + + return (0); +} + +/* + * vmx_get_guest_faulttype + * + * Determines the type (R/W/X) of the last fault on the VCPU last run on + * this PCPU. + */ +int +vmx_get_guest_faulttype(void) +{ + uint32_t exit_qualification; + uint64_t presentmask = IA32_VMX_EPT_FAULT_WAS_READABLE | + IA32_VMX_EPT_FAULT_WAS_WRITABLE | IA32_VMX_EPT_FAULT_WAS_EXECABLE; + uint64_t protmask = IA32_VMX_EPT_FAULT_READ | + IA32_VMX_EPT_FAULT_WRITE | IA32_VMX_EPT_FAULT_EXEC; + + if (vmx_get_exit_qualification(&exit_qualification)) + return (-1); + + if ((exit_qualification & presentmask) == 0) + return VM_FAULT_INVALID; + if (exit_qualification & protmask) + return VM_FAULT_PROTECT; + return (-1); +} + +/* + * svm_get_guest_faulttype + * + * Determines the type (R/W/X) of the last fault on the VCPU last run on + * this PCPU. + */ +int +svm_get_guest_faulttype(void) +{ + /* XXX removed due to rot */ + return (-1); +} + +/* + * vmx_fault_page + * + * Request a new page to be faulted into the UVM map of the VM owning 'vcpu' + * at address 'gpa'. + */ +int +vmx_fault_page(struct vcpu *vcpu, paddr_t gpa) +{ + int fault_type, ret; + + fault_type = vmx_get_guest_faulttype(); + if (fault_type == -1) { + printf("vmx_fault_page: invalid fault type\n"); + return (EINVAL); + } + + ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, fault_type, + PROT_READ | PROT_WRITE | PROT_EXEC); + if (ret) + printf("vmx_fault_page: uvm_fault returns %d\n", ret); + + return (ret); +} + +/* + * vmx_handle_np_fault + * + * High level nested paging handler for VMX. Verifies that a fault is for a + * valid memory region, then faults a page, or aborts otherwise. + */ +int +vmx_handle_np_fault(struct vcpu *vcpu) +{ + uint64_t gpa; + uint32_t gpa_lo, gpa_hi; + int gpa_memtype, ret; + + ret = 0; + if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &gpa_lo)) { + printf("vmm_handle_np_fault: cannot extract faulting pa lo\n"); + return (EINVAL); + } + + if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS_HI, &gpa_hi)) { + printf("vmm_handle_np_fault: cannot extract faulting pa hi\n"); + return (EINVAL); + } + + gpa = (uint64_t)gpa_lo | (uint64_t)gpa_hi << 32ULL; + + gpa_memtype = vmm_get_guest_memtype(vcpu->vc_parent, gpa); + switch (gpa_memtype) { + case VMM_MEM_TYPE_REGULAR: + ret = vmx_fault_page(vcpu, gpa); + break; + default: + printf("unknown memory type %d for GPA 0x%llx\n", + gpa_memtype, gpa); + return (EINVAL); + } + + return (ret); +} + +/* + * vmx_handle_inout + * + * Exit handler for IN/OUT instructions. + * + * The vmm can handle certain IN/OUTS without exiting to vmd, but most of these + * will be passed to vmd for completion. 
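+ *
+ * Accesses to ports that vmd emulates (the PICs, the i8254, the RTC,
+ * com1, PCI config space and the vmm PCI I/O BARs) return EAGAIN so vmd
+ * can complete them; reads from any other port return 0xFFFFFFFF.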
+ */ +int +vmx_handle_inout(struct vcpu *vcpu) +{ + uint32_t insn_length; + uint32_t exit_qual; + int ret; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("vmx_handle_inout: can't obtain instruction length\n"); + return (EINVAL); + } + + if (vmx_get_exit_qualification(&exit_qual)) { + printf("vmx_handle_inout: can't get exit qual\n"); + return (EINVAL); + } + + /* Bits 0:2 - size of exit */ + vcpu->vc_exit.vei.vei_size = (exit_qual & 0x7) + 1; + /* Bit 3 - direction */ + vcpu->vc_exit.vei.vei_dir = (exit_qual & 0x8) >> 3; + /* Bit 4 - string instruction? */ + vcpu->vc_exit.vei.vei_string = (exit_qual & 0x10) >> 4; + /* Bit 5 - REP prefix? */ + vcpu->vc_exit.vei.vei_rep = (exit_qual & 0x20) >> 5; + /* Bit 6 - Operand encoding */ + vcpu->vc_exit.vei.vei_encoding = (exit_qual & 0x40) >> 6; + /* Bit 16:31 - port */ + vcpu->vc_exit.vei.vei_port = (exit_qual & 0xFFFF0000) >> 16; + /* Data */ + vcpu->vc_exit.vei.vei_data = vcpu->vc_gueststate.vg_eax; + + vcpu->vc_gueststate.vg_eip += insn_length; + + /* + * The following ports usually belong to devices owned by vmd. + * Return EAGAIN to signal help needed from userspace (vmd). + * Return 0 to indicate we don't care about this port. + * + * XXX something better than a hardcoded list here, maybe + * configure via vmd via the device list in vm create params? + * + * XXX handle not eax target + */ + switch (vcpu->vc_exit.vei.vei_port) { + case IO_ICU1 ... IO_ICU1 + 1: + case 0x40 ... 0x43: + case IO_RTC ... IO_RTC + 1: + case IO_ICU2 ... IO_ICU2 + 1: + case 0x3f8 ... 0x3ff: + case 0xcf8: + case 0xcfc: + case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: + ret = EAGAIN; + break; + default: + /* Read from unsupported ports returns FFs */ + if (vcpu->vc_exit.vei.vei_dir == 1) + vcpu->vc_gueststate.vg_eax = 0xFFFFFFFF; + ret = 0; + } + + return (ret); +} + +/* + * vmx_handle_cr + * + * Handle reads/writes to control registers (except CR3) + */ +int +vmx_handle_cr(struct vcpu *vcpu) +{ + uint32_t insn_length; + uint32_t exit_qual; + uint8_t crnum, dir; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("vmx_handle_cr: can't obtain instruction length\n"); + return (EINVAL); + } + + if (vmx_get_exit_qualification(&exit_qual)) { + printf("vmx_handle_cr: can't get exit qual\n"); + return (EINVAL); + } + + /* Low 4 bits of exit_qual represent the CR number */ + crnum = exit_qual & 0xf; + + dir = (exit_qual & 0x30) >> 4; + + switch (dir) { + case CR_WRITE: + DPRINTF("vmx_handle_cr: mov to cr%d @ %x\n", + crnum, vcpu->vc_gueststate.vg_eip); + break; + case CR_READ: + DPRINTF("vmx_handle_cr: mov from cr%d @ %x\n", + crnum, vcpu->vc_gueststate.vg_eip); + break; + case CR_CLTS: + DPRINTF("vmx_handle_cr: clts instruction @ %x\n", + vcpu->vc_gueststate.vg_eip); + break; + case CR_LMSW: + DPRINTF("vmx_handle_cr: lmsw instruction @ %x\n", + vcpu->vc_gueststate.vg_eip); + break; + default: + DPRINTF("vmx_handle_cr: unknown cr access @ %x\n", + vcpu->vc_gueststate.vg_eip); + } + + vcpu->vc_gueststate.vg_eip += insn_length; + + return (0); +} + +/* + * vmx_handle_rdmsr + * + * Handler for rdmsr instructions. Bitmap MSRs are allowed implicit access + * and won't end up here. This handler is primarily intended to catch otherwise + * unknown MSR access for possible later inclusion in the bitmap list. For + * each MSR access that ends up here, we log the access. 
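+ *
+ * Note that the value handed back to the guest is read directly from
+ * the host MSR; no virtualization of the contents is done here.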
+ * + * Parameters: + * vcpu: vcpu structure containing instruction info causing the exit + * + * Return value: + * 0: The operation was successful + * 1: An error occurred + */ +int +vmx_handle_rdmsr(struct vcpu *vcpu) +{ + uint32_t insn_length; + uint64_t msr; + uint32_t *eax, *ecx, *edx; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("%s: can't obtain instruction length\n", __func__); + return (EINVAL); + } + + /* All RDMSR instructions are 0x0F 0x32 */ + KASSERT(insn_length == 2); + + eax = &vcpu->vc_gueststate.vg_eax; + ecx = &vcpu->vc_gueststate.vg_ecx; + edx = &vcpu->vc_gueststate.vg_edx; + + msr = rdmsr(*ecx); + *eax = msr & 0xFFFFFFFFULL; + *edx = msr >> 32; + + /* XXX log the access for now, to be able to identify unknown MSRs */ + printf("%s: rdmsr exit, msr=0x%x, data returned to " + "guest=0x%x:0x%x\n", __func__, *ecx, *edx, *eax); + + vcpu->vc_gueststate.vg_eip += insn_length; + + return (0); +} + +/* + * vmx_handle_wrmsr + * + * Handler for wrmsr instructions. This handler logs the access, and discards + * the written data. Any valid wrmsr will not end up here (it will be + * whitelisted in the MSR bitmap). + * + * Parameters: + * vcpu: vcpu structure containing instruction info causing the exit + * + * Return value: + * 0: The operation was successful + * 1: An error occurred + */ +int +vmx_handle_wrmsr(struct vcpu *vcpu) +{ + uint32_t insn_length; + uint32_t *eax, *ecx, *edx; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("%s: can't obtain instruction length\n", __func__); + return (EINVAL); + } + + /* All WRMSR instructions are 0x0F 0x30 */ + KASSERT(insn_length == 2); + + eax = &vcpu->vc_gueststate.vg_eax; + ecx = &vcpu->vc_gueststate.vg_ecx; + edx = &vcpu->vc_gueststate.vg_edx; + + /* XXX log the access for now, to be able to identify unknown MSRs */ + printf("%s: wrmsr exit, msr=0x%x, discarding data written from " + "guest=0x%x:0x%x\n", __func__, *ecx, *edx, *eax); + + vcpu->vc_gueststate.vg_eip += insn_length; + + return (0); +} + +/* + * vmx_handle_cpuid + * + * Exit handler for CPUID instruction + */ +int +vmx_handle_cpuid(struct vcpu *vcpu) +{ + uint32_t insn_length; + uint32_t *eax, *ebx, *ecx, *edx; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("vmx_handle_cpuid: can't obtain instruction length\n"); + return (EINVAL); + } + + /* All CPUID instructions are 0x0F 0xA2 */ + KASSERT(insn_length == 2); + + eax = &vcpu->vc_gueststate.vg_eax; + ebx = &vcpu->vc_gueststate.vg_ebx; + ecx = &vcpu->vc_gueststate.vg_ecx; + edx = &vcpu->vc_gueststate.vg_edx; + + switch (*eax) { + case 0x00: /* Max level and vendor ID */ + *eax = 0x07; /* cpuid_level */ + *ebx = *((uint32_t *)&cpu_vendor); + *edx = *((uint32_t *)&cpu_vendor + 1); + *ecx = *((uint32_t *)&cpu_vendor + 2); + break; + case 0x01: /* Version, brand, feature info */ + *eax = cpu_id; + /* mask off host's APIC ID, reset to vcpu id */ + *ebx = cpu_miscinfo & 0x00FFFFFF; + *ebx &= (vcpu->vc_id & 0xFF) << 24; + /* + * clone host capabilities minus: + * debug store (CPUIDECX_DTES64, CPUIDECX_DSCPL, CPUID_DS) + * monitor/mwait (CPUIDECX_MWAIT) + * vmx (CPUIDECX_VMX) + * smx (CPUIDECX_SMX) + * speedstep (CPUIDECX_EST) + * thermal (CPUIDECX_TM2, CPUID_ACPI, CPUID_TM) + * context id (CPUIDECX_CNXTID) + * silicon debug (CPUIDECX_SDBG) + * xTPR (CPUIDECX_XTPR) + * perf/debug (CPUIDECX_PDCM) + * pcid (CPUIDECX_PCID) + * direct cache access (CPUIDECX_DCA) + * x2APIC (CPUIDECX_X2APIC) + * apic deadline (CPUIDECX_DEADLINE) + * timestamp (CPUID_TSC) + * apic (CPUID_APIC) + * 
psn (CPUID_PSN) + * self snoop (CPUID_SS) + * hyperthreading (CPUID_HTT) + * pending break enabled (CPUID_PBE) + * MTRR (CPUID_MTRR) + * plus: + * hypervisor (CPUIDECX_HV) + */ + *ecx = (cpu_ecxfeature | CPUIDECX_HV) & + ~(CPUIDECX_EST | CPUIDECX_TM2 | + CPUIDECX_MWAIT | CPUIDECX_PDCM | + CPUIDECX_VMX | CPUIDECX_DTES64 | + CPUIDECX_DSCPL | CPUIDECX_SMX | + CPUIDECX_CNXTID | CPUIDECX_SDBG | + CPUIDECX_XTPR | + CPUIDECX_PCID | CPUIDECX_DCA | + CPUIDECX_X2APIC | CPUIDECX_DEADLINE); + *edx = curcpu()->ci_feature_flags & + ~(CPUID_ACPI | CPUID_TM | CPUID_TSC | + CPUID_HTT | CPUID_DS | CPUID_APIC | + CPUID_PSN | CPUID_SS | CPUID_PBE | + CPUID_MTRR); + break; + case 0x02: /* Cache and TLB information */ + DPRINTF("vmx_handle_cpuid: function 0x02 (cache/TLB) not" + " supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x03: /* Processor serial number (not supported) */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x04: + DPRINTF("vmx_handle_cpuid: function 0x04 (deterministic " + "cache info) not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x05: /* MONITOR/MWAIT (not supported) */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x06: /* Thermal / Power management */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x07: /* SEFF */ + if (*ecx == 0) { + /* + * SEFF flags - copy from host minus: + * SGX (SEFF0EBX_SGX) + * HLE (SEFF0EBX_HLE) + * INVPCID (SEFF0EBX_INVPCID) + * RTM (SEFF0EBX_RTM) + * PQM (SEFF0EBX_PQM) + * MPX (SEFF0EBX_MPX) + * PCOMMIT (SEFF0EBX_PCOMMIT) + * PT (SEFF0EBX_PT) + */ + *eax = 0; /* Highest subleaf supported */ + *ebx = curcpu()->ci_feature_sefflags_ebx & + ~(SEFF0EBX_SGX | SEFF0EBX_HLE | SEFF0EBX_INVPCID | + SEFF0EBX_RTM | SEFF0EBX_PQM | SEFF0EBX_MPX | + SEFF0EBX_PCOMMIT | SEFF0EBX_PT); + *ecx = curcpu()->ci_feature_sefflags_ecx; + *edx = 0; + } else { + /* Unsupported subleaf */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + } + break; + case 0x09: /* Direct Cache Access (not supported) */ + DPRINTF("vmx_handle_cpuid: function 0x09 (direct cache access)" + " not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x0a: /* Architectural performance monitoring */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x0b: /* Extended topology enumeration (not supported) */ + DPRINTF("vmx_handle_cpuid: function 0x0b (topology enumeration)" + " not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x0d: /* Processor ext. state information (not supported) */ + DPRINTF("vmx_handle_cpuid: function 0x0d (ext. 
state info)" + " not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x0f: /* QoS info (not supported) */ + DPRINTF("vmx_handle_cpuid: function 0x0f (QoS info)" + " not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x14: /* Processor Trace info (not supported) */ + DPRINTF("vmx_handle_cpuid: function 0x14 (processor trace info)" + " not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x15: /* TSC / Core Crystal Clock info (not supported) */ + DPRINTF("vmx_handle_cpuid: function 0x15 (TSC / CCC info)" + " not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x16: /* Processor frequency info (not supported) */ + DPRINTF("vmx_handle_cpuid: function 0x16 (frequency info)" + " not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x40000000: /* Hypervisor information */ + *eax = 0; + *ebx = *((uint32_t *)&vmm_hv_signature[0]); + *ecx = *((uint32_t *)&vmm_hv_signature[4]); + *edx = *((uint32_t *)&vmm_hv_signature[8]); + break; + case 0x80000000: /* Extended function level */ + *eax = 0x80000007; /* curcpu()->ci_pnfeatset */ + *ebx = 0; + *ecx = 0; + *edx = 0; + case 0x80000001: /* Extended function info */ + *eax = ecpu_eaxfeature; + *ebx = 0; /* Reserved */ + *ecx = ecpu_ecxfeature; + *edx = ecpu_feature; + break; + case 0x80000002: /* Brand string */ + *eax = cpu_brandstr[0]; + *ebx = cpu_brandstr[1]; + *ecx = cpu_brandstr[2]; + *edx = cpu_brandstr[3]; + break; + case 0x80000003: /* Brand string */ + *eax = cpu_brandstr[4]; + *ebx = cpu_brandstr[5]; + *ecx = cpu_brandstr[6]; + *edx = cpu_brandstr[7]; + break; + case 0x80000004: /* Brand string */ + *eax = cpu_brandstr[8]; + *ebx = cpu_brandstr[9]; + *ecx = cpu_brandstr[10]; + *edx = cpu_brandstr[11]; + break; + case 0x80000005: /* Reserved (Intel), cacheinfo (AMD) */ + *eax = curcpu()->ci_amdcacheinfo[0]; + *ebx = curcpu()->ci_amdcacheinfo[1]; + *ecx = curcpu()->ci_amdcacheinfo[2]; + *edx = curcpu()->ci_amdcacheinfo[3]; + break; + case 0x80000006: /* ext. cache info */ + *eax = curcpu()->ci_extcacheinfo[0]; + *ebx = curcpu()->ci_extcacheinfo[1]; + *ecx = curcpu()->ci_extcacheinfo[2]; + *edx = curcpu()->ci_extcacheinfo[3]; + break; + case 0x80000007: /* apmi */ + *eax = 0; /* Reserved */ + *ebx = 0; /* Reserved */ + *ecx = 0; /* Reserved */ + *edx = 0; /* unsupported ITSC */ + break; + case 0x80000008: /* Phys bits info and topology (AMD) */ + DPRINTF("vmx_handle_cpuid: function 0x80000008 (phys bits info)" + " not supported\n"); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + default: + DPRINTF("vmx_handle_cpuid: unsupported eax=0x%x\n", *eax); + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + } + + vcpu->vc_gueststate.vg_eip += insn_length; + + return (0); +} + +/* + * vcpu_run_svm + * + * VMM main loop used to run a VCPU. 
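+ * (Currently a stub; the SVM implementation was removed due to rot.)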
+ */ +int +vcpu_run_svm(struct vcpu *vcpu, struct vm_run_params *vrp) +{ + /* XXX removed due to rot */ + return (0); +} + +/* + * vmx_exit_reason_decode + * + * Returns a human readable string describing exit type 'code' + */ +const char * +vmx_exit_reason_decode(uint32_t code) +{ + switch (code) { + case VMX_EXIT_NMI: return "NMI"; + case VMX_EXIT_EXTINT: return "external interrupt"; + case VMX_EXIT_TRIPLE_FAULT: return "triple fault"; + case VMX_EXIT_INIT: return "INIT signal"; + case VMX_EXIT_SIPI: return "SIPI signal"; + case VMX_EXIT_IO_SMI: return "I/O SMI"; + case VMX_EXIT_OTHER_SMI: return "other SMI"; + case VMX_EXIT_INT_WINDOW: return "interrupt window"; + case VMX_EXIT_NMI_WINDOW: return "NMI window"; + case VMX_EXIT_TASK_SWITCH: return "task switch"; + case VMX_EXIT_CPUID: return "CPUID instruction"; + case VMX_EXIT_GETSEC: return "GETSEC instruction"; + case VMX_EXIT_HLT: return "HLT instruction"; + case VMX_EXIT_INVD: return "INVD instruction"; + case VMX_EXIT_INVLPG: return "INVLPG instruction"; + case VMX_EXIT_RDPMC: return "RDPMC instruction"; + case VMX_EXIT_RDTSC: return "RDTSC instruction"; + case VMX_EXIT_RSM: return "RSM instruction"; + case VMX_EXIT_VMCALL: return "VMCALL instruction"; + case VMX_EXIT_VMCLEAR: return "VMCLEAR instruction"; + case VMX_EXIT_VMLAUNCH: return "VMLAUNCH instruction"; + case VMX_EXIT_VMPTRLD: return "VMPTRLD instruction"; + case VMX_EXIT_VMPTRST: return "VMPTRST instruction"; + case VMX_EXIT_VMREAD: return "VMREAD instruction"; + case VMX_EXIT_VMRESUME: return "VMRESUME instruction"; + case VMX_EXIT_VMWRITE: return "VMWRITE instruction"; + case VMX_EXIT_VMXOFF: return "VMXOFF instruction"; + case VMX_EXIT_VMXON: return "VMXON instruction"; + case VMX_EXIT_CR_ACCESS: return "CR access"; + case VMX_EXIT_MOV_DR: return "MOV DR instruction"; + case VMX_EXIT_IO: return "I/O instruction"; + case VMX_EXIT_RDMSR: return "RDMSR instruction"; + case VMX_EXIT_WRMSR: return "WRMSR instruction"; + case VMX_EXIT_ENTRY_FAILED_GUEST_STATE: return "guest state invalid"; + case VMX_EXIT_ENTRY_FAILED_MSR_LOAD: return "MSR load failed"; + case VMX_EXIT_MWAIT: return "MWAIT instruction"; + case VMX_EXIT_MTF: return "monitor trap flag"; + case VMX_EXIT_MONITOR: return "MONITOR instruction"; + case VMX_EXIT_PAUSE: return "PAUSE instruction"; + case VMX_EXIT_ENTRY_FAILED_MCE: return "MCE during entry"; + case VMX_EXIT_TPR_BELOW_THRESHOLD: return "TPR below threshold"; + case VMX_EXIT_APIC_ACCESS: return "APIC access"; + case VMX_EXIT_VIRTUALIZED_EOI: return "virtualized EOI"; + case VMX_EXIT_GDTR_IDTR: return "GDTR/IDTR access"; + case VMX_EXIT_LDTR_TR: return "LDTR/TR access"; + case VMX_EXIT_EPT_VIOLATION: return "EPT violation"; + case VMX_EXIT_EPT_MISCONFIGURATION: return "EPT misconfiguration"; + case VMX_EXIT_INVEPT: return "INVEPT instruction"; + case VMX_EXIT_RDTSCP: return "RDTSCP instruction"; + case VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED: + return "preemption timer expired"; + case VMX_EXIT_INVVPID: return "INVVPID instruction"; + case VMX_EXIT_WBINVD: return "WBINVD instruction"; + case VMX_EXIT_XSETBV: return "XSETBV instruction"; + case VMX_EXIT_APIC_WRITE: return "APIC write"; + case VMX_EXIT_RDRAND: return "RDRAND instruction"; + case VMX_EXIT_INVPCID: return "INVPCID instruction"; + case VMX_EXIT_VMFUNC: return "VMFUNC instruction"; + case VMX_EXIT_RDSEED: return "RDSEED instruction"; + case VMX_EXIT_XSAVES: return "XSAVES instruction"; + case VMX_EXIT_XRSTORS: return "XRSTORS instruction"; + default: return "unknown"; + } +} + +/* + * 
vmx_instruction_error_decode + * + * Returns a human readable string describing the instruction error in 'code' + */ +const char * +vmx_instruction_error_decode(uint32_t code) +{ + switch (code) { + case 1: return "VMCALL: unsupported in VMX root"; + case 2: return "VMCLEAR: invalid paddr"; + case 3: return "VMCLEAR: VMXON pointer"; + case 4: return "VMLAUNCH: non-clear VMCS"; + case 5: return "VMRESUME: non-launched VMCS"; + case 6: return "VMRESUME: executed after VMXOFF"; + case 7: return "VM entry: invalid control field(s)"; + case 8: return "VM entry: invalid host state field(s)"; + case 9: return "VMPTRLD: invalid paddr"; + case 10: return "VMPTRLD: VMXON pointer"; + case 11: return "VMPTRLD: incorrect VMCS revid"; + case 12: return "VMREAD/VMWRITE: unsupported VMCS field"; + case 13: return "VMWRITE: RO VMCS field"; + case 15: return "VMXON: unsupported in VMX root"; + case 20: return "VMCALL: invalid VM exit control fields"; + case 26: return "VM entry: blocked by MOV SS"; + case 28: return "Invalid operand to INVEPT/INVVPID"; + default: return "unknown"; + } +} + +/* + * vcpu_state_decode + * + * Returns a human readable string describing the vcpu state in 'state'. + */ +const char * +vcpu_state_decode(u_int state) +{ + switch (state) { + case VCPU_STATE_STOPPED: return "stopped"; + case VCPU_STATE_RUNNING: return "running"; + case VCPU_STATE_REQTERM: return "requesting termination"; + case VCPU_STATE_TERMINATED: return "terminated"; + case VCPU_STATE_UNKNOWN: return "unknown"; + default: return "invalid"; + } +} + +#ifdef VMM_DEBUG +/* + * dump_vcpu + * + * Dumps the VMX capabilites of vcpu 'vcpu' + */ +void +dump_vcpu(struct vcpu *vcpu) +{ + printf("vcpu @ %p\n", vcpu); + printf(" parent vm @ %p\n", vcpu->vc_parent); + printf(" mode: "); + if (vcpu->vc_virt_mode == VMM_MODE_VMX || + vcpu->vc_virt_mode == VMM_MODE_EPT) { + printf("VMX\n"); + printf(" pinbased ctls: 0x%llx\n", + vcpu->vc_vmx_pinbased_ctls); + printf(" true pinbased ctls: 0x%llx\n", + vcpu->vc_vmx_true_pinbased_ctls); + CTRL_DUMP(vcpu, PINBASED, EXTERNAL_INT_EXITING); + CTRL_DUMP(vcpu, PINBASED, NMI_EXITING); + CTRL_DUMP(vcpu, PINBASED, VIRTUAL_NMIS); + CTRL_DUMP(vcpu, PINBASED, ACTIVATE_VMX_PREEMPTION_TIMER); + CTRL_DUMP(vcpu, PINBASED, PROCESS_POSTED_INTERRUPTS); + printf(" procbased ctls: 0x%llx\n", + vcpu->vc_vmx_procbased_ctls); + printf(" true procbased ctls: 0x%llx\n", + vcpu->vc_vmx_true_procbased_ctls); + CTRL_DUMP(vcpu, PROCBASED, INTERRUPT_WINDOW_EXITING); + CTRL_DUMP(vcpu, PROCBASED, USE_TSC_OFFSETTING); + CTRL_DUMP(vcpu, PROCBASED, HLT_EXITING); + CTRL_DUMP(vcpu, PROCBASED, INVLPG_EXITING); + CTRL_DUMP(vcpu, PROCBASED, MWAIT_EXITING); + CTRL_DUMP(vcpu, PROCBASED, RDPMC_EXITING); + CTRL_DUMP(vcpu, PROCBASED, RDTSC_EXITING); + CTRL_DUMP(vcpu, PROCBASED, CR3_LOAD_EXITING); + CTRL_DUMP(vcpu, PROCBASED, CR3_STORE_EXITING); + CTRL_DUMP(vcpu, PROCBASED, CR8_LOAD_EXITING); + CTRL_DUMP(vcpu, PROCBASED, CR8_STORE_EXITING); + CTRL_DUMP(vcpu, PROCBASED, USE_TPR_SHADOW); + CTRL_DUMP(vcpu, PROCBASED, NMI_WINDOW_EXITING); + CTRL_DUMP(vcpu, PROCBASED, MOV_DR_EXITING); + CTRL_DUMP(vcpu, PROCBASED, UNCONDITIONAL_IO_EXITING); + CTRL_DUMP(vcpu, PROCBASED, USE_IO_BITMAPS); + CTRL_DUMP(vcpu, PROCBASED, MONITOR_TRAP_FLAG); + CTRL_DUMP(vcpu, PROCBASED, USE_MSR_BITMAPS); + CTRL_DUMP(vcpu, PROCBASED, MONITOR_EXITING); + CTRL_DUMP(vcpu, PROCBASED, PAUSE_EXITING); + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + printf(" procbased2 ctls: 0x%llx\n", + 
vcpu->vc_vmx_procbased2_ctls); + CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_APIC); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_EPT); + CTRL_DUMP(vcpu, PROCBASED2, DESCRIPTOR_TABLE_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_RDTSCP); + CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_X2APIC_MODE); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VPID); + CTRL_DUMP(vcpu, PROCBASED2, WBINVD_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, UNRESTRICTED_GUEST); + CTRL_DUMP(vcpu, PROCBASED2, + APIC_REGISTER_VIRTUALIZATION); + CTRL_DUMP(vcpu, PROCBASED2, + VIRTUAL_INTERRUPT_DELIVERY); + CTRL_DUMP(vcpu, PROCBASED2, PAUSE_LOOP_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, RDRAND_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_INVPCID); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VM_FUNCTIONS); + CTRL_DUMP(vcpu, PROCBASED2, VMCS_SHADOWING); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_ENCLS_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, RDSEED_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_PML); + CTRL_DUMP(vcpu, PROCBASED2, EPT_VIOLATION_VE); + CTRL_DUMP(vcpu, PROCBASED2, CONCEAL_VMX_FROM_PT); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_XSAVES_XRSTORS); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_TSC_SCALING); + } + printf(" entry ctls: 0x%llx\n", + vcpu->vc_vmx_entry_ctls); + printf(" true entry ctls: 0x%llx\n", + vcpu->vc_vmx_true_entry_ctls); + CTRL_DUMP(vcpu, ENTRY, LOAD_DEBUG_CONTROLS); + CTRL_DUMP(vcpu, ENTRY, IA32E_MODE_GUEST); + CTRL_DUMP(vcpu, ENTRY, ENTRY_TO_SMM); + CTRL_DUMP(vcpu, ENTRY, DEACTIVATE_DUAL_MONITOR_TREATMENT); + CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY); + CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PAT_ON_ENTRY); + CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_EFER_ON_ENTRY); + CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_BNDCFGS_ON_ENTRY); + CTRL_DUMP(vcpu, ENTRY, CONCEAL_VM_ENTRIES_FROM_PT); + printf(" exit ctls: 0x%llx\n", + vcpu->vc_vmx_exit_ctls); + printf(" true exit ctls: 0x%llx\n", + vcpu->vc_vmx_true_exit_ctls); + CTRL_DUMP(vcpu, EXIT, SAVE_DEBUG_CONTROLS); + CTRL_DUMP(vcpu, EXIT, HOST_SPACE_ADDRESS_SIZE); + CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, ACKNOWLEDGE_INTERRUPT_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, SAVE_IA32_PAT_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PAT_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, SAVE_IA32_EFER_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, LOAD_IA32_EFER_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, SAVE_VMX_PREEMPTION_TIMER); + CTRL_DUMP(vcpu, EXIT, CLEAR_IA32_BNDCFGS_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, CONCEAL_VM_EXITS_FROM_PT); + } +} + +/* + * vmx_dump_vmcs_field + * + * Debug function to dump the contents of a single VMCS field + * + * Parameters: + * fieldid: VMCS Field ID + * msg: string to display + */ +void +vmx_dump_vmcs_field(uint16_t fieldid, const char *msg) +{ + uint8_t width; + uint64_t val; + uint32_t val_lo, val_hi; + + DPRINTF("%s (0x%04x): ", msg, fieldid); + width = (fieldid >> 13) & 0x3; + + if (width == 1) { + if (vmread(fieldid, &val_lo)) { + DPRINTF("???? "); + return; + } + if (vmread(fieldid + 1, &val_hi)) { + DPRINTF("???? "); + return; + } + + val = (uint64_t)val_lo | (uint64_t)val_hi << 32ULL; + } + + /* + * Field width encoding : bits 13:14 + * + * 0: 16-bit + * 1: 64-bit + * 2: 32-bit + * 3: natural width + */ + switch (width) { + case 0: DPRINTF("0x%04llx ", val); break; + case 1: + case 3: DPRINTF("0x%016llx ", val); break; + case 2: DPRINTF("0x%08llx ", val); + } +} + +/* + * vmx_dump_vmcs + * + * Debug function to dump the contents of the current VMCS. 
+ */ +void +vmx_dump_vmcs(struct vcpu *vcpu) +{ + int has_sec, i; + uint32_t cr3_tgt_ct; + + /* XXX save and load new vmcs, restore at end */ + + DPRINTF("--CURRENT VMCS STATE--\n"); + DPRINTF("VMXON revision : 0x%x\n", + curcpu()->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision); + DPRINTF("CR0 fixed0: 0x%llx\n", + curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0); + DPRINTF("CR0 fixed1: 0x%llx\n", + curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1); + DPRINTF("CR4 fixed0: 0x%llx\n", + curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0); + DPRINTF("CR4 fixed1: 0x%llx\n", + curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1); + DPRINTF("MSR table size: 0x%x\n", + 512 * (curcpu()->ci_vmm_cap.vcc_vmx.vmx_msr_table_size + 1)); + + has_sec = vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1); + + if (has_sec) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_VPID, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_VPID, "VPID"); + } + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS, + IA32_VMX_PROCESS_POSTED_INTERRUPTS, 1)) { + vmx_dump_vmcs_field(VMCS_POSTED_INT_NOTIF_VECTOR, + "Posted Int Notif Vec"); + } + + if (has_sec) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_EPT_VIOLATION_VE, 1)) { + vmx_dump_vmcs_field(VMCS_EPTP_INDEX, "EPTP idx"); + } + } + + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_SEL, "G.ES"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_SEL, "G.CS"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_SEL, "G.SS"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_SEL, "G.DS"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_SEL, "G.FS"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_SEL, "G.GS"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_SEL, "LDTR"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_SEL, "G.TR"); + + if (has_sec) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_INTERRUPT_STATUS, + "Int sts"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_PML, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_PML_INDEX, "PML Idx"); + } + } + + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_ES_SEL, "H.ES"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_CS_SEL, "H.CS"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_SS_SEL, "H.SS"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_DS_SEL, "H.DS"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_FS_SEL, "H.FS"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_GS_SEL, "H.GS"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_IO_BITMAP_A, "I/O Bitmap A"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_IO_BITMAP_B, "I/O Bitmap B"); + DPRINTF("\n"); + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_USE_MSR_BITMAPS, 1)) { + vmx_dump_vmcs_field(VMCS_MSR_BITMAP_ADDRESS, "MSR Bitmap"); + DPRINTF("\n"); + } + + vmx_dump_vmcs_field(VMCS_EXIT_STORE_MSR_ADDRESS, "Exit Store MSRs"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_EXIT_LOAD_MSR_ADDRESS, "Exit Load MSRs"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_ENTRY_LOAD_MSR_ADDRESS, "Entry Load MSRs"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_EXECUTIVE_VMCS_POINTER, "Exec VMCS Ptr"); + DPRINTF("\n"); + + if (has_sec) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_PML, 1)) { + vmx_dump_vmcs_field(VMCS_PML_ADDRESS, "PML Addr"); + DPRINTF("\n"); + } + } + + vmx_dump_vmcs_field(VMCS_TSC_OFFSET, "TSC Offset"); + DPRINTF("\n"); + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_USE_TPR_SHADOW, 
1)) { + vmx_dump_vmcs_field(VMCS_VIRTUAL_APIC_ADDRESS, + "Virtual APIC Addr"); + DPRINTF("\n"); + } + + if (has_sec) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_VIRTUALIZE_APIC, 1)) { + vmx_dump_vmcs_field(VMCS_APIC_ACCESS_ADDRESS, + "APIC Access Addr"); + DPRINTF("\n"); + } + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS, + IA32_VMX_PROCESS_POSTED_INTERRUPTS, 1)) { + vmx_dump_vmcs_field(VMCS_POSTED_INTERRUPT_DESC, + "Posted Int Desc Addr"); + DPRINTF("\n"); + } + + if (has_sec) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_VM_FUNCTIONS, 1)) { + vmx_dump_vmcs_field(VMCS_VM_FUNCTION_CONTROLS, + "VM Function Controls"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_EPT, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_IA32_EPTP, + "EPT Pointer"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY, 1)) { + vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_0, + "EOI Exit Bitmap 0"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_1, + "EOI Exit Bitmap 1"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_2, + "EOI Exit Bitmap 2"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_3, + "EOI Exit Bitmap 3"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_VM_FUNCTIONS, 1)) { + /* We assume all CPUs have the same VMFUNC caps */ + if (curcpu()->ci_vmm_cap.vcc_vmx.vmx_vm_func & 0x1) { + vmx_dump_vmcs_field(VMCS_EPTP_LIST_ADDRESS, + "EPTP List Addr"); + DPRINTF("\n"); + } + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_VMCS_SHADOWING, 1)) { + vmx_dump_vmcs_field(VMCS_VMREAD_BITMAP_ADDRESS, + "VMREAD Bitmap Addr"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_VMWRITE_BITMAP_ADDRESS, + "VMWRITE Bitmap Addr"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_EPT_VIOLATION_VE, 1)) { + vmx_dump_vmcs_field(VMCS_VIRTUALIZATION_EXC_ADDRESS, + "#VE Addr"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_XSAVES_XRSTORS, 1)) { + vmx_dump_vmcs_field(VMCS_XSS_EXITING_BITMAP, + "XSS exiting bitmap addr"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_ENCLS_EXITING, 1)) { + vmx_dump_vmcs_field(VMCS_ENCLS_EXITING_BITMAP, + "Encls exiting bitmap addr"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_TSC_SCALING, 1)) { + vmx_dump_vmcs_field(VMCS_TSC_MULTIPLIER, + "TSC scaling factor"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_EPT, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_PHYSICAL_ADDRESS, + "Guest PA"); + DPRINTF("\n"); + } + } + + vmx_dump_vmcs_field(VMCS_LINK_POINTER, "VMCS Link Pointer"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_DEBUGCTL, "Guest DEBUGCTL"); + DPRINTF("\n"); + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS, + IA32_VMX_LOAD_IA32_PAT_ON_ENTRY, 1) || + vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS, + IA32_VMX_SAVE_IA32_PAT_ON_EXIT, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_IA32_PAT, + "Guest PAT"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS, + IA32_VMX_LOAD_IA32_EFER_ON_ENTRY, 1) || + vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS, + IA32_VMX_SAVE_IA32_EFER_ON_EXIT, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_IA32_EFER, + "Guest EFER"); + 
DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS, + IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_IA32_PERF_GBL_CTRL, + "Guest Perf Global Ctrl"); + DPRINTF("\n"); + } + + if (has_sec) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_EPT, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_PDPTE0, "Guest PDPTE0"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_PDPTE1, "Guest PDPTE1"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_PDPTE2, "Guest PDPTE2"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_PDPTE3, "Guest PDPTE3"); + DPRINTF("\n"); + } + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS, + IA32_VMX_LOAD_IA32_BNDCFGS_ON_ENTRY, 1) || + vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS, + IA32_VMX_CLEAR_IA32_BNDCFGS_ON_EXIT, 1)) { + vmx_dump_vmcs_field(VMCS_GUEST_IA32_BNDCFGS, + "Guest BNDCFGS"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS, + IA32_VMX_LOAD_IA32_PAT_ON_EXIT, 1)) { + vmx_dump_vmcs_field(VMCS_HOST_IA32_PAT, + "Host PAT"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS, + IA32_VMX_LOAD_IA32_EFER_ON_EXIT, 1)) { + vmx_dump_vmcs_field(VMCS_HOST_IA32_EFER, + "Host EFER"); + DPRINTF("\n"); + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS, + IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT, 1)) { + vmx_dump_vmcs_field(VMCS_HOST_IA32_PERF_GBL_CTRL, + "Host Perf Global Ctrl"); + DPRINTF("\n"); + } + + vmx_dump_vmcs_field(VMCS_PINBASED_CTLS, "Pinbased Ctrls"); + vmx_dump_vmcs_field(VMCS_PROCBASED_CTLS, "Procbased Ctrls"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_EXCEPTION_BITMAP, "Exception Bitmap"); + vmx_dump_vmcs_field(VMCS_PF_ERROR_CODE_MASK, "#PF Err Code Mask"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_PF_ERROR_CODE_MATCH, "#PF Err Code Match"); + vmx_dump_vmcs_field(VMCS_CR3_TARGET_COUNT, "CR3 Tgt Count"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_EXIT_CTLS, "Exit Ctrls"); + vmx_dump_vmcs_field(VMCS_EXIT_MSR_STORE_COUNT, "Exit MSR Store Ct"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_EXIT_MSR_LOAD_COUNT, "Exit MSR Load Ct"); + vmx_dump_vmcs_field(VMCS_ENTRY_CTLS, "Entry Ctrls"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_ENTRY_MSR_LOAD_COUNT, "Entry MSR Load Ct"); + vmx_dump_vmcs_field(VMCS_ENTRY_INTERRUPTION_INFO, "Entry Int. Info"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_ENTRY_EXCEPTION_ERROR_CODE, + "Entry Ex. Err Code"); + vmx_dump_vmcs_field(VMCS_ENTRY_INSTRUCTION_LENGTH, "Entry Insn Len"); + DPRINTF("\n"); + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_USE_TPR_SHADOW, 1)) { + vmx_dump_vmcs_field(VMCS_TPR_THRESHOLD, "TPR Threshold"); + DPRINTF("\n"); + } + + if (has_sec) { + vmx_dump_vmcs_field(VMCS_PROCBASED2_CTLS, "2ndary Ctrls"); + DPRINTF("\n"); + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_PAUSE_LOOP_EXITING, 1)) { + vmx_dump_vmcs_field(VMCS_PLE_GAP, "PLE Gap"); + vmx_dump_vmcs_field(VMCS_PLE_WINDOW, "PLE Window"); + } + DPRINTF("\n"); + } + + vmx_dump_vmcs_field(VMCS_INSTRUCTION_ERROR, "Insn Error"); + vmx_dump_vmcs_field(VMCS_EXIT_REASON, "Exit Reason"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_EXIT_INTERRUPTION_INFO, "Exit Int. Info"); + vmx_dump_vmcs_field(VMCS_EXIT_INTERRUPTION_ERR_CODE, + "Exit Int. 
Err Code"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_IDT_VECTORING_INFO, "IDT vect info"); + vmx_dump_vmcs_field(VMCS_IDT_VECTORING_ERROR_CODE, + "IDT vect err code"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_INSTRUCTION_LENGTH, "Insn Len"); + vmx_dump_vmcs_field(VMCS_EXIT_INSTRUCTION_INFO, "Exit Insn Info"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_LIMIT, "G. ES Lim"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_LIMIT, "G. CS Lim"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_LIMIT, "G. SS Lim"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_LIMIT, "G. DS Lim"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_LIMIT, "G. FS Lim"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_LIMIT, "G. GS Lim"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_LIMIT, "G. LDTR Lim"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_LIMIT, "G. TR Lim"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_GDTR_LIMIT, "G. GDTR Lim"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_IDTR_LIMIT, "G. IDTR Lim"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_AR, "G. ES AR"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_AR, "G. CS AR"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_AR, "G. SS AR"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_AR, "G. DS AR"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_AR, "G. FS AR"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_AR, "G. GS AR"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_AR, "G. LDTR AR"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_AR, "G. TR AR"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_INTERRUPTIBILITY_ST, "G. Int St."); + vmx_dump_vmcs_field(VMCS_GUEST_ACTIVITY_STATE, "G. Act St."); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_GUEST_SMBASE, "G. SMBASE"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_CS, "G. SYSENTER CS"); + DPRINTF("\n"); + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS, + IA32_VMX_ACTIVATE_VMX_PREEMPTION_TIMER, 1)) { + vmx_dump_vmcs_field(VMCS_VMX_PREEMPTION_TIMER_VAL, + "VMX Preempt Timer"); + DPRINTF("\n"); + } + + vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_CS, "H. SYSENTER CS"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_CR0_MASK, "CR0 Mask"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_CR4_MASK, "CR4 Mask"); + DPRINTF("\n"); + + vmx_dump_vmcs_field(VMCS_CR0_READ_SHADOW, "CR0 RD Shadow"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_CR4_READ_SHADOW, "CR4 RD Shadow"); + DPRINTF("\n"); + + /* We assume all CPUs have the same max CR3 target ct */ + cr3_tgt_ct = curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr3_tgt_count; + DPRINTF("Max CR3 target count: 0x%x\n", cr3_tgt_ct); + if (cr3_tgt_ct <= VMX_MAX_CR3_TARGETS) { + for (i = 0 ; i < cr3_tgt_ct; i++) { + vmx_dump_vmcs_field(VMCS_CR3_TARGET_0 + (2 * i), + "CR3 Target"); + DPRINTF("\n"); + } + } else { + DPRINTF("(Bogus CR3 Target Count > %d", VMX_MAX_CR3_TARGETS); + } + + vmx_dump_vmcs_field(VMCS_GUEST_EXIT_QUALIFICATION, "G. Exit Qual"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_IO_RCX, "I/O RCX"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_IO_RSI, "I/O RSI"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_IO_RDI, "I/O RDI"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_IO_RIP, "I/O RIP"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_LINEAR_ADDRESS, "G. Lin Addr"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR0, "G. CR0"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR3, "G. CR3"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR4, "G. 
CR4"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_BASE, "G. ES Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_BASE, "G. CS Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_BASE, "G. SS Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_BASE, "G. DS Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_BASE, "G. FS Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_BASE, "G. GS Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_BASE, "G. LDTR Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_BASE, "G. TR Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_GDTR_BASE, "G. GDTR Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_IDTR_BASE, "G. IDTR Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_DR7, "G. DR7"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_RSP, "G. RSP"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_RIP, "G. RIP"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_RFLAGS, "G. RFLAGS"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_PENDING_DBG_EXC, "G. Pend Dbg Exc"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_ESP, "G. SYSENTER ESP"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_EIP, "G. SYSENTER EIP"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_CR0, "H. CR0"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_CR3, "H. CR3"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_CR4, "H. CR4"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_FS_BASE, "H. FS Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_GS_BASE, "H. GS Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_TR_BASE, "H. TR Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_GDTR_BASE, "H. GDTR Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_IDTR_BASE, "H. IDTR Base"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_ESP, "H. SYSENTER ESP"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_EIP, "H. SYSENTER EIP"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_RSP, "H. RSP"); + DPRINTF("\n"); + vmx_dump_vmcs_field(VMCS_HOST_IA32_RIP, "H. RIP"); + DPRINTF("\n"); +} + +/* + * vmx_vcpu_dump_regs + * + * Debug function to print vcpu regs from the current vcpu + * note - vmcs for 'vcpu' must be on this pcpu. 
+ * + * Parameters: + * vcpu - vcpu whose registers should be dumped + */ +void +vmx_vcpu_dump_regs(struct vcpu *vcpu) +{ + uint32_t r; + int i; + struct vmx_msr_store *msr_store; + + DPRINTF("vcpu @ %p\n", vcpu); + DPRINTF(" eax=0x%08x ebx=0x%08x ecx=0x%08x\n", + vcpu->vc_gueststate.vg_eax, vcpu->vc_gueststate.vg_ebx, + vcpu->vc_gueststate.vg_ecx); + DPRINTF(" edx=0x%08x ebp=0x%08x edi=0x%08x\n", + vcpu->vc_gueststate.vg_edx, vcpu->vc_gueststate.vg_ebp, + vcpu->vc_gueststate.vg_edi); + DPRINTF(" esi=0x%08x\n", vcpu->vc_gueststate.vg_esi); + + DPRINTF(" eip=0x%08x rsp=", vcpu->vc_gueststate.vg_eip); + if (vmread(VMCS_GUEST_IA32_RSP, &r)) + DPRINTF("(error reading)\n"); + else + DPRINTF("0x%08x\n", r); + + DPRINTF(" cr0="); + if (vmread(VMCS_GUEST_IA32_CR0, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%08x ", r); + vmm_decode_cr0(r); + } + + DPRINTF(" cr2=0x%08x\n", vcpu->vc_gueststate.vg_cr2); + + DPRINTF(" cr3="); + if (vmread(VMCS_GUEST_IA32_CR3, &r)) + DPRINTF("(error reading)\n"); + else + DPRINTF("0x%08x ", r); + + DPRINTF(" cr4="); + if (vmread(VMCS_GUEST_IA32_CR4, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%08x ", r); + vmm_decode_cr4(r); + } + + DPRINTF(" --Guest Segment Info--\n"); + + DPRINTF(" cs="); + if (vmread(VMCS_GUEST_IA32_CS_SEL, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%04x rpl=%d", r, r & 0x3); + + DPRINTF(" base="); + if (vmread(VMCS_GUEST_IA32_CS_BASE, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_CS_LIMIT, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" a/r="); + if (vmread(VMCS_GUEST_IA32_CS_AR, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%04x\n ", r); + vmm_segment_desc_decode(r); + } + + DPRINTF(" ds="); + if (vmread(VMCS_GUEST_IA32_DS_SEL, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%04x rpl=%d", r, r & 0x3); + + DPRINTF(" base="); + if (vmread(VMCS_GUEST_IA32_DS_BASE, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_DS_LIMIT, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" a/r="); + if (vmread(VMCS_GUEST_IA32_DS_AR, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%04x\n ", r); + vmm_segment_desc_decode(r); + } + + DPRINTF(" es="); + if (vmread(VMCS_GUEST_IA32_ES_SEL, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%04x rpl=%d", r, r & 0x3); + + DPRINTF(" base="); + if (vmread(VMCS_GUEST_IA32_ES_BASE, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_ES_LIMIT, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" a/r="); + if (vmread(VMCS_GUEST_IA32_ES_AR, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%04x\n ", r); + vmm_segment_desc_decode(r); + } + + DPRINTF(" fs="); + if (vmread(VMCS_GUEST_IA32_FS_SEL, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%04x rpl=%d", r, r & 0x3); + + DPRINTF(" base="); + if (vmread(VMCS_GUEST_IA32_FS_BASE, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_FS_LIMIT, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" a/r="); + if (vmread(VMCS_GUEST_IA32_FS_AR, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%04x\n ", r); + vmm_segment_desc_decode(r); + } + + DPRINTF(" gs="); + if (vmread(VMCS_GUEST_IA32_GS_SEL, &r)) + 
DPRINTF("(error reading)"); + else + DPRINTF("0x%04x rpl=%d", r, r & 0x3); + + DPRINTF(" base="); + if (vmread(VMCS_GUEST_IA32_GS_BASE, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_GS_LIMIT, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" a/r="); + if (vmread(VMCS_GUEST_IA32_GS_AR, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%04x\n ", r); + vmm_segment_desc_decode(r); + } + + DPRINTF(" ss="); + if (vmread(VMCS_GUEST_IA32_SS_SEL, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%04x rpl=%d", r, r & 0x3); + + DPRINTF(" base="); + if (vmread(VMCS_GUEST_IA32_SS_BASE, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_SS_LIMIT, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" a/r="); + if (vmread(VMCS_GUEST_IA32_SS_AR, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%04x\n ", r); + vmm_segment_desc_decode(r); + } + + DPRINTF(" tr="); + if (vmread(VMCS_GUEST_IA32_TR_SEL, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%04x", r); + + DPRINTF(" base="); + if (vmread(VMCS_GUEST_IA32_TR_BASE, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_TR_LIMIT, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" a/r="); + if (vmread(VMCS_GUEST_IA32_TR_AR, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%04x\n ", r); + vmm_segment_desc_decode(r); + } + + DPRINTF(" gdtr base="); + if (vmread(VMCS_GUEST_IA32_GDTR_BASE, &r)) + DPRINTF("(error reading) "); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_GDTR_LIMIT, &r)) + DPRINTF("(error reading)\n"); + else + DPRINTF("0x%08x\n", r); + + DPRINTF(" idtr base="); + if (vmread(VMCS_GUEST_IA32_IDTR_BASE, &r)) + DPRINTF("(error reading) "); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_IDTR_LIMIT, &r)) + DPRINTF("(error reading)\n"); + else + DPRINTF("0x%08x\n", r); + + DPRINTF(" ldtr="); + if (vmread(VMCS_GUEST_IA32_LDTR_SEL, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%04x", r); + + DPRINTF(" base="); + if (vmread(VMCS_GUEST_IA32_LDTR_BASE, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" limit="); + if (vmread(VMCS_GUEST_IA32_LDTR_LIMIT, &r)) + DPRINTF("(error reading)"); + else + DPRINTF("0x%08x", r); + + DPRINTF(" a/r="); + if (vmread(VMCS_GUEST_IA32_LDTR_AR, &r)) + DPRINTF("(error reading)\n"); + else { + DPRINTF("0x%04x\n ", r); + vmm_segment_desc_decode(r); + } + + DPRINTF(" --Guest MSRs @ 0x%08x (paddr: 0x%08x)--\n", + (uint32_t)vcpu->vc_vmx_msr_exit_save_va, + (uint32_t)vcpu->vc_vmx_msr_exit_save_pa); + + msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; + + for (i = 0; i < VMX_NUM_MSR_STORE; i++) { + DPRINTF(" MSR %d @ %p : 0x%08x (%s), " + "value=0x%016llx ", + i, &msr_store[i], msr_store[i].vms_index, + msr_name_decode(msr_store[i].vms_index), + msr_store[i].vms_data); + vmm_decode_msr_value(msr_store[i].vms_index, + msr_store[i].vms_data); + } + + DPRINTF(" last PIC irq=%d\n", vcpu->vc_intr); +} + +/* + * msr_name_decode + * + * Returns a human-readable name for the MSR supplied in 'msr'. 
+ * + * Parameters: + * msr - The MSR to decode + * + * Return value: + * NULL-terminated character string containing the name of the MSR requested + */ +const char * +msr_name_decode(uint32_t msr) +{ + /* + * Add as needed. Also consider adding a decode function when + * adding to this table. + */ + + switch (msr) { + case MSR_TSC: return "TSC"; + case MSR_APICBASE: return "APIC base"; + case MSR_IA32_FEATURE_CONTROL: return "IA32 feature control"; + case MSR_PERFCTR0: return "perf counter 0"; + case MSR_PERFCTR1: return "perf counter 1"; + case MSR_TEMPERATURE_TARGET: return "temperature target"; + case MSR_MTRRcap: return "MTRR cap"; + case MSR_PERF_STATUS: return "perf status"; + case MSR_PERF_CTL: return "perf control"; + case MSR_MTRRvarBase: return "MTRR variable base"; + case MSR_MTRRfix64K_00000: return "MTRR fixed 64K"; + case MSR_MTRRfix16K_80000: return "MTRR fixed 16K"; + case MSR_MTRRfix4K_C0000: return "MTRR fixed 4K"; + case MSR_CR_PAT: return "PAT"; + case MSR_MTRRdefType: return "MTRR default type"; + case MSR_EFER: return "EFER"; + case MSR_STAR: return "STAR"; + case MSR_LSTAR: return "LSTAR"; + case MSR_CSTAR: return "CSTAR"; + case MSR_SFMASK: return "SFMASK"; + case MSR_FSBASE: return "FSBASE"; + case MSR_GSBASE: return "GSBASE"; + case MSR_KERNELGSBASE: return "KGSBASE"; + default: return "Unknown MSR"; + } +} + +/* + * vmm_segment_desc_decode + * + * Debug function to print segment information for supplied descriptor + * + * Parameters: + * val - The A/R bytes for the segment descriptor to decode + */ +void +vmm_segment_desc_decode(uint32_t val) +{ + uint16_t ar; + uint8_t g, type, s, dpl, p, dib, l; + uint32_t unusable; + + /* Exit early on unusable descriptors */ + unusable = val & 0x10000; + if (unusable) { + DPRINTF("(unusable)\n"); + return; + } + + ar = (uint16_t)val; + + g = (ar & 0x8000) >> 15; + dib = (ar & 0x4000) >> 14; + l = (ar & 0x2000) >> 13; + p = (ar & 0x80) >> 7; + dpl = (ar & 0x60) >> 5; + s = (ar & 0x10) >> 4; + type = (ar & 0xf); + + DPRINTF("granularity=%d dib=%d l(64 bit)=%d present=%d sys=%d ", + g, dib, l, p, s); + + DPRINTF("type="); + if (!s) { + switch (type) { + case SDT_SYSLDT: DPRINTF("ldt\n"); break; + case SDT_SYS386TSS: DPRINTF("tss (available)\n"); break; + case SDT_SYS386BSY: DPRINTF("tss (busy)\n"); break; + case SDT_SYS386CGT: DPRINTF("call gate\n"); break; + case SDT_SYS386IGT: DPRINTF("interrupt gate\n"); break; + case SDT_SYS386TGT: DPRINTF("trap gate\n"); break; + /* XXX handle 32 bit segment types by inspecting mode */ + default: DPRINTF("unknown"); + } + } else { + switch (type + 16) { + case SDT_MEMRO: DPRINTF("data, r/o\n"); break; + case SDT_MEMROA: DPRINTF("data, r/o, accessed\n"); break; + case SDT_MEMRW: DPRINTF("data, r/w\n"); break; + case SDT_MEMRWA: DPRINTF("data, r/w, accessed\n"); break; + case SDT_MEMROD: DPRINTF("data, r/o, expand down\n"); break; + case SDT_MEMRODA: DPRINTF("data, r/o, expand down, " + "accessed\n"); + break; + case SDT_MEMRWD: DPRINTF("data, r/w, expand down\n"); break; + case SDT_MEMRWDA: DPRINTF("data, r/w, expand down, " + "accessed\n"); + break; + case SDT_MEME: DPRINTF("code, x only\n"); break; + case SDT_MEMEA: DPRINTF("code, x only, accessed\n"); + case SDT_MEMER: DPRINTF("code, r/x\n"); break; + case SDT_MEMERA: DPRINTF("code, r/x, accessed\n"); break; + case SDT_MEMEC: DPRINTF("code, x only, conforming\n"); break; + case SDT_MEMEAC: DPRINTF("code, x only, conforming, " + "accessed\n"); + break; + case SDT_MEMERC: DPRINTF("code, r/x, conforming\n"); break; + case SDT_MEMERAC: 
DPRINTF("code, r/x, conforming, accessed\n"); + break; + } + } +} + +void +vmm_decode_cr0(uint32_t cr0) +{ + struct vmm_reg_debug_info cr0_info[11] = { + { CR0_PG, "PG ", "pg " }, + { CR0_CD, "CD ", "cd " }, + { CR0_NW, "NW ", "nw " }, + { CR0_AM, "AM ", "am " }, + { CR0_WP, "WP ", "wp " }, + { CR0_NE, "NE ", "ne " }, + { CR0_ET, "ET ", "et " }, + { CR0_TS, "TS ", "ts " }, + { CR0_EM, "EM ", "em " }, + { CR0_MP, "MP ", "mp " }, + { CR0_PE, "PE", "pe" } + }; + + uint8_t i; + + DPRINTF("("); + for (i = 0; i < 11; i++) + if (cr0 & cr0_info[i].vrdi_bit) + DPRINTF(cr0_info[i].vrdi_present); + else + DPRINTF(cr0_info[i].vrdi_absent); + + DPRINTF(")\n"); +} + +void +vmm_decode_cr4(uint32_t cr4) +{ + struct vmm_reg_debug_info cr4_info[19] = { + { CR4_PKE, "PKE ", "pke "}, + { CR4_SMAP, "SMAP ", "smap "}, + { CR4_SMEP, "SMEP ", "smep "}, + { CR4_OSXSAVE, "OSXSAVE ", "osxsave "}, + { CR4_PCIDE, "PCIDE ", "pcide "}, + { CR4_FSGSBASE, "FSGSBASE ", "fsgsbase "}, + { CR4_SMXE, "SMXE ", "smxe "}, + { CR4_VMXE, "VMXE ", "vmxe "}, + { CR4_OSXMMEXCPT, "OSXMMEXCPT ", "osxmmexcpt "}, + { CR4_OSFXSR, "OSFXSR ", "osfxsr "}, + { CR4_PCE, "PCE ", "pce "}, + { CR4_PGE, "PGE ", "pge "}, + { CR4_MCE, "MCE ", "mce "}, + { CR4_PAE, "PAE ", "pae "}, + { CR4_PSE, "PSE ", "pse "}, + { CR4_DE, "DE ", "de "}, + { CR4_TSD, "TSD ", "tsd "}, + { CR4_PVI, "PVI ", "pvi "}, + { CR4_VME, "VME", "vme"} + }; + + uint8_t i; + + DPRINTF("("); + for (i = 0; i < 19; i++) + if (cr4 & cr4_info[i].vrdi_bit) + DPRINTF(cr4_info[i].vrdi_present); + else + DPRINTF(cr4_info[i].vrdi_absent); + + DPRINTF(")\n"); +} + +void +vmm_decode_apicbase_msr_value(uint64_t apicbase) +{ + struct vmm_reg_debug_info apicbase_info[3] = { + { APICBASE_BSP, "BSP ", "bsp "}, + { APICBASE_ENABLE_X2APIC, "X2APIC ", "x2apic "}, + { APICBASE_GLOBAL_ENABLE, "GLB_EN", "glb_en"} + }; + + uint8_t i; + + DPRINTF("("); + for (i = 0; i < 3; i++) + if (apicbase & apicbase_info[i].vrdi_bit) + DPRINTF(apicbase_info[i].vrdi_present); + else + DPRINTF(apicbase_info[i].vrdi_absent); + + DPRINTF(")\n"); +} + +void +vmm_decode_ia32_fc_value(uint64_t fcr) +{ + struct vmm_reg_debug_info fcr_info[4] = { + { IA32_FEATURE_CONTROL_LOCK, "LOCK ", "lock "}, + { IA32_FEATURE_CONTROL_SMX_EN, "SMX ", "smx "}, + { IA32_FEATURE_CONTROL_VMX_EN, "VMX ", "vmx "}, + { IA32_FEATURE_CONTROL_SENTER_EN, "SENTER ", "senter "} + }; + + uint8_t i; + + DPRINTF("("); + for (i = 0; i < 4; i++) + if (fcr & fcr_info[i].vrdi_bit) + DPRINTF(fcr_info[i].vrdi_present); + else + DPRINTF(fcr_info[i].vrdi_absent); + + if (fcr & IA32_FEATURE_CONTROL_SENTER_EN) + DPRINTF(" [SENTER param = 0x%llx]", + (fcr & IA32_FEATURE_CONTROL_SENTER_PARAM_MASK) >> 8); + + DPRINTF(")\n"); +} + +void +vmm_decode_mtrrcap_value(uint64_t val) +{ + struct vmm_reg_debug_info mtrrcap_info[3] = { + { MTRRcap_FIXED, "FIXED ", "fixed "}, + { MTRRcap_WC, "WC ", "wc "}, + { MTRRcap_SMRR, "SMRR ", "smrr "} + }; + + uint8_t i; + + DPRINTF("("); + for (i = 0; i < 3; i++) + if (val & mtrrcap_info[i].vrdi_bit) + DPRINTF(mtrrcap_info[i].vrdi_present); + else + DPRINTF(mtrrcap_info[i].vrdi_absent); + + if (val & MTRRcap_FIXED) + DPRINTF(" [nr fixed ranges = 0x%llx]", + (val & 0xff)); + + DPRINTF(")\n"); +} + +void +vmm_decode_perf_status_value(uint64_t val) +{ + DPRINTF("(pstate ratio = 0x%llx)\n", (val & 0xffff)); +} + +void vmm_decode_perf_ctl_value(uint64_t val) +{ + DPRINTF("(%s ", (val & PERF_CTL_TURBO) ? 
"TURBO" : "turbo"); + DPRINTF("pstate req = 0x%llx)\n", (val & 0xfffF)); +} + +void +vmm_decode_mtrrdeftype_value(uint64_t mtrrdeftype) +{ + struct vmm_reg_debug_info mtrrdeftype_info[2] = { + { MTRRdefType_FIXED_ENABLE, "FIXED ", "fixed "}, + { MTRRdefType_ENABLE, "ENABLED ", "enabled "}, + }; + + uint8_t i; + int type; + + DPRINTF("("); + for (i = 0; i < 2; i++) + if (mtrrdeftype & mtrrdeftype_info[i].vrdi_bit) + DPRINTF(mtrrdeftype_info[i].vrdi_present); + else + DPRINTF(mtrrdeftype_info[i].vrdi_absent); + + DPRINTF("type = "); + type = mtrr2mrt(mtrrdeftype & 0xff); + switch (type) { + case MDF_UNCACHEABLE: DPRINTF("UC"); break; + case MDF_WRITECOMBINE: DPRINTF("WC"); break; + case MDF_WRITETHROUGH: DPRINTF("WT"); break; + case MDF_WRITEPROTECT: DPRINTF("RO"); break; + case MDF_WRITEBACK: DPRINTF("WB"); break; + case MDF_UNKNOWN: + default: + DPRINTF("??"); + break; + } + + DPRINTF(")\n"); +} + +void +vmm_decode_efer_value(uint64_t efer) +{ + struct vmm_reg_debug_info efer_info[4] = { + { EFER_SCE, "SCE ", "sce "}, + { EFER_LME, "LME ", "lme "}, + { EFER_LMA, "LMA ", "lma "}, + { EFER_NXE, "NXE", "nxe"}, + }; + + uint8_t i; + + DPRINTF("("); + for (i = 0; i < 4; i++) + if (efer & efer_info[i].vrdi_bit) + DPRINTF(efer_info[i].vrdi_present); + else + DPRINTF(efer_info[i].vrdi_absent); + + DPRINTF(")\n"); +} + +void +vmm_decode_msr_value(uint64_t msr, uint64_t val) +{ + switch (msr) { + case MSR_APICBASE: vmm_decode_apicbase_msr_value(val); break; + case MSR_IA32_FEATURE_CONTROL: vmm_decode_ia32_fc_value(val); break; + case MSR_MTRRcap: vmm_decode_mtrrcap_value(val); break; + case MSR_PERF_STATUS: vmm_decode_perf_status_value(val); break; + case MSR_PERF_CTL: vmm_decode_perf_ctl_value(val); break; + case MSR_MTRRdefType: vmm_decode_mtrrdeftype_value(val); break; + case MSR_EFER: vmm_decode_efer_value(val); break; + default: DPRINTF("\n"); + } +} +#endif /* VMM_DEBUG */ diff --git a/sys/arch/i386/i386/vmm_support.S b/sys/arch/i386/i386/vmm_support.S new file mode 100644 index 00000000000..54d41349586 --- /dev/null +++ b/sys/arch/i386/i386/vmm_support.S @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "assym.h" +#include <machine/asm.h> +#include <machine/specialreg.h> + +/* + * XXX duplicated in vmmvar.h due to song-and-dance with sys/rwlock.h inclusion + * here + */ +#define VMX_FAIL_LAUNCH_UNKNOWN 1 +#define VMX_FAIL_LAUNCH_INVALID_VMCS 2 +#define VMX_FAIL_LAUNCH_VALID_VMCS 3 + + .text + .code32 + .align 16 + .global _C_LABEL(vmxon) + .global _C_LABEL(vmxoff) + .global _C_LABEL(vmclear) + .global _C_LABEL(vmptrld) + .global _C_LABEL(vmptrst) + .global _C_LABEL(vmwrite) + .global _C_LABEL(vmread) + .global _C_LABEL(invvpid) + .global _C_LABEL(invept) + .global _C_LABEL(vmx_enter_guest) + .global _C_LABEL(vmm_dispatch_intr) + +_C_LABEL(vmm_dispatch_intr): + movl %esp, %eax + andl $0xFFFFFFF0, %esp + pushl %ss + pushl %eax + pushfl + pushl %cs + cli + movl 4(%eax), %eax + calll *%eax + addl $0x8, %esp + ret + +_C_LABEL(vmxon): + movl 4(%esp), %eax + vmxon (%eax) + jz failed_on + jc failed_on + xorl %eax, %eax + ret +failed_on: + movl $0x01, %eax + ret + +_C_LABEL(vmxoff): + vmxoff + jz failed_off + jc failed_off + xorl %eax, %eax + ret +failed_off: + movl $0x01, %eax + ret + +_C_LABEL(vmclear): + movl 0x04(%esp), %eax + vmclear (%eax) + jz failed_clear + jc failed_clear + xorl %eax, %eax + ret +failed_clear: + movl $0x01, %eax + ret + +_C_LABEL(vmptrld): + movl 4(%esp), %eax + vmptrld (%eax) + jz failed_ptrld + jc failed_ptrld + xorl %eax, %eax + ret +failed_ptrld: + movl $0x01, %eax + ret + +_C_LABEL(vmptrst): + movl 0x04(%esp), %eax + vmptrst (%eax) + jz failed_ptrst + jc failed_ptrst + xorl %eax, %eax + ret +failed_ptrst: + movl $0x01, %eax + ret + +_C_LABEL(vmwrite): + movl 0x04(%esp), %eax + vmwrite 0x08(%esp), %eax + jz failed_write + jc failed_write + xorl %eax, %eax + ret +failed_write: + movl $0x01, %eax + ret + +_C_LABEL(vmread): + pushl %ebx + movl 0x08(%esp), %ebx + movl 0x0c(%esp), %eax + vmread %ebx, (%eax) + jz failed_read + jc failed_read + popl %ebx + xorl %eax, %eax + ret +failed_read: + popl %ebx + movl $0x01, %eax + ret + +_C_LABEL(invvpid): + pushl %ebx + movl 0x08(%esp), %eax + movl 0x0c(%esp), %ebx + invvpid (%ebx), %eax + popl %ebx + ret + +_C_LABEL(invept): + movl 0x04(%esp), %eax + invept 0x08(%esp), %eax + ret + +_C_LABEL(vmx_enter_guest): + pushl %ebx + pushl %ecx + pushl %edx + movl 0x14(%esp), %edx /* Guest Regs Pointer */ + movl 0x18(%esp), %ebx /* resume flag */ + testl %ebx, %ebx + jnz skip_init + + /* + * XXX make vmx_exit_handler a global and put this in the per-vcpu + * init code + */ + movl $VMCS_HOST_IA32_RIP, %eax + movl $vmx_exit_handler_asm, %ecx + vmwrite %ecx, %eax + +skip_init: + pushfl + + strw %ax + pushw %ax + movw %es, %ax + pushw %ax + movw %ds, %ax + pushw %ax + movw %ss, %ax + pushw %ax + pushw %fs + pushw %gs + + pushl %ebp + pushl %esi + pushl %edi + pushl %edx /* Guest Regs Pointer */ + + movl $VMCS_HOST_IA32_RSP, %edi + movl %esp, %eax + vmwrite %eax, %edi + + testl %ebx, %ebx + jnz do_resume + + /* Restore guest registers */ + movl 0x1c(%edx), %eax + movl %eax, %cr2 + movl 0x18(%edx), %ebp + movl 0x14(%edx), %edi + movl 0x0c(%edx), %ecx + movl 0x08(%edx), %ebx + movl 0x04(%edx), %eax + movl (%edx), %esi + movl 0x10(%edx), %edx + + vmlaunch + jmp fail_launch_or_resume +do_resume: + /* Restore guest registers */ + movl 0x1c(%edx), %eax + movl %eax, %cr2 + movl 0x18(%edx), %ebp + movl 0x14(%edx), %edi + movl 0x0c(%edx), %ecx + movl 0x08(%edx), %ebx + movl 0x04(%edx), %eax + movl (%edx), %esi + movl 0x10(%edx), %edx + vmresume +fail_launch_or_resume: + /* Failed launch/resume (fell through) */ + jc 
fail_launch_invalid_vmcs /* Invalid VMCS */ + jz fail_launch_valid_vmcs /* Valid VMCS, failed launch/resume */ + + /* Unknown failure mode (not documented as per Intel SDM) */ + movl $VMX_FAIL_LAUNCH_UNKNOWN, %eax + popl %edx + jmp restore_host + +fail_launch_invalid_vmcs: + movl $VMX_FAIL_LAUNCH_INVALID_VMCS, %eax + popl %edx + jmp restore_host + +fail_launch_valid_vmcs: + movl $VMCS_INSTRUCTION_ERROR, %edi + popl %edx + vmread %edi, %eax + /* XXX check failure of vmread */ + movl %eax, 0x20(%edx) + movl $VMX_FAIL_LAUNCH_VALID_VMCS, %eax + jmp restore_host + +vmx_exit_handler_asm: + /* Preserve guest registers not saved in VMCS */ + pushl %esi + pushl %edi + movl 0x8(%esp), %edi + movl 0x4(%esp), %esi + movl %esi, (%edi) + popl %edi + popl %esi /* discard */ + + popl %esi + movl %eax, 0x4(%esi) + movl %ebx, 0x8(%esi) + movl %ecx, 0xc(%esi) + movl %edx, 0x10(%esi) + movl %edi, 0x14(%esi) + movl %ebp, 0x18(%esi) + movl %cr2, %eax + movl %eax, 0x1c(%esi) + +restore_host: + popl %edi + popl %esi + popl %ebp + + popw %gs + popw %fs + popw %ax + movw %ax, %ss + popw %ax + movw %ax, %ds + popw %ax + movw %ax, %es + xorl %ecx, %ecx + popw %cx + + popfl + + movl 0x1c(%esp), %ebx + leal (%ebx, %ecx), %eax + andb $0xF9, 5(%eax) + ltr %cx + + popl %edx + popl %ecx + popl %ebx + + xorl %eax, %eax + + ret diff --git a/sys/arch/i386/include/cpu.h b/sys/arch/i386/include/cpu.h index 58b823d64ab..3c140f26cd3 100644 --- a/sys/arch/i386/include/cpu.h +++ b/sys/arch/i386/include/cpu.h @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.h,v 1.149 2016/10/14 04:53:26 mlarkin Exp $ */ +/* $OpenBSD: cpu.h,v 1.150 2016/10/21 06:20:58 mlarkin Exp $ */ /* $NetBSD: cpu.h,v 1.35 1996/05/05 19:29:26 christos Exp $ */ /*- @@ -69,6 +69,36 @@ struct intrsource; +/* VMXON region (Intel) */ +struct vmxon_region { + uint32_t vr_revision; +}; + +/* + * VMX for Intel CPUs + */ +struct vmx { + uint64_t vmx_cr0_fixed0; + uint64_t vmx_cr0_fixed1; + uint64_t vmx_cr4_fixed0; + uint64_t vmx_cr4_fixed1; + uint32_t vmx_vmxon_revision; + uint32_t vmx_msr_table_size; + uint32_t vmx_cr3_tgt_count; + uint64_t vmx_vm_func; +}; + +/* + * SVM for AMD CPUs + */ +struct svm { +}; + +union vmm_cpu_cap { + struct vmx vcc_vmx; + struct svm vcc_svm; +}; + #ifdef _KERNEL /* XXX stuff to move to cpuvar.h later */ struct cpu_info { @@ -158,6 +188,14 @@ struct cpu_info { #ifdef GPROF struct gmonparam *ci_gmon; #endif + u_int32_t ci_vmm_flags; +#define CI_VMM_VMX (1 << 0) +#define CI_VMM_SVM (1 << 1) +#define CI_VMM_RVI (1 << 2) +#define CI_VMM_EPT (1 << 3) + union vmm_cpu_cap ci_vmm_cap; + uint64_t ci_vmxon_region_pa; /* Must be 64 bit */ + struct vmxon_region *ci_vmxon_region; }; /* @@ -177,6 +215,7 @@ struct cpu_info { #define CPUF_PRESENT 0x1000 /* CPU is present */ #define CPUF_RUNNING 0x2000 /* CPU is running */ +#define CPUF_VMM 0x4000 /* CPU is executing in VMM mode */ /* * We statically allocate the CPU info for the primary CPU (or, diff --git a/sys/arch/i386/include/intrdefs.h b/sys/arch/i386/include/intrdefs.h index 0384febd3f8..fba06ef79e9 100644 --- a/sys/arch/i386/include/intrdefs.h +++ b/sys/arch/i386/include/intrdefs.h @@ -1,4 +1,4 @@ -/* $OpenBSD: intrdefs.h,v 1.14 2013/05/16 19:26:04 kettenis Exp $ */ +/* $OpenBSD: intrdefs.h,v 1.15 2016/10/21 06:20:58 mlarkin Exp $ */ /* $NetBSD: intrdefs.h,v 1.2 2003/05/04 22:01:56 fvdl Exp $ */ #ifndef _I386_INTRDEFS_H @@ -115,13 +115,16 @@ #define I386_IPI_GDT 0x00000020 #define I386_IPI_DDB 0x00000040 /* synchronize while in ddb */ #define I386_IPI_SETPERF 0x00000080 +#define I386_IPI_START_VMM 0x00000100 
+#define I386_IPI_STOP_VMM 0x00000200 -#define I386_NIPI 8 +#define I386_NIPI 10 #define I386_IPI_NAMES { "halt IPI", "nop IPI", "FPU flush IPI", \ "FPU synch IPI", \ "MTRR update IPI", "GDT update IPI", \ - "DDB IPI", "setperf IPI" } + "DDB IPI", "setperf IPI", "VMM start IPI", \ + "VMM stop IPI" } #define IREENT_MAGIC 0x18041969 diff --git a/sys/arch/i386/include/pmap.h b/sys/arch/i386/include/pmap.h index 1614b117cab..8751e11be56 100644 --- a/sys/arch/i386/include/pmap.h +++ b/sys/arch/i386/include/pmap.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.h,v 1.82 2016/03/15 03:17:51 guenther Exp $ */ +/* $OpenBSD: pmap.h,v 1.83 2016/10/21 06:20:59 mlarkin Exp $ */ /* $NetBSD: pmap.h,v 1.44 2000/04/24 17:18:18 thorpej Exp $ */ /* @@ -88,6 +88,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */ * page list, and number of PTPs within the pmap. */ +#define PMAP_TYPE_NORMAL 1 +#define PMAP_TYPE_EPT 2 +#define PMAP_TYPE_RVI 3 +#define pmap_nested(pm) ((pm)->pm_type != PMAP_TYPE_NORMAL) + struct pmap { uint64_t pm_pdidx[4]; /* PDIEs for PAE mode */ @@ -106,6 +111,10 @@ struct pmap { int pm_flags; /* see below */ struct segment_descriptor pm_codeseg; /* cs descriptor for process */ + int pm_type; /* Type of pmap this is (PMAP_TYPE_x) */ + vaddr_t pm_npt_pml4; /* Nested paging PML4 VA */ + paddr_t pm_npt_pa; /* Nested paging PML4 PA */ + vaddr_t pm_npt_pdpt; /* Nested paging PDPT */ }; /* @@ -246,6 +255,7 @@ void pmap_switch(struct proc *, struct proc *); vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */ paddr_t vtophys(vaddr_t va); paddr_t vtophys_pae(vaddr_t va); +int pmap_convert(struct pmap *, int); extern u_int32_t (*pmap_pte_set_p)(vaddr_t, paddr_t, u_int32_t); extern u_int32_t (*pmap_pte_setbits_p)(vaddr_t, u_int32_t, u_int32_t); diff --git a/sys/arch/i386/include/pte.h b/sys/arch/i386/include/pte.h index c0e1ccfb83d..aa9b62341d6 100644 --- a/sys/arch/i386/include/pte.h +++ b/sys/arch/i386/include/pte.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pte.h,v 1.21 2015/04/12 18:37:54 mlarkin Exp $ */ +/* $OpenBSD: pte.h,v 1.22 2016/10/21 06:20:59 mlarkin Exp $ */ /* $NetBSD: pte.h,v 1.11 1998/02/06 21:58:05 thorpej Exp $ */ /* @@ -67,6 +67,13 @@ #define PG_AVAIL3 0x00000800 /* ignored by hardware */ #define PG_PATLG 0x00001000 /* PAT on large pages */ +/* EPT PTE bits */ +#define EPT_R (1ULL << 0) +#define EPT_W (1ULL << 1) +#define EPT_X (1ULL << 2) +#define EPT_WB (6ULL << 3) +#define EPT_PS (1ULL << 7) + /* Cacheability bits when we are using PAT */ #define PG_WB (0) /* The default */ #define PG_WC (PG_WT) /* WT and CD is WC */ diff --git a/sys/arch/i386/include/specialreg.h b/sys/arch/i386/include/specialreg.h index 8bfd61b766e..aa02392022b 100644 --- a/sys/arch/i386/include/specialreg.h +++ b/sys/arch/i386/include/specialreg.h @@ -1,4 +1,4 @@ -/* $OpenBSD: specialreg.h,v 1.57 2016/09/03 13:35:03 mlarkin Exp $ */ +/* $OpenBSD: specialreg.h,v 1.58 2016/10/21 06:20:59 mlarkin Exp $ */ /* $NetBSD: specialreg.h,v 1.7 1994/10/27 04:16:26 cgd Exp $ */ /*- @@ -69,6 +69,12 @@ /* the remaining 7 bits of this register are reserved */ /* + * bits in CR3 + */ +#define CR3_PWT (1ULL << 3) +#define CR3_PCD (1ULL << 4) + +/* * bits in the pentiums %cr4 register: */ @@ -91,6 +97,7 @@ #define CR4_OSXSAVE 0x00040000 /* enable XSAVE and extended states */ #define CR4_SMEP 0x00100000 /* supervisor mode exec protection */ #define CR4_SMAP 0x00200000 /* supervisor mode access prevention */ +#define CR4_PKE 0x00400000 /* protection key enable */ /* * CPUID "features" bits (CPUID function 0x1): @@ -296,14 
+303,20 @@ #define P5MSR_CTR0 0x012 /* P5 only (trap on P6) */ #define P5MSR_CTR1 0x013 /* P5 only (trap on P6) */ #define MSR_APICBASE 0x01b +#define APICBASE_BSP 0x100 +#define APICBASE_ENABLE_X2APIC 0x400 +#define APICBASE_GLOBAL_ENABLE 0x800 #define MSR_EBL_CR_POWERON 0x02a #define MSR_EBC_FREQUENCY_ID 0x02c /* Pentium 4 only */ #define MSR_TEST_CTL 0x033 +#define MSR_IA32_FEATURE_CONTROL 0x03a #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 /* PII+ only */ #define MSR_BBL_CR_D1 0x089 /* PII+ only */ #define MSR_BBL_CR_D2 0x08a /* PII+ only */ #define MSR_BIOS_SIGN 0x08b +#define MSR_PERFCTR0 0x0c1 +#define MSR_PERFCTR1 0x0c2 #define P6MSR_CTR0 0x0c1 #define P6MSR_CTR1 0x0c2 #define MSR_FSB_FREQ 0x0cd /* Core Duo/Solo only */ @@ -422,6 +435,7 @@ #define EFER_LME 0x00000100 /* Long Mode Active */ #define EFER_LMA 0x00000400 /* Long Mode Enabled */ #define EFER_NXE 0x00000800 /* No-Execute Enabled */ +#define EFER_SVME 0x00001000 /* SVM Enabled */ #define MSR_STAR 0xc0000081 /* 32 bit syscall gate addr */ #define MSR_LSTAR 0xc0000082 /* 64 bit syscall gate addr */ @@ -688,3 +702,354 @@ #define C3_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */ #define C3_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */ #define C3_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */ + +/* + * VMX + */ +#define IA32_FEATURE_CONTROL_LOCK 0x01 +#define IA32_FEATURE_CONTROL_SMX_EN 0x02 +#define IA32_FEATURE_CONTROL_VMX_EN 0x04 +#define IA32_FEATURE_CONTROL_SENTER_EN (1ULL << 15) +#define IA32_FEATURE_CONTROL_SENTER_PARAM_MASK 0x7f00 +#define IA32_VMX_BASIC 0x480 +#define IA32_VMX_PINBASED_CTLS 0x481 +#define IA32_VMX_PROCBASED_CTLS 0x482 +#define IA32_VMX_EXIT_CTLS 0x483 +#define IA32_VMX_ENTRY_CTLS 0x484 +#define IA32_VMX_MISC 0x485 +#define IA32_VMX_CR0_FIXED0 0x486 +#define IA32_VMX_CR0_FIXED1 0x487 +#define IA32_VMX_CR4_FIXED0 0x488 +#define IA32_VMX_CR4_FIXED1 0x489 +#define IA32_VMX_PROCBASED2_CTLS 0x48B +#define IA32_VMX_EPT_VPID_CAP 0x48C +#define IA32_VMX_TRUE_PINBASED_CTLS 0x48D +#define IA32_VMX_TRUE_PROCBASED_CTLS 0x48E +#define IA32_VMX_TRUE_EXIT_CTLS 0x48F +#define IA32_VMX_TRUE_ENTRY_CTLS 0x490 +#define IA32_VMX_VMFUNC 0x491 + +#define IA32_EPT_VPID_CAP_PAGE_WALK_4 (1ULL << 6) +#define IA32_EPT_VPID_CAP_WB (1ULL << 14) +#define IA32_EPT_VPID_CAP_AD_BITS (1ULL << 21) + +#define IA32_EPT_PAGING_CACHE_TYPE_UC 0x0 +#define IA32_EPT_PAGING_CACHE_TYPE_WB 0x6 +#define IA32_EPT_AD_BITS_ENABLE (1ULL << 6) +#define IA32_EPT_PAGE_WALK_LENGTH 0x4 + +/* VMX : IA32_VMX_BASIC bits */ +#define IA32_VMX_TRUE_CTLS_AVAIL (1ULL << 55) + +/* VMX : IA32_VMX_PINBASED_CTLS bits */ +#define IA32_VMX_EXTERNAL_INT_EXITING (1ULL << 0) +#define IA32_VMX_NMI_EXITING (1ULL << 3) +#define IA32_VMX_VIRTUAL_NMIS (1ULL << 5) +#define IA32_VMX_ACTIVATE_VMX_PREEMPTION_TIMER (1ULL << 6) +#define IA32_VMX_PROCESS_POSTED_INTERRUPTS (1ULL << 7) + +/* VMX : IA32_VMX_PROCBASED_CTLS bits */ +#define IA32_VMX_INTERRUPT_WINDOW_EXITING (1ULL << 2) +#define IA32_VMX_USE_TSC_OFFSETTING (1ULL << 3) +#define IA32_VMX_HLT_EXITING (1ULL << 7) +#define IA32_VMX_INVLPG_EXITING (1ULL << 9) +#define IA32_VMX_MWAIT_EXITING (1ULL << 10) +#define IA32_VMX_RDPMC_EXITING (1ULL << 11) +#define IA32_VMX_RDTSC_EXITING (1ULL << 12) +#define IA32_VMX_CR3_LOAD_EXITING (1ULL << 15) +#define IA32_VMX_CR3_STORE_EXITING (1ULL << 16) +#define IA32_VMX_CR8_LOAD_EXITING (1ULL << 19) +#define IA32_VMX_CR8_STORE_EXITING (1ULL << 20) +#define IA32_VMX_USE_TPR_SHADOW (1ULL << 21) +#define IA32_VMX_NMI_WINDOW_EXITING (1ULL << 22) +#define 
IA32_VMX_MOV_DR_EXITING (1ULL << 23) +#define IA32_VMX_UNCONDITIONAL_IO_EXITING (1ULL << 24) +#define IA32_VMX_USE_IO_BITMAPS (1ULL << 25) +#define IA32_VMX_MONITOR_TRAP_FLAG (1ULL << 27) +#define IA32_VMX_USE_MSR_BITMAPS (1ULL << 28) +#define IA32_VMX_MONITOR_EXITING (1ULL << 29) +#define IA32_VMX_PAUSE_EXITING (1ULL << 30) +#define IA32_VMX_ACTIVATE_SECONDARY_CONTROLS (1ULL << 31) + +/* VMX : IA32_VMX_PROCBASED2_CTLS bits */ +#define IA32_VMX_VIRTUALIZE_APIC (1ULL << 0) +#define IA32_VMX_ENABLE_EPT (1ULL << 1) +#define IA32_VMX_DESCRIPTOR_TABLE_EXITING (1ULL << 2) +#define IA32_VMX_ENABLE_RDTSCP (1ULL << 3) +#define IA32_VMX_VIRTUALIZE_X2APIC_MODE (1ULL << 4) +#define IA32_VMX_ENABLE_VPID (1ULL << 5) +#define IA32_VMX_WBINVD_EXITING (1ULL << 6) +#define IA32_VMX_UNRESTRICTED_GUEST (1ULL << 7) +#define IA32_VMX_APIC_REGISTER_VIRTUALIZATION (1ULL << 8) +#define IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY (1ULL << 9) +#define IA32_VMX_PAUSE_LOOP_EXITING (1ULL << 10) +#define IA32_VMX_RDRAND_EXITING (1ULL << 11) +#define IA32_VMX_ENABLE_INVPCID (1ULL << 12) +#define IA32_VMX_ENABLE_VM_FUNCTIONS (1ULL << 13) +#define IA32_VMX_VMCS_SHADOWING (1ULL << 14) +#define IA32_VMX_ENABLE_ENCLS_EXITING (1ULL << 15) +#define IA32_VMX_RDSEED_EXITING (1ULL << 16) +#define IA32_VMX_ENABLE_PML (1ULL << 17) +#define IA32_VMX_EPT_VIOLATION_VE (1ULL << 18) +#define IA32_VMX_CONCEAL_VMX_FROM_PT (1ULL << 19) +#define IA32_VMX_ENABLE_XSAVES_XRSTORS (1ULL << 20) +#define IA32_VMX_ENABLE_TSC_SCALING (1ULL << 25) + +/* VMX : IA32_VMX_EXIT_CTLS bits */ +#define IA32_VMX_SAVE_DEBUG_CONTROLS (1ULL << 2) +#define IA32_VMX_HOST_SPACE_ADDRESS_SIZE (1ULL << 9) +#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT (1ULL << 12) +#define IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT (1ULL << 15) +#define IA32_VMX_SAVE_IA32_PAT_ON_EXIT (1ULL << 18) +#define IA32_VMX_LOAD_IA32_PAT_ON_EXIT (1ULL << 19) +#define IA32_VMX_SAVE_IA32_EFER_ON_EXIT (1ULL << 20) +#define IA32_VMX_LOAD_IA32_EFER_ON_EXIT (1ULL << 21) +#define IA32_VMX_SAVE_VMX_PREEMPTION_TIMER (1ULL << 22) +#define IA32_VMX_CLEAR_IA32_BNDCFGS_ON_EXIT (1ULL << 23) +#define IA32_VMX_CONCEAL_VM_EXITS_FROM_PT (1ULL << 24) + +/* VMX: IA32_VMX_ENTRY_CTLS bits */ +#define IA32_VMX_LOAD_DEBUG_CONTROLS (1ULL << 2) +#define IA32_VMX_IA32E_MODE_GUEST (1ULL << 9) +#define IA32_VMX_ENTRY_TO_SMM (1ULL << 10) +#define IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT (1ULL << 11) +#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY (1ULL << 13) +#define IA32_VMX_LOAD_IA32_PAT_ON_ENTRY (1ULL << 14) +#define IA32_VMX_LOAD_IA32_EFER_ON_ENTRY (1ULL << 15) +#define IA32_VMX_LOAD_IA32_BNDCFGS_ON_ENTRY (1ULL << 16) +#define IA32_VMX_CONCEAL_VM_ENTRIES_FROM_PT (1ULL << 17) + +/* + * VMX : VMCS Fields + */ + +/* 16-bit control fields */ +#define VMCS_GUEST_VPID 0x0000 +#define VMCS_POSTED_INT_NOTIF_VECTOR 0x0002 +#define VMCS_EPTP_INDEX 0x0004 + +/* 16-bit guest state fields */ +#define VMCS_GUEST_IA32_ES_SEL 0x0800 +#define VMCS_GUEST_IA32_CS_SEL 0x0802 +#define VMCS_GUEST_IA32_SS_SEL 0x0804 +#define VMCS_GUEST_IA32_DS_SEL 0x0806 +#define VMCS_GUEST_IA32_FS_SEL 0x0808 +#define VMCS_GUEST_IA32_GS_SEL 0x080A +#define VMCS_GUEST_IA32_LDTR_SEL 0x080C +#define VMCS_GUEST_IA32_TR_SEL 0x080E +#define VMCS_GUEST_INTERRUPT_STATUS 0x0810 +#define VMCS_GUEST_PML_INDEX 0x0812 + +/* 16-bit host state fields */ +#define VMCS_HOST_IA32_ES_SEL 0x0C00 +#define VMCS_HOST_IA32_CS_SEL 0x0C02 +#define VMCS_HOST_IA32_SS_SEL 0x0C04 +#define VMCS_HOST_IA32_DS_SEL 0x0C06 +#define VMCS_HOST_IA32_FS_SEL 0x0C08 +#define VMCS_HOST_IA32_GS_SEL 
0x0C0A +#define VMCS_HOST_IA32_TR_SEL 0x0C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x2000 +#define VMCS_IO_BITMAP_B 0x2002 +#define VMCS_MSR_BITMAP_ADDRESS 0x2004 +#define VMCS_MSR_BITMAP_ADDRESS_HI 0x2005 +#define VMCS_EXIT_STORE_MSR_ADDRESS 0x2006 +#define VMCS_EXIT_STORE_MSR_ADDRESS_HI 0x2007 +#define VMCS_EXIT_LOAD_MSR_ADDRESS 0x2008 +#define VMCS_EXIT_LOAD_MSR_ADDRESS_HI 0x2009 +#define VMCS_ENTRY_LOAD_MSR_ADDRESS 0x200A +#define VMCS_ENTRY_LOAD_MSR_ADDRESS_HI 0x200B +#define VMCS_EXECUTIVE_VMCS_POINTER 0x200C +#define VMCS_PML_ADDRESS 0x200E +#define VMCS_TSC_OFFSET 0x2010 +#define VMCS_VIRTUAL_APIC_ADDRESS 0x2012 +#define VMCS_APIC_ACCESS_ADDRESS 0x2014 +#define VMCS_POSTED_INTERRUPT_DESC 0x2016 +#define VMCS_VM_FUNCTION_CONTROLS 0x2018 +#define VMCS_GUEST_IA32_EPTP 0x201A +#define VMCS_GUEST_IA32_EPTP_HI 0x201B +#define VMCS_EOI_EXIT_BITMAP_0 0x201C +#define VMCS_EOI_EXIT_BITMAP_1 0x201E +#define VMCS_EOI_EXIT_BITMAP_2 0x2020 +#define VMCS_EOI_EXIT_BITMAP_3 0x2022 +#define VMCS_EPTP_LIST_ADDRESS 0x2024 +#define VMCS_VMREAD_BITMAP_ADDRESS 0x2026 +#define VMCS_VMWRITE_BITMAP_ADDRESS 0x2028 +#define VMCS_VIRTUALIZATION_EXC_ADDRESS 0x202A +#define VMCS_XSS_EXITING_BITMAP 0x202C +#define VMCS_ENCLS_EXITING_BITMAP 0x202E +#define VMCS_TSC_MULTIPLIER 0x2032 + +/* 64-bit RO data field */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x2400 +#define VMCS_GUEST_PHYSICAL_ADDRESS_HI 0x2401 + +/* 64-bit guest state fields */ +#define VMCS_LINK_POINTER 0x2800 +#define VMCS_LINK_POINTER_HI 0x2801 +#define VMCS_GUEST_IA32_DEBUGCTL 0x2802 +#define VMCS_GUEST_IA32_PAT 0x2804 +#define VMCS_GUEST_IA32_EFER 0x2806 +#define VMCS_GUEST_IA32_PERF_GBL_CTRL 0x2808 +#define VMCS_GUEST_PDPTE0 0x280A +#define VMCS_GUEST_PDPTE1 0x280C +#define VMCS_GUEST_PDPTE2 0x280E +#define VMCS_GUEST_PDPTE3 0x2810 +#define VMCS_GUEST_IA32_BNDCFGS 0x2812 + +/* 64-bit host state fields */ +#define VMCS_HOST_IA32_PAT 0x2C00 +#define VMCS_HOST_IA32_EFER 0x2C02 +#define VMCS_HOST_IA32_PERF_GBL_CTRL 0x2C04 + +/* 32-bit control fields */ +#define VMCS_PINBASED_CTLS 0x4000 +#define VMCS_PROCBASED_CTLS 0x4002 +#define VMCS_EXCEPTION_BITMAP 0x4004 +#define VMCS_PF_ERROR_CODE_MASK 0x4006 +#define VMCS_PF_ERROR_CODE_MATCH 0x4008 +#define VMCS_CR3_TARGET_COUNT 0x400A +#define VMCS_EXIT_CTLS 0x400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x4010 +#define VMCS_ENTRY_CTLS 0x4012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x4014 +#define VMCS_ENTRY_INTERRUPTION_INFO 0x4016 +#define VMCS_ENTRY_EXCEPTION_ERROR_CODE 0x4018 +#define VMCS_ENTRY_INSTRUCTION_LENGTH 0x401A +#define VMCS_TPR_THRESHOLD 0x401C +#define VMCS_PROCBASED2_CTLS 0x401E +#define VMCS_PLE_GAP 0x4020 +#define VMCS_PLE_WINDOW 0x4022 + +/* 32-bit RO data fields */ +#define VMCS_INSTRUCTION_ERROR 0x4400 +#define VMCS_EXIT_REASON 0x4402 +#define VMCS_EXIT_INTERRUPTION_INFO 0x4404 +#define VMCS_EXIT_INTERRUPTION_ERR_CODE 0x4406 +#define VMCS_IDT_VECTORING_INFO 0x4408 +#define VMCS_IDT_VECTORING_ERROR_CODE 0x440A +#define VMCS_INSTRUCTION_LENGTH 0x440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x440E + +/* 32-bit guest state fields */ +#define VMCS_GUEST_IA32_ES_LIMIT 0x4800 +#define VMCS_GUEST_IA32_CS_LIMIT 0x4802 +#define VMCS_GUEST_IA32_SS_LIMIT 0x4804 +#define VMCS_GUEST_IA32_DS_LIMIT 0x4806 +#define VMCS_GUEST_IA32_FS_LIMIT 0x4808 +#define VMCS_GUEST_IA32_GS_LIMIT 0x480A +#define VMCS_GUEST_IA32_LDTR_LIMIT 0x480C +#define VMCS_GUEST_IA32_TR_LIMIT 0x480E +#define VMCS_GUEST_IA32_GDTR_LIMIT 0x4810 +#define VMCS_GUEST_IA32_IDTR_LIMIT 0x4812 +#define 
VMCS_GUEST_IA32_ES_AR 0x4814 +#define VMCS_GUEST_IA32_CS_AR 0x4816 +#define VMCS_GUEST_IA32_SS_AR 0x4818 +#define VMCS_GUEST_IA32_DS_AR 0x481A +#define VMCS_GUEST_IA32_FS_AR 0x481C +#define VMCS_GUEST_IA32_GS_AR 0x481E +#define VMCS_GUEST_IA32_LDTR_AR 0x4820 +#define VMCS_GUEST_IA32_TR_AR 0x4822 +#define VMCS_GUEST_INTERRUPTIBILITY_ST 0x4824 +#define VMCS_GUEST_ACTIVITY_STATE 0x4826 +#define VMCS_GUEST_SMBASE 0x4828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x482A +#define VMCS_VMX_PREEMPTION_TIMER_VAL 0x482E + +/* 32-bit host state field */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x4C00 + +/* Natural-width control fields */ +#define VMCS_CR0_MASK 0x6000 +#define VMCS_CR4_MASK 0x6002 +#define VMCS_CR0_READ_SHADOW 0x6004 +#define VMCS_CR4_READ_SHADOW 0x6006 +#define VMCS_CR3_TARGET_0 0x6008 +#define VMCS_CR3_TARGET_1 0x600A +#define VMCS_CR3_TARGET_2 0x600C +#define VMCS_CR3_TARGET_3 0x600E + +/* Natural-width RO fields */ +#define VMCS_GUEST_EXIT_QUALIFICATION 0x6400 +#define VMCS_IO_RCX 0x6402 +#define VMCS_IO_RSI 0x6404 +#define VMCS_IO_RDI 0x6406 +#define VMCS_IO_RIP 0x6408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x640A + +/* Natural-width guest state fields */ +#define VMCS_GUEST_IA32_CR0 0x6800 +#define VMCS_GUEST_IA32_CR3 0x6802 +#define VMCS_GUEST_IA32_CR4 0x6804 +#define VMCS_GUEST_IA32_ES_BASE 0x6806 +#define VMCS_GUEST_IA32_CS_BASE 0x6808 +#define VMCS_GUEST_IA32_SS_BASE 0x680A +#define VMCS_GUEST_IA32_DS_BASE 0x680C +#define VMCS_GUEST_IA32_FS_BASE 0x680E +#define VMCS_GUEST_IA32_GS_BASE 0x6810 +#define VMCS_GUEST_IA32_LDTR_BASE 0x6812 +#define VMCS_GUEST_IA32_TR_BASE 0x6814 +#define VMCS_GUEST_IA32_GDTR_BASE 0x6816 +#define VMCS_GUEST_IA32_IDTR_BASE 0x6818 +#define VMCS_GUEST_IA32_DR7 0x681A +#define VMCS_GUEST_IA32_RSP 0x681C +#define VMCS_GUEST_IA32_RIP 0x681E +#define VMCS_GUEST_IA32_RFLAGS 0x6820 +#define VMCS_GUEST_PENDING_DBG_EXC 0x6822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x6824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x6826 + +/* Natural-width host state fields */ +#define VMCS_HOST_IA32_CR0 0x6C00 +#define VMCS_HOST_IA32_CR3 0x6C02 +#define VMCS_HOST_IA32_CR4 0x6C04 +#define VMCS_HOST_IA32_FS_BASE 0x6C06 +#define VMCS_HOST_IA32_GS_BASE 0x6C08 +#define VMCS_HOST_IA32_TR_BASE 0x6C0A +#define VMCS_HOST_IA32_GDTR_BASE 0x6C0C +#define VMCS_HOST_IA32_IDTR_BASE 0x6C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x6C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x6C12 +#define VMCS_HOST_IA32_RSP 0x6C14 +#define VMCS_HOST_IA32_RIP 0x6C16 + +#define IA32_VMX_INVVPID_INDIV_ADDR_CTX 0x0 +#define IA32_VMX_INVVPID_SINGLE_CTX 0x1 +#define IA32_VMX_INVVPID_ALL_CTX 0x2 +#define IA32_VMX_INVVPID_SINGLE_CTX_GLB 0x3 + +#define IA32_VMX_INVEPT_SINGLE_CTX 0x1 +#define IA32_VMX_INVEPT_GLOBAL_CTX 0x2 + +#define IA32_VMX_EPT_FAULT_READ (1ULL << 0) +#define IA32_VMX_EPT_FAULT_WRITE (1ULL << 1) +#define IA32_VMX_EPT_FAULT_EXEC (1ULL << 2) + +#define IA32_VMX_EPT_FAULT_WAS_READABLE (1ULL << 3) +#define IA32_VMX_EPT_FAULT_WAS_WRITABLE (1ULL << 4) +#define IA32_VMX_EPT_FAULT_WAS_EXECABLE (1ULL << 5) + +#define IA32_VMX_MSR_LIST_SIZE_MASK (7ULL << 25) +#define IA32_VMX_CR3_TGT_SIZE_MASK (0x1FFULL << 16) + +/* + * SVM + */ +#define MSR_AMD_VM_CR 0xc0010114 +#define CPUID_AMD_SVM_CAP 0x8000000A +#define AMD_SVMDIS 0x10 +#define AMD_SVM_NESTED_PAGING_CAP (1 << 0) + +/* + * PAT + */ +#define PATENTRY(n, type) ((uint64_t)type << ((n) * 8)) +#define PAT_UC 0x0UL +#define PAT_WC 0x1UL +#define PAT_WT 0x4UL +#define PAT_WP 0x5UL +#define PAT_WB 0x6UL +#define PAT_UCMINUS 0x7UL + diff --git a/sys/arch/i386/include/vmmvar.h 
b/sys/arch/i386/include/vmmvar.h
new file mode 100644
index 00000000000..4b8edf7756b
--- /dev/null
+++ b/sys/arch/i386/include/vmmvar.h
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * CPU capabilities for VMM operation
+ */
+#ifndef _MACHINE_VMMVAR_H_
+#define _MACHINE_VMMVAR_H_
+
+#define VMM_HV_SIGNATURE "OpenBSDVMM58"
+
+#define VMM_MAX_MEM_RANGES 16
+#define VMM_MAX_DISKS_PER_VM 2
+#define VMM_MAX_PATH_DISK 128
+#define VMM_MAX_NAME_LEN 32
+#define VMM_MAX_KERNEL_PATH 128
+#define VMM_MAX_VCPUS_PER_VM 64
+#define VMM_MAX_VM_MEM_SIZE 2048
+#define VMM_MAX_NICS_PER_VM 2
+
+#define VMM_PCI_MMIO_BAR_BASE 0xF0000000
+#define VMM_PCI_MMIO_BAR_END 0xF0FFFFFF
+#define VMM_PCI_MMIO_BAR_SIZE 0x00010000
+#define VMM_PCI_IO_BAR_BASE 0x1000
+#define VMM_PCI_IO_BAR_END 0xFFFF
+#define VMM_PCI_IO_BAR_SIZE 0x1000
+
+/* VMX: Basic Exit Reasons */
+#define VMX_EXIT_NMI 0
+#define VMX_EXIT_EXTINT 1
+#define VMX_EXIT_TRIPLE_FAULT 2
+#define VMX_EXIT_INIT 3
+#define VMX_EXIT_SIPI 4
+#define VMX_EXIT_IO_SMI 5
+#define VMX_EXIT_OTHER_SMI 6
+#define VMX_EXIT_INT_WINDOW 7
+#define VMX_EXIT_NMI_WINDOW 8
+#define VMX_EXIT_TASK_SWITCH 9
+#define VMX_EXIT_CPUID 10
+#define VMX_EXIT_GETSEC 11
+#define VMX_EXIT_HLT 12
+#define VMX_EXIT_INVD 13
+#define VMX_EXIT_INVLPG 14
+#define VMX_EXIT_RDPMC 15
+#define VMX_EXIT_RDTSC 16
+#define VMX_EXIT_RSM 17
+#define VMX_EXIT_VMCALL 18
+#define VMX_EXIT_VMCLEAR 19
+#define VMX_EXIT_VMLAUNCH 20
+#define VMX_EXIT_VMPTRLD 21
+#define VMX_EXIT_VMPTRST 22
+#define VMX_EXIT_VMREAD 23
+#define VMX_EXIT_VMRESUME 24
+#define VMX_EXIT_VMWRITE 25
+#define VMX_EXIT_VMXOFF 26
+#define VMX_EXIT_VMXON 27
+#define VMX_EXIT_CR_ACCESS 28
+#define VMX_EXIT_MOV_DR 29
+#define VMX_EXIT_IO 30
+#define VMX_EXIT_RDMSR 31
+#define VMX_EXIT_WRMSR 32
+#define VMX_EXIT_ENTRY_FAILED_GUEST_STATE 33
+#define VMX_EXIT_ENTRY_FAILED_MSR_LOAD 34
+#define VMX_EXIT_MWAIT 36
+#define VMX_EXIT_MTF 37
+#define VMX_EXIT_MONITOR 39
+#define VMX_EXIT_PAUSE 40
+#define VMX_EXIT_ENTRY_FAILED_MCE 41
+#define VMX_EXIT_TPR_BELOW_THRESHOLD 43
+#define VMX_EXIT_APIC_ACCESS 44
+#define VMX_EXIT_VIRTUALIZED_EOI 45
+#define VMX_EXIT_GDTR_IDTR 46
+#define VMX_EXIT_LDTR_TR 47
+#define VMX_EXIT_EPT_VIOLATION 48
+#define VMX_EXIT_EPT_MISCONFIGURATION 49
+#define VMX_EXIT_INVEPT 50
+#define VMX_EXIT_RDTSCP 51
+#define VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED 52
+#define VMX_EXIT_INVVPID 53
+#define VMX_EXIT_WBINVD 54
+#define VMX_EXIT_XSETBV 55
+#define VMX_EXIT_APIC_WRITE 56
+#define VMX_EXIT_RDRAND 57
+#define VMX_EXIT_INVPCID 58
+#define VMX_EXIT_VMFUNC 59
+#define VMX_EXIT_RDSEED 61
+#define VMX_EXIT_XSAVES 63
+#define VMX_EXIT_XRSTORS 64
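These codes are the basic exit reasons reported in the VMCS_EXIT_REASON field defined in specialreg.h above; per the Intel SDM the basic reason occupies the low 16 bits of that field, with entry-failure and other flags in the upper bits. A minimal sketch of how a VM-exit handler might retrieve and dispatch on them, assuming kernel context and the vmread() helper declared later in this header (the function name here is hypothetical, not the real handler in vmm.c):

    int
    vmx_dispatch_exit_sketch(void)
    {
    	uint32_t exit_reason;

    	/* Assumes vmread() returns non-zero on failure. */
    	if (vmread(VMCS_EXIT_REASON, &exit_reason))
    		return (-1);

    	switch (exit_reason & 0xffff) {	/* low 16 bits = basic reason */
    	case VMX_EXIT_CPUID:
    	case VMX_EXIT_HLT:
    	case VMX_EXIT_IO:
    	case VMX_EXIT_RDMSR:
    	case VMX_EXIT_WRMSR:
    	case VMX_EXIT_EPT_VIOLATION:
    		/* emulate in-kernel or hand off to userland */
    		return (0);
    	default:
    		return (-1);
    	}
    }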
+
+/*
+ * VMX: Misc defines
+ */
+#define VMX_MAX_CR3_TARGETS 256
+
+#define VM_EXIT_TERMINATED 0xFFFE
+#define VM_EXIT_NONE 0xFFFF
+
+/*
+ * VCPU state values. Note that there is a conversion function in vmm.c
+ * (vcpu_state_decode) that converts these to human readable strings,
+ * so this enum and vcpu_state_decode should be kept in sync.
+ */
+enum {
+	VCPU_STATE_STOPPED,
+	VCPU_STATE_RUNNING,
+	VCPU_STATE_REQTERM,
+	VCPU_STATE_TERMINATED,
+	VCPU_STATE_UNKNOWN,
+};
+
+enum {
+	VEI_DIR_OUT,
+	VEI_DIR_IN
+};
+
+/*
+ * vm exit data
+ * vm_exit_inout : describes an IN/OUT exit
+ */
+struct vm_exit_inout {
+	uint8_t vei_size; /* Size of access */
+	uint8_t vei_dir; /* Direction */
+	uint8_t vei_rep; /* REP prefix? */
+	uint8_t vei_string; /* string variety? */
+	uint8_t vei_encoding; /* operand encoding */
+	uint16_t vei_port; /* port */
+	uint32_t vei_data; /* data (for IN insns) */
+};
+
+union vm_exit {
+	struct vm_exit_inout vei; /* IN/OUT exit */
+};
+
+/*
+ * struct vcpu_segment_info describes a segment + selector set, used
+ * in constructing the initial vcpu register content
+ */
+struct vcpu_segment_info {
+	uint16_t vsi_sel;
+	uint32_t vsi_limit;
+	uint32_t vsi_ar;
+	uint32_t vsi_base;
+};
+
+#define VCPU_REGS_EAX 0
+#define VCPU_REGS_EBX 1
+#define VCPU_REGS_ECX 2
+#define VCPU_REGS_EDX 3
+#define VCPU_REGS_ESI 4
+#define VCPU_REGS_EDI 5
+#define VCPU_REGS_ESP 6
+#define VCPU_REGS_EBP 7
+#define VCPU_REGS_EIP 8
+#define VCPU_REGS_EFLAGS 9
+#define VCPU_REGS_NGPRS (VCPU_REGS_EFLAGS + 1)
+
+#define VCPU_REGS_CR0 0
+#define VCPU_REGS_CR2 1
+#define VCPU_REGS_CR3 2
+#define VCPU_REGS_CR4 3
+#define VCPU_REGS_CR8 4
+#define VCPU_REGS_NCRS (VCPU_REGS_CR8 + 1)
+
+#define VCPU_REGS_CS 0
+#define VCPU_REGS_DS 1
+#define VCPU_REGS_ES 2
+#define VCPU_REGS_FS 3
+#define VCPU_REGS_GS 4
+#define VCPU_REGS_SS 5
+#define VCPU_REGS_LDTR 6
+#define VCPU_REGS_TR 7
+#define VCPU_REGS_NSREGS (VCPU_REGS_TR + 1)
+
+struct vcpu_reg_state {
+	uint32_t vrs_gprs[VCPU_REGS_NGPRS];
+	uint32_t vrs_crs[VCPU_REGS_NCRS];
+	struct vcpu_segment_info vrs_sregs[VCPU_REGS_NSREGS];
+	struct vcpu_segment_info vrs_gdtr;
+	struct vcpu_segment_info vrs_idtr;
+};
+
+struct vm_mem_range {
+	paddr_t vmr_gpa;
+	vaddr_t vmr_va;
+	size_t vmr_size;
+};
+
+struct vm_create_params {
+	/* Input parameters to VMM_IOC_CREATE */
+	size_t vcp_nmemranges;
+	size_t vcp_ncpus;
+	size_t vcp_ndisks;
+	size_t vcp_nnics;
+	struct vm_mem_range vcp_memranges[VMM_MAX_MEM_RANGES];
+	char vcp_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_PATH_DISK];
+	char vcp_name[VMM_MAX_NAME_LEN];
+	char vcp_kernel[VMM_MAX_KERNEL_PATH];
+	uint8_t vcp_macs[VMM_MAX_NICS_PER_VM][6];
+
+	/* Output parameter from VMM_IOC_CREATE */
+	uint32_t vcp_id;
+};
+
+struct vm_run_params {
+	/* Input parameters to VMM_IOC_RUN */
+	uint32_t vrp_vm_id;
+	uint32_t vrp_vcpu_id;
+	uint8_t vrp_continue; /* Continuing from an exit */
+	uint16_t vrp_irq; /* IRQ to inject */
+
+	/* Input/output parameter to VMM_IOC_RUN */
+	union vm_exit *vrp_exit; /* updated exit data */
+
+	/* Output parameter from VMM_IOC_RUN */
+	uint16_t vrp_exit_reason; /* exit reason */
+	uint8_t vrp_irqready; /* ready for IRQ on entry */
+};
+
+struct vm_info_result {
+	/* Output parameters from VMM_IOC_INFO */
+	size_t vir_memory_size;
+	size_t vir_used_size;
+	size_t vir_ncpus;
+	uint8_t vir_vcpu_state[VMM_MAX_VCPUS_PER_VM];
+	pid_t vir_creator_pid;
+	uint32_t vir_id;
+	char vir_name[VMM_MAX_NAME_LEN];
+};
+
+struct vm_info_params {
+	/* Input parameters to VMM_IOC_INFO */
+	size_t vip_size; /* Output buffer size */
+
+	/* Output Parameters from VMM_IOC_INFO */
+	size_t vip_info_ct; /* # of entries returned */
+	struct vm_info_result *vip_info; /* Output buffer */
+};
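struct vm_mem_range ties a guest-physical range (vmr_gpa) to the userland virtual address (vmr_va) of the memory backing it, and vm_create_params bundles those ranges with the rest of the VM description that the VMM_IOC_CREATE ioctl consumes. An illustrative way a userland program might fill one in, assuming a single 32 MB range backed by an anonymous mapping; a real VM description is likely to need several ranges (for example, to leave the legacy BIOS/VGA area alone), and the helper name here is hypothetical:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <machine/vmmvar.h>
    #include <string.h>

    static int
    setup_create_params(struct vm_create_params *vcp)
    {
    	size_t sz = 32 * 1024 * 1024;
    	void *mem;

    	memset(vcp, 0, sizeof(*vcp));

    	/* Anonymous memory in this process backs guest physical 0..32MB. */
    	mem = mmap(NULL, sz, PROT_READ | PROT_WRITE,
    	    MAP_PRIVATE | MAP_ANON, -1, 0);
    	if (mem == MAP_FAILED)
    		return (-1);

    	vcp->vcp_ncpus = 1;
    	vcp->vcp_nmemranges = 1;
    	vcp->vcp_memranges[0].vmr_gpa = 0;		/* guest physical base */
    	vcp->vcp_memranges[0].vmr_va = (vaddr_t)mem;	/* host backing VA */
    	vcp->vcp_memranges[0].vmr_size = sz;
    	strlcpy(vcp->vcp_name, "example-vm", sizeof(vcp->vcp_name));

    	return (0);
    }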
+
+struct vm_terminate_params {
+	/* Input parameters to VMM_IOC_TERM */
+	uint32_t vtp_vm_id;
+};
+
+struct vm_resetcpu_params {
+	/* Input parameters to VMM_IOC_RESETCPU */
+	uint32_t vrp_vm_id;
+	uint32_t vrp_vcpu_id;
+	struct vcpu_reg_state vrp_init_state;
+};
+
+struct vm_intr_params {
+	/* Input parameters to VMM_IOC_INTR */
+	uint32_t vip_vm_id;
+	uint32_t vip_vcpu_id;
+	uint16_t vip_intr;
+};
+
+#define VM_RWREGS_GPRS 0x1 /* read/write GPRs */
+#define VM_RWREGS_SREGS 0x2 /* read/write segment registers */
+#define VM_RWREGS_CRS 0x4 /* read/write CRs */
+#define VM_RWREGS_ALL (VM_RWREGS_GPRS | VM_RWREGS_SREGS | VM_RWREGS_CRS)
+
+struct vm_rwregs_params {
+	uint32_t vrwp_vm_id;
+	uint32_t vrwp_vcpu_id;
+	uint64_t vrwp_mask;
+	struct vcpu_reg_state vrwp_regs;
+};
+
+/* IOCTL definitions */
+#define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
+#define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
+#define VMM_IOC_INFO _IOWR('V', 3, struct vm_info_params) /* Get VM Info */
+#define VMM_IOC_TERM _IOW('V', 4, struct vm_terminate_params) /* Terminate VM */
+#define VMM_IOC_RESETCPU _IOW('V', 5, struct vm_resetcpu_params) /* Reset */
+#define VMM_IOC_INTR _IOW('V', 6, struct vm_intr_params) /* Intr pending */
+#define VMM_IOC_READREGS _IOWR('V', 7, struct vm_rwregs_params) /* Get registers */
+#define VMM_IOC_WRITEREGS _IOW('V', 8, struct vm_rwregs_params) /* Set registers */
+
+#ifdef _KERNEL
+
+#define VMX_FAIL_LAUNCH_UNKNOWN 1
+#define VMX_FAIL_LAUNCH_INVALID_VMCS 2
+#define VMX_FAIL_LAUNCH_VALID_VMCS 3
+
+#define VMX_NUM_MSR_STORE 0
+// #define VMX_NUM_MSR_STORE 1
+
+/* MSR bitmap manipulation macros */
+#define MSRIDX(m) ((m) / 8)
+#define MSRBIT(m) (1 << (m) % 8)
+
+enum {
+	VMM_MODE_UNKNOWN,
+	VMM_MODE_VMX,
+	VMM_MODE_EPT,
+	VMM_MODE_SVM,
+	VMM_MODE_RVI
+};
+
+enum {
+	VMM_MEM_TYPE_REGULAR,
+	VMM_MEM_TYPE_UNKNOWN
+};
+
+/* Forward declarations */
+struct vm;
+
+/*
+ * Implementation-specific cpu state
+ */
+struct vmcb {
+};
+
+struct vmcs {
+	uint32_t vmcs_revision;
+};
+
+struct vmx_invvpid_descriptor
+{
+	uint64_t vid_vpid; // : 16;
+	uint64_t vid_addr;
+};
+
+struct vmx_invept_descriptor
+{
+	uint64_t vid_eptp;
+	uint64_t vid_reserved;
+};
+
+struct vmx_msr_store
+{
+	uint64_t vms_index : 32;
+	uint64_t vms_data;
+};
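MSRIDX() and MSRBIT() locate the byte and bit for a given MSR number inside the 4 KB VMX MSR bitmap page (vc_msr_bitmap_va in struct vcpu below). Per the SDM, the first 1024 bytes of that page are the read bitmap for the low MSR range 0x0-0x1fff, and a set bit forces a VM exit on RDMSR of that MSR. A sketch of clearing the read intercept for one such MSR; allow_msr_read() is a hypothetical helper, not a function defined in vmm.c:

    /*
     * Sketch: let the guest read a low MSR (0x0 - 0x1fff) without a VM
     * exit by clearing its bit in the read bitmap at the start of the
     * MSR bitmap page.
     */
    static void
    allow_msr_read(uint8_t *msr_bitmap, uint32_t msr)
    {
    	if (msr <= 0x1fff)
    		msr_bitmap[MSRIDX(msr)] &= ~(MSRBIT(msr));
    }

The write bitmaps live in the second half of the same page, so an analogous helper with an offset would cover WRMSR intercepts.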
+
+/*
+ * Storage for guest registers not preserved in VMCS and various exit
+ * information.
+ *
+ * Note that vmx_enter_guest depends on the layout of this struct for
+ * field access.
+ */
+struct vmx_gueststate
+{
+	/* %esi should be first */
+	uint32_t vg_esi; /* 0x00 */
+	uint32_t vg_eax; /* 0x04 */
+	uint32_t vg_ebx; /* 0x08 */
+	uint32_t vg_ecx; /* 0x0c */
+	uint32_t vg_edx; /* 0x10 */
+	uint32_t vg_edi; /* 0x14 */
+	uint32_t vg_ebp; /* 0x18 */
+	uint32_t vg_cr2; /* 0x1c */
+	uint32_t vg_eip; /* 0x20 */
+	uint32_t vg_exit_reason; /* 0x24 */
+	uint32_t vg_eflags; /* 0x28 */
+};
+
+/*
+ * Virtual Machine
+ */
+struct vm;
+
+/*
+ * Virtual CPU
+ */
+struct vcpu {
+	/* VMCS / VMCB pointer */
+	vaddr_t vc_control_va;
+	uint64_t vc_control_pa;
+
+	/* VLAPIC pointer */
+	vaddr_t vc_vlapic_va;
+	uint64_t vc_vlapic_pa;
+
+	/* MSR bitmap address */
+	vaddr_t vc_msr_bitmap_va;
+	uint64_t vc_msr_bitmap_pa;
+
+	struct vm *vc_parent;
+	uint32_t vc_id;
+	u_int vc_state;
+	SLIST_ENTRY(vcpu) vc_vcpu_link;
+	vaddr_t vc_hsa_stack_va;
+
+	uint8_t vc_virt_mode;
+
+	struct cpu_info *vc_last_pcpu;
+	union vm_exit vc_exit;
+
+	uint16_t vc_intr;
+	uint8_t vc_irqready;
+
+	/* VMX only */
+	uint64_t vc_vmx_basic;
+	uint64_t vc_vmx_entry_ctls;
+	uint64_t vc_vmx_true_entry_ctls;
+	uint64_t vc_vmx_exit_ctls;
+	uint64_t vc_vmx_true_exit_ctls;
+	uint64_t vc_vmx_pinbased_ctls;
+	uint64_t vc_vmx_true_pinbased_ctls;
+	uint64_t vc_vmx_procbased_ctls;
+	uint64_t vc_vmx_true_procbased_ctls;
+	uint64_t vc_vmx_procbased2_ctls;
+	struct vmx_gueststate vc_gueststate;
+	vaddr_t vc_vmx_msr_exit_save_va;
+	paddr_t vc_vmx_msr_exit_save_pa;
+	vaddr_t vc_vmx_msr_exit_load_va;
+	paddr_t vc_vmx_msr_exit_load_pa;
+	vaddr_t vc_vmx_msr_entry_load_va;
+	paddr_t vc_vmx_msr_entry_load_pa;
+};
+
+SLIST_HEAD(vcpu_head, vcpu);
+
+void vmm_dispatch_intr(vaddr_t);
+int vmxon(uint64_t *);
+int vmxoff(void);
+int vmclear(uint64_t *);
+int vmptrld(uint64_t *);
+int vmptrst(uint64_t *);
+int vmwrite(uint32_t, uint32_t);
+int vmread(uint32_t, uint32_t *);
+void invvpid(uint32_t, struct vmx_invvpid_descriptor *);
+void invept(uint32_t, struct vmx_invept_descriptor *);
+int vmx_enter_guest(uint64_t *, struct vmx_gueststate *, int, vaddr_t);
+void start_vmm_on_cpu(struct cpu_info *);
+void stop_vmm_on_cpu(struct cpu_info *);
+
+#endif /* _KERNEL */
+
+#endif /* ! _MACHINE_VMMVAR_H_ */
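Putting the userland-visible pieces together, the ioctls above are meant to be driven through the vmm(4) device node (presumably /dev/vmm). A compressed, hypothetical sketch of the lifecycle, create a VM, run vcpu 0, satisfy an IN instruction on an I/O exit, and resume, with error handling and real device emulation omitted; the use of 0xFFFF for "no IRQ pending" is an assumption, not something defined in this header:

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <machine/vmmvar.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int
    run_vm_sketch(struct vm_create_params *vcp)
    {
    	struct vm_run_params vrp;
    	union vm_exit exit;
    	int fd;

    	if ((fd = open("/dev/vmm", O_RDWR)) == -1)
    		return (-1);
    	if (ioctl(fd, VMM_IOC_CREATE, vcp) == -1)
    		return (-1);

    	memset(&vrp, 0, sizeof(vrp));
    	vrp.vrp_vm_id = vcp->vcp_id;	/* output of VMM_IOC_CREATE */
    	vrp.vrp_vcpu_id = 0;
    	vrp.vrp_exit = &exit;
    	vrp.vrp_irq = 0xFFFF;		/* no IRQ pending (assumption) */

    	for (;;) {
    		if (ioctl(fd, VMM_IOC_RUN, &vrp) == -1)
    			break;
    		if (vrp.vrp_exit_reason == VM_EXIT_TERMINATED)
    			break;
    		/* Supply data for an IN instruction before resuming. */
    		if (vrp.vrp_exit_reason == VMX_EXIT_IO &&
    		    exit.vei.vei_dir == VEI_DIR_IN)
    			exit.vei.vei_data = 0xffffffff;	/* dummy device */
    		vrp.vrp_continue = 1;	/* continuing from this exit */
    	}

    	close(fd);
    	return (0);
    }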