diff options
author | Mike Larkin <mlarkin@cvs.openbsd.org> | 2015-11-13 07:52:21 +0000 |
---|---|---|
committer | Mike Larkin <mlarkin@cvs.openbsd.org> | 2015-11-13 07:52:21 +0000 |
commit | 207e48e84cdbc60695417e94cc230d340d5f2028 (patch) | |
tree | f012cc1efae9cdaa15a35bd7e4310a53c4cbde57 /sys/arch/amd64 | |
parent | e61de7fde46443b42ab120089f15aa14219d1c30 (diff) |
vmm(4) kernel code
circulated on hackers@, no objections. Disabled by default.
Diffstat (limited to 'sys/arch/amd64')
-rw-r--r-- | sys/arch/amd64/amd64/cacheinfo.c | 10 | ||||
-rw-r--r-- | sys/arch/amd64/amd64/conf.c | 15 | ||||
-rw-r--r-- | sys/arch/amd64/amd64/cpu.c | 46 | ||||
-rw-r--r-- | sys/arch/amd64/amd64/identcpu.c | 150 | ||||
-rw-r--r-- | sys/arch/amd64/amd64/ipifuncs.c | 33 | ||||
-rw-r--r-- | sys/arch/amd64/amd64/mainbus.c | 8 | ||||
-rw-r--r-- | sys/arch/amd64/amd64/vmm.c | 3414 | ||||
-rw-r--r-- | sys/arch/amd64/amd64/vmm_support.S | 384 | ||||
-rw-r--r-- | sys/arch/amd64/conf/GENERIC | 4 | ||||
-rw-r--r-- | sys/arch/amd64/conf/Makefile.amd64 | 4 | ||||
-rw-r--r-- | sys/arch/amd64/conf/files.amd64 | 10 | ||||
-rw-r--r-- | sys/arch/amd64/include/cpu.h | 53 | ||||
-rw-r--r-- | sys/arch/amd64/include/intrdefs.h | 9 | ||||
-rw-r--r-- | sys/arch/amd64/include/pmap.h | 3 | ||||
-rw-r--r-- | sys/arch/amd64/include/specialreg.h | 231 | ||||
-rw-r--r-- | sys/arch/amd64/include/vmmvar.h | 387 |
16 files changed, 4713 insertions, 48 deletions
diff --git a/sys/arch/amd64/amd64/cacheinfo.c b/sys/arch/amd64/amd64/cacheinfo.c index 8926949af6d..eb319b909ec 100644 --- a/sys/arch/amd64/amd64/cacheinfo.c +++ b/sys/arch/amd64/amd64/cacheinfo.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cacheinfo.c,v 1.6 2012/03/16 01:53:00 haesbaert Exp $ */ +/* $OpenBSD: cacheinfo.c,v 1.7 2015/11/13 07:52:20 mlarkin Exp $ */ /*- * Copyright (c) 2000 The NetBSD Foundation, Inc. @@ -185,6 +185,10 @@ amd_cpu_cacheinfo(struct cpu_info *ci) } CPUID(0x80000005, descs[0], descs[1], descs[2], descs[3]); + ci->ci_amdcacheinfo[0] = descs[0]; + ci->ci_amdcacheinfo[1] = descs[1]; + ci->ci_amdcacheinfo[2] = descs[2]; + ci->ci_amdcacheinfo[3] = descs[3]; /* * K6-III and higher have large page TLBs. @@ -230,6 +234,10 @@ amd_cpu_cacheinfo(struct cpu_info *ci) } CPUID(0x80000006, descs[0], descs[1], descs[2], descs[3]); + ci->ci_extcacheinfo[0] = descs[0]; + ci->ci_extcacheinfo[1] = descs[1]; + ci->ci_extcacheinfo[2] = descs[2]; + ci->ci_extcacheinfo[3] = descs[3]; cai = &ci->ci_cinfo[CAI_L2CACHE]; cai->cai_totalsize = AMD_L2_ECX_C_SIZE(descs[2]); diff --git a/sys/arch/amd64/amd64/conf.c b/sys/arch/amd64/amd64/conf.c index 0bf95c223e4..0c9c20d898e 100644 --- a/sys/arch/amd64/amd64/conf.c +++ b/sys/arch/amd64/amd64/conf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: conf.c,v 1.51 2015/10/23 15:10:52 claudio Exp $ */ +/* $OpenBSD: conf.c,v 1.52 2015/11/13 07:52:20 mlarkin Exp $ */ /* * Copyright (c) 1994, 1995 Charles M. Hannum. All rights reserved. 
@@ -103,6 +103,15 @@ int nblkdev = nitems(bdevsw); (dev_type_stop((*))) enodev, 0, seltrue, \ (dev_type_mmap((*))) enodev, 0 } +/* open, close, ioctl */ +#define cdev_vmm_init(c,n) { \ + dev_init(c,n,open), dev_init(c,n,close), \ + (dev_type_read((*))) enodev, \ + (dev_type_write((*))) enodev, \ + dev_init(c,n,ioctl), \ + (dev_type_stop((*))) enodev, 0, seltrue, \ + (dev_type_mmap((*))) enodev } + #define mmread mmrw #define mmwrite mmrw @@ -154,6 +163,8 @@ cdev_decl(cztty); cdev_decl(nvram); #include "drm.h" cdev_decl(drm); +#include "vmm.h" +cdev_decl(vmm); #include "wsdisplay.h" #include "wskbd.h" @@ -184,7 +195,7 @@ struct cdevsw cdevsw[] = cdev_log_init(1,log), /* 7: /dev/klog */ cdev_tty_init(NCOM,com), /* 8: serial port */ cdev_disk_init(NFD,fd), /* 9: floppy disk */ - cdev_notdef(), /* 10 */ + cdev_vmm_init(NVMM,vmm), /* 10 vmm */ cdev_notdef(), /* 11: Sony CD-ROM */ cdev_wsdisplay_init(NWSDISPLAY, /* 12: frame buffers, etc. */ wsdisplay), diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c index 588090c0157..97013c18cc9 100644 --- a/sys/arch/amd64/amd64/cpu.c +++ b/sys/arch/amd64/amd64/cpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.c,v 1.88 2015/07/18 19:21:02 sf Exp $ */ +/* $OpenBSD: cpu.c,v 1.89 2015/11/13 07:52:20 mlarkin Exp $ */ /* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */ /*- @@ -66,6 +66,7 @@ #include "lapic.h" #include "ioapic.h" +#include "vmm.h" #include <sys/param.h> #include <sys/timeout.h> @@ -114,6 +115,9 @@ int cpu_match(struct device *, void *, void *); void cpu_attach(struct device *, struct device *, void *); int cpu_activate(struct device *, int); void patinit(struct cpu_info *ci); +#ifdef VMM +void cpu_init_vmm(struct cpu_info *ci); +#endif /* VMM */ struct cpu_softc { struct device sc_dev; /* device tree glue */ @@ -463,6 +467,9 @@ cpu_attach(struct device *parent, struct device *self, void *aux) sc->sc_dev.dv_xname, pcb, pcb->pcb_rsp); } #endif +#ifdef VMM + cpu_init_vmm(ci); +#endif /* VMM */ } /* @@ -485,12 
+492,12 @@ cpu_init(struct cpu_info *ci) lcr0(rcr0() | CR0_WP); cr4 = rcr4() | CR4_DEFAULT; - if (ci->ci_feature_sefflags & SEFF0EBX_SMEP) + if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMEP) cr4 |= CR4_SMEP; #ifndef SMALL_KERNEL - if (ci->ci_feature_sefflags & SEFF0EBX_SMAP) + if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP) cr4 |= CR4_SMAP; - if (ci->ci_feature_sefflags & SEFF0EBX_FSGSBASE) + if (ci->ci_feature_sefflags_ebx & SEFF0EBX_FSGSBASE) cr4 |= CR4_FSGSBASE; #endif if (cpu_ecxfeature & CPUIDECX_XSAVE) @@ -515,6 +522,30 @@ cpu_init(struct cpu_info *ci) #endif } +#ifdef VMM +/* + * cpu_init_vmm + * + * Initializes per-cpu VMM state + * + * Parameters: + * ci: the cpu for which state is being initialized + */ +void +cpu_init_vmm(struct cpu_info *ci) +{ + /* + * Allocate a per-cpu VMXON region for VMX CPUs + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE, + M_DEVBUF, M_WAITOK | M_ZERO); + if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region, + &ci->ci_vmxon_region_pa)) + panic("Can't locate VMXON region in phys mem\n"); + } +} +#endif /* VMM */ #ifdef MULTIPROCESSOR void @@ -813,13 +844,6 @@ patinit(struct cpu_info *ci) if ((ci->ci_feature_flags & CPUID_PAT) == 0) return; -#define PATENTRY(n, type) (type << ((n) * 8)) -#define PAT_UC 0x0UL -#define PAT_WC 0x1UL -#define PAT_WT 0x4UL -#define PAT_WP 0x5UL -#define PAT_WB 0x6UL -#define PAT_UCMINUS 0x7UL /* * Set up PAT bits. 
* The default pat table is the following: diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c index 352c3f39beb..6183e7743b0 100644 --- a/sys/arch/amd64/amd64/identcpu.c +++ b/sys/arch/amd64/amd64/identcpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: identcpu.c,v 1.65 2015/11/07 01:37:26 naddy Exp $ */ +/* $OpenBSD: identcpu.c,v 1.66 2015/11/13 07:52:20 mlarkin Exp $ */ /* $NetBSD: identcpu.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */ /* @@ -39,12 +39,18 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/sysctl.h> + +#include "vmm.h" + #include <machine/cpu.h> #include <machine/cpufunc.h> void replacesmap(void); u_int64_t cpu_tsc_freq(struct cpu_info *); u_int64_t cpu_tsc_freq_ctr(struct cpu_info *); +#ifdef VMM +void cpu_check_vmm_cap(struct cpu_info *); +#endif /* VMM */ /* sysctl wants this. */ char cpu_model[48]; @@ -167,6 +173,9 @@ const struct { { SEFF0EBX_RDSEED, "RDSEED" }, { SEFF0EBX_ADX, "ADX" }, { SEFF0EBX_SMAP, "SMAP" }, +}, cpu_seff0_ecxfeatures[] = { + { SEFF0ECX_PREFETCHWT1, "PREFETCHWT1" }, + { SEFF0ECX_PKU, "PKU" }, }, cpu_tpm_eaxfeatures[] = { { TPM_SENSOR, "SENSOR" }, { TPM_ARAT, "ARAT" }, @@ -406,32 +415,32 @@ cpu_tsc_freq(struct cpu_info *ci) void identifycpu(struct cpu_info *ci) { - u_int32_t dummy, val, pnfeatset; - u_int32_t brand[12]; + u_int32_t dummy, val; char mycpu_model[48]; int i; char *brandstr_from, *brandstr_to; int skipspace; CPUID(1, ci->ci_signature, val, dummy, ci->ci_feature_flags); - CPUID(0x80000000, pnfeatset, dummy, dummy, dummy); - if (pnfeatset >= 0x80000001) { - u_int32_t ecx; - - CPUID(0x80000001, dummy, dummy, - ecx, ci->ci_feature_eflags); + CPUID(0x80000000, ci->ci_pnfeatset, dummy, dummy, dummy); + if (ci->ci_pnfeatset >= 0x80000001) { + CPUID(0x80000001, ci->ci_efeature_eax, dummy, + ci->ci_efeature_ecx, ci->ci_feature_eflags); /* Other bits may clash */ ci->ci_feature_flags |= (ci->ci_feature_eflags & CPUID_NXE); if (ci->ci_flags & CPUF_PRIMARY) - ecpu_ecxfeature = ecx; + ecpu_ecxfeature = 
ci->ci_efeature_ecx; /* Let cpu_feature be the common bits */ cpu_feature &= ci->ci_feature_flags; } - CPUID(0x80000002, brand[0], brand[1], brand[2], brand[3]); - CPUID(0x80000003, brand[4], brand[5], brand[6], brand[7]); - CPUID(0x80000004, brand[8], brand[9], brand[10], brand[11]); - strlcpy(mycpu_model, (char *)brand, sizeof(mycpu_model)); + CPUID(0x80000002, ci->ci_brand[0], + ci->ci_brand[1], ci->ci_brand[2], ci->ci_brand[3]); + CPUID(0x80000003, ci->ci_brand[4], + ci->ci_brand[5], ci->ci_brand[6], ci->ci_brand[7]); + CPUID(0x80000004, ci->ci_brand[8], + ci->ci_brand[9], ci->ci_brand[10], ci->ci_brand[11]); + strlcpy(mycpu_model, (char *)ci->ci_brand, sizeof(mycpu_model)); /* Remove leading, trailing and duplicated spaces from mycpu_model */ brandstr_from = brandstr_to = mycpu_model; @@ -524,11 +533,16 @@ identifycpu(struct cpu_info *ci) if (cpuid_level >= 0x07) { /* "Structured Extended Feature Flags" */ - CPUID_LEAF(0x7, 0, dummy, ci->ci_feature_sefflags, dummy, dummy); + CPUID_LEAF(0x7, 0, dummy, ci->ci_feature_sefflags_ebx, + ci->ci_feature_sefflags_ecx, dummy); for (i = 0; i < nitems(cpu_seff0_ebxfeatures); i++) - if (ci->ci_feature_sefflags & + if (ci->ci_feature_sefflags_ebx & cpu_seff0_ebxfeatures[i].bit) printf(",%s", cpu_seff0_ebxfeatures[i].str); + for (i = 0; i < nitems(cpu_seff0_ecxfeatures); i++) + if (ci->ci_feature_sefflags_ecx & + cpu_seff0_ecxfeatures[i].bit) + printf(",%s", cpu_seff0_ecxfeatures[i].str); } if (!strcmp(cpu_vendor, "GenuineIntel") && cpuid_level >= 0x06 ) { @@ -546,10 +560,10 @@ identifycpu(struct cpu_info *ci) #ifndef SMALL_KERNEL if (ci->ci_flags & CPUF_PRIMARY) { if (!strcmp(cpu_vendor, "AuthenticAMD") && - pnfeatset >= 0x80000007) { - CPUID(0x80000007, dummy, dummy, dummy, pnfeatset); + ci->ci_pnfeatset >= 0x80000007) { + CPUID(0x80000007, dummy, dummy, dummy, val); - if (pnfeatset & 0x06) { + if (val & 0x06) { if ((ci->ci_signature & 0xF00) == 0xF00) setperf_setup = k8_powernow_init; } @@ -576,7 +590,7 @@ 
identifycpu(struct cpu_info *ci) has_hv_cpuid = 1; #endif - if (ci->ci_feature_sefflags & SEFF0EBX_SMAP) + if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP) replacesmap(); } if (!strncmp(mycpu_model, "Intel", 5)) { @@ -614,6 +628,9 @@ identifycpu(struct cpu_info *ci) } cpu_topology(ci); +#ifdef VMM + cpu_check_vmm_cap(ci); +#endif /* VMM */ } #ifndef SMALL_KERNEL @@ -736,3 +753,96 @@ no_topology: ci->ci_core_id = ci->ci_cpuid; ci->ci_pkg_id = 0; } + +#ifdef VMM +/* + * cpu_check_vmm_cap + * + * Checks for VMM capabilities for 'ci'. Initializes certain per-cpu VMM + * state in 'ci' if virtualization extensions are found. + * + * Parameters: + * ci: the cpu being checked + */ +void +cpu_check_vmm_cap(struct cpu_info *ci) +{ + uint64_t msr; + uint32_t cap, dummy; + + /* + * Check for workable VMX + */ + if (cpu_ecxfeature & CPUIDECX_VMX) { + msr = rdmsr(MSR_IA32_FEATURE_CONTROL); + + if (!(msr & IA32_FEATURE_CONTROL_LOCK)) + ci->ci_vmm_flags |= CI_VMM_VMX; + else { + if (msr & IA32_FEATURE_CONTROL_VMX_EN) + ci->ci_vmm_flags |= CI_VMM_VMX; + } + } + + /* + * Check for EPT (Intel Nested Paging) + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + /* Secondary controls available? */ + /* XXX should we check true procbased ctls here if avail? */ + msr = rdmsr(IA32_VMX_PROCBASED_CTLS); + if (msr & (IA32_VMX_ACTIVATE_SECONDARY_CONTROLS) << 32) { + msr = rdmsr(IA32_VMX_PROCBASED2_CTLS); + /* EPT available? 
*/ + if (msr & (IA32_VMX_ENABLE_EPT) << 32) + ci->ci_vmm_flags |= CI_VMM_EPT; + } + } + + /* + * Check startup config (VMX) + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + /* CR0 fixed and flexible bits */ + msr = rdmsr(IA32_VMX_CR0_FIXED0); + ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0 = msr; + msr = rdmsr(IA32_VMX_CR0_FIXED1); + ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1 = msr; + + /* CR4 fixed and flexible bits */ + msr = rdmsr(IA32_VMX_CR4_FIXED0); + ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0 = msr; + msr = rdmsr(IA32_VMX_CR4_FIXED1); + ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1 = msr; + + /* VMXON region revision ID (bits 30:0 of IA32_VMX_BASIC) */ + msr = rdmsr(IA32_VMX_BASIC); + ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision = + (uint32_t)(msr & 0x7FFFFFFF); + + /* MSR save / load table size */ + msr = rdmsr(IA32_VMX_MISC); + ci->ci_vmm_cap.vcc_vmx.vmx_msr_table_size = + (uint32_t)(msr & IA32_VMX_MSR_LIST_SIZE_MASK) >> 25; + } + + /* + * Check for workable SVM + */ + if (ecpu_ecxfeature & CPUIDECX_SVM) { + msr = rdmsr(MSR_AMD_VM_CR); + + if (!(msr & AMD_SVMDIS)) + ci->ci_vmm_flags |= CI_VMM_SVM; + } + + /* + * Check for SVM Nested Paging + */ + if (ci->ci_vmm_flags & CI_VMM_SVM) { + CPUID(CPUID_AMD_SVM_CAP, dummy, dummy, dummy, cap); + if (cap & AMD_SVM_NESTED_PAGING_CAP) + ci->ci_vmm_flags |= CI_VMM_RVI; + } +} +#endif /* VMM */ diff --git a/sys/arch/amd64/amd64/ipifuncs.c b/sys/arch/amd64/amd64/ipifuncs.c index 0279d446ea5..03f4fa827b6 100644 --- a/sys/arch/amd64/amd64/ipifuncs.c +++ b/sys/arch/amd64/amd64/ipifuncs.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ipifuncs.c,v 1.26 2015/03/14 03:38:46 jsg Exp $ */ +/* $OpenBSD: ipifuncs.c,v 1.27 2015/11/13 07:52:20 mlarkin Exp $ */ /* $NetBSD: ipifuncs.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */ /*- @@ -54,12 +54,22 @@ #include <machine/db_machdep.h> +#include "vmm.h" +#ifdef VMM +#include <machine/vmmvar.h> +#endif /* VMM */ + void x86_64_ipi_nop(struct cpu_info *); void x86_64_ipi_halt(struct cpu_info *); void x86_64_ipi_synch_fpu(struct cpu_info *); 
void x86_64_ipi_flush_fpu(struct cpu_info *); +#ifdef VMM +void x86_64_ipi_start_vmm(struct cpu_info *); +void x86_64_ipi_stop_vmm(struct cpu_info *); +#endif /* VMM */ + #ifdef HIBERNATE void x86_64_ipi_halt_realmode(struct cpu_info *); extern void hibernate_drop_to_real_mode(void); @@ -85,6 +95,13 @@ void (*ipifunc[X86_NIPI])(struct cpu_info *) = #else NULL, #endif +#ifdef VMM + x86_64_ipi_start_vmm, + x86_64_ipi_stop_vmm, +#else + NULL, + NULL, +#endif /* VMM */ }; void @@ -132,3 +149,17 @@ x86_64_ipi_reload_mtrr(struct cpu_info *ci) mem_range_softc.mr_op->reload(&mem_range_softc); } #endif + +#ifdef VMM +void +x86_64_ipi_start_vmm(struct cpu_info *ci) +{ + start_vmm_on_cpu(ci); +} + +void +x86_64_ipi_stop_vmm(struct cpu_info *ci) +{ + stop_vmm_on_cpu(ci); +} +#endif /* VMM */ diff --git a/sys/arch/amd64/amd64/mainbus.c b/sys/arch/amd64/amd64/mainbus.c index b236fed7ade..8baf89862fa 100644 --- a/sys/arch/amd64/amd64/mainbus.c +++ b/sys/arch/amd64/amd64/mainbus.c @@ -1,4 +1,4 @@ -/* $OpenBSD: mainbus.c,v 1.33 2015/08/31 19:56:32 kettenis Exp $ */ +/* $OpenBSD: mainbus.c,v 1.34 2015/11/13 07:52:20 mlarkin Exp $ */ /* $NetBSD: mainbus.c,v 1.1 2003/04/26 18:39:29 fvdl Exp $ */ /* @@ -48,6 +48,7 @@ #include "ipmi.h" #include "bios.h" #include "mpbios.h" +#include "vmm.h" #include "pvbus.h" #include "efifb.h" @@ -239,6 +240,11 @@ mainbus_attach(struct device *parent, struct device *self, void *aux) config_found(self, &mba_iba, mainbus_print); #endif +#ifdef VMM + mba.mba_busname = "vmm"; + config_found(self, &mba.mba_busname, mainbus_print); +#endif /* VMM */ + #if NEFIFB > 0 if (bios_efiinfo != NULL) { mba.mba_eaa.eaa_name = "efifb"; diff --git a/sys/arch/amd64/amd64/vmm.c b/sys/arch/amd64/amd64/vmm.c new file mode 100644 index 00000000000..df72910cfdb --- /dev/null +++ b/sys/arch/amd64/amd64/vmm.c @@ -0,0 +1,3414 @@ +/* + * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose 
with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/device.h> +#include <sys/pool.h> +#include <sys/proc.h> +#include <sys/ioctl.h> +#include <sys/queue.h> +#include <sys/rwlock.h> +#include <uvm/uvm.h> +#include <machine/pmap.h> +#include <machine/biosvar.h> +#include <machine/segments.h> +#include <machine/cpufunc.h> +#include <machine/vmmvar.h> +#include <machine/i82489reg.h> +#include <dev/isa/isareg.h> + +#define DEVNAME(s) ((s)->sc_dev.dv_xname) + +#define CTRL_DUMP(x,y,z) printf(" %s: Can set:%s Can clear:%s\n", #z , \ + vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \ + IA32_VMX_##z, 1) ? "Yes" : "No", \ + vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \ + IA32_VMX_##z, 0) ? 
"Yes" : "No"); + +SLIST_HEAD(vmlist_head, vm); + +struct vmm_softc { + struct device sc_dev; + + /* Capabilities */ + uint32_t nr_vmx_cpus; + uint32_t nr_svm_cpus; + uint32_t nr_rvi_cpus; + uint32_t nr_ept_cpus; + + /* Managed VMs */ + struct vmlist_head vm_list; + + int mode; + + struct rwlock vm_lock; + size_t vm_ct; + size_t vm_idx; +}; + +int vmm_probe(struct device *, void *, void *); +void vmm_attach(struct device *, struct device *, void *); +int vmm_activate(struct device *, int); +int vmmopen(dev_t, int, int, struct proc *); +int vmmioctl(dev_t, u_long, caddr_t, int, struct proc *); +int vmmclose(dev_t, int, int, struct proc *); +int vmm_start(void); +int vmm_stop(void); +int vm_create(struct vm_create_params *, struct proc *); +int vm_run(struct vm_run_params *); +int vm_terminate(struct vm_terminate_params *); +int vm_get_info(struct vm_info_params *); +int vm_writepage(struct vm_writepage_params *); +int vm_readpage(struct vm_readpage_params *); +int vcpu_init(struct vcpu *); +int vcpu_init_vmx(struct vcpu *); +int vcpu_init_svm(struct vcpu *); +int vcpu_run_vmx(struct vcpu *, uint8_t, int16_t *); +int vcpu_run_svm(struct vcpu *, uint8_t); +void vcpu_deinit(struct vcpu *); +void vcpu_deinit_vmx(struct vcpu *); +void vcpu_deinit_svm(struct vcpu *); +int vm_impl_init(struct vm *); +int vm_impl_init_vmx(struct vm *); +int vm_impl_init_svm(struct vm *); +void vm_impl_deinit(struct vm *); +void vm_impl_deinit_vmx(struct vm *); +void vm_impl_deinit_svm(struct vm *); +void vm_teardown(struct vm *); +int vcpu_vmx_check_cap(struct vcpu *, uint32_t, uint32_t, int); +int vcpu_vmx_compute_ctrl(struct vcpu *, uint64_t, uint16_t, uint32_t, + uint32_t, uint32_t *); +int vmx_handle_exit(struct vcpu *, int *); +int vmx_handle_cpuid(struct vcpu *); +int vmx_handle_cr(struct vcpu *); +int vmx_handle_inout(struct vcpu *); +int vmx_handle_hlt(struct vcpu *); +void vmx_handle_intr(struct vcpu *); +void vmx_handle_intwin(struct vcpu *); +int vmm_get_guest_memtype(struct vm *, 
paddr_t); +int vmm_get_guest_faulttype(void); +int vmx_get_guest_faulttype(void); +int svm_get_guest_faulttype(void); +int vmx_get_exit_qualification(uint64_t *); +int vmx_fault_page(struct vcpu *, paddr_t); +int vmx_handle_np_fault(struct vcpu *); +int vmx_fix_ept_pte(struct pmap *, vaddr_t); +const char *vmx_exit_reason_decode(uint32_t); +const char *vmx_instruction_error_decode(uint32_t); +void dump_vcpu(struct vcpu *); + +const char *vmm_hv_signature = VMM_HV_SIGNATURE; + +struct cfdriver vmm_cd = { + NULL, "vmm", DV_DULL +}; + +struct cfattach vmm_ca = { + sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, + vmm_activate +}; + +/* Pools for VMs and VCPUs */ +struct pool vm_pool; +struct pool vcpu_pool; + +struct vmm_softc *vmm_softc; + +/* IDT information used when populating host state area */ +extern vaddr_t idt_vaddr; +extern struct gate_descriptor *idt; + +/* XXX Temporary hack for the PIT clock */ +#define CLOCK_BIAS 8192 +uint64_t vmmclk = 0; + +/* Constants used in "CR access exit" */ +#define CR_WRITE 0 +#define CR_READ 1 +#define CR_CLTS 2 +#define CR_LMSW 3 + +/* + * vmm_probe + * + * Checks if we have at least one CPU with either VMX or SVM. + * Returns 1 if we have at least one of either type, but not both, 0 otherwise. 
+ */ +int +vmm_probe(struct device *parent, void *match, void *aux) +{ + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + const char **busname = (const char **)aux; + boolean_t found_vmx, found_svm; + + /* Check if this probe is for us */ + if (strcmp(*busname, vmm_cd.cd_name) != 0) + return (0); + + found_vmx = FALSE; + found_svm = FALSE; + + /* Check if we have at least one CPU with either VMX or SVM */ + CPU_INFO_FOREACH(cii, ci) { + if (ci->ci_vmm_flags & CI_VMM_VMX) + found_vmx = TRUE; + if (ci->ci_vmm_flags & CI_VMM_SVM) + found_svm = TRUE; + } + + /* Don't support both SVM and VMX at the same time */ + if (found_vmx && found_svm) + return (0); + + return (found_vmx || found_svm); +} + +/* + * vmm_attach + * + * Calculates how many of each type of CPU we have, prints this into dmesg + * during attach. Initializes various locks, pools, and list structures for the + * VMM. + */ +void +vmm_attach(struct device *parent, struct device *self, void *aux) +{ + struct vmm_softc *sc; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + + sc = (struct vmm_softc *)self; + sc->nr_vmx_cpus = 0; + sc->nr_svm_cpus = 0; + sc->nr_rvi_cpus = 0; + sc->nr_ept_cpus = 0; + sc->vm_ct = 0; + sc->vm_idx = 0; + + /* Calculate CPU features */ + CPU_INFO_FOREACH(cii, ci) { + if (ci->ci_vmm_flags & CI_VMM_VMX) + sc->nr_vmx_cpus++; + if (ci->ci_vmm_flags & CI_VMM_SVM) + sc->nr_svm_cpus++; + if (ci->ci_vmm_flags & CI_VMM_RVI) + sc->nr_rvi_cpus++; + if (ci->ci_vmm_flags & CI_VMM_EPT) + sc->nr_ept_cpus++; + } + + SLIST_INIT(&sc->vm_list); + rw_init(&sc->vm_lock, "vmlistlock"); + + printf(": initialized\n"); + + if (sc->nr_vmx_cpus) + printf("%s: %u VMX capable CPU(s), %u are EPT capable\n", + DEVNAME(sc), sc->nr_vmx_cpus, sc->nr_ept_cpus); + if (sc->nr_svm_cpus) + printf("%s: %u SVM capable CPU(s), %u are RVI capable\n", + DEVNAME(sc), sc->nr_svm_cpus, sc->nr_rvi_cpus); + + pool_init(&vm_pool, sizeof(struct vm), 0, 0, PR_WAITOK, "vmpool", + NULL); + pool_init(&vcpu_pool, sizeof(struct vcpu), 0, 
0, PR_WAITOK, "vcpupl", + NULL); + + /* + * Pick the operating mode from the capabilities counted above; stay + * in VMM_MODE_UNKNOWN (so vmmioctl refuses requests) if no CPU + * reported VMX or SVM. + */ + sc->mode = VMM_MODE_UNKNOWN; + if (sc->nr_ept_cpus > 0) + sc->mode = VMM_MODE_EPT; + else if (sc->nr_vmx_cpus > 0) + sc->mode = VMM_MODE_VMX; + else if (sc->nr_rvi_cpus > 0) + sc->mode = VMM_MODE_RVI; + else if (sc->nr_svm_cpus > 0) + sc->mode = VMM_MODE_SVM; + + vmm_softc = sc; +} + +/* + * vmm_activate + * + * Autoconf routine used during activate/deactivate. + * + * XXX need this for suspend/resume + */ +int +vmm_activate(struct device *self, int act) +{ + return 0; +} + +/* + * vmmopen + * + * Called during open of /dev/vmm. Fails with ENODEV if vmm(4) never + * attached, since vmm_softc is only set at the end of vmm_attach. + */ +int +vmmopen(dev_t dev, int flag, int mode, struct proc *p) +{ + /* Don't allow open if we didn't attach */ + if (vmm_softc == NULL) + return (ENODEV); + + return 0; +} + +/* + * vmmioctl + * + * Main ioctl dispatch routine for /dev/vmm. Parses ioctl type and calls + * appropriate lower level handler routine. Returns result to ioctl caller. + * Returns ENOTTY for unknown commands, or when vmm(4) did not attach or + * found no supported CPUs. + */ +int +vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + int ret; + + /* Don't allow ioctls if we didn't attach or have no supported CPUs */ + if (vmm_softc == NULL || + vmm_softc->mode == VMM_MODE_UNKNOWN) + return (ENOTTY); + + switch(cmd) { + case VMM_IOC_START: + ret = vmm_start(); + break; + case VMM_IOC_STOP: + ret = vmm_stop(); + break; + case VMM_IOC_CREATE: + ret = vm_create((struct vm_create_params *)data, p); + break; + case VMM_IOC_RUN: + ret = vm_run((struct vm_run_params *)data); + break; + case VMM_IOC_INFO: + ret = vm_get_info((struct vm_info_params *)data); + break; + case VMM_IOC_TERM: + ret = vm_terminate((struct vm_terminate_params *)data); + break; + case VMM_IOC_WRITEPAGE: + ret = vm_writepage((struct vm_writepage_params *)data); + break; + case VMM_IOC_READPAGE: + ret = vm_readpage((struct vm_readpage_params *)data); + break; + default: + ret = ENOTTY; + } + + return (ret); +} + +/* + * vmmclose + * + * Called when /dev/vmm is closed. Presently unused. 
+ */ +int +vmmclose(dev_t dev, int flag, int mode, struct proc *p) +{ + return 0; +} + +/* + * vm_readpage + * + * Reads a region (PAGE_SIZE max) of guest physical memory using the parameters + * defined in 'vrp'. + * + * Returns 0 if successful, or various error codes on failure: + * ENOENT if the VM id contained in 'vrp' refers to an unknown VM + * EINVAL if the memory region described by vrp is not regular memory + * EFAULT if the memory region described by vrp has not yet been faulted in + * by the guest + */ +int +vm_readpage(struct vm_readpage_params *vrp) +{ + struct vm *vm; + paddr_t host_pa; + void *kva; + int found; + vaddr_t vr_page; + + /* Find the desired VM */ + rw_enter_read(&vmm_softc->vm_lock); + found = 0; + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vrp->vrp_vm_id) { + found = 1; + break; + } + } + + /* Not found? exit. */ + if (!found) { + rw_exit_read(&vmm_softc->vm_lock); + return (ENOENT); + } + + /* Calculate page containing vrp->vrp_paddr */ + vr_page = vrp->vrp_paddr & ~PAGE_MASK; + + /* If not regular memory, exit. 
*/ + if (vmm_get_guest_memtype(vm, vr_page) != + VMM_MEM_TYPE_REGULAR) { + rw_exit_read(&vmm_softc->vm_lock); + return (EINVAL); + } + + /* + * vrp_len is caller-supplied and only one page is mapped below; + * reject requests that would run past the end of that page. + */ + if (vrp->vrp_len > + PAGE_SIZE - ((vaddr_t)vrp->vrp_paddr & PAGE_MASK)) { + rw_exit_read(&vmm_softc->vm_lock); + return (EINVAL); + } + + /* Find the phys page where this guest page exists in real memory */ + if (!pmap_extract(vm->vm_map->pmap, vr_page, &host_pa)) { + /* Drop the VM list lock on this error path too */ + rw_exit_read(&vmm_softc->vm_lock); + return (EFAULT); + } + + /* Allocate temporary KVA for the guest page */ + kva = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait); + if (!kva) { + dprintf("vm_readpage: can't alloc kva\n"); + rw_exit_read(&vmm_softc->vm_lock); + return (EFAULT); + } + + /* Enter the mapping in the kernel pmap and copyout */ + pmap_kenter_pa((vaddr_t)kva, host_pa, PROT_READ); + + if (copyout(kva + ((vaddr_t)vrp->vrp_paddr & PAGE_MASK), + vrp->vrp_data, vrp->vrp_len) == EFAULT) { + dprintf("vm_readpage: can't copyout\n"); + pmap_kremove((vaddr_t)kva, PAGE_SIZE); + km_free(kva, PAGE_SIZE, &kv_any, &kp_none); + rw_exit_read(&vmm_softc->vm_lock); + return (EFAULT); + } + + /* Cleanup and exit */ + pmap_kremove((vaddr_t)kva, PAGE_SIZE); + km_free(kva, PAGE_SIZE, &kv_any, &kp_none); + + rw_exit_read(&vmm_softc->vm_lock); + + return (0); +} + +/* + * vm_writepage + * + * Writes a region (PAGE_SIZE max) of guest physical memory using the parameters + * defined in 'vwp'. + * + * Returns 0 if successful, or various error codes on failure: + * ENOENT if the VM id contained in 'vwp' refers to an unknown VM + * EINVAL if the memory region described by vwp is not regular memory + * EFAULT if the source data in vwp contains an invalid address + * ENOMEM if a memory allocation error occurs + */ +int +vm_writepage(struct vm_writepage_params *vwp) +{ + char *pagedata; + struct vm *vm; + paddr_t host_pa; + void *kva; + int found, ret; + vaddr_t vw_page, dst; + + /* Find the desired VM */ + rw_enter_read(&vmm_softc->vm_lock); + found = 0; + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vwp->vwp_vm_id) { + found = 1; + break; + } + } + + /* Not found? exit. 
*/ + if (!found) { + rw_exit_read(&vmm_softc->vm_lock); + return (ENOENT); + } + + /* Calculate page containing vwp->vwp_paddr */ + vw_page = vwp->vwp_paddr & ~PAGE_MASK; + + /* If not regular memory, exit. */ + if (vmm_get_guest_memtype(vm, vw_page) != + VMM_MEM_TYPE_REGULAR) { + rw_exit_read(&vmm_softc->vm_lock); + return (EINVAL); + } + + /* + * vwp_len is caller-supplied. The staging buffer is PAGE_SIZE bytes + * and only one guest page is mapped below, so reject (EINVAL) any + * request that would run past the end of that page. + */ + if (vwp->vwp_len > + PAGE_SIZE - ((vaddr_t)vwp->vwp_paddr & PAGE_MASK)) { + rw_exit_read(&vmm_softc->vm_lock); + return (EINVAL); + } + + /* Allocate temporary region to copyin into */ + pagedata = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); + + if (!pagedata) { + rw_exit_read(&vmm_softc->vm_lock); + return (ENOMEM); + } + + /* Copy supplied data to kernel */ + if (copyin(vwp->vwp_data, pagedata, vwp->vwp_len) == EFAULT) { + free(pagedata, M_DEVBUF, PAGE_SIZE); + rw_exit_read(&vmm_softc->vm_lock); + return (EFAULT); + } + + /* Find the phys page where this guest page exists in real memory */ + if (!pmap_extract(vm->vm_map->pmap, vw_page, &host_pa)) { + /* page not present */ + ret = uvm_fault(vm->vm_map, vw_page, + PROT_WRITE, PROT_READ | PROT_WRITE | PROT_EXEC); + if (ret) { + free(pagedata, M_DEVBUF, PAGE_SIZE); + rw_exit_read(&vmm_softc->vm_lock); + return (EFAULT); + } + + if (!pmap_extract(vm->vm_map->pmap, vw_page, &host_pa)) { + panic("vm_writepage: still not mapped GPA 0x%llx\n", + (uint64_t)vwp->vwp_paddr); + } + } + + /* Allocate kva for guest page */ + kva = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait); + if (!kva) { + dprintf("vm_writepage: can't alloc kva\n"); + free(pagedata, M_DEVBUF, PAGE_SIZE); + rw_exit_read(&vmm_softc->vm_lock); + return (EFAULT); + } + + /* Enter mapping and copy data */ + pmap_kenter_pa((vaddr_t)kva, host_pa, PROT_READ | PROT_WRITE); + dst = (vaddr_t)kva + ((vaddr_t)vwp->vwp_paddr & PAGE_MASK); + memcpy((void *)dst, pagedata, vwp->vwp_len); + + /* Cleanup */ + pmap_kremove((vaddr_t)kva, PAGE_SIZE); + km_free(kva, PAGE_SIZE, &kv_any, &kp_none); + + free(pagedata, M_DEVBUF, PAGE_SIZE); + + /* Fixup the EPT map for this page */ + if (vmx_fix_ept_pte(vm->vm_map->pmap, vw_page)) { + dprintf("vm_writepage: cant fixup ept 
pte for gpa 0x%llx\n", + (uint64_t)vwp->vwp_paddr); + rw_exit_read(&vmm_softc->vm_lock); + return (EFAULT); + } + rw_exit_read(&vmm_softc->vm_lock); + + return (0); +} + +/* + * vmm_start + * + * Starts VMM mode on the system + */ +int +vmm_start(void) +{ + struct cpu_info *self; + int ret = 0; + +#ifdef MULTIPROCESSOR + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + int i; +#endif /* MULTIPROCESSOR */ + + self = curcpu(); +#ifdef MULTIPROCESSOR + /* Broadcast start VMM IPI */ + x86_broadcast_ipi(X86_IPI_START_VMM); + + CPU_INFO_FOREACH(cii, ci) { + if (ci == self) + continue; + for (i = 100000; (!(ci->ci_flags & CPUF_VMM)) && i>0;i--) + delay(10); + if (!(ci->ci_flags & CPUF_VMM)) { + printf("%s: failed to enter VMM mode\n", + ci->ci_dev->dv_xname); + ret = EIO; + } else + printf("%s: entered VMM mode\n", ci->ci_dev->dv_xname); + } +#endif /* MULTIPROCESSOR */ + + /* Start VMM on this CPU */ + start_vmm_on_cpu(self); + if (!(self->ci_flags & CPUF_VMM)) { + printf("%s: failed to enter VMM mode\n", + self->ci_dev->dv_xname); + ret = EIO; + } else + printf("%s: entered VMM mode\n", self->ci_dev->dv_xname); + + return (ret); +} + +/* + * vmm_stop + * + * Stops VMM mode on the system + * + * XXX should restrict this function to not stop VMM mode while VMs are running + */ +int +vmm_stop(void) +{ + struct cpu_info *self; + int ret = 0; + +#ifdef MULTIPROCESSOR + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + int i; +#endif /* MULTIPROCESSOR */ + + self = curcpu(); +#ifdef MULTIPROCESSOR + /* Stop VMM on other CPUs */ + x86_broadcast_ipi(X86_IPI_STOP_VMM); + + CPU_INFO_FOREACH(cii, ci) { + if (ci == self) + continue; + for (i = 100000; (ci->ci_flags & CPUF_VMM) && i>0 ;i--) + delay(10); + if (ci->ci_flags & CPUF_VMM) { + printf("%s: failed to exit VMM mode\n", + ci->ci_dev->dv_xname); + ret = EIO; + } else + printf("%s: exited VMM mode\n", ci->ci_dev->dv_xname); + } +#endif /* MULTIPROCESSOR */ + + /* Stop VMM on this CPU */ + stop_vmm_on_cpu(self); + if 
(self->ci_flags & CPUF_VMM) { + printf("%s: failed to exit VMM mode\n", + self->ci_dev->dv_xname); + ret = EIO; + } else + printf("%s: exited VMM mode\n", self->ci_dev->dv_xname); + + return (ret); +} + +/* + * start_vmm_on_cpu + * + * Starts VMM mode on 'ci' by executing the appropriate CPU-specific insn + * sequence to enter VMM mode (eg, VMXON) + * + * On success CPUF_VMM is set in ci->ci_flags. If VMX is locked off in + * IA32_FEATURE_CONTROL the function returns with CPUF_VMM still clear. + */ +void +start_vmm_on_cpu(struct cpu_info *ci) +{ + uint64_t msr; + uint32_t cr4; + + /* Already in VMM mode? exit. */ + if (ci->ci_flags & CPUF_VMM) + return; + + /* + * AMD SVM + */ + if (ci->ci_vmm_flags & CI_VMM_SVM) { + msr = rdmsr(MSR_EFER); + msr |= EFER_SVME; + wrmsr(MSR_EFER, msr); + } + + /* + * Intel VMX + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + if (ci->ci_vmxon_region == 0) + panic("NULL vmxon region specified\n"); + else { + bzero(ci->ci_vmxon_region, PAGE_SIZE); + ci->ci_vmxon_region->vr_revision = + ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision; + + /* Set CR4.VMXE */ + cr4 = rcr4(); + cr4 |= CR4_VMXE; + lcr4(cr4); + + /* Enable VMX */ + msr = rdmsr(MSR_IA32_FEATURE_CONTROL); + if (msr & IA32_FEATURE_CONTROL_LOCK) { + if (!(msr & IA32_FEATURE_CONTROL_VMX_EN)) { + /* VMX locked off; undo CR4.VMXE */ + cr4 &= ~CR4_VMXE; + lcr4(cr4); + return; + } + } else { + msr |= IA32_FEATURE_CONTROL_VMX_EN | + IA32_FEATURE_CONTROL_LOCK; + wrmsr(MSR_IA32_FEATURE_CONTROL, msr); + } + + /* Enter VMX mode */ + if (vmxon((uint64_t *)&ci->ci_vmxon_region_pa)) + panic("VMXON failed\n"); + } + } + + ci->ci_flags |= CPUF_VMM; +} + +/* + * stop_vmm_on_cpu + * + * Stops VMM mode on 'ci' by executing the appropriate CPU-specific insn + * sequence to exit VMM mode (eg, VMXOFF) + */ +void +stop_vmm_on_cpu(struct cpu_info *ci) +{ + uint64_t msr; + uint32_t cr4; + + if (!(ci->ci_flags & CPUF_VMM)) + return; + + /* + * AMD SVM + */ + if (ci->ci_vmm_flags & CI_VMM_SVM) { + msr = rdmsr(MSR_EFER); + msr &= ~EFER_SVME; + wrmsr(MSR_EFER, msr); + } + + /* + * Intel VMX + */ + if (ci->ci_vmm_flags & CI_VMM_VMX) { + if (vmxoff()) + panic("VMXOFF failed\n"); + + cr4 = rcr4(); + cr4 &= ~CR4_VMXE; + lcr4(cr4); + } + + 
ci->ci_flags &= ~CPUF_VMM; +} + +/* + * vm_create + * + * Creates the in-memory VMM structures for the VM defined by 'vcp'. The + * parent of this VM shall be the process defined by 'p'. + * This function does not start the VCPU(s) - see vm_start. + * + * Return Values: + * 0: the create operation was successful + * ENOMEM: out of memory + * various other errors from vcpu_init/vm_impl_init + */ +int +vm_create(struct vm_create_params *vcp, struct proc *p) +{ + int i, ret; + struct vm *vm; + struct vcpu *vcpu; + + vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO); + SLIST_INIT(&vm->vm_vcpu_list); + rw_init(&vm->vm_vcpu_lock, "vcpulock"); + + vm->vm_creator_pid = p->p_p->ps_pid; + vm->vm_memory_size = vcp->vcp_memory_size; + strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN); + + if (vm_impl_init(vm)) { + printf("failed to init arch-specific features for vm 0x%p\n", + vm); + vm_teardown(vm); + return ENOMEM; + } + + rw_enter_write(&vmm_softc->vm_lock); + vmm_softc->vm_ct++; + vmm_softc->vm_idx++; + + /* + * XXX we use the vm_id for the VPID/ASID, so we need to prevent + * wrapping around 65536/4096 entries here + */ + vm->vm_id = vmm_softc->vm_idx; + vm->vm_vcpu_ct = 0; + + /* Initialize each VCPU defined in 'vcp' */ + for (i = 0; i < vcp->vcp_ncpus; i++) { + vcpu = pool_get(&vcpu_pool, PR_WAITOK | PR_ZERO); + vcpu->vc_parent = vm; + if ((ret = vcpu_init(vcpu)) != 0) { + printf("failed to init vcpu %d for vm 0x%p\n", i, vm); + vm_teardown(vm); + vmm_softc->vm_ct--; + vmm_softc->vm_idx--; + rw_exit_write(&vmm_softc->vm_lock); + return (ret); + } + rw_enter_write(&vm->vm_vcpu_lock); + vcpu->vc_id = vm->vm_vcpu_ct; + vm->vm_vcpu_ct++; + SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link); + rw_exit_write(&vm->vm_vcpu_lock); + } + + /* XXX init various other hardware parts (vlapic, vioapic, etc) */ + + SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link); + rw_exit_write(&vmm_softc->vm_lock); + + vcp->vcp_id = vm->vm_id; + + return (0); +} + +/* + * vm_impl_init_vmx + 
* + * Intel VMX specific VM initialization routine + */ +int +vm_impl_init_vmx(struct vm *vm) +{ + struct pmap *pmap; + size_t memsize; + vaddr_t startp; + int ret; + + /* If not EPT, nothing to do here */ + if (vmm_softc->mode != VMM_MODE_EPT) + return (0); + + /* Create a new pmap for this VM */ + pmap = pmap_create(); + if (!pmap) { + printf("vm_impl_init_vmx: pmap_create failed\n"); + return (ENOMEM); + } + + startp = 0; + memsize = vm->vm_memory_size * 1024 * 1024; + + /* + * Create a new UVM map for this VM, and assign it the pmap just + * created. + */ + vm->vm_map = uvm_map_create(pmap, 0, memsize, + VM_MAP_ISVMSPACE | VM_MAP_PAGEABLE); + + if (!vm->vm_map) { + printf("vm_impl_init_vmx: uvm_map_create failed\n"); + pmap_destroy(pmap); + return (ENOMEM); + } + + /* Map the new map with an anon */ + dprintf(("vm_impl_init_vmx: created vm_map @ %p\n", vm->vm_map)); + ret = uvm_mapanon(vm->vm_map, &startp, memsize, 0, + UVM_MAPFLAG(PROT_READ | PROT_WRITE | PROT_EXEC, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_INHERIT_NONE, + MADV_NORMAL, + UVM_FLAG_FIXED | UVM_FLAG_OVERLAY)); + if (ret) { + printf("vm_impl_init_vmx: uvm_mapanon failed (%d)\n", ret); + /* uvm_map_deallocate calls pmap_destroy for us */ + uvm_map_deallocate(vm->vm_map); + vm->vm_map = NULL; + return (ENOMEM); + } + + /* Convert the low 512GB of the pmap to EPT */ + ret = pmap_convert(pmap, PMAP_TYPE_EPT); + if (ret) { + printf("vm_impl_init_vmx: pmap_convert failed\n"); + /* uvm_map_deallocate calls pmap_destroy for us */ + uvm_map_deallocate(vm->vm_map); + vm->vm_map = NULL; + return (ENOMEM); + } + + return (0); +} + +/* + * vm_impl_init_svm + * + * AMD SVM specific VM initialization routine + */ +int +vm_impl_init_svm(struct vm *vm) +{ + /* XXX removed due to rot */ + return (0); +} + +/* + * vm_impl_init + * + * Calls the architecture-specific VM init routine + */ +int +vm_impl_init(struct vm *vm) +{ + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + return 
vm_impl_init_vmx(vm); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + return vm_impl_init_svm(vm); + else + panic("unknown vmm mode\n"); +} + +/* + * vm_impl_deinit_vmx + * + * Intel VMX specific VM initialization routine + */ +void +vm_impl_deinit_vmx(struct vm *vm) +{ + /* Unused */ +} + +/* + * vm_impl_deinit_svm + * + * AMD SVM specific VM initialization routine + */ +void +vm_impl_deinit_svm(struct vm *vm) +{ + /* Unused */ +} + +/* + * vm_impl_deinit + * + * Calls the architecture-specific VM init routine + */ +void +vm_impl_deinit(struct vm *vm) +{ + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + vm_impl_deinit_vmx(vm); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + vm_impl_deinit_svm(vm); + else + panic("unknown vmm mode\n"); +} + +/* + * vcpu_init_vmx + * + * Intel VMX specific VCPU initialization routine. + * + * This function allocates various per-VCPU memory regions, sets up initial + * VCPU VMCS controls, and sets initial register values. + * + * This function is very long but is only performing a bunch of register + * setups, over and over. 
+ */ +int +vcpu_init_vmx(struct vcpu *vcpu) +{ + struct vmcs *vmcs; + uint16_t ctrl; + uint64_t pat_default, msr, ctrlval, eptp; + uint32_t pinbased, procbased, procbased2, exit, entry; + uint32_t want1, want0; + uint32_t cr0, cr4; + paddr_t control_pa; + int ret; + struct vmx_msr_store *msr_store; + + ret = 0; + + /* Allocate VMCS VA */ + vcpu->vc_control_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero, + &kd_waitok); + + if (!vcpu->vc_control_va) + return (ENOMEM); + + /* Compute VMCS PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_control_va, &control_pa)) { + ret = ENOMEM; + goto exit; + } + + vcpu->vc_control_pa = (uint64_t)control_pa; + + /* Allocate MSR bitmap VA */ + /* XXX dont need this if no msr bitmap support */ + vcpu->vc_msr_bitmap_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero, + &kd_waitok); + + if (!vcpu->vc_msr_bitmap_va) { + ret = ENOMEM; + goto exit; + } + + /* Compute MSR bitmap PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_msr_bitmap_va, &control_pa)) { + ret = ENOMEM; + goto exit; + } + + vcpu->vc_msr_bitmap_pa = (uint64_t)control_pa; + + /* Allocate MSR exit load area VA */ + /* XXX may not need this with MSR bitmaps */ + vcpu->vc_vmx_msr_exit_load_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, + &kp_zero, &kd_waitok); + + if (!vcpu->vc_vmx_msr_exit_load_va) { + ret = ENOMEM; + goto exit; + } + + /* Compute MSR exit load area PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_exit_load_va, + &vcpu->vc_vmx_msr_exit_load_pa)) { + ret = ENOMEM; + goto exit; + } + + /* Allocate MSR exit save area VA */ + /* XXX may not need this with MSR bitmaps */ + vcpu->vc_vmx_msr_exit_save_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, + &kp_zero, &kd_waitok); + + if (!vcpu->vc_vmx_msr_exit_save_va) { + ret = ENOMEM; + goto exit; + } + + /* Compute MSR exit save area PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_exit_save_va, + &vcpu->vc_vmx_msr_exit_save_pa)) { + ret = ENOMEM; + goto exit; + } + + /* Allocate MSR entry load 
area VA */ + /* XXX may not need this with MSR bitmaps */ + vcpu->vc_vmx_msr_entry_load_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, + &kp_zero, &kd_waitok); + + if (!vcpu->vc_vmx_msr_entry_load_va) { + ret = ENOMEM; + goto exit; + } + + /* Compute MSR entry load area PA */ + if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_entry_load_va, + &vcpu->vc_vmx_msr_entry_load_pa)) { + ret = ENOMEM; + goto exit; + } + + dprintf(("exit save va/pa 0x%llx 0x%llx\n", + (uint64_t)vcpu->vc_vmx_msr_exit_save_va, + (uint64_t)vcpu->vc_vmx_msr_exit_save_pa)); + dprintf(("exit load va/pa 0x%llx 0x%llx\n", + (uint64_t)vcpu->vc_vmx_msr_exit_load_va, + (uint64_t)vcpu->vc_vmx_msr_exit_load_pa)); + dprintf(("entry load va/pa 0x%llx 0x%llx\n", + (uint64_t)vcpu->vc_vmx_msr_entry_load_va, + (uint64_t)vcpu->vc_vmx_msr_entry_load_pa)); + dprintf(("vlapic va/pa 0x%llx 0x%llx\n", + (uint64_t)vcpu->vc_vlapic_va, + (uint64_t)vcpu->vc_vlapic_pa)); + dprintf(("msr bitmap va/pa 0x%llx 0x%llx\n", + (uint64_t)vcpu->vc_msr_bitmap_va, + (uint64_t)vcpu->vc_msr_bitmap_pa)); + + vmcs = (struct vmcs *)vcpu->vc_control_va; + vmcs->vmcs_revision = curcpu()->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision; + + /* Clear the VMCS */ + if (vmclear(&vcpu->vc_control_pa)) { + ret = EINVAL; + goto exit; + } + + /* + * Load the VMCS onto this PCPU so we can write registers and controls + */ + if (vmptrld(&vcpu->vc_control_pa)) { + ret = EINVAL; + goto exit; + } + + /* Compute Basic Entry / Exit Controls */ + vcpu->vc_vmx_basic = rdmsr(IA32_VMX_BASIC); + vcpu->vc_vmx_entry_ctls = rdmsr(IA32_VMX_ENTRY_CTLS); + vcpu->vc_vmx_exit_ctls = rdmsr(IA32_VMX_EXIT_CTLS); + vcpu->vc_vmx_pinbased_ctls = rdmsr(IA32_VMX_PINBASED_CTLS); + vcpu->vc_vmx_procbased_ctls = rdmsr(IA32_VMX_PROCBASED_CTLS); + + /* Compute True Entry / Exit Controls (if applicable) */ + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + vcpu->vc_vmx_true_entry_ctls = rdmsr(IA32_VMX_TRUE_ENTRY_CTLS); + vcpu->vc_vmx_true_exit_ctls = rdmsr(IA32_VMX_TRUE_EXIT_CTLS); + 
vcpu->vc_vmx_true_pinbased_ctls = + rdmsr(IA32_VMX_TRUE_PINBASED_CTLS); + vcpu->vc_vmx_true_procbased_ctls = + rdmsr(IA32_VMX_TRUE_PROCBASED_CTLS); + } + + /* Compute Secondary Procbased Controls (if applicable) */ + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) + vcpu->vc_vmx_procbased2_ctls = rdmsr(IA32_VMX_PROCBASED2_CTLS); + + +# if 0 + /* XXX not needed now with MSR list */ + + /* Default Guest PAT (if applicable) */ + if ((vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS, + IA32_VMX_LOAD_IA32_PAT_ON_ENTRY, 1)) || + vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS, + IA32_VMX_SAVE_IA32_PAT_ON_EXIT, 1)) { + pat_default = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WT) | + PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | + PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WT) | + PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); + if (vmwrite(VMCS_GUEST_IA32_PAT, pat_default)) { + ret = EINVAL; + goto exit; + } + } + + /* Host PAT (if applicable) */ + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS, + IA32_VMX_LOAD_IA32_PAT_ON_EXIT, 1)) { + msr = rdmsr(MSR_CR_PAT); + if (vmwrite(VMCS_HOST_IA32_PAT, msr)) { + ret = EINVAL; + goto exit; + } + } +#endif + + /* Host CR0 */ + cr0 = rcr0(); + if (vmwrite(VMCS_HOST_IA32_CR0, cr0)) { + ret = EINVAL; + goto exit; + } + + /* Host CR4 */ + cr4 = rcr4(); + if (vmwrite(VMCS_HOST_IA32_CR4, cr4)) { + ret = EINVAL; + goto exit; + } + + /* Host Segment Selectors */ + if (vmwrite(VMCS_HOST_IA32_CS_SEL, GSEL(GCODE_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_DS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_ES_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_FS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_GS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_SS_SEL, 
GSEL(GDATA_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_HOST_IA32_TR_SEL, GSYSSEL(GPROC0_SEL, SEL_KPL))) { + ret = EINVAL; + goto exit; + } + + /* Host IDTR base */ + if (vmwrite(VMCS_HOST_IA32_IDTR_BASE, idt_vaddr)) { + ret = EINVAL; + goto exit; + } + + /* VMCS link */ + if (vmwrite(VMCS_LINK_POINTER, 0xFFFFFFFFFFFFFFFF)) { + ret = EINVAL; + goto exit; + } + + /* + * Pinbased ctrls + * + * We must be able to set the following: + * IA32_VMX_EXTERNAL_INT_EXITING - exit on host interrupt + * IA32_VMX_NMI_EXITING - exit on host NMI + */ + want1 = IA32_VMX_EXTERNAL_INT_EXITING | + IA32_VMX_NMI_EXITING; + want0 = 0; + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + ctrl = IA32_VMX_TRUE_PINBASED_CTLS; + ctrlval = vcpu->vc_vmx_true_pinbased_ctls; + } else { + ctrl = IA32_VMX_PINBASED_CTLS; + ctrlval = vcpu->vc_vmx_pinbased_ctls; + } + + if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0, + &pinbased)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_PINBASED_CTLS, pinbased)) { + ret = EINVAL; + goto exit; + } + + /* + * Procbased ctrls + * + * We must be able to set the following: + * IA32_VMX_HLT_EXITING - exit on HLT instruction + * IA32_VMX_MWAIT_EXITING - exit on MWAIT instruction + * IA32_VMX_UNCONDITIONAL_IO_EXITING - exit on I/O instructions + * IA32_VMX_USE_MSR_BITMAPS - exit on various MSR accesses + * IA32_VMX_CR8_LOAD_EXITING - guest TPR access + * IA32_VMX_CR8_STORE_EXITING - guest TPR access + * IA32_VMX_USE_TPR_SHADOW - guest TPR access (shadow) + * + * If we have EPT, we must be able to clear the following + * IA32_VMX_CR3_LOAD_EXITING - don't care about guest CR3 accesses + * IA32_VMX_CR3_STORE_EXITING - don't care about guest CR3 accesses + */ + want1 = IA32_VMX_HLT_EXITING | + IA32_VMX_MWAIT_EXITING | + IA32_VMX_UNCONDITIONAL_IO_EXITING | + IA32_VMX_USE_MSR_BITMAPS | + IA32_VMX_CR8_LOAD_EXITING | + IA32_VMX_CR8_STORE_EXITING | + IA32_VMX_USE_TPR_SHADOW; + want0 = 0; + + if (vmm_softc->mode == 
VMM_MODE_EPT) { + want1 |= IA32_VMX_ACTIVATE_SECONDARY_CONTROLS; + want0 |= IA32_VMX_CR3_LOAD_EXITING | + IA32_VMX_CR3_STORE_EXITING; + } + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + ctrl = IA32_VMX_TRUE_PROCBASED_CTLS; + ctrlval = vcpu->vc_vmx_true_procbased_ctls; + } else { + ctrl = IA32_VMX_PROCBASED_CTLS; + ctrlval = vcpu->vc_vmx_procbased_ctls; + } + + if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0, + &procbased)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) { + ret = EINVAL; + goto exit; + } + + /* + * Secondary Procbased ctrls + * + * We want to be able to set the following, if available: + * IA32_VMX_ENABLE_VPID - use VPIDs where available + * + * If we have EPT, we must be able to set the following: + * IA32_VMX_ENABLE_EPT - enable EPT + * + * If we have unrestricted guest capability, we must be able to set + * the following: + * IA32_VMX_UNRESTRICTED_GUEST - enable unrestricted guest + */ + want1 = 0; + + /* XXX checking for 2ndary controls can be combined here */ + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_ENABLE_VPID, 1)) + want1 |= IA32_VMX_ENABLE_VPID; + } + + if (vmm_softc->mode == VMM_MODE_EPT) + want1 |= IA32_VMX_ENABLE_EPT; + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_UNRESTRICTED_GUEST, 1)) + want1 |= IA32_VMX_UNRESTRICTED_GUEST; + } + + want0 = ~want1; + ctrlval = vcpu->vc_vmx_procbased2_ctls; + ctrl = IA32_VMX_PROCBASED2_CTLS; + + if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0, + &procbased2)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_PROCBASED2_CTLS, procbased2)) { + ret = EINVAL; + goto exit; + } + + /* + * Exit ctrls + * + * We must be able to set the following: + * IA32_VMX_HOST_SPACE_ADDRESS_SIZE - 
exit to long mode + * IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT - ack interrupt on exit + * XXX clear save_debug_ctrls on exit ? + */ + want1 = IA32_VMX_HOST_SPACE_ADDRESS_SIZE | + IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT; + want0 = 0; + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + ctrl = IA32_VMX_TRUE_EXIT_CTLS; + ctrlval = vcpu->vc_vmx_true_exit_ctls; + } else { + ctrl = IA32_VMX_EXIT_CTLS; + ctrlval = vcpu->vc_vmx_exit_ctls; + } + + if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0, &exit)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_CTLS, exit)) { + ret = EINVAL; + goto exit; + } + + /* + * Entry ctrls + * + * We must be able to clear the following: + * IA32_VMX_ENTRY_TO_SMM - enter to SMM + * IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT + * XXX clear load debug_ctrls on entry ? + */ + want1 = 0; + want0 = IA32_VMX_ENTRY_TO_SMM | + IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT; + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + ctrl = IA32_VMX_TRUE_ENTRY_CTLS; + ctrlval = vcpu->vc_vmx_true_entry_ctls; + } else { + ctrl = IA32_VMX_ENTRY_CTLS; + ctrlval = vcpu->vc_vmx_entry_ctls; + } + + if (vcpu_vmx_compute_ctrl(vcpu, ctrlval, ctrl, want1, want0, &entry)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_ENTRY_CTLS, entry)) { + ret = EINVAL; + goto exit; + } + + if (vmm_softc->mode == VMM_MODE_EPT) { + eptp = vcpu->vc_parent->vm_map->pmap->pm_pdirpa; + msr = rdmsr(IA32_VMX_EPT_VPID_CAP); + if (msr & IA32_EPT_VPID_CAP_PAGE_WALK_4) { + /* Page walk length 4 supported */ + eptp |= ((IA32_EPT_PAGE_WALK_LENGTH - 1) << 3); + } + + + if (msr & IA32_EPT_VPID_CAP_WB) { + /* WB cache type supported */ + eptp |= IA32_EPT_PAGING_CACHE_TYPE_WB; + } + + dprintf(("guest eptp = 0x%llx\n", eptp)); + if (vmwrite(VMCS_GUEST_IA32_EPTP, eptp)) { + ret = EINVAL; + goto exit; + } + } + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, 
+ IA32_VMX_ENABLE_VPID, 1)) + if (vmwrite(VMCS_GUEST_VPID, + (uint16_t)vcpu->vc_parent->vm_id)) { + ret = EINVAL; + goto exit; + } + } + + /* + * The next portion of code sets up the VMCS for the register state + * we want during VCPU start. This matches what the CPU state would + * be after a bootloader transition to 'start'. + */ + if (vmwrite(VMCS_GUEST_IA32_RFLAGS, 0x2)) { + ret = EINVAL; + goto exit; + } + + /* + * XXX - + * vg_rip gets special treatment here since we will rewrite + * it just before vmx_enter_guest, so it needs to match. + * we could just set vg_rip here and be done with (no vmwrite + * here) but that would require us to have proper resume + * handling (resume=1) in the exit handler, so for now we + * will just end up doing an extra vmwrite here. + * + * This can now change from the hardcoded value of 0x1000160 + * to the marks[start] from vmd's bootloader. That needs to + * be hoisted up into vcpu create parameters via vm create params. + */ + vcpu->vc_gueststate.vg_rip = 0x01000160; + if (vmwrite(VMCS_GUEST_IA32_RIP, 0x01000160)) { + ret = EINVAL; + goto exit; + } + + /* + * Determine default CR0 as per Intel SDM A.7 + * All flexible bits are set to 0 + */ + cr0 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0) & + (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1); + cr0 |= (CR0_CD | CR0_NW | CR0_ET); + + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, + IA32_VMX_UNRESTRICTED_GUEST, 1)) +// cr0 &= ~(CR0_PG); + cr0 &= ~(CR0_PG | CR0_PE); + } + + if (vmwrite(VMCS_GUEST_IA32_CR0, cr0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_CR3, 0x0)) { + ret = EINVAL; + goto exit; + } + + /* + * Determine default CR4 as per Intel SDM A.8 + * All flexible bits are set to 0 + */ + cr4 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0) & + (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1); + + if (vmwrite(VMCS_GUEST_IA32_CR4, cr4)) { + ret = 
EINVAL; + goto exit; + } + + /* Set guest stack for 0x10000 - sizeof(bootloader stack setup) */ + if (vmwrite(VMCS_GUEST_IA32_RSP, 0xFFDC)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_SS_SEL, 0x10)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_SS_LIMIT, 0xFFFFFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_SS_AR, 0xC093)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_SS_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_DS_SEL, 0x10)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_DS_LIMIT, 0xFFFFFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_DS_AR, 0xC093)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_DS_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_ES_SEL, 0x10)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_ES_LIMIT, 0xFFFFFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_ES_AR, 0xC093)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_ES_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_FS_SEL, 0x10)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_FS_LIMIT, 0xFFFFFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_FS_AR, 0xC093)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_FS_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_GS_SEL, 0x10)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_GS_LIMIT, 0xFFFFFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_GS_AR, 0xC093)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_GS_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_CS_SEL, 0x8)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_CS_LIMIT, 0xFFFFFFFF)) { + 
ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_CS_AR, 0xC09F)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_CS_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_GDTR_LIMIT, 0xFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_GDTR_BASE, 0x10000)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_IDTR_LIMIT, 0xFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_IDTR_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_LDTR_SEL, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_LDTR_LIMIT, 0xFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_LDTR_AR, 0x0082)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_LDTR_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_TR_SEL, 0x0)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_TR_LIMIT, 0xFFFF)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_TR_AR, 0x008B)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_IA32_TR_BASE, 0x0)) { + ret = EINVAL; + goto exit; + } + + /* + * Select MSRs to be saved on exit + */ + msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; + msr_store[0].vms_index = MSR_EFER; + msr_store[1].vms_index = MSR_CR_PAT; + msr_store[2].vms_index = MSR_STAR; + msr_store[3].vms_index = MSR_LSTAR; + msr_store[4].vms_index = MSR_CSTAR; + msr_store[5].vms_index = MSR_SFMASK; + msr_store[6].vms_index = MSR_KERNELGSBASE; + + /* + * Select MSRs to be loaded on exit + */ + msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_load_va; + msr_store[0].vms_index = MSR_EFER; + msr_store[0].vms_data = rdmsr(MSR_EFER); + msr_store[1].vms_index = MSR_CR_PAT; + msr_store[1].vms_data = rdmsr(MSR_CR_PAT); + msr_store[2].vms_index = MSR_STAR; + msr_store[2].vms_data = rdmsr(MSR_STAR); + msr_store[3].vms_index = 
MSR_LSTAR; + msr_store[3].vms_data = rdmsr(MSR_LSTAR); + msr_store[4].vms_index = MSR_CSTAR; + msr_store[4].vms_data = rdmsr(MSR_CSTAR); + msr_store[5].vms_index = MSR_SFMASK; + msr_store[5].vms_data = rdmsr(MSR_SFMASK); + msr_store[6].vms_index = MSR_KERNELGSBASE; + msr_store[6].vms_data = rdmsr(MSR_KERNELGSBASE); + + /* + * Select MSRs to be loaded on entry + */ + msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_entry_load_va; + msr_store[0].vms_index = MSR_EFER; + msr_store[0].vms_data = 0ULL; /* Initial value */ + msr_store[1].vms_index = MSR_CR_PAT; + msr_store[1].vms_data = pat_default; /* Initial value */ + msr_store[2].vms_index = MSR_STAR; + msr_store[2].vms_data = 0ULL; /* Initial value */ + msr_store[3].vms_index = MSR_LSTAR; + msr_store[3].vms_data = 0ULL; /* Initial value */ + msr_store[4].vms_index = MSR_CSTAR; + msr_store[4].vms_data = 0ULL; /* Initial value */ + msr_store[5].vms_index = MSR_SFMASK; + msr_store[5].vms_data = 0ULL; /* Initial value */ + msr_store[6].vms_index = MSR_KERNELGSBASE; + msr_store[6].vms_data = 0ULL; /* Initial value */ + + if (vmwrite(VMCS_EXIT_MSR_STORE_COUNT, 0x7)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, 0x7)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, 0x7)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS, + vcpu->vc_vmx_msr_exit_save_pa)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS, + vcpu->vc_vmx_msr_exit_load_pa)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS, + vcpu->vc_vmx_msr_exit_save_pa)) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_MSR_BITMAP_ADDRESS, + vcpu->vc_msr_bitmap_pa)) { + ret = EINVAL; + goto exit; + } + + /* XXX msr bitmap - set restrictions */ + /* XXX CR0 shadow */ + /* XXX CR4 shadow */ + + /* Flush content of VMCS to memory */ + if (vmclear(&vcpu->vc_control_pa)) { + ret = EINVAL; + goto exit; + } + +exit: + if (ret) 
{ + if (vcpu->vc_control_va) + km_free((void *)vcpu->vc_control_va, PAGE_SIZE, + &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_exit_save_va) + km_free((void *)vcpu->vc_vmx_msr_exit_save_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_exit_load_va) + km_free((void *)vcpu->vc_vmx_msr_exit_load_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_entry_load_va) + km_free((void *)vcpu->vc_vmx_msr_entry_load_va, + PAGE_SIZE, &kv_page, &kp_zero); + } + + return (ret); +} + +/* + * vcpu_init_svm + * + * AMD SVM specific VCPU initialization routine. + */ +int +vcpu_init_svm(struct vcpu *vcpu) +{ + /* XXX removed due to rot */ + return (0); +} + +/* + * vcpu_init + * + * Calls the architecture-specific VCPU init routine + */ +int +vcpu_init(struct vcpu *vcpu) +{ + int ret; + + ret = 0; + vcpu->vc_hsa_stack_va = (vaddr_t)malloc(PAGE_SIZE, + M_DEVBUF, M_WAITOK | M_ZERO); + if (!vcpu->vc_hsa_stack_va) + return (ENOMEM); + + vcpu->vc_virt_mode = vmm_softc->mode; + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) { + ret = vcpu_init_vmx(vcpu); + if (ret) + free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, + PAGE_SIZE); + } + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) { + ret = vcpu_init_svm(vcpu); + if (ret) + free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, + PAGE_SIZE); + } + else + panic("unknown vmm mode\n"); + + return (ret); +} + +/* + * vcpu_deinit_vmx + * + * Deinitializes the vcpu described by 'vcpu' + */ +void +vcpu_deinit_vmx(struct vcpu *vcpu) +{ + if (vcpu->vc_control_va) + km_free((void *)vcpu->vc_control_va, PAGE_SIZE, + &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_exit_save_va) + km_free((void *)vcpu->vc_vmx_msr_exit_save_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_exit_load_va) + km_free((void *)vcpu->vc_vmx_msr_exit_load_va, + PAGE_SIZE, &kv_page, &kp_zero); + if (vcpu->vc_vmx_msr_entry_load_va) + km_free((void *)vcpu->vc_vmx_msr_entry_load_va, + PAGE_SIZE, &kv_page, 
&kp_zero); + if (vcpu->vc_hsa_stack_va) + free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE); +} + +/* + * vcpu_deinit_svm + * + * Deinitializes the vcpu described by 'vcpu' + */ +void +vcpu_deinit_svm(struct vcpu *vcpu) +{ + /* Unused */ +} + +/* + * vcpu_deinit + * + * Calls the architecture-specific VCPU deinit routine + */ +void +vcpu_deinit(struct vcpu *vcpu) +{ + if (vmm_softc->mode == VMM_MODE_VMX || + vmm_softc->mode == VMM_MODE_EPT) + vcpu_deinit_vmx(vcpu); + else if (vmm_softc->mode == VMM_MODE_SVM || + vmm_softc->mode == VMM_MODE_RVI) + vcpu_deinit_svm(vcpu); + else + panic("unknown vmm mode\n"); +} + +/* + * vm_teardown + * + * Tears down (destroys) the vm indicated by 'vm'. + */ +void +vm_teardown(struct vm *vm) +{ + struct vcpu *vcpu, *tmp; + + /* XXX coordinate a stop of all VCPUs first */ + + /* Free VCPUs */ + rw_enter_write(&vm->vm_vcpu_lock); + SLIST_FOREACH_SAFE(vcpu, &vm->vm_vcpu_list, vc_vcpu_link, tmp) { + SLIST_REMOVE(&vm->vm_vcpu_list, vcpu, vcpu, vc_vcpu_link); + vcpu_deinit(vcpu); + pool_put(&vcpu_pool, vcpu); + } + rw_exit_write(&vm->vm_vcpu_lock); + + vm_impl_deinit(vm); + + /* XXX teardown guest vmspace, free pages */ + + pool_put(&vm_pool, vm); +} + +/* + * vcpu_vmx_check_cap + * + * Checks if the 'cap' bit in the 'msr' MSR can be set or cleared (set = 1 + * or set = 0, respectively). + * + * When considering 'msr', we check to see if true controls are available, + * and use those if so. + * + * Returns 1 of 'cap' can be set/cleared as requested, 0 otherwise. 
+ */ +int +vcpu_vmx_check_cap(struct vcpu *vcpu, uint32_t msr, uint32_t cap, int set) +{ + uint64_t ctl; + + if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) { + switch (msr) { + case IA32_VMX_PINBASED_CTLS: + ctl = vcpu->vc_vmx_true_pinbased_ctls; + break; + case IA32_VMX_PROCBASED_CTLS: + ctl = vcpu->vc_vmx_true_procbased_ctls; + break; + case IA32_VMX_PROCBASED2_CTLS: + ctl = vcpu->vc_vmx_procbased2_ctls; + break; + case IA32_VMX_ENTRY_CTLS: + ctl = vcpu->vc_vmx_true_entry_ctls; + break; + case IA32_VMX_EXIT_CTLS: + ctl = vcpu->vc_vmx_true_exit_ctls; + break; + default: + return (0); + } + } else { + switch (msr) { + case IA32_VMX_PINBASED_CTLS: + ctl = vcpu->vc_vmx_pinbased_ctls; + break; + case IA32_VMX_PROCBASED_CTLS: + ctl = vcpu->vc_vmx_procbased_ctls; + break; + case IA32_VMX_PROCBASED2_CTLS: + ctl = vcpu->vc_vmx_procbased2_ctls; + break; + case IA32_VMX_ENTRY_CTLS: + ctl = vcpu->vc_vmx_entry_ctls; + break; + case IA32_VMX_EXIT_CTLS: + ctl = vcpu->vc_vmx_exit_ctls; + break; + default: + return (0); + } + } + + if (set) { + /* Check bit 'cap << 32', must be !0 */ + return (ctl & ((uint64_t)cap << 32)) != 0; + } else { + /* Check bit 'cap', must be 0 */ + return (ctl & cap) == 0; + } +} + +/* + * vcpu_vmx_compute_ctrl + * + * Computes the appropriate control value, given the supplied parameters + * and CPU capabilities. + * + * Intel has made somewhat of a mess of this computation - it is described + * using no fewer than three different approaches, spread across many + * pages of the SDM. Further compounding the problem is the fact that now + * we have "true controls" for each type of "control", and each needs to + * be examined to get the calculation right, but only if "true" controls + * are present on the CPU we're on. + * + * Parameters: + * vcpu: the vcpu for which controls are to be computed. 
(XXX now unused) + * ctrlval: the control value, as read from the CPU MSR + * ctrl: which control is being set (eg, pinbased, procbased, etc) + * want0: the set of desired 0 bits + * want1: the set of desired 1 bits + * out: (out) the correct value to write into the VMCS for this VCPU, + * for the 'ctrl' desired. + * + * Returns 0 if successful, or EINVAL if the supplied parameters define + * an unworkable control setup. + */ +int +vcpu_vmx_compute_ctrl(struct vcpu *vcpu, uint64_t ctrlval, uint16_t ctrl, + uint32_t want1, uint32_t want0, uint32_t *out) +{ + int i, set, clear; + + /* + * The Intel SDM gives three formulae for determining which bits to + * set/clear for a given control and desired functionality. Formula + * 1 is the simplest but disallows use of newer features that are + * enabled by functionality in later CPUs. + * + * Formulas 2 and 3 allow such extra functionality. We use formula + * 2 - this requires us to know the identity of controls in the + * "default1" class for each control register, but allows us to not + * have to pass along and/or query both sets of capability MSRs for + * each control lookup. This makes the code slightly longer, + * however. + */ + for (i = 0; i < 32; i++) { + /* Figure out if we can set and / or clear this bit */ + set = (ctrlval & (1ULL << (i + 32))) != 0; + clear = ((1ULL << i) & ((uint64_t)ctrlval)) == 0; + + /* If the bit can't be set nor cleared, something's wrong */ + if (!set && !clear) + return (EINVAL); + + /* + * Formula 2.c.i - "If the relevant VMX capability MSR + * reports that a control has a single setting, use that + * setting." 
+ */ + if (set && !clear) { + if (want0 & (1ULL << i)) + return (EINVAL); + else + *out |= (1ULL << i); + } else if (clear && !set) { + if (want1 & (1ULL << i)) + return (EINVAL); + else + *out &= ~(1ULL << i); + } else { + /* + * 2.c.ii - "If the relevant VMX capability MSR + * reports that a control can be set to 0 or 1 + * and that control's meaning is known to the VMM, + * set the control based on the functionality desired." + */ + if (want1 & (1ULL << i)) + *out |= (1ULL << i); + else if (want0 & (1 << i)) + *out &= ~(1ULL << i); + else { + /* + * ... assuming the control's meaning is not + * known to the VMM ... + * + * 2.c.iii - "If the relevant VMX capability + * MSR reports that a control can be set to 0 + * or 1 and the control is not in the default1 + * class, set the control to 0." + * + * 2.c.iv - "If the relevant VMX capability + * MSR reports that a control can be set to 0 + * or 1 and the control is in the default1 + * class, set the control to 1." + */ + switch (ctrl) { + case IA32_VMX_PINBASED_CTLS: + case IA32_VMX_TRUE_PINBASED_CTLS: + /* + * A.3.1 - default1 class of pinbased + * controls comprises bits 1,2,4 + */ + switch (i) { + case 1: + case 2: + case 4: + *out |= (1ULL << i); + break; + default: + *out &= ~(1ULL << i); + break; + } + break; + case IA32_VMX_PROCBASED_CTLS: + case IA32_VMX_TRUE_PROCBASED_CTLS: + /* + * A.3.2 - default1 class of procbased + * controls comprises bits 1, 4-6, 8, + * 13-16, 26 + */ + switch (i) { + case 1: + case 4 ... 6: + case 8: + case 13 ... 16: + case 26: + *out |= (1ULL << i); + break; + default: + *out &= ~(1ULL << i); + break; + } + break; + /* + * Unknown secondary procbased controls + * can always be set to 0 + */ + case IA32_VMX_PROCBASED2_CTLS: + *out &= ~(1ULL << i); + break; + case IA32_VMX_EXIT_CTLS: + case IA32_VMX_TRUE_EXIT_CTLS: + /* + * A.4 - default1 class of exit + * controls comprises bits 0-8, 10, + * 11, 13, 14, 16, 17 + */ + switch (i) { + case 0 ... 8: + case 10 ... 11: + case 13 ... 
14: + case 16 ... 17: + *out |= (1ULL << i); + break; + default: + *out &= ~(1ULL << i); + break; + } + break; + case IA32_VMX_ENTRY_CTLS: + case IA32_VMX_TRUE_ENTRY_CTLS: + /* + * A.5 - default1 class of entry + * controls comprises bits 0-8, 12 + */ + switch (i) { + case 0 ... 8: + case 12: + *out |= (1ULL << i); + break; + default: + *out &= ~(1ULL << i); + break; + } + break; + } + } + } + } + + return (0); +} + +/* + * vm_get_info + * + * Returns information about the VM indicated by 'vip'. + */ +int +vm_get_info(struct vm_info_params *vip) +{ + struct vm_info_result *out; + struct vm *vm; + struct vcpu *vcpu; + int i, j; + size_t need; + + rw_enter_read(&vmm_softc->vm_lock); + need = vmm_softc->vm_ct * sizeof(struct vm_info_result); + if (vip->vip_size < need) { + vip->vip_info_ct = 0; + vip->vip_size = need; + rw_exit_read(&vmm_softc->vm_lock); + return (0); + } + + out = malloc(need, M_DEVBUF, M_NOWAIT); + if (out == NULL) { + vip->vip_info_ct = 0; + rw_exit_read(&vmm_softc->vm_lock); + return (ENOMEM); + } + + i = 0; + vip->vip_info_ct = vmm_softc->vm_ct; + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + out[i].vir_memory_size = vm->vm_memory_size; + out[i].vir_ncpus = vm->vm_vcpu_ct; + out[i].vir_id = vm->vm_id; + out[i].vir_creator_pid = vm->vm_creator_pid; + strncpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN); + rw_enter_read(&vm->vm_vcpu_lock); + for (j = 0; j < vm->vm_vcpu_ct; j++) { + out[i].vir_vcpu_state[j] = VCPU_STATE_UNKNOWN; + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, + vc_vcpu_link) { + if (vcpu->vc_id == j) + out[i].vir_vcpu_state[j] = + vcpu->vc_state; + } + } + rw_exit_read(&vm->vm_vcpu_lock); + i++; + } + rw_exit_read(&vmm_softc->vm_lock); + if (copyout(out, vip->vip_info, need) == EFAULT) { + free(out, M_DEVBUF, need); + return (EFAULT); + } + + free(out, M_DEVBUF, need); + return (0); +} + +/* + * vm_terminate + * + * Terminates the VM indicated by 'vtp'. 
+ */ +int +vm_terminate(struct vm_terminate_params *vtp) +{ + struct vm *vm, *found_vm; + struct vcpu *vcpu; + + found_vm = NULL; + + /* + * Find desired VM + */ + rw_enter_read(&vmm_softc->vm_lock); + + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vtp->vtp_vm_id) + found_vm = vm; + } + + if (found_vm) { + rw_enter_read(&found_vm->vm_vcpu_lock); + SLIST_FOREACH(vcpu, &found_vm->vm_vcpu_list, vc_vcpu_link) { + vcpu->vc_state = VCPU_STATE_REQSTOP; + } + rw_exit_read(&found_vm->vm_vcpu_lock); + } + + rw_exit_read(&vmm_softc->vm_lock); + + if (!found_vm) + return (ENOENT); + + /* XXX possible race here two threads terminating the same vm? */ + rw_enter_write(&vmm_softc->vm_lock); + vmm_softc->vm_ct--; + SLIST_REMOVE(&vmm_softc->vm_list, found_vm, vm, vm_link); + rw_exit_write(&vmm_softc->vm_lock); + vm_teardown(found_vm); + + return (0); +} + +/* + * vm_run + * + * Run the vm / vcpu specified by 'vrp' + */ +int +vm_run(struct vm_run_params *vrp) +{ + struct vm *vm, *found_vm; + struct vcpu *vcpu, *found_vcpu; + int ret; + + found_vm = NULL; + found_vcpu = NULL; + ret = 0; + + /* + * Find desired VM + */ + rw_enter_read(&vmm_softc->vm_lock); + + SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) { + if (vm->vm_id == vrp->vrp_vm_id) + found_vm = vm; + } + + if (found_vm) { + rw_enter_read(&found_vm->vm_vcpu_lock); + SLIST_FOREACH(vcpu, &found_vm->vm_vcpu_list, vc_vcpu_link) { + if (vcpu->vc_id == vrp->vrp_vcpu_id) { + found_vcpu = vcpu; + if (found_vcpu->vc_state != VCPU_STATE_STOPPED) + ret = EBUSY; + else + found_vcpu->vc_state = + VCPU_STATE_RUNNING; + } + } + rw_exit_read(&found_vm->vm_vcpu_lock); + + if (!found_vcpu) + ret = ENOENT; + } + + rw_exit_read(&vmm_softc->vm_lock); + + if (!found_vm) + ret = ENOENT; + + if (ret) + return (ret); + + /* + * We may be returning from userland helping us from the last exit. + * If so (vrp_continue == 1), copy in the exit data from vmd. 
+ */ + if (vrp->vrp_continue) { + if (copyin(vrp->vrp_exit, &found_vcpu->vc_exit, + sizeof(union vm_exit)) == EFAULT) { + return (EFAULT); + } + } + + /* Run the VCPU specified in vrp */ + if (found_vcpu->vc_virt_mode == VMM_MODE_VMX || + found_vcpu->vc_virt_mode == VMM_MODE_EPT) { + ret = vcpu_run_vmx(found_vcpu, vrp->vrp_continue, &vrp->vrp_injint); + } else if (found_vcpu->vc_virt_mode == VMM_MODE_SVM || + found_vcpu->vc_virt_mode == VMM_MODE_RVI) { + ret = vcpu_run_svm(found_vcpu, + vrp->vrp_continue); + } + + /* If we are exiting, populate exit data so vmd can help */ + if (ret == EAGAIN) { + vrp->vrp_exit_reason = + found_vcpu->vc_gueststate.vg_exit_reason; + + if (copyout(&found_vcpu->vc_exit, + vrp->vrp_exit, sizeof(union vm_exit)) == EFAULT) { + ret = EFAULT; + } else + ret = 0; + } else + vrp->vrp_exit_reason = VM_EXIT_NONE; + + return (ret); +} + +/* + * vcpu_run_vmx + * + * VMM main loop used to run a VCPU. + * + * Parameters: + * vcpu: The VCPU to run + * from_exit: 1 if returning directly from an exit to vmd during the + * previous run, or 0 if we exited last time without needing to + * exit to vmd. + * injint: Interrupt that should be injected during this run, or -1 if + * no interrupt should be injected. + * + * Return values: + * 0: The run loop exited and no help is needed from vmd + * EAGAIN: The run loop exited and help from vmd is needed + * EINVAL: an error occured + */ +int +vcpu_run_vmx(struct vcpu *vcpu, uint8_t from_exit, int16_t *injint) +{ + int ret, resume, exit_handled; + struct region_descriptor gdt; + struct cpu_info *ci; + uint64_t exit_reason, cr3, vmcs_ptr; + struct schedstate_percpu *spc; + struct vmx_invvpid_descriptor vid; + uint64_t rflags, eii; + + exit_handled = 1; + resume = 0; + + while (exit_handled) { + if (!resume) { + /* + * We are launching for the first time, or we are + * resuming from a different pcpu, so we need to + * reset certain pcpu-specific values. 
+ */ + ci = curcpu(); + setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1); + + if (vmptrld(&vcpu->vc_control_pa)) { + ret = EINVAL; + goto exit; + } + + if (gdt.rd_base == 0) { + ret = EINVAL; + goto exit; + } + + /* Host GDTR base */ + if (vmwrite(VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base)) { + ret = EINVAL; + goto exit; + } + + /* Host TR base */ + if (vmwrite(VMCS_HOST_IA32_TR_BASE, + (uint64_t)curcpu()->ci_tss)) { + ret = EINVAL; + goto exit; + } + + /* Host CR3 */ + cr3 = rcr3(); + if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) { + ret = EINVAL; + goto exit; + } + } + + /* + * If we are returning from userspace (vmd) because we exited + * last time, fix up any needed vcpu state first. + */ + if (from_exit) { + from_exit = 0; + switch (vcpu->vc_gueststate.vg_exit_reason) { + case VMX_EXIT_IO: + vcpu->vc_gueststate.vg_rax = + vcpu->vc_exit.vei.vei_data; + break; + case VMX_EXIT_HLT: + break; + default: + printf("vmx_enter_guest: returning from exit " + "with unknown reason %d\n", + vcpu->vc_gueststate.vg_exit_reason); + break; + } + } + + /* + * XXX - clock hack. We don't track host clocks while not + * running inside a VM, and thus we lose many clocks while + * the OS is running other processes. For now, approximate + * when a clock should be injected by injecting one clock + * per CLOCK_BIAS exits. + * + * This should be changed to track host clocks to know if + * a clock tick was missed, and "catch up" clock interrupt + * injections later as needed. + * + * Note that checking injint here and not injecting the + * clock interrupt if injint is set also violates interrupt + * priority, until this hack is fixed. 
+ */ + vmmclk++; + eii = 0xFFFFFFFFFFFFFFFFULL; + + if (vmmclk % CLOCK_BIAS == 0) + eii = 0x20; + + if (*injint != -1) + eii = *injint + 0x20; + + if (eii != 0xFFFFFFFFFFFFFFFFULL) { + if (vmread(VMCS_GUEST_IA32_RFLAGS, &rflags)) { + printf("intr: can't read guest rflags\n"); + rflags = 0; + } + + if (rflags & PSL_I) { + eii |= (1ULL << 31); /* Valid */ + eii |= (0ULL << 8); /* Hardware Interrupt */ + if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) { + printf("intr: can't vector clock " + "interrupt to guest\n"); + } + if (*injint != -1) + *injint = -1; + } + } + + /* XXX end clock hack */ + + /* Invalidate old TLB mappings */ + vid.vid_vpid = vcpu->vc_parent->vm_id; + vid.vid_addr = 0; + invvpid(IA32_VMX_INVVPID_SINGLE_CTX_GLB, &vid); + + /* Start / resume the VM / VCPU */ + /* XXX unlock the biglock here */ + ret = vmx_enter_guest(&vcpu->vc_control_pa, + &vcpu->vc_gueststate, resume); + /* XXX lock the biglock here */ + + /* If we exited successfully ... */ + if (ret == 0) { + resume = 1; + vcpu->vc_last_pcpu = ci; + if (vmread(VMCS_GUEST_IA32_RIP, + &vcpu->vc_gueststate.vg_rip)) { + printf("vcpu_run_vmx: cannot read guest rip\n"); + ret = EINVAL; + exit_handled = 0; + goto exit; + } + + if (vmread(VMCS_EXIT_REASON, &exit_reason)) { + printf("vcpu_run_vmx: cant read exit reason\n"); + ret = EINVAL; + exit_handled = 0; + goto exit; + } + + /* + * Handle the exit. This will alter "ret" to EAGAIN if + * the exit handler determines help from vmd is needed. 
+ */ + vcpu->vc_gueststate.vg_exit_reason = exit_reason; + exit_handled = vmx_handle_exit(vcpu, &ret); + + /* Check if we should yield - don't hog the cpu */ + spc = &ci->ci_schedstate; + if (spc->spc_schedflags & SPCF_SHOULDYIELD) { + resume = 0; + if (vmclear(&vcpu->vc_control_pa)) { + ret = EINVAL; + goto exit; + } + yield(); + } + } else if (ret == VMX_FAIL_LAUNCH_INVALID_VMCS) { + printf("vmx_enter_guest: failed launch with invalid " + "vmcs\n"); + ret = EINVAL; + exit_handled = 0; + } else if (ret == VMX_FAIL_LAUNCH_VALID_VMCS) { + exit_reason = vcpu->vc_gueststate.vg_exit_reason; + printf("vmx_enter_guest: failed launch with valid " + "vmcs, code=%lld (%s)\n", exit_reason, + vmx_instruction_error_decode(exit_reason)); + ret = EINVAL; + exit_handled = 0; + } else { + printf("vmx_enter_guest: failed launch for unknown " + "reason\n"); + ret = EINVAL; + exit_handled = 0; + } + + } + vcpu->vc_state = VCPU_STATE_STOPPED; + +exit: + /* + * We are heading back to userspace (vmd), either because we need help + * handling an exit, or we failed in some way to enter the guest. + * Clear any current VMCS pointer as we may end up coming back on + * a different CPU. + */ + if (!vmptrst(&vmcs_ptr)) { + if (vmcs_ptr != 0xFFFFFFFFFFFFFFFFULL) + if (vmclear(&vcpu->vc_control_pa)) + ret = EINVAL; + } else + ret = EINVAL; + + return (ret); +} + +/* + * vmx_handle_intr + * + * Handle host (external) interrupts. We read which interrupt fired by + * extracting the vector from the VMCS and dispatch the interrupt directly + * to the host using vmm_dispatch_intr. 
+ */ +void +vmx_handle_intr(struct vcpu *vcpu) +{ + uint8_t vec; + uint64_t eii; + struct gate_descriptor *idte; + vaddr_t handler; + + if (vmread(VMCS_EXIT_INTERRUPTION_INFO, &eii)) { + printf("vmx_handle_intr: can't obtain intr info\n"); + return; + } + + vec = eii & 0xFF; + + /* XXX check "error valid" code in eii, abort if 0 */ + idte=&idt[vec]; + handler = idte->gd_looffset + ((uint64_t)idte->gd_hioffset << 16); + vmm_dispatch_intr(handler); +} + +/* + * vmx_handle_hlt + * + * Handle HLT exits + */ +int +vmx_handle_hlt(struct vcpu *vcpu) +{ + uint64_t insn_length; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("vmx_handle_hlt: can't obtain instruction length\n"); + return (1); + } + + vcpu->vc_gueststate.vg_rip += insn_length; + return (0); +} + +/* + * vmx_handle_exit + * + * Handle exits from the VM by decoding the exit reason and calling various + * subhandlers as needed. + */ +int +vmx_handle_exit(struct vcpu *vcpu, int *result) +{ + uint64_t exit_reason; + int update_rip, handled; + + update_rip = 0; + handled = 1; + exit_reason = vcpu->vc_gueststate.vg_exit_reason; + + switch (exit_reason) { + case VMX_EXIT_EPT_VIOLATION: + *result = vmx_handle_np_fault(vcpu); + break; + case VMX_EXIT_CPUID: + *result = vmx_handle_cpuid(vcpu); + update_rip = 1; + break; + case VMX_EXIT_IO: + *result = vmx_handle_inout(vcpu); + update_rip = 1; + if (*result) + handled = 0; + break; + case VMX_EXIT_EXTINT: + vmx_handle_intr(vcpu); + update_rip = 0; + break; + case VMX_EXIT_CR_ACCESS: + *result = vmx_handle_cr(vcpu); + update_rip = 1; + break; + case VMX_EXIT_HLT: + *result = vmx_handle_hlt(vcpu); + update_rip = 1; + handled = 0; + break; + default: + dprintf(("vmx_handle_exit: unhandled exit %lld (%s)\n", + exit_reason, vmx_exit_reason_decode(exit_reason))); + *result = EINVAL; + return (0); + } + + if (update_rip) { + if (vmwrite(VMCS_GUEST_IA32_RIP, + vcpu->vc_gueststate.vg_rip)) { + printf("vmx_handle_exit: can't advance rip\n"); + *result = EINVAL; + 
return (0); + } + } + + return (handled); +} + +/* + * vmm_get_guest_memtype + * + * Returns the type of memory 'gpa' refers to in the context of vm 'vm' + */ +int +vmm_get_guest_memtype(struct vm *vm, paddr_t gpa) +{ + + if (gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) { + dprintf(("guest mmio access @ 0x%llx\n", (uint64_t)gpa)); + return (VMM_MEM_TYPE_REGULAR); + } + + if (gpa < vm->vm_memory_size * (1024 * 1024)) + return (VMM_MEM_TYPE_REGULAR); + else { + dprintf(("guest memtype @ 0x%llx unknown\n", (uint64_t)gpa)); + return (VMM_MEM_TYPE_UNKNOWN); + } +} + +/* + * vmm_get_guest_faulttype + * + * Determines the type (R/W/X) of the last fault on the VCPU last run on + * this PCPU. Calls the appropriate architecture-specific subroutine. + */ +int +vmm_get_guest_faulttype(void) +{ + if (vmm_softc->mode == VMM_MODE_EPT) + return vmx_get_guest_faulttype(); + else if (vmm_softc->mode == VMM_MODE_RVI) + return vmx_get_guest_faulttype(); + else + panic("unknown vmm mode\n"); + +} + +/* + * vmx_get_exit_qualification + * + * Return the current VMCS' exit qualification information + */ +int +vmx_get_exit_qualification(uint64_t *exit_qualification) +{ + if (vmread(VMCS_GUEST_EXIT_QUALIFICATION, exit_qualification)) { + printf("vmm_get_exit_qualification: cant extract exit qual\n"); + return (EINVAL); + } + + return (0); +} + +/* + * vmx_get_guest_faulttype + * + * Determines the type (R/W/X) of the last fault on the VCPU last run on + * this PCPU. 
+ */ +int +vmx_get_guest_faulttype(void) +{ + uint64_t exit_qualification; + + if (vmx_get_exit_qualification(&exit_qualification)) + return (EINVAL); + + if (exit_qualification & IA32_VMX_EPT_FAULT_WRITE) + return (PROT_WRITE); + else if (exit_qualification & IA32_VMX_EPT_FAULT_READ) + return (PROT_READ); + else if (exit_qualification & IA32_VMX_EPT_FAULT_EXEC) + return (PROT_EXEC); + else + return (EINVAL); +} + +/* + * svm_get_guest_faulttype + * + * Determines the type (R/W/X) of the last fault on the VCPU last run on + * this PCPU. + */ +int +svm_get_guest_faulttype(void) +{ + /* XXX removed due to rot */ + return (EINVAL); +} + +/* + * vmx_fault_page + * + * Request a new page to be faulted into the UVM map of the VM owning 'vcpu' + * at address 'gpa'. + */ +int +vmx_fault_page(struct vcpu *vcpu, paddr_t gpa) +{ + int fault_type, ret; + vaddr_t kva; + paddr_t host_pa; + struct pmap *pmap; + + fault_type = vmx_get_guest_faulttype(); + if (fault_type == EINVAL) { + printf("vmx_fault_page: invalid fault type\n"); + return (EINVAL); + } + + ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, fault_type, + PROT_READ | PROT_WRITE | PROT_EXEC); + if (!ret) { + pmap = vcpu->vc_parent->vm_map->pmap; + if (!vmx_fix_ept_pte(pmap, gpa)) { + if (pmap_extract(pmap, (vaddr_t)gpa, &host_pa)) { + kva = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, + &kp_none, &kd_nowait); + if (kva) { + pmap_kenter_pa(kva, host_pa, + PROT_READ | PROT_WRITE); + bzero((void *)kva, PAGE_SIZE); + pmap_kremove(kva, PAGE_SIZE); + km_free((void *)kva, PAGE_SIZE, &kv_any, + &kp_none); + } else { + printf("vmx_fault_page: kva failure\n"); + ret = ENOMEM; + } + } else { + printf("vmx_fault_page: extract failure\n"); + ret = EFAULT; + } + } else { + printf("vmx_fault_page: ept fixup failure\n"); + ret = EINVAL; + } + } else { + printf("vmx_fault_page: uvm_fault returns %d\n", ret); + } + + return (ret); +} + +/* + * vmx_handle_np_fault + * + * High level nested paging handler for VMX. 
Verifies that a fault is for a + * valid memory region, then faults a page, or aborts otherwise. + */ +int +vmx_handle_np_fault(struct vcpu *vcpu) +{ + uint64_t gpa; + int gpa_memtype, ret; + + ret = 0; + if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &gpa)) { + printf("vmm_handle_np_fault: cannot extract faulting pa\n"); + return (EINVAL); + } + + gpa_memtype = vmm_get_guest_memtype(vcpu->vc_parent, gpa); + switch(gpa_memtype) { + case VMM_MEM_TYPE_REGULAR: + ret = vmx_fault_page(vcpu, gpa); + break; + default: + printf("unknown memory type %d for GPA 0x%llx\n", + gpa_memtype, gpa); + break; + } + + return (ret); +} + +/* + * vmx_handle_inout + * + * Exit handler for IN/OUT instructions. + * + * The vmm can handle certain IN/OUTS without exiting to vmd, but most of these + * will be passed to vmd for completion. + */ +int +vmx_handle_inout(struct vcpu *vcpu) +{ + uint64_t insn_length, exit_qual; + int ret; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("vmx_handle_inout: can't obtain instruction length\n"); + return (1); + } + + if (vmx_get_exit_qualification(&exit_qual)) { + printf("vmx_handle_inout: can't get exit qual\n"); + return (1); + } + + /* Bits 0:2 - size of exit */ + vcpu->vc_exit.vei.vei_size = (exit_qual & 0x7) + 1; + /* Bit 3 - direction */ + vcpu->vc_exit.vei.vei_dir = (exit_qual & 0x8) >> 3; + /* Bit 4 - string instruction? */ + vcpu->vc_exit.vei.vei_string = (exit_qual & 0x10) >> 4; + /* Bit 5 - REP prefix? */ + vcpu->vc_exit.vei.vei_rep = (exit_qual & 0x20) >> 5; + /* Bit 6 - Operand encoding */ + vcpu->vc_exit.vei.vei_encoding = (exit_qual & 0x40) >> 6; + /* Bit 16:31 - port */ + vcpu->vc_exit.vei.vei_port = (exit_qual & 0xFFFF0000) >> 16; + /* Data */ + vcpu->vc_exit.vei.vei_data = (uint32_t)vcpu->vc_gueststate.vg_rax; + + vcpu->vc_gueststate.vg_rip += insn_length; + + /* + * The following ports usually belong to devices owned by vmd. + * Return EAGAIN to signal help needed from userspace (vmd). 
+ * Return 0 to indicate we don't care about this port. + * + * XXX something better than a hardcoded list here, maybe + * configure via vmd via the device list in vm create params? + * + * XXX handle not eax target + */ + switch(vcpu->vc_exit.vei.vei_port) { + case 0x40 ... 0x43: + case 0x3f8 ... 0x3ff: + case 0xcf8: + case 0xcfc: + case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: + ret = EAGAIN; + break; + case IO_RTC ... IO_RTC + 1: + /* We can directly read the RTC on behalf of the guest */ + if (vcpu->vc_exit.vei.vei_dir == 1) { + vcpu->vc_gueststate.vg_rax = + inb(vcpu->vc_exit.vei.vei_port); + } + ret = 0; + break; + default: + /* Read from unsupported ports returns FFs */ + if (vcpu->vc_exit.vei.vei_dir == 1) + vcpu->vc_gueststate.vg_rax = 0xFFFFFFFF; + ret = 0; + } + + return (ret); +} + +/* + * vmx_handle_cr + * + * Handle reads/writes to control registers (except CR3) + */ +int +vmx_handle_cr(struct vcpu *vcpu) +{ + uint64_t insn_length, exit_qual; + uint8_t crnum, dir; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("vmx_handle_cr: can't obtain instruction length\n"); + return (1); + } + + if (vmx_get_exit_qualification(&exit_qual)) { + printf("vmx_handle_cr: can't get exit qual\n"); + return (1); + } + + /* Low 4 bits of exit_qual represent the CR number */ + crnum = exit_qual & 0xf; + + dir = (exit_qual & 0x30) >> 4; + + switch (dir) { + case CR_WRITE: + dprintf(("vmx_handle_cr: mov to cr%d @ %llx\n", + crnum, vcpu->vc_gueststate.vg_rip)); + break; + case CR_READ: + dprintf(("vmx_handle_cr: mov from cr%d @ %llx\n", + crnum, vcpu->vc_gueststate.vg_rip)); + break; + case CR_CLTS: + dprintf(("vmx_handle_cr: clts instruction @ %llx\n", + vcpu->vc_gueststate.vg_rip)); + break; + case CR_LMSW: + dprintf(("vmx_handle_cr: lmsw instruction @ %llx\n", + vcpu->vc_gueststate.vg_rip)); + break; + default: + dprintf(("vmx_handle_cr: unknown cr access @ %llx\n", + vcpu->vc_gueststate.vg_rip)); + } + + vcpu->vc_gueststate.vg_rip += insn_length; + + 
return (0); +} + +/* + * vmx_handle_cpuid + * + * Exit handler for CPUID instruction + */ +int +vmx_handle_cpuid(struct vcpu *vcpu) +{ + uint64_t insn_length; + uint64_t *rax, *rbx, *rcx, *rdx; + + if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { + printf("vmx_handle_cpuid: can't obtain instruction length\n"); + return (1); + } + + /* All CPUID instructions are 0x0F 0xA2 */ + KASSERT(insn_length == 2); + + rax = &vcpu->vc_gueststate.vg_rax; + rbx = &vcpu->vc_gueststate.vg_rbx; + rcx = &vcpu->vc_gueststate.vg_rcx; + rdx = &vcpu->vc_gueststate.vg_rdx; + + switch (*rax) { + case 0x00: /* Max level and vendor ID */ + *rax = 0x07; /* cpuid_level */ + *rbx = *((uint32_t *)&cpu_vendor); + *rcx = *((uint32_t *)&cpu_vendor + 1); + *rdx = *((uint32_t *)&cpu_vendor + 2); + break; + case 0x01: /* Version, brand, feature info */ + *rax = cpu_id; + /* mask off host's APIC ID, reset to vcpu id */ + *rbx = cpu_ebxfeature & 0x00FFFFFF; + *rbx &= (vcpu->vc_id & 0xFF) << 24; + /* + * clone host capabilities minus: + * speedstep (CPUIDECX_EST) + * vmx (CPUIDECX_VMX) + * xsave (CPUIDECX_XSAVE) + * thermal (CPUIDECX_TM2, CPUID_ACPI, CPUID_TM) + * XXX - timestamp (CPUID_TSC) + * monitor/mwait (CPUIDECX_MWAIT) + * performance monitoring (CPUIDECX_PDCM) + * plus: + * hypervisor (CPUIDECX_HV) + */ + *rcx = (cpu_ecxfeature | CPUIDECX_HV) & + ~(CPUIDECX_EST | CPUIDECX_TM2 | + CPUIDECX_MWAIT | CPUIDECX_PDCM | + CPUIDECX_VMX | CPUIDECX_XSAVE); + *rdx = curcpu()->ci_feature_flags & + ~(CPUID_ACPI | CPUID_TM | CPUID_TSC); + break; + case 0x02: /* Cache and TLB information */ + dprintf(("vmx_handle_cpuid: function 0x02 (cache/TLB) not" + " supported\n")); + break; + case 0x03: /* Processor serial number (not supported) */ + *rax = 0; + *rbx = 0; + *rcx = 0; + *rdx = 0; + break; + case 0x04: + dprintf(("vmx_handle_cpuid: function 0x04 (deterministic " + "cache info) not supported\n")); + break; + case 0x05: /* MONITOR/MWAIT (not supported) */ + *rax = 0; + *rbx = 0; + *rcx = 0; + *rdx = 0; + 
break; + case 0x06: /* Thermal / Power management */ + /* Only ARAT is exposed in function 0x06 */ + *rax = TPM_ARAT; + *rbx = 0; + *rcx = 0; + *rdx = 0; + break; + case 0x07: /* SEFF */ + if (*rcx == 0) { + *rax = 0; /* Highest subleaf supported */ + *rbx = curcpu()->ci_feature_sefflags_ebx; + *rcx = curcpu()->ci_feature_sefflags_ecx; + *rdx = 0; + } else { + /* Unsupported subleaf */ + *rax = 0; + *rbx = 0; + *rcx = 0; + *rdx = 0; + } + break; + case 0x09: /* Direct Cache Access (not supported) */ + dprintf(("vmx_handle_cpuid: function 0x09 (direct cache access)" + " not supported\n")); + break; + case 0x0a: /* Architectural performance monitoring */ + *rax = 0; + *rbx = 0; + *rcx = 0; + *rdx = 0; + break; + case 0x0b: /* Extended topology enumeration (not supported) */ + dprintf(("vmx_handle_cpuid: function 0x0b (topology enumeration)" + " not supported\n")); + break; + case 0x0d: /* Processor ext. state information (not supported) */ + dprintf(("vmx_handle_cpuid: function 0x0d (ext. 
state info)" + " not supported\n")); + break; + case 0x0f: /* QoS info (not supported) */ + dprintf(("vmx_handle_cpuid: function 0x0f (QoS info)" + " not supported\n")); + break; + case 0x14: /* Processor Trace info (not supported) */ + dprintf(("vmx_handle_cpuid: function 0x14 (processor trace info)" + " not supported\n")); + break; + case 0x15: /* TSC / Core Crystal Clock info (not supported) */ + dprintf(("vmx_handle_cpuid: function 0x15 (TSC / CCC info)" + " not supported\n")); + break; + case 0x16: /* Processor frequency info (not supported) */ + dprintf(("vmx_handle_cpuid: function 0x16 (frequency info)" + " not supported\n")); + break; + case 0x40000000: /* Hypervisor information */ + *rax = 0; + *rbx = *((uint32_t *)&vmm_hv_signature[0]); + *rcx = *((uint32_t *)&vmm_hv_signature[4]); + *rdx = *((uint32_t *)&vmm_hv_signature[8]); + break; + case 0x80000000: /* Extended function level */ + *rax = 0x80000007; /* curcpu()->ci_pnfeatset */ + *rbx = 0; + *rcx = 0; + *rdx = 0; + case 0x80000001: /* Extended function info */ + *rax = curcpu()->ci_efeature_eax; + *rbx = 0; /* Reserved */ + *rcx = curcpu()->ci_efeature_ecx; + *rdx = curcpu()->ci_feature_eflags; + break; + case 0x80000002: /* Brand string */ + *rax = curcpu()->ci_brand[0]; + *rbx = curcpu()->ci_brand[1]; + *rcx = curcpu()->ci_brand[2]; + *rdx = curcpu()->ci_brand[3]; + break; + case 0x80000003: /* Brand string */ + *rax = curcpu()->ci_brand[4]; + *rbx = curcpu()->ci_brand[5]; + *rcx = curcpu()->ci_brand[6]; + *rdx = curcpu()->ci_brand[7]; + break; + case 0x80000004: /* Brand string */ + *rax = curcpu()->ci_brand[8]; + *rbx = curcpu()->ci_brand[9]; + *rcx = curcpu()->ci_brand[10]; + *rdx = curcpu()->ci_brand[11]; + break; + case 0x80000005: /* Reserved (Intel), cacheinfo (AMD) */ + *rax = curcpu()->ci_amdcacheinfo[0]; + *rbx = curcpu()->ci_amdcacheinfo[1]; + *rcx = curcpu()->ci_amdcacheinfo[2]; + *rdx = curcpu()->ci_amdcacheinfo[3]; + break; + case 0x80000006: /* ext. 
cache info */ + *rax = curcpu()->ci_extcacheinfo[0]; + *rbx = curcpu()->ci_extcacheinfo[1]; + *rcx = curcpu()->ci_extcacheinfo[2]; + *rdx = curcpu()->ci_extcacheinfo[3]; + break; + case 0x80000007: /* apmi */ + *rax = 0; /* Reserved */ + *rbx = 0; /* Reserved */ + *rcx = 0; /* Reserved */ + *rdx = cpu_apmi_edx; + break; + case 0x80000008: /* Phys bits info and topology (AMD) */ + dprintf(("vmx_handle_cpuid: function 0x80000008 (phys bits info)" + " not supported\n")); + break; + default: + dprintf(("vmx_handle_cpuid: unsupported rax=0x%llx\n", *rax)); + } + + vcpu->vc_gueststate.vg_rip += insn_length; + + return (0); +} + +/* + * vcpu_run_svm + * + * VMM main loop used to run a VCPU. + */ +int +vcpu_run_svm(struct vcpu *vcpu, uint8_t from_exit) +{ + /* XXX removed due to rot */ + return (0); +} + +/* + * vmx_fix_ept_pte + * + * Fixes up the pmap PTE entry for 'addr' to reflect proper EPT format + */ +int +vmx_fix_ept_pte(struct pmap *pmap, vaddr_t addr) +{ + int offs, level; + + level = pmap_fix_ept(pmap, addr, &offs); + KASSERT(level == 0); + + return (0); +} + +/* + * vmx_exit_reason_decode + * + * Returns a human readable string describing exit type 'code' + */ +const char * +vmx_exit_reason_decode(uint32_t code) +{ + switch(code) { + case VMX_EXIT_NMI: return "NMI"; + case VMX_EXIT_EXTINT: return "external interrupt"; + case VMX_EXIT_TRIPLE_FAULT: return "triple fault"; + case VMX_EXIT_INIT: return "INIT signal"; + case VMX_EXIT_SIPI: return "SIPI signal"; + case VMX_EXIT_IO_SMI: return "I/O SMI"; + case VMX_EXIT_OTHER_SMI: return "other SMI"; + case VMX_EXIT_INT_WINDOW: return "interrupt window"; + case VMX_EXIT_NMI_WINDOW: return "NMI window"; + case VMX_EXIT_TASK_SWITCH: return "task switch"; + case VMX_EXIT_CPUID: return "CPUID instruction"; + case VMX_EXIT_GETSEC: return "GETSEC instruction"; + case VMX_EXIT_HLT: return "HLT instruction"; + case VMX_EXIT_INVD: return "INVD instruction"; + case VMX_EXIT_INVLPG: return "INVLPG instruction"; + case 
VMX_EXIT_RDPMC: return "RDPMC instruction"; + case VMX_EXIT_RDTSC: return "RDTSC instruction"; + case VMX_EXIT_RSM: return "RSM instruction"; + case VMX_EXIT_VMCALL: return "VMCALL instruction"; + case VMX_EXIT_VMCLEAR: return "VMCLEAR instruction"; + case VMX_EXIT_VMLAUNCH: return "VMLAUNCH instruction"; + case VMX_EXIT_VMPTRLD: return "VMPTRLD instruction"; + case VMX_EXIT_VMPTRST: return "VMPTRST instruction"; + case VMX_EXIT_VMREAD: return "VMREAD instruction"; + case VMX_EXIT_VMRESUME: return "VMRESUME instruction"; + case VMX_EXIT_VMWRITE: return "VMWRITE instruction"; + case VMX_EXIT_VMXOFF: return "VMXOFF instruction"; + case VMX_EXIT_VMXON: return "VMXON instruction"; + case VMX_EXIT_CR_ACCESS: return "CR access"; + case VMX_EXIT_MOV_DR: return "MOV DR instruction"; + case VMX_EXIT_IO: return "I/O instruction"; + case VMX_EXIT_RDMSR: return "RDMSR instruction"; + case VMX_EXIT_WRMSR: return "WRMSR instruction"; + case VMX_EXIT_ENTRY_FAILED_GUEST_STATE: return "guest state invalid"; + case VMX_EXIT_ENTRY_FAILED_MSR_LOAD: return "MSR load failed"; + case VMX_EXIT_MWAIT: return "MWAIT instruction"; + case VMX_EXIT_MTF: return "monitor trap flag"; + case VMX_EXIT_MONITOR: return "MONITOR instruction"; + case VMX_EXIT_PAUSE: return "PAUSE instruction"; + case VMX_EXIT_ENTRY_FAILED_MCE: return "MCE during entry"; + case VMX_EXIT_TPR_BELOW_THRESHOLD: return "TPR below threshold"; + case VMX_EXIT_APIC_ACCESS: return "APIC access"; + case VMX_EXIT_VIRTUALIZED_EOI: return "virtualized EOI"; + case VMX_EXIT_GDTR_IDTR: return "GDTR/IDTR access"; + case VMX_EXIT_LDTR_TR: return "LDTR/TR access"; + case VMX_EXIT_EPT_VIOLATION: return "EPT violation"; + case VMX_EXIT_EPT_MISCONFIGURATION: return "EPT misconfiguration"; + case VMX_EXIT_INVEPT: return "INVEPT instruction"; + case VMX_EXIT_RDTSCP: return "RDTSCP instruction"; + case VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED: + return "preemption timer expired"; + case VMX_EXIT_INVVPID: return "INVVPID instruction"; + case 
VMX_EXIT_WBINVD: return "WBINVD instruction"; + case VMX_EXIT_XSETBV: return "XSETBV instruction"; + case VMX_EXIT_APIC_WRITE: return "APIC write"; + case VMX_EXIT_RDRAND: return "RDRAND instruction"; + case VMX_EXIT_INVPCID: return "INVPCID instruction"; + case VMX_EXIT_VMFUNC: return "VMFUNC instruction"; + default: return "unknown"; + } +} + +/* + * vmx_instruction_error_decode + * + * Returns a human readable string describing the instruction error in 'code' + */ +const char * +vmx_instruction_error_decode(uint32_t code) +{ + switch(code) { + case 1: return "VMCALL: unsupported in VMX root"; + case 2: return "VMCLEAR: invalid paddr"; + case 3: return "VMCLEAR: VMXON pointer"; + case 4: return "VMLAUNCH: non-clear VMCS"; + case 5: return "VMRESUME: non-launched VMCS"; + case 6: return "VMRESUME: executed after VMXOFF"; + case 7: return "VM entry: invalid control field(s)"; + case 8: return "VM entry: invalid host state field(s)"; + case 9: return "VMPTRLD: invalid paddr"; + case 10: return "VMPTRLD: VMXON pointer"; + case 11: return "VMPTRLD: incorrect VMCS revid"; + case 12: return "VMREAD/VMWRITE: unsupported VMCS field"; + case 13: return "VMWRITE: RO VMCS field"; + case 15: return "VMXON: unsupported in VMX root"; + case 20: return "VMCALL: invalid VM exit control fields"; + case 26: return "VM entry: blocked by MOV SS"; + case 28: return "Invalid operand to INVEPT/INVVPID"; + default: return "unknown"; + } +} + +/* + * dump_vcpu + * + * Dumps the VMX capabilites of vcpu 'vcpu' + */ +void +dump_vcpu(struct vcpu *vcpu) +{ + printf("vcpu @ 0x%llx\n", (uint64_t)vcpu); + printf(" parent vm @ 0x%llx\n", (uint64_t)vcpu->vc_parent); + printf(" mode: "); + if (vcpu->vc_virt_mode == VMM_MODE_VMX || + vcpu->vc_virt_mode == VMM_MODE_EPT) { + printf("VMX\n"); + printf(" pinbased ctls: 0x%llx\n", + vcpu->vc_vmx_pinbased_ctls); + printf(" true pinbased ctls: 0x%llx\n", + vcpu->vc_vmx_true_pinbased_ctls); + CTRL_DUMP(vcpu, PINBASED, EXTERNAL_INT_EXITING); + CTRL_DUMP(vcpu, 
PINBASED, NMI_EXITING); + CTRL_DUMP(vcpu, PINBASED, VIRTUAL_NMIS); + CTRL_DUMP(vcpu, PINBASED, ACTIVATE_VMX_PREEMPTION_TIMER); + CTRL_DUMP(vcpu, PINBASED, PROCESS_POSTED_INTERRUPTS); + printf(" procbased ctls: 0x%llx\n", + vcpu->vc_vmx_procbased_ctls); + printf(" true procbased ctls: 0x%llx\n", + vcpu->vc_vmx_true_procbased_ctls); + CTRL_DUMP(vcpu, PROCBASED, INTERRUPT_WINDOW_EXITING); + CTRL_DUMP(vcpu, PROCBASED, USE_TSC_OFFSETTING); + CTRL_DUMP(vcpu, PROCBASED, HLT_EXITING); + CTRL_DUMP(vcpu, PROCBASED, INVLPG_EXITING); + CTRL_DUMP(vcpu, PROCBASED, MWAIT_EXITING); + CTRL_DUMP(vcpu, PROCBASED, RDPMC_EXITING); + CTRL_DUMP(vcpu, PROCBASED, RDTSC_EXITING); + CTRL_DUMP(vcpu, PROCBASED, CR3_LOAD_EXITING); + CTRL_DUMP(vcpu, PROCBASED, CR3_STORE_EXITING); + CTRL_DUMP(vcpu, PROCBASED, CR8_LOAD_EXITING); + CTRL_DUMP(vcpu, PROCBASED, CR8_STORE_EXITING); + CTRL_DUMP(vcpu, PROCBASED, USE_TPR_SHADOW); + CTRL_DUMP(vcpu, PROCBASED, NMI_WINDOW_EXITING); + CTRL_DUMP(vcpu, PROCBASED, MOV_DR_EXITING); + CTRL_DUMP(vcpu, PROCBASED, UNCONDITIONAL_IO_EXITING); + CTRL_DUMP(vcpu, PROCBASED, USE_IO_BITMAPS); + if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { + printf(" procbased2 ctls: 0x%llx\n", + vcpu->vc_vmx_procbased2_ctls); + CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_APIC); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_EPT); + CTRL_DUMP(vcpu, PROCBASED2, DESCRIPTOR_TABLE_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_RDTSCP); + CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_X2APIC_MODE); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VPID); + CTRL_DUMP(vcpu, PROCBASED2, WBINVD_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, UNRESTRICTED_GUEST); + CTRL_DUMP(vcpu, PROCBASED2, + APIC_REGISTER_VIRTUALIZATION); + CTRL_DUMP(vcpu, PROCBASED2, + VIRTUAL_INTERRUPT_DELIVERY); + CTRL_DUMP(vcpu, PROCBASED2, PAUSE_LOOP_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, RDRAND_EXITING); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_INVPCID); + CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VM_FUNCTIONS); + 
CTRL_DUMP(vcpu, PROCBASED2, VMCS_SHADOWING); + CTRL_DUMP(vcpu, PROCBASED2, EPT_VIOLATION_VE); + } + printf(" entry ctls: 0x%llx\n", + vcpu->vc_vmx_entry_ctls); + printf(" true entry ctls: 0x%llx\n", + vcpu->vc_vmx_true_entry_ctls); + CTRL_DUMP(vcpu, ENTRY, LOAD_DEBUG_CONTROLS); + CTRL_DUMP(vcpu, ENTRY, IA32E_MODE_GUEST); + CTRL_DUMP(vcpu, ENTRY, ENTRY_TO_SMM); + CTRL_DUMP(vcpu, ENTRY, DEACTIVATE_DUAL_MONITOR_TREATMENT); + CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY); + CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PAT_ON_ENTRY); + CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_EFER_ON_ENTRY); + printf(" exit ctls: 0x%llx\n", + vcpu->vc_vmx_exit_ctls); + printf(" true exit ctls: 0x%llx\n", + vcpu->vc_vmx_true_exit_ctls); + CTRL_DUMP(vcpu, EXIT, SAVE_DEBUG_CONTROLS); + CTRL_DUMP(vcpu, EXIT, HOST_SPACE_ADDRESS_SIZE); + CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, ACKNOWLEDGE_INTERRUPT_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, SAVE_IA32_PAT_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PAT_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, SAVE_IA32_EFER_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, LOAD_IA32_EFER_ON_EXIT); + CTRL_DUMP(vcpu, EXIT, SAVE_VMX_PREEMPTION_TIMER); + } +} diff --git a/sys/arch/amd64/amd64/vmm_support.S b/sys/arch/amd64/amd64/vmm_support.S new file mode 100644 index 00000000000..c783a2fe5c2 --- /dev/null +++ b/sys/arch/amd64/amd64/vmm_support.S @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "assym.h" +#include <machine/asm.h> +#include <machine/specialreg.h> + +/* + * XXX duplicated in vmmvar.h due to song-and-dance with sys/rwlock.h inclusion + * here + */ +#define VMX_FAIL_LAUNCH_UNKNOWN 1 +#define VMX_FAIL_LAUNCH_INVALID_VMCS 2 +#define VMX_FAIL_LAUNCH_VALID_VMCS 3 + + .text + .code64 + .align 16 + .global _C_LABEL(vmxon) + .global _C_LABEL(vmxoff) + .global _C_LABEL(vmclear) + .global _C_LABEL(vmptrld) + .global _C_LABEL(vmptrst) + .global _C_LABEL(vmwrite) + .global _C_LABEL(vmread) + .global _C_LABEL(invvpid) + .global _C_LABEL(invept) + .global _C_LABEL(vmx_enter_guest) + .global _C_LABEL(vmm_dispatch_intr) +_C_LABEL(vmm_dispatch_intr): + movq %rsp, %r11 /* r11 = temporary register */ + andq $0xFFFFFFFFFFFFFFF0, %rsp + movw %ss, %ax + pushq %rax /* 64-bit push; low 16 bits hold %ss */ + pushq %r11 + pushfq + movw %cs, %ax + pushq %rax /* 64-bit push; low 16 bits hold %cs */ + cli + callq *%rdi + ret + +_C_LABEL(vmxon): + vmxon (%rdi) + jz failed_on + jc failed_on + xorq %rax, %rax + ret +failed_on: + movq $0x01, %rax + ret + +_C_LABEL(vmxoff): + vmxoff + jz failed_off + jc failed_off + xorq %rax, %rax + ret +failed_off: + movq $0x01, %rax + ret + +_C_LABEL(vmclear): + vmclear (%rdi) + jz failed_clear + jc failed_clear + xorq %rax, %rax + ret +failed_clear: + movq $0x01, %rax + ret + +_C_LABEL(vmptrld): + vmptrld (%rdi) + jz failed_ptrld + jc failed_ptrld + xorq %rax, %rax + ret +failed_ptrld: + movq $0x01, %rax + ret + +_C_LABEL(vmptrst): + vmptrst (%rdi) + jz failed_ptrst + jc failed_ptrst + xorq %rax, %rax + ret +failed_ptrst: + movq $0x01, %rax + ret + +_C_LABEL(vmwrite): + vmwrite %rsi, %rdi + jz failed_write + jc failed_write + xorq %rax, %rax + ret +failed_write: + movq 
$0x01, %rax + ret + +_C_LABEL(vmread): + vmread %rdi, (%rsi) + jz failed_read + jc failed_read + xorq %rax, %rax + ret +failed_read: + movq $0x01, %rax + ret + +_C_LABEL(invvpid): + invvpid (%rsi), %rdi + ret + +_C_LABEL(invept): + invept (%rsi), %rdi + ret + +_C_LABEL(vmx_enter_guest): + movq %rdx, %r8 /* resume flag */ + testq %r8, %r8 + jnz skip_init + + /* + * XXX make vmx_exit_handler a global and put this in the per-vcpu + * init code + */ + movq $VMCS_HOST_IA32_RIP, %rdi + movq $vmx_exit_handler_asm, %rax + vmwrite %rax, %rdi /* Host RIP */ + +skip_init: + /* + * XXX use msr list here for restore instead of all this + * stack jiggery-pokery + */ + + pushfq + + /* + * Save (possibly) lazy-switched selectors + */ + movw %es, %ax + pushw %ax + movw %ds, %ax + pushw %ax + movw %ss, %ax + pushw %ax + + movq $MSR_FSBASE, %rcx + rdmsr + pushq %rax + pushq %rdx + pushw %fs + movq $MSR_GSBASE, %rcx + rdmsr + pushq %rax + pushq %rdx + pushw %gs + movq $MSR_KERNELGSBASE, %rcx + rdmsr + pushq %rax + pushq %rdx + + /* + * Save various MSRs + */ + movq $MSR_STAR, %rcx + rdmsr + pushq %rax + pushq %rdx + + movq $MSR_LSTAR, %rcx + rdmsr + pushq %rax + pushq %rdx + + /* XXX - unused? 
*/ + movq $MSR_CSTAR, %rcx + rdmsr + pushq %rax + pushq %rdx + + movq $MSR_SFMASK, %rcx + rdmsr + pushq %rax + pushq %rdx + + /* Preserve callee-preserved registers as per AMD64 ABI */ + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + pushq %rsi /* Guest Regs Pointer */ + + movq $VMCS_HOST_IA32_RSP, %rdi + movq %rsp, %rax + vmwrite %rax, %rdi /* Host RSP */ + + testq %r8, %r8 + jnz do_resume + + /* Restore guest registers */ + movq 0x78(%rsi), %rax + movq %rax, %cr2 + movq 0x70(%rsi), %r15 + movq 0x68(%rsi), %r14 + movq 0x60(%rsi), %r13 + movq 0x58(%rsi), %r12 + movq 0x50(%rsi), %r11 + movq 0x48(%rsi), %r10 + movq 0x40(%rsi), %r9 + movq 0x38(%rsi), %r8 + movq 0x30(%rsi), %rbp + movq 0x28(%rsi), %rdi + movq 0x20(%rsi), %rdx + movq 0x18(%rsi), %rcx + movq 0x10(%rsi), %rbx + movq 0x08(%rsi), %rax + movq 0x00(%rsi), %rsi + + vmlaunch + jmp fail_launch_or_resume +do_resume: + /* Restore guest registers */ + movq 0x78(%rsi), %rax + movq %rax, %cr2 + movq 0x70(%rsi), %r15 + movq 0x68(%rsi), %r14 + movq 0x60(%rsi), %r13 + movq 0x58(%rsi), %r12 + movq 0x50(%rsi), %r11 + movq 0x48(%rsi), %r10 + movq 0x40(%rsi), %r9 + movq 0x38(%rsi), %r8 + movq 0x30(%rsi), %rbp + movq 0x28(%rsi), %rdi + movq 0x20(%rsi), %rdx + movq 0x18(%rsi), %rcx + movq 0x10(%rsi), %rbx + movq 0x08(%rsi), %rax + movq 0x00(%rsi), %rsi + vmresume +fail_launch_or_resume: + /* Failed launch/resume (fell through) */ + jc fail_launch_invalid_vmcs /* Invalid VMCS */ + jz fail_launch_valid_vmcs /* Valid VMCS, failed launch/resume */ + + /* Unknown failure mode (not documented as per Intel SDM) */ +fail_launch_unknown: + movq $VMX_FAIL_LAUNCH_UNKNOWN, %rdi + popq %rsi + jmp restore_host + +fail_launch_invalid_vmcs: + movq $VMX_FAIL_LAUNCH_INVALID_VMCS, %rdi + popq %rsi + jmp restore_host + +fail_launch_valid_vmcs: + movq $VMCS_INSTRUCTION_ERROR, %rdi + popq %rsi + vmread %rdi, %rax + /* XXX check failure of vmread */ + movl %eax, 0x80(%rsi) + movq $VMX_FAIL_LAUNCH_VALID_VMCS, %rdi + 
jmp restore_host + +vmx_exit_handler_asm: + /* Preserve guest registers not saved in VMCS */ + pushq %rsi + pushq %rdi + movq 0x10(%rsp), %rdi + movq 0x8(%rsp), %rsi + movq %rsi, (%rdi) + popq %rdi + popq %rsi /* discard */ + + popq %rsi + movq %rax, 0x8(%rsi) + movq %rbx, 0x10(%rsi) + movq %rcx, 0x18(%rsi) + movq %rdx, 0x20(%rsi) + movq %rdi, 0x28(%rsi) + movq %rbp, 0x30(%rsi) + movq %r8, 0x38(%rsi) + movq %r9, 0x40(%rsi) + movq %r10, 0x48(%rsi) + movq %r11, 0x50(%rsi) + movq %r12, 0x58(%rsi) + movq %r13, 0x60(%rsi) + movq %r14, 0x68(%rsi) + movq %r15, 0x70(%rsi) + movq %cr2, %rax + movq %rax, 0x78(%rsi) + + /* %rdi = 0 means we took an exit */ + xorq %rdi, %rdi + +restore_host: + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + + /* + * Restore saved MSRs + */ + popq %rdx + popq %rax + movq $MSR_SFMASK, %rcx + wrmsr + + /* XXX - unused? */ + popq %rdx + popq %rax + movq $MSR_CSTAR, %rcx + wrmsr + + popq %rdx + popq %rax + movq $MSR_LSTAR, %rcx + wrmsr + + popq %rdx + popq %rax + movq $MSR_STAR, %rcx + wrmsr + + /* + * popw %gs will reset gsbase to 0, so preserve it + * first. This is to accommodate possibly lazy-switched + * selectors from above + */ + cli + popq %rdx + popq %rax + movq $MSR_KERNELGSBASE, %rcx + wrmsr + + popw %gs + popq %rdx + popq %rax + movq $MSR_GSBASE, %rcx + wrmsr + + popw %fs + popq %rdx + popq %rax + movq $MSR_FSBASE, %rcx + wrmsr + sti + + popw %ax + movw %ax, %ss + popw %ax + movw %ax, %ds + popw %ax + movw %ax, %es + + popfq + + movq %rdi, %rax + ret + diff --git a/sys/arch/amd64/conf/GENERIC b/sys/arch/amd64/conf/GENERIC index 5150f3ee287..a77f7dae17b 100644 --- a/sys/arch/amd64/conf/GENERIC +++ b/sys/arch/amd64/conf/GENERIC @@ -1,4 +1,4 @@ -# $OpenBSD: GENERIC,v 1.399 2015/10/29 07:47:02 kettenis Exp $ +# $OpenBSD: GENERIC,v 1.400 2015/11/13 07:52:20 mlarkin Exp $ # # For further information on compiling OpenBSD kernels, see the config(8) # man page. 
@@ -24,6 +24,7 @@ option MTRR # CPU memory range attributes control option NTFS # NTFS support option HIBERNATE # Hibernate support +#option VMM # VMM support config bsd swap generic @@ -37,6 +38,7 @@ isa0 at pcib? isa0 at amdpcib? isa0 at tcpcib? pci* at mainbus0 +#vmm0 at mainbus0 pvbus0 at mainbus0 acpi0 at bios0 diff --git a/sys/arch/amd64/conf/Makefile.amd64 b/sys/arch/amd64/conf/Makefile.amd64 index c9dcb318834..c0a84a967a7 100644 --- a/sys/arch/amd64/conf/Makefile.amd64 +++ b/sys/arch/amd64/conf/Makefile.amd64 @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile.amd64,v 1.65 2015/01/13 01:12:49 deraadt Exp $ +# $OpenBSD: Makefile.amd64,v 1.66 2015/11/13 07:52:20 mlarkin Exp $ # For instructions on building kernels consult the config(8) and options(4) # manual pages. @@ -142,7 +142,7 @@ db_structinfo.h: $S/ddb/db_structinfo.c $S/ddb/parse_structinfo.pl rm -f db_structinfo.o locore.o: ${_machdir}/${_mach}/locore.S assym.h -mutex.o vector.o copy.o spl.o mptramp.o acpi_wakecode.o: assym.h +mutex.o vector.o copy.o spl.o mptramp.o acpi_wakecode.o vmm_support.o: assym.h # The install target can be redefined by putting a # install-kernel-${MACHINE_NAME} target into /etc/mk.conf diff --git a/sys/arch/amd64/conf/files.amd64 b/sys/arch/amd64/conf/files.amd64 index 7a082997b2c..cca555c839b 100644 --- a/sys/arch/amd64/conf/files.amd64 +++ b/sys/arch/amd64/conf/files.amd64 @@ -1,4 +1,4 @@ -# $OpenBSD: files.amd64,v 1.82 2015/10/29 07:47:02 kettenis Exp $ +# $OpenBSD: files.amd64,v 1.83 2015/11/13 07:52:20 mlarkin Exp $ maxpartitions 16 maxusers 2 16 128 @@ -227,6 +227,14 @@ file arch/amd64/amd64/acpi_machdep.c acpi file arch/amd64/amd64/acpi_wakecode.S acpi & !small_kernel # +# VMM +# +device vmm {} +attach vmm at mainbus +file arch/amd64/amd64/vmm.c vmm & !small_kernel needs-flag +file arch/amd64/amd64/vmm_support.S vmm & !small_kernel + +# # Machine-independent SD/MMC drivers # include "dev/sdmmc/files.sdmmc" diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h 
index 678a2b1c8f3..e772200f8d4 100644 --- a/sys/arch/amd64/include/cpu.h +++ b/sys/arch/amd64/include/cpu.h @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.h,v 1.97 2015/07/02 01:33:59 dlg Exp $ */ +/* $OpenBSD: cpu.h,v 1.98 2015/11/13 07:52:20 mlarkin Exp $ */ /* $NetBSD: cpu.h,v 1.1 2003/04/26 18:39:39 fvdl Exp $ */ /*- @@ -54,6 +54,36 @@ #ifdef _KERNEL +#ifdef VMM +/* VMXON region (Intel) */ +struct vmxon_region { + uint32_t vr_revision; +}; + +/* + * VMX for Intel CPUs + */ +struct vmx { + uint64_t vmx_cr0_fixed0; + uint64_t vmx_cr0_fixed1; + uint64_t vmx_cr4_fixed0; + uint64_t vmx_cr4_fixed1; + uint32_t vmx_vmxon_revision; + uint32_t vmx_msr_table_size; +}; + +/* + * SVM for AMD CPUs + */ +struct svm { +}; + +union vmm_cpu_cap { + struct vmx vcc_vmx; + struct svm vcc_svm; +}; +#endif /* VMM */ + struct x86_64_tss; struct cpu_info { struct device *ci_dev; @@ -91,8 +121,15 @@ struct cpu_info { u_int32_t ci_feature_flags; u_int32_t ci_feature_eflags; - u_int32_t ci_feature_sefflags; + u_int32_t ci_feature_sefflags_ebx; + u_int32_t ci_feature_sefflags_ecx; u_int32_t ci_feature_tpmflags; + u_int32_t ci_pnfeatset; + u_int32_t ci_efeature_eax; + u_int32_t ci_efeature_ecx; + u_int32_t ci_brand[12]; + u_int32_t ci_amdcacheinfo[4]; + u_int32_t ci_extcacheinfo[4]; u_int32_t ci_signature; u_int32_t ci_family; u_int32_t ci_model; @@ -140,6 +177,16 @@ struct cpu_info { #ifdef GPROF struct gmonparam *ci_gmon; #endif +#ifdef VMM + u_int32_t ci_vmm_flags; +#define CI_VMM_VMX (1 << 0) +#define CI_VMM_SVM (1 << 1) +#define CI_VMM_RVI (1 << 2) +#define CI_VMM_EPT (1 << 3) + union vmm_cpu_cap ci_vmm_cap; + paddr_t ci_vmxon_region_pa; + struct vmxon_region *ci_vmxon_region; +#endif /* VMM */ }; #define CPUF_BSP 0x0001 /* CPU is the original BSP */ @@ -159,6 +206,7 @@ struct cpu_info { #define CPUF_PAUSE 0x4000 /* CPU is paused in DDB */ #define CPUF_GO 0x8000 /* CPU should start running */ #define CPUF_PARK 0x10000 /* CPU should self-park in real mode */ +#define CPUF_VMM 0x20000 /* CPU is 
executing in VMM mode */ #define PROC_PC(p) ((p)->p_md.md_regs->tf_rip) #define PROC_STACK(p) ((p)->p_md.md_regs->tf_rsp) @@ -282,6 +330,7 @@ extern int biosbasemem; extern int biosextmem; extern int cpu; extern int cpu_feature; +extern int cpu_ebxfeature; extern int cpu_ecxfeature; extern int cpu_perf_eax; extern int cpu_perf_ebx; diff --git a/sys/arch/amd64/include/intrdefs.h b/sys/arch/amd64/include/intrdefs.h index 650d40ab5da..0d152869613 100644 --- a/sys/arch/amd64/include/intrdefs.h +++ b/sys/arch/amd64/include/intrdefs.h @@ -1,4 +1,4 @@ -/* $OpenBSD: intrdefs.h,v 1.13 2015/02/10 05:35:19 mlarkin Exp $ */ +/* $OpenBSD: intrdefs.h,v 1.14 2015/11/13 07:52:20 mlarkin Exp $ */ /* $NetBSD: intrdefs.h,v 1.2 2003/05/04 22:01:56 fvdl Exp $ */ #ifndef _AMD64_INTRDEFS_H @@ -79,12 +79,15 @@ #define X86_IPI_MTRR 0x00000020 #define X86_IPI_SETPERF 0x00000040 #define X86_IPI_DDB 0x00000080 +#define X86_IPI_START_VMM 0x00000100 +#define X86_IPI_STOP_VMM 0x00000200 -#define X86_NIPI 8 +#define X86_NIPI 10 #define X86_IPI_NAMES { "halt IPI", "nop IPI", "FPU flush IPI", \ "FPU synch IPI", "TLB shootdown IPI", \ - "MTRR update IPI", "setperf IPI", "ddb IPI" } + "MTRR update IPI", "setperf IPI", "ddb IPI", \ + "VMM start IPI", "VMM stop IPI" } #define IREENT_MAGIC 0x18041969 diff --git a/sys/arch/amd64/include/pmap.h b/sys/arch/amd64/include/pmap.h index aaa9638984f..2a87c431671 100644 --- a/sys/arch/amd64/include/pmap.h +++ b/sys/arch/amd64/include/pmap.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.h,v 1.60 2015/11/10 08:57:39 mlarkin Exp $ */ +/* $OpenBSD: pmap.h,v 1.61 2015/11/13 07:52:20 mlarkin Exp $ */ /* $NetBSD: pmap.h,v 1.1 2003/04/26 18:39:46 fvdl Exp $ */ /* @@ -369,6 +369,7 @@ static void pmap_update_pg(vaddr_t); static void pmap_update_2pg(vaddr_t,vaddr_t); void pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t); +int pmap_fix_ept(struct pmap *, vaddr_t, int *); vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */ diff --git 
a/sys/arch/amd64/include/specialreg.h b/sys/arch/amd64/include/specialreg.h index 3899a140e06..f6bed264874 100644 --- a/sys/arch/amd64/include/specialreg.h +++ b/sys/arch/amd64/include/specialreg.h @@ -1,4 +1,4 @@ -/* $OpenBSD: specialreg.h,v 1.37 2015/06/07 08:11:50 guenther Exp $ */ +/* $OpenBSD: specialreg.h,v 1.38 2015/11/13 07:52:20 mlarkin Exp $ */ /* $NetBSD: specialreg.h,v 1.1 2003/04/26 18:39:48 fvdl Exp $ */ /* $NetBSD: x86/specialreg.h,v 1.2 2003/04/25 21:54:30 fvdl Exp $ */ @@ -172,7 +172,6 @@ * "Structured Extended Feature Flags Parameters" (CPUID function 0x7, leaf 0) * EBX bits */ - #define SEFF0EBX_FSGSBASE 0x00000001 /* {RD,WR}[FG]SBASE instructions */ #define SEFF0EBX_BMI1 0x00000008 /* advanced bit manipulation */ #define SEFF0EBX_HLE 0x00000010 /* Hardware Lock Elision */ @@ -185,6 +184,9 @@ #define SEFF0EBX_RDSEED 0x00040000 /* RDSEED instruction */ #define SEFF0EBX_ADX 0x00080000 /* ADCX/ADOX instructions */ #define SEFF0EBX_SMAP 0x00100000 /* Supervisor mode access prevent */ +/* SEFF ECX bits */ +#define SEFF0ECX_PREFETCHWT1 0x00000001 /* PREFETCHWT1 instruction */ +#define SEFF0ECX_PKU 0x00000008 /* Page prot keys for user mode */ /* * Thermal and Power Management (CPUID function 0x6) EAX bits @@ -286,6 +288,7 @@ #define MSR_EBL_CR_POWERON 0x02a #define MSR_EBC_FREQUENCY_ID 0x02c /* Pentium 4 only */ #define MSR_TEST_CTL 0x033 +#define MSR_IA32_FEATURE_CONTROL 0x03a #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 /* PII+ only */ #define MSR_BBL_CR_D1 0x089 /* PII+ only */ @@ -403,6 +406,7 @@ #define EFER_LME 0x00000100 /* Long Mode Enabled */ #define EFER_LMA 0x00000400 /* Long Mode Active */ #define EFER_NXE 0x00000800 /* No-Execute Enabled */ +#define EFER_SVME 0x00001000 /* SVM Enabled */ #define MSR_STAR 0xc0000081 /* 32 bit syscall gate addr */ #define MSR_LSTAR 0xc0000082 /* 64 bit syscall gate addr */ @@ -797,3 +801,226 @@ #define C3_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */ #define C3_CRYPT_CWLO_KEY192 
0x0000040c /* 192bit, 12 rds */ #define C3_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */ + +/* + * VMX + */ +#define IA32_FEATURE_CONTROL_LOCK 0x01 +#define IA32_FEATURE_CONTROL_SMX_EN 0x02 +#define IA32_FEATURE_CONTROL_VMX_EN 0x04 +#define IA32_VMX_BASIC 0x480 +#define IA32_VMX_PINBASED_CTLS 0x481 +#define IA32_VMX_PROCBASED_CTLS 0x482 +#define IA32_VMX_EXIT_CTLS 0x483 +#define IA32_VMX_ENTRY_CTLS 0x484 +#define IA32_VMX_MISC 0x485 +#define IA32_VMX_CR0_FIXED0 0x486 +#define IA32_VMX_CR0_FIXED1 0x487 +#define IA32_VMX_CR4_FIXED0 0x488 +#define IA32_VMX_CR4_FIXED1 0x489 +#define IA32_VMX_PROCBASED2_CTLS 0x48B +#define IA32_VMX_EPT_VPID_CAP 0x48C +#define IA32_VMX_TRUE_PINBASED_CTLS 0x48D +#define IA32_VMX_TRUE_PROCBASED_CTLS 0x48E +#define IA32_VMX_TRUE_EXIT_CTLS 0x48F +#define IA32_VMX_TRUE_ENTRY_CTLS 0x490 + +#define IA32_EPT_VPID_CAP_PAGE_WALK_4 (1ULL << 6) +#define IA32_EPT_VPID_CAP_WB (1ULL << 14) +#define IA32_EPT_VPID_CAP_AD_BITS (1ULL << 21) + +#define IA32_EPT_PAGING_CACHE_TYPE_UC 0x0 +#define IA32_EPT_PAGING_CACHE_TYPE_WB 0x6 +#define IA32_EPT_AD_BITS_ENABLE (1ULL << 6) +#define IA32_EPT_PAGE_WALK_LENGTH 0x4 + +/* VMX : IA32_VMX_BASIC bits */ +#define IA32_VMX_TRUE_CTLS_AVAIL (1ULL << 55) + +/* VMX : IA32_VMX_PINBASED_CTLS bits */ +#define IA32_VMX_EXTERNAL_INT_EXITING (1ULL << 0) +#define IA32_VMX_NMI_EXITING (1ULL << 3) +#define IA32_VMX_VIRTUAL_NMIS (1ULL << 5) +#define IA32_VMX_ACTIVATE_VMX_PREEMPTION_TIMER (1ULL << 6) +#define IA32_VMX_PROCESS_POSTED_INTERRUPTS (1ULL << 7) + +/* VMX : IA32_VMX_PROCBASED_CTLS bits */ +#define IA32_VMX_INTERRUPT_WINDOW_EXITING (1ULL << 2) +#define IA32_VMX_USE_TSC_OFFSETTING (1ULL << 3) +#define IA32_VMX_HLT_EXITING (1ULL << 7) +#define IA32_VMX_INVLPG_EXITING (1ULL << 9) +#define IA32_VMX_MWAIT_EXITING (1ULL << 10) +#define IA32_VMX_RDPMC_EXITING (1ULL << 11) +#define IA32_VMX_RDTSC_EXITING (1ULL << 12) +#define IA32_VMX_CR3_LOAD_EXITING (1ULL << 15) +#define IA32_VMX_CR3_STORE_EXITING (1ULL << 16) +#define 
IA32_VMX_CR8_LOAD_EXITING (1ULL << 19) +#define IA32_VMX_CR8_STORE_EXITING (1ULL << 20) +#define IA32_VMX_USE_TPR_SHADOW (1ULL << 21) +#define IA32_VMX_NMI_WINDOW_EXITING (1ULL << 22) +#define IA32_VMX_MOV_DR_EXITING (1ULL << 23) +#define IA32_VMX_UNCONDITIONAL_IO_EXITING (1ULL << 24) +#define IA32_VMX_USE_IO_BITMAPS (1ULL << 25) +#define IA32_VMX_MONITOR_TRAP_FLAG (1ULL << 27) +#define IA32_VMX_USE_MSR_BITMAPS (1ULL << 28) +#define IA32_VMX_MONITOR_EXITING (1ULL << 29) +#define IA32_VMX_PAUSE_EXITING (1ULL << 30) +#define IA32_VMX_ACTIVATE_SECONDARY_CONTROLS (1ULL << 31) + +/* VMX : IA32_VMX_PROCBASED2_CTLS bits */ +#define IA32_VMX_VIRTUALIZE_APIC (1ULL << 0) +#define IA32_VMX_ENABLE_EPT (1ULL << 1) +#define IA32_VMX_DESCRIPTOR_TABLE_EXITING (1ULL << 2) +#define IA32_VMX_ENABLE_RDTSCP (1ULL << 3) +#define IA32_VMX_VIRTUALIZE_X2APIC_MODE (1ULL << 4) +#define IA32_VMX_ENABLE_VPID (1ULL << 5) +#define IA32_VMX_WBINVD_EXITING (1ULL << 6) +#define IA32_VMX_UNRESTRICTED_GUEST (1ULL << 7) +#define IA32_VMX_APIC_REGISTER_VIRTUALIZATION (1ULL << 8) +#define IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY (1ULL << 9) +#define IA32_VMX_PAUSE_LOOP_EXITING (1ULL << 10) +#define IA32_VMX_RDRAND_EXITING (1ULL << 11) +#define IA32_VMX_ENABLE_INVPCID (1ULL << 12) +#define IA32_VMX_ENABLE_VM_FUNCTIONS (1ULL << 13) +#define IA32_VMX_VMCS_SHADOWING (1ULL << 14) +#define IA32_VMX_EPT_VIOLATION_VE (1ULL << 18) + +/* VMX : IA32_VMX_EXIT_CTLS bits */ +#define IA32_VMX_SAVE_DEBUG_CONTROLS (1ULL << 2) +#define IA32_VMX_HOST_SPACE_ADDRESS_SIZE (1ULL << 9) +#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT (1ULL << 12) +#define IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT (1ULL << 15) +#define IA32_VMX_SAVE_IA32_PAT_ON_EXIT (1ULL << 18) +#define IA32_VMX_LOAD_IA32_PAT_ON_EXIT (1ULL << 19) +#define IA32_VMX_SAVE_IA32_EFER_ON_EXIT (1ULL << 20) +#define IA32_VMX_LOAD_IA32_EFER_ON_EXIT (1ULL << 21) +#define IA32_VMX_SAVE_VMX_PREEMPTION_TIMER (1ULL << 22) + +/* VMX: IA32_VMX_ENTRY_CTLS bits */ +#define 
IA32_VMX_LOAD_DEBUG_CONTROLS (1ULL << 2) +#define IA32_VMX_IA32E_MODE_GUEST (1ULL << 9) +#define IA32_VMX_ENTRY_TO_SMM (1ULL << 10) +#define IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT (1ULL << 11) +#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY (1ULL << 13) +#define IA32_VMX_LOAD_IA32_PAT_ON_ENTRY (1ULL << 14) +#define IA32_VMX_LOAD_IA32_EFER_ON_ENTRY (1ULL << 15) + +/* VMX : VMCS Fields */ +#define VMCS_GUEST_VPID 0x0000 +#define VMCS_GUEST_IA32_ES_SEL 0x0800 +#define VMCS_GUEST_IA32_CS_SEL 0x0802 +#define VMCS_GUEST_IA32_SS_SEL 0x0804 +#define VMCS_GUEST_IA32_DS_SEL 0x0806 +#define VMCS_GUEST_IA32_FS_SEL 0x0808 +#define VMCS_GUEST_IA32_GS_SEL 0x080A +#define VMCS_GUEST_IA32_LDTR_SEL 0x080C +#define VMCS_GUEST_IA32_TR_SEL 0x080E +#define VMCS_HOST_IA32_ES_SEL 0x0C00 +#define VMCS_HOST_IA32_CS_SEL 0x0C02 +#define VMCS_HOST_IA32_SS_SEL 0x0C04 +#define VMCS_HOST_IA32_DS_SEL 0x0C06 +#define VMCS_HOST_IA32_FS_SEL 0x0C08 +#define VMCS_HOST_IA32_GS_SEL 0x0C0A +#define VMCS_HOST_IA32_TR_SEL 0x0C0C +#define VMCS_MSR_BITMAP_ADDRESS 0x2004 +#define VMCS_EXIT_STORE_MSR_ADDRESS 0x2006 +#define VMCS_EXIT_LOAD_MSR_ADDRESS 0x2008 +#define VMCS_ENTRY_LOAD_MSR_ADDRESS 0x200A +#define VMCS_APIC_ACCESS_ADDRESS 0x2014 +#define VMCS_GUEST_IA32_EPTP 0x201A +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x2400 +#define VMCS_LINK_POINTER 0x2800 +#define VMCS_GUEST_IA32_PAT 0x2804 +#define VMCS_HOST_IA32_PAT 0x2C00 +#define VMCS_HOST_IA32_EFER 0x2C02 +#define VMCS_PINBASED_CTLS 0x4000 +#define VMCS_PROCBASED_CTLS 0x4002 +#define VMCS_EXIT_CTLS 0x400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x4010 +#define VMCS_ENTRY_CTLS 0x4012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x4014 +#define VMCS_ENTRY_INTERRUPTION_INFO 0x4016 +#define VMCS_PROCBASED2_CTLS 0x401E +#define VMCS_INSTRUCTION_ERROR 0x4400 +#define VMCS_EXIT_REASON 0x4402 +#define VMCS_EXIT_INTERRUPTION_INFO 0x4404 +#define VMCS_INSTRUCTION_LENGTH 0x440C +#define VMCS_GUEST_IA32_ES_LIMIT 0x4800 +#define 
VMCS_GUEST_IA32_CS_LIMIT 0x4802 +#define VMCS_GUEST_IA32_SS_LIMIT 0x4804 +#define VMCS_GUEST_IA32_DS_LIMIT 0x4806 +#define VMCS_GUEST_IA32_FS_LIMIT 0x4808 +#define VMCS_GUEST_IA32_GS_LIMIT 0x480A +#define VMCS_GUEST_IA32_LDTR_LIMIT 0x480C +#define VMCS_GUEST_IA32_TR_LIMIT 0x480E +#define VMCS_GUEST_IA32_GDTR_LIMIT 0x4810 +#define VMCS_GUEST_IA32_IDTR_LIMIT 0x4812 +#define VMCS_GUEST_IA32_ES_AR 0x4814 +#define VMCS_GUEST_IA32_CS_AR 0x4816 +#define VMCS_GUEST_IA32_SS_AR 0x4818 +#define VMCS_GUEST_IA32_DS_AR 0x481A +#define VMCS_GUEST_IA32_FS_AR 0x481C +#define VMCS_GUEST_IA32_GS_AR 0x481E +#define VMCS_GUEST_IA32_LDTR_AR 0x4820 +#define VMCS_GUEST_IA32_TR_AR 0x4822 +#define VMCS_GUEST_EXIT_QUALIFICATION 0x6400 +#define VMCS_GUEST_IA32_CR0 0x6800 +#define VMCS_GUEST_IA32_CR3 0x6802 +#define VMCS_GUEST_IA32_CR4 0x6804 +#define VMCS_GUEST_IA32_ES_BASE 0x6806 +#define VMCS_GUEST_IA32_CS_BASE 0x6808 +#define VMCS_GUEST_IA32_SS_BASE 0x680A +#define VMCS_GUEST_IA32_DS_BASE 0x680C +#define VMCS_GUEST_IA32_FS_BASE 0x680E +#define VMCS_GUEST_IA32_GS_BASE 0x6810 +#define VMCS_GUEST_IA32_LDTR_BASE 0x6812 +#define VMCS_GUEST_IA32_TR_BASE 0x6814 +#define VMCS_GUEST_IA32_GDTR_BASE 0x6816 +#define VMCS_GUEST_IA32_IDTR_BASE 0x6818 +#define VMCS_GUEST_IA32_RSP 0x681C +#define VMCS_GUEST_IA32_RIP 0x681E +#define VMCS_GUEST_IA32_RFLAGS 0x6820 +#define VMCS_HOST_IA32_CR0 0x6C00 +#define VMCS_HOST_IA32_CR3 0x6C02 +#define VMCS_HOST_IA32_CR4 0x6C04 +#define VMCS_HOST_IA32_FS_BASE 0x6C06 +#define VMCS_HOST_IA32_TR_BASE 0x6C0A +#define VMCS_HOST_IA32_GDTR_BASE 0x6C0C +#define VMCS_HOST_IA32_IDTR_BASE 0x6C0E +#define VMCS_HOST_IA32_RSP 0x6C14 +#define VMCS_HOST_IA32_RIP 0x6C16 + +#define IA32_VMX_INVVPID_INDIV_ADDR_CTX 0x0 +#define IA32_VMX_INVVPID_SINGLE_CTX 0x1 +#define IA32_VMX_INVVPID_ALL_CTX 0x2 +#define IA32_VMX_INVVPID_SINGLE_CTX_GLB 0x3 + +#define IA32_VMX_INVEPT_SINGLE_CTX 0x1 +#define IA32_VMX_INVEPT_GLOBAL_CTX 0x2 + +#define IA32_VMX_EPT_FAULT_READ (1ULL << 0) +#define 
IA32_VMX_EPT_FAULT_WRITE (1ULL << 1) +#define IA32_VMX_EPT_FAULT_EXEC (1ULL << 2) + +#define IA32_VMX_MSR_LIST_SIZE_MASK (7ULL << 25) /* IA32_VMX_MISC bits 27:25 */ + +/* + * SVM + */ +#define MSR_AMD_VM_CR 0xc0010114 +#define CPUID_AMD_SVM_CAP 0x8000000A +#define AMD_SVMDIS 0x10 +#define AMD_SVM_NESTED_PAGING_CAP (1 << 0) + +/* + * PAT + */ +#define PATENTRY(n, type) ((type) << ((n) * 8)) /* 'type' parenthesized so expression arguments shift as a whole */ +#define PAT_UC 0x0UL +#define PAT_WC 0x1UL +#define PAT_WT 0x4UL +#define PAT_WP 0x5UL +#define PAT_WB 0x6UL +#define PAT_UCMINUS 0x7UL + diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h new file mode 100644 index 00000000000..576115dfb3f --- /dev/null +++ b/sys/arch/amd64/include/vmmvar.h @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +/* + * CPU capabilities for VMM operation + */ +#ifndef _MACHINE_VMMVAR_H_ +#define _MACHINE_VMMVAR_H_ + +#include <sys/rwlock.h> + +#define VMM_HV_SIGNATURE "OpenBSDVMM58" + +/* + * Fixed limits. Several are used as array bounds in the ioctl structures + * declared further down in this header. + * NOTE(review): the unit of VMM_MAX_VM_MEM_SIZE is not stated here - confirm. + */ +#define VMM_MAX_DISKS_PER_VM 2 +#define VMM_MAX_PATH_DISK 128 +#define VMM_MAX_NAME_LEN 32 +#define VMM_MAX_KERNEL_PATH 128 +#define VMM_MAX_VCPUS_PER_VM 64 +#define VMM_MAX_VM_MEM_SIZE (512 * 1024) +#define VMM_MAX_NICS_PER_VM 2 + +/* PCI MMIO and I/O BAR ranges. NOTE(review): presumably guest-visible addresses - confirm */ +#define VMM_PCI_MMIO_BAR_BASE 0xF0000000 +#define VMM_PCI_MMIO_BAR_END 0xF0FFFFFF +#define VMM_PCI_MMIO_BAR_SIZE 0x00010000 +#define VMM_PCI_IO_BAR_BASE 0x1000 +#define VMM_PCI_IO_BAR_END 0xFFFF +#define VMM_PCI_IO_BAR_SIZE 0x1000 + +/* VMX: Basic Exit Reasons */ +/* Values 35, 38 and 42 are not defined in this list. */ +#define VMX_EXIT_NMI 0 +#define VMX_EXIT_EXTINT 1 +#define VMX_EXIT_TRIPLE_FAULT 2 +#define VMX_EXIT_INIT 3 +#define VMX_EXIT_SIPI 4 +#define VMX_EXIT_IO_SMI 5 +#define VMX_EXIT_OTHER_SMI 6 +#define VMX_EXIT_INT_WINDOW 7 +#define VMX_EXIT_NMI_WINDOW 8 +#define VMX_EXIT_TASK_SWITCH 9 +#define VMX_EXIT_CPUID 10 +#define VMX_EXIT_GETSEC 11 +#define VMX_EXIT_HLT 12 +#define VMX_EXIT_INVD 13 +#define VMX_EXIT_INVLPG 14 +#define VMX_EXIT_RDPMC 15 +#define VMX_EXIT_RDTSC 16 +#define VMX_EXIT_RSM 17 +#define VMX_EXIT_VMCALL 18 +#define VMX_EXIT_VMCLEAR 19 +#define VMX_EXIT_VMLAUNCH 20 +#define VMX_EXIT_VMPTRLD 21 +#define VMX_EXIT_VMPTRST 22 +#define VMX_EXIT_VMREAD 23 +#define VMX_EXIT_VMRESUME 24 +#define VMX_EXIT_VMWRITE 25 +#define VMX_EXIT_VMXOFF 26 +#define VMX_EXIT_VMXON 27 +#define VMX_EXIT_CR_ACCESS 28 +#define VMX_EXIT_MOV_DR 29 +#define VMX_EXIT_IO 30 +#define VMX_EXIT_RDMSR 31 +#define VMX_EXIT_WRMSR 32 +#define VMX_EXIT_ENTRY_FAILED_GUEST_STATE 33 +#define VMX_EXIT_ENTRY_FAILED_MSR_LOAD 34 +#define VMX_EXIT_MWAIT 36 +#define VMX_EXIT_MTF 37 +#define VMX_EXIT_MONITOR 39 +#define VMX_EXIT_PAUSE 40 +#define VMX_EXIT_ENTRY_FAILED_MCE 41 +#define VMX_EXIT_TPR_BELOW_THRESHOLD 43 +#define VMX_EXIT_APIC_ACCESS 44 +#define VMX_EXIT_VIRTUALIZED_EOI 45 +#define VMX_EXIT_GDTR_IDTR 46 +#define 
VMX_EXIT_LDTR_TR 47 +#define VMX_EXIT_EPT_VIOLATION 48 +#define VMX_EXIT_EPT_MISCONFIGURATION 49 +#define VMX_EXIT_INVEPT 50 +#define VMX_EXIT_RDTSCP 51 +#define VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED 52 +#define VMX_EXIT_INVVPID 53 +#define VMX_EXIT_WBINVD 54 +#define VMX_EXIT_XSETBV 55 +#define VMX_EXIT_APIC_WRITE 56 +#define VMX_EXIT_RDRAND 57 +#define VMX_EXIT_INVPCID 58 +#define VMX_EXIT_VMFUNC 59 + +/* Exit codes outside the VMX_EXIT_* numbering above */ +#define VM_EXIT_TERMINATED 0xFFFE +#define VM_EXIT_NONE 0xFFFF + +/* VCPU states. NOTE(review): presumably the values kept in vc_state and vir_vcpu_state - confirm */ +enum { + VCPU_STATE_STOPPED, + VCPU_STATE_RUNNING, + VCPU_STATE_REQSTOP, + VCPU_STATE_UNKNOWN +}; + +/* IN/OUT direction. NOTE(review): presumably the values kept in vei_dir - confirm */ +enum { + VEI_DIR_OUT, + VEI_DIR_IN +}; + + +/* + * vm exit data + * vm_exit_inout : describes an IN/OUT exit + */ +struct vm_exit_inout { + uint8_t vei_size; /* Size of access */ + uint8_t vei_dir; /* Direction */ + uint8_t vei_rep; /* REP prefix? */ + uint8_t vei_string; /* string variety? */ + uint8_t vei_encoding; /* operand encoding */ + uint16_t vei_port; /* port */ + uint32_t vei_data; /* data (for IN insns) */ +}; + +union vm_exit { + struct vm_exit_inout vei; /* IN/OUT exit */ +}; + + +/* Argument structure named by the VMM_IOC_CREATE definition */ +struct vm_create_params { + /* Input parameters to VMM_IOC_CREATE */ + size_t vcp_memory_size; + size_t vcp_ncpus; + size_t vcp_ndisks; + size_t vcp_nnics; + char vcp_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_PATH_DISK]; + char vcp_name[VMM_MAX_NAME_LEN]; + char vcp_kernel[VMM_MAX_KERNEL_PATH]; + uint8_t vcp_macs[VMM_MAX_NICS_PER_VM][6]; + + /* Output parameter from VMM_IOC_CREATE */ + uint32_t vcp_id; +}; + +/* Argument structure named by the VMM_IOC_RUN definition */ +struct vm_run_params { + /* Input parameters to VMM_IOC_RUN */ + uint32_t vrp_vm_id; + uint32_t vrp_vcpu_id; + uint8_t vrp_continue; /* Continuing from an exit */ + int16_t vrp_injint; /* Injected interrupt vector */ + + /* Input/output parameter to VMM_IOC_RUN */ + union vm_exit *vrp_exit; /* updated exit data */ + + /* Output parameter from VMM_IOC_RUN */ + uint16_t vrp_exit_reason; /* exit reason */ +}; + +/* Element type of the buffer referenced by vm_info_params.vip_info */ +struct vm_info_result { + /* Output parameters from VMM_IOC_INFO */ + size_t vir_memory_size; + size_t 
vir_ncpus; + uint8_t vir_vcpu_state[VMM_MAX_VCPUS_PER_VM]; + pid_t vir_creator_pid; + uint32_t vir_id; + char vir_name[VMM_MAX_NAME_LEN]; +}; + +/* Argument structure named by the VMM_IOC_INFO definition */ +struct vm_info_params { + /* Input parameters to VMM_IOC_INFO */ + size_t vip_size; /* Output buffer size */ + + /* Output Parameters from VMM_IOC_INFO */ + size_t vip_info_ct; /* # of entries returned */ + struct vm_info_result *vip_info; /* Output buffer */ +}; + +/* Argument structure named by the VMM_IOC_TERM definition */ +struct vm_terminate_params { + /* Input parameters to VMM_IOC_TERM */ + uint32_t vtp_vm_id; +}; + +/* Argument structure named by the VMM_IOC_WRITEPAGE definition */ +struct vm_writepage_params { + /* Input parameters to VMM_IOC_WRITEPAGE */ + uint32_t vwp_vm_id; /* VM ID */ + paddr_t vwp_paddr; /* Phys Addr */ + char *vwp_data; /* Page Data */ + uint32_t vwp_len; /* Length */ +}; + +/* + * Argument structure named by the VMM_IOC_READPAGE definition. + * NOTE(review): members reuse the vrp_ prefix of struct vm_run_params. + */ +struct vm_readpage_params { + /* Input parameters to VMM_IOC_READPAGE */ + uint32_t vrp_vm_id; /* VM ID */ + paddr_t vrp_paddr; /* Phys Addr */ + uint32_t vrp_len; /* Length */ + + /* Output parameters from VMM_IOC_READPAGE */ + char *vrp_data; /* Page Data */ +}; + +/* IOCTL definitions */ +/* + * NOTE(review): VMM_IOC_READPAGE is declared _IOW although vrp_data is + * listed as output; presumably the page is returned through that pointer + * and not through the structure itself - confirm. + */ +#define VMM_IOC_START _IO('V', 1) /* Start virtualization */ +#define VMM_IOC_STOP _IO('V', 2) /* Stop virtualization */ +#define VMM_IOC_CREATE _IOWR('V', 3, struct vm_create_params) /* Create VM */ +#define VMM_IOC_RUN _IOWR('V', 4, struct vm_run_params) /* Run VCPU */ +#define VMM_IOC_INFO _IOWR('V', 5, struct vm_info_params) /* Get VM Info */ +#define VMM_IOC_TERM _IOW('V', 6, struct vm_terminate_params) /* Terminate VM */ +#define VMM_IOC_WRITEPAGE _IOW('V', 7, struct vm_writepage_params) /* Wr Pg */ +#define VMM_IOC_READPAGE _IOW('V', 8, struct vm_readpage_params) /* Rd Pg */ + +#ifdef _KERNEL + +#include <uvm/uvm_extern.h> + +#define VMX_FAIL_LAUNCH_UNKNOWN 1 +#define VMX_FAIL_LAUNCH_INVALID_VMCS 2 +#define VMX_FAIL_LAUNCH_VALID_VMCS 3 + +/* + * Debug printf: prints only when vmm_debug is non-zero and expands to + * nothing unless VMM_DEBUG is defined. vmm_debug is not declared in the + * visible part of this header. + */ +#ifdef VMM_DEBUG +#define dprintf(x...) do { if (vmm_debug) printf(x); } while(0) +#else +#define dprintf(x...) 
+#endif /* VMM_DEBUG */ + +/* Virtualization modes. NOTE(review): presumably the values kept in vc_virt_mode - confirm */ +enum { + VMM_MODE_UNKNOWN, + VMM_MODE_VMX, + VMM_MODE_EPT, + VMM_MODE_SVM, + VMM_MODE_RVI +}; + +/* Memory type identifiers */ +enum { + VMM_MEM_TYPE_REGULAR, + VMM_MEM_TYPE_UNKNOWN +}; + +/* Forward declarations */ +struct vm; + +/* + * Implementation-specific cpu state + */ +struct vmcb { + /* no members defined */ +}; + +struct vmcs { + uint32_t vmcs_revision; +}; + +/* Descriptor type taken by invvpid() */ +struct vmx_invvpid_descriptor +{ + uint64_t vid_vpid; // : 16; + uint64_t vid_addr; +}; + +/* Descriptor type taken by invept() */ +struct vmx_invept_descriptor +{ + uint64_t vid_eptp; + uint64_t vid_reserved; +}; + +/* MSR index and data pair. NOTE(review): presumably one entry of the MSR save/load areas referenced from struct vcpu - confirm */ +struct vmx_msr_store +{ + uint64_t vms_index : 32; + uint64_t vms_data; +}; + +/* + * Storage for guest registers not preserved in VMCS and various exit + * information. + * + * Note that vmx_enter_guest depends on the layout of this struct for + * field access. + */ +struct vmx_gueststate +{ + /* %rsi should be first */ + uint64_t vg_rsi; /* 0x00 */ + uint64_t vg_rax; /* 0x08 */ + uint64_t vg_rbx; /* 0x10 */ + uint64_t vg_rcx; /* 0x18 */ + uint64_t vg_rdx; /* 0x20 */ + uint64_t vg_rdi; /* 0x28 */ + uint64_t vg_rbp; /* 0x30 */ + uint64_t vg_r8; /* 0x38 */ + uint64_t vg_r9; /* 0x40 */ + uint64_t vg_r10; /* 0x48 */ + uint64_t vg_r11; /* 0x50 */ + uint64_t vg_r12; /* 0x58 */ + uint64_t vg_r13; /* 0x60 */ + uint64_t vg_r14; /* 0x68 */ + uint64_t vg_r15; /* 0x70 */ + uint64_t vg_cr2; /* 0x78 */ + uint64_t vg_rip; /* 0x80 */ + uint32_t vg_exit_reason; /* 0x88 */ +}; + +/* + * Virtual CPU + */ +struct vcpu { + /* VMCS / VMCB pointer */ + vaddr_t vc_control_va; + uint64_t vc_control_pa; + + /* VLAPIC pointer */ + vaddr_t vc_vlapic_va; + uint64_t vc_vlapic_pa; + + /* MSR bitmap address */ + vaddr_t vc_msr_bitmap_va; + uint64_t vc_msr_bitmap_pa; + + struct vm *vc_parent; + uint32_t vc_id; + SLIST_ENTRY(vcpu) vc_vcpu_link; + vaddr_t vc_hsa_stack_va; + + uint8_t vc_virt_mode; + uint8_t vc_state; + + struct cpu_info *vc_last_pcpu; + union vm_exit vc_exit; + + /* VMX only */ + uint64_t vc_vmx_basic; + uint64_t vc_vmx_entry_ctls; + uint64_t vc_vmx_true_entry_ctls; + 
uint64_t vc_vmx_exit_ctls; + uint64_t vc_vmx_true_exit_ctls; + uint64_t vc_vmx_pinbased_ctls; + uint64_t vc_vmx_true_pinbased_ctls; + uint64_t vc_vmx_procbased_ctls; + uint64_t vc_vmx_true_procbased_ctls; + uint64_t vc_vmx_procbased2_ctls; + struct vmx_gueststate vc_gueststate; + vaddr_t vc_vmx_msr_exit_save_va; + paddr_t vc_vmx_msr_exit_save_pa; + vaddr_t vc_vmx_msr_exit_load_va; + paddr_t vc_vmx_msr_exit_load_pa; + vaddr_t vc_vmx_msr_entry_load_va; + paddr_t vc_vmx_msr_entry_load_pa; +}; + +/* List head type for struct vcpu, whose link member is vc_vcpu_link */ +SLIST_HEAD(vcpu_head, vcpu); + +/* + * Virtual Machine + */ +struct vm { + vm_map_t vm_map; + uint32_t vm_id; + pid_t vm_creator_pid; + uint32_t vm_memory_size; /* NOTE(review): uint32_t here, size_t in vcp_memory_size and vir_memory_size */ + char vm_name[VMM_MAX_NAME_LEN]; + + struct vcpu_head vm_vcpu_list; + uint32_t vm_vcpu_ct; + struct rwlock vm_vcpu_lock; + + SLIST_ENTRY(vm) vm_link; +}; + +/* + * Kernel-only prototypes; no definitions appear in this header. + * NOTE(review): presumably implemented in vmm.c and vmm_support.S - confirm. + */ +void vmm_dispatch_intr(vaddr_t); +int vmxon(uint64_t *); +int vmxoff(void); +int vmclear(uint64_t *); +int vmptrld(uint64_t *); +int vmptrst(uint64_t *); +int vmwrite(uint64_t, uint64_t); +int vmread(uint64_t, uint64_t *); +void invvpid(uint64_t, struct vmx_invvpid_descriptor *); +void invept(uint64_t, struct vmx_invept_descriptor *); +int vmx_enter_guest(uint64_t *, struct vmx_gueststate *, int); +void start_vmm_on_cpu(struct cpu_info *); +void stop_vmm_on_cpu(struct cpu_info *); + +#endif /* _KERNEL */ + +#endif /* ! _MACHINE_VMMVAR_H_ */ |