summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--usr.sbin/vmd/Makefile8
-rw-r--r--usr.sbin/vmd/vm.c1262
-rw-r--r--usr.sbin/vmd/vmd.h6
-rw-r--r--usr.sbin/vmd/vmm.c1217
4 files changed, 1297 insertions, 1196 deletions
diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile
index 39fd337c581..09a9c263dee 100644
--- a/usr.sbin/vmd/Makefile
+++ b/usr.sbin/vmd/Makefile
@@ -1,11 +1,11 @@
-# $OpenBSD: Makefile,v 1.12 2016/11/26 20:03:42 reyk Exp $
+# $OpenBSD: Makefile,v 1.13 2017/03/01 18:00:50 reyk Exp $
.if ${MACHINE} == "amd64" || ${MACHINE} == "i386"
PROG= vmd
-SRCS= vmm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
-SRCS+= vmd.c control.c log.c priv.c proc.c config.c ns8250.c i8253.c
-SRCS+= vmboot.c ufs.c disklabel.c
+SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c
+SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
+SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c
SRCS+= parse.y
CFLAGS+= -Wall -I${.CURDIR}
diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c
new file mode 100644
index 00000000000..76d213e8adf
--- /dev/null
+++ b/usr.sbin/vmd/vm.c
@@ -0,0 +1,1262 @@
+/* $OpenBSD: vm.c,v 1.1 2017/03/01 18:00:50 reyk Exp $ */
+
+/*
+ * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/queue.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+
+#include <dev/ic/i8253reg.h>
+#include <dev/isa/isareg.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/param.h>
+#include <machine/psl.h>
+#include <machine/specialreg.h>
+#include <machine/vmmvar.h>
+
+#include <net/if.h>
+
+#include <errno.h>
+#include <event.h>
+#include <fcntl.h>
+#include <imsg.h>
+#include <limits.h>
+#include <poll.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <util.h>
+
+#include "vmd.h"
+#include "vmm.h"
+#include "loadfile.h"
+#include "pci.h"
+#include "virtio.h"
+#include "proc.h"
+#include "i8253.h"
+#include "i8259.h"
+#include "ns8250.h"
+#include "mc146818.h"
+
+io_fn_t ioports_map[MAX_PORTS];
+
+int run_vm(int *, int *, struct vm_create_params *, struct vcpu_reg_state *);
+void vm_dispatch_vmm(int, short, void *);
+void *event_thread(void *);
+void *vcpu_run_loop(void *);
+int vcpu_exit(struct vm_run_params *);
+int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
+void create_memory_map(struct vm_create_params *);
+int alloc_guest_mem(struct vm_create_params *);
+int vmm_create_vm(struct vm_create_params *);
+void init_emulated_hw(struct vm_create_params *, int *, int *);
+void vcpu_exit_inout(struct vm_run_params *);
+uint8_t vcpu_exit_pci(struct vm_run_params *);
+int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
+
+static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
+ size_t);
+
+int con_fd;
+struct vmd_vm *current_vm;
+
+extern struct vmd *env;
+
+extern char *__progname;
+
+pthread_mutex_t threadmutex;
+pthread_cond_t threadcond;
+
+pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
+pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
+uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
+uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
+
+/*
+ * Represents a standard register set for an OS to be booted
+ * as a flat 32 bit address space, before paging is enabled.
+ *
+ * NOT set here are:
+ * RIP
+ * RSP
+ * GDTR BASE
+ *
+ * Specific bootloaders should clone this structure and override
+ * those fields as needed.
+ *
+ * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
+ * features of the CPU in use.
+ */
+static const struct vcpu_reg_state vcpu_init_flat32 = {
+#ifdef __i386__
+ .vrs_gprs[VCPU_REGS_EFLAGS] = 0x2,
+ .vrs_gprs[VCPU_REGS_EIP] = 0x0,
+ .vrs_gprs[VCPU_REGS_ESP] = 0x0,
+#else
+ .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
+ .vrs_gprs[VCPU_REGS_RIP] = 0x0,
+ .vrs_gprs[VCPU_REGS_RSP] = 0x0,
+#endif
+ .vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,
+ .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
+ .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
+ .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
+ .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
+ .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
+ .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
+};
+
+/*
+ * start_vm
+ *
+ * After forking a new VM process, starts the new VM with the creation
+ * parameters supplied (in the incoming vm->vm_params field). This
+ * function performs a basic sanity check on the incoming parameters
+ * and then performs the following steps to complete the creation of the VM:
+ *
+ * 1. validates and creates the new VM
+ * 2. opens the imsg control channel to the parent and drops more privilege
+ * 3. drops additional privileges by calling pledge(2)
+ * 4. loads the kernel from the disk image or file descriptor
+ * 5. runs the VM's VCPU loops.
+ *
+ * Parameters:
+ * vm: The VM data structure that includes the VM create parameters.
+ * fd: The imsg socket that is connected to the parent process.
+ *
+ * Return values:
+ * 0: success
+ * !0 : failure - typically an errno indicating the source of the failure
+ */
+int
+start_vm(struct vmd_vm *vm, int fd)
+{
+ struct vm_create_params *vcp = &vm->vm_params.vmc_params;
+ struct vcpu_reg_state vrs;
+ int nicfds[VMM_MAX_NICS_PER_VM];
+ int ret;
+ FILE *kernfp;
+ struct vmboot_params vmboot;
+ size_t i;
+
+ /* Child */
+ setproctitle("%s", vcp->vcp_name);
+ log_procinit(vcp->vcp_name);
+
+ create_memory_map(vcp);
+ ret = alloc_guest_mem(vcp);
+ if (ret) {
+ errno = ret;
+ fatal("could not allocate guest memory - exiting");
+ }
+
+ ret = vmm_create_vm(vcp);
+ current_vm = vm;
+
+ /* send back the kernel-generated vm id (0 on error) */
+ if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
+ sizeof(vcp->vcp_id))
+ fatal("write vcp id");
+
+ if (ret) {
+ errno = ret;
+ fatal("create vmm ioctl failed - exiting");
+ }
+
+ /*
+ * pledge in the vm processes:
+ * stdio - for malloc and basic I/O including events.
+ * vmm - for the vmm ioctls and operations.
+ */
+ if (pledge("stdio vmm", NULL) == -1)
+ fatal("pledge");
+
+ /*
+ * Set up default "flat 32 bit" register state - RIP,
+ * RSP, and GDT info will be set in bootloader
+ */
+ memcpy(&vrs, &vcpu_init_flat32, sizeof(struct vcpu_reg_state));
+
+ /* Find and open kernel image */
+ if ((kernfp = vmboot_open(vm->vm_kernel,
+ vm->vm_disks[0], &vmboot)) == NULL)
+ fatalx("failed to open kernel - exiting");
+
+ /* Load kernel image */
+ ret = loadelf_main(kernfp, vcp, &vrs,
+ vmboot.vbp_bootdev, vmboot.vbp_howto);
+ if (ret) {
+ errno = ret;
+ fatal("failed to load kernel - exiting");
+ }
+
+ vmboot_close(kernfp, &vmboot);
+
+ if (vm->vm_kernel != -1)
+ close(vm->vm_kernel);
+
+ con_fd = vm->vm_tty;
+ if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
+ fatal("failed to set nonblocking mode on console");
+
+ for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
+ nicfds[i] = vm->vm_ifs[i].vif_fd;
+
+ event_init();
+
+ if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
+ fatal("setup vm pipe");
+
+ /* Execute the vcpu run loop(s) for this VM */
+ ret = run_vm(vm->vm_disks, nicfds, vcp, &vrs);
+
+ return (ret);
+}
+
+/*
+ * vm_dispatch_vmm
+ *
+ * imsg callback for messages that are received from the vmm parent process.
+ */
+void
+vm_dispatch_vmm(int fd, short event, void *arg)
+{
+ struct vmd_vm *vm = arg;
+ struct imsgev *iev = &vm->vm_iev;
+ struct imsgbuf *ibuf = &iev->ibuf;
+ struct imsg imsg;
+ ssize_t n;
+ int verbose;
+
+ if (event & EV_READ) {
+ if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
+ fatal("%s: imsg_read", __func__);
+ if (n == 0)
+ _exit(0);
+ }
+
+ if (event & EV_WRITE) {
+ if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
+ fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
+ if (n == 0)
+ _exit(0);
+ }
+
+ for (;;) {
+ if ((n = imsg_get(ibuf, &imsg)) == -1)
+ fatal("%s: imsg_get", __func__);
+ if (n == 0)
+ break;
+
+#if DEBUG > 1
+ log_debug("%s: got imsg %d from %s",
+ __func__, imsg.hdr.type,
+ vm->vm_params.vmc_params.vcp_name);
+#endif
+
+ switch (imsg.hdr.type) {
+ case IMSG_CTL_VERBOSE:
+ IMSG_SIZE_CHECK(&imsg, &verbose);
+ memcpy(&verbose, imsg.data, sizeof(verbose));
+ log_setverbose(verbose);
+ break;
+ case IMSG_VMDOP_VM_SHUTDOWN:
+ if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
+ _exit(0);
+ break;
+ case IMSG_VMDOP_VM_REBOOT:
+ if (vmmci_ctl(VMMCI_REBOOT) == -1)
+ _exit(0);
+ break;
+ default:
+ fatalx("%s: got invalid imsg %d from %s",
+ __func__, imsg.hdr.type,
+ vm->vm_params.vmc_params.vcp_name);
+ }
+ imsg_free(&imsg);
+ }
+ imsg_event_add(iev);
+}
+
+/*
+ * vcpu_reset
+ *
+ * Requests vmm(4) to reset the VCPUs in the indicated VM to
+ * the register state provided
+ *
+ * Parameters
+ * vmid: VM ID to reset
+ * vcpu_id: VCPU ID to reset
+ * vrs: the register state to initialize
+ *
+ * Return values:
+ * 0: success
+ * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
+ * valid)
+ */
+int
+vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
+{
+ struct vm_resetcpu_params vrp;
+
+ memset(&vrp, 0, sizeof(vrp));
+ vrp.vrp_vm_id = vmid;
+ vrp.vrp_vcpu_id = vcpu_id;
+ memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
+
+ log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
+
+ if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * create_memory_map
+ *
+ * Sets up the guest physical memory ranges that the VM can access.
+ *
+ * Return values:
+ * nothing
+ */
+void
+create_memory_map(struct vm_create_params *vcp)
+{
+ size_t len, mem_bytes, mem_mb;
+
+ mem_mb = vcp->vcp_memranges[0].vmr_size;
+ vcp->vcp_nmemranges = 0;
+ if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
+ return;
+
+ mem_bytes = mem_mb * 1024 * 1024;
+
+ /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
+ len = LOWMEM_KB * 1024;
+ vcp->vcp_memranges[0].vmr_gpa = 0x0;
+ vcp->vcp_memranges[0].vmr_size = len;
+ mem_bytes -= len;
+
+ /*
+ * Second memory region: LOWMEM_KB - 1MB.
+ *
+ * N.B. - Normally ROMs or parts of video RAM are mapped here.
+ * We have to add this region, because some systems
+ * unconditionally write to 0xb8000 (VGA RAM), and
+ * we need to make sure that vmm(4) permits accesses
+ * to it. So allocate guest memory for it.
+ */
+ len = 0x100000 - LOWMEM_KB * 1024;
+ vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
+ vcp->vcp_memranges[1].vmr_size = len;
+ mem_bytes -= len;
+
+ /* Make sure that we do not place physical memory into MMIO ranges. */
+ if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
+ len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
+ else
+ len = mem_bytes;
+
+ /* Third memory region: 1MB - (1MB + len) */
+ vcp->vcp_memranges[2].vmr_gpa = 0x100000;
+ vcp->vcp_memranges[2].vmr_size = len;
+ mem_bytes -= len;
+
+ if (mem_bytes > 0) {
+ /* Fourth memory region for the remaining memory (if any) */
+ vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
+ vcp->vcp_memranges[3].vmr_size = mem_bytes;
+ vcp->vcp_nmemranges = 4;
+ } else
+ vcp->vcp_nmemranges = 3;
+}
+
+/*
+ * alloc_guest_mem
+ *
+ * Allocates memory for the guest.
+ * Instead of doing a single allocation with one mmap(), we allocate memory
+ * separately for every range for the following reasons:
+ * - ASLR for the individual ranges
+ * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
+ * map the single mmap'd userspace memory to the individual guest physical
+ * memory ranges, the underlying amap of the single mmap'd range would have
+ * to allocate per-page reference counters. The reason is that the
+ * individual guest physical ranges would reference the single mmap'd region
+ * only partially. However, if every guest physical range has its own
+ * corresponding mmap'd userspace allocation, there are no partial
+ * references: every guest physical range fully references an mmap'd
+ * range => no per-page reference counters have to be allocated.
+ *
+ * Return values:
+ * 0: success
+ * !0: failure - errno indicating the source of the failure
+ */
+int
+alloc_guest_mem(struct vm_create_params *vcp)
+{
+ void *p;
+ int ret;
+ size_t i, j;
+ struct vm_mem_range *vmr;
+
+ for (i = 0; i < vcp->vcp_nmemranges; i++) {
+ vmr = &vcp->vcp_memranges[i];
+ p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (p == MAP_FAILED) {
+ ret = errno;
+ for (j = 0; j < i; j++) {
+ vmr = &vcp->vcp_memranges[j];
+ munmap((void *)vmr->vmr_va, vmr->vmr_size);
+ }
+
+ return (ret);
+ }
+
+ vmr->vmr_va = (vaddr_t)p;
+ }
+
+ return (0);
+}
+
+/*
+ * vmm_create_vm
+ *
+ * Requests vmm(4) to create a new VM using the supplied creation
+ * parameters. This operation results in the creation of the in-kernel
+ * structures for the VM, but does not start the VM's vcpu(s).
+ *
+ * Parameters:
+ * vcp: vm_create_params struct containing the VM's desired creation
+ * configuration
+ *
+ * Return values:
+ * 0: success
+ * !0 : ioctl to vmm(4) failed
+ */
+int
+vmm_create_vm(struct vm_create_params *vcp)
+{
+ /* Sanity check arguments */
+ if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_nmemranges == 0 ||
+ vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
+ return (EINVAL);
+
+ if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
+ return (EINVAL);
+
+ if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * init_emulated_hw
+ *
+ * Initializes the userspace hardware emulation
+ */
+void
+init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
+ int *child_taps)
+{
+ int i;
+
+ /* Reset the IO port map */
+ memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
+
+ /* Init i8253 PIT */
+ i8253_init(vcp->vcp_id);
+ ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
+ ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
+ ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
+ ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
+
+ /* Init mc146818 RTC */
+ mc146818_init(vcp->vcp_id);
+ ioports_map[IO_RTC] = vcpu_exit_mc146818;
+ ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
+
+ /* Init master and slave PICs */
+ i8259_init();
+ ioports_map[IO_ICU1] = vcpu_exit_i8259;
+ ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
+ ioports_map[IO_ICU2] = vcpu_exit_i8259;
+ ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
+
+ /* Init ns8250 UART */
+ ns8250_init(con_fd, vcp->vcp_id);
+ for (i = COM1_DATA; i <= COM1_SCR; i++)
+ ioports_map[i] = vcpu_exit_com;
+
+ /* Initialize PCI */
+ for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
+ ioports_map[i] = vcpu_exit_pci;
+
+ ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
+ ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
+ pci_init();
+
+ /* Initialize virtio devices */
+ virtio_init(vcp, child_disks, child_taps);
+}
+
+/*
+ * run_vm
+ *
+ * Runs the VM whose creation parameters are specified in vcp
+ *
+ * Parameters:
+ * child_disks: previously-opened child VM disk file descriptors
+ * child_taps: previously-opened child tap file descriptors
+ * vcp: vm_create_params struct containing the VM's desired creation
+ * configuration
+ * vrs: VCPU register state to initialize
+ *
+ * Return values:
+ * 0: the VM exited normally
+ * !0 : the VM exited abnormally or failed to start
+ */
+int
+run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp,
+ struct vcpu_reg_state *vrs)
+{
+ uint8_t evdone = 0;
+ size_t i;
+ int ret;
+ pthread_t *tid, evtid;
+ struct vm_run_params **vrp;
+ void *exit_status;
+
+ if (vcp == NULL)
+ return (EINVAL);
+
+ if (child_disks == NULL && vcp->vcp_ndisks != 0)
+ return (EINVAL);
+
+ if (child_taps == NULL && vcp->vcp_nnics != 0)
+ return (EINVAL);
+
+ if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_nmemranges == 0 ||
+ vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
+ return (EINVAL);
+
+ tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
+ vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
+ if (tid == NULL || vrp == NULL) {
+ log_warn("%s: memory allocation error - exiting.",
+ __progname);
+ return (ENOMEM);
+ }
+
+ log_debug("%s: initializing hardware for vm %s", __func__,
+ vcp->vcp_name);
+
+ init_emulated_hw(vcp, child_disks, child_taps);
+
+ ret = pthread_mutex_init(&threadmutex, NULL);
+ if (ret) {
+ log_warn("%s: could not initialize thread state mutex",
+ __func__);
+ return (ret);
+ }
+ ret = pthread_cond_init(&threadcond, NULL);
+ if (ret) {
+ log_warn("%s: could not initialize thread state "
+ "condition variable", __func__);
+ return (ret);
+ }
+
+ mutex_lock(&threadmutex);
+
+ log_debug("%s: starting vcpu threads for vm %s", __func__,
+ vcp->vcp_name);
+
+ /*
+ * Create and launch one thread for each VCPU. These threads may
+ * migrate between PCPUs over time; the need to reload CPU state
+ * in such situations is detected and performed by vmm(4) in the
+ * kernel.
+ */
+ for (i = 0 ; i < vcp->vcp_ncpus; i++) {
+ vrp[i] = malloc(sizeof(struct vm_run_params));
+ if (vrp[i] == NULL) {
+ log_warn("%s: memory allocation error - "
+ "exiting.", __progname);
+ /* caller will exit, so skip free'ing */
+ return (ENOMEM);
+ }
+ vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
+ if (vrp[i]->vrp_exit == NULL) {
+ log_warn("%s: memory allocation error - "
+ "exiting.", __progname);
+ /* caller will exit, so skip free'ing */
+ return (ENOMEM);
+ }
+ vrp[i]->vrp_vm_id = vcp->vcp_id;
+ vrp[i]->vrp_vcpu_id = i;
+
+ if (vcpu_reset(vcp->vcp_id, i, vrs)) {
+ log_warnx("%s: cannot reset VCPU %zu - exiting.",
+ __progname, i);
+ return (EIO);
+ }
+
+ ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
+ if (ret) {
+ log_warnx("%s: cannot initialize cond var (%d)",
+ __progname, ret);
+ return (ret);
+ }
+
+ ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
+ if (ret) {
+ log_warnx("%s: cannot initialize mtx (%d)",
+ __progname, ret);
+ return (ret);
+ }
+
+ vcpu_hlt[i] = 0;
+
+ /* Start each VCPU run thread at vcpu_run_loop */
+ ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
+ if (ret) {
+ /* caller will _exit after this return */
+ ret = errno;
+ log_warn("%s: could not create vcpu thread %zu",
+ __func__, i);
+ return (ret);
+ }
+ }
+
+ log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
+ ret = pthread_create(&evtid, NULL, event_thread, &evdone);
+ if (ret) {
+ errno = ret;
+ log_warn("%s: could not create event thread", __func__);
+ return (ret);
+ }
+
+ for (;;) {
+ ret = pthread_cond_wait(&threadcond, &threadmutex);
+ if (ret) {
+ log_warn("%s: waiting on thread state condition "
+ "variable failed", __func__);
+ return (ret);
+ }
+
+ /*
+ * Did a VCPU thread exit with an error? => return the first one
+ */
+ for (i = 0; i < vcp->vcp_ncpus; i++) {
+ if (vcpu_done[i] == 0)
+ continue;
+
+ if (pthread_join(tid[i], &exit_status)) {
+ log_warn("%s: failed to join thread %zd - "
+ "exiting", __progname, i);
+ return (EIO);
+ }
+
+ ret = (long long)exit_status;
+ }
+
+ /* Did the event thread exit? => return with an error */
+ if (evdone) {
+ if (pthread_join(evtid, &exit_status)) {
+ log_warn("%s: failed to join event thread - "
+ "exiting", __progname);
+ return (EIO);
+ }
+
+ log_warnx("%s: vm %d event thread exited "
+ "unexpectedly", __progname, vcp->vcp_id);
+ return (EIO);
+ }
+
+ /* Did all VCPU threads exit successfully? => return */
+ for (i = 0; i < vcp->vcp_ncpus; i++) {
+ if (vcpu_done[i] == 0)
+ break;
+ }
+ if (i == vcp->vcp_ncpus)
+ return (ret);
+
+ /* Some more threads to wait for, start over */
+ }
+
+ return (ret);
+}
+
+void *
+event_thread(void *arg)
+{
+ uint8_t *donep = arg;
+ intptr_t ret;
+
+ ret = event_dispatch();
+
+ mutex_lock(&threadmutex);
+ *donep = 1;
+ pthread_cond_signal(&threadcond);
+ mutex_unlock(&threadmutex);
+
+ return (void *)ret;
+ }
+
+/*
+ * vcpu_run_loop
+ *
+ * Runs a single VCPU until vmm(4) requires help handling an exit,
+ * or the VM terminates.
+ *
+ * Parameters:
+ * arg: vcpu_run_params for the VCPU being run by this thread
+ *
+ * Return values:
+ * NULL: the VCPU shutdown properly
+ * !NULL: error processing VCPU run, or the VCPU shutdown abnormally
+ */
+void *
+vcpu_run_loop(void *arg)
+{
+ struct vm_run_params *vrp = (struct vm_run_params *)arg;
+ intptr_t ret = 0;
+ int irq;
+ uint32_t n;
+
+ vrp->vrp_continue = 0;
+ n = vrp->vrp_vcpu_id;
+
+ for (;;) {
+ ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
+
+ if (ret) {
+ log_warnx("%s: can't lock vcpu run mtx (%d)",
+ __func__, (int)ret);
+ return ((void *)ret);
+ }
+
+ /* If we are halted, wait */
+ if (vcpu_hlt[n]) {
+ ret = pthread_cond_wait(&vcpu_run_cond[n],
+ &vcpu_run_mtx[n]);
+
+ if (ret) {
+ log_warnx("%s: can't wait on cond (%d)",
+ __func__, (int)ret);
+ (void)pthread_mutex_unlock(&vcpu_run_mtx[n]);
+ break;
+ }
+ }
+
+ ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
+ if (ret) {
+ log_warnx("%s: can't unlock mutex on cond (%d)",
+ __func__, (int)ret);
+ break;
+ }
+
+ if (vrp->vrp_irqready && i8259_is_pending()) {
+ irq = i8259_ack();
+ vrp->vrp_irq = irq;
+ } else
+ vrp->vrp_irq = 0xFFFF;
+
+ /* Still more pending? */
+ if (i8259_is_pending()) {
+ /* XXX can probably avoid ioctls here by providing intr in vrp */
+ if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 1)) {
+ fatal("can't set INTR");
+ }
+ } else {
+ if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 0)) {
+ fatal("can't clear INTR");
+ }
+ }
+
+ if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
+ /* If run ioctl failed, exit */
+ ret = errno;
+ log_warn("%s: vm %d / vcpu %d run ioctl failed",
+ __func__, vrp->vrp_vm_id, n);
+ break;
+ }
+
+ /* If the VM is terminating, exit normally */
+ if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
+ ret = (intptr_t)NULL;
+ break;
+ }
+
+ if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
+ /*
+ * vmm(4) needs help handling an exit, handle in
+ * vcpu_exit.
+ */
+ ret = vcpu_exit(vrp);
+ if (ret)
+ break;
+ }
+ }
+
+ mutex_lock(&threadmutex);
+ vcpu_done[n] = 1;
+ pthread_cond_signal(&threadcond);
+ mutex_unlock(&threadmutex);
+
+ return ((void *)ret);
+}
+
+int
+vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
+{
+ struct vm_intr_params vip;
+
+ memset(&vip, 0, sizeof(vip));
+
+ vip.vip_vm_id = vm_id;
+ vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
+ vip.vip_intr = intr;
+
+ if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * vcpu_exit_pci
+ *
+ * Handle all I/O to the emulated PCI subsystem.
+ *
+ * Parameters:
+ * vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return value:
+ * Interrupt to inject to the guest VM, or 0xFF if no interrupt should
+ * be injected.
+ */
+uint8_t
+vcpu_exit_pci(struct vm_run_params *vrp)
+{
+ union vm_exit *vei = vrp->vrp_exit;
+ uint8_t intr;
+
+ intr = 0xFF;
+
+ switch (vei->vei.vei_port) {
+ case PCI_MODE1_ADDRESS_REG:
+ pci_handle_address_reg(vrp);
+ break;
+ case PCI_MODE1_DATA_REG:
+ pci_handle_data_reg(vrp);
+ break;
+ case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
+ intr = pci_handle_io(vrp);
+ break;
+ default:
+ log_warnx("%s: unknown PCI register 0x%llx",
+ __progname, (uint64_t)vei->vei.vei_port);
+ break;
+ }
+
+ return (intr);
+}
+
+/*
+ * vcpu_exit_inout
+ *
+ * Handle all I/O exits that need to be emulated in vmd. This includes the
+ * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
+ *
+ * Parameters:
+ * vrp: vcpu run parameters containing guest state for this exit
+ */
+void
+vcpu_exit_inout(struct vm_run_params *vrp)
+{
+ union vm_exit *vei = vrp->vrp_exit;
+ uint8_t intr = 0xFF;
+
+ if (ioports_map[vei->vei.vei_port] != NULL)
+ intr = ioports_map[vei->vei.vei_port](vrp);
+ else if (vei->vei.vei_dir == VEI_DIR_IN)
+ vei->vei.vei_data = 0xFFFFFFFF;
+
+ if (intr != 0xFF)
+ vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
+}
+
+/*
+ * vcpu_exit
+ *
+ * Handle a vcpu exit. This function is called when it is determined that
+ * vmm(4) requires the assistance of vmd to support a particular guest
+ * exit type (eg, accessing an I/O port or device). Guest state is contained
+ * in 'vrp', and will be resent to vmm(4) on exit completion.
+ *
+ * Upon conclusion of handling the exit, the function determines if any
+ * interrupts should be injected into the guest, and asserts the proper
+ * IRQ line whose interrupt should be vectored.
+ *
+ * Parameters:
+ * vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return values:
+ * 0: the exit was handled successfully
+ * 1: an error occurred (eg, unknown exit reason passed in 'vrp')
+ */
+int
+vcpu_exit(struct vm_run_params *vrp)
+{
+ int ret;
+
+ switch (vrp->vrp_exit_reason) {
+ case VMX_EXIT_INT_WINDOW:
+ case VMX_EXIT_EXTINT:
+ case VMX_EXIT_EPT_VIOLATION:
+ case SVM_VMEXIT_NPF:
+ /*
+ * We may be exiting to vmd to handle a pending interrupt but
+ * at the same time the last exit type may have been one of
+ * these. In this case, there's nothing extra to be done
+ * here (and falling through to the default case below results
+ * in more vmd log spam).
+ */
+ break;
+ case VMX_EXIT_IO:
+ case SVM_VMEXIT_IOIO:
+ vcpu_exit_inout(vrp);
+ break;
+ case VMX_EXIT_HLT:
+ case SVM_VMEXIT_HLT:
+ ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
+ if (ret) {
+ log_warnx("%s: can't lock vcpu mutex (%d)",
+ __func__, ret);
+ return (ret);
+ }
+ vcpu_hlt[vrp->vrp_vcpu_id] = 1;
+ ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
+ if (ret) {
+ log_warnx("%s: can't unlock vcpu mutex (%d)",
+ __func__, ret);
+ return (ret);
+ }
+ break;
+ case VMX_EXIT_TRIPLE_FAULT:
+ case SVM_VMEXIT_SHUTDOWN:
+ /* XXX reset VM since we do not support reboot yet */
+ return (EAGAIN);
+ default:
+ log_debug("%s: unknown exit reason %d",
+ __progname, vrp->vrp_exit_reason);
+ }
+
+ /* Process any pending traffic */
+ vionet_process_rx(vrp->vrp_vm_id);
+
+ vrp->vrp_continue = 1;
+
+ return (0);
+}
+
+/*
+ * find_gpa_range
+ *
+ * Search for a contiguous guest physical mem range.
+ *
+ * Parameters:
+ * vcp: VM create parameters that contain the memory map to search in
+ * gpa: the starting guest physical address
+ * len: the length of the memory range
+ *
+ * Return values:
+ * NULL: on failure if there is no memory range as described by the parameters
+ * Pointer to vm_mem_range that contains the start of the range otherwise.
+ */
+static struct vm_mem_range *
+find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
+{
+ size_t i, n;
+ struct vm_mem_range *vmr;
+
+ /* Find the first vm_mem_range that contains gpa */
+ for (i = 0; i < vcp->vcp_nmemranges; i++) {
+ vmr = &vcp->vcp_memranges[i];
+ if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
+ break;
+ }
+
+ /* No range found. */
+ if (i == vcp->vcp_nmemranges)
+ return (NULL);
+
+ /*
+ * vmr may cover the range [gpa, gpa + len) only partly. Make
+ * sure that the following vm_mem_ranges are contiguous and
+ * cover the rest.
+ */
+ n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
+ if (len < n)
+ len = 0;
+ else
+ len -= n;
+ gpa = vmr->vmr_gpa + vmr->vmr_size;
+ for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
+ vmr = &vcp->vcp_memranges[i];
+ if (gpa != vmr->vmr_gpa)
+ return (NULL);
+ if (len <= vmr->vmr_size)
+ len = 0;
+ else
+ len -= vmr->vmr_size;
+
+ gpa = vmr->vmr_gpa + vmr->vmr_size;
+ }
+
+ if (len != 0)
+ return (NULL);
+
+ return (vmr);
+}
+
+/*
+ * write_mem
+ *
+ * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
+ *
+ * Parameters:
+ * dst: the destination paddr_t in the guest VM
+ * buf: data to copy
+ * len: number of bytes to copy
+ *
+ * Return values:
+ * 0: success
+ * EINVAL: if the guest physical memory range [dst, dst + len) does not
+ * exist in the guest.
+ */
+int
+write_mem(paddr_t dst, void *buf, size_t len)
+{
+ char *from = buf, *to;
+ size_t n, off;
+ struct vm_mem_range *vmr;
+
+ vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
+ if (vmr == NULL) {
+ errno = EINVAL;
+ log_warn("%s: failed - invalid memory range dst = 0x%lx, "
+ "len = 0x%zx", __func__, dst, len);
+ return (EINVAL);
+ }
+
+ off = dst - vmr->vmr_gpa;
+ while (len != 0) {
+ n = vmr->vmr_size - off;
+ if (len < n)
+ n = len;
+
+ to = (char *)vmr->vmr_va + off;
+ memcpy(to, from, n);
+
+ from += n;
+ len -= n;
+ off = 0;
+ vmr++;
+ }
+
+ return (0);
+}
+
+/*
+ * read_mem
+ *
+ * Reads memory at guest paddr 'src' into 'buf'.
+ *
+ * Parameters:
+ * src: the source paddr_t in the guest VM to read from.
+ * buf: destination (local) buffer
+ * len: number of bytes to read
+ *
+ * Return values:
+ * 0: success
+ * EINVAL: if the guest physical memory range [src, src + len) does not
+ * exist in the guest.
+ */
+int
+read_mem(paddr_t src, void *buf, size_t len)
+{
+ char *from, *to = buf;
+ size_t n, off;
+ struct vm_mem_range *vmr;
+
+ vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
+ if (vmr == NULL) {
+ errno = EINVAL;
+ log_warn("%s: failed - invalid memory range src = 0x%lx, "
+ "len = 0x%zx", __func__, src, len);
+ return (EINVAL);
+ }
+
+ off = src - vmr->vmr_gpa;
+ while (len != 0) {
+ n = vmr->vmr_size - off;
+ if (len < n)
+ n = len;
+
+ from = (char *)vmr->vmr_va + off;
+ memcpy(to, from, n);
+
+ to += n;
+ len -= n;
+ off = 0;
+ vmr++;
+ }
+
+ return (0);
+}
+
+/*
+ * vcpu_assert_pic_irq
+ *
+ * Injects the specified IRQ on the supplied vcpu/vm
+ *
+ * Parameters:
+ * vm_id: VM ID to inject to
+ * vcpu_id: VCPU ID to inject to
+ * irq: IRQ to inject
+ */
+void
+vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
+{
+ int ret;
+
+ i8259_assert_irq(irq);
+
+ if (i8259_is_pending()) {
+ if (vcpu_pic_intr(vm_id, vcpu_id, 1))
+ fatalx("%s: can't assert INTR", __func__);
+
+ ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
+ if (ret)
+ fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
+
+ vcpu_hlt[vcpu_id] = 0;
+ ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
+ if (ret)
+ fatalx("%s: can't signal (%d)", __func__, ret);
+ ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
+ if (ret)
+ fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
+ }
+}
+
+/*
+ * fd_hasdata
+ *
+ * Determines if data can be read from a file descriptor.
+ *
+ * Parameters:
+ * fd: the fd to check
+ *
+ * Return values:
+ * 1 if data can be read from an fd, or 0 otherwise.
+ */
+int
+fd_hasdata(int fd)
+{
+ struct pollfd pfd[1];
+ int nready, hasdata = 0;
+
+ pfd[0].fd = fd;
+ pfd[0].events = POLLIN;
+ nready = poll(pfd, 1, 0);
+ if (nready == -1)
+ log_warn("checking file descriptor for data failed");
+ else if (nready == 1 && pfd[0].revents & POLLIN)
+ hasdata = 1;
+ return (hasdata);
+}
+
+/*
+ * mutex_lock
+ *
+ * Wrapper function for pthread_mutex_lock that does error checking and that
+ * exits on failure
+ */
+void
+mutex_lock(pthread_mutex_t *m)
+{
+ int ret;
+
+ ret = pthread_mutex_lock(m);
+ if (ret) {
+ errno = ret;
+ fatal("could not acquire mutex");
+ }
+}
+
+/*
+ * mutex_unlock
+ *
+ * Wrapper function for pthread_mutex_unlock that does error checking and that
+ * exits on failure
+ */
+void
+mutex_unlock(pthread_mutex_t *m)
+{
+ int ret;
+
+ ret = pthread_mutex_unlock(m);
+ if (ret) {
+ errno = ret;
+ fatal("could not release mutex");
+ }
+}
diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h
index 34f8f2192f5..3046bb36c2b 100644
--- a/usr.sbin/vmd/vmd.h
+++ b/usr.sbin/vmd/vmd.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmd.h,v 1.45 2017/03/01 07:43:33 reyk Exp $ */
+/* $OpenBSD: vmd.h,v 1.46 2017/03/01 18:00:50 reyk Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -229,6 +229,10 @@ int opentap(char *);
int fd_hasdata(int);
void mutex_lock(pthread_mutex_t *);
void mutex_unlock(pthread_mutex_t *);
+int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));
+
+/* vm.c */
+int start_vm(struct vmd_vm *, int);
/* control.c */
int config_init(struct vmd *);
diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c
index 5f46d7a567e..ef4bc810876 100644
--- a/usr.sbin/vmd/vmm.c
+++ b/usr.sbin/vmd/vmm.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm.c,v 1.65 2017/01/24 09:58:00 mlarkin Exp $ */
+/* $OpenBSD: vmm.c,v 1.66 2017/03/01 18:00:50 reyk Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -52,102 +52,22 @@
#include "vmd.h"
#include "vmm.h"
-#include "loadfile.h"
-#include "pci.h"
-#include "virtio.h"
-#include "proc.h"
-#include "i8253.h"
-#include "i8259.h"
-#include "ns8250.h"
-#include "mc146818.h"
-
-io_fn_t ioports_map[MAX_PORTS];
void vmm_sighdlr(int, short, void *);
-int opentap(char *);
-int start_vm(struct imsg *, uint32_t *);
-int terminate_vm(struct vm_terminate_params *);
-int get_info_vm(struct privsep *, struct imsg *, int);
-int run_vm(int *, int *, struct vm_create_params *, struct vcpu_reg_state *);
-void *event_thread(void *);
-void *vcpu_run_loop(void *);
-int vcpu_exit(struct vm_run_params *);
-int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
-void create_memory_map(struct vm_create_params *);
-int alloc_guest_mem(struct vm_create_params *);
-int vmm_create_vm(struct vm_create_params *);
-void init_emulated_hw(struct vm_create_params *, int *, int *);
-void vcpu_exit_inout(struct vm_run_params *);
-uint8_t vcpu_exit_pci(struct vm_run_params *);
+int vmm_start_vm(struct imsg *, uint32_t *);
int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
void vmm_run(struct privsep *, struct privsep_proc *, void *);
-int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
-
-int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));
void vmm_dispatch_vm(int, short, void *);
-void vm_dispatch_vmm(int, short, void *);
-
-static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
- size_t);
-
-int con_fd;
-struct vmd_vm *current_vm;
+int terminate_vm(struct vm_terminate_params *);
+int get_info_vm(struct privsep *, struct imsg *, int);
+int opentap(char *);
extern struct vmd *env;
-extern char *__progname;
-
-pthread_mutex_t threadmutex;
-pthread_cond_t threadcond;
-
-pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
-pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
-uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
-uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
-
static struct privsep_proc procs[] = {
{ "parent", PROC_PARENT, vmm_dispatch_parent },
};
-/*
- * Represents a standard register set for an OS to be booted
- * as a flat 32 bit address space, before paging is enabled.
- *
- * NOT set here are:
- * RIP
- * RSP
- * GDTR BASE
- *
- * Specific bootloaders should clone this structure and override
- * those fields as needed.
- *
- * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
- * features of the CPU in use.
- */
-static const struct vcpu_reg_state vcpu_init_flat32 = {
-#ifdef __i386__
- .vrs_gprs[VCPU_REGS_EFLAGS] = 0x2,
- .vrs_gprs[VCPU_REGS_EIP] = 0x0,
- .vrs_gprs[VCPU_REGS_ESP] = 0x0,
-#else
- .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
- .vrs_gprs[VCPU_REGS_RIP] = 0x0,
- .vrs_gprs[VCPU_REGS_RSP] = 0x0,
-#endif
- .vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,
- .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
- .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
- .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
- .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
- .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
- .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
-};
-
void
vmm(struct privsep *ps, struct privsep_proc *p)
{
@@ -212,7 +132,7 @@ vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
}
break;
case IMSG_VMDOP_START_VM_END:
- res = start_vm(imsg, &id);
+ res = vmm_start_vm(imsg, &id);
cmd = IMSG_VMDOP_START_VM_RESPONSE;
break;
case IMSG_VMDOP_TERMINATE_VM_REQUEST:
@@ -386,6 +306,12 @@ vmm_shutdown(void)
}
}
+/*
+ * vmm_pipe
+ *
+ * Create a new imsg control channel between vmm parent and a VM
+ * (can be called on both sides).
+ */
int
vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
{
@@ -404,6 +330,11 @@ vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
return (0);
}
+/*
+ * vmm_dispatch_vm
+ *
+ * imsg callback for messages that are received from a VM child process.
+ */
void
vmm_dispatch_vm(int fd, short event, void *arg)
{
@@ -456,100 +387,6 @@ vmm_dispatch_vm(int fd, short event, void *arg)
imsg_event_add(iev);
}
-void
-vm_dispatch_vmm(int fd, short event, void *arg)
-{
- struct vmd_vm *vm = arg;
- struct imsgev *iev = &vm->vm_iev;
- struct imsgbuf *ibuf = &iev->ibuf;
- struct imsg imsg;
- ssize_t n;
- int verbose;
-
- if (event & EV_READ) {
- if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
- fatal("%s: imsg_read", __func__);
- if (n == 0)
- _exit(0);
- }
-
- if (event & EV_WRITE) {
- if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
- fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
- if (n == 0)
- _exit(0);
- }
-
- for (;;) {
- if ((n = imsg_get(ibuf, &imsg)) == -1)
- fatal("%s: imsg_get", __func__);
- if (n == 0)
- break;
-
-#if DEBUG > 1
- log_debug("%s: got imsg %d from %s",
- __func__, imsg.hdr.type,
- vm->vm_params.vmc_params.vcp_name);
-#endif
-
- switch (imsg.hdr.type) {
- case IMSG_CTL_VERBOSE:
- IMSG_SIZE_CHECK(&imsg, &verbose);
- memcpy(&verbose, imsg.data, sizeof(verbose));
- log_setverbose(verbose);
- break;
- case IMSG_VMDOP_VM_SHUTDOWN:
- if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
- _exit(0);
- break;
- case IMSG_VMDOP_VM_REBOOT:
- if (vmmci_ctl(VMMCI_REBOOT) == -1)
- _exit(0);
- break;
- default:
- fatalx("%s: got invalid imsg %d from %s",
- __func__, imsg.hdr.type,
- vm->vm_params.vmc_params.vcp_name);
- }
- imsg_free(&imsg);
- }
- imsg_event_add(iev);
-}
-
-/*
- * vcpu_reset
- *
- * Requests vmm(4) to reset the VCPUs in the indicated VM to
- * the register state provided
- *
- * Parameters
- * vmid: VM ID to reset
- * vcpu_id: VCPU ID to reset
- * vrs: the register state to initialize
- *
- * Return values:
- * 0: success
- * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
- * valid)
- */
-int
-vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
-{
- struct vm_resetcpu_params vrp;
-
- memset(&vrp, 0, sizeof(vrp));
- vrp.vrp_vm_id = vmid;
- vrp.vrp_vcpu_id = vcpu_id;
- memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
-
- log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
-
- if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
- return (errno);
-
- return (0);
-}
-
/*
* terminate_vm
*
@@ -605,40 +442,26 @@ opentap(char *ifname)
}
/*
- * start_vm
- *
- * Starts a new VM with the creation parameters supplied (in the incoming
- * imsg->data field). This function performs a basic sanity check on the
- * incoming parameters and then performs the following steps to complete
- * the creation of the VM:
+ * vmm_start_vm
*
- * 1. opens the VM disk image files specified in the VM creation parameters
- * 2. opens the specified VM kernel
- * 3. creates a VM console tty pair using openpty
- * 4. forks, passing the file descriptors opened in steps 1-3 to the child
- * vmd responsible for dropping privilege and running the VM's VCPU
- * loops.
+ * Prepares and forks a new VM process.
*
* Parameters:
- * imsg: The incoming imsg body whose 'data' field is a vm_create_params
- * struct containing the VM creation parameters.
- * id: Returns the VM id as reported by the kernel.
+ * imsg: The VM data structure that is including the VM create parameters.
+ * id: Returns the VM id as reported by the kernel and obtained from the VM.
*
* Return values:
* 0: success
* !0 : failure - typically an errno indicating the source of the failure
*/
int
-start_vm(struct imsg *imsg, uint32_t *id)
+vmm_start_vm(struct imsg *imsg, uint32_t *id)
{
struct vm_create_params *vcp;
- struct vmboot_params vmboot;
struct vmd_vm *vm;
- size_t i;
int ret = EINVAL;
- int fds[2], nicfds[VMM_MAX_NICS_PER_VM];
- struct vcpu_reg_state vrs;
- FILE *kernfp;
+ int fds[2];
+ size_t i;
if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
log_warnx("%s: can't find vm", __func__);
@@ -668,6 +491,7 @@ start_vm(struct imsg *imsg, uint32_t *id)
if (ret > 0) {
/* Parent */
vm->vm_pid = ret;
+ close(fds[1]);
for (i = 0 ; i < vcp->vcp_ndisks; i++) {
close(vm->vm_disks[i]);
@@ -686,7 +510,6 @@ start_vm(struct imsg *imsg, uint32_t *id)
vm->vm_tty = -1;
/* read back the kernel-generated vm id from the child */
- close(fds[1]);
if (read(fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
sizeof(vcp->vcp_id))
fatal("read vcp id");
@@ -702,76 +525,9 @@ start_vm(struct imsg *imsg, uint32_t *id)
return (0);
} else {
/* Child */
- setproctitle("%s", vcp->vcp_name);
- log_procinit(vcp->vcp_name);
-
- create_memory_map(vcp);
- ret = alloc_guest_mem(vcp);
- if (ret) {
- errno = ret;
- fatal("could not allocate guest memory - exiting");
- }
-
- ret = vmm_create_vm(vcp);
- current_vm = vm;
-
- /* send back the kernel-generated vm id (0 on error) */
close(fds[0]);
- if (write(fds[1], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
- sizeof(vcp->vcp_id))
- fatal("write vcp id");
-
- if (ret) {
- errno = ret;
- fatal("create vmm ioctl failed - exiting");
- }
-
- /*
- * pledge in the vm processes:
- * stdio - for malloc and basic I/O including events.
- * vmm - for the vmm ioctls and operations.
- */
- if (pledge("stdio vmm", NULL) == -1)
- fatal("pledge");
-
- /*
- * Set up default "flat 32 bit" register state - RIP,
- * RSP, and GDT info will be set in bootloader
- */
- memcpy(&vrs, &vcpu_init_flat32, sizeof(struct vcpu_reg_state));
-
- /* Find and open kernel image */
- if ((kernfp = vmboot_open(vm->vm_kernel,
- vm->vm_disks[0], &vmboot)) == NULL)
- fatalx("failed to open kernel - exiting");
-
- /* Load kernel image */
- ret = loadelf_main(kernfp, vcp, &vrs,
- vmboot.vbp_bootdev, vmboot.vbp_howto);
- if (ret) {
- errno = ret;
- fatal("failed to load kernel - exiting");
- }
-
- vmboot_close(kernfp, &vmboot);
-
- if (vm->vm_kernel != -1)
- close(vm->vm_kernel);
-
- con_fd = vm->vm_tty;
- if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
- fatal("failed to set nonblocking mode on console");
-
- for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
- nicfds[i] = vm->vm_ifs[i].vif_fd;
-
- event_init();
-
- if (vmm_pipe(vm, fds[1], vm_dispatch_vmm) == -1)
- fatal("setup vm pipe");
- /* Execute the vcpu run loop(s) for this VM */
- ret = run_vm(vm->vm_disks, nicfds, vcp, &vrs);
+ ret = start_vm(vm, fds[1]);
_exit(ret);
}
@@ -864,924 +620,3 @@ get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
free(info);
return (0);
}
-
-/*
- * create_memory_map
- *
- * Sets up the guest physical memory ranges that the VM can access.
- *
- * Return values:
- * nothing
- */
-void
-create_memory_map(struct vm_create_params *vcp)
-{
- size_t len, mem_bytes, mem_mb;
-
- mem_mb = vcp->vcp_memranges[0].vmr_size;
- vcp->vcp_nmemranges = 0;
- if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
- return;
-
- mem_bytes = mem_mb * 1024 * 1024;
-
- /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
- len = LOWMEM_KB * 1024;
- vcp->vcp_memranges[0].vmr_gpa = 0x0;
- vcp->vcp_memranges[0].vmr_size = len;
- mem_bytes -= len;
-
- /*
- * Second memory region: LOWMEM_KB - 1MB.
- *
- * N.B. - Normally ROMs or parts of video RAM are mapped here.
- * We have to add this region, because some systems
- * unconditionally write to 0xb8000 (VGA RAM), and
- * we need to make sure that vmm(4) permits accesses
- * to it. So allocate guest memory for it.
- */
- len = 0x100000 - LOWMEM_KB * 1024;
- vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
- vcp->vcp_memranges[1].vmr_size = len;
- mem_bytes -= len;
-
- /* Make sure that we do not place physical memory into MMIO ranges. */
- if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
- len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
- else
- len = mem_bytes;
-
- /* Third memory region: 1MB - (1MB + len) */
- vcp->vcp_memranges[2].vmr_gpa = 0x100000;
- vcp->vcp_memranges[2].vmr_size = len;
- mem_bytes -= len;
-
- if (mem_bytes > 0) {
- /* Fourth memory region for the remaining memory (if any) */
- vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
- vcp->vcp_memranges[3].vmr_size = mem_bytes;
- vcp->vcp_nmemranges = 4;
- } else
- vcp->vcp_nmemranges = 3;
-}
-
-/*
- * alloc_guest_mem
- *
- * Allocates memory for the guest.
- * Instead of doing a single allocation with one mmap(), we allocate memory
- * separately for every range for the following reasons:
- * - ASLR for the individual ranges
- * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
- * map the single mmap'd userspace memory to the individual guest physical
- * memory ranges, the underlying amap of the single mmap'd range would have
- * to allocate per-page reference counters. The reason is that the
- * individual guest physical ranges would reference the single mmap'd region
- * only partially. However, if every guest physical range has its own
- * corresponding mmap'd userspace allocation, there are no partial
- * references: every guest physical range fully references an mmap'd
- * range => no per-page reference counters have to be allocated.
- *
- * Return values:
- * 0: success
- * !0: failure - errno indicating the source of the failure
- */
-int
-alloc_guest_mem(struct vm_create_params *vcp)
-{
- void *p;
- int ret;
- size_t i, j;
- struct vm_mem_range *vmr;
-
- for (i = 0; i < vcp->vcp_nmemranges; i++) {
- vmr = &vcp->vcp_memranges[i];
- p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANON, -1, 0);
- if (p == MAP_FAILED) {
- ret = errno;
- for (j = 0; j < i; j++) {
- vmr = &vcp->vcp_memranges[j];
- munmap((void *)vmr->vmr_va, vmr->vmr_size);
- }
-
- return (ret);
- }
-
- vmr->vmr_va = (vaddr_t)p;
- }
-
- return (0);
-}
-
-/*
- * vmm_create_vm
- *
- * Requests vmm(4) to create a new VM using the supplied creation
- * parameters. This operation results in the creation of the in-kernel
- * structures for the VM, but does not start the VM's vcpu(s).
- *
- * Parameters:
- * vcp: vm_create_params struct containing the VM's desired creation
- * configuration
- *
- * Return values:
- * 0: success
- * !0 : ioctl to vmm(4) failed
- */
-int
-vmm_create_vm(struct vm_create_params *vcp)
-{
- /* Sanity check arguments */
- if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nmemranges == 0 ||
- vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
- return (EINVAL);
-
- if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
- return (EINVAL);
-
- if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
- return (errno);
-
- return (0);
-}
-
-/*
- * init_emulated_hw
- *
- * Initializes the userspace hardware emulation
- */
-void
-init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
- int *child_taps)
-{
- int i;
-
- /* Reset the IO port map */
- memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
-
- /* Init i8253 PIT */
- i8253_init(vcp->vcp_id);
- ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
-
- /* Init mc146818 RTC */
- mc146818_init(vcp->vcp_id);
- ioports_map[IO_RTC] = vcpu_exit_mc146818;
- ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
-
- /* Init master and slave PICs */
- i8259_init();
- ioports_map[IO_ICU1] = vcpu_exit_i8259;
- ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
- ioports_map[IO_ICU2] = vcpu_exit_i8259;
- ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
-
- /* Init ns8250 UART */
- ns8250_init(con_fd, vcp->vcp_id);
- for (i = COM1_DATA; i <= COM1_SCR; i++)
- ioports_map[i] = vcpu_exit_com;
-
- /* Initialize PCI */
- for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
- ioports_map[i] = vcpu_exit_pci;
-
- ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
- pci_init();
-
- /* Initialize virtio devices */
- virtio_init(vcp, child_disks, child_taps);
-}
-
-/*
- * run_vm
- *
- * Runs the VM whose creation parameters are specified in vcp
- *
- * Parameters:
- * child_disks: previously-opened child VM disk file file descriptors
- * child_taps: previously-opened child tap file descriptors
- * vcp: vm_create_params struct containing the VM's desired creation
- * configuration
- * vrs: VCPU register state to initialize
- *
- * Return values:
- * 0: the VM exited normally
- * !0 : the VM exited abnormally or failed to start
- */
-int
-run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp,
- struct vcpu_reg_state *vrs)
-{
- uint8_t evdone = 0;
- size_t i;
- int ret;
- pthread_t *tid, evtid;
- struct vm_run_params **vrp;
- void *exit_status;
-
- if (vcp == NULL)
- return (EINVAL);
-
- if (child_disks == NULL && vcp->vcp_ndisks != 0)
- return (EINVAL);
-
- if (child_taps == NULL && vcp->vcp_nnics != 0)
- return (EINVAL);
-
- if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nmemranges == 0 ||
- vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
- return (EINVAL);
-
- tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
- vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
- if (tid == NULL || vrp == NULL) {
- log_warn("%s: memory allocation error - exiting.",
- __progname);
- return (ENOMEM);
- }
-
- log_debug("%s: initializing hardware for vm %s", __func__,
- vcp->vcp_name);
-
- init_emulated_hw(vcp, child_disks, child_taps);
-
- ret = pthread_mutex_init(&threadmutex, NULL);
- if (ret) {
- log_warn("%s: could not initialize thread state mutex",
- __func__);
- return (ret);
- }
- ret = pthread_cond_init(&threadcond, NULL);
- if (ret) {
- log_warn("%s: could not initialize thread state "
- "condition variable", __func__);
- return (ret);
- }
-
- mutex_lock(&threadmutex);
-
- log_debug("%s: starting vcpu threads for vm %s", __func__,
- vcp->vcp_name);
-
- /*
- * Create and launch one thread for each VCPU. These threads may
- * migrate between PCPUs over time; the need to reload CPU state
- * in such situations is detected and performed by vmm(4) in the
- * kernel.
- */
- for (i = 0 ; i < vcp->vcp_ncpus; i++) {
- vrp[i] = malloc(sizeof(struct vm_run_params));
- if (vrp[i] == NULL) {
- log_warn("%s: memory allocation error - "
- "exiting.", __progname);
- /* caller will exit, so skip free'ing */
- return (ENOMEM);
- }
- vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
- if (vrp[i]->vrp_exit == NULL) {
- log_warn("%s: memory allocation error - "
- "exiting.", __progname);
- /* caller will exit, so skip free'ing */
- return (ENOMEM);
- }
- vrp[i]->vrp_vm_id = vcp->vcp_id;
- vrp[i]->vrp_vcpu_id = i;
-
- if (vcpu_reset(vcp->vcp_id, i, vrs)) {
- log_warnx("%s: cannot reset VCPU %zu - exiting.",
- __progname, i);
- return (EIO);
- }
-
- ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
- if (ret) {
- log_warnx("%s: cannot initialize cond var (%d)",
- __progname, ret);
- return (ret);
- }
-
- ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
- if (ret) {
- log_warnx("%s: cannot initialize mtx (%d)",
- __progname, ret);
- return (ret);
- }
-
- vcpu_hlt[i] = 0;
-
- /* Start each VCPU run thread at vcpu_run_loop */
- ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
- if (ret) {
- /* caller will _exit after this return */
- ret = errno;
- log_warn("%s: could not create vcpu thread %zu",
- __func__, i);
- return (ret);
- }
- }
-
- log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
- ret = pthread_create(&evtid, NULL, event_thread, &evdone);
- if (ret) {
- errno = ret;
- log_warn("%s: could not create event thread", __func__);
- return (ret);
- }
-
- for (;;) {
- ret = pthread_cond_wait(&threadcond, &threadmutex);
- if (ret) {
- log_warn("%s: waiting on thread state condition "
- "variable failed", __func__);
- return (ret);
- }
-
- /*
- * Did a VCPU thread exit with an error? => return the first one
- */
- for (i = 0; i < vcp->vcp_ncpus; i++) {
- if (vcpu_done[i] == 0)
- continue;
-
- if (pthread_join(tid[i], &exit_status)) {
- log_warn("%s: failed to join thread %zd - "
- "exiting", __progname, i);
- return (EIO);
- }
-
- ret = (long long)exit_status;
- }
-
- /* Did the event thread exit? => return with an error */
- if (evdone) {
- if (pthread_join(evtid, &exit_status)) {
- log_warn("%s: failed to join event thread - "
- "exiting", __progname);
- return (EIO);
- }
-
- log_warnx("%s: vm %d event thread exited "
- "unexpectedly", __progname, vcp->vcp_id);
- return (EIO);
- }
-
- /* Did all VCPU threads exit successfully? => return */
- for (i = 0; i < vcp->vcp_ncpus; i++) {
- if (vcpu_done[i] == 0)
- break;
- }
- if (i == vcp->vcp_ncpus)
- return (ret);
-
- /* Some more threads to wait for, start over */
- }
-
- return (ret);
-}
-
-void *
-event_thread(void *arg)
-{
- uint8_t *donep = arg;
- intptr_t ret;
-
- ret = event_dispatch();
-
- mutex_lock(&threadmutex);
- *donep = 1;
- pthread_cond_signal(&threadcond);
- mutex_unlock(&threadmutex);
-
- return (void *)ret;
- }
-
-/*
- * vcpu_run_loop
- *
- * Runs a single VCPU until vmm(4) requires help handling an exit,
- * or the VM terminates.
- *
- * Parameters:
- * arg: vcpu_run_params for the VCPU being run by this thread
- *
- * Return values:
- * NULL: the VCPU shutdown properly
- * !NULL: error processing VCPU run, or the VCPU shutdown abnormally
- */
-void *
-vcpu_run_loop(void *arg)
-{
- struct vm_run_params *vrp = (struct vm_run_params *)arg;
- intptr_t ret = 0;
- int irq;
- uint32_t n;
-
- vrp->vrp_continue = 0;
- n = vrp->vrp_vcpu_id;
-
- for (;;) {
- ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
-
- if (ret) {
- log_warnx("%s: can't lock vcpu run mtx (%d)",
- __func__, (int)ret);
- return ((void *)ret);
- }
-
- /* If we are halted, wait */
- if (vcpu_hlt[n]) {
- ret = pthread_cond_wait(&vcpu_run_cond[n],
- &vcpu_run_mtx[n]);
-
- if (ret) {
- log_warnx("%s: can't wait on cond (%d)",
- __func__, (int)ret);
- (void)pthread_mutex_unlock(&vcpu_run_mtx[n]);
- break;
- }
- }
-
- ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
- if (ret) {
- log_warnx("%s: can't unlock mutex on cond (%d)",
- __func__, (int)ret);
- break;
- }
-
- if (vrp->vrp_irqready && i8259_is_pending()) {
- irq = i8259_ack();
- vrp->vrp_irq = irq;
- } else
- vrp->vrp_irq = 0xFFFF;
-
- /* Still more pending? */
- if (i8259_is_pending()) {
- /* XXX can probably avoid ioctls here by providing intr in vrp */
- if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 1)) {
- fatal("can't set INTR");
- }
- } else {
- if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 0)) {
- fatal("can't clear INTR");
- }
- }
-
- if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
- /* If run ioctl failed, exit */
- ret = errno;
- log_warn("%s: vm %d / vcpu %d run ioctl failed",
- __func__, vrp->vrp_vm_id, n);
- break;
- }
-
- /* If the VM is terminating, exit normally */
- if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
- ret = (intptr_t)NULL;
- break;
- }
-
- if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
- /*
- * vmm(4) needs help handling an exit, handle in
- * vcpu_exit.
- */
- ret = vcpu_exit(vrp);
- if (ret)
- break;
- }
- }
-
- mutex_lock(&threadmutex);
- vcpu_done[n] = 1;
- pthread_cond_signal(&threadcond);
- mutex_unlock(&threadmutex);
-
- return ((void *)ret);
-}
-
-int
-vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
-{
- struct vm_intr_params vip;
-
- memset(&vip, 0, sizeof(vip));
-
- vip.vip_vm_id = vm_id;
- vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
- vip.vip_intr = intr;
-
- if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0)
- return (errno);
-
- return (0);
-}
-
-/*
- * vcpu_exit_pci
- *
- * Handle all I/O to the emulated PCI subsystem.
- *
- * Parameters:
- * vrp: vcpu run paramters containing guest state for this exit
- *
- * Return value:
- * Interrupt to inject to the guest VM, or 0xFF if no interrupt should
- * be injected.
- */
-uint8_t
-vcpu_exit_pci(struct vm_run_params *vrp)
-{
- union vm_exit *vei = vrp->vrp_exit;
- uint8_t intr;
-
- intr = 0xFF;
-
- switch (vei->vei.vei_port) {
- case PCI_MODE1_ADDRESS_REG:
- pci_handle_address_reg(vrp);
- break;
- case PCI_MODE1_DATA_REG:
- pci_handle_data_reg(vrp);
- break;
- case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
- intr = pci_handle_io(vrp);
- break;
- default:
- log_warnx("%s: unknown PCI register 0x%llx",
- __progname, (uint64_t)vei->vei.vei_port);
- break;
- }
-
- return (intr);
-}
-
-/*
- * vcpu_exit_inout
- *
- * Handle all I/O exits that need to be emulated in vmd. This includes the
- * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- */
-void
-vcpu_exit_inout(struct vm_run_params *vrp)
-{
- union vm_exit *vei = vrp->vrp_exit;
- uint8_t intr = 0xFF;
-
- if (ioports_map[vei->vei.vei_port] != NULL)
- intr = ioports_map[vei->vei.vei_port](vrp);
- else if (vei->vei.vei_dir == VEI_DIR_IN)
- vei->vei.vei_data = 0xFFFFFFFF;
-
- if (intr != 0xFF)
- vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
-}
-
-/*
- * vcpu_exit
- *
- * Handle a vcpu exit. This function is called when it is determined that
- * vmm(4) requires the assistance of vmd to support a particular guest
- * exit type (eg, accessing an I/O port or device). Guest state is contained
- * in 'vrp', and will be resent to vmm(4) on exit completion.
- *
- * Upon conclusion of handling the exit, the function determines if any
- * interrupts should be injected into the guest, and asserts the proper
- * IRQ line whose interrupt should be vectored.
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- *
- * Return values:
- * 0: the exit was handled successfully
- * 1: an error occurred (eg, unknown exit reason passed in 'vrp')
- */
-int
-vcpu_exit(struct vm_run_params *vrp)
-{
- int ret;
-
- switch (vrp->vrp_exit_reason) {
- case VMX_EXIT_INT_WINDOW:
- case VMX_EXIT_EXTINT:
- case VMX_EXIT_EPT_VIOLATION:
- case SVM_VMEXIT_NPF:
- /*
- * We may be exiting to vmd to handle a pending interrupt but
- * at the same time the last exit type may have been one of
- * these. In this case, there's nothing extra to be done
- * here (and falling through to the default case below results
- * in more vmd log spam).
- */
- break;
- case VMX_EXIT_IO:
- case SVM_VMEXIT_IOIO:
- vcpu_exit_inout(vrp);
- break;
- case VMX_EXIT_HLT:
- case SVM_VMEXIT_HLT:
- ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
- if (ret) {
- log_warnx("%s: can't lock vcpu mutex (%d)",
- __func__, ret);
- return (ret);
- }
- vcpu_hlt[vrp->vrp_vcpu_id] = 1;
- ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
- if (ret) {
- log_warnx("%s: can't unlock vcpu mutex (%d)",
- __func__, ret);
- return (ret);
- }
- break;
- case VMX_EXIT_TRIPLE_FAULT:
- case SVM_VMEXIT_SHUTDOWN:
- /* XXX reset VM since we do not support reboot yet */
- return (EAGAIN);
- default:
- log_debug("%s: unknown exit reason %d",
- __progname, vrp->vrp_exit_reason);
- }
-
- /* Process any pending traffic */
- vionet_process_rx(vrp->vrp_vm_id);
-
- vrp->vrp_continue = 1;
-
- return (0);
-}
-
-/*
- * find_gpa_range
- *
- * Search for a contiguous guest physical mem range.
- *
- * Parameters:
- * vcp: VM create parameters that contain the memory map to search in
- * gpa: the starting guest physical address
- * len: the length of the memory range
- *
- * Return values:
- * NULL: on failure if there is no memory range as described by the parameters
- * Pointer to vm_mem_range that contains the start of the range otherwise.
- */
-static struct vm_mem_range *
-find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
-{
- size_t i, n;
- struct vm_mem_range *vmr;
-
- /* Find the first vm_mem_range that contains gpa */
- for (i = 0; i < vcp->vcp_nmemranges; i++) {
- vmr = &vcp->vcp_memranges[i];
- if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
- break;
- }
-
- /* No range found. */
- if (i == vcp->vcp_nmemranges)
- return (NULL);
-
- /*
- * vmr may cover the range [gpa, gpa + len) only partly. Make
- * sure that the following vm_mem_ranges are contiguous and
- * cover the rest.
- */
- n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
- if (len < n)
- len = 0;
- else
- len -= n;
- gpa = vmr->vmr_gpa + vmr->vmr_size;
- for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
- vmr = &vcp->vcp_memranges[i];
- if (gpa != vmr->vmr_gpa)
- return (NULL);
- if (len <= vmr->vmr_size)
- len = 0;
- else
- len -= vmr->vmr_size;
-
- gpa = vmr->vmr_gpa + vmr->vmr_size;
- }
-
- if (len != 0)
- return (NULL);
-
- return (vmr);
-}
-
-/*
- * write_mem
- *
- * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
- *
- * Parameters:
- * dst: the destination paddr_t in the guest VM
- * buf: data to copy
- * len: number of bytes to copy
- *
- * Return values:
- * 0: success
- * EINVAL: if the guest physical memory range [dst, dst + len) does not
- * exist in the guest.
- */
-int
-write_mem(paddr_t dst, void *buf, size_t len)
-{
- char *from = buf, *to;
- size_t n, off;
- struct vm_mem_range *vmr;
-
- vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
- if (vmr == NULL) {
- errno = EINVAL;
- log_warn("%s: failed - invalid memory range dst = 0x%lx, "
- "len = 0x%zx", __func__, dst, len);
- return (EINVAL);
- }
-
- off = dst - vmr->vmr_gpa;
- while (len != 0) {
- n = vmr->vmr_size - off;
- if (len < n)
- n = len;
-
- to = (char *)vmr->vmr_va + off;
- memcpy(to, from, n);
-
- from += n;
- len -= n;
- off = 0;
- vmr++;
- }
-
- return (0);
-}
-
-/*
- * read_mem
- *
- * Reads memory at guest paddr 'src' into 'buf'.
- *
- * Parameters:
- * src: the source paddr_t in the guest VM to read from.
- * buf: destination (local) buffer
- * len: number of bytes to read
- *
- * Return values:
- * 0: success
- * EINVAL: if the guest physical memory range [dst, dst + len) does not
- * exist in the guest.
- */
-int
-read_mem(paddr_t src, void *buf, size_t len)
-{
- char *from, *to = buf;
- size_t n, off;
- struct vm_mem_range *vmr;
-
- vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
- if (vmr == NULL) {
- errno = EINVAL;
- log_warn("%s: failed - invalid memory range src = 0x%lx, "
- "len = 0x%zx", __func__, src, len);
- return (EINVAL);
- }
-
- off = src - vmr->vmr_gpa;
- while (len != 0) {
- n = vmr->vmr_size - off;
- if (len < n)
- n = len;
-
- from = (char *)vmr->vmr_va + off;
- memcpy(to, from, n);
-
- to += n;
- len -= n;
- off = 0;
- vmr++;
- }
-
- return (0);
-}
-
-/*
- * vcpu_assert_pic_irq
- *
- * Injects the specified IRQ on the supplied vcpu/vm
- *
- * Parameters:
- * vm_id: VM ID to inject to
- * vcpu_id: VCPU ID to inject to
- * irq: IRQ to inject
- */
-void
-vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
-{
- int ret;
-
- i8259_assert_irq(irq);
-
- if (i8259_is_pending()) {
- if (vcpu_pic_intr(vm_id, vcpu_id, 1))
- fatalx("%s: can't assert INTR", __func__);
-
- ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
- if (ret)
- fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
-
- vcpu_hlt[vcpu_id] = 0;
- ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
- if (ret)
- fatalx("%s: can't signal (%d)", __func__, ret);
- ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
- if (ret)
- fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
- }
-}
-
-/*
- * fd_hasdata
- *
- * Determines if data can be read from a file descriptor.
- *
- * Parameters:
- * fd: the fd to check
- *
- * Return values:
- * 1 if data can be read from an fd, or 0 otherwise.
- */
-int
-fd_hasdata(int fd)
-{
- struct pollfd pfd[1];
- int nready, hasdata = 0;
-
- pfd[0].fd = fd;
- pfd[0].events = POLLIN;
- nready = poll(pfd, 1, 0);
- if (nready == -1)
- log_warn("checking file descriptor for data failed");
- else if (nready == 1 && pfd[0].revents & POLLIN)
- hasdata = 1;
- return (hasdata);
-}
-
-/*
- * mutex_lock
- *
- * Wrapper function for pthread_mutex_lock that does error checking and that
- * exits on failure
- */
-void
-mutex_lock(pthread_mutex_t *m)
-{
- int ret;
-
- ret = pthread_mutex_lock(m);
- if (ret) {
- errno = ret;
- fatal("could not acquire mutex");
- }
-}
-
-/*
- * mutex_unlock
- *
- * Wrapper function for pthread_mutex_unlock that does error checking and that
- * exits on failure
- */
-void
-mutex_unlock(pthread_mutex_t *m)
-{
- int ret;
-
- ret = pthread_mutex_unlock(m);
- if (ret) {
- errno = ret;
- fatal("could not release mutex");
- }
-}