summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--usr.sbin/vmd/Makefile8
-rw-r--r--usr.sbin/vmd/vm.c1262
-rw-r--r--usr.sbin/vmd/vmd.h6
-rw-r--r--usr.sbin/vmd/vmm.c1217
4 files changed, 1297 insertions, 1196 deletions
diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile
index 39fd337c581..09a9c263dee 100644
--- a/usr.sbin/vmd/Makefile
+++ b/usr.sbin/vmd/Makefile
@@ -1,11 +1,11 @@
-# $OpenBSD: Makefile,v 1.12 2016/11/26 20:03:42 reyk Exp $
+# $OpenBSD: Makefile,v 1.13 2017/03/01 18:00:50 reyk Exp $
.if ${MACHINE} == "amd64" || ${MACHINE} == "i386"
PROG= vmd
-SRCS= vmm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
-SRCS+= vmd.c control.c log.c priv.c proc.c config.c ns8250.c i8253.c
-SRCS+= vmboot.c ufs.c disklabel.c
+SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c
+SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
+SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c
SRCS+= parse.y
CFLAGS+= -Wall -I${.CURDIR}
diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c
new file mode 100644
index 00000000000..76d213e8adf
--- /dev/null
+++ b/usr.sbin/vmd/vm.c
@@ -0,0 +1,1262 @@
+/* $OpenBSD: vm.c,v 1.1 2017/03/01 18:00:50 reyk Exp $ */
+
+/*
+ * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/queue.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+
+#include <dev/ic/i8253reg.h>
+#include <dev/isa/isareg.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/param.h>
+#include <machine/psl.h>
+#include <machine/specialreg.h>
+#include <machine/vmmvar.h>
+
+#include <net/if.h>
+
+#include <errno.h>
+#include <event.h>
+#include <fcntl.h>
+#include <imsg.h>
+#include <limits.h>
+#include <poll.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <util.h>
+
+#include "vmd.h"
+#include "vmm.h"
+#include "loadfile.h"
+#include "pci.h"
+#include "virtio.h"
+#include "proc.h"
+#include "i8253.h"
+#include "i8259.h"
+#include "ns8250.h"
+#include "mc146818.h"
+
+io_fn_t ioports_map[MAX_PORTS];
+
+int run_vm(int *, int *, struct vm_create_params *, struct vcpu_reg_state *);
+void vm_dispatch_vmm(int, short, void *);
+void *event_thread(void *);
+void *vcpu_run_loop(void *);
+int vcpu_exit(struct vm_run_params *);
+int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
+void create_memory_map(struct vm_create_params *);
+int alloc_guest_mem(struct vm_create_params *);
+int vmm_create_vm(struct vm_create_params *);
+void init_emulated_hw(struct vm_create_params *, int *, int *);
+void vcpu_exit_inout(struct vm_run_params *);
+uint8_t vcpu_exit_pci(struct vm_run_params *);
+int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
+
+static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
+ size_t);
+
+int con_fd;
+struct vmd_vm *current_vm;
+
+extern struct vmd *env;
+
+extern char *__progname;
+
+pthread_mutex_t threadmutex;
+pthread_cond_t threadcond;
+
+pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
+pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
+uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
+uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
+
+/*
+ * Represents a standard register set for an OS to be booted
+ * as a flat 32 bit address space, before paging is enabled.
+ *
+ * NOT set here are:
+ * RIP
+ * RSP
+ * GDTR BASE
+ *
+ * Specific bootloaders should clone this structure and override
+ * those fields as needed.
+ *
+ * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
+ * features of the CPU in use.
+ */
+static const struct vcpu_reg_state vcpu_init_flat32 = {
+#ifdef __i386__
+ .vrs_gprs[VCPU_REGS_EFLAGS] = 0x2,
+ .vrs_gprs[VCPU_REGS_EIP] = 0x0,
+ .vrs_gprs[VCPU_REGS_ESP] = 0x0,
+#else
+ .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
+ .vrs_gprs[VCPU_REGS_RIP] = 0x0,
+ .vrs_gprs[VCPU_REGS_RSP] = 0x0,
+#endif
+ .vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,
+ .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
+ .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
+ .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+ .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
+ .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
+ .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
+ .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
+};
+
+/*
+ * start_vm
+ *
+ * After forking a new VM process, starts the new VM with the creation
+ * parameters supplied (in the incoming vm->vm_params field). This
+ * function performs a basic sanity check on the incoming parameters
+ * and then performs the following steps to complete the creation of the VM:
+ *
+ * 1. validates and creates the new VM
+ * 2. opens the imsg control channel to the parent and drops more privilege
+ * 3. drops additional privileges by calling pledge(2)
+ * 4. loads the kernel from the disk image or file descriptor
+ * 5. runs the VM's VCPU loops.
+ *
+ * Parameters:
+ * vm: The VM data structure that includes the VM create parameters.
+ * fd: The imsg socket that is connected to the parent process.
+ *
+ * Return values:
+ * 0: success
+ * !0 : failure - typically an errno indicating the source of the failure
+ */
+int
+start_vm(struct vmd_vm *vm, int fd)
+{
+ struct vm_create_params *vcp = &vm->vm_params.vmc_params;
+ struct vcpu_reg_state vrs;
+ int nicfds[VMM_MAX_NICS_PER_VM];
+ int ret;
+ FILE *kernfp;
+ struct vmboot_params vmboot;
+ size_t i;
+
+ /* Child */
+ setproctitle("%s", vcp->vcp_name);
+ log_procinit(vcp->vcp_name);
+
+ create_memory_map(vcp);
+ ret = alloc_guest_mem(vcp);
+ if (ret) {
+ errno = ret;
+ fatal("could not allocate guest memory - exiting");
+ }
+
+ ret = vmm_create_vm(vcp);
+ current_vm = vm;
+
+ /* send back the kernel-generated vm id (0 on error) */
+ if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
+ sizeof(vcp->vcp_id))
+ fatal("write vcp id");
+
+ if (ret) {
+ errno = ret;
+ fatal("create vmm ioctl failed - exiting");
+ }
+
+ /*
+ * pledge in the vm processes:
+ * stdio - for malloc and basic I/O including events.
+ * vmm - for the vmm ioctls and operations.
+ */
+ if (pledge("stdio vmm", NULL) == -1)
+ fatal("pledge");
+
+ /*
+ * Set up default "flat 32 bit" register state - RIP,
+ * RSP, and GDT info will be set in bootloader
+ */
+ memcpy(&vrs, &vcpu_init_flat32, sizeof(struct vcpu_reg_state));
+
+ /* Find and open kernel image */
+ if ((kernfp = vmboot_open(vm->vm_kernel,
+ vm->vm_disks[0], &vmboot)) == NULL)
+ fatalx("failed to open kernel - exiting");
+
+ /* Load kernel image */
+ ret = loadelf_main(kernfp, vcp, &vrs,
+ vmboot.vbp_bootdev, vmboot.vbp_howto);
+ if (ret) {
+ errno = ret;
+ fatal("failed to load kernel - exiting");
+ }
+
+ vmboot_close(kernfp, &vmboot);
+
+ if (vm->vm_kernel != -1)
+ close(vm->vm_kernel);
+
+ con_fd = vm->vm_tty;
+ if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
+ fatal("failed to set nonblocking mode on console");
+
+ for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
+ nicfds[i] = vm->vm_ifs[i].vif_fd;
+
+ event_init();
+
+ if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
+ fatal("setup vm pipe");
+
+ /* Execute the vcpu run loop(s) for this VM */
+ ret = run_vm(vm->vm_disks, nicfds, vcp, &vrs);
+
+ return (ret);
+}
+
+/*
+ * vm_dispatch_vmm
+ *
+ * imsg callback for messages that are received from the vmm parent process.
+ */
+void
+vm_dispatch_vmm(int fd, short event, void *arg)
+{
+ struct vmd_vm *vm = arg;
+ struct imsgev *iev = &vm->vm_iev;
+ struct imsgbuf *ibuf = &iev->ibuf;
+ struct imsg imsg;
+ ssize_t n;
+ int verbose;
+
+ if (event & EV_READ) {
+ if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
+ fatal("%s: imsg_read", __func__);
+ if (n == 0)
+ _exit(0);
+ }
+
+ if (event & EV_WRITE) {
+ if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
+ fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
+ if (n == 0)
+ _exit(0);
+ }
+
+ for (;;) {
+ if ((n = imsg_get(ibuf, &imsg)) == -1)
+ fatal("%s: imsg_get", __func__);
+ if (n == 0)
+ break;
+
+#if DEBUG > 1
+ log_debug("%s: got imsg %d from %s",
+ __func__, imsg.hdr.type,
+ vm->vm_params.vmc_params.vcp_name);
+#endif
+
+ switch (imsg.hdr.type) {
+ case IMSG_CTL_VERBOSE:
+ IMSG_SIZE_CHECK(&imsg, &verbose);
+ memcpy(&verbose, imsg.data, sizeof(verbose));
+ log_setverbose(verbose);
+ break;
+ case IMSG_VMDOP_VM_SHUTDOWN:
+ if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
+ _exit(0);
+ break;
+ case IMSG_VMDOP_VM_REBOOT:
+ if (vmmci_ctl(VMMCI_REBOOT) == -1)
+ _exit(0);
+ break;
+ default:
+ fatalx("%s: got invalid imsg %d from %s",
+ __func__, imsg.hdr.type,
+ vm->vm_params.vmc_params.vcp_name);
+ }
+ imsg_free(&imsg);
+ }
+ imsg_event_add(iev);
+}
+
+/*
+ * vcpu_reset
+ *
+ * Requests vmm(4) to reset the VCPUs in the indicated VM to
+ * the register state provided
+ *
+ * Parameters
+ * vmid: VM ID to reset
+ * vcpu_id: VCPU ID to reset
+ * vrs: the register state to initialize
+ *
+ * Return values:
+ * 0: success
+ * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
+ * valid)
+ */
+int
+vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
+{
+ struct vm_resetcpu_params vrp;
+
+ memset(&vrp, 0, sizeof(vrp));
+ vrp.vrp_vm_id = vmid;
+ vrp.vrp_vcpu_id = vcpu_id;
+ memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
+
+ log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
+
+ if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * create_memory_map
+ *
+ * Sets up the guest physical memory ranges that the VM can access.
+ *
+ * Return values:
+ * nothing
+ */
+void
+create_memory_map(struct vm_create_params *vcp)
+{
+ size_t len, mem_bytes, mem_mb;
+
+ mem_mb = vcp->vcp_memranges[0].vmr_size;
+ vcp->vcp_nmemranges = 0;
+ if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
+ return;
+
+ mem_bytes = mem_mb * 1024 * 1024;
+
+ /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
+ len = LOWMEM_KB * 1024;
+ vcp->vcp_memranges[0].vmr_gpa = 0x0;
+ vcp->vcp_memranges[0].vmr_size = len;
+ mem_bytes -= len;
+
+ /*
+ * Second memory region: LOWMEM_KB - 1MB.
+ *
+ * N.B. - Normally ROMs or parts of video RAM are mapped here.
+ * We have to add this region, because some systems
+ * unconditionally write to 0xb8000 (VGA RAM), and
+ * we need to make sure that vmm(4) permits accesses
+ * to it. So allocate guest memory for it.
+ */
+ len = 0x100000 - LOWMEM_KB * 1024;
+ vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
+ vcp->vcp_memranges[1].vmr_size = len;
+ mem_bytes -= len;
+
+ /* Make sure that we do not place physical memory into MMIO ranges. */
+ if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
+ len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
+ else
+ len = mem_bytes;
+
+ /* Third memory region: 1MB - (1MB + len) */
+ vcp->vcp_memranges[2].vmr_gpa = 0x100000;
+ vcp->vcp_memranges[2].vmr_size = len;
+ mem_bytes -= len;
+
+ if (mem_bytes > 0) {
+ /* Fourth memory region for the remaining memory (if any) */
+ vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
+ vcp->vcp_memranges[3].vmr_size = mem_bytes;
+ vcp->vcp_nmemranges = 4;
+ } else
+ vcp->vcp_nmemranges = 3;
+}
+
+/*
+ * alloc_guest_mem
+ *
+ * Allocates memory for the guest.
+ * Instead of doing a single allocation with one mmap(), we allocate memory
+ * separately for every range for the following reasons:
+ * - ASLR for the individual ranges
+ * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
+ * map the single mmap'd userspace memory to the individual guest physical
+ * memory ranges, the underlying amap of the single mmap'd range would have
+ * to allocate per-page reference counters. The reason is that the
+ * individual guest physical ranges would reference the single mmap'd region
+ * only partially. However, if every guest physical range has its own
+ * corresponding mmap'd userspace allocation, there are no partial
+ * references: every guest physical range fully references an mmap'd
+ * range => no per-page reference counters have to be allocated.
+ *
+ * Return values:
+ * 0: success
+ * !0: failure - errno indicating the source of the failure
+ */
+int
+alloc_guest_mem(struct vm_create_params *vcp)
+{
+ void *p;
+ int ret;
+ size_t i, j;
+ struct vm_mem_range *vmr;
+
+ for (i = 0; i < vcp->vcp_nmemranges; i++) {
+ vmr = &vcp->vcp_memranges[i];
+ p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (p == MAP_FAILED) {
+ ret = errno;
+ for (j = 0; j < i; j++) {
+ vmr = &vcp->vcp_memranges[j];
+ munmap((void *)vmr->vmr_va, vmr->vmr_size);
+ }
+
+ return (ret);
+ }
+
+ vmr->vmr_va = (vaddr_t)p;
+ }
+
+ return (0);
+}
+
+/*
+ * vmm_create_vm
+ *
+ * Requests vmm(4) to create a new VM using the supplied creation
+ * parameters. This operation results in the creation of the in-kernel
+ * structures for the VM, but does not start the VM's vcpu(s).
+ *
+ * Parameters:
+ * vcp: vm_create_params struct containing the VM's desired creation
+ * configuration
+ *
+ * Return values:
+ * 0: success
+ * !0 : ioctl to vmm(4) failed
+ */
+int
+vmm_create_vm(struct vm_create_params *vcp)
+{
+ /* Sanity check arguments */
+ if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_nmemranges == 0 ||
+ vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
+ return (EINVAL);
+
+ if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
+ return (EINVAL);
+
+ if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * init_emulated_hw
+ *
+ * Initializes the userspace hardware emulation
+ */
+void
+init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
+ int *child_taps)
+{
+ int i;
+
+ /* Reset the IO port map */
+ memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
+
+ /* Init i8253 PIT */
+ i8253_init(vcp->vcp_id);
+ ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
+ ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
+ ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
+ ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
+
+ /* Init mc146818 RTC */
+ mc146818_init(vcp->vcp_id);
+ ioports_map[IO_RTC] = vcpu_exit_mc146818;
+ ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
+
+ /* Init master and slave PICs */
+ i8259_init();
+ ioports_map[IO_ICU1] = vcpu_exit_i8259;
+ ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
+ ioports_map[IO_ICU2] = vcpu_exit_i8259;
+ ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
+
+ /* Init ns8250 UART */
+ ns8250_init(con_fd, vcp->vcp_id);
+ for (i = COM1_DATA; i <= COM1_SCR; i++)
+ ioports_map[i] = vcpu_exit_com;
+
+ /* Initialize PCI */
+ for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
+ ioports_map[i] = vcpu_exit_pci;
+
+ ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
+ ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
+ pci_init();
+
+ /* Initialize virtio devices */
+ virtio_init(vcp, child_disks, child_taps);
+}
+
+/*
+ * run_vm
+ *
+ * Runs the VM whose creation parameters are specified in vcp
+ *
+ * Parameters:
+ * child_disks: previously-opened child VM disk file descriptors
+ * child_taps: previously-opened child tap file descriptors
+ * vcp: vm_create_params struct containing the VM's desired creation
+ * configuration
+ * vrs: VCPU register state to initialize
+ *
+ * Return values:
+ * 0: the VM exited normally
+ * !0 : the VM exited abnormally or failed to start
+ */
+int
+run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp,
+ struct vcpu_reg_state *vrs)
+{
+ uint8_t evdone = 0;
+ size_t i;
+ int ret;
+ pthread_t *tid, evtid;
+ struct vm_run_params **vrp;
+ void *exit_status;
+
+ if (vcp == NULL)
+ return (EINVAL);
+
+ if (child_disks == NULL && vcp->vcp_ndisks != 0)
+ return (EINVAL);
+
+ if (child_taps == NULL && vcp->vcp_nnics != 0)
+ return (EINVAL);
+
+ if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
+ return (EINVAL);
+
+ if (vcp->vcp_nmemranges == 0 ||
+ vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
+ return (EINVAL);
+
+ tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
+ vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
+ if (tid == NULL || vrp == NULL) {
+ log_warn("%s: memory allocation error - exiting.",
+ __progname);
+ return (ENOMEM);
+ }
+
+ log_debug("%s: initializing hardware for vm %s", __func__,
+ vcp->vcp_name);
+
+ init_emulated_hw(vcp, child_disks, child_taps);
+
+ ret = pthread_mutex_init(&threadmutex, NULL);
+ if (ret) {
+ log_warn("%s: could not initialize thread state mutex",
+ __func__);
+ return (ret);
+ }
+ ret = pthread_cond_init(&threadcond, NULL);
+ if (ret) {
+ log_warn("%s: could not initialize thread state "
+ "condition variable", __func__);
+ return (ret);
+ }
+
+ mutex_lock(&threadmutex);
+
+ log_debug("%s: starting vcpu threads for vm %s", __func__,
+ vcp->vcp_name);
+
+ /*
+ * Create and launch one thread for each VCPU. These threads may
+ * migrate between PCPUs over time; the need to reload CPU state
+ * in such situations is detected and performed by vmm(4) in the
+ * kernel.
+ */
+ for (i = 0 ; i < vcp->vcp_ncpus; i++) {
+ vrp[i] = malloc(sizeof(struct vm_run_params));
+ if (vrp[i] == NULL) {
+ log_warn("%s: memory allocation error - "
+ "exiting.", __progname);
+ /* caller will exit, so skip free'ing */
+ return (ENOMEM);
+ }
+ vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
+ if (vrp[i]->vrp_exit == NULL) {
+ log_warn("%s: memory allocation error - "
+ "exiting.", __progname);
+ /* caller will exit, so skip free'ing */
+ return (ENOMEM);
+ }
+ vrp[i]->vrp_vm_id = vcp->vcp_id;
+ vrp[i]->vrp_vcpu_id = i;
+
+ if (vcpu_reset(vcp->vcp_id, i, vrs)) {
+ log_warnx("%s: cannot reset VCPU %zu - exiting.",
+ __progname, i);
+ return (EIO);
+ }
+
+ ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
+ if (ret) {
+ log_warnx("%s: cannot initialize cond var (%d)",
+ __progname, ret);
+ return (ret);
+ }
+
+ ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
+ if (ret) {
+ log_warnx("%s: cannot initialize mtx (%d)",
+ __progname, ret);
+ return (ret);
+ }
+
+ vcpu_hlt[i] = 0;
+
+ /* Start each VCPU run thread at vcpu_run_loop */
+ ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
+ if (ret) {
+ /* caller will _exit after this return */
+ ret = errno;
+ log_warn("%s: could not create vcpu thread %zu",
+ __func__, i);
+ return (ret);
+ }
+ }
+
+ log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
+ ret = pthread_create(&evtid, NULL, event_thread, &evdone);
+ if (ret) {
+ errno = ret;
+ log_warn("%s: could not create event thread", __func__);
+ return (ret);
+ }
+
+ for (;;) {
+ ret = pthread_cond_wait(&threadcond, &threadmutex);
+ if (ret) {
+ log_warn("%s: waiting on thread state condition "
+ "variable failed", __func__);
+ return (ret);
+ }
+
+ /*
+ * Did a VCPU thread exit with an error? => return the first one
+ */
+ for (i = 0; i < vcp->vcp_ncpus; i++) {
+ if (vcpu_done[i] == 0)
+ continue;
+
+ if (pthread_join(tid[i], &exit_status)) {
+ log_warn("%s: failed to join thread %zd - "
+ "exiting", __progname, i);
+ return (EIO);
+ }
+
+ ret = (long long)exit_status;
+ }
+
+ /* Did the event thread exit? => return with an error */
+ if (evdone) {
+ if (pthread_join(evtid, &exit_status)) {
+ log_warn("%s: failed to join event thread - "
+ "exiting", __progname);
+ return (EIO);
+ }
+
+ log_warnx("%s: vm %d event thread exited "
+ "unexpectedly", __progname, vcp->vcp_id);
+ return (EIO);
+ }
+
+ /* Did all VCPU threads exit successfully? => return */
+ for (i = 0; i < vcp->vcp_ncpus; i++) {
+ if (vcpu_done[i] == 0)
+ break;
+ }
+ if (i == vcp->vcp_ncpus)
+ return (ret);
+
+ /* Some more threads to wait for, start over */
+ }
+
+ return (ret);
+}
+
+void *
+event_thread(void *arg)
+{
+ uint8_t *donep = arg;
+ intptr_t ret;
+
+ ret = event_dispatch();
+
+ mutex_lock(&threadmutex);
+ *donep = 1;
+ pthread_cond_signal(&threadcond);
+ mutex_unlock(&threadmutex);
+
+ return (void *)ret;
+ }
+
+/*
+ * vcpu_run_loop
+ *
+ * Runs a single VCPU until vmm(4) requires help handling an exit,
+ * or the VM terminates.
+ *
+ * Parameters:
+ * arg: vcpu_run_params for the VCPU being run by this thread
+ *
+ * Return values:
+ * NULL: the VCPU shutdown properly
+ * !NULL: error processing VCPU run, or the VCPU shutdown abnormally
+ */
+void *
+vcpu_run_loop(void *arg)
+{
+ struct vm_run_params *vrp = (struct vm_run_params *)arg;
+ intptr_t ret = 0;
+ int irq;
+ uint32_t n;
+
+ vrp->vrp_continue = 0;
+ n = vrp->vrp_vcpu_id;
+
+ for (;;) {
+ ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
+
+ if (ret) {
+ log_warnx("%s: can't lock vcpu run mtx (%d)",
+ __func__, (int)ret);
+ return ((void *)ret);
+ }
+
+ /* If we are halted, wait */
+ if (vcpu_hlt[n]) {
+ ret = pthread_cond_wait(&vcpu_run_cond[n],
+ &vcpu_run_mtx[n]);
+
+ if (ret) {
+ log_warnx("%s: can't wait on cond (%d)",
+ __func__, (int)ret);
+ (void)pthread_mutex_unlock(&vcpu_run_mtx[n]);
+ break;
+ }
+ }
+
+ ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
+ if (ret) {
+ log_warnx("%s: can't unlock mutex on cond (%d)",
+ __func__, (int)ret);
+ break;
+ }
+
+ if (vrp->vrp_irqready && i8259_is_pending()) {
+ irq = i8259_ack();
+ vrp->vrp_irq = irq;
+ } else
+ vrp->vrp_irq = 0xFFFF;
+
+ /* Still more pending? */
+ if (i8259_is_pending()) {
+ /* XXX can probably avoid ioctls here by providing intr in vrp */
+ if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 1)) {
+ fatal("can't set INTR");
+ }
+ } else {
+ if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 0)) {
+ fatal("can't clear INTR");
+ }
+ }
+
+ if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
+ /* If run ioctl failed, exit */
+ ret = errno;
+ log_warn("%s: vm %d / vcpu %d run ioctl failed",
+ __func__, vrp->vrp_vm_id, n);
+ break;
+ }
+
+ /* If the VM is terminating, exit normally */
+ if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
+ ret = (intptr_t)NULL;
+ break;
+ }
+
+ if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
+ /*
+ * vmm(4) needs help handling an exit, handle in
+ * vcpu_exit.
+ */
+ ret = vcpu_exit(vrp);
+ if (ret)
+ break;
+ }
+ }
+
+ mutex_lock(&threadmutex);
+ vcpu_done[n] = 1;
+ pthread_cond_signal(&threadcond);
+ mutex_unlock(&threadmutex);
+
+ return ((void *)ret);
+}
+
+int
+vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
+{
+ struct vm_intr_params vip;
+
+ memset(&vip, 0, sizeof(vip));
+
+ vip.vip_vm_id = vm_id;
+ vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
+ vip.vip_intr = intr;
+
+ if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * vcpu_exit_pci
+ *
+ * Handle all I/O to the emulated PCI subsystem.
+ *
+ * Parameters:
+ * vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return value:
+ * Interrupt to inject to the guest VM, or 0xFF if no interrupt should
+ * be injected.
+ */
+uint8_t
+vcpu_exit_pci(struct vm_run_params *vrp)
+{
+ union vm_exit *vei = vrp->vrp_exit;
+ uint8_t intr;
+
+ intr = 0xFF;
+
+ switch (vei->vei.vei_port) {
+ case PCI_MODE1_ADDRESS_REG:
+ pci_handle_address_reg(vrp);
+ break;
+ case PCI_MODE1_DATA_REG:
+ pci_handle_data_reg(vrp);
+ break;
+ case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
+ intr = pci_handle_io(vrp);
+ break;
+ default:
+ log_warnx("%s: unknown PCI register 0x%llx",
+ __progname, (uint64_t)vei->vei.vei_port);
+ break;
+ }
+
+ return (intr);
+}
+
+/*
+ * vcpu_exit_inout
+ *
+ * Handle all I/O exits that need to be emulated in vmd. This includes the
+ * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
+ *
+ * Parameters:
+ * vrp: vcpu run parameters containing guest state for this exit
+ */
+void
+vcpu_exit_inout(struct vm_run_params *vrp)
+{
+ union vm_exit *vei = vrp->vrp_exit;
+ uint8_t intr = 0xFF;
+
+ if (ioports_map[vei->vei.vei_port] != NULL)
+ intr = ioports_map[vei->vei.vei_port](vrp);
+ else if (vei->vei.vei_dir == VEI_DIR_IN)
+ vei->vei.vei_data = 0xFFFFFFFF;
+
+ if (intr != 0xFF)
+ vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
+}
+
+/*
+ * vcpu_exit
+ *
+ * Handle a vcpu exit. This function is called when it is determined that
+ * vmm(4) requires the assistance of vmd to support a particular guest
+ * exit type (eg, accessing an I/O port or device). Guest state is contained
+ * in 'vrp', and will be resent to vmm(4) on exit completion.
+ *
+ * Upon conclusion of handling the exit, the function determines if any
+ * interrupts should be injected into the guest, and asserts the proper
+ * IRQ line whose interrupt should be vectored.
+ *
+ * Parameters:
+ * vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return values:
+ * 0: the exit was handled successfully
+ * 1: an error occurred (eg, unknown exit reason passed in 'vrp')
+ */
+int
+vcpu_exit(struct vm_run_params *vrp)
+{
+ int ret;
+
+ switch (vrp->vrp_exit_reason) {
+ case VMX_EXIT_INT_WINDOW:
+ case VMX_EXIT_EXTINT:
+ case VMX_EXIT_EPT_VIOLATION:
+ case SVM_VMEXIT_NPF:
+ /*
+ * We may be exiting to vmd to handle a pending interrupt but
+ * at the same time the last exit type may have been one of
+ * these. In this case, there's nothing extra to be done
+ * here (and falling through to the default case below results
+ * in more vmd log spam).
+ */
+ break;
+ case VMX_EXIT_IO:
+ case SVM_VMEXIT_IOIO:
+ vcpu_exit_inout(vrp);
+ break;
+ case VMX_EXIT_HLT:
+ case SVM_VMEXIT_HLT:
+ ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
+ if (ret) {
+ log_warnx("%s: can't lock vcpu mutex (%d)",
+ __func__, ret);
+ return (ret);
+ }
+ vcpu_hlt[vrp->vrp_vcpu_id] = 1;
+ ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
+ if (ret) {
+ log_warnx("%s: can't unlock vcpu mutex (%d)",
+ __func__, ret);
+ return (ret);
+ }
+ break;
+ case VMX_EXIT_TRIPLE_FAULT:
+ case SVM_VMEXIT_SHUTDOWN:
+ /* XXX reset VM since we do not support reboot yet */
+ return (EAGAIN);
+ default:
+ log_debug("%s: unknown exit reason %d",
+ __progname, vrp->vrp_exit_reason);
+ }
+
+ /* Process any pending traffic */
+ vionet_process_rx(vrp->vrp_vm_id);
+
+ vrp->vrp_continue = 1;
+
+ return (0);
+}
+
+/*
+ * find_gpa_range
+ *
+ * Search for a contiguous guest physical mem range.
+ *
+ * Parameters:
+ * vcp: VM create parameters that contain the memory map to search in
+ * gpa: the starting guest physical address
+ * len: the length of the memory range
+ *
+ * Return values:
+ * NULL: on failure if there is no memory range as described by the parameters
+ * Pointer to vm_mem_range that contains the start of the range otherwise.
+ */
+static struct vm_mem_range *
+find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
+{
+ size_t i, n;
+ struct vm_mem_range *vmr;
+
+ /* Find the first vm_mem_range that contains gpa */
+ for (i = 0; i < vcp->vcp_nmemranges; i++) {
+ vmr = &vcp->vcp_memranges[i];
+ if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
+ break;
+ }
+
+ /* No range found. */
+ if (i == vcp->vcp_nmemranges)
+ return (NULL);
+
+ /*
+ * vmr may cover the range [gpa, gpa + len) only partly. Make
+ * sure that the following vm_mem_ranges are contiguous and
+ * cover the rest.
+ */
+ n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
+ if (len < n)
+ len = 0;
+ else
+ len -= n;
+ gpa = vmr->vmr_gpa + vmr->vmr_size;
+ for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
+ vmr = &vcp->vcp_memranges[i];
+ if (gpa != vmr->vmr_gpa)
+ return (NULL);
+ if (len <= vmr->vmr_size)
+ len = 0;
+ else
+ len -= vmr->vmr_size;
+
+ gpa = vmr->vmr_gpa + vmr->vmr_size;
+ }
+
+ if (len != 0)
+ return (NULL);
+
+ return (vmr);
+}
+
+/*
+ * write_mem
+ *
+ * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
+ *
+ * Parameters:
+ * dst: the destination paddr_t in the guest VM
+ * buf: data to copy
+ * len: number of bytes to copy
+ *
+ * Return values:
+ * 0: success
+ * EINVAL: if the guest physical memory range [dst, dst + len) does not
+ * exist in the guest.
+ */
+int
+write_mem(paddr_t dst, void *buf, size_t len)
+{
+ char *from = buf, *to;
+ size_t n, off;
+ struct vm_mem_range *vmr;
+
+ vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
+ if (vmr == NULL) {
+ errno = EINVAL;
+ log_warn("%s: failed - invalid memory range dst = 0x%lx, "
+ "len = 0x%zx", __func__, dst, len);
+ return (EINVAL);
+ }
+
+ off = dst - vmr->vmr_gpa;
+ while (len != 0) {
+ n = vmr->vmr_size - off;
+ if (len < n)
+ n = len;
+
+ to = (char *)vmr->vmr_va + off;
+ memcpy(to, from, n);
+
+ from += n;
+ len -= n;
+ off = 0;
+ vmr++;
+ }
+
+ return (0);
+}
+
+/*
+ * read_mem
+ *
+ * Reads memory at guest paddr 'src' into 'buf'.
+ *
+ * Parameters:
+ * src: the source paddr_t in the guest VM to read from.
+ * buf: destination (local) buffer
+ * len: number of bytes to read
+ *
+ * Return values:
+ * 0: success
+ * EINVAL: if the guest physical memory range [src, src + len) does not
+ * exist in the guest.
+ */
+int
+read_mem(paddr_t src, void *buf, size_t len)
+{
+ char *from, *to = buf;
+ size_t n, off;
+ struct vm_mem_range *vmr;
+
+ vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
+ if (vmr == NULL) {
+ errno = EINVAL;
+ log_warn("%s: failed - invalid memory range src = 0x%lx, "
+ "len = 0x%zx", __func__, src, len);
+ return (EINVAL);
+ }
+
+ off = src - vmr->vmr_gpa;
+ while (len != 0) {
+ n = vmr->vmr_size - off;
+ if (len < n)
+ n = len;
+
+ from = (char *)vmr->vmr_va + off;
+ memcpy(to, from, n);
+
+ to += n;
+ len -= n;
+ off = 0;
+ vmr++;
+ }
+
+ return (0);
+}
+
+/*
+ * vcpu_assert_pic_irq
+ *
+ * Injects the specified IRQ on the supplied vcpu/vm
+ *
+ * Parameters:
+ * vm_id: VM ID to inject to
+ * vcpu_id: VCPU ID to inject to
+ * irq: IRQ to inject
+ */
+void
+vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
+{
+ int ret;
+
+ i8259_assert_irq(irq);
+
+ if (i8259_is_pending()) {
+ if (vcpu_pic_intr(vm_id, vcpu_id, 1))
+ fatalx("%s: can't assert INTR", __func__);
+
+ ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
+ if (ret)
+ fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
+
+ vcpu_hlt[vcpu_id] = 0;
+ ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
+ if (ret)
+ fatalx("%s: can't signal (%d)", __func__, ret);
+ ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
+ if (ret)
+ fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
+ }
+}
+
+/*
+ * fd_hasdata
+ *
+ * Determines if data can be read from a file descriptor.
+ *
+ * Parameters:
+ * fd: the fd to check
+ *
+ * Return values:
+ * 1 if data can be read from an fd, or 0 otherwise.
+ */
+int
+fd_hasdata(int fd)
+{
+ struct pollfd pfd[1];
+ int nready, hasdata = 0;
+
+ pfd[0].fd = fd;
+ pfd[0].events = POLLIN;
+ nready = poll(pfd, 1, 0);
+ if (nready == -1)
+ log_warn("checking file descriptor for data failed");
+ else if (nready == 1 && pfd[0].revents & POLLIN)
+ hasdata = 1;
+ return (hasdata);
+}
+
+/*
+ * mutex_lock
+ *
+ * Wrapper function for pthread_mutex_lock that does error checking and that
+ * exits on failure
+ */
+void
+mutex_lock(pthread_mutex_t *m)
+{
+ int ret;
+
+ ret = pthread_mutex_lock(m);
+ if (ret) {
+ errno = ret;
+ fatal("could not acquire mutex");
+ }
+}
+
+/*
+ * mutex_unlock
+ *
+ * Wrapper function for pthread_mutex_unlock that does error checking and that
+ * exits on failure
+ */
+void
+mutex_unlock(pthread_mutex_t *m)
+{
+ int ret;
+
+ ret = pthread_mutex_unlock(m);
+ if (ret) {
+ errno = ret;
+ fatal("could not release mutex");
+ }
+}
diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h
index 34f8f2192f5..3046bb36c2b 100644
--- a/usr.sbin/vmd/vmd.h
+++ b/usr.sbin/vmd/vmd.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmd.h,v 1.45 2017/03/01 07:43:33 reyk Exp $ */
+/* $OpenBSD: vmd.h,v 1.46 2017/03/01 18:00:50 reyk Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -229,6 +229,10 @@ int opentap(char *);
int fd_hasdata(int);
void mutex_lock(pthread_mutex_t *);
void mutex_unlock(pthread_mutex_t *);
+int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));
+
+/* vm.c */
+int start_vm(struct vmd_vm *, int);
/* control.c */
int config_init(struct vmd *);
diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c
index 5f46d7a567e..ef4bc810876 100644
--- a/usr.sbin/vmd/vmm.c
+++ b/usr.sbin/vmd/vmm.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm.c,v 1.65 2017/01/24 09:58:00 mlarkin Exp $ */
+/* $OpenBSD: vmm.c,v 1.66 2017/03/01 18:00:50 reyk Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -52,102 +52,22 @@
#include "vmd.h"
#include "vmm.h"
-#include "loadfile.h"
-#include "pci.h"
-#include "virtio.h"
-#include "proc.h"
-#include "i8253.h"
-#include "i8259.h"
-#include "ns8250.h"
-#include "mc146818.h"
-
-io_fn_t ioports_map[MAX_PORTS];
void vmm_sighdlr(int, short, void *);
-int opentap(char *);
-int start_vm(struct imsg *, uint32_t *);
-int terminate_vm(struct vm_terminate_params *);
-int get_info_vm(struct privsep *, struct imsg *, int);
-int run_vm(int *, int *, struct vm_create_params *, struct vcpu_reg_state *);
-void *event_thread(void *);
-void *vcpu_run_loop(void *);
-int vcpu_exit(struct vm_run_params *);
-int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
-void create_memory_map(struct vm_create_params *);
-int alloc_guest_mem(struct vm_create_params *);
-int vmm_create_vm(struct vm_create_params *);
-void init_emulated_hw(struct vm_create_params *, int *, int *);
-void vcpu_exit_inout(struct vm_run_params *);
-uint8_t vcpu_exit_pci(struct vm_run_params *);
+int vmm_start_vm(struct imsg *, uint32_t *);
int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
void vmm_run(struct privsep *, struct privsep_proc *, void *);
-int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
-
-int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));
void vmm_dispatch_vm(int, short, void *);
-void vm_dispatch_vmm(int, short, void *);
-
-static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
- size_t);
-
-int con_fd;
-struct vmd_vm *current_vm;
+int terminate_vm(struct vm_terminate_params *);
+int get_info_vm(struct privsep *, struct imsg *, int);
+int opentap(char *);
extern struct vmd *env;
-extern char *__progname;
-
-pthread_mutex_t threadmutex;
-pthread_cond_t threadcond;
-
-pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
-pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
-uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
-uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
-
static struct privsep_proc procs[] = {
{ "parent", PROC_PARENT, vmm_dispatch_parent },
};
-/*
- * Represents a standard register set for an OS to be booted
- * as a flat 32 bit address space, before paging is enabled.
- *
- * NOT set here are:
- * RIP
- * RSP
- * GDTR BASE
- *
- * Specific bootloaders should clone this structure and override
- * those fields as needed.
- *
- * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
- * features of the CPU in use.
- */
-static const struct vcpu_reg_state vcpu_init_flat32 = {
-#ifdef __i386__
- .vrs_gprs[VCPU_REGS_EFLAGS] = 0x2,
- .vrs_gprs[VCPU_REGS_EIP] = 0x0,
- .vrs_gprs[VCPU_REGS_ESP] = 0x0,
-#else
- .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
- .vrs_gprs[VCPU_REGS_RIP] = 0x0,
- .vrs_gprs[VCPU_REGS_RSP] = 0x0,
-#endif
- .vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,
- .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
- .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
- .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
- .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
- .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
- .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
-};
-
void
vmm(struct privsep *ps, struct privsep_proc *p)
{
@@ -212,7 +132,7 @@ vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
}
break;
case IMSG_VMDOP_START_VM_END:
- res = start_vm(imsg, &id);
+ res = vmm_start_vm(imsg, &id);
cmd = IMSG_VMDOP_START_VM_RESPONSE;
break;
case IMSG_VMDOP_TERMINATE_VM_REQUEST:
@@ -386,6 +306,12 @@ vmm_shutdown(void)
}
}
+/*
+ * vmm_pipe
+ *
+ * Create a new imsg control channel between vmm parent and a VM
+ * (can be called on both sides).
+ */
int
vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
{
@@ -404,6 +330,11 @@ vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
return (0);
}
+/*
+ * vmm_dispatch_vm
+ *
+ * imsg callback for messages that are received from a VM child process.
+ */
void
vmm_dispatch_vm(int fd, short event, void *arg)
{
@@ -456,100 +387,6 @@ vmm_dispatch_vm(int fd, short event, void *arg)
imsg_event_add(iev);
}
-void
-vm_dispatch_vmm(int fd, short event, void *arg)
-{
- struct vmd_vm *vm = arg;
- struct imsgev *iev = &vm->vm_iev;
- struct imsgbuf *ibuf = &iev->ibuf;
- struct imsg imsg;
- ssize_t n;
- int verbose;
-
- if (event & EV_READ) {
- if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
- fatal("%s: imsg_read", __func__);
- if (n == 0)
- _exit(0);
- }
-
- if (event & EV_WRITE) {
- if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
- fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
- if (n == 0)
- _exit(0);
- }
-
- for (;;) {
- if ((n = imsg_get(ibuf, &imsg)) == -1)
- fatal("%s: imsg_get", __func__);
- if (n == 0)
- break;
-
-#if DEBUG > 1
- log_debug("%s: got imsg %d from %s",
- __func__, imsg.hdr.type,
- vm->vm_params.vmc_params.vcp_name);
-#endif
-
- switch (imsg.hdr.type) {
- case IMSG_CTL_VERBOSE:
- IMSG_SIZE_CHECK(&imsg, &verbose);
- memcpy(&verbose, imsg.data, sizeof(verbose));
- log_setverbose(verbose);
- break;
- case IMSG_VMDOP_VM_SHUTDOWN:
- if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
- _exit(0);
- break;
- case IMSG_VMDOP_VM_REBOOT:
- if (vmmci_ctl(VMMCI_REBOOT) == -1)
- _exit(0);
- break;
- default:
- fatalx("%s: got invalid imsg %d from %s",
- __func__, imsg.hdr.type,
- vm->vm_params.vmc_params.vcp_name);
- }
- imsg_free(&imsg);
- }
- imsg_event_add(iev);
-}
-
-/*
- * vcpu_reset
- *
- * Requests vmm(4) to reset the VCPUs in the indicated VM to
- * the register state provided
- *
- * Parameters
- * vmid: VM ID to reset
- * vcpu_id: VCPU ID to reset
- * vrs: the register state to initialize
- *
- * Return values:
- * 0: success
- * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
- * valid)
- */
-int
-vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
-{
- struct vm_resetcpu_params vrp;
-
- memset(&vrp, 0, sizeof(vrp));
- vrp.vrp_vm_id = vmid;
- vrp.vrp_vcpu_id = vcpu_id;
- memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
-
- log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
-
- if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
- return (errno);
-
- return (0);
-}
-
/*
* terminate_vm
*
@@ -605,40 +442,26 @@ opentap(char *ifname)
}
/*
- * start_vm
- *
- * Starts a new VM with the creation parameters supplied (in the incoming
- * imsg->data field). This function performs a basic sanity check on the
- * incoming parameters and then performs the following steps to complete
- * the creation of the VM:
+ * vmm_start_vm
*
- * 1. opens the VM disk image files specified in the VM creation parameters
- * 2. opens the specified VM kernel
- * 3. creates a VM console tty pair using openpty
- * 4. forks, passing the file descriptors opened in steps 1-3 to the child
- * vmd responsible for dropping privilege and running the VM's VCPU
- * loops.
+ * Prepares and forks a new VM process.
*
* Parameters:
- * imsg: The incoming imsg body whose 'data' field is a vm_create_params
- * struct containing the VM creation parameters.
- * id: Returns the VM id as reported by the kernel.
+ * imsg: The VM data structure that is including the VM create parameters.
+ * id: Returns the VM id as reported by the kernel and obtained from the VM.
*
* Return values:
* 0: success
* !0 : failure - typically an errno indicating the source of the failure
*/
int
-start_vm(struct imsg *imsg, uint32_t *id)
+vmm_start_vm(struct imsg *imsg, uint32_t *id)
{
struct vm_create_params *vcp;
- struct vmboot_params vmboot;
struct vmd_vm *vm;
- size_t i;
int ret = EINVAL;
- int fds[2], nicfds[VMM_MAX_NICS_PER_VM];
- struct vcpu_reg_state vrs;
- FILE *kernfp;
+ int fds[2];
+ size_t i;
if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
log_warnx("%s: can't find vm", __func__);
@@ -668,6 +491,7 @@ start_vm(struct imsg *imsg, uint32_t *id)
if (ret > 0) {
/* Parent */
vm->vm_pid = ret;
+ close(fds[1]);
for (i = 0 ; i < vcp->vcp_ndisks; i++) {
close(vm->vm_disks[i]);
@@ -686,7 +510,6 @@ start_vm(struct imsg *imsg, uint32_t *id)
vm->vm_tty = -1;
/* read back the kernel-generated vm id from the child */
- close(fds[1]);
if (read(fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
sizeof(vcp->vcp_id))
fatal("read vcp id");
@@ -702,76 +525,9 @@ start_vm(struct imsg *imsg, uint32_t *id)
return (0);
} else {
/* Child */
- setproctitle("%s", vcp->vcp_name);
- log_procinit(vcp->vcp_name);
-
- create_memory_map(vcp);
- ret = alloc_guest_mem(vcp);
- if (ret) {
- errno = ret;
- fatal("could not allocate guest memory - exiting");
- }
-
- ret = vmm_create_vm(vcp);
- current_vm = vm;
-
- /* send back the kernel-generated vm id (0 on error) */
close(fds[0]);
- if (write(fds[1], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
- sizeof(vcp->vcp_id))
- fatal("write vcp id");
-
- if (ret) {
- errno = ret;
- fatal("create vmm ioctl failed - exiting");
- }
-
- /*
- * pledge in the vm processes:
- * stdio - for malloc and basic I/O including events.
- * vmm - for the vmm ioctls and operations.
- */
- if (pledge("stdio vmm", NULL) == -1)
- fatal("pledge");
-
- /*
- * Set up default "flat 32 bit" register state - RIP,
- * RSP, and GDT info will be set in bootloader
- */
- memcpy(&vrs, &vcpu_init_flat32, sizeof(struct vcpu_reg_state));
-
- /* Find and open kernel image */
- if ((kernfp = vmboot_open(vm->vm_kernel,
- vm->vm_disks[0], &vmboot)) == NULL)
- fatalx("failed to open kernel - exiting");
-
- /* Load kernel image */
- ret = loadelf_main(kernfp, vcp, &vrs,
- vmboot.vbp_bootdev, vmboot.vbp_howto);
- if (ret) {
- errno = ret;
- fatal("failed to load kernel - exiting");
- }
-
- vmboot_close(kernfp, &vmboot);
-
- if (vm->vm_kernel != -1)
- close(vm->vm_kernel);
-
- con_fd = vm->vm_tty;
- if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
- fatal("failed to set nonblocking mode on console");
-
- for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
- nicfds[i] = vm->vm_ifs[i].vif_fd;
-
- event_init();
-
- if (vmm_pipe(vm, fds[1], vm_dispatch_vmm) == -1)
- fatal("setup vm pipe");
- /* Execute the vcpu run loop(s) for this VM */
- ret = run_vm(vm->vm_disks, nicfds, vcp, &vrs);
+ ret = start_vm(vm, fds[1]);
_exit(ret);
}
@@ -864,924 +620,3 @@ get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
free(info);
return (0);
}
-
-/*
- * create_memory_map
- *
- * Sets up the guest physical memory ranges that the VM can access.
- *
- * Return values:
- * nothing
- */
-void
-create_memory_map(struct vm_create_params *vcp)
-{
- size_t len, mem_bytes, mem_mb;
-
- mem_mb = vcp->vcp_memranges[0].vmr_size;
- vcp->vcp_nmemranges = 0;
- if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
- return;
-
- mem_bytes = mem_mb * 1024 * 1024;
-
- /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
- len = LOWMEM_KB * 1024;
- vcp->vcp_memranges[0].vmr_gpa = 0x0;
- vcp->vcp_memranges[0].vmr_size = len;
- mem_bytes -= len;
-
- /*
- * Second memory region: LOWMEM_KB - 1MB.
- *
- * N.B. - Normally ROMs or parts of video RAM are mapped here.
- * We have to add this region, because some systems
- * unconditionally write to 0xb8000 (VGA RAM), and
- * we need to make sure that vmm(4) permits accesses
- * to it. So allocate guest memory for it.
- */
- len = 0x100000 - LOWMEM_KB * 1024;
- vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
- vcp->vcp_memranges[1].vmr_size = len;
- mem_bytes -= len;
-
- /* Make sure that we do not place physical memory into MMIO ranges. */
- if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
- len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
- else
- len = mem_bytes;
-
- /* Third memory region: 1MB - (1MB + len) */
- vcp->vcp_memranges[2].vmr_gpa = 0x100000;
- vcp->vcp_memranges[2].vmr_size = len;
- mem_bytes -= len;
-
- if (mem_bytes > 0) {
- /* Fourth memory region for the remaining memory (if any) */
- vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
- vcp->vcp_memranges[3].vmr_size = mem_bytes;
- vcp->vcp_nmemranges = 4;
- } else
- vcp->vcp_nmemranges = 3;
-}
-
-/*
- * alloc_guest_mem
- *
- * Allocates memory for the guest.
- * Instead of doing a single allocation with one mmap(), we allocate memory
- * separately for every range for the following reasons:
- * - ASLR for the individual ranges
- * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
- * map the single mmap'd userspace memory to the individual guest physical
- * memory ranges, the underlying amap of the single mmap'd range would have
- * to allocate per-page reference counters. The reason is that the
- * individual guest physical ranges would reference the single mmap'd region
- * only partially. However, if every guest physical range has its own
- * corresponding mmap'd userspace allocation, there are no partial
- * references: every guest physical range fully references an mmap'd
- * range => no per-page reference counters have to be allocated.
- *
- * Return values:
- * 0: success
- * !0: failure - errno indicating the source of the failure
- */
-int
-alloc_guest_mem(struct vm_create_params *vcp)
-{
- void *p;
- int ret;
- size_t i, j;
- struct vm_mem_range *vmr;
-
- for (i = 0; i < vcp->vcp_nmemranges; i++) {
- vmr = &vcp->vcp_memranges[i];
- p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANON, -1, 0);
- if (p == MAP_FAILED) {
- ret = errno;
- for (j = 0; j < i; j++) {
- vmr = &vcp->vcp_memranges[j];
- munmap((void *)vmr->vmr_va, vmr->vmr_size);
- }
-
- return (ret);
- }
-
- vmr->vmr_va = (vaddr_t)p;
- }
-
- return (0);
-}
-
-/*
- * vmm_create_vm
- *
- * Requests vmm(4) to create a new VM using the supplied creation
- * parameters. This operation results in the creation of the in-kernel
- * structures for the VM, but does not start the VM's vcpu(s).
- *
- * Parameters:
- * vcp: vm_create_params struct containing the VM's desired creation
- * configuration
- *
- * Return values:
- * 0: success
- * !0 : ioctl to vmm(4) failed
- */
-int
-vmm_create_vm(struct vm_create_params *vcp)
-{
- /* Sanity check arguments */
- if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nmemranges == 0 ||
- vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
- return (EINVAL);
-
- if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
- return (EINVAL);
-
- if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
- return (errno);
-
- return (0);
-}
-
-/*
- * init_emulated_hw
- *
- * Initializes the userspace hardware emulation
- */
-void
-init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
- int *child_taps)
-{
- int i;
-
- /* Reset the IO port map */
- memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
-
- /* Init i8253 PIT */
- i8253_init(vcp->vcp_id);
- ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
-
- /* Init mc146818 RTC */
- mc146818_init(vcp->vcp_id);
- ioports_map[IO_RTC] = vcpu_exit_mc146818;
- ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
-
- /* Init master and slave PICs */
- i8259_init();
- ioports_map[IO_ICU1] = vcpu_exit_i8259;
- ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
- ioports_map[IO_ICU2] = vcpu_exit_i8259;
- ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
-
- /* Init ns8250 UART */
- ns8250_init(con_fd, vcp->vcp_id);
- for (i = COM1_DATA; i <= COM1_SCR; i++)
- ioports_map[i] = vcpu_exit_com;
-
- /* Initialize PCI */
- for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
- ioports_map[i] = vcpu_exit_pci;
-
- ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
- pci_init();
-
- /* Initialize virtio devices */
- virtio_init(vcp, child_disks, child_taps);
-}
-
-/*
- * run_vm
- *
- * Runs the VM whose creation parameters are specified in vcp
- *
- * Parameters:
- * child_disks: previously-opened child VM disk file file descriptors
- * child_taps: previously-opened child tap file descriptors
- * vcp: vm_create_params struct containing the VM's desired creation
- * configuration
- * vrs: VCPU register state to initialize
- *
- * Return values:
- * 0: the VM exited normally
- * !0 : the VM exited abnormally or failed to start
- */
-int
-run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp,
- struct vcpu_reg_state *vrs)
-{
- uint8_t evdone = 0;
- size_t i;
- int ret;
- pthread_t *tid, evtid;
- struct vm_run_params **vrp;
- void *exit_status;
-
- if (vcp == NULL)
- return (EINVAL);
-
- if (child_disks == NULL && vcp->vcp_ndisks != 0)
- return (EINVAL);
-
- if (child_taps == NULL && vcp->vcp_nnics != 0)
- return (EINVAL);
-
- if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nmemranges == 0 ||
- vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
- return (EINVAL);
-
- tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
- vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
- if (tid == NULL || vrp == NULL) {
- log_warn("%s: memory allocation error - exiting.",
- __progname);
- return (ENOMEM);
- }
-
- log_debug("%s: initializing hardware for vm %s", __func__,
- vcp->vcp_name);
-
- init_emulated_hw(vcp, child_disks, child_taps);
-
- ret = pthread_mutex_init(&threadmutex, NULL);
- if (ret) {
- log_warn("%s: could not initialize thread state mutex",
- __func__);
- return (ret);
- }
- ret = pthread_cond_init(&threadcond, NULL);
- if (ret) {
- log_warn("%s: could not initialize thread state "
- "condition variable", __func__);
- return (ret);
- }
-
- mutex_lock(&threadmutex);
-
- log_debug("%s: starting vcpu threads for vm %s", __func__,
- vcp->vcp_name);
-
- /*
- * Create and launch one thread for each VCPU. These threads may
- * migrate between PCPUs over time; the need to reload CPU state
- * in such situations is detected and performed by vmm(4) in the
- * kernel.
- */
- for (i = 0 ; i < vcp->vcp_ncpus; i++) {
- vrp[i] = malloc(sizeof(struct vm_run_params));
- if (vrp[i] == NULL) {
- log_warn("%s: memory allocation error - "
- "exiting.", __progname);
- /* caller will exit, so skip free'ing */
- return (ENOMEM);
- }
- vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
- if (vrp[i]->vrp_exit == NULL) {
- log_warn("%s: memory allocation error - "
- "exiting.", __progname);
- /* caller will exit, so skip free'ing */
- return (ENOMEM);
- }
- vrp[i]->vrp_vm_id = vcp->vcp_id;
- vrp[i]->vrp_vcpu_id = i;
-
- if (vcpu_reset(vcp->vcp_id, i, vrs)) {
- log_warnx("%s: cannot reset VCPU %zu - exiting.",
- __progname, i);
- return (EIO);
- }
-
- ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
- if (ret) {
- log_warnx("%s: cannot initialize cond var (%d)",
- __progname, ret);
- return (ret);
- }
-
- ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
- if (ret) {
- log_warnx("%s: cannot initialize mtx (%d)",
- __progname, ret);
- return (ret);
- }
-
- vcpu_hlt[i] = 0;
-
- /* Start each VCPU run thread at vcpu_run_loop */
- ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
- if (ret) {
- /* caller will _exit after this return */
- ret = errno;
- log_warn("%s: could not create vcpu thread %zu",
- __func__, i);
- return (ret);
- }
- }
-
- log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
- ret = pthread_create(&evtid, NULL, event_thread, &evdone);
- if (ret) {
- errno = ret;
- log_warn("%s: could not create event thread", __func__);
- return (ret);
- }
-
- for (;;) {
- ret = pthread_cond_wait(&threadcond, &threadmutex);
- if (ret) {
- log_warn("%s: waiting on thread state condition "
- "variable failed", __func__);
- return (ret);
- }
-
- /*
- * Did a VCPU thread exit with an error? => return the first one
- */
- for (i = 0; i < vcp->vcp_ncpus; i++) {
- if (vcpu_done[i] == 0)
- continue;
-
- if (pthread_join(tid[i], &exit_status)) {
- log_warn("%s: failed to join thread %zd - "
- "exiting", __progname, i);
- return (EIO);
- }
-
- ret = (long long)exit_status;
- }
-
- /* Did the event thread exit? => return with an error */
- if (evdone) {
- if (pthread_join(evtid, &exit_status)) {
- log_warn("%s: failed to join event thread - "
- "exiting", __progname);
- return (EIO);
- }
-
- log_warnx("%s: vm %d event thread exited "
- "unexpectedly", __progname, vcp->vcp_id);
- return (EIO);
- }
-
- /* Did all VCPU threads exit successfully? => return */
- for (i = 0; i < vcp->vcp_ncpus; i++) {
- if (vcpu_done[i] == 0)
- break;
- }
- if (i == vcp->vcp_ncpus)
- return (ret);
-
- /* Some more threads to wait for, start over */
- }
-
- return (ret);
-}
-
-void *
-event_thread(void *arg)
-{
- uint8_t *donep = arg;
- intptr_t ret;
-
- ret = event_dispatch();
-
- mutex_lock(&threadmutex);
- *donep = 1;
- pthread_cond_signal(&threadcond);
- mutex_unlock(&threadmutex);
-
- return (void *)ret;
- }
-
-/*
- * vcpu_run_loop
- *
- * Runs a single VCPU until vmm(4) requires help handling an exit,
- * or the VM terminates.
- *
- * Parameters:
- * arg: vcpu_run_params for the VCPU being run by this thread
- *
- * Return values:
- * NULL: the VCPU shutdown properly
- * !NULL: error processing VCPU run, or the VCPU shutdown abnormally
- */
-void *
-vcpu_run_loop(void *arg)
-{
- struct vm_run_params *vrp = (struct vm_run_params *)arg;
- intptr_t ret = 0;
- int irq;
- uint32_t n;
-
- vrp->vrp_continue = 0;
- n = vrp->vrp_vcpu_id;
-
- for (;;) {
- ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
-
- if (ret) {
- log_warnx("%s: can't lock vcpu run mtx (%d)",
- __func__, (int)ret);
- return ((void *)ret);
- }
-
- /* If we are halted, wait */
- if (vcpu_hlt[n]) {
- ret = pthread_cond_wait(&vcpu_run_cond[n],
- &vcpu_run_mtx[n]);
-
- if (ret) {
- log_warnx("%s: can't wait on cond (%d)",
- __func__, (int)ret);
- (void)pthread_mutex_unlock(&vcpu_run_mtx[n]);
- break;
- }
- }
-
- ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
- if (ret) {
- log_warnx("%s: can't unlock mutex on cond (%d)",
- __func__, (int)ret);
- break;
- }
-
- if (vrp->vrp_irqready && i8259_is_pending()) {
- irq = i8259_ack();
- vrp->vrp_irq = irq;
- } else
- vrp->vrp_irq = 0xFFFF;
-
- /* Still more pending? */
- if (i8259_is_pending()) {
- /* XXX can probably avoid ioctls here by providing intr in vrp */
- if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 1)) {
- fatal("can't set INTR");
- }
- } else {
- if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 0)) {
- fatal("can't clear INTR");
- }
- }
-
- if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
- /* If run ioctl failed, exit */
- ret = errno;
- log_warn("%s: vm %d / vcpu %d run ioctl failed",
- __func__, vrp->vrp_vm_id, n);
- break;
- }
-
- /* If the VM is terminating, exit normally */
- if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
- ret = (intptr_t)NULL;
- break;
- }
-
- if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
- /*
- * vmm(4) needs help handling an exit, handle in
- * vcpu_exit.
- */
- ret = vcpu_exit(vrp);
- if (ret)
- break;
- }
- }
-
- mutex_lock(&threadmutex);
- vcpu_done[n] = 1;
- pthread_cond_signal(&threadcond);
- mutex_unlock(&threadmutex);
-
- return ((void *)ret);
-}
-
-int
-vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
-{
- struct vm_intr_params vip;
-
- memset(&vip, 0, sizeof(vip));
-
- vip.vip_vm_id = vm_id;
- vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
- vip.vip_intr = intr;
-
- if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0)
- return (errno);
-
- return (0);
-}
-
-/*
- * vcpu_exit_pci
- *
- * Handle all I/O to the emulated PCI subsystem.
- *
- * Parameters:
- * vrp: vcpu run paramters containing guest state for this exit
- *
- * Return value:
- * Interrupt to inject to the guest VM, or 0xFF if no interrupt should
- * be injected.
- */
-uint8_t
-vcpu_exit_pci(struct vm_run_params *vrp)
-{
- union vm_exit *vei = vrp->vrp_exit;
- uint8_t intr;
-
- intr = 0xFF;
-
- switch (vei->vei.vei_port) {
- case PCI_MODE1_ADDRESS_REG:
- pci_handle_address_reg(vrp);
- break;
- case PCI_MODE1_DATA_REG:
- pci_handle_data_reg(vrp);
- break;
- case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
- intr = pci_handle_io(vrp);
- break;
- default:
- log_warnx("%s: unknown PCI register 0x%llx",
- __progname, (uint64_t)vei->vei.vei_port);
- break;
- }
-
- return (intr);
-}
-
-/*
- * vcpu_exit_inout
- *
- * Handle all I/O exits that need to be emulated in vmd. This includes the
- * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- */
-void
-vcpu_exit_inout(struct vm_run_params *vrp)
-{
- union vm_exit *vei = vrp->vrp_exit;
- uint8_t intr = 0xFF;
-
- if (ioports_map[vei->vei.vei_port] != NULL)
- intr = ioports_map[vei->vei.vei_port](vrp);
- else if (vei->vei.vei_dir == VEI_DIR_IN)
- vei->vei.vei_data = 0xFFFFFFFF;
-
- if (intr != 0xFF)
- vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
-}
-
-/*
- * vcpu_exit
- *
- * Handle a vcpu exit. This function is called when it is determined that
- * vmm(4) requires the assistance of vmd to support a particular guest
- * exit type (eg, accessing an I/O port or device). Guest state is contained
- * in 'vrp', and will be resent to vmm(4) on exit completion.
- *
- * Upon conclusion of handling the exit, the function determines if any
- * interrupts should be injected into the guest, and asserts the proper
- * IRQ line whose interrupt should be vectored.
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- *
- * Return values:
- * 0: the exit was handled successfully
- * 1: an error occurred (eg, unknown exit reason passed in 'vrp')
- */
-int
-vcpu_exit(struct vm_run_params *vrp)
-{
- int ret;
-
- switch (vrp->vrp_exit_reason) {
- case VMX_EXIT_INT_WINDOW:
- case VMX_EXIT_EXTINT:
- case VMX_EXIT_EPT_VIOLATION:
- case SVM_VMEXIT_NPF:
- /*
- * We may be exiting to vmd to handle a pending interrupt but
- * at the same time the last exit type may have been one of
- * these. In this case, there's nothing extra to be done
- * here (and falling through to the default case below results
- * in more vmd log spam).
- */
- break;
- case VMX_EXIT_IO:
- case SVM_VMEXIT_IOIO:
- vcpu_exit_inout(vrp);
- break;
- case VMX_EXIT_HLT:
- case SVM_VMEXIT_HLT:
- ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
- if (ret) {
- log_warnx("%s: can't lock vcpu mutex (%d)",
- __func__, ret);
- return (ret);
- }
- vcpu_hlt[vrp->vrp_vcpu_id] = 1;
- ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
- if (ret) {
- log_warnx("%s: can't unlock vcpu mutex (%d)",
- __func__, ret);
- return (ret);
- }
- break;
- case VMX_EXIT_TRIPLE_FAULT:
- case SVM_VMEXIT_SHUTDOWN:
- /* XXX reset VM since we do not support reboot yet */
- return (EAGAIN);
- default:
- log_debug("%s: unknown exit reason %d",
- __progname, vrp->vrp_exit_reason);
- }
-
- /* Process any pending traffic */
- vionet_process_rx(vrp->vrp_vm_id);
-
- vrp->vrp_continue = 1;
-
- return (0);
-}
-
-/*
- * find_gpa_range
- *
- * Search for a contiguous guest physical mem range.
- *
- * Parameters:
- * vcp: VM create parameters that contain the memory map to search in
- * gpa: the starting guest physical address
- * len: the length of the memory range
- *
- * Return values:
- * NULL: on failure if there is no memory range as described by the parameters
- * Pointer to vm_mem_range that contains the start of the range otherwise.
- */
-static struct vm_mem_range *
-find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
-{
- size_t i, n;
- struct vm_mem_range *vmr;
-
- /* Find the first vm_mem_range that contains gpa */
- for (i = 0; i < vcp->vcp_nmemranges; i++) {
- vmr = &vcp->vcp_memranges[i];
- if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
- break;
- }
-
- /* No range found. */
- if (i == vcp->vcp_nmemranges)
- return (NULL);
-
- /*
- * vmr may cover the range [gpa, gpa + len) only partly. Make
- * sure that the following vm_mem_ranges are contiguous and
- * cover the rest.
- */
- n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
- if (len < n)
- len = 0;
- else
- len -= n;
- gpa = vmr->vmr_gpa + vmr->vmr_size;
- for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
- vmr = &vcp->vcp_memranges[i];
- if (gpa != vmr->vmr_gpa)
- return (NULL);
- if (len <= vmr->vmr_size)
- len = 0;
- else
- len -= vmr->vmr_size;
-
- gpa = vmr->vmr_gpa + vmr->vmr_size;
- }
-
- if (len != 0)
- return (NULL);
-
- return (vmr);
-}
-
-/*
- * write_mem
- *
- * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
- *
- * Parameters:
- * dst: the destination paddr_t in the guest VM
- * buf: data to copy
- * len: number of bytes to copy
- *
- * Return values:
- * 0: success
- * EINVAL: if the guest physical memory range [dst, dst + len) does not
- * exist in the guest.
- */
-int
-write_mem(paddr_t dst, void *buf, size_t len)
-{
- char *from = buf, *to;
- size_t n, off;
- struct vm_mem_range *vmr;
-
- vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
- if (vmr == NULL) {
- errno = EINVAL;
- log_warn("%s: failed - invalid memory range dst = 0x%lx, "
- "len = 0x%zx", __func__, dst, len);
- return (EINVAL);
- }
-
- off = dst - vmr->vmr_gpa;
- while (len != 0) {
- n = vmr->vmr_size - off;
- if (len < n)
- n = len;
-
- to = (char *)vmr->vmr_va + off;
- memcpy(to, from, n);
-
- from += n;
- len -= n;
- off = 0;
- vmr++;
- }
-
- return (0);
-}
-
-/*
- * read_mem
- *
- * Reads memory at guest paddr 'src' into 'buf'.
- *
- * Parameters:
- * src: the source paddr_t in the guest VM to read from.
- * buf: destination (local) buffer
- * len: number of bytes to read
- *
- * Return values:
- * 0: success
- * EINVAL: if the guest physical memory range [dst, dst + len) does not
- * exist in the guest.
- */
-int
-read_mem(paddr_t src, void *buf, size_t len)
-{
- char *from, *to = buf;
- size_t n, off;
- struct vm_mem_range *vmr;
-
- vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
- if (vmr == NULL) {
- errno = EINVAL;
- log_warn("%s: failed - invalid memory range src = 0x%lx, "
- "len = 0x%zx", __func__, src, len);
- return (EINVAL);
- }
-
- off = src - vmr->vmr_gpa;
- while (len != 0) {
- n = vmr->vmr_size - off;
- if (len < n)
- n = len;
-
- from = (char *)vmr->vmr_va + off;
- memcpy(to, from, n);
-
- to += n;
- len -= n;
- off = 0;
- vmr++;
- }
-
- return (0);
-}
-
-/*
- * vcpu_assert_pic_irq
- *
- * Injects the specified IRQ on the supplied vcpu/vm
- *
- * Parameters:
- * vm_id: VM ID to inject to
- * vcpu_id: VCPU ID to inject to
- * irq: IRQ to inject
- */
-void
-vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
-{
- int ret;
-
- i8259_assert_irq(irq);
-
- if (i8259_is_pending()) {
- if (vcpu_pic_intr(vm_id, vcpu_id, 1))
- fatalx("%s: can't assert INTR", __func__);
-
- ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
- if (ret)
- fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
-
- vcpu_hlt[vcpu_id] = 0;
- ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
- if (ret)
- fatalx("%s: can't signal (%d)", __func__, ret);
- ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
- if (ret)
- fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
- }
-}
-
-/*
- * fd_hasdata
- *
- * Determines if data can be read from a file descriptor.
- *
- * Parameters:
- * fd: the fd to check
- *
- * Return values:
- * 1 if data can be read from an fd, or 0 otherwise.
- */
-int
-fd_hasdata(int fd)
-{
- struct pollfd pfd[1];
- int nready, hasdata = 0;
-
- pfd[0].fd = fd;
- pfd[0].events = POLLIN;
- nready = poll(pfd, 1, 0);
- if (nready == -1)
- log_warn("checking file descriptor for data failed");
- else if (nready == 1 && pfd[0].revents & POLLIN)
- hasdata = 1;
- return (hasdata);
-}
-
-/*
- * mutex_lock
- *
- * Wrapper function for pthread_mutex_lock that does error checking and that
- * exits on failure
- */
-void
-mutex_lock(pthread_mutex_t *m)
-{
- int ret;
-
- ret = pthread_mutex_lock(m);
- if (ret) {
- errno = ret;
- fatal("could not acquire mutex");
- }
-}
-
-/*
- * mutex_unlock
- *
- * Wrapper function for pthread_mutex_unlock that does error checking and that
- * exits on failure
- */
-void
-mutex_unlock(pthread_mutex_t *m)
-{
- int ret;
-
- ret = pthread_mutex_unlock(m);
- if (ret) {
- errno = ret;
- fatal("could not release mutex");
- }
-}