-rw-r--r--	usr.sbin/vmd/Makefile	|    8
-rw-r--r--	usr.sbin/vmd/vm.c	| 1262
-rw-r--r--	usr.sbin/vmd/vmd.h	|    6
-rw-r--r--	usr.sbin/vmd/vmm.c	| 1217
4 files changed, 1297 insertions, 1196 deletions
diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile index 39fd337c581..09a9c263dee 100644 --- a/usr.sbin/vmd/Makefile +++ b/usr.sbin/vmd/Makefile @@ -1,11 +1,11 @@ -# $OpenBSD: Makefile,v 1.12 2016/11/26 20:03:42 reyk Exp $ +# $OpenBSD: Makefile,v 1.13 2017/03/01 18:00:50 reyk Exp $ .if ${MACHINE} == "amd64" || ${MACHINE} == "i386" PROG= vmd -SRCS= vmm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c -SRCS+= vmd.c control.c log.c priv.c proc.c config.c ns8250.c i8253.c -SRCS+= vmboot.c ufs.c disklabel.c +SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c +SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c +SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c SRCS+= parse.y CFLAGS+= -Wall -I${.CURDIR} diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c new file mode 100644 index 00000000000..76d213e8adf --- /dev/null +++ b/usr.sbin/vmd/vm.c @@ -0,0 +1,1262 @@ +/* $OpenBSD: vm.c,v 1.1 2017/03/01 18:00:50 reyk Exp $ */ + +/* + * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/queue.h> +#include <sys/wait.h> +#include <sys/uio.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/mman.h> + +#include <dev/ic/i8253reg.h> +#include <dev/isa/isareg.h> +#include <dev/pci/pcireg.h> + +#include <machine/param.h> +#include <machine/psl.h> +#include <machine/specialreg.h> +#include <machine/vmmvar.h> + +#include <net/if.h> + +#include <errno.h> +#include <event.h> +#include <fcntl.h> +#include <imsg.h> +#include <limits.h> +#include <poll.h> +#include <pthread.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <util.h> + +#include "vmd.h" +#include "vmm.h" +#include "loadfile.h" +#include "pci.h" +#include "virtio.h" +#include "proc.h" +#include "i8253.h" +#include "i8259.h" +#include "ns8250.h" +#include "mc146818.h" + +io_fn_t ioports_map[MAX_PORTS]; + +int run_vm(int *, int *, struct vm_create_params *, struct vcpu_reg_state *); +void vm_dispatch_vmm(int, short, void *); +void *event_thread(void *); +void *vcpu_run_loop(void *); +int vcpu_exit(struct vm_run_params *); +int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); +void create_memory_map(struct vm_create_params *); +int alloc_guest_mem(struct vm_create_params *); +int vmm_create_vm(struct vm_create_params *); +void init_emulated_hw(struct vm_create_params *, int *, int *); +void vcpu_exit_inout(struct vm_run_params *); +uint8_t vcpu_exit_pci(struct vm_run_params *); +int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); + +static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, + size_t); + +int con_fd; +struct vmd_vm *current_vm; + +extern struct vmd *env; + +extern char *__progname; + +pthread_mutex_t threadmutex; 
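+/*
+ * Illustrative note on the synchronization protocol used below: each
+ * VCPU thread and the event thread report completion by setting a
+ * "done" flag and signalling threadcond while holding threadmutex;
+ * run_vm() sleeps on threadcond and reaps finished threads. The
+ * pattern, as it appears at the tail of vcpu_run_loop():
+ *
+ *	mutex_lock(&threadmutex);
+ *	vcpu_done[n] = 1;
+ *	pthread_cond_signal(&threadcond);
+ *	mutex_unlock(&threadmutex);
+ */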
+pthread_cond_t threadcond; + +pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM]; +pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM]; +uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM]; +uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM]; + +/* + * Represents a standard register set for an OS to be booted + * as a flat 32 bit address space, before paging is enabled. + * + * NOT set here are: + * RIP + * RSP + * GDTR BASE + * + * Specific bootloaders should clone this structure and override + * those fields as needed. + * + * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on + * features of the CPU in use. + */ +static const struct vcpu_reg_state vcpu_init_flat32 = { +#ifdef __i386__ + .vrs_gprs[VCPU_REGS_EFLAGS] = 0x2, + .vrs_gprs[VCPU_REGS_EIP] = 0x0, + .vrs_gprs[VCPU_REGS_ESP] = 0x0, +#else + .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, + .vrs_gprs[VCPU_REGS_RIP] = 0x0, + .vrs_gprs[VCPU_REGS_RSP] = 0x0, +#endif + .vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG, + .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, + .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, + .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, + .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, + .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, + .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, +}; + +/* + * start_vm + * + * After forking a new VM process, starts the new VM with the creation + * parameters supplied (in the incoming vm->vm_params field). This + * function performs a basic sanity check on the incoming parameters + * and then performs the following steps to complete the creation of the VM: + * + * 1. validates and create the new VM + * 2. opens the imsg control channel to the parent and drops more privilege + * 3. drops additional privleges by calling pledge(2) + * 4. loads the kernel from the disk image or file descriptor + * 5. runs the VM's VCPU loops. + * + * Parameters: + * vm: The VM data structure that is including the VM create parameters. + * fd: The imsg socket that is connected to the parent process. + * + * Return values: + * 0: success + * !0 : failure - typically an errno indicating the source of the failure + */ +int +start_vm(struct vmd_vm *vm, int fd) +{ + struct vm_create_params *vcp = &vm->vm_params.vmc_params; + struct vcpu_reg_state vrs; + int nicfds[VMM_MAX_NICS_PER_VM]; + int ret; + FILE *kernfp; + struct vmboot_params vmboot; + size_t i; + + /* Child */ + setproctitle("%s", vcp->vcp_name); + log_procinit(vcp->vcp_name); + + create_memory_map(vcp); + ret = alloc_guest_mem(vcp); + if (ret) { + errno = ret; + fatal("could not allocate guest memory - exiting"); + } + + ret = vmm_create_vm(vcp); + current_vm = vm; + + /* send back the kernel-generated vm id (0 on error) */ + if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != + sizeof(vcp->vcp_id)) + fatal("write vcp id"); + + if (ret) { + errno = ret; + fatal("create vmm ioctl failed - exiting"); + } + + /* + * pledge in the vm processes: + * stdio - for malloc and basic I/O including events. + * vmm - for the vmm ioctls and operations. 
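+ *
+ * (Illustrative note: with only "stdio vmm" pledged, a later attempt
+ * to open(2) a new file or fork(2) would abort the process, so all
+ * disk, tap and tty descriptors must already have been opened and
+ * passed in by the parent before this point.)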
+ */ + if (pledge("stdio vmm", NULL) == -1) + fatal("pledge"); + + /* + * Set up default "flat 32 bit" register state - RIP, + * RSP, and GDT info will be set in bootloader + */ + memcpy(&vrs, &vcpu_init_flat32, sizeof(struct vcpu_reg_state)); + + /* Find and open kernel image */ + if ((kernfp = vmboot_open(vm->vm_kernel, + vm->vm_disks[0], &vmboot)) == NULL) + fatalx("failed to open kernel - exiting"); + + /* Load kernel image */ + ret = loadelf_main(kernfp, vcp, &vrs, + vmboot.vbp_bootdev, vmboot.vbp_howto); + if (ret) { + errno = ret; + fatal("failed to load kernel - exiting"); + } + + vmboot_close(kernfp, &vmboot); + + if (vm->vm_kernel != -1) + close(vm->vm_kernel); + + con_fd = vm->vm_tty; + if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) + fatal("failed to set nonblocking mode on console"); + + for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) + nicfds[i] = vm->vm_ifs[i].vif_fd; + + event_init(); + + if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) + fatal("setup vm pipe"); + + /* Execute the vcpu run loop(s) for this VM */ + ret = run_vm(vm->vm_disks, nicfds, vcp, &vrs); + + return (ret); +} + +/* + * vm_dispatch_vmm + * + * imsg callback for messages that are received from the vmm parent process. + */ +void +vm_dispatch_vmm(int fd, short event, void *arg) +{ + struct vmd_vm *vm = arg; + struct imsgev *iev = &vm->vm_iev; + struct imsgbuf *ibuf = &iev->ibuf; + struct imsg imsg; + ssize_t n; + int verbose; + + if (event & EV_READ) { + if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) + fatal("%s: imsg_read", __func__); + if (n == 0) + _exit(0); + } + + if (event & EV_WRITE) { + if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) + fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd); + if (n == 0) + _exit(0); + } + + for (;;) { + if ((n = imsg_get(ibuf, &imsg)) == -1) + fatal("%s: imsg_get", __func__); + if (n == 0) + break; + +#if DEBUG > 1 + log_debug("%s: got imsg %d from %s", + __func__, imsg.hdr.type, + vm->vm_params.vmc_params.vcp_name); +#endif + + switch (imsg.hdr.type) { + case IMSG_CTL_VERBOSE: + IMSG_SIZE_CHECK(&imsg, &verbose); + memcpy(&verbose, imsg.data, sizeof(verbose)); + log_setverbose(verbose); + break; + case IMSG_VMDOP_VM_SHUTDOWN: + if (vmmci_ctl(VMMCI_SHUTDOWN) == -1) + _exit(0); + break; + case IMSG_VMDOP_VM_REBOOT: + if (vmmci_ctl(VMMCI_REBOOT) == -1) + _exit(0); + break; + default: + fatalx("%s: got invalid imsg %d from %s", + __func__, imsg.hdr.type, + vm->vm_params.vmc_params.vcp_name); + } + imsg_free(&imsg); + } + imsg_event_add(iev); +} + +/* + * vcpu_reset + * + * Requests vmm(4) to reset the VCPUs in the indicated VM to + * the register state provided + * + * Parameters + * vmid: VM ID to reset + * vcpu_id: VCPU ID to reset + * vrs: the register state to initialize + * + * Return values: + * 0: success + * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not + * valid) + */ +int +vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) +{ + struct vm_resetcpu_params vrp; + + memset(&vrp, 0, sizeof(vrp)); + vrp.vrp_vm_id = vmid; + vrp.vrp_vcpu_id = vcpu_id; + memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); + + log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); + + if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0) + return (errno); + + return (0); +} + +/* + * create_memory_map + * + * Sets up the guest physical memory ranges that the VM can access. 
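+ *
+ * As an illustration (mirroring the code below, not normative), the
+ * resulting map for a guest with more memory than fits below the PCI
+ * MMIO hole looks like:
+ *
+ *	range 0: 0x0            .. LOWMEM_KB*1024          (DOS low mem)
+ *	range 1: LOWMEM_KB*1024 .. 0x100000                (VGA/ROM area)
+ *	range 2: 0x100000       .. VMM_PCI_MMIO_BAR_BASE   (main memory)
+ *	range 3: VMM_PCI_MMIO_BAR_END+1 ..                 (remainder)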
+ * + * Return values: + * nothing + */ +void +create_memory_map(struct vm_create_params *vcp) +{ + size_t len, mem_bytes, mem_mb; + + mem_mb = vcp->vcp_memranges[0].vmr_size; + vcp->vcp_nmemranges = 0; + if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE) + return; + + mem_bytes = mem_mb * 1024 * 1024; + + /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ + len = LOWMEM_KB * 1024; + vcp->vcp_memranges[0].vmr_gpa = 0x0; + vcp->vcp_memranges[0].vmr_size = len; + mem_bytes -= len; + + /* + * Second memory region: LOWMEM_KB - 1MB. + * + * N.B. - Normally ROMs or parts of video RAM are mapped here. + * We have to add this region, because some systems + * unconditionally write to 0xb8000 (VGA RAM), and + * we need to make sure that vmm(4) permits accesses + * to it. So allocate guest memory for it. + */ + len = 0x100000 - LOWMEM_KB * 1024; + vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; + vcp->vcp_memranges[1].vmr_size = len; + mem_bytes -= len; + + /* Make sure that we do not place physical memory into MMIO ranges. */ + if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000) + len = VMM_PCI_MMIO_BAR_BASE - 0x100000; + else + len = mem_bytes; + + /* Third memory region: 1MB - (1MB + len) */ + vcp->vcp_memranges[2].vmr_gpa = 0x100000; + vcp->vcp_memranges[2].vmr_size = len; + mem_bytes -= len; + + if (mem_bytes > 0) { + /* Fourth memory region for the remaining memory (if any) */ + vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; + vcp->vcp_memranges[3].vmr_size = mem_bytes; + vcp->vcp_nmemranges = 4; + } else + vcp->vcp_nmemranges = 3; +} + +/* + * alloc_guest_mem + * + * Allocates memory for the guest. + * Instead of doing a single allocation with one mmap(), we allocate memory + * separately for every range for the following reasons: + * - ASLR for the individual ranges + * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to + * map the single mmap'd userspace memory to the individual guest physical + * memory ranges, the underlying amap of the single mmap'd range would have + * to allocate per-page reference counters. The reason is that the + * individual guest physical ranges would reference the single mmap'd region + * only partially. However, if every guest physical range has its own + * corresponding mmap'd userspace allocation, there are no partial + * references: every guest physical range fully references an mmap'd + * range => no per-page reference counters have to be allocated. + * + * Return values: + * 0: success + * !0: failure - errno indicating the source of the failure + */ +int +alloc_guest_mem(struct vm_create_params *vcp) +{ + void *p; + int ret; + size_t i, j; + struct vm_mem_range *vmr; + + for (i = 0; i < vcp->vcp_nmemranges; i++) { + vmr = &vcp->vcp_memranges[i]; + p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (p == MAP_FAILED) { + ret = errno; + for (j = 0; j < i; j++) { + vmr = &vcp->vcp_memranges[j]; + munmap((void *)vmr->vmr_va, vmr->vmr_size); + } + + return (ret); + } + + vmr->vmr_va = (vaddr_t)p; + } + + return (0); +} + +/* + * vmm_create_vm + * + * Requests vmm(4) to create a new VM using the supplied creation + * parameters. This operation results in the creation of the in-kernel + * structures for the VM, but does not start the VM's vcpu(s). 
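+ *
+ * Minimal usage sketch (error handling elided): on success the
+ * kernel has filled in vcp->vcp_id, which names the VM in later
+ * VMM_IOC_* calls:
+ *
+ *	if (vmm_create_vm(vcp) == 0)
+ *		log_debug("created vm %d", vcp->vcp_id);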
+ * + * Parameters: + * vcp: vm_create_params struct containing the VM's desired creation + * configuration + * + * Return values: + * 0: success + * !0 : ioctl to vmm(4) failed + */ +int +vmm_create_vm(struct vm_create_params *vcp) +{ + /* Sanity check arguments */ + if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) + return (EINVAL); + + if (vcp->vcp_nmemranges == 0 || + vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) + return (EINVAL); + + if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) + return (EINVAL); + + if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) + return (EINVAL); + + if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0) + return (errno); + + return (0); +} + +/* + * init_emulated_hw + * + * Initializes the userspace hardware emulation + */ +void +init_emulated_hw(struct vm_create_params *vcp, int *child_disks, + int *child_taps) +{ + int i; + + /* Reset the IO port map */ + memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); + + /* Init i8253 PIT */ + i8253_init(vcp->vcp_id); + ioports_map[TIMER_CTRL] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; + + /* Init mc146818 RTC */ + mc146818_init(vcp->vcp_id); + ioports_map[IO_RTC] = vcpu_exit_mc146818; + ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; + + /* Init master and slave PICs */ + i8259_init(); + ioports_map[IO_ICU1] = vcpu_exit_i8259; + ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; + ioports_map[IO_ICU2] = vcpu_exit_i8259; + ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; + + /* Init ns8250 UART */ + ns8250_init(con_fd, vcp->vcp_id); + for (i = COM1_DATA; i <= COM1_SCR; i++) + ioports_map[i] = vcpu_exit_com; + + /* Initialize PCI */ + for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++) + ioports_map[i] = vcpu_exit_pci; + + ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; + pci_init(); + + /* Initialize virtio devices */ + virtio_init(vcp, child_disks, child_taps); +} + +/* + * run_vm + * + * Runs the VM whose creation parameters are specified in vcp + * + * Parameters: + * child_disks: previously-opened child VM disk file file descriptors + * child_taps: previously-opened child tap file descriptors + * vcp: vm_create_params struct containing the VM's desired creation + * configuration + * vrs: VCPU register state to initialize + * + * Return values: + * 0: the VM exited normally + * !0 : the VM exited abnormally or failed to start + */ +int +run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp, + struct vcpu_reg_state *vrs) +{ + uint8_t evdone = 0; + size_t i; + int ret; + pthread_t *tid, evtid; + struct vm_run_params **vrp; + void *exit_status; + + if (vcp == NULL) + return (EINVAL); + + if (child_disks == NULL && vcp->vcp_ndisks != 0) + return (EINVAL); + + if (child_taps == NULL && vcp->vcp_nnics != 0) + return (EINVAL); + + if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) + return (EINVAL); + + if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) + return (EINVAL); + + if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) + return (EINVAL); + + if (vcp->vcp_nmemranges == 0 || + vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) + return (EINVAL); + + tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t)); + vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *)); + if (tid == NULL || vrp == NULL) { + log_warn("%s: memory allocation error - exiting.", + __progname); + return (ENOMEM); + } + + log_debug("%s: initializing hardware for vm %s", __func__, + 
vcp->vcp_name); + + init_emulated_hw(vcp, child_disks, child_taps); + + ret = pthread_mutex_init(&threadmutex, NULL); + if (ret) { + log_warn("%s: could not initialize thread state mutex", + __func__); + return (ret); + } + ret = pthread_cond_init(&threadcond, NULL); + if (ret) { + log_warn("%s: could not initialize thread state " + "condition variable", __func__); + return (ret); + } + + mutex_lock(&threadmutex); + + log_debug("%s: starting vcpu threads for vm %s", __func__, + vcp->vcp_name); + + /* + * Create and launch one thread for each VCPU. These threads may + * migrate between PCPUs over time; the need to reload CPU state + * in such situations is detected and performed by vmm(4) in the + * kernel. + */ + for (i = 0 ; i < vcp->vcp_ncpus; i++) { + vrp[i] = malloc(sizeof(struct vm_run_params)); + if (vrp[i] == NULL) { + log_warn("%s: memory allocation error - " + "exiting.", __progname); + /* caller will exit, so skip free'ing */ + return (ENOMEM); + } + vrp[i]->vrp_exit = malloc(sizeof(union vm_exit)); + if (vrp[i]->vrp_exit == NULL) { + log_warn("%s: memory allocation error - " + "exiting.", __progname); + /* caller will exit, so skip free'ing */ + return (ENOMEM); + } + vrp[i]->vrp_vm_id = vcp->vcp_id; + vrp[i]->vrp_vcpu_id = i; + + if (vcpu_reset(vcp->vcp_id, i, vrs)) { + log_warnx("%s: cannot reset VCPU %zu - exiting.", + __progname, i); + return (EIO); + } + + ret = pthread_cond_init(&vcpu_run_cond[i], NULL); + if (ret) { + log_warnx("%s: cannot initialize cond var (%d)", + __progname, ret); + return (ret); + } + + ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL); + if (ret) { + log_warnx("%s: cannot initialize mtx (%d)", + __progname, ret); + return (ret); + } + + vcpu_hlt[i] = 0; + + /* Start each VCPU run thread at vcpu_run_loop */ + ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); + if (ret) { + /* caller will _exit after this return */ + ret = errno; + log_warn("%s: could not create vcpu thread %zu", + __func__, i); + return (ret); + } + } + + log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); + ret = pthread_create(&evtid, NULL, event_thread, &evdone); + if (ret) { + errno = ret; + log_warn("%s: could not create event thread", __func__); + return (ret); + } + + for (;;) { + ret = pthread_cond_wait(&threadcond, &threadmutex); + if (ret) { + log_warn("%s: waiting on thread state condition " + "variable failed", __func__); + return (ret); + } + + /* + * Did a VCPU thread exit with an error? => return the first one + */ + for (i = 0; i < vcp->vcp_ncpus; i++) { + if (vcpu_done[i] == 0) + continue; + + if (pthread_join(tid[i], &exit_status)) { + log_warn("%s: failed to join thread %zd - " + "exiting", __progname, i); + return (EIO); + } + + ret = (long long)exit_status; + } + + /* Did the event thread exit? => return with an error */ + if (evdone) { + if (pthread_join(evtid, &exit_status)) { + log_warn("%s: failed to join event thread - " + "exiting", __progname); + return (EIO); + } + + log_warnx("%s: vm %d event thread exited " + "unexpectedly", __progname, vcp->vcp_id); + return (EIO); + } + + /* Did all VCPU threads exit successfully? 
=> return */ + for (i = 0; i < vcp->vcp_ncpus; i++) { + if (vcpu_done[i] == 0) + break; + } + if (i == vcp->vcp_ncpus) + return (ret); + + /* Some more threads to wait for, start over */ + } + + return (ret); +} + +void * +event_thread(void *arg) +{ + uint8_t *donep = arg; + intptr_t ret; + + ret = event_dispatch(); + + mutex_lock(&threadmutex); + *donep = 1; + pthread_cond_signal(&threadcond); + mutex_unlock(&threadmutex); + + return (void *)ret; + } + +/* + * vcpu_run_loop + * + * Runs a single VCPU until vmm(4) requires help handling an exit, + * or the VM terminates. + * + * Parameters: + * arg: vcpu_run_params for the VCPU being run by this thread + * + * Return values: + * NULL: the VCPU shutdown properly + * !NULL: error processing VCPU run, or the VCPU shutdown abnormally + */ +void * +vcpu_run_loop(void *arg) +{ + struct vm_run_params *vrp = (struct vm_run_params *)arg; + intptr_t ret = 0; + int irq; + uint32_t n; + + vrp->vrp_continue = 0; + n = vrp->vrp_vcpu_id; + + for (;;) { + ret = pthread_mutex_lock(&vcpu_run_mtx[n]); + + if (ret) { + log_warnx("%s: can't lock vcpu run mtx (%d)", + __func__, (int)ret); + return ((void *)ret); + } + + /* If we are halted, wait */ + if (vcpu_hlt[n]) { + ret = pthread_cond_wait(&vcpu_run_cond[n], + &vcpu_run_mtx[n]); + + if (ret) { + log_warnx("%s: can't wait on cond (%d)", + __func__, (int)ret); + (void)pthread_mutex_unlock(&vcpu_run_mtx[n]); + break; + } + } + + ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); + if (ret) { + log_warnx("%s: can't unlock mutex on cond (%d)", + __func__, (int)ret); + break; + } + + if (vrp->vrp_irqready && i8259_is_pending()) { + irq = i8259_ack(); + vrp->vrp_irq = irq; + } else + vrp->vrp_irq = 0xFFFF; + + /* Still more pending? */ + if (i8259_is_pending()) { + /* XXX can probably avoid ioctls here by providing intr in vrp */ + if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 1)) { + fatal("can't set INTR"); + } + } else { + if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 0)) { + fatal("can't clear INTR"); + } + } + + if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) { + /* If run ioctl failed, exit */ + ret = errno; + log_warn("%s: vm %d / vcpu %d run ioctl failed", + __func__, vrp->vrp_vm_id, n); + break; + } + + /* If the VM is terminating, exit normally */ + if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) { + ret = (intptr_t)NULL; + break; + } + + if (vrp->vrp_exit_reason != VM_EXIT_NONE) { + /* + * vmm(4) needs help handling an exit, handle in + * vcpu_exit. + */ + ret = vcpu_exit(vrp); + if (ret) + break; + } + } + + mutex_lock(&threadmutex); + vcpu_done[n] = 1; + pthread_cond_signal(&threadcond); + mutex_unlock(&threadmutex); + + return ((void *)ret); +} + +int +vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) +{ + struct vm_intr_params vip; + + memset(&vip, 0, sizeof(vip)); + + vip.vip_vm_id = vm_id; + vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */ + vip.vip_intr = intr; + + if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0) + return (errno); + + return (0); +} + +/* + * vcpu_exit_pci + * + * Handle all I/O to the emulated PCI subsystem. + * + * Parameters: + * vrp: vcpu run paramters containing guest state for this exit + * + * Return value: + * Interrupt to inject to the guest VM, or 0xFF if no interrupt should + * be injected. 
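+ *
+ * For example, a guest performing PCI mode-1 config cycles first
+ * writes a bus/device/function/register selector to 0xcf8
+ * (PCI_MODE1_ADDRESS_REG) and then reads or writes the data window
+ * at 0xcfc (PCI_MODE1_DATA_REG); both exits are routed here through
+ * ioports_map[].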
+ */ +uint8_t +vcpu_exit_pci(struct vm_run_params *vrp) +{ + union vm_exit *vei = vrp->vrp_exit; + uint8_t intr; + + intr = 0xFF; + + switch (vei->vei.vei_port) { + case PCI_MODE1_ADDRESS_REG: + pci_handle_address_reg(vrp); + break; + case PCI_MODE1_DATA_REG: + pci_handle_data_reg(vrp); + break; + case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: + intr = pci_handle_io(vrp); + break; + default: + log_warnx("%s: unknown PCI register 0x%llx", + __progname, (uint64_t)vei->vei.vei_port); + break; + } + + return (intr); +} + +/* + * vcpu_exit_inout + * + * Handle all I/O exits that need to be emulated in vmd. This includes the + * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. + * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + */ +void +vcpu_exit_inout(struct vm_run_params *vrp) +{ + union vm_exit *vei = vrp->vrp_exit; + uint8_t intr = 0xFF; + + if (ioports_map[vei->vei.vei_port] != NULL) + intr = ioports_map[vei->vei.vei_port](vrp); + else if (vei->vei.vei_dir == VEI_DIR_IN) + vei->vei.vei_data = 0xFFFFFFFF; + + if (intr != 0xFF) + vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); +} + +/* + * vcpu_exit + * + * Handle a vcpu exit. This function is called when it is determined that + * vmm(4) requires the assistance of vmd to support a particular guest + * exit type (eg, accessing an I/O port or device). Guest state is contained + * in 'vrp', and will be resent to vmm(4) on exit completion. + * + * Upon conclusion of handling the exit, the function determines if any + * interrupts should be injected into the guest, and asserts the proper + * IRQ line whose interrupt should be vectored. + * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + * + * Return values: + * 0: the exit was handled successfully + * 1: an error occurred (eg, unknown exit reason passed in 'vrp') + */ +int +vcpu_exit(struct vm_run_params *vrp) +{ + int ret; + + switch (vrp->vrp_exit_reason) { + case VMX_EXIT_INT_WINDOW: + case VMX_EXIT_EXTINT: + case VMX_EXIT_EPT_VIOLATION: + case SVM_VMEXIT_NPF: + /* + * We may be exiting to vmd to handle a pending interrupt but + * at the same time the last exit type may have been one of + * these. In this case, there's nothing extra to be done + * here (and falling through to the default case below results + * in more vmd log spam). + */ + break; + case VMX_EXIT_IO: + case SVM_VMEXIT_IOIO: + vcpu_exit_inout(vrp); + break; + case VMX_EXIT_HLT: + case SVM_VMEXIT_HLT: + ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); + if (ret) { + log_warnx("%s: can't lock vcpu mutex (%d)", + __func__, ret); + return (ret); + } + vcpu_hlt[vrp->vrp_vcpu_id] = 1; + ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); + if (ret) { + log_warnx("%s: can't unlock vcpu mutex (%d)", + __func__, ret); + return (ret); + } + break; + case VMX_EXIT_TRIPLE_FAULT: + case SVM_VMEXIT_SHUTDOWN: + /* XXX reset VM since we do not support reboot yet */ + return (EAGAIN); + default: + log_debug("%s: unknown exit reason %d", + __progname, vrp->vrp_exit_reason); + } + + /* Process any pending traffic */ + vionet_process_rx(vrp->vrp_vm_id); + + vrp->vrp_continue = 1; + + return (0); +} + +/* + * find_gpa_range + * + * Search for a contiguous guest physical mem range. 
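+ *
+ * Worked example (illustrative numbers): with ranges
+ * [0x0, 0x9f000) and [0x9f000, 0x100000), a lookup of gpa 0x9e000
+ * with len 0x2000 succeeds because the second range starts exactly
+ * where the first ends; a lookup of gpa 0xff000 with len 0x2000
+ * returns NULL because the span runs past the end of the last range.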
+ * + * Parameters: + * vcp: VM create parameters that contain the memory map to search in + * gpa: the starting guest physical address + * len: the length of the memory range + * + * Return values: + * NULL: on failure if there is no memory range as described by the parameters + * Pointer to vm_mem_range that contains the start of the range otherwise. + */ +static struct vm_mem_range * +find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) +{ + size_t i, n; + struct vm_mem_range *vmr; + + /* Find the first vm_mem_range that contains gpa */ + for (i = 0; i < vcp->vcp_nmemranges; i++) { + vmr = &vcp->vcp_memranges[i]; + if (vmr->vmr_gpa + vmr->vmr_size >= gpa) + break; + } + + /* No range found. */ + if (i == vcp->vcp_nmemranges) + return (NULL); + + /* + * vmr may cover the range [gpa, gpa + len) only partly. Make + * sure that the following vm_mem_ranges are contiguous and + * cover the rest. + */ + n = vmr->vmr_size - (gpa - vmr->vmr_gpa); + if (len < n) + len = 0; + else + len -= n; + gpa = vmr->vmr_gpa + vmr->vmr_size; + for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) { + vmr = &vcp->vcp_memranges[i]; + if (gpa != vmr->vmr_gpa) + return (NULL); + if (len <= vmr->vmr_size) + len = 0; + else + len -= vmr->vmr_size; + + gpa = vmr->vmr_gpa + vmr->vmr_size; + } + + if (len != 0) + return (NULL); + + return (vmr); +} + +/* + * write_mem + * + * Copies data from 'buf' into the guest VM's memory at paddr 'dst'. + * + * Parameters: + * dst: the destination paddr_t in the guest VM + * buf: data to copy + * len: number of bytes to copy + * + * Return values: + * 0: success + * EINVAL: if the guest physical memory range [dst, dst + len) does not + * exist in the guest. + */ +int +write_mem(paddr_t dst, void *buf, size_t len) +{ + char *from = buf, *to; + size_t n, off; + struct vm_mem_range *vmr; + + vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, dst, len); + if (vmr == NULL) { + errno = EINVAL; + log_warn("%s: failed - invalid memory range dst = 0x%lx, " + "len = 0x%zx", __func__, dst, len); + return (EINVAL); + } + + off = dst - vmr->vmr_gpa; + while (len != 0) { + n = vmr->vmr_size - off; + if (len < n) + n = len; + + to = (char *)vmr->vmr_va + off; + memcpy(to, from, n); + + from += n; + len -= n; + off = 0; + vmr++; + } + + return (0); +} + +/* + * read_mem + * + * Reads memory at guest paddr 'src' into 'buf'. + * + * Parameters: + * src: the source paddr_t in the guest VM to read from. + * buf: destination (local) buffer + * len: number of bytes to read + * + * Return values: + * 0: success + * EINVAL: if the guest physical memory range [dst, dst + len) does not + * exist in the guest. 
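+ *
+ * Usage sketch (hypothetical caller and guest address): pulling a
+ * page of guest data into a local buffer:
+ *
+ *	char page[4096];
+ *	if (read_mem(0x2000, page, sizeof(page)) != 0)
+ *		log_warnx("gpa 0x2000 not backed by guest memory");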
+ */ +int +read_mem(paddr_t src, void *buf, size_t len) +{ + char *from, *to = buf; + size_t n, off; + struct vm_mem_range *vmr; + + vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, src, len); + if (vmr == NULL) { + errno = EINVAL; + log_warn("%s: failed - invalid memory range src = 0x%lx, " + "len = 0x%zx", __func__, src, len); + return (EINVAL); + } + + off = src - vmr->vmr_gpa; + while (len != 0) { + n = vmr->vmr_size - off; + if (len < n) + n = len; + + from = (char *)vmr->vmr_va + off; + memcpy(to, from, n); + + to += n; + len -= n; + off = 0; + vmr++; + } + + return (0); +} + +/* + * vcpu_assert_pic_irq + * + * Injects the specified IRQ on the supplied vcpu/vm + * + * Parameters: + * vm_id: VM ID to inject to + * vcpu_id: VCPU ID to inject to + * irq: IRQ to inject + */ +void +vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) +{ + int ret; + + i8259_assert_irq(irq); + + if (i8259_is_pending()) { + if (vcpu_pic_intr(vm_id, vcpu_id, 1)) + fatalx("%s: can't assert INTR", __func__); + + ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]); + if (ret) + fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret); + + vcpu_hlt[vcpu_id] = 0; + ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]); + if (ret) + fatalx("%s: can't signal (%d)", __func__, ret); + ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]); + if (ret) + fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret); + } +} + +/* + * fd_hasdata + * + * Determines if data can be read from a file descriptor. + * + * Parameters: + * fd: the fd to check + * + * Return values: + * 1 if data can be read from an fd, or 0 otherwise. + */ +int +fd_hasdata(int fd) +{ + struct pollfd pfd[1]; + int nready, hasdata = 0; + + pfd[0].fd = fd; + pfd[0].events = POLLIN; + nready = poll(pfd, 1, 0); + if (nready == -1) + log_warn("checking file descriptor for data failed"); + else if (nready == 1 && pfd[0].revents & POLLIN) + hasdata = 1; + return (hasdata); +} + +/* + * mutex_lock + * + * Wrapper function for pthread_mutex_lock that does error checking and that + * exits on failure + */ +void +mutex_lock(pthread_mutex_t *m) +{ + int ret; + + ret = pthread_mutex_lock(m); + if (ret) { + errno = ret; + fatal("could not acquire mutex"); + } +} + +/* + * mutex_unlock + * + * Wrapper function for pthread_mutex_unlock that does error checking and that + * exits on failure + */ +void +mutex_unlock(pthread_mutex_t *m) +{ + int ret; + + ret = pthread_mutex_unlock(m); + if (ret) { + errno = ret; + fatal("could not release mutex"); + } +} diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h index 34f8f2192f5..3046bb36c2b 100644 --- a/usr.sbin/vmd/vmd.h +++ b/usr.sbin/vmd/vmd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.h,v 1.45 2017/03/01 07:43:33 reyk Exp $ */ +/* $OpenBSD: vmd.h,v 1.46 2017/03/01 18:00:50 reyk Exp $ */ /* * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> @@ -229,6 +229,10 @@ int opentap(char *); int fd_hasdata(int); void mutex_lock(pthread_mutex_t *); void mutex_unlock(pthread_mutex_t *); +int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *)); + +/* vm.c */ +int start_vm(struct vmd_vm *, int); /* control.c */ int config_init(struct vmd *); diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c index 5f46d7a567e..ef4bc810876 100644 --- a/usr.sbin/vmd/vmm.c +++ b/usr.sbin/vmd/vmm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.c,v 1.65 2017/01/24 09:58:00 mlarkin Exp $ */ +/* $OpenBSD: vmm.c,v 1.66 2017/03/01 18:00:50 reyk Exp $ */ /* * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> @@ -52,102 +52,22 @@ #include "vmd.h" #include "vmm.h" 
-#include "loadfile.h" -#include "pci.h" -#include "virtio.h" -#include "proc.h" -#include "i8253.h" -#include "i8259.h" -#include "ns8250.h" -#include "mc146818.h" - -io_fn_t ioports_map[MAX_PORTS]; void vmm_sighdlr(int, short, void *); -int opentap(char *); -int start_vm(struct imsg *, uint32_t *); -int terminate_vm(struct vm_terminate_params *); -int get_info_vm(struct privsep *, struct imsg *, int); -int run_vm(int *, int *, struct vm_create_params *, struct vcpu_reg_state *); -void *event_thread(void *); -void *vcpu_run_loop(void *); -int vcpu_exit(struct vm_run_params *); -int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); -void create_memory_map(struct vm_create_params *); -int alloc_guest_mem(struct vm_create_params *); -int vmm_create_vm(struct vm_create_params *); -void init_emulated_hw(struct vm_create_params *, int *, int *); -void vcpu_exit_inout(struct vm_run_params *); -uint8_t vcpu_exit_pci(struct vm_run_params *); +int vmm_start_vm(struct imsg *, uint32_t *); int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *); void vmm_run(struct privsep *, struct privsep_proc *, void *); -int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); - -int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *)); void vmm_dispatch_vm(int, short, void *); -void vm_dispatch_vmm(int, short, void *); - -static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, - size_t); - -int con_fd; -struct vmd_vm *current_vm; +int terminate_vm(struct vm_terminate_params *); +int get_info_vm(struct privsep *, struct imsg *, int); +int opentap(char *); extern struct vmd *env; -extern char *__progname; - -pthread_mutex_t threadmutex; -pthread_cond_t threadcond; - -pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM]; -pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM]; -uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM]; -uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM]; - static struct privsep_proc procs[] = { { "parent", PROC_PARENT, vmm_dispatch_parent }, }; -/* - * Represents a standard register set for an OS to be booted - * as a flat 32 bit address space, before paging is enabled. - * - * NOT set here are: - * RIP - * RSP - * GDTR BASE - * - * Specific bootloaders should clone this structure and override - * those fields as needed. - * - * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on - * features of the CPU in use. 
- */ -static const struct vcpu_reg_state vcpu_init_flat32 = { -#ifdef __i386__ - .vrs_gprs[VCPU_REGS_EFLAGS] = 0x2, - .vrs_gprs[VCPU_REGS_EIP] = 0x0, - .vrs_gprs[VCPU_REGS_ESP] = 0x0, -#else - .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, - .vrs_gprs[VCPU_REGS_RIP] = 0x0, - .vrs_gprs[VCPU_REGS_RSP] = 0x0, -#endif - .vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG, - .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, - .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, - .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, - .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, - .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, - .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, -}; - void vmm(struct privsep *ps, struct privsep_proc *p) { @@ -212,7 +132,7 @@ vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg) } break; case IMSG_VMDOP_START_VM_END: - res = start_vm(imsg, &id); + res = vmm_start_vm(imsg, &id); cmd = IMSG_VMDOP_START_VM_RESPONSE; break; case IMSG_VMDOP_TERMINATE_VM_REQUEST: @@ -386,6 +306,12 @@ vmm_shutdown(void) } } +/* + * vmm_pipe + * + * Create a new imsg control channel between vmm parent and a VM + * (can be called on both sides). + */ int vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *)) { @@ -404,6 +330,11 @@ vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *)) return (0); } +/* + * vmm_dispatch_vm + * + * imsg callback for messages that are received from a VM child process. + */ void vmm_dispatch_vm(int fd, short event, void *arg) { @@ -456,100 +387,6 @@ vmm_dispatch_vm(int fd, short event, void *arg) imsg_event_add(iev); } -void -vm_dispatch_vmm(int fd, short event, void *arg) -{ - struct vmd_vm *vm = arg; - struct imsgev *iev = &vm->vm_iev; - struct imsgbuf *ibuf = &iev->ibuf; - struct imsg imsg; - ssize_t n; - int verbose; - - if (event & EV_READ) { - if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) - fatal("%s: imsg_read", __func__); - if (n == 0) - _exit(0); - } - - if (event & EV_WRITE) { - if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) - fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd); - if (n == 0) - _exit(0); - } - - for (;;) { - if ((n = imsg_get(ibuf, &imsg)) == -1) - fatal("%s: imsg_get", __func__); - if (n == 0) - break; - -#if DEBUG > 1 - log_debug("%s: got imsg %d from %s", - __func__, imsg.hdr.type, - vm->vm_params.vmc_params.vcp_name); -#endif - - switch (imsg.hdr.type) { - case IMSG_CTL_VERBOSE: - IMSG_SIZE_CHECK(&imsg, &verbose); - memcpy(&verbose, imsg.data, sizeof(verbose)); - log_setverbose(verbose); - break; - case IMSG_VMDOP_VM_SHUTDOWN: - if (vmmci_ctl(VMMCI_SHUTDOWN) == -1) - _exit(0); - break; - case IMSG_VMDOP_VM_REBOOT: - if (vmmci_ctl(VMMCI_REBOOT) == -1) - _exit(0); - break; - default: - fatalx("%s: got invalid imsg %d from %s", - __func__, imsg.hdr.type, - vm->vm_params.vmc_params.vcp_name); - } - imsg_free(&imsg); - } - imsg_event_add(iev); -} - -/* - * vcpu_reset - * - * Requests vmm(4) to reset the VCPUs in the indicated VM to - * the register state provided - * - * Parameters - * vmid: VM ID to reset - * vcpu_id: VCPU ID to reset - * vrs: the register state to initialize - * - * Return values: - * 0: success - * !0 : ioctl to vmm(4) failed (eg, ENOENT if the 
supplied VM ID is not - * valid) - */ -int -vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) -{ - struct vm_resetcpu_params vrp; - - memset(&vrp, 0, sizeof(vrp)); - vrp.vrp_vm_id = vmid; - vrp.vrp_vcpu_id = vcpu_id; - memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); - - log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); - - if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0) - return (errno); - - return (0); -} - /* * terminate_vm * @@ -605,40 +442,26 @@ opentap(char *ifname) } /* - * start_vm - * - * Starts a new VM with the creation parameters supplied (in the incoming - * imsg->data field). This function performs a basic sanity check on the - * incoming parameters and then performs the following steps to complete - * the creation of the VM: + * vmm_start_vm * - * 1. opens the VM disk image files specified in the VM creation parameters - * 2. opens the specified VM kernel - * 3. creates a VM console tty pair using openpty - * 4. forks, passing the file descriptors opened in steps 1-3 to the child - * vmd responsible for dropping privilege and running the VM's VCPU - * loops. + * Prepares and forks a new VM process. * * Parameters: - * imsg: The incoming imsg body whose 'data' field is a vm_create_params - * struct containing the VM creation parameters. - * id: Returns the VM id as reported by the kernel. + * imsg: The VM data structure that is including the VM create parameters. + * id: Returns the VM id as reported by the kernel and obtained from the VM. * * Return values: * 0: success * !0 : failure - typically an errno indicating the source of the failure */ int -start_vm(struct imsg *imsg, uint32_t *id) +vmm_start_vm(struct imsg *imsg, uint32_t *id) { struct vm_create_params *vcp; - struct vmboot_params vmboot; struct vmd_vm *vm; - size_t i; int ret = EINVAL; - int fds[2], nicfds[VMM_MAX_NICS_PER_VM]; - struct vcpu_reg_state vrs; - FILE *kernfp; + int fds[2]; + size_t i; if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) { log_warnx("%s: can't find vm", __func__); @@ -668,6 +491,7 @@ start_vm(struct imsg *imsg, uint32_t *id) if (ret > 0) { /* Parent */ vm->vm_pid = ret; + close(fds[1]); for (i = 0 ; i < vcp->vcp_ndisks; i++) { close(vm->vm_disks[i]); @@ -686,7 +510,6 @@ start_vm(struct imsg *imsg, uint32_t *id) vm->vm_tty = -1; /* read back the kernel-generated vm id from the child */ - close(fds[1]); if (read(fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)) != sizeof(vcp->vcp_id)) fatal("read vcp id"); @@ -702,76 +525,9 @@ start_vm(struct imsg *imsg, uint32_t *id) return (0); } else { /* Child */ - setproctitle("%s", vcp->vcp_name); - log_procinit(vcp->vcp_name); - - create_memory_map(vcp); - ret = alloc_guest_mem(vcp); - if (ret) { - errno = ret; - fatal("could not allocate guest memory - exiting"); - } - - ret = vmm_create_vm(vcp); - current_vm = vm; - - /* send back the kernel-generated vm id (0 on error) */ close(fds[0]); - if (write(fds[1], &vcp->vcp_id, sizeof(vcp->vcp_id)) != - sizeof(vcp->vcp_id)) - fatal("write vcp id"); - - if (ret) { - errno = ret; - fatal("create vmm ioctl failed - exiting"); - } - - /* - * pledge in the vm processes: - * stdio - for malloc and basic I/O including events. - * vmm - for the vmm ioctls and operations. 
- */ - if (pledge("stdio vmm", NULL) == -1) - fatal("pledge"); - - /* - * Set up default "flat 32 bit" register state - RIP, - * RSP, and GDT info will be set in bootloader - */ - memcpy(&vrs, &vcpu_init_flat32, sizeof(struct vcpu_reg_state)); - - /* Find and open kernel image */ - if ((kernfp = vmboot_open(vm->vm_kernel, - vm->vm_disks[0], &vmboot)) == NULL) - fatalx("failed to open kernel - exiting"); - - /* Load kernel image */ - ret = loadelf_main(kernfp, vcp, &vrs, - vmboot.vbp_bootdev, vmboot.vbp_howto); - if (ret) { - errno = ret; - fatal("failed to load kernel - exiting"); - } - - vmboot_close(kernfp, &vmboot); - - if (vm->vm_kernel != -1) - close(vm->vm_kernel); - - con_fd = vm->vm_tty; - if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) - fatal("failed to set nonblocking mode on console"); - - for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) - nicfds[i] = vm->vm_ifs[i].vif_fd; - - event_init(); - - if (vmm_pipe(vm, fds[1], vm_dispatch_vmm) == -1) - fatal("setup vm pipe"); - /* Execute the vcpu run loop(s) for this VM */ - ret = run_vm(vm->vm_disks, nicfds, vcp, &vrs); + ret = start_vm(vm, fds[1]); _exit(ret); } @@ -864,924 +620,3 @@ get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate) free(info); return (0); } - -/* - * create_memory_map - * - * Sets up the guest physical memory ranges that the VM can access. - * - * Return values: - * nothing - */ -void -create_memory_map(struct vm_create_params *vcp) -{ - size_t len, mem_bytes, mem_mb; - - mem_mb = vcp->vcp_memranges[0].vmr_size; - vcp->vcp_nmemranges = 0; - if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE) - return; - - mem_bytes = mem_mb * 1024 * 1024; - - /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ - len = LOWMEM_KB * 1024; - vcp->vcp_memranges[0].vmr_gpa = 0x0; - vcp->vcp_memranges[0].vmr_size = len; - mem_bytes -= len; - - /* - * Second memory region: LOWMEM_KB - 1MB. - * - * N.B. - Normally ROMs or parts of video RAM are mapped here. - * We have to add this region, because some systems - * unconditionally write to 0xb8000 (VGA RAM), and - * we need to make sure that vmm(4) permits accesses - * to it. So allocate guest memory for it. - */ - len = 0x100000 - LOWMEM_KB * 1024; - vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; - vcp->vcp_memranges[1].vmr_size = len; - mem_bytes -= len; - - /* Make sure that we do not place physical memory into MMIO ranges. */ - if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000) - len = VMM_PCI_MMIO_BAR_BASE - 0x100000; - else - len = mem_bytes; - - /* Third memory region: 1MB - (1MB + len) */ - vcp->vcp_memranges[2].vmr_gpa = 0x100000; - vcp->vcp_memranges[2].vmr_size = len; - mem_bytes -= len; - - if (mem_bytes > 0) { - /* Fourth memory region for the remaining memory (if any) */ - vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; - vcp->vcp_memranges[3].vmr_size = mem_bytes; - vcp->vcp_nmemranges = 4; - } else - vcp->vcp_nmemranges = 3; -} - -/* - * alloc_guest_mem - * - * Allocates memory for the guest. - * Instead of doing a single allocation with one mmap(), we allocate memory - * separately for every range for the following reasons: - * - ASLR for the individual ranges - * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to - * map the single mmap'd userspace memory to the individual guest physical - * memory ranges, the underlying amap of the single mmap'd range would have - * to allocate per-page reference counters. The reason is that the - * individual guest physical ranges would reference the single mmap'd region - * only partially. 
However, if every guest physical range has its own - * corresponding mmap'd userspace allocation, there are no partial - * references: every guest physical range fully references an mmap'd - * range => no per-page reference counters have to be allocated. - * - * Return values: - * 0: success - * !0: failure - errno indicating the source of the failure - */ -int -alloc_guest_mem(struct vm_create_params *vcp) -{ - void *p; - int ret; - size_t i, j; - struct vm_mem_range *vmr; - - for (i = 0; i < vcp->vcp_nmemranges; i++) { - vmr = &vcp->vcp_memranges[i]; - p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); - if (p == MAP_FAILED) { - ret = errno; - for (j = 0; j < i; j++) { - vmr = &vcp->vcp_memranges[j]; - munmap((void *)vmr->vmr_va, vmr->vmr_size); - } - - return (ret); - } - - vmr->vmr_va = (vaddr_t)p; - } - - return (0); -} - -/* - * vmm_create_vm - * - * Requests vmm(4) to create a new VM using the supplied creation - * parameters. This operation results in the creation of the in-kernel - * structures for the VM, but does not start the VM's vcpu(s). - * - * Parameters: - * vcp: vm_create_params struct containing the VM's desired creation - * configuration - * - * Return values: - * 0: success - * !0 : ioctl to vmm(4) failed - */ -int -vmm_create_vm(struct vm_create_params *vcp) -{ - /* Sanity check arguments */ - if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) - return (EINVAL); - - if (vcp->vcp_nmemranges == 0 || - vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) - return (EINVAL); - - if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) - return (EINVAL); - - if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) - return (EINVAL); - - if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0) - return (errno); - - return (0); -} - -/* - * init_emulated_hw - * - * Initializes the userspace hardware emulation - */ -void -init_emulated_hw(struct vm_create_params *vcp, int *child_disks, - int *child_taps) -{ - int i; - - /* Reset the IO port map */ - memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); - - /* Init i8253 PIT */ - i8253_init(vcp->vcp_id); - ioports_map[TIMER_CTRL] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; - - /* Init mc146818 RTC */ - mc146818_init(vcp->vcp_id); - ioports_map[IO_RTC] = vcpu_exit_mc146818; - ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; - - /* Init master and slave PICs */ - i8259_init(); - ioports_map[IO_ICU1] = vcpu_exit_i8259; - ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; - ioports_map[IO_ICU2] = vcpu_exit_i8259; - ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; - - /* Init ns8250 UART */ - ns8250_init(con_fd, vcp->vcp_id); - for (i = COM1_DATA; i <= COM1_SCR; i++) - ioports_map[i] = vcpu_exit_com; - - /* Initialize PCI */ - for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++) - ioports_map[i] = vcpu_exit_pci; - - ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; - ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; - pci_init(); - - /* Initialize virtio devices */ - virtio_init(vcp, child_disks, child_taps); -} - -/* - * run_vm - * - * Runs the VM whose creation parameters are specified in vcp - * - * Parameters: - * child_disks: previously-opened child VM disk file file descriptors - * child_taps: previously-opened child tap file descriptors - * vcp: vm_create_params struct containing the VM's desired creation - * configuration - * vrs: VCPU register state to initialize - * - * Return values: - * 0: the VM 
exited normally - * !0 : the VM exited abnormally or failed to start - */ -int -run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp, - struct vcpu_reg_state *vrs) -{ - uint8_t evdone = 0; - size_t i; - int ret; - pthread_t *tid, evtid; - struct vm_run_params **vrp; - void *exit_status; - - if (vcp == NULL) - return (EINVAL); - - if (child_disks == NULL && vcp->vcp_ndisks != 0) - return (EINVAL); - - if (child_taps == NULL && vcp->vcp_nnics != 0) - return (EINVAL); - - if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) - return (EINVAL); - - if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) - return (EINVAL); - - if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) - return (EINVAL); - - if (vcp->vcp_nmemranges == 0 || - vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) - return (EINVAL); - - tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t)); - vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *)); - if (tid == NULL || vrp == NULL) { - log_warn("%s: memory allocation error - exiting.", - __progname); - return (ENOMEM); - } - - log_debug("%s: initializing hardware for vm %s", __func__, - vcp->vcp_name); - - init_emulated_hw(vcp, child_disks, child_taps); - - ret = pthread_mutex_init(&threadmutex, NULL); - if (ret) { - log_warn("%s: could not initialize thread state mutex", - __func__); - return (ret); - } - ret = pthread_cond_init(&threadcond, NULL); - if (ret) { - log_warn("%s: could not initialize thread state " - "condition variable", __func__); - return (ret); - } - - mutex_lock(&threadmutex); - - log_debug("%s: starting vcpu threads for vm %s", __func__, - vcp->vcp_name); - - /* - * Create and launch one thread for each VCPU. These threads may - * migrate between PCPUs over time; the need to reload CPU state - * in such situations is detected and performed by vmm(4) in the - * kernel. 
- */ - for (i = 0 ; i < vcp->vcp_ncpus; i++) { - vrp[i] = malloc(sizeof(struct vm_run_params)); - if (vrp[i] == NULL) { - log_warn("%s: memory allocation error - " - "exiting.", __progname); - /* caller will exit, so skip free'ing */ - return (ENOMEM); - } - vrp[i]->vrp_exit = malloc(sizeof(union vm_exit)); - if (vrp[i]->vrp_exit == NULL) { - log_warn("%s: memory allocation error - " - "exiting.", __progname); - /* caller will exit, so skip free'ing */ - return (ENOMEM); - } - vrp[i]->vrp_vm_id = vcp->vcp_id; - vrp[i]->vrp_vcpu_id = i; - - if (vcpu_reset(vcp->vcp_id, i, vrs)) { - log_warnx("%s: cannot reset VCPU %zu - exiting.", - __progname, i); - return (EIO); - } - - ret = pthread_cond_init(&vcpu_run_cond[i], NULL); - if (ret) { - log_warnx("%s: cannot initialize cond var (%d)", - __progname, ret); - return (ret); - } - - ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL); - if (ret) { - log_warnx("%s: cannot initialize mtx (%d)", - __progname, ret); - return (ret); - } - - vcpu_hlt[i] = 0; - - /* Start each VCPU run thread at vcpu_run_loop */ - ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); - if (ret) { - /* caller will _exit after this return */ - ret = errno; - log_warn("%s: could not create vcpu thread %zu", - __func__, i); - return (ret); - } - } - - log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); - ret = pthread_create(&evtid, NULL, event_thread, &evdone); - if (ret) { - errno = ret; - log_warn("%s: could not create event thread", __func__); - return (ret); - } - - for (;;) { - ret = pthread_cond_wait(&threadcond, &threadmutex); - if (ret) { - log_warn("%s: waiting on thread state condition " - "variable failed", __func__); - return (ret); - } - - /* - * Did a VCPU thread exit with an error? => return the first one - */ - for (i = 0; i < vcp->vcp_ncpus; i++) { - if (vcpu_done[i] == 0) - continue; - - if (pthread_join(tid[i], &exit_status)) { - log_warn("%s: failed to join thread %zd - " - "exiting", __progname, i); - return (EIO); - } - - ret = (long long)exit_status; - } - - /* Did the event thread exit? => return with an error */ - if (evdone) { - if (pthread_join(evtid, &exit_status)) { - log_warn("%s: failed to join event thread - " - "exiting", __progname); - return (EIO); - } - - log_warnx("%s: vm %d event thread exited " - "unexpectedly", __progname, vcp->vcp_id); - return (EIO); - } - - /* Did all VCPU threads exit successfully? => return */ - for (i = 0; i < vcp->vcp_ncpus; i++) { - if (vcpu_done[i] == 0) - break; - } - if (i == vcp->vcp_ncpus) - return (ret); - - /* Some more threads to wait for, start over */ - } - - return (ret); -} - -void * -event_thread(void *arg) -{ - uint8_t *donep = arg; - intptr_t ret; - - ret = event_dispatch(); - - mutex_lock(&threadmutex); - *donep = 1; - pthread_cond_signal(&threadcond); - mutex_unlock(&threadmutex); - - return (void *)ret; - } - -/* - * vcpu_run_loop - * - * Runs a single VCPU until vmm(4) requires help handling an exit, - * or the VM terminates. 
- * - * Parameters: - * arg: vcpu_run_params for the VCPU being run by this thread - * - * Return values: - * NULL: the VCPU shutdown properly - * !NULL: error processing VCPU run, or the VCPU shutdown abnormally - */ -void * -vcpu_run_loop(void *arg) -{ - struct vm_run_params *vrp = (struct vm_run_params *)arg; - intptr_t ret = 0; - int irq; - uint32_t n; - - vrp->vrp_continue = 0; - n = vrp->vrp_vcpu_id; - - for (;;) { - ret = pthread_mutex_lock(&vcpu_run_mtx[n]); - - if (ret) { - log_warnx("%s: can't lock vcpu run mtx (%d)", - __func__, (int)ret); - return ((void *)ret); - } - - /* If we are halted, wait */ - if (vcpu_hlt[n]) { - ret = pthread_cond_wait(&vcpu_run_cond[n], - &vcpu_run_mtx[n]); - - if (ret) { - log_warnx("%s: can't wait on cond (%d)", - __func__, (int)ret); - (void)pthread_mutex_unlock(&vcpu_run_mtx[n]); - break; - } - } - - ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); - if (ret) { - log_warnx("%s: can't unlock mutex on cond (%d)", - __func__, (int)ret); - break; - } - - if (vrp->vrp_irqready && i8259_is_pending()) { - irq = i8259_ack(); - vrp->vrp_irq = irq; - } else - vrp->vrp_irq = 0xFFFF; - - /* Still more pending? */ - if (i8259_is_pending()) { - /* XXX can probably avoid ioctls here by providing intr in vrp */ - if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 1)) { - fatal("can't set INTR"); - } - } else { - if (vcpu_pic_intr(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 0)) { - fatal("can't clear INTR"); - } - } - - if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) { - /* If run ioctl failed, exit */ - ret = errno; - log_warn("%s: vm %d / vcpu %d run ioctl failed", - __func__, vrp->vrp_vm_id, n); - break; - } - - /* If the VM is terminating, exit normally */ - if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) { - ret = (intptr_t)NULL; - break; - } - - if (vrp->vrp_exit_reason != VM_EXIT_NONE) { - /* - * vmm(4) needs help handling an exit, handle in - * vcpu_exit. - */ - ret = vcpu_exit(vrp); - if (ret) - break; - } - } - - mutex_lock(&threadmutex); - vcpu_done[n] = 1; - pthread_cond_signal(&threadcond); - mutex_unlock(&threadmutex); - - return ((void *)ret); -} - -int -vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) -{ - struct vm_intr_params vip; - - memset(&vip, 0, sizeof(vip)); - - vip.vip_vm_id = vm_id; - vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */ - vip.vip_intr = intr; - - if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0) - return (errno); - - return (0); -} - -/* - * vcpu_exit_pci - * - * Handle all I/O to the emulated PCI subsystem. - * - * Parameters: - * vrp: vcpu run paramters containing guest state for this exit - * - * Return value: - * Interrupt to inject to the guest VM, or 0xFF if no interrupt should - * be injected. - */ -uint8_t -vcpu_exit_pci(struct vm_run_params *vrp) -{ - union vm_exit *vei = vrp->vrp_exit; - uint8_t intr; - - intr = 0xFF; - - switch (vei->vei.vei_port) { - case PCI_MODE1_ADDRESS_REG: - pci_handle_address_reg(vrp); - break; - case PCI_MODE1_DATA_REG: - pci_handle_data_reg(vrp); - break; - case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: - intr = pci_handle_io(vrp); - break; - default: - log_warnx("%s: unknown PCI register 0x%llx", - __progname, (uint64_t)vei->vei.vei_port); - break; - } - - return (intr); -} - -/* - * vcpu_exit_inout - * - * Handle all I/O exits that need to be emulated in vmd. This includes the - * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. 
- * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - */ -void -vcpu_exit_inout(struct vm_run_params *vrp) -{ - union vm_exit *vei = vrp->vrp_exit; - uint8_t intr = 0xFF; - - if (ioports_map[vei->vei.vei_port] != NULL) - intr = ioports_map[vei->vei.vei_port](vrp); - else if (vei->vei.vei_dir == VEI_DIR_IN) - vei->vei.vei_data = 0xFFFFFFFF; - - if (intr != 0xFF) - vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); -} - -/* - * vcpu_exit - * - * Handle a vcpu exit. This function is called when it is determined that - * vmm(4) requires the assistance of vmd to support a particular guest - * exit type (eg, accessing an I/O port or device). Guest state is contained - * in 'vrp', and will be resent to vmm(4) on exit completion. - * - * Upon conclusion of handling the exit, the function determines if any - * interrupts should be injected into the guest, and asserts the proper - * IRQ line whose interrupt should be vectored. - * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - * - * Return values: - * 0: the exit was handled successfully - * 1: an error occurred (eg, unknown exit reason passed in 'vrp') - */ -int -vcpu_exit(struct vm_run_params *vrp) -{ - int ret; - - switch (vrp->vrp_exit_reason) { - case VMX_EXIT_INT_WINDOW: - case VMX_EXIT_EXTINT: - case VMX_EXIT_EPT_VIOLATION: - case SVM_VMEXIT_NPF: - /* - * We may be exiting to vmd to handle a pending interrupt but - * at the same time the last exit type may have been one of - * these. In this case, there's nothing extra to be done - * here (and falling through to the default case below results - * in more vmd log spam). - */ - break; - case VMX_EXIT_IO: - case SVM_VMEXIT_IOIO: - vcpu_exit_inout(vrp); - break; - case VMX_EXIT_HLT: - case SVM_VMEXIT_HLT: - ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); - if (ret) { - log_warnx("%s: can't lock vcpu mutex (%d)", - __func__, ret); - return (ret); - } - vcpu_hlt[vrp->vrp_vcpu_id] = 1; - ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); - if (ret) { - log_warnx("%s: can't unlock vcpu mutex (%d)", - __func__, ret); - return (ret); - } - break; - case VMX_EXIT_TRIPLE_FAULT: - case SVM_VMEXIT_SHUTDOWN: - /* XXX reset VM since we do not support reboot yet */ - return (EAGAIN); - default: - log_debug("%s: unknown exit reason %d", - __progname, vrp->vrp_exit_reason); - } - - /* Process any pending traffic */ - vionet_process_rx(vrp->vrp_vm_id); - - vrp->vrp_continue = 1; - - return (0); -} - -/* - * find_gpa_range - * - * Search for a contiguous guest physical mem range. - * - * Parameters: - * vcp: VM create parameters that contain the memory map to search in - * gpa: the starting guest physical address - * len: the length of the memory range - * - * Return values: - * NULL: on failure if there is no memory range as described by the parameters - * Pointer to vm_mem_range that contains the start of the range otherwise. - */ -static struct vm_mem_range * -find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) -{ - size_t i, n; - struct vm_mem_range *vmr; - - /* Find the first vm_mem_range that contains gpa */ - for (i = 0; i < vcp->vcp_nmemranges; i++) { - vmr = &vcp->vcp_memranges[i]; - if (vmr->vmr_gpa + vmr->vmr_size >= gpa) - break; - } - - /* No range found. */ - if (i == vcp->vcp_nmemranges) - return (NULL); - - /* - * vmr may cover the range [gpa, gpa + len) only partly. Make - * sure that the following vm_mem_ranges are contiguous and - * cover the rest. 
- */
-	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
-	if (len < n)
-		len = 0;
-	else
-		len -= n;
-	gpa = vmr->vmr_gpa + vmr->vmr_size;
-	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
-		vmr = &vcp->vcp_memranges[i];
-		if (gpa != vmr->vmr_gpa)
-			return (NULL);
-		if (len <= vmr->vmr_size)
-			len = 0;
-		else
-			len -= vmr->vmr_size;
-
-		gpa = vmr->vmr_gpa + vmr->vmr_size;
-	}
-
-	if (len != 0)
-		return (NULL);
-
-	return (vmr);
-}
-
-/*
- * write_mem
- *
- * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
- *
- * Parameters:
- *  dst: the destination paddr_t in the guest VM
- *  buf: data to copy
- *  len: number of bytes to copy
- *
- * Return values:
- *  0: success
- *  EINVAL: if the guest physical memory range [dst, dst + len) does not
- *      exist in the guest.
- */
-int
-write_mem(paddr_t dst, void *buf, size_t len)
-{
-	char *from = buf, *to;
-	size_t n, off;
-	struct vm_mem_range *vmr;
-
-	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
-	if (vmr == NULL) {
-		errno = EINVAL;
-		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
-		    "len = 0x%zx", __func__, dst, len);
-		return (EINVAL);
-	}
-
-	off = dst - vmr->vmr_gpa;
-	while (len != 0) {
-		n = vmr->vmr_size - off;
-		if (len < n)
-			n = len;
-
-		to = (char *)vmr->vmr_va + off;
-		memcpy(to, from, n);
-
-		from += n;
-		len -= n;
-		off = 0;
-		vmr++;
-	}
-
-	return (0);
-}
-
-/*
- * read_mem
- *
- * Reads memory at guest paddr 'src' into 'buf'.
- *
- * Parameters:
- *  src: the source paddr_t in the guest VM to read from.
- *  buf: destination (local) buffer
- *  len: number of bytes to read
- *
- * Return values:
- *  0: success
- *  EINVAL: if the guest physical memory range [src, src + len) does not
- *      exist in the guest.
- */
-int
-read_mem(paddr_t src, void *buf, size_t len)
-{
-	char *from, *to = buf;
-	size_t n, off;
-	struct vm_mem_range *vmr;
-
-	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
-	if (vmr == NULL) {
-		errno = EINVAL;
-		log_warn("%s: failed - invalid memory range src = 0x%lx, "
-		    "len = 0x%zx", __func__, src, len);
-		return (EINVAL);
-	}
-
-	off = src - vmr->vmr_gpa;
-	while (len != 0) {
-		n = vmr->vmr_size - off;
-		if (len < n)
-			n = len;
-
-		from = (char *)vmr->vmr_va + off;
-		memcpy(to, from, n);
-
-		to += n;
-		len -= n;
-		off = 0;
-		vmr++;
-	}
-
-	return (0);
-}
-
-/*
- * vcpu_assert_pic_irq
- *
- * Injects the specified IRQ on the supplied vcpu/vm
- *
- * Parameters:
- *  vm_id: VM ID to inject to
- *  vcpu_id: VCPU ID to inject to
- *  irq: IRQ to inject
- */
-void
-vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
-{
-	int ret;
-
-	i8259_assert_irq(irq);
-
-	if (i8259_is_pending()) {
-		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
-			fatalx("%s: can't assert INTR", __func__);
-
-		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
-		if (ret)
-			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
-
-		vcpu_hlt[vcpu_id] = 0;
-		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
-		if (ret)
-			fatalx("%s: can't signal (%d)", __func__, ret);
-		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
-		if (ret)
-			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
-	}
-}
-
-/*
- * fd_hasdata
- *
- * Determines if data can be read from a file descriptor.
- *
- * Parameters:
- *  fd: the fd to check
- *
- * Return values:
- *  1 if data can be read from an fd, or 0 otherwise.
- */
-int
-fd_hasdata(int fd)
-{
-	struct pollfd pfd[1];
-	int nready, hasdata = 0;
-
-	pfd[0].fd = fd;
-	pfd[0].events = POLLIN;
-	nready = poll(pfd, 1, 0);
-	if (nready == -1)
-		log_warn("checking file descriptor for data failed");
-	else if (nready == 1 && pfd[0].revents & POLLIN)
-		hasdata = 1;
-	return (hasdata);
-}
-
-/*
- * mutex_lock
- *
- * Wrapper function for pthread_mutex_lock that does error checking and that
- * exits on failure
- */
-void
-mutex_lock(pthread_mutex_t *m)
-{
-	int ret;
-
-	ret = pthread_mutex_lock(m);
-	if (ret) {
-		errno = ret;
-		fatal("could not acquire mutex");
-	}
-}
-
-/*
- * mutex_unlock
- *
- * Wrapper function for pthread_mutex_unlock that does error checking and that
- * exits on failure
- */
-void
-mutex_unlock(pthread_mutex_t *m)
-{
-	int ret;
-
-	ret = pthread_mutex_unlock(m);
-	if (ret) {
-		errno = ret;
-		fatal("could not release mutex");
-	}
-}
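
For reference, the HLT handling removed above (and re-added in vm.c) is one half of a condition-variable handshake: vcpu_exit() parks a halted VCPU thread on vcpu_run_cond, and vcpu_assert_pic_irq() clears vcpu_hlt and signals that thread awake once an interrupt is pending. A minimal standalone sketch of the same handshake, with hypothetical names and a single VCPU (not vmd code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Standalone sketch: names are illustrative, not vmd's. */
static pthread_mutex_t run_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t run_cond = PTHREAD_COND_INITIALIZER;
static int irqs_pending;	/* stands in for "vcpu_hlt[n] was cleared" */

/* VCPU side: on HLT, sleep on the cond var until an interrupt arrives. */
static void *
vcpu_thread(void *arg)
{
	int i;

	for (i = 0; i < 3; i++) {
		pthread_mutex_lock(&run_mtx);
		while (irqs_pending == 0)	/* tolerate spurious wakeups */
			pthread_cond_wait(&run_cond, &run_mtx);
		irqs_pending--;
		pthread_mutex_unlock(&run_mtx);
		printf("vcpu: woken, resuming guest (pass %d)\n", i);
	}
	return (NULL);
}

/* Device side: what vcpu_assert_pic_irq() does for a halted VCPU. */
static void
assert_irq(void)
{
	pthread_mutex_lock(&run_mtx);
	irqs_pending++;
	pthread_cond_signal(&run_cond);
	pthread_mutex_unlock(&run_mtx);
}

int
main(void)
{
	pthread_t tid;
	int i;

	pthread_create(&tid, NULL, vcpu_thread, NULL);
	for (i = 0; i < 3; i++) {
		usleep(10000);		/* pretend a device raised an IRQ */
		assert_irq();
	}
	pthread_join(tid, NULL);
	return (0);
}

The sketch counts pending interrupts so a signal sent before the sleeper arrives is not lost; vmd needs no counter because vcpu_assert_pic_irq() clears vcpu_hlt and signals while holding vcpu_run_mtx, and a spurious wakeup merely costs the run loop one extra pass through the VMM_IOC_RUN ioctl.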
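
find_gpa_range(), whose tail opens this hunk, establishes the invariant that write_mem() and read_mem() rely on: the span [gpa, gpa + len) is covered by consecutive vm_mem_range entries with no holes, so the copy loops can step to the next range with vmr++ and restart at offset 0. A cut-down sketch of that split copy over a toy two-range map (hypothetical layout and names, not vmd's):

#include <stdio.h>
#include <string.h>

/* Standalone sketch: a toy version of vmd's vm_mem_range walk. */
struct mem_range {
	size_t	 gpa;		/* guest-physical base (paddr_t in vmd) */
	char	*va;		/* where the range is mapped in the host */
	size_t	 size;
};

/*
 * Copy len bytes to guest address dst. vmr must point at the range
 * containing dst, and the ranges that follow must be contiguous and
 * cover [dst, dst + len): exactly what find_gpa_range() verifies.
 */
static void
guest_write(struct mem_range *vmr, size_t dst, const char *from, size_t len)
{
	size_t n, off = dst - vmr->gpa;

	while (len != 0) {
		n = vmr->size - off;	/* bytes left in this range */
		if (len < n)
			n = len;
		memcpy(vmr->va + off, from, n);
		from += n;
		len -= n;
		off = 0;		/* later ranges start at offset 0 */
		vmr++;
	}
}

int
main(void)
{
	char lo[8], hi[8];
	struct mem_range map[2] = {
		{ 0x1000, lo, sizeof(lo) },	/* [0x1000, 0x1008) */
		{ 0x1008, hi, sizeof(hi) },	/* [0x1008, 0x1010) */
	};

	/* A 10-byte write at 0x1004 straddles both ranges. */
	guest_write(map, 0x1004, "ABCDEFGHIJ", 10);
	printf("lo+4=%.4s hi=%.6s\n", lo + 4, hi);	/* ABCD EFGHIJ */
	return (0);
}

A destination the map does not fully cover is what the EINVAL path in write_mem() and read_mem() rejects; the sketch omits that check and trusts its caller.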
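
vcpu_exit_pci() dispatches on PCI configuration mechanism #1: the guest writes a bus/device/function/register selector to PCI_MODE1_ADDRESS_REG (port 0xcf8) and then transfers data through PCI_MODE1_DATA_REG (port 0xcfc). A small decoder for the standard selector layout (illustrative only; of the names below, only the port macros come from the code above):

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch: decode a PCI mode 1 config address (port 0xcf8). */
struct pci_sel {
	unsigned	enable;	/* bit 31: configuration cycle enable */
	unsigned	bus;	/* bits 23:16 */
	unsigned	dev;	/* bits 15:11 */
	unsigned	func;	/* bits 10:8 */
	unsigned	reg;	/* bits 7:2, dword-aligned byte offset */
};

static struct pci_sel
pci_decode_sel(uint32_t v)
{
	struct pci_sel s;

	s.enable = (v >> 31) & 0x1;
	s.bus = (v >> 16) & 0xff;
	s.dev = (v >> 11) & 0x1f;
	s.func = (v >> 8) & 0x7;
	s.reg = v & 0xfc;
	return (s);
}

int
main(void)
{
	/* Select bus 0, device 3, function 0, register 0x10 (BAR0). */
	uint32_t v = (1U << 31) | (0 << 16) | (3 << 11) | (0 << 8) | 0x10;
	struct pci_sel s = pci_decode_sel(v);

	printf("enable=%u bus=%u dev=%u func=%u reg=0x%02x\n",
	    s.enable, s.bus, s.dev, s.func, s.reg);
	return (0);
}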