/*	$OpenBSD: vmm.c,v 1.26 2016/04/07 05:51:26 guenther Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * The names of the bracketed system headers were lost in extraction; the
 * list below restores the headers this file's code requires.
 */
#include <sys/param.h>	/* nitems */
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>

#include <dev/ic/comreg.h>
#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/param.h>
#include <machine/psl.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "vmd.h"
#include "loadfile.h"
#include "pci.h"
#include "virtio.h"
#include "proc.h"

#define MAX_PORTS 65536

/*
 * Emulated 8250 UART
 */
#define COM1_DATA	0x3f8
#define COM1_IER	0x3f9
#define COM1_IIR	0x3fa
#define COM1_LCR	0x3fb
#define COM1_MCR	0x3fc
#define COM1_LSR	0x3fd
#define COM1_MSR	0x3fe
#define COM1_SCR	0x3ff

/*
 * Emulated i8253 PIT (counter)
 */
#define TIMER_BASE	0x40
#define TIMER_CTRL	0x43	/* 8253 Timer #1 */
#define NS_PER_TICK	(1000000000 / TIMER_FREQ)

/* i8253 registers */
struct i8253_counter {
	struct timeval tv;	/* timer start time */
	uint16_t start;		/* starting value */
	uint16_t olatch;	/* output latch */
	uint16_t ilatch;	/* input latch */
	uint8_t last_r;		/* last read byte (MSB/LSB) */
	uint8_t last_w;		/* last written byte (MSB/LSB) */
};

/* ns8250 UART registers */
struct ns8250_regs {
	uint8_t lcr;		/* Line Control Register */
	uint8_t fcr;		/* FIFO Control Register */
	uint8_t iir;		/* Interrupt ID Register */
	uint8_t ier;		/* Interrupt Enable Register */
	uint8_t divlo;		/* Baud rate divisor low byte */
	uint8_t divhi;		/* Baud rate divisor high byte */
	uint8_t msr;		/* Modem Status Register */
	uint8_t lsr;		/* Line Status Register */
	uint8_t mcr;		/* Modem Control Register */
	uint8_t scr;		/* Scratch Register */
	uint8_t data;		/* Unread input data */
};

typedef uint8_t (*io_fn_t)(struct vm_run_params *);

struct i8253_counter i8253_counter[3];
struct ns8250_regs com1_regs;
io_fn_t ioports_map[MAX_PORTS];

int start_client_vmd(void);
int opentap(void);
int start_vm(struct imsg *, uint32_t *);
int terminate_vm(struct vm_terminate_params *);
int get_info_vm(struct privsep *, struct imsg *, int);
int run_vm(int *, int *, struct vm_create_params *, struct vcpu_init_state *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_init_state *);
void create_memory_map(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vm_create_params *, int *, int *);
void vcpu_exit_inout(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
uint8_t vcpu_exit_i8253(struct vm_run_params *);
uint8_t vcpu_exit_com(struct vm_run_params *);
void vcpu_process_com_data(union vm_exit *);
void vcpu_process_com_lcr(union vm_exit *);
void vcpu_process_com_lsr(union vm_exit *);
void vcpu_process_com_ier(union vm_exit *);
void vcpu_process_com_mcr(union vm_exit *);
void vcpu_process_com_iir(union vm_exit *);
void vcpu_process_com_msr(union vm_exit *);
void vcpu_process_com_scr(union vm_exit *);
int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
void vmm_run(struct privsep *, struct privsep_proc *, void *);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

static struct privsep_proc procs[] = {
	{ "parent",	PROC_PARENT,	vmm_dispatch_parent },
};

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 32 bit address space, before paging is enabled.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 * features of the CPU in use.
 */
static const struct vcpu_init_state vcpu_init_flat32 = {
	0x2,					/* RFLAGS */
	0x0,					/* RIP */
	0x0,					/* RSP */
	CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,	/* CR0 */
	PML4_PAGE,				/* CR3 */
	{ 0x8, 0xFFFFFFFF, 0xC09F, 0x0},	/* CS */
	{ 0x10, 0xFFFFFFFF, 0xC093, 0x0},	/* DS */
	{ 0x10, 0xFFFFFFFF, 0xC093, 0x0},	/* ES */
	{ 0x10, 0xFFFFFFFF, 0xC093, 0x0},	/* FS */
	{ 0x10, 0xFFFFFFFF, 0xC093, 0x0},	/* GS */
	{ 0x10, 0xFFFFFFFF, 0xC093, 0x0},	/* SS */
	{ 0x0, 0xFFFF, 0x0, 0x0},		/* GDTR */
	{ 0x0, 0xFFFF, 0x0, 0x0},		/* IDTR */
	{ 0x0, 0xFFFF, 0x0082, 0x0},		/* LDTR */
	{ 0x0, 0xFFFF, 0x008B, 0x0},		/* TR */
};

pid_t
vmm(struct privsep *ps, struct privsep_proc *p)
{
	return (proc_run(ps, p, procs, nitems(procs), vmm_run, NULL));
}

void
vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg)
{
	if (config_init(ps->ps_env) == -1)
		fatal("failed to initialize configuration");

#if 0
	/*
	 * pledge in the vmm process:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc - for forking and maintaining vms.
	 * recvfd - for disks, interfaces and other fds.
	 */
	/* XXX'ed pledge to hide it from grep as long as it's disabled */
	if (XXX("stdio vmm recvfd proc", NULL) == -1)
		fatal("pledge");
#endif

	/* Get and terminate all running VMs */
	get_info_vm(ps, NULL, 1);
}

int
vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep *ps = p->p_ps;
	int res = 0, cmd = 0;
	struct vm_create_params vcp;
	struct vm_terminate_params vtp;
	struct vmop_result vmr;
	uint32_t id = 0;
	struct vmd_vm *vm;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vcp);
		memcpy(&vcp, imsg->data, sizeof(vcp));
		res = config_getvm(ps, &vcp, imsg->fd, imsg->hdr.peerid);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_DISK:
		res = config_getdisk(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_IF:
		res = config_getif(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_END:
		res = start_vm(imsg, &id);
		cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vtp);
		memcpy(&vtp, imsg->data, sizeof(vtp));
		id = vtp.vtp_vm_id;
		res = terminate_vm(&vtp);
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
		if (res == 0) {
			/* Remove local reference */
			vm = vm_getbyid(id);
			vm_remove(vm);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		res = get_info_vm(ps, imsg, 0);
		cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA;
		break;
	case IMSG_CTL_RESET:
		config_getreset(env, imsg);
		break;
	default:
		return (-1);
	}

	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
		if (res != 0) {
			vm = vm_getbyvmid(imsg->hdr.peerid);
			vm_remove(vm);
		}
		/* FALLTHROUGH: both responses carry a vmop_result */
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vis: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_init_state *vis)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vis, sizeof(struct vcpu_init_state));

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
		return (errno);

	return (0);
}

/*
 * terminate_vm
 *
 * Requests vmm(4) to terminate the VM whose ID is provided in the
 * supplied vm_terminate_params structure (vtp->vtp_vm_id)
 *
 * Parameters
 *  vtp: vm_terminate_params struct containing the ID of the VM to terminate
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not
 *      valid)
 */
int
terminate_vm(struct vm_terminate_params *vtp)
{
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) < 0)
		return (errno);

	return (0);
}

/*
 * opentap
 *
 * Opens the next available tap device, up to MAX_TAP.
 *
 * Returns a file descriptor to the tap node opened, or -1 if no tap
 * devices were available.
 */
int
opentap(void)
{
	int i, fd;
	char path[PATH_MAX];

	for (i = 0; i < MAX_TAP; i++) {
		snprintf(path, PATH_MAX, "/dev/tap%d", i);
		fd = open(path, O_RDWR | O_NONBLOCK);
		if (fd != -1)
			return (fd);
	}

	return (-1);
}

/*
 * start_vm
 *
 * Starts a new VM with the creation parameters supplied (in the incoming
 * imsg->data field). This function performs a basic sanity check on the
 * incoming parameters and then performs the following steps to complete
 * the creation of the VM:
 *
 * 1. opens the VM disk image files specified in the VM creation parameters
 * 2. opens the specified VM kernel
 * 3. creates a VM console tty pair using openpty
 * 4. forks, passing the file descriptors opened in steps 1-3 to the child
 *    vmd responsible for dropping privilege and running the VM's VCPU
 *    loops.
 *
 * Parameters:
 *  imsg: The incoming imsg body whose 'data' field is a vm_create_params
 *      struct containing the VM creation parameters.
 *  id: Returns the VM id as reported by the kernel.
 *
 * Return values:
 *  0: success
 *  !0 : failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct imsg *imsg, uint32_t *id)
{
	struct vm_create_params *vcp;
	struct vmd_vm *vm;
	size_t i;
	int ret = EINVAL;
	int fds[2];
	struct vcpu_init_state vis;

	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
		log_warn("%s: can't find vm", __func__);
		ret = ENOENT;
		goto err;
	}
	vcp = &vm->vm_params;

	if ((vm->vm_tty = imsg->fd) == -1) {
		log_warn("%s: can't get tty", __func__);
		goto err;
	}

	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
		fatal("socketpair");

	/* Start child vmd for this VM (fork, chroot, drop privs) */
	ret = start_client_vmd();

	/* Start child failed? - cleanup and leave */
	if (ret == -1) {
		log_warn("%s: start child failed", __func__);
		ret = EIO;
		goto err;
	}

	if (ret > 0) {
		/* Parent */
		for (i = 0; i < vcp->vcp_ndisks; i++) {
			close(vm->vm_disks[i]);
			vm->vm_disks[i] = -1;
		}

		for (i = 0; i < vcp->vcp_nnics; i++) {
			close(vm->vm_ifs[i]);
			vm->vm_ifs[i] = -1;
		}

		close(vm->vm_kernel);
		vm->vm_kernel = -1;

		close(vm->vm_tty);
		vm->vm_tty = -1;

		/* read back the kernel-generated vm id from the child */
		close(fds[1]);
		if (read(fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
		    sizeof(vcp->vcp_id))
			fatal("read vcp id");
		close(fds[0]);

		if (vcp->vcp_id == 0)
			goto err;

		*id = vcp->vcp_id;

		return (0);
	} else {
		/* Child */
		setproctitle(vcp->vcp_name);
		log_procinit(vcp->vcp_name);

		create_memory_map(vcp);
		ret = vmm_create_vm(vcp);
		current_vm = vm;

		/* send back the kernel-generated vm id (0 on error) */
		close(fds[0]);
		if (write(fds[1], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
		    sizeof(vcp->vcp_id))
			fatal("write vcp id");
		close(fds[1]);

		if (ret) {
			errno = ret;
			fatal("create vmm ioctl failed - exiting");
		}

#if 0
		/*
		 * pledge in the vm processes:
		 * stdio - for malloc and basic I/O including events.
		 * vmm - for the vmm ioctls and operations.
		 */
		if (XXX("stdio vmm", NULL) == -1)
			fatal("pledge");
#endif

		/*
		 * Set up default "flat 32 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vis, &vcpu_init_flat32,
		    sizeof(struct vcpu_init_state));

		/* Load kernel image */
		ret = loadelf_main(vm->vm_kernel, vcp, &vis);
		if (ret) {
			errno = ret;
			fatal("failed to load kernel - exiting");
		}

		close(vm->vm_kernel);

		con_fd = vm->vm_tty;
		if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
			fatal("failed to set nonblocking mode on console");

		/* Execute the vcpu run loop(s) for this VM */
		ret = run_vm(vm->vm_disks, vm->vm_ifs, vcp, &vis);

		_exit(ret != 0);
	}

	return (0);

err:
	vm_remove(vm);

	return (ret);
}

/*
 * get_info_vm
 *
 * Returns a list of VMs known to vmm(4).
 *
 * Parameters:
 *  ps: the privsep context.
 *  imsg: the received imsg including the peer id.
 *  terminate: terminate the listed vm.
 *
 * Return values:
 *  0: success
 *  !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
 */
int
get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
{
	int ret;
	size_t ct, i;
	struct vm_info_params vip;
	struct vm_info_result *info;
	struct vm_terminate_params vtp;
	struct vmop_info_result vir;

	/*
	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
	 * buffer size of 0, which results in vmm(4) returning the
	 * number of bytes required back to us in vip.vip_size,
	 * and then we call it again after malloc'ing the required
	 * number of bytes.
	 *
	 * It is possible that we could fail the second time (eg, if
	 * another VM was created in the instant between the two
	 * ioctls), but in that case the caller can just try again,
	 * as vmm(4) will return a zero-sized list.
	 */
	vip.vip_size = 0;
	info = NULL;
	ret = 0;
	memset(&vir, 0, sizeof(vir));

	/* First ioctl to see how many bytes needed (vip.vip_size) */
	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0)
		return (errno);

	if (vip.vip_info_ct != 0)
		return (EIO);

	info = malloc(vip.vip_size);
	if (info == NULL)
		return (ENOMEM);

	/* Second ioctl to get the actual list */
	vip.vip_info = info;
	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0) {
		ret = errno;
		free(info);
		return (ret);
	}

	/* Return info */
	ct = vip.vip_size / sizeof(struct vm_info_result);
	for (i = 0; i < ct; i++) {
		if (terminate) {
			vtp.vtp_vm_id = info[i].vir_id;
			if ((ret = terminate_vm(&vtp)) != 0)
				return (ret);
			log_debug("%s: terminated id %d", info[i].vir_name,
			    info[i].vir_id);
			continue;
		}
		memcpy(&vir.vir_info, &info[i], sizeof(vir.vir_info));
		if (proc_compose_imsg(ps, PROC_PARENT, -1,
		    IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, -1,
		    &vir, sizeof(vir)) == -1)
			return (EIO);
	}
	free(info);

	return (0);
}

/*
 * start_client_vmd
 *
 * forks a copy of the parent vmd, chroots to VMD_USER's home, drops
 * privileges (changes to user VMD_USER), and returns.
 * Should the fork operation succeed, but later chroot/privsep
 * fail, the child exits.
 *
 * Return values (returns to both child and parent on success):
 *  -1: failure
 *  0: success, when returning to the child vmd
 *  !0: success, when returning to the parent vmd; the value is the
 *      child's pid
 */
int
start_client_vmd(void)
{
	int child_pid;

	child_pid = fork();
	if (child_pid < 0)
		return (-1);

	if (!child_pid) {
		/* child, already running without privileges */
		return (0);
	}

	/* Parent */
	return (child_pid);
}
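/*
 * Guest physical memory layout set up by create_memory_map() below:
 *
 *  0x0 - LOWMEM_KB*1024                  "DOS" low memory
 *  LOWMEM_KB*1024 - 0x100000             hole normally used for ROMs and
 *                                        video RAM, backed with guest
 *                                        memory so writes (eg, to 0xb8000)
 *                                        are permitted
 *  0x100000 - at most VMM_PCI_MMIO_BAR_BASE
 *                                        conventional memory
 *  VMM_PCI_MMIO_BAR_END+1 and up         remainder of the requested memory,
 *                                        if any, placed above the PCI MMIO
 *                                        window
 */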
/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t mem_mb;
	uint64_t mem_bytes, len;

	mem_mb = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
		return;

	mem_bytes = (uint64_t)mem_mb * 1024 * 1024;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = LOWMEM_KB * 1024;
	mem_bytes -= LOWMEM_KB * 1024;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * XXX Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (video RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = 0x100000 - LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* Make sure that we do not place physical memory into MMIO ranges. */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
	else
		len = mem_bytes;

	/* Third memory region: 1MB - (1MB + len) */
	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
	vcp->vcp_memranges[2].vmr_size = len;
	mem_bytes -= len;

	if (mem_bytes > 0) {
		/* Fourth memory region for the remaining memory (if any) */
		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
		vcp->vcp_memranges[3].vmr_size = mem_bytes;
		vcp->vcp_nmemranges = 4;
	} else
		vcp->vcp_nmemranges = 3;
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
    int *child_taps)
{
	int i;

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init the i8253 PIT's 3 counters */
	memset(&i8253_counter, 0, sizeof(struct i8253_counter) * 3);
	gettimeofday(&i8253_counter[0].tv, NULL);
	gettimeofday(&i8253_counter[1].tv, NULL);
	gettimeofday(&i8253_counter[2].tv, NULL);
	i8253_counter[0].start = TIMER_DIV(100);
	i8253_counter[1].start = TIMER_DIV(100);
	i8253_counter[2].start = TIMER_DIV(100);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init ns8250 UART */
	memset(&com1_regs, 0, sizeof(struct ns8250_regs));
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(vcp, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *  vis: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0 : the VM exited abnormally or failed to start
 */
int
run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp,
    struct vcpu_init_state *vis)
{
	size_t i;
	int ret;
	pthread_t *tid;
	void *exit_status;
	struct vm_run_params **vrp;
	struct vm_terminate_params vtp;

	ret = 0;

	/* XXX cap vcp_ncpus to avoid overflow here */
	/*
	 * XXX ensure nvcpus in vcp is same as vm, or fix vmm to return einval
	 * on bad vcpu id
	 */
	tid = malloc(sizeof(pthread_t) * vcp->vcp_ncpus);
	vrp = malloc(sizeof(struct vm_run_params *) * vcp->vcp_ncpus);

	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	init_emulated_hw(vcp, child_disks, child_taps);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip free'ing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip free'ing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vis)) {
			log_warn("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			return (ret);
		}
	}

	/* Wait for all the threads to exit */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		if (pthread_join(tid[i], &exit_status)) {
			log_warn("%s: failed to join thread %zd - "
			    "exiting", __progname, i);
			return (EIO);
		}

		if (exit_status != NULL) {
			log_warnx("%s: vm %d vcpu run thread %zd exited "
			    "abnormally", __progname, vcp->vcp_id, i);
			/* Terminate the VM if we can */
			memset(&vtp, 0, sizeof(vtp));
			vtp.vtp_vm_id = vcp->vcp_id;
			if (terminate_vm(&vtp)) {
				log_warnx("%s: could not terminate vm %d",
				    __progname, vcp->vcp_id);
			}
			ret = EIO;
		}
	}

	return (ret);
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret;

	vrp->vrp_continue = 0;
	vrp->vrp_injint = -1;

	for (;;) {
		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
			/* If run ioctl failed, exit */
			ret = errno;
			return ((void *)ret);
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED)
			return (NULL);

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			if (vcpu_exit(vrp))
				return ((void *)EIO);
		}
	}

	return (NULL);
}

/*
 * vcpu_exit_i8253
 *
 * Handles emulated i8253 PIT access (in/out instruction to PIT ports).
 * We don't emulate all the modes of the i8253, just the basic squarewave
 * clock.
 *
 * Parameters:
 *  vrp: vm run parameters containing exit information for the I/O
 *      instruction being performed
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_i8253(struct vm_run_params *vrp)
{
	uint32_t out_data;
	uint8_t sel, rw, data;
	uint64_t ns, ticks;
	struct timeval now, delta;
	union vm_exit *vei = vrp->vrp_exit;

	if (vei->vei.vei_port == TIMER_CTRL) {
		if (vei->vei.vei_dir == 0) { /* OUT instruction */
			out_data = vei->vei.vei_data;
			sel = out_data &
			    (TIMER_SEL0 | TIMER_SEL1 | TIMER_SEL2);
			sel = sel >> 6;
			if (sel > 2) {
				log_warnx("%s: i8253 PIT: invalid "
				    "timer selected (%d)",
				    __progname, sel);
				goto ret;
			}

			rw = vei->vei.vei_data &
			    (TIMER_LATCH | TIMER_16BIT);

			if ((rw & TIMER_16BIT) == TIMER_LSB ||
			    (rw & TIMER_16BIT) == TIMER_MSB) {
				log_warnx("%s: i8253 PIT: invalid timer mode "
				    "0x%x selected", __func__,
				    (rw & TIMER_16BIT));
			}

			/*
			 * Since we don't truly emulate each tick of the PIT
			 * clock, when the guest asks for the timer to be
			 * latched, simulate what the counter would have been
			 * had we performed full emulation. We do this by
			 * calculating when the counter was reset vs how much
			 * time has elapsed, then bias by the counter tick
			 * rate.
			 */
			if (rw == TIMER_LATCH) {
				gettimeofday(&now, NULL);
				delta.tv_sec = now.tv_sec -
				    i8253_counter[sel].tv.tv_sec;
				delta.tv_usec = now.tv_usec -
				    i8253_counter[sel].tv.tv_usec;
				if (delta.tv_usec < 0) {
					delta.tv_sec--;
					delta.tv_usec += 1000000;
				}
				if (delta.tv_usec > 1000000) {
					delta.tv_sec++;
					delta.tv_usec -= 1000000;
				}
				ns = delta.tv_usec * 1000 +
				    delta.tv_sec * 1000000000;
				ticks = ns / NS_PER_TICK;
				i8253_counter[sel].olatch =
				    i8253_counter[sel].start -
				    ticks % i8253_counter[sel].start;
				goto ret;
			}

			goto ret;
		} else {
			/* XXX should this return 0xff as the data read? */
			log_warnx("%s: i8253 PIT: read from control "
			    "port unsupported", __progname);
		}
	} else {
		sel = vei->vei.vei_port - (TIMER_CNTR0 + TIMER_BASE);
		if (vei->vei.vei_dir == 0) { /* OUT instruction */
			if (i8253_counter[sel].last_w == 0) {
				out_data = vei->vei.vei_data;
				i8253_counter[sel].ilatch |= (out_data << 8);
				i8253_counter[sel].last_w = 1;
			} else {
				out_data = vei->vei.vei_data;
				i8253_counter[sel].ilatch |= out_data;
				i8253_counter[sel].start =
				    i8253_counter[sel].ilatch;
				i8253_counter[sel].last_w = 0;
			}
		} else {
			/*
			 * IN instruction: return the latched count one byte
			 * at a time, tracked by last_r (the original set
			 * last_w here, so last_r was checked but never
			 * updated).
			 */
			if (i8253_counter[sel].last_r == 0) {
				data = i8253_counter[sel].olatch >> 8;
				vei->vei.vei_data = data;
				i8253_counter[sel].last_r = 1;
			} else {
				data = i8253_counter[sel].olatch & 0xFF;
				vei->vei.vei_data = data;
				i8253_counter[sel].last_r = 0;
			}
		}
	}

ret:
	/* XXX don't yet support interrupts generated from the 8253 */
	return (0xFF);
}

/*
 * vcpu_process_com_data
 *
 * Emulate in/out instructions to the com1 (ns8250) UART data register
 *
 * Parameters:
 *  vei: vm exit information from vmm(4) containing information on the in/out
 *      instruction being performed
 */
void
vcpu_process_com_data(union vm_exit *vei)
{
	/*
	 * vei_dir == 0 : out instruction
	 *
	 * The guest wrote to the data register. Since we are emulating a
	 * no-fifo chip, write the character immediately to the pty and
	 * assert TXRDY in IIR (if the guest has requested TXRDY interrupt
	 * reporting)
	 */
	if (vei->vei.vei_dir == 0) {
		write(con_fd, &vei->vei.vei_data, 1);
		if (com1_regs.ier & 0x2) {
			/* Set TXRDY */
			com1_regs.iir |= IIR_TXRDY;
			/* Set "interrupt pending" (IIR low bit cleared) */
			com1_regs.iir &= ~0x1;
		}
	} else {
		/*
		 * vei_dir == 1 : in instruction
		 *
		 * The guest read from the data register. Check to see if
		 * there is data available (RXRDY) and if so, consume the
		 * input data and return to the guest. Also clear the
		 * interrupt info register regardless.
		 */
		if (com1_regs.lsr & LSR_RXRDY) {
			vei->vei.vei_data = com1_regs.data;
			com1_regs.data = 0x0;
			com1_regs.lsr &= ~LSR_RXRDY;
		} else {
			/* XXX should this be com1_regs.data or 0xff? */
			vei->vei.vei_data = com1_regs.data;
			log_warnx("guest reading com1 when not ready");
		}

		/* Reading the data register always clears RXRDY from IIR */
		com1_regs.iir &= ~IIR_RXRDY;

		/*
		 * Clear "interrupt pending" by setting IIR low bit to 1
		 * if no interrupts are pending
		 */
		if (com1_regs.iir == 0x0)
			com1_regs.iir = 0x1;
	}
}

/*
 * vcpu_process_com_lcr
 *
 * Emulate in/out instructions to the com1 (ns8250) UART line control register
 *
 * Parameters:
 *  vei: vm exit information from vmm(4) containing information on the in/out
 *      instruction being performed
 */
void
vcpu_process_com_lcr(union vm_exit *vei)
{
	/*
	 * vei_dir == 0 : out instruction
	 *
	 * Write content to line control register
	 */
	if (vei->vei.vei_dir == 0) {
		com1_regs.lcr = (uint8_t)vei->vei.vei_data;
	} else {
		/*
		 * vei_dir == 1 : in instruction
		 *
		 * Read line control register
		 */
		vei->vei.vei_data = com1_regs.lcr;
	}
}

/*
 * vcpu_process_com_iir
 *
 * Emulate in/out instructions to the com1 (ns8250) UART interrupt information
 * register. Note that writes to this register actually go to a different
 * register, the FCR (FIFO control register), which we don't emulate but whose
 * data we still consume.
 *
 * Parameters:
 *  vei: vm exit information from vmm(4) containing information on the in/out
 *      instruction being performed
 */
void
vcpu_process_com_iir(union vm_exit *vei)
{
	/*
	 * vei_dir == 0 : out instruction
	 *
	 * Write to FCR
	 */
	if (vei->vei.vei_dir == 0) {
		com1_regs.fcr = vei->vei.vei_data;
	} else {
		/*
		 * vei_dir == 1 : in instruction
		 *
		 * Read IIR. Reading the IIR resets the TXRDY bit in the IIR
		 * after the data is read.
		 */
		vei->vei.vei_data = com1_regs.iir;
		com1_regs.iir &= ~IIR_TXRDY;

		/*
		 * Clear "interrupt pending" by setting IIR low bit to 1
		 * if no interrupts are pending
		 */
		if (com1_regs.iir == 0x0)
			com1_regs.iir = 0x1;
	}
}

/*
 * vcpu_process_com_mcr
 *
 * Emulate in/out instructions to the com1 (ns8250) UART modem control
 * register.
 *
 * Parameters:
 *  vei: vm exit information from vmm(4) containing information on the in/out
 *      instruction being performed
 */
void
vcpu_process_com_mcr(union vm_exit *vei)
{
	/*
	 * vei_dir == 0 : out instruction
	 *
	 * Write to MCR
	 */
	if (vei->vei.vei_dir == 0) {
		com1_regs.mcr = vei->vei.vei_data;
	} else {
		/*
		 * vei_dir == 1 : in instruction
		 *
		 * Read from MCR
		 */
		vei->vei.vei_data = com1_regs.mcr;
	}
}

/*
 * vcpu_process_com_lsr
 *
 * Emulate in/out instructions to the com1 (ns8250) UART line status register.
 *
 * Parameters:
 *  vei: vm exit information from vmm(4) containing information on the in/out
 *      instruction being performed
 */
void
vcpu_process_com_lsr(union vm_exit *vei)
{
	/*
	 * vei_dir == 0 : out instruction
	 *
	 * Write to LSR. This is an illegal operation, so we just log it and
	 * continue.
	 */
	if (vei->vei.vei_dir == 0) {
		log_warnx("%s: LSR UART write 0x%x unsupported",
		    __progname, vei->vei.vei_data);
	} else {
		/*
		 * vei_dir == 1 : in instruction
		 *
		 * Read from LSR. We always report TXRDY and TSRE since we
		 * can process output characters immediately (at any time).
		 */
		vei->vei.vei_data = com1_regs.lsr | LSR_TSRE | LSR_TXRDY;
	}
}

/*
 * vcpu_process_com_msr
 *
 * Emulate in/out instructions to the com1 (ns8250) UART modem status register.
 *
 * Parameters:
 *  vei: vm exit information from vmm(4) containing information on the in/out
 *      instruction being performed
 */
void
vcpu_process_com_msr(union vm_exit *vei)
{
	/*
	 * vei_dir == 0 : out instruction
	 *
	 * Write to MSR. This is an illegal operation, so we just log it and
	 * continue.
	 */
	if (vei->vei.vei_dir == 0) {
		log_warnx("%s: MSR UART write 0x%x unsupported",
		    __progname, vei->vei.vei_data);
	} else {
		/*
		 * vei_dir == 1 : in instruction
		 *
		 * Read from MSR. We always report DCD, DSR, and CTS.
		 * (The original returned com1_regs.lsr here; use the MSR.)
		 */
		vei->vei.vei_data = com1_regs.msr | MSR_DCD | MSR_DSR |
		    MSR_CTS;
	}
}

/*
 * vcpu_process_com_scr
 *
 * Emulate in/out instructions to the com1 (ns8250) UART scratch register. The
 * scratch register is sometimes used to distinguish an 8250 from a 16450,
 * and/or used to distinguish submodels of the 8250 (eg 8250A, 8250B). We
 * simulate an "original" 8250 by forcing the scratch register to return data
 * on read that is different from what was written.
 *
 * Parameters:
 *  vei: vm exit information from vmm(4) containing information on the in/out
 *      instruction being performed
 */
void
vcpu_process_com_scr(union vm_exit *vei)
{
	/*
	 * vei_dir == 0 : out instruction
	 *
	 * Write to SCR
	 */
	if (vei->vei.vei_dir == 0) {
		com1_regs.scr = vei->vei.vei_data;
	} else {
		/*
		 * vei_dir == 1 : in instruction
		 *
		 * Read from SCR. To make sure we don't accidentally simulate
		 * a real scratch register, we complement what was written on
		 * subsequent readback.
		 */
		vei->vei.vei_data = ~com1_regs.scr;
	}
}

/*
 * vcpu_process_com_ier
 *
 * Emulate in/out instructions to the com1 (ns8250) UART interrupt enable
 * register.
 *
 * Parameters:
 *  vei: vm exit information from vmm(4) containing information on the in/out
 *      instruction being performed
 */
void
vcpu_process_com_ier(union vm_exit *vei)
{
	/*
	 * vei_dir == 0 : out instruction
	 *
	 * Write to IER
	 */
	if (vei->vei.vei_dir == 0) {
		com1_regs.ier = vei->vei.vei_data;
	} else {
		/*
		 * vei_dir == 1 : in instruction
		 *
		 * Read from IER
		 */
		vei->vei.vei_data = com1_regs.ier;
	}
}

/*
 * vcpu_exit_com
 *
 * Process com1 (ns8250) UART exits. vmd handles most basic 8250
 * features with the exception of the divisor latch (eg, no baud
 * rate support)
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_com(struct vm_run_params *vrp)
{
	union vm_exit *vei = vrp->vrp_exit;

	switch (vei->vei.vei_port) {
	case COM1_LCR:
		vcpu_process_com_lcr(vei);
		break;
	case COM1_IER:
		vcpu_process_com_ier(vei);
		break;
	case COM1_IIR:
		vcpu_process_com_iir(vei);
		break;
	case COM1_MCR:
		vcpu_process_com_mcr(vei);
		break;
	case COM1_LSR:
		vcpu_process_com_lsr(vei);
		break;
	case COM1_MSR:
		vcpu_process_com_msr(vei);
		break;
	case COM1_SCR:
		vcpu_process_com_scr(vei);
		break;
	case COM1_DATA:
		vcpu_process_com_data(vei);
		break;
	}

	return (0xFF);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	union vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT and the com1 ns8250 UART.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	union vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == 1)
		vei->vei.vei_data = 0xFFFFFFFF;

	if (intr != 0xFF)
		vrp->vrp_injint = intr;
	else
		vrp->vrp_injint = -1;
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and sets vrp->vrp_injint
 * to the IRQ line whose interrupt should be vectored (or -1 if no interrupt
 * is to be injected).
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (exit not handled)
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	ssize_t sz;
	char ch;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_IO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
		/*
		 * XXX handle halted state, no reason to run this vcpu again
		 * until a vm interrupt is to be injected
		 */
		break;
	default:
		log_warnx("%s: unknown exit reason %d",
		    __progname, vrp->vrp_exit_reason);
		return (1);
	}

	/* XXX interrupt priority */
	if (vionet_process_rx())
		vrp->vrp_injint = 9;

	/*
	 * Is there a new character available on com1?
	 * If so, consume the character, buffer it into the com1 data register
	 * assert IRQ4, and set the line status register RXRDY bit.
	 *
	 * XXX - move all this com intr checking to another function
	 */
	sz = read(con_fd, &ch, sizeof(char));
	if (sz == 1) {
		com1_regs.lsr |= LSR_RXRDY;
		com1_regs.data = ch;
		/* XXX these ier and iir bits should be IER_x and IIR_x */
		if (com1_regs.ier & 0x1) {
			com1_regs.iir |= (2 << 1);
			com1_regs.iir &= ~0x1;
		}
	}

	/*
	 * Clear "interrupt pending" by setting IIR low bit to 1 if no
	 * interrupts are pending
	 */
	/* XXX these iir magic numbers should be IIR_x */
	if ((com1_regs.iir & ~0x1) == 0x0)
		com1_regs.iir = 0x1;

	/* If pending interrupt and nothing waiting to be injected, inject */
	if ((com1_regs.iir & 0x1) == 0)
		if (vrp->vrp_injint == -1)
			vrp->vrp_injint = 0x4;

	vrp->vrp_continue = 1;

	return (0);
}
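/*
 * Guest memory accessors. write_mem() and read_mem() below copy data to and
 * from guest physical memory in at most page-sized chunks, splitting requests
 * that cross page boundaries, using the VMM_IOC_WRITEPAGE and
 * VMM_IOC_READPAGE ioctls on the vmm(4) device.
 */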
/*
 * write_mem
 *
 * Pushes data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM to push into.
 *      If there is no guest paddr mapping at 'dst', a new page will be
 *      faulted in by the VMM (provided 'dst' represents a valid paddr
 *      in the guest's address space)
 *  buf: data to push
 *  len: size of 'buf'
 *
 * Return values:
 *  various return values from ioctl(VMM_IOC_WRITEPAGE), or 0 if no error
 *      occurred.
 */
int
write_mem(paddr_t dst, void *buf, size_t len)
{
	char *p = buf;
	size_t n, left;
	paddr_t gpa;
	struct vm_writepage_params vwp;

	left = len;
	for (gpa = dst; gpa < dst + len;
	    gpa = (gpa & ~PAGE_MASK) + PAGE_SIZE) {
		n = left;
		if (n > PAGE_SIZE)
			n = PAGE_SIZE;
		if (n > (PAGE_SIZE - (gpa & PAGE_MASK)))
			n = PAGE_SIZE - (gpa & PAGE_MASK);

		vwp.vwp_paddr = (paddr_t)gpa;
		vwp.vwp_data = p;
		vwp.vwp_vm_id = current_vm->vm_params.vcp_id;
		vwp.vwp_len = n;
		if (ioctl(env->vmd_fd, VMM_IOC_WRITEPAGE, &vwp) < 0) {
			log_warn("writepage ioctl failed @ 0x%lx: "
			    "dst = 0x%lx, len = 0x%zx", gpa, dst, len);
			return (errno);
		}
		left -= n;
		p += n;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: size of 'buf'
 *
 * Return values:
 *  various return values from ioctl(VMM_IOC_READPAGE), or 0 if no error
 *      occurred.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *p = buf;
	size_t n, left;
	paddr_t gpa;
	struct vm_readpage_params vrp;

	left = len;
	for (gpa = src; gpa < src + len;
	    gpa = (gpa & ~PAGE_MASK) + PAGE_SIZE) {
		n = left;
		if (n > PAGE_SIZE)
			n = PAGE_SIZE;
		if (n > (PAGE_SIZE - (gpa & PAGE_MASK)))
			n = PAGE_SIZE - (gpa & PAGE_MASK);

		vrp.vrp_paddr = (paddr_t)gpa;
		vrp.vrp_data = p;
		vrp.vrp_vm_id = current_vm->vm_params.vcp_id;
		vrp.vrp_len = n;
		if (ioctl(env->vmd_fd, VMM_IOC_READPAGE, &vrp) < 0) {
			log_warn("readpage ioctl failed @ 0x%lx: "
			    "src = 0x%lx, len = 0x%zx", gpa, src, len);
			return (errno);
		}
		left -= n;
		p += n;
	}

	return (0);
}