diff options
author | Dave Voutila <dv@cvs.openbsd.org> | 2023-05-13 23:15:29 +0000 |
---|---|---|
committer | Dave Voutila <dv@cvs.openbsd.org> | 2023-05-13 23:15:29 +0000 |
commit | 0450f77e1971ab09e8861ab1cab51909d3e1adbb (patch) | |
tree | 54405ab1af285f1b5301105e31cc0e9c675cda5d /usr.sbin | |
parent | 7698cea1a4c2fa7eff4cc2405547ca04b294213b (diff) |
vmm(4)/vmd(8): switch to anonymous shared mappings.
While splitting out emulated virtio network and block devices into
separate processes, I originally used named mappings via shm_mkstemp(3).
While this functionally achieved the desired result, it had two
unintended consequences:
1) tearing down a vm process and its child processes required
excessive locking as the guest memory was tied into the VFS layer.
2) it was observed by mlarkin@ that actions in other parts of the
VFS layer could cause some of the guest memory to flush to storage,
possibly filling /tmp.
This commit adds a new vmm(4) ioctl dedicated to allowing a process
request the kernel share a mapping of guest memory into its own vm
space. This requires an open fd to /dev/vmm (requiring root) and
both the "vmm" and "proc" pledge(2) promises. In addition, the caller
must know enough about the original memory ranges to reconstruct them
to make the vm's ranges.
Tested with help from Mischa Peters.
ok mlarkin@
Diffstat (limited to 'usr.sbin')
-rw-r--r-- | usr.sbin/vmd/vioblk.c | 25 | ||||
-rw-r--r-- | usr.sbin/vmd/vionet.c | 29 | ||||
-rw-r--r-- | usr.sbin/vmd/virtio.c | 16 | ||||
-rw-r--r-- | usr.sbin/vmd/vm.c | 126 | ||||
-rw-r--r-- | usr.sbin/vmd/vmd.c | 20 | ||||
-rw-r--r-- | usr.sbin/vmd/vmd.h | 13 | ||||
-rw-r--r-- | usr.sbin/vmd/vmm.c | 24 |
7 files changed, 135 insertions, 118 deletions
diff --git a/usr.sbin/vmd/vioblk.c b/usr.sbin/vmd/vioblk.c index 9373a135aa8..33d447a9438 100644 --- a/usr.sbin/vmd/vioblk.c +++ b/usr.sbin/vmd/vioblk.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vioblk.c,v 1.2 2023/04/28 18:52:22 dv Exp $ */ +/* $OpenBSD: vioblk.c,v 1.3 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2023 Dave Voutila <dv@openbsd.org> @@ -58,7 +58,7 @@ disk_type(int type) } __dead void -vioblk_main(int fd) +vioblk_main(int fd, int fd_vmm) { struct virtio_dev dev; struct vioblk_dev *vioblk; @@ -71,8 +71,11 @@ vioblk_main(int fd) log_procinit("vioblk"); - /* stdio - needed for read/write to disk fds and channels to the vm. */ - if (pledge("stdio", NULL) == -1) + /* + * stdio - needed for read/write to disk fds and channels to the vm. + * vmm + proc - needed to create shared vm mappings. + */ + if (pledge("stdio vmm proc", NULL) == -1) fatal("pledge"); /* Receive our virtio_dev, mostly preconfigured. */ @@ -92,8 +95,9 @@ vioblk_main(int fd) vioblk = &dev.vioblk; log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, " - "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd, - dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer); + "async fd = %d, sz = %lld maxfer = %d, vmm fd = %d", __func__, + vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, vioblk->sz, + vioblk->max_xfer, fd_vmm); /* Receive our vm information from the vm process. */ memset(&vm, 0, sizeof(vm)); @@ -108,12 +112,19 @@ vioblk_main(int fd) setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx); /* Now that we have our vm information, we can remap memory. */ - ret = remap_guest_mem(&vm); + ret = remap_guest_mem(&vm, fd_vmm); if (ret) { log_warnx("failed to remap guest memory"); goto fail; } + /* + * We no longer need /dev/vmm access. + */ + close_fd(fd_vmm); + if (pledge("stdio", NULL) == -1) + fatal("pledge2"); + /* Initialize the virtio block abstractions. */ type = vm.vm_params.vmc_disktypes[vioblk->idx]; switch (type) { diff --git a/usr.sbin/vmd/vionet.c b/usr.sbin/vmd/vionet.c index 6ce905fdccf..c16ad2635ea 100644 --- a/usr.sbin/vmd/vionet.c +++ b/usr.sbin/vmd/vionet.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vionet.c,v 1.2 2023/04/28 18:52:22 dv Exp $ */ +/* $OpenBSD: vionet.c,v 1.3 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2023 Dave Voutila <dv@openbsd.org> @@ -61,7 +61,7 @@ static void dev_dispatch_vm(int, short, void *); static void handle_sync_io(int, short, void *); __dead void -vionet_main(int fd) +vionet_main(int fd, int fd_vmm) { struct virtio_dev dev; struct vionet_dev *vionet = NULL; @@ -73,8 +73,11 @@ vionet_main(int fd) log_procinit("vionet"); - /* stdio - needed for read/write to tap fd and channels to the vm. */ - if (pledge("stdio", NULL) == -1) + /* + * stdio - needed for read/write to disk fds and channels to the vm. + * vmm + proc - needed to create shared vm mappings. + */ + if (pledge("stdio vmm proc", NULL) == -1) fatal("pledge"); /* Receive our vionet_dev, mostly preconfigured. */ @@ -92,8 +95,9 @@ vionet_main(int fd) dev.sync_fd = fd; vionet = &dev.vionet; - log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d", - __func__, vionet->data_fd, dev.sync_fd, dev.async_fd); + log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d" + ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd, + dev.async_fd, fd_vmm); /* Receive our vm information from the vm process. */ memset(&vm, 0, sizeof(vm)); @@ -108,9 +112,18 @@ vionet_main(int fd) setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx); /* Now that we have our vm information, we can remap memory. */ - ret = remap_guest_mem(&vm); - if (ret) + ret = remap_guest_mem(&vm, fd_vmm); + if (ret) { + fatal("%s: failed to remap", __func__); goto fail; + } + + /* + * We no longer need /dev/vmm access. + */ + close_fd(fd_vmm); + if (pledge("stdio", NULL) == -1) + fatal("pledge2"); /* If we're restoring hardware, re-initialize virtqueue hva's. */ if (vm.vm_state & VM_STATE_RECEIVED) { diff --git a/usr.sbin/vmd/virtio.c b/usr.sbin/vmd/virtio.c index 92e77b8f834..d29b9e7b883 100644 --- a/usr.sbin/vmd/virtio.c +++ b/usr.sbin/vmd/virtio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: virtio.c,v 1.102 2023/04/27 22:47:27 dv Exp $ */ +/* $OpenBSD: virtio.c,v 1.103 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> @@ -1297,7 +1297,7 @@ virtio_start(struct vmd_vm *vm) static int virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) { - char *nargv[8], num[32], t[2]; + char *nargv[10], num[32], vmm_fd[32], t[2]; pid_t dev_pid; int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0; size_t i, j, data_fds_sz, sz = 0; @@ -1483,6 +1483,8 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) memset(&nargv, 0, sizeof(nargv)); memset(num, 0, sizeof(num)); snprintf(num, sizeof(num), "%d", sync_fds[1]); + memset(vmm_fd, 0, sizeof(vmm_fd)); + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); t[0] = dev->dev_type; t[1] = '\0'; @@ -1492,13 +1494,15 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) nargv[2] = num; nargv[3] = "-t"; nargv[4] = t; - nargv[5] = "-n"; + nargv[5] = "-i"; + nargv[6] = vmm_fd; + nargv[7] = "-n"; if (env->vmd_verbose) { - nargv[6] = "-v"; - nargv[7] = NULL; + nargv[8] = "-v"; + nargv[9] = NULL; } else - nargv[6] = NULL; + nargv[8] = NULL; /* Control resumes in vmd.c:main(). */ execvp(nargv[0], nargv); diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c index d42abb5a834..8ec69d6056e 100644 --- a/usr.sbin/vmd/vm.c +++ b/usr.sbin/vmd/vm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vm.c,v 1.88 2023/04/28 19:46:42 dv Exp $ */ +/* $OpenBSD: vm.c,v 1.89 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> @@ -218,9 +218,10 @@ static const struct vcpu_reg_state vcpu_init_flat16 = { * Primary entrypoint for launching a vm. Does not return. * * fd: file descriptor for communicating with vmm process. + * fd_vmm: file descriptor for communicating with vmm(4) device */ void -vm_main(int fd) +vm_main(int fd, int vmm_fd) { struct vm_create_params *vcp = NULL; struct vmd_vm vm; @@ -241,9 +242,8 @@ vm_main(int fd) * vmm - for the vmm ioctls and operations. * proc exec - fork/exec for launching devices. * recvfd - for vm send/recv and sending fd to devices. - * tmppath/rpath - for shm_mkstemp, ftruncate, unlink */ - if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1) + if (pledge("stdio vmm proc exec recvfd", NULL) == -1) fatal("pledge"); /* Receive our vm configuration. */ @@ -254,13 +254,6 @@ vm_main(int fd) _exit(EIO); } - /* Receive the /dev/vmm fd number. */ - sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd)); - if (sz != sizeof(env->vmd_fd)) { - log_warnx("failed to receive /dev/vmm fd"); - _exit(EIO); - } - /* Update process with the vm name. */ vcp = &vm.vm_params.vmc_params; setproctitle("%s", vcp->vcp_name); @@ -1099,63 +1092,34 @@ int alloc_guest_mem(struct vmd_vm *vm) { void *p; - char *tmp; - int fd, ret = 0; + int ret = 0; size_t i, j; struct vm_create_params *vcp = &vm->vm_params.vmc_params; struct vm_mem_range *vmr; - tmp = calloc(32, sizeof(char)); - if (tmp == NULL) { - ret = errno; - log_warn("%s: calloc", __func__); - return (ret); - } - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); - - vm->vm_nmemfds = vcp->vcp_nmemranges; - for (i = 0; i < vcp->vcp_nmemranges; i++) { vmr = &vcp->vcp_memranges[i]; - fd = shm_mkstemp(tmp); - if (fd < 0) { - ret = errno; - log_warn("%s: shm_mkstemp", __func__); - return (ret); - } - if (ftruncate(fd, vmr->vmr_size) == -1) { - ret = errno; - log_warn("%s: ftruncate", __func__); - goto out; - } - if (fcntl(fd, F_SETFD, 0) == -1) { - ret = errno; - log_warn("%s: fcntl", __func__); - goto out; - } - if (shm_unlink(tmp) == -1) { - ret = errno; - log_warn("%s: shm_unlink", __func__); - goto out; - } - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); - + /* + * We only need R/W as userland. vmm(4) will use R/W/X in its + * mapping. + * + * We must use MAP_SHARED so emulated devices will be able + * to generate shared mappings. + */ p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_CONCEAL, fd, 0); + MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0); if (p == MAP_FAILED) { ret = errno; for (j = 0; j < i; j++) { vmr = &vcp->vcp_memranges[j]; munmap((void *)vmr->vmr_va, vmr->vmr_size); } - goto out; + return (ret); } - vm->vm_memfds[i] = fd; vmr->vmr_va = (vaddr_t)p; } -out: - free(tmp); + return (ret); } @@ -2552,10 +2516,11 @@ vm_pipe_recv(struct vm_dev_pipe *p) * Returns 0 on success, non-zero in event of failure. */ int -remap_guest_mem(struct vmd_vm *vm) +remap_guest_mem(struct vmd_vm *vm, int vmm_fd) { struct vm_create_params *vcp; struct vm_mem_range *vmr; + struct vm_sharemem_params vsp; size_t i, j; void *p = NULL; int ret; @@ -2566,23 +2531,32 @@ remap_guest_mem(struct vmd_vm *vm) vcp = &vm->vm_params.vmc_params; /* - * We've execve'd, so we need to re-map the guest VM memory. Iterate - * over all possible vm_mem_range entries so we can initialize all - * file descriptors to a value. + * Initialize our VM shared memory request using our original + * creation parameters. We'll overwrite the va's after mmap(2). + */ + memset(&vsp, 0, sizeof(vsp)); + vsp.vsp_nmemranges = vcp->vcp_nmemranges; + vsp.vsp_vm_id = vcp->vcp_id; + memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges, + sizeof(vsp.vsp_memranges)); + + /* + * Use mmap(2) to identify virtual address space for our mappings. */ for (i = 0; i < VMM_MAX_MEM_RANGES; i++) { - if (i < vcp->vcp_nmemranges) { - vmr = &vcp->vcp_memranges[i]; - /* Skip ranges we know we don't need right now. */ + if (i < vsp.vsp_nmemranges) { + vmr = &vsp.vsp_memranges[i]; + + /* Ignore any MMIO ranges. */ if (vmr->vmr_type == VM_MEM_MMIO) { - log_debug("%s: skipping range i=%ld, type=%d", - __func__, i, vmr->vmr_type); - vm->vm_memfds[i] = -1; + vmr->vmr_va = 0; + vcp->vcp_memranges[i].vmr_va = 0; continue; } - /* Re-mmap the memrange. */ - p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0); + + /* Make initial mappings for the memrange. */ + p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1, + 0); if (p == MAP_FAILED) { ret = errno; log_warn("%s: mmap", __func__); @@ -2594,11 +2568,29 @@ remap_guest_mem(struct vmd_vm *vm) return (ret); } vmr->vmr_va = (vaddr_t)p; - } else { - /* Initialize with an invalid fd. */ - vm->vm_memfds[i] = -1; + vcp->vcp_memranges[i].vmr_va = vmr->vmr_va; } } + /* + * munmap(2) now that we have va's and ranges that don't overlap. vmm + * will use the va's and sizes to recreate the mappings for us. + */ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + if (vmr->vmr_type == VM_MEM_MMIO) + continue; + if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1) + fatal("%s: munmap", __func__); + } + + /* + * Ask vmm to enter the shared mappings for us. They'll point + * to the same host physical memory, but will have a randomized + * virtual address for the calling process. + */ + if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1) + return (errno); + return (0); } diff --git a/usr.sbin/vmd/vmd.c b/usr.sbin/vmd/vmd.c index b8cc0a09fe3..86a5132fe22 100644 --- a/usr.sbin/vmd/vmd.c +++ b/usr.sbin/vmd/vmd.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.c,v 1.148 2023/05/12 16:18:17 dv Exp $ */ +/* $OpenBSD: vmd.c,v 1.149 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org> @@ -788,7 +788,8 @@ main(int argc, char **argv) struct privsep *ps; int ch; enum privsep_procid proc_id = PROC_PARENT; - int proc_instance = 0, vm_launch = 0, vm_fd = -1; + int proc_instance = 0, vm_launch = 0; + int vmm_fd = -1, vm_fd = -1; const char *errp, *title = NULL; int argc0 = argc; char dev_type = '\0'; @@ -798,7 +799,7 @@ main(int argc, char **argv) if ((env = calloc(1, sizeof(*env))) == NULL) fatal("calloc: env"); - while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) { + while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) { switch (ch) { case 'D': if (cmdline_symset(optarg) < 0) @@ -852,6 +853,11 @@ main(int argc, char **argv) default: fatalx("invalid device type"); } break; + case 'i': + vmm_fd = strtonum(optarg, 0, 128, &errp); + if (errp) + fatalx("invalid vmm fd"); + break; default: usage(); } @@ -880,7 +886,7 @@ main(int argc, char **argv) ps = &env->vmd_ps; ps->ps_env = env; - env->vmd_fd = -1; + env->vmd_fd = vmm_fd; if (config_init(env) == -1) fatal("failed to initialize configuration"); @@ -896,14 +902,14 @@ main(int argc, char **argv) * If we're launching a new vm or its device, we short out here. */ if (vm_launch == VMD_LAUNCH_VM) { - vm_main(vm_fd); + vm_main(vm_fd, vmm_fd); /* NOTREACHED */ } else if (vm_launch == VMD_LAUNCH_DEV) { if (dev_type == VMD_DEVTYPE_NET) { - vionet_main(vm_fd); + vionet_main(vm_fd, vmm_fd); /* NOTREACHED */ } else if (dev_type == VMD_DEVTYPE_DISK) { - vioblk_main(vm_fd); + vioblk_main(vm_fd, vmm_fd); /* NOTREACHED */ } fatalx("unsupported device type '%c'", dev_type); diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h index 68de0544706..9c25b0c92ad 100644 --- a/usr.sbin/vmd/vmd.h +++ b/usr.sbin/vmd/vmd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.h,v 1.121 2023/04/28 19:46:42 dv Exp $ */ +/* $OpenBSD: vmd.h,v 1.122 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> @@ -329,9 +329,6 @@ struct vmd_vm { struct timeval vm_start_tv; int vm_start_limit; - int vm_memfds[VMM_MAX_MEM_RANGES]; - size_t vm_nmemfds; - TAILQ_ENTRY(vmd_vm) vm_entry; }; TAILQ_HEAD(vmlist, vmd_vm); @@ -488,7 +485,7 @@ int fd_hasdata(int); int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *)); /* vm.c */ -void vm_main(int); +void vm_main(int, int); void mutex_lock(pthread_mutex_t *); void mutex_unlock(pthread_mutex_t *); int read_mem(paddr_t, void *buf, size_t); @@ -499,7 +496,7 @@ void vm_pipe_send(struct vm_dev_pipe *, enum pipe_msg_type); enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *); int write_mem(paddr_t, const void *buf, size_t); void* hvaddr_mem(paddr_t, size_t); -int remap_guest_mem(struct vmd_vm *); +int remap_guest_mem(struct vmd_vm *, int); /* config.c */ int config_init(struct vmd *); @@ -527,9 +524,9 @@ int host(const char *, struct address *); int virtio_get_base(int, char *, size_t, int, const char *); /* vionet.c */ -__dead void vionet_main(int); +__dead void vionet_main(int, int); /* vioblk.c */ -__dead void vioblk_main(int); +__dead void vioblk_main(int, int); #endif /* VMD_H */ diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c index 35119673dc3..7f307f99c9c 100644 --- a/usr.sbin/vmd/vmm.c +++ b/usr.sbin/vmd/vmm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.c,v 1.111 2023/04/27 22:47:27 dv Exp $ */ +/* $OpenBSD: vmm.c,v 1.112 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> @@ -627,7 +627,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) { struct vm_create_params *vcp; struct vmd_vm *vm; - char *nargv[6], num[32]; + char *nargv[8], num[32], vmm_fd[32]; int fd, ret = EINVAL; int fds[2]; pid_t vm_pid; @@ -701,16 +701,6 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) if (ret == EIO) goto err; - /* Send the fd number for /dev/vmm. */ - sz = atomicio(vwrite, fds[0], &env->vmd_fd, - sizeof(env->vmd_fd)); - if (sz != sizeof(env->vmd_fd)) { - log_warnx("%s: failed to send /dev/vmm fd for vm '%s'", - __func__, vcp->vcp_name); - ret = EIO; - goto err; - } - /* Read back the kernel-generated vm id from the child */ sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)); if (sz != sizeof(vcp->vcp_id)) { @@ -773,17 +763,21 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) memset(&nargv, 0, sizeof(nargv)); memset(num, 0, sizeof(num)); snprintf(num, sizeof(num), "%d", fds[1]); + memset(vmm_fd, 0, sizeof(vmm_fd)); + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); nargv[0] = env->argv0; nargv[1] = "-V"; nargv[2] = num; nargv[3] = "-n"; + nargv[4] = "-i"; + nargv[5] = vmm_fd; if (env->vmd_verbose) { - nargv[4] = "-v"; - nargv[5] = NULL; + nargv[6] = "-v"; + nargv[7] = NULL; } else - nargv[4] = NULL; + nargv[6] = NULL; /* Control resumes in vmd main(). */ execvp(nargv[0], nargv); |