/* $OpenBSD: vmd.c,v 1.155 2024/02/05 21:58:09 dv Exp $ */ /* * Copyright (c) 2015 Reyk Floeter * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "proc.h" #include "atomicio.h" #include "vmd.h" __dead void usage(void); int main(int, char **); int vmd_configure(void); void vmd_sighdlr(int sig, short event, void *arg); void vmd_shutdown(void); int vmd_control_run(void); int vmd_dispatch_control(int, struct privsep_proc *, struct imsg *); int vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *); int vmd_dispatch_agentx(int, struct privsep_proc *, struct imsg *); int vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *); int vmd_check_vmh(struct vm_dump_header *); int vm_instance(struct privsep *, struct vmd_vm **, struct vmop_create_params *, uid_t); int vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t); int vm_claimid(const char *, int, uint32_t *); void start_vm_batch(int, short, void*); static inline void vm_terminate(struct vmd_vm *, const char *); struct vmd *env; static struct privsep_proc procs[] = { /* Keep "priv" on top as procs[0] */ { "priv", PROC_PRIV, vmd_dispatch_priv, priv }, { "control", PROC_CONTROL, vmd_dispatch_control, control }, { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm, vmm_shutdown, "/" }, { "agentx", PROC_AGENTX, vmd_dispatch_agentx, vm_agentx, vm_agentx_shutdown, "/" } }; enum privsep_procid privsep_process; struct event staggered_start_timer; /* For the privileged process */ static struct privsep_proc *proc_priv = &procs[0]; static struct passwd proc_privpw; static const uint8_t zero_mac[ETHER_ADDR_LEN]; const char default_conffile[] = VMD_CONF; const char *conffile = default_conffile; int vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg) { struct privsep *ps = p->p_ps; int res = 0, ret = 0, cmd = 0, verbose; int ifd; unsigned int v = 0, flags; struct vmop_create_params vmc; struct vmop_id vid; struct vmop_result vmr; struct vm_dump_header vmh; struct vmd_vm *vm = NULL; char *str = NULL; uint32_t id = 0; struct control_sock *rcs; switch (imsg->hdr.type) { case IMSG_VMDOP_START_VM_REQUEST: IMSG_SIZE_CHECK(imsg, &vmc); memcpy(&vmc, imsg->data, sizeof(vmc)); vmc.vmc_kernel = imsg_get_fd(imsg); /* Try registering our VM in our list of known VMs. */ if (vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid)) { res = errno; /* Did we have a failure during lookup of a parent? */ if (vm == NULL) { cmd = IMSG_VMDOP_START_VM_RESPONSE; break; } /* Does the VM already exist? */ if (res == EALREADY) { /* Is it already running? */ if (vm->vm_state & VM_STATE_RUNNING) { cmd = IMSG_VMDOP_START_VM_RESPONSE; break; } /* If not running, are our flags ok? */ if (vmc.vmc_flags && vmc.vmc_flags != VMOP_CREATE_KERNEL) { cmd = IMSG_VMDOP_START_VM_RESPONSE; break; } } res = 0; } /* Try to start the launch of the VM. */ res = config_setvm(ps, vm, imsg->hdr.peerid, vm->vm_params.vmc_owner.uid); if (res) cmd = IMSG_VMDOP_START_VM_RESPONSE; break; case IMSG_VMDOP_WAIT_VM_REQUEST: case IMSG_VMDOP_TERMINATE_VM_REQUEST: IMSG_SIZE_CHECK(imsg, &vid); memcpy(&vid, imsg->data, sizeof(vid)); flags = vid.vid_flags; cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; if ((id = vid.vid_id) == 0) { /* Lookup vm (id) by name */ if ((vm = vm_getbyname(vid.vid_name)) == NULL) { res = ENOENT; break; } id = vm->vm_vmid; } else if ((vm = vm_getbyvmid(id)) == NULL) { res = ENOENT; break; } /* Validate curent state of vm */ if ((vm->vm_state & VM_STATE_SHUTDOWN) && (flags & VMOP_FORCE) == 0) { res = EALREADY; break; } else if (!(vm->vm_state & VM_STATE_RUNNING)) { res = EINVAL; break; } else if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) { res = EPERM; break; } /* Only relay TERMINATION requests, not WAIT requests */ if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) { memset(&vid, 0, sizeof(vid)); vid.vid_id = id; vid.vid_flags = flags; if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1) return (-1); } break; case IMSG_VMDOP_GET_INFO_VM_REQUEST: proc_forward_imsg(ps, imsg, PROC_VMM, -1); break; case IMSG_VMDOP_LOAD: IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */ str = get_string((uint8_t *)imsg->data, IMSG_DATA_SIZE(imsg)); case IMSG_VMDOP_RELOAD: if (vmd_reload(0, str) == -1) cmd = IMSG_CTL_FAIL; else cmd = IMSG_CTL_OK; free(str); break; case IMSG_CTL_RESET: IMSG_SIZE_CHECK(imsg, &v); memcpy(&v, imsg->data, sizeof(v)); if (vmd_reload(v, NULL) == -1) cmd = IMSG_CTL_FAIL; else cmd = IMSG_CTL_OK; break; case IMSG_CTL_VERBOSE: IMSG_SIZE_CHECK(imsg, &verbose); memcpy(&verbose, imsg->data, sizeof(verbose)); log_setverbose(verbose); proc_forward_imsg(ps, imsg, PROC_VMM, -1); proc_forward_imsg(ps, imsg, PROC_PRIV, -1); cmd = IMSG_CTL_OK; break; case IMSG_VMDOP_PAUSE_VM: case IMSG_VMDOP_UNPAUSE_VM: IMSG_SIZE_CHECK(imsg, &vid); memcpy(&vid, imsg->data, sizeof(vid)); if (vid.vid_id == 0) { if ((vm = vm_getbyname(vid.vid_name)) == NULL) { res = ENOENT; cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM ? IMSG_VMDOP_PAUSE_VM_RESPONSE : IMSG_VMDOP_UNPAUSE_VM_RESPONSE; break; } else { vid.vid_id = vm->vm_vmid; } } else if ((vm = vm_getbyid(vid.vid_id)) == NULL) { res = ENOENT; cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM ? IMSG_VMDOP_PAUSE_VM_RESPONSE : IMSG_VMDOP_UNPAUSE_VM_RESPONSE; break; } if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid) != 0) { res = EPERM; cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM ? IMSG_VMDOP_PAUSE_VM_RESPONSE : IMSG_VMDOP_UNPAUSE_VM_RESPONSE; break; } proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, imsg->hdr.peerid, -1, &vid, sizeof(vid)); break; case IMSG_VMDOP_SEND_VM_REQUEST: IMSG_SIZE_CHECK(imsg, &vid); memcpy(&vid, imsg->data, sizeof(vid)); id = vid.vid_id; ifd = imsg_get_fd(imsg); if (vid.vid_id == 0) { if ((vm = vm_getbyname(vid.vid_name)) == NULL) { res = ENOENT; cmd = IMSG_VMDOP_SEND_VM_RESPONSE; close(ifd); break; } else { vid.vid_id = vm->vm_vmid; } } else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) { res = ENOENT; cmd = IMSG_VMDOP_SEND_VM_RESPONSE; close(ifd); break; } vmr.vmr_id = vid.vid_id; log_debug("%s: sending fd to vmm", __func__); proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, imsg->hdr.peerid, ifd, &vid, sizeof(vid)); break; case IMSG_VMDOP_RECEIVE_VM_REQUEST: IMSG_SIZE_CHECK(imsg, &vid); memcpy(&vid, imsg->data, sizeof(vid)); ifd = imsg_get_fd(imsg); if (ifd == -1) { log_warnx("%s: invalid fd", __func__); return (-1); } if (atomicio(read, ifd, &vmh, sizeof(vmh)) != sizeof(vmh)) { log_warnx("%s: error reading vmh from received vm", __func__); res = EIO; close(ifd); cmd = IMSG_VMDOP_START_VM_RESPONSE; break; } if (vmd_check_vmh(&vmh)) { res = ENOENT; close(ifd); cmd = IMSG_VMDOP_START_VM_RESPONSE; break; } if (atomicio(read, ifd, &vmc, sizeof(vmc)) != sizeof(vmc)) { log_warnx("%s: error reading vmc from received vm", __func__); res = EIO; close(ifd); cmd = IMSG_VMDOP_START_VM_RESPONSE; break; } strlcpy(vmc.vmc_params.vcp_name, vid.vid_name, sizeof(vmc.vmc_params.vcp_name)); vmc.vmc_params.vcp_id = 0; ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid); if (ret != 0) { res = errno; cmd = IMSG_VMDOP_START_VM_RESPONSE; close(ifd); } else { vm->vm_state |= VM_STATE_RECEIVED; config_setvm(ps, vm, imsg->hdr.peerid, vmc.vmc_owner.uid); log_debug("%s: sending fd to vmm", __func__); proc_compose_imsg(ps, PROC_VMM, -1, IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, ifd, NULL, 0); } break; case IMSG_VMDOP_DONE: control_reset(&ps->ps_csock); TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry) control_reset(rcs); cmd = 0; break; default: return (-1); } switch (cmd) { case 0: break; case IMSG_VMDOP_START_VM_RESPONSE: case IMSG_VMDOP_TERMINATE_VM_RESPONSE: memset(&vmr, 0, sizeof(vmr)); vmr.vmr_result = res; vmr.vmr_id = id; if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd, imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) return (-1); break; default: if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd, imsg->hdr.peerid, -1, &res, sizeof(res)) == -1) return (-1); break; } return (0); } int vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg) { struct vmop_result vmr; struct privsep *ps = p->p_ps; int res = 0; struct vmd_vm *vm; struct vm_create_params *vcp; struct vmop_info_result vir; switch (imsg->hdr.type) { case IMSG_VMDOP_PAUSE_VM_RESPONSE: IMSG_SIZE_CHECK(imsg, &vmr); memcpy(&vmr, imsg->data, sizeof(vmr)); if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) break; proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type, imsg->hdr.peerid, -1, imsg->data, sizeof(imsg->data)); log_info("%s: paused vm %d successfully", vm->vm_params.vmc_params.vcp_name, vm->vm_vmid); vm->vm_state |= VM_STATE_PAUSED; break; case IMSG_VMDOP_UNPAUSE_VM_RESPONSE: IMSG_SIZE_CHECK(imsg, &vmr); memcpy(&vmr, imsg->data, sizeof(vmr)); if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) break; proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type, imsg->hdr.peerid, -1, imsg->data, sizeof(imsg->data)); log_info("%s: unpaused vm %d successfully.", vm->vm_params.vmc_params.vcp_name, vm->vm_vmid); vm->vm_state &= ~VM_STATE_PAUSED; break; case IMSG_VMDOP_START_VM_RESPONSE: IMSG_SIZE_CHECK(imsg, &vmr); memcpy(&vmr, imsg->data, sizeof(vmr)); if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) break; vm->vm_pid = vmr.vmr_pid; vcp = &vm->vm_params.vmc_params; vcp->vcp_id = vmr.vmr_id; /* * If the peerid is not -1, forward the response back to the * the control socket. If it is -1, the request originated * from the parent, not the control socket. */ if (vm->vm_peerid != (uint32_t)-1) { (void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname, sizeof(vmr.vmr_ttyname)); if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type, vm->vm_peerid, -1, &vmr, sizeof(vmr)) == -1) { errno = vmr.vmr_result; log_warn("%s: failed to forward vm result", vcp->vcp_name); vm_terminate(vm, __func__); return (-1); } } if (vmr.vmr_result) { log_warnx("%s: failed to start vm", vcp->vcp_name); vm_terminate(vm, __func__); errno = vmr.vmr_result; break; } /* Now configure all the interfaces */ if (vm_priv_ifconfig(ps, vm) == -1) { log_warn("%s: failed to configure vm", vcp->vcp_name); vm_terminate(vm, __func__); break; } log_info("started %s (vm %d) successfully, tty %s", vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname); break; case IMSG_VMDOP_TERMINATE_VM_RESPONSE: IMSG_SIZE_CHECK(imsg, &vmr); memcpy(&vmr, imsg->data, sizeof(vmr)); if (vmr.vmr_result) { DPRINTF("%s: forwarding TERMINATE VM for vm id %d", __func__, vmr.vmr_id); proc_forward_imsg(ps, imsg, PROC_CONTROL, -1); } else { if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) break; /* Mark VM as shutting down */ vm->vm_state |= VM_STATE_SHUTDOWN; } break; case IMSG_VMDOP_SEND_VM_RESPONSE: IMSG_SIZE_CHECK(imsg, &vmr); memcpy(&vmr, imsg->data, sizeof(vmr)); if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) break; if (!vmr.vmr_result) { log_info("%s: sent vm %d successfully.", vm->vm_params.vmc_params.vcp_name, vm->vm_vmid); vm_terminate(vm, __func__); } /* Send a response if a control client is waiting for it */ if (imsg->hdr.peerid != (uint32_t)-1) { /* the error is meaningless for deferred responses */ vmr.vmr_result = 0; if (proc_compose_imsg(ps, PROC_CONTROL, -1, IMSG_VMDOP_SEND_VM_RESPONSE, imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) return (-1); } break; case IMSG_VMDOP_TERMINATE_VM_EVENT: IMSG_SIZE_CHECK(imsg, &vmr); memcpy(&vmr, imsg->data, sizeof(vmr)); DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d", __func__, vmr.vmr_id, vmr.vmr_result); if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) { log_debug("%s: vm %d is no longer available", __func__, vmr.vmr_id); break; } if (vmr.vmr_result != EAGAIN || vm->vm_params.vmc_bootdevice) { vm_terminate(vm, __func__); } else { /* Stop VM instance but keep the tty open */ vm_stop(vm, 1, __func__); config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid); } /* The error is meaningless for deferred responses */ vmr.vmr_result = 0; if (proc_compose_imsg(ps, PROC_CONTROL, -1, IMSG_VMDOP_TERMINATE_VM_EVENT, imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) return (-1); break; case IMSG_VMDOP_GET_INFO_VM_DATA: IMSG_SIZE_CHECK(imsg, &vir); memcpy(&vir, imsg->data, sizeof(vir)); if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) { memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname)); if (vm->vm_ttyname[0] != '\0') strlcpy(vir.vir_ttyname, vm->vm_ttyname, sizeof(vir.vir_ttyname)); log_debug("%s: running vm: %d, vm_state: 0x%x", __func__, vm->vm_vmid, vm->vm_state); vir.vir_state = vm->vm_state; /* get the user id who started the vm */ vir.vir_uid = vm->vm_uid; vir.vir_gid = vm->vm_params.vmc_owner.gid; } if (proc_compose_imsg(ps, imsg->hdr.peerid == IMSG_AGENTX_PEERID ? PROC_AGENTX : PROC_CONTROL, -1, imsg->hdr.type, imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) { if (vm) vm_terminate(vm, __func__); return (-1); } break; case IMSG_VMDOP_GET_INFO_VM_END_DATA: /* * PROC_VMM has responded with the *running* VMs, now we * append the others. These use the special value 0 for their * kernel id to indicate that they are not running. */ TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { if (!(vm->vm_state & VM_STATE_RUNNING)) { memset(&vir, 0, sizeof(vir)); vir.vir_info.vir_id = vm->vm_vmid; strlcpy(vir.vir_info.vir_name, vm->vm_params.vmc_params.vcp_name, VMM_MAX_NAME_LEN); vir.vir_info.vir_memory_size = vm->vm_params.vmc_params. vcp_memranges[0].vmr_size; vir.vir_info.vir_ncpus = vm->vm_params.vmc_params.vcp_ncpus; /* get the configured user id for this vm */ vir.vir_uid = vm->vm_params.vmc_owner.uid; vir.vir_gid = vm->vm_params.vmc_owner.gid; log_debug("%s: vm: %d, vm_state: 0x%x", __func__, vm->vm_vmid, vm->vm_state); vir.vir_state = vm->vm_state; if (proc_compose_imsg(ps, imsg->hdr.peerid == IMSG_AGENTX_PEERID ? PROC_AGENTX : PROC_CONTROL, -1, IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) { log_debug("%s: GET_INFO_VM_END failed", __func__); vm_terminate(vm, __func__); return (-1); } } } IMSG_SIZE_CHECK(imsg, &res); proc_forward_imsg(ps, imsg, imsg->hdr.peerid == IMSG_AGENTX_PEERID ? PROC_AGENTX : PROC_CONTROL, -1); break; default: return (-1); } return (0); } int vmd_dispatch_agentx(int fd, struct privsep_proc *p, struct imsg *imsg) { struct privsep *ps = p->p_ps; switch (imsg->hdr.type) { case IMSG_VMDOP_GET_INFO_VM_REQUEST: proc_forward_imsg(ps, imsg, PROC_VMM, -1); return (0); default: break; } return (-1); } int vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg) { struct vmop_addr_result var; switch (imsg->hdr.type) { case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE: IMSG_SIZE_CHECK(imsg, &var); memcpy(&var, imsg->data, sizeof(var)); proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1); break; default: return (-1); } return (0); } int vmd_check_vmh(struct vm_dump_header *vmh) { int i; unsigned int code, leaf; unsigned int a, b, c, d; if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) { log_warnx("%s: incompatible dump signature", __func__); return (-1); } if (vmh->vmh_version != VM_DUMP_VERSION) { log_warnx("%s: incompatible dump version", __func__); return (-1); } for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { code = vmh->vmh_cpuids[i].code; leaf = vmh->vmh_cpuids[i].leaf; if (leaf != 0x00) { log_debug("%s: invalid leaf 0x%x for code 0x%x", __func__, leaf, code); return (-1); } switch (code) { case 0x00: CPUID_LEAF(code, leaf, a, b, c, d); if (vmh->vmh_cpuids[i].a > a) { log_debug("%s: incompatible cpuid level", __func__); return (-1); } if (!(vmh->vmh_cpuids[i].b == b && vmh->vmh_cpuids[i].c == c && vmh->vmh_cpuids[i].d == d)) { log_debug("%s: incompatible cpu brand", __func__); return (-1); } break; case 0x01: CPUID_LEAF(code, leaf, a, b, c, d); if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) != (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) { log_debug("%s: incompatible cpu features " "code: 0x%x leaf: 0x%x reg: c", __func__, code, leaf); return (-1); } if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) != (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) { log_debug("%s: incompatible cpu features " "code: 0x%x leaf: 0x%x reg: d", __func__, code, leaf); return (-1); } break; case 0x07: CPUID_LEAF(code, leaf, a, b, c, d); if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) != (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) { log_debug("%s: incompatible cpu features " "code: 0x%x leaf: 0x%x reg: c", __func__, code, leaf); return (-1); } if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) != (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) { log_debug("%s: incompatible cpu features " "code: 0x%x leaf: 0x%x reg: d", __func__, code, leaf); return (-1); } break; case 0x0d: CPUID_LEAF(code, leaf, a, b, c, d); if (vmh->vmh_cpuids[i].b > b) { log_debug("%s: incompatible cpu: insufficient " "max save area for enabled XCR0 features", __func__); return (-1); } if (vmh->vmh_cpuids[i].c > c) { log_debug("%s: incompatible cpu: insufficient " "max save area for supported XCR0 features", __func__); return (-1); } break; case 0x80000001: CPUID_LEAF(code, leaf, a, b, c, d); if ((vmh->vmh_cpuids[i].a & a) != vmh->vmh_cpuids[i].a) { log_debug("%s: incompatible cpu features " "code: 0x%x leaf: 0x%x reg: a", __func__, code, leaf); return (-1); } if ((vmh->vmh_cpuids[i].c & c) != vmh->vmh_cpuids[i].c) { log_debug("%s: incompatible cpu features " "code: 0x%x leaf: 0x%x reg: c", __func__, code, leaf); return (-1); } if ((vmh->vmh_cpuids[i].d & d) != vmh->vmh_cpuids[i].d) { log_debug("%s: incompatible cpu features " "code: 0x%x leaf: 0x%x reg: d", __func__, code, leaf); return (-1); } break; default: log_debug("%s: unknown code 0x%x", __func__, code); return (-1); } } return (0); } void vmd_sighdlr(int sig, short event, void *arg) { if (privsep_process != PROC_PARENT) return; log_debug("%s: handling signal", __func__); switch (sig) { case SIGHUP: log_info("%s: reload requested with SIGHUP", __func__); /* * This is safe because libevent uses async signal handlers * that run in the event loop and not in signal context. */ (void)vmd_reload(0, NULL); break; case SIGPIPE: log_info("%s: ignoring SIGPIPE", __func__); break; case SIGUSR1: log_info("%s: ignoring SIGUSR1", __func__); break; case SIGTERM: case SIGINT: vmd_shutdown(); break; default: fatalx("unexpected signal"); } } __dead void usage(void) { extern char *__progname; fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n", __progname); exit(1); } int main(int argc, char **argv) { struct privsep *ps; int ch; enum privsep_procid proc_id = PROC_PARENT; int proc_instance = 0, vm_launch = 0; int vmm_fd = -1, vm_fd = -1; const char *errp, *title = NULL; int argc0 = argc; char dev_type = '\0'; log_init(0, LOG_DAEMON); if ((env = calloc(1, sizeof(*env))) == NULL) fatal("calloc: env"); env->vmd_fd = -1; env->vmd_fd6 = -1; while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:vp:")) != -1) { switch (ch) { case 'D': if (cmdline_symset(optarg) < 0) log_warnx("could not parse macro definition %s", optarg); break; case 'd': env->vmd_debug = 2; break; case 'f': conffile = optarg; break; case 'v': env->vmd_verbose++; break; /* vmd fork/exec */ case 'n': env->vmd_noaction = 1; break; case 'P': title = optarg; proc_id = proc_getid(procs, nitems(procs), title); if (proc_id == PROC_MAX) fatalx("invalid process name"); break; case 'I': proc_instance = strtonum(optarg, 0, PROC_MAX_INSTANCES, &errp); if (errp) fatalx("invalid process instance"); break; /* child vm and device fork/exec */ case 'p': title = optarg; break; case 'V': vm_launch = VMD_LAUNCH_VM; vm_fd = strtonum(optarg, 0, 128, &errp); if (errp) fatalx("invalid vm fd"); break; case 'X': vm_launch = VMD_LAUNCH_DEV; vm_fd = strtonum(optarg, 0, 128, &errp); if (errp) fatalx("invalid device fd"); break; case 't': dev_type = *optarg; switch (dev_type) { case VMD_DEVTYPE_NET: case VMD_DEVTYPE_DISK: break; default: fatalx("invalid device type"); } break; case 'i': vmm_fd = strtonum(optarg, 0, 128, &errp); if (errp) fatalx("invalid vmm fd"); break; default: usage(); } } argc -= optind; if (argc > 0) usage(); if (env->vmd_noaction && !env->vmd_debug) env->vmd_debug = 1; log_init(env->vmd_debug, LOG_DAEMON); log_setverbose(env->vmd_verbose); /* Re-exec from the vmm child process requires an absolute path. */ if (proc_id == PROC_PARENT && *argv[0] != '/' && !env->vmd_noaction) fatalx("re-exec requires execution with an absolute path"); env->argv0 = argv[0]; /* check for root privileges */ if (env->vmd_noaction == 0 && !vm_launch) { if (geteuid()) fatalx("need root privileges"); } ps = &env->vmd_ps; ps->ps_env = env; if (config_init(env) == -1) fatal("failed to initialize configuration"); if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL) fatal("unknown user %s", VMD_USER); /* First proc runs as root without pledge but in default chroot */ proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */ proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */ /* * If we're launching a new vm or its device, we short out here. */ if (vm_launch == VMD_LAUNCH_VM) { vm_main(vm_fd, vmm_fd); /* NOTREACHED */ } else if (vm_launch == VMD_LAUNCH_DEV) { if (dev_type == VMD_DEVTYPE_NET) { log_procinit("vm/%s/vionet", title); vionet_main(vm_fd, vmm_fd); /* NOTREACHED */ } else if (dev_type == VMD_DEVTYPE_DISK) { log_procinit("vm/%s/vioblk", title); vioblk_main(vm_fd, vmm_fd); /* NOTREACHED */ } fatalx("unsupported device type '%c'", dev_type); } /* Open /dev/vmm early. */ if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) { env->vmd_fd = open(VMM_NODE, O_RDWR | O_CLOEXEC); if (env->vmd_fd == -1) fatal("%s", VMM_NODE); } /* Configure the control socket */ ps->ps_csock.cs_name = SOCKET_NAME; TAILQ_INIT(&ps->ps_rcsocks); /* Configuration will be parsed after forking the children */ env->vmd_conffile = conffile; if (env->vmd_noaction) ps->ps_noaction = 1; ps->ps_instance = proc_instance; if (title != NULL) ps->ps_title[proc_id] = title; /* only the parent returns */ proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv, proc_id); if (!env->vmd_debug && daemon(0, 0) == -1) fatal("can't daemonize"); if (ps->ps_noaction == 0) log_info("startup"); event_init(); signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps); signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps); signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps); signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps); signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps); signal_add(&ps->ps_evsigint, NULL); signal_add(&ps->ps_evsigterm, NULL); signal_add(&ps->ps_evsighup, NULL); signal_add(&ps->ps_evsigpipe, NULL); signal_add(&ps->ps_evsigusr1, NULL); if (!env->vmd_noaction) proc_connect(ps); if (vmd_configure() == -1) fatalx("configuration failed"); event_dispatch(); log_debug("exiting"); return (0); } void start_vm_batch(int fd, short type, void *args) { int i = 0; struct vmd_vm *vm; log_debug("%s: starting batch of %d vms", __func__, env->vmd_cfg.parallelism); TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { if (!(vm->vm_state & VM_STATE_WAITING)) { log_debug("%s: not starting vm %s (disabled)", __func__, vm->vm_params.vmc_params.vcp_name); continue; } i++; if (i > env->vmd_cfg.parallelism) { evtimer_add(&staggered_start_timer, &env->vmd_cfg.delay); break; } vm->vm_state &= ~VM_STATE_WAITING; config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid); } log_debug("%s: done starting vms", __func__); } int vmd_configure(void) { int ncpus; struct vmd_switch *vsw; int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE}; size_t ncpus_sz = sizeof(ncpus); /* * pledge in the parent process: * stdio - for malloc and basic I/O including events. * rpath - for reload to open and read the configuration files. * wpath - for opening disk images and tap devices. * tty - for openpty and TIOCUCNTL. * proc - run kill to terminate its children safely. * sendfd - for disks, interfaces and other fds. * recvfd - for send and receive. * getpw - lookup user or group id by name. * chown, fattr - change tty ownership * flock - locking disk files */ if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw" " chown fattr flock", NULL) == -1) fatal("pledge"); if ((env->vmd_ptmfd = getptmfd()) == -1) fatal("getptmfd %s", PATH_PTMDEV); if (parse_config(env->vmd_conffile) == -1) { proc_kill(&env->vmd_ps); exit(1); } if (env->vmd_noaction) { fprintf(stderr, "configuration OK\n"); proc_kill(&env->vmd_ps); exit(0); } /* Send VMM device fd to vmm proc. */ proc_compose_imsg(&env->vmd_ps, PROC_VMM, -1, IMSG_VMDOP_RECEIVE_VMM_FD, -1, env->vmd_fd, NULL, 0); /* Send shared global configuration to all children */ if (config_setconfig(env) == -1) return (-1); TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { if (vsw->sw_running) continue; if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) { log_warn("%s: failed to create switch %s", __func__, vsw->sw_name); switch_remove(vsw); return (-1); } } if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) { env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY; if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1) ncpus = 1; env->vmd_cfg.parallelism = ncpus; log_debug("%s: setting staggered start configuration to " "parallelism: %d and delay: %lld", __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec); } log_debug("%s: starting vms in staggered fashion", __func__); evtimer_set(&staggered_start_timer, start_vm_batch, NULL); /* start first batch */ start_vm_batch(0, 0, NULL); return (0); } int vmd_reload(unsigned int reset, const char *filename) { struct vmd_vm *vm, *next_vm; struct vmd_switch *vsw; int reload = 0; /* Switch back to the default config file */ if (filename == NULL || *filename == '\0') { filename = env->vmd_conffile; reload = 1; } log_debug("%s: level %d config file %s", __func__, reset, filename); if (reset) { /* Purge the configuration */ config_purge(env, reset); config_setreset(env, reset); } else { /* * Load or reload the configuration. * * Reloading removes all non-running VMs before processing the * config file, whereas loading only adds to the existing list * of VMs. */ if (reload) { TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, next_vm) { if (!(vm->vm_state & VM_STATE_RUNNING)) { DPRINTF("%s: calling vm_remove", __func__); vm_remove(vm, __func__); } } } if (parse_config(filename) == -1) { log_debug("%s: failed to load config file %s", __func__, filename); return (-1); } if (reload) { /* Update shared global configuration in all children */ if (config_setconfig(env) == -1) return (-1); } TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { if (vsw->sw_running) continue; if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) { log_warn("%s: failed to create switch %s", __func__, vsw->sw_name); switch_remove(vsw); return (-1); } } log_debug("%s: starting vms in staggered fashion", __func__); evtimer_set(&staggered_start_timer, start_vm_batch, NULL); /* start first batch */ start_vm_batch(0, 0, NULL); } return (0); } void vmd_shutdown(void) { struct vmd_vm *vm, *vm_next; log_debug("%s: performing shutdown", __func__); TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) { vm_remove(vm, __func__); } proc_kill(&env->vmd_ps); free(env); log_warnx("terminating"); exit(0); } struct vmd_vm * vm_getbyvmid(uint32_t vmid) { struct vmd_vm *vm; if (vmid == 0) return (NULL); TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { if (vm->vm_vmid == vmid) return (vm); } return (NULL); } struct vmd_vm * vm_getbyid(uint32_t id) { struct vmd_vm *vm; if (id == 0) return (NULL); TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { if (vm->vm_params.vmc_params.vcp_id == id) return (vm); } return (NULL); } uint32_t vm_id2vmid(uint32_t id, struct vmd_vm *vm) { if (vm == NULL && (vm = vm_getbyid(id)) == NULL) return (0); DPRINTF("%s: vmm id %u is vmid %u", __func__, id, vm->vm_vmid); return (vm->vm_vmid); } uint32_t vm_vmid2id(uint32_t vmid, struct vmd_vm *vm) { if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL) return (0); DPRINTF("%s: vmid %u is vmm id %u", __func__, vmid, vm->vm_params.vmc_params.vcp_id); return (vm->vm_params.vmc_params.vcp_id); } struct vmd_vm * vm_getbyname(const char *name) { struct vmd_vm *vm; if (name == NULL) return (NULL); TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0) return (vm); } return (NULL); } struct vmd_vm * vm_getbypid(pid_t pid) { struct vmd_vm *vm; TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { if (vm->vm_pid == pid) return (vm); } return (NULL); } void vm_stop(struct vmd_vm *vm, int keeptty, const char *caller) { struct privsep *ps = &env->vmd_ps; unsigned int i, j; if (vm == NULL) return; log_debug("%s: %s %s stopping vm %d%s", __func__, ps->ps_title[privsep_process], caller, vm->vm_vmid, keeptty ? ", keeping tty open" : ""); vm->vm_state &= ~(VM_STATE_RECEIVED | VM_STATE_RUNNING | VM_STATE_SHUTDOWN); if (vm->vm_iev.ibuf.fd != -1) { event_del(&vm->vm_iev.ev); close(vm->vm_iev.ibuf.fd); } for (i = 0; i < VM_MAX_DISKS_PER_VM; i++) { for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) { if (vm->vm_disks[i][j] != -1) { close(vm->vm_disks[i][j]); vm->vm_disks[i][j] = -1; } } } for (i = 0; i < VM_MAX_NICS_PER_VM; i++) { if (vm->vm_ifs[i].vif_fd != -1) { close(vm->vm_ifs[i].vif_fd); vm->vm_ifs[i].vif_fd = -1; } free(vm->vm_ifs[i].vif_name); free(vm->vm_ifs[i].vif_switch); free(vm->vm_ifs[i].vif_group); vm->vm_ifs[i].vif_name = NULL; vm->vm_ifs[i].vif_switch = NULL; vm->vm_ifs[i].vif_group = NULL; } if (vm->vm_kernel != -1) { close(vm->vm_kernel); vm->vm_kernel = -1; } if (vm->vm_cdrom != -1) { close(vm->vm_cdrom); vm->vm_cdrom = -1; } if (!keeptty) { vm_closetty(vm); vm->vm_uid = 0; } } void vm_remove(struct vmd_vm *vm, const char *caller) { struct privsep *ps = &env->vmd_ps; if (vm == NULL) return; log_debug("%s: %s %s removing vm %d from running config", __func__, ps->ps_title[privsep_process], caller, vm->vm_vmid); TAILQ_REMOVE(env->vmd_vms, vm, vm_entry); vm_stop(vm, 0, caller); if (vm->vm_kernel_path != NULL && !vm->vm_from_config) free(vm->vm_kernel_path); free(vm); } int vm_claimid(const char *name, int uid, uint32_t *id) { struct name2id *n2i = NULL; TAILQ_FOREACH(n2i, env->vmd_known, entry) if (strcmp(n2i->name, name) == 0 && n2i->uid == uid) goto out; if (++env->vmd_nvm == 0) { log_warnx("too many vms"); return (-1); } if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) { log_warnx("could not alloc vm name"); return (-1); } n2i->id = env->vmd_nvm; n2i->uid = uid; if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) { log_warnx("vm name too long"); free(n2i); return (-1); } TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry); out: *id = n2i->id; return (0); } int vm_register(struct privsep *ps, struct vmop_create_params *vmc, struct vmd_vm **ret_vm, uint32_t id, uid_t uid) { struct vmd_vm *vm = NULL, *vm_parent = NULL; struct vm_create_params *vcp = &vmc->vmc_params; struct vmop_owner *vmo = NULL; uint32_t nid, rng; unsigned int i, j; struct vmd_switch *sw; char *s; int ret = 0; /* Check if this is an instance of another VM */ if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) { errno = ret; /* XXX might set invalid errno */ return (-1); } errno = 0; *ret_vm = NULL; if ((vm = vm_getbyname(vcp->vcp_name)) != NULL || (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) { if (vm_checkperm(vm, &vm->vm_params.vmc_owner, uid) != 0) { errno = EPERM; goto fail; } vm->vm_kernel = vmc->vmc_kernel; *ret_vm = vm; errno = EALREADY; goto fail; } if (vm_parent != NULL) vmo = &vm_parent->vm_params.vmc_insowner; /* non-root users can only start existing VMs or instances */ if (vm_checkperm(NULL, vmo, uid) != 0) { log_warnx("permission denied"); errno = EPERM; goto fail; } if (vmc->vmc_flags == 0) { log_warnx("invalid configuration, no devices"); errno = VMD_DISK_MISSING; goto fail; } if (vcp->vcp_ncpus == 0) vcp->vcp_ncpus = 1; if (vcp->vcp_memranges[0].vmr_size == 0) vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY; if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) { log_warnx("invalid number of CPUs"); goto fail; } else if (vmc->vmc_ndisks > VM_MAX_DISKS_PER_VM) { log_warnx("invalid number of disks"); goto fail; } else if (vmc->vmc_nnics > VM_MAX_NICS_PER_VM) { log_warnx("invalid number of interfaces"); goto fail; } else if (vmc->vmc_kernel == -1 && vmc->vmc_ndisks == 0 && strlen(vmc->vmc_cdrom) == 0) { log_warnx("no kernel or disk/cdrom specified"); goto fail; } else if (strlen(vcp->vcp_name) == 0) { log_warnx("invalid VM name"); goto fail; } else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' || *vcp->vcp_name == '_') { log_warnx("invalid VM name"); goto fail; } else { for (s = vcp->vcp_name; *s != '\0'; ++s) { if (!(isalnum((unsigned char)*s) || *s == '.' || \ *s == '-' || *s == '_')) { log_warnx("invalid VM name"); goto fail; } } } if ((vm = calloc(1, sizeof(*vm))) == NULL) goto fail; memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params)); vmc = &vm->vm_params; vcp = &vmc->vmc_params; vm->vm_pid = -1; vm->vm_tty = -1; vm->vm_receive_fd = -1; vm->vm_kernel = -1; vm->vm_state &= ~VM_STATE_PAUSED; if (vmc->vmc_kernel > -1) vm->vm_kernel = vmc->vmc_kernel; for (i = 0; i < VM_MAX_DISKS_PER_VM; i++) for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) vm->vm_disks[i][j] = -1; for (i = 0; i < VM_MAX_NICS_PER_VM; i++) vm->vm_ifs[i].vif_fd = -1; for (i = 0; i < vmc->vmc_nnics; i++) { if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) { /* inherit per-interface flags from the switch */ vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK); } /* * If the MAC address is zero, always randomize it in vmd(8) * because we cannot rely on the guest OS to do the right * thing like OpenBSD does. Based on ether_fakeaddr() * from the kernel, incremented by one to differentiate * the source. */ if (memcmp(zero_mac, &vmc->vmc_macs[i], ETHER_ADDR_LEN) == 0) { rng = arc4random(); vmc->vmc_macs[i][0] = 0xfe; vmc->vmc_macs[i][1] = 0xe1; vmc->vmc_macs[i][2] = 0xba + 1; vmc->vmc_macs[i][3] = 0xd0 | ((i + 1) & 0xf); vmc->vmc_macs[i][4] = rng; vmc->vmc_macs[i][5] = rng >> 8; } } vm->vm_cdrom = -1; vm->vm_iev.ibuf.fd = -1; /* * Assign a new internal Id if not specified and we succeed in * claiming a new Id. */ if (id != 0) vm->vm_vmid = id; else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1) goto fail; else vm->vm_vmid = nid; log_debug("%s: registering vm %d", __func__, vm->vm_vmid); TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry); *ret_vm = vm; return (0); fail: if (errno == 0) errno = EINVAL; return (-1); } int vm_instance(struct privsep *ps, struct vmd_vm **vm_parent, struct vmop_create_params *vmc, uid_t uid) { char *name; struct vm_create_params *vcp = &vmc->vmc_params; struct vmop_create_params *vmcp; struct vm_create_params *vcpp; unsigned int i, j; /* return without error if the parent is NULL (nothing to inherit) */ if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 || vmc->vmc_instance[0] == '\0') return (0); if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) { return (VMD_PARENT_INVALID); } vmcp = &(*vm_parent)->vm_params; vcpp = &vmcp->vmc_params; /* Are we allowed to create an instance from this VM? */ if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) { log_warnx("vm \"%s\" no permission to create vm instance", vcpp->vcp_name); return (ENAMETOOLONG); } name = vcp->vcp_name; if (vm_getbyname(vcp->vcp_name) != NULL || vm_getbyvmid(vcp->vcp_id) != NULL) { return (EPROCLIM); } /* CPU */ if (vcp->vcp_ncpus == 0) vcp->vcp_ncpus = vcpp->vcp_ncpus; if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 && vcp->vcp_ncpus != vcpp->vcp_ncpus) { log_warnx("vm \"%s\" no permission to set cpus", name); return (EPERM); } /* memory */ if (vcp->vcp_memranges[0].vmr_size == 0) vcp->vcp_memranges[0].vmr_size = vcpp->vcp_memranges[0].vmr_size; if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 && vcp->vcp_memranges[0].vmr_size != vcpp->vcp_memranges[0].vmr_size) { log_warnx("vm \"%s\" no permission to set memory", name); return (EPERM); } /* disks cannot be inherited */ if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 && vmc->vmc_ndisks) { log_warnx("vm \"%s\" no permission to set disks", name); return (EPERM); } for (i = 0; i < vmc->vmc_ndisks; i++) { /* Check if this disk is already used in the parent */ for (j = 0; j < vmcp->vmc_ndisks; j++) { if (strcmp(vmc->vmc_disks[i], vmcp->vmc_disks[j]) == 0) { log_warnx("vm \"%s\" disk %s cannot be reused", name, vmc->vmc_disks[i]); return (EBUSY); } } vmc->vmc_checkaccess |= VMOP_CREATE_DISK; } /* interfaces */ if (vmc->vmc_nnics > 0 && vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 && vmc->vmc_nnics != vmcp->vmc_nnics) { log_warnx("vm \"%s\" no permission to set interfaces", name); return (EPERM); } for (i = 0; i < vmcp->vmc_nnics; i++) { /* Interface got overwritten */ if (i < vmc->vmc_nnics) continue; /* Copy interface from parent */ vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i]; (void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i], sizeof(vmc->vmc_ifnames[i])); (void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i], sizeof(vmc->vmc_ifswitch[i])); (void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i], sizeof(vmc->vmc_ifgroup[i])); memcpy(vmc->vmc_macs[i], vmcp->vmc_macs[i], sizeof(vmc->vmc_macs[i])); vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i]; vmc->vmc_nnics++; } for (i = 0; i < vmc->vmc_nnics; i++) { for (j = 0; j < vmcp->vmc_nnics; j++) { if (memcmp(zero_mac, vmc->vmc_macs[i], sizeof(vmc->vmc_macs[i])) != 0 && memcmp(vmcp->vmc_macs[i], vmc->vmc_macs[i], sizeof(vmc->vmc_macs[i])) != 0) { log_warnx("vm \"%s\" lladdr cannot be reused", name); return (EBUSY); } if (strlen(vmc->vmc_ifnames[i]) && strcmp(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[j]) == 0) { log_warnx("vm \"%s\" %s cannot be reused", vmc->vmc_ifnames[i], name); return (EBUSY); } } } /* kernel */ if (vmc->vmc_kernel > -1 || ((*vm_parent)->vm_kernel_path != NULL && strnlen((*vm_parent)->vm_kernel_path, PATH_MAX) < PATH_MAX)) { if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) { log_warnx("vm \"%s\" no permission to set boot image", name); return (EPERM); } vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL; } /* cdrom */ if (strlen(vmc->vmc_cdrom) > 0) { if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) { log_warnx("vm \"%s\" no permission to set cdrom", name); return (EPERM); } vmc->vmc_checkaccess |= VMOP_CREATE_CDROM; } else if (strlcpy(vmc->vmc_cdrom, vmcp->vmc_cdrom, sizeof(vmc->vmc_cdrom)) >= sizeof(vmc->vmc_cdrom)) { log_warnx("vm \"%s\" cdrom name too long", name); return (EINVAL); } /* user */ if (vmc->vmc_owner.uid == 0) vmc->vmc_owner.uid = vmcp->vmc_owner.uid; else if (vmc->vmc_owner.uid != uid && vmc->vmc_owner.uid != vmcp->vmc_owner.uid) { log_warnx("vm \"%s\" user mismatch", name); return (EPERM); } /* group */ if (vmc->vmc_owner.gid == 0) vmc->vmc_owner.gid = vmcp->vmc_owner.gid; else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) { log_warnx("vm \"%s\" group mismatch", name); return (EPERM); } /* child instances */ if (vmc->vmc_insflags) { log_warnx("vm \"%s\" cannot change instance permissions", name); return (EPERM); } if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) { vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid; vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid; vmc->vmc_insflags = vmcp->vmc_insflags; } else { vmc->vmc_insowner.gid = 0; vmc->vmc_insowner.uid = 0; vmc->vmc_insflags = 0; } /* finished, remove instance flags */ vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE; return (0); } /* * vm_checkperm * * Checks if the user represented by the 'uid' parameter is allowed to * manipulate the VM described by the 'vm' parameter (or connect to said VM's * console.) * * Parameters: * vm: the VM whose permission is to be checked * vmo: the required uid/gid to be checked * uid: the user ID of the user making the request * * Return values: * 0: the permission should be granted * -1: the permission check failed (also returned if vm == null) */ int vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid) { struct group *gr; struct passwd *pw; char **grmem; /* root has no restrictions */ if (uid == 0) return (0); if (vmo == NULL) return (-1); /* check user */ if (vm == NULL) { if (vmo->uid == uid) return (0); } else { /* * check user of running vm (the owner of a running vm can * be different to (or more specific than) the configured owner. */ if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) || (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid)) return (0); } /* check groups */ if (vmo->gid != -1) { if ((pw = getpwuid(uid)) == NULL) return (-1); if (pw->pw_gid == vmo->gid) return (0); if ((gr = getgrgid(vmo->gid)) != NULL) { for (grmem = gr->gr_mem; *grmem; grmem++) if (strcmp(*grmem, pw->pw_name) == 0) return (0); } } return (-1); } /* * vm_checkinsflag * * Checks whether the non-root user is allowed to set an instance option. * * Parameters: * vmc: the VM create parameters * flag: the flag to be checked * uid: the user ID of the user making the request * * Return values: * 0: the permission should be granted * -1: the permission check failed (also returned if vm == null) */ int vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid) { /* root has no restrictions */ if (uid == 0) return (0); if ((vmc->vmc_insflags & flag) == 0) return (-1); return (0); } /* * vm_checkaccess * * Checks if the user represented by the 'uid' parameter is allowed to * access the file described by the 'path' parameter. * * Parameters: * fd: the file descriptor of the opened file * uflag: check if the userid has access to the file * uid: the user ID of the user making the request * amode: the access flags of R_OK and W_OK * * Return values: * 0: the permission should be granted * -1: the permission check failed */ int vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode) { struct group *gr; struct passwd *pw; char **grmem; struct stat st; mode_t mode; if (fd == -1) return (-1); /* * File has to be accessible and a regular file */ if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode)) return (-1); /* root has no restrictions */ if (uid == 0 || uflag == 0) return (0); /* check other */ mode = amode & W_OK ? S_IWOTH : 0; mode |= amode & R_OK ? S_IROTH : 0; if ((st.st_mode & mode) == mode) return (0); /* check user */ mode = amode & W_OK ? S_IWUSR : 0; mode |= amode & R_OK ? S_IRUSR : 0; if (uid == st.st_uid && (st.st_mode & mode) == mode) return (0); /* check groups */ mode = amode & W_OK ? S_IWGRP : 0; mode |= amode & R_OK ? S_IRGRP : 0; if ((st.st_mode & mode) != mode) return (-1); if ((pw = getpwuid(uid)) == NULL) return (-1); if (pw->pw_gid == st.st_gid) return (0); if ((gr = getgrgid(st.st_gid)) != NULL) { for (grmem = gr->gr_mem; *grmem; grmem++) if (strcmp(*grmem, pw->pw_name) == 0) return (0); } return (-1); } int vm_opentty(struct vmd_vm *vm) { struct stat st; struct group *gr; uid_t uid; gid_t gid; mode_t mode; int on = 1, tty_slave; /* * Open tty with pre-opened PTM fd */ if (fdopenpty(env->vmd_ptmfd, &vm->vm_tty, &tty_slave, vm->vm_ttyname, NULL, NULL) == -1) { log_warn("fdopenpty"); return (-1); } close(tty_slave); /* * We use user ioctl(2) mode to pass break commands. */ if (ioctl(vm->vm_tty, TIOCUCNTL, &on) == -1) { log_warn("could not enable user ioctl mode on %s", vm->vm_ttyname); goto fail; } uid = vm->vm_uid; gid = vm->vm_params.vmc_owner.gid; if (vm->vm_params.vmc_owner.gid != -1) { mode = 0660; } else if ((gr = getgrnam("tty")) != NULL) { gid = gr->gr_gid; mode = 0620; } else { mode = 0600; gid = 0; } log_debug("%s: vm %s tty %s uid %d gid %d mode %o", __func__, vm->vm_params.vmc_params.vcp_name, vm->vm_ttyname, uid, gid, mode); /* * Change ownership and mode of the tty as required. * Loosely based on the implementation of sshpty.c */ if (fstat(vm->vm_tty, &st) == -1) { log_warn("fstat failed for %s", vm->vm_ttyname); goto fail; } if (st.st_uid != uid || st.st_gid != gid) { if (chown(vm->vm_ttyname, uid, gid) == -1) { log_warn("chown %s %d %d failed, uid %d", vm->vm_ttyname, uid, gid, getuid()); /* Ignore failure on read-only filesystems */ if (!((errno == EROFS) && (st.st_uid == uid || st.st_uid == 0))) goto fail; } } if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) { if (chmod(vm->vm_ttyname, mode) == -1) { log_warn("chmod %s %o failed, uid %d", vm->vm_ttyname, mode, getuid()); /* Ignore failure on read-only filesystems */ if (!((errno == EROFS) && (st.st_uid == uid || st.st_uid == 0))) goto fail; } } return (0); fail: vm_closetty(vm); return (-1); } void vm_closetty(struct vmd_vm *vm) { if (vm->vm_tty != -1) { /* Release and close the tty */ if (fchown(vm->vm_tty, 0, 0) == -1) log_warn("chown %s 0 0 failed", vm->vm_ttyname); if (fchmod(vm->vm_tty, 0666) == -1) log_warn("chmod %s 0666 failed", vm->vm_ttyname); close(vm->vm_tty); vm->vm_tty = -1; } memset(&vm->vm_ttyname, 0, sizeof(vm->vm_ttyname)); } void switch_remove(struct vmd_switch *vsw) { if (vsw == NULL) return; TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry); free(vsw->sw_group); free(vsw->sw_name); free(vsw); } struct vmd_switch * switch_getbyname(const char *name) { struct vmd_switch *vsw; if (name == NULL) return (NULL); TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { if (strcmp(vsw->sw_name, name) == 0) return (vsw); } return (NULL); } char * get_string(uint8_t *ptr, size_t len) { size_t i; for (i = 0; i < len; i++) if (!isprint((unsigned char)ptr[i])) break; return strndup(ptr, i); } uint32_t prefixlen2mask(uint8_t prefixlen) { if (prefixlen == 0) return (0); if (prefixlen > 32) prefixlen = 32; return (htonl(0xffffffff << (32 - prefixlen))); } void prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask) { struct in6_addr s6; int i; if (prefixlen > 128) prefixlen = 128; memset(&s6, 0, sizeof(s6)); for (i = 0; i < prefixlen / 8; i++) s6.s6_addr[i] = 0xff; i = prefixlen % 8; if (i) s6.s6_addr[prefixlen / 8] = 0xff00 >> i; memcpy(mask, &s6, sizeof(s6)); } void getmonotime(struct timeval *tv) { struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts)) fatal("clock_gettime"); TIMESPEC_TO_TIMEVAL(tv, &ts); } static inline void vm_terminate(struct vmd_vm *vm, const char *caller) { if (vm->vm_from_config) vm_stop(vm, 0, caller); else { /* vm_remove calls vm_stop */ vm_remove(vm, caller); } } /* * Utility function for closing vm file descriptors. Assumes an fd of -1 was * already closed or never opened. * * Returns 0 on success, otherwise -1 on failure. */ int close_fd(int fd) { int ret; if (fd == -1) return (0); #ifdef POSIX_CLOSE_RESTART do { ret = close(fd); } while (ret == -1 && errno == EINTR); #else ret = close(fd); #endif /* POSIX_CLOSE_RESTART */ if (ret == -1 && errno == EIO) log_warn("%s(%d)", __func__, fd); return (ret); }